I am wondering if somebody could help me rewrite this in a more sensible and smarter way?
sed -e '1d; $d' <someinputfile> |
awk -F"\t" '{split($2,a,/-/); print $1","a[1]","a[2]","$3","$4","$5","$6}' |
sed -e "s/,/\",\"/g" |
sed 's/^/"/;s/$/"/' |
sed -e $'1i\\\"field_one","field_two","field_three","field_four","field_five","field_six","field_seven"'
It should already be possible to produce the correct output with awk alone, and I assume there are much better ways to write this.
Shorter way? More efficient way? More correct way? POSIX compliant? GNU compliant?
If you can help, please also try to explain the changes, as I really want to understand "how" and "what is what" :)
Thanks!
What it does is:
Deletes the first and last lines
Splits the second field on the separator - and prints it (here it should already be possible to print the right format?)
Changes , to "," in the output of the previous awk
Adds " around all lines
Adds a new header (each stage is annotated in the sketch below)
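For reference, here is the same pipeline with each stage annotated (bash allows a comment after a trailing |, so this runs as-is):
sed -e '1d; $d' <someinputfile> |   # 1. delete the first (START) and last (DONE) lines
awk -F"\t" '{split($2,a,/-/); print $1","a[1]","a[2]","$3","$4","$5","$6}' |   # 2. split field 2 on "-" and print everything comma-separated
sed -e "s/,/\",\"/g" |   # 3. turn every , into ","
sed 's/^/"/;s/$/"/' |   # 4. add a leading and trailing " to each line
sed -e $'1i\\\"field_one","field_two","field_three","field_four","field_five","field_six","field_seven"'   # 5. insert the header as a new first line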
If somebody wants to play with the input file, here is an example:
START 9 1997-07-27T13:37:01Z
X1 24087-27 Axgma8PYjc1yRJlUr41688 1997-07-27T13:09:00Z 9876 OK
X1 642-68 6nwPtLQTqAAKufH3ejoEeg 1997-07-27T14:31:00Z 9876 OK
X1 642-31 qfKH99UnxZTcp2AN8NNB21 1997-07-27T16:15:00Z 9876 OK
X1 642-24 PouJBByqUJkqhKHBynUesD 1997-07-27T16:15:00Z 9876 OK
X1 642-30 J7t2sJKKtcxWJr18I84A46 1997-07-27T16:15:00Z 9876 OK
X1 642-29 g7hPkNpUywvk6FvGqgpHsx 1997-07-27T16:15:00Z 9876 OK
X1 642-26 W2KM24xvmy0Q8cLV950tXq 1997-07-27T16:15:00Z 9876 OK
X1 642-25 dqu8jB5tUthIKevNAQXgld 1997-07-27T16:15:00Z 9876 OK
X1 753-32 Gh0kZkIJr8j6FSYljbpyyy 1997-07-27T16:15:00Z 9876 OK
X1 753-23 Jvl8LMh6SDHfgvLfJIHi5l 1997-07-27T16:15:00Z 9876 OK
X1 753-28 IZ83996cthjhZGYcAk97iJ 1997-07-27T16:15:00Z 9876 OK
X1 753-22 YJwokU0Dq6xiydkf3EDyxl 1997-07-27T16:15:00Z 9876 OK
X1 753-36 OZHOMirRKjA3LcXTbPJL31 1997-07-27T16:15:00Z 9876 OK
X1 753-34 LvMgT6ed1b1e3uwasGi48G 1997-07-27T16:15:00Z 9877 OK
X1 753-35 VJk4x8sTG1BJTnZYvgu6px 1997-07-27T16:15:00Z 9876 OK
X1 663-27 mkZXgTHKBjmAplrDeoQZXo 1997-07-27T16:15:00Z 9875 ERR
X1 f1K1PzQ9sp2QAv1AX0Zix4 1997-07-27T16:27:00Z 9875 ERR
DONE 69 3QXFXKQAFRSZXJLJ6JZ9NWMXR00B1V1J1FUMBQAA9DQSRCTZF8JXAWWSGHSDIPQ9
PS: Since I'm not sure you will get the same output on your computer, here is how it correctly looks for me when I run it, and how I want it:
"field_one","field_two","field_three","field_four","field_five","field_six","field_seven"
"X1","24087","27","Axgma8PYjc1yRJlUr41688","1997-07-27T13:09:00Z","9876","OK"
"X1","642","68","6nwPtLQTqAAKufH3ejoEeg","1997-07-27T14:31:00Z","9876","OK"
"X1","642","31","qfKH99UnxZTcp2AN8NNB21","1997-07-27T16:15:00Z","9876","OK"
"X1","642","24","PouJBByqUJkqhKHBynUesD","1997-07-27T16:15:00Z","9876","OK"
"X1","642","30","J7t2sJKKtcxWJr18I84A46","1997-07-27T16:15:00Z","9876","OK"
"X1","642","29","g7hPkNpUywvk6FvGqgpHsx","1997-07-27T16:15:00Z","9876","OK"
"X1","642","26","W2KM24xvmy0Q8cLV950tXq","1997-07-27T16:15:00Z","9876","OK"
"X1","642","25","dqu8jB5tUthIKevNAQXgld","1997-07-27T16:15:00Z","9876","OK"
"X1","753","32","Gh0kZkIJr8j6FSYljbpyyy","1997-07-27T16:15:00Z","9876","OK"
"X1","753","23","Jvl8LMh6SDHfgvLfJIHi5l","1997-07-27T16:15:00Z","9876","OK"
"X1","753","28","IZ83996cthjhZGYcAk97iJ","1997-07-27T16:15:00Z","9876","OK"
"X1","753","22","YJwokU0Dq6xiydkf3EDyxl","1997-07-27T16:15:00Z","9876","OK"
"X1","753","36","OZHOMirRKjA3LcXTbPJL31","1997-07-27T16:15:00Z","9876","OK"
"X1","753","34","LvMgT6ed1b1e3uwasGi48G","1997-07-27T16:15:00Z","9877","OK"
"X1","753","35","VJk4x8sTG1BJTnZYvgu6px","1997-07-27T16:15:00Z","9876","OK"
"X1","663","27","mkZXgTHKBjmAplrDeoQZXo","1997-07-27T16:15:00Z","9875","ERR"
"X1","","","f1K1PzQ9sp2QAv1AX0Zix4","1997-07-27T16:27:00Z","9875","ERR"
One awk idea:
awk '
BEGIN { FS  = "\t"
        OFS = "\",\""   # define output field delimiter as <doublequote> <comma> <doublequote>

        # print header
        print "\"field_one","field_two","field_three","field_four","field_five","field_six","field_seven\""
      }

FNR>1 { if (prev) print prev
        split($2,a,"-")

        # reformat current line and save in variable "prev", to be printed on next pass; add <doublequote> on ends
        prev = "\"" $1 OFS a[1] OFS a[2] OFS $3 OFS $4 OFS $5 OFS $6 "\""
      }
' input.dat
This generates:
"field_one","field_two","field_three","field_four","field_five","field_six","field_seven"
"X1","24087","27","Axgma8PYjc1yRJlUr41688","1997-07-27T13:09:00Z","9876","OK"
"X1","642","68","6nwPtLQTqAAKufH3ejoEeg","1997-07-27T14:31:00Z","9876","OK"
"X1","642","31","qfKH99UnxZTcp2AN8NNB21","1997-07-27T16:15:00Z","9876","OK"
"X1","642","24","PouJBByqUJkqhKHBynUesD","1997-07-27T16:15:00Z","9876","OK"
"X1","642","30","J7t2sJKKtcxWJr18I84A46","1997-07-27T16:15:00Z","9876","OK"
"X1","642","29","g7hPkNpUywvk6FvGqgpHsx","1997-07-27T16:15:00Z","9876","OK"
"X1","642","26","W2KM24xvmy0Q8cLV950tXq","1997-07-27T16:15:00Z","9876","OK"
"X1","642","25","dqu8jB5tUthIKevNAQXgld","1997-07-27T16:15:00Z","9876","OK"
"X1","753","32","Gh0kZkIJr8j6FSYljbpyyy","1997-07-27T16:15:00Z","9876","OK"
"X1","753","23","Jvl8LMh6SDHfgvLfJIHi5l","1997-07-27T16:15:00Z","9876","OK"
"X1","753","28","IZ83996cthjhZGYcAk97iJ","1997-07-27T16:15:00Z","9876","OK"
"X1","753","22","YJwokU0Dq6xiydkf3EDyxl","1997-07-27T16:15:00Z","9876","OK"
"X1","753","36","OZHOMirRKjA3LcXTbPJL31","1997-07-27T16:15:00Z","9876","OK"
"X1","753","34","LvMgT6ed1b1e3uwasGi48G","1997-07-27T16:15:00Z","9877","OK"
"X1","753","35","VJk4x8sTG1BJTnZYvgu6px","1997-07-27T16:15:00Z","9876","OK"
"X1","663","27","mkZXgTHKBjmAplrDeoQZXo","1997-07-27T16:15:00Z","9875","ERR"
"X1","","","f1K1PzQ9sp2QAv1AX0Zix4","1997-07-27T16:27:00Z","9875","ERR"
Given:
sed -E 's/\t/\\t/g' file
START\t9\t1997-07-27T13:37:01Z
X1\t24087-27\tAxgma8PYjc1yRJlUr41688\t1997-07-27T13:09:00Z\t9876\tOK
X1\t642-68\t6nwPtLQTqAAKufH3ejoEeg\t1997-07-27T14:31:00Z\t9876\tOK
X1\t642-31\tqfKH99UnxZTcp2AN8NNB21\t1997-07-27T16:15:00Z\t9876\tOK
X1\t642-24\tPouJBByqUJkqhKHBynUesD\t1997-07-27T16:15:00Z\t9876\tOK
X1\t642-30\tJ7t2sJKKtcxWJr18I84A46\t1997-07-27T16:15:00Z\t9876\tOK
X1\t642-29\tg7hPkNpUywvk6FvGqgpHsx\t1997-07-27T16:15:00Z\t9876\tOK
X1\t642-26\tW2KM24xvmy0Q8cLV950tXq\t1997-07-27T16:15:00Z\t9876\tOK
X1\t642-25\tdqu8jB5tUthIKevNAQXgld\t1997-07-27T16:15:00Z\t9876\tOK
X1\t753-32\tGh0kZkIJr8j6FSYljbpyyy\t1997-07-27T16:15:00Z\t9876\tOK
X1\t753-23\tJvl8LMh6SDHfgvLfJIHi5l\t1997-07-27T16:15:00Z\t9876\tOK
X1\t753-28\tIZ83996cthjhZGYcAk97iJ\t1997-07-27T16:15:00Z\t9876\tOK
X1\t753-22\tYJwokU0Dq6xiydkf3EDyxl\t1997-07-27T16:15:00Z\t9876\tOK
X1\t753-36\tOZHOMirRKjA3LcXTbPJL31\t1997-07-27T16:15:00Z\t9876\tOK
X1\t753-34\tLvMgT6ed1b1e3uwasGi48G\t1997-07-27T16:15:00Z\t9877\tOK
X1\t753-35\tVJk4x8sTG1BJTnZYvgu6px\t1997-07-27T16:15:00Z\t9876\tOK
X1\t663-27\tmkZXgTHKBjmAplrDeoQZXo\t1997-07-27T16:15:00Z\t9875\tERR
X1\t\tf1K1PzQ9sp2QAv1AX0Zix4\t1997-07-27T16:27:00Z\t9875\tERR
DONE\t69\t3QXFXKQAFRSZXJLJ6JZ9NWMXR00B1V1J1FUMBQAA9DQSRCTZF8JXAWWSGHSDIPQ9
It is a very good idea to use a proper CSV parser to deal with issues like this.
Ruby is ubiquitous and has a very lightweight but capable CSV parser included in the distribution.
Here is a Ruby solution:
ruby -r csv -e '
  # read everything from stdin/ARGV and parse it as tab-separated values
  data = CSV.parse($<.read, col_sep: "\t")
  d2 = CSV::Table.new([], headers: ["field_one","field_two","field_three","field_four","field_five","field_six","field_seven"])
  # data[1...-1] slices off the first (START) and last (DONE) rows
  data[1...-1].each { |r|
    r_ = []
    r.each_with_index { |e, i|
      if i == 1
        # split field 2 on "-"; emit two empty strings when the field is missing
        e && e[/-/] ? r_.concat(e.split(/-/, 2)) : r_.concat(["", ""])
      else
        r_ << e
      end
    }
    d2 << r_
  }
  puts d2.to_csv(force_quotes: true)
' file
Prints:
"field_one","field_two","field_three","field_four","field_five","field_six","field_seven"
"X1","24087","27","Axgma8PYjc1yRJlUr41688","1997-07-27T13:09:00Z","9876","OK"
"X1","642","68","6nwPtLQTqAAKufH3ejoEeg","1997-07-27T14:31:00Z","9876","OK"
"X1","642","31","qfKH99UnxZTcp2AN8NNB21","1997-07-27T16:15:00Z","9876","OK"
"X1","642","24","PouJBByqUJkqhKHBynUesD","1997-07-27T16:15:00Z","9876","OK"
"X1","642","30","J7t2sJKKtcxWJr18I84A46","1997-07-27T16:15:00Z","9876","OK"
"X1","642","29","g7hPkNpUywvk6FvGqgpHsx","1997-07-27T16:15:00Z","9876","OK"
"X1","642","26","W2KM24xvmy0Q8cLV950tXq","1997-07-27T16:15:00Z","9876","OK"
"X1","642","25","dqu8jB5tUthIKevNAQXgld","1997-07-27T16:15:00Z","9876","OK"
"X1","753","32","Gh0kZkIJr8j6FSYljbpyyy","1997-07-27T16:15:00Z","9876","OK"
"X1","753","23","Jvl8LMh6SDHfgvLfJIHi5l","1997-07-27T16:15:00Z","9876","OK"
"X1","753","28","IZ83996cthjhZGYcAk97iJ","1997-07-27T16:15:00Z","9876","OK"
"X1","753","22","YJwokU0Dq6xiydkf3EDyxl","1997-07-27T16:15:00Z","9876","OK"
"X1","753","36","OZHOMirRKjA3LcXTbPJL31","1997-07-27T16:15:00Z","9876","OK"
"X1","753","34","LvMgT6ed1b1e3uwasGi48G","1997-07-27T16:15:00Z","9877","OK"
"X1","753","35","VJk4x8sTG1BJTnZYvgu6px","1997-07-27T16:15:00Z","9876","OK"
"X1","663","27","mkZXgTHKBjmAplrDeoQZXo","1997-07-27T16:15:00Z","9875","ERR"
"X1","","","f1K1PzQ9sp2QAv1AX0Zix4","1997-07-27T16:27:00Z","9875","ERR"
I would rework this part of your code
awk -F"\t" '{split($2,a,/-/); print $1","a[1]","a[2]","$3","$4","$5","$6}' | sed -e "s/,/\",\"/g" | sed 's/^/"/;s/$/"/' | sed -e $'1i\\\"field_one","field_two","field_three","field_four","field_five","field_six","field_seven"'
in the following way. 1st step: print "," rather than , directly in awk, dropping the sed that did the changing, i.e.
awk -F"\t" '{split($2,a,/-/); print $1"\",\""a[1]"\",\""a[2]"\",\""$3"\",\""$4"\",\""$5"\",\""$6}' | sed 's/^/"/;s/$/"/' | sed -e $'1i\\\"field_one","field_two","field_three","field_four","field_five","field_six","field_seven"'
2nd step: add the leading " and trailing " in the print itself, dropping the next sed, i.e.
awk -F"\t" '{split($2,a,/-/); print "\""$1"\",\""a[1]"\",\""a[2]"\",\""$3"\",\""$4"\",\""$5"\",\""$6"\""}' | sed -e $'1i\\\"field_one","field_two","field_three","field_four","field_five","field_six","field_seven"'
3rd step: use a BEGIN block to print the header, i.e.
awk -F"\t" 'BEGIN{print "\"field_one\",\"field_two\",\"field_three\",\"field_four\",\"field_five\",\"field_six\",\"field_seven\""}{split($2,a,/-/); print "\""$1"\",\""a[1]"\",\""a[2]"\",\""$3"\",\""$4"\",\""$5"\",\""$6"\""}'
(tested in gawk 4.2.1)
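A possible 4th step: fold the leading sed '1d; $d' into the same awk by staying one line behind (the same delay trick as in the earlier awk answer), so the whole job runs in a single process. A sketch, with someinputfile standing in for your input:
awk -F"\t" '
BEGIN { print "\"field_one\",\"field_two\",\"field_three\",\"field_four\",\"field_five\",\"field_six\",\"field_seven\"" }
NR > 1 {
    if (prev != "") print prev   # one line behind, so the last (DONE) line is never printed
    split($2, a, "-")
    prev = "\"" $1 "\",\"" a[1] "\",\"" a[2] "\",\"" $3 "\",\"" $4 "\",\"" $5 "\",\"" $6 "\""
}' someinputfile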
Not as elegant a solution as I hoped for, but it gets the job done.
Instead of hard-coding verbal names for the fields, it computes the required header row on the fly based on the actual input, which also accounts for the anticipated split of field 2.
gnice gcat sample.txt \
\
| mawk2 'function trim(_) { return \
\
substr("",gsub("^[,\42]*|[,\42]*$","\42",_))_
} BEGIN { FS = "[ \11]+"
OFS = "\42\54\42"
} NR==2 {
for(_=NF+!_;_;_--) {
___=(_)(OFS)___
}
printf("%*s\n",gsub("[0-9]+[^0-9]+",\
"field_&",___)~"",trim(___))
} !/^(START|DONE)/ {
printf("\42%.0s%s\42\n",$1=$(($0=\
$(sub("[-]"," ",$2)<""))~""),$0) } ' | lgp3 3
"field_1","field_2","field_3","field_4","field_5","field_6","field_7"
"X1","24087","27","Axgma8PYjc1yRJlUr41688","1997-07-27T13:09:00Z","9876","OK"
"X1","642","68","6nwPtLQTqAAKufH3ejoEeg","1997-07-27T14:31:00Z","9876","OK"
"X1","642","31","qfKH99UnxZTcp2AN8NNB21","1997-07-27T16:15:00Z","9876","OK"
"X1","642","24","PouJBByqUJkqhKHBynUesD","1997-07-27T16:15:00Z","9876","OK"
"X1","642","30","J7t2sJKKtcxWJr18I84A46","1997-07-27T16:15:00Z","9876","OK"
"X1","642","29","g7hPkNpUywvk6FvGqgpHsx","1997-07-27T16:15:00Z","9876","OK"
"X1","642","26","W2KM24xvmy0Q8cLV950tXq","1997-07-27T16:15:00Z","9876","OK"
"X1","642","25","dqu8jB5tUthIKevNAQXgld","1997-07-27T16:15:00Z","9876","OK"
"X1","753","32","Gh0kZkIJr8j6FSYljbpyyy","1997-07-27T16:15:00Z","9876","OK"
"X1","753","23","Jvl8LMh6SDHfgvLfJIHi5l","1997-07-27T16:15:00Z","9876","OK"
"X1","753","28","IZ83996cthjhZGYcAk97iJ","1997-07-27T16:15:00Z","9876","OK"
"X1","753","22","YJwokU0Dq6xiydkf3EDyxl","1997-07-27T16:15:00Z","9876","OK"
"X1","753","36","OZHOMirRKjA3LcXTbPJL31","1997-07-27T16:15:00Z","9876","OK"
"X1","753","34","LvMgT6ed1b1e3uwasGi48G","1997-07-27T16:15:00Z","9877","OK"
"X1","753","35","VJk4x8sTG1BJTnZYvgu6px","1997-07-27T16:15:00Z","9876","OK"
"X1","663","27","mkZXgTHKBjmAplrDeoQZXo","1997-07-27T16:15:00Z","9875","ERR"
"X1","f1K1PzQ9sp2QAv1AX0Zix4","1997-07-27T16:27:00Z","9875","ERR"
Would like to print, keyed on the 2nd column: the count of line items, the sum of the 3rd column, and the number of unique values of the first column. Having around 100 InputTest files, and they are not sorted.
Am using the below 3 commands to achieve the desired output; would like to know the simplest way ...
InputTest*.txt
abc,xx,5,sss
abc,yy,10,sss
def,xx,15,sss
def,yy,20,sss
abc,xx,5,sss
abc,yy,10,sss
def,xx,15,sss
def,yy,20,sss
ghi,zz,10,sss
Step#1:
cat InputTest*.txt | awk -F, '{key=$2;++a[key];b[key]=b[key]+$3} END {for(i in a) print i","a[i]","b[i]}'
Op#1
xx,4,40
yy,4,60
zz,1,10
Step#2
awk -F ',' '{print $1,$2}' InputTest*.txt | sort | uniq >Op_UniqTest2.txt
Op#2
abc xx
abc yy
def xx
def yy
ghi zz
Step#3
awk '{print $2}' Op_UniqTest2.txt | sort | uniq -c
Op#3
2 xx
2 yy
1 zz
Desired Output:
xx,4,40,2
yy,4,60,2
zz,1,10,1
Looking for suggestions!!!
BEGIN { FS = OFS = "," }
{ ++lines[$2]; if (!seen[$2,$1]++) ++diff[$2]; count[$2]+=$3 }
END { for(i in lines) print i, lines[i], count[i], diff[i] }
lines tracks the number of occurrences of each value in column 2
seen records unique combinations of the second and first column, incrementing diff[$2] whenever a unique combination is found. The ++ after seen[$2,$1] means that the condition will only be true the first time the combination is found, as the value of seen[$2,$1] will be increased to 1 and !seen[$2,$1] will be false.
count keeps a total of the third column
$ awk -f avn.awk file
xx,4,40,2
yy,4,60,2
zz,1,10,1
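A side note on seen[$2,$1]: a comma inside an array subscript joins the values with awk's built-in SUBSEP character (by default "\034"), so the pair acts as a single composite key. A quick demonstration, using values from the sample data:
awk 'BEGIN { seen["xx","abc"]++; for (k in seen) { split(k, p, SUBSEP); print p[1], p[2] } }'
which prints xx abc.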
Using awk:
$ awk '
BEGIN { FS = OFS = "," }
{ keys[$2]++; sum[$2]+=$3 } !seen[$1,$2]++ { count[$2]++ }
END { for(key in keys) print key, keys[key], sum[key], count[key] }
' file
xx,4,40,2
yy,4,60,2
zz,1,10,1
Set the input and output field separators to , in the BEGIN block. The keys array identifies and counts the column 2 keys, the sum array keeps the sum of column 3 for each key, and the count array tracks how many unique column 1 values occur for each column 2 value.
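Since awk accepts multiple file names on its command line, the same script covers all of the roughly 100 files in one run, with no need for cat; a minimal sketch, assuming the files match InputTest*.txt as in the question:
awk '
BEGIN { FS = OFS = "," }
{ keys[$2]++; sum[$2]+=$3 } !seen[$1,$2]++ { count[$2]++ }
END { for(key in keys) print key, keys[key], sum[key], count[key] }
' InputTest*.txt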