handling non existent entries during transpose - awk

I have a data file and need to do transpose in awk handling non-existent values also.
id|type|cost|date|ship
0|A|223|201603|PORT
0|A|22|201602|PORT
0|A|422|201601|DOCK
1|B|3213|201602|DOCK
1|B|3213|201601|PORT
2|C|2321|201601|DOCK
I need to transform and get the output as
id|type|201601|201602|201603|
0|A|422.0|22.0|223.0|
1|B|3213.0|3213.0|n/a|
2|C|2321.0|n/a|n/a|
I'm trying to pivot in awk, but it is not working.
awk -F"|" -v OFS="|" ' NR>1 { k=$1 OFS $2;id[k]++;date[$4]++;cost[k]=$3 } END {printf("%s","id|type|");for(i in date) printf("%s|",i);print ""; for(i in id) { printf("%s|",i); for(j in date) { if( (i) in cost) { printf("%s|", cost[i OFS j]);} else { printf("%s|","n/a")} } print "" } } ' data.txt
It is just printing.
id|type|201601|201602|201603|
0|A||||
1|B||||
2|C||||

You may try this gnu awk:
# GNU awk only (arrays of arrays, PROCINFO["sorted_in"]); "@ind_str_asc" makes every for-in loop visit indices in ascending string order.
awk 'BEGIN {FS=OFS="|"; PROCINFO["sorted_in"]="@ind_str_asc"} NR>1{hdr[$4]; map[$1 OFS $2][$4] = $3} END {printf "%s", "id" OFS "type" OFS; for (i in hdr) printf "%s", i OFS; print ""; for (r in map) {printf "%s", r OFS; for (i in hdr) printf "%s%s", (map[r][i] == "" ? "n/a" : sprintf ("%.1f", map[r][i])), OFS; print ""}}' file
id|type|201601|201602|201603|
0|A|422.0|22.0|223.0|
1|B|3213.0|3213.0|n/a|
2|C|2321.0|n/a|n/a|
A more readable awk:
awk 'BEGIN {
   FS=OFS="|"
   # gawk-only: make every for-in loop visit indices in ascending string order
   PROCINFO["sorted_in"]="@ind_str_asc"
}
NR > 1 {
   hdr[$4]                    # remember each date seen; these become the column headers
   map[$1 OFS $2][$4] = $3    # cost stored under "id|type", then under its date
}
END {
   # print header
   printf "%s", "id" OFS "type" OFS
   for (i in hdr)
      printf "%s", i OFS
   print ""
   # print body
   for (r in map) {
      printf "%s", r OFS
      # one cell per known date; a date with no cost for this row prints n/a
      for (i in hdr)
         printf "%s%s", (map[r][i] == "" ? "n/a" : sprintf ("%.1f", map[r][i])), OFS
      print ""
   }
}' file

Related

Data Conversion Using Sed or Awk - Name to Title

I have data in the below format:
APP_OWNER : hari
APP_AREA : Work:Business Area:AUS
APP_ID : 124080
APP_OWNER : ari
APP_AREA : Work:AUS
APP_ID : 124345
I want the data to be converted to below format.
APP_OWNER,APP_AREA,APP_ID
hari,Work:Business Area:AUS,124080
ari,Work:AUS,124345
I can convert one line but how to do it with 3 lines at the same time?
My Attempt with one line
sed '0,/: /s//\n/' test.txt
Original Question : Convert Rows to Columns Shell Script
Regards
Here is an awk solution without hardcoding any value:
# Tags are collected once in first-seen order (tr); a record (td) is printed each time the first tag of the file (fh) shows up again, and the last one in END.
awk -F '[[:blank:]]+:[[:blank:]]+' 'NR == 1 {fh=$1} !($1 in hdr) {tr = (tr == "" ? "" : tr ",") $1; hdr[$1]} $1 == fh && NR > 1 {print (body ? "" : tr ORS) td; body=1; td=""} {td = (td == "" ? "" : td ",") $2} END {print td}' file
APP_OWNER,APP_AREA,APP_ID
hari,Work:Business Area:AUS,124080
ari,Work:AUS,124345
Here is more readable version:
awk -F '[[:blank:]]+:[[:blank:]]+' '
# Fields are split on a colon that has blanks on both sides, so $1 is the tag and $2 its value.
NR == 1 {
fh = $1 # first tag of the file; seeing it again marks the start of a new record
}
!($1 in hdr) { # collect headers only once
tr = (tr == "" ? "" : tr ",") $1
hdr[$1]
}
$1 == fh && NR > 1 { # print header and body
print (body ? "" : tr ORS) td # if body=1 then print only body
body = 1
td = "" # start collecting the next record
}
{
td = (td == "" ? "" : td ",") $2 # append this value to the current record
}
END {
print td # flush the last record
}' file
Could you please try following, written and tested with shown samples.
awk -F'[[:blank:]]+:[[:blank:]]+' -v OFS="," '
# Turn repeating "TAG : value" lines into CSV: one header row, then one row per record.
# A record starts at every line beginning with APP_OWNER.
/^APP_OWNER/{
  if(heading && !count){   # emit the header exactly once, before the first data row
    count=1
    print heading
  }
  if(val){
    print val
  }
  val=""
}
!count && !($1 in head){   # gather tag names until the header has been printed
  head[$1]
  heading=(heading?heading OFS:"")$1
}
{
  val=(val?val OFS:"")$2
}
END{
  if(heading && !count){   # input held a single record, so the header is still pending
    print heading
  }
  if(val){
    print val
  }
}
' Input_file
$ cat tst.awk
# Convert repeating "tag : value" lines to CSV, treating every 3 input lines as one record.
BEGIN { OFS="," }
{
# split the line at its first colon: the text before it is the tag, the text after it the value
tag = val = $0
sub(/[[:space:]]*:.*/,"",tag)
sub(/[^:]+:[[:space:]]*/,"",val)
tag2val[tag] = val
}
!seen[tag]++ {
tags[++numTags] = tag # remember tags in first-seen order
}
(NR%3) == 0 { # every third line completes a record
if ( !doneHdr++ ) { # header row is printed once, before the first record
for (tagNr=1; tagNr<=numTags; tagNr++) {
tag = tags[tagNr]
printf "%s%s", tag, (tagNr<numTags ? OFS : ORS)
}
}
for (tagNr=1; tagNr<=numTags; tagNr++) {
tag = tags[tagNr]
val = tag2val[tag]
printf "%s%s", val, (tagNr<numTags ? OFS : ORS)
}
delete tag2val # forget the values before the next record starts
}
$ awk -f tst.awk file
APP_OWNER,APP_AREA,APP_ID
hari,Work:Business Area:AUS,124080
ari,Work:AUS,124345

is it possible to drop a column containing a specific value using unix?

I have a very large variant calling data. I can not pull out the result I want.
here is an example
bac1 bac2 bac3 bac4
1 0 0 1
Now I want to drop the columns that contain 0 using the ubuntu command-line. The result would be like this
bac1 bac4
1 1
I tried this
awk -F "\t" -v "pat=0\t" 'NR == 2 {for (i=1; i <= NF; i++) Take[i] = (pat != $i)}{for (i =1; i <= NF; i++) if (Take [i]) printf $i FS; print ""}'
And the output is this:
NC_045512.2 18876 NC_045512.2_18876_T_C T C . PASS GT 1
Header of this output is:
#CHROM POS ID REF ALT QUAL FILTER FORMAT EPI_ISL_422804
So the final output had to be like this:
#CHROM POS ID REF ALT QUAL FILTER FORMAT EPI_ISL_422804
NC_045512.2 18876 NC_045512.2_18876_T_C T C . PASS GT 1
The file is not always 2 lines but at most it can be 4 lines.
It does not return the header line because I used NR == 2. Is there any way I can get the header line as well?
If your input file always only has 1 data line as in your example then:
$ cat tst.awk
# Print the header and the data line, keeping only the columns whose data value is not 0.
BEGIN { FS=OFS="\t" }
NR == 1 { split($0,hdr); next } # stash the header fields; nothing is printed yet
{
for (i = 1; i <= NF; i++) {
if ($i != 0) {
cols[++nf] = i # field numbers to keep, in input order
}
}
# header names of the kept columns
for (i = 1; i <= nf; i++) {
printf "%s%s", hdr[cols[i]], (i<nf ? OFS : ORS)
}
# values of the kept columns
for (i = 1; i <= nf; i++) {
printf "%s%s", $(cols[i]), (i<nf ? OFS : ORS)
}
}
.
$ awk -f tst.awk file
bac1 bac4
1 1
otherwise if your input can have more than 1 data line then you need a 2-pass approach:
$ cat tst.awk
# Two-pass column filter: the same file is named twice on the command line.
BEGIN { FS=OFS="\t" }
NR == FNR { # first pass: record every column that holds a 0 on any line after the header
if (NR > 1) {
for (i = 1; i <= NF; i++) {
if ($i == 0) {
zeroCols[i]
}
}
}
next
}
FNR == 1 { # second pass, first line: list the columns that never held a 0
for (i = 1; i <= NF; i++) {
if (! (i in zeroCols) ) {
cols[++nf] = i
}
}
}
{ # second pass, every line including the header: print only the kept columns
for (i = 1; i <= nf; i++) {
printf "%s%s", $(cols[i]), (i<nf ? OFS : ORS)
}
}
.
$ awk -f tst.awk file file
bac1 bac4
1 1
Long version with if:
awk 'NR==1{
  # keep the header names for the second line
  split($0,array,FS)
}
NR==2{
  # header row: names of the columns whose value on this line is not 0
  kept=0
  for(i=1;i<=NF;i++){
    if($i==0) continue
    if(kept++) printf("%s%s",OFS,array[i])
    else printf("%s",array[i])
  }
  print ""
  # data row: the values of those same columns
  kept=0
  for(i=1;i<=NF;i++){
    if($i==0) continue
    if(kept++) printf("%s%s",OFS,$i)
    else printf("%s",$i)
  }
  print ""
}' FS='\t' OFS="\t" file
One line:
# NR==1 stores the header fields; NR==2 prints the header names and then the values of the columns whose value on that line is not 0.
awk 'NR==1{split($0,array,FS)} NR==2{s=0; for(i=1;i<=NF;i++) {if($i!=0) {if(s==0) {s=1; printf("%s",array[i])} else {printf("%s%s",OFS,array[i])}}} print ""; s=0; for(i=1;i<=NF;i++){if($i!=0){if(s==0){s=1; printf("%s",$i)} else {printf("%s%s",OFS,$i)}}} print ""}' FS='\t' OFS="\t" file
Output:
bac1 bac4
1 1

merge file on the basis of 2 fields

file1
session=1|w,eventbase=4,operation=1,rule=15
session=1|e,eventbase=5,operation=2,rule=14
session=2|t,eventbase=,operation=1,rule=13
file2
field1,field2,field3,session=1,fieldn,operation=1,fieldn
field1,field2,field3,session=1,fieldn,operation=2,fieldn
field1,field2,field3,session=2,fieldn,operation=2,fieldn
field1,field2,field3,session=2,fieldn,operation=1,fieldn
Output
field1,field2,field3,session=1,fieldn,operation=1,fieldn,eventbase=4,rule=15
field1,field2,field3,session=1,fieldn,operation=2,fieldn,eventbase=5,rule=14
field1,field2,field3,session=2,fieldn,operation=2,fieldn,NOMATCH
field1,field2,field3,session=2,fieldn,operation=1,fieldn,eventbase=,rule=13
I have Tried
BEGIN { FS = OFS = "," }
FNR == NR {
split($1,s,"|")
session=s[1];
a[session,$3] = session","$2","$3","$4;
next
}
{
split($4,x,"|");
nsession=x[1];
if(nsession in a)print $0 a[nsession,$6];
else print $0",NOMATCH";
}
Issue is I am not able to FIND nsession in 2D array a with if(nsession in a)
matching 2 files on the combination basis of session and operation
Thanks.. it helped.. Now I am learning :) Thanks team
BEGIN { FS = OFS = "," }
FNR == NR { # first file: index each line by (session, field 3)
split($1,s,"|")
session=s[1]; # the part of field 1 before the "|"
a[session,$3] = session","$2","$3","$4;
next
}
{ # second file: look up (field 4 up to any "|", field 6)
split($4,x,"|");
nsession=x[1];
key=nsession SUBSEP $6 # the same string awk builds for the subscript a[nsession,$6]
if(key in a)print $0 a[nsession,$6];
else print $0",NOMATCH";
}
You can try
awk -f merge.awk file1 file2
where merge.awk is
# merge.awk - append eventbase/rule from the first file to the lines of the
# second file that share the same session and operation.
# Needs GNU awk: match() is called with a third (array) argument.
NR==FNR {
   sub(/[[:blank:]]*$/,"")      # drop trailing blanks so rule=... ends cleanly
   getSessionInfo(1)
   ar[ses,op]=",eventbase="evb",rule="rule
   next
}
{
   sub(/[[:blank:]]*$/,"")
   getSessionInfo(0)
   if ((ses,op) in ar)
      print $0 ar[ses,op]
   else
      print $0 ",NOMATCH"
}
# Parse the current line into the globals ses and op and, when f is true,
# also evb and rule. a is a local scratch array for the match() captures.
function getSessionInfo(f,   a) {
   # values may be of any length (or empty), so capture up to the next delimiter
   match($0,/session=([^|,]*)[|,]/,a)
   ses=a[1]
   match($0,/operation=([^,]*)(,|$)/,a)
   op=a[1]
   if (f) {
      match($0,/eventbase=([^,]*)(,|$)/,a)
      evb=a[1]
      match($0,/rule=(.*)$/,a)
      rule=a[1]
   }
}

awk: Print strings in file1 not matching strings in file2

I have two file with comma separated values, I want to remove all the strings in file1 matching with strings in file 2.
file1:
soap,cosmetics,june,hello,good
file2:
june,hello
output:
soap,cosmetics,good
I tried this, but not working. I'm not sure where I'm going wrong. Any help appreciated.
BEGIN {
FS=","
}
NR==FNR {
a[NR]=$0
next
}
{
for (j=1;j<=NF;j++) {
split($0, d, ",")
if (d[j] in a == 0) {
line = (line ? line "," : "") d[j]
}
}
print line
line = ""
}
Here's one way using awk. Run like:
awk -f script.awk file2 file1
Contents of script.awk:
# Print the comma-separated fields of the later input that do not occur in the first file.
BEGIN {
FS=","
}
FNR==NR { # first file on the command line: store each of its fields as an index of a
for(i=1;i<=NF;i++) {
a[$i]
}
next
}
{ # remaining input: keep the fields that are not indices of a
for(j=1;j<=NF;j++) {
if (!($j in a)) {
r = (r ? r FS : "") $j # r keeps growing across lines; it is never reset
}
}
}
END {
print r # the kept fields, joined by FS, printed once
}
Results:
soap,cosmetics,good
Alternatively, here's the one-liner:
# Fields of file2 become indices of a; fields of file1 not found in a are joined with FS into r and printed once in END.
awk -F, 'FNR==NR { for(i=1;i<=NF;i++) a[$i]; next } { for(j=1;j<=NF;j++) if (!($j in a)) r = (r ? r FS : "") $j } END { print r }' file2 file1
$ gawk -v RS='[,\n]' 'NR==FNR{a[$0];next} !($0 in a){o=o s $0;s=","} END{print o}' file2 file1
soap,cosmetics,good

awk '/range start/,/range end/' within script

How do I use the awk range pattern '/begin regex/,/end regex/' within a self-contained awk script?
To clarify, given program csv.awk:
#!/usr/bin/awk -f
BEGIN {
FS = "\""
}
/TREE/,/^$/
{
line="";
for (i=1; i<=NF; i++) {
if (i != 2) line=line $i;
}
split(line, v, ",");
if (v[5] ~ "FOAM") {
print NR, v[5];
}
}
and file chunk:
TREE
10362900,A,INSTL - SEAL,Revise
,10362901,A,ASSY / DETAIL - PANEL,Revise
,,-203,ASSY - PANEL,Qty -,Add
,,,-309,PANEL,Qty 1,Add
,,,,"FABRICATE FROM TEKLAM NE1G1-02-250 PER TPS-CN-500, TYPE A"
,,,-311,PANEL,Qty 1,Add
,,,,"FABRICATE FROM TEKLAM NE1G1-02-750 PER TPS-CN-500, TYPE A"
,,,-313,FOAM SEAL,1.00 X 20.21 X .50 THK,Qty 1,Add
,,,,"BMS1-68, GRADE B, FORM II, COLOR BAC706 (BLACK)"
,,,-315,FOAM SEAL,1.50 X 8.00 X .25 THK,Qty 1,Add
,,,,"BMS1-68, GRADE B, FORM II, COLOR BAC706 (BLACK)"
,PN HERE,Dual Lock,Add
,
10442900,IR,INSTL - SEAL,Update (not released)
,10362901,A,ASSY / DETAIL - PANEL,Revise
,PN HERE,Dual Lock,Add
I want to have this output:
27 FOAM SEAL
29 FOAM SEAL
What is the syntax for adding the command line form '/begin regex/,/end regex/' to the script to operate on those lines only? All my attempts lead to syntax errors and googling only gives me the cli form.
why not use 2 steps:
# -f is required: without it awk takes the text "csv.awk" as the program itself instead of reading the file
% awk '/start/,/end/' < input.csv | awk -f csv.awk
Simply do:
#!/usr/bin/awk -f
# Print line number and 5th comma-separated item for lines in the /from/../to/ range whose 5th item contains FOAM.
BEGIN {
FS = "\"" # split on double quotes: field 2 is then the text between the first two quotes
}
/from/,/to/ { # range pattern; the opening brace has to sit on the same line as the pattern
line="";
for (i=1; i<=NF; i++) {
if (i != 2) line=line $i; # rebuild the line without field 2
}
split(line, v, ",");
if (v[5] ~ "FOAM") {
print NR, v[5];
}
}
If the from to regexes are dynamic:
#!/usr/bin/awk -f
# Run the FOAM extraction only on the lines between a FROM and a TO regex.
# Usage: ./scriptname.awk FROM_REGEX TO_REGEX [INPUTFILE]
# Without INPUTFILE the script reads standard input.
BEGIN {
    FS = "\""
    if (ARGC < 3) {
        print "usage: scriptname.awk FROM_REGEX TO_REGEX [INPUTFILE]" > "/dev/stderr"
        exit 2
    }
    FROM = ARGV[1]
    TO = ARGV[2]
    # The two regex operands must not be opened as input files: awk skips
    # ARGV elements set to "" and reads standard input when no file is left.
    ARGV[1] = ""
    ARGV[2] = ""
}
{ if ($0 ~ FROM) { p = 1 ; l = 0 } }           # a FROM match opens the range
{ if (p == 1 && $0 ~ TO) { p = 0 ; l = 1 } }   # a TO match closes an open range; l marks that closing line
{
    if (p == 1 || l == 1) {
        line = ""
        for (i = 1; i <= NF; i++) {
            if (i != 2) line = line $i         # rebuild the line without field 2
        }
        split(line, v, ",")
        if (v[5] ~ "FOAM") {
            print NR, v[5]
        }
        l = 0                                  # the closing line has been handled
    }
}
Now you have to call it like: ./scriptname.awk "FROM_REGEX" "TO_REGEX" INPUTFILE. The last param is optional, if missing STDIN can be used.
HTH
You need to show us what you have tried. Is there something about /begin regex/ or /end regex/ you're not telling us? Otherwise your script with the additions should work, i.e.
#!/usr/bin/awk -f
# Apply the FOAM extraction only to lines inside the /begin regex/../end regex/ range.
BEGIN {
FS = "\"" # fields are separated by double quotes
}
/begin regex/,/end regex/{ # the action brace follows the range pattern on the same line
line="";
for (i=1; i<=NF; i++) {
if (i != 2) line=line $i; # rebuild the line without field 2
}
split(line, v, ",");
if (v[5] ~ "FOAM") {
print NR, v[5];
}
}
OR are you using an old Unix, where there is old awk as /usr/bin/awk and New awk as /usr/bin/nawk. Also see if you have /usr/xpg4/bin/awk or gawk (path could be anything).
Finally, show us the error messages you are getting.
I hope this helps.