AWK script to help format averages of columns - awk

Below, I am trying to have this awk script display each student's average as well as the average of each exam. I know it's a matter of where each line of code is placed as to how it's executed. This is what I need it to look like:
Name Exam 1 Exam 2 Exam 3 Exam 4 Average
Joe 0.0 75 87 91
John 0.0 86 72 83
Exam 1 Average: 0.0
Exam 2 Average: 80.5
Exam 3 Average: 79.5
#!/usr/bin/awk -f
NR == 1{
printf "%s \t %28s %7s %7s %7s %7s\n", "Name", "Exam 1", "Exam 2", "Exam 3", "Exam 4", "Av\
erage"
}
{
examSTUAVG = ($3 + $4 + $5) / 4;
printf "%s \t %28s %7s %7s %7s %7.1f\n", $1, "0", $3, $4, $5,examSTUAVG
{exam2Total += $3}
{exam3Total += $4}
{exam4Total += $5}
printf "Exam 1 Average is %19s\n", "0.0"
printf "Exam 2 Average is %19.1f\n", exam2Total / NR
printf "Exam 3 Average is %19.1f\n", exam3Total / NR
printf "Exam 4 Average is %19.1f\n", exam4Total / NR
}
{ print ""}

You need this script; you can save it as program.awk. I added an END block so the per-exam averages are printed once, at the end.
#!/usr/bin/awk -f
NR == 1 {
    # header row: just echo the column titles
    printf "%s \t %28s %7s %7s %7s %7s\n", "Name", "Exam 1", "Exam 2", "Exam 3", "Exam 4", "Average"
    next
}
{
    # per-student average over all four exams (including Exam 1, which is 0.0 in this data)
    examSTUAVG = ($2 + $3 + $4 + $5) / 4
    printf "%s \t %28s %7s %7s %7s %7.1f\n", $1, $2, $3, $4, $5, examSTUAVG
    exam1Total += $2
    exam2Total += $3
    exam3Total += $4
    exam4Total += $5
}
END {
    myrows = NR - 1    # don't count the header row
    printf "Exam 1 Average is %19.1f\n", exam1Total / myrows
    printf "Exam 2 Average is %19.1f\n", exam2Total / myrows
    printf "Exam 3 Average is %19.1f\n", exam3Total / myrows
    printf "Exam 4 Average is %19.1f\n", exam4Total / myrows
}
The input is the file data.txt.
Name Exam 1 Exam 2 Exam 3 Exam 4 Average
Joe 0.0 75 87 91
John 0.0 86 72 83
And execute it (after making it executable with chmod +x program.awk) as:
./program.awk data.txt
I got this output:
Name Exam 1 Exam 2 Exam 3 Exam 4 Average
Joe 0.0 75 87 91 63.2
John 0.0 86 72 83 60.2
Exam 1 Average is 0.0
Exam 2 Average is 80.5
Exam 3 Average is 79.5
Exam 4 Average is 87.0
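As a side note, the same END-block pattern generalizes to any number of exam columns by keeping the running totals in an array instead of one variable per exam. A minimal sketch, assuming the same layout (name in $1, numeric scores in $2..$NF); the column formatting is simplified:
#!/usr/bin/awk -f
NR == 1 { print; next }                    # echo the header line as-is
{
    sum = 0
    printf "%s", $1
    for (i = 2; i <= NF; i++) {            # fields 2..NF are exam scores
        total[i] += $i                     # running total per exam column
        sum += $i
        printf " %7s", $i
    }
    printf " %7.1f\n", sum / (NF - 1)      # per-student average
    nexams = NF - 1
}
END {
    for (i = 2; i <= nexams + 1; i++)
        printf "Exam %d Average is %19.1f\n", i - 1, total[i] / (NR - 1)
}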

Related

For each unique occurrence in field, transform each unique occurrence in another field in a different column

I have a file
splice_region_variant,intron_variant A1CF 1
3_prime_UTR_variant A1CF 18
intron_variant A1CF 204
downstream_gene_variant A1CF 22
synonymous_variant A1CF 6
missense_variant A1CF 8
5_prime_UTR_variant A2M 1
stop_gained A2M 1
missense_variant A2M 15
splice_region_variant,intron_variant A2M 2
synonymous_variant A2M 2
upstream_gene_variant A2M 22
intron_variant A2M 308
missense_variant A4GNT 1
intron_variant A4GNT 21
5_prime_UTR_variant A4GNT 3
3_prime_UTR_variant A4GNT 7
This file is sorted by $2
for each unique element in $2, I want to turn each unique element of $1 into a column, with the corresponding value from $3, or 0 if the record is not there, so that I have:
splice_region_variant,intron_variant 3_prime_UTR_variant intron_variant downstream_gene_variant synonymous_variant missense_variant 5_prime_UTR_variant stop_gained upstream_gene_variant
A1CF 1 18 204 22 6 8 0 0 0
A2M 2 0 308 0 2 15 1 1 22
A4GNT 0 7 21 0 0 22 3 0 0
test file:
a x 2
b,c x 4
dd x 3
e,e,t x 5
a b 1
cc b 2
e,e,t b 1
This is what I'm getting:
a b,c dd e,e,t cc
x 5 2 4 3
b 1 2 1
EDIT: This might be doing it, but it doesn't output 0s in blank fields:
gawk 'BEGIN {FS = OFS = "\t"}
NR > 1 {data[$2][$1] = $3; blocks[$1]}
END {
PROCINFO["sorted_in"] = "#ind_str_asc"
# header
printf "gene"
for (block in blocks) {
printf "%s%s", OFS, block
}
print ""
# data
for (ts in data) {
printf "%s", ts
for (block in blocks) {
printf "%s%s", OFS, data[ts][block]
}
print ""
}
}' file
modified from https://unix.stackexchange.com/questions/424642/dynamic-transposing-rows-to-columns-using-awk-based-on-row-value
If you want to print 0 if a certain value is absent, you could do something like this:
val = data[ts][block] ? data[ts][block] : 0;
printf "%s%s", OFS, val

Print sorted output with awk to avoid pipe sort command

I'm trying to match the lines containing (123) and then manipulate field 2, replacing x and + with spaces, which yields 4 columns, and swapping column 3 with column 4.
Finally, I print the result sorted numerically, first by column 3 and then by column 4.
I'm able to get that output by piping the awk output through the sort command, in this way:
$ echo "
0: 1920x1663+0+0 kpwr(746)
323: 892x550+71+955 kpwr(746)
211: 891x550+1003+410 kpwr(746)
210: 892x451+71+410 kpwr(746)
415: 891x451+1003+1054 kpwr(746)
1: 894x532+70+330 kpwr(123)
324: 894x532+1001+975 kpwr(123)
2: 894x631+1001+330 kpwr(123)
212: 894x631+70+876 kpwr(123)
61: 892x1+71+375 kpwr(0)
252: 892x1+71+921 kpwr(0)" |
awk '/\(123\)/{b = gensub(/(.+)x(.+)\+(.+)\+(.+)/, "\\1 \\2 \\4 \\3", "g", $2); print b}' |
sort -k3 -k4 -n
894 532 330 70
894 631 330 1001
894 631 876 70
894 532 975 1001
How can I get the same output using only awk without the need to pipe sort? Thanks for any help.
Here is how you can get it from GNU awk itself:
awk '/\(123\)/{
$2 = gensub(/(.+)x(.+)\+(.+)\+(.+)/, "\\1 \\2 \\4 \\3", "g", $2)
split($2, a) # split by space and store into array a
# store array by index 3 and 4
rec[a[3]][a[4]] = (rec[a[3]][a[4]] == "" ? "" : rec[a[3]][a[4]] ORS) $2
}
END {
PROCINFO["sorted_in"]="#ind_num_asc" # sort by numeric key ascending
for (i in rec) # print stored array rec
for (j in rec[i])
print rec[i][j]
}' file
894 532 330 70
894 631 330 1001
894 631 876 70
894 532 975 1001
Can you handle GNU awk?
$ gawk '
BEGIN {
PROCINFO["sorted_in"]="#val_num_asc" # for order strategy
}
/\(123\)$/ { # pick records
split($2,t,/[+x]/) # split 2nd field
if((t[4] in a) && (t[3] in a[t[4]])) { # if index collision
n=split(a[t[4]][t[3]],u,ORS) # split stacked element
u[n+1]=t[1] OFS t[2] OFS t[4] OFS t[3] # add new data
delete a[t[4]][t[3]] # del before rebuilding
for(i in u) # sort on whole record
a[t[4]][t[3]]=a[t[4]][t[3]] ORS u[i] # restack to element
} else
a[t[4]][t[3]]=t[1] OFS t[2] OFS t[4] OFS t[3] # no collision, just add
}
END {
PROCINFO["sorted_in"]="#ind_num_asc" # strategy on output
for(i in a)
for(j in a[i])
print a[i][j]
}' file
Output:
894 532 330 70
894 631 330 1001
894 631 876 70
894 532 975 1001
With colliding data like:
1: 894x532+70+330 kpwr(123) # this
1: 123x456+70+330 kpwr(123) # and this, notice order
324: 894x532+1001+975 kpwr(123)
2: 894x631+1001+330 kpwr(123)
212: 894x631+70+876 kpwr(123)
output would be:
123 456 330 70 # ordered by the whole record when collision
894 532 330 70
894 631 330 1001
894 631 876 70
894 532 975 1001
I was almost done writing my answer when I saw it was essentially the same as @anubhava's, so here is a small tweak to his solution :) This one will also take care of multiple lines with the same values.
awk '
BEGIN{
PROCINFO["sorted_in"]="#ind_num_asc"
}
/\(123\)/{
$2 = gensub(/(.+)x(.+)\+(.+)\+(.+)/, "\\1 \\2 \\4 \\3", "g", $2)
split($2, a," ")
arr[a[3]][a[4]] = (arr[a[3]][a[4]]!=""?arr[a[3]][a[4]] ORS:"")$2
}
END {
for (i in arr){
for (j in arr[i]){ print arr[i][j] }
}
}' Input_file
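If the two-level array feels heavy, here is a sketch of an alternative (still GNU awk) that flattens both sort keys into a single zero-padded string index, so the plain string ordering of the keys matches the numeric ordering; it assumes the offsets fit in nine digits:
gawk '
/\(123\)/ {
    split($2, t, /[x+]/)                    # t[1]=W t[2]=H t[3]=X-off t[4]=Y-off
    key = sprintf("%09d %09d", t[4], t[3])  # zero-pad so string order == numeric order
    rec = t[1] OFS t[2] OFS t[4] OFS t[3]
    out[key] = (out[key] == "" ? "" : out[key] ORS) rec
}
END {
    PROCINFO["sorted_in"] = "#ind_str_asc"  # plain string sort on the flat key
    for (k in out) print out[k]
}' file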

AWK failing to sum floats

I am trying to sum the last 12 values in a field in a particular CSV file, but AWK is failing to correctly sum the values. If I output the data to a new file and then run the same AWK statement against the new file, it works.
Here are the contents of the original file. The fields are separated by ";"
I want to sum the values in the 3rd field
$ tail -12 OriginalFile.csv
02/02/2020 10:30:00;50727.421;0.264;55772.084;0.360;57110.502;0.384
02/02/2020 10:35:00;50727.455;0.408;55772.126;0.504;57110.548;0.552
02/02/2020 10:40:00;50727.489;0.408;55772.168;0.504;57110.593;0.540
02/02/2020 10:45:00;50727.506;0.204;55772.193;0.300;57110.621;0.336
02/02/2020 10:50:00;50727.541;0.420;55772.236;0.516;57110.667;0.552
02/02/2020 10:55:00;50727.566;0.300;55772.269;0.396;57110.703;0.432
02/02/2020 11:00:00;50727.590;0.288;55772.300;0.372;57110.737;0.408
02/02/2020 11:05:00;50727.605;0.180;55772.321;0.252;57110.762;0.300
02/02/2020 11:10:00;50727.621;0.192;55772.344;0.276;57110.786;0.288
02/02/2020 11:15:00;50727.659;0.456;55772.389;0.540;57110.835;0.588
02/02/2020 11:20:00;50727.681;0.264;55772.417;0.336;57110.866;0.372
02/02/2020 11:25:00;50727.704;0.276;55772.448;0.372;57110.900;0.408
I used the following code to print the original value and the running sum of field 3 for each record, but it returns the same summed value for every line:
$ awk 'BEGIN { FS = ";" } ; { sum += $3 } { print $3, sum }' OriginalFile.csv | tail -12
0.264 2.00198e+09
0.408 2.00198e+09
0.408 2.00198e+09
0.204 2.00198e+09
0.420 2.00198e+09
0.300 2.00198e+09
0.288 2.00198e+09
0.180 2.00198e+09
0.192 2.00198e+09
0.456 2.00198e+09
0.264 2.00198e+09
0.276 2.00198e+09
If I output the contents of the file into a different file, the same code works as expected
$ tail -12 OriginalFile.csv > testfile2.csv
$ awk 'BEGIN { FS = ";" } ; { sum += $3 } { print $3, sum }' testfile2.csv
0.264 0.264
0.408 0.672
0.408 1.08
0.204 1.284
0.420 1.704
0.300 2.004
0.288 2.292
0.180 2.472
0.192 2.664
0.456 3.12
0.264 3.384
0.276 3.66
How can I get the correct output from the original file without having to create a new file?
As @Shawn's excellent comment points out, the order in which you pipe in your data is the problem. By the time awk reaches the 12th line from the end, sum is already 2.00198e+09; adding small fractions to that doesn't change the first six significant digits, so every line appears to print "the same output".
Simply:
tail -12 OriginalFile.csv | awk 'BEGIN { FS = ";" } ; { sum += $3 } { print $3, sum }'
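To see why every line of the unsorted pipeline seemed to print the same total: awk's default output format for non-integral numbers (OFMT, "%.6g") shows only six significant digits, so adding a small fraction to a two-billion total does not change the printed value:
$ awk 'BEGIN { sum = 2.00198e+09; print sum + 0.264 }'
2.00198e+09
And if you would rather not involve tail at all, a sketch of a pure-awk ring buffer that keeps only the last 12 values of field 3 (same field layout as the question):
awk -F';' '
{ buf[NR % 12] = $3 }            # ring buffer: only the last 12 values survive
END {
    for (i in buf) sum += buf[i]
    printf "%.3f\n", sum         # prints 3.660 for the sample data
}' OriginalFile.csv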

awk to match field between two files and use conditions on match

I am trying to look for $2 of file1 (skipping the header) in $2 of file2, and if they match and the value in $10 is > 30 and $11 is > 49, then print the line to an output file. The awk below has syntax errors in it, though shellcheck didn't report any. Both the input and output are tab-delimited. I think the below is close, but I'm not sure what is wrong. Thank you :).
awk -F'\t' -v OFS='\t' 'NR==FNR{A[$2];next}$2 in A
{if($10 >.5 OFS $11 > 49)
print ; next
' file1 file2
awk: cmd. line:2: {if($10 >.5 OFS $11 > 49)
awk: cmd. line:2: ^ syntax error
awk: cmd. line:3: print ; next
awk: cmd. line:3: ^ unexpected newline or end of string
file1
Missing in IDP but found in Reference:
2 166848646 G A exonic SCN1A 68 13 16;20 0;0 17;15 0;0 0;0 0;0 c.[5139C>T]+[=] 52.94
file2
chr2 166245425 SCN2A AMPL5155065355 SNP Het C/T C T 54 100 50 23 27
chr2 166848646 SCN1A AMPL1543060606 SNP Het G/A G A 52.9411764706 100 68 32 36
desired output
2 166848646 G A exonic SCN1A 68 13 16;20 0;0 17;15 0;0 0;0 0;0 c.[5139C>T]+[=] 52.94
edit with new awk
awk -F'\t' -v OFS='\t' 'NR==FNR{A[$2];next}$2 in A {
if($10 >.5 OFS $11 > 49) >>> if($10 >.5 && $11 > 49)
print }
' file1 file2 > out
awk: cmd. line:2: if($10 >.5 OFS $11 > 49) >>> if($10 >.5 && $11 > 49)
awk: cmd. line:2: ^ syntax error
here you go...
$ awk 'BEGIN{FS=OFS="\t"} NR==FNR{a[$2]; next}
($2 in a) && $10>30 && $11>49 ' file1 file2

AWK - Select lines to print according to score

I have a tab-separated file containing a series of lemmas with associated scores.
The file contains 5 columns; the first column is the lemma and the third is the one that contains the score. What I need to do is print the line as it is when a lemma is not repeated, and print the line with the highest score when a lemma is repeated.
IN
Lemma --- Score --- ---
cserép 06a 55 6 bueno
darázs 05 38 1 bueno
dél 06a 34 1 bueno
dér 06a 29 1 bueno
díj 05 14 89 malo
díj 06a 2 101 malo
díj 06b 2 101 malo
díj 07 90 13 bueno
díj 08a 2 101 malo
díj 08b 2 101 malo
egér 06a 66 5 bueno
fonal 05 12 1 bueno
fonal 07 52 4 bueno
Desired output
Lemma --- Score --- ---
cserép 06a 55 6 bueno
darázs 05 38 1 bueno
dél 06a 34 1 bueno
dér 06a 29 1 bueno
díj 07 90 13 bueno
egér 06a 66 5 bueno
fonal 07 52 4 bueno
This is what I have done, but it only works when the lemma is repeated only once.
BEGIN {
OFS=FS="\t";
flag="";
}
{
id=$1;
if (id != flag)
{
if (line != "")
{
sub("^;","",line);
z=split(line,A,";");
if ((A[3] > A[8]) && (A[8] != ""))
{
print A[1]"\t"A[2]"\t"A[3]"\t"A[4]"\t"A[5];
}
else if ((A[8] > A[3]) && (A[8] != ""))
{
print A[6]"\t"A[7]"\t"A[8]"\t"A[9]"\t"A[10]
}
else
{
print A[1]"\t"A[2]"\t"A[3]"\t"A[4]"\t"A[5];
}
}
line = "";
flag=id;
}
line = line ";" $1 ";" $2 ";" $3 ";" $4 ";" $5;
}
END {
line=line ";"$1";"$2";"$3";"$4";"$5
sub("^;","",line);
z=split(line,A,";");
if ((A[3] > A[8]) && (A[8] != ""))
{
print A[1]"\t"A[2]"\t"A[3]"\t"A[4]"\t"A[5];
}
else if ((A[8] > A[3]) && (A[8] != ""))
{
print A[6]"\t"A[7]"\t"A[8]"\t"A[9]"\t"A[10]
}
else
{
print A[1]"\t"A[2]"\t"A[3]"\t"A[4]"\t"A[5]
}
}
This one doesn't require the file to be sorted by lemma, but it keeps all the lines to be printed in memory (one for each lemma), so it may not be appropriate for a file with millions of different lemmas.
It also does not respect the order of the original file.
Finally, it assumes that all scores are non-negative!
$ cat lemma.awk
BEGIN { FS = OFS = "\t" }
NR == 1 { print }
NR > 1 {
if ($3 > score[$1]) {
score[$1] = $3
line[$1] = $0
}
}
END { for (lemma in line) print line[lemma] }
$ awk -f lemma.awk lemma.txt
Lemma --- Score --- ---
cserép 06a 55 6 bueno
díj 07 90 13 bueno
fonal 07 52 4 bueno
darázs 05 38 1 bueno
egér 06a 66 5 bueno
dél 06a 34 1 bueno
dér 06a 29 1 bueno
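If keeping the file's original order matters, a small extension of the same idea records the order in which each lemma is first seen (same positive-score assumption as above):
BEGIN { FS = OFS = "\t" }
NR == 1 { print; next }                        # keep the header
{
    if (!($1 in best)) order[++n] = $1         # remember first-seen order
    if ($3 > best[$1]) { best[$1] = $3; line[$1] = $0 }
}
END { for (i = 1; i <= n; i++) print line[order[i]] }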
Tested with GNU awk:
prevLemma != $1 {
if( prevLemma ) {
print line;
}
prevLemma = $1;
prevScore = $3;
line = $0;
}
prevLemma == $1 { if( prevScore < $3 ) {
prevScore = $3;
line = $0;
}
}
END { print line;}
The assumption is that the file is sorted by lemma.
When the lemma changes (or at the very beginning, when the variable is empty), the lemma, score, and line are saved.
When the lemma changes (or in the END block), the line for the previous lemma is printed.
When the current line belongs to the same lemma and has a higher score, the values are saved again.
$ cat tst.awk
$1 != prev { printf "%s", maxLine; maxLine=""; max=$3; prev=$1 }  # new lemma: flush the stored line
$3 >= max  { max=$3; maxLine=$0 ORS }                             # >= so the first line of each lemma (and the header) is kept
END        { printf "%s", maxLine }                               # flush the last lemma
$ awk -f tst.awk file
Lemma --- Score --- ---
cserép 06a 55 6 bueno
darázs 05 38 1 bueno
dél 06a 34 1 bueno
dér 06a 29 1 bueno
díj 07 90 13 bueno
egér 06a 66 5 bueno
fonal 07 52 4 bueno
Use a script:
# assumes the file is sorted by lemma, like the answers above
$1 != prev { if (line != "") print line; prev = $1; best = $3; line = $0; next }
$3 > best  { best = $3; line = $0 }
END        { if (line != "") print line }
Actually, this might be better done with Perl.