Sum column and count lines - awk

I am trying to sum certain numbers in colum 2, it works with my code. But I want to count also how many times the same value in colum 2 is repeated and print in the last column.
file1
36 2605 1 2
36 2605 1 2
36 2603 1 2
36 2605 1 2
36 2605 1 2
36 2605 1 2
36 2606 1 2
Output Desired
2603 36 1 2 1
2605 180 5 10 5
2606 36 1 2 1
I tried
awk '{a[$2]+=$1}{b[$2]+=$3}{c[$2]+=$4;count[$2]+=$2}END{for(i in a)print i,a[i],b[i],c[i],count[i]}' file1
Thanks in advance

Renamed the vars and added pretty print:
awk '
{
sum1[$2]+=$1
sum3[$2]+=$3
sum4[$2]+=$4
count[$2]++
len2=((l=length($2))>len2?l:len2)
len1=((l=length(sum1[$2]))>len1?l:len1)
len3=((l=length(sum3[$2]))>len3?l:len3)
len4=((l=length(sum4[$2]))>len4?l:len4)
len5=((l=length(sum5[$2]))>len5?l:len5)
}
END {
for(i in count) {
printf "%*d %*d %*d %*d %*d\n",
len2,i,len1,sum1[i],len3,sum3[i],len4,sum4[i],len5,count[i]
}
}' file
Output:
2603 36 1 2 1
2605 180 5 10 5
2606 36 1 2 1

Space chars are relatively inexpensive these days, you should really consider getting some for your code, especially if you want other people to read it to help you debug it! Here's the code you posted:
awk '{a[$2]+=$1}{b[$2]+=$3}{c[$2]+=$4;count[$2]+=$2}END{for(i in a)print i,a[i],b[i],c[i],count[i]}' file1
and here it is after having been run through a code beautifier (I used gawk -o):
{
a[$2] += $1
}
{
b[$2] += $3
}
{
c[$2] += $4
count[$2] += $2
}
END {
for (i in a) {
print i, a[i], b[i], c[i], count[i]
}
}
See how just by adding some white space it's now vastly easier to understand and so the bug in how count[$2] is being populated is glaringly obvious? Some meaningful variable names are always extremely useful too and I hear alphanumeric chars are on special right now!
FWIW here's how I'd do this:
$ cat tst.awk
BEGIN { keyFldNr = 2 }
{
numOutFlds = 0
for (i=1; i<=NF; i++) {
if (i != keyFldNr) {
sum[$keyFldNr,++numOutFlds] += $i
}
}
cnt[$keyFldNr]++
}
END {
for (key in cnt) {
printf "%s%s", key, OFS
for (i=1; i<=numOutFlds; i++) {
printf "%s%s", sum[key,i], OFS
}
print cnt[key]
}
}
$ awk -f tst.awk file
2603 36 1 2 1
2605 180 5 10 5
2606 36 1 2 1
$ awk -f tst.awk file | column -t
2603 36 1 2 1
2605 180 5 10 5
2606 36 1 2 1
Notice that it'll work as-is no matter how many fields you have on each line and if you need to use a different field for the key that you count and sum on then you just change the value of keyFldNr in the BEGIN section from 2 to whatever you want it to be.

A non-awk approach, using the very useful GNU datamash, which is designed for tasks like this one:
$ datamash -Ws groupby 2 sum 1,3,4 count 2 < input.txt
2603 36 1 2 1
2605 180 5 10 5
2606 36 1 2 1
Read as: For each group of rows with the same value in column 2, display that value, the sums of columns 1, 3 and 4, and the number of rows in the group.

You've almost nailed it, you're not increasing count[$2] properly.
$ awk '{a[$2]+=$1;b[$2]+=$3;c[$2]+=$4;count[$2]++}
END{for(i in a) print i,a[i],b[i],c[i],count[i]}' file
2603 36 1 2 1
2605 180 5 10 5
2606 36 1 2 1

no need external program, faster ~21ms, tried on pure gnu awk
awk '{if($0~/^[A-Za-z0-9]/)a[NR]=$2" "$1" "$3" "$4}END{asort(a);$0="";for(;i++<NR;){split(a[i],b);if($1==""||b[1]==$1){$2+=b[2];$3+=b[3];$4+=b[4];$5++} else {print;$2=b[2];$3=b[3];$4=b[4];$5=1} $1=b[1]} print}' file1

Related

How to calculate anomaly using awk

A have a file:
file.txt
1 32
2 34
3 32
4 43
5 25
6 34
7 65
8 34
9 23
10 44
I would like to find anomaly on send column:
my below script printing anomalies considering row-2 to row-10 values. It is not considering row-1 values.
awk 'FNR==NR{
f=1;
if($1 >= 1 && $1 <= 10){
count++;
SUM+=$2;
};
next
}
FNR==1 && f==1{
AVG=SUM/count;
next
}
($1 >= 1 && $1 <= 10){
print $1, $2-AVG
}
' file.txt file.txt
My desire output:
1 -4.6
2 -2.6
3 -4.6
4 6.4
5 -11.6
6 -2.6
7 28.4
8 -2.6
9 -13.6
10 7.4
I got a solution of it:
awk '{f=$1>=1 && $1<=10}f && NR==FNR{sum+=$2; c++; next}f{ print $1, $2-(sum/c) }' file.txt file.txt
I am still wondering why the first script is not giving correct answer.
Since this is just 2 columns file, this can be done in a single pass awk also:
awk '{map[$1] = $2; s += $2}
END {mean = s/NR; for (i in map) print i, map[i] - mean}' file
1 -4.6
2 -2.6
3 -4.6
4 6.4
5 -11.6
6 -2.6
7 28.4
8 -2.6
9 -13.6
10 7.4
The first script in the OP is not giving the correct value, because you skip the first line in the second pass of your file. This is seen in the statement (FNR==1 && f==1) { AVG=sum/count; next }. Due to the next statement, you skip the computation of the deviation from the mean value for the first record.
This is an efficient computation of the deviation from the mean in a double pass:
awk '(NR==FNR){s+=$2;c++;next}
(FNR==1){s/=c}
{print $1,$2-s}' file file
If file contains values bigger than 10 or smaller than 1 in the first, column, but you only want to see this for values in the range of [0,10], then you can do:
awk '($1<1 || $1>10) {next}
(NR==FNR){s+=$2;c++;next}
(FNR==1){s/=c}
{print $1,$2-s}' file file
There are still other optimizations that can be done, but these only become beneficial when working with extremely large files (many millions of lines).

Column manipulating using Bash & Awk

Let's assume have an example1.txt file consisting of few rows.
item item item
A B C
100 20 2
100 22 3
100 23 4
101 26 2
102 28 2
103 29 3
103 30 2
103 32 2
104 33 2
104 34 2
104 35 2
104 36 3
There are few commands I would like to perform to filter out the txt files and add a few more columns.
At first, I want to apply a condition when item C is equal to 2. Using awk command I can do that in the following way.
Therefore The return text file would be:
awk '$3 == 2 { print $1 "\t" $2 "\t" $3} ' example1.txt > example2.txt
item item item
A B C
100 20 2
101 26 2
102 28 2
103 30 2
103 32 2
104 33 2
104 34 2
104 35 2
Now I want to count two things:
I want to count the total unique number in column 1.
For example, in the above case example2.txt, it would be:
(100,101,102,103,104) = 5
And I would like to add the repeating column A number and add that to a new column.
I would like to have like this:
item item item item
A B C D
100 20 2 1
101 26 2 1
102 28 2 1
103 30 2 2
103 32 2 2
104 33 2 3
104 34 2 3
104 35 2 3
~
Above Item D column (4th), 1st row is 1, because it did not have any repetitive. but in 4th row, it's 2 because 103 is repetitive twice. Therefore I have added 2 in the 4th and 5th columns. Similarly, the last three columns in Item 4 is 3, because item A is repetitive three times in these three columns.
You may try this awk:
awk -v OFS='\t' 'NR <= 2 {
print $0, (NR == 1 ? "item" : "D")
}
FNR == NR && $3 == 2 {
++freq[$1]
next
}
$3 == 2 {
print $0, freq[$1]
}' file{,}
item item item item
A B C D
100 20 2 1
101 26 2 1
102 28 2 1
103 30 2 2
103 32 2 2
104 33 2 3
104 34 2 3
104 35 2 3
Could you please try following. In case you want to save output into same Input_file then append > temp && mv temp Input_file to following code.
awk '
FNR==NR{
if($3==2){
a[$1,$3]++
}
next
}
FNR==1{
$(NF+1)="item"
print
next
}
FNR==2{
$(NF+1)="D"
print
next
}
$3!=2{
next
}
FNR>2{
$(NF+1)=a[$1,$3]
}
1
' Input_file Input_file | column -t
Output will be as follows.
item item item item
A B C D
100 20 2 1
101 26 2 1
102 28 2 1
103 30 2 2
103 32 2 2
104 33 2 3
104 34 2 3
104 35 2 3
Explanation: Adding detailed explanation for above code.
awk ' ##Starting awk program fro here.
FNR==NR{ ##Checking condition if FNR==NR which will be TRUE when 1st time Input_file is being read.
if($3==2){ ##Checking condition if 3rd field is 2 then do following.
a[$1,$3]++ ##Creating an array a whose index is $1,$3 and keep adding its index with 1 here.
}
next ##next will skip further statements from here.
}
FNR==1{ ##Checking condition if this is first line.
$(NF+1)="item" ##Adding a new field with string item in it.
print ##Printing 1st line here.
next ##next will skip further statements from here.
}
FNR==2{ ##Checking condition if this is second line.
$(NF+1)="D" ##Adding a new field with string item in it.
print ##Printing 1st line here.
next ##next will skip further statements from here.
}
$3!=2{ ##Checking condition if 3rd field is NOT equal to 2 then do following.
next ##next will skip further statements from here.
}
FNR>2{ ##Checking condition if line is greater than 2 then do following.
$(NF+1)=a[$1,$3] ##Creating new field with value of array a with index of $1,$3 here.
}
1 ##1 will print edited/non-edited lines here.
' Input_file Input_file ##Mentioning Input_file names 2 times here.
Similar to the others, but using awk with a single-pass and storing the information in arrays regarding the records seen and the count for D with the arrays ord and Dcnt used to map the information for each, e.g.
awk '
FNR == 1 { h1=$0"\titem" } # header 1 with extra "\titem"
FNR == 2 { h2=$0"\tD" } # header 2 with exter "\tD"
FNR > 2 && $3 == 2 { # remaining rows with $3 == 2
D[$1]++ # for D colum times A seen
seen[$1,$2] = $0 # save records seen
ord[++n] = $1 SUBSEP $2 # save order all records appear
Dcnt[n] = $1 # save order mapped to $1 for D
}
END {
printf "%s\n%s\n", h1, h2 # output headers
for (i=1; i<=n; i++) # loop outputing info with D column added
print seen[ord[i]]"\t"D[Dcnt[i]]
}
' example.txt
(note: SUBSEP is a built-in variable that corresponds to the substring separator used when using the comma to concatenate fields for an array index, e.g. seen[$1,$2] to allow comparison outside of an array. It is by default "\034")
Example Output
item item item item
A B C D
100 20 2 1
101 26 2 1
102 28 2 1
103 30 2 2
103 32 2 2
104 33 2 3
104 34 2 3
104 35 2 3
Always more than one way to skin-the-cat with awk.
Assuming the file is not a big file;
awk 'NR==FNR && $3 == 2{a[$1]++;next}$3==2{$4=a[$1];print;}' file.txt file.txt
You parse through the file twice. In the first iteration, you calculate the 4th column and have it in an array. In the second parsing, we set the count as 4th column,and get the whole line printed.

To sum adjacent lines from the same column in AWK

I have a file:
A 1 20
B 2 21
C 3 22
D 4 23
I have to find the sum of values from 0-3rd line then the sum of line 1 to 3 and finally the sum of line 2 to 3. The last value has to be simply 0. In another words, I want to get an output file with two columns where the values are the sum of adjacent lines something like this:
10 86
9 66
7 45
0 0
The last row has to have two zeros as values. How to do it in AWK?
This might be what you want:
$ tac file | awk 'NR==1{ print 0, 0; a=$2; b=$3; next} { print a+=$2, b+=$3 }' | tac
10 86
9 66
7 45
0 0
Avoid two tacs by accumulating the sums in two arrays:
$ awk '{
for (i = 1; i <= NR; ++i) { sum2[i] += $2; sum3[i] += $3 }
}
END {
sum2[NR] = sum3[NR] = 0
for (i = 1; i <= NR; ++i) print sum2[i], sum3[i]
}' file
10 86
9 66
7 45
0 0
The value of each row is added into all the previous rows. Once all rows have been processed, the last values are zeroed out and everything is printed.

How to sum a selection of columns?

I'd like to sum multiple columns in a text file similar to this:
GeneA Sample 34 7 8 16
GeneA Sample 17 7 10 91
GeneA Sample 42 9 8 11
I'd like to generate the sum at the bottom of columns 3-5 so it will look like:
GeneA Sample 34 7 8 16
GeneA Sample 17 7 10 91
GeneA Sample 42 9 8 11
93 23 26
I can use this for a single column but don't know how to specify a range of columns:
awk -F'\t' '{sum+=$3} END {print sum}' input file> out
The easiest way is just repeat summing for each column, i.
awk -F '\t' '{
s3 += $3
s4 += $4
s5 += $5
}
END {
print s3, s4, s5
}' input_file > out
In awk:
$ awk '
{
for(i=3;i<=NF;i++) # loop wanted fields
s[i]+=$i } # sum to hash, index on field #
END {
for(i=3;i<=NF;i++) # same old loop
printf "%s%s",s[i],(i==NF?ORS:OFS) } # output
' file
93 23 26 118
Currently the for loop goes thru every numeric field. Change the parameters if needed.
$ awk -v OFS='\t' '{s3+=$3; s4+=$4; s5+=$5; $1=$1} 1;
END {print "","",s3,s4,s5}' file
GeneA Sample 34 7 8 16
GeneA Sample 17 7 10 91
GeneA Sample 42 9 8 11
93 23 26
Try this. Note that NF just means number of fields. And AWK indexing starts with 1. So the example here has a range of 3 to the last col.
awk '{ for(i=3;i<=NF;i++) sum[i] += $i } END { for(i=3;i<=NF;i++) printf( "%d ", sum[i] ); print "" }' input_file
If you want fewer columns, say 3 and 4, then I'd suggest:
awk '{ for(i=3;i<=4 && i<=NF;i++) sum[i] += $i } END { for(i=3;i<=4 && i<=NF;i++) printf( "%d ", sum[i] ); print "" }' input_file

Average of a fixed number of rows

Given the following input:
256 1 4 1 130.363
256 1 4 2 128.332
256 1 4 3 130.262
256 1 4 4 128.395
256 1 4 5 128.484
64 2 4 1 95.227
64 2 4 2 96.582
64 2 4 3 95.785
64 2 4 4 93.944
64 2 4 5 97.398
64 4 4 1 143.519
64 4 4 2 143.579
64 4 4 3 143.937
64 4 4 4 142.292
64 4 4 5 143.304
I am trying to obtain the average of a given number of rows. In this case, I've got 5 samples indicated by the 4th column. So the expected output should be:
256 1 4 129.167
64 2 4 95.787
64 4 4 143.326
To loop over, I have tried something like
awk 'BEGIN {i = 1; while (s[$4] <= 5) { print $4 } }'
But it is not even printing what I want. Also tried this
awk '{array[$1" "$2]+=$5} END { for (i in array) {print i" " array[i]/length(array)}}'
$ awk '{curr = $1 OFS $2 OFS $3} curr!=prev {if (cnt) print prev, sum/cnt; prev=curr; sum=cnt=0} {sum+=$5; cnt++} END{if (cnt) print prev, sum/cnt}' file
256 1 4 129.167
64 2 4 95.7872
64 4 4 143.326
The differences between this and #NinjaGaiden's solution are that:
This relies on all the data associated with key values being
contiguous as shown in your sample input while NGs does not.
This does not save the contents of the input file in memory while NGs does.
This will print the output in the same order it occurred in the input while NGs will print it in random (hash) order.
Try this
awk '{k=$1" "$2" "$3; j[k]+=$5;z[k]+=1} END { for (x in j) { print x,j[x]/z[x] }} ' f
awk '{array[$1" "$2" "$3]+=$5} END { for (i in array) {print i" " array[i]/length(array)}}'
This also work with a variable number of rows
awk '
{
a[$1" "$2" "$3]+=$5
b[$1" "$2" "$3]++
} END { for (i in a) { print i, (a[i] / b[i]) }}
' file