Replace a pattern-matching field with field's value from previous row - awk

maybe the title is not a good description but, essentially, this is the problem I am trying to solve:
I have a text file with n rows and m space-separated fields in each row
if field j of row i matches a pattern, replace it with field j from row i - 1
I am not forced to use AWK (GAWK in this instance), but it seemed a good choice for this operation. This is the script I wrote and it works as expected, but I was wondering if there is a more time-efficient way to solve the problem
{
if ($0!~/NoData/) {
split($0, data, " ");
print $0
} else {
split($0, row, " ", seps);
for(i in row) {if (row[i]~/NoData/) row[i]=data[i]; else data[i]=row[i]; printf "%s%s", row[i], seps[i];}
printf "\n"
}
}
As a sample, the script, running on this input file
0.8147 0.2785 0.9572 0.7922 0.6787 0.7060
0.9058 0.5469 0.4854 0.9595 0.7577 0.0318
0.1270 0.9575 0.8003 0.6557 0.7431 0.2769
0.9134 0.9649 NoData 0.0357 0.3922 0.0462
0.6324 0.1576 NoData NoData 0.6555 0.0971
0.0975 0.9706 NoData NoData 0.1712 0.8235
should produce this result
0.8147 0.2785 0.9572 0.7922 0.6787 0.7060
0.9058 0.5469 0.4854 0.9595 0.7577 0.0318
0.1270 0.9575 0.8003 0.6557 0.7431 0.2769
0.9134 0.9649 0.8003 0.0357 0.3922 0.0462
0.6324 0.1576 0.8003 0.0357 0.6555 0.0971
0.0975 0.9706 0.8003 0.0357 0.1712 0.8235

awk '{for(i=1;i<=NF;i++){ if($i~/NoData/){ $i=last[i]; } last[i]=$i } }1' file
If you want to preserve the original formatting, you may use the below; if you have gawk, the 4th argument of split may be utilized.
awk '{
split($0,D,/[^[:space:]]*/);
s = "";
for(i=1;i<=NF;i++){
if($i~/NoData/){ $i = last[i]; }
last[i]=$i ;
s = s sprintf("%s%s",D[i],$i)
}
print s
}' file
OR by setting OFS="" or OFS=
awk -v OFS= '{
split($0,D,/[^[:space:]]*/);
for(i=1;i<=NF;i++){
if($i~/NoData/){ $i = last[i]; }
last[i]=$i ;
$i = sprintf("%s%s",D[i],$i)
}
}1' file
Example - 1 ( Preserve Formatting )
$ cat file
0.8147 0.2785 0.9572 0.7922 0.6787 0.7060
0.9058 0.5469 0.4854 0.9595 0.7577 0.0318
0.1270 0.9575 0.8003 0.6557 0.7431 0.2769
0.9134 0.9649 NoData 0.0357 0.3922 0.0462
0.6324 0.1576 NoData NoData 0.6555 0.0971
0.0975 0.9706 NoData NoData 0.1712 0.8235
$ awk '{
split($0,D,/[^[:space:]]*/);
s = "";
for(i=1;i<=NF;i++){
if($i~/NoData/){ $i = last[i]; }
last[i]=$i ;
s = s sprintf("%s%s",D[i],$i)
}
print s
}' file
0.8147 0.2785 0.9572 0.7922 0.6787 0.7060
0.9058 0.5469 0.4854 0.9595 0.7577 0.0318
0.1270 0.9575 0.8003 0.6557 0.7431 0.2769
0.9134 0.9649 0.8003 0.0357 0.3922 0.0462
0.6324 0.1576 0.8003 0.0357 0.6555 0.0971
0.0975 0.9706 0.8003 0.0357 0.1712 0.8235
Example - 2 ( Without Preserving Source Formatting )
It takes a single space as the output separator by default; in case you set OFS, it will override the default value.
$ cat file
0.8147 0.2785 0.9572 0.7922 0.6787 0.7060
0.9058 0.5469 0.4854 0.9595 0.7577 0.0318
0.1270 0.9575 0.8003 0.6557 0.7431 0.2769
0.9134 0.9649 NoData 0.0357 0.3922 0.0462
0.6324 0.1576 NoData NoData 0.6555 0.0971
0.0975 0.9706 NoData NoData 0.1712 0.8235
$ awk '{for(i=1;i<=NF;i++){ if($i~/NoData/){ $i=last[i]; } last[i]=$i } }1' file
0.8147 0.2785 0.9572 0.7922 0.6787 0.7060
0.9058 0.5469 0.4854 0.9595 0.7577 0.0318
0.1270 0.9575 0.8003 0.6557 0.7431 0.2769
0.9134 0.9649 0.8003 0.0357 0.3922 0.0462
0.6324 0.1576 0.8003 0.0357 0.6555 0.0971
0.0975 0.9706 0.8003 0.0357 0.1712 0.8235

Related

awk print lines until next match on the same line

I have the following type of data file:
0.033333 0.000000 0.000000
-46.956 -46.956 -23.678 -23.677 -23.055 -23.054 -22.974 -22.974 -8.033 -8.032
-7.375 -7.356 -7.182 -7.159 -6.695 -6.661 -6.628 -6.598 -4.477 -4.477
-4.470 -4.462 -4.387 -4.380 3.799 3.800 5.939 5.960 6.116 6.117
6.625 6.642 7.648 7.651 7.686 7.687 8.077 8.078 8.123 8.126
8.478 8.497 8.550 8.552 11.625 11.626 12.652 12.653 12.722 12.726
13.860 13.864 14.291 14.293 14.966 15.046 17.063 17.252 18.011 18.015
0.016667 0.000000 0.000000
-46.956 -46.956 -23.677 -23.677 -23.055 -23.054 -22.974 -22.974 -8.037 -8.036
-7.371 -7.361 -7.177 -7.165 -6.686 -6.669 -6.620 -6.605 -4.476 -4.475
-4.471 -4.465 -4.385 -4.382 3.811 3.812 5.942 5.952 6.115 6.115
6.629 6.638 7.651 7.653 7.688 7.689 8.072 8.073 8.122 8.123
8.491 8.501 8.556 8.556 11.612 11.612 12.665 12.665 12.730 12.733
13.835 13.837 14.288 14.289 14.991 15.031 17.132 17.225 18.053 18.055
0.000000 0.000000 0.000000
-46.956 -46.956 -23.677 -23.677 -23.055 -23.055 -22.974 -22.974 -8.038 -8.038
-7.366 -7.366 -7.172 -7.172 -6.678 -6.678 -6.613 -6.613 -4.475 -4.475
-4.469 -4.469 -4.384 -4.384 3.816 3.816 5.946 5.946 6.115 6.115
6.633 6.633 7.653 7.653 7.689 7.689 8.070 8.070 8.122 8.122
8.498 8.498 8.558 8.558 11.607 11.607 12.668 12.668 12.735 12.735
13.827 13.827 14.287 14.287 15.013 15.013 17.186 17.186 18.068 18.068
I need to change this to look like this:
0.033333 0.000000 0.000000 -46.956 -46.956 -23.678 -23.677 -23.055 -23.054 -22.974 -22.974 -8.033 -8.032 -7.375 -7.356 -7.182 -7.159 -6.695 -6.661 -6.628 -6.598 -4.477 -4.477 -4.470 -4.462 -4.387 -4.380 3.799 3.800 5.939 5.960 6.116 6.117 6.625 6.642 7.648 7.651 7.686 7.687 8.077 8.078 8.123 8.126 8.478 8.497 8.550 8.552 11.625 11.626 12.652 12.653 12.722 12.726 13.860 13.864 14.291 14.293 14.966 15.046 17.063 17.252 18.011 18.015
0.016667 0.000000 0.000000 -46.956 -46.956 -23.677 -23.677 -23.055 -23.054 -22.974 -22.974 -8.037 -8.036 -7.371 -7.361 -7.177 -7.165 -6.686 -6.669 -6.620 -6.605 -4.476 -4.475 -4.471 -4.465 -4.385 -4.382 3.811 3.812 5.942 5.952 6.115 6.115 6.629 6.638 7.651 7.653 7.688 7.689 8.072 8.073 8.122 8.123 8.491 8.501 8.556 8.556 11.612 11.612 12.665 12.665 12.730 12.733 13.835 13.837 14.288 14.289 14.991 15.031 17.132 17.225 18.053 18.055
0.000000 0.000000 0.000000 -46.956 -46.956 -23.677 -23.677 -23.055 -23.055 -22.974 -22.974 -8.038 -8.038 -7.366 -7.366 -7.172 -7.172 -6.678 -6.678 -6.613 -6.613 -4.475 -4.475 -4.469 -4.469 -4.384 -4.384 3.816 3.816 5.946 5.946 6.115 6.115 6.633 6.633 7.653 7.653 7.689 7.689 8.070 8.070 8.122 8.122 8.498 8.498 8.558 8.558 11.607 11.607 12.668 12.668 12.735 12.735 13.827 13.827 14.287 14.287 15.013 15.013 17.186 17.186 18.068 18.068
Basically look for the lines with 3 fields only and from there start to remove the line break character until the next line with 3 fields. Also I want to remove all the spaces at the beginning of the line with the 3 fields. Hope this is clearer from the above example.
I have tried the following code:
BEGIN {
ORS=" ";
}
NF==3 {x=NR+6} (NR<=x) {print}
Trouble is that I get a completely different result. I don't know how to add a \n character before the next pattern match. So I get:
0.033333 0.000000 0.000000 -46.956 -46.956 -23.678 -23.677 -23.055 -23.054 -22.974 -22.974 -8.033 -8.032 -7.375 -7.356 -7.182 -7.159 -6.695 -6.661 -6.628 -6.598 -4.477 -4.477 -4.470 -4.462 -4.387 -4.380 3.799 3.800 5.939 5.960 6.116 6.117 6.625 6.642 7.648 7.651 7.686 7.687 8.077 8.078 8.123 8.126 8.478 8.497 8.550 8.552 11.625 11.626 12.652 12.653 12.722 12.726 13.860 13.864 14.291 14.293 14.966 15.046 17.063 17.252 18.011 18.015 0.016667 0.000000 0.000000 -46.956 -46.956 -23.677 -23.677 -23.055 -23.054 -22.974 -22.974 -8.037 -8.036 -7.371 -7.361 -7.177 -7.165 -6.686 -6.669 -6.620 -6.605 -4.476 -4.475 -4.471 -4.465 -4.385 -4.382 3.811 3.812 5.942 5.952 6.115 6.115 6.629 6.638 7.651 7.653 7.688 7.689 8.072 8.073 8.122 8.123 8.491 8.501 8.556 8.556 11.612 11.612 12.665 12.665 12.730 12.733 13.835 13.837 14.288 14.289 14.991 15.031 17.132 17.225 18.053 18.055 0.000000 0.000000 0.000000 -46.956 -46.956 -23.677 -23.677 -23.055 -23.055 -22.974 -22.974 -8.038 -8.038 -7.366 -7.366 -7.172 -7.172 -6.678 -6.678 -6.613 -6.613 -4.475 -4.475 -4.469 -4.469 -4.384 -4.384 3.816 3.816 5.946 5.946 6.115 6.115 6.633 6.633 7.653 7.653 7.689 7.689 8.070 8.070 8.122 8.122 8.498 8.498 8.558 8.558 11.607 11.607 12.668 12.668 12.735 12.735 13.827 13.827 14.287 14.287 15.013 15.013 17.186 17.186 18.068
I also don't know how to get rid of all the space characters on the line with the pattern match.
One awk idea:
awk '
NF==3 { sub(/^[[:space:]]+/,"") # remove leading white space
printf "%s%s",eol,$0 # initially eol="" (undefined)
eol="\n" # next time print this line with a leading "\n" (to close out previous line)
next}
{ printf "%s%s",OFS,$0 } # OP will need to decide if the extra OFS is needed here or can be removed
END { print "" } # terminate last line of output with a "\n"
' file
This generates:
0.033333 0.000000 0.000000 -46.956 -46.956 -23.678 -23.677 -23.055 -23.054 -22.974 -22.974 -8.033 -8.032 -7.375 -7.356 -7.182 -7.159 -6.695 -6.661 -6.628 -6.598 -4.477 -4.477 -4.470 -4.462 -4.387 -4.380 3.799 3.800 5.939 5.960 6.116 6.117 6.625 6.642 7.648 7.651 7.686 7.687 8.077 8.078 8.123 8.126 8.478 8.497 8.550 8.552 11.625 11.626 12.652 12.653 12.722 12.726 13.860 13.864 14.291 14.293 14.966 15.046 17.063 17.252 18.011 18.015
0.016667 0.000000 0.000000 -46.956 -46.956 -23.677 -23.677 -23.055 -23.054 -22.974 -22.974 -8.037 -8.036 -7.371 -7.361 -7.177 -7.165 -6.686 -6.669 -6.620 -6.605 -4.476 -4.475 -4.471 -4.465 -4.385 -4.382 3.811 3.812 5.942 5.952 6.115 6.115 6.629 6.638 7.651 7.653 7.688 7.689 8.072 8.073 8.122 8.123 8.491 8.501 8.556 8.556 11.612 11.612 12.665 12.665 12.730 12.733 13.835 13.837 14.288 14.289 14.991 15.031 17.132 17.225 18.053 18.055
0.000000 0.000000 0.000000 -46.956 -46.956 -23.677 -23.677 -23.055 -23.055 -22.974 -22.974 -8.038 -8.038 -7.366 -7.366 -7.172 -7.172 -6.678 -6.678 -6.613 -6.613 -4.475 -4.475 -4.469 -4.469 -4.384 -4.384 3.816 3.816 5.946 5.946 6.115 6.115 6.633 6.633 7.653 7.653 7.689 7.689 8.070 8.070 8.122 8.122 8.498 8.498 8.558 8.558 11.607 11.607 12.668 12.668 12.735 12.735 13.827 13.827 14.287 14.287 15.013 15.013 17.186 17.186 18.068 18.068
Since you always have 7 lines per record, all you need is this, using GNU awk for multi-char RS and RT:
$ awk -v RS='([^\n]+\n){7}' -v ORS= '{$0=RT; $1=$1} 1' file
0.033333 0.000000 0.000000 -46.956 -46.956 -23.678 -23.677 -23.055 -23.054 -22.974 -22.974 -8.033 -8.032 -7.375 -7.356 -7.182 -7.159 -6.695 -6.661 -6.628 -6.598 -4.477 -4.477 -4.470 -4.462 -4.387 -4.380 3.799 3.800 5.939 5.960 6.116 6.117 6.625 6.642 7.648 7.651 7.686 7.687 8.077 8.078 8.123 8.126 8.478 8.497 8.550 8.552 11.625 11.626 12.652 12.653 12.722 12.726 13.860 13.864 14.291 14.293 14.966 15.046 17.063 17.252 18.011 18.015
0.016667 0.000000 0.000000 -46.956 -46.956 -23.677 -23.677 -23.055 -23.054 -22.974 -22.974 -8.037 -8.036 -7.371 -7.361 -7.177 -7.165 -6.686 -6.669 -6.620 -6.605 -4.476 -4.475 -4.471 -4.465 -4.385 -4.382 3.811 3.812 5.942 5.952 6.115 6.115 6.629 6.638 7.651 7.653 7.688 7.689 8.072 8.073 8.122 8.123 8.491 8.501 8.556 8.556 11.612 11.612 12.665 12.665 12.730 12.733 13.835 13.837 14.288 14.289 14.991 15.031 17.132 17.225 18.053 18.055
0.000000 0.000000 0.000000 -46.956 -46.956 -23.677 -23.677 -23.055 -23.055 -22.974 -22.974 -8.038 -8.038 -7.366 -7.366 -7.172 -7.172 -6.678 -6.678 -6.613 -6.613 -4.475 -4.475 -4.469 -4.469 -4.384 -4.384 3.816 3.816 5.946 5.946 6.115 6.115 6.633 6.633 7.653 7.653 7.689 7.689 8.070 8.070 8.122 8.122 8.498 8.498 8.558 8.558 11.607 11.607 12.668 12.668 12.735 12.735 13.827 13.827 14.287 14.287 15.013 15.013 17.186 17.186 18.068 18.068
or this using any awk:
$ awk '{rec=rec FS $0} !(NR%7){$0=rec; rec=""; $1=$1; print}' file
0.033333 0.000000 0.000000 -46.956 -46.956 -23.678 -23.677 -23.055 -23.054 -22.974 -22.974 -8.033 -8.032 -7.375 -7.356 -7.182 -7.159 -6.695 -6.661 -6.628 -6.598 -4.477 -4.477 -4.470 -4.462 -4.387 -4.380 3.799 3.800 5.939 5.960 6.116 6.117 6.625 6.642 7.648 7.651 7.686 7.687 8.077 8.078 8.123 8.126 8.478 8.497 8.550 8.552 11.625 11.626 12.652 12.653 12.722 12.726 13.860 13.864 14.291 14.293 14.966 15.046 17.063 17.252 18.011 18.015
0.016667 0.000000 0.000000 -46.956 -46.956 -23.677 -23.677 -23.055 -23.054 -22.974 -22.974 -8.037 -8.036 -7.371 -7.361 -7.177 -7.165 -6.686 -6.669 -6.620 -6.605 -4.476 -4.475 -4.471 -4.465 -4.385 -4.382 3.811 3.812 5.942 5.952 6.115 6.115 6.629 6.638 7.651 7.653 7.688 7.689 8.072 8.073 8.122 8.123 8.491 8.501 8.556 8.556 11.612 11.612 12.665 12.665 12.730 12.733 13.835 13.837 14.288 14.289 14.991 15.031 17.132 17.225 18.053 18.055
0.000000 0.000000 0.000000 -46.956 -46.956 -23.677 -23.677 -23.055 -23.055 -22.974 -22.974 -8.038 -8.038 -7.366 -7.366 -7.172 -7.172 -6.678 -6.678 -6.613 -6.613 -4.475 -4.475 -4.469 -4.469 -4.384 -4.384 3.816 3.816 5.946 5.946 6.115 6.115 6.633 6.633 7.653 7.653 7.689 7.689 8.070 8.070 8.122 8.122 8.498 8.498 8.558 8.558 11.607 11.607 12.668 12.668 12.735 12.735 13.827 13.827 14.287 14.287 15.013 15.013 17.186 17.186 18.068 18.068
awk -v ORS= '
NF==3 {
if (NR>1) print "\n"
sub(/^[[:space:]]*/,"")
}
1;
END { print "\n" }
' file
unset default newline for print (OFS=)
when 3-field line detected
print a newline (unless this is first line)
strip leading whitespace
default print (1;) - with no trailing newline
print final newline at the end
This code assumes all lines have leading whitespace (as shown in the sample input), so that no field separator is needed on joined lines.
Your original code is actually not far from working:
awk '
BEGIN { ORS=" " } # or maybe ORS=""
NF==3 {
sub(/^[[:space:]]*/,"") # strip leading whitespace
x = NR+6
}
NR<=x { print }
NR==x { printf "\n" }
' file
An even simpler solution if we know that the 3-field lines always have much more leading whitespace than any other line (e.g. 8 or more):
awk -v RS='[[:space:]]{8,}' 'gsub(/\n/,"")' file
set input record separator to be lots of spaces
strip all embedded newlines
implicit print will append a trailing newline
Note that the first (empty) record is conveniently elided because gsub fails (no newlines removed) and so does not trigger the implicit print.
Another note: This requires a version of awk that supports multi-character RS (e.g. gawk, busybox; but not mawk, original-awk).
Final note: This method, while shorter code, appears to run significantly more slowly (about 10% of the speed of the first version).
For super-slow (about 1% the speed of the first awk version), and if squeezing whitespace is not a problem, there is also the extremely compact:
<file xargs -n63

How do I get awk to print fields from the second row of a file?

I have a file that looks like this:
measured 10.8 0.0000 0.0000 0.0236 0.0304 0.0383 0.0433 0.0437 0.0442 0.0452
0.0455 0.0448 0.0440 0.0423 0.0386 0.0344 0.0274 0.0000 0.0000
I want gawk to print all the numbers in one long single column like this:
0.0000
0.0000
0.0236
0.0304
0.0383
0.0433
0.0437
0.0442
0.0452
0.0455
0.0448
0.0440
0.0423
0.0386
0.0344
0.0274
0.0000
0.0000
I run the command gawk '/measured/ { printf $3"\n" $4"\n" $5"\n" $6"\n" $7"\n" $8"\n" $9"\n" $10"\n" $11"\n" $12"\n" $13"\n" $14"\n" $15"\n" $16"\n" $17"\n" $18"\n" }' filename.txt
But I just get the first row of numbers:
0.0000
0.0000
0.0236
0.0304
0.0383
0.0433
0.0437
0.0442
0.0452
How do I get gawk to print the second row?
$ cat tst.awk
BEGIN { OFS = "\n" }
/measured/ { c=2; $1=$2=""; $0=$0 }
c && c-- { $1=$1; print }
$ awk -f tst.awk file
0.0000
0.0000
0.0236
0.0304
0.0383
0.0433
0.0437
0.0442
0.0452
0.0455
0.0448
0.0440
0.0423
0.0386
0.0344
0.0274
0.0000
0.0000
$ grep -A1 measured file | tr -s ' ' \\n | tail -n+4
0.0000
0.0000
0.0236
0.0304
0.0383
0.0433
0.0437
0.0442
0.0452
0.0455
0.0448
0.0440
0.0423
0.0386
0.0344
0.0274
0.0000
0.0000
with awk
$ awk -v OFS='\n' '/measured/ {p=1; for(i=3;i<=NF;i++) print $i; next}
p {$1=$1; print; exit}' file
If the number of fields is guaranteed to be as in the example, you can use the following command:
awk '{for(i=NF-8;i<=NF;i++){print $i}}' input.file
The GNU implementation of Awk allows an arbitrary regular expression as the RS record separator If the keyword measured occurs before each batch of numbers, we can use that keyword as the separator:
$ gawk 'BEGIN { RS = "measured" } { for (i = 1; i <= NF ; i++) print "field " i " = " $i }'
measured 10.8 0.0000 0.0000 0.0236 0.0304 0.0383 0.0433 0.0437 0.0442 0.0452
0.0455 0.0448 0.0440 0.0423 0.0386 0.0344 0.0274 0.0000 0.000
field 1 = 10.8
field 2 = 0.0000
field 3 = 0.0000
field 4 = 0.0236
field 5 = 0.0304
field 6 = 0.0383
field 7 = 0.0433
field 8 = 0.0437
field 9 = 0.0442
field 10 = 0.0452
field 11 = 0.0455
field 12 = 0.0448
field 13 = 0.0440
field 14 = 0.0423
field 15 = 0.0386
field 16 = 0.0344
field 17 = 0.0274
field 18 = 0.0000
field 19 = 0.000
As you can see, all the fields between the measured record separators are parsed out regardless of line breaks. Fields are separated on any mixture of spaces, tabs and newlines.
Note that because measured appears first, we get an empty record. The output you see above is, effectively, from the second record. The first record is the whitespace before measured, which contains no fields.
In other words, the record separator is really expected to be a terminator, except that it can be missing after the last record.

For each different occurrence in field, print lines with max value associated

I have
ID=exon-XM_030285750.2 LOC100221041 7895
ID=exon-XM_030285760.2 LOC100221041 8757
ID=exon-XM_030285720.2 LOC100221041 8656
ID=exon-XM_030285738.2 LOC100221041 8183
ID=exon-XM_030285728.2 LOC100221041 8402
ID=exon-XM_030285733.2 LOC100221041 7398
ID=exon-XM_030285715.2 LOC100221041 8780
ID=exon-XM_030285707.2 LOC100221041 8963
ID=exon-XM_030285694.2 DCBLD2 5838
ID=exon-XM_030285774.2 CMSS1 1440
ID=exon-XM_012570107.3 CMSS1 1502
ID=exon-XM_012570104.3 FILIP1L 6371
ID=exon-XM_030285654.2 FILIP1L 6456
ID=exon-XM_030285647.2 FILIP1L 6488
ID=exon-XM_032751000.1 FILIP1L 5886
ID=exon-XM_030285671.2 FILIP1L 5622
ID=exon-XM_030285682.2 FILIP1L 5395
ID=exon-XR_004369230.1 LOC116808959 2289
I want to print the line for which each element in $2 is associated with the highest value in $3
ID=exon-XM_030285707.2 LOC100221041 8963
ID=exon-XM_030285694.2 DCBLD2 5838
ID=exon-XM_012570107.3 CMSS1 1502
ID=exon-XM_030285647.2 FILIP1L 6488
ID=exon-XR_004369230.1 LOC116808959 2289
I tried this
awk -f avg.sh test | awk 'BEGIN {OFS = "\t"} arr[$2]==0 {arr[$2]=$3} ($3 > arr[$2]) {arr[$2]=$3} END{for (i in arr) {print i, arr[i]}}'
from here
how to conditionally filter rows in awk
but I would like to also keep $1 in the output and keep the same ordering as in the input.
The answer to this
Computing averages of chunks of a column
shows how to build an array that keeps the original ordering, but I'm failing at putting the two together
Could you please try following, written and tested with shown samples in GNU awk.
awk '
!arr1[$2]++{
found[++count]=$2
}
{
arr[$2]=(arr[$2]>$3?arr[$2]:$3)
val[$2 OFS $3]=$1
}
END{
for(i=1;i<=count;i++){
print val[found[i] OFS arr[found[i]]],found[i],arr[found[i]]
}
}' Input_file
Output will be as follows.
ID=exon-XM_030285707.2 1 8963
ID=exon-XM_030285694.2 2 5838
ID=exon-XM_012570107.3 3 1502
ID=exon-XM_030285647.2 4 6488
ID=exon-XR_004369230.1 5 2289
To get in TAB separated form try following.
awk -v OFS="\t" '
!arr1[$2]++{
found[++count]=$2
}
{
arr[$2]=(arr[$2]>$3?arr[$2]:$3)
val[$2 OFS $3]=$1
}
END{
for(i=1;i<=count;i++){
print val[found[i] OFS arr[found[i]]],found[i],arr[found[i]]
}
}' Input_file |
column -t -s $'\t'
You may use this awk:
awk '!($2 in max) || $3 > max[$2] {
if(!($2 in max))
ord[++n] = $2
max[$2] = $3
rec[$2] = $0
}
END {
for (i=1; i<=n; ++i)
print rec[ord[i]]
}' file | column -t
ID=exon-XM_030285707.2 LOC100221041 8963
ID=exon-XM_030285694.2 DCBLD2 5838
ID=exon-XM_012570107.3 CMSS1 1502
ID=exon-XM_030285647.2 FILIP1L 6488
ID=exon-XR_004369230.1 LOC116808959 2289
You can do with sort and awk.
If ordering is optional.
$ sort -k2,2 -k3,3nr madza.txt | awk ' $2!=p2 { if(NR>1) print p; p=$0;p2=$2 } END { print p }'
ID=exon-XR_004369230.1 LOC116808959 2289
ID=exon-XM_030285707.2 LOC100221041 8963
ID=exon-XM_030285647.2 FILIP1L 6488
ID=exon-XM_030285694.2 DCBLD2 5838
ID=exon-XM_012570107.3 CMSS1 1502
$
To keep the ordering, you can introduce seq numbers and remove them at the last.
$ awk ' { $(NF+1)=NR}1 ' madza.txt | sort -k2,2 -k3,3nr | awk ' $2!=p2 { if(NR>1) print p; p=$0;p2=$2 } END { print p }' | sort -k4 -n | awk ' {NF=NF-1}1 '
ID=exon-XM_030285707.2 LOC100221041 8963
ID=exon-XM_030285694.2 DCBLD2 5838
ID=exon-XM_012570107.3 CMSS1 1502
ID=exon-XM_030285647.2 FILIP1L 6488
ID=exon-XR_004369230.1 LOC116808959 2289
$

AWK failing to sum floats

I am trying to sum the last 12 values in a field in a particular csv file, but AWK is failing to correctly sum the values. If I output the data to a new file then run the same AWK statement against the new file it works.
Here are the contents of the original file. The fields are separated by ";"
I want to sum the values in the 3rd field
...$ tail -12 OriginalFile.csv...
02/02/2020 10:30:00;50727.421;0.264;55772.084;0.360;57110.502;0.384
02/02/2020 10:35:00;50727.455;0.408;55772.126;0.504;57110.548;0.552
02/02/2020 10:40:00;50727.489;0.408;55772.168;0.504;57110.593;0.540
02/02/2020 10:45:00;50727.506;0.204;55772.193;0.300;57110.621;0.336
02/02/2020 10:50:00;50727.541;0.420;55772.236;0.516;57110.667;0.552
02/02/2020 10:55:00;50727.566;0.300;55772.269;0.396;57110.703;0.432
02/02/2020 11:00:00;50727.590;0.288;55772.300;0.372;57110.737;0.408
02/02/2020 11:05:00;50727.605;0.180;55772.321;0.252;57110.762;0.300
02/02/2020 11:10:00;50727.621;0.192;55772.344;0.276;57110.786;0.288
02/02/2020 11:15:00;50727.659;0.456;55772.389;0.540;57110.835;0.588
02/02/2020 11:20:00;50727.681;0.264;55772.417;0.336;57110.866;0.372
02/02/2020 11:25:00;50727.704;0.276;55772.448;0.372;57110.900;0.408
I used the following code to print the original value and the summed value of field 3 for each record, but it just returns the same output for the summed value for each line
...$ awk 'BEGIN { FS = ";" } ; { sum += $3 } { print $3, sum }' OriginalFile.csv|tail -12...
0.264 2.00198e+09
0.408 2.00198e+09
0.408 2.00198e+09
0.204 2.00198e+09
0.420 2.00198e+09
0.300 2.00198e+09
0.288 2.00198e+09
0.180 2.00198e+09
0.192 2.00198e+09
0.456 2.00198e+09
0.264 2.00198e+09
0.276 2.00198e+09
If I output the contents of the file into a different file, the same code works as expected
...$ tail -12 OriginalFile.csv > testfile2.csv...
...$ awk 'BEGIN { FS = ";" } ; { sum += $3 } { print $3, sum }' testfile2.csv...
0.264 0.264
0.408 0.672
0.408 1.08
0.204 1.284
0.420 1.704
0.300 2.004
0.288 2.292
0.180 2.472
0.192 2.664
0.456 3.12
0.264 3.384
0.276 3.66
How can I get the correct output from the original file without having to create a new file?
As #Shawn's excellent comment points out, the order in which you pipe in your data is the problem. By the time you reach the 12th line from the end, sum is already 2.00198e+09; adding many small fractions is not significant, so it seems like it is "the same output".
Simply:
tail -12 OriginalFile.csv | awk 'BEGIN { FS = ";" } ; { sum += $3 } { print $3, sum }'

Count how many repeated times each record appears and select minimum and maximum of specific column

1.- First I would like to count how many times each record appears; the key is substr($0,20,18). Always print the last line for each repeated record, and print the count in the last column of the output file
2.- Find the minimum and maximum value on column 7 and print in columns 4 and 5 in output file.
Input file
M G 36829.00 37145.00 1 2161 36840.00 37146.00 37576
M G 36829.00 37145.00 217 4321 36852.00 37146.00 37576
M G 36829.00 37145.00 433 6481 36864.00 37146.00 37576
M G 36829.00 37145.00 649 8641 36876.00 37146.00 37576
M G 36829.00 37145.00 865 10801 36888.00 37146.00 37576
M G 36833.00 38033.00 1 4321 36840.00 37602.00 38464
M G 36833.00 38033.00 433 8641 36852.00 37602.00 38464
M G 36833.00 38033.00 865 12961 36864.00 37602.00 38464
M G 36833.00 38033.00 1297 17281 36876.00 37602.00 38464
M G 36833.00 38033.00 1729 21601 36888.00 37602.00 38464
M G 37265.00 38105.00 1 4321 36840.00 37674.00 38536
M G 37265.00 38105.00 433 8641 36852.00 37674.00 38536
M G 37265.00 38105.00 865 12961 36864.00 37674.00 38536
M G 37265.00 38105.00 1297 17281 36876.00 37674.00 38536
M G 37265.00 38105.00 1729 21601 36888.00 37674.00 38536
M G 37265.00 38105.00 2161 25921 36900.00 37674.00 38536
M G 37271.00 38885.00 1 2211 36840.00 38454.00 38894
M G 37271.00 38885.00 222 4421 36852.00 38454.00 38894
M G 37271.00 38885.00 443 6631 36864.00 38454.00 38894
M G 37271.00 38885.00 664 8841 36876.00 38454.00 38894
Desired Output file
36829.00 37145.00 10801 36840.00 36888.00 37146.00 37576 5
36833.00 38033.00 21601 36840.00 36888.00 37602.00 38464 5
37265.00 38105.00 25921 36840.00 36900.00 37674.00 38536 6
37271.00 38885.00 8841 36840.00 36876.00 38454.00 38894 4
I tried.
To count how many times each record appears.
awk '{dups[substr($0,20,18)]++} END{for (num in dups) {print num,dups[num]}}' file
To find the minimum and maximum in column 7.
awk '{\
l = substr($7,1,5);\
printf ("%5d \n",l);\
}' file |
awk ' {D1=substr($1, 1, 5)
D2=substr($1, 1, 5)+0
}
!(D1 in MIN) {MIN[D1]=D2
MAX[D1]=D2
next
}
D2 < MIN[D1] {MIN[D1]=D2}
D2 > MAX[D1] {MAX[D1]=D2}
END {for (m in MIN) print m, MIN[m], MAX[m]}
Thanks in advance.
It sounds like this is what you're trying to do:
$ cat tst.awk
{ currKey = $3 FS $4 }
currKey != prevKey { prt(); min=$7; cnt=0 }
{ prevRec=$0; prevKey=currKey; max=$7; cnt++ }
END { prt() }
function prt( f) {
if ( cnt ) {
split(prevRec,f)
print f[3], f[4], f[6], min, max, f[7], f[8], cnt
}
}
$ sort -k3,4n -k7n file | awk -f tst.awk | column -t
36829.00 37145.00 10801 36840.00 36888.00 36888.00 37146.00 5
36833.00 38033.00 21601 36840.00 36888.00 36888.00 37602.00 5
37265.00 38105.00 25921 36840.00 36900.00 36900.00 37674.00 6
37271.00 38885.00 8841 36840.00 36876.00 36876.00 38454.00 4
Does not keep the order of input file but works even if your file is not ordered by key first
awk '
{
$7+=0;
COUNT[$9]+=1;
C1[$9]=$3;
C2[$9]=$4;
C3[$9]=$6;
C6[$9]=$8
}
!($9 in MIN){
MIN[$9]=$7;
MAX[$9]=$7;
next
}
$7<MIN[$9]{
MIN[$9]=$7
}
$7>MAX[$9]{
MAX[$9]=$7
}
END{
for(id in COUNT){
print C1[id], C2[id], C3[id], MIN[id], MAX[id], C6[id], id, COUNT[id]
}
}' <file>
Output :
37271.00 38885.00 8841 36840 36876 38454.00 38894 4
36833.00 38033.00 21601 36840 36888 37602.00 38464 5
36829.00 37145.00 10801 36840 36888 37146.00 37576 5
37265.00 38105.00 25921 36840 36900 37674.00 38536 6
Could you please try following.
awk '
{
val=substr($0,20,18)
$1=$2=""
sub(/^[[:space:]]+/,"")
}
prev!=val && prev{
print first,second,min,max,third,count
count=""
}
{
min=min<$5?min?min:$5:$5
max=max>$5?max:$5
prev=val
count++
first=$1 OFS $2
second=$4
third=$(NF-1) OFS $NF
}
END{
if(prev){
print first,second,min,max,third,count
}
}
' Input_file | column -t