AWK: sums problematic in awk table transform script

AWK: sums problematic in awk table transform script - awk

I have the following file:
cat st_in.txt
2015-01-01 2 A FI
2015-02-03 4 B VI
2015-03-01 6 A FI
2015-01-08 -4 C VE
2016-01-05 -3 B VE
2016-02-03 -1 D FE
2016-04-01 -2 B FE
2016-06-13 -5 D VE
2017-01-01 2 A VI
2017-02-03 3 A VI
2017-02-04 8 C FI
2017-01-05 -1 B FE
And want to transform it like this:
2015 2016 2017
A 8.00 0.00 0.00
C 0.00 0.00 8.00
sumFI 8.00 0.00 8.00
A 0.00 0.00 5.00
B 4.00 0.00 0.00
sumVI 4.00 0.00 5.00
sumI 12.00 0.00 13.00
B 0.00 -2.00 -1.00
D 0.00 -1.00 0.00
sumFE 0.00 -3.00 -1.00
B 0.00 -3.00 0.00
C -4.00 0.00 0.00
D 0.00 -5.00 0.00
sumVE -4.00 -8.00 0.00
sumE -4.00 -11.00 -1.00
net 8.00 -11.00 12.00
To do that I have written the following script:
#!/usr/bin/env bash
# Pivot "date amount item type" records into one column per year: one row per
# item (grouped by type), a "sum<type>" row per type and a final "net" row.
# Needs GNU awk (arrays of arrays).
# Input: the files named on the command line, or stdin ("-") when none are
# given; that is what "${@:--}" on the last line expands to.
awk '
BEGIN {
    OFS = "\t"
}
{
    yr = substr($1, 1, 4)
    sub(/-.*/, "", $1)              # cut the date in $1 down to its year
    minYr = (NR == 1 || $1 < minYr ? $1 : minYr)
    maxYr = (NR == 1 || $1 > maxYr ? $1 : maxYr)
    H[$4][$3]                       # register item $3 under type $4
    W[yr][$4][$3] += $2             # amount per year, type and item
    yT[$1] += $2                    # amount per year
    val[$1][$4] += $2               # amount per year and type
}
END {
    # header: one column for every year from the smallest to the largest seen
    for (yr = minYr; yr <= maxYr; yr++) {
        printf "%s%s", OFS, yr
    }
    print ""
    print ""
    for (cT in H) {
        # one row per item of this type
        for (c in H[cT]) {
            printf c, OFS
            for (yr = minYr; yr <= maxYr; yr++) {
                printf "%s%0.2f", OFS, W[yr][cT][c]
            }
            print ""
        }
        # subtotal row of this type
        printf "sum" cT, OFS
        for (yr = minYr; yr <= maxYr; yr++) {
            printf "%s%0.2f", OFS, val[yr][cT]
        }
        print ""
        print ""
        # NOTE: in the two branches below c still holds the last item visited
        # by the item loop above, so the row repeats that single item instead
        # of adding up the F* and V* types. This is the wrong sumI/sumE output
        # the question is about.
        if (cT == "VI") {
            printf "sumI", OFS
            for (yr = minYr; yr <= maxYr; yr++) {
                printf "%s%0.2f", OFS, W[yr][cT][c]
            }
        } else if (cT == "VE") {
            printf "sumE", OFS
            for (yr = minYr; yr <= maxYr; yr++) {
                printf "%s%0.2f", OFS, W[yr][cT][c]
            }
        }
        print ORS
    }
    # grand total per year
    printf "net"
    for (yr = minYr; yr <= maxYr; yr++) {
        printf "%s%0.2f", OFS, yT[yr]
    }
    print ""
}
' "${@:--}"
Which prints the following:
./trans1 st_in.txt
2015 2016 2017
A 8.00 0.00 0.00
C 0.00 0.00 8.00
sumFI 8.00 0.00 8.00
A 0.00 0.00 5.00
B 4.00 0.00 0.00
sumVI 4.00 0.00 5.00
sumI 4.00 0.00 0.00
B 0.00 -2.00 -1.00
D 0.00 -1.00 0.00
sumFE 0.00 -3.00 -1.00
B 0.00 -3.00 0.00
C -4.00 0.00 0.00
D 0.00 -5.00 0.00
sumVE -4.00 -8.00 0.00
sumE 0.00 -5.00 0.00
net 8.00 -11.00 12.00
I am not worried about the formatting here. The console printout is similar to the target. But what I am struggling with is to get the correct sums for sumI (sumFI + sumVI) and sumE(sumFE + sumVE). Can someone please help with that?
EDIT by Ed Morton to provide meaningful data types and variable names, assuming the OPs current data is column types of Date Amount Item Type as it was in their first question:
$ cat trans1
#!/usr/bin/env bash
# Same program as the one in the question, with descriptive names only.
# Input columns: date amount item type. Needs GNU awk (arrays of arrays).
# Reads the files named on the command line, or stdin ("-") when none are
# given; that is what "${@:--}" on the last line expands to.
awk '
BEGIN {
    OFS = "\t"
}
{
    date = $1
    amount = $2
    item = $3
    type = $4
    year = substr(date, 1, 4)
    minYear = (NR == 1 || year < minYear ? year : minYear)
    maxYear = (NR == 1 || year > maxYear ? year : maxYear)
    types_items[type][item]                           # item registered under its type
    yearsTypesItems2amounts[year][type][item] += $2   # per year, type and item
    years2amounts[year] += $2                         # per year
    yearsTypes2amounts[year][type] += $2              # per year and type
}
END {
    # header: one column per year
    for (year = minYear; year <= maxYear; year++) {
        printf "%s%s", OFS, year
    }
    print ""
    print ""
    for (type in types_items) {
        # one row per item of this type
        for (item in types_items[type]) {
            printf item, OFS
            for (year = minYear; year <= maxYear; year++) {
                printf "%s%0.2f", OFS, yearsTypesItems2amounts[year][type][item]
            }
            print ""
        }
        # subtotal row of this type
        printf "sum" type, OFS
        for (year = minYear; year <= maxYear; year++) {
            printf "%s%0.2f", OFS, yearsTypes2amounts[year][type]
        }
        print ""
        print ""
        if (type == "VI") {
            printf "sumI", OFS
            for (year = minYear; year <= maxYear; year++) {
                printf "%s%0.2f", OFS, yearsTypesItems2amounts[year][type][item] # <--- NOTE: item unset here
            }
        } else if (type == "VE") {
            printf "sumE", OFS
            for (year = minYear; year <= maxYear; year++) {
                printf "%s%0.2f", OFS, yearsTypesItems2amounts[year][type][item] # <--- NOTE: item unset here
            }
        }
        print ORS
    }
    # grand total per year
    printf "net"
    for (year = minYear; year <= maxYear; year++) {
        printf "%s%0.2f", OFS, years2amounts[year]
    }
    print ""
}
' "${@:--}"
See the "NOTE"s above where by just doing this renaming exercise it became obvious where some bugs are (looks like you were using the wrong array and should be using yearsTypes2amounts[year][type] instead of yearsTypesItems2amounts[year][type][item]).

Continuing to use GNU awk for arrays of arrays and assuming the OPs current data is column types of Date Amount Item Type as it was in your first question:
$ cat trans1
#!/usr/bin/env bash
# Pivot "date amount item type" records into one column per year.
# Rows are grouped category -> type -> item, where the category is the type
# without its first character (FI/VI -> I, FE/VE -> E). Each type gets a
# "sum<type>" row, each category a "sum<category>" row, and the last row is
# the "net" total. Needs GNU awk (arrays of arrays).
# Reads the files named on the command line, or stdin ("-") when none are
# given; that is what "${@:--}" on the last line expands to.
awk '
BEGIN {
    OFS = "\t"
}
{
    date = $1
    amount = $2
    item = $3
    type = $4
    year = substr(date, 1, 4)
    minYear = (NR == 1 || year < minYear ? year : minYear)
    maxYear = (NR == 1 || year > maxYear ? year : maxYear)
    cat = substr(type,2)
    ctiys2amounts[cat][type][item][year] += amount
}
END {
    # header: one column per year
    for (year = minYear; year <= maxYear; year++) {
        printf "%s%s", OFS, year
    }
    print ORS
    for (cat in ctiys2amounts) {
        delete catSum                   # per-year totals of this category
        for (type in ctiys2amounts[cat]) {
            delete typeSum              # per-year totals of this type
            for (item in ctiys2amounts[cat][type]) {
                printf "%s", item
                for (year = minYear; year <= maxYear; year++) {
                    amount = ctiys2amounts[cat][type][item][year]
                    printf "%s%0.2f", OFS, amount
                    typeSum[year] += amount
                }
                print ""
            }
            # the type subtotal is rolled up into the category total
            printf "sum%s", type
            for (year = minYear; year <= maxYear; year++) {
                printf "%s%0.2f", OFS, typeSum[year]
                catSum[year] += typeSum[year]
            }
            print ORS
        }
        # the category subtotal is rolled up into the net total
        printf "sum%s", cat
        for (year = minYear; year <= maxYear; year++) {
            printf "%s%0.2f", OFS, catSum[year]
            yearSum[year] += catSum[year]
        }
        print ORS
    }
    printf "net"
    for (year = minYear; year <= maxYear; year++) {
        printf "%s%0.2f", OFS, yearSum[year]
    }
    print ""
}
' "${@:--}"
$ ./trans1 st_in.txt
2015 2016 2017
B 0.00 -2.00 -1.00
D 0.00 -1.00 0.00
sumFE 0.00 -3.00 -1.00
B 0.00 -3.00 0.00
C -4.00 0.00 0.00
D 0.00 -5.00 0.00
sumVE -4.00 -8.00 0.00
sumE -4.00 -11.00 -1.00
A 8.00 0.00 0.00
C 0.00 0.00 8.00
sumFI 8.00 0.00 8.00
A 0.00 0.00 5.00
B 4.00 0.00 0.00
sumVI 4.00 0.00 5.00
sumI 12.00 0.00 13.00
net 8.00 -11.00 12.00

Related

AWK: Help on transforming data table

I have the following file called in.txt:
2020-01-01 fruit banana 3.4
2020-03-02 alcohol smirnov 26.99
2020-03-10 fruit orange 4.20
2020-04-03 fruit orange 4.20
2021-09-01 alcohol beer 6.00
2021-08-03 fruit mango 6.99
2022-01-01 fruit orange 4.30
2022-03-04 alcohol beer 6.00
2022-03-03 alcohol beer 6.00
2022-04-01 fruit mango 7.20
I want to transform the file so it reads something like this:
2020-01-01 2021-01-01 2022-01-01
-2020-12-31 -2021-12-31 -2022-12-31
fruit banana 3.40 0.00 0.00
orange 8.40 0.00 4.30
mango 0.00 6.99 7.20
Subt 11.80 6.99 11.50
alcohol beer 0.00 6.00 12.00
smirnov 26.99 0.00 0.00
Subt 26.99 6.00 12.00
Total 38.59 12.99 23.50
I have started writing the following script but am stuck on how to approach this. How can I display totals columns side by side. The other problem is that this is just dummy data. I have many different categories other than fruit and alcohol and it seems wrong to write if statements and for-loops for each one. Also how can I print fruit and alcohol out just once rather than for every iteration of column 3 and bring the date range to the top. Help is much appreciated.
#!/usr/bin/env bash
# First attempt from the question: add up the 2020 amounts of every item in
# the "fruit" category and print them sorted by item name.
# Expects tab-separated "date category item amount" lines (FS is a tab).
# Needs GNU awk (PROCINFO["sorted_in"]).
# Reads the files named on the command line, or stdin ("-") when none are
# given; that is what "${@:--}" on the last line expands to.
awk '
BEGIN {
    FS = OFS = "\t"
}
{
    # "fruit" has to be a quoted string: a bare word is an unset variable,
    # i.e. an empty regexp that matches every category.
    # Dates are in ISO form, so string comparison orders them correctly.
    if ($2 ~ "fruit" && $1 >= "2020-01-01" && $1 <= "2020-12-31") {
        a[$3] += $4     # total per item
        sa += $4        # running subtotal of the category (not printed yet)
    }
}
END {
    # walk the array in ascending string order of its indices
    PROCINFO["sorted_in"] = "@ind_str_asc"
    for (i in a) {
        print "fruit", i, a[i]
    }
}
' "${@:--}"

Would you please try the following:
#!/bin/bash
awk '
{
year = substr($1, 1, 4) # extract year
if (from == "" || from > year) from = year # first (smallest) year
if (to == "" || to < year) to = year # last (largest) year
if ($3 in category == 0) {
category[$3] = $2 # map item to category
list[$2] = list[$2] fs[$2] $3 # csv of items
fs[$2] = "," # delimiter for csv
}
sum[$3,year] += $4 # sum of the item in the year
subt[$2,year] += $4 # sum of the category in the year
ttl[year] += $4 # sum in the year
}
END {
format1 = "%-10s%-10s" # format for the left cells
format2 = "%-16s" # format for the header
format3 = "%-16.2f" # format for the amounts
# print upper header
printf(format1, "", "")
for (y = from; y <= to; y++) {
printf(format2, y "-01-01")
}
print ""
# print second header
printf(format1, "", "")
for (y = from; y <= to; y++) {
printf(format2, "-" y "-12-31")
}
print ""
for (cat in list) { # loop over the categories ("fruit" and "alcohol")
n = split(list[cat], item, ",") # split into items
for (i = 1; i <= n; i++) { # loop over the items
printf(format1, i == 1 ? cat : "", item[i])
for (y = from; y <= to; y++) { # loop over years
printf(format3, sum[item[i],y]) # append the sum of the year
}
print "" # finally break the line
}
print "" # insert blank line
printf(format1, "Subt", "")
for (y = from; y <= to; y++) {
printf(format3, subt[cat,y]) # append the subtotal
}
print "\n"
}
printf(format1, "Total", "")
for (y = from; y <= to; y++) {
printf(format3, ttl[y]) # append the total amount
}
print ""
}
' in.txt
Output with the provided input:
2020-01-01 2021-01-01 2022-01-01
-2020-12-31 -2021-12-31 -2022-12-31
alcohol smirnov 26.99 0.00 0.00
beer 0.00 6.00 12.00
Subt 26.99 6.00 12.00
fruit banana 3.40 0.00 0.00
orange 8.40 0.00 4.30
mango 0.00 6.99 7.20
Subt 11.80 6.99 11.50
Total 38.79 12.99 23.50
Please forgive that the order of the items is not the same as in the OP's expected output.

Using GNU awk for arrays of arrays:
$ cat tst.awk
# Pivot "date category item amount" lines into one column per year, with a
# "Subt" row after each category and a closing "Total" row.
# Needs GNU awk (arrays of arrays).
BEGIN { OFS = "\t" }

{
    sub(/-.*/, "", $1)                  # keep only the year part of the date
    if (NR == 1 || $1 < firstYr) firstYr = $1
    if (NR == 1 || $1 > lastYr)  lastYr  = $1
    seen[$2][$3]                        # item registered under its category
    cell[$1][$2][$3] += $4              # per year, category and item
    perCat[$1][$2]   += $4              # per year and category
    perYr[$1]        += $4              # per year
}

END {
    # header row; the two leading separators skip the category/item columns
    printf "%s", OFS
    for (yr = firstYr; yr <= lastYr; yr++)
        printf "%s%s", OFS, yr
    print ""

    for (cat in seen) {
        label = cat                     # category name goes on its first row only
        for (it in seen[cat]) {
            printf "%s%s%s", label, OFS, it
            label = ""
            for (yr = firstYr; yr <= lastYr; yr++)
                printf "%s%0.2f", OFS, cell[yr][cat][it]
            print ""
        }
        printf "Subt%s", OFS
        for (yr = firstYr; yr <= lastYr; yr++)
            printf "%s%0.2f", OFS, perCat[yr][cat]
        print ORS
    }

    printf "Total%s", OFS
    for (yr = firstYr; yr <= lastYr; yr++)
        printf "%s%0.2f", OFS, perYr[yr]
    print ""
}
$ awk -f tst.awk in.txt
2020 2021 2022
alcohol beer 0.00 6.00 12.00
smirnov 26.99 0.00 0.00
Subt 26.99 6.00 12.00
fruit orange 8.40 0.00 4.30
mango 0.00 6.99 7.20
banana 3.40 0.00 0.00
Subt 11.80 6.99 11.50
Total 38.79 12.99 23.50
or if you really want specific date ranges instead of just the year in the header:
$ cat tst.awk
# Pivot "date category item amount" lines into one column per year. The
# two-line header spells out each year as a date range (YYYY-01-01 over
# -YYYY-12-31); every category ends with a "Subt" row and the table ends with
# a "Total" row. Needs GNU awk (arrays of arrays).
BEGIN { OFS = "\t" }

{
    sub(/-.*/, "", $1)                  # keep only the year part of the date
    if (NR == 1 || $1 < firstYr) firstYr = $1
    if (NR == 1 || $1 > lastYr)  lastYr  = $1
    seen[$2][$3]                        # item registered under its category
    cell[$1][$2][$3] += $4              # per year, category and item
    perCat[$1][$2]   += $4              # per year and category
    perYr[$1]        += $4              # per year
}

END {
    # first header line: start of each year
    printf "%s", OFS
    for (yr = firstYr; yr <= lastYr; yr++)
        printf "%s%s-01-01", OFS, yr
    print ""

    # second header line: end of each year
    printf "%s", OFS
    for (yr = firstYr; yr <= lastYr; yr++)
        printf "%s-%s-12-31", OFS, yr
    print ""

    for (cat in seen) {
        label = cat                     # category name goes on its first row only
        for (it in seen[cat]) {
            printf "%s%s%s", label, OFS, it
            label = ""
            for (yr = firstYr; yr <= lastYr; yr++)
                printf "%s%0.2f", OFS, cell[yr][cat][it]
            print ""
        }
        printf "Subt%s", OFS
        for (yr = firstYr; yr <= lastYr; yr++)
            printf "%s%0.2f", OFS, perCat[yr][cat]
        print ORS
    }

    printf "Total%s", OFS
    for (yr = firstYr; yr <= lastYr; yr++)
        printf "%s%0.2f", OFS, perYr[yr]
    print ""
}
$ awk -f tst.awk in.txt | column -s$'\t' -t
2020-01-01 2021-01-01 2022-01-01
-2020-12-31 -2021-12-31 -2022-12-31
alcohol beer 0.00 6.00 12.00
smirnov 26.99 0.00 0.00
Subt 26.99 6.00 12.00
fruit orange 8.40 0.00 4.30
mango 0.00 6.99 7.20
banana 3.40 0.00 0.00
Subt 11.80 6.99 11.50
Total 38.79 12.99 23.50

I believe the following piece of awk code is a good start. The remaining part to do is just some cleanup and some extra code for the sums.
# Print one line per category/item pair, followed by one value for every
# (year, division) slot between the earliest and the latest slot seen.
# Input columns: date category item amount.
BEGIN{
# how many divisions per year
n=1
# initialisation of some variables
tmax=0;tmin=999999; ymax=qmax=0;ymin=9999;qmin=99
}
# convert date to quarter,trim,half
# y: "$1+0" keeps the leading number of the date, i.e. the year.
# q: substr($1,6,7) is the text from the month onwards, "+0" keeps the month
#    number, and "%n" folds it into 0..n-1 (always 0 while n is 1).
# NOTE(review): month % n is not a calendar quarter/half for n > 1 - confirm
# the intended mapping before raising n.
{ y=$1+0; q=(substr($1,6,7)+0)%n}
# compute min max time
(y*100+q < tmin) { ymin=y;qmin=q;tmin=y*100+q }
(y*100+q > tmax) { ymax=y;qmax=q;tmax=y*100+q }
# Create arrays that keep track of everything
# a : prices by year,q,category and element
# b : just a list of categories, eg fruit
# c : just a list of elements and the category it belongs to.
# NOTE(review): "=" stores only the last amount seen for a slot; several
# lines for the same item in the same slot are not added up ("+=" would).
{ a[y,q,$2,$3]=$4; b[$2]; c[$3]=$2 }
END{
# loop over categories (eg fruit)
for(i in b) {
# loop over elements
for(j in c) {
# exclude elements that do not belong to category
if (i!=c[j]) continue
s=i OFS j;
# loop over the time
for (y=ymin;y<=ymax;y++) {
for (q=0;q<n;++q) {
# skip slots before the first or after the last slot seen
if (y*100+q < tmin) continue
if (y*100+q > tmax) continue
# "+0" turns a missing entry into 0
s=s OFS a[y,q,i,j]+0
}
}
print s
}
}
}
This currently outputs:
alcohol beer 0 6 6
alcohol smirnov 26.99 0 0
fruit orange 4.2 0 4.3
fruit mango 0 6.99 7.2
fruit banana 3.4 0 0

awk script to sum numbers in a column over a loop not working for some iterations in the loop

Sample input
12.0000 0.6000000 0.05
13.0000 1.6000000 0.05
14.0000 2.6000000 0.05
15.0000 3.0000000 0.05
15.0000 3.2000000 0.05
15.0000 3.4000000 0.05
15.0000 3.6000000 0.10
15.0000 3.8000000 0.10
15.0000 4.0000000 0.10
15.0000 4.2000000 0.11
15.0000 4.4000000 0.12
15.0000 4.6000000 0.13
15.0000 4.8000000 0.14
15.0000 5.0000000 0.15
15.0000 5.2000000 0.14
15.0000 5.4000000 0.13
15.0000 5.6000000 0.12
15.0000 5.8000000 0.11
15.0000 6.0000000 0.10
15.0000 6.2000000 0.10
15.0000 6.4000000 0.10
15.0000 6.6000000 0.05
15.0000 6.8000000 0.05
15.0000 7.0000000 0.05
Goal
Print line 1 in output as 0 0
For $2 = 5.000000, $3 = 0.15.
Print line 2 in output as 1 0.15
For $2 = 4.800000 through $2 = 5.200000, sum+=$3 for each line (i.e. 0.14 + 0.15 + 0.14 = 0.43).
Print line 3 in output as 2 0.43.
For $2 = 4.600000 through $2 = 5.400000, sum+=$3 for each line (i.e. 0.13 + 0.14 + 0.15 + 0.14 + 0.13 = 0.69).
Print line 4 in output as 3 0.69
Continue this pattern until $2 = 5.000000 +- 1.6 (9 lines total, plus line 1 as 0 0 = 10 total lines in output)
Desired Output
0 0
1 0.15
2 0.43
3 0.69
4 0.93
5 1.15
6 1.35
7 1.55
8 1.75
9 1.85
Attempt
Script 1
#!/bin/bash
# For i = 0..8 run awk once over test.dat and collect all output in test.out.
# The value of i is spliced into the awk program text. Each run adds up $3 of
# every line whose $2 is at least 5 - i*0.2, and prints "i sum" and stops at
# the first line whose $2 equals 5 + i*0.2.
# NOTE(review): the == test compares a computed floating-point value with the
# parsed field; a run whose computed bound is not exactly equal prints nothing.
for (( i=0; i<=8; i++ )); do
awk '$2 >= 5.0000000-'$i'*0.2 {sum+=$3}
$2 == 5.0000000+'$i'*0.2 {print '$i', sum; exit
}' test.dat
done > test.out
produces
0 0.15
1 0.43
2 0.69
3 0.93
4 1.15
5 1.35
6 1.55
7 1.75
8 1.85
This is very close. However, the output is missing 0 0 for line 1, and because of this, lines 2 through 10 have $1 and $2 mismatched by 1 line.
Script 2
#!/bin/bash
# Variant of Script 1 that prints "0 0" first: for i = 0 the sum is held at 0,
# for i > 0 lines with $2 strictly above 5 - i*0.2 are added up, and the
# stop line is 5 + i*0.2 - 0.2 (5 for i = 0). The value of i is spliced into
# the awk program text.
# NOTE(review): the stop test uses == on a computed floating-point value; a
# run whose computed bound differs from the parsed field never prints. That is
# presumably why some rows are missing from the output - confirm.
for (( i=0; i<=8; i++ )); do
awk ''$i'==0 {sum=0}
'$i'>0 && $2 > 5.0000000-'$i'*0.2 {sum+=$3}
$2 == 5.0000000+'$i'*0.2 - ('$i' ? 0.2 : 0) {print '$i', sum; exit
}' test.dat
done > test.out
done > test.out
which produces
0 0
1 0.15
2 0.43
4 0.93
5 1.15
6 1.35
7 1.55
$1 and $2 are now correctly matched. However, I am missing the lines with $1=3, $1=8, and $1=9 completely. Adding the ternary operator causes my code to skip these iterations in the loop somehow.
Question
Can anyone explain what's wrong with script 2, or how to achieve the desired output in one line of code? Thank you.
Solution
I used Ed Morton's solution to solve this. Both of them work for different goals. Instead of using the modulus to save array space, I constrained the array to $1 = 15.0000. I did this instead of the modulus in order to include two other "key" variables that I had wanted to also sum over at different parts of the input, into separate output files.
Furthermore, as far as I understood it, the script summed only for lines with $2 >= 5.0000000, and then multiplied the summation by 2, in order to include the lines with $2 <= 5.0000000. This works for the sample input here because I made $3 symmetric around 0.15. I modified it to sum them separately, though.
# Read test.dat once and write the growing window sums around the line whose
# $2 equals key to test.out: first "0 0", then for delta = 1..range the sum of
# $3 over the key line and its delta-1 neighbours on each side.
awk 'BEGIN { key=5; range=9}
# remember $3 by line number, only for lines whose first field is 15
$1 == 15.0000 {
a[NR] = $3
}
# line number of the centre line
$2 == key { keyIdx = NR}
END {
print (0, 0) > "test.out"
sum = a[keyIdx]
for (delta=1; delta<=range; delta++) {
# the sum is printed before it is widened, so row delta covers delta-1 steps
print (delta, sum) > "test.out"
plusIdx = (keyIdx + delta)
minusIdx = (keyIdx - delta)
# lines that were not stored count as 0
sum += a[plusIdx] + a[minusIdx]
}
exit
}' test.dat

Is this what you're trying to do?
$ cat tst.awk
# Print "0 0", then nine rows "k sum": row k is the sum of $3 over the line
# whose $2 is 5 and its k-1 neighbours on each side.
{ val[NR] = $3 }            # third field of every line, by line number
$2 == 5 { center = NR }     # line number of the centre line

END {
    print 0, 0
    total = val[center]
    step = 1
    while (step <= 9) {
        # print first, widen afterwards
        print step, total
        total += val[center + step] + val[center - step]
        step++
    }
}
$ awk -f tst.awk file
0 0
1 0.15
2 0.43
3 0.69
4 0.93
5 1.15
6 1.35
7 1.55
8 1.75
9 1.85
We could optimize it to only store 2*(range=9) values in vals[] (using a modulus operator NR%(2*range) for the index) and do the calculation when we hit an NR that's range lines past the line where $2 == key rather than doing it after we've read the whole of the input if it's either too slow or your input file is too big to store all in memory, e.g.:
$ cat tst.awk
# Memory-bounded version: keep only the last 2*range values of $3 in a ring
# buffer and stop reading range lines after the line whose $2 equals key.
# Output: "0 0", then for delta = 1..range the sum of $3 over the key line
# and its delta-1 neighbours on each side.
BEGIN { key=5; range=9 }
{
    idx = NR % (2*range)        # ring-buffer slot of this line
    nr2val[idx] = $3
}
$2 == key { keyIdx = idx; endNr = NR+range }
NR == endNr { exit }
END {
    size = 2*range
    print 0, 0
    sum = nr2val[keyIdx]
    for (delta=1; delta<=range; delta++) {
        print delta, sum
        # Read the neighbour on each side of the key line; "+ size" keeps the
        # index non-negative. Adding the "plus" value twice is only correct
        # when the data is symmetric around the key line.
        plusIdx  = (keyIdx + delta) % size
        minusIdx = (keyIdx - delta + size) % size
        # At delta == range both indices name the same slot, but the sum
        # computed in that last iteration is never printed.
        sum += nr2val[plusIdx] + nr2val[minusIdx]
    }
    exit
}
$ awk -f tst.awk file
0 0
1 0.15
2 0.43
3 0.69
4 0.93
5 1.15
6 1.35
7 1.55
8 1.75
9 1.85

I like your problem. It is an adequate challenge.
My approach is to put as much as possible into the awk script and to scan the input file only once, because I/O is slower than computation (these days).
Do all the computations (actually 9) on each relevant input line.
The required inputs are variable F1 and text file input.txt
The execution command is:
awk -v F1=95 -f script.awk input.txt
So the logic is:
1. Initialize: Compute the 9 range markers and store their values in an array.
2. Store the 3rd input value in an ordered array `field3`. We use this array to compute the sum.
3. On each line that has 1st field equals 15.0000.
3.1 If found begin marker then mark it.
3.2 If found end marker then compute the sum, and mark it.
4. Finalize: Output all the computed results
script.awk including few debug printout to assist in debugging
BEGIN {
itrtns = 8; # iterations count consistent all over the program.
for (i = 0; i <= itrtns; i++) { # compute range markers per iteration
F1start[i] = (F1 - 2 - i)/5 - 14; # print "F1start["i"]="F1start[i];
F1stop[i] = (F1 - 2 + i)/5 - 14; # print "F1stop["i"]="F1stop[i];
b[i] = F1start[i] + (i ? 0.2 : 0); # print "b["i"]="b[i];
}
}
{ field3[NR] = $3;} # store 3rd input field in ordered array.
$1==15.0000 { # for each input line that has 1st input field 15.0000
currVal = $2 + 0; # convert 2nd input field to numeric value
for (i = 0; i <= itrtns; i++) { # on each line scan for range markers
# print "i="i, "currVal="currVal, "b["i"]="b[i], "F1stop["i"]="F1stop[i], isZero(currVal-b[i]), isZero(currVal-F1stop[i]);
if (isZero(currVal - b[i])) { # if there is a begin marker
F1idx[i] = NR; # store the marker index postion
# print "F1idx["i"] =", F1idx[i];
}
if (isZero(currVal - F1stop[i])) { # if there is an end marker
for (s = F1idx[i]; s <= NR; s++) {sum[i] += field3[s];} # calculate its sum
F2idx[i] = NR; # store its end marker postion (for debug report)
# print "field3["NR"]=", field3[NR];
}
}
}
END { # output the computed results
for (i = 0; i <= itrtns; i++) {print i, sum[i], "rows("F1idx[i]"-"F2idx[i]")"}
}
function isZero(floatArg) { # floating point number pecision comparison
tolerance = 0.00000000001;
if (floatArg < tolerance && floatArg > -1 * tolerance )
return 1;
return 0;
}
Provided input.txt from the question.
12.0000 0.6000000 0.05
13.0000 1.6000000 0.05
14.0000 2.6000000 0.05
15.0000 3.0000000 0.05
15.0000 3.2000000 0.05
15.0000 3.4000000 0.05
15.0000 3.6000000 0.10
15.0000 3.8000000 0.10
15.0000 4.0000000 0.10
15.0000 4.2000000 0.11
15.0000 4.4000000 0.12
15.0000 4.6000000 0.13
15.0000 4.8000000 0.14
15.0000 5.0000000 0.15
15.0000 5.2000000 0.14
15.0000 5.4000000 0.13
15.0000 5.6000000 0.12
15.0000 5.8000000 0.11
15.0000 6.0000000 0.10
15.0000 6.2000000 0.10
15.0000 6.4000000 0.10
15.0000 6.6000000 0.05
15.0000 6.8000000 0.05
15.0000 7.0000000 0.05
The output for: awk -v F1=95 -f script.awk input.txt
0 0.13 rows(12-12)
1 0.27 rows(12-13)
2 0.54 rows(11-14)
3 0.79 rows(10-15)
4 1.02 rows(9-16)
5 1.24 rows(8-17)
6 1.45 rows(7-18)
7 1.6 rows(6-19)
8 1.75 rows(5-20)
The output for: awk -v F1=97 -f script.awk input.txt
0 0.15 rows(14-14)
1 0.29 rows(14-15)
2 0.56 rows(13-16)
3 0.81 rows(12-17)
4 1.04 rows(11-18)
5 1.25 rows(10-19)
6 1.45 rows(9-20)
7 1.65 rows(8-21)
8 1.8 rows(7-22)

How to print columns that have values satisfying a range?

I have a big table with thousands of columns and rows. But for the sake of simplification, let say that I have a table consisting 11 rows and 100 columns. The table cells contain values between 0 and 1. The table looks like below:
Sample1 Sample2 Sample3 Sample4
1 0 0.001 0.002
0.74 0.52 0.654 0.75
0.65 0.64 0.455 0.72
0.24 0.51 0.512 0.78
0.25 0.555 0.557 0.25
0.003 0.454 0.532 0.23
0.02 0.56 0.643 0.22
1 0.495 0.555 0.99
0.992 1 0.999 0.98
0.12 0 0.968 1
Now I would like to scan all the cells for all values that are within a specific range of 0.80 >= value >= 0.70. Any cells that contain values as such, the whole column will be printed, including the headers.
The intended output would be as below:
Sample1 Sample4
1 0.002
0.74 0.75
0.65 0.72
0.24 0.78
0.25 0.25
0.003 0.23
0.02 0.22
1 0.99
0.992 0.98
0.12 1
Commands using awk would be preferable, but I do not know whether it is the best for this kind of extraction.
Please kindly advise me on how to do this. Any help will be very much appreciated. Thank you.

awk to the rescue!
$ awk '# pass 1 (NR==FNR): remember every column that holds a value in
       # [0.7, 0.8]. The header line is skipped here, but "next" must still
       # run for it, otherwise it reaches the print rule and adds an empty line.
       NR==FNR{ if(NR>1) for(i=1; i<=NF; i++)
                   if(0.7<=$i && $i<=0.8) col[i]=1
                next }
       # pass 2: print only the remembered columns of every line
       {for(i=1 ;i<=NF; i++)
          if(col[i]) printf "%s", $i OFS; print ""}' file{,} | column -t
Sample1 Sample4
1 0.002
0.74 0.75
0.65 0.72
0.24 0.78
0.25 0.25
0.003 0.23
0.02 0.22
1 0.99
0.992 0.98
0.12 1
a double scan algorithm, marks the filtered columns in the first round and prints them in the second round.

Not sure whether this will work with the size of table you're dealing with, but any solution is likely going to have to store something in an array, or be multi-pass.
My first thought was that given rotate.awk like this:
# Transpose the input: field i of line j becomes field j of output line i.
# The whole input is held in memory; output fields are tab-separated and the
# number of output lines is the field count of the last input line.
{
    col = 0
    while (++col <= NF)
        cell[col, NR] = $col
}

END {
    for (col = 1; col <= NF; col++) {
        for (row = 1; row <= NR; row++)
            printf "%s%s", (row > 1 ? "\t" : ""), cell[col, row]
        printf "\n"
    }
}
You could analyse your results between two rotations:
$ awk -f rotate.awk file.tsv | awk -v n=0.7 -v m=0.8 '{x=0; for (i=2; i<=NF; i++) if ($i >= n && $i <= m) x=1} x' | awk -f rotate.awk
Sample1 Sample4
1 0.002
0.74 0.75
0.65 0.72
0.24 0.78
0.25 0.25
0.003 0.23
0.02 0.22
1 0.99
0.992 0.98
0.12 1
Again, you may be limited by the amount of memory your system can allocate to awk to contain the array required for rotation.
An alternative which doesn't use gobs of memory to store an array would be a multi-pass approach that identifies column numbers, then uses them as input to a print script:
$ awk -v n=0.7 -v m=0.8 '{for (i=1; i<=NF; i++) if ($i >= n && $i <= m) print i}' file.tsv |
awk 'NR==FNR{c[$1];next} {tab="";for (i=1; i<=NF; i++) if (i in c) {printf "%s%s",tab,$i;tab="\t"} printf "\n" }' - file.tsv
The idea here is that the FIRST awk script selects the columns to be printed, and prints just those column numbers. The SECOND awk script takes two inputs; first, it reads the list of column numbers from stdin (-) and populates an array with them. Then it steps through the input file, printing columns whose numbers are within the array.

$ cat tst.awk
# Print only the columns that contain at least one value in [0.7, 0.8].
# Run with the same tab-separated file named twice: the first read marks the
# columns, the second read prints them.
BEGIN { FS = OFS = "\t" }

# first read, data lines only: mark columns with a value in range
NR == FNR && FNR > 1 {
    for (f = 1; f <= NF; f++)
        if (($f >= 0.7) && ($f <= 0.8))
            keep[f]
}

# nothing is printed during the first read
NR == FNR { next }

# second read: print the marked columns, separated by OFS
{
    sep = ""
    for (f = 1; f <= NF; f++) {
        if (!(f in keep))
            continue
        printf "%s%s", sep, $f
        sep = OFS
    }
    print ""
}
$ awk -f tst.awk file file
Sample1 Sample4
1 0.002
0.74 0.75
0.65 0.72
0.24 0.78
0.25 0.25
0.003 0.23
0.02 0.22
1 0.99
0.992 0.98
0.12 1

Need to print the lines that matches regex for both themselves and neighboring lanes

I have a problem and thus i ask :)
i have input... something like this
48 06-Jul-16 00:04:26.850000, 0.3
1 06-Jul-16 00:04:29.200000, 0.35
60 06-Jul-16 00:04:29.250000, 0.3
1 06-Jul-16 00:04:32.190476, 0.35
11 06-Jul-16 00:04:32.238095, 0.3
1 06-Jul-16 00:04:32.761905, 0.35
20 06-Jul-16 00:04:32.809524, 0.3
1 06-Jul-16 00:04:33.800000, 0.35
14 06-Jul-16 00:04:33.850000, 0.3
1 06-Jul-16 00:04:34.550000, 0.35
4 06-Jul-16 00:04:34.600000, 0.3
1 06-Jul-16 00:04:34.800000, 0.35
28 06-Jul-16 00:04:34.850000, 0.3
2 06-Jul-16 00:04:36.238095, 0.35
12 06-Jul-16 00:04:36.333333, 0.3
1 06-Jul-16 00:04:36.904762, 0.35
1 06-Jul-16 00:04:36.952381, 0.3
1 06-Jul-16 00:04:37.000000, 0.35
22 06-Jul-16 00:04:37.050000, 0.3
2 06-Jul-16 00:04:38.150000, 0.35
10 06-Jul-16 00:04:38.250000, 0.3
1 06-Jul-16 00:04:38.750000, 0.35
1 06-Jul-16 00:04:38.800000, 0.3
I need to print every line whose first column is 1, provided that the first column of both the previous and the next line is greater than 12, and that their fourth columns are either both greater or both lower than the current line's fourth column, like this:
1 06-Jul-16 00:04:29.200000, 0.35
1 06-Jul-16 00:04:33.800000, 0.35
I tried to play with awk a bit, but to no avail:
awk '($1=1) && NR+1($1>12) && NR-1($1>12){print $0}'
And I understand that it is totally wrong.
Thanks for help.

$ cat tst.awk
# Run with the same file named twice. The first read stores every line; the
# second read prints a line when its first field is 1, the first field of both
# neighbours is above 12, and the fourth fields of both neighbours lie on the
# same side (both above or both below) of its own fourth field.
NR == FNR { rec[NR] = $0; next }

{
    split(rec[FNR - 1], before)
    split(rec[FNR + 1], after)
    if (FNR > 1 && $1 == 1 && before[1] > 12 && after[1] > 12 &&
        ((before[4] > $4 && after[4] > $4) ||
         (before[4] < $4 && after[4] < $4)))
        print
}
$ awk -f tst.awk file file
1 06-Jul-16 00:04:29.200000, 0.35
1 06-Jul-16 00:04:33.800000, 0.35
or if you prefer it in one pass at the expense of a little more complexity:
$ cat tst.awk
# One-pass version: hold the two preceding lines and, on each new line, decide
# whether the middle one is printed. It is printed when its first field is 1,
# the first field of both neighbours is above 12, and the fourth fields of
# both neighbours lie on the same side of its own fourth field.
{
    split(older, before)
    split(held, middle)
    split($0, after)
    if (NR > 2 && middle[1] == 1 && before[1] > 12 && after[1] > 12 &&
        ((before[4] > middle[4] && after[4] > middle[4]) ||
         (before[4] < middle[4] && after[4] < middle[4])))
        print held
    # slide the window one line forward
    older = held
    held = $0
}
$ awk -f tst.awk file
1 06-Jul-16 00:04:29.200000, 0.35
1 06-Jul-16 00:04:33.800000, 0.35
p = previous, c = current, n = next.

$ cat > test.awk
# Print the previous line when its first field is 1, the first field of the
# line before it and of the current line is above 12, and its fourth field is
# either above or below the fourth fields of both of those neighbours.
{
    if (midKey == 1 && oldKey > 12 && $1 > 12 &&
        ((midVal > oldVal && midVal > $4) ||
         (midVal < oldVal && midVal < $4)))
        print midLine

    # shift the window: the previous line becomes the older one and the
    # current line becomes the previous one
    oldKey = midKey
    oldVal = midVal
    midKey = $1
    midVal = $4
    midLine = $0
}
$ awk -f test.awk test.in
1 06-Jul-16 00:04:29.200000, 0.35
1 06-Jul-16 00:04:33.800000, 0.35

This only reads the file once:
# Single read of the input. Print a line whose first field is 1 when the lines
# directly before and after it both have a first field above 12 and their last
# fields are both above or both below its own last field.
#
# line[0]/value[0]: the directly preceding line, kept only if its $1 > 12
# line[1]/value[1]: the candidate "1" line, kept only if that held when it
#                   was read
# Both are cleared as soon as a line breaks the chain, so a candidate can
# never be matched against lines that are not its direct neighbours.
$1 == 1 {
    if (line[0] != "") {
        line[1] = $0;
        value[1] = $NF;
    } else {
        line[1] = "";
    }
    line[0] = "";       # this line is not a valid "previous" line itself
    next;
}
$1 > 12 {
    # line[1] is only set when the candidate is the directly preceding line
    # and value[0] belongs to the line just before that candidate
    if (line[1] != "") {
        if ((value[0] > value[1] && $NF > value[1]) || (value[0] < value[1] && $NF < value[1])) {
            print line[1];
        }
    }
    line[1] = "";
    line[0] = $0;
    value[0] = $NF;
    next;
}
{
    # any other line breaks the chain on both sides
    line[0] = "";
    line[1] = "";
}

transpose a file using a key field in awk

I want to use awk to transpose a file over a group field. Tks
I have a file like this.
id Name Value
1 B1 0.1
1 B2 0.3
1 B3 0.4
2 B1 0.2
2 B2 0.09
2 B3 0.10
I want a file like this:
B1 B2 B3
1 0.1 0.3 0.4
2 0.2 0.09 0.10

$ cat tst.awk
# Transpose "id Name Value" lines (grouped by id) into one row per id:
# a header row made of the names of the first group, then "id v1 v2 ..." for
# every group. Fields are joined with tabs; the header starts with an empty
# cell above the id column.
BEGIN { OFS="\t" }
NR==1 { next }                      # skip the "id Name Value" title line

# first data line or a new id: finish the previous group and start a new one
!started || ($1 != prev) {
    if (started) flush()
    hdr = ""
    rec = $1
    started = 1
}
{
    hdr = hdr OFS $2
    rec = rec OFS $3
    prev = $1
}

# Flushing in END as well means a file with a single id still gets its header
# and its id; a file with no data lines prints nothing.
END { if (started) flush() }

# Print the finished group, preceded by the header the first time only.
function flush() {
    if (!hdrDone++) print hdr
    print rec
}
$ awk -f tst.awk file
B1 B2 B3
1 0.1 0.3 0.4
2 0.2 0.09 0.10

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

AWK: sums problematic in awk table transform script - awk

Related

AWK: Help on transforming data table

awk script to sum numbers in a column over a loop not working for some iterations in the loop

How to print columns that have values satisfying a range?

Need to print the lines that matches regex for both themselves and neighboring lanes

transpose a file using a key field in awk

Categories

Resources