AWK: Help on transforming a data table

I have the following file called in.txt:
2020-01-01 fruit banana 3.4
2020-03-02 alcohol smirnov 26.99
2020-03-10 fruit orange 4.20
2020-04-03 fruit orange 4.20
2021-09-01 alcohol beer 6.00
2021-08-03 fruit mango 6.99
2022-01-01 fruit orange 4.30
2022-03-04 alcohol beer 6.00
2022-03-03 alcohol beer 6.00
2022-04-01 fruit mango 7.20
I want to transform the file so it reads something like this:
                    2020-01-01      2021-01-01      2022-01-01
                    -2020-12-31     -2021-12-31     -2022-12-31
fruit     banana    3.40            0.00            0.00
          orange    8.40            0.00            4.30
          mango     0.00            6.99            7.20
Subt                11.80           6.99            11.50
alcohol   beer      0.00            6.00            12.00
          smirnov   26.99           0.00            0.00
Subt                26.99           6.00            12.00
Total               38.79           12.99           23.50
I have started writing the following script but am stuck on how to approach this. How can I display totals columns side by side. The other problem is that this is just dummy data. I have many different categories other than fruit and alcohol and it seems wrong to write if statements and for-loops for each one. Also how can I print fruit and alcohol out just once rather than for every iteration of column 3 and bring the date range to the top. Help is much appreciated.
#!/usr/bin/env bash
awk '
# in.txt is space-separated, so the default FS works
{
    if ($2 == "fruit" && $1 >= "2020-01-01" && $1 <= "2020-12-31") {
        a[$3] += $4
        sa += $4
    }
}
END {
    PROCINFO["sorted_in"] = "@ind_str_asc"   # GNU awk: iterate in sorted order
    for (i in a) {
        print "fruit", i, a[i]
    }
}
' "${@:--}"

Would you please try the following:
#!/bin/bash
awk '
{
    year = substr($1, 1, 4)                       # extract year
    if (from == "" || from > year) from = year    # first (smallest) year
    if (to == "" || to < year) to = year          # last (largest) year
    if (!($3 in category)) {
        category[$3] = $2                         # map item to category
        list[$2] = list[$2] fs[$2] $3             # csv of items
        fs[$2] = ","                              # delimiter for the csv
    }
    sum[$3,year] += $4                            # sum of the item in the year
    subt[$2,year] += $4                           # sum of the category in the year
    ttl[year] += $4                               # sum in the year
}
END {
    format1 = "%-10s%-10s"                        # format for the left cells
    format2 = "%-16s"                             # format for the header
    format3 = "%-16.2f"                           # format for the amounts
    # print upper header
    printf(format1, "", "")
    for (y = from; y <= to; y++) {
        printf(format2, y "-01-01")
    }
    print ""
    # print second header
    printf(format1, "", "")
    for (y = from; y <= to; y++) {
        printf(format2, "-" y "-12-31")
    }
    print ""
    for (cat in list) {                           # loop over the categories ("fruit" and "alcohol")
        n = split(list[cat], item, ",")           # split into items
        for (i = 1; i <= n; i++) {                # loop over the items
            printf(format1, i == 1 ? cat : "", item[i])
            for (y = from; y <= to; y++) {        # loop over years
                printf(format3, sum[item[i],y])   # append the sum of the year
            }
            print ""                              # finally break the line
        }
        print ""                                  # insert a blank line
        printf(format1, "Subt", "")
        for (y = from; y <= to; y++) {
            printf(format3, subt[cat,y])          # append the subtotal
        }
        print "\n"
    }
    printf(format1, "Total", "")
    for (y = from; y <= to; y++) {
        printf(format3, ttl[y])                   # append the total amount
    }
    print ""
}
' in.txt
Output with the provided input:
                    2020-01-01      2021-01-01      2022-01-01
                    -2020-12-31     -2021-12-31     -2022-12-31
alcohol   smirnov   26.99           0.00            0.00
          beer      0.00            6.00            12.00

Subt                26.99           6.00            12.00

fruit     banana    3.40            0.00            0.00
          orange    8.40            0.00            4.30
          mango     0.00            6.99            7.20

Subt                11.80           6.99            11.50

Total               38.79           12.99           23.50
Please forgive me that the order of the items is not the same as the OP's.
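If a deterministic order is wanted, GNU awk can impose one on its for-in loops via PROCINFO["sorted_in"]. A minimal sketch, assuming GNU awk (gawk) is what runs the script:
awk '
BEGIN { PROCINFO["sorted_in"] = "@ind_str_asc" }   # gawk only: sort for-in by index
{ cat[$3] = $2 }                                   # remember the category of each item
END { for (item in cat) print cat[item], item }    # items now come out in sorted order
' in.txt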

Using GNU awk for arrays of arrays:
$ cat tst.awk
BEGIN { OFS="\t" }
{
    sub(/-.*/,"",$1)
    minYear = ( NR==1 || $1 < minYear ? $1 : minYear )
    maxYear = ( NR==1 || $1 > maxYear ? $1 : maxYear )
    items[$2][$3]
    vals[$1][$2][$3] += $4
    typeTots[$1][$2] += $4
    yearTots[$1] += $4
}
END {
    printf "%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%s", OFS, year
    }
    print ""
    for ( type in items ) {
        itemCnt = 0
        for ( item in items[type] ) {
            printf "%s%s%s", (itemCnt++ ? "" : type), OFS, item
            for ( year=minYear; year<=maxYear; year++ ) {
                printf "%s%0.2f", OFS, vals[year][type][item]
            }
            print ""
        }
        printf "Subt%s", OFS
        for ( year=minYear; year<=maxYear; year++ ) {
            printf "%s%0.2f", OFS, typeTots[year][type]
        }
        print ORS
    }
    printf "Total%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%0.2f", OFS, yearTots[year]
    }
    print ""
}
$ awk -f tst.awk in.txt
                2020    2021    2022
alcohol beer    0.00    6.00    12.00
        smirnov 26.99   0.00    0.00
Subt            26.99   6.00    12.00

fruit   orange  8.40    0.00    4.30
        mango   0.00    6.99    7.20
        banana  3.40    0.00    0.00
Subt            11.80   6.99    11.50

Total           38.79   12.99   23.50
or if you really want specific date ranges instead of just the year in the header:
$ cat tst.awk
BEGIN { OFS="\t" }
{
    sub(/-.*/,"",$1)
    minYear = ( NR==1 || $1 < minYear ? $1 : minYear )
    maxYear = ( NR==1 || $1 > maxYear ? $1 : maxYear )
    items[$2][$3]
    vals[$1][$2][$3] += $4
    typeTots[$1][$2] += $4
    yearTots[$1] += $4
}
END {
    printf "%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%s-01-01", OFS, year
    }
    print ""
    printf "%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s-%s-12-31", OFS, year
    }
    print ""
    for ( type in items ) {
        itemCnt = 0
        for ( item in items[type] ) {
            printf "%s%s%s", (itemCnt++ ? "" : type), OFS, item
            for ( year=minYear; year<=maxYear; year++ ) {
                printf "%s%0.2f", OFS, vals[year][type][item]
            }
            print ""
        }
        printf "Subt%s", OFS
        for ( year=minYear; year<=maxYear; year++ ) {
            printf "%s%0.2f", OFS, typeTots[year][type]
        }
        print ORS
    }
    printf "Total%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%0.2f", OFS, yearTots[year]
    }
    print ""
}
$ awk -f tst.awk in.txt | column -s$'\t' -t
                  2020-01-01   2021-01-01   2022-01-01
                  -2020-12-31  -2021-12-31  -2022-12-31
alcohol  beer     0.00         6.00         12.00
         smirnov  26.99        0.00         0.00
Subt              26.99        6.00         12.00

fruit    orange   8.40         0.00         4.30
         mango    0.00         6.99         7.20
         banana   3.40         0.00         0.00
Subt              11.80        6.99         11.50

Total             38.79        12.99        23.50
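A side note on why the never-seen combinations print as 0.00 instead of failing: referencing vals[year][type][item] for a combination that never occurred yields an empty string, and %0.2f formats that as 0.00. A one-liner to see the effect (GNU awk, since it uses an array of arrays):
$ awk 'BEGIN { printf "%0.2f\n", x["never"]["set"] }'
0.00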

I believe the following piece of awk code is a good start. What remains is some cleanup plus a little extra code for the sums.
BEGIN {
    # how many divisions per year (1 = yearly, 4 = quarterly, ...)
    n = 1
    # initialisation of some variables
    tmax = 0; tmin = 999999; ymax = qmax = 0; ymin = 9999; qmin = 99
}
# convert date to year and sub-year division (quarter, trimester, half)
{ y = $1 + 0; q = (substr($1, 6, 2) + 0) % n }
# compute min and max time
(y*100 + q < tmin) { ymin = y; qmin = q; tmin = y*100 + q }
(y*100 + q > tmax) { ymax = y; qmax = q; tmax = y*100 + q }
# Create arrays that keep track of everything
# a : prices by year, q, category and element
# b : just a list of categories, eg fruit
# c : just a list of elements and the category each belongs to
{ a[y,q,$2,$3] += $4; b[$2]; c[$3] = $2 }
END {
    # loop over categories (eg fruit)
    for (i in b) {
        # loop over elements
        for (j in c) {
            # exclude elements that do not belong to the category
            if (i != c[j]) continue
            s = i OFS j
            # loop over the time
            for (y = ymin; y <= ymax; y++) {
                for (q = 0; q < n; ++q) {
                    if (y*100 + q < tmin) continue
                    if (y*100 + q > tmax) continue
                    s = s OFS (a[y,q,i,j] + 0)
                }
            }
            print s
        }
    }
}
This currently outputs:
alcohol beer 0 6 12
alcohol smirnov 26.99 0 0
fruit orange 8.4 0 4.3
fruit mango 0 6.99 7.2
fruit banana 3.4 0 0
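To sketch what the missing sum code could look like: accumulate while the rows are being built, then print the extra rows from those accumulators. The subt and ttl names below are my own additions, not part of the code above:
# inside the j-loop, next to the existing concatenation:
s = s OFS (a[y,q,i,j] + 0)
subt[i, y*100+q] += a[y,q,i,j]   # per-category subtotal
ttl[y*100+q]     += a[y,q,i,j]   # grand total
# after the j-loop, print a Subt row from subt[i, ...];
# after the i-loop, print a Total row from ttl[...]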

AWK: sums problematic in awk table transform script

I have the following file:
cat st_in.txt
2015-01-01 2 A FI
2015-02-03 4 B VI
2015-03-01 6 A FI
2015-01-08 -4 C VE
2016-01-05 -3 B VE
2016-02-03 -1 D FE
2016-04-01 -2 B FE
2016-06-13 -5 D VE
2017-01-01 2 A VI
2017-02-03 3 A VI
2017-02-04 8 C FI
2017-01-05 -1 B FE
And want to transform it like this:
        2015    2016    2017
A       8.00    0.00    0.00
C       0.00    0.00    8.00
sumFI   8.00    0.00    8.00
A       0.00    0.00    5.00
B       4.00    0.00    0.00
sumVI   4.00    0.00    5.00
sumI    12.00   0.00    13.00
B       0.00    -2.00   -1.00
D       0.00    -1.00   0.00
sumFE   0.00    -3.00   -1.00
B       0.00    -3.00   0.00
C       -4.00   0.00    0.00
D       0.00    -5.00   0.00
sumVE   -4.00   -8.00   0.00
sumE    -4.00   -11.00  -1.00
net     8.00    -11.00  12.00
To do that I have written the following script:
#!/usr/bin/env bash
awk '
BEGIN {
    OFS = "\t"
}
{
    yr = substr($1, 1, 4)
    sub(/-.*/, "", $1)
    minYr = (NR == 1 || $1 < minYr ? $1 : minYr)
    maxYr = (NR == 1 || $1 > maxYr ? $1 : maxYr)
    H[$4][$3]
    W[yr][$4][$3] += $2
    yT[$1] += $2
    val[$1][$4] += $2
}
END {
    for (yr = minYr; yr <= maxYr; yr++) {
        printf "%s%s", OFS, yr
    }
    print ""
    print ""
    for (cT in H) {
        for (c in H[cT]) {
            printf c, OFS
            for (yr = minYr; yr <= maxYr; yr++) {
                printf "%s%0.2f", OFS, W[yr][cT][c]
            }
            print ""
        }
        printf "sum" cT, OFS
        for (yr = minYr; yr <= maxYr; yr++) {
            printf "%s%0.2f", OFS, val[yr][cT]
        }
        print ""
        print ""
        if (cT == "VI") {
            printf "sumI", OFS
            for (yr = minYr; yr <= maxYr; yr++) {
                printf "%s%0.2f", OFS, W[yr][cT][c]
            }
        } else if (cT == "VE") {
            printf "sumE", OFS
            for (yr = minYr; yr <= maxYr; yr++) {
                printf "%s%0.2f", OFS, W[yr][cT][c]
            }
        }
        print ORS
    }
    printf "net"
    for (yr = minYr; yr <= maxYr; yr++) {
        printf "%s%0.2f", OFS, yT[yr]
    }
    print ""
}
' "${@:--}"
Which prints the following:
./trans1 st_in.txt
2015 2016 2017
A 8.00 0.00 0.00
C 0.00 0.00 8.00
sumFI 8.00 0.00 8.00
A 0.00 0.00 5.00
B 4.00 0.00 0.00
sumVI 4.00 0.00 5.00
sumI 4.00 0.00 0.00
B 0.00 -2.00 -1.00
D 0.00 -1.00 0.00
sumFE 0.00 -3.00 -1.00
B 0.00 -3.00 0.00
C -4.00 0.00 0.00
D 0.00 -5.00 0.00
sumVE -4.00 -8.00 0.00
sumE 0.00 -5.00 0.00
net 8.00 -11.00 12.00
I am not worried about the formatting here; the console printout is similar to the target. What I am struggling with is getting the correct sums for sumI (sumFI + sumVI) and sumE (sumFE + sumVE). Can someone please help with that?
EDIT by Ed Morton to provide meaningful data types and variable names, assuming the OP's current data has columns of Date Amount Item Type as it did in their first question:
$ cat trans1
#!/usr/bin/env bash
awk '
BEGIN {
    OFS = "\t"
}
{
    date = $1
    amount = $2
    item = $3
    type = $4
    year = substr(date, 1, 4)
    minYear = (NR == 1 || year < minYear ? year : minYear)
    maxYear = (NR == 1 || year > maxYear ? year : maxYear)
    types_items[type][item]
    yearsTypesItems2amounts[year][type][item] += amount
    years2amounts[year] += amount
    yearsTypes2amounts[year][type] += amount
}
END {
    for (year = minYear; year <= maxYear; year++) {
        printf "%s%s", OFS, year
    }
    print ""
    print ""
    for (type in types_items) {
        for (item in types_items[type]) {
            printf item, OFS
            for (year = minYear; year <= maxYear; year++) {
                printf "%s%0.2f", OFS, yearsTypesItems2amounts[year][type][item]
            }
            print ""
        }
        printf "sum" type, OFS
        for (year = minYear; year <= maxYear; year++) {
            printf "%s%0.2f", OFS, yearsTypes2amounts[year][type]
        }
        print ""
        print ""
        if (type == "VI") {
            printf "sumI", OFS
            for (year = minYear; year <= maxYear; year++) {
                printf "%s%0.2f", OFS, yearsTypesItems2amounts[year][type][item] # <--- NOTE: item unset here
            }
        } else if (type == "VE") {
            printf "sumE", OFS
            for (year = minYear; year <= maxYear; year++) {
                printf "%s%0.2f", OFS, yearsTypesItems2amounts[year][type][item] # <--- NOTE: item unset here
            }
        }
        print ORS
    }
    printf "net"
    for (year = minYear; year <= maxYear; year++) {
        printf "%s%0.2f", OFS, years2amounts[year]
    }
    print ""
}
' "${@:--}"
See the "NOTE"s above: just doing this renaming exercise made it obvious where some bugs are (it looks like you were using the wrong array and should be using yearsTypes2amounts[year][type] instead of yearsTypesItems2amounts[year][type][item]).
Continuing to use GNU awk for arrays of arrays, and assuming the OP's current data has columns of Date Amount Item Type as it did in their first question:
$ cat trans1
#!/usr/bin/env bash
awk '
BEGIN {
    OFS = "\t"
}
{
    date = $1
    amount = $2
    item = $3
    type = $4
    year = substr(date, 1, 4)
    minYear = (NR == 1 || year < minYear ? year : minYear)
    maxYear = (NR == 1 || year > maxYear ? year : maxYear)
    cat = substr(type, 2)
    ctiys2amounts[cat][type][item][year] += amount
}
END {
    for (year = minYear; year <= maxYear; year++) {
        printf "%s%s", OFS, year
    }
    print ORS
    for (cat in ctiys2amounts) {
        delete catSum
        for (type in ctiys2amounts[cat]) {
            delete typeSum
            for (item in ctiys2amounts[cat][type]) {
                printf "%s", item
                for (year = minYear; year <= maxYear; year++) {
                    amount = ctiys2amounts[cat][type][item][year]
                    printf "%s%0.2f", OFS, amount
                    typeSum[year] += amount
                }
                print ""
            }
            printf "sum%s", type
            for (year = minYear; year <= maxYear; year++) {
                printf "%s%0.2f", OFS, typeSum[year]
                catSum[year] += typeSum[year]
            }
            print ORS
        }
        printf "sum%s", cat
        for (year = minYear; year <= maxYear; year++) {
            printf "%s%0.2f", OFS, catSum[year]
            yearSum[year] += catSum[year]
        }
        print ORS
    }
    printf "net"
    for (year = minYear; year <= maxYear; year++) {
        printf "%s%0.2f", OFS, yearSum[year]
    }
    print ""
}
' "${@:--}"
$ ./trans1 st_in.txt
        2015    2016    2017

B       0.00    -2.00   -1.00
D       0.00    -1.00   0.00
sumFE   0.00    -3.00   -1.00

B       0.00    -3.00   0.00
C       -4.00   0.00    0.00
D       0.00    -5.00   0.00
sumVE   -4.00   -8.00   0.00

sumE    -4.00   -11.00  -1.00

A       8.00    0.00    0.00
C       0.00    0.00    8.00
sumFI   8.00    0.00    8.00

A       0.00    0.00    5.00
B       4.00    0.00    0.00
sumVI   4.00    0.00    5.00

sumI    12.00   0.00    13.00

net     8.00    -11.00  12.00
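The step that makes the sumI/sumE rollup fall out naturally is cat = substr(type, 2): it drops the first letter, so FI and VI collapse to category I while FE and VE collapse to E, and the category loop then produces sumI/sumE exactly the way the type loop produces sumFI/sumVI. For example:
$ awk 'BEGIN { print substr("FI",2), substr("VI",2), substr("FE",2), substr("VE",2) }'
I I E E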

awk equivalents for tidyverse concepts (melt and spread)

I have some text logs that I need to parse and format into CSV.
I have a working R script, but it is slow once file sizes increase, and this problem seems like a good candidate for a speed-up using awk (or other command-line tools?) as I understand it.
I have not done much with awk, and the issue I am having is translating how I think about processing in R to how awk scripting is done.
Example truncated input data (Scrap.log):
; these are comment lines
; *******************************************************************************
; \\C:\Users\Computer\Folder\Folder\Scrap.log
!!G 99999 % % % % % % % % CURRENT XYZ ABC STATE1 STATE2
_START Header1 Header2 Header3 Header4 Header5 Header6 Header7
10 12.23 1.91 6.63 1.68 50.03 0.50 13.97
11 11.32 1.94 6.64 1.94 50.12 0.58 15.10
12 12.96 2.15 6.57 2.12 55.60 0.62 16.24
13 11.43 2.18 6.60 2.36 50.89 0.68 17.39
14 14.91 2.32 6.64 2.59 56.09 0.73 18.41
15 13.16 2.38 6.53 2.85 51.62 0.81 19.30
16 15.02 2.50 6.67 3.05 56.22 0.85 20.12
!!G 99999 % % % % % % % % CURRENT XYZ ABC STATE1 STATE2
_START Header8 Header9 Header10 Header11 Header12 Header13 Header14
10 22.03 24.41 15.01 51.44 44.28 16.57 11.52
11 21.05 24.62 15.62 51.23 45.42 16.47 11.98
12 20.11 24.64 16.38 52.16 46.59 16.54 12.42
13 24.13 24.93 17.23 52.34 47.72 16.51 12.88
14 27.17 24.95 18.06 52.79 48.72 16.45 13.30
15 22.87 25.04 19.27 53.01 49.50 16.47 13.63
16 23.08 25.22 20.12 53.75 50.64 16.55 14.03
Expected output (truncated):
HH1,HH2,HH3,HH4,HH5,HH6,HH7,HH8,HH9,HH10,HH11,HH12,HH13,HH14,START,HeaderName,Value
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header1,12.23
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header2,1.91
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header3,6.63
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header4,1.68
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header5,50.03
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header6,0.5
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header7,13.97
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header1,11.32
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header2,1.94
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header3,6.64
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header4,1.94
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header5,50.12
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header6,0.58
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header7,15.1
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header1,12.96
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header2,2.15
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header3,6.57
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header4,2.12
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header5,55.6
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header6,0.62
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header7,16.24
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,13,Header1,11.43
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,13,Header2,2.18
...
My general steps in the R script:
add a single header row with new names at the top of file
spread the top row (starting with !!G) to each row
melt the header column (_START) from wide to long format
Pieces I have working in awk so far include:
how to grab and print the header lines
awk '/_START/ {header = $0; print header}' Scrap.log
How to write a single row with the new header values
awk ' BEGIN{ ORS=" "; for (counter = 1; counter <= 14; counter++) print "HH",counter;}'
I know each block is separated by a newline and starts with !!G, so I can write a match on that. I am unsure whether a split-apply-combine type of thinking works well in awk:
awk '/!!G/,/\n/ {print}' Scrap.log
Alternatively, I tried setting the RS/FS parameters like:
awk ' BEGIN{RS="\n";FS=" ";}/^!!G/{header=$0;print header}/[0-9]/{print $2}END{}' Scrap.log
I then get stuck on iterating over the rows and fields to do the melt step as well as combining the capture groups correctly.
How do I combine all these pieces to get to the CSV format?
I think the following:
awk '
BEGIN {
    # output the header line
    print "HH1,HH2,HH3,HH4,HH5,HH6,HH7,HH8,HH9,HH10,HH11,HH12,HH13,HH14,START,HeaderName,Value"
}
# ignore comment lines (they start with ";")
/^;/ { next }
/!!G/ {
    valcnt = 1
    # save and shuffle the values
    val[valcnt++] = $2
    val[valcnt++] = $11
    val[valcnt++] = $12
    val[valcnt++] = $13
    val[valcnt++] = $14
    val[valcnt++] = $15
    val[valcnt++] = $3
    val[valcnt++] = $4
    val[valcnt++] = $5
    val[valcnt++] = $6
    val[valcnt++] = $7
    val[valcnt++] = $8
    val[valcnt++] = $9
    val[valcnt++] = $10
    next
}
/_START /{
    # these are headers - save them to head, to be reused later
    for (i = 2; i <= NF; ++i) {
        # fun fact: it is indexed on NF
        head[i] = $i
    }
    next
}
# this function is redundant, but it is easier for me to think about the code this way
function output(firstval, header, value, \
                cur, i) {
    cur = valcnt
    val[cur++] = firstval
    val[cur++] = header
    val[cur++] = value
    # output val as csv
    for (i = 1; i < cur; ++i) {
        printf "%s%s", val[i], i != cur - 1 ? "," : "\n"
    }
}
/[0-9]+/ {
    for (i = 2; i <= NF; ++i) {
        # add these 3 to all the other values and output them
        # ie. add the first column, the header from head and the value
        output($1, head[i], $i)
    }
}
'
Should output what you want. Tested on repl.
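For completeness, assuming the program above is saved as melt.awk (my name for it, not part of the answer), it would be invoked as:
awk -f melt.awk Scrap.log > Scrap.csv
Blank lines between blocks fall through all four patterns without printing anything, so they are ignored automatically.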

Is there an efficient way to do a vertical lookup kind of task in AWK using multiple files?

I am struggling a lot with the following task, which I am currently trying to accomplish using AWK. I am not very familiar with AWK, so I am not even sure AWK is the best tool for this. If this is better solved with Python, please let me know (but I know even less Python).
I need to prepare an input file for an analysis which is based on collecting gene expression P-values of multiple species from different files. For each species there are multiple treatment files.
In brief: I need to collect P-values linked to sequenceIDs from multiple files and put them in a single file ordered per Orthogroup. For each Orthogroup I only need to keep the lowest P-value per species treatment file.
Orthogroupfile: A list of all orthogroups: on each line one orthogroup, every column is a sequenceID, 1st column is the orthogroupID.
OG0052916: TRINITY_TN_DN99904_c0_g1 TRINITY_AG_DN38054_c0_g1 TRINITY_AG_DN41618_c0_g1 TRINITY_AG_DN47300_c0_g1
OG0001002: TRINITY_AG_DN119624_c0_g1 TRINITY_AG_DN161549_c0_g1 TRINITY_AG_DN60596_c0_g1 TRINITY_MB_DN61252_c1_g1 TRINITY_SE_DN51134_c2_g1 TRINITY_SL_DN27816_c0_g1 TRINITY_SL_DN76945_c4_g1 TRINITY_SL_DN77747_c0_g1 TRINITY_SL_DN77747_c1_g1 TRINITY_TN_DN52316_c0_g1
OG0002002: TRINITY_AG_DN56841_c0_g1 TRINITY_MB_DN200880_c1_g1 TRINITY_SE_DN45370_c1_g1 TRINITY_SE_DN53999_c0_g1 TRINITY_SL_DN16333_c0_g1 TRINITY_SL_DN65991_c0_g1 TRINITY_TN_DN180200_c0_g1 TRINITY_TN_DN48658_c0_g1
OG0052920: TRINITY_TN_DN99983_c0_g1 TRINITY_AG_DN12345_c0_g1
Speciesfile: For each species I have a separate file summarising differential gene expression data, but for every species I have multiple treatments and thus multiple species treatment files. What is important for me is the P-value (10th column in the real files) and the sequence ID (1st column). Each species in the analysis has such a file; the two-letter code in the sequence IDs is a species code ("AG", "TN", "SE", "SL", "MB").
Speciesfile treatment 1, e.g. AG.txt:
TRINITY_AG_DN38054_c0_g1 0.364813449
TRINITY_AG_DN41618_c0_g1 0.000130019
TRINITY_AG_DN47300_c0_g1 0.000195804
TRINITY_AG_DN119624_c0_g1 0.067
TRINITY_AG_DN161549_c0_g1 0.00036
TRINITY_AG_DN60596_c0_g1 0.023
TRINITY_AG_DN12345_c0_g1 NA
TRINITY_AG_DN56841_c0_g1 0.034
Speciesfile treatment 2 e.g. AA.txt:
TRINITY_AG_DN38054_c0_g1 3.364813449e-07
TRINITY_AG_DN41618_c0_g1 6.000130019e-03
TRINITY_AG_DN47300_c0_g1 8.000195804e-02
TRINITY_AG_DN119624_c0_g1 5.067e-05
TRINITY_AG_DN161549_c0_g1 5.00036e-06
TRINITY_AG_DN60596_c0_g1 4.023e-7
TRINITY_AG_DN12345_c0_g1 0.03
TRINITY_AG_DN56841_c0_g1 2.034e-2
Speciesfile treatment 1 e.g. TN.txt:
TRINITY_TN_DN99904_c0_g1 0.005
TRINITY_TN_DN99983_c0_g1 0.063
TRINITY_TN_DN180200_c0_g1 0.0326
TRINITY_TN_DN48658_c0_g1 0.02762
TRINITY_TN_DN52316_c0_g1 0.000737267
Speciesfile treatment 2, e.g. TA.txt:
TRINITY_TN_DN99904_c0_g1 6.005e-4
TRINITY_TN_DN99983_c0_g1 9.063e-03
TRINITY_TN_DN180200_c0_g1 1.0326e-1
TRINITY_TN_DN48658_c0_g1 3.02762e-09
TRINITY_TN_DN52316_c0_g1 2.000737267e-10
MB.txt:
TRINITY_MB_DN61252_c1_g1 0.0004378
TRINITY_MB_DN200880_c1_g1 0.00007281
SE.txt:
TRINITY_SE_DN51134_c2_g1 0.0007367
TRINITY_SE_DN53999_c0_g1 0.00376
TRINITY_SE_DN45370_c1_g1 0.00067356
The output file that I need "summarises" information from the different species, with an Orthogroup on each line. I am only interested in the P-values.
First column: Orthogroup ID
Second column: lowest P-value for all genes of sp1 in this Orthogroup (e.g. "AG"; this is species treatment file dependent)
Third column: total nr. of genes of sp1 in this Orthogroup (this will be similar for different treatments of the same species)
Fourth column: total number of genes for sp1 in the cluster (but this can always be the same as the third column)
The next three columns then repeat the same for the next species, and so on. NA if there are no genes of that species in that orthogroup.
Example output.txt, which includes the P-value information for all different species "AG", "MB", "TN", "SE" and "SL":
Group AG-Pvalue AG-nGenes AG-ClusterSize MB-Pvalue MB-nGenes MB-ClusterSize SE-Pvalue SE-nGenes SE-ClusterSize TN-Pvalue TN-nGenes TN-ClusterSize AA-Pvalue AA-nGenes AA-ClusterSize TA-Pvalue TA-nGenes TA-ClusterSize
OG0052916 0.000130019 3 3 NA NA NA NA NA NA 0.005 1 1 3.364813449e-07 3 3 6.005e-4 1 1
OG0002002 0.034 1 1 0.00007281 1 1 0.00067356 3 3 0.02762 2 2 2.034e-2 1 1 3.02762e-09 2 2
OG0001002: 0.00036 3 3 0.0004378 1 1 0.0007367 1 1 0.000737267 1 1 5.067e-05 3 3 2.000737267e-10 1 1
OG0052920: NA NA NA NA NA NA NA NA NA 0.063 1 1 0.03 1 1 9.063e-03 1 1
"Next-Orthogroup" "lowest P-value of the diet treatment per species" "nr of genes of this species in this orthogroup"
I realise this problem consists of 3 different problems:
1. a simple vertical look-up
2. an if-then choice: if there are multiple genes in an Orthogroup, copy the lowest P-value
3. calculating the number of genes per species per Orthogroup
I wanted to tackle this one by one, but failed already at the first step:
awk 'NR==FNR{a[$0];next} $1 in a {print $10}' Orthogroups1.txt TN.txt
Check all columns of file 1 for occurrence in file 2 and print the 10th column.
If anyone could help me with the above? Even if it is just a direction, thank you so much!
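As a pointer on why the attempted one-liner prints nothing: a[$0] stores each whole orthogroup line as an array key, so $1 of TN.txt (a single sequence ID) can never match one of those keys. Mapping each sequence ID to its orthogroup instead would look something like this (a sketch only; the truncated sample files have the P-value in column 2, so the real files would use $10):
awk '
NR==FNR { for (i=2; i<=NF; i++) og[$i] = $1; next }   # first file: seqID -> orthogroupID
$1 in og { print og[$1], $1, $2 }                     # second file: orthogroup, seqID, P-value
' Orthogroups1.txt TN.txt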
The following awk script performs these steps, based on the question (assuming the latest post captures all requirements):
Load the lookup tables AG.txt, MB.txt, ... (BEGIN block)
Read the main data file, and find the min and count per group/species.
Print the output (END block)
awk '
BEGIN {
    # Load all XX.txt files
    n_species = split("AG,MB,TN,SE,SL", species, ",")
    for (s in species) {
        sfile = species[s] ".txt"
        nn = 0
        while ( (getline < sfile) > 0 ) { v[$1] = $2; nn++ }
        print "Loaded:", sfile, nn > "/dev/stderr"
    }
}
{
    g = $1   # Group
    # Calculate count, min per group
    for (i=2 ; i<=NF ; i++ ) {
        id = $i
        split(id, parts, "_")
        ss = parts[2]   # Species
        val = v[id]
        if ( val ) {
            if ( !vcount[g, ss] || val < vmin[g, ss] ) vmin[g, ss] = val
            vcount[g, ss]++
            group[g]++
            # print "SET", id, g, ss, val, vmin[g,ss], vcount[g, ss]
        }
    }
}
END {
    # Header Line
    printf "%s", "group"
    for (s in species) {
        ss = species[s]
        printf " %s-PValue %s-nGenes %s-ClusterSize", ss, ss, ss
    }
    printf "\n"
    # Print line
    ng = 0
    for (g in group) {
        ng++
        printf "%s", g
        for (s in species) {
            ss = species[s]
            # print "GET", g, ss, vmin[g, ss], vcount[g, ss], "X"
            s_min = vmin[g, ss]
            s_count = vcount[g, ss]
            s_cs = vcount[g, ss]
            if ( !s_count ) { s_count = s_min = s_cs = "NA" }
            printf " %s %s %s", s_min, s_count, s_cs
        }
        printf "\n"
    }
    print "Groups:", ng > "/dev/stderr"
}' < data.txt
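One robustness note on the BEGIN-block loader, as an assumption of mine rather than a problem the OP reported: each file read with getline stays open, so with many species files it is worth closing them as you go:
while ( (getline < sfile) > 0 ) { v[$1] = $2; nn++ }
close(sfile)   # release the descriptor before opening the next species file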
Upgraded answer, to address the additional data files, per the additional information from the OP:
Invoke with the list of species/treatment files, the indicator DATA=1, and the data file:
script.awk AG.txt MB.txt SE.txt TN.txt AA.txt TA.txt DATA=1 data.txt
script.awk
#! /usr/bin/awk -f
!DATA {
    # Lookup-table files: calculate the key from the file name
    if ( FNR == 1 ) {
        ncol++
        k = gensub("(.*/)?([^/]+)\\.([^/]*)$", "\\2", 1, FILENAME)   # GNU awk gensub()
        cols[ncol] = k
    }
    v[k, $1] = $2
    # Track keys
    ccount[k]++
    next
}
{
    g = $1   # Group
    # Calculate count, min per group
    for (i=2 ; i<=NF ; i++ ) {
        id = $i
        split(id, parts, "_")
        for (k in cols) {
            ss = cols[k]
            val = v[ss, id]
            if ( !val ) continue
            if ( !vcount[g, ss] || val < vmin[g, ss] ) vmin[g, ss] = val
            vcount[g, ss]++
            gcount[g, ss]++
            group[g]++
            # print "SET", id, g, ss, val, vmin[g,ss], vcount[g, ss]
        }
    }
}
END {
    # Header Line
    printf "%s", "group"
    for (k in cols) {
        ss = cols[k]
        printf " %s-PValue %s-nGenes %s-ClusterSize", ss, ss, ss
    }
    printf "\n"
    # Print line
    ng = 0
    for (g in group) {
        ng++
        printf "%s", g
        for (k in cols) {
            ss = cols[k]
            s_min = vmin[g, ss]
            s_count = vcount[g, ss]
            s_cs = gcount[g, ss]
            # print "GET", g, ss, vmin[g, ss], vcount[g, ss], "X"
            if ( !s_count ) { s_count = s_min = s_cs = "NA" }
            printf " %s %s %s", s_min, s_count, s_cs
            # printf " %s %d %d", vmin[g, ss] ? vmin[g, ss] : "NA" , vcount[g, ss], vcount[g, ss]
        }
        printf "\n"
    }
    for (k in cols ) {
        ss = cols[k]
        print "Col:", ss, ccount[ss] > "/dev/stderr"
    }
    print "Groups:", ng > "/dev/stderr"
}
Output for:
awk -f ./script-spcomp.awk AG.txt MB.txt SE.txt TN.txt AA.txt TA.txt DATA=1 data.txt
group AG-PValue AG-nGenes AG-ClusterSize MB-PValue MB-nGenes MB-ClusterSize SE-PValue SE-nGenes SE-ClusterSize TN-PValue TN-nGenes TN-ClusterSize AA-PValue AA-nGenes AA-ClusterSize TA-PValue TA-nGenes TA-ClusterSize
OG0052920: NA NA NA NA NA NA NA NA NA 0.063 1 1 NA NA NA 9.063e-03 1 1
OG0052916: 0.000130019 3 3 NA NA NA NA NA NA 0.005 1 1 3.364813449e-07 3 3 6.005e-4 1 1
OG0002002: 0.034 1 1 0.00007281 1 1 0.00067356 2 2 0.02762 2 2 2.034e-2 1 1 3.02762e-09 2 2
OG0001002: 0.00036 3 3 0.0004378 1 1 0.0007367 1 1 0.000737267 1 1 4.023e-7 3 3 2.000737267e-10 1 1
When running with a modified list of columns, the output is:
awk -f script-spcomp.awk TA.txt SE.txt MB.txt AG.txt AA.txt TN.txt DATA=1 ortho.txt
group TA-PValue TA-nGenes TA-ClusterSize SE-PValue SE-nGenes SE-ClusterSize MB-PValue MB-nGenes MB-ClusterSize AG-PValue AG-nGenes AG-ClusterSize AA-PValue AA-nGenes AA-ClusterSize TN-PValue TN-nGenes TN-ClusterSize
OG0052920: 9.063e-03 1 1 NA NA NA NA NA NA NA NA NA NA NA NA 0.063 1 1
OG0052916: 6.005e-4 1 1 NA NA NA NA NA NA 0.000130019 3 3 3.364813449e-07 3 3 0.005 1 1
OG0002002: 3.02762e-09 2 2 0.00067356 2 2 0.00007281 1 1 0.034 1 1 2.034e-2 1 1 0.02762 2 2
OG0001002: 2.000737267e-10 1 1 0.0007367 1 1 0.0004378 1 1 0.00036 3 3 4.023e-7 3 3 0.000737267 1 1
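A caveat on the column order in both versions: for (k in cols) does not guarantee any particular traversal order in awk, even for numeric indices, so the header and data columns follow the command-line file order only by luck. Iterating by index pins it down; for example, in the header loop:
# instead of: for (k in cols)
for (k = 1; k <= ncol; k++) {   # guaranteed command-line file order
    ss = cols[k]
    printf " %s-PValue %s-nGenes %s-ClusterSize", ss, ss, ss
}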

Using awk to find min and max?

I have a large file that contains many types of lines. The lines of interest all start with ATOM, and the 7th-9th fields are the x, y, and z values of the specified atom. How can I use awk to find all ATOM lines and then calculate the min and max of the x, y, and z values?
This is my file: http://pastebin.com/EqA2SUMy
The ATOM lines look like this:
ATOM 1 N ASP A 435 7.397 28.376 121.784 1.00 34.35 N
ATOM 2 CA ASP A 435 8.023 27.301 122.545 1.00 30.66 C
ATOM 3 C ASP A 435 8.170 27.721 124.009 1.00 31.39 C
ATOM 4 O ASP A 435 9.078 28.509 124.284 1.00 38.78 O
Can anyone show me how to do this please?
#!/usr/bin/awk -f
BEGIN {
    # assumes every coordinate is below 1000; the max variables start
    # at 0 (uninitialized), so it also assumes the maxima are positive
    min7 = min8 = min9 = 1000
}
$1 == "ATOM" {
    if ($7 < min7) min7 = $7
    if ($8 < min8) min8 = $8
    if ($9 < min9) min9 = $9
    if ($7 > max7) max7 = $7
    if ($8 > max8) max8 = $8
    if ($9 > max9) max9 = $9
}
END {
    print min7, min8, min9
    print max7, max8, max9
}
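If the coordinates can be arbitrary (negative throughout, or above 1000), initializing the extremes from the first matching record avoids the sentinel assumption entirely. A variant sketch:
#!/usr/bin/awk -f
$1 == "ATOM" {
    if (!seen) { min7 = max7 = $7; min8 = max8 = $8; min9 = max9 = $9; seen = 1 }
    if ($7 < min7) min7 = $7; if ($7 > max7) max7 = $7
    if ($8 < min8) min8 = $8; if ($8 > max8) max8 = $8
    if ($9 < min9) min9 = $9; if ($9 > max9) max9 = $9
}
END {
    print min7, min8, min9
    print max7, max8, max9
}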

Taking similar consecutive rows and appending them into one longer row with AWK

I've got a big 7-column text file with sorted rows like this:
gi|352964122|gb|JH286168.1| 00884 C C 14 1.00 u
gi|352964122|gb|JH286168.1| 00884 C C 26 0.76 p
gi|352964122|gb|JH286168.1| 00884 C C 33 0.89 f
gi|352964122|gb|JH286168.1| 00885 G G 14 1.00 u
gi|352964122|gb|JH286168.1| 00885 A A 30 0.84 f
gi|352964122|gb|JH286168.1| 00886 T T 31 0.81 f
What I need to do is: if the first two columns are the same in consecutive rows, append the rest of the columns to the first row. There can be 1, 2, or 3 "similar" rows, and I need placeholders to keep the columns intact if there are fewer than 3. So the above would look like this:
gi|352964122|gb|JH286168.1| 00884 C C 14 1.00 u C C 26 0.76 p C C 33 0.89 f
gi|352964122|gb|JH286168.1| 00885 G G 14 1.00 u - - - ------------ G G 33 0.89 f
gi|352964122|gb|JH286168.1| 00886 T T 31 0.81 f - - - ---- - - - ------ - - -- ----- - -
I've tried many approaches with AWK but can't quite get it. How might this be done?
I'm unsure how you get your second row, but this might at least match how I understand the goal:
awk '
{
    head = $1 " " $2
    tail = $3 " " $4 " " $5 " " $6 " " $7
    if (previous != head) {
        if (previous != "") printf("%s %s %s %s\n", previous, p[1], p[2], p[3])
        previous = head
        i = 1
        p[i] = tail
        p[2] = p[3] = "- - - - -"   # placeholder: one dash per missing field
    } else {
        i = i + 1
        p[i] = tail
    }
}
END { printf("%s %s %s %s\n", previous, p[1], p[2], p[3]) }'
Output:
gi|352964122|gb|JH286168.1| 00884 C C 14 1.00 u C C 26 0.76 p C C 33 0.89 f
gi|352964122|gb|JH286168.1| 00885 G G 14 1.00 u A A 30 0.84 f - - - - -
gi|352964122|gb|JH286168.1| 00886 T T 31 0.81 f - - - - - - - - - -
This should do it:
(Edit: I didn't notice you needed placeholders. I'll look into it....)
awk '
$1 == last1 && $2 == last2 {
    printf " %s %s %s %s %s", $3, $4, $5, $6, $7
    last1 = $1; last2 = $2
    next
}
{
    $1 = $1   # normalize spacing
    printf "%s%s", NR==1 ? "" : "\n", $0
    last1 = $1; last2 = $2
}
END { print "" }
' file
$ cat tst.awk
BEGIN { maxRecs = 3 }
function prta(   i, dflt) {
    dflt = a[1]
    gsub(/[^[:space:]]+/,"-",dflt)   # placeholder: one dash per field
    printf "%s ", prev               # key of the group that just ended
    for (i=1; i<=maxRecs; i++) {
        printf "%s%s", (i in a ? a[i] : dflt), (i<maxRecs ? OFS : ORS)
        delete a[i]
    }
    numRecs = 0
}
{ key = $1 FS $2 }
prev && (key != prev) { prta() }
{
    $1 = $1                                    # normalize spacing
    sub(/([^[:space:]]+[[:space:]]+){2}/,"")   # strip the first two fields
    a[++numRecs] = $0
    prev = key
}
END { prta() }
$
$ awk -f tst.awk file
gi|352964122|gb|JH286168.1| 00884 C C 14 1.00 u C C 26 0.76 p C C 33 0.89 f
gi|352964122|gb|JH286168.1| 00885 G G 14 1.00 u A A 30 0.84 f - - - - -
gi|352964122|gb|JH286168.1| 00886 T T 31 0.81 f - - - - - - - - - -
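Two idioms in that last script are worth spelling out: assigning $1 = $1 forces awk to rebuild $0 with single separators, and the sub() call then strips the first two fields so only the tail of the record is stored. A quick demonstration:
$ echo 'a   b    c  d' | awk '{ $1 = $1; sub(/([^[:space:]]+[[:space:]]+){2}/,""); print }'
c d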