I have the following file called in.txt:
2020-01-01 fruit banana 3.4
2020-03-02 alcohol smirnov 26.99
2020-03-10 fruit orange 4.20
2020-04-03 fruit orange 4.20
2021-09-01 alcohol beer 6.00
2021-08-03 fruit mango 6.99
2022-01-01 fruit orange 4.30
2022-03-04 alcohol beer 6.00
2022-03-03 alcohol beer 6.00
2022-04-01 fruit mango 7.20
I want to transform the file so it reads something like this:
                  2020-01-01  2021-01-01  2022-01-01
                 -2020-12-31 -2021-12-31 -2022-12-31
fruit   banana          3.40        0.00        0.00
        orange          8.40        0.00        4.30
        mango           0.00        6.99        7.20
        Subt           11.80        6.99       11.50
alcohol beer            0.00        6.00       12.00
        smirnov        26.99        0.00        0.00
        Subt           26.99        6.00       12.00
Total                  38.59       12.99       23.50
I have started writing the following script but am stuck on how to approach this. How can I display the totals columns side by side? The other problem is that this is just dummy data; I have many different categories besides fruit and alcohol, and it seems wrong to write if statements and for loops for each one. Also, how can I print fruit and alcohol just once rather than for every iteration of column 3, and bring the date range to the top? Help is much appreciated.
#!/usr/bin/env bash
awk '
BEGIN{
    FS=OFS="\t";
}
{
    if ($2 ~ fruit && $1 >= "2020-01-01" && $1 <= "2020-12-31") {
        a[$3]+=$4;
        sa+=$4;
    }
}
END {
    PROCINFO["sorted_in"]="#ind_str_asc";
    for (i in a) {
        print "fruit", i, a[i]
    }
}
' "${#:--}"
Would you please try the following:
#!/bin/bash
awk '
{
    year = substr($1, 1, 4)                     # extract year
    if (from == "" || from > year) from = year  # first (smallest) year
    if (to == "" || to < year) to = year        # last (largest) year
    if (!($3 in category)) {
        category[$3] = $2                       # map item to category
        list[$2] = list[$2] fs[$2] $3           # csv of items
        fs[$2] = ","                            # delimiter for csv
    }
    sum[$3,year] += $4                          # sum of the item in the year
    subt[$2,year] += $4                         # sum of the category in the year
    ttl[year] += $4                             # sum in the year
}
END {
    format1 = "%-10s%-10s"                      # format for the left cells
    format2 = "%-16s"                           # format for the header
    format3 = "%-16.2f"                         # format for the amounts
    # print upper header
    printf(format1, "", "")
    for (y = from; y <= to; y++) {
        printf(format2, y "-01-01")
    }
    print ""
    # print second header
    printf(format1, "", "")
    for (y = from; y <= to; y++) {
        printf(format2, "-" y "-12-31")
    }
    print ""
    for (cat in list) {                         # loop over the categories ("fruit" and "alcohol")
        n = split(list[cat], item, ",")         # split into items
        for (i = 1; i <= n; i++) {              # loop over the items
            printf(format1, i == 1 ? cat : "", item[i])
            for (y = from; y <= to; y++) {      # loop over years
                printf(format3, sum[item[i],y]) # append the sum of the year
            }
            print ""                            # finally break the line
        }
        print ""                                # insert blank line
        printf(format1, "Subt", "")
        for (y = from; y <= to; y++) {
            printf(format3, subt[cat,y])        # append the subtotal
        }
        print "\n"
    }
    printf(format1, "Total", "")
    for (y = from; y <= to; y++) {
        printf(format3, ttl[y])                 # append the total amount
    }
    print ""
}
' in.txt
Output with the provided input:
                    2020-01-01      2021-01-01      2022-01-01
                    -2020-12-31     -2021-12-31     -2022-12-31
alcohol   smirnov   26.99           0.00            0.00
          beer      0.00            6.00            12.00

Subt                26.99           6.00            12.00

fruit     banana    3.40            0.00            0.00
          orange    8.40            0.00            4.30
          mango     0.00            6.99            7.20

Subt                11.80           6.99            11.50

Total               38.79           12.99           23.50
Please forgive me that the order of the items is not the same as in the OP's expected output. (Note also that the Total for 2020 comes out as 38.79 here; the 2020 amounts do sum to 38.79, so the 38.59 in the expected output appears to be an arithmetic slip.)
Using GNU awk for arrays of arrays:
$ cat tst.awk
BEGIN { OFS="\t" }
{
    sub(/-.*/,"",$1)
    minYear = ( NR==1 || $1 < minYear ? $1 : minYear )
    maxYear = ( NR==1 || $1 > maxYear ? $1 : maxYear )
    items[$2][$3]
    vals[$1][$2][$3] += $4
    typeTots[$1][$2] += $4
    yearTots[$1] += $4
}
END {
    printf "%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%s", OFS, year
    }
    print ""
    for ( type in items ) {
        itemCnt = 0
        for ( item in items[type] ) {
            printf "%s%s%s", (itemCnt++ ? "" : type), OFS, item
            for ( year=minYear; year<=maxYear; year++ ) {
                printf "%s%0.2f", OFS, vals[year][type][item]
            }
            print ""
        }
        printf "Subt%s", OFS
        for ( year=minYear; year<=maxYear; year++ ) {
            printf "%s%0.2f", OFS, typeTots[year][type]
        }
        print ORS
    }
    printf "Total%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%0.2f", OFS, yearTots[year]
    }
    print ""
}
$ awk -f tst.awk in.txt
                2020    2021    2022
alcohol beer    0.00    6.00    12.00
        smirnov 26.99   0.00    0.00
Subt            26.99   6.00    12.00

fruit   orange  8.40    0.00    4.30
        mango   0.00    6.99    7.20
        banana  3.40    0.00    0.00
Subt            11.80   6.99    11.50

Total           38.79   12.99   23.50
or if you really want specific date ranges instead of just the year in the header:
$ cat tst.awk
BEGIN { OFS="\t" }
{
    sub(/-.*/,"",$1)
    minYear = ( NR==1 || $1 < minYear ? $1 : minYear )
    maxYear = ( NR==1 || $1 > maxYear ? $1 : maxYear )
    items[$2][$3]
    vals[$1][$2][$3] += $4
    typeTots[$1][$2] += $4
    yearTots[$1] += $4
}
END {
    printf "%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%s-01-01", OFS, year
    }
    print ""
    printf "%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s-%s-12-31", OFS, year
    }
    print ""
    for ( type in items ) {
        itemCnt = 0
        for ( item in items[type] ) {
            printf "%s%s%s", (itemCnt++ ? "" : type), OFS, item
            for ( year=minYear; year<=maxYear; year++ ) {
                printf "%s%0.2f", OFS, vals[year][type][item]
            }
            print ""
        }
        printf "Subt%s", OFS
        for ( year=minYear; year<=maxYear; year++ ) {
            printf "%s%0.2f", OFS, typeTots[year][type]
        }
        print ORS
    }
    printf "Total%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%0.2f", OFS, yearTots[year]
    }
    print ""
}
$ awk -f tst.awk in.txt | column -s$'\t' -t
                  2020-01-01   2021-01-01   2022-01-01
                  -2020-12-31  -2021-12-31  -2022-12-31
alcohol  beer     0.00         6.00         12.00
         smirnov  26.99        0.00         0.00
Subt              26.99        6.00         12.00
fruit    orange   8.40         0.00         4.30
         mango    0.00         6.99         7.20
         banana   3.40         0.00         0.00
Subt              11.80        6.99         11.50
Total             38.79        12.99        23.50
I believe the following piece of awk code is a good start. The remaining work is just some cleanup and some extra code for the sums; a sketch of that summing step follows the sample output below.
BEGIN{
    # how many divisions per year
    n=1
    # initialisation of some variables
    tmax=0; tmin=999999; ymax=qmax=0; ymin=9999; qmin=99
}
# extract year and sub-year period (quarter, trimester or half, depending on n)
{ y=$1+0; q=(substr($1,6,7)+0)%n }
# compute min max time
(y*100+q < tmin) { ymin=y; qmin=q; tmin=y*100+q }
(y*100+q > tmax) { ymax=y; qmax=q; tmax=y*100+q }
# Create arrays that keep track of everything
# a : prices by year,q,category and element
# b : just a list of categories, eg fruit
# c : just a list of elements and the category it belongs to.
{ a[y,q,$2,$3]=$4; b[$2]; c[$3]=$2 }
END{
    # loop over categories (eg fruit)
    for(i in b) {
        # loop over elements
        for(j in c) {
            # exclude elements that do not belong to category
            if (i!=c[j]) continue
            s=i OFS j;
            # loop over the time
            for (y=ymin;y<=ymax;y++) {
                for (q=0;q<n;++q) {
                    if (y*100+q < tmin) continue
                    if (y*100+q > tmax) continue
                    s=s OFS a[y,q,i,j]+0
                }
            }
            print s
        }
    }
}
This currently outputs:
alcohol beer 0 6 6
alcohol smirnov 26.99 0 0
fruit orange 4.2 0 4.3
fruit mango 0 6.99 7.2
fruit banana 3.4 0 0
I have some text logs that I need to parse and format into CSV.
I have a working R script, but it is slow once file sizes increase, and as I understand it this problem seems like a good candidate for a speed-up using awk (or other command-line tools?).
I have not done much with awk, and the issue I am having is translating how I think about processing in R into how awk scripting is done.
Example truncated input data (Scrap.log):
; these are comment lines
; *******************************************************************************
; \\C:\Users\Computer\Folder\Folder\Scrap.log
!!G 99999 % % % % % % % % CURRENT XYZ ABC STATE1 STATE2
_START Header1 Header2 Header3 Header4 Header5 Header6 Header7
10 12.23 1.91 6.63 1.68 50.03 0.50 13.97
11 11.32 1.94 6.64 1.94 50.12 0.58 15.10
12 12.96 2.15 6.57 2.12 55.60 0.62 16.24
13 11.43 2.18 6.60 2.36 50.89 0.68 17.39
14 14.91 2.32 6.64 2.59 56.09 0.73 18.41
15 13.16 2.38 6.53 2.85 51.62 0.81 19.30
16 15.02 2.50 6.67 3.05 56.22 0.85 20.12
!!G 99999 % % % % % % % % CURRENT XYZ ABC STATE1 STATE2
_START Header8 Header9 Header10 Header11 Header12 Header13 Header14
10 22.03 24.41 15.01 51.44 44.28 16.57 11.52
11 21.05 24.62 15.62 51.23 45.42 16.47 11.98
12 20.11 24.64 16.38 52.16 46.59 16.54 12.42
13 24.13 24.93 17.23 52.34 47.72 16.51 12.88
14 27.17 24.95 18.06 52.79 48.72 16.45 13.30
15 22.87 25.04 19.27 53.01 49.50 16.47 13.63
16 23.08 25.22 20.12 53.75 50.64 16.55 14.03
Expected output (truncated):
HH1,HH2,HH3,HH4,HH5,HH6,HH7,HH8,HH9,HH10,HH11,HH12,HH13,HH14,START,HeaderName,Value
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header1,12.23
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header2,1.91
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header3,6.63
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header4,1.68
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header5,50.03
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header6,0.5
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header7,13.97
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header1,11.32
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header2,1.94
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header3,6.64
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header4,1.94
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header5,50.12
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header6,0.58
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header7,15.1
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header1,12.96
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header2,2.15
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header3,6.57
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header4,2.12
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header5,55.6
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header6,0.62
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header7,16.24
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,13,Header1,11.43
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,13,Header2,2.18
...
My general steps in the R script:
add a single header row with new names at the top of file
spread the top row (starting with !!G) to each row
melt the header column (_START) from wide to long format
Pieces I have working in awk so far include:
how to grab and print the header lines
awk '/_START/ {header = $0; print header}' Scrap.log
How to write a single row with the new header values
awk ' BEGIN{ ORS=" "; for (counter = 1; counter <= 14; counter++) print "HH",counter;}'
I know each block is separated by a newline and starts with !!G, so I can write a match on that. I'm unsure whether a split-apply-combine type of thinking works well in awk?
awk '/!!G/,/\n/ {print}' Scrap.log
Alternatively, I tried setting RS/FS parameters like:
awk ' BEGIN{RS="\n";FS=" ";}/^!!G/{header=$0;print header}/[0-9]/{print $2}END{}' Scrap.log
I then get stuck on iterating over the rows and fields to do the melt step as well as combining the capture groups correctly.
How do I combine all these pieces to get to the CSV format?
I think the following awk script:
awk '
BEGIN{
    # output the header line
    print "HH1,HH2,HH3,HH4,HH5,HH6,HH7,HH8,HH9,HH10,HH11,HH12,HH13,HH14,START,HeaderName,Value"
}
# ignore comment lines
/;/{next}
/!!G/{
    valcnt = 1
    # save and shuffle the values into the HH1..HH14 output order
    val[valcnt++] = $2
    val[valcnt++] = $11
    val[valcnt++] = $12
    val[valcnt++] = $13
    val[valcnt++] = $14
    val[valcnt++] = $15
    val[valcnt++] = $3
    val[valcnt++] = $4
    val[valcnt++] = $5
    val[valcnt++] = $6
    val[valcnt++] = $7
    val[valcnt++] = $8
    val[valcnt++] = $9
    val[valcnt++] = $10
    next
}
/_START /{
    # these are headers - save them to head, to be reused later
    for (i = 2; i <= NF; ++i) {
        # fun fact: it's indexed on the field number
        head[i] = $i
    }
    next
}
# this function is redundant, but it's just easier for me to think about the code
function output(firstval, header, value, \
        cur, i) {
    cur = valcnt
    val[cur++] = firstval
    val[cur++] = header
    val[cur++] = value
    # output val as csv
    for (i = 1; i < cur; ++i) {
        printf "%s%s", val[i], i != cur - 1 ? "," : "\n"
    }
}
/[0-9]+/{
    for (i = 2; i <= NF; ++i) {
        # add these 3 to all the other values and output them
        # ie. add first column, the header from header and the value
        output($1, head[i], $i)
    }
}
'
should output what you want. Tested on repl.
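For example, if the script body is saved to a file (the name melt.awk below is my own choice, and the body then goes in the file without the surrounding awk ' ... ' quoting), it could be run against the log from the question and redirected to a CSV file:

awk -f melt.awk Scrap.log > Scrap.csv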