find first 5 maximum values in each line using awk - awk

I am trying to read a text file like the following
word 1 2 3 4 5 6 7 8 9 10
hello 0.2 0.3 0.5 0.1 0.7 0.8 0.6 0.1 0.9
I would like to use awk to print the word ("hello") and the 5 largest values, each preceded by the number of the column it came from, like this:
hello 10 0.9 7 0.8 6 0.7 8 0.6 3 0.5
I have thought something like this awk '{ for (i=1; i <= 10; i++) a[$i]=$i};END{c=asort(a)?? for(i in a)print i,a[i]??}', but I would like to print in each line read.

With GNU awk 4.* for sorted_in:
$ cat tst.awk
# For every line after the header, print the first field followed by the
# five largest remaining values, each preceded by its column number.
# Requires GNU awk 4.0+ for PROCINFO["sorted_in"].
BEGIN { PROCINFO["sorted_in"] = "@val_num_desc" }  # for-in visits values in descending numeric order
NR>1 {
    split($0, a)              # a[k] holds field k, so the index is the column number
    printf "%s", a[1]
    delete a[1]               # the leading word must not take part in the ranking
    c = 0                     # restart the count per record; lines with fewer than 5 values never hit the break
    for (i in a) {
        printf " %d %s", i, a[i]
        if (++c == 5)
            break
    }
    print ""
}
$ awk -f tst.awk file
hello 10 0.9 7 0.8 6 0.7 8 0.6 4 0.5

here is an awk assisted Unix tool set solution.
# RS=" " turns every space-separated token into its own record. Record 1 (the
# word) is printed as is; each later token is piped to sort together with its
# record number, which for a single-line file is also its column number.
# sort -k2nr orders the pairs by value, largest first; head -6 is meant to keep
# the word plus five pairs; xargs joins the remaining lines into one.
$ awk -v RS=" " 'NR==1; NR>1{print NR, $0 | "sort -k2nr"} ' file | head -6 | xargs
hello 10 0.9 7 0.8 6 0.7 8 0.6 4 0.5
I think your expected output has some typos.

You stated [you] would like to print in each line read so no limits to records read:
$ awk '{
    # a: value -> value (sorted below), b: value -> field number
    delete a
    i = 2
    while (i <= NF) {
        b[$i] = i
        a[$i] = $i
        i++
    }
    n = asort(a)
    printf "%s: ", $1
    # stop after five values, or after all of them when there are fewer
    if (n >= 5)
        last = n - 5
    else
        last = 0
    for (i = n; i > last; i--)
        printf "%s %s ", b[a[i]], a[i]
    printf "\n"
}' test.in
word: 11 10 10 9 9 8 8 7 7 6
hello: 10 0.9 7 0.8 6 0.7 8 0.6 4 0.5
Walk-thru version:
{
    delete a                            # delete the array before each record
    for(i=2; i<=NF; i++) {              # from the second field to the last
        a[$i]=$i                        # set field to array index and value
        b[$i]=i                         # remember the field number
    }
    n=asort(a)                          # sort the a array (ascending, so the biggest are at the end)
    printf "%s: ",$1                    # print the record identifier ie. the first field
    for(i=n; i>n-(n>=5?5:n); i--)       # for the 5 (or value count) biggest values
        printf "%s %s ", b[a[i]], a[i]  # print them out; the trailing space keeps the pairs apart
    printf "\n"                         # enter after each record
}
If a value repeats, it's only printed once.

Using Perl
$ cat cloudy.txt
word 1 2 3 4 5 6 7 8 9 10
hello 0.2 0.3 0.5 0.1 0.7 0.8 0.6 0.1 0.9
# %kv maps field index -> value; values are sorted descending and, for the first
# five, @y looks up the index that holds the value ($y[0]+1 is the column number).
$ perl -lane '%kv=();%kv=map{ $_=>$F[$_] } 1..$#F; printf("$F[0] ");$i=0; for $x (reverse sort {$a <=> $b} values %kv) { @y=grep $x eq $kv{$_}, (keys %kv); printf("%d %.1f ",$y[0]+1,$x) if $i++ <5 } print "" ' cloudy.txt
word 11 10.0 10 9.0 9 8.0 8 7.0 7 6.0
hello 10 0.9 7 0.8 6 0.7 8 0.6 4 0.5
$

Related

awk equivalents for tidyverse concepts (melt and spread)

I have some text logs that I need to parse and format into CSV.
I have a working R script but it is slow once file sizes increase and this problem seems like a good candidate for a speed up using awk (or other commandline tools?) as I understand.
I have not done much with awk, and the issue I am having is translating how I think about processing in R to how awk scripting is done.
Example truncated input data (Scrap.log):
; these are comment lines
; *******************************************************************************
; \\C:\Users\Computer\Folder\Folder\Scrap.log
!!G 99999 % % % % % % % % CURRENT XYZ ABC STATE1 STATE2
_START Header1 Header2 Header3 Header4 Header5 Header6 Header7
10 12.23 1.91 6.63 1.68 50.03 0.50 13.97
11 11.32 1.94 6.64 1.94 50.12 0.58 15.10
12 12.96 2.15 6.57 2.12 55.60 0.62 16.24
13 11.43 2.18 6.60 2.36 50.89 0.68 17.39
14 14.91 2.32 6.64 2.59 56.09 0.73 18.41
15 13.16 2.38 6.53 2.85 51.62 0.81 19.30
16 15.02 2.50 6.67 3.05 56.22 0.85 20.12
!!G 99999 % % % % % % % % CURRENT XYZ ABC STATE1 STATE2
_START Header8 Header9 Header10 Header11 Header12 Header13 Header14
10 22.03 24.41 15.01 51.44 44.28 16.57 11.52
11 21.05 24.62 15.62 51.23 45.42 16.47 11.98
12 20.11 24.64 16.38 52.16 46.59 16.54 12.42
13 24.13 24.93 17.23 52.34 47.72 16.51 12.88
14 27.17 24.95 18.06 52.79 48.72 16.45 13.30
15 22.87 25.04 19.27 53.01 49.50 16.47 13.63
16 23.08 25.22 20.12 53.75 50.64 16.55 14.03
Expected output (truncated):
HH1,HH2,HH3,HH4,HH5,HH6,HH7,HH8,HH9,HH10,HH11,HH12,HH13,HH14,START,HeaderName,Value
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header1,12.23
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header2,1.91
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header3,6.63
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header4,1.68
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header5,50.03
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header6,0.5
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header7,13.97
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header1,11.32
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header2,1.94
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header3,6.64
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header4,1.94
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header5,50.12
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header6,0.58
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header7,15.1
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header1,12.96
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header2,2.15
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header3,6.57
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header4,2.12
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header5,55.6
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header6,0.62
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header7,16.24
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,13,Header1,11.43
99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,13,Header2,2.18
...
My general steps in the R script:
add a single header row with new names at the top of file
spread the top row (starting with !!G) to each row
melt the header column (_START) from wide to long format
Pieces I have working in awk so far include:
how to grab and print the header lines
awk '/_START/ {header = $0; print header}' Scrap.log
How to write a single row with the new header values
awk ' BEGIN{ ORS=" "; for (counter = 1; counter <= 14; counter++) print "HH",counter;}'
I know each block is separated by a newline and starts with a !!G, so can write a match on that. Unsure if a split-apply-combine type of thinking works well in awk?
awk '/!!G/,/\n/ {print}' Scrap.log
alternatively, I tried setting RS/FS parameters like:
awk ' BEGIN{RS="\n";FS=" ";}/^!!G/{header=$0;print header}/[0-9]/{print $2}END{}' Scrap.log
I then get stuck on iterating over the rows and fields to do the melt step as well as combining the capture groups correctly.
How do I combine all these pieces to get to the CSV format?
I think the following:
awk '
# Melt a block-structured log into long-format CSV.
# A block is one "!!G" line (14 values), one "_START" line (column names)
# and a run of data rows; every cell of every data row becomes one CSV line.
# Lines are classified by their first field, so text that merely contains
# ";", "!!G" or a digit somewhere else is not misread.

# Print one CSV line: the saved !!G values followed by the three arguments.
#   firstval  first column of the data row (START)
#   header    column name from the latest _START line (HeaderName)
#   value     the cell itself (Value)
#   cur, i    locals
function output(firstval, header, value,    cur, i) {
    cur = valcnt                  # first free slot after the saved !!G values
    val[cur++] = firstval
    val[cur++] = header
    val[cur++] = value
    # output val as csv
    for (i = 1; i < cur; ++i) {
        printf "%s%s", val[i], (i != cur - 1 ? "," : "\n")
    }
}

BEGIN {
    # output the header line
    print "HH1,HH2,HH3,HH4,HH5,HH6,HH7,HH8,HH9,HH10,HH11,HH12,HH13,HH14,START,HeaderName,Value"
}

# ignore comment lines (first field starts with ";")
$1 ~ /^;/ { next }

# block header: save and shuffle the values
$1 == "!!G" {
    valcnt = 1
    val[valcnt++] = $2
    # fields 11-15 go in front of fields 3-10
    for (i = 11; i <= 15; ++i) {
        val[valcnt++] = $i
    }
    for (i = 3; i <= 10; ++i) {
        val[valcnt++] = $i
    }
    next
}

# these are headers - save them to head, to be reused later
$1 == "_START" {
    for (i = 2; i <= NF; ++i) {
        # indexed by field number, so head[i] lines up with $i of a data row
        head[i] = $i
    }
    next
}

# data rows: first field starts with a digit
$1 ~ /^[0-9]/ {
    for (i = 2; i <= NF; ++i) {
        # add these 3 to all the other values and output them
        # ie. add first column, the header from head and the value
        output($1, head[i], $i)
    }
}
'
Should output what you want. Tested on repl.

MDF to VTK Converting using AWK

I am a beginner, so I apologize if this has been covered before, but I can't seem to find exactly what I need to solve my problem.
I am trying to write an AWK "script" that can convert an MDF(Mesh Definition File) as input into a (VALID) VTK file as output.
I have a sample MDF file that looks like this :
TITLE "1"
NMESHPOINTS 4
NNODES 4
NELEMENTS_TRIANG1 2
TIMESTEP 0.00001
NINTERNAL_TIMESTEPS 1000
NEXTERNAL_TIMESTEPS 100
DAMPING_FACTOR 0.01
MESHPOINT_COORDINATES
1 0.0 0.0 0.0
2 1.0 0.0 0.0
3 1.0 1.0 0.0
4 0.0 1.0 0.0
NODES_TRIANG1
1 1 2 3
2 1 3 4
And I want to make a valid VTK file from this input.
Here is how the output should look like:
# vtk DataFile Version 1.0
2D Unstructured Grid
ASCII
DATASET UNSTRUCTURED_GRID
POINTS 4 float
0.0 0.0 0.0
1.0 0.0 0.0
1.0 1.0 0.0
0.0 1.0 0.0
CELLS 2 8
3 0 1 2
3 0 2 3
CELL_TYPES 2
5
5
I tried to make a picture how the mappings works I hope it explains some of them.
To make it a bit easier for this specific example let's say we only want to work with triangles.
Sadly, I don't have the same mesh available as both a VTK and an MDF file, so I tried to write one manually.
Is there any way to do this with AWK?
Any help will be much appreciated!!
Excellent diagram showing the input -> output mapping! Made it extremely easy to write this:
$ cat tst.awk
# Convert an MDF mesh definition into a legacy ASCII VTK unstructured grid
# (triangles only).

# Keyword lines ("NAME value") start with a letter: remember the value and
# close whatever data section was open, so sections need not be separated
# by blank lines.
$1 ~ /^[[:alpha:]]/ { f[$1] = $2; block = "" }

# A blank line also ends the current section.
!NF { block = ""; next }

# Start of the point list: emit the VTK preamble and the POINTS header.
$1 == "MESHPOINT_COORDINATES" {
    block = $1
    print "# vtk DataFile Version 1.0"
    print "2D Unstructured Grid"
    print "ASCII"
    print ""
    print "DATASET UNSTRUCTURED_GRID"
    printf "POINTS %d float\n", f["NMESHPOINTS"]
    next
}

# Start of the triangle list: each cell takes 4 numbers (count + 3 node ids).
$1 == "NODES_TRIANG1" {
    block = $1
    printf "\nCELLS %d %d\n", f["NELEMENTS_TRIANG1"], f["NELEMENTS_TRIANG1"] * 4
    next
}

# Point row "id x y z": drop the id, keep the coordinates.
block == "MESHPOINT_COORDINATES" {
    $1 = ""
    sub(/^[[:space:]]+/,"")
    print
    next
}

# Triangle row "id n1 n2 n3": print the node count, then the node ids
# shifted from 1-based to 0-based.
block == "NODES_TRIANG1" {
    printf "%s", 3
    for (i=2; i<=NF; i++) {
        printf " %s", $i - 1
    }
    print ""
    nlines++
}

# One cell type per triangle row that was written.
END {
    printf "\nCELL_TYPES %d\n", nlines
    for (i=1; i<=nlines; i++) {
        print 5
    }
}
.
$ awk -f tst.awk file.mdf
# vtk DataFile Version 1.0
2D Unstructured Grid
ASCII
DATASET UNSTRUCTURED_GRID
POINTS 4 float
0.0 0.0 0.0
1.0 0.0 0.0
1.0 1.0 0.0
0.0 1.0 0.0
CELLS 2 8
3 0 1 2
3 0 2 3
CELL_TYPES 2
5
5
Normally we only answer questions where the poster has attempted to solve it themselves first but you put enough effort into creating the example and describing the mapping that IMHO you deserve help with a solution so - see the above, try to figure out how it's working yourself (add "prints", check the man page, etc.) and then post a new question if you have any specific questions about it.

Using awk, how to average numbers in column between two strings in a text file

A text file containing multiple tabular delimited columns between strings with an example below.
Code 1 (3)
5 10 7 1 1
6 10 9 1 1
7 10 10 1 1
Code 2 (2)
9 11 3 1 3
10 8 5 2 1
Code 3 (1)
12 10 2 1 1
Code 4 (2)
14 8 1 1 3
15 8 7 5 1
I would like to average the numbers in the third column for each code block. The example below is what the output should look like.
8.67
4
2
4
Attempt 1
awk '$3~/^[[:digit:]]/ {i++; sum+=$3; print $3} $3!~/[[:digit:]]/ {print sum/i; sum=0;i=0}' in.txt
Returned fatal: division by zero attempted.
Attempt 2
awk -v OFS='\t' '/^Code/ { if (NR > 1) {i++; sum+=$3;} {print sum/i;}}' in.txt
Returned another division by zero error.
Attempt 3
awk -v OFS='\t' '/^Code/ { if (NR > 1) { print s/i; s=0; i=0; } else { s += $3; i += 1; }}' in.txt
Returned 1 value: 0.
Attempt 4
awk -v OFS='\t' '/^Code/ {
if (NR > 1)
i++
print sum += $3/i
}
END {
i++
print sum += $3/i
}'
Returned:
0
0
0
0.3
I am not sure where that last number is coming from, but this has been the closest solution so far. I am getting a number for each block, but not the average.
Could you please try following.
awk '
# Print the mean of the third column for every block introduced by a
# line starting with "Code".
BEGIN { OFMT = "%.2f" }       # print shows non-integer means with two decimals, whole numbers unchanged

/^Code/ {
    if (count) {              # nothing to report before the first block or for an empty one
        print sum / count
    }
    sum = count = 0
    next
}

NF >= 3 {                     # skip blank or short lines so they do not skew the mean
    sum += $3
    count++
}

END {
    if (count) {              # the last block has no following "Code" line
        print sum / count
    }
}
' Input_file

Difference between adjacent data rows, with multiple columns

If I have:
1 2 3 4 5 6 . .
3 4 5 4 2 1 . .
5 7 5 7 2 0 . .
.
.
I want to show the difference of adjacent data rows, so that it would show:
2 2 2 0 -3 -5 . .
2 3 0 3 0 -1 . .
.
.
I found the post difference between number in the same column using AWK, and adapting the second answer, I thought that this will do the job:
awk 'NR>1{print $0-p} {p=$0}' file
But that produces output in and of a single column. How do I get it to retain the column structure of the data?
$ cat tst.awk
# From the second row on, print each field minus the same field of the
# previous row, joined by OFS and ended with ORS.
NR > 1 {
    col = 0
    while (++col <= NF) {
        if (col == NF)
            sep = ORS
        else
            sep = OFS
        printf "%2d%s", $col - prev[col], sep
    }
}

# Keep the fields of this row for the next record.
{ split($0, prev) }
$ awk -f tst.awk file
2 2 2 0 -3 -5
2 3 0 3 0 -1
Try something like this:
awk '
# From the second row on, print every field minus the same field of the row before.
NR > 1 { for (i = 1; i <= NF; i++) printf "%s%s", $i - prev[i], (i < NF ? OFS : ORS) }
# Keep this row so the next record can subtract it.
       { for (i = 1; i <= NF; i++) prev[i] = $i }
' numbers
Written out:
$ cat > subtr.awk
# Move the row seen last time into "before", then load this row into "now".
{
    k = 1
    while (k <= NF) {
        before[k] = now[k]
        k++
    }
    width = split($0, now)
}

# From the second row on, print now - before for every column.
NR > 1 {
    for (k = 1; k <= NF; k++) {
        if (k == width)
            tail = ORS
        else
            tail = OFS
        printf "%s%s", now[k] - before[k], tail
    }
    delete before
}
Test it:
$ awk -f subtr.awk file
2 2 2 0 -3 -5
2 3 0 3 0 -1

Find max/min of each column in awk

I have a file with a variable number of columns:
Input:
1 1 2
2 1 5
5 2 3
7 0 -1
4 1 4
I want to print the max and min of each column:
Desired output:
max: 7 2 5
min: 1 0 -1
For a single column, e.g. $1, I know I can find the max and min using something like:
awk '{if(min==""){min=max=$1}; if($1>max) {max=$1}; if($1<min) {min=$1};} END {printf "%.2g %.2g\n", min, max}'
Question
How can I extend this to loop over all columns (not necessarily just the 3 in my example)?
Many thanks!
awk '
# Track the smallest and largest value of every column.
{
    for (i = 1; i <= NF; i++) {
        # first value seen in this column seeds both extremes
        if (!(i in min)) { min[i] = max[i] = $i; continue }
        if ($i < min[i]) min[i] = $i
        else if ($i > max[i]) max[i] = $i
    }
    if (NF > ncols) ncols = NF    # widest row decides how many columns are reported
}
# Walk the columns by number: for-in order is unspecified and could shuffle them.
# %s prints the values as read instead of truncating them to integers.
END {
    printf "max:\t"
    for (i = 1; i <= ncols; i++) printf "%s%s", (i > 1 ? " " : ""), max[i]
    printf "\nmin:\t"
    for (i = 1; i <= ncols; i++) printf "%s%s", (i > 1 ? " " : ""), min[i]
    printf "\n"
}' input.txt
input.txt:
1 1 2 2
2 1 5 3
5 2 3 10
7 0 -1 0
4 1 4 5
output:
max: 7 2 5 10
min: 1 0 -1 0
Like this
awk '
# Seed both extremes of every column from the first row.
NR == 1 {
    c = 1
    while (c <= NF) {
        lo[c] = $c
        hi[c] = $c
        c++
    }
}
# Widen the range with every row.
{
    for (c = 1; c <= NF; c++) {
        if ($c < lo[c]) {
            lo[c] = $c
        }
        if ($c > hi[c]) {
            hi[c] = $c
        }
    }
}
# One "min max" line per column of the last row read.
END {
    for (c = 1; c <= NF; c++) {
        print lo[c], hi[c]
    }
}' file
Let's try to make it a bit shorter by using the min=(current<min?current:min) expression. This is a ternary operator that is the same as saying if (current<min) min=current.
Also, printf "%.2g%s", min[i], (i==NF?"\n":" ") prints the new line on the END{} block whenever it reaches the last field.
awk 'NR==1{for (i=1; i<=NF; i++) {min[i]=$i; max[i]=$i}; next}   # seed BOTH extremes, or a maximum in row 1 is lost
{for (i=1; i<=NF; i++) { min[i]=(min[i]>$i?$i:min[i]); max[i]=(max[i]<$i?$i:max[i]) }}   # keep the smaller / larger of old and new
END {printf "min: "; for (i=1;i<=NF;i++) printf "%.2g%s", min[i], (i==NF?"\n":" ");
printf "max: "; for (i=1;i<=NF;i++) printf "%.2g%s", max[i], (i==NF?"\n":" ")}' file
Sample output:
# min and max are both seeded from the first row, so a maximum that sits in row 1 is reported.
$ awk 'NR==1{for (i=1; i<=NF; i++) {min[i]=$i; max[i]=$i}; next} {for (i=1; i<=NF; i++) { min[i]=(min[i]>$i?$i:min[i]); max[i]=(max[i]<$i?$i:max[i]) }} END {printf "min: "; for (i=1;i<=NF;i++) printf "%.2g%s", min[i], (i==NF?"\n":" "); printf "max: "; for (i=1;i<=NF;i++) printf "%.2g%s", max[i], (i==NF?"\n":" ")}' file
min: 1 0 -1
max: 7 2 5