awk - print on column-specific conditions, print empty fields for non-match - awk

My large file has date in $1 and 19 columns with numerical values, separated by ";". One or many columns must meet a column-specific smaller-than-condition for printout. I'd like to print them on the same line, beginning with $1 (date) – keeping the original table structure intact, ie, printing empty fields for those which do not meet their specific condition on this particular row. Only print lines with at least 1 match.
Later, I will have to exchange the printout conditions to larger-than and several between-values (<>) which is why I'm interested in a programmatic awk solution as opposed to manipulation of data in a time-consuming spreadsheet application.
My OS is MacOS.
Excerpt from input (I changed ";" to "," because the system didn't let me use ";", and added spaces around the delimiter for readability):
1503_12_26 , 100.09 , 33.54 , 75.72 , 9.17 , 96.01 , 29.46 , 71.64 , 5.09 , 98.96 , 32.41 , 74.59 , 8.04 , 28.33 , 70.51 , 95 , 70.63 , 28.45 , 90.92
1503_12_27 , 102.94 , 31.87 , 80.47 , 9.4 , 98.89 , 27.82 , 76.42 , 5.35 , 102.7 , 31.63 , 80.23 , 9.16 , 27.58 , 76.18 , 97.59 , 75.12 , 26.52 , 93.54
1503_12_28 , 107.76 , 32.44 , 87.2 , 11.88 , 103.26 , 27.94 , 82.7 , 7.38 , 106.64 , 31.32 , 86.08 , 10.76 , 26.82 , 81.58 , 100.38 , 79.82 , 25.06 , 95.88
1503_12_29 , 112.87 , 33.57 , 94.21 , 14.91 , 107.58 , 28.28 , 88.92 , 9.62 , 110.66 , 31.36 , 92 , 12.7 , 26.07 , 86.71 , 103.25 , 84.59 , 23.95 , 97.96
1503_12_30 , 117.99 , 34.95 , 101.21 , 18.17 , 111.69 , 28.65 , 94.91 , 11.87 , 114.68 , 31.64 , 97.9 , 14.86 , 25.34 , 91.6 , 106.12 , 89.34 , 23.08 , 99.82
1503_12_31 , 123.07 , 36.51 , 108.12 , 21.56 , 115.63 , 29.07 , 100.68 , 14.12 , 118.66 , 32.1 , 103.71 , 17.15 , 24.66 , 96.27 , 108.95 , 94 , 22.39 , 101.51
1504_01_01 , 128.08 , 38.22 , 114.88 , 25.02 , 119.42 , 29.56 , 106.22 , 16.36 , 122.58 , 32.72 , 109.38 , 19.52 , 24.06 , 100.72 , 111.72 , 98.52 , 21.86 , 103.06
Desired output:
1503-12-26 ; ; 33.54 ; ; 9.17 ; ; ; ; 5.09 ; ; ; ; 8.04 ; ; ; ; ; ;
1503-12-27 ; ; 31.87 ; ; 9.4 ; ; ; ; 5.35 ; ; ; ; 9.16 ; ; ; ; ; ;
1503-12-28 ; ; 32.44 ; ; 11.88 ; ; ; ; 7.38 ; ; ; ; ; ; ; ; ; ;
1503-12-29 ; ; 33.57 ; ; 14.91 ; ; ; ; 9.62 ; ; ; ; ; ; ; ; ; ;
1503-12-30 ; ; 34.95 ; ; 18.17 ; ; ; ; ; ; ; ; ; ; ; ; ; ;
1503-12-31 ; ; ; ; 21.56 ; ; ; ; ; ; ; ; ; ; ; ; ; ;
I started out with a chain of OR || conditions for each column but couldn't think of how to empty the fields that don't match their criteria.
awk -F";" ' $2 < 80 || $3 < 35 || $4 < 60 || $5 < 24 || $6 < 45 || $7 < 24 || $8 < 30 || $9 < 8 || $10 < 60 || $11 < 24 || $12 < 30 || $13 < 10 || $14 < 10 || $15 < 10 || $16 < 24 || $17 < 8 || $18 < 8 || $19 < 8 ' input.txt > output.txt
It just prints all fields of a row if 1 field is a hit.
Then separate awk statements for each column in a bash-file (.sh) and appending them to output.txt. But I get separate lines for each hit, and I also won't know which column had met its condition.
So I added new fields to printout: column header ; condition ; $value.
example:
awk -F";" ' $3 < 35 {print $1 "\;SJMV\;<35\;" $3} ' input.txt >>output.txt
Output:
1503-12-26 ; SJMV ; \<35 ; 33.54
1503-12-26 ; SJM ; \<24 ; 9.17
1503-12-26 ; JM ; \<10 ; 8.04
1503-12-26 ; SJ ; \<8 ; 5.09
Couldn't think of a way to get this back into the table structure, either.
I might have to do some IF, FOR, or WHILE loops within the first attempted chain of OR ||
to empty the non-matching fields. But I don't know how.
This is my first Q here. Appreciate comments re improvement, as well.

Clear the field if its test fails.
Remember to set the output delimiter.
awk -F";" -v OFS=";" '
# Upper bounds for columns 2..19, listed in column order. bound[k] belongs
# to column k+1 because column 1 holds the date and is never tested.
BEGIN {
    split("80 35 60 24 45 24 30 8 60 24 30 10 10 10 24 8 8 8", bound, " ")
}
{
    # Blank every field that is not strictly below its own bound. The "+ 0"
    # makes the bound a plain number, exactly like a numeric constant.
    for (col = 2; col <= 19; col++)
        if (!($col < bound[col - 1] + 0))
            $col = ""
    # Assigning to a field rebuilds the record with OFS, so the table keeps
    # its ";" separators and its empty cells.
    print
}
' input.txt > output.txt
To exclude lines where all columns were cleared, you could check if any tests succeeded:
awk -F";" -v OFS=";" '
# limit[k] is the upper bound for column k+1 (column 1 is the date).
BEGIN {
    split("80 35 60 24 45 24 30 8 60 24 30 10 10 10 24 8 8 8", limit, " ")
}
{
    # Count the fields that pass their test and blank the ones that fail.
    hits = 0
    for (f = 2; f <= 19; f++) {
        if ($f < limit[f - 1] + 0)
            hits++
        else
            $f = ""
    }
    # Rows on which every test failed are dropped.
    if (hits)
        print
}
' input.txt > output.txt

Related

AWK new line sorting

I have a script that sorts numbers:
# Count the values of column 1 in buckets of ten, one scalar counter per bucket.
# A value above 99 is counted only when it is exactly 100; anything else above
# 99 matches no branch and is ignored.
{
if ($1 <= 9) xd++
else if ($1 > 9 && $1 <= 19) xd1++
else if ($1 > 19 && $1 <= 29) xd2++
else if ($1 > 29 && $1 <= 39) xd3++
else if ($1 > 39 && $1 <= 49) xd4++
else if ($1 > 49 && $1 <= 59) xd5++
else if ($1 > 59 && $1 <= 69) xd6++
else if ($1 > 69 && $1 <= 79) xd7++
else if ($1 > 79 && $1 <= 89) xd8++
else if ($1 > 89 && $1 <= 99) xd9++
else if ($1 == 100) xd10++
} END {
# print joins its comma-separated arguments with OFS (a single space by
# default), which is why all buckets end up on one output line.
# A counter that was never incremented prints as an empty string.
print "0-9 : "xd, "10-19 : " xd1, "20-29 : " xd2, "30-39 : " xd3, "40-49 : " xd4, "50-59 : " xd5, "60-69 : " xd6, "70-79 : " xd7, "80-89 : " xd8, "90-99 : " xd9, "100 : " xd10
}
output:
$ cat xd1 | awk -f script.awk
0-9 : 16 10-19 : 4 20-29 : 30-39 : 2 40-49 : 1 50-59 : 1 60-69 : 1 70-79 : 1 80-89 : 1 90-99 : 1 100 : 2
how to make that every tenth was on a new line?
like this:
0-9 : 16
10-19 : 4
20-29 :
30-39 : 2
print with \n doesn't work
additionally:
in the top ten I have 16 numbers, how can I get this information using the "+" sign
like this:
0-9 : 16 ++++++++++++++++
10-19 : 4 ++++
20-29 :
30-39 : 2 ++
thank you in advance
If we rewrite the current code to use an array to keep track of counts, we can then use a simple for loop to print the results on individual lines, eg:
# Count the values of column 1 in buckets of ten: xd[0] holds 0-9, xd[1]
# holds 10-19, ... xd[9] holds 90-99. The first branch whose upper bound is
# not exceeded wins, so the lower bounds need no explicit test.
# The final else puts every value above 99 (not only 100) into xd[10].
{ if ($1 <= 9) xd[0]++
else if ($1 <= 19) xd[1]++
else if ($1 <= 29) xd[2]++
else if ($1 <= 39) xd[3]++
else if ($1 <= 49) xd[4]++
else if ($1 <= 59) xd[5]++
else if ($1 <= 69) xd[6]++
else if ($1 <= 79) xd[7]++
else if ($1 <= 89) xd[8]++
else if ($1 <= 99) xd[9]++
else xd[10]++
}
# One output line per bucket. Addition binds tighter than concatenation, so
# (i*10)+9 is computed first and the label reads e.g. "10-19".
# A bucket that never received a value prints an empty count.
END { for (i=0;i<=9;i++)
print (i*10) "-" (i*10)+9, ":", xd[i]
# The overflow bucket gets its own label.
print "100 :", xd[10]
}
At this point we could also replace the 1st part of the script with a comparable for loop, eg:
# Same bucket count, with the if/else chain replaced by a loop: bucket i
# covers the values up to (i*10)+9, and the first bucket that fits is used.
# "next" ends the processing of the current record as soon as a bucket was
# found, so the xd[10]++ after the loop only runs for values above 99.
{ for (i=0;i<=9;i++)
if ($1 <= (i*10)+9) {
xd[i]++
next
}
xd[10]++
}
# One output line per bucket; (i*10)+9 is evaluated before the concatenation.
END { for (i=0;i<=9;i++)
print (i*10) "-" (i*10)+9, ":", xd[i]
# The overflow bucket gets its own label.
print "100 :", xd[10]
}
As for the additional requirement to print a variable number of + on the end of each line we can add a function (prt()) to generate the variable number of +:
# Return a run of n "+" signs, or "" when n is 0 or unset.
# x is a local work variable (declared as an extra parameter); callers pass
# n only.
function prt(n ,x) {
    x=""
    if (n) {
        # The width is concatenated into the format string instead of being
        # passed through "%*s", because not every awk implements the "*"
        # width modifier. The result is n blanks ...
        x=sprintf("%" n "s","")
        # ... which are then turned into plus signs.
        gsub(/ /,"+",x)
    }
    return x
}
# Count column 1 in buckets of ten: xd[i] covers the values up to (i*10)+9.
# "next" stops at the first bucket that fits, so xd[10] only receives the
# values above 99.
{ for (i=0;i<=9;i++)
      if ($1 <= (i*10)+9) {
          xd[i]++
          next
      }
  xd[10]++
}
# One line per bucket: label, count and a bar of "+" as long as the count.
END { for (i=0;i<=9;i++)
          print (i*10) "-" (i*10)+9, ":", xd[i], prt(xd[i])
      print "100 :", xd[10], prt(xd[10])
}
how to make that every tenth was on a new line?
Inform GNU AWK that you want OFS (output field separator) to be newline, consider following simple example
awk 'BEGIN{x=1;y=2;z=3}END{print "x is " x, "y is " y, "z is " z}' emptyfile
gives output
x is 1 y is 2 z is 3
whilst
awk 'BEGIN{OFS="\n";x=1;y=2;z=3}END{print "x is " x, "y is " y, "z is " z}' emptyfile
gives output
x is 1
y is 2
z is 3
Explanation: OFS value (default: space) is used for joining arguments of print. If you want to know more about OFS then read 8 Powerful Awk Built-in Variables – FS, OFS, RS, ORS, NR, NF, FILENAME, FNR
(tested in gawk 4.2.1)
you don't need to hard-code in 10-buckets like that :
jot -r 300 1 169 | mawk '
# Deliberately terse histogram: every variable name consists of underscores.
#   _     bucket width; judging by the sample output it ends up as 10
#         (NOTE(review): the BEGIN expression relies on the order in which
#         nested assignments are evaluated - confirm on the awk in use)
#   ___   array of counts per bucket
#   __    bucket index, int($1 / _), capped at _ for the overflow bucket
#         ($!!_ is $1 as long as _ is non-zero)
BEGIN { _+=(_+=_^=_<_)*_*_ } { ++___[_<(__=int(($!!_)/_))?_:__] }
END {
# ____ becomes a string of NR characters, all replaced by "+"; the "%.*s"
# below prints only its first ___[__] characters as the bar of a bucket.
____ = sprintf("%*s", NR, _)
gsub(".","+",____)
# __ = _-_ starts the bucket loop at 0.
for(__=_-_;__<=_;__++) {
printf(" [%3.f %-6s] : %5.f %.*s\n",__*_,+__==+_?"+ "\
: " , " __*_--+_++, ___[__], ___[__], ____) } }'
[ 0 , 9 ] : 16 ++++++++++++++++
[ 10 , 19 ] : 17 +++++++++++++++++
[ 20 , 29 ] : 16 ++++++++++++++++
[ 30 , 39 ] : 19 +++++++++++++++++++
[ 40 , 49 ] : 14 ++++++++++++++
[ 50 , 59 ] : 18 ++++++++++++++++++
[ 60 , 69 ] : 18 ++++++++++++++++++
[ 70 , 79 ] : 16 ++++++++++++++++
[ 80 , 89 ] : 20 ++++++++++++++++++++
[ 90 , 99 ] : 19 +++++++++++++++++++
[100 + ] : 127 ++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++++++++++++++++

VHDL, process not updating variable's value, implementing a Merge Sort

I am trying to implement an example of Merge Sort algorithm on VHDL in order to sort 4 128bit numbers.
I am using sequential code. I have a process on which I make comparisons. The process consists of 3 phases which implement the logic behind Merge Sort.
The problem is that I am using a variable count : integer which counts clock cycles. I want the phases to follow clock cycles.
It seems like the simulation enters the first IF statement (of Phase 1) but does not enter the others so I guess count variable does not update its value.
I have tried several changes but seems like I am missing something here.
I know the post is a little big, I would appreciate any help! Thanks!
entity Merge_Sort is
Port ( clk : in STD_LOGIC;
reset : in STD_LOGIC;
en : in STD_LOGIC;
In_a : in STD_LOGIC_VECTOR(15 downto 0 ) ;
In_b : in STD_LOGIC_VECTOR(15 downto 0 ) ;
In_c : in STD_LOGIC_VECTOR(15 downto 0 ) ;
In_d : in STD_LOGIC_VECTOR(15 downto 0 ) ;
Sorted_a : out STD_LOGIC_VECTOR(15 downto 0 ) ;
Sorted_b : out STD_LOGIC_VECTOR(15 downto 0 ) ;
Sorted_c : out STD_LOGIC_VECTOR(15 downto 0 ) ;
Sorted_d : out STD_LOGIC_VECTOR(15 downto 0 ) );
end Merge_Sort;
architecture Behavioral of Merge_Sort is
signal temp1a,temp1b,temp1c,temp1d : STD_LOGIC_VECTOR(15 downto 0 ) ;
TYPE arr2 IS ARRAY (0 to 1 ) of STD_LOGIC_VECTOR(15 downto 0) ;
TYPE arr4 IS ARRAY (0 to 3 ) of STD_LOGIC_VECTOR(15 downto 0) ;
signal Array1 , Array2 : arr2 ;
signal mergedArr : arr4 ;
signal temp : std_logic_vector(15 downto 0 ) ;
begin
temp1a <= (others =>'0' ) WHEN reset ='1' else -- Asychronous Resetting
In_a ;
temp1b <= (others =>'0' ) WHEN reset ='1' else
In_b ;
temp1c <= (others =>'0' ) WHEN reset ='1' else -- Asychronous Resetting
In_c ;
temp1d <= (others =>'0' ) WHEN reset ='1' else
In_d ;
Sorted_a <= MergedArr(0) ;
Sorted_b <= MergedArr(1) ;
Sorted_c <= MergedArr(2) ;
Sorted_d <= MergedArr(3) ;
-- Three-phase sorter driven by the rising edge of clk while en = '1'.
-- count selects the phase. It is a VARIABLE, so every ":=" below takes effect
-- immediately, within the same activation of the process.
-- NOTE(review): reset is read below but is not in the sensitivity list, so
-- the process is only activated by events on clk.
Sort: PROCESS(clk)
variable count : integer range 0 to 3 ;
BEGIN
if(reset ='1' ) then count := 0 ;
end if ;
IF ( clk'EVENT AND clk='1' ) then -- Conditions for process to run
IF (en ='1') then
IF(count =0) THEN -- Phase 1 of sort
-- Order each input pair: the smaller value goes to index 0, the larger to index 1.
if (temp1a<temp1b ) then Array1(0)<=temp1a ; Array1(1) <= temp1b ;
else Array1(1)<=temp1a ; Array1(0) <= temp1b ;
end if ;
if (temp1c<temp1d ) then Array2(0)<=temp1c ;Array2(1) <= temp1d ;
else Array2(1)<=temp1c ; Array2(0) <= temp1d ;
end if ;
-- count becomes 1 right here, so the separate IF below is already true
-- on this same clock edge.
count := count +1 ;
END IF ;
IF( count = 1) THEN -- Phase 2 of sort , computing min and max of array
-- Array1 and Array2 are signals: the values assigned in phase 1 only become
-- visible after the process suspends, so a phase 2 that runs on the same
-- edge still reads their previous contents.
if ( Array1(1) < Array2(1) ) then MergedArr(1) <= Array1(1) ; MergedArr(3) <= Array2(1) ;
else MergedArr(3) <= Array1(1) ; MergedArr(1) <= Array2(1) ;
end if ;
if ( Array1(0) < Array2(0) ) then MergedArr(0) <= Array1(0) ; MergedArr(2) <= Array2(0) ;
else MergedArr(2) <= Array1(0) ; MergedArr(0) <= Array2(0) ;
end if ;
count:= count +1 ;
END IF ;
IF(count =2 ) THEN -- Phase 3 of sort , FINAL
-- Swap the two middle elements when they are out of order.
-- NOTE(review): temp is a signal as well, so MergedArr(1) receives the value
-- temp held before this edge, not the MergedArr(2) assigned one line above.
-- count is not advanced in this phase, so it stays at 2 until reset.
if ( MergedArr(1) > MergedArr(2) ) then
temp<= MergedArr(2 ) ;
MergedArr(2) <= MergedArr(1) ;
MergedArr(1) <= temp ;
end if ;
END IF ;
END IF;
END IF ;
END PROCESS ;
end Behavioral;
The variable count is updated immediately inside each IF condition. So, for example, inside the IF(count=0), it is incremented to 1. Then it reaches the IF(count=1) statement, which will of course already be true.
I think really all you need to do is change it to IF ... ELSIF statements:
IF(count =0) THEN -- Phase 1 of sort
...
ELSIF( count = 1) THEN -- Phase 2 of sort , computing min and max of array
...
ELSIF(count =2 ) THEN -- Phase 3 of sort , FINAL
and I think it will work exactly as you expect.

compare file and print class

I have
file1:
id position
a1 21
a1 39
a1 77
b1 88
b1 122
c1 22
file 2
id class position1 position2
a1 Xfact 1 40
a1 Xred 41 66
a1 xbreak 69 89
b1 Xbreak 77 133
b1 Xred 140 199
c1 Xfact 1 15
c1 Xbreak 19 35
I want something like this
output:
id position class
a1 21 Xfact
a1 39 Xfact
a1 77 Xbreak
b1 88 Xbreak
b1 122 Xbreak
c1 22 Xbreak
I need a simple awk script , which print id and position from file1, take position from file1 and compare it to file 2 positions. if position in file 1 lies in range of position 1 and 2 in file two. print corresponding class
One way using awk. It's not a simple script. The process in short: the key point is the variable 'all_ranges'. While it is reset (0), the script reads lines from the file of ranges and saves their data; once it is set (1), it stops reading ranges and goes back to the 'id-position'
file, checks each position against the ranges saved in the array and prints the class of the range that contains it. I've tried to avoid processing the file of ranges many times and to read it in chunks instead, which made the script more complex.
EDIT: I assume that both files are sorted by their id field. Otherwise this script will fail miserably and you will need another approach.
Content of script.awk:
## Annotate every "id position" line of the first file with the class of the
## range (taken from the second file) that contains the position.
##
## Both files must be sorted (grouped) by id. The ranges file is consumed
## sequentially, one id at a time, so an id that appears in the ranges file
## but never in the positions file stalls the reading at that id.
BEGIN {
    ## Arguments:
    ## ARGV[0] = awk
    ## ARGV[1] = <first_input_argument>
    ## ARGV[2] = <second_input_argument>
    ## ARGC = 3
    ## Dropping the last argument keeps awk from reading the ranges file as
    ## regular input; it is read by hand with getline instead.
    f2 = ARGV[ --ARGC ];
    all_ranges = 0      ## 1 once every range of the current id has been read.
    eof = 0             ## 1 once the ranges file is exhausted.
    ranges_id = ""      ## id the entries of 'ranges' belong to.

    ## Read first line from file with ranges to get 'class' header.
    if ( ( getline line <f2 ) <= 0 ) {
        printf "%s\n", "ERROR: cannot read " f2 > "/dev/stderr"
        exit 1;
    }
    split( line, fields )
    class_header = fields[2];
}

## Special case for the header.
FNR == 1 {
    printf "%s\t%s\n", $0, class_header;
    next;
}

## Data.
FNR > 1 {
    while ( 1 ) {
        if ( ! all_ranges ) {
            ## Read line from file with range positions.
            ret = getline line <f2

            ## Check error. Messages go to stderr so that they do not end up
            ## in the formatted table.
            if ( ret == -1 ) {
                printf "%s\n", "ERROR: " ERRNO > "/dev/stderr"
                close( f2 );
                exit 1;
            }

            if ( ret == 0 ) {
                ## End of file: no line is pending, the ranges collected so
                ## far stay valid for the remaining positions of this id.
                eof = 1
                all_ranges = 1
            }
            else {
                ## Split line in spaces.
                num = split( line, fields )
                if ( num != 4 ) {
                    printf "%s\n", "ERROR: Bad format of file " f2 > "/dev/stderr";
                    close( f2 );
                    exit 2;
                }

                range_id = fields[1];
                if ( $1 == fields[1] ) {
                    ranges[ fields[3], fields[4] ] = fields[2];
                    ranges_id = fields[1];
                    continue;
                }

                ## First line of another id: keep it pending in 'fields'
                ## and stop reading for now.
                all_ranges = 1
            }
        }

        ## The pending line belongs to the id of this record: start a new
        ## set of ranges with it and read on. After end of file nothing is
        ## pending, so the ranges must not be reset any more.
        if ( ! eof && range_id == $1 ) {
            delete ranges;
            ranges[ fields[3], fields[4] ] = fields[2];
            ranges_id = range_id;
            all_ranges = 0;
            continue;
        }

        ## Look the position up, but only in ranges that really belong to
        ## this id (an id without ranges prints nothing).
        if ( $1 == ranges_id ) {
            for ( range in ranges ) {
                split( range, pos, SUBSEP )
                if ( $2 >= pos[1] && $2 <= pos[2] ) {
                    printf "%s\t%s\n", $0, ranges[ range ];
                    break;
                }
            }
        }
        break;
    }
}
Run it like:
awk -f script.awk file1 file2 | column -t
With following result:
id position class
a1 21 Xfact
a1 39 Xfact
a1 77 xbreak
b1 88 Xbreak
b1 122 Xbreak
c1 22 Xbreak

How substract millisecond with AWK - script

I'm trying to create an awk script to subtract milliseconds between 2 records joined-up for example:
By command line I might do this:
Input:
06:20:00.120
06:20:00.361
06:20:15.205
06:20:15.431
06:20:35.073
06:20:36.190
06:20:59.604
06:21:00.514
06:21:25.145
06:21:26.125
Command:
awk '{ if ( ( NR % 2 ) == 0 ) { printf("%s\n",$0) } else { printf("%s ",$0) } }' input
I'll obtain this:
06:20:00.120 06:20:00.361
06:20:15.205 06:20:15.431
06:20:35.073 06:20:36.190
06:20:59.604 06:21:00.514
06:21:25.145 06:21:26.125
To subtract the milliseconds properly:
awk '{ if ( ( NR % 2 ) == 0 ) { printf("%s\n",$0) } else { printf("%s ",$0) } }' input| awk -F':| ' '{print $3, $6}'
And to avoid negative numbers:
awk '{if ($2<$1) sub(/00/, "60",$2); print $0}'
awk '{$3=($2-$1); print $3}'
The goal is get this:
Call 1 0.241 ms
Call 2 0.226 ms
Call 3 1.117 ms
Call 4 0.91 ms
Call 5 0.98 ms
And finally an average.
I might perform this but command by command. I dunno how to place this into a script.
Please need help.
Using awk:
awk '
# Return the epoch time (seconds.nanoseconds) that GNU date reports for ts.
# run and out are local work variables.
function epoch(ts,    run, out) {
    run = cmd ts
    run | getline out
    # Every timestamp starts its own date process. Closing the pipe releases
    # it again and lets a timestamp that occurs twice be evaluated twice.
    close(run)
    return out
}
BEGIN { cmd = "date +%s.%N -d " }
# Odd lines hold the start of a call ...
NR%2 {
    var1 = epoch($0)
    next
}
# ... even lines its end: print the difference.
{
    var2 = epoch($0)
    var3 = var2 - var1;
    print "Call " ++i, var3 " ms"
}
' file
Call 1 0.241 ms
Call 2 0.226 ms
Call 3 1.117 ms
Call 4 0.91 ms
Call 5 0.98 ms
One way using awk:
Content of script.awk:
## For every pair of input lines (start and end of a call).
{
    ## Convert formatted dates to time in milliseconds.
    t1 = to_ms( $0 )

    ## Fetch the second line of the pair. With an odd number of lines the
    ## last timestamp has no partner; stop instead of comparing it with
    ## itself.
    if ( ( getline ) <= 0 ) {
        printf "WARNING: unpaired timestamp %s ignored\n", $0 > "/dev/stderr"
        exit
    }
    t2 = to_ms( $0 )

    ## Calculate difference between both dates in milliseconds.
    tr = (t1 >= t2) ? t1 - t2 : t2 - t1

    ## Print to output with time converted to a readable format.
    printf "Call %d %s ms\n", ++cont, to_time( tr )
}

## Convert a date in format hh:mm:ss.mmm to milliseconds.
## time_ms and time_arr are local work variables.
function to_ms(time,    time_ms, time_arr)
{
    split( time, time_arr, /:|\./ )
    time_ms = ( time_arr[1] * 3600 + time_arr[2] * 60 + time_arr[3] ) * 1000 + time_arr[4]
    return time_ms
}

## Convert a time in milliseconds to format hh:mm:ss.mmm. Leading 'hours' and
## 'minutes' with a value of 0 are not printed, but minutes are always shown
## when hours are, and inner fields are zero-padded, so the result stays
## unambiguous. time, ms, s, h and m are local work variables.
function to_time(i_ms,    time, ms, s, h, m)
{
    ms = int( i_ms % 1000 )
    s = int( i_ms / 1000 )
    h = int( s / 3600 )
    s = s % 3600
    m = int( s / 60 )
    s = s % 60
    if ( h != 0 )
        time = sprintf( "%d:%02d:%02d.%03d", h, m, s, ms )
    else if ( m != 0 )
        time = sprintf( "%d:%02d.%03d", m, s, ms )
    else
        time = sprintf( "%d.%03d", s, ms )
    return time
}
Run the script:
awk -f script.awk infile
Result:
Call 1 0.241 ms
Call 2 0.226 ms
Call 3 1.117 ms
Call 4 0.910 ms
Call 5 0.980 ms
If you're not tied to awk:
# Print the epoch time (seconds.nanoseconds) of a timestamp; needs GNU date.
to_epoch() { date -d "$1" "+%s.%N"; }
count=0
# paste joins every two input lines into one "start end" line.
paste - - < input |
while read -r t1 t2; do
    # An odd number of input lines leaves the last start without an end.
    [ -n "$t2" ] || continue
    count=$((count + 1))
    # bash has no floating point arithmetic, so bc does the subtraction.
    diff=$(printf "%s-%s\n" "$(to_epoch "$t2")" "$(to_epoch "$t1")" | bc -l)
    printf "Call %d %5.3f ms\n" "$count" "$diff"
done

Union "tables" with awk

I have multiple "tables" in a file, such as:
col1, col2, col3, col4
1, 2, 3, 4
5, 6, 7, 8
col2, col3, col5
10, 11, 12
13, 14, 15
And I would like to collapse these 2 tables to:
col1, col2, col3, col4, col5
1 , 2 , 3 , 4 ,
5 , 6 , 7 , 8 ,
, 10 , 11 , , 12
, 13 , 14 , , 15
(Note: extra whitespace left just to make things easier to understand)
This would seem to require at least 2 passes, one to collect the full list of columns, and another one to create the output table. Is it possible to do this with awk? If not, what other tool would you recommend?
give this a try:
Code:
$ cat s.awk
# The file is given twice on the command line.
# Pass 1 (NR==FNR): find the highest column number used in any header line.
# The number is everything after "col"; adding 0 makes it numeric and drops
# the trailing comma, so col10 and above work too.
NR==FNR{
    if ($1 ~ /^col/)
        for (i=1;i<=NF;i++) {
            n=substr($i,4)+0
            if (n>maxIndex)
                maxIndex=n
        }
    next
}
# Pass 2, first line: print the merged header col1 .. col<maxIndex>.
FNR==1{
    for (i=1;i<=maxIndex;i++)
        header=(i==maxIndex)?header "col"i:header "col" i ", "
    print header
}
# Header of a table: remember which input field holds which column number.
/^col[1-9]/{
    for (i in places)
        delete places[i]
    for (i=1;i<=NF;i++){
        n=substr($i,4)+0
        places[n]=i
    }
}
# Data row: emit the value for every known column, or an empty cell when the
# current table does not have that column.
/^[0-9]/{
    s=""
    for (i=1;i<=maxIndex;i++)
        s=(i in places)? s $places[i] " " : s ", "
    print s
}
Call with:
awk -f s.awk file file | column -t
Output:
col1, col2, col3, col4, col5
1, 2, 3, 4 ,
5, 6, 7, 8 ,
, 10, 11, , 12
, 13, 14, , 15
HTH Chris
The code assumes that the tables are separated by empty lines:
awk -F', *' '
# A blank line separates two tables: the record right after it is a header.
!NF {
    header_at = NR + 1
    next
}
# Header line: register every column name seen for the first time (cols keeps
# the order of first appearance) and remember the name of each field position
# of the current table.
NR == 1 || NR == header_at {
    for (f = 1; f <= NF; f++) {
        if (!seen[$f]++)
            cols[++c] = $f
        idx[f] = $f
    }
    next
}
# Data line: store each value under (row number, column name).
{
    n++
    for (f = 1; f <= NF; f++)
        vals[n, idx[f]] = $f
}
# Print the merged header, then every row with all columns; a value that was
# never stored comes out as an empty cell.
END {
    for (k = 1; k <= c; k++)
        printf "%s", (cols[k] (k < c ? OFS : RS))
    for (r = 1; r <= n; r++)
        for (k = 1; k <= c; k++)
            printf "%s", (vals[r, cols[k]] (k < c ? OFS : RS))
}' OFS=', ' tables
If you have the tables in separate files:
awk -F', *' '
# Separator after column k: OFS between columns, RS after the last one.
function sep(k) {
    return k < c ? OFS : RS
}
# First line of each file is the header of that table: collect the column
# names not seen before and map the field positions of this file to names.
FNR == 1 {
    for (pos = 1; pos <= NF; pos++) {
        if (!seen[$pos]++)
            cols[++c] = $pos
        idx[pos] = $pos
    }
    next
}
# Every other line is a data row, stored under (row number, column name).
{
    n++
    for (pos = 1; pos <= NF; pos++)
        vals[n, idx[pos]] = $pos
}
# Merged header first, then all rows; missing values print as empty cells.
END {
    for (k = 1; k <= c; k++)
        printf "%s", (cols[k] sep(k))
    for (row = 1; row <= n; row++)
        for (k = 1; k <= c; k++)
            printf "%s", (vals[row, cols[k]] sep(k))
}' OFS=', ' file1 file2 [.. filen]
Here's a one-pass perl solution. It assumes there is at least one blank line between each table in the file.
perl -00 -ne '
    # Paragraph mode (-00): each table, i.e. a header line plus its data
    # rows, arrives as one record. The variables are package globals on
    # purpose, they have to survive from one record to the next.
    BEGIN {
        %column2idx = ();   # column name -> position in the output
        @idx2column = ();   # position in the output -> column name
        $lineno = 0;        # number of data rows stored so far
        @lines = ();        # one array per output row
    }
    chomp;
    @rows = split /\n/;
    # Map the field positions of this table to output positions, adding
    # every column name that has not been seen before.
    @field_map = ();
    @F = split /, /, $rows[0];
    for ($i=0; $i < @F; $i++) {
        if (not exists $column2idx{$F[$i]}) {
            $idx = @idx2column;
            $column2idx{$F[$i]} = $idx;
            $idx2column[$idx] = $F[$i];
        }
        $field_map[$i] = $column2idx{$F[$i]};
    }
    # Store each data row with its values at their output positions.
    for ($i=1; $i < @rows; $i++) {
        @{$lines[$lineno]} = ();
        @F = split /, /, $rows[$i];
        for ($j=0; $j < @F; $j++) {
            $lines[$lineno][$field_map[$j]] = $F[$j];
        }
        $lineno++;
    }
    # Print the merged header and all rows, padded to the full column count;
    # positions without a value come out as empty cells.
    END {
        $ncols = @idx2column;
        print join(", ", @idx2column), "\n";
        foreach $row (@lines) {
            @row = ();
            for ($i=0; $i < $ncols; $i++) {
                push @row, $row->[$i];
            }
            print join(", ", @row), "\n";
        }
    }
' tables | column -t
output
col1, col2, col3, col4, col5
1, 2, 3, 4,
5, 6, 7, 8,
, 10, 11, , 12
, 13, 14, , 15