Convert rows into columns using awk

Not all columns (data fields) are present for all records, so whenever a field is missing it should be replaced with a null (empty) value.
My input format:
.set 1000
EMP_NAME="Rob"
EMP_DES="Developer"
EMP_DEP="Sales"
EMP_DOJ="20-10-2010"
EMR_MGR="Jack"
.set 1001
EMP_NAME="Koster"
EMP_DEP="Promotions"
EMP_DOJ="20-10-2011"
.set 1002
EMP_NAME="Boua"
EMP_DES="TA"
EMR_MGR="James"
My desired output format:
Rob~Developer~Sales~20-10-2010~Jack
Koster~~Promotions~20-10-2011~
Boua~TA~~~James
I tried the below:
awk 'NR>1{printf "%s"(/^\.set/?RS:"~"),a} {a=substr($0,index($0,"=")+1)} END {print a}' $line
This is printing:
Rob~Developer~Sales~20-10-2010~Jack
Koster~Promotions~20-10-2011~
Boua~TA~James~

This awk script produces the desired output:
BEGIN { FS = "[=\"]+"; OFS = "~" }
/\.set/ { ++records; next }
NR > 1 { f[records,$1] = $2 }
END {
    for (i = 1; i <= records; ++i) {
        print f[i,"EMP_NAME"], f[i,"EMP_DES"], f[i,"EMP_DEP"], f[i,"EMP_DOJ"], f[i,"EMR_MGR"]
    }
}
A two-dimensional array is used to store all of the values that are defined for each record.
Once the whole file has been processed, the loop goes through each row of the array and prints all of the values. Elements that are undefined evaluate to an empty string.
Specifying the elements explicitly allows you to control the order in which they are printed. Using print rather than printf allows you to make correct use of the OFS variable, which has been set to ~, as well as ORS, which is a newline character by default.
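As a quick illustration of the last point, print joins its comma-separated arguments with OFS and terminates the record with ORS, while printf does neither unless you spell them out:
$ echo 'a b c' | awk 'BEGIN { OFS = "~" } { print $1, $2, $3 }'
a~b~c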
Thanks to @Ed for his helpful comments that pointed out some flaws in my original script.
Output:
Rob~Developer~Sales~20-10-2010~Jack
Koster~~Promotions~20-10-2011~
Boua~TA~~~James

$ cat tst.awk
BEGIN { FS="[=\"]+"; OFS="~" }
/\.set/ { ++numRecs; next }
{ name2val[numRecs,$1] = $2 }
!seen[$1]++ { names[++numNames] = $1 }
END {
    for (recNr=1; recNr<=numRecs; recNr++)
        for (nameNr=1; nameNr<=numNames; nameNr++)
            printf "%s%s", name2val[recNr,names[nameNr]], (nameNr<numNames?OFS:ORS)
}
$ awk -f tst.awk file
Rob~Developer~Sales~20-10-2010~Jack
Koster~~Promotions~20-10-2011~
Boua~TA~~~James
If you want a pre-defined order of fields in your output rather than building it on the fly from the rows of each record as they're read, just populate the names[] array explicitly in the BEGIN section. If you have that situation AND don't want to hold the whole file in memory:
$ cat tst.awk
BEGIN {
    FS="[=\"]+"; OFS="~"
    numNames=split("EMP_NAME EMP_DES EMP_DEP EMP_DOJ EMR_MGR",names,/ /)
}
function prtName2val(  nameNr, i) {
    if ( length(name2val) ) {
        for (nameNr=1; nameNr<=numNames; nameNr++)
            printf "%s%s", name2val[names[nameNr]], (nameNr<numNames?OFS:ORS)
        delete name2val
    }
}
/\.set/ { prtName2val(); next }
{ name2val[$1] = $2 }
END { prtName2val() }
$ awk -f tst.awk file
Rob~Developer~Sales~20-10-2010~Jack
Koster~~Promotions~20-10-2011~
Boua~TA~~~James
The above uses GNU awk for length(name2val) and delete name2val; if you don't have GNU awk, use for (i in name2val) { do stuff; break } and split("",name2val) instead.
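For reference, a portable rewrite of that function might look like the sketch below; the for (i in name2val) { ...; break } loop serves as the emptiness test and split("", name2val) clears the array:
function prtName2val(  nameNr, i) {
    for (i in name2val) {
        for (nameNr=1; nameNr<=numNames; nameNr++)
            printf "%s%s", name2val[names[nameNr]], (nameNr<numNames?OFS:ORS)
        break
    }
    split("", name2val)
}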

This is all I can suggest:
awk '{ t = $0; sub(/^[^"]*"/, "", t); gsub(/"[^"]*"/, "~", t); sub(/".*/, "", t); print t }' file
Or sed:
sed -re 's|^[^"]*"||; s|"[^"]*"|~|g; s|".*||' file
Output:
Rob~Developer~Sales~20-10-2010~Jack~Koster~Promotions~20-10-2011~Boua~TA~James

Related

change field value of one file based on another input file using awk

I have a sparse matrix ("matrix.csv") with 10k rows; the 1st column is "user" and the remaining 4 columns, called "slots", contain 0s or 1s, like this:
user1,0,1,0,0
user2,0,1,0,1
user3,1,0,0,0
Some of the slots that contain a "0" should be changed to contain a "1".
I have another file ("slots2change.csv") that tells me which slots should be changed, like this:
user1,3
user3,2
user3,4
So for user1, I need to change slot3 to contain a "1" instead of a "0", and for user3 I should change slot2 and slot4 to contain a "1" instead of a "0", and so on.
Expected result:
user1,0,1,1,0
user2,0,1,0,1
user3,1,1,0,1
How can I achieve this using awk or sed?
Looking at this post: awk or sed change field of file based on another input file, a user proposed an answer that is valid only if the "slots2change.csv" file does not contain the same user in different rows, which is not the case here.
The solution proposed was:
awk 'BEGIN{FS=OFS=","}
NR==FNR { arr[$1]=$2; next }
NR!=FNR {
    for (i in arr)
        if ($1 == i) {
            F=arr[i] + 1
            $F=1
        }
    print
}
But that answer doesn't apply when the "slots2change.csv" file contains the same user in different rows, as is now the case.
Any ideas?
Using GNU awk for arrays of arrays:
$ cat tst.awk
BEGIN { FS=OFS="," }
NR == FNR {
    users2slots[$1][$2]
    next
}
$1 in users2slots {
    for ( slot in users2slots[$1] ) {
        $(slot+1) = 1
    }
}
{ print }
$ awk -f tst.awk slots2change.csv matrix.csv
user1,0,1,1,0
user2,0,1,0,1
user3,1,1,0,1
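A detail worth noting: the bare reference users2slots[$1][$2] used as a statement is enough to create the index, which is all the later for (slot in users2slots[$1]) loop needs. A minimal sketch of that behavior (the traversal order of for-in is unspecified):
$ printf 'user3,2\nuser3,4\n' | gawk -F, '{ u2s[$1][$2] } END { for (s in u2s["user3"]) print s }'
2
4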
or using any awk:
$ cat tst.awk
BEGIN { FS=OFS="," }
NR == FNR {
    if ( !seen[$0]++ ) {
        users2slots[$1] = ($1 in users2slots ? users2slots[$1] FS : "") $2
    }
    next
}
$1 in users2slots {
    split(users2slots[$1],slots)
    for ( idx in slots ) {
        slot = slots[idx]
        $(slot+1) = 1
    }
}
{ print }
$ awk -f tst.awk slots2change.csv matrix.csv
user1,0,1,1,0
user2,0,1,0,1
user3,1,1,0,1
Using sed
while IFS="," read -r user slot; do
    # anchor on the user field so that e.g. user1 doesn't also match user10
    sed -Ei "/^$user,/{s/(([^,]*,){$slot})[^,]*/\11/}" matrix.csv
done < slots2change.csv
$ cat matrix.csv
user1,0,1,1,0
user2,0,1,0,1
user3,1,1,0,1
If the order in which the users are outputted doesn't matter then you could do something like this:
awk '
BEGIN { FS = OFS = "," }
FNR == NR {
    fieldsCount[$1] = NF
    for (i = 1; i <= NF; i++ )
        matrix[$1,i] = $i
    next
}
{ matrix[$1,$2+1] = 1 }
END {
    for ( id in fieldsCount ) {
        nf = fieldsCount[id]
        for (i = 1; i <= nf; i++)
            printf "%s%s", matrix[id,i], (i < nf ? OFS : ORS)
    }
}
' matrix.csv slots2change.csv
user1,0,1,1,0
user2,0,1,0,1
user3,1,1,0,1
This might work for you (GNU sed):
sed -E 's#(.*),(.*)#/^\1/s/,[01]/,1/\2#' fileChanges | sed -f - fileCsv
Create a sed script from the file containing the changes and apply it to the intended file.
The solution above manufactures a match and substitution for each line in the changes file. This is then piped to a second invocation of sed, which applies the generated script to the csv file.
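For the sample slots2change.csv, the first invocation manufactures this script, which the second invocation then executes; each s command replaces the Nth occurrence of ,0 or ,1 on the matching user's line:
/^user1/s/,[01]/,1/3
/^user3/s/,[01]/,1/2
/^user3/s/,[01]/,1/4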

Multiple options of NF for identifying duplicates in different positions with awk?

I hope you find yourself well. I am writing to ask whether it is possible to do something like this in awk.
I need something like multiple cases of NF: for NF == 7 the PK is $1,$5, but for NF == 8 it is $1,$6.
Input:
AAA|BBB|CCC|DDD|111|20220129|JONH1
AAA|XXX|YYY|DDD|444|20210115|JONH2
AAA|B10|CCC|DDD|000|20200127|JONH3
AAA|BBB|MMM|DDD|444|20200131|JONH4
AAA|BBB|CCC|DDD|777|0054256|JONH5|MARY
AAA|BBB|CCC|DDD|111|0036000|JONH5|MARY
AAA|BBB|CCC|DDD|888|0089999|CENTRAL|MARY
AAA|BBB|CCC|DDD|999|0054256|JONH5|MARY
AAA|BBB|CCC|DDD|202|0054256|JONH5|MARY|MIAMI|FL
Desired outputs:
file .PK_OK_1
AAA|BBB|CCC|DDD|111|20220129|JONH1
AAA|B10|CCC|DDD|000|20200127|JONH3
file DUPLICATE_PK_1
AAA|XXX|YYY|DDD|444|20210115|JONH2
AAA|BBB|MMM|DDD|444|20200131|JONH4
file PK_OK_2
AAA|BBB|CCC|DDD|111|0036000|JONH5|MARY
AAA|BBB|CCC|DDD|888|0089999|CENTRAL|MARY
file DUPLICATE_PK_2
AAA|BBB|CCC|DDD|777|0054256|JONH5|MARY
AAA|BBB|CCC|DDD|999|0054256|JONH5|MARY
file INVALID_LENGHT
AAA|BBB|CCC|DDD|202|0054256|JONH5|MARY|MIAMI|FL
My code is something like this (NOM_ARCH is a variable):
BEGIN {
    FS="|"
    OFS="|"
}
NF == 7 {
    if (!seen[$1,$5]) {
        print > NOM_ARCH".PK_OK_1"; seen[$1,$5]=1
    } else {
        print > NOM_ARCH".DUPLICATE_PK_1"
    }
    next
}
NF == 8 {
    if (!seen[$1,$6]) {
        print > NOM_ARCH".PK_OK_2"; seen[$1,$6]=1
    } else {
        print > NOM_ARCH".DUPLICATE_PK_2"
    }
    next
}
{ print > NOM_ARCH".INVALID_LENGHT" }
With your shown samples, please try the following awk code.
awk '
BEGIN{ FS=OFS="|" }
{
    if(NF==7){ key=($1 FS $5) }
    if(NF==8){ key=($1 FS $6) }
}
FNR==NR{
    arr1[key]++
    next
}
NF==7{
    outputFile=(arr1[key]==1?"file.PK_OK_1":"file_DUPLICATE_PK_1")
}
NF==8{
    outputFile=(arr1[key]==1?"file.PK_OK_2":"file_DUPLICATE_PK_2")
}
NF>8{
    outputFile="file_INVALID_LENGHTH"
}
{
    print > (outputFile)
}
' Input_file Input_file
OR use the following code without ternary operators, as per the OP's request:
awk '
BEGIN{ FS=OFS="|" }
{
    if(NF==7){ key=($1 FS $5) }
    if(NF==8){ key=($1 FS $6) }
}
FNR==NR{
    arr1[key]++
    next
}
NF==7{
    if(arr1[key]==1){ outputFile="file.PK_OK_1" }
    else            { outputFile="file_DUPLICATE_PK_1" }
}
NF==8{
    if(arr1[key]==1){ outputFile="file.PK_OK_2" }
    else            { outputFile="file_DUPLICATE_PK_2" }
}
NF>8{
    outputFile="file_INVALID_LENGHTH"
}
{
    print > (outputFile)
}
' Input_file Input_file
Explanation: Adding a detailed explanation for the above.
## Starting the awk program from here.
awk '
## Starting the BEGIN section of this program from here, setting FS and OFS to | here.
BEGIN{ FS=OFS="|" }
## Starting the main program from here.
{
    ## Checking condition: if NF is 7 then set key to $1 FS $5.
    if(NF==7){ key=($1 FS $5) }
    ## Checking condition: if NF is 8 then set key to $1 FS $6.
    if(NF==8){ key=($1 FS $6) }
}
## Checking condition FNR==NR, which will be TRUE the first time Input_file is being read.
FNR==NR{
    ## Creating array arr1 indexed by key, increasing that entry by 1 each time the key is seen.
    arr1[key]++
    ## next will skip all further statements from here.
    next
}
## Checking condition: if NF==7 then do the following.
NF==7{
    ## Setting outputFile (where contents will be written to), either file.PK_OK_1 OR file_DUPLICATE_PK_1 depending upon the value of arr1.
    ## Basically it uses the ternary operators ? and :
    ## The expression after ? is used if the condition arr1[key]==1 is TRUE.
    ## The expression after : is used if the condition arr1[key]==1 is FALSE.
    outputFile=(arr1[key]==1?"file.PK_OK_1":"file_DUPLICATE_PK_1")
}
## Checking condition: if NF==8 then do the following.
NF==8{
    ## Setting outputFile (where contents will be written to), either file.PK_OK_2 OR file_DUPLICATE_PK_2 depending upon the value of arr1.
    outputFile=(arr1[key]==1?"file.PK_OK_2":"file_DUPLICATE_PK_2")
}
## Checking condition: if NF>8 then do the following.
NF>8{
    ## Setting outputFile (where contents will be written to) to file_INVALID_LENGHTH here.
    outputFile="file_INVALID_LENGHTH"
}
{
    ## Printing the current line to outputFile (its value was already set above).
    print > (outputFile)
}
## Mentioning the Input_file names here.
' Input_file Input_file
Normally I'd recommend a first pass with sort and uniq -c for efficiency, but I started out assuming the wrong requirements and wrote most of this under that assumption, so I've just tweaked it for the real requirements. Here's how to do it all in one awk script:
$ cat tst.awk
BEGIN {
    FS=OFS="|"
    map[7] = 1
    map[8] = 2
}
{ key = $1 FS $(NF-2) FS NF }
NR==FNR {
    cnt[key]++
    next
}
{
    if ( NF in map ) {
        sfx = ( cnt[key]>1 ? "DUPLICATE_PK" : "PK_OK" ) "_" map[NF]
    }
    else {
        sfx = "INVALID_LENGTH"
    }
    print > (nom_arch "." sfx)
}
$ awk -v nom_arch='foo' -f tst.awk file file
$ head foo.*
==> foo.DUPLICATE_PK_1 <==
AAA|XXX|YYY|DDD|444|20210115|JONH2
AAA|BBB|MMM|DDD|444|20200131|JONH4
==> foo.DUPLICATE_PK_2 <==
AAA|BBB|CCC|DDD|777|0054256|JONH5|MARY
AAA|BBB|CCC|DDD|999|0054256|JONH5|MARY
==> foo.INVALID_LENGTH <==
AAA|BBB|CCC|DDD|202|0054256|JONH5|MARY|MIAMI|FL
==> foo.PK_OK_1 <==
AAA|BBB|CCC|DDD|111|20220129|JONH1
AAA|B10|CCC|DDD|000|20200127|JONH3
==> foo.PK_OK_2 <==
AAA|BBB|CCC|DDD|111|0036000|JONH5|MARY
AAA|BBB|CCC|DDD|888|0089999|CENTRAL|MARY
I corrected the spelling of LENGTH above.
Note that NF is included in key = $1 FS $(NF-2) FS NF so we avoid a potential case pointed out by @rowboat where a line with 7 fields has the same $1 and $(NF-2) as a line with 8 fields, in which case we would otherwise count that key twice when it should be 2 separate counts of 1.
We could have used NF-6 instead of map[NF] when setting the sfx but the map[] is useful for identifying valid NF values too and there may be other values of NF in future for which the sfx can't be determined by just subtracting 6.
This uses GNU awk for multidimensional arrays:
# classify.awk
BEGIN {
    FS = "|"
    ok[7] = ".PK_OK_1"; dup[7] = ".DUPLICATE_PK_1"
    ok[8] = ".PK_OK_2"; dup[8] = ".DUPLICATE_PK_2"
}
NF < 7 || NF > 8 {
    print > (nom_arch ".INVALID_LENGTH")
    next
}
{
    pk = $1 SUBSEP (NF == 7 ? $5 : $6)
    count[NF][pk]++
    lines[NF][pk] = lines[NF][pk] $0 ORS
}
END {
    for (nf in count)
        for (pk in count[nf]) {
            outfile = nom_arch (count[nf][pk] == 1 ? ok[nf] : dup[nf])
            sub(ORS"$", "", lines[nf][pk])
            print lines[nf][pk] > outfile
        }
}
Then this will produce the desired output files:
gawk -f classify.awk -v nom_arch="foo" file
The awk SUBSEP variable is used in array keys when you do something like
var[x,y] = 10
awk uses the value of SUBSEP to join the values of x and y.
The default SUBSEP value is octal value 034, an ASCII character unlikely to appear in text data.
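A quick sketch of the round trip, building a key with a comma subscript and taking it apart again by splitting on SUBSEP:
$ awk 'BEGIN { var["user1",3] = 10; for (k in var) { split(k, parts, SUBSEP); print parts[1], parts[2], var[k] } }'
user1 3 10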
This version is more portable, does not require GNU awk
BEGIN {
    FS = "|"
    ok[7] = ".PK_OK_1"; dup[7] = ".DUPLICATE_PK_1"
    ok[8] = ".PK_OK_2"; dup[8] = ".DUPLICATE_PK_2"
}
NF < 7 || NF > 8 {
    print > (nom_arch".INVALID_LENGTH")
    next
}
{
    pk = NF SUBSEP $1 SUBSEP (NF == 7 ? $5 : $6)
    count[pk]++
    lines[pk] = lines[pk] $0 ORS
}
END {
    for (pk in count) {
        sub(ORS"$", "", lines[pk])
        nf = pk; sub(SUBSEP".*", "", nf)
        outfile = nom_arch (count[pk] == 1 ? ok[nf] : dup[nf])
        print lines[pk] > outfile
    }
}
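Run it the same way as the gawk version; any POSIX awk will do (assuming you saved this variant as classify.awk as well):
awk -v nom_arch="foo" -f classify.awk file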
If it's ok to put the first occurrence of a dup in with the OK's, then one pass is easy.
NOM_ARCH=/tmp/mytest
awk -v nom_arch="$NOM_ARCH" '
BEGIN { FS=OFS="|" }
{
    # NF is part of the key so a 7-field and an 8-field line with the
    # same $1 and $(NF-2) cannot shadow each other in seen[]
    if (NF ~ /^[78]$/) { key=($1 FS $(NF-2) FS NF) }
    else { print > (nom_arch ".INVALID_LENGTH"); next }
    print > ( nom_arch "." ( seen[key]++ ? "DUPLICATE_PK" : "PK_OK" ) "_" (NF-6) )
}' file
cf. AAA|B10|CCC|DDD|000|20200127|JONH3 and AAA|BBB|CCC|DDD|999|0054256|JONH5|MARY, which land in the OK files as the first hit, while subsequent dups get seen and directed elsewhere. Note that it might still be faster to shift those records between smaller files on a second pass after the fact.
Personally, I'd probably just split the records into key-sorted files by NF first. Then the second pass over each one is easy.
NOM_ARCH=/tmp/mytest
# this pre-sort is likely the slow part, though smaller files and in parallel
awk 'BEGIN { FS=OFS="|" } { k2=NF-2; print | "sort -t\\| -k1,1 -k"k2","k2">NF"NF; }' file
shopt -s extglob; cat NF!([78]) > $NOM_ARCH.INVALID_LENGTH &
for f in NF[78]; do
    awk -v nom_arch="$NOM_ARCH" '
    BEGIN { FS=OFS="|"; lastkey=""; lastrec="" }
    END { if (""!=lastrec) { print lastrec>f } }
    {
        key=($1 FS $(NF-2))
        if ( key==lastkey ) {
            f=(nom_arch".DUPLICATE_PK_"NF-6)
            if (""!=lastrec) { print lastrec>f }
            print $0>f
            lastrec=""
        } else {
            if (""!=lastrec) { print lastrec>f }
            f=(nom_arch".PK_OK_"NF-6)
            lastkey=($1 FS $(NF-2))
            lastrec=$0
        }
    }' "$f" &
done
wait
Now your data should be sorted to files. This likely reorders the records in those files (see below), so if that matters you should add sorts to those outputs as well.
mytest.PK_OK_1:
AAA|B10|CCC|DDD|000|20200127|JONH3
AAA|BBB|CCC|DDD|111|20220129|JONH1
mytest.PK_OK_2:
AAA|BBB|CCC|DDD|111|0036000|JONH5|MARY
AAA|BBB|CCC|DDD|888|0089999|CENTRAL|MARY
mytest.DUPLICATE_PK_1:
AAA|BBB|MMM|DDD|444|20200131|JONH4
AAA|XXX|YYY|DDD|444|20210115|JONH2
mytest.DUPLICATE_PK_2:
AAA|BBB|CCC|DDD|777|0054256|JONH5|MARY
AAA|BBB|CCC|DDD|999|0054256|JONH5|MARY
mytest.INVALID_LENGTH:
AAA|BBB|CCC|DDD|202|0054256|JONH5|MARY|MIAMI|FL
This uses more disk space but less memory than an internal lookup table, and is likely a lot slower.
YMMV.

Truncation of strings after running awk script

I have this code
BEGIN { FS=OFS=";" }
{ key = $(NF-1) }
NR == FNR {
    for (i=1; i<(NF-1); i++) {
        if ( !seen[key,$i]++ ) {
            map[key] = (key in map ? map[key] OFS : "") $i
        }
    }
    next
}
{ print $0 map[key] }
I run the code this way:
awk -f tst.awk 2.txt 1.txt
I have two text files
1.txt
AA;BB;
2.txt
CC;DD;BB;AA;
I am trying to generate this 3.txt output:
AA;BB;CC;DD;
but with this script it is not possible; the script returns only AA;BB;
Logic: the above just uses literal strings in a hash lookup of array indices, so it doesn't care what characters you have in your input. About the sample output: if 2.txt contains fields that are also in 1.txt, for example BB;AA;, then you need to concatenate them into a single row, i.e. AA;BB;CC;DD;. Ordering is not required; for example, it is not relevant if the output is BB;AA;DD;CC;. The only condition required is to avoid duplicates, but my script already does this.
Could you please try the following. As per the OP's comment, both files have only 1 line, so the paste command is used to combine both files and its output is then processed by awk.
paste -d';' 1.txt 2.txt |
awk '
BEGIN{
    FS=OFS=";"
}
{
    for(i=1;i<=NF;i++){
        # skip the empty fields created by the trailing and joining semicolons
        if($i!="" && !seen[$i]++){ val=(val?val OFS:"")$i }
    }
    print val
    delete seen
    val=""
}'
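For the sample files, the paste step hands awk a single combined line to dedupe:
$ paste -d';' 1.txt 2.txt
AA;BB;;CC;DD;BB;AA;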

Sum up from line "A" to line "B" from a big file using awk

aNumber|bNumber|startDate|timeZone|duration|currencyType|cost|
22677512549|778|2014-07-02 10:16:35.000|NULL|NULL|localCurrency|0.00|
22675557361|76457227|2014-07-02 10:16:38.000|NULL|NULL|localCurrency|10.00|
22677521277|778|2014-07-02 10:16:42.000|NULL|NULL|localCurrency|0.00|
22676099496|77250331|2014-07-02 10:16:42.000|NULL|NULL|localCurrency|1.00|
22667222160|22667262389|2014-07-02 10:16:43.000|NULL|NULL|localCurrency|10.00|
22665799922|70110055|2014-07-02 10:16:45.000|NULL|NULL|localCurrency|20.00|
22676239633|433|2014-07-02 10:16:48.000|NULL|NULL|localCurrency|0.00|
22677277255|76919167|2014-07-02 10:16:51.000|NULL|NULL|localCurrency|1.00|
This is the input (a sample from millions of lines) I have in a csv file.
I want to sum up duration based on date.
My concern is that I want to sum up only the first 1000000 lines.
The awk program I'm using is:
test.awk
BEGIN { FS = "|" }
NR>1 && NR<=1000000
FNR == 1{ next }
{
sub(/ .*/,"",$3)
key=sprintf("%10s",$3)
duration[key] += $5 } END {
printf "%-10s %16s,"dAccused","Duration"
for (i in duration) {
printf "%-4s %16.2f i,duration[i]
}}
I run my script as
$ awk -f test.awk 'file'
My condition NR>1 && NR<=1000000 doesn't seem to be applied to the input I have.
Any suggestions, please!
You're looking for this:
BEGIN { FS = "|" }
1 < NR && NR <= 1000000 {
    sub(/ .*/, "", $3)
    key = sprintf("%10s",$3)
    duration[key] += $5
}
END {
    printf "%-10s %16s\n", "dAccused", "Duration"
    for (i in duration) {
        printf "%-4s %16.2f\n", i, duration[i]
    }
}
A lot of errors become obvious with proper indentation.
The reason you saw 1,000,000 lines was due to this:
NR>1 && NR<=1000000
That is a condition with no action block. The default action is to print the current record if the condition is true. That's why you see a lot of awk one-liners end with the number 1.
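For example, these two commands are equivalent; the bare condition gets the implicit action { print $0 }:
$ printf 'a\nb\nc\n' | awk 'NR<=2'
a
b
$ printf 'a\nb\nc\n' | awk 'NR<=2 { print $0 }'
a
b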
You didn't post any expected output and your duration field is always NULL, so it's still not clear what output you really want, but this is probably the right approach:
$ cat tst.awk
BEGIN { FS = "|" }
NR==1 { for (i=1;i<NF;i++) f[$i] = i; next }
{
    sub(/ .*/,"",$(f["startDate"]))
    sum[$(f["startDate"])] += $(f["duration"])
}
NR==1000000 { exit }
END { for (date in sum) print date, sum[date] }
$ awk -f tst.awk file
2014-07-02 0
Instead of discarding your header line, it uses it to create an array f[] that maps the field names to their positions in each line, so instead of having to hard-code that duration is field 5 (or whatever) you just reference it as $(f["duration"]).
Any time your input file has a header line, don't discard it - use it so your script is not coupled to the order of fields in your input file.
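For instance, with the header line from this question's input, the same f[] mapping lets you pull any column by name rather than by position (a hypothetical lookup of the cost column):
$ awk 'BEGIN{FS="|"} NR==1{for (i=1;i<NF;i++) f[$i]=i; next} {print $(f["cost"])}' file
0.00
10.00
0.00
1.00
10.00
20.00
0.00
1.00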

awk transpose column to line and adjust field width [duplicate]

With this script every field is printed out padded according to the longest word of the current file, but there needs to be a line break after every file. How can this be achieved?
awk 'BEGIN{ORS="\n"}FNR=NR{a[i++]=$0; if(length($0) > length(max)) max=$0;l=length(max)} END{ for(j=1; j<=i;j++) printf("%-"(l+1)"s,",a[j-1])}' file1 file2 >outfile
file1
HELLO
WORLD
SOUTH IS
WARM
NORTH IS
COLD
file2
HELLO
WORLD
SOUTH
WARM
NORTH
COLD
output
HELLO ,WORLD ,SOUTH IS ,WARM ,NORTH IS ,COLD
HELLO ,WORLD ,SOUTH ,WARM ,NORTH ,COLD
It's not entirely clear what you are asking for, but perhaps you just want:
FNR==1 {print "\n"}
Which will print a newline whenever it starts reading the first line of a file. Make sure this pattern/action is before any others so that the newline prints before any other action prints anything for the first line of the current file. (This does not appear to apply in your case, since no such action exists.)
Took me some time, but I got it solved with this script.
awk '{ NR>1 && FNR==1 ? l=length($0) && a[i++]= "\n" $0 : a[i++]=$0 }
{if(NR>1 && FNR==1) for(e=(i-c);e<=(i-1);e++) b[e]=d ;c=FNR; d=l }
{ if( length($0) > l) l=length($0)+1 }
END{for(e=(i-c+1);e<=i;e++) b[e]=d; for(j=1;j<=i;j++) printf("%-"b[j]"s,",a[j-1] )}' infiles* >outfile
#!/usr/bin/awk -f
function beginfile (file) {
    split("", a)
    max = 0
    delim = ""
}
function endfile (file) {
    for (i = 1; i <= lines; i++) {
        printf "%s%-*s", delim, max, a[i]
        delim = " ,"
    }
    printf "\n"
}
FILENAME != _oldfilename \
{
    if (_oldfilename != "")
        endfile(_oldfilename)
    _oldfilename = FILENAME
    beginfile(FILENAME)
}
END { endfile(FILENAME) }
{
    len = length($0)
    if (len > max) {
        max = len
    }
    a[FNR] = $0
    lines = FNR
}
To run it:
chmod u+x filename
./filename file1 file2
Note that in gawk you can do delete a instead of split("", a). GAWK 4 has builtin BEGINFILE and ENDFILE.
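With gawk 4 those hooks remove the filename bookkeeping entirely; a minimal sketch of the same script using them (assuming gawk 4+ is available):
#!/usr/bin/gawk -f
BEGINFILE {
    delete a        # gawk extension for clearing an array
    max = 0
    delim = ""
}
{
    len = length($0)
    if (len > max)
        max = len
    a[FNR] = $0
    lines = FNR
}
ENDFILE {
    for (i = 1; i <= lines; i++) {
        printf "%s%-*s", delim, max, a[i]
        delim = " ,"
    }
    printf "\n"
}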