Combine multiple text files using awk

I have some per-minute stats saved in text files named 1min.txt, 2min.txt, etc.
1min.txt
F1,21
F2,32
F3,22
2min.txt
F2,12
F4,32
I would like to combine these files in the following format:
combined.txt
Field 1min 2min
F1 21 0
F2 32 12
F3 22 0
F4 0 32
Some fields may not exist in some files and 0 will be entered for those fields.
I've tried to do it using awk but couldn't find an easy way. Can someone please help?
Thanks

Using awk:
awk -F, '
!seen[FILENAME]++ {
    fname[++numFile] = FILENAME
}
{
    flds[$1]++;
    map[FILENAME,$1] = $2
}
END {
    printf "%-10s", "FIELD";
    for (cnt=1; cnt<=numFile; cnt++) {
        file = fname[cnt];
        sub(/\.txt$/, "", file);
        printf "%-10s", file;
    }
    print "";
    for (fld in flds) {
        printf "%-10s", fld;
        for (cnt=1; cnt<=numFile; cnt++) {
            printf "%-10s", map[fname[cnt],fld]+0
        }
        print "";
    }
}' 1min.txt 2min.txt
Output:
FIELD 1min 2min
F1 21 0
F2 32 12
F3 22 0
F4 0 32
Once you have reviewed the output, you can redirect it to another file. You can pass as many files at the end as you want; if you have a lot of them, you can even use a shell glob, e.g. *.txt.
Note: the order of fields is not guaranteed, since they are not always present in all files.
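For example, a minimal usage sketch (the script name combine.awk is only illustrative; it would hold the program between the single quotes above):
awk -F, -f combine.awk *.txt > combined.txt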
Here is a just-for-fun Perl JAPH that does the same:
perl -F, -lane'
$f{$ARGV}++; $h{$F[0]}
{$ARGV}= $F[ 1 ]
}{print join"\t",
"FIELD", map{s/.[tx]+
//x ;$_}sort{$a
<=>$b} keys%f;print
join"\n", map{$f
=$_; join
"\t", $f,map
{$h{$f
}{$_}
//=0}
sort{$a
<=>$b}
keys%f
}sort
keys%h;
' *.txt
Output:
FIELD 1min 2min
F1 21 0
F2 32 12
F3 22 0
F4 0 32

$ cat tst.awk
BEGIN { FS=","; OFS="\t" }
{ keys[$1]; val[$1,NR==FNR] = $2 }
END {
    print "Field", "1min", "2min"
    for (key in keys) {
        print key, val[key,1]+0, val[key,0]+0
    }
}
$ awk -f tst.awk 1min.txt 2min.txt
Field 1min 2min
F1 21 0
F2 32 12
F3 22 0
F4 0 32
If you care about the output order, tell us what order you're looking for - the order the keys were seen across both files, alphabetical, or something else. If it's the order they were seen, then that'd be:
$ cat tst.awk
BEGIN { FS=","; OFS="\t" }
!seen[$1]++ { keys[++numKeys] = $1 }
{ val[$1,NR==FNR] = $2 }
END {
    print "Field", "1min", "2min"
    for (k=1; k<=numKeys; k++) {
        key = keys[k]
        print key, val[key,1]+0, val[key,0]+0
    }
}
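If alphabetical order is what you want instead, here is a minimal sketch of the same idea (not in the original answer), assuming GNU awk 4.0+ for PROCINFO["sorted_in"]:
$ cat tst.awk
BEGIN {
    FS=","; OFS="\t"
    PROCINFO["sorted_in"] = "@ind_str_asc"   # gawk-only: for-in walks keys in ascending string order
}
{ keys[$1]; val[$1,NR==FNR] = $2 }
END {
    print "Field", "1min", "2min"
    for (key in keys)
        print key, val[key,1]+0, val[key,0]+0
}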

Using join:
join -t , input1 input2 -j 1 -o "0 1.2 2.2" -e 0 -a1 -a2 | column -t -s,
Gives:
F1 21 0
F2 32 12
F3 22 0
F4 0 32
To add a header:
join -t , input1 input2 -j 1 -o "0 1.2 2.2" -e 0 -a1 -a2 | \
sed '1iField,1min,2min' | column -t -s,
And the result looks like:
Field 1min 2min
F1 21 0
F2 32 12
F3 22 0
F4 0 32
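Note that join requires both inputs to be sorted on the join field. If they are not already sorted, a sketch using process substitution (bash/zsh) keeps it a one-liner:
join -t , -j 1 -o "0 1.2 2.2" -e 0 -a1 -a2 <(sort input1) <(sort input2) | column -t -s,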

Awk allows you to explicitly read from files, so you can just put all the logic in a BEGIN section if you want. Here is an example:
awk -F, '
BEGIN {
    while ((getline < "1min.txt") > 0) {
        field[$1]=1
        a1[$1]=$2
    }
    close("1min.txt")
    while ((getline < "2min.txt") > 0) {
        field[$1]=1
        a2[$1]=$2
    }
    close("2min.txt")
    print "Field\t1min\t2min"
    for (x in field) {
        print x"\t"(a1[x]+0)"\t"(a2[x]+0)
    }
}
'
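This is not part of the original answer, but the same getline idea can be sketched for however many files are named on the command line, checking getline's return value so an unreadable file cannot loop forever:
awk -F, '
BEGIN {
    header = "Field"
    for (i = 1; i < ARGC; i++) {                # every file named on the command line
        file = ARGV[i]
        name = file; sub(/\.txt$/, "", name)
        header = header "\t" name
        while ((getline < file) > 0) {          # stop on EOF or error
            field[$1] = 1
            val[i, $1] = $2
        }
        close(file)
    }
    print header
    for (x in field) {
        line = x
        for (i = 1; i < ARGC; i++)
            line = line "\t" (val[i, x] + 0)
        print line
    }
}' 1min.txt 2min.txt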

I have written some python code to solve your problem.
fh_1 = open("1min.txt", "r")
fh_2 = open("2min.txt", "r")
fh_3 = open("combine.txt", "w")
min_c_1 = {}
min_c_2 = {}
lines_of_text = ["Field 1min 2min\n"]
# read each input file into a dict keyed by field name
for l1 in fh_1.readlines():
    data = l1.split(',')
    min_c_1[data[0]] = data[1].rstrip()
for l1 in fh_2.readlines():
    data = l1.split(',')
    min_c_2[data[0]] = data[1].rstrip()
# fields present in 1min.txt (and possibly also in 2min.txt)
for key in min_c_1.keys():
    if key in min_c_2.keys():
        msg = str(key) + " " + str(min_c_1[key]) + " " + str(min_c_2[key]) + "\n"
        lines_of_text.append(msg)
        del min_c_2[key]
    else:
        msg = str(key) + " " + str(min_c_1[key]) + " 0" + "\n"
        lines_of_text.append(msg)
# fields that were only in 2min.txt
for key in min_c_2.keys():
    msg = str(key) + " 0" + " " + str(min_c_2[key]) + "\n"
    lines_of_text.append(msg)
fh_3.writelines(lines_of_text)
fh_1.close()
fh_2.close()
fh_3.close()
Please let me know if it does not help.
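As a usage sketch: saving this as, say, combine_minutes.py (the name is only illustrative) and running python combine_minutes.py should leave a combine.txt roughly matching the question's expected output, though the row order depends on dictionary iteration order:
Field 1min 2min
F1 21 0
F2 32 12
F3 22 0
F4 0 32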

Related

How to make an array of alphabets from a file and update in a new file

I have a single column file.
A
A
A
B
B
B
C
C
D
I want to use this file to make a new one as below:
command="A" "B" "C" "D"
TYPE=1 1 1 2 2 2 3 3 4,
These A B C D are arbitrary letters and vary from file to file.
I tried to solve this with the shell script below:
#!/bin/bash
NQ=$(cat RLP.txt | wc -l)
ELEMENT='element='
echo "$ELEMENT" > element.txt
TYPE='types='
echo "$TYPE" > types.txt
for i in `seq 1 1 $NQ`
do
    RLP=$(echo "$i" | tail -n 1)
    cat RLP.txt | head -n "$RLP" | tail -n 1 > el.$RLP.txt
done
paste element.txt el.*.txt
paste types.txt
The output of paste element.txt el.*.txt is element= A A A B B B C C D
I could not remove the repeated letters and put the remaining letters in quotes ("").
And I could not move forward with the second command to get
TYPE=1 1 1 2 2 2 3 3 4,
which indicates that the 1st letter is repeated three times, the 2nd letter three times, the 3rd letter two times, and so on.
$ cat tst.awk
!seen[$1]++ {
    cmd = cmd sep "\"" $1 "\""
    cnt++
}
{
    type = type sep cnt
    sep = OFS
}
END {
    print "command=" cmd
    print "TYPE=" type ","
}
$ awk -f tst.awk file
command="A" "B" "C" "D"
TYPE=1 1 1 2 2 2 3 3 4,
Instead of using multiple text-processing tools in a pipeline, this can be achieved with one awk command, as below:
awk '
{
    unique[$0]
}
prev !~ $0 {
    alpha[NR] = idx++
}
{
    prev = $0
    alpha[NR] = idx
}
END {
    for (i in unique) {
        str = str ? (str " " "\"" i "\"") : "\"" i "\""
    }
    first = "command=" str
    str = ""
    for (i = 1; i <= NR; i++) {
        str = str ? (str " " alpha[i]) : alpha[i]
    }
    second = "TYPE=" str ","
    print(first "\n" second) > "types.txt"
    close("types.txt")
}' RLP.txt
The command works as follows
Each unique line in the file is saved as an index into the array unique
The array alpha keeps track of the unique value counter, i.e. every time a value in the file changes, the counter is incremented at the corresponding line number NR
The END block is all about constructing the output from the array to a string value and writing the result to the new file "types.txt"
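With the sample RLP.txt from the question, types.txt should then contain something like the following (the order of the quoted letters depends on awk's unordered for (i in unique) traversal, so it may differ):
command="A" "B" "C" "D"
TYPE=1 1 1 2 2 2 3 3 4,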
A pure bash implementation. Requires at least Bash version 4 for the associative array:
#!/bin/bash
outfile="./RLP.txt"
infile="./infile"
declare -A map
while read -r line; do
    (( map["$line"]++ ))
done < "$infile"
command="command="
command+=$(printf "\"%s\" " "${!map[@]}")
type="$(
    for i in "${map[@]}"; do
        ((k++))
        for (( j=0; j < i; j++ )); do
            printf " %d" "$k"
        done
    done
),"
echo "$command" >> "$outfile"
echo "TYPE=${type#* }" >> "$outfile"

Newline in the middle of output

I'm trying to cross two files using their location.
f1:
Location Consequence SYMBOL Feature gnomAD_AF gnomAD_AFR_AF gnomAD_AMR_AF gnomAD_ASJ_AF gnomAD_EAS_AF gnomAD_FIN_AF gnomAD_NFE_AF gnomAD_OTH_AF gnomAD_SAS_AF CLIN_SIG CADD_phred CADD_raw CADD_raw_rankscore REVEL_rankscore REVEL_score clinvar_clnsig clinvar_golden_stars
1:45330550-45330550 missense_variant MUTYH NM_001128425.1 2.541e-05 0 0 0 5.945e-05 0 2.818e-05 0 6.821e-05 uncertain_significance 23.7 4.061544 0.54541 0.76110 0.461 - -
1:45331556-45331556 missense_variant,splice_region_variant MUTYH NM_001128425.1 0.002958 0.0007277 0.003068 0.0002038 0 0.002182 0.004831 0.003839 9.747e-05 likely_pathogenic,pathogenic 29.4 6.349794 0.87691 0.99202 0.954 5,5,5,5,5,5,5 2,0,2,2,0,0,0
f2:
chromosome start stop ref alt
12 132668439 132668439 G A
17 7673593 7673593 G C
I managed to do it using this:
awk -v OFS="\t" 'NR==1{h1=$0}NR==FNR{arr[$1":"$2"-"$3] = $0; next}FNR==1{print h1, $0}NR>FNR{if($1 in arr){print arr[$1], $0}}' f2 f1 > res
However, I have a newline in the middle of every line, just after printing h1 or arr[$1], and I don't understand why.
chromosome start stop ref alt
Location Consequence SYMBOL Feature gnomAD_AF gnomAD_AFR_AF gnomAD_AMR_AF gnomAD_ASJ_AF gnomAD_EAS_AF gnomAD_FIN_AF gnomAD_NFE_AF gnomAD_OTH_AF gnomAD_SAS_AF CLIN_SIG CADD_phred CADD_raw CADD_raw_rankscore REVEL_rankscore REVEL_score clinvar_clnsig clinvar_golden_stars
1 45330550 45330550 C T
1:45330550-45330550 missense_variant MUTYH NM_001128425.1 2.541e-05 0 0 0 5.945e-05 0 2.818e-05 0 6.821e-05 uncertain_significance 23.7 4.061544 0.54541 0.76110 0.461 - -
1 45331556 45331556 C T
I have even tried using individual variables to print h1 but the problem still persisted.
Any insights?
I think we are missing a couple of next statements. Hopefully the repaired code below also has formatting that helps clarify and makes the code more understandable:
awk '
BEGIN { OFS = "\t"; h1 = ""; split("", arr) }
{ $1 = $1 }
NR == 1 { h1 = $0; next }
FNR == 1 { print h1, $0; next }
NR == FNR { arr[$1":"$2"-"$3] = $0; next }
($1 in arr) { print arr[$1], $0 }
' f2 f1 > res
If we want FS = OFS = "\t", we can specify this in the BEGIN section and get rid of the { $1 = $1 } that reformats the buffer for tab-delimited output.
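A minimal sketch of that variant, assuming both files really are tab-delimited:
awk '
BEGIN { FS = OFS = "\t"; h1 = ""; split("", arr) }
NR == 1 { h1 = $0; next }
FNR == 1 { print h1, $0; next }
NR == FNR { arr[$1":"$2"-"$3] = $0; next }
($1 in arr) { print arr[$1], $0 }
' f2 f1 > res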

find common elements in >2 files

I have three files as shown below
file1.txt
"aba" 0 0
"aba" 0 0 1
"abc" 0 1
"abd" 1 1
"xxx" 0 0
file2.txt
"xyz" 0 0
"aba" 0 0 0 0
"aba" 0 0 0 1
"xxx" 0 0
"abc" 1 1
file3.txt
"xyx" 0 0
"aba" 0 0
"aba" 0 1 0
"xxx" 0 0 0 1
"abc" 1 1
I want to find the similar elements in all three files based on the first two columns. To find similar elements in two files I have used something like
awk 'FNR==NR{a[$1,$2]++;next}a[$1,$2]' file1.txt file2.txt
But how can we find similar elements in all the files when there are more than 2 input files?
Can anyone help?
With the current awk solution, the output ignores the duplicate key columns and gives the output as
"xxx" 0 0
If we assume the output comes from file1.txt, the expected output is:
"aba" 0 0
"aba" 0 0 1
"xxx" 0 0
i.e. it should include the rows with duplicate key columns as well.
Try the following solution, generalized for N files. It saves the data of the first file in a hash with a value of 1, and for each hit from the following files that value is incremented. At the end I check whether the value of each key is the same as the number of files processed and print only those that match.
awk '
FNR == NR { arr[$1,$2] = 1; next }
{ if ( arr[$1,$2] ) { arr[$1,$2]++ } }
END {
    for ( key in arr ) {
        if ( arr[key] != ARGC - 1 ) { continue }
        split( key, key_arr, SUBSEP )
        printf "%s %s\n", key_arr[1], key_arr[2]
    }
}
' file{1..3}
It yields:
"xxx" 0
"aba" 0
EDIT: added a version that prints the whole line (see comments). I've added another array with the same key where I save the line, and I also use it in the printf function. I've left the old code commented.
awk '
##FNR == NR { arr[$1,$2] = 1; next }
FNR == NR { arr[$1,$2] = 1; line[$1,$2] = $0; next }
{ if ( arr[$1,$2] ) { arr[$1,$2]++ } }
END {
    for ( key in arr ) {
        if ( arr[key] != ARGC - 1 ) { continue }
        ##split( key, key_arr, SUBSEP )
        ##printf "%s %s\n", key_arr[1], key_arr[2]
        printf "%s\n", line[ key ]
    }
}
' file{1..3}
NEW EDIT (see comments): added a version that handles multiple lines with the same key. Basically I join all entries instead of saving only one, changing line[$1,$2] = $0 to line[$1,$2] = line[$1,$2] ( line[$1,$2] ? SUBSEP : "" ) $0. At print time I do the reverse split on the separator (the SUBSEP variable) and print each entry.
awk '
FNR == NR {
    arr[$1,$2] = 1
    line[$1,$2] = line[$1,$2] ( line[$1,$2] ? SUBSEP : "" ) $0
    next
}
FNR == 1 { delete found }
{ if ( arr[$1,$2] && ! found[$1,$2] ) { arr[$1,$2]++; found[$1,$2] = 1 } }
END {
    num_files = ARGC - 1
    for ( key in arr ) {
        if ( arr[key] < num_files ) { continue }
        split( line[ key ], line_arr, SUBSEP )
        for ( i = 1; i <= length( line_arr ); i++ ) {
            printf "%s\n", line_arr[ i ]
        }
    }
}
' file{1..3}
With new data edited in question, it yields:
"xxx" 0 0
"aba" 0 0
"aba" 0 0 1
This python script will list the common lines among all files:
import sys

i, l = 0, []
for files in sys.argv[1:]:
    l.append(set())
    for line in open(files): l[i].add(" ".join(line.split()[0:2]))
    i += 1
commonFields = reduce(lambda s1, s2: s1 & s2, l)  # reduce is a builtin in Python 2; Python 3 needs functools.reduce
for files in sys.argv[1:]:
    print "Common lines in ", files
    for line in open(files):
        for fields in commonFields:
            if fields in line:
                print line,
                break
Usage: python script.py file1 file2 file3 ...
For three files, all you need is:
awk 'FNR==NR { a[$1,$2]; next} ($1,$2) in a' file1.txt file2.txt file3.txt
The FNR==NR block returns true only for the first file in the arguments list. The next statement in this block forces a skip over the remainder of the code. Therefore, ($1,$2) in a is performed for all files in the arguments list excluding the first one. To process more files in the way you have, all you need to do is list them.
If you need more powerful globbing on the command line, use extglob. You can turn it on with shopt -s extglob, and turn it off with shopt -u extglob. For example:
awk 'FNR==NR { a[$1,$2]; next} ($1,$2) in a' file1.txt !(file1.txt)
If you have hard to find files, use find. For example:
awk 'FNR==NR { a[$1,$2]; next} ($1,$2) in a' file1.txt $(find /path/to/files -type f -name "*[23].txt")
I assume you're looking for a brace expansion for 'N' files. For example:
awk 'FNR==NR { a[$1,$2]; next} ($1,$2) in a' file1.txt file{2,3}.txt

How to "do something" for each input text files

Say that I read in the following information stored in three different text files (there can be many more)
File 1
1 2 rt 45
2 3 er 44
File 2
rf r 4 5
3 er 4 t
er t yu 4
File 3
er tyu 3er 3r
der 4r 5e
edr rty tyu 4r
edr 5t yt5 45
When I read in this information I want it printed from these files into separate arrays; for now everything is printed out at the same time.
Right now I have this script printing out all the information at the same time:
{
    TESTd[NR-1] = $2; g++
}
END {
    for (i = 0 ; i <= g-1; i ++ ) {
        print " [\"" TESTd[i] "\"]"
    }
    print " _____"
}
But is there a way to read in multiple files and do this for every text file?
Like instead of getting this output when doing awk -f test.awk 1.txt 2.txt 3.txt
["2"]
["3"]
["r"]
["er"]
["t"]
["tyu"]
["4r"]
["rty"]
["5t"]
_____
I get this output
["2"]
["3"]
_____
["r"]
["er"]
["t"]
_____
["tyu"]
["4r"]
["rty"]
["5t"]
_____
And reading in each file one at a time is preferably not an option here, since I will have something like 30 text files.
EDIT:
I want to do this in awk if possible because I'm going to do something like this
{
    PRINTONCE[NR-1] = $2; g++
    PRINTONEATTIME[NR-1] = $3
}
END {
    # Do this for all arguments once
    for (i = 0 ; i <= g-1; i ++ ) {
        print " [\"" PRINTONCE[i] "\"] \n"
    }
    print " _____"
    # Do this for loop for every .txt file that is read in as an argument
    # for(j=0;j<args.length;j++){
    for (i = 0 ; i <= g-1; i ++ ) {
        print " [\"" PRINTONEATTIME[i] "\"] \n"
    }
    print " _____"
}
From what I understand, you have an awk script that works, and you want to run that awk script on many files with a new line (or _____) in between their outputs so you can distinguish which output came from which file.
Try this bash script:
dir=~/*.txt   # all txt files in ~ (home) directory
for f in $dir
do
    echo "File is $f"
    awk 'BEGIN{print "Hello"}' "$f"   # your awk code will take the $f file as input
    echo "------------------"; echo
done
Also, if you do not want to do this to all files you can write the for loop as for f in 1.txt 2.txt 3.txt.
If you don't want to do it in awk directly, you can call it like this in bash or zsh, for example:
for fic in test*.txt; do awk -f test.awk "$fic"; done
It's quite simple to do it directly in awk:
# define a function to print out the array
function dump(array, n) {
    for (i = 0 ; i <= n-1; i ++ ) {
        print " [\"" array[i] "\"]"
    }
    print " _____"
}

# dump and reset when starting a new file
FNR==1 && NR!=1 {
    dump(TESTd, g)
    delete TESTd
    g = 0
}

# add data to the array
{
    TESTd[FNR-1] = $2; g++
}

# dump at the end
END {
    dump(TESTd, g)
}
N.B. using delete TESTd is a non-standard gawk feature, but the question is tagged as gawk so I assumed it's OK to use it.
Alternatively you could use one or more of ARGIND, ARGV, ARGC or FILENAME to distinguish the different files.
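For instance, a sketch of the FILENAME variant (not from the original answer), reusing the dump() function defined above:

# dump whenever the input file name changes
FILENAME != prevfile {
    if (prevfile != "") {
        dump(TESTd, g)
        delete TESTd
        g = 0
    }
    prevfile = FILENAME
}

# add data to the array
{
    TESTd[FNR-1] = $2; g++
}

# dump the data collected from the last file
END {
    dump(TESTd, g)
}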
Or, as suggested by https://stackoverflow.com/a/10691259/981959, with gawk 4 you can use an ENDFILE block instead of END in your original:
{
    TESTd[FNR-1] = $2; g++
}
ENDFILE {
    for (i = 0 ; i <= g-1; i ++ ) {
        print " [\"" TESTd[i] "\"]"
    }
    print " _____"
    delete TESTd
    g = 0
}
Write a bash shell script or a basic shell script. Try putting the following into test.sh, then call /bin/sh test.sh or /bin/bash test.sh and see which one works:
for f in *.txt
do
    echo "File is $f"
    awk -F '\t' 'blah blah' "$f" >> output.txt
done
Or write a bash shell script that calls your awk script for each file:
for f in *.txt
do
    echo "File is $f"
    /bin/sh yourscript.sh "$f"
done

awk Merge two files based on common field and print similarities and differences

I have two files I would like to merge into a third, but I need to see both when they share a common field and where they differ. Since there are minor differences in other fields, I cannot use a diff tool, and I thought this could be done with awk.
File 1:
aWonderfulMachine 1 mlqsjflk
AnotherWonderfulMachine 2 mlksjf
YetAnother WonderfulMachine 3 sdg
TrashWeWon'tBuy 4 jhfgjh
MoreTrash 5 qsfqf
MiscelleneousStuff 6 qfsdf
MoreMiscelleneousStuff 7 qsfwsf
File2:
aWonderfulMachine 22 dfhdhg
aWonderfulMachine 23 dfhh
aWonderfulMachine 24 qdgfqf
AnotherWonderfulMachine 25 qsfsq
AnotherWonderfulMachine 26 qfwdsf
MoreDifferentStuff 27 qsfsdf
StrangeStuffBought 28 qsfsdf
Desired output:
aWonderfulMachine 1 mlqsjflk aWonderfulMachine 22 dfhdhg
aWonderfulMachine 23 dfhdhg
aWonderfulMachine 24 dfhh
AnotherWonderfulMachine 2 mlksjf AnotherWonderfulMachine 25 qfwdsf
AnotherWonderfulMachine 26 qfwdsf
File1
YetAnother WonderfulMachine 3 sdg
TrashWeWon'tBuy 4 jhfgjh
MoreTrash 5 qsfqf
MiscelleneousStuff 6 qfsdf
MoreMiscelleneousStuff 7 qsfwsf
File2
MoreDifferentStuff 27 qsfsdf
StrangeStuffBought 28 qsfsdf
I have tried a few awk scripts here and there, but they are either based on two fields only, and I don't know how to modify the output, or they delete the duplicates based on two fields only, etc. (I am new to this and awk syntax is tough).
Thank you much in advance for your help.
You can come very close using these three commands:
join <(sort file1) <(sort file2)
join -v 1 <(sort file1) <(sort file2)
join -v 2 <(sort file1) <(sort file2)
This assumes a shell, such as Bash, that supports process substitution (<()). If you're using a shell that doesn't, the files would need to be pre-sorted.
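If your shell lacks process substitution, a quick sketch with pre-sorted temporary copies (the .sorted names are only illustrative):
sort file1 > file1.sorted
sort file2 > file2.sorted
join file1.sorted file2.sorted
join -v 1 file1.sorted file2.sorted
join -v 2 file1.sorted file2.sorted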
To do this in AWK:
#!/usr/bin/awk -f
BEGIN { FS="\t"; flag=1; file1=ARGV[1]; file2=ARGV[2] }
FNR == NR { lines1[$1] = $0; count1[$1]++; next }  # process the first file
{   # process the second file and do output
    lines2[$1] = $0;
    count2[$1]++;
    if ($1 != prev) { flag = 1 };
    if (count1[$1]) {
        if (flag) printf "%s ", lines1[$1];
        else printf "\t\t\t\t\t"
        flag = 0;
        printf "\t%s\n", $0
    }
    prev = $1
}
END {   # output lines that are unique to one file or the other
    print "File 1: " file1
    for (i in lines1) if (! (i in lines2)) print lines1[i]
    print "File 2: " file2
    for (i in lines2) if (! (i in lines1)) print lines2[i]
}
To run it:
$ ./script.awk file1 file2
The lines won't be output in the same order that they appear in the input files. The second input file (file2) needs to be sorted since the script assumes that similar lines are adjacent. You will probably want to adjust the tabs or other spacing in the script. I haven't done much in that regard.
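For example, a sketch of pre-sorting the second file on its key field before running the script (the .sorted name is only illustrative, and this assumes the key field contains no embedded blanks):
sort -k1,1 file2 > file2.sorted
./script.awk file1 file2.sorted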
Another way to do it in awk:
BEGIN {
    FS="\t";
    readfile(ARGV[1], s1);
    readfile(ARGV[2], s2);
    ARGV[1] = ARGV[2] = "/dev/null"
}
END {
    for (k in s1) {
        if ( s2[k] ) printpair(k,s1,s2);
    }
    print "file1:"
    for (k in s1) {
        if ( !s2[k] ) print s1[k];
    }
    print "file2:"
    for (k in s2) {
        if ( !s1[k] ) print s2[k];
    }
}
function readfile(fname, sary) {
    while ( (getline < fname) > 0 ) {
        key = $1;
        if (sary[key]) {
            sary[key] = sary[key] "\n" $0;
        } else {
            sary[key] = $0;
        };
    }
    close(fname);
}
function printpair(key, s1, s2) {
    n1 = split(s1[key],l1,"\n");
    n2 = split(s2[key],l2,"\n");
    for (i=1; i<=max(n1,n2); i++){
        if (i==1) {
            b = l1[1];
            gsub("."," ",b);
        }
        if (i<=n1) { f1 = l1[i] } else { f1 = b };
        if (i<=n2) { f2 = l2[i] } else { f2 = b };
        printf("%s\t%s\n",f1,f2);
    }
}
function max(x,y){ z = x; if (y>x) z = y; return z; }
Not particularly elegant, but it handles many-to-many cases.