I need to retrieve all rows from one file whose columns match the corresponding columns in another file.
My first file is:
col1,col2,col3
1TF4,WP_110462952.1,AEV67733.1
1TF4,EGD45884.1,AEV67733.1
2BTO,NP_006073.2,XP_037953971.1
2BTO,XP_037953971.1,XP_037953971.1
The second one is:
col1,col2,col3,col4,col5
BAA13425.1,SDD02770.1,38.176,296,175
BAA13425.1,WP_002465021.1,32.056,287,185
BBE42932.1,AEG17356.1,40.909,110,64
BBE42932.1,WP_048124638.1,40.367,109,64
I want to retrieve all rows from the second file, where its file2_col1=file1_col3 and file2_col2=file1_col1
I tried like this but it doesn't print everything
awk -F"," 'FILENAME=="file1"{A[$3$2]=$3$2}
FILENAME=="file2"{if(A[$1$2]){print $0}}' file1 file2 > test
I want to retrieve all rows from the second file, where its file2_col1=file1_col3 and file2_col2=file1_col1
You may use this 2 pass awk solution:
awk -F, 'FNR == NR {seen[$3,$1]; next} FNR == 1 || ($1,$2) in seen' file1 file2
col1,col2,col3,col4,col5
BAA13425.1,2BTO,32.056,287,185
BAA13425.1,2BTO,12.410,641,123
Where input files are:
cat file1
col1,col2,col3
1TF4,WP_110462952.1,AEV67733.1
1TF4,EGD45884.1,BAA13425.1
2BTO,NP_006073.2,BAA13425.1
2BTO,XP_037953971.1,BAA13425.1
cat file2
col1,col2,col3,col4,col5
BAA13425.1,SDD02770.1,38.176,296,175
BAA13425.1,2BTO,32.056,287,185
BBE42932.1,AEG17356.1,40.909,110,64
BBE42932.1,WP_048124638.1,40.367,109,64
BAA13425.1,2BTO,12.410,641,123
The purpose is to check whether the values of columns 2 and 3 in file1 (concatenated) match column 1 in file2. If they match, replace the values of columns 2 and 3 in file2 with the values of columns 4 and 5 from file1.
file1
100,31431,37131,999991.70,2334362.30
100,31431,37471,111113.20,2334363.30
100,31433,36769,777775.60,2334361.90
102,31433,36853,333322.00,2334362.80
file2
3143137113 318512.50 2334387.50 100
3143137131 318737.50 2334387.50 100
3143137201 319612.50 2334387.50 100
3143137219 319837.50 2334387.50 100
3143137471 322987.50 2334387.50 100
3143137491 323237.50 2334387.50 100
3143336687 313187.50 2334412.50 100
3143336723 313637.50 2334412.50 100
3143336769 314212.50 2334412.50 100
3143336825 314912.50 2334412.50 100
3143336853 315262.50 2334412.50 102
Output desired
31431,37113,318512.50,2334387.50,100
31431,37131,999991.70,2334362.30,100
31431,37201,319612.50,2334387.50,100
31431,37219,319837.50,2334387.50,100
31431,37471,111113.20,2334363.30,100
31431,37491,323237.50,2334387.50,100
31433,36687,313187.50,2334412.50,100
31433,36723,313637.50,2334412.50,100
31433,36769,777775.60,2334361.90,100
31433,36825,314912.50,2334412.50,100
31433,36853,333322.00,2334362.80,102
I tried
awk -F[, ] 'FNR==NR{a[$1 $2]=$0;next}$1 in a{print $0 ,a[$1 $2]}' file1 file2
Thanks in advance
Could you please try the following.
awk '
BEGIN{
OFS=","
}
FNR==NR{
a[$2 $3]=$2 OFS $3
b[$2 $3]=$4;c[$2 $3]=$5
next
}
($1 in a){
$2=b[$1]
$3=c[$1];$1=a[$1]
print
next
}
{
$1=$1
sub(/^...../,"&,",$1)
print
}
' FS="," file1 FS=" " file2
Output will be as follows.
31431,37113,318512.50,2334387.50,100
31431,37131,999991.70,2334362.30,100
31431,37201,319612.50,2334387.50,100
31431,37219,319837.50,2334387.50,100
31431,37471,111113.20,2334363.30,100
31431,37491,323237.50,2334387.50,100
31433,36687,313187.50,2334412.50,100
31433,36723,313637.50,2334412.50,100
31433,36769,777775.60,2334361.90,100
31433,36825,314912.50,2334412.50,100
31433,36853,333322.00,2334362.80,102
Try this:
$ awk -F, 'NR==FNR{tmp=$0;sub($1 FS,"",tmp);a[$2 $3]=tmp;next} $1 in a{print a[$1],$NF;next} {$1=substr($1,1,5) OFS substr($1,6,5);} 1' OFS=, file1 FS=' ' file2
31431,37113,318512.50,2334387.50,100
31431,37131,999991.70,2334362.30,100
31431,37201,319612.50,2334387.50,100
31431,37219,319837.50,2334387.50,100
31431,37471,111113.20,2334363.30,100
31431,37491,323237.50,2334387.50,100
31433,36687,313187.50,2334412.50,100
31433,36723,313637.50,2334412.50,100
31433,36769,777775.60,2334361.90,100
31433,36825,314912.50,2334412.50,100
31433,36853,333322.00,2334362.80,102
The above assumes that $1 of file1 does not contain regex metacharacters, so to be accurate and safe, it is better to use this:
awk -F, 'NR==FNR{$1="";a[$2 $3]=substr($0,2);next} $1 in a{print a[$1],$NF;next} {$1=substr($1,1,5) OFS substr($1,6,5);} 1' OFS=, file1 FS=' ' file2
However this one assumes the FS of file1 is 1 character only.
And that leads to another change/efficiency improvement:
awk -F, 'NR==FNR{a[$2 $3]=substr($0,length($1 FS)+1);next} $1 in a{print a[$1],$NF;next} {$1=substr($1,1,5) OFS substr($1,6,5);} 1' OFS=, file1 FS=' ' file2
$ cat file1 #It contains ID:Name
5:John
4:Michel
$ cat file2 #It contains ID
5
4
3
I want to Replace the IDs in file2 with Names from file1, output required
John
Michel
NO MATCH FOUND
I need to extend the code below so that it prints the text NO MATCH FOUND when there is no match.
awk -F":" 'NR==FNR {a[$1]=$2;next} {print a[$1]}' file1 file2
My current result:
John
Michel
<< empty line
Thanks,
You can use a ternary operator for this: print ($1 in a)?a[$1]:"NO MATCH FOUND". That is, if $1 is an index of the array, print the corresponding value; otherwise, print the text "NO MATCH FOUND".
All together:
$ awk -F":" 'NR==FNR {a[$1]=$2;next} {print ($1 in a)?a[$1]:"NO MATCH FOUND"}' f1 f2
John
Michel
NO MATCH FOUND
You can test whether the index occurs in the array:
$ awk -F":" 'NR==FNR {a[$1]=$2;next} $1 in a {print a[$1]; next} {print "NOT FOUND"}' file1 file2
John
Michel
NOT FOUND
If file2 contains only digits (no trailing space):
awk -F ':' '$1 in A {print A[$1];next}{if($2~/^$/) print "NOT FOUND";else A[$1]=$2}' file1 file2
If not (the lines may have trailing blanks):
awk -F '[:[:blank:]]' '$1 in A {print A[$1];next}{if($2~/^$/) print "NOT FOUND";else A[$1]=$2}' file1 file2