I have a tab-delimited file like this
chr20 102 K245 A T 56.0 AC.02 AC=0.1;DC=45;AC_old=452;DP=21;sample=kj;sample_name=DKl;New_sample=rdf
chr10 8742 JH245 G T 86.0 AC.742 AC=2.1;DC=75;AC_old=42;DP=1;sample=KHS;sample_name=WEKl;New_sample=ASEf
chrX 2302 XS245 G A 786.0 AC.452 AC=8;DC=5;AC_old=4A2;DP=5;sample=SED;sample_name=MHNSKl;New_sample=rdf
And I need to extract only the AC, DC and sample entries, like this
chr20 102 K245 A T 56.0 AC.02 AC=0.1 DC=45 sample=kj
chr10 8742 JH245 G T 86.0 AC.742 AC=2.1 DC=75 sample=KHS
chrX 2302 XS245 G A 786.0 AC.452 AC=8 DC=5 sample=SED
I have tried grep like this, but it did not serve the purpose
grep -wF "AC|DC|sample" < file.txt
Could you please try the following; it is written and tested in GNU awk with your shown samples only.
awk '
match($0,/AC\.[0-9]+/){
  # Keep the line up to and including the matched AC.<digits> token.
  # The match ends at offset RSTART+RLENGTH-1; using RSTART+RLENGTH as the
  # substr length would also swallow the separator after the token and
  # produce a doubled space in the output.
  val1=substr($0,1,RSTART+RLENGTH-1)
  value=""
  # The last field is ;-separated; keep only the AC=, DC= and sample= entries.
  num=split($NF,arr,";")
  for(i=1;i<=num;i++){
    if(arr[i]~/^(AC=|DC=|sample=)/){
      value=(value?value OFS:"")arr[i]
    }
  }
  print val1,value
}
' Input_file
Explanation: Adding detailed explanation for above.
awk ' ##Starting awk program from here.
match($0,/AC\.[0-9]+/){ ##Process only lines that contain an AC.<digits> token.
val1=value="" ##Nullifying val1 and value here.
val1=substr($0,1,RSTART+RLENGTH-1) ##val1 holds the line up to the END of the match; the match ends at RSTART+RLENGTH-1, so using RSTART+RLENGTH would grab one extra character.
num=split($NF,arr,";") ##Splitting the ;-separated last field into arr.
for(i=1;i<=num;i++){ ##Going through all entries of the last field.
if(arr[i]~/^(AC=|DC=|sample=)/){ ##Keeping only the AC=, DC= and sample= entries.
value=(value?value OFS:"")arr[i] ##Appending the entry to value, OFS-separated.
}
}
print val1,value ##Printing the kept prefix and the selected entries.
}
' Input_file ##Mentioning Input_file name here.
You can use
awk -F\; '$1 ~ /AC|DC|sample/{print $1 OFS $2 OFS $5}' file  # splitting on ";": AC= ends field 1, DC= is field 2, sample= is field 5 — relies on this exact column layout
Here,
-F\; sets the field separator to ;
$1 ~ /AC|DC|sample/ only takes lines having AC, DC or sample in Field 1
{print $1 OFS $2 OFS $5} - prints Field 1, 2 and 5 with spaces as separators.
See the online demo:
# Sample input held in a shell variable for the demo (data kept verbatim):
s='chr20 102 K245 A T 56.0 AC.02 AC=0.1;DC=45;AC_old=452;DP=21;sample=kj;sample_name=DKl;New_sample=rdf
chr10 8742 JH245 G T 86.0 AC.742 AC=2.1;DC=75;AC_old=42;DP=1;sample=KHS;sample_name=WEKl;New_sample=ASEf
chrX 2302 XS245 G A 786.0 AC.452 AC=8;DC=5;AC_old=4A2;DP=5;sample=SED;sample_name=MHNSKl;New_sample=rdf'
# Split on ";" and print the prefix+AC part ($1), the DC entry ($2) and the sample entry ($5):
awk -F\; '$1 ~ /AC|DC|sample/{print $1 OFS $2 OFS $5}' <<< "$s"
Output:
chr20 102 K245 A T 56.0 AC.02 AC=0.1 DC=45 sample=kj
chr10 8742 JH245 G T 86.0 AC.742 AC=2.1 DC=75 sample=KHS
chrX 2302 XS245 G A 786.0 AC.452 AC=8 DC=5 sample=SED
You may use this awk:
awk -F '[\t;]+' -v OFS='\t' '{s=""; for (i=1; i<=6; ++i) s = (i == 1 ? "" : s OFS) $i; for (i=6; i<=NF; ++i) if ($i ~ /^([AD]C|sample)[=.]/) s = s OFS $i; print s}' file  # split on tabs and ";": copy the six fixed columns, then keep only the AC.<n>, AC=, DC= and sample= entries
chr20 102 K245 A T 56.0 AC.02 AC=0.1 DC=45 sample=kj
chr10 8742 JH245 G T 86.0 AC.742 AC=2.1 DC=75 sample=KHS
chrX 2302 XS245 G A 786.0 AC.452 AC=8 DC=5 sample=SED
A more readable version:
awk -F '[\t;]+' -v OFS='\t' '
{
    # Start the output record with the six fixed columns.
    out = $1
    for (f = 2; f <= 6; ++f)
        out = out OFS $f
    # Append every AC.<n>, AC=, DC= and sample= entry from the rest of the line.
    for (f = 6; f <= NF; ++f)
        if ($f ~ /^([AD]C|sample)[=.]/)
            out = out OFS $f
    print out
}' file
Related
I have two files and I am trying to compare them on the basis of their columns
File_1
CALL_3 CALL_1
CALL_2 CALL_5
CALL_3 CALL_2
CALL_1 CALL_4
File_2
CALL_1 GAP:A GAP:G
CALL_3 GAP:C GAP:Q GAP:R
CALL_5 GAP:R GAP:A
CALL_4 GAP:C GAP:D GAP:A GAP:W
CALL_2 GAP:C GAP:R GAP:A
I want to print only those interactions from file_1 that have at least one GAP id in common between the two files.
Expected output
CALL_2 CALL_5 GAP:A GAP:R
CALL_3 CALL_2 GAP:C GAP:R
CALL_1 CALL_4 GAP:A
I tried the following :
awk 'NR==FNR {
a[$1]=($1 OFS $2 OFS $3 OFS $4 OFS $5 OFS $6 OFS $7 OFS $8 OFS $9)
next
}
($1 in a)&&($2 in a) {
print a[$1],a[$2]
}' File_2 File_1
It works well for a fixed number of columns. But the number of columns is not fixed in file_2 (more than 1000 columns). How can I get the expected output?
Could you please try the following.
awk '
FNR==NR{
  # First file: remember each CALL id'\''s GAP list (first column stripped,
  # whitespace normalised by the $0/$1 reassignments).
  id=$1
  $1=""
  $0=$0
  $1=$1
  gapline[id]=$0
  next
}
{
  # Second file: intersect the GAP sets of the two CALL ids on this line.
  hits=""
  n1=split(gapline[$1],list1," ")
  for(j=1;j<=n1;j++){
    set1[list1[j]]
  }
  n2=split(gapline[$2],list2," ")
  for(j=1;j<=n2;j++){
    set2[list2[j]]
  }
  for(g in set1){
    if(g in set2){
      hits=(hits?hits OFS:"")g
    }
  }
  if(hits){
    print $0,hits
  }
  # Discard per-line state before the next record.
  delete list1
  delete list2
  delete set1
  delete set2
}
' Input_file2 Input_file1
Output will be as follows.
CALL_2 CALL_5 GAP:A GAP:R
CALL_3 CALL_2 GAP:C GAP:R
CALL_1 CALL_4 GAP:A
Explanation: Adding detailed explanation for above code.
awk ' ##Start of the awk program.
FNR==NR{ ##TRUE only while the first file given (Input_file2) is being read.
val=$1 ##Save the CALL id from the first column.
$1="" ##Blank out the first field so only the GAP list remains.
$0=$0 ##Re-assign the record to itself so it is re-split after the field change.
$1=$1 ##Rebuild the record so the leading separator left by the blanked field is removed.
a[val]=$0 ##Store the GAP list in array a, keyed by the CALL id.
next ##Skip the remaining rules for lines of the first file.
}
{
val="" ##Reset val for this line of the second file.
num1=split(a[$1],array1," ") ##Split the GAP list of the first CALL id into array1.
for(i=1;i<=num1;i++){ ##Walk every GAP of the first CALL id.
array3[array1[i]] ##Record it as a key of set array3.
}
num2=split(a[$2],array2," ") ##Split the GAP list of the second CALL id into array2.
for(i=1;i<=num2;i++){ ##Walk every GAP of the second CALL id.
array4[array2[i]] ##Record it as a key of set array4.
}
for(k in array3){ ##For every GAP of the first id...
if(k in array4){ ##...that the second id also has...
val=(val?val OFS:"")k ##...append it to val, OFS-separated.
}
}
if(val){ ##If at least one common GAP was found...
print $0,val ##...print the current line followed by the common GAPs.
}
val="" ##Clear val before the next line.
delete array1 ##Delete the per-line arrays so state
delete array2 ##does not leak into the next record:
delete array3 ##array3 is the set of the first id,
delete array4 ##array4 the set of the second id.
}
' Input_file2 Input_file1 ##Read Input_file2 first, then Input_file1.
With GNU awk for arrays of arrays:
$ cat tst.awk
# First file: collect each CALL id's GAPs as a subarray (GNU awk arrays of arrays).
NR==FNR {
    for (f = 2; f <= NF; f++)
        gaps[$1][$f]
    next
}
# Second file: print the line plus every GAP the two ids share.
{
    shared = ""
    for (g in gaps[$1])
        if (g in gaps[$2])
            shared = shared OFS g
    if (shared != "")
        print $0 shared
}
$ awk -f tst.awk file2 file1
CALL_2 CALL_5 GAP:A GAP:R
CALL_3 CALL_2 GAP:C GAP:R
CALL_1 CALL_4 GAP:A
With any awk:
$ cat tst.awk
# First file: strip the leading CALL id, remember the remainder (the GAP list).
NR==FNR {
    id = $1
    sub(/[^[:space:]]+[[:space:]]+/,"")
    gaplist[id] = $0
    next
}
# Second file: build one membership set per CALL id and print the overlap.
{
    mkSet(gaplist[$1],setA)
    mkSet(gaplist[$2],setB)
    shared = ""
    for (g in setA)
        if (g in setB)
            shared = shared OFS g
    if (shared != "")
        print $0 shared
}
# Turn a whitespace-separated string into a set (array with the words as keys).
function mkSet(str,arr,   i,tmp) {
    delete arr
    split(str,tmp)
    for (i in tmp)
        arr[tmp[i]]
}
$ awk -f tst.awk file2 file1
CALL_2 CALL_5 GAP:A GAP:R
CALL_3 CALL_2 GAP:C GAP:R
CALL_1 CALL_4 GAP:A
I did it in bash with coreutils. A oneliner:
join -12 -21 <(join -11 -21 <(sort file_1) <(sort file_2) | sort -k2) <(sort file_2) | xargs -l1 bash -c 'a=$(<<<"${*:3}" tr " " "\n" | sort | uniq -d | tr "\n" " "); if [ -n "$a" ]; then printf "%s %s %s\n" "$1" "$2" "$a"; fi' --  # "${*:3}" = all GAP columns of the joined line; the original "${#:3}" substrings $# (the argument count) and is always empty
Or a bit more lines:
join -12 -21 <(
  join -11 -21 <(sort file_1) <(sort file_2) | sort -k2
) <(
  sort file_2
) |
xargs -l1 bash -c '
  # "${*:3}" expands to every GAP column of the two joined records.
  # (The original "${#:3}" is a substring of $#, the argument count,
  # and therefore always empty — nothing would ever be printed.)
  a=$(<<<"${*:3}" tr " " "\n" | sort | uniq -d | tr "\n" " ");
  if [ -n "$a" ]; then
    printf "%s %s %s\n" "$1" "$2" "$a"
  fi
' --
Join file_1 with file_2 on the first fields.
Join the result from point 1 on field 2 with file_2 again
Then for each line:
Get only the duplicates of the GAP* parts
If there are any duplicates print the CALL_* with the duplicates
Results in:
CALL_2 CALL_3 GAP:C GAP:R
CALL_4 CALL_1 GAP:A
CALL_5 CALL_2 GAP:A GAP:R
With awk this is straightforward:
$ awk '(NR==FNR){$1=$1;a[$1]=$0;next}
{str=strt=$1 OFS $2}
{split(a[$1],b,OFS)}
# append the common GAP itself (b[i]), not the whole buffered line a[$2]:
{for(i in b) if(index(a[$2] OFS, OFS b[i] OFS)) str=str OFS b[i]}
(str!=strt){print str}' file2 file1
How does this work:
(NR==FNR){$1=$1;a[$1]=$0;next}
The first line buffers file2 in an associative array a[key]=value where key is the first element and value the full line. Eg.
a["CALL_1"]="CALL_1 GAP:A GAP:G"
Remark, that we substituted all FS into OFS using $1=$1.
{str=strt=$1 OFS $2}
This just stores CALL_1 CALL_2 in the variable str
{split(a[$1],b,OFS)}: split the buffered line into array b
{for(i in b) if(index(a[$2] OFS, OFS b[i] OFS)) str=str OFS b[i]}
For all entries in array b, check if the string OFS b[i] OFS is found in the string a[$2] OFS. We add the extra OFS to ensure field matches. We do test for values like OFS CALL_2 OFS, but this will never match. This is a tiny overhead, but fixing this would create much more overhead.
A more optimised version would read:
$ awk '(NR==FNR){k=$1;$1="";a[k]=$0;c[k]=NF-1;next}
{str=strt=$1 OFS $2}
(c[$1]< c[$2]) {split(substr(a[$1],2),b,OFS);s=a[$2] OFS}
(c[$1]>=c[$2]) {split(substr(a[$2],2),b,OFS);s=a[$1] OFS}
# a[k] must hold the remaining record ($0), and we append the common GAP b[i]:
{for(i in b) if(index(s, OFS b[i] OFS)) str=str OFS b[i]}
(str!=strt){print str}' file2 file1
I have tried several awk and sed commands and GNU datamash to change the format and code the missing fields as "??" of this data file with no success. I have a file with a format that looks like the following:
ind_1 SNP_1 AA
ind_1 SNP_2 AB
ind_1 SNP_3 AA
ind_2 SNP_1 AA
ind_2 SNP_2 AA
ind_3 SNP_1 AB
ind_3 SNP_2 AA
ind_3 SNP_3 AB
ind_3 SNP_4 AA
desired format:
SNP_1 SNP_2 SNP_3 SNP_4
ind_1 AA AB AA ??
ind_2 AA AA ?? ??
ind_3 AB AA AB AA
i first tried using GNU datamash
datamash --no-strict transpose < input1.txt
then i tried this awk:
awk '
!b[$1 FS $2]++{
a[++i]=$1 FS $2
}
{
c[$1 FS $2]=c[$1 FS $2]?c[$1 FS $2] FS $4:$4
}
END{
for(k=1;k<=i;k++){
print a[k],c[a[k]]
}}
' Input1_txt
awk to the rescue!
with true multidimensional arrays it would be easier, but this works for most awks
awk -v OFS='\t' '{vals[$1]; cols[$2]; a[$1,$2]=$3}   # index rows by $1, columns by $2, cell = $3
END {for(j in cols) printf "%s", OFS j;              # header row: one entry per SNP
print "";
for(i in vals)
{printf "%s", i;
for(j in cols) printf "%s", OFS (((i,j) in a)?a[i,j]:"??");   # "??" marks a missing genotype
print ""}}' file
# NOTE: "for (x in y)" traversal order is unspecified; with gawk you can set
# PROCINFO["sorted_in"]="@ind_str_asc" for sorted rows/columns.
I have this file:
$ head -n 4 badRegionFromHWE.merged
seqnames start end width strand
chr1 144118070 145868461 1750392 *
chr7 100049516 101110026 1060511 *
chr7 141508887 142999071 1490185 *
$
I want to not print out the header line and print column 1,2,3 separated by tabs. So I wrote this:
awk 'OFS="\t";NR>1{print$1,$2,$3}' badRegionFromHWE.merged | head
seqnames start end width strand
chr1 144118070 145868461 1750392 *
chr1 144118070 145868461
chr7 100049516 101110026 1060511 *
chr7 100049516 101110026
chr7 141508887 142999071 1490185 *
chr7 141508887 142999071
It doesn't do what I wanted it to do!
The assignment OFS="\t" evaluates to true (non-zero, non-empty) on every line, so it prints every line. You should enclose the expression in a BEGIN block:
awk 'BEGIN { OFS = "\t" } NR > 1 { print $1, $2, $3 }' badRegionFromHWE.merged
I am trying to output a tab-delimited result that uses the data from a tab-delimited file to combine and subtract specific lines.
If $4 matches in each line then the first matching sequential $6 value is added to $2, unless the value is 1, then the original $2 is used (like in the case of line 1). This is the new or adjusted $2 value.
The last matching sequential $6 value is added to $2 and this is the new or adjusted $3 value.
The new $2 and $3 values are combined with $1 in the format $1:$2-$3 and the $5 value is printed on the line.
The awk command below works great as long as the $4 values are unique, but that is not always the case. I cannot seem to add a condition
that checks whether the $6 values are sequential (1 2 is sequential, but then there is a break before 92 93 94); when there is a break, a new output line should be started.
Maybe there is another way, but hopefully this helps. Thank you :)
file
chrX 110956442 110956535 chrX:110956442-110956535 ALG13 1 19
chrX 110956442 110956535 chrX:110956442-110956535 ALG13 2 19
chrX 110956442 110956535 chrX:110956442-110956535 ALG13 92 18
chrX 110956442 110956535 chrX:110956442-110956535 ALG13 93 18
chrX 110956442 110956535 chrX:110956442-110956535 ALG13 94 18
chrX 110961329 110961512 chrX:110961329-110961512 ALG13 2 1
chrX 110961329 110961512 chrX:110961329-110961512 ALG13 3 1
chrX 25031028 25031925 chrX:25031028-25031925 ARX 651 3
desired output
chrX:110956442-110956444 ALG13
chrX:110956534-110956536 ALG13
chrX:110961331-110961332 ALG13
chrx:25031679-25031679 ARX
awk
awk 'FNR==NR {S[$4]++;next} ($4 in S){if(S[$4]>1){print $1 OFS $2 OFS $2+S[$4] OFS $5;}
else {if($6==1){print $1 OFS $2 OFS $2 OFS $5}
else {print $1 OFS $2+$6 OFS $2+$6 OFS $5}};delete S[$4]}' file file
current output
chrX 110956442 110956449 ALG13
chrX 110961329 110961334 ALG13
chrX 25031028 25031031 ARX
This does most of what you want:
# Emit one interval line: "chrX:<start+first>-<start+last><TAB><key>".
# NOTE(review): the chromosome is hard-coded as "chrX" rather than taken from $1,
# so records on other chromosomes would still print "chrX:" — confirm intent.
function myprint(start, first, last, key) {
print "chrX:" (start + first) "-" (start + last) "\t" key;
}
# First record: open the first run with this line's start/offset values.
NR == 1 {
last_start = $2;
key = $5;
first_stop = $6;
last_stop = $6;
next;
}
# Every later record: extend the current run or flush it and open a new one.
{
if ($2 == last_start) {
# Same region: a gap in the $6 sequence closes the current run.
if ($6 != (last_stop + 1)) {
myprint(last_start, first_stop, last_stop, key);
first_stop = $6;
}
} else {
# New region: flush the previous run, then start a fresh one.
myprint(last_start, first_stop, last_stop, key);
last_start = $2;
# A first offset of 1 means "use the original start" (offset 0).
first_stop = ($6 == 1) ? 0 : $6;
}
key = $5;
last_stop = $6;
}
# Flush the final pending run at end of input.
END {
myprint(last_start, first_stop, last_stop, key);
}
However, it took me quite some time to understand your requirements, and I still do not understand why the second line of your desired output is chrX:110956534-110956536 ALG13, since 94 - 92 == 2.
I am trying to use awk to do the below steps
find matching fields $1 strings between file1 and file2
if the $1 strings match then $2 in file1 is divided by $3 in file2 (that is x, rounded up to 3 significant figures)
x is multiplied by 100
each x is subtracted from 100 and that is the %
file1
USH2A 21
GIT1 357
PALB2 3
file2
GIT1 21 3096
USH2A 71 17718
PALB2 13 3954
awk
awk 'NR==FNR{a[$1]=$1;next;}{if ($1 in a) print $1, $2/a[$3];else print;}' file2 file1 > test
awk: cmd. line:1: (FILENAME=search FNR=2) fatal: division by zero attempted
awk 'NR==FNR{a[$1]=$1;next;}{if ($1 in a) print $1, $2/a[$3];else print;}' file1 file2 > test
awk: cmd. line:1: (FILENAME=search FNR=1) fatal: division by zero attempted
example
USH2A match is found so (21/17718)*100 = 0.11 and 100-0.11 = 99.99%
GIT1 match is found so (357/3096)*100 = 11.53 and 100-11.53 = 88.47%
PALB2 match is found so (3/3954) *100 = 0.07 and 100-0.07 = 99.93%
I am going line by line in the code and can see that I am already getting errors. Thank you :).
awk to the rescue!
$ awk 'function ceil(x) {return x==int(x) ? x : int(x+1)}
NR==FNR{numerator[$1]=$2; next}
($1 in numerator){print $1, ceil(10000*(1-numerator[$1]/$3))/100 "%"}' file1 file2
GIT1 88.47%
USH2A 99.89%
PALB2 99.93%
Note that there is no round-up function in awk, so I defined a ceil function for this task.
$ cat tst.awk
# First file: remember the denominator ($3) for each gene.
NR==FNR { denom[$1] = $3; next }
# Second file: for known genes, compute and report the percentage.
$1 in denom {
    pct = (denom[$1] ? ($2*100)/denom[$1] : 0)
    printf "%s match is found so (%d/%d) *100 = %.2f and 100-%.2f = %.2f%%\n", $1, $2, denom[$1], pct, pct, 100-pct
}
$ awk -f tst.awk file2 file1
USH2A match is found so (21/17718) *100 = 0.12 and 100-0.12 = 99.88%
GIT1 match is found so (357/3096) *100 = 11.53 and 100-11.53 = 88.47%
PALB2 match is found so (3/3954) *100 = 0.08 and 100-0.08 = 99.92%