I am working on a variant calling format (vcf) file, and I tried to show you guys what I am trying to do:
Input:
1 877803 838425 GC G
1 878077 966631 C CCACGG
Output:
1 877803 838425 C -
1 878077 966631 - CACGG
In summary, I am trying to delete the first letters of longer strings.
And here is my code:
# Attempt from the question: skips lines containing #, splits columns 4 and 5
# into single characters, and walks the characters of the longer column that
# lie past the length of the shorter one, printing a dash for the shorter one.
# NOTE(review): delete y makes awk treat y as an array, so the later scalar
# assignments y = sprintf(...) raise the array-in-scalar-context error quoted
# after this script.
# NOTE(review): the second loop initializes j but tests and increments i, and
# both loops overwrite y on every pass instead of appending to it.
awk 'BEGIN { OFS="\t" } /#/ {next}
{
m = split($4, a, //)
n = split($5, b, //)
x = "-"
delete y
if (m>n){
for (i = n+1; i <= m; i++) {
y = sprintf("%s", a[i])
}
print $1, $2, $3, y, x
}
else if (n>m){
for (j = m+1; i <= n; i++) {
y = sprintf("%s", b[j]) ## Problem here
}
print $1, $2, $3, x, y
}
}' input.vcf > output.vcf
But,
I am getting the following error in line 15, not even in line 9
awk: cmd. line:15: (FILENAME=input.vcf FNR=1) fatal: attempt to use array y in a scalar context
I don't know how to concatenate array elements into a one string using awk.
I will be very happy if you guys help me.
Merry X-Mas!
You may try this awk:
# Drop the first character of fields 4 and 5; a field that is exactly one
# character long becomes a single dash. Because both fields are assigned on
# every record, each output line is rebuilt with tab-separated fields.
awk -v OFS="\t" 'function trim(s) { return (length(s) == 1 ? "-" : substr(s, 2)); } {$4 = trim($4); $5 = trim($5)} 1' file
1 877803 838425 C -
1 878077 966631 - CACGG
More readable form:
awk -v OFS="\t" '
# Strip the leading character of s; a one-character value collapses to "-".
function trim(s) {
    if (length(s) == 1)
        return "-"
    return substr(s, 2)
}

# Rewrite columns 4 and 5 of every record, then emit the rebuilt record.
{
    $4 = trim($4)
    $5 = trim($5)
    print
}' file
You can use awk's substr function to process the 4th and 5th space delimited fields:
awk '{
    # Everything after the first character of field 4; "-" when nothing is left.
    rest = substr($4, 2)
    $4 = (rest == "" ? "-" : rest)

    # Same treatment for field 5.
    rest = substr($5, 2)
    $5 = (rest == "" ? "-" : rest)

    print
}' file
If the string from position 2 onwards in field 4 is equal to "", set field 4 to "-" otherwise, set field 4 to the extract of the field from position 2 to the end of the field. Do the same with field 5. Print lines modified or not with short hand 1.
In the awk below I am trying to extract and compare each substring in $4 that starts with p.. If the first three letters are the same as the last three (there is a number in between), then that p. is updated to p.(3 letters)(digits)(=) --- the () are only there to show that there are 3 entries and are not needed. If the three letters are different, then that line is unchanged. In the file below, line 1 is an example. In my actual data there are about 10,000 rows with about 50 columns, but $4 is the only one that will have these values in it, that is, the p. values. The format of the p. will always be three letters followed by a 1-4 digit number followed by 3 more letters. The awk attempt below will, I think, extract each p. and split on the ;, but I am not sure how to compare to check whether the three letters are the same. Thank you :).
file tab-delimited
Chr Start ExonicFunc.refGene AAChange.refGene
chr1 155880573 synonymous SNV RIT1:NM_001256821:exon2:c.31G>C:p.Glu110Glu;RIT1:NM_001256822:exon2:c.31G>C:p.Glu110Glu
chr1 155880573 nonsynonymous SNV RIT1:NM_001256821:exon2:c.31G>C:p.Glu11Gln
desired output tab-delimited
Chr Start ExonicFunc.refGene AAChange.refGene
chr1 155880573 synonymous SNV RIT1:NM_001256821:exon2:c.31G>C:p.Glu110=;RIT1:NM_001256822:exon2:c.31G>C:p.Glu110=
chr1 155880573 nonsynonymous SNV RIT1:NM_001256821:exon2:c.31G>C:p.Glu11Gln
awk
# Unfinished draft from the question: for each ;-separated entry of column 4
# it locates the p. part, appends everything up to and including p. to
# ostring, and keeps the remainder in VAL together with its length.
# NOTE(review): the final line (aa=[...]) is not valid awk syntax, so this
# draft does not run as written.
awk '
BEGIN { OFS="\t" }
$4 ~ /:NM/ {
ostring=""
# split $4 by ";" and cycle through them
nNM=split($4,NM,";")
for (n=1; n<=nNM; n++) {
if (n>1) ostring=(ostring ";") # append ";"
if (match(NM[n],/p[.].*/)) {
# copy up to "p."
ostring=(ostring substr(NM[n],1,RSTART+1))
# Get the substring after "p."
VAL=substr(NM[n],RSTART+2)
# Get its length
lenVAL=length(VAL)
# store aa array
aa=[{while(length($4)=3){print substr($044,1,3);gsub(/^./,"")}]}' file
Extended GNU awk solution:
# GNU awk only: split() is called with a fourth argument, which stores the
# matched separators in seps.
# Column 4 is split on every p.<3 letters><digits><3 letters> token. Each token
# whose two three-letter codes are equal has its second code replaced by =,
# and the column is then rebuilt from the pieces and the (possibly edited)
# tokens. The header line (NR==1) and rows without such a token are printed
# unchanged. Input and output are tab-separated.
awk 'NR==1; NR > 1{
len = split($4, a, /\<p\.[a-zA-Z]{3}[0-9]+[a-zA-Z]{3}\>/, seps);
if (len == 1){ print; next }
res = ""
for (i=1; i < len; i++) {
s = seps[i];
if (substr(s, 3, 3) == substr(s, length(s) - 2)) {
seps[i] = substr(s, 1, length(s) - 3)"=";
}
}
for (i=1; i <= len; i++)
res = res a[i] (seps[i]? seps[i]:"");
$4 = res; print
}' FS='\t' OFS='\t' file
The output:
Chr Start ExonicFunc.refGene AAChange.refGene
chr1 155880573 synonymous SNV RIT1:NM_001256821:exon2:c.31G>C:p.Glu110=;RIT1:NM_001256822:exon2:c.31G>C:p.Glu110=
chr1 155880573 nonsynonymous SNV RIT1:NM_001256821:exon2:c.31G>C:p.Glu11Gln
Time performance measurement:
Input testfile:
$ wc -l testfile
10000 testfile
# Timing run of the split()-based solution on the 10000-line testfile; the
# output is discarded so that only processing time is measured.
time(awk 'NR==1; NR > 1{
len = split($4, a, /\<p\.[a-zA-Z]{3}[0-9]+[a-zA-Z]{3}\>/, seps);
if (len == 1){ print; next }
res = ""
for (i=1; i < len; i++) {
s = seps[i];
if (substr(s, 3, 3) == substr(s, length(s) - 2)) {
seps[i] = substr(s, 1, length(s) - 3)"=";
}
}
for (i=1; i <= len; i++)
res = res a[i] (seps[i]? seps[i]:"");
$4 = res; print
}' FS='\t' OFS='\t' testfile >/dev/null)
real 0m0.269s
user 0m0.256s
sys 0m0.000s
# Timing run of the match()-based solution on the same testfile; the output is
# discarded so that only processing time is measured.
time(awk 'BEGIN { FS=OFS="\t" }
NR>1 {
head = ""
tail = $4
while ( match(tail,/(p\.([[:alpha:]]{3})[0-9]+)([[:alpha:]]{3})/,a) ) {
head = head substr(tail,1,RSTART-1) a[1] (a[2] == a[3] ? "=" : a[3])
tail = substr(tail,RSTART+RLENGTH)
}
$4 = head tail
}
{ print }' testfile >/dev/null)
real 0m0.470s
user 0m0.416s
sys 0m0.008s
With GNU awk for the 3rd arg to match():
$ cat tst.awk
# Rewrites every p.<3 letters><digits><3 letters> token in column 4 as
# p.<3 letters><digits>= when both three-letter codes are identical; all other
# text is copied through unchanged. The first line (header) is printed as is.
# Requires GNU awk: match() is called with a third (array) argument.
BEGIN { FS=OFS="\t" }
NR>1 {
# head: part of column 4 already processed; tail: part still to be scanned
head = ""
tail = $4
# a[1] = "p." + first code + digits, a[2] = first code, a[3] = second code
while ( match(tail,/(p\.([[:alpha:]]{3})[0-9]+)([[:alpha:]]{3})/,a) ) {
head = head substr(tail,1,RSTART-1) a[1] (a[2] == a[3] ? "=" : a[3])
# continue scanning right after the token that was just handled
tail = substr(tail,RSTART+RLENGTH)
}
$4 = head tail
}
{ print }
$ gawk -f tst.awk file
Chr Start ExonicFunc.refGene AAChange.refGene
chr1 155880573 synonymous SNV RIT1:NM_001256821:exon2:c.31G>C:p.Glu110=;RIT1:NM_001256822:exon2:c.31G>C:p.Glu110=
chr1 155880573 nonsynonymous SNV RIT1:NM_001256821:exon2:c.31G>C:p.Glu11Gln
I have data which looks like this
1 3
1 2
1 9
5 4
4 6
5 6
5 8
5 9
4 2
I would like the output to be
1 3,2,9
5 4,6,8,9
4 6,2
This is just sample data but my original one has lots more values.
So this worked
So this basically creates a hash table, using the first column as a key and the second column of the line as the value:
awk '
# Group the values found in columns 2..NF under the key in column 1.
# The ", " separator is inserted only between values, so the printed list no
# longer ends with a dangling ", ".
{
    # Register the key even when the line carries no values.
    if (!($1 in table))
        table[$1] = ""
    for (i = 2; i <= NF; i++)
        table[$1] = table[$1] (table[$1] == "" ? "" : ", ") $i
}

# One "key => values" line per key; the traversal order of for (key in table)
# is unspecified in awk, so keys are not guaranteed to appear in input order.
END {
    for (key in table)
        print key " => " table[key]
}' trial.txt
OUTPUT
4 => 6, 2
5 => 4, 6, 8, 9
1 => 3, 2, 9
I'd write
awk -v OFS=, '
    # Blanking the key forces awk to rebuild the record with OFS, which leaves
    # ",v1,v2,..." in $0; that text is appended to the list kept for the key.
    {
        k = $1
        $1 = ""
        values[k] = values[k] $0
    }

    # Print each key followed by its list, minus the leading comma.
    END {
        for (k in values) {
            joined = values[k]
            sub(/^,/, "", joined)
            print k " " joined
        }
    }
' file
If you want only the unique values for each key (requires GNU awk for multi-dimensional arrays)
gawk -v OFS=, '
    # values[key][value] is a GNU awk array of arrays; storing each value as a
    # subscript keeps only one copy of it per key.
    {
        for (col = 2; col <= NF; col++)
            values[$1][$col] = col
    }

    # Emit "key v1,v2,..." for every key that has at least one value.
    END {
        for (k in values) {
            list = ""
            glue = ""
            for (v in values[k]) {
                list = list glue v
                glue = ","
            }
            print k " " list
        }
    }
' file
or perl
# perl -n wraps the program in a per-line loop, -a splits each line into @F,
# and -l handles line endings. The "} END {" line closes that implicit loop
# early and opens an END block, whose closing brace perl supplies itself.
perl -lane '
    # First field is the key; the remaining fields are its values. Using the
    # values as hash keys keeps each one only once per key.
    $key = shift @F;
    $values{$key}{$_} = 1 for @F;
  } END {
    # $, is the separator print puts between its arguments.
    $, = " ";
    print $_, join(",", keys %{$values{$_}}) for keys %values;
' file
if not concerned with the order of the keys, I think this is the idiomatic awk solution.
# For each line, append the second column to the comma-separated list kept in
# a under the first column (no leading comma for the first value of a key).
# The END rule prints every key with its list, in whatever order for (k in a)
# yields, and column -t aligns the two columns.
$ awk '{a[$1]=($1 in a?a[$1]",":"") $2}
END{for(k in a) print k,a[k]}' file |
column -t
4 6,2
5 4,6,8,9
1 3,2,9
I have a big ascii-file that looks like this:
12,3,0.12,965.814
11,3,0.22,4313.2
14,3,0.42,7586.22
17,4,0,0
11,4,0,0
15,4,0,0
13,4,0,0
17,4,0,0
11,4,0,0
18,3,0.12,2764.86
12,3,0.22,2058.3
11,3,0.42,2929.62
10,4,0,0
10,4,0,0
14,4,0,0
12,4,0,0
19,3,0.12,1920.64
20,3,0.22,1721.51
12,3,0.42,1841.55
11,4,0,0
15,4,0,0
19,4,0,0
11,4,0,0
13,4,0,0
17,3,0.12,2738.99
12,3,0.22,1719.3
18,3,0.42,3757.72
.
.
.
I want to calculate a selective moving average over three values with awk. The selection should be done by the second and the third column.
A moving average should be calculated only for lines where the second column is 3.
Then I would like to calculate three moving averages, selected by the third column (which contains the same values in the same order for every "block").
The moving average shall then be calculated over the fourth column.
For each moving-average window I would like to output the whole line of the second (middle) value, with its fourth column replaced by the result.
I know that sounds very complicated, so I will give an example what I want to calculate and also the desired result:
(965.814+2764.86+1920.64)/3 = 1883.77
and output the result together with line 10:
18,3,0.12,1883.77
Then continue with the second, eleventh and eighteenth line...
The end result for my data example shall look like this:
18,3,0.12,1883.77
12,3,0.22,2697.67
11,3,0.42,4119.13
19,3,0.12,2474.83
20,3,0.22,1833.04
12,3,0.42,2842.96
I tried to calculate the moving-average with the following code in awk but think I designed the script wrong because awk tells me syntax error for every "$2 == 3".
# First attempt from the question; it does not parse.
# NOTE(review): pattern-action rules such as $2 == 3 { ... } are only valid at
# the top level of an awk program. The two that are nested inside the first
# action are presumably what awk rejects with a syntax error.
# NOTE(review): the statement a; b; c; d; e; f = 0 assigns 0 only to f; the
# other five names are merely evaluated.
BEGIN { FS="," ; OFS = "," }
$2 == 3 {
a; b; c; d; e; f = 0
line1 = $0; a = $3; b = $4; getline
line2 = $0; c = $3; d = $4; getline
line3 = $0; e = $3; f = $4
$2 == 3 {
line11 = $0; a = $3; b += $4; getline
line22 = $0; c = $3; d += $4; getline
line33 = $0; e = $3; f += $4
$2 == 3 {
line111 = $0; a = $3; b += $4; getline
line222 = $0; c = $3; d += $4; getline
line333 = $0; e = $3; f += $4
}
}
$0 = line11; $3 = a; $4 = b/3; print
$0 = line22; $3 = c; $4 = d/3; print
$0 = line33; $3 = e; $4 = f/3
}
{print}
Can you help me understand how to correct my script (I think I have shortcomings with the philosophy of awk), or how to start a completely new script, because there may be an easier solution out there ;-)
I also tried another idea:
# Second attempt from the question; it does not parse.
# NOTE(review): the do { ... } while (...) loop sits outside any action block,
# and a pattern-action rule ($2 == 3 { ... }) is nested inside its body. awk
# accepts neither, which presumably accounts for the two reported syntax errors.
BEGIN { FS="," ; OFS = "," }
i=0;
do {
i++;
a; b; c; d; e; f = 0
$2 == 3 {
line1 = $0; a = $3; b += $4; getline
line2 = $0; c = $3; d += $4; getline
line3 = $0; e = $3; f += $4
}while(i<3)
$0 = line1; $3 = a; $4 = b/3; print
$0 = line2; $3 = c; $4 = d/3; print
$0 = line3; $3 = e; $4 = f/3
}
{print}
This one also does not work; awk gives me two syntax errors (one at the "do" and the other after the "$2 == 3").
I changed and tried a lot in both scripts and at some point they ran without errors but they did not give the desired output at all, so I thought there has to be a general problem.
I hope you can help me, that would be really nice!
Normalize your input
If you normalize your input using the right tools, then the task of finding a solution is far easier
My idea is to use awk to select the records where $2==3 and then use sort to group the data on the numerical value of the third column
% echo '12,3,0.12,965.814
11,3,0.22,4313.2
14,3,0.42,7586.22
17,4,0,0
11,4,0,0
15,4,0,0
13,4,0,0
17,4,0,0
11,4,0,0
18,3,0.12,2764.86
12,3,0.22,2058.3
11,3,0.42,2929.62
10,4,0,0
10,4,0,0
14,4,0,0
12,4,0,0
19,3,0.12,1920.64
20,3,0.22,1721.51
12,3,0.42,1841.55
11,4,0,0
15,4,0,0
19,4,0,0
11,4,0,0
13,4,0,0
17,3,0.12,2738.99
12,3,0.22,1719.3
18,3,0.42,3757.72' | \
awk -F, '$2==3' | \
sort --field-separator=, --key=3,3 --numeric-sort --stable
12,3,0.12,965.814
18,3,0.12,2764.86
19,3,0.12,1920.64
17,3,0.12,2738.99
11,3,0.22,4313.2
12,3,0.22,2058.3
20,3,0.22,1721.51
12,3,0.22,1719.3
14,3,0.42,7586.22
11,3,0.42,2929.62
12,3,0.42,1841.55
18,3,0.42,3757.72
%
Reason on normalized input
As you can see, the situation is now much clearer and we can try to design an algorithm to output a 3-elements running mean.
# Keep only rows whose second column is 3, group them by the third column with
# a stable numeric sort, then slide a three-row window over each group:
#   s[1..3] holds the last three fourth-column values (overwritten cyclically),
#   c counts the rows seen so far in the current group,
#   old/new hold the previous and the current input line.
# From the third row of a group onward, the previous row is printed with the
# mean of the window appended as an extra field.
% awk -F, '$2==3' YOUR_FILE | \
sort --field-separator=, --key=3,3 --numeric-sort --stable | \
awk -F, '
$3!=prev {prev=$3
c=0
s[1]=0;s[2]=0;s[3]=0}
{old=new
new=$0
c = c+1; i = (c-1)%3+1; s[i] = $4
if(c>2)print old FS (s[1]+s[2]+s[3])/3}'
18,3,0.12,2764.86,1883.77
19,3,0.12,1920.64,2474.83
12,3,0.22,2058.3,2697.67
20,3,0.22,1721.51,1833.04
11,3,0.42,2929.62,4119.13
12,3,0.42,1841.55,2842.96
Oops,
I forgot your requirement on SUBSTITUTING $4 with the running mean, I will come out with a solution unless you're faster than me...
Edit: change the line
{old=new
to
{split(new,old,",")
and change the line
if(c>2)print old FS (s[1]+s[2]+s[3])/3}'
to
if(c>2) print old[1] FS old[2] FS old[3] FS (s[1]+s[2]+s[3])/3}'