Count how many repeated times each record appears and select minimum and maximum of specific column - awk

1.- First, I would like to count how many times each record appears (the key is substr($0,20,18)), always keep the last line of each repeated record, and print the count in the last column of the output file.
2.- Find the minimum and maximum values of column 7 and print them in columns 4 and 5 of the output file.
Input file
M G 36829.00 37145.00 1 2161 36840.00 37146.00 37576
M G 36829.00 37145.00 217 4321 36852.00 37146.00 37576
M G 36829.00 37145.00 433 6481 36864.00 37146.00 37576
M G 36829.00 37145.00 649 8641 36876.00 37146.00 37576
M G 36829.00 37145.00 865 10801 36888.00 37146.00 37576
M G 36833.00 38033.00 1 4321 36840.00 37602.00 38464
M G 36833.00 38033.00 433 8641 36852.00 37602.00 38464
M G 36833.00 38033.00 865 12961 36864.00 37602.00 38464
M G 36833.00 38033.00 1297 17281 36876.00 37602.00 38464
M G 36833.00 38033.00 1729 21601 36888.00 37602.00 38464
M G 37265.00 38105.00 1 4321 36840.00 37674.00 38536
M G 37265.00 38105.00 433 8641 36852.00 37674.00 38536
M G 37265.00 38105.00 865 12961 36864.00 37674.00 38536
M G 37265.00 38105.00 1297 17281 36876.00 37674.00 38536
M G 37265.00 38105.00 1729 21601 36888.00 37674.00 38536
M G 37265.00 38105.00 2161 25921 36900.00 37674.00 38536
M G 37271.00 38885.00 1 2211 36840.00 38454.00 38894
M G 37271.00 38885.00 222 4421 36852.00 38454.00 38894
M G 37271.00 38885.00 443 6631 36864.00 38454.00 38894
M G 37271.00 38885.00 664 8841 36876.00 38454.00 38894
Desired Output file
36829.00 37145.00 10801 36840.00 36888.00 37146.00 37576 5
36833.00 38033.00 21601 36840.00 36888.00 37602.00 38464 5
37265.00 38105.00 25921 36840.00 36900.00 37674.00 38536 6
37271.00 38885.00 8841 36840.00 36876.00 38454.00 38894 4
I tried.
To count how many times each record appears.
awk '
# Tally the lines per key; the key is the 18 characters starting at column 20.
{ seen[substr($0, 20, 18)] += 1 }
# Report each key followed by its tally (for-in order is unspecified).
END {
    for (key in seen)
        print key, seen[key]
}' file
To find the minimum and maximum in column 7.
awk '{\
l = substr($7,1,5);\
printf ("%5d \n",l);\
}' file |
awk ' {D1=substr($1, 1, 5)
D2=substr($1, 1, 5)+0
}
!(D1 in MIN) {MIN[D1]=D2
MAX[D1]=D2
next
}
D2 < MIN[D1] {MIN[D1]=D2}
D2 > MAX[D1] {MAX[D1]=D2}
END {for (m in MIN) print m, MIN[m], MAX[m]}
Thanks in advance.

It sounds like this is what you're trying to do:
$ cat tst.awk
# Summarise each run of consecutive records that share the key ($3,$4).
# The input is expected to be sorted by key and then by $7 (see the sort
# command below), so the first $7 of a run is its minimum and the last
# $7 of a run is its maximum.
{ currKey = $3 FS $4 }
# Key changed: flush the previous group and start counting a new one.
currKey != prevKey { prt(); min=$7; cnt=0 }
# Remember the latest record of the group; its $7 is the running maximum.
{ prevRec=$0; prevKey=currKey; max=$7; cnt++ }
END { prt() }

# Print the summary line of the group that just ended (f is a local array).
function prt(    f) {
    if ( cnt ) {
        split(prevRec,f)
        # Output columns 6 and 7 are input fields 8 and 9.
        print f[3], f[4], f[6], min, max, f[8], f[9], cnt
    }
}
$ sort -k3,4n -k7n file | awk -f tst.awk | column -t
36829.00 37145.00 10801 36840.00 36888.00 37146.00 37576 5
36833.00 38033.00 21601 36840.00 36888.00 37602.00 38464 5
37265.00 38105.00 25921 36840.00 36900.00 37674.00 38536 6
37271.00 38885.00 8841 36840.00 36876.00 38454.00 38894 4

This does not keep the order of the input file, but it works even if the file is not sorted by key first.
awk '
# Collect per-key statistics; the key is field 9.
# COUNT holds the number of records, C1/C2/C3/C6 hold fields 3, 4, 6 and 8
# of the last record seen for the key, MIN/MAX hold the extremes of field 7.
{
    key = $9
    COUNT[key]++
    C1[key] = $3
    C2[key] = $4
    C3[key] = $6
    C6[key] = $8
    # Compare numerically (+0) but store the original field text, so the
    # two decimals of field 7 are kept in the output.
    if (!(key in MIN) || $7+0 < MIN[key]+0) MIN[key] = $7
    if (!(key in MAX) || $7+0 > MAX[key]+0) MAX[key] = $7
}
END {
    # for-in order is unspecified, so the groups come out in arbitrary order.
    for (id in COUNT)
        print C1[id], C2[id], C3[id], MIN[id], MAX[id], C6[id], id, COUNT[id]
}' file
Output :
37271.00 38885.00 8841 36840.00 36876.00 38454.00 38894 4
36833.00 38033.00 21601 36840.00 36888.00 37602.00 38464 5
36829.00 37145.00 10801 36840.00 36888.00 37146.00 37576 5
37265.00 38105.00 25921 36840.00 36900.00 37674.00 38536 6

Could you please try following.
awk '
# Group consecutive lines by the 18 characters starting at column 20.
# For each group print: fields 3-4, field 6 of the last line, the minimum
# and maximum of field 7, fields 8-9 and the number of lines in the group.
{
  val=substr($0,20,18)
  # Drop the first two fields; after the sub() the record is re-split, so
  # the original fields 3..9 become $1..$7 in the rules below.
  $1=$2=""
  sub(/^[[:space:]]+/,"")
}
prev!=val && prev{
  print first,second,min,max,third,count
  # Start the next group from scratch. Without clearing min and max the
  # extremes of earlier groups leak into later ones.
  count=0
  min=max=""
}
{
  # $5 is the original field 7; compare numerically, keep the field text.
  if(min=="" || $5+0<min+0) min=$5
  if(max=="" || $5+0>max+0) max=$5
  prev=val
  count++
  first=$1 OFS $2
  second=$4
  third=$(NF-1) OFS $NF
}
END{
  # Flush the last group.
  if(prev){
    print first,second,min,max,third,count
  }
}
' Input_file | column -t

Related

Counting max and min per row across columns and outputting associated column names

I'm trying to count both max and min (except 0s) per row across columns and outputting associated column names.
I'm trying this:
# Per data row, report the column name(s) and value of the maximum and of
# the minimum across columns 3-6. Output fields are tab-separated.
BEGIN{OFS="\t"}
# Header row: print the output header and remember the names of columns 3-6.
NR==1{print $1,$2,"ref","max","ref","min";
for(i=3;i<=6;++i)BASES[i]=$(i);
}
# Data rows: seed both extremes (and their column names) from column 3.
NR>1{l=1;basemax=BASES[3];basemin=BASES[3]; max=$3; min=$3;
# A strictly larger value replaces the maximum; an equal value appends its name.
for(i=4;i<=6;++i){
if($i>max){basemax=BASES[i];max=$i;}
else if($i==max){basemax=basemax","BASES[i];++l}
}
# NOTE(review): the first branch assigns basemmin and mim, while the print
# below uses basemin and min, so a smaller non-zero value never replaces
# the minimum seeded from column 3.
for(i=4;i<=6;++i){
if($i<min && $i !=0){basemmin=BASES[i];mim=$i}
else if($i==min){basemin=basemin","BASES[i];++l}
}
# l is incremented on ties but is not read anywhere in this block.
print $1,$2,basemax,max,basemin,min
}
In a input that looks like this
chr pos C T A G
NC_044998.1 3732 22 0 7 0
NC_044998.1 3733 22 0 0 0
NC_044998.1 3734 22 3 3 0
NC_044998.1 3735 22 0 0 3
NC_044998.1 3736 0 7 22 3
NC_044998.1 3737 0 0 0 25
NC_044998.1 3738 22 7 0 0
NC_044998.1 3739 7 3 22 25
NC_044998.1 3740 0 22 22 0
NC_044998.1 3741 22 0 0 0
The desired output is
chr pos ref max ref min
NC_044998.1 3732 C 22 A 7
NC_044998.1 3733 C 22 C 22
NC_044998.1 3734 C 22 T,A 3
NC_044998.1 3735 C 22 G 3
NC_044998.1 3736 A 22 G 3
NC_044998.1 3737 G 25 G 25
NC_044998.1 3738 C 22 C 22
NC_044998.1 3739 G 25 C 7
NC_044998.1 3740 T,A 22 T,A 22
NC_044998.1 3741 C 22 C 22
But it outputs this instead
chr pos ref max ref min
NC_044998.1 3732 C 22 C 22
NC_044998.1 3733 C 22 C 22
NC_044998.1 3734 C 22 C 22
NC_044998.1 3735 C 22 C 22
NC_044998.1 3736 A 22 C 0
NC_044998.1 3737 G 25 C,T,A 0
NC_044998.1 3738 C 22 C 22
NC_044998.1 3739 G 25 C 7
NC_044998.1 3740 T 22 C,A,G 0
NC_044998.1 3741 C 22 C 22
With your shown samples, please try following awk code. Written and tested in GNU awk.
awk -v startField="3" -v endField="6" '
# startField..endField (set with -v) delimit the columns that are scanned.
BEGIN{ OFS="\t"; print "chr pos ref max ref min"}
# Header line: remember the column names of the scanned fields, then skip it.
FNR==1{
for(i=startField;i<=endField;i++){
heading[i]=$i
}
next
}
{
# Reset every per-record accumulator.
min=max2=maxInd2=minInd=max=maxInd=minAllInd=maxAllInd=maxAllInd2=""
# Pass 1: min is tracked over non-zero fields only, max over all scanned
# fields. minInd/maxInd hold the field numbers of the current extreme,
# comma-joined when several fields tie.
for(i=startField;i<=endField;i++){
if($i!=0){
minInd=(min>$i?i:(min==$i?minInd","i:(minInd!=""?minInd:i)))
min=(min>$i?$i:(min!=""?min:$i))
}
maxInd=(max<$i?i:(max==$i?maxInd","i:(maxInd!=""?maxInd:i)))
max=(max<$i?$i:(max!=""?max:$i))
}
# Pass 2: the same maximum bookkeeping, starting one field later.
for(i=startField+1;i<=endField;i++){
maxInd2=(max2<$i?i:(max2==$i?maxInd2","i:(maxInd2!=""?maxInd2:i)))
max2=(max2<$i?$i:(max2!=""?max2:$i))
}
# Split the comma-joined field numbers so they can be mapped to header names.
num1=split(maxInd,arr1,",")
num2=split(minInd,arr2,",")
num3=split(maxInd2,arr3,",")
# Build the comma-joined header names for the maximum.
if(num1>1){
for(k=1;k<=num1;k++){
maxAllInd = (maxAllInd?maxAllInd ",":"") heading[arr1[k]]
}
}
else{
maxAllInd = heading[maxInd]
}
# Same for the non-zero minimum.
if(num2>1){
for(k=1;k<=num2;k++){
minAllInd = (minAllInd?minAllInd ",":"") heading[arr2[k]]
}
}
else{
minAllInd = heading[minInd]
}
# Same for the maximum found by pass 2.
if(num3>1){
for(k=1;k<=num3;k++){
maxAllInd2 = (maxAllInd2?maxAllInd2 ",":"") heading[arr3[k]]
}
}
else{
maxAllInd2 = heading[maxInd2]
}
if(startField>1){
# Truncate the record to the fields before startField, so that $0 holds
# only the leading columns when it is printed.
NF=(startField-1)
if(min !=0 ){
print $0,maxAllInd,max,minAllInd,min
}
if(min == 0 && max2 != 0){
print $0,maxAllInd,max,maxAllInd2,max2
}
if(min == 0 && max2 == 0){
print $0,maxAllInd,max,maxAllInd,max
}
}
# startField is 1 or less: there are no leading columns to print.
else{
if(min !=0 ){
print maxAllInd,max,minAllInd,min
}
if(min == 0 && max2 != 0){
print maxAllInd,max,maxAllInd2,max2
}
if(min == 0 && max2 == 0){
print maxAllInd,max,maxAllInd,max
}
}
}
' Input_file
This awk script should work for you:
cat maxmin.awk
# First line: emit the output header and remember the column names from
# field b (passed with -v b=N) up to the last field.
NR == 1 {
    print $1, $2, "ref", "max", "ref", "min"
    for (col = b; col <= NF; col++)
        hdr[col] = $col
    next
}

# Data lines: find the largest value and the smallest non-zero value in
# fields b..NF, then list the names of all columns holding each of them.
{
    for (col = b; col <= NF; col++) {
        if ($col > hi)
            hi = $col
        if ($col && (lo == "" || $col < lo))
            lo = $col
    }
    for (col = b; col <= NF; col++) {
        if ($col == lo)
            loNames = (loNames ? loNames "," : "") hdr[col]
        if ($col == hi)
            hiNames = (hiNames ? hiNames "," : "") hdr[col]
    }
    print $1, $2, hiNames, hi, loNames, lo
    # Clear the accumulators for the next line.
    hi = lo = hiNames = loNames = ""
}
And use it as:
awk -v b=3 -f maxmin.awk gg | column -t
chr pos ref max ref min
NC_044998.1 3732 C 22 A 7
NC_044998.1 3733 C 22 C 22
NC_044998.1 3734 C 22 T,A 3
NC_044998.1 3735 C 22 G 3
NC_044998.1 3736 A 22 G 3
NC_044998.1 3737 G 25 G 25
NC_044998.1 3738 C 22 T 7
NC_044998.1 3739 G 25 T 3
NC_044998.1 3740 T,A 22 T,A 22
NC_044998.1 3741 C 22 C 22
column -t has been used for tabular output only.
You have some typos in variable names such as basemmin and mim.
If the count of C is 0, the min value has no chance to be updated.
You can combine the two for loops into one.
The variable l is not used.
Then would you please try the following:
awk -v OFS="\t" '
# Header line: remember the names of columns 3-6 and print the new header.
NR == 1 {
    for (col = 3; col <= 6; col++)
        bases[col] = $col
    print $1, $2, "ref", "max", "ref", "min"
    next
}
# Data lines: seed both extremes from column 3, then scan columns 4-6.
{
    hi = $3
    lo = $3
    hiRef = bases[3]
    loRef = bases[3]
    for (col = 4; col <= 6; col++) {
        # Maximum: a larger value replaces it, an equal value adds its name.
        if ($col > hi) {
            hiRef = bases[col]
            hi = $col
        } else if ($col == hi) {
            hiRef = hiRef "," bases[col]
        }
        # Minimum: a zero minimum is always replaced; otherwise only a
        # smaller non-zero value replaces it, and an equal value adds its name.
        if (lo == 0 || ($col != 0 && $col < lo)) {
            loRef = bases[col]
            lo = $col
        } else if ($col == lo) {
            loRef = loRef "," bases[col]
        }
    }
    print $1, $2, hiRef, hi, loRef, lo
}' input_file
Output:
chr pos ref max ref min
NC_044998.1 3732 C 22 A 7
NC_044998.1 3733 C 22 C 22
NC_044998.1 3734 C 22 T,A 3
NC_044998.1 3735 C 22 G 3
NC_044998.1 3736 A 22 G 3
NC_044998.1 3737 G 25 G 25
NC_044998.1 3738 C 22 T 7
NC_044998.1 3739 G 25 T 3
NC_044998.1 3740 T,A 22 T,A 22
NC_044998.1 3741 C 22 C 22
Please note the output slightly differs from your desired output, which may contain typos.

Sum columns in pandas based on the names of the columns

I have a dataframe with the population by age in several cities:
City Age_25 Age_26 Age_27 Age_28 Age_29 Age_30
New York 11312 3646 4242 4344 4242 6464
London 6446 2534 3343 63475 34433 34434
Paris 5242 34343 6667 132 323 3434
Hong Kong 354 979 878 6776 7676 898
Buenos Aires 4244 7687 78 8676 786 9798
I want to create a new dataframe with the sum of the columns based on ranges of three years. That is, people from 25 to 27 and people from 28 to 30. Like this:
City Age_25_27 Age_28_30
New York 19200 15050
London 12323 132342
Paris 46252 3889
Hong Kong 2211 15350
Buenos Aires 12009 19260
In this example I used ranges of three years, but in my real database the ranges have to be five years wide and there are 100 ages.
How could I do that? I've seen some related answers, but none of them works well in my case.
Try this:
age_columns = df.filter(like='Age_').columns
n = age_columns.str.split('_').str[-1].astype(int)
df['Age_25-27'] = df[age_columns[(n >= 25) & (n <= 27)]].sum(axis=1)
df['Age_28-30'] = df[age_columns[(n >= 28) & (n <= 30)]].sum(axis=1)
Output:
>>> df
City Age_25 Age_26 Age_27 Age_28 Age_29 Age_30 Age_25-27 Age_28-30
New York 11312 3646 4242 4344 4242 6464.0 19200 15050.0
London 6446 2534 3343 63475 34433 34434 NaN 69352 68867.0
Paris 5242 34343 6667 132 323 3434 NaN 41142 3757.0
Hong Kong 354 979 878 6776 7676 898.0 2211 15350.0
Buenos Aires 4244 7687 78 8676 786 9798.0 12009 19260.0
You can use groupby:
In [1]: import pandas as pd
...: import numpy as np
In [2]: d = {
...: 'City': ['New York', 'London', 'Paris', 'Hong Kong', 'Buenos Aires'],
...: 'Age_25': [11312, 6446, 5242, 354, 4244],
...: 'Age_26': [3646, 2534, 34343, 979, 7687],
...: 'Age_27': [4242, 3343, 6667, 878, 78],
...: 'Age_28': [4344, 63475, 132, 6776, 8676],
...: 'Age_29': [4242, 34433, 323, 7676, 786],
...: 'Age_30': [6464, 34434, 3434, 898, 9798]
...: }
...:
...: df = pd.DataFrame(data=d)
...: df = df.set_index('City')
...: df
Out[2]:
Age_25 Age_26 Age_27 Age_28 Age_29 Age_30
City
New York 11312 3646 4242 4344 4242 6464
London 6446 2534 3343 63475 34433 34434
Paris 5242 34343 6667 132 323 3434
Hong Kong 354 979 878 6776 7676 898
Buenos Aires 4244 7687 78 8676 786 9798
In [3]: n_cols = 3 # change to 5 for actual dataset
...: sum_every_n_cols_df = df.groupby((np.arange(len(df.columns)) // n_cols) + 1, axis=1).sum()
...: sum_every_n_cols_df
Out[3]:
1 2
City
New York 19200 15050
London 12323 132342
Paris 46252 3889
Hong Kong 2211 15350
Buenos Aires 12009 19260
You can extract the columns of the dataframe and put them in a list. Use
col_list = df.columns
But ultimately, I think what you'd want to do is more of a while loop with your inputs (band of 5 and up to 100 ages) as static values that you iterate over.
# Add one column per age band to df: age_<first>_to_<last> holds the row-wise
# sum of the Age_<n> columns whose age falls inside the band.
band = 5
start = 20
max_age = 120
i = start
while i < max_age:
    age_start = i
    # Exclusive upper bound of this band, capped at max_age.
    age_end = min(i + band, max_age)
    col_name = 'age_' + str(age_start) + '_to_' + str(age_end - 1)
    # The data frame in the question names its columns Age_<n>; only the
    # ages that are actually present are summed.
    sum_cols = [
        'Age_' + str(age)
        for age in range(age_start, age_end)
        if 'Age_' + str(age) in df.columns
    ]
    if sum_cols:
        df[col_name] = df[sum_cols].sum(axis=1)
    i += band

Match valueS in two files and replace in selected columns

If the values of columns 1, 2 and 5 in file1 match columns 1, 2 and 9 in file2,
then replace the values of columns 1 and 2 in file2 with the values of columns 3 and 4 of file1.
In the output file, add the character R to lines that were replaced and O to lines that were not. Also append columns 1 and 2 from file1 for matched records.
file1
37267.00 37181.00 37267.00 37181.00 2605
37269.00 37181.00 37267.00 37184.00 2605
37271.00 37181.00 37271.00 37181.00 2603
36829.00 37185.00 36820.00 37184.00 2605
36831.00 37187.00 36831.00 37185.00 2605
36833.00 37189.00 36833.00 37189.00 2605
36835.00 37191.00 36831.00 37194.00 2606
file2
37267.00 37181.00 8424 36840.00 37260.00 37146.00 37612.00 36 2605
37269.00 37181.00 8424 36840.00 37260.00 37146.00 37612.00 36 2605
37271.00 37181.00 8424 36840.00 37260.00 37146.00 37612.00 36 2603
36829.00 37185.00 8640 36840.00 37260.00 37146.00 37624.00 36 2605
36831.00 37187.00 8640 36840.00 37260.00 37146.00 37624.00 36 2605
36833.00 37189.00 8640 36840.00 37260.00 37146.00 37624.00 36 2605
36835.00 37191.00 8640 36840.00 37260.00 37146.00 37624.00 36 2606
output desired
37267.00 37181.00 8424 36840.00 37260.00 37146.00 37612.00 36 2605 O 37267.00 37181.00
37267.00 37184.00 8424 36840.00 37260.00 37146.00 37612.00 36 2605 R 37269.00 37181.00
37271.00 37181.00 8424 36840.00 37260.00 37146.00 37612.00 36 2603 O 37271.00 37181.00
36820.00 37184.00 8640 36840.00 37260.00 37146.00 37624.00 36 2605 R 36829.00 37185.00
36831.00 37185.00 8640 36840.00 37260.00 37146.00 37624.00 36 2605 R 36831.00 37187.00
36833.00 37189.00 8640 36840.00 37260.00 37146.00 37624.00 36 2605 O 36833.00 37189.00
36831.00 37194.00 8640 36840.00 37260.00 37146.00 37624.00 36 2606 R 36835.00 37191.00
I tried
awk '
# First file: index by the concatenation of $1 $2 $5 (no separator between
# the parts) and by the concatenation of $3 $4.
FNR==NR{
a[$1 $2 $5]=$3 $4
b[$3 $4]=$3
c[$3 $4]=$4
next
}
# NOTE(review): this tests $1 alone against keys that were built from three
# concatenated fields; confirm that is what was intended.
($1 in a){
# $1 is overwritten by the first assignment, so the two lookups after it
# use the new value of $1, not the original one.
$1=b[$1]
$2=c[$1]
$1=a[$1]
found=1
}
{
# Append ,R or ,O to the record, then insert a comma after its first five
# characters; $1=$1 rebuilds the record with OFS.
$0=found==1?$0",R":$0",O"
sub(/^...../,"&,")
$1=$1
found=""
}
# Pattern 1 prints every record.
1
' FS=" " file1 FS=" " OFS=" " file2
Thanks in advance
EDIT: Since OP has changed Input_file's sample data so adding this solution now.
awk '
# First file: remember the replacement values (columns 3 and 4) under the
# key made of columns 1, 2 and 5.
FNR==NR{
  a[$1,$2,$5]=$3
  b[$1,$2,$5]=$4
  next
}
# Second file: look the line up by its columns 1, 2 and 9.
{
  val=$1 SUBSEP $2 SUBSEP $9
  val_last=$1 OFS $2
  flag="O"
  # A line counts as replaced only when the stored values really differ
  # from the current columns 1 and 2.
  if((val in a) && (a[val]!=$1 || b[val]!=$2)){
    $2=b[val]
    $1=a[val]
    flag="R"
  }
  # Append the flag and the original columns 1 and 2.
  print $0,flag,val_last
}' Input_file1 Input_file2 | column -t
Your expected output does not seem to match the conditions you described. If that is the case, could you please try the following (tested with your shown samples only).
awk '
# First file: store columns 3 and 4 under the key (column 1, column 2, column 5).
NR == FNR {
    key = $1 SUBSEP $2 SUBSEP $5
    newFirst[key] = $3
    newSecond[key] = $4
    next
}
# Second file: replace columns 1 and 2 when the key (column 1, column 2,
# column 9) is known and flag the line with R, otherwise flag it with O.
{
    key = $1 SUBSEP $2 SUBSEP $9
    if (key in newFirst) {
        $2 = newSecond[key]
        $1 = newFirst[key]
        print $0, "R"
    } else {
        print $0, "O"
    }
}' Input_file1 Input_file2
Why OP's code is not working: once $1 has been overwritten with the value taken from Input_file1, the following lookups can no longer find their entries, because they use the new value of $1 instead of the original one.
tried on gnu awk
awk '
# First file: keep every line, indexed by its line number.
NR == FNR { ref[NR] = $0; next }
# Second file: compare each line with the line at the same position in the
# first file; on a match replace columns 1 and 2 and flag the line with R,
# otherwise flag it with O. The first two columns of the first-file line
# are appended in both cases.
{
    split(ref[FNR], f)
    flag = "O"
    if (f[1] == $1 && f[2] == $2 && f[5] == $9) {
        $1 = f[3]
        $2 = f[4]
        flag = "R"
    }
    print $0, flag, f[1], f[2]
}' file1 file2

SQL query is not working (Error in rsqlite_send_query)

This is what the head of my data frame looks like
> head(d19_1)
SMZ SIZ1_diff SIZ1_base SIZ2_diff SIZ2_base SIZ3_diff SIZ3_base SIZ4_diff SIZ4_base SIZ5_diff SIZ5_base
1 1 -620 4170 -189 1347 -35 2040 82 1437 244 1533
2 2 -219 831 -57 255 -4 392 8 282 14 297
3 3 -426 834 -162 294 -134 379 -81 241 -22 221
4 4 -481 676 -142 216 -114 267 -50 158 -43 166
5 5 -233 1711 -109 584 54 913 71 624 74 707
6 6 -322 1539 -79 512 -50 799 23 532 63 576
Total_og Total_base %_SIZ1 %_SIZ2 %_SIZ3 %_SIZ4 %_SIZ5 Total_og Total_base
1 11980 12648 14.86811 14.03118 1.715686 5.706333 15.916504 11980 12648
2 2156 2415 26.35379 22.35294 1.020408 2.836879 4.713805 2156 2415
3 1367 2314 51.07914 55.10204 35.356201 33.609959 9.954751 1367 2314
4 790 1736 71.15385 65.74074 42.696629 31.645570 25.903614 790 1736
5 5339 5496 13.61777 18.66438 5.914567 11.378205 10.466761 5339 5496
6 4362 4747 20.92268 15.42969 6.257822 4.323308 10.937500 4362 4747
The datatype of the data frame is as below str(d19_1)
> str(d19_1)
'data.frame': 1588 obs. of 20 variables:
$ SMZ : int 1 2 3 4 5 6 7 8 9 10 ...
$ SIZ1_diff : int -620 -219 -426 -481 -233 -322 -176 -112 -34 -103 ...
$ SIZ1_base : int 4170 831 834 676 1711 1539 720 1396 998 1392 ...
$ SIZ2_diff : int -189 -57 -162 -142 -109 -79 -12 72 -36 -33 ...
$ SIZ2_base : int 1347 255 294 216 584 512 196 437 343 479 ...
$ SIZ3_diff : int -35 -4 -134 -114 54 -50 16 4 26 83 ...
$ SIZ3_base : int 2040 392 379 267 913 799 361 804 566 725 ...
$ SIZ4_diff : int 82 8 -81 -50 71 23 36 127 46 75 ...
$ SIZ4_base : int 1437 282 241 158 624 532 242 471 363 509 ...
$ SIZ5_diff : int 244 14 -22 -43 74 63 11 143 79 125 ...
$ SIZ5_base : int 1533 297 221 166 707 576 263 582 429 536 ...
$ Total_og : int 11980 2156 1367 790 5339 4362 2027 4715 3465 4561 ...
$ Total_base: int 12648 2415 2314 1736 5496 4747 2168 4464 3278 4375 ...
$ %_SIZ1 : num 14.9 26.4 51.1 71.2 13.6 ...
$ %_SIZ2 : num 14 22.4 55.1 65.7 18.7 ...
$ %_SIZ3 : num 1.72 1.02 35.36 42.7 5.91 ...
$ %_SIZ4 : num 5.71 2.84 33.61 31.65 11.38 ...
$ %_SIZ5 : num 15.92 4.71 9.95 25.9 10.47 ...
$ Total_og : int 11980 2156 1367 790 5339 4362 2027 4715 3465 4561 ...
$ Total_base: int 12648 2415 2314 1736 5496 4747 2168 4464 3278 4375 ...
When I run the query below, it returns the error shown below and I don't know why. I don't have any such column in the table.
Query
# Select every column of d19_1 and add a Jurisdiction column derived from
# the SMZ range of each row; rows outside every listed range get 0.
d20_1 <- sqldf('SELECT *, CASE
WHEN SMZ BETWEEN 1 AND 110 THEN "Baltimore City"
WHEN SMZ BETWEEN 111 AND 217 THEN "Anne Arundel County"
WHEN SMZ BETWEEN 218 AND 405 THEN "Baltimore County"
WHEN SMZ BETWEEN 406 AND 453 THEN "Carroll County"
WHEN SMZ BETWEEN 454 AND 524 THEN "Harford County"
WHEN SMZ BETWEEN 1667 AND 1674 THEN "York County"
ELSE 0
END Jurisdiction
FROM d19_1')
Error:
Error in rsqlite_send_query(conn#ptr, statement) :
table d19_1 has no column named <NA>
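Your code works correctly for me when every column has a unique name. Note that your data frame contains Total_og and Total_base twice (see the str() output above), whereas the reproduction below uses Total_og.1 and Total_base.1; the duplicated names are the likely cause of the error, so try renaming them first, e.g. with names(d19_1) <- make.unique(names(d19_1)):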
d19_1 <- structure(list(SMZ = 1:6, SIZ1_diff = c(-620L, -219L, -426L,
-481L, -233L, -322L), SIZ1_base = c(4170L, 831L, 834L, 676L,
1711L, 1539L), SIZ2_diff = c(-189L, -57L, -162L, -142L, -109L,
-79L), SIZ2_base = c(1347L, 255L, 294L, 216L, 584L, 512L), SIZ3_diff = c(-35L,
-4L, -134L, -114L, 54L, -50L), SIZ3_base = c(2040L, 392L, 379L,
267L, 913L, 799L), SIZ4_diff = c(82L, 8L, -81L, -50L, 71L, 23L
), SIZ4_base = c(1437L, 282L, 241L, 158L, 624L, 532L), SIZ5_diff = c(244L,
14L, -22L, -43L, 74L, 63L), SIZ5_base = c(1533L, 297L, 221L,
166L, 707L, 576L), Total_og = c(11980L, 2156L, 1367L, 790L, 5339L,
4362L), Total_base = c(12648L, 2415L, 2314L, 1736L, 5496L, 4747L
), X._SIZ1 = c(14.86811, 26.35379, 51.07914, 71.15385, 13.61777,
20.92268), X._SIZ2 = c(14.03118, 22.35294, 55.10204, 65.74074,
18.66438, 15.42969), X._SIZ3 = c(1.715686, 1.020408, 35.356201,
42.696629, 5.914567, 6.257822), X._SIZ4 = c(5.706333, 2.836879,
33.609959, 31.64557, 11.378205, 4.323308), X._SIZ5 = c(15.916504,
4.713805, 9.954751, 25.903614, 10.466761, 10.9375), Total_og.1 = c(11980L,
2156L, 1367L, 790L, 5339L, 4362L), Total_base.1 = c(12648L, 2415L,
2314L, 1736L, 5496L, 4747L)), .Names = c("SMZ", "SIZ1_diff",
"SIZ1_base", "SIZ2_diff", "SIZ2_base", "SIZ3_diff", "SIZ3_base",
"SIZ4_diff", "SIZ4_base", "SIZ5_diff", "SIZ5_base", "Total_og",
"Total_base", "X._SIZ1", "X._SIZ2", "X._SIZ3", "X._SIZ4", "X._SIZ5",
"Total_og.1", "Total_base.1"), row.names = c(NA, -6L), class = "data.frame")
# Run the query from the question against the reproduction data frame:
# all columns plus a Jurisdiction column derived from the SMZ range
# (0 when SMZ is outside every listed range).
library(sqldf)
sqldf('SELECT *, CASE
WHEN SMZ BETWEEN 1 AND 110 THEN "Baltimore City"
WHEN SMZ BETWEEN 111 AND 217 THEN "Anne Arundel County"
WHEN SMZ BETWEEN 218 AND 405 THEN "Baltimore County"
WHEN SMZ BETWEEN 406 AND 453 THEN "Carroll County"
WHEN SMZ BETWEEN 454 AND 524 THEN "Harford County"
WHEN SMZ BETWEEN 1667 AND 1674 THEN "York County"
ELSE 0
END Jurisdiction
FROM d19_1')

calculating atom distances in PDB files

I have this PDB file and I want to calculate the distance between atoms 7 and 8 (the serial number is $2) and atoms 12, 14, 15, 17 and 18. If the distance is lower than 5 angstroms, the value should be printed.
ATOM 1 N ASN p 140 38.455 18.232 -3.207 1.00 7.39 N
ATOM 2 CA ASN p 140 37.856 18.151 -4.534 1.00 7.91 C
ATOM 3 C ASN p 140 38.700 18.848 -5.595 1.00 10.75 C
ATOM 4 O ASN p 140 39.797 19.271 -5.313 1.00 9.25 O
ATOM 5 CB ASN p 140 36.435 18.715 -4.446 1.00 7.62 C
ATOM 6 CG ASN p 140 35.556 17.898 -3.501 1.00 6.82 C
ATOM 7 OD1 ASN p 140 35.269 18.315 -2.323 1.00 8.53 O
ATOM 8 ND2 ASN p 140 35.197 16.691 -3.945 1.00 5.41 N
TER 9 ASN 140
HETATM 10 C 08H p 1 29.121 15.727 -1.182 1.00 5.89 C
HETATM 11 C 08H p 1 29.763 16.230 -0.040 1.00 5.86 C
HETATM 12 N 08H p 1 31.023 16.810 -0.046 1.00 6.15 N
HETATM 13 C 08H p 1 31.533 17.872 0.633 1.00 6.24 C
HETATM 14 N 08H p 1 32.815 18.037 0.299 1.00 6.83 N
HETATM 15 N 08H p 1 33.151 17.112 -0.526 1.00 7.37 C
HETATM 16 C 08H p 1 32.058 16.349 -0.758 1.00 7.06 C
HETATM 17 O 08H p 1 31.956 15.215 -1.730 1.00 8.15 O
HETATM 18 N 08H p 1 30.979 15.691 -2.746 1.00 10.31 N
HETATM 19 C 08H p 1 29.651 15.777 -2.509 1.00 6.71 C
HETATM 20 O HOH p 170 34.699 19.032 2.134 1.00 6.42 O
Based on a similar script, I wrote this code
# usage: awk -f test.awk structure.pdb
# Lists the selected ASN and ligand atoms, then every pair of selected atoms
# whose distance (computed from fields 7-9) is below 5.
# ORS is emptied so that the selection rule ends its own lines with "\n".
BEGIN{print "asparagine and ligand in the structure..."; ORS=""}
# Select ATOM records of ASN whose atom name matches ND2|OD1, and HETATM
# records whose field 12 matches N|O and whose residue does not match HOH.
# Coordinates are stored by serial number ($2).
$1=="ATOM" && $3~"ND2|OD1" && $4=="ASN" || $1=="HETATM" && $12~"N|O" && $4!~"HOH" {
print $2,$3,$4,$6"\n"
atm_x[$2]=$7; atm_y[$2]=$8; atm_z[$2]=$9
}
END{ ORS="\n"
# list accumulates the serials already used as key1, so each pair is
# visited once.
# NOTE(review): index() is a substring test, so key2 is also skipped when
# its text occurs inside another serial in list (for example 7 inside 17);
# whether that happens depends on the for-in order. Confirm this is acceptable.
for (key1 in atm_x) { list=list" "key1
for (key2 in atm_x) {
if (index(list, key2) != 0 ) continue
dx=atm_x[key1]-atm_x[key2]
dy=atm_y[key1]-atm_y[key2]
dz=atm_z[key1]-atm_z[key2]
distance=sqrt(dx^2+dy^2+dz^2)
if (distance < 5 && distance != 0 ) {
i++
candidate[i]=key1"-"key2": "distance
}
}
}
print "\nCandidates ..."
# Candidates are printed in for-in order, which awk leaves unspecified.
for (keys in candidate) {print candidate[keys]}
}
when I run this script I get the following result
asparagine and ligand in the structure...
7 OD1 ASN 140
8 ND2 ASN 140
12 N 08H 1
14 N 08H 1
17 O 08H 1
18 N 08H 1
Candidates ...
7-8: 2.2964
7-14: 3.60198
7-17: 4.57576
8-17: 4.19391
8-18: 4.49768
12-14: 2.19905
12-17: 2.50007
12-18: 2.92303
14-17: 3.58028
14-18: 4.25989
17-18: 1.48774
The problem is that I don't want to print the distances when the atoms have the same residue name ($4). I'm new to awk and was wondering what's the best way to handle this. Any suggestions would be appreciated!!
awk '
# Keep ASN atoms named ND2 or OD1 and non-water HETATM atoms whose field 12
# is N or O; store coordinates and residue name by serial number.
{
    keep = 0
    if ($1 == "ATOM")
        keep = ($4 == "ASN" && ($3 == "ND2" || $3 == "OD1"))
    else if ($1 == "HETATM")
        keep = ($4 != "HOH" && ($12 == "N" || $12 == "O"))
    if (!keep)
        next
    atom[$2] = 1
    x[$2] = $7
    y[$2] = $8
    z[$2] = $9
    name[$2] = $4
}
# Report every pair of kept atoms from different residues that lies closer
# than 5; the p > q test makes each pair appear only once.
END {
    for (p in atom) {
        for (q in atom) {
            if (!(p > q))
                continue
            if (name[p] == name[q])
                continue
            dx = x[p] - x[q]
            dy = y[p] - y[q]
            dz = z[p] - z[q]
            gap = sqrt(dx^2 + dy^2 + dz^2)
            if (gap < 5)
                printf "%s-%s: %.4f\n", p, q, gap
        }
    }
}
' pdbfile
7-17: 4.5758
7-14: 3.6020
8-17: 4.1939
8-18: 4.4977