awk print 4 columns and a substring of $8 - awk
I have a file in VCF format with several pieces of information in the 8th column:
# ... rest of file ...
1 11850891 rs753917964 C T 22276.39 PASS non_cancer_nhomalt_nfe_seu=0;AC_eas=0;AN_eas=18390;AF_eas=0.00000e+00;nhomalt_eas=0;nhomalt=0;non_neuro_AC_nfe_female=1;non_neuro_AN_nfe_female=39830;non_neuro_AF_nfe_female=2.51067e-05;non_neuro_nhomalt_nfe_female=0;non_neuro_AC_afr=0;non_neuro_AN_afr=16214;non_neuro_AF_afr=0.00000e+00;non_neuro_nhomalt_afr=0;controls_AC_raw=2;controls_AN_raw=109408;controls_AF_raw=1.82802e-05;controls_nhomalt_raw=0;non_cancer_AC_eas=0;non_cancer_AN_eas=17690;non_cancer_AF_eas=0.00000e+00;non_cancer_nhomalt_eas=0;non_cancer_AC_amr_female=0;non_cancer_AN_amr_female=20086;non_cancer_AF_amr_female=0.00000e+00;non_cancer_nhomalt_amr_female=0;non_neuro_AC_nfe_swe=0;non_neuro_AN_nfe_swe=14780;non_neuro_AF_nfe_swe=0.00000e+00;non_neuro_nhomalt_nfe_swe=0;controls_AC_male=2;controls_AN_male=58114;controls_AF_male=3.44151e-05;controls_nhomalt_male=0;non_topmed_AC_male=5;non_topmed_AN_male=133538;non_topmed_AF_male=3.74425e-05;non_topmed_nhomalt_male=0;controls_AC_eas_jpn=0;controls_AN_eas_jpn=114;controls_AF_eas_jpn=0.00000e+00;controls_nhomalt_eas_jpn=0;controls_AC_nfe_female=0;controls_AN_nfe_female=19148;controls_AF_nfe_female=0.00000e+00;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=0;non_neuro_AN_amr=30522;non_neuro_AF_amr=0.00000e+00;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=6838;non_neuro_AF_eas_female=0.00000e+00;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=5180;AF_asj_male=0.00000e+00;nhomalt_asj_male=0;controls_AC_nfe_male=1;controls_AN_nfe_male=23620;controls_AF_nfe_male=4.23370e-05;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=0;non_neuro_AN_fin=16732;non_neuro_AF_fin=0.00000e+00;non_neuro_nhomalt_fin=0;non_topmed_AC_sas=2;non_topmed_AN_sas=30616;non_topmed_AF_sas=6.53253e-05;non_topmed_nhomalt_sas=0;non_cancer_AC_nfe_female=1;non_cancer_AN_nfe_female=44296;non_cancer_AF_nfe_female=2.25754e-05;non_cancer_nhomalt_nfe_female=0;AC_oth_female=0;AN_oth_female=2928;AF_oth_female=0.00000e+00;nhomalt_oth
_female=0;non_cancer_AC_asj=0;non_cancer_AN_asj=9572;non_cancer_AF_asj=0.00000e+00;non_cancer_nhomalt_asj=0;AC_nfe_swe=0;AN_nfe_swe=26134;AF_nfe_swe=0.00000e+00;nhomalt_nfe_swe=0;controls_AC_nfe=1;controls_AN_nfe=42768;controls_AF_nfe=2.33820e-05;controls_nhomalt_nfe=0;controls_AC_oth_female=0;controls_AN_oth_female=986;controls_AF_oth_female=0.00000e+00;controls_nhomalt_oth_female=0;controls_AC_asj=0;controls_AN_asj=2320;controls_AF_asj=0.00000e+00;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=12256;non_neuro_AF_amr_male=0.00000e+00;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=0;controls_AN_nfe_nwe=14452;controls_AF_nfe_nwe=0.00000e+00;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=0;AN_nfe_nwe=42210;AF_nfe_nwe=0.00000e+00;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=0;controls_AN_nfe_seu=4760;controls_AF_nfe_seu=0.00000e+00;controls_nhomalt_nfe_seu=0;controls_AC_sas_female=0;controls_AN_sas_female=4214;controls_AF_sas_female=0.00000e+00;controls_nhomalt_sas_female=0;non_neuro_AC_amr_female=0;non_neuro_AN_amr_female=18266;non_neuro_AF_amr_female=0.00000e+00;non_neuro_nhomalt_amr_female=0;non_cancer_AC_eas_jpn=0;non_cancer_AN_eas_jpn=124;non_cancer_AF_eas_jpn=0.00000e+00;non_cancer_nhomalt_eas_jpn=0;non_neuro_AC_nfe_onf=2;non_neuro_AN_nfe_onf=27808;non_neuro_AF_nfe_onf=7.19217e-05;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=0;non_topmed_AN_eas_male=9062;non_topmed_AF_eas_male=0.00000e+00;non_topmed_nhomalt_eas_male=0;AC_eas_jpn=0;AN_eas_jpn=152;AF_eas_jpn=0.00000e+00;nhomalt_eas_jpn=0;non_cancer_AC_afr_male=0;non_cancer_AN_afr_male=5672;non_cancer_AF_afr_male=0.00000e+00;non_cancer_nhomalt_afr_male=0;non_cancer_AC_afr=0;non_cancer_AN_afr=14902;non_cancer_AF_afr=0.00000e+00;non_cancer_nhomalt_afr=0;controls_AC_amr_female=0;controls_AN_amr_female=10226;controls_AF_amr_female=0.00000e+00;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=0;non_neuro_AN_fin_male=9082;non_neuro_AF_fin_male=0.00000e+00;non_neuro_nhomalt_fin_male=0;AC_female=1;AN_female=
115558;AF_female=8.65366e-06;nhomalt_female=0;non_neuro_AC_nfe_bgr=0;non_neuro_AN_nfe_bgr=452;non_neuro_AF_nfe_bgr=0.00000e+00;non_neuro_nhomalt_nfe_bgr=0;non_neuro_AC_oth_male=0;non_neuro_AN_oth_male=2476;non_neuro_AF_oth_male=0.00000e+00;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=0;non_topmed_AN_nfe_est=238;non_topmed_AF_nfe_est=0.00000e+00;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=0;non_topmed_AN_nfe_nwe=41090;non_topmed_AF_nfe_nwe=0.00000e+00;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=0;non_topmed_AN_amr_male=14262;non_topmed_AF_amr_male=0.00000e+00;non_topmed_nhomalt_amr_male=0;non_cancer_AC_amr=0;non_cancer_AN_amr=34258;non_cancer_AF_amr=0.00000e+00;non_cancer_nhomalt_amr=0;non_topmed_AC_nfe_swe=0;non_topmed_AN_nfe_swe=26072;non_topmed_AF_nfe_swe=0.00000e+00;non_topmed_nhomalt_nfe_swe=0;non_topmed_AC_nfe_onf=2;non_topmed_AN_nfe_onf=30190;non_topmed_AF_nfe_onf=6.62471e-05;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_kor=0;controls_AN_eas_kor=1888;controls_AF_eas_kor=0.00000e+00;controls_nhomalt_eas_kor=0;non_topmed_AC_eas_oea=0;non_topmed_AN_eas_oea=14416;non_topmed_AF_eas_oea=0.00000e+00;non_topmed_nhomalt_eas_oea=0;controls_AC_eas_male=0;controls_AN_eas_male=4258;controls_AF_eas_male=0.00000e+00;controls_nhomalt_eas_male=0;controls_AC_oth_male=0;controls_AN_oth_male=928;controls_AF_oth_male=0.00000e+00;controls_nhomalt_oth_male=0;non_topmed_AC=6;non_topmed_AN=244846;non_topmed_AF=2.45052e-05;non_topmed_nhomalt=0;controls_AC_fin=0;controls_AN_fin=13392;controls_AF_fin=0.00000e+00;controls_nhomalt_fin=0;AC_eas_kor=0;AN_eas_kor=3818;AF_eas_kor=0.00000e+00;nhomalt_eas_kor=0;non_neuro_AC_nfe=4;non_neuro_AN_nfe=89556;non_neuro_AF_nfe=4.46648e-05;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=0;non_neuro_AN_fin_female=7650;non_neuro_AF_fin_female=0.00000e+00;non_neuro_nhomalt_fin_female=0;non_cancer_AC_nfe_male=3;non_cancer_AN_nfe_male=58440;non_cancer_AF_nfe_male=5.13347e-05;non_cancer_nhomalt_nfe_male=0;controls_AC_eas_oea=0;controls_AN_
eas_oea=7044;controls_AF_eas_oea=0.00000e+00;controls_nhomalt_eas_oea=0;non_topmed_AC_nfe_seu=2;non_topmed_AN_nfe_seu=11408;non_topmed_AF_nfe_seu=1.75316e-04;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=4788;controls_AF_eas_female=0.00000e+00;controls_nhomalt_eas_female=0;non_topmed_AC_asj=0;non_topmed_AN_asj=9998;non_topmed_AF_asj=0.00000e+00;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=1;controls_AN_nfe_onf=9998;controls_AF_nfe_onf=1.00020e-04;controls_nhomalt_nfe_onf=0;non_neuro_AC=6;non_neuro_AN=208122;non_neuro_AF=2.88292e-05;non_neuro_nhomalt=0;AC_eas_oea=0;AN_eas_oea=14420;AF_eas_oea=0.00000e+00;nhomalt_eas_oea=0;non_topmed_AC_nfe=4;non_topmed_AN_nfe=111660;non_topmed_AF_nfe=3.58230e-05;non_topmed_nhomalt_nfe=0;non_cancer_AC_oth=0;non_cancer_AN_oth=5620;non_cancer_AF_oth=0.00000e+00;non_cancer_nhomalt_oth=0;non_topmed_AC_raw=6;non_topmed_AN_raw=244878;non_topmed_AF_raw=2.45020e-05;non_topmed_nhomalt_raw=0;non_neuro_AC_nfe_est=0;non_neuro_AN_nfe_est=216;non_neuro_AF_nfe_est=0.00000e+00;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=0;non_topmed_AN_oth_male=3186;non_topmed_AF_oth_male=0.00000e+00;non_topmed_nhomalt_oth_male=0;non_cancer_AC_oth_male=0;non_cancer_AN_oth_male=2938;non_cancer_AF_oth_male=0.00000e+00;non_cancer_nhomalt_oth_male=0;AC_nfe_est=0;AN_nfe_est=242;AF_nfe_est=0.00000e+00;nhomalt_nfe_est=0;non_cancer_AC_afr_female=0;non_cancer_AN_afr_female=9230;non_cancer_AF_afr_female=0.00000e+00;non_cancer_nhomalt_afr_female=0;non_topmed_AC_afr_male=0;non_topmed_AN_afr_male=4686;non_topmed_AF_afr_male=0.00000e+00;non_topmed_nhomalt_afr_male=0;AC_eas_male=0;AN_eas_male=9066;AF_eas_male=0.00000e+00;nhomalt_eas_male=0;controls_AC_eas=0;controls_AN_eas=9046;controls_AF_eas=0.00000e+00;controls_nhomalt_eas=0;non_neuro_AC_eas_male=0;non_neuro_AN_eas_male=6574;non_neuro_AF_eas_male=0.00000e+00;non_neuro_nhomalt_eas_male=0;non_cancer_AC_nfe_nwe=0;non_cancer_AN_nfe_nwe=39490;non_cancer_AF_nfe_nwe=0.00000e+00;non_cancer_nhomalt_n
fe_nwe=0;controls_AC_sas=1;controls_AN_sas=15690;controls_AF_sas=6.37349e-05;controls_nhomalt_sas=0;non_neuro_AC_sas_male=2;non_neuro_AN_sas_male=23066;non_neuro_AF_sas_male=8.67077e-05;non_neuro_nhomalt_sas_male=0;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=3126;non_neuro_AF_asj_male=0.00000e+00;non_neuro_nhomalt_asj_male=0;non_cancer_AC_nfe_bgr=0;non_cancer_AN_nfe_bgr=2526;non_cancer_AF_nfe_bgr=0.00000e+00;non_cancer_nhomalt_nfe_bgr=0;controls_AC_oth=0;controls_AN_oth=1914;controls_AF_oth=0.00000e+00;controls_nhomalt_oth=0;non_cancer_AC_eas_female=0;non_cancer_AN_eas_female=8946;non_cancer_AF_eas_female=0.00000e+00;non_cancer_nhomalt_eas_female=0;AC_nfe=4;AN_nfe=113750;AF_nfe=3.51648e-05;nhomalt_nfe=0;non_topmed_AC_female=1;non_topmed_AN_female=111308;non_topmed_AF_female=8.98408e-06;non_topmed_nhomalt_female=0;non_neuro_AC_asj=0;non_neuro_AN_asj=6212;non_neuro_AF_asj=0.00000e+00;non_neuro_nhomalt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=9324;non_topmed_AF_eas_female=0.00000e+00;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=6;non_neuro_AN_raw=208136;non_neuro_AF_raw=2.88273e-05;non_neuro_nhomalt_raw=0;non_topmed_AC_eas=0;non_topmed_AN_eas=18386;non_topmed_AF_eas=0.00000e+00;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=0;non_topmed_AN_fin_male=11272;non_topmed_AF_fin_male=0.00000e+00;non_topmed_nhomalt_fin_male=0;non_cancer_AC_asj_male=0;non_cancer_AN_asj_male=4976;non_cancer_AF_asj_male=0.00000e+00;non_cancer_nhomalt_asj_male=0;AC_fin=0;AN_fin=21646;AF_fin=0.00000e+00;nhomalt_fin=0;AC_nfe_male=3;AN_nfe_male=63598;AF_nfe_male=4.71713e-05;nhomalt_nfe_male=0;non_topmed_AC_eas_kor=0;non_topmed_AN_eas_kor=3818;non_topmed_AF_eas_kor=0.00000e+00;non_topmed_nhomalt_eas_kor=0;controls_AC_amr_male=0;controls_AN_amr_male=6884;controls_AF_amr_male=0.00000e+00;controls_nhomalt_amr_male=0;non_neuro_AC_eas_oea=0;non_neuro_AN_eas_oea=9446;non_neuro_AF_eas_oea=0.00000e+00;non_neuro_nhomalt_eas_oea=0;AC_sas_female=0;AN_sas_female=7544;AF_sas_female=0.00000e
+00;nhomalt_sas_female=0;controls_AC_afr_female=0;controls_AN_afr_female=4240;controls_AF_afr_female=0.00000e+00;controls_nhomalt_afr_female=0;controls_AC_amr=0;controls_AN_amr=17110;controls_AF_amr=0.00000e+00;controls_nhomalt_amr=0;non_topmed_AC_eas_jpn=0;non_topmed_AN_eas_jpn=152;non_topmed_AF_eas_jpn=0.00000e+00;non_topmed_nhomalt_eas_jpn=0;AC_asj_female=0;AN_asj_female=4900;AF_asj_female=0.00000e+00;nhomalt_asj_female=0;non_topmed_AC_nfe_bgr=0;non_topmed_AN_nfe_bgr=2662;non_topmed_AF_nfe_bgr=0.00000e+00;non_topmed_nhomalt_nfe_bgr=0;non_cancer_AC_nfe_est=0;non_cancer_AN_nfe_est=158;non_cancer_AF_nfe_est=0.00000e+00;non_cancer_nhomalt_nfe_est=0;non_neuro_AC_eas=0;non_neuro_AN_eas=13412;non_neuro_AF_eas=0.00000e+00;non_neuro_nhomalt_eas=0;non_cancer_AC_nfe=4;non_cancer_AN_nfe=102736;non_cancer_AF_nfe=3.89347e-05;non_cancer_nhomalt_nfe=0;non_neuro_AC_male=5;non_neuro_AN_male=112470;non_neuro_AF_male=4.44563e-05;non_neuro_nhomalt_male=0;non_neuro_AC_sas_female=0;non_neuro_AN_sas_female=7542;non_neuro_AF_sas_female=0.00000e+00;non_neuro_nhomalt_sas_female=0;AC_asj=0;AN_asj=10080;AF_asj=0.00000e+00;nhomalt_asj=0;controls_AC_nfe_est=0;controls_AN_nfe_est=70;controls_AF_nfe_est=0.00000e+00;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=4854;non_topmed_AF_asj_female=0.00000e+00;non_topmed_nhomalt_asj_female=0;non_cancer_AC_nfe_swe=0;non_cancer_AN_nfe_swe=25290;non_cancer_AF_nfe_swe=0.00000e+00;non_cancer_nhomalt_nfe_swe=0;non_cancer_AC=6;non_cancer_AN=236934;non_cancer_AF=2.53235e-05;non_cancer_nhomalt=0;non_topmed_AC_oth=0;non_topmed_AN_oth=6064;non_topmed_AF_oth=0.00000e+00;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=0;non_topmed_AN_fin_female=10372;non_topmed_AF_fin_female=0.00000e+00;non_topmed_nhomalt_fin_female=0;non_cancer_AC_fin_female=0;non_cancer_AN_fin_female=10362;non_cancer_AF_fin_female=0.00000e+00;non_cancer_nhomalt_fin_female=0;AC_oth=0;AN_oth=6140;AF_oth=0.00000e+00;nhomalt_oth=0;non_neuro_AC_nfe_male=3;non_neuro_AN_
nfe_male=49726;non_neuro_AF_nfe_male=6.03306e-05;non_neuro_nhomalt_nfe_male=0;controls_AC_female=0;controls_AN_female=51290;controls_AF_female=0.00000e+00;controls_nhomalt_female=0;non_cancer_AC_fin=0;non_cancer_AN_fin=21630;non_cancer_AF_fin=0.00000e+00;non_cancer_nhomalt_fin=0;non_topmed_AC_fin=0;non_topmed_AN_fin=21644;non_topmed_AF_fin=0.00000e+00;non_topmed_nhomalt_fin=0;non_cancer_AC_eas_oea=0;non_cancer_AN_eas_oea=13792;non_cancer_AF_eas_oea=0.00000e+00;non_cancer_nhomalt_eas_oea=0;non_topmed_AC_nfe_female=1;non_topmed_AN_nfe_female=48806;non_topmed_AF_nfe_female=2.04893e-05;non_topmed_nhomalt_nfe_female=0;non_cancer_AC_sas_male=2;non_cancer_AN_sas_male=23032;non_cancer_AF_sas_male=8.68357e-05;non_cancer_nhomalt_sas_male=0;controls_AC_asj_male=0;controls_AN_asj_male=1106;controls_AF_asj_male=0.00000e+00;controls_nhomalt_asj_male=0;non_cancer_AC_raw=6;non_cancer_AN_raw=236958;non_cancer_AF_raw=2.53209e-05;non_cancer_nhomalt_raw=0;non_cancer_AC_eas_male=0;non_cancer_AN_eas_male=8744;non_cancer_AF_eas_male=0.00000e+00;non_cancer_nhomalt_eas_male=0;non_topmed_AC_asj_male=0;non_topmed_AN_asj_male=5144;non_topmed_AF_asj_male=0.00000e+00;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=0;non_neuro_AN_oth=4866;non_neuro_AF_oth=0.00000e+00;non_neuro_nhomalt_oth=0;AC_male=5;AN_male=135906;AF_male=3.67901e-05;nhomalt_male=0;controls_AC_fin_female=0;controls_AN_fin_female=6474;controls_AF_fin_female=0.00000e+00;controls_nhomalt_fin_female=0;controls_AC_nfe_bgr=0;controls_AN_nfe_bgr=678;controls_AF_nfe_bgr=0.00000e+00;controls_nhomalt_nfe_bgr=0;controls_AC_asj_female=0;controls_AN_asj_female=1214;controls_AF_asj_female=0.00000e+00;controls_nhomalt_asj_female=0;AC_amr_male=0;AN_amr_male=14320;AF_amr_male=0.00000e+00;nhomalt_amr_male=0;AC_amr_female=0;AN_amr_female=20270;AF_amr_female=0.00000e+00;nhomalt_amr_female=0;non_topmed_AC_sas_male=2;non_topmed_AN_sas_male=23072;non_topmed_AF_sas_male=8.66852e-05;non_topmed_nhomalt_sas_male=0;AC_oth_male=0;AN_oth_male=3212;AF_oth_male
=0.00000e+00;nhomalt_oth_male=0;non_cancer_AC_sas=2;non_cancer_AN_sas=30526;non_cancer_AF_sas=6.55179e-05;non_cancer_nhomalt_sas=0;non_neuro_AC_nfe_seu=2;non_neuro_AN_nfe_seu=10942;non_neuro_AF_nfe_seu=1.82782e-04;non_neuro_nhomalt_nfe_seu=0;non_cancer_AC_eas_kor=0;non_cancer_AN_eas_kor=3774;controls_AF_sas_male=8.71384e-05;controls_nhomalt_sas_male=0;non_topmed_AC_sas_female=0;non_topmed_AN_sas_female=7544;non_topmed_AF_sas_female=0.00000e+00;non_topmed_nhomalt_sas_female=0;non_topmed_AC_afr=0;non_topmed_AN_afr=12022;non_topmed_AF_afr=0.00000e+00;non_topmed_nhomalt_afr=0;controls_AC=2;controls_AN=109404;controls_AF=1.82809e-05;AN_popmax=30616;AF_popmax=6.53253e-05;nhomalt_popmax=0;non_cancer_AF_eas_kor=0.00000e+00;non_cancer_nhomalt_eas_kor=0;non_topmed_AC_afr_female=0;non_topmed_AN_afr_female=7336;non_topmed_AF_afr_female=0.00000e+00;non_topmed_nhomalt_afr_female=0;controls_AC_sas_male=1;controls_AN_sas_male=11476
1 11850892 rs373747884 G A 34745.78 PASS non_topmed_nhomalt_male=0;controls_AC_eas_jpn=0;controls_AN_eas_jpn=114;controls_AF_eas_jpn=0.00000e+00;controls_nhomalt_eas_jpn=0;controls_AC_nfe_female=2;controls_AN_nfe_female=19148;controls_AF_nfe_female=1.04450e-04;controls_nhomalt_nfe_female=0;non_neuro_AC_amr=1;non_neuro_AN_amr=30522;non_neuro_AF_amr=3.27633e-05;non_neuro_nhomalt_amr=0;non_neuro_AC_eas_female=0;non_neuro_AN_eas_female=6840;non_neuro_AF_eas_female=0.00000e+00;non_neuro_nhomalt_eas_female=0;AC_asj_male=0;AN_asj_male=5180;AF_asj_male=0.00000e+00;nhomalt_asj_male=0;controls_AC_nfe_male=0;controls_AN_nfe_male=23620;controls_AF_nfe_male=0.00000e+00;controls_nhomalt_nfe_male=0;non_neuro_AC_fin=0;non_neuro_AN_fin=16734;non_neuro_AF_fin=0.00000e+00;non_neuro_nhomalt_fin=0;non_topmed_AC_sas=2;non_topmed_AN_sas=30614;non_topmed_AF_sas=6.53296e-05;non_topmed_nhomalt_sas=0;non_cancer_AC_nfe_female=4;non_cancer_AN_nfe_female=44298;non_cancer_AF_nfe_female=9.02975e-05;non_cancer_nhomalt_nfe_female=0;AC_oth_female=0;AN_oth_female=2928;AF_oth_female=0.00000e+00;nhomalt_oth_female=0;non_cancer_AC_asj=0;non_cancer_AN_asj=9572;non_cancer_AF_asj=0.00000e+00;non_cancer_nhomalt_asj=0;AC_nfe_swe=0;AN_nfe_swe=26134;AF_nfe_swe=0.00000e+00;nhomalt_nfe_swe=0;controls_AC_nfe=2;controls_AN_nfe=42768;controls_AF_nfe=4.67639e-05;controls_nhomalt_nfe=0;controls_AC_oth_female=0;controls_AN_oth_female=986;controls_AF_oth_female=0.00000e+00;controls_nhomalt_oth_female=0;controls_AC_asj=0;controls_AN_asj=2320;controls_AF_asj=0.00000e+00;controls_nhomalt_asj=0;non_neuro_AC_amr_male=0;non_neuro_AN_amr_male=12256;non_neuro_AF_amr_male=0.00000e+00;non_neuro_nhomalt_amr_male=0;controls_AC_nfe_nwe=2;controls_AN_nfe_nwe=14452;controls_AF_nfe_nwe=1.38389e-04;controls_nhomalt_nfe_nwe=0;AC_nfe_nwe=4;AN_nfe_nwe=42212;AF_nfe_nwe=9.47598e-05;nhomalt_nfe_nwe=0;controls_AC_nfe_seu=0;controls_AN_nfe_seu=4760;controls_AF_nfe_seu=0.00000e+00;controls_nhomalt_nfe_seu=0;controls_AC_sas_female=1;controls_AN_s
as_female=4212;controls_AF_sas_female=2.37417e-04;controls_nhomalt_sas_female=0;non_neuro_AC_amr_female=1;non_neuro_AN_amr_female=18266;non_neuro_AF_amr_female=5.47465e-05;non_neuro_nhomalt_amr_female=0;non_cancer_AC_eas_jpn=0;non_cancer_AN_eas_jpn=124;non_cancer_AF_eas_jpn=0.00000e+00;non_cancer_nhomalt_eas_jpn=0;non_neuro_AC_nfe_onf=1;non_neuro_AN_nfe_onf=27808;non_neuro_AF_nfe_onf=3.59609e-05;non_neuro_nhomalt_nfe_onf=0;non_topmed_AC_eas_male=2;non_topmed_AN_eas_male=9062;non_topmed_AF_eas_male=2.20702e-04;non_topmed_nhomalt_eas_male=0;AC_eas_jpn=0;AN_eas_jpn=152;AF_eas_jpn=0.00000e+00;nhomalt_eas_jpn=0;non_cancer_AC_afr_male=0;non_cancer_AN_afr_male=5672;non_cancer_AF_afr_male=0.00000e+00;non_cancer_nhomalt_afr_male=0;non_cancer_AC_afr=1;non_cancer_AN_afr=14902;non_cancer_AF_afr=6.71051e-05;non_cancer_nhomalt_afr=0;controls_AC_amr_female=0;controls_AN_amr_female=10226;controls_AF_amr_female=0.00000e+00;controls_nhomalt_amr_female=0;non_neuro_AC_fin_male=0;non_neuro_AN_fin_male=9084;non_neuro_AF_fin_male=0.00000e+00;non_neuro_nhomalt_fin_male=0;AC_female=7;AN_female=115558;AF_female=6.05756e-05;nhomalt_female=0;non_neuro_AC_nfe_bgr=0;non_neuro_AN_nfe_bgr=452;non_neuro_AF_nfe_bgr=0.00000e+00;non_neuro_nhomalt_nfe_bgr=0;non_neuro_AC_oth_male=0;non_neuro_AN_oth_male=2476;non_neuro_AF_oth_male=0.00000e+00;non_neuro_nhomalt_oth_male=0;non_topmed_AC_nfe_est=0;non_topmed_AN_nfe_est=238;non_topmed_AF_nfe_est=0.00000e+00;non_topmed_nhomalt_nfe_est=0;non_topmed_AC_nfe_nwe=4;non_topmed_AN_nfe_nwe=41092;non_topmed_AF_nfe_nwe=9.73425e-05;non_topmed_nhomalt_nfe_nwe=0;non_topmed_AC_amr_male=0;non_topmed_AN_amr_male=14262;non_topmed_AF_amr_male=0.00000e+00;non_topmed_nhomalt_amr_male=0;non_cancer_AC_amr=1;non_cancer_AN_amr=34258;non_cancer_AF_amr=2.91903e-05;non_cancer_nhomalt_amr=0;non_topmed_AC_nfe_swe=0;non_topmed_AN_nfe_swe=26072;non_topmed_AF_nfe_swe=0.00000e+00;non_topmed_nhomalt_nfe_swe=0;non_topmed_AC_nfe_onf=1;non_topmed_AN_nfe_onf=30190;non_topmed_AF_nfe_onf=3.31236e-0
5;non_topmed_nhomalt_nfe_onf=0;controls_AC_eas_kor=0;controls_AN_eas_kor=1888;controls_AF_eas_kor=0.00000e+00;controls_nhomalt_eas_kor=0;non_topmed_AC_eas_oea=2;non_topmed_AN_eas_oea=14418;non_topmed_AF_eas_oea=1.38715e-04;non_topmed_nhomalt_eas_oea=0;controls_AC_eas_male=2;controls_AN_eas_male=4258;controls_AF_eas_male=4.69704e-04;controls_nhomalt_eas_male=0;controls_AC_oth_male=0;controls_AN_oth_male=928;controls_AF_oth_male=0.00000e+00;controls_nhomalt_oth_male=0;non_topmed_AC=11;non_topmed_AN=244848;non_topmed_AF=4.49258e-05;non_topmed_nhomalt=0;controls_AC_fin=0;controls_AN_fin=13394;controls_AF_fin=0.00000e+00;controls_nhomalt_fin=0;AC_eas_kor=0;AN_eas_kor=3818;AF_eas_kor=0.00000e+00;nhomalt_eas_kor=0;non_neuro_AC_nfe=4;non_neuro_AN_nfe=89554;non_neuro_AF_nfe=4.46658e-05;non_neuro_nhomalt_nfe=0;non_neuro_AC_fin_female=0;non_neuro_AN_fin_female=7650;non_neuro_AF_fin_female=0.00000e+00;non_neuro_nhomalt_fin_female=0;non_cancer_AC_nfe_male=1;non_cancer_AN_nfe_male=58442;non_cancer_AF_nfe_male=1.71110e-05;non_cancer_nhomalt_nfe_male=0;controls_AC_eas_oea=2;controls_AN_eas_oea=7044;controls_AF_eas_oea=2.83930e-04;controls_nhomalt_eas_oea=0;non_topmed_AC_nfe_seu=0;non_topmed_AN_nfe_seu=11408;non_topmed_AF_nfe_seu=0.00000e+00;non_topmed_nhomalt_nfe_seu=0;controls_AC_eas_female=0;controls_AN_eas_female=4788;controls_AF_eas_female=0.00000e+00;controls_nhomalt_eas_female=0;non_topmed_AC_asj=0;non_topmed_AN_asj=9998;non_topmed_AF_asj=0.00000e+00;non_topmed_nhomalt_asj=0;controls_AC_nfe_onf=0;controls_AN_nfe_onf=9998;controls_AF_nfe_onf=0.00000e+00;controls_nhomalt_nfe_onf=0;non_neuro_AC=9;non_neuro_AN=208120;non_neuro_AF=4.32443e-05;non_neuro_nhomalt=0;AC_eas_oea=2;AN_eas_oea=14422;AF_eas_oea=1.38677e-04;nhomalt_eas_oea=0;non_topmed_AC_nfe=5;non_topmed_AN_nfe=111662;non_topmed_AF_nfe=4.47780e-05;non_topmed_nhomalt_nfe=0;non_cancer_AC_oth=0;non_cancer_AN_oth=5620;non_cancer_AF_oth=0.00000e+00;non_cancer_nhomalt_oth=0;non_topmed_AC_raw=11;non_topmed_AN_raw=244878;non_topme
d_AF_raw=4.49203e-05;non_topmed_nhomalt_raw=0;non_neuro_AC_nfe_est=0;non_neuro_AN_nfe_est=216;non_neuro_AF_nfe_est=0.00000e+00;non_neuro_nhomalt_nfe_est=0;non_topmed_AC_oth_male=0;non_topmed_AN_oth_male=3186;non_topmed_AF_oth_male=0.00000e+00;non_topmed_nhomalt_oth_male=0;non_cancer_AC_oth_male=0;non_cancer_AN_oth_male=2938;non_cancer_AF_oth_male=0.00000e+00;non_cancer_nhomalt_oth_male=0;AC_nfe_est=0;AN_nfe_est=242;AF_nfe_est=0.00000e+00;nhomalt_nfe_est=0;non_cancer_AC_afr_female=1;non_cancer_AN_afr_female=9230;non_cancer_AF_afr_female=1.08342e-04;non_cancer_nhomalt_afr_female=0;non_topmed_AC_afr_male=0;non_topmed_AN_afr_male=4686;non_topmed_AF_afr_male=0.00000e+00;non_topmed_nhomalt_afr_male=0;AC_eas_male=2;AN_eas_male=9066;AF_eas_male=2.20604e-04;nhomalt_eas_male=0;controls_AC_eas=2;controls_AN_eas=9046;controls_AF_eas=2.21092e-04;controls_nhomalt_eas=0;non_neuro_AC_eas_male=1;non_neuro_AN_eas_male=6574;non_neuro_AF_eas_male=1.52114e-04;non_neuro_nhomalt_eas_male=0;non_cancer_AC_nfe_nwe=4;non_cancer_AN_nfe_nwe=39494;non_cancer_AF_nfe_nwe=1.01281e-04;non_cancer_nhomalt_nfe_nwe=0;controls_AC_sas=1;controls_AN_sas=15688;controls_AF_sas=6.37430e-05;controls_nhomalt_sas=0;non_neuro_AC_sas_male=1;non_neuro_AN_sas_male=23066;non_neuro_AF_sas_male=4.33539e-05;non_neuro_nhomalt_sas_male=0;non_neuro_AC_asj_male=0;non_neuro_AN_asj_male=3126;non_neuro_AF_asj_male=0.00000e+00;non_neuro_nhomalt_asj_male=0;non_cancer_AC_nfe_bgr=0;non_cancer_AN_nfe_bgr=2526;non_cancer_AF_nfe_bgr=0.00000e+00;non_cancer_nhomalt_nfe_bgr=0;controls_AC_oth=0;controls_AN_oth=1914;controls_AF_oth=0.00000e+00;controls_nhomalt_oth=0;non_cancer_AC_eas_female=0;non_cancer_AN_eas_female=8948;non_cancer_AF_eas_female=0.00000e+00;non_cancer_nhomalt_eas_female=0;AC_nfe=5;AN_nfe=113752;AF_nfe=4.39553e-05;nhomalt_nfe=0;non_topmed_AC_female=7;non_topmed_AN_female=111308;non_topmed_AF_female=6.28886e-05;non_topmed_nhomalt_female=0;non_neuro_AC_asj=0;non_neuro_AN_asj=6212;non_neuro_AF_asj=0.00000e+00;non_neuro_nhoma
lt_asj=0;non_topmed_AC_eas_female=0;non_topmed_AN_eas_female=9326;non_topmed_AF_eas_female=0.00000e+00;non_topmed_nhomalt_eas_female=0;non_neuro_AC_raw=9;non_neuro_AN_raw=208136;non_neuro_AF_raw=4.32410e-05;non_neuro_nhomalt_raw=0;non_topmed_AC_eas=2;non_topmed_AN_eas=18388;non_topmed_AF_eas=1.08767e-04;non_topmed_nhomalt_eas=0;non_topmed_AC_fin_male=0;non_topmed_AN_fin_male=11274;non_topmed_AF_fin_male=0.00000e+00;non_topmed_nhomalt_fin_male=0;non_cancer_AC_asj_male=0;non_cancer_AN_asj_male=4976;non_cancer_AF_asj_male=0.00000e+00;non_cancer_nhomalt_asj_male=0;AC_fin=0;AN_fin=21648;AF_fin=0.00000e+00;nhomalt_fin=0;AC_nfe_male=1;AN_nfe_male=63598;AF_nfe_male=1.57238e-05;nhomalt_nfe_male=0;non_topmed_AC_eas_kor=0;non_topmed_AN_eas_kor=3818;non_topmed_AF_eas_kor=0.00000e+00;non_topmed_nhomalt_eas_kor=0;controls_AC_amr_male=0;controls_AN_amr_male=6884;controls_AF_amr_male=0.00000e+00;controls_nhomalt_amr_male=0;non_neuro_AC_eas_oea=1;non_neuro_AN_eas_oea=9448;non_neuro_AF_eas_oea=1.05843e-04;non_neuro_nhomalt_eas_oea=0;AC_sas_female=1;AN_sas_female=7542;AF_sas_female=1.32591e-04;nhomalt_sas_female=0;controls_AC_afr_female=1;controls_AN_afr_female=4240;controls_AF_afr_female=2.35849e-04;controls_nhomalt_afr_female=0;controls_AC_amr=0;controls_AN_amr=17110;controls_AF_amr=0.00000e+00;controls_nhomalt_amr=0;non_topmed_AC_eas_jpn=0;non_topmed_AN_eas_jpn=152;non_topmed_AF_eas_jpn=0.00000e+00;non_topmed_nhomalt_eas_jpn=0;AC_asj_female=0;AN_asj_female=4900;AF_asj_female=0.00000e+00;nhomalt_asj_female=0;non_topmed_AC_nfe_bgr=0;non_topmed_AN_nfe_bgr=2662;non_topmed_AF_nfe_bgr=0.00000e+00;non_topmed_nhomalt_nfe_bgr=0;non_cancer_AC_nfe_est=0;non_cancer_AN_nfe_est=158;non_cancer_AF_nfe_est=0.00000e+00;non_cancer_nhomalt_nfe_est=0;non_neuro_AC_eas=1;non_neuro_AN_eas=13414;non_neuro_AF_eas=7.45490e-05;non_neuro_nhomalt_eas=0;non_cancer_AC_nfe=5;non_cancer_AN_nfe=102740;non_cancer_AF_nfe=4.86665e-05;non_cancer_nhomalt_nfe=0;non_neuro_AC_male=3;non_neuro_AN_male=112470;non_neuro_AF_mal
e=2.66738e-05;non_neuro_nhomalt_male=0;non_neuro_AC_sas_female=1;non_neuro_AN_sas_female=7540;non_neuro_AF_sas_female=1.32626e-04;non_neuro_nhomalt_sas_female=0;AC_asj=0;AN_asj=10080;AF_asj=0.00000e+00;nhomalt_asj=0;controls_AC_nfe_est=0;controls_AN_nfe_est=70;controls_AF_nfe_est=0.00000e+00;controls_nhomalt_nfe_est=0;non_topmed_AC_asj_female=0;non_topmed_AN_asj_female=4854;non_topmed_AF_asj_female=0.00000e+00;non_topmed_nhomalt_asj_female=0;non_cancer_AC_nfe_swe=0;non_cancer_AN_nfe_swe=25290;non_cancer_AF_nfe_swe=0.00000e+00;non_cancer_nhomalt_nfe_swe=0;non_cancer_AC=11;non_cancer_AN=236940;non_cancer_AF=4.64253e-05;non_cancer_nhomalt=0;non_topmed_AC_oth=0;non_topmed_AN_oth=6064;non_topmed_AF_oth=0.00000e+00;non_topmed_nhomalt_oth=0;non_topmed_AC_fin_female=0;non_topmed_AN_fin_female=10372;non_topmed_AF_fin_female=0.00000e+00;non_topmed_nhomalt_fin_female=0;non_cancer_AC_fin_female=0;non_cancer_AN_fin_female=10362;non_cancer_AF_fin_female=0.00000e+00;non_cancer_nhomalt_fin_female=0;AC_oth=0;AN_oth=6140;AF_oth=0.00000e+00;nhomalt_oth=0;non_neuro_AC_nfe_male=1;non_neuro_AN_nfe_male=49724;non_neuro_AF_nfe_male=2.01110e-05;non_neuro_nhomalt_nfe_male=0;controls_AC_female=4;controls_AN_female=51288;controls_AF_female=7.79910e-05;controls_nhomalt_female=0;non_cancer_AC_fin=0;non_cancer_AN_fin=21632;non_cancer_AF_fin=0.00000e+00;non_cancer_nhomalt_fin=0;non_topmed_AC_fin=0;non_topmed_AN_fin=21646;non_topmed_AF_fin=0.00000e+00;non_topmed_nhomalt_fin=0;non_cancer_AC_eas_oea=2;non_cancer_AN_eas_oea=13794;non_cancer_AF_eas_oea=1.44991e-04;non_cancer_nhomalt_eas_oea=0;non_topmed_AC_nfe_female=4;non_topmed_AN_nfe_female=48808;non_topmed_AF_nfe_female=8.19538e-05;non_topmed_nhomalt_nfe_female=0;non_cancer_AC_sas_male=1;non_cancer_AN_sas_male=23032;non_cancer_AF_sas_male=4.34179e-05;non_cancer_nhomalt_sas_male=0;controls_AC_asj_male=0;controls_AN_asj_male=1106;controls_AF_asj_male=0.00000e+00;controls_nhomalt_asj_male=0;non_cancer_AC_raw=11;non_cancer_AN_raw=236958;non_cancer_AF_r
aw=4.64217e-05;non_cancer_nhomalt_raw=0;non_cancer_AC_eas_male=2;non_cancer_AN_eas_male=8744;non_cancer_AF_eas_male=2.28728e-04;non_cancer_nhomalt_eas_male=0;non_topmed_AC_asj_male=0;non_topmed_AN_asj_male=5144;non_topmed_AF_asj_male=0.00000e+00;non_topmed_nhomalt_asj_male=0;non_neuro_AC_oth=0;non_neuro_AN_oth=4866;non_neuro_AF_oth=0.00000e+00;non_neuro_nhomalt_oth=0;AC_male=4;AN_male=135908;AF_male=2.94317e-05;nhomalt_male=0;controls_AC_fin_female=0;controls_AN_fin_female=6474;controls_AF_fin_female=0.00000e+00;controls_nhomalt_fin_female=0;controls_AC_nfe_bgr=0;controls_AN_nfe_bgr=678;controls_AF_nfe_bgr=0.00000e+00;controls_nhomalt_nfe_bgr=0;controls_AC_asj_female=0;controls_AN_asj_female=1214;controls_AF_asj_female=0.00000e+00;controls_nhomalt_asj_female=0;AC_amr_male=0;AN_amr_male=14320;AF_amr_male=0.00000e+00;nhomalt_amr_male=0;AC_amr_female=1;AN_amr_female=20270;AF_amr_female=4.93340e-05;nhomalt_amr_female=0;non_topmed_AC_sas_male=1;non_topmed_AN_sas_male=23072;non_topmed_AF_sas_male=4.33426e-05;non_topmed_nhomalt_sas_male=0;AC_oth_male=0;AN_oth_male=3212;AF_oth_male=0.00000e+00;nhomalt_oth_male=0;non_cancer_AC_sas=2;non_cancer_AN_sas=30524;non_cancer_AF_sas=6.55222e-05;controls_AF_sas_male=0.00000e+00;controls_nhomalt_sas_male=0;non_topmed_AC_sas_female=1;non_topmed_AN_sas_female=7542;non_topmed_AF_sas_female=1.32591e-04;non_topmed_nhomalt_sas_female=0;non_topmed_AC_afr=1;non_topmed_AN_afr=12020;non_topmed_AF_afr=8.31947e-05;non_topmed_nhomalt_afr=0;controls_AC=6;controls_AN=109404;controls_AF=5.48426e-05;AF_popmax=1.08743e-04;nhomalt_popmax=0;non_cancer_nhomalt_sas=0;non_neuro_AC_nfe_seu=0;non_neuro_AN_nfe_seu=10942;non_neuro_AF_nfe_seu=0.00000e+00;non_neuro_nhomalt_nfe_seu=0;non_cancer_AC_eas_kor=0;non_cancer_AN_eas_kor=3774;non_cancer_AF_eas_kor=0.00000e+00;non_cancer_nhomalt_eas_kor=0;non_topmed_AC_afr_female=1;non_topmed_AN_afr_female=7334;non_topmed_AF_afr_female=1.36351e-04;non_topmed_nhomalt_afr_female=0;controls_AC_sas_male=0;controls_AN_sas_male=11
476
# ... rest of file ...
I need to combine a tabix command with an awk command to print $1,$2,$4,$5 and a substring of $8 (only the number after AF_popmax= and the number after nhomalt_popmax=), for example:
# ...
1 11850891 C T AF_popmax=6.53253e-05;nhomalt_popmax=0
1 11850892 G A AF_popmax=1.08743e-04;nhomalt_popmax=0
# ...
I tried this command:
tabix file_input.vcf.bgz 1:11850891-55525202 | awk '{$8=substr(/;AF_popmax=[^;]*/,""); print $1,$2,$4,$5,$8}'
but I only get a 0/1 in column $8:
1 11850891 C T 1
1 11850892 G A 1
Does anybody have a clue?
Thank you very much in advance for any help (other approaches are welcome).
I will try to explain a very generic way to do this, which you can easily adapt if you want to print out other stuff.
Assume you have a string which has a format like this:
key1=value1;key2=value2;key3=value3
and you would like to make a selection or perform some operations on these values, then the easiest way is to store them in an associative array such that we have:
array["key1"] => value1
array["key2"] => value2
array["key3"] => value3
array["key1","full"] => "key1=value1"
array["key2","full"] => "key2=value2"
array["key3","full"] => "key3=value3"
you can use the following function for that:
# str2map(str, fs1, fs2, map)
# Parse a "key1=value1;key2=value2" style string into the associative array map.
#   str  string to parse
#   fs1  separator between entries (handed to split)
#   fs2  separator between a key and its value (handed to split)
#   map  output array; it is cleared, then for every entry:
#          map[key]        = second piece of the entry when split on fs2
#          map[key,"full"] = the whole, unsplit entry
#   n, parts and kv are locals, not arguments.
# Entries are stored from last to first, so when a key is repeated the
# first occurrence in str wins.
function str2map(str,fs1,fs2,map, n,parts,kv) {
  # Split into a scratch array rather than into map itself; otherwise a
  # numeric key such as "2" would share a subscript with the second entry
  # and corrupt or delete it.
  n=split(str,parts,fs1)
  split("",map)
  for (;n>0;n--) {
    split(parts[n],kv,fs2)
    map[kv[1]]=kv[2]; map[kv[1],"full"]=parts[n]
  }
}
And this leads to the following awk program:
# Print columns 1, 2, 4 and 5 plus the AF_popmax and nhomalt_popmax entries
# of the ;-separated key=value list in column 8, for every line of "file".
awk '
# Parse str (entries separated by fs1, key and value separated by fs2) into
# map: map[key] holds the value, map[key,"full"] the whole entry.
# n, parts and kv are locals.
function str2map(str,fs1,fs2,map, n,parts,kv) {
  # Split into a scratch array, not into map itself, so that a numeric key
  # cannot collide with the positional subscripts created by split.
  n=split(str,parts,fs1)
  split("",map)
  for (;n>0;n--) {
    split(parts[n],kv,fs2)
    map[kv[1]]=kv[2]; map[kv[1],"full"]=parts[n]
  }
}
{ str2map($8,";","=",map) }
# A key that is missing from column 8 yields an empty string on its side of
# the ";" in the last output column.
{ print $1,$2,$4,$5,map["AF_popmax","full"] ";" map["nhomalt_popmax","full"] }
' file
This outputs
1 11850891 C T AF_popmax=6.53253e-05;nhomalt_popmax=0
1 11850892 G A AF_popmax=1.08743e-04;nhomalt_popmax=0
The advantage of this method is that you can easily adapt your code to print any other key you are interested in, or even make selections based on this.
Could you please try the following.
# Print columns 1, 2, 4 and 5 followed by "AF_popmax=...;nhomalt_popmax=..."
# taken from the ;-separated key=value list in column 8 of Input_file.
awk '
# Return the "key=value" entry whose name is exactly key in the ;-separated
# list info, or "" when there is none.
function info_entry(info, key,    list) {
  # A leading ";" lets one pattern anchor the key at an entry boundary, so
  # AF_popmax no longer matches inside a longer name like controls_AF_popmax.
  list = ";" info
  if (!match(list, ";" key "=[^;]*")) return ""
  # Skip the ";" that the match starts with.
  return substr(list, RSTART + 1, RLENGTH - 1)
}
{
  # Look both keys up afresh for each record so that a value from an earlier
  # line is never printed for a line that lacks AF_popmax.
  val1 = info_entry($8, "AF_popmax")
  val2 = info_entry($8, "nhomalt_popmax")
  # As before, only lines that carry nhomalt_popmax are printed.
  if (val2 == "") next
  # Join the two entries with ";" to give the requested output format.
  print $1, $2, $4, $5, (val1 == "" ? val2 : val1 ";" val2)
}
' Input_file
Related
awk does not get multiple matches in a line with match
AWK has the match(s, r [, a]) function which according to the manual is capable of recording all occurring patterns into array "a": ...If array a is provided, a is cleared and then elements 1 through n are filled with the portions of s that match the corresponding parenthesized subexpression in r. The 0'th element of a contains the portion of s matched by the entire regular expression r. Subscripts a[n, "start"], and a[n, "length"] provide the starting index in the string and length respectively, of EACH matching substring. I expect that the following line: echo 123412341234 | awk '{match($0,"1",arr); print arr[0] arr[1] arr[2];)' prints 111 But in fact "match" ignores all other matches except the first one. Could someone please tell me what the proper syntax is here to populate "arr" with all occurrences of "1"?
match only finds the first match and stops there. You will have to run match in a loop, or else use this approach, where we split the input on anything that is not 1: echo '123412341234' | awk -F '[^1]+' '{print $1 $2 $3}' 111 Or using split in gnu-awk: echo '123412341234' | awk 'split($0, a, /1/, m) {print m[1] m[2] m[3]}' 111
I would use GNU AWK's patsplit function for that task in the following way. Let the content of file.txt be 123412341234; then awk '{patsplit($0,arr,"1");print arr[1] arr[2] arr[3]}' file.txt gives the output 111 Explanation: patsplit is a function that gives an effect similar to using the FPAT variable: it puts all matches of the 3rd argument found in the string provided as the 1st argument into the array provided as the 2nd argument (clearing the array first if it is not empty). Observe that the 1st match goes under key 1, the 2nd under 2, the 3rd under 3, and so on (there is nothing under 0) (tested in GNU Awk 5.0.1)
If sub is allowed then you can do a substitution here. Try following awk code once. awk '{gsub(/[^1]+/,"")} 1' Input_file
patsplit() is basically same as wrapping the desired regex pattern with a custom pair of SEPs before splitting, which is what anysplit() is emulating here, while being UTF-8 friendly. echo "123\uC350abc:\uF8FF:|\U1F921#xyz" | mawk2x '{ print ("\t\f"($0)"\n")>>(STDERR) anysplit($_, reFLEX_UCode8 "|[[-_!-/3-?]",___=2,__) OFS="\t" for(_ in __) { if (!(_%___)) { printf(" matched_items[ %2d ] = # %-2d = \42%s\42\n", _,_/___,__[_]) } } } END { printf(ORS) }' 123썐abc::|🤡#xyz matched_items[ 2 ] = # 1 = "3썐" matched_items[ 4 ] = # 2 = "::" matched_items[ 6 ] = # 3 = "🤡#" In the background, anysplit() is nothing all that complicated either : xs3pFS is a 3-byte string of \301\032\365 that I assumed would be extremely rare to show up even in binary data. gsub(patRE, xs3pFS ((pat=="&")?"\\":"") "&" xs3pFS,_) gsub(xs3pFS "("xs3pFS")+", "",_) return split(_, ar8, xs3pFS) By splitting the input string in this manner, all the desired items would exist in even-numbered array indices, while the rest of the string would be distributed along odd-numbered indices, somewhat similar to the 2nd array i.e. 4th argument in gawk's split() and patsplit() for the seps, but difference being that both the matches and the seps, whichever way you want to see them, are in the same array. When you print out every cell in the array, you'll see : _SEPS_[ 1 ] = # 1 = "123" matched_items [ 2 ] = # 1 = "썐" _SEPS_[ 3 ] = # 2 = "abc" matched_items [ 4 ] = # 2 = "::" _SEPS_[ 5 ] = # 3 = "|" matched_items [ 6 ] = # 3 = "🤡#" _SEPS_[ 7 ] = # 4 = "xyz"
Generating a new file after processing data in Shell script
The input file which is shown below is generated by performing results of 2 other files i.e awk 'BEGIN{FS=OFS=","} FNR==NR{arr[$0];next} {print $1,$2,$3,$5,($4 in arr)?1:0}' $NGW_REG_RESP_FILE $NGW_REG_REQ_FILE >> $NGW_REG_FILE $NGW_REG_FILE file contains below data based on that i have to create a new file 2020-12-21,18,1,1,1 2020-12-21,18,1,1,0 2020-12-21,18,1,2,1 2020-12-21,18,1,2,1 2020-12-21,18,2,1,1 2020-12-21,18,2,1,1 2020-12-21,18,2,1,0 2020-12-21,18,3,2,1 2020-12-21,18,3,2,1 2020-12-21,18,4,2,0 2020-12-21,18,4,2,1 2020-12-21,18,3,2,0 What this data indicates is: Date,Hour,Quarter,ReqType,Success/failed Reqtype there were 2 possibilities: 1-> incoming 2-> outgoing last field: 1->success 0-> failed Quarter -> 1,2,3,4 I want to read this file and generate a new file that contains data like below (MY OUTPUT FILE): 2020-12-21,18,1,1,1,1 2020-12-21,18,1,2,2,0 2020-12-21,18,2,1,2,1 ..... Explanation: heading: date,hour,quarter,reqType,Success_count,Failure_count (for reference to understand o/p file) Date H Q ReqID SuccessCnt Fail Count 2020-12-21,18,1,1 ,1 ,1 Explanation: in input file for quarter 1 both reqTypes(1&2) were present there will be at max 2 entry in each quarter. in quarter 1 for reqid 1 there were 2 requests, 1 got success and other got failed so 1 as success cnt and 1 as failure cnt 2020-12-21,18,1,2,2,0 here quarter 1 ,for req ID 2 there were 2 requests both got success so success count be 2 and failure count be 0 **UPDATE The answer which is given in the comment is worked exactly what I was looking for. I have some updates in the sample input file, i.e one more columns gets added before the last column i.e the STATUS CODE which you can see in the below input i,e 200,400,300 2020-12-21,18,1,1,200,1 2020-12-21,18,2,1,400,0 2020-12-21,18,2,1,300,0 The existing code gives the below result in the output file: i.e Total count of success/failed in that quarter. Which is Correct. 
What I want to do is add one more column to the output file, next to the total failed count i.e the array holding those status codes. 2020-12-21,18,1,1,1,0,[] //empty array in end bcs there is no failed req,1,success req 2020-12-21,18,2,1,0,2,[400,300] // here 2 failed req,0 success request <DATE>,<HOUR>,<QUARTER>,<REQ_TYPE>,<SUCCESS_COUNT>,<FAIL_CNT>,<ARRAY_HOLDING_STATUSCODE> I have added below changes to the code , Bu not getting how to iterate in side the same for loop `cat $input_file | grep -v Orig | awk -F, '{ if ($NF==1) { map[$1][$2][$3][$4]["success"]++ } else { map[$1][$2][$3][$4]["fail"]++ harish[$1][$2][$3][$4][$5]++ //ADDED THIS } } END { PROCINFO["sorted_in"]="#ind_num_asc"; for (i in map) { for (j in map[i]) { for (k in map[i][j]) { for (l in map[i][j][k]) { print i","j","k","l","(map[i][j][k][l]["success"]==""?"0":map[i][j][k][l]["success"])","(map[i][j][k][l]["fail"]==""?"0":map[i][j][k][l]["fail"]) } } } } }' >> OUTPUT_FILE.txt`
With awk (GNU awk for array sorting): awk -F, '{ if ($NF==1) { map[$1][$2][$3][$4]["success"]++ } else { map[$1][$2][$3][$4]["fail"]++ } } END { PROCINFO["sorted_in"]="#ind_num_asc";for (i in map) { for (j in map[i]) { for (k in map[i][j]) { for (l in map[i][j][k]) { print i","j","k","l","(map[i][j][k][l]["success"]==""?"0":map[i][j][k][l]["success"])","(map[i][j][k][l]["fail"]==""?"0":map[i][j][k][l]["fail"]) } } } } }' $NGW_REG_FILE Explanation: awk -F, '{ if ($NF==1) { map[$1][$2][$3][$4]["success"]++ # If last field is 1, increment a success index in array map with other fields as further indexes } else { map[$1][$2][$3][$4]["fail"]++ # Otherwise increment a fail index } } END { PROCINFO["sorted_in"]="#ind_num_asc"; # Set the array ordering for (i in map) { for (j in map[i]) { for (k in map[i][j]) { for (l in map[i][j][k]) { print i","j","k","l","(map[i][j][k][l]["success"]==""?"0":map[i][j][k][l]["success"])","(map[i][j][k][l]["fail"]==""?"0":map[i][j][k][l]["fail"]) # Loop through the array and print the data in the format required. If there is no entry in the success or fail index, print 0. } } } } }' $NGW_REG_FILE
How to return 0 if awk returns null from processing an expression?
I currently have a awk method to parse through whether or not an expression output contains more than one line. If it does, it aggregates and prints the sum. For example: someexpression=$'JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)' might be the one-liner where it DOESN'T yield any information. Then, echo "$someexpression" | awk ' NR>1 {a[$4]++} END { for (i in a) { printf "%d\n", a[i] } }' this will yield NULL or an empty return. Instead, I would like to have it return a numeric value of $0$ if empty. How can I modify the above to do this?
Nothing in UNIX "returns" anything (despite the unfortunately named keyword for setting the exit status of a function), everything (tools, functions, scripts) outputs X and exits with status Y. Consider these 2 identical functions named foo(), one in C and one in shell: C (x=foo() means set x to the return code of foo()): foo() { printf "7\n"; // this is outputting 7 from the full program return 3; // this is returning 3 from this function } x=foo(); <- 7 is output on screen and x has value '3' shell (x=foo means set x to the output of foo()): foo() { printf "7\n"; # this is outputting 7 from just this function return 3; # this is setting this functions exit status to 3 } x=foo <- nothing is output on screen, x has value '7', and '$?' has value '3' Note that what the return statement does is vastly different in each. Within an awk script, printing and return codes from functions behave the same as they do in C but in terms of a call to the awk tool, externally it behaves the same as every other UNIX tool and shell script and produces output and sets an exit status. So when discussing anything in UNIX avoid using the term "return" as it's imprecise and ambiguous and so different people will think you mean "output" while others think you mean "exit status". In this case I assume you mean "output" BUT you should instead consider setting a non-zero exit status when there's no match like grep does, e.g.: echo "$someexpression" | awk ' NR>1 {a[$4]++} END { for (i in a) { print a[i] } exit (NR < 2) }' and then your code that uses the above can test for the success/fail exit status rather than testing for a specific output value, just like if you were doing the equivalent with grep. You can of course tweak the above to: echo "$someexpression" | awk ' NR>1 {a[$4]++} END { if ( NR > 1 ) { for (i in a) { print a[i] } } else { print "$0$" exit 1 } }' if necessary and then you have both a specific output value and a success/fail exit status.
You may keep a flag inside for loop to detect whether loop has executed or not: echo "$someexpression" | awk 'NR>1 { a[$4]++ } END { for (i in a) { p = 1 printf "%d\n", a[i] } if (!p) print "$0$" }' $0$
gsub for substituting translations not working
I have a dictionary dict with records separated by ":" and data fields by new lines, for example: :one 1 :two 2 :three 3 :four 4 Now I want awk to substitute all occurrences of each record in the input file, e.g. onetwotwotwoone two threetwoone four My first awk script looked like this and works just fine: BEGIN { RS = ":" ; FS = "\n"} NR == FNR { rep[$1] = $2 next } { for (key in rep) gsub(key,rep[key]) print } giving me: 12221 2 321 4 Unfortunately another dict file contains some characters used by regular expressions, so I have to substitute escape characters in my script. By moving key and rep[key] into a string (which can then be parsed for escape characters), the script will only substitute the second record in the dict. Why? And how can I solve it? Here's the current second part of the script: { for (key in rep) orig=key trans=rep[key] gsub(/[\]\[^$.*?+{}\\()|]/, "\\\\&", orig) gsub(orig,trans) print } All scripts are run by awk -f translate.awk dict input Thanks in advance!
Your fundamental problem is using strings in regexp and backreference contexts when you don't want them and then trying to escape the metacharacters in your strings to disable the characters that you're enabling by using them in those contexts. If you want strings, use them in string contexts, that's all. You won't want this: gsub(regexp,backreference-enabled-string) You want something more like this: index(...,string) substr(string) I think this is what you're trying to do: $ cat tst.awk BEGIN { FS = ":" } NR == FNR { if ( NR%2 ) { key = $2 } else { rep[key] = $0 } next } { for ( key in rep ) { head = "" tail = $0 while ( start = index(tail,key) ) { head = head substr(tail,1,start-1) rep[key] tail = substr(tail,start+length(key)) } $0 = head tail } print } $ awk -f tst.awk dict file 12221 2 321 4
Never mind for asking.... Just some missing parentheses...?! { for (key in rep) { orig=key trans=rep[key] gsub(/[\]\[^$.*?+{}\\()|]/, "\\\\&", orig) gsub(orig,trans) } print } works like a charm.
How to detect the last line in awk before END?
I'm trying to concatenate String values and print them, but if the last types are Strings and there is no change of type then the concatenation won't print: input.txt: String 1 String 2 Number 5 Number 2 String 3 String 3 awk: awk ' BEGIN { tot=0; ant_t=""; } { t = $1; val=$2; #if string, concatenate its value if (t == "String") { tot+=val; nx=1; } else { nx=0; } #if type change, add tot to res if (t != "String" && ant_t == "String") { res=res tot; tot=0; } ant_t=t; #if string, go next if (nx == 1) { next; } res=res"\n"val; } END { print res; }' input.txt Current output: 3 5 2 Expected output: 3 5 2 6 How can I detect if awk is reading last line, so if there won't be change of type it will check if it is the last line?
awk reads line by line, so it cannot tell whether it is reading the last line. The END block is useful for performing actions once the end of the file has been reached. To do what you expect: awk '/String/{sum+=$2} /Number/{if(sum) print sum; sum=0; print $2} END{if(sum) print sum}' will produce the output 3 5 2 6 What does it do? /String/ selects lines that match String, and /Number/ likewise selects lines that match Number. sum+=$2 accumulates the values of the String lines. When a Number line occurs, the sum is printed and reset to zero.
Like this maybe: awk -v lines="$(wc -l < /etc/hosts)" 'NR==lines{print "LAST"};1' /etc/hosts I am pre-calculating the number of lines (using wc) and passing that into awk as a variable called lines, if that is unclear.
Just change last line to: END { print res; print tot;}'
awk '$1~"String"{x+=$2;y=1}$1~"Number"{if (y){print x;x=0;y=0;}print $2}END{if(y) print x}' file Explanation y is used as a boolean, and I check at the END if the last pattern was a string and print the sum You can actually use x as the boolean like nu11p01n73R does which is smarter Test $ cat file String 1 String 2 Number 5 Number 2 String 3 String 3 $ awk '$1~"String"{x+=$2;y=1}$1~"Number"{if (y){print x;x=0;y=0;}print $2}END{if(y) print x}' file 3 5 2 6