I want fields 1,2,3,5
With cut I do:
cut -f1-3,5
However with awk I would do:
# Print fields 1-5 except field 4, one per line.
# The loop starts at 1: $0 is the whole record, not a field, so starting at 0
# would print the entire line before the selected fields.
awk '{for (i=1;i<=5;i++) {if (i!=4) {print $i}} }'
But I want to make it more succinct. Moreover, in other cases I could have more fields, with varying gaps between them. awk '{for (i in 1 2 3 5) {print $i}}' doesn't work. How can I do this?
For picking fields by position number and by ranges of positions, cut is the better tool. If you really want to mimic this behavior in awk (assuming you have other tasks to do in awk as well), you may consider the following code:
cat fcut.awk
# fcut.awk -- print the fields selected by the variable f (for example
# f='1-3,5'), in input order, joined by OFS.
BEGIN {
    # f is a comma-separated list of single field numbers and lo-hi ranges.
    nspec = split(f, spec, /,/)
    for (k = 1; k <= nspec; k++) {
        if (split(spec[k], bounds, /-/) != 2) {
            # Single field number: mark it as wanted.
            want[spec[k]]
            continue
        }
        # lo-hi range: mark every field number in it as wanted.
        for (n = bounds[1]; n <= bounds[2]; n++)
            want[n]
    }
}
{
    out = ""
    emitted = 0
    for (k = 1; k <= NF; k++) {
        if (!(k in want))
            continue
        # Prefix OFS only once something has already been collected.
        out = (emitted++ ? out OFS : "") $k
    }
    print out
}
Now run it as:
awk -v f='1-3,5' -f fcut.awk file
This does what cut does and a bit more:
$ echo 'a b c d e f g' |
awk -v ranges='1-3,5' '
# Expand the comma-separated list of field numbers and lo-hi ranges in "ranges"
# into an ordered list of field numbers, keeping order and duplicates.
BEGIN {
    numSpecs = split(ranges, specs, /,/)
    for (s = 1; s <= numSpecs; s++) {
        # A bare number splits into one piece, so lo and hi are the same element.
        numEnds = split(specs[s], ends, /-/)
        for (fldNr = ends[1]; fldNr <= ends[numEnds]; fldNr++)
            outFlds[++numOut] = fldNr
    }
}
# Print the listed fields OFS-separated, one output line per input record.
{
    for (o = 1; o <= numOut; o++)
        printf "%s%s", $(outFlds[o]), (o < numOut ? OFS : ORS)
}
'
a b c e
The above assumes if you specify the same field number multiple times then you want it printed that many times, and you want the fields printed in the order you specify so you can, for example, rearrange order and/or duplicate fields, e.g.:
$ echo 'a b c d e f g' |
awk -v ranges='6,1-3,5,2,1' '
# Expand the comma-separated list of field numbers and lo-hi ranges in "ranges"
# into an ordered list of field numbers, keeping order and duplicates.
BEGIN {
    numSpecs = split(ranges, specs, /,/)
    for (s = 1; s <= numSpecs; s++) {
        # A bare number splits into one piece, so lo and hi are the same element.
        numEnds = split(specs[s], ends, /-/)
        for (fldNr = ends[1]; fldNr <= ends[numEnds]; fldNr++)
            outFlds[++numOut] = fldNr
    }
}
# Print the listed fields OFS-separated, one output line per input record.
{
    for (o = 1; o <= numOut; o++)
        printf "%s%s", $(outFlds[o]), (o < numOut ? OFS : ORS)
}
'
f a b c e b a
I have a very large variant-calling data set, and I cannot pull out the result I want.
here is an example
bac1 bac2 bac3 bac4
1 0 0 1
Now I want to drop the columns that contain 0 using the ubuntu command-line. The result would be like this
bac1 bac4
1 1
I tried this
# On record 2, Take[i] is set to whether field i differs from pat. Every record
# then prints the fields whose Take[i] is true, each followed by FS, and ends
# the line with print "". Record 1 is processed before Take is filled, so none
# of its fields are printed.
# NOTE(review): printf is given the field text itself as its format string.
awk -F "\t" -v "pat=0\t" 'NR == 2 {for (i=1; i <= NF; i++) Take[i] = (pat != $i)}{for (i =1; i <= NF; i++) if (Take [i]) printf $i FS; print ""}'
And the output is this:
NC_045512.2 18876 NC_045512.2_18876_T_C T C . PASS GT 1
Header of this output is:
#CHROM POS ID REF ALT QUAL FILTER FORMAT EPI_ISL_422804
So the final output had to be like this:
#CHROM POS ID REF ALT QUAL FILTER FORMAT EPI_ISL_422804
NC_045512.2 18876 NC_045512.2_18876_T_C T C . PASS GT 1
The file is not always 2 lines but at most it can be 4 lines.
It does not return the header line; that's because I used NR == 2. Is there any way I can get the header line as well?
If your input file always only has 1 data line as in your example then:
$ cat tst.awk
# Keep only the columns whose value on the data line is not 0: print the
# matching header names first, then the data values.
BEGIN { FS = "\t"; OFS = FS }

# Header line: remember the column names by position.
NR == 1 {
    split($0, names)
    next
}

{
    # Collect the positions of the non-zero columns.
    for (c = 1; c <= NF; c++)
        if ($c != 0)
            keep[++numKeep] = c

    # Build the header row and the data row side by side, then print both.
    hdrRow = dataRow = ""
    for (k = 1; k <= numKeep; k++) {
        sep = (k < numKeep ? OFS : ORS)
        hdrRow = hdrRow names[keep[k]] sep
        dataRow = dataRow $(keep[k]) sep
    }
    printf "%s%s", hdrRow, dataRow
}
.
$ awk -f tst.awk file
bac1 bac4
1 1
otherwise if your input can have more than 1 data line then you need a 2-pass approach:
$ cat tst.awk
# Two-pass filter; the same file is named twice on the command line.
# Pass 1 notes every column holding a 0 on any data line; pass 2 prints each
# line (header included) without those columns.
BEGIN { FS = "\t"; OFS = FS }

# Pass 1: flag zero-valued columns, skipping the header line.
NR == FNR {
    for (c = 1; NR > 1 && c <= NF; c++)
        if ($c == 0)
            hasZero[c]
    next
}

# Pass 2, first line: list the surviving columns in order.
FNR == 1 {
    for (c = 1; c <= NF; c++)
        if (!(c in hasZero))
            keep[++numKeep] = c
}

# Pass 2, every line: print only the surviving columns.
{
    for (k = 1; k <= numKeep; k++)
        printf "%s%s", $(keep[k]), (k < numKeep ? OFS : ORS)
}
.
$ awk -f tst.awk file file
bac1 bac4
1 1
Long version with if:
awk '
# Line 1: remember the header names by column position.
NR == 1 {
    split($0, hdr, FS)
}
# Line 2: print the names, then the values, of the columns that are not 0.
NR == 2 {
    sep = ""
    for (i = 1; i <= NF; i++) {
        if ($i == 0)
            continue
        printf("%s%s", sep, hdr[i])
        sep = OFS
    }
    print ""

    sep = ""
    for (i = 1; i <= NF; i++) {
        if ($i == 0)
            continue
        printf("%s%s", sep, $i)
        sep = OFS
    }
    print ""
}' FS='\t' OFS="\t" file
One line:
awk 'NR==1{split($0,hdr,FS)} NR==2{sep=""; for(i=1;i<=NF;i++) if($i!=0) {printf("%s%s",sep,hdr[i]); sep=OFS} print ""; sep=""; for(i=1;i<=NF;i++) if($i!=0) {printf("%s%s",sep,$i); sep=OFS} print ""}' FS='\t' OFS="\t" file
Output:
bac1 bac4
1 1
I have my array:
array = [1:"PLCH2", 2:"PLCH1", 3:"PLCH2"]
I want to loop over array to create a new array, unique, containing only the unique values, and obtain:
unique = [1:"PLCH2", 2:"PLCH1"]
how can I achieve that ?
EDIT: as per @Ed Morton's request, I show below how my array is populated. In fact, this post is the key to solving my previous post.
in my file.txt, I have:
PLCH2:A1007int&PLCH1:D987int&PLCH2:P977L
INTS11:P446P&INTS11:P449P&INTS11:P518P&INTS11:P547P&INTS11:P553P
I use split to obtain array:
awk '
# For each "&"-separated item on the line, store the text before the first ":"
# in array[], indexed by the position of the item.
{
    n = split($0, items, "&")
    for (k = 1; k <= n; k++) {
        split(items[k], parts, ":")
        array[k] = parts[1]
    }
}' file.txt
This might be what you're trying to do:
$ cat tst.awk
# Demo: fill array[], copy its distinct values into unique[] in order of
# first appearance, and print both arrays.
BEGIN {
    n = split("PLCH2 PLCH1 PLCH2", array)

    line = "array ="
    for (k = 1; k <= n; k++)
        line = line sprintf(" %s:\"%s\"", k, array[k])
    print line

    # seen[] holds every value already copied, so repeats are skipped.
    for (k = 1; k <= n; k++) {
        if (array[k] in seen)
            continue
        seen[array[k]]
        unique[++numUnique] = array[k]
    }

    line = "unique ="
    for (k = 1; k <= numUnique; k++)
        line = line sprintf(" %s:\"%s\"", k, unique[k])
    print line
}
$ awk -f tst.awk
array = 1:"PLCH2" 2:"PLCH1" 3:"PLCH2"
unique = 1:"PLCH2" 2:"PLCH1"
EDIT: given your updated question, here's how I'd really approach that:
$ cat tst.awk
# For every input line of name:value pairs joined by "&", print the names
# (vals) and the distinct names in order of first appearance (uniq).
BEGIN { FS="[:&]" }
{
    # vals[], uniq[] and seen[] are rebuilt for every line, so clear them first.
    # Without this, elements left over from an earlier line with more entries
    # would still be printed by arr2str(), which walks the array until the
    # first missing index.
    delete vals
    delete uniq
    delete seen

    # Fields alternate name, value, name, value, ...: keep the names.
    numVals=0
    for (i=1; i<NF; i+=2) {
        vals[++numVals] = $i
    }
    print "vals =" arr2str(vals)

    # Copy each distinct name once.
    numUniq=0
    for (i=1; i<=numVals; i++) {
        if ( !seen[vals[i]]++ ) {
            uniq[++numUniq] = vals[i]
        }
    }
    print "uniq =" arr2str(uniq)
}

# Return arr[1], arr[2], ... formatted as a string of index:"value" pairs,
# each preceded by a space. str and i are local variables.
function arr2str(arr, str, i) {
    for (i=1; i in arr; i++) {
        str = str sprintf(" %s:\"%s\"", i, arr[i])
    }
    return str
}
$ awk -f tst.awk file
vals = 1:"PLCH2" 2:"PLCH1" 3:"PLCH2"
uniq = 1:"PLCH2" 2:"PLCH1"
vals = 1:"INTS11" 2:"INTS11" 3:"INTS11" 4:"INTS11" 5:"INTS11"
uniq = 1:"INTS11"
I have these two csv files:
File A:
veículo;carro;sust
automóvel;carro;sust
viatura;carro;sust
breve;rápido;adj
excepcional;excelente;adj
maravilhoso;excelente;adj
amistoso;simpático;adj
amigável;simpático;adj
...
File B:
"A001","carro","sust","excelente","adj","ocorrer","adv","bom","adj"
...
In file A, $1 (word) is a synonym for $2 (word), and $3 is the part of speech.
In the lines of the file B we can skip $1,the remaining columns are words and their part of speech.
What I need to do is to look up, line by line, each pair (word-pos) in file A and generate a line for each synonym. It is difficult to explain.
Desired Output:
"A001","carro","sust","excelente","adj","ocorrer","adv","bom","adj"
"A001","viatura","sust","excelente","adj","ocorrer","adv","bom","adj"
"A001","veículo","sust","excelente","adj","ocorrer","adv","bom","adj"
"A001","automóvel","sust","excelente","adj","ocorrer","adv","bom","adj"
"A001","carro","sust","excepcional","adj","ocorrer","adv","bom","adj"
"A001","viatura","sust","excepcional","adj","ocorrer","adv","bom","adj"
"A001","veículo","sust","excepcional","adj","ocorrer","adv","bom","adj"
"A001","automóvel","sust","excepcional","adj","ocorrer","adv","bom","adj"
"A001","carro","sust","maravilhoso","adj","ocorrer","adv","bom","adj"
"A001","viatura","sust","maravilhoso","adj","ocorrer","adv","bom","adj"
"A001","veículo","sust","maravilhoso","adj","ocorrer","adv","bom","adj"
"A001","automóvel","sust","maravilhoso","adj","ocorrer","adv","bom","adj"
Done:
# Asker's attempt: the first input file supplies synonym/word/pos triples, and
# each line of the following input is expanded with those synonyms.
BEGIN {
# Split fields on either "," or ";".
FS="[,;]";
OFS=";";
}
# First input file: remember each triple as one "synonym,word,pos" key.
FNR==NR{
sinonim[$1","$2","$3]++;
next;
}
# Later input: for every word/pos pair that has a synonym, build a copy of the
# line with the word replaced and append it to lineF.
{
s1=split($0,AX,"\n");
for (i=1;i<=s1;i++)
{
# BX holds the comma-separated pieces of the line, quotes included.
s2=split(AX[i],BX,",");
# Pairs start at position 2 and are two pieces wide.
for (j=2;j<=NF;j+=2)
{
# Compare the pair without its quotes.
lineX=BX[j]","BX[j+1];
gsub(/\"/,"",lineX);
for (item in sinonim)
{
# CX[1] is the synonym, CX[2] the word, CX[3] the part of speech.
s3=split(item,CX,",");
lineS=CX[2]","CX[3];
if (lineX == lineS)
{
# Replace the word in place (the synonym is stored without quotes),
# then rebuild the line; every piece is followed by ",".
BX[j]=CX[1];
lineD=""
for (t=1;t<=s2;t++)
{
lineD=lineD BX[t]",";
}
lineF=lineF lineD"\n";
}
}
}
}
# NOTE(review): lineF is never reset, so it keeps growing across input lines.
print lineF
}
$ cat tst.awk
# Print every variant of each line of the second file obtained by replacing
# words with their synonyms from the first file; each distinct line is printed
# once. Uses GNU awk arrays of arrays.
BEGIN { FS=";" }
# First file: under the (word, pos) key, record the word itself and its synonym.
NR==FNR { synonyms[$2,$3][$2]; synonyms[$2,$3][$1]; next }
# First line of the second file: switch the input and output separator to
# quote-comma-quote and re-split the current line with it.
FNR==1 { FS=OFS="\",\""; $0=$0 }
{
# Remove the quote at the start and at the end of the line.
gsub(/^"|"$/,"")
# Walk the (word, pos) pairs; they start at field 2.
for (i=2;i<NF;i+=2) {
if ( ($i,$(i+1)) in synonyms) {
for (synonym in synonyms[$i,$(i+1)]) {
# Put this alternative in place, then try every alternative of every pair.
$i = synonym
for (j=2;j<NF;j+=2) {
if ( ($j,$(j+1)) in synonyms) {
for (synonym in synonyms[$j,$(j+1)]) {
# Save the record, substitute, print the result if it is new, then restore.
orig = $0
$j = synonym
if (!seen[$0]++) {
print "\"" $0 "\""
}
$0 = orig
}
}
}
}
}
}
}
.
$ awk -f tst.awk fileA fileB
"A001","carro","sust","excelente","adj","ocorrer","adv","bom","adj"
"A001","veículo","sust","excelente","adj","ocorrer","adv","bom","adj"
"A001","automóvel","sust","excelente","adj","ocorrer","adv","bom","adj"
"A001","viatura","sust","excelente","adj","ocorrer","adv","bom","adj"
"A001","carro","sust","maravilhoso","adj","ocorrer","adv","bom","adj"
"A001","carro","sust","excepcional","adj","ocorrer","adv","bom","adj"
"A001","veículo","sust","maravilhoso","adj","ocorrer","adv","bom","adj"
"A001","veículo","sust","excepcional","adj","ocorrer","adv","bom","adj"
"A001","automóvel","sust","maravilhoso","adj","ocorrer","adv","bom","adj"
"A001","automóvel","sust","excepcional","adj","ocorrer","adv","bom","adj"
"A001","viatura","sust","maravilhoso","adj","ocorrer","adv","bom","adj"
"A001","viatura","sust","excepcional","adj","ocorrer","adv","bom","adj"
The above uses GNU awk for multi-dimensional arrays, with other awks it's a simple tweak to use synonyms[$2,$3] = synonyms[$2,$3] " " $2 etc. or similar and then split() later instead of synonyms[$2,$3][$2] and in.
# Pass 1 (first file): for each word in field 2, collect the synonyms found in
# field 1. Pass 2 (later files): print each line, then print it once more per
# synonym of the word in field 2, with that word replaced.
BEGIN {
    FS = "[,;]"
    OFS = ","
}

NR == FNR {
    # Key by the quoted word so it matches field 2 of the quoted file.
    quoted = "\"" $2 "\""
    synonym[quoted] = synonym[quoted] "," $1
    next
}

{
    print
    if (!($2 in synonym))
        next
    # Drop the leading "," before splitting the list on FS.
    total = split(substr(synonym[$2], 2), alts)
    for (n = 1; n <= total; n++) {
        $2 = "\"" alts[n] "\""
        print
    }
}