Writing multiple awk programs in a single file [closed]

Writing multiple awk programs in a single file [closed] - awk

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 4 days ago.
Improve this question
I have been writing some awk scripts. For each type of file I have a different implementation of what needs to be done. Would it be possible to have everything in a single file, and if so, can I see an example of how such a thing can be done?
Currently I am using different awk files.
For instance, I have this in one file
# Print every input line, highlighting those that match the regex in `prl`.
# `prl` is not assigned anywhere in this snippet; presumably it is supplied
# on the command line with `-v prl=...` -- TODO confirm.
BEGIN {
    # The terminal sequences never change between records, so query tput
    # once here instead of spawning it twice for every input line.
    kmd = "tput sgr0"
    rst = ( (kmd | getline outp) > 0 ? outp : "<" "sgr0" ">" )
    close(kmd)

    kmd = "tput bold; tput setaf 196"
    # NOTE(review): `knam` is not set in this snippet, so the fallback marker
    # is "<>" unless the caller provides it with -v.
    frg = ( (kmd | getline outp) > 0 ? outp : "<" knam ">" )
    close(kmd)
}

{
    # match() returns the position of the first match, or 0 when none.
    hit = match($0, prl)
    if (hit) {
        printf("%s%s%s\n", frg, $0, rst)
    } else {
        printf("%s\n", $0)
    }
}
And this in another file
# Colour whole sections of the input: a line whose first field is a known
# "Name:" label switches a colour on, a line containing "Rst:" switches it
# off, and neither kind of control line is printed.
BEGIN {
kl=0
# Sequence that resets all text attributes; a visible "<sgr0>" marker is
# used instead when the tput pipe yields nothing.
kmd="tput sgr0"
rst = ( (kmd | getline outp) > 0 ? outp : "<" "sgr0" ">" )
close(kmd)
# Flat list of "label colour-number" pairs, built in two pieces.
ka = "Wht 15 Grn 34 Blu 39 Ylw 11 Red 196 Amb 214"
kb = "Cyn 51 Mgn 201 Syp 39 Code 39"
ks = sprintf("%s %s", ka, kb)
n = split(ks, kaggr) # split makes kaggr start at index 1
# Walk the pairs: the odd index is the label, the next index its number.
for ( i=1; i<n; i+=2 ) {
knam = kaggr[i] ":"
knum = kaggr[i+1]
kmd=sprintf("%s%d\n", "tput bold; tput setaf ", knum)
# Store the sequence under "Label:"; fall back to "<Label:>" when the
# tput pipe yields nothing.
tseq[knam] = ( (kmd | getline outp) > 0 ? outp : "<" knam ">" )
close(kmd)
}
}
## Detect keyword in index of array tseq, tseq[indx].
($1 in tseq) { kl=1 ; ctp=$1 ; next }
/Rst:/ { kl=0 ; next } # Reset colour rendition
kl { print tseq[ctp] $0 rst } # Print with specified colour
!kl { print } # Use terminal text colour

Related

Writing from lines into columns based on third column

I have files that look like this -- already sorted by year (inside years sorted by id, which appears to be equivalent to strict sorting by id, but this may not always apply).
ID,COU,YEA, VOT
1,USA,2000,1
2,USA,2000,0
3,USA,2001,1
4,USA,2003,2
5,USA,2003,0
I would like to rewrite them like this (ids for year N after 1999 in column 2N-1, corresponding votes in column 2N):
2000 IDS, VOTE, 2001 IDS, VOTE, 2002 IDS, VOTE, 2003 IDS, VOTE
1,1,3,1, , ,4,2
2,0, , , , ,5,0
I don't know how to do it. My basic thinking with awk was:
if $3 == 2000, { print $1, $4 }
if $3 == 2001, { print " "," ", $1, $4 } etc
But there are two problems:
this way the columns for years other than 2000 would start with a lot of empty lines
I have found no intelligent way to generalise the print command, so I would have to write 20 if-statements
The only working idea I have is, to create 20 unneeded files and glue them with paste which I have never used, but which seems suitable, according to man on my system.

The key is to use multidimensional arrays
# transpose.awk -- lay the ID/VOTE pairs out side by side, one column pair
# per year, covering every year between the smallest and largest seen.
BEGIN { FS = "," }

# The first line is the column header and carries no data.
NR == 1 { next }

{
    yr = $3

    # The first data record seeds both bounds; later ones widen them.
    if (NR == 2) {
        lo = hi = yr
    } else {
        if (yr < lo) lo = yr
        if (yr > hi) hi = yr
    }

    # Position of this record within its year (1-based).
    row = ++perYear[yr]
    ids[yr, row]   = $1
    votes[yr, row] = $4

    if (row > tallest) tallest = row
}

END {
    # Header row: one "<year> ID,VOTE" pair per year.
    delim = ""
    y = lo
    while (y <= hi) {
        printf "%s%d ID,VOTE", delim, y
        delim = ","
        y++
    }
    print ""

    # Data rows: years with fewer entries contribute empty cells.
    for (row = 1; row <= tallest; row++) {
        line = ""
        delim = ""
        for (y = lo; y <= hi; y++) {
            line = line delim ids[y, row] "," votes[y, row]
            delim = ","
        }
        print line
    }
}
Then,
$ awk -f transpose.awk input_file
2000 ID,VOTE,2001 ID,VOTE,2002 ID,VOTE,2003 ID,VOTE
1,1,3,1,,,4,2
2,0,,,,,5,0
If you really want the spaces in the output, change the last printf to
printf "%s%s,%s", sep,
((y, i) in id ? id[y, i] : " "),
((y, i) in vote ? vote[y, i] : " ")

This is functionally the same as @Glenn's and no better than it in any way, so his should remain the accepted answer, but I came up with it before looking at his and thought it might be useful to post it anyway to show some small alternatives in style and implementation details:
$ cat tst.awk
# Rewrite "ID,COU,YEA,VOT" rows into one IDS/VOTE column pair per year.
# The first and last data rows are taken as the first and last year, so the
# input is expected to be sorted by year.
BEGIN { FS=OFS="," }
# Skip the header line.
NR == 1 { next }
{
id = $1
year = $3
votes = $4
# numYears counts data records; the very first one fixes the start year.
if ( ++numYears == 1 ) {
begYear = year
}
# Overwritten on every record, so it ends up holding the last row's year.
endYear = year
# numIds[year] is the running number of rows seen for this year.
yearIds[year,++numIds[year]] = id
yearVotes[year,numIds[year]] = votes
# Largest number of rows in any single year = number of output rows.
maxIds = (numIds[year] > maxIds ? numIds[year] : maxIds)
}
END {
# Header: a pair per year, separated by OFS and terminated by ORS.
for (year=begYear; year<=endYear; year++) {
printf "%s IDS%sVOTE%s", year, OFS, (year<endYear ? OFS : ORS)
}
for (idNr=1; idNr<=maxIds; idNr++) {
for (year=begYear; year<=endYear; year++) {
# Cells with no data are shown as a single space.
id = votes = " "
if ( (year,idNr) in yearIds ) {
id = yearIds[year,idNr]
votes = yearVotes[year,idNr]
}
printf "%s%s%s%s", id, OFS, votes, (year<endYear ? OFS : ORS)
}
}
}
$ awk -f tst.awk file
2000 IDS,VOTE,2001 IDS,VOTE,2002 IDS,VOTE,2003 IDS,VOTE
1,1,3,1, , ,4,2
2,0, , , , ,5,0

With respect and permission of glenn jackman I am taking his suggested code the only thing I am trying to add here is get maximum and minimum year in awk's variable itself and NOT calculating it inside main block of awk program, since OP confirmed that Input_file is sorted by year. Answers by Glenn and Ed sir are awesome, just thought to add a variant here.
BTW, we could also use awk instead of tail and head to set these variables :)
# Input_file is sorted by year, so the first data row holds the smallest
# year and the last row the largest; both are handed to awk as variables.
# The command substitutions are quoted so that an empty result or stray
# whitespace cannot split into extra awk arguments.
awk -v max="$(tail -n 1 Input_file | cut -d, -f3)" \
    -v min="$(head -n 2 Input_file | tail -n 1 | cut -d, -f3)" '
BEGIN { FS = "," }
# Collect ids and votes per year, numbered by order of appearance.
NR > 1 {
    year = $3
    count[year]++
    id[year, count[year]] = $1
    vote[year, count[year]] = $4
    # The busiest year decides how many output rows are needed.
    if (count[year] > maxCount) maxCount = count[year]
}
END {
    # Header row: one "<year> ID,VOTE" pair per year from min to max.
    sep = ""
    for (y = min; y <= max; y++) {
        printf "%s%d ID,VOTE", sep, y
        sep = ","
    }
    print ""
    # Data rows: years without an i-th entry produce empty cells.
    for (i = 1; i <= maxCount; i++) {
        sep = ""
        for (y = min; y <= max; y++) {
            printf "%s%s,%s", sep, id[y, i], vote[y, i]
            sep = ","
        }
        print ""
    }
}' Input_file

Faster algorithm/language for string replacement [closed]

Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
This question does not appear to be about programming within the scope defined in the help center.
Closed 2 years ago.
Improve this question
EDIT:
Method 3 provided below is way faster by testing, reduce the estimated runtime from 2-3 days to < 1 day.
I had a sample file with a long string >50M like this.
CACTGCTGTCACCCTCCATGCACCTGCCCACCCTCCAAGGATCNNNNNNNCACTGCTGTCACCCTCCATGCACCTGCCCACCCTCCAAGGATCaagctCCgaTNNNNNNNNNNNNGgtgtgtatatatcatgtgtgGCCCTAGGGCCCTAGGGCCCTAtgtgtgGCCCTAGGGCtgtgtgGCCCTAGGGCGGatgtgtggtgtgtggggttagggttagggttaNNNNNNNNNNNCCCTCCAAGGATCaagctCCgaTNNNNNNNNNNNNGgtgtgtatataGCCCTAGGtcatgtgtgatgtgtggtgtgtggggttagggttagggttaNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCCCTAGGNNNNNNNGCCCTAGGNNNNNNNNNNNNNNAGGGCCCTAGGGCCCTAtgtgtgGCCCTAGGGCtgtgtgGCCCTAGGGCGGagtatatatcatgtgtgatgtgttggggtNNNNNNGgtgtgtatatatcatagggAGGGCCCTAGGGCCCTAtgtgtgGCCCTAGGGCtgtgtgGCCCTAGGGCGGagtatatatcatgtgtgatgtgtggtgtgggtgtgtggggttagggAGGGCCCTAGGGCCCTAtgtgtgGCCCTAGGGCtgtgtgGCCCTAGGGCGGagtatatatcatgtgtgatgtggtgtgtggggttagggttagggttaNNNNNNNNNNNNtgttgttttattttcttacaggtggtgtgtggggttagggttagggttaNNNNNNNNNNNCCCTCCAAGGATCaagctCCgaTNNNNNNNNNNNNGgtgtgtatatatcatgtAGCCCTAGGGatgtgtggtgtgtggggttagggttagggttaNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNttgtggtgtgtggtgNNNNNAGGGCtggtgtgtggggttagggAtagggAGGGCCCTAGGGCCCTAtgtgtgGCCCTAGGGCtgtgtgGCCCTAGGGCGGagtatatatcatgtgtgatgtgtggtgtgtggggGGGCCCTAGGGCCCTAtgtgtgGCCCTAGGGCtgtgtgGCCCTAGGGCGGagtatatatcatgtgtgatgtgtggtgtgtggggttagggNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGaggcatattgatcCCCTCCAAGGATCaagctCCgaTNNNNNNNggttagggttNNNNNGgtgtCCCTAGGGCCCTAGGGCCCTAtgtgtgGCCCTAGGGCtgtgtgGCCCTAGGGCGGagtatatatcatgtgtgatgtgtggtgtgtggggttagggttagggttaNNNNNNNNNNNNtgttgttttattttcttacaggtggtgtgtggggttagggttagggttaNNNNNNNNNNNCCCTCCAAGGATCaagctCCgaTNNNNNNNNNNNNGgtgtgtatatatcatgtAGCCCTAGGGatgtgtggtgtgtggggttagggttagggttaNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNttgtggtgtgtggtgNNNNNAGGGCCCTAGGGCCCTAtgtgtgGCCCTAGGGCtgtgtgGCCCTAGGGCGGagtatatatcatgtgtgatgtgttggggtNNNNNNGgtgtgtatatatcatagggAGGGCCCTAGGGCCCTAtgtgtgGCCCTAGGGCtgtgtgGCCCTAGGGCGGagtatatatcatgtgtgatgtgtggtgtgggtgtgtggggttagggAGGGCCCTAGGGCCCTAtgtgtgGCCCTAGGGCtgtgtgGCCCTAGGGCGGagtatatatcatgtgtgNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
For every substring with length k = 50 (which means there are
length(file)-k+1 substring)
if the A||T||a||t (Upper & Lower case) is >40%,
replace every character in that substring with N or n (preserving
case).
Sample output:
CACTGCTGTCACCCTCCATGCACCTGCCCACCCTCCAAGGATCNNNNNNNCACTGCTGTCACCCTCCATGCACCTGCCCACCCTCCAAGGATCaagctCCgaTNNNNNNNNNNNNGgnnnnnnnnnnnnnnnnnnnNNNNNNNNNNNNNNNNNNNNNNnnnnnnNNNNNNNNNNnnnnnnNNNNNNNNNNNNnnnnnnnnnnnnnnnnnnnnnnnnnnnggttaNNNNNNNNNNNNNNNNNNNNNNNNnnnnnNNnnNNNNNNNNNNNNNNnnnnnnnnnnnNNNNNNNNnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCCCTAGGNNNNNNNGCCCTAGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNnnnnnnNNNNNNNNNNnnnnnnNNNNNNNNNNNNnnnnnnnnnnnnnnnnnnnnnnnnnnnnnNNNNNNNnnnnnnnnnnnnnnnnnnnNNNNNNNNNNNNNNNNNnnnnnnNNNNNNNNNNnnnnnnNNNNNNNNNNNNnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnNNNNNNNNNNNNNNNNNnnnnnnNNNNNNNNNNnnnnnnNNNNNNNNNNNNnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnNNNNNNNNNNNNnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnNNNNNNNNNNNNNNNNNNNNNNNNnnnnnNNnnNNNNNNNNNNNNNNnnnnnnnnnnnnnnnnnNNNNNNNNNNnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNttgtggtgtgtggtgNNNNNAGGNNnnnnnnnnnnnnnnnnnnNnnnnnNNNNNNNNNNNNNNNNNnnnnnnNNNNNNNNNNnnnnnnNNNNNNNNNNNNnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnNNNNNNNNNNNNNNNNnnnnnnNNNNNNNNNNnnnnnnNNNNNNNNNNNNnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnngggttagggNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGaggcatattgatcCCCTCCAAGGATCaagctCCgaTNNNNNNNggttagggttNNNNNGnnnnNNNNNNNNNNNNNNNNNNNNNnnnnnnNNNNNNNNNNnnnnnnNNNNNNNNNNNNnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnNNNNNNNNNNNNnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnNNNNNNNNNNNNNNNNNNNNNNNNnnnnnNNnnNNNNNNNNNNNNNNnnnnnnnnnnnnnnnnnNNNNNNNNNNnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNttgtggtgtgtggtgNNNNNNNNNNNNNNNNNNNNNNnnnnnnNNNNNNNNNNnnnnnnNNNNNNNNNNNNnnnnnnnnnnnnnnnnnnnnnnnnnnnnnNNNNNNNnnnnnnnnnnnnnnnnnnnNNNNNNNNNNNNNNNNNnnnnnnNNNNNNNNNNnnnnnnNNNNNNNNNNNNnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnNNNNNNNNNNNNNNNNNnnnnnnNNNNNNNNNNnnnnnnNNNNNNNNNNNNnnnnnnnnnnnnnnnngNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
I was using AWK in command line for ease, but it just runs extremely slow with string replacement... and consume only <5% CPU somehow
Code: https://repl.it/@hey0wing/DutifulHandsomeMisrac-2
# Method 1
# Copy the first line of chr22.fa to the output file.
cat chr22.fa | head -n1 > chr22.masked.fa
# Two passes over each remaining line: first record the start index of every
# k-long window whose A/T count exceeds r percent of k, then mask those windows.
cat chr22.fa | tail -n+2 | awk -v k=100 -v r=40 '{
printf("chr22.fa: %d\n",length($0))
i = 1;
while (i <= length($0)-k+1) {
x = substr($0, i, k)
if (i == 1) {
# First window: count A/T by running gsub with an identical replacement.
rate = gsub(/A/,"A",x) + gsub(/T/,"T",x) + gsub(/a/,"a",x) + gsub(/t/,"t",x)
} else {
# Later windows: adjust the count for the character that left the window...
prevx = substr($0,i-1,1)
if (prevx == "A" || prevx == "a" || prevx == "T" || prevx == "t")
rate -= 1
# ...and for the character that entered it.
nextx = substr(x,k,1)
if (nextx == "A" || nextx == "a" || nextx == "T" || nextx == "t")
rate += 1
}
if (rate>r*k/100) {
h++
# Despite its name, highGC holds the start index of each A/T-rich window.
highGC[i] = i
}
printf("index-r:%f%% high-AT:%d \r",i/(length($0)-k+1)*100,h)
i += 1
}
printf("index-r:%f%% high-AT:%d\n\n",i/(length($0)-k+1)*100,h)
# Second pass: mask every recorded window inside $0.
for (j in highGC) {
y = highGC[j]
SUB++
# NOTE(review): h is only incremented when a window qualifies, so this
# division has a zero divisor if the loop ever runs with h still unset.
printf("sub-r:%f%% \r",SUB/h*100)
x = substr($0, y, k)
gsub (/[AGCT]/,"N",x)
gsub (/[agct]/,"n",x)
$0 = substr($0,1,y-1) x substr($0,y+k)
}
# NOTE(review): the same division is reached with h unset when no window
# qualified at all -- confirm how the awk in use handles that.
printf("sub-r:%f%%\nsubstituted:%d\n\n",SUB/h*100,SUB)
# The masked line is appended without a trailing newline.
printf("%s",$0) >> "chr22.masked.fa"
}'
# Method 2
# Copy the first line of chr22.fa to the output file.
head -n 1 chr22.fa > chr22.masked2.fa
# Mask, in place, every k-long window whose A/T content exceeds r percent.
tail -n +2 chr22.fa | awk -v k="100" -v r=40 '{
    printf("chr22.fa: %d\n",length($0))
    i = 1;
    h = 0;
    # A window qualifies when its A/T count exceeds r percent of k.
    # The earlier form r/k*100 only gives this value when k is 100.
    limit = r*k/100
    while (i<=length($0)-k+1) {
        x = substr($0, i, k)
        # Count A/T and, as a side effect, turn them into the marker X/x.
        # X/x already written back by an earlier window are counted again,
        # so A/T that were marked before still contribute.
        rate = gsub(/[ATX]/,"X",x) + gsub(/[atx]/,"x",x)
        if (rate > limit) {
            h++
            gsub (/[GC]/,"N",x)
            gsub (/[gc]/,"n",x)
            # Write the marked window back into the line.
            $0 = substr($0,1,i-1) x substr($0,i+k)
        }
        # NOTE(review): 544 is a hard-coded denominator whose origin is not
        # shown in this snippet.
        printf("index-r:%f%% sub-r:%f%% \r",i/(length($0)-k+1)*100,h/544*100)
        i += 1
    }
    # Turn the temporary X/x markers into the final N/n mask.
    gsub (/X/,"N",$0)
    gsub (/x/,"n",$0)
    printf("index-r:%f%% sub-r:%f%% \n",i/(length($0)-k+1)*100,h/544*100)
    # The masked line is appended without a trailing newline.
    printf("%s",$0) >> "chr22.masked2.fa"
}'
# Method 3
# Copy the first line of chr22.fa to the output file.
head -n 1 chr22.fa > chr22.masked3.fa
# Mask every k-long window whose A/T content exceeds r percent, building the
# masked sequence in s instead of rewriting $0 for every hit.
tail -n +2 chr22.fa | awk -v k="100" -v r=40 '{
    printf("chr22.fa: %d\n",length($0))
    i = 1;
    h = 0;
    # A line shorter than k has no window at all and is passed through as is;
    # otherwise s starts empty so nothing leaks in from the previous line.
    s = (length($0) < k ? $0 : "")
    # A window qualifies when its A/T count exceeds r percent of k.
    # The earlier form r/k*100 only gives this value when k is 100.
    limit = r*k/100
    while (i <= length($0)-k+1) {
        x = substr($0, i, k)
        # gsub with an identical replacement is used purely as a counter.
        rate = gsub(/A/,"A",x) + gsub(/T/,"T",x) + gsub(/a/,"a",x) + gsub(/t/,"t",x)
        if (rate > limit) {
            h++
            gsub(/[ACGT]/,"N",x)
            gsub(/[acgt]/,"n",x)
            if (i == 1) {
                s = x
            } else {
                # Drop the last k-1 characters of s and append the whole
                # masked window in their place.
                s = substr(s,1,length(s)-k+1) x
            }
        } else {
            if (i == 1) {
                s = x
            } else {
                # Window not masked: only its newest character extends s.
                s = s substr(x,k,1)
            }
        }
        # NOTE(review): 544 is a hard-coded denominator whose origin is not
        # shown in this snippet.
        printf("index-r:%f%% sub-r:%f%% \r",i/(length($0)-k+1)*100,h/544*100)
        i += 1
    }
    printf("index-r:%f%% sub-r:%f%% \n\n",i/(length($0)-k+1)*100,h/544*100)
    # The masked line is appended without a trailing newline.
    printf("%s",s) >> "chr22.masked3.fa"
}'
The estimated runtime is around 2-3 days ...
Is there a faster algorithm for this problem? If not, is there a language that can perform string replacement faster?
More info:
the AWK command consume ~30% CPU at WSL & GitBash, but only ~5% on windows cmd with an OpenSSH client, where the progress rate is similar

Okay, there's an O(n) solution that involves a sliding window on to your data set. The following algorithm should suffice:
set window to ""
while true:
if window is "":
read k characters into window, exit while if less available
set atCount to number of characters in window matching "AaTt".
if atCount > 40% of k:
for each char in window:
if char uppercase:
output "N"
else:
output "n"
window = ""
else:
if first character of window matches "AaTt":
decrease atCount
remove first character of window
read next character into end of window, exit while if none available
if last character of window matches "AaTt":
increase atCount
What this does is to run a sliding window through your data, at each point testing if the proportion of AaTt characters in that window is more than 40%.
If so, it outputs the desired Nn characters and reloads the next k-sized window.
If it's not over 40%, it removes the first character in the windows and adds the next one to the end, adjusting the count of AaTt characters correctly.
If, at any point, there aren't enough characters left to satisfy a check (k when loading a full window, or 1 when sliding), it exits the loop.

Try some perl:
perl -slpe '
# -s turns the trailing -k=50 and -threshold=0.4 switches into $k and $threshold.
my $len = length;
# Walk the line in consecutive, non-overlapping chunks of $k characters.
for (my $i = 0; $i < $len; $i += $k) {
my $substring = substr($_, $i, $k);
# tr with identical search and replacement lists changes nothing and
# returns how many of the listed characters it found.
my $count = $substring =~ tr/aAtT/aAtT/;
if ($count >= $k * $threshold) {
# Mask the whole chunk, keeping the case of every character.
$substring =~ s/[[:lower:]]/n/g;
$substring =~ s/[[:upper:]]/N/g;
# Write the masked chunk back into the current line.
substr($_, $i, $k) = $substring;
}
}
' -- -k=50 -threshold=0.4 file

How to make an array of alphabets from a file and update in a new file

I have a single column file.
A
A
A
B
B
B
C
C
D
I want to use this file and want to make a new one as below
command="A" "B" "C" "D"
TYPE=1 1 1 2 2 2 3 3 4,
These A B C D are random alphabets and varies file to file.
I tried to solve the problem with the shell script below
#!/bin/bash
# Attempt from the question: split RLP.txt into one file per line and
# paste the pieces back together after a label.
# Number of lines in RLP.txt.
NQ=$(cat RLP.txt | wc -l)
# Seed the two label files.
ELEMENT='element='
echo "$ELEMENT" > element.txt
TYPE='types='
echo "$TYPE" > types.txt
# For every line number i, write line i of RLP.txt to el.i.txt.
for i in `seq 1 1 $NQ`
do
# $i is a single line already, so this just copies it into RLP.
RLP=$(echo "$i" | tail -n 1)
cat RLP.txt | head -n "$RLP" | tail -n 1 > el.$RLP.txt
done
# Join the label and all per-line files side by side.
# NOTE(review): el.*.txt expands in glob sort order, not numeric order,
# so el.10.txt comes before el.2.txt once there are ten or more lines.
paste element.txt el.*.txt
# Only prints the label; no type numbers are computed anywhere.
paste types.txt
The output of paste element.txt el.*.txt is element= A A A B B B C C D
I could not remove the repeated alphabets and put the remaining alphabets in "".
and could not move forward with the second command to get
TYPE=1 1 1 2 2 2 3 3 4,
which represents that the 1st alphabets repeated three times, 2nd alphabets repeated three times, 3rd alphabets repeated two times and so on..

$ cat tst.awk
# Collect the distinct first-column values (quoted, in first-seen order)
# and, for every input line, the number of distinct values seen so far.
{
    # A value not met before extends the quoted name list.
    if (seen[$1]++ == 0) {
        names = names glue "\"" $1 "\""
        distinct++
    }

    ranks = ranks glue distinct

    # Empty for the very first record only; OFS from then on.
    glue = OFS
}

END {
    print "command=" names
    print "TYPE=" ranks ","
}
$ awk -f tst.awk file
command="A" "B" "C" "D"
TYPE=1 1 1 2 2 2 3 3 4,

Instead of using multiple text processing tools in a pipeline, this can be achieved by one awk command as below
awk '
# Remember each distinct line once, in the order it first appears.
# (Iterating with "for (i in array)" would visit them in unspecified order.)
!($0 in seen) {
    seen[$0]
    names[++n_names] = $0
}
# Start a new group number whenever the line differs from the previous one.
# Both sides are forced to strings and compared literally: using the line
# as a regex would misfire on substrings and on regex metacharacters.
NR == 1 || ($0 "") != (prev "") {
    idx++
}
{
    prev = $0
    alpha[NR] = idx
}
END {
    # command="A" "B" ... in first-seen order.
    str = ""
    for (i = 1; i <= n_names; i++) {
        str = str (i > 1 ? " " : "") "\"" names[i] "\""
    }
    first = "command=" str

    # TYPE=<group number of every input line>,
    str = ""
    for (i = 1; i <= NR; i++) {
        str = str (i > 1 ? " " : "") alpha[i]
    }
    second = "TYPE=" str ","

    print(first "\n" second) > "types.txt"
    close("types.txt")
}' RLP.txt
The command works as follows
Each unique line in the file is saved as an index into the array unique
The array alpha keeps track of the unique value counter, i.e. every time a value in the file changes, the counter is incremented at the corresponding line number NR
The END block is all about constructing the output from the array to a string value and writing the result to the new file "types.txt"

Pure bash implementation. Requires at least Bash version 4 for the associative array
#!/bin/bash
# Summarise the distinct lines of "$infile" and append two lines to "$outfile":
#   command="A" "B" ...   every distinct line, quoted, in first-seen order
#   TYPE=1 1 2 ...,       the group number of each distinct line, repeated
#                         once per occurrence
# Requires Bash 4 or newer for associative arrays.
outfile="./RLP.txt"
infile="./infile"

#######################################
# Read $infile, count its distinct lines and append the summary to $outfile.
# Globals:   infile (read), outfile (read)
# Outputs:   appends two lines to $outfile; diagnostics go to stderr
# Returns:   0 on success, 1 when $infile cannot be read
#######################################
main() {
  local IFS=' '        # "${array[*]}" below joins with a single space
  local line key count i
  local -i k=0
  local -A map=()      # line text -> number of occurrences
  local -a order=()    # distinct lines in first-seen order; associative
                       # arrays do not preserve insertion order
  local -a quoted=() types=()

  if [[ ! -r "$infile" ]]; then
    printf 'cannot read %s\n' "$infile" >&2
    return 1
  fi

  # IFS= and -r keep leading blanks and backslashes intact; the extra test
  # picks up a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    [[ -n "$line" ]] || continue   # an empty string is not a valid key
    if [[ -z "${map["$line"]+x}" ]]; then
      order+=("$line")
      map["$line"]=0
    fi
    count=${map["$line"]}
    map["$line"]=$(( count + 1 ))
  done < "$infile"

  for key in "${order[@]}"; do
    k=$(( k + 1 ))
    quoted+=("\"$key\"")
    count=${map["$key"]}
    for (( i = 0; i < count; i++ )); do
      types+=("$k")
    done
  done

  {
    printf 'command=%s\n' "${quoted[*]}"
    printf 'TYPE=%s,\n' "${types[*]}"
  } >> "$outfile"
}

main "$@"

Match specific pattern and print just the matched string in the previous line

I update the question with additional information
I have a .fastq file formatted in the following way
#M01790:39:000000000-C3C6P:1:1101:14141:1618 1:N:0:8 (sequence name)
CATCTACATATTCACATATAGACATGAAACACCTGTGGTTCTTCCTC.. (sequence)
+
ACCCGGGGGGGGGDGGGFGGGGGGFGGGGGGGGGGGFGGGGFGFGFF.. (sequence quality)
For each sequence the format is the same (repetition of 4 lines)
What I am trying to do is searching for a specific regex pattern ([A-Z]{5,}ACA[A-Z]{5,}ACA[A-Z]{5,})in a window of n=35 characters of the 2nd line, cut it if found and report it at the end of the previous line.
So far I've written a bunch of code that does almost what I want. I thought of using the match function together with substr on my window of interest, but I didn't achieve my goal. I report the script.awk below:
# Attempt from the question: when the first 35 characters of a line match,
# print the previous line with the matched text appended, then the current
# line and the two lines after it.
# NOTE(review): the three-argument form of match() is a gawk extension.
match(substr($0,0,35),/regexp/,a) {
print p,a[0] # previous line followed by the matched text
print # the current (matching) line
for(i=0;i<=1;i++) { # read and print the 2 following lines
getline
print
}
}# remember every line so the next record can refer to it as "previous"
{ p = $0 }
Starting from a file like this:
#M01790:39:000000000-C3C6P:1:1101:14141:1618 1:N:0:8
AACATCTACATATTCACATATAGACATGAAACACCTGTGGTTCTTCCTC..
+
GGGGGGGGDGGGFGGGGGGFGGGGGGGGGGGFGGGGFGFGFFGGGGFGF..
I would like to obtain an output like this:
#M01790:39:000000000-C3C6P:1:1101:14141:1618 1:N:0:8 TATTCACATATAGACATGAAA #is the string that matched the regexp WITHOUT initial AA that doesn' match my expression
ATATTCACATATAGACATGAAACACCTGTGGTTCTTCCTC #without initial AA
+
GGGFGGGGGGFGGGGGGGGGGGFGGGGFGFGFFGGGGFGF # without "GGGGGGGGDGGGFGGGGGGFGGG" that is the same number of characters removed in the 2nd line

$ cat tst.awk
# Buffer the input in groups of four lines. When the literal target string
# occurs inside the leading window of line 2, append it to line 1 and cut
# everything up to and including it from lines 2 and 4.
BEGIN {
    needle    = "pattern"
    needleLen = length(needle)
    window    = 35
    groupSize = 4
}

{
    # Slot of this line inside its group, 1..groupSize.
    slot = (NR - 1) % groupSize + 1
    buf[slot] = $0

    # Nothing more to do until the group is complete.
    if (slot != groupSize)
        next

    hitAt = index(substr(buf[2], 1, window), needle)
    if (hitAt) {
        keepFrom = hitAt + needleLen
        buf[1] = buf[1] " " needle
        buf[2] = substr(buf[2], keepFrom)
        buf[4] = substr(buf[4], keepFrom)
    }

    for (n = 1; n <= groupSize; n++)
        print buf[n]
}
$ awk -f tst.awk file
#M01790:39:000000000-C3C6P:1:1101:14141:1618 1:N:0:8 pattern
ATATTCACATATAGACATGAAACACCTGTGGTTCTTCCTC..
+
GGGFGGGGGGFGGGGGGGGGGGFGGGGFGFGFFGGGGFGF..
wrt the code you posted:
substr($0,0,35) - strings, fields, line numbers, and arrays in awk start at 1 not 0 so that should be substr($0,1,35). Awk will compensate for your mistake and treat it as if you had written 1 instead of 0 in this case but get used to starting everything at 1 to avoid mistakes when it matters.
for(i=0;i<=1;i++) - should be for(i=1;i<=2;i++) for the same reason.
getline - not an appropriate use and syntactically fragile, see for(i=0;i<=1;i++)
Update - per your comment below that pattern is actually a regexp rather than a string:
$ cat tst.awk
# Same four-line buffering as the string version, but the target is a
# regular expression and the text that actually matched is what gets
# appended to line 1.
BEGIN {
tgtRegexp = "[A-Z]{5,}ACA[A-Z]{5,}ACA[A-Z]{5,}"
winLgth = 35
numLines = 4
}
{
# Position of this line inside its group of numLines lines (1-based).
lineNr = ( (NR-1) % numLines ) + 1
rec[lineNr] = $0
}
# Runs once the last line of a group has been stored.
lineNr == numLines {
# Only the first winLgth characters of line 2 are searched; match() sets
# RSTART and RLENGTH to the position and length of the matched text.
if ( match(substr(rec[2],1,winLgth),tgtRegexp) ) {
rec[1] = rec[1] " " substr(rec[2],RSTART,RLENGTH)
# Keep only what follows the match in line 2, and drop the same number
# of leading characters from line 4.
rec[2] = substr(rec[2],RSTART+RLENGTH)
rec[4] = substr(rec[4],RSTART+RLENGTH)
}
for ( lineNr=1; lineNr<=numLines; lineNr++ ) {
print rec[lineNr]
}
}

I warn you, I wanted to have some fun and it is twisted.
awk -v pattern=pattern -v window=15 '
# Each record is one 4-line block: RS is the character that starts a header
# line, and every line of the block is a field ($1..$4).
BEGIN { RS = "#"; FS = OFS = "\n" }
{
    pos = match($2, pattern)
    # Use the length of the text that actually matched (RLENGTH), not the
    # length of the pattern source, so regex patterns work as well as
    # literal strings. n_del is the index of the first character to keep.
    n_del = pos + RLENGTH
}
pos && (n_del <= window) {
    # Capture the matched text before $2 is shortened.
    hit = substr($2, RSTART, RLENGTH)
    $1 = $1 " " hit
    $2 = substr($2, n_del)
    $4 = substr($4, n_del)
}
# The input starts with RS, so record 1 is empty and is skipped; RS is
# re-emitted in front of every real record.
NR != 1 { printf "%s%s", RS, $0 }
' file
Input :
#M01790:39:000000000-C3C6P:1:1101:14141:1618 1:N:0:8
CATCTACpatternATATTCACATATAGACATGAAACACCTGTGGTTCTTCCTC..
+
ACCCGGGGGGGGGDGGGFGGGGGGFGGGGGGGGGGGFGGGGFGFGFFGGGGFGF..
#M01790:39:000000000-C3C6P:1:1101:14141:1618 1:N:0:8
CATCTACGCpatternATATTCACATATAGACATGAAACACCTGTGGTTCTTCCTC..
+
ACCCGGGGDGGGGGGDGGGFGGGGGGFGGGGGGGGGGGFGGGGFGFGFFGGGGFGF..
Output :
#M01790:39:000000000-C3C6P:1:1101:14141:1618 1:N:0:8 pattern
ATATTCACATATAGACATGAAACACCTGTGGTTCTTCCTC..
+
GGGFGGGGGGFGGGGGGGGGGGFGGGGFGFGFFGGGGFGF..
#M01790:39:000000000-C3C6P:1:1101:14141:1618 1:N:0:8
CATCTACGCpatternATATTCACATATAGACATGAAACACCTGTGGTTCTTCCTC..
+
ACCCGGGGDGGGGGGDGGGFGGGGGGFGGGGGGGGGGGFGGGGFGFGFFGGGGFGF..
Second block is not updated because window is 15 and it cannot find the pattern within this window.
I used the variable RS to treat each entire 4-line block as one record, with $0, $1, $2, $3 and $4. Because the input file starts with RS and does not end with RS, I preferred not to set ORS and to use printf instead of print.

awk | Rearrange fields of CSV file on the basis of column value

I need your help in writing an awk script for the problem below. I have one source file and the required output for it.
Source File
a:5,b:1,c:2,session:4,e:8
b:3,a:11,c:5,e:9,session:3,c:3
Output File
session:4,a=5,b=1,c=2
session:3,a=11,b=3,c=5|3
Notes:
Fields are not organised in source file
In Output file: fields are organised in their specific format, for example: all a values are in 2nd column and then b and then c
For value c, in second line, its coming as n number of times, so in output its merged with PIPE symbol.
Please help.

Will work in any modern awk:
$ cat file
a:5,b:1,c:2,session:4,e:8
a:5,c:2,session:4,e:8
b:3,a:11,c:5,e:9,session:3,c:3
$ cat tst.awk
# Print selected name/value pairs of every line in a fixed order:
# "session:<v>" first, then ",name=<v>" for each remaining name. Values of
# a repeated name are joined with "|"; a missing name yields an empty value.
BEGIN {
    FS = "[,:]"
    nNames = split("session,a,b,c", order, ",")
}

{
    # Forget the values collected for the previous line.
    split("", val)

    # Fields alternate name, value, name, value, ...
    for (f = 1; f < NF; f += 2) {
        key = $f
        if (val[key] == "")
            val[key] = $(f + 1)
        else
            val[key] = val[key] "|" $(f + 1)
    }

    line = ""
    for (n = 1; n <= nNames; n++) {
        name = order[n]
        if (n == 1)
            line = line name ":" val[name]
        else
            line = line "," name "=" val[name]
    }
    print line
}
$ awk -f tst.awk file
session:4,a=5,b=1,c=2
session:4,a=5,b=,c=2
session:3,a=11,b=3,c=5|3
If you actually want the e values printed, unlike your posted desired output, just add ,e to the string in the split() in the BEGIN section wherever you'd like those values to appear in the ordered output.
Note that when b was missing from the input on line 2 above, it output a null value as you said you wanted.

Try with:
awk '
# NOTE(review): asorti() is a gawk extension, so this needs GNU awk.
BEGIN {
# Split on both commas and colons so names and values alternate as fields.
FS = "[,:]"
OFS = ","
}
{
# Odd fields are names; the field after each one is its value.
for ( i = 1; i <= NF; i+= 2 ) {
# The session pair is printed as soon as it is met, so it comes out ahead
# of the other pairs, which are only printed after this loop.
if ( $i == "session" ) { printf "%s:%s", $i, $(i+1); continue }
# Collect the values of a repeated name, joined with "|".
hash[$i] = hash[$i] (hash[$i] ? "|" : "") $(i+1)
}
# hash_orig receives the names stored in hash, in sorted order.
asorti( hash, hash_orig )
for ( i = 1; i <= length(hash); i++ ) {
printf ",%s:%s", hash_orig[i], hash[ hash_orig[i] ]
}
printf "\n"
# Start the next line with empty tables.
delete hash
delete hash_orig
}
' infile
That splits each line on any comma or colon and traverses all odd fields, saving each name and its value(s) in a hash to print at the end. It yields:
session:4,a:5,b:1,c:2,e:8
session:3,a:11,b:3,c:5|3,e:9

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Writing multiple awk programs in a single file [closed] - awk

Related

Writing from lines into columns based on third column

Faster algorithm/language for string replacement [closed]

How to make an array of alphabets from a file and update in a new file

Match specific pattern and print just the matched string in the previous line

awk | Rearrange fields of CSV file on the basis of column value

Categories

Resources