awk replacement using specific values in two columns - awk

I have a file that looks like this:
20:60479_C_T 60479 C T 0 0 0 0 0 1 0 1
20:60522_T_TC 60522 T TC 0 0 0 0 0 0 0
20:60568_A_C 60568 A C 0 0 1 0 0 1
20:60571_C_A 60571 C A 0 1 0 1 0 0
20:60579_G_A 60579 G A 0 0 1 0 0 0
My current file is bigger with 3 million rows and 3,000 columns. I want to use the values in columns $3 and $4 to replace 0 and 1 in the rest of columns. The desired output would be:
20:60479_C_T 60479 C T C C C C C T C T
20:60522_T_TC 60522 T TC T T T T T T T
20:60568_A_C 60568 A C A A C A A C
20:60571_C_A 60571 C A C A C A C C
20:60579_G_A 60579 G A G G A G G G
I know how to do it for a couple of columns:
awk '{d["0"]=$3; d["1"]=$4; print "20", $1, "0", $2, d[$5], d[$6];}' myfile
But I don't know how to do it automatically for all the columns and avoid adding all the columns manually

$ awk '{d[0]=$3; d[1]=$4; for (i=5; i<=NF; i++) $i=d[$i]} 1' file
20:60479_C_T 60479 C T C C C C C T C T
20:60522_T_TC 60522 T TC T T T T T T T
20:60568_A_C 60568 A C A A C A A C
20:60571_C_A 60571 C A C A C A C C
20:60579_G_A 60579 G A G G A G G G

Since you have a variable number of columns, you can probably get away with something like:
awk <testprog.in '{for (i = 5; i <= NF; i++){$i = $($i+3)}print}'
The "magic" here is the assigning of $($i+3) to $i for all values of i between 5 and the field count (inclusive).
The expression $i+3 will turn 0 and 1 into 3 and 4 respectively and so the next step will be evaluating $3 or $4 (the C and T in the first line for example) and using that to replace the item.
The output for your small test case is, as expected:
20:60479_C_T 60479 C T C C C C C T C T
20:60522_T_TC 60522 T TC T T T T T T T
20:60568_A_C 60568 A C A A C A A C
20:60571_C_A 60571 C A C A C A C C
20:60579_G_A 60579 G A G G A G G G
You will, of course, need to check the performance of this with your larger data sets. On my box, a three-million-line file with 3000 entries each takes about half an hour.
Compare that with a C program (although admittedly quick'n'dirty, heavily tied to your specific input data, without what I would generally consider necessary error checking) which only takes about ten minutes.
For completeness, here's the C variant which, assuming it's called prog.c, you can compile with something like gcc -o prog prog.c and run with something like ./prog <testprog.in:
#include <stdio.h>
#include <ctype.h>
static char buff[102040];
/*
 * Locate the next whitespace-delimited token in a NUL-terminated string.
 *
 * buff: position to start scanning from; leading whitespace is skipped.
 *       A NULL pointer is treated like an exhausted string.
 * pSz:  receives the token length in bytes. It is set to 0 whenever no
 *       token is found, so callers never read an indeterminate value.
 *
 * Returns a pointer to the first byte of the token inside buff, or NULL if
 * nothing but whitespace remains. The token is NOT NUL-terminated; use *pSz
 * (e.g. with printf's "%.*s") to bound it.
 */
static char *getStr(char *buff, int *pSz) {
    *pSz = 0;
    if (buff == NULL) {
        return NULL;
    }

    /* <ctype.h> functions require a value representable as unsigned char;
     * passing a plain (possibly negative) char is undefined behaviour. */
    char *tok = buff;
    while (*tok != '\0' && isspace((unsigned char)*tok)) {
        tok++;
    }
    if (*tok == '\0') {
        return NULL;
    }

    while (tok[*pSz] != '\0' && !isspace((unsigned char)tok[*pSz])) {
        (*pSz)++;
    }
    return tok;
}
int main(void) {
char *str, *str3, *str4; int sz, sz3, sz4;
while (fgets(buff, sizeof(buff), stdin) != NULL) {
str = getStr(buff, &sz); printf("%*.*s", sz, sz, str);
str = getStr(str + sz, &sz); printf(" %*.*s", sz, sz, str);
str3 = getStr(str + sz, &sz3); printf(" %*.*s", sz3, sz3, str3);
str4 = getStr(str3 + sz3, &sz4); printf(" %*.*s", sz4, sz4, str4);
str = getStr(str4 + sz4, &sz);
while (str != NULL) {
if (*str == '0') {
printf(" %*.*s", sz3, sz3, str3);
} else {
printf(" %*.*s", sz4, sz4, str4);
}
str = getStr(str + sz, &sz);
}
printf("\n");
}
return 0;
}

Using gsub in awk you could try this as an option:
$ awk '{d[1]=$1;d[2]=$2;gsub(/0/,$3);gsub(/1/,$4);$1=d[1];$2=d[2];}1' myfile
20:60479_C_T 60479 C T C C C C C T C T
20:60522_T_TC 60522 T TC T T T T T T T
20:60568_A_C 60568 A C A A C A A C
20:60571_C_A 60571 C A C A C A C C
20:60579_G_A 60579 G A G G A G G G

Related

add filename without the extension at certain columns using awk

I would like to leave empty first four columns, then I want to add filename without extension in the last 4 columns. I have files as file.frq and goes on. Later I will apply this to the 200 files in loop.
input
CHR POS REF ALT AF HOM Het Number of animals
1 94980034 C T 0 0 0 5
1 94980057 C T 0 0 0 5
Desired output
file file file file
CHR POS REF ALT AF HOM Het Number of animals
1 94980034 C T 0 0 0 5
1 94980057 C T 0 0 0 5
I tried this from Add file name and empty column to existing file in awk
awk '{$0=(NR==1? " \t"" \t"" \t"" \t":FILENAME"\t") "\t" $0}7' file2.frq
But it gave me this:
CHR POS REF ALT AF HOM Het Number of animals
file2.frq 1 94980034 C T 0 0 0 5
file2.frq 1 94980057 C T 0 0 0 5
file2.frq 1 94980062 G C 0 0 0 5
and I also tried this
awk -v OFS="\t" '{print FILENAME, $1=" ",$2=" ",$3=" ", $4=" ",$5 - end}' file2.frq
but it gave me this
CHR POS REF ALT AF HOM Het Number of animals
file2.frq 1 94980034 C T 0 0 0 5
file2.frq 1 94980057 C T 0 0 0 5
any help will be appreciated!
Assuming your input is tab-separated like your desired output:
awk '
BEGIN { FS=OFS="\t" }
NR==1 {
orig = $0
fname = FILENAME
sub(/\.[^.]*$/,"",fname)
$1=$2=$3=$4 = ""
$5=$6=$7=$8 = fname
print
$0 = orig
}
1' file.txt
file file file file
CHR POS REF ALT AF HOM Het Number of animals
1 94980034 C T 0 0 0 5
1 94980057 C T 0 0 0 5
To see it in table format:
$ awk '
BEGIN { FS=OFS="\t" }
NR==1 {
orig = $0
fname = FILENAME
sub(/\.[^.]*$/,"",fname)
$1=$2=$3=$4 = ""
$5=$6=$7=$8 = fname
print
$0 = orig
}
1' file.txt | column -s$'\t' -t
file file file file
CHR POS REF ALT AF HOM Het Number of animals
1 94980034 C T 0 0 0 5
1 94980057 C T 0 0 0 5

Determine type of triangle

I'm trying the HackerRank "Type of Triangle" challenge below where, based on the 3 sides, it has to be determined whether the triangle is equilateral, isosceles, scalene, or not a triangle.
https://www.hackerrank.com/challenges/what-type-of-triangle/problem
I'm not sure why the code below isn't passing the test case. Unfortunately, I can't download the test case to see why it isn't working.
SELECT CASE WHEN A = B AND B = C AND A = C THEN 'Equilateral'
WHEN (A = B AND B != C AND A != C) OR (B = C AND A != B AND A != C) OR (A = C AND A != B AND B != C) THEN 'Isosceles'
WHEN ((A + B) < C) OR ((B + C) < A) OR ((C + A) < B) THEN 'Not a triangle'
ELSE 'Scalene' END
FROM Triangles
Try something like this:
SELECT
CASE
WHEN A + B > C AND A + C > B AND B + C > A THEN
CASE
WHEN A = B AND B = C THEN 'Equilateral'
WHEN A = B OR B = C OR A = C THEN 'Isosceles'
ELSE 'Scalene' END
ELSE 'Not A Triangle' END
FROM TRIANGLES
Only test for the type of triangle when it is a triangle.
Try this :
SELECT CASE WHEN A + B > C AND A+C>B AND B+C>A THEN
CASE WHEN A = B AND B = C THEN 'Equilateral'
WHEN A = B OR B = C OR A = C THEN 'Isosceles'
WHEN A != B OR B != C OR A != C THEN 'Scalene'
END
ELSE 'Not A Triangle' END FROM TRIANGLES;
Your original query only reports 'Not a triangle' when Side 1 + Side 2 < Side 3, so the degenerate case Side 1 + Side 2 = Side 3 falls through to 'Scalene'. The check must be Side 1 + Side 2 <= Side 3.

awk merge columns from multiple files, append different values and remove same values

I have two files:
try7.txt
a 32145
b eioue
c 32654895
d bdefgac
e kkloi
f 6549465
g test123452
h est0124358
try8.txt
a 32145562
b eioueddf
c 32654
d bdefgac
e kkloi
f 6549465dww
g test123
h est0124358df
i 63574968fd
j dfsdfcd5
desired output:
a 32145562 32145
b eioueddf eioue
c 32654 32654895
d bdefgac 0
e kkloi 0
f 6549465dww 6549465
g test123 test123452
h est0124358df est0124358
i 63574968fd 0
j dfsdfcd5 0
actual output:
a 32145562 32145
b eioueddf eioue
c 32654 32654895
d bdefgac bdefgac
e kkloi kkloi
f 6549465dww 6549465
g test123 test123452
h est0124358df est0124358
i 63574968fd 0
j dfsdfcd5 0
The codes I found:
awk 'NR==FNR{a[$1]=$2;next}
{if($1 in a){print $0,a[$1];delete a[$1]}
else print $0,"0"}
END{for(x in a)print x,"0",a[x]}' try7.txt try8.txt|sort -n|column -t
How do I modify these codes to meet my requirement?
Bit lengthy
awk 'FNR==NR{a[$1]=$2; next}
($1 in a) && a[$1] != $2{print $1,$2,a[$1]}
($1 in a) && a[$1] == $2 {print $1, $2,"0"}
!($1 in a ){print $1, $2, 0}' try7 try8
Will give an output as
a 32145562 32145
b eioueddf eioue
c 32654 32654895
d bdefgac 0
e kkloi 0
f 6549465dww 6549465
g test123 test123452
h est0124358df est0124358
i 63574968fd 0
j dfsdfcd5 0

Insert characters between strings in vb.net

I have a string which contains
A F C A D A B A F G H A B C D A E A X B
I want to append a tag before and after A B like [A B]
A F C A D [A B] A F G H [A B] C D A E [A X B] J H A
Working in vb.net windows form
This is definitely not the most elegant solution but I believe it works:
// Wraps every run that starts at an 'A' and ends at the next 'B' (with no
// other 'A' in between) in square brackets, e.g. "A X B" -> "[A X B]".
//
// sbOutput accumulates the finished text; sbTemp holds the candidate run that
// began at the most recent 'A' and has not yet reached a 'B'.
String strInput = "A F C A D A B A F G H A B C D A E A X B J H A A C D A E X B";
StringBuilder sbOutput = new StringBuilder();
StringBuilder sbTemp = new StringBuilder();
foreach (Char ch in strInput)
{
    if (ch.Equals('A'))
    {
        if (sbTemp.Length > 0)
        {
            // A second 'A' arrived before any 'B': the pending run is plain
            // text, so emit it as-is and start a new candidate run here.
            sbOutput.Append(sbTemp.ToString());
            sbTemp = new StringBuilder().Append(ch);
        }
        else
        {
            sbTemp.Append(ch);
        }
    }
    else
    {
        if (sbTemp.Length > 0)
        {
            sbTemp.Append(ch);
            if (ch.Equals('B'))
            {
                // The run is complete: emit it bracketed and reset.
                sbOutput.Append(String.Format("[{0}]", sbTemp.ToString()));
                sbTemp = new StringBuilder();
            }
        }
        else
        {
            sbOutput.Append(ch);
        }
    }
}
// Flush a trailing "A ..." run that never reached a 'B'; without this the
// tail of the input would be silently dropped from the output.
sbOutput.Append(sbTemp.ToString());
str = str.Replace("A B", "[A B]")
A hard coded replace of "A B" to "[A B]" won't work if you have "A X B", So i would suggest a possible solution, which is regex:
Try these two patterns:
' Assumes inputString does not already contain "[" characters.
' Step 1: put "[" before every "A" that is followed by a "B" with no other "A" in between.
Dim pattern1 As String = "A(?=[^A]*B)"
Dim str As String = Regex.Replace(inputString, pattern1, "[A")
' Step 2: put "]" after the first "B" that follows each "[A" opened in step 1.
' (.NET allows the variable-length lookbehind used here.)
Dim pattern2 As String = "(?<=\[A[^AB]*)B"
str = Regex.Replace(str, pattern2, "B]")
Debug.Print(str)
Hope it helps!

You have four numbers, how do you figure out which one is greatest?

Is there a very simple algorithm to figure out which of 4 numbers is the greatest?
var lst = new List<int>() { 1, 7, 3, 4 };
var max = lst.Max();
I got no VB, but you get the idea.
If they are in an array, something like this should work:
VB:
Dim ar As Integer() = {3, 6, 9, 12}
Dim largest As Integer = ar(0)
For i As Integer = 1 To ar.Length - 1
If ar(i) > largest Then
largest = ar(i)
End If
Next
C#:
int[] ar = {3, 6, 9, 12};
int largest = ar[0];
for(int i = 1; i < ar.Length; i++) {
if(ar[i] > largest) {
largest = ar[i];
}
}
If you're using a language that supports some sort of max function or array sorting definitely use those features. Or choose any of the other sane answers in this thread. However, just for fun:
maximum = (var1 > var2 ? var1 : var2) > (var3 > var 4 ? var3 : var 4) ?
(var1 > var2 ? var1 : var2) :
(var3 > var 4 ? var3 : var 4);
Put the numbers into an array, sort the array, then select the one whose index is array length -1.
Or you could put the numbers into an array, sort the array, reverse the array, and then select index 0.
If you need to write your own sorting algorithm, the simplest one to implement is likely to be the bubble sort.
With VB.Net you could do the following, and it will work for any number of numbers:
''' <summary>
''' Returns the largest of the supplied integers.
''' </summary>
''' <param name="items">One or more integers, passed individually or as an array.</param>
''' <returns>The greatest value in <paramref name="items"/>.</returns>
''' <exception cref="ArgumentException">
''' Thrown when no numbers are supplied (an empty argument list or a Nothing array).
''' </exception>
Public Function Max(ParamArray items As Integer()) As Integer
    ' A caller can pass Nothing explicitly for a ParamArray, so guard against
    ' that as well as the empty case before delegating to LINQ's Max().
    If items Is Nothing OrElse items.Length = 0 Then
        Throw New ArgumentException("need at least 1 number")
    End If
    Return items.Max()
End Function
Then you can now do
Max(1,2,3,4)
There are plenty of ways you could do this.
A really naive approach would be:
#Pseudocode
If number1 > number2 and number1 > number3 and number1 > number4: return number1
Else if number2 > number3 and number2 > number4: return number2
Else if number3 > number4: return number3
Else: return number4
It's more practical to use arrays but if you're starting that could be more complicated than simple if blocks.
If they're in an array - and doing it explicitly rather than using sort:
int max = int.MinValue; // i.e. the "largest" negative number
int largest = -1;
for (int index = 0; index < array.Length; index++)
{
if (array[index] > max)
{
max = array[index];
largest = index;
}
}
The greatest value will be in max and its index in largest.
Nate's answer is more efficient as it uses the first element of the array as the initial value. So the first three lines of my solution would become:
int max = array[0];
int largest = 0;
for (int index = 0; index < array.Length; index++)
Get A, B, C, D from user;
largest = A;
if B > largest then
Largest = B;
if C > largest then
Largest = C;
if D > largest then
largest = D;
Print largest
In Java, if a is an int[4]:
Math.max(Math.max(a[0], a[1]), Math.max(a[2], a[3]))
Dim MyValues As New List(Of Double)
Dim MaxValue As Double
Dim tValue As Double
MyValues.Add(12.58)
MyValues.Add(3.58)
MyValues.Add(518.6)
MyValues.Add(244)
MyValues.Add(31.25)
For Each tValue In MyValues
If MaxValue < tValue Then
MaxValue = tValue
End If
Next
MsgBox(MaxValue)
My first question would be why? Second would be, if it's only four numbers then it really doesn't matter. Whatever takes your fancy. I personally would go with the fewest lines of code. Which would be to use the built in array.Sort method, then take the last item.
I would also consider using LINQ, just because you can.
Or Math.Max in a nasty nested way so Math.Max(Number1,Math.Max(Number2,Math.Max(Number3,Number4))))
If there could be hundreds of numbers, then I would try and pick a better algorithm. Probably the one suggested by @ChrisF, although it would depend on where the numbers are coming from, e.g. a database could find the max much easier, or if the numbers are being read from somewhere sequentially then you could store the max as you read the numbers.
This is my own analysis. I wrote this code to display the lowest and highest of the 4 numbers entered in the textboxes; it shows the lowest and highest in their assigned labels. If you enter the same lowest or highest number more than once, a MsgBox appears to notify you that you entered the same highest or lowest number, and the value is still displayed in its appropriate label. I used labels to display the lowest and highest. Here's my fb: iver saladaga anoos, 2nd-year student of JHCSC Tambulig, Zamboanga del Sur, Philippines.
so here it is! it worked for me. im using vb6 enterprise edition. :)
' Reads the four numbers typed into Text1..Text4, shows the lowest in Label9
' and the highest in Label10, and enables Command2.
'
' If the lowest (or highest) value was entered more than once, a message box
' reports the duplicate; the value is still shown in its label.
'
' Each value is declared As Long individually. In VB6 "Dim A, B, C, D As Long"
' types only D as Long and leaves the others as Variants, which would compare
' the textbox strings textually (so "10" would sort before "9").
Private Sub Command1_Click()
    Dim texts(1 To 4) As String
    Dim values(1 To 4) As Long
    Dim i As Integer
    Dim lowest As Long
    Dim highest As Long
    Dim lowCount As Integer
    Dim highCount As Integer

    texts(1) = Text1.Text
    texts(2) = Text2.Text
    texts(3) = Text3.Text
    texts(4) = Text4.Text

    ' Reject empty or non-numeric boxes up front instead of failing with an
    ' unhandled type-mismatch error during conversion.
    For i = 1 To 4
        If Not IsNumeric(texts(i)) Then
            MsgBox "Please enter a number in all four boxes."
            Exit Sub
        End If
        values(i) = CLng(texts(i))
    Next i

    ' Single pass to find both extremes.
    lowest = values(1)
    highest = values(1)
    For i = 2 To 4
        If values(i) < lowest Then lowest = values(i)
        If values(i) > highest Then highest = values(i)
    Next i

    ' Count how many inputs share each extreme so ties can be reported.
    For i = 1 To 4
        If values(i) = lowest Then lowCount = lowCount + 1
        If values(i) = highest Then highCount = highCount + 1
    Next i

    Label9.Caption = lowest
    Label10.Caption = highest
    Command2.Enabled = True

    ' Use & for concatenation: + on a string and a number attempts addition.
    If highCount > 1 Then
        MsgBox "Same highest numbers (" & highest & ") is inputted."
    End If
    If lowCount > 1 Then
        MsgBox "Same lowest numbers (" & lowest & ") is inputted."
    End If
End Sub
' Resets the form: empties the four input boxes and both result labels,
' disables Command1 and Command2, and moves the focus to Text1.
Private Sub Command2_Click()
    ' Assign an explicit empty string rather than an undeclared identifier,
    ' which only works as an implicit Empty Variant and fails to compile
    ' under Option Explicit.
    Text1.Text = ""
    Text2.Text = ""
    Text3.Text = ""
    Text4.Text = ""
    Label9.Caption = ""
    Label10.Caption = ""
    Command2.Enabled = False
    ' Disabled after the textboxes are cleared because Text1_Change
    ' re-enables Command1 whenever Text1's content changes.
    Command1.Enabled = False
    Text1.SetFocus
End Sub
' Click handler for Command3: terminates the program immediately
' via the End statement.
Private Sub Command3_Click()
End
End Sub
' Form load handler: starts with both Command2 and Command1 disabled.
' Command1 is re-enabled by Text1_Change; Command2 by Command1_Click.
Private Sub Form_Load()
Command2.Enabled = False
Command1.Enabled = False
End Sub
' Change handler for Text1: enables Command1 as soon as Text1's content changes.
' NOTE(review): only Text1 is watched here; Text2..Text4 may still be empty
' when Command1 becomes clickable.
Private Sub Text1_Change()
Command1.Enabled = True
End Sub