I'm trying to parse two csv files that contains thousands of rows. The data is to be matched and appended based solely on the data in the first column. I currently have it parsing the files and outputting to 3 files:
1 - key matched
2 - file1 only
3 - file2 only
The issue I am having is that I have noticed once it makes one match it move on to the next line rather than finding the other entries. for the data in question I would rather output multiple lines containing some duplicates than to miss nay data. (The name column for instance varies depending on who entered the data)
INPUT FILES
file1.csv
topic,group,name,allow
fishing,boaties,dave,yes
fishing,divers,steve,no
flying,red,luke,yes
walking,red,tom,yes
file2.csv
Resource,name,email,funny
fishing,frank,frank#home.com,no
swiming,lee,lee#wallbanger.com,no
driving,lee,lee#wallbanger.com,no
CURRENT OUTPUT
key matched
topic,group,name,allow,Resource,name,email,funny
fishing,divers,steve,no,fishing,frank,frank#home.com,no
file1_only
topic,group,user,allow
fishing,divers,steve,no
flying,red,luke,yes
walking,red,tom,yes
file2_only
Resource,user,email,funny
swiming,lee,lee#wallbanger.com,no
driving,lee,lee#wallbanger.com,no
Expected Output
key matched
topic,group,name,allow,Resource,name,email,funny
fishing,divers,steve,no,fishing,frank,frank#home.com,no
fishing,boaties,dave,yes,fishing,frank,frank#home.com,no
file1_only
topic,group,user,allow
flying,red,luke,yes
walking,red,tom,yes
file2_only
Resource,user,email,funny
swiming,lee,lee#wallbanger.com,no
driving,lee,lee#wallbanger.com,no
So for every key in file 1 column 1, it needs to output/append every key that matches in file2 column1.
This is my current awk filter. Im guessing I need to add a loop in if possible?
BEGIN { FS=OFS="," }
FNR==1 { next }
{ key = $1 }
NR==FNR {
file1[key] = $0
next
}
key in file1 {
print file1[key], $0 > "./out_combined.csv"
delete file1[key]
next
}
{
print > "./out_file2_only.csv"
}
END {
for (key in file1) {
print file1[key] > "./out_file1_only.csv"
}
}
$ cat tst.awk
BEGIN { FS=OFS="," }
FNR==1 {
if ( NR==FNR ) {
file1hdr = $0
}
else {
print file1hdr > "./out_file1_only.csv"
print > "./out_file2_only.csv"
print file1hdr, $0 > "./out_combined.csv"
}
next
}
{ key = $1 }
NR==FNR {
file1[key,++cnt[key]] = $0
next
}
{
file2[key]
if ( key in cnt ) {
for ( i=1; i<=cnt[key]; i++ ) {
print file1[key,i], $0 > "./out_combined.csv"
}
}
else {
print > "./out_file2_only.csv"
}
}
END {
for ( key in cnt ) {
if ( !(key in file2) ) {
for ( i=1; i<=cnt[key]; i++ ) {
print file1[key,i] > "./out_file1_only.csv"
}
}
}
}
$ awk -f tst.awk file1.csv file2.csv
$ head out_*
==> out_combined.csv <==
topic,group,name,allow,Resource,name,email,funny
fishing,boaties,dave,yes,fishing,frank,frank#home.com,no
fishing,divers,steve,no,fishing,frank,frank#home.com,no
==> out_file1_only.csv <==
topic,group,name,allow
flying,red,luke,yes
walking,red,tom,yes
==> out_file2_only.csv <==
Resource,name,email,funny
swiming,lee,lee#wallbanger.com,no
driving,lee,lee#wallbanger.com,no
NR>NRMIN{
if($3 == "Leu") {
if($4 == "CD1" || $4 == "HD11" || $4 == "HD12" || $4 == "HD13") {
next;
}
}
elseif($3 == "Val") {
if($4 == "CD1" || $4 == "HD11" || $4 == "HD12" || $4 == "HD13") {
next;
}
}
else {
print;
}
}
I intend to selectively print lines of a space-delimited file.
Please let me know why the above code is giving an error when gawk -f FILE_Modifier.awk NRMIN = 90 FILE > NEWFILE
Error Message
gawk: FILE_Modifier.awk:7: elseif($3 == "Val") {
gawk: FILE_Modifier.awk:7: ^ syntax error
gawk: FILE_Modifier.awk:12: else {
gawk: FILE_Modifier.awk:12: ^ syntax error
There is no elseif. Anyway, you can rewrite the script as just:
awk -v nrmin=90 '(NR > nrmin) && !(($3 ~ /^(Leu|Val)$/) && ($4 ~ /^(CD1|HD11|HD12|HD13)$/))' file
Don't use all upper case variable names to avoid clashes with builtin names. Do set variables up front using -v unless you have a specific reason not to.
I have awk file named throughput.awk to calculate throughput from trace files in NS-2.
BEGIN {
FS="[[:space:]]|_"
}
{
action = $1;
node_id = $4;
time = $2;
dest = $6;
app = $10;
pkt_size = $11;
if ( action == "r" && dest == "MAC" && app == "cbr" && time > 10 && (node_id == 1)) {
sum_ = sum_ + pkt_size;
}
}
END {
}
what I want is I have to calculate each node's throughput for multiple nodes from TCL script maybe like this :
for {set node 1} {$node < N } {incr node}
exec awk -f throughput.awk test.tr
}
so "node" variable inside trace files can be changed from TCL. How to do that?
Just use -v parameter:
for (node=1;node<N;node++){
exec awk -v node=$node -f throughput.awk test.tr
}
And inside awk
if ( action == "r" && dest == "MAC" && app == "cbr" && time > 10 && (node_id == node)) {
sum_ = sum_ + pkt_size;
}
Before the "=" node will be the name of the variable inside awk and it's value ($node) will be node val in Tcl
I am trying to build a script which tail -f | awk the log file which is getting updated every second. awk part will fetch me only the required part of the log file based on my search parameter. Output XML is also captured in an output file. Script is working fine - as expected.
Issue - However ever after the search is performed it stays hung due to tail -f. Any idea how to update below script - so that once the output XML is captured, it should break the tail part??
XMLF=/appl/logs/abc.log
aa_pam=${1-xml}
[[ ${2-xml} = "xml" ]] && tof=xml_$(date +%Y%m%d%H%M%S).xml || tof=$2
tail -f $XMLF | \
awk ' BEGIN { Print_SW=0; Cnt_line=1; i=0}
/\<\?xml version\=/ { if (Print_SW==1) p_out(Cnt_Line,i)
Print_SW=0; Cnt_line=1;
}
{ Trap_arry[Cnt_line++]=$0;
}
/'${1-xml}'/ { Print_SW=1;
}
/\<\/XYZ_999/ { if (Print_SW==1) p_out(Cnt_Line, i);
Print_SW=0; Cnt_line=1; }
END { if (Print_SW==1) p_out(Cnt_Line, i); }
function p_out(Cnt_Line, i) {
for (i=1;i<Cnt_line;i++) {print Trap_arry[i] | "tee '$tof'" }
}
' | tee $tof
Update Tried as per below suggestion of using exit - it is existing the script successfully - however the xml that is getting captured at output is getting duplicated. So in the output file - same XML appears twice..!!
XMLF=/appl/logs/abc.log
aa_pam=${1-xml}
[[ ${2-xml} = "xml" ]] && tof=xml_$(date +%Y%m%d%H%M%S).xml || tof=$2
tail -f $XMLF | \
awk ' BEGIN { Print_SW=0; Cnt_line=1; i=0}
/\<\?xml version\=/ { if (Print_SW==1) p_out(Cnt_Line,i)
Print_SW=0; Cnt_line=1;
}
{ Trap_arry[Cnt_line++]=$0;
}
/'${1-xml}'/ { Print_SW=1;
}
/\<\/XYZ_999/ { if (Print_SW==1) p_out(Cnt_Line, i);
Print_SW=0; Cnt_line=1; }
END { if (Print_SW==1) p_out(Cnt_Line, i); }
function p_out(Cnt_Line, i) {
for (i=1;i<Cnt_line;i++) {print Trap_arry[i] | "tee '$tof'" } { exit }
}
' | tee $tof
Call exit (which will jump to your END block prior to termination) after you are finished capturing your output.
When awk terminates, the next write() to stdout by tail -f will result in an EPIPE error. tail knows to terminate when that happens.
UPDATE: You seem to be having some problem trying to decide where to put the exit. It should not be in p_out because you call p_out from both the closing XML tag match expression and from the END block. Try this instead:
XMLF=/appl/logs/abc.log
aa_pam=${1-xml}
[[ ${2-xml} = "xml" ]] && tof=xml_$(date +%Y%m%d%H%M%S).xml || tof=$2
tail -f $XMLF | \
awk '
BEGIN {
Print_SW=0
Cnt_line=1
i=0
}
/\<\?xml version\=/ {
if (Print_SW==1)
p_out(Cnt_Line,i)
Print_SW=0
Cnt_line=1
}
{
Trap_arry[Cnt_line++]=$0
}
/'${1-xml}'/ {
Print_SW=1;
}
/\<\/XYZ_999/ {
if (Print_SW==1)
p_out(Cnt_Line, i)
Print_SW=0
Cnt_line=1
exit
}
END {
if (Print_SW==1)
p_out(Cnt_Line, i);
}
function p_out(Cnt_Line, i) {
for (i=1;i<Cnt_line;i++) {
print Trap_arry[i] | "tee '$tof'"
}
}
' | tee $tof
You could, in the awk script, add a line such as:
/some-end-of-xml-marker/ { close(/dev/stdin) ; }
I didn't try it, but you get the idea: close STDIN when you reached the end of the file, so that the loop in awk stops and you get to the END part (not tested, I hope this proves to be correct...)
Based on this question How to break a tail -f command in bash you could try
#! /bin/bash
XMLF=/appl/logs/abc.log
aa_pam=${1-xml}
[[ ${2-xml} = "xml" ]] && tof=xml_$(date +%Y%m%d%H%M%S).xml || tof=$2
mkfifo log.pipe
tail -f "$XMLF" > log.pipe & tail_pid=$!
awk -vpar1="$aa_pam" -vtof="$tof" -f t.awk < log.pipe
kill $tail_pid
rm log.pipe
where t.awk is:
/<\?xml version\=/ {
if (Print_SW==1) {
p_out(Cnt_Line)
}
Print_SW=0
Cnt_line=0
}
{
Trap_arry[++Cnt_line]=$0
}
$0 ~ par1 {
Print_SW=1;
}
/<\/XYZ_999/ {
if (Print_SW==1)
p_out(Cnt_Line)
Print_SW=0
Cnt_line=0
}
function p_out(Cnt_Line, i) {
for (i=1; i<Cnt_line; i++) {
print Trap_arry[i] | ("tee " tof)
}
exit 1
}
Hi I have the following awk program
Problem is that I don't know why it complains on line 3
"awk ' invalid char ' ' ' in expression " when I do awk -f make.awk info.txt
Anyone of you out there who are brighter than me in this area? =)
#!/bin/bash
function labels2 () {
awk '
/[0-9]/{
print substr($3,length($3)-11), $3
}' $# | /bin/sort -u | awk '{print "BUILD: " NR, $2}'
}
function labels () {
awk '
/[0-9]/{
BL[$3] = substr($3,length($3)-11)
}
END {
asort(BL)
for (i in BL) {
print i, BL[i]
}
}' $#
}
labels $#
exit 0
for a in $#
do
labels $# | gawk '
/BUILD:/ {
BUILD[$2] = $3
BUILDCNT ++
next
}
/[0-9]/ {
DATEd[$3] = $1
TIMEd[$3] = $2
MODULESd[$3] = $4
CASESd[$3] = $5
FAILEDd[$3] = $6
COVERd[$3] = $7
LOCd[$3] = $8
}
END {
SUBSYSTEM=substr(FILENAME, 1, length(FILENAME)-7)
LABEL= "\"" toupper(SUBSYSTEM) "\""
print "{"
print "subsystem: " LABEL ","
print " date: {"
print " label: " LABEL ","
print " data: ["
for (i = 0 ; i <= BUILDCNT; i ++ ) {
B=BUILD[i]
if (DATEd[B]) { print " [" i ", \"" DATEd[B] "\"]," }
}
print " ]"
print " },"
}
' - $a
done
That's not an awk program, but a bash script. To run it, do
chmod +x yourscript
and then
./yourscript parameters