I have files that look like this -- already sorted by year (and, within each year, by id; here that happens to be equivalent to strict sorting by id, but that may not always hold).
ID,COU,YEA, VOT
1,USA,2000,1
2,USA,2000,0
3,USA,2001,1
4,USA,2003,2
5,USA,2003,0
I would like to rewrite them like this (ids for the Nth year after 1999 in column 2N-1, the corresponding votes in column 2N):
2000 IDS, VOTE, 2001 IDS, VOTE, 2002 IDS, VOTE, 2003 IDS, VOTE
1,1,3,1, , ,4,2
2,0, , , , ,5,0
I don't know how to do it. My basic thinking with awk was:
$3 == 2000 { print $1, $4 }
$3 == 2001 { print " ", " ", $1, $4 }   # etc.
But there are two problems:
this way the columns for years other than 2000 would start with a lot of empty lines
I have found no intelligent way to generalise the print command, so I would have to write 20 if-statements
The only working idea I have is to create 20 otherwise unneeded files and glue them together with paste, which I have never used but which seems suitable according to the man page on my system.
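Roughly, something like this is what I have in mind (untested; the year_* file names are just placeholders, and paste would not create the empty 2002 columns at all, since no rows exist for that year):
awk -F, 'NR > 1 { print $1 "," $4 > ("year_" $3) }' input_file
paste -d, year_*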
The key is to use multidimensional arrays:
BEGIN { FS = "," }
NR == 2 { minYear = maxYear = $3 }
NR > 1 {
    year = $3
    count[year]++
    id[year, count[year]] = $1
    vote[year, count[year]] = $4
    if (year < minYear) minYear = year
    if (year > maxYear) maxYear = year
    if (count[year] > maxCount) maxCount = count[year]
}
END {
    sep = ""
    for (y = minYear; y <= maxYear; y++) {
        printf "%s%d ID,VOTE", sep, y
        sep = ","
    }
    print ""
    for (i = 1; i <= maxCount; i++) {
        sep = ""
        for (y = minYear; y <= maxYear; y++) {
            printf "%s%s,%s", sep, id[y, i], vote[y, i]
            sep = ","
        }
        print ""
    }
}
Then,
$ awk -f transpose.awk input_file
2000 ID,VOTE,2001 ID,VOTE,2002 ID,VOTE,2003 ID,VOTE
1,1,3,1,,,4,2
2,0,,,,,5,0
If you really want the spaces in the output, change the last printf to
printf "%s%s,%s", sep,
    ((y, i) in id   ? id[y, i]   : " "),
    ((y, i) in vote ? vote[y, i] : " ")
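which should then produce output like:
2000 ID,VOTE,2001 ID,VOTE,2002 ID,VOTE,2003 ID,VOTE
1,1,3,1, , ,4,2
2,0, , , , ,5,0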
This is functionally the same as @Glenn's and no better than it in any way, so his should remain the accepted answer, but I came up with it before looking at his and thought it might still be worth posting to show some small alternatives in style and implementation details:
$ cat tst.awk
BEGIN { FS=OFS="," }
NR == 1 { next }
{
    id    = $1
    year  = $3
    votes = $4
    if ( ++numYears == 1 ) {
        begYear = year
    }
    endYear = year
    yearIds[year,++numIds[year]] = id
    yearVotes[year,numIds[year]] = votes
    maxIds = (numIds[year] > maxIds ? numIds[year] : maxIds)
}
END {
    for (year=begYear; year<=endYear; year++) {
        printf "%s IDS%sVOTE%s", year, OFS, (year<endYear ? OFS : ORS)
    }
    for (idNr=1; idNr<=maxIds; idNr++) {
        for (year=begYear; year<=endYear; year++) {
            id = votes = " "
            if ( (year,idNr) in yearIds ) {
                id    = yearIds[year,idNr]
                votes = yearVotes[year,idNr]
            }
            printf "%s%s%s%s", id, OFS, votes, (year<endYear ? OFS : ORS)
        }
    }
}
$ awk -f tst.awk file
2000 IDS,VOTE,2001 IDS,VOTE,2002 IDS,VOTE,2003 IDS,VOTE
1,1,3,1, , ,4,2
2,0, , , , ,5,0
With respect to, and with the permission of, glenn jackman, I am taking his suggested code; the only thing I am trying to add here is getting the maximum and minimum years into awk variables up front rather than calculating them inside the main block of the awk program, since the OP confirmed that Input_file is sorted by year. The answers by Glenn and Ed sir are awesome; I just thought to add a variant here.
BTW, we could use awk instead of tail and head to set those variables too; see the sketch after the code. :)
awk -v max=$(tail -1 Input_file | cut -d, -f3) -v min=$(head -2 Input_file | tail -1 | cut -d, -f3) '
BEGIN { FS = "," }
NR > 1 {
    year = $3
    count[year]++
    id[year, count[year]] = $1
    vote[year, count[year]] = $4
    if (count[year] > maxCount) maxCount = count[year]
}
END {
    sep = ""
    for (y = min; y <= max; y++) {
        printf "%s%d ID,VOTE", sep, y
        sep = ","
    }
    print ""
    for (i = 1; i <= maxCount; i++) {
        sep = ""
        for (y = min; y <= max; y++) {
            printf "%s%s,%s", sep, id[y, i], vote[y, i]
            sep = ","
        }
        print ""
    }
}' Input_file
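To illustrate that remark, the tail/head pipelines could be replaced with something along these lines (an untested sketch that relies on the single header line and the file being sorted by year):
min=$(awk -F, 'NR==2{print $3; exit}' Input_file)
max=$(awk -F, 'END{print $3}' Input_file)
and then passed in with -v min="$min" -v max="$max" as before.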
I would like to read and count field values equal to "TRUE", but only from the 3rd field to the 5th field.
Input.txt
Locationx,Desc,A,B,C,Locationy
ab123,Name1,TRUE,TRUE,TRUE,ab1234
ab123,Name2,TRUE,FALSE,TRUE,ab1234
ab123,Name2,FALSE,FALSE,TRUE,ab1234
ab123,Name1,TRUE,TRUE,TRUE,ab1234
ab123,Name2,TRUE,TRUE,TRUE,ab1234
ab123,Name3,FALSE,FALSE,FALSE,ab1234
ab123,Name3,TRUE,FALSE,FALSE,ab1234
ab123,Name3,TRUE,TRUE,FALSE,ab1234
ab123,Name3,TRUE,TRUE,FALSE,ab1234
ab123,Name1,TRUE,TRUE,FALSE,ab1234
While reading the headers from the 3rd field to the 5th field, i.e. A, B, C, I want to generate only the unique combinations A, B, C, AB, AC, BC, ABC.
Note: AA, BB, CC, BA, etc. are excluded.
If a "TRUE" is counted towards the "AB" combination, then it should not be counted towards the "A" count or the "B" count again, to avoid duplicates.
Example#1
Locationx,Desc,A,B,C,Locationy
ab123,Name1,TRUE,TRUE,TRUE,ab1234
Op#1
Desc,A,B,C,AB,AC,BC,ABC
Name1,,,,,,,1
Example#2
Locationx,Desc,A,B,C,Locationy
ab123,Name1,TRUE,TRUE,FALSE,ab1234
Op#2
Desc,A,B,C,AB,AC,BC,ABC
Name1,,,,1,,,
Example#3
Locationx,Desc,A,B,C,Locationy
ab123,Name1,FALSE,TRUE,FALSE,ab1234
Op#3
Desc,A,B,C,AB,AC,BC,ABC
Name1,,1,,,,,
Desired Output:
Desc,A,B,C,AB,AC,BC,ABC
Name1,,,,1,,,2
Name2,,,1,,1,,1
Name3,1,,,2,,,
The actual file is like below:
Input.txt
Locationx,Desc,INCOMING,OUTGOING,SMS,RECHARGE,DEBIT,DATA,Locationy
ab123,Name1,TRUE,TRUE,TRUE,FALSE,FALSE,FALSE,ab1234
ab123,Name2,TRUE,TRUE,FALSE,TRUE,TRUE,TRUE,ab1234
ab123,Name2,TRUE,TRUE,TRUE,TRUE,FALSE,FALSE,ab1234
ab123,Name1,TRUE,TRUE,TRUE,TRUE,FALSE,TRUE,ab1234
ab123,Name2,TRUE,TRUE,TRUE,TRUE,FALSE,TRUE,ab1234
ab123,Name3,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,ab1234
ab123,Name3,TRUE,TRUE,TRUE,TRUE,TRUE,TRUE,ab1234
ab123,Name3,TRUE,TRUE,FALSE,TRUE,FALSE,FALSE,ab1234
ab123,Name3,TRUE,TRUE,FALSE,TRUE,FALSE,FALSE,ab1234
ab123,Name1,TRUE,TRUE,FALSE,FALSE,FALSE,TRUE,ab1234
I have tried a lot, but nothing has materialised. Any suggestions, please!
Edit: Desired Output from Actual Input:
Desc,INCOMING-OUTGOING-SMS-RECHARGE-DEBIT-DATA,OUTGOING-SMS-RECHARGE-DEBIT-DATA,INCOMING-SMS-RECHARGE-DEBIT-DATA,INCOMING-OUTGOING-RECHARGE-DEBIT-DATA,INCOMING-OUTGOING-SMS-RECHARGE-DATA,INCOMING-OUTGOING-SMS-RECHARGE-DEBIT,SMS-RECHARGE-DEBIT-DATA,OUTGOING-RECHARGE-DEBIT-DATA,OUTGOING-SMS-RECHARGE-DATA,OUTGOING-SMS-RECHARGE-DEBIT,INCOMING-RECHARGE-DEBIT-DATA,INCOMING-SMS-DEBIT-DATA,INCOMING-SMS-RECHARGE-DATA,INCOMING-SMS-RECHARGE-DEBIT,INCOMING-OUTGOING-DEBIT-DATA,INCOMING-OUTGOING-RECHARGE-DATA,INCOMING-OUTGOING-RECHARGE-DEBIT,INCOMING-OUTGOING-SMS-DATA,INCOMING-OUTGOING-SMS-DEBIT,INCOMING-OUTGOING-SMS-RECHARGE,RECHARGE-DEBIT-DATA,SMS-DEBIT-DATA,SMS-RECHARGE-DATA,SMS-RECHARGE-DEBIT,OUTGOING-RECHARGE-DATA,OUTGOING-RECHARGE-DEBIT,OUTGOING-SMS-DATA,OUTGOING-SMS-DEBIT,OUTGOING-SMS-RECHARGE,INCOMING-DEBIT-DATA,INCOMING-RECHARGE-DATA,INCOMING-RECHARGE-DEBIT,INCOMING-SMS-DATA,INCOMING-SMS-DEBIT,INCOMING-SMS-RECHARGE,INCOMING-OUTGOING-DATA,INCOMING-OUTGOING-DEBIT,INCOMING-OUTGOING-RECHARGE,INCOMING-OUTGOING-SMS,DEBIT-DATA,RECHARGE-DATA,RECHARGE-DEBIT,SMS-DATA,SMS-DEBIT,SMS-RECHARGE,OUTGOING-DATA,OUTGOING-DEBIT,OUTGOING-RECHARGE,OUTGOING-SMS,INCOMING-DATA,INCOMING-DEBIT,INCOMING-RECHARGE,INCOMING-SMS,INCOMING-OUTGOING,DATA,DEBIT,RECHARGE,SMS,OUTGOING,INCOMING
Name1,,,,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,,,1,,,,,,,,,,,,,,,,,,,,,
Name2,,,,1,1,,,,,,,,,,,,,,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Name3,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,,,,,,,1,,,
I don't have Perl or Python access!
I have written a Perl script that does this for you. As you can see from its size and comments, it is really simple to get this done.
#!/usr/bin/perl
use strict;
use warnings;
use autodie;
use Algorithm::Combinatorics qw(combinations);

## change the file to the path where your file exists
open my $fh, '<', 'file';

my (%data, @new_labels);

## capture the header line in an array
my @header = split /,/, <$fh>;

## backup the header
my @fields = @header;

## remove first, second and last columns
@header = splice @header, 2, -1;

## generate unique combinations
for my $iter (1 .. +@header) {
    my $combination = combinations(\@header, $iter);
    while (my $pair = $combination->next) {
        push @new_labels, "@$pair";
    }
}

## iterate through rest of the file
while (my $line = <$fh>) {
    my @line = split /,/, $line;

    ## identify combined labels that are true
    my @is_true = map { $fields[$_] } grep { $line[$_] eq "TRUE" } 0 .. $#line;

    ## increment counter in hash map keyed at description and then new labels
    ++$data{$line[1]}{$_} for map { s/ /-/g; $_ } "@is_true";
}

## print the new header
print join ( ",", "Desc", map { s/ /-/g; $_ } reverse @new_labels ) . "\n";

## print the description and counter values
for my $desc (sort keys %data) {
    print join ( ",", $desc, ( map { $data{$desc}{$_} //= "" } reverse @new_labels ) ) . "\n";
}
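Assuming the script is saved as, say, combos.pl (a placeholder name) and Algorithm::Combinatorics is available from CPAN (e.g. installed with cpan Algorithm::Combinatorics), it can be run as:
$ perl combos.pl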
Output:
Desc,INCOMING-OUTGOING-SMS-RECHARGE-DEBIT-DATA,OUTGOING-SMS-RECHARGE-DEBIT-DATA,INCOMING-SMS-RECHARGE-DEBIT-DATA,INCOMING-OUTGOING-RECHARGE-DEBIT-DATA,INCOMING-OUTGOING-SMS-DEBIT-DATA,INCOMING-OUTGOING-SMS-RECHARGE-DATA,INCOMING-OUTGOING-SMS-RECHARGE-DEBIT,SMS-RECHARGE-DEBIT-DATA,OUTGOING-RECHARGE-DEBIT-DATA,OUTGOING-SMS-DEBIT-DATA,OUTGOING-SMS-RECHARGE-DATA,OUTGOING-SMS-RECHARGE-DEBIT,INCOMING-RECHARGE-DEBIT-DATA,INCOMING-SMS-DEBIT-DATA,INCOMING-SMS-RECHARGE-DATA,INCOMING-SMS-RECHARGE-DEBIT,INCOMING-OUTGOING-DEBIT-DATA,INCOMING-OUTGOING-RECHARGE-DATA,INCOMING-OUTGOING-RECHARGE-DEBIT,INCOMING-OUTGOING-SMS-DATA,INCOMING-OUTGOING-SMS-DEBIT,INCOMING-OUTGOING-SMS-RECHARGE,RECHARGE-DEBIT-DATA,SMS-DEBIT-DATA,SMS-RECHARGE-DATA,SMS-RECHARGE-DEBIT,OUTGOING-DEBIT-DATA,OUTGOING-RECHARGE-DATA,OUTGOING-RECHARGE-DEBIT,OUTGOING-SMS-DATA,OUTGOING-SMS-DEBIT,OUTGOING-SMS-RECHARGE,INCOMING-DEBIT-DATA,INCOMING-RECHARGE-DATA,INCOMING-RECHARGE-DEBIT,INCOMING-SMS-DATA,INCOMING-SMS-DEBIT,INCOMING-SMS-RECHARGE,INCOMING-OUTGOING-DATA,INCOMING-OUTGOING-DEBIT,INCOMING-OUTGOING-RECHARGE,INCOMING-OUTGOING-SMS,DEBIT-DATA,RECHARGE-DATA,RECHARGE-DEBIT,SMS-DATA,SMS-DEBIT,SMS-RECHARGE,OUTGOING-DATA,OUTGOING-DEBIT,OUTGOING-RECHARGE,OUTGOING-SMS,INCOMING-DATA,INCOMING-DEBIT,INCOMING-RECHARGE,INCOMING-SMS,INCOMING-OUTGOING,DATA,DEBIT,RECHARGE,SMS,OUTGOING,INCOMING
Name1,,,,,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,,,1,,,,,,,,,,,,,,,,,,,,,
Name2,,,,1,,1,,,,,,,,,,,,,,,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Name3,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,,,,,,,,,,,,,,,,,,,1,,,
Note: Please revisit your expected output. It has a few mistakes in it, as you can see from the output generated by the script above.
Here is an attempt at solving this using awk:
Content of script.awk
BEGIN { FS = OFS = "," }

function combinations(flds, itr,    i, pre) {
    for (i = ++cnt; i <= numRecs; i++) {
        ++n
        sep = ""
        for (pre = 1; pre <= itr; pre++) {
            newRecs[n] = newRecs[n] sep (sprintf ("%s", flds[pre]));
            sep = "-"
        }
        newRecs[n] = newRecs[n] sep (sprintf ("%s", flds[i]));
    }
}

NR==1 {
    for (fld = 3; fld < NF; fld++) {
        recs[++numRecs] = $fld
    }
    for (iter = 0; iter < numRecs; iter++) {
        combinations(recs, iter)
    }
    next
}

!seen[$2]++ { desc[++d] = $2 }

{
    y = 0;
    var = sep = ""
    for (idx = 3; idx < NF; idx++) {
        if ($idx == "TRUE") {
            is_true[++y] = recs[idx-2]
        }
    }
    for (z = 1; z <= y; z++) {
        var = var sep sprintf ("%s", is_true[z])
        sep = "-"
    }
    data[$2,var]++;
}

END {
    printf "%s," , "Desc"
    for (k = 1; k <= n; k++) {
        printf "%s%s", newRecs[k], (k==n ? RS : FS)
    }
    for (name = 1; name <= d; name++) {
        printf "%s,", desc[name];
        for (nR = 1; nR <= n; nR++) {
            printf "%s%s", (data[desc[name],newRecs[nR]] ? data[desc[name],newRecs[nR]] : ""), (nR==n ? RS : FS)
        }
    }
}
Sample file
Locationx,Desc,A,B,C,Locationy
ab123,Name1,TRUE,TRUE,TRUE,ab1234
ab123,Name2,TRUE,FALSE,TRUE,ab1234
ab123,Name2,FALSE,FALSE,TRUE,ab1234
ab123,Name1,TRUE,TRUE,TRUE,ab1234
ab123,Name2,TRUE,TRUE,TRUE,ab1234
ab123,Name3,FALSE,FALSE,FALSE,ab1234
ab123,Name3,TRUE,FALSE,FALSE,ab1234
ab123,Name3,TRUE,TRUE,FALSE,ab1234
ab123,Name3,TRUE,TRUE,FALSE,ab1234
ab123,Name1,TRUE,TRUE,FALSE,ab1234
Execution:
$ awk -f script.awk file
Desc,A,B,C,A-B,A-C,A-B-C
Name1,,,,1,,2
Name2,,,1,,1,1
Name3,1,,,2,,
Now, there is a pretty evident bug in the combinations function: it does not recurse to generate all combinations. For example, for A B C it will print
A
B
C
AB
AC
ABC
but not BC.
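For completeness, a recursive helper along these lines would generate every subset, including the missing ones such as BC (a minimal, untested sketch; gen, combos and nCombos are names I made up and are not part of the script above):
function gen(flds, n, start, cur,    i, key) {
    # extend the current prefix "cur" with each remaining field in turn
    for (i = start; i <= n; i++) {
        key = (cur == "" ? flds[i] : cur "-" flds[i])
        combos[++nCombos] = key       # record this combination
        gen(flds, n, i + 1, key)      # recurse to build longer combinations
    }
}
Calling gen(recs, numRecs, 1, "") from the NR==1 block, in place of the combinations() loop, would produce A, A-B, A-B-C, A-C, B, B-C, C.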