How to write this output in array form, with the remaining array positions output as 0.0000 - pygtk

import gtk
class PyApp(gtk.Window):
    """Window holding a label, a text entry and an OK button.

    Clicking OK writes the current entry text to ``entry.txt`` (a relative
    path, so the file lands in the current working directory).
    """

    def __init__(self):
        """Build the widgets, wire up the signals and show the window."""
        super(PyApp, self).__init__()
        self.set_title("Entry")
        self.set_size_request(300, 300)
        self.set_position(gtk.WIN_POS_CENTER)

        # Children are placed in the Fixed container at explicit coordinates.
        fixed = gtk.Fixed()

        self.label = gtk.Label("Entry")
        fixed.put(self.label, 40, 40)

        # Kept on self so button_clicked can read the text later.
        self.entry = gtk.Entry()
        fixed.put(self.entry, 80, 40)

        button1 = gtk.Button("OK")
        button1.connect('clicked', self.button_clicked)
        fixed.put(button1, 130, 90)

        # Leave the GTK main loop when the window is destroyed.
        self.connect("destroy", gtk.main_quit)
        self.add(fixed)
        self.show_all()

    def button_clicked(self, widget):
        """Handle the OK button's 'clicked' signal.

        Overwrites ``entry.txt`` with the text currently in the entry.
        ``widget`` is the object that emitted the signal; it is not used.
        """
        with open('entry.txt', 'w') as f:
            f.write(self.entry.get_text())
PyApp()
gtk.main()
I need the output to be like this:
entry_text 0.0000 0.0000 0.0000
0.0000 0.0000 0.0000 0.0000
0.0000 0.0000 0.0000 0.0000
0.0000 0.0000 0.0000 0.0000
All help is appreciated!

Related

Plot secondary x_axis in ggplot

Dear All seniors and members,
Hope you are doing great. I have a data set for which I would like to plot a secondary x-axis in ggplot. I have not been able to make it work for the last 4 hours. Below is my dataset.
Pathway ES NES p_value q_value Group
1 HALLMARK_HYPOXIA 0.49 2.25 0.000 0.000 Top
2 HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 0.44 2.00 0.000 0.000 Top
3 HALLMARK_UV_RESPONSE_DN 0.45 1.98 0.000 0.000 Top
4 HALLMARK_TGF_BETA_SIGNALING 0.48 1.77 0.003 0.004 Top
5 HALLMARK_HEDGEHOG_SIGNALING 0.52 1.76 0.003 0.003 Top
6 HALLMARK_ESTROGEN_RESPONSE_EARLY 0.38 1.73 0.000 0.004 Top
7 HALLMARK_KRAS_SIGNALING_DN 0.37 1.69 0.000 0.005 Top
8 HALLMARK_INTERFERON_ALPHA_RESPONSE 0.37 1.54 0.009 0.021 Top
9 HALLMARK_TNFA_SIGNALING_VIA_NFKB 0.32 1.45 0.005 0.048 Top
10 HALLMARK_NOTCH_SIGNALING 0.42 1.42 0.070 0.059 Top
11 HALLMARK_COAGULATION 0.32 1.39 0.031 0.067 Top
12 HALLMARK_MITOTIC_SPINDLE 0.30 1.37 0.025 0.078 Top
13 HALLMARK_ANGIOGENESIS 0.40 1.37 0.088 0.074 Top
14 HALLMARK_WNT_BETA_CATENIN_SIGNALING 0.35 1.23 0.173 0.216 Top
15 HALLMARK_OXIDATIVE_PHOSPHORYLATION -0.65 -3.43 0.000 0.000 Bottom
16 HALLMARK_MYC_TARGETS_V1 -0.49 -2.56 0.000 0.000 Bottom
17 HALLMARK_E2F_TARGETS -0.45 -2.37 0.000 0.000 Bottom
18 HALLMARK_DNA_REPAIR -0.46 -2.33 0.000 0.000 Bottom
19 HALLMARK_ADIPOGENESIS -0.42 -2.26 0.000 0.000 Bottom
20 HALLMARK_FATTY_ACID_METABOLISM -0.41 -2.06 0.000 0.000 Bottom
21 HALLMARK_PEROXISOME -0.43 -2.01 0.000 0.000 Bottom
22 HALLMARK_MYC_TARGETS_V2 -0.43 -1.84 0.003 0.001 Bottom
23 HALLMARK_CHOLESTEROL_HOMEOSTASIS -0.42 -1.83 0.003 0.001 Bottom
24 HALLMARK_ALLOGRAFT_REJECTION -0.34 -1.78 0.000 0.003 Bottom
25 HALLMARK_MTORC1_SIGNALING -0.32 -1.67 0.000 0.004 Bottom
26 HALLMARK_P53_PATHWAY -0.29 -1.52 0.000 0.015 Bottom
27 HALLMARK_UV_RESPONSE_UP -0.28 -1.41 0.013 0.036 Bottom
28 HALLMARK_REACTIVE_OXYGEN_SPECIES_PATHWAY -0.35 -1.39 0.057 0.040 Bottom
29 HALLMARK_HEME_METABOLISM -0.26 -1.34 0.014 0.061 Bottom
30 HALLMARK_G2M_CHECKPOINT -0.23 -1.20 0.080 0.172 Bottom
I would like to produce a plot like the following one (plot #1).
Here is my current code.
# Bar chart of NES per pathway: bars are ordered by NES and filled by Group.
# The trailing coord_flip() swaps the axes so the bars run horizontally.
ggplot(data, aes(reorder(Pathway, NES), NES, fill= Group)) +
theme_classic() + geom_col() +
# Text styling for axis labels/titles and a centred plot title.
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 8),
axis.title = element_text(face = "bold", size = 12),
axis.text = element_text(face = "bold", size = 8), plot.title = element_text(hjust = 0.5)) + labs(x="Pathway", y="Normalized Enrichment Score",
title="2Gy_5f vs. 0Gy") + coord_flip()
This code produces the following plot (plot # 2)
So I would like to generate a plot that has a secondary axis showing q_value (like the first bar plot I have attached). Any help is greatly appreciated. Note: I used coord_flip, so the axes are flipped.
Kind Regards,
synat
[1]: https://i.stack.imgur.com/dBFIS.jpg
[2]: https://i.stack.imgur.com/yDbC5.jpg
Maybe you don't need a secondary axis per se to get the plot style you seek.
library(tidyverse)
# Horizontal bar chart of NES per pathway with the q-values printed as a text
# column to the right of the bars, instead of a true secondary axis.
# The bars are made horizontal by mapping x = NES and y = Pathway directly
# (no coord_flip()), so the axis titles in labs() follow that mapping.
ggplot(data, aes(x = NES, y = reorder(Pathway, NES), fill = Group)) +
  theme_classic() +
  geom_col() +
  # One q_value label per pathway, left-aligned at x = 2.5.
  geom_text(aes(x = 2.5, y = reorder(Pathway, NES), label = q_value), hjust = 0) +
  # Column heading for the q-values, one row above the topmost bar.
  annotate("text", x = 2.5, y = length(data$Pathway) + 1, hjust = 0, fontface = "bold", label = "q_value") +
  # Extend the panel to make room for the text column and its heading;
  # clip = "off" lets the text draw outside the panel area.
  coord_cartesian(xlim = c(NA, 3),
                  ylim = c(NA, length(data$Pathway) + 1),
                  clip = "off") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 8),
        axis.title = element_text(face = "bold", size = 12),
        axis.text = element_text(face = "bold", size = 8),
        plot.title = element_text(hjust = 0.5)) +
  # x carries NES and y carries Pathway here, so the titles are assigned
  # accordingly (they were swapped when coord_flip() was dropped).
  labs(x = "Normalized Enrichment Score", y = "Pathway",
       title = "2Gy_5f vs. 0Gy")
And for future reference you can read in data in the format you pasted like so:
data <- read_table(
"
Pathway ES NES p_value q_value Group
HALLMARK_HYPOXIA 0.49 2.25 0.000 0.000 Top
HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 0.44 2.00 0.000 0.000 Top
HALLMARK_UV_RESPONSE_DN 0.45 1.98 0.000 0.000 Top
HALLMARK_TGF_BETA_SIGNALING 0.48 1.77 0.003 0.004 Top
HALLMARK_HEDGEHOG_SIGNALING 0.52 1.76 0.003 0.003 Top
HALLMARK_ESTROGEN_RESPONSE_EARLY 0.38 1.73 0.000 0.004 Top
HALLMARK_KRAS_SIGNALING_DN 0.37 1.69 0.000 0.005 Top
HALLMARK_INTERFERON_ALPHA_RESPONSE 0.37 1.54 0.009 0.021 Top
HALLMARK_TNFA_SIGNALING_VIA_NFKB 0.32 1.45 0.005 0.048 Top
HALLMARK_NOTCH_SIGNALING 0.42 1.42 0.070 0.059 Top
HALLMARK_COAGULATION 0.32 1.39 0.031 0.067 Top
HALLMARK_MITOTIC_SPINDLE 0.30 1.37 0.025 0.078 Top
HALLMARK_ANGIOGENESIS 0.40 1.37 0.088 0.074 Top
HALLMARK_WNT_BETA_CATENIN_SIGNALING 0.35 1.23 0.173 0.216 Top
HALLMARK_OXIDATIVE_PHOSPHORYLATION -0.65 -3.43 0.000 0.000 Bottom
HALLMARK_MYC_TARGETS_V1 -0.49 -2.56 0.000 0.000 Bottom
HALLMARK_E2F_TARGETS -0.45 -2.37 0.000 0.000 Bottom
HALLMARK_DNA_REPAIR -0.46 -2.33 0.000 0.000 Bottom
HALLMARK_ADIPOGENESIS -0.42 -2.26 0.000 0.000 Bottom
HALLMARK_FATTY_ACID_METABOLISM -0.41 -2.06 0.000 0.000 Bottom
HALLMARK_PEROXISOME -0.43 -2.01 0.000 0.000 Bottom
HALLMARK_MYC_TARGETS_V2 -0.43 -1.84 0.003 0.001 Bottom
HALLMARK_CHOLESTEROL_HOMEOSTASIS -0.42 -1.83 0.003 0.001 Bottom
HALLMARK_ALLOGRAFT_REJECTION -0.34 -1.78 0.000 0.003 Bottom
HALLMARK_MTORC1_SIGNALING -0.32 -1.67 0.000 0.004 Bottom
HALLMARK_P53_PATHWAY -0.29 -1.52 0.000 0.015 Bottom
HALLMARK_UV_RESPONSE_UP -0.28 -1.41 0.013 0.036 Bottom
HALLMARK_REACTIVE_OXYGEN_SPECIES_PATHWAY -0.35 -1.39 0.057 0.040 Bottom
HALLMARK_HEME_METABOLISM -0.26 -1.34 0.014 0.061 Bottom
HALLMARK_G2M_CHECKPOINT -0.23 -1.20 0.080 0.172 Bottom")
Created on 2021-11-23 by the reprex package (v2.0.1)

Apply transformation to masked dataframe

I have this matrix df.head():
0 1 2 3 4 5 6 7 8 9 ... 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857
0 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 30.88689 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 42.43819 0.0 0.0 0.0 0.00000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 rows × 1858 columns
And I need to apply a transformation to it every time a value other than 0.0 is found, dividing the value by 0.32
So far I have the mask, like so:
# Intended divisor for the non-zero values (not used by the lines below yet).
normalize = 0.32
# NOTE(review): `>= 0.0` is also True for the zeros themselves, so this mask
# does not single out the "other than 0.0" entries described above.
mask = (df>=0.0)
# where() keeps entries whose mask is True; the rest become NaN.
df = df.where(mask)
How do I apply such a transformation on a very large dataframe, after masking it?
You don't need mask, just divide your dataframe by 0.32.
df / 0.32
>>> df
A B
0 0 3
1 5 0
>>> df / 0.32
A B
0 0.000 9.375
1 15.625 0.000
If you needed to use mask, try;
# True wherever the value equals 0.
mask = (df.eq(0))
# Keep the zero entries as they are; every other entry is replaced by the
# corresponding value of df/0.32. Returns a new frame (df is not modified).
df.where(mask, df/0.32)

How do I get awk to print fields from the second row of a file?

I have a file that looks like this:
measured 10.8 0.0000 0.0000 0.0236 0.0304 0.0383 0.0433 0.0437 0.0442 0.0452
0.0455 0.0448 0.0440 0.0423 0.0386 0.0344 0.0274 0.0000 0.0000
I want gawk to print all the numbers in one long single column like this:
0.0000
0.0000
0.0236
0.0304
0.0383
0.0433
0.0437
0.0442
0.0452
0.0455
0.0448
0.0440
0.0423
0.0386
0.0344
0.0274
0.0000
0.0000
I run the command gawk '/measured/ { printf $3"\n" $4"\n" $5"\n" $6"\n" $7"\n" $8"\n" $9"\n" $10"\n" $11"\n" $12"\n" $13"\n" $14"\n" $15"\n" $16"\n" $17"\n" $18"\n" }' filename.txt
But I just get the first row of numbers:
0.0000
0.0000
0.0236
0.0304
0.0383
0.0433
0.0437
0.0442
0.0452
How do I get gawk to print the second row?
$ cat tst.awk
# Join output fields with newlines so each number ends up on its own line.
BEGIN { OFS = "\n" }
# On a "measured" line: arm a two-line counter and blank the first two fields
# (the "measured" label and the value after it). Assigning $0 to itself
# re-splits the record, presumably so the emptied fields drop out.
/measured/ { c=2; $1=$2=""; $0=$0 }
# While the counter is non-zero (the "measured" line and the line after it),
# decrement it, force the record to be rebuilt with OFS, and print it.
c && c-- { $1=$1; print }
$ awk -f tst.awk file
0.0000
0.0000
0.0236
0.0304
0.0383
0.0433
0.0437
0.0442
0.0452
0.0455
0.0448
0.0440
0.0423
0.0386
0.0344
0.0274
0.0000
0.0000
$ grep -A1 measured file | tr -s ' ' \\n | tail -n+4
0.0000
0.0000
0.0236
0.0304
0.0383
0.0433
0.0437
0.0442
0.0452
0.0455
0.0448
0.0440
0.0423
0.0386
0.0344
0.0274
0.0000
0.0000
with awk
$ awk -v OFS='\n' '/measured/ {p=1; for(i=3;i<=NF;i++) print $i; next}
p {$1=$1; print; exit}' file
If the number of fields is guaranteed to be as in the example, you can use the following command:
awk '{for(i=NF-8;i<=NF;i++){print $i}}' input.file
The GNU implementation of Awk allows an arbitrary regular expression as the RS record separator. If the keyword measured occurs before each batch of numbers, we can use that keyword as the separator:
$ gawk 'BEGIN { RS = "measured" } { for (i = 1; i <= NF ; i++) print "field " i " = " $i }'
measured 10.8 0.0000 0.0000 0.0236 0.0304 0.0383 0.0433 0.0437 0.0442 0.0452
0.0455 0.0448 0.0440 0.0423 0.0386 0.0344 0.0274 0.0000 0.000
field 1 = 10.8
field 2 = 0.0000
field 3 = 0.0000
field 4 = 0.0236
field 5 = 0.0304
field 6 = 0.0383
field 7 = 0.0433
field 8 = 0.0437
field 9 = 0.0442
field 10 = 0.0452
field 11 = 0.0455
field 12 = 0.0448
field 13 = 0.0440
field 14 = 0.0423
field 15 = 0.0386
field 16 = 0.0344
field 17 = 0.0274
field 18 = 0.0000
field 19 = 0.000
As you can see, all the fields between the measured record separators are parsed out regardless of line breaks. Fields are separated on any mixture of spaces, tabs and newlines.
Note that because measured appears first, we get an empty record. The output you see above is, effectively, from the second record. The first record is the whitespace before measured, which contains no fields.
In other words, the record separator is really expected to be a terminator, except that it can be missing after the last record.

create multi-indexed dataframe

I do not know how to create a multi-indexed df (that has unequal number of 2nd-indices). here is a sample:
data = [{'caterpillar': [('Сatérpillar',
{'fuzz': 0.82,
'levenshtein': 0.98,
'jaro_winkler': 0.9192,
'hamming': 0.98}),
('caterpiⅼⅼaʀ',
{'fuzz': 0.73,
'levenshtein': 0.97,
'jaro_winkler': 0.9114,
'hamming': 0.97}),
('cÂteԻpillÂr',
{'fuzz': 0.73,
'levenshtein': 0.97,
'jaro_winkler': 0.881,
'hamming': 0.97})]},
{'elementis': [('elEmENtis',
{'fuzz': 1.0, 'levenshtein': 1.0, 'jaro_winkler': 1.0, 'hamming': 1.0}),
('ÊlemĚntis',
{'fuzz': 0.78,
'levenshtein': 0.98,
'jaro_winkler': 0.863,
'hamming': 0.98}),
('еlÈmÈntis',
{'fuzz': 0.67,
'levenshtein': 0.97,
'jaro_winkler': 0.8333,
'hamming': 0.97})]},
{'gibson': [('giBᏚon',
{'fuzz': 0.83,
'levenshtein': 0.99,
'jaro_winkler': 0.9319,
'hamming': 0.99}),
('ɡibsoN',
{'fuzz': 0.83,
'levenshtein': 0.99,
'jaro_winkler': 0.9206,
'hamming': 0.99}),
('giЬႽon',
{'fuzz': 0.67,
'levenshtein': 0.98,
'jaro_winkler': 0.84,
'hamming': 0.98}),
('glbsՕn',
{'fuzz': 0.67,
'levenshtein': 0.98,
'jaro_winkler': 0.8333,
'hamming': 0.98})]}]
I want a df like this (note: 'Other Name' has a differing number of values for each 'Orig Name'):
Orig Name| Other Name| fuzz| levenstein| Jaro-Winkler| Hamming
------------------------------------------------------------------------
caterpillar Сatérpillar 0.82 0.98 0.9192 0.98
caterpiⅼⅼaʀ 0.73 0.97 0.9114 0.97
cÂteԻpillÂr 0.73 0.97 0.881 0.97
gibson giBᏚon 0.83 0.99 0.9319 0.99
ɡibsoN 0.83 0.99 0.9206 0.99
giЬႽon 0.67 0.98 0.84 0.98
glbsՕn 0.67 0.98 0.8333 0.98
elementis .........
--------------------------------------------------------------------------
I tried :
# Flatten the list of one-key dicts in `data` into parallel helper lists.
# Every key of every dict, in order.
orig_name_list = [name for record in data for name in record]
# The list of (other_name, scores) tuples stored under each key.
value_list = [matches for record in data for matches in record.values()]
# First element of every tuple across all of those lists.
other_names = [pair[0] for matches in value_list for pair in matches]
# Names of the similarity scores found in each scores dict.
algos = ['fuzz', 'levenshtein', 'jaro_winkler', 'hamming']
Not sure how to proceed from there. Suggestions are appreciated.
Let's try concat:
# Build a one-row frame from each scores dict, tag it with the original and
# the look-alike name, then stack all rows. Each row keeps its own index 0.
pd.concat([
    pd.DataFrame([pair[1]]).assign(OrigName=orig, OtherName=pair[0])
    for record in data
    for orig, matches in record.items()
    for pair in matches
])
Output:
fuzz levenshtein jaro_winkler hamming OrigName OtherName
0 0.82 0.98 0.9192 0.98 caterpillar Сatérpillar
0 0.73 0.97 0.9114 0.97 caterpillar caterpiⅼⅼaʀ
0 0.73 0.97 0.8810 0.97 caterpillar cÂteԻpillÂr
0 1.00 1.00 1.0000 1.00 elementis elEmENtis
0 0.78 0.98 0.8630 0.98 elementis ÊlemĚntis
0 0.67 0.97 0.8333 0.97 elementis еlÈmÈntis
0 0.83 0.99 0.9319 0.99 gibson giBᏚon
0 0.83 0.99 0.9206 0.99 gibson ɡibsoN
0 0.67 0.98 0.8400 0.98 gibson giЬႽon
0 0.67 0.98 0.8333 0.98 gibson glbsՕn
One way to do this is to reformat your data for json record consumption via the pd.json_normalize function. Your json is currently not formatted correctly to be stored into a dataframe easily:
# Reshape `data` into records that pd.json_normalize can consume:
#   {"name": <original name>, "matches": [{<scores...>, "match": <other name>}, ...]}
new_data = []
for entry in data:
    for name, matches in entry.items():
        new_data.append({
            "name": name,
            # Copy each scores dict and add the look-alike name under "match";
            # copying leaves the dicts inside `data` unmodified.
            "matches": [{**scores, "match": other} for other, scores in matches],
        })

# One row per match, carrying the parent "name" along as metadata, then
# indexed by (name, match) to get the two-level index.
df = pd.json_normalize(new_data, "matches", ["name"]).set_index(["name", "match"])
print(df)
fuzz levenshtein jaro_winkler hamming
name match
caterpillar Сatérpillar 0.82 0.98 0.9192 0.98
caterpiⅼⅼaʀ 0.73 0.97 0.9114 0.97
cÂteԻpillÂr 0.73 0.97 0.8810 0.97
elementis elEmENtis 1.00 1.00 1.0000 1.00
ÊlemĚntis 0.78 0.98 0.8630 0.98
еlÈmÈntis 0.67 0.97 0.8333 0.97
gibson giBᏚon 0.83 0.99 0.9319 0.99
ɡibsoN 0.83 0.99 0.9206 0.99
giЬႽon 0.67 0.98 0.8400 0.98
glbsՕn 0.67 0.98 0.8333 0.98

Extract date from date time - change . to , and print sum up of different field

aNumber bNumber startDate cost balanceAfter trafficCase Operator unknown3 MainAmount BALANCEBEFORE
22676239633 433 2014-07-02 10:16:48.000 0,00 0.20 0 Short Code 397224944 0.0000 0.2000
22677277255 76919167 2014-07-02 10:16:51.000 1,00 92.60 0 Airtel 126268625 0.0000 92.6000
22676777508 76701575 2014-07-02 10:16:55.000 1,00 217.00 0 Airtel 4132186103 0.0000 217.0000
22665706841 433 2014-07-02 10:16:57.000 0,00 69.50 0 Short Code 4133821554 0.0000 69.5000
22665799922 70110055 2014-07-03 10:16:45.000 20,00 0.50 0 Telmob 126260244 20.0000 0.5000
22676239633 433 2014-07-03 10:16:48.000 0,00 0.20 0 Short Code 397224944 0.0000 0.2000
22677277255 76919167 2014-07-04 10:16:51.000 1,00 92.60 0 Airtel 126268625 0.0000 92.6000
22676777508 76701575 2014-07-04 10:16:55.000 1,00 217.00 0 Airtel 4132186103 0.0000 217.0000
22665706841 433 2014-07-05 10:16:57.000 0,00 69.50 0 Short Code 4133821554 0.0000 69.5000
Here is a sample of the data I have. I want to sum up cost, balanceAfter, MainAmount and BALANCEBEFORE each time the date changes, but my concern is that the date is combined with the time and my decimal separator is a dot instead of a comma, so my awk script can't perform the operation.
Can I have an AWK script which will first extract only the date, so that in the end I have output looking like:
Date Cost balanceAfter MainAmount BALANCEBEFORE
02/07/2014 2,00 379,3 0 379,3
03/07/2014 20,00 0,7 20 0,7
04/07/2014 2,00 309,6 0 309,6
05/07/2014 0,00 69,5 0 69,5
HERE IS MY AWK SCRIPT
awk -F 'NR==1 {header=$0; next} {a[$3]+=$4 a[$3]+=$5 a[$3]+=$9 a[$3]+=$10} END {for (i in a) {printf "%d\t%d\n", i, a[i]}; tot+=a[i]};' out.txt>output.doc
EDIT: Avoid pre-processing step as per Etan Reisner's suggestion to use $NF to work around differing numbers of tokens in Operator column.
$ cat data.txt
aNumber bNumber startDate cost balanceAfter trafficCase Operator unknown3 MainAmount BALANCEBEFORE
22676239633 433 2014-07-02 10:16:48.000 0,00 0.20 0 Short Code 397224944 0.0000 0.2000
22677277255 76919167 2014-07-02 10:16:51.000 1,00 92.60 0 Airtel 126268625 0.0000 92.6000
22676777508 76701575 2014-07-02 10:16:55.000 1,00 217.00 0 Airtel 4132186103 0.0000 217.0000
22665706841 433 2014-07-02 10:16:57.000 0,00 69.50 0 Short Code 4133821554 0.0000 69.5000
22665799922 70110055 2014-07-03 10:16:45.000 20,00 0.50 0 Telmob 126260244 20.0000 0.5000
22676239633 433 2014-07-03 10:16:48.000 0,00 0.20 0 Short Code 397224944 0.0000 0.2000
22677277255 76919167 2014-07-04 10:16:51.000 1,00 92.60 0 Airtel 126268625 0.0000 92.6000
22676777508 76701575 2014-07-04 10:16:55.000 1,00 217.00 0 Airtel 4132186103 0.0000 217.0000
22665706841 433 2014-07-05 10:16:57.000 0,00 69.50 0 Short Code 4133821554 0.0000 69.5000
$ cat so2.awk
# Sum cost, balanceAfter, MainAmount and BALANCEBEFORE per date.
#
# The date is field 3 and the time of day is field 4, so cost and
# balanceAfter are fields 5 and 6. The Operator column can be one or two
# words ("Airtel" vs "Short Code"), so the last two columns are addressed
# from the end of the record with NF.
NR > 1 {                            # skip the header line
    cost = $5;
    balanceAfter = $6;
    mainAmount = $(NF - 1);
    balanceBefore = $NF;

    # Turn a decimal comma into a dot so the values add up numerically.
    sub(",", ".", cost);
    sub(",", ".", balanceAfter);
    sub(",", ".", mainAmount);
    sub(",", ".", balanceBefore);

    # Remember dates in first-seen order: "for (i in array)" visits keys in
    # an unspecified order, so the report would otherwise not be guaranteed
    # to follow the order of the input.
    if (!($3 in dateCost)) {
        dateOrder[++dateCount] = $3;
    }

    dateCost[$3] += cost;
    dateBalanceAfter[$3] += balanceAfter;
    dateMainAmount[$3] += mainAmount;
    dateBalanceBefore[$3] += balanceBefore;
}
END {
    printf("%s\t%s\t%s\t%s\t%s\n", "Date", "Cost", "BalanceAfter", "MainAmount", "BalanceBefore");
    for (n = 1; n <= dateCount; n++) {
        i = dateOrder[n];
        printf("%s\t%f\t%f\t%f\t%f\n", i, dateCost[i], dateBalanceAfter[i], dateMainAmount[i], dateBalanceBefore[i]);
    }
}
$ awk -f so2.awk data.txt
Date Cost BalanceAfter MainAmount BalanceBefore
2014-07-02 2.000000 379.300000 0.000000 379.300000
2014-07-03 20.000000 0.700000 20.000000 0.700000
2014-07-04 2.000000 309.600000 0.000000 309.600000
2014-07-05 0.000000 69.500000 0.000000 69.500000
This requires no pre-processing of the file:
# Per-date totals in a single pass: a date is printed as soon as the next
# date starts, so rows with the same date must be adjacent in the input.
# column -t aligns the header and the tab-separated rows.
awk '
    BEGIN {print "Date Cost BalanceAfter MainAmount BalanceBefore"}

    # Skip the header line of the input.
    NR == 1 {next}

    # Print the totals accumulated for the current date.
    function showday() {
        printf "%s\t%.2f\t%.1f\t%d\t%.1f\n", date, cost, bAfter, main, bBefore
    }

    # A new date starts: flush the previous one (if any) and reset the sums.
    date != $3 {
        if (date) showday()
        date = $3
        cost = bAfter = main = bBefore = 0
    }

    {
        # Cost uses a decimal comma; convert it before adding.
        sub(/,/, ".", $5)
        cost += $5
        bAfter += $6
        # Operator may be one or two words, so count from the end.
        main += $(NF-1)
        bBefore += $NF
    }

    # Flush the last date; print nothing when there were no data rows.
    END {if (date != "") showday()}
' file | column -t
Date Cost BalanceAfter MainAmount BalanceBefore
2014-07-02 2.00 379.3 0 379.3
2014-07-03 20.00 0.7 20 0.7
2014-07-04 2.00 309.6 0 309.6
2014-07-05 0.00 69.5 0 69.5