Problem in predict with Gaussian process model - error-handling

When trying to use predict on the Gaussian process model I computed for NLP, I get the following error message:
Warning messages:
1: In predict.gam(model_gaussian, X_test) :
not all required variables have been supplied in newdata!
2: 'newdata' has 38 lines, but the variables found have 92 lines.
I do not know what to do, as the 70/30 split is pretty usual and I do not really want to do 50/50.
Thank you for any input!
This is what I did:
> library(mgcv)
> library(caret)
> set.seed(1234)
> #split into test and training set
> obs.num <- createDataPartition(dtf$Score, times = 1,p = 0.7, list = FALSE)
> training_set <- dtf[obs.num,]
> test_set <- dtf[-obs.num,]
>
> y_train <- training_set["Score"]
> y_test<-test_set["Score"]
>
> myvars <- names(training_set) %in% c("Score")
> X_train <- training_set[!myvars]
> X_train <- mutate_all(X_train, function(x) as.numeric(as.character(x)))
>
> myvars <- names(test_set) %in% c("Score")
> X_test <- test_set[!myvars]
> X_test <- mutate_all(X_test, function(x) as.numeric(as.character(x)))
>
> X_train<-as.matrix(X_train)
> X_test<-as.data.frame(X_test)
> y_train <- mutate_all(y_train, function(x) as.numeric(as.character(x)))
> colnames(X_test) <- colnames(X_train)
>
> #Fit the Gaussian Process model
> model_gaussian <- mgcv::gam(Score ~ s(X_train, bs = "cs"), data=y_train, method = "REML")
>
> #Make predictions on the test data
> predictions_gaussian <- predict(model_gaussian, X_test)
My data looks like this (only an excerpt):
> dtf
anschau arbeit ausfuhr besprech bess dat denk einverstand feedback find froh Score
1 0.10001188 0.037861606 0.12703891 0.02950353 0.05116445 0.04224152 0.05849694 0.07709305 0.16337123 0.02358434 0.08420209 0.61
2 0.00000000 0.066708545 0.00000000 0.00000000 0.09014690 0.07442553 0.00000000 0.00000000 0.00000000 0.08310673 0.00000000 0.61
3 0.00000000 0.067784489 0.00000000 0.03521389 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.64
4 0.00000000 0.042709739 0.05732244 0.02662513 0.02308640 0.00000000 0.02639496 0.00000000 0.00000000 0.01064172 0.00000000 0.61
5 0.06271932 0.023743719 0.00000000 0.03700442 0.00000000 0.00000000 0.00000000 0.04834649 0.05122657 0.01479018 0.00000000 0.67
6 0.00000000 0.012076547 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.04513555 0.05371512 0.64
7 0.00000000 0.030453901 0.00000000 0.02373110 0.00000000 0.03397687 0.00000000 0.00000000 0.00000000 0.01897002 0.00000000 0.66
8 0.05606727 0.021225446 0.00000000 0.04961957 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.04720420 0.70
9 0.00000000 0.026939989 0.00000000 0.00000000 0.02427032 0.00000000 0.00000000 0.00000000 0.03874831 0.01118744 0.00000000 0.75
10 0.00000000 0.000000000 0.00000000 0.00000000 0.05915890 0.00000000 0.13527418 0.08913884 0.00000000 0.00000000 0.00000000 0.61
11 0.00000000 0.023743719 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.04834649 0.00000000 0.02958036 0.00000000 0.70
12 0.00000000 0.014294688 0.00000000 0.04455635 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.01780859 0.00000000 0.62
13 0.00000000 0.056035177 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.01745241 0.00000000 0.57
14 0.04869000 0.009216312 0.00000000 0.01436356 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.69
15 0.00000000 0.010300584 0.00000000 0.01605339 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.01283266 0.00000000 0.67

Related

Plot secondary x_axis in ggplot

Dear All seniors and members,
Hope you are doing great. I have a data set for which I would like to plot a secondary x-axis in ggplot. I could not make it work for the last 4 hours. Below is my dataset.
Pathway ES NES p_value q_value Group
1 HALLMARK_HYPOXIA 0.49 2.25 0.000 0.000 Top
2 HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 0.44 2.00 0.000 0.000 Top
3 HALLMARK_UV_RESPONSE_DN 0.45 1.98 0.000 0.000 Top
4 HALLMARK_TGF_BETA_SIGNALING 0.48 1.77 0.003 0.004 Top
5 HALLMARK_HEDGEHOG_SIGNALING 0.52 1.76 0.003 0.003 Top
6 HALLMARK_ESTROGEN_RESPONSE_EARLY 0.38 1.73 0.000 0.004 Top
7 HALLMARK_KRAS_SIGNALING_DN 0.37 1.69 0.000 0.005 Top
8 HALLMARK_INTERFERON_ALPHA_RESPONSE 0.37 1.54 0.009 0.021 Top
9 HALLMARK_TNFA_SIGNALING_VIA_NFKB 0.32 1.45 0.005 0.048 Top
10 HALLMARK_NOTCH_SIGNALING 0.42 1.42 0.070 0.059 Top
11 HALLMARK_COAGULATION 0.32 1.39 0.031 0.067 Top
12 HALLMARK_MITOTIC_SPINDLE 0.30 1.37 0.025 0.078 Top
13 HALLMARK_ANGIOGENESIS 0.40 1.37 0.088 0.074 Top
14 HALLMARK_WNT_BETA_CATENIN_SIGNALING 0.35 1.23 0.173 0.216 Top
15 HALLMARK_OXIDATIVE_PHOSPHORYLATION -0.65 -3.43 0.000 0.000 Bottom
16 HALLMARK_MYC_TARGETS_V1 -0.49 -2.56 0.000 0.000 Bottom
17 HALLMARK_E2F_TARGETS -0.45 -2.37 0.000 0.000 Bottom
18 HALLMARK_DNA_REPAIR -0.46 -2.33 0.000 0.000 Bottom
19 HALLMARK_ADIPOGENESIS -0.42 -2.26 0.000 0.000 Bottom
20 HALLMARK_FATTY_ACID_METABOLISM -0.41 -2.06 0.000 0.000 Bottom
21 HALLMARK_PEROXISOME -0.43 -2.01 0.000 0.000 Bottom
22 HALLMARK_MYC_TARGETS_V2 -0.43 -1.84 0.003 0.001 Bottom
23 HALLMARK_CHOLESTEROL_HOMEOSTASIS -0.42 -1.83 0.003 0.001 Bottom
24 HALLMARK_ALLOGRAFT_REJECTION -0.34 -1.78 0.000 0.003 Bottom
25 HALLMARK_MTORC1_SIGNALING -0.32 -1.67 0.000 0.004 Bottom
26 HALLMARK_P53_PATHWAY -0.29 -1.52 0.000 0.015 Bottom
27 HALLMARK_UV_RESPONSE_UP -0.28 -1.41 0.013 0.036 Bottom
28 HALLMARK_REACTIVE_OXYGEN_SPECIES_PATHWAY -0.35 -1.39 0.057 0.040 Bottom
29 HALLMARK_HEME_METABOLISM -0.26 -1.34 0.014 0.061 Bottom
30 HALLMARK_G2M_CHECKPOINT -0.23 -1.20 0.080 0.172 Bottom
I like to plot like the following plot (plot # 1)
Here is my current codes chunks.
ggplot(data, aes(reorder(Pathway, NES), NES, fill= Group)) +
theme_classic() + geom_col() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 8),
axis.title = element_text(face = "bold", size = 12),
axis.text = element_text(face = "bold", size = 8), plot.title = element_text(hjust = 0.5)) + labs(x="Pathway", y="Normalized Enrichment Score",
title="2Gy_5f vs. 0Gy") + coord_flip()
This code produces the following plot (plot # 2)
So I would like to generate the plot where I have a secondary x-axis with q_value (just like the first bar plot I attached). Any help is greatly appreciated. Note: I used coord_flip, so the x-axis is rotated.
Kind Regards,
synat
[1]: https://i.stack.imgur.com/dBFIS.jpg
[2]: https://i.stack.imgur.com/yDbC5.jpg
Maybe you don't need a secondary axis per se to get the plot style you seek.
library(tidyverse)
ggplot(data, aes(x = NES, y = reorder(Pathway, NES), fill= Group)) +
theme_classic() +
geom_col() +
geom_text(aes(x = 2.5, y = reorder(Pathway, NES), label = q_value), hjust = 0) +
annotate("text", x = 2.5, y = length(data$Pathway) + 1, hjust = 0, fontface = "bold", label = "q_value" ) +
coord_cartesian(xlim = c(NA, 3),
ylim = c(NA, length(data$Pathway) + 1),
clip = "off") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 8),
axis.title = element_text(face = "bold", size = 12),
axis.text = element_text(face = "bold", size = 8),
plot.title = element_text(hjust = 0.5)) +
labs(x="Pathway", y="Normalized Enrichment Score",
title="2Gy_5f vs. 0Gy")
And for future reference you can read in data in the format you pasted like so:
data <- read_table(
"
Pathway ES NES p_value q_value Group
HALLMARK_HYPOXIA 0.49 2.25 0.000 0.000 Top
HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 0.44 2.00 0.000 0.000 Top
HALLMARK_UV_RESPONSE_DN 0.45 1.98 0.000 0.000 Top
HALLMARK_TGF_BETA_SIGNALING 0.48 1.77 0.003 0.004 Top
HALLMARK_HEDGEHOG_SIGNALING 0.52 1.76 0.003 0.003 Top
HALLMARK_ESTROGEN_RESPONSE_EARLY 0.38 1.73 0.000 0.004 Top
HALLMARK_KRAS_SIGNALING_DN 0.37 1.69 0.000 0.005 Top
HALLMARK_INTERFERON_ALPHA_RESPONSE 0.37 1.54 0.009 0.021 Top
HALLMARK_TNFA_SIGNALING_VIA_NFKB 0.32 1.45 0.005 0.048 Top
HALLMARK_NOTCH_SIGNALING 0.42 1.42 0.070 0.059 Top
HALLMARK_COAGULATION 0.32 1.39 0.031 0.067 Top
HALLMARK_MITOTIC_SPINDLE 0.30 1.37 0.025 0.078 Top
HALLMARK_ANGIOGENESIS 0.40 1.37 0.088 0.074 Top
HALLMARK_WNT_BETA_CATENIN_SIGNALING 0.35 1.23 0.173 0.216 Top
HALLMARK_OXIDATIVE_PHOSPHORYLATION -0.65 -3.43 0.000 0.000 Bottom
HALLMARK_MYC_TARGETS_V1 -0.49 -2.56 0.000 0.000 Bottom
HALLMARK_E2F_TARGETS -0.45 -2.37 0.000 0.000 Bottom
HALLMARK_DNA_REPAIR -0.46 -2.33 0.000 0.000 Bottom
HALLMARK_ADIPOGENESIS -0.42 -2.26 0.000 0.000 Bottom
HALLMARK_FATTY_ACID_METABOLISM -0.41 -2.06 0.000 0.000 Bottom
HALLMARK_PEROXISOME -0.43 -2.01 0.000 0.000 Bottom
HALLMARK_MYC_TARGETS_V2 -0.43 -1.84 0.003 0.001 Bottom
HALLMARK_CHOLESTEROL_HOMEOSTASIS -0.42 -1.83 0.003 0.001 Bottom
HALLMARK_ALLOGRAFT_REJECTION -0.34 -1.78 0.000 0.003 Bottom
HALLMARK_MTORC1_SIGNALING -0.32 -1.67 0.000 0.004 Bottom
HALLMARK_P53_PATHWAY -0.29 -1.52 0.000 0.015 Bottom
HALLMARK_UV_RESPONSE_UP -0.28 -1.41 0.013 0.036 Bottom
HALLMARK_REACTIVE_OXYGEN_SPECIES_PATHWAY -0.35 -1.39 0.057 0.040 Bottom
HALLMARK_HEME_METABOLISM -0.26 -1.34 0.014 0.061 Bottom
HALLMARK_G2M_CHECKPOINT -0.23 -1.20 0.080 0.172 Bottom")
Created on 2021-11-23 by the reprex package (v2.0.1)

create multi-indexed dataframe

I do not know how to create a multi-indexed df (one that has an unequal number of second-level indices). Here is a sample:
data = [{'caterpillar': [('Сatérpillar',
{'fuzz': 0.82,
'levenshtein': 0.98,
'jaro_winkler': 0.9192,
'hamming': 0.98}),
('caterpiⅼⅼaʀ',
{'fuzz': 0.73,
'levenshtein': 0.97,
'jaro_winkler': 0.9114,
'hamming': 0.97}),
('cÂteԻpillÂr',
{'fuzz': 0.73,
'levenshtein': 0.97,
'jaro_winkler': 0.881,
'hamming': 0.97})]},
{'elementis': [('elEmENtis',
{'fuzz': 1.0, 'levenshtein': 1.0, 'jaro_winkler': 1.0, 'hamming': 1.0}),
('ÊlemĚntis',
{'fuzz': 0.78,
'levenshtein': 0.98,
'jaro_winkler': 0.863,
'hamming': 0.98}),
('еlÈmÈntis',
{'fuzz': 0.67,
'levenshtein': 0.97,
'jaro_winkler': 0.8333,
'hamming': 0.97})]},
{'gibson': [('giBᏚon',
{'fuzz': 0.83,
'levenshtein': 0.99,
'jaro_winkler': 0.9319,
'hamming': 0.99}),
('ɡibsoN',
{'fuzz': 0.83,
'levenshtein': 0.99,
'jaro_winkler': 0.9206,
'hamming': 0.99}),
('giЬႽon',
{'fuzz': 0.67,
'levenshtein': 0.98,
'jaro_winkler': 0.84,
'hamming': 0.98}),
('glbsՕn',
{'fuzz': 0.67,
'levenshtein': 0.98,
'jaro_winkler': 0.8333,
'hamming': 0.98})]}]
I want a df like this (note: 'Other Name' has a differing number of values for each 'Orig Name'):
Orig Name| Other Name| fuzz| levenstein| Jaro-Winkler| Hamming
------------------------------------------------------------------------
caterpillar Сatérpillar 0.82 0.98. 0.9192 0.98
caterpiⅼⅼaʀ 0.73 0.97 0.9114 0.97
cÂteԻpillÂr 0.73 0.97 0.881 0.97
gibson giBᏚon 0.83. 0.99 0.9319 0.99
ɡibsoN 0.83 0.99. 0.9206 0.99
giЬႽon 0.67. 0.98 0.84 0.98
glbsՕn 0.67. 0.98. 0.8333 0.98
elementis .........
--------------------------------------------------------------------------
I tried :
orig_name_list = [x for d in data for x, v in d.items()]
value_list = [v for d in data for x, v in d.items()]
other_names = [tup[0] for tup_list in value_list for tup in tup_list]
algos = ['fuzz', 'levenshtein', 'jaro_winkler', 'hamming']
Not sure how to proceed from there. Suggestions are appreciated.
Let's try concat:
pd.concat([pd.DataFrame([x[1]]).assign(OrigName=k, OtherName=x[0])
for df in data for k,d in df.items() for x in d])
Output:
fuzz levenshtein jaro_winkler hamming OrigName OtherName
0 0.82 0.98 0.9192 0.98 caterpillar Сatérpillar
0 0.73 0.97 0.9114 0.97 caterpillar caterpiⅼⅼaʀ
0 0.73 0.97 0.8810 0.97 caterpillar cÂteԻpillÂr
0 1.00 1.00 1.0000 1.00 elementis elEmENtis
0 0.78 0.98 0.8630 0.98 elementis ÊlemĚntis
0 0.67 0.97 0.8333 0.97 elementis еlÈmÈntis
0 0.83 0.99 0.9319 0.99 gibson giBᏚon
0 0.83 0.99 0.9206 0.99 gibson ɡibsoN
0 0.67 0.98 0.8400 0.98 gibson giЬႽon
0 0.67 0.98 0.8333 0.98 gibson glbsՕn
One way to do this is to reformat your data for json record consumption via the pd.json_normalize function. Your json is currently not formatted correctly to be stored into a dataframe easily:
new_data = []
for entry in data:
new_entry = {}
for name, matches in entry.items():
new_entry["name"] = name
new_entry["matches"] = []
for match in matches:
match[1]["match"] = match[0]
new_entry["matches"].append(match[1])
new_data.append(new_entry)
df = pd.json_normalize(new_data, "matches", ["name"]).set_index(["name", "match"])
print(df)
fuzz levenshtein jaro_winkler hamming
name match
caterpillar Сatérpillar 0.82 0.98 0.9192 0.98
caterpiⅼⅼaʀ 0.73 0.97 0.9114 0.97
cÂteԻpillÂr 0.73 0.97 0.8810 0.97
elementis elEmENtis 1.00 1.00 1.0000 1.00
ÊlemĚntis 0.78 0.98 0.8630 0.98
еlÈmÈntis 0.67 0.97 0.8333 0.97
gibson giBᏚon 0.83 0.99 0.9319 0.99
ɡibsoN 0.83 0.99 0.9206 0.99
giЬႽon 0.67 0.98 0.8400 0.98
glbsՕn 0.67 0.98 0.8333 0.98

Pandas X_axis hourly [duplicate]

This question already has answers here:
Pandas timeseries plot setting x-axis major and minor ticks and labels
(2 answers)
Closed 5 years ago.
I have this little piece of Pandas code:
graph = auswahl[['Volumenstrom_Außen', 'Vpunkt_Gesamt','Zuluft_Druck_10','Abluft_Druck_10']]
a = graph.plot(figsize=[50,10])
a.set(ylabel="m³/h", xlabel="Zeit", title="Volumenströme")#,ylim=[0,100])
a.legend(loc="upper left")
plt.show()
How can I set the x-axis to show every hour?
the dataframe looks like this:
Volumenstrom_Außen Vpunkt_Gesamt Zuluft_Druck Abluft_Druck
Zeit
2018-02-15 16:49:00 1021.708443 752.699 49.328 46.811
2018-02-15 16:49:15 1021.708443 752.699 49.328 46.811
2018-02-15 16:49:30 1021.708443 752.699 49.328 46.811
2018-02-15 16:49:45 1021.708443 752.699 49.328 46.811
2018-02-15 16:50:00 1021.708443 752.699 49.328 46.811
2018-02-15 16:50:15 1021.708443 752.699 49.328 46.811
2018-02-15 16:50:30 1021.708443 752.699 49.328 46.811
2018-02-15 16:50:45 1021.708443 752.699 49.328 46.811
2018-02-15 16:51:00 1092.171094 752.699 49.328 46.811
2018-02-15 16:51:15 1092.171094 752.699 49.328 46.811
Let's take this example dataframe, whose index is at minute granularity
import pandas as pd
import random
ts_index = pd.date_range('1/1/2000', periods=1000, freq='T')
v1 = [random.random() for i in range(1000)]
v2 = [random.random() for i in range(1000)]
v3 = [random.random() for i in range(1000)]
ts_df = pd.DataFrame({'v1':v1,'v2':v2,'v3':v3},index=ts_index)
ts_df.head()
v1 v2 v3
2000-01-01 00:00:00 0.593039 0.017351 0.742111
2000-01-01 00:01:00 0.563233 0.837362 0.869767
2000-01-01 00:02:00 0.453925 0.962600 0.690868
2000-01-01 00:03:00 0.757895 0.123610 0.622777
2000-01-01 00:04:00 0.759841 0.906674 0.263902
We could use pandas.DataFrame.resample to downsample this data to hourly granularity, like shown below
hourly_mean_df = ts_df.resample('H').mean() # you can use .sum() also
hourly_mean_df.head()
v1 v2 v3
2000-01-01 00:00:00 0.516001 0.461119 0.467895
2000-01-01 01:00:00 0.530603 0.458208 0.550892
2000-01-01 02:00:00 0.472090 0.522278 0.508345
2000-01-01 03:00:00 0.515713 0.486906 0.541538
2000-01-01 04:00:00 0.514543 0.478097 0.489217
Now you can plot this hourly summary
hourly_mean_df.plot()

How to extract lines between multiline patterns?

I have a file which looks like:
blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah
<empty line here>
Total DOS and NOS and partial (IT) DOSDOWN
<empty line here>
E Total 1
<empty line here>
-1.5000 0.004 0.000 0.004
-1.4953 0.004 0.000 0.004
-1.4906 0.004 0.000 0.004
-1.4859 0.004 0.000 0.004
-1.4812 0.004 0.000 0.004
0.3563 0.708 5.510 0.708
0.3609 0.562 5.513 0.562
0.3656 0.381 5.515 0.381
0.3703 0.149 5.517 0.149
<empty line here>
Sublattice 1 Atom Fe spin DOWN
What I want is to extract all lines between (first pattern)
Total DOS and NOS and partial (IT) DOSUP
<empty line here>
E Total 1
<empty line here>
and (second pattern)
<empty line here>
Sublattice 1 Atom Fe spin DOWN
i.e. I want to get
-1.5000 0.004 0.000 0.004
-1.4953 0.004 0.000 0.004
-1.4906 0.004 0.000 0.004
-1.4859 0.004 0.000 0.004
-1.4812 0.004 0.000 0.004
0.3563 0.708 5.510 0.708
0.3609 0.562 5.513 0.562
0.3656 0.381 5.515 0.381
0.3703 0.149 5.517 0.149
So, at the end of the day I want to have lines between two multiline patterns.
As I understand awk can detect multiline patterns via state machine (see here), but I failed to do it in my case.
Any suggestion how to resolve this problem would be very much appreciated.
Here's a solution based on Ed Morton's trick.
awk -v RS= 'n==2; /Total DOS/ || n {n++;next} {n=0}' input.txt
Here's how this works.
RS= puts awk into multi-line mode, so that records contain blocks of lines.
n==2; prints any record processed while this condition is met.
/RE/ || n is a condition that evaluates to true if EITHER the RE (pattern) is matched within the current record or the variable n is non-zero.
{n++;next} obviously increments n and skips to the next record.
{n=0} And if we haven't already skipped to the next record, we reset n.
The effect of all this is that we print the record that is two records after the one with the matched pattern. You could of course adjust the condition that begins the counter to whatever you like. $2=="Total" for example. Salt to taste.
sh-3.2$ cat input.txt
blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah
Total DOS and NOS and partial (IT) DOSUP
E Total 1
-1.5000 0.004 0.000 0.004
-1.4953 0.004 0.000 0.004
-1.4906 0.004 0.000 0.004
....... ..... ..... .....
0.3609 0.562 5.513 0.562
0.3656 0.381 5.515 0.381
0.3703 0.149 5.517 0.149
blah blah blah blah
sh-3.2$ awk -v RS= 'n==2; /Total DOS and NOS/||n{n++;next} {n=0}' input.txt
-1.5000 0.004 0.000 0.004
-1.4953 0.004 0.000 0.004
-1.4906 0.004 0.000 0.004
....... ..... ..... .....
0.3609 0.562 5.513 0.562
0.3656 0.381 5.515 0.381
0.3703 0.149 5.517 0.149
Using sed: sed -n '5,/^$/{/^$/d}'
But that assumes that "multiline starting pattern" is always at the beginning of the file. Otherwise it gets a bit more complicated. Like this:
/Total/{N;N;N}
/Total.*Total/,/^$/{
/Total/d
/^$/d
}
Here I am assuming that 'Total' matches the beginning of multiline pattern, 'Total.*Total' matches the whole pattern. Replace N;N;N with something more complex if there are other patterns that start with first line of you multiline pattern but are shorter than 4 lines.
From your comments it sounds like all you need is:
awk -v RS= '/Total DOS/{tgt=NR+2} NR==tgt' file
If not then edit your question to clarify. Make it NR==tgt{print; exit} if you only want the first matching block in the file output and efficiency is a concern. Change the regexp if necessary to be as much of the Total DOS... line as you need to match to make it unique.
Here it is running against your provided sample input:
$ cat file
blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah
blah blah blah blah blah blah blah blah
Total DOS and NOS and partial (IT) DOSUP
E Total 1
-1.5000 0.004 0.000 0.004
-1.4953 0.004 0.000 0.004
-1.4906 0.004 0.000 0.004
....... ..... ..... .....
0.3609 0.562 5.513 0.562
0.3656 0.381 5.515 0.381
0.3703 0.149 5.517 0.149
blah blah blah blah
$ awk -v RS= '/Total DOS/{tgt=NR+2} NR==tgt' file
-1.5000 0.004 0.000 0.004
-1.4953 0.004 0.000 0.004
-1.4906 0.004 0.000 0.004
....... ..... ..... .....
0.3609 0.562 5.513 0.562
0.3656 0.381 5.515 0.381
0.3703 0.149 5.517 0.149

Extract date from date time - change . to , and print sum up of different field

aNumber bNumber startDate cost balanceAfter trafficCase Operator unknown3 MainAmount BALANCEBEFORE
22676239633 433 2014-07-02 10:16:48.000 0,00 0.20 0 Short Code 397224944 0.0000 0.2000
22677277255 76919167 2014-07-02 10:16:51.000 1,00 92.60 0 Airtel 126268625 0.0000 92.6000
22676777508 76701575 2014-07-02 10:16:55.000 1,00 217.00 0 Airtel 4132186103 0.0000 217.0000
22665706841 433 2014-07-02 10:16:57.000 0,00 69.50 0 Short Code 4133821554 0.0000 69.5000
22665799922 70110055 2014-07-03 10:16:45.000 20,00 0.50 0 Telmob 126260244 20.0000 0.5000
22676239633 433 2014-07-03 10:16:48.000 0,00 0.20 0 Short Code 397224944 0.0000 0.2000
22677277255 76919167 2014-07-04 10:16:51.000 1,00 92.60 0 Airtel 126268625 0.0000 92.6000
22676777508 76701575 2014-07-04 10:16:55.000 1,00 217.00 0 Airtel 4132186103 0.0000 217.0000
22665706841 433 2014-07-05 10:16:57.000 0,00 69.50 0 Short Code 4133821554 0.0000 69.5000
Here is a sample of the data I have. I want to sum up cost, balanceAfter, MainAmount and BALANCEBEFORE each time the date changes, but my concern is that the date is combined with the time, and my decimal separator is a dot instead of a comma, so my awk script can't perform the operation.
Can I have an AWK script which will first extract only the date so in the end I will have an output looking like:
Date Cost balanceAfter MainAmount BALANCEBEFORE
02/07/2014 2,00 379,3 0 379,3
03/07/2014 20,00 0,7 20 0,7
04/07/2014 2,00 309,6 0 309,6
05/07/2014 0,00 69,5 0 69,5
HERE IS MY AWK SCRIPT
awk -F 'NR==1 {header=$0; next} {a[$3]+=$4 a[$3]+=$5 a[$3]+=$9 a[$3]+=$10} END {for (i in a) {printf "%d\t%d\n", i, a[i]}; tot+=a[i]};' out.txt>output.doc
EDIT: Avoid pre-processing step as per Etan Reisner's suggestion to use $NF to work around differing numbers of tokens in Operator column.
$ cat data.txt
aNumber bNumber startDate cost balanceAfter trafficCase Operator unknown3 MainAmount BALANCEBEFORE
22676239633 433 2014-07-02 10:16:48.000 0,00 0.20 0 Short Code 397224944 0.0000 0.2000
22677277255 76919167 2014-07-02 10:16:51.000 1,00 92.60 0 Airtel 126268625 0.0000 92.6000
22676777508 76701575 2014-07-02 10:16:55.000 1,00 217.00 0 Airtel 4132186103 0.0000 217.0000
22665706841 433 2014-07-02 10:16:57.000 0,00 69.50 0 Short Code 4133821554 0.0000 69.5000
22665799922 70110055 2014-07-03 10:16:45.000 20,00 0.50 0 Telmob 126260244 20.0000 0.5000
22676239633 433 2014-07-03 10:16:48.000 0,00 0.20 0 Short Code 397224944 0.0000 0.2000
22677277255 76919167 2014-07-04 10:16:51.000 1,00 92.60 0 Airtel 126268625 0.0000 92.6000
22676777508 76701575 2014-07-04 10:16:55.000 1,00 217.00 0 Airtel 4132186103 0.0000 217.0000
22665706841 433 2014-07-05 10:16:57.000 0,00 69.50 0 Short Code 4133821554 0.0000 69.5000
$ cat so2.awk
NR > 1 {
cost = $5;
balanceAfter = $6;
mainAmount = $(NF - 1);
balanceBefore = $NF;
sub(",", ".", cost);
sub(",", ".", balanceAfter);
sub(",", ".", mainAmount);
sub(",", ".", balanceBefore);
dateCost[$3] += cost;
dateBalanceAfter[$3] += balanceAfter;
dateMainAmount[$3] += mainAmount;
dateBalanceBefore[$3] += balanceBefore;
}
END {
printf("%s\t%s\t%s\t%s\t%s\n", "Date", "Cost", "BalanceAfter", "MainAmount", "BalanceBefore");
for (i in dateCost) {
printf("%s\t%f\t%f\t%f\t%f\n", i, dateCost[i], dateBalanceAfter[i], dateMainAmount[i], dateBalanceBefore[i]);
}
}
$ awk -f so2.awk data.txt
Date Cost BalanceAfter MainAmount BalanceBefore
2014-07-02 2.000000 379.300000 0.000000 379.300000
2014-07-03 20.000000 0.700000 20.000000 0.700000
2014-07-04 2.000000 309.600000 0.000000 309.600000
2014-07-05 0.000000 69.500000 0.000000 69.500000
This requires no pre-processing of the file:
awk '
BEGIN {print "Date Cost BalanceAfter MainAmount BalanceBefore"}
NR == 1 {next}
function showday() {
printf "%s\t%.2f\t%.1f\t%d\t%.1f\n", date, cost, bAfter, main, bBefore
}
date != $3 {
if (date) showday()
date = $3
cost = bAfter = main = bBefore = 0
}
{
sub(/,/, ".", $5)
cost += $5
bAfter += $6
main += $(NF-1)
bBefore += $NF
}
END {showday()}
' file | column -t
Date Cost BalanceAfter MainAmount BalanceBefore
2014-07-02 2.00 379.3 0 379.3
2014-07-03 20.00 0.7 20 0.7
2014-07-04 2.00 309.6 0 309.6
2014-07-05 0.00 69.5 0 69.5