What is the most efficient way of creating a tf.dataset from multiple json.gz files with multiple text records? - tensorflow

I have thousands of json.gz files, each with a variety of information about scientific papers. For each file, I have to extract the relevant information - e.g. title and labels - to make a dataset, then transform it to a tf.dataset. However, it is poorly efficient since I cannot filter the subjects directly or shuffle them in a single step.
I would like to read them using tf.dataset.interleave in order to shuffle them, but also to filter them according to specific labels.
Here is how I'm doing it up to now.
import tensorflow as tf
import pandas as pd
#For relevant feature extraction
def load_file(file):
#with gzip.open(bytes.decode(file), 'r') as fin: # 4. gzip
with gzip.open(file, 'r') as fin:
json_bytes = fin.read()
json_str = json_bytes.decode('utf-8') # 2. string (i.e. JSON)
bb = json.loads(json_str)
bb = pd.json_normalize(bb, 'items', ['indexed', ['title', 'publisher', 'type','indexed.date-parts', 'subject']],
errors='ignore')
bb.dropna(subset=['title', 'publisher', 'type','indexed.date-parts', 'subject'], inplace=True)
bb.subject = bb.subject.apply(lambda x: int(themes[list(set(x) & set(list(themes.keys())))[0]]) if len(list(set(x) & set(list(themes.keys()))))>0 else len(list(themes.keys()))+1)
bb.title = bb.title.str.join('').values
#bb['author'] = bb['author'].apply(lambda x: '; '.join([', '.join([i['given'], i['family']]) for i in x]))
bb['indexed.date-parts'] = bb['indexed.date-parts'].apply(lambda tpl: datetime.datetime.strptime('-'.join(str(x) for x in tpl[0]), '%Y-%m-%d').strftime('%Y-%m-%d'))
#bb = bb.sample(n=32, replace=True)
#return bb.title.str.join('').values, bb.subject.str.join(', ').values
return dict(bb[['title', 'publisher', 'type','indexed.date-parts', 'subject' ]])
file_list = ['file_2021_01/10625.json.gz',
'file_2021_01/23897.json.gz',
'file_2021_01/12169.json.gz',
'file_2021_01/427.json.gz',...]
filenames = tf.data.Dataset.list_files(file_list, shuffle=True)
dataset = filenames.apply(
tf.data.experimental.parallel_interleave(
lambda x: tf.data.Dataset.from_tensor_slices(tf.numpy_function(load_file, [x], (tf.int64))), cycle_length=1))
However, it results it a error:
InternalError: Unsupported object type dict
[[{{node PyFunc}}]] [Op:IteratorGetNext]
Thanks

Related

Tensor flow and tflearn Chatbot keeps on getting high probability even when user input is wrong

I coded a simple AI chatbot with TensorFlow and tflearn and it runs just fine but the issue is when the user inputs the wrong thing, the bot is supposed to say it doesnt understand if the prediction accuracy is less than 70%, but the bot always scores above that even if the user gives jibberish like "rjrigrejfr". The bot assumes theyre greeting them. The patterns its supposed to study in the json are "patterns": ["Hi", "How are you", "Wassup", "Hello", "Good day", "Waddup", "Yo"]. I can share the json file if needed its short. Anyway, this is the python code:
import numpy as np
import nltk
import tensorflow
import tflearn
import random
import json
import pickle
# Some extra configuration:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
nltk.download('punkt')
# Load the data from the json file into a variable.
with open("intents.json") as file:
data = json.load(file)
# If we already have saved data, we do not need to retrain the model and waste time (could develop into an issue in more complex programs. Save in pickle. )
try:
with open("data.pickle", "rb") as f: # rb stands for bytes.
words, labels, training, output = pickle.load(f)
# --- Pre-training data preparation ---
except:
words = []
docsx = [] # Stores patterns
docsy = [] # Stores intents
labels = [] # All the specific tag values such as greeting, contact, etc.
for intent in data["intents"]:
for pattern in intent["patterns"]:
w = nltk.word_tokenize(pattern) # nltk function that splits the sentences inside intent into words list.
words.extend(w) # Add the tokenized list to words list.
docsx.append(w)
docsy.append(intent["tag"]) # append the classification of the sentence
if intent["tag"] not in labels:
labels.append(intent["tag"])
words = [stemmer.stem(w.lower()) for w in words if w not in ".?!"] # Stemming the words to remove unnecessary elements leaving their root. Convert all to lowercase.
words = sorted(list(set(words))) # Set ensures no duplicate elements then we convert back to list and sort it.
labels = sorted(labels)
training = []
output = []
out_empty = [0 for i in range(len(labels))] # Gives a list of 0 ints based on # of tags. This is useful later in the program when binerizing.
# One hot encoding the intent categories. Need to one-hot code the data which improves the efficiency of the ML to "binerize" the data.
# In this case, we have a list of 0s and 1s if the word appears it is assigned a 1 else a 0.
for x, doc in enumerate(docsx):
bag = [] # Bag of words or the one-hot coded data for the ML.
docx_word_stemmed = [stemmer.stem(word) for word in doc] # Stemming the data in docx.
# Now adding and transforming data into the one-hot coded list/bag of words data.
for i in words:
if i in docx_word_stemmed: # Checking against stemmed words:
# Word exists
bag.append(1)
else:
bag.append(0)
output_row = out_empty[:] # Copying out_empty
# Going through the labels list using .index() and for the occurance of docx value in docy, assign binary 1.
output_row[labels.index(docsy[x])] = 1
training.append(bag)
output.append(output_row)
# Required to use numpy arrays for use in tflearn. It is also faster.
training = np.array(training)
output = np.array(output)
# Saving the data so we do not need to do the data configuration every time.
with open("data.pickle", "wb") as f:
pickle.dump((words, labels, training, output), f)
try:
model.load('model.tflearn')
except:
tensorflow.compat.v1.reset_default_graph()
net = tflearn.input_data(shape=[None, len(training[0])])
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, len(output[0]), activation='softmax')
net = tflearn.regression(net)
model = tflearn.DNN(net)
model.fit(training, output, n_epoch=1000, batch_size=8, show_metric=True)
model.save("model.tflearn")
def bagofwords(sentence, words):
bag = [0 for _ in range(len(words))] # blank bag of words.
# Tokenize s and then stem it.
sentence_words = nltk.word_tokenize(sentence)
sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
for string in sentence_words:
for i, word in enumerate(words):
if word == string:
bag[i] = 1
return np.array(bag)
def chat():
print("Hello there! I'm the SRO AI Virtual Assistant. How am I help you?")
# Figure out the error slime!
while True:
user_input = input("Type here:")
if user_input == "quit":
break
result = model.predict([bagofwords(user_input, words)])[0] #bagofwords func and predict function to give predictions on what the user is saying.
best_result = np.argmax(result) # We want to only use the best result.
tag = labels[best_result]
print(result[best_result])
# Open JSON file and pick a response.
if result[best_result] > 0.7:
for tg in data["intents"]:
if tg['tag'] == tag:
responses = tg['responses']
print(random.choice(responses))
else:
print("I don't quite understand")
chat()

passing panda dataframe data to functions and its not outputting the results

In my code, I am trying to extract data from csv file to use in the function, but it doesnt output anything, and gives no error. My code works because I tried it with just numpy array as inputs. not sure why it doesnt work with panda.
import numpy as np
import pandas as pd
import os
# change the current directory to the directory where the running script file is
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# finding best fit line for y=mx+b by iteration
def gradient_descent(x,y):
m_iter = b_iter = 1 #starting point
iteration = 10000
n = len(x)
learning_rate = 0.05
last_mse = 10000
#take baby steps to reach global minima
for i in range(iteration):
y_predicted = m_iter*x + b_iter
#mse = 1/n*sum([value**2 for value in (y-y_predicted)]) # cost function to minimize
mse = 1/n*sum((y-y_predicted)**2) # cost function to minimize
if (last_mse - mse)/mse < 0.001:
break
# recall MSE formula is 1/n*sum((yi-y_predicted)^2), where y_predicted = m*x+b
# using partial deriv of MSE formula, d/dm and d/db
dm = -(2/n)*sum(x*(y-y_predicted))
db = -(2/n)*sum((y-y_predicted))
# use current predicted value to get the next value for prediction
# by using learning rate
m_iter = m_iter - learning_rate*dm
b_iter = b_iter - learning_rate*db
print('m is {}, b is {}, cost is {}, iteration {}'.format(m_iter,b_iter,mse,i))
last_mse = mse
#x = np.array([1,2,3,4,5])
#y = np.array([5,7,8,10,13])
#gradient_descent(x,y)
df = pd.read_csv('Linear_Data.csv')
x = df['Area']
y = df['Price']
gradient_descent(x,y)
My code works because I tried it with just numpy array as inputs. not sure why it doesnt work with panda.
Well no, your code also works with pandas dataframes:
df = pd.DataFrame({'Area': [1,2,3,4,5], 'Price': [5,7,8,10,13]})
x = df['Area']
y = df['Price']
gradient_descent(x,y)
Above will give you the same output as with numpy arrays.
Try to check what's in Linear_Data.csv and/or add some print statements in the gradient_descent function just to check your assumptions. I would suggest to first of all add a print statement before the condition with the break statement:
print(last_mse, mse)
if (last_mse - mse)/mse < 0.001:
break

deep feature synthesis depth for transformation primitives | featuretools

I am trying to use the featuretools library to make new features on a simple dataset, however, whenever I try to use a bigger max_depth, nothing happens... Here is my code so far:
# imports
import featuretools as ft
# creating the EntitySet
es = ft.EntitySet()
es.entity_from_dataframe(entity_id='data', dataframe=data, make_index=True, index='index')
# Run deep feature synthesis with transformation primitives
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='data', max_depth=3,
trans_primitives=['add_numeric', 'multiply_numeric'])
When I look at the features created, I get the basic things f1*f2 and f1+f2, but I would like more complex engineered features like f2*(f1+f2) or f1+(f2+f1). I thought increasing max_depth would do this but apparently not.
How could I do this, if at all?
I have managed to answer my own question, so I'll post it here.
You can create deeper features by running "Deep Feature Synthesis" on already generated features. Here is an example:
# imports
import featuretools as ft
# creating the EntitySet
es = ft.EntitySet()
es.entity_from_dataframe(entity_id='data', dataframe=data, make_index=True, index='index')
# Run deep feature synthesis with transformation primitives
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='data',
trans_primitives=['add_numeric','multiply_numeric'])
# creating an EntitySet from the new features
deep_es = ft.EntitySet()
deep_es.entity_from_dataframe(entity_id='data', index='index', dataframe=feature_matrix)
# Run deep feature synthesis with transformation primitives
deep_feature_matrix, deep_feature_defs=ft.dfs(entityset=deep_es, target_entity='data',
trans_primitives=['add_numeric','multiply_numeric'])
Now, looking at the columns of deep_feature_matrix here is what we see (assuming a dataset with 2 features):
"f1", "f2", "f1+f2", "f1*f2", "f1+f1*f2", "f1+f1+f2", "f1*f2+f1+f2", "f1*f2+f2", "f1+f2+f2", "f1*f1*f2", "f1*f1+f2", "f1*f2*f1+f2", "f1*f2*f2", "f1+f2*f2"
I have also made a function that automatically does this (includes a full docstring):
def auto_feature_engineering(X, y, selection_percent=0.1, selection_strategy="best", num_depth_steps=2, transformatives=['divide_numeric', 'multiply_numeric']):
"""
Automatically perform deep feature engineering and
feature selection.
Parameters
----------
X : pd.DataFrame
Data to perform automatic feature engineering on.
y : pd.DataFrame
Target variable to find correlations of all
features at each depth step to perform feature
selection, y is not needed if selection_percent=1.
selection_percent : float, optional
Defines what percent of all the new features to
keep for the next depth step.
selection_strategy : {'best', 'random'}, optional
Strategy used for feature selection, if 'best',
it will select the best features for the next depth
step, if 'random', it will select features at random.
num_depth_steps : integer, optional
The number of depth steps. Every depth step, the model
generates brand new features from the features made in
the last step, then selects a percent of these new
features.
transformatives : list, optional
List of all possible transformations of the data to use
when feature engineering, you can find the full list
of possible transformations as well as what each one
does using the following code:
`ft.primitives.list_primitives()[ft.primitives.list_primitives()["type"]=="transform"]`
make sure to `import featuretools as ft`.
Returns
-------
pd.DataFrame
a dataframe of the brand new features.
"""
from sklearn.feature_selection import mutual_info_classif
selected_feature_df = X.copy()
for i in range(num_depth_steps):
# Perform feature engineering
es = ft.EntitySet()
es.entity_from_dataframe(entity_id='data', dataframe=selected_feature_df,
make_index=True, index='index')
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='data', trans_primitives=transformatives)
# Remove features that are the same
feature_corrs = feature_matrix.corr()[list(feature_matrix.keys())[0]]
existing_corrs = []
good_keys = []
for key in feature_corrs.to_dict().keys():
if feature_corrs[key] not in existing_corrs:
existing_corrs.append(feature_corrs[key])
good_keys.append(key)
feature_matrix = feature_matrix[good_keys]
# Remove illegal features
legal_features = list(feature_matrix.columns)
for feature in list(feature_matrix.columns):
raw_feature_list = []
for j in range(len(feature.split(" "))):
if j%2==0:
raw_feature_list.append(feature.split(" ")[j])
if len(raw_feature_list) > i+2: # num_depth_steps = 1, means max_num_raw_features_in_feature = 2
legal_features.remove(feature)
feature_matrix = feature_matrix[legal_features]
# Perform feature selection
if int(selection_percent)!=1:
if selection_strategy=="best":
corrs = mutual_info_classif(feature_matrix.reset_index(drop=True), y)
corrs = pd.Series(corrs, name="")
selected_corrs = corrs[corrs>=corrs.quantile(1-selection_percent)]
selected_feature_df = feature_matrix.iloc[:, list(selected_corrs.keys())].reset_index(drop=True)
elif selection_strategy=="random":
selected_feature_df = feature_matrix.sample(frac=(selection_percent), axis=1).reset_index(drop=True)
else:
raise Exception("selection_strategy can be either 'best' or 'random', got '"+str(selection_strategy)+"'.")
else:
selected_feature_df = feature_matrix.reset_index(drop=True)
if num_depth_steps!=1:
rename_dict = {}
for col in list(selected_feature_df.columns):
rename_dict[col] = "("+col+")"
selected_feature_df = selected_feature_df.rename(columns=rename_dict)
if num_depth_steps!=1:
rename_dict = {}
for feature_name in list(selected_feature_df.columns):
rename_dict[feature_name] = feature_name[int(num_depth_steps-1):-int(num_depth_steps-1)]
selected_feature_df = selected_feature_df.rename(columns=rename_dict)
return selected_feature_df
Here is an example of using it:
# Imports
>>> import seaborn as sns
>>> import pandas as pd
>>> import numpy as np
>>> from sklearn.preprocessing import OrdinalEncoder
# Load the penguins dataset
>>> penguins = sns.load_dataset("penguins")
>>> penguins.head()
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
# Fill in NaN values of features using the distribution of the feature
>>> for feature in ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "sex"]:
... s = penguins[feature].value_counts(normalize=True)
... dist = penguins[feature].value_counts(normalize=True).values
... missing = penguins[feature].isnull()
... penguins.loc[missing, feature] = np.random.choice(s.index, size=len(penguins[missing]),p=s.values)
# Make X and y
>>> X = penguins[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]]
>>> y = penguins[["sex"]]
# Encode "sex" so that "Male" is 1 and "Female" is 0
>>> ord_enc = OrdinalEncoder()
>>> y = pd.DataFrame(ord_enc.fit_transform(y).astype(np.int8), columns=["sex"])
# Generate new dataset with more features
>>> penguins_with_more_features = auto_feature_engineering(X, y, selection_percent=1.)
# Correlations of the raw features
>>> find_correlations(X, y)
body_mass_g 0.422959
bill_depth_mm 0.353526
bill_length_mm 0.342109
flipper_length_mm 0.246944
Name: sex, dtype: float64
# Top 10% correlations of new features
>>> summarize_corr_series(find_top_percent(find_correlations(penguins_with_more_features, y), 0.1))
(flipper_length_mm / bill_depth_mm) / (body_mass_g): 0.7241123396175027
(bill_depth_mm * body_mass_g) / (flipper_length_mm): 0.7237223914820166
(bill_depth_mm * body_mass_g) * (bill_depth_mm): 0.7222108721971968
(bill_depth_mm * body_mass_g): 0.7202272416625914
(bill_depth_mm * body_mass_g) * (flipper_length_mm): 0.6425813490692588
(bill_depth_mm * bill_length_mm) * (body_mass_g): 0.6398235593646668
(bill_depth_mm * flipper_length_mm) * (flipper_length_mm): 0.6360645935216128
(bill_depth_mm * flipper_length_mm): 0.6083364815975281
(bill_depth_mm * body_mass_g) * (body_mass_g): 0.5888925994060027
In this example, we would like to predict the gender of penguins given their attributes body_mass_g, bill_depth_mm, bill_length_mm and flipper_length_mm.
You might notice these other mysterious functions I used in the example, namely find_correlations, summarize_corr_series and find_top_percent. These are other convenient functions I made to help summarize the results from auto_feature_engineering. Here is the code to them (note they haven't been documented):
def summarize_corr_series(feature_corr_series):
max_feature_name_size = 0
for key in feature_corr_series.to_dict().keys():
if len(key) > max_feature_name_size:
max_feature_name_size = len(key)
max_new_feature_corr = feature_corr_series.max()
for key in feature_corr_series.to_dict().keys():
whitespace = []
for i in range(max_feature_name_size-len(key)):
whitespace.append(" ")
whitespace = "".join(whitespace)
print(key+": "+whitespace+str(abs(feature_corr_series[key])))
def find_top_percent(series, percent):
return series[series>series.quantile(1-percent)]
def find_correlations(X, y):
return abs(pd.concat([X.reset_index(drop=True), y.reset_index(drop=True)], axis=1).corr())[y.columns[0]].drop(y.columns[0]).sort_values(ascending=False)
It is really unfortunate that featuretools does not easily support this use case since it appears to be quite common. The best way I've found to do this is to create the first order features you want using the dfs function and then add the second order features you want manually.
For instance the MWE below (using the iris dataset) performs the AddNumeric primitive using dfs and then applies the DivideNumeric primitive to the newly created features using only the original features (and avoids the same base feature appearing multiple times in a transformed feature).
import numpy as np
import pandas as pd
import sklearn
import featuretools as ft
iris = sklearn.datasets.load_iris()
data = pd.DataFrame(
data= np.c_[iris['data'],
iris['target']],
columns= iris['feature_names'] + ['target']
)
ignore_cols = ['target']
entity_set = ft.EntitySet(id="iris")
entity_set.entity_from_dataframe(
entity_id="iris_main",
dataframe=data,
index="index",
)
new_features = ft.dfs(
entityset=entity_set,
target_entity="iris_main",
trans_primitives=["add_numeric"],
features_only=True,
primitive_options={
"add_numeric": {
"ignore_variables": {"iris_main": ignore_cols},
},
},
)
transformed_features = [i for i in new_features if isinstance(i, ft.feature_base.feature_base.TransformFeature)]
original_features = [i for i in new_features if isinstance(i, ft.feature_base.feature_base.IdentityFeature) and i.get_name() not in ignore_cols]
depth_two_features = []
for trans_feat in transformed_features:
for orig_feat in original_features:
if orig_feat.get_name() not in [i.get_name() for i in trans_feat.base_features]:
feat = ft.Feature([trans_feat, orig_feat], primitive=ft.primitives.DivideNumeric)
depth_two_features.append(feat)
data = ft.calculate_feature_matrix(
features= original_features + transformed_features + depth_two_features,
entityset=entity_set,
verbose=True,
)
The benefit of this approach is that it gives you more fine grained control to customise this how you want and avoids the computational cost of creating unnecessary features you don't want.

how to make a memory efficient multiple dimension groupby/stack using xarray?

I have a large time series of np.float64 with a 5-min frequency (size is ~2,500,000 ~=24 years).
I'm using Xarray to represent it in-memory and the time-dimension is named 'time'.
I want to group-by 'time.hour' and then 'time.dayofyear' (or vice-versa) and remove both their mean from the time-series.
In order to do that efficiently, i need to reorder the time-series into a new xr.DataArray with the dimensions of ['hour', 'dayofyear', 'rest'].
I wrote a function that plays with the GroupBy objects of Xarray and manages to do just that although it takes a lot of memory to do that...
I have a machine with 32GB RAM and i still get the MemoryError from numpy.
I know the code works because i used it on an hourly re-sampled version of my original time-series. so here's the code:
def time_series_stack(time_da, time_dim='time', grp1='hour', grp2='dayofyear'):
"""Takes a time-series xr.DataArray objects and reshapes it using
grp1 and grp2. outout is a xr.Dataset that includes the reshaped DataArray
, its datetime-series and the grps."""
import xarray as xr
import numpy as np
import pandas as pd
# try to infer the freq and put it into attrs for later reconstruction:
freq = pd.infer_freq(time_da[time_dim].values)
name = time_da.name
time_da.attrs['freq'] = freq
attrs = time_da.attrs
# drop all NaNs:
time_da = time_da.dropna(time_dim)
# group grp1 and concat:
grp_obj1 = time_da.groupby(time_dim + '.' + grp1)
s_list = []
for grp_name, grp_inds in grp_obj1.groups.items():
da = time_da.isel({time_dim: grp_inds})
s_list.append(da)
grps1 = [x for x in grp_obj1.groups.keys()]
stacked_da = xr.concat(s_list, dim=grp1)
stacked_da[grp1] = grps1
# group over the concatenated da and concat again:
grp_obj2 = stacked_da.groupby(time_dim + '.' + grp2)
s_list = []
for grp_name, grp_inds in grp_obj2.groups.items():
da = stacked_da.isel({time_dim: grp_inds})
s_list.append(da)
grps2 = [x for x in grp_obj2.groups.keys()]
stacked_da = xr.concat(s_list, dim=grp2)
stacked_da[grp2] = grps2
# numpy part:
# first, loop over both dims and drop NaNs, append values and datetimes:
vals = []
dts = []
for i, grp1_val in enumerate(stacked_da[grp1]):
da = stacked_da.sel({grp1: grp1_val})
for j, grp2_val in enumerate(da[grp2]):
val = da.sel({grp2: grp2_val}).dropna(time_dim)
vals.append(val.values)
dts.append(val[time_dim].values)
# second, we get the max of the vals after the second groupby:
max_size = max([len(x) for x in vals])
# we fill NaNs and NaT for the remainder of them:
concat_sizes = [max_size - len(x) for x in vals]
concat_arrys = [np.empty((x)) * np.nan for x in concat_sizes]
concat_vals = [np.concatenate(x) for x in list(zip(vals, concat_arrys))]
# 1970-01-01 is the NaT for this time-series:
concat_arrys = [np.zeros((x), dtype='datetime64[ns]')
for x in concat_sizes]
concat_dts = [np.concatenate(x) for x in list(zip(dts, concat_arrys))]
concat_vals = np.array(concat_vals)
concat_dts = np.array(concat_dts)
# finally , we reshape them:
concat_vals = concat_vals.reshape((stacked_da[grp1].shape[0],
stacked_da[grp2].shape[0],
max_size))
concat_dts = concat_dts.reshape((stacked_da[grp1].shape[0],
stacked_da[grp2].shape[0],
max_size))
# create a Dataset and DataArrays for them:
sda = xr.Dataset()
sda.attrs = attrs
sda[name] = xr.DataArray(concat_vals, dims=[grp1, grp2, 'rest'])
sda[time_dim] = xr.DataArray(concat_dts, dims=[grp1, grp2, 'rest'])
sda[grp1] = grps1
sda[grp2] = grps2
sda['rest'] = range(max_size)
return sda
So for the 2,500,000 items time-series, numpy throws the MemoryError so I'm guessing this has to be my memory bottle-neck. What can i do to solve this ?
Would Dask help me ? and if so how can i implement it ?
Like you, I ran it without issue when inputting a small time series (10,000 long). However, when inputting a 100,000 long time series xr.DataArraythe grp_obj2 for loop ran away and used all the memory of the system.
This is what I used to generate the time series xr.DataArray:
n = 10**5
times = np.datetime64('2000-01-01') + np.arange(n) * np.timedelta64(5,'m')
data = np.random.randn(n)
time_da = xr.DataArray(data, name='rand_data', dims=('time'), coords={'time': times})
# time_da.to_netcdf('rand_time_series.nc')
As you point out, Dask would be a way to solve it but I can't see a clear path at the moment...
Typically, the kind of problem with Dask would be to:
Make the input a dataset from a file (like NetCDF). This will not load the file in memory but allow Dask to pull data from disk one chunk at a time.
Define all calculations with dask.delayed or dask.futures methods for entire body of code up until the writing the output. This is what allows Dask to chunk a small piece of data to read then write.
Calculate one chunk of work and immediately write output to new dataset file. Effectively you ending up steaming one chunk of input to one chunk of output at a time (but also threaded/parallelized).
I tried importing Dask and breaking the input time_da xr.DataArray into chunks for Dask to work on but it didn't help. From what I can tell, the line stacked_da = xr.concat(s_list, dim=grp1) forces Dask to make a full copy of stacked_da in memory and much more...
One workaround to this is to write stacked_da to disk then immediately read it again:
##For group1
xr.concat(s_list, dim=grp1).to_netcdf('stacked_da1.nc')
stacked_da = xr.load_dataset('stacked_da1.nc')
stacked_da[grp1] = grps1
##For group2
xr.concat(s_list, dim=grp2).to_netcdf('stacked_da2.nc')
stacked_da = xr.load_dataset('stacked_da2.nc')
stacked_da[grp2] = grps2
However, the file size for stacked_da1.nc is 19MB and stacked_da2.nc gets huge at 6.5GB. This is for time_da with 100,000 elements... so there's clearly something amiss...
Originally, it sounded like you want to subtract the mean of the groups from the time series data. It looks like Xarray docs has an example for that. http://xarray.pydata.org/en/stable/groupby.html#grouped-arithmetic
The key is to group once and loop over the groups and then group again on each of the groups and append it to list.
Next i concat and use pd.MultiIndex.from_product for the groups.
No Memory problems and no Dask needed and it only takes a few seconds to run.
here's the code, enjoy:
def time_series_stack(time_da, time_dim='time', grp1='hour', grp2='month',
plot=True):
"""Takes a time-series xr.DataArray objects and reshapes it using
grp1 and grp2. output is a xr.Dataset that includes the reshaped DataArray
, its datetime-series and the grps. plots the mean also"""
import xarray as xr
import pandas as pd
# try to infer the freq and put it into attrs for later reconstruction:
freq = pd.infer_freq(time_da[time_dim].values)
name = time_da.name
time_da.attrs['freq'] = freq
attrs = time_da.attrs
# drop all NaNs:
time_da = time_da.dropna(time_dim)
# first grouping:
grp_obj1 = time_da.groupby(time_dim + '.' + grp1)
da_list = []
t_list = []
for grp1_name, grp1_inds in grp_obj1.groups.items():
da = time_da.isel({time_dim: grp1_inds})
# second grouping:
grp_obj2 = da.groupby(time_dim + '.' + grp2)
for grp2_name, grp2_inds in grp_obj2.groups.items():
da2 = da.isel({time_dim: grp2_inds})
# extract datetimes and rewrite time coord to 'rest':
times = da2[time_dim]
times = times.rename({time_dim: 'rest'})
times.coords['rest'] = range(len(times))
t_list.append(times)
da2 = da2.rename({time_dim: 'rest'})
da2.coords['rest'] = range(len(da2))
da_list.append(da2)
# get group keys:
grps1 = [x for x in grp_obj1.groups.keys()]
grps2 = [x for x in grp_obj2.groups.keys()]
# concat and convert to dataset:
stacked_ds = xr.concat(da_list, dim='all').to_dataset(name=name)
stacked_ds[time_dim] = xr.concat(t_list, 'all')
# create a multiindex for the groups:
mindex = pd.MultiIndex.from_product([grps1, grps2], names=[grp1, grp2])
stacked_ds.coords['all'] = mindex
# unstack:
ds = stacked_ds.unstack('all')
ds.attrs = attrs
return ds

use Featureunion in scikit-learn to combine two pandas columns for tfidf

While using this as a model for spam classification, I'd like to add an additional feature of the Subject plus the body.
I have all of my features in a pandas dataframe. For example, the subject is df['Subject'], the body is df['body_text'] and the spam/ham label is df['ham/spam']
I receive the following error:
TypeError: 'FeatureUnion' object is not iterable
How can I use both df['Subject'] and df['body_text'] as features all while running them through the pipeline function?
from sklearn.pipeline import FeatureUnion
features = df[['Subject', 'body_text']].values
combined_2 = FeatureUnion(list(features))
pipeline = Pipeline([
('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
('tfidf_transformer', TfidfTransformer()),
('classifier', MultinomialNB())])
pipeline.fit(combined_2, df['ham/spam'])
k_fold = KFold(n=len(df), n_folds=6)
scores = []
confusion = numpy.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold:
train_text = combined_2.iloc[train_indices]
train_y = df.iloc[test_indices]['ham/spam'].values
test_text = combined_2.iloc[test_indices]
test_y = df.iloc[test_indices]['ham/spam'].values
pipeline.fit(train_text, train_y)
predictions = pipeline.predict(test_text)
prediction_prob = pipeline.predict_proba(test_text)
confusion += confusion_matrix(test_y, predictions)
score = f1_score(test_y, predictions, pos_label='spam')
scores.append(score)
FeatureUnion was not meant to be used that way. It instead takes two feature extractors / vectorizers and applies them to the input. It does not take data in the constructor the way it is shown.
CountVectorizer is expecting a sequence of strings. The easiest way to provide it with that is to concatenate the strings together. That would pass both the text in both columns to the same CountVectorizer.
combined_2 = df['Subject'] + ' ' + df['body_text']
An alternative method would be to run CountVectorizer and optionally TfidfTransformer individually on each column, and then stack the results.
import scipy.sparse as sp
subject_vectorizer = CountVectorizer(...)
subject_vectors = subject_vectorizer.fit_transform(df['Subject'])
body_vectorizer = CountVectorizer(...)
body_vectors = body_vectorizer.fit_transform(df['body_text'])
combined_2 = sp.hstack([subject_vectors, body_vectors], format='csr')
A third option is to implement your own transformer that would extract a dataframe column.
class DataFrameColumnExtracter(TransformerMixin):
def __init__(self, column):
self.column = column
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
return X[self.column]
In that case you could use FeatureUnion on two pipelines, each containing your custom transformer, then CountVectorizer.
subj_pipe = make_pipeline(
DataFrameColumnExtracter('Subject'),
CountVectorizer()
)
body_pipe = make_pipeline(
DataFrameColumnExtracter('body_text'),
CountVectorizer()
)
feature_union = make_union(subj_pipe, body_pipe)
This feature union of pipelines will take the dataframe and each pipeline will process its column. It will produce the concatenation of term count matrices from the two columns given.
sparse_matrix_of_counts = feature_union.fit_transform(df)
This feature union can also be added as the first step in a larger pipeline.