subclassing tables.Group class - pytables

Is it allowable to subclass tables.Group?
The following code works fine
In [1]: import tables
In [2]: class Friendly_group(tables.Group):
...: def __repr__(self):
...: return 'hello!'
...:
In [3]: hf = tables.open_file('data', mode='w')
In [4]: fgroup = Friendly_group(hf.root, 'fgroup', new=True)
In [5]: hf
Out[5]:
File(filename=data, title='', mode='w', root_uep='/', filters=Filters(complevel=0, shuffle=False, fletcher32=False))
/ (RootGroup) ''
/fgroup (Friendly_group) ''
In [6]: hf.root.fgroup
Out[6]: hello!
But after reading back, that group stops being friendly
In [7]: hf.close()
In [8]: hf = tables.open_file('data', mode='r')
In [9]: hf
Out[9]:
File(filename=data, title='', mode='r', root_uep='/', filters=Filters(complevel=0, shuffle=False, fletcher32=False))
/ (RootGroup) ''
/fgroup (Group) ''
In [10]: hf.root.fgroup
Out[10]:
/fgroup (Group) ''
children := []
SO checker forces me to add some details to this post, but I really don't know how can I increase clearness of my question, so please, excuse me for this dummy piece of text.

Yes, this is possible. The missing piece needed for de-persistence (getting your subclass back when the file is re-opened) is to override the _c_classid class attribute. You probably want to look at the other Group subclasses defined in tables/group.py. For instance, take TransactionGroupG (stripped of some backwards-compatibility features):
class TransactionGroupG(NotLoggedMixin, Group):
    """Group subclass that carries its own ``_c_classid`` tag.

    Per the surrounding answer, overriding ``_c_classid`` is the piece a
    Group subclass needs in order to be restored as that subclass when the
    file is read back.
    """

    # Class identifier string for this subclass.
    _c_classid = 'TRANSGROUP'

    def _g_width_warning(self):
        """Issue a PerformanceWarning quoting ``self._v_max_group_width``."""
        # The backslashes inside the literal are line continuations, so the
        # message is emitted as a single line.
        message = """\
the number of transactions is exceeding the recommended maximum (%d);\
be ready to see PyTables asking for *lots* of memory and possibly slow I/O"""
        warnings.warn(message % (self._v_max_group_width,), PerformanceWarning)
This is fairly minimal.

Related

deep feature synthesis depth for transformation primitives | featuretools

I am trying to use the featuretools library to make new features on a simple dataset, however, whenever I try to use a bigger max_depth, nothing happens... Here is my code so far:
# imports
import featuretools as ft
# creating the EntitySet
es = ft.EntitySet()
es.entity_from_dataframe(entity_id='data', dataframe=data, make_index=True, index='index')
# Run deep feature synthesis with transformation primitives
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='data', max_depth=3,
trans_primitives=['add_numeric', 'multiply_numeric'])
When I look at the features created, I get the basic things f1*f2 and f1+f2, but I would like more complex engineered features like f2*(f1+f2) or f1+(f2+f1). I thought increasing max_depth would do this but apparently not.
How could I do this, if at all?
I have managed to answer my own question, so I'll post it here.
You can create deeper features by running "Deep Feature Synthesis" on already generated features. Here is an example:
# imports
import featuretools as ft
# creating the EntitySet
es = ft.EntitySet()
es.entity_from_dataframe(entity_id='data', dataframe=data, make_index=True, index='index')
# Run deep feature synthesis with transformation primitives
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='data',
trans_primitives=['add_numeric','multiply_numeric'])
# creating an EntitySet from the new features
deep_es = ft.EntitySet()
deep_es.entity_from_dataframe(entity_id='data', index='index', dataframe=feature_matrix)
# Run deep feature synthesis with transformation primitives
deep_feature_matrix, deep_feature_defs=ft.dfs(entityset=deep_es, target_entity='data',
trans_primitives=['add_numeric','multiply_numeric'])
Now, looking at the columns of deep_feature_matrix here is what we see (assuming a dataset with 2 features):
"f1", "f2", "f1+f2", "f1*f2", "f1+f1*f2", "f1+f1+f2", "f1*f2+f1+f2", "f1*f2+f2", "f1+f2+f2", "f1*f1*f2", "f1*f1+f2", "f1*f2*f1+f2", "f1*f2*f2", "f1+f2*f2"
I have also made a function that automatically does this (includes a full docstring):
def auto_feature_engineering(X, y, selection_percent=0.1, selection_strategy="best", num_depth_steps=2, transformatives=None):
    """
    Automatically perform deep feature engineering and
    feature selection.

    Parameters
    ----------
    X : pd.DataFrame
        Data to perform automatic feature engineering on.
    y : pd.DataFrame
        Target variable to find correlations of all
        features at each depth step to perform feature
        selection, y is not needed if selection_percent=1.
    selection_percent : float, optional
        Defines what percent of all the new features to
        keep for the next depth step.
    selection_strategy : {'best', 'random'}, optional
        Strategy used for feature selection, if 'best',
        it will select the best features for the next depth
        step, if 'random', it will select features at random.
    num_depth_steps : integer, optional
        The number of depth steps. Every depth step, the model
        generates brand new features from the features made in
        the last step, then selects a percent of these new
        features.
    transformatives : list, optional
        List of all possible transformations of the data to use
        when feature engineering, you can find the full list
        of possible transformations as well as what each one
        does using the following code:
        `ft.primitives.list_primitives()[ft.primitives.list_primitives()["type"]=="transform"]`
        make sure to `import featuretools as ft`.
        Defaults to ``['divide_numeric', 'multiply_numeric']``.

    Returns
    -------
    pd.DataFrame
        a dataframe of the brand new features.

    Raises
    ------
    ValueError
        If feature selection is enabled (selection_percent != 1) and
        selection_strategy is neither 'best' nor 'random'.
    """
    # Local imports keep the function self-contained.
    import featuretools as ft
    import pandas as pd
    from sklearn.feature_selection import mutual_info_classif

    # Avoid a shared mutable default argument.
    if transformatives is None:
        transformatives = ['divide_numeric', 'multiply_numeric']

    selected_feature_df = X.copy()
    for i in range(num_depth_steps):
        # Perform feature engineering
        es = ft.EntitySet()
        es.entity_from_dataframe(entity_id='data', dataframe=selected_feature_df,
                                 make_index=True, index='index')
        feature_matrix, _ = ft.dfs(entityset=es, target_entity='data',
                                   trans_primitives=transformatives)

        # Remove features that are the same: two features are treated as
        # duplicates when they have the same correlation with the first column.
        feature_corrs = feature_matrix.corr()[list(feature_matrix.keys())[0]]
        existing_corrs = []
        good_keys = []
        for key in feature_corrs.to_dict():
            corr_value = feature_corrs[key]
            if corr_value not in existing_corrs:
                existing_corrs.append(corr_value)
                good_keys.append(key)
        feature_matrix = feature_matrix[good_keys]

        # Remove illegal features. Splitting a generated name on spaces puts
        # the operands at the even positions, so that count is the number of
        # raw features combined. Depth step i allows at most i + 2 of them
        # (num_depth_steps = 1 means max_num_raw_features_in_feature = 2).
        max_raw_features = i + 2
        legal_features = [
            feature for feature in feature_matrix.columns
            if len(feature.split(" ")[::2]) <= max_raw_features
        ]
        feature_matrix = feature_matrix[legal_features]

        # Perform feature selection
        if int(selection_percent) != 1:
            if selection_strategy == "best":
                # Score every feature against y and keep the top quantile.
                corrs = mutual_info_classif(feature_matrix.reset_index(drop=True), y)
                corrs = pd.Series(corrs, name="")
                selected_corrs = corrs[corrs >= corrs.quantile(1 - selection_percent)]
                selected_feature_df = feature_matrix.iloc[:, list(selected_corrs.keys())].reset_index(drop=True)
            elif selection_strategy == "random":
                selected_feature_df = feature_matrix.sample(frac=(selection_percent), axis=1).reset_index(drop=True)
            else:
                raise ValueError("selection_strategy can be either 'best' or 'random', got '"+str(selection_strategy)+"'.")
        else:
            selected_feature_df = feature_matrix.reset_index(drop=True)

        # Parenthesise every name so the next depth step produces
        # unambiguous nested expressions such as "(a / b) / (c)".
        if num_depth_steps > 1:
            selected_feature_df = selected_feature_df.rename(
                columns={col: "(" + col + ")" for col in selected_feature_df.columns}
            )

    # Undo only the parenthesis layer added by the last pass. Stripping
    # num_depth_steps - 1 characters per side (as before) is the same thing
    # for two steps but yields unbalanced names for three or more.
    if num_depth_steps > 1:
        selected_feature_df = selected_feature_df.rename(
            columns={name: name[1:-1] for name in selected_feature_df.columns}
        )
    return selected_feature_df
Here is an example of using it:
# Imports
>>> import seaborn as sns
>>> import pandas as pd
>>> import numpy as np
>>> from sklearn.preprocessing import OrdinalEncoder
# Load the penguins dataset
>>> penguins = sns.load_dataset("penguins")
>>> penguins.head()
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
# Fill in NaN values of features using the distribution of the feature
>>> for feature in ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "sex"]:
... s = penguins[feature].value_counts(normalize=True)
... dist = penguins[feature].value_counts(normalize=True).values
... missing = penguins[feature].isnull()
... penguins.loc[missing, feature] = np.random.choice(s.index, size=len(penguins[missing]),p=s.values)
# Make X and y
>>> X = penguins[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]]
>>> y = penguins[["sex"]]
# Encode "sex" so that "Male" is 1 and "Female" is 0
>>> ord_enc = OrdinalEncoder()
>>> y = pd.DataFrame(ord_enc.fit_transform(y).astype(np.int8), columns=["sex"])
# Generate new dataset with more features
>>> penguins_with_more_features = auto_feature_engineering(X, y, selection_percent=1.)
# Correlations of the raw features
>>> find_correlations(X, y)
body_mass_g 0.422959
bill_depth_mm 0.353526
bill_length_mm 0.342109
flipper_length_mm 0.246944
Name: sex, dtype: float64
# Top 10% correlations of new features
>>> summarize_corr_series(find_top_percent(find_correlations(penguins_with_more_features, y), 0.1))
(flipper_length_mm / bill_depth_mm) / (body_mass_g): 0.7241123396175027
(bill_depth_mm * body_mass_g) / (flipper_length_mm): 0.7237223914820166
(bill_depth_mm * body_mass_g) * (bill_depth_mm): 0.7222108721971968
(bill_depth_mm * body_mass_g): 0.7202272416625914
(bill_depth_mm * body_mass_g) * (flipper_length_mm): 0.6425813490692588
(bill_depth_mm * bill_length_mm) * (body_mass_g): 0.6398235593646668
(bill_depth_mm * flipper_length_mm) * (flipper_length_mm): 0.6360645935216128
(bill_depth_mm * flipper_length_mm): 0.6083364815975281
(bill_depth_mm * body_mass_g) * (body_mass_g): 0.5888925994060027
In this example, we would like to predict the gender of penguins given their attributes body_mass_g, bill_depth_mm, bill_length_mm and flipper_length_mm.
You might notice these other mysterious functions I used in the example, namely find_correlations, summarize_corr_series and find_top_percent. These are other convenient functions I made to help summarize the results from auto_feature_engineering. Here is the code to them (note they haven't been documented):
def summarize_corr_series(feature_corr_series):
    """Print every entry of a Series as ``name: |value|`` with aligned values.

    Parameters
    ----------
    feature_corr_series : pd.Series
        Series indexed by feature name (strings) holding numeric scores.

    Returns
    -------
    None
        The summary is written to standard output, one line per entry;
        nothing is printed for an empty Series.
    """
    names = list(feature_corr_series.to_dict())
    # Width of the longest name, so that all values start in the same column.
    width = max((len(name) for name in names), default=0)
    for name in names:
        # +2 accounts for the ": " separator that follows every name.
        label = (name + ": ").ljust(width + 2)
        print(label + str(abs(feature_corr_series[name])))
def find_top_percent(series, percent):
    """Return the entries of ``series`` strictly above its (1 - percent) quantile."""
    cutoff = series.quantile(1 - percent)
    above_cutoff = series > cutoff
    return series[above_cutoff]
def find_correlations(X, y):
    """Absolute correlation of every column of ``X`` with the first column of ``y``.

    Both frames have their index reset before being joined column-wise, so
    rows are paired by position. The result is a Series sorted from the
    strongest to the weakest absolute correlation, without the target itself.
    """
    target = y.columns[0]
    combined = pd.concat(
        [X.reset_index(drop=True), y.reset_index(drop=True)],
        axis=1,
    )
    abs_corr = abs(combined.corr())
    against_target = abs_corr[target].drop(target)
    return against_target.sort_values(ascending=False)
It is really unfortunate that featuretools does not easily support this use case since it appears to be quite common. The best way I've found to do this is to create the first order features you want using the dfs function and then add the second order features you want manually.
For instance the MWE below (using the iris dataset) performs the AddNumeric primitive using dfs and then applies the DivideNumeric primitive to the newly created features using only the original features (and avoids the same base feature appearing multiple times in a transformed feature).
import numpy as np
import pandas as pd
import sklearn
import featuretools as ft
iris = sklearn.datasets.load_iris()
data = pd.DataFrame(
data= np.c_[iris['data'],
iris['target']],
columns= iris['feature_names'] + ['target']
)
ignore_cols = ['target']
entity_set = ft.EntitySet(id="iris")
entity_set.entity_from_dataframe(
entity_id="iris_main",
dataframe=data,
index="index",
)
new_features = ft.dfs(
entityset=entity_set,
target_entity="iris_main",
trans_primitives=["add_numeric"],
features_only=True,
primitive_options={
"add_numeric": {
"ignore_variables": {"iris_main": ignore_cols},
},
},
)
transformed_features = [i for i in new_features if isinstance(i, ft.feature_base.feature_base.TransformFeature)]
original_features = [i for i in new_features if isinstance(i, ft.feature_base.feature_base.IdentityFeature) and i.get_name() not in ignore_cols]
depth_two_features = []
for trans_feat in transformed_features:
for orig_feat in original_features:
if orig_feat.get_name() not in [i.get_name() for i in trans_feat.base_features]:
feat = ft.Feature([trans_feat, orig_feat], primitive=ft.primitives.DivideNumeric)
depth_two_features.append(feat)
data = ft.calculate_feature_matrix(
features= original_features + transformed_features + depth_two_features,
entityset=entity_set,
verbose=True,
)
The benefit of this approach is that it gives you more fine grained control to customise this how you want and avoids the computational cost of creating unnecessary features you don't want.

Add to items, with multiple occurrences [duplicate]

I have unsorted array of indexes:
i = np.array([1,5,2,6,4,3,6,7,4,3,2])
I also have an array of values of the same length:
v = np.array([2,5,2,3,4,1,2,1,6,4,2])
I have array with zeros of desired values:
d = np.zeros(10)
Now I want to add each value of v to the element of d whose index is given by the corresponding entry of i.
If I do it in plain python I would do it like this:
for index,value in enumerate(v):
idx = i[index]
d[idx] += v[index]
It is ugly and inefficient. How can I change it?
np.add.at(d, i, v)
You'd think d[i] += v would work, but if you try to do multiple additions to the same cell that way, one of them overrides the others. The ufunc.at method avoids those problems.
We can use np.bincount which is supposedly pretty efficient for such accumulative weighted counting, so here's one with that -
counts = np.bincount(i,v)
d[:counts.size] = counts
Alternatively, using minlength input argument and for a generic case when d could be any array and we want to add into it -
d += np.bincount(i,v,minlength=d.size).astype(d.dtype, copy=False)
Runtime tests
This section compares np.add.at based approach listed in the other post with the np.bincount based one listed earlier in this post.
In [61]: def bincount_based(d,i,v):
...: counts = np.bincount(i,v)
...: d[:counts.size] = counts
...:
...: def add_at_based(d,i,v):
...: np.add.at(d, i, v)
...:
In [62]: # Inputs (random numbers)
...: N = 10000
...: i = np.random.randint(0,1000,(N))
...: v = np.random.randint(0,1000,(N))
...:
...: # Setup output arrays for two approaches
...: M = 12000
...: d1 = np.zeros(M)
...: d2 = np.zeros(M)
...:
In [63]: bincount_based(d1,i,v) # Run approaches
...: add_at_based(d2,i,v)
...:
In [64]: np.allclose(d1,d2) # Verify outputs
Out[64]: True
In [67]: # Setup output arrays for two approaches again for timing
...: M = 12000
...: d1 = np.zeros(M)
...: d2 = np.zeros(M)
...:
In [68]: %timeit add_at_based(d2,i,v)
1000 loops, best of 3: 1.83 ms per loop
In [69]: %timeit bincount_based(d1,i,v)
10000 loops, best of 3: 52.7 µs per loop

Speeding up Euclidean Distance in python [duplicate]

How do you optimize this code?
At the moment it is running too slowly for the amount of data that goes through this loop. This code runs 1-nearest neighbor: it predicts the label of training_element based on p_data_set.
# [x] , [[x1],[x2],[x3]], [l1, l2, l3]
def prediction(training_element, p_data_set, p_label_set):
    """1-nearest-neighbour lookup: label of the row closest to ``training_element``.

    Every row of ``p_data_set`` is compared with ``training_element`` using
    ``distance.euclidean``; the label at the position of the smallest
    distance is returned.
    """
    distances = np.array([], dtype=float)
    for candidate in p_data_set:
        gap = distance.euclidean(training_element, candidate)
        # Grows the array one element at a time (np.append copies each time).
        distances = np.append(distances, gap)
    nearest = np.argmin(distances)
    return p_label_set[nearest]
Use a k-D tree for fast nearest-neighbour lookups, e.g. scipy.spatial.cKDTree:
from scipy.spatial import cKDTree
# I assume that p_data_set is (nsamples, ndims)
tree = cKDTree(p_data_set)
# training_elements is also assumed to be (nsamples, ndims)
dist, idx = tree.query(training_elements, k=1)
predicted_labels = p_label_set[idx]
You could use distance.cdist to directly get the distances temp and then use .argmin() to get min-index, like so -
minIndex = distance.cdist(training_element[None],p_data_set).argmin()
Here's an alternative approach using np.einsum -
subs = p_data_set - training_element
minIndex = np.einsum('ij,ij->i',subs,subs).argmin()
Runtime test
Well I was thinking cKDTree would easily beat cdist, but I guess training_element being a 1D array isn't too heavy for cdist and I am seeing it to beat out cKDTree instead by a good 10x+ margin!
Here's the timing results -
In [422]: # Setup arrays
...: p_data_set = np.random.randint(0,9,(40000,100))
...: training_element = np.random.randint(0,9,(100,))
...:
In [423]: def tree_based(p_data_set,training_element): ##ali_m's soln
...: tree = cKDTree(p_data_set)
...: dist, idx = tree.query(training_element, k=1)
...: return idx
...:
...: def einsum_based(p_data_set,training_element):
...: subs = p_data_set - training_element
...: return np.einsum('ij,ij->i',subs,subs).argmin()
...:
In [424]: %timeit tree_based(p_data_set,training_element)
1 loops, best of 3: 210 ms per loop
In [425]: %timeit einsum_based(p_data_set,training_element)
100 loops, best of 3: 17.3 ms per loop
In [426]: %timeit distance.cdist(training_element[None],p_data_set).argmin()
100 loops, best of 3: 14.8 ms per loop
Python can be quite a fast programming language if used properly.
This is my suggestion (faster_prediction):
import numpy as np
import time
def euclidean(a, b):
    """Return the Euclidean (L2) norm of ``a - b``."""
    difference = a - b
    return np.linalg.norm(difference)
def prediction(training_element, p_data_set, p_label_set):
    """Baseline 1-nearest-neighbour: label of the row closest to ``training_element``.

    Distances are computed row by row with ``euclidean`` and accumulated
    with ``np.append``; this is the slow reference implementation that the
    timing comparison below measures against.
    """
    distances = np.array([], dtype=float)
    for row in p_data_set:
        row_distance = euclidean(training_element, row)
        distances = np.append(distances, row_distance)
    closest = np.argmin(distances)
    return p_label_set[closest]
def faster_prediction(training_element, p_data_set, p_label_set):
    """Vectorised 1-nearest-neighbour: label of the row closest to ``training_element``.

    Parameters
    ----------
    training_element : array-like, shape (ndims,)
        The query point.
    p_data_set : array-like, shape (nsamples, ndims)
        Candidate points, one per row.
    p_label_set : sequence
        Label for each row of ``p_data_set``.

    Returns
    -------
    The element of ``p_label_set`` at the index of the nearest row.
    """
    # Broadcasting subtracts the query from every row, so no tiled copy
    # of training_element is needed.
    offsets = np.asarray(p_data_set) - np.asarray(training_element)
    distances = np.sqrt(np.sum(offsets ** 2, axis=1))
    return p_label_set[np.argmin(distances)]
training_element = [1,2,3]
p_data_set = np.random.rand(100000, 3)*10
p_label_set = np.r_[0:p_data_set.shape[0]]
t1 = time.time()
result_1 = prediction(training_element, p_data_set, p_label_set)
t2 = time.time()
t3 = time.time()
result_2 = faster_prediction(training_element, p_data_set, p_label_set)
t4 = time.time()
print "Execution time 1:", t2-t1, "value: ", result_1
print "Execution time 2:", t4-t3, "value: ", result_2
print "Speed up: ", (t4-t3) / (t2-t1)
I get the following result on pretty old laptop:
Execution time 1: 21.6033108234 value: 9819
Execution time 2: 0.0176379680634 value: 9819
Speed up: 1224.81857013
which makes me think I must have done some stupid mistake :)
In case of very huge data, where memory might be an issue, I suggest using Cython or implementing function in C++ and wrapping it in python.

Explaining the result of pipeline execution of multiple hincrby commands in redis

This rudimentary one has me stumped. I've been tinkering around with redis-py, trying to learn the ropes. One thing I'm trying is:
pipeline1 = my_server.pipeline()
for hash_obj in hash_objs:
num = pipeline1.hincrby(hash_obj,"num",amount=-1)
result1 = pipeline1.execute()
print result1
>>> [0L,0L]
There were two redis hashes in the list hash_objs. What I see printed on the screen is [0L,0L]. Can someone help me decipher what this output means? What's L? I was hoping to get the resulting int values of num for each hash_obj (e.g. [2,0]).
My objective is to decrement num in each hash_obj by 1, and wherever num ends up as 0, delete the hash_obj.
I'm trying to accomplish that in two separate pipelines; the code above is the attempt to decrement all num values in all hash_objs. After this, I would delete the relevant hash_objs if warranted. I'm still developing my understanding of how to effectively use pipelining in redis.
Nothing wrong with the code above - the L means long (integer) and the result printout is consistent assuming that the hashes were set to 1 before the run. If you set the hashes beforehand to 3 and 1 (steps 3 and 4 below), respectively, you'll get the expected result in step 9:
In [1]: import redis
In [2]: r = redis.StrictRedis()
In [3]: r.hset('h1', 'num', 3)
Out[3]: 1L
In [4]: r.hset('h2', 'num', 1)
Out[4]: 1L
In [5]: hashes = ['h1', 'h2']
In [6]: p = r.pipeline()
In [7]: for h in hashes:
...: p.hincrby(h, 'num', -1)
...:
In [8]: res = p.execute()
In [9]: res
Out[9]: [2L, 0L]
Note: the 1L in 3 and 4 means that the key was created.
Now you can iterate over the result and continue the processing. In your case, however, it would make more sense to use just one pipeline and, instead of calling hincrby directly, run a Lua script that decrements the field and deletes the key if the result is 0, such as the one below (which returns 1 if the key was deleted):
In [1]: import redis
In [2]: r = redis.StrictRedis()
In [3]: r.hset('h1', 'num', 3)
Out[3]: 0L
In [4]: r.hset('h2', 'num', 1)
Out[4]: 0L
In [5]: s = r.script_load('if redis.call("HINCRBY", KEYS[1], ARGV[1], ARGV[2]) <= 0 then redis.call("DEL", KEYS[1]) return 1 end return 0')
In [6]: p = r.pipeline()
In [7]: for h in ['h1', 'h2']:
...: p.evalsha(s, 1, h, 'num', -1)
...:
In [8]: p.execute()
Out[8]: [0L, 1L]

Itertools for containers

Consider the following interactive example:
>>> l=imap(str,xrange(1,4))
>>> list(l)
['1', '2', '3']
>>> list(l)
[]
Does anyone know if there is already an implementation somewhere out there with a version of imap (and the other itertools functions) such that the second time list(l) is executed you get the same as the first. And I don't want the regular map because building the entire output in memory can be a waste of memory if you use larger ranges.
I want something that basically does something like
class cmap:
    """Re-iterable wrapper around ``itertools.imap``.

    Stores the function and its input containers; every ``iter()`` call
    builds a fresh ``imap`` over them.
    """

    def __init__(self, function, *iterators):
        self._function, self._iterators = function, iterators

    def __iter__(self):
        sources = self._iterators
        return itertools.imap(self._function, *sources)

    def __len__(self):
        # Length of the shortest input container.
        return min(len(source) for source in self._iterators)
But it would be a waste of time to do this manually for all itertools if someone already did this.
ps.
Do you think containers are more zen than iterators, since for an iterator something like
for i in iterator:
do something
implicitly empties the iterator, while with a container you need to remove elements explicitly?
You do not have to build such an object for each type of container. Basically, you have the following:
mkimap = lambda: imap(str,xrange(1,4))
list(mkimap())
list(mkimap())
Now you only need a nice wrapping object to avoid the "ugly" function calls. It could work this way:
class MultiIter(object):
    """Iterable that builds a fresh iterator from a factory on every ``iter()``.

    ``f`` is called with the stored positional and keyword arguments each
    time the object is iterated, so it can be looped over repeatedly.
    """

    def __init__(self, f, *a, **k):
        if not (a or k):
            # No stored arguments: use the factory directly (optimization).
            self.create = f
            return

        def _factory():
            return f(*a, **k)

        self.create = _factory

    def __iter__(self):
        return self.create()
l = MultiIter(lambda: imap(str, xrange(1,4)))
# or
l = MultiIter(imap, str, xrange(1,4))
# or even
#MultiIter
def l():
return imap(str, xrange(1,4))
# and then
print list(l)
print list(l)
(untested, hope it works, but you should get the idea)
For your 2nd question: Iterators and containers both have their uses. You should take whatever best fits your needs.
You may be looking for itertools.tee()
Iterators are my favorite topic ;)
from itertools import imap
class imap2(object):
    """Re-iterable ``imap`` that lazily caches the values it has produced.

    The underlying ``imap`` is consumed at most once. Every iteration
    replays the cached values from the start and only then pulls new
    values, so partial or interleaved iterations all see the full
    sequence in order.
    """

    def __init__(self, f, *args):
        self.g = imap(f,*args)  # underlying one-shot iterator
        self.lst = []           # values produced so far, in order
        self.done = False       # True once g has been exhausted

    def __iter__(self):
        position = 0
        while True:
            if position < len(self.lst):
                # Already produced by this or another iteration: replay it.
                x = self.lst[position]
            else:
                if self.done:
                    return
                try:  # try to get something new from g
                    x = next(self.g)
                except StopIteration:
                    # g was consumed; later iterations are served from lst
                    self.done = True
                    return
                self.lst.append(x)
            yield x
            position += 1
l=imap2(str,xrange(1,4))
print list(l)
print list(l)