Extending array in HDF5 using PyTables - pytables

I normally use h5py for HDF5 work in Python, and if I want to create a dataset that I can extend later, I do:
f = h5py.File('foo.h5', 'w')
d = f.create_dataset('whatever', (5, 5), maxshape=(None, 5), dtype='i8', chunks=True)
...
d.resize((23, 5))
...
The maxshape=(None, ...) argument sets the first dimension to "infinity", so the dataset can be extended along that axis.
Now I have a project where I need to stick with PyTables and wanted to build up large arrays step by step. Is there a way to extend arrays in PyTables?
This is roughly the idea:
import tables as tb
import numpy as np
filename = "foo.h5"
h5file = tb.File(filename, "a")
gbar = h5file.create_group(h5file.root, "bar", "Pressure")
h5file.create_array(gbar, 'left', np.array((1, 2, 3, 4)), "...")
# now extend the shape of (4,) and append more arrays iteratively???
h5file.close()

I found the solution in the docs: tables.EArray
http://www.pytables.org/usersguide/libref/homogenous_storage.html#earrayclassdescr
Here is a descriptive example that adds two "columns", using two different ways of defining the dtype. The with block can be run multiple times, and each run extends the columns:
import tables as tb
import numpy as np
filename = 'foo.h5'
with tb.File(filename, "a") as f:
    if "/foo" not in f:
        group = f.create_group("/", 'foo', 'Foo Information')
    else:
        group = f.root.foo

    # first column: atom built from a numpy dtype
    if "col1" not in group:
        a = tb.Atom.from_dtype(np.dtype('<f8'), dflt=0.0)
        arr = f.create_earray(group, 'col1', a, (0,), "Bar")
    else:
        arr = getattr(group, "col1")
    arr.append(np.arange(10))
    arr.append(np.arange(40, 45))

    # second column: atom created directly
    if "col2" not in group:
        b = tb.Int64Atom()
        arr = f.create_earray(group, 'col2', b, (0,), "Baz")
    else:
        arr = getattr(group, "col2")
    arr.append(np.arange(7))
    arr.append(np.arange(30, 38))
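To verify that repeated runs really keep extending the columns, the file can be reopened and the arrays read back. A minimal sketch, following the same layout as above:
with tb.File(filename, "r") as f:
    col1 = f.root.foo.col1
    col2 = f.root.foo.col2
    print(col1.shape, col2.shape)   # shapes grow with every run of the block above
    print(col1[:5], col2[:5])       # EArrays support NumPy-style slicing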

Related

What is the most efficient way of creating a tf.dataset from multiple json.gz files with multiple text records?

I have thousands of json.gz files, each with a variety of information about scientific papers. For each file, I have to extract the relevant information - e.g. title and labels - to make a dataset, then transform it to a tf.dataset. However, this is inefficient, since I cannot filter the subjects directly or shuffle them in a single step.
I would like to read them using tf.dataset.interleave in order to shuffle them, but also to filter them according to specific labels.
Here is how I'm doing it so far.
import gzip
import json
import datetime
import tensorflow as tf
import pandas as pd

# For relevant feature extraction
def load_file(file):
    # with gzip.open(bytes.decode(file), 'r') as fin:  # 4. gzip
    with gzip.open(file, 'r') as fin:
        json_bytes = fin.read()
    json_str = json_bytes.decode('utf-8')  # 2. string (i.e. JSON)
    bb = json.loads(json_str)
    bb = pd.json_normalize(bb, 'items', ['indexed', ['title', 'publisher', 'type', 'indexed.date-parts', 'subject']],
                           errors='ignore')
    bb.dropna(subset=['title', 'publisher', 'type', 'indexed.date-parts', 'subject'], inplace=True)
    # 'themes' is a dict mapping subject names to integer labels, defined elsewhere
    bb.subject = bb.subject.apply(lambda x: int(themes[list(set(x) & set(list(themes.keys())))[0]])
                                  if len(list(set(x) & set(list(themes.keys())))) > 0
                                  else len(list(themes.keys())) + 1)
    bb.title = bb.title.str.join('').values
    # bb['author'] = bb['author'].apply(lambda x: '; '.join([', '.join([i['given'], i['family']]) for i in x]))
    bb['indexed.date-parts'] = bb['indexed.date-parts'].apply(
        lambda tpl: datetime.datetime.strptime('-'.join(str(x) for x in tpl[0]), '%Y-%m-%d').strftime('%Y-%m-%d'))
    # bb = bb.sample(n=32, replace=True)
    # return bb.title.str.join('').values, bb.subject.str.join(', ').values
    return dict(bb[['title', 'publisher', 'type', 'indexed.date-parts', 'subject']])

file_list = ['file_2021_01/10625.json.gz',
             'file_2021_01/23897.json.gz',
             'file_2021_01/12169.json.gz',
             'file_2021_01/427.json.gz', ...]

filenames = tf.data.Dataset.list_files(file_list, shuffle=True)
dataset = filenames.apply(
    tf.data.experimental.parallel_interleave(
        lambda x: tf.data.Dataset.from_tensor_slices(tf.numpy_function(load_file, [x], (tf.int64))),
        cycle_length=1))
However, it results in an error:
InternalError: Unsupported object type dict
[[{{node PyFunc}}]] [Op:IteratorGetNext]
Thanks
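As a side note, tf.numpy_function can only return NumPy arrays whose dtypes are listed in Tout, so returning a Python dict is what triggers the "Unsupported object type dict" error above. A rough sketch of one possible workaround is to return plain arrays and rebuild the dict on the TensorFlow side; load_file_dataframe below is a hypothetical helper standing in for the pandas preprocessing above, and the column names and dtypes are assumptions:
import numpy as np
import tensorflow as tf

def load_file_arrays(file):
    # same pandas preprocessing as load_file(), but returning plain NumPy arrays
    bb = load_file_dataframe(file)  # hypothetical helper returning the cleaned DataFrame
    titles = bb['title'].astype(str).str.encode('utf-8').to_numpy()
    subjects = bb['subject'].to_numpy(dtype=np.int64)
    return titles, subjects

def make_dataset(path):
    titles, subjects = tf.numpy_function(load_file_arrays, [path], (tf.string, tf.int64))
    titles.set_shape([None])      # numpy_function loses shape information
    subjects.set_shape([None])
    return tf.data.Dataset.from_tensor_slices({'title': titles, 'subject': subjects})

dataset = filenames.interleave(make_dataset, cycle_length=1)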

discrete numpy array to continuous array

I have some discrete data in an array, such that:
arr = np.array([[1,1,1],[2,2,2],[3,3,3],[2,2,2],[1,1,1]])
which, when flattened and plotted, looks like a series of flat steps.
I also have an index array, such that each unique value in arr is associated with a unique index value, like:
ind = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5]])
What is the most pythonic way of converting arr from discrete values to continuous values, so that the plotted array ramps smoothly between the levels instead of stepping? In other words, I want to interpolate between the discrete points to make continuous data.
I found a solution to this, in case anyone has a similar issue. It is maybe not the most elegant, so modifications are welcome:
def ref_linear_interp(x, y):
    arr = []
    ux = np.unique(x)  # unique x values
    for u in ux:
        idx = y[x == u]
        # look backwards (note: these values are overwritten by the block below)
        try:
            min = y[x == u - 1][0]
            max = y[x == u][0]
        except IndexError:
            min = y[x == u][0]
            max = y[x == u][0]
        # look forwards: ramp from the current block towards the next one
        try:
            min = y[x == u][0]
            max = y[x == u + 1][0]
        except IndexError:
            min = y[x == u][0]
            max = y[x == u][0]
        if min == max:
            sub = np.full((len(idx)), min)
            arr.append(sub)
        else:
            sub = np.linspace(min, max, len(idx))
            arr.append(sub)
    return np.concatenate(arr, axis=None).ravel()
y = np.array([[1,1,1],[2,2,2],[3,3,3],[2,2,2],[1,1,1]])
x = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5]])
z = np.arange(1, 16, 1)
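One way to call the helper with the arrays above (my assumption about how the pieces were meant to fit together; z just serves as the x-axis for plotting):
import matplotlib.pyplot as plt

y_cont = ref_linear_interp(x, y)      # the boolean masks flatten the 2D inputs
plt.plot(z, y_cont, marker='*')       # 15 interpolated points against z = 1..15
plt.show()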
Here is an answer for the symmetric solution that I would expect when reading the question:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# create the data as described
numbers = [1,2,3,2,1]
nblock = 3
df = pd.DataFrame({
    "x": np.arange(nblock*len(numbers)),
    "y": np.repeat(numbers, nblock),
    "label": np.repeat(np.arange(len(numbers)), nblock)
})
Assuming a constant block size of 3, we could use a rolling window:
df['y-smooth'] = df['y'].rolling(nblock, center=True).mean()
# fill the NaNs produced at the edges of the rolling window
df['y-smooth'] = df['y-smooth'].bfill().ffill()
plt.plot(df['x'], df['y-smooth'], marker='*')
If the block size is allowed to vary, we could determine the block centers and interpolate piecewise.
centers = df[['x', 'y', 'label']].groupby('label').mean()
df['y-interp'] = np.interp(df['x'], centers['x'], centers['y'])
plt.plot(df['x'], df['y-interp'], marker='*')
Note: you may also try centers = df[['x', 'y', 'label']].groupby('label').min() to select the left corners of the labelled blocks.

Filtering list based on array columns to create a [dict(str:list)] result

My table looks like this (df):
category   product_in_cat
cat1       [A,B,C]
cat2       [E,F,G]
"category" is str, and product_in_cat is list type. I have a list:product=[A,B,G]
I want to get a final [dict(str: list)] that looks like:
[{cat1:[A,B]},{cat2:[G]}]
I think I can use the code below:
list1 = []
for inde, row in df.iterrows():
    list1.append({row['category']: row['product_in_cat'] in product})
I know this part is not correct (row['product_in_cat'] in product), but I am not sure how to filter the list column based on the given "product" list. Please help, and thank you in advance!
You can use np.intersect1d to find the common part of two lists:
import numpy as np
df_ = df['product_in_cat'].apply(lambda x: np.intersect1d(x, product).tolist())
l = [{k: v} for k, v in zip(df['category'], df_)]
print(l)
[{'cat1': ['A', 'B']}, {'cat2': ['G']}]
You can convert each list in the column to a set and use intersection with the external product list:
import pandas as pd
lst = ['A','B','G']
data = {'category': ['cat 1', 'cat 2'],
        'product_in_cat': [['A','B','C'], ['E','F','G']]}
df = pd.DataFrame(data)
dict(zip(df['category'],df['product_in_cat'].apply(lambda x: set(x).intersection(lst))))
#output
{'cat 1': {'A', 'B'}, 'cat 2': {'G'}}
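If the result has to match the requested [dict(str: list)] shape exactly (lists rather than sets, one dict per category), the same intersection can be wrapped once more. A small sketch based on the df and lst defined above; sorted() is only there to give a deterministic order:
result = [{cat: sorted(set(prods).intersection(lst))}
          for cat, prods in zip(df['category'], df['product_in_cat'])]
print(result)
# [{'cat 1': ['A', 'B']}, {'cat 2': ['G']}]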

Scikit-pandas, cross val score number of features

I am getting familiar with scikit-learn and its pandas integration using the Titanic tutorial on Kaggle. I have cleaned my data and would like to make some predictions. I can do this by calling fit and predict on a pipeline; unfortunately, I get an error when trying to do the same with cross_val_score.
I am using cross_val_score from sklearn-pandas.
The code is as follows:
mapping = [
    ('Age', None),
    ('Embarked', LabelBinarizer()),
    ('Fare', None),
    ('Pclass', LabelBinarizer()),
    ('Sex', LabelBinarizer()),
    ('Group', LabelBinarizer()),
    ('familySize', None),
    ('familyType', LabelBinarizer()),
    ('Title', LabelBinarizer())
]
pipe = Pipeline([
    ('featurize', DataFrameMapper(mapping)),
    ('logReg', LogisticRegression())
])
X = df_train[df_train.columns.drop('Survived')]
y = df_train['Survived']
#model = pipe.fit(X = X, y = y)
#prediction = model.predict(df_train)
score = cross_val_score(pipe, X = X, y = y, scoring = 'accuracy')
df_train is a pandas DataFrame containing my whole training set, including outcomes. The two commented lines:
model = pipe.fit(X = X, y = y)
prediction = model.predict(df_train)
work fine, and prediction returns an array with the predicted outcomes. Using the same data with cross_val_score, I get the following error:
X has 20 features per sample; expecting 19
The full code is below; it can be run with the Titanic CSV files from Kaggle (https://www.kaggle.com/c/titanic/data):
#%% Libraries import
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
#%% Read the data
path = 'E:/Kaggle/Titanic/Data/'
file_training = 'train.csv'
file_test = 'test.csv'
#Import the training and test dataset and concatenate them
df_training = pd.read_csv(path + file_training, header = 0, index_col = 'PassengerId')
df_test = pd.read_csv(path + file_test, header = 0, index_col = 'PassengerId')
# Work on the concatenated training and test data for feature engineering and clean-up
df = pd.concat([df_training, df_test], keys = ['train','test'])
#%% Initial data exploration and cleaning
df.describe(include = 'all')
pd.isnull(df).sum() > 0
#%% Preprocessing and Cleanup
#Create new columns from the name (to identify individuals that are part of a family)
df['LName'] = df['Name'].apply(lambda x:x.split(',')[0].strip())
df['FName'] = df['Name'].apply(lambda x:x.split(',')[1].split('.')[1].strip())
#Get the title
df['Title'] = df['Name'].apply(lambda x:x.split(',')[1].split('.')[0].strip())
titleDic = {
    'Master' : 'kid',
    'Mlle' : 'unmarriedWoman',
    'Miss' : 'unmarriedWoman',
    'Ms' : 'unmarriedWoman',
    'Jonkheer' : 'noble',
    'Don' : 'noble',
    'Dona' : 'noble',
    'Sir' : 'noble',
    'Lady' : 'noble',
    'the Countess' : 'noble',
    'Capt' : 'ranked',
    'Major' : 'ranked',
    'Col' : 'ranked',
    'Mr' : 'standard',
    'Mme' : 'standard',
    'Mrs' : 'standard',
    'Dr' : 'academic',
    'Rev' : 'academic'
}
df['Group'] = df['Title'].map(titleDic)
#%% Working with the family size
#Get the family size
df['familySize'] = df['Parch'] + df['SibSp'] + 1
#Add a family tag (single, couple, small, large)
df['familyType'] = pd.cut(df['familySize'],
                          [1,2,3,5,np.inf],
                          labels = ['single','couple','sFamily','bFamily'],
                          right = False)
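# Quick sanity check of the binning above (my reading of the intervals, not part of the
# original script): with right=False the bins are [1,2) -> single, [2,3) -> couple,
# [3,5) -> sFamily, [5,inf) -> bFamily.
print(pd.cut(pd.Series([1, 2, 3, 4, 5, 8]),
             [1, 2, 3, 5, np.inf],
             labels=['single', 'couple', 'sFamily', 'bFamily'],
             right=False).tolist())
# -> ['single', 'couple', 'sFamily', 'sFamily', 'bFamily', 'bFamily']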
#%% Filling empty values
#Fill empty values with the mean or mode for the column
#Fill missing Age values with the median age per Group and Sex. Store the value in the AgeFull column
agePivot = pd.DataFrame(df.groupby(['Group', 'Sex'])['Age'].median())
agePivot.columns = ['AgeFull']
df = pd.merge(df, agePivot, left_on = ['Group', 'Sex'], right_index = True)
df.loc[df['Age'].isnull(),['Age']] = df['AgeFull']
#Embark location missing values
embarkPivot = pd.DataFrame(df.groupby(['Group'])['Embarked'].agg(lambda x:x.value_counts().index[0]))
embarkPivot.columns = ['embarkFull']
df = pd.merge(df, embarkPivot, left_on = ['Group'], right_index = True)
df.loc[df['Embarked'].isnull(),['Embarked']] = df['embarkFull']
#Fill the missing fare value
df.loc[df['Fare'].isnull(), 'Fare'] = df['Fare'].mean()
#%% Final clean-up (drop temporary columns)
df = df.drop(['AgeFull', 'embarkFull'], 1)
#%% Preparation for training
df_train = df.loc['train']
df_test = df.loc['test']
#Creation of dummy variables
mapping = [
    ('Age', None),
    ('Embarked', LabelBinarizer()),
    ('Fare', None),
    ('Pclass', LabelBinarizer()),
    ('Sex', LabelBinarizer()),
    ('Group', LabelBinarizer()),
    ('familySize', None),
    ('familyType', LabelBinarizer()),
    ('Title', LabelBinarizer())
]
pipe = Pipeline(steps = [
    ('featurize', DataFrameMapper(mapping)),
    ('logReg', LogisticRegression())
])
#Uncommenting the line below fixes the code - why?
#df_train = df_train.sort_index()
X = df_train[df_train.columns.drop(['Survived'])]
y = df_train.Survived
score = cross_val_score(pipe, X = df_train, y = df_train.Survived, scoring = 'accuracy')
This is very interesting. I solved the issue just by sorting the DataFrame by its index before passing it to cross_val_score with the pipeline.
df_train = df_train.sort_index()
Could anyone explain to me why this has an impact on how scikit-learn works?
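I don't have a confirmed explanation either, but one way to narrow it down is to refit the featurizer on each fold manually and compare the number of columns it produces for the train and test parts. A rough diagnostic sketch using plain scikit-learn KFold (not the sklearn-pandas wrapper), reusing X and mapping from the code above:
from sklearn.model_selection import KFold

kf = KFold(n_splits=3)
for fold, (tr, te) in enumerate(kf.split(X)):
    mapper = DataFrameMapper(mapping)                 # refit per fold, as cross_val_score would
    train_cols = mapper.fit_transform(X.iloc[tr]).shape[1]
    test_cols = mapper.transform(X.iloc[te]).shape[1]
    print('fold', fold, 'train columns:', train_cols, 'test columns:', test_cols)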

Iterator with memory?

I'm working on an application which uses a Markov chain.
An example of this code follows:
chain = MarkovChain(order=1)
train_seq = ["","hello","this","is","a","beautiful","world"]
for i, word in enumerate(train_seq):
    chain.train(previous_state=train_seq[i-1], next_state=word)
What I am looking for is to iterate over train_seq, but to keep the last N elements available at each step.
for states in unknown(train_seq, order=1):
    # states should be a list of states, with states[-1] the newest word,
    # and states[:-1] the previous occurrences of the iteration.
    chain.train(*states)
Hope the description of my problem is clear enough.
window will give you n items from iterable at a time.
from collections import deque
def window(iterable, n=3):
    it = iter(iterable)
    d = deque(maxlen = n)
    for elem in it:
        d.append(elem)
        yield tuple(d)
print [x for x in window([1, 2, 3, 4, 5])]
# [(1,), (1, 2), (1, 2, 3), (2, 3, 4), (3, 4, 5)]
If you want the same number of items even for the first few iterations,
from collections import deque
from itertools import islice
def window(iterable, n=3):
    it = iter(iterable)
    d = deque((next(it) for Null in range(n-1)), n)
    for elem in it:
        d.append(elem)
        yield tuple(d)
print [x for x in window([1, 2, 3, 4, 5])]
# [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
will do that.
seq = [1,2,3,4,5,6,7]
for w in zip(seq, seq[1:]):
    print w
You can also do the following to create arbitrarily-sized tuples:
tuple_size = 2
for w in zip(*(seq[i:] for i in range(tuple_size))):
    print w
Edit: but it's probably better to use the iterator version of zip:
from itertools import izip
tuple_size = 4
for w in izip(*(seq[i:] for i in range(tuple_size))):
    print w
I tried this on my system with seq being 10,000,000 integers and the results were fairly instant.
Improving upon yan's answer to avoid copies:
from itertools import izip, tee

def staggered_iterators(sequence, count):
    # yield `count` iterators over `sequence`, each starting one element later
    iterator = iter(sequence)
    for i in xrange(count):
        result, iterator = tee(iterator)
        yield result
        next(iterator)

tuple_size = 4
for w in izip(*staggered_iterators(seq, tuple_size)):
    print w
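For completeness, the same windowing idea in Python 3 (no izip/xrange); a small self-contained sketch, not taken from the answers above:
from collections import deque
from itertools import islice

def window(iterable, n=2):
    # yield overlapping tuples of n consecutive items from iterable
    it = iter(iterable)
    d = deque(islice(it, n - 1), maxlen=n)   # pre-fill with the first n-1 items
    for elem in it:
        d.append(elem)
        yield tuple(d)

print(list(window([1, 2, 3, 4, 5], n=3)))
# [(1, 2, 3), (2, 3, 4), (3, 4, 5)]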