Iterator with memory?

I'm working on an application which uses a Markov chain.
An example of the code follows:
chain = MarkovChain(order=1)
train_seq = ["", "hello", "this", "is", "a", "beautiful", "world"]
for i, word in enumerate(train_seq):
    chain.train(previous_state=train_seq[i-1], next_state=word)
What I am looking for is a way to iterate over train_seq while keeping the last N elements.
for states in unknown(train_seq, order=1):
    # states should be a list of states, with states[-1] the newest word,
    # and states[:-1] the previous occurrences of the iteration.
    chain.train(*states)
Hope the description of my problem is clear enough.

window will give you n items from iterable at a time.
from collections import deque

def window(iterable, n=3):
    it = iter(iterable)
    d = deque(maxlen=n)
    for elem in it:
        d.append(elem)
        yield tuple(d)

print [x for x in window([1, 2, 3, 4, 5])]
# [(1,), (1, 2), (1, 2, 3), (2, 3, 4), (3, 4, 5)]
If you want every window to contain the same number of items, even for the first few iterations,
from collections import deque

def window(iterable, n=3):
    it = iter(iterable)
    d = deque((next(it) for _ in range(n - 1)), maxlen=n)
    for elem in it:
        d.append(elem)
        yield tuple(d)

print [x for x in window([1, 2, 3, 4, 5])]
# [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
will do that.
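For the original Markov-chain use case, this pre-filled window generator can be plugged straight into the question's loop. A minimal sketch, assuming MarkovChain and a train method that accepts the states positionally (both come from the question, not from a real library):
order = 1
chain = MarkovChain(order=order)
train_seq = ["", "hello", "this", "is", "a", "beautiful", "world"]

# each window holds order+1 states: states[:-1] is the history, states[-1] the newest word
for states in window(train_seq, n=order + 1):
    chain.train(*states)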

seq = [1, 2, 3, 4, 5, 6, 7]
for w in zip(seq, seq[1:]):
    print w
You can also do the following to create arbitrarily sized tuples:
tuple_size = 2
for w in zip(*(seq[i:] for i in range(tuple_size))):
    print w
Edit: but it's probably better to use the iterator version, izip:
from itertools import izip

tuple_size = 4
for w in izip(*(seq[i:] for i in range(tuple_size))):
    print w
I tried this on my system with seq being 10,000,000 integers and the results were fairly instant.
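As an aside (not part of the original answer): in Python 3, itertools.izip is gone and the built-in zip is already lazy, so the same idea would look like this:
# Python 3 sketch: zip returns an iterator, so no izip is needed
seq = [1, 2, 3, 4, 5, 6, 7]
tuple_size = 4
for w in zip(*(seq[i:] for i in range(tuple_size))):
    print(w)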

Improving upon yan's answer to avoid copies:
from itertools import izip, tee

def staggered_iterators(sequence, count):
    iterator = iter(sequence)
    for i in xrange(count):
        result, iterator = tee(iterator)
        yield result
        next(iterator)

tuple_size = 4
for w in izip(*staggered_iterators(seq, tuple_size)):
    print w
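With seq from yan's answer ([1, 2, 3, 4, 5, 6, 7]) and tuple_size = 4, the fixed loop should print the four sliding windows (my expectation from tracing the code, not output given in the original post):
(1, 2, 3, 4)
(2, 3, 4, 5)
(3, 4, 5, 6)
(4, 5, 6, 7)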

Related

Randomly select items from two equally sized tensors

Assume that we have two equally sized tensors of size batch_size * 1. For each index in the batch dimension we want to choose randomly between the two tensors. My solution was to create an indices tensor that contains random 0 or 1 indices of size batch_size and use those to index_select from the concatenation of the two tensors. However, to do so I had to "view" that cat'ed tensor, and the solution ended up being quite "ugly":
import torch
bs = 8
a = torch.zeros(bs, 1)
print("a size", a.size())
b = torch.ones(bs, 1)
c = torch.cat([a, b], dim=-1)
print(c)
print("c size", c.size())
# create bs number of random 0 and 1's
indices = torch.randint(0, 2, [bs])
print("idxs size", indices.size())
print("idxs", indices)
# use `indices` to slice the `cat`ted tensor
d = c.view(1, -1).index_select(-1, indices).view(-1, 1)
print("d size", d.size())
print(d)
I am wondering whether there is a prettier and, more importantly, more efficient solution.
Posting two answers that I got over at the PyTorch forums:
import torch
bs = 8
a = torch.zeros(bs, 1)
b = torch.ones(bs, 1)
c = torch.cat([a, b], dim=-1)
choices_flat = c.view(-1)
# index = torch.randint(choices_flat.numel(), (bs,))
# or if replace = False
index = torch.randperm(choices_flat.numel())[:bs]
select = choices_flat[index]
print(select)
import torch
bs = 8
a = torch.zeros(bs, 1)
print("a size", a.size())
b = torch.ones(bs, 1)
idx = torch.randint(2 * bs, (bs,))
d = torch.cat([a, b])[idx] # [bs, 1]
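If the goal is to choose between a[i] and b[i] independently for each batch index (my reading of the question), another possibility is torch.where with a random boolean mask. This is my own sketch, not one of the forum answers:
import torch
bs = 8
a = torch.zeros(bs, 1)
b = torch.ones(bs, 1)
# random boolean mask of shape [bs, 1]: True picks from a, False picks from b
mask = torch.randint(0, 2, (bs, 1)).bool()
d = torch.where(mask, a, b)  # [bs, 1]
print(d)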

Extending array in HDF5 using PyTables

I normally use h5py to do the HDF5 stuff in Python, and if I want to create a dataset which I want to extend later on, I do:
f = h5py.File('foo.h5', 'w')
d = f.create_dataset('whatever', (5, 5), maxshape=(None, 5), dtype='i8', chunks=True)
...
d.resize((23, 5))
...
The maxshape=(None, ...) sets the first dimension to "infinity", so it's extensible.
Now I have a project where I need to stick with PyTables and wanted to build up large arrays step by step. Is there a way to extend arrays in PyTables?
This is roughly the idea:
import tables as tb
import numpy as np
filename = "foo.h5"
h5file = tb.File(filename, "a")
gbar = h5file.create_group(h5file.root, "bar", "Pressure")
h5file.create_array(gbar, 'left', np.array((1, 2, 3, 4)), "...")
# now extend the shape of (4,) and append more arrays iteratively???
h5file.close()
I found the solution in the docs: tables.EArray
http://www.pytables.org/usersguide/libref/homogenous_storage.html#earrayclassdescr
Here is a descriptive example which adds two "columns" using two different ways of defining the dtype. The with block can be run multiple times and it will keep extending the columns:
import tables as tb
import numpy as np

filename = 'foo.h5'

with tb.File(filename, "a") as f:
    if "/foo" not in f:
        group = f.create_group("/", 'foo', 'Foo Information')
    else:
        group = f.root.foo

    if "col1" not in group:
        a = tb.Atom.from_dtype(np.dtype('<f8'), dflt=0.0)
        arr = f.create_earray(group, 'col1', a, (0,), "Bar")
    else:
        arr = getattr(group, "col1")
    arr.append(np.arange(10))
    arr.append(np.arange(40, 45))

    if "col2" not in group:
        b = tb.Int64Atom()
        arr = f.create_earray(group, 'col2', b, (0,), "Baz")
    else:
        arr = getattr(group, "col2")
    arr.append(np.arange(7))
    arr.append(np.arange(30, 38))
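To verify that repeated runs really do keep extending the columns, here is a small read-back sketch (assuming the file and node names from the example above):
import tables as tb

with tb.open_file('foo.h5', 'r') as f:
    col1 = f.root.foo.col1
    col2 = f.root.foo.col2
    print(col1.shape, col2.shape)  # each run of the with block above appends 15 rows to each column
    print(col1[:5])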

Cython Typing List of Strings

I'm trying to use cython to improve the performance of a loop, but I'm running
into some issues declaring the types of the inputs.
How do I include a field in my typed struct which is a string that can be
either 'front' or 'back'?
I have a np.recarray that looks like the following (note the length of the
recarray is unknown at compile time):
import numpy as np
weights = np.recarray(4, dtype=[('a', np.int64), ('b', np.str_, 5), ('c', np.float64)])
weights[0] = (0, "front", 0.5)
weights[1] = (0, "back", 0.5)
weights[2] = (1, "front", 1.0)
weights[3] = (1, "back", 0.0)
as well as inputs of a list of strings and a pandas.Timestamp
import pandas as pd
ts = pd.Timestamp("2015-01-01")
contracts = ["CLX16", "CLZ16"]
I am trying to cythonize the following loop
def ploop(weights, contracts, timestamp):
    cwts = []
    for gen_num, position, weighting in weights:
        if weighting != 0:
            if position == "front":
                cntrct_idx = gen_num
            elif position == "back":
                cntrct_idx = gen_num + 1
            else:
                raise ValueError("transition.columns must contain "
                                 "'front' or 'back'")
            cwts.append((gen_num, contracts[cntrct_idx], weighting, timestamp))
    return cwts
My attempt involved typing the weights input as a struct in cython,
in a file struct_test.pyx as follows
import numpy as np
cimport numpy as np

cdef packed struct tstruct:
    np.int64_t gen_num
    char[5] position
    np.float64_t weighting

def cloop(tstruct[:] weights_array, contracts, timestamp):
    cdef tstruct weights
    cdef int i
    cdef int cntrct_idx

    cwts = []
    for k in xrange(len(weights_array)):
        w = weights_array[k]
        if w.weighting != 0:
            if w.position == "front":
                cntrct_idx = w.gen_num
            elif w.position == "back":
                cntrct_idx = w.gen_num + 1
            else:
                raise ValueError("transition.columns must contain "
                                 "'front' or 'back'")
            cwts.append((w.gen_num, contracts[cntrct_idx], w.weighting,
                         timestamp))
    return cwts
But I am receiving runtime errors, which I believe are related to the
char[5] position.
import pyximport
pyximport.install()
import struct_test
struct_test.cloop(weights, contracts, ts)
ValueError: Does not understand character buffer dtype format string ('w')
In addition I am a bit unclear how I would go about typing contracts as well
as timestamp.
Your ploop (without the timestamp variable) produces:
In [226]: ploop(weights, contracts)
Out[226]: [(0, 'CLX16', 0.5), (0, 'CLZ16', 0.5), (1, 'CLZ16', 1.0)]
Equivalent function without a loop:
def ploopless(weights, contracts):
    arr_contracts = np.array(contracts)  # to allow array indexing
    wgts1 = weights[weights['c'] != 0]
    mask = wgts1['b'] == 'front'
    wgts1['b'][mask] = arr_contracts[wgts1['a'][mask]]
    mask = wgts1['b'] == 'back'
    wgts1['b'][mask] = arr_contracts[wgts1['a'][mask] + 1]
    return wgts1.tolist()
In [250]: ploopless(weights, contracts)
Out[250]: [(0, 'CLX16', 0.5), (0, 'CLZ16', 0.5), (1, 'CLZ16', 1.0)]
I'm taking advantage of the fact that the returned list of tuples has the same (int, str, float) layout as the input weights array. So I'm just making a copy of weights and replacing selected values of the b field.
Note that I use the field-selection index before the mask one. The boolean mask produces a copy, so we have to be careful about indexing order.
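To illustrate the indexing-order point with a throwaway recarray (a sketch of my own, not from the answer):
import numpy as np

r = np.recarray(3, dtype=[('a', np.int64), ('b', np.str_, 5)])
r['a'] = [0, 1, 2]
r['b'] = ['x', 'y', 'z']
mask = r['a'] > 0

r['b'][mask] = 'hit'    # field first, then mask: writes into r
# r[mask]['b'] = 'no'   # mask first returns a copy, so this write would be lost
print(r['b'])           # ['x' 'hit' 'hit']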
I'm guessing that loop-less array version will be competitive in time with the cloop (on realistic arrays). The string and list operations in cloop probably limit its speedup.

Compute value of variable for multiple input values

I have a tensorflow graph which is trained. After training, I want to sample one variable for multiple intermediate values. Simplified:
a = tf.placeholder(tf.float32, [1])
b = a + 10
c = b * 10
Now I want to query c for values of b. Currently, I am using an outer loop:
b_values = [0, 1, 2, 3, 4, 5]
samples = []
for b_value in b_values:
    samples += [sess.run(c, feed_dict={b: [b_value]})]
This loop takes quite a bit of time, I think it is because b_values contains 5000 values in my case. Is there a way of running sess.run only once, and passing all b_values at once? I cannot really modify the graph a->b->c, but I could add something to it if that helps.
You could do it as follows:
import tensorflow as tf
import numpy as np
import time

a = tf.placeholder(tf.float32, [None, 1])
b = a + 10
c = b * 10

sess = tf.Session()

b_values = np.random.randint(500, size=(5000, 1))

samples = []
t = time.time()
for b_value in b_values:
    samples += [sess.run(c, feed_dict={b: [b_value]})]
print time.time() - t
#print samples

t = time.time()
samples = sess.run(c, feed_dict={b: b_values})
print time.time() - t
#print samples
Output: (time in seconds)
0.874449968338
0.000532150268555
Hope this helps!
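A side note on why this works (my reading, not stated in the answer): in TF 1.x, feed_dict can override the value of any tensor in the graph, not only placeholders, so b can be fed directly even though only a is a placeholder; the fed array just has to match b's shape of [None, 1]:
# b is not a placeholder, but feed_dict can still override it
samples = sess.run(c, feed_dict={b: [[0.], [1.], [2.]]})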

apply generic function in a vectorized fashion using numpy/pandas

I am trying to vectorize my code and, thanks in large part to some users (https://stackoverflow.com/users/3293881/divakar, https://stackoverflow.com/users/625914/behzad-nouri), I was able to make huge progress. Essentially, I am trying to apply a generic function (in this case max_dd_array_ret) to each of the bins I found (see vectorize complex slicing with pandas dataframe for details on the date vectorization and Start, End and Duration of Maximum Drawdown in Python for the rationale behind max_dd_array_ret). The problem is the following: I should be able to obtain the result df_2, and ranged_DD(asd_1.values, starts, ends+1) is, to some degree, what I am looking for, except for the tragic effect that it's as if the first two bins were merged and the last one is missing, as can be gauged by looking at the results.
Any explanation and fix is very welcome.
import pandas as pd
import numpy as np
from time import time
from scipy.stats import binned_statistic

def max_dd_array_ret(xs):
    xs = (xs + 1).cumprod()
    i = np.argmax(np.maximum.accumulate(xs) - xs)  # end of the period
    j = np.argmax(xs[:i])
    max_dd = abs(xs[j] / xs[i] - 1)
    return max_dd if max_dd is not None else 0

def get_ranges_arr(starts, ends):
    # Taken from https://stackoverflow.com/a/37626057/3293881
    counts = ends - starts
    counts_csum = counts.cumsum()
    id_arr = np.ones(counts_csum[-1], dtype=int)
    id_arr[0] = starts[0]
    id_arr[counts_csum[:-1]] = starts[1:] - ends[:-1] + 1
    return id_arr.cumsum()

def ranged_DD(arr, starts, ends):
    # Get all indices and the IDs corresponding to same groups
    idx = get_ranges_arr(starts, ends)
    id_arr = np.repeat(np.arange(starts.size), ends - starts)
    slice_arr = arr[idx]
    return binned_statistic(id_arr, slice_arr, statistic=max_dd_array_ret)[0]

asd_1 = pd.Series(0.01 * np.random.randn(500), index=pd.date_range('2011-1-1', periods=500)).pct_change()

index_1 = pd.to_datetime(['2011-2-2', '2011-4-3', '2011-5-1',
                          '2011-7-2', '2011-8-3', '2011-9-1',
                          '2011-10-2', '2011-11-3', '2011-12-1',
                          '2012-1-2', '2012-2-3', '2012-3-1'])
index_2 = pd.to_datetime(['2011-2-15', '2011-4-16', '2011-5-17',
                          '2011-7-17', '2011-8-17', '2011-9-17',
                          '2011-10-17', '2011-11-17', '2011-12-17',
                          '2012-1-17', '2012-2-17', '2012-3-17'])

starts = asd_1.index.searchsorted(index_1)
ends = asd_1.index.searchsorted(index_2)

df_2 = pd.DataFrame([max_dd_array_ret(asd_1.loc[i:j]) for i, j in zip(index_1, index_2)], index=index_1)

print(df_2[0].values)
print(ranged_DD(asd_1.values, starts, ends+1))
results:
df_2
[ 1.75893509 6.08002911 2.60131797 1.55631781 1.8770067 2.50709085
1.43863472 1.85322338 1.84767224 1.32605754 1.48688414 5.44786663]
ranged_DD(asd_1.values, starts, ends+1)
[ 6.08002911 2.60131797 1.55631781 1.8770067 2.50709085 1.43863472
1.85322338 1.84767224 1.32605754 1.48688414]
which are identical except for the first two:
[ 1.75893509 6.08002911 vs [ 6.08002911
and the last two
1.48688414 5.44786663] vs 1.48688414]
P.S.: while looking in more detail at the docs (http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binned_statistic.html) I found that this might be the problem:
"All but the last (righthand-most) bin is half-open. In other words,
if bins is [1, 2, 3, 4], then the first bin is [1, 2) (including 1,
but excluding 2) and the second [2, 3). The last bin, however, is [3,
4], which includes 4. New in version 0.11.0."
The problem is I don't know how to change that default.
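A hedged guess at a fix, based on the quoted docs: binned_statistic defaults to bins=10, so the 12 group IDs (0 through 11) get squeezed into 10 bins, which would merge neighbouring groups. Passing explicit bin edges, one bin per group ID, should keep them separate (a sketch reusing get_ranges_arr and max_dd_array_ret from above; untested against the original data):
import numpy as np
from scipy.stats import binned_statistic

def ranged_DD_fixed(arr, starts, ends):
    idx = get_ranges_arr(starts, ends)
    id_arr = np.repeat(np.arange(starts.size), ends - starts)
    slice_arr = arr[idx]
    # one bin per group ID: edges at -0.5, 0.5, ..., starts.size - 0.5
    edges = np.arange(starts.size + 1) - 0.5
    return binned_statistic(id_arr, slice_arr, statistic=max_dd_array_ret, bins=edges)[0]

print(ranged_DD_fixed(asd_1.values, starts, ends + 1))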