How to write seed_features that include a conditional statement - pandas

I'm trying to write a seed feature that produces reward if place == 1 else 0.
place and reward are both ft.variable_types.Numeric:
Entity: results
Variables:
id (dtype: index)
place (dtype: numeric)
reward (dtype: numeric)
I've tried the following alternatives with no luck:
Alternative 1
roi = (ft.Feature(es['results']['reward'])
if (ft.Feature(es['results']['place']) == 1)
else 0).rename('roi')
produces AssertionError: Column "roi" missing frome dataframe
when generating the features.
Alternative 2
roi = ((ft.Feature(es['results']['place']) == 1) *
ft.Feature(es['results']['reward'])).rename('roi')
produces AssertionError: Provided inputs don't match input type requirements when assigning the seed feature.
Alternative 2 should work since in Python:
>>> True * 3.14
3.14
>>> False * 3.14
0.0
The full stack trace:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-211-94dd07d98076> in <module>()
23
24
---> 25 roi = ((ft.Feature(es['results']['place']) == 1) * ft.Feature(es['results']['reward'])).rename('roi')
~/dev/venv/lib/python3.6/site-packages/featuretools/feature_base/feature_base.py in __mul__(self, other)
287 def __mul__(self, other):
288 """Multiply by other"""
--> 289 return self._handle_binary_comparision(other, primitives.MultiplyNumeric, primitives.MultiplyNumericScalar)
290
291 def __rmul__(self, other):
~/dev/venv/lib/python3.6/site-packages/featuretools/feature_base/feature_base.py in _handle_binary_comparision(self, other, Primitive, PrimitiveScalar)
230 def _handle_binary_comparision(self, other, Primitive, PrimitiveScalar):
231 if isinstance(other, FeatureBase):
--> 232 return Feature([self, other], primitive=Primitive)
233
234 return Feature([self], primitive=PrimitiveScalar(other))
~/dev/venv/lib/python3.6/site-packages/featuretools/feature_base/feature_base.py in __new__(self, base, entity, groupby, parent_entity, primitive, use_previous, where)
755 primitive=primitive,
756 groupby=groupby)
--> 757 return TransformFeature(base, primitive=primitive)
758
759 raise Exception("Unrecognized feature initialization")
~/dev/venv/lib/python3.6/site-packages/featuretools/feature_base/feature_base.py in __init__(self, base_features, primitive, name)
660 relationship_path=RelationshipPath([]),
661 primitive=primitive,
--> 662 name=name)
663
664 @classmethod
~/dev/venv/lib/python3.6/site-packages/featuretools/feature_base/feature_base.py in __init__(self, entity, base_features, relationship_path, primitive, name, names)
56 self._names = names
57
---> 58 assert self._check_input_types(), ("Provided inputs don't match input "
59 "type requirements")
60
AssertionError: Provided inputs don't match input type requirements

This should work on featuretools v0.11.0. Here is an example using a demo dataset. Both unit_price and total are numeric.
import featuretools as ft
es = ft.demo.load_retail(nrows=100)
es['order_products']
Entity: order_products
Variables:
...
unit_price (dtype: numeric)
total (dtype: numeric)
...
I create the seed feature.
unit_price = ft.Feature(es['order_products']['unit_price'])
total = ft.Feature(es['order_products']['total'])
seed = ((total == 1) * unit_price).rename('seed')
Then, calculate the feature matrix.
fm, fd = ft.dfs(target_entity='customers', entityset=es, seed_features=[seed])
fm.filter(regex='seed').columns.tolist()[:5]
['SUM(order_products.seed)',
'STD(order_products.seed)',
'MAX(order_products.seed)',
'SKEW(order_products.seed)',
'MIN(order_products.seed)']
In your case, this would be the seed feature.
place = ft.Feature(es['results']['place'])
reward = ft.Feature(es['results']['reward'])
roi = ((place == 1) * reward).rename('roi')
Let me know if that helps.

Related

Length mismatch error in ColumnTransformer sklearn v

Length Mismatch error when setting transform_output to "pandas" on the custom transformer (deleting NaN values)
I'm implementing the custom transformer to delete the rows containing NaNs. The code is
from sklearn.base import BaseEstimator,TransformerMixin
class NaRemover(BaseEstimator,TransformerMixin):
def __init__(self):
self._columns = []
def fit(self, X):
self._columns = X.columns.values
return self
def transform(self, X):
X = X.dropna()
return X
It works correctly as standalone.
Then I put it in the ColumnTransformer:
features = X_train.columns.values
ct_nan = ColumnTransformer([('delete_na',NaRemover(),features)])
ct_nan.fit(X_train)
and get the error:
ValueError: Length mismatch: Expected axis has 109 elements, new values have 140 elements
The problem is caused by the function that wraps the output in a pandas DataFrame:
129 # dense_config == "pandas"
--> 130 return _wrap_in_pandas_container(
131 data_to_wrap=data_to_wrap,
132 index=getattr(original_input, "index", None),
As far as I could gather, it checks the integrity of the dataframe index, which I obviously destroy when applying transform (although I don't understand why it should check this at the fit stage).
214 def set_axis(self, axis: int, new_labels: Index) -> None:
215 # Caller is responsible for ensuring we have an Index object.
--> 216 self._validate_set_axis(axis, new_labels)
217 self.axes[axis] = new_labels
218
/usr/local/lib/python3.8/dist-packages/pandas/core/internals/base.py in _validate_set_axis(self, axis, new_labels)
55
56 elif new_len != old_len:
---> 57 raise ValueError(
58 f"Length mismatch: Expected axis has {old_len} elements, new "
59 f"values have {new_len} elements"
Is this how the functionality is supposed to work? Are transformers that change the shape of the dataframe not allowed? And if not, how can I overcome the problem?

Pandas groupby "aggregate" does not see column

I am working on a huge database where I used a pandas apply to categorize the type of client based on the type of product they consumed:
Sample DF:
import pandas as pd
import numpy as np
from datetime import datetime
num_variables = 1000
rng = np.random.default_rng()
data = pd.DataFrame({
'id' : np.random.randint(1,999999999,num_variables),
'date' : [np.random.choice(pd.date_range(datetime(2021,1,1),datetime(2022,12,31))) for i in range(num_variables)],
'product' : [np.random.choice(['giftcards', 'afiliates']) for i in range(num_variables)],
'brand' : [np.random.choice(['brand_1', 'brand_2', 'brand_4', 'brand_6']) for i in range(num_variables)],
'gmv' : rng.random(num_variables) * 100,
'revenue' : rng.random(num_variables) * 100,})
data = data.astype({'product':'category', 'brand':'category'})
base = data.groupby(['id', 'product']).aggregate({'product' : 'count'})
base = base.unstack()
Now I need to group clients by the "type" column and just count how many there are in each group.
First, define the categorization function and apply it:
def setup(row):
if row[('product', 'afiliates')] >= 1 and row[('product', 'giftcards')] == 0:
return 'afiliates'
if row[('product', 'afiliates')] == 0 and row[('product', 'giftcards')] >= 1:
return 'gift'
if row[('product', 'afiliates')] >= 1 and row[('product', 'giftcards')] >= 1:
return 'both'
base['type'] = base.apply(setup, axis=1)
base.reset_index(inplace=True)
So far, so good. If I run a groupby.agg, I get these results:
results = base[['type','id']].groupby(['type'], dropna=False).agg('count')
But if I try aggregate instead of agg, it does not work:
results = base[['type','id']].groupby(['type']).aggregate({'id': 'count'})
Output exceeds the size limit. Open the full output data in a text editor
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[10], line 2
1 #results = base[['type','id']].groupby(['type'], dropna=False).agg('count')
----> 2 results = base[['type','id']].groupby(['type']).aggregate({'id': 'count'})
File c:\Users\fabio\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\groupby\generic.py:894, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
891 func = maybe_mangle_lambdas(func)
893 op = GroupByApply(self, func, args, kwargs)
--> 894 result = op.agg()
895 if not is_dict_like(func) and result is not None:
896 return result
File c:\Users\fabio\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\apply.py:169, in Apply.agg(self)
166 return self.apply_str()
168 if is_dict_like(arg):
--> 169 return self.agg_dict_like()
170 elif is_list_like(arg):
171 # we require a list, but not a 'str'
172 return self.agg_list_like()
File c:\Users\fabio\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\apply.py:478, in Apply.agg_dict_like(self)
475 selected_obj = obj._selected_obj
476 selection = obj._selection
--> 478 arg = self.normalize_dictlike_arg("agg", selected_obj, arg)
...
606 # eg. {'A' : ['mean']}, normalize all to
607 # be list-likes
608 # Cannot use func.values() because arg may be a Series
KeyError: "Column(s) ['id'] do not exist"
What am I missing?
I asked the same question on the pandas GitHub.
They helped me, so I will reproduce the answer here.
You can see how to access your columns using:
print(base.columns.tolist())
[('id', ''), ('product', 'afiliates'), ('product', 'giftcards'), ('type', '')]
When you have a MultiIndex for columns, you need to specify each level as a tuple. So you can do:
base[['type','id']].groupby(['type']).aggregate({('id', ''): 'count'})
Regarding the title of this issue - agg and aggregate are aliases, they do not behave differently.
I suppose there is a bit of an oddity here - why can you do base[['id']] but not specify {'id': ...} in agg? The reason is because column selection can return multiple columns (e.g. in the example here, base[['product']] returns a DataFrame with two columns), whereas agg must have one column and one column only. Thus, it is necessary to specify all levels in agg.

TypeError: Wrong number or type of arguments for overloaded function 'new_Date'

I am new to Python. I am getting an error when running the code below. The issue seems to be with the date. Can someone please help me correct it? I have tried changing the date format in the Excel file, but it does not solve the issue. The Excel file has a list of several bonds. I want to generate the coupon dates of the different bonds.
BondData = pd.read_excel (r'C:\Users\Avishen\Desktop\Python\BONDDATA.xlsx')
Data = pd.DataFrame(BondData)
def scheduledates():
tenor = ql.Period(ql.Semiannual)
day_count = ql.Thirty360
calendar = ql.UnitedStates()
businessConvention = ql.Unadjusted
dateGeneration = ql.DateGeneration.Backward
monthEnd = False
# Dates in Bond Period
return ql.Schedule (issueDate, maturityDate, tenor, calendar, businessConvention,
businessConvention , dateGeneration, monthEnd)
new_df["Dates"]= Data.apply(lambda x: scheduledates(),axis = 1)
new_df["ISIN"] = Data.ISIN
new_df
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-877415e9cf83> in <module>
21 businessConvention , dateGeneration, monthEnd)
22
---> 23 new_df["Dates"]= Data.apply(lambda x: scheduledates(),axis = 1)
24 new_df["ISIN"] = Data.ISIN
25 new_df
~\anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7546 kwds=kwds,
7547 )
-> 7548 return op.get_result()
7549
7550 def applymap(self, func) -> "DataFrame":
~\anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
178 return self.apply_raw()
179
--> 180 return self.apply_standard()
181
182 def apply_empty_result(self):
~\anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
269
270 def apply_standard(self):
--> 271 results, res_index = self.apply_series_generator()
272
273 # wrap results
~\anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
298 for i, v in enumerate(series_gen):
299 # ignore SettingWithCopy here in case the user mutates
--> 300 results[i] = self.f(v)
301 if isinstance(results[i], ABCSeries):
302 # If we have a view on v, we need to make a copy because
<ipython-input-4-877415e9cf83> in <lambda>(x)
21 businessConvention , dateGeneration, monthEnd)
22
---> 23 new_df["Dates"]= Data.apply(lambda x: scheduledates(),axis = 1)
24 new_df["ISIN"] = Data.ISIN
25 new_df
<ipython-input-4-877415e9cf83> in scheduledates()
8
9 def scheduledates():
---> 10 issueDate = ql.Date(Data.issuedate)
11 maturityDate = ql.Date(Data.maturitydate)
12 tenor = ql.Period(ql.Semiannual)
~\anaconda3\lib\site-packages\QuantLib\QuantLib.py in __init__(self, *args)
425
426 def __init__(self, *args):
--> 427 _QuantLib.Date_swiginit(self, _QuantLib.new_Date(*args))
428
429 def weekdayNumber(self):
TypeError: Wrong number or type of arguments for overloaded function 'new_Date'.
Possible C/C++ prototypes are:
Date::Date()
Date::Date(Day,Month,Year)
Date::Date(Day,Month,Year,Hour,Minute,Second,Millisecond,Microsecond)
Date::Date(Day,Month,Year,Hour,Minute,Second,Millisecond)
Date::Date(Day,Month,Year,Hour,Minute,Second)
Date::Date(BigInteger)
Date::Date(std::string const &,std::string)
---------------------------------------------------------------------------
Data = pd.DataFrame(BondData)
Fields from Bond Data
ISIN
issuedate
maturitydate
coupon
Tradeyield
Bond_Price
MarketPrice
Nominal_Amount
From the traceback, the problem is the line:
issueDate = ql.Date(Data.issuedate)
(which for some reason is not in the code you pasted). Coming from Excel, issuedate should be an integer and thus compatible with the ql.Date constructor, but it's possible that pandas is reading it as a string or some other type. You should examine the data frame and check the type of the column. If it's not what you expect, you'll have to figure out if there are data in that column that pandas can't interpret as integers, and either clean them up or force the conversion somehow before passing them to ql.Date.

Numba / Numpy - Understanding Error Message

I'm experimenting with Numba to try and speed up a union-find algorithm I'm working on. Here's some example code. When I experiment with some sample data I cannot understand the type complaint that Numba appears to be raising.
from numba import jit
import numpy as np
indices = np.arange(8806806, dtype=np.int64)
sizes = np.ones(8806806, dtype=np.int64)
connected_components = 8806806
@jit(npython=True)
def root(p: int) -> int:
while p != indices[p]:
indices[p] = indices[indices[p]]
p = indices[p]
return p
@jit(npython=True)
def connected( p: int, q: int) -> bool:
return root(p) == root(q)
@jit(npython=True)
def union( p: int, q: int) -> None:
root1 = root(p)
root2 = root(q)
if root1 == root2:
return
if (sizes[root1] < sizes[root2]):
indices[root1] = root2
sizes[root2] += sizes[root1]
else:
indices[root2] = root1
sizes[root1] += sizes[root2]
connected_components -= 1
@jit(nopython=True)
def process_values(arr):
for row in arr:
typed_arr = row.astype('int64')
for first, second in zip(arr, arr[1:]):
union(first, second)
process_values(
np.array(
[np.array([8018361, 4645960]),
np.array([1137555, 7763897]),
np.array([7532943, 2248813]),
np.array([5352737, 71466, 3590473, 5352738, 2712260])], dtype='object'))
I cannot understand this error:
TypingError Traceback (most recent call last)
<ipython-input-45-62735e65f581> in <module>
44 np.array([1137555, 7763897]),
45 np.array([7532943, 2248813]),
---> 46 np.array([5352737, 71466, 3590473, 5352738, 2712260])], dtype='object'))
/opt/conda/lib/python3.7/site-packages/numba/core/dispatcher.py in _compile_for_args(self, *args, **kws)
399 e.patch_message(msg)
400
--> 401 error_rewrite(e, 'typing')
402 except errors.UnsupportedError as e:
403 # Something unsupported is present in the user code, add help info
/opt/conda/lib/python3.7/site-packages/numba/core/dispatcher.py in error_rewrite(e, issue_type)
342 raise e
343 else:
--> 344 reraise(type(e), e, None)
345
346 argtypes = []
/opt/conda/lib/python3.7/site-packages/numba/core/utils.py in reraise(tp, value, tb)
78 value = tp()
79 if value.__traceback__ is not tb:
---> 80 raise value.with_traceback(tb)
81 raise value
82
TypingError: Failed in nopython mode pipeline (step: nopython frontend)
non-precise type array(pyobject, 1d, C)
[1] During: typing of argument at <ipython-input-45-62735e65f581> (36)
File "<ipython-input-45-62735e65f581>", line 36:
def process_values(arr):
for row in arr:
^
Does this have anything to do with process_values taking an array of irregularly shaped arrays? Any pointers? Thanks!
The problem is that Numba does not accept arrays of dtype 'object'. You seem to be placing arrays inside arrays; you will have to use lists inside lists instead. Look for the typed.List class in Numba: https://numba.pydata.org/numba-doc/dev/reference/pysupported.html#typed-list
Alternatively, you can use awkward arrays: https://github.com/scikit-hep/awkward-1.0

tf.keras.layers.Concatenate() works with a list but fails on a tuple of tensors

This will work:
tf.keras.layers.Concatenate()([features['a'], features['b']])
While this:
tf.keras.layers.Concatenate()((features['a'], features['b']))
Results in:
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
Is that expected? If so, why does it matter what kind of sequence I pass?
Thanks,
Zach
EDIT (adding a code example):
import pandas as pd
import numpy as np
data = {
'a': [1.0, 2.0, 3.0],
'b': [0.1, 0.3, 0.2],
}
with tf.Session() as sess:
ds = tf.data.Dataset.from_tensor_slices(data)
ds = ds.batch(1)
it = ds.make_one_shot_iterator()
features = it.get_next()
concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
try:
while True:
print(sess.run(concat))
except tf.errors.OutOfRangeError:
pass
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-135-0e1a45017941> in <module>()
6 features = it.get_next()
7
----> 8 concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
9
10
google3/third_party/tensorflow/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
751 # the user has manually overwritten the build method do we need to
752 # build it.
--> 753 self.build(input_shapes)
754 # We must set self.built since user defined build functions are not
755 # constrained to set self.built.
google3/third_party/tensorflow/python/keras/utils/tf_utils.py in wrapper(instance, input_shape)
148 tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
149 else:
--> 150 input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
151 output_shape = fn(instance, input_shape)
152 if output_shape is not None:
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, dims)
688 else:
689 # Got a list of dimensions
--> 690 self._dims = [as_dimension(d) for d in dims_iter]
691
692 @property
google3/third_party/tensorflow/python/framework/tensor_shape.py in as_dimension(value)
630 return value
631 else:
--> 632 return Dimension(value)
633
634
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, value)
183 raise TypeError("Cannot convert %s to Dimension" % value)
184 else:
--> 185 self._value = int(value)
186 if (not isinstance(value, compat.bytes_or_text_types) and
187 self._value != value):
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
https://github.com/keras-team/keras/blob/master/keras/layers/merge.py#L329
The comment on the Concatenate class states that it requires a list.
This class calls the Keras backend's (K) concatenate function:
https://github.com/keras-team/keras/blob/master/keras/backend/tensorflow_backend.py#L2041
which also states it requires a list.
The TensorFlow implementation, https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/ops/array_ops.py#L1034,
also states that it requires a list of tensors. Why? I don't know. In this function the tensors (a variable called "values") actually get checked for being a list or a tuple, but somewhere along the way you still get an error.