Pandas Apply function on Groupby behaviour - pandas

I am trying to create a column that computes the ratio of consumption on a monthly basis. I have written the function, but when I run it pandas raises a TypeError. Below are the function and the error stack.
The consumption ratio function is:
def consumption_ratio(row):
    c_consumption = row["consumption"].iloc[0]
    month = row["month"].iloc[0]
    year = row["year"].iloc[0]
    house = row["houseid-meterid"].iloc[0]
    if month == 2 and year == 2019:
        return 0
    else:
        if month == 1:
            # print(f"This is the {month} month")
            prevyear = year - 1
            prevmonth = 12
            prev_record = water_df.query("`houseid-meterid` == @house and year == @prevyear and month == @prevmonth")
            try:
                ratio = c_consumption / prev_record["consumption"]
            except ZeroDivisionError:
                ratio = 0
            # print(f"Non regular rations {ratio}")
            return ratio
        else:
            prevmonth = month - 1
            prev_record = water_df.query("`houseid-meterid` == @house and year == @year and month == @prevmonth")
            # print(prev_record)
            try:
                ratio = c_consumption / prev_record["consumption"]
            except ZeroDivisionError:
                ratio = 0
            # ratio = c_consumption / prev_record["consumption"]
            # print(f"Regular ratios {ratio}")
            return ratio
The function is applied here:
water_df["consumption_ratio"] = water_df.groupby(['Datetime', 'houseid-meterid']).apply(consumption_ratio)
The error stack looks like this:
ValueError Traceback (most recent call last)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:12017, in _reindex_for_setitem(value, index)
12016 try:
-> 12017 reindexed_value = value.reindex(index)._values
12018 except ValueError as err:
12019 # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\series.py:5094, in Series.reindex(self, *args, **kwargs)
5093 kwargs.update({"index": index})
-> 5094 return super().reindex(**kwargs)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\generic.py:5289, in NDFrame.reindex(self, *args, **kwargs)
5288 # perform the reindex on the axes
-> 5289 return self._reindex_axes(
5290 axes, level, limit, tolerance, method, fill_value, copy
5291 ).__finalize__(self, method="reindex")
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\generic.py:5304, in NDFrame._reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
5303 ax = self._get_axis(a)
-> 5304 new_index, indexer = ax.reindex(
5305 labels, level=level, limit=limit, tolerance=tolerance, method=method
5306 )
5308 axis = self._get_axis_number(a)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\indexes\base.py:4477, in Index.reindex(self, target, method, level, limit, tolerance)
4470 warnings.warn(
4471 "reindexing with a non-unique Index is deprecated and "
4472 "will raise in a future version.",
4473 FutureWarning,
4474 stacklevel=find_stack_level(),
4475 )
-> 4477 target = self._wrap_reindex_result(target, indexer, preserve_names)
4478 return target, indexer
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\indexes\multi.py:2556, in MultiIndex._wrap_reindex_result(self, target, indexer, preserve_names)
2555 try:
-> 2556 target = MultiIndex.from_tuples(target)
2557 except TypeError:
2558 # not all tuples, see test_constructor_dict_multiindex_reindex_flat
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\indexes\multi.py:205, in names_compat.<locals>.new_meth(self_or_cls, *args, **kwargs)
203 kwargs["names"] = kwargs.pop("name")
--> 205 return meth(self_or_cls, *args, **kwargs)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\indexes\multi.py:573, in MultiIndex.from_tuples(cls, tuples, sortorder, names)
571 tuples = np.asarray(tuples._values)
--> 573 arrays = list(lib.tuples_to_object_array(tuples).T)
574 elif isinstance(tuples, list):
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\_libs\lib.pyx:2978, in pandas._libs.lib.tuples_to_object_array()
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long long'
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
Cell In[34], line 1
----> 1 water_df["consumption_ratio"] = water_df.groupby(['Datetime', 'houseid-meterid']).apply(consumption_ratio)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:3978, in DataFrame.__setitem__(self, key, value)
3975 self._setitem_array([key], value)
3976 else:
3977 # set column
-> 3978 self._set_item(key, value)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:4172, in DataFrame._set_item(self, key, value)
4162 def _set_item(self, key, value) -> None:
4163 """
4164 Add series to DataFrame in specified column.
4165
(...)
4170 ensure homogeneity.
4171 """
-> 4172 value = self._sanitize_column(value)
4174 if (
4175 key in self.columns
4176 and value.ndim == 1
4177 and not is_extension_array_dtype(value)
4178 ):
4179 # broadcast across multiple columns if necessary
4180 if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:4909, in DataFrame._sanitize_column(self, value)
4907 return _reindex_for_setitem(value, self.index)
4908 elif is_dict_like(value):
-> 4909 return _reindex_for_setitem(Series(value), self.index)
4911 if is_list_like(value):
4912 com.require_length_match(value, self.index)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:12024, in _reindex_for_setitem(value, index)
12020 if not value.index.is_unique:
12021 # duplicate axis
12022 raise err
-> 12024 raise TypeError(
12025 "incompatible index of inserted column with frame index"
12026 ) from err
12027 return reindexed_value
TypeError: incompatible index of inserted column with frame index
The dataset is of the form
year month houseid-meterid Datetime cleaned_quantity
2019 2 m5 2019-02-01 7.0
2019 3 m5 2019-03-01 23.0
2019 4 m5 2019-04-01 14.0
2019 5 m5 2019-05-01 22.0
The expected output should be
year month houseid-meterid Datetime consumption consumption-ratio
2019 2 m5 2019-02-01 7.0 0
2019 3 m5 2019-03-01 23.0 3.285
2019 4 m5 2019-04-01 14.0 0.608
2019 5 m5 2019-05-01 22.0 1.571
What am I doing wrong?

Change your function to fetch the previous consumption with next(iter(...)), which falls back to 0 when no previous record exists; write the ratio into a consumption_ratio column and return the row instead of the ratio (or 0); and finally remove the water_df["consumption_ratio"] = assignment from the groupby call in the last line of code:
def consumption_ratio(row):
    c_consumption = row["consumption"].iloc[0]
    # print(c_consumption)
    month = row["month"].iloc[0]
    year = row["year"].iloc[0]
    house = row["houseid-meterid"].iloc[0]
    if month == 2 and year == 2019:
        ratio = 0
    else:
        if month == 1:
            # print(f"This is the {month} month")
            prevyear = year - 1
            prevmonth = 12
            prev_record = water_df.query("`houseid-meterid` == @house and year == @prevyear and month == @prevmonth")
            try:
                ratio = c_consumption / next(iter(prev_record["consumption"]), 0)
            except ZeroDivisionError:
                ratio = 0
            # print(f"Non regular rations {ratio}")
        else:
            prevmonth = month - 1
            prev_record = water_df.query("`houseid-meterid` == @house and year == @year and month == @prevmonth")
            # print(prev_record)
            try:
                ratio = c_consumption / next(iter(prev_record["consumption"]), 0)
            except ZeroDivisionError:
                ratio = 0
            # ratio = c_consumption / prev_record["consumption"]
            # print(f"Regular ratios {ratio}")
    row['consumption_ratio'] = ratio
    return row

df = water_df.groupby(['Datetime', 'houseid-meterid']).apply(consumption_ratio)
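For clarity, next(iter(series), 0) pulls the first value out of the previous-month selection and falls back to 0 when the query returns nothing. A quick standalone illustration (the values are made up):
import pandas as pd

prev = pd.Series([23.0])                    # a previous-month record was found
print(next(iter(prev), 0))                  # 23.0 -> the first consumption value

empty = pd.Series([], dtype=float)          # no previous record for this house/month
print(next(iter(empty), 0))                 # 0 -> the fallback default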

Related

Can't plot data frame column, I want to plot a scatter graph

My data:
I want to make a scatter graph of two columns but it's not working. If you could help, please: I have confirmed the column exists when I list the columns, yet it doesn't exist according to the code at the bottom.
df = data
x = Birth rate (dtype: object)
y = CO₂ (per capita) (dtype: float64)
I tried:
data.plot(x='Birth_rate', y='CO₂_(per capita)')
plt.title('1960 - C02 emissions vs. Crude birth rates')
It resulted in KeyError: 'Birth_rate'
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/plotting/_core.py:920, in PlotAccessor.__call__(self, *args, **kwargs)
918 if is_integer(x) and not data.columns.holds_integer():
919 x = data_cols[x]
--> 920 elif not isinstance(data[x], ABCSeries):
921 raise ValueError("x must be a label or position")
922 data = data.set_index(x)
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/frame.py:3024, in DataFrame.__getitem__(self, key)
3022 if self.columns.nlevels > 1:
3023 return self._getitem_multilevel(key)
-> 3024 indexer = self.columns.get_loc(key)
3025 if is_integer(indexer):
3026 indexer = [indexer]
File /shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/indexes/base.py:3083, in Index.get_loc(self, key, method, tolerance)
3081 return self._engine.get_loc(casted_key)
3082 except KeyError as err:
-> 3083 raise KeyError(key) from err
3085 if tolerance is not None:
3086 tolerance = self._convert_tolerance(tolerance, np.asarray(key))
KeyError: 'Birth_rate'
I tried an index as well and got this error:
KeyError: "None of ['Birth_rate'] are in the columns"
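A KeyError from plot almost always means the label passed as x or y does not match a column name character for character (here 'Birth_rate' versus a column actually called 'Birth rate'). A minimal sketch of the usual fix, using a stand-in DataFrame since the original data isn't shown: print the exact labels, rename them, and convert the object column to numeric before making the scatter plot.
import pandas as pd
import matplotlib.pyplot as plt

# Stand-in for the real frame; note the labels contain spaces, not underscores.
data = pd.DataFrame({'Birth rate': ['48.3', '17.6', '35.1'],
                     'CO₂ (per capita)': [0.04, 7.9, 0.6]})

print(data.columns.tolist())   # shows the exact labels, including spaces and special characters

data = data.rename(columns={'Birth rate': 'Birth_rate', 'CO₂ (per capita)': 'CO2_per_capita'})
data['Birth_rate'] = pd.to_numeric(data['Birth_rate'], errors='coerce')   # object -> float for a scatter plot

data.plot(x='Birth_rate', y='CO2_per_capita', kind='scatter')
plt.title('1960 - CO2 emissions vs. crude birth rates')
plt.show()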

Pandas Error Handling - "day is out of range for month"

I wonder how I can handle and spot data that has the wrong format in pandas.
I tried to convert strings into pandas datetime form; however, somewhere in the middle a value is not in the expected format (I assume), and an error message popped up.
I want to print what that data is, and I want to fix it into the right format so that I can keep moving on.
How do I handle this issue?
My code is as below.
def date_format(df):
    target = pd.to_datetime(df['Issue Date'])
    return target
The error message is
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/anaconda3/lib/python3.8/site-packages/pandas/core/arrays/datetimes.py in objects_to_datetime64ns(data, dayfirst, yearfirst, utc, errors, require_iso8601, allow_object)
   2053 try:
-> 2054     values, tz_parsed = conversion.datetime_to_datetime64(data)
   2055     # If tzaware, these values represent unix timestamps, so we
pandas/_libs/tslibs/conversion.pyx in pandas._libs.tslibs.conversion.datetime_to_datetime64()
TypeError: Unrecognized value type: <class 'str'>

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
in
----> 1 seg0=generateDb(csv,2017,1000000,0)
in generateDb(csv, year, segment, index)
      2
      3 targetSeg0=segment_df(csv,segment,index)
----> 4 targetSeg0['Issue Date']=date_format(targetSeg0)
      5 targetSeg0=remove_etc(targetSeg0)
      6 filter_date(targetSeg0,year)
in date_format(df)
      1 def date_format(df):
----> 2     target = pd.to_datetime(df['Issue Date'])
      3     return target
~/anaconda3/lib/python3.8/site-packages/pandas/core/tools/datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, format, exact, unit, infer_datetime_format, origin, cache)
    797     result = result.tz_localize(tz)
    798 elif isinstance(arg, ABCSeries):
--> 799     cache_array = _maybe_cache(arg, format, cache, convert_listlike)
    800     if not cache_array.empty:
    801         result = arg.map(cache_array)
~/anaconda3/lib/python3.8/site-packages/pandas/core/tools/datetimes.py in _maybe_cache(arg, format, cache, convert_listlike)
    168 unique_dates = unique(arg)
    169 if len(unique_dates) < len(arg):
--> 170     cache_dates = convert_listlike(unique_dates, format)
    171     cache_array = Series(cache_dates, index=unique_dates)
    172     return cache_array
~/anaconda3/lib/python3.8/site-packages/pandas/core/tools/datetimes.py in _convert_listlike_datetimes(arg, format, name, tz, unit, errors, infer_datetime_format, dayfirst, yearfirst, exact)
    457 assert format is None or infer_datetime_format
    458 utc = tz == "utc"
--> 459 result, tz_parsed = objects_to_datetime64ns(
    460     arg,
    461     dayfirst=dayfirst,
~/anaconda3/lib/python3.8/site-packages/pandas/core/arrays/datetimes.py in objects_to_datetime64ns(data, dayfirst, yearfirst, utc, errors, require_iso8601, allow_object)
   2057     return values.view("i8"), tz_parsed
   2058 except (ValueError, TypeError):
-> 2059     raise e
   2060
   2061 if tz_parsed is not None:
~/anaconda3/lib/python3.8/site-packages/pandas/core/arrays/datetimes.py in objects_to_datetime64ns(data, dayfirst, yearfirst, utc, errors, require_iso8601, allow_object)
   2042
   2043 try:
-> 2044     result, tz_parsed = tslib.array_to_datetime(
   2045         data,
   2046         errors=errors,
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime()
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime()
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime_object()
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime_object()
pandas/_libs/tslibs/parsing.pyx in pandas._libs.tslibs.parsing.parse_datetime_string()
pandas/_libs/tslibs/parsing.pyx in pandas._libs.tslibs.parsing._parse_delimited_date()
datetime.pxd in cpython.datetime.datetime_new()
ValueError: day is out of range for month
I'm eventually going to filter the data as below.
def filter_date(df, year):
    booldf = df['Issue Date'] >= pd.to_datetime(f"{year}-01-01")
    booldf2 = df['Issue Date'] <= pd.to_datetime(f"{year}-12-31")
    return df[
        (df['Issue Date'] >= pd.to_datetime(f"{year}-01-01")) & (df['Issue Date'] <= pd.to_datetime(f"{year}-12-31"))
    ]
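(As an aside, once 'Issue Date' is a proper datetime column, the same filter can be written more compactly with Series.between; an illustrative sketch:)
def filter_date(df, year):
    # Keep rows whose Issue Date falls inside the given calendar year.
    return df[df['Issue Date'].between(f"{year}-01-01", f"{year}-12-31")]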
The error occurs because of wrong data in the sample, e.g. a date with a 51st month, which doesn't exist. I guess this is where the error came from.
Based on your comments I assume this will help:
Using this as an example (99/99/9999 is the incorrect value):
df = pd.DataFrame(["09/26/2016", "06/14/2017", "09/05/2018", "06/16/2017", "05/09/2018", "99/99/9999"], columns=['Issue Date'])
You mean something like this:
pd.to_datetime(df["Issue Date"], errors="coerce")
The output contains NaT for the value that could not be parsed; see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html
If you then want to drop the rows with NaT:
df = df.dropna(axis=0, subset=['Issue Date'])
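Putting it together as a self-contained run (illustrative): errors="coerce" turns the unparseable string into NaT, which also makes it easy to print exactly which rows were bad before dropping them.
import pandas as pd

df = pd.DataFrame(["09/26/2016", "06/14/2017", "09/05/2018",
                   "06/16/2017", "05/09/2018", "99/99/9999"], columns=['Issue Date'])

parsed = pd.to_datetime(df['Issue Date'], errors='coerce')   # invalid dates become NaT
print(df[parsed.isna()])                                     # shows the offending rows, e.g. 99/99/9999

df['Issue Date'] = parsed
df = df.dropna(axis=0, subset=['Issue Date'])                # drop the rows that could not be parsed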

TypeError: Wrong number or type of arguments for overloaded function 'new_Date'

I am new to Python. I am getting an error when running the code below. The issue seems to be with the date; can someone help me correct it, please? I have tried changing the date format in the Excel file, but it does not solve the issue. The Excel file has a list of several bonds, and I want to generate the coupon dates of the different bonds.
BondData = pd.read_excel(r'C:\Users\Avishen\Desktop\Python\BONDDATA.xlsx')
Data = pd.DataFrame(BondData)

def scheduledates():
    tenor = ql.Period(ql.Semiannual)
    day_count = ql.Thirty360
    calendar = ql.UnitedStates()
    businessConvention = ql.Unadjusted
    dateGeneration = ql.DateGeneration.Backward
    monthEnd = False
    # Dates in Bond Period
    return ql.Schedule(issueDate, maturityDate, tenor, calendar, businessConvention,
                       businessConvention, dateGeneration, monthEnd)

new_df["Dates"] = Data.apply(lambda x: scheduledates(), axis=1)
new_df["ISIN"] = Data.ISIN
new_df
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-877415e9cf83> in <module>
21 businessConvention , dateGeneration, monthEnd)
22
---> 23 new_df["Dates"]= Data.apply(lambda x: scheduledates(),axis = 1)
24 new_df["ISIN"] = Data.ISIN
25 new_df
~\anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7546 kwds=kwds,
7547 )
-> 7548 return op.get_result()
7549
7550 def applymap(self, func) -> "DataFrame":
~\anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
178 return self.apply_raw()
179
--> 180 return self.apply_standard()
181
182 def apply_empty_result(self):
~\anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
269
270 def apply_standard(self):
--> 271 results, res_index = self.apply_series_generator()
272
273 # wrap results
~\anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
298 for i, v in enumerate(series_gen):
299 # ignore SettingWithCopy here in case the user mutates
--> 300 results[i] = self.f(v)
301 if isinstance(results[i], ABCSeries):
302 # If we have a view on v, we need to make a copy because
<ipython-input-4-877415e9cf83> in <lambda>(x)
21 businessConvention , dateGeneration, monthEnd)
22
---> 23 new_df["Dates"]= Data.apply(lambda x: scheduledates(),axis = 1)
24 new_df["ISIN"] = Data.ISIN
25 new_df
<ipython-input-4-877415e9cf83> in scheduledates()
8
9 def scheduledates():
---> 10 issueDate = ql.Date(Data.issuedate)
11 maturityDate = ql.Date(Data.maturitydate)
12 tenor = ql.Period(ql.Semiannual)
~\anaconda3\lib\site-packages\QuantLib\QuantLib.py in __init__(self, *args)
425
426 def __init__(self, *args):
--> 427 _QuantLib.Date_swiginit(self, _QuantLib.new_Date(*args))
428
429 def weekdayNumber(self):
TypeError: Wrong number or type of arguments for overloaded function 'new_Date'.
Possible C/C++ prototypes are:
Date::Date()
Date::Date(Day,Month,Year)
Date::Date(Day,Month,Year,Hour,Minute,Second,Millisecond,Microsecond)
Date::Date(Day,Month,Year,Hour,Minute,Second,Millisecond)
Date::Date(Day,Month,Year,Hour,Minute,Second)
Date::Date(BigInteger)
Date::Date(std::string const &,std::string)
---------------------------------------------------------------------------
Data = pd.DataFrame(BondData)
Fields from Bond Data
ISIN
issuedate
maturitydate
coupon
Tradeyield
Bond_Price
MarketPrice
Nominal_Amount
From the traceback, the problem is the line:
issueDate = ql.Date(Data.issuedate)
(which for some reason is not in the code you pasted). Coming from Excel, issuedate should be an integer and thus compatible with the ql.Date constructor, but it's possible that pandas is reading it as a string or some other type. You should examine the data frame and check the type of the column. If it's not what you expect, you'll have to figure out whether there are data in that column that pandas can't interpret as integers, and either clean them up or force the conversion somehow before passing them to ql.Date.
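As a concrete illustration of that check (a sketch with made-up values, not the asker's data): parse each per-row value into a date first, then build the QuantLib date with the Date(Day, Month, Year) constructor listed in the error message. Inside Data.apply(lambda x: ...), the per-row fields (x['issuedate']) should be used rather than the whole Data.issuedate column.
import pandas as pd
import QuantLib as ql

# Hypothetical single bond row; the real values come from BondData's issuedate/maturitydate columns.
row = pd.Series({'issuedate': '2020-06-15', 'maturitydate': '2025-06-15'})

issue = pd.to_datetime(row['issuedate'])              # force a proper datetime, however Excel was read
issueDate = ql.Date(issue.day, issue.month, issue.year)

maturity = pd.to_datetime(row['maturitydate'])
maturityDate = ql.Date(maturity.day, maturity.month, maturity.year)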

How to write seed_features that include a conditional statement

I'm trying to write a seed feature that produces reward if place == 1 else 0.
place and reward are both ft.variable_types.Numeric:
Entity: results
Variables:
id (dtype: index)
place (dtype: numeric)
reward (dtype: numeric)
I've tried the following alternatives with no luck:
Alternative 1
roi = (ft.Feature(es['results']['reward'])
       if (ft.Feature(es['results']['place']) == 1)
       else 0).rename('roi')
produces AssertionError: Column "roi" missing frome dataframe
when generating the features.
Alternative 2
roi = ((ft.Feature(es['results']['place']) == 1) *
       ft.Feature(es['results']['reward'])).rename('roi')
produces AssertionError: Provided inputs don't match input type requirements when assigning the seed feature.
Alternative 2 should work since in Python:
>>> True * 3.14
3.14
>>> False * 3.14
0.0
The full stack trace:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-211-94dd07d98076> in <module>()
23
24
---> 25 roi = ((ft.Feature(es['results']['place']) == 1) * ft.Feature(es['results']['reward'])).rename('roi')
~/dev/venv/lib/python3.6/site-packages/featuretools/feature_base/feature_base.py in __mul__(self, other)
287 def __mul__(self, other):
288 """Multiply by other"""
--> 289 return self._handle_binary_comparision(other, primitives.MultiplyNumeric, primitives.MultiplyNumericScalar)
290
291 def __rmul__(self, other):
~/dev/venv/lib/python3.6/site-packages/featuretools/feature_base/feature_base.py in _handle_binary_comparision(self, other, Primitive, PrimitiveScalar)
230 def _handle_binary_comparision(self, other, Primitive, PrimitiveScalar):
231 if isinstance(other, FeatureBase):
--> 232 return Feature([self, other], primitive=Primitive)
233
234 return Feature([self], primitive=PrimitiveScalar(other))
~/dev/venv/lib/python3.6/site-packages/featuretools/feature_base/feature_base.py in __new__(self, base, entity, groupby, parent_entity, primitive, use_previous, where)
755 primitive=primitive,
756 groupby=groupby)
--> 757 return TransformFeature(base, primitive=primitive)
758
759 raise Exception("Unrecognized feature initialization")
~/dev/venv/lib/python3.6/site-packages/featuretools/feature_base/feature_base.py in __init__(self, base_features, primitive, name)
660 relationship_path=RelationshipPath([]),
661 primitive=primitive,
--> 662 name=name)
663
664 @classmethod
~/dev/venv/lib/python3.6/site-packages/featuretools/feature_base/feature_base.py in __init__(self, entity, base_features, relationship_path, primitive, name, names)
56 self._names = names
57
---> 58 assert self._check_input_types(), ("Provided inputs don't match input "
59 "type requirements")
60
AssertionError: Provided inputs don't match input type requirements
This should work on featuretools v0.11.0. Here is an example using a demo dataset. Both unit_price and total are numeric.
import featuretools as ft
es = ft.demo.load_retail(nrows=100)
es['order_products']
Entity: order_products
Variables:
...
unit_price (dtype: numeric)
total (dtype: numeric)
...
I create the seed feature.
unit_price = ft.Feature(es['order_products']['unit_price'])
total = ft.Feature(es['order_products']['total'])
seed = ((total == 1) * unit_price).rename('seed')
Then, calculate the feature matrix.
fm, fd = ft.dfs(target_entity='customers', entityset=es, seed_features=[seed])
fm.filter(regex='seed').columns.tolist()[:5]
['SUM(order_products.seed)',
'STD(order_products.seed)',
'MAX(order_products.seed)',
'SKEW(order_products.seed)',
'MIN(order_products.seed)']
In your case, this would be the seed feature (reward kept only where place == 1):
place = ft.Feature(es['results']['place'])
reward = ft.Feature(es['results']['reward'])
roi = ((place == 1) * reward).rename('roi')
Let me know if that helps.
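For intuition, the boolean-times-numeric trick the seed feature relies on behaves the same way on plain pandas Series (an illustration only, independent of featuretools):
import pandas as pd

place = pd.Series([1, 2, 1, 3])
reward = pd.Series([10.0, 5.0, 7.5, 2.0])

roi = (place == 1) * reward   # 10.0, 0.0, 7.5, 0.0 -- reward is kept only where place == 1
print(roi.tolist())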

Can pandas df have cell values of numpy array

I want to store NumPy arrays as cell values in my DataFrame. Is there any way to do this?
Basically I have pixel data, which is a (512, 512) NumPy array, that I want to save as the value of the pixel_data column corresponding to its particular id in the ID column of my DataFrame. How can I do this?
Here's what I tried:
for f in train_files[:10]:
    id_tmp = f.split('/')[4].split('.')[0]
    first_dcm = pydicom.read_file(f)
    img = first_dcm.pixel_array
    window = get_windowing(first_dcm)
    image = window_image(img, *window)
    train.loc[train.Image == id_tmp, 'img_before_w'] = img
    train.loc[train.Image == id_tmp, 'img_after_w'] = image
The error I got:
ValueError Traceback (most recent call last)
<ipython-input-47-32236f8c9ccc> in <module>
5 window = get_windowing(first_dcm)
6 image = window_image(img, *window)
----> 7 train.loc[train.Image == id_tmp, 'img_before_w'] = img
8 train.loc[train.Image == id_tmp, 'img_after_w'] = image
9
/opt/conda/lib/python3.6/site-packages/pandas/core/indexing.py in __setitem__(self, key, value)
203 key = com.apply_if_callable(key, self.obj)
204 indexer = self._get_setitem_indexer(key)
--> 205 self._setitem_with_indexer(indexer, value)
206
207 def _validate_key(self, key, axis: int):
/opt/conda/lib/python3.6/site-packages/pandas/core/indexing.py in _setitem_with_indexer(self, indexer, value)
525 if len(labels) != value.shape[1]:
526 raise ValueError(
--> 527 "Must have equal len keys and value "
528 "when setting with an ndarray"
529 )
ValueError: Must have equal len keys and value when setting with an ndarray
Taking a sample dataframe as below:
train=pd.DataFrame({'Image':[1,2,3,2],'img_before_w':[np.nan, np.nan, np.nan,np.nan]})
print(train) gives
Image img_before_w
0 1 NaN
1 2 NaN
2 3 NaN
3 2 NaN
Now, for example, if you want to insert pixel data when train.Image == 2, it can be achieved using the code below:
mask = train.Image == 2  # contains True for the desired rows
target_index = mask[mask == True].index  # index of the rows where the condition is met
train.loc[mask, 'img_before_w'] = pd.Series([[512, 512]] * len(target_index), index=target_index)  # inserts a [512, 512] list in the given column wherever the condition is met
Now, print(train) gives the desired output:
Image img_before_w
0 1 NaN
1 2 [512, 512]
2 3 NaN
3 2 [512, 512]
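If you need to store the actual (512, 512) arrays rather than short lists, a pattern that tends to be more robust is to make the column object dtype and assign one cell at a time with .at (a minimal sketch; np.zeros((512, 512)) stands in for the real pixel_array):
import numpy as np
import pandas as pd

train = pd.DataFrame({'Image': [1, 2, 3, 2]})
train['img_before_w'] = None                       # object-dtype column, so each cell can hold any object

img = np.zeros((512, 512))                         # placeholder for first_dcm.pixel_array
for idx in train.index[train['Image'] == 2]:
    train.at[idx, 'img_before_w'] = img            # assign cell by cell to avoid ndarray broadcasting

print(train)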