Problem while trying to delete rows with a certain value - pandas

I have a problem while trying to delete rows:
Error:
ValueError Traceback (most recent call last)
<ipython-input-186-83339e440bcb> in <module>()
1 df.head()
2 df['bathrooms'] = df['bathrooms'].astype('int64')
----> 3 df['bathrooms'] = df[df['bathrooms'] != 28]
1 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py in _set_item_frame_value(self, key, value)
3727 len_cols = 1 if is_scalar(cols) else len(cols)
3728 if len_cols != len(value.columns):
-> 3729 raise ValueError("Columns must be same length as key")
3730
3731 # align right-hand-side columns if self.columns
ValueError: Columns must be same length as key
Code:
df['bathrooms'] = df['bathrooms'].astype('int64')
df['bathrooms'] = df[df['bathrooms'] != 28]
dataframe:
Any help is very much appreciated.

df['bathrooms'] != 28 gives you a boolean Series.
df[df['bathrooms'] != 28] gives you a DataFrame.
So df['bathrooms'] = df[df['bathrooms'] != 28] assigns a whole DataFrame to a single column, which is why pandas complains that the columns are not the same length as the key.
If you want a new DataFrame without those rows, assign the filtered result back to the DataFrame itself:
df = df[df['bathrooms'] != 28]
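A quick, self-contained illustration of the difference (toy numbers, not the asker's data):

import pandas as pd

# Toy data standing in for the real listings (values are made up)
df = pd.DataFrame({'bathrooms': [1, 2, 28, 3], 'price': [100, 200, 300, 400]})

df['bathrooms'] = df['bathrooms'].astype('int64')
mask = df['bathrooms'] != 28     # boolean Series: one True/False per row
df = df[mask]                    # keeps only the rows where the mask is True

print(df)
#    bathrooms  price
# 0          1    100
# 1          2    200
# 3          3    400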


ValueError: could not convert string to float: 'n/a'

My Error is:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-88-b8c70965fe8e> in <module>
10 new_df["CustomerLocation"] = new_df.CustomerLocation.replace(x, (float(x[0]), float(x[1])))
11 print("floated x:", new_df.CustomerLocation)
---> 12 new_df["MerchantLocation"] = new_df.MerchantLocation.replace(y, (float(y[0]), float(y[1])))
13 print(haversine((float(x[0]), float(x[1])), (float(y[0]), float(y[1])), unit='mi'), "miles")
14 else:
ValueError: could not convert string to float: 'n/a'
My code is:
new_df = pd.DataFrame({'MerchantLocation': (tuple(num) for num in data.merchant_long_lat),
                       'CustomerLocation': (tuple(num) for num in data.long_lat)})
for index, row in new_df.iterrows():
    x = row.CustomerLocation
    y = row.MerchantLocation
    #new_df["MerchantLocation"] = new_df.MerchantLocation.replace(y, (y[0].replace(r'^\s*$', np.NaN, regex=True), y[1].replace(r'^\s*$', np.NaN, regex=True)))
    if x[0] != "" or x[1] != "" or y[0] != "" or y[1] != "":
        print("x:", x)
        new_df["CustomerLocation"] = new_df.CustomerLocation.replace(x, (float(x[0]), float(x[1])))
        new_df["MerchantLocation"] = new_df.MerchantLocation.replace(y, (float(y[0]), float(y[1])))
        print(haversine((float(x[0]), float(x[1])), (float(y[0]), float(y[1])), unit='mi'), "miles")
    else:
        print("There is an empty string")
There is an empty cell in the raw Excel data, as I checked. This is from ANZ's Virtual Internship. I am unable to catch the empty string. Please help!
Try surrounding the problematic code region with a try/except block.
Something like:
if x[0]!="" or x[1]!="" or y[0]!="" or y[1]!="":
try:
print("x:",x)
new_df["CustomerLocation"] = new_df.CustomerLocation.replace(x, (float(x[0]), float(x[1])))
new_df["MerchantLocation"] = new_df.MerchantLocation.replace(y, (float(y[0]), float(y[1])))
print(haversine((float(x[0]), float(x[1])), (float(y[0]), float(y[1])), unit='mi'), "miles")
except:
print("one of these should be N/A")
print(x[0], x[1], y[0], y[1])
else:
print("There is an empty string")

pandas DataFrame boolean indexing

df = pd.DataFrame(np.random.randn(5,6), columns=list('abcdef'))
df[df.loc[0] > 1]
IndexingError                             Traceback (most recent call last)
<ipython-input-...> in <module>
      1 df = pd.DataFrame(np.random.randn(5,6), columns=list('abcdef'))
      2
----> 3 df[df.loc[0] > 1]

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2891         # Do we have a (boolean) 1d indexer?
   2892         if com.is_bool_indexer(key):
-> 2893             return self._getitem_bool_array(key)
   2894
   2895         # We are left with two options: a single key, and a collection of keys,

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_bool_array(self, key)
   2943         # check_bool_indexer will throw exception if Series key cannot
   2944         # be reindexed to match DataFrame rows
-> 2945         key = check_bool_indexer(self.index, key)
   2946         indexer = key.nonzero()[0]
   2947         return self._take_with_is_copy(indexer, axis=0)

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in check_bool_indexer(index, key)
   2182         mask = isna(result._values)
   2183         if mask.any():
-> 2184             raise IndexingError(
   2185                 "Unalignable boolean Series provided as "
   2186                 "indexer (index of the boolean Series and of "

IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).
Then I tried
df(pd.DataFrame(df.loc[0] > 1).T)
and got

TypeError                                 Traceback (most recent call last)
<ipython-input-...> in <module>
      1 df = pd.DataFrame(np.random.randn(5,6), columns=list('abcdef'))
      2
----> 3 df(pd.DataFrame(df.loc[0] > 1).T)

TypeError: 'DataFrame' object is not callable

What should I do? Thanks.
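For reference, a minimal sketch of what is going on here (an illustration, not an answer from the thread): df.loc[0] > 1 is a boolean Series indexed by the column labels, so plain df[...] tries to align it against the row index and fails. Selecting along the column axis with df.loc[:, ...] aligns it against the columns instead:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(5, 6), columns=list('abcdef'))

col_mask = df.loc[0] > 1        # boolean Series indexed by 'a'..'f' (the column labels)
subset = df.loc[:, col_mask]    # keep only the columns whose value in row 0 exceeds 1

print(subset)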

Can a pandas DataFrame have cell values that are NumPy arrays?

I want to store NumPy arrays as values for cells in my DataFrame. Is there any way to do this?
Basically I have pixel data, a (512, 512) NumPy array, that I want to save as the value of the pixel_data column corresponding to its particular id in the ID column of my DataFrame. How can I do this?
Here's what I tried:
for f in train_files[:10]:
    id_tmp = f.split('/')[4].split('.')[0]
    first_dcm = pydicom.read_file(f)
    img = first_dcm.pixel_array
    window = get_windowing(first_dcm)
    image = window_image(img, *window)
    train.loc[train.Image == id_tmp, 'img_before_w'] = img
    train.loc[train.Image == id_tmp, 'img_after_w'] = image
The error I got:
ValueError Traceback (most recent call last)
<ipython-input-47-32236f8c9ccc> in <module>
5 window = get_windowing(first_dcm)
6 image = window_image(img, *window)
----> 7 train.loc[train.Image == id_tmp, 'img_before_w'] = img
8 train.loc[train.Image == id_tmp, 'img_after_w'] = image
9
/opt/conda/lib/python3.6/site-packages/pandas/core/indexing.py in __setitem__(self, key, value)
203 key = com.apply_if_callable(key, self.obj)
204 indexer = self._get_setitem_indexer(key)
--> 205 self._setitem_with_indexer(indexer, value)
206
207 def _validate_key(self, key, axis: int):
/opt/conda/lib/python3.6/site-packages/pandas/core/indexing.py in _setitem_with_indexer(self, indexer, value)
525 if len(labels) != value.shape[1]:
526 raise ValueError(
--> 527 "Must have equal len keys and value "
528 "when setting with an ndarray"
529 )
ValueError: Must have equal len keys and value when setting with an ndarray
Taking a sample dataframe as below:
train=pd.DataFrame({'Image':[1,2,3,2],'img_before_w':[np.nan, np.nan, np.nan,np.nan]})
print(train) gives
Image img_before_w
0 1 NaN
1 2 NaN
2 3 NaN
3 2 NaN
Now, for example, if you want to insert pixel data where train.Image == 2, it can be achieved using the code below:
mask = train.Image == 2 # contains True for desired rows
target_index=mask[mask==True].index # gives index of rows, wherever condition is met
train.loc[mask, 'img_before_w'] = pd.Series([[512,512]]*len(target_index), index=target_index) # inserts [512,512] array in rows wherever condition is met, in given column
Now, print(train) gives the desired output:
Image img_before_w
0 1 NaN
1 2 [512, 512]
2 3 NaN
3 2 [512, 512]
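A self-contained sketch of the same idea with actual NumPy arrays rather than plain lists (a common variant, not part of the answer above; the (2, 2) array stands in for the (512, 512) pixel data). The key step is making the column object dtype so a single cell can hold an array:

import numpy as np
import pandas as pd

train = pd.DataFrame({'Image': [1, 2, 3, 2], 'img_before_w': [np.nan] * 4})
train['img_before_w'] = train['img_before_w'].astype(object)   # allow arbitrary objects per cell

img = np.zeros((2, 2))                       # stand-in for the (512, 512) pixel array
for idx in train.index[train.Image == 2]:    # rows that should receive the array
    train.at[idx, 'img_before_w'] = img      # .at sets one cell at a time, no broadcasting

print(train)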

how to fix the calculation error which says 'DataFrame' object is not callable

I'm working on a football data set and this is the error I'm getting. Please help.
#what is the win rate of HomeTeam?
n_matches = df.shape[0]
n_features = df.shape[1] - 1
n_homewin = len(df(df.FTR == 'H'))
win_rate = (float(n_homewin) / (n_matches)) * 100
print("Total number of matches,{}".format(n_matches))
print("Number of features,{}".format(n_features))
print("Number of matches won by home team,{}".format(n_homewin))
print("win rate of home team,{:.2f}%".format(win_rate))
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-122-7e4d81fc684e> in <module>
5 n_features = df.shape[1] -1
6
----> 7 n_homewin = len(df(df.FTR == 'H'))
8
9 win_rate = (float(n_homewin) / (n_matches)) * 100
TypeError: 'DataFrame' object is not callable
The expected result should print the home team's winning ratio.
I think the problem is with (); you need [] to filter by boolean indexing:
n_homewin = len(df[df.FTR == 'H'])
Or, more simply, count the True values with sum:
n_homewin = (df.FTR == 'H').sum()
You should modify it to df[df.FTR == 'H']. The parentheses imply a function call, which is why pandas complains that the DataFrame object is not callable.
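A minimal sketch with made-up match data showing both suggestions (the FTR column here stands in for the full-time-result column of the real data set):

import pandas as pd

# Toy results: 'H' = home win, 'A' = away win, 'D' = draw (values are made up)
df = pd.DataFrame({'FTR': ['H', 'A', 'H', 'D', 'H']})

n_matches = df.shape[0]
n_homewin = len(df[df.FTR == 'H'])        # boolean indexing uses [], not ()
# n_homewin = (df.FTR == 'H').sum()       # equivalent: count the True values
win_rate = (float(n_homewin) / n_matches) * 100

print("win rate of home team,{:.2f}%".format(win_rate))   # prints 60.00%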

trimming column names is generating ValueError

I have a table which I run through a function to trim its column names down to 128 characters (I know that's really long; there isn't anything I can do about that) so I can use to_sql to create a database table from it.
def truncate_column_names(df, length):
    rename = {}
    for col in df.columns:
        if len(col) > length:
            new_col = col[:length-3] + "..."
            rename[col] = new_col
    result = df.rename(columns=rename)
    return result
The function works and I get a table out just fine, but the problem comes when I try to save the file; I get the error
ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
The method that does some housekeeping before saving to a file includes dropping duplicates, and that is where this error is spat out. I tested this by saving the original DataFrame, loading it, running the truncate function, and then trying drop_duplicates on the result, and I get the same error.
The headers for the file before I try truncating look like this:
http://pastebin.com/WXmvwHDg
I trimmed the file down to 1 record and still have the problem.
This was a result of the truncation causing some columns to have non-unique names.
To confirm this was the issue, I did a short test:
In [113]: df = pd.DataFrame(columns=["ab", "ac", "ad"])
In [114]: df
Out[114]:
Empty DataFrame
Columns: [ab, ac, ad]
Index: []
In [115]: df.drop_duplicates()
Out[115]:
Empty DataFrame
Columns: [ab, ac, ad]
Index: []
In [116]: df.columns
Out[116]: Index([u'ab', u'ac', u'ad'], dtype='object')
In [117]: df.columns = df.columns.str[:1]
In [118]: df
Out[118]:
Empty DataFrame
Columns: [a, a, a]
Index: []
In [119]: df.drop_duplicates()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-119-daf275b6788b> in <module>()
----> 1 df.drop_duplicates()

C:\Miniconda\lib\site-packages\pandas\util\decorators.pyc in wrapper(*args, **kwargs)
     86             else:
     87                 kwargs[new_arg_name] = new_arg_value
---> 88             return func(*args, **kwargs)
     89         return wrapper
     90     return _deprecate_kwarg

C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in drop_duplicates(self, subset, take_last, inplace)
   2826         deduplicated : DataFrame
   2827         """
-> 2828         duplicated = self.duplicated(subset, take_last=take_last)
   2829
   2830         if inplace:

C:\Miniconda\lib\site-packages\pandas\util\decorators.pyc in wrapper(*args, **kwargs)
     86             else:
     87                 kwargs[new_arg_name] = new_arg_value
---> 88             return func(*args, **kwargs)
     89         return wrapper
     90     return _deprecate_kwarg

C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in duplicated(self, subset, take_last)
   2871
   2872         vals = (self[col].values for col in subset)
-> 2873         labels, shape = map(list, zip(*map(f, vals)))
   2874
   2875         ids = get_group_index(labels, shape, sort=False, xnull=False)

C:\Miniconda\lib\site-packages\pandas\core\frame.pyc in f(vals)
   2860
   2861         def f(vals):
-> 2862             labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
   2863             return labels.astype('i8', copy=False), len(shape)
   2864

C:\Miniconda\lib\site-packages\pandas\core\algorithms.pyc in factorize(values, sort, order, na_sentinel, size_hint)
    133     table = hash_klass(size_hint or len(vals))
    134     uniques = vec_klass()
--> 135     labels = table.get_labels(vals, uniques, 0, na_sentinel)
    136
    137     labels = com._ensure_platform_int(labels)

pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_labels (pandas\hashtable.c:13946)()

ValueError: Buffer has wrong number of dimensions (expected 1, got 2)
and got the same result. Using df.columns.unique() after the truncation, I found I had ~200 duplicate columns.
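One way to avoid the collision (a sketch, not from the original answer) is to make the truncated names unique, for example by appending a numeric suffix whenever a trimmed name has already been used:

def truncate_column_names_unique(df, length):
    # Trim long column names, appending a numeric suffix when the trimmed
    # name collides with one already seen, so drop_duplicates and to_sql
    # never see duplicate column labels.
    seen = set()
    rename = {}
    for col in df.columns:
        new_col = col if len(col) <= length else col[:length - 3] + "..."
        candidate, i = new_col, 1
        while candidate in seen:
            suffix = "_{}".format(i)
            candidate = new_col[:length - len(suffix)] + suffix
            i += 1
        seen.add(candidate)
        if candidate != col:
            rename[col] = candidate
    return df.rename(columns=rename)

# e.g. df = truncate_column_names_unique(df, 128) before df.drop_duplicates() / df.to_sql(...)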