Pandas dropna throwing ValueError: "Cannot convert non-finite values (NA or inf) to integer" - pandas

Pandas: 0.25.3
Python: 3.7.4
I have a data frame, and I want to remove the columns which contain only NaN values. That should be easy, because there is a Pandas DataFrame function which does exactly that—dropna. Here's my code:
long_summary = long_summary.dropna(axis='columns', how='all')
But that simple line throws an exception:
ValueError: Cannot convert non-finite values (NA or inf) to integer
I cannot see how calling dropna would lead to this exception. What is going on and how do I fix it?
I'll include the whole exception stack just-in-case that makes the problem clearer:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-88-b4926abd4d81> in <module>
----> 1 long_summary = long_summary.dropna(axis='columns', how='all')
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\frame.py in dropna(self, axis, how, thresh, subset, inplace)
4860 agg_obj = self.take(indices, axis=agg_axis)
4861
-> 4862 count = agg_obj.count(axis=agg_axis)
4863
4864 if thresh is not None:
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\frame.py in count(self, axis, level, numeric_only)
7848 result = Series(counts, index=frame._get_agg_axis(axis))
7849
-> 7850 return result.astype("int64")
7851
7852 def _count_level(self, level, axis=0, numeric_only=False):
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
5880 # else, only a single dtype is given
5881 new_data = self._data.astype(
-> 5882 dtype=dtype, copy=copy, errors=errors, **kwargs
5883 )
5884 return self._constructor(new_data).__finalize__(self)
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, **kwargs)
579
580 def astype(self, dtype, **kwargs):
--> 581 return self.apply("astype", dtype=dtype, **kwargs)
582
583 def convert(self, **kwargs):
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
436 kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)
437
--> 438 applied = getattr(b, f)(**kwargs)
439 result_blocks = _extend_blocks(applied, result_blocks)
440
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
557
558 def astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):
--> 559 return self._astype(dtype, copy=copy, errors=errors, values=values, **kwargs)
560
561 def _astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
641 # _astype_nansafe works fine with 1-d only
642 vals1d = values.ravel()
--> 643 values = astype_nansafe(vals1d, dtype, copy=True, **kwargs)
644
645 # TODO(extension)
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
698 if not np.isfinite(arr).all():
699 raise ValueError(
--> 700 "Cannot convert non-finite values (NA or inf) to " "integer"
701 )
702
ValueError: Cannot convert non-finite values (NA or inf) to integer
(N.B. the data types of my columns are int64, Int32, and float64)
In the comments Scott asked for data to reproduce this issue. The redacted CSV is available on Dropbox here.
df = pd.read_csv('E:\\Temp\\dropna.csv')
df.dropna(axis='columns', how='all')
But be warned, the CSV is 3.3 GB and the resulting data frame has over 60 million rows. It tried cutting out rows, but it seems to need to be this long to trigger the error.

Related

matplotlib issue: how to erase this one?

import maplotlib.pyplot as plt
import pandas as pd
df = pd.DataFrame(np.random.randn(30,3)*100+1000,
index=pd.date_range(start='2018-09-01', periods=30, freq='D'),
columns=['1', '2', 3'])
df[:5].plot.bar()
a Seeing the graph, each x label has '00:00:00', which is unnecessary.
So I tried to delete these by writing this code.
df[:5].plot.bar(x=df[:5].index.date
But it has an error like this.
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-52-92dd89374fec> in <module>
----> 1 df[:5].plot.bar(x=df[:5].index.date, stacked=True)
~\anaconda3\lib\site-packages\pandas\plotting\_core.py in bar(self, x, y, **kwargs)
1001 >>> ax = df.plot.bar(x='lifespan', rot=0)
1002 """
-> 1003 return self(kind="bar", x=x, y=y, **kwargs)
1004
1005 def barh(self, x=None, y=None, **kwargs):
~\anaconda3\lib\site-packages\pandas\plotting\_core.py in __call__(self, *args, **kwargs)
810 if is_integer(x) and not data.columns.holds_integer():
811 x = data_cols[x]
--> 812 elif not isinstance(data[x], ABCSeries):
813 raise ValueError("x must be a label or position")
814 data = data.set_index(x)
~\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2804 if is_iterator(key):
2805 key = list(key)
-> 2806 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
2807
2808 # take() does not accept boolean indexers
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1550 keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
1551
-> 1552 self._validate_read_indexer(
1553 keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
1554 )
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1638 if missing == len(indexer):
1639 axis_name = self.obj._get_axis_name(axis)
-> 1640 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
1641
1642 # We (temporarily) allow for some missing keys with .loc, except in
KeyError: "None of [Index([2018-09-01, 2018-09-02, 2018-09-03, 2018-09-04, 2018-09-05], dtype='object')] are in the [columns]"
What's the problem?? I just followed the book, but it did come out.
You can change index values before selecting first 5 rows:
df.index = df.index.date
df[:5].plot.bar()
Or:
df.rename(lambda x: x.date())[:5].plot.bar()

How can I convert price columns to an integer?

How can I convert price columns to an integer?
code:
car_sales["Total Sales"] = car_sales["Price"].astype(int).cumsum()
car_sales
error:
ValueError Traceback (most recent call last)
<ipython-input-124-b84f0a711067> in <module>
----> 1 car_sales["Total Sales"] = car_sales["Price"].astype(int).cumsum()
2 car_sales
~\anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors)
5696 else:
5697 # else, only a single dtype is given
-> 5698 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors)
5699 return self._constructor(new_data).__finalize__(self)
5700
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, copy, errors)
580
581 def astype(self, dtype, copy: bool = False, errors: str = "raise"):
--> 582 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
583
584 def convert(self, **kwargs):
~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, filter, **kwargs)
440 applied = b.apply(f, **kwargs)
441 else:
--> 442 applied = getattr(b, f)(**kwargs)
443 result_blocks = _extend_blocks(applied, result_blocks)
444
~\anaconda3\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors)
623 vals1d = values.ravel()
624 try:
--> 625 values = astype_nansafe(vals1d, dtype, copy=True)
626 except (ValueError, TypeError):
627 # e.g. astype_nansafe can fail on object-dtype of strings
~\anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
872 # work around NumPy brokenness, #1987
873 if np.issubdtype(dtype.type, np.integer):
--> 874 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
875
876 # if we have a datetime/timedelta array of objects
pandas\_libs\lib.pyx in pandas._libs.lib.astype_intsafe()
ValueError: invalid literal for int() with base 10: ' 4 00'
There exists a to_numeric function in pandas. See here.
car_sales["Total Sales"] = pd.to_numeric(car_sales["Price"], errors='coerce').cumsum()
This does return nan for 4 00 however so you must be careful. Follow what David Erickson said.
As an example if it is all spaces are supposed to be decimals, then
car_sales["Price"].str.replace(' ', '.')
should work if done before the conversion from an object.

why it is showing " TypeError: Invalid shape (20,) for image data "

import numpy as np
import matplotlib.pyplot as plt
m1=np.random.randint(0,20,4*5)
m1.reshape(4,5)
plt.imshow(m1)
**when I have executed above code in python 3 on jupyter notebook, I got Type error anyone please answer to this error in simplest way **
like
TypeError Traceback (most recent call last)
in
----> 1 plt.imshow(m1)
c:\users\jaiprakash\appdata\local\programs\python\python37-32\lib\site-packages\matplotlib\pyplot.py in imshow(X, cmap, norm, aspect, interpolation, alpha, vmin, vmax, origin, extent, shape, filternorm, filterrad, imlim, resample, url, data, **kwargs)
2649 filternorm=filternorm, filterrad=filterrad, imlim=imlim,
2650 resample=resample, url=url, **({"data": data} if data is not
-> 2651 None else {}), **kwargs)
2652 sci(__ret)
2653 return __ret
c:\users\jaiprakash\appdata\local\programs\python\python37-32\lib\site-packages\matplotlib\__init__.py in inner(ax, data, *args, **kwargs)
1563 def inner(ax, *args, data=None, **kwargs):
1564 if data is None:
-> 1565 return func(ax, *map(sanitize_sequence, args), **kwargs)
1566
1567 bound = new_sig.bind(ax, *args, **kwargs)
c:\users\jaiprakash\appdata\local\programs\python\python37-32\lib\site-packages\matplotlib\cbook\deprecation.py in wrapper(*args, **kwargs)
356 f"%(removal)s. If any parameter follows {name!r}, they "
357 f"should be pass as keyword, not positionally.")
--> 358 return func(*args, **kwargs)
359
360 return wrapper
c:\users\jaiprakash\appdata\local\programs\python\python37-32\lib\site-packages\matplotlib\cbook\deprecation.py in wrapper(*args, **kwargs)
356 f"%(removal)s. If any parameter follows {name!r}, they "
357 f"should be pass as keyword, not positionally.")
--> 358 return func(*args, **kwargs)
359
360 return wrapper
c:\users\jaiprakash\appdata\local\programs\python\python37-32\lib\site-packages\matplotlib\axes\_axes.py in imshow(self, X, cmap, norm, aspect, interpolation, alpha, vmin, vmax, origin, extent, shape, filternorm, filterrad, imlim, resample, url, **kwargs)
5613 resample=resample, **kwargs)
5614
-> 5615 im.set_data(X)
5616 im.set_alpha(alpha)
5617 if im.get_clip_path() is None:
c:\users\jaiprakash\appdata\local\programs\python\python37-32\lib\site-packages\matplotlib\image.py in set_data(self, A)
697 or self._A.ndim == 3 and self._A.shape[-1] in [3, 4]):
698 raise TypeError("Invalid shape {} for image data"
--> 699 .format(self._A.shape))
700
701 if self._A.ndim == 3:
TypeError: Invalid shape (20,) for image data
When you call m1.reshape(4,5) you dont assign it to a variable. The method wont change the shape of m1 unless you reassign it to m1
import numpy as np
import matplotlib.pyplot as plt
#m1=np.random.randint(low=0, high=20, size=(4,5)) # << personally I would have done this & not bothered with the reshape
m1 = np.random.randint(0,20,4*5)
m1 = m1.reshape(4, 5)
plt.imshow(m1)

TypeError: dtype '<class 'datetime.timedelta'>' not understood

I have two dates where the difference of the dates determine how many days the user has been active.
df['days_active'] = df['last_login'] - df['first_login']
Then I use datetime.timedelta days method on valid objects, which used to work until I updated to the current panda
df['days_active'] = df['days_active'].astype(dt.timedelta).map(lambda x: np.nan if pd.isnull(x) else x.days)
TypeError Traceback (most recent call last)
<ipython-input-8-335b54b7b187> in <module>()
1 df['days_active'] = df['last_login'] - df['first_login']
----> 2 df['days_active'] = df['days_active'].astype(dt.timedelta).map(lambda x: np.nan if pd.isnull(x) else x.days)
3 df['weeks_active'] = df['days_active']/7
4 df['weekly_min_avg'] = df['total_minutes']/df['weeks_active']
5 frames
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
5689 # else, only a single dtype is given
5690 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5691 **kwargs)
5692 return self._constructor(new_data).__finalize__(self)
5693
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/managers.py in astype(self, dtype, **kwargs)
529
530 def astype(self, dtype, **kwargs):
--> 531 return self.apply('astype', dtype=dtype, **kwargs)
532
533 def convert(self, **kwargs):
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
393 copy=align_copy)
394
--> 395 applied = getattr(b, f)(**kwargs)
396 result_blocks = _extend_blocks(applied, result_blocks)
397
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
532 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
533 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 534 **kwargs)
535
536 def _astype(self, dtype, copy=False, errors='raise', values=None,
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
593
594 # convert dtypes if needed
--> 595 dtype = pandas_dtype(dtype)
596 # astype processing
597 if is_dtype_equal(self.dtype, dtype):
/usr/local/lib/python3.6/dist-packages/pandas/core/dtypes/common.py in pandas_dtype(dtype)
2027 return npdtype
2028 elif npdtype.kind == 'O':
-> 2029 raise TypeError("dtype '{}' not understood".format(dtype))
2030
2031 return npdtype
TypeError: dtype '<class 'datetime.timedelta'>' not understood
Thanks to #root for the solution to this issue.
Changing
df['days_active'] = df['days_active'].astype(dt.timedelta).map(lambda x: np.nan if pd.isnull(x) else x.days)
To
df['days_active'] = df['days_active'].dt.days
should solve the issue

Can't perform calculations on DataFrame values

I am trying to apply a formula to each value in a Pandas DataFrame, however, I am getting an error.
def transform_x(x):
return x/0.65
transformed = input_df.applymap(transform_x)
This returns the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-72-66afcc1d1b80> in <module>
3
4
----> 5 transformed = input_df.applymap(transform_x)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in applymap(self, func)
6551 return lib.map_infer(x.astype(object).values, func)
6552
-> 6553 return self.apply(infer)
6554
6555 # ----------------------------------------------------------------------
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6485 args=args,
6486 kwds=kwds)
-> 6487 return op.get_result()
6488
6489 def applymap(self, func):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
149 return self.apply_raw()
150
--> 151 return self.apply_standard()
152
153 def apply_empty_result(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
255
256 # compute the result using the series generator
--> 257 self.apply_series_generator()
258
259 # wrap results
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
284 try:
285 for i, v in enumerate(series_gen):
--> 286 results[i] = self.f(v)
287 keys.append(v.name)
288 except Exception as e:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in infer(x)
6549 if x.empty:
6550 return lib.map_infer(x, func)
-> 6551 return lib.map_infer(x.astype(object).values, func)
6552
6553 return self.apply(infer)
pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()
<ipython-input-72-66afcc1d1b80> in transform_x(x)
1 def transform_x(x):
----> 2 return x/0.65
3
4
5 transformed = input_df.applymap(transform_x)
TypeError: ("unsupported operand type(s) for /: 'str' and 'float'", 'occurred at index (column_a)')
I have tried converting the type of the DataFrame to float, as I thought that this might be the issue, however, I am encountering a different problem.
input_df = input_df.astype(float)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-71-2102a8e5c505> in <module>
----> 1 input_df= input_df.astype(float)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
5689 # else, only a single dtype is given
5690 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5691 **kwargs)
5692 return self._constructor(new_data).__finalize__(self)
5693
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, **kwargs)
529
530 def astype(self, dtype, **kwargs):
--> 531 return self.apply('astype', dtype=dtype, **kwargs)
532
533 def convert(self, **kwargs):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
393 copy=align_copy)
394
--> 395 applied = getattr(b, f)(**kwargs)
396 result_blocks = _extend_blocks(applied, result_blocks)
397
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
532 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
533 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 534 **kwargs)
535
536 def _astype(self, dtype, copy=False, errors='raise', values=None,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
631
632 # _astype_nansafe works fine with 1-d only
--> 633 values = astype_nansafe(values.ravel(), dtype, copy=True)
634
635 # TODO(extension)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
700 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
701 # Explicit copy, or required since NumPy can't view from / to object.
--> 702 return arr.astype(dtype, copy=True)
703
704 return arr.view(dtype)
ValueError: could not convert string to float:
I am really not sure what is going wrong. I have tried exporting the DataFrames as a csv and, aside from the indexes which do contain text, the values are all floats. Is this something to do with the indexes perhaps?
As an addendum, I tried using pd.to_numeric outside of a lambda function but it also returned an error:
input_df = pd.to_numeric(input_df, errors='coerce')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-93-7178dce9054b> in <module>
----> 1 input_df = pd.to_numeric(input_df, errors='coerce')
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\tools\numeric.py in to_numeric(arg, errors, downcast)
120 values = np.array([arg], dtype='O')
121 elif getattr(arg, 'ndim', 1) > 1:
--> 122 raise TypeError('arg must be a list, tuple, 1-d array, or Series')
123 else:
124 values = arg
TypeError: arg must be a list, tuple, 1-d array, or Series
You may try something like:
input_df = input_df.apply(lambda x: pd.to_neumeric(x,errors='coerce')).applymap(transform_x)
the input_df is a 2D array but pd.to_neumeric() takes only list, tuple, 1-d array, or Series so you cannot call a dataframe under it.Hence we take the help of lambda x to pass each series individually .
Once all the df has neumeric data, apply your function.