matplotlib issue: how to erase this one? - pandas

import matplotlib.pyplot as plt
import pandas as pd
df = pd.DataFrame(np.random.randn(30,3)*100+1000,
index=pd.date_range(start='2018-09-01', periods=30, freq='D'),
columns=['1', '2', '3'])
df[:5].plot.bar()
Seeing the graph, each x label has '00:00:00', which is unnecessary.
So I tried to delete these by writing this code.
df[:5].plot.bar(x=df[:5].index.date, stacked=True)
But it has an error like this.
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-52-92dd89374fec> in <module>
----> 1 df[:5].plot.bar(x=df[:5].index.date, stacked=True)
~\anaconda3\lib\site-packages\pandas\plotting\_core.py in bar(self, x, y, **kwargs)
1001 >>> ax = df.plot.bar(x='lifespan', rot=0)
1002 """
-> 1003 return self(kind="bar", x=x, y=y, **kwargs)
1004
1005 def barh(self, x=None, y=None, **kwargs):
~\anaconda3\lib\site-packages\pandas\plotting\_core.py in __call__(self, *args, **kwargs)
810 if is_integer(x) and not data.columns.holds_integer():
811 x = data_cols[x]
--> 812 elif not isinstance(data[x], ABCSeries):
813 raise ValueError("x must be a label or position")
814 data = data.set_index(x)
~\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2804 if is_iterator(key):
2805 key = list(key)
-> 2806 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
2807
2808 # take() does not accept boolean indexers
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1550 keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
1551
-> 1552 self._validate_read_indexer(
1553 keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
1554 )
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1638 if missing == len(indexer):
1639 axis_name = self.obj._get_axis_name(axis)
-> 1640 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
1641
1642 # We (temporarily) allow for some missing keys with .loc, except in
KeyError: "None of [Index([2018-09-01, 2018-09-02, 2018-09-03, 2018-09-04, 2018-09-05], dtype='object')] are in the [columns]"
What's the problem? I just followed the book, but this error came out.

You can change index values before selecting first 5 rows:
df.index = df.index.date
df[:5].plot.bar()
Or:
df.rename(lambda x: x.date())[:5].plot.bar()

Related

IndexError: positional indexers are out-of-bounds

I am new to Python and don't know how to fix this error. I am building a sentiment analysis classifier using word2vec.
Following is the code where I got the error:
pos_train_w2v = wordvec_df.iloc[:18046,:]
pos_test_w2v = wordvec_df.iloc[18046:,:]
# splitting data into training and validation set
xtrain_w2v, xvalid_w2v, ytrain, yvalid = train_test_split(pos_train_w2v, positive_train['Label'], random_state=42, test_size=0.3)
xtrain_w2v = pos_train_w2v.iloc[ytrain.index,:]
xvalid_w2v = pos_train_w2v.iloc[yvalid.index,:]
Following is the error I received:
IndexError Traceback (most recent call last)
in ()
5 xtrain_w2v, xvalid_w2v, ytrain, yvalid = train_test_split(pos_train_w2v, positive_train['Label'], random_state=42, test_size=0.3)
6
----> 7 xtrain_w2v = pos_train_w2v.iloc[ytrain.index,:]
8 xvalid_w2v = pos_train_w2v.iloc[yvalid.index,:]
3 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in getitem(self, key)
923 with suppress(KeyError, IndexError):
924 return self.obj._get_value(*key, takeable=self._takeable)
--> 925 return self._getitem_tuple(key)
926 else:
927 # we by definition only have the 0th axis
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
1504 def _getitem_tuple(self, tup: tuple):
1505
-> 1506 self._has_valid_tuple(tup)
1507 with suppress(IndexingError):
1508 return self._getitem_lowerdim(tup)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _has_valid_tuple(self, key)
752 for i, k in enumerate(key):
753 try:
--> 754 self._validate_key(k, i)
755 except ValueError as err:
756 raise ValueError(
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _validate_key(self, key, axis)
1422 # check that the key does not exceed the maximum size of the index
1423 if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis):
-> 1424 raise IndexError("positional indexers are out-of-bounds")
1425 else:
1426 raise ValueError(f"Can only index by location with a [{self._valid_types}]")
IndexError: positional indexers are out-of-bounds

Getting error message when trying to get at risk numbers below KM-plot (lifelines)

I've used lifelines a lot, but when I'm re-running old code that previously worked fine I get the following error: KeyError: "None of [Index(['At risk', 'Censored', 'Events'], dtype='object')] are in the [index]"
I'm guessing there have been some changes to the code for displaying at-risk counts, but I can't find any evidence of it in the lifelines documentation. I am using version 27.0
Snippet of the table with data
index  t2p   O
1      354   False
2      113   False
3      1222  False
4      13    True
5      59    False
6      572   False
Code:
ax = plt.subplot(111)
m = KaplanMeierFitter()
ax = m.fit(h.t2p, h.O, label='PPI').plot_cumulative_density(ax=ax,ci_show=False)
add_at_risk_counts(m)
Full error:
KeyError Traceback (most recent call last)
<ipython-input-96-a8ce3ea9e60c> in <module>
4 ax = m.fit(h.t2p, h.O, label='PPI').plot_cumulative_density(ax=ax,ci_show=False)
5
----> 6 add_at_risk_counts(m)
7
8
~\AppData\Local\Continuum\anaconda3\lib\site-packages\lifelines\plotting.py in add_at_risk_counts(labels, rows_to_show, ypos, xticks, ax, at_risk_count_from_start_of_period, *fitters, **kwargs)
510 .rename({"at_risk": "At risk", "censored": "Censored", "observed": "Events"})
511 )
--> 512 counts.extend([int(c) for c in event_table_slice.loc[rows_to_show]])
513
514 if n_rows > 1:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1766
1767 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1768 return self._getitem_axis(maybe_callable, axis=axis)
1769
1770 def _is_scalar_access(self, key: Tuple):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1952 raise ValueError("Cannot index with multidimensional key")
1953
-> 1954 return self._getitem_iterable(key, axis=axis)
1955
1956 # nested tuple slicing
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_iterable(self, key, axis)
1593 else:
1594 # A collection of keys
-> 1595 keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
1596 return self.obj._reindex_with_indexers(
1597 {axis: [keyarr, indexer]}, copy=True, allow_dups=True
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1551
1552 self._validate_read_indexer(
-> 1553 keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
1554 )
1555 return keyarr, indexer
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1638 if missing == len(indexer):
1639 axis_name = self.obj._get_axis_name(axis)
-> 1640 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
1641
1642 # We (temporarily) allow for some missing keys with .loc, except in
KeyError: "None of [Index(['At risk', 'Censored', 'Events'], dtype='object')] are in the [index]"

Pandas dropna throwing ValueError: "Cannot convert non-finite values (NA or inf) to integer"

Pandas: 0.25.3
Python: 3.7.4
I have a data frame, and I want to remove the columns which contain only NaN values. That should be easy, because there is a Pandas DataFrame function which does exactly that—dropna. Here's my code:
long_summary = long_summary.dropna(axis='columns', how='all')
But that simple line throws an exception:
ValueError: Cannot convert non-finite values (NA or inf) to integer
I cannot see how calling dropna would lead to this exception. What is going on and how do I fix it?
I'll include the whole exception stack just-in-case that makes the problem clearer:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-88-b4926abd4d81> in <module>
----> 1 long_summary = long_summary.dropna(axis='columns', how='all')
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\frame.py in dropna(self, axis, how, thresh, subset, inplace)
4860 agg_obj = self.take(indices, axis=agg_axis)
4861
-> 4862 count = agg_obj.count(axis=agg_axis)
4863
4864 if thresh is not None:
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\frame.py in count(self, axis, level, numeric_only)
7848 result = Series(counts, index=frame._get_agg_axis(axis))
7849
-> 7850 return result.astype("int64")
7851
7852 def _count_level(self, level, axis=0, numeric_only=False):
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
5880 # else, only a single dtype is given
5881 new_data = self._data.astype(
-> 5882 dtype=dtype, copy=copy, errors=errors, **kwargs
5883 )
5884 return self._constructor(new_data).__finalize__(self)
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, **kwargs)
579
580 def astype(self, dtype, **kwargs):
--> 581 return self.apply("astype", dtype=dtype, **kwargs)
582
583 def convert(self, **kwargs):
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
436 kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)
437
--> 438 applied = getattr(b, f)(**kwargs)
439 result_blocks = _extend_blocks(applied, result_blocks)
440
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
557
558 def astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):
--> 559 return self._astype(dtype, copy=copy, errors=errors, values=values, **kwargs)
560
561 def _astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\internals\blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
641 # _astype_nansafe works fine with 1-d only
642 vals1d = values.ravel()
--> 643 values = astype_nansafe(vals1d, dtype, copy=True, **kwargs)
644
645 # TODO(extension)
c:\users\timregan\appdata\local\programs\python\python37\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
698 if not np.isfinite(arr).all():
699 raise ValueError(
--> 700 "Cannot convert non-finite values (NA or inf) to " "integer"
701 )
702
ValueError: Cannot convert non-finite values (NA or inf) to integer
(N.B. the data types of my columns are int64, Int32, and float64)
In the comments Scott asked for data to reproduce this issue. The redacted CSV is available on Dropbox here.
df = pd.read_csv('E:\\Temp\\dropna.csv')
df.dropna(axis='columns', how='all')
But be warned, the CSV is 3.3 GB and the resulting data frame has over 60 million rows. I tried cutting out rows, but it seems to need to be this long to trigger the error.

TypeError: dtype '<class 'datetime.timedelta'>' not understood

I have two dates where the difference of the dates determine how many days the user has been active.
df['days_active'] = df['last_login'] - df['first_login']
Then I use the datetime.timedelta days attribute on valid objects, which used to work until I updated to the current pandas
df['days_active'] = df['days_active'].astype(dt.timedelta).map(lambda x: np.nan if pd.isnull(x) else x.days)
TypeError Traceback (most recent call last)
<ipython-input-8-335b54b7b187> in <module>()
1 df['days_active'] = df['last_login'] - df['first_login']
----> 2 df['days_active'] = df['days_active'].astype(dt.timedelta).map(lambda x: np.nan if pd.isnull(x) else x.days)
3 df['weeks_active'] = df['days_active']/7
4 df['weekly_min_avg'] = df['total_minutes']/df['weeks_active']
5 frames
/usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
5689 # else, only a single dtype is given
5690 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5691 **kwargs)
5692 return self._constructor(new_data).__finalize__(self)
5693
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/managers.py in astype(self, dtype, **kwargs)
529
530 def astype(self, dtype, **kwargs):
--> 531 return self.apply('astype', dtype=dtype, **kwargs)
532
533 def convert(self, **kwargs):
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
393 copy=align_copy)
394
--> 395 applied = getattr(b, f)(**kwargs)
396 result_blocks = _extend_blocks(applied, result_blocks)
397
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
532 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
533 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 534 **kwargs)
535
536 def _astype(self, dtype, copy=False, errors='raise', values=None,
/usr/local/lib/python3.6/dist-packages/pandas/core/internals/blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
593
594 # convert dtypes if needed
--> 595 dtype = pandas_dtype(dtype)
596 # astype processing
597 if is_dtype_equal(self.dtype, dtype):
/usr/local/lib/python3.6/dist-packages/pandas/core/dtypes/common.py in pandas_dtype(dtype)
2027 return npdtype
2028 elif npdtype.kind == 'O':
-> 2029 raise TypeError("dtype '{}' not understood".format(dtype))
2030
2031 return npdtype
TypeError: dtype '<class 'datetime.timedelta'>' not understood
Thanks to @root for the solution to this issue.
Changing
df['days_active'] = df['days_active'].astype(dt.timedelta).map(lambda x: np.nan if pd.isnull(x) else x.days)
To
df['days_active'] = df['days_active'].dt.days
should solve the issue

pandas groupby got KeyError

I am using pandas to calculate some stats of a data file and got some error. It's reproducible by this simple sample code:
import pandas as pd
df = pd.DataFrame({'A': [1,2,3,4,5,6,7,8,9],
'B': [1,2,3,1,2,3,1,2,3],
'C': ['a', 'b', 'a', 'b', 'a', 'b', 'a','a', 'b']})
def testFun2(x):
return pd.DataFrame({'xlen': x.shape[0]})
def testFun(x):
b = x['B']
print "b equals to {}".format(b) # This line prints okay
c = x['C']
out = pd.DataFrame()
for a in x['A'].unique():
subx = x[x.A == a]
subxg = testFun2(subx)
out = pd.concat([out, subxg])
return out
df.groupby(['B', 'C']).apply(lambda x: testFun(x))
The whole error output looks like this:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-21-979d23aa904c> in <module>()
18 return out
19
---> 20 df.groupby(['B', 'C']).apply(lambda x: testFun(x))
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\groupby\groupby.pyc in apply(self, func, *args, **kwargs)
928
929 with _group_selection_context(self):
--> 930 return self._python_apply_general(f)
931
932 return result
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\groupby\groupby.pyc in _python_apply_general(self, f)
934 def _python_apply_general(self, f):
935 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 936 self.axis)
937
938 return self._wrap_applied_output(
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\groupby\groupby.pyc in apply(self, f, data, axis)
2271 # group might be modified
2272 group_axes = _get_axes(group)
-> 2273 res = f(group)
2274 if not _is_indexed_like(res, group_axes):
2275 mutated = True
<ipython-input-21-979d23aa904c> in <lambda>(x)
18 return out
19
---> 20 df.groupby(['B', 'C']).apply(lambda x: testFun(x))
<ipython-input-21-979d23aa904c> in testFun(x)
9
10 def testFun(x):
---> 11 b = x['B']
12 c = x['C']
13 out = pd.DataFrame()
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\frame.pyc in __getitem__(self, key)
2686 return self._getitem_multilevel(key)
2687 else:
-> 2688 return self._getitem_column(key)
2689
2690 def _getitem_column(self, key):
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\frame.pyc in _getitem_column(self, key)
2693 # get column
2694 if self.columns.is_unique:
-> 2695 return self._get_item_cache(key)
2696
2697 # duplicate columns & possible reduce dimensionality
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\generic.pyc in _get_item_cache(self, item)
2487 res = cache.get(item)
2488 if res is None:
-> 2489 values = self._data.get(item)
2490 res = self._box_item_values(item, values)
2491 cache[item] = res
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\internals.pyc in get(self, item, fastpath)
4113
4114 if not isna(item):
-> 4115 loc = self.items.get_loc(item)
4116 else:
4117 indexer = np.arange(len(self.items))[isna(self.items)]
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\indexes\base.pyc in get_loc(self, key, method, tolerance)
3078 return self._engine.get_loc(key)
3079 except KeyError:
-> 3080 return self._engine.get_loc(self._maybe_cast_indexer(key))
3081
3082 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'B'
However, I found that if the testFun2 is changed to something simpler, like:
def testFun2(x):
return 1
then the error won't occur. This is very confusing to me - the testFun2 has nothing to do with the line b = x['B'], right? Why did I get the error in the first place? Thanks!