How do select 2nd column or a matrix from a pandas dataframe? - pandas

How do you select column other than the first column?
import pandas as pd
df = pd.read_csv('bio.csv')
df
I could select the first column, i.e., "Index"
df['Index']
However, I could not select the second column, i.e., "Height".
df['Height']
Here is the trace:
KeyError Traceback (most recent call last)
C:\util\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2441 try:
-> 2442 return self._engine.get_loc(key)
2443 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Height'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-8-58aff8413556> in <module>()
----> 1 df['Height']
C:\util\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
1962 return self._getitem_multilevel(key)
1963 else:
-> 1964 return self._getitem_column(key)
1965
1966 def _getitem_column(self, key):
C:\util\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
1969 # get column
1970 if self.columns.is_unique:
-> 1971 return self._get_item_cache(key)
1972
1973 # duplicate columns & possible reduce dimensionality
C:\util\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1643 res = cache.get(item)
1644 if res is None:
-> 1645 values = self._data.get(item)
1646 res = self._box_item_values(item, values)
1647 cache[item] = res
C:\util\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3588
3589 if not isnull(item):
-> 3590 loc = self.items.get_loc(item)
3591 else:
3592 indexer = np.arange(len(self.items))[isnull(self.items)]
C:\util\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2442 return self._engine.get_loc(key)
2443 except KeyError:
-> 2444 return self._engine.get_loc(self._maybe_cast_indexer(key))
2445
2446 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'Height'

Below is the complete answer
import pandas as pd
df = pd.read_csv('bio.csv', sep='[ \t]*,[ \t]*', engine='python')
df['Height']
Theis is the output:
Out[22]: 0 65.78
1 71.52
2 69.40
3 68.22
4 67.79
5 68.70
6 69.80
7 70.01
8 67.90
9 66.78
10 66.49
11 67.62
12 68.30
13 67.12
14 68.28
15 71.09

Related

Dummy variable is not created

I am facing a problem while creating a dummy variable There is a problem with the 'town' column.
it's giving a key error but my syntax is correct.
please help me I didn't understand what is the problem even I am correct from my side.
import pandas as pd
import numpy as np
df= pd.read_csv('homeprices.csv')
df
dummies=pd.get_dummies(df['town'])
dummies
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
c:\users\saurabh singh\appdata\local\programs\python\python37\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2894 try:
-> 2895 return self._engine.get_loc(casted_key)
2896 except KeyError as err:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'town'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-30-b0961e3e5942> in <module>
1 # df = pd.concat([df, pd.get_dummies(df['town'])], axis=1)
----> 2 dummies=pd.get_dummies(df['town'])
3 dummies
c:\users\saurabh singh\appdata\local\programs\python\python37\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2904 if self.columns.nlevels > 1:
2905 return self._getitem_multilevel(key)
-> 2906 indexer = self.columns.get_loc(key)
2907 if is_integer(indexer):
2908 indexer = [indexer]
c:\users\saurabh singh\appdata\local\programs\python\python37\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2895 return self._engine.get_loc(casted_key)
2896 except KeyError as err:
-> 2897 raise KeyError(key) from err
2898
2899 if tolerance is not None:
KeyError: 'town'
df.columns
Index(['town ', 'area', 'price'], dtype='object')
Your column town has a space on it.
Change your column names as follows
df.columns = ['town', 'area', 'price']
After this, you can use
dummies=pd.get_dummies(df['town'])
Or just change df['town'] to df['town ']

Python function to calculate balance for every row corresponding to individual transactions

I am working on a bank statement, corresponding to the output dataframe and an ending balance corresponding to the output['balance'][0] I would like to calculate all balance values for the individual transactions as described below. It's a very straightforward calculation and yet it doesn't seem to be working - is there something quite obvious I am missing? Thanks in advance!
output['balance'] = ''
output['balance'][0] = 21.15
if len(output[amount]) > 0:
return output[balance][i+1].append((output[balance][i]-output[amount][i+1]))
else:
output[balance].append((output[balance][0]))
output[['balance']] = output['Amount'].apply(lambda amount: bal_calc(output, amount))```
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2896 try:
-> 2897 return self._engine.get_loc(key)
2898 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 4.95
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-271-b85947935fca> in <module>
----> 1 output[['balance']] = output['Amount'].apply(lambda amount: bal_calc(output, amount))
~\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
4040 else:
4041 values = self.astype(object).values
-> 4042 mapped = lib.map_infer(values, f, convert=convert_dtype)
4043
4044 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()
<ipython-input-271-b85947935fca> in <lambda>(amount)
----> 1 output[['balance']] = output['Amount'].apply(lambda amount: bal_calc(output, amount))
<ipython-input-270-cbf5ac20716d> in bal_calc(output, amount)
2 output['balance'] = ''
3 output['balance'][0] = 21.15
----> 4 if len(output[amount]) > 0:
5 return output[balance][i+1].append((output[balance][i]-output[amount][i+1]))
6 else:
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2978 if self.columns.nlevels > 1:
2979 return self._getitem_multilevel(key)
-> 2980 indexer = self.columns.get_loc(key)
2981 if is_integer(indexer):
2982 indexer = [indexer]
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2897 return self._engine.get_loc(key)
2898 except KeyError:
-> 2899 return self._engine.get_loc(self._maybe_cast_indexer(key))
2900 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2901 if indexer.ndim > 1 or indexer.size > 1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 4.95
It will be easier to understand your problem you can post your existing dataframe and intended dataframe. From your description I think you can approach calculating balance like this
import pandas as pd
## creating dummy dataframe for testing
arr = np.random.choice(range(500,1000),(10,2))
debit_credit = np.random.choice([0,1], (10))
arr[:,0] = arr[:,0] * debit_credit
arr[:,1] = arr[:,1] * (1-debit_credit)
df=pd.DataFrame(arr, columns=["Debit", "Credit"])
display(df)
## calculating Balance
df["Balance"] = (df.Debit-df.Credit).cumsum()
display(df)
Output
Debit Credit Balance
0 957 0 957
1 0 698 259
2 608 0 867
3 0 969 -102
4 0 766 -868
5 0 551 -1419
6 985 0 -434
7 861 0 427
8 927 0 1354
9 0 923 431
bs['balance'][0] = 21.15
for i in range(1, len(bs)):
bs.loc[i, 'balance'] = bs.loc[i-1, 'balance'] + bs.loc[i, 'Credit'] -bs.loc[i, 'Debit']

KeyError: 6 when adding labels to a regplot

I have a dataframe with the following columns: 'Project', 'Hours', 'Revenue', 'Rate'
I try to build a regplot with Rate on X-axis and Revenue on the Y-axis and add Project name label to the dots at my regplot. Here is my code:
ax=sns.regplot(x = 'Revenue', y = 'Rate',data= df_hours_revenue, ci = None)
sns.set(rc={'figure.figsize':(20,10)})
# ax.set_xlabel('')
# ax.set_ylabel('Rate (£/hour)')
sns.set_context('poster')
plt.title("Rate vs. project revenue 2018-2019")
for line in range(0,df_hours_revenue.shape[0]):
ax.text(df_hours_revenue.Revenue[line], df_hours_revenue.Rate[line],
df_hours_revenue.Project[line], horizontalalignment='left',
size='small', color='Black', weight='normal')
The result is a graph with some labels added and this output error which I do not understand:
KeyError Traceback (most recent call last)
<ipython-input-55-fdcfd6157523> in <module>
9
10 for line in range(0,df_hours_revenue.shape[0]):
---> 11 ax.text(df_hours_revenue.Revenue[line], df_hours_revenue.Rate[line],
12 df_hours_revenue.Project[line], horizontalalignment='left',
13 size='small', color='Black', weight='normal')
~/anaconda3/lib/python3.7/site-packages/pandas/core/series.py in __getitem__(self, key)
866 key = com.apply_if_callable(key, self)
867 try:
--> 868 result = self.index.get_value(self, key)
869
870 if not is_scalar(result):
~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
4373 try:
4374 return self._engine.get_value(s, k,
-> 4375 tz=getattr(series.dtype, 'tz', None))
4376 except KeyError as e1:
4377 if len(self) > 0 and (self.holds_integer() or self.is_boolean()):
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 6
If you want to access Series' item by its position, you should be using .iloc, e.g.:
df_hours_revenue.Revenue.iloc[line]
Read more here.

pandas groupby got KeyError

I am using pandas to calculate some stats of a data file and got some error. It's reproducible by this simple sample code:
import pandas as pd
df = pd.DataFrame({'A': [1,2,3,4,5,6,7,8,9],
'B': [1,2,3,1,2,3,1,2,3],
'C': ['a', 'b', 'a', 'b', 'a', 'b', 'a','a', 'b']})
def testFun2(x):
return pd.DataFrame({'xlen': x.shape[0]})
def testFun(x):
b = x['B']
print "b equals to {}".format(b) # This line prints okay
c = x['C']
out = pd.DataFrame()
for a in x['A'].unique():
subx = x[x.A == a]
subxg = testFun2(subx)
out = pd.concat([out, subxg])
return out
df.groupby(['B', 'C']).apply(lambda x: testFun(x))
The whole error output look like this:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-21-979d23aa904c> in <module>()
18 return out
19
---> 20 df.groupby(['B', 'C']).apply(lambda x: testFun(x))
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\groupby\groupby.pyc in apply(self, func, *args, **kwargs)
928
929 with _group_selection_context(self):
--> 930 return self._python_apply_general(f)
931
932 return result
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\groupby\groupby.pyc in _python_apply_general(self, f)
934 def _python_apply_general(self, f):
935 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 936 self.axis)
937
938 return self._wrap_applied_output(
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\groupby\groupby.pyc in apply(self, f, data, axis)
2271 # group might be modified
2272 group_axes = _get_axes(group)
-> 2273 res = f(group)
2274 if not _is_indexed_like(res, group_axes):
2275 mutated = True
<ipython-input-21-979d23aa904c> in <lambda>(x)
18 return out
19
---> 20 df.groupby(['B', 'C']).apply(lambda x: testFun(x))
<ipython-input-21-979d23aa904c> in testFun(x)
9
10 def testFun(x):
---> 11 b = x['B']
12 c = x['C']
13 out = pd.DataFrame()
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\frame.pyc in __getitem__(self, key)
2686 return self._getitem_multilevel(key)
2687 else:
-> 2688 return self._getitem_column(key)
2689
2690 def _getitem_column(self, key):
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\frame.pyc in _getitem_column(self, key)
2693 # get column
2694 if self.columns.is_unique:
-> 2695 return self._get_item_cache(key)
2696
2697 # duplicate columns & possible reduce dimensionality
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\generic.pyc in _get_item_cache(self, item)
2487 res = cache.get(item)
2488 if res is None:
-> 2489 values = self._data.get(item)
2490 res = self._box_item_values(item, values)
2491 cache[item] = res
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\internals.pyc in get(self, item, fastpath)
4113
4114 if not isna(item):
-> 4115 loc = self.items.get_loc(item)
4116 else:
4117 indexer = np.arange(len(self.items))[isna(self.items)]
C:\Users\Administrator\Anaconda2\lib\site-packages\pandas\core\indexes\base.pyc in get_loc(self, key, method, tolerance)
3078 return self._engine.get_loc(key)
3079 except KeyError:
-> 3080 return self._engine.get_loc(self._maybe_cast_indexer(key))
3081
3082 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'B'
However, I found that if the testFun2 is changed to something simpler, like:
def testFun2(x):
return 1
then the error won't occur. This is very confusing to me - the testFun2 has nothing to do with the line b = x['B'], right? Why did I get the error in the first place? Thanks!

finding max element from column of dataframe gives error

I am trying to find largest element from a column in my DataFrame but this gives the following error.
And i have tested that it only gives error to this column name only and rest of the columns just work fine.
This is my DataFrame created from a file posts1.csv
import pandas as pd
posts_n = pd.read_csv('posts1.csv',encoding='latin-1')
posts=posts_n.fillna(0)
When i try to find max element from a particular column ie "score" ,
max_post = posts['score'].max()
max_post
i get following error
KeyError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2441 try:
-> 2442 return self._engine.get_loc(key)
2443 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'score'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-12-09c353ba0de2> in <module>()
34 #MAximum posts done by a user
35
---> 36 max_post = posts['score'].max()
37 max_post
38 #scr=posts.iloc[:,4]
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
1962 return self._getitem_multilevel(key)
1963 else:
-> 1964 return self._getitem_column(key)
1965
1966 def _getitem_column(self, key):
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
1969 # get column
1970 if self.columns.is_unique:
-> 1971 return self._get_item_cache(key)
1972
1973 # duplicate columns & possible reduce dimensionality
~\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1643 res = cache.get(item)
1644 if res is None:
-> 1645 values = self._data.get(item)
1646 res = self._box_item_values(item, values)
1647 cache[item] = res
~\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3588
3589 if not isnull(item):
-> 3590 loc = self.items.get_loc(item)
3591 else:
3592 indexer = np.arange(len(self.items))[isnull(self.items)]
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2442 return self._engine.get_loc(key)
2443 except KeyError:
-> 2444 return self._engine.get_loc(self._maybe_cast_indexer(key))
2445
2446 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'score'
This is how data looks
posts1.csv
'score' is not in the (column) index, so instead of loading in the first line of the csv as the header line, you read it in as data.
try the following:
posts = pd.read_csv('posts1.csv', header=1)