How to get a specific column from a data frame? - pandas

I am working with a data frame that has three columns: comment_id, class and comment_message. I need to store all three columns, but I get an error when I try to store the column called class. My complete code looks as follows:
from sklearn import svm
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
df1=pd.read_csv("C:/Users/acamagon/Downloads/dataSet",sep=',')
#print(df1)
comment_id = df1['comment_id']
comment_message = df1['comment_message']
print(comment_message)
Here is where the problem occurs:
#Here is the problem
classification = df1['class']
The file looks as follows:
comment_id,comment_message,class
10154395643583692_10154397346673692,quisiera saber el precio y las caracteristicas del selulae samsung s5 xfavoor,1
10154395643583692_10154397434578692,"buenos dias, necesito que le den seguimiento a un telefono que deje en garantia desde octubre en el cac urban center de xalapa, veracruz. ya van 4 veces y me dicen que el telefono no esta y ya va para 3 meses que lo deje. espero me den una respuesta pronto. me comunico al *111 y solo me dicen que el folio sigue en pendiente.",1
10154395643583692_10154397511368692,no sirve su aplicacion de mi telcel... [[PHOTO]],0
10154395643583692_10154397598508692,"buenas tardes, gracias por su atencion brindada... pude resolver mi duda y asi sabre que es lo mejor para mi. saludos.",1
10154394898978692_10154397173938692,q precio tiene el plan????,2
10154394898978692_10154397265133692,para solicitarlo?,1
The error is the following:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
C:\Program Files\Anaconda3\lib\site-packages\pandas\indexes\base.py in get_loc(self, key, method, tolerance)
1944 try:
-> 1945 return self._engine.get_loc(key)
1946 except KeyError:
pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:4154)()
pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:4018)()
pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12368)()
pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12322)()
KeyError: 'class'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-54-f52e2494564b> in <module>()
15
16
---> 17 classification = df1['class']
18
19
C:\Program Files\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
1995 return self._getitem_multilevel(key)
1996 else:
-> 1997 return self._getitem_column(key)
1998
1999 def _getitem_column(self, key):
C:\Program Files\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2002 # get column
2003 if self.columns.is_unique:
-> 2004 return self._get_item_cache(key)
2005
2006 # duplicate columns & possible reduce dimensionality
C:\Program Files\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1348 res = cache.get(item)
1349 if res is None:
-> 1350 values = self._data.get(item)
1351 res = self._box_item_values(item, values)
1352 cache[item] = res
C:\Program Files\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3288
3289 if not isnull(item):
-> 3290 loc = self.items.get_loc(item)
3291 else:
3292 indexer = np.arange(len(self.items))[isnull(self.items)]
C:\Program Files\Anaconda3\lib\site-packages\pandas\indexes\base.py in get_loc(self, key, method, tolerance)
1945 return self._engine.get_loc(key)
1946 except KeyError:
-> 1947 return self._engine.get_loc(self._maybe_cast_indexer(key))
1948
1949 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:4154)()
pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:4018)()
pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12368)()
pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12322)()
KeyError: 'class'

Try this:
df1.columns = [c.strip() for c in list(df1.columns.values)]
print(df1["class"])
The problem was that your class header contained whitespace. Stripping that whitespace with .strip() allows pandas to find the header, thus avoiding the KeyError.

Related

Why dataframe is not displayed in console using pyspark?

This is the session object I have created for dataframe
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
This is the code for Dataframe creation
data = [("James","","Smith",30,"M",60000),
("Michael","Rose","",50,"M",70000),
("Robert","","Williams",42,"",400000),
("Maria","Anne","Jones",38,"F",500000),
("Jen","Mary","Brown",45,"F",0)]
columns = ["first_name","middle_name","last_name","Age","gender","salary"]
pysparkDF = spark.createDataFrame(data = data, schema = columns)
pysparkDF.printSchema()
pysparkDF.show(truncate=False)
I want the dataframe to be displayed in the console with its six columns, but instead it shows a Py4JJavaError, along with a message that the output exceeds the size limit:
Output exceeds the size limit. Open the full output data in a text editor
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
Cell In[28], line 10
8 pysparkDF = spark.createDataFrame(data = data, schema = columns)
9 pysparkDF.printSchema()
---> 10 pysparkDF.show(truncate=False)
File c:\Users\baps\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pyspark\sql\dataframe.py:615, in DataFrame.show(self, n, truncate, vertical)
610 except ValueError:
611 raise TypeError(
612 "Parameter 'truncate={}' should be either bool or int.".format(truncate)
613 )
--> 615 print(self._jdf.showString(n, int_truncate, vertical))
File c:\Users\baps\AppData\Local\Programs\Python\Python38-32\lib\site-packages\py4j\java_gateway.py:1321, in JavaMember.__call__(self, *args)
1315 command = proto.CALL_COMMAND_NAME +\
1316 self.command_header +\
1317 args_command +\
1318 proto.END_COMMAND_PART
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
1325 temp_arg._detach()
...
at java.lang.ProcessImpl.<init>(ProcessImpl.java:453)
at java.lang.ProcessImpl.start(ProcessImpl.java:140)
at java.lang.ProcessBuilder.start(ProcessBuilder.java:1029)
... 30 more

IndexError: positional indexers are out-of-bounds

I am new to Python and don't know how to fix this error. I am building a sentiment analysis classifier using word2vec.
Following is the code where I got the error:
pos_train_w2v = wordvec_df.iloc[:18046,:]
pos_test_w2v = wordvec_df.iloc[18046:,:]
Splitting the data into training and validation sets:
xtrain_w2v, xvalid_w2v, ytrain, yvalid = train_test_split(pos_train_w2v, positive_train['Label'], random_state=42, test_size=0.3)
xtrain_w2v = pos_train_w2v.iloc[ytrain.index,:]
xvalid_w2v = pos_train_w2v.iloc[yvalid.index,:]
The following is the error I received:
IndexError Traceback (most recent call last)
in ()
5 xtrain_w2v, xvalid_w2v, ytrain, yvalid = train_test_split(pos_train_w2v, positive_train['Label'], random_state=42, test_size=0.3)
6
----> 7 xtrain_w2v = pos_train_w2v.iloc[ytrain.index,:]
8 xvalid_w2v = pos_train_w2v.iloc[yvalid.index,:]
3 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in getitem(self, key)
923 with suppress(KeyError, IndexError):
924 return self.obj._get_value(*key, takeable=self._takeable)
--> 925 return self._getitem_tuple(key)
926 else:
927 # we by definition only have the 0th axis
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
1504 def _getitem_tuple(self, tup: tuple):
1505
-> 1506 self._has_valid_tuple(tup)
1507 with suppress(IndexingError):
1508 return self._getitem_lowerdim(tup)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _has_valid_tuple(self, key)
752 for i, k in enumerate(key):
753 try:
--> 754 self._validate_key(k, i)
755 except ValueError as err:
756 raise ValueError(
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _validate_key(self, key, axis)
1422 # check that the key does not exceed the maximum size of the index
1423 if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis):
-> 1424 raise IndexError("positional indexers are out-of-bounds")
1425 else:
1426 raise ValueError(f"Can only index by location with a [{self._valid_types}]")
IndexError: positional indexers are out-of-bounds

Getting error message when trying to get at risk numbers below KM-plot (lifelines)

I've used lifelines a lot, but when I'm re-running old code that previously worked fine I get the following error: KeyError: "None of [Index(['At risk', 'Censored', 'Events'], dtype='object')] are in the [index]"
I'm guessing there have been some changes to the code that displays the at-risk counts, but I can't find any evidence of it in the lifelines documentation. I am using version 27.0.
Snippet of the table with data
index
t2p
O
1
354
False
2
113
False
3
1222
False
4
13
True
5
59
False
6
572
False
Code:
ax = plt.subplot(111)
m = KaplanMeierFitter()
ax = m.fit(h.t2p, h.O, label='PPI').plot_cumulative_density(ax=ax,ci_show=False)
add_at_risk_counts(m)
Full error:
KeyError Traceback (most recent call last)
<ipython-input-96-a8ce3ea9e60c> in <module>
4 ax = m.fit(h.t2p, h.O, label='PPI').plot_cumulative_density(ax=ax,ci_show=False)
5
----> 6 add_at_risk_counts(m)
7
8
~\AppData\Local\Continuum\anaconda3\lib\site-packages\lifelines\plotting.py in add_at_risk_counts(labels, rows_to_show, ypos, xticks, ax, at_risk_count_from_start_of_period, *fitters, **kwargs)
510 .rename({"at_risk": "At risk", "censored": "Censored", "observed": "Events"})
511 )
--> 512 counts.extend([int(c) for c in event_table_slice.loc[rows_to_show]])
513
514 if n_rows > 1:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1766
1767 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1768 return self._getitem_axis(maybe_callable, axis=axis)
1769
1770 def _is_scalar_access(self, key: Tuple):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1952 raise ValueError("Cannot index with multidimensional key")
1953
-> 1954 return self._getitem_iterable(key, axis=axis)
1955
1956 # nested tuple slicing
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_iterable(self, key, axis)
1593 else:
1594 # A collection of keys
-> 1595 keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
1596 return self.obj._reindex_with_indexers(
1597 {axis: [keyarr, indexer]}, copy=True, allow_dups=True
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1551
1552 self._validate_read_indexer(
-> 1553 keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
1554 )
1555 return keyarr, indexer
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1638 if missing == len(indexer):
1639 axis_name = self.obj._get_axis_name(axis)
-> 1640 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
1641
1642 # We (temporarily) allow for some missing keys with .loc, except in
KeyError: "None of [Index(['At risk', 'Censored', 'Events'], dtype='object')] are in the [index]"

matplotlib issue: how to erase this one?

import maplotlib.pyplot as plt
import pandas as pd
df = pd.DataFrame(np.random.randn(30,3)*100+1000,
index=pd.date_range(start='2018-09-01', periods=30, freq='D'),
columns=['1', '2', 3'])
df[:5].plot.bar()
Looking at the graph, each x label has '00:00:00', which is unnecessary.
So I tried to delete these by writing this code.
df[:5].plot.bar(x=df[:5].index.date
But it has an error like this.
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-52-92dd89374fec> in <module>
----> 1 df[:5].plot.bar(x=df[:5].index.date, stacked=True)
~\anaconda3\lib\site-packages\pandas\plotting\_core.py in bar(self, x, y, **kwargs)
1001 >>> ax = df.plot.bar(x='lifespan', rot=0)
1002 """
-> 1003 return self(kind="bar", x=x, y=y, **kwargs)
1004
1005 def barh(self, x=None, y=None, **kwargs):
~\anaconda3\lib\site-packages\pandas\plotting\_core.py in __call__(self, *args, **kwargs)
810 if is_integer(x) and not data.columns.holds_integer():
811 x = data_cols[x]
--> 812 elif not isinstance(data[x], ABCSeries):
813 raise ValueError("x must be a label or position")
814 data = data.set_index(x)
~\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2804 if is_iterator(key):
2805 key = list(key)
-> 2806 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
2807
2808 # take() does not accept boolean indexers
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1550 keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
1551
-> 1552 self._validate_read_indexer(
1553 keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
1554 )
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1638 if missing == len(indexer):
1639 axis_name = self.obj._get_axis_name(axis)
-> 1640 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
1641
1642 # We (temporarily) allow for some missing keys with .loc, except in
KeyError: "None of [Index([2018-09-01, 2018-09-02, 2018-09-03, 2018-09-04, 2018-09-05], dtype='object')] are in the [columns]"
What's the problem? I just followed the book, but I still got this error.
You can change index values before selecting first 5 rows:
df.index = df.index.date
df[:5].plot.bar()
Or:
df.rename(lambda x: x.date())[:5].plot.bar()

ParserError: Error tokenizing data. C error: Expected x fields in line y, saw z

I am new to Python. I ran across the error below when trying to read dozens of zip files into a single df. Each zip file contains a CSV file. I searched extensively on Stack Overflow and haven't found a solution yet. My suspicion is that this CSV contains Chinese characters as well as URLs.
my code:
import os
import zipfile
import pandas as pd
import glob
file_path = os.getcwd() #obtain file path
allFiles = glob.glob(file_path + "/*.zip") #return list of all file names
list_ = []
for file in allFiles:
with zipfile.ZipFile(file) as f:
df = pd.read_csv(f.open(f.namelist()[0]),encoding='latin-1',header=None)
list_.append(df)
df = pd.concat(list_)
df
and here is the error message:
ParserError Traceback (most recent call last)
<ipython-input-25-471e9d0ab709> in <module>()
6 for file in allFiles:
7 with zipfile.ZipFile(file) as f:
----> 8 df = pd.read_csv(f.open(f.namelist()[0]),encoding='latin-1',skiprows=[0, 1],header=None)
9 list_.append(df)
10 df = pd.concat(list_)
D:\anaconda\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
653 skip_blank_lines=skip_blank_lines)
654
--> 655 return _read(filepath_or_buffer, kwds)
656
657 parser_f.__name__ = name
D:\anaconda\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
409
410 try:
--> 411 data = parser.read(nrows)
412 finally:
413 parser.close()
D:\anaconda\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
1003 raise ValueError('skipfooter not supported for iteration')
1004
-> 1005 ret = self._engine.read(nrows)
1006
1007 if self.options.get('as_recarray'):
D:\anaconda\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
1746 def read(self, nrows=None):
1747 try:
-> 1748 data = self._reader.read(nrows)
1749 except StopIteration:
1750 if self._first_chunk:
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.read (pandas\_libs\parsers.c:10862)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory (pandas\_libs\parsers.c:11138)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_rows (pandas\_libs\parsers.c:11884)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._tokenize_rows (pandas\_libs\parsers.c:11755)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.raise_parser_error (pandas\_libs\parsers.c:28765)()
ParserError: Error tokenizing data. C error: Expected 1 fields in line 9, saw 2
I tried using "error_bad_line=False", yet it returns another error.
Eventually, I managed to bypass this problem by renaming both the zip file titles and the CSV file titles, keeping only numbers for my own reference, and by changing the read_csv encoding to "latin-1".