ParserError: Error tokenizing data. C error: Expected x fields in line y, saw z - pandas

I am new to Python. I ran across the error below when trying to read dozens of zip files into a single DataFrame. Each zip file contains one CSV file. I searched extensively on Stack Overflow and haven't found a solution yet. My suspicion is that the CSV contains Chinese characters as well as URLs.
My code:
import os
import zipfile
import pandas as pd
import glob

file_path = os.getcwd()  # obtain file path
allFiles = glob.glob(file_path + "/*.zip")  # return list of all zip file names
list_ = []
for file in allFiles:
    with zipfile.ZipFile(file) as f:
        df = pd.read_csv(f.open(f.namelist()[0]), encoding='latin-1', header=None)
        list_.append(df)
df = pd.concat(list_)
df
and here is the error message:
ParserError Traceback (most recent call last)
<ipython-input-25-471e9d0ab709> in <module>()
6 for file in allFiles:
7 with zipfile.ZipFile(file) as f:
----> 8 df = pd.read_csv(f.open(f.namelist()[0]),encoding='latin-1',skiprows=[0, 1],header=None)
9 list_.append(df)
10 df = pd.concat(list_)
D:\anaconda\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
653 skip_blank_lines=skip_blank_lines)
654
--> 655 return _read(filepath_or_buffer, kwds)
656
657 parser_f.__name__ = name
D:\anaconda\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
409
410 try:
--> 411 data = parser.read(nrows)
412 finally:
413 parser.close()
D:\anaconda\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
1003 raise ValueError('skipfooter not supported for iteration')
1004
-> 1005 ret = self._engine.read(nrows)
1006
1007 if self.options.get('as_recarray'):
D:\anaconda\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
1746 def read(self, nrows=None):
1747 try:
-> 1748 data = self._reader.read(nrows)
1749 except StopIteration:
1750 if self._first_chunk:
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.read (pandas\_libs\parsers.c:10862)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory (pandas\_libs\parsers.c:11138)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_rows (pandas\_libs\parsers.c:11884)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._tokenize_rows (pandas\_libs\parsers.c:11755)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.raise_parser_error (pandas\_libs\parsers.c:28765)()
ParserError: Error tokenizing data. C error: Expected 1 fields in line 9, saw 2

I tried using error_bad_lines=False, yet it returns another error.
Eventually, I managed to bypass this problem by renaming both the zip file titles and the CSV file titles, keeping only numbers for my own reference, and changing the read_csv encoding to 'latin-1'.
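For reference, here is a minimal sketch of one way to skip the malformed rows rather than renaming files, assuming an older pandas where error_bad_lines is still accepted (newer releases use on_bad_lines='skip' instead); the zip-handling loop mirrors the code above:

import glob
import os
import zipfile
import pandas as pd

frames = []
for file in glob.glob(os.path.join(os.getcwd(), "*.zip")):
    with zipfile.ZipFile(file) as zf:
        with zf.open(zf.namelist()[0]) as inner:
            # Skip (and warn about) rows whose field count does not match
            # the first row; on pandas >= 1.3 use on_bad_lines='skip'.
            frames.append(pd.read_csv(inner, encoding='latin-1', header=None,
                                      error_bad_lines=False, warn_bad_lines=True))
df = pd.concat(frames, ignore_index=True)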

Related

Unable to read CSV file in Jupyter Notebook; the following errors come up

import pandas as pd
df = pd.read_csv('D:\Tableau\codebasics_files\Weather_data.csv.xlsx')
df
UnicodeDecodeError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_18872\1985582496.py in
1 import pandas as pd
----> 2 df = pd.read_csv('D:\Tableau\codebasics_files\Weather_data.csv.xlsx')
3 df
C:\ProgramData\Anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
676 kwds.update(kwds_defaults)
677
--> 678 return _read(filepath_or_buffer, kwds)
679
680
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py in _read(filepath_or_buffer, kwds)
573
574 # Create the parser.
--> 575 parser = TextFileReader(filepath_or_buffer, **kwds)
576
577 if chunksize or iterator:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py in __init__(self, f, engine, **kwds)
930
931 self.handles: IOHandles | None = None
--> 932 self._engine = self._make_engine(f, self.engine)
933
934 def close(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py in _make_engine(self, f, engine)
1232
1233 try:
-> 1234 return mapping[engine](f, **self.options)
1235 except Exception:
1236 if self.handles is not None:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py in __init__(self, src, **kwds)
73
74 kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
---> 75 self._reader = parsers.TextReader(src, **kwds)
76
77 self.unnamed_cols = self._reader.unnamed_cols
C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._get_header()
C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._tokenize_rows()
C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\parsers.pyx in pandas._libs.parsers.raise_parser_error()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xaa in position 14: invalid start byte
I tried some options from YouTube tutorials, but they are not working.
Looking at the very end of the file name, you're importing a .xlsx file, not a CSV file.
Try opening the file in Excel and exporting it as a CSV; make sure that .csv is the final extension of the file name.
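If you would rather do the conversion in code than in Excel, a small sketch along these lines should work, assuming a recent pandas with openpyxl installed (the output file name below is a placeholder):

import pandas as pd

# Read the spreadsheet with the Excel reader, then write a real CSV next to it;
# read_csv can then open the new file without decode errors.
xlsx_path = r'D:\Tableau\codebasics_files\Weather_data.csv.xlsx'
df = pd.read_excel(xlsx_path)            # openpyxl handles .xlsx files
df.to_csv('Weather_data.csv', index=False)

df = pd.read_csv('Weather_data.csv')
print(df.head())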
I think you need to use pandas' Excel reader to load the XLSX file.
For that, use this line of code in your script:
import pandas as pd
df = pd.read_excel(r'D:\Tableau\codebasics_files\Weather_data.csv.xlsx')  # raw string avoids backslash escapes in the Windows path

Why is the dataframe not displayed in the console using PySpark?

This is the session object I have created for the dataframe:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
This is the code for the DataFrame creation:
data = [("James","","Smith",30,"M",60000),
("Michael","Rose","",50,"M",70000),
("Robert","","Williams",42,"",400000),
("Maria","Anne","Jones",38,"F",500000),
("Jen","Mary","Brown",45,"F",0)]
columns = ["first_name","middle_name","last_name","Age","gender","salary"]
pysparkDF = spark.createDataFrame(data = data, schema = columns)
pysparkDF.printSchema()
pysparkDF.show(truncate=False)
I want the dataframe with its six columns to be displayed in the console, but instead I get a Py4JJavaError and a message that the output exceeds the size limit:
Output exceeds the size limit. Open the full output data in a text editor
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
Cell In[28], line 10
8 pysparkDF = spark.createDataFrame(data = data, schema = columns)
9 pysparkDF.printSchema()
---> 10 pysparkDF.show(truncate=False)
File c:\Users\baps\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pyspark\sql\dataframe.py:615, in DataFrame.show(self, n, truncate, vertical)
610 except ValueError:
611 raise TypeError(
612 "Parameter 'truncate={}' should be either bool or int.".format(truncate)
613 )
--> 615 print(self._jdf.showString(n, int_truncate, vertical))
File c:\Users\baps\AppData\Local\Programs\Python\Python38-32\lib\site-packages\py4j\java_gateway.py:1321, in JavaMember.__call__(self, *args)
1315 command = proto.CALL_COMMAND_NAME +\
1316 self.command_header +\
1317 args_command +\
1318 proto.END_COMMAND_PART
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
1325 temp_arg._detach()
...
at java.lang.ProcessImpl.<init>(ProcessImpl.java:453)
at java.lang.ProcessImpl.start(ProcessImpl.java:140)
at java.lang.ProcessBuilder.start(ProcessBuilder.java:1029)
... 30 more

Pandas can't read in excel file

Something is wrong with my pandas module. I tried to read in an Excel file using the following code, which works on my classmate's computer, but it gives me an error on my computer:
FFT1=pd.read_excel('FFT1.xlsx', sheet_name='sheet1')
The file named 'FFT1.xlsx' is in the same directory as my jupyter notebook. The error message says:
XLRDError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_7436/2793485739.py in <module>
----> 1 FFT1=pd.read_excel('FFT1.xlsx', sheet_name='sheet1')
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, **kwds)
302
303 if not isinstance(io, ExcelFile):
--> 304 io = ExcelFile(io, engine=engine)
305 elif engine and engine != io.engine:
306 raise ValueError(
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_base.py in __init__(self, io, engine)
819 self._io = stringify_path(io)
820
--> 821 self._reader = self._engines[engine](self._io)
822
823 def __fspath__(self):
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_xlrd.py in __init__(self, filepath_or_buffer)
19 err_msg = "Install xlrd >= 1.0.0 for Excel support"
20 import_optional_dependency("xlrd", extra=err_msg)
---> 21 super().__init__(filepath_or_buffer)
22
23 @property
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_base.py in __init__(self, filepath_or_buffer)
351 self.book = self.load_workbook(filepath_or_buffer)
352 elif isinstance(filepath_or_buffer, str):
--> 353 self.book = self.load_workbook(filepath_or_buffer)
354 elif isinstance(filepath_or_buffer, bytes):
355 self.book = self.load_workbook(BytesIO(filepath_or_buffer))
D:\Softwares\Anaconda\lib\site-packages\pandas\io\excel\_xlrd.py in load_workbook(self, filepath_or_buffer)
34 return open_workbook(file_contents=data)
35 else:
---> 36 return open_workbook(filepath_or_buffer)
37
38 @property
D:\Softwares\Anaconda\lib\site-packages\xlrd\__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows, ignore_workbook_corruption)
168 # files that xlrd can parse don't start with the expected signature.
169 if file_format and file_format != 'xls':
--> 170 raise XLRDError(FILE_FORMAT_DESCRIPTIONS[file_format]+'; not supported')
171
172 bk = open_workbook_xls(
XLRDError: Excel xlsx file; not supported
How should I fix this?
Make sure that you have already installed openpyxl; if you don't have it, try
pip install openpyxl
Change your code to
FFT1 = pd.read_excel('FFT1.xlsx', sheet_name='sheet1', engine='openpyxl')
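A slightly fuller sketch of the same idea, in case the missing package is the problem (this assumes openpyxl is the only missing piece and keeps the sheet name from the question):

import pandas as pd

try:
    import openpyxl  # noqa: F401 -- pandas needs it to parse .xlsx files
except ImportError:
    raise SystemExit("Run 'pip install openpyxl' first")

# engine='openpyxl' makes pandas use openpyxl instead of xlrd, which
# dropped .xlsx support in xlrd 2.0 (hence the XLRDError above).
FFT1 = pd.read_excel('FFT1.xlsx', sheet_name='sheet1', engine='openpyxl')
print(FFT1.head())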

How to change the value in a Koalas dataframe based on a condition

I am using Koalas and I want to change the value of a column based on a condition.
In pandas I can do that using:
import pandas as pd

df_test = pd.DataFrame({
    'a': [1, 2, 3],
    'b': ['one', 'two', 'three']})
df_test2 = pd.DataFrame({
    'c': [2, 1, 3],
    'd': ['one', 'two', 'three']})

df_test.loc[df_test.a.isin(df_test2['c']), 'b'] = 'four'
df_test.head()
a b
0 1 four
1 2 four
2 3 four
I am trying to use the same in Koalas, but I have this error:
---------------------------------------------------------------------------
PandasNotImplementedError Traceback (most recent call last)
<ipython-input-15-814219258adb> in <module>
5 new_loans['write_offs'] = 0
6
----> 7 new_loans.loc[(new_loans['ID'].isin(userinput_write_offs['id'])),'write_offs'] = 1
8 new_loans.loc[new_loans['write_offs']==1,'is_active'] = 0
9 new_loans = new_loans.sort_values(by = ['ZOHOID','Disb Date'])
/usr/local/lib/python3.7/dist-packages/databricks/koalas/base.py in isin(self, values)
894 )
895
--> 896 return self._with_new_scol(self.spark.column.isin(list(values)))
897
898 def isnull(self) -> Union["Series", "Index"]:
/usr/local/lib/python3.7/dist-packages/databricks/koalas/series.py in __iter__(self)
5871
5872 def __iter__(self):
-> 5873 return MissingPandasLikeSeries.__iter__(self)
5874
5875 if sys.version_info >= (3, 7):
/usr/local/lib/python3.7/dist-packages/databricks/koalas/missing/__init__.py in unsupported_function(*args, **kwargs)
21 def unsupported_function(*args, **kwargs):
22 raise PandasNotImplementedError(
---> 23 class_name=class_name, method_name=method_name, reason=reason
24 )
25
PandasNotImplementedError: The method `pd.Series.__iter__()` is not implemented. If you want to collect your data as an NumPy array, use 'to_numpy()' instead.
How could I do the same operation in Koalas?
UPDATE
Following this question: Assign Koalas Column from Numpy Result I have done:
df_test.loc[df_test.a.isin(df_test2['c'].to_list()),'b'] = 'four'
But now I have this error:
---------------------------------------------------------------------------
PythonException Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/IPython/core/formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
/usr/local/lib/python3.7/dist-packages/IPython/lib/pretty.py in pretty(self, obj)
392 if cls is not object \
393 and callable(cls.__dict__.get('__repr__')):
--> 394 return _repr_pprint(obj, self, cycle)
395
396 return _default_pprint(obj, self, cycle)
/usr/local/lib/python3.7/dist-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
698 """A pprint that just redirects to the normal repr function."""
699 # Find newlines and replace them with p.break_()
--> 700 output = repr(obj)
701 lines = output.splitlines()
702 with p.group():
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in __repr__(self)
10614 return self._to_internal_pandas().to_string()
10615
> 10616 pdf = self._get_or_create_repr_pandas_cache(max_display_count)
10617 pdf_length = len(pdf)
10618 pdf = pdf.iloc[:max_display_count]
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in _get_or_create_repr_pandas_cache(self, n)
10606 def _get_or_create_repr_pandas_cache(self, n):
10607 if not hasattr(self, "_repr_pandas_cache") or n not in self._repr_pandas_cache:
> 10608 self._repr_pandas_cache = {n: self.head(n + 1)._to_internal_pandas()}
10609 return self._repr_pandas_cache[n]
10610
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in _to_internal_pandas(self)
10602 This method is for internal use only.
10603 """
> 10604 return self._internal.to_pandas_frame
10605
10606 def _get_or_create_repr_pandas_cache(self, n):
/usr/local/lib/python3.7/dist-packages/databricks/koalas/utils.py in wrapped_lazy_property(self)
514 def wrapped_lazy_property(self):
515 if not hasattr(self, attr_name):
--> 516 setattr(self, attr_name, fn(self))
517 return getattr(self, attr_name)
518
/usr/local/lib/python3.7/dist-packages/databricks/koalas/internal.py in to_pandas_frame(self)
807 """ Return as pandas DataFrame. """
808 sdf = self.to_internal_spark_frame
--> 809 pdf = sdf.toPandas()
810 if len(pdf) == 0 and len(sdf.schema) > 0:
811 pdf = pdf.astype(
/usr/local/spark/python/pyspark/sql/pandas/conversion.py in toPandas(self)
136
137 # Below is toPandas without Arrow optimization.
--> 138 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
139 column_counter = Counter(self.columns)
140
/usr/local/spark/python/pyspark/sql/dataframe.py in collect(self)
594 """
595 with SCCallSiteSync(self._sc) as css:
--> 596 sock_info = self._jdf.collectToPython()
597 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
598
/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
132 # Hide where the exception came from that shows a non-Pythonic
133 # JVM exception message.
--> 134 raise_from(converted)
135 else:
136 raise
/usr/local/spark/python/pyspark/sql/utils.py in raise_from(e)
PythonException:
An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 589, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 447, in read_udfs
udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 254, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 74, in read_command
command = serializer._read_with_length(file)
File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 172, in _read_with_length
return self.loads(obj)
File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 458, in loads
return pickle.loads(obj, encoding=encoding)
File "/opt/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 1110, in subimport
__import__(name)
ModuleNotFoundError: No module named 'pandas'
Why is it trying to use pandas?
The Koalas package exposes pandas-like APIs at a high level for the user, but under the hood the implementation is done with PySpark APIs.
Within the stack trace you pasted, a pandas DataFrame is being created from the Spark DataFrame sdf via the toPandas() method and assigned to pdf.
In the implementation of the toPandas() function, pandas and numpy are imported; check lines 809 and 138 in the excerpt below.
/usr/local/lib/python3.7/dist-packages/databricks/koalas/internal.py in to_pandas_frame(self)
807 """ Return as pandas DataFrame. """
808 sdf = self.to_internal_spark_frame
--> 809 pdf = sdf.toPandas()
810 if len(pdf) == 0 and len(sdf.schema) > 0:
811 pdf = pdf.astype(
/usr/local/spark/python/pyspark/sql/pandas/conversion.py in toPandas(self)
136
137 # Below is toPandas without Arrow optimization.
--> 138 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
139 column_counter = Counter(self.columns)
140
/usr/local/spark/python/pyspark/sql/dataframe.py in collect(self)
594 """
595 with SCCallSiteSync(self._sc) as css:
--> 596 sock_info = self._jdf.collectToPython()
597 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
598
You can check out the implementation of the toPandas() function at the following link:
https://github.com/apache/spark/blob/master/python/pyspark/sql/pandas/conversion.py
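For completeness, a minimal sketch of the flow from the question using the databricks-koalas package; it assumes pandas and numpy are installed on every Spark worker, which is exactly what the ModuleNotFoundError above complains about, and it mirrors the .to_list() workaround from the UPDATE:

import databricks.koalas as ks

kdf_test = ks.DataFrame({'a': [1, 2, 3], 'b': ['one', 'two', 'three']})
kdf_test2 = ks.DataFrame({'c': [2, 1, 3], 'd': ['one', 'two', 'three']})

# .to_list() collects the values to the driver, so isin() receives a plain
# Python list instead of a Koalas Series (which is not iterable).
kdf_test.loc[kdf_test.a.isin(kdf_test2['c'].to_list()), 'b'] = 'four'

# Displaying the frame triggers toPandas() under the hood (see the trace
# above), so pandas must be importable on the driver and on the workers.
print(kdf_test.head())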

\xc9 accents when exporting a data frame to csv

Pandas' to_csv() method produces an error when strings in my data frame contain \xc9 accents. Any idea how I can quickly solve this?
Thanks
---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
/Users/slegroux1/Projects/FeaturEmo/en_features.py in <module>()
    360             print "----- No plot. Corresponding features are missing."
    361 if __name__ == '__main__':
--> 362     main()
/Users/slegroux1/Projects/FeaturEmo/en_features.py in main()
     40     if output:
     41         embed()
---> 42         dict2frame(features).to_csv(output)
     43     if plot and not(directory or my_csv_list):
     44         plot_features(features)
/Library/Frameworks/EPD64.framework/Versions/7.3/lib/python2.7/site-packages/pandas/core/frame.pyc in to_csv(self, path_or_buf, sep, na_rep, float_format, cols, header, index, index_label, mode, nanRep, encoding, quoting, line_terminator, chunksize, tupleize_cols, **kwds)
   1408                              chunksize=chunksize, engine=kwds.get("engine"),
   1409                              tupleize_cols=tupleize_cols)
-> 1410         formatter.save()
   1411
   1412     def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='',
/Library/Frameworks/EPD64.framework/Versions/7.3/lib/python2.7/site-packages/pandas/core/format.pyc in save(self)
    970
    971         else:
--> 972             self._save()
    973
    974
/Library/Frameworks/EPD64.framework/Versions/7.3/lib/python2.7/site-packages/pandas/core/format.pyc in _save(self)
   1076                 break
   1077
-> 1078             self._save_chunk(start_i, end_i)
   1079
   1080     def _save_chunk(self, start_i, end_i):
/Library/Frameworks/EPD64.framework/Versions/7.3/lib/python2.7/site-packages/pandas/core/format.pyc in _save_chunk(self, start_i, end_i)
   1094         ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format)
   1095
-> 1096         lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
   1097
   1098 # from collections import namedtuple
/Library/Frameworks/EPD64.framework/Versions/7.3/lib/python2.7/site-packages/pandas/lib.so in pandas.lib.write_csv_rows (pandas/lib.c:13871)()
UnicodeEncodeError: 'ascii' codec can't encode character u'\xc9' in position 0: ordinal not in range(128)
You should specify an encoding using the encoding argument to to_csv. For example, df.to_csv(filename, encoding='utf-8').
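For example, a minimal sketch of the suggested fix (Python 2-era pandas, to match the traceback; the column values and file name are placeholders):

import pandas as pd

df = pd.DataFrame({'name': [u'\xc9mile', u'Ren\xe9']})  # contains "É" and "é"

# Without an explicit encoding, writing falls back to ASCII under Python 2
# and raises UnicodeEncodeError on u'\xc9'.
df.to_csv('names.csv', encoding='utf-8', index=False)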