Unicode DecodeError while concatenating - pandas

I am trying to train my model and i have csv file and one gz file, which was generated earlier. I am getting this error as mentioned below and not sure what is wrong.
Traceback (most recent call last):
File "Model.py", line 87, in <module>
data = pd.concat([pd.read_csv(log)])
File "/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py", line 678, in parser_f
return _read(filepath_or_buffer, kwds)
File "/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py", line 440, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py", line 787, in __init__
self._make_engine(self.engine)
File "/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py", line 1014, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py", line 1708, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas/_libs/parsers.pyx", line 539, in pandas._libs.parsers.TextReader.__cinit__
File "pandas/_libs/parsers.pyx", line 767, in pandas._libs.parsers.TextReader._get_header
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
Mycode:
for foo in range(0,1):
# Read dataframe
#data = pd.concat([pd.read_csv(log.replace('0',str(idx),1)) for idx in range(5)])
log = path + 'train_features/log_.csv'
test_log = path + 'test_features/log_features.gz'
data = pd.concat([pd.read_csv(log)])

Try:
data = pd.read_csv(log, encoding = "utf-8")
Although I don't understand why you need the for loop or pd.concat
If you don't know your type of encoding, try: this:
import chardet
with open(log, 'rb') as f:
result = chardet.detect(f.read()) # or readline if the file is large
data = pd.read_csv(log, encoding=result['encoding'])
source

Related

MissingDataError: exog contains inf or nans

Input:
import statsmodels.api as sm
import pandas as pd
# reading data from the csv
data = pd.read_csv('/Users/justkiddings/Desktop/Python/TM/TM.csv')
# defining the variables
x = data['FSP'].tolist()
y = data['RSP'].tolist()
# adding the constant term
x = sm.add_constant(x)
# performing the regression
# and fitting the model
result = sm.OLS(y, x).fit()
# printing the summary table
print(result.summary())
Output:
runfile('/Users/justkiddings/Desktop/Python/Code/untitled28.py', wdir='/Users/justkiddings/Desktop/Python/Code')
Traceback (most recent call last):
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/spyder_kernels/py3compat.py", line 356, in compat_exec
exec(code, globals, locals)
File "/Users/justkiddings/Desktop/Python/Code/untitled28.py", line 24, in <module>
result = sm.OLS(y, x).fit()
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/regression/linear_model.py", line 890, in __init__
super(OLS, self).__init__(endog, exog, missing=missing,
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/regression/linear_model.py", line 717, in __init__
super(WLS, self).__init__(endog, exog, missing=missing,
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/regression/linear_model.py", line 191, in __init__
super(RegressionModel, self).__init__(endog, exog, **kwargs)
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py", line 267, in __init__
super().__init__(endog, exog, **kwargs)
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py", line 92, in __init__
self.data = self._handle_data(endog, exog, missing, hasconst,
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py", line 132, in _handle_data
data = handle_data(endog, exog, missing, hasconst, **kwargs)
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/data.py", line 673, in handle_data
return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/data.py", line 86, in __init__
self._handle_constant(hasconst)
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/data.py", line 132, in _handle_constant
raise MissingDataError('exog contains inf or nans')
MissingDataError: exog contains inf or nans
Some of the Data:
DATE,HOUR,STATION,CO,FSP,NO2,NOX,O3,RSP,SO2
1/1/2022,1,TUEN MUN,75,38,39,40,83,59,2
1/1/2022,2,TUEN MUN,72,35,29,30,90,61,2
1/1/2022,3,TUEN MUN,74,38,28,30,91,66,2
1/1/2022,4,TUEN MUN,76,39,31,32,79,61,2
1/1/2022,5,TUEN MUN,72,38,25,26,83,65,2
1/1/2022,6,TUEN MUN,74,37,24,25,86,60,2
I have removed the N.A. in my dataset and they have converted into blanks. (Eg. 3/1/2022,12,TUEN MUN,85,,53,70,59,,5) Why there is MissingDataError? How to fix it? Thanks.

I'm trying to import CSV file using pandas, But I'm getting Error

C:\Users\Misti\PycharmProjects\pythonProject\venv\Scripts\python.exe C:/Users/Misti/PycharmProjects/pythonProject/main.py
Traceback (most recent call last):
File "C:/Users/Misti/PycharmProjects/pythonProject/main.py", line 2, in <module>
df = pd.read_csv('milestone2.csv')
File "C:\Users\Misti\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\util\_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "C:\Users\Misti\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\io\parsers\readers.py", line 586, in read_csv
return _read(filepath_or_buffer, kwds)
File "C:\Users\Misti\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\io\parsers\readers.py", line 482, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "C:\Users\Misti\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\io\parsers\readers.py", line 811, in __init__
self._engine = self._make_engine(self.engine)
File "C:\Users\Misti\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\io\parsers\readers.py", line 1040, in _make_engine
return mapping[engine](self.f, **self.options) # type: ignore[call-arg]
File "C:\Users\Misti\PycharmProjects\pythonProject\venv\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py", line 69, in __init__
self._reader = parsers.TextReader(self.handles.handle, **kwds)
File "pandas\_libs\parsers.pyx", line 549, in pandas._libs.parsers.TextReader.__cinit__
pandas.errors.EmptyDataError: No columns to parse from file
Process finished with exit code 1

compute() in dask not working

I am trying a simple parallel computation in Dask.
This is my code.
import time
import dask as dask
import dask.distributed as distributed
import dask.dataframe as dd
import dask.delayed as delayed
from dask.distributed import Client,progress
client = Client('localhost:8786')
df = dd.read_csv('file.csv')
ddf = df.groupby(['col1'])[['col2']].sum()
ddf = ddf.compute()
print ddf
It seems fine from the documentation but on running I am getting this :
Traceback (most recent call last):
File "dask_prg1.py", line 17, in <module>
ddf = ddf.compute()
File "/usr/local/lib/python2.7/site-packages/dask/base.py", line 156, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/usr/local/lib/python2.7/site-packages/dask/base.py", line 402, in compute
results = schedule(dsk, keys, **kwargs)
File "/usr/local/lib/python2.7/site-packages/distributed/client.py", line 2159, in get
direct=direct)
File "/usr/local/lib/python2.7/site-packages/distributed/client.py", line 1562, in gather
asynchronous=asynchronous)
File "/usr/local/lib/python2.7/site-packages/distributed/client.py", line 652, in sync
return sync(self.loop, func, *args, **kwargs)
File "/usr/local/lib/python2.7/site-packages/distributed/utils.py", line 275, in sync
six.reraise(*error[0])
File "/usr/local/lib/python2.7/site-packages/distributed/utils.py", line 260, in f
result[0] = yield make_coro()
File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1099, in run
value = future.result()
File "/usr/local/lib/python2.7/site-packages/tornado/concurrent.py", line 260, in result
raise_exc_info(self._exc_info)
File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1107, in run
yielded = self.gen.throw(*exc_info)
File "/usr/local/lib/python2.7/site-packages/distributed/client.py", line 1439, in _gather
traceback)
File "/usr/local/lib/python2.7/site-packages/dask/bytes/core.py", line 122, in read_block_from_file
with lazy_file as f:
File "/usr/local/lib/python2.7/site-packages/dask/bytes/core.py", line 166, in __enter__
f = SeekableFile(self.fs.open(self.path, mode=mode))
File "/usr/local/lib/python2.7/site-packages/dask/bytes/local.py", line 58, in open
return open(self._normalize_path(path), mode=mode)
IOError: [Errno 2] No such file or directory: 'file.csv'
I am not understanding what is wrong.Kindly help me with this .Thank you in advance .
You may wish to pass the absolute file path to read_csv. The reason is, that you are giving the work of opening and reading the file to a dask worker, and you might not have started that worked with the same working directory as your script/session.

AttributeError: 'S3File' object has not attribute 'getvalue', while running to_csv

I'm running to_csv command as follows to an ouput file on a s3 bucket with ServerSideEncryption enabled:
to_csv("s3://mys3bucket/result.csv",
storage_option={'s3_additional_kwargs':
{'ServerSideEncryption': 'AES256'}})
I'm getting the following attribute error:
File "/usr/lib/python2.7/site-packages/dask/dataframe/core.py", line 1091, in to_csv
return to_csv(self, filename, **kwargs)
File "/usr/lib/python2.7/site-packages/dask/dataframe/io/csv.py", line 577, in to_csv
delayed(values).compute(get=get, scheduler=scheduler)
File "/usr/lib/python2.7/site-packages/dask/base.py", line 156, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/usr/lib/python2.7/site-packages/dask/base.py", line 400, in compute
results = schedule(dsk, keys, **kwargs)
File "/usr/lib/python2.7/site-packages/distributed/client.py", line 2159, in get
direct=direct)
File "/usr/lib/python2.7/site-packages/distributed/client.py", line 1562, in gather
asynchronous=asynchronous)
File "/usr/lib/python2.7/site-packages/distributed/client.py", line 652, in sync
return sync(self.loop, func, *args, **kwargs)
File "/usr/lib/python2.7/site-packages/distributed/utils.py", line 275, in sync
six.reraise(*error[0])
File "/usr/lib/python2.7/site-packages/distributed/utils.py", line 260, in f
result[0] = yield make_coro()
File "/usr/lib64/python2.7/site-packages/tornado/gen.py", line 1099, in run
value = future.result()
File "/usr/lib64/python2.7/site-packages/tornado/concurrent.py", line 260, in result
raise_exc_info(self._exc_info)
File "/usr/lib64/python2.7/site-packages/tornado/gen.py", line 1107, in run
yielded = self.gen.throw(*exc_info)
File "/usr/lib/python2.7/site-packages/distributed/client.py", line 1439, in _gather
traceback)
File "/usr/lib/python2.7/site-packages/dask/dataframe/io/csv.py", line 439, in _to_csv_chunk
df.to_csv(f, **kwargs)
File "/usr/lib64/python2.7/site-packages/pandas/core/frame.py", line 1745, in to_csv
formatter.save()
File "/usr/lib64/python2.7/site-packages/pandas/io/formats/csvs.py", line 161, in save
buf = f.getvalue()
File "/usr/lib/python2.7/site-packages/dask/bytes/utils.py", line 136, in __getattr__
return getattr(self.file, key)
AttributeError: 'S3File' object has no attribute 'getvalue'
I searched for this error, but couldn't find a relevant solution.
Do you have any idea?

pandas.read_csv gives FileNotFound error inside a loop

pandas.read_csv is working properly when used as a single statement. But it is giving FileNotFoundError when it is being used inside a loop even though the file exists.
for filename in os.listdir("./Datasets/pollution"):
print(filename) # To check which file is under processing
df = pd.read_csv(filename, sep=",").head(1)
These above lines are giving this following error.
pollutionData184866.csv <----- The name of the file is printed properly.
Traceback (most recent call last):
File "/home/parnab/PycharmProjects/FinalYearProject/locationExtractor.py", line 13, in <module>
df = pd.read_csv(i, sep=",").head(1)
File "/usr/lib/python3.6/site-packages/pandas/io/parsers.py", line 646, in parser_f
return _read(filepath_or_buffer, kwds)
File "/usr/lib/python3.6/site-packages/pandas/io/parsers.py", line 389, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/usr/lib/python3.6/site-packages/pandas/io/parsers.py", line 730, in __init__
self._make_engine(self.engine)
File "/usr/lib/python3.6/site-packages/pandas/io/parsers.py", line 923, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/usr/lib/python3.6/site-packages/pandas/io/parsers.py", line 1390, in __init__
self._reader = _parser.TextReader(src, **kwds)
File "pandas/parser.pyx", line 373, in pandas.parser.TextReader.__cinit__ (pandas/parser.c:4184)
File "pandas/parser.pyx", line 667, in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:8449)
FileNotFoundError: File b'pollutionData184866.csv' does not exist
But when I am doing
filename = 'pollutionData184866.csv'
df = pd.read_csv(filename, sep=',')
It is working fine.
What am I doing wrong?
os.listdir("./Datasets/pollution") returns a list of files without a path and according to the path "./Datasets/pollution" you are parsing CSV files NOT from the current directory ".", so changing it to glob.glob('./Datasets/pollution/*.csv') should work, because glob.glob() returns a list of satisfying files/directories including given path
Demo:
In [19]: os.listdir('d:/temp/.data/629509')
Out[19]:
['AAON_data.csv',
'AAON_data.png',
'AAPL_data.csv',
'AAPL_data.png',
'AAP_data.csv',
'AAP_data.png']
In [20]: glob.glob('d:/temp/.data/629509/*.csv')
Out[20]:
['d:/temp/.data/629509\\AAON_data.csv',
'd:/temp/.data/629509\\AAPL_data.csv',
'd:/temp/.data/629509\\AAP_data.csv']