How can I import a CSV file in Python? - pandas

I'm trying to read from a CSV file using the pandas library in Python (using Spyder, Python 3.7), but I am getting an error:
Traceback (most recent call last):
File "<ipython-input-104-744f03c12bee>", line 1, in <module>
datasets = pd.read_csv('Data.csv')
File "\Continuum\anaconda3\lib\site-packages\pandas\io\parsers.py", line 702, in parser_f
return _read(filepath_or_buffer, kwds)
File "\Continuum\anaconda3\lib\site-packages\pandas\io\parsers.py", line 435, in _read
data = parser.read(nrows)
File "\Continuum\anaconda3\lib\site-packages\pandas\io\parsers.py", line 1154, in read
df = DataFrame(col_dict, columns=columns, index=index)
File "\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py", line 392, in __init__
mgr = init_dict(data, index, columns, dtype=dtype)
File "\Continuum\anaconda3\lib\site-packages\pandas\core\internals\construction.py", line 212, in init_dict
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
File "\Continuum\anaconda3\lib\site-packages\pandas\core\internals\construction.py", line 56, in arrays_to_mgr
arrays = _homogenize(arrays, index, dtype)
File "\Continuum\anaconda3\lib\site-packages\pandas\core\internals\construction.py", line 277, in _homogenize
raise_cast_failure=False)
File "\Continuum\anaconda3\lib\site-packages\pandas\core\internals\construction.py", line 582, in sanitize_array
subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)
File "\Continuum\anaconda3\lib\site-packages\pandas\core\internals\construction.py", line 720, in _try_cast
subarr = np.array(arr, dtype=object, copy=copy)
TypeError: 'numpy.ndarray' object is not callable
I have imported the numpy library as np and the pandas library as pd; the working directory has been set to the folder where the file is saved.
import pandas as pd
datasets = pd.read_csv('Data.csv')
Following is the data that I am currently working on:
Data

Actually, your file (Data.csv) is not a CSV file; it is a PNG image. So you can use the PIL module in Python:
from PIL import Image
imframe = Image.open('Data.csv')
imframe
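If you want to confirm what the file actually contains before loading it, here is a minimal sketch (assuming the file sits in the working directory and Pillow is installed): check the PNG signature bytes, then convert the image to a numpy array of pixel values.
from PIL import Image
import numpy as np

# PNG files begin with the signature b'\x89PNG\r\n\x1a\n'
with open('Data.csv', 'rb') as f:
    print(f.read(8))

# Open the mis-named image and turn it into an array of pixel values
imframe = Image.open('Data.csv')
pixels = np.array(imframe)
print(pixels.shape)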

Related

Error I don't understand with Geoplotlib when trying to display a trajectory on a map

I'm trying to display a trajectory on a map using geoplotlib but I have the following error that I don't understand:
Traceback (most recent call last):
File "C:\Users\jlouvet\anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 3361, in get_loc
return self._engine.get_loc(casted_key)
File "pandas_libs\index.pyx", line 76, in pandas._libs.index.IndexEngine.get_loc
File "pandas_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas_libs\hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas_libs\hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'lon'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\jlouvet\anaconda3\lib\site-packages\geoplotlib_init_.py", line 32, in _runapp
app.start()
File "C:\Users\jlouvet\anaconda3\lib\site-packages\geoplotlib\core.py", line 364, in start
self.proj.fit(BoundingBox.from_bboxes([l.bbox() for l in self.geoplotlib_config.layers]),
File "C:\Users\jlouvet\anaconda3\lib\site-packages\geoplotlib\core.py", line 364, in
self.proj.fit(BoundingBox.from_bboxes([l.bbox() for l in self.geoplotlib_config.layers]),
File "C:\Users\jlouvet\anaconda3\lib\site-packages\geoplotlib\layers.py", line 159, in bbox
return BoundingBox.from_points(lons=self.data['lon'], lats=self.data['lat'])
File "C:\Users\jlouvet\anaconda3\lib\site-packages\pandas\core\frame.py", line 3458, in getitem
indexer = self.columns.get_loc(key)
File "C:\Users\jlouvet\anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 3363, in get_loc
raise KeyError(key) from err
KeyError: 'lon'
My code is as follows:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geoplotlib
import pyglet
os.chdir(r'C:\Users\jlouvet\Desktop\Données_bateau\CASINO_CSV')
data = pd.read_csv("20210817.csv", sep=';')
trajectory = data[['Latitude', 'Longitude']]
geoplotlib.dot(trajectory)
geoplotlib.show()
Thank you in advance for your help.
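The traceback shows geoplotlib's bbox() looking up self.data['lon'] and self.data['lat'], while the DataFrame passed to geoplotlib.dot() only has 'Latitude' and 'Longitude' columns. A minimal sketch of a possible fix (an assumption based on the traceback, not a confirmed answer) is to rename the columns before plotting:
# rename to the column names geoplotlib's bbox() looks up
trajectory = data[['Latitude', 'Longitude']].rename(
    columns={'Latitude': 'lat', 'Longitude': 'lon'})
geoplotlib.dot(trajectory)
geoplotlib.show()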

Trouble using pandas df.rolling() with my own functions

I have a pandas dataframe raw_data with two columns: 'T' and 'BP':
T BP
0 -0.500 115.790
1 -0.499 115.441
2 -0.498 115.441
3 -0.497 115.441
4 -0.496 115.790
... ... ...
647163 646.663 105.675
647164 646.664 105.327
647165 646.665 105.327
647166 646.666 105.327
647167 646.667 104.978
[647168 rows x 2 columns]
I want to apply the Hodges-Lehmann mean (it's a robust average) over a rolling window and create a new column. Here's the function:
def hodgesLehmannMean(x):
    m = np.add.outer(x, x)
    ind = np.tril_indices(len(x), 0)
    return 0.5 * np.median(m[ind])
I therefore write:
raw_data[new_col] = raw_data['BP'].rolling(21, min_periods=1, center=True,
win_type=None, axis=0, closed=None).agg(hodgesLehmannMean)
but I get a string of error messages:
Traceback (most recent call last):
File "C:\Users\tkpme\miniconda3\lib\runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:\Users\tkpme\miniconda3\lib\runpy.py", line 87, in _run_code
exec(code, run_globals)
File "c:\Users\tkpme\.vscode\extensions\ms-python.python-2020.8.101144\pythonFiles\lib\python\debugpy\__main__.py", line 45, in <module>
cli.main()
File "c:\Users\tkpme\.vscode\extensions\ms-python.python-2020.8.101144\pythonFiles\lib\python\debugpy/..\debugpy\server\cli.py", line 430, in main
run()
File "c:\Users\tkpme\.vscode\extensions\ms-python.python-2020.8.101144\pythonFiles\lib\python\debugpy/..\debugpy\server\cli.py", line 267, in run_file
runpy.run_path(options.target, run_name=compat.force_str("__main__"))
File "C:\Users\tkpme\miniconda3\lib\runpy.py", line 265, in run_path
return _run_module_code(code, init_globals, run_name,
File "C:\Users\tkpme\miniconda3\lib\runpy.py", line 97, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "C:\Users\tkpme\miniconda3\lib\runpy.py", line 87, in _run_code
exec(code, run_globals)
File "c:\Users\tkpme\OneDrive\Documents\Work\CMC\BP Satya and Suresh\Code\Naveen_peak_detect test.py", line 227, in <module>
main()
File "c:\Users\tkpme\OneDrive\Documents\Work\CMC\BP Satya and Suresh\Code\Naveen_peak_detect test.py", line 75, in main
raw_data[new_col] = raw_data['BP'].rolling(FILTER_WINDOW, min_periods=1, center=True, win_type=None,
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\window\rolling.py", line 1961, in aggregate
return super().aggregate(func, *args, **kwargs)
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\window\rolling.py", line 523, in aggregate
return self.apply(func, raw=False, args=args, kwargs=kwargs)
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\window\rolling.py", line 1987, in apply
return super().apply(
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\window\rolling.py", line 1300, in apply
return self._apply(
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\window\rolling.py", line 507, in _apply
result = calc(values)
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\window\rolling.py", line 495, in calc
return func(x, start, end, min_periods)
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\window\rolling.py", line 1326, in apply_func
return window_func(values, begin, end, min_periods)
File "pandas\_libs\window\aggregations.pyx", line 1375, in pandas._libs.window.aggregations.roll_generic_fixed
File "c:\Users\tkpme\OneDrive\Documents\Work\CMC\BP Satya and Suresh\Code\Naveen_peak_detect test.py", line 222, in hodgesLehmannMean
m = np.add.outer(x, x)
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\series.py", line 705, in __array_ufunc__
return construct_return(result)
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\series.py", line 694, in construct_return
raise NotImplementedError
NotImplementedError
which appears to be driven by the line
m = np.add.outer(x, x)
and suggests that something is not implemented or that numpy is missing. But I import numpy right at the beginning as follows:
import numpy as np
import pandas as pd
The function works perfectly well on its own if I feed it a list or a numpy array, so I'm not sure what the problem is. Interestingly, if I use the median instead of the Hodges-Lehmann mean, it runs like a charm:
raw_data[new_col] = raw_data['BP'].rolling(21, min_periods=1, center=True,
win_type=None, axis=0, closed=None).median()
What is the cause of my problem, and how do I fix it?
Sincerely
Thomas Philips
I've tried your code with a small DataFrame and it worked well, so maybe there is something in your DataFrame that needs to be cleaned or transformed.
Solved it. It turns out that
m = np.add.outer(x, x)
requires x to be array-like. When I tested it using lists, numpy arrays, etc., it worked perfectly, just as it did for you. But the .rolling line passes each window to the function as a pandas Series rather than a plain array, and np.add.outer fails on it with a confusing error message. I modified the function to create a numpy array from the input, and it now works as it should.
def hodgesLehmannMean(x):
    x_array = np.array(x)
    m = np.add.outer(x_array, x_array)
    ind = np.tril_indices(len(x_array), 0)
    return 0.5 * np.median(m[ind])
Thanks for looking at it!
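As an alternative sketch (not part of the original answer), rolling(...).apply() accepts raw=True, which hands each window to the function as a plain numpy array instead of a Series, so the original function should work unchanged:
# raw=True passes each window as a numpy array, not a Series
raw_data[new_col] = raw_data['BP'].rolling(21, min_periods=1, center=True).apply(
    hodgesLehmannMean, raw=True)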

Anaconda Pandas breaks on reading hdf file on Python 3.6.x

I am using an Anaconda environment with Python 3.6.8, created with conda create -n temp pandas pytables h5py python=3.6.8. When I try to read a .h5 file like:
f = pd.read_hdf(filename, key)
I get a ValueError exception:
Traceback (most recent call last):
File "read_data.py", line 6, in <module>
f = pd.read_hdf(filename, key)
File "/home/fauzanzaid/anaconda3/envs/temp/lib/python3.6/site-packages/pandas/io/pytables.py", line 394, in read_hdf
return store.select(key, auto_close=auto_close, **kwargs)
File "/home/fauzanzaid/anaconda3/envs/temp/lib/python3.6/site-packages/pandas/io/pytables.py", line 741, in select
return it.get_result()
File "/home/fauzanzaid/anaconda3/envs/temp/lib/python3.6/site-packages/pandas/io/pytables.py", line 1483, in get_result
results = self.func(self.start, self.stop, where)
File "/home/fauzanzaid/anaconda3/envs/temp/lib/python3.6/site-packages/pandas/io/pytables.py", line 734, in func
columns=columns)
File "/home/fauzanzaid/anaconda3/envs/temp/lib/python3.6/site-packages/pandas/io/pytables.py", line 2928, in read
ax = self.read_index('axis%d' % i, start=_start, stop=_stop)
File "/home/fauzanzaid/anaconda3/envs/temp/lib/python3.6/site-packages/pandas/io/pytables.py", line 2523, in read_index
_, index = self.read_index_node(getattr(self.group, key), **kwargs)
File "/home/fauzanzaid/anaconda3/envs/temp/lib/python3.6/site-packages/pandas/io/pytables.py", line 2621, in read_index_node
data = node[start:stop]
File "/home/fauzanzaid/anaconda3/envs/temp/lib/python3.6/site-packages/tables/vlarray.py", line 685, in __getitem__
return self.read(start, stop, step)
File "/home/fauzanzaid/anaconda3/envs/temp/lib/python3.6/site-packages/tables/vlarray.py", line 821, in read
listarr = self._read_array(start, stop, step)
File "tables/hdf5extension.pyx", line 2155, in tables.hdf5extension.VLArray._read_array
ValueError: cannot set WRITEABLE flag to True of this array
This problem goes away if I use an environment with Python 3.7 or 3.5. However, I need to use Python 3.6.
How can I resolve this error?
I downgraded numpy to 1.14.3 with the command below, and it worked for me:
pip3 install numpy==1.14.3
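If you would rather keep the whole environment managed by conda instead of mixing in pip, pinning numpy inside the environment should achieve the same thing (this is an assumption, not something tested in the original answer):
conda install -n temp numpy=1.14.3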

How to write numpy arrays directly to s3 in a deep learning application backed by spark

We are generating ~10k numpy arrays using Keras, and we then have to save those arrays as .npy files to S3. The problem is that, to save to S3 inside a Spark map function, we have to create an intermediate file. What we want instead is to stream the arrays directly to S3 without creating intermediate files. I tried the "cottoncandy" library, but it does not work inside the Spark map function and throws this error:
pickle.PicklingError: Could not serialize object: TypeError: can't pickle thread.lock objects
Is there any way or library that we can use inside a Spark map function in a deep learning application to stream the numpy arrays directly to S3?
I have my RDD of numpy arrays as:
features_rdd
Options I tried:
def writePartition(xs):
    cci = cc.get_interface('BUCKET_NAME', ACCESS_KEY=os.environ.get("AWS_ACCESS_KEY_ID"),
                           SECRET_KEY=os.environ.get("AWS_SECRET_ACCESS_KEY"), endpoint_url='https://s3.amazonaws.com')
    # output_path, format_name
    for k, v in xs:
        file_name_with_domain = get_file_with_parents(k, 1)
        file_name = ...
        file_name_without_ext = get_file_name_without_ext(file_name)
        bucket_name = OUTPUT.split('/', 1)[0]
        rest_of_path = OUTPUT.split('/', 1)[1]
        final_path = rest_of_path + '/' + file_name_without_ext + '.npy'
        LOGGER.info("Saving to S3....")
        response = cci.upload_npy_array(final_path, v)

features_rdd.foreachPartition(writePartition)
Option 2:
def writePartition1(xs):
    s3 = boto3.client('s3', region_name='us-east-1')
    for k, v in xs:
        ...
        ...
        np.save(local_dir_full_path, v)
        s3.upload_file(local_dir_full_path, 'BUCKET', s3_full_path)
        os.remove(local_dir_full_path)

features_rdd.foreachPartition(writePartition1)
Error:
File "/usr/lib64/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib64/python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "/usr/lib64/python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "/usr/lib64/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib64/python2.7/pickle.py", line 606, in save_list
self._batch_appends(iter(obj))
File "/usr/lib64/python2.7/pickle.py", line 642, in _batch_appends
save(tmp[0])
File "/usr/lib64/python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 600, in save_reduce
save(state)
File "/usr/lib64/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib64/python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "/usr/lib64/python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "/usr/lib64/python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 600, in save_reduce
save(state)
File "/usr/lib64/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib64/python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "/usr/lib64/python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "/usr/lib64/python2.7/pickle.py", line 306, in save
rv = reduce(self.proto)
TypeError: can't pickle thread.lock objects
Traceback (most recent call last):
File "six_file_boto3_write1.py", line 248, in <module>
run()
File "six_file_boto3_write1.py", line 239, in run
features_rdd.foreachPartition(writePartitionWithBoto)
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 799, in foreachPartition
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 1041, in count
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 1032, in sum
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 906, in fold
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 809, in collect
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 2455, in _jrdd
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 2388, in _wrap_function
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/rdd.py", line 2374, in _prepare_for_python_RDD
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/serializers.py", line 464, in dumps
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 704, in dumps
File "/mnt/yarn/usercache/hadoop/appcache/application_1541683970451_0003/container_1541683970451_0003_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 162, in dump
pickle.PicklingError: Could not serialize object: TypeError: can't pickle thread.lock objects
Imports:
from pyspark.sql import SparkSession
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.models import Model
from io import BytesIO
from keras.applications.vgg16 import preprocess_input
import numpy as np
import logging
import os
import boto3
import cottoncandy as cc
So basically, the application works perfectly fine up to features_rdd; I can even verify the count. But when I try to save these features, that part does not work. I have added the imports above.
Updates:
def extract_features(model, obj):
    try:
        print('executing vgg16 feature extractor...')
        img = image.load_img(BytesIO(obj), target_size=(224, 224, 3))
        img_data = image.img_to_array(img)
        img_data = np.expand_dims(img_data, axis=0)
        img_data = preprocess_input(img_data)
        vgg16_feature = model.predict(img_data)[0]
        print('++++++++++++++++++++++++++++', vgg16_feature.shape)
        return vgg16_feature
    except Exception as e:
        print('Error......{}'.format(e.args))
        return []

def extract_features_(xs):
    model_data = initVGG16()
    for k, v in xs:
        yield k, extract_features(model_data, v)

spark = SparkSession \
    .builder \
    .appName('test-app') \
    .getOrCreate()
sc = spark.sparkContext

s3_files_rdd = sc.binaryFiles(RESOLVED_IMAGE_PATH)
s3_files_rdd.persist()
features_rdd = s3_files_rdd.mapPartitions(extract_features_)
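One possible approach, sketched here as an assumption rather than a tested answer: np.save accepts any file-like object, so each partition can serialize its arrays into an in-memory BytesIO buffer and upload that buffer with boto3's upload_fileobj, avoiding the intermediate file entirely. Creating the boto3 client inside the partition function also keeps unpicklable client objects out of the closure that Spark has to serialize. The bucket name and key layout below are placeholders, and the imports are the ones already listed above:
def writePartitionInMemory(xs):
    # created inside the function so the client is never pickled by Spark
    s3 = boto3.client('s3', region_name='us-east-1')
    for k, v in xs:
        buf = BytesIO()
        np.save(buf, v)   # serialize the array into memory, no temp file on disk
        buf.seek(0)
        key = 'features/' + get_file_name_without_ext(k) + '.npy'  # placeholder key layout
        s3.upload_fileobj(buf, 'BUCKET_NAME', key)

features_rdd.foreachPartition(writePartitionInMemory)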

compute() in dask not working

I am trying a simple parallel computation in Dask.
This is my code.
import time
import dask as dask
import dask.distributed as distributed
import dask.dataframe as dd
import dask.delayed as delayed
from dask.distributed import Client,progress
client = Client('localhost:8786')
df = dd.read_csv('file.csv')
ddf = df.groupby(['col1'])[['col2']].sum()
ddf = ddf.compute()
print ddf
It seems fine from the documentation, but on running it I get this:
Traceback (most recent call last):
File "dask_prg1.py", line 17, in <module>
ddf = ddf.compute()
File "/usr/local/lib/python2.7/site-packages/dask/base.py", line 156, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/usr/local/lib/python2.7/site-packages/dask/base.py", line 402, in compute
results = schedule(dsk, keys, **kwargs)
File "/usr/local/lib/python2.7/site-packages/distributed/client.py", line 2159, in get
direct=direct)
File "/usr/local/lib/python2.7/site-packages/distributed/client.py", line 1562, in gather
asynchronous=asynchronous)
File "/usr/local/lib/python2.7/site-packages/distributed/client.py", line 652, in sync
return sync(self.loop, func, *args, **kwargs)
File "/usr/local/lib/python2.7/site-packages/distributed/utils.py", line 275, in sync
six.reraise(*error[0])
File "/usr/local/lib/python2.7/site-packages/distributed/utils.py", line 260, in f
result[0] = yield make_coro()
File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1099, in run
value = future.result()
File "/usr/local/lib/python2.7/site-packages/tornado/concurrent.py", line 260, in result
raise_exc_info(self._exc_info)
File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1107, in run
yielded = self.gen.throw(*exc_info)
File "/usr/local/lib/python2.7/site-packages/distributed/client.py", line 1439, in _gather
traceback)
File "/usr/local/lib/python2.7/site-packages/dask/bytes/core.py", line 122, in read_block_from_file
with lazy_file as f:
File "/usr/local/lib/python2.7/site-packages/dask/bytes/core.py", line 166, in __enter__
f = SeekableFile(self.fs.open(self.path, mode=mode))
File "/usr/local/lib/python2.7/site-packages/dask/bytes/local.py", line 58, in open
return open(self._normalize_path(path), mode=mode)
IOError: [Errno 2] No such file or directory: 'file.csv'
I do not understand what is wrong. Kindly help me with this. Thank you in advance.
You may wish to pass the absolute file path to read_csv. The reason is that you are giving the work of opening and reading the file to a Dask worker, and you might not have started that worker with the same working directory as your script/session.
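A minimal sketch of that suggestion (assuming the file lives next to the script on a filesystem the workers can also see):
import os
import dask.dataframe as dd

# resolve the path on the client before the workers try to open it
csv_path = os.path.abspath('file.csv')
df = dd.read_csv(csv_path)
ddf = df.groupby(['col1'])[['col2']].sum().compute()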