How do I read tabulator separated CSV in blaze? - pandas

I have a "CSV" data file with the following format (well, it's rather a TSV):
event pdg x y z t px py pz ekin
3383 11 -161.515 5.01938e-05 -0.000187112 0.195413 0.664065 0.126078 -0.736968 0.00723234
1694 11 -161.515 -0.000355633 0.000263174 0.195413 0.511853 -0.523429 0.681196 0.00472714
4228 11 -161.535 6.59631e-06 -3.32796e-05 0.194947 -0.713983 -0.0265468 -0.69966 0.0108681
4233 11 -161.515 -0.000524488 6.5069e-05 0.195413 0.942642 0.331324 0.0406377 0.017594
This file is interpretable as-is in pandas:
from pandas import read_csv, read_table
data = read_csv("test.csv", sep="\t", index_col=False) # Works
data = read_table("test.csv", index_col=False) # Works
However, when I try to read it in blaze (which claims to accept pandas keyword arguments), an exception is thrown:
from blaze import Data
Data("test.csv") # Attempt 1
Data("test.csv", sep="\t") # Attempt 2
Data("test.csv", sep="\t", index_col=False) # Attempt 3
None of these work, and pandas is not used at all. The "sniffer" that tries to deduce column names and types just calls csv.Sniffer.sniff() from the standard library (which fails).
Is there a way to properly read this file in blaze? (Given that its "little brother" is a few hundred MB, I want to use blaze's sequential processing capabilities.)
Thanks for any ideas.
Edit: I think it might be a problem of odo/csv and filed an issue: https://github.com/blaze/odo/issues/327
Edit2:
Complete error:
Error Traceback (most recent call last) in () ----> 1 bz.Data("test.csv", sep="\t", index_col=False)
/home/[username-hidden]/anaconda3/lib/python3.4/site-packages/blaze/interactive.py in Data(data, dshape, name, fields, columns, schema, **kwargs)
54 if isinstance(data, _strtypes):
55 data = resource(data, schema=schema, dshape=dshape, columns=columns,
---> 56 **kwargs)
57 if (isinstance(data, Iterator) and
58 not isinstance(data, tuple(not_an_iterator))):
/home/[username-hidden]/anaconda3/lib/python3.4/site-packages/odo/regex.py in __call__(self, s, *args, **kwargs)
62
63 def __call__(self, s, *args, **kwargs):
---> 64 return self.dispatch(s)(s, *args, **kwargs)
65
66 @property
/home/[username-hidden]/anaconda3/lib/python3.4/site-packages/odo/backends/csv.py in resource_csv(uri, **kwargs)
276 @resource.register('.+\.(csv|tsv|ssv|data|dat)(\.gz|\.bz2?)?')
277 def resource_csv(uri, **kwargs):
--> 278 return CSV(uri, **kwargs)
279
280
/home/[username-hidden]/anaconda3/lib/python3.4/site-packages/odo/backends/csv.py in __init__(self, path, has_header, encoding, sniff_nbytes, **kwargs)
102 if has_header is None:
103 self.has_header = (not os.path.exists(path) or
--> 104 infer_header(path, sniff_nbytes))
105 else:
106 self.has_header = has_header
/home/[username-hidden]/anaconda3/lib/python3.4/site-packages/odo/backends/csv.py in infer_header(path, nbytes, encoding, **kwargs)
58 with open_file(path, 'rb') as f:
59 raw = f.read(nbytes)
---> 60 return csv.Sniffer().has_header(raw if PY2 else raw.decode(encoding))
61
62
/home/[username-hidden]/anaconda3/lib/python3.4/csv.py in has_header(self, sample)
392 # subtracting from the likelihood of the first row being a header.
393
--> 394 rdr = reader(StringIO(sample), self.sniff(sample))
395
396 header = next(rdr) # assume first row is header
/home/[username-hidden]/anaconda3/lib/python3.4/csv.py in sniff(self, sample, delimiters)
187
188 if not delimiter:
--> 189 raise Error("Could not determine delimiter")
190
191 class dialect(Dialect):
Error: Could not determine delimiter

I am working with Python 2.7.10, dask v0.7.1, blaze v0.8.2 and conda v3.17.0.
conda install dask
conda install blaze
Here is a way you can import the data for use with blaze: parse the data first with pandas and then convert it to blaze. Perhaps this defeats the purpose, but it works without any trouble.
As a side note, in order to parse the data file correctly, your pandas parse statement should be:
from blaze import Data
from pandas import DataFrame, read_csv
data = read_csv("csvdata.dat", sep="\s*", index_col=False)
bdata = Data(data)
Now the data is formatted correctly with no errors, bdata:
event pdg x y z t px py \
0 3383 11 -161.515 0.000050 -0.000187 0.195413 0.664065 0.126078
1 1694 11 -161.515 -0.000356 0.000263 0.195413 0.511853 -0.523429
2 4228 11 -161.535 0.000007 -0.000033 0.194947 -0.713983 -0.026547
3 4233 11 -161.515 -0.000524 0.000065 0.195413 0.942642 0.331324
pz ekin
0 -0.736968 0.007232
1 0.681196 0.004727
2 -0.699660 0.010868
Here is an alternative: use dask. It can probably do the same chunking or large-scale processing you are looking for, and it certainly makes it easy to correctly load a TSV file.
In [17]: import dask.dataframe as dd
In [18]: df = dd.read_csv('tsvdata.txt', sep='\t', index_col=False)
In [19]: df.head()
Out[19]:
event pdg x y z t px py \
0 3383 11 -161.515 0.000050 -0.000187 0.195413 0.664065 0.126078
1 1694 11 -161.515 -0.000356 0.000263 0.195413 0.511853 -0.523429
2 4228 11 -161.535 0.000007 -0.000033 0.194947 -0.713983 -0.026547
3 4233 11 -161.515 -0.000524 0.000065 0.195413 0.942642 0.331324
4 854 11 -161.515 0.000032 0.000418 0.195414 0.675752 0.315671
pz ekin
0 -0.736968 0.007232
1 0.681196 0.004727
2 -0.699660 0.010868
3 0.040638 0.017594
4 -0.666116 0.012641
In [20]:
See also: http://dask.pydata.org/en/latest/array-blaze.html#how-to-use-blaze-with-dask

Related

Confusing indexer of pandas

I found the bracket indexer([]) very confusing.
import pandas as pd
import numpy as np
aa = np.asarray([[1,2,3],[4,5,6],[7,8,9]])
df = pd.DataFrame(aa)
df
output
0 1 2
0 1 2 3
1 4 5 6
2 7 8 9
Then I tried to index it with []
df[1]
output as below, it seems it gets me the values of a column
0 2
1 5
2 8
but..when I do
df[1:3]
it gets me the rows...
0 1 2
1 4 5 6
2 7 8 9
Besides that, it does not allow me to do
df[1,2]
it gives me error
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Untitled-1.ipynb Cell 19' in <cell line: 1>()
----> 1 df[1,2]
File d:\ProgramData\Miniconda3\lib\site-packages\pandas\core\frame.py:3458, in DataFrame.__getitem__(self, key)
3456 if self.columns.nlevels > 1:
3457 return self._getitem_multilevel(key)
-> 3458 indexer = self.columns.get_loc(key)
3459 if is_integer(indexer):
3460 indexer = [indexer]
File d:\ProgramData\Miniconda3\lib\site-packages\pandas\core\indexes\range.py:388, in RangeIndex.get_loc(self, key, method, tolerance)
386 except ValueError as err:
387 raise KeyError(key) from err
--> 388 raise KeyError(key)
389 return super().get_loc(key, method=method, tolerance=tolerance)
KeyError: (1, 2)
Should I avoid using [] and always use loc and iloc instead ?
In pandas, if you want to select values by numeric (positional) index, use iloc. A DataFrame has two axes, so to select a specific cell you have to specify both axes (row and column). See the code below.
df.iloc[0,0] # this should return the value 1
df.iloc[0,:] # this returns the first row
df.iloc[:,0] # first column
df.iloc[:2,:2] # this returns a slice of the dataframe which is the first two rows with the first two columns
To select values by labels (column names and index labels), use loc.

Truth value of a Dataframe is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()

I've tried to create a pairplot with seaborn from my CSV data (this link), using the following code based on the seaborn site.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
freq_data = pd.read_csv('C:\\Users\\frequency.csv')
freq = sns.load_dataset(freq_data)
df = sns.pairplot(iris, hue="condition", height=2.5)
plt.show()
The result is a traceback saying that the truth value of the DataFrame is ambiguous:
Traceback (most recent call last):
File "\.vscode\test.py", line 8, in <module>
freq = sns.load_dataset(freq_data)
File "\site-packages\seaborn\utils.py", line 485, in load_dataset
if name not in get_dataset_names():
File "\site-packages\pandas\core\generic.py", line 1441, in __nonzero__
raise ValueError(
ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
I've checked my data; here is the result:
condition area sphericity aspect_ratio
0 20 kHz 0.249 0.287 1.376
1 20 kHz 0.954 0.721 1.421
2 20 kHz 0.118 0.260 1.409
3 20 kHz 0.540 0.552 1.526
4 20 kHz 0.448 0.465 1.160
.. ... ... ... ...
310 30 kHz 6.056 0.955 2.029
311 30 kHz 4.115 1.097 1.398
312 30 kHz 11.055 1.816 1.838
313 30 kHz 4.360 1.183 1.162
314 30 kHz 10.596 0.940 1.715
[315 rows x 4 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315 entries, 0 to 314
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 condition 315 non-null object
1 area 315 non-null float64
2 sphericity 315 non-null float64
3 aspect_ratio 315 non-null float64
dtypes: float64(3), object(1)
memory usage: 10.0+ KB
I have no idea what is happening with my dataframe :(
Please advise me on how to solve this problem.
Thank you, everyone.
The first argument of seaborn.load_dataset() is the name of the dataset ({name}.csv on https://github.com/mwaskom/seaborn-data), not a pandas.DataFrame object. The return value of seaborn.load_dataset() is just a pandas.DataFrame, which you already have from pd.read_csv(), so you don't need to do
freq = sns.load_dataset(freq_data)
Moreover, you may want freq_data rather than iris in df = sns.pairplot(iris, hue="condition", height=2.5).
Here is the final example code
from io import StringIO
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
TESTDATA = StringIO("""condition;area;sphericity;aspect_ratio
20 kHz;0.249;0.287;1.376
20 kHz;0.954;0.721;1.421
20 kHz;0.118;0.260;1.409
20 kHz;0.540;0.552;1.526
20 kHz;0.448;0.465;1.160
30 kHz;6.056;0.955;2.029
30 kHz;4.115;1.097;1.398
30 kHz;11.055;1.816;1.838
30 kHz;4.360;1.183;1.162
30 kHz;10.596;0.940;1.715
""")
freq_data = pd.read_csv(TESTDATA, sep=";")
sns.pairplot(freq_data, hue="condition", height=2.5)
plt.show()

Is there another way to solve about pandas set option?

I'm analyzing a DataFrame and want to see a more detailed listing of its rows and columns,
but even though I tried some solutions I found on Google,
I don't understand why the output does not change.
What is the problem?
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Import data
df = df = pd.read_csv(r"C:\Users\Administrator\Desktop\medical.txt")
pd.set_option("display.max_rows", 50)
pd.set_option('display.max_columns', 15)
print(df)
id age gender height weight ap_hi ap_lo cholesterol gluc
0 0 18393 2 168 62.0 110 80 1 1
1 1 20228 1 156 85.0 140 90 3 1
2 2 18857 1 165 64.0 130 70 3 1
3 3 17623 2 169 82.0 150 100 1 1
4 4 17474 1 156 56.0 100 60 1 1
... ... ... ... ... ... ... ... ...
69995 99993 19240 2 168 76.0 120 80 1 1
69996 99995 22601 1 158 126.0 140 90 2 2
69997 99996 19066 2 183 105.0 180 90 3 1
69998 99998 22431 1 163 72.0 135 80 1 2
69999 99999 20540 1 170 72.0 120 80 2 1
Look at the "Frequently used options" chapter of https://pandas.pydata.org/pandas-docs/stable/user_guide/options.html.
You can see that if "max_rows" is lower than the total number of rows in your dataframe, the output is truncated, as in your results.
Below is a copy-paste of the relevant part of the page I linked:
If you want a way to display enough columns, use
pd.set_option('display.width',1000)
or
pd.set_option('display.width',None)
but for rows, you may simply use
df.head(50)
or
df.tail(50)
or use the following to display all rows:
pd.set_option("display.max_rows", None)
Why setting that has no effect:
The second parameter is not the maximum number of rows that can be viewed, but an internal template parameter.
The code is as follows:
set_option = CallableDynamicDoc(_set_option, _set_option_tmpl)
CallableDynamicDoc:
class CallableDynamicDoc:
def __init__(self, func, doc_tmpl):
self.__doc_tmpl__ = doc_tmpl
self.__func__ = func
def __call__(self, *args, **kwds):
return self.__func__(*args, **kwds)
@property
def __doc__(self):
opts_desc = _describe_option("all", _print_desc=False)
opts_list = pp_options_list(list(_registered_options.keys()))
return self.__doc_tmpl__.format(opts_desc=opts_desc, opts_list=opts_list)

Appending pandas data to hdf store, getting 'TypeError: object of type 'int' has no len()' error

Motivation:
I have about 30 million rows of data, one column being an index value, the other being a list of 512 int32 numbers. I wish to only retrieve maybe a thousand or so at a time, so I want to create some sort of datastore that can look up the data by index, while leaving the rest on the disk.
Right now the data is split up into 184 files, which can be opened by pandas.
This is what my dataframe looks like
df.head()
IndexID NumpyIds
1899317 [0, 47715, 1757, 9, 38994, 230, 12, 241, 12228...
22861131 [0, 48156, 154, 6304, 43611, 11, 9496, 8982, 1...
2163410 [0, 26039, 41156, 227, 860, 3320, 6673, 260, 1...
15760716 [0, 40883, 4086, 11, 5, 18559, 1923, 1494, 4, ...
12244098 [0, 45651, 4128, 227, 5, 10397, 995, 731, 9, 3...
There is the index, and then the column 'NumpyIds' which are numpy arrays of size 512, containing int32 ints.
I then tried this:
store = pd.HDFStore('/data2.h5')
store.put('index', df, format='table', append=True)
And got this
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-12-05b956667991> in <module>()
----> 1 store.put('index', df, format='table', append=True, data_columns=True)
2 store.close
4 frames
/usr/local/lib/python3.6/dist-packages/pandas/io/pytables.py in put(self, key, value, format, index, append, complib, complevel, min_itemsize, nan_rep, data_columns, encoding, errors)
1040 data_columns=data_columns,
1041 encoding=encoding,
-> 1042 errors=errors,
1043 )
1044
/usr/local/lib/python3.6/dist-packages/pandas/io/pytables.py in _write_to_group(self, key, value, format, axes, index, append, complib, complevel, fletcher32, min_itemsize, chunksize, expectedrows, dropna, nan_rep, data_columns, encoding, errors)
1707 dropna=dropna,
1708 nan_rep=nan_rep,
-> 1709 data_columns=data_columns,
1710 )
1711
/usr/local/lib/python3.6/dist-packages/pandas/io/pytables.py in write(self, obj, axes, append, complib, complevel, fletcher32, min_itemsize, chunksize, expectedrows, dropna, nan_rep, data_columns)
4141 min_itemsize=min_itemsize,
4142 nan_rep=nan_rep,
-> 4143 data_columns=data_columns,
4144 )
4145
/usr/local/lib/python3.6/dist-packages/pandas/io/pytables.py in _create_axes(self, axes, obj, validate, nan_rep, data_columns, min_itemsize)
3811 nan_rep=nan_rep,
3812 encoding=self.encoding,
-> 3813 errors=self.errors,
3814 )
3815 adj_name = _maybe_adjust_name(new_name, self.version)
/usr/local/lib/python3.6/dist-packages/pandas/io/pytables.py in _maybe_convert_for_string_atom(name, block, existing_col, min_itemsize, nan_rep, encoding, errors)
4798 # we cannot serialize this data, so report an exception on a column
4799 # by column basis
-> 4800 for i in range(len(block.shape[0])):
4801
4802 col = block.iget(i)
TypeError: object of type 'int' has no len()
What am I trying to do?
I have 184 pandas files which I am trying to concatenate into 1 hdf file for fast look up using the index.
For example
store['index'][21]
Would give me that 512 dimension vector for the index of 21.
Edit:
I tried creating a column for every number, so
df[[str(i) for i in range(512)]] = pd.DataFrame(df.NumpyIds.to_numpy(), index=df.index)
df.drop(columns='NumpyIds', inplace=True)
store.put('index', df, format='table', append=True)
store.close
Which works, although I feel this may be a hack rather than an ideal workaround. But now the issue is I can't seem to get those values from the index
store.select(key='index', start=2163410)
returns
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 ... 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511
IndexID
0 rows × 512 columns
Which are the column names, but not the data in that column. Also this method takes a lot of RAM. I am wondering if it loads all the data at once, rather than just the index specified.
Another workaround I'm trying is opening the data directly in h5py
df = pd.read_hdf(hdf_files[0])
df.set_index('IndexID', inplace=True)
df.to_hdf('testhdf.h5', key='df')
h = h5py.File('testhdf.h5')
But I can't seem to figure out how to retrieve data by index from this store
h['df'][2163410]
/usr/local/lib/python3.6/dist-packages/h5py/_hl/base.py in _e(self, name, lcpl)
135 else:
136 try:
--> 137 name = name.encode('ascii')
138 coding = h5t.CSET_ASCII
139 except UnicodeEncodeError:
AttributeError: 'int' object has no attribute 'encode'
As far as I know, this is a bug.
See #34274.
I've fixed it in #38919. Now it shows an appropriate error message.

Input contains infinity or a value too large for dtype('float64')

I've seen many similar questions here, but none of the answers solved my problem.
I am trying to apply a Power Transform to my dataset, but I still get this error.
The dataset does not contain inf or nan values, and I made sure that no values are greater than float64.max. I also tried reindexing the dataframe beforehand.
features_training = features_training.astype(np.float64)
target_training = target_training.astype(np.float64)
features_test = features_test.astype(np.float64)
target_test = target_test.astype(np.float64)
print(np.where(features_training.values >= np.finfo(np.float64).max))
print(np.where(features_test.values >= np.finfo(np.float64).max))
print(np.where(target_training.values >= np.finfo(np.float64).max))
print(np.where(target_test.values >= np.finfo(np.float64).max))
print(np.isnan(features_training.values).any())
print(np.isnan(features_test.values).any())
print(np.isnan(target_training.values).any())
print(np.isnan(target_test.values).any())
print(np.isinf(features_training.values).any())
print(np.isinf(features_test.values).any())
print(np.isinf(target_training.values).any())
print(np.isinf(target_test.values).any())
pt_X = PowerTransformer().fit(features_training)
pt_Y = PowerTransformer().fit(np.asarray(target_training).reshape(-1,1))
features_training = pt_X.transform(features_training)
target_training = pt_Y.transform(np.asarray(target_training).reshape(-1,1))
features_test = pt_X.transform(features_test)
target_test = pt_Y.transform(np.asarray(target_test).reshape(-1,1))
Using dataframe.info()
features training
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Columns: 138 entries
dtypes: float64(138)
memory usage: 545.6 KB
None
target training
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 1 columns):
506 non-null float64
dtypes: float64(1)
memory usage: 4.0 KB
None
features test
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 482 entries, 0 to 481
Columns: 138 entries
dtypes: float64(138)
memory usage: 519.7 KB
None
target test
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 482 entries, 0 to 481
Data columns (total 1 columns):
482 non-null float64
dtypes: float64(1)
memory usage: 3.8 KB
None
Error traceback
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-100-6ca93dd1855a> in <module>
21 # features already normalized. Target remains the same
22 features_training, features_test, target_training, target_test, ptX_, pt_Y = normalization(features_training, features_test,
---> 23 target_training, target_test)
24
25 model.fit(features_training, target_training)
<ipython-input-99-9199a48b9d30> in normalization(features_training, features_test, target_training, target_test)
47 target_training = pt_Y.transform(np.asarray(target_training).reshape(-1,1))
48
---> 49 features_test = pt_X.transform(features_test)
50 target_test = pt_Y.transform(np.asarray(target_test).reshape(-1,1))
51
~\AppData\Local\Continuum\anaconda2\envs\env36\lib\site-packages\sklearn\preprocessing\data.py in transform(self, X)
2731
2732 if self.standardize:
-> 2733 X = self._scaler.transform(X)
2734
2735 return X
~\AppData\Local\Continuum\anaconda2\envs\env36\lib\site-packages\sklearn\preprocessing\data.py in transform(self, X, copy)
756 X = check_array(X, accept_sparse='csr', copy=copy,
757 estimator=self, dtype=FLOAT_DTYPES,
--> 758 force_all_finite='allow-nan')
759
760 if sparse.issparse(X):
~\AppData\Local\Continuum\anaconda2\envs\env36\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
540 if force_all_finite:
541 _assert_all_finite(array,
--> 542 allow_nan=force_all_finite == 'allow-nan')
543
544 if ensure_min_samples > 0:
~\AppData\Local\Continuum\anaconda2\envs\env36\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X, allow_nan)
54 not allow_nan and not np.isfinite(X).all()):
55 type_err = 'infinity' if allow_nan else 'NaN, infinity'
---> 56 raise ValueError(msg_err.format(type_err, X.dtype))
57 # for object dtype data, we only check for NaNs (GH-13254)
58 elif X.dtype == np.dtype('object') and not allow_nan:
ValueError: Input contains infinity or a value too large for dtype('float64').