Why am I getting an error when I try to do simple arithmetic on constants in an HDF5 where clause? Here's an example:
>>> import pandas
>>> import numpy as np
>>> d = pandas.DataFrame({"A": np.arange(10), "B": np.random.randint(1, 100, 10)})
>>> store = pandas.HDFStore('teststore.h5', mode='w')
>>> store.append('thingy', d, format='table', data_columns=True, append=False)
>>> store.select('thingy', where="B>50")
A B
0 0 61
1 1 63
6 6 80
7 7 79
8 8 52
9 9 82
>>> store.select('thingy', where="B>40+10")
Traceback (most recent call last):
File "<pyshell#26>", line 1, in <module>
store.select('thingy', where="B>40+10")
File "c:\users\brenbarn\documents\python\extensions\pandas\pandas\io\pytables.py", line 682, in select
return it.get_result()
File "c:\users\brenbarn\documents\python\extensions\pandas\pandas\io\pytables.py", line 1365, in get_result
results = self.func(self.start, self.stop, where)
File "c:\users\brenbarn\documents\python\extensions\pandas\pandas\io\pytables.py", line 675, in func
columns=columns, **kwargs)
File "c:\users\brenbarn\documents\python\extensions\pandas\pandas\io\pytables.py", line 4006, in read
if not self.read_axes(where=where, **kwargs):
File "c:\users\brenbarn\documents\python\extensions\pandas\pandas\io\pytables.py", line 3212, in read_axes
self.selection = Selection(self, where=where, **kwargs)
File "c:\users\brenbarn\documents\python\extensions\pandas\pandas\io\pytables.py", line 4527, in __init__
self.condition, self.filter = self.terms.evaluate()
File "c:\users\brenbarn\documents\python\extensions\pandas\pandas\computation\pytables.py", line 580, in evaluate
self.condition = self.terms.prune(ConditionBinOp)
File "c:\users\brenbarn\documents\python\extensions\pandas\pandas\computation\pytables.py", line 122, in prune
res = pr(left.value, right.prune(klass))
File "c:\users\brenbarn\documents\python\extensions\pandas\pandas\computation\pytables.py", line 118, in prune
res = pr(left.value, right.value)
File "c:\users\brenbarn\documents\python\extensions\pandas\pandas\computation\pytables.py", line 113, in pr
encoding=self.encoding).evaluate()
File "c:\users\brenbarn\documents\python\extensions\pandas\pandas\computation\pytables.py", line 317, in evaluate
raise ValueError("query term is not valid [%s]" % self)
ValueError: query term is not valid [[Condition : [None]]]
Querying directly on the underlying pytables object seems to work:
>>> for row in store.get_storer('thingy').table.where("B>40+10"):
... print(row[:])
(0L, 0, 61)
(1L, 1, 63)
(6L, 6, 80)
(7L, 7, 79)
(8L, 8, 52)
(9L, 9, 82)
So what is going on here?
This is simply not supported. I suppose it could fail with a slightly better message. it is trying to and the 2 nodes (the comparison and the +10) and doesn't know how to deal with it as it's not a comparison operation.
I suppose it could be implemented but IMHO is needlessly complex
Related
The documentation for Tensorflow's tf.image.ssim() says that if you want to return an element-wise structural similarity map, you can set "return_index_map" to True. It is returning an error when I define the argument as either True or False. I am running Tensorflow 2.10 on an Apple M1 Max.
When I run:
import tensorflow as tf
a = tf.ones((20, 20, 20, 20))
b = tf.zeros((20, 20, 20, 20))
sim = tf.image.ssim(a, b, max_val=1, return_index_map=True)
I get the error:
Traceback (most recent call last):
File "/Users/miguel/opt/miniconda3/envs/py39_tensorflow/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-4-967be677c86c>", line 1, in <module>
sim = tf.image.ssim(a, b, max_val=1, return_index_map=True)
File "/Users/miguel/opt/miniconda3/envs/py39_tensorflow/lib/python3.9/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/Users/miguel/opt/miniconda3/envs/py39_tensorflow/lib/python3.9/site-packages/tensorflow/python/util/dispatch.py", line 1170, in op_dispatch_handler
result = api_dispatcher.Dispatch(args, kwargs)
TypeError: Got an unexpected keyword argument 'return_index_map'
Input:
import statsmodels.api as sm
import pandas as pd
# reading data from the csv
data = pd.read_csv('/Users/justkiddings/Desktop/Python/TM/TM.csv')
# defining the variables
x = data['FSP'].tolist()
y = data['RSP'].tolist()
# adding the constant term
x = sm.add_constant(x)
# performing the regression
# and fitting the model
result = sm.OLS(y, x).fit()
# printing the summary table
print(result.summary())
Output:
runfile('/Users/justkiddings/Desktop/Python/Code/untitled28.py', wdir='/Users/justkiddings/Desktop/Python/Code')
Traceback (most recent call last):
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/spyder_kernels/py3compat.py", line 356, in compat_exec
exec(code, globals, locals)
File "/Users/justkiddings/Desktop/Python/Code/untitled28.py", line 24, in <module>
result = sm.OLS(y, x).fit()
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/regression/linear_model.py", line 890, in __init__
super(OLS, self).__init__(endog, exog, missing=missing,
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/regression/linear_model.py", line 717, in __init__
super(WLS, self).__init__(endog, exog, missing=missing,
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/regression/linear_model.py", line 191, in __init__
super(RegressionModel, self).__init__(endog, exog, **kwargs)
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py", line 267, in __init__
super().__init__(endog, exog, **kwargs)
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py", line 92, in __init__
self.data = self._handle_data(endog, exog, missing, hasconst,
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/model.py", line 132, in _handle_data
data = handle_data(endog, exog, missing, hasconst, **kwargs)
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/data.py", line 673, in handle_data
return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/data.py", line 86, in __init__
self._handle_constant(hasconst)
File "/Users/justkiddings/opt/anaconda3/lib/python3.9/site-packages/statsmodels/base/data.py", line 132, in _handle_constant
raise MissingDataError('exog contains inf or nans')
MissingDataError: exog contains inf or nans
Some of the Data:
DATE,HOUR,STATION,CO,FSP,NO2,NOX,O3,RSP,SO2
1/1/2022,1,TUEN MUN,75,38,39,40,83,59,2
1/1/2022,2,TUEN MUN,72,35,29,30,90,61,2
1/1/2022,3,TUEN MUN,74,38,28,30,91,66,2
1/1/2022,4,TUEN MUN,76,39,31,32,79,61,2
1/1/2022,5,TUEN MUN,72,38,25,26,83,65,2
1/1/2022,6,TUEN MUN,74,37,24,25,86,60,2
I have removed the N.A. in my dataset and they have converted into blanks. (Eg. 3/1/2022,12,TUEN MUN,85,,53,70,59,,5) Why there is MissingDataError? How to fix it? Thanks.
I have a pandas dataframe raw_data with two columns: 'T' and 'BP':
T BP
0 -0.500 115.790
1 -0.499 115.441
2 -0.498 115.441
3 -0.497 115.441
4 -0.496 115.790
... ... ...
647163 646.663 105.675
647164 646.664 105.327
647165 646.665 105.327
647166 646.666 105.327
647167 646.667 104.978
[647168 rows x 2 columns]
I want to apply the Hodges-Lehmann mean (it's a robust average) over a rolling window and create a new column. Here's the function:
def hodgesLehmannMean(x):
m = np.add.outer(x, x)
ind = np.tril_indices(len(x), 0)
return 0.5 * np.median(m[ind])
I therefore write:
raw_data[new_col] = raw_data['BP'].rolling(21, min_periods=1, center=True,
win_type=None, axis=0, closed=None).agg(hodgesLehmannMean)
but I get a string of error messages:
Traceback (most recent call last):
File "C:\Users\tkpme\miniconda3\lib\runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:\Users\tkpme\miniconda3\lib\runpy.py", line 87, in _run_code
exec(code, run_globals)
File "c:\Users\tkpme\.vscode\extensions\ms-python.python-2020.8.101144\pythonFiles\lib\python\debugpy\__main__.py", line 45, in <module>
cli.main()
File "c:\Users\tkpme\.vscode\extensions\ms-python.python-2020.8.101144\pythonFiles\lib\python\debugpy/..\debugpy\server\cli.py", line 430, in main
run()
File "c:\Users\tkpme\.vscode\extensions\ms-python.python-2020.8.101144\pythonFiles\lib\python\debugpy/..\debugpy\server\cli.py", line 267, in run_file
runpy.run_path(options.target, run_name=compat.force_str("__main__"))
File "C:\Users\tkpme\miniconda3\lib\runpy.py", line 265, in run_path
return _run_module_code(code, init_globals, run_name,
File "C:\Users\tkpme\miniconda3\lib\runpy.py", line 97, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "C:\Users\tkpme\miniconda3\lib\runpy.py", line 87, in _run_code
exec(code, run_globals)
File "c:\Users\tkpme\OneDrive\Documents\Work\CMC\BP Satya and Suresh\Code\Naveen_peak_detect test.py", line 227, in <module>
main()
File "c:\Users\tkpme\OneDrive\Documents\Work\CMC\BP Satya and Suresh\Code\Naveen_peak_detect test.py", line 75, in main
raw_data[new_col] = raw_data['BP'].rolling(FILTER_WINDOW, min_periods=1, center=True, win_type=None,
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\window\rolling.py", line 1961, in aggregate
return super().aggregate(func, *args, **kwargs)
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\window\rolling.py", line 523, in aggregate
return self.apply(func, raw=False, args=args, kwargs=kwargs)
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\window\rolling.py", line 1987, in apply
return super().apply(
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\window\rolling.py", line 1300, in apply
return self._apply(
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\window\rolling.py", line 507, in _apply
result = calc(values)
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\window\rolling.py", line 495, in calc
return func(x, start, end, min_periods)
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\window\rolling.py", line 1326, in apply_func
return window_func(values, begin, end, min_periods)
File "pandas\_libs\window\aggregations.pyx", line 1375, in pandas._libs.window.aggregations.roll_generic_fixed
File "c:\Users\tkpme\OneDrive\Documents\Work\CMC\BP Satya and Suresh\Code\Naveen_peak_detect test.py", line 222, in hodgesLehmannMean
m = np.add.outer(x, x)
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\series.py", line 705, in __array_ufunc__
return construct_return(result)
File "C:\Users\tkpme\miniconda3\lib\site-packages\pandas\core\series.py", line 694, in construct_return
raise NotImplementedError
NotImplementedError
which appear to be driven by the line
m = np.add.outer(x, x)
and points to something not being implemented or numpy being missing. But I import numpy right at the beginning as follows:
import numpy as np
import pandas as pd
The function works perfectly well on its own if I feed it a list or a numpy array, so I'm not sure what the problem is. Interestingly, if I use the median instead of the Hodges-Lehmann Mean, it runs like a charm
raw_data[new_col] = raw_data['BP'].rolling(21, min_periods=1, center=True,
win_type=None, axis=0, closed=None).median()
What is the cause of my problem, and how do I fix it?
Sincerely
Thomas Philips
I've tried your code with a small dataframe and it worked well, so maybe there is something on your dataframe that must be cleaned or transformed.
Solved it. It turns out that
m = np.add.outer(x, x)
requires x to be array like. When I tested it using lists, numpy arrays, etc. it worked perfectly, just as it did for you. But the .rolling line generates a slice of a dataframe, which is not array like, and the function fails with a confusing error message. I modified the function to create a numpy array from the input and it now works as it should.
def hodgesLehmannMean(x):
x_array = np.array(x)
m = np.add.outer(x_array, x_array)
ind = np.tril_indices(len(x_array), 0)
return 0.5 * np.median(m[ind])
Thanks for looking at it!
Hello I am trying to create a Dask Dataframe by pulling data from an Oracle Database as:
import cx_Oracle
import pandas as pd
import dask
import dask.dataframe as dd
# Build connection string/URL
user='user'
pw='pw'
host = 'xxx-yyy-x000'
port = '9999'
sid= 'XXXXX000'
ora_uri = 'oracle+cx_oracle://{user}:{password}#{sid}'.format(user=user, password=pw, sid=cx_Oracle.makedsn(host,port,sid))
tstquery ="select ID from EXAMPLE where rownum <= 5"
# Create Pandas Dataframe from ORACLE Query pull
tstdf1 = pd.read_sql(tstquery
,con = ora_uri
)
print("Dataframe tstdf1 created by pd.read_sql")
print(tstdf1.info())
# Create Dask Dataframe from ORACLE Query pull
tstdf2 = dd.read_sql_table(table = tstquery
,uri = ora_uri
,index_col = 'ID'
)
print(tstdf2.info())
As you can see the Pandas DF gets created but not the Dask DF. Following is the stdout:
Dataframe tstdf1 created by pd.read_sql
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 1 columns):
ID 5 non-null int64
dtypes: int64(1)
memory usage: 120.0 bytes
None
Traceback (most recent call last):
File "dk_test.py", line 40, in <module>
,index_col = 'ID'
File "---------------------------python3.6/site-packages/dask/dataframe/io/sql.py", line 103, in read_sql_table
table = sa.Table(table, m, autoload=True, autoload_with=engine, schema=schema)
File "<string>", line 2, in __new__
File "---------------------------python3.6/site-packages/sqlalchemy/util/deprecations.py", line 130, in warned
return fn(*args, **kwargs)
File "---------------------------python3.6/site-packages/sqlalchemy/sql/schema.py", line 496, in __new__
metadata._remove_table(name, schema)
File "---------------------------python3.6/site-packages/sqlalchemy/util/langhelpers.py", line 68, in __exit__
compat.reraise(exc_type, exc_value, exc_tb)
File "---------------------------python3.6/site-packages/sqlalchemy/util/compat.py", line 154, in reraise
raise value
File "---------------------------python3.6/site-packages/sqlalchemy/sql/schema.py", line 491, in __new__
table._init(name, metadata, *args, **kw)
File "---------------------------python3.6/site-packages/sqlalchemy/sql/schema.py", line 585, in _init
resolve_fks=resolve_fks,
File "---------------------------python3.6/site-packages/sqlalchemy/sql/schema.py", line 609, in _autoload
_extend_on=_extend_on,
File "---------------------------python3.6/site-packages/sqlalchemy/engine/base.py", line 2147, in run_callable
return conn.run_callable(callable_, *args, **kwargs)
File "---------------------------python3.6/site-packages/sqlalchemy/engine/base.py", line 1604, in run_callable
return callable_(self, *args, **kwargs)
File "---------------------------python3.6/site-packages/sqlalchemy/engine/default.py", line 429, in reflecttable
table, include_columns, exclude_columns, resolve_fks, **opts
File "---------------------------python3.6/site-packages/sqlalchemy/engine/reflection.py", line 653, in reflecttable
raise exc.NoSuchTableError(table.name)
sqlalchemy.exc.NoSuchTableError: select ID from EXAMPLE where rownum <= 5
Needless to say, the table exists (As demonstrated by the creation of the Pandas DF), the Index is on
the col ID as well. What is the problem ?
Why do I get an error:
import pandas as pd
a = pd.Series(index=[4,5,6], data=0)
print a.loc[4:5]
a.loc[4:5] += 1
Output:
4 0
5 0
Traceback (most recent call last):
File "temp1.py", line 9, in <module>
dtype: int64
a.loc[4:5] += 1
File "lib\site-packages\pandas\core\indexing.py", line 88, in __setitem__
self._setitem_with_indexer(indexer, value)
File "lib\site-packages\pandas\core\indexing.py", line 177, in _setitem_with_indexer
value = self._align_series(indexer, value)
File "lib\site-packages\pandas\core\indexing.py", line 206, in _align_series
raise ValueError('Incompatible indexer with Series')
ValueError: Incompatible indexer with Series
Pandas 0.12.
I think this is a bug, you can work around this by use tuple index:
import pandas as pd
a = pd.Series(index=[4,5,6], data=0)
print a.loc[4:5]
a.loc[4:5,] += 1