Problems using pandas read sql with a connection using cx_Oracle 6.0b2 - sql

When using cx_Oracle 5.3 I did not have this issue, but for a particularly large query that I am trying to run using:
connection = cx_Oracle.connect('Username/Password@host/dbname')
pd.read_sql(Query,connection)
I get the following value error:
ValueError Traceback (most recent call last)
<ipython-input-22-916f315e0bf6> in <module>()
----> 1 OracleEx = pd.read_sql(x,connection)
2 OracleEx.head()
C:\Users\kevinb\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\sql.py in read_sql(sql, con, index_col, coerce_float, params, parse_dates, columns, chunksize)
497 sql, index_col=index_col, params=params,
498 coerce_float=coerce_float, parse_dates=parse_dates,
--> 499 chunksize=chunksize)
500
501 try:
C:\Users\kevinb\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\sql.py in read_query(self, sql, index_col, coerce_float, params, parse_dates, chunksize)
1606 parse_dates=parse_dates)
1607 else:
-> 1608 data = self._fetchall_as_list(cursor)
1609 cursor.close()
1610
C:\Users\kevinb\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\io\sql.py in _fetchall_as_list(self, cur)
1615
1616 def _fetchall_as_list(self, cur):
-> 1617 result = cur.fetchall()
1618 if not isinstance(result, list):
1619 result = list(result)
ValueError: invalid literal for int() with base 10: '8.9'
Setting up my own cursor and using cur.fetchall() I get a similar result:
ValueError Traceback (most recent call last)
<ipython-input-46-d32c0f219cdf> in <module>()
----> 1 y=x.fetchall()
2 pd.DataFrame(y)
ValueError: invalid literal for int() with base 10: '7.3'
The values '8.9' and '7.3' change with every run.
Any ideas on why I am getting these value errors?
pd.read_sql and using cur.fetchall() have worked for some queries, but not the particular one I am using which has worked in previous versions of cx_Oracle.

Please try with the release candidate instead of beta 2. There was an issue when retrieving certain numeric expressions.
python -m pip install cx_Oracle --upgrade --pre

Related

Pandasql returns error with a basic example

The following code when run
import pandas as pd
from pandasql import sqldf
df = pd.DataFrame({'col1': [1, 2, 3, 4], 'col2': [10, 20, 30, 40]})
query = "SELECT * FROM df WHERE col1 > 2"
result = sqldf(query, globals())
print(result)
gives the following error:
Output exceeds the size limit. Open the full output data in a text editor
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
File ~/.virtualenvs/r-reticulate/lib64/python3.11/site-packages/sqlalchemy/engine/base.py:1410, in Connection.execute(self, statement, parameters, execution_options)
1409 try:
-> 1410 meth = statement._execute_on_connection
1411 except AttributeError as err:
AttributeError: 'str' object has no attribute '_execute_on_connection'
The above exception was the direct cause of the following exception:
ObjectNotExecutableError Traceback (most recent call last)
Cell In[1], line 11
8 query = "SELECT * FROM df WHERE col1 > 2"
10 # Execute the query using pandasql
---> 11 result = sqldf(query, globals())
13 print(result)
File ~/.virtualenvs/r-reticulate/lib64/python3.11/site-packages/pandasql/sqldf.py:156, in sqldf(query, env, db_uri)
124 def sqldf(query, env=None, db_uri='sqlite:///:memory:'):
125 """
126 Query pandas data frames using sql syntax
127 This function is meant for backward compatibility only. New users are encouraged to use the PandaSQL class.
(...)
154 >>> sqldf("select avg(x) from df;", locals())
...
1416 distilled_parameters,
1417 execution_options or NO_OPTIONS,
1418 )
ObjectNotExecutableError: Not an executable object: 'SELECT * FROM df WHERE col1 > 2'
Could someone please help me?
The problem could be fixed by downgrading SQLAlchemy:
pip install SQLAlchemy==1.4.46
See bug report for more details.

ufunc 'divide' not supported for the input types...... error problem while trying to get the NumPy average

I am new to Numpy and I have been trying to get the average of an array I derived from another array.
This is the code that have been giving me error: "ufunc 'divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'' "
import numpy as np
import pandas as pd
​
cars = pd.read_csv('data/co2_emissions_canada.csv')
cars_makes = cars['Make'].to_numpy()
cars_models = cars['Model'].to_numpy()
cars_classes = cars['Vehicle Class'].to_numpy()
cars_engine_sizes = cars['Engine Size(L)'].to_numpy()
cars_cylinders = cars['Cylinders'].to_numpy()
cars_transmissions = cars['Transmission'].to_numpy()
cars_fuel_types = cars['Fuel Type'].to_numpy()
cars_fuel_consumption = cars['Fuel Consumption Comb (L/100 km)'].to_numpy()
cars_co2_emissions = cars['CO2 Emissions(g/km)'].to_numpy()
​
#the median of the cars_engine_sizes
print(np.median(cars_engine_sizes))
#the average fuel consumption for regular gasoline (Fuel Type = X), #premium gasoline (Z), ethanol (E), and diesel (D)?
fuel_typesx=np.array(cars_fuel_types[cars_fuel_types=='X'])
print(np.average(fuel_typesx))
fuel_typesz=np.array(cars_fuel_types[cars_fuel_types=='Z'])
print(np.average(fuel_typesz))
fuel_typese=np.array(cars_fuel_types[cars_fuel_types=='E'])
print(np.average(fuel_typese))
Please, what am I missing?
I'm guessing the FULL error message looks something like this:
In [753]: np.average(np.array(['A','B','C','A'],dtype=object))
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Input In [753], in <cell line: 1>()
----> 1 np.average(np.array(['A','B','C','A'],dtype=object))
File <__array_function__ internals>:5, in average(*args, **kwargs)
File ~\anaconda3\lib\site-packages\numpy\lib\function_base.py:380, in average(a, axis, weights, returned)
377 a = np.asanyarray(a)
379 if weights is None:
--> 380 avg = a.mean(axis)
381 scl = avg.dtype.type(a.size/avg.size)
382 else:
File ~\anaconda3\lib\site-packages\numpy\core\_methods.py:191, in _mean(a, axis, dtype, out, keepdims, where)
189 ret = ret.dtype.type(ret / rcount)
190 else:
--> 191 ret = ret / rcount
193 return ret
TypeError: ufunc 'true_divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
cars_fuel_types comes from a dataframe, and evidently contains strings like 'E'. So it is object dtype. Even if you select like values, you can't take an 'average'.
average takes the sum of values and divides by the count. sum for python strings is concatenation, not some sort of math.
In [754]: np.sum(np.array(['A','B','C','A'],dtype=object))
Out[754]: 'ABCA'

Converting financial information to pandas data frame

I am trying to get stock data such as balance_sheet, income_statement and cash_flow for multiple stocks and converting it to a data frame for manipulations.
here is the getting the data part of the code :
**import yahoo_fin.stock_info as yfs
tickers = ['AMZN','AAPL','MSFT','DIS','GOOG']
balance_sheet=[]
income_statement=[]
cash_flow=[]
balance_sheet.append({ticker : yfs.get_balance_sheet(ticker) for ticker in tickers})
income_statement.append({ticker : yfs.get_income_statement(ticker) for ticker in tickers })
cash_flow.append({ticker : yfs.get_cash_flow(ticker) for ticker in tickers})**
This part works well and returns a dictionary for each category. I then do this:
my_dict=cash_flow+balance_sheet+income_statement
dff=pd.DataFrame.from_dict(my_dict, orient='columns', dtype=None, columns=None)
Note that when I try orient='index' I get the following error message :
**AttributeError Traceback (most recent call last)
in
1 my_dict=cash_flow+balance_sheet+income_statement
----> 2 dff=pd.DataFrame.from_dict(my_dict, orient='index', dtype=None, columns=None)
3 # dff=dff.set_index('endDate')
4 dff
5 # cash_flow
/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in from_dict(cls, data, orient, dtype, columns)
1361 if len(data) > 0:
1362 # TODO speed up Series case
-> 1363 if isinstance(list(data.values())[0], (Series, dict)):
1364 data = _from_nested_dict(data)
1365 else:
enter code here
AttributeError: 'list' object has no attribute 'values'**
If someone could let me know what I'm doing wrong that would be very appreciated ! :)

How do I filter by date and count these dates using relational algebra (Reframe)?

I'm really stuck. I read the Reframe documentation https://reframe.readthedocs.io/en/latest/ that's based on pandas and I tried multiple things on my own but it still doesn't work. So I got a CSV file called weddings that looks like this:
Weddings, Date
Wedding1,20181107
Wedding2,20181107
And many more rows. As you can see, there are duplicates in the date column, but this doesn't matter, I think. I want to count the number of weddings filtered by date — for example, the number of weddings after 5 October 2016 (20161005). So first I tried this:
Weddings = Relation('weddings.csv')
Weddings.sort(['Date']).query('Date > 20161005').project(['Weddings', 'Date'])
This seems logical to me but I get a Keyerror 'Date' and don't know why? So I tried something more simple
Weddings = Relation('weddings.csv')
Weddings.groupby(['Date']).count()
And this doesn't work either, I still get a Keyerror 'Date' and don't know why. Can someone help me?
Stack trace
KeyError Traceback (most recent call last)
<ipython-input-44-b358cdf55fdb> in <module>()
1
2 Weddings = Relation('weddings.csv')
----> 3 weddings.sort(['Date']).query('Date > 20161005').project(['Weddings', 'Date'])
4
5
~\Documents\Reframe.py in sort(self, *args,
**kwargs)
110 """
111
--> 112 return Relation(super().sort_values(*args, **kwargs))
113
114 def intersect(self, other):
~\Anaconda3\lib\site-packages\pandas\core\frame.py in sort_values(self, by,
axis, ascending, inplace, kind, na_position)
4416 by = by[0]
4417 k = self._get_label_or_level_values(by, axis=axis,
-> 4418 stacklevel=stacklevel)
4419
4420 if isinstance(ascending, (tuple, list)):
~\Anaconda3\lib\site-packages\pandas\core\generic.py in
_get_label_or_level_values(self, key, axis, stacklevel)
1377 values = self.axes[axis].get_level_values(key)._values
1378 else:
-> 1379 raise KeyError(key)
1380
1381 # Check for duplicates
KeyError: 'Date'

to_dataframe() bug when query returns no results

If a valid BigQuery query returns 0 rows, to_dataframe() crashes. (btw, I am running this on Google Cloud Datalab)
for example:
q = bq.Query('SELECT * FROM [isb-cgc:tcga_201510_alpha.Somatic_Mutation_calls] WHERE ( Protein_Change="V600E" ) LIMIT 10')
r = q.results()
r.to_dataframe()
produces:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-17-de55245104c0> in <module>()
----> 1 r.to_dataframe()
/usr/local/lib/python2.7/dist-packages/gcp/bigquery/_table.pyc in to_dataframe(self, start_row, max_rows)
628 # Need to reorder the dataframe to preserve column ordering
629 ordered_fields = [field.name for field in self.schema]
--> 630 return df[ordered_fields]
631
632 def to_file(self, destination, format='csv', csv_delimiter=',', csv_header=True):
TypeError: 'NoneType' object has no attribute '__getitem__'
is this a known bug?
Certainly not a known bug. Please do log a bug as mentioned by Felipe.
Contributions, both bug reports, and of course fixes, are welcome! :)