Use of pandasql results in TypeError: "'str' object is not callable" - pandasql

I have created a DataFrame using
hdr = pd.read_csv("../input/worksheets/Header.csv")
hdr.head()
which displays the expected preview of the data.
Attempting a pandasql query on this such as
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
hdr_sql = pysqldf("Select ID, Gender, Location from hdr")
results in the error from the title:
TypeError: 'str' object is not callable
Could somebody please tell me what I'm doing wrong?
Many thanks
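For reference, a minimal, self-contained pandasql call looks like the sketch below (the hdr columns here are invented to mirror the question); if this runs but the notebook version does not, something in the notebook's environment is the likely culprit.
import pandas as pd
from pandasql import sqldf

hdr = pd.DataFrame({
    "ID": [1, 2],
    "Gender": ["F", "M"],
    "Location": ["UK", "US"],
})
# sqldf takes the query plus the namespace in which to look up `hdr`
print(sqldf("SELECT ID, Gender, Location FROM hdr", locals()))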

Related

Writing a scalable INSERT statement using cx_Oracle

I am attempting to write a script that will allow me to insert values from an uploaded dataframe into a table inside an Oracle DB, but my issue lies with two things:
- there are too many columns to hard-code
- the columns aren't one-to-one between the dataframe and the table
What I'm hoping for is a way to write out the columns, check that they sync with the columns of my dataframe, and from there use an INSERT ... VALUES SQL statement to load the values from the dataframe into the ODS table.
So far these are the important parts of my script:
import pandas as pd
import cx_Oracle
import config
df = pd.read_excel("Employee_data.xlsx")
conn = None
try:
    conn = cx_Oracle.connect(config.username, config.password, config.dsn, encoding=config.encoding)
except cx_Oracle.Error as error:
    print(error)
finally:
    cursor = conn.cursor()
    sql = "SELECT * FROM ODSMGR.EMPLOYEE_TABLE"
    cursor.execute(sql)
    data = cursor.fetchall()
    col_names = []
    for i in range(0, len(cursor.description)):
        col_names.append(cursor.description[i][0])
    # instead of using df.columns I use:
    rows = [tuple(x) for x in df.values]
which prints my ODS column names and conveniently stores the rows from the df as a list of tuples, but I'm at a loss for how to load these into the ODS. I found something like:
cursor.execute("insert into ODSMGR.EMPLOYEE_TABLE(col1, col2) values (:col1, :col2)", {"col1": df["col1"], "col2": df["col2"]})
but that would mean I'd have to hard-code everything, which wouldn't be scalable. I'm hoping I can get some insight to help. It's just difficult since the columns aren't 1-to-1 and there is some compression/collapsing of columns from the df to the ODS, but any help is appreciated.
NOTE: I've also attempted to use SQLAlchemy, but I am always given the error "ORA-12505: TNS:listener does not currently know of SID given in connect descriptor", which is really strange given that I am able to connect with cx_Oracle.
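(For what it's worth: ORA-12505 from SQLAlchemy usually means the last part of the URL was interpreted as a SID. If the database is reachable by service name instead, a URL of the hypothetical form below sometimes connects where the SID form does not; the service name here is made up.)
from sqlalchemy import create_engine

engine = create_engine(
    "oracle+cx_oracle://username:password@ODS-test.domain.com:1521/?service_name=ODSTEST"
)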
EDIT 1:
To find the columns that share the same name, I ran:
import numpy as np
a = np.intersect1d(df.columns, col_names)
print("common columns:", a)
which gives me the list of columns the two datasets share.
I also tried to use this as my engine:
engine = create_engine("oracle+cx_oracle://username:password@ODS-test.domain.com:1521/?ODS-Test")
dtyp = {c: types.VARCHAR(df[c].str.len().max())
        for c in df.columns[df.dtypes == 'object'].tolist()}
df.to_sql('ODS.EMPLOYEE_TABLE', con=engine, dtype=dtyp, if_exists='append')
which has given me nothing but errors.
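A sketch of the kind of dynamic INSERT the question is after, built from the columns the dataframe and the ODS table share (the table name, cursor and connection are taken from the script above; any collapsing of non-matching columns is assumed to have been handled beforehand):
# Keep the ODS table's column order, restricted to names present on both sides.
common_cols = [c for c in col_names if c in df.columns]
placeholders = ", ".join(f":{i + 1}" for i in range(len(common_cols)))
insert_sql = (
    f"INSERT INTO ODSMGR.EMPLOYEE_TABLE ({', '.join(common_cols)}) "
    f"VALUES ({placeholders})"
)
rows = [tuple(r) for r in df[common_cols].itertuples(index=False)]
cursor.executemany(insert_sql, rows)
conn.commit()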

How to migrate pandas read_sql from psycopg2 to sqlalchemy with a tuple as one of the query params

With pandas 1.4.0, read_sql emits a warning about using psycopg2 connections directly and recommends using SQLAlchemy instead. While attempting that migration, I cannot work out how to pass a tuple as one of the query parameters. For example, this presently works:
import pandas as pd
import psycopg2
pd.read_sql(
    "SELECT * from news where id in %s",
    psycopg2.connect("dbname=mydatabase"),
    params=[(1, 2, 3)],
)
Attempting to migrate this to SQLAlchemy like so:
import pandas as pd

pd.read_sql(
    "SELECT * from news where id in %s",
    "postgresql://localhost/mydatabase",
    params=[(1, 2, 3)],
)
results in
...snipped...
File "/opt/miniconda3/envs/prod/lib/python3.8/site-packages/sqlalchemy/engine/base.py", line 1802, in _execute_context
self.dialect.do_execute(
File "/opt/miniconda3/envs/prod/lib/python3.8/site-packages/sqlalchemy/engine/default.py", line 732, in do_execute
cursor.execute(statement, parameters)
TypeError: not all arguments converted during string formatting
So how do I pass a tuple as a params argument within pandas read_sql?
Wrap your query with a SQLAlchemy text object, use named parameters and pass the parameter values as a dictionary:
import pandas as pd
from sqlalchemy import text

pd.read_sql(
    text("SELECT * from news where id in :ids"),
    "postgresql://localhost/mydatabase",
    params={"ids": (1, 2, 3)},
)
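Depending on the SQLAlchemy version, the IN list may also need an explicit expanding bind parameter; a sketch assuming SQLAlchemy 1.4+ and the same news table and connection string as above:
import pandas as pd
from sqlalchemy import bindparam, create_engine, text

engine = create_engine("postgresql://localhost/mydatabase")
query = text("SELECT * FROM news WHERE id IN :ids").bindparams(
    bindparam("ids", expanding=True)  # expands to (1, 2, 3) at execution time
)
df = pd.read_sql(query, engine, params={"ids": [1, 2, 3]})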

snowflake.connector SQL compilation error invalid identifier from pandas dataframe

I'm trying to ingest a df I created from a JSON response into an existing table (the table is currently empty because I can't seem to get this to work).
The df looks something like the below table:
index  clicks_affiliated
0      3214
1      2221
but I'm seeing the following error:
snowflake.connector.errors.ProgrammingError: 000904 (42000): SQL
compilation error: error line 1 at position 94
invalid identifier '"clicks_affiliated"'
even though the column names in Snowflake match the columns in my dataframe.
This is my code:
import pandas as pd
from snowflake.sqlalchemy import URL
from sqlalchemy import create_engine
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas, pd_writer
from pandas import json_normalize
import requests
# json_response comes from an earlier requests call (not shown)
df_norm = json_normalize(json_response, 'reports')
# I've tried also adding the line below (and removing it) but I see the same error
df = df_norm.reset_index(drop=True)

def create_db_engine(db_name, schema_name):
    engine = URL(
        account="ab12345.us-west-2",
        user="my_user",
        password="my_pw",
        database="DB",
        schema="PUBLIC",
        warehouse="WH1",
        role="DEV"
    )
    return engine

def create_table(out_df, table_name, idx=False):
    url = create_db_engine(db_name="DB", schema_name="PUBLIC")
    engine = create_engine(url)
    connection = engine.connect()
    try:
        out_df.to_sql(
            table_name, connection, if_exists="append", index=idx, method=pd_writer
        )
    except ConnectionError:
        print("Unable to connect to database!")
    finally:
        connection.close()
        engine.dispose()
    return True

print(df.head())
create_table(df, "reporting")
So... it turns out I needed to change the columns in my dataframe to uppercase.
I've added this after the dataframe creation to do so, and it worked:
df.columns = map(lambda x: str(x).upper(), df.columns)
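The likely reason: Snowflake folds unquoted identifiers to uppercase, while to_sql with pd_writer quotes the dataframe's column names exactly as they are, so a lowercase "clicks_affiliated" no longer matches a column created as CLICKS_AFFILIATED. An equivalent one-liner, assuming plain string column labels:
df = df.rename(columns=str.upper)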

Unable to write dataframe of pyspark into mysql database [duplicate]

I am attempting to insert records into a MySql table. The table contains id and name as columns.
I am doing the following in a pyspark shell.
name = 'tester_1'
id = '103'
import pandas as pd
l = [id,name]
df = pd.DataFrame([l])
df.write.format('jdbc').options(
    url='jdbc:mysql://localhost/database_name',
    driver='com.mysql.jdbc.Driver',
    dbtable='DestinationTableName',
    user='your_user_name',
    password='your_password').mode('append').save()
I am getting the attribute error below:
AttributeError: 'DataFrame' object has no attribute 'write'
What am I doing wrong? What is the correct method to insert records into a MySQL table from PySpark?
Use a Spark DataFrame instead of a pandas one, as .write is available on Spark DataFrames only.
So the final code could be:
data = [('103', 'tester_1')]
df = sc.parallelize(data).toDF(['id', 'name'])
df.write.format('jdbc').options(
    url='jdbc:mysql://localhost/database_name',
    driver='com.mysql.jdbc.Driver',
    dbtable='DestinationTableName',
    user='your_user_name',
    password='your_password').mode('append').save()
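If the data already lives in a pandas DataFrame, as in the question, another option is to convert it first; a sketch assuming an active SparkSession named spark and that the pandas frame was built with column names (e.g. pd.DataFrame([l], columns=['id', 'name'])):
spark_df = spark.createDataFrame(df)  # pandas DataFrame -> Spark DataFrame
spark_df.write.format('jdbc').options(
    url='jdbc:mysql://localhost/database_name',
    driver='com.mysql.jdbc.Driver',
    dbtable='DestinationTableName',
    user='your_user_name',
    password='your_password').mode('append').save()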
Just to add to @mrsrinivas's answer:
Make sure that the jar for the MySQL connector is available in your Spark session. This code helps:
from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .config("spark.jars", "/Users/coder/Downloads/mysql-connector-java-8.0.22.jar")\
    .master("local[*]")\
    .appName("pivot and unpivot")\
    .getOrCreate()
otherwise it will throw an error.

psycopg2: can't adapt type 'numpy.int64'

I have a dataframe with the dtypes shown below, and I want to insert it into a postgres DB, but it fails with the error can't adapt type 'numpy.int64':
id_code int64
sector object
created_date float64
updated_date float64
How can I convert these types to native Python types, such as from int64 (which is essentially 'numpy.int64') to a classic int that would then be acceptable to postgres via the psycopg2 client?
data['id_code'].astype(np.int) defaults to int64.
It is nonetheless possible to convert from one numpy type to another (e.g. from int to float):
data['id_code'].astype(float)
changes the dtype to float64.
The bottom line is that psycopg2 doesn't seem to understand numpy datatypes; if anyone has ideas on how to convert them to classic types, that would be helpful.
Updated: Insertion to DB
def insert_many():
    """Add data to the table."""
    sql_query = """INSERT INTO classification(
                id_code, sector, created_date, updated_date)
                VALUES (%s, %s, %s, %s);"""
    data = pd.read_excel(fh, sheet_name=sheetname)
    data_list = list(data.to_records())
    conn = None
    try:
        conn = psycopg2.connect(db)
        cur = conn.cursor()
        cur.executemany(sql_query, data_list)
        conn.commit()
        cur.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
Add the following somewhere in your code:
import numpy
from psycopg2.extensions import register_adapter, AsIs

def addapt_numpy_float64(numpy_float64):
    return AsIs(numpy_float64)

def addapt_numpy_int64(numpy_int64):
    return AsIs(numpy_int64)

register_adapter(numpy.float64, addapt_numpy_float64)
register_adapter(numpy.int64, addapt_numpy_int64)
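With those adapters registered, psycopg2 renders the NumPy scalars as plain SQL literals; a quick hypothetical sanity check, assuming an open cursor cur:
import numpy as np
cur.execute("SELECT %s::bigint, %s::float8", (np.int64(5), np.float64(1.5)))
print(cur.fetchone())  # (5, 1.5)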
Same problem here; I solved it by transforming the series to an ndarray of plain ints.
You can try the following:
data['id_code'].values.astype(int)
--
Update: if the values include NaN, it still goes wrong.
It seems that psycopg2 can't interpret the np.int64 format, so the following method works for me:
import numpy as np
import psycopg2
from psycopg2.extensions import register_adapter, AsIs
psycopg2.extensions.register_adapter(np.int64, psycopg2._psycopg.AsIs)
I'm not sure why your data_list contains NumPy data types, but the same thing happens to me when I run your code. Here is an alternative way to construct data_list so that integers and floats end up as their native Python types:
data_list = [list(row) for row in data.itertuples(index=False)]
Alternate approach
I think you could accomplish the same thing in fewer lines of code by using pandas to_sql:
import sqlalchemy
import pandas as pd

data = pd.read_excel(fh, sheet_name=sheetname)
engine = sqlalchemy.create_engine("postgresql://username@hostname/dbname")
data.to_sql('classification', engine, if_exists='append', index=False)
I had the same issue and fixed it using:
df = df.convert_dtypes()