sqlalchemy composite key error when cell run twice - pandas

My configuration cell contains the following:
from sqlalchemy import create_engine
import io

def df_to_postgres(dataframe, table_name):
    engine = create_engine('postgresql://postgres:pass@localhost:5432/postgres')
    df = dataframe
    # Recreate an empty table with the dataframe's columns
    df.head(0).to_sql(table_name.lower(), engine, if_exists='replace', index=False)
    conn = engine.raw_connection()
    cur = conn.cursor()
    output = io.StringIO()
    df.to_csv(output, sep='\t', header=False, index=False)
    output.seek(0)
    contents = output.getvalue()
    # Bulk-load the CSV buffer with COPY
    cur.copy_expert('COPY ' + table_name + ' FROM STDIN', output)
    conn.commit()
and the cell uploading the dataframe has:
df_to_postgres(product_table,'final.product')
The product_table has a composite key, which was created in Postgres. The first time I run it everything is fine; the second time I get:
UniqueViolation Traceback (most recent call last)
<timed eval> in <module>
~\AppData\Local\Temp/ipykernel_8004/2936867318.py in df_to_postgres(dataframe, table_name)
15 output.seek(0)
16 contents = output.getvalue()
---> 17 cur.copy_expert('COPY ' + table_name + ' FROM STDIN', output)
18 conn.commit()
UniqueViolation: duplicate key value violates unique constraint "product_pkey"
DETAIL: Key (sku, unit)=(000002, kg) already exists.
CONTEXT: COPY product, line 1
I thought that if_exists='replace' would have solved this problem. Any solution?
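Since the table and its composite key already exist in Postgres, one possible workaround (a sketch, not an answer from this thread) is to skip the head(0).to_sql(...) replace step and instead truncate the existing table before the COPY, so a second run of the cell cannot collide with the key. The function name refresh_table is hypothetical and the connection string simply mirrors the question's code:

from sqlalchemy import create_engine
import io

def refresh_table(dataframe, table_name):
    # Assumes the table and its composite key already exist in Postgres,
    # so the head(0).to_sql(...) step from the question is skipped entirely.
    engine = create_engine('postgresql://postgres:pass@localhost:5432/postgres')
    conn = engine.raw_connection()
    cur = conn.cursor()
    # Remove the rows from the previous run; the table and its key stay in place
    cur.execute('TRUNCATE TABLE ' + table_name)
    output = io.StringIO()
    dataframe.to_csv(output, sep='\t', header=False, index=False)
    output.seek(0)
    cur.copy_expert('COPY ' + table_name + ' FROM STDIN', output)
    conn.commit()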

Related

Creating a table in postgres using pandas and psycopg2 gives me an error

I am trying to create a table in a Postgres database using pandas and psycopg2.
The syntax looks fine to me, but it still gives me an error like:
---------------------------------------------------------------------------
SyntaxError Traceback (most recent call last)
Cell In[67], line 9
7 sql = "CREATE TABLE linux (Distribution, {})".format(', '.join(column_names))
8 # Execute the statement
----> 9 cur.execute(sql)
11 # Commit the changes
12 engine.commit()
SyntaxError: syntax error at end of input
LINE 1: ...tion_Commitment, Forked_From, Target_Audience, Cost, Status)
This is what I tried:
import pandas as pd
import psycopg2

engine = psycopg2.connect(dbname="pandas", user="postgres", password="root", host="localhost")
cur = engine.cursor()

# Define the table
column_names = [
    "Founder",
    "Maintainer",
    "Initial_Release_Year",
    "Current_Stable_Version",
    "Security_Updates",
    "Release_Date",
    "System_Distribution_Commitment",
    "Forked_From",
    "Target_Audience",
    "Cost",
    "Status"
]
sql = "CREATE TABLE linux (Distribution, {})".format(', '.join(column_names))

# Execute the statement
cur.execute(sql)

# Commit the changes
engine.commit()

# Close the cursor and connection
cur.close()
engine.close()
You need to specify a data type for each column, as shown below. Change the data types to whatever is appropriate in your case.
schema = [
    ("Founder", "varchar(50)"),
    ("Maintainer", "varchar(50)"),
    ("Initial_Release_Year", "date"),
    ("Current_Stable_Version", "varchar(20)"),
    ("Security_Updates", "varchar(50)"),
    ("Release_Date", "date"),
    ("System_Distribution_Commitment", "varchar(50)"),
    ("Forked_From", "date"),
    ("Target_Audience", "varchar(50)"),
    ("Cost", "numeric(10, 2)"),
    ("Status", "varchar(10)")
]
sql = "CREATE TABLE linux (Distribution varchar(30), {})"\
    .format(', '.join(map(' '.join, schema)))
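For completeness, the generated statement can then be executed and committed by reusing the cur and engine objects defined in the question:

cur.execute(sql)   # CREATE TABLE with explicit column types
engine.commit()
cur.close()
engine.close()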

Jupyter load dataframe into DB2 table

I have a Jupyter notebook and I am trying to read CSV files from URLs and then load them into DB2. My first problem is that the table sometimes already exists and I need to drop it first; second, it seems impossible to load the data into the tables. What am I doing wrong?
import pandas as pd
import io
import requests
import ibm_db
import ibm_db_dbi
After this I try
dsn = "DRIVER={{IBM DB2 ODBC DRIVER}};" + \
"DATABASE=BLUDB;" + \
"HOSTNAME=myhostname;" + \
"PORT=50000;" + \
"PROTOCOL=TCPIP;" + \
"UID=myuid;" + \
"PWD=mypassword;"
url1="https://ibm.box.com/shared/static/05c3415cbfbtfnr2fx4atenb2sd361ze.csv"
s1=requests.get(url1).content
df1=pd.read_csv(io.StringIO(s1.decode('utf-8')))
url2="https://ibm.box.com/shared/static/f9gjvj1gjmxxzycdhplzt01qtz0s7ew7.csv"
s2=requests.get(url2).content
df2=pd.read_csv(io.StringIO(s2.decode('utf-8')))
url3="https://ibm.box.com/shared/static/svflyugsr9zbqy5bmowgswqemfpm1x7f.csv"
s3=requests.get(url3).content
df3=pd.read_csv(io.StringIO(s3.decode('utf-8')))
All of this works, and the connection does as well:
hdbc = ibm_db.connect(dsn, "", "")
hdbi = ibm_db_dbi.Connection(hdbc)
and this fails
#DropTableIfExists = ibm_db.exec_immediate(hdbc, 'DROP TABLE QBG03137.DATAFRAME1')
CreateTable = ibm_db.exec_immediate(hdbc, sql)
resultSet = ibm_db.exec_immediate(hdbc, sql)

# define row and fetch tuple
row = ibm_db.fetch_tuple(resultSet)
comma = ""
while (row != False):
    for column in row:
        print(comma, end="")
        print(column, end="")
        comma = ","
    print()
    row = ibm_db.fetch_tuple(resultSet)
With error
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-94-840854e92bd7> in <module>
5
6 # Testing table exists
----> 7 resultSet = ibm_db.exec_immediate(hdbc, sql)
8
9 #define row and fetch tuple
Exception: [IBM][CLI Driver][DB2/LINUXX8664] SQL0601N The name of the object to be created is identical to the existing name "QBG03137.DATAFRAME1" of type "TABLE". SQLSTATE=42710 SQLCODE=-601
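No answer is quoted here, but a minimal sketch of one way to make the DROP safe whether or not the table exists, using only ibm_db.exec_immediate from the question. The drop_table_if_exists helper is hypothetical; DB2's "undefined name" error (SQL0204N) is simply swallowed:

import ibm_db

def drop_table_if_exists(hdbc, table_name):
    """Drop the table if it exists; ignore the error if it does not."""
    try:
        ibm_db.exec_immediate(hdbc, f'DROP TABLE {table_name}')
    except Exception:
        # DB2 raises SQL0204N when the table does not exist; safe to ignore here
        pass

drop_table_if_exists(hdbc, 'QBG03137.DATAFRAME1')
# The CREATE TABLE statement in `sql` can then run without hitting SQL0601N
ibm_db.exec_immediate(hdbc, sql)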

to_sql add column if not exists (sqlalchemy mysql)

While appending rows to a MySQL table with pandas to_sql, like
df.to_sql('table_name', engine, if_exists='append')
the appended df may contain some new column that does not exist in the MySQL table_name,
so pymysql.err.InternalError is raised:
InternalError: (pymysql.err.InternalError) (1054, "Unknown column 'new_columns' in 'field list'")
While trying to catch this exception in order to add the new column to the MySQL table, I cannot catch the pymysql.err.InternalError exception for some reason, so I resort to a workaround with BaseException like this:
while True:
    try:
        df.to_sql(table_name, engine, if_exists='append')
    except BaseException as e:
        b = e.args
        missing_column = b[0].split('(pymysql.err.InternalError) (1054, "Unknown column \'')[1].split(' in \'field list\'")')[0].replace("'", '')
        with sql.engine.connect() as con:
            con.execute(f'ALTER TABLE {table_name} ADD COLUMN {missing_column} TEXT;')
    else:
        break
This solution is ugly and unstable, so I would appreciate your advice!
You can check the SQL table's columns before appending new data to it:
import pandas as pd

target_name = "your_table"
# read_sql_query needs a connection or engine; reuse the engine from the question
target_cols = pd.read_sql_query(f"select * from {target_name} limit 1;", engine).columns.tolist()
your_cols = df.columns.tolist()
if set(your_cols) - set(target_cols) == set():
    ...  # APPEND OPERATION
else:
    ...  # NEW COL OPERATION (see the sketch below)
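For the NEW COL OPERATION branch, a sketch of one way to fill it in, reusing the ALTER TABLE approach from the question (TEXT is only an assumed default column type, as in the original workaround):

missing_cols = set(your_cols) - set(target_cols)
with engine.connect() as con:
    for col in missing_cols:
        # add each missing column up front instead of reacting to the 1054 error
        con.execute(f'ALTER TABLE {target_name} ADD COLUMN {col} TEXT;')
df.to_sql(target_name, engine, if_exists='append')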

Is there an alternative way for collect() in pyspark? py4j.protocol.Py4JJavaError: An error occurred while calling o323.collectToPython

My PySpark script crashes when I use collect() or show(). My dataframe has only 570 rows, so I don't understand what is happening.
I have a DataFrame and I have created a function that extracts a list of distinct rows from it. It was working fine, then suddenly I got an error:
py4j.protocol.Py4JJavaError: An error occurred while calling
o323.collectToPython
I get a similar error when I try to show() the dataframe.
Is there an alternative method to extract a list with distinct values from a dataframe?
required_list = [(col1,col2), (col1,col2)]
Sorry for not posting the full code, but it's a large script and it's confidential.
Update:
I have a function that extracts distinct values from a df:
def extract_dist(df, select_cols):
    val = len(select_cols)
    list_val = [row[0:val] for row in df.select(*select_cols).distinct().na.drop().collect()]
    return list_val
The function worked fine until I got the error.
I have a main script where I import this function and also another function that calculates a dataframe:
def calculate_df(df_join, v_srs, v_db, v_tbl, sql_context):
    cmd = "impala-shel....'create table db.tbl as select * from v_db.v_tbl'"
    os.system(cmd)
    select_stm = "select * from db.tbl"
    df = sql_context.sql(select_stm)
    cmd = "impala-shel....'drop table if exists db.tbl'"
    os.system(cmd)
    join_cond = [...]
    joined_df = df.join(df_join, join_cond, 'left').select(..)
    df1 = joined_df.filter(...)
    df2 = joined_df.filter(...)
    final_df = df1.union(df2)
    final_df.show()  # error from show
    return final_df
Main script:
import extract_dist
import calculate_df

df_join = ...  # extract from a hive table

for conn in details:
    v_db = conn['database'].upper()
    v_tbl = conn['table'].upper()
    v_name = conn['descr'].upper()
    if v_name in lst:
        df = calculate_df(df_join, v_name, v_db, v_tbl, sqlContext)
        df = df.filter(...column isin list)
        df = df.filter(..).filter(..)
        # extract list with distinct rows from df using dist function
        df.show()  # error from show
        dist_list = extract_dist(df, [col1, col2])  # error from collect
        for x, y in dist_list:
            ....
If I don't use show(), the error appears when I run the collect() method.
The same script worked before and suddenly failed. Is there a memory issue? Do I have to clear the memory?
SOLVED:
I have found the issue. After I created the dataframe from a table, I dropped the table:
cmd = "impala-shel....'drop table if exists db.tbl'"
os.system(cmd)
After I removed the drop table command, the script ran successfully.
I will now drop the temporary table at the end of the script, after I am finished with the extracted dataframe. I didn't know that creating a dataframe and then dropping its source table would cause errors later on.
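For context, this behaviour is consistent with Spark's lazy evaluation: the dataframe is only a query plan over the source table, so show() and collect() re-read that table after it has been dropped. If the table really has to be dropped early, one alternative (a sketch, not part of the original solution, reusing the variable names from the question) is to materialize the dataframe first:

import os

# Force the dataframe to be computed and kept in memory before the source
# table is dropped, so later actions do not need to re-read it.
final_df = df1.union(df2)
final_df.cache()
final_df.count()   # an action forces the cached computation to actually run

# only after that, drop the temporary source table
cmd = "impala-shel....'drop table if exists db.tbl'"
os.system(cmd)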

Postgres 9.5 upsert command in pandas or psycopg2?

Most of the examples I see are people inserting a single row into a database with the ON CONFLICT DO UPDATE syntax.
Does anyone have any examples using SQLAlchemy or pandas.to_sql?
99% of my inserts are using psycopg2 COPY command (so I save a csv or stringio and then bulk insert), and the other 1% are pd.to_sql. All of my logic to check for new rows or dimensions is done in Python.
def find_new_rows(existing, current, id_col):
    current[id_col] = current[id_col].astype(int)
    x = existing[['datetime', id_col, 'key1']]
    y = current[['datetime', id_col, 'key2']]
    final = pd.merge(y, x, how='left', on=['datetime', id_col])
    final = final[~(final['key2'] == final['key1'])]
    final = final.drop(['key1'], axis=1)
    current = pd.merge(current, final, how='left', on=['datetime', id_col])
    current = current.loc[current['key2_y'] == 1]
    current.drop(['key2_x', 'key2_y'], axis=1, inplace=True)
    return current
Can someone show me an example of using the new PostgreSQL upsert syntax with psycopg2? A common use case is checking for dimension changes (50k-100k rows daily that I compare to existing values), where ON CONFLICT DO NOTHING would only add new rows.
Another use case is that I have fact data which changes over time. I only take the most recent value (I currently use a view to select distinct), but it would be better to UPSERT, if possible.
Here is my code for a bulk insert with an ON CONFLICT UPDATE query for PostgreSQL from a pandas dataframe.
Let's say id is the unique key for both the PostgreSQL table and the pandas df, and you want to insert and update based on this id.
import pandas as pd
from sqlalchemy import create_engine, text

engine = create_engine('postgresql://username:pass@host:port/dbname')

query = text(f"""
    INSERT INTO schema.table(name, title, id)
    VALUES {','.join([str(i) for i in list(df.to_records(index=False))])}
    ON CONFLICT (id)
    DO UPDATE SET name = excluded.name,
                  title = excluded.title
""")
engine.execute(query)
Make sure that your df columns are in the same order as your table's columns.
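For example, the dataframe can be reordered explicitly to match the column list used in the INSERT above (a small illustrative sketch):

# reorder the dataframe columns to match INSERT INTO schema.table(name, title, id)
df = df[['name', 'title', 'id']]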
FYI, this is the solution I am using currently.
It seems to work fine for my purposes. I had to add a line to replace null (NaT) timestamps with None though, because I was getting an error when I was loading each row into the database.
def create_update_query(table):
    """This function creates an upsert query which replaces existing data based on primary key conflicts"""
    columns = ', '.join([f'{col}' for col in DATABASE_COLUMNS])
    constraint = ', '.join([f'{col}' for col in PRIMARY_KEY])
    placeholder = ', '.join([f'%({col})s' for col in DATABASE_COLUMNS])
    updates = ', '.join([f'{col} = EXCLUDED.{col}' for col in DATABASE_COLUMNS])
    query = f"""INSERT INTO {table} ({columns})
                VALUES ({placeholder})
                ON CONFLICT ({constraint})
                DO UPDATE SET {updates};"""
    # collapse the statement onto a single line
    query = ' '.join(query.split())
    return query


def load_updates(df, table, connection):
    conn = connection.get_conn()
    cursor = conn.cursor()
    # replace NaN/NaT with None so they are loaded as NULLs
    df1 = df.where((pd.notnull(df)), None)
    insert_values = df1.to_dict(orient='records')
    for row in insert_values:
        cursor.execute(create_update_query(table=table), row)
        conn.commit()
    row_count = len(insert_values)
    logging.info(f'Inserted {row_count} rows.')
    cursor.close()
    del cursor
    conn.close()
For my case, I wrote to a temporary table first, then merged the temp table into the actual table I wanted to upsert to. Performing the upsert this way avoids any conflicts where the strings may have single quotes in them.
def upsert_dataframe_to_table(self, table_name: str, df: pd.DataFrame, schema: str, id_col: str):
    """
    Takes the given dataframe and inserts it into the table given. The data is inserted unless the key for that
    data already exists in the dataframe. If the key already exists, the data for that key is overwritten.

    :param table_name: The name of the table to send the data
    :param df: The dataframe with the data to send to the table
    :param schema: the name of the schema where the table exists
    :param id_col: The name of the primary key column
    :return: None
    """
    engine = create_engine(
        f'postgresql://{postgres_configs["username"]}:{postgres_configs["password"]}@{postgres_configs["host"]}'
        f':{postgres_configs["port"]}/{postgres_configs["db"]}'
    )
    # stage the dataframe in a temp table, then merge it into the target table
    df.to_sql('temp_table', engine, if_exists='replace')
    updates = ', '.join([f'{col} = EXCLUDED.{col}' for col in df.columns if col != id_col])
    columns = ', '.join([f'{col}' for col in df.columns])
    query = f'INSERT INTO "{schema}".{table_name} ({columns}) ' \
            f'SELECT {columns} FROM temp_table ' \
            f'ON CONFLICT ({id_col}) DO ' \
            f'UPDATE SET {updates} '
    self.cursor.execute(query)
    self.cursor.execute('DROP TABLE temp_table')
    self.conn.commit()