using psycopg2 to insert pandas dataframe into postgres db errors on enum datatype - pandas

When I try to insert a row from a pandas dataframe using psycopg2, I keep getting the following error: InvalidTextRepresentation: invalid input value for enum enum_name: "column_name"
Does psycopg2 handle enum datatypes? When I changed the first enum column to varchar, the error moved to the second enum column, so it seems like there's something it doesn't like about the enum datatype. I've included my complete code below. Many thanks in advance!
# helper function for build_insert_query
def create_col_name_string(col_names):
    """
    input: takes list of column names
    output: returns string of all col names
    """
    return ','.join(col_names)

# helper function for insert_into_table
# builds query "INSERT INTO schema.table (col_names) VALUES(%s, %s...);"
def build_insert_query(schema, table, col_names):
    """
    input: takes schema and table name as strings as well as list of df column names
    output: returns formatted query
    """
    s = ','.join(['%s' for i in range(len(col_names))])
    query = "INSERT INTO %s.%s " % (schema, table)
    query = query + "(" + create_col_name_string(col_names) + ")"
    query = query + "VALUES(" + s + ");"
    return query

# helper function for append_from_df_to_db
def insert_into_table(cur, schema, table, col_names):
    """
    inserts an individual record into table
    """
    query = build_insert_query(schema, table, col_names)
    # gets a string of col names
    vars_to_insert = create_col_name_string(col_names)
    cur.execute(query, tuple(col_names))

# loops over rows in dataframe and inserts into database
def append_from_df_to_db(cur, df, schema, table, col_names):
    for i, row in df.iterrows():
        insert_into_table(cur, schema, table, col_names)
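For what it's worth, psycopg2 has no special problem with enum columns: a plain string that matches one of the enum's labels binds fine to a %s placeholder. The quoted value in the error ("column_name") suggests the column names themselves are being passed as the row values. A minimal sketch of binding each row's values instead, reusing build_insert_query from above:

def append_from_df_to_db(cur, df, schema, table, col_names):
    query = build_insert_query(schema, table, col_names)
    # itertuples(index=False, name=None) yields each row as a plain tuple of values;
    # enum columns just need strings matching one of the enum's labels
    for row in df[col_names].itertuples(index=False, name=None):
        cur.execute(query, row)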


Execute the same function on different columns to make rows to append to another table

How could I perform the same operation for 15 columns of a DataFrame?
How could I parallelize the operation?
I have input data that I need to use to update a reference table. There are more columns, but I think these 3 are enough to show what I am trying to do.
Table: input

rowid | col1       | col2       | col3
id1   | col1_data1 | col2_data1 | col3_data1
id2   | col1_data2 | col2_data2 | col3_data2
The reference table contains the value of each cell of those columns, then its md5, and finally the column name:
Table: references

col_data   | md5            | ref_name
col1_data1 | md5_col1_data1 | col1_name
col1_data2 | md5_col1_data2 | col1_name
col1_data3 | md5_col1_data3 | col1_name
col2_data1 | md5_col2_data1 | col2_name
col2_data2 | md5_col2_data2 | col2_name
col2_data3 | md5_col2_data3 | col2_name
col3_data1 | md5_col3_data1 | col3_name
col3_data2 | md5_col3_data2 | col3_name
col3_data3 | md5_col3_data3 | col3_name
I created a function similar to the one below. It checks the input table against the reference table; when new data is found, the reference is created and a dataframe is returned, so that at the end the references table can be updated.
def repeatedly_excuted_funcion(input_data, references, col_name):
    """
    input_data is the full dataframe
    references is the table to check whether it has the value and, if not, create it
    col_name is the name of the column that will be considered in the execution
    """
    # ... some code ...
    return partial_df

df_col1 = repeatedly_excuted_funcion(input_data, references, "col1")
df_col2 = repeatedly_excuted_funcion(input_data, references, "col2")
data_to_append = df_col1.union(df_col2)
df_col3 = repeatedly_excuted_funcion(input_data, references, "col3")
data_to_append = data_to_append.union(df_col3)
I only put a 3-column example, but there are 15 columns to check.
At the end, the idea is to update the references table with the newly calculated md5 values.
(
    data_to_append.write.format("delta")
    .mode("append")
    .saveAsTable(database_table)
)
No function, no unions, one shuffle (an anti join):

- Create all 3 final columns (data, md5, col_name) inside an array in the Input table.
- Unpivot: from every row of 15 columns, make 15 rows of 1 column.
- Split the single array column into 3 data columns.
- Filter out rows which already exist in References.
- Append the result.
from pyspark.sql import functions as F

cols = ['col1', 'col2',..., 'col15']

# Change Input columns to arrays
df_input = df_input.select(
    *[F.array(F.col(c), F.md5(c), F.lit(c)).alias(c) for c in cols]
)

# Unpivot Input table
stack_string = ", ".join([f"`{c}`" for c in cols])
df_input2 = df_input.select(
    F.expr(f"stack({len(cols)}, {stack_string}) as col_data"))

# Make 3 columns from 1 array column
df_input3 = df_input2.select(
    F.element_at('col_data', 1).alias('col_data'),
    F.element_at('col_data', 2).alias('md5'),
    F.element_at('col_data', 3).alias('ref_name'),
)

# Keep only rows which don't exist in References table
data_to_append = df_input3.join(df_references, 'col_data', 'anti')

(
    data_to_append.write.format("delta")
    .mode("append")
    .saveAsTable(database_table)
)
Create an empty DF with the correct schema.
Get all the columns.
Union each result onto it.
I'm not sure that for 15 items it's worth parallelizing, or that you wouldn't run into issues with the Spark context (as it's not available inside an executor), meaning you would have to have pure Python code inside repeatedly_excuted_funcion. You might be able to do all rows at once with a UDF, but I'm not sure that would perform as well (UDFs are known for poor performance due to the lack of vectorization).
from pyspark.sql.types import StructType, StructField, StringType

unionSchema = StructType([
    StructField('column', StringType(), True)])

my_union = spark.createDataFrame(data=[], schema=unionSchema)

for i in myDF.columns:
    my_union = my_union.union(repeatedly_excuted_funcion(input_data, references, i))
What about pivoting the data and performing one join?
The code below creates a map. The input is a little annoying, as I build in Python a list of [lit(column_name1), col(column_name1), lit(column_name2), ...]. The main purpose of this map is to explode it, so that the first table ends up in a similar format to the reference df and one normal join can be performed.
from itertools import chain
from pyspark.sql.functions import create_map, array, lit, col, explode

column_names = ["col1", "col2", "col3"]

df \
    .withColumn("features_map", create_map(
        list(chain(*[(lit(c), col(c)) for c in column_names]))
    )) \
    .select("rowid", explode("features_map").alias("ref_name", "col_data")) \
    .join(ref_df, on=["ref_name", "col_data"], how="left") ....

pandas merge rows with overlapping column names using default non-NaN value

There is a list of tables and some configurations, specified in CSV format.
I want to use the default configuration from configs, and override it in tables if needed.
configs = """
schema|new_name
dbo|{table}
qa|test_{table}
"""
tables = """
table|schema|new_name
employee|hr|{schema}_{table}
advertisers
users
"""
configs = pandas.read_csv(io.StringIO(configs), sep='|')
tables = pandas.read_csv(io.StringIO(tables), sep='|')
I want to cross-join/merge/concatenate/combine them to get a dataframe which contains:
final = """
table|schema|new_name
employee|hr|{schema}_{table}
advertisers|dbo|{table}
users|dbo|{table}
employee|hr|{schema}_{table}
advertisers|qa|test_{table}
users|qa|test_{table}
"""
If schema is not specified, use the 'dbo' schema and the 'users'/'advertisers' table name.
If schema is specified, use the 'hr' schema and the 'hr_employee' table name.
Basically: when horizontally concatenating 2 rows with overlapping column names, create one column using whichever value is not NaN.
What pandas command should I use?
EDIT:
#cross join both DataFrames
df = tables.merge(configs, suffixes=('','_'), how='cross')
#get columns with suffix _
cols = df.columns[df.columns.str.endswith('_')]
#remove suffix
new = cols.str.strip('_')
#replace missing values in the original columns using the _ columns
df[new] = df[new].fillna(df[cols].rename(columns=lambda x: x.strip('_')))
#remove columns with _
df = df.drop(cols, axis=1)
print (df)
         table  schema          new_name
0     employee      hr  {schema}_{table}
1     employee      hr  {schema}_{table}
2  advertisers     dbo           {table}
3  advertisers      qa      test_{table}
4        users     dbo           {table}
5        users      qa      test_{table}
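As a side note, the "whichever value is not NaN" rule can also be written with DataFrame.combine_first; a minimal sketch, assuming the same cross join and '_' suffixes as in the EDIT above:

#cross join, then let the non-suffixed (tables) values win over the suffixed (configs) defaults
df = tables.merge(configs, suffixes=('','_'), how='cross')
defaults = df[['schema_', 'new_name_']].rename(columns=lambda x: x.rstrip('_'))
#combine_first fills NaN in the left frame with the value at the same position in the right frame
df[['schema', 'new_name']] = df[['schema', 'new_name']].combine_first(defaults)
df = df.drop(columns=['schema_', 'new_name_'])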

Create a Python script which compares several Excel files (snapshots) and creates a new dataframe with rows which are different

I am new to Python and will appreciate your help.
I would like to create a Python script which performs data validation by using my first file excel_files[0] as df1 and comparing it against several other files excel_files[0:100], looping through them while comparing with df1 and appending the rows which are different to a new dataframe df3. Even though I have several columns, I would like to base my comparison on two columns, one of which is a primary key column, such that if the keys in the two dataframes match, then df1 and df2 (the file from the loop) are compared.
Here's what I have tried:
## import python module pandasql, which allows SQL syntax for Pandas;
## it needs installation first though: pip install -U pandasql
import datetime as dt
import glob
import os

import pandas as pd
from pandasql import sqldf

pysqldf = lambda q: sqldf(q, locals(), globals())

dateTimeObj = dt.datetime.now()
print('start file merge: ', dateTimeObj)

#path = os.getcwd()
##files = os.listdir(path1)
files = os.path.abspath('mydrive')
files

dff1 = pd.DataFrame()
##df2 = pd.DataFrame()

# method 1
excel_files = glob.glob(files + "\*.xlsx")
##excel_files = [f for f in files if f[-4:] == '\*.xlsx' or f[-3:] == '*.xls']
df1 = pd.read_excel(excel_files[14])
for f in excel_files[0:100]:
    df2 = pd.read_excel(f)
    ## Lets drop any unnamed column
    ##df1 = df1.drop(df1.iloc[:, [0]], axis=1)
    ### Gets all rows and columns which are different after comparing the two dataframes;
    ### the clause "_key HAVING COUNT(*) = 1" resolves to True if the two dataframes are different,
    ### else we use the clause "_key HAVING COUNT(*) = 2" to output similar rows and columns
    data = pysqldf("SELECT * FROM (SELECT * FROM df1 UNION ALL SELECT * FROM df2) df1 GROUP BY _key HAVING COUNT(*) = 1;")
    ## df = dff1.append(data).reset_index(drop=True)
print(dt.datetime.now().strftime("%x %X") + ': files appended to make a Master file')
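For the comparison step itself, a plain-pandas alternative to pandasql is a left merge with indicator=True; a minimal sketch, where _key and value_col are placeholders for your key and comparison columns:

import pandas as pd

def rows_not_in_reference(df_ref, df_new, key_col='_key', value_col='value_col'):
    """Return the rows of df_new whose (key, value) pair does not appear in df_ref."""
    merged = df_new.merge(
        df_ref[[key_col, value_col]].drop_duplicates(),
        on=[key_col, value_col],
        how='left',
        indicator=True,
    )
    return merged[merged['_merge'] == 'left_only'].drop(columns='_merge')

# inside the loop over excel_files:
# df3 = pd.concat([df3, rows_not_in_reference(df1, df2)], ignore_index=True)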

psycopg2.errors.InvalidTextRepresentation while using COPY in postgresql

I am using a custom callable with pandas.to_sql(). The snippet below is from the pandas documentation for using it:
import csv
from io import StringIO

def psql_insert_copy(table, conn, keys, data_iter):
    """
    Execute SQL statement inserting data

    Parameters
    ----------
    table : pandas.io.sql.SQLTable
    conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
    keys : list of str
        Column names
    data_iter : Iterable that iterates the values to be inserted
    """
    # gets a DBAPI connection that can provide a cursor
    dbapi_conn = conn.connection
    with dbapi_conn.cursor() as cur:
        s_buf = StringIO()
        writer = csv.writer(s_buf)
        writer.writerows(data_iter)
        s_buf.seek(0)

        columns = ', '.join('"{}"'.format(k) for k in keys)
        if table.schema:
            table_name = '{}.{}'.format(table.schema, table.name)
        else:
            table_name = table.name

        sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
            table_name, columns)
        cur.copy_expert(sql=sql, file=s_buf)
But while using this COPY functionality, I am getting the error
psycopg2.errors.InvalidTextRepresentation: invalid input syntax for integer: "3.0"
This is not a problem with the input, as the same table schema and values were working initially, when I used the to_sql() function without the custom callable psql_insert_copy(). I am using a SQLAlchemy engine for getting the connection cursor.
I would recommend using string fields in the table for such actions, or writing the entire SQL script manually, specifying the types of the table fields.
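For what it's worth, the usual culprit behind a value like "3.0" in that error is an integer column containing NaN: pandas upcasts it to float, so the CSV buffer sends '3.0' to an integer target. One possible workaround, assuming the offending column is the hypothetical some_int_col, is to cast it to pandas' nullable Int64 dtype before calling to_sql:

# 'some_int_col', 'table_name' and 'schema_name' are placeholders
df['some_int_col'] = df['some_int_col'].astype('Int64')

# nullable integers are written as '3' (or an empty field for <NA>) in the CSV
# buffer, which COPY accepts for an integer column
df.to_sql('table_name', engine, schema='schema_name',
          if_exists='append', index=False, method=psql_insert_copy)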

Postgres 9.5 upsert command in pandas or psycopg2?

Most of the examples I see are people inserting a single row into a database with the ON CONFLICT DO UPDATE syntax.
Does anyone have any examples using SQLAlchemy or pandas.to_sql?
99% of my inserts use the psycopg2 COPY command (so I save a CSV or StringIO and then bulk insert), and the other 1% are pd.to_sql. All of my logic to check for new rows or dimensions is done in Python.
def find_new_rows(existing, current, id_col):
    current[id_col] = current[id_col].astype(int)
    x = existing[['datetime', id_col, 'key1']]
    y = current[['datetime', id_col, 'key2']]
    final = pd.merge(y, x, how='left', on=['datetime', id_col])
    final = final[~(final['key2'] == final['key1'])]
    final = final.drop(['key1'], axis=1)
    current = pd.merge(current, final, how='left', on=['datetime', id_col])
    current = current.loc[current['key2_y'] == 1]
    current.drop(['key2_x', 'key2_y'], axis=1, inplace=True)
    return current
Can someone show me an example of using the new PostgreSQL syntax for upsert with psycopg2? A common use case is checking for dimension changes (between 50k and 100k rows daily, which I compare to existing values), which is ON CONFLICT DO NOTHING to only add new rows.
Another use case is that I have fact data which changes over time. I only take the most recent value (I currently use a view to select distinct), but it would be better to UPSERT, if possible.
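For reference, a minimal sketch of the ON CONFLICT syntax driven through psycopg2's execute_values helper; the table, column and variable names here are made up for illustration:

from psycopg2.extras import execute_values

# rows is assumed to be a list of tuples, e.g. built from df.itertuples(index=False, name=None)
insert_sql = """
    INSERT INTO my_schema.my_dim (id, attr1, attr2)
    VALUES %s
    ON CONFLICT (id) DO NOTHING;
"""
# for the "keep the newest value" fact case, swap DO NOTHING for
# DO UPDATE SET attr1 = EXCLUDED.attr1, attr2 = EXCLUDED.attr2

with conn.cursor() as cur:
    execute_values(cur, insert_sql, rows, page_size=1000)
conn.commit()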
Here is my code for a bulk insert & insert-on-conflict-update query for PostgreSQL from a pandas dataframe.
Let's say id is the unique key for both the PostgreSQL table and the pandas df, and you want to insert and update based on this id.

import pandas as pd
from sqlalchemy import create_engine, text

engine = create_engine('postgresql://username:pass@host:port/dbname')

query = text(f"""
    INSERT INTO schema.table(name, title, id)
    VALUES {','.join([str(i) for i in list(df.to_records(index=False))])}
    ON CONFLICT (id)
    DO UPDATE SET name = excluded.name,
                  title = excluded.title
""")

engine.execute(query)

Make sure that your df columns are in the same order as your table.
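If the order differs, the dataframe can simply be reindexed before building the VALUES list, for example:

# reorder df columns to match schema.table(name, title, id)
df = df[['name', 'title', 'id']]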
FYI, this is the solution I am using currently.
It seems to work fine for my purposes. I had to add a line to replace null (NaT) timestamps with None though, because I was getting an error when I was loading each row into the database.
def create_update_query(table):
    """This function creates an upsert query which replaces existing data based on primary key conflicts"""
    columns = ', '.join([f'{col}' for col in DATABASE_COLUMNS])
    constraint = ', '.join([f'{col}' for col in PRIMARY_KEY])
    placeholder = ', '.join([f'%({col})s' for col in DATABASE_COLUMNS])
    updates = ', '.join([f'{col} = EXCLUDED.{col}' for col in DATABASE_COLUMNS])
    query = f"""INSERT INTO {table} ({columns})
                VALUES ({placeholder})
                ON CONFLICT ({constraint})
                DO UPDATE SET {updates};"""
    # collapse whitespace so the query is a single line
    query = ' '.join(query.split())
    return query

def load_updates(df, table, connection):
    conn = connection.get_conn()
    cursor = conn.cursor()
    # replace NaN/NaT with None so psycopg2 inserts NULLs
    df1 = df.where((pd.notnull(df)), None)
    insert_values = df1.to_dict(orient='records')
    for row in insert_values:
        cursor.execute(create_update_query(table=table), row)
        conn.commit()
    row_count = len(insert_values)
    logging.info(f'Inserted {row_count} rows.')
    cursor.close()
    del cursor
    conn.close()
For my case, I wrote to a temporary table first, then merged the temp table into the actual table I wanted to upsert to. Performing the upsert this way avoids any conflicts where the strings may have single quotes in them.
def upsert_dataframe_to_table(self, table_name: str, df: pd.DataFrame, schema: str, id_col: str):
    """
    Takes the given dataframe and inserts it into the table given. The data is inserted unless the key for that
    data already exists in the table. If the key already exists, the data for that key is overwritten.
    :param table_name: The name of the table to send the data
    :param df: The dataframe with the data to send to the table
    :param schema: the name of the schema where the table exists
    :param id_col: The name of the primary key column
    :return: None
    """
    engine = create_engine(
        f'postgresql://{postgres_configs["username"]}:{postgres_configs["password"]}@{postgres_configs["host"]}'
        f':{postgres_configs["port"]}/{postgres_configs["db"]}'
    )
    df.to_sql('temp_table', engine, if_exists='replace')

    updates = ', '.join([f'{col} = EXCLUDED.{col}' for col in df.columns if col != id_col])
    columns = ', '.join([f'{col}' for col in df.columns])
    query = f'INSERT INTO "{schema}".{table_name} ({columns}) ' \
            f'SELECT {columns} FROM temp_table ' \
            f'ON CONFLICT ({id_col}) DO ' \
            f'UPDATE SET {updates} '

    self.cursor.execute(query)
    self.cursor.execute('DROP TABLE temp_table')
    self.conn.commit()