Streamlit Not Populating st.table With Results From a Database Query - sql

I'm trying to do something I thought was easy with Streamlit:
Populate selectbox options using a database query. So far so good.
Based on the option selected, run another query using the selected option's value,
then populate a table with the results returned.
Here's the code. Note that db connection details and other unrelated code were left out.
import streamlit as st
import snowflake.connector as sf
import pandas as pd

conn = None
list_results = None
detail_results = None

def list_results_select():
    global list_results
    list_results = pd.read_sql_query("SELECT some_id, some_value FROM some_table", conn, index_col=None)
    return list_results

def detail_results_select():
    global detail_results
    detail_results = pd.read_sql_query("SELECT col_1, col_2, col_n FROM some_table2 WHERE id = " + st.session_state.stuff_list, conn, index_col=None)

list_results_select()

st.title("View Stuff You Can Pick From a Select Box")
st.selectbox("Stuff List", list_results["some_id"], key="stuff_list", on_change=detail_results_select)
st.table(data=detail_results)
All the database interaction works fine. On load, the list is populated. Upon choosing an option, a db call is made, and the correct results are returned to the detail_results variable. The problem is that the table always shows "empty" no matter what. Any suggestions? Thank you.
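One likely explanation: Streamlit reruns the entire script on every widget interaction, so the module-level detail_results = None runs again right after the on_change callback and wipes out the query result. A minimal sketch, assuming the same queries and connection as above, that stores the result in st.session_state so it survives the rerun:

def detail_results_select():
    # store in session_state so the value survives Streamlit's script rerun
    st.session_state.detail_results = pd.read_sql_query(
        "SELECT col_1, col_2, col_n FROM some_table2 WHERE id = " + st.session_state.stuff_list,
        conn,
        index_col=None,
    )

st.selectbox("Stuff List", list_results["some_id"], key="stuff_list", on_change=detail_results_select)
st.table(data=st.session_state.get("detail_results"))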

Related

Writing a scalable INSERT statement using cx_Oracle

I am attempting to write a script that will allow me to insert values from an uploaded dataframe into a table inside of an Oracle DB, but my issue is twofold:
there are too many columns to hard-code
the columns aren't one-to-one
What I'm hoping for is a way to write out the columns, check to see if they sync with the columns of my dataframe, and from there use an INSERT VALUES SQL statement to load the values from the dataframe into the ODS table.
So far, these are the important parts of my script:
import pandas as pd
import cx_Oracle
import config

df = pd.read_excel("Employee_data.xlsx")

conn = None
try:
    conn = cx_Oracle.connect(config.username, config.password, config.dsn, encoding=config.encoding)
except cx_Oracle.Error as error:
    print(error)
finally:
    cursor = conn.cursor()
    sql = "SELECT * FROM ODSMGR.EMPLOYEE_TABLE"
    cursor.execute(sql)
    data = cursor.fetchall()
    col_names = []
    for i in range(0, len(cursor.description)):
        col_names.append(cursor.description[i][0])
    # instead of using df.columns I use:
    rows = [tuple(x) for x in df.values]
which prints my ODS column names and lets me conveniently store the rows from the df in an array, but I'm at a loss for how to get these into the ODS. I found something like:
cursor.execute("insert into ODSMGR.EMPLOYEE_TABLE(col1,col2) values (:col1, :col2)", {":col1df":df, "col2df:df"})
but that'll mean I'll have to hard-code everything, which wouldn't be scalable. I'm hoping I can get some sort of insight to help. It's just difficult since the columns aren't 1-to-1 and there is some compression/collapsing of columns from the DF to the ODS, but any help is appreciated.
NOTE: I've also attempted to use SQLAlchemy, but I am always given the error "ORA-12505: TNS:listener does not currently know of SID given in connect descriptor", which is really strange given that I am able to connect with cx_Oracle.
EDIT 1:
I was able to get a list of columns that share the same name; so after running:
import numpy as np
a = np.intersect1d(df.columns, col_names)
print("common columns:", a)
I was able to get a list of columns that the two datasets share.
I also tried to use this as my engine:
engine = create_engine("oracle+cx_oracle://username:password@ODS-test.domain.com:1521/?ODS-Test")
dtyp = {c: types.VARCHAR(df[c].str.len().max())
        for c in df.columns[df.dtypes == 'object'].tolist()}
df.to_sql('ODS.EMPLOYEE_TABLE', con=engine, dtype=dtyp, if_exists='append')
which has given me nothing but errors.
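For the columns that do line up, here is a minimal sketch (my own, under the assumption that the shared column names in a exist identically in both the dataframe and ODSMGR.EMPLOYEE_TABLE) of building the INSERT dynamically and loading the rows with cx_Oracle's executemany:

common_cols = list(a)                                                 # columns shared by df and the ODS table
col_list = ', '.join(common_cols)
bind_list = ', '.join(f':{i + 1}' for i in range(len(common_cols)))   # positional binds :1, :2, ...
insert_sql = f"INSERT INTO ODSMGR.EMPLOYEE_TABLE ({col_list}) VALUES ({bind_list})"

rows = [tuple(r) for r in df[common_cols].values]                     # only the shared columns, in matching order
cursor.executemany(insert_sql, rows)
conn.commit()

Columns that are collapsed or renamed between the df and the ODS would still need an explicit mapping before this step. As for the ORA-12505 error from SQLAlchemy: that message usually means the part of the URL after the port was interpreted as a SID rather than a service name, so (assuming ODS-Test is a service name) a URL ending in ?service_name=ODS-Test instead of /?ODS-Test may behave differently.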

Fastest way of fetching data with parameters from sql database with pandas

I'm trying to fetch data from a financial database, and I need this data for several stocks.
Now I'm curious: what's the fastest way of doing this?
For example:
dsn_dwh = 'IvyDB_EU'

import pandas as pd
import pypyodbc as odbc

class dbEU(object):
    # Database Object to access ODBC database
    def __init__(self):
        self._db_connection = odbc.connect(r'DSN=' + dsn_dwh + ';')

    def query(self, query, params):
        return pd.read_sql(query, self._db_connection, params=params)

    def __del__(self):
        self._db_connection.close()

startdat = pd.to_datetime('12/01/2018')
enddat = pd.to_datetime('today')
securities = ['504880', '504881', '504882']

sql = """SELECT * FROM
         [IvyDB_EU].[dbo].[SECURITY_PRICE]
         WHERE [SecurityID] = ? AND Date >= ? AND Date <= ?
      """

df_all = pd.DataFrame()
for s in securities:
    params = [int(s)]
    params.append(startdat.strftime('%m/%d/%Y'))
    params.append(enddat.strftime('%m/%d/%Y'))
    db = dbEU()
    df = db.query(sql, params)
    df_all = df_all.append(df, ignore_index=True)

print(df_all)
Sure, one way would be to do this in the WHERE clause with [SecurityID] IN ('504880', '504881', '504882'), but this only works in this simple query. With larger nested queries I need to find another way.
Thanks,
Marvin
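One option worth measuring (a sketch of my own, reusing the dbEU class above): build the IN list's placeholders from the security list, so everything comes back in a single round trip instead of one query per security:

placeholders = ', '.join('?' for _ in securities)      # one ? per security
sql = """SELECT * FROM
         [IvyDB_EU].[dbo].[SECURITY_PRICE]
         WHERE [SecurityID] IN ({}) AND Date >= ? AND Date <= ?
      """.format(placeholders)

params = [int(s) for s in securities]
params.append(startdat.strftime('%m/%d/%Y'))
params.append(enddat.strftime('%m/%d/%Y'))

db = dbEU()
df_all = db.query(sql, params)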

psycopg2.errors.InvalidTextRepresentation while using COPY in postgresql

I am passing a custom callable to pandas.to_sql(). The snippet below is from the pandas documentation for using it:
import csv
from io import StringIO

def psql_insert_copy(table, conn, keys, data_iter):
    """
    Execute SQL statement inserting data

    Parameters
    ----------
    table : pandas.io.sql.SQLTable
    conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
    keys : list of str
        Column names
    data_iter : Iterable that iterates the values to be inserted
    """
    # gets a DBAPI connection that can provide a cursor
    dbapi_conn = conn.connection
    with dbapi_conn.cursor() as cur:
        s_buf = StringIO()
        writer = csv.writer(s_buf)
        writer.writerows(data_iter)
        s_buf.seek(0)

        columns = ', '.join('"{}"'.format(k) for k in keys)
        if table.schema:
            table_name = '{}.{}'.format(table.schema, table.name)
        else:
            table_name = table.name

        sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
            table_name, columns)
        cur.copy_expert(sql=sql, file=s_buf)
but while using this copy functionality, I am getting the error
psycopg2.errors.InvalidTextRepresentation: invalid input syntax for integer: "3.0"
This is not a problem with the input, as the same table schema and values were working initially, when I used the to_sql() function without the custom callable psql_insert_copy(). I am using a SQLAlchemy engine to get the connection cursor.
I would recommend using string fields in the table for such actions, or writing the entire SQL script manually, specifying the types of the table fields.
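One common cause of this particular message (an assumption, since the table definition isn't shown): a column that is INTEGER in Postgres contains NaN/None in the dataframe, so pandas stores it as float64 and the CSV buffer receives "3.0", which COPY refuses to parse as an integer. Casting such columns to pandas' nullable integer dtype before to_sql is a possible fix; the column and table names below are hypothetical placeholders:

# 'some_int_column' is a hypothetical name; apply to whichever columns are INTEGER in the table
df['some_int_column'] = df['some_int_column'].astype('Int64')   # nullable integer dtype
df.to_sql('my_table', engine, method=psql_insert_copy, if_exists='append', index=False)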

Is there an alternative way for collect() in pyspark? py4j.protocol.Py4JJavaError: An error occurred while calling o323.collectToPython

My PySpark script crashes when I use collect() or show(). My dataframe has only 570 rows, so I don't understand what is happening.
I have a dataframe and I have created a function that extracts a list of distinct rows from it. It was working fine, then suddenly I got an error:
py4j.protocol.Py4JJavaError: An error occurred while calling o323.collectToPython
I get a similar error when I try to show() the dataframe.
Is there an alternative method to extract a list of distinct values from a dataframe?
required_list = [(col1, col2), (col1, col2)]
Sorry for not posting the code, but it's a large script and it's confidential.
Update:
I have a function that extracts distinct values from a df:
def extract_dist(df, select_cols):
    val = len(select_cols)
    list_val = [row[0:val] for row in df.select(*select_cols).distinct().na.drop().collect()]
    return list_val
The function worked fine until I got the error.
I have a main script where I import this function and also another function that calculates a dataframe:
def calculate_df(df_join, v_srs, v_db, v_tbl, sql_context):
    cmd = "impala-shel....'create table db.tbl as select * from v_db.v_tbl'"
    os.system(cmd)

    select_stm = "select * from db.tbl"
    df = sql_context.sql(select_stm)

    cmd = "impala-shel....'drop table if exists db.tbl'"
    os.system(cmd)

    join_cond = [...]
    joined_df = df.join(df_join, join_cond, 'left').select(..)
    df1 = joined_df.filter(...)
    df2 = joined_df.filter(...)
    final_df = df1.union(df2)
    final_df.show()  # error from show
    return final_df
Main script:
import extract_dist
import calculate_df

df_join = ...  # extract from a hive table

for conn in details:
    v_db = conn['database'].upper()
    v_tbl = conn['table'].upper()
    v_name = conn['descr'].upper()
    if v_name in lst:
        df = calculate_df(df_join, v_name, v_db, v_tbl, sqlContext)
        df = df.filter(...column isin list)
        df = df.filter(..).filter(..)
        # extract list with distinct rows from df using dist function
        df.show()  # error from show
        dist_list = extract_dist(df, [col1, col2])  # error from collect
        for x, y in dist_list:
            ....
If I don't use show(), the error appears when I run the collect() method.
The same script worked before and suddenly failed. Is there a memory issue? Do I have to clear memory?
SOLVED:
I found the issue. After I created the dataframe from a table, I dropped that table:
cmd = "impala-shel....'drop table if exists db.tbl'"
os.system(cmd)
After I removed the drop table command, the script ran successfully.
I will drop the temporary table at the end of the script, after I am finished with the extracted dataframe. I didn't know that if we create a dataframe and then drop the source table, we will get an error afterwards.
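The underlying behaviour (my reading of it, not stated above) is Spark's lazy evaluation: the dataframe is only a query plan, so show()/collect() go back to db.tbl, which no longer exists once it has been dropped. A sketch of an alternative that still lets the table be dropped early, by caching and materializing the dataframe first:

df = sql_context.sql("select * from db.tbl")
df = df.persist()          # keep the computed rows in memory/disk
df.count()                 # force evaluation while db.tbl still exists

# ... the drop table command from the original script would go here ...

df.show()                  # served from the cache, not re-read from the dropped table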

Postgres 9.5 upsert command in pandas or psycopg2?

Most of the examples I see are people inserting a single row into a database with the ON CONFLICT DO UPDATE syntax.
Does anyone have any examples using SQLAlchemy or pandas.to_sql?
99% of my inserts use the psycopg2 COPY command (so I save a CSV or StringIO and then bulk insert), and the other 1% use pd.to_sql. All of my logic to check for new rows or dimensions is done in Python.
def find_new_rows(existing, current, id_col):
    current[id_col] = current[id_col].astype(int)
    x = existing[['datetime', id_col, 'key1']]
    y = current[['datetime', id_col, 'key2']]
    final = pd.merge(y, x, how='left', on=['datetime', id_col])
    final = final[~(final['key2'] == final['key1'])]
    final = final.drop(['key1'], axis=1)
    current = pd.merge(current, final, how='left', on=['datetime', id_col])
    current = current.loc[current['key2_y'] == 1]
    current.drop(['key2_x', 'key2_y'], axis=1, inplace=True)
    return current
Can someone show me an example of using the new PostgreSQL syntax for upsert with psycopg2? A common use case is checking for dimension changes (between 50k and 100k rows daily, which I compare to existing values), which is ON CONFLICT DO NOTHING to only add new rows.
Another use case is that I have fact data which changes over time. I only take the most recent value (I currently use a view to select distinct), but it would be better to UPSERT, if possible.
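A minimal psycopg2 sketch of the ON CONFLICT syntax for the "only add new rows" case, using execute_values for the bulk part; the table name my_table, the columns id/datetime/value and the connection string are hypothetical placeholders:

import psycopg2
from psycopg2.extras import execute_values

conn = psycopg2.connect("dbname=mydb user=me")          # hypothetical connection string
with conn, conn.cursor() as cur:
    execute_values(
        cur,
        """INSERT INTO my_table (id, datetime, value)
           VALUES %s
           ON CONFLICT (id) DO NOTHING""",              # swap for DO UPDATE SET value = EXCLUDED.value to upsert
        list(df[['id', 'datetime', 'value']].itertuples(index=False, name=None)),
    )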
Here is my code for a bulk insert & insert-on-conflict-update query for PostgreSQL from a pandas dataframe.
Let's say id is the unique key for both the PostgreSQL table and the pandas df, and you want to insert and update based on this id.
import pandas as pd
from sqlalchemy import create_engine, text

engine = create_engine('postgresql://username:pass@host:port/dbname')

query = text(f"""
    INSERT INTO schema.table(name, title, id)
    VALUES {','.join([str(i) for i in list(df.to_records(index=False))])}
    ON CONFLICT (id)
    DO UPDATE SET name = excluded.name,
                  title = excluded.title
""")

engine.execute(query)
Make sure that your df columns are in the same order as your table columns.
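For instance, with the (name, title, id) column order assumed above, a quick reorder before building the query would be:

df = df[['name', 'title', 'id']]    # match the INSERT column order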
FYI, this is the solution I am using currently.
It seems to work fine for my purposes. I had to add a line to replace null (NaT) timestamps with None though, because I was getting an error when I was loading each row into the database.
def create_update_query(table):
    """This function creates an upsert query which replaces existing data based on primary key conflicts"""
    columns = ', '.join([f'{col}' for col in DATABASE_COLUMNS])
    constraint = ', '.join([f'{col}' for col in PRIMARY_KEY])
    placeholder = ', '.join([f'%({col})s' for col in DATABASE_COLUMNS])
    updates = ', '.join([f'{col} = EXCLUDED.{col}' for col in DATABASE_COLUMNS])
    query = f"""INSERT INTO {table} ({columns})
                VALUES ({placeholder})
                ON CONFLICT ({constraint})
                DO UPDATE SET {updates};"""
    query = ' '.join(query.split())
    return query

def load_updates(df, table, connection):
    conn = connection.get_conn()
    cursor = conn.cursor()
    df1 = df.where((pd.notnull(df)), None)
    insert_values = df1.to_dict(orient='records')
    for row in insert_values:
        cursor.execute(create_update_query(table=table), row)
        conn.commit()
    row_count = len(insert_values)
    logging.info(f'Inserted {row_count} rows.')
    cursor.close()
    del cursor
    conn.close()
For my case, I wrote to a temporary table first, then merged the temp table into the actual table I wanted to upsert to. Performing the upsert this way avoids any conflicts where the strings may have single quotes in them.
def upsert_dataframe_to_table(self, table_name: str, df: pd.DataFrame, schema: str, id_col: str):
    """
    Takes the given dataframe and inserts it into the table given. The data is inserted unless the key for that
    data already exists in the table. If the key already exists, the data for that key is overwritten.

    :param table_name: The name of the table to send the data
    :param df: The dataframe with the data to send to the table
    :param schema: the name of the schema where the table exists
    :param id_col: The name of the primary key column
    :return: None
    """
    engine = create_engine(
        f'postgresql://{postgres_configs["username"]}:{postgres_configs["password"]}@{postgres_configs["host"]}'
        f':{postgres_configs["port"]}/{postgres_configs["db"]}'
    )

    df.to_sql('temp_table', engine, if_exists='replace')

    updates = ', '.join([f'{col} = EXCLUDED.{col}' for col in df.columns if col != id_col])
    columns = ', '.join([f'{col}' for col in df.columns])

    query = f'INSERT INTO "{schema}".{table_name} ({columns}) ' \
            f'SELECT {columns} FROM temp_table ' \
            f'ON CONFLICT ({id_col}) DO ' \
            f'UPDATE SET {updates} '

    self.cursor.execute(query)
    self.cursor.execute('DROP TABLE temp_table')
    self.conn.commit()