Jupyter load dataframe into DB2 table - dataframe

I have a Jupyter notebook and I am trying to read CSV files from URLs and then load them into DB2. My first problem is that sometimes the table already exists and I need to drop it first; second, it seems impossible to load the data into the tables. What am I doing wrong?
import pandas as pd
import io
import requests
import ibm_db
import ibm_db_dbi
After this I try:
dsn = "DRIVER={{IBM DB2 ODBC DRIVER}};" + \
"DATABASE=BLUDB;" + \
"HOSTNAME=myhostname;" + \
"PORT=50000;" + \
"PROTOCOL=TCPIP;" + \
"UID=myuid;" + \
"PWD=mypassword;"
url1="https://ibm.box.com/shared/static/05c3415cbfbtfnr2fx4atenb2sd361ze.csv"
s1=requests.get(url1).content
df1=pd.read_csv(io.StringIO(s1.decode('utf-8')))
url2="https://ibm.box.com/shared/static/f9gjvj1gjmxxzycdhplzt01qtz0s7ew7.csv"
s2=requests.get(url2).content
df2=pd.read_csv(io.StringIO(s2.decode('utf-8')))
url3="https://ibm.box.com/shared/static/svflyugsr9zbqy5bmowgswqemfpm1x7f.csv"
s3=requests.get(url3).content
df3=pd.read_csv(io.StringIO(s3.decode('utf-8')))
All of this works, and the connection does as well:
hdbc = ibm_db.connect(dsn, "", "")
hdbi = ibm_db_dbi.Connection(hdbc)
and this fails:
#DropTableIfExists = ibm_db.exec_immediate(hdbc, 'DROP TABLE QBG03137.DATAFRAME1')
CreateTable = ibm_db.exec_immediate(hdbc, sql)
resultSet = ibm_db.exec_immediate(hdbc, sql)

# define row and fetch tuple
row = ibm_db.fetch_tuple(resultSet)
comma = ""
while (row != False):
    for column in row:
        print(comma, end="")
        print(column, end="")
        comma = ","
    print()
    row = ibm_db.fetch_tuple(resultSet)
with this error:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-94-840854e92bd7> in <module>
5
6 # Testing table exists
----> 7 resultSet = ibm_db.exec_immediate(hdbc, sql)
8
9 #define row and fetch tuple
Exception: [IBM][CLI Driver][DB2/LINUXX8664] SQL0601N The name of the object to be created is identical to the existing name "QBG03137.DATAFRAME1" of type "TABLE". SQLSTATE=42710 SQLCODE=-601
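The SQL0601N error itself answers the first half of the question: the CREATE in sql runs against a table that already exists, so the DROP has to run first (or its failure has to be tolerated when the table is not there yet). A minimal sketch, assuming the target table is QBG03137.DATAFRAME1 and that df1 is the frame to load, is to drop the table defensively and then let pandas create and fill it through the ibm_db_dbi connection:

# Drop the table if it already exists; swallow the error raised when it does not.
try:
    ibm_db.exec_immediate(hdbc, 'DROP TABLE QBG03137.DATAFRAME1')
except Exception:
    pass  # table was not there yet

# Let pandas create the table and insert the rows through the DBI wrapper.
# pandas falls back to its generic DBAPI path here, so it may print a warning,
# but the pattern is commonly used with ibm_db_dbi connections.
df1.to_sql('DATAFRAME1', hdbi, if_exists='replace', index=False)

The same two steps can be repeated for df2 and df3 with their own table names.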

Related

Creating a table in postgres using pandas and psycopg2 gives me an error

I am trying to create a table in a postgres database using pandas and psycopg2.
The syntax looks fine to me, but it still gives me an error:
---------------------------------------------------------------------------
SyntaxError Traceback (most recent call last)
Cell In[67], line 9
7 sql = "CREATE TABLE linux (Distribution, {})".format(', '.join(column_names))
8 # Execute the statement
----> 9 cur.execute(sql)
11 # Commit the changes
12 engine.commit()
SyntaxError: syntax error at end of input
LINE 1: ...tion_Commitment, Forked_From, Target_Audience, Cost, Status)
These are the things I tried.
import pandas as pd
import psycopg2
engine = psycopg2.connect(dbname="pandas", user="postgres", password="root", host="localhost")
cur = engine.cursor()
# Define the table
column_names = [
    "Founder",
    "Maintainer",
    "Initial_Release_Year",
    "Current_Stable_Version",
    "Security_Updates",
    "Release_Date",
    "System_Distribution_Commitment",
    "Forked_From",
    "Target_Audience",
    "Cost",
    "Status"
]
sql = "CREATE TABLE linux (Distribution, {})".format(', '.join(column_names))
# Execute the statement
cur.execute(sql)
# Commit the changes
engine.commit()
# Close the cursor and connection
cur.close()
engine.close()
You need to specify a data type for each column, as shown below. Change the data types to those that actually apply in your case.
schema = [
    ("Founder", "varchar(50)"),
    ("Maintainer", "varchar(50)"),
    ("Initial_Release_Year", "date"),
    ("Current_Stable_Version", "varchar(20)"),
    ("Security_Updates", "varchar(50)"),
    ("Release_Date", "date"),
    ("System_Distribution_Commitment", "varchar(50)"),
    ("Forked_From", "date"),
    ("Target_Audience", "varchar(50)"),
    ("Cost", "numeric(10, 2)"),
    ("Status", "varchar(10)")
]
sql = "CREATE TABLE linux (Distribution varchar(30), {})"\
.format(', '.join(map(' '.join, schema)))
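With the data types in place, the rest of the original snippet can stay as it was; roughly:

cur.execute(sql)   # CREATE TABLE linux (Distribution varchar(30), Founder varchar(50), ...)
engine.commit()    # DDL also needs an explicit commit on a psycopg2 connection
cur.close()
engine.close()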

sqlalchemy composite key error when cell run twice

My configuration cell contains the following:
from sqlalchemy import create_engine
import io
def df_to_postgres(dataframe, table_name):
    engine = create_engine('postgresql://postgres:pass@localhost:5432/postgres')
    df = dataframe
    df.head(0).to_sql(table_name.lower(), engine, if_exists='replace', index=False)
    conn = engine.raw_connection()
    cur = conn.cursor()
    output = io.StringIO()
    df.to_csv(output, sep='\t', header=False, index=False)
    output.seek(0)
    contents = output.getvalue()
    cur.copy_expert('COPY ' + table_name + ' FROM STDIN', output)
    conn.commit()
and the cell uploading the dataframe has:
df_to_postgres(product_table,'final.product')
The product_table has a composite key, which was created in postgres. The first time I run it everything is fine; the second time I get:
UniqueViolation Traceback (most recent call last)
<timed eval> in <module>
~\AppData\Local\Temp/ipykernel_8004/2936867318.py in df_to_postgres(dataframe, table_name)
15 output.seek(0)
16 contents = output.getvalue()
---> 17 cur.copy_expert('COPY ' + table_name + ' FROM STDIN', output)
18 conn.commit()
UniqueViolation: duplicate key value violates unique constraint "product_pkey"
DETAIL: Key (sku, unit)=(000002, kg) already exists.
CONTEXT: COPY product, line 1
I thought that if_exists='replace' would have solved this problem. Any solution?
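One likely explanation, offered as an assumption since only this function is shown: if_exists='replace' only affects the df.head(0).to_sql(...) call, and because 'final.product' is passed as a single string with no schema= argument, to_sql may be replacing a table literally named "final.product" while the COPY writes into product in the final schema, which still holds the previous run's rows and its composite key. A sketch that keeps the postgres-defined table (and its key) and simply clears it before each load could look like this:

import io
from sqlalchemy import create_engine

def df_to_postgres(dataframe, table_name):
    # table_name is assumed to already be schema-qualified, e.g. 'final.product'
    engine = create_engine('postgresql://postgres:pass@localhost:5432/postgres')
    conn = engine.raw_connection()
    try:
        cur = conn.cursor()
        # Remove the previous run's rows but keep the table and its composite key
        cur.execute('TRUNCATE TABLE {}'.format(table_name))
        output = io.StringIO()
        dataframe.to_csv(output, sep='\t', header=False, index=False)
        output.seek(0)
        cur.copy_expert('COPY {} FROM STDIN'.format(table_name), output)
        conn.commit()
    finally:
        conn.close()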

psycopg2.errors.InvalidTextRepresentation while using COPY in postgresql

I am using a custom callable with pandas.to_sql(). The snippet below is from the pandas documentation for using it:
import csv
from io import StringIO
def psql_insert_copy(table, conn, keys, data_iter):
    """
    Execute SQL statement inserting data

    Parameters
    ----------
    table : pandas.io.sql.SQLTable
    conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
    keys : list of str
        Column names
    data_iter : Iterable that iterates the values to be inserted
    """
    # gets a DBAPI connection that can provide a cursor
    dbapi_conn = conn.connection
    with dbapi_conn.cursor() as cur:
        s_buf = StringIO()
        writer = csv.writer(s_buf)
        writer.writerows(data_iter)
        s_buf.seek(0)

        columns = ', '.join('"{}"'.format(k) for k in keys)
        if table.schema:
            table_name = '{}.{}'.format(table.schema, table.name)
        else:
            table_name = table.name

        sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(table_name, columns)
        cur.copy_expert(sql=sql, file=s_buf)
but while using this copy functionality, I am getting the error
psycopg2.errors.InvalidTextRepresentation: invalid input syntax for integer: "3.0"
This is not a problem with the input, as the same table schemas and values were working initially when I used the to_sql() function without the custom callable psql_insert_copy(). I am using a sqlalchemy engine to get the connection cursor.
I would recommend either using string fields in the table for such operations, or writing the entire SQL script manually, specifying the types of the table fields.
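If the integer columns should stay integers, another possibility (an assumption based on the "3.0" in the error message) is that pandas has turned the column into float, for instance because of NaNs, so COPY sends the text "3.0" into an integer column. Converting the column to a nullable integer dtype before calling to_sql keeps the CSV values integral; a sketch with a hypothetical column name:

# 'some_int_column' is a placeholder; use the column that actually triggers the error.
df['some_int_column'] = df['some_int_column'].astype('Int64')  # nullable integer dtype

df.to_sql('target_table', engine, if_exists='append', index=False,
          method=psql_insert_copy)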

Is there an alternative way for collect() in pyspark? py4j.protocol.Py4JJavaError: An error occurred while calling 0323.collectToPython

My Pyspark script crashes when I use collect() or show(). My dataframe has only 570 rows, so I don't understand what is happening.
I have a dataframe and I have created a function that extracts a list of distinct rows from it. It was working fine, then suddenly I got this error:
py4j.protocol.Py4JJavaError: An error occurred while calling
0323.collectToPython
I get a similar error when I try to show() the dataframe.
Is there an alternative method to extract a list with distinct values from a dataframe?
required_list = [(col1,col2), (col1,col2)]
Sorry for not posting the code, but it's a large script and it's confidential.
Update:
I have a function that extracts distinct values from a df:
def extract_dist(df, select_cols):
    val = len(select_cols)
    list_val = [row[0:val] for row in df.select(*select_cols).distinct().na.drop().collect()]
    return list_val
The function worked fine until I got the error.
I have a main script where I import these functions, along with another function that calculates a dataframe:
def calculate_df(df_join, v_srs, v_db, v_tbl, sql_context):
    cmd = "impala-shel....'create table db.tbl as select * from v_db.v_tbl'"
    os.system(cmd)

    select_stm = "select * from db.tbl"
    df = sql_context.sql(select_stm)

    cmd = "impala-shel....'drop table if exists db.tbl'"
    os.system(cmd)

    join_cond = [...]
    joined_df = df.join(df_join, join_cond, 'left').select(...)
    df1 = joined_df.filter(...)
    df2 = joined_df.filter(...)
    final_df = df1.union(df2)
    final_df.show()  # error from show
    return final_df
Main script:
import extract_dist
import calculate_df

df_join = ...extract from a hive table

for conn in details:
    v_db = conn['database'].upper()
    v_tbl = conn['table'].upper()
    v_name = conn['descr'].upper()
    if v_name in lst:
        df = calculate_df(df_join, v_name, v_db, v_tbl, sqlContext)
        df = df.filter(...column isin list)
        df = df.filter(..).filter(..)
        # extract list with distinct rows from df using dist function
        df.show()  # error from show
        dist_list = extract_dist(df, [col1, col2])  # error from collect
        for x, y in dist_list:
            ....
If I don't use show(), the error appears when I run the collect() method.
The same script worked before and suddenly failed. Is there a memory issue? Do I have to clear memory?
SOLVED:
I found the issue. After I created the dataframe from a table, I dropped that table:
cmd = "impala-shel....'drop table if exists db.tbl'"
os.system(cmd)
After I removed the drop table command, the script ran successfully.
I will now drop the temporary table at the end of the script, once I am finished with the extracted dataframe. I didn't know that creating a dataframe and then dropping its source table would cause errors later on.
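For background (this reasoning is mine, not from the original post): Spark evaluates dataframes lazily, so sql_context.sql(...) only records a plan, and the table is actually read when show() or collect() runs; dropping the source table in between therefore breaks every later action. If the table really has to be dropped early, materialising the dataframe first, reusing the names from the question, might look like this:

import os

select_stm = "select * from db.tbl"
df = sql_context.sql(select_stm)

# Force Spark to read db.tbl now and keep the rows cached,
# so later actions no longer depend on the table existing.
df = df.cache()
df.count()  # triggers the actual read

os.system("impala-shel....'drop table if exists db.tbl'")  # safer to drop now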

Pyspark code to select data from HIVE table and write in HDFS as parquet format

I am trying to select data from a partitioned HIVE table (partitioned on the column label_yyyy_mm_dd) for a selected date range and write-append it to HDFS as a parquet file. However, I am getting an error. Below is the code and the error.
from pyspark.sql.functions import current_date, date_format, date_sub
from datetime import datetime, timedelta
import datetime
q = """select label_yyyy_mm_dd
,label_yyyy_mm
,q_media_name
,a_accepted
,a_end_ts
,a_media_name
,a_resource_name
,a_start_ts
,k_callpurpose
,k_srf
,q_entry_ordinal
,q_interaction_id
,q_interaction_type
,q_ixn_resource_id
,q_resource_name
,a_consult_rcv_warm_engage_time
,a_consult_rcv_warm_hold_time
,a_consult_rcv_warm_wrap_time
,a_customer_handle_count
,a_customer_talk_duration
,a_interaction_resource_id
,a_interaction_id
,a_wrap_time
,a_technical_result
,k_ixn_type
,k_ixn_type_source
,k_transfer_count
,k_language
,k_agentauth
,k_auth,k_rg
,k_channel
,k_gms_result
,k_connid
,k_rbcprimaryid
,k_agent_id
,a_interaction_resource_ordinal
from prod_T0V0_cct0.cct0_gim_measures_gold A
inner join prod_T0V0_cct0.yle0_gim_date_time B on A.a_start_date_time_key = B.date_time_key where label_yyyy_mm_dd
>='2017/03/07' AND label_yyyy_mm_dd <='2017/03/31'"""

spark.sql(q).write.mode('append').parquet('hdfs:/prod/11323/app/H9A0/data/T0V0/DIG/test.parquet/label_yyyy_mm_dd=%s' % label_yyyy_mm_dd)
Error message:
NameError Traceback (most recent call last)
<ipython-input-4-e695e7530d80> in <module>()
42 where label_yyyy_mm_dd >='2017/03/07'
43 AND label_yyyy_mm_dd <='2017/03/31'"""
---> 44 spark.sql(q).write.mode('append').parquet('hdfs:/prod/11323/app/H9A0/data/T0V0/DIG/test.parquet/label_yyyy_mm_dd=%s'%label_yyyy_mm_dd)
NameError: name 'label_yyyy_mm_dd' is not defined
First, you have
q = """select label_yyyy_mm_dd
,label_yyyy_mm
,q_media_name and so on'''
Create a DataFrame that has the columns from q:
df = spark.sql(q)
Then select the 'label_yyyy_mm_dd' column from the DataFrame df:
label_yyyy_mm_dd = df.select('label_yyyy_mm_dd')
Convert it to a string by taking the first value:
label_yyyy_mm_dd_coll = ",".join(str("{0}".format(value.label_yyyy_mm_dd))
                                 for value in label_yyyy_mm_dd.take(1))
Then pass the variable to the write:
df.write.mode('append').parquet('hdfs:/prod/11323/app/H9A0/data/T0V0/DIG/test.parquet/label_yyyy_mm_dd=%s' % label_yyyy_mm_dd_coll)
You are loading the complete dataframe into q. So when you pass
%label_yyyy_mm_dd
it is unable to read that column.
Try this:
label_yyyy_mm_dd = q.select("label_yyyy_mm_dd") \
    .rdd.map(lambda x: str(x["label_yyyy_mm_dd"])).collect()
(Perform the collect() if that column contains multiple values; if you only need the first value, replace it with .first() instead. Collect is not recommended, though, as it pulls the full load onto the driver.)
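Putting the two answers together, a rough end-to-end sketch (an illustration only, assuming the dataframe is first built from the q string and that one append per partition value is wanted) could be:

df = spark.sql(q)

# Distinct partition values, collected back to the driver as strings.
dates = [str(r.label_yyyy_mm_dd)
         for r in df.select('label_yyyy_mm_dd').distinct().collect()]

for d in dates:
    (df.filter(df.label_yyyy_mm_dd == d)
       .write.mode('append')
       .parquet('hdfs:/prod/11323/app/H9A0/data/T0V0/DIG/test.parquet/'
                'label_yyyy_mm_dd=%s' % d))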