I have a Jupyter notebook and I am trying to read CSV files from URLs and then load them into a DB2 database. My first problem is that sometimes the table already exists and I need to drop it first; second, it seems impossible to load the data into the tables. What am I doing wrong?
import pandas as pd
import io
import requests
import ibm_db
import ibm_db_dbi
After this I try:
dsn = "DRIVER={{IBM DB2 ODBC DRIVER}};" + \
"DATABASE=BLUDB;" + \
"HOSTNAME=myhostname;" + \
"PORT=50000;" + \
"PROTOCOL=TCPIP;" + \
"UID=myuid;" + \
"PWD=mypassword;"
url1="https://ibm.box.com/shared/static/05c3415cbfbtfnr2fx4atenb2sd361ze.csv"
s1=requests.get(url1).content
df1=pd.read_csv(io.StringIO(s1.decode('utf-8')))
url2="https://ibm.box.com/shared/static/f9gjvj1gjmxxzycdhplzt01qtz0s7ew7.csv"
s2=requests.get(url2).content
df2=pd.read_csv(io.StringIO(s2.decode('utf-8')))
url3="https://ibm.box.com/shared/static/svflyugsr9zbqy5bmowgswqemfpm1x7f.csv"
s3=requests.get(url3).content
df3=pd.read_csv(io.StringIO(s3.decode('utf-8')))
All of that works, and so does the connection:
hdbc = ibm_db.connect(dsn, "", "")
hdbi = ibm_db_dbi.Connection(hdbc)
And this fails:
#DropTableIfExists = ibm_db.exec_immediate(hdbc, 'DROP TABLE QBG03137.DATAFRAME1')
CreateTable = ibm_db.exec_immediate(hdbc, sql)
resultSet = ibm_db.exec_immediate(hdbc, sql)

#define row and fetch tuple
row = ibm_db.fetch_tuple(resultSet)
comma = ""
while (row != False):
    for column in row:
        print(comma, end="")
        print(column, end="")
        comma = ","
    print()
    row = ibm_db.fetch_tuple(resultSet)
With this error:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-94-840854e92bd7> in <module>
5
6 # Testing table exists
----> 7 resultSet = ibm_db.exec_immediate(hdbc, sql)
8
9 #define row and fetch tuple
Exception: [IBM][CLI Driver][DB2/LINUXX8664] SQL0601N The name of the object to be created is identical to the existing name "QBG03137.DATAFRAME1" of type "TABLE". SQLSTATE=42710 SQLCODE=-601
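The SQL0601N error just means the CREATE TABLE statement ran while QBG03137.DATAFRAME1 already existed. A minimal, hedged sketch of the drop-recreate-load pattern with ibm_db (the column names COL1/COL2 and their types are placeholders, not taken from the actual CSV):

# Drop the table if it already exists; ibm_db raises an Exception if it doesn't,
# so that case is simply ignored.
try:
    ibm_db.exec_immediate(hdbc, 'DROP TABLE QBG03137.DATAFRAME1')
except Exception:
    pass

# Recreate the table (placeholder columns -- adjust to match df1).
ibm_db.exec_immediate(hdbc, 'CREATE TABLE QBG03137.DATAFRAME1 (COL1 VARCHAR(50), COL2 INTEGER)')

# Load the dataframe rows with a prepared, parameterized INSERT.
insert_stmt = ibm_db.prepare(hdbc, 'INSERT INTO QBG03137.DATAFRAME1 (COL1, COL2) VALUES (?, ?)')
for row in df1.itertuples(index=False):
    ibm_db.execute(insert_stmt, tuple(row))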
Related
I am trying to create a table in a Postgres database using pandas and psycopg2.
The syntax seems fine, but it still gives me an error like:
---------------------------------------------------------------------------
SyntaxError Traceback (most recent call last)
Cell In[67], line 9
7 sql = "CREATE TABLE linux (Distribution, {})".format(', '.join(column_names))
8 # Execute the statement
----> 9 cur.execute(sql)
11 # Commit the changes
12 engine.commit()
SyntaxError: syntax error at end of input
LINE 1: ...tion_Commitment, Forked_From, Target_Audience, Cost, Status)
This is what I tried:
import pandas as pd
import psycopg2
engine = psycopg2.connect(dbname="pandas", user="postgres", password="root", host="localhost")
cur = engine.cursor()
# Define the table
column_names = [
    "Founder",
    "Maintainer",
    "Initial_Release_Year",
    "Current_Stable_Version",
    "Security_Updates",
    "Release_Date",
    "System_Distribution_Commitment",
    "Forked_From",
    "Target_Audience",
    "Cost",
    "Status"
]
sql = "CREATE TABLE linux (Distribution, {})".format(', '.join(column_names))
# Execute the statement
cur.execute(sql)
# Commit the changes
engine.commit()
# Close the cursor and connection
cur.close()
engine.close()
You need to specify a data type for each column, as shown below. Change the data types to those that match your actual data.
schema = [
    ("Founder", "varchar(50)"),
    ("Maintainer", "varchar(50)"),
    ("Initial_Release_Year", "date"),
    ("Current_Stable_Version", "varchar(20)"),
    ("Security_Updates", "varchar(50)"),
    ("Release_Date", "date"),
    ("System_Distribution_Commitment", "varchar(50)"),
    ("Forked_From", "date"),
    ("Target_Audience", "varchar(50)"),
    ("Cost", "numeric(10, 2)"),
    ("Status", "varchar(10)")
]
sql = "CREATE TABLE linux (Distribution varchar(30), {})"\
    .format(', '.join(map(' '.join, schema)))
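For reference, a quick way to sanity-check the generated statement before executing it (same execute/commit calls as in your snippet):

print(sql)
# CREATE TABLE linux (Distribution varchar(30), Founder varchar(50), Maintainer varchar(50), Initial_Release_Year date, ...)
cur.execute(sql)
engine.commit()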
My configuration cell contains the following:
from sqlalchemy import create_engine
import io
def df_to_postgres(dataframe, table_name):
    engine = create_engine('postgresql://postgres:pass@localhost:5432/postgres')
    df = dataframe
    df.head(0).to_sql(table_name.lower(), engine, if_exists='replace', index=False)
    conn = engine.raw_connection()
    cur = conn.cursor()
    output = io.StringIO()
    df.to_csv(output, sep='\t', header=False, index=False)
    output.seek(0)
    contents = output.getvalue()
    cur.copy_expert('COPY ' + table_name + ' FROM STDIN', output)
    conn.commit()
and the cell uploading the dataframe has:
df_to_postgres(product_table,'final.product')
The product_table has a composite key which has been created in Postgres. When I run it the 1st time everything is good; when I run it the 2nd time I get:
UniqueViolation Traceback (most recent call last)
<timed eval> in <module>
~\AppData\Local\Temp/ipykernel_8004/2936867318.py in df_to_postgres(dataframe, table_name)
15 output.seek(0)
16 contents = output.getvalue()
---> 17 cur.copy_expert('COPY ' + table_name + ' FROM STDIN', output)
18 conn.commit()
UniqueViolation: duplicate key value violates unique constraint "product_pkey"
DETAIL: Key (sku, unit)=(000002, kg) already exists.
CONTEXT: COPY product, line 1
I thought that if_exists='replace' would have solved this problem. Any solution?
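I am not certain if_exists='replace' ever touches the loaded table here: without a schema= argument, to_sql('final.product', ...) creates a table literally named "final.product" in the default schema, while COPY final.product FROM STDIN loads into table product in schema final. A hedged sketch that keeps the composite key and simply clears the real target table before each COPY (names taken from the question, adjust to your setup):

def df_to_postgres(dataframe, table_name):
    # assumes the target table, with its composite key, already exists in Postgres
    engine = create_engine('postgresql://postgres:pass@localhost:5432/postgres')
    conn = engine.raw_connection()
    cur = conn.cursor()
    cur.execute('TRUNCATE TABLE ' + table_name)  # remove the rows from the previous run
    output = io.StringIO()
    dataframe.to_csv(output, sep='\t', header=False, index=False)
    output.seek(0)
    cur.copy_expert('COPY ' + table_name + ' FROM STDIN', output)
    conn.commit()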
I am using a custom callable with pandas.to_sql(). The snippet below is from the pandas documentation on how to use it:
import csv
from io import StringIO
def psql_insert_copy(table, conn, keys, data_iter):
    """
    Execute SQL statement inserting data

    Parameters
    ----------
    table : pandas.io.sql.SQLTable
    conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
    keys : list of str
        Column names
    data_iter : Iterable that iterates the values to be inserted
    """
    # gets a DBAPI connection that can provide a cursor
    dbapi_conn = conn.connection
    with dbapi_conn.cursor() as cur:
        s_buf = StringIO()
        writer = csv.writer(s_buf)
        writer.writerows(data_iter)
        s_buf.seek(0)

        columns = ', '.join('"{}"'.format(k) for k in keys)
        if table.schema:
            table_name = '{}.{}'.format(table.schema, table.name)
        else:
            table_name = table.name

        sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
            table_name, columns)
        cur.copy_expert(sql=sql, file=s_buf)
But while using this copy functionality, I am getting the error:
psycopg2.errors.InvalidTextRepresentation: invalid input syntax for integer: "3.0"
This is not a problem with the input, as the same table schema and values were working initially, when I used the to_sql() function without the custom callable psql_insert_copy(). I am using a SQLAlchemy engine to get the connection cursor.
I would recommend using string fields in the table for such operations, or writing the entire SQL script manually, specifying the types of the table fields.
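If the failing column is actually an integer column that pandas promoted to float (which is how a value like "3.0" typically ends up in the CSV buffer), another hedged option is to cast it back to an integer dtype before calling to_sql, so COPY receives 3 instead of 3.0. The column name int_col and table name my_table are placeholders, and engine is the SQLAlchemy engine you mention:

# cast back to integer before writing; use the nullable 'Int64' dtype instead
# if the column contains NaNs
df['int_col'] = df['int_col'].astype('int64')

df.to_sql('my_table', engine, if_exists='append', index=False, method=psql_insert_copy)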
My PySpark script crashes when I use collect() or show(). My dataframe has only 570 rows, so I don't understand what is happening.
I have a dataframe and I have created a function that extracts a list of distinct rows from it. It was working fine, then suddenly I got an error:
py4j.protocol.Py4JJavaError: An error occurred while calling o323.collectToPython
I get a similar error when I try to show() the dataframe.
Is there an alternative method to extract a list of distinct values from a dataframe?
required_list = [(col1,col2), (col1,col2)]
Sorry for not posting the code, but it's a large script and it's confidential.
Update:
I have a function that extracts distinct values from a df:
def extract_dist(df, select_cols):
    val = len(select_cols)
    list_val = [row[0:val] for row in df.select(*select_cols).distinct().na.drop().collect()]
    return list_val
The function worked fine until I got the error.
I have a main script where I import this function and also another function that calculates a dataframe:
def calculate_df(df_join, v_srs, v_db, v_tbl, sql_context):
    cmd = "impala-shell ... 'create table db.tbl as select * from v_db.v_tbl'"
    os.system(cmd)
    select_stm = "select * from db.tbl"
    df = sql_context.sql(select_stm)
    cmd = "impala-shell ... 'drop table if exists db.tbl'"
    os.system(cmd)
    join_cond = [...]
    joined_df = df.join(df_join, join_cond, 'left').select(..)
    df1 = joined_df.filter(...)
    df2 = joined_df.filter(...)
    final_df = df1.union(df2)
    final_df.show()  # error from show
    return final_df
Main script:
import extract_dist
import calculate_df
df_join = ...  # extract from a Hive table
for conn in details:
    v_db = conn['database'].upper()
    v_tbl = conn['table'].upper()
    v_name = conn['descr'].upper()
    if v_name in lst:
        df = calculate_df(df_join, v_name, v_db, v_tbl, sqlContext)
        df = df.filter(...column isin list)
        df = df.filter(..).filter(..)
        # extract list with distinct rows from df using dist function
        df.show()  # error from show
        dist_list = extract_dist(df, [col1, col2])  # error from collect
        for x, y in dist_list:
            ....
If I don't use show(), the error appears when I run the collect() method.
The same script worked before and suddenly failed. Is there a memory issue? Do I have to clear memory?
SOLVED:
I have found the issue. After I created the dataframe from a table, I dropped the table:
cmd = "impala-shel....'drop table if exists db.tbl'"
os.system(cmd)
After I removed the drop table command, the script ran successfully.
I will drop the temporary table at the end of the script, after I finish with the extracted dataframe. I didn't know that creating a dataframe and then dropping the source table would cause errors later on.
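For context: Spark dataframes are evaluated lazily, so df is only a query plan until an action such as show() or collect() runs, and at that point the source table still has to exist. If the table really must be dropped early, a hedged sketch is to materialize the dataframe first (caching can still be evicted, so writing the intermediate result somewhere durable is the safer option):

df = sql_context.sql("select * from db.tbl")
df = df.cache()
df.count()  # action that forces the read while the table still exists
# only now drop the temporary table
os.system("impala-shell ... 'drop table if exists db.tbl'")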
I am trying to select data from a partitioned Hive table (partitioned on column label_yyyy_mm_dd) for a selected date range and write it in append mode to HDFS as a Parquet file. However, I am getting an error. Below are the code and the error.
from pyspark.sql.functions import current_date, date_format, date_sub
from datetime import datetime, timedelta
import datetime
q = """select label_yyyy_mm_dd
,label_yyyy_mm
,q_media_name
,a_accepted
,a_end_ts
,a_media_name
,a_resource_name
,a_start_ts
,k_callpurpose
,k_srf
,q_entry_ordinal
,q_interaction_id
,q_interaction_type
,q_ixn_resource_id
,q_resource_name
,a_consult_rcv_warm_engage_time
,a_consult_rcv_warm_hold_time
,a_consult_rcv_warm_wrap_time
,a_customer_handle_count
,a_customer_talk_duration
,a_interaction_resource_id
,a_interaction_id
,a_wrap_time
,a_technical_result
,k_ixn_type
,k_ixn_type_source
,k_transfer_count
,k_language
,k_agentauth
,k_auth,k_rg
,k_channel
,k_gms_result
,k_connid
,k_rbcprimaryid
,k_agent_id
,a_interaction_resource_ordinal
from prod_T0V0_cct0.cct0_gim_measures_gold A
inner join prod_T0V0_cct0.yle0_gim_date_time B on A.a_start_date_time_key = B.date_time_key where label_yyyy_mm_dd
>= '2017/03/07' AND label_yyyy_mm_dd <= '2017/03/31'"""

spark.sql(q).write.mode('append').parquet('hdfs:/prod/11323/app/H9A0/data/T0V0/DIG/test.parquet/label_yyyy_mm_dd=%s' % label_yyyy_mm_dd)
Error message:
NameError Traceback (most recent call last)
<ipython-input-4-e695e7530d80> in <module>()
42 where label_yyyy_mm_dd >='2017/03/07'
43 AND label_yyyy_mm_dd <='2017/03/31'"""
---> 44 spark.sql(q).write.mode('append').parquet('hdfs:/prod/11323/app/H9A0/data/T0V0/DIG/test.parquet/label_yyyy_mm_dd=%s'%label_yyyy_mm_dd)
NameError: name 'label_yyyy_mm_dd' is not defined
First, you have:
q = """select label_yyyy_mm_dd
,label_yyyy_mm
,q_media_name and so on"""
Create a DataFrame which has the columns from 'q':
df = spark.sql(q)
Then select the 'label_yyyy_mm_dd' column from DataFrame 'df':
label_yyyy_mm_dd = df.select('label_yyyy_mm_dd')
Convert it to a string by taking the first value:
label_yyyy_mm_dd_coll = ",".join(str("{0}".format(value.label_yyyy_mm_dd))
                                 for value in label_yyyy_mm_dd.take(1))
Pass the variable when writing:
df.write.mode('append').parquet('hdfs:/prod/11323/app/H9A0/data/T0V0/DIG/test.parquet/label_yyyy_mm_dd=%s' % label_yyyy_mm_dd_coll)
You are loading the complete dataframe into the q dataframe. So, when you pass
%label_yyyy_mm_dd
it is unable to read that column.
try this:
label_yyyy_mm_dd = q.select("label_yyyy_mm_dd") \
    .rdd.map(lambda x: str(x["label_yyyy_mm_dd"])).collect()
(Perform collect() if that column contains multiple values; otherwise, if you need only the first value, just use .first() instead. collect() is not recommended, as it pulls everything onto the driver.)
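For the single-value case mentioned above, the .first() variant would look roughly like this (a hedged sketch, reusing the q query string from the question; label_yyyy_mm_dd_val is a placeholder variable name):

df = spark.sql(q)
first_row = df.select("label_yyyy_mm_dd").first()
label_yyyy_mm_dd_val = str(first_row["label_yyyy_mm_dd"])

df.write.mode('append').parquet('hdfs:/prod/11323/app/H9A0/data/T0V0/DIG/test.parquet/label_yyyy_mm_dd=%s' % label_yyyy_mm_dd_val)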