Python dbf: append to a memory-indexed table fails - python-3.8

I'm using the Python dbf 0.99.1 library from Ethan Furman. This approach to adding a record to a table fails:
tab = dbf.Table( "MYTABLE" )
tab.open(mode=dbf.READ_WRITE)
idx = tab.create_index(lambda rec: (rec.id if not is_deleted(rec) else DoNotIndex ) ) # without this, append works
rec = { "id":id, "col2": val2 } # some values, id is numeric and is not None
tab.append( rec ) # fails here
My table contains various character and numeric columns; this is just an example. The exception is:
line 5959, in append
newrecord = Record(recnum=header.record_count, layout=meta, kamikaze=kamikaze)
line 3102, in __new__
record._update_disk()
line 3438, in _update_disk
index(self)
line 7550, in __call__
vindex = bisect_right(self._values, key)
TypeError: '<' not supported between instances of 'NoneType' and 'int'
Any help appreciated. Thanks.
EDIT: Here is a test script:
import dbf
from dbf import is_deleted, DoNotIndex
tab = dbf.Table('temptable', "ID N(12,0)" )
tab.open(mode=dbf.READ_WRITE)
rc = { "id":1 }
tab.append( rc ) # need some data without index first
idx = tab.create_index(lambda rec: (rec.id if not is_deleted(rec) else DoNotIndex ) )
rc = { "id":2 }
tab.append( rc ) # fails here
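There is no accepted answer in this excerpt, so a hedged observation of my own: the index callback appears to run while the freshly appended record is still being written out, at which point rec.id is still None, and bisect then compares None against the existing integer keys, which raises the TypeError above. One workaround to try, sketched under that assumption, is to have the key function also skip records whose id is not set yet:
# hedged sketch, not from the original post: skip deleted records and unset ids
idx = tab.create_index(
    lambda rec: DoNotIndex if is_deleted(rec) or rec.id is None else rec.id
)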

Related

cx_oracle ORA-01036: illegal variable name/number

I have a group of Python scripts in which I make an API call, merge the resulting dataframes together, then write the output to a table in an Oracle DB. This exact pattern works perfectly in three other scripts configured the same way (just a different API), but in this particular script the error below is thrown. I read up on binds, but I can't see what I'm doing wrong for a tuple. Thanks in advance for your time.
import cx_Oracle
import pandas as pd

sf_joined = pd.merge(sf_opp, sf_account, on=["CustID", "CustID"])
# sf_joined.to_csv('sf_joined.csv', index=False)
# sf_types = sf_joined.dtypes
# print(sf_types)
char_columns = sf_joined.select_dtypes(include=['object']).columns
for col in char_columns:
    if col not in ['rundate', 'Amount', 'Estimated_GC', 'Probability', 'MDC', 'DaysOpen']:
        sf_joined[col] = sf_joined[col].fillna('')
        # sf_joined[col] = sf_joined[col].map(lambda x: x.encode('utf-8'))
        sf_joined[col] = sf_joined[col].map(lambda x: x[:1000])
pw = '****'
db_con = cx_Oracle.connect('mktg', pw, "prd-bia-db-***.o******.com:1521/BIPRD", encoding="UTF-8", nencoding="UTF-8")
cur = db_con.cursor()
print(db_con.version)
cur.execute('drop table cs_salesforce')
create_opps = """create table cs_salesforce(
rundate date,
sfoppid varchar(500)
)
"""
cur.execute(create_opps)
all_opps = []
for x in sf_joined.itertuples():
    all_opps.append(x[1:])
insert_statement = """insert into cs_salesforce(rundate, sfoppid) values (:1, :2)"""
cur.executemany(insert_statement, all_opps)
db_con.commit()
Error:
runfile('C:/python_scripts_prod/cs_salesforce.py', wdir='C:/python_scripts_prod')
18.3.0.0.0
Traceback (most recent call last):
File "C:\python_scripts_prod\cs_salesforce.py", line 163, in <module>
cur.executemany(insert_statement, all_opps)
DatabaseError: ORA-01036: illegal variable name/number
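No answer is included in this excerpt, so a hedged note of my own: ORA-01036 is typically raised when the bind values supplied don't match the placeholders in the statement. Here the INSERT has two placeholders (:1, :2), but each tuple appended from itertuples() carries every column of the merged dataframe, so the counts disagree. A minimal sketch of one way to line them up, assuming the dataframe really does have columns named rundate and sfoppid (names inferred from the target table, not from the post):
# hedged sketch: bind exactly as many values per row as there are placeholders
all_opps = [(row.rundate, row.sfoppid) for row in sf_joined.itertuples(index=False)]
insert_statement = "insert into cs_salesforce (rundate, sfoppid) values (:1, :2)"
cur.executemany(insert_statement, all_opps)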

Creating sqlite db in Python 3 with constructed f string and receiving: sqlite3.OperationalError: near "(": syntax error

I am trying to create a sqlite3 db table using a constructed f string in python 3, however I am receiving the below error:
sqlite3.OperationalError: near "(": syntax error
I had hoped that I wouldn't need to ask here for a syntax error but I have been searching on stackoverflow as well as generally online to identify the issue with no success.
I have compared the code to other samples and likewise see no difference in the construction, except that it doesn't appear to be common to use f-strings for this.
I have read the pros/cons of passing parameters and would prefer this f string unless it is the root cause.
I expect the issue might be obvious, however any pointers would be greatly appreciated.
Below is the full code:
import sqlite3
import pandas as pd

db_path = [PATH TO DATABASE]
db_table_name = [TABLE NAME]
header_source = [PATH TO .XLSX]

def ReadHeaders():
    df = pd.read_excel(header_source)
    col_list = list(df.columns.values)
    prep_col_list = [item.replace(" ", "_") for item in col_list]
    col_string = " TEXT, _".join(prep_col_list)
    final_col_string = col_string.replace("Primary_ID TEXT", "Primary_ID PRIMARY KEY")
    return final_col_string

def CreateSQLdb():
    cols = ReadHeaders()
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute(f""" CREATE TABLE IF NOT EXISTS {db_table_name} ({cols}) """)
    conn.commit()
    conn.close()
A sample of the string that is created for the table headers is:
_link TEXT, _Primary_ID PRIMARY KEY, _Status_Description TEXT, _Price_List_Status TEXT, _Brand TEXT, _36_Character_Description TEXT
Solved
After breaking everything down, the root cause was the constructed string. I was able to identify it when trying to export the constructed string to a .txt file and received a unicode error.
Code before:
return final_col_string
Code after:
return final_col_string.encode(encoding="utf-8")
I also added a simple check of the table info for confirmation
def ShowTable(c):
    c.execute(f"PRAGMA table_info({db_table_name})")
    print(c.fetchall())
Complete code in case anyone else comes across this issue:
import sqlite3
import pandas as pd

db_path = [PATH TO DATABASE]
db_table_name = [TABLE NAME]
header_source = [PATH TO .XLSX]

def ReadHeaders():
    df = pd.read_excel(header_source)
    col_list = list(df.columns.values)
    prep_col_list = [item.replace(" ", "_") for item in col_list]
    col_string = " TEXT, _".join(prep_col_list)
    final_col_string = col_string.replace("Primary_ID TEXT", "Primary_ID PRIMARY KEY")
    return final_col_string.encode(encoding="utf-8")

def CreateSQLdb():
    cols = ReadHeaders()
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute(f""" CREATE TABLE IF NOT EXISTS {db_table_name} ({cols}) """)
    conn.commit()
    conn.close()

def ShowTable(c):
    c.execute(f"PRAGMA table_info({db_table_name})")
    print(c.fetchall())

if __name__ == "__main__":
    CreateSQLdb()
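A small aside of my own, not part of the original question or solution: when a constructed CREATE TABLE statement fails with a syntax error, printing the exact statement before executing it usually points straight at the offending fragment. A minimal variant of CreateSQLdb under the same assumptions as above:
def CreateSQLdb():
    cols = ReadHeaders()
    ddl = f"CREATE TABLE IF NOT EXISTS {db_table_name} ({cols})"
    print(ddl)  # inspect the exact statement SQLite will parse
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute(ddl)
    conn.commit()
    conn.close()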

python TypeError: expected string or buffer when parsing JSON from a file

I realize this problem has been answered for other folks, but none of those threads is helping me solve it. I'm trying to parse a JSON structure and add up all the values in the sent_file whose keys match the tweet_file. The error I'm getting is shown after the code.
import sys
import json

def main():
    sent_file = open(sys.argv[1])
    tweet_file = open(sys.argv[2])
    scores = {}
    #tweet = {}
    #tweet_text = {}
    #hw()
    #lines(sent_file)
    #lines(tweet_file)
    for line in sent_file:
        term, score = line.split("\t")
        scores[term] = int(score)
    #print scores.items()
    for tweets in tweet_file:
        current_sent_value = 0
        tweet = {}  # this is a dict
        #print type(tweets) str
        tweet = json.loads(tweets)  #[0] #this assignment changes tweet to a list. Why?
        if 'text' in tweet:
            tweet_text = {}
            unicode_string = tweet['text']
            encoded_string = unicode_string.encode('utf-8')
            tweet_text = encoded_string.split()
            for key in tweet_text:
                for key in scores:
                    #print type(tweet_text) -- list
                    #print type(scores) --dict
                    if tweet_text.get(key) == scores.get(key):  # get() does not work on a list. tweet_text is a list.
                        current_sent_value += scores(value)
        print current_sent_value

if __name__ == '__main__':
    main()
The error is here:
\assignment1\tweet_sentiment2.py", line 42, in main
if tweet_text.get(key) == scores.get(key): # get() does not work on a list. tweet_text is a list.
AttributeError: 'list' object has no attribute 'get'
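No fix is included in this excerpt, so a hedged reading of my own: encoded_string.split() returns a list of words, and lists have no .get(), hence the AttributeError; the "expected string or buffer" TypeError in the title usually appears when json.loads() is handed something other than a string. If the intent is to sum the sentiment score of every word in the tweet, one sketch (reusing the scores dict built above) would be:
# hedged sketch, not from the original post
words = tweet['text'].encode('utf-8').split()
current_sent_value = sum(scores.get(word, 0) for word in words)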

Flask & SQLAlchemy & PostgreSQL - In a query can an 'int' be cast to a 'string' to permit use of 'like'

Using Flask and SQLAlchemy is it possible to create a query where a column can be cast from a number to a string so that .like() can be used as a filter?
The sample code below illustrates what I'm after; however, Test 3 is a broken statement (i.e. there is no attempt at casting, so the query fails; the error is below).
Test 1 - demonstrates a standard select
Test 2 - demonstrates a select using like on a string
Can 'test 3' be modified to permit a like on a number?
In PostgreSQL the SQL query would be:
SELECT * FROM mytable WHERE number::varchar like '%2%'
Any assistance gratefully appreciated.
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy import Table, Column, Integer, String

app = Flask(__name__)
app.debug = True
app.config.from_pyfile('config.py')
db = SQLAlchemy(app)

class MyTable(db.Model):
    '''My Sample Table'''
    __tablename__ = 'mytable'
    number = db.Column(db.Integer, primary_key=True)
    text = db.Column(db.String)

    def __repr__(self):
        return('MyTable( ' + str(self.number) + ', ' + self.text + ')')

test_1 = (db.session.query(MyTable)
          .all())
print "Test 1 = " + str(test_1)

test_2 = (db.session.query(MyTable)
          .filter(MyTable.text.like('%orl%'))
          .all())
print "Test 2 = " + str(test_2)

test_3 = (db.session.query(MyTable)
          .filter(MyTable.number.like('%2%'))
          .all())
And the sample data:
=> select * from mytable;
number | text
--------+-------
100 | Hello
20 | World
And the error:
Traceback (most recent call last):
File "sample.py", line 33, in <module>
.filter( MyTable.number.like( '%2%' ) )
File "/usr/lib64/python2.7/site-packages/sqlalchemy/orm/query.py", line 2320, in all
return list(self)
File "/usr/lib64/python2.7/site-packages/sqlalchemy/orm/query.py", line 2438, in __iter__
return self._execute_and_instances(context)
File "/usr/lib64/python2.7/site-packages/sqlalchemy/orm/query.py", line 2453, in _execute_and_instances
result = conn.execute(querycontext.statement, self._params)
File "/usr/lib64/python2.7/site-packages/sqlalchemy/engine/base.py", line 729, in execute
return meth(self, multiparams, params)
File "/usr/lib64/python2.7/site-packages/sqlalchemy/sql/elements.py", line 322, in _execute_on_connection
return connection._execute_clauseelement(self, multiparams, params)
File "/usr/lib64/python2.7/site-packages/sqlalchemy/engine/base.py", line 826, in _execute_clauseelement
compiled_sql, distilled_params
File "/usr/lib64/python2.7/site-packages/sqlalchemy/engine/base.py", line 958, in _execute_context
context)
File "/usr/lib64/python2.7/site-packages/sqlalchemy/engine/base.py", line 1159, in _handle_dbapi_exception
exc_info
File "/usr/lib64/python2.7/site-packages/sqlalchemy/util/compat.py", line 199, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb)
File "/usr/lib64/python2.7/site-packages/sqlalchemy/engine/base.py", line 951, in _execute_context
context)
File "/usr/lib64/python2.7/site-packages/sqlalchemy/engine/default.py", line 436, in do_execute
cursor.execute(statement, parameters)
sqlalchemy.exc.ProgrammingError: (ProgrammingError) operator does not exist: integer ~~ unknown
LINE 3: WHERE mytable.number LIKE '%2%'
^
HINT: No operator matches the given name and argument type(s). You might need to add explicit type casts.
'SELECT mytable.number AS mytable_number, mytable.text AS mytable_text \nFROM mytable \nWHERE mytable.number LIKE %(number_1)s' {'number_1': '%2%'}
Solved. The Query method filter can take an expression, so the solution is:
from sqlalchemy import cast, String

result = (db.session.query(MyTable)
          .filter(cast(MyTable.number, String).like('%2%'))
          .all())
With the result:
Test 3 = [MyTable( 20, World)]
Found the information in the SQLAlchemy Query API documentation.
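A hedged aside, not from the original post: printing the query object shows the SQL that cast() generates, which lines up with the hand-written PostgreSQL at the top of the question:
query = db.session.query(MyTable).filter(cast(MyTable.number, String).like('%2%'))
print str(query)
# roughly: SELECT mytable.number AS mytable_number, mytable.text AS mytable_text
#          FROM mytable WHERE CAST(mytable.number AS VARCHAR) LIKE %(param_1)s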

I just want to load 5GB from MySql into BigQuery

Long time no see. I want to get 5GB of data from MySQL into BigQuery. My best bet seems to be some sort of CSV export / import, which doesn't work for various reasons; see:
agile-coral-830:splitpapers1501200518aa150120052659
agile-coral-830:splitpapers1501200545aa150120055302
agile-coral-830:splitpapers1501200556aa150120060231
This is likely because I don't have the right MySQL incantation to generate perfect CSV in accordance with RFC 4180. However, instead of arguing RFC 4180 minutiae, this whole load business could be solved in five minutes by supporting customizable multi-character field separators and multi-character line separators. I'm pretty sure my data doesn't contain either ### or ###, so the following would work like a charm:
mysql> select * from $TABLE_NAME
into outfile '$DATA.csv'
fields terminated by '###'
enclosed by ''
lines terminated by '###'
$ bq load --nosync -F '###' -E '###' $TABLE_NAME $DATA.csv $SCHEMA.json
Edit: Fields contain '\n', '\r', ',' and '"'. They also contain NULLs, which MySql represents as [escape]N, in the example "N. Sample row:
"10.1.1.1.1483","5","9074080","Candidate high myopia loci on chromosomes 18p and 12q do not play a major role in susceptibility to common myopia","Results
There was no strong evidence of linkage of common myopia to these candidate regions: all two-point and multipoint heterogeneity LOD scores were < 1.0 and non-parametric linkage p-values were > 0.01. However, one Amish family showed slight evidence of linkage (LOD>1.0) on 12q; another 3 Amish families each gave LOD >1.0 on 18p; and 3 Jewish families each gave LOD >1.0 on 12q.
Conclusions
Significant evidence of linkage (LOD> 3) of myopia was not found on chromosome 18p or 12q loci in these families. These results suggest that these loci do not play a major role in the causation of common myopia in our families studied.","2004","BMC MEDICAL GENETICS","JOURNAL","N,"5","20","","","","0","1","USER","2007-11-19 05:00:00","rep1","PDFLib TET","0","2009-05-24 20:33:12"
I found loading through a CSV very difficult; there were more restrictions and complications. I have been messing around this morning with moving data from MySQL to BigQuery.
Below is a Python script that will build the table decorator and stream the data directly into the BigQuery table.
My DB is in the cloud, so you may need to change the connection string. Fill in the missing values for your particular situation, then call it with:
SQLToBQBatch(tableName, limit)
I put the limit in to test with. For my final test I sent 999999999 for the limit and everything worked fine.
I would recommend using a backend module to run anything over 5GB.
Use "RowToJSON" to clean up any invalid characters (i.e. anything non-UTF-8).
I haven't tested on 5GB, but it was able to do 50k rows in about 20 seconds. The same load via CSV took over 2 minutes.
I wrote this to test things, so please excuse the bad coding practices and mini hacks. It works, so feel free to clean it up for any production-level work.
import MySQLdb
import logging
from apiclient.discovery import build
from oauth2client.appengine import AppAssertionCredentials
import httplib2

OAUTH_SCOPE = 'https://www.googleapis.com/auth/bigquery'

PROJECT_ID =
DATASET_ID =
TABLE_ID =

SQL_DATABASE_NAME =
SQL_DATABASE_DB =
SQL_USER =
SQL_PASS =

def Connect():
    return MySQLdb.connect(unix_socket='/cloudsql/' + SQL_DATABASE_NAME, db=SQL_DATABASE_DB, user=SQL_USER, passwd=SQL_PASS)

def RowToJSON(cursor, row, fields):
    newData = {}
    for i, value in enumerate(row):
        try:
            if fields[i]["type"] == bqTypeDict["int"]:
                value = int(value)
            else:
                value = float(value)
        except:
            if value is not None:
                value = value.replace("\x92", "'") \
                             .replace("\x96", "'") \
                             .replace("\x93", '"') \
                             .replace("\x94", '"') \
                             .replace("\x97", '-') \
                             .replace("\xe9", 'e') \
                             .replace("\x91", "'") \
                             .replace("\x85", "...") \
                             .replace("\xb4", "'") \
                             .replace('"', '""')
        newData[cursor.description[i][0]] = value
    return newData

def GetBuilder():
    return build('bigquery', 'v2',
                 http=AppAssertionCredentials(scope=OAUTH_SCOPE).authorize(httplib2.Http()))

bqTypeDict = {'int': 'INTEGER',
              'varchar': 'STRING',
              'double': 'FLOAT',
              'tinyint': 'INTEGER',
              'decimal': 'FLOAT',
              'text': 'STRING',
              'smallint': 'INTEGER',
              'char': 'STRING',
              'bigint': 'INTEGER',
              'float': 'FLOAT',
              'longtext': 'STRING'
              }
def BuildFeilds(table):
    conn = Connect()
    cursor = conn.cursor()
    cursor.execute("DESCRIBE %s;" % table)
    tableDecorator = cursor.fetchall()
    fields = []
    for col in tableDecorator:
        field = {}
        field["name"] = col[0]
        colType = col[1].split("(")[0]
        if colType not in bqTypeDict:
            logging.warning("Unknown type detected, using string: %s", str(col[1]))
        field["type"] = bqTypeDict.get(colType, "STRING")
        if col[2] == "YES":
            field["mode"] = "NULLABLE"
        fields.append(field)
    return fields
def SQLToBQBatch(table, limit=3000):
    logging.info("****************************************************")
    logging.info("Starting SQLToBQBatch. Got: Table: %s, Limit: %i" % (table, limit))

    bqDest = GetBuilder()
    fields = BuildFeilds(table)

    try:
        responce = bqDest.datasets().insert(projectId=PROJECT_ID,
                                            body={'datasetReference': {'datasetId': DATASET_ID}}).execute()
        logging.info("Added Dataset")
        logging.info(responce)
    except Exception, e:
        logging.info(e)
        if ("Already Exists: " in str(e)):
            logging.info("Dataset already exists")
        else:
            logging.error("Error creating dataset: " + str(e), "Error")

    try:
        responce = bqDest.tables().insert(projectId=PROJECT_ID, datasetId=DATASET_ID,
                                          body={'tableReference': {'projectId': PROJECT_ID,
                                                                   'datasetId': DATASET_ID,
                                                                   'tableId': TABLE_ID},
                                                'schema': {'fields': fields}}
                                          ).execute()
        logging.info("Added Table")
        logging.info(responce)
    except Exception, e:
        logging.info(e)
        if ("Already Exists: " in str(e)):
            logging.info("Table already exists")
        else:
            logging.error("Error creating table: " + str(e), "Error")

    conn = Connect()
    cursor = conn.cursor()

    logging.info("Starting load loop")
    count = -1
    cur_pos = 0
    total = 0
    batch_size = 1000

    while count != 0 and cur_pos < limit:
        count = 0
        if batch_size + cur_pos > limit:
            batch_size = limit - cur_pos
        sqlCommand = "SELECT * FROM %s LIMIT %i, %i" % (table, cur_pos, batch_size)
        logging.info("Running: %s", sqlCommand)

        cursor.execute(sqlCommand)
        data = []
        for _, row in enumerate(cursor.fetchall()):
            data.append({"json": RowToJSON(cursor, row, fields)})
            count += 1
        logging.info("Read complete")

        if count != 0:
            logging.info("Sending request")
            insertResponse = bqDest.tabledata().insertAll(
                projectId=PROJECT_ID,
                datasetId=DATASET_ID,
                tableId=TABLE_ID,
                body={"rows": data}).execute()
            cur_pos += batch_size
            total += count
            logging.info("Done %i, Total: %i, Response: %s", count, total, insertResponse)
            if "insertErrors" in insertResponse:
                logging.error("Error inserting data index: %i", insertResponse["insertErrors"]["index"])
                for error in insertResponse["insertErrors"]["errors"]:
                    logging.error(error)
        else:
            logging.info("No more rows")
- Generate a Google service account key:
  - IAM & Admin > Service accounts > Create service account
  - Once created, create a key, download it, and save it to the project folder on the local machine as google_key.json
- Run the code in a PyCharm environment after installing the required packages.

NOTE: The table data in MySQL remains intact. Also, if you use the table preview in BigQuery you may not see the streamed data; go to the console and fire a query instead.

CODE:
import MySQLdb
from google.cloud import bigquery
import mysql.connector
import logging
import os
from MySQLdb.converters import conversions
import click
import MySQLdb.cursors
from google.cloud.exceptions import ServiceUnavailable
import sys

bqTypeDict = {'int': 'INTEGER',
              'varchar': 'STRING',
              'double': 'FLOAT',
              'tinyint': 'INTEGER',
              'decimal': 'FLOAT',
              'text': 'STRING',
              'smallint': 'INTEGER',
              'char': 'STRING',
              'bigint': 'INTEGER',
              'float': 'FLOAT',
              'longtext': 'STRING',
              'datetime': 'TIMESTAMP'
              }

def conv_date_to_timestamp(str_date):
    import time
    import datetime

    date_time = MySQLdb.times.DateTime_or_None(str_date)
    unix_timestamp = (date_time - datetime.datetime(1970, 1, 1)).total_seconds()
    return unix_timestamp

def Connect(host, database, user, password):
    return mysql.connector.connect(host='',
                                   port='',
                                   database='recommendation_spark',
                                   user='',
                                   password='')

def BuildSchema(host, database, user, password, table):
    logging.debug('build schema for table %s in database %s' % (table, database))
    conn = Connect(host, database, user, password)
    cursor = conn.cursor()
    cursor.execute("DESCRIBE %s;" % table)

    tableDecorator = cursor.fetchall()
    schema = []
    for col in tableDecorator:
        colType = col[1].split("(")[0]
        if colType not in bqTypeDict:
            logging.warning("Unknown type detected, using string: %s", str(col[1]))
        field_mode = "NULLABLE" if col[2] == "YES" else "REQUIRED"
        field = bigquery.SchemaField(col[0], bqTypeDict.get(colType, "STRING"), mode=field_mode)
        schema.append(field)
    return tuple(schema)

def bq_load(table, data, max_retries=5):
    logging.info("Sending request")
    uploaded_successfully = False
    num_tries = 0
    while not uploaded_successfully and num_tries < max_retries:
        try:
            insertResponse = table.insert_data(data)
            for row in insertResponse:
                if 'errors' in row:
                    logging.error('not able to upload data: %s', row['errors'])
            uploaded_successfully = True
        except ServiceUnavailable as e:
            num_tries += 1
            logging.error('insert failed with exception trying again retry %d', num_tries)
        except Exception as e:
            num_tries += 1
            logging.error('not able to upload data: %s', str(e))

@click.command()
@click.option('-h', '--host', default='tempus-qa.hashmapinc.com', help='MySQL hostname')
@click.option('-d', '--database', required=True, help='MySQL database')
@click.option('-u', '--user', default='root', help='MySQL user')
@click.option('-p', '--password', default='docker', help='MySQL password')
@click.option('-t', '--table', required=True, help='MySQL table')
@click.option('-i', '--projectid', required=True, help='Google BigQuery Project ID')
@click.option('-n', '--dataset', required=True, help='Google BigQuery Dataset name')
@click.option('-l', '--limit', default=0, help='max num of rows to load')
@click.option('-s', '--batch_size', default=1000, help='max num of rows to load')
@click.option('-k', '--key', default='key.json', help='Location of google service account key (relative to current working dir)')
@click.option('-v', '--verbose', default=0, count=True, help='verbose')
def SQLToBQBatch(host, database, user, password, table, projectid, dataset, limit, batch_size, key, verbose):
    # set to max verbose level
    verbose = verbose if verbose < 3 else 3
    loglevel = logging.ERROR - (10 * verbose)
    logging.basicConfig(level=loglevel)
    logging.info("Starting SQLToBQBatch. Got: Table: %s, Limit: %i", table, limit)

    ## set env key to authenticate application
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.join(os.getcwd(), key)
    print('file found')

    # Instantiates a client
    bigquery_client = bigquery.Client()
    print('Project id created')

    try:
        bq_dataset = bigquery_client.dataset(dataset)
        bq_dataset.create()
        logging.info("Added Dataset")
    except Exception as e:
        if ("Already Exists: " in str(e)):
            logging.info("Dataset already exists")
        else:
            logging.error("Error creating dataset: %s Error", str(e))

    bq_table = bq_dataset.table(table)
    bq_table.schema = BuildSchema(host, database, user, password, table)
    print('Creating schema using build schema')
    bq_table.create()
    logging.info("Added Table %s", table)

    conn = Connect(host, database, user, password)
    cursor = conn.cursor()

    logging.info("Starting load loop")
    cursor.execute("SELECT * FROM %s" % (table))

    cur_batch = []
    count = 0
    for row in cursor:
        count += 1
        if limit != 0 and count >= limit:
            logging.info("limit of %d rows reached", limit)
            break
        cur_batch.append(row)
        if count % batch_size == 0 and count != 0:
            bq_load(bq_table, cur_batch)
            cur_batch = []
            logging.info("processed %i rows", count)

    # send last elements
    bq_load(bq_table, cur_batch)
    logging.info("Finished (%i total)", count)
    print("table created")

if __name__ == '__main__':
    # run the command
    SQLToBQBatch()
Command to run the file: python mysql_to_bq.py -d 'recommendation_spark' -t temp_market_store -i inductive-cocoa-250507 -n practice123 -k key.json