ValueError: NaTType does not support timetuple when converting a dataframe to dictionary using to_dict('records') - pandas

I'm running this flask app
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS, cross_origin
import json
import pandas as pd
# Create the app object
app = Flask(__name__)
cors = CORS(app, resources= {r"/*": {'origins' : "*"}})
# importing function for calculations
from Record_Matching import Matching
# NOTE(review): the two decorators below were present only as comments
# ("#app.route", "#cross_origin()"), which leaves the view unregistered.
# They are restored here on the assumption that the leading "@" was lost.
@app.route("/query", methods=["GET"])
@cross_origin()
def query():
    """Run the record-matching job described by the request's query string.

    Reads ``query1``, ``query2``, ``querycolumns``, ``project_id``,
    ``service_account``, ``SS`` and ``TT`` from ``request.args`` and forwards
    them to ``Matching``.

    Returns:
        Whatever ``Matching`` returns, unchanged.
    """
    args = request.args
    query1 = args.get('query1', type=str)
    query2 = args.get('query2', type=str)
    querycolumns = args.get('querycolumns')
    project_id = args.get('project_id', type=str)
    service_account_creds = args.get('service_account')
    # ``type=float`` makes Flask return None when the value is missing or
    # cannot be parsed as a float.
    SS = args.get('SS', type=float)
    TT = args.get('TT', type=float)
    return Matching(query1, query2, SS, TT, service_account_creds,
                    project_id, querycolumns)
if __name__ == "__main__":
app.run(host="localhost", port=8080, debug=True)
and I'm importing the Matching function from this Python script:
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import recordlinkage
from recordlinkage.preprocessing import phonetic
from pandas.io.json import json_normalize
import uuid
from uuid import uuid4
import random
import string
import json
import ast
# Results to data frame function
def gcp2df(sql, client):
    """Run ``sql`` through ``client`` and return the rows as a dataframe.

    Calls ``client.query(sql)``, takes its ``.result()`` and converts that
    with ``.to_dataframe()``.
    """
    return client.query(sql).result().to_dataframe()
# Exporting df to bigquery - table parameter example: "dataset.tablename"
# def insert(df, table):
# client = bigquery.Client()
# job_config = bigquery.LoadJobConfig(write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)
# return client.load_table_from_dataframe(df, table, job_config = job_config)
def _replace_non_ascii(df, column):
    """Overwrite non-ASCII strings in ``df[column]`` with random 5-letter words.

    The column must already hold ``str`` values. Rows are selected with a
    boolean mask and written through ``.loc`` so the assignment does not
    depend on chained indexing or on the frame having a 0..n-1 index.
    """
    non_ascii = ~df[column].map(str.isascii).astype(bool)
    if non_ascii.any():
        df.loc[non_ascii, column] = [
            ''.join(random.choices(string.ascii_lowercase, k=5))
            for _ in range(int(non_ascii.sum()))
        ]


def pair(df1, df2, TT, querycolumns):
    """Compute record-linkage comparison features between ``df1`` and ``df2``.

    ``querycolumns`` is read as a flat list of consecutive column pairs
    ``[df1_col, df2_col, df1_col, df2_col, ...]``. The first pair is used
    to build a soundex code and an "initials" blocking key (first + last
    character); every pair is then compared with Jaro-Winkler.

    Both input frames are modified in place: the first column of the pair is
    cast to ``str``, non-ASCII values in it are replaced by random lowercase
    strings, and ``phonetic_given_name`` / ``initials`` columns are added.

    Args:
        df1: Left dataframe.
        df2: Right dataframe.
        TT: Threshold passed to every Jaro-Winkler string comparison.
        querycolumns: Flat list of paired column names (see above).

    Returns:
        The result of ``Compare.compute`` over the candidate links produced
        by blocking on ``initials``.
    """
    L = querycolumns
    left_name, right_name = L[0], L[1]

    # Soundex needs plain English text, so non-ASCII names are swapped for
    # random ASCII strings before the phonetic encoding.
    df1[left_name] = df1[left_name].astype(str)
    df2[right_name] = df2[right_name].astype(str)
    _replace_non_ascii(df1, left_name)
    _replace_non_ascii(df2, right_name)

    df1["phonetic_given_name"] = phonetic(df1[left_name], "soundex")
    df2["phonetic_given_name"] = phonetic(df2[right_name], "soundex")
    df1["initials"] = df1[left_name].str[0] + df1[left_name].str[-1]
    df2["initials"] = df2[right_name].str[0] + df2[right_name].str[-1]

    indexer = recordlinkage.Index()
    indexer.block('initials')
    candidate_links = indexer.index(df1, df2)

    compare = recordlinkage.Compare()
    compare.exact('phonetic_given_name', 'phonetic_given_name',
                  label="phonetic_given_name")
    # Walk the list two items at a time. zip() stops at the last complete
    # pair, so an odd-length list no longer indexes past the end (the old
    # ``while p2 <= l`` loop did).
    for left_col, right_col in zip(L[0::2], L[1::2]):
        compare.string(left_col, right_col, method='jarowinkler',
                       threshold=TT, label=left_col)

    return compare.compute(candidate_links, df1, df2)
def _records_without_nulls(df):
    """Return ``df`` as a list of row dicts with missing values as ``None``.

    ``NaT`` / ``NaN`` cells are replaced by ``None`` first. A JSON encoder
    that treats ``NaT`` as a datetime fails with
    "NaTType does not support timetuple"; ``None`` serialises as ``null``.
    """
    return df.astype(object).where(df.notna(), None).to_dict('records')


def Matching(query1, query2, SS, TT, service_account_creds, project_id, querycolumns):
    """Match the rows returned by two BigQuery queries and report the links.

    Args:
        query1: SQL text producing the left dataframe.
        query2: SQL text producing the right dataframe.
        SS: Minimum summed feature score a candidate pair needs to be kept;
            converted with ``int()``.
        TT: Jaro-Winkler threshold forwarded to ``pair``; converted with
            ``float()``.
        service_account_creds: String holding a Python-literal dict with the
            service-account key (parsed with ``ast.literal_eval``).
        project_id: Project id handed to ``bigquery.Client``.
        querycolumns: JSON object string; its values, in order, are the
            paired column names expected by ``pair``.

    Returns:
        dict with ``message`` (summary text), ``final`` (kept index pairs as
        records) and ``df1`` (the left dataframe as records, with missing
        values converted to ``None``).
    """
    creds_info = ast.literal_eval(service_account_creds)
    # Build the credentials from the key dict and actually hand them to the
    # client; previously they were constructed but never used, so the client
    # silently fell back to whatever default credentials the process had.
    credentials = service_account.Credentials.from_service_account_info(creds_info)
    client = bigquery.Client(credentials=credentials, project=project_id)

    SS = int(SS)
    TT = float(TT)
    df1 = gcp2df(query1, client)
    df2 = gcp2df(query2, client)

    columns = list(json.loads(querycolumns).values())
    features = pair(df1, df2, TT, columns)
    features['Similarity_score'] = features.sum(axis=1)
    matches = features[features['Similarity_score'] >= SS].reset_index()

    # rename() without inplace returns a new frame, so the column
    # assignments below do not write into a slice of ``matches``.
    final = matches[['level_0', 'level_1']].rename(
        columns={'level_0': 'df1_index', 'level_1': 'df2_index'})
    final['Unique_ID'] = [str(uuid.uuid4()) for _ in range(len(final))]
    # NOTE(review): this stores the threshold SS, not each pair's computed
    # 'Similarity_score' — confirm that is intended.
    final['Similarity_Score'] = SS
    final_duplicates = final['df1_index'].value_counts().max()

    message = ("Mission accomplished!, your highest number of duplicates is "
               + str(final_duplicates))
    return {
        'message': message,
        'final': final.to_dict('records'),
        'df1': _records_without_nulls(df1),
    }
I'm not sure why returning df1 as a dictionary raises a ValueError when I call the function from the Flask app, while converting the same dataframe (taken from BigQuery) in a Jupyter notebook works just fine. Why does it not work in the Flask app?
I tried to_dict('records') to convert the dataframe to a dictionary.
Looking online, many resources suggest the error occurs because the data contains missing values, but that shouldn't be a problem, because converting the same dataframe to a dictionary in a Jupyter notebook works just fine.

Related

Pandas combine mutilple columns in a BQ table to generate payload for FB conversions api

I am reading from a bigquery table to generate a payload to upload to FB conversions api.
cols=["payload","client_user_agent","event_source_url"]
I am copying the column values directly from the BigQuery table because I am unable to print the full output of the dataframe in the notebook.
payload="{"pageDetail":{"pageName":"Confirmation","pageContentType":"cart","pageSiteSection":"cart","breadcrumbs":[{"title":"Home","url":"/en/home.html"},{"title":"Cart","url":"/cart"},{"title":"Confirmation","url":"/order-confirmation="}],"pageCategory":"Home","pageCategory1":"Cart","pageCategory2":"Confirmation","proBtbGlobalHeader":false},"orderDetails":{"hceid":"3b94a","orderConfirmed":true,"orderDate":"2021-01-15","orderId":"0123","unique":2,"pricingSummary":{"total":54.01},"items":[{"productId":"0456","quantity":1,"shippingAddress":{"postalCode":"V4N 3X3"},"promotion":{"voucherCode":null},"clickToInstall":{"eligible":false}},{"productId":"0789","quantity":1,"fulfillment":{"fulfillmentCost":""},"shippingAddress":{"postalCode":"A4N 3Y3"},"promotion":{"voucherCode":null},"clickToInstall":{"eligible":false}}],"billingAddress":{"postalCode":"M$X1A7"}},"event":{"type":"Load","page":"Confirmation","timestamp":1610706772998,"language":"English","url":"https://www"}}"
client_user_agent="Mozilla/5.0"
event_source_url= "https://www.def.com="
I need the value for email=[orderDetails][hceid] and value=["orderDetails"]["pricingSummary"]["total"]
Initially all the payload I wanted was in a single column and I was able to achieve the uploads with the following code
import time
from facebook_business.adobjects.serverside.event import Event
from facebook_business.adobjects.serverside.event_request import EventRequest
from facebook_business.adobjects.serverside.user_data import UserData
from facebook_business.adobjects.serverside.custom_data import CustomData
from facebook_business.api import FacebookAdsApi
import pandas as pd
import json
FacebookAdsApi.init(access_token=access_token)
query='''SELECT JSON_EXTRACT(payload, '$') AS payload FROM `project.dataset.events` WHERE eventType = 'Page Load' AND pagename = "Confirmation" limit 1'''
df = pd.read_gbq(query, project_id= project, dialect='standard')
payload = df.to_dict(orient="records")
for i in payload:
#print(type(i["payload"]))
k = json.loads(i["payload"])
email = k["orderDetails"]["hcemuid"]
user_data = UserData(email)
value=k["orderDetails"]["pricingSummary"]["total"]
order_id = k["orderDetails"]["orderId"]
custom_data = CustomData(
currency='CAD',
value=value)
event = Event(
event_name='Purchase',
event_time=int(time.time()),
user_data=user_data,
custom_data=custom_data,
event_id = order_id,
data_processing_options= [])
events = [event]
#print(events)
event_request = EventRequest(
events=events,
test_event_code='TEST8609',
pixel_id=pixel_id)
#print(event_request)
a=event_request.execute()
print(a)
Now there are additional values that live in two separate columns of the BigQuery table: client_user_agent, which needs to be part of the user data, and event_source_url, which needs to be part of the event in the code above.
I tried similar code to the above with multiple columns, but I receive the following error:
TypeError: Object of type Series is not JSON serializable
So I tried concatenating the columns and then creating a JSON-serializable object, but I am still not able to do an upload.
Below is where I am stuck; I am not sure how to proceed further, and any input is appreciated.
import time
from facebook_business.adobjects.serverside.event import Event
from facebook_business.adobjects.serverside.event_request import EventRequest
from facebook_business.adobjects.serverside.user_data import UserData
from facebook_business.adobjects.serverside.custom_data import CustomData
from facebook_business.api import FacebookAdsApi
import pandas as pd
import json
FacebookAdsApi.init(access_token=access_token)
query='''SELECT payload AS payload,location.userAgent as client_user_agent,location.referrer as event_source_url FROM `project.Dataset.events` WHERE eventType = 'Page Load' AND pagename = "Confirmation" limit 1'''
df = pd.read_gbq(query, project_id= project, dialect='standard')
df.reset_index(drop=True, inplace=True)
payload = df.to_dict(orient="records")
print(payload)
## cols = ['payload', 'client_user_agent', 'event_source_url']
## df['combined'] = df[cols].apply(lambda row: ','.join(row.values.astype(str)), axis=1)
## del df["payload"]
## del df["client"]
## del df["source"]
## payload = df.to_dict(orient="records")
#tried concatinating all columns in a the dataframe but not able to create a valid json object for upload
columns = ['payload', 'client_user_agent', 'event_source_url']
df['payload'] = df['payload'].str.replace(r'}"$', '')
payload = df[columns].to_dict(orient='records')
print(payload)
## df = df.drop(columns=columns)
## pd.options.display.max_rows = 4000
# #print(payload)
# for i in payload:
# print(i["payload"])
# k = json.loads(i["payload"])
# email = k["orderDetails"]["hcemuid"]
# print(email)
I am following the instructions from this page:https://developers.facebook.com/docs/marketing-api/conversions-api
I ended up using the BigQuery json_extract_scalar function to extract the data from the nested column instead of doing it in pandas, which is a better solution for my scenario.

Writing Data from pandas dataframe to PostgreSQL gives error of 'DataFrame' objects are mutable, thus they cannot be hashed

I am trying to save a dataframe that was first imported into pandas from PostgreSQL as dfraw. I do some manipulation, create another dataframe called df, and save it back to the same PostgreSQL database using SQLAlchemy. But when I try to save it, I get the error "'DataFrame' objects are mutable, thus they cannot be hashed".
Please find the code below:
import psycopg2
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy import create_engine
# connect the database to python
# Update connection string information
host = "something.something.azure.com"
dbname = "abcd"
user = "abcd"
password = "abcd"
sslmode = "require"
schema = 'xyz'
# Construct connection string
conn_string = "host={0} user={1} dbname={2} password={3} sslmode={4}".format(host, user, dbname, password, sslmode)
conn = psycopg2.connect(conn_string)
print("Connection established")
cursor = conn.cursor()
# Fetch all rows from table
cursor.execute("SELECT * FROM xyz.abc;")
rows = cursor.fetchall()
# Convert the tuples in dataframes
dfraw = pd.DataFrame(rows, columns =["ID","Timestamp","K","S","H 18","H 19","H 20","H 21","H 22","H 23","H 24","H 2zzz","H zzz4","H zzzzzz","H zzz6","H zzz7","H zzz8","H zzz9","H 60","H zzz0","H zzz2"])
dfraw[["S","H 18","H 19","H 20","H 21","H 22","H 23","H 24","H 2zzz","H zzz4","H zzzzzz","H zzz6","H zzz7","H zzz8","H zzz9","H 60","H zzz0","H zzz2"]] = dfraw[["S","H 18","H 19","H 20","H 21","H 22","H 23","H 24","H 2zzz","H zzz4","H zzzzzz","H zzz6","H zzz7","H zzz8","H zzz9","H 60","H zzz0","H zzz2"]].apply(pd.to_numeric)
dfraw[["Timestamp","K"]]=dfraw[["Timestamp","K"]].apply(pd.to_datetime)
# Creating temp files
temp1 = dfraw
dfraw = temp1
# creating some fucntions for data manipulation and imputations
def remZero(df, dropCol):
    """Forward-fill zero readings in every column not listed in ``dropCol``.

    For each remaining column, a zero is replaced by the closest preceding
    non-zero value. Columns that are entirely zero, or contain no zero, are
    left alone, and leading zeros (nothing before them to copy) stay zero.
    The name of every column that gets modified is printed.

    Args:
        df: Dataframe to clean; it is modified in place.
        dropCol: Column label(s) to leave untouched. Labels that are not in
            ``df`` are ignored instead of raising ``KeyError``.

    Returns:
        The same ``df`` object.
    """
    for k in df.drop(columns=dropCol, errors='ignore'):
        column = df[k]
        is_zero = (column == 0).to_numpy(dtype=bool)
        if is_zero.all() or not is_zero.any():
            continue
        print(k)
        # ``Series.replace(0, method='ffill')`` is deprecated in pandas, so
        # the fill is done explicitly: for each row find the position of the
        # last non-zero row at or before it and copy that value.
        positions = np.arange(len(column))
        last_nonzero = np.maximum.accumulate(np.where(is_zero, -1, positions))
        # -1 marks leading zeros with no predecessor: keep their own value.
        source = np.where(last_nonzero < 0, positions, last_nonzero)
        df[k] = pd.Series(column.to_numpy()[source], index=column.index,
                          name=column.name)
    return df
# Drop Columns function
dropCol = ['Timestamp','K','ID','H','C','S']
dropCol2 = ['Timestamp','K','ID','Shift']
df = remZero(dfraw,dropCol)
from sqlalchemy import create_engine
engine = create_engine('postgresql://abcd:abcd#something.something.azure.com:5432/abcd')
df.to_sql(name = df,
con=engine,
index = False,
if_exists= 'replace'
)
Error Message
I found a basic error in the code: I forgot the quotation marks around the table name, so the DataFrame object itself was passed as `name`. The table name must be a string:
df.to_sql(name = "df",
con=engine,
index = False,
if_exists= 'replace'
)

could not convert string to float in python

I am trying to run a principal component analysis on a CSV file, but when I run the code I get this error:
C:\Users\Lenovo\Desktop>python pca.py
ValueError: could not convert string to float: Annee;NET;INT;SUB;LMT;DCT;IMM;EXP;VRD
this is my cvs file
I tried removing all spaces and anything else that looked suspicious, without success.
This is my Python script; I don't know what I am missing.
Note: I run this code under Python 2.7.
from sklearn.externals import joblib
import numpy as np
import glob
import os
import time
import numpy
my_matrix = numpy.loadtxt(open("pca.csv","rb"),delimiter= ",",skiprows=0)
def pca(dataMat, r, autoset_r=False, autoset_rate=0.9):
    """Principal components analysis of ``dataMat``.

    Centres the data on its column means, eigen-decomposes the covariance of
    the centred data, keeps the ``r`` eigenvectors with the largest
    eigenvalues and projects onto them. The kept eigenvectors and the column
    means are dumped with ``joblib`` under ``./pca_args_save/``.

    Args:
        dataMat: Input data; means are taken along axis 0.
        r: Number of eigenvectors to keep.
        autoset_r: When true, ``r`` is replaced by
            ``autoset_eigNum(eigVals, autoset_rate)``.
        autoset_rate: Rate forwarded to ``autoset_eigNum``.

    Returns:
        ``(projected, reconstructed)``: the data projected onto the kept
        eigenvectors, and that projection mapped back with the means added.
    """
    print("Start to do PCA...")
    started = time.time()

    column_means = np.mean(dataMat, axis=0)
    centered = dataMat - column_means
    covariance = np.cov(centered, rowvar=0)
    eigVals, eigVects = np.linalg.eig(np.mat(covariance))
    # Negating the eigenvalues makes argsort order them largest-first.
    order = np.argsort(-eigVals)

    if autoset_r:
        r = autoset_eigNum(eigVals, autoset_rate)
        print("autoset: take top {} of {} features".format(r, centered.shape[1]))

    basis = eigVects[:, order[:r]]
    # NOTE(review): ``*`` is presumably a matrix product here because the
    # eigenvectors come from an ``np.mat`` input — confirm.
    projected = centered * basis
    reconstructed = (projected * basis.T) + column_means

    finished = time.time()
    print("PCA takes %f seconds" % (finished - started))

    joblib.dump(basis, './pca_args_save/r_eigVect.eig')
    joblib.dump(column_means, './pca_args_save/meanVal.mean')
    return projected, reconstructed
def autoset_eigNum(eigValues, rate=0.99):
    """Return how many of the largest eigenvalues are needed to reach ``rate``.

    The eigenvalues are sorted in descending order and accumulated until the
    running sum divided by the total is at least ``rate``.

    Args:
        eigValues: Array of eigenvalues; must provide ``.sum()``.
        rate: Fraction of the total that the kept eigenvalues must reach.

    Returns:
        The count of eigenvalues kept. If the rate is never reached the full
        length is returned; for empty input the result is 0 (previously this
        raised ``UnboundLocalError``).
    """
    ordered = sorted(eigValues, reverse=True)
    total = eigValues.sum()
    running = 0
    # Keep a running sum instead of re-summing the prefix on every step.
    for count, value in enumerate(ordered, start=1):
        running += value
        if running / total >= rate:
            return count
    return len(ordered)
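It seems that NumPy is failing to parse your header row as floats.
Try setting skiprows=1 in your np.loadtxt call in order to skip the table header. Also note that the header shown in the error message is separated by ';', so if the data rows use the same separator you will probably need delimiter=';' instead of ',' as well.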

Unable to reload data as a csv file from IPython Notebook

I have the following IPython Notebook, in which I am trying to access the movie database of the Rotten Tomatoes website.
However, Rotten Tomatoes limits users to 10,000 API requests a day.
I don't want to re-run this function every time I restart the notebook, so I am trying to save the data to a CSV file and reload it from there. When I convert the data to a CSV file, the cell shows the processing symbol [*] in the IPython notebook, and after some time I get the following error:
ConnectionError: HTTPConnectionPool(host='api.rottentomatoes.com', port=80): Max retries exceeded with url: /api/public/v1.0/movie_alias.json?apikey=5xr26r2qtgf9h3kcq5kt6y4v&type=imdb&id=0113845 (Caused by <class 'socket.gaierror'>: [Errno 11002] getaddrinfo failed)
Is this problem due to slow internet connection? Should I make some changes to my code? Kindly help me with this.
The code for the file is shown below:
%matplotlib inline
import json
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
api_key = '5xr26r2qtgf9h3kcq5kt6y4v'
movie_id = '770672122' # toy story 3
url = 'http://api.rottentomatoes.com/api/public/v1.0/movies/%s/reviews.json' % movie_id
#these are "get parameters"
options = {'review_type': 'top_critic', 'page_limit': 20, 'page': 1, 'apikey': api_key}
data = requests.get(url, params=options).text
data = json.loads(data) # load a json string into a collection of lists and dicts
print json.dumps(data['reviews'][0], indent=2) # dump an object into a json string
from io import StringIO
movie_txt = requests.get('https://raw.github.com/cs109/cs109_data/master/movies.dat').text
movie_file = StringIO(movie_txt) # treat a string like a file
movies = pd.read_csv(movie_file,delimiter='\t')
movies
#print the first row
movies[['id', 'title', 'imdbID', 'year']]
def base_url():
    """Return the URL prefix that the API endpoint paths are appended to."""
    root = 'http://api.rottentomatoes.com/api/public/v1.0/'
    return root
def rt_id_by_imdb(imdb):
    """Look up the RT id for an IMDB id through the ``movie_alias.json`` API.

    The IMDB id is formatted as a zero-padded 7-digit integer before it is
    sent. The ``'id'`` field of the decoded JSON response is returned; a
    response without that field raises ``KeyError``.
    """
    padded_id = "%7.7i" % imdb
    query = {'id': padded_id, 'type': 'imdb', 'apikey': api_key}
    response = requests.get(base_url() + 'movie_alias.json', params=query)
    decoded = json.loads(response.text)
    return decoded['id']
def _imdb_review(imdb):
    """Fetch the first page of top-critic reviews for a movie by IMDB id.

    Resolves the RT id with ``rt_id_by_imdb`` and requests
    ``movies/<rtid>/reviews.json``.

    Returns:
        A list of dicts, one per review, with the keys ``fresh``, ``quote``,
        ``critic``, ``publication``, ``review_date``, ``imdb`` and ``rtid``.
    """
    rtid = rt_id_by_imdb(imdb)
    endpoint = base_url() + 'movies/{0}/reviews.json'.format(rtid)
    query = {
        'review_type': 'top_critic',
        'page_limit': 20,
        'page': 1,
        'country': 'us',
        'apikey': api_key,
    }
    payload = json.loads(requests.get(endpoint, params=query).text)

    reviews = []
    for review in payload['reviews']:
        reviews.append({
            'fresh': review['freshness'],
            'quote': review['quote'],
            'critic': review['critic'],
            'publication': review['publication'],
            'review_date': review['date'],
            'imdb': imdb,
            'rtid': rtid,
        })
    return reviews
def fetch_reviews(movies, row):
    """Return the reviews for the movie at position ``row`` as a dataframe.

    Args:
        movies: Dataframe with ``imdbID`` and ``title`` columns.
        row: Integer position of the movie within ``movies``.

    Returns:
        A dataframe built from ``_imdb_review`` with a ``title`` column
        added, or ``None`` when a ``KeyError`` is raised while fetching.
    """
    # ``DataFrame.irow`` was removed from pandas; ``.iloc`` is the positional
    # row accessor that replaces it.
    m = movies.iloc[row]
    try:
        result = pd.DataFrame(_imdb_review(m['imdbID']))
        result['title'] = m['title']
    except KeyError:
        return None
    return result
def build_table(movies, rows):
    """Concatenate the review tables of the first ``rows`` movies.

    Calls ``fetch_reviews`` for positions ``0 .. rows-1``, drops the ``None``
    results and joins the rest with ``pd.concat(..., ignore_index=True)``.
    """
    collected = []
    for position in range(rows):
        reviews = fetch_reviews(movies, position)
        if reviews is not None:
            collected.append(reviews)
    return pd.concat(collected, ignore_index=True)
critics = build_table(movies, 3000)
critics.to_csv('critics.csv', index=False)
critics = pd.read_csv('critics.csv')

Concatenate Data From URLS Recursively Inside one DataFrame

I'm trying to create one dataframe with data from multiple URLs that I'm scraping. The code works, but I'm unable to accumulate the data in a single DataFrame: the DataFrame (called frame) is replaced with the new URL's data on each iteration instead of having the new data concatenated to the same frame. Thank you, I deeply appreciate your help!
import urllib
import re
import json
import pandas
import pylab
import numpy
import matplotlib.pyplot
from pandas import *
from pylab import *
from threading import Thread
import sqlite3
urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226', 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944', 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']
i=0
regex = '<p class="commentText">(.+?)</p>'
regex2 = '<strong>Easiness</strong><span>(.+?)</span></p>'
regex3 = 'Helpfulness</strong><span>(.+?)</span></p>'
regex4 = 'Clarity</strong><span>(.+?)</span></p>'
regex5 = 'Rater Interest</strong><span>(.+?)</span></p>'
regex6 = '<div class="date">(.+?)</div>'
regex7 = '<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'
regex8 = '<meta name="prof_name" content="(.+?)"/>'
pattern = re.compile(regex)
easiness = re.compile(regex2)
helpfulness = re.compile(regex3)
clarity = re.compile(regex4)
interest = re.compile(regex5)
date = re.compile(regex6)
mathclass = re.compile(regex7)
prof_name = re.compile(regex8)
while i < len(urls):
htmlfile = urllib.urlopen(urls[i])
htmltext = htmlfile.read()
content = re.findall(pattern,htmltext)
Easiness = re.findall(easiness,htmltext)
Helpfulness = re.findall(helpfulness, htmltext)
Clarity = re.findall(clarity, htmltext)
Interest = re.findall(interest, htmltext)
Date = re.findall(date, htmltext)
Class = re.findall(mathclass, htmltext)
PROFNAME=re.findall(prof_name, htmltext)
i+=1
frame = DataFrame({'Comments': content, 'Easiness': Easiness, 'Helpfulness': Helpfulness,
'Clarity': Clarity, 'Rater Interest': Interest, 'Class': Class,
'Date': Date[1:len(Date)], 'Professor': PROFNAME[0]})
print frame
Use pd.concat:
frames = []
while i < len(urls):
htmlfile = urllib.urlopen(urls[i])
htmltext = htmlfile.read()
content = re.findall(pattern,htmltext)
Easiness = re.findall(easiness,htmltext)
Helpfulness = re.findall(helpfulness, htmltext)
Clarity = re.findall(clarity, htmltext)
Interest = re.findall(interest, htmltext)
Date = re.findall(date, htmltext)
Class = re.findall(mathclass, htmltext)
PROFNAME=re.findall(prof_name, htmltext)
i+=1
frames.append(DataFrame({'Comments': content, 'Easiness': Easiness, 'Helpfulness': Helpfulness,
'Clarity': Clarity, 'Rater Interest': Interest, 'Class': Class,
'Date': Date[1:len(Date)], 'Professor': PROFNAME[0]}))
pd.concat(frames)
You are overwriting your frame on each iteration of the loop. As Phillip Cloud suggested, you can make a list of frames and append to it on each iteration. I simplified your code in a different way, but I think this gives you what you want.
import urllib
import re
import pandas as pd
urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226',
'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944',
'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459',
'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779',
'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']
regex = {'pattern' : re.compile('<p class="commentText">(.+?)</p>'),
'easiness' : re.compile('<strong>Easiness</strong><span>(.+?)</span></p>'),
'helpfulness' : re.compile('Helpfulness</strong><span>(.+?)</span></p>'),
'clarity' : re.compile('Clarity</strong><span>(.+?)</span></p>'),
'interest' : re.compile('Rater Interest</strong><span>(.+?)</span></p>'),
'date' : re.compile('<div class="date">(.+?)</div>'),
'mathclass' : re.compile('<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'),
'prof_name' : re.compile('<meta name="prof_name" content="(.+?)"/>')}
# Make a dictionary with empty lists using the same keys
d = {}
for k in regex.keys():
d[k] = []
# Now fill those lists
for url in urls:
htmlfile = urllib.urlopen(url)
htmltext = htmlfile.read()
for k, v in regex.iteritems():
d[k].append(re.findall(v, htmltext))
frame = pd.DataFrame(d) # Dump the dict into a DataFrame
print frame