Convert PGN database to pandas DataFrame - chess

Hello!
I'm using chess.pgn to convert a chess database into a DataFrame. To read the nth game from the database, do I need to read all the previous ones first? Can't I jump directly to game n? If I want to distribute the processing of a database with 10^8 games, can't I start reading at the 9e7-th game?
import pandas as pd
import chess.pgn

nome_arquivo = "Analises_01.pgn"
inicio = 0
numero_jogos = int(1.47e8)  # range() needs an int, not a float
arquivo = open(nome_arquivo, encoding="utf8")
ratings = []
for j in range(numero_jogos):
    first_game = chess.pgn.read_game(arquivo)
    if j >= inicio:
        try:
            Brancas = int(first_game.headers["WhiteElo"])
            Pretas = int(first_game.headers["BlackElo"])
            ratings.append([Brancas, Pretas])
        except (KeyError, ValueError):  # games without usable Elo headers
            pass
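As for the question itself: PGN is a plain-text format with variable-length games, so there is no way to seek straight to game n without having scanned the earlier games at least once. What you can do is build a byte-offset index in one cheap pass and seek() directly afterwards, so each worker can jump to its own slice. A minimal sketch, assuming a reasonably recent python-chess that provides chess.pgn.read_headers (which parses only the headers and skips the moves):
import chess.pgn

def build_offset_index(path):
    offsets = []
    with open(path, encoding="utf8") as handle:
        while True:
            offset = handle.tell()
            headers = chess.pgn.read_headers(handle)  # parses headers only
            if headers is None:  # end of file
                break
            offsets.append(offset)
    return offsets

offsets = build_offset_index("Analises_01.pgn")
with open("Analises_01.pgn", encoding="utf8") as handle:
    handle.seek(offsets[90_000_000])  # jump straight to the 9e7-th game
    game = chess.pgn.read_game(handle)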

I hope this code can help you. I didn't use pandas or a DataFrame, sorry. It just builds a list indexing all the PGN games, so game_index[n] will return the string of game number n+1.
PGN = open('your_pgn_path_here.pgn')
text_PGN = PGN.read()
game_index = []
actual_game = ''
for string in text_PGN:
    if string == '\n' and actual_game[-2:] == '\n\n':
        # three consecutive newlines mark the end of a game
        actual_game += string
        game_index.append(actual_game)
        actual_game = ''
    else:
        actual_game += string
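A simpler sketch that should produce the same index, assuming games really are separated by exactly the three consecutive newlines the loop above keys on:
with open('your_pgn_path_here.pgn') as f:
    # split on the game separator and re-append it, skipping empty tails
    game_index = [g + '\n\n\n' for g in f.read().split('\n\n\n') if g.strip()]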

import chess.pgn
import pandas as pd

pgn = open("your_pgn_path_here.pgn")
my_list = []
while True:
    game = chess.pgn.read_game(pgn)
    if game is None:  # read_game returns None at end of file
        break
    my_list.append(game)
df = pd.DataFrame(my_list)
# show game 210 in the dataframe
print(df[0][210])

Related

ValueError: NaTType does not support timetuple when converting a dataframe to dictionary using to_dict('records')

I'm running this Flask app:
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS, cross_origin
import json
import pandas as pd

# Create the app object
app = Flask(__name__)
cors = CORS(app, resources={r"/*": {'origins': "*"}})

# importing function for calculations
from Record_Matching import Matching

@app.route("/query", methods=['GET'])
@cross_origin()
def query():
    # service_account_creds = request.json
    query1 = request.args.get('query1', type=str)
    query2 = request.args.get('query2', type=str)
    querycolumns = request.args.get('querycolumns')
    project_id = request.args.get('project_id', type=str)
    service_account_creds = request.args.get('service_account')
    SS = request.args.get('SS', type=float)
    TT = request.args.get('TT', type=float)
    result = Matching(query1, query2, SS, TT, service_account_creds, project_id, querycolumns)
    return result

if __name__ == "__main__":
    app.run(host="localhost", port=8080, debug=True)
and I'm importing the Matching function from this Python script:
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import recordlinkage
from recordlinkage.preprocessing import phonetic
from pandas.io.json import json_normalize
import uuid
from uuid import uuid4
import random
import string
import json
import ast

# Results to data frame function
def gcp2df(sql, client):
    query = client.query(sql)
    results = query.result()
    return results.to_dataframe()

# Exporting df to bigquery - table parameter example: "dataset.tablename"
# def insert(df, table):
#     client = bigquery.Client()
#     job_config = bigquery.LoadJobConfig(write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)
#     return client.load_table_from_dataframe(df, table, job_config=job_config)

def pair(df1, df2, TT, querycolumns):
    # function to take pair from list and compare:
    L = querycolumns
    l = len(querycolumns)
    p1 = 0
    p2 = 1
    # To generate phonetics we need to make sure all names are in English,
    # thus we'll replace non-English words with random English strings
    df1[L[p1]] = df1[L[p1]].astype(str)
    df2[L[p2]] = df2[L[p2]].astype(str)
    for i in range(0, len(df1)):
        if df1[L[p1]][i].isascii() == False:
            df1[L[p1]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
    for i in range(0, len(df2)):
        if df2[L[p2]][i].isascii() == False:
            df2[L[p2]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
    compare = recordlinkage.Compare()
    df1["phonetic_given_name"] = phonetic(df1[L[p1]], "soundex")
    df2["phonetic_given_name"] = phonetic(df2[L[p2]], "soundex")
    df1["initials"] = (df1[L[p1]].str[0] + df1[L[p1]].str[-1])
    df2["initials"] = (df2[L[p2]].str[0] + df2[L[p2]].str[-1])
    indexer = recordlinkage.Index()
    indexer.block('initials')
    candidate_links = indexer.index(df1, df2)
    compare.exact('phonetic_given_name', 'phonetic_given_name', label="phonetic_given_name")
    # O(n): two pointers walk consecutive column pairs from the input list
    while p2 <= l:
        compare.string(L[p1], L[p2], method='jarowinkler', threshold=TT, label=L[p1])
        p1 += 2
        p2 += 2
    features = compare.compute(candidate_links, df1, df2)
    return features

def Matching(query1, query2, SS, TT, service_account_creds, project_id, querycolumns):
    service_account_creds = ast.literal_eval(service_account_creds)
    credentials = service_account.Credentials(service_account_creds, service_account_creds['client_email'],
                                              service_account_creds['token_uri'])
    job_config = bigquery.LoadJobConfig()
    client = bigquery.Client(project=project_id)
    SS = int(SS)
    TT = float(TT)
    df1 = gcp2df("""{}""".format(query1), client)
    df2 = gcp2df("""{}""".format(query2), client)
    querycolumns = json.loads(querycolumns)
    querycolumns = list(querycolumns.values())
    features = pair(df1, df2, TT, querycolumns)
    features['Similarity_score'] = features.sum(axis=1)
    features = features[features['Similarity_score'] >= SS].reset_index()
    final = features[['level_0', 'level_1']]
    final.rename(columns={'level_0': 'df1_index', 'level_1': 'df2_index'}, inplace=True)
    final['Unique_ID'] = [uuid.uuid4() for _ in range(len(final.index))]
    final['Unique_ID'] = final['Unique_ID'].astype(str)
    final['Similarity_Score'] = SS
    final_duplicates = final['df1_index'].value_counts().max()
    # insert(final, "test-ahmed-project.Record_Linkage.Matching_Indices")
    message = "Mission accomplished!, your highest number of duplicates is " + str(final_duplicates)
    return {'message': message, 'final': final.to_dict('records'), 'df1': df1.to_dict('records')}
I'm not sure why returning df1 as a dictionary raises this ValueError when I call the function from the Flask app, but when I run it in a Jupyter notebook with the same DataFrame that I'm pulling from BigQuery, it works just fine. Why doesn't it work in the Flask app?
I used to_dict('records') to convert the DataFrame to a dictionary.
Looking online, many resources suggest the error exists because the data contains missing values, but that shouldn't be the problem here, because converting the same DataFrame to a dictionary in the Jupyter notebook works just fine.
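For what it's worth, a likely culprit is the JSON serialization Flask performs on the returned dict: pd.NaT is datetime-like, and Flask's default encoder formats datetimes via timetuple(), which NaT does not support. That matches the ValueError and also explains why plain to_dict('records') works fine in Jupyter, where nothing is serialized to JSON. A minimal sketch of a workaround, assuming the BigQuery result contains NaT in a datetime column (hypothetical data):
import pandas as pd

df1 = pd.DataFrame({"name": ["a", "b"],
                    "created": [pd.Timestamp("2022-01-01"), pd.NaT]})
# Replace NaT/NaN with plain None before building the records;
# None serializes to JSON null and timetuple() is never called.
records = df1.astype(object).where(df1.notna(), None).to_dict('records')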

Pandas DataFrame - How to extract string patterns with hidden characters

I am scraping names, prices and images from this website. There are 8 items in total, but in the DF I would like to filter only the items that contain the pattern "Original Zaino Antifurto". When I try to apply the bp_filter to the DF I get an error, probably due to hidden characters.
Does anyone know how to filter for this pattern avoiding the error?
import requests
from bs4 import BeautifulSoup
import pandas as pd

url_xd = 'https://www.xd-design.com/it-it/catalogsearch/result/?q=Bobby+Original+Zaino+Antifurto'
req_xd = requests.get(url_xd)
pars_xd = BeautifulSoup(req_xd.content, 'html.parser')
con_xd = pars_xd.find_all('div', class_='product details product-item-details')

names_xd = []
prices_xd = []
picts_xd = []
for container in con_xd:
    name = container.find("a", class_="product-item-link").text
    names_xd.append(name)
for container in con_xd:
    price = container.find("span", class_="price").text
    prices_xd.append(price)
for container in con_xd:
    pict = container.find("a").get("href")
    picts_xd.append(pict)

bp_xd = pd.DataFrame({'(XD-Design) Item_Name': names_xd,
                      'Item_Price_EUR': prices_xd,
                      'Link_to_Pict': picts_xd})
bp_xd['Item_Price_EUR'] = bp_xd['Item_Price_EUR'].str.replace('€', '').str.replace(',', '.').astype(float)
bp_xd['(XD-Design) Item_Name'] = bp_xd['(XD-Design) Item_Name'].str.strip()
bp_filter = bp_xd['(XD-Design) Item_Name'][bp_xd['(XD-Design) Item_Name'].str.contains('Original Zaino Antifurto')]
# bp_xd[bp_filter]
Here is the fixed working code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url_xd = 'https://www.xd-design.com/it-it/catalogsearch/result/?q=Bobby+Original+Zaino+Antifurto'
req_xd = requests.get(url_xd)
pars_xd = BeautifulSoup(req_xd.content, 'html.parser')
con_xd = pars_xd.find_all('div', class_='product details product-item-details')

names_xd = [c.find("a", class_="product-item-link").text for c in con_xd]
prices_xd = [c.find("span", class_="price").text for c in con_xd]
picts_xd = [c.find("a").get("href") for c in con_xd]

df = pd.DataFrame({'(XD-Design) Item_Name': names_xd,
                   'Item_Price_EUR': prices_xd,
                   'Link_to_Pict': picts_xd})
df['Item_Price_EUR'] = df['Item_Price_EUR'].str.replace('€', '').str.replace(',', '.').astype(float)
df['(XD-Design) Item_Name'] = df['(XD-Design) Item_Name'].str.strip()
df = df.loc[df['(XD-Design) Item_Name'].apply(lambda x: 1 if 'Original Zaino Antifurto' in x else 0) == 1]
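A side note, not part of the original answer: the error in the question most likely comes from bp_xd[bp_filter] itself, since bp_filter holds the matching values rather than a boolean mask, and indexing a DataFrame with values raises a KeyError. Indexing with the mask directly also works:
mask = bp_xd['(XD-Design) Item_Name'].str.contains('Original Zaino Antifurto', na=False)
filtered = bp_xd[mask]  # boolean indexing keeps only the matching rows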

TypeError: ufunc add cannot use operands with types dtype('<M8[ns]') and dtype('<M8[ns]')

I am trying to fit an ARIMA model to some data; for this, I used autocorrelation_plot() with my time series. However, it generates the error in the title.
I have an attribute table composed, among others, of a Date and a Time field.
I extracted them (after transforming the attribute table into a numpy table), put them in a datetime variable and appended them all to a list:
O,A = [],[]
dt = datetime.strptime(dt1, "%Y/%m/%d %H:%M")
A.append(dt)
I then tried to create the time series and printed it to check the results:
data2 = pd.Series(A, O)
print data2
The results were satisfying, until I decided to auto-correlate.
Auto-correlation command:
autocorrelation_plot(data2)
After this command, it returns:
TypeError: ufunc add cannot use operands with types dtype('M8[ns]') and dtype('M8[ns]')
I guess it's due to the conversion of the datetime.strptime result to a numpy datetime64?
I tried to follow some suggestions from previous questions (index.to_pydatetime(), dtype, M8[ns] error ...), in vain.
Minimal reproducible example:
from pandas import datetime
from pandas import DataFrame
import pandas as pd
from matplotlib import pyplot as plt
from pandas.tools.plotting import autocorrelation_plot
import arcpy  # ArcGIS site package, needed for TableToNumPyArray

arr = arcpy.da.TableToNumPyArray(inTable, ("PROVINCE", "ZONE_CODE", "MEAN", "Datetime", "Time"))
arr_length = len(arr)
j = 1
O, A = [], []
while j <= 55:  # I have 55 provinces
    i = 0
    while i < arr_length:
        if arr[i][1] == j:
            O.append(arr[i][2])
            c = str(arr[i][3])
            d = str(c[0:4] + "/" + c[5:7] + "/" + c[8:10])
            t = str(arr[i][4])
            if t == "10":
                dt1 = str(d + " 10:00")
            else:
                dt1 = str(d + " 14:00")
            dt = datetime.strptime(dt1, "%Y/%m/%d %H:%M")
            A.append(dt)
        i = i + 1
    data2 = pd.Series(A, O)
    print data2
    autocorrelation_plot(data2)
    del A[:]
    del O[:]
    j += 1
[Screenshot of the printed series omitted.]
I used this to solve my issue:
import matplotlib.dates as mpl_dates
df.reset_index(inplace=True)
df['Date']=df['Date'].apply(mpl_dates.date2num)
df = df.astype(float)
I found a solution; it may look barbaric, but it works!
I've just "recreated" pd.Series() with the pd.Series I had:
data2 = pd.Series(O, A)
autocorrelation_plot(pd.Series(data2))
plt.show()
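A note on why the swap helps (my reading, not from the original answers): autocorrelation_plot sums and multiplies the series values, and datetime64 values cannot be added together, which is exactly the ufunc add error. pd.Series(O, A) puts the numeric MEAN values in the data and the datetimes in the index, so the arithmetic goes through. A self-contained sketch with the modern import path:
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot  # pandas.tools.plotting in old versions

idx = pd.date_range("2020-01-01", periods=100, freq="D")
data = pd.Series(range(100), index=idx, dtype=float)  # numeric values, datetime index
autocorrelation_plot(data)  # works: only the values enter the arithmetic
plt.show()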

Represent negative timedelta in most basic form

If I create a negative Timedelta for e.g. 0.5 hours, the internal representation looks as follows:
In [2]: pd.Timedelta('-0.5h')
Out[2]: Timedelta('-1 days +23:30:00')
How can I get back a (str) representation of this Timedelta in the form -00:30?
I want to display these deltas, and requiring the user to calculate the expression -1 day + something is a bit awkward.
I can't comment on your post yet, so adding it here. Don't know if this helps, but I think you can use Python humanize.
import humanize as hm
hm.naturaltime(pd.Timedelta('-0.5h'))
Out:
'30 minutes from now'
Ok, I will live with a hack going through a date:
sign = ''
date = pd.to_datetime('today')
if delta.total_seconds() < 0:
    sign = '-'
    date = date - delta
else:
    date = date + delta
print('{}{:%H:%M}'.format(sign, date.to_pydatetime()))
You can use the components of a pandas Timedelta:
import pandas as pd
t = pd.Timedelta('-0.5h')
print(t.components)
>> Components(days=-1, hours=23, minutes=30, seconds=0, milliseconds=0, microseconds=0, nanoseconds=0)
You can access each component with:
print(t.components.days)
>> -1
print(t.components.hours)
>> 23
print(t.components.minutes)
>> 30
The rest is then formatting.
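Picking up that last sentence, here is a minimal sketch of the formatting step (my addition, assuming hour:minute precision is enough): going through total_seconds() sidesteps the negative-days representation entirely.
import pandas as pd

def td_to_hhmm(td):
    total_minutes = int(td.total_seconds() // 60)  # e.g. -30 for '-0.5h'
    sign = '-' if total_minutes < 0 else ''
    hours, minutes = divmod(abs(total_minutes), 60)
    return '{}{:02d}:{:02d}'.format(sign, hours, minutes)

print(td_to_hhmm(pd.Timedelta('-0.5h')))  # -00:30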
This is a total hack that won't work for Series data, but...
import pandas as pd
import numpy as np

t = pd.Timedelta('-0.5h').components
mins = t.days*24*60 + t.hours*60 + t.minutes
print(str(np.sign(mins))[0] + str(divmod(abs(mins), 60)[0]).zfill(2) + ':' + str(divmod(abs(mins), 60)[1]).zfill(2))
>> -00:30
I was looking for something similar (see https://github.com/pandas-dev/pandas/issues/17232).
I'm not sure if it will be implemented in Pandas, so here is a workaround
import pandas as pd

def timedelta2str(td, display_plus=False, format=None):
    """
    Parameters
    ----------
    format : None|all|even_day|sub_day|long

    Returns
    -------
    converted : string of a Timedelta

    >>> td = pd.Timedelta('00:00:00.000')
    >>> timedelta2str(td)
    '0 days'
    >>> td = pd.Timedelta('00:01:29.123')
    >>> timedelta2str(td, display_plus=True, format='sub_day')
    '+ 00:01:29.123000'
    >>> td = pd.Timedelta('-00:01:29.123')
    >>> timedelta2str(td, display_plus=True, format='sub_day')
    '- 00:01:29.123000'
    """
    td_zero = pd.Timedelta(0)
    sign_sep = ' '
    if td >= td_zero:
        s = td._repr_base(format=format)  # note: private pandas API
        if display_plus:
            s = "+" + sign_sep + s
        return s
    else:
        s = timedelta2str(-td, display_plus=False, format=format)
        s = "-" + sign_sep + s
        return s

if __name__ == "__main__":
    import doctest
    doctest.testmod()

Concatenate Data From URLS Recursively Inside one DataFrame

I'm trying to create one DataFrame with data from multiple URLs I'm scraping. The code works, however I'm unable to accumulate the data in one DataFrame across iterations: the DataFrame (called frame) is replaced with a new URL's data each time, rather than having the new data concatenated to the same frame. Thank you, I deeply appreciate your help!
import urllib
import re
import json
import pandas
import pylab
import numpy
import matplotlib.pyplot
from pandas import *
from pylab import *
from threading import Thread
import sqlite3

urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']
i = 0
regex = '<p class="commentText">(.+?)</p>'
regex2 = '<strong>Easiness</strong><span>(.+?)</span></p>'
regex3 = 'Helpfulness</strong><span>(.+?)</span></p>'
regex4 = 'Clarity</strong><span>(.+?)</span></p>'
regex5 = 'Rater Interest</strong><span>(.+?)</span></p>'
regex6 = '<div class="date">(.+?)</div>'
regex7 = '<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'
regex8 = '<meta name="prof_name" content="(.+?)"/>'
pattern = re.compile(regex)
easiness = re.compile(regex2)
helpfulness = re.compile(regex3)
clarity = re.compile(regex4)
interest = re.compile(regex5)
date = re.compile(regex6)
mathclass = re.compile(regex7)
prof_name = re.compile(regex8)
while i < len(urls):
    htmlfile = urllib.urlopen(urls[i])
    htmltext = htmlfile.read()
    content = re.findall(pattern, htmltext)
    Easiness = re.findall(easiness, htmltext)
    Helpfulness = re.findall(helpfulness, htmltext)
    Clarity = re.findall(clarity, htmltext)
    Interest = re.findall(interest, htmltext)
    Date = re.findall(date, htmltext)
    Class = re.findall(mathclass, htmltext)
    PROFNAME = re.findall(prof_name, htmltext)
    i += 1
    frame = DataFrame({'Comments': content, 'Easiness': Easiness, 'Helpfulness': Helpfulness,
                       'Clarity': Clarity, 'Rater Interest': Interest, 'Class': Class,
                       'Date': Date[1:len(Date)], 'Professor': PROFNAME[0]})
print frame
Use pd.concat:
frames = []
i = 0
while i < len(urls):
    htmlfile = urllib.urlopen(urls[i])
    htmltext = htmlfile.read()
    content = re.findall(pattern, htmltext)
    Easiness = re.findall(easiness, htmltext)
    Helpfulness = re.findall(helpfulness, htmltext)
    Clarity = re.findall(clarity, htmltext)
    Interest = re.findall(interest, htmltext)
    Date = re.findall(date, htmltext)
    Class = re.findall(mathclass, htmltext)
    PROFNAME = re.findall(prof_name, htmltext)
    i += 1
    frames.append(DataFrame({'Comments': content, 'Easiness': Easiness, 'Helpfulness': Helpfulness,
                             'Clarity': Clarity, 'Rater Interest': Interest, 'Class': Class,
                             'Date': Date[1:len(Date)], 'Professor': PROFNAME[0]}))
df = pd.concat(frames)
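Note that pd.concat returns a new DataFrame rather than mutating anything, so the result has to be captured (done above with df, and pd here assumes the usual import pandas as pd). If you want one clean 0..n-1 row index across all URLs, pass ignore_index:
df = pd.concat(frames, ignore_index=True)  # renumber rows across all URLs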
You are overwriting your frame with each iteration of the loop. As Phillip Cloud suggested, you can make a list of frames that you append to with each loop. I simplified your code differently, but I think this gives you what you want.
import urllib
import re
import pandas as pd

urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779',
        'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']

regex = {'pattern': re.compile('<p class="commentText">(.+?)</p>'),
         'easiness': re.compile('<strong>Easiness</strong><span>(.+?)</span></p>'),
         'helpfulness': re.compile('Helpfulness</strong><span>(.+?)</span></p>'),
         'clarity': re.compile('Clarity</strong><span>(.+?)</span></p>'),
         'interest': re.compile('Rater Interest</strong><span>(.+?)</span></p>'),
         'date': re.compile('<div class="date">(.+?)</div>'),
         'mathclass': re.compile('<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'),
         'prof_name': re.compile('<meta name="prof_name" content="(.+?)"/>')}

# Make a dictionary with empty lists using the same keys
d = {}
for k in regex.keys():
    d[k] = []

# Now fill those lists
for url in urls:
    htmlfile = urllib.urlopen(url)
    htmltext = htmlfile.read()
    for k, v in regex.iteritems():
        d[k].append(re.findall(v, htmltext))

frame = pd.DataFrame(d)  # Dump the dict into a DataFrame
print frame
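One caveat (my observation, not part of the original answer): because each re.findall result is appended whole, every cell of this frame holds the list of matches for one URL. To get one row per comment, as in the pd.concat approach, those lists still need to be flattened; in recent pandas, frame.explode(list(frame.columns)) can do that when the per-URL lists have equal lengths.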