How to append the result from a nested loop - google-colaboratory

I´m trying to append the nested loop in a single frame:
import pandas as pd
pd.set_option("display.max_colwidth", 150)
#pd.options.display.format = '{:.2f}'.format
def busca_informes_cvm(ano,mes):
url = 'http://dados.cvm.gov.br/dados/FI/DOC/INF_DIARIO/DADOS/inf_diario_fi_{:4d}{:02d}.csv'.format(ano,mes)
return pd.read_csv(url, sep=';')
list_anos = [2020, 2021]
list_meses = [6,7]
for i in list_anos:
for j in list_meses:
saida = busca_informes_cvm(i,j)
saida
Any help would be appreciated.

You could append data frames using the append method starting from an empty data frame.
saida = pd.DataFrame()
for i in list_anos:
for j in list_meses:
saida = saida.append(busca_informes_cvm(i,j))
saida
It's likely to be inefficient for large datasets.

Related

convert pgn database to pandas dataframe

Helo!
Using chess.pgn to convert a Chess database into a dataframe, to read the nth game from the database do I need to read all the previous ones first? I can't jump directly to the game n? If I want to distribute the processing in a database with 10^8 games, I can't start reading in the 9e7th game?
import pandas as pd
import chess.pgn
from datetime import datetime as dt
import os
import glob
nome_arquivo = "Analises_01.pgn"
inicio = 0
numero_jogos = 1.47e8
arquivo = open(nome_arquivo, encoding="utf8")
ratings = []
for j in range(numero_jogos):
first_game = chess.pgn.read_game(arquivo)
if j >= inicio:
try:
Brancas = int(first_game.headers["WhiteElo"])
Pretas = int(first_game.headers["BlackElo"])
ratings.append([Brancas, Pretas])
except:
pass
I hope this code can help you. I didn't use Pandas or a data frame, sorry. It will just make a list to indexing all the pgn games. So, game_index[n] will return the string of the game number n+1.
PGN = open('your_pgn_path_here.pgn')
text_PGN = PGN.read()
game_index = []
actual_game = ''
for string in text_PGN :
if string == '\n' :
if actual_game[-2] == '\n' and actual_game[-1] == '\n' :
actual_game += string
game_index.append(actual_game)
actual_game = ''
else :
actual_game += string
else :
actual_game += string
import chess.pgn
import pandas as pd
pgn = open("your_pgn_path_here.pgn")
my_list = []
for i in pgn:
i = chess.pgn.read_game(pgn)
my_list.append(i)
df = pd.DataFrame(my_list)
#shows 210 game in dataframe
print(df[0][210])

Use pandas to read the csv file with several uncertain factors

I have asked the related question of string in: Find the number of \n before a given word in a long string. But this method cannot solve the complicate case I happened to. Thus I want to find out a solution of Pandas here.
I have a csv file (I just represent as a string):
csvfile = 'Idnum\tId\nkey:maturity\n2\nmaturity\tpara1\tpara2\n1Y\t0\t0\n2Y\t0\t0'
I want to use the pandas:
value = pandas.read_csv(csvfile, sep = '\t', skiprows = 3).set_index('maturity')
to obtain the table like:
and set the first columan maturity as index.
But there are several uncertain factors in the csvfile:
1..set_index('maturity'), the key maturity
of index is included in the row key: maturity. Then I should find the row key: xxxx and obtain the string xxxx
2.skiprows = 3: the number of skipped rows before the title:
is uncertain. The csvfile can be something like:
'Idnum\tId\nkey:maturity\n2\n\n\n\n\n\nmaturity\tpara1\tpara2\n1Y\t0\t0\n2Y\t0\t0'
I should find the row number of title (namely the row beginning with xxxx found in the rowkey: xxxx).
3.sep = '\t': the csvfile may use space as separator like:
csvfile = 'Idnum Id\nkey: maturity\n2\nmaturity para1 para2\n1Y 0 0\n2Y 0 0'
So is there any general code of pandas to deal with the csvfile with above uncertain factors?
Actually the string:
csvfile = 'Idnum\tId\nkey:maturity\n2\nmaturity\tpara1\tpara2\n1Y\t0\t0\n2Y\t0\t0'
is from a StringIO: data
data.getvalue() = 'Idnum\tId\nkey:maturity\n2\nmaturity\tpara1\tpara2\n1Y\t0\t0\n2Y\t0\t0'
I am not familiar with this structure and even I want to obtain a table of original data without any edition by using:
value = pandas.read_csv(data, sep = '\t')
There will be a error.
You can read the file line by line, collecting the necessary information and then pass the remainder to pd.read_csv with the appropriate arguments:
from io import StringIO
import re
import pandas as pd
with open('data.csv') as fh:
key = next(filter(lambda x: x.startswith('key:'), fh)).lstrip('key:').strip()
header = re.split('[ \t]+', next(filter(lambda x: x.startswith(key), fh)).strip())
df = pd.read_csv(StringIO(fh.read()), header=None, names=header, index_col=0, sep=r'\s+')
Example for data via StringIO:
fh = StringIO('Idnum\tId\nkey:maturity\n2\nmaturity\tpara1\tpara2\n1Y\t0\t0\n2Y\t0\t0')
key = next(filter(lambda x: x.startswith('key:'), fh)).lstrip('key:').strip()
header = re.split('[ \t]+', next(filter(lambda x: x.startswith(key), fh)).strip())
df = pd.read_csv(fh, header=None, names=header, index_col=0, sep=r'\s+')
If you do not mind reading the csv file twice you can try doing something like:
from io import StringIO
csvfile = 'Idnum\tId\nkey:maturity\n2\nmaturity\tpara1\tpara2\n1Y\t0\t0\n2Y\t0\t0'
data = pd.read_csv(StringIO(csvfile), sep='\t', error_bad_lines=False, header=None)
skiprows = len(data)
pd.read_csv(StringIO(csvfile), sep='\t', skiprows=skiprows)
same for you other example:
csvfile = 'Idnum\tId\nkey:maturity\n2\n\n\n\n\n\nmaturity\tpara1\tpara2\n1Y\t0\t0\n2Y\t0\t0'
data = pd.read_csv(StringIO(csvfile), sep='\t', error_bad_lines=False, header=None)
skiprows = len(data)
pd.read_csv(StringIO(csvfile), sep='\t', skiprows=skiprows)
This assumes that you know the sep of the file
Also if you want to find the key:
csvfile = 'Idnum\tId\nkey:maturity\n2\n\n\n\n\n\nmaturity\tpara1\tpara2\n1Y\t0\t0\n2Y\t0\t0'
data = pd.read_csv(StringIO(csvfile), sep='\t', error_bad_lines=False, header=None)
key = [x.replace('key:','') for x in data[0] if x.find('key')>-1]
skiprows = len(data)
pd.read_csv(StringIO(csvfile), sep='\t', skiprows=skiprows).set_index(key)

dask how to define a custom (time fold) function that operates in parallel and returns a dataframe with a different shape

I am trying to implement a time fold function to be 'map'ed to various partitions of a dask dataframe which in turn changes the shape of the dataframe in question (or alternatively produces a new dataframe with the altered shape). This is how far I have gotten. The result 'res' returned on compute is a list of 3 delayed objects. When I try to compute each of them in a loop (last tow lines of code) this results in a "TypeError: 'DataFrame' object is not callable" After going through the examples for map_partitions, I also tried altering the input DF (inplace) in the function with no return value which causes a similar TypeError with NoneType. What am I missing?
Also, looking at the visualization (attached) I feel like there is a need for reducing the individually computed (folded) partitions into a single DF. How do I do this?
#! /usr/bin/env python
# Start dask scheduler and workers
# dask-scheduler &
# dask-worker --nthreads 1 --nprocs 6 --memory-limit 3GB localhost:8786 --local-directory /dev/shm &
from dask.distributed import Client
from dask.delayed import delayed
import pandas as pd
import numpy as np
import dask.dataframe as dd
import math
foldbucketsecs=30
periodicitysecs=15
secsinday=24 * 60 * 60
chunksizesecs=60 # 1 minute
numts = 5
start = 1525132800 # 01/05
end = 1525132800 + (3 * 60) # 3 minute
c = Client('127.0.0.1:8786')
def fold(df, start, bucket):
return df
def reduce_folds(df):
return df
def load(epoch):
idx = []
for ts in range(0, chunksizesecs, periodicitysecs):
idx.append(epoch + ts)
d = np.random.rand(chunksizesecs/periodicitysecs, numts)
ts = []
for i in range(0, numts):
tsname = "ts_%s" % (i)
ts.append(tsname)
gts.append(tsname)
res = pd.DataFrame(index=idx, data=d, columns=ts, dtype=np.float64)
res.index = pd.to_datetime(arg=res.index, unit='s')
return res
gts = []
load(start)
cols = len(gts)
idx1 = pd.DatetimeIndex(start=start, freq=('%sS' % periodicitysecs), end=start+periodicitysecs, dtype='datetime64[s]')
meta = pd.DataFrame(index=idx1[:0], data=[], columns=gts, dtype=np.float64)
dfs = [delayed(load)(fn) for fn in range(start, end, chunksizesecs)]
from_delayed = dd.from_delayed(dfs, meta, 'sorted')
nfolds = int(math.ceil((end - start)/foldbucketsecs))
cprime = nfolds * cols
gtsnew = []
for i in range(0, cprime):
gtsnew.append("ts_%s,fold=%s" % (i%cols, i/cols))
idx2 = pd.DatetimeIndex(start=start, freq=('%sS' % periodicitysecs), end=start+foldbucketsecs, dtype='datetime64[s]')
meta = pd.DataFrame(index=idx2[:0], data=[], columns=gtsnew, dtype=np.float64)
folded_df = from_delayed.map_partitions(delayed(fold)(from_delayed, start, foldbucketsecs), meta=meta)
result = c.submit(reduce_folds, folded_df)
c.gather(result).visualize(filename='/usr/share/nginx/html/svg/df4.svg')
res = c.gather(result).compute()
for f in res:
f.compute()
Never mind! It was my fault, instead of wrapping my function in delayed I simply passed it to the map_partitions call like so and it worked.
folded_df = from_delayed.map_partitions(fold, start, foldbucketsecs, nfolds, meta=meta)

Pandas DataFrame expand existing dataset to finer timestamp

I am trying to make this piece of code faster, it is failing on conversion of ~120K rows to ~1.7m.
Essentially, I am trying to convert each date stamped entry into 14, representing each DOW from PayPeriodEndingDate to T-14
Does anyone have a better suggestion other than iteruples to do this loop?
Thanks!!
df_Final = pd.DataFrame()
for row in merge4.itertuples():
listX = []
listX.append(row)
df = pd.DataFrame(listX*14)
df = df.reset_index().drop('Index',axis=1)
df['Hours'] = df['Hours']/14
df['AmountPaid'] = df['AmountPaid']/14
df['PayPeriodEnding'] = np.arange(df.loc[:,'PayPeriodEnding'][0] - np.timedelta64(14,'D'), df.loc[:,'PayPeriodEnding'][0], dtype='datetime64[D]')
frames = [df_Final,df]
df_Final = pd.concat(frames,axis=0)
df_Final

Concatenate Data From URLS Recursively Inside one DataFrame

I'm trying to create one dataframe with data from multiple urls I'm scraping. The code works however I'm unable to store the data in one DataFrame recursively. The DataFrame (called frame) is replaced with a new url's data each time rather than having the new data concatenated to the same frame. Thank you, I deeply appreciate your help!
import urllib
import re
import json
import pandas
import pylab
import numpy
import matplotlib.pyplot
from pandas import *
from pylab import *
from threading import Thread
import sqlite3
urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226', 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944', 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779' , 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']
i=0
regex = '<p class="commentText">(.+?)</p>'
regex2 = '<strong>Easiness</strong><span>(.+?)</span></p>'
regex3 = 'Helpfulness</strong><span>(.+?)</span></p>'
regex4 = 'Clarity</strong><span>(.+?)</span></p>'
regex5 = 'Rater Interest</strong><span>(.+?)</span></p>'
regex6 = '<div class="date">(.+?)</div>'
regex7 = '<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'
regex8 = '<meta name="prof_name" content="(.+?)"/>'
pattern = re.compile(regex)
easiness = re.compile(regex2)
helpfulness = re.compile(regex3)
clarity = re.compile(regex4)
interest = re.compile(regex5)
date = re.compile(regex6)
mathclass = re.compile(regex7)
prof_name = re.compile(regex8)
while i < len(urls):
htmlfile = urllib.urlopen(urls[i])
htmltext = htmlfile.read()
content = re.findall(pattern,htmltext)
Easiness = re.findall(easiness,htmltext)
Helpfulness = re.findall(helpfulness, htmltext)
Clarity = re.findall(clarity, htmltext)
Interest = re.findall(interest, htmltext)
Date = re.findall(date, htmltext)
Class = re.findall(mathclass, htmltext)
PROFNAME=re.findall(prof_name, htmltext)
i+=1
frame = DataFrame({'Comments': content, 'Easiness': Easiness, 'Helpfulness': Helpfulness,
'Clarity': Clarity, 'Rater Interest': Interest, 'Class': Class,
'Date': Date[1:len(Date)], 'Professor': PROFNAME[0]})
print frame
Use pd.concat:
frames = []
while i < len(urls):
htmlfile = urllib.urlopen(urls[i])
htmltext = htmlfile.read()
content = re.findall(pattern,htmltext)
Easiness = re.findall(easiness,htmltext)
Helpfulness = re.findall(helpfulness, htmltext)
Clarity = re.findall(clarity, htmltext)
Interest = re.findall(interest, htmltext)
Date = re.findall(date, htmltext)
Class = re.findall(mathclass, htmltext)
PROFNAME=re.findall(prof_name, htmltext)
i+=1
frames.append(DataFrame({'Comments': content, 'Easiness': Easiness, 'Helpfulness': Helpfulness,
'Clarity': Clarity, 'Rater Interest': Interest, 'Class': Class,
'Date': Date[1:len(Date)], 'Professor': PROFNAME[0]}))
pd.concat(frames)
You are overwriting your frame with each iteration of the loop. As Phillip Cloud suggested, you can make a list of frames that you append with each loop. I simplified your code differently, but I think this gives you what you want.
import urllib
import re
import pandas as pd
urls = ['http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=795226',
'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1176131',
'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1807944',
'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=277459',
'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1076779',
'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=971546']
regex = {'pattern' : re.compile('<p class="commentText">(.+?)</p>'),
'easiness' : re.compile('<strong>Easiness</strong><span>(.+?)</span></p>'),
'helpfulness' : re.compile('Helpfulness</strong><span>(.+?)</span></p>'),
'clarity' : re.compile('Clarity</strong><span>(.+?)</span></p>'),
'interest' : re.compile('Rater Interest</strong><span>(.+?)</span></p>'),
'date' : re.compile('<div class="date">(.+?)</div>'),
'mathclass' : re.compile('<div class="class"><p style="word-wrap:break-word;">(.+?)</p>'),
'prof_name' : re.compile('<meta name="prof_name" content="(.+?)"/>')}
# Make a dictionary with empty lists using the same keys
d = {}
for k in regex.keys():
d[k] = []
# Now fill those lists
for url in urls:
htmlfile = urllib.urlopen(url)
htmltext = htmlfile.read()
for k, v in regex.iteritems():
d[k].append(re.findall(v, htmltext))
frame = pd.DataFrame(d) # Dump the dict into a DataFrame
print frame