Problem with merging multiple Excel files from Python - pandas

My dtype changes, and after I uncomment the foo lambda and the groupby I get an error along the lines of "we require a list, but not a 'str'".
What I want: if the value in the first column (in my case Date) is the same, then the text from the third column should be appended after a ',' sign in my final output.
import os
import pandas as pd

data_file_folder = r'.\Data'
df_list = []
for file in os.listdir(data_file_folder):
    if file.endswith('.xlsx'):
        print('Loading File {0}...'.format(file))
        df_list.append(pd.read_excel(os.path.join(data_file_folder, file), sheet_name='Sheet1'))
df_master = pd.concat(df_list, axis=0)
df_master['Date'] = df_master['Date'].dt.date
#foo = lambda a: ", ".join(a)
#df_master = df_master.groupby(by='Date').agg({'Tweet': foo}).reset_index()
#df_master.to_excel(r'.\NewFolder\example.xlsx', index=False)
#df_master
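A likely fix for the commented-out aggregation — a minimal sketch, assuming the 'str' complaint comes from non-string values (e.g. NaN) in the Tweet column:
df_master['Tweet'] = df_master['Tweet'].astype(str)  # ", ".join needs every value to be a string
df_master = df_master.groupby('Date', as_index=False).agg({'Tweet': ', '.join})
df_master.to_excel(r'.\NewFolder\example.xlsx', index=False)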

Compare two pandas dataframes

import glob
import fnmatch
import zipfile
import pandas as pd

df1 = pd.read_csv("2016Q12ExactTargetE1.csv", names=['FileName'])
print("\nRead ", df1.shape[0], "Records")
# access the files in the directory and subdirectories
for filename in glob.glob('c:\\temp\\*.zip', recursive=True):
    #print(filename)
    myzip = filename
    zf = zipfile.ZipFile(myzip)
    zfl = zf.namelist()
    eml_files = fnmatch.filter(zfl, "*.eml")
    df2 = pd.DataFrame(eml_files)
    print("\nRead2 ", df2.shape[0], "Records")
The CSV file looks like this:
FileName
F0B1F7B371C427E6FDDE1078287A3C71.eml
E107A8CADF8F87B05599A3AAF03D5BA1.eml
30B54778C0B912F2516F6C390A137E91.eml
D06DD3162620490F7E9F8ADD1AE0F621.eml
10E3BAFB831EA97615DBBBF18D601EC1.eml
The eml_files list looks like:
['00E6E77CE9890A3F34343997BCA33791.eml',
'109E4F29239EA8259707B2E3D0D00351.eml',
'403EBEC70C1F305B72EFAA3822D75871.eml',
'30B54778C0B912F2516F6C390A137E91.eml',
'E107A8CADF8F87B05599A3AAF03D5BA1.eml',
'F0B1F7B371C427E6FDDE1078287A3C71.eml',
'00654E78278B0BBDFBF29BAEA3F61051.eml',
'10E3BAFB831EA97615DBBBF18D601EC1.eml',
'30295A4958D6787060A9BD30ABA3BD81.eml',
'712FE30B1D680ACF5F5194E05E7AFCC1.eml',
'80E928FB95A365F85AE1A99DC8418061.eml',
'91681F0020EAC9AC7F010E917CD72F51.eml',
'C0542641286DE272AB1FAEF954BA1951.eml',
'D06DD3162620490F7E9F8ADD1AE0F621.eml',
'214C558DD0ABCAC2EA3BE06DE95E0811.eml',
'4101E93C02FBA028CEA078B9A3542B01.eml',
'51159C8E5965890AE7356E92BC1C6921.eml',
'50775947EFD5010C3D5EA799F36029A1.eml']
How can I compare the two dataframes df1 and df2?
Thank you.
I tried
df3 = df1.compare(df2, keep_equal=True)
but I get the error
Can only compare identically-labeled DataFrame objects
because df2 is created from zipfile.namelist(), which is different from df1, which is read from a CSV.
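One way to compare them — a sketch, assuming what you actually want is which file names appear in one source but not the other: give df2 the same column name and use an outer merge with indicator=True.
# Align the column names so the frames can be merged on 'FileName'
df2.columns = ['FileName']

merged = df1.merge(df2, on='FileName', how='outer', indicator=True)
only_in_csv = merged[merged['_merge'] == 'left_only']   # in the CSV, not in the zips
only_in_zip = merged[merged['_merge'] == 'right_only']  # in the zips, not in the CSV
in_both = merged[merged['_merge'] == 'both']
print(len(only_in_csv), len(only_in_zip), len(in_both))
DataFrame.compare is meant for frames with identical shape and labels, which is why it fails here.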

How to add avwap (anchored VWAP) to pandas_ta?

I am trying to get an anchored VWAP from a specific date using pandas_ta. How do I set the anchor to a specific date?
import pandas as pd
import yfinance as yf
import pandas_ta as ta

data = yf.download("aapl", start="2021-07-01", end="2022-08-01")
df = pd.DataFrame(data)
df1 = df.ta.vwap(anchor="D")
df14 = pd.concat([df, df1], axis=1)
print(df14)
pandas_ta.vwap's anchor depends on the index values, as the pandas-ta docs say (reference):
anchor (str): How to anchor VWAP. Depending on the index values, it will
implement various Timeseries Offset Aliases as listed here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases
Default: "D".
In other words, you can't specify a specific date the way TradingView does.
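Any pandas offset alias should work as the anchor, though — for example, a weekly-anchored VWAP (a small sketch based on the docs quoted above):
# "W" is a pandas Timeseries Offset Alias, so the VWAP resets each week
df_w = df.ta.vwap(anchor="W")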
To anchor to a specific date ourselves:
import pandas as pd
import yfinance as yf
import pandas_ta as ta

# set the anchor date
anchored_date = pd.to_datetime('2022-01-30')

data = yf.download("aapl", start="2022-01-01", end="2022-08-01")
df = pd.DataFrame(data)
df1 = df.ta.vwap(anchor="D")
df14 = pd.concat([df, df1], axis=1)

# I create a column 'typical_price'; it should be identical to 'VWAP_D'
df14['typical_price'] = (df14['High'] + df14['Low'] + df14['Close']) / 3
tpp_d = ((df14['High'] + df14['Low'] + df14['Close']) * df14['Volume']) / 3
mask = df14.index >= anchored_date
df14['anchored_VWAP'] = (tpp_d.where(mask).groupby(mask).cumsum()
                         / df14['Volume'].where(mask).groupby(mask).cumsum())
df14
(Plot of the resulting anchored_VWAP omitted.)
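Wrapped up as a helper, the same idea might look like this — a sketch, assuming a DatetimeIndex and the usual yfinance column names ('High', 'Low', 'Close', 'Volume'):
def anchored_vwap(df, anchor_date):
    # Cumulative volume-weighted average of the typical price from anchor_date onward;
    # rows before the anchor become NaN via .where() and stay NaN through cumsum().
    mask = df.index >= pd.to_datetime(anchor_date)
    tp_volume = (df['High'] + df['Low'] + df['Close']) / 3 * df['Volume']
    return tp_volume.where(mask).cumsum() / df['Volume'].where(mask).cumsum()

df14['anchored_VWAP'] = anchored_vwap(df14, '2022-01-30')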

Convert R object(Dataframe) to Pandas Dataframe using rpy2

I am using rpy2 to get the comorbidity index of patients. I got the results, but I am not able to convert that output to a pandas DataFrame.
Below is the code:
# creating the DataFrame
import pandas as pd
data = {"person_id": [1, 1, 1, 2, 2, 3],
        "dx_1": ["F11", "E40", "", "F32", "C77", "G10"],
        "dx_2": ["F1P", "E400", "", "F322", "C737", ""]}
df1 = pd.DataFrame(data)

# converting the pandas DataFrame to an R dataframe using rpy2
import rpy2
from rpy2.robjects import pandas2ri
import rpy2.robjects.numpy2ri
from rpy2.robjects.packages import importr

r_dataframe = pandas2ri.py2ri(df1)
print(r_dataframe)
# loading the 'comorbidity' package using rpy2
R = rpy2.robjects.r
DTW = importr('comorbidity')

# executing the comorbidity function using one column, icd_1
output = DTW.comorbidity(x=r_dataframe, id="person_id", code="icd_1",
                         score="charlson", assign0=False,
                         icd="icd10")
print(output)
But I am not able to convert the output to a pandas DataFrame:
import rpy2, rpy2.robjects as robjects, rpy2.robjects.packages as rpackages
from rpy2.robjects.vectors import StrVector
#Converting data frames back and forth between rpy2 and pandas
from rpy2.robjects import r, pandas2ri
#convert output to pandas dataframe
pandas2ri.ri2py_dataframe(output)
I am getting the error below:
TypeError: Parameter 'categories' must be list-like, was
Please help. Thanks in advance.
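A minimal sketch of a conversion that may help, assuming rpy2 3.x, where py2ri/ri2py were replaced by the converter API (the old ri2py_dataframe path can choke on R factor columns, which may be what the 'categories' TypeError is about):
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

# 'output' is the R data.frame returned by DTW.comorbidity above
with localconverter(robjects.default_converter + pandas2ri.converter):
    output_df = robjects.conversion.rpy2py(output)
print(output_df.head())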

How to use the pd.DataFrame constructor to manually create a dataframe from info scraped using beautifulsoup4

I made it to the point where all the tr data has been scraped and I am able to get a nice printout. But when I go to implement pd.DataFrame, as in df = pd.DataFrame({"A": a}) etc., I get a syntax error.
Here is a list of my imported libraries in the Jupyter Notebook:
import pandas as pd
import numpy as np
import bs4 as bs
import requests
import urllib.request
import csv
import html5lib
from pandas.io.html import read_html
import re
Here is my code:
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs.BeautifulSoup(source, 'html.parser')
table_rows = soup.find_all('tr')
table_rows
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
texas_info = pd.DataFrame({
    "title": Texas
    "Zip Code": [Zip Code],
    "City": [City],
})
texas_info.head()
I expect to get a dataframe with two columns, one being 'Zip Code' and the other 'City'.
If you want to create it manually, with bs4 4.7.1 you can use the :not, :contains, and :nth-of-type pseudo-classes to isolate the two columns of interest, then build a dict and convert it to a DataFrame:
import pandas as pd
import urllib
from bs4 import BeautifulSoup as bs
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs(source,'lxml')
zips = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(1):not([colspan])')]
cities = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(2):not([colspan])')]
d = {'Zips': zips,'Cities': cities}
df = pd.DataFrame(d)
df = df[1:].reset_index(drop = True)
You could combine the selectors into one line:
import pandas as pd
import urllib
from bs4 import BeautifulSoup as bs
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs(source,'lxml')
items = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(1):not([colspan]), .inner_table:contains(Texas) td:nth-of-type(2):not([colspan])')]
d = {'Zips': items[0::2],'Cities': items[1::2]}
df = pd.DataFrame(d)
df = df[1:].reset_index(drop = True)
print(df)
I note you want to create the DataFrame manually, but it is worth knowing for future readers that you could just use pandas read_html:
import pandas as pd
table = pd.read_html('https://www.zipcodestogo.com/Texas/')[1]
table.columns = table.iloc[1]
table = table[2:]
table = table.drop(['Zip Code Map', 'County'], axis=1).reset_index(drop=True)
print(table)
Try collecting each table row inside the for loop and building the DataFrame at the end (DataFrame.append is deprecated and was removed in pandas 2.0, so appending row by row is best avoided):
rows = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
    if len(row) >= 2:  # header rows have no td cells
        rows.append({"Zip Code": row[0],  # assuming first column
                     "City": row[1]})     # assuming second column
df = pd.DataFrame(rows)
If you only need these two columns, you should not include title in the DataFrame (that would create another column); that line also happens to be where the syntax error occurs, because of the missing comma.
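For completeness, a corrected sketch of the original construction — the zip_codes and cities lists here are hypothetical stand-ins for values collected from the scraped rows:
# Hypothetical lists gathered while looping over table_rows:
zip_codes = ['75001', '75006']
cities = ['CityA', 'CityB']

texas_info = pd.DataFrame({
    "Zip Code": zip_codes,  # note the comma after every entry
    "City": cities,
})
texas_info.head()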

How do I enable the REFS_OK flag in nditer in numpy in Python 3.3?

Does anyone know how one goes about enabling the REFS_OK flag in numpy? I cannot seem to find a clear explanation online.
My code is:
import sys
import string
import numpy as np
import pandas as pd

SNP_df = pd.read_csv('SNPs.txt', sep='\t', index_col=None, header=None, nrows=101)
output = open('100 SNPs.fa', 'a')
for i in SNP_df:
    data = SNP_df[i]
    data = np.array(data)
    for j in np.nditer(data):
        if j == 0:
            output.write(("\n>%s\n") % (str(data(j))))
        else:
            output.write(data(j))
I keep getting the error message: Iterator operand or requested dtype holds references, but the REFS_OK flag was not enabled.
I cannot work out how to enable the REFS_OK flag so the program can continue.
I have isolated the problem. There is no need to use np.nditer; the main problem was me misinterpreting how Python reads iterator variables in a for loop. The corrected code is below.
import sys
import string
import fileinput
import numpy as np
import pandas as pd  # this import was missing from the original snippet

SNP_df = pd.read_csv('datafile.txt', sep='\t', index_col=None, header=None, nrows=5000)
output = open('outputFile.fa', 'a')
for i in range(1, 51):
    data = SNP_df[i]
    data = np.array(data)
    for j in range(0, 1):
        output.write(("\n>%s\n") % (str(data[j])))
    for k in range(1, len(data)):
        output.write(str(data[k]))
If you really want to enable the flag, I have a working example.
Python 2.7, numpy 1.14.2, pandas 0.22.0:
import pandas as pd
import numpy as np

# get all the data as a pandas DataFrame
data = pd.read_csv("./monthdata.csv")
print(data)

# get the values as a numpy array
data_ar = data.values  # numpy.ndarray; every element is a row
for row in data_ar:
    print(row)
    sum = 0
    count = 0
    # the flag must be spelled 'refs_ok' (lowercase)
    for month in np.nditer(row, flags=["refs_ok"], op_flags=["readwrite"]):
        print month
        sum += month
        count += 1
    print sum / count  # average of the row (assuming numeric values)
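For reference (my addition, not from the original answer), a minimal Python 3 sketch of when refs_ok is actually needed: nditer refuses object-dtype arrays unless the flag is set, because their elements hold Python references.
import numpy as np

arr = np.array(['Jan', 'Feb', 'Mar'], dtype=object)  # object dtype holds references

# Without flags=['refs_ok'] the next line raises:
#   TypeError: Iterator operand or requested dtype holds references, but the REFS_OK flag was not enabled
for item in np.nditer(arr, flags=['refs_ok']):
    print(item.item())  # .item() unwraps the 0-d array back to the Python object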