Compare two pandas DataFrames

import glob
import fnmatch
import zipfile
import pandas as pd

df1 = pd.read_csv("2016Q12ExactTargetE1.csv", names=['FileName'])
print("\nRead", df1.shape[0], "Records")

# access files in the directory and its subdirectories;
# recursive=True only takes effect when the pattern contains '**'
for filename in glob.glob('c:\\temp\\**\\*.zip', recursive=True):
    zf = zipfile.ZipFile(filename)
    zfl = zf.namelist()
    eml_files = fnmatch.filter(zfl, "*.eml")
    df2 = pd.DataFrame(eml_files)
    print("\nRead2", df2.shape[0], "Records")
The CSV file:
FileName
F0B1F7B371C427E6FDDE1078287A3C71.eml
E107A8CADF8F87B05599A3AAF03D5BA1.eml
30B54778C0B912F2516F6C390A137E91.eml
D06DD3162620490F7E9F8ADD1AE0F621.eml
10E3BAFB831EA97615DBBBF18D601EC1.eml
The eml_files list looks like:
['00E6E77CE9890A3F34343997BCA33791.eml',
'109E4F29239EA8259707B2E3D0D00351.eml',
'403EBEC70C1F305B72EFAA3822D75871.eml',
'30B54778C0B912F2516F6C390A137E91.eml',
'E107A8CADF8F87B05599A3AAF03D5BA1.eml',
'F0B1F7B371C427E6FDDE1078287A3C71.eml',
'00654E78278B0BBDFBF29BAEA3F61051.eml',
'10E3BAFB831EA97615DBBBF18D601EC1.eml',
'30295A4958D6787060A9BD30ABA3BD81.eml',
'712FE30B1D680ACF5F5194E05E7AFCC1.eml',
'80E928FB95A365F85AE1A99DC8418061.eml',
'91681F0020EAC9AC7F010E917CD72F51.eml',
'C0542641286DE272AB1FAEF954BA1951.eml',
'D06DD3162620490F7E9F8ADD1AE0F621.eml',
'214C558DD0ABCAC2EA3BE06DE95E0811.eml',
'4101E93C02FBA028CEA078B9A3542B01.eml',
'51159C8E5965890AE7356E92BC1C6921.eml',
'50775947EFD5010C3D5EA799F36029A1.eml']
How can I compare the two DataFrames df1 and df2?
Thank you
I tried
df3 = df1.compare(df2, keep_equal=True)
but I get an error:
Can only compare identically-labeled DataFrame objects
because df2 is created from zipfile.namelist(), which is labeled differently from df1, which is read from a CSV.
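One way to do the comparison, sketched under the assumption that the goal is to find which .eml names appear in one source but not the other: df.compare requires identically-labeled frames of the same shape, so an outer merge with indicator=True is a better fit here.

import pandas as pd

# Give df2 the same column name as df1 so the two frames can be merged.
df2.columns = ['FileName']

# An outer merge with indicator=True labels every row as 'both',
# 'left_only' (only in the CSV) or 'right_only' (only in the zip).
df3 = df1.merge(df2, on='FileName', how='outer', indicator=True)

print(df3[df3['_merge'] == 'left_only'])   # in the CSV but not in the zip
print(df3[df3['_merge'] == 'right_only'])  # in the zip but not in the CSV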

Related

Problem with merging multiple Excel files in Python

My dtype changes after I uncomment the foo lambda and the groupby, and I get the error # we require a list, but not a 'str'.
What I want in my final output: whenever the value in the first column (Date) is the same, the text from the third column should be appended there after a ',' sign.
import os
import pandas as pd

data_file_folder = r'.\Data'  # raw string so the backslash is not treated as an escape
df = []
for file in os.listdir(data_file_folder):
    if file.endswith('.xlsx'):
        print('Loading File {0}...'.format(file))
        df.append(pd.read_excel(os.path.join(data_file_folder, file), sheet_name='Sheet1'))
df_master = pd.concat(df, axis=0)
df_master['Date'] = df_master['Date'].dt.date
#foo = lambda a: ", ".join(a)
#df_master = df_master.groupby(by='Date').agg({'Tweet': foo}).reset_index()
#df_master.to_excel(r'.\NewFolder\example.xlsx', index=False)
#df_master
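A sketch of a possible fix, assuming the error comes from non-string values (e.g. NaN) in the Tweet column: cast the column to str before joining, after which the commented-out groupby pattern works as intended.

# Join all tweets sharing the same Date into one comma-separated string.
# Assumes df_master has 'Date' and 'Tweet' columns as in the code above.
df_master['Tweet'] = df_master['Tweet'].astype(str)  # ", ".join needs strings
df_master = (
    df_master.groupby('Date')['Tweet']
    .agg(', '.join)  # same effect as the commented-out foo lambda
    .reset_index()
)
df_master.to_excel(r'.\NewFolder\example.xlsx', index=False)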

Read web content into a dataframe without writing to a file

I am trying to read data from the following link into a data frame without saving it locally (this is important). I figured out a way (below), but is there a more efficient way to do this?
from urllib.request import urlopen
from io import StringIO
from datetime import datetime
import pandas as pd

uri = 'https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4'
data = urlopen(uri, timeout=300).read().decode("utf-8")
dateparse = lambda x: datetime.strptime(x.strip(), '%Y-%m-%d %H:%M')
str1 = data.split('\n')
dfList = []
for ii in range(1, len(str1)):
    if len(str1[ii]) > 0:
        # Read each line into its own one-row dataframe
        df1 = pd.read_csv(StringIO(str1[ii]), parse_dates=[1], date_parser=dateparse, header=None)
        if not df1.empty:
            df2 = df1.iloc[:, 0:3]  # Keep the first three columns
            if df2.iloc[0, -1] != 'M':  # Don't append the ones with missing data
                dfList.append(df2)
df = pd.concat(dfList, axis=0, ignore_index=True)
df.columns = ['Station', 'Date', 'Temp']
ax1 = df.plot(x=1, y=2)
ax1.get_figure().autofmt_xdate()
Using requests, pandas and io:
from io import StringIO
import pandas as pd
import requests
url = (
    "https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
    "station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&"
    "month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&"
    "elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4"
)

with requests.Session() as session:
    response = session.get(url, timeout=30)
    response.raise_for_status()  # raises an HTTPError for 4xx/5xx responses
    df = pd.read_csv(StringIO(response.text), sep=",")

print(df)
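Since pd.read_csv can fetch an http(s) URL by itself, an even shorter variant may work for this endpoint (a sketch, assuming the server returns plain CSV for this query, as it does above):

import pandas as pd

# read_csv downloads the URL itself; this is the same query string built above.
df = pd.read_csv(
    "https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
    "station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&"
    "month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&"
    "elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4"
)
print(df)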

How to create multiple pandas profiling reports for multiple csv files in a directory? The report name should match the file name

I tried this,
import glob
import pandas as pd
from pandas_profiling import ProfileReport

files = glob.glob(r"D:\home_health_services_current_data\*.csv")
df = pd.DataFrame()
for f in files:
    csv = pd.read_csv(f)
    df = df.append(csv)  # stacks every CSV into one combined frame
profile = ProfileReport(df, title="Profiling Report", explorative=True)
profile.to_file(r"D:\proj_report\profilerep\prof_report.html")
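This builds a single report because every CSV is appended into one frame. A sketch of one way to get one report per file, with each report named after its CSV (the same directories as above are assumed):

import glob
import os
import pandas as pd
from pandas_profiling import ProfileReport

for f in glob.glob(r"D:\home_health_services_current_data\*.csv"):
    df = pd.read_csv(f)
    name = os.path.splitext(os.path.basename(f))[0]  # 'foo' from 'D:\...\foo.csv'
    profile = ProfileReport(df, title=f"Profiling Report: {name}", explorative=True)
    profile.to_file(rf"D:\proj_report\profilerep\{name}.html")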

Concatenate CSVs into XLSX files based on filename (pandas)

I have a bunch of CSVs with names '<3-letter-string> YYYY.csv'. There are four different versions of <3-letter-string>, and I want to sort the CSVs into four XLSX files, each identified by that three-letter string.
My code:
import pandas as pd
import os

full_df = pd.DataFrame()
for filename in os.listdir('C:/Users/XXXXXX/ZZZZZZ'):
    if filename.endswith(".csv"):
        print(filename)
        df = pd.read_csv(filename, skiprows=1, names=['ID', 'Units Sold', 'Retail Dollars'])
        df['Year'] = filename[-8:-4]
        full_df = pd.concat([full_df, df])
        full_df.to_excel(filename[0:3] + '.xlsx', index=False)
This makes four different XLSX files, which is what I want, but each one is a mixture of the different CSVs.
How do I tell pandas to group them into four separate XLSX files according to the filename? My initial thought is to include filename slicing in the penultimate line and create four different concatenated full_df dataframes to write separately, but I'm not sure how.
import pandas as pd
import os

def Get_Yo_Fantasy_Hennnnnyyyyy():
    full_df = pd.DataFrame()
    for filename in os.listdir("path"):
        if filename.endswith(".csv"):
            print(filename)
            df = pd.read_csv(
                filename,
                skiprows=1,
                names=["ID", "Units Sold", "Retail Dollars"])
            df["Year"] = filename[-8:-4]
            df["Type"] = filename[0:3]
            full_df = pd.concat([full_df, df])
    for i in list(full_df.Type.unique()):
        full_df[full_df.Type.str.contains(i)].to_excel(
            "{}".format(i) + ".xlsx", index=False)

Get_Yo_Fantasy_Hennnnnyyyyy()
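A slightly more direct variant of the final loop, sketched here as an alternative, lets groupby split the frame; it also avoids str.contains accidentally matching one prefix inside another:

# Write one xlsx per three-letter prefix; full_df is the frame built above.
for prefix, group in full_df.groupby("Type"):
    group.to_excel(f"{prefix}.xlsx", index=False)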

How to use pd.DataFrame method to manually create a dataframe from info scraped using beautifulsoup4

I made it to the point where all the tr data has been scraped and I am able to get a nice printout. But when I go to implement pd.DataFrame, as in df = pd.DataFrame({"A": a}) etc., I get a syntax error.
Here is a list of my imported libraries in the Jupyter Notebook:
import pandas as pd
import numpy as np
import bs4 as bs
import requests
import urllib.request
import csv
import html5lib
from pandas.io.html import read_html
import re
Here is my code:
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs.BeautifulSoup(source, 'html.parser')
table_rows = soup.find_all('tr')
table_rows
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
texas_info = pd.DataFrame({
    "title": Texas
    "Zip Code": [Zip Code],
    "City": [City],
})
texas_info.head()
I expect to get a dataframe with two columns, one being the 'Zip Code' and the other being the 'City'.
If you want to create the DataFrame manually, then with bs4 4.7.1 you can use the :not, :contains and :nth-of-type pseudo-classes to isolate the two columns of interest, construct a dict, and convert it to a DataFrame:
import pandas as pd
import urllib
from bs4 import BeautifulSoup as bs
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs(source,'lxml')
zips = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(1):not([colspan])')]
cities = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(2):not([colspan])')]
d = {'Zips': zips,'Cities': cities}
df = pd.DataFrame(d)
df = df[1:].reset_index(drop = True)
You could combine selectors into one line:
import pandas as pd
import urllib
from bs4 import BeautifulSoup as bs
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs(source,'lxml')
items = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(1):not([colspan]), .inner_table:contains(Texas) td:nth-of-type(2):not([colspan])')]
d = {'Zips': items[0::2],'Cities': items[1::2]}
df = pd.DataFrame(d)
df = df[1:].reset_index(drop = True)
print(df)
I note you want to create the DataFrame manually, but it is worth knowing for future readers that you could just use pandas read_html:
import pandas as pd
table = pd.read_html('https://www.zipcodestogo.com/Texas/')[1]
table.columns = table.iloc[1]
table = table[2:]
table = table.drop(['Zip Code Map', 'County'], axis=1).reset_index(drop=True)
print(table)
Try creating an empty DataFrame, then use the for loop to append each row of the table to it.
df = pd.DataFrame()
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
    zipCode = row[0]  # assuming the first column
    city = row[1]     # assuming the second column
    # note: DataFrame.append was removed in pandas 2.0; newer code would
    # collect the rows in a list and build the frame once with pd.DataFrame
    df = df.append({"Zip Code": zipCode, "City": city}, ignore_index=True)
If you only need these two columns, you should not include title in the DataFrame (that will create another column); that line also happened to be where the syntax error occurred because of the missing comma.
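For completeness, a corrected manual construction might look like this (a sketch; it also skips header rows, which have no td cells and would otherwise raise an IndexError):

# Collect the two columns while looping over the rows, then build the frame once.
zip_codes, cities = [], []
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    if len(row) >= 2:  # header/spacer rows have no td cells
        zip_codes.append(row[0])
        cities.append(row[1])

texas_info = pd.DataFrame({"Zip Code": zip_codes, "City": cities})
texas_info.head()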