How can I fix 'list' object has no attribute 'to_csv' issue? - selenium

from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys # so we can send an Enter keypress via Keys
browser = webdriver.Chrome("C:/Users/EMRE/Desktop/SCRAPE/chromedriver_win32/chromedriver.exe")
import pandas as pd
browser.get("http://event.ybu.edu.tr/kulupler/")
import csv
#browser.fullscreen_window()
#time.sleep(2)
#for i in range(6):
#browser.execute_script('window.scrollTo(0,document.body.scrollHeight)') #to scroll the page down
#time.sleep(1)
Kulup_button = browser.find_element_by_xpath("/html/body/div[2]/div[2]/div[1]/a/div/div[1]/div") #for the first club; change this later
Kulup_button.click()
time.sleep(1)
for i in range(1):
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)') #to scroll the page down
    time.sleep(1)
kulupnames = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[1]/td[2]")
kulupList=[]
for kulupname in kulupnames:
    kulupList.append(kulupname.text)
mails = browser.find_elements_by_css_selector("#bilgiler > a.btn.bg-orange.btn-social")
MailList=[]
for mail in mails:
    MailList.append(mail.text)
FacebookAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.bg-blue.btn-social")
FacebookList=[]
for FacebookAdress in FacebookAdresses:
    FacebookList.append(FacebookAdress.text)
TwitterAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.btn-social.bg-aqua")
TwitterList=[]
for TwitterAdress in TwitterAdresses:
    TwitterList.append(TwitterAdress.text)
InstagramAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.btn-social.bg-light-blue")
InstagramList=[]
for InstagramAdress in InstagramAdresses:
    InstagramList.append(InstagramAdress.text)
AkademikDanismanlar = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[2]/td[2]")
DanismanList=[]
for AkademikDanisman in AkademikDanismanlar:
    DanismanList.append(AkademikDanisman.text)
KulupBaskanlari = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[3]/td[2]")
BaskanList=[]
for KulupBaskani in KulupBaskanlari:
    BaskanList.append(KulupBaskani.text)
ToplamUyeler = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[4]/td[2]")
UyeList=[]
for Uye in ToplamUyeler:
    UyeList.append(Uye.text)
Etkinlikler = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[5]/td[2]")
EtkinlikList=[]
for Etkinlik in Etkinlikler:
    EtkinlikList.append(Etkinlik.text)
time.sleep(5)
browser.quit()
DataFile = csv.writer(open('AYBU.csv','w'))
DataFile.writerow(['KulupAdi','MailAdresi','FacebookAdresi','TwitterAdresi','InstagramAdres','AkademikDanisman','KulupBaskani','ToplamUyeSayisi','ToplamEtkinlikSayisi'])
DataFile.to_csv("AYBU.csv", index = False, encoding='utf-8-sig')
liste = ['kulupList','MailList','FacebookList','TwitterList','InstagramList','DanismanList','BaskanList','UyeList','EtkinlikList']
df = pd.DataFrame(data = liste)
liste.to_csv("AYBU.csv", index = False, encoding='utf-8-sig')
I am trying to save my variable lists as a dataframe to CSV.

You have a couple of flaws in your code that I can see.
I took your code, made it work, and I'll explain how:
import csv
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys # so we can send an Enter keypress via Keys
browser = webdriver.Chrome()
time.sleep(5)
browser.get("http://event.ybu.edu.tr/kulupler/")
Kulup_button = browser.find_element_by_xpath("/html/body/div[2]/div[2]/div[1]/a/div/div[1]/div") #for the first club; change this later
Kulup_button.click()
time.sleep(1)
for _ in range(1):
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)') #to scroll the page down
    time.sleep(1)
kulupnames = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[1]/td[2]")
kulupList = [kulupname.text for kulupname in kulupnames]
mails = browser.find_elements_by_css_selector("#bilgiler > a.btn.bg-orange.btn-social")
MailList = [mail.text for mail in mails]
FacebookAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.bg-blue.btn-social")
FacebookList = [FacebookAdress.text for FacebookAdress in FacebookAdresses]
TwitterAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.btn-social.bg-aqua")
TwitterList = [TwitterAdress.text for TwitterAdress in TwitterAdresses]
InstagramAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.btn-social.bg-light-blue")
InstagramList = [InstagramAdress.text for InstagramAdress in InstagramAdresses]
AkademikDanismanlar = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[2]/td[2]")
DanismanList = [
    AkademikDanisman.text for AkademikDanisman in AkademikDanismanlar
]
KulupBaskanlari = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[3]/td[2]")
BaskanList = [KulupBaskani.text for KulupBaskani in KulupBaskanlari]
ToplamUyeler = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[4]/td[2]")
UyeList = [Uye.text for Uye in ToplamUyeler]
Etkinlikler = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[5]/td[2]")
EtkinlikList = [Etkinlik.text for Etkinlik in Etkinlikler]
time.sleep(5)
browser.quit()
with open('AYBU.csv','w') as datafile:
    DataFile = csv.writer(datafile)
    DataFile.writerow(['KulupAdi','MailAdresi','FacebookAdresi','TwitterAdresi','InstagramAdres','AkademikDanisman','KulupBaskani','ToplamUyeSayisi','ToplamEtkinlikSayisi'])
liste = [kulupList,MailList,FacebookList,TwitterList,InstagramList,DanismanList,BaskanList,UyeList,EtkinlikList]
df = pd.DataFrame(data = liste)
df.to_csv("AYBU.csv", index = False, encoding='utf-8-sig')
The key changes here are at the bottom (don't mind the clean-up of the loops into list comprehensions).
DataFile = csv.writer(open('AYBU.csv','w'))
DataFile.writerow(['KulupAdi','MailAdresi','FacebookAdresi','TwitterAdresi','InstagramAdres','AkademikDanisman','KulupBaskani','ToplamUyeSayisi','ToplamEtkinlikSayisi'])
DataFile.to_csv("AYBU.csv", index = False, encoding='utf-8-sig')
liste = ['kulupList','MailList','FacebookList','TwitterList','InstagramList','DanismanList','BaskanList','UyeList','EtkinlikList']
df = pd.DataFrame(data = liste)
liste.to_csv("AYBU.csv", index = False, encoding='utf-8-sig')
This code doesn't work.
with open('AYBU.csv','w') as datafile:
    DataFile = csv.writer(datafile)
    DataFile.writerow(['KulupAdi','MailAdresi','FacebookAdresi','TwitterAdresi','InstagramAdres','AkademikDanisman','KulupBaskani','ToplamUyeSayisi','ToplamEtkinlikSayisi'])
liste = [kulupList,MailList,FacebookList,TwitterList,InstagramList,DanismanList,BaskanList,UyeList,EtkinlikList]
df = pd.DataFrame(data = liste)
df.to_csv("AYBU.csv", index = False, encoding='utf-8-sig')
You had the list names as strings ('kulupList' instead of kulupList, and so on).
A pandas DataFrame has a .to_csv method, but a csv.writer (and a plain Python list) does not.
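For clarity, here's a minimal sketch of that difference, reusing two of the lists scraped above (either approach writes the file on its own; pick one):
import csv
import pandas as pd

# csv.writer only has writerow/writerows; it has no to_csv method
with open('AYBU.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['KulupAdi', 'MailAdresi'])
    writer.writerows(zip(kulupList, MailList))

# a pandas DataFrame does have to_csv; build it from the actual list objects, not their names as strings
df = pd.DataFrame({'KulupAdi': kulupList, 'MailAdresi': MailList})
df.to_csv('AYBU.csv', index=False, encoding='utf-8-sig')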

Related

AttributeError:'str' object has no attribute 'unique' (Pandas.unique)

In my script, I use the pandas module. When I execute my file.py, everything works well. But after converting file.py to file.exe with auto-py-to-exe, I got an error: AttributeError: 'str' object has no attribute 'unique'. It's strange because it worked normally before. The line where the error occurs is: wells=list(file[0].unique()). If anyone knows this issue, please help.
import tkinter as tk
import tkinter.filedialog as fd
import pandas as pd
import os
import sys
import datetime
from datetime import datetime, date
import numpy as np
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 80)
pd.set_option('display.max_rows', 200)
pd.set_option('display.width', 800)
def resource_path(relative_path):
    try:
        base_path = sys._MEIPASS
    except Exception:
        base_path = os.path.abspath(".")
    return os.path.join(base_path, relative_path)
def open():
    global file_excel, name
    file_excel = fd.askopenfilename(initialdir='/Desktop', title='Открыть файл', filetypes = [("Excel", "*.xlsx")])
    name = os.path.basename(file_excel)
    name = os.path.splitext(name)[0]
    file_excel = pd.read_excel(file_excel, skiprows=[0], header=None)
    win.destroy()
    return file_excel, name
win = tk.Tk()
path = resource_path("image.png")
photo = tk.PhotoImage(file=path)
win.iconphoto(False, photo)
win.config(bg='#FFC')
win.title('Конвертация в формат .ev')
win.geometry('400x130+500+500')
win.resizable(False, False)
label_1 = tk.Label(win, text = 'Выберите файл с испытаниями скважин:',
                   bg = '#FFC',
                   font=('Arial', 10, 'bold'),
                   padx=20,
                   pady=10).pack()
btn_1 = tk.Button(win, text = 'Выбрать Excel',
                  command = open,
                  activebackground = '#6F6',
                  font=('Arial', 12, 'bold'),
                  padx=20,
                  pady=10,
                  relief = tk.RAISED,
                  bd=2).pack()
win.mainloop()
wells=list(file_excel[0].unique())
file_excel[1] = pd.to_datetime(file_excel[1], errors='coerce').dt.strftime("%d/%m/%Y")
file_excel[4] = np.where(file_excel[1].str, 'Perforation', np.nan)
file_excel.iloc[:,[2,3]]=file_excel.iloc[:,[2,3]].abs()
col_list = list(file_excel)
col_list[4], col_list[2] = col_list[2], col_list[4]
file_excel.columns = col_list
Perforation=pd.DataFrame(data=None)
for i in wells:
    well_name = pd.DataFrame({'WELLNAME '+i}, columns=[1])
    Perforation = Perforation.append(well_name)
    Perforation = Perforation.append(file_excel.iloc[:,[1,2,3,4]][file_excel.iloc[:,0]==i])
    Perforation = Perforation.append(pd.Series(dtype = 'object'), ignore_index=True)
def SaveFile():
    Save = fd.asksaveasfile(mode='w', defaultextension=".ev", initialfile=name)
    Save.write(Perforation.to_string(index=False, header=False, na_rep=' '))
    win.destroy()
win = tk.Tk()
path = resource_path("image.png")
photo = tk.PhotoImage(file=path)
win.iconphoto(False, photo)
win.config(bg='#FFC')
win.title('Конвертация в формат .ev')
win.geometry('400x130+500+500')
win.resizable(False, False)
label_1 = tk.Label(win, text = 'Сохранение:',
                   bg = '#FFC',
                   font=('Arial', 10, 'bold'),
                   padx=20,
                   pady=10).pack()
btn_1 = tk.Button(win, text = 'Сохранить как',
                  command = SaveFile,
                  activebackground = '#6F6',
                  font=('Arial', 12, 'bold'),
                  padx=20,
                  pady=10,
                  relief = tk.RAISED,
                  bd=2).pack()
win.mainloop()
(Screenshots from the question: type of file[0], and the error screen.)
When I created the virtual environment I should have added the openpyxl module. I installed it and everything is fine now.
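If anyone else hits this after packaging with auto-py-to-exe, a quick sanity check (just a sketch; the file name here is an example, not from the original script) is to make the openpyxl dependency explicit, since pandas relies on it for .xlsx files:
import pandas as pd
# pandas delegates .xlsx parsing to openpyxl; if openpyxl is missing from the packaged
# environment, the Excel file may never become a DataFrame and later calls such as
# .unique() can end up running against the wrong object
file_excel = pd.read_excel("wells.xlsx", skiprows=[0], header=None, engine="openpyxl")
print(type(file_excel))  # expect <class 'pandas.core.frame.DataFrame'>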

How can I scrape data from new tab?

browser.get("http://event.ybu.edu.tr/kulupler/") #main url
time.sleep(1)
browser.execute_script("window.open('http://event.ybu.edu.tr/kulup/afak', 'new window')") #open new tab
for i in range(1):
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)') #to scroll the page down
    time.sleep(1)
kulupnames = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[1]/td[2]")
kulupList=[]
for kulupname in kulupnames:
    kulupList.append(kulupname.text)
mails = browser.find_elements_by_css_selector("#bilgiler > a.btn.bg-orange.btn-social")
MailList=[]
for mail in mails:
    MailList.append(mail.text)
FacebookAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.bg-blue.btn-social")
FacebookList=[]
for FacebookAdress in FacebookAdresses:
    FacebookList.append(FacebookAdress.text)
TwitterAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.btn-social.bg-aqua")
TwitterList=[]
for TwitterAdress in TwitterAdresses:
    TwitterList.append(TwitterAdress.text)
InstagramAdresses = browser.find_elements_by_css_selector("#bilgiler > a.btn.btn-social.bg-light-blue")
InstagramList=[]
for InstagramAdress in InstagramAdresses:
    InstagramList.append(InstagramAdress.text)
AkademikDanismanlar = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[2]/td[2]")
DanismanList=[]
for AkademikDanisman in AkademikDanismanlar:
    DanismanList.append(AkademikDanisman.text)
KulupBaskanlari = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[3]/td[2]")
BaskanList=[]
for KulupBaskani in KulupBaskanlari:
    BaskanList.append(KulupBaskani.text)
ToplamUyeler = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[4]/td[2]")
UyeList=[]
for Uye in ToplamUyeler:
    UyeList.append(Uye.text)
Etkinlikler = browser.find_elements_by_xpath("/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[5]/td[2]")
EtkinlikList=[]
for Etkinlik in Etkinlikler:
    EtkinlikList.append(Etkinlik.text)
time.sleep(2)
browser.quit()
#DataFile = csv.writer(open('AYBU.csv','w'))
#DataFile.writerow(['KulupAdi','MailAdresi','FacebookAdresi','TwitterAdresi','InstagramAdresi','AkademikDanisman','KulupBaskani','ToplamUyeSayisi','ToplamEtkinlikSayisi'])
#DataFile.to_csv("AYBU.csv", index = False, encoding='utf-8-sig')
liste = {'kulupList':kulupList,'MailList':MailList,'FacebookList':FacebookList,'TwitterList':TwitterList,'InstagramList':InstagramList,'DanismanList':DanismanList,'BaskanList':BaskanList,'UyeList':UyeList,'EtkinlikList':EtkinlikList}
df = pd.DataFrame(data = liste)
df.to_csv("AYBU.csv", index=False, encoding='utf-8-sig')
The general process:
browser.get("http://event.ybu.edu.tr/kulupler/") #main url
# Open a new window
browser.execute_script("window.open('');")
# Switch to the new window
browser.switch_to.window(browser.window_handles[1])
browser.get('http://event.ybu.edu.tr/kulup/afak')
:
:
# close the active tab
browser.close()
time.sleep(3)
# Switch back to the first tab
browser.switch_to.window(browser.window_handles[0])
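Putting that together with the scraping from the question, a minimal sketch (same URLs and XPath as above, sticking to the old find_elements_by_* API used in the rest of this code):
browser.get("http://event.ybu.edu.tr/kulupler/")          # main listing
browser.execute_script("window.open('');")                # open a blank tab
browser.switch_to.window(browser.window_handles[1])       # the driver now talks to the new tab
browser.get("http://event.ybu.edu.tr/kulup/afak")         # load the club page in that tab
kulupnames = browser.find_elements_by_xpath(
    "/html/body/div[2]/div[2]/section/div/div[2]/div/div[1]/div/div[1]/table/tbody/tr[1]/td[2]")
kulupList = [k.text for k in kulupnames]                  # scrape while the new tab is active
browser.close()                                           # close the club tab
browser.switch_to.window(browser.window_handles[0])       # back to the listing tab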

Why is beautifulsoup not parsing a simple Wikipedia Table

To help fight covid19 here in the Philippines, I'm trying to do data analysis. My data source is a table of incidences on Wikipedia. See https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_the_Philippines
I tried to get the table in Python with BeautifulSoup, but I cannot seem to get the content of the columns [Facility of admission or consultation, Had recent travel history abroad]. See screenshot:
What am I doing wrong?
Here's my code (it can also be found here: https://github.com/gio888/covid19_ph2/blob/master/covid_import_from_wikipedia.ipynb):
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = "https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_pandemic_data/Philippines_medical_cases_summary"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table', class_='wikitable')
n_columns = 0
n_rows=0
column_names = []
for row in table.find_all('tr'):
    td_tags = row.find_all('td')
    if len(td_tags) > 0:
        n_rows += 1
        if n_columns == 0:
            n_columns = len(td_tags)
    th_tags = row.find_all('th')
    if len(th_tags) > 0 and len(column_names) == 0:
        for th in th_tags:
            column_names.append(th.get_text())
columns = column_names if len(column_names) > 0 else range(0, n_columns)
df = pd.DataFrame(columns = columns, index = range(0, n_rows))
row_marker = 0
for row in table.find_all('tr'):
    column_marker = 0
    columns = row.find_all('td')
    for column in columns:
        df.iat[row_marker, column_marker] = column.get_text()
        column_marker += 1
    if len(columns) > 0:
        row_marker += 1
for col in df:
    try:
        df[col] = df[col].astype(float)
    except ValueError:
        pass
df
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
driver.get(
    "https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_the_Philippines")
items = [["yes", "Yes"], ["no", "No"], ["TBA", "TBA"],
         ["status-d", "Died"], ["status-r", "Recovered"], ["status-a", "Admitted"]]
for item in items:
    script = (
        "document.querySelectorAll('.{}').forEach((element) => element.innerHTML = '{}')".format(*item))
    driver.execute_script(script)
df = pd.read_html(driver.page_source)[2]
df.to_csv("data.csv", index=False)
driver.quit()
The idea is to rewrite the cells that are marked only with a CSS class (yes, no, TBA, status-d, status-r, status-a) into readable text before handing the rendered page to pandas.read_html, which is presumably why the plain BeautifulSoup pass came back with empty columns. Output: (external link omitted)
import pandas as pd
import requests
from bs4 import BeautifulSoup
#url = "https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_pandemic_data/Philippines_medical_cases_summary"
css_content = {
    'status-a': 'Admitted',
    'status-r': 'Recovered',
    'status-d': 'Died',
    'yes': 'Yes',
    'no': 'No',
    'tba': 'TBA',
    "covid-sticky": 'skip_header'
}
def Check_att(source, value, attribute='class'):
    # <tag att='value'> <td class='x'>
    if col_value: return col_value
    if value in source.attrs.get(attribute, []):
        return css_content.get(value, '')
    return ''
url = 'https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_the_Philippines'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table', class_='wikitable')
column_names = [col_name.text.rstrip('\n').strip() for col_name in table.select('tr.covid-sticky > th')]
n_rows = len(table.select('tr > td'))
df = pd.DataFrame(columns = column_names,index= range(0,n_rows))
for row_index, row in enumerate(table.find_all('tr')[1:], 0):
    # if Check_att(row,"covid-sticky") :continue
    columns = row.find_all('td')
    for col_index, column in enumerate(columns, 0):
        col_value = ''
        col_value = Check_att(column, 'status-a')
        col_value = Check_att(column, 'status-r')
        col_value = Check_att(column, 'status-d')
        col_value = Check_att(column, 'yes')
        col_value = Check_att(column, 'no')
        col_value = Check_att(column, 'tba')
        if not col_value:
            col_value = column.get_text().rstrip('\n').strip()
        df.iat[row_index, col_index] = col_value
for col in df:
    try:
        df[col] = df[col].astype(float)
    except ValueError:
        pass
print(df)
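If you want the same CSV output as the Selenium version, the parsed frame can be written out the same way (a sketch, assuming the df built above):
df.to_csv("data.csv", index=False, encoding="utf-8-sig")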

Pandas DataFrame - How to extract string patterns with hidden characters

I am scraping names, prices and images from this website. There are 8 items in total, but in the DF I would like to filter only the items that contain the pattern "Original Zaino Antifurto". When I try to apply the bp_filter to the DF I get an error, probably due to hidden characters.
Does anyone know how to filter for this pattern avoiding the error?
import requests
from bs4 import BeautifulSoup
import pandas as pd
url_xd = 'https://www.xd-design.com/it-it/catalogsearch/result/?q=Bobby+Original+Zaino+Antifurto'
req_xd = requests.get(url_xd)
pars_xd = BeautifulSoup(req_xd.content, 'html.parser')
con_xd = pars_xd.find_all('div', class_ = 'product details product-item-details')
names_xd = []
prices_xd = []
picts_xd = []
for container in con_xd:
    name = container.find("a", class_="product-item-link").text
    names_xd.append(name)
for container in con_xd:
    price = container.find("span", class_="price").text
    prices_xd.append(price)
for container in con_xd:
    pict = container.find("a").get("href")
    picts_xd.append(pict)
bp_xd = pd.DataFrame({'(XD-Design) Item_Name': names_xd,
                      'Item_Price_EUR': prices_xd,
                      'Link_to_Pict': picts_xd})
bp_xd['Item_Price_EUR'] = bp_xd['Item_Price_EUR'].str.replace('€','').str.replace(',','.').astype(float)
bp_xd['(XD-Design) Item_Name'] = bp_xd['(XD-Design) Item_Name'].str.strip()
bp_filter = bp_xd['(XD-Design) Item_Name'][bp_xd['(XD-Design) Item_Name'].str.contains('Original Zaino Antifurto')]
# bp_xd[bp_filter]
Here you have the fixed, working code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url_xd = 'https://www.xd-design.com/it-it/catalogsearch/result/?q=Bobby+Original+Zaino+Antifurto'
req_xd = requests.get(url_xd)
pars_xd = BeautifulSoup(req_xd.content, 'html.parser')
con_xd = pars_xd.find_all('div', class_ = 'product details product-item-details')
names_xd = [c.find("a", class_="product-item-link").text for c in con_xd]
prices_xd = [c.find("span", class_="price").text for c in con_xd]
picts_xd = [c.find("a").get("href") for c in con_xd]
df = pd.DataFrame({'(XD-Design) Item_Name': names_xd,
                   'Item_Price_EUR': prices_xd,
                   'Link_to_Pict': picts_xd})
df['Item_Price_EUR'] = df['Item_Price_EUR'].str.replace('€','').str.replace(',','.').astype(float)
df['(XD-Design) Item_Name'] = df['(XD-Design) Item_Name'].str.strip()
df = df.loc[df['(XD-Design) Item_Name'].apply(lambda x: 1 if 'Original Zaino Antifurto' in x else 0) == 1]
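Note that bp_filter in the question is the filtered Series of names rather than a boolean mask, which is likely why bp_xd[bp_filter] raises. A sketch of that same last line using a boolean mask directly:
# build a True/False mask per row and select with .loc; regex=False treats the pattern as a plain substring
mask = df['(XD-Design) Item_Name'].str.contains('Original Zaino Antifurto', regex=False)
df = df.loc[mask]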

Email body cut off when adding to dataframe

I am trying to parse email data to a dataframe but the majority of the email body seems to disappear when I view the dataframe.
I have tried printing the body before adding to the dataframe and it appears to be parsed correctly but when I use iloc to add to the dataframe it cuts off.
from bs4 import BeautifulSoup
from html2text import HTML2Text
import pandas as pd
import easyimap
import getpass
import email
import base64
import os
import mimetypes
from datetime import datetime
from email.utils import parsedate_to_datetime
def to_text(html, rehtml=False):
    parser = HTML2Text()
    parser.wrap_links = False
    parser.skip_internal_links = True
    parser.inline_links = True
    parser.ignore_anchors = True
    parser.ignore_images = True
    parser.ignore_emphasis = True
    parser.ignore_links = True
    text = parser.handle(html)
    text = text.strip(' \t\n\r')
    if rehtml:
        text = text.replace('\n', '<br/>')
        text = text.replace('\\', '')
    return text
imap_password = getpass.getpass()
user = 'pmccabe#tradevela.com\edc-notifications'
host = 'outlook.office365.com'
password = imap_password
#'
folders = ('"INBOX/Americas/Not Raised"', '"INBOX/Americas/Raised"', '"INBOX/APAC/Not Raised"', '"INBOX/APAC/Raised"',
'"INBOX/Consolidated/Not Raised"', '"INBOX/Consolidated/Raised"', '"INBOX/EMEA"', '"INBOX/EMEA/Not Raised"', '"INBOX/EMEA/Raised"')
df = pd.DataFrame(columns=['Subject','Sender','From','To','Body','References','content_type', 'local_date_time',
'Classification', 'in_reply_to','return_path', 'mime_version', 'message_id', 'folder_name'])
for mailbox in folders:
    #Connect to mailbox, read_only = True to ensure the mail is not marked as read.
    imapper = easyimap.connect(host, user, password, mailbox, read_only=True)
    #fetch each mail up to limit, return email data and add to the dataframe
    for mail_id in imapper.listids(limit=5000):
        try:
            mail = imapper.mail(mail_id, include_raw=True)
            #convert body to text using to_text function and add to dataframe
            df.loc[mail_id, ['Body']] = to_text(mail.body, rehtml=False)
            #return mail features to dataframe
            df.loc[mail_id, ['Subject']] = mail.title
            df.loc[mail_id, ['Sender']] = mail.sender
            df.loc[mail_id, ['From']] = mail.from_addr
            df.loc[mail_id, ['To']] = mail.to
            df.loc[mail_id, ['References']] = mail.references
            df.loc[mail_id, ['content_type']] = mail.content_type
            #converting the date to datetime and taking account of time difference changes
            date_ = mail.date
            df.loc[mail_id, ['local_date_time']] = datetime.fromtimestamp(parsedate_to_datetime(date_).timestamp()).strftime('%Y-%m-%d %H:%M:%S')
            #parsing the keyword data from the raw mail data to provide the classification
            raw_data = mail.raw
            email_message = email.message_from_bytes(raw_data)
            df.loc[mail_id, ['Classification']] = email_message['Keywords']
            df.loc[mail_id, ['in_reply_to']] = mail.in_reply_to
            df.loc[mail_id, ['return_path']] = mail.return_path
            df.loc[mail_id, ['mime_version']] = mail.mime_version
            df.loc[mail_id, ['message_id']] = mail.message_id
            df.loc[mail_id, ['folder_name']] = mailbox
        except:
            #if error, print email to file
            counter = 1
            for part in email_message.walk():
                if part.get_content_maintype() == "multipart":
                    continue
                filename = part.get_filename()
                content_type = part.get_content_type()
                if not filename:
                    ext = mimetypes.guess_extension(content_type)
                    if not ext:
                        ext = '.bin'
                    if 'text' in content_type:
                        ext = '.txt'
                    elif 'html' in content_type:
                        ext = '.html'
                    filename = 'msg-part-%08d%s' %(counter, ext)
                    counter += 1
                #save file
                date_ = datetime.fromtimestamp(parsedate_to_datetime(date_).timestamp()).strftime('%Y-%m-%d %H:%M:%S')
                save_path = os.path.join(os.getcwd(), "emails", date_, mail.title)
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                with open(os.path.join(save_path, filename), 'wb') as fp:
                    fp.write(part.get_payload(decode=True))
                counter += 1
The data frame should contain all of the email body content.
Updating the Jupyter notebook fixed this issue.
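For anyone else seeing the same symptom: the body may not be lost at all, since pandas truncates long strings when a DataFrame is displayed (an assumption worth checking before re-parsing anything). A quick check:
import pandas as pd
# pandas shortens long cell values when printing (display.max_colwidth),
# so a fully parsed body can look cut off in a notebook
pd.set_option('display.max_colwidth', None)   # show full cell contents
print(df['Body'].str.len().describe())        # confirm the stored bodies keep their full length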