So I am trying to scrape job posts from Glassdoor using Requests, Beautiful Soup and Selenium. The entire code works, except that even after scraping data from 30 pages, most entries turn out to be duplicates (almost 80% of them!). It's not a headless scraper, so I know it is navigating to each new page. What could be the reason for so many duplicate entries? Could it be some sort of anti-scraping tool being used by Glassdoor, or is something off in my code?
The result turns out to be 870 entries, of which a whopping 690 are duplicates!
My code:
def glassdoor_scraper(url):
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(10)

    # Getting to the page where we want to start scraping
    jobs_search_title = driver.find_element(By.ID, 'KeywordSearch')
    jobs_search_title.send_keys('Data Analyst')
    jobs_search_location = driver.find_element(By.ID, 'LocationSearch')
    time.sleep(1)
    jobs_search_location.clear()
    jobs_search_location.send_keys('United States')
    click_search = driver.find_element(By.ID, 'HeroSearchButton')
    click_search.click()

    for page_num in range(1, 10):
        time.sleep(10)
        res = requests.get(driver.current_url)
        soup = BeautifulSoup(res.text, 'html.parser')
        time.sleep(2)

        companies = soup.select('.css-l2wjgv.e1n63ojh0.jobLink')
        for company in companies:
            companies_list.append(company.text)

        positions = soup.select('.jobLink.css-1rd3saf.eigr9kq2')
        for position in positions:
            positions_list.append(position.text)

        locations = soup.select('.css-l2fjlt.pr-xxsm.css-iii9i8.e1rrn5ka0')
        for location in locations:
            locations_list.append(location.text)

        job_post = soup.select('.eigr9kq3')
        for job in job_post:
            salary_info = job.select('.e1wijj242')
            if len(salary_info) > 0:
                for salary in salary_info:
                    salaries_list.append(salary.text)
            else:
                salaries_list.append('Salary Not Found')

        ratings = soup.select('.e1rrn5ka3')
        for index, rating in enumerate(ratings):
            if len(rating.text) > 0:
                ratings_list.append(rating.text)
            else:
                ratings_list.append('Rating Not Found')

        next_page = driver.find_elements(By.CLASS_NAME, 'e13qs2073')[1]
        next_page.click()
        time.sleep(5)

        try:
            close_jobalert_popup = driver.find_element(By.CLASS_NAME, 'modal_closeIcon')
        except:
            pass
        else:
            time.sleep(1)
            close_jobalert_popup.click()
        continue

    #driver.close()
    print(f'{len(companies_list)} jobs found for you!')

    global glassdoor_dataset
    glassdoor_dataset = pd.DataFrame(
        {'Company Name': companies_list,
         'Company Rating': ratings_list,
         'Position Title': positions_list,
         'Location': locations_list,
         'Est. Salary': salaries_list
         })
    glassdoor_dataset.to_csv(r'glassdoor_jobs_scraped.csv')
You're going way too fast; you need to add some waits.
I see you are relying on fixed time.sleep() calls. Try putting explicit waits instead, something like this (use your own conditions; you can also wait for an element to become invisible and then visible again to confirm that you are on the next page). If that still doesn't help, increase your time.sleep() durations.
WebDriverWait(driver, 40).until(expected_conditions.visibility_of_element_located(
    (By.XPATH, '//*[@id="wrapper"]/section/div/div/div[2]/button[2]')))
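For example, to be sure the next page of results has actually rendered before parsing, you could wait for a card from the previous page to go stale and for fresh cards to become visible. This is only a rough sketch: it assumes the driver and next_page objects from the question, and the .jobLink class is taken from the question's selectors and may change on Glassdoor's side.

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

old_card = driver.find_element(By.CSS_SELECTOR, '.jobLink')  # a card on the current page
next_page.click()
# Wait for the old card to be detached from the DOM, then for new cards to appear.
WebDriverWait(driver, 40).until(EC.staleness_of(old_card))
WebDriverWait(driver, 40).until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, '.jobLink')))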
I don't think the repetition is due to a code issue - I think Glassdoor just starts cycling results after a while. [If interested, see this gist for some stats - basically, from about the 7th page onward, most of the first-page results seem to be shown on every subsequent page. I did a small test manually, tracking only 5 listings by id, and even directly in an un-automated browser they started repeating after a while.]
My suggestion would be to filter them out before looping to the next page - there is a data-id attribute on the li wrapped around each listing which seems to be a unique identifier. If we collect it alongside the other columns' lists, we can keep only listings we haven't seen yet; just edit the for page_num loop to:
for page_num in range(1, 10):
    time.sleep(10)
    scrapedUrls.append(driver.current_url)

    res = requests.get(driver.current_url)
    soup = BeautifulSoup(res.text, 'html.parser')
    # soup = BeautifulSoup(driver.page_source, 'html.parser')  # no noticeable improvement
    time.sleep(2)

    filteredListings = [
        di for di in soup.select('li[data-id]') if
        di.get('data-id') not in datId_list
    ]
    datId_list += [di.get('data-id') for di in filteredListings]

    companies_list += [
        t.select_one('.css-l2wjgv.e1n63ojh0.jobLink').get_text(strip=True)
        if t.select_one('.css-l2wjgv.e1n63ojh0.jobLink')
        else None for t in filteredListings
    ]
    positions_list += [
        t.select_one('.jobLink.css-1rd3saf.eigr9kq2').get_text(strip=True)
        if t.select_one('.jobLink.css-1rd3saf.eigr9kq2')
        else None for t in filteredListings
    ]
    locations_list += [
        t.select_one(
            '.css-l2fjlt.pr-xxsm.css-iii9i8.e1rrn5ka0').get_text(strip=True)
        if t.select_one('.css-l2fjlt.pr-xxsm.css-iii9i8.e1rrn5ka0')
        else None for t in filteredListings
    ]

    job_post = [
        t.select('.eigr9kq3 .e1wijj242') for t in filteredListings
    ]
    salaries_list += [
        'Salary Not Found' if not j else
        (j[0].text if len(j) == 1 else [s.text for s in j])
        for j in job_post
    ]
    ratings_list += [
        t.select_one('.e1rrn5ka3').get_text(strip=True)
        if t.select_one('.e1rrn5ka3')
        else 'Rating Not Found' for t in filteredListings
    ]
And if you add datId_list to the dataframe, it can serve as a meaningful index:
dfDict = {'Data-Id': datId_list,
          'Company Name': companies_list,
          'Company Rating': ratings_list,
          'Position Title': positions_list,
          'Location': locations_list,
          'Est. Salary': salaries_list
          }
for k in dfDict:
    print(k, len(dfDict[k]))

glassdoor_dataset = pd.DataFrame(dfDict)
glassdoor_dataset = glassdoor_dataset.set_index('Data-Id', drop=True)  # assign back; set_index is not in-place by default
glassdoor_dataset.to_csv(r'glassdoor_jobs_scraped.csv')
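Alternatively (my addition, separate from the filtering approach above), you could keep the original scraping loop unchanged and drop duplicates at the end with pandas, treating rows as duplicates when company, title and location all match:

# Keep only the first occurrence of each (company, title, location) combination.
glassdoor_dataset = glassdoor_dataset.drop_duplicates(
    subset=['Company Name', 'Position Title', 'Location'], keep='first')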
I want to return a dataframe from this function so that it can be used elsewhere (for a Plotly graph, to be exact).
My idea is to take the dataframe I create with points_sum(), save it under the team name, and then use that dataframe in my px.line(dataframe=team_name).
In essence, I want to use the men_points_df variable after I have created it.
def points_sum(team):
    points = 0
    men_points = []
    for index, row in menscore_df.iterrows():
        if row['hometeam'] == team:
            if row['homegoals'] > row['awaygoals']:
                points += 2
            elif row['homegoals'] == row['awaygoals']:
                points += 1
            elif row['homegoals'] < row['awaygoals']:
                points == points
            date = str(row['date'])
            men_points.append([date, points])
        if row['awayteam'] == team:
            if row['homegoals'] < row['awaygoals']:
                points += 2
            elif row['homegoals'] == row['awaygoals']:
                points += 1
            elif row['homegoals'] > row['awaygoals']:
                points == points
            date = str(row['date'])
            men_points.append([date, points])
    men_points_df = pd.DataFrame(men_points, columns=["Date", 'Points'])
    return men_points_df
In Plotly, I am trying to use my new dataframe (men_points_df) as shown below, but I get an "undefined name" error, even though I can print it: for example, test = points_sum("FIF") (FIF is one of the team names) shows the correct dataframe in the console when I type test:
elif pathname == "/page-3":
    return [html.H1('Seasonal performance',
                    style={'textAlign': 'center'}),
            html.Div(
                children=[
                    html.H2('Select team', style={'textAlign': 'center'}),
                    html.Br(),
                    html.Br(),
                    dcc.Dropdown(
                        id='team_dd',
                        options=[{'label': v, 'value': k} for k, v in teams_all.items()],
                    )]),
            dcc.Graph(id="performance_graph")
            ]
@app.callback(
    Output(component_id="performance_graph", component_property="figure"),
    Input(component_id="team_dd", component_property="value")
)
def update_graph(option_selected):
    title = "none selected"
    if option_selected:
        title = option_selected
    line_fig = px.line(
        test,  # <------------ THIS IS THE ISSUE
        title=f"{title}",
        x="Date", y="Points")
    return line_fig
Just call points_sum in the update_graph function, before you use test:
def update_graph(option_selected):
    title = "none selected"
    if option_selected:
        title = option_selected
    # vvv Here vvv
    test = points_sum("FIF")
    line_fig = px.line(
        test,  # THIS IS THE ISSUE
        title=f"{title}",
        x="Date", y="Points")
    return line_fig
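As a follow-up (my suggestion, assuming the dropdown's value is a team code that points_sum() accepts, and relying on px and points_sum from the question), you would normally build the frame from the selected option rather than hard-coding "FIF":

def update_graph(option_selected):
    # Fall back to a default team until the user picks one in the dropdown.
    team = option_selected or "FIF"
    team_df = points_sum(team)
    return px.line(team_df, title=team, x="Date", y="Points")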
I am having trouble appending and deleting rows. My table changes a lot and must be rebuilt often, so this has been a little tricky. All of my information comes from an SQL database. I am loading the results into a pandas DataFrame and then using it to populate the GridTableBase class. I am now trying to append and delete rows, but am having trouble overriding the class. I have been able to somewhat get it to work, but it behaves oddly. For some reason, self.table.AppendRows(row) doesn't work and throws an error. The original was self.table.AppendRow(row), but AppendRow isn't a method, so I had to use a different approach: I have to change a value in order to get the GridTableMessage to realize there has been a change, which is what I am doing with data.iloc[data.shape[0]-1,0] = str(val).
Ideally, I would add/delete the row from the table itself, but I can't figure out how to do that. I have derived most of my code from https://github.com/wxWidgets/Phoenix/blob/master/demo/Grid_MegaExample.py, but a lot of that will not work properly for me.
As of now, I can append a row, but for some reason it appends 2, even though only one has been added to the DataFrame and GetNumberRows is returning the correct count. I assume it has something to do with the way I am accessing the table class. Can anyone provide some clarity?
def rowPopup(self, row, evt):
    """(row, evt) -> display a popup menu when a row label is right clicked"""
    appendID = wx.Window.NewControlId()  # wx.NewId()
    deleteID = wx.Window.NewControlId()  # wx.NewId()
    x = self.GetRowSize(row) / 2
    if not self.GetSelectedRows():
        self.SelectRow(row)
    menu = wx.Menu()
    xo, yo = evt.GetPosition()
    menu.Append(appendID, "Append Row")
    menu.Append(deleteID, "Delete Row(s)")

    def append(event, self=self, row=row):  # event, self=self, row=row
        global data
        #print("Append")
        #self.table.AppendRows(row)
        dlg = wx.TextEntryDialog(self, 'Enter a new Key ID to insert into the ' + str("'") + data.columns[0] + str("'") + ' column.', 'Insert New Record')
        dlg.SetValue("")
        if dlg.ShowModal() == wx.ID_OK:
            #print('You entered: %s\n' % dlg.GetValue())
            val = dlg.GetValue()
            #data[~pd.isnull(data).all(1)].fillna('')
            #data['tables_id'].apply('(g)'.format)
            data.loc[data.iloc[-1].name + 1, :] = ""
            data.iloc[data.shape[0] - 1, 0] = str(val)
            self.Reset()
            #print(data)
            #data = data.append(pd.Series(dtype='object'), ignore_index=True)
            #self.data = DataTable(data)
            #data[~pd.isnull(data).all(1)].fillna('')
            #self.data = DataTable(data)

    def delete(event, self=self, row=row):  # event, self=self, row=row
        global data
        rows = self.GetSelectedRows()
        data.drop(data.index[rows], inplace=True)
        print(data)
        self.Reset()
        #self.table.DeleteRow(row)
        #print(row)
        #print(rows)

    #EVT_MENU(self, appendID, append)
    #EVT_MENU(self, deleteID, delete)
    self.Bind(wx.EVT_MENU, append, id=appendID)
    self.Bind(wx.EVT_MENU, delete, id=deleteID)
    self.PopupMenu(menu, wx.Point(round(x), round(yo)))
    menu.Destroy()
class DataTable(gridlib.GridTableBase):
    def __init__(self, data):
        gridlib.GridTableBase.__init__(self)
        self.headerRows = 1
        if data is None:
            data = pd.DataFrame()
        self.data = data
        print("Instance")
        # Store the row and col length to see if table has changed in size
        self._rows = self.GetNumberRows()
        self._cols = self.GetNumberCols()
        self.odd = gridlib.GridCellAttr()
        self.odd.SetBackgroundColour((217, 217, 217))
        self.even = gridlib.GridCellAttr()
        self.even.SetBackgroundColour((255, 255, 255))

    def GetAttr(self, row, col, kind):
        attr = [self.even, self.odd][row % 2]
        attr.IncRef()
        return attr

    def GetNumberRows(self):
        #print("# Rows:", len(self.data))
        return len(self.data)  # - 1

    def GetTypeName(self, row, col):
        #print(wx.grid.GRID_VALUE_STRING)
        return wx.grid.GRID_VALUE_STRING

    def GetNumberCols(self):
        #print("# Cols:", len(self.data.columns) + 1)
        return len(self.data.columns) + 1
        #return len(self.data.columns)  #+ 1

    def IsEmptyCell(self, row, col):
        return False

    def GetValue(self, row, col):
        if col == 0:
            try:
                return self.data.index[row]
            except:
                print("Row,Col(", row, col, ")", "OOB")
                return ""
        else:
            try:
                return str(self.data.iloc[row, col - 1])
            except:
                print("Row,Col(", row, col, ")", "OOB")
                return ""

    def GetColLabelValue(self, col):
        if col == 0:
            if self.data.index.name is None:
                return 'Index'
            else:
                return self.data.index.name
        return self.data.columns[col - 1]

    def ResetView(self, grid):
        """
        (wxGrid) -> Reset the grid view. Call this to
        update the grid if rows and columns have been added or deleted
        """
        print('Old::', self._rows, self._cols)
        print('New::', self.GetNumberRows(), self.GetNumberCols())
        print(data)
        grid.BeginBatch()
        for current, new, delmsg, addmsg in [
            (self._rows, self.GetNumberRows(), gridlib.GRIDTABLE_NOTIFY_ROWS_DELETED, gridlib.GRIDTABLE_NOTIFY_ROWS_APPENDED),
            (self._cols, self.GetNumberCols(), gridlib.GRIDTABLE_NOTIFY_COLS_DELETED, gridlib.GRIDTABLE_NOTIFY_COLS_APPENDED),
        ]:
            if new < current:
                msg = gridlib.GridTableMessage(self, delmsg, new, current - new)
                #grid.ProcessTableMessage(msg)
                self.GetView().ProcessTableMessage(msg)
                print("OvN:", self._rows, self.GetNumberRows())
                return True
            if new > current:
                msg = gridlib.GridTableMessage(self, addmsg, new - current)
                self.GetView().ProcessTableMessage(msg)
                grid.ProcessTableMessage(msg)
                #self.UpdateValues(grid)
                msg = gridlib.GridTableMessage(self, gridlib.GRIDTABLE_REQUEST_VIEW_GET_VALUES)
                grid.ProcessTableMessage(msg)
                print("OvN:", self._rows, self.GetNumberRows())
        grid.EndBatch()
        self._rows = self.GetNumberRows()
        self._cols = self.GetNumberCols()
        # update the column rendering plugins
        #self._updateColAttrs(grid)

        # XXX
        # Okay, this is really stupid, we need to "jiggle" the size
        # to get the scrollbars to recalibrate when the underlying
        # grid changes.
        h, w = grid.GetSize()
        grid.SetSize((h + 1, w))
        grid.SetSize((h, w))
        grid.ForceRefresh()

    def UpdateValues(self, grid):
        """Update all displayed values"""
        # This sends an event to the grid table to update all of the values
        msg = gridlib.GridTableMessage(self, gridlib.GRIDTABLE_REQUEST_VIEW_GET_VALUES)
        grid.table.ProcessTableMessage(msg)
class DataGrid(gridlib.Grid):
    def __init__(self, parent, data, lc, tc):  # data
        gridlib.Grid.__init__(self, parent, -1)  # ,colnames,-1 # data
        self.lc = lc
        self.tc = tc
        self.table = DataTable(data)
        self.SetTable(self.table, True)
        self.Bind(gridlib.EVT_GRID_LABEL_RIGHT_CLICK, self.OnLabelRightClicked)
        self.Bind(gridlib.EVT_GRID_CELL_RIGHT_CLICK, self.OnCellRightClick)
        self.Bind(gridlib.EVT_GRID_CELL_CHANGED, self.onCellChanged)  # wx.grid

    def Reset(self):
        """reset the view based on the data in the table. Call
        this when rows are added or destroyed"""
        self.table.ResetView(self)

    def OnCellRightClick(self, event):
        print("OnCellRightClick: (%d,%d)\n" % (event.GetRow(), event.GetCol()))

    def OnLabelRightClicked(self, evt):
        row, col = evt.GetRow(), evt.GetCol()
        if row == -1: print("col")  # self.colPopup(col, evt)
        elif col == -1: self.rowPopup(row, evt)

    def rowPopup(self, row, evt):
        """(row, evt) -> display a popup menu when a row label is right clicked"""
        appendID = wx.Window.NewControlId()  # wx.NewId()
        deleteID = wx.Window.NewControlId()  # wx.NewId()
        x = self.GetRowSize(row) / 2
        if not self.GetSelectedRows():
            self.SelectRow(row)
        menu = wx.Menu()
        xo, yo = evt.GetPosition()
        menu.Append(appendID, "Append Row")
        menu.Append(deleteID, "Delete Row(s)")

        def append(event, self=self, row=row):  # event, self=self, row=row
            global data
            #print("Append")
            #self.table.AppendRows(row)
            dlg = wx.TextEntryDialog(self, 'Enter a new Key ID to insert into the ' + str("'") + data.columns[0] + str("'") + ' column.', 'Insert New Record')
            dlg.SetValue("")
            if dlg.ShowModal() == wx.ID_OK:
                val = dlg.GetValue()
                #data[~pd.isnull(data).all(1)].fillna('')
                #data['tables_id'].apply('(g)'.format)
                data.loc[data.iloc[-1].name + 1, :] = ""
                data.iloc[data.shape[0] - 1, 0] = str(val)
                self.Reset()
                #print(data)
                #self.data = DataTable(data)

        def delete(event, self=self, row=row):  # event, self=self, row=row
            global data
            rows = self.GetSelectedRows()
            data.drop(data.index[rows], inplace=True)
            print(data)
            self.Reset()

        self.Bind(wx.EVT_MENU, append, id=appendID)
        self.Bind(wx.EVT_MENU, delete, id=deleteID)
        self.PopupMenu(menu, wx.Point(round(x), round(yo)))
        menu.Destroy()
class MainFrame(wx.Frame):
    def __init__(self, parent, data):  # (self, parent, data):
        wx.Frame.__init__(self, parent, -1, "Varkey Foundation")  # , size=(640,480))
        # Create a panel
        self.p = wx.Panel(self)
        self.Maximize(True)
        # Create blank dataframe
        data = pd.DataFrame()  # pd.DataFrame(np.random.randint(0,100,size=(200, 5)),columns=list('EFGHD')
        #data.reset_index(drop=True, inplace=True)
        self.data = DataTable(data)
        self.nb = wx.Notebook(self.p)
        self.p.SetBackgroundColour(wx.Colour(0, 0, 0))  # 38,38,38
        self.nb.SetBackgroundColour(wx.Colour(58, 56, 56))
        #self.SetBackgroundColour(wx.Colour(255, 255, 56))

        # create the page windows as children of the notebook
        self.page1 = PageOne(self.nb)
        self.page2 = PageTwo(self.nb)
        self.page3 = PageThree(self.nb)

        # add the pages to the notebook with the label to show on the tab
        self.nb.AddPage(self.page1, "Data")
        self.nb.AddPage(self.page2, "Analyze")
        self.nb.AddPage(self.page3, "Change Log")

        # Create fonts
        self.b_font = wx.Font(14, wx.ROMAN, wx.NORMAL, wx.BOLD, True)
        self.lbl_font = wx.Font(14, wx.ROMAN, wx.NORMAL, wx.NORMAL, True)
        self.cb_font = wx.Font(11, wx.SCRIPT, wx.ITALIC, wx.NORMAL, True)
        self.h_font = wx.Font(18, wx.DECORATIVE, wx.ITALIC, wx.BOLD, True)

        # Create username textcontrol <<<<<<<<<<<< Passed to grid class
        self.tc_user = wx.TextCtrl(self.p, value='cmccall95', size=(130, 25))
        self.tc_password = wx.TextCtrl(self.p, value='Achilles95', style=wx.TE_PASSWORD | wx.TE_PROCESS_ENTER, size=(130, 25))
        self.tc_password.Bind(wx.EVT_TEXT_ENTER, self.onLogin)
        self.tc_user.SetFont(self.cb_font)
        self.tc_password.SetFont(self.cb_font)

        # Create change log ListCtrl <<<<<<<<<<<< Passed to grid class
        self.lc_change = wx.ListCtrl(self.p, -1, style=wx.TE_MULTILINE | wx.LC_REPORT | wx.LC_VRULES)
        self.lc_change.InsertColumn(0, "User ID")
        self.lc_change.InsertColumn(1, "Status")
        self.lc_change.InsertColumn(2, "Description")
        self.lc_change.InsertColumn(3, "Date/Time")
        # Set column widths
        self.lc_change.SetColumnWidth(0, 75)
        self.lc_change.SetColumnWidth(1, 75)
        self.lc_change.SetColumnWidth(2, 450)
        self.lc_change.SetColumnWidth(3, 125)

        # Create the grid and continue layout
        self.grid = DataGrid(self.page1, data, self.lc_change, self.tc_user)
        # More layout code...

    def onLoadNewData(self, event):  # This is how I'm replacing the data in my table class
        global data
        self.Freeze()
        if self.combo_table.GetValue():
            # Connect to db
            self.connect_mysql()
            # Determine db table
            self.getTable()
            # Get new data
            sql_query = "SELECT * FROM " + tbl
            self.cursor.execute(sql_query)
            temp = pd.read_sql(sql_query, con=self.db_con)
            temp.reset_index(drop=True, inplace=True)
            data = temp[~pd.isnull(temp).all(1)].fillna('')
            # Create title  #if data:
            if not data.empty:
                self.title.SetLabel(str(self.combo_table.GetValue()))
                print(str(self.combo_table.GetValue()))
                self.grid.Destroy()
                self.grid = DataGrid(self.page1, data, self.lc_change, self.tc_user)
                #self.grid.HideCol(0)
                self.grid.AutoSizeColumns()
                # Insert grid into existing sizer
                self.p1_sizer.Insert(1, self.grid, 1, wx.RIGHT | wx.LEFT | wx.EXPAND, 20)
                self.p1_sizer.Layout()
                # RESIZE
            else:
                print("Error:Dataframe is empty")
            self.close_connection()
        else:
            print('CANT BE BLANK')
        self.Thaw()


if __name__ == '__main__':
    import sys
    app = wx.App()
    frame = MainFrame(None, sys.stdout)  # (None, sys.stdout)
    frame.Show(True)
    app.MainLoop()
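One detail worth noting in the quoted ResetView (an observation on the code above, not a confirmed fix): when rows are appended, the GRIDTABLE_NOTIFY_ROWS_APPENDED message is processed twice, once through self.GetView() and once through grid, which would likely make the grid add the new row twice even though the DataFrame only gained one. A minimal sketch of that branch sending the notification only once:

if new > current:
    msg = gridlib.GridTableMessage(self, addmsg, new - current)
    grid.ProcessTableMessage(msg)  # notify the grid of the appended rows exactly once
    msg = gridlib.GridTableMessage(self, gridlib.GRIDTABLE_REQUEST_VIEW_GET_VALUES)
    grid.ProcessTableMessage(msg)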
For my job, I built a scrapy spider to quickly check in on ~200-500 website landing pages for clues that the pages are not functioning, outside of just 400-style errors. (e.g. check for the presence of "out of stock" on page.) This check happens across approx. 30 different websites under my purview, all of them using the same page structure.
This has worked fine, every day, for 4 months.
Then, about 4 weeks ago, suddenly and without any change to the code, I started getting unpredictable errors:
url_title = response.css("title::text").extract_first()
AttributeError: 'Response' object has no attribute 'css'
If I run this spider, this error will occur with, say... 3 out of 400 pages.
Then, if I immediately run the spider again, those same 3 pages are scraped just fine without error, and 4 totally different pages will return the same error.
Furthermore, if I run the EXACT same spider as below, but replace mapping with just these 7 erroneous landing pages, they are scraped perfectly fine.
Is there something in my code that's not quite right?
I'm going to attach the whole code - sorry in advance! - I just fear that something I might deem superfluous may in fact be the cause. So this is the whole thing, but with sensitive data replaced with ####.
I've checked all of the affected pages, and of course the CSS selector is valid and the title is always present.
I've done sudo apt-get update & sudo apt-get dist-upgrade on the server running scrapy, in hopes that this would help. No luck.
import scrapy
from scrapy import signals
from sqlalchemy.orm import sessionmaker
from datetime import date, datetime, timedelta
from scrapy.http.request import Request
from w3lib.url import safe_download_url
from sqlalchemy import and_, or_, not_
import smtplib
from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText
from sqlalchemy.engine import create_engine
engine = create_engine('mysql://######:#######localhost/LandingPages', pool_recycle=3600, echo=False)
#conn = engine.connect()
from LandingPageVerifier.models import LandingPagesFacebook, LandingPagesGoogle, LandingPagesSimplifi, LandingPagesScrapeLog, LandingPagesScrapeResults
Session = sessionmaker(bind=engine)
session = Session()
# today = datetime.now().strftime("%Y-%m-%d")
# thisyear = datetime.now().strftime("%Y")
# thismonth = datetime.now().strftime("%m")
# thisday = datetime.now().strftime("%d")
# start = date(year=2019,month=04,day=09)
todays_datetime = datetime(datetime.today().year, datetime.today().month, datetime.today().day)
print todays_datetime
landingpages_today_fb = session.query(LandingPagesFacebook).filter(LandingPagesFacebook.created_on >= todays_datetime).all()
landingpages_today_google = session.query(LandingPagesGoogle).filter(LandingPagesGoogle.created_on >= todays_datetime).all()
landingpages_today_simplifi = session.query(LandingPagesSimplifi).filter(LandingPagesSimplifi.created_on >= todays_datetime).all()
session.close()
#Mix 'em together!
landingpages_today = landingpages_today_fb + landingpages_today_google + landingpages_today_simplifi
#landingpages_today = landingpages_today_fb
#Do some iterating and formatting work
landingpages_today = [(u.ad_url_full, u.client_id) for u in landingpages_today]
#print landingpages_today
landingpages_today = list(set(landingpages_today))
#print 'Unique pages: '
#print landingpages_today
# unique_landingpages = [(u[0]) for u in landingpages_today]
# unique_landingpage_client = [(u[1]) for u in landingpages_today]
# print 'Pages----->', len(unique_landingpages)
class LandingPage004Spider(scrapy.Spider):
    name = 'LandingPage004Spider'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(LandingPage004Spider, cls).from_crawler(crawler, *args, **kwargs)
        #crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        #stats = spider.crawler.stats.get_stats()
        stats = spider.crawler.stats.get_value('item_scraped_count'),
        Session = sessionmaker(bind=engine)
        session = Session()
        logitem = LandingPagesScrapeLog(scrape_count = spider.crawler.stats.get_value('item_scraped_count'),
                                        is200 = spider.crawler.stats.get_value('downloader/response_status_count/200'),
                                        is400 = spider.crawler.stats.get_value('downloader/response_status_count/400'),
                                        is403 = spider.crawler.stats.get_value('downloader/response_status_count/403'),
                                        is404 = spider.crawler.stats.get_value('downloader/response_status_count/404'),
                                        is500 = spider.crawler.stats.get_value('downloader/response_status_count/500'),
                                        scrapy_errors = spider.crawler.stats.get_value('log_count/ERROR'),
                                        scrapy_criticals = spider.crawler.stats.get_value('log_count/CRITICAL'),
                                        )
        session.add(logitem)
        session.commit()
        session.close()

    #mapping = landingpages_today
    handle_httpstatus_list = [200, 302, 404, 400, 500]
    start_urls = []

    def start_requests(self):
        for url, client_id in self.mapping:
            yield Request(url, callback=self.parse, meta={'client_id': client_id})

    def parse(self, response):
        ##DEBUG - return all scraped data
        #wholepage = response.body.lower()
        url = response.url
        if 'redirect_urls' in response.request.meta:
            redirecturl = response.request.meta['redirect_urls'][0]
            if 'utm.pag.ca' in redirecturl:
                url_shortener = response.request.meta['redirect_urls'][0]
            else:
                url_shortener = 'None'
        else:
            url_shortener = 'None'
        client_id = response.meta['client_id']
        url_title = response.css("title::text").extract_first()
        # pagesize = len(response.xpath('//*[not(descendant-or-self::script)]'))
        pagesize = len(response.body)
        HTTP_code = response.status

        ####ERROR CHECK: Small page size
        if 'instapage' in response.body.lower():
            if pagesize <= 20000:
                err_small = 1
            else:
                err_small = 0
        else:
            if pagesize <= 35000:
                err_small = 1
            else:
                err_small = 0

        ####ERROR CHECK: Page contains the phrase 'not found'
        if 'not found' in response.xpath('//*[not(descendant-or-self::script)]').extract_first().lower():
            #their sites are full of HTML errors, making scrapy unable to notice what is and is not inside a script element
            if 'dealerinspire' in response.body.lower():
                err_has_not_found = 0
            else:
                err_has_not_found = 1
        else:
            err_has_not_found = 0

        ####ERROR CHECK: Page contains the phrase 'can't be found'
        if "can't be found" in response.xpath('//*[not(self::script)]').extract_first().lower():
            err_has_cantbefound = 1
        else:
            err_has_cantbefound = 0

        ####ERROR CHECK: Page contains the phrase 'unable to locate'
        if 'unable to locate' in response.body.lower():
            err_has_unabletolocate = 1
        else:
            err_has_unabletolocate = 0

        ####ERROR CHECK: Page contains phrase 'no longer available'
        if 'no longer available' in response.body.lower():
            err_has_nolongeravailable = 1
        else:
            err_has_nolongeravailable = 0

        ####ERROR CHECK: Page contains phrase 'no service specials'
        if 'no service specials' in response.body.lower():
            err_has_noservicespecials = 1
        else:
            err_has_noservicespecials = 0

        ####ERROR CHECK: Page contains phrase 'Sorry, no' to match zero inventory for a search, which normally says "Sorry, no items matching your request were found."
        if 'sorry, no ' in response.body.lower():
            err_has_sorryno = 1
        else:
            err_has_sorryno = 0

        yield {'client_id': client_id, 'url': url, 'url_shortener': url_shortener, 'url_title': url_title, "pagesize": pagesize, "HTTP_code": HTTP_code, "err_small": err_small, 'err_has_not_found': err_has_not_found, 'err_has_cantbefound': err_has_cantbefound, 'err_has_unabletolocate': err_has_unabletolocate, 'err_has_nolongeravailable': err_has_nolongeravailable, 'err_has_noservicespecials': err_has_noservicespecials, 'err_has_sorryno': err_has_sorryno}


#E-mail settings
def sendmail(recipients, subject, body):
    fromaddr = "#######"
    toaddr = recipients
    msg = MIMEMultipart()
    msg['From'] = fromaddr
    msg['Subject'] = subject
    body = body
    msg.attach(MIMEText(body, 'html'))
    server = smtplib.SMTP('########')
    server.starttls()
    server.login(fromaddr, "##########")
    text = msg.as_string()
    server.sendmail(fromaddr, recipients, text)
    server.quit()
The expected result is a perfect scrape, with no errors.
The actual result is unpredictable AttributeErrors claiming that the attribute 'css' can't be found on some pages. But if I scrape those pages individually, using the same script, they scrape just fine.
Sometimes Scrapy can't parse HTML because of markup errors; that's why you can't call response.css(). You can catch these events in your code and analyze the broken HTML:
def parse(self, response):
    try:
        ....
        your code
        .....
    except:
        with open("Error.htm", "w") as f:
            f.write(response.body)
UPDATE: You can also try checking for an empty response:
def parse(self, response):
    if not response.body:
        yield scrapy.Request(url=response.url, callback=self.parse, meta={'client_id': response.meta["client_id"]})
    # your original code
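One caveat with re-requesting the same URL (my addition, not something the answer states): Scrapy's duplicate-request filter will silently drop a request for a URL it has already seen, so the retry should pass dont_filter=True, roughly like this:

def parse(self, response):
    if not response.body:
        # dont_filter=True lets this retry bypass Scrapy's duplicate-request filter
        yield scrapy.Request(url=response.url, callback=self.parse,
                             meta={'client_id': response.meta["client_id"]},
                             dont_filter=True)
        return
    # your original code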