How to get website to consistently return content from a GET request when it's inconsistent? - beautifulsoup

I posted a similar question earlier but I think this is a more refined question.
I'm trying to scrape: https://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=&EndDate=&PlayerMovementChkBx=yes&submit=Search&start=0
My code randomly throws errors when I send a GET request to the URL. After debugging, I saw the following happen. A GET request for the following url will be sent(Example URL, could happen on any page): https://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=&EndDate=&PlayerMovementChkBx=yes&submit=Search&start=2400
The webpage will then say "There were no matching transactions found.". However, if I refresh the page, the content will then be loaded. I'm using BeautifulSoup and Selenium and have put sleep statements in my code in hopes that it'll work but to no avail. Is this a problem on the website's end? It doesn't make sense to me how one GET request will return nothing but the exact same request will return something. Also, is there anything I could to fix it or is it out of control?
Here is a sample of my code:
t
def scrapeWebsite(url, start, stop):
driver = webdriver.Chrome(executable_path='/Users/Downloads/chromedriver')
print(start, stop)
madeDict = {"Date": [], "Team": [], "Name": [], "Relinquished": [], "Notes": []}
#for i in range(0, 214025, 25):
for i in range(start, stop, 25):
print("Current Page: " + str(i))
currUrl = url + str(i)
#print(currUrl)
#r = requests.get(currUrl)
#soupPage = BeautifulSoup(r.content)
driver.get(currUrl)
#Sleep program for dynamic refreshing
time.sleep(1)
soupPage = BeautifulSoup(driver.page_source, 'html.parser')
#page = urllib2.urlopen(currUrl)
#time.sleep(2)
#soupPage = BeautifulSoup(page, 'html.parser')
info = soupPage.find("table", attrs={'class': 'datatable center'})
time.sleep(1)
extractedInfo = info.findAll("td")
The error occurs at the last line. "findAll" complains because it can't find findAll when the content is null(meaning the GET request returned nothing)

I did some workaround to scrape all the page using try except.
Probably the requests loop it is so fast and the page can't support it.
See the example below, worked like a charm:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=&EndDate=' \
'&PlayerMovementChkBx=yes&submit=Search&start=%s'
def scrape(start=0, stop=214525):
for page in range(start, stop, 25):
current_url = URL % page
print('scrape: current %s' % page)
while True:
try:
response = requests.request('GET', current_url)
if response.ok:
soup = BeautifulSoup(response.content.decode('utf-8'), features='html.parser')
table = soup.find("table", attrs={'class': 'datatable center'})
trs = table.find_all('tr')
slice_pos = 1 if page > 0 else 0
for tr in trs[slice_pos:]:
yield tr.find_all('td')
break
except Exception as exception:
print(exception)
for columns in scrape():
values = [column.text.strip() for column in columns]
# Continuous your code ...

Related

Why does my web scraping function not export the data?

I am currently web scraping a few pages inside a list. I have the following code provided.
pages = {
"https://shop.supervalu.ie/shopping/wine-beer-spirits-germany/c-150410100",
"https://shop.supervalu.ie/shopping/wine-beer-spirits-small-bottles/c-150410110",
"https://shop.supervalu.ie/shopping/wine-beer-spirits-lager/c-150302375", #More than one page
"https://shop.supervalu.ie/shopping/wine-beer-spirits-stout/c-150302380",
"https://shop.supervalu.ie/shopping/wine-beer-spirits-ale/c-150302385",
"https://shop.supervalu.ie/shopping/wine-beer-spirits-lager/c-150302386",
"https://shop.supervalu.ie/shopping/wine-beer-spirits-stout/c-150302387",
"https://shop.supervalu.ie/shopping/wine-beer-spirits-ale/c-150302388", #More than one page
"https://shop.supervalu.ie/shopping/wine-beer-spirits-cider/c-150302389",
"https://shop.supervalu.ie/shopping/wine-beer-spirits-cider/c-150302390",
"https://shop.supervalu.ie/shopping/wine-beer-spirits-alcopops/c-150302395",
"https://shop.supervalu.ie/shopping/wine-beer-spirits-vodka/c-150302430",
"https://shop.supervalu.ie/shopping/wine-beer-spirits-irish-whiskey/c-150302435", #More than one page
}
products = []
prices = []
images = []
urls = []
def export_data():
logging.info("exporting data to pandas dataframe")
supervalu = pd.DataFrame({
'img_url' : images,
'url' : urls,
'product' : products,
'price' : prices
})
logging.info("sorting data by price")
supervalu.sort_values(by=['price'], inplace=True)
output_json = 'supervalu.json'
output_csv = 'supervalu.csv'
output_dir = Path('../../json/supervalu')
output_dir.mkdir(parents=True, exist_ok=True)
logging.info("exporting data to json")
supervalu.to_json(output_dir / output_json)
logging.info("exporting data to csv")
supervalu.to_csv(output_dir / output_csv)
def get_data(div):
raw_data = div.find_all('div', class_='ga-product')
raw_images = div.find_all('img')
raw_url = div.find_all('a', class_="ga-product-link")
product_data = [data['data-product'] for data in raw_data]
new_data = [d.replace("\r\n","") for d in product_data]
for name in new_data:
new_names = re.search(' "name": "(.+?)"', name).group(1)
products.append(new_names)
for price in new_data:
new_prices = re.search(' "price": ''"(.+?)"', price).group(1)
prices.append(new_prices)
for image in raw_images:
new_images = image['data-src']
images.append(new_images)
for url in raw_url:
new_url = url['href']
urls.append(new_url)
def scrape_page(next_url):
page = requests.get(next_url)
if page.status_code != 200:
logging.error("Page does not exist!")
exit()
soup = BeautifulSoup(page.content, 'html.parser')
get_data(soup.find(class_="row product-list ga-impression-group"))
try:
load_more_text = soup.find('a', class_='pill ajax-link load-more').findAll('span')[-1].text
if load_more_text == 'Load more':
next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
logging.info("Scraping next page: {}".format(next_page))
scrape_page(next_page)
else:
export_data()
except:
logging.warning("No more next pages to scrape")
pass
for page in pages:
logging.info("Scraping page: {}".format(page))
scrape_page(page)
The main issue that appears is during the try exception handling of the next page. As not all of the pages provided have the the appropriate snippet, a ValueAttribute error will araise hence I have the aforementioned statement closed off in a try exception case. I want to skip the pages that don't have next page and scrape them regardless and continue looping the rest of the pages until a next page arises. All of the pages appear to be looped through but I never get the data exported. If I try the following code:
try:
load_more_text = soup.find('a', class_='pill ajax-link load-more').findAll('span')[-1].text
if load_more_text == 'Load more':
next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
logging.info("Scraping next page: {}".format(next_page))
scrape_page(next_page)
except:
logging.warning("No more next pages to scrape")
pass
else:
export_data()
This would be the closest that I have gotten to the desired outcome. The above code works and the data gets exported but not all of the pages get exported because as a result - a new dataframe is created for every time a new next page appears and ends i.e. - code iterarets through the list, finds a next page, next page 'pages' get scraped and a new dataframe is created and deletes the previous data.
I'm hoping that someone would give me some guidance on what to do as I have been stuck on this part of my personal project and I'm not so sure on how I am supposed to overcome this obstacle. Thank you in advance.
I have modified my code as shown below and I have received my desired outcome.
load_more_text = soup.find('a', class_='pill ajax-link load-more')
if load_more_text:
next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
logging.info("Scraping next page: {}".format(next_page))
scrape_page(next_page)
else:
export_data()

How to use the yield function to scrape data from multiple pages

I'm trying to scrape data from amazon India website. I am not able collect response and parse the elements using the yield() method when:
1) I have to move from product page to review page
2) I have to move from one review page to another review page
Product page
Review page
Code flow:
1) customerReviewData() calls the getCustomerRatingsAndComments(response)
2) The getCustomerRatingsAndComments(response)
finds the URL of the review page and call the yield request method with getCrrFromReviewPage(request) as callback method, with url of this review page
3) getCrrFromReviewPage() gets new response of the firstreview page and scrape all the elements from the first review page (page loaded) and add it to customerReviewDataList[]
4) get URL of the next page if it exists and recursively call getCrrFromReviewPage() method, and crawl elements from next page, until all the review page is crawled
5) All the reviews gets added to the customerReviewDataList[]
I have tried playing around with yield() changing the parameters and also looked up the scrapy documentation for yield() and Request/Response yield
# -*- coding: utf-8 -*-
import scrapy
import logging
customerReviewDataList = []
customerReviewData = {}
#Get product name in <H1>
def getProductTitleH1(response):
titleH1 = response.xpath('normalize-space(//*[#id="productTitle"]/text())').extract()
return titleH1
def getCustomerRatingsAndComments(response):
#Fetches the relative url
reviewRelativePageUrl = response.css('#reviews-medley-footer a::attr(href)').extract()[0]
if reviewRelativePageUrl:
#get absolute URL
reviewPageAbsoluteUrl = response.urljoin(reviewRelativePageUrl)
yield Request(url = reviewPageAbsoluteUrl, callback = getCrrFromReviewPage())
self.log("yield request complete")
return len(customerReviewDataList)
def getCrrFromReviewPage():
userReviewsAndRatings = response.xpath('//div[#id="cm_cr-review_list"]/div[#data-hook="review"]')
for userReviewAndRating in userReviewsAndRatings:
customerReviewData[reviewTitle] = response.css('#cm_cr-review_list .review-title span ::text').extract()
customerReviewData[reviewDescription] = response.css('#cm_cr-review_list .review-text span::text').extract()
customerReviewDataList.append(customerReviewData)
reviewNextPageRelativeUrl = response.css('#cm_cr-pagination_bar .a-pagination .a-last a::attr(href)')[0].extract()
if reviewNextPageRelativeUrl:
reviewNextPageAbsoluteUrl = response.urljoin(reviewNextPageRelativeUrl)
yield Request(url = reviewNextPageAbsoluteUrl, callback = getCrrFromReviewPage())
class UsAmazonSpider(scrapy.Spider):
name = 'Test_Crawler'
allowed_domains = ['amazon.in']
start_urls = ['https://www.amazon.in/Philips-Trimmer-Cordless-Corded-QT4011/dp/B00JJIDBIC/ref=sr_1_3?keywords=philips&qid=1554266853&s=gateway&sr=8-3']
def parse(self, response):
titleH1 = getProductTitleH1(response),
customerReviewData = getCustomerRatingsAndComments(response)
yield{
'Title_H1' : titleH1,
'customer_Review_Data' : customerReviewData
}
I'm getting the following response:
{'Title_H1': (['Philips Beard Trimmer Cordless and Corded for Men QT4011/15'],), 'customer_Review_Data': <generator object getCustomerRatingsAndComments at 0x048AC630>}
The "Customer_review_Data" should be a list of dict of title and review
I am not able to figure out as to what mistake I am doing here.
When I use the log() or print() to see what data is captured in customerReviewDataList[], unable to see the data in the console either.
I am able to scrape all the reviews in customerReviewDataList[], if they are present in the product page,
In this scenario where I have to use the yield function I am getting the output stated above like this [https://ibb.co/kq8w6cf]
This is the kind of output I am looking for:
{'customerReviewTitle': ['Difficult to find a charger adapter'],'customerReviewComment': ['I already have a phillips trimmer which was only cordless. ], 'customerReviewTitle': ['Good Product'],'customerReviewComment': ['Solves my need perfectly HK']}]}
Any help is appreciated. Thanks in advance.
You should complete the Scrapy tutorial. The Following links section should be specially helpful to you.
This is a simplified version of your code:
def data_request_iterator():
yield Request('https://example.org')
class MySpider(Spider):
name = 'myspider'
start_urls = ['https://example.com']
def parse(self, response):
yield {
'title': response.css('title::text').get(),
'data': data_request_iterator(),
}
Instead, it should look like this:
class MySpider(Spider):
name = 'myspider'
start_urls = ['https://example.com']
def parse(self, response):
item = {
'title': response.css('title::text').get(),
}
yield Request('https://example.org', meta={'item': item}, callback=self.parse_data)
def parse_data(self, response):
item = response.meta['item']
# TODO: Extend item with data from this second response as needed.
yield item

Update response.body in scrapy(without reload)

I use scrapy and selenium for crawl! my site use ajax for pagination! actully , url no change and so response.body also no change! I want to click with selenium (for pagination) and get self.driver.page_source and use it instead response.body!
So i writed this code :
res = scrapy.http.TextResponse(url=self.driver.current_url, body=self.driver.page_source,
encoding='utf-8')
print(str(res)) //nothing to print!
for quote in res.css("#ctl00_ContentPlaceHolder1_Grd_Dr_DXMainTable > tr.dxgvDataRow_Office2003Blue"):
i = i+1
item = dict()
item['id'] = int(quote.css("td.dxgv:nth-child(1)::text").extract_first())
And no error !
You can replace body of original response in scrapy by using response.replace() method:
def parse(self, response):
response = response.replace(body=driver.page_source)

Web page is showing weird unicode(?) letters: \u200e

How can I remove that? I Tried so many things and I am exhausted of trying to defeat this error by myself. I spent the last 3 hours looking at this and trying to get through it and I surrender to this code. Please help.
The first "for" statement grabs article titles from news.google.com
The second "for" statement grabs the time of submisssion from that article on news.google.com.
This is on django btw and this page shows the list of article titles and their time of submission in a list, going down. The weird unicode letters are popping up from the second "for" statement which is the time submissions. Here is my views.py:
def articles(request):
""" Grabs the most recent articles from the main news page """
import bs4, requests
list = []
list2 = []
url = 'https://news.google.com/'
r = requests.get(url)
try:
r.raise_for_status() == True
except ValueError:
print('Something went wrong.')
soup = bs4.BeautifulSoup(r.text, 'html.parser')
for (listarticles) in soup.find_all('h2', 'esc-lead-article-title'):
if listarticles is not None:
a = listarticles.text
list.append(a)
for articles_times in soup.find_all('span','al-attribution-timestamp'):
if articles_times is not None:
b = articles_times.text
list2.append(b)
list = zip(list,list2)
context = {'list':list}
return render(request, 'newz/articles.html', context)

Scrapy FormRequest return 400 error code

I am trying to scrapy following website in which the pagination is though AJAX request.
http://studiegids.uva.nl/xmlpages/page/2014-2015/zoek-vak
I am sending FormRequest to access the different pages, however I am getting following error.
Retrying http://studiegids.uva.nl/xmlpages/plspub/uva_search.courses_pls> (failed 1 times): 400 Bad Request
Not able to understand what is wrong? Following is the code.
class Spider(BaseSpider):
name = "zoek"
allowed_domains = ["studiegids.uva.nl"]
start_urls = ["http://studiegids.uva.nl/xmlpages/page/2014-2015/zoek-vak"]
def parse(self, response):
base_url = "http://studiegids.uva.nl/xmlpages/page/2014-2015/zoek-vak"
for i in range(1, 10):
data = {'p_fetch_size': unicode(20),
'p_page:': unicode(i),
'p_searchpagetype': u'courses',
'p_site_lang': u'nl',
'p_strip': u'/2014-2015',
'p_ctxparam': u'/xmlpages/page/2014-2015/',
'p_rsrcpath':u'/xmlpages/resources/TXP/studiegidswebsite/'}
yield FormRequest.from_response(response,
formdata=data,
callback=self.fetch_details,
dont_click=True)
# yield FormRequest(base_url,
# formdata=data,
# callback=self.fetch_details)
def fetch_details(self, response):
# print response.body
hxs = HtmlXPathSelector(response)
item = ZoekItem()
Studiegidsnummer = hxs.select("//div[#class=item-info']//tr[1]/td[2]/p/text()")
Studielast = hxs.select("//div[#class=item-info']//tr[2]/td[2]/p/text()")
Voertaal = hxs.select("//div[#class=item-info']//tr[3]/td[2]/p/text()")
Ingangseis = hxs.select("//div[#class=item-info']//tr[4]/td[2]/p/text()")
Studiejaar = hxs.select("//div[#class=item-info']//tr[5]/td[2]/p/text()")
Onderwijsinstituut = hxs.select("//div[#class=item-info']//tr[6]/td[2]/p/text()")
for i in range(20):
item['Studiegidsnummer'] = Studiegidsnummer
item['Studielast'] = Studielast
item['Voertaal'] = Voertaal
yield item
Try also check headers using firebug.
400 Bad Request usually means that your request does not fully match the expected request format. Common causes include missing or invalid cookies, headers or parameters.
On your web browser, open the Network tab of the Developer Tools and trigger the request. When you see the request in the Network tab, inspect it fully (parameters, headers, etc.). Try to match such a request in your code.