Add Proxy to Selenium & export dataframe to CSV - selenium

I'm trying to make a scraper for Capterra. I keep getting blocked, so I think I need a proxy for my driver.get calls. I'm also having trouble exporting a dataframe to a CSV. The first half of my code (not attached) gets all the links and stores them in a list that I then access with Selenium to pull the information I want; the second part below is where I'm having trouble.
For example, these are the kinds of links I am storing in the plinks list and that the driver is accessing:
https://www.capterra.com/p/212448/Blackbaud-Altru/
https://www.capterra.com/p/80509/Volgistics-Volunteer-Management/
https://www.capterra.com/p/179048/One-Earth/
for link in plinks:
    driver.get(link)
    #driver.implicitly_wait(20)
    companyProfile = bs(driver.page_source, 'html.parser')
    try:
        name = companyProfile.find("h1", class_="sm:nb-type-2xl nb-type-xl").text
    except AttributeError:
        name = "couldn't find"
    try:
        reviews = companyProfile.find("div", class_="nb-ml-3xs").text
    except AttributeError:
        reviews = "couldn't find"
    try:
        location = driver.find_element(By.XPATH, "//*[starts-with(., 'Located in')]").text
    except NoSuchElementException:
        location = "couldn't find"
    try:
        url = driver.find_element(By.XPATH, "//*[starts-with(., 'http')]").text
    except NoSuchElementException:
        url = "couldn't find"
    try:
        features = [x.get_text() for x in companyProfile.select('[id="LoadableProductFeaturesSection"] li span')]
    except AttributeError:
        features = "couldn't find"
    companyInfo.append([name, reviews, location, url, features])

companydf = pd.DataFrame(companyInfo, columns=["Name", "Reviews", "Location", "URL", "Features"])
companydf.to_csv('wmtest.csv', sep='\t')
driver.close()
I am using Firefox for the webdriver, and I am happy to change to Chrome if it works better, but is it possible to have the webdriver pick from a random set of proxies for each get request?
Thanks!
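A minimal sketch of one way to do this with Firefox: the proxy preferences are read when the browser starts, so start a fresh driver with a randomly chosen proxy for each link (or batch of links). The PROXIES list below is hypothetical; substitute proxies you actually control or have verified.

import random
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Hypothetical proxy pool -- replace with real host:port pairs.
PROXIES = ["203.0.113.10:8080", "203.0.113.11:3128"]

def make_driver(proxy):
    """Start Firefox with HTTP/HTTPS traffic routed through the given proxy."""
    host, port = proxy.split(":")
    options = Options()
    options.set_preference("network.proxy.type", 1)  # manual proxy configuration
    options.set_preference("network.proxy.http", host)
    options.set_preference("network.proxy.http_port", int(port))
    options.set_preference("network.proxy.ssl", host)
    options.set_preference("network.proxy.ssl_port", int(port))
    return webdriver.Firefox(options=options)

for link in plinks:
    driver = make_driver(random.choice(PROXIES))  # new proxy for each request
    try:
        driver.get(link)
        # ... parse the page as in the loop above ...
    finally:
        driver.quit()

Starting a new browser per link is slow, so batching several links per driver, or switching to a package such as selenium-wire that accepts proxy settings per driver session, may be a better trade-off.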

Related

invalid syntax on scraping with selenium and chromedriver

I've been using Selenium to scrape data from this site but I get this error:
matches = driver.find_element(By.XPATH, '//*[@id="mainContent"]/div[3]/div[1]/div[2]/section/div[1]/ul/li[1]')
SyntaxError: invalid syntax
So I would like to know if my code below is correct to scrape all the seasons on this site.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
website = "https://www.premierleague.com/results?co=1&se=363&cl=-1"
driver.get(website)
sleep(10)
element = driver.find_element(By.XPATH, '//*[@class="_24Il51SkQ29P1pCkJOUO-7"]/button').click()
sleep(10)
WebDriverWait(driver, 20).until(driver.find_element(By.ID, '//*[@id="advertClose"]').click()
matches = driver.find_element(By.XPATH, '//*[@id="mainContent"]/div[3]/div[1]/div[2]/section/div[1]/ul/li[1]')
matches = []
for match in matches:
    team = matches.find_element(By.XPATH, '//*[@id="mainContent"]/div[3]/div[1]/div[2]/section/div[1]/ul/li[1]/div/span').text
    scores = matches.find_element(By.XPATH, '//*[@id="mainContent"]/div[3]/div[1]/div[2]/section/div[1]/ul/li[1]/div/span/span[1]/span[2]').text
    print(match.text)
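There is more than one problem here: the SyntaxError almost certainly comes from the WebDriverWait line, which is missing a closing parenthesis (and passes an XPath expression to By.ID); matches is then overwritten with an empty list, and the loop calls find_element on the list rather than on each match element. Below is a hedged sketch of one way to restructure it with explicit waits; the li.match-fixture selector for the match cards is an assumption about the page's current markup.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.premierleague.com/results?co=1&se=363&cl=-1")
wait = WebDriverWait(driver, 20)

# Accept the cookie banner, then close the advert overlay.
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@class="_24Il51SkQ29P1pCkJOUO-7"]/button'))).click()
wait.until(EC.element_to_be_clickable((By.ID, "advertClose"))).click()

# Collect every match card instead of a single li[1], then read text from each one.
matches = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.match-fixture")))  # selector is an assumption
for match in matches:
    print(match.text)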

Webscraping customer review - Invalid selector error using XPath

I am trying to extract the user id, rating, and review from the following site using Selenium, and it is throwing an "invalid selector" error. I think the XPath I defined to get the review text is the reason for the error, but I am unable to resolve the issue. The site link is below:
teslamotor review
The code that I have used is the following:
# Class for review webscraping from the consumeraffairs.com site
class CarForumCrawler():
    def __init__(self, start_link):
        self.link_to_explore = start_link
        self.comments = pd.DataFrame(columns=['rating', 'user_id', 'comments'])
        self.driver = webdriver.Chrome(executable_path=r'C:/Users/mumid/Downloads/chromedriver/chromedriver.exe')
        self.driver.get(self.link_to_explore)
        self.driver.implicitly_wait(5)
        self.extract_data()
        self.save_data_to_file()

    def extract_data(self):
        ids = self.driver.find_elements_by_xpath("//*[contains(@id,'review-')]")
        comment_ids = []
        for i in ids:
            comment_ids.append(i.get_attribute('id'))
        for x in comment_ids:
            # Extract the rating for each user on a page
            user_rating = self.driver.find_elements_by_xpath('//*[@id="' + x + '"]/div[1]/div/img')[0]
            rating = user_rating.get_attribute('data-rating')
            # Extract the user id for each user on a page
            userid_element = self.driver.find_elements_by_xpath('//*[@id="' + x + '"]/div[2]/div[2]/strong')[0]
            userid = userid_element.get_attribute('itemprop')
            # Extract the message for each user on a page
            user_message = self.driver.find_elements_by_xpath('//*[@id="' + x + '"]/div[3]/p[2]/text()')[0]
            comment = user_message.text
            # Add rating, userid and comment for each user to the dataframe
            self.comments.loc[len(self.comments)] = [rating, userid, comment]

    def save_data_to_file(self):
        # Save the dataframe content to a CSV file
        self.comments.to_csv('Tesla_rating-6.csv', index=None, header=True)

    def close_spider(self):
        # End the session
        self.driver.quit()

try:
    url = 'https://www.consumeraffairs.com/automotive/tesla_motors.html'
    mycrawler = CarForumCrawler(url)
    mycrawler.close_spider()
except:
    raise
The error that I am getting is the invalid selector error mentioned above, and the XPath that I tried to trace comes from the review block of the page's HTML.
You are seeing the classic error of...
as find_elements_by_xpath('//*[@id="' + x + '"]/div[3]/p[2]/text()')[0] selects text nodes rather than elements; instead you need to pass an XPath expression that selects elements.
You need to change it to:
user_message = self.driver.find_elements_by_xpath('//*[@id="' + x + '"]/div[3]/p[2]')[0]
References
You can find a couple of relevant detailed discussions in:
invalid selector: The result of the xpath expression "//a[contains(@href, 'mailto')]/@href" is: [object Attr] getting the href attribute with Selenium
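As a side note, the find_elements_by_xpath helpers were deprecated in Selenium 4 and removed in later 4.x releases; with the current API the corrected line would look roughly like this (a sketch, assuming the div/p layout from the question is unchanged):

from selenium.webdriver.common.by import By

# Select the p element that holds the review body, then read its text.
user_message = self.driver.find_elements(By.XPATH, '//*[@id="' + x + '"]/div[3]/p[2]')[0]
comment = user_message.text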

How to get website to consistently return content from a GET request when it's inconsistent?

I posted a similar question earlier but I think this is a more refined question.
I'm trying to scrape: https://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=&EndDate=&PlayerMovementChkBx=yes&submit=Search&start=0
My code randomly throws errors when I send a GET request to the URL. After debugging, I saw the following happen. A GET request is sent for a URL like the following (an example; it could happen on any page): https://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=&EndDate=&PlayerMovementChkBx=yes&submit=Search&start=2400
The webpage will then say "There were no matching transactions found." However, if I refresh the page, the content loads. I'm using BeautifulSoup and Selenium and have put sleep statements in my code in the hope that it would help, but to no avail. Is this a problem on the website's end? It doesn't make sense to me how one GET request returns nothing but the exact same request returns something. Also, is there anything I could do to fix it, or is it out of my control?
Here is a sample of my code:
def scrapeWebsite(url, start, stop):
    driver = webdriver.Chrome(executable_path='/Users/Downloads/chromedriver')
    print(start, stop)
    madeDict = {"Date": [], "Team": [], "Name": [], "Relinquished": [], "Notes": []}

    #for i in range(0, 214025, 25):
    for i in range(start, stop, 25):
        print("Current Page: " + str(i))
        currUrl = url + str(i)
        #print(currUrl)
        #r = requests.get(currUrl)
        #soupPage = BeautifulSoup(r.content)
        driver.get(currUrl)
        #Sleep program for dynamic refreshing
        time.sleep(1)
        soupPage = BeautifulSoup(driver.page_source, 'html.parser')
        #page = urllib2.urlopen(currUrl)
        #time.sleep(2)
        #soupPage = BeautifulSoup(page, 'html.parser')
        info = soupPage.find("table", attrs={'class': 'datatable center'})
        time.sleep(1)
        extractedInfo = info.findAll("td")
The error occurs at the last line: findAll fails with an AttributeError because info is None when the GET request returns a page with no table.
I worked around this and scraped every page by retrying inside a try/except. The request loop is probably so fast that the page can't keep up. See the example below; it worked like a charm:
import requests
from bs4 import BeautifulSoup

URL = 'https://www.prosportstransactions.com/football/Search/SearchResults.php?Player=&Team=&BeginDate=&EndDate=' \
      '&PlayerMovementChkBx=yes&submit=Search&start=%s'

def scrape(start=0, stop=214525):
    for page in range(start, stop, 25):
        current_url = URL % page
        print('scrape: current %s' % page)
        while True:
            try:
                response = requests.request('GET', current_url)
                if response.ok:
                    soup = BeautifulSoup(response.content.decode('utf-8'), features='html.parser')
                    table = soup.find("table", attrs={'class': 'datatable center'})
                    trs = table.find_all('tr')
                    slice_pos = 1 if page > 0 else 0
                    for tr in trs[slice_pos:]:
                        yield tr.find_all('td')
                    break
            except Exception as exception:
                print(exception)

for columns in scrape():
    values = [column.text.strip() for column in columns]
    # Continue your code ...
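One caveat about the answer above: the bare while True with a broad except will retry a stubborn page forever, as fast as it can. A bounded retry with a short pause is gentler on the site; here is a minimal sketch (the three attempts and two-second delay are arbitrary choices):

import time
import requests
from bs4 import BeautifulSoup

def fetch_rows(current_url, attempts=3, delay=2):
    """Return the td cells of each row of the results table, retrying a few times before giving up."""
    for attempt in range(attempts):
        response = requests.get(current_url)
        soup = BeautifulSoup(response.content, features='html.parser')
        table = soup.find("table", attrs={'class': 'datatable center'})
        if table is not None:
            return [tr.find_all('td') for tr in table.find_all('tr')]
        time.sleep(delay)  # give the site a moment before retrying
    return []              # give up on this page instead of looping forever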

How to include try/except handling in a program that does thousands of downloads using selenium and requests?

I have a program that downloads photos from various websites. Each URL is formed by appending codes to the end of the address; the codes come from a dataframe of 8,583 rows.
The sites use JavaScript, so I use Selenium to get the src of each photo, and I download it with urllib.request.urlretrieve.
Example of a photo site: http://divulgacandcontas.tse.jus.br/divulga/#/candidato/2018/2022802018/PB/150000608817
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
import urllib.request, urllib.parse, urllib.error

# Root URL of the site that is accessed to fetch the photo link
url_raiz = 'http://divulgacandcontas.tse.jus.br/divulga/#/candidato/2018/2022802018/'

# Accesses the dataframe that has the "sequencial" type codes
candidatos = pd.read_excel('candidatos_2018.xlsx', sheet_name='Sheet1',
                           converters={'sequencial': lambda x: str(x), 'cpf': lambda x: str(x), 'numero_urna': lambda x: str(x)})

# Function that opens each page and takes the link from the photo
def pegalink(url):
    profile = webdriver.FirefoxProfile()
    browser = webdriver.Firefox(profile)
    browser.get(url)
    time.sleep(10)
    html = browser.page_source
    soup = BeautifulSoup(html, "html.parser")
    browser.close()
    link = soup.find("img", {"class": "img-thumbnail img-responsive dvg-cand-foto"})['src']
    return link

# Function that downloads the photo and saves it with the code name "cpf"
def baixa_foto(nome, url):
    urllib.request.urlretrieve(url, nome)

# Iteration in the dataframe
for num, row in candidatos.iterrows():
    cpf = (row['cpf']).strip()
    uf = (row['uf']).strip()
    print(cpf)
    print("-/-")
    sequencial = (row['sequencial']).strip()

    # Creates the full page address
    url = url_raiz + uf + '/' + sequencial
    link_foto = pegalink(url)
    baixa_foto(cpf, link_foto)
I am looking for guidance on how to:
Add a try/except that waits for the page to load (I'm getting errors reading the src; after many hits the site takes more than ten seconds to load).
Record all possible errors, in a file or dataframe, noting the "sequencial" code that failed, and keep the program running.
Would anyone know how to do this? The guidelines below were very useful, but I was unable to move forward.
I put part of the data I use and the program in a folder, if you want to look: https://drive.google.com/drive/folders/1lAnODBgC5ZUDINzGWMcvXKTzU7tVZXsj?usp=sharing
Put your code within:
try:
    WebDriverWait(browser, 30).until(lambda _driver: page_has_loaded())
    # here goes your code
except Exception:
    print("This is an unexpected condition!")
For the page_has_loaded helper:
def page_has_loaded():
    page_state = browser.execute_script('return document.readyState;')
    return page_state == 'complete'
The 30 above is the timeout in seconds; adjust it as needed.
Approach 2:
class wait_for_page_load(object):
    def __init__(self, browser):
        self.browser = browser

    def __enter__(self):
        self.old_page = self.browser.find_element_by_tag_name('html')

    def page_has_loaded(self):
        new_page = self.browser.find_element_by_tag_name('html')
        return new_page.id != self.old_page.id

    def __exit__(self, *_):
        WebDriverWait(self.browser, 30).until(lambda _driver: self.page_has_loaded())
def pegalink(url):
    profile = webdriver.FirefoxProfile()
    browser = webdriver.Firefox(profile)
    browser.get(url)
    try:
        with wait_for_page_load(browser):
            html = browser.page_source
            soup = BeautifulSoup(html, "html.parser")
            browser.close()
            link = soup.find("img", {"class": "img-thumbnail img-responsive dvg-cand-foto"})['src']
    except Exception:
        print("This is an unexpected condition!")
        print("Erro em: ", url)
        link = "Erro"
    return link
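The second part of the question, recording which "sequencial" codes failed, isn't covered above. One simple approach, sketched below under the assumption that the loop from the question is otherwise unchanged, is to wrap each iteration in try/except, collect the failures in a list, and write them to a CSV at the end (the erros list and the erros_2018.csv filename are just illustrative names):

erros = []  # one entry per candidate that failed

for num, row in candidatos.iterrows():
    cpf = row['cpf'].strip()
    uf = row['uf'].strip()
    sequencial = row['sequencial'].strip()
    url = url_raiz + uf + '/' + sequencial
    try:
        link_foto = pegalink(url)
        baixa_foto(cpf, link_foto)
    except Exception as exc:
        # Record the failing code and the error, then move on to the next row
        erros.append({'sequencial': sequencial, 'uf': uf, 'erro': str(exc)})

# Persist the failures so they can be retried later
pd.DataFrame(erros).to_csv('erros_2018.csv', index=False)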

How do I make python try the next URL in my file if the current one returns a 404?

I'm having a problem figuring out what code I need to make Python try the next URL in my CSV file. Each URL is on its own line, like this:
http://www.indexedamerica.com/states/PR/Adjuntas/Restaurants-Adjuntas-00601.html
http://www.indexedamerica.com/states/PR/Aguada/Restaurants-Aguada-00602.html
http://www.indexedamerica.com/states/PR/Aguadilla/Restaurants-Aguadilla-00603.html
http://www.indexedamerica.com/states/PR/Aguadilla/Restaurants-Aguadilla-00604.html
http://www.indexedamerica.com/states/PR/Aguadilla/Restaurants-Aguadilla-00605.html
http://www.indexedamerica.com/states/PR/Maricao/Restaurants-Maricao-00606.html
http://www.indexedamerica.com/states/MI/Kent/Restaurants-Grand-Rapids-49503.html
#open csv file
#read csv file line by line
#Pass each line to beautiful soup to try
#If URL raises a 404 error continue to next line
#extract tables from url
from mechanize import Browser
from BeautifulSoup import BeautifulSoup
import csv

mech = Browser()
indexed = open('C://python27/longlist.csv')
reader = csv.reader(indexed)
html = mech.open(reader)

for line in html:
    try:
        mechanize.open(html)
        table = soup.find("table", border=3)
    else:
        #!!!! try next url from file. How do I do this?

    for row in table.findAll('tr')[2:]:
        col = row.findAll('td')
        BusinessName = col[0].string
        Phone = col[1].string
        Address = col[2].string
        City = col[3].string
        State = col[4].string
        Zip = col[5].string
        Restaurantinfo = (BusinessName, Phone, Address, City, State)
        print "|".join(Restaurantinfo)
for line in html:
    try:
        mechanize.open(html)
        table = soup.find("table", border=3)
    except Exception:
        continue
Alternatively, you could check the status code of the page, and skip if you receive a 404 (in a for loop):
if urllib.urlopen(url).getcode() == 404:
    continue
continue stops execution of the rest of the loop body and moves on to the next entry in the loop.
Add all the URLs you want to search through to a list. Then loop through the list, opening each URL in sequence. If a given URL returns any kind of error, you can use continue to skip that URL and move on to the next one, as sketched below.
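A minimal sketch of that loop, written for Python 3 with requests and bs4 rather than the Python 2 mechanize code above (so the libraries, and a plain one-URL-per-line file, are assumptions):

import csv
import requests
from bs4 import BeautifulSoup

# Read every URL from the CSV into a list, one URL per line.
with open('longlist.csv') as f:
    urls = [row[0] for row in csv.reader(f) if row]

for url in urls:
    response = requests.get(url)
    if response.status_code == 404:
        continue                  # skip this URL and move on to the next one
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find("table", border=3)
    if table is None:
        continue                  # no table on this page, skip it too
    for row in table.find_all('tr')[2:]:
        cols = [td.get_text(strip=True) for td in row.find_all('td')]
        print("|".join(cols[:5]))  # BusinessName, Phone, Address, City, State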