How do I extract data from Yelp using Selenium Python - selenium

I am new to Python! I want to extract data from Yelp:
https://www.yelp.com/search?find_desc=nails+salons&find_loc=San+Francisco%2C+CA&ns=1
Then, clicking on each name on the 1st page, e.g.
https://www.yelp.com/biz/joy-joy-nail-and-spa-san-francisco?osq=nails+salons
it should extract:
Name
Address
Website
Contact No
Rating (review count) in numbers
and then it should continue doing so for the full page.
Example output
Joy Joy Nail & Spa
4023 24th St San Francisco, CA 94114
joyjoynailspa.com
(415) 655-3216
6 Reviews
Sunset Nails
1810 Irving St
San Francisco, CA 94122
(415) 566-9888
1185 reviews
If any of the elements is not present, such as the website, it should skip that info and continue.

So, basically you have to go to the page, use find_elements to see how many items there are to scrape, then select the first one, scrape the desired elements, go back to the previous page, and do the same for the other listings.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time

driver = webdriver.Chrome(driver_path)  # driver_path: path to your chromedriver binary
driver.maximize_window()
driver.implicitly_wait(50)
driver.get("https://www.yelp.com/search?find_desc=nails+salons&find_loc=San+Francisco%2C+CA&ns=1")
wait = WebDriverWait(driver, 20)
# Count how many business links there are on the results page
lnght = len(driver.find_elements(By.XPATH, "//div[contains(@class,'businessName')]/descendant::a"))
j = 0
for item in range(lnght):
    elements = driver.find_elements(By.XPATH, "//div[contains(@class,'arrange-unit') and contains(@class,'arrange-unit-fill')]//ancestor::div[contains(@class,'container') and contains(@class,'hover')]")
    time.sleep(1)
    #driver.execute_script("arguments[0].scrollIntoView(true);", elements[j])
    eles = driver.find_elements(By.XPATH, "//h4/descendant::a")
    # Open the j-th business page
    ActionChains(driver).move_to_element(eles[j]).click().perform()
    #elements[j].click()
    time.sleep(2)
    # Scrape name, website, phone, address and review count
    print(wait.until(EC.visibility_of_element_located((By.XPATH, "//div[contains(@class,'headingLight')]//h1"))).text)
    print(wait.until(EC.visibility_of_element_located((By.XPATH, "//p[text()='Business website']/following-sibling::p/a"))).text)
    print(wait.until(EC.visibility_of_element_located((By.XPATH, "//p[text()='Phone number']/following-sibling::p"))).text)
    print(wait.until(EC.visibility_of_element_located((By.XPATH, "//a[text()='Get Directions']/../following-sibling::p"))).text)
    print(wait.until(EC.visibility_of_element_located((By.XPATH, "//span[contains(text(),'reviews')]"))).text)
    # Go back to the search results and move on to the next listing
    driver.execute_script("window.history.go(-1)")
    time.sleep(2)
    j = j + 1
Update 1:
Whichever line is causing the issue, wrap it like this:
try:
    print(wait.until(EC.visibility_of_element_located((By.XPATH, "//p[text()='Business website']/following-sibling::p/a"))).text)
except:
    pass
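To avoid repeating that try/except for every field, you could wrap it in a small helper that returns a placeholder when an element is missing — a minimal sketch, assuming the wait and EC objects from the code above (the safe_text name is mine, not from the original answer):
from selenium.common.exceptions import TimeoutException

def safe_text(xpath, default="N/A"):
    # Return the element's text, or the default if it never becomes visible
    try:
        return wait.until(EC.visibility_of_element_located((By.XPATH, xpath))).text
    except TimeoutException:
        return default

website = safe_text("//p[text()='Business website']/following-sibling::p/a")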

Related

The output from my Selenium script is blank, how do I fix it?

This is my first time using Selenium for web scraping, and I'm fairly new to Python. I have tried to scrape a Swedish housing site to extract price, address, area, size, etc. for every listing from a specific URL that shows all houses for sale in an area called "Lidingö".
I managed to bypass the pop-up window for accepting cookies.
However, the output I get in the terminal is blank when the script runs. I get nothing: not an error, not any output.
What could possibly be wrong?
The code is:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

s = Service("/Users/brustabl1/hemnet/chromedriver")
url = "https://www.hemnet.se/bostader?location_ids%5B%5D=17846&item_types%5B%5D=villa"
driver = webdriver.Chrome(service=s)
driver.maximize_window()
driver.implicitly_wait(10)
driver.get(url)
# The cookie button clicker
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "/html/body/div[62]/div/div/div/div/div/div[2]/div[2]/div[2]/button"))).click()
lists = driver.find_elements(By.XPATH, '//*[@id="result"]/ul[1]/li[1]/a/div[2]')
for list in lists:
    adress = list.find_element(By.XPATH, '//*[@id="result"]/ul[1]/li[2]/a/div[2]/div/div[1]/div[1]/h2')
    area = list.find_element(By.XPATH, '//*[@id="result"]/ul[1]/li[1]/a/div[2]/div/div[1]/div[1]/div/span[2]')
    price = list.find_element(By.XPATH, '//*[@id="result"]/ul[1]/li[1]/a/div[2]/div/div[2]/div[1]/div[1]')
    rooms = list.find_element(By.XPATH, '//*[@id="result"]/ul[1]/li[1]/a/div[2]/div/div[2]/div[1]/div[3]')
    size = list.find_element(By.XPATH, '//*[@id="result"]/ul[1]/li[1]/a/div[2]/div/div[2]/div[1]/div[2]')
    print(adress.text)
There are a lot of flaws in your code...
lists = driver.find_elements(By.XPATH, '//*[@id="result"]/ul[1]/li[1]/a/div[2]')
in your code this returns a list of elements in the variable lists
for list in lists:
    adress = list.find_element(By.XPATH, '//*[@id="result"]/ul[1]/li[2]/a/div[2]/div/div[1]/div[1]/h2')
    area = list.find_element(By.XPATH, '//*[@id="result"]/ul[1]/li[1]/a/div[2]/div/div[1]/div[1]/div/span[2]')
    price = list.find_element(By.XPATH, '//*[@id="result"]/ul[1]/li[1]/a/div[2]/div/div[2]/div[1]/div[1]')
    rooms = list.find_element(By.XPATH, '//*[@id="result"]/ul[1]/li[1]/a/div[2]/div/div[2]/div[1]/div[3]')
    size = list.find_element(By.XPATH, '//*[@id="result"]/ul[1]/li[1]/a/div[2]/div/div[2]/div[1]/div[2]')
    print(adress.text)
You are not storing the value of each address in a list; instead, you are overwriting it on every iteration. Also, an absolute XPath points at one exact element, so your loop is selecting the same element over and over again!
And scraping text through Selenium is bad practice; use BeautifulSoup instead.
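A minimal sketch of the fix, assuming the @id="result" structure from the question; the relative .//h2 path is illustrative and would need to match Hemnet's actual markup:
# Grab every listing card, not just li[1]
cards = driver.find_elements(By.XPATH, '//*[@id="result"]/ul[1]/li/a/div[2]')
addresses = []
for card in cards:
    # A relative XPath (starting with .) searches inside this card only
    addresses.append(card.find_element(By.XPATH, './/h2').text)
print(addresses)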

Selenium/BeautifulSoup - WebScrape this field

My code runs fine and prints the title for all rows except the rows with dropdowns.
For example, row 4 has a dropdown if clicked. I implemented a try which would, in theory, click the dropdown and then pull the titles.
But when I execute click() and try to print, the rows with these dropdowns are not printing.
Expected output: print all titles, including the ones in the dropdowns.
A user has submitted an answer at this link StackOverFlowAnswer, but the format of his answer was different and I do not know how to add fields such as date, time, chairs, or the field at the top which says "On demand" with his approach.
Any approach would be appreciated; I would like to put the results into a dataframe. Thanks
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
import time

driver = webdriver.Chrome()
actions = ActionChains(driver)
driver.get('https://cslide.ctimeetingtech.com/esmo2021/attendee/confcal/session/list')
time.sleep(4)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
new_titles = set()
productlist = driver.find_elements_by_xpath("//div[@class='card item-container session']")
for property in productlist:
    actions.move_to_element_with_offset(property, 0, 0).perform()
    time.sleep(4.5)
    sessiontitle = property.find_element_by_xpath(".//h4[@class='session-title card-title']").text
    #print(sessiontitle)
    ifDropdown = property.find_elements_by_xpath(".//*[@class='item-expand-action expand']")
    if ifDropdown:
        ifDropdown[0].click()
        time.sleep(4)
    open_titles = driver.find_elements_by_class_name('card-title')
    for open_title in open_titles:
        title = open_title.text
        if title not in new_titles:
            print(title)
            new_titles.add(title)
Your problem is with the driver.find_elements_by_class_name('item-expand-action expand') command. The find_elements_by_class_name('item-expand-action expand') locator is wrong: those web elements have multiple class names, and find_elements_by_class_name accepts only a single class name. To locate these elements you can use a CSS selector or XPath instead.
Also, since there are several elements with dropdowns, to perform clicks on them you should iterate over them. You cannot perform .click() on a list of web elements.
So your code should be like this:
ifDropdown = driver.find_elements_by_css_selector('.item-expand-action.expand')
for drop_down in ifDropdown:
    drop_down.click()
    time.sleep(0.5)
Alternatively to the CSS selector above you can use XPath as well:
ifDropdown = driver.find_elements_by_xpath('//a[@class="item-expand-action expand"]')
UPD
If you wish to print the added, new titles you can do this:
ifDropdown = driver.find_elements_by_css_selector('.item-expand-action.expand')
for drop_down in ifDropdown:
    drop_down.click()
    time.sleep(0.5)
newTitles = driver.find_elements_by_class_name('card-title')
for new_title in newTitles:
    print(new_title.text)
Here, after expanding all the dropdown elements, I'm getting all the new titles and then iterating over that list, printing each element's text.
driver.find_elements_by_class_name returns a list of web elements. You cannot apply .text to a list; you have to iterate over the list elements, getting each single element's text in turn.
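For instance, the same extraction written as a list comprehension (my phrasing, same idea):
# Collect the text of every matched element into a plain list of strings
titles = [el.text for el in driver.find_elements_by_class_name('card-title')]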
UPD2
The entire code opening dropdowns and printing their inner titles can be like this:
I'm doing this with Selenium, not mixing with bs4.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
import time

driver = webdriver.Chrome()
actions = ActionChains(driver)
driver.get('https://cslide.ctimeetingtech.com/esmo2021/attendee/confcal/session/list')
time.sleep(4)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
new_titles = set()
productlist = driver.find_elements_by_xpath("//div[@class='card item-container session']")
for property in productlist:
    actions.move_to_element(property).perform()
    time.sleep(0.5)
    sessiontitle = property.find_element_by_xpath(".//h4[@class='session-title card-title']").text
    print(sessiontitle)
    ifDropdown = property.find_elements_by_xpath(".//*[@class='item-expand-action expand']")
    if ifDropdown:
        ifDropdown[0].click()
        time.sleep(4)
    open_titles = driver.find_elements_by_class_name('card-title')
    for open_title in open_titles:
        title = open_title.text
        if title not in new_titles:
            print(title)
            new_titles.add(title)
Here I am checking if there is a dropdown. If there is, I open it. Then I get all the currently opened titles. For each such title I check whether it is new or was opened previously. If the title is new, not already in the set, I print it and add it to the set.
To get all the data, including the date, time, chairs, you can use only requests/BeautifulSoup. There's no need for Selenium.
import requests
import pandas as pd
from bs4 import BeautifulSoup

data = []
url = "https://cslide.ctimeetingtech.com/esmo2021/attendee/confcal/session/list?p={}"
for page in range(1, 5):  # <-- Increase number of pages here
    with requests.Session() as session:
        soup = BeautifulSoup(session.get(url.format(page)).content, "html.parser")
        for card in soup.select("div.card-block"):
            title = card.find(class_="session-title card-title").get_text()
            date = card.select_one(".internal_date div.property").get_text(strip=True)
            time = card.select_one(".internal_time div.property").get_text()
            try:
                chairs = card.select_one(".persons").get_text(strip=True)
            except AttributeError:
                chairs = "N/A"
            data.append({"title": title, "date": date, "time": time, "chairs": chairs})
df = pd.DataFrame(data)
print(df.to_string())
Output (truncated):
title date time chairs
0 Educational sessions on-demand Thu, 16.09.2021 08:30 - 09:40 N/A
1 Special Symposia on-demand Thu, 16.09.2021 12:30 - 13:40 N/A
2 Multidisciplinary sessions on-demand Thu, 16.09.2021 16:30 - 17:40 N/A
3 MSD - Homologous Recombination Deficiency: BRCA and beyond Fri, 17.09.2021 08:45 - 09:55 Frederique Penault-Llorca(Clermont-Ferrand, France)
4 Servier - The clinical value of IDH inhibition in cholangiocarcinoma Fri, 17.09.2021 08:45 - 10:15 Arndt Vogel(Hannover, Germany)Angela Lamarca(Manchester, United Kingdom)
5 AstraZeneca - Redefining Breast Cancer – Biology to Therapy Fri, 17.09.2021 08:45 - 10:15 Ian Krop(Boston, United States of America)

How to scrape all the data from this website link using Selenium and save the extracted city, location and contact number as a CSV dataframe object?

The website url to scrape data
http://jawedhabib.co.in/hairandbeautysalons-sl/
Code:
lst = driver.find_element_by_css_selector(".post-17954.page.type-page.status-publish.hentry").text
for i in lst:
    driver.implicitly_wait(2)
    city = driver.find_element_by_css_selector("tr").text
    salon_address = driver.find_element_by_css_selector("tr").text
    Contact_number = driver.find_element_by_css_selector("tr").text
print(lst)
Here's the first part of your solution. Starting from the top, you need to wait for all elements to load onto the screen. Grab all table trs beyond the first 2 trs, which are reserved for the heading. From each tr, XPath to its children using ./ and grab the td[1] through td[3] text using get_attribute('textContent') respectively.
wait = WebDriverWait(driver, 60)
driver.get("http://jawedhabib.co.in/hairandbeautysalons-sl/")
#driver.maximize_window()
tableValues = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//tbody//tr[position()>2]")))
city = []
address = []
contactno = []
for tr in tableValues:
    #print(tr.get_attribute('textContent'))
    city.append(tr.find_element_by_xpath("./td[1]").get_attribute('textContent'))
    address.append(tr.find_element_by_xpath("./td[2]").get_attribute('textContent'))
    contactno.append(tr.find_element_by_xpath("./td[3]").get_attribute('textContent'))
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
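The question also asks for a CSV dataframe; one way to finish, assuming the three lists above are filled (the pandas step and the file name are my addition, not part of the original answer):
import pandas as pd

# Assemble the three parallel lists into a dataframe and write it out
df = pd.DataFrame({"city": city, "address": address, "contact_no": contactno})
df.to_csv("salons.csv", index=False)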

How to extract couple of tables from site using selenium

Greetings all,
I am trying to extract tables from this site https://theunderminejournal.com/#eu/silvermoon/category/battlepets but I am having some difficulties with that. My code, and whatever else I tried, failed to bring up any result:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time

def getbrowser():
    options = Options()
    options.add_argument("--disable-extensions")
    #options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    return driver

def scrape():  # create scrape engine from scratch
    driver = getbrowser()
    start = time.time()
    site1 = "https://theunderminejournal.com/#eu/silvermoon/category/battlepets"
    driver.get(site1)
    time.sleep(10)
    tbody = driver.find_element_by_tag_name("table")
    #cell = tbody.find_elements_by_tag_name("tr").text
    for tr in tbody:
        td = tbody.find_elements_by_tag_name("tr")
        print(td)
    driver.close()

scrape()
My goal is to extract the name and the first price of each pet (from all the tables) and create a table with these two values.
Generally, I am building a scrape bot that will compare the prices from two servers...
I know that my scraping skills are too low; can you please point me to something I could read or watch to improve?
Thanks again for your time.
Get all the names and prices in 2 lists, and use their values in order; just replace the print commands with whatever you want:
names = driver.find_elements_by_css_selector("[class='name'] a")
prices = driver.find_elements_by_css_selector(":nth-child(4)[class='price'] span")
i = 0
for x in names:
    print(x.text)
    print(prices[i].text)
    i += 1
Hope it helps.
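Since the two lists are parallel, zip pairs them a little more idiomatically; a sketch of building the two-column table the question describes (the dict keys are my own naming, not from the original answer):
# Pair each name element with its first-price element and collect rows
rows = [{"name": n.text, "price": p.text} for n, p in zip(names, prices)]
for row in rows:
    print(row["name"], row["price"])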

Scrolling through pages with Python Selenium

I have written a Python script that aims to take data off a website, but I am unable to navigate and loop through pages to collect the links. The website is https://www.shearman.com/people. The relevant HTML on the site looks like this:
ul class="results-pagination"
li class/a href onclick="PageRequest('2', event)"
When I run the query below, it says that the element is not attached to the page:
try:
    # this navigates to the next page
    driver.find_element_by_xpath('//ul[@class="results-pagination"]/li/[@onclick=">"]').click()
    time.sleep(5)
except NoSuchElementException:
    break
Any ideas what I'm doing wrong on this?
Many thanks in advance.
Chris
You can try this code:
browser.get("https://www.shearman.com/people")
wait = WebDriverWait(browser, 30)
main_tab = browser.current_window_handle
navigation_buttons = browser.find_elements_by_xpath('//ul[@class="results-pagination"]//descendant::a')
size = len(navigation_buttons)
print('this is the length of the list:', size)
i = 0
while i < size:
    # Ctrl+click opens the link in a new tab, keeping the main tab intact
    ActionChains(browser).key_down(Keys.CONTROL).click(navigation_buttons[i]).key_up(Keys.CONTROL).perform()
    browser.switch_to_window(main_tab)
    i = i + 1
    if i >= size:
        break
Make sure to import these:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
Note this will open each link in a new tab. As per your requirement, you can click on the next button using this XPath: //ul[@class="results-pagination"]//descendant::a
If you want to open the links one by one in the same tab, then you will have to handle stale element references, because once you navigate away from the main page, all of its elements become stale.
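A minimal sketch of that same-tab approach, assuming the browser object from the answer above: re-locating the links after every click avoids StaleElementReferenceException, because you never reuse a reference from before the page changed.
import time

page_index = 0
while True:
    # Re-find the pagination links so the reference is always fresh
    links = browser.find_elements_by_xpath('//ul[@class="results-pagination"]//descendant::a')
    if page_index >= len(links):
        break
    links[page_index].click()  # freshly located element, safe to click
    time.sleep(5)              # crude wait; an explicit wait would be more robust
    # ... scrape the current page here ...
    page_index += 1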