Why does Selenium sometimes fail to find hrefs without raising an error? - selenium

I am new to crawling and I am trying to crawl the https://www.stradivarius.com/tr/en/woman/clothing/shop-by-product/sweatshirts-c1390587.html web page. Sometimes I can get the hrefs, but usually the code gives me an empty list. Do you have any suggestions?
These are the packages:
import requests
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import *
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import json
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')
from unidecode import unidecode
import re
import time
from webdriver_manager.chrome import ChromeDriverManager
# Open the category page and collect the href of every product anchor.
browser = webdriver.Chrome(ChromeDriverManager().install())
urlist = []
browser.get('https://www.stradivarius.com/tr/kad%C4%B1n/giyim/%C3%BCr%C3%BCne-g%C3%B6re-al%C4%B1%C5%9Fveri%C5%9F/sweatshi%CC%87rt-c1390587.html')
# NOTE: the snapshot is taken right after get() returns, so anything the page
# adds later through JavaScript is missing from it.
html = browser.page_source
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
soup = BeautifulSoup(html, 'html.parser')
# NOTE: implicitly_wait() only sets the timeout for later find_element* calls;
# it does not pause the script at this point.
browser.implicitly_wait(90)
product_links = soup.find_all('a', {'id': 'hrefRedirectProduct'})
# Read the attribute from each tag; indexing the result list itself with
# "href" raises TypeError.
for a in product_links:
    urlist.append(a["href"])

It's possible the data hasn't rendered yet. You do call .implicitly_wait(90), but only after you've already pulled the HTML, so the wait needs to happen earlier in your code.
urlist = []
browser.get('https://www.stradivarius.com/tr/kad%C4%B1n/giyim/%C3%BCr%C3%BCne-g%C3%B6re-al%C4%B1%C5%9Fveri%C5%9F/sweatshi%CC%87rt-c1390587.html')
# Wait for the page to render BEFORE grabbing the HTML source.
# implicitly_wait(90) would only set the lookup timeout for find_element*
# calls and never delays page_source, so block explicitly until at least one
# product anchor is present in the DOM (raises TimeoutException after 90 s).
WebDriverWait(browser, 90).until(
    EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, 'a#hrefRedirectProduct')
    )
)
html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')
product_links = soup.find_all('a', {'id': 'hrefRedirectProduct'})
# Take the href from each individual tag, not from the result list.
urlist.extend(a["href"] for a in product_links)
A better solution may be to fetch the data directly from the endpoint that serves it.
Does this include the hrefs you want?
import requests
import pandas as pd
# Fetch the category's product list as JSON and load it into a DataFrame.
url = 'https://www.stradivarius.com/itxrest/2/catalog/store/54009571/50331068/category/1390587/product?languageId=-43&appId=1'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'}
# Bound the request time and fail on a non-2xx reply instead of trying to
# decode an error page as JSON.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
jsonData = response.json()
df = pd.DataFrame(jsonData['products'])
Output:
print(df['productUrl'])
0 kolej-sweatshirt-l06710711
1 oversize-hard-rock-cafe-baskl-sweatshirt-l0670...
2 oversize-hard-rock-cafe-baskl-sweatshirt-l0670...
3 oversize-hard-rock-cafe-kapusonlu-sweatshirt-l...
4 fermuarl-sweatshirt-l06521718
60 fermuarl-oversize-kapusonlu-sweatshirt-l06765643
61 dikisli-basic-sweatshirt-l06519703
62 jogging-fit-pantolon-ve-sweatshirt-seti-l01174780
63 naylon-sweatshirt-l08221191
64 dikisli-basic-sweatshirt-l06519703
Name: productUrl, Length: 65, dtype: object

Related

Selenium : How to Keep Browser open after code ends

I am doing an automation task to click some webpages.
What I want is to keep using the Chrome browser after my code has finished. I've done this with time.sleep(), but I think there must be other ways to do it. Are there any good ideas?
import datetime
import time
from selenium import webdriver
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Log in, tick the basket checkbox, wait until the enrollment time, save the
# basket and print the resulting alert text.
options = webdriver.ChromeOptions()
options.add_argument("disable-gpu")
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
# Experimental options are stored by name, so a second "excludeSwitches" call
# replaces the first one; pass both switches in a single list.
options.add_experimental_option(
    "excludeSwitches", ["enable-logging", "enable-automation"])
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=options)
driver.get("https://example.com")
driver.maximize_window()
# XPath attribute tests are spelled @id; '#id' is not valid XPath.
driver.find_element(By.XPATH, '//*[@id="USER_ID"]').send_keys("id")
driver.find_element(By.XPATH, '//*[@id="PWD"]').send_keys("pwd")
driver.find_element(By.XPATH, '//*[@id="btn_Login"]').click()
WebDriverWait(driver, 10000).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="jqg_grd_basket_1"]')))
driver.find_element(By.XPATH, '//*[@id="cb_grd_basket"]').click()
enrollment_time = datetime.datetime(2023, 2, 6, 10, 20, 1, 0)
current_time = datetime.datetime.now()
wait_time = enrollment_time - current_time
# time.sleep() raises ValueError for a negative duration; skip the wait when
# the enrollment time has already passed.
time.sleep(max(0, wait_time.total_seconds()))
driver.find_element(By.XPATH, '//*[@id="btn_basketSave"]').click()
alert = Alert(driver)
print(alert.text)
# Keeps the script itself running (the workaround the question asks about).
time.sleep(999999)
As long as you don't call driver.close() or driver.quit(), the browser will stay open until you close it manually.

How to avoid ModuleNotFoundError: No module named 'pandas' in VSCode

I'm trying to run code from a tutorial in VSCode and keep getting errors saying there is no pandas module, even though I know it is installed.
I've tried using "select interpreter" to swap between versions of python but then I have issues with requests module. The code below does work if I comment out the pandas module but I can't understand why this code doesn't work.
I tried using pip3 install pandas but the terminal tells me it is already installed.
The code is:
import requests
from bs4 import BeautifulSoup
import time
import datetime
import smtplib
import csv
#import pandas as pd
def check_price():
    """Scrape the product title and price and append them to a CSV file.

    Downloads the hard-coded Amazon product page, extracts the title and the
    whole/fraction price parts, and appends ``[title, price, today]`` to
    ``amazonscraper2.csv``. The header row is written only when the file is
    still empty, so repeated calls do not duplicate it.

    Raises:
        AttributeError: if one of the expected elements is not in the page.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    URL = "https://www.amazon.co.uk/Funny-Data-T-shirt-Mining-T-Shirt/dp/B0B68TSGCR/ref=sr_1_1?keywords=funny+data+mining&qid=1560000000&s=gateway&sr=8-1"
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
    # One request only (the earlier header-less request was never used), with
    # a timeout so the polling loop cannot hang forever on a stalled socket.
    page = requests.get(URL, headers=headers, timeout=30)
    soup1 = BeautifulSoup(page.content, "html.parser")
    # Re-parsing the prettified markup changes the whitespace around text
    # nodes; presumably the .split()[0] calls below rely on that -- keep it.
    soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
    title = soup2.find(id="productTitle").get_text().strip()
    price_whole = soup2.find(attrs={"class": "a-price-whole"}).get_text()
    price_fraction = soup2.find(attrs={"class": "a-price-fraction"}).get_text()
    price = f"{price_whole.split()[0]}.{price_fraction.split()[0]}"
    today = datetime.date.today()
    header = ['Title', 'Price', 'Date']
    data = [title, price, today]
    with open("amazonscraper2.csv", "a+", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        # In append mode the position starts at the end of the file, so a
        # position of 0 means the file is empty and still needs its header.
        if f.tell() == 0:
            writer.writerow(header)
        writer.writerow(data)
# Endless polling: call check_price(), then pause for 5 seconds.
while(True):
check_price()
time.sleep(5)
print("running")
#df = pd.read_csv("amazonscraper2.csv")
#print(df)

Unable to extract Email addresses from Pop Up

I have problem. I am trying to extract the email addresses from a website.
When trying to extract the email addresses, I have to click on the email icon (see screenshot) in order for the address to appear. Once I click on the icon, a new "popup" appears.
I have tried using Selenium's get_attribute for data-mailto-token and data-mailto-vector (see screenshot), but without any success. How can I extract the email addresses from these so-called "popups" with Python? Any help would be greatly appreciated!
Kind regards
Linus
I have tried using Selenium and looked into to further libraries for cross plattform access, but without any success
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import requests
import re
#card_small = driver.find_elements_by_class_name("Card small")
# Scrapes hotel listing pages: opens each listing page in Chrome, collects
# anchor hrefs, then visits a fixed slice of them to print contact details
# and the "data-mailto-token" attribute of the "E-Mail" link.
i_num = 1
list_links = []
list_links_all = []
num_inc = 1
# 14 iterations; the listing URL is built from num_inc, not from i_p.
for i_p in range(0,14):
url = "https://www.hotelleriesuisse.ch/de/branche-und-politik/branchenverzeichnis/hotel-page-"+str(num_inc)+"?filterValues=QWN0aXZlLEluYWN0aXZlOzs7OzQsMzs7Ozs7OzQ5LDEzLDUsNDU7&cHash=30901b0e3080a928cd0ad32522e81b3f"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
# A new Chrome instance is started here on every pass.
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)
time.sleep(5)
# Presumably the "allow" button of the cookie-consent banner -- confirm against the page.
driver.find_element_by_css_selector("body > div.cc-window.cc-banner.cc-type-info.cc-theme-block.cc-bottom.cc-visible > div > div.cc-actions > a.cc-btn.cc-allow").click()
try:
driver.execute_script("window.scrollTo(0,2150)")
# Collect the href of every <a> element currently on the page.
target = driver.find_elements_by_tag_name("a")
for i in target:
list_links.append(i.get_attribute("href"))
# Only list_links[10] .. list_links[21] are visited.
for i in range(10,22):
url_new = list_links[i]
print(url_new)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
# The detail page is fetched with requests first and parsed with BeautifulSoup.
page = requests.get(url_new, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
name = soup.find('span',class_="Avatar--name")
# address and phone are both filled from the same "Button--label" spans.
address = soup.find_all('span', class_="Button--label")
phone = soup.find_all('span', class_="Button--label")
if name != None:
name_text = soup.find('span', class_="Avatar--name").text
#print(name_text)
if address != None:
for i in address:
search=i.select("span p")
if search != []:
print(search[0].text)
if phone != None:
for i in phone:
# Matches "+" followed by digit groups of 2, 2, 3, 2 and 2 separated by single spaces.
match = re.search("[+]\d{2} \d{2} \d{3} \d{2} \d{2}",i.text)
if match !=None:
print(match.group())
time.sleep(5)
# The same detail page is then opened in the browser to reach the "E-Mail" link.
driver.get(url_new)
try:
driver.execute_script("window.scrollTo(0,900)")
time.sleep(5)
element=driver.find_element_by_link_text("E-Mail")
# Only the raw attribute value is printed here; nothing in this script decodes it.
info = element.get_attribute("data-mailto-token")
print(info)
element.click()
except NoSuchElementException:
pass
list_links = []
num_inc = num_inc + 1
i_num = i_num + 1
driver.close()
# The triple-quoted string below is an unused string literal holding disabled pagination code.
"""
driver.find_element_by_css_selector("#main-content > section.CardGrid > nav > a.Button.nolabel.primary.Pagination--button.Pagination--next").click()
time.sleep(5)
print("This is the end of page: "+str(i_num))
i_num = i_num + 1
time.sleep(5)
"""
except ElementClickInterceptedException:
break
The email address can be obtained by decrypting the combination of data-mailto-token and data-mailto-vector values found in the button.

My selenium web scraper doesn't make sense to me

I want to get the bitcoin price using the following code. I have no clue why the output behaves that way. It appears to store certain values and outputs them in-between accurate values. Bonus task: Make the old values disappear in tkinter
from bs4 import BeautifulSoup #Downloading pertinent Python packages
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import tkinter as tk
import time
# Refresh interval in milliseconds. Named REFRESH_MS so that it no longer
# rebinds (and hides) the imported `time` module.
REFRESH_MS = 1000


def bitcoinTracker():
    """Return the Bitcoin price text shown on coinmarketcap.com.

    Starts a headless Chrome, loads the Bitcoin page, reads the text of the
    element with class ``priceValue___11gHJ`` and shuts the browser down
    again, whether or not the lookup succeeded.
    """
    options = Options()
    options.headless = True
    options.add_argument("--window-size=1920,1200")
    chromedriver = "/Users/Philipp/PhytonWebScrap/selenium_project/chromedriver" #Setting up Chrome driver
    driver = webdriver.Chrome(options=options, executable_path=chromedriver)
    try:
        driver.get("https://coinmarketcap.com/currencies/bitcoin/")
        return driver.find_element_by_class_name("priceValue___11gHJ").text
    finally:
        # quit() used to sit after the return statement and therefore never
        # ran; the finally clause runs it on success and on error alike.
        driver.quit()


def collector():
    """Show the current price in a new label and schedule the next refresh."""
    # NOTE: a fresh Label is created and packed on every call; labels from
    # earlier calls are not removed.
    label = tk.Label(text="Bitcoin " + bitcoinTracker(), font="Arial 18")
    label.pack()
    root.after(REFRESH_MS, collector)


root = tk.Tk()
root.after(REFRESH_MS, collector)
root.mainloop()
I tried again, this time with only Selenium and without tkinter:
from bs4 import BeautifulSoup #Downloading pertinent Python packages
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
# Repeatedly start a headless Chrome, read the Bitcoin price text, wait 20
# seconds, print the value and close the browser.
options = Options()
options.headless = True
options.add_argument("--window-size=1920,1200")
# The driver path never changes, so it is set once outside the loop.
chromedriver = "/Users/Philipp/PhytonWebScrap/selenium_project/chromedriver" #Setting up Chrome driver
while True:
    driver = webdriver.Chrome(options=options, executable_path=chromedriver)
    try:
        driver.get("https://coinmarketcap.com/currencies/bitcoin/")
        hunt = driver.find_element_by_class_name("priceValue___11gHJ").text
        # NOTE: the value is printed 20 seconds after it was read.
        time.sleep(20)
        print(hunt)
    finally:
        # Close the browser even when loading the page or the lookup fails.
        driver.quit()

PyQt4 & BeautifulSoup setting the Browser Window Size

I'm attempting to use PyQt4 to do some web scraping, but the site I'm attempting to scrape keeps thinking I'm a mobile device, and is not presenting the dataset available to a desktop or laptop (even though I'm using a Mozilla/5.0 user agent).
To try to find out why, I set my URL to "whatsmyuseragent.com". I noticed that although it reports my Screen Resolution as 1920px x 1080px, it reports my Browser Window Size as 0px x 0px. Could this be the problem?
Here is my code below. Any suggestions on what I need to change to convince the site I'm scraping to believe I'm a desktop or laptop (rather than a mobile) would be appreciated. Thanks.
import sys
from PyQt4 import QtGui
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtNetwork import QNetworkRequest
import bs4 as bs
import urllib.request
class Client(QWebPage):
    """Web page that loads a URL and blocks until loading has finished."""

    def __init__(self, url):
        """Load *url* with a Mozilla/5.0 User-Agent header and run the
        Qt event loop until the ``loadFinished`` signal fires."""
        # The application object is created ahead of the page initialisation.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self.on_page_load)
        request = QNetworkRequest()
        request.setUrl(QUrl(url))
        request.setRawHeader("User-Agent", 'Mozilla/5.0')
        self.request = request
        self.mainFrame().load(self.request)
        # Returns once on_page_load() has called quit().
        self.app.exec_()

    def on_page_load(self):
        """Stop the event loop so that ``__init__`` can return."""
        self.app.quit()
# Load the page through Client, take the HTML of its main frame, parse it
# with the lxml parser and print the prettified markup.
url = 'http://www.whatsmyuseragent.com'
client_response = Client(url)
source = client_response.mainFrame().toHtml()
soup = bs.BeautifulSoup(source, 'lxml')
print(soup.prettify())
Try setting the viewport size:
import sys
import re
from PyQt4 import QtGui
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, QSize
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtNetwork import QNetworkRequest
import bs4 as bs
class Client(QWebPage):
    """Web page with a 640x480 viewport that loads a URL and blocks until
    loading has finished."""

    def __init__(self, url):
        """Load *url* with a Mozilla/5.0 User-Agent header and run the
        Qt event loop until the ``loadFinished`` signal fires."""
        # The application object is created ahead of the page initialisation.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        # Give the page an explicit viewport size before anything is loaded.
        self.setViewportSize(QSize(640, 480))
        self.loadFinished.connect(self.on_page_load)
        request = QNetworkRequest()
        request.setUrl(QUrl(url))
        request.setRawHeader("User-Agent", 'Mozilla/5.0')
        self.request = request
        self.mainFrame().load(self.request)
        # Returns once on_page_load() has called quit().
        self.app.exec_()

    def on_page_load(self):
        """Stop the event loop so that ``__init__`` can return."""
        self.app.quit()
# Load the page through Client, parse the HTML of its main frame and print
# the text of the element with class "browser-window".
url = 'http://www.whatsmyuseragent.com'
client_response = Client(url)
source = client_response.mainFrame().toHtml()
soup = bs.BeautifulSoup(source, 'lxml')
# Collapse every run of whitespace to a single space. The pattern is a raw
# string because '\s' in a normal literal is an invalid escape sequence.
print(re.sub(r'\s+', ' ', soup.find(class_='browser-window').text))
This produces the following for me:
625 px x 465 px