I have problem. I am trying to extract the email addresses from a website.
When trying to extract the email addresses,I have to click on the email icon enter image description here in order for it to appear. Once I click on the icon, a new "popup" appears.
I have tried using Selenium get_attribute for data-mailto-token & data-mailto-vector enter image description here, but without any success. How can I extract the email addresses with Python from these so called "popups"? Any help would be greatly appreciated!
Kind regards
Linus
I have tried using Selenium and looked into to further libraries for cross plattform access, but without any success
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import requests
import re
#card_small = driver.find_elements_by_class_name("Card small")
i_num = 1
list_links = []
list_links_all = []
num_inc = 1
for i_p in range(0,14):
url = "https://www.hotelleriesuisse.ch/de/branche-und-politik/branchenverzeichnis/hotel-page-"+str(num_inc)+"?filterValues=QWN0aXZlLEluYWN0aXZlOzs7OzQsMzs7Ozs7OzQ5LDEzLDUsNDU7&cHash=30901b0e3080a928cd0ad32522e81b3f"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)
time.sleep(5)
driver.find_element_by_css_selector("body > div.cc-window.cc-banner.cc-type-info.cc-theme-block.cc-bottom.cc-visible > div > div.cc-actions > a.cc-btn.cc-allow").click()
try:
driver.execute_script("window.scrollTo(0,2150)")
target = driver.find_elements_by_tag_name("a")
for i in target:
list_links.append(i.get_attribute("href"))
for i in range(10,22):
url_new = list_links[i]
print(url_new)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
page = requests.get(url_new, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
name = soup.find('span',class_="Avatar--name")
address = soup.find_all('span', class_="Button--label")
phone = soup.find_all('span', class_="Button--label")
if name != None:
name_text = soup.find('span', class_="Avatar--name").text
#print(name_text)
if address != None:
for i in address:
search=i.select("span p")
if search != []:
print(search[0].text)
if phone != None:
for i in phone:
match = re.search("[+]\d{2} \d{2} \d{3} \d{2} \d{2}",i.text)
if match !=None:
print(match.group())
time.sleep(5)
driver.get(url_new)
try:
driver.execute_script("window.scrollTo(0,900)")
time.sleep(5)
element=driver.find_element_by_link_text("E-Mail")
info = element.get_attribute("data-mailto-token")
print(info)
element.click()
except NoSuchElementException:
pass
list_links = []
num_inc = num_inc + 1
i_num = i_num + 1
driver.close()
"""
driver.find_element_by_css_selector("#main-content > section.CardGrid > nav > a.Button.nolabel.primary.Pagination--button.Pagination--next").click()
time.sleep(5)
print("This is the end of page: "+str(i_num))
i_num = i_num + 1
time.sleep(5)
"""
except ElementClickInterceptedException:
break
The email address can be obtained by decrypting the combination of data-mailto-token and data-mailto-vector values found in the button.
Related
I am doing an automation task to click some webpages.
What i want is to use chrome browser after my coded work is done. I've done this by time.sleep() but i think there are other ways to do this... Is there any good ideas??
import datetime
import time
from selenium import webdriver
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument("disable-gpu")
options.add_argument(
"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
options.add_experimental_option("excludeSwitches", ["enable-logging"])
options.add_experimental_option("detach", True)
options.add_experimental_option("excludeSwitches", ["enable-automation"])
driver = webdriver.Chrome(options=options)
driver.get("https://example.com")
driver.maximize_window()
driver.find_element(By.XPATH, '//*[#id="USER_ID"]').send_keys("id")
driver.find_element(By.XPATH, '//*[#id="PWD"]').send_keys("pwd")
driver.find_element(By.XPATH, '//*[#id="btn_Login"]').click()
WebDriverWait(driver, 10000).until(
EC.presence_of_element_located((By.XPATH, '//*[#id="jqg_grd_basket_1"]')))
driver.find_element(By.XPATH, '//*[#id="cb_grd_basket"]').click()
enrollment_time = datetime.datetime(2023, 2, 6, 10, 20, 1, 0)
current_time = datetime.datetime.now()
wait_time = enrollment_time - current_time
time.sleep(wait_time.total_seconds())
driver.find_element(By.XPATH, '//*[#id="btn_basketSave"]').click()
alert = Alert(driver)
print(alert.text)
time.sleep(999999)
While you don't use driver.close() or driver.quit() it will keep open till you manually exit.
I'm trying to run from a tutorial in VSCode and keep getting errors about no pandas module but I know it is installed.
I've tried using "select interpreter" to swap between versions of python but then I have issues with requests module. The code below does work if I comment out the pandas module but I can't understand why this code doesn't work.
I tried using pip3 install pandas but the terminal tells me it is already installed.
The code is:
import requests
from bs4 import BeautifulSoup
import time
import datetime
import smtplib
import csv
#import pandas as pd
def check_price():
URL = "https://www.amazon.co.uk/Funny-Data-T-shirt-Mining-T-Shirt/dp/B0B68TSGCR/ref=sr_1_1?keywords=funny+data+mining&qid=1560000000&s=gateway&sr=8-1"
html = requests.get(URL).text
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
page = requests.get(URL, headers=headers)
soup1 = BeautifulSoup(page.content, "html.parser")
soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
title = soup2.find(id="productTitle").get_text().strip()
price_whole = soup2.find(attrs= {"class": "a-price-whole"}).get_text()
price_fraction = soup2.find(attrs= {"class": "a-price-fraction"}).get_text()
price = (f"{price_whole.split()[0]}.{price_fraction.split()[0]}")
today = datetime.date.today()
header = ['Title', 'Price', 'Date']
data = [title, price, today]
with open("amazonscraper2.csv", "a+", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(header)
writer.writerow(data)
while(True):
check_price()
time.sleep(5)
print("running")
#df = pd.read_csv("amazonscraper2.csv")
#print(df)
I am new in crawling and I am trying to crawl. https://www.stradivarius.com/tr/en/woman/clothing/shop-by-product/sweatshirts-c1390587.html webpage, sometimes i could get hrefs but generally code gave me empty list? Do you have any suggesition?
This are packages:
import requests
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import *
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import json
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')
from unidecode import unidecode
import re
import time
from webdriver_manager.chrome import ChromeDriverManager
browser = webdriver.Chrome(ChromeDriverManager().install())
urlist = []
browser.get('https://www.stradivarius.com/tr/kad%C4%B1n/giyim/%C3%BCr%C3%BCne-g%C3%B6re-al%C4%B1%C5%9Fveri%C5%9F/sweatshi%CC%87rt-c1390587.html')
html = browser.page_source
soup = BeautifulSoup(html)
browser.implicitly_wait(90)
product_links=soup.find_all('a', {'id':'hrefRedirectProduct'})
for a in product_links:
urlist.append(product_links["href"])
It's possible the data hasn't rendered yet. You have the .implicitly_wait(90) but it's after you've already pulled the html. So you need to move that up in your code.
urlist = []
browser.get('https://www.stradivarius.com/tr/kad%C4%B1n/giyim/%C3%BCr%C3%BCne-g%C3%B6re-al%C4%B1%C5%9Fveri%C5%9F/sweatshi%CC%87rt-c1390587.html')
browser.implicitly_wait(90) #<--- wait for the page to render BEFORE...
html = browser.page_source # ...grabing the html source
soup = BeautifulSoup(html)
product_links=soup.find_all('a', {'id':'hrefRedirectProduct'})
for a in product_links:
urlist.append(product_links["href"])
A better solution may be to go after the data from the source.
Does this include your desired href?
import requests
import pandas as pd
url = 'https://www.stradivarius.com/itxrest/2/catalog/store/54009571/50331068/category/1390587/product?languageId=-43&appId=1'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'}
jsonData = requests.get(url, headers=headers).json()
df = pd.DataFrame(jsonData['products'])
Output:
print(df['productUrl'])
0 kolej-sweatshirt-l06710711
1 oversize-hard-rock-cafe-baskl-sweatshirt-l0670...
2 oversize-hard-rock-cafe-baskl-sweatshirt-l0670...
3 oversize-hard-rock-cafe-kapusonlu-sweatshirt-l...
4 fermuarl-sweatshirt-l06521718
60 fermuarl-oversize-kapusonlu-sweatshirt-l06765643
61 dikisli-basic-sweatshirt-l06519703
62 jogging-fit-pantolon-ve-sweatshirt-seti-l01174780
63 naylon-sweatshirt-l08221191
64 dikisli-basic-sweatshirt-l06519703
Name: productUrl, Length: 65, dtype: object
I am trying to create a loop that will loop though locations and extract out the necessary data and append it to the rest of the locations.
I feel that the code I have written is good but keep getting an error of:
AttributeError: 'NoneType' object has no attribute 'find_all'
but I know that shouldn't be the case.
Any help would be appreciated. Here is my code:
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import requests
locations = ['las-vegas-nv','denver-co']
for location in locations:
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}
url = 'https://www.apartments.com/{location}/1-bedrooms/'
page = requests.get(url, headers=headers)
soup = bs(page.text, 'lxml')
table = soup.find("table", class_="rentTrendGrid")
rows = []
for tr in table.find_all('tr'):
rows.append([td.text for td in tr.find_all(['th', 'td'])])
#header_row = rows[0]
rows = list(zip(*rows[0:])) # tranpose the table
df = pd.DataFrame(rows[1:], columns=rows[0])
df['City'] = location
dfs.append(df)
df = pd.concat(dfs).reset_index(drop = True)
print(df)
Andrej was right, super simple just had to put the 'f' in front.
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import requests
locations = ['las-vegas-nv','denver-co']
for location in locations:
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}
url = f'https://www.apartments.com/{location}/1-bedrooms/'
page = requests.get(url, headers=headers)
soup = bs(page.text, 'lxml')
table = soup.find("table", class_="rentTrendGrid")
rows = []
for tr in table.find_all('tr'):
rows.append([td.text for td in tr.find_all(['th', 'td'])])
#header_row = rows[0]
rows = list(zip(*rows[0:])) # tranpose the table
df = pd.DataFrame(rows[1:], columns=rows[0])
df['City'] = location
dfs.append(df)
df = pd.concat(dfs).reset_index(drop = True)
print(df)
import requests
import urllib
from bs4 import BeautifulSoup
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}
for idx in range(1, 17):
url = "https://www.coupang.com/np/categories/311357?page=" + str(idx)
print(url)
result = requests.get(url, headers=headers)
soup_obj = BeautifulSoup(result.content, "html.parser")
div = soup_obj.findAll("div", {"class": "name"})
lis = soup_obj.find("ul", {"id": "productList"}).findAll("li")
for li in lis:
name = li.find("div", {"class": "name"})
img = li.find("dt", {"class": "image"}).find("img", {"src": ""})
print("name: " + name.text.strip())
urllib.request.urlretrieve(img, "./imagepile")
// urllib.request.urlretrieve(img, "./imagepile") how to fix this code line? ple
import requests
from bs4 import BeautifulSoup
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'}
def main(url):
with requests.Session() as req:
for item in range(1, 18):
print(f"Extracting Page# {item}")
r = req.get(url.format(item), headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
for item in soup.select("dt.image"):
print(item.img['alt'], f"https:{item.img['src']}")
main("https://www.coupang.com/np/categories/311357?page={}")
Download Version:
import requests
from bs4 import BeautifulSoup
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'}
def main(url):
with requests.Session() as req:
for item in range(1, 3):
print(f"Extracting Page# {item}")
r = req.get(url.format(item), headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
target = [[item.img['alt'], f'https:{item.img["src"]}']
for item in soup.select("dt.image")]
for el in target:
print(f"Saving {el[0]}.jpg")
r = req.get(el[1])
with open(f"{el[0]}.jpg", 'wb') as f:
f.write(r.content)
main("https://www.coupang.com/np/categories/311357?page={}")