I'm trying to run code from a tutorial in VS Code and keep getting errors saying there is no pandas module, even though I know it is installed.
I've tried using "Select Interpreter" to swap between versions of Python, but then I have issues with the requests module. The code below works if I comment out the pandas import, but I can't understand why it doesn't work with the import in place.
I tried using pip3 install pandas but the terminal tells me it is already installed.
The code is:
import requests
from bs4 import BeautifulSoup
import time
import datetime
import smtplib
import csv
#import pandas as pd
def check_price():
    """Scrape one Amazon product page and append title, price and date to a CSV.

    Requests the hard-coded product URL with a browser-like User-Agent,
    reads the product title and the whole/fraction parts of the price from
    the HTML, and appends one row to ``amazonscraper2.csv``. The header row
    is written only while the file is still empty, so repeated calls do not
    scatter header lines through the data.

    Raises:
        AttributeError: if the title or a price element is missing from the
            returned page (``find`` then yields ``None``).
    """
    URL = "https://www.amazon.co.uk/Funny-Data-T-shirt-Mining-T-Shirt/dp/B0B68TSGCR/ref=sr_1_1?keywords=funny+data+mining&qid=1560000000&s=gateway&sr=8-1"
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
    # One request, sent with the User-Agent header. The earlier header-less
    # request stored its body in a variable that was never read.
    page = requests.get(URL, headers=headers)
    # Parse, prettify and re-parse, so every text node ends up on its own line
    # and can be isolated with split() below.
    soup1 = BeautifulSoup(page.content, "html.parser")
    soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
    title = soup2.find(id="productTitle").get_text().strip()
    price_whole = soup2.find(attrs={"class": "a-price-whole"}).get_text()
    price_fraction = soup2.find(attrs={"class": "a-price-fraction"}).get_text()
    # Keep only the first whitespace-separated token of each price part.
    price = f"{price_whole.split()[0]}.{price_fraction.split()[0]}"
    today = datetime.date.today()
    header = ['Title', 'Price', 'Date']
    data = [title, price, today]
    with open("amazonscraper2.csv", "a+", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        # Move to the end explicitly so tell() reports the current file size;
        # a size of zero means the header has not been written yet.
        f.seek(0, 2)
        if f.tell() == 0:
            writer.writerow(header)
        writer.writerow(data)
# Poll the product page forever, pausing five seconds between checks.
# NOTE(review): indentation was lost in the paste; print() is placed inside
# the loop because it would be unreachable after an infinite loop — confirm.
while True:
    check_price()
    time.sleep(5)
    print("running")
#df = pd.read_csv("amazonscraper2.csv")
#print(df)
Related
I have a problem. I am trying to extract the email addresses from a website.
To see an email address, I have to click on the email icon (see screenshot). Once I click on the icon, a new "popup" appears.
I have tried using Selenium's get_attribute for data-mailto-token and data-mailto-vector (see screenshot), but without any success. How can I extract the email addresses with Python from these so-called "popups"? Any help would be greatly appreciated!
Kind regards
Linus
I have tried using Selenium and looked into further libraries for cross-platform access, but without any success.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import requests
import re
# Script: walks the paginated hotel directory, visits a slice of the links on
# each page, prints address and phone data parsed with BeautifulSoup, then
# opens each link in Selenium to read the "E-Mail" link's data-mailto-token.
# NOTE(review): indentation was lost in this paste, so the nesting of the
# loops and the try/except pairs below cannot be recovered with certainty.
# NOTE(review): the find_element_by_* / find_elements_by_* helpers used here
# are not available in recent Selenium 4 releases — confirm the installed version.
#card_small = driver.find_elements_by_class_name("Card small")
i_num = 1
list_links = []
list_links_all = []
num_inc = 1
# 14 iterations; the page number placed in the URL comes from num_inc, not i_p.
for i_p in range(0,14):
url = "https://www.hotelleriesuisse.ch/de/branche-und-politik/branchenverzeichnis/hotel-page-"+str(num_inc)+"?filterValues=QWN0aXZlLEluYWN0aXZlOzs7OzQsMzs7Ozs7OzQ5LDEzLDUsNDU7&cHash=30901b0e3080a928cd0ad32522e81b3f"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
# A new Chrome instance is created here each time this line runs.
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)
time.sleep(5)
# Click the "allow" button of the cookie banner (selector targets .cc-window ... a.cc-allow).
driver.find_element_by_css_selector("body > div.cc-window.cc-banner.cc-type-info.cc-theme-block.cc-bottom.cc-visible > div > div.cc-actions > a.cc-btn.cc-allow").click()
try:
driver.execute_script("window.scrollTo(0,2150)")
# Collect the href of every <a> element on the page.
target = driver.find_elements_by_tag_name("a")
for i in target:
list_links.append(i.get_attribute("href"))
# Only the links at positions 10..21 are visited — presumably the hotel
# detail pages; verify against the page layout.
for i in range(10,22):
url_new = list_links[i]
print(url_new)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
# The detail page is fetched with requests for parsing, separately from the Selenium session.
page = requests.get(url_new, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
name = soup.find('span',class_="Avatar--name")
# address and phone run the same query; they are told apart further down by
# a nested "span p" match (address) and a phone-number regex (phone).
address = soup.find_all('span', class_="Button--label")
phone = soup.find_all('span', class_="Button--label")
if name != None:
name_text = soup.find('span', class_="Avatar--name").text
#print(name_text)
if address != None:
for i in address:
search=i.select("span p")
if search != []:
print(search[0].text)
if phone != None:
for i in phone:
# Matches numbers shaped like "+NN NN NNN NN NN".
match = re.search("[+]\d{2} \d{2} \d{3} \d{2} \d{2}",i.text)
if match !=None:
print(match.group())
time.sleep(5)
# Open the same detail page in the browser to reach the e-mail link.
driver.get(url_new)
try:
driver.execute_script("window.scrollTo(0,900)")
time.sleep(5)
element=driver.find_element_by_link_text("E-Mail")
# Prints the raw token attribute; the token is not decoded anywhere in this script.
info = element.get_attribute("data-mailto-token")
print(info)
element.click()
except NoSuchElementException:
pass
# Reset the per-page link list and advance the page counters.
list_links = []
num_inc = num_inc + 1
i_num = i_num + 1
driver.close()
# The triple-quoted string below is an inert expression: a disabled
# "next page" click kept as a string literal.
"""
driver.find_element_by_css_selector("#main-content > section.CardGrid > nav > a.Button.nolabel.primary.Pagination--button.Pagination--next").click()
time.sleep(5)
print("This is the end of page: "+str(i_num))
i_num = i_num + 1
time.sleep(5)
"""
except ElementClickInterceptedException:
break
The email address can be obtained by decrypting the combination of data-mailto-token and data-mailto-vector values found in the button.
I am new to crawling and I am trying to crawl the https://www.stradivarius.com/tr/en/woman/clothing/shop-by-product/sweatshirts-c1390587.html webpage. Sometimes I can get the hrefs, but usually the code gives me an empty list. Do you have any suggestions?
These are the packages:
import requests
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import *
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import json
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')
from unidecode import unidecode
import re
import time
from webdriver_manager.chrome import ChromeDriverManager
# Script: opens the category page in Chrome and tries to collect the href of
# every <a id="hrefRedirectProduct"> element into urlist.
browser = webdriver.Chrome(ChromeDriverManager().install())
urlist = []
browser.get('https://www.stradivarius.com/tr/kad%C4%B1n/giyim/%C3%BCr%C3%BCne-g%C3%B6re-al%C4%B1%C5%9Fveri%C5%9F/sweatshi%CC%87rt-c1390587.html')
# NOTE(review): the HTML snapshot is taken here, before the implicit wait
# below has been configured.
html = browser.page_source
# No parser is named, so BeautifulSoup picks one itself.
soup = BeautifulSoup(html)
browser.implicitly_wait(90)
product_links=soup.find_all('a', {'id':'hrefRedirectProduct'})
# NOTE(review): indentation was lost in this paste; the append presumably
# forms the loop body.
for a in product_links:
# NOTE(review): this indexes the result list product_links with "href"
# rather than the loop variable a — confirm intent.
urlist.append(product_links["href"])
It's possible the data hasn't rendered yet. You have the .implicitly_wait(90) but it's after you've already pulled the html. So you need to move that up in your code.
# Collect the product hrefs from the rendered category page.
urlist = []
browser.get('https://www.stradivarius.com/tr/kad%C4%B1n/giyim/%C3%BCr%C3%BCne-g%C3%B6re-al%C4%B1%C5%9Fveri%C5%9F/sweatshi%CC%87rt-c1390587.html')
browser.implicitly_wait(90) #<--- wait for the page to render BEFORE...
# The implicit wait only takes effect during element lookups, so perform one
# lookup for the product anchors; it blocks until at least one is present or
# the 90 second limit expires.
browser.find_elements(By.ID, 'hrefRedirectProduct')
html = browser.page_source # ...grabbing the html source
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(html, 'html.parser')
product_links = soup.find_all('a', {'id': 'hrefRedirectProduct'})
# Read the href from each anchor; the previous code indexed the result list
# itself, which raises TypeError as soon as the list is non-empty.
urlist.extend(a["href"] for a in product_links)
A better solution may be to go after the data from the source.
Does this include your desired href?
import requests
import pandas as pd
# Request the category's product list as JSON and load it into a DataFrame.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'}
url = 'https://www.stradivarius.com/itxrest/2/catalog/store/54009571/50331068/category/1390587/product?languageId=-43&appId=1'
response = requests.get(url, headers=headers)
jsonData = response.json()
# The frame is built from the value stored under the "products" key.
products = jsonData['products']
df = pd.DataFrame(products)
Output:
print(df['productUrl'])
0 kolej-sweatshirt-l06710711
1 oversize-hard-rock-cafe-baskl-sweatshirt-l0670...
2 oversize-hard-rock-cafe-baskl-sweatshirt-l0670...
3 oversize-hard-rock-cafe-kapusonlu-sweatshirt-l...
4 fermuarl-sweatshirt-l06521718
60 fermuarl-oversize-kapusonlu-sweatshirt-l06765643
61 dikisli-basic-sweatshirt-l06519703
62 jogging-fit-pantolon-ve-sweatshirt-seti-l01174780
63 naylon-sweatshirt-l08221191
64 dikisli-basic-sweatshirt-l06519703
Name: productUrl, Length: 65, dtype: object
I am trying to create a loop that will loop through locations, extract the necessary data from each one, and append it to the data from the rest of the locations.
I feel that the code I have written is good but keep getting an error of:
AttributeError: 'NoneType' object has no attribute 'find_all'
but I know that shouldn't be the case.
Any help would be appreciated. Here is my code:
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import requests
# Script: for each location, download the apartments.com 1-bedroom page,
# read the "rentTrendGrid" table, transpose it into a DataFrame tagged with
# the city, and finally concatenate all frames.
locations = ['las-vegas-nv','denver-co']
# NOTE(review): indentation was lost in this paste, so the extent of the loop
# body below is not visible.
for location in locations:
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}
# NOTE(review): this literal has no f prefix, so "{location}" is sent as-is
# instead of being replaced by the loop value.
url = 'https://www.apartments.com/{location}/1-bedrooms/'
page = requests.get(url, headers=headers)
soup = bs(page.text, 'lxml')
# find() yields None when no matching table exists; table.find_all below then
# raises the AttributeError quoted in the question.
table = soup.find("table", class_="rentTrendGrid")
rows = []
# One list of cell texts (th and td) per table row.
for tr in table.find_all('tr'):
rows.append([td.text for td in tr.find_all(['th', 'td'])])
#header_row = rows[0]
rows = list(zip(*rows[0:])) # transpose the table
# After transposing, the first tuple supplies the column names.
df = pd.DataFrame(rows[1:], columns=rows[0])
df['City'] = location
# NOTE(review): dfs is not defined anywhere in the visible code — confirm it
# is initialised elsewhere.
dfs.append(df)
df = pd.concat(dfs).reset_index(drop = True)
print(df)
Andrej was right, super simple just had to put the 'f' in front.
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import requests
# Script: for each location, download the apartments.com 1-bedroom page,
# read the "rentTrendGrid" table, transpose it into a DataFrame tagged with
# the city, then concatenate the per-city frames and print the result.
locations = ['las-vegas-nv', 'denver-co']

# The request headers do not depend on the location, so build them once.
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}

# Per-location frames; this list was previously used without being created.
dfs = []

for location in locations:
    url = f'https://www.apartments.com/{location}/1-bedrooms/'
    page = requests.get(url, headers=headers)
    soup = bs(page.text, 'lxml')
    table = soup.find("table", class_="rentTrendGrid")
    if table is None:
        # find() returns None when the page has no such table; skip the
        # location instead of failing on table.find_all.
        print(f"No rentTrendGrid table found for {location}; skipping")
        continue

    # One list of cell texts (th and td) per table row.
    rows = [
        [td.text for td in tr.find_all(['th', 'td'])]
        for tr in table.find_all('tr')
    ]
    # Transpose so the first tuple holds the column names.
    rows = list(zip(*rows))
    df = pd.DataFrame(rows[1:], columns=rows[0])
    df['City'] = location
    dfs.append(df)

# pd.concat raises ValueError when no location produced a table.
df = pd.concat(dfs).reset_index(drop=True)
print(df)
This is my first scrape. I have tried watching some videos online and using google. I haven't had much success with this website, though. Perhaps someone can help me.
This is what I have started with...
from lxml import html
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Script: download the MLB standings page, pair each team with its
# Pythagorean win/loss record and write the result to an Excel sheet.
# The URL literal had been split across two lines, which is a syntax error.
req = requests.get('https://www.baseball-reference.com/leagues/MLB-standings.shtml')
soup = BeautifulSoup(req.text, "lxml")
# Text of every <td> tagged with the matching data-stat attribute.
W_L = [i.text for i in soup.find_all('td', {'data-stat': 'record_pythag'})]
team = [i.text for i in soup.find_all('td', {'data-stat': 'team_ID'})]
my_dict = dict(zip(team, W_L))
# A dict of plain strings has no index for the DataFrame constructor to use,
# so wrap it in a Series first (team becomes the index).
df = pd.DataFrame(pd.Series(my_dict))
# The context manager saves and closes the workbook on exit.
with pd.ExcelWriter('my1st_webscrape.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1')
I would just like the pythagorean win/loss column please. Thanks!
Search for all td tags that include data-stat corresponding to record_pythag and extract .text from each:
W_L = [i.text for i in soup.find_all('td', {'data-stat': 'record_pythag'})]
Note that this assumes all tags found will include text. If one does not it will throw a NoneType error, in which case you could wrap this in a try-except block.
I imagine the dictionary you want to create uses the team name abbreviations as the keys, in which case you can assemble a list of them like so:
team = [i.text for i in soup.find_all('td', {'data-stat': 'team_ID'})]
Then you can create your dictionary:
my_dict = dict(zip(team, W_L))
And pass it to a dataframe:
df = pd.DataFrame(my_dict)
EDIT
It turns out you actually need Selenium in order to pull the information you are interested in. Selenium allows you to automate a web browser and it will load the full page source HTML. You will also need to convert the dictionary to a pandas Series prior to passing it to the DataFrame constructor. See revised code below (note that you will need to download the selenium chrome driver, or whichever web browser driver you are interested in using):
from bs4 import BeautifulSoup
import requests, os
import pandas as pd
from selenium import webdriver
# Script: render the MLB standings page in Chrome, pair each team with its
# Pythagorean win/loss record and write the result to an Excel sheet.
# Placeholder: replace with the directory that holds your chromedriver binary.
os.chdir('path your chrome driver')
# NOTE(review): this header dict is not passed to anything in the visible code.
header = {'User-agent' : 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5'}
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
# "options=" replaces the deprecated "chrome_options=" keyword.
driver = webdriver.Chrome(options=options)
try:
    driver.get('https://www.baseball-reference.com/leagues/MLB-standings.shtml')
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always shut the browser down, even if loading or parsing fails.
    driver.quit()
# Text of every <td> tagged with the matching data-stat attribute.
W_L = [i.text for i in soup.find_all('td', {'data-stat': 'record_pythag'})]
team = [i.text for i in soup.find_all('td', {'data-stat': 'team_ID'})]
my_dict = dict(zip(team, W_L))
# Wrap the dict in a Series so the team names become the index.
df = pd.DataFrame(pd.Series(my_dict))
# The context manager saves and closes the workbook on exit.
with pd.ExcelWriter('my1st_webscrape.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1')
I'm attempting to use PyQt4 to do some web scraping, but the site I'm attempting to scrape keeps thinking I'm a mobile device, and is not presenting the dataset available to a desktop or laptop (even though I'm using a Mozilla/5.0 user agent).
To try to find out why I'm setting my URL to "whatsmyuseragent.com". And I notice that it's telling me that although my Screen Resolution is 1920px x 1080px my Browser Window Size is 0px x 0px, so could this be the problem?
Here is my code below. Any suggestions on what I need to change to convince the site I'm scraping to believe I'm a desktop or laptop (rather than a mobile) would be appreciated. Thanks.
import sys
from PyQt4 import QtGui
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtNetwork import QNetworkRequest
import bs4 as bs
import urllib.request
class Client(QWebPage):
    """QWebPage that loads one URL and blocks until the load has finished.

    Constructing an instance creates the QApplication, starts loading *url*
    with a ``Mozilla/5.0`` User-Agent header and runs the Qt event loop until
    ``loadFinished`` fires. The page content is then available through
    ``mainFrame().toHtml()``.
    """

    def __init__(self, url):
        """Load *url* and return once the page has finished loading."""
        # The application object is created first, before the page itself
        # is initialised.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self.on_page_load)
        self.request = QNetworkRequest()
        self.request.setUrl(QUrl(url))
        self.request.setRawHeader("User-Agent", 'Mozilla/5.0')
        self.mainFrame().load(self.request)
        # Blocks here until on_page_load() quits the event loop.
        self.app.exec_()

    def on_page_load(self):
        """Stop the event loop once the page has finished loading."""
        self.app.quit()
# Load the page through Client, then pretty-print the HTML it ended up with.
url = 'http://www.whatsmyuseragent.com'
client_response = Client(url)
main_frame = client_response.mainFrame()
source = main_frame.toHtml()
soup = bs.BeautifulSoup(source, 'lxml')
pretty_html = soup.prettify()
print(pretty_html)
Try to set the Viewport Size:
import sys
import re
from PyQt4 import QtGui
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, QSize
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtNetwork import QNetworkRequest
import bs4 as bs
class Client(QWebPage):
    """QWebPage that loads one URL with a fixed viewport and waits for it.

    Constructing an instance creates the QApplication, sets the viewport
    size, starts loading *url* with a ``Mozilla/5.0`` User-Agent header and
    runs the Qt event loop until ``loadFinished`` fires. The page content is
    then available through ``mainFrame().toHtml()``.
    """

    def __init__(self, url, viewport_size=(640, 480)):
        """Load *url* and return once the page has finished loading.

        Args:
            url: address of the page to load.
            viewport_size: ``(width, height)`` in pixels handed to
                ``setViewportSize``; defaults to the original 640 x 480.
        """
        # The application object is created first, before the page itself
        # is initialised.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        # Give the page a non-zero viewport before anything is loaded.
        size = QSize(*viewport_size)
        self.setViewportSize(size)
        self.loadFinished.connect(self.on_page_load)
        self.request = QNetworkRequest()
        self.request.setUrl(QUrl(url))
        self.request.setRawHeader("User-Agent", 'Mozilla/5.0')
        self.mainFrame().load(self.request)
        # Blocks here until on_page_load() quits the event loop.
        self.app.exec_()

    def on_page_load(self):
        """Stop the event loop once the page has finished loading."""
        self.app.quit()
# Load the page through Client and print the reported browser window size.
url = 'http://www.whatsmyuseragent.com'
client_response = Client(url)
source = client_response.mainFrame().toHtml()
soup = bs.BeautifulSoup(source, 'lxml')
# some meat from the soup
window_info = soup.find(class_='browser-window')
# Raw string: '\s' in a plain literal is an invalid escape sequence.
# Every run of whitespace is collapsed to a single space.
print(re.sub(r'\s+', ' ', window_info.text))
This produces the following for me:
625 px x 465 px