Debugging Selenium crawling errors - selenium

I'm trying to crawl pictures using selenium, but I keep getting errors. What should I do?
Here's the code.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import urllib.request

driver = webdriver.Chrome()
driver.get("https://www.google.co.kr/imghp?hl=ko&tab=wi&authuser=0&ogbl")
elem = driver.find_element_by_name("q")
elem.send_keys("조코딩")
elem.send_keys(Keys.RETURN)

SCROLL_PAUSE_TIME = 1

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        try:
            driver.find_element_by_css_selector(".mye4qd").click()
        except:
            break
    last_height = new_height

images = driver.find_elements_by_css_selector(".rg_i.Q4LuWd")
count = 1
for image in images:
    try:
        image.click()
        time.sleep(2)
        imgUrl = driver.find_element_by_xpath('/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img').get_attribute("src")
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(imgUrl, str(count) + ".jpg")
        count = count + 1
    except:
        pass
driver.close()
Below is the error output.
google.py:24: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead
driver.find_element_by_css_selector(".mye4qd").click()
google.py:29: DeprecationWarning: find_elements_by_* commands are deprecated. Please use find_elements() instead
images = driver.find_elements_by_css_selector(".rg_i.Q4LuWd")
google.py:35: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead
imgUrl = driver.find_element_by_xpath('/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img').get_attribute("src")
[1936:1984:0514/142442.544:ERROR:ssl_client_socket_impl.cc(999)] handshake failed; returned -1, SSL error code 1, net_error -200
I keep changing the code, but no matter what I try I still get errors.
What should I do?
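The DeprecationWarnings point at the old find_element_by_* / find_elements_by_* helpers, which Selenium 4 replaces with find_element() / find_elements() plus a By locator; the warnings themselves don't stop the script, and the ssl_client_socket_impl handshake line is Chrome-level logging rather than a Python exception, so it is typically harmless. A minimal sketch of the three flagged calls rewritten with the new API, reusing the same locators as above:
from selenium.webdriver.common.by import By

elem = driver.find_element(By.NAME, "q")
driver.find_element(By.CSS_SELECTOR, ".mye4qd").click()
images = driver.find_elements(By.CSS_SELECTOR, ".rg_i.Q4LuWd")
imgUrl = driver.find_element(By.XPATH, '/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img').get_attribute("src")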

Related

How to crawl start URLs only?

I'm trying to make the spider crawl just the provided start URLs without following any extracted links. I've tried setting rules = (Rule (follow=False),) but it still follows links. Does anyone know how to download the start URLs only?
EDIT:
Here's some code
from scrapy.spiders import CrawlSpider, Rule

class Spider(CrawlSpider):
    name = 'spider'
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'

    def __init__(self, mode, *args, **kwargs):
        if mode == 'scan':
            self.start_urls = ['https://www.example.com/']
            self.rules = (
                Rule(callback="parse_obj", follow=False),
            )
            self.custom_settings = {
                'COMPRESSION_ENABLED': True,
                'URLLENGTH_LIMIT': 100,
                'DOWNLOAD_DELAY': 1
            }
        elif mode == 'crawl':
            # something else
            pass
        super(Spider, self).__init__(*args, **kwargs)
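For comparison, a minimal sketch of a spider that downloads only its start URLs, assuming a plain scrapy.Spider is acceptable instead of CrawlSpider (a plain Spider never extracts or follows links on its own; the class name and the yielded item below are just placeholders):
import scrapy

class StartUrlsOnlySpider(scrapy.Spider):
    # hypothetical spider: only the URLs in start_urls are requested
    name = 'start_urls_only'
    start_urls = ['https://www.example.com/']
    custom_settings = {
        'COMPRESSION_ENABLED': True,
        'URLLENGTH_LIMIT': 100,
        'DOWNLOAD_DELAY': 1,
    }

    def parse(self, response):
        # handle each start URL here; no links are followed unless you yield Requests yourself
        yield {'url': response.url, 'status': response.status}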

Why am I seeing "no connection adapters found" when trying to use results as a variable, but not while trying to print?

I hope I'm asking this the right way; I'm just confused about what's going on. I have my working script below. I'm trying to take the URLs from a spreadsheet rather than copy and paste them in, basically creating urlsA from column N of the connected sheet.
I've tested it out: I can print urlsA to the terminal no problem, so I know the Sheet connection is working. I just can't seem to use the URLs when I run the full script. I'm getting the "No connection adapters were found" error shown with the broken code below.
Working code (before pulling links from Google Sheet):
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv

profilesA = []
urlsA = ['https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=2&stats_player_seq=-100',
         'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=6&stats_player_seq=-100',
         'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=7&stats_player_seq=-100',
         'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=17&stats_player_seq=-100',
         'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=23&stats_player_seq=-100']

for urlA in urlsA:
    req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
    time.sleep(5)
    soup = BeautifulSoup(req.text, 'html.parser')
    for profileA in soup.select('.smtext > a[href^="/contests/"]'):
        profileA = 'https://stats.ncaa.org' + profileA.get('href')
        profilesA.append(profileA)

profilesB = []
urlsB = profilesA

for urlB in urlsB:
    req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(req.text, 'html.parser')
    for profileB in soup.select('a[href^="/game/play_by_play/"]'):
        profileB = 'https://stats.ncaa.org' + profileB.get('href')
        profilesB.append(profileB)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

urls = profilesB

s = requests.Session()
s.headers.update(headers)

for url in urls:
    gameId = url.split('/')[-1]
    r = s.get(url)
    dfs = pd.read_html(r.text)
    for df in dfs:
        if len(df.columns) > 2:
            if df.iloc[0, 2] == 'Score':
                df[4] = df[3]
                df[[2, 3]] = df[2].str.split('-', expand=True)
                df.to_csv('2022test.csv', mode='a', index=False)
Broken code: "No connection adapters were found" error:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
from unittest import skip
import json
import gspread

gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1cEQlPB_ykJrucnbGgKhlKj49RdLNAzeO6fiO2gkQeNU')
wk = sh.worksheet("Team Select")

profilesA = []
ShUrls = wk.batch_get(('N3:N',))[0]
urlsA = ShUrls

for urlA in urlsA:
    req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
    time.sleep(5)
    soup = BeautifulSoup(req.text, 'html.parser')
    for profileA in soup.select('.smtext > a[href^="/contests/"]'):
        profileA = 'https://stats.ncaa.org' + profileA.get('href')
        profilesA.append(profileA)

profilesB = []
urlsB = profilesA

for urlB in urlsB:
    req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(req.text, 'html.parser')
    for profileB in soup.select('a[href^="/game/play_by_play/"]'):
        profileB = 'https://stats.ncaa.org' + profileB.get('href')
        profilesB.append(profileB)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

urls = profilesB

s = requests.Session()
s.headers.update(headers)

for url in urls:
    gameId = url.split('/')[-1]
    r = s.get(url)
    dfs = pd.read_html(r.text)
    for df in dfs:
        if len(df.columns) > 2:
            if df.iloc[0, 2] == 'Score':
                df[4] = df[3]
                df[[2, 3]] = df[2].str.split('-', expand=True)
                df.to_csv('2022test.csv', mode='a', index=False)
I'd inspect this line:
ShUrls = wk.batch_get(('N3:N',))[0]
You might be pulling a list of lists, which is why this line breaks
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
with the No connection adapters were found error: a list is not a valid URL.
I needed to flatten urlsA after seeing it was an array of arrays. Adding this helper and then calling flatten fixed the issue:
def flatten(l):
    fl = []
    for sublist in l:
        for item in sublist:
            fl.append(item)
    return fl
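Applied to the values coming back from the sheet, that looks something like this (a small sketch reusing the names above):
ShUrls = wk.batch_get(('N3:N',))[0]  # one single-cell list per row, i.e. a list of lists
urlsA = flatten(ShUrls)              # flat list of URL strings that requests.get() accepts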

python requests.request "GET" returns different result

I am sending a GET request to a website using requests.request in Python. When I set a User-Agent in the headers, I only get script tags in the response. When I do not set a User-Agent, I get all the tags except script. What is the problem? Any idea?
Code with only script tags:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
URL = 'https://www.gate-away.com/properties/lazio/rome/rome/id/567087'
response = requests.request('GET', URL, headers=headers).text
soup = BeautifulSoup(response, 'html.parser')
print(len(soup.findAll('script', {'type': "text/javascript"})))
print(len(soup.findAll('div')))
Output is:
15 (a non-zero number)
0
Code with all tags but script:
headers = {}
URL = 'https://www.gate-away.com/properties/lazio/rome/rome/id/567087'
response = requests.request('GET', URL, headers=headers).text
soup = BeautifulSoup(response, 'html.parser')
print(len(soup.findAll('script', {'type': "text/javascript"})))
print(len(soup.findAll('div')))
Output is:
0
100 (a non-zero number)
import re
import requests
from bs4 import BeautifulSoup
import json
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
url = "https://www.gate-away.com/properties/lazio/rome/rome/id/567087"
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
data_script = soup.find('script', string=re.compile("var preloadedData = "))
actual_data = json.loads(data_script.contents[0].split('var preloadedData = ')[1].rsplit(';', 1)[0])
print(actual_data)
This will return a dict with pretty much all the data in question. With the browser User-Agent the site appears to serve a JavaScript-rendered page whose content is embedded as JSON in a script tag (the var preloadedData blob), which is why that response contains only script tags; parsing the blob directly gives you the data without needing the rendered div elements.

requests_html infinite scrolling on a div instead of the entire page

Hello, I am trying to get all the links from the web page below. The page loads new products as you scroll down, and I am trying to collect the links for all the products by scrolling to the bottom of the page. I am using the scrolldown method of requests_html, following this post, but it only fetches the links of the products that are visible without scrolling. The problem is that it scrolls the complete page instead of the products frame; the products are loaded only when you scroll to the bottom of the products frame itself.
I also tried selenium-wire (see the code below), but it does the same thing: it scrolls to the bottom of the page, where no products are loaded. How can I scroll only the products div?
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from seleniumwire import webdriver

baseurl = "https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002"

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3729.169 Safari/537.36 '
}

driver = webdriver.Chrome(executable_path="/src/resources/chromedriver")
driver.implicitly_wait(30)

product_links = []

try:
    SCROLL_PAUSE_TIME = 2

    def interceptor(request):
        del request.headers['Referer']  # Delete the header first
        request.headers['Referer'] = header

    # Set the interceptor on the driver
    driver.request_interceptor = interceptor

    # All requests will now use 'some_referer' for the referer
    driver.get(baseurl)

    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # r = requests.get(driver.page_source, headers=header)
    print(driver.page_source)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # product_list = soup.find_all('div', class_='col-item productInfoDiv ')
    #
    # for itemprop in product_list:
    #     for link in itemprop.find_all('a', href=True):
    #         product_links.append("{}{}".format(baseurl, link['href']))
    #
    # product_links_uniq = set(product_links)
    #
    # print(product_links_uniq)
finally:
    driver.quit()
from requests_html import HTML, HTMLSession
baseurl = "https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002"
session = HTMLSession()
page = session.get(baseurl)
page.html.render(scrolldown=50, sleep=3)
html = HTML(html=page.text)
#noticeName = html.find('a href')
all_links = html.links
for ln in all_links:
    print(ln)
print(len(all_links))
filtered_links = [link for link in all_links if link.startswith("/product")]
print(len(filtered_links))
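On the question of scrolling only the products div: Selenium can scroll a specific element instead of the window by setting its scrollTop through execute_script. Here is a minimal sketch, assuming you inspect the page to find the real scrollable container (the '.productsContainer' selector below is only a placeholder):
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get("https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002")

# '.productsContainer' is a placeholder; use the actual scrollable div's selector
container = driver.find_element(By.CSS_SELECTOR, ".productsContainer")

last_height = driver.execute_script("return arguments[0].scrollHeight", container)
while True:
    # scroll the container element itself, not the window
    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", container)
    time.sleep(2)
    new_height = driver.execute_script("return arguments[0].scrollHeight", container)
    if new_height == last_height:
        break
    last_height = new_height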
You could just mimic the POST requests the page does and keep requesting batches of 20 results, extracting the links, until you have gathered the total specified number of results.
import requests
import math
from bs4 import BeautifulSoup as bs

def add_product_links(soup):
    product_links.extend(['https://www.medplusmart.com' + i['href']
                          for i in soup.select('.productInfoDiv > div:nth-child(1) > [href^=\/product]')])
    return

product_links = []
n = 0
results_per_page = 20
page = 1

data = {
    'sortField': '',
    'startIndex': n,
    'productCategoryId': 'MART_20002',
    'startPrice': '',
    'endPrice': '',
    'minPrice': '0',
    'maxPrice': '2650',
    'excludeNoStock': 'N',
    'pCatName': 'personal-care_10102',
    'catName': 'skin-care_20002',
    'productIdString': '',
    'Brand Search': ''
}

with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}
    r = s.get(
        'https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002')
    soup = bs(r.content, 'lxml')
    data['productIdString'] = soup.select_one('#productIdString')['value']
    num_results = int(soup.select_one('#totalProductFound')['value'])
    num_pages = math.ceil(num_results / results_per_page)
    add_product_links(soup)
    s.headers.update({'x-kl-ajax-request': 'Ajax_Request'})

    while True:
        if page > num_pages:
            break
        data['startIndex'] = n
        r = s.post('https://www.medplusmart.com/loadMoreProduct.mart', data=data)
        soup = bs(r.content, 'lxml')
        add_product_links(soup)
        n += results_per_page
        page += 1

print(len(product_links))

I can't get text from a span element in bs4

When I try to get the text, I get output like this:
price = item.find('span').text
AttributeError: 'NoneType' object has no attribute 'text'
code:
#___IMPORTS_____
from datetime import date
import calendar
import requests
from bs4 import BeautifulSoup
#_______________
url = 'https://www.investing.com/currencies/eur-usd'
page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'})
#print(f'Status code is: {page.status_code}')
soup = BeautifulSoup(page.text, 'html.parser')
table = soup.find_all('div', class_='first inlineblock')[0]
for item in table:
    price = item.find('span').text
    print(price)
Try:
#___IMPORTS_____
from datetime import date
import calendar
import requests
from bs4 import BeautifulSoup
#_______________
url= 'https://www.investing.com/currencies/eur-usd'
page = requests.get(url, headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'})
#print(f'Status code is: {page.status_code}')
soup = BeautifulSoup(page.text, 'html.parser')
table = soup.find_all('div', class_='first inlineblock')
for item in table:
    price = item.find('span', class_='float_lang_base_2')
    print(price.text)
1.1753
1.1752
- 0.4
Or if you require the field:
for item in table:
    field = item.find('span', class_='float_lang_base_1')
    price = item.find('span', class_='float_lang_base_2')
    print(field.text, ':', price.text)
Prev. Close : 1.1753
Open : 1.1752
1-Year Change : - 0.4