Scrapy-Selenium: Unable to get data - selenium

I am trying to get some data using Scrapy-Selenium. The script is working fine except for the "URL", "Name of the share" and "Code of the share" fields. It is providing empty values for these fields. Though the "URL" field is containing a URL link, but it is not the full URL.
What am I actually missing here?
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from datetime import datetime
from scrapy import selector
from scrapy.selector import Selector
from markets_selenium.settings import *
class PriceSelSpider(scrapy.Spider):
name = 'price_sel'
def start_requests(self):
yield SeleniumRequest(
url="https://markets.ft.com",
wait_time=10,
callback=self.parse
)
def __init__(self):
chrome_options = Options()
chrome_options.add_argument('__headless')
chrome_path = SELENIUM_DRIVER_EXECUTABLE_PATH
driver = webdriver.Chrome(executable_path=chrome_path, options=chrome_options)
driver.set_window_size(1920, 1080)
driver.get('https://markets.ft.com/data/funds/tearsheet/historical?s=GB00B4NXY349:GBP')
time.sleep(2)
# self.html = driver.page_source
driver.quit()
def parse(self, response):
# resp = Selector(text=self.html)
now = datetime.now()
dt_string = now.strftime("%Y/%m/%d %H:%M:%S")
infos = response.xpath("//div[#class='mod-ui-table--freeze-pane__scroll-container']/table/tbody/tr")
for info in infos:
yield {
'Time': dt_string,
'URL': response.url,
'Name of the share': response.xpath(".//h1[#class='mod-tearsheet-overview__header__name mod-tearsheet-overview__header__name--large']/text()").get(),
'Code of the share': response.xpath(".//div[#class='mod-tearsheet-overview__header__symbol']/span/text()").get(),
'Date of share price ': info.xpath(".//td/span[1]/text()").get(),
'Opening price': info.xpath(".//td[2]/text()").get(),
'Highest price': info.xpath(".//td[3]/text()").get(),
'Lowest price': info.xpath(".//td[4]/text()").get(),
'Closing price': info.xpath(".//td[5]/text()").get(),
'Volume': info.xpath("(.//td/span)[3]/text()").get()
}
Output
{'Time': '2021/11/30 01:47:55', 'URL': 'https://markets.ft.com/data', 'Name of the share': None, 'Code of the share': None, 'Date of share price ': 'Friday, November 26, 2021', 'Opening price': '178.73', 'Highest price': '178.73', 'Lowest price': '178.73', 'Closing price': '178.73', 'Volume': '--'}
2021-11-30 01:47:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://markets.ft.com/data>

Related

How to attach screenshot in pytest-html report with coftest.py?

I want to attach a screenshot to my HTML report but I haven't found any good resource on how to use the conftest.py file. I created the coftest.py file inside the pytest folder with the following code:
import pytest
#pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
pytest_html = item.config.pluginmanager.getplugin("html")
outcome = yield
report = outcome.get_result()
extra = getattr(report, "extra", [])
image="D:/Selenium/Insights/2022-11-02_00-13-18/error_page.png"
if report.when == "call":
# always add url to report
extra.append(pytest_html.extras.url("http://www.example.com/"))
extra.append(pytest_html.extra.image(image))
xfail = hasattr(report, "wasxfail")
if (report.skipped and xfail) or (report.failed and not xfail):
# only add additional html on failure
# extra.append(pytest_html.extras.html("<div>Additional HTML</div>"))
extra.append(pytest_html.extra.image(image))
report.extra = extra
And my test.py file is:
import time
from os import getenv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from dotenv import load_dotenv
from Login_actions import Login_activities
from Insights_actions import Insights_activities
from Locators import Locators
import pytest, os
from datetime import datetime
class Test_Insights():
#pytest.fixture
def test_setup(self):
#make new directory for downloads
new_dir = r"D:\Selenium\Insights\{timestamp}".format(timestamp=datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
# print(new_dir)
if not os.path.exists(new_dir):
os.makedirs(new_dir)
self.saved_dir=new_dir
prefs = {"download.default_directory": new_dir, "download.directory_upgrade": True, "download.prompt_for_download": False}
#intiating chrome browser instance
options=Options()
options.add_argument('--start-maximized')
# options.add_argument('--headless')
options.add_experimental_option("prefs", prefs)
self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=options)
#load credentials
load_dotenv()
self.username = getenv("TOP_USERNAME")
self.password = getenv("TOP_PWD")
#exiting ceremonies
yield
self.driver.close()
self.driver.quit()
print("Test executed")
def test_check_in(self, test_setup):
driver=self.driver
# login_url="https://tilt-sso.preprod.crto.in/" separate login page
# url="https://tilt-orange360.preprod.crto.in/insights/home"
url="https://tilt-sso.preprod.crto.in/auth?code=5515f8b0-4b64-4da4-b506-e6a6a3f81b23&scope=cn%20dn%20mail%20uid%20umsId&state=eyJyZWRpcmVjdF91cmkiOiJcL2hvbWUiLCJub25jZSI6IktaTFBxczU5T3lQUWJaRUp0OFhBQWZvZDNueDhPaENDbGlJWVRqZ08ifQ%3D%3D"
driver.get(url)
try:
welcome_text = driver.find_element(by=By.XPATH, value="//div[contains(text(),'Criteo')]")
assert welcome_text
login_actions = Login_activities(driver)
login_actions.enter_username(test_setup.username)
login_actions.enter_password(test_setup.password)
login_actions.login()
page_load_wait = WebDriverWait(driver, timeout=30).until(
EC.url_to_be("https://tilt-orange360.preprod.crto.in/insights/home"))
if (page_load_wait):
WebDriverWait(driver, timeout=20).until(
EC.visibility_of_element_located((By.XPATH, Locators.welcome_text)))
WebDriverWait(driver, timeout=20).until(EC.element_to_be_clickable((By.XPATH, Locators.run_insight)))
insights_actions = Insights_activities(driver)
insights_actions.insights_search("Check-In")
insights_actions.search_partners("BOOKINGIT")
insights_actions.smart_date_30days()
insights_actions.submit_insights()
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, Locators.success_mesg)))
# submit_verify = driver.find_element(by=By.XPATH, value=Locators.success_mesg)
# assert(submit_verify)
print("Submission successful")
insights_actions.download_file()
time.sleep(20)
print(self.saved_dir)
arr=[]
arr+=[file for file in os.listdir(self.saved_dir) if file.endswith('.pptx')]
print("File in the directory: " + arr[0])
while not arr:
time.sleep(5)
if arr:
print("Insights completed. File downloaded successfully")
else:
print("File not available")
raise NoSuchElementException
except:
if driver.find_element(by=By.XPATH,value=Locators.error_page):
driver.get_screenshot_as_file('{dir}/error_page.png'.format(dir=self.saved_dir))
print("500 Internal server error")
Error_page=driver.current_url
print("The error page: "+Error_page)
raise NoSuchElementException
I do not know why is it not working. The document: https://pytest-html.readthedocs.io/en/latest/user_guide.html#enhancing-reports does not have much information. I really need help here, please.

Scrapy Selenium: Why pagination is not working for scrapy-selenium?

I am trying to get data using scrapy-selenium but there is some issue with the pagination. I have tried my level best to use different selectors and methods but nothing changes. It can only able to scrape the 1st page. I have also checked the other solutions but still, I am unable to make it work. Looking forward to experts' advice.
Source: https://www.gumtree.com/property-for-sale/london
import scrapy
from urllib.parse import urljoin
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
class Basic2Spider(scrapy.Spider):
name = 'basic2'
def start_requests(self):
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.set_window_size(1920, 1080)
driver.get("https://www.gumtree.com/property-for-sale/london")
time.sleep(2)
property_xpath = driver.find_elements(By.XPATH, "(//article[#class='listing-maxi']/a)[position()>=2 and position()<30]")
for detail in property_xpath:
href= detail.get_attribute('href')
time.sleep(2)
yield SeleniumRequest(
url = href,
)
driver.quit()
return super().start_requests()
def parse(self, response):
yield {
'Title': response.xpath("//div[#class='css-w50tn5 e1pt9h6u11']/h1/text()").get(),
'Price': response.xpath("//h3[#itemprop='price']/text()").get(),
'Add Posted': response.xpath("//*[#id='content']/div[1]/div/main/div[5]/section/div[1]/dl[1]/dd/text()").get(),
'Links': response.url
}
next_page = response.xpath("//li[#class='pagination-currentpage']/following-sibling::li[1]/a/text()").get()
if next_page:
abs_url = f'https://www.gumtree.com/property-for-sale/london/page{next_page}'
yield SeleniumRequest(
url= abs_url,
wait_time=5,
callback=self.parse
)
Your code seem to be correct but getting tcp ip block. I also tried alternative way where code is correct and pagination is working and this type of pagination is two times faster than others but gives me sometimes strange result and sometimes getting ip block.
import scrapy
from scrapy import Selector
from scrapy_selenium import SeleniumRequest
class Basic2Spider(scrapy.Spider):
name = 'basic2'
responses = []
def start_requests(self):
url='https://www.gumtree.com/property-for-sale/london/page{page}'
for page in range(1,6):
print(page)
yield SeleniumRequest(
url=url.format(page=page),
callback=self.parse,
wait_time=5
)
def parse(self, response):
driver = response.meta['driver']
intial_page = driver.page_source
self.responses.append(intial_page)
for resp in self.responses:
r = Selector(text=resp)
property_xpath = r.xpath("(//article[#class='listing-maxi']/a)[position()>=2 and position()<30]")
for detail in property_xpath:
yield {
'Title': detail.xpath('.//*[#class="listing-title"]/text()').get().strip(),
'Price': detail.xpath('.//*[#class="listing-price"]/strong/text()').get(),
'Add Posted': detail.xpath('.//*[#class="listing-posted-date txt-sub"]/span//text()').getall()[2].strip(),
'Links': response.url
}

The website it opens using the automated software closes soon after it gets executed

Error could possibly be in line 7,
it should open the browser search for the specific word then should the window should stay until closed
import unittest
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
class PythonOrgSearch(unittest.TestCase):
def setUp(self):
self.driver = webdriver.Chrome("C:/Users/daniy/AppData/Local/Programs/Python/Python37-32/Scripts/chromedriver.exe")
def test_search_in_python_org(self):
driver = self.driver
driver.get("http://www.python.org")
self.assertIn("Python", driver.title)
driver.implicitly_wait(6000)
elem = driver.find_element_by_name("q")
driver.implicitly_wait(5000)
elem.send_keys("pycon")
driver.implicitly_wait(6000)
elem.send_keys(Keys.RETURN)
driver.implicitly_wait(6000)
assert "No results found." not in driver.page_source
driver.implicitly_wait(9000)
if __name__ == "__main__":
unittest.main()
Try this out it works for me.
driver.get("http://www.python.org")
self.assertIn("Python", driver.title)
elem=WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.NAME, "q")))
elem.send_keys("pycon")
elem.send_keys(Keys.RETURN)
driver.implicitly_wait(5)
self.assertNotIn("No results found.", driver.page_source)
Import
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
You should use the right version of ChromeDrive as per your browser
refer to this link: https://chromedriver.chromium.org/downloads
Try the below code it should work
import unittest
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
class PythonOrgSearch(unittest.TestCase):
def setUp(self):
self.driver = self.driver = webdriver.Chrome("C:/Users/daniy/AppData/Local/Programs/Python/Python37-32/Scripts/chromedriver.exe")
def test_search_in_python_org(self):
driver = self.driver
driver.implicitly_wait(6000)
driver.get("http://www.python.org")
self.assertIn("Python", driver.title)
time.sleep(6000)
elem = driver.find_element_by_name("q")
elem.send_keys("pycon")
elem.send_keys(Keys.RETURN)
time.sleep(6000)
assert "No results found." not in driver.page_source
if __name__ == "__main__":
unittest.main()

How to reduce number of selenium webdriver instances being spawned by scrapy on running crawl on a spider?

On running crawl process for any spider, Scrapy tends to spawn a lot of (27 average varying between 19 - 30) Firefox instances, even if the spider being run is not using selenium.
I have tried driver.quit() inside def __del__(self) in each of the spiders using selenium. The problem still persists.
The Firefox instances stay open even after the crawling process is finished.
example spider using selenium:
import logging
import time
from os.path import abspath, dirname, join
import requests
import scrapy
import selenium
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.remote.remote_connection import LOGGER
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
LOGGER.setLevel(logging.ERROR)
PATH_DIR = dirname(abspath(__file__))
GECKODRIVER_PATH = abspath(join(PATH_DIR, "../../geckodriver"))
WAIT_TIME = 10
class ExampleSpider(sso_singapore.SsoSpider):
name = "Example"
options = Options()
options.headless = True
driver = webdriver.Firefox(options=options, executable_path=GECKODRIVER_PATH)
def __del__(self):
self.driver.quit()
def parse(self, response):
meta = response.meta
try:
self.driver.get(response.url)
body = self.driver.page_source
try:
element = WebDriverWait(self.driver, WAIT_TIME).until(
EC.presence_of_element_located(
(By.ID, '//select[#id="rows_sort"]/option[text()="All"]')
)
)
except:
pass
response = HtmlResponse(
self.driver.current_url, body=body, encoding="utf-8"
)
except Exception as e:
logging.error(str(e))
finally:
self.driver.quit()
# Create Items based on response
def start_requests(self):
for url, meta in zip(urls, meta_list):
yield scrapy.Request(url, callback=parse, meta=meta)
Any help will be much appreciated.
from scrapy import signals
class ExampleSpider(sso_singapore.SsoSpider):
def __init__(self, *args, **kwargs):
options = Options()
options.headless = True
self.driver = webdriver.Firefox(options=options, executable_path="your_path")
#classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(ExampleSpider, cls).from_crawler(crawler, *args, **kwargs)
crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
return spider
def spider_closed(self, spider):
self.driver.quit()
This should do the job.
More on Scrapy signals:
https://docs.scrapy.org/en/latest/topics/signals.html
You can also use Pipeline if you have many spiders and don't want to add the same driver.quit() logic:
class YourPipeline:
#classmethod
def from_crawler(cls, crawler):
pipeline = cls()
crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
return pipeline
def spider_closed(self, spider):
if hasattr(spider, 'driver'):
spider.driver.quit()

Selenium & Scrapy: Last URL overwrites other URLs

I am currently trying to crawl data from three websites (three different URLs). Therefore, I am using a text-file to load the different URLs into the start_url.
At the moment, there are three URLs in my file. However, the script just saves/overwrites the data of the two URLs before.
This is my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from time import sleep
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import re
import csv
class AlltipsSpider(Spider):
name = 'alltips'
allowed_domains = ['blogabet.com']
def start_requests(self):
self.driver = webdriver.Chrome('C:\webdrivers\chromedriver.exe')
with open("urls.txt", "rt") as f:
start_urls = [l.strip() for l in f.readlines()]
self.driver = webdriver.Chrome('C:\webdrivers\chromedriver.exe')
for url in start_urls:
self.driver.get(url)
self.driver.find_element_by_id('currentTab').click()
sleep(3)
self.logger.info('Sleeping for 5 sec.')
self.driver.find_element_by_xpath('//*[#id="_blog-menu"]/div[2]/div/div[2]/a[3]').click()
sleep(7)
self.logger.info('Sleeping for 7 sec.')
yield Request(self.driver.current_url, callback=self.crawltips)
def crawltips(self, response):
sel = Selector(text=self.driver.page_source)
allposts = sel.xpath('//*[#class="block media _feedPick feed-pick"]')
for post in allposts:
username = post.xpath('.//div[#class="col-sm-7 col-lg-6 no-padding"]/a/#title').extract()
publish_date = post.xpath('.//*[#class="bet-age text-muted"]/text()').extract()
yield{'Username': username,
'Publish date': publish_date
}