Need screenshot from URL using chromedriver and selenium - selenium

I am able to take a screenshot of a website, but I need to capture a specific region of the page, the way a snipping tool does, based on screen size or pixel coordinates, or to scroll up and down to capture other parts of the page. Please suggest a way to get a picture from the URL below:
https://artsandculture.google.com/asset/FgEEOnrrqsn9OA
I am using below code in AWS Lambda:
import json
#coding=utf-8
import time
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
import logging
import os

class WebDriver(object):
    def __init__(self):
        self.options = Options()
        self.options.binary_location = '/opt/headless-chromium'
        #self.options.add_argument('--headless')
        self.options.add_argument('--no-sandbox')
        #self.options.add_argument('--start-maximized')
        #self.options.add_argument('--start-fullscreen')
        self.options.add_argument('--single-process')
        self.options.add_argument('--disable-dev-shm-usage')
        self.options.add_argument("--kiosk")

    def get(self):
        driver = Chrome('/opt/chromedriver', options=self.options)
        return driver

def lambda_handler(event, context):
    instance_ = WebDriver()
    driver = instance_.get()
    options = Options()
    a = os.listdir('/tmp')
    for x in a:
        print(x)
    URL = os.environ.get("URL")
    driver.get(URL)
    S = lambda X: driver.execute_script('return document.body.parentNode.scroll' + X)
    #driver.set_window_size(S('Width'), S('Height'))
    #driver.set_window_size(1280, 720)
    # May need manual adjustment
    driver.find_element_by_tag_name('body').screenshot('/tmp/job_status.png')
    driver.quit()

I am using an XPath to get a particular div by its class:
driver.find_element_by_xpath('/html/body/div/div[1]/section[2]/div[1]/div/div[2]').screenshot('/tmp/daily_job_status.png')
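
For what it's worth, a minimal sketch of the approach suggested by the commented-out lines above: resize the headless window to the document's scroll height before taking the element screenshot, so the full page (or a scrolled region) is visible. The width, output paths, and scroll offset below are assumptions, not values from the page above.

# Sketch: size the headless window to the full document height, then screenshot.
# The 1280 width and the /tmp paths are assumptions; adjust for your setup.
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')
driver = Chrome('/opt/chromedriver', options=options)
driver.get('https://artsandculture.google.com/asset/FgEEOnrrqsn9OA')

# Grow the window to the page's scroll height so nothing is cut off.
height = driver.execute_script('return document.body.parentNode.scrollHeight')
driver.set_window_size(1280, height)

# Screenshot the whole body, or a single element located by XPath.
driver.find_element_by_tag_name('body').screenshot('/tmp/full_page.png')

# To capture a lower region instead, scroll first, e.g.:
# driver.execute_script('window.scrollTo(0, 600)')
driver.quit()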

Related

How to attach screenshot in pytest-html report with coftest.py?

I want to attach a screenshot to my HTML report, but I haven't found any good resource on how to use the conftest.py file. I created the coftest.py file inside the pytest folder with the following code:
import pytest

@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
    pytest_html = item.config.pluginmanager.getplugin("html")
    outcome = yield
    report = outcome.get_result()
    extra = getattr(report, "extra", [])
    image = "D:/Selenium/Insights/2022-11-02_00-13-18/error_page.png"
    if report.when == "call":
        # always add url to report
        extra.append(pytest_html.extras.url("http://www.example.com/"))
        extra.append(pytest_html.extra.image(image))
        xfail = hasattr(report, "wasxfail")
        if (report.skipped and xfail) or (report.failed and not xfail):
            # only add additional html on failure
            # extra.append(pytest_html.extras.html("<div>Additional HTML</div>"))
            extra.append(pytest_html.extra.image(image))
        report.extra = extra
And my test.py file is:
import time
from os import getenv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from dotenv import load_dotenv
from Login_actions import Login_activities
from Insights_actions import Insights_activities
from Locators import Locators
import pytest, os
from datetime import datetime

class Test_Insights():

    @pytest.fixture
    def test_setup(self):
        # make new directory for downloads
        new_dir = r"D:\Selenium\Insights\{timestamp}".format(timestamp=datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
        # print(new_dir)
        if not os.path.exists(new_dir):
            os.makedirs(new_dir)
        self.saved_dir = new_dir
        prefs = {"download.default_directory": new_dir, "download.directory_upgrade": True, "download.prompt_for_download": False}

        # initiating chrome browser instance
        options = Options()
        options.add_argument('--start-maximized')
        # options.add_argument('--headless')
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # load credentials
        load_dotenv()
        self.username = getenv("TOP_USERNAME")
        self.password = getenv("TOP_PWD")

        # exiting ceremonies
        yield
        self.driver.close()
        self.driver.quit()
        print("Test executed")

    def test_check_in(self, test_setup):
        driver = self.driver
        # login_url="https://tilt-sso.preprod.crto.in/"  separate login page
        # url="https://tilt-orange360.preprod.crto.in/insights/home"
        url = "https://tilt-sso.preprod.crto.in/auth?code=5515f8b0-4b64-4da4-b506-e6a6a3f81b23&scope=cn%20dn%20mail%20uid%20umsId&state=eyJyZWRpcmVjdF91cmkiOiJcL2hvbWUiLCJub25jZSI6IktaTFBxczU5T3lQUWJaRUp0OFhBQWZvZDNueDhPaENDbGlJWVRqZ08ifQ%3D%3D"
        driver.get(url)
        try:
            welcome_text = driver.find_element(by=By.XPATH, value="//div[contains(text(),'Criteo')]")
            assert welcome_text
            login_actions = Login_activities(driver)
            login_actions.enter_username(test_setup.username)
            login_actions.enter_password(test_setup.password)
            login_actions.login()
            page_load_wait = WebDriverWait(driver, timeout=30).until(
                EC.url_to_be("https://tilt-orange360.preprod.crto.in/insights/home"))
            if (page_load_wait):
                WebDriverWait(driver, timeout=20).until(
                    EC.visibility_of_element_located((By.XPATH, Locators.welcome_text)))
                WebDriverWait(driver, timeout=20).until(EC.element_to_be_clickable((By.XPATH, Locators.run_insight)))
                insights_actions = Insights_activities(driver)
                insights_actions.insights_search("Check-In")
                insights_actions.search_partners("BOOKINGIT")
                insights_actions.smart_date_30days()
                insights_actions.submit_insights()
                WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, Locators.success_mesg)))
                # submit_verify = driver.find_element(by=By.XPATH, value=Locators.success_mesg)
                # assert(submit_verify)
                print("Submission successful")
                insights_actions.download_file()
                time.sleep(20)
                print(self.saved_dir)
                arr = []
                arr += [file for file in os.listdir(self.saved_dir) if file.endswith('.pptx')]
                print("File in the directory: " + arr[0])
                while not arr:
                    time.sleep(5)
                if arr:
                    print("Insights completed. File downloaded successfully")
                else:
                    print("File not available")
                    raise NoSuchElementException
        except:
            if driver.find_element(by=By.XPATH, value=Locators.error_page):
                driver.get_screenshot_as_file('{dir}/error_page.png'.format(dir=self.saved_dir))
                print("500 Internal server error")
                Error_page = driver.current_url
                print("The error page: " + Error_page)
                raise NoSuchElementException
I do not know why it is not working. The documentation (https://pytest-html.readthedocs.io/en/latest/user_guide.html#enhancing-reports) does not have much information. I really need help here, please.
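
For comparison, a minimal sketch of the hook as it appears in the pytest-html 3.x user guide; note that the attachment helpers live on pytest_html.extras (plural). The screenshot path here is an assumption, not taken from the report above.

# conftest.py -- sketch based on the pytest-html 3.x docs; the image path is an assumption
import pytest

@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
    pytest_html = item.config.pluginmanager.getplugin("html")
    outcome = yield
    report = outcome.get_result()
    extra = getattr(report, "extra", [])
    if report.when == "call":
        xfail = hasattr(report, "wasxfail")
        if (report.skipped and xfail) or (report.failed and not xfail):
            # attach the screenshot only when the test failed (or xfailed unexpectedly)
            extra.append(pytest_html.extras.image("D:/Selenium/Insights/error_page.png"))
        report.extra = extra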

Scrapy Selenium: Why pagination is not working for scrapy-selenium?

I am trying to get data using scrapy-selenium, but there is some issue with the pagination. I have tried my best with different selectors and methods, but nothing changes; it only scrapes the first page. I have also checked other solutions, but I am still unable to make it work. Looking forward to experts' advice.
Source: https://www.gumtree.com/property-for-sale/london
import scrapy
from urllib.parse import urljoin
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

class Basic2Spider(scrapy.Spider):
    name = 'basic2'

    def start_requests(self):
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        driver.set_window_size(1920, 1080)
        driver.get("https://www.gumtree.com/property-for-sale/london")
        time.sleep(2)
        property_xpath = driver.find_elements(By.XPATH, "(//article[@class='listing-maxi']/a)[position()>=2 and position()<30]")
        for detail in property_xpath:
            href = detail.get_attribute('href')
            time.sleep(2)
            yield SeleniumRequest(
                url=href,
            )
        driver.quit()
        return super().start_requests()

    def parse(self, response):
        yield {
            'Title': response.xpath("//div[@class='css-w50tn5 e1pt9h6u11']/h1/text()").get(),
            'Price': response.xpath("//h3[@itemprop='price']/text()").get(),
            'Add Posted': response.xpath("//*[@id='content']/div[1]/div/main/div[5]/section/div[1]/dl[1]/dd/text()").get(),
            'Links': response.url
        }
        next_page = response.xpath("//li[@class='pagination-currentpage']/following-sibling::li[1]/a/text()").get()
        if next_page:
            abs_url = f'https://www.gumtree.com/property-for-sale/london/page{next_page}'
            yield SeleniumRequest(
                url=abs_url,
                wait_time=5,
                callback=self.parse
            )
Your code seems to be correct, but you are being blocked at the TCP/IP level. I also tried an alternative approach where the code is correct and pagination works; this style of pagination is about twice as fast as the others, but it sometimes gives me strange results and sometimes still gets an IP block.
import scrapy
from scrapy import Selector
from scrapy_selenium import SeleniumRequest

class Basic2Spider(scrapy.Spider):
    name = 'basic2'
    responses = []

    def start_requests(self):
        url = 'https://www.gumtree.com/property-for-sale/london/page{page}'
        for page in range(1, 6):
            print(page)
            yield SeleniumRequest(
                url=url.format(page=page),
                callback=self.parse,
                wait_time=5
            )

    def parse(self, response):
        driver = response.meta['driver']
        intial_page = driver.page_source
        self.responses.append(intial_page)
        for resp in self.responses:
            r = Selector(text=resp)
            property_xpath = r.xpath("(//article[@class='listing-maxi']/a)[position()>=2 and position()<30]")
            for detail in property_xpath:
                yield {
                    'Title': detail.xpath('.//*[@class="listing-title"]/text()').get().strip(),
                    'Price': detail.xpath('.//*[@class="listing-price"]/strong/text()').get(),
                    'Add Posted': detail.xpath('.//*[@class="listing-posted-date txt-sub"]/span//text()').getall()[2].strip(),
                    'Links': response.url
                }
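
As an aside, both versions rely on the scrapy-selenium downloader middleware being enabled, otherwise SeleniumRequest falls back to a plain request. A minimal settings.py sketch following the scrapy-selenium README; the driver path and arguments are assumptions:

# settings.py -- sketch of the scrapy-selenium setup from its README; paths are assumptions
from shutil import which

SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')  # assumes chromedriver is on PATH
SELENIUM_DRIVER_ARGUMENTS = ['--headless']

DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800,
}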

Scrapy sends multiple Documents to Elastic

We use Scrapy to crawl a website where you need to be logged in.
There is one website with different pages to crawl, so we have, for example, 3 different spiders and only need one login.
We therefore tried to use one driver for all spiders and to run the spiders sequentially:
#...
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from scrapy.utils.project import get_project_settings
#...

class LoginAndCrawl():
    login_url = "https://example.com"
    retry_count = 0
    max_retries = 10
    webdriver_timeout = 30
    crawler_delay = 1

    def __init__(self):
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        self.prerender()
        configure_logging()
        self.runner = CrawlerRunner(get_project_settings())
        self.crawl()
        reactor.run()
        self.driver.close()

    @defer.inlineCallbacks
    def crawl(self):
        yield self.runner.crawl(MySpider1, driver=self.driver)
        yield self.runner.crawl(MySpider2, driver=self.driver)
        yield self.runner.crawl(MySpider3, driver=self.driver)
        reactor.stop()

    def prerender(self):
        try:
            self.log_in()
        except Exception as e:
            self.retry_count += 1
            if self.retry_count > self.max_retries:
                self.driver.close()
                self.driver = None
            else:
                self.prerender()

    def log_in(self):
        #... login code

helper = LoginAndCrawl()

class MySpider1(AbstractSpider):
    def __init__(self, driver=None, **kwargs):
        if not driver:
            raise Exception("no driver")
        super().__init__(**kwargs)
        self.driver = driver
    # do some stuff

class MySpider2(MySpider1):
    def __init__(self, driver=None, **kwargs):
        if not driver:
            raise Exception("no driver")
        super().__init__(driver, **kwargs)
    # do some stuff

class MySpider3(MySpider1):
    def __init__(self, driver=None, **kwargs):
        if not driver:
            raise Exception("no driver")
        super().__init__(driver, **kwargs)
    # do some stuff
If we run just one spider, everything is fine. But if we run more than one, the crawled documents are stored multiple times in our Elasticsearch index: every document from MySpider1 is stored 3 times, every document from MySpider2 twice, and every document from MySpider3 once.
We checked whether any duplicates are created in our pipeline before they are passed to Elasticsearch, but no duplicates are passed from our side.
Our impression is that the Elasticsearch pipeline somehow keeps the documents from each spider and then saves them again for every subsequent spider.
Is there any known issue with this implementation?
Can someone confirm this wrong behavior?
Is there any way to fix this problem?
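
For reference, the code above wraps the pattern from the Scrapy docs for running multiple spiders sequentially in one process. A minimal sketch of that documented pattern, using the spider names from the question (the elided imports here are the ones that pattern typically needs):

# Sketch of Scrapy's documented sequential-crawl pattern, for comparison.
# MySpider1/2/3 are the spiders from the question; settings come from the project.
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerRunner(get_project_settings())

@defer.inlineCallbacks
def crawl():
    yield runner.crawl(MySpider1)
    yield runner.crawl(MySpider2)
    yield runner.crawl(MySpider3)
    reactor.stop()

crawl()
reactor.run()  # blocks here until all crawls are finished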

My selenium web scraper doesn't make sense to me

I want to get the Bitcoin price using the following code. I have no clue why the output behaves the way it does: it appears to store certain values and output them in between accurate values. Bonus task: make the old values disappear in tkinter.
from bs4 import BeautifulSoup  # Downloading pertinent Python packages
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import tkinter as tk
import time

time = 1000

def bitcoinTracker():
    options = Options()
    options.headless = True
    options.add_argument("--window-size=1920,1200")
    chromedriver = "/Users/Philipp/PhytonWebScrap/selenium_project/chromedriver"  # Setting up Chrome driver
    driver = webdriver.Chrome(options=options, executable_path=chromedriver)
    driver.get("https://coinmarketcap.com/currencies/bitcoin/")
    hunt = driver.find_element_by_class_name("priceValue___11gHJ").text
    return(hunt)
    driver.quit()

def collector():
    label = tk.Label(text="Bitcoin " + bitcoinTracker(), font="Arial 18")
    label.pack()
    root.after(time, collector)

root = tk.Tk()
root.after(time, collector)
root.mainloop()
I tried again, this time with Selenium only, without tkinter:
from bs4 import BeautifulSoup  # Downloading pertinent Python packages
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

options = Options()
options.headless = True
options.add_argument("--window-size=1920,1200")

while True:
    chromedriver = "/Users/Philipp/PhytonWebScrap/selenium_project/chromedriver"  # Setting up Chrome driver
    driver = webdriver.Chrome(options=options, executable_path=chromedriver)
    driver.get("https://coinmarketcap.com/currencies/bitcoin/")
    hunt = driver.find_element_by_class_name("priceValue___11gHJ").text
    #print(driver.page_source)
    time.sleep(20)
    print(hunt)
    driver.quit()
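
On the bonus task (making old values disappear), a common tkinter pattern is to create the label once and update its text with config() instead of packing a new label on every cycle. A minimal sketch, reusing the bitcoinTracker() function from above; the refresh interval is an assumption:

# Sketch: update one label in place rather than stacking new ones.
# Assumes bitcoinTracker() from the first snippet returns the current price string.
import tkinter as tk

REFRESH_MS = 1000  # refresh interval in milliseconds (assumption)

root = tk.Tk()
label = tk.Label(root, text="Bitcoin ...", font="Arial 18")
label.pack()

def collector():
    label.config(text="Bitcoin " + bitcoinTracker())  # overwrite the old value
    root.after(REFRESH_MS, collector)

root.after(REFRESH_MS, collector)
root.mainloop()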

Python Selenium: Global driver - 'driver' is not defined in the global scope

The code itself works, but I have the problem that the global driver is reported as undefined, and only in VS Code. When I run the code in PyCharm, the problem does not exist. Unfortunately, I really do not know how to continue.
The issue: 'driver' is not defined in the global scope.
I am using Python 3.7.2 with pytest.
from selenium import webdriver
import pytest
from selenium.webdriver.common.keys import Keys

def test_setup():
    global driver
    driver = webdriver.Chrome(executable_path="e:/Webdriver/chromedriver.exe")
    driver.implicitly_wait(10)
    driver.maximize_window()

def test_login():
    driver.get("http://www.dev-crowd.com/wp-login.php")
    driver.find_element_by_id("user_login").send_keys("abc")
    driver.find_element_by_id("user_pass").send_keys("cab")
    driver.find_element_by_id("wp-submit").click()
    x = driver.title("abc")
    assert X == "abc"

def test_teardown():
    driver.close()
    driver.quit()
    print("Test completed")
The following should work, but I think it should not be necessary:
from selenium import webdriver
import pytest
from selenium.webdriver.common.keys import Keys

driver = None

def test_setup():
    driver = webdriver.Chrome(executable_path="e:/Webdriver/chromedriver.exe")
    driver.implicitly_wait(10)
    driver.maximize_window()

def test_login():
    driver.get("http://www.dev-crowd.com/wp-login.php")
    driver.find_element_by_id("user_login").send_keys("abc")
    driver.find_element_by_id("user_pass").send_keys("cab")
    driver.find_element_by_id("wp-submit").click()
    x = driver.title("abc")
    assert x == "abc"

def test_teardown():
    driver.close()
    driver.quit()
    print("Test completed")