How can I use the driver initialized in setUpModule - Selenium

When using unittest in Python 3, I tried this:
import unittest
from selenium import webdriver

def setupModule():
    driver = webdriver.Firefox()
    driver.maximize_window()
    driver.get('https://www.google.com')

def teardownModule():
    driver.close()

class test_01(unittest.TestCase):
    def setUp(self):
        driver.xxxx
    def tearDown(self):
        driver.xxxx
    def test_0001(self):
        driver.yyyy
    def test_0002(self):
        driver.zzzz

class test_02(unittest.TestCase):
    def setUp(self):
        driver.xxxx
    def tearDown(self):
        driver.xxxx
    def test_0001(self):
        driver.yyyy
    def test_0002(self):
        driver.zzzz
The driver in the classes and in teardownModule can't be recognized. Is there any way to make it available?
I don't want to put driver = webdriver.Firefox() outside of a function, because if I have two .py files for different cases, it would open two or more Firefox windows right away, whether or not the cases in that file are going to be run, and those browsers would stay open.

I'd recommend having a base class handle the webdriver setup and teardown, e.g.:
class BaseTest(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()

    def tearDown(self):
        self.driver.quit()

class test_01(BaseTest):
    def test_0001(self):
        self.driver.xxx

class test_02(BaseTest):
    def test_0002(self):
        self.driver.xxx
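If you specifically want one browser shared by every test in the module, as the question asks, the usual pattern is a module-level global that the module fixture assigns. Note that unittest only calls these hooks if they are named exactly setUpModule and tearDownModule, and since they run once per module, a second .py file won't open its own browser until its own tests actually run. A minimal sketch (the URL and assertion are just placeholders):
import unittest
from selenium import webdriver

driver = None  # shared by every TestCase in this file

def setUpModule():
    # runs once, before any test in this file
    global driver
    driver = webdriver.Firefox()
    driver.maximize_window()

def tearDownModule():
    # runs once, after the last test in this file
    driver.quit()

class test_01(unittest.TestCase):
    def test_0001(self):
        driver.get('https://www.google.com')
        self.assertIn('Google', driver.title)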

Scrapy sends multiple Documents to Elastic

We use Scrapy to crawl a website where you need to be logged in.
It is one website with different pages to crawl, so we have, for example, 3 different spiders and only need one login.
So we tried to use one driver for all spiders, and we need to run the spiders sequentially:
#...
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from scrapy.utils.project import get_project_settings
#...

class LoginAndCrawl():
    login_url = "https://example.com"
    retry_count = 0
    max_retries = 10
    webdriver_timeout = 30
    crawler_delay = 1

    def __init__(self):
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options)
        self.prerender()
        configure_logging()
        self.runner = CrawlerRunner(get_project_settings())
        self.crawl()
        reactor.run()
        self.driver.close()

    @defer.inlineCallbacks
    def crawl(self):
        yield self.runner.crawl(MySpider1, driver=self.driver)
        yield self.runner.crawl(MySpider2, driver=self.driver)
        yield self.runner.crawl(MySpider3, driver=self.driver)
        reactor.stop()

    def prerender(self):
        try:
            self.log_in()
        except Exception as e:
            self.retry_count += 1
            if self.retry_count > self.max_retries:
                self.driver.close()
                self.driver = None
            else:
                self.prerender()

    def log_in(self):
        #... login code

helper = LoginAndCrawl()

class MySpider1(AbstractSpider):
    def __init__(self, driver=None, **kwargs):
        if not driver:
            raise Exception("no driver")
        super().__init__(**kwargs)
        self.driver = driver
        # do some stuff

class MySpider2(MySpider1):
    def __init__(self, driver=None, **kwargs):
        if not driver:
            raise Exception("no driver")
        super().__init__(driver, **kwargs)
        # do some stuff

class MySpider3(MySpider1):
    def __init__(self, driver=None, **kwargs):
        if not driver:
            raise Exception("no driver")
        super().__init__(driver, **kwargs)
        # do some stuff
If we just run one spider, everything is fine. But if we run more than one, the crawled documents are stored multiple times in our Elasticsearch index. For example, every document from MySpider1 is stored 3 times, every document from MySpider2 twice, and every document from MySpider3 once.
We tried to check whether the duplicates are produced in our pipeline before being passed to Elasticsearch, but no duplicates are passed from our side.
Our impression is that the Elasticsearch pipeline somehow keeps the documents from each spider and then saves them again for each subsequent one.
Is there any known issue with this implementation?
Can someone confirm this wrong behavior?
Is there any way to fix this problem?
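I can't confirm a known issue, but one way to narrow it down is to count what each spider actually emits and compare that with what lands in the index. A debugging sketch, not a fix; the pipeline module and the item_id field are hypothetical placeholders for whatever uniquely identifies your documents:
# pipelines.py - log how many items each spider really yields
import logging

class ItemCountPipeline:
    def open_spider(self, spider):
        self.seen = set()
        self.count = 0

    def process_item(self, item, spider):
        self.count += 1
        key = item.get('item_id')  # hypothetical unique field of your documents
        if key in self.seen:
            logging.warning("duplicate emitted by %s: %s", spider.name, key)
        self.seen.add(key)
        return item

    def close_spider(self, spider):
        logging.info("%s emitted %d items (%d unique)", spider.name, self.count, len(self.seen))
Enable it in ITEM_PIPELINES with a lower priority number than the Elasticsearch pipeline so it sees every item first; if the counts here are correct but the index still grows, the duplication happens on the storage side.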

Passing authenticated session from selenium to scrapy

I am trying to log in to a website using Selenium and then pass the authenticated session to Scrapy to extract stuff.
The issue is that after I pass the session to Scrapy, I am still not logged in.
class LoginSpider(scrapy.Spider):
    name = 'login'
    allowed_domains = ['*****']
    start_urls = ['*****']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def start_requests(self):
        # driver = webdriver.Firefox()
        self.driver.get('*****')
        time.sleep(5)
        portalButton = self.driver.find_element_by_xpath('//*[@id="fb_submit"]')
        portalButton.click()
        time.sleep(2)
        self.driver.find_element_by_xpath('//*[@id="email"]').send_keys('******')
        self.driver.find_element_by_xpath('//*[@id="password"]').send_keys('******')
        self.driver.find_element_by_xpath('//*[@id="btn-login"]').click()
        time.sleep(5)
        for cookie in self.driver.get_cookies():
            c = {cookie['name']: cookie['value']}
        yield Request(url="****", cookies=c, callback=self.parse)

    def parse(self, response):
        # self.log("->>>>>>>>>>>>")
        open_in_browser(response)
        # view(response)
        self.log("->>>>>>>>>>>>")
I would suggest changing that step a bit:
for cookie in self.driver.get_cookies():
    c = {cookie['name']: cookie['value']}
to something like this:
_cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
yield Request(url="****",cookies=_cookies,callback=self.parse)
In each iteration you re-create c with a new {cookie['name']: cookie['value']}, so only the last cookie ends up being sent.
My code example:
import time

import scrapy
from scrapy import Request
from scrapy.utils.response import open_in_browser
from selenium import webdriver
from selenium.webdriver.common.by import By

class LoginSpider(scrapy.Spider):
    name = 'login'
    start_urls = ['URL']

    def __init__(self):
        super().__init__()
        self.driver = webdriver.Chrome()

    def start_requests(self):
        self.driver.get('URL')
        time.sleep(5)
        self.driver.find_element(By.ID, 'email').send_keys('EMAIL')
        self.driver.find_element(By.ID, 'passwd').send_keys('PASSWORD')
        self.driver.find_element(By.ID, 'SubmitLogin').click()
        _cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
        yield Request(url='URL',
                      cookies=_cookies,
                      callback=self.parse)
        self.driver.quit()

    def parse(self, response, **kwargs):
        open_in_browser(response)
        self.log(response)
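One more thing worth knowing if a plain name/value dict still doesn't carry the session: Request's cookies argument also accepts a list of dicts, which lets you keep the domain and path exactly as Selenium reports them. A small sketch using the same placeholder 'URL':
# preserve domain/path instead of flattening the cookies to name: value pairs
_cookies = [
    {'name': c['name'], 'value': c['value'], 'domain': c['domain'], 'path': c['path']}
    for c in self.driver.get_cookies()
]
yield Request(url='URL', cookies=_cookies, callback=self.parse)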

How to reduce number of selenium webdriver instances being spawned by scrapy on running crawl on a spider?

On running the crawl process for any spider, Scrapy tends to spawn a lot of Firefox instances (19 to 30, about 27 on average), even if the spider being run does not use Selenium.
I have tried driver.quit() inside def __del__(self) in each of the spiders that use Selenium. The problem still persists.
The Firefox instances stay open even after the crawling process has finished.
Example spider using Selenium:
import logging
import time
from os.path import abspath, dirname, join

import requests
import scrapy
import selenium
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.remote.remote_connection import LOGGER
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

LOGGER.setLevel(logging.ERROR)

PATH_DIR = dirname(abspath(__file__))
GECKODRIVER_PATH = abspath(join(PATH_DIR, "../../geckodriver"))
WAIT_TIME = 10

class ExampleSpider(sso_singapore.SsoSpider):
    name = "Example"

    # class-level driver: this line runs as soon as the module is imported
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options, executable_path=GECKODRIVER_PATH)

    def __del__(self):
        self.driver.quit()

    def parse(self, response):
        meta = response.meta
        try:
            self.driver.get(response.url)
            body = self.driver.page_source
            try:
                element = WebDriverWait(self.driver, WAIT_TIME).until(
                    EC.presence_of_element_located(
                        (By.XPATH, '//select[@id="rows_sort"]/option[text()="All"]')
                    )
                )
            except:
                pass
            response = HtmlResponse(
                self.driver.current_url, body=body, encoding="utf-8"
            )
        except Exception as e:
            logging.error(str(e))
        finally:
            self.driver.quit()
        # Create Items based on response

    def start_requests(self):
        for url, meta in zip(urls, meta_list):
            yield scrapy.Request(url, callback=self.parse, meta=meta)
Any help will be much appreciated.
from scrapy import signals
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

class ExampleSpider(sso_singapore.SsoSpider):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        options = Options()
        options.headless = True
        self.driver = webdriver.Firefox(options=options, executable_path="your_path")

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(ExampleSpider, cls).from_crawler(crawler, *args, **kwargs)
        # quit the driver when the spider_closed signal fires
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        self.driver.quit()
This should do the job.
More on Scrapy signals:
https://docs.scrapy.org/en/latest/topics/signals.html
You can also use a pipeline if you have many spiders and don't want to add the same driver.quit() logic to each:
from scrapy import signals

class YourPipeline:
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_closed(self, spider):
        if hasattr(spider, 'driver'):
            spider.driver.quit()
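Keep in mind the pipeline only runs if it is enabled in the project settings. A sketch, assuming the class lives in myproject/pipelines.py (the path and priority number are placeholders):
# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.YourPipeline': 300,
}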

Python Selenium: Global driver - 'driver' is not defined in the global scope

The source itself works, but I have the problem that the global driver is reported as undefined, though only in VS Code. When I run the source in PyCharm, the problem does not exist. Unfortunately, I really don't know how to continue.
The issue: 'driver' is not defined in the global scope
I am using Python 3.7.2 with pytest.
from selenium import webdriver
import pytest
from selenium.webdriver.common.keys import Keys

def test_setup():
    global driver
    driver = webdriver.Chrome(executable_path="e:/Webdriver/chromedriver.exe")
    driver.implicitly_wait(10)
    driver.maximize_window()

def test_login():
    driver.get("http://www.dev-crowd.com/wp-login.php")
    driver.find_element_by_id("user_login").send_keys("abc")
    driver.find_element_by_id("user_pass").send_keys("cab")
    driver.find_element_by_id("wp-submit").click()
    x = driver.title("abc")
    assert X == "abc"

def test_teardown():
    driver.close()
    driver.quit()
    print("Test completed")
The following silences the warning, though I think it should not be necessary (note that test_setup still needs the global statement; without it the assignment only creates a local variable and the module-level driver stays None):
from selenium import webdriver
import pytest
from selenium.webdriver.common.keys import Keys

driver = None

def test_setup():
    global driver
    driver = webdriver.Chrome(executable_path="e:/Webdriver/chromedriver.exe")
    driver.implicitly_wait(10)
    driver.maximize_window()

def test_login():
    driver.get("http://www.dev-crowd.com/wp-login.php")
    driver.find_element_by_id("user_login").send_keys("abc")
    driver.find_element_by_id("user_pass").send_keys("cab")
    driver.find_element_by_id("wp-submit").click()
    x = driver.title
    assert x == "abc"

def test_teardown():
    driver.close()
    driver.quit()
    print("Test completed")

Using external transactions within test suite

I want to use the "external transaction" pattern: commit a model or models to the database and roll them back after each test. The pure SQLAlchemy implementation (at the bottom) works the way I expect. However, the Flask-SQLAlchemy implementation fails to roll back the committed models after each test.
How can I bind Flask-SQLAlchemy's scoped session to the engine's connection?
Generic flask app:
from flask import Flask
from flask_sqlalchemy import SQLAlchemy

app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite://'
db = SQLAlchemy(app)

class A(db.Model):
    id = db.Column(db.Integer, primary_key=True)
flask-sqlalchemy testcase (doesn't roll back):
from app import *
from unittest import TestCase

class Test(TestCase):
    def setUp(self):
        self.connection = db.engine.connect()
        self.transaction = self.connection.begin()
        options = dict(bind=self.connection, binds={})
        self.session = db.create_scoped_session(options=options)
        db.session = self.session
        self.addCleanup(self.cleanup)

    def cleanup(self):
        self.transaction.rollback()
        self.connection.close()
        self.session.remove()

    @classmethod
    def setUpClass(cls):
        db.create_all()

    @classmethod
    def tearDownClass(cls):
        pass

    def test_1(self):
        a = A()
        db.session.add(a)
        db.session.commit()
        assert len(db.session.query(A).all()) == 1

    def test_2(self):
        assert len(db.session.query(A).all()) == 0  # len is 1
sqlalchemy testcase (works):
from sqlalchemy import *
from sqlalchemy.orm import *
from sqlalchemy.ext.declarative import declarative_base
from unittest import TestCase

Base = declarative_base()

class A(Base):
    __tablename__ = 'test'
    id = Column(Integer, primary_key=True)

class Test(TestCase):
    def setUp(self):
        self.connection = e.connect()
        self.transaction = self.connection.begin()
        # Begin scoped session
        factory = sessionmaker(bind=self.connection)
        self.session = scoped_session(factory)
        self.addCleanup(self.cleanup)

    def cleanup(self):
        self.session.close()
        self.transaction.rollback()
        self.connection.close()

    @classmethod
    def setUpClass(cls):
        global e
        e = create_engine("sqlite://")
        Base.metadata.create_all(e)

    @classmethod
    def tearDownClass(cls):
        pass

    def test_1(self):
        a = A()
        self.session.add(a)
        self.session.commit()
        assert len(self.session.query(A).all()) == 1

    def test_2(self):
        assert len(self.session.query(A).all()) == 0
I had the same problem, and perhaps my fix is the same as yours.
This is now my setUp and tearDown:
import unittest
from my_app import db

class Test(unittest.TestCase):
    def setUp(self):
        self.connection = db.engine.connect()
        self.trans = self.connection.begin()
        db.session.configure(bind=self.connection, binds={})

    def tearDown(self):
        self.trans.rollback()
        self.connection.close()
        db.session.remove()
What actually did it for me was adding binds={}.
I haven't really investigated why this fixes it, but it did for me. I would love to hear if anyone knows why the empty binds dict fixes the problem.