Questions about crawling data using Colab - Selenium

I am trying to crawl data using Selenium in Colab, but I am having a hard time finding the reason it fails.
WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally.
(unknown error: DevToolsActivePort file doesn't exist)
(The process started from chrome location /usr/bin/chromium-browser is no longer running, so ChromeDriver is assuming that Chrome has crashed.)
This is the error message, and I don't know what to do. I know it may be difficult to review since my code crawls a Korean website (Naver), but I hope you can give me a hint to fix the error. Thanks!
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium
!pip install webdriver_manager

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
import time
import urllib
from datetime import datetime
from tqdm import tqdm
import os

# chromedriver settings
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver', options=options)
def get_article_info(driver, crawl_date, press_list, title_list, link_list, date_list, more_news_base_url=None, more_news=False):
    more_news_url_list = []
    while True:
        page_html_source = driver.page_source
        url_soup = BeautifulSoup(page_html_source, 'lxml')
        more_news_infos = url_soup.select('a.news_more')
        if more_news:
            for more_news_info in more_news_infos:
                more_news_url = f"{more_news_base_url}{more_news_info.get('href')}"
                more_news_url_list.append(more_news_url)
        article_infos = url_soup.select("div.news_area")
        if not article_infos:
            break
        for article_info in article_infos:
            press_info = article_info.select_one("div.info_group > a.info.press")
            if press_info is None:
                press_info = article_info.select_one("div.info_group > span.info.press")
            article = article_info.select_one("a.news_tit")
            press = press_info.text.replace("언론사 선정", "")  # strip the "selected by press" badge
            title = article.get('title')
            link = article.get('href')
            press_list.append(press)
            title_list.append(title)
            link_list.append(link)
            date_list.append(crawl_date)
        time.sleep(2.0)
        next_button_status = url_soup.select_one("a.btn_next").get("aria-disabled")
        if next_button_status == 'true':
            break
        time.sleep(1.0)
        next_page_btn = driver.find_element(By.CSS_SELECTOR, "a.btn_next").click()
    return press_list, title_list, link_list, more_news_url_list
def get_naver_news_info_from_selenium(keyword, save_path, target_date, ds_de, sort=0, remove_duplicate=False):
    crawl_date = f"{target_date[:4]}.{target_date[4:6]}.{target_date[6:]}"
    driver = webdriver.Chrome('/usr/lib/chromium-browser/chromedriver', options=options)  # chromedriver file path?
    encoded_keyword = urllib.parse.quote(keyword)
    url = f"https://search.naver.com/search.naver?where=news&query={encoded_keyword}&sm=tab_opt&sort={sort}&photo=0&field=0&pd=3&ds={ds_de}&de={ds_de}&docid=&related=0&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so%3Ar%2Cp%3Afrom{target_date}to{target_date}&is_sug_officeid=0"
    more_news_base_url = "https://search.naver.com/search.naver"
    driver.get(url)
    press_list, title_list, link_list, date_list, more_news_url_list = [], [], [], [], []
    press_list, title_list, link_list, more_news_url_list = get_article_info(driver=driver,
                                                                             crawl_date=crawl_date,
                                                                             press_list=press_list,
                                                                             title_list=title_list,
                                                                             link_list=link_list,
                                                                             date_list=date_list,
                                                                             more_news_base_url=more_news_base_url,
                                                                             more_news=True)
    driver.close()
    if len(more_news_url_list) > 0:
        print(len(more_news_url_list))
        more_news_url_list = list(set(more_news_url_list))
        print(f"->{len(more_news_url_list)}")
        for more_news_url in more_news_url_list:
            driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")
            driver.get(more_news_url)
            press_list, title_list, link_list, more_news_url_list = get_article_info(driver=driver,
                                                                                     crawl_date=crawl_date,
                                                                                     press_list=press_list,
                                                                                     title_list=title_list,
                                                                                     link_list=link_list,
                                                                                     date_list=date_list)
            driver.close()
    # columns: date / press / title / link
    article_df = pd.DataFrame({"날짜": date_list, "언론사": press_list, "제목": title_list, "링크": link_list})
    print(f"extract article num : {len(article_df)}")
    if remove_duplicate:
        article_df = article_df.drop_duplicates(['링크'], keep='first')
        print(f"after remove duplicate -> {len(article_df)}")
    article_df.to_excel(save_path, index=False)
def crawl_news_data(keyword, year, month, start_day, end_day, save_path):
    for day in tqdm(range(start_day, end_day + 1)):
        date_time_obj = datetime(year=year, month=month, day=day)
        target_date = date_time_obj.strftime("%Y%m%d")
        ds_de = date_time_obj.strftime("%Y.%m.%d")
        get_naver_news_info_from_selenium(keyword=keyword, save_path=f"{save_path}/{keyword}/{target_date}_{keyword}_.xlsx", target_date=target_date, ds_de=ds_de, remove_duplicate=False)

keywords = ['사회서비스']
save_path = "/content/naver_news_article"

for keyword in keywords:
    os.makedirs(f"{save_path}/{keyword}")

for keyword in keywords:
    print(f"start keyword - {keyword} crawling ...")
    crawl_news_data(keyword=keyword, year=2022, month=1, start_day=1, end_day=2, save_path=save_path)

I usually run Selenium on Colab, and in my setup I don't use webdriver_manager:
!pip install selenium
!apt-get update;
!apt install chromium-chromedriver;
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
driver = webdriver.Chrome(options=options)
This is my basic setup to make it work.
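Note that Selenium 4.x removed the positional executable-path argument (as in webdriver.Chrome('chromedriver', options=options) above). A minimal sketch of the Service-based equivalent, assuming the chromedriver path installed by the apt commands above:

# Minimal sketch for Selenium 4.x on Colab; assumes apt installed
# chromedriver at /usr/lib/chromium-browser/chromedriver.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

service = Service("/usr/lib/chromium-browser/chromedriver")
driver = webdriver.Chrome(service=service, options=options)

The --no-sandbox and --disable-dev-shm-usage flags are also the usual fix for the DevToolsActivePort error in containerized environments like Colab.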

Related

How to attach a screenshot in a pytest-html report with conftest.py?

I want to attach a screenshot to my HTML report, but I haven't found any good resource on how to use the conftest.py file. I created the conftest.py file inside the pytest folder with the following code:
import pytest

@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
    pytest_html = item.config.pluginmanager.getplugin("html")
    outcome = yield
    report = outcome.get_result()
    extra = getattr(report, "extra", [])
    image = "D:/Selenium/Insights/2022-11-02_00-13-18/error_page.png"
    if report.when == "call":
        # always add url to report
        extra.append(pytest_html.extras.url("http://www.example.com/"))
        extra.append(pytest_html.extra.image(image))
        xfail = hasattr(report, "wasxfail")
        if (report.skipped and xfail) or (report.failed and not xfail):
            # only add additional html on failure
            # extra.append(pytest_html.extras.html("<div>Additional HTML</div>"))
            extra.append(pytest_html.extra.image(image))
        report.extra = extra
And my test.py file is:
import time
from os import getenv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from dotenv import load_dotenv
from Login_actions import Login_activities
from Insights_actions import Insights_activities
from Locators import Locators
import pytest, os
from datetime import datetime

class Test_Insights():

    @pytest.fixture
    def test_setup(self):
        # make a new directory for downloads
        new_dir = r"D:\Selenium\Insights\{timestamp}".format(timestamp=datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
        # print(new_dir)
        if not os.path.exists(new_dir):
            os.makedirs(new_dir)
        self.saved_dir = new_dir
        prefs = {"download.default_directory": new_dir, "download.directory_upgrade": True, "download.prompt_for_download": False}
        # initiating chrome browser instance
        options = Options()
        options.add_argument('--start-maximized')
        # options.add_argument('--headless')
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        # load credentials
        load_dotenv()
        self.username = getenv("TOP_USERNAME")
        self.password = getenv("TOP_PWD")
        # exiting ceremonies
        yield
        self.driver.close()
        self.driver.quit()
        print("Test executed")

    def test_check_in(self, test_setup):
        driver = self.driver
        # login_url = "https://tilt-sso.preprod.crto.in/"  # separate login page
        # url = "https://tilt-orange360.preprod.crto.in/insights/home"
        url = "https://tilt-sso.preprod.crto.in/auth?code=5515f8b0-4b64-4da4-b506-e6a6a3f81b23&scope=cn%20dn%20mail%20uid%20umsId&state=eyJyZWRpcmVjdF91cmkiOiJcL2hvbWUiLCJub25jZSI6IktaTFBxczU5T3lQUWJaRUp0OFhBQWZvZDNueDhPaENDbGlJWVRqZ08ifQ%3D%3D"
        driver.get(url)
        try:
            welcome_text = driver.find_element(by=By.XPATH, value="//div[contains(text(),'Criteo')]")
            assert welcome_text
            login_actions = Login_activities(driver)
            login_actions.enter_username(test_setup.username)
            login_actions.enter_password(test_setup.password)
            login_actions.login()
            page_load_wait = WebDriverWait(driver, timeout=30).until(
                EC.url_to_be("https://tilt-orange360.preprod.crto.in/insights/home"))
            if (page_load_wait):
                WebDriverWait(driver, timeout=20).until(
                    EC.visibility_of_element_located((By.XPATH, Locators.welcome_text)))
                WebDriverWait(driver, timeout=20).until(EC.element_to_be_clickable((By.XPATH, Locators.run_insight)))
                insights_actions = Insights_activities(driver)
                insights_actions.insights_search("Check-In")
                insights_actions.search_partners("BOOKINGIT")
                insights_actions.smart_date_30days()
                insights_actions.submit_insights()
                WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, Locators.success_mesg)))
                # submit_verify = driver.find_element(by=By.XPATH, value=Locators.success_mesg)
                # assert(submit_verify)
                print("Submission successful")
                insights_actions.download_file()
                time.sleep(20)
                print(self.saved_dir)
                arr = []
                arr += [file for file in os.listdir(self.saved_dir) if file.endswith('.pptx')]
                print("File in the directory: " + arr[0])
                while not arr:
                    time.sleep(5)
                if arr:
                    print("Insights completed. File downloaded successfully")
                else:
                    print("File not available")
                    raise NoSuchElementException
        except:
            if driver.find_element(by=By.XPATH, value=Locators.error_page):
                driver.get_screenshot_as_file('{dir}/error_page.png'.format(dir=self.saved_dir))
                print("500 Internal server error")
                Error_page = driver.current_url
                print("The error page: " + Error_page)
                raise NoSuchElementException
I do not know why it is not working. The documentation (https://pytest-html.readthedocs.io/en/latest/user_guide.html#enhancing-reports) does not have much information. I really need help here, please.
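For what it's worth, one thing that stands out: the pytest-html helper namespace is plural (pytest_html.extras.image), while the code above calls pytest_html.extra.image. A minimal conftest.py sketch with that fixed, assuming pytest-html 3.x (the image path is the one from the question):

# Minimal sketch, assuming pytest-html 3.x; extras.image() takes a
# base64 string or a path/URL depending on the report mode.
import pytest

@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
    pytest_html = item.config.pluginmanager.getplugin("html")
    outcome = yield
    report = outcome.get_result()
    extra = getattr(report, "extra", [])
    if report.when == "call":
        xfail = hasattr(report, "wasxfail")
        if (report.skipped and xfail) or (report.failed and not xfail):
            # note the plural: extras, not extra
            image = "D:/Selenium/Insights/2022-11-02_00-13-18/error_page.png"
            extra.append(pytest_html.extras.image(image))
        report.extra = extra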

My selenium web scraper doesn't make sense to me

I want to get the bitcoin price using the following code, but I have no clue why the output behaves the way it does. It appears to store certain values and output them in between accurate values. Bonus task: make the old values disappear in tkinter.
from bs4 import BeautifulSoup  # downloading pertinent Python packages
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import tkinter as tk
import time

time = 1000

def bitcoinTracker():
    options = Options()
    options.headless = True
    options.add_argument("--window-size=1920,1200")
    chromedriver = "/Users/Philipp/PhytonWebScrap/selenium_project/chromedriver"  # setting up Chrome driver
    driver = webdriver.Chrome(options=options, executable_path=chromedriver)
    driver.get("https://coinmarketcap.com/currencies/bitcoin/")
    hunt = driver.find_element_by_class_name("priceValue___11gHJ").text
    return(hunt)
    driver.quit()

def collector():
    label = tk.Label(text="Bitcoin " + bitcoinTracker(), font="Arial 18")
    label.pack()
    root.after(time, collector)

root = tk.Tk()
root.after(time, collector)
root.mainloop()
I tried again, this time only Selenium without tkinter:
from bs4 import BeautifulSoup  # downloading pertinent Python packages
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

options = Options()
options.headless = True
options.add_argument("--window-size=1920,1200")

while True:
    chromedriver = "/Users/Philipp/PhytonWebScrap/selenium_project/chromedriver"  # setting up Chrome driver
    driver = webdriver.Chrome(options=options, executable_path=chromedriver)
    driver.get("https://coinmarketcap.com/currencies/bitcoin/")
    hunt = driver.find_element_by_class_name("priceValue___11gHJ").text
    # print(driver.page_source)
    time.sleep(20)
    print(hunt)
    driver.quit()
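A few observations that may explain the odd output: in the tkinter version, time = 1000 shadows the time module, driver.quit() sits after return and never runs (so Chrome instances pile up), and collector() packs a brand-new label on every tick instead of updating one, which is why old values linger on screen. A sketch that reuses one driver and updates a single label in place, assuming the Selenium 3-style API and the chromedriver path from the question, and that the auto-generated class name priceValue___11gHJ is still valid (such names change often):

# Sketch: one driver, one label updated in place.
# Assumptions: Selenium 3-style API; the asker's chromedriver path;
# the CSS class "priceValue___11gHJ" is auto-generated and may change.
import tkinter as tk
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

INTERVAL_MS = 1000  # renamed so the time module is not shadowed

options = Options()
options.headless = True
driver = webdriver.Chrome(options=options, executable_path="/Users/Philipp/PhytonWebScrap/selenium_project/chromedriver")

def fetch_price():
    driver.get("https://coinmarketcap.com/currencies/bitcoin/")
    return driver.find_element_by_class_name("priceValue___11gHJ").text

def collector():
    label.config(text="Bitcoin " + fetch_price())  # update the existing label
    root.after(INTERVAL_MS, collector)

root = tk.Tk()
label = tk.Label(font="Arial 18")
label.pack()
root.after(INTERVAL_MS, collector)
root.mainloop()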

The website it opens using the automated software closes soon after it gets executed

The error could possibly be in line 7. It should open the browser, search for the specific word, and then the window should stay open until closed.
import unittest
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

class PythonOrgSearch(unittest.TestCase):

    def setUp(self):
        self.driver = webdriver.Chrome("C:/Users/daniy/AppData/Local/Programs/Python/Python37-32/Scripts/chromedriver.exe")

    def test_search_in_python_org(self):
        driver = self.driver
        driver.get("http://www.python.org")
        self.assertIn("Python", driver.title)
        driver.implicitly_wait(6000)
        elem = driver.find_element_by_name("q")
        driver.implicitly_wait(5000)
        elem.send_keys("pycon")
        driver.implicitly_wait(6000)
        elem.send_keys(Keys.RETURN)
        driver.implicitly_wait(6000)
        assert "No results found." not in driver.page_source
        driver.implicitly_wait(9000)

if __name__ == "__main__":
    unittest.main()
Try this out; it works for me:
driver.get("http://www.python.org")
self.assertIn("Python", driver.title)
elem = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.NAME, "q")))
elem.send_keys("pycon")
elem.send_keys(Keys.RETURN)
driver.implicitly_wait(5)
self.assertNotIn("No results found.", driver.page_source)
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
You should use the right version of ChromeDriver for your browser version; refer to this link: https://chromedriver.chromium.org/downloads
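Alternatively, webdriver_manager can download a driver that matches the installed browser automatically; a minimal sketch, assuming Selenium 4.x and the webdriver_manager package:

# Sketch: fetch a chromedriver matching the installed Chrome.
# Assumes Selenium 4.x and webdriver_manager are installed.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))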
Try the below code; it should work:
import unittest
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

class PythonOrgSearch(unittest.TestCase):

    def setUp(self):
        self.driver = webdriver.Chrome("C:/Users/daniy/AppData/Local/Programs/Python/Python37-32/Scripts/chromedriver.exe")

    def test_search_in_python_org(self):
        driver = self.driver
        driver.implicitly_wait(6000)
        driver.get("http://www.python.org")
        self.assertIn("Python", driver.title)
        time.sleep(6000)
        elem = driver.find_element_by_name("q")
        elem.send_keys("pycon")
        elem.send_keys(Keys.RETURN)
        time.sleep(6000)
        assert "No results found." not in driver.page_source

if __name__ == "__main__":
    unittest.main()
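If the underlying goal is simply to keep the browser window open after the script finishes, Chrome's "detach" option is another route (a sketch, not part of the answers above; the chromedriver path is the one from the question):

# Sketch: keep the Chrome window open after the script exits.
# The chromedriver path is the asker's and is assumed unchanged.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_experimental_option("detach", True)
driver = webdriver.Chrome("C:/Users/daniy/AppData/Local/Programs/Python/Python37-32/Scripts/chromedriver.exe", options=options)
driver.get("http://www.python.org")
# the window now stays open when the script ends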

Selenium (chromedriver) working on local machine but not on Ubuntu Server

I've successfully built and tested a Selenium script (using Python and chromedriver) that logs into my WordPress sites. The app runs flawlessly on my local machine. However, since deploying it to an Ubuntu server, my script triggers a mod_security rule and blocks me from logging into my WordPress sites.
Does this have to do with cookies, or the fact that it's running on a server?
Any help would be appreciated. Mod_security error:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
import shutil
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
from dashboard.models import Site
import glob

def Backup():
    prefs = {
        'download.default_directory': r"/home/landon/app/static/dashboard/backups",
        # 'download.default_directory': r"/Users/landonroddenberry/Documents/appThree/SITERACK_1.0/dashboard/static/dashboard/backups",  #### USE only for local testing and dev ####
        'download.prompt_for_download': False,
        'download.extensions_to_open': 'xml',
        'safebrowsing.enabled': True
    }
    chrome_path = r"/usr/bin/chromedriver"
    # chrome_path = r"/Users/landonroddenberry/Documents/chromedriver"  #### USE only for local testing and dev ####
    WINDOW_SIZE = "1920,1080"
    options = webdriver.ChromeOptions()
    options.add_experimental_option('prefs', prefs)
    options.add_argument("start-maximized")
    options.add_argument("--headless")
    # options.add_argument("disable-infobars")
    options.add_argument("--disable-extensions")
    options.add_argument("--window-size=%s" % WINDOW_SIZE)
    options.add_argument("--safebrowsing-disable-download-protection")
    options.add_argument("safebrowsing-disable-extension-blacklist")
    driver = webdriver.Chrome(executable_path=chrome_path, options=options)
    logLink = "/wp-login.php"
    AdminLink = "/wp-admin/"
    page = "export.php/"
    p_col = "https://"
    sites = Site.objects.filter(active=True)
    for site in sites:
        print('')
        print('now backing up ' + site.webUrl)
        MainLink = (p_col + site.webUrl + AdminLink)
        loginLink = (p_col + site.webUrl + logLink)
        driver.get(loginLink)
        time.sleep(5)
        try:
            driver.find_element_by_xpath('/html/body/div[1]/form/div[3]/a[1]').click()
            time.sleep(3)
        except:
            print("no button to click")
        try:
            user_name_elem = driver.find_element_by_xpath('//*[@id="user_login"]')
            user_name_elem.clear()
            time.sleep(2)
            user_name_elem.send_keys(site.u_admin)
            time.sleep(2)
            password_elem = driver.find_element_by_xpath('//*[@id="user_pass"]')
            time.sleep(2)
            password_elem.clear()
            password_elem.send_keys(site.p_admin)
            time.sleep(3)
            password_elem.send_keys(Keys.RETURN)
            time.sleep(5)
            driver.save_screenshot('Backup_Conflict_' + site.webUrl + '.png')  ##### USED FOR DEBUGGING #####
            print('done logging-in')
            time.sleep(10)
        except:
            driver.save_screenshot('Backup_Conflict_' + site.webUrl + '.png')
If you are using the same chromedriver binary for both Windows and Linux, it will not work; the chromedriver builds are separate for each OS.
Try downloading the chromedriver for Linux.
Hope this helps.
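Another possibility worth checking: many mod_security rule sets block headless Chrome outright, because headless mode advertises itself as "HeadlessChrome" in the user-agent string. Overriding the user agent is a common workaround; a sketch (the UA string below is illustrative, not from the post):

# Sketch: override the UA so mod_security does not see "HeadlessChrome".
# The UA string is an illustrative example, not from the original post.
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--window-size=1920,1080")
options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36")
driver = webdriver.Chrome(executable_path=r"/usr/bin/chromedriver", options=options)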

Python Selenium: Global driver - 'driver' is not defined in the global scope

The source itself works, but I have the problem that the global driver is undefined, only in VS Code. When I run the source in PyCharm, the problem does not exist. Unfortunately, I really do not know how to continue.
The issue: 'driver' is not defined in the global scope.
I am using Python 3.7.2 with pytest.
from selenium import webdriver
import pytest
from selenium.webdriver.common.keys import Keys

def test_setup():
    global driver
    driver = webdriver.Chrome(executable_path="e:/Webdriver/chromedriver.exe")
    driver.implicitly_wait(10)
    driver.maximize_window()

def test_login():
    driver.get("http://www.dev-crowd.com/wp-login.php")
    driver.find_element_by_id("user_login").send_keys("abc")
    driver.find_element_by_id("user_pass").send_keys("cab")
    driver.find_element_by_id("wp-submit").click()
    x = driver.title("abc")
    assert x == "abc"

def test_teardown():
    driver.close()
    driver.quit()
    print("Test completed")
The following should work, but I think it should not be necessary:
from selenium import webdriver
import pytest
from selenium.webdriver.common.keys import Keys

driver = None

def test_setup():
    global driver  # still needed so the assignment targets the module-level name
    driver = webdriver.Chrome(executable_path="e:/Webdriver/chromedriver.exe")
    driver.implicitly_wait(10)
    driver.maximize_window()

def test_login():
    driver.get("http://www.dev-crowd.com/wp-login.php")
    driver.find_element_by_id("user_login").send_keys("abc")
    driver.find_element_by_id("user_pass").send_keys("cab")
    driver.find_element_by_id("wp-submit").click()
    x = driver.title("abc")
    assert x == "abc"

def test_teardown():
    driver.close()
    driver.quit()
    print("Test completed")