I need to collect all links from a webpage as seen below, which also has a load more news button. I wrote my script, but my script gives only the links from the first page, as if it does not click on the load more news button. I updated some of Selenium attributes. I really don't know why I could not get all the links, clicking on load_more button.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
import json
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
url = "..."
base_url = "..."
driver.get(url)
outlinks = []
wait = WebDriverWait(driver, 90)
load_more_button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a.listing-load-more-btn[title="Load More News"]')))
num_links = 0
while True:
links = driver.find_elements(By.CSS_SELECTOR, 'a.text-truncate')
num_links_new = len(links)
if num_links_new > num_links:
num_links = num_links_new
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
load_more_button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a.listing-load-more-btn[title="Load More News"]')))
if load_more_button.is_displayed():
load_more_button.click()
sleep(10)
else:
break
new_links = driver.find_elements(By.CSS_SELECTOR, 'a.text-truncate')
for link in new_links:
href = link.get_attribute('href')
full_url = base_url + href
enurl=full_url.replace("ar-ae", "en")
outlinks.append(enurl)
print(outlinks)
data = json.dumps(outlinks)
with open('outlinks.json', 'w') as f:
f.write(data)
driver.close()
Although you have tagged selenium, this is a much better way to handle it.
Whenever you click on the "load more" button, it sends a POST request to:
https://www.mofaic.gov.ae/api/features/News/NewsListPartialView
So, you can just get all the data from there directly using the requests/BeautifulSoup modules. There's no need for Selenium, and the process will be much faster!
import requests
from bs4 import BeautifulSoup
data = {
"CurrentPage": "1",
"CurrentRenderId": "{439EC71A-4231-45C8-B075-975BD41099A7}",
"CategoryID": "{f9048938-c577-4caa-b1d9-ae1b7a5f1b20}",
"PageSize": "6",
}
BASE_URL = "https://www.mofaic.gov.ae"
POST_URL = "https://www.mofaic.gov.ae/api/features/News/NewsListPartialView"
response = requests.post(
POST_URL,
data=data,
)
for page in range(
1, 10
): # <-- Increase this number to get more Articles - simulates the "load more" button.
data["CurrentPage"] = page
response = requests.post(
POST_URL,
data=data,
)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.select("a.text-truncate"):
print(BASE_URL + link["href"])
Prints (truncated):
https://www.mofaic.gov.ae/ar-ae/mediahub/news/2023/1/2/02-01-2023-uae-leaders
https://www.mofaic.gov.ae/ar-ae/mediahub/news/2023/1/2/02-01-2023-uae-vatican
https://www.mofaic.gov.ae/ar-ae/mediahub/news/2023/1/2/02-01-2023-uae-fm
https://www.mofaic.gov.ae/ar-ae/mediahub/news/2023/1/1/01-01-2022-uae-cuba
https://www.mofaic.gov.ae/ar-ae/mediahub/news/2023/1/1/01-01-2022-uae-sudan
https://www.mofaic.gov.ae/ar-ae/mediahub/news/2023/1/1/01-01-2023-uae-israel
My code:
#Open Website
profile_path = r'C:\Users\XXX\AppData\Local\Mozilla\Firefox\Profiles\ndefault-release'
options = Options()
options.binary_location = r'C:\Program Files\Mozilla Firefox\firefox.exe'
options.set_preference('profile', profile_path)
options.add_argument("--no-sandbox")
service = Service(r'C:\Users\XXX\geckodriver.exe')
driver = Firefox(service=service, options=options)
# declaration of variables
name = "x"
suffix = "x"
start_number = 1
end_number = 1000
for i in range(start_number, end_number):
driver.get('https://www.bauhaus.info/gewinnspiel')
time.sleep(3)
#driver.find_element(by=By.XPATH, value=f"/div/div/div/div/div[2]/div/div[2]/div/div/div/button").click()
element = driver.execute_script("""return document.querySelector('#usercentrics-root').shadowRoot.querySelector("button[data-testid='uc-accept-all-button']")""")
element.click()
time.sleep(1)
driver.switch_to.frame(0)
time.sleep(1)
driver.find_element(by=By.XPATH, value=f"/html/body/section/div/div/div/div[2]/div/div/div[1]/div[1]/form/div/div[3]/a").click()
time.sleep(2)
If i add the xpath in the firefox console i get the right output:
Array [ a.btn.btn-primary ]
But the white windows does not open, only if i click manually on it:
shadow_host = driver.find_element(By.ID, "usercentrics-root")
if driver.name == 'firefox':
shadow_root = driver.execute_script('return arguments[0].shadowRoot.children', shadow_host)[0]
else:
shadow_root = shadow_host.shadow_root
WebDriverWait(shadow_root, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "button[data-testid='uc-accept-all-button']"))).click()
iframe = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "iframe[title=' blackweekgewinnspiel']")))
driver.switch_to.frame(iframe)
canvas = driver.find_element(By.CSS_SELECTOR, "canvas")
driver.execute_script("arguments[0].scrollIntoView(true);",canvas)
driver.execute_script("arguments[0].click();", canvas);
driver.switch_to.default_content()
Two points to note:
the Accept Cookies popup is in shadow DOM, for which Chrome + Selenium 4.1 has a ready made solution, but Firefox will also work with Selenium 4 via execute_script.
the roulette wheel is in an iframe, so must switch there before clicking.
I have made a youtube automation bot. I am getting error : unable to locate element (for the Xpath of subscribe button)
here is my code
from selenium import webdriver
from selenium import common
from selenium.webdriver.common import keys
from webdriver_manager.firefox import GeckoDriverManager
import time
class actions:
def __init__(self, email, password):
self.email = email
self.password = password
profile = webdriver.FirefoxProfile()
profile.set_preference("dom.webdriver.enabled", False)
profile.set_preference('useAutomationExtension', False)
profile.update_preferences()
driver = webdriver.Firefox(
executable_path=GeckoDriverManager().install(), firefox_profile=profile)
self.bot = driver
# self.bot.maximize_window()
self.bot.set_window_size(400, 700)
self.is_logged_in = False
def login(self):
bot = self.bot
bot.get("https://accounts.google.com/signin/v2/identifier?service=youtube&uilel=3&passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Faction_handle_signin%3Dtrue%26app%3Ddesktop%26hl%3Den%26next%3Dhttps%253A%252F%252Fwww.youtube.com%252F&hl=en&ec=65620&flowName=GlifWebSignIn&flowEntry=ServiceLogin")
time.sleep(5)
try:
email = bot.find_element_by_name('identifier')
except common.exceptions.NoSuchElementException:
time.sleep(5)
email = bot.find_element_by_name('identifier')
email.clear()
email.send_keys(self.email + keys.Keys.RETURN)
time.sleep(5)
try:
password = bot.find_element_by_name('password')
except common.exceptions.NoSuchElementException:
time.sleep(5)
password = bot.find_element_by_name('password')
password.clear()
password.send_keys(self.password + keys.Keys.RETURN)
time.sleep(5)
self.is_logged_in = True
def kill(self):
bot = self.bot
bot.quit()
def subscribe(self, url):
if not self.is_logged_in:
return
bot = self.bot
bot.get(url)
time.sleep(4)
try:
value = bot.find_element_by_xpath(
'/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[7]/div[2]/ytd-video-secondary-info-renderer/div/div/div/ytd-subscribe-button-renderer/tp-yt-paper-button').get_attribute('aria-label')
value = value.split()
except:
bot.execute_script(
'window.scrollTo(0,document.body.scrollHeight/3.5)')
time.sleep(3)
value = bot.find_element_by_xpath(
'/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[7]/div[2]/ytd-video-secondary-info-renderer/div/div/div/ytd-subscribe-button-renderer/tp-yt-paper-button').get_attribute('aria-label')
value = value.split(':')
if value[0] == "Subscribe":
try:
bot.find_element_by_xpath(
'/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[7]/div[2]/ytd-video-secondary-info-renderer/div/div/div/ytd-subscribe-button-renderer/tp-yt-paper-button').click()
time.sleep(3)
except:
bot.execute_script(
'window.scrollTo(0,document.body.scrollHeight/3.5)')
time.sleep(3)
bot.find_element_by_xpath(
'/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[7]/div[2]/ytd-video-secondary-info-renderer/div/div/div/ytd-subscribe-button-renderer/tp-yt-paper-button').click()
time.sleep(3)
how can i resolve this issue. I am not able to understand where things are going wrong. Or i should try find elements by id or other ways instead of Xpath.
Or is there any problem with any software.
Please help me out
Always use relative XPath in your test. Using the absolute XPath will cause regular test failures.
Refer to this tutorial about writing the relative XPaths. https://www.guru99.com/xpath-selenium.html
This extension will help you to write the relative XPaths. https://chrome.google.com/webstore/detail/chropath/ljngjbnaijcbncmcnjfhigebomdlkcjo
You can refer how to write XPath in different ways using functions like text(), starts-with(), contains(). so you can locate them by visible texts also.
Refer this articlehere
Does anybody know how to enable Flash plugin in Chrome 69.
I use chromedriver 2.41 with java selenium bindings.
I've tried with
prefs.put("profile.default_content_setting_values.plugins", 1);
prefs.put("profile.content_settings.plugin_whitelist.adobe-flash-player", 1);
prefs.put("profile.content_settings.exceptions.plugins.*,*.per_resource.adobe-flash-player", 1);
but with no luck.
I've also tried to compare chrome profile preferences with disallowed/allowed flash for particular site and then tried with:
Map<String, Object> site = new HashMap<>();
Map<String, Object> values = new HashMap<>();
Map<String, Object> setting = new HashMap<>();
setting.put("flashPreviouslyChanged", true);
values.put("last_modified", "13180613213099316");
values.put("setting", setting);
site.put("http://my.site,*", values);
prefs.put("profile.content_settings.exceptions.flash_data", site);
but it won't work as well.
I've also tried to run with profile specified via
options.addArguments("user-data-dir=" + profileDir);
but since this white list setting becomes 'ephemeral' in Chrome 69 it also won't work.
Is there any method to run my automation in Chrome with flash support?
Thanks everyone for answers.
I finally have found the solution. In order to enable flash progrmatically since Chrome 69 we have to do 2 things:
Disable Ephemeral Flash Permissions (to enable list of allowed for
Flash sites) and
Add all sites to that list.
See the following code on Java:
ChromeOptions options = new ChromeOptions();
// disable ephemeral flash permissions flag
options.addArguments("--disable-features=EnableEphemeralFlashPermission");
Map<String, Object> prefs = new HashMap<>();
// Enable flash for all sites for Chrome 69
prefs.put("profile.content_settings.exceptions.plugins.*,*.setting", 1);
options.setExperimentalOption("prefs", prefs);
nestedDriver = new ChromeDriver(options);
Given the flag --disable-features=EnableEphemeralFlashPermission has been removed in Chrome 71 which severely cripples Flash test automation I would like to share our solution.
public class FlashPolicyHelper {
private final ChromeDriver driver;
public FlashPolicyHelper(ChromeDriver driver) {
this.driver = driver;
}
public FlashPolicyHelper addSite(String site) {
this.driver.get("chrome://settings/content/siteDetails?site=" + site);
WebElement root1 = driver.findElement(By.tagName("settings-ui"));
WebElement shadowRoot1 = expandRootElement(root1);
WebElement root2 = shadowRoot1.findElement(getByIdentifier("id=container"));
WebElement main = root2.findElement(getByIdentifier("id=main"));
WebElement shadowRoot3 = expandRootElement(main);
WebElement shadowRoot4 = shadowRoot3.findElement(getByIdentifier("class=showing-subpage"));
WebElement shadowRoot5 = expandRootElement(shadowRoot4);
WebElement shadowRoot6 = shadowRoot5.findElement(getByIdentifier("id=advancedPage"));
WebElement shadowRoot7 = shadowRoot6.findElement(By.tagName("settings-privacy-page"));
WebElement shadowRoot8 = expandRootElement(shadowRoot7);
WebElement shadowRoot9 = shadowRoot8.findElement(getByIdentifier("id=pages"));
WebElement shadowRoot10 = shadowRoot9.findElement(By.tagName("settings-subpage"));
WebElement shadowRoot11 = shadowRoot10.findElement(By.tagName("site-details"));
WebElement shadowRoot12 = expandRootElement(shadowRoot11);
WebElement shadowRoot13 = shadowRoot12.findElement(By.id("plugins"));
WebElement shadowRoot14 = expandRootElement(shadowRoot13);
new Select(shadowRoot14.findElement(By.id("permission"))).selectByValue("allow");
return this;
}
private By getByIdentifier(String identifier) {
String[] identifiers = identifier.split("=");
return identifiers[0].equals("id") ? By.id(identifiers[1]) :
By.className(identifiers[1]);
}
private WebElement expandRootElement(WebElement element) {
return (WebElement) driver.executeScript("return arguments[0].shadowRoot",element);
}
}
The helper should be called after instantiating the ChromeDriver.
driver = new ChromeDriver(options);
new FlashPolicyHelper(driver).addSite("https://your.site").addSite("https://another.site");
Follow these steps:
Input this URL in Chrome: chrome://flags/
On search inputbox, digit: ephemeral flash
Choose "disabled" option.
This will not ask to run Flash Player for further sessions in Chrome 69.
Python3 version for Chrome 74. Converted from the Java version of #JohnoCrawford above.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
def add_flash_site(driver, web_url):
def expand_root_element(element):
return driver.execute_script("return arguments[0].shadowRoot", element)
driver.get("chrome://settings/content/siteDetails?site=" + web_url)
root1 = driver.find_element(By.TAG_NAME, "settings-ui")
shadow_root1 = expand_root_element(root1)
root2 = shadow_root1.find_element(By.ID, "container")
root3 = root2.find_element(By.ID, "main")
shadow_root3 = expand_root_element(root3)
root4 = shadow_root3.find_element(By.CLASS_NAME, "showing-subpage")
shadow_root4 = expand_root_element(root4)
root5 = shadow_root4.find_element(By.ID, "advancedPage")
root6 = root5.find_element(By.TAG_NAME, "settings-privacy-page")
shadow_root6 = expand_root_element(root6)
root7 = shadow_root6.find_element(By.ID, "pages")
root8 = root7.find_element(By.TAG_NAME, "settings-subpage")
root9 = root8.find_element(By.TAG_NAME, "site-details")
shadow_root9 = expand_root_element(root9)
root10 = shadow_root9.find_element(By.ID, "plugins")
shadow_root10 = expand_root_element(root10)
root11 = shadow_root10.find_element(By.ID, "permission")
Select(root11).select_by_value("allow")
Chrome 69 recently released does not allow sites (URLs) to be permanently added (enabled) via chrome://settings/content/flash as was the case for previous versions of Flash Player. However, a URL can be temporarily enabled for the current sesison by clicking on the lock icon to the left of the location bar, then choose Site Settings, and then enable Flash Player.
This policy forces users of Flash Player to re-configure their permission settings every session, which makes it less convenient to use Flash Player. That is apparently by design.
Fortunately, the Microsoft Edge browser does not have this policy. Like Chrome, Edge runs Flash Player. However, unlike Chrome, it persists the permission settings and does not inconvenience the user.
thanks for #JohnoCrawford, i wrote a python code by referring to his java code.
from urllib import quote_plus as url_quoteplus
from urlparse import urlsplit
from selenium.webdriver.common.by import By as WebBy
from selenium.webdriver.support.ui import Select as WebSelect
def allow_flash(driver, url):
def _base_url(url):
if url.find("://")==-1:
url = "http://{}".format(url)
urls = urlsplit(url)
return "{}://{}".format(urls.scheme, urls.netloc)
def _shadow_root(driver, element):
return driver.execute_script("return arguments[0].shadowRoot", element)
base_url = _base_url(url)
driver.get("chrome://settings/content/siteDetails?site={}".format(url_quoteplus(base_url)))
root1 = driver.find_element(WebBy.TAG_NAME, "settings-ui")
shadow_root1 = _shadow_root(driver, root1)
root2 = shadow_root1.find_element(WebBy.ID, "container")
root3 = root2.find_element(WebBy.ID, "main")
shadow_root3 = _shadow_root(driver, root3)
root4 = shadow_root3.find_element(WebBy.CLASS_NAME, "showing-subpage")
shadow_root4 = _shadow_root(driver, root4)
root5 = shadow_root4.find_element(WebBy.ID, "advancedPage")
root6 = root5.find_element(WebBy.TAG_NAME, "settings-privacy-page")
shadow_root6 = _shadow_root(driver, root6)
root7 = shadow_root6.find_element(WebBy.ID, "pages")
root8 = root7.find_element(WebBy.TAG_NAME, "settings-subpage")
root9 = root8.find_element(WebBy.TAG_NAME, "site-details")
shadow_root9 = _shadow_root(driver, root9)
root10 = shadow_root9.find_element(WebBy.ID, "plugins") # Flash
shadow_root10 = _shadow_root(driver, root10)
root11 = shadow_root10.find_element(WebBy.ID, "permission")
WebSelect(root11).select_by_value("allow")
Since I saw many methods here is not work for Chrome 71, I would like to share the solution in C# that I am using:
ChromeOptions chromeOptions = new ChromeOptions();
List<string> allowFlashUrls = new List<string>() {
"*.testing1.com",
"*.testing2.com",
};
chromeOptions.AddUserProfilePreference("profile.managed_plugins_allowed_for_urls", config.ChromeConfig.AllowFlashUrls);
ChromeDriver chromeDriver = new ChromeDriver(chromeOptions);
// Then run your test using chromeDriver
By setting profile.managed_plugins_allowed_for_urls will force Chrome the allow run Flash in the domain declare in allowFlashUrls list. Not tested but it should allow Flash to all site by adding http://* and https:// to allow Flash list.
I handled this by going into the settings for the website before I ran my test and doing selenium actions such as below:
public void SetFlashForURL (string yourWebsiteURL) {
driver.Navigate().GoToUrl(string.Format("chrome://settings/content/siteDetails?site={0}", yourWebsiteURL));
Thread.Sleep(1000);
Actions actions = new Actions(driver);
if (yourWebsiteURL.Contains("https"))
{
actions.SendKeys(OpenQA.Selenium.Keys.Tab);
actions.SendKeys(OpenQA.Selenium.Keys.Tab);
actions.SendKeys(OpenQA.Selenium.Keys.Tab);
actions.SendKeys(OpenQA.Selenium.Keys.Tab);
}
actions.SendKeys(OpenQA.Selenium.Keys.Tab);
actions.SendKeys(OpenQA.Selenium.Keys.Tab);
actions.SendKeys(OpenQA.Selenium.Keys.Tab);
actions.SendKeys(OpenQA.Selenium.Keys.Down);
actions.Build().Perform();
}
My solution for C#
var chromeOptions = new ChromeOptions();
chromeOptions.AddArgument("--disable-features=EnableEphemeralFlashPermission");
chromeOptions.AddUserProfilePreference(
"profile.content_settings.exceptions.plugins.*,*.per_resource.adobe-flash-player", 1);
var capability = (DesiredCapabilities)chromeOptions.ToCapabilities();
Using #RodolphoSilva answer with links:
1 - Input link: chrome://flags/#enable-ephemeral-flash-permission
2 - Change to "Disabled"
3 - Click "RELAUNCH NOW" button
4 - Input link: chrome://settings/content/flash?search=flash
5 - Now you can add or block sites to use flash
#RodolphoSilva - Many thanks for your great answer!
In case anyone else needs it, here's how to do the same thing in a Protractor config:
capabilities: {
browserName: 'chrome',
chromeOptions: {
args: ['--disable-features=EnableEphemeralFlashPermission'],
prefs: {
"profile.content_settings.exceptions.plugins.*,*.per_resource.adobe-flash-player": 1,
}
},
}
ChromeOptions options = new ChromeOptions();
options.addArguments("--disable-features=EnableEphemeralFlashPermission");
Map<String, Object> prefs = new HashMap<>();
prefs.put("profile.content_settings.exceptions.plugins.*,*.per_resource.adobe-flash-player",1);
options.setExperimentalOption("prefs", prefs);
WebDriver driver = new ChromeDriver(options);
driver.get("some url");
Enable flash in chromedriver using robotframework
Credits: #BaiJiFeiLong
Save following code as flash_helper.py
from robot.libraries.BuiltIn import BuiltIn
from selenium.webdriver.common.by import By as WebBy
from selenium.webdriver.support.ui import Select as WebSelect
def allow_flash(url):
seleniumlib = BuiltIn().get_library_instance('SeleniumLibrary')
driver = seleniumlib.driver
def _shadow_root(driver, element):
return driver.execute_script("return arguments[0].shadowRoot", element)
driver.get("chrome://settings/content/siteDetails?site={}".format(url))
root1 = driver.find_element(WebBy.TAG_NAME, "settings-ui")
shadow_root1 = _shadow_root(driver, root1)
root2 = shadow_root1.find_element(WebBy.ID, "container")
root3 = root2.find_element(WebBy.ID, "main")
shadow_root3 = _shadow_root(driver, root3)
root4 = shadow_root3.find_element(WebBy.CLASS_NAME, "showing-subpage")
shadow_root4 = _shadow_root(driver, root4)
root5 = shadow_root4.find_element(WebBy.ID, "advancedPage")
root6 = root5.find_element(WebBy.TAG_NAME, "settings-privacy-page")
shadow_root6 = _shadow_root(driver, root6)
root7 = shadow_root6.find_element(WebBy.ID, "pages")
root8 = root7.find_element(WebBy.TAG_NAME, "settings-subpage")
root9 = root8.find_element(WebBy.TAG_NAME, "site-details")
shadow_root9 = _shadow_root(driver, root9)
root10 = shadow_root9.find_element(WebBy.ID, "plugins") # Flash
shadow_root10 = _shadow_root(driver, root10)
root11 = shadow_root10.find_element(WebBy.ID, "permission")
WebSelect(root11).select_by_value("allow")
Use above method as keyword in robotframework
save following code as test.robot
*** Settings ***
Library SeleniumLibrary
Library flash_helper.py
*** Test Case ***
Allow Flash In Chrome
Open Browser https://www.google.com chrome
# go to chrome settings and enable flash
${CURRENT_URL} Get Location
Allow Flash ${CURRENT_URL}
# revert to previous page
Go To ${CURRENT_URL}
# now Flash is enabled in chrome!!
Since EnableEphemeralFlashPermission can no longer be disabled in Chrome 71+ versions, here's a slightly modified version of #JohnoCrawford code that works for Chrome 81:
def add_flash_site(driver, web_url):
def expand_root_element(element):
return driver.execute_script("return arguments[0].shadowRoot", element)
driver.get("chrome://settings/content/siteDetails?site=" + web_url)
root1 = driver.find_element(By.TAG_NAME, "settings-ui")
shadow_root1 = expand_root_element(root1)
root2 = shadow_root1.find_element(By.ID, "container")
root3 = root2.find_element(By.ID, "main")
shadow_root3 = expand_root_element(root3)
root4 = shadow_root3.find_element(By.CLASS_NAME, "showing-subpage")
shadow_root4 = expand_root_element(root4)
root5 = shadow_root4.find_element(By.TAG_NAME, "settings-privacy-page")
shadow_root5 = expand_root_element(root5)
root6 = shadow_root5.find_element(By.ID, "pages")
root7 = root6.find_element(By.TAG_NAME, "settings-subpage")
root8 = root7.find_element(By.TAG_NAME, "site-details")
shadow_root8 = expand_root_element(root8)
root9 = shadow_root8.find_element(By.ID, "plugins")
shadow_root9 = expand_root_element(root9)
root10 = shadow_root9.find_element(By.ID, "permission")
Select(root10).select_by_value("allow")
I have the same answer as user BaiJiFeiLong
but i found that I had to change the line:
root5 = shadow_root4.find_element(By.ID, "advancedPage")
to:
root5 = shadow_root4.find_element(By.ID, "basicPage")
Because the original line was returning NoSuchElement error.
I have a program to download photos on various websites. Each url is formed at the end of the address by codes, which are accessed in a dataframe. In a dataframe of 8,583 lines
The sites have javascript, so I use selenium to access the src of the photos. And I download it with urllib.request.urlretrieve
Example of a photo site: http://divulgacandcontas.tse.jus.br/divulga/#/candidato/2018/2022802018/PB/150000608817
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
import urllib.request, urllib.parse, urllib.error
# Root URL of the site that is accessed to fetch the photo link
url_raiz = 'http://divulgacandcontas.tse.jus.br/divulga/#/candidato/2018/2022802018/'
# Accesses the dataframe that has the "sequencial" type codes
candidatos = pd.read_excel('candidatos_2018.xlsx',sheet_name='Sheet1', converters={'sequencial': lambda x: str(x), 'cpf': lambda x: str(x),'numero_urna': lambda x: str(x)})
# Function that opens each page and takes the link from the photo
def pegalink(url):
profile = webdriver.FirefoxProfile()
browser = webdriver.Firefox(profile)
browser.get(url)
time.sleep(10)
html = browser.page_source
soup = BeautifulSoup(html, "html.parser")
browser.close()
link = soup.find("img", {"class": "img-thumbnail img-responsive dvg-cand-foto"})['src']
return link
# Function that downloads the photo and saves it with the code name "cpf"
def baixa_foto(nome, url):
urllib.request.urlretrieve(url, nome)
# Iteration in the dataframe
for num, row in candidatos.iterrows():
cpf = (row['cpf']).strip()
uf = (row['uf']).strip()
print(cpf)
print("-/-")
sequencial = (row['sequencial']).strip()
# Creates full page address
url = url_raiz + uf + '/' + sequencial
link_foto = pegalink(url)
baixa_foto(cpf, link_foto)
Please I look guidance for:
Put a try-Exception type to wait for the page to load (I'm having errors reading the src - after many hits the site takes more than ten seconds to load)
And I would like to record all possible errors - in a file or dataframe - to write down the "sequencial" code that gave error and continue the program
Would anyone know how to do it? The guidelines below were very useful, but I was unable to move forward
I put in a folder a part of the data I use and the program, if you want to look: https://drive.google.com/drive/folders/1lAnODBgC5ZUDINzGWMcvXKTzU7tVZXsj?usp=sharing
put your code within :
try:
WebDriverWait(browser, 30).until(wait_for(page_has_loaded))
# here goes your code
except: Exception
print "This is an unexpected condition!"
For waitForPageToLoad :
def page_has_loaded():
page_state = browser.execute_script(
'return document.readyState;'
)
return page_state == 'complete'
30 above is time in seconds. You can adjust it as per your need.
Approach 2 :
class wait_for_page_load(object):
def __init__(self, browser):
self.browser = browser
def __enter__(self):
self.old_page = self.browser.find_element_by_tag_name('html')
def page_has_loaded(self):
new_page = self.browser.find_element_by_tag_name('html')
return new_page.id != self.old_page.id
def __exit__(self, *_):
wait_for(self.page_has_loaded)
def pegalink(url):
profile = webdriver.FirefoxProfile()
browser = webdriver.Firefox(profile)
browser.get(url)
try:
with wait_for_page_load(browser):
html = browser.page_source
soup = BeautifulSoup(html, "html.parser")
browser.close()
link = soup.find("img", {"class": "img-thumbnail img-responsive dvg-cand-foto"})['src']
except Exception:
print ("This is an unexpected condition!")
print("Erro em: ", url)
link = "Erro"
return link