X-Path - How To Pull This Field? - selenium

My code goes into a webpage, clicks on a record, which then drops other records.
Is there a way to use xPath to pull all of these drop-down titles?
Currently, I copied the full XPath of the first drop-down title, and it only pulls the first one.
That is fine, but how do I pull all entry titles that drop down?
My current code is specifically only for the first line
from selenium import webdriver
import time
# Opens each result page, expands every record and prints one title from the
# expanded drop-down.
driver = webdriver.Chrome()
for x in range(1, 2):
    driver.get(f'https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page={x}')
    time.sleep(4)  # fixed pause after loading the page
    # Count the records so each one can be re-located by its position.
    productlist_length = len(driver.find_elements_by_xpath("//div[@class='accordin_title']"))
    for i in range(1, productlist_length + 1):
        # XPath positions are 1-based, hence range(1, n + 1).
        product = driver.find_element_by_xpath("(//div[@class='accordin_title'])[" + str(i) + "]")
        title = product.find_element_by_xpath('.//h4').text.strip()
        buttonToClick = product.find_element_by_xpath('.//div[@class="sign"]')
        buttonToClick.click()
        time.sleep(5)  # fixed pause after the click
        # This is the full XPath; it is pinned to li[1], so it always reads
        # from the first record regardless of which one was just clicked.
        dropDownTitle = product.find_element_by_xpath('//*[@id="accordin"]/div/ul/li[1]/div[2]/div/ul/li/div[1]/div[3]/h4').text
        print(dropDownTitle)

So can you check with the below line of code
# Try to run it with the window maximized; sometimes the element is overlaid.
driver.maximize_window()
for x in range(1, 5):
    driver.get(f'https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page={x}')
    time.sleep(4)
    productlist_length = len(driver.find_elements_by_xpath("//div[@class='accordin_title']"))
    for i in range(1, productlist_length + 1):
        product = driver.find_element_by_xpath("(//div[@class='accordin_title'])[" + str(i) + "]")
        title = product.find_element_by_xpath('.//h4').text.strip()
        # At some point the drop-down does not display any records and a
        # plain click throws ClickInterceptedException, so an ActionChain is
        # used to move to the particular element before clicking it.
        buttonToClick = product.find_element_by_xpath('.//*[@class="info_right"]/h4')
        action = ActionChains(driver)
        action.move_to_element(buttonToClick).click().perform()
        time.sleep(5)
        # Providing the index of the li prints the title of that record.
        dropDownTitle = product.find_element_by_xpath("//*[@id='accordin']/div/ul/li[" + str(i) + "]/div[1]/div[3]/h4").text
        print(dropDownTitle)
Imports:
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
Output

Related

Unable to extract values linked to data points from an interactive line chart

I am trying to extract the percentage values (Target Rate Probability) from an interactive chart with preselected conditions "13 Dec23" and "Historical" on the following website: https://www.cmegroup.com/markets/interest-rates/cme-fedwatch-tool.html?redirect=/trading/interest-rates/countdown-to-fomc.html. The percentage probabilities only appear when I hover over the respective data points with a mouse.
I have tried the following:
XPath of tried element: /html/body/form/div[3]/div[2]/div[3]/div[1]/div/div/div[1]/div/div[3]/div[3]/div/div/div/div[1]/div/svg/g[5]/g[2]/path[2]
1. Method Approach: Webdriverwait in order for the elements to load on the page
1. Method Error: TimeoutException
2. Method Approach:
driver.execute_script("document.evaluate('/html/body
/form/div[3]/div[2]/div[3]/div[1]/div/div/div[1]/div/div[3]/div[3]/div/div/div/div[1]/div/svg/g[5]/g[2]/path[2]', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue.style.display = 'block';")
2. Method Error:
selenium.common.exceptions.JavascriptException: Message: javascript error: Cannot read properties of null (reading 'style')
3. Method Approach:
driver.execute_script("document.evaluate('/html/body/form
/div[3]/div[2]/div[3]/div[1]/div/div/div[1]/div/div[3]/div[3]/div/div/div/div[1]/div/svg/g[5]/g[2]/path[2]', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue.style.visibility = 'visible';")
3. Method Error:
selenium.common.exceptions.JavascriptException: Message: javascript error: Cannot read properties of null (reading 'style')
EC: Invisibility of element is True
**Code: **
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import datetime as dt
import time
# Opens the FedWatch tool, selects a meeting tab and the "Historical" view,
# then tries to sweep the mouse across the chart and collect tooltip values.
driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
driver.get('https://www.cmegroup.com/markets/interest-rates/cme-fedwatch-tool.html?redirect=/trading/interest-rates/countdown-to-fomc.html')
time.sleep(10)
page_source = driver.page_source
time.sleep(5)
driver.switch_to.frame('cmeIframe-jtxelq2f')
time.sleep(5)
driver.find_element(By.XPATH, "/html/body/form/div[3]/div[2]/div[3]/div[1]/div/div/div[1]/div/div[3]/div[3]/div/div/ul/li[8]/a").click()
time.sleep(5)
driver.find_element(By.CSS_SELECTOR, "#ctl00_MainContent_ucViewControl_IntegratedFedWatchTool_lbHistorical").click()
time.sleep(10)

# 175-200 target rate probability
action = webdriver.ActionChains(driver)
CSS_Selector = '#highcharts-986wlrn-17 > svg > g.highcharts-series-group > g.highcharts-markers.highcharts-series-0.highcharts-line-series.highcharts-color-0.highcharts-tracker > path.highcharts-halo.highcharts-color-0'

# Wait for the element
wait = WebDriverWait(driver, 10)
# Check if the element is invisible (IT IS)
print(wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, CSS_Selector))))

# Change data-z-index to '4' to put it in front of the other elements (DOESNT WORK!)
# The selector is passed as a script argument instead of being repeated inline.
driver.execute_script("document.querySelector(arguments[0]).setAttribute('data-z-index', '4')", CSS_Selector)
time.sleep(10)

# Doesn't find the first element of 175-200 (DOESNT WORK!)
element = driver.find_element(By.CSS_SELECTOR, CSS_Selector)
# Move the mouse over the very left first element of 175-200 (DOESNT WORK!)
action.move_to_element(element)
action.perform()

# Location and size, meant for moving the mouse along the interactive chart for 175-200
loc = element.location
size = element.size
# Moving to the right end of the interactive chart for 175-200 (DOESNT WORK!)
action.move_to_element_with_offset(element, 510, 0).perform()  # found 510 when inspecting the chart metrics

# First date (found on the very left of the chart) (DOESNT WORK!)
driver.find_element(By.XPATH, '/html/body/form/div[3]/div[2]/div[3]/div[1]/div/div/div[1]/div/div[3]/div[3]/div/div/div/div[1]/div/svg/g[10]/g/text/tspan[1]')
# First value (found on the very left of the chart) (DOESNT WORK!)
driver.find_element(By.XPATH, '/html/body/form/div[3]/div[2]/div[3]/div[1]/div/div/div[1]/div/div[3]/div[3]/div/div/div/div[1]/div/svg/g[5]/g[2]/path[1]')

# Format for labels such as 'Thu, May 5, 2022': weekday, month name, day, year.
# The previous format used '%-m' (not accepted by strptime) and a second '%B'
# where the year belongs, so parsing raised ValueError.
# NOTE(review): switch '%B' to '%b' if the tooltip abbreviates month names.
DATE_FORMAT = '%a, %B %d, %Y'

# Limit at which to break out of the while loop
limit = dt.datetime.strptime('Thu, May 5, 2022', DATE_FORMAT)
pace = -5
dictionary = {}
while True:
    # Moving back towards the very left first element of the interactive chart
    action.move_by_offset(pace, 0).perform()  # moving by a pace of -5 (-5 is a trial value)
    date = driver.find_element(By.CSS_SELECTOR, '#highcharts-986wlrn-17 > svg > g.highcharts-tooltip > g.highcharts-label.highcharts-tooltip-header.highcharts-tooltip-box > text').text
    value = driver.find_element(By.CSS_SELECTOR, '#highcharts-986wlrn-17 > svg > g.highcharts-tooltip > g.highcharts-label.highcharts-tooltip-box.highcharts-color-0 > text').text
    if dt.datetime.strptime(date, DATE_FORMAT) < limit:
        break
    # Add the result to the dictionary; the first value seen for a date is kept.
    dictionary.setdefault(date, value)
driver.quit()
[![Interactive Line Chart][1]][1]
[1]: https://i.stack.imgur.com/ZDHQt.jpg
The code below assumes that you already clicked on "13 Dec23" and "Historical":
import time
import pyautogui # pip install pyautogui
# sleep 3 seconds so that you can switch to the browser and move the mouse to the start position
time.sleep(3)
# start position: put mouse on chart's left border, for example between 0% and 20%
start_pos = pyautogui.position()


def _read_tooltip():
    """Return the current tooltip text with newlines removed, split on the '●' bullet."""
    return driver.find_element(By.CSS_SELECTOR, 'g.highcharts-tooltip').text.replace('\n', '').split('●')


# this works only if the tooltip containing the percentage values has already been displayed once
old_values = _read_tooltip()
data = []
for k in range(500):
    # Step the mouse 3 pixels to the right per iteration, keeping the same height.
    pyautogui.moveTo(start_pos.x + 3 * k, start_pos.y)
    new_values = _read_tooltip()
    # Record a reading only when the tooltip content has changed.
    if new_values != old_values:
        old_values = new_values
        data.append(new_values)
    # uncomment this if you want the mouse to move slower
    # time.sleep(.1)
data will then be a list of lists, where each list contains the day label and the corresponding percentage values (as strings)
[['Thu, May 5, 2022',
'350-375: 17.18%',
'325-350: 28.11%',
'300-325: 27.01%',
'250-275: 4.39%',
'275-300: 14.80%',
'225-250: 0.65%',
'375-400: 6.27%',
'400-425: 1.36%',
'425-450: 0.17%'],
['Fri, May 6, 2022',
'350-375: 15.54%',
'325-350: 26.14%',
'300-325: 26.94%',
'250-275: 6.08%',
'275-300: 16.73%',
'225-250: 1.24%',
'375-400: 5.71%',
'400-425: 1.29%',
'425-450: 0.17%',
'200-225: 0.14%'],
... etc ...

Trying to fetch option chain data from NSE... but getting error using Chromedriver

Below is the code through which I am trying to fetch option chain data but I am getting errors.
Any help will be appreciated.
Thanks
Also, if someone can help me with a better code to record tick data, that would be great
import time
import datetime
import pandas as pd
from bs4 import BeautifulSoup,SoupStrainer
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver import Chrome as uc
# options=chrome_options
# Polls the NSE derivatives page, parses the option chain table on each
# refresh and appends one DataFrame row per table row.
df = pd.DataFrame(columns=['SNO','call OI','call CHNG IN OI','call VOLUME','call IV','call LTP','call CHNG','call BID QTY','call BID PRICE','call ASK PRICE','call ASK QTY','STRIKE PRICE','put BID QTY','put BID PRICE','put ASK PRICE','put ASK QTY','put CHNG','put LTP','put IV','put VOLUME','put CHNG IN OI','put OI'])
chrome_options = Options()
chrome_options.add_argument("--log-level=3")
from selenium.webdriver import Chrome as uc
# `Chrome` is the driver class and does not provide `ChromeOptions`, so the
# former `uc.ChromeOptions()` call is dropped. The Options object created
# above is reused, which also keeps the --log-level setting.
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(executable_path=r"C:\Users\rohit taparia\Downloads\chromedriver_win32\chromedriver.exe",options=chrome_options)
driver.get('https://www.nseindia.com/get-quotes/derivatives?symbol=BANKNIFTY')
driver.minimize_window()
time.sleep(3)
for j in range(0, 50):
    print(j)
    # refresh the page and read data again
    driver.refresh()
    continue_link1 = driver.find_element_by_xpath('''//*[@id="subtab-derivatives"]/div[2]/nav/div/div/a[2]''')
    time.sleep(10)
    filter_tag = SoupStrainer("table")
    continue_link1.click()
    time.sleep(3)
    rtime = str(driver.find_element_by_xpath('''//*[@id="asondate"]''').text)
    if rtime == '':
        continue
    print(rtime)
    page = driver.page_source
    # Only <table> elements are parsed (see filter_tag).
    soup = BeautifulSoup(page, "html.parser", parse_only=filter_tag)
    gdp_table = soup.find("table", attrs={"id": "optionChainTable-indices"})
    # Skip this pass when the table (or its body) is not in the page source.
    if gdp_table is None or gdp_table.tbody is None:
        continue
    gdp_table_data = gdp_table.tbody.find_all("tr")
    if len(gdp_table_data) == 1:
        continue
    for row in gdp_table_data:
        list1 = []
        for td in row.find_all("td"):
            # remove any newlines and extra spaces from left and right,
            # then drop thousands separators
            cell_text = td.text.strip().replace(',', '')
            # empty cells and '-' placeholders are stored as 0
            if cell_text in ('', '-'):
                cell_text = '0'
            list1.append(cell_text)
        if not list1:
            continue
        # The first and last cells are discarded before numeric conversion.
        del list1[0]
        del list1[-1]
        list1 = list(map(float, list1))
        # The timestamp read from the page goes into the first column.
        list1.insert(0, rtime)
        df.loc[len(df)] = list1
# NOTE(review): the original indentation was lost; writing the workbook once
# after all refreshes is assumed here.
df.to_excel("option-data.xlsx")

How to get value of a cell in html page when click to a link in list link?

I have a list of about 5000 links.
For example, 2 of the 5000 links:
https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019
https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019
...
I want to get value of column Time of Day and row Finish of links.
Ex:
09:51:07 AM - https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019
07:50:55 AM - https://racevietnam.com/runner/ngocsondknb/ecopark-marathon-2019
I previously got user info from a website whose elements have an id and class (code below). But the table at https://racevietnam.com/runner/ngocsondknb/ecopark-marathon-2019 has no id or class, so I can't use the same approach.
#!/usr/bin/python
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Walks member pages by numeric id, collects each user name and appends it to
# result.txt.
list_user = []
# The output file is opened once for the whole run instead of once per user.
with open("result.txt", "a") as myfile:
    for userID in range(1, 100000):
        link = "https://example.com/member.php?u=" + str(userID)
        html = urlopen(link)
        bsObj = BeautifulSoup(html, "lxml")
        info = bsObj.find("div", {"id": "main_userinfo"})
        # Skip pages that do not contain the user-info heading instead of
        # failing with AttributeError on None.
        if info is None or info.h1 is None:
            continue
        user_name = info.h1.get_text()
        list_user.append(user_name)
        print("username", userID, "is: ", user_name)
        # One name per line; without the separator all names run together.
        myfile.write(user_name + "\n")
Please help me.
Thank you.
Using bs4 4.7.1.
There is only one table and you want the second column (td) of the last row. You can use :last-child to select the last row; it should be used in conjunction with the tbody type selector and the > child combinator, so as not to get the header row. You can use :nth-of-type to specify the td cell to return.
Now you may wish to develop this in at least two ways:
Handle cases where not found e.g.
name = getattr(soup.select_one('title'), 'text', 'N/A')
timing = getattr(soup.select_one('tbody > tr:last-child td:nth-of-type(2)'), 'text', 'N/A')
Add items to lists/data structure, which can be output as a dataframe at end and written out as csv. Or you may wish to stick with your current method
Python:
import requests
from bs4 import BeautifulSoup as bs
urls = [
    'https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019',
    'https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019',
]

# A single session is used for every request.
with requests.Session() as session:
    for page_url in urls:
        soup = bs(session.get(page_url).content, 'lxml')
        # Page title, printed alongside the timing.
        name = soup.select_one('title').text
        # Second cell of the last row inside tbody.
        timing = soup.select_one('tbody > tr:last-child td:nth-of-type(2)').text
        print(name, timing)
This is my code.
It's working Ok.
import requests
from bs4 import BeautifulSoup
# Read the list of runner URLs, one per line.
with open("input.ecopark", "r") as f:
    f_content = f.readlines()

for line in f_content:
    target = line.rstrip()
    r = requests.get(target)
    soup = BeautifulSoup(r.text, 'html.parser')
    cells = iter(soup.select("table tbody tr td"))
    # Scan the cells until the one labelled "Finish"; the cell right after it
    # is printed next to the URL, and the rest of the table is ignored.
    for cell in cells:
        if cell.get_text() != "Finish":
            continue
        following = next(cells, None)
        if following is not None:
            print(target + " " + following.get_text())
        break

How to get ASINs XPATH from 2 different Amazon pages that have the same parent nodes?

I made a web scraping program using Python and webdriver and I want to extract the ASIN from 2 different pages. I would like the XPath to work for both links at the same time.
These are the amazon pages:https://www.amazon.com/Hydro-Flask-Wide-Mouth-Flip/dp/B01ACATW7E/ref=sr_1_3?s=kitchen&ie=UTF8&qid=1520348607&sr=1-3&keywords=-gfds and
https://www.amazon.com/Ubbi-Saving-Special-Required-Locking/dp/B00821FLSU/ref=sr_1_1?s=baby-products&ie=UTF8&qid=1520265799&sr=1-1&keywords=-hgfd&th=1. They have the same parent nodes(id, classes). How can I make this program work for these 2 links at the same time?
So the problem is on these lines of code: 36, 41
asin = driver.find_element_by_xpath('//div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[4]').text
and
asin = driver.find_element_by_xpath('//div[@id="detail-bullets_feature_div"]/div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[5]').text. I have to change these lines to output in the csv the ASINs for these 2 products. For the first link it prints the wrong information and for the second it prints the ASIN.
I attached the code. I will appreciate any help.
from selenium import webdriver
import csv
import io
# set the proxies to hide actual IP
proxies = {
    'http': 'http://5.189.133.231:80',
    'https': 'https://27.111.43.178:8080'
}
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server="%s"' % ';'.join(['%s=%s' % (k, v) for k, v in proxies.items()]))
# Raw string: same path as before, without relying on unrecognised backslash
# escapes being passed through.
driver = webdriver.Chrome(executable_path=r"C:\Users\Andrei-PC\Downloads\webdriver\chromedriver.exe",
                          chrome_options=chrome_options)

header = ['Product title', 'ASIN']
# Start a fresh CSV containing only the header; the mode matches the append
# below (newline="" as the csv module expects, UTF-8 encoding).
with open('csv/bot_1.csv', "w", newline="", encoding="utf-8") as output:
    writer = csv.writer(output)
    writer.writerow(header)

links = ['https://www.amazon.com/Hydro-Flask-Wide-Mouth-Flip/dp/B01ACATW7E/ref=sr_1_3?s=kitchen&ie=UTF8&qid=1520348607&sr=1-3&keywords=-gfds',
         'https://www.amazon.com/Ubbi-Saving-Special-Required-Locking/dp/B00821FLSU/ref=sr_1_1?s=baby-products&ie=UTF8&qid=1520265799&sr=1-1&keywords=-hgfd&th=1'
         ]

for link in links:
    driver.get(link)
    product_title = driver.find_elements_by_xpath('//*[@id="productTitle"][1]')
    prod_title = [x.text for x in product_title]

    # Reset per page so a failed lookup cannot reuse the previous page's value.
    asin = None
    try:
        asin = driver.find_element_by_xpath('//div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[4]').text
    except Exception:
        print('no ASIN template one')
    # The second template overrides the first whenever it matches.
    try:
        asin = driver.find_element_by_xpath('//div[@id="detail-bullets_feature_div"]/div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[5]').text
    except Exception:
        print('no ASIN template two')

    # Without a title or an ASIN there is nothing valid to write; skip the
    # row rather than writing stale or undefined data.
    if not prod_title or asin is None:
        print('no items v3 ')
        continue
    data = [prod_title[0], asin]

    with io.open('csv/bot_1.csv', "a", newline="", encoding="utf-8") as output:
        writer = csv.writer(output)
        writer.writerow(data)
You can simply use
//li[b="ASIN:"]
to get required element on both pages

Not scraping xpath correctly

I'm trying to use the following xpath for this page but it is not loading correctly.
groups = ".//*[contains(@class, 'sl-CouponParticipantWithBookCloses_Name ')]"
xp_bp1 = ".//following::div[contains(@class,'sl-MarketCouponValuesExplicit33')][./div[contains(@class,'gl-MarketColumnHeader')][.='1']]//span[@class='gl-ParticipantOddsOnly_Odds']"
The output currently is..
[['3.00'], ['3.00'], ['3.00'] etc,,
Desired:
[['3.00'], ['1.30'], ['1.25'] etc,,
Data I am after
Script:
import csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
# Repeatedly reads the odds next to each participant row and appends them,
# together with the current URL, to test.csv.
driver = webdriver.Chrome()
driver.set_window_size(1024, 600)
driver.maximize_window()
# The page is requested twice, as in the original script.
driver.get('https://www.bet365.com.au/#/AC/B1/C1/D13/E108/F16/S1/')
driver.get('https://www.bet365.com.au/#/AC/B1/C1/D13/E108/F16/S1/')
time.sleep(10)

groups = ".//*[contains(@class, 'sl-CouponParticipantWithBookCloses_Name ')]"
#//div[contains(@class, 'gl-ParticipantOddsOnlyDarker gl-ParticipantOddsOnly gl-Participant_General sl-MarketCouponAdvancedBase_LastChild ')]
xp_bp1 = ".//following::div[contains(@class,'sl-MarketCouponValuesExplicit33')][./div[contains(@class,'gl-MarketColumnHeader')][.='1']]//span[@class='gl-ParticipantOddsOnly_Odds']"

while True:
    try:
        time.sleep(2)
        data = []
        for elem in driver.find_elements_by_xpath(groups):
            # A row whose odds cannot be read is recorded as None so the pass
            # continues with the remaining rows.
            try:
                bp1 = elem.find_element_by_xpath(xp_bp1).text
            except Exception:
                bp1 = None
            data.append([bp1])
        print(data)
        url1 = driver.current_url
        with open('test.csv', 'a', newline='', encoding="utf-8") as outfile:
            writer = csv.writer(outfile)
            for row in data:
                writer.writerow(row + [url1])
    except TimeoutException as ex:
        pass
    except NoSuchElementException as ex:
        print(ex)
        break