Unable to extract values linked to data points from an interactive line chart - selenium

I am trying to extract the percentage values (Target Rate Probability) from an interactive chart, with the conditions "13 Dec23" and "Historical" preselected, on the following website: https://www.cmegroup.com/markets/interest-rates/cme-fedwatch-tool.html?redirect=/trading/interest-rates/countdown-to-fomc.html. The percentage probabilities only appear when I hover over the respective data points with the mouse.
I have tried the following:
XPath of the element I tried: /html/body/form/div[3]/div[2]/div[3]/div[1]/div/div/div[1]/div/div[3]/div[3]/div/div/div/div[1]/div/svg/g[5]/g[2]/path[2]
1. Method approach: WebDriverWait, so that the elements have time to load on the page
1. Method error: TimeoutException
2. Method approach:
driver.execute_script("document.evaluate('/html/body/form/div[3]/div[2]/div[3]/div[1]/div/div/div[1]/div/div[3]/div[3]/div/div/div/div[1]/div/svg/g[5]/g[2]/path[2]', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue.style.display = 'block';")
2. Method error:
selenium.common.exceptions.JavascriptException: Message: javascript error: Cannot read properties of null (reading 'style')
3. Method approach:
driver.execute_script("document.evaluate('/html/body/form/div[3]/div[2]/div[3]/div[1]/div/div/div[1]/div/div[3]/div[3]/div/div/div/div[1]/div/svg/g[5]/g[2]/path[2]', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue.style.visibility = 'visible';")
3. Method error:
selenium.common.exceptions.JavascriptException: Message: javascript error: Cannot read properties of null (reading 'style')
EC: invisibility_of_element_located returns True for the element
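For reference, here is a minimal sketch (my untested assumption, since the iframe id looks auto-generated) of waiting for the FedWatch iframe and the chart explicitly instead of relying on fixed sleeps:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 30)
# switch into the FedWatch iframe without hard-coding the generated id (assumed prefix 'cmeIframe')
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe[id^='cmeIframe']")))
# wait until the Highcharts series group exists inside the frame before locating data points
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'g.highcharts-series-group')))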
Code:
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import datetime as dt
import time
driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
driver.get('https://www.cmegroup.com/markets/interest-rates/cme-fedwatch-tool.html?redirect=/trading/interest-rates/countdown-to-fomc.html')
time.sleep(10)
page_source=driver.page_source
time.sleep(5)
driver.switch_to.frame('cmeIframe-jtxelq2f')
time.sleep(5)
driver.find_element(By.XPATH,"/html/body/form/div[3]/div[2]/div[3]/div[1]/div/div/div[1]/div/div[3]/div[3]/div/div/ul/li[8]/a").click()
time.sleep(5)
driver.find_element(By.CSS_SELECTOR,"#ctl00_MainContent_ucViewControl_IntegratedFedWatchTool_lbHistorical").click()
time.sleep(10)
# 175-200 target rate probability
action = webdriver.ActionChains(driver)
CSS_Selector = '#highcharts-986wlrn-17 > svg > g.highcharts-series-group > g.highcharts-markers.highcharts-series-0.highcharts-line-series.highcharts-color-0.highcharts-tracker > path.highcharts-halo.highcharts-color-0'
# Wait for the element
wait = WebDriverWait(driver, 10)
# Check, if the element is invisible (IT IS)
print(wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, CSS_Selector))))
# Change 'data-z-index' to '4' to put the element in front of the other elements (DOESN'T WORK!)
driver.execute_script("document.querySelector('#highcharts-986wlrn-17 > svg > g.highcharts-series-group > g.highcharts-markers.highcharts-series-0.highcharts-line-series.highcharts-color-0.highcharts-tracker > path.highcharts-halo.highcharts-color-0').setAttribute('data-z-index', '4')")
time.sleep(10)
# Doesn't find the first element of 175-200 (DOESN'T WORK!)
element = driver.find_element(By.CSS_SELECTOR,'#highcharts-986wlrn-17 > svg > g.highcharts-series-group > g.highcharts-markers.highcharts-series-0.highcharts-line-series.highcharts-color-0.highcharts-tracker > path.highcharts-halo.highcharts-color-0')
# Move the mouse over the very left (first) element of 175-200 (DOESN'T WORK!)
action.move_to_element(element)
action.perform()
# Determine the location and size so the mouse can move along the interactive chart for 175-200
loc = element.location
size = element.size
# Move to the right end of the interactive chart for 175-200 (DOESN'T WORK!)
action.move_to_element_with_offset(element, 510, 0).perform()  # found 510 when inspecting the chart metrics
# First date (found on the very left of the chart) (DOESN'T WORK!)
driver.find_element(By.XPATH,'/html/body/form/div[3]/div[2]/div[3]/div[1]/div/div/div[1]/div/div[3]/div[3]/div/div/div/div[1]/div/svg/g[10]/g/text/tspan[1]')
# First value (found on the very left of the chart) (DOESN'T WORK!)
driver.find_element(By.XPATH,'/html/body/form/div[3]/div[2]/div[3]/div[1]/div/div/div[1]/div/div[3]/div[3]/div/div/div/div[1]/div/svg/g[5]/g[2]/path[1]')
# Set the limit of when to break out of the while loop (DOESN'T WORK!)
limit = dt.datetime.strptime('Thu, May 5, 2022', '%a, %b %d, %Y')
pace = -5
dictionary = {}
while True:
    # moving back towards the very left (first) element of the interactive chart
    action.move_by_offset(pace, 0).perform()  # moving by a pace of -5 (-5 is a trial value)
    date = driver.find_element(By.CSS_SELECTOR,'#highcharts-986wlrn-17 > svg > g.highcharts-tooltip > g.highcharts-label.highcharts-tooltip-header.highcharts-tooltip-box > text').text
    value = driver.find_element(By.CSS_SELECTOR,'#highcharts-986wlrn-17 > svg > g.highcharts-tooltip > g.highcharts-label.highcharts-tooltip-box.highcharts-color-0 > text').text
    if dt.datetime.strptime(date, '%a, %b %d, %Y') < limit:
        break
    # add the result to the dictionary if the date is not there yet
    if date not in dictionary:
        dictionary[date] = value
driver.quit()
Screenshot of the interactive line chart: https://i.stack.imgur.com/ZDHQt.jpg

The code below assumes that you already clicked on "13 Dec23" and "Historical":
import time
import pyautogui  # pip install pyautogui

# sleep 3 seconds so that you can switch to the browser and move the mouse to the start position
time.sleep(3)
# start position: put the mouse on the chart's left border, for example between 0% and 20%
start_pos = pyautogui.position()
# this works only if the tooltip containing the percentage values has already been displayed once
old_values = driver.find_element(By.CSS_SELECTOR, 'g.highcharts-tooltip').text.replace('\n','').split('●')
data = []
for k in range(500):
    pyautogui.moveTo(start_pos.x + 3*k, start_pos.y)
    new_values = driver.find_element(By.CSS_SELECTOR, 'g.highcharts-tooltip').text.replace('\n','').split('●')
    if new_values != old_values:
        old_values = new_values
        data.append(new_values)
    # uncomment this if you want the mouse to move slower
    # time.sleep(.1)
data will then be a list of lists, where each inner list contains the day label and the corresponding percentage values (as strings); a small post-processing sketch follows the sample output below:
[['Thu, May 5, 2022',
'350-375: 17.18%',
'325-350: 28.11%',
'300-325: 27.01%',
'250-275: 4.39%',
'275-300: 14.80%',
'225-250: 0.65%',
'375-400: 6.27%',
'400-425: 1.36%',
'425-450: 0.17%'],
['Fri, May 6, 2022',
'350-375: 15.54%',
'325-350: 26.14%',
'300-325: 26.94%',
'250-275: 6.08%',
'275-300: 16.73%',
'225-250: 1.24%',
'375-400: 5.71%',
'400-425: 1.29%',
'425-450: 0.17%',
'200-225: 0.14%'],
... etc ...
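If you want the percentages as numbers rather than strings, here is a minimal post-processing sketch (assuming pandas is installed; the column names are only illustrative):
import pandas as pd

# flatten `data` from the loop above: each inner list starts with the day label,
# followed by "range: percent" strings
rows = []
for day, *probs in data:
    for p in probs:
        target_range, pct = p.split(': ')
        rows.append({'date': day,
                     'target_rate': target_range,
                     'probability': float(pct.rstrip('%')) / 100})

df = pd.DataFrame(rows)
print(df.head())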

Related

How to Screenshot a Website Element without Collapsible Division using Selenium?

I want to screenshot just the US hot spots map in https://www.nytimes.com/interactive/2021/us/covid-cases.html, but the collapsible division at the very bottom (that says "Thanks for reading The Times") keeps coming up in the screenshot:
How can I exclude that?
Also, ideally the New York Times banner at the top would be cropped out. I used Pillow's Image.crop() to crop from the first image captured, but I wonder if there is a more convenient/elegant way to achieve that. Any thoughts? Thank you!
Here's my code:
from Screenshot import Screenshot
from selenium.webdriver.common.by import By
from selenium import webdriver
from PIL import Image
ob = Screenshot.Screenshot()
driver = webdriver.Chrome()
driver.page_load_strategy = 'none'
url = "https://www.nytimes.com/interactive/2021/us/covid-cases.html"
driver.get(url)
class_name = "mapboxgl-canvas"
element = driver.find_element(By.CLASS_NAME, class_name)
element.screenshot('{}.png'.format(class_name))
location = element.location
size = element.size
print(class_name, 'location:', location, 'size:', size, '\n')
location = element.location
size = element.size
x = location['x']
# y = location['y']
y = 30
w = x + size['width']
h = y + size['height']
# x = 0; y = 10; w = 950; h = 600
fullImg = Image.open("mapboxgl-canvas.png")
cropImg = fullImg.crop((x, y, w, h))
cropImg.save('cropImage.png')
driver.close()
driver.quit()
After tons of trials and errors, finally I got my code to work. The key is to suppress the expandable dock and re-position the capturing window in js.
Attached is the map with top banner and bottom mapbox logo cropped out.
import platform
from Screenshot import Screenshot
from selenium.webdriver.common.by import By
from PIL import Image, ImageDraw
from selenium import webdriver
from selenium.webdriver import ChromeOptions
import numpy as np
#-------------------------------- Part 1 Take a Screenshot of the Map -----------------------------------
#Set up Chrome driver
options = ChromeOptions()
options.headless = True
options.add_argument('--page_load_strategy = none') #suppress browser window
driver = webdriver.Chrome(executable_path=('/usr/local/bin/' if platform.system() == 'Linux' else '') + 'chromedriver',
                          options=options)
# driver.get_window_rect()
_ = driver.set_window_rect(x=0, y=50, width=1000, height=1000)
# driver.execute_script("document.body.style.zoom='145%'")
#Load the website to capture screenshot
url = "https://www.nytimes.com/interactive/2021/us/covid-cases.html"
driver.get(url)
driver.execute_script("""window.scrollTo(0, 2071);
var element = document.querySelector('[data-testid="expanded-dock"]');
element.style.visibility = 'collapse';
element.style.height = '0px';""") #Suppress the randomly pop-out expandable dock at the bottom which messes up the screenshot window
#Identify the element to screenshot, which is the hot spots map
#-------------------------------------------------------------------------------------------------------
#Can't use class_name = "mapboxgl-canary",'multi-map', "map-wrap", "mapboxgl-interactive"
#These class_names are the same as "mapboxgl-canvas": "aspect-ratio-outer", "aspect-ratio-inner", "map", "mapboxgl-map"
#Using ID = "maps" works too, just y location is different. Stick to class_name = "mapboxgl-canvas"
# tag = "maps"
# element = driver.find_element(By.ID, tag)
#-------------------------------------------------------------------------------------------------------
tag = "mapboxgl-canvas"
element = driver.find_element(By.CLASS_NAME, tag)
img_path = '{}.png'.format(tag)
_ = element.screenshot(img_path)
#Check map location and size for window.scrollTo coordinates
location = element.location
size = element.size
print(tag, 'location:', location, 'size:', size, '\n')
# Make sure window.scrollTo = y location = 2382 and height and width stay at 643 and 919 when configuring set_window_rect()
# mapboxgl-canvas location: {'x': 30, 'y': 2382} size: {'height': 643, 'width': 919}
#Crop image to remove unwanted pixels
# x = location['x']
# y = location['y']
x=0; y=30 #Start from a lower position to crop the top banner
w = x + size['width']
h = y + size['height'] - 60 #Subtract from height to remove mapbox logo at the bottom
fullImg = Image.open(img_path)
cropImg = fullImg.crop((x, y, w, h)) #(left, upper, right, lower)-tuple
cropImg.save('cropImage.png')
fullImg.close()
driver.close()
driver.quit()
#--------- Part 2 Mask unwanted parts of the image (top right size control, P.R. region at bottom right) -----------
im = Image.open('cropImage.png')
draw = ImageDraw.Draw(im)
#Vertices of masking rectangles, one for top right size control, the other for bottom right Puerto Rico
top_left = (cropImg.width -41, 0)
bottom_right = (cropImg.width - 8, 40)
top_left2 = (cropImg.width - 100, cropImg.height - 45)
bottom_right2 = (cropImg.width - 40, cropImg.height - 15)
draw.rectangle((top_left, bottom_right), fill=(255, 255, 255))
draw.rectangle((top_left2, bottom_right2), fill=(255, 255, 255))
# Save final image
im.save('cropImage1.png')

Trying to fetch option chain data from NSE... but getting error using Chromedriver

Below is the code through which I am trying to fetch option chain data, but I am getting errors. Any help will be appreciated, thanks.
Also, if someone can help me with better code to record tick data, that would be great.
import time
import datetime
import pandas as pd
from bs4 import BeautifulSoup,SoupStrainer
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver import Chrome as uc
# options=chrome_options
df=pd.DataFrame(columns=['SNO','call OI','call CHNG IN OI','call VOLUME','call IV','call LTP','call CHNG','call BID QTY','call BID PRICE','call ASK PRICE','call ASK QTY','STRIKE PRICE','put BID QTY','put BID PRICE','put ASK PRICE','put ASK QTY','put CHNG','put LTP','put IV','put VOLUME','put CHNG IN OI','put OI'])
chrome_options = Options()
chrome_options.add_argument("--log-level=3")
from selenium.webdriver import Chrome as uc
chrome_options = uc.ChromeOptions() #new solution
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(executable_path=r"C:\Users\rohit taparia\Downloads\chromedriver_win32\chromedriver.exe",options=chrome_options)
driver.get('https://www.nseindia.com/get-quotes/derivatives?symbol=BANKNIFTY')
driver.minimize_window()
time.sleep(3)
for j in range(0, 50):
    print(j)
    # refresh the page and read the data again
    driver.refresh()
    continue_link1 = driver.find_element_by_xpath('''//*[@id="subtab-derivatives"]/div[2]/nav/div/div/a[2]''')
    time.sleep(10)
    filter_tag = SoupStrainer("table")
    continue_link1.click()
    time.sleep(3)
    rtime = str(driver.find_element_by_xpath('''//*[@id="asondate"]''').text)
    if rtime == '':
        continue
    print(rtime)
    page = driver.page_source
    soup = BeautifulSoup(page, "html.parser", parse_only=filter_tag)
    gdp_table = soup.find("table", attrs={"id": "optionChainTable-indices"})
    gdp_table_data = gdp_table.tbody.find_all("tr")
    if len(gdp_table_data) == 1:
        continue
    else:
        for i in range(0, len(gdp_table_data)):
            list1 = []
            for td in gdp_table_data[i].find_all("td"):
                # remove any newlines and extra spaces from left and right
                cell_text = td.text
                if cell_text is None or cell_text == '':
                    cell_text = '0'
                cell_text = cell_text.replace(',', '')
                list1.append(cell_text)
            if len(list1) > 0:
                list1 = ['0' if i == '-' else i for i in list1]
            else:
                continue
            del list1[0]
            del list1[-1]
            list1 = list(map(float, list1))
            list1.insert(0, rtime)
            df.loc[len(df)] = list1
df.to_excel("option-data.xlsx")

X-Path - How To Pull This Field?

My code goes into a webpage and clicks on a record, which then drops down other records.
Is there a way to use XPath to pull all of these drop-down titles?
Currently, I copied the full XPath of the first drop-down title, and it only pulls the first one.
That is fine, but how do I pull all entry titles that drop down?
My current code is specific to the first line only:
from selenium import webdriver
import time

driver = webdriver.Chrome()
for x in range(1, 2):
    driver.get(f'https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page={x}')
    time.sleep(4)
    productlist_length = len(driver.find_elements_by_xpath("//div[@class='accordin_title']"))
    for i in range(1, productlist_length + 1):
        product = driver.find_element_by_xpath("(//div[@class='accordin_title'])[" + str(i) + "]")
        title = product.find_element_by_xpath('.//h4').text.strip()
        buttonToClick = product.find_element_by_xpath('.//div[@class="sign"]')
        buttonToClick.click()
        time.sleep(5)
        dropDownTitle = product.find_element_by_xpath('//*[@id="accordin"]/div/ul/li[1]/div[2]/div/ul/li/div[1]/div[3]/h4').text  # this line is the full XPath
        print(dropDownTitle)
So can you check with the below lines of code:
# try to execute it in maximized mode; sometimes the element is overlaid
driver.maximize_window()
for x in range(1, 5):
    driver.get(f'https://library.iaslc.org/conference-program?product_id=24&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page={x}')
    time.sleep(4)
    productlist_length = len(driver.find_elements_by_xpath("//div[@class='accordin_title']"))
    for i in range(1, productlist_length + 1):
        product = driver.find_element_by_xpath("(//div[@class='accordin_title'])[" + str(i) + "]")
        title = product.find_element_by_xpath('.//h4').text.strip()
        # At some point the dropdown doesn't display any records and throws ElementClickInterceptedException,
        # so use an ActionChain to move to the particular element before clicking
        buttonToClick = product.find_element_by_xpath('.//*[@class="info_right"]/h4')
        action = ActionChains(driver)
        action.move_to_element(buttonToClick).click().perform()
        time.sleep(5)
        # If you just provide the index of the li it will print the title
        dropDownTitle = product.find_element_by_xpath("//*[@id='accordin']/div/ul/li[" + str(i) + "]/div[1]/div[3]/h4").text
        print(dropDownTitle)
Imports
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

How to reduce amount of scraping time when using Requests-HTML?

I currently use Requests-HTML version 0.10.0 and Selenium 3.141.0. My project is to scrape the ratings of all articles on this website: https://openreview.net/group?id=ICLR.cc/2021/Conference. To open each page of the website (the website has 53 pages and each page has 50 articles), I use Selenium. Next, to open the articles on each page, I use Requests-HTML. My question is about how to reduce the time it takes to open each article and get the rating. In this case, I use await r_inside.html.arender(sleep = 5, timeout=100), which means the sleep time is 5 seconds and the timeout is 100 seconds. When I try to reduce the sleep time to 0.5 seconds, it causes an error because there is not enough time to scrape the website. However, if I keep the sleep time at 5 seconds, it takes 6 to 13 hours to scrape all 2600 articles. Also, after waiting for 13 hours, I can scrape all 2600 articles, but the code uses 88 GB of RAM, which I would like to avoid because I need to send this code to other people who will not have enough RAM to run it. My purpose is to reduce the scraping time and the RAM usage. Below is the code I use.
import csv
link = 'https://openreview.net/group?id=ICLR.cc/2021/Conference'
from requests_html import HTMLSession, AsyncHTMLSession
import time
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
id_list = []
keyword_list = []
abstract_list = []
title_list = []
driver = webdriver.Chrome('./requests_html/chromedriver.exe')
driver.get('https://openreview.net/group?id=ICLR.cc/2021/Conference')
cond = EC.presence_of_element_located((By.XPATH, '//*[@id="all-submissions"]/nav/ul/li[13]/a'))
WebDriverWait(driver, 10).until(cond)
for page in tqdm(range(1, 54)):
    text = ''
    elems = driver.find_elements_by_xpath('//*[@id="all-submissions"]/ul/li')
    for i, elem in enumerate(elems):
        try:
            # parse title
            title = elem.find_element_by_xpath('./h4/a[1]')
            link = title.get_attribute('href')
            paper_id = link.split('=')[-1]
            title = title.text.strip().replace('\t', ' ').replace('\n', ' ')
            # show details
            elem.find_element_by_xpath('./a').click()
            time.sleep(0.2)
            # parse keywords & abstract
            items = elem.find_elements_by_xpath('.//li')
            keyword = ''.join([x.text for x in items if 'Keywords' in x.text])
            abstract = ''.join([x.text for x in items if 'Abstract' in x.text])
            keyword = keyword.strip().replace('\t', ' ').replace('\n', ' ').replace('Keywords: ', '')
            abstract = abstract.strip().replace('\t', ' ').replace('\n', ' ').replace('Abstract: ', '')
            text += paper_id + '\t' + title + '\t' + link + '\t' + keyword + '\t' + abstract + '\n'
            title_list.append(title)
            id_list.append(paper_id)
            keyword_list.append(keyword)
            abstract_list.append(abstract)
        except Exception as e:
            print(f'page {page}, # {i}:', e)
            continue
    # next page
    try:
        driver.find_element_by_xpath('//*[@id="all-submissions"]/nav/ul/li[13]/a').click()
        time.sleep(2)  # NOTE: increase sleep time if needed
    except:
        print('no next page, exit.')
        break
csv_file = open('./requests_html/bb_website_scrap.csv','w', encoding="utf-8")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Title','Keyword','Abstract','Link','Total Number of Reviews','Average Rating','Average Confidence'])
n = 0
for item in range(len(id_list)):
    title = title_list[item]
    keyword = keyword_list[item]
    abstract = abstract_list[item]
    id = id_list[item]
    link_pdf = f'https://openreview.net/forum?id={id}'
    print(id)
    asession_inside = AsyncHTMLSession()
    r_inside = await asession_inside.get(link_pdf)
    print(type(r_inside))
    await r_inside.html.arender(sleep=5, timeout=100)
    test_rating = r_inside.html.find('div.comment-level-odd div.note_contents span.note_content_value')
    print(len(test_rating))
    check_list = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'}
    total_rating_confidence = []
    total_rating = []
    total_confidence = []
    for t in range(len(test_rating)):
        if any(test_rating[t].text.split(':')[0] in s for s in check_list):
            total_rating_confidence.append(test_rating[t].text.split(':')[0])
    for r in range(len(total_rating_confidence)):
        if (r % 2 == 0):
            total_rating.append(int(total_rating_confidence[r]))
        else:
            total_confidence.append(int(total_rating_confidence[r]))
    average_rating = sum(total_rating) / len(total_rating)
    average_confidence = sum(total_confidence) / len(total_confidence)
    csv_writer.writerow([title, keyword, abstract, link_pdf, len(total_rating), average_rating, average_confidence])
    n = n + 1
    print('Order {}', n)
csv_file.close()
I'm no Python expert (in fact, I'm a rank beginner) but the simple answer is better parallelism & session management.
The useful answer is a bit more complicated.
You're leaving the Chromium session around, which is likely what's hoovering up all your RAM. If you call asession_inside.close(), you may see an improvement in RAM usage.
As far as I can tell, you're doing everything in serial: you fetch each page and extract data on the articles in serial, and then you query each article in serial as well.
You're using arender to fetch each article asynchronously, but you're awaiting it and using a standard for loop. As far as I understand, that means you're not getting any advantage from async; you're still processing each page one at a time (which explains your long process time).
I'd suggest using asyncio to turn the for loop into a parallel version of itself, as suggested in this article. Make sure you set a task limit so that you don't try to load all the articles at once; that will also help with your RAM usage.
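For illustration, a minimal sketch (under my assumptions, not the poster's exact code) of a task-limited async version of the article loop; links stands in for the list of forum URLs already collected by the Selenium part:
import asyncio
from requests_html import AsyncHTMLSession

async def scrape_all(links, limit=5):
    asession = AsyncHTMLSession()
    sem = asyncio.Semaphore(limit)  # task limit: only a few pages render at once

    async def fetch(link):
        async with sem:
            r = await asession.get(link)
            await r.html.arender(sleep=5, timeout=100)  # same render settings as above; tune as needed
            return link, r.html.find('div.comment-level-odd div.note_contents span.note_content_value')

    results = await asyncio.gather(*(fetch(l) for l in links))
    await asession.close()  # close the session so the headless Chromium releases its RAM
    return results

# ratings = asyncio.run(scrape_all(links))
The semaphore is what keeps both the number of open browser pages and the memory footprint bounded.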

How to get ASINs XPATH from 2 different Amazon pages that have the same parent nodes?

I made a web scraping program using Python and webdriver and I want to extract the ASIN from 2 different pages. I would like the XPath to work for these 2 links at the same time.
These are the Amazon pages: https://www.amazon.com/Hydro-Flask-Wide-Mouth-Flip/dp/B01ACATW7E/ref=sr_1_3?s=kitchen&ie=UTF8&qid=1520348607&sr=1-3&keywords=-gfds and
https://www.amazon.com/Ubbi-Saving-Special-Required-Locking/dp/B00821FLSU/ref=sr_1_1?s=baby-products&ie=UTF8&qid=1520265799&sr=1-1&keywords=-hgfd&th=1. They have the same parent nodes (id, classes). How can I make this program work for these 2 links at the same time?
So the problem is on these lines of code: 36, 41
asin = driver.find_element_by_xpath('//div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[4]').text
and
asin = driver.find_element_by_xpath('//div[@id="detail-bullets_feature_div"]/div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[5]').text. I have to change these lines so that the CSV gets the ASINs for these 2 products. For the first link it prints the wrong information, and for the second it prints the ASIN.
I attached the code. I will appreciate any help.
from selenium import webdriver
import csv
import io
# set the proxies to hide actual IP
proxies = {
    'http': 'http://5.189.133.231:80',
    'https': 'https://27.111.43.178:8080'
}
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server="%s"' % ';'.join(['%s=%s' % (k, v) for k, v in proxies.items()]))
driver = webdriver.Chrome(executable_path="C:\\Users\\Andrei-PC\\Downloads\\webdriver\\chromedriver.exe",
                          chrome_options=chrome_options)
header = ['Product title', 'ASIN']
with open('csv/bot_1.csv', "w") as output:
    writer = csv.writer(output)
    writer.writerow(header)
links = ['https://www.amazon.com/Hydro-Flask-Wide-Mouth-Flip/dp/B01ACATW7E/ref=sr_1_3?s=kitchen&ie=UTF8&qid=1520348607&sr=1-3&keywords=-gfds',
         'https://www.amazon.com/Ubbi-Saving-Special-Required-Locking/dp/B00821FLSU/ref=sr_1_1?s=baby-products&ie=UTF8&qid=1520265799&sr=1-1&keywords=-hgfd&th=1'
         ]
for i in range(len(links)):
    driver.get(links[i])
    product_title = driver.find_elements_by_xpath('//*[@id="productTitle"][1]')
    prod_title = [x.text for x in product_title]
    try:
        asin = driver.find_element_by_xpath('//div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[4]').text
    except:
        print('no ASIN template one')
    try:
        asin = driver.find_element_by_xpath('//div[@id="detail-bullets_feature_div"]/div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[5]').text
    except:
        print('no ASIN template two')
    try:
        data = [prod_title[0], asin]
    except:
        print('no items v3 ')
    with io.open('csv/bot_1.csv', "a", newline="", encoding="utf-8") as output:
        writer = csv.writer(output)
        writer.writerow(data)
You can simply use
//li[b="ASIN:"]
to get the required element on both pages.
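For example, a quick sketch of how that relative XPath could be used (same Selenium 3 style as in the question; the text clean-up is just illustrative):
for i in range(len(links)):
    driver.get(links[i])
    # the <li> whose <b> child reads "ASIN:" is present on both page layouts
    asin = driver.find_element_by_xpath('//li[b="ASIN:"]').text.replace('ASIN:', '').strip()
    print(asin)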