Selenium StaleElementReferenceException and related wait commands do not work - selenium

When I run the program below, the first "option2[opt2].click()" works fine, but the second "option2[opt2].click()" raises a StaleElementReferenceException.
I tried to solve this with "time.sleep()", "implicitly_wait()", and "WebDriverWait(browser, 30).until(EC.presence_of_element_located((By.XPATH, {Xpath})))", but none of them work.
What is the problem?
(If you run my program you can follow the code easily. I am a beginner in programming and a Korean speaker, so sorry if the explanation is not enough. Please help me.)
from selenium import webdriver
import pandas as pd
import time
import random

options = webdriver.ChromeOptions()
options.add_argument("user-agent={myagent}")
browser = webdriver.Chrome(options=options)
browser.get("https://yeyak.seoul.go.kr/web/main.do")  # go to the main page
browser.maximize_window()

# Crawl the Nowon-gu "space facility", "cultural experience", and "education course" categories
# column names (in Korean: district, category, target, place, usage period,
# application period, selection method, capacity, application limit,
# cancellation period, fee, reservation method, contact number)
all_column_name = ['구', '카테고리', '대상', '장소', '이용기간', '접수기간', '선별방법',
                   '모집정원', '신청제한', '취소기간', '이용요금', '예약방법', '문의전화']
all_data = []

# Select Nowon-gu (it is already selected; clicking it again raises an error)
browser.find_element_by_xpath("//*[@id=\"sch_loc\"]/option[11]").click()
time.sleep(1)

for i in range(2, 5):
    # select the space facility / cultural experience / education course category
    browser.find_element_by_xpath(f"//*[@id='contents']/div[1]/div[1]/div/div/div[1]/ul/li[{i}]").click()
    time.sleep(1)
    category_name = browser.find_element_by_xpath(f"//*[@id='contents']/div[1]/div[1]/div/div/div[1]/ul/li[{i}]").text
    district = browser.find_element_by_xpath("//*[@id=\"contents\"]/div[1]/div[1]/div/div/div[2]/div[3]/div[1]/select/option[11]").text  # store the district name in district

    # facility selection dropdown (sub-topic)
    button2 = browser.find_element_by_xpath("//*[@id=\"contents\"]/div[1]/div[1]/div/div/div[2]/div[3]/div[2]/select")
    option2 = button2.find_elements_by_tag_name("option")
    if len(option2) == 1:  # if there are no extra options, move on to the next category
        continue
    for opt2 in range(1, len(option2) + 1):
        print(option2[opt2].text)
        option2[opt2].click()
        time.sleep(1)

        # facility detail selection dropdown
        button3 = browser.find_element_by_xpath("//*[@id=\"contents\"]/div[1]/div[1]/div/div/div[2]/div[3]/div[3]/select")
        option3 = button3.find_elements_by_tag_name("option")
        if len(option3) == 1:  # if there are no extra options, move on to the next category
            continue
        for opt3 in range(1, len(option3) + 1):
            small_data = []
            option3[opt3].click()
            time.sleep(1)

            # click the "reserve" button
            browser.find_element_by_xpath("//*[@id=\"contents\"]/div[1]/div[1]/div/div/div[2]/div[3]/button").click()
            time.sleep(1)

            # get the data from the table
            table = browser.find_element_by_xpath("//*[@id=\"aform\"]/div[1]/div[2]/ul")
            rows = table.find_elements_by_tag_name("li")
            small_data.append(district)       # add the district name
            small_data.append(category_name)  # add the category name
            for row in rows:
                small_data.append(row.text.split("\n")[1])
            all_data.append(small_data)
            time.sleep(random.uniform(2, 3))

            browser.get("https://yeyak.seoul.go.kr/web/main.do")
            time.sleep(random.uniform(2, 3))
            browser.find_element_by_xpath("//*[@id=\"sch_loc\"]/option[11]").click()  # click Nowon-gu
            time.sleep(random.uniform(2, 3))
            browser.find_element_by_xpath(f"//*[@id='contents']/div[1]/div[1]/div/div/div[1]/ul/li[{i}]").click()  # click the category
            time.sleep(random.uniform(2, 3))
            browser.find_element_by_xpath(f"//*[@id=\"contents\"]/div[1]/div[1]/div/div/div[2]/div[3]/div[2]/select/option[{opt2}]")
            time.sleep(random.uniform(2, 3))
            if opt3 == len(option3) + 1:
                break
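For reference, the usual way around a StaleElementReferenceException after the page re-renders is to re-locate the select element and its options on every iteration instead of reusing the option2 list that was collected before the first click. Below is a minimal sketch of that pattern using Selenium's standard Select helper, reusing the browser object and the select XPath from the code above; it illustrates the re-locate pattern under those assumptions, not a tested fix for this exact site.
import time
from selenium.webdriver.support.ui import Select

select_xpath = "//*[@id=\"contents\"]/div[1]/div[1]/div/div/div[2]/div[3]/div[2]/select"

# Count the options once, then re-find the <select> on every pass so the
# element reference is never stale after the page updates itself.
num_options = len(Select(browser.find_element_by_xpath(select_xpath)).options)
for idx in range(1, num_options):
    select2 = Select(browser.find_element_by_xpath(select_xpath))  # fresh lookup each time
    print(select2.options[idx].text)
    select2.select_by_index(idx)
    time.sleep(1)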

Related

Selenium scraping div table, getting duplicate rows

I wrote a script that scrolls through an infinitely loading table on a site and scrapes the entries, but instead of being a single table the whole thing is made up of individual div elements. I can't scroll through everything first and then scrape, since new elements are loaded as it scrolls (it shows about 6-8 at a time), so it scrolls, scrapes, appends to a dataframe, and repeats. It works great for the first few hundred rows, but then it starts to produce duplicate rows. Any idea what I'm doing wrong?
# imports assumed by this snippet; "driver" is created elsewhere in the original script
import time
from datetime import datetime
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# from easygui import msgbox, filesavebox  # assumption: msgbox/filesavebox look like easygui helpers

def scrapenotis():
    driver.get("NOTIFICATIONS");
    WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.vue-recycle-scroller__item-view")));
    tbltitles = ["Datetime","Username","User Link","Description","Category","Desc Link"];
    tablelist = [];
    starttime = datetime.now()
    while driver.title == "WEBSITE TITLE":
        try:
            # gets list of all entries
            entries = driver.find_elements(By.CSS_SELECTOR, "div.vue-recycle-scroller__item-view");
            if len(entries) == 0:
                break;
            # iterates through entries
            for x in entries:
                # checking for those elements that persist for some ungodly reason
                if x.get_attribute("style") == "transform: translateY(-9999px);":
                    continue;
                # each entry is 83 pixels long with about 6 on screen at a time
                driver.execute_script("window.scrollBy(0, 300);");
                # entries need to load after scroll, they load twice within a second(?)
                time.sleep(1.5);
                # datedesc = driver.find_element(By.XPATH, "//*[@id='content']/div[1]/div[1]/div/div[3]/div/div[1]//div/div/div[5]/span/span").get_attribute("title");
                datedesc = driver.find_element(By.CSS_SELECTOR, "span.b-notifications__list__item__actions__item.g-date span").get_attribute("title");
                username = driver.find_element(By.CSS_SELECTOR, "div.b-username-wrapper div.g-user-name").text;
                userlink = driver.find_element(By.CSS_SELECTOR, "div.b-username-wrapper a").get_attribute("href");
                description = driver.find_element(By.CSS_SELECTOR, "div.b-notifications__list__item__text div.g-truncated-text").text;
                # sorting them out for categories
                if "ubscribed" in description:
                    cat = "New Sub";
                    desclink = "N/A";
                elif "iked your" in description:
                    cat = "Like";
                    desclink = driver.find_element(By.CSS_SELECTOR, "div.b-notifications__list__item__text div.g-truncated-text a").get_attribute("href");
                elif "restarted their monthly subscription" in description:
                    cat = "Sub Renewal";
                    desclink = "N/A";
                elif "purchased your" in description:
                    cat = "Purchase";
                    desclink = driver.find_element(By.CSS_SELECTOR, "div.b-notifications__list__item__text div.g-truncated-text a").get_attribute("href");
                elif any(x in description for x in ["eplied","esponded"]):
                    cat = "Comment";
                    desclink = driver.find_element(By.CSS_SELECTOR, "div.b-notifications__list__item__text div.g-truncated-text a").get_attribute("href");
                elif "tip" in description:
                    cat = "Tip";
                    desclink = "N/A";
                dict1 = [datedesc,username,userlink,description,cat,desclink];
                tablelist.append(dict1);
                # specify stop time in seconds
                if (datetime.now()-starttime).seconds >= 14400: # 3600 (1 hour) * 4 = 14400
                    break;
        except:
            break
    # convert list to df
    msgbox(tablelist);
    df = pd.DataFrame(tablelist,columns=tbltitles);
    df.drop_duplicates(subset=tbltitles, inplace=True, keep='first');
    # save to csv
    path = filesavebox("Save your updated data file","","",["*.txt","*.csv"]);
    if path == None:
        return;
    df.to_csv(path + ".csv");
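One common way to stop a virtual scroller like this from producing duplicates is to track a key for every row already captured and skip entries whose key has been seen, since vue-recycle-scroller keeps recycling the same DOM nodes. Here is a minimal sketch of that idea, assuming the (datedesc, username, description) triple identifies a notification uniquely enough; that uniqueness is an assumption, not something stated in the question.
def dedup_append(tablelist, seen, row):
    # row is [datedesc, username, userlink, description, cat, desclink];
    # a subset of its fields is used as the identity of the notification
    key = (row[0], row[1], row[3])
    if key in seen:          # already captured on an earlier scroll pass
        return
    seen.add(key)
    tablelist.append(row)

# usage inside the scraping loop, instead of tablelist.append(dict1):
#   seen = set()   # created once, before the while loop
#   dedup_append(tablelist, seen, dict1)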

page_source in selenium changes

I'm trying to make a crawler for YouTube.
I encountered some strange behavior.
In the following source code, driver.page_source is obtained via Selenium.
I pass the result to BeautifulSoup for parsing.
The problem is that the length of driver.page_source keeps changing.
How can this happen? Does anyone have an idea about this?
elif 'src' in seq:
    print('video-src')
    print(seq['src'])
    soup = bs(driver.page_source, "html.parser")
    print('driver.page_source length=' + str(len(driver.page_source)))
    f = open('test.txt', 'w', encoding='UTF-8')
    f.write(driver.page_source)
    f.close()
    print('driver.page_source length=' + str(len(driver.page_source)))
    tag = '<span dir="auto" class="style-scope yt-formatted-string">'
    find_start = driver.page_source.find(tag + '댓글')  # '댓글' means "comments"
    print('driver.page_source length=' + str(len(driver.page_source)))
    tag_value = driver.page_source[find_start:find_start + 200]
    print('driver.page_source length=' + str(len(driver.page_source)))
    p = re.compile(r'\>([\d,]+)\<')
    m = p.search(tag_value)
    if m:
        print(m.group(1))
        video[item['name']] = m.group(1)
    else:
        print('error')
        print(tag_value)
The output is:
driver.page_source length=4103114
driver.page_source length=4102392
driver.page_source length=4102392
driver.page_source length=4103129
The page_source can change; elements can be loaded later.
Instead of checking the page_source length, you can save the different driver.page_source snapshots to text files and compare them to understand what is different. One way to do so is with difflib:
import difflib

source1 = driver.page_source
file1 = open("file1.txt", "w")
file1.write(source1)
file1.close()

source2 = driver.page_source
file2 = open("file2.txt", "w")
file2.write(source2)
file2.close()

with open('file1.txt') as file_1:
    file_1_text = file_1.readlines()

with open('file2.txt') as file_2:
    file_2_text = file_2.readlines()

# Find and print the diff:
for line in difflib.unified_diff(
        file_1_text, file_2_text, fromfile='file1.txt',
        tofile='file2.txt', lineterm=''):
    print(line)
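Since every access to driver.page_source re-serializes the live DOM, it can also help to read it once into a variable and work only with that snapshot, so all later measurements and searches see the same string. A minimal sketch of that approach, reusing the driver object and the bs alias for BeautifulSoup from the question:
# take a single snapshot of the DOM and work only with that string
html = driver.page_source
print('snapshot length =', len(html))   # this value will not change between calls

soup = bs(html, "html.parser")
tag = '<span dir="auto" class="style-scope yt-formatted-string">'
find_start = html.find(tag + '댓글')    # '댓글' means "comments"
tag_value = html[find_start:find_start + 200]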

Trying to fetch option chain data from NSE, but getting an error using Chromedriver

Below is the code through which I am trying to fetch option chain data, but I am getting errors.
Any help will be appreciated. Thanks.
Also, if someone can help me with better code to record tick data, that would be great.
import time
import datetime
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver import Chrome as uc
# options=chrome_options

df = pd.DataFrame(columns=['SNO','call OI','call CHNG IN OI','call VOLUME','call IV','call LTP','call CHNG','call BID QTY','call BID PRICE','call ASK PRICE','call ASK QTY','STRIKE PRICE','put BID QTY','put BID PRICE','put ASK PRICE','put ASK QTY','put CHNG','put LTP','put IV','put VOLUME','put CHNG IN OI','put OI'])

chrome_options = Options()
chrome_options.add_argument("--log-level=3")

from selenium.webdriver import Chrome as uc
chrome_options = uc.ChromeOptions()  # new solution
chrome_options.add_argument('--headless')

driver = webdriver.Chrome(executable_path=r"C:\Users\rohit taparia\Downloads\chromedriver_win32\chromedriver.exe", options=chrome_options)
driver.get('https://www.nseindia.com/get-quotes/derivatives?symbol=BANKNIFTY')
driver.minimize_window()
time.sleep(3)

for j in range(0, 50):
    print(j)
    # refresh the page and read the data again
    driver.refresh()
    continue_link1 = driver.find_element_by_xpath('''//*[@id="subtab-derivatives"]/div[2]/nav/div/div/a[2]''')
    time.sleep(10)
    filter_tag = SoupStrainer("table")
    continue_link1.click()
    time.sleep(3)
    rtime = str(driver.find_element_by_xpath('''//*[@id="asondate"]''').text)
    if rtime == '':
        continue
    print(rtime)
    page = driver.page_source
    soup = BeautifulSoup(page, "html.parser", parse_only=filter_tag)
    gdp_table = soup.find("table", attrs={"id": "optionChainTable-indices"})
    gdp_table_data = gdp_table.tbody.find_all("tr")
    if len(gdp_table_data) == 1:
        continue
    else:
        for i in range(0, len(gdp_table_data)):
            list1 = []
            for td in gdp_table_data[i].find_all("td"):
                # remove any newlines and extra spaces from left and right
                cell_text = td.text
                if cell_text is None or cell_text == '':
                    cell_text = '0'
                cell_text = cell_text.replace(',', '')
                list1.append(cell_text)
            if len(list1) > 0:
                list1 = ['0' if i == '-' else i for i in list1]
            else:
                continue
            del list1[0]
            del list1[-1]
            list1 = list(map(float, list1))
            list1.insert(0, rtime)
            df.loc[len(df)] = list1
df.to_excel("option-data.xlsx")
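If the errors are timing-related (the option chain table is filled in by JavaScript after the page loads), one thing worth trying is replacing the fixed time.sleep calls with explicit waits on the elements the scraper actually parses. A hedged sketch, reusing the driver object and the asondate and optionChainTable-indices ids from the code above; this is not a verified fix for NSE's anti-bot behaviour, which may also require extra headers or cookies.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

wait = WebDriverWait(driver, 30)

# wait until the "as on" timestamp is populated instead of sleeping a fixed time
wait.until(lambda d: d.find_element(By.ID, "asondate").text.strip() != "")

# wait until at least one data row exists in the option chain table body
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#optionChainTable-indices tbody tr")))
page = driver.page_source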

Not scraping xpath correctly

I'm trying to use the following XPath expressions on this page, but the data is not coming out correctly.
groups = ".//*[contains(@class, 'sl-CouponParticipantWithBookCloses_Name ')]"
xp_bp1 = ".//following::div[contains(@class,'sl-MarketCouponValuesExplicit33')][./div[contains(@class,'gl-MarketColumnHeader')][.='1']]//span[@class='gl-ParticipantOddsOnly_Odds']"
The output currently is:
[['3.00'], ['3.00'], ['3.00'], etc.
Desired:
[['3.00'], ['1.30'], ['1.25'], etc.
Data I am after
Script:
import csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException

driver = webdriver.Chrome()
driver.set_window_size(1024, 600)
driver.maximize_window()
driver.get('https://www.bet365.com.au/#/AC/B1/C1/D13/E108/F16/S1/')
driver.get('https://www.bet365.com.au/#/AC/B1/C1/D13/E108/F16/S1/')
time.sleep(10)

groups = ".//*[contains(@class, 'sl-CouponParticipantWithBookCloses_Name ')]"
# //div[contains(@class, 'gl-ParticipantOddsOnlyDarker gl-ParticipantOddsOnly gl-Participant_General sl-MarketCouponAdvancedBase_LastChild ')]
xp_bp1 = ".//following::div[contains(@class,'sl-MarketCouponValuesExplicit33')][./div[contains(@class,'gl-MarketColumnHeader')][.='1']]//span[@class='gl-ParticipantOddsOnly_Odds']"

while True:
    try:
        time.sleep(2)
        data = []
        for elem in driver.find_elements_by_xpath(groups):
            try:
                bp1 = elem.find_element_by_xpath(xp_bp1).text
            except:
                bp1 = None
            url1 = driver.current_url
            data.append([bp1])
        print(data)
        url1 = driver.current_url
        with open('test.csv', 'a', newline='', encoding="utf-8") as outfile:
            writer = csv.writer(outfile)
            for row in data:
                writer.writerow(row + [url1])
    except TimeoutException as ex:
        pass
    except NoSuchElementException as ex:
        print(ex)
        break
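When every group comes back with the same odds value, the relative lookup is often escaping the current element's subtree: the following:: axis searches the whole document after that node, not just its own descendants, so nearby groups can resolve to the same span. One hedged way to sanity-check the data is to collect the names and the odds column in two separate driver-level queries and pair them positionally; this sketch assumes the two lists line up one-to-one on the rendered page, which is only an assumption here.
# collect both columns at the document level and zip them by position
names = driver.find_elements_by_xpath("//*[contains(@class, 'sl-CouponParticipantWithBookCloses_Name ')]")
odds = driver.find_elements_by_xpath(
    "//div[contains(@class,'sl-MarketCouponValuesExplicit33')]"
    "[./div[contains(@class,'gl-MarketColumnHeader')][.='1']]"
    "//span[@class='gl-ParticipantOddsOnly_Odds']"
)
for name_el, odds_el in zip(names, odds):
    print(name_el.text, odds_el.text)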

requests + bs4 no results from pages

Here is code that can get info from https://www.gabar.org/membersearchresults.cfm
but gets nothing from https://www.gabar.org/membersearchresults.cfm?start=1&id=70FFBD1B-9C8E-9913-79DBB8B989DED6C1
from bs4 import BeautifulSoup
import requests
import traceback

links_to_visit = []
navigation_links = []  # for testing next button
base_url = 'https://www.gabar.org'

def make_soup(link):
    r = requests.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def all_results(url):
    global links_to_visit
    global navigation_links
    soup = make_soup(url)
    print(soup)
    div = soup.find('div', {'class': 'cs_control'})
    links = div.find_all('a')
    print(links)
    for link in links:
        try:
            if link.text == 'Next':  # prev, next, new search
                navigation_links.append(link)
                print('got it')
            elif not '/MemberSearchDetail.cfm?ID=' in link.get('href'):
                pass  # I dont need that link
            else:
                links_to_visit.append(link)
        except:
            traceback.print_exc()
    print(len(links_to_visit))
    print(links_to_visit)
    # print(links_to_visit[-1].get('href'))

def start():
    flag = 1
    page = 1
    while page < 60716:
        flag = 0
        if navigation_links[-1].text == 'Next':
            flag = 1
            next_link = navigation_links[-1]
            # print(next_link.get('href'))
            page += 25
            print(base_url + next_link.get('href'))
            all_results(base_url + next_link.get('href'))
            print('page is:', page)

if __name__ == '__main__':
    all_results('https://www.gabar.org/membersearchresults.cfm')
    start()
What do I need to understand or do if I want to get the full result?
What you need to understand is that there is more to an HTTP request than the URL. In this case, a search result is only available to the session that executed the search and can therefore only be paged through if you are the "owner" of that session. Most websites identify a session using session cookies that you need to send along with your HTTP requests.
This can be a huge hassle, but luckily Python's requests takes care of all of that for you with requests.session. Instead of using requests.get(url), you initialize a session with session = requests.session() and then use that session for subsequent requests: session.get(url). This automagically preserves cookies for you and in many ways behaves like an actual browser would.
You can read more about how requests.session works here.
And last but not least, your fixed code =)
from bs4 import BeautifulSoup
import requests
import traceback

links_to_visit = []
navigation_links = []  # for testing next button

# we initialize the session here
session = requests.session()

base_url = 'https://www.gabar.org'

def make_soup(link):
    # r = requests.get(link)
    # we use the session here in order to preserve cookies across requests
    r = session.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def all_results(url):
    # globals are almost never needed or recommended and certainly not here.
    # you can just leave this out
    # global links_to_visit
    # global navigation_links
    soup = make_soup(url)
    print(soup)
    div = soup.find('div', {'class': 'cs_control'})
    links = div.find_all('a')
    print(links)
    for link in links:
        try:
            if link.text == 'Next':  # prev, next, new search
                navigation_links.append(link)
                print('got it')
            elif not '/MemberSearchDetail.cfm?ID=' in link.get('href'):
                pass  # I dont need that link
            else:
                links_to_visit.append(link)
        except:
            traceback.print_exc()
    print(len(links_to_visit))
    print(links_to_visit)
    # print(links_to_visit[-1].get('href'))

def start():
    flag = 1
    page = 1
    while page < 60716:
        flag = 0
        if navigation_links[-1].text == 'Next':
            flag = 1
            next_link = navigation_links[-1]
            # print(next_link.get('href'))
            page += 25
            print(base_url + next_link.get('href'))
            all_results(base_url + next_link.get('href'))
            print('page is:', page)

if __name__ == '__main__':
    all_results('https://www.gabar.org/membersearchresults.cfm')
    start()