I wrote some code to get an account's followers. I log in, go to the profile, click the followers box, and scroll down to the end without a problem, but when I try to get the followers' usernames I run into an issue.
I do not get any error, but the list comes back empty. Can you help me? Thanks
followers_panel = driver.find_element_by_xpath('/html/body/div[2]/div/div/div/div[2]/div/div/div[1]/div/div[2]/div/div/div/div/div[2]/div/div/div[2]')

last_ht, ht = 0, 1
while last_ht != ht:
    last_ht = ht
    time.sleep(2)
    # scroll down and return the new height of the scrollable panel
    ht = driver.execute_script("""
        arguments[0].scrollTo(0, arguments[0].scrollHeight);
        return arguments[0].scrollHeight;""", followers_panel)
# Extract the follower names
followers = followers_panel.find_elements_by_xpath('.//div[@class="PZuss"]')
list_of_followers = []
for follower in followers:
    name = follower.find_element_by_xpath('.//a/span/div').text
    list_of_followers.append(name)
print(list_of_followers)
Does followers contain any elements? If yes, try this instead of the for loop:
list_of_followers = list(map(lambda x: x.text, driver.find_elements_by_xpath('//div[@class="PZuss"]//a/span/div')))
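If followers does contain elements but the mapped list is still empty, the username divs may simply not have loaded by the time you read them. As a minimal sketch, assuming the PZuss class from the question is still what Instagram renders, an explicit wait makes the read more reliable:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for at least one username element to be present
wait = WebDriverWait(driver, 10)
name_divs = wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, '//div[@class="PZuss"]//a/span/div')))
list_of_followers = [div.text for div in name_divs]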
I am trying to web scrape this website: https://va.betway.com/sports/category/basketball/usa/nba?tab=matches. I am unable to get the element containing the game stats.
This is my code snippet:
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service

s = Service("./drivers/geckodriver")
options = FirefoxOptions()
options.headless = True
browser = webdriver.Firefox(service=s, options=options)
browser.get(website_hr)
print('Title: %s' % browser.title)

player_prop0 = browser.find_elements(by='id', value='root')
player_prop2 = browser.find_elements(by=By.CLASS_NAME, value='sc-eZMymg ktcjyc')
I get no values from player_prop0 or player_prop2.
How can I get the data on this page? Thank you
I tried using the ID and the class name to get the game lines for the NBA games.
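Two things are worth checking here, as a hedged suggestion rather than a confirmed fix. First, By.CLASS_NAME cannot take a compound value: 'sc-eZMymg ktcjyc' contains a space, so it is two class names, which is invalid for that locator; a CSS selector of the form .classA.classB matches both. Second, the Betway page builds its content with JavaScript, so the elements may not exist yet when find_elements runs. A minimal sketch combining the two (note that styled-components class names like these are build-generated and may change between deployments):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait for the generated stat containers, addressing both classes via CSS
wait = WebDriverWait(browser, 15)
stats = wait.until(EC.presence_of_all_elements_located(
    (By.CSS_SELECTOR, '.sc-eZMymg.ktcjyc')))
print(len(stats), 'elements found')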
I am currently web scraping a few pages from a list of URLs. I have provided the following code.
pages = {
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-germany/c-150410100",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-small-bottles/c-150410110",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-lager/c-150302375", # More than one page
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-stout/c-150302380",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-ale/c-150302385",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-lager/c-150302386",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-stout/c-150302387",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-ale/c-150302388", # More than one page
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-cider/c-150302389",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-cider/c-150302390",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-alcopops/c-150302395",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-vodka/c-150302430",
    "https://shop.supervalu.ie/shopping/wine-beer-spirits-irish-whiskey/c-150302435", # More than one page
}
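# (aside: the braces above make `pages` a set, so the iteration order further
#  down is arbitrary; square brackets would make it a list and preserve the
#  order as written)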
products = []
prices = []
images = []
urls = []
def export_data():
    logging.info("exporting data to pandas dataframe")
    supervalu = pd.DataFrame({
        'img_url' : images,
        'url' : urls,
        'product' : products,
        'price' : prices
    })
    logging.info("sorting data by price")
    supervalu.sort_values(by=['price'], inplace=True)

    output_json = 'supervalu.json'
    output_csv = 'supervalu.csv'
    output_dir = Path('../../json/supervalu')
    output_dir.mkdir(parents=True, exist_ok=True)

    logging.info("exporting data to json")
    supervalu.to_json(output_dir / output_json)
    logging.info("exporting data to csv")
    supervalu.to_csv(output_dir / output_csv)
def get_data(div):
    raw_data = div.find_all('div', class_='ga-product')
    raw_images = div.find_all('img')
    raw_url = div.find_all('a', class_="ga-product-link")
    product_data = [data['data-product'] for data in raw_data]
    new_data = [d.replace("\r\n","") for d in product_data]
    for name in new_data:
        new_names = re.search(' "name": "(.+?)"', name).group(1)
        products.append(new_names)
    for price in new_data:
        new_prices = re.search(' "price": "(.+?)"', price).group(1)
        prices.append(new_prices)
    for image in raw_images:
        new_images = image['data-src']
        images.append(new_images)
    for url in raw_url:
        new_url = url['href']
        urls.append(new_url)
def scrape_page(next_url):
    page = requests.get(next_url)
    if page.status_code != 200:
        logging.error("Page does not exist!")
        exit()
    soup = BeautifulSoup(page.content, 'html.parser')
    get_data(soup.find(class_="row product-list ga-impression-group"))
    try:
        load_more_text = soup.find('a', class_='pill ajax-link load-more').findAll('span')[-1].text
        if load_more_text == 'Load more':
            next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
            logging.info("Scraping next page: {}".format(next_page))
            scrape_page(next_page)
        else:
            export_data()
    except:
        logging.warning("No more next pages to scrape")
        pass

for page in pages:
    logging.info("Scraping page: {}".format(page))
    scrape_page(page)
The main issue appears during the try/except handling of the next page. Not all of the pages provided contain the appropriate "Load more" snippet, so an AttributeError arises; hence I have the aforementioned statement wrapped in a try/except. I want to scrape the pages that don't have a next page regardless, skip past them, and keep looping through the rest until a next page arises. All of the pages appear to be looped through, but I never get the data exported. If I try the following code:
try:
    load_more_text = soup.find('a', class_='pill ajax-link load-more').findAll('span')[-1].text
    if load_more_text == 'Load more':
        next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
        logging.info("Scraping next page: {}".format(next_page))
        scrape_page(next_page)
except:
    logging.warning("No more next pages to scrape")
    pass
else:
    export_data()
This is the closest I have gotten to the desired outcome. The above code works and the data gets exported, but not all of the pages are exported, because a new dataframe is created every time a chain of next pages ends, i.e. the code iterates through the list, finds a next page, the next page gets scraped, and a new dataframe is created which overwrites the previously exported data.
I'm hoping someone can give me some guidance on what to do, as I have been stuck on this part of my personal project and I'm not sure how to overcome this obstacle. Thank you in advance.
I have modified my code as shown below and I have received my desired outcome.
load_more_text = soup.find('a', class_='pill ajax-link load-more')
if load_more_text:
    next_page = soup.find('a', class_="pill ajax-link load-more").get('href')
    logging.info("Scraping next page: {}".format(next_page))
    scrape_page(next_page)
else:
    export_data()
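One further design note, as a sketch rather than a change to the accepted fix: because products, prices, images and urls are module-level lists, every call to export_data() rewrites the output files with everything collected so far. If you only want to write the files once, you can drop export_data() from scrape_page() entirely and export after the top-level loop:

for page in pages:
    logging.info("Scraping page: {}".format(page))
    scrape_page(page)

# export once, after every page and its chain of "load more" pages is scraped
export_data()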
I am trying to extract the user id, rating, and review from the following site using Selenium, and it is showing an "invalid selector" error. I think the XPath I defined to get the review text is the reason for the error, but I am unable to resolve the issue. The site link is below:
teslamotor review
The code that I have used is the following:
# Class for review web scraping from the consumeraffairs.com site
class CarForumCrawler():
    def __init__(self, start_link):
        self.link_to_explore = start_link
        self.comments = pd.DataFrame(columns = ['rating','user_id','comments'])
        self.driver = webdriver.Chrome(executable_path=r'C:/Users/mumid/Downloads/chromedriver/chromedriver.exe')
        self.driver.get(self.link_to_explore)
        self.driver.implicitly_wait(5)
        self.extract_data()
        self.save_data_to_file()

    def extract_data(self):
        ids = self.driver.find_elements_by_xpath("//*[contains(@id,'review-')]")
        comment_ids = []
        for i in ids:
            comment_ids.append(i.get_attribute('id'))

        for x in comment_ids:
            # Extract the rating for each user on a page
            user_rating = self.driver.find_elements_by_xpath('//*[@id="' + x +'"]/div[1]/div/img')[0]
            rating = user_rating.get_attribute('data-rating')

            # Extract the user id for each user on a page
            userid_element = self.driver.find_elements_by_xpath('//*[@id="' + x +'"]/div[2]/div[2]/strong')[0]
            userid = userid_element.get_attribute('itemprop')

            # Extract the message for each user on a page
            user_message = self.driver.find_elements_by_xpath('//*[@id="' + x +'"]/div[3]/p[2]/text()')[0]
            comment = user_message.text

            # Add the rating, userid and comment for each user to the dataframe
            self.comments.loc[len(self.comments)] = [rating,userid,comment]

    def save_data_to_file(self):
        # save the dataframe content to a CSV file
        self.comments.to_csv('Tesla_rating-6.csv', index = None, header=True)

    def close_spider(self):
        # end the session
        self.driver.quit()

try:
    url = 'https://www.consumeraffairs.com/automotive/tesla_motors.html'
    mycrawler = CarForumCrawler(url)
    mycrawler.close_spider()
except:
    raise
The error that I am getting is an invalid selector error. The XPath that I tried to trace is taken from the page's HTML.
You are seeing the classic invalid selector error: find_elements_by_xpath('//*[@id="' + x +'"]/div[3]/p[2]/text()')[0] would select text nodes, whereas you need to pass an XPath expression that selects elements.
You need to change it to:
user_message = self.driver.find_elements_by_xpath('//*[@id="' + x +'"]/div[3]/p[2]')[0]
References
You can find a couple of relevant detailed discussions in:
invalid selector: The result of the xpath expression "//a[contains(@href, 'mailto')]/@href" is: [object Attr] getting the href attribute with Selenium
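The same rule applies to attributes: an expression ending in /@href returns attribute nodes, which Selenium rejects for the same reason. A minimal sketch of the usual pattern, using the mailto example from the linked discussion and the same old-style API as the question:

# select the <a> elements first, then read the attribute from each element
links = driver.find_elements_by_xpath("//a[contains(@href, 'mailto')]")
hrefs = [link.get_attribute('href') for link in links]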
I'm extracting NBA stats from my Yahoo fantasy account. Below is the code I made in a Jupyter notebook using Selenium. Each page shows 25 players, out of a total of 720 players, so I wrote a for loop that scrapes players in increments of 25 instead of one by one.
for k in range(0, 725, 25):
    Players = driver.find_elements_by_xpath('//tbody/tr/td[2]/div/div/div/div/a')
    Team_Position = driver.find_elements_by_xpath('//span[@class= "Fz-xxs"]')
    Games_Played = driver.find_elements_by_xpath('//tbody/tr/td[7]/div')
    Minutes_Played = driver.find_elements_by_xpath('//tbody/tr/td[11]/div')
    FGM_A = driver.find_elements_by_xpath('//tbody/tr/td[12]/div')
    FTM_A = driver.find_elements_by_xpath('//tbody/tr/td[14]/div')
    Three_Points = driver.find_elements_by_xpath('//tbody/tr/td[16]/div')
    PTS = driver.find_elements_by_xpath('//tbody/tr/td[17]/div')
    REB = driver.find_elements_by_xpath('//tbody/tr/td[18]/div')
    AST = driver.find_elements_by_xpath('//tbody/tr/td[19]/div')
    ST = driver.find_elements_by_xpath('//tbody/tr/td[20]/div')
    BLK = driver.find_elements_by_xpath('//tbody/tr/td[21]/div')
    TO = driver.find_elements_by_xpath('//tbody/tr/td[22]/div')

    NBA_Stats = []
    for i in range(len(Players)):
        players_stats = {'Name': Players[i].text,
                         'Position': Team_Position[i].text,
                         'GP': Games_Played[i].text,
                         'MP': Minutes_Played[i].text,
                         'FGM/A': FGM_A[i].text,
                         'FTM/A': FTM_A[i].text,
                         '3PTS': Three_Points[i].text,
                         'PTS': PTS[i].text,
                         'REB': REB[i].text,
                         'AST': AST[i].text,
                         'ST': ST[i].text,
                         'BLK': BLK[i].text,
                         'TO': TO[i].text}
    driver.get('https://basketball.fantasysports.yahoo.com/nba/28951/players?status=ALL&pos=P&cut_type=33&stat1=S_AS_2021&myteam=0&sort=AR&sdir=1&count=' + str(k))
The browser goes page by page after it's done scraping. When I print out the results, only 1 player is scraped. What did I do wrong?
It's hard to see what the issue is here without looking at the original page (can you provide a URL?), however looking at this:
next = driver.find_element_by_xpath('//a[@id = "yui_3_18_1_1_1636840807382_2187"]')
"1636840807382" looks like a JavaScript timestamp, so I would guess that the id you've hardcoded there is dynamically generated, and the element "yui_3_18_1_1_1636840807382_2187" no longer exists by the time your code runs.
I'm sending InlineQueryResultArticle results to clients and I'm wondering how to get the chosen result and its data (like result_id, ...).
Here is the code that sends the results:
token = 'Bot token'
bot = telegram.Bot(token)
updater = Updater(token)
dispatcher = updater.dispatcher

def get_inline_results(bot, update):
    query = update.inline_query.query
    results = list()
    results.append(InlineQueryResultArticle(id='1000',
                                            title="Book 1",
                                            description='Description of this book, author ...',
                                            thumb_url='https://fakeimg.pl/100/?text=book%201',
                                            input_message_content=InputTextMessageContent(
                                                'chosen book:')))
    results.append(InlineQueryResultArticle(id='1001',
                                            title="Book 2",
                                            description='Description of the book, author...',
                                            thumb_url='https://fakeimg.pl/300/?text=book%202',
                                            input_message_content=InputTextMessageContent(
                                                'chosen book:')))
    update.inline_query.answer(results)

inline_query_handler = InlineQueryHandler(get_inline_results)
dispatcher.add_handler(inline_query_handler)
I'm looking for a method like on_inline_chosen(data) to get the id of the chosen item (1000 or 1001 for the snippet above) and then send the appropriate response to the user.
You should send /setinlinefeedback to @BotFather first; then your bot will receive this update.
OK, I got my answer from here.
Handling the user's chosen result:
from telegram.ext import ChosenInlineResultHandler

def on_result_chosen(bot, update):
    print(update.to_dict())
    result = update.chosen_inline_result
    result_id = result.result_id
    query = result.query
    user = result.from_user.id
    print(result_id)
    print(user)
    print(query)
    print(result.inline_message_id)
    bot.send_message(user, text='fetching book data with id:' + result_id)

result_chosen_handler = ChosenInlineResultHandler(on_result_chosen)
dispatcher.add_handler(result_chosen_handler)
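For completeness, the snippets above register the handlers but never start the bot. With the pre-v13 python-telegram-bot API used here (handlers taking (bot, update)), a minimal sketch of starting it:

# begin long polling so the inline query and chosen-result handlers fire
updater.start_polling()
updater.idle()  # block until the process is interrupted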