getting the alt value in the div tag using beautifulsoup - selenium

I'm trying to get the value "4" from the HTML below, taken from this website. This is just one of the values on the product list page; I want all of them in a list so I can put them in a dataframe.
<div class="review-stars-on-hover">
<divclass="product-rating">
<divclass="product-rating__meter"alt="4">
<divclass="product-rating__meter-btm">★★★★★</div>
<divclass="product-rating__meter-top"style="width:80%;">★★★★★</div>
</div>
<divclass="product-rating__countedf-font-size--xsmallnsg-text--medium-grey"alt="95">(95)</div>
</div>
</div>...
I tried:
items = soup.select('.grid-item-content')
star = [item.find('div', {'class': 'review-stars-on-hover'}).get('alt') for item in items]
Output (there are 16 products in total on the page, but only None shows up):
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
Any advice please?

Try the following code. Note that the alt attribute sits on the product-rating__meter div, not on review-stars-on-hover, which is why your list comprehension returned None. Also, the class you mentioned matches 16 records, but only 11 of them contain the product-rating__meter class, so I have added a check: if the product-rating__meter class is available, print its alt value.
Hope this will help.
from bs4 import BeautifulSoup
import requests

data = requests.get('https://store.nike.com/us/en_us/pw/mens-walking-shoes/7puZ9ypZoi3').content
soup = BeautifulSoup(data, 'lxml')
print("Total element count : " + str(len(soup.find_all('div', class_='grid-item-content'))))
for item in soup.find_all('div', class_='grid-item-content'):
    if item.find('div', class_='product-rating__meter'):
        print("Alt value : " + item.find('div', class_='product-rating__meter')['alt'])
Output
Total element count : 16
Alt value : 4
Alt value : 4.3
Alt value : 4.6
Alt value : 4.8
Alt value : 4.4
Alt value : 4.7
Alt value : 4.7
Alt value : 3.8
Alt value : 4.5
Alt value : 3.3
Alt value : 4.5
EDITED
from bs4 import BeautifulSoup
import requests

data = requests.get('https://store.nike.com/us/en_us/pw/mens-walking-shoes/7puZ9ypZoi3').content
soup = BeautifulSoup(data, 'lxml')
print("Total element count : " + str(len(soup.find_all('div', class_='grid-item-content'))))
itemlist = []
for item in soup.find_all('div', class_='grid-item-content'):
    if item.find('div', class_='product-rating__meter'):
        #print("Alt value : " + item.find('div', class_='product-rating__meter')['alt'])
        itemlist.append("Alt value : " + item.find('div', class_='product-rating__meter')['alt'])
print(itemlist)
Output:
Total element count : 16
['Alt value : 4', 'Alt value : 4.3', 'Alt value : 4.6', 'Alt value : 4.8', 'Alt value : 4.4', 'Alt value : 4.7', 'Alt value : 4.7', 'Alt value : 3.8', 'Alt value : 4.5', 'Alt value : 3.3', 'Alt value : 4.5']

You can select by taking the first match for the inner class within each parent class, using the :has() pseudo-class (bs4 4.7.1+) so that only parents that actually contain a rating are kept:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://store.nike.com/us/en_us/pw/mens-walking-shoes/7puZ9ypZoi3')
soup = bs(r.content, 'lxml')
stars = [item.select_one('.product-rating__meter')['alt'] for item in soup.select('.grid-item-box:has(.product-rating__meter)')]
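Since the end goal is a dataframe, here is a minimal sketch of that conversion (assuming pandas is installed; the column name rating is purely illustrative):

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://store.nike.com/us/en_us/pw/mens-walking-shoes/7puZ9ypZoi3')
soup = bs(r.content, 'lxml')
stars = [item.select_one('.product-rating__meter')['alt']
         for item in soup.select('.grid-item-box:has(.product-rating__meter)')]
# alt values are strings such as '4.3'; cast to float for numeric work
df = pd.DataFrame({'rating': [float(s) for s in stars]})
print(df)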

You can write something like the below to retrieve all divs that have an "alt" attribute (soup being your BeautifulSoup object):
divs = soup.find_all("div", {"alt": True})
And to retrieve the value:
for x in divs:
    print(x["alt"])
Or directly, like below, if you only want the first "alt":
alt = soup.find("div", {"alt": True})["alt"]
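A self-contained sketch of that idea against the same Nike URL used above; note that in this markup the rating meters and the review counts both carry alt, so this prints values like 4 and 95 interleaved:

import requests
from bs4 import BeautifulSoup

data = requests.get('https://store.nike.com/us/en_us/pw/mens-walking-shoes/7puZ9ypZoi3').content
soup = BeautifulSoup(data, 'lxml')
# every div carrying an alt attribute: rating meters and review counts alike
for div in soup.find_all('div', {'alt': True}):
    print(div['alt'])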

Related

searching a pandas dataframe using PySimpleGUI

How do I update my listbox using PySimpleGUI, as below, so that when the user input is "AP" the listbox suggests "APPLE"?
The issue, as I see it, is updating the listbox from the pandas dataframe while the user is typing.
import PySimpleGUI as sg
import pandas as pd

sg.theme('Default1')
pd.set_option('display.max_rows', None)
formulation = pd.read_csv('FORMULATIONS.csv')
names = formulation["PRODUCT"]
#names=product.values.tolist()
#names=formulation["PRODUCT"].to_string
#left side search
left_col = [[sg.Text('SEARCH')],
            [sg.Input(size=(20,1), enable_events=True, key='-INPUT-', do_not_clear=True)],
            [sg.Listbox(names, size=(50, len(names)), key='-LIST-', enable_events=True)]]
#right side batch sheet
right_col = [[sg.Text('Product : \n \n ITEM | RAW MATERIAL |')],
             [sg.Text(size=(40,1), key='-TOUT-')]]
#together
layout = [[sg.Column(left_col, element_justification='c'), sg.VSeperator(), sg.Column(right_col)]]
#create window
window = sg.Window('BF-2.1', layout, resizable=True)
#event loop
while True:
    event, values = window.Read()
    if event in (sg.WIN_CLOSED, 'Exit'):
        break
    if values['-INPUT-'] != '':
        search = values['-INPUT-']
        new_values = [formulation["PRODUCT"]==['-INPUT-']] #how to use the input to navigate
        window.Element('-LIST-').Update(new_values)
    else:
        window.Element('-LIST-').Update(names)
    if event == '-LIST-' and len(values['-LIST-']):
        sg.popup('Selected', values['-LIST-'])
window.close()
I have tried:
new_values = [x for x in names if search in x]
Use df.loc[df["PRODUCT"]==text]["KIND"] to get the items that match the text.
The following code demonstrates how to take the event from the Input element, filter the DataFrame, and update the Listbox element. Try typing 'car', 'motorcycle' or 'ship' to see the Listbox update, or empty the Input to get all items back.
import pandas as pd
import PySimpleGUI as sg

data = [
    ["car", item] for item in ('SUV', 'Hatchback', 'Crossover', 'Convertible', 'Sedan', 'Sports Car', 'Coupe', 'Minivan', 'Station Wagon', 'Pickup Truck')] + [
    ["motorcycle", item] for item in ('standard', 'cruiser', 'touring', 'sports', 'off-road', 'dual-purpose')] + [
    ["ship", item] for item in ('Container', 'Bulk Carrier', 'Tanker', 'Passenger', 'Naval', 'Offshore', 'Special Purpose')]
df = pd.DataFrame(data, columns=["PRODUCT", "KIND"])

left_col = [
    [sg.Text('SEARCH')],
    [sg.Input(size=20, enable_events=True, key='-INPUT-')],
    [sg.Listbox(df["KIND"], size=(50, 10), key='-LIST-', enable_events=True)],
]
right_col = [
    [sg.Text('Product : \n \n ITEM | RAW MATERIAL |')],
    [sg.Text(size=40, key='-TOUT-')],
]
layout = [
    [sg.Column(left_col, element_justification='c'),
     sg.VSeperator(),
     sg.Column(right_col)],
]
window = sg.Window('BF-2.1', layout, resizable=True)

while True:
    event, values = window.read()
    if event == sg.WIN_CLOSED:
        break
    if event == '-INPUT-':
        if values[event]:
            text = values['-INPUT-'].lower()
            new_values = df.loc[df["PRODUCT"]==text]["KIND"]
            window['-LIST-'].update(new_values)
        else:
            new_values = df["KIND"]
            window['-LIST-'].update(new_values)
    if event == '-LIST-' and len(values['-LIST-']):
        sg.popup('Selected', values['-LIST-'])
window.close()
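Note that df["PRODUCT"]==text only matches the product name exactly. For the as-you-type behaviour in the question ("AP" suggesting "APPLE"), a substring filter is a small change; a sketch, replacing the exact-equality lines inside the loop above:

# case-insensitive substring match instead of exact equality
mask = df["PRODUCT"].str.contains(text, case=False, na=False)
new_values = df.loc[mask]["KIND"]
window['-LIST-'].update(new_values)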

Selenium/Webscrape this field

My code runs fine and prints the title for all rows except the rows with dropdowns.
For example, row 4 has a dropdown if clicked. I implemented a try block which would, in theory, trigger the dropdown and then pull the titles.
But when I execute click() and try to print, the rows with these dropdowns are not printed.
Expected output: print all titles, including the ones in the dropdowns.
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()
driver.get('https://cslide.ctimeetingtech.com/esmo2021/attendee/confcal/session/list')
time.sleep(4)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
productlist = soup.find_all('div', class_='card item-container session')
for property in productlist:
    sessiontitle = property.find('h4', class_='session-title card-title').text
    print(sessiontitle)
    try:
        ifDropdown = driver.find_elements_by_class_name('item-expand-action expand')
        ifDropdown.click()
        time.sleep(4)
        newTitle = driver.find_element_by_class_name('card-title').text
        print(newTitle)
    except:
        newTitle = 'none'
This sidesteps clicking altogether: each expandable card's .item-expand-action carries an href to its own page, so the inner titles can be fetched with plain requests.
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_soup(content):
    return BeautifulSoup(content, 'lxml')

def my_filter(req, content):
    # follow the card's expand link and collect the titles inside it
    try:
        r = req.get(content['href'])
        soup = get_soup(r.text)
        return [x.text for x in soup.select('.card-title')[1:]]
    except TypeError:
        # cards without an expand link pass content as None
        return 'N/A'

def main(url):
    with requests.Session() as req:
        for page in range(1, 2):
            print(f"Extracting Page# {page}\n")
            params = {
                "p": page
            }
            r = req.get(url, params=params)
            soup = get_soup(r.text)
            goal = {x.select_one('.session-title').text: my_filter(
                req, x.select_one('.item-expand-action')) for x in soup.select('.card')}
            df = pd.DataFrame(goal.items(), columns=['Title', 'Menu'])
            print(df)

main('https://cslide.ctimeetingtech.com/esmo2021/attendee/confcal/session/list')
Output:
Title Menu
0 Educational sessions on-demand N/A
1 Special Symposia on-demand N/A
2 Multidisciplinary sessions on-demand N/A
3 Illumina - Diagnosing Non-Small Cell Lung Canc... [Illumina gives an update on their IVD road ma...
4 MSD - Homologous Recombination Deficiency: BRC... [Welcome and Introductions, Homologous Recombi...
5 Servier - The clinical value of IDH inhibition... [Isocitric dehydrogenase: an actionable geneti...
6 AstraZeneca - Redefining Breast Cancer – Biolo... [Welcome and Opening, Redefining Breast Cancer...
7 ITM Isotopen Technologien München AG - A Globa... [Welcome & Introduction, Changes in the Incide...
8 MSD - The Role of Biomarkers in Patient Manage... [Welcome and Introductions, The Role of Pd-L1 ...
9 AstraZeneca - Re-evaluating the role of gBRCA ... [Welcome and introduction, What do we know abo...
10 Novartis - Unmet needs in oncogene-driven NSCL... [Welcome and introduction, Unmet needs in onco...
11 Opening session N/A

'int' object has no attribute 'replace' error in python3.x

I don't get why this error occurs, because from my point of view the three columns 'WWBO', 'IBO' and 'DBO' have exactly the same structure, yet when I apply 'replace' only WWBO works. Does it have something to do with fillna?
Need your help!
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

#Read url
URL = "https://www.the-numbers.com/box-office-records/worldwide/all-movies/cumulative/released-in-2019"
data = requests.get(URL).text
#parse url
soup = bs(data, "html.parser")
#find the tables you want
table = soup.findAll("table")[1:]
#read it into pandas
df = pd.read_html(str(table))
#concat both the tables
df = pd.concat([df[0], df[1]])
df = df.rename(columns={'Rank':'Rank',
                        'Movie':'Title',
                        'Worldwide Box Office':'WWBO',
                        'Domestic Box Office':'DBO',
                        'International Box Office':'IBO',
                        'DomesticShare':'Share'})
#drop columns
market = df.drop(columns=['Rank','Share'])
market = market.fillna(0)
#replace $ -> ''
market['WWBO'] = market['WWBO'].map(lambda s: s.replace('$',''))
market['IBO'] = market['IBO'].map(lambda s: s.replace('$',''))
market['DBO'] = market['DBO'].map(lambda s: s.replace('$',''))
market
The error is:
AttributeError: 'int' object has no attribute 'replace'
This is pandas auto-casting the fill value to int: fillna(0) puts integer zeros into otherwise-string columns, so the string replace fails on those cells (WWBO evidently has no missing values, which is why it works). Two solutions: either eliminate the 0 fill value or cast the columns to string, as below.
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

#Read url
URL = "https://www.the-numbers.com/box-office-records/worldwide/all-movies/cumulative/released-in-2019"
data = requests.get(URL).text
#parse url
soup = bs(data, "html.parser")
#find the tables you want
table = soup.findAll("table")[1:]
#read it into pandas
df = pd.read_html(str(table))
#concat both the tables
df = pd.concat([df[0], df[1]])
df = df.rename(columns={'Rank':'Rank',
                        'Movie':'Title',
                        'Worldwide Box Office':'WWBO',
                        'Domestic Box Office':'DBO',
                        'International Box Office':'IBO',
                        'DomesticShare':'Share'})
#drop columns
market = df.drop(columns=['Rank','Share'])
market = market.fillna(0)
#replace $ -> ''
market['WWBO'] = market['WWBO'].map(lambda s: s.replace('$',''))
#cast to string first so the integer zeros from fillna do not break replace
market['IBO'] = market['IBO'].astype(str)
market['IBO'] = market['IBO'].map(lambda s: s.replace('$',''))
market['DBO'] = market['DBO'].astype(str)
market['DBO'] = market['DBO'].map(lambda s: s.replace('$',''))
>> market[['WWBO','IBO','DBO']]
WWBO IBO DBO
0 2,622,240,021 1,842,814,023 779,425,998
1 1,121,905,659 696,535,598 425,370,061
2 692,163,684 692,163,684 0
3 518,883,574 358,491,094 160,392,480
4 402,976,036 317,265,826 85,710,210
5 358,234,705 220,034,625 138,200,080
6 342,904,508 231,276,537 111,627,971
7 326,150,303 326,150,303 0
8 293,766,097 192,548,368 101,217,729
9 255,832,826 255,832,826 0
10 253,940,650 79,203,380 174,737,270
11 245,303,505 134,268,500 111,035,005
12 190,454,964 84,648,456 105,806,508
13 155,313,390 98,312,634 57,000,756
Clearly one or more of these fields (market['WWBO'], market['IBO'], market['DBO']) contain integer values, and you are trying to perform a string operation, i.e. replace, on them; that is why it throws
AttributeError: 'int' object has no attribute 'replace'
You could first print those values and see what they are, or if you have many, it is better to perform a type check first, like:
if market['WWBO'].dtype == object:
    market['WWBO'].map(lambda s: s.replace('$',''))
else:
    pass
Let me know if this works for you or not.
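As a side note, a more idiomatic pandas sketch avoids the per-cell lambda entirely by casting and using the vectorised string methods (assuming the market frame built above):

# cast everything to str, then strip '$' in one vectorised pass per column
for col in ('WWBO', 'IBO', 'DBO'):
    market[col] = market[col].astype(str).str.replace('$', '', regex=False)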

How to get the value of a cell in an HTML page when clicking each link in a list of links?

I have a list of about 5,000 links.
For example, 2 of the 5,000 links:
https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019
https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019
...
I want to get the value in the Time of Day column of the Finish row of each link.
Ex:
09:51:07 AM - https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019
07:50:55 AM - https://racevietnam.com/runner/ngocsondknb/ecopark-marathon-2019
I have scraped user info from another website, where the elements had an id and class. But the table in https://racevietnam.com/runner/ngocsondknb/ecopark-marathon-2019 has no id or class, so I can't do the same there.
#!/usr/bin/python
from urllib.request import urlopen
from bs4 import BeautifulSoup

list_user = []
for userID in range(1, 100000):
    link = "https://example.com/member.php?u=" + str(userID)
    html = urlopen(link)
    bsObj = BeautifulSoup(html, "lxml")
    user_name = bsObj.find("div", {"id":"main_userinfo"}).h1.get_text()
    list_user.append(user_name)
    print("username", userID, "is: ", user_name)
    with open("result.txt", "a") as myfile:
        myfile.write(user_name)
Please help me.
Thank you.
Using bs4 4.7.1.
There is only one table, and you want the second column (td) of the last row. You can use :last-child to select the last row; this should be used in conjunction with the tbody type selector and the > child combinator, so as not to match the header row. You can use :nth-of-type to specify which td cell to return.
Now you may wish to develop this in at least two ways:
Handle cases where elements are not found, e.g.
name = getattr(soup.select_one('title'), 'text', 'N/A')
timing = getattr(soup.select_one('tbody > tr:last-child td:nth-of-type(2)'), 'text', 'N/A')
Add items to lists/a data structure, which can be output as a dataframe at the end and written out as csv. Or you may wish to stick with your current method.
Python:
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019', 'https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019']

with requests.Session() as s:
    for url in urls:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        name = soup.select_one('title').text
        timing = soup.select_one('tbody > tr:last-child td:nth-of-type(2)').text
        print(name, timing)
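Combining both suggestions above, a sketch that guards against missing elements and writes the results out as csv via pandas (the output filename is hypothetical):

import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

urls = ['https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019',
        'https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019']
rows = []
with requests.Session() as s:
    for url in urls:
        soup = bs(s.get(url).content, 'lxml')
        # getattr falls back to 'N/A' when select_one returns None
        name = getattr(soup.select_one('title'), 'text', 'N/A')
        timing = getattr(soup.select_one('tbody > tr:last-child td:nth-of-type(2)'), 'text', 'N/A')
        rows.append({'url': url, 'name': name, 'finish_time_of_day': timing})
pd.DataFrame(rows).to_csv('finish_times.csv', index=False)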
This is my code.
It's working OK.
import requests
from bs4 import BeautifulSoup

f = open("input.ecopark", "r")
f_content = f.readlines()
f.close()

for url in f_content:
    r = requests.get(url.rstrip())
    soup = BeautifulSoup(r.text, 'html.parser')
    result = soup.select("table tbody tr td")
    x = ""
    for i in result:
        if not x:
            # scan cells until the "Finish" label is found
            if i.get_text() == "Finish":
                x = 1
                continue
        if x:
            # the next cell after "Finish" is the Time of Day value
            print(url.rstrip() + " " + i.get_text())
            break

How to get ASINs XPATH from 2 different Amazon pages that have the same parent nodes?

I made a web scraping program using Python and webdriver, and I want to extract the ASIN from 2 different pages. I would like one XPath to work for both links at the same time.
These are the Amazon pages: https://www.amazon.com/Hydro-Flask-Wide-Mouth-Flip/dp/B01ACATW7E/ref=sr_1_3?s=kitchen&ie=UTF8&qid=1520348607&sr=1-3&keywords=-gfds and
https://www.amazon.com/Ubbi-Saving-Special-Required-Locking/dp/B00821FLSU/ref=sr_1_1?s=baby-products&ie=UTF8&qid=1520265799&sr=1-1&keywords=-hgfd&th=1. They have the same parent nodes (id, classes). How can I make this program work for these 2 links at the same time?
The problem is on these lines of code, 36 and 41:
asin = driver.find_element_by_xpath('//div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[4]').text
and
asin = driver.find_element_by_xpath('//div[@id="detail-bullets_feature_div"]/div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[5]').text. I have to change these lines so the csv gets the ASINs for both products. For the first link it prints the wrong information, and for the second it prints the ASIN.
I attached the code. I will appreciate any help.
from selenium import webdriver
import csv
import io

# set the proxies to hide actual IP
proxies = {
    'http': 'http://5.189.133.231:80',
    'https': 'https://27.111.43.178:8080'
}
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server="%s"' % ';'.join(['%s=%s' % (k, v) for k, v in proxies.items()]))
driver = webdriver.Chrome(executable_path="C:\\Users\\Andrei-PC\\Downloads\\webdriver\\chromedriver.exe",
                          chrome_options=chrome_options)
header = ['Product title', 'ASIN']
with open('csv/bot_1.csv', "w") as output:
    writer = csv.writer(output)
    writer.writerow(header)
links = ['https://www.amazon.com/Hydro-Flask-Wide-Mouth-Flip/dp/B01ACATW7E/ref=sr_1_3?s=kitchen&ie=UTF8&qid=1520348607&sr=1-3&keywords=-gfds',
         'https://www.amazon.com/Ubbi-Saving-Special-Required-Locking/dp/B00821FLSU/ref=sr_1_1?s=baby-products&ie=UTF8&qid=1520265799&sr=1-1&keywords=-hgfd&th=1'
         ]
for i in range(len(links)):
    driver.get(links[i])
    product_title = driver.find_elements_by_xpath('//*[@id="productTitle"][1]')
    prod_title = [x.text for x in product_title]
    try:
        asin = driver.find_element_by_xpath('//div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[4]').text
    except:
        print('no ASIN template one')
    try:
        asin = driver.find_element_by_xpath('//div[@id="detail-bullets_feature_div"]/div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[5]').text
    except:
        print('no ASIN template two')
    try:
        data = [prod_title[0], asin]
    except:
        print('no items v3 ')
    with io.open('csv/bot_1.csv', "a", newline="", encoding="utf-8") as output:
        writer = csv.writer(output)
        writer.writerow(data)
You can simply use
//li[b="ASIN:"]
to get the required element on both pages.
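A sketch of plugging that XPath into the loop above; the matched li contains both the label and the value (e.g. "ASIN: B01ACATW7E"), so stripping the "ASIN:" prefix is an assumption about the page text:

try:
    # //li[b="ASIN:"] matches the list item whose bold label is "ASIN:" on both templates
    asin = driver.find_element_by_xpath('//li[b="ASIN:"]').text.replace('ASIN:', '').strip()
except:
    print('no ASIN found')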