With this code:
match_url = f'https://interativos.globoesporte.globo.com/cartola-fc/mais-escalados/mais-escalados-do-cartola-fc'
browser.visit(match_url)
browser.find_by_tag('li[class="historico-rodadas__rodada historico-rodadas__rodada--ativa"]').click()
soup = BeautifulSoup(browser.html, 'html.parser')
innerContent = soup.findAll('ul',class_="field__players")
print (innerContent)
I've managed to fetch the <ul>:
[<ul class="field__players"><li class="player"...]
Now how can I access text for player__name and player__value for all players in the list?
This should help you:
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
driver.get('https://interativos.globoesporte.globo.com/cartola-fc/mais-escalados/mais-escalados-do-cartola-fc')
src = driver.page_source
driver.close()
soup = BeautifulSoup(src,'html5lib')
innerContent = soup.find('ul',class_="field__players")
li_items = innerContent.find_all('li')
for li in li_items:
    p_tags = li.find_all('p')[:-1]  # [:-1] drops the last <p> tag, which is player__label
    for p in p_tags:
        print(p.text)
Output:
Keno
2.868.755
Pedro
2.483.069
Bruno Henrique
1.686.894
Hugo Souza
809.186
Guilherme Arana
1.314.769
Filipe Luís
776.147
Thiago Galhardo
2.696.853
Vinícius
1.405.012
Nenê
1.369.209
Jorge Sampaoli
1.255.731
Réver
1.505.522
Víctor Cuesta
1.220.451
I should just put this here to show you what he wants.
soup = BeautifulSoup(browser.html, 'html.parser')
innerContent = soup.find('ul', class_="field__players")  # find() (not findAll) so we get a single Tag to search
for li in innerContent.findAll('li'):
    player_name = li.find('p', class_="player__name")
    player_value = li.find('p', class_="player__value")
    print(player_name.text)
    print(player_value.text)
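If you want the names and values paired up rather than just printed, a small variation (my own sketch, not from the answer above) could collect them into a dict:

players = {}
for li in innerContent.findAll('li'):
    name = li.find('p', class_="player__name").text
    value = li.find('p', class_="player__value").text
    players[name] = value

print(players)  # e.g. {'Keno': '2.868.755', 'Pedro': '2.483.069', ...}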
I'm trying to make a crawler for YouTube.
I encountered strange behavior.
In the following source code, driver.page_source is obtained by Selenium, and I pass the result to BeautifulSoup for parsing.
The problem is that the length of driver.page_source changes between calls.
How can this happen? Does anyone have an idea about this?
elif 'src' in seq:
    print('video-src')
    print(seq['src'])

soup = bs(driver.page_source, "html.parser")
print('driver.page_source length='+str(len(driver.page_source)))
f = open('test.txt','w',encoding='UTF-8')
f.write(driver.page_source)
f.close()
print('driver.page_source length='+str(len(driver.page_source)))
tag = '<span dir="auto" class="style-scope yt-formatted-string">'
find_start = driver.page_source.find(tag+'댓글')
print('driver.page_source length='+str(len(driver.page_source)))
tag_value = driver.page_source[find_start:find_start+200]
print('driver.page_source length='+str(len(driver.page_source)))
p = re.compile('\>([\d,]+)\<')
m = p.search(tag_value)
if m:
    print(m.group(1))
    video[item['name']] = m.group(1)
else:
    print('error')
    print(tag_value)
driver.page_source length=4103114
driver.page_source length=4102392
driver.page_source length=4102392
driver.page_source length=4103129
The page_source can change; elements can be loaded later.
Instead of checking the page_source length, you can save the different driver.page_source snapshots in text files and compare them to understand what is different. A method to do so could be using difflib:
import difflib

source1 = driver.page_source
file1 = open("file1.txt", "w")
file1.write(source1)
file1.close()  # close the file object (write() returns a character count, not a file)

source2 = driver.page_source
file2 = open("file2.txt", "w")
file2.write(source2)
file2.close()

with open('file1.txt') as file_1:
    file_1_text = file_1.readlines()

with open('file2.txt') as file_2:
    file_2_text = file_2.readlines()

# Find and print the diff:
for line in difflib.unified_diff(
        file_1_text, file_2_text, fromfile='file1.txt',
        tofile='file2.txt', lineterm=''):
    print(line)
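If the goal is one consistent snapshot rather than a diff, a minimal sketch (my own suggestion, not part of the answer above) is to wait for the element you care about and then read page_source exactly once; the URL is a placeholder and the selector is an assumption based on the span class used in the question:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.youtube.com/watch?v=VIDEO_ID')  # hypothetical URL

# Wait until the span the question searches for is present, then take a
# single snapshot and reuse it instead of calling driver.page_source again.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'span.style-scope.yt-formatted-string')))
src = driver.page_source  # one consistent snapshot to parse and search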
How can I get the data-video-id attribute from the below HTML using BeautifulSoup?
<a href="/watch/36242552" class="thumbnail video vod-show play-video-trigger user-can-watch" data-video-id="36242552" data-video-type="show">
The following prints an empty list.
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")
ids = [tag['data-video-id'] for tag in soup.select('a href[data-video-id]')]
print(ids)
Output:
[]
You are getting an empty [] because soup.select('a href[data-video-id]') returns nothing ('a href' is not a valid CSS selector). You could try the code below. Hope it helps you.
from bs4 import BeautifulSoup

html = """<a href="/watch/36242552" class="thumbnail video vod-show play-video-trigger user-can-watch" data-video-id="36242552" data-video-type="show">"""
# html_content = requests.get(url).text
soup = BeautifulSoup(html, "lxml")
print(soup.select('a href[data-video-id]'))  # [] -- 'a href' is not a valid selector

ids = [tag['data-video-id'] for tag in soup.select('a') if tag.has_attr('data-video-id')]
print(ids)
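As a side note (my addition, not part of the answer above), a CSS attribute selector also works here and avoids filtering by hand:

from bs4 import BeautifulSoup

html = """<a href="/watch/36242552" class="thumbnail video vod-show play-video-trigger user-can-watch" data-video-id="36242552" data-video-type="show">"""
soup = BeautifulSoup(html, "lxml")

# 'a[data-video-id]' matches only <a> tags that actually have the attribute.
ids = [tag['data-video-id'] for tag in soup.select('a[data-video-id]')]
print(ids)  # ['36242552']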
I am trying to learn scraping with Selenium while parsing the page_source with BS4's "html.parser". I have all the tags that contain an h2 tag and a class name, but extracting the text from them doesn't seem to work.
import os
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as soup
opts = webdriver.ChromeOptions()
opts.binary_location = os.environ.get('GOOGLE_CHROME_BIN', None)
opts.add_argument("--headless")
opts.add_argument("--disable-dev-shm-usage")
opts.add_argument("--no-sandbox")
browser = webdriver.Chrome(executable_path="chromedriver", options=opts)
url1='https://www.animechrono.com/date-a-live-series-watch-order'
browser.get(url1)
req = browser.page_source
sou = soup(req, "html.parser")
h = sou.find_all('h2', class_='heading-5')
p = sou.find_all('div', class_='text-block-5')
for i in range(len(h)):
    h[i] == h[i].getText()

for j in range(len(p)):
    p[j] = p[j].getText()

print(h)
print(p)
browser.quit()
My output:
[<h2 class="heading-5">Season 1</h2>, <h2 class="heading-5">Date to Date OVA</h2>, <h2 class="heading-5">Season 2</h2>, <h2 class="heading-5">Kurumi Star Festival OVA</h2>, <h2 class="heading-5">Date A Live Movie: Mayuri Judgement</h2>, <h2 class="heading-5">Season 3</h2>, <h2 class="heading-5">Date A Bullet: Dead or Bullet Movie</h2>, <h2 class="heading-5">Date A Bullet: Nightmare or Queen Movie</h2>]
['Episodes 1-12', 'Date to Date OVA', 'Episodes 1-10', 'Kurumi Star Festival OVA', 'Date A Live Movie: Mayuri Judgement', 'Episodes 1-12', 'Date A Bullet: Dead or Bullet Movie', 'Date A Bullet: Nightmare or Queen Movie']
Your first loop uses == (comparison) instead of = (assignment), so the items in h are never replaced with their text. Add this line before browser.quit():
h = [elem.text for elem in h]
print(h)
Full code:
import os
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as soup
opts = webdriver.ChromeOptions()
opts.binary_location = os.environ.get('GOOGLE_CHROME_BIN', None)
opts.add_argument("--headless")
opts.add_argument("--disable-dev-shm-usage")
opts.add_argument("--no-sandbox")
browser = webdriver.Chrome(executable_path="chromedriver", options=opts)
url1='https://www.animechrono.com/date-a-live-series-watch-order'
browser.get(url1)
req = browser.page_source
sou = soup(req, "html.parser")
h = sou.find_all('h2', class_='heading-5')
p = sou.find_all('div', class_='text-block-5')
for j in range(len(p)):
    p[j] = p[j].getText()

h = [elem.text for elem in h]
print(h)
browser.quit()
Output:
['Season 1', 'Date to Date OVA', 'Season 2', 'Kurumi Star Festival OVA', 'Date A Live Movie: Mayuri Judgement', 'Season 3', 'Date A Bullet: Dead or Bullet Movie', 'Date A Bullet: Nightmare or Queen Movie']
I have a list of about 5,000 links.
For example, 2 of the 5,000 links:
https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019
https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019
...
For each link, I want to get the value in the Time Of Day column of the Finish row.
Example:
09:51:07 AM - https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019
07:50:55 AM - https://racevietnam.com/runner/ngocsondknb/ecopark-marathon-2019
I have scraped user info from another website before, where the elements had an id and class. But the table in https://racevietnam.com/runner/ngocsondknb/ecopark-marathon-2019 has no id or class, so I can't use the same approach.
#!/usr/bin/python
from urllib.request import urlopen
from bs4 import BeautifulSoup
list_user = []
for userID in range(1, 100000):
    link = "https://example.com/member.php?u=" + str(userID)
    html = urlopen(link)
    bsObj = BeautifulSoup(html, "lxml")
    user_name = bsObj.find("div", {"id":"main_userinfo"}).h1.get_text()
    list_user.append(user_name)
    print("username", userID, "is: ", user_name)
    with open("result.txt", "a") as myfile:
        myfile.write(user_name)
Please help me.
Thank you.
Using bs4 4.7.1.
There is only one table and you want the second column (td) of the last row. You can use :last-child to select the last row; this should be used in conjunction with the tbody type selector and the > child combinator, so as not to get the header row. You can use :nth-of-type to specify which td cell to return.
Now you may wish to develop this in at least two ways:
Handle cases where the element is not found, e.g.
name = getattr(soup.select_one('title'), 'text', 'N/A')
timing = getattr(soup.select_one('tbody > tr:last-child td:nth-of-type(2)'), 'text', 'N/A')
Add items to a list or other data structure, which can be turned into a dataframe at the end and written out as CSV (a sketch of this follows the Python example below). Or you may wish to stick with your current method.
Python:
import requests
from bs4 import BeautifulSoup as bs
urls = ['https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019', 'https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019']
with requests.Session() as s:
    for url in urls:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        name = soup.select_one('title').text
        timing = soup.select_one('tbody > tr:last-child td:nth-of-type(2)').text
        print(name, timing)
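A minimal sketch of the second suggestion above (collecting the results and writing them out as CSV with pandas); the column names and output filename are my own choices, not from the original answer:

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019',
        'https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019']

records = []
with requests.Session() as s:
    for url in urls:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        # Fall back to 'N/A' when a page is missing the expected elements.
        name = getattr(soup.select_one('title'), 'text', 'N/A')
        timing = getattr(soup.select_one('tbody > tr:last-child td:nth-of-type(2)'), 'text', 'N/A')
        records.append({'url': url, 'name': name, 'finish_time_of_day': timing})

df = pd.DataFrame(records)
df.to_csv('results.csv', index=False)
print(df)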
This is my code.
It's working Ok.
import requests
from bs4 import BeautifulSoup
f = open("input.ecopark","r")
f_content = f.readlines()
f.close()
for url in f_content:
    r = requests.get(url.rstrip())
    soup = BeautifulSoup(r.text, 'html.parser')
    result = soup.select("table tbody tr td")
    x = ""
    for i in result:
        if not x:
            if i.get_text() == "Finish":
                x = 1
                continue
        if x:
            print(url.rstrip() + " " + i.get_text())
            break
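Since the table has no id or class, another option is pandas.read_html, which parses every <table> on the page into a DataFrame. A minimal sketch, assuming the splits table is the first table on the page, that its first column holds the row labels, and that the relevant header is literally "Time Of Day" (all assumptions on my part):

import pandas as pd
import requests

url = 'https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019'
html = requests.get(url).text

tables = pd.read_html(html)   # one DataFrame per <table> on the page
splits = tables[0]            # assumption: the splits table is the first one

# Assumption: row labels ("Finish", ...) live in the first column and the
# column header is exactly "Time Of Day".
finish = splits[splits.iloc[:, 0] == 'Finish']
print(finish['Time Of Day'].iloc[0])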
I have fetched the HTML data from a site and am trying to extract the product URLs:
def get_soup(url):
    soup = None
    response = requests.get(url)
    if response.status_code == 200:
        html = response.content
        soup = BeautifulSoup(html, "html.parser")
    return soup

def get_category_urls(url):
    soup = get_soup(url)
    cat_urls = []
    categories = soup.find('div', attrs={'id': 'menu_oc'})
    if categories is not None:
        for c in categories.findAll('a'):
            if c['href'] is not None:
                cat_urls.append(c['href'])
    return cat_urls

def get_product_urls(url):
    soup = get_soup(url)
    prod_urls = []
    if soup.find('div', attrs={'class': 'pagination'}):
        for link in soup.select('div.links a'):
            if link.string.isdecimal():  # dump next and last links
                prod_urls.append(link['href'])
    print("Found following product urls::", prod_urls)
    return prod_urls

if __name__ == '__main__':
    category_urls = get_category_urls(URL)
    product_urls = get_product_urls(URL)
How can I efficiently identify the pagination condition in the code above?
Screenshots of the actual site with and without pagination, plus the site link and the paginated category link, were attached to the original post.
This should be okay:
from bs4 import BeautifulSoup
import requests

def get_soup(url):
    soup = None
    response = requests.get(url)
    if response.status_code == 200:
        html = response.content
        soup = BeautifulSoup(html, "html.parser")
    return soup

def get_category_urls(url):
    soup = get_soup(url)
    cat_urls = []
    categories = soup.find('div', attrs={'id': 'menu_oc'})
    if categories is not None:
        for c in categories.findAll('a'):
            if c['href'] is not None:
                cat_urls.append(c['href'])
    return cat_urls

def get_all_products(url):
    prod_urls = []
    soup = get_soup(url)
    prod_urls.extend(get_product_urls(soup))  # extend keeps prod_urls a flat list of urls
    links = get_pagination(soup)
    print("Found those pages:", links)
    if not links:
        return prod_urls
    for link in links:
        soup = get_soup(link)
        prod_urls.extend(get_product_urls(soup))
    print("Found following product urls:", prod_urls)
    return prod_urls

def get_product_urls(soup):
    links = soup.select('div.product-list .span .name a')
    return [link['href'] for link in links]

def get_pagination(soup):
    # select() returns an empty list when there is no div.pagination block,
    # so pages without pagination are handled without a separate check.
    pages = soup.select('div.pagination div.links a')
    return [link['href'] for link in pages if link.string.isdecimal()]

if __name__ == '__main__':
    URL = 'http://www.example.com/shop/index.php?route=product/category&path=63_64'
    category_urls = get_category_urls(URL)
    product_urls = get_all_products(URL)