How to find_all(data-video-id) from HTML using Beautiful Soup

How can I get the data-video-id attribute from the below HTML using BeautifulSoup?
<a href="/watch/36242552" class="thumbnail video vod-show play-video-trigger user-can-watch" data-video-id="36242552" data-video-type="show">
The following prints an empty list.
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")
ids = [tag['data-video-id'] for tag in soup.select('a href[data-video-id]')]
print(ids)
Output:
[]

You are getting an empty [] because soup.select('a href[data-video-id]') returns nothing: that selector looks for an <href> tag nested inside an <a>, which doesn't exist. Select the <a> tags that carry the attribute instead. You could try the code below. Hope it helps.
from bs4 import BeautifulSoup

html = """<a href="/watch/36242552" class="thumbnail video vod-show play-video-trigger user-can-watch" data-video-id="36242552" data-video-type="show">"""
# html_content = requests.get(url).text
soup = BeautifulSoup(html, "lxml")

# 'a href[data-video-id]' matches nothing
print(soup.select('a href[data-video-id]'))  # []

# 'a[data-video-id]' selects the <a> tags that have the attribute
ids = [tag['data-video-id'] for tag in soup.select('a[data-video-id]')]
print(ids)  # ['36242552']
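Since the question title mentions find_all, the same filtering also works without CSS selectors; a minimal sketch on the same snippet:
# find_all can filter on the mere presence of an attribute by passing True
ids = [tag['data-video-id'] for tag in soup.find_all('a', attrs={'data-video-id': True})]
print(ids)  # ['36242552']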

Related

Using BeautifulSoup to exploit a URL and its dependent pages and store results in csv?

This code does not crash, which is good. However, it generates an empty icao_publications.csv file. I want to populate icao_publications.csv with all the records from all the pages at the URL, capturing every page. The dataset should be about 10,000 rows or thereabouts in all, and I want all of those rows in the CSV file.
import requests, csv
from bs4 import BeautifulSoup

url = 'https://www.icao.int/publications/DOC8643/Pages/Search.aspx'

with open('Test1_Aircraft_Type_Designators.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Manufacturers", "Model", "Type_Designator", "Description", "Engine_Type", "Engine_Count", "WTC"])
    while True:
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'html.parser')
        for row in soup.select('table tbody tr'):
            writer.writerow([c.text if c.text else '' for c in row.select('td')])
        if soup.select_one('li.paginate_button.active + li a'):
            url = soup.select_one('li.paginate_button.active + li a')['href']
        else:
            break
The search page loads its data from a JSON endpoint, so you can request that endpoint directly instead of scraping the rendered HTML. Here you go:
import requests
import pandas as pd

# The aircraft type data behind the search page is served as JSON from this endpoint
url = 'https://www4.icao.int/doc8643/External/AircraftTypes'
resp = requests.post(url).json()

# Load the records into a DataFrame and write them straight to CSV
df = pd.DataFrame(resp)
df.to_csv('aircraft.csv', encoding='utf-8', index=False)
print('Saved to aircraft.csv')
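If you would rather stay with the csv module from the original attempt, here is a minimal sketch under the same assumption the pandas version makes (the endpoint returns a JSON list of flat records; the column names are whatever keys the endpoint provides):
import csv
import requests

url = 'https://www4.icao.int/doc8643/External/AircraftTypes'
records = requests.post(url).json()  # assumed: a list of dicts, one per aircraft type

# Take the header from the keys of the first record and write every record out
with open('aircraft.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=list(records[0].keys()))
    writer.writeheader()
    writer.writerows(records)
print('Saved', len(records), 'rows to aircraft.csv')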

crawling result is null

I'm crawling Rainbow Six Siege (r6s) stats like this:
from bs4 import BeautifulSoup as bs
import requests

bsURL = "https://r6.tracker.network/profile/pc/Spoit.GODSENT"
response = requests.get(bsURL)
html = bs(response.text, 'html.parser')
level = html.find_all(class_='trn-defstat__value')
print(level[0])
This prints:
<div class="trn-defstat__value">
439
</div>
I only want to print the number, so I tried print(level[0].text), but the result was None.
How can I solve this problem?
Just use .string instead of .text like this:
print(level[0].string)
Output:
439
Hope that this helps!
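If the surrounding whitespace bothers you, get_text(strip=True) is another option; a small sketch on the same markup:
from bs4 import BeautifulSoup

html = '<div class="trn-defstat__value">\n 439\n</div>'
soup = BeautifulSoup(html, 'html.parser')
level = soup.find_all(class_='trn-defstat__value')

# strip=True trims the whitespace around the text node
print(level[0].get_text(strip=True))  # 439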

Beautiful Soup - get text from all <li> elements in <ul>

With this code:
match_url = f'https://interativos.globoesporte.globo.com/cartola-fc/mais-escalados/mais-escalados-do-cartola-fc'
browser.visit(match_url)
browser.find_by_tag('li[class="historico-rodadas__rodada historico-rodadas__rodada--ativa"]').click()
soup = BeautifulSoup(browser.html, 'html.parser')
innerContent = soup.findAll('ul',class_="field__players")
print (innerContent)
I've managed to fetch the <ul>:
[<ul class="field__players"><li class="player"...]
Now how can I access text for player__name and player__value for all players in the list?
This should help you:
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get('https://interativos.globoesporte.globo.com/cartola-fc/mais-escalados/mais-escalados-do-cartola-fc')
src = driver.page_source
driver.close()

soup = BeautifulSoup(src, 'html5lib')
innerContent = soup.find('ul', class_="field__players")
li_items = innerContent.find_all('li')
for li in li_items:
    p_tags = li.find_all('p')[:-1]  # [:-1] drops the last <p> tag, which is player__label
    for p in p_tags:
        print(p.text)
Output:
Keno
2.868.755
Pedro
2.483.069
Bruno Henrique
1.686.894
Hugo Souza
809.186
Guilherme Arana
1.314.769
Filipe Luís
776.147
Thiago Galhardo
2.696.853
Vinícius
1.405.012
Nenê
1.369.209
Jorge Sampaoli
1.255.731
Réver
1.505.522
Víctor Cuesta
1.220.451
I'll just put this here to show how to get exactly what was asked for:
soup = BeautifulSoup(browser.html, 'html.parser')
innerContent = soup.find('ul', class_="field__players")  # find() returns the tag itself, not a list
for li in innerContent.findAll('li'):
    player_name = li.find('p', class_="player__name")
    player_value = li.find('p', class_="player__value")
    print(player_name.text)
    print(player_value.text)
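If you want the result as data rather than printed lines, collecting each name/value pair into a dict is a small extension of the same loop (a sketch reusing the soup and class names from the snippet above):
# Collect every player as a dict instead of printing the fields separately
players = []
for li in soup.find('ul', class_="field__players").findAll('li'):
    players.append({
        "name": li.find('p', class_="player__name").text,
        "value": li.find('p', class_="player__value").text,
    })
print(players)  # e.g. [{'name': 'Keno', 'value': '2.868.755'}, ...]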

automatic crawling web site

I got help here with the code below to crawl law.go.kr. Now I'm trying to crawl other websites like http://lawbot.org, http://law.go.kr, and https://casenote.kr, but the problem is that I have little understanding of HTML. I understand all of the code below and how its request URL was found, but things are different on other websites. I want to know how to use the code below to crawl other web pages.
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Using requests, get 50 items from the first page. pg=1 is the page number, outmax is the number of items per page
    response = requests.post(
        "http://law.go.kr/precScListR.do?q=*&section=evtNm&outmax=79329&pg=1&fsort=21,10,30&precSeq=0&dtlYn=N")
    # Parse html using BeautifulSoup
    page = BeautifulSoup(response.text, "html.parser")
    # Go through all pages and collect post numbers in items
    items = []
    for i in range(1, 2):
        # Get all links
        links = page.select("#viewHeightDiv .s_tit a")
        # Loop over all links and collect post numbers
        for link in links:
            # Parse post number from "onclick" attribute
            items.append(''.join([n for n in link.attrs["onclick"] if n.isdigit()]))
    # Open all posts and collect them in posts, a list of dicts with keys: number, url, text and title
    posts = []
    for item in items:
        url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
        response = requests.get(url)
        parsed = BeautifulSoup(response.text, "html.parser")
        # 'id': 'contentBody' saves the full text; to save without the title, use 'class': 'pgroup' instead
        text = parsed.find('div', attrs={'id': 'contentBody'}).text
        title = parsed.select_one("h2").text
        posts.append({'number': item, 'url': url, 'text': text, 'title': title})
        with open("D://\LAWGO_DATA/" + item + '.txt', 'w', encoding='utf8') as f:
            f.write(text)
One more example for lawbot.org:
import requests
from bs4 import BeautifulSoup

base_url = 'http://lawbot.org'
search_url = base_url + '/?q=유죄'

response = requests.get(search_url)
page = BeautifulSoup(response.text, "html.parser")
lastPageNumber = int(page.select_one("li.page-item:not(.next):nth-last-child(2)").text)

casesList = []
for i in range(1, lastPageNumber + 1):
    if i > 1:
        response = requests.get(search_url + "&page=" + str(i))
        page = BeautifulSoup(response.text, "html.parser")
    cases = page.select("div.panre_center > ul.media-list li.panre_lists")
    for case in cases:
        title = case.findChild("h6").text
        caseDocNumber = case.findChild(attrs={"class": "caseDocNumber"}).text
        caseCourt = case.findChild(attrs={"class": "caseCourt"}).text
        case_url = base_url + case.findChild("a")['href']
        casesList.append({"title": title, "caseDocNumber": caseDocNumber, "caseCourt": caseCourt, "case_url": case_url})
        # print("title:{}, caseDocNumber:{}, caseCourt:{}, caseUrl:{}".format(title, caseDocNumber, caseCourt, case_url))

for case in casesList:
    response = requests.get(case["case_url"])
    page = BeautifulSoup(response.text, "html.parser")
    body = page.find(attrs={"class": "panre_body"}).text
    print(body)
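The sites differ only in their markup, so the same pattern applies to all of them: fetch a listing page, select the links to the detail pages, then fetch and parse each detail page. Below is a generic sketch of that pattern; every URL and selector in it is a placeholder, not a real one, and has to be replaced after inspecting the target site's HTML with the browser's developer tools:
import requests
from bs4 import BeautifulSoup

LIST_URL = "https://example.com/search?page={page}"  # hypothetical listing URL
LINK_SELECTOR = "ul.results a"                       # hypothetical selector for detail-page links
BODY_SELECTOR = "div.article-body"                   # hypothetical selector for the detail text

def crawl(pages):
    results = []
    for page_no in range(1, pages + 1):
        listing = BeautifulSoup(requests.get(LIST_URL.format(page=page_no)).text, "html.parser")
        for link in listing.select(LINK_SELECTOR):
            detail_url = link["href"]  # assumes absolute hrefs; prepend the base URL if they are relative
            detail = BeautifulSoup(requests.get(detail_url).text, "html.parser")
            body = detail.select_one(BODY_SELECTOR)
            results.append({"url": detail_url, "text": body.text if body else ""})
    return results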

Capture product url data in case of pagination using BeautifulSoup?

I have fetched the HTML data from a site and am trying to fetch the product URLs:
def get_soup(url):
    soup = None
    response = requests.get(url)
    if response.status_code == 200:
        html = response.content
        soup = BeautifulSoup(html, "html.parser")
    return soup

def get_category_urls(url):
    soup = get_soup(url)
    cat_urls = []
    categories = soup.find('div', attrs={'id': 'menu_oc'})
    if categories is not None:
        for c in categories.findAll('a'):
            if c['href'] is not None:
                cat_urls.append(c['href'])
    return cat_urls

def get_product_urls(url):
    soup = get_soup(url)
    prod_urls = []
    if soup.find('div', attrs={'class': 'pagination'}):
        for link in soup.select('div.links a'):
            if link.string.isdecimal():  # dump next and last links
                prod_urls.append(link['href'])
    print("Found following product urls::", prod_urls)
    return prod_urls

if __name__ == '__main__':
    category_urls = get_category_urls(URL)
    product_urls = get_product_urls(URL)
How can I efficiently identify the pagination condition in the above code?
[Screenshots of the site with and without pagination, and links to the site and the paginated category, were attached here.]
This should be okay:
from bs4 import BeautifulSoup
import requests

def get_soup(url):
    soup = None
    response = requests.get(url)
    if response.status_code == 200:
        html = response.content
        soup = BeautifulSoup(html, "html.parser")
    return soup

def get_category_urls(url):
    soup = get_soup(url)
    cat_urls = []
    categories = soup.find('div', attrs={'id': 'menu_oc'})
    if categories is not None:
        for c in categories.findAll('a'):
            if c['href'] is not None:
                cat_urls.append(c['href'])
    return cat_urls

def get_all_products(url):
    prod_urls = []
    soup = get_soup(url)
    prod_urls.append(get_product_urls(soup))
    links = get_pagination(soup)
    print("Found those pages:", links)
    if not links:
        return prod_urls
    for link in links:
        soup = get_soup(link)
        prod_urls.append(get_product_urls(soup))
    print("Found following product urls:", prod_urls)
    return prod_urls

def get_product_urls(soup):
    links = soup.select('div.product-list .span .name a')
    return [link['href'] for link in links]

def get_pagination(soup):
    pages = soup.select('div.pagination div.links a')
    return [link['href'] for link in pages if link.string.isdecimal()]

if __name__ == '__main__':
    URL = 'http://www.example.com/shop/index.php?route=product/category&path=63_64'
    category_urls = get_category_urls(URL)
    product_urls = get_all_products(URL)
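One thing to note about the code above: get_all_products appends whole lists, so prod_urls ends up as a list of lists. If you would rather have one flat list of product URLs, a small variant using extend:
def get_all_products(url):
    # Same logic as above, but extend() keeps prod_urls a flat list of URLs
    soup = get_soup(url)
    prod_urls = list(get_product_urls(soup))
    for link in get_pagination(soup):
        prod_urls.extend(get_product_urls(get_soup(link)))
    return prod_urls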