Is it possible to use bs4 soup object with lxml?

I am trying to use both BS4 and lxml, and instead of parsing the HTML page twice, I'd like to reuse the parsed object. Is there any way to use the soup object in lxml, or vice versa?
self.soup = BeautifulSoup(open(path), "html.parser")
I tried using this object with lxml like this:
doc = html.fromstring(self.soup)
but it throws TypeError: expected string or bytes-like object.
Is there any way to get this kind of usage?

I don't think there is a way without going through a string object.
from bs4 import BeautifulSoup
import lxml.html

html = """
<html><body>
<div>
<p>test</p>
</div>
</body></html>
"""
soup = BeautifulSoup(html, 'html.parser')

# Soup to lxml.html
doc = lxml.html.fromstring(soup.prettify())
print(type(doc))
print(lxml.html.tostring(doc))

# lxml.html to soup
soup = BeautifulSoup(lxml.html.tostring(doc), 'html.parser')
print(type(soup))
print(soup.prettify())
Outputs:
<class 'lxml.html.HtmlElement'>
b'<html>\n <body>\n  <div>\n   <p>\n    test\n   </p>\n  </div>\n </body>\n</html>'
<class 'bs4.BeautifulSoup'>
<html>
 <body>
  <div>
   <p>
    test
   </p>
  </div>
 </body>
</html>
Updated in response to comment:
You can use lxml.etree to iterate through the doc object:
from lxml import etree

# Soup to lxml.etree
doc = etree.fromstring(soup.prettify())
it = doc.getiterator()
for element in it:
    print("%s - %s" % (element.tag, element.text.strip()))

Related

When I am using soup.find('p')['class'], it says Literal['class'] cannot be assigned to the type 'SupportsIndex | slice'

import requests
from bs4 import BeautifulSoup, NavigableString, Tag
url = "https://www.codewithharry.com"
r = requests.get(url)
htmlContent = r.content
soup = BeautifulSoup(htmlContent, 'html.parser')
print(soup.find('p')['class'])
This code shows a warning in VS Code even though the output is correct, so you can silence the type checker on that line as shown below:
print(soup.find('p')['class'])  # type: ignore
Sometimes the warning also appears in the VS Code terminal, and this comment stops it.
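The warning appears because find() is typed in the bs4 stubs as returning Tag | NavigableString | None, and only Tag supports ['class'] indexing. Instead of suppressing it, you can narrow the type so the checker is satisfied; a minimal sketch:

import requests
from bs4 import BeautifulSoup, Tag

r = requests.get("https://www.codewithharry.com")
soup = BeautifulSoup(r.content, 'html.parser')
p = soup.find('p')
if isinstance(p, Tag):  # narrows Tag | NavigableString | None down to Tag
    print(p['class'])   # no type: ignore needed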

Get img src with beautifulsoup4 is returning an empty array response

I'm trying to get the URL src from the page below. For some reason when I try to print out the logo URL, I get [] as a response. My code is as follows:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

url = 'https://growjo.com/industry/Cannabis'
request = Request(
    url,
    headers={'User-Agent': 'Mozilla/5.0'}
)
page = urlopen(request)
page_content_bytes = page.read()
page_html = page_content_bytes.decode("utf-8")
soup = BeautifulSoup(page_html, "html.parser")
company_rows = soup.find_all("table", {"class": "jss31"})[0].find_all("tbody")[0].find_all("tr")
for company in company_rows:
    company_data = company.find_all("td")
    logo = company_data[1].find_all("div", {"class": "lazyload-wrapper"})[0].find_all("a")
    name = company_data[1].text
    print(logo)
    break
I tried printing out the 'a' tags, and I tried 'img'; they all respond with []. It's as if bs4 is not reading within the div class="lazyload-wrapper".
Any help would be greatly appreciated.
The URLs that contain the logos are entirely dynamic: bs4 can't render JavaScript, and the site's API is restricted by authentication. Use an automation tool such as Selenium. Here I use Selenium 4 with bs4, and the driver is installed via webdriver-manager.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_argument("--headless")
# to keep Chrome open after the script ends:
# options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

url = 'https://growjo.com/industry/Cannabis'
driver.get(url)
time.sleep(2)
soup = BeautifulSoup(driver.page_source, "html.parser")
company_rows = soup.select('table.jss31 tbody tr')
for company in company_rows:
    log = company.select_one('td[class="jss38 jss40 jss46"] div + a')
    logo = 'https://growjo.com' + log.get('href') if log else None
    print(logo)
Output:
https://growjo.com/company/Dutchie
https://growjo.com/company/Ascend_Wellness_Holdings
https://growjo.com/company/Hiku_Brands
https://growjo.com/company/C3_Industries
https://growjo.com/company/Jane_Technologies
https://growjo.com/company/Headset
https://growjo.com/company/Jushi_Holdings
https://growjo.com/company/FLOWER_CO.
https://growjo.com/company/Columbia_Care
https://growjo.com/company/Cannabis_Control_Commission
https://growjo.com/company/FIGR
https://growjo.com/company/Leafly
https://growjo.com/company/Hound_Labs
https://growjo.com/company/Leaf_Trade
https://growjo.com/company/Wurk
https://growjo.com/company/Sundial_Cannabis
https://growjo.com/company/BEYOND_%2F_HELLO
https://growjo.com/company/PharmaCann
https://growjo.com/company/LeafLink
https://growjo.com/company/Connected_Cannabis_Co.
https://growjo.com/company/NATURE'S_MEDICINES
https://growjo.com/company/Althea_Group
https://growjo.com/company/CURE_Pharmaceutical
https://growjo.com/company/urban-gro
https://growjo.com/company/NABIS
None
https://growjo.com/company/Medisun
https://growjo.com/company/Mammoth_Distribution
https://growjo.com/company/Dosecann_Cannabis_Solutions
https://growjo.com/company/Vireo_Health
https://growjo.com/company/Dama_Financial
https://growjo.com/company/Caliber
https://growjo.com/company/springbig
https://growjo.com/company/Westleaf
https://growjo.com/company/INSA
https://growjo.com/company/Pure_Sunfarms
https://growjo.com/company/Sensi_Media_Group
https://growjo.com/company/Verano_Holdings
https://growjo.com/company/TILT_Holdings
https://growjo.com/company/Bloom_Medicinals
https://growjo.com/company/Planet_13_Holdings
https://growjo.com/company/Liberty_Health_Sciences
https://growjo.com/company/Calyx_Peak_Companies
https://growjo.com/company/Vangst
https://growjo.com/company/Fire_&_Flower
https://growjo.com/company/Revolution_Enterprises
https://growjo.com/company/4Front_Ventures
https://growjo.com/company/Calyx_Containers
https://growjo.com/company/GreenTech_Industries
https://growjo.com/company/BZAM_Cannabis
https://growjo.com/company/Cova_Software
None
https://growjo.com/company/Up_Cannabis
https://growjo.com/company/Cann_Group
https://growjo.com/company/Holistic_Industries
https://growjo.com/company/Treez
https://growjo.com/company/INDIVA
https://growjo.com/company/Kiva_Confections
https://growjo.com/company/MariMed
https://growjo.com/company/MCR_Labs
https://growjo.com/company/Vicente_Sederberg
https://growjo.com/company/Demetrix
https://growjo.com/company/365_Cannabis
https://growjo.com/company/LivWell_Enlightened_Health
https://growjo.com/company/High_Tide
https://growjo.com/company/The_Hawthorne_Gardening_Company
https://growjo.com/company/WYLD
https://growjo.com/company/VidaCann
https://growjo.com/company/Sira_Naturals
https://growjo.com/company/iAnthus
https://growjo.com/company/EastHORN_Clinical_Services
https://growjo.com/company/PharmaCielo
https://growjo.com/company/OCS_Ontario_Cannabis_Store
https://growjo.com/company/Hugh_Wood_Canada
https://growjo.com/company/Wana_Brands
https://growjo.com/company/Parallel
https://growjo.com/company/Weedmaps
None
https://growjo.com/company/Dark_Heart_Nursery
https://growjo.com/company/Stealth_Monitoring
https://growjo.com/company/dicentra
https://growjo.com/company/Sunday_Goods_&_The_Pharm
https://growjo.com/company/Phase_Zero_Design
https://growjo.com/company/Sava
https://growjo.com/company/Ceylon_Solutions
https://growjo.com/company/Green_Flower
https://growjo.com/company/Shryne_Group
https://growjo.com/company/MJ_Freeway
https://growjo.com/company/Theory_Wellness
https://growjo.com/company/HEXO_Corp
https://growjo.com/company/Lightshade
https://growjo.com/company/New_Frontier_Data
https://growjo.com/company/Mission_Dispensaries
https://growjo.com/company/FLUENT_Cannabis_Care
https://growjo.com/company/Superette
https://growjo.com/company/HdL_Companies
https://growjo.com/company/Helix_Technologies
https://growjo.com/company/Mary's_Medicinals
https://growjo.com/company/Indus_Holdings
https://growjo.com/company/Auxly
https://growjo.com/company/Good_Chemistry
https://growjo.com/company/Khiron_Life_Sciences_Corp
https://growjo.com/company/The_Apothecarium
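The time.sleep(2) is a fixed pause and can be flaky on slow page loads. A sketch of the same step using Selenium's explicit-wait API instead, assuming the same table.jss31 markup as above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.get(url)
# block (up to 10 seconds) until at least one table row is present, then parse
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'table.jss31 tbody tr'))
)
soup = BeautifulSoup(driver.page_source, "html.parser")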

How to get data from IMDb using BeautifulSoup

I'm trying to get the names of the top 250 IMDb movies using BeautifulSoup. The code runs without errors but prints nothing.
import requests
from bs4 import BeautifulSoup
url = "https://www.imdb.com/chart/top"
response = requests.get(url)
rc = response.content
soup = BeautifulSoup(rc,"html.parser")
for i in soup.find_all("td", {"class:": "titleColumn"}):
    print(i)
I'm expecting it to show me all of the td tags with the titleColumn class, but it is not working. Am I missing something? Thanks in advance!
Remove the : after the class:
{"class:":"titleColumn"}
to
{"class":"titleColumn"}
An extended example:
import requests
from bs4 import BeautifulSoup

url = "https://www.imdb.com/chart/top"
response = requests.get(url)
rc = response.content
soup = BeautifulSoup(rc, "html.parser")

data = []
for i in soup.find_all("td", {"class": "titleColumn"}):
    data.append({
        'people': i.a['title'],
        'title': i.a.get_text(),
        'info': i.span.get_text()
    })
print(data)
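If find_all still comes back empty, IMDb sometimes serves different markup or rejects requests carrying the default requests User-Agent; sending a browser-like header is a common workaround. A hedged sketch:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # assumption: IMDb accepts this as a browser UA
response = requests.get("https://www.imdb.com/chart/top", headers=headers)
soup = BeautifulSoup(response.content, "html.parser")
print(len(soup.find_all("td", {"class": "titleColumn"})))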

BeautifulSoup code Error

I cannot seem to find the problem in this code.
Help will be appreciated.
import requests
from bs4 import BeautifulSoup
url = 'http://nytimes.com'
r = requests.get(url)
r_html = r.text
soup = BeautifulSoup(r_html)
title = soup.find('span','articletitle').string
The problem is that http://nytimes.com does not have any articletitle span. To be safe, check if soup.find('span', 'articletitle') is not None: before accessing it. Also, you don't need to access the string property here. For example, the following works fine:
import requests
from bs4 import BeautifulSoup

url = 'http://nytimes.com'
r = requests.get(url)
r_html = r.text
soup = BeautifulSoup(r_html, 'html.parser')
if soup.find('div', 'suggestions') is not None:
    title = soup.find('div', 'suggestions')
    print(title)
Put your code inside a try/except block and print the exception that occurs; the exception message will point you at the actual problem.
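A minimal sketch of that suggestion, using the same NYTimes scrape as above:

import requests
from bs4 import BeautifulSoup

try:
    r = requests.get('http://nytimes.com')
    soup = BeautifulSoup(r.text, 'html.parser')
    title = soup.find('span', 'articletitle').string
except Exception as e:
    # when the span is missing, this prints:
    # 'NoneType' object has no attribute 'string'
    print(e)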
Hi, pass a parser as the second argument to the BeautifulSoup constructor, e.g.:
page_content = BeautifulSoup(r.content, "html.parser")

Find productID with beautifulsoup

I'm trying to extract a URL from this page using BeautifulSoup by searching for p0662110597086 (its id). I've tried several different approaches with BeautifulSoup, including a different HTML parser, but none of them have been successful.
video <ul>
<li>identity:<span itemprop="productID">p0662110597086</span></li>
<li>soll numbers:75</li>
<li>solds:97</li>
</ul>
import bs4

# the page's <a class="movie"> link precedes the list; it was flattened
# to the plain text "video" in the question's paste
html = '''<a class="movie" hpp="act_video" href="#media">video</a> <ul>
<li>identity:<span itemprop="productID">p0662110597086</span></li>
<li>soll numbers:75</li>
<li>solds:97</li>
</ul>'''
soup = bs4.BeautifulSoup(html, 'lxml')
id_tag = soup.find('span', string='p0662110597086')
a_tag = id_tag.find_previous('a', class_='movie')

Output:
id_tag: <span itemprop="productID">p0662110597086</span>
a_tag: <a class="movie" hpp="act_video" href="#media">video</a>
Signature: find_all_previous(name, attrs, string, limit, **kwargs)
Signature: find_previous(name, attrs, string, **kwargs)
These methods use .previous_elements to iterate over the tags and
strings that came before it in the document. The find_all_previous()
method returns all matches, and find_previous() only returns the first
match.
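As a quick illustration of the difference, a minimal sketch using markup shaped like the answer above:

import bs4

html = '''<a class="movie" href="#media">video</a>
<a class="other" href="#x">other</a>
<span itemprop="productID">p0662110597086</span>'''
soup = bs4.BeautifulSoup(html, 'lxml')
span = soup.find('span', string='p0662110597086')
print(span.find_previous('a'))      # only the nearest earlier <a> (class="other")
print(span.find_all_previous('a'))  # every earlier <a>, nearest first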