I am trying to extract some specific text using BeautifulSoup but can't figure out how.
All I need is the number in the "THIS TEXT" block (56789), not the ones in the "SOME TEXT" blocks.
Can someone point out what's wrong with my code?
from bs4 import BeautifulSoup
def foo():
    """Print each block's ``<p>`` classes and return the "this-text" numbers.

    Every ``div.data_content`` block in the sample markup is inspected. The
    class names of its ``<p>`` are printed (with a ``<- here`` marker on the
    matching one), and the text of every ``<p>`` carrying both the ``large``
    and ``this-text`` classes is collected.

    Returns:
        list[str]: the stripped text of each matching ``<p>``; ``["56789"]``
        for the sample markup.
    """
    response = """
<div class="data_content_blog">
<div class="data_content">
<h5 class="large"> SOME TEXT </h5>
<p class="large some-text">12345</p>
</div>
</div>
<div class="data_content_blog">
<div class="data_content">
<h5 class="large"> SOME TEXT </h5>
<p class="large some-text">34567</p>
</div>
</div>
<div class="data_content_blog">
<div class="data_content">
<h5 class="large"> THIS TEXT </h5>
<p class="large this-text">56789</p>
</div>
</div>
"""
    soup = BeautifulSoup(response, features="html.parser")
    numbers = []
    for block in soup.find_all("div", {"class": "data_content"}):
        paragraph = block.find("p")
        if paragraph is None:
            # A block without a <p> has nothing to report.
            continue
        # Membership tests instead of positional indexing: a <p> with fewer
        # than two classes (or classes in another order) no longer breaks.
        classes = paragraph.get("class", [])
        if "large" in classes and "this-text" in classes:
            print(*classes, "<- here")
            numbers.append(paragraph.get_text(strip=True))
        else:
            print(*classes)
    return numbers
If class "this-text" is unique, you can select it and then .find_previous() tag:
num = soup.select_one(".this-text")  # or soup.find(class_="this-text")
if num is not None:
    # Ask for the heading explicitly so an unrelated tag inserted between the
    # <h5> and the <p> cannot be picked up by accident.
    h5 = num.find_previous("h5")
    # strip=True drops the padding around " THIS TEXT " so the output matches
    # the expected "THIS TEXT 56789".
    print(h5.get_text(strip=True), num.get_text(strip=True))
Prints:
THIS TEXT 56789
Related
Hello everyone, I have pulled the information I want using BeautifulSoup, but I can't seem to get it printed out correctly to send to pandas and Excel.
html_f ='''
<li class="list-group-item">
<div>
<div class="tyler-toggle-controller open">
<p class="text-primary">
07/01/2022 Date
<span class="caret"> </span>
</p>
</div>
<div class="tyler-toggle-container row-buff" style="display: block; overflow: hidden;">
<p class="col-sm-12 col-md-12">
<span class="text-muted">Comment</span><br>
[1] Comments
</p>
</div>
</div>
</li>'''
My code used to pull the data I want:
# Parse the snippet and walk the children of the first matching <li>,
# printing the text of each child node as-is (whitespace included).
soup = BeautifulSoup(html_f, 'html.parser')
first_item = soup.findAll('li', class_='list-group-item')[0]
for node in first_item:
    print(node.text)
Here is the info it pulls, but it prints it out weirdly, with tons of spacing:
07/01/2022 Date
Comment
[1] Comments
Ideally, I only need the top portion of (date and File Date) printed out but at the very least I need help getting it into a list format like:
07/01/2022 Date
Comment
[1] Comments
To get your information printed as expected in your question, you could use stripped_strings and iterate over its elements:
# Print every non-empty, whitespace-stripped text fragment of each list item.
for e in soup.find_all('li', class_='list-group-item'):
    # stripped_strings is already iterable; no need to copy it into a list.
    for t in e.stripped_strings:
        print(t)
Note: In new code use find_all() instead of old syntax findAll().
Example
html='''
<li class="list-group-item">
<div>
<div class="tyler-toggle-controller open">
<p class="text-primary">
07/01/2022 Date
<span class="caret">
</span>
</p>
</div>
<div class="tyler-toggle-container row-buff" style="display: block; overflow: hidden;">
<p class="col-sm-12 col-md-12">
<span class="text-muted">
Comment
</span>
<br/>
[1] Comments
</p>
</div>
</div>
</li>
'''
from bs4 import BeautifulSoup
# Name the parser explicitly: without it BeautifulSoup guesses one, emits a
# warning, and may produce different trees on different machines.
soup = BeautifulSoup(html, 'html.parser')
for e in soup.find_all('li', class_='list-group-item'):
    # stripped_strings is already iterable; no need to copy it into a list.
    for t in e.stripped_strings:
        print(t)
Output
07/01/2022 Date
Comment
[1] Comments
Since you mention pandas, you could also pick out each piece of information, clean it up, and append it to a list of dicts:
# One dict per list item: the date (without its " Date" suffix) and the text
# node that follows the <br> inside the toggle container.
data = [
    {
        'date': item.p.text.strip().replace(' Date',''),
        'comment': item.select_one('.tyler-toggle-container br').next_sibling.strip(),
    }
    for item in soup.find_all('li',class_='list-group-item')
]
pd.DataFrame(data)
or
# Single-row variant: look up the two nodes first, then clean their text.
date_node = soup.select_one('li.list-group-item .text-primary')
br_node = soup.select_one('li.list-group-item .tyler-toggle-container br')
data = [{
    'date': date_node.text.strip().replace(' Date',''),
    'comment': br_node.next_sibling.strip(),
}]
Output
date
comment
07/01/2022
[1] Comments
So far so good; here is my attempt:
doc='''
<li class="list-group-item">
<div>
<div class="tyler-toggle-controller open">
<p class="text-primary">
07/01/2022 Date
<span class="caret">
</span>
</p>
</div>
<div class="tyler-toggle-container row-buff" style="display: block; overflow: hidden;">
<p class="col-sm-12 col-md-12">
<span class="text-muted">
Comment
</span>
<br/>
[1] Comments
</p>
</div>
</div>
</li>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(doc, 'html.parser')
# get_text(strip=True) glues the stripped fragments together, producing
# "07/01/2022 DateComment[1] Comments"; the replace() collapses the middle
# part into a comma. (The former ' '.join(x.split(' ')) round trip was a
# no-op and has been removed.)
text = [
    child.get_text(strip=True).replace(' DateComment[1]', ',')
    for child in soup.find_all('li', class_='list-group-item')
]
print(text)
Output:
['07/01/2022, Comments']
Try one of these approaches; they should work:
# Variant 1: build one cleaned string per <li>, then join them into a single
# space-separated string.
text=' '.join([' '.join(child.get_text(strip=True).split(' ')).replace(' DateComment[1]',',') for child in soup.find_all('li',class_='list-group-item')]).strip()
# Variant 2: keep the cleaned strings as a list, one entry per <li>.
text= [' '.join(child.get_text(strip=True).split(' ')).replace(' DateComment[1]',',') for child in soup.find_all('li',class_='list-group-item')]
# NOTE(review): text[1] and text[2] need at least three <li> matches; with a
# single-<li> document like the sample above this raises IndexError - confirm
# the intended indices.
final_text= text[1]+ ',' +text[2]
# NOTE(review): text[1] is a str and text[2].split() is a list, so this "+"
# raises TypeError; presumably a list result was intended - confirm.
final_text= text[1]+text[2].split()  # meant to produce a list
On the Website View,
How to get the value of "nbr1" field and copy it automatically in "Direct copy of nbr1" field ?
Does someone have an example, please ?
template.xml
<template id="biography">
<t t-call="website.layout">
<t t-set="title">bio</t>
<div class="oe_structure">
<div class="container">
<h3 t-field="person.name"/>
<p>Last modified: <i t-field="person.write_date"/></p>
<p>nbr1 : <input type="number" name="nbr1"/></p>
<p>Direct copy of nbr1 : <h3 t-field="person.result"/></p>
</div>
</div>
<div class="oe_structure"/>
</t>
</template>
model.py
name = fields.Char()
nbr1 = fields.Integer()
result = fields.Integer(compute="_compute_total")
@api.depends('nbr1')
def _compute_total(self):
    """Compute ``result`` as a copy of ``nbr1`` for every record in ``self``.

    ``result`` is declared with ``compute="_compute_total"``, so the method
    declares its dependency with ``@api.depends`` (the decorator had been
    reduced to a ``#`` comment).
    """
    for record in self:
        # Read from the loop record, not from ``self``: ``self`` may hold
        # several records, and ``self.nbr1`` would then fail.
        record.result = record.nbr1
Give nbr1 in onchange function instead of biography.
name = fields.Char()
nbr1 = fields.Integer()
result = fields.Integer()
@api.onchange('nbr1')
def _compute_total(self):
    """Mirror ``nbr1`` into ``result`` whenever ``nbr1`` changes."""
    for record in self:
        # Assign unconditionally: guarding with ``if record.nbr1`` left a
        # stale ``result`` behind when ``nbr1`` was changed back to 0.
        record.result = record.nbr1
I have a page with list of products on it.
This is how HTML DOM looks like for one product item:
<div class="module card listing-search-card js-product-card " id="product-entry-123" data-product-id="123" data-toggle-status="open" data-out-of-stock="" data-toggle-isbundle="false" data-load-prices-async="false">
<div class="product-entry__wrapper">
<div class="card__header">
<div class="promotion">
<div class="product-entry__right promotion-card__body on-promotion--banner-offer">
</div>
<a href="/Products/p/123" tabindex="-1">
<picture>
<img class="card__image mobile-img lazyload" src="/medias/image-mobile">
<img class="card__image desktop-img lazyloaded" src="/medias/image-desktop">
</picture>
</a>
</div>
</div>
<div class="product-entry__body-actions-wrapper">
<div class="product-entry__body card__body">
<h3 class="card__title">
Schweppes
</h3>
<div class="product-entry__summary card__description-wrapper">
<div class="product-entry__summary__list">
<div class="card__detail-wrapper">
<div class="product-entry__summary__item card__description-product-detail">
33 x 24</div>
<div class="product-entry__summary__item card__description-product-code">
<span class="product-entry__code">
123</span>
</div>
</div>
<div class="container-type">
box</div>
</div>
</div>
</div>
<div class="cta-container">
<div class="card__amount-wrapper ">
<div class="card__amount">
61,83 € <span class="base-unit">HT/CHACUN</span>
<p class="sales-unit-price is-price">
<span>soit</span> 10,00 €
</span></span></p>
</div>
</div>
<div class="add-to-cart__footer add-to-cart__action">
<div class="success-overlay">Add to cart</div>
<div class="add-to-cart__action--active">
<div class="form-quantity__wrapper quantity-action quantity-action__wrapper"
data-form-quantity-id="123">
<div class="form-quantity ">
<button class="form-quantity__decrease quantity-action__decr icon-Minus disabled" type="button"
tabindex="-1" aria-label="decrement" data-form-quantity-decrement="">
</button>
<input id="product-123" class="form-quantity__input form-control quantity-action__value js-
quantity-input-typing" name="product-123" type="text" value="1" maxlength="4" data-price-
single="10.00" data-price-currency="€" data-parsley-range="[1,9999]" data-form-quantity-times="1"
data-parsley-multiplerange="1" data-parsley-type="integer" data-parsley-validation-threshold="1"
required="">
<button class="form-quantity__increase quantity-action__incr icon-Add-to-list" type="button"
tabindex="-1" aria-label="increment" data-form-quantity-increment="">
</button>
</div>
<span class="form-quantity__update" data-form-quantity-success=""></span>
</div>
<div class="add-to-cart__total">
<button class="button button--primary js-addToCart" role="button" title="Add
to cart" data-product-id-ref="123" data-modal-trigger="" data-modal-target="#add-to-cart-modal" data-
modal-before-trigger="addToCart" data-component-id="product list" tabindex="-1">
<div class="button__text">
<span class="button__text-add js-added-price">Add</span>
<span class="button__text-to-cart js-added-price">to cart</span>
</div>
<span class="button__text js-added-price mobile-only">Add</span>
</button>
</div>
</div>
</div>
<div class="add-to-template">
<button class="add-to-template--button button js-addToNewTemplate" type="button" data-modal-
trigger="" data-modal-target="#add-to-template-modal" data-modal-before-
trigger="openAddToTemplateModal" data-product-code="123">
<span>Add to list</span>
</button>
</div>
</div>
</div>
</div>
I am calling this function:
isSortedAlphabeticallyAscending($$('div.js-product-card'));
And the function implementation is:
/**
 * Check that adjacent product cards are in ascending order of link text.
 *
 * For each pair of neighbours the function reads the `data-out-of-stock`
 * and `id` attributes, logs them, looks up the text of the anchor whose
 * href contains the product id, and compares the two texts.
 *
 * @param list product card elements (each must support getAttribute and $)
 * @returns false as soon as an in-stock pair is out of order, true otherwise
 */
isSortedAlphabeticallyAscending(list) {
    // Stop one short of the end: every iteration also inspects list[i + 1].
    for (let i = 0; i < (list.length - 1); i++) {
        let outOfStockCurrent = list[i].getAttribute('data-out-of-stock');
        let outOfStockNext = list[i + 1].getAttribute('data-out-of-stock');
        let idCurrent = list[i].getAttribute('id');
        let idNext = list[i + 1].getAttribute('id');
        console.log("outOfStockCurrent " + outOfStockCurrent + " " + idCurrent);
        console.log("outOfStockNext " + outOfStockNext + " " + idNext);
        // The product id is the last '-'-separated part of the element id,
        // e.g. "product-entry-123" -> "123".
        let productIdCurrent = idCurrent.split('-').pop();
        let productIdNext = idNext.split('-').pop();
        // Text of the anchor, inside each card, whose href contains the id.
        let currentText = list[i].$('a[href*="' + productIdCurrent + '"]').getText();
        let nextText = list[i+1].$('a[href*="'+ productIdNext + '"]').getText();
        console.log("currentText " + currentText);
        console.log("nextText " + nextText);
        // Pairs involving an out-of-stock product are not compared.
        if(outOfStockCurrent === "true" || outOfStockNext === "true") continue;
        if (currentText > nextText) return false;
    }
    return true;
}
I ignore out of stock products since they are always at the bottom of the page.
But the list[i].$('a[href*="' + productIdCurrent + '"]').getText() is always returning empty text.
I would like it to get "Schweppes" text, i.e. product name.
Is there a way to chain somehow differently part with .$a[href ...] to get the text from the <a> tag inside the <div> element of the list of products using webdriverio 5?
Thanks!
The above selector list[i].$('a[href*="' + productIdCurrent + '"]').getText() matched 2 elements.
What I needed was to go one div further and find it there:
list[i].$('div.product-entry__body-actions-wrapper').$('a[href*="' + productIdCurrent + '"]').getText()
And voila, text appeared :)
Hope it will help someone with the similar issue :D
How do I find all elements of a class using BeautifulSoup when each one has a random offerid? I'm trying the syntax below, but it doesn't return anything:
# The class name must be a complete quoted string, and it must not carry the
# trailing space from the markup: BeautifulSoup splits class attributes on
# whitespace, so "xyz " would never match.
containers = page_soup.find_all("div", {"class": "xyz"})
class example is as below
<div class=abc">
<div class=bcd">
<div class="xyz " offerid="65546">
<div class="xyz " offerid="46465">
<div class="xyz " offerid="56747">
</div>
</div>
Use CSS Selector to get the items with class name.Try following example.
data = '''
<div class="xyz " offerid="65546">Test 1</div>
<div class="xyz " offerid="46465">Test 2</div>
<div class="xyz " offerid="56747">Test 3</div>
'''
# Select every <div> that has the class "xyz" and print its text.
soup = BeautifulSoup(data, 'html.parser')
containers = soup.select("div.xyz")
for div in containers:
    print(div.get_text())
OR
data = '''
<div class="xyz " offerid="65546">Test 1</div>
<div class="xyz " offerid="46465">Test 2</div>
<div class="xyz " offerid="56747">Test 3</div>
'''
# Same idea without restricting the tag name: any element with class "xyz".
soup = BeautifulSoup(data, 'html.parser')
containers = soup.select(".xyz")
for element in containers:
    print(element.get_text())
EDITED:
data = '''
<div class=abc">
<div class=bcd">
<div class="xyz " offerid="65546">
<div class="xyz " offerid="46465">
<div class="xyz " offerid="56747">
</div>
</div>
'''
# Keep only the "xyz" divs that carry an offerid attribute and print its value.
soup = BeautifulSoup(data, 'html.parser')
containers = soup.select("div.xyz[offerid]")
for offer in containers:
    offer_id = offer['offerid']
    print(offer_id)
I am not 100% clear on your question, but the ID shouldn't matter if you are picking up the div via the class name. The sample code below works.
html = '''
<div class="xyz " offerid="65546"> sample text </div>
<div class="xyz " offerid="46465"> sample text </div>
<div class="xyz " offerid="56747"> sample text </div>
'''
soup = BeautifulSoup(html, 'html.parser')
# Match on "xyz" without the trailing space: BeautifulSoup splits the class
# attribute on whitespace, so the parsed class is "xyz" and a search for
# "xyz " returns an empty list.
containers = soup.find_all("div", {"class": "xyz"})
Perhaps you want an attribute selector
data = [item['offerid'] for item in soup.select('[offerid]')]
or
data = [item.text for item in soup.select('[offerid]')]
Hello everyone, I would like to know the number of elements in markup like the following:
<div id="datatable">
<form id="theForm" name="theForm" >......</form>
<div class="no_data_dd" id="no_data" >....
<div class= ....>.....
<div class= ....>
</div>
</div>
</div>
<div class="score_row score_header">.../div>
<div class="score_row match_line e_true" >..</div>
<div class="score_row padded_date ">..</div>
<div class= ....>.....</div>
</div>
I tried with
' NOTE(review): the markup shown above uses id="datatable", not "scoretable" - confirm which id the page really has.
Set itemEle = objIE.document.getElementById("scoretable")
' NOTE(review): getElementsByTagName expects a tag name such as "div"; "class" is an attribute name,
' which presumably explains the reported count of 0 - confirm.
Length = itemEle.getElementsByTagName("class").Length
The result is Length = 0, not 5.
Why?
What if you use one of the "getElements…" methods (plural) instead of "getElement…"? That might be the reason.