Trying to scrape text from divs with the same class and no other info - scrapy

This HTML has 3 divs with the same class, accounts-table__count, but different types of information. I'm trying to get the post count and follower count of this page. Is there a way to take the texts using a CSS selector?
Site: https://mastodon.online/explore
<div class='directory__card__extra'>
    <div class='accounts-table__count'>
        629
        <small>posts</small>
    </div>
    <div class='accounts-table__count'>
        72
        <small>followers</small>
    </div>
    <div class='accounts-table__count'>
        <time class='time-ago' datetime='2021-05-18' title='May 18, 2021'>May 18, 2021</time>
        <small>last active</small>
    </div>
</div>
My code:
def parse(self, response):
    for users in response.css('div.directory__card'):
        yield {
            'id': users.css('span::text').get().replace('#','').replace('.','-'),
            'name': users.css('strong.p-name::text').get(),
            'posts': '',  # this is the post count
            'followers': '',  # this is the follower count
            'description': users.css('p::text').get(),
            'fediverse': users.css('span::text').get(),
            'link': users.css('a.directory__card__bar__name').attrib['href'],
            'image': users.css('img.u-photo').attrib['src'],
            'bg-image': users.css('img').attrib['src'],
        }
    for nextpage in response.css('span.next'):
        next_page = nextpage.css('a').attrib['href']
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

As an example: iterate over each card, get its values as text, and filter out the empty ones:
raw_data = response.css(".directory__card")[0].css(".accounts-table__count::text").getall()
values = list(filter(lambda s: s != "", map(lambda s: s.strip(), raw_data)))
Some values from the CSS selector .accounts-table__count::text are empty, because some div elements with this class have no direct text, only other HTML elements inside them.
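Putting that together inside the spider's parse loop, a minimal sketch (assuming, as in the HTML above, that the posts counter is the first .accounts-table__count div and followers the second):

def parse(self, response):
    for users in response.css('div.directory__card'):
        # Collect the text of each counter div, dropping whitespace-only entries
        raw = users.css('.accounts-table__count::text').getall()
        counts = [s.strip() for s in raw if s.strip()]
        # Assumption: counters appear in source order posts, then followers
        yield {
            'posts': counts[0] if len(counts) > 0 else None,
            'followers': counts[1] if len(counts) > 1 else None,
        }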

Related

BeautifulSoup: how to extract <p> content of a parent tag

In a text file, each item has the same structure, so I would like to parse it with Beautiful Soup.
An extract:
data = """
<article id="1" title="Titre 1" sourcename="Le monde" about="Fillon|Macron">
<p type="title">Sub title1</p>
<p>xxxxxxxxxxxxxxxxxxxxxxxxx</p>
</article>
<article id="2" title="Titre 2" sourcename="La Croix" about="Le Pen|Mélanchon">
<p type="title">Sub title2</p>
<p>yyyyyyyyyyyyyyyyyyyyyyyyy</p>
</article>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, 'html.parser')
for text in soup.find_all('article'):
    print(text['id'])
    print(list(text.findChildren()))
    print(list(text.children))
I want to extract the "p" tag content: for each article, I would like to get a list of lists (to convert to a pandas DataFrame).
For example:
[
    [1, "Sub title1", "xxxxxxxxxxxxx"],
    [2, "Sub title2", "yyyyyyyyyyyyy"],
]
Thanks a lot.
Théo
You're almost there.
result = []  # create a variable to store your results
for article in soup.find_all("article"):
    article_id = article["id"]
    title = article.select("p[type=title]")[0]  # select the title tag
    title_text = title.text
    p = title.find_next("p").text  # get the adjacent p tag
    result.append([article_id, title_text, p])
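Since the stated goal was a pandas DataFrame, a short follow-up sketch (the column names here are my own choice, not from the question):

import pandas as pd

# Build a DataFrame from the list of lists collected above
df = pd.DataFrame(result, columns=["id", "title", "body"])
print(df)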

Can BeautifulSoup locate an element based on the text it contains? [duplicate]

Observe the following problem:
import re
from bs4 import BeautifulSoup as BS
soup = BS("""
<a href="/customer-menu/1/accounts/1/update">
Edit
</a>
""")
# This returns the <a> element
soup.find(
    'a',
    href="/customer-menu/1/accounts/1/update",
    text=re.compile(".*Edit.*")
)
soup = BS("""
<a href="/customer-menu/1/accounts/1/update">
<i class="fa fa-edit"></i> Edit
</a>
""")
# This returns None
soup.find(
    'a',
    href="/customer-menu/1/accounts/1/update",
    text=re.compile(".*Edit.*")
)
For some reason, BeautifulSoup will not match the text when the <i> tag is there as well. Finding the tag and showing its text produces
>>> a2 = soup.find(
...     'a',
...     href="/customer-menu/1/accounts/1/update"
... )
>>> print(repr(a2.text))
'\n Edit\n'
Right. According to the Docs, soup uses the match function of the regular expression, not the search function. So I need to provide the DOTALL flag:
pattern = re.compile('.*Edit.*')
pattern.match('\n Edit\n') # Returns None
pattern = re.compile('.*Edit.*', flags=re.DOTALL)
pattern.match('\n Edit\n') # Returns MatchObject
Alright. Looks good. Let's try it with soup
soup = BS("""
<a href="/customer-menu/1/accounts/1/update">
<i class="fa fa-edit"></i> Edit
</a>
""")
soup.find(
    'a',
    href="/customer-menu/1/accounts/1/update",
    text=re.compile(".*Edit.*", flags=re.DOTALL)
)  # Still returns None... Why?!
Edit:
My solution, based on geckon's answer: I implemented these helpers:
import re

MATCH_ALL = r'.*'

def like(string):
    """
    Return a compiled regular expression that matches the given
    string with any prefix and postfix, e.g. if string = "hello",
    the returned regex matches r".*hello.*"
    """
    string_ = string
    if not isinstance(string_, str):
        string_ = str(string_)
    regex = MATCH_ALL + re.escape(string_) + MATCH_ALL
    return re.compile(regex, flags=re.DOTALL)

def find_by_text(soup, text, tag, **kwargs):
    """
    Find the tag in soup that matches all provided kwargs, and contains the
    text.

    If no match is found, return None.
    If more than one match is found, raise ValueError.
    """
    elements = soup.find_all(tag, **kwargs)
    matches = []
    for element in elements:
        if element.find(text=like(text)):
            matches.append(element)
    if len(matches) > 1:
        raise ValueError("Too many matches:\n" + "\n".join(matches))
    elif len(matches) == 0:
        return None
    else:
        return matches[0]
Now, when I want to find the element above, I just run find_by_text(soup, 'Edit', 'a', href='/customer-menu/1/accounts/1/update')
The problem is that your <a> tag with the <i> tag inside doesn't have the string attribute you expect it to have. First let's take a look at what the text="" argument for find() does.
NOTE: The text argument is an old name; since BeautifulSoup 4.4.0 it's called string.
From the docs:
Although string is for finding strings, you can combine it with
arguments that find tags: Beautiful Soup will find all tags whose
.string matches your value for string. This code finds the tags
whose .string is “Elsie”:
soup.find_all("a", string="Elsie")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
Now let's take a look what Tag's string attribute is (from the docs again):
If a tag has only one child, and that child is a NavigableString, the
child is made available as .string:
title_tag.string
# u'The Dormouse's story'
(...)
If a tag contains more than one thing, then it’s not clear what
.string should refer to, so .string is defined to be None:
print(soup.html.string)
# None
This is exactly your case. Your <a> tag contains a text and <i> tag. Therefore, the find gets None when trying to search for a string and thus it can't match.
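To see this concretely, a quick sketch (using the soup defined above, with the <i> tag inside the link):

a_tag = soup.find('a', href="/customer-menu/1/accounts/1/update")
print(a_tag.string)      # None: the tag contains both an <i> tag and a string
print(a_tag.get_text())  # all nested text, including 'Edit', which is searchable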
How to solve this?
Maybe there is a better solution but I would probably go with something like this:
import re
from bs4 import BeautifulSoup as BS
soup = BS("""
<a href="/customer-menu/1/accounts/1/update">
<i class="fa fa-edit"></i> Edit
</a>
""")
links = soup.find_all('a', href="/customer-menu/1/accounts/1/update")
for link in links:
    if link.find(text=re.compile("Edit")):
        thelink = link
        break
print(thelink)
I think there are not too many links pointing to /customer-menu/1/accounts/1/update so it should be fast enough.
In one line, using a lambda:
soup.find(lambda tag:tag.name=="a" and "Edit" in tag.text)
You can pass a function that returns True if the text contains "Edit" to .find:
In [51]: def Edit_in_text(tag):
   ....:     return tag.name == 'a' and 'Edit' in tag.text
   ....:
In [52]: soup.find(Edit_in_text, href="/customer-menu/1/accounts/1/update")
Out[52]:
<a href="/customer-menu/1/accounts/1/update">
<i class="fa fa-edit"></i> Edit
</a>
EDIT:
You can use the .get_text() method instead of .text in your function, which gives the same result:
def Edit_in_text(tag):
    return tag.name == 'a' and 'Edit' in tag.get_text()
With soupsieve 2.1.0 you can use the :-soup-contains CSS pseudo-class selector to target a node's text. This replaces the deprecated form of :contains().
from bs4 import BeautifulSoup as BS
soup = BS("""
<a href="/customer-menu/1/accounts/1/update">
Edit
</a>
""")
single = soup.select_one('a:-soup-contains("Edit")').text.strip()
multiple = [i.text.strip() for i in soup.select('a:-soup-contains("Edit")')]
print(single, '\n', multiple)
Method 1: Checking the text property
pattern = 'Edit'
a2 = soup.find_all('a', string=pattern)[0]
Method 2: Using a lambda to iterate through all elements
a2 = soup.find(lambda tag:tag.name=="a" and "Edit" in tag.text)
Good Luck

BeautifulSoup Nested class selector

I am using BeautifulSoup for a project. Here is my HTML structure
<div class="container">
<div class="fruits">
<div class="apple">
<p>John</p>
<p>Sam</p>
<p>Bailey</p>
<p>Jack</p>
<ul>
<li>Sour</li>
<li>Sweet</li>
<li>Salty</li>
</ul>
<span>Fruits are good</span>
</div>
<div class="mango">
<p>Randy</p>
<p>James</p>
</div>
</div>
<div class="apple">
<p>Bill</p>
<p>Sean</p>
</div>
</div>
Now I want to grab the text in the div with class 'apple' which falls under class 'fruits'.
This is what I have tried so far:
for node in soup.find_all("div", class_="apple"):
It's returning:
Bill
Sean
But I want it to return only ...
John
Sam
Bailey
Jack
Sour
Sweet
Salty
Fruits are good
Please note that I DO NOT know the exact structure of elements inside div class="apple". There can be any type of different HTML elements inside that class, so the selector has to be flexible enough.
Here is the full code, where I need to add this BeautifulSoup code ...
class MySpider(CrawlSpider):
    name = 'dknnews'
    start_urls = ['http://www.example.com/uat-area/scrapy/all-news-listing/_recache']
    allowed_domains = ['example.com']

    def parse(self, response):
        hxs = Selector(response)
        soup = BeautifulSoup(response.body, 'lxml')
        #soup = BeautifulSoup(content.decode('utf-8','ignore'))
        nf = NewsFields()
        ptype = soup.find_all(attrs={"name": "dknpagetype"})
        ptitle = soup.find_all(attrs={"name": "dknpagetitle"})
        pturl = soup.find_all(attrs={"name": "dknpageurl"})
        ptdate = soup.find_all(attrs={"name": "dknpagedate"})
        ptdesc = soup.find_all(attrs={"name": "dknpagedescription"})
        for node in soup.find_all("div", class_="apple"):  # THIS IS WHERE I NEED TO ADD THE BS CODE
            ptbody = ''.join(node.find_all(text=True))
            ptbody = ' '.join(ptbody.split())
            nf['pagetype'] = ptype[0]['content'].encode('ascii', 'ignore')
            nf['pagetitle'] = ptitle[0]['content'].encode('ascii', 'ignore')
            nf['pageurl'] = pturl[0]['content'].encode('ascii', 'ignore')
            nf['pagedate'] = ptdate[0]['content'].encode('ascii', 'ignore')
            nf['pagedescription'] = ptdesc[0]['content'].encode('ascii', 'ignore')
            nf['bodytext'] = ptbody.encode('ascii', 'ignore')
            yield nf
        for url in hxs.xpath('//ul[@class="scrapy"]/li/a/@href').extract():
            yield Request(url, callback=self.parse)
I am not sure how to use nested selectors with BeautifulSoup find_all ?
Any help is very appreciated.
Thanks
soup.select('.fruits .apple p')
Use a CSS selector; it makes expressing nested classes very easy.
soup.find(class_='fruits').find(class_="apple").find_all('p')
Or, you can use find() to get the p tags step by step.
EDIT:
[s for div in soup.select('.fruits .apple') for s in div.stripped_strings]
Use the stripped_strings generator to get all the strings under the div tag; stripped_strings will get rid of the \n in the results.
out:
['John', 'Sam', 'Bailey', 'Jack', 'Sour', 'Sweet', 'Salty', 'Fruits are good']
Full code:
from bs4 import BeautifulSoup
source_code = """<div class="container">
<div class="fruits">
<div class="apple">
<p>John</p>
<p>Sam</p>
<p>Bailey</p>
<p>Jack</p>
<ul>
<li>Sour</li>
<li>Sweet</li>
<li>Salty</li>
</ul>
<span>Fruits are good</span>
</div>
<div class="mango">
<p>Randy</p>
<p>James</p>
</div>
</div>
<div class="apple">
<p>Bill</p>
<p>Sean</p>
</div>
</div>
"""
soup = BeautifulSoup(source_code, 'lxml')
[s for div in soup.select('.fruits .apple') for s in div.stripped_strings]
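Back in the spider from the question, a hedged sketch of how the loop could use this (only the node selection and body extraction change; NewsFields and the meta-field handling stay as before):

# Only div.apple nodes nested under div.fruits are selected
for node in soup.select('.fruits .apple'):
    ptbody = ' '.join(node.stripped_strings)
    nf['bodytext'] = ptbody.encode('ascii', 'ignore')
    yield nf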

How to use scrapy to crawl multiple pages? (two level)

On my site I created two simple pages.
Here is their initial HTML:
test1.html:
<head>
    <title>test1</title>
</head>
<body>
<a href="test2.html" onclick="javascript:return xt_click(this, 'C', '1', 'Product', 'N');" indepth="true">
<span>cool</span></a>
</body></html>
test2.html:
<head>
    <title>test2</title>
</head>
<body></body></html>
I want to scrape the text in the title tag of the two pages, here "test1" and "test2", but I am a novice with Scrapy and have only managed to scrape the first page.
My scrapy script:
from scrapy.spider import Spider
from scrapy.selector import Selector
from testscrapy1.items import Website

class DmozSpider(Spider):
    name = "bill"
    allowed_domains = ["http://exemple.com"]
    start_urls = [
        "http://www.exemple.com/test1.html"
    ]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//head')
        items = []
        for site in sites:
            item = Website()
            item['title'] = site.xpath('//title/text()').extract()
            items.append(item)
        return items
How do I get past the onclick, and how do I successfully scrape the text of the title tag of the second page?
Thank you in advance
STEF
To use multiple functions in your code, sending multiple requests and parsing them, you're going to need: 1) yield instead of return, and 2) a callback.
Example:
def parse(self, response):
    for site in response.xpath('//head'):
        item = Website()
        item['title'] = site.xpath('//title/text()').extract()
        yield item
    yield scrapy.Request(url="http://www.domain.com", callback=self.other_function)

def other_function(self, response):
    for other_thing in response.xpath('//this_xpath'):
        item = Website()
        item['title'] = other_thing.xpath('//this/and/that').extract()
        yield item
You cannot parse javascript with scrapy, but you can understand what the javascript does and do the same: http://doc.scrapy.org/en/latest/topics/firebug.html
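Applied to the test1/test2 case, a minimal sketch of a two-level spider (assuming a recent Scrapy; the onclick handler can simply be ignored, since the plain href already points to test2.html):

import scrapy

class TitleSpider(scrapy.Spider):
    name = "titles"
    start_urls = ["http://www.exemple.com/test1.html"]

    def parse(self, response):
        # Yield the title of the current page (test1 first, then test2)
        yield {'title': response.xpath('//title/text()').get()}
        # Follow every link on the page; response.follow resolves relative URLs
        for href in response.xpath('//a/@href').getall():
            yield response.follow(href, callback=self.parse)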

How to display items in a sorted order?

I have this in my template file:
{% get_latest_show as slideshow %}
{% for slide in slideshow.slide_set.all %}
    <img src="{% thumbnail slide.image 1174x640 upscale %}" alt="{{ slide.title }}" width="1174"/>
{% endfor %}
models.py
from django.db import models
import datetime

class Slide(models.Model):
    title = models.CharField(max_length=50)
    description = models.TextField(blank=True, null=True)
    target_url = models.TextField(blank=True, null=True)
    slideshow = models.ForeignKey('Slideshow')
    image = models.ImageField(upload_to='slideshow', max_length=500, blank=True, null=True)

    def __unicode__(self):
        return self.title

class Slideshow(models.Model):
    title = models.CharField(max_length=50)
    pub_date = models.DateTimeField(auto_now=True)
    published = models.BooleanField(default=False)

    class Meta:
        ordering = ['-title']

    def __unicode__(self):
        return self.title
slide_tags.py
from django import template
from django.core.cache import cache
from django.contrib.contenttypes.models import ContentType
from slides.models import Slide, Slideshow

register = template.Library()

class GetSlideshowNode(template.Node):
    """
    Retrieves the latest published slideshow
    """
    def __init__(self, varname):
        self.varname = varname

    def render(self, context):
        try:
            show = Slideshow.objects.filter(published=True)[0]
        except:
            show = []
        context[self.varname] = show
        return ''

def get_latest_show(parser, token):
    """
    Retrieves the latest published slideshow

    {% get_latest_show as show %}
    """
    args = token.split_contents()
    argc = len(args)
    try:
        assert (argc == 3 and args[1] == 'as')
    except AssertionError:
        raise template.TemplateSyntaxError('get_latest_show syntax: {% get_latest_show as varname %}')
    varname = None
    t, a, varname = args
    return GetSlideshowNode(varname=varname)

register.tag(get_latest_show)
The problem is that my slides are being displayed out of order. When I print slideshow.slide_set.all on the page, I see:
[<Slide: Slide 2>, <Slide: Slide 3>, <Slide: Slide 4>, <Slide: Slide 1>]
How do I get the slides to appear in order?
You want the slide_set to be ordered, therefore the ordering option should be on the Slide model.
class Slide(models.Model):
    # fields

    class Meta:
        ordering = ['title']
This will cause Slide.objects.all() to return a queryset ordered by the title field, and slideshow.slide_set.all() uses that same default ordering.
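If you'd rather not change the model's default ordering, a sketch of ordering at query time instead (standard queryset API; the variable names here are my own, and since Django templates can't call order_by, this belongs in the template tag or view):

# e.g. in GetSlideshowNode.render, pass ordered slides into the context
slides = show.slide_set.order_by('title')
context['slides'] = slides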