I'm trying to download and parse text from some RSS feeds, such as http://rss.sciencedirect.com/publication/science/03043878. Here's a simple example:
import urllib.request
import urllib.parse
import requests
from bs4 import BeautifulSoup
def main():
    soup = BeautifulSoup(urllib.request.urlopen('http://rss.sciencedirect.com/publication/science/03043878'), "html.parser").encode("ascii")
    print(soup)

if __name__ == '__main__':
    main()
In the original feed (if you view the page source directly), links are wrapped in <link> and </link>. But what BeautifulSoup prints out replaces <link> with <link/> and drops </link> completely. Any ideas what I could be doing wrong, or is this a bug?
PS Tried changing encoding to utf-8 and it still happens.
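Here's a minimal reproduction of the behaviour without touching the network:

from bs4 import BeautifulSoup

markup = "<item><link>http://example.com/a</link></item>"

# html.parser treats <link> as a void HTML element, so the closing tag is dropped
# and the URL ends up as a sibling text node, roughly:
# <item><link/>http://example.com/a</item>
print(BeautifulSoup(markup, "html.parser"))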
You are parsing RSS. RSS is XML. So pass features="xml" into the BeautifulSoup constructor.
import urllib.request
from bs4 import BeautifulSoup
def main():
    doc = BeautifulSoup(urllib.request.urlopen('http://rss.sciencedirect.com/publication/science/03043878'), "xml")

    # If you want to print it as ascii (as per your original post).
    print(doc.prettify('ascii'))

    # To write it to a file as ascii (as per your original post).
    with open("ascii.txt", "wb") as file:
        file.write(doc.prettify('ascii'))

    # To write it to a file as utf-8 (as the original RSS).
    with open("utf-8.txt", "wb") as file:
        file.write(doc.prettify('utf-8'))

    # If you want to print the links.
    for item in doc.findAll('link'):
        print(item)

if __name__ == '__main__':
    main()
Outputs in both files and terminal:
... <link>
http://rss.sciencedirect.com/action/redirectFile?&zone=main&currentActivity=feed&usageType=outward&url=http%3A%2F%2Fwww.sciencedirect.com%2Fscience%3F_ob%3DGatewayURL%26_origin%3DIRSSSEARCH%26_method%3DcitationSearch%26_piikey%3DS0304387817300512%26_version%3D1%26md5%3D16ed8e2672e8048590d3c41993306b0f
</link> ...
The HTML parser can't handle these links correctly because <link> is a void element in HTML. For this you should use xml as your parser instead of html.parser.
soup = BeautifulSoup(urllib.request.urlopen('http://rss.sciencedirect.com/publication/science/03043878'),"xml")
print(len(soup.find_all("link")))
Outputs 52 links.
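If you want the URLs themselves rather than the tags, read each element's text, for example:

for link in soup.find_all("link"):
    print(link.get_text(strip=True))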
I hope you are well.
I'm trying to write the following to a pdf using the fpdf2 library:
GHI (kWh $m^-2 day^{-1}$)
But I'm not getting the expected result: the text is written to the PDF exactly as it appears in the code, that is: GHI (kWh $m^-2 day^{-1}$)
Here is a basic example:
from fpdf import FPDF
pdf = FPDF()
pdf.add_page()
pdf.set_font("helvetica", "B", 16)
pdf.cell(40, 10, r"GHI (kWh $m^-2 day^{-1}$)")
pdf.output("test.pdf")
Could someone let me know how I can write this text in the pdf so that it is rendered correctly?
Thank you very much in advance,
Robson
fpdf2 now supports subscript & superscript, as well as <sub> & <sup> HTML tags: https://github.com/PyFPDF/fpdf2/blob/master/CHANGELOG.md#added-1
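For example, a minimal sketch using write_html() with <sup> tags (this assumes a recent fpdf2 release where FPDF.write_html() is available directly and renders superscripts; older versions expose it through HTMLMixin):

from fpdf import FPDF

pdf = FPDF()
pdf.add_page()
pdf.set_font("helvetica", size=16)
# <sup> renders the exponents as superscripts; adjust the markup to taste
pdf.write_html("<b>GHI (kWh m<sup>-2</sup> day<sup>-1</sup>)</b>")
pdf.output("ghi-units.pdf")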
You can also render mathematical formulas using Google Charts API or Matplotlib: https://pyfpdf.github.io/fpdf2/Maths.html#mathematical-formulas
from io import BytesIO
from urllib.parse import quote
from urllib.request import urlopen
from fpdf import FPDF
formula = r"GHI (kWh $m^-2 day^{-1}$)"
height = 170
url = f"https://chart.googleapis.com/chart?cht=tx&chs={height}&chl={quote(formula)}"
with urlopen(url) as img_file:
    img = BytesIO(img_file.read())
pdf = FPDF()
pdf.add_page()
pdf.image(img, w=30)
pdf.output("equation-with-gcharts.pdf")
I am trying to get specific data from a website that is under a class that is used multiple times. So my thought was to search for the next biggest class and then use bs4 again to narrow my search results further. However, I get this error:
AttributeError: ResultSet object has no attribute 'find_all'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
This is my code:
import requests
from bs4 import BeautifulSoup
def main():
    responce()

def responce():
    r = requests.get('https://robinhood.com/stocks/WISH')
    soup = BeautifulSoup(r.content, 'html.parser')
    responce = soup.find_all(class_="css-ktio0g")
    responce = responce.find_all(class_="css-6e9xj2")
    print(responce)

main()
import requests
from bs4 import BeautifulSoup
from pprint import pp
def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    goal = [x.text for x in soup.select('span.css-ktio0g')]
    pp(goal)

main('https://robinhood.com/stocks/WISH')
Output:
['Piotr Szulczewski',
'—',
'San Francisco, California',
'2010',
'7.25B',
'—',
'—',
'155.46M',
'$12.39',
'$11.44',
'$11.97',
'76.70M',
'$32.85',
'$7.52',
'— per share',
'Expected Aug 11, After Hours',
'Sign up for a Robinhood Account to buy or sell ContextLogic stock and '
'options commission-free.']
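If you would rather keep the find/find_all approach from the question, the error disappears once you drill into a single element before searching again; a rough sketch (the class names are taken from the question and may have changed on the live page):

import requests
from bs4 import BeautifulSoup

r = requests.get('https://robinhood.com/stocks/WISH')
soup = BeautifulSoup(r.content, 'html.parser')

# find() returns a single Tag (or None), and a Tag does support find_all()
container = soup.find(class_="css-6e9xj2")
if container is not None:
    for span in container.find_all(class_="css-ktio0g"):
        print(span.text)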
Using Beautiful Soup and pandas, I am trying to append all the links on a site into a list with the following code. I am able to scrape all pages with the relevant information in the table, so the code seems to mostly work. The small problem is that only the links from the last page appear in the output, which is not what I expected. In the end, I'd like a list containing all 40 links (next to the required info) across 2 pages. I'm scraping 2 pages first, although there are 618 pages in total. Do you have any advice on how to adjust the code so that each link is appended into the table? Many thanks in advance.
import pandas as pd
import requests
from bs4 import BeautifulSoup
hdr = {'User-Agent': 'Chrome/84.0.4147.135'}
dfs = []
for page_number in range(2):
    http = "http://example.com/&Page={}".format(page_number+1)
    print('Downloading page %s...' % http)
    url = requests.get(http, headers=hdr)
    soup = BeautifulSoup(url.text, 'html.parser')
    table = soup.find('table')
    df_list = pd.read_html(url.text)
    df = pd.concat(df_list)
    dfs.append(df)

links = []
for tr in table.findAll("tr"):
    trs = tr.findAll("td")
    for each in trs:
        try:
            link = each.find('a')['href']
            links.append(link)
        except:
            pass

df['Link'] = links

final_df = pd.concat(dfs)
final_df.to_csv('myfile.csv', index=False, encoding='utf-8-sig')
It's an issue with your logic. You only add the links column to the last df because that part is outside your loop. Get the links within the page loop, add them to df, and then append the df to your dfs list:
import pandas as pd
import requests
from bs4 import BeautifulSoup
hdr = {'User-Agent': 'Chrome/84.0.4147.135'}
dfs = []
for page_number in range(2):
    http = "http://example.com/&Page={}".format(page_number+1)
    print('Downloading page %s...' % http)
    url = requests.get(http, headers=hdr)
    soup = BeautifulSoup(url.text, 'html.parser')
    table = soup.find('table')
    df_list = pd.read_html(url.text)
    df = pd.concat(df_list)

    links = []
    for tr in table.findAll("tr"):
        trs = tr.findAll("td")
        for each in trs:
            try:
                link = each.find('a')['href']
                links.append(link)
            except:
                pass

    df['Link'] = links
    dfs.append(df)

final_df = pd.concat(dfs)
final_df.to_csv('myfile.csv', index=False, encoding='utf-8-sig')
I'm making a text scraper for YouTube in which I want to enter data, search for videos, and collect data on them. I'm having trouble entering data into the text field. Can anyone suggest a method to do that?
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get('https://freight.rivigo.com/dashboard/home')
soup = BeautifulSoup(driver.page_source, 'lxml')  # Use the page as source

elem = driver.find_element_by_tag_name("body")
no_of_pagedowns = 120
while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.5)
    no_of_pagedowns -= 1

soup = BeautifulSoup(driver.page_source, 'lxml')
In between this code I want to enter custom text into an input field, let's say "comedy", and get data on that. I'm stuck on how to input the data, and I'm quite new to this, so any help would be appreciated.
That page is NOT pointing to YouTube. Check out the working code sample below for an idea of what you can do with the YouTube API.
# https://medium.com/greyatom/youtube-data-in-python-6147160c5833
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from youtube_data import youtube_search  # helper module from the linked Medium article

test = youtube_search("Nine Inch Nails")
test.keys()
test['commentCount'][:5]
df = pd.DataFrame(data=test)
df.head()
df1 = df[['title','viewCount','channelTitle','commentCount','likeCount','dislikeCount','tags','favoriteCount','videoId','channelId','categoryId']]
df1.columns = ['Title','viewCount','channelTitle','commentCount','likeCount','dislikeCount','tags','favoriteCount','videoId','channelId','categoryId']
df1.head()
#import numpy as np
#numeric_dtype = ['viewCount','commentCount','likeCount','dislikeCount','favoriteCount']
#for i in numeric_dtype:
# df1[i] = df[i].astype(int)
NIN = df1[df1['channelTitle']=='Nine Inch Nails']
NIN.head()
NIN = NIN.sort_values(ascending=False,by='viewCount')
plt.bar(range(NIN.shape[0]),NIN['viewCount'])
plt.xticks(range(NIN.shape[0]),NIN['Title'],rotation=90)
plt.ylabel('viewCount in 100 millions')
plt.show()
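If you do want to stick with Selenium rather than the API, entering text into a field is done with send_keys; here is a hedged sketch against YouTube's search box (the "search_query" element name is an assumption and may change):

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get("https://www.youtube.com/")

# "search_query" has historically been the name of YouTube's search input;
# adjust the selector if the page markup has changed.
search_box = driver.find_element_by_name("search_query")
search_box.send_keys("comedy")
search_box.send_keys(Keys.RETURN)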
Bear with me. I'm writing every detail because so many parts of the toolchain do not handle Unicode gracefully and it's not clear what is failing.
PRELUDE
We first set up and use a recent Scrapy.
source ~/.scrapy_1.1.2/bin/activate
Since the terminal's default is ascii, not unicode, we set:
export LC_ALL=en_US.UTF-8
export LANG=en_US.UTF-8
Also since by default Python uses ascii, we modify the encoding:
export PYTHONIOENCODING="utf_8"
Now we're ready to start a Scrapy project.
scrapy startproject myproject
cd myproject
scrapy genspider dorf PLACEHOLDER
We're told we now have a spider.
Created spider 'dorf' using template 'basic' in module:
myproject.spiders.dorf
We modify myproject/items.py to be:
# -*- coding: utf-8 -*-
import scrapy
class MyprojectItem(scrapy.Item):
    title = scrapy.Field()
ATTEMPT 1
Now we write the spider, relying on urllib.unquote
# -*- coding: utf-8 -*-
import scrapy
import urllib
from myproject.items import MyprojectItem
class DorfSpider(scrapy.Spider):
    name = "dorf"
    allowed_domains = [u'http://en.sistercity.info/']
    start_urls = (
        u'http://en.sistercity.info/sister-cities/Düsseldorf.html',
    )

    def parse(self, response):
        item = MyprojectItem()
        item['title'] = urllib.unquote(
            response.xpath('//title').extract_first().encode('ascii')
        ).decode('utf8')
        return item
And finally we use a custom item exporter (from all the way back in Oct 2011)
# -*- coding: utf-8 -*-
import json
from scrapy.exporters import BaseItemExporter
class UnicodeJsonLinesItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs)
        self.file = file
        self.encoder = json.JSONEncoder(ensure_ascii=False, **kwargs)

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(self.encoder.encode(itemdict) + '\n')
and add
FEED_EXPORTERS = {
    'json': 'myproject.exporters.UnicodeJsonLinesItemExporter',
}
to myproject/settings.py.
Now we run
~/myproject> scrapy crawl dorf -o dorf.json -t json
we get
UnicodeEncodeError: 'ascii' codec can't encode character u'\xfc' in position 25: ordinal not in range(128)
ATTEMPT 2
Another solution (the candidate solution for Scrapy 1.2?) is to use the spider
# -*- coding: utf-8 -*-
import scrapy
from myproject.items import MyprojectItem
class DorfSpider(scrapy.Spider):
    name = "dorf"
    allowed_domains = [u'http://en.sistercity.info/']
    start_urls = (
        u'http://en.sistercity.info/sister-cities/Düsseldorf.html',
    )

    def parse(self, response):
        item = MyprojectItem()
        item['title'] = response.xpath('//title')[0].extract()
        return item
and the custom item exporter
# -*- coding: utf-8 -*-
from scrapy.exporters import JsonItemExporter
class Utf8JsonItemExporter(JsonItemExporter):
    def __init__(self, file, **kwargs):
        super(Utf8JsonItemExporter, self).__init__(
            file, ensure_ascii=False, **kwargs)
with
FEED_EXPORTERS = {
    'json': 'myproject.exporters.Utf8JsonItemExporter',
}
in myproject/settings.py.
We get the following JSON file.
[
    {"title": "<title>Sister cities of D\u00fcsseldorf \u2014 sistercity.info</title>"}
]
The non-ASCII characters are written as \u escape sequences rather than as UTF-8. Although this is a trivial problem for a couple of characters, it becomes a serious issue if the entire output is in a foreign language.
How can I get the output in UTF-8?
In Scrapy 1.2+ there is a FEED_EXPORT_ENCODING option. With FEED_EXPORT_ENCODING = "utf-8", escaping of non-ASCII symbols in the JSON output is turned off.
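So in myproject/settings.py it's just:

FEED_EXPORT_ENCODING = "utf-8"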
Please try this on your Attempt 1 and let me know if it works (I've tested it without setting all those env. variables):
def to_write(uni_str):
    return urllib.unquote(uni_str.encode('utf8')).decode('utf8')

class CitiesSpider(scrapy.Spider):
    name = "cities"
    allowed_domains = ["sistercity.info"]
    start_urls = (
        'http://en.sistercity.info/sister-cities/Düsseldorf.html',
    )

    def parse(self, response):
        for i in range(2):
            item = SimpleItem()
            item['title'] = to_write(response.xpath('//title').extract_first())
            item['url'] = to_write(response.url)
            yield item
The range(2) is just for testing the JSON exporter. To get a list of dicts you can do this instead:
# -*- coding: utf-8 -*-
from scrapy.contrib.exporter import JsonItemExporter
from scrapy.utils.serialize import ScrapyJSONEncoder

class UnicodeJsonLinesItemExporter(JsonItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        self.encoder = ScrapyJSONEncoder(ensure_ascii=False, **kwargs)
        self.first_item = True
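As with the earlier attempts, register it in myproject/settings.py:

FEED_EXPORTERS = {
    'json': 'myproject.exporters.UnicodeJsonLinesItemExporter',
}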