Given the following html
html = """
<html>
<body>
<p>
<p>
Hello World
</p>
</p>
<body>
</html>
"""
When I pass it to BeautifulSoup using lxml and prettify it, this is the output I get
print BeautifulSoup(html, 'lxml').prettify()
<html>
<body>
<p>
</p>
<p>
Hello World
</p>
</body>
</html>
Because <p> tags can't be nested (it's invalid HTML), the outer <p> is closed as soon as the inner one is opened during parsing.
Is there a way to ignore this and treat them as nestable?
I can't parse it as XML because I want to use CSS selectors, which I can't use when the document is parsed as XML.
Any way to configure lxml to ignore invalid html?
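You can use html.parser, which keeps the nested <p> tags. Note, however, that with html.parser an extra <body> element gets added, because the input ends with <body> instead of </body>.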
In [1]: from bs4 import BeautifulSoup
In [2]: html = """<html>
...: <body>
...:
...: <p>
...: <p>
...: Hello World
...: </p>
...: </p>
...:
...: <body>
...: </html>"""
In [3]: soup = BeautifulSoup(html, "html.parser")
In [4]: soup.prettify()
Out[4]: '<html>\n <body>\n <p>\n <p>\n Hello World\n </p>\n </p>\n <body>\n </body>\n </body>\n</html>'
In [5]: print(soup.prettify())
<html>
<body>
<p>
<p>
Hello World
</p>
</p>
<body>
</body>
</body>
</html>
Using the html5 parser instead, a <head> tag gets added and the nested <p> tags are still split apart:
In [11]: html = """<html>
...: <body>
...:
...: <p>
...: <p>
...: Hello World
...: </p>
...: </p>
...:
...: <body>
...: </html>"""
In [12]: soup = BeautifulSoup(html, "html5")
In [13]: print(soup.prettify())
<html>
<head>
</head>
<body>
<p>
</p>
<p>
Hello World
</p>
<p>
</p>
</body>
</html>
Related
I want to upload a jpg file to imgur and get the jpg's link.
I have imgur API's Client Id and Client Secret.
Delphi code as below:
// Button click handler: POSTs the local file D:\linedw.jpg to the Imgur REST API
// and appends the outcome (success or failure plus the response body) to mo_response.
procedure TfrmMain.Button6Click(Sender: TObject);
var
client: TRESTClient;
request: TRESTRequest;
response: TCustomRESTResponse;
begin
client := TRESTClient.Create(nil);
try
client.BaseURL := 'https://api.imgur.com/';
// NOTE(review): these add two HTTP headers literally named 'Client ID' and
// 'Client Secret'. The Imgur API reportedly expects a single 'Authorization'
// header instead - confirm against the Imgur API documentation.
Client.AddParameter('Client ID', '...', TRESTRequestParameterKind.pkHTTPHEADER);
Client.AddParameter('Client Secret', '...', TRESTRequestParameterKind.pkHTTPHEADER);
request := TRESTRequest.Create(nil);
try
request.Client := client;
request.Method := rmPOST;
// NOTE(review): 'a/C11W7xC' is not an upload endpoint; the server answers this
// request with an HTTP 404 HTML page. The documented upload resource is
// reportedly '3/upload' - confirm against the Imgur API documentation.
request.Resource := 'a/C11W7xC';
request.Accept := 'application/json';
// Attach the local file as the 'image' request parameter (kind pkFile).
request.AddParameter('image','D:\linedw.jpg' , pkFile);
request.Execute;
response := request.Response;
// Log the response body in both cases; on failure the status text is included.
if response.Status.Success then
begin
mo_response.Lines.add('Success: ' + slinebreak + response.Content);
end
else
begin
mo_response.Lines.add('Failed ' + response.StatusText + ': ' + slinebreak + response.Content);
end;
finally
// Release the request even if Execute or the logging raised.
request.Free;
end;
finally
// Release the client last, after the request that references it.
client.Free;
end;
end;
The error information from response.Content is as below:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>imgur: the simple 404 page</title>
<meta http-equiv="content-type" content="text/html;charset=utf-8" />
<meta name="robots" content="noindex,nofollow" />
<meta name="keywords" content="images, funny pictures, image host, image upload, image sharing, image resize" />
<meta name="description" content="Imgur is home to the web's most popular image content, curated in real time by a dedicated community through commenting, voting and sharing." />
<meta name="copyright" content="Copyright 2014 Imgur, Inc." />
<meta http-equiv="X-UA-Compatible" content="IE=Edge;" />
<link rel="stylesheet" type="text/css" href="https://s.imgur.com/min/404.css?1393899213" />
<!--[if IE 9]><link rel="stylesheet" href="https://s.imgur.com/include/css/ie-sucks.css?0" type="text/css" /><![endif]-->
</head>
<body>
<div class="nodisplay">
Imgur is home to the web's most popular image content, curated in real time by a dedicated community through commenting, voting and sharing.
</div>
<div id="hallway">
<div class="container">
<div id="cat1" class="painting">
<img src="//s.imgur.com/images/404/cat1weyes.png">
<div class="eye-container">
<div class="eye left">
<div class="pupil"></div>
</div>
<div class="eye right">
<div class="pupil"></div>
</div>
</div>
</div>
<div id="cat2" class="painting">
<img src="//s.imgur.com/images/404/cat2weyes.png">
<div class="eye-container">
<div class="eye">
<div class="pupil"></div>
</div>
</div>
</div>
<div id="giraffe" class="painting">
<img src="//s.imgur.com/images/404/giraffeweyes.png">
<div class="eye-container">
<div class="eye left">
<div class="pupil"></div>
</div>
<div class="eye right">
<div class="pupil"></div>
</div>
</div>
<img class="monocle" src="//s.imgur.com/images/404/monocle.png" />
</div>
<div id="cat3" class="painting">
<img src="//s.imgur.com/images/404/cat3weyes.png">
<div class="eye-container">
<div class="eye left">
<div class="pupil"></div>
</div>
<div class="eye right">
<div class="pupil"></div>
</div>
</div>
</div>
<div id="cat4" class="painting">
<img src="//s.imgur.com/images/404/cat4weyes.png">
<div class="eye-container">
<div class="eye left">
<div class="pupil"></div>
</div>
<div class="eye right">
<div class="pupil"></div>
</div>
</div>
</div>
</div>
</div>
<div class="footer textbox">
<h1>Zoinks! You've taken a wrong turn.</h1>
<p>Let's split up, gang. If you're looking for an image, it's probably been deleted or may not have existed at all.</p>
<p>If you are looking for groovy images, visit our gallery!</p>
<img src="https://s.imgur.com/images/imgurlogo-header.png">
</div>
<script type="text/javascript">
(function(widgetFactory) {
widgetFactory.mergeConfig('analytics', {
isAdmin: false
});
})(_widgetFactory);
</script>
<script type="text/javascript" src="https://s.imgur.com/min/404.js?1393899213"></script>
<script type="text/javascript">
var e404 = E404.getInstance();
e404.generalInit();
</script>
</body>
</html>
I have no experience with calling a REST API. I searched for Delphi demo information, but I did not find much. I need some guidance about this.
Delphi 10.4 / Windows 10
You are trying to upload an image to an invalid resource a/C11W7xC on the server, which is why you get an HTTP 404 Not Found response with HTML content.
According to the documentation, the resource for uploading images is 3/upload instead.
I haven't used the API myself, but it seems to me that the authorization you are using is not in line with Imgur's authorization.
Imgur's API allows you to upload images either anonymously via an Authorization: Client-ID {YOUR_CLIENT_ID} HTTP header, or using an Authorization: Bearer {YOUR_ACCESS_TOKEN} HTTP header to tie the uploaded image to your account. See Authorization and OAuth on how to obtain the access token.
Note that you should not share your client credentials with the whole world; it has "secret" in its name, after all. I recommend that you renew your client credentials at this point.
I am working with Django and geemap modules, in which I am trying to make an app that can display satellite data on the map and the map should also be interactive as in there should be a bidirectional flow of data from the front-end(Django template) to back-end(python script) and vice-versa.
As of now, I only know how to display an instance of geemap.Map() in a Jupyter Notebook cell or on Colab (there we just need to write the name of the variable). But I have no idea how I can display an instance of geemap.Map() in a Django template.
When I use the following method it just prints the instance object as a dictionary instead of interpreting it as a map and displaying the same.
The code for my views.py
from django.http import HttpResponse
from django.shortcuts import render
import geemap as gm
#import pandas as pd
def params(request):
    """Render ``PlotMap/params.html`` with a newly created geemap map.

    The map instance is exposed to the template under the context key ``m``.
    """
    context = {"m": gm.Map()}
    return render(request, "PlotMap/params.html", context)
The code for the template(params.html)
<!DOCTYPE html>
{% load static %}
<html>
<head>
<meta charset="utf-8">
<title>map</title>
</head>
<body>
{{ m }}
</body>
</html>
The output that I get is as follows. output
If someone can help me out, it would mean a lot. Thank you.
You can use geemap.foliumap.Map() or folium.Map()
Code for html template
<!DOCTYPE html>
{% load static %}
<html>
<head>
<meta charset="utf-8">
<title>map</title>
{{ map.header.render|safe }}
</head>
<body>
<div class="map">
{{ map.html.render|safe }}
</div>
</body>
<script> {{ map.script.render | safe }}</script>
</html>
Code for backend (views.py)
import folium
import geemap.foliumap as geemap
from django.views.generic import TemplateView


class map(TemplateView):
    """Class-based view that renders ``map.html`` with a folium-backed geemap map.

    The template reads the figure from the context key ``map`` and outputs its
    ``header``, ``html`` and ``script`` parts separately.
    """

    template_name = 'map.html'

    def get_context_data(self, **kwargs):
        """Build the template context, adding the rendered figure as ``map``.

        Django calls this as ``self.get_context_data(**kwargs)``, so the method
        must accept ``self`` plus arbitrary keyword arguments (URL kwargs).
        """
        # Keep the default context provided by the base classes.
        context = super().get_context_data(**kwargs)

        figure = folium.Figure()
        Map = geemap.Map(plugin_Draw=True, Draw_export=True)
        Map.add_to(figure)
        # Render before handing the figure to the template, which accesses
        # figure.header / figure.html / figure.script.
        figure.render()

        context["map"] = figure
        return context
Code for urls.py
urlpatterns = [
path('', views.map.as_view(), name = 'map'),
]
I am trying to scrape data from a webpage with the following format
<html class="gr__racinng_applledaily_com_hk" style="overflow: initial;">
<head> ... </head>
<body data-gr-c-s-loaded="true">
<!-- Google Tag Mananger (noscript) -->
<noscript> ...</noscript>
<!-- End Google Tag Mananger (noscript) -->
<div data-v-6223d6a8 id="app" class="web"> ... </div>
</body>
</html>
By using
from bs4 import BeautifulSoup
page = BeautifulSoup(raw_html.content, 'html.parser')
or
from bs4 import BeautifulSoup
page = BeautifulSoup(raw_html.content, 'html5lib')
the <div> part is missing from the parsed result. Is it possible to get it back?
I am trying to delete each “a” tag whose attribute value matches “table.xml”.
For this I use vbscript.
Can anyone help me with this or give me tip?
Thank you
XML file before editing:
<?xml version="1.0" encoding="UTF-8"?>
<html>
<head>
<title>title</title>
</head>
<body>
<h1>title h1</h1>
<p>This is a test.</p>
<p>This is a test.</p>
<ul>
<li>List one</li>
<li>List two test</li>
</ul>
<p>This is a test.</p>
</body>
</html>
XML file as it should look after deleting the “a” tags with attribute value “table.xml”:
<?xml version="1.0" encoding="UTF-8"?>
<html>
<head>
<title>title</title>
</head>
<body>
<h1>title h1</h1>
<p>This is a test.</p>
<p>This is a test.</p>
<ul>
<li>List one</li>
<li>List two test</li>
</ul>
<p>This is a test.</p>
</body>
</html>
The linked answer does not deal with finding elements 'everywhere' ("//a"), and the text "test" is special because it 'belongs' to both the <a> node and its parent, so here is a complete script:
Option Explicit

' Loads an XML file, finds every <a> element anywhere in the document whose
' href attribute equals "table.xml", and replaces each such element with its
' own text, so the surrounding sentence is preserved. The document is echoed
' before and after the change; nothing is written back to disk.

Dim sFSpec : sFSpec = "..\data\39911374.xml"
Dim oXML : Set oXML = CreateObject("Msxml2.DOMDocument.6.0")
oXML.setProperty "SelectionLanguage", "XPath"
oXML.async = False
oXML.load sFSpec
If 0 = oXML.parseError Then
    WScript.Echo oXML.xml
    ' "@href" selects the href attribute; "//a" matches <a> at any depth.
    Dim sXPath : sXPath = "//a[@href=""table.xml""]"
    Dim ndlFnd : Set ndlFnd = oXML.selectNodes(sXPath)
    If 0 = ndlFnd.length Then
        WScript.Echo sXPath, "not found"
    Else
        WScript.Echo "found", ndlFnd.length, "nodes for", sXPath
        Dim ndCur
        For Each ndCur In ndlFnd
            ' Swap the <a> element for a plain text node holding its text.
            ' Unlike overwriting the parent's .text, this keeps the parent's
            ' other child elements and also works when several matching <a>
            ' elements share one parent.
            ndCur.parentNode.replaceChild oXML.createTextNode(ndCur.text), ndCur
        Next
        WScript.Echo "-----------------"
        WScript.Echo oXML.xml
    End If
Else
    ' Loading or parsing failed; report why.
    WScript.Echo oXML.parseError.reason
End If
output:
cscript 39911374.vbs
<?xml version="1.0"?>
<html>
<head>
<title>title</title>
</head>
<body>
<h1>title h1</h1>
<p>This is a test.</p>
<p>This is a test.</p>
<ul>
<li>List one</li>
<li>List two test</li>
</ul>
<p>This is a test.</p>
</body>
</html>
found 2 nodes for //a[@href="table.xml"]
-----------------
<?xml version="1.0"?>
<html>
<head>
<title>title</title>
</head>
<body>
<h1>title h1</h1>
<p>This is a test.</p>
<p>This is a test.</p>
<ul>
<li>List one</li>
<li>List two test</li>
</ul>
<p>This is a test.</p>
</body>
</html>
I've been working on wrapping my head around ExpressionEngine, and I'm using the Structure add-on as well. This is my first attempt at building my own template.
The problem I'm having is on the index page, when you initially land on the site. It repeats the entire HTML over and over until it has loaded every page's content. It only does this on the index page. If you click on any other page, it works as it's supposed to.
Here's my code:
<!DOCTYPE html>
<html>
{!-- Everything between the opening and closing channel entries tags below is output once per entry returned from the pages channel. NOTE(review): no entry_id or limit parameter is set, so a request that does not resolve to a single entry appears to repeat the whole head and body - confirm. --}
{exp:channel:entries channel="pages"}
<head>
<meta charset="UTF-8">
{!-- add="filename|filename2" --}
{embed="Pages/styles"}
{embed="Pages/scripts"}
<title>{title}</title>
</head>
<body>
<header>
<img src="img/logo.png" />
<nav>
{!-- Main navigation, starting from the site root, two levels deep. --}
{exp:structure:nav css_id="none" start_from="/" show_depth="2"}{/exp:structure:nav}
</nav>
</header>
<main>
<nav>
{exp:structure:breadcrumb inc_home="no" here_as_title="yes"}{/exp:structure:breadcrumb}
</nav>
{!-- The sidebar navigation is only output when the current page has children or a parent. --}
{if structure:child_ids != '' OR structure:parent:entry_id != '0'}
<aside>
{exp:structure:nav css_id="none" show_depth="2" start_from="/{segment_1}" show_overview="yes" rename_overview="{structure:top:title}"}{/exp:structure:nav}
</aside>
{/if}
<article>
<h1>{title}</h1>
{page_contents}
</article>
</main>
<footer>
<nav>
{!-- Footer navigation, top-level pages only. --}
{exp:structure:nav css_id="none" start_from="/" max_depth="1"}{/exp:structure:nav}
</nav>
<address>
<p>1234 Address St<br />
City Name, STATE 12345<br />
Phonee: 123 456 7890<br />
Email Us</p>
</address>
</footer>
</body>
{/exp:channel:entries}
</html>
And here is the link to the site: (no styling, just building and testing for now) http://www.oneoffs.co
It's not the prettiest solution but you could simply add entry_id to the loop which would limit it to only the homepage.
{exp:channel:entries channel="pages" entry_id="1"}
..
{/exp:channel:entries}
or
{exp:channel:entries channel="pages" {if segment_1 == ''}entry_id="1"{/if}}
..
{/exp:channel:entries}
...which only adds the entry_id if you're on your homepage (assumption that segment_1 is blank).