How do you get everything under the root element? - lxml

I'm using etree to clean up some html. I realise that I have to have a root tag to hold all the elements, but I want to return the string without the root. Is there any way to do this.
from lxml import etree
fragment = etree.fromstring("<fragment>text1 <a>tex2 </a>text3<b>text4</b> <c>text 5</c>text 6<span style=''> Style</span></fragment>")
delete_tag = 'delete_me'
for element in fragment.xpath(".//span[#style='']"):
element.tag = delete_tag
etree.strip_tags(fragment, delete_tag)
print(etree.tostring(fragment))
What I get is
b'<fragment>text1 <a>tex2 </a>text3<b>text4</b> <c>text 5</c>text 6 Style</fragment>'
but I want is
text1 <a>tex2 </a>text3<b>text4</b> <c>text 5</c>text 6 Style

Try something along these lines:
elems = fragment.xpath('.//*')
target = ''
target+=(fragment.xpath('./text()[1]')[0])
for elem in elems:
target+=(etree.tostring(elem).decode())
target
Output:
'text1 <a>tex2 </a>text3<b>text4</b> <c>text 5</c>text 6 Style'

Related

get the index of string element in np1 while it has a substring in np2

my code is as below:
import numpy as np
keywordlist = ['cpp-4.8.5', 'CUnit-2.1.3', 'CUnit-devel', 'doxygen-1.8.5', 'e2fsprogs-1.42.9', 'e2fsprogs-libs', 'epel-release', 'fuse3-devel', 'fuse3-libs', 'gcc-4.8.5', 'gcc-c++', 'gcc-gfortran', 'ghc-array', 'ghc-base', 'ghc-bytestring', 'ghc-containers', 'ghc-deepseq', 'ghc-directory', 'ghc-filepath', 'ghc-json', 'ghc-mtl', 'ghc-old', 'ghc-parsec', 'ghc-pretty', 'ghc-regex', 'ghc-regex', 'ghc-ShellCheck', 'ghc-syb', 'ghc-text', 'ghc-time', 'ghc-transformers', 'ghc-unix', 'git-1.8.3.1', 'graphviz-2.30.1', 'help2man-1.41.1', 'ibacm-22.4', 'keyutils-libs', 'krb5-devel', 'krb5-libs', 'krb5-workstation', 'lcov-1.13', 'libaio-devel', 'libblkid-2.23.2', 'libcom_err-1.42.9', 'libcom_err-devel', 'libgcc-4.8.5', 'libgfortran-4.8.5', 'libgomp-4.8.5', 'libibumad-22.4', 'libibverbs-22.4', 'libiscsi-devel', 'libkadm5-1.15.1', 'libmount-2.23.2', 'libpmem-1.5.1', 'libpmemblk-1.5.1', 'libpmemblk-devel', 'libpmem-devel', 'libquadmath-4.8.5', 'libquadmath-devel', 'librdmacm-22.4', 'libselinux-2.5', 'libselinux-devel', 'libselinux-python', 'libselinux-utils', 'libsepol-devel', 'libsmartcols-2.23.2', 'libss-1.42.9', 'libstdc++-4.8.5', 'libstdc++-devel', 'libunwind-1.2', 'libunwind-devel', 'libuuid-2.23.2', 'libuuid-devel', 'libverto-devel', 'libXaw-1.0.13', 'libXScrnSaver-1.2.2', 'make-3.82', 'nasm-2.10.07', 'numactl-devel', 'numactl-libs', 'openssl-1.0.2k', 'openssl-devel', 'openssl-libs', 'pcre-devel', 'perl-Digest', 'perl-Digest', 'perl-GD', 'perl-Git', 'python-2.7.5', 'python2-pycodestyle', 'python-libs', 'rdma-core', 'rdma-core', 'sg3_utils-1.37', 'sg3_utils-libs', 'ShellCheck-0.3.8', 'util-linux', 'zlib-devel']
np1 = np.array(keywordlist)
# ['cpp-4.8.5' 'CUnit-2.1.3' 'CUnit-devel' 'doxygen-1.8.5' ... 'ShellCheck-0.3.8' 'util-linux' 'zlib-devel']
result = ['epel-release-7-12.noarch', 'rdma-core-22.4-5.el7.x86_64', 'cpp-4.8.5-44.el7.x86_64', 'doxygen-1.8.5-4.el7.x86_64', 'ghc-base-4.6.0.1-26.4.el7.x86_64', 'libuuid-2.23.2-65.el7.x86_64', 'python-libs-2.7.5-89.el7.x86_64', 'libkadm5-1.15.1-50.el7.x86_64', 'libmount-2.23.2-65.el7.x86_64', 'libquadmath-4.8.5-44.el7.x86_64', 'util-linux-2.23.2-65.el7.x86_64', 'libss-1.42.9-19.el7.x86_64', 'keyutils-libs-1.5.8-3.el7.x86_64', 'e2fsprogs-libs-1.42.9-19.el7.x86_64', 'ghc-pretty-1.1.1.0-26.4.el7.x86_64', 'libXaw-1.0.13-4.el7.x86_64', 'libselinux-2.5-15.el7.x86_64', 'libibverbs-22.4-5.el7.x86_64', 'libselinux-utils-2.5-15.el7.x86_64', 'libgomp-4.8.5-44.el7.x86_64', 'libblkid-2.23.2-65.el7.x86_64', 'gcc-c++-4.8.5-44.el7.x86_64', 'e2fsprogs-1.42.9-19.el7.x86_64', 'CUnit-devel-2.1.3-8.el7.x86_64', 'make-3.82-24.el7.x86_64', 'numactl-libs-2.0.12-5.el7.x86_64', 'perl-Git-1.8.3.1-23.el7_8.noarch', 'openssl-libs-1.0.2k-19.el7.x86_64', 'gcc-4.8.5-44.el7.x86_64', 'CUnit-2.1.3-8.el7.x86_64', 'ghc-syb-0.4.0-35.el7.x86_64', 'gcc-gfortran-4.8.5-44.el7.x86_64', 'libselinux-python-2.5-15.el7.x86_64', 'sg3_utils-libs-1.37-19.el7.x86_64', 'fuse3-libs-3.6.1-4.el7.x86_64', 'libquadmath-devel-4.8.5-44.el7.x86_64', 'libgfortran-4.8.5-44.el7.x86_64', 'krb5-workstation-1.15.1-50.el7.x86_64', 'librdmacm-22.4-5.el7.x86_64', 'sg3_utils-1.37-19.el7.x86_64', 'libsmartcols-2.23.2-65.el7.x86_64', 'fuse3-devel-3.6.1-4.el7.x86_64', 'python-2.7.5-89.el7.x86_64', 'openssl-1.0.2k-19.el7.x86_64', 'libgcc-4.8.5-44.el7.x86_64', 'libaio-devel-0.3.109-13.el7.x86_64', 'ghc-old-locale-1.0.0.5-26.4.el7.x86_64', 'libcom_err-1.42.9-19.el7.x86_64', 'git-1.8.3.1-23.el7_8.x86_64', 'krb5-libs-1.15.1-50.el7.x86_64']
np2 = np.array(result)
# ['epel-release-7-12.noarch' 'rdma-core-22.4-5.el7.x86_64' ... 'krb5-libs-1.15.1-50.el7.x86_64']
expectation = ['cpp-4.8.5-39.el7.x86_64', 'CUnit-2.1.3-8.el7.x86_64', 'CUnit-devel-2.1.3-8.el7.x86_64', 'doxygen-1.8.5-4.el7.x86_64', 'e2fsprogs-1.42.9-17.el7.x86_64', 'e2fsprogs-libs-1.42.9-17.el7.x86_64', 'epel-release-latest-7.noarch', 'fuse3-devel-3.6.1-4.el7.x86_64', 'fuse3-libs-3.6.1-4.el7.x86_64', 'gcc-4.8.5-39.el7.x86_64', 'gcc-c++-4.8.5-39.el7.x86_64', 'gcc-gfortran-4.8.5-39.el7.x86_64', 'ghc-array-0.4.0.1-26.4.el7.x86_64', 'ghc-base-4.6.0.1-26.4.el7.x86_64', 'ghc-bytestring-0.10.0.2-26.4.el7.x86_64', 'ghc-containers-0.5.0.0-26.4.el7.x86_64', 'ghc-deepseq-1.3.0.1-26.4.el7.x86_64', 'ghc-directory-1.2.0.1-26.4.el7.x86_64', 'ghc-filepath-1.3.0.1-26.4.el7.x86_64', 'ghc-json-0.7-4.el7.x86_64', 'ghc-mtl-2.1.2-27.el7.x86_64', 'ghc-old-locale-1.0.0.5-26.4.el7.x86_64', 'ghc-parsec-3.1.3-31.el7.x86_64', 'ghc-pretty-1.1.1.0-26.4.el7.x86_64', 'ghc-regex-base-0.93.2-29.el7.x86_64', 'ghc-regex-tdfa-1.1.8-11.el7.x86_64', 'ghc-ShellCheck-0.3.8-1.el7.x86_64', 'ghc-syb-0.4.0-35.el7.x86_64', 'ghc-text-0.11.3.1-2.el7.x86_64', 'ghc-time-1.4.0.1-26.4.el7.x86_64', 'ghc-transformers-0.3.0.0-34.el7.x86_64', 'ghc-unix-2.6.0.1-26.4.el7.x86_64', 'git-1.8.3.1-23.el7_8.x86_64', 'graphviz-2.30.1-21.el7.x86_64', 'help2man-1.41.1-3.el7.noarch', 'ibacm-22.4-2.el7_8.x86_64', 'keyutils-libs-devel-1.5.8-3.el7.x86_64', 'krb5-devel-1.15.1-46.el7.x86_64', 'krb5-libs-1.15.1-46.el7.x86_64', 'krb5-workstation-1.15.1-46.el7.x86_64', 'lcov-1.13-1.el7.noarch', 'libaio-devel-0.3.109-13.el7.x86_64', 'libblkid-2.23.2-63.el7.x86_64', 'libcom_err-1.42.9-17.el7.x86_64', 'libcom_err-devel-1.42.9-17.el7.x86_64', 'libgcc-4.8.5-39.el7.x86_64', 'libgfortran-4.8.5-39.el7.x86_64', 'libgomp-4.8.5-39.el7.x86_64', 'libibumad-22.4-2.el7_8.x86_64', 'libibverbs-22.4-2.el7_8.x86_64', 'libiscsi-devel-1.9.0-7.el7.x86_64', 'libkadm5-1.15.1-46.el7.x86_64', 'libmount-2.23.2-63.el7.x86_64', 'libpmem-1.5.1-2.1.el7.x86_64', 'libpmemblk-1.5.1-2.1.el7.x86_64', 'libpmemblk-devel-1.5.1-2.1.el7.x86_64', 'libpmem-devel-1.5.1-2.1.el7.x86_64', 'libquadmath-4.8.5-39.el7.x86_64', 'libquadmath-devel-4.8.5-39.el7.x86_64', 'librdmacm-22.4-2.el7_8.x86_64', 'libselinux-2.5-15.el7.x86_64', 'libselinux-devel-2.5-15.el7.x86_64', 'libselinux-python-2.5-15.el7.x86_64', 'libselinux-utils-2.5-15.el7.x86_64', 'libsepol-devel-2.5-10.el7.x86_64', 'libsmartcols-2.23.2-63.el7.x86_64', 'libss-1.42.9-17.el7.x86_64', 'libstdc++-4.8.5-39.el7.x86_64', 'libstdc++-devel-4.8.5-39.el7.x86_64', 'libunwind-1.2-2.el7.x86_64', 'libunwind-devel-1.2-2.el7.x86_64', 'libuuid-2.23.2-63.el7.x86_64', 'libuuid-devel-2.23.2-63.el7.x86_64', 'libverto-devel-0.2.5-4.el7.x86_64', 'libXaw-1.0.13-4.el7.x86_64', 'libXScrnSaver-1.2.2-6.1.el7.x86_64', 'make-3.82-24.el7.x86_64', 'nasm-2.10.07-7.el7.x86_64', 'numactl-devel-2.0.12-5.el7.x86_64', 'numactl-libs-2.0.12-5.el7.x86_64', 'openssl-1.0.2k-19.el7.x86_64', 'openssl-devel-1.0.2k-19.el7.x86_64', 'openssl-libs-1.0.2k-19.el7.x86_64', 'pcre-devel-8.32-17.el7.x86_64', 'perl-Digest-1.17-245.el7.noarch', 'perl-Digest-MD5-2.52-3.el7.x86_64', 'perl-GD-2.49-3.el7.x86_64', 'perl-Git-1.8.3.1-23.el7_8.noarch', 'python-2.7.5-88.el7.x86_64', 'python2-pycodestyle-2.5.0-1.el7.noarch', 'python-libs-2.7.5-88.el7.x86_64', 'rdma-core-22.4-2.el7_8.x86_64', 'rdma-core-devel-22.4-2.el7_8.x86_64', 'sg3_utils-1.37-19.el7.x86_64', 'sg3_utils-libs-1.37-19.el7.x86_64', 'ShellCheck-0.3.8-1.el7.x86_64', 'util-linux-2.23.2-63.el7.x86_64', 'zlib-devel-1.2.7-18.el7.x86_64']
np3 = np.array(expectation)
# ['cpp-4.8.5-39.el7.x86_64' 'CUnit-2.1.3-8.el7.x86_64' ... 'util-linux-2.23.2-63.el7.x86_64' 'zlib-devel-1.2.7-18.el7.x86_64']
ready = []
for i in keywordlist:
for j in result:
x = np.char.startswith(j, i)
if x:
ready.append(np3[np.where(np.char.startswith(np3, i))])
np4 = np.array(ready)
# [array(['cpp-4.8.5-39.el7.x86_64'], dtype='<U39') array(['CUnit-2.1.3-8.el7.x86_64'], dtype='<U39') ... array(['util-linux-2.23.2-63.el7.x86_64'], dtype='<U39')]
notready = [i for i in np3 if i not in np4]
print(f"not ready: {notready}")
The purpose is to use string format keyword in keyword list to examine its existence in all np2 elements.
If any element in np2 starts with any keyword, or keyword is the substring of any element in np2, get the index of element in expectation which also start with that keyword and form into np4.
Finally, get not ready which is made up of elements that are in np3 but not in np4.
To make my explanation more vividly, I have a bunch of rpm files to be installed, the list of expectation.
The keyword list catches the former two keywords of each rpm file name.
Result is the standard output of already installed rpm files.
Taking cpp-4.8.5 as an example, I can see cpp-4.8.5-44.el7.x86_64 in result, which means currently cpp-4.8.5-44.el7.x86_64 has been installed. So, cpp-4.8.5-39.el7.x86_64 in expectation can be removed, since cpp-4.8.5-*.rpm has been successfully installed. Next step, deal with the other left items in expectation.
My question is: there any easier or more efficient way to get the result equivalent to notready? maybe with any other numpy built-in methods, but not with for loop.

Find link in text and replace with "a" tag

I have a partially good HTML, I need to create hyperlink, like:
Superotto: risorse audiovisive per superare i pregiudizi e celebrare
l’otto marzo, in “Indire Informa”, 5 marzo 2021,
https://www.indire.it/2021/03/05/superotto-risorse-audiovisive-per-superare-i-pregiudizi-e-celebrare-lotto-marzo/;
Sezione Superotto in
https://piccolescuole.indire.it/iniziative/la-scuola-allo-schermo/#superotto.
Has to become:
Superotto: risorse audiovisive per superare i pregiudizi e celebrare
l’otto marzo, in “Indire Informa”, 5 marzo 2021, < a
href="https://www.indire.it/2021/03/05/superotto-risorse-audiovisive-per-superare-i-pregiudizi-e-celebrare-lotto-marzo/" >https://www.indire.it/2021/03/05/superotto-risorse-audiovisive-per-superare-i-pregiudizi-e-celebrare-lotto-marzo/< /a >;
Sezione Superotto in < a
href="https://piccolescuole.indire.it/iniziative/la-scuola-allo-schermo/#superotto">https://piccolescuole.indire.it/iniziative/la-scuola-allo-schermo/#superotto< /a >.
Beautifulsoup seems to not find the http well, so I used this regex with the pure python findall, but I cannot substitute or compose the text. Right now I made:
links = re.findall(r"(http|ftp|https:\/\/)([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,#?^=%&:\/~+#-]*[\w#?^=%&\/~+#-])", str(soup))
link_to_replace = []
for l in links:
link = ''.join(l)
if link in soup.find("body").text:
good_link = ""+link+""
fixed_text = soup.replace(link, good_link)
soup.replace_with(fixed_text)
I tried multiple solutions in the last two lines (this is just one), none worked.
Perhaps as follows, where I first identify the relevant anchor elements and strip out any other attributes besides the href, then later substitute the href link with the href html
import re
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://rivista.clionet.it/vol5/giorgi-zoppi-la-ricerca-indire-tra-uso-didattico-del-patrimonio-storico-culturale-e-promozione-delle-buone-pratiche/')
soup = bs(r.text, 'lxml')
item = soup.select_one('p:has(a[id="ft-note-16"])')
text = item.text
for tag in item.select('a:not([id])'):
href = tag['href']
tag.attrs = {'href': href}
text = re.sub(href, str(tag), text)
text = re.sub(item.a.text, '', text).strip()
print(text)

modify html tags replace "src" with "data-src"

is there way to use BeautifulSoup to replace element tags for images, I have some HTML files that I want to replace the "src" with "data-src" of that img
<img src="//Pictures/q-90-90.png" data-src="//Pictures/p720_test.jpg">
code so far
soup = BeautifulSoup(open("template/home.html", 'lxml')
images = soup.findAll('img')
for i in images:
#replace src with data-src
I am open to any solution using regex as well, Ideally the output would be
<img src="//Pictures/p720_test.jpg" data-src="//Pictures/q-90-90.png">
Don't use regex on html; try this instead:
soup.find('img')['src']= "data-src"
Edit:
To swap attribute values inside the <img> elements, try this:
old_src = soup.select_one('img')['src']
old_data = soup.select_one('img')['data-src']
target = soup.select_one('img')
target['src']= old_data
target['data-src']= old_src

I want to extract the numbers at the end of a url using regular expressions in scrapy

I want to get '25430989' from the end of this url.
https://www.example.com/cars-for-sale/2007-ford-focus-1-6-diesel/25430989
How would I write it using the xpath?
I get the link using this xpath:
link = row.xpath('.//a/#href').get()
When I use a regex tester I can isolate it with r'(\d+)$ but when I put it into my code it doesn't work for some reason.
import scrapy
import re
from ..items import DonedealItem
class FarmtoolsSpider(scrapy.Spider):
name = 'farmtools'
allowed_domains = ['www.donedeal.ie']
start_urls = ['https://www.donedeal.ie/all?source=private&sort=publishdate%20desc']
def parse(self, response):
items = DonedealItem()
rows = response.xpath('//ul[#class="card-collection"]/li')
for row in rows:
if row.xpath('.//ul[#class="card__body-keyinfo"]/li[contains(text(),"0 min")]/text()'):
link = row.xpath('.//a/#href').get() #this is the full link.
linkid = link.re(r'(\d+)$).get()
title = row.xpath('.//p[#class="card__body-title"]/text()').get()
county = row.xpath('.//li[contains(text(),"min")]/following-sibling::node()/text()').get()
price = row.xpath('.//p[#class="card__price"]/span[1]/text()').get()
subcat = row.xpath('.//a/div/div[2]/div[1]/p[2]/text()[2]').get()
items['link'] = link
items['linkid'] = linkid
items['title'] = title
items['county'] = county
items['price'] = price
items['subcat'] = subcat
yield items
I'm trying to get the linkid.
The problem is here
link = row.xpath('.//a/#href').get() #this is the full link.
linkid = link.re(r'(\d+)$).get()
When you use the .get() method it returns a string that is saved in the link variable, and strings don't have a .re() method for you to call. You can use one of the methods from the re module (docs for reference).
I would use re.findall(), it will return you a list of values that matches the regex (in this case only one item would return), or None if nothing matches. re.search() is also a good choice, but will return you an re.Match object.
import re #Don't forget to import it
...
link = row.xpath('.//a/#href').get()
linkid = re.findall(r'(\d+)$', link)
Now, the Scrapy selectors also support regex, so an alternative would be implementing it like this: (No need for re module)
linkid = row.xpath('.//a/#href').re_first(r'(\d+)$')
Notice I didn't use .get() there.

Generate xml documents using lxml and vary element text and attributes based on logic

I have my lxml code like this
from lxml import etree
import sys
fd = open('D:\\text.xml', 'wb')
xmlns = "http://www.fpml.org/FpML-5/confirmation"
xsi = "http://www.w3.org/2001/XMLSchema-instance"
fpmlVersion="http://www.fpml.org/FpML-5/confirmation ../../fpml-main-5-6.xsd http://www.w3.org/2000/09/xmldsig# ../../xmldsig-core-schema.xsd"
page = etree.Element("{"+xmlns+"}dataDocument",nsmap={None:xmlns,'xsi':xsi })
doc = etree.ElementTree(page)
page.set("fpmlVersion", fpmlVersion)
trade = etree.SubElement(page,'trade')
tradeheader = etree.SubElement(trade,'tradeheader')
partyTradeIdentifier = etree.SubElement(tradeheader,'partyTradeIdentifier')
partyReference = etree.SubElement(partyTradeIdentifier,'partyReference',href='party1')
tradeId = etree.SubElement(partyTradeIdentifier,'tradeId',tradeIdScheme='http://www.partyA.com/swaps/trade-id')
tradeId.text = 'TW9235'
swap = etree.SubElement(trade,'swap')
party = etree.SubElement(page,'party',id='party1')
partyID = etree.SubElement(party,'partyID')
partyID.text = 'PARTYAUS33'
partyName = etree.SubElement(party,'partyName')
partyName.text = 'Party A'
party = etree.SubElement(page,'party',id='party2')
partyID = etree.SubElement(party,'partyID')
partyID.text = 'BARCGB2L'
partyName = etree.SubElement(party,'partyName')
partyName.text = 'Party B'
s = etree.tostring(doc, xml_declaration=True,encoding="UTF-8",pretty_print=True)
print (s)
fd.write(s)
And i need to generate a xml file like
<?xml version='1.0' encoding='UTF-8'?>
<dataDocument xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.fpml.org/FpML-5/confirmation" fpmlVersion="http://www.fpml.org/FpML-5/confirmation ../../fpml-main-5-6.xsd http://www.w3.org/2000/09/xmldsig# ../../xmldsig-core-schema.xsd">
<trade>
<tradeheader>
<partyTradeIdentifier>
<partyReference href="party1"/>
<tradeId tradeIdScheme="http://www.partyA.com/swaps/trade-id">TW9235</tradeId>
</partyTradeIdentifier>
</tradeheader>
<swap/>
</trade>
<party id="party1">
<partyID>PARTYAUS33</partyID>
<partyName>Party A</partyName>
</party>
<party id="party2">
<partyID>BARCGB2L</partyID>
<partyName>Party B</partyName>
</party>
</dataDocument>
Now the above code works.
However i need to generate 10k such files where the elements text or attributes vary .
For example the partyID maybe different like
PARTYGER45 instead of PARTYUS33 is there a clean way to do this instead of hard coding it ?
Similarly i need to vary lot of things like the tradeId TW9235
one way could be to have the output xml without values loaded to lxml objectify and then loop while setting relevant values and write it to a file, meaning
from lxml import objectify
with open('in.xml') as f_in:
for pId in ['PARTYGER45', ...]:
dataDocument = objectify.parse(f.read())
dataDocument.party.partyID._setText(pId)
...
obj_xml = lxml.etree.tostring(dataDocument)
with open('out_%s.xml' % pId, 'w') as f_out:
f.write(obj_xml)
another way might be to use lxml and xslt, again, start from an empty structured xml and transform the structure according to your needs.