my code is as below:
import numpy as np
keywordlist = ['cpp-4.8.5', 'CUnit-2.1.3', 'CUnit-devel', 'doxygen-1.8.5', 'e2fsprogs-1.42.9', 'e2fsprogs-libs', 'epel-release', 'fuse3-devel', 'fuse3-libs', 'gcc-4.8.5', 'gcc-c++', 'gcc-gfortran', 'ghc-array', 'ghc-base', 'ghc-bytestring', 'ghc-containers', 'ghc-deepseq', 'ghc-directory', 'ghc-filepath', 'ghc-json', 'ghc-mtl', 'ghc-old', 'ghc-parsec', 'ghc-pretty', 'ghc-regex', 'ghc-regex', 'ghc-ShellCheck', 'ghc-syb', 'ghc-text', 'ghc-time', 'ghc-transformers', 'ghc-unix', 'git-1.8.3.1', 'graphviz-2.30.1', 'help2man-1.41.1', 'ibacm-22.4', 'keyutils-libs', 'krb5-devel', 'krb5-libs', 'krb5-workstation', 'lcov-1.13', 'libaio-devel', 'libblkid-2.23.2', 'libcom_err-1.42.9', 'libcom_err-devel', 'libgcc-4.8.5', 'libgfortran-4.8.5', 'libgomp-4.8.5', 'libibumad-22.4', 'libibverbs-22.4', 'libiscsi-devel', 'libkadm5-1.15.1', 'libmount-2.23.2', 'libpmem-1.5.1', 'libpmemblk-1.5.1', 'libpmemblk-devel', 'libpmem-devel', 'libquadmath-4.8.5', 'libquadmath-devel', 'librdmacm-22.4', 'libselinux-2.5', 'libselinux-devel', 'libselinux-python', 'libselinux-utils', 'libsepol-devel', 'libsmartcols-2.23.2', 'libss-1.42.9', 'libstdc++-4.8.5', 'libstdc++-devel', 'libunwind-1.2', 'libunwind-devel', 'libuuid-2.23.2', 'libuuid-devel', 'libverto-devel', 'libXaw-1.0.13', 'libXScrnSaver-1.2.2', 'make-3.82', 'nasm-2.10.07', 'numactl-devel', 'numactl-libs', 'openssl-1.0.2k', 'openssl-devel', 'openssl-libs', 'pcre-devel', 'perl-Digest', 'perl-Digest', 'perl-GD', 'perl-Git', 'python-2.7.5', 'python2-pycodestyle', 'python-libs', 'rdma-core', 'rdma-core', 'sg3_utils-1.37', 'sg3_utils-libs', 'ShellCheck-0.3.8', 'util-linux', 'zlib-devel']
np1 = np.array(keywordlist)
# ['cpp-4.8.5' 'CUnit-2.1.3' 'CUnit-devel' 'doxygen-1.8.5' ... 'ShellCheck-0.3.8' 'util-linux' 'zlib-devel']
result = ['epel-release-7-12.noarch', 'rdma-core-22.4-5.el7.x86_64', 'cpp-4.8.5-44.el7.x86_64', 'doxygen-1.8.5-4.el7.x86_64', 'ghc-base-4.6.0.1-26.4.el7.x86_64', 'libuuid-2.23.2-65.el7.x86_64', 'python-libs-2.7.5-89.el7.x86_64', 'libkadm5-1.15.1-50.el7.x86_64', 'libmount-2.23.2-65.el7.x86_64', 'libquadmath-4.8.5-44.el7.x86_64', 'util-linux-2.23.2-65.el7.x86_64', 'libss-1.42.9-19.el7.x86_64', 'keyutils-libs-1.5.8-3.el7.x86_64', 'e2fsprogs-libs-1.42.9-19.el7.x86_64', 'ghc-pretty-1.1.1.0-26.4.el7.x86_64', 'libXaw-1.0.13-4.el7.x86_64', 'libselinux-2.5-15.el7.x86_64', 'libibverbs-22.4-5.el7.x86_64', 'libselinux-utils-2.5-15.el7.x86_64', 'libgomp-4.8.5-44.el7.x86_64', 'libblkid-2.23.2-65.el7.x86_64', 'gcc-c++-4.8.5-44.el7.x86_64', 'e2fsprogs-1.42.9-19.el7.x86_64', 'CUnit-devel-2.1.3-8.el7.x86_64', 'make-3.82-24.el7.x86_64', 'numactl-libs-2.0.12-5.el7.x86_64', 'perl-Git-1.8.3.1-23.el7_8.noarch', 'openssl-libs-1.0.2k-19.el7.x86_64', 'gcc-4.8.5-44.el7.x86_64', 'CUnit-2.1.3-8.el7.x86_64', 'ghc-syb-0.4.0-35.el7.x86_64', 'gcc-gfortran-4.8.5-44.el7.x86_64', 'libselinux-python-2.5-15.el7.x86_64', 'sg3_utils-libs-1.37-19.el7.x86_64', 'fuse3-libs-3.6.1-4.el7.x86_64', 'libquadmath-devel-4.8.5-44.el7.x86_64', 'libgfortran-4.8.5-44.el7.x86_64', 'krb5-workstation-1.15.1-50.el7.x86_64', 'librdmacm-22.4-5.el7.x86_64', 'sg3_utils-1.37-19.el7.x86_64', 'libsmartcols-2.23.2-65.el7.x86_64', 'fuse3-devel-3.6.1-4.el7.x86_64', 'python-2.7.5-89.el7.x86_64', 'openssl-1.0.2k-19.el7.x86_64', 'libgcc-4.8.5-44.el7.x86_64', 'libaio-devel-0.3.109-13.el7.x86_64', 'ghc-old-locale-1.0.0.5-26.4.el7.x86_64', 'libcom_err-1.42.9-19.el7.x86_64', 'git-1.8.3.1-23.el7_8.x86_64', 'krb5-libs-1.15.1-50.el7.x86_64']
np2 = np.array(result)
# ['epel-release-7-12.noarch' 'rdma-core-22.4-5.el7.x86_64' ... 'krb5-libs-1.15.1-50.el7.x86_64']
expectation = ['cpp-4.8.5-39.el7.x86_64', 'CUnit-2.1.3-8.el7.x86_64', 'CUnit-devel-2.1.3-8.el7.x86_64', 'doxygen-1.8.5-4.el7.x86_64', 'e2fsprogs-1.42.9-17.el7.x86_64', 'e2fsprogs-libs-1.42.9-17.el7.x86_64', 'epel-release-latest-7.noarch', 'fuse3-devel-3.6.1-4.el7.x86_64', 'fuse3-libs-3.6.1-4.el7.x86_64', 'gcc-4.8.5-39.el7.x86_64', 'gcc-c++-4.8.5-39.el7.x86_64', 'gcc-gfortran-4.8.5-39.el7.x86_64', 'ghc-array-0.4.0.1-26.4.el7.x86_64', 'ghc-base-4.6.0.1-26.4.el7.x86_64', 'ghc-bytestring-0.10.0.2-26.4.el7.x86_64', 'ghc-containers-0.5.0.0-26.4.el7.x86_64', 'ghc-deepseq-1.3.0.1-26.4.el7.x86_64', 'ghc-directory-1.2.0.1-26.4.el7.x86_64', 'ghc-filepath-1.3.0.1-26.4.el7.x86_64', 'ghc-json-0.7-4.el7.x86_64', 'ghc-mtl-2.1.2-27.el7.x86_64', 'ghc-old-locale-1.0.0.5-26.4.el7.x86_64', 'ghc-parsec-3.1.3-31.el7.x86_64', 'ghc-pretty-1.1.1.0-26.4.el7.x86_64', 'ghc-regex-base-0.93.2-29.el7.x86_64', 'ghc-regex-tdfa-1.1.8-11.el7.x86_64', 'ghc-ShellCheck-0.3.8-1.el7.x86_64', 'ghc-syb-0.4.0-35.el7.x86_64', 'ghc-text-0.11.3.1-2.el7.x86_64', 'ghc-time-1.4.0.1-26.4.el7.x86_64', 'ghc-transformers-0.3.0.0-34.el7.x86_64', 'ghc-unix-2.6.0.1-26.4.el7.x86_64', 'git-1.8.3.1-23.el7_8.x86_64', 'graphviz-2.30.1-21.el7.x86_64', 'help2man-1.41.1-3.el7.noarch', 'ibacm-22.4-2.el7_8.x86_64', 'keyutils-libs-devel-1.5.8-3.el7.x86_64', 'krb5-devel-1.15.1-46.el7.x86_64', 'krb5-libs-1.15.1-46.el7.x86_64', 'krb5-workstation-1.15.1-46.el7.x86_64', 'lcov-1.13-1.el7.noarch', 'libaio-devel-0.3.109-13.el7.x86_64', 'libblkid-2.23.2-63.el7.x86_64', 'libcom_err-1.42.9-17.el7.x86_64', 'libcom_err-devel-1.42.9-17.el7.x86_64', 'libgcc-4.8.5-39.el7.x86_64', 'libgfortran-4.8.5-39.el7.x86_64', 'libgomp-4.8.5-39.el7.x86_64', 'libibumad-22.4-2.el7_8.x86_64', 'libibverbs-22.4-2.el7_8.x86_64', 'libiscsi-devel-1.9.0-7.el7.x86_64', 'libkadm5-1.15.1-46.el7.x86_64', 'libmount-2.23.2-63.el7.x86_64', 'libpmem-1.5.1-2.1.el7.x86_64', 'libpmemblk-1.5.1-2.1.el7.x86_64', 'libpmemblk-devel-1.5.1-2.1.el7.x86_64', 'libpmem-devel-1.5.1-2.1.el7.x86_64', 'libquadmath-4.8.5-39.el7.x86_64', 'libquadmath-devel-4.8.5-39.el7.x86_64', 'librdmacm-22.4-2.el7_8.x86_64', 'libselinux-2.5-15.el7.x86_64', 'libselinux-devel-2.5-15.el7.x86_64', 'libselinux-python-2.5-15.el7.x86_64', 'libselinux-utils-2.5-15.el7.x86_64', 'libsepol-devel-2.5-10.el7.x86_64', 'libsmartcols-2.23.2-63.el7.x86_64', 'libss-1.42.9-17.el7.x86_64', 'libstdc++-4.8.5-39.el7.x86_64', 'libstdc++-devel-4.8.5-39.el7.x86_64', 'libunwind-1.2-2.el7.x86_64', 'libunwind-devel-1.2-2.el7.x86_64', 'libuuid-2.23.2-63.el7.x86_64', 'libuuid-devel-2.23.2-63.el7.x86_64', 'libverto-devel-0.2.5-4.el7.x86_64', 'libXaw-1.0.13-4.el7.x86_64', 'libXScrnSaver-1.2.2-6.1.el7.x86_64', 'make-3.82-24.el7.x86_64', 'nasm-2.10.07-7.el7.x86_64', 'numactl-devel-2.0.12-5.el7.x86_64', 'numactl-libs-2.0.12-5.el7.x86_64', 'openssl-1.0.2k-19.el7.x86_64', 'openssl-devel-1.0.2k-19.el7.x86_64', 'openssl-libs-1.0.2k-19.el7.x86_64', 'pcre-devel-8.32-17.el7.x86_64', 'perl-Digest-1.17-245.el7.noarch', 'perl-Digest-MD5-2.52-3.el7.x86_64', 'perl-GD-2.49-3.el7.x86_64', 'perl-Git-1.8.3.1-23.el7_8.noarch', 'python-2.7.5-88.el7.x86_64', 'python2-pycodestyle-2.5.0-1.el7.noarch', 'python-libs-2.7.5-88.el7.x86_64', 'rdma-core-22.4-2.el7_8.x86_64', 'rdma-core-devel-22.4-2.el7_8.x86_64', 'sg3_utils-1.37-19.el7.x86_64', 'sg3_utils-libs-1.37-19.el7.x86_64', 'ShellCheck-0.3.8-1.el7.x86_64', 'util-linux-2.23.2-63.el7.x86_64', 'zlib-devel-1.2.7-18.el7.x86_64']
np3 = np.array(expectation)
# ['cpp-4.8.5-39.el7.x86_64' 'CUnit-2.1.3-8.el7.x86_64' ... 'util-linux-2.23.2-63.el7.x86_64' 'zlib-devel-1.2.7-18.el7.x86_64']
ready = []
for i in keywordlist:
for j in result:
x = np.char.startswith(j, i)
if x:
ready.append(np3[np.where(np.char.startswith(np3, i))])
np4 = np.array(ready)
# [array(['cpp-4.8.5-39.el7.x86_64'], dtype='<U39') array(['CUnit-2.1.3-8.el7.x86_64'], dtype='<U39') ... array(['util-linux-2.23.2-63.el7.x86_64'], dtype='<U39')]
notready = [i for i in np3 if i not in np4]
print(f"not ready: {notready}")
The purpose is to use string format keyword in keyword list to examine its existence in all np2 elements.
If any element in np2 starts with any keyword, or keyword is the substring of any element in np2, get the index of element in expectation which also start with that keyword and form into np4.
Finally, get not ready which is made up of elements that are in np3 but not in np4.
To make my explanation more vividly, I have a bunch of rpm files to be installed, the list of expectation.
The keyword list catches the former two keywords of each rpm file name.
Result is the standard output of already installed rpm files.
Taking cpp-4.8.5 as an example, I can see cpp-4.8.5-44.el7.x86_64 in result, which means currently cpp-4.8.5-44.el7.x86_64 has been installed. So, cpp-4.8.5-39.el7.x86_64 in expectation can be removed, since cpp-4.8.5-*.rpm has been successfully installed. Next step, deal with the other left items in expectation.
My question is: there any easier or more efficient way to get the result equivalent to notready? maybe with any other numpy built-in methods, but not with for loop.
I have a partially good HTML, I need to create hyperlink, like:
Superotto: risorse audiovisive per superare i pregiudizi e celebrare
l’otto marzo, in “Indire Informa”, 5 marzo 2021,
https://www.indire.it/2021/03/05/superotto-risorse-audiovisive-per-superare-i-pregiudizi-e-celebrare-lotto-marzo/;
Sezione Superotto in
https://piccolescuole.indire.it/iniziative/la-scuola-allo-schermo/#superotto.
Has to become:
Superotto: risorse audiovisive per superare i pregiudizi e celebrare
l’otto marzo, in “Indire Informa”, 5 marzo 2021, < a
href="https://www.indire.it/2021/03/05/superotto-risorse-audiovisive-per-superare-i-pregiudizi-e-celebrare-lotto-marzo/" >https://www.indire.it/2021/03/05/superotto-risorse-audiovisive-per-superare-i-pregiudizi-e-celebrare-lotto-marzo/< /a >;
Sezione Superotto in < a
href="https://piccolescuole.indire.it/iniziative/la-scuola-allo-schermo/#superotto">https://piccolescuole.indire.it/iniziative/la-scuola-allo-schermo/#superotto< /a >.
Beautifulsoup seems to not find the http well, so I used this regex with the pure python findall, but I cannot substitute or compose the text. Right now I made:
links = re.findall(r"(http|ftp|https:\/\/)([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,#?^=%&:\/~+#-]*[\w#?^=%&\/~+#-])", str(soup))
link_to_replace = []
for l in links:
link = ''.join(l)
if link in soup.find("body").text:
good_link = ""+link+""
fixed_text = soup.replace(link, good_link)
soup.replace_with(fixed_text)
I tried multiple solutions in the last two lines (this is just one), none worked.
Perhaps as follows, where I first identify the relevant anchor elements and strip out any other attributes besides the href, then later substitute the href link with the href html
import re
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://rivista.clionet.it/vol5/giorgi-zoppi-la-ricerca-indire-tra-uso-didattico-del-patrimonio-storico-culturale-e-promozione-delle-buone-pratiche/')
soup = bs(r.text, 'lxml')
item = soup.select_one('p:has(a[id="ft-note-16"])')
text = item.text
for tag in item.select('a:not([id])'):
href = tag['href']
tag.attrs = {'href': href}
text = re.sub(href, str(tag), text)
text = re.sub(item.a.text, '', text).strip()
print(text)
I have my lxml code like this
from lxml import etree
import sys
fd = open('D:\\text.xml', 'wb')
xmlns = "http://www.fpml.org/FpML-5/confirmation"
xsi = "http://www.w3.org/2001/XMLSchema-instance"
fpmlVersion="http://www.fpml.org/FpML-5/confirmation ../../fpml-main-5-6.xsd http://www.w3.org/2000/09/xmldsig# ../../xmldsig-core-schema.xsd"
page = etree.Element("{"+xmlns+"}dataDocument",nsmap={None:xmlns,'xsi':xsi })
doc = etree.ElementTree(page)
page.set("fpmlVersion", fpmlVersion)
trade = etree.SubElement(page,'trade')
tradeheader = etree.SubElement(trade,'tradeheader')
partyTradeIdentifier = etree.SubElement(tradeheader,'partyTradeIdentifier')
partyReference = etree.SubElement(partyTradeIdentifier,'partyReference',href='party1')
tradeId = etree.SubElement(partyTradeIdentifier,'tradeId',tradeIdScheme='http://www.partyA.com/swaps/trade-id')
tradeId.text = 'TW9235'
swap = etree.SubElement(trade,'swap')
party = etree.SubElement(page,'party',id='party1')
partyID = etree.SubElement(party,'partyID')
partyID.text = 'PARTYAUS33'
partyName = etree.SubElement(party,'partyName')
partyName.text = 'Party A'
party = etree.SubElement(page,'party',id='party2')
partyID = etree.SubElement(party,'partyID')
partyID.text = 'BARCGB2L'
partyName = etree.SubElement(party,'partyName')
partyName.text = 'Party B'
s = etree.tostring(doc, xml_declaration=True,encoding="UTF-8",pretty_print=True)
print (s)
fd.write(s)
And i need to generate a xml file like
<?xml version='1.0' encoding='UTF-8'?>
<dataDocument xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.fpml.org/FpML-5/confirmation" fpmlVersion="http://www.fpml.org/FpML-5/confirmation ../../fpml-main-5-6.xsd http://www.w3.org/2000/09/xmldsig# ../../xmldsig-core-schema.xsd">
<trade>
<tradeheader>
<partyTradeIdentifier>
<partyReference href="party1"/>
<tradeId tradeIdScheme="http://www.partyA.com/swaps/trade-id">TW9235</tradeId>
</partyTradeIdentifier>
</tradeheader>
<swap/>
</trade>
<party id="party1">
<partyID>PARTYAUS33</partyID>
<partyName>Party A</partyName>
</party>
<party id="party2">
<partyID>BARCGB2L</partyID>
<partyName>Party B</partyName>
</party>
</dataDocument>
Now the above code works.
However i need to generate 10k such files where the elements text or attributes vary .
For example the partyID maybe different like
PARTYGER45 instead of PARTYUS33 is there a clean way to do this instead of hard coding it ?
Similarly i need to vary lot of things like the tradeId TW9235
one way could be to have the output xml without values loaded to lxml objectify and then loop while setting relevant values and write it to a file, meaning
from lxml import objectify
with open('in.xml') as f_in:
for pId in ['PARTYGER45', ...]:
dataDocument = objectify.parse(f.read())
dataDocument.party.partyID._setText(pId)
...
obj_xml = lxml.etree.tostring(dataDocument)
with open('out_%s.xml' % pId, 'w') as f_out:
f.write(obj_xml)
another way might be to use lxml and xslt, again, start from an empty structured xml and transform the structure according to your needs.