Get the index of a string element in np1 when it appears as a prefix of an element in np2 - numpy

My code is as follows:
import numpy as np
keywordlist = ['cpp-4.8.5', 'CUnit-2.1.3', 'CUnit-devel', 'doxygen-1.8.5', 'e2fsprogs-1.42.9', 'e2fsprogs-libs', 'epel-release', 'fuse3-devel', 'fuse3-libs', 'gcc-4.8.5', 'gcc-c++', 'gcc-gfortran', 'ghc-array', 'ghc-base', 'ghc-bytestring', 'ghc-containers', 'ghc-deepseq', 'ghc-directory', 'ghc-filepath', 'ghc-json', 'ghc-mtl', 'ghc-old', 'ghc-parsec', 'ghc-pretty', 'ghc-regex', 'ghc-regex', 'ghc-ShellCheck', 'ghc-syb', 'ghc-text', 'ghc-time', 'ghc-transformers', 'ghc-unix', 'git-1.8.3.1', 'graphviz-2.30.1', 'help2man-1.41.1', 'ibacm-22.4', 'keyutils-libs', 'krb5-devel', 'krb5-libs', 'krb5-workstation', 'lcov-1.13', 'libaio-devel', 'libblkid-2.23.2', 'libcom_err-1.42.9', 'libcom_err-devel', 'libgcc-4.8.5', 'libgfortran-4.8.5', 'libgomp-4.8.5', 'libibumad-22.4', 'libibverbs-22.4', 'libiscsi-devel', 'libkadm5-1.15.1', 'libmount-2.23.2', 'libpmem-1.5.1', 'libpmemblk-1.5.1', 'libpmemblk-devel', 'libpmem-devel', 'libquadmath-4.8.5', 'libquadmath-devel', 'librdmacm-22.4', 'libselinux-2.5', 'libselinux-devel', 'libselinux-python', 'libselinux-utils', 'libsepol-devel', 'libsmartcols-2.23.2', 'libss-1.42.9', 'libstdc++-4.8.5', 'libstdc++-devel', 'libunwind-1.2', 'libunwind-devel', 'libuuid-2.23.2', 'libuuid-devel', 'libverto-devel', 'libXaw-1.0.13', 'libXScrnSaver-1.2.2', 'make-3.82', 'nasm-2.10.07', 'numactl-devel', 'numactl-libs', 'openssl-1.0.2k', 'openssl-devel', 'openssl-libs', 'pcre-devel', 'perl-Digest', 'perl-Digest', 'perl-GD', 'perl-Git', 'python-2.7.5', 'python2-pycodestyle', 'python-libs', 'rdma-core', 'rdma-core', 'sg3_utils-1.37', 'sg3_utils-libs', 'ShellCheck-0.3.8', 'util-linux', 'zlib-devel']
np1 = np.array(keywordlist)
# ['cpp-4.8.5' 'CUnit-2.1.3' 'CUnit-devel' 'doxygen-1.8.5' ... 'ShellCheck-0.3.8' 'util-linux' 'zlib-devel']
result = ['epel-release-7-12.noarch', 'rdma-core-22.4-5.el7.x86_64', 'cpp-4.8.5-44.el7.x86_64', 'doxygen-1.8.5-4.el7.x86_64', 'ghc-base-4.6.0.1-26.4.el7.x86_64', 'libuuid-2.23.2-65.el7.x86_64', 'python-libs-2.7.5-89.el7.x86_64', 'libkadm5-1.15.1-50.el7.x86_64', 'libmount-2.23.2-65.el7.x86_64', 'libquadmath-4.8.5-44.el7.x86_64', 'util-linux-2.23.2-65.el7.x86_64', 'libss-1.42.9-19.el7.x86_64', 'keyutils-libs-1.5.8-3.el7.x86_64', 'e2fsprogs-libs-1.42.9-19.el7.x86_64', 'ghc-pretty-1.1.1.0-26.4.el7.x86_64', 'libXaw-1.0.13-4.el7.x86_64', 'libselinux-2.5-15.el7.x86_64', 'libibverbs-22.4-5.el7.x86_64', 'libselinux-utils-2.5-15.el7.x86_64', 'libgomp-4.8.5-44.el7.x86_64', 'libblkid-2.23.2-65.el7.x86_64', 'gcc-c++-4.8.5-44.el7.x86_64', 'e2fsprogs-1.42.9-19.el7.x86_64', 'CUnit-devel-2.1.3-8.el7.x86_64', 'make-3.82-24.el7.x86_64', 'numactl-libs-2.0.12-5.el7.x86_64', 'perl-Git-1.8.3.1-23.el7_8.noarch', 'openssl-libs-1.0.2k-19.el7.x86_64', 'gcc-4.8.5-44.el7.x86_64', 'CUnit-2.1.3-8.el7.x86_64', 'ghc-syb-0.4.0-35.el7.x86_64', 'gcc-gfortran-4.8.5-44.el7.x86_64', 'libselinux-python-2.5-15.el7.x86_64', 'sg3_utils-libs-1.37-19.el7.x86_64', 'fuse3-libs-3.6.1-4.el7.x86_64', 'libquadmath-devel-4.8.5-44.el7.x86_64', 'libgfortran-4.8.5-44.el7.x86_64', 'krb5-workstation-1.15.1-50.el7.x86_64', 'librdmacm-22.4-5.el7.x86_64', 'sg3_utils-1.37-19.el7.x86_64', 'libsmartcols-2.23.2-65.el7.x86_64', 'fuse3-devel-3.6.1-4.el7.x86_64', 'python-2.7.5-89.el7.x86_64', 'openssl-1.0.2k-19.el7.x86_64', 'libgcc-4.8.5-44.el7.x86_64', 'libaio-devel-0.3.109-13.el7.x86_64', 'ghc-old-locale-1.0.0.5-26.4.el7.x86_64', 'libcom_err-1.42.9-19.el7.x86_64', 'git-1.8.3.1-23.el7_8.x86_64', 'krb5-libs-1.15.1-50.el7.x86_64']
np2 = np.array(result)
# ['epel-release-7-12.noarch' 'rdma-core-22.4-5.el7.x86_64' ... 'krb5-libs-1.15.1-50.el7.x86_64']
expectation = ['cpp-4.8.5-39.el7.x86_64', 'CUnit-2.1.3-8.el7.x86_64', 'CUnit-devel-2.1.3-8.el7.x86_64', 'doxygen-1.8.5-4.el7.x86_64', 'e2fsprogs-1.42.9-17.el7.x86_64', 'e2fsprogs-libs-1.42.9-17.el7.x86_64', 'epel-release-latest-7.noarch', 'fuse3-devel-3.6.1-4.el7.x86_64', 'fuse3-libs-3.6.1-4.el7.x86_64', 'gcc-4.8.5-39.el7.x86_64', 'gcc-c++-4.8.5-39.el7.x86_64', 'gcc-gfortran-4.8.5-39.el7.x86_64', 'ghc-array-0.4.0.1-26.4.el7.x86_64', 'ghc-base-4.6.0.1-26.4.el7.x86_64', 'ghc-bytestring-0.10.0.2-26.4.el7.x86_64', 'ghc-containers-0.5.0.0-26.4.el7.x86_64', 'ghc-deepseq-1.3.0.1-26.4.el7.x86_64', 'ghc-directory-1.2.0.1-26.4.el7.x86_64', 'ghc-filepath-1.3.0.1-26.4.el7.x86_64', 'ghc-json-0.7-4.el7.x86_64', 'ghc-mtl-2.1.2-27.el7.x86_64', 'ghc-old-locale-1.0.0.5-26.4.el7.x86_64', 'ghc-parsec-3.1.3-31.el7.x86_64', 'ghc-pretty-1.1.1.0-26.4.el7.x86_64', 'ghc-regex-base-0.93.2-29.el7.x86_64', 'ghc-regex-tdfa-1.1.8-11.el7.x86_64', 'ghc-ShellCheck-0.3.8-1.el7.x86_64', 'ghc-syb-0.4.0-35.el7.x86_64', 'ghc-text-0.11.3.1-2.el7.x86_64', 'ghc-time-1.4.0.1-26.4.el7.x86_64', 'ghc-transformers-0.3.0.0-34.el7.x86_64', 'ghc-unix-2.6.0.1-26.4.el7.x86_64', 'git-1.8.3.1-23.el7_8.x86_64', 'graphviz-2.30.1-21.el7.x86_64', 'help2man-1.41.1-3.el7.noarch', 'ibacm-22.4-2.el7_8.x86_64', 'keyutils-libs-devel-1.5.8-3.el7.x86_64', 'krb5-devel-1.15.1-46.el7.x86_64', 'krb5-libs-1.15.1-46.el7.x86_64', 'krb5-workstation-1.15.1-46.el7.x86_64', 'lcov-1.13-1.el7.noarch', 'libaio-devel-0.3.109-13.el7.x86_64', 'libblkid-2.23.2-63.el7.x86_64', 'libcom_err-1.42.9-17.el7.x86_64', 'libcom_err-devel-1.42.9-17.el7.x86_64', 'libgcc-4.8.5-39.el7.x86_64', 'libgfortran-4.8.5-39.el7.x86_64', 'libgomp-4.8.5-39.el7.x86_64', 'libibumad-22.4-2.el7_8.x86_64', 'libibverbs-22.4-2.el7_8.x86_64', 'libiscsi-devel-1.9.0-7.el7.x86_64', 'libkadm5-1.15.1-46.el7.x86_64', 'libmount-2.23.2-63.el7.x86_64', 'libpmem-1.5.1-2.1.el7.x86_64', 'libpmemblk-1.5.1-2.1.el7.x86_64', 'libpmemblk-devel-1.5.1-2.1.el7.x86_64', 'libpmem-devel-1.5.1-2.1.el7.x86_64', 'libquadmath-4.8.5-39.el7.x86_64', 'libquadmath-devel-4.8.5-39.el7.x86_64', 'librdmacm-22.4-2.el7_8.x86_64', 'libselinux-2.5-15.el7.x86_64', 'libselinux-devel-2.5-15.el7.x86_64', 'libselinux-python-2.5-15.el7.x86_64', 'libselinux-utils-2.5-15.el7.x86_64', 'libsepol-devel-2.5-10.el7.x86_64', 'libsmartcols-2.23.2-63.el7.x86_64', 'libss-1.42.9-17.el7.x86_64', 'libstdc++-4.8.5-39.el7.x86_64', 'libstdc++-devel-4.8.5-39.el7.x86_64', 'libunwind-1.2-2.el7.x86_64', 'libunwind-devel-1.2-2.el7.x86_64', 'libuuid-2.23.2-63.el7.x86_64', 'libuuid-devel-2.23.2-63.el7.x86_64', 'libverto-devel-0.2.5-4.el7.x86_64', 'libXaw-1.0.13-4.el7.x86_64', 'libXScrnSaver-1.2.2-6.1.el7.x86_64', 'make-3.82-24.el7.x86_64', 'nasm-2.10.07-7.el7.x86_64', 'numactl-devel-2.0.12-5.el7.x86_64', 'numactl-libs-2.0.12-5.el7.x86_64', 'openssl-1.0.2k-19.el7.x86_64', 'openssl-devel-1.0.2k-19.el7.x86_64', 'openssl-libs-1.0.2k-19.el7.x86_64', 'pcre-devel-8.32-17.el7.x86_64', 'perl-Digest-1.17-245.el7.noarch', 'perl-Digest-MD5-2.52-3.el7.x86_64', 'perl-GD-2.49-3.el7.x86_64', 'perl-Git-1.8.3.1-23.el7_8.noarch', 'python-2.7.5-88.el7.x86_64', 'python2-pycodestyle-2.5.0-1.el7.noarch', 'python-libs-2.7.5-88.el7.x86_64', 'rdma-core-22.4-2.el7_8.x86_64', 'rdma-core-devel-22.4-2.el7_8.x86_64', 'sg3_utils-1.37-19.el7.x86_64', 'sg3_utils-libs-1.37-19.el7.x86_64', 'ShellCheck-0.3.8-1.el7.x86_64', 'util-linux-2.23.2-63.el7.x86_64', 'zlib-devel-1.2.7-18.el7.x86_64']
np3 = np.array(expectation)
# ['cpp-4.8.5-39.el7.x86_64' 'CUnit-2.1.3-8.el7.x86_64' ... 'util-linux-2.23.2-63.el7.x86_64' 'zlib-devel-1.2.7-18.el7.x86_64']
ready = []
for i in keywordlist:
    for j in result:
        x = np.char.startswith(j, i)
        if x:
            ready.append(np3[np.where(np.char.startswith(np3, i))])
np4 = np.array(ready)
# [array(['cpp-4.8.5-39.el7.x86_64'], dtype='<U39') array(['CUnit-2.1.3-8.el7.x86_64'], dtype='<U39') ... array(['util-linux-2.23.2-63.el7.x86_64'], dtype='<U39')]
notready = [i for i in np3 if i not in np4]
print(f"not ready: {notready}")
The purpose is to use each keyword in keywordlist to check whether it occurs in any element of np2.
If any element of np2 starts with a keyword (i.e. the keyword is a prefix of that element), take the element of expectation that also starts with that keyword and collect it into np4.
Finally, notready is made up of the elements that are in np3 but not in np4.
To make the explanation more vivid: I have a bunch of rpm files to be installed, the expectation list.
The keyword list captures the first two name components of each rpm file name.
result is the standard output listing the rpm files that are already installed.
Taking cpp-4.8.5 as an example, I can see cpp-4.8.5-44.el7.x86_64 in result, which means cpp-4.8.5-44.el7.x86_64 is currently installed. So cpp-4.8.5-39.el7.x86_64 in expectation can be removed, since a cpp-4.8.5-*.rpm has been successfully installed. The next step is to deal with the remaining items in expectation.
My question is: is there an easier or more efficient way to get a result equivalent to notready, maybe with other NumPy built-in methods, but without the nested for loops?
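A minimal sketch of a simpler approach (not strictly NumPy-vectorised, but it drops the nested loops by letting str.startswith take a tuple of prefixes; variable names are the ones from the question):
import numpy as np

# keywords for which at least one installed package in `result` starts with them
installed = tuple(k for k in keywordlist if any(r.startswith(k) for r in result))

# everything in `expectation` whose prefix is not among the installed keywords
notready = [pkg for pkg in expectation if not pkg.startswith(installed)]
print(f"not ready: {notready}")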

Related

How to use mapFieldType with gdal.VectorTranslate

I'm trying to export a PostgreSQL database into a .gpkg file, but some of my fields are lists, and ogr2ogr sends me this message:
Warning 1: The output driver does not natively support StringList type for field my_field_name. Misconversion can happen. -mapFieldType can be used to control field type conversion.
But, as the documentation says, -mapFieldType is not an -lco, and I can't find how to use it with the Python version of gdal.VectorTranslate.
Here is my config:
gdal_conn = gdal.OpenEx(f"PG:service={my_pgsql_service}", gdal.OF_VECTOR)
gdal.VectorTranslate("path_to_my_file.gpkg", gdal_conn,
SQLStatement=my_sql_query,
layerName=my_mayer_name,
format="GPKG",
accessMode='append',
)
So I tried to add it via the -lco:
layerCreationOptions=["-mapFieldType StringList=String"]
but it didn't work.
So I dug into the GDAL code, added a mapFieldType=None parameter to the VectorTranslateOptions function, and added the following lines to its code:
if mapFieldType is not None:
    mapField_str = ''
    i = 0
    for k, v in mapFieldType.items():
        i += 1
        mapField_str += f"{k}={v}" if i == len(mapFieldType) else f"{k}={v},"
    new_options += ['-mapFieldType', mapField_str]
And it worked, but is there another way?
And if not, where can I propose this feature?
Thank you for your help.
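A sketch of what might work without patching GDAL, assuming that the generic options argument of gdal.VectorTranslateOptions accepts extra raw ogr2ogr flags and forwards them verbatim (the placeholder names are the ones from the question):
from osgeo import gdal

gdal_conn = gdal.OpenEx(f"PG:service={my_pgsql_service}", gdal.OF_VECTOR)

# Extra command-line style flags go through `options`; the usual keyword
# arguments can still be used alongside them.
opts = gdal.VectorTranslateOptions(
    options=["-mapFieldType", "StringList=String"],  # passed through as raw ogr2ogr flags
    format="GPKG",
    accessMode="append",
    SQLStatement=my_sql_query,
    layerName=my_mayer_name,
)
gdal.VectorTranslate("path_to_my_file.gpkg", gdal_conn, options=opts)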

Can I use BeautifulSoup to find elements hidden by other wrapped elements?

I would like to extract the text of the author affiliations on this page using BeautifulSoup.
I know of a workaround using Selenium to simply click on the 'show more' link and scan the page again. I'm not sure what kind of elements these are (hidden?), as they only appear in the inspector after clicking the button.
Is there a way to extract this info using only BeautifulSoup, or do I need Selenium or something equivalent to reveal the elements in the HTML code?
from bs4 import BeautifulSoup
import requests

url = 'https://www.sciencedirect.com/science/article/abs/pii/S0920379621007596'
r = requests.get(url)
sp = BeautifulSoup(r.content, 'html.parser')
author_data = sp.find('div', id='author-group')
affiliations = author_data.find('dl', class_='affiliation').text
print(affiliations)
That info is within a script tag, though you need to map the letters for affiliations to the actual affiliations. The code below extracts the JavaScript object housing the info you want and handles it with the json library.
There is then a series of steps to dynamically determine which indices hold the info of interest, and a constructed mapping of letters to affiliations is used to assign the correct affiliation to each author.
The author first and last names are also dynamically ascertained and joined together with a space.
The intention was to avoid hardcoding indices, which might change over time.
import re
import json
import requests

r = requests.get('https://www.sciencedirect.com/science/article/abs/pii/S0920379621007596',
                 headers={'User-Agent': 'Mozilla/5.0'})
data = json.loads(re.search(r'(\{"abstracts".*})', r.text).group(1))

base = [i for i in data['authors']['content']
        if i.get('#name') == 'author-group'][0]['$$']
affiliation_data = [i for i in base if i['#name'] == 'affiliation']
author_data = [i for i in base if i['#name'] == 'author']

name_info = [i['_'] for author in author_data for i in author['$$']
             if i['#name'] in ['given-name', 'surname']]

# map the affiliation labels (a, b, c, ...) to the affiliation text
affiliations = dict(zip(
    [j['_'] for i in affiliation_data for j in i['$$'] if j['#name'] == 'label'],
    [j['_'] for i in affiliation_data for j in i['$$']
     if isinstance(j, dict) and '_' in j and j['_'][0].isupper()]))
# print(affiliations)

# pair "given-name surname" with the affiliation referenced by each author's cross-ref
author_affiliations = dict(zip(
    [' '.join(pair) for pair in zip(name_info[0::2], name_info[1::2])],
    [affiliations[j['_']] for author in author_data for i in author['$$']
     if i['#name'] == 'cross-ref' for j in i['$$'] if j['_'] != '⁎']))
print(author_affiliations)

Convert CONLL file to a list of Doc objects

Is there a way to convert a CONLL file into a list of Doc objects without having to parse the sentences using the nlp object? I have a list of annotations that I have to pass to an automatic component that takes Doc objects as input. I have found a way to create the doc:
doc = Doc(nlp.vocab, words=[...])
and I can use the from_array function to recreate the other linguistic features. The array can be built from index values in the StringStore object, and I have successfully created a Doc object with LEMMA and TAG information, but I cannot recreate the HEAD data. My question is how to pass HEAD data to a Doc object using the from_array method.
The confusing thing about HEAD is that, for a sentence with this structure:
Ona 2
je 2
otišla 2
u 4
školu 2
. 2
The output of this code snippet:
from spacy.attrs import TAG, HEAD, DEP
doc.to_array([TAG, HEAD, DEP])
is:
array([[10468770234730083819, 2, 429],
[ 5333907774816518795, 1, 405],
[11670076340363994323, 0, 8206900633647566924],
[ 6471273018469892813, 1, 8110129090154140942],
[ 7055653905424136462, 18446744073709551614, 435],
[ 7173976090571422945, 18446744073709551613, 445]],
dtype=uint64)
I cannot correlate the center column of the to_array output with the dependency tree structure given above.
Thanks in advance for the help,
Daniel
OK, so I finally cracked it: the HEAD column stores the relative offset head - index, and when that offset is negative it wraps around as an unsigned 64-bit integer, i.e. it becomes 18446744073709551616 - (index - head). This is the function I used, if anyone else needs it:
import numpy as np
from spacy.attrs import TAG, HEAD, DEP
from spacy.tokens import Doc

docs = []
for sent in sents:
    generated_doc = Doc(doc.vocab, words=[word["word"] for word in sent])
    heads = []
    for idx, word in enumerate(sent):
        if word["pos"] not in doc.vocab.strings:
            doc.vocab.strings.add(word["pos"])
        if word["dep"] not in doc.vocab.strings:
            doc.vocab.strings.add(word["dep"])
        if word["head"] >= idx:
            heads.append(word["head"] - idx)
        else:
            # negative offsets wrap around as unsigned 64-bit values
            heads.append(18446744073709551616 - (idx - word["head"]))
    np_array = np.array([[doc.vocab.strings[word["pos"]], heads[idx], doc.vocab.strings[word["dep"]]]
                         for idx, word in enumerate(sent)], dtype=np.uint64)
    generated_doc.from_array([TAG, HEAD, DEP], np_array)
    docs.append(generated_doc)
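As a side note, here is a shorter sketch, assuming spaCy >= 3.0 where the Doc constructor accepts tags, deps and heads (as absolute token indices) directly, so the uint64 bookkeeping is not needed at all:
from spacy.tokens import Doc

docs = []
for sent in sents:
    docs.append(Doc(
        nlp.vocab,
        words=[word["word"] for word in sent],
        tags=[word["pos"] for word in sent],
        deps=[word["dep"] for word in sent],
        heads=[word["head"] for word in sent],  # absolute indices, as in the CONLL data
    ))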

Still not understanding the SettingWithCopyWarning

I want to isolate a string, but I keep getting a SettingWithCopyWarning. I read the other threads on SettingWithCopy warnings, but I don't understand why those solutions don't work here.
I've tried using:
df['Title'][i] = delBy[i]
df.Title[i] = delBy[i]
df[df.Title][i] = delBy[i]
df.loc[df.Title][i] = delBy[i]
df.loc[i]['Title'] = delBy[i]
Actual code:
delBy = df['Title'].str.extract(r'(.+?)(?= [bB]y)', expand = False)
for i in df.index:
    if pd.notna(delBy[i]) == True:
        df['Title'][i] = delBy[i]
    else:
        continue
If the title contains the keyword by or By (e.g. Animal by John), keep only the title part (Animal). Leave other titles alone (e.g. Meditations).
It looks like you want to delete the "by ..." part wherever it occurs.
Then start from:
delBy = df.Title.str.extract(r'(.+?)(?= [bB]y)', expand = False).dropna()
(note that I added .dropna()).
Then, instead of your loop, just update this column (in place):
df.Title.update(delBy)
A shorter solution, isn't it?
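For completeness, a minimal sketch on made-up data of the same trim done with .loc and a boolean mask, which avoids the chained indexing that triggers the warning in the original loop:
import pandas as pd

df = pd.DataFrame({'Title': ['Animal by John', 'Meditations']})

delBy = df['Title'].str.extract(r'(.+?)(?= [bB]y)', expand=False)
mask = delBy.notna()

# single .loc assignment instead of df['Title'][i] = ... inside a loop
df.loc[mask, 'Title'] = delBy[mask]
print(df['Title'].tolist())  # ['Animal', 'Meditations']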

Function with a loop over URLs won't return all of them

I am working on a project to pull details from state reports on restaurant inspections. Each inspection has its own URL. I am able to gather the values into a dictionary, but I can only return one at a time. In the call to the function, if I don't specify a specific list entry, I get an error: 'list' object has no attribute 'timeout'. If I ask for a specific entry, I get a good return. How can I get them all?
# loop through the url list to gather inspection details
from urllib.request import urlopen
import bs4

detailsLib = {}

def get_inspect_detail(urlList):
    html = urlopen(urlList)
    soup = bs4.BeautifulSoup(html.read(), 'lxml')
    details = soup.find_all('font', {'face': 'verdana'})[10:]
    result = []
    for detail in details:
        siteName = details[0].text
        licNum = details[2].text
        siteRank = details[4].text
        detailsLib = {
            'Restaurant': siteName,
            'License': licNum,
            'Rank': siteRank,
        }
        result.append(detailsLib)
    return result

get_inspect_detail(urlList[21])
So I can get the 21st restaurant on the list, repeated 36 times, but not all of the restaurants.
Another question for another day is where to do the clean-up. The details will need some regex work, but I'm unsure whether to do that inside the function (one record at a time) or outside the function by pulling all values for a specific key from the dictionary.
Call get_inspect_detail() once per item in urlList, and save all the results.
all_results = []
for url in urlList:
    details = get_inspect_detail(url)
    all_results.extend(details)