Jython ValueError: chr() arg not in range(256)

I am using Jython (jython2.7.0) to send a string value from a Java program to a Python method and then return the value to the Java program, but I get the error ValueError: chr() arg not in range(256). Do you know what the cause of the problem is, and how can I solve it?
Exception in thread "main" Traceback (most recent call last):
File "PageRanking.py", line 9, in <module>
from bs4 import BeautifulSoup
File "C:\jython2.7.0\Lib\bs4\__init__.py", line 35, in <module>
from .builder import builder_registry, ParserRejectedMarkup
File "C:\jython2.7.0\Lib\bs4\builder\__init__.py", line 7, in <module>
from bs4.element import (
File "C:\jython2.7.0\Lib\bs4\element.py", line 10, in <module>
from bs4.dammit import EntitySubstitution
File "C:\jython2.7.0\Lib\bs4\dammit.py", line 14, in <module>
from html.entities import codepoint2name
File "C:\jython2.7.0\Lib\html\__init__.py", line 6, in <module>
from html.entities import html5 as _html5
File "C:\jython2.7.0\Lib\html\entities.py", line 2507, in <module>
entitydefs[name] = chr(codepoint)
ValueError: chr() arg not in range(256)
This is my Python code:
from __future__ import with_statement
from bs4 import BeautifulSoup
import requests

def pageRank(link):
    url = "https://checkpagerank.net/"
    payload = {'name': link}
    r = requests.post(url, payload)
    with open("requests_results.html", "wb") as f:
        f.write(r.content)
    with open(r'requests_results.html', "r", encoding='utf-8') as f:
        text = f.read()
    soup = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all('h2')
    SResult = results[1]
    first = SResult.contents[0]
    rankerName = first.find('b').text
    second = SResult.contents[2]
    rankervalue = second.find('b').text
    x = rankervalue[:1]
    x = int(x)
    x = x*100/10
    return x
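No answer was recorded for this question, but the traceback is telling: html.entities is a Python 3 standard-library module, and the Beautiful Soup release installed here imports it and relies on Python 3's chr(), which accepts any Unicode code point. Jython 2.7 implements Python 2, where chr() is limited to 0-255 and larger code points need unichr(). A minimal illustration of the difference, assuming a Python 2/Jython interpreter:
# Python 2 / Jython 2.7 semantics: chr() covers single bytes only.
print chr(65)         # 'A' -- code points 0-255 are fine
print unichr(0x03bb)  # u'\u03bb' -- above 255, unichr() is required
chr(0x03bb)           # ValueError: chr() arg not in range(256)
The usual way out is to install a Beautiful Soup version that still supports Python 2 (the 4.9.x series is documented as the last to do so) instead of a Python 3-only build.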

Related

Why doesn't gzip.open() recognize my compressed file?

I have been trying to extract a large set of images that are in a .pkl.gz file. Here is my code for doing so.
import gzip
import pickle
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import os

for f in os.listdir("W:\Code\Machine Learning\digit-recognition-dnn\data"):
    print(f)

name = 'mnist.pkl.gz'
with gzip.open(name, 'rb') as f:
    train_set, valid_set, test_set = pickle.load(f)

train_x, train_y = train_set
plt.imshow(train_x[0].reshape((28, 28)), cmap=cm.Greys_r)
plt.show()
Unfortunately, the output is like this:
getData.py
mnist.pkl.gz
Traceback (most recent call last):
File "w:\Code\Machine Learning\digit-recognition-dnn\data\getData.py", line 12, in <module>
with gzip.open(name, 'rb') as f:
File "C:\Users\trexx\AppData\Local\Programs\Python\Python39\lib\gzip.py", line 58, in open
binary_file = GzipFile(filename, gz_mode, compresslevel)
File "C:\Users\trexx\AppData\Local\Programs\Python\Python39\lib\gzip.py", line 173, in __init__
fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
FileNotFoundError: [Errno 2] No such file or directory: 'mnist.pkl.gz'
My filetree looks like this:
data
├─ getData.py
└─ mnist.pkl.gz
Any suggestions?
I tried changing name to
name = 'mnist.pkl'
and
name = 'mnist.gz'
The result is the same: FileNotFound.
Try specifying a full path instead of just the name.
Replace mnist.pkl.gz with something like "W:\my_dir\mnist.pkl.gz".
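The likely underlying issue is the working directory: open() resolves a bare filename against wherever the interpreter was launched, not against the script's folder. A small sketch (assuming the script sits next to mnist.pkl.gz, as the file tree shows) that removes the dependence on the working directory:
import gzip
import os
import pickle

# Build an absolute path anchored at this script's own directory, so the
# lookup no longer depends on where the interpreter was started.
script_dir = os.path.dirname(os.path.abspath(__file__))
name = os.path.join(script_dir, 'mnist.pkl.gz')

with gzip.open(name, 'rb') as f:
    # Note: the classic MNIST pickle may additionally need
    # pickle.load(f, encoding='latin1') on Python 3.
    train_set, valid_set, test_set = pickle.load(f)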

import pandas error: Traceback (most recent call last) and expected string or bytes-like object

I installed GIS-Pro and use Jupyter; I believe Jupyter is included in the GIS-Pro package. I use Jupyter to write Python code. Since yesterday, I've been getting the following error when executing import pandas as pd:
TypeError Traceback (most recent call last)
C:\Users\AppData\Local\Temp\2/ipykernel_23172/4080736814.py in <module>
----> 1 import pandas as pd
C:\ArcGISPro28\bin\Python\envs\arcgispro-py3\lib\site-packages\pandas\__init__.py in <module>
# numpy compat
from pandas.compat import (
np_version_under1p18 as _np_version_under1p18,
is_numpy_dev as _is_numpy_dev,
C:\ArcGISPro28\bin\Python\envs\arcgispro-py3\lib\site-packages\pandas\compat\__init__.py in <module>
np_version_under1p20)
from pandas.compat.pyarrow import (
pa_version_under1p0,
pa_version_under2p0,
C:\ArcGISPro28\bin\Python\envs\arcgispro-py3\lib\site-packages\pandas\compat\pyarrow.py in <module>
_pa_version = pa.__version__
_palv = Version(_pa_version)
pa_version_under1p0 = _palv < Version("1.0.0")
pa_version_under2p0 = _palv < Version("2.0.0")
C:\ArcGISPro28\bin\Python\envs\arcgispro-py3\lib\site-packages\pandas\util\version\__init__.py in __init__(self, version)
# Validate the version and parse it into pieces
match = self._regex.search(version)
if not match:
raise InvalidVersion(f"Invalid version: '{version}'")
TypeError: expected string or bytes-like object
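No fix was captured for this question, but the last frames show pandas feeding pyarrow's version attribute into its version parser: Version(_pa_version) receives something that is not a string, which is exactly what "expected string or bytes-like object" means. A quick diagnostic sketch (an assumption on my part, not from the original thread) to check whether the pyarrow install in the arcgispro-py3 environment is broken:
# Hypothetical check: pandas expects pa.__version__ to be a plain version
# string; a None or other non-str value here reproduces the TypeError above.
import pyarrow as pa
print(type(pa.__version__), repr(pa.__version__))  # expect <class 'str'>, e.g. '4.0.1'
If anything other than a string comes back, reinstalling or upgrading pyarrow (or pandas) in that environment is the usual remedy.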

Problem after groupby (pandas), the grouped column is not accessible

I have a problem after groupby and receive this error message:
Traceback (most recent call last):
File "C:\Users\User\PycharmProjects\HashTag_Curso\venv\lib\site-packages\pandas\core\indexes\base.py", line 3080, in get_loc
return self._engine.get_loc(casted_key)
File "pandas_libs\index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
File "pandas_libs\index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
File "pandas_libs\hashtable_class_helper.pxi", line 4554, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas_libs\hashtable_class_helper.pxi", line 4562, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Ano'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:/Users/User/PycharmProjects/Bibliotecas/Exemplo.py", line 11, in
x = dfg['Ano']
File "C:\Users\User\PycharmProjects\HashTag_Curso\venv\lib\site-packages\pandas\core\frame.py", line 3024, in getitem
indexer = self.columns.get_loc(key)
File "C:\Users\User\PycharmProjects\HashTag_Curso\venv\lib\site-packages\pandas\core\indexes\base.py", line 3082, in get_loc
raise KeyError(key) from err
KeyError: 'Ano'
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from astropy.stats import biweight_midcorrelation as bw_cor
df = pd.read_csv(r'Bases_dados\D_1_4M\Tudo/combined.csv').iloc[:100000]
df['Ano'] = df['Data decimal']//1
dfg = df.groupby(by=["Ano"]).mean()
print(dfg)
x = dfg['Ano']
y = dfg['Lances']
r = np.corrcoef(x, y)[0][1]
bwr = bw_cor(x, y)
print(bwr, r)
plt.scatter(x, y)
plt.show()
If I use
x = df['Ano']
y = df['Lances']
it works fine, but with dfg (grouped by 'Ano') I receive that error message.
When I print(dfg), the column "Ano" appears normally.
It's moved to the index part, so you can either reset_index or pass as_index=False to groupby to begin with:
dfg = df.groupby(by="Ano", as_index=False).mean()

TypeError: _any() missing 1 required keyword-only argument: 'where'

I am trying to read a file using pandas, but it is showing me a TypeError. I am not able to discern why. Can someone help me?
Below is my code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#prepare the files
df = pd.read_csv("~/Downloads/Boston.csv") # for doing modifications
Traceback (most recent call last):
File "", line 1, in
df = pd.read_csv("~/Downloads/Boston.csv") # for doing modifications
File "/Users/nikhiladiga/opt/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 676, in parser_f
low_memory=_c_parser_defaults["low_memory"],
File "/Users/nikhiladiga/opt/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 454, in _read
iterator = kwds.get("iterator", False)
File "/Users/nikhiladiga/opt/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 1148, in read
names : iterable of names
File "/Users/nikhiladiga/opt/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py", line 435, in init
d = {'col1': [1, 2], 'col2': [3, 4]}
File "/Users/nikhiladiga/opt/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py", line 233, in init_dict
datelike_vals = maybe_infer_to_datetimelike(values)
TypeError: _any() missing 1 required keyword-only argument: 'where'
It could be that the read_csv method has trouble parsing your file without any further hints.
Try using additional keyword arguments such as sep, usecols, etc.
Refer to the documentation for more: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
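A hedged sketch of that suggestion (the delimiter and column names below are placeholders, since Boston.csv's actual layout isn't shown in the question):
import pandas as pd

# Hypothetical parsing hints -- adjust sep/usecols to the real file layout.
df = pd.read_csv(
    "~/Downloads/Boston.csv",
    sep=",",                 # state the delimiter explicitly
    usecols=["CRIM", "RM"],  # placeholder column names
)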

Loading .txt file from Google Cloud Storage into a Pandas DF

I'm trying to load a .txt file from a GCS bucket into a pandas df via pd.read_csv. When I run this code on my local machine (sourcing the .txt file from a local directory), it works perfectly. However, when I try to run the code in a Cloud Function, accessing the same .txt file from a GCS bucket, I get 'TypeError: cannot use a string pattern on a bytes-like object'.
The only thing that's different is that I'm accessing the .txt file via the GCS bucket, so it's a bucket object (Blob) instead of a normal file. Would I need to download the blob as a string or as a file-like object first before doing pd.read_csv? The code is below:
def stage1_cogs_vfc(data, context):
    from google.cloud import storage
    import pandas as pd
    import dask.dataframe as dd
    import io
    import numpy as np

    start_bucket = 'my_bucket'
    storage_client = storage.Client()
    source_bucket = storage_client.bucket(start_bucket)

    df = pd.DataFrame()
    file_path = 'gs://my_bucket/SCE_Var_Fact_Costs.txt'
    df = pd.read_csv(file_path, skiprows=12, encoding='utf-8', error_bad_lines=False, warn_bad_lines=False, header=None, sep='\s+|\^+', engine='python')
Traceback (most recent call last):
File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 383, in run_background_function
_function_handler.invoke_user_function(event_object)
File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 217, in invoke_user_function
return call_user_function(request_or_event)
File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 214, in call_user_function
event_context.Context(**request_or_event.context))
File "/user_code/main.py", line 20, in stage1_cogs_vfc
df = pd.read_csv(file_path,skiprows=12, encoding ='utf-8', error_bad_lines= False, warn_bad_lines= False , header = None ,sep = '\s+|\^+',engine='python')
File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 702, in parser_f
return _read(filepath_or_buffer, kwds)
File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 429, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 895, in __init__
self._make_engine(self.engine)
File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 1132, in _make_engine
self._engine = klass(self.f, **self.options)
File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 2238, in __init__
self.unnamed_cols) = self._infer_columns()
File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 2614, in _infer_columns
line = self._buffered_line()
File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 2689, in _buffered_line
return self._next_line()
File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 2791, in _next_line
next(self.data)
File "/env/local/lib/python3.7/site-packages/pandas/io/parsers.py", line 2379, in _read
yield pat.split(line.strip())
TypeError: cannot use a string pattern on a bytes-like object
I found a similar situation here.
I also noticed that on the line:
source_bucket = storage_client.bucket(source_bucket)
you are using "source_bucket" for both: your variable name and parameter. I would suggest to change one of those.
However, I think you'd like to see this doc for any further questions related to the API itself: Storage Client - Google Cloud Storage API
Building on points from @K_immer, here is my updated code, which includes reading into a Dask df...
def stage1_cogs_vfc(data, context):
    from google.cloud import storage
    import pandas as pd
    import dask.dataframe as dd
    import io
    import numpy as np
    import datetime as dt

    start_bucket = 'my_bucket'
    destination_path = 'gs://my_bucket/ddf-*_cogs_vfc.csv'
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(start_bucket)
    blob = bucket.get_blob('SCE_Var_Fact_Costs.txt')

    df0 = pd.DataFrame()
    file_path = 'gs://my_bucket/SCE_Var_Fact_Costs.txt'
    df0 = dd.read_csv(file_path, skiprows=12, dtype=object, encoding='utf-8', error_bad_lines=False, warn_bad_lines=False, header=None, sep='\s+|\^+', engine='python')

    df7 = df0.compute()  # converts dask df to pandas df
    # then do your heavy ETL stuff here using pandas...
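As for the original question of whether the blob needs downloading first: that is a workable alternative. A hedged sketch (download_as_bytes exists in recent google-cloud-storage releases; older ones spell it download_as_string, which also returns bytes) that decodes the payload and lets pandas parse an in-memory text buffer, sidestepping the bytes-vs-string mismatch:
import io

import pandas as pd
from google.cloud import storage

# Hypothetical alternative: fetch the blob's raw bytes, decode them, and
# parse from memory instead of passing a gs:// URL to read_csv.
client = storage.Client()
bucket = client.bucket('my_bucket')
blob = bucket.blob('SCE_Var_Fact_Costs.txt')
text = blob.download_as_bytes().decode('utf-8')  # download_as_string() on older versions

df = pd.read_csv(io.StringIO(text), skiprows=12, header=None,
                 sep=r'\s+|\^+', engine='python')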