TypeError: int() argument must be a string, a bytes-like object or a number, not 'PSKeyword' - pdfminer

I tried this code
but a error given
TypeError: int() argument must be a string, a bytes-like object or a number, not 'PSKeyword'
from pdfminer.layout import LAParams
from pdfminer.converter import PDFResourceManager, PDFPageAggregator
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal
document = open('Sample.pdf', 'rb')
#Create resource manager
rsrcmgr = PDFResourceManager()
# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(document):
interpreter.process_page(page)
# receive the LTPage object for the page.
layout = device.get_result()
for element in layout:
if instanceof(element, LTTextBoxHorizontal):
print(element.get_text())

Related

AttributeError: 'Tensor' object has no attribute 'numpy' while mapping a function through my dataset

I'm trying to map a function process_image to the dataset. This function calls another function, get_label. In get_label, I'm trying to retrieve the label's name from images.
The file path is like this: C:\\Users\\sis\\Desktop\\test\\0002_c1s1_000451_03.jpg. The label is number 0002.
def get_lab(file_path):
parts = tf.strings.split(file_path, os.path.sep)
part=parts[-1].numpy().decode().split('_')[0]
label=tf.strings.to_number(part)
return label
I solved it! I didn't understand exactly where the error was, I think the previous code mixed eager mode and graph mode, so I changed the code of get_label function and it worked!
def get_lab(file_path):
parts = tf.strings.split(file_path, os.path.sep)[-1]
part=tf.strings.split(parts, sep='_')[0]
print(part)
label=tf.strings.to_number(part)
return label
You may applied to folder name that create sufficeints used of external programs.
[ Sample ]:
import os
import tensorflow as tf
def get_lab(file_path):
parts = tf.strings.split(file_path, os.path.sep)
part=parts[-2].numpy().decode().split('.')[0]
label=tf.strings.to_number(part)
return label.numpy()
directory = "F:\\datasets\\downloads\\Actors\\train\\Candidt Kibt\\01.tif\\"
print( 'label as number: ' + str(get_lab( directory )) )
directory = "F:\\datasets\\downloads\\Actors\\train\\"
print( 'classname: ' + str(tf.io.gfile.listdir(
directory
))
)
[ Output ]:
label as number: 1.0
classname: ['Candidt Kibt', 'Pikaploy']
F:\temp\Python>

Calling Numba-generated PyCFunctionWithKeywords from Python

I serialized a jitted Numba function to a byte array and now want to deserialize and call it. This works fine for primitive data types with llvm_cfunc_wrapper_name:
import numba, ctypes
import llvmlite.binding as llvm
#numba.njit("f8(f8)")
def foo(x):
return x + 0.5
# serialize function to byte array
sig = foo.signatures[0]
lib = foo.overloads[sig].library
cfunc_name = foo.overloads[sig].fndesc.llvm_cfunc_wrapper_name
function_bytes = lib._get_compiled_object()
# deserialize function_bytes to func
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
target = llvm.Target.from_default_triple()
target_machine = target.create_target_machine()
backing_mod = llvm.parse_assembly("")
engine = llvm.create_mcjit_compiler(backing_mod, target_machine)
engine.add_object_file(llvm.ObjectFileRef.from_data(function_bytes))
func_ptr = engine.get_function_address(cfunc_name)
func = ctypes.CFUNCTYPE(ctypes.c_double, ctypes.c_double)(func_ptr)
print(func(0.25))
But I want to call functions with NumPy arguments. There is a llvm_cpython_wrapper_name for that which uses PyCFunctionWithKeywords, but unfortunately my best guess segfaults:
import numba, ctypes
import llvmlite.binding as llvm
import numpy as np
#numba.njit("f8[:](f8[:])")
def foo(x):
return x + 0.5
# serialize function to byte array
sig = foo.signatures[0]
lib = foo.overloads[sig].library
cpython_name = foo.overloads[sig].fndesc.llvm_cpython_wrapper_name
function_bytes = lib._get_compiled_object()
# deserialize function_bytes to func
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
target = llvm.Target.from_default_triple()
target_machine = target.create_target_machine()
backing_mod = llvm.parse_assembly("")
engine = llvm.create_mcjit_compiler(backing_mod, target_machine)
engine.add_object_file(llvm.ObjectFileRef.from_data(function_bytes))
func_ptr = engine.get_function_address(cpython_name)
def func(*args, **kwargs):
py_obj_ptr = ctypes.POINTER(ctypes.py_object)
return ctypes.CFUNCTYPE(py_obj_ptr, py_obj_ptr, py_obj_ptr, py_obj_ptr)(func_ptr)(
ctypes.cast(id(None), py_obj_ptr),
ctypes.cast(id(args), py_obj_ptr),
ctypes.cast(id(kwargs), py_obj_ptr))
# segfaults here
print(func(np.ones(3)))
Here are some links to Numba source code (unfortunately very hard to follow), which might be helpful to figure this out.
https://github.com/numba/numba/blob/61ec1fd0f69aeadece218dccf4c39ebc5c7dfbc4/numba/core/callwrapper.py#L105
https://github.com/numba/numba/blob/61ec1fd0f69aeadece218dccf4c39ebc5c7dfbc4/numba/core/pythonapi.py#L1456

ERROR in getting file from Amazon S3 using python(3.8)

I am trying to get data from Amazon S3 and store it into a variable (file such as .pkl file).
And I am getting the following error:
ERROR:
expected str, bytes or os.PathLike object, not _io.BytesIO
CODE for S3
class S3Mgr:
def __init__(self,bucketName):
self.aws_access_key_id= CONFIG.S3[0]
self.aws_secret_access_key = CONFIG.S3[1]
self.region_name = CONFIG.S3[2]
self.bucketName = bucketName
def __connect(self):
self.s3 = boto3.client(
's3',
aws_access_key_id=self.aws_access_key_id,
aws_secret_access_key=self.aws_secret_access_key,
region_name=self.region_name
)
def retrieveModel(self,fileName):
self.__connect()
a = self.s3.download_fileobj(Bucket=self.bucketName, Key="fcm/project/"+str(fileName))
return a['Body'].read()
CODE for pickle
import pickle
from io import BytesIO
S3obj = S3mgr("mybucket")
model = S3obj.retrieveModel("model.pkl")
data = BytesIO(model)
model = pickle.load(data)
prediction = model.predict(inputArray)
Above inputArray is the array of inputs.
Try this, replace download_fileobj with get_object in your s3Mgr class's retrieveModel method.
Something like this:
def retrieveModel(self,fileName):
self.__connect()
a = self.s3.get_object(Bucket=self.bucketName, Key="fcm/project/"+str(fileName))
return a['Body'].read()
I Hope, it works. Ping if any progress or need any help.
Cheers 👍 !

Getting error while converting base64 string into image using pyspark

I want to extract and process an image data (3D array) available in base64 format using pyspark. I'm using pandas_udf with pyarrow as a processing function. While parsing the base64 string into pandas_udf function, first I convert the base64 string into image. But, at this step I'm getting error as "TypeError: file() argument 1 must be encoded string without null bytes, not str."
I am using function base64.b64decode(imgString) to convert base64 string to image. I'm using python 2.7
...
avrodf=sqlContext.read.format("com.databricks.spark.avro").load("hdfs:///Raw_Images_201803182350.avro")
interested_cols = ["id","name","image_b64"]
indexed_avrodf = avrodf.select(interested_cols)
ctx_cols = ["id","name"]
result_sdf = indexed_avrodf.groupby(ctx_cols).apply(img_proc)
schema = StructType([
StructField("id",StringType()),
StructField("name",StringType()),
StructField("image",StringType()),
StructField("Proc_output",StringType())
])
#pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def img_proc(df):
df['Proc_output'] = df['image_b64'].apply(is_processed)
return df
def is_processed(imgString):
import cv2
from PIL import Image, ImageDraw, ImageChops
import base64
wisimg = base64.b64decode(imgString)
image = Image.open(wisimg)
.....
return processed_status

how to make R datafile to Python type

I want to make R datatype to Python datatype below is the whole code
def convert_datafiles(datasets_folder):
import rpy2.robjects
rpy2.robjects.numpy2ri.activate()
pandas2ri.activate()
for root, dirs, files in os.walk(datasets_folder):
for name in files:
# sort out .RData files
if name.endswith('.RData'):
name_ = os.path.splitext(name)[0]
name_path = os.path.join(datasets_folder, name_)
# creat sub-directory
if not os.path.exists(name_path):
os.makedirs(name_path)
file_path = os.path.join(root, name)
robj = robjects.r.load(file_path)
# check out subfiles in the data frame
for var in robj:
###### error happend right here
myRData = pandas2ri.ri2py_dataframe( var )
####error happend right here
# convert to DataFrame
if not isinstance(myRData, pd.DataFrame):
myRData = pd.DataFrame(myRData)
var_path = os.path.join(datasets_folder,name_,var+'.csv')
myRData.to_csv(var_path)
os.remove(os.path.join(datasets_folder, name)) # clean up
print ("=> Success!")
I want to make R datatype to pythone type, but the error keeps popping up like this : AttributeError: 'str' object has no attribute 'dtype'
How should I do to resolve this error?
The rpy2 documentation is somewhat incomplete when it comes to interaction with pandas, but unit tests will provide examples of conversion. For example:
rdataf = robjects.r('data.frame(a=1:2, '
' b=I(c("a", "b")), '
' c=c("a", "b"))')
with localconverter(default_converter + rpyp.converter) as cv:
pandas_df = robjects.conversion.ri2py(rdataf)