This script, which I did not write, lets me remove a watermark from a PDF file. The problem is that it has no option for batch operations, and I have more than 1,000 PDF files in the same directory.
from PyPDF2 import PdfFileReader, PdfFileWriter
def removeWatermark(input_fname: str, output_fname: str):
    """Copy a PDF, deleting the last ``/Contents`` entry of every page.

    Each page of ``input_fname`` has the final element of its ``/Contents``
    entry removed and is then added to a new document that is written to
    ``output_fname``.  The watermark is presumably the last content stream
    of each page -- verify this for your documents.

    Args:
        input_fname: Path of the PDF to read.
        output_fname: Path of the PDF to write.
    """
    with open(input_fname, "rb") as inputFile:
        reader = PdfFileReader(inputFile)
        writer = PdfFileWriter()
        for n in range(reader.numPages):
            page = reader.getPage(n)
            # Remove the last content entry of the page.
            del page["/Contents"][-1]
            writer.addPage(page)
        # Open the destination only after every page has been processed, so
        # a failure while reading does not leave an empty output file.  The
        # input stays open while writing because the writer may still need
        # to read page data from it.
        with open(output_fname, "wb") as outputFile:
            writer.write(outputFile)
if __name__ == "__main__":
    import sys

    # Exactly two arguments are expected: the input PDF and the output PDF.
    if len(sys.argv) != 3:
        raise RuntimeError("Arguments not correct!")
    removeWatermark(sys.argv[1], sys.argv[2])
I would appreciate any help.
Dependencies: PyPDF2 and PDFtk
I solved the problem:
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
# Directory containing this script.  abspath() is applied first because
# __file__ may be a bare file name, in which case dirname() would return ''
# and os.listdir('') would fail.
path = os.path.dirname(os.path.abspath(__file__))
# Snapshot of the directory entries, taken once when the module is loaded.
files = os.listdir(path)
def removeWater(inputfilename,outputfilename):
    """Write a copy of a PDF with the last ``/Contents`` entry of each page removed.

    Args:
        inputfilename: Path of the PDF to read.
        outputfilename: Path of the PDF to write.
    """
    # Context managers make sure both files are closed, even on errors.
    with open(inputfilename, "rb") as inputFile:
        pdfReader = PdfFileReader(inputFile)
        pdfWriter = PdfFileWriter()
        for n in range(pdfReader.numPages):
            page = pdfReader.getPage(n)
            # Remove the last content entry of the page.
            del page["/Contents"][-1]
            pdfWriter.addPage(page)
        # The output is opened only after all pages were processed, so a
        # failure while reading does not leave an empty file behind.
        with open(outputfilename, "wb") as outputFile:
            pdfWriter.write(outputFile)
if __name__ == '__main__':
    cleaned_suffix = "- limpo"
    for filename in files:
        if not filename.endswith('.pdf'):
            continue
        stem = os.path.splitext(filename)[0]
        # Skip the outputs of a previous run so they are not cleaned again.
        if stem.endswith(cleaned_suffix):
            continue
        # `files` was listed from `path`, so resolve the names against it
        # rather than against the current working directory.
        removeWater(os.path.join(path, filename),
                    os.path.join(path, stem + cleaned_suffix + ".pdf"))
Related
I was trying to run main.py, but it raised an AttributeError.
My Python version is Python 3.5. I am using the CNTK Docker release 2.6-cpu-python3.5. I cannot update the Python version because of CNTK. It only supports Python 3.5 and will only run in Ubuntu 16.04.
Pandas version: pandas==0.25.3
The Error
Traceback (most recent call last):
File "/workspace/main.py", line 5, in <module>
from model import extract_patches, score_patch, del_cache
File "/workspace/model.py", line 2, in <module>
from regressionModel import extract_features, predict_label
File "/workspace/regressionModel.py", line 26, in <module>
regression_model = read_model['model'][0]
File "/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py", line 2898, in __getitem__
if self.columns.is_unique and key in self.columns:
File "/usr/local/lib/python3.5/dist-packages/pandas/core/generic.py", line 5063, in __getattr__
return object.__getattribute__(self, name)
File "pandas/_libs/properties.pyx", line 65, in pandas._libs.properties.AxisProperty.__get__
File "/usr/local/lib/python3.5/dist-packages/pandas/core/generic.py", line 5063, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute '_data'
main.py
import os
import flask
import numpy as np
from flask import jsonify, request
from model import extract_patches, score_patch, del_cache
app = flask.Flask(__name__)
#app.route('/url/<path:argument>')
def url(argument):
    """Score the image whose URL is passed in the request's query string.

    NOTE(review): the ``app.route`` line directly above this function is
    commented out, so as written this view is not registered with the Flask
    app -- confirm whether it was meant to be an ``@app.route`` decorator.

    Args:
        argument: Unused by the body; the image URL is taken from
            ``request.url`` instead.

    Returns:
        A JSON response with the stringified fields ``patch_score``,
        ``image_score``, ``face_loc`` and ``image_dim``, or a JSON error
        with status 400 when the request URL contains no ``=``.
    """
    # get image url from the query string: everything after the first '='
    url_parts = request.url.split('=', 1)
    if len(url_parts) < 2:
        return jsonify(error='missing image URL in query string'), 400
    imageURL = url_parts[1]

    # create a patch folder (exist_ok avoids the check-then-create race)
    patch_path = './patches'
    os.makedirs(patch_path, exist_ok=True)
    try:
        # extract patches from imageURL
        dimension, face_loc, image_dim = extract_patches(imageURL)
        # score each patch
        patch_score = score_patch(patch_path)
    finally:
        # delete the downloaded image and the patches from local, even when
        # extraction or scoring failed
        if os.path.exists(patch_path):
            del_cache(patch_path)
        if os.path.exists('temp.jpg'):
            os.remove('temp.jpg')

    # One row per patch: its dimension values followed by its score.
    patch_rows = []
    for key in dimension:
        row = list(dimension[key])
        row.append(patch_score[key])
        patch_rows.append(row)
    image_score = round(np.mean(list(patch_score.values())), 2)

    return jsonify(patch_score = str(patch_rows), image_score = str(image_score), face_loc = str(face_loc['face_loc']), image_dim = str(image_dim))
if __name__ == '__main__':
    # Listen on all interfaces; the port number can be changed in your case.
    app.run(host='0.0.0.0', port = 9580)
model.py
import getPatches
from regressionModel import extract_features, predict_label
import os
import shutil
def extract_patches(imageURL):
    """Extract patches from the image at ``imageURL`` into ``./patches``.

    Delegates to ``getPatches.extract_patches`` with fresh, empty
    containers for the results.

    Args:
        imageURL: URL of the image to process.

    Returns:
        The three values returned by ``getPatches.extract_patches``
        (named ``dim``, ``face`` and ``img`` here).

    Raises:
        Exception: Whatever ``getPatches.extract_patches`` raised.  The
            error is reported and re-raised instead of falling through to
            the ``return``, where the unbound result names would hide the
            real cause behind an ``UnboundLocalError``.
    """
    patch_path = './patches'
    dimension_dict = dict()
    face_dict = dict()
    image_dim = []
    try:
        dim, face, img = getPatches.extract_patches(imageURL, dimension_dict,face_dict, image_dim, patch_path)
    except Exception:
        print ('cannot extract patches from the image')
        raise
    print ("extract patches pass")
    return dim, face, img
def score_patch(patch_path):
    """Score every file found directly inside ``patch_path``.

    Args:
        patch_path: Folder containing the patch files.

    Returns:
        A dict mapping each file name up to its first ``.`` to the predicted
        score rounded to two decimals.  Empty when the folder has no files
        or does not exist.
    """
    patch_score = dict()
    # Only the top level of the folder is inspected.  The default passed to
    # next() covers a missing folder, where os.walk yields nothing and a bare
    # next() would raise StopIteration.
    _, _, patch_files = next(os.walk(patch_path), (patch_path, [], []))
    for patch_file in patch_files:
        file_path = os.path.join(patch_path, patch_file)
        # extract features from CNTK pretrained model
        score_features = extract_features (file_path)[0].flatten()
        # score the extracted features using trained regression model
        pred_score_label = predict_label(score_features)
        patch_score[patch_file.split('.')[0]] = float("{0:.2f}".format(pred_score_label[0]))
    return patch_score
def infer_label(patch_score, label_mapping):
    """Return the label that corresponds to the highest patch score.

    The highest score is rounded with ``round`` and used as a 1-based index
    into ``label_mapping``.

    NOTE(review): with a list mapping, a best score that rounds to 0 gives
    index -1, i.e. the last label -- confirm scores never round below 1.

    Args:
        patch_score: Dict mapping patch names to numeric scores; must not
            be empty.
        label_mapping: Container indexed by ``round(score) - 1``.

    Returns:
        The entry of ``label_mapping`` selected by the highest score.
    """
    max_score_value = max(patch_score.values())
    return label_mapping[round(max_score_value) - 1]
def del_cache(patch_folder):
    """Recursively delete ``patch_folder`` and everything inside it."""
    shutil.rmtree(patch_folder)
regressionModel.py
import numpy as np
import pandas as pd
import cntk as C
from PIL import Image
import pickle
from cntk import load_model, combine
import cntk.io.transforms as xforms
from cntk.logging import graph
from cntk.logging.graph import get_node_outputs
# File name of the pretrained CNTK network and the name of the node whose
# output is used for feature extraction.
pretrained_model = 'ResNet152_ImageNet_Caffe.model'
pretrained_node_name = 'pool5'
# File name of the stored regression model (this name is rebound further down).
regression_model = 'cntk_regression.dat'
# Size the images are resized to in extract_features().
image_width = 224
image_height = 224
# load CNTK pretrained model
#model_file = os.path.join(pretrained_model_path, pretrained_model_name)
loaded_model = load_model(pretrained_model) # a full path is required
# Build a function that evaluates the network up to the selected node.
node_in_graph = loaded_model.find_by_name(pretrained_node_name)
output_nodes = combine([node_in_graph.owner])
# load the stored regression model
# NOTE(review): unpickling executes code stored in the file -- only load
# files from a trusted source.  Presumably the file must also have been
# written with a compatible pandas version; verify if read_pickle fails.
read_model = pd.read_pickle(regression_model)
# `regression_model` changes meaning here: from the file name to the pickled
# payload stored under column 'model', entry 0.
regression_model = read_model['model'][0]
# The unpickled object is used by predict_label() via its predict() method.
train_regression = pickle.loads(regression_model)
def extract_features(image_path):
    """Evaluate the pretrained network on one image and return its output.

    The image is converted to RGB, resized to ``image_width`` x
    ``image_height``, reordered to BGR and moved to channel-first layout
    before being fed to the first argument of ``loaded_model``.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The result of ``output_nodes.eval`` for this image.
    """
    # The context manager closes the underlying file once the pixels have
    # been read by convert()/resize().
    with Image.open(image_path) as img:
        # Force three channels: grayscale or palette images would otherwise
        # be indexed along the wrong axis by the channel swap below.
        rgb_image = img.convert('RGB')
        resized = rgb_image.resize((image_width, image_height), Image.ANTIALIAS)
    # Reverse the channel order: RGB -> BGR.
    bgr_image = np.asarray(resized, dtype=np.float32)[..., [2, 1, 0]]
    # rollaxis(..., 2) moves the channel axis to the front: HWC -> CHW.
    chw_image = np.ascontiguousarray(np.rollaxis(bgr_image, 2))
    arguments = {loaded_model.arguments[0]: [chw_image]}
    return output_nodes.eval(arguments)
def predict_label(features):
    """Predict with the stored regression model for one feature vector.

    Args:
        features: Array of features; it is reshaped to a single row.

    Returns:
        The result of ``train_regression.predict`` for that single row.
    """
    single_row = features.reshape(1,-1)
    return train_regression.predict(single_row)
https://pypi.org/project/cntk/#files has CNTK 2.7 for Python 3.6. Still an obsolete version, but not quite as obsolete.
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
import os
import io
import pandas as pd
import tensorflow as tf
from PIL import Image
from object_detection.utils import dataset_util
from collections import namedtuple, OrderedDict
# Command-line flags; each of the three paths defaults to an empty string.
flags = tf.compat.v1.app.flags
flags.DEFINE_string('csv_input', '', 'Path to the CSV input')
flags.DEFINE_string('output_path', '', 'Path to output TFRecord')
flags.DEFINE_string('image_dir', '', 'Path to images')
# main() reads csv_input, output_path and image_dir from this object.
FLAGS = flags.FLAGS
# replace row_label with the name you annotated your images as
def class_text_to_int(row_label):
    """Map a class name to its integer id.

    Args:
        row_label: Class name as it appears in the annotations.

    Returns:
        1 for ``'Masked'``, 2 for ``'No_Masked'`` and ``None`` for any
        other label.
    """
    if row_label == 'Masked':
        return 1
    if row_label == 'No_Masked':
        return 2
    # Unknown label: returned explicitly instead of relying on a bare
    # ``None`` expression that only falls off the end of the function.
    return None
def split(df, group):
    """Group the rows of ``df`` by the ``group`` column.

    Args:
        df: DataFrame to split.
        group: Column to group by.

    Returns:
        A list of ``data(filename, object)`` namedtuples, one per distinct
        group key: ``filename`` is the key and ``object`` the DataFrame
        holding the rows of that group.
    """
    data = namedtuple('data', ['filename', 'object'])
    gb = df.groupby(group)
    # Iterating gb.groups yields the group keys; each key is both the name
    # stored in the tuple and the argument to get_group().
    return [data(key, gb.get_group(key)) for key in gb.groups]
def create_tf_example(group, path):
    """Build a ``tf.train.Example`` for one image and its annotated boxes.

    Args:
        group: Object with a ``filename`` attribute (image file name) and an
            ``object`` attribute (DataFrame whose rows provide ``xmin``,
            ``xmax``, ``ymin``, ``ymax`` and ``class``).
        path: Directory that contains the image file.

    Returns:
        A ``tf.train.Example`` with the encoded image bytes, the image size
        and the box coordinates divided by the image width/height.
    """
    image_path = os.path.join(path, '{}'.format(group.filename))
    with tf.io.gfile.GFile(image_path, 'rb') as fid:
        encoded_jpg = fid.read()
    # The image is opened only to obtain its pixel dimensions.
    encoded_jpg_io = io.BytesIO(encoded_jpg)
    image = Image.open(encoded_jpg_io)
    width, height = image.size

    filename = group.filename.encode('utf8')
    # NOTE(review): the format is recorded as 'jpg' whatever the actual file
    # type is -- confirm all inputs are JPEG images.
    image_format = b'jpg'
    xmins = []
    xmaxs = []
    ymins = []
    ymaxs = []
    classes_text = []
    classes = []

    for _, row in group.object.iterrows():
        # Box coordinates are stored relative to the image size.
        xmins.append(row['xmin'] / width)
        xmaxs.append(row['xmax'] / width)
        ymins.append(row['ymin'] / height)
        ymaxs.append(row['ymax'] / height)
        classes_text.append(row['class'].encode('utf8'))
        classes.append(class_text_to_int(row['class']))

    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(filename),
        'image/source_id': dataset_util.bytes_feature(filename),
        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
        'image/format': dataset_util.bytes_feature(image_format),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
        'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label': dataset_util.int64_list_feature(classes),
    }))
    return tf_example
def main(_):
    """Convert the annotations in the CSV input into a TFRecord file.

    Reads ``FLAGS.csv_input``, groups its rows by ``filename``, builds one
    example per image found in ``FLAGS.image_dir`` and writes the examples
    to ``FLAGS.output_path``.

    Args:
        _: Unused positional argument.
    """
    path = os.path.join(FLAGS.image_dir)
    examples = pd.read_csv(FLAGS.csv_input)
    grouped = split(examples, 'filename')
    # The context manager closes the writer even when an example fails.
    with tf.io.TFRecordWriter(FLAGS.output_path) as writer:
        for group in grouped:
            tf_example = create_tf_example(group, path)
            writer.write(tf_example.SerializeToString())
    output_path = os.path.join(os.getcwd(), FLAGS.output_path)
    print('Successfully created the TFRecords: {}'.format(output_path))
if __name__ == '__main__':
    # Entry point: hands control to the TensorFlow app runner.
    tf.compat.v1.app.run()
this is my code named generate_tfrecord.py.
I downloaded this code from GitHub as my first TensorFlow TFRecord example, but it raises an error.
I am Korean, and I found that this error can occur when the computer name contains Korean characters.
But when I typed 'hostname' in my cmd, it returned 'DESKTOP-7AU~~~', which does not include Korean letters.
If you make comment about your required code or information, I will try to give it to you.
in my images - all folder, there are 764 sets of img+xml file and I have already run "xml_to_csv.py"
this code is from https://github.com/Bengemon825/TF_Object_Detection2020
The simplest way: rename your host so that its name uses only ASCII characters.
You can search online for how to rename the host on your system.
This problem occurs because Python reads non-ASCII characters that it cannot decode as UTF-8.
I had a very similar problem and here is how I solved it - took me many hours to figure out:
If you are a Mac user, MacOS has 'invisible' folder organizing files of the format .DS_Store in every folder. When iterating through your images folder, the code runs into these .DS_Store files which the utf-8 decoder cannot decode. Deleting them is totally harmless although they in fact do re-appear but you don't have to worry about that
So you can get rid of them like this
OR (I preferred this option once figured out the problem): In your code, you can explicitly by-pass them with an if statement that checks only for .xml files or .csv files or .txt whichever files you are working with in your images folder/directory. So something like:
path = 'path to folder containing your .xml files or .csv files or .txt files'
if '.xml' in str(path):
I have also realized that when people directly use this generate_tfrecord.py as is, many tend to forget to explicitly call out their file paths correctly. This also happens for people using the create_pascal_tf_record.py python script of the object_detection api for TensorFlow.
For example, from your code above, flags.DEFINE_string('csv_input', '', 'Path to the CSV input'), you need to fill in the ' ' with your csv directory path and not leave it empty. For example flags.DEFINE_string('csv_input', 'add your csv directory path here', 'Path to the CSV input'). You have to do the same for all the flags.DEFINE_string instances or else you must explicitly spell out the path if you don't want to use the flags.DEFINE_string instances
I hope this is helpful to anyone using a Mac and running into all sorts of UnicodeDecodeError for TFRECORD files. I'm not sure if Windows users run into something similar. Also there could be other reasons but for me this happened to be the cause
I have a CSV file with URLs and box coordinates (x coordinate of the top left corner, y coordinate of the top left corner, x coordinate of the bottom right corner and y coordinate of the bottom right corner) and I would like to acquire the image, crop it based on the coordinates (to 256x256) and then save the image. Unfortunately, downloading the whole database and then creating a separate one with cropped images is difficult due to the size of the database. Therefore, it is necessary to create the image database with cropped images from the beginning. Another way is to save the image, then crop it and overwrite the initial image (and then iterate to the next one with i += 1).
Would the current approach work, or should I use a different method? Additionally, how would I save the acquired images to a specified folder? Currently they are downloaded to the same folder as the script.
import urllib.request
import csv
import numpy as np
import pandas as pd
from io import BytesIO
import requests
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
filename = "images"
# open file to read (newline='' is what the csv module expects)
with open("data_test.csv", 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    # pop header row (1st row in csv)
    header = next(reader, None)
    # iterate on all data rows through the csv reader, so quoted fields and
    # line endings are handled by the parser instead of a manual split(',')
    i = 0
    for row in reader:
        if not row:
            continue  # blank line
        # check if we have an image URL
        if len(row) > 1 and row[1] != '':
            response = requests.get(row[1], timeout=30)
            # fail on HTTP errors instead of trying to decode an error page
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            # NOTE: PIL images cannot be sliced like arrays (img[y0:y1, x0:x1]
            # raises TypeError); use img.crop((left, upper, right, lower)).
            new_img = img.resize((256, 256))
            new_img.save(str(i) + ".png")
            # preview the downloaded image (blocks until the window is closed)
            imgplot = plt.imshow(img)
            plt.show()
            print("Image saved for {0}".format(row[0]))
            i += 1
        else:
            print("No result for {0}".format(row[0]))
Any further recommendations are welcome.
Edit: The latest version gives me error :
crop_img = img[105:105+173,315:315+370]
TypeError: 'JpegImageFile' object is not subscriptable
I solved the problem using BytesIO and PIL's crop and resize methods.
import csv
from io import BytesIO
import requests
from PIL import Image
import matplotlib.pyplot as plt
filename = "images"
# open file to read (newline='' is what the csv module expects)
with open("data_test.csv", 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    # pop header row (1st row in csv)
    header = next(reader, None)
    # iterate on all data rows through the csv reader, so quoted fields and
    # line endings are handled by the parser instead of a manual split(',')
    i = 0
    for row in reader:
        if not row:
            continue  # blank line
        # check if we have an image URL
        if len(row) > 1 and row[1] != '':
            response = requests.get(row[1], timeout=30)
            # fail on HTTP errors instead of trying to decode an error page
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            # im.crop(box): 4-tuple defining the left, upper, right, and
            # lower pixel coordinate
            left_x = int(row[2])
            top_y = int(row[3])
            right_x = int(row[4])
            bottom_y = int(row[5])
            crop = img.crop((left_x, top_y, right_x, bottom_y))
            new_img = crop.resize((256, 256))
            # To preview new images, uncomment:
            # imgplot = plt.imshow(new_img)
            # plt.show()
            new_img.save(str(i) + ".png")
            print("Image saved for {0}".format(row[0]))
            i += 1
        else:
            print("No result for {0}".format(row[0]))
Hope it will help someone. Any optimization recommendations are still welcome.
Could someone please tell me whether Tensorboard supports exporting CSV files from the command line? The reason why I ask this is because I have a lots of logging directory and I am hoping to have a script file that automates the process. Thanks.
The API supports reading files programmatically. Here's an example of extracting data for a tag and saving it to a .csv in a similar format to those generated by tensorboard
import argparse
import numpy as np
import tensorflow as tf
def save_tag_to_csv(fn, tag='test_metric', output_fn=None):
    """Export the values recorded for one tag in an event file to CSV.

    Args:
        fn: Path of the event file to read.
        tag: Tag whose values are exported.
        output_fn: Path of the CSV file to write.  Defaults to ``<tag>.csv``
            with every ``/`` in the tag replaced by ``_``.
    """
    if output_fn is None:
        output_fn = '{}.csv'.format(tag.replace('/', '_'))
    print("Will save to {}".format(output_fn))

    # The events are read straight from the file; no TensorFlow session is
    # created, so nothing is left open after the call.
    wall_step_values = []
    for e in tf.train.summary_iterator(fn):
        for v in e.summary.value:
            if v.tag == tag:
                wall_step_values.append((e.wall_time, e.step, v.simple_value))
    # One row per value: wall time, step, value.
    np.savetxt(output_fn, wall_step_values, delimiter=',', fmt='%10.5f')
if __name__ == '__main__':
    # Usage: <script> FN [--tag TAG]
    parser = argparse.ArgumentParser()
    parser.add_argument('fn')
    parser.add_argument('--tag', default='test_metric')
    args = parser.parse_args()
    save_tag_to_csv(args.fn, tag=args.tag)
The text in the PDF files is real text, not scanned images. PDFMiner does not support Python 3; are there any other solutions?
There is also the pdfminer2 fork, which supports Python 3.4 and is available through pip3.
https://github.com/metachris/pdfminer
This thread helped me patch something together.
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO, BytesIO
def readPDF(pdfFile):
    """Extract the text of every page of a PDF.

    Args:
        pdfFile: Binary file-like object containing the PDF.

    Returns:
        The text written by the ``TextConverter`` for all pages, as one
        string.
    """
    rsrcmgr = PDFResourceManager()
    # The buffer and the converter are closed even when a page fails.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page set and maxpages=0, as passed here, are presumably
            # "no restriction" -- confirm against the pdfminer documentation.
            pages = PDFPage.get_pages(pdfFile, set(), maxpages=0, password="",caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        finally:
            device.close()
        # Read the text before the buffer is closed by the with-block.
        return retstr.getvalue()
if __name__ == "__main__":
    # For local files, read the bytes with
    # open("../warandpeace/chapter1.pdf", 'rb') instead of urlopen().
    # The response is closed as soon as its content has been read.
    with urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf") as scrape:
        pdfFile = BytesIO(scrape.read())
    with pdfFile:
        outputString = readPDF(pdfFile)
    print(outputString)
For python3, you can download pdfminer as:
python -m pip install pdfminer.six
tika worked the best for me. I would even say it is better than PyPDF2 and pdfminer. It made it really easy to extract each line of the PDF into a list. You can install it with pip install tika
And, use the code below:
from tika import parser
# path_to_pdf must be set to the path of the PDF to parse.
rawText = parser.from_file(path_to_pdf)
# 'content' may be missing or None when no text could be extracted; fall
# back to an empty string so splitlines() does not fail.
rawList = (rawText.get('content') or '').splitlines()
print(rawList)