Extracting text from an area with PDFbox - pdf

is it possible to extract text from an area with PDFbox using just the binaries instead of having to create my own code?

Compile and pack this simple program into a jar
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripperByArea;
public class ExtractText {
// Usage: xxx.jar filepath page x y width height
public static void main(String[] args) throws IOException {
if (args.length != 6) {
System.out.println("Help info");
return;
}
// Parameters
String filepath = args[0];
int page = Integer.parseInt(args[1]);
int x = Integer.parseInt(args[2]);
int y = Integer.parseInt(args[3]);
int width = Integer.parseInt(args[4]);
int height = Integer.parseInt(args[5]);
PDDocument document = PDDocument.load(new File(filepath));
PDFTextStripperByArea textStripper = new PDFTextStripperByArea();
Rectangle2D rect = new java.awt.geom.Rectangle2D.Float(x, y, width, height);
textStripper.addRegion("region", rect);
PDPage docPage = document.getPage(page);
textStripper.extractRegions(docPage);
String textForRegion = textStripper.getTextForRegion("region");
System.out.println(textForRegion);
}
}
Run it from command line, ex:
xxx.jar filepathToPdf pageToExtract x y width height
Add validation code for parameters and some usage info.
Edit
Also add the PDFbox libraries
java -cp "..." -jar xxx.jar filepathToPdf pageToExtract x y width height

Related

my code is only drawing a straight line no matter how many sides i input in the pSides JTextFields

When I input a digit in the JTextfields it should pass through my actionListener to store all the values (ID, number of sides, length of sides and color) into an arraylist. It should then be used in my polygonContainer class that goes through a for loop and a polygon formula, then prints it out. However what comes out all the time is a straight line.
This is the code i tried:
import javax.swing.*;
import java.awt.*;
import java.util.ArrayList;
public class ContainerFrame extends JFrame{
ArrayList<PolygonContainer> polygons = new ArrayList<>();
// JTextFields for the number of sides, side length, color, and ID
public JTextField pSides, pSidesLengths, pColor, pID;
// Button for submission
public JButton submitButton;
public void createComponents() {
// Initialize the JTextFields
pSides = new JTextField();
pSidesLengths = new JTextField();
pColor = new JTextField();
pID = new JTextField();
submitButton = new JButton("Submit");
submitButton.addActionListener(new ContainerButtonHandler(this));
JPanel textFieldsPanel = new JPanel();
// uses a gridlayout to organise the textfields then sets it as north
textFieldsPanel.setLayout(new GridLayout(5, 2));
textFieldsPanel.add(new JLabel("Sides:"));
textFieldsPanel.add(pSides);
textFieldsPanel.add(new JLabel("Sides Length:"));
textFieldsPanel.add(pSidesLengths);
textFieldsPanel.add(new JLabel("Color:"));
textFieldsPanel.add(pColor);
textFieldsPanel.add(new JLabel("ID:"));
textFieldsPanel.add(pID);
add(textFieldsPanel, BorderLayout.NORTH);
add(submitButton, BorderLayout.SOUTH);
JPanel drawPanel = new ContainerPanel(this);
add(drawPanel, BorderLayout.CENTER);
setSize(1600, 900);
setVisible(true);
setDefaultCloseOperation( JFrame.EXIT_ON_CLOSE ); // Close action.
}
public static void main(String[] args) {
ContainerFrame cFrame = new ContainerFrame();
cFrame.createComponents();
}
}
import javax.swing.*;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.util.ArrayList;
class ContainerButtonHandler implements ActionListener {
ContainerFrame theApp; // Reference to ContainerFrame object
// ButtonHandler constructor
ContainerButtonHandler(ContainerFrame app) {
theApp = app;
}
public void actionPerformed(ActionEvent e) {
// Get the text from the JTextFields using the getText method
String sides = theApp.pSides.getText();
String sidesLengths = theApp.pSidesLengths.getText();
String id = theApp.pID.getText();
//parse the values as integers
int intSides = Integer.parseInt(sides);
int intSidesLength = Integer.parseInt(sidesLengths);
int intId = Integer.parseInt(id);
//does the input validation where sides, sides length and id needs to be positive integers
if (intId >= 100000 && intId <= 999999 && intSides >= 0 && intSidesLength >= 0) {
// The inputs are valid
String color = theApp.pColor.getText();
// Store the values in an arraylist or other data structure
ArrayList<String> polygonArray = new ArrayList<String>();
polygonArray.add(sides);
polygonArray.add(sidesLengths);
polygonArray.add(color);
polygonArray.add(id);
PolygonContainer polygon = new PolygonContainer(polygonArray);
theApp.polygons.add(polygon);
theApp.repaint();
JOptionPane.showMessageDialog(theApp, "Polygon" + id + "was added to the list.", "Success", JOptionPane.INFORMATION_MESSAGE);
} else {
JOptionPane.showMessageDialog(theApp, "The Polygon" + id + "was not added to the list as a valid Id was not provided.", "Error", JOptionPane.ERROR_MESSAGE);
}
}
}
import java.awt.*;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
public class PolygonContainer implements Comparable<PolygonContainer>{
Color pColor = Color.BLACK; // Colour of the polygon, set to a Colour object, default set to black
int pId = 000000; // Polygon ID should be a six digit non-negative integer
int pSides; // Number of sides of the polygon, should be non-negative value
int pSideLengths; // Length of each side in pixels of the polygon, should be non-negative value
int polyCenX; // x value of centre point (pixel) of polygon when drawn on the panel
int polyCenY; // y value of centre point (pixel of polygon when drawn on the panel
int [] pointsX; // int array containing x values of each vertex (corner point) of the polygon
int [] pointsY; // int array containing y values of each vertex (corner point) of the polygon
// Constructor currently set the number of sides and the equal length of each side of the Polygon
//Constructor that takes the values from the array
public PolygonContainer(ArrayList<String> values){
this.pSides = Integer.parseInt(values.get(0));
this.pSideLengths = Integer.parseInt(values.get(1));
this.pColor = Color.getColor(values.get(2));
this.pId = Integer.parseInt(values.get(3));
pointsX = new int[pSides];
pointsY = new int[pSides];
}
// Used to populate the points array with the vertices corners (points) and construct a polygon with the
// number of sides defined by pSides and the length of each side defined by pSideLength.
// Dimension object that is passed in as an argument is used to get the width and height of the ContainerPanel
// and used to determine the x and y values of its centre point that will be used to position the drawn Polygon.
public Polygon getPolygonPoints(Dimension dim) {
polyCenX = dim.width / 2; // x value of centre point of the polygon
polyCenY = dim.height / 2; // y value of centre point of the polygon
Polygon p = new Polygon();
for (int i = 0; i < pSides; i++) {
// Calculate the x and y coordinates of the ith point of the polygon
int x = polyCenX + pSideLengths * (int) Math.cos(2.0 * Math.PI * i / pSides);
int y = polyCenY + pSideLengths * (int) Math.sin(2.0 * Math.PI * i / pSides);
// Add the x and y coordinates to the pointsX and pointsY arrays
pointsX[i] = x;
pointsY[i] = y;
// Add the point to the polygon object using the addPoint method
p.addPoint(x, y);
}
return p;
}
// You will need to modify this method to set the colour of the Polygon to be drawn
// Remember that Graphics2D has a setColor() method available for this purpose
public void drawPolygon(Graphics2D g, Dimension d) {
//Set color of polygon
g.setColor(pColor);
Polygon p = getPolygonPoints(d);
g.draw(p);
//this creates a bounding box around the polygon
Rectangle2D bounds = p.getBounds2D();
g.draw(bounds);
}
// gets a stored ID
public int getID() {
return pId;
}
#Override
// method used for comparing PolygonContainer objects based on stored ids, you need to complete the method
public int compareTo(PolygonContainer o) {
return 0;
}
// outputs a string representation of the PolygonContainer object, you need to complete this to use for testing
public String toString()
{
return "";
}
}
import javax.swing.JPanel;
import java.awt.*;
public class ContainerPanel extends JPanel{
ContainerFrame conFrame;
public ContainerPanel(ContainerFrame cf) {
conFrame = cf; // reference to ContainerFrame object
}
#Override
protected void paintComponent(Graphics g) {
super.paintComponent(g);
Graphics2D comp = (Graphics2D)g; // You will need to use a Graphics2D objects for this
Dimension size = getSize(); // You will need to use this Dimension object to get
// the width / height of the JPanel in which the
// Polygon is going to be drawn
for (PolygonContainer polygon : conFrame.polygons) {
polygon.drawPolygon(comp, size);
}
}
}
JFrame

A better solution for Itext 7 Text fitting

I have a project that used Itext 5 and worked as intended.
Program had to put userInput in certain 'Chunks' inside paragraphs. Paragraphs have unmovable (chunks)words per line, and the userInput should scale in the space reserved for the userInput inside paragraph.
Old Project had the following code(made as example)
public class Oldway {
static final transient Font bold2 = FontFactory.getFont("Times-Roman", 10.0f, 1);
public static void main(String[] args) {
Document document = new Document();
document.setPageSize(PageSize.A4);
try {
PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(new File("itext5.pdf")));
document.open();
Paragraph title = new Paragraph("Title of doc");
title.setAlignment(1);
document.add(title);
Paragraph dec= new Paragraph();
Chunk ch01 = new Chunk("Prev text ");
dec.add(ch01);
Chunk ch02 = new Chunk(getEmptySpace(42));
dec.add(ch02);
Chunk ch03 = new Chunk(" next Text");
dec.add(ch03);
document.add(dec);
float y = writer.getVerticalPosition(false);
float x2 = document.left() + ch01.getWidthPoint();
float x3 = x2 + ch02.getWidthPoint();
getPlainFillTest("Text to insert", document, y, x3, x2, writer, false);
document.close();
writer.flush();
} catch (FileNotFoundException | DocumentException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static Chunk getEmptySpace(int size) {
Chunk ch = new Chunk();
for(int i = 0;i<=size;i++) {
ch.append("\u00a0");
}
return new Chunk(ch);
}
public static void getPlainFillTest(String str,Document document,float y, float x1pos,
float x2pos, PdfWriter writer,boolean withTab) {
if(str.isEmpty() || str.isBlank()) {
str = "________";
}
Rectangle rec2 = null;
if(!withTab)
rec2 = new Rectangle(x2pos, y, x1pos-2,y+10);
else {
rec2 = new Rectangle(x2pos+35, y, x1pos+33,y+10);
}
BaseFont bf = bold2.getBaseFont();
PdfContentByte cb = writer.getDirectContent();
float fontSize = getMaxFontSize(bf, str,(int)rec2.getWidth(), (int)rec2.getHeight());
Phrase phrase = new Phrase(str, new Font(bf, fontSize));
ColumnText.showTextAligned(cb, Element.ALIGN_CENTER, phrase,
// center horizontally
(rec2.getLeft() + rec2.getRight()) / 2,
// shift baseline based on descent
rec2.getBottom() - bf.getDescentPoint(str, fontSize),0);
cb.saveState();//patrulaterul albastru
cb.setColorStroke(Color.BLUE);
cb.rectangle(rec2.getLeft(), rec2.getBottom(), rec2.getWidth(), rec2.getHeight());
cb.stroke();
cb.restoreState();
}
//stackoverflow solution
private static float getMaxFontSize(BaseFont bf, String text, int width, int height){
// avoid infinite loop when text is empty
if(text.isEmpty()){
return 0.0f;
}
float fontSize = 0.1f;
while(bf.getWidthPoint(text, fontSize) < width){
fontSize += 0.1f;
}
float maxHeight = measureHeight(bf, text, fontSize);
while(maxHeight > height){
fontSize -= 0.1f;
maxHeight = measureHeight(bf, text, fontSize);
};
return fontSize;
}
public static float measureHeight(BaseFont baseFont, String text, float fontSize)
{
float ascend = baseFont.getAscentPoint(text, fontSize);
float descend = baseFont.getDescentPoint(text, fontSize);
return ascend - descend;
}}
Now I'm trying to do the same thing in IText 7 and ...is not that easy!
I manage to create a working code, but its messy, and some things don't get the right coordinates. The Itext7 code(made as example):
public class Newway {
public static void main(String[] args) {
PdfWriter writer;
try {
writer = new PdfWriter(new File("test2.pdf"));
PdfDocument document = new PdfDocument(writer);
document.getDocumentInfo().addCreationDate();
document.getDocumentInfo().setTitle("Title");
document.setDefaultPageSize(PageSize.A4);
Document doc = new Document(document);
doc.setFontSize(12);
Paragraph par = new Paragraph();
Text ch01 = new Text("Prev Text ");
par.add(ch01);
Paragraph space = new Paragraph();
space.setMaxWidth(40);
for(int i=0;i<40;i++) {
par.add("\u00a0");
space.add("\u00a0");
}
Text ch02 = new Text(" next text");
par.add(ch02);
doc.add(par);
Paragraph linePara = new Paragraph().add("Test from UserInput")
.setTextAlignment(TextAlignment.CENTER).setBorder(new DottedBorder(1));
float width = doc.getPageEffectiveArea(PageSize.A4).getWidth();
float height = doc.getPageEffectiveArea(PageSize.A4).getHeight();
IRenderer primul = ch01.createRendererSubTree().setParent(doc.getRenderer());
IRenderer spaceR = space.createRendererSubTree().setParent(doc.getRenderer());
LayoutResult primulResult = primul.layout(new LayoutContext(new LayoutArea(1, new Rectangle(width,height))));
LayoutResult layoutResult = spaceR.layout(new LayoutContext(new LayoutArea(1, new Rectangle(width,height))));
Rectangle primulBox = ((TextRenderer) primul).getInnerAreaBBox();
Rectangle rect = ((ParagraphRenderer) spaceR).getInnerAreaBBox();
float rwidth = rect.getWidth();
float rheight = rect.getHeight();
float x = primulBox.getWidth()+ doc.getLeftMargin();
float y = rect.getY()+(rheight*2.05f);//rect.getY() is never accurate, is always below the paragraph. WHY ??
Rectangle towr = new Rectangle(x, y, rwidth, rheight*1.12f);//rheight on default is way too small
PdfCanvas pdfcanvas = new PdfCanvas(document.getFirstPage());
Canvas canvas = new Canvas(pdfcanvas, towr);
//from theinternet
float fontSizeL = 1;
float fontSizeR = 14;
while (Math.abs(fontSizeL - fontSizeR) > 1e-1) {
float curFontSize = (fontSizeL + fontSizeR) / 2;
linePara.setFontSize(curFontSize);
// It is important to set parent for the current element renderer to a root renderer
IRenderer renderer = linePara.createRendererSubTree().setParent(canvas.getRenderer());
LayoutContext context = new LayoutContext(new LayoutArea(1, towr));
if (renderer.layout(context).getStatus() == LayoutResult.FULL) {
// we can fit all the text with curFontSize
fontSizeL = curFontSize;
} else {
fontSizeR = curFontSize;
}
}
canvas.add(linePara);
new PdfCanvas(document.getFirstPage()).rectangle(towr).setStrokeColor(ColorConstants.BLACK).stroke();
canvas.close();
doc.close();
writer.flush();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}}
The questions are
Is there a better, more elegant way to do this ?
Why rect.getY() is always below the paragraph? and how do I get Y to match the Paragraph real Y coorinate ?
Why the default 'rheight' is always too small? but (rheight*1.1f) works ?
(Optional) How do I set tab() space size in IText 7 ?
This way is quite good because it's taking into account all the possible model element settings and implications of the layout process. Your iText 5 alternative was good enough for basic case of Latin-based text without any modifications on the visual side. The iText 7 code you have is much more flexible and will still work if you use more complex layout settings, complex scripts etc. Also I see the iText 5 code is 105 lines in your example while the iText 7 code is 80 lines.
You are adding some magic +(rheight*2.05f); here while in reality what you are missing here is that when you draw via Canvas you don't have your margins defined anymore, so what you really need instead of rect.getY()+(rheight*2.05f); is rect.getY() + doc.getBottomMargin()
The issue comes from the fact that you are calculating rheight as renderer.getInnerAreaBBox() while this calculation does not take into account default margins that are applied to a paragraph. Margins are included into the occupied area but not the inner area bbox. To fix that, use renderer.getOccupiedArea().getBBox() instead. In this case there is not need to multiply rheight by a coefficient anymore.
The visual result is slightly different now but there is no magic constants anymore. Depending on what you are trying to really achieve you can tune the code further (add some margins here and there etc). But the code adapts well to the change in the user text.
Visual result before:
Visual result after:
Resultant code:
PdfWriter writer;
try {
writer = new PdfWriter(new File("test2.pdf"));
PdfDocument document = new PdfDocument(writer);
document.getDocumentInfo().addCreationDate();
document.getDocumentInfo().setTitle("Title");
document.setDefaultPageSize(PageSize.A4);
Document doc = new Document(document);
doc.setFontSize(12);
Paragraph par = new Paragraph();
Text ch01 = new Text("Prev Text ");
par.add(ch01);
Paragraph space = new Paragraph();
space.setMaxWidth(40);
for(int i=0;i<40;i++) {
par.add("\u00a0");
space.add("\u00a0");
}
Text ch02 = new Text(" next text");
par.add(ch02);
doc.add(par);
Paragraph linePara = new Paragraph().add("Test from UserInput")
.setTextAlignment(TextAlignment.CENTER).setBorder(new DottedBorder(1));
float width = doc.getPageEffectiveArea(PageSize.A4).getWidth();
float height = doc.getPageEffectiveArea(PageSize.A4).getHeight();
IRenderer primul = ch01.createRendererSubTree().setParent(doc.getRenderer());
IRenderer spaceR = space.createRendererSubTree().setParent(doc.getRenderer());
LayoutResult primulResult = primul.layout(new LayoutContext(new LayoutArea(1, new Rectangle(width,height))));
LayoutResult layoutResult = spaceR.layout(new LayoutContext(new LayoutArea(1, new Rectangle(width,height))));
Rectangle primulBox = ((TextRenderer) primul).getInnerAreaBBox();
Rectangle rect = ((ParagraphRenderer) spaceR).getOccupiedArea().getBBox();
float rwidth = rect.getWidth();
float rheight = rect.getHeight();
float x = primulBox.getWidth()+ doc.getLeftMargin();
float y = rect.getY() + doc.getBottomMargin();
Rectangle towr = new Rectangle(x, y, rwidth, rheight);
PdfCanvas pdfcanvas = new PdfCanvas(document.getFirstPage());
Canvas canvas = new Canvas(pdfcanvas, towr);
//from theinternet
float fontSizeL = 1;
float fontSizeR = 14;
while (Math.abs(fontSizeL - fontSizeR) > 1e-1) {
float curFontSize = (fontSizeL + fontSizeR) / 2;
linePara.setFontSize(curFontSize);
// It is important to set parent for the current element renderer to a root renderer
IRenderer renderer = linePara.createRendererSubTree().setParent(canvas.getRenderer());
LayoutContext context = new LayoutContext(new LayoutArea(1, towr));
if (renderer.layout(context).getStatus() == LayoutResult.FULL) {
// we can fit all the text with curFontSize
fontSizeL = curFontSize;
} else {
fontSizeR = curFontSize;
}
}
canvas.add(linePara);
new PdfCanvas(document.getFirstPage()).rectangle(towr).setStrokeColor(ColorConstants.BLACK).stroke();
canvas.close();
doc.close();
writer.flush();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}

Selenium - Extra screenshots generated when scrolling in equivalent intervals to capture an entire web page

Objective: Scroll in equivalent intervals to take screenshot of an entire page using Java and JSExecutor for Selenium WebDriver.
Issue: The approach I've implemented below works, however, I end up having 2-3 extra screenshots at the end of the web page - I want to avoid these redundant screenshots.
Scrolling method is below:
public static void pageScrollable() throws InterruptedException, IOException {
JavascriptExecutor jse = (JavascriptExecutor) driver;
//Find page height
pageHeight = ((Number) jse.executeScript("return document.body.scrollHeight")).intValue();
//Find current browser dimensions and isolate its height
Dimension d = driver.manage().window().getSize();
int browserSize = d.getHeight();
int currentHeight = 0;
System.out.println("Current scroll at: " + currentHeight);
System.out.println("Page height is: " + pageHeight + "\n");
//Scrolling logic
while(pageHeight>=currentHeight) {
jse.executeScript("window.scrollBy(0,"+currentHeight+")", "");
screenShot();
currentHeight+=browserSize;
System.out.println("Current scroll now at: " + currentHeight);
}
}
Screenshot method is below:
public static void screenShot() throws IOException, InterruptedException {
File screenShot = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
FileUtils.copyFile(screenShot, new File("C:\\FilePath\\Screen " + count + ".png"));
count++;
System.out.println("Current screenshot count: " + count);
}
Following variables are defined as static:
static int pageHeight = 0;
static int count = 0;
static WebDriver driver;
I understand there's no implementation currently to capture screen of an entire web page using Selenium. Any help to resolve my logic above would be greatly appreciated.
Your scrolling logic is the reason for the additional image. The Window.scrollBy method scrolls by a number of pixel and not to an absolute position. You need to scroll by browserSize.
And perhaps you should identify the viewport size instead of the browser window size with
int browserSize = ((Number) jse.executeScript("return window.innerHeight")).intValue();
I added a complete example how to get single page screenshots with Chrome:
package demo;
import java.awt.Graphics2D;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import javax.imageio.ImageIO;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebDriverException;
import org.openqa.selenium.chrome.ChromeDriver;
public class WebDriverDemo {
static int pageHeight = 0;
static int count = 0;
static WebDriver driver;
static List<BufferedImage> images = new LinkedList<>();
public static void main(String[] args) {
driver = new ChromeDriver();
driver.get("http://automationpractice.com/");
try {
pageScrollable();
} catch (WebDriverException | InterruptedException | IOException e) {
e.printStackTrace();
}
driver.quit();
}
public static void pageScrollable() throws InterruptedException, IOException {
JavascriptExecutor jse = (JavascriptExecutor) driver;
// Find page height
pageHeight = ((Number) jse.executeScript("return document.body.scrollHeight")).intValue();
// Find current browser dimensions and isolate its height
int browserSize = ((Number) jse.executeScript("return window.innerHeight")).intValue();
System.out.println("Page height is: " + pageHeight + "\n");
System.out.println("Browser height is: " + browserSize + "\n");
int currentHeight = 0;
System.out.println("Current scroll at: " + currentHeight);
System.out.println("Page height is: " + pageHeight + "\n");
// Scrolling logic
while (pageHeight >= currentHeight) {
screenShot();
currentHeight += browserSize;
jse.executeScript("window.scrollBy(0," + browserSize + ")", "");
System.out.println("Current scroll now at: " + currentHeight);
}
BufferedImage result = null;
Graphics2D g2d = null;
int heightCurr = 0;
System.out.println("Image count is " + images.size());
for (int i = 0; i < images.size(); i++) {
BufferedImage img = images.get(i);
int imageHeight = 0;
if (result == null) {
System.out.println("Image height is " + img.getHeight()); // differs from browserSize
imageHeight = pageHeight + images.size() * (img.getHeight() - browserSize);
result = new BufferedImage(img.getWidth(), imageHeight, img.getType());
g2d = result.createGraphics();
}
if (i == images.size() - 1) {
g2d.drawImage(img, 0, imageHeight - img.getHeight(), null);
} else {
g2d.drawImage(img, 0, heightCurr, null);
heightCurr += img.getHeight();
}
}
g2d.dispose();
ImageIO.write(result, "png", new File("screenshot.png"));
}
public static void screenShot() throws IOException, InterruptedException {
byte[] scr = ((TakesScreenshot) driver).getScreenshotAs(OutputType.BYTES);
BufferedImage img = ImageIO.read(new ByteArrayInputStream(scr));
images.add(img);
}
}
Unfortunately it does not deliver proper results with FireFox because of glitches in their Window.scrollBy implementation.

Highlight words inside existing PDF

I need to highlight a set of words inside an existing PDF given specific coordinates that i have already extracted.
I am working with pdfbox by Apache (last version 2.0.8).
There is an example file I can use to such a purpose (AddAnnotations.java inside the pdfbox website) but I think this example was compiled with an older Java version as the following import does not work:
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationHighlight;
Can anyone help me with that? Which is the simplest way to highlight words by using this library?
Here is the code to highlight ALL the words inside a PDF document. Highlighting only a specific set of words can be easily performed modifying this script. Please note this is only a test and further checks are needed for words that terminates in a new line as well as words placed in negative landscape/portrait PDF pages. Optimizing this script is also possible.
This script was built using Apache PDFBox 2.0.8.
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
public class TestAnnotatePDF extends PDFTextStripper
{
static List<double[]> coordinates;
static ArrayList tokenStream;
public TestAnnotatePDF() throws IOException
{
//data structed containing coordinates information for each token
coordinates = new ArrayList<>();
//List of words extracted from text (considering a whitespace-based tokenization)
tokenStream = new ArrayList();
}
public static void main(String [] args) throws IOException
{
try
{
//Loading an existing document
File file = new File("MyDocument");
PDDocument document = PDDocument.load(file);
//extended PDFTextStripper class
PDFTextStripper stripper = new TestAnnotatePDF();
//Get number of pages
int number_of_pages = document.getDocumentCatalog().getPages().getCount();
//The method writeText will invoke an override version of writeString
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy);
//Print collected information
System.out.println(tokenStream);
System.out.println(tokenStream.size());
System.out.println(coordinates.size());
double page_height;
double page_width;
double width, height, minx, maxx, miny, maxy;
int rotation;
//scan each page and highlitht all the words inside them
for (int page_index = 0; page_index < number_of_pages; page_index++)
{
//get current page
PDPage page = document.getPage(page_index);
//Get annotations for the selected page
List<PDAnnotation> annotations = page.getAnnotations();
//Define a color to use for highlighting text
PDColor red = new PDColor(new float[] { 1, 0, 0 }, PDDeviceRGB.INSTANCE);
//Page height and width
page_height = page.getMediaBox().getHeight();
page_width = page.getMediaBox().getWidth();
//Scan collected coordinates
for (int i=0; i<coordinates.size(); i++)
{
//if the current coordinates are not related to the current
//page, ignore them
if ((int) coordinates.get(i)[4] != (page_index+1))
continue;
else
{
//get rotation of the page...portrait..landscape..
rotation = (int) coordinates.get(i)[7];
//page rotated of 90degrees
if (rotation == 90)
{
height = coordinates.get(i)[5];
width = coordinates.get(i)[6];
width = (page_height * width)/page_width;
//define coordinates of a rectangle
maxx = coordinates.get(i)[1];
minx = coordinates.get(i)[1] - height;
miny = coordinates.get(i)[0];
maxy = coordinates.get(i)[0] + width;
}
else //i should add here the cases -90/-180 degrees
{
height = coordinates.get(i)[5];
minx = coordinates.get(i)[0];
maxx = coordinates.get(i)[2];
miny = page_height - coordinates.get(i)[1];
maxy = page_height - coordinates.get(i)[3] + height;
}
//Add an annotation for each scanned word
PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
txtMark.setColor(red);
txtMark.setConstantOpacity((float)0.3); // 30% transparent
PDRectangle position = new PDRectangle();
position.setLowerLeftX((float) minx);
position.setLowerLeftY((float) miny);
position.setUpperRightX((float) maxx);
position.setUpperRightY((float) ((float) maxy+height));
txtMark.setRectangle(position);
float[] quads = new float[8];
quads[0] = position.getLowerLeftX(); // x1
quads[1] = position.getUpperRightY()-2; // y1
quads[2] = position.getUpperRightX(); // x2
quads[3] = quads[1]; // y2
quads[4] = quads[0]; // x3
quads[5] = position.getLowerLeftY()-2; // y3
quads[6] = quads[2]; // x4
quads[7] = quads[5]; // y5
txtMark.setQuadPoints(quads);
txtMark.setContents(tokenStream.get(i).toString());
annotations.add(txtMark);
}
}
}
//Saving the document in a new file
File highlighted_doc = new File("MyDocument_final.pdf");
document.save(highlighted_doc);
document.close();
}
catch(IOException e)
{
System.out.println(e);
}
}
#Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException
{
String token = "";
int token_length = textPositions.size();
int counter = 1;
double minx = 0,maxx = 0,miny = 0,maxy =0;
double height = 0;
double width = 0;
int rotation = 0;
for (TextPosition text : textPositions)
{
rotation = text.getRotation();
if (text.getHeight() > height)
height = text.getHeight();
if (text.getWidth() > width)
width = text.getWidth();
//if it is the first char of the current word
if (counter == 1)
{
minx = text.getX();
miny = text.getY();
}
//if it is the last char of the current word
if (counter == token_length)
{
maxx = text.getEndX();
maxy = text.getY();
}
token += text;
counter += 1;
}
tokenStream.add(token);
double word_coordinates [] = {minx,miny,maxx,maxy,this.getCurrentPageNo(), height, width, rotation};
coordinates.add(word_coordinates);
}}
Here is the code to highlight specific words inside a PDF document. Please note this is working for highlighting the line of the search text. Highlight specific words in a PDF is still in progress... Any suggestion to highlight specific words on top of this code will be highly appreciated.
This script was built using Apache PDFBox 2.0.8
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
public class PDFhighlightDemo extends PDFTextStripper {
public PDFhighlightDemo() throws IOException {
super();
}
public static void main(String[] args) throws IOException {
PDDocument document = null;
String fileName = "Demo1.pdf";
try {
document = PDDocument.load( new File(fileName) );
PDFTextStripper stripper = new PDFhighlightDemo();
stripper.setSortByPosition( true );
stripper.setStartPage( 0 );
stripper.setEndPage( document.getNumberOfPages() );
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy);
File file1 = new File("FinalPDF.pdf");
document.save(file1);
}
finally {
if( document != null ) {
document.close();
}
}
}
/**
* Override the default functionality of PDFTextStripper.writeString()
*/
#Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
boolean isFound = false;
float posXInit1 = 0,
posXEnd1 = 0,
posYInit1 = 0,
posYEnd1 = 0,
width1 = 0,
height1 = 0,
fontHeight1 = 0;
String[] criteria = {"angular", "prepared"};
for (int i = 0; i < criteria.length; i++) {
if (string.contains(criteria[i])) {
isFound = true;
}
}
if (isFound) {
for(TextPosition textPosition:textPositions) {
posXInit1 = textPositions.get(0).getXDirAdj();
posXEnd1 = textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidth();
posYInit1 = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj();
posYEnd1 = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1).getYDirAdj();
width1 = textPositions.get(0).getWidthDirAdj();
height1 = textPositions.get(0).getHeightDir();
}
float quadPoints[] = {posXInit1, posYEnd1 + height1 + 2, posXEnd1, posYEnd1 + height1 + 2, posXInit1, posYInit1 - 2, posXEnd1, posYEnd1 - 2};
List<PDAnnotation> annotations = document.getPage(this.getCurrentPageNo() - 1).getAnnotations();
PDAnnotationTextMarkup highlight = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
PDRectangle position = new PDRectangle();
position.setLowerLeftX(posXInit1);
position.setLowerLeftY(posYEnd1);
position.setUpperRightX(posXEnd1);
position.setUpperRightY(posYEnd1 + height1);
highlight.setRectangle(position);
// quadPoints is array of x,y coordinates in Z-like order (top-left, top-right, bottom-left,bottom-right)
// of the area to be highlighted
highlight.setQuadPoints(quadPoints);
PDColor yellow = new PDColor(new float[]{1, 1, 1 / 255F}, PDDeviceRGB.INSTANCE);
highlight.setColor(yellow);
annotations.add(highlight);
}
}
}
Highlight specific words in a document using PDFclown.
package com.NLP.demo;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ITextString;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.interaction.annotations.TextMarkup;
import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;
import org.pdfclown.files.SerializationModeEnum;
import org.pdfclown.tools.TextExtractor;
import org.pdfclown.util.math.Interval;
import org.pdfclown.util.math.geom.Quad;
public class PDFCrownDemo {
public static void main() throws IOException {
PDFCrownDemo PDFCrownDemo=new PDFCrownDemo();
PDFCrownDemo.highlighttext();
}
public void highlighttext() throws IOException{
org.pdfclown.files.File file = new org.pdfclown.files.File("src/main/resources/XXX.pdf");
String textRegEx = "Contract";
Pattern pattern = Pattern.compile(textRegEx, Pattern.CASE_INSENSITIVE);
TextExtractor textExtractor = new TextExtractor(true, true);
for(final Page page : file.getDocument().getPages())
{
Map<Rectangle2D,List<ITextString>> textStrings = textExtractor.extract(page);
final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings));
textExtractor.filter(textStrings,new TextExtractor.IIntervalFilter()
{
#Override
public boolean hasNext()
{return matcher.find();}
#Override
public Interval next()
{return new Interval(matcher.start(), matcher.end());}
#Override
public void process(Interval interval,ITextString match)
{
// Defining the highlight box of the text pattern match...
List highlightQuads = new ArrayList();
{
/*
NOTE: A text pattern match may be split across multiple contiguous lines,
so we have to define a distinct highlight box for each text chunk.
*/
Rectangle2D textBox = null;
for(TextChar textChar : match.getTextChars())
{
Rectangle2D textCharBox = textChar.getBox();
if(textBox == null)
{textBox = (Rectangle2D)textCharBox.clone();}
else
{
if(textCharBox.getY() > textBox.getMaxY())
{
highlightQuads.add(Quad.get(textBox));
textBox = (Rectangle2D)textCharBox.clone();
}
else
{textBox.add(textCharBox);}
}
}
highlightQuads.add(Quad.get(textBox));
}
// Highlight the text pattern match!
new TextMarkup(page,MarkupTypeEnum.Highlight, highlightQuads);
}
#Override
public void remove(
)
{throw new UnsupportedOperationException();}
}
);
}
//file.save(SerializationModeEnum.Incremental);
file.save(new java.io.File("src/main/resources/XXX.pdf"), SerializationModeEnum.Standard);
}
}

Lucene custom scoring for numeric fields

I would like to have, in addition to standard term search with tf-idf similarity over text content field, scoring based on "similarity" of numeric fields. This similarity will be depending on distance between the value in query and in document (e.g. gaussian with m= [user input], s= 0.5)
I.e. let's say documents represent people, and person document have two fields:
description (full text)
age (numeric).
I want to find documents like
description:(x y z) age:30
but age to be not the filter, but rather part of score (for person of age 30 multiplier will be 1.0, for 25-year-old person 0.8 etc.)
Can this be achieved in a sensible manner?
EDIT: Finally I found out this can be done by wrapping ValueSourceQuery and TermQuery with CustomScoreQuery. See my solution below.
EDIT 2: With fast-changing versions of Lucene, I just want to add that it was tested on Lucene 3.0 (Java).
Okay, so here's (a bit verbose) proof-of-concept as a full JUnit test. Haven't tested its efficiency yet for large index, but from what I've read probably after a warm-up it should perform well, providing there's enough RAM available to cache numeric fields.
package tests;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.function.CustomScoreQuery;
import org.apache.lucene.search.function.IntFieldSource;
import org.apache.lucene.search.function.ValueSourceQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import junit.framework.TestCase;
public class AgeAndContentScoreQueryTest extends TestCase
{
public class AgeAndContentScoreQuery extends CustomScoreQuery
{
protected float peakX;
protected float sigma;
public AgeAndContentScoreQuery(Query subQuery, ValueSourceQuery valSrcQuery, float peakX, float sigma) {
super(subQuery, valSrcQuery);
this.setStrict(true); // do not normalize score values from ValueSourceQuery!
this.peakX = peakX; // age for which the age-relevance is best
this.sigma = sigma;
}
#Override
public float customScore(int doc, float subQueryScore, float valSrcScore){
// subQueryScore is td-idf score from content query
float contentScore = subQueryScore;
// valSrcScore is a value of date-of-birth field, represented as a float
// let's convert age value to gaussian-like age relevance score
float x = (2011 - valSrcScore); // age
float ageScore = (float) Math.exp(-Math.pow(x - peakX, 2) / 2*sigma*sigma);
float finalScore = ageScore * contentScore;
System.out.println("#contentScore: " + contentScore);
System.out.println("#ageValue: " + (int)valSrcScore);
System.out.println("#ageScore: " + ageScore);
System.out.println("#finalScore: " + finalScore);
System.out.println("+++++++++++++++++");
return finalScore;
}
}
protected Directory directory;
protected Analyzer analyzer = new WhitespaceAnalyzer();
protected String fieldNameContent = "content";
protected String fieldNameDOB = "dob";
protected void setUp() throws Exception
{
directory = new RAMDirectory();
analyzer = new WhitespaceAnalyzer();
// indexed documents
String[] contents = {"foo baz1", "foo baz2 baz3", "baz4"};
int[] dobs = {1991, 1981, 1987}; // date of birth
IndexWriter writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
for (int i = 0; i < contents.length; i++)
{
Document doc = new Document();
doc.add(new Field(fieldNameContent, contents[i], Field.Store.YES, Field.Index.ANALYZED)); // store & index
doc.add(new NumericField(fieldNameDOB, Field.Store.YES, true).setIntValue(dobs[i])); // store & index
writer.addDocument(doc);
}
writer.close();
}
public void testSearch() throws Exception
{
String inputTextQuery = "foo bar";
float peak = 27.0f;
float sigma = 0.1f;
QueryParser parser = new QueryParser(Version.LUCENE_30, fieldNameContent, analyzer);
Query contentQuery = parser.parse(inputTextQuery);
ValueSourceQuery dobQuery = new ValueSourceQuery( new IntFieldSource(fieldNameDOB) );
// or: FieldScoreQuery dobQuery = new FieldScoreQuery(fieldNameDOB,Type.INT);
CustomScoreQuery finalQuery = new AgeAndContentScoreQuery(contentQuery, dobQuery, peak, sigma);
IndexSearcher searcher = new IndexSearcher(directory);
TopDocs docs = searcher.search(finalQuery, 10);
System.out.println("\nDocuments found:\n");
for(ScoreDoc match : docs.scoreDocs)
{
Document d = searcher.doc(match.doc);
System.out.println("CONTENT: " + d.get(fieldNameContent) );
System.out.println("D.O.B.: " + d.get(fieldNameDOB) );
System.out.println("SCORE: " + match.score );
System.out.println("-----------------");
}
}
}
This can be achieved using Solr's FunctionQuery