Apache Tika: Parsed PDF contains rectangle box , how to remove/parse these chars? - pdf

I am parsing pdfs with apache tika, but I have these in the result String .
I think it's a enumeration dash. How can I remove this from the result string or parse it the right way?
Other characters are working fine e.g. $ % & and so on.
Here is the code to parse from an InputStream:
private SolrInputDocument getDocument(InputStream stream, String docPath) throws SolrServerException {
SolrInputDocument solrDocument = new SolrInputDocument();
Parser parser = new AutoDetectParser(); // Should auto-detect!
BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
PDFParserConfig pdfConfig = new PDFParserConfig();
pdfConfig.setExtractInlineImages(false);
ParseContext parseContext = new ParseContext();
parseContext.set(PDFParserConfig.class, pdfConfig);
parseContext.set(Parser.class, parser);
Metadata metadata = new Metadata();
metadata.add("encoding", "unicode");
try {
parser.parse(stream, handler, metadata, parseContext);
String body = handler.toString();
body = CharMatcher.JAVA_ISO_CONTROL.removeFrom(body);
solrDocument.addField("id", docPath);
solrDocument.addField("body", body);
solrDocument.addField("url", docPath);
solrDocument.addField("extension", PDF_EXTENSION);
} catch (IOException | SAXException | TikaException e) {
throw new SolrServerException(e);
}
return solrDocument;
}

Related

Pdfbox - adding pdf embedded File and save the PDDocument to OutputStream does not keep the embedded Files

I'm using Pdfbox (1.8.8) to adding attachments to a pdf. My problem is when one of the attachments is of type .pdf and i'm saving the PDDocument to OutputStream the final pdf document does not include the attachments. If a save the PDDocument to a file instead an OutputStream all works just fine, and if the attachments does not include any pdf, both save to file or OutputStream works fine.
I would like to know if there is any way to add pdf embedded Files and save the PDDocument to OutputStream keeping the attached files in the final pdf that is generated.
The code i'm using is:
private void insertAttachments(OutputStream out, ArrayList<Attachment> attachmentsResources) {
final PDDocument doc;
Boolean hasPdfAttach = false;
try {
doc = PDDocument.load(new ByteArrayInputStream(((ByteArrayOutputStream) out).toByteArray()));
// final PDFTextStripper pdfStripper = new PDFTextStripper();
// final String text = pdfStripper.getText(doc);
final PDEmbeddedFilesNameTreeNode efTree = new PDEmbeddedFilesNameTreeNode();
final Map embeddedFileMap = new HashMap();
PDEmbeddedFile embeddedFile;
File file = null;
for (Attachment attach : attachmentsResources) {
// first create the file specification, which holds the embedded file
final PDComplexFileSpecification fileSpecification = new PDComplexFileSpecification();
fileSpecification.setFile(attach.getFilename());
file = AttachmentUtils.getAttachmentFile(attach);
final InputStream is = new FileInputStream(file.getAbsolutePath());
embeddedFile = new PDEmbeddedFile(doc, is);
// set some of the attributes of the embedded file
if ("application/pdf".equals(attach.getMimetype())) {
hasPdfAttach = true;
}
embeddedFile.setSubtype(attach.getMimetype());
embeddedFile.setSize((int) (long) attach.getFilesize());
fileSpecification.setEmbeddedFile(embeddedFile);
// now add the entry to the embedded file tree and set in the document.
embeddedFileMap.put(attach.getFilename(), fileSpecification);
// final String text2 = pdfStripper.getText(doc);
}
// final String text3 = pdfStripper.getText(doc);
efTree.setNames(embeddedFileMap);
// ((COSDictionary) efTree.getCOSObject()).removeItem(COSName.LIMITS); (this not work for me)
// attachments are stored as part of the "names" dictionary in the document catalog
final PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
names.setEmbeddedFiles(efTree);
doc.getDocumentCatalog().setNames(names);
// final ByteArrayOutputStream pdfboxToDocumentStream = new ByteArrayOutputStream();
final String tmpfile = "temporary.pdf";
if (hasPdfAttach) {
final File f = new File(tmpfile);
doc.save(f);
doc.close();
//i have try with parser but without success too
// PDFParser parser = new PDFParser(new FileInputStream(tmpfile));
// parser.parse();
// PDDocument doc2 = parser.getPDDocument();
final PDDocument doc2 = PDDocument.loadNonSeq(f, new RandomAccessFile(new File(getHomeTMP()
+ "tempppp.pdf"), "r"));
doc2.save(out);
doc2.close();
} else {
doc.save(out);
doc.close();
}
//that does not work too
// final InputStream in = new FileInputStream(tmpfile);
// IOUtils.copy(in, out);
// out = new FileOutputStream(tmpFile);
// doc.save (out);
} catch (IOException e1) {
e1.printStackTrace();
} catch (Exception e2) {
e2.printStackTrace();
}
}
Best regards
Solution:
private void insertAttachments(OutputStream out, ArrayList<Attachment> attachmentsResources) {
final PDDocument doc;
try {
doc = PDDocument.load(new ByteArrayInputStream(((ByteArrayOutputStream) out).toByteArray()));
((ByteArrayOutputStream) out).reset();
final PDEmbeddedFilesNameTreeNode efTree = new PDEmbeddedFilesNameTreeNode();
final Map embeddedFileMap = new HashMap();
PDEmbeddedFile embeddedFile;
File file = null;
for (Attachment attach : attachmentsResources) {
// first create the file specification, which holds the embedded file
final PDComplexFileSpecification fileSpecification = new PDComplexFileSpecification();
fileSpecification.setFile(attach.getFilename());
file = AttachmentUtils.getAttachmentFile(attach);
final InputStream is = new FileInputStream(file.getAbsolutePath());
embeddedFile = new PDEmbeddedFile(doc, is);
// set some of the attributes of the embedded file
embeddedFile.setSubtype(attach.getMimetype());
embeddedFile.setSize((int) (long) attach.getFilesize());
fileSpecification.setEmbeddedFile(embeddedFile);
// now add the entry to the embedded file tree and set in the document.
embeddedFileMap.put(attach.getFilename(), fileSpecification);
}
efTree.setNames(embeddedFileMap);
((COSDictionary) efTree.getCOSObject()).removeItem(COSName.LIMITS);
// attachments are stored as part of the "names" dictionary in the document catalog
final PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
names.setEmbeddedFiles(efTree);
doc.getDocumentCatalog().setNames(names);
((COSDictionary) efTree.getCOSObject()).removeItem(COSName.LIMITS);
doc.save(out);
doc.close();
} catch (IOException e1) {
e1.printStackTrace();
} catch (Exception e2) {
e2.printStackTrace();
}
}
You store the new PDF after the original PDF in out:
Look at all the uses of out in your method:
private void insertAttachments(OutputStream out, ArrayList<Attachment> attachmentsResources) {
...
doc = PDDocument.load(new ByteArrayInputStream(((ByteArrayOutputStream) out).toByteArray()));
...
doc2.save(out);
...
doc.save(out);
So you get as input a ByteArrayOutputStream and take its current content as input (i.e. the ByteArrayOutputStream is not empty but already contains a PDF) and after some processing you append the modified PDF to the ByteArrayOutputStream. Depending on the PDF viewer you present this to, you will be shown either the original or the manipulated PDF or a (very correct) error message that the file is garbage.
If you want the ByteArrayOutputStream to contain only the manipulated PDF, simply add
((ByteArrayOutputStream) out).reset();
or (if you are not sure about the state of the stream)
out = new ByteArrayOutputStream();
right after
doc = PDDocument.load(new ByteArrayInputStream(((ByteArrayOutputStream) out).toByteArray()));
PS: According to the comments the OP tried the above proposed changes to his code without success.
I cannot run the code as presented in the question because it is not self-contained. Thus, I reduced it to the essentials to get a self-contained test:
#Test
public void test() throws IOException, COSVisitorException
{
ByteArrayOutputStream baos = new ByteArrayOutputStream();
try (
InputStream sourceStream = getClass().getResourceAsStream("test.pdf");
InputStream attachStream = getClass().getResourceAsStream("artificial text.pdf"))
{
final PDDocument document = PDDocument.load(sourceStream);
final PDEmbeddedFile embeddedFile = new PDEmbeddedFile(document, attachStream);
embeddedFile.setSubtype("application/pdf");
embeddedFile.setSize(10993);
final PDComplexFileSpecification fileSpecification = new PDComplexFileSpecification();
fileSpecification.setFile("artificial text.pdf");
fileSpecification.setEmbeddedFile(embeddedFile);
final Map<String, PDComplexFileSpecification> embeddedFileMap = new HashMap<String, PDComplexFileSpecification>();
embeddedFileMap.put("artificial text.pdf", fileSpecification);
final PDEmbeddedFilesNameTreeNode efTree = new PDEmbeddedFilesNameTreeNode();
efTree.setNames(embeddedFileMap);
final PDDocumentNameDictionary names = new PDDocumentNameDictionary(document.getDocumentCatalog());
names.setEmbeddedFiles(efTree);
document.getDocumentCatalog().setNames(names);
document.save(baos);
document.close();
}
Files.write(Paths.get("attachment.pdf"), baos.toByteArray());
}
As you see PDFBox here uses only streams. The result:
Thus, PDFBox without problem stores a PDF into which it has embedded a PDF file attachment.
The problem, therefore, most likely have nothing to do with this work flow as such

How to extract text from PDFs using a PIG UDF and Apache Tika?

I'm attempting to write a PIG eval function (UDF) to extract text from pdf files using Apache Tika. However, my function only writes 0 or 1 bytes to output whenever I try to run the function. How could I fix my code?
public class ExtractTextFromPDFs extends EvalFunc<String> {
#Override
public String exec(Tuple input) throws IOException {
String pdfText;
if (input == null || input.size() == 0 || input.get(0) == null) {
return "N/A";
}
DataByteArray dba = (DataByteArray)input.get(0);
InputStream is = new ByteArrayInputStream(dba.get());
ContentHandler contenthandler = new BodyContentHandler();
Metadata metadata = new Metadata();
Parser pdfparser = new AutoDetectParser();
try {
pdfparser.parse(is, contenthandler, metadata, new ParseContext());
} catch (SAXException | TikaException e) {
e.printStackTrace();
}
pdfText = contenthandler.toString();
//close the input stream
if(is != null){
is.close();
}
return pdfText;
}
}
I run the code using 'c = foreach b generate ExtractTextFromPDFs(content);' where b is a pdf and content is a bytearray.

Apache Tika - read chunk at a time from a file?

Is there any way to read chunk at a time (instead of reading the entire file) from a file using Tika API?
following is my code. As you can see I am reading the entire file at once. I would like to read chunk at a time and create a text file the content.
InputStream stream = new FileInputStream(file);
Parser p = new AutoDetectParser();
Metadata meta =new Metadata();
WriteOutContentHandler handler = new WriteOutContnetHandler(-1);
ParseContext parse = new ParseContext();
....
p.parse(stream,handler,meta, context);
...
String content = handler.toString();
There's (now) and Apache Tika example which shows how you can capture the plain text output, and return it in chunks based on the maximum allowed size of a chunk. You can find it in ContentHandlerExample - method is parseToPlainTextChunks
Based on that, if you wanted to output to a file instead, and on a per-chunk basis, you'd tweak it to be something like:
final int MAXIMUM_TEXT_CHUNK_SIZE = 100 * 1024 * 1024;
final File outputDir = new File("/tmp/");
private class ChunkHandler extends ContentHandlerDecorator {
private int size = 0;
private int fileNumber = -1;
private OutputStreamWriter out = null;
#Override
public void characters(char[] ch, int start, int length) throws IOException {
if (out == null || size+length > MAXIMUM_TEXT_CHUNK_SIZE) {
if (out != null) out.close();
fileNumber++;
File f = new File(outputDir, "output-" + fileNumber + ".txt);
out = new OutputStreamWriter(new FileOutputStream(f, "UTF-8"));
}
out.write(ch, start, length);
}
public void close() throws IOException {
if (out != null) out.close();
}
}
public void parse(File file) {
InputStream stream = new FileInputStream(file);
Parser p = new AutoDetectParser();
Metadata meta =new Metadata();
ContentHandler handler = new ChunkHandler();
ParseContext parse = new ParseContext();
p.parse(stream,handler,meta, context);
((ChunkHandler)handler).close();
}
That will give you plain text files in the given directory, of no more than a maximum size. All html tags will be ignored, you'll only get the plain textual content

Save SSRS Report as pdf using Reporting Services

I've been trying to convert a SSRS Report to PDF and save to my local drive using the Reporting Web Services. Though I'm able to generate the corresponding pdf file but the contents of the file are missing. I've checked that the report I'm trying to convert is not an empty one. Only the header section is present there within the generated pdf files. Below is the code I'm using:
protected void GeneratePDFFromReport(object sender, EventArgs e)
{
RS2005.ReportingService2005 rs;
RE2005.ReportExecutionService rsExec;
// Create a new proxy to the web service
rs = new RS2005.ReportingService2005();
rsExec = new RE2005.ReportExecutionService();
// Authenticate to the Web service using Windows credentials
rs.Credentials = new System.Net.NetworkCredential("username", "password", "domain");
rsExec.Credentials = new System.Net.NetworkCredential("username", "password", "domain");
//rsExec.Credentials = System.Net.CredentialCache.DefaultCredentials;
rs.Url = "http://servername/reportserver/reportservice2005.asmx";
rsExec.Url = "http://servername/reportserver/reportexecution2005.asmx";
string historyID = null;
string deviceInfo = null;
string format = "PDF";
Byte[] results;
string encoding = String.Empty;
string mimeType = "application/pdf";
string extension = String.Empty;
RE2005.Warning[] warnings = null;
string[] streamIDs = null;
// Path of the Report - XLS, PDF etc.
string fileName = #"C:\Report\MemberReport.pdf";
// Name of the report - Please note this is not the RDL file.
string _reportName = #"/ReportFolder/ReportName";
string _historyID = null;
bool _forRendering = false;
RS2005.ParameterValue[] _values = null;
RS2005.DataSourceCredentials[] _credentials = null;
RS2005.ReportParameter[] _parameters = null;
try
{
_parameters = rs.GetReportParameters(_reportName, _historyID, _forRendering, _values, _credentials);
RE2005.ExecutionInfo ei = rsExec.LoadReport(_reportName, historyID);
results = rsExec.Render(format, deviceInfo,
out extension, out encoding,
out mimeType, out warnings, out streamIDs);
try
{
FileStream stream = File.Create(fileName, results.Length);
stream.Write(results, 0, results.Length);
stream.Close();
}
catch { }
results = null;
}
catch (Exception ex)
{
throw ex;
}
}
Any help would highly be appreciated. Thanks.
Assuming that SSRS is working OK using browser, please modify your posted code as shown below:
1) Device info string, please set it as follows:
string deviceInfo = #"<DeviceInfo><Toolbar>False</Toolbar></DeviceInfo>"; //Initial value was null
2) Create Header instance before using web call LoadReport:
ExecutionHeader execHeader = new ExecutionHeader();
RE2005.ExecutionHeaderValue = execHeader;

Extract xml data from gzip file using apache tika?

I am working a project in which i need to extract xml(sitemap)data from gz file using apache tika[AM NEW TO TIKA].
the fie name is something like sitemap01.xml.gz
I could extract data from normal text file or html,but i don't know how to extract xml from gz and extract the meta and data from xml...
I searched Google for past two days.
Do i need to use delegateParser in tika to extract data from xml?
Please guide me to some sample or articles....
Here is my try
public void parseXml() throws IOException{
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
Parser parser = new AutoDetectParser();
ParseContext context = new ParseContext();
InputStream stream =this.getClass().getResourceAsStream("sitemap.xml.gz");
try {
parser.parse(stream,handler,metadata,context);
for(int i = 0; i <metadata.names().length; i++) {
String name = metadata.names()[i];
System.out.println(name + " : " + metadata.get(name));
}
System.out.println(handler.toString());
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (SAXException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (TikaException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}finally{
if(stream!=null) {
stream.close();
}
}
}
The thing you're missing is setting a recursing parser on your ParseContext. You probably want something like:
Parser parser = new AutoDetectParser();
ParseContext context = new ParseContext();
context.set(Parser.class, parser);
parser.parse(....)
By setting a Parser on the ParseContext, you tell Tika to call that when it encounters embedded documents (such as the XML inside your GZip)
Here is how you can use XML parser from Apache Tika for your case:
//detecting the file type
BodyContentHandler handler = new BodyContentHandler(-1);
Metadata metadata = new Metadata();
File inFile = new File("sitemap.xml.gz");
System.out.println(inFile.isFile());
FileInputStream inputstream = new FileInputStream(inFile);
ParseContext pcontext = new ParseContext();
//Xml parser
XMLParser xmlparser = new XMLParser();
xmlparser.parse(inputstream, handler, metadata, pcontext);
System.out.println(pcontext.toString());
System.out.println("Contents of the document:" + handler.toString());//this one contains all contents from xml files and tags are also removed
System.out.println("Metadata of the document:");
String[] metadataNames = metadata.names();
for(String name : metadataNames) {
System.out.println(name + ": " + metadata.get(name));