Pdfbox - adding pdf embedded File and save the PDDocument to OutputStream does not keep the embedded Files - pdf

I'm using Pdfbox (1.8.8) to adding attachments to a pdf. My problem is when one of the attachments is of type .pdf and i'm saving the PDDocument to OutputStream the final pdf document does not include the attachments. If a save the PDDocument to a file instead an OutputStream all works just fine, and if the attachments does not include any pdf, both save to file or OutputStream works fine.
I would like to know if there is any way to add pdf embedded Files and save the PDDocument to OutputStream keeping the attached files in the final pdf that is generated.
The code i'm using is:
private void insertAttachments(OutputStream out, ArrayList<Attachment> attachmentsResources) {
final PDDocument doc;
Boolean hasPdfAttach = false;
try {
doc = PDDocument.load(new ByteArrayInputStream(((ByteArrayOutputStream) out).toByteArray()));
// final PDFTextStripper pdfStripper = new PDFTextStripper();
// final String text = pdfStripper.getText(doc);
final PDEmbeddedFilesNameTreeNode efTree = new PDEmbeddedFilesNameTreeNode();
final Map embeddedFileMap = new HashMap();
PDEmbeddedFile embeddedFile;
File file = null;
for (Attachment attach : attachmentsResources) {
// first create the file specification, which holds the embedded file
final PDComplexFileSpecification fileSpecification = new PDComplexFileSpecification();
fileSpecification.setFile(attach.getFilename());
file = AttachmentUtils.getAttachmentFile(attach);
final InputStream is = new FileInputStream(file.getAbsolutePath());
embeddedFile = new PDEmbeddedFile(doc, is);
// set some of the attributes of the embedded file
if ("application/pdf".equals(attach.getMimetype())) {
hasPdfAttach = true;
}
embeddedFile.setSubtype(attach.getMimetype());
embeddedFile.setSize((int) (long) attach.getFilesize());
fileSpecification.setEmbeddedFile(embeddedFile);
// now add the entry to the embedded file tree and set in the document.
embeddedFileMap.put(attach.getFilename(), fileSpecification);
// final String text2 = pdfStripper.getText(doc);
}
// final String text3 = pdfStripper.getText(doc);
efTree.setNames(embeddedFileMap);
// ((COSDictionary) efTree.getCOSObject()).removeItem(COSName.LIMITS); (this not work for me)
// attachments are stored as part of the "names" dictionary in the document catalog
final PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
names.setEmbeddedFiles(efTree);
doc.getDocumentCatalog().setNames(names);
// final ByteArrayOutputStream pdfboxToDocumentStream = new ByteArrayOutputStream();
final String tmpfile = "temporary.pdf";
if (hasPdfAttach) {
final File f = new File(tmpfile);
doc.save(f);
doc.close();
//i have try with parser but without success too
// PDFParser parser = new PDFParser(new FileInputStream(tmpfile));
// parser.parse();
// PDDocument doc2 = parser.getPDDocument();
final PDDocument doc2 = PDDocument.loadNonSeq(f, new RandomAccessFile(new File(getHomeTMP()
+ "tempppp.pdf"), "r"));
doc2.save(out);
doc2.close();
} else {
doc.save(out);
doc.close();
}
//that does not work too
// final InputStream in = new FileInputStream(tmpfile);
// IOUtils.copy(in, out);
// out = new FileOutputStream(tmpFile);
// doc.save (out);
} catch (IOException e1) {
e1.printStackTrace();
} catch (Exception e2) {
e2.printStackTrace();
}
}
Best regards
Solution:
private void insertAttachments(OutputStream out, ArrayList<Attachment> attachmentsResources) {
final PDDocument doc;
try {
doc = PDDocument.load(new ByteArrayInputStream(((ByteArrayOutputStream) out).toByteArray()));
((ByteArrayOutputStream) out).reset();
final PDEmbeddedFilesNameTreeNode efTree = new PDEmbeddedFilesNameTreeNode();
final Map embeddedFileMap = new HashMap();
PDEmbeddedFile embeddedFile;
File file = null;
for (Attachment attach : attachmentsResources) {
// first create the file specification, which holds the embedded file
final PDComplexFileSpecification fileSpecification = new PDComplexFileSpecification();
fileSpecification.setFile(attach.getFilename());
file = AttachmentUtils.getAttachmentFile(attach);
final InputStream is = new FileInputStream(file.getAbsolutePath());
embeddedFile = new PDEmbeddedFile(doc, is);
// set some of the attributes of the embedded file
embeddedFile.setSubtype(attach.getMimetype());
embeddedFile.setSize((int) (long) attach.getFilesize());
fileSpecification.setEmbeddedFile(embeddedFile);
// now add the entry to the embedded file tree and set in the document.
embeddedFileMap.put(attach.getFilename(), fileSpecification);
}
efTree.setNames(embeddedFileMap);
((COSDictionary) efTree.getCOSObject()).removeItem(COSName.LIMITS);
// attachments are stored as part of the "names" dictionary in the document catalog
final PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
names.setEmbeddedFiles(efTree);
doc.getDocumentCatalog().setNames(names);
((COSDictionary) efTree.getCOSObject()).removeItem(COSName.LIMITS);
doc.save(out);
doc.close();
} catch (IOException e1) {
e1.printStackTrace();
} catch (Exception e2) {
e2.printStackTrace();
}
}

You store the new PDF after the original PDF in out:
Look at all the uses of out in your method:
private void insertAttachments(OutputStream out, ArrayList<Attachment> attachmentsResources) {
...
doc = PDDocument.load(new ByteArrayInputStream(((ByteArrayOutputStream) out).toByteArray()));
...
doc2.save(out);
...
doc.save(out);
So you get as input a ByteArrayOutputStream and take its current content as input (i.e. the ByteArrayOutputStream is not empty but already contains a PDF) and after some processing you append the modified PDF to the ByteArrayOutputStream. Depending on the PDF viewer you present this to, you will be shown either the original or the manipulated PDF or a (very correct) error message that the file is garbage.
If you want the ByteArrayOutputStream to contain only the manipulated PDF, simply add
((ByteArrayOutputStream) out).reset();
or (if you are not sure about the state of the stream)
out = new ByteArrayOutputStream();
right after
doc = PDDocument.load(new ByteArrayInputStream(((ByteArrayOutputStream) out).toByteArray()));
PS: According to the comments the OP tried the above proposed changes to his code without success.
I cannot run the code as presented in the question because it is not self-contained. Thus, I reduced it to the essentials to get a self-contained test:
#Test
public void test() throws IOException, COSVisitorException
{
ByteArrayOutputStream baos = new ByteArrayOutputStream();
try (
InputStream sourceStream = getClass().getResourceAsStream("test.pdf");
InputStream attachStream = getClass().getResourceAsStream("artificial text.pdf"))
{
final PDDocument document = PDDocument.load(sourceStream);
final PDEmbeddedFile embeddedFile = new PDEmbeddedFile(document, attachStream);
embeddedFile.setSubtype("application/pdf");
embeddedFile.setSize(10993);
final PDComplexFileSpecification fileSpecification = new PDComplexFileSpecification();
fileSpecification.setFile("artificial text.pdf");
fileSpecification.setEmbeddedFile(embeddedFile);
final Map<String, PDComplexFileSpecification> embeddedFileMap = new HashMap<String, PDComplexFileSpecification>();
embeddedFileMap.put("artificial text.pdf", fileSpecification);
final PDEmbeddedFilesNameTreeNode efTree = new PDEmbeddedFilesNameTreeNode();
efTree.setNames(embeddedFileMap);
final PDDocumentNameDictionary names = new PDDocumentNameDictionary(document.getDocumentCatalog());
names.setEmbeddedFiles(efTree);
document.getDocumentCatalog().setNames(names);
document.save(baos);
document.close();
}
Files.write(Paths.get("attachment.pdf"), baos.toByteArray());
}
As you see PDFBox here uses only streams. The result:
Thus, PDFBox without problem stores a PDF into which it has embedded a PDF file attachment.
The problem, therefore, most likely have nothing to do with this work flow as such

Related

iText FileStream overwirtes

I am in the process of putting together some code that will merge pdf's based on the file name prefix. I currently have the below code that grabs the filename and doesn't merge, but overwrites. I believe my problem is the FileStream placement, but if I move it out of the current location, I can't get the filename. Any suggestions? Thanks.
static void CreateMergedPDFs()
{
string srcDir = "C:/PDFin/";
string resultPDF = "C:/PDFout/";
{
var files = System.IO.Directory.GetFiles(srcDir);
string prevFileName = null;
int i = 1;
foreach (string file in files)
{
string filename = Left(Path.GetFileName(file), 8);
using (FileStream stream = new FileStream(resultPDF + filename + ".pdf", FileMode.Create))
{
if (prevFileName == null || filename == prevFileName)
{
Document pdfDoc = new Document(PageSize.A4);
PdfCopy pdf = new PdfCopy(pdfDoc, stream);
pdfDoc.Open();
{
pdf.AddDocument(new PdfReader(file));
i++;
}
if (pdfDoc != null)
pdfDoc.Close();
Console.WriteLine("Merges done!");
}
}
}
}
}
}
}
The behavior you are describing is consistent with your code. You are creating the loop in an incorrect way.
Try this:
static void CreateMergedPDFs()
{
string srcDir = "C:/PDFin/";
string resultPDF = "C:/PDFout/merged.pdf";
FileStream stream = new FileStream(resultPDF, FileMode.Create);
Document pdfDoc = new Document(PageSize.A4);
PdfCopy pdf = new PdfCopy(pdfDoc, stream);
pdfDoc.Open();
var files = System.IO.Directory.GetFiles(srcDir);
foreach (string file in files)
{
pdf.AddDocument(new PdfReader(file));
}
pdfDoc.Close();
Console.WriteLine("Merges done!");
}
}
That makes more sense, doesn't it?
If you want to group files based on their prefix, you should read the answer to the question Group files in a directory based on their prefix
In the answer to this question, it is assumed that the prefix and the rest of the filename are separated by a - character. For instance 1-abc.pdf and 1-xyz.pdf have the prefix 1 whereas 2-abc.pdf and 2-xyz.pdf have the prefix 2. In your case, it's not clear how you'd determine the prefix, but it's easy to get a list of all the files, sort them and make groups of files based on whatever algorithm you want to determine the prefix.

Converting dynamic cshtml to pdf using iTextSharp?

I was using Rotativa for the conversion of html to pdf. My whole aim was to send receipt as pdf to customers emailid. It was working well locally but when deployed in GoDaddy Server its not supported. So I am planning to generate the pdf using iTextSharp(Viewers Opinion is also invited).
Code written for the generation of pdf in Rotativa:
public ActionResult GetPdfReceipt(int RegId)
{
var actionPDF = new Rotativa.ActionAsPdf("GetPdfReceipt", new { RegId = regId })
{
FileName = "Receipt.pdf"
};
//Dynamic student receipt pdf
var byteArrayDynamic = actionPDF.BuildPdf(ControllerContext);
}
public ActionResult GetPdfReceipt(int RegId)
{
try
{
var _mdlReceiptPdf = new ReceiptPdfVM
{
...
...
};
return View("Receipts", _mdlReceiptPdf);
}
catch (Exception ex)
{
return View("");
}
}
I have gone through some of the codes for generating pdf using iTextSharp and it was as follows
HttpContext.Current.Response.Clear();
HttpContext.Current.Response.Buffer = true;
HttpContext.Current.Response.Charset = "";
HttpContext.Current.Response.ContentType = "application/pdf";
HttpContext.Current.Response.AddHeader("content-disposition", "attachment;filename=LoginReportPerDay.pdf");
StringWriter sWriter = new StringWriter();
HtmlTextWriter hTWriter = new HtmlTextWriter(sWriter);
GridView1.RenderControl(hTWriter);
StringReader sReader = new StringReader(sWriter.ToString());
Document pdf = new Document(PageSize.A4);
HTMLWorker worker = new HTMLWorker(pdf);
PdfWriter.GetInstance(pdf, HttpContext.Current.Response.OutputStream);
pdf.Open();
worker.Parse(sReader);
pdf.Close();
HttpContext.Current.Response.Write(pdf);
HttpContext.Current.Response.Flush();
HttpContext.Current.Response.End();
How can I use the above code to render dynamically generated Receipt View pdf array bytes.

How to remove one indirectly referenced image from a PDF and keep all others?

I would like to parse a PDF and find the logo via known attributes and when I find a match, remove that image and then copy everything else.
I am using the code below to replace an image with a blank white image to remove a logo from PDFs that are to be printed on letterhead. It replaces the image with a white image of the same size. Is there a way to modify this to actually remove the image (and thus save some space, etc.?).
private static void Main(string[] args)
{
ManipulatePdf(#"C:\in.pdf", #"C:\out.pdf");
Console.WriteLine("Finished - press a key");
Console.ReadKey();
}
public static void ManipulatePdf(String src, String dest)
{
Console.WriteLine("Start");
PdfReader reader = new PdfReader(src);
// first read all references and find the one we wish to work on.
PdfDictionary page = reader.GetPageN(1); // all resources are available to every page (?)
PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);
page = reader.GetPageN(1);
resources = page.GetAsDict(PdfName.RESOURCES);
xobjects = resources.GetAsDict(PdfName.XOBJECT);
foreach (PdfName pdfName in xobjects.Keys)
{
PRStream stream = (PRStream) xobjects.GetAsStream(pdfName);
if (stream.Length > 100000)
{
PdfImage image = new PdfImage(MakeBlankImg(), "", null);
Console.WriteLine("Calling replace stream");
ReplaceStream(stream, image);
}
}
PdfStamper stamper = new PdfStamper(reader, new FileStream(dest, FileMode.Create));
stamper.Close();
reader.Close();
}
public static iTextSharp.text.Image MakeBlankImg()
{
Console.WriteLine("Making small blank image");
byte[] array;
using (MemoryStream ms = new MemoryStream())
{
//var drawingImage = image.GetDrawingImage();
using (Bitmap newBi = new Bitmap(1, 1))
{
using (Graphics g = Graphics.FromImage(newBi))
{
g.Clear(Color.White);
g.Flush();
}
newBi.Save(ms, ImageFormat.Jpeg);
}
array = ms.ToArray();
}
Console.WriteLine("Image array is " + array.Length + " bytes.");
return iTextSharp.text.Image.GetInstance(array);
}
public static void ReplaceStream(PRStream orig, PdfStream stream)
{
orig.Clear();
MemoryStream ms = new MemoryStream();
stream.WriteContent(ms);
orig.SetData(ms.ToArray(), false);
Console.WriteLine("Iterating keys");
foreach (KeyValuePair<PdfName, PdfObject> keyValuePair in stream)
{
Console.WriteLine("Key: " + keyValuePair.Key.ToString());
orig.Put(keyValuePair.Key, stream.Get(keyValuePair.Key));
}
}
}

Apache Tika - read chunk at a time from a file?

Is there any way to read chunk at a time (instead of reading the entire file) from a file using Tika API?
following is my code. As you can see I am reading the entire file at once. I would like to read chunk at a time and create a text file the content.
InputStream stream = new FileInputStream(file);
Parser p = new AutoDetectParser();
Metadata meta =new Metadata();
WriteOutContentHandler handler = new WriteOutContnetHandler(-1);
ParseContext parse = new ParseContext();
....
p.parse(stream,handler,meta, context);
...
String content = handler.toString();
There's (now) and Apache Tika example which shows how you can capture the plain text output, and return it in chunks based on the maximum allowed size of a chunk. You can find it in ContentHandlerExample - method is parseToPlainTextChunks
Based on that, if you wanted to output to a file instead, and on a per-chunk basis, you'd tweak it to be something like:
final int MAXIMUM_TEXT_CHUNK_SIZE = 100 * 1024 * 1024;
final File outputDir = new File("/tmp/");
private class ChunkHandler extends ContentHandlerDecorator {
private int size = 0;
private int fileNumber = -1;
private OutputStreamWriter out = null;
#Override
public void characters(char[] ch, int start, int length) throws IOException {
if (out == null || size+length > MAXIMUM_TEXT_CHUNK_SIZE) {
if (out != null) out.close();
fileNumber++;
File f = new File(outputDir, "output-" + fileNumber + ".txt);
out = new OutputStreamWriter(new FileOutputStream(f, "UTF-8"));
}
out.write(ch, start, length);
}
public void close() throws IOException {
if (out != null) out.close();
}
}
public void parse(File file) {
InputStream stream = new FileInputStream(file);
Parser p = new AutoDetectParser();
Metadata meta =new Metadata();
ContentHandler handler = new ChunkHandler();
ParseContext parse = new ParseContext();
p.parse(stream,handler,meta, context);
((ChunkHandler)handler).close();
}
That will give you plain text files in the given directory, of no more than a maximum size. All html tags will be ignored, you'll only get the plain textual content

how to append one pdf to other pdf file using itextsharp

How to append pages to one pdf file from another pdf file without creating a new pdf using itextsharp. I have metadata attached to one pdf so i just want to add only the other pdf pages,so that first pdf metadata should remain as it is.
Regards
Himvj
Assuming you have 2 pdf files: file1.pdf and file2.pdf that you want to concatenate and save the resulting pdf to file1.pdf (by replacing its contents) you could try the following:
using (var output = new MemoryStream())
{
var document = new Document();
var writer = new PdfCopy(document, output);
document.Open();
foreach (var file in new[] { "file1.pdf", "file2.pdf" })
{
var reader = new PdfReader(file);
int n = reader.NumberOfPages;
PdfImportedPage page;
for (int p = 1; p <= n; p++)
{
page = writer.GetImportedPage(reader, p);
writer.AddPage(page);
}
}
document.Close();
File.WriteAllBytes("file1.pdf", output.ToArray());
}
You can try this it add the whole document with metadata
public static void MergeFiles(string destinationFile, string[] sourceFiles)
{
try
{
//1: Create the MemoryStream for the destination document.
using (MemoryStream ms = new MemoryStream())
{
//2: Create the PdfCopyFields object.
PdfCopyFields copy = new PdfCopyFields(ms);
// - Set the security and other settings for the destination file.
//copy.Writer.SetEncryption(PdfWriter.STRENGTH128BITS, null, "1234", PdfWriter.AllowPrinting | PdfWriter.AllowCopy | PdfWriter.AllowFillIn);
copy.Writer.ViewerPreferences = PdfWriter.PageModeUseOutlines;
// - Create an arraylist to hold bookmarks for later use.
ArrayList outlines = new ArrayList();
int pageOffset = 0;
int f = 0;
//3: Import the documents specified in args[1], args[2], etc...
while (f < sourceFiles.Length)
{
// Grab the file from args[] and open it with PdfReader.
string file = sourceFiles[f];
PdfReader reader = new PdfReader(file);
// Import the pages from the current file.
copy.AddDocument(reader);
// Create an ArrayList of bookmarks in the file being imported.
// ArrayList bookmarkLst = SimpleBookmark.GetBookmark(reader);
// Shift the pages to accomidate any pages that were imported before the current document.
// SimpleBookmark.ShiftPageNumbers(bookmarkLst, pageOffset, null);
// Fill the outlines ArrayList with each bookmark as a HashTable.
// foreach (Hashtable ht in bookmarkLst)
// {
// outlines.Add(ht);
// }
// Set the page offset to the last page imported.
//copy.Writer.SetPageSize(rec);
pageOffset += reader.NumberOfPages;
f++;
}
//4: Put the outlines from all documents under a new "Root" outline and
// set them for destination document
// copy.Writer.Outlines = GetBookmarks("Root", ((Hashtable)outlines[0])["Page"], outlines);
//5: Close the PdfCopyFields object.
copy.Close();
//6: Save the MemoryStream to a file.
MemoryStreamToFile(ms, destinationFile);
}
}
catch (System.Exception e)
{
System.Console.Error.WriteLine(e.Message);
System.Console.Error.WriteLine(e.StackTrace);
System.Console.ReadLine();
}
}
public static void MemoryStreamToFile(MemoryStream MS, string FileName)
{
using (FileStream fs = new FileStream(#FileName, FileMode.Create))
{
byte[] data = MS.ToArray();
fs.Write(data, 0, data.Length);
fs.Close();
}
}