Tuesday, 28 August 2012

Not possible to read a PDF from the file system

If we try to read a PDF from the file system without ingesting it into the database, using a query such as:

(: Iterates over the entries of the import directory, looks one level down
   for entries whose filename matches the word query ".pdf", reads each match
   with xdmp:filesystem-file and saves the result to D:/test.
   NOTE(review): the "dir" prefix is not declared in this snippet; it presumably
   needs to be bound to MarkLogic's directory-listing namespace - confirm. :)
for $IMPORT in xdmp:filesystem-directory("C:\ABC\0000001\Import")/dir:entry
let $Batchfilename := $IMPORT/dir:filename
let $BatchPathname := $IMPORT/dir:pathname
(: List the contents of the current entry, then list each child entry and keep
   only the pathnames whose filename contains ".pdf".
   NOTE(review): this path expression can yield several pathnames when more
   than one entry matches - confirm the intended cardinality. :)
for $EachBatchPath in xdmp:filesystem-directory($BatchPathname)
let $PDFDocumentPath := xdmp:filesystem-directory($EachBatchPath/dir:entry/dir:pathname)/dir:entry[cts:contains(dir:filename,cts:word-query(".pdf"))]/dir:pathname
 
(: Read the file only if it exists; otherwise bind the empty sequence.
   xdmp:filesystem-file is presumably where the XDMP-READFILE "not in UTF-8"
   error quoted after this snippet is raised, since a PDF is binary rather
   than UTF-8 text. :)
let $EachInputFileContent := if (xdmp:filesystem-file-exists($PDFDocumentPath) ) then
                                              xdmp:filesystem-file($PDFDocumentPath)
                                            else ( )
 return
  (: Write the content that was read to D:/test, requesting utf-8 output encoding. :)
  xdmp:save("D:/test", $EachInputFileContent,
          <options xmlns="xdmp:save">
          <output-encoding>utf-8</output-encoding>
          </options>)


it throws the error: XDMP-READFILE: $r instance of node()+ -- ReadFile File is not in UTF-8: 

The solution is to establish an XCC/MLJAM connection and call Java code that reads the PDF, for example:



import com.itextpdf.text.pdf.parser.PdfTextExtractor;

import java.io.FileOutputStream;
import com.lowagie.text.Document;
import com.lowagie.text.Rectangle;
import com.lowagie.text.pdf.BaseFont;
import com.lowagie.text.pdf.PdfContentByte;
import com.lowagie.text.pdf.PdfImportedPage;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfWriter;




public class PDFReaderSample
{
public static void main(String[] args) throws Exception
  {


PdfReader reader = new PdfReader("C:/ABC.pdf");
  int n = reader.getNumberOfPages();
  Rectangle psize = reader.getPageSize(1);
  float width = psize.height();
  float height = psize.width();
 Document document = new Document(new Rectangle(width, height));
  PdfWriter Pdfwriter = PdfWriter.getInstance(document,
new FileOutputStream("D:/test/satyam.pdf"));
 document.open();

 PdfContentByte cb = Pdfwriter.getDirectContent();
 int i = 0;
 int p = 0;
 while (i < n) {
 document.newPage();
 p++;
 i++;
 PdfImportedPage page1 = Pdfwriter.getImportedPage(reader, i);
 cb.addTemplate(page1, .5f, 0, 0, .5f, 60, 120);
 if (i < n) {
 i++;
 PdfImportedPage page2 = Pdfwriter.getImportedPage(reader, i);
 cb.addTemplate(page2, .5f, 0, 0, .5f, width / 2 + 60, 120);
 }
 BaseFont bf = BaseFont.createFont(BaseFont.HELVETICA,
BaseFont.CP1252,BaseFont.NOT_EMBEDDED);
 cb.beginText();
 cb.setFontAndSize(bf, 19);
 cb.showTextAligned(PdfContentByte.ALIGN_CENTER, "page " + p
+ " of " + ((n / 2) + (n % 2 > 0? 1 : 0)), width / 2, 40, 0);
 cb.endText();
 }
 document.close();

  }
}

1 comment:

  1. Have you tried xdmp:external-binary? I think the problem is that you are trying to interpret the binary as UTF-8, which it is not. So reading it as a binary would be the preferred approach.

    ReplyDelete