If we try to read a PDF from the file system without first ingesting it into the database:
(: Scans the import root on the file system, descends into each batch entry,
   selects the entries whose filename matches ".pdf", reads each matching file
   and writes what was read to "D:/test". :)
for $IMPORT in xdmp:filesystem-directory("C:\ABC\0000001\Import")/dir:entry
(: $Batchfilename is bound here but not referenced again in this query. :)
let $Batchfilename := $IMPORT/dir:filename
let $BatchPathname := $IMPORT/dir:pathname
(: List the contents of the current batch entry. :)
for $EachBatchPath in xdmp:filesystem-directory($BatchPathname)
(: List the directory at each child entry's pathname and keep the pathnames of
   entries whose dir:filename matches the word query ".pdf".
   NOTE(review): this expression can presumably yield zero or several
   pathnames; the calls that follow pass it on as-is - confirm that is intended. :)
let $PDFDocumentPath := xdmp:filesystem-directory($EachBatchPath/dir:entry/dir:pathname)/dir:entry[cts:contains(dir:filename,cts:word-query(".pdf"))]/dir:pathname
(: Read the file only when it exists; otherwise bind the empty sequence.
   This read is where the reported XDMP-READFILE "File is not in UTF-8" error
   comes from: a PDF is binary, not UTF-8 text. Reading the file as a binary
   (for example with xdmp:external-binary) has been suggested as the alternative. :)
let $EachInputFileContent := if (xdmp:filesystem-file-exists($PDFDocumentPath) ) then
xdmp:filesystem-file($PDFDocumentPath)
else ( )
(: Save whatever was read to the fixed target "D:/test", requesting UTF-8 output encoding. :)
return
xdmp:save("D:/test", $EachInputFileContent,
<options xmlns="xdmp:save">
<output-encoding>utf-8</output-encoding>
</options>)
This throws the error: XDMP-READFILE: $r instance of node()+ -- ReadFile File is not in UTF-8:
The solution is to establish an XCC/MLJAM connection and call Java code to read the PDF:
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import java.io.FileOutputStream;
import com.lowagie.text.Document;
import com.lowagie.text.Rectangle;
import com.lowagie.text.pdf.BaseFont;
import com.lowagie.text.pdf.PdfContentByte;
import com.lowagie.text.pdf.PdfImportedPage;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfWriter;
/**
 * Sample that reads {@code C:/ABC.pdf} with iText and writes a "2-up" copy to
 * {@code D:/test/satyam.pdf}: every output page carries two consecutive source
 * pages, each scaled to 50%, placed side by side, with a centred
 * "page X of Y" footer.
 */
public class PDFReaderSample
{
    /**
     * Runs the conversion using the hard-coded input and output paths.
     *
     * @param args ignored
     * @throws Exception if the source PDF cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws Exception
    {
        PdfReader reader = new PdfReader("C:/ABC.pdf");
        try {
            int n = reader.getNumberOfPages();
            Rectangle psize = reader.getPageSize(1);
            // Width and height are swapped on purpose: the output page is the
            // first source page rotated, so two half-size pages fit side by side.
            // NOTE(review): later iText 2.x releases name these accessors
            // getHeight()/getWidth() - confirm against the library version in use.
            float width = psize.height();
            float height = psize.width();
            Document document = new Document(new Rectangle(width, height));
            FileOutputStream out = new FileOutputStream("D:/test/satyam.pdf");
            try {
                PdfWriter writer = PdfWriter.getInstance(document, out);
                document.open();
                PdfContentByte cb = writer.getDirectContent();

                // Loop-invariant values: build the footer font and the total
                // number of output sheets once instead of on every iteration.
                BaseFont bf = BaseFont.createFont(BaseFont.HELVETICA,
                        BaseFont.CP1252, BaseFont.NOT_EMBEDDED);
                int totalSheets = (n / 2) + (n % 2 > 0 ? 1 : 0);

                int i = 0; // last source page consumed (iText pages are 1-based)
                int p = 0; // current output sheet number
                while (i < n) {
                    document.newPage();
                    p++;

                    // Left half of the sheet.
                    i++;
                    PdfImportedPage page1 = writer.getImportedPage(reader, i);
                    cb.addTemplate(page1, .5f, 0, 0, .5f, 60, 120);

                    // Right half, only when another source page remains.
                    if (i < n) {
                        i++;
                        PdfImportedPage page2 = writer.getImportedPage(reader, i);
                        cb.addTemplate(page2, .5f, 0, 0, .5f, width / 2 + 60, 120);
                    }

                    cb.beginText();
                    cb.setFontAndSize(bf, 19);
                    cb.showTextAligned(PdfContentByte.ALIGN_CENTER, "page " + p
                            + " of " + totalSheets, width / 2, 40, 0);
                    cb.endText();
                }
                document.close();
            } finally {
                // Release the output file even when generation fails part-way;
                // closing a FileOutputStream that is already closed has no effect.
                out.close();
            }
        } finally {
            // Release the source PDF regardless of the outcome.
            reader.close();
        }
    }
}
(: Scans the import root on the file system, descends into each batch entry,
   selects the entries whose filename matches ".pdf", reads each matching file
   and writes what was read to "D:/test". :)
for $IMPORT in xdmp:filesystem-directory("C:\ABC\0000001\Import")/dir:entry
(: $Batchfilename is bound here but not referenced again in this query. :)
let $Batchfilename := $IMPORT/dir:filename
let $BatchPathname := $IMPORT/dir:pathname
(: List the contents of the current batch entry. :)
for $EachBatchPath in xdmp:filesystem-directory($BatchPathname)
(: List the directory at each child entry's pathname and keep the pathnames of
   entries whose dir:filename matches the word query ".pdf".
   NOTE(review): this expression can presumably yield zero or several
   pathnames; the calls that follow pass it on as-is - confirm that is intended. :)
let $PDFDocumentPath := xdmp:filesystem-directory($EachBatchPath/dir:entry/dir:pathname)/dir:entry[cts:contains(dir:filename,cts:word-query(".pdf"))]/dir:pathname
(: Read the file only when it exists; otherwise bind the empty sequence.
   This read is where the reported XDMP-READFILE "File is not in UTF-8" error
   comes from: a PDF is binary, not UTF-8 text. Reading the file as a binary
   (for example with xdmp:external-binary) has been suggested as the alternative. :)
let $EachInputFileContent := if (xdmp:filesystem-file-exists($PDFDocumentPath) ) then
xdmp:filesystem-file($PDFDocumentPath)
else ( )
(: Save whatever was read to the fixed target "D:/test", requesting UTF-8 output encoding. :)
return
xdmp:save("D:/test", $EachInputFileContent,
<options xmlns="xdmp:save">
<output-encoding>utf-8</output-encoding>
</options>)
This throws the error: XDMP-READFILE: $r instance of node()+ -- ReadFile File is not in UTF-8:
The solution is to establish an XCC/MLJAM connection and call Java code to read the PDF:
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import java.io.FileOutputStream;
import com.lowagie.text.Document;
import com.lowagie.text.Rectangle;
import com.lowagie.text.pdf.BaseFont;
import com.lowagie.text.pdf.PdfContentByte;
import com.lowagie.text.pdf.PdfImportedPage;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfWriter;
/**
 * Sample that reads {@code C:/ABC.pdf} with iText and writes a "2-up" copy to
 * {@code D:/test/satyam.pdf}: every output page carries two consecutive source
 * pages, each scaled to 50%, placed side by side, with a centred
 * "page X of Y" footer.
 */
public class PDFReaderSample
{
    /**
     * Runs the conversion using the hard-coded input and output paths.
     *
     * @param args ignored
     * @throws Exception if the source PDF cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws Exception
    {
        PdfReader reader = new PdfReader("C:/ABC.pdf");
        try {
            int n = reader.getNumberOfPages();
            Rectangle psize = reader.getPageSize(1);
            // Width and height are swapped on purpose: the output page is the
            // first source page rotated, so two half-size pages fit side by side.
            // NOTE(review): later iText 2.x releases name these accessors
            // getHeight()/getWidth() - confirm against the library version in use.
            float width = psize.height();
            float height = psize.width();
            Document document = new Document(new Rectangle(width, height));
            FileOutputStream out = new FileOutputStream("D:/test/satyam.pdf");
            try {
                PdfWriter writer = PdfWriter.getInstance(document, out);
                document.open();
                PdfContentByte cb = writer.getDirectContent();

                // Loop-invariant values: build the footer font and the total
                // number of output sheets once instead of on every iteration.
                BaseFont bf = BaseFont.createFont(BaseFont.HELVETICA,
                        BaseFont.CP1252, BaseFont.NOT_EMBEDDED);
                int totalSheets = (n / 2) + (n % 2 > 0 ? 1 : 0);

                int i = 0; // last source page consumed (iText pages are 1-based)
                int p = 0; // current output sheet number
                while (i < n) {
                    document.newPage();
                    p++;

                    // Left half of the sheet.
                    i++;
                    PdfImportedPage page1 = writer.getImportedPage(reader, i);
                    cb.addTemplate(page1, .5f, 0, 0, .5f, 60, 120);

                    // Right half, only when another source page remains.
                    if (i < n) {
                        i++;
                        PdfImportedPage page2 = writer.getImportedPage(reader, i);
                        cb.addTemplate(page2, .5f, 0, 0, .5f, width / 2 + 60, 120);
                    }

                    cb.beginText();
                    cb.setFontAndSize(bf, 19);
                    cb.showTextAligned(PdfContentByte.ALIGN_CENTER, "page " + p
                            + " of " + totalSheets, width / 2, 40, 0);
                    cb.endText();
                }
                document.close();
            } finally {
                // Release the output file even when generation fails part-way;
                // closing a FileOutputStream that is already closed has no effect.
                out.close();
            }
        } finally {
            // Release the source PDF regardless of the outcome.
            reader.close();
        }
    }
}
Have you tried xdmp:external-binary? I think the problem is that you are trying to interpret the binary as UTF-8, which it is not. Reading it as a binary would be the preferred approach.
ReplyDelete