pdfOCR: How to OCR an Image to PDF?
With our open source tool, pdfOCR, it is possible to OCR an image to a PDF, with just a few lines of code.
Don't forget to specify the path to your Tesseract data as TESS_DATA_FOLDER in the code
below. You can always find trained models here.
JAVA
JAVA
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.pdfocr.OcrPdfCreator;
import com.itextpdf.pdfocr.tesseract4.Tesseract4LibOcrEngine;
import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
/**
 * Minimal pdfOCR example: runs OCR on a list of images and writes the result to a PDF.
 */
public class JDoodle {
    /** Engine settings; the tessdata location is filled in by {@link #main(String[])}. */
    static final Tesseract4OcrEngineProperties tesseract4OcrEngineProperties = new Tesseract4OcrEngineProperties();

    /** Placeholder: replace with the path to your Tesseract trained-data (tessdata) folder. */
    private static final String TESS_DATA_FOLDER = "/path/to/tessdata";

    /** Images that are passed to the OCR engine. */
    private static final List<File> LIST_IMAGES_OCR = Arrays.asList(new File("invoice_front.jpg"));

    /** Location of the PDF that is written. */
    private static final String OUTPUT_PDF = "/myfiles/hello.pdf";

    /**
     * Runs OCR on {@code LIST_IMAGES_OCR} and writes the resulting document to {@code OUTPUT_PDF}.
     *
     * @param args command-line arguments (unused)
     * @throws IOException declared because creating or closing the writer, or creating the PDF, may fail on I/O
     */
    public static void main(String[] args) throws IOException {
        // Set the tessdata path BEFORE creating the engine, so the engine is
        // constructed from fully configured properties.
        tesseract4OcrEngineProperties.setPathToTessData(new File(TESS_DATA_FOLDER));

        final Tesseract4LibOcrEngine tesseractReader = new Tesseract4LibOcrEngine(tesseract4OcrEngineProperties);
        OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader);

        try (PdfWriter writer = new PdfWriter(OUTPUT_PDF)) {
            // createPdf returns the generated document; close it to finish writing.
            ocrPdfCreator.createPdf(LIST_IMAGES_OCR, writer).close();
        }
    }
}
C#
C#
using System.Collections.Generic;
using System.IO;
using iText.Kernel.Pdf;
using iText.Pdfocr;
using iText.Pdfocr.Tesseract4;
/// <summary>
/// Minimal pdfOCR example: runs OCR on a list of images and writes the result to a PDF.
/// </summary>
public class Program
{
    // Engine settings; declared inside the class because C# does not allow
    // fields outside a type. The tessdata location is filled in by Main.
    private static readonly Tesseract4OcrEngineProperties tesseract4OcrEngineProperties =
        new Tesseract4OcrEngineProperties();

    // Placeholder: replace with the path to your Tesseract trained-data (tessdata) folder.
    private static readonly string TESS_DATA_FOLDER = "/path/to/tessdata";

    // Location of the PDF that is written.
    private static string OUTPUT_PDF = "/myfiles/hello.pdf";

    // Images that are passed to the OCR engine.
    private static IList<FileInfo> LIST_IMAGES_OCR = new List<FileInfo>
    {
        new FileInfo("invoice_front.jpg")
    };

    /// <summary>
    /// Runs OCR on <c>LIST_IMAGES_OCR</c> and writes the resulting document to <c>OUTPUT_PDF</c>.
    /// </summary>
    static void Main()
    {
        // Set the tessdata path BEFORE creating the engine, so the engine is
        // constructed from fully configured properties.
        tesseract4OcrEngineProperties.SetPathToTessData(new FileInfo(TESS_DATA_FOLDER));

        var tesseractReader = new Tesseract4LibOcrEngine(tesseract4OcrEngineProperties);
        var ocrPdfCreator = new OcrPdfCreator(tesseractReader);

        using (var writer = new PdfWriter(OUTPUT_PDF))
        {
            // CreatePdf returns the generated document; close it to finish writing.
            ocrPdfCreator.CreatePdf(LIST_IMAGES_OCR, writer).Close();
        }
    }
}