With our open source tool, pdfOCR, it is possible to OCR an image to a PDF, with just a few lines of code.


don't forget to specify the path to your Tesseract Data in your code TESS_DATA_FOLDER below. You can always find trained models here.

import com.itextpdf.kernel.pdf.PdfWriter; import com.itextpdf.pdfocr.OcrPdfCreator; import com.itextpdf.pdfocr.tesseract4.Tesseract4LibOcrEngine; import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties; import java.io.File; import java.io.IOException; import java.util.Arrays; import java.util.List; public class JDoodle { static final Tesseract4OcrEngineProperties tesseract4OcrEngineProperties = new Tesseract4OcrEngineProperties(); private static List LIST_IMAGES_OCR = Arrays.asList(new File("invoice_front.jpg")); private static String OUTPUT_PDF = "/myfiles/hello.pdf"; public static void main(String[] args) throws IOException { final Tesseract4LibOcrEngine tesseractReader = new Tesseract4LibOcrEngine(tesseract4OcrEngineProperties); tesseract4OcrEngineProperties.setPathToTessData(new File(TESS_DATA_FOLDER)); OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader); try (PdfWriter writer = new PdfWriter(OUTPUT_PDF)) { ocrPdfCreator.createPdf(LIST_IMAGES_OCR, writer).close(); } } }
using System.Collections.Generic; using System.IO; using iText.Kernel.Pdf; using iText.Pdfocr; using iText.Pdfocr.Tesseract4; private static readonly Tesseract4OcrEngineProperties tesseract4OcrEngineProperties = new Tesseract4OcrEngineProperties(); public class Program { private static string OUTPUT_PDF = "/myfiles/hello.pdf"; private static IList LIST_IMAGES_OCR = new List { new FileInfo("invoice_front.jpg") }; static void Main() { { var tesseractReader = new Tesseract4LibOcrEngine(tesseract4OcrEngineProperties); tesseract4OcrEngineProperties.SetPathToTessData(new FileInfo(TESS_DATA_FOLDER)); var ocrPdfCreator = new OcrPdfCreator(tesseractReader); using (var writer = new PdfWriter(OUTPUT_PDF)) { ocrPdfCreator.CreatePdf(LIST_IMAGES_OCR, writer).Close(); } } }