With our open source tool, pdfOCR, you can OCR an image into a PDF/A-compliant document (more specifically, PDF/A-3u) with just a few lines of code.

Don't forget to set TESS_DATA_FOLDER in the code below to the path of your Tesseract data (tessdata) folder. You can always find trained models here.

import com.itextpdf.kernel.pdf.PdfWriter; import com.itextpdf.pdfocr.OcrPdfCreator; import com.itextpdf.pdfocr.tesseract4.Tesseract4LibOcrEngine; import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties; import java.io.File; import java.io.IOException; import java.util.Arrays; import java.util.List; public class JDoodle { static final Tesseract4OcrEngineProperties tesseract4OcrEngineProperties = new Tesseract4OcrEngineProperties(); private static List LIST_IMAGES_OCR = Arrays.asList(new File("invoice_front.jpg")); private static String OUTPUT_PDF = "/myfiles/hello.pdf"; private static final String DEFAULT_RGB_COLOR_PROFILE_PATH = "profiles/sRGB_CS_profile.icm"; public static void main(String[] args) throws IOException { final Tesseract4LibOcrEngine tesseractReader = new Tesseract4LibOcrEngine(tesseract4OcrEngineProperties); tesseract4OcrEngineProperties.setPathToTessData(new File(TESS_DATA_FOLDER)); OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); properties.setPdfLang("en"); //we need to define a language to make it PDF/A compliant OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties); try (PdfWriter writer = new PdfWriter(OUTPUT_PDF)) { ocrPdfCreator.createPdfA(LIST_IMAGES_OCR, writer, getRGBPdfOutputIntent()).close(); } } public static PdfOutputIntent getRGBPdfOutputIntent() throws FileNotFoundException { InputStream is = new FileInputStream(DEFAULT_RGB_COLOR_PROFILE_PATH); return new PdfOutputIntent("", "", "", "sRGB IEC61966-2.1", is); } }
using System.Collections.Generic;
using System.IO;
using iText.Kernel.Pdf;
using iText.Pdfocr;
using iText.Pdfocr.Tesseract4;

/// <summary>
/// Minimal example: OCR one or more images with Tesseract 4 and write the
/// result as a PDF/A document using pdfOCR.
/// </summary>
public class Program
{
    // TODO: replace this placeholder with the path to your Tesseract data (tessdata) folder.
    private const string TESS_DATA_FOLDER = "path/to/tessdata";

    private static readonly Tesseract4OcrEngineProperties tesseract4OcrEngineProperties =
        new Tesseract4OcrEngineProperties();

    // Destination of the generated PDF/A file.
    private static readonly string OUTPUT_PDF = "/myfiles/hello.pdf";

    // ICC colour profile used to build the output intent passed to CreatePdfA.
    // Forward slashes are used because .NET accepts them on both Windows and
    // Unix-like systems, whereas a backslash is only a separator on Windows.
    private const string DEFAULT_RGB_COLOR_PROFILE_PATH = "profiles/sRGB_CS_profile.icm";

    // Images that are OCRed and placed into the output document.
    private static readonly IList<FileInfo> LIST_IMAGES_OCR = new List<FileInfo>
    {
        new FileInfo("invoice_front.jpg")
    };

    /// <summary>
    /// Runs OCR on <see cref="LIST_IMAGES_OCR"/> and writes the result to <see cref="OUTPUT_PDF"/>.
    /// </summary>
    static void Main()
    {
        // Set the tessdata location before the properties are handed to the engine,
        // so the setting is in place regardless of whether the engine keeps a
        // reference to the properties object or copies it.
        tesseract4OcrEngineProperties.SetPathToTessData(new FileInfo(TESS_DATA_FOLDER));
        var tesseractReader = new Tesseract4LibOcrEngine(tesseract4OcrEngineProperties);

        var properties = new OcrPdfCreatorProperties();
        properties.SetPdfLang("en"); // we need to define a language to make it PDF/A compliant

        var ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties);
        using (var writer = new PdfWriter(OUTPUT_PDF))
        {
            // Closing the returned document finishes writing the PDF.
            ocrPdfCreator.CreatePdfA(LIST_IMAGES_OCR, writer, GetRgbPdfOutputIntent()).Close();
        }
    }

    /// <summary>
    /// Builds the output intent from the ICC profile at
    /// <see cref="DEFAULT_RGB_COLOR_PROFILE_PATH"/>.
    /// </summary>
    /// <returns>A new output intent labelled "sRGB IEC61966-2.1".</returns>
    static PdfOutputIntent GetRgbPdfOutputIntent()
    {
        // The stream is disposed as soon as the output intent has been constructed.
        // NOTE(review): this relies on PdfOutputIntent reading the profile inside
        // its constructor rather than lazily - confirm against the iText API docs.
        using (Stream iccStream = new FileStream(DEFAULT_RGB_COLOR_PROFILE_PATH, FileMode.Open, FileAccess.Read))
        {
            return new PdfOutputIntent("", "", "", "sRGB IEC61966-2.1", iccStream);
        }
    }
}