pdfOCR: How to OCR an Image to PDF/A-3u?

With our open source tool, pdfOCR, it is possible to OCR an image to a PDF/A compliant document (more specifically, PDF/A-3u), with just a few lines of code.

don't forget to specify the path to your Tesseract Data in your code TESS_DATA_FOLDER below. You can always find trained models here.

JAVA

import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.pdfocr.OcrPdfCreator;
import com.itextpdf.pdfocr.tesseract4.Tesseract4LibOcrEngine;
import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;


public class JDoodle {

	static final Tesseract4OcrEngineProperties tesseract4OcrEngineProperties = new Tesseract4OcrEngineProperties();
    private static List<File> LIST_IMAGES_OCR = Arrays.asList(new File("invoice_front.jpg"));
    private static String OUTPUT_PDF = "/myfiles/hello.pdf";
	private static final String DEFAULT_RGB_COLOR_PROFILE_PATH = "profiles/sRGB_CS_profile.icm";

    public static void main(String[] args) throws IOException {

		final Tesseract4LibOcrEngine tesseractReader = new Tesseract4LibOcrEngine(tesseract4OcrEngineProperties);
        tesseract4OcrEngineProperties.setPathToTessData(new File(TESS_DATA_FOLDER));		

		OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties();
        properties.setPdfLang("en"); //we need to define a language to make it PDF/A compliant

        OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties);
        try (PdfWriter writer = new PdfWriter(OUTPUT_PDF)) {
            ocrPdfCreator.createPdfA(LIST_IMAGES_OCR, writer, getRGBPdfOutputIntent()).close();
        }
    }

    public static PdfOutputIntent getRGBPdfOutputIntent() throws FileNotFoundException {
        InputStream is = new FileInputStream(DEFAULT_RGB_COLOR_PROFILE_PATH);
        return new PdfOutputIntent("", "",
                "", "sRGB IEC61966-2.1", is);
    }

}

C#

using System.Collections.Generic;
using System.IO;
using iText.Kernel.Pdf;
using iText.Pdfocr;
using iText.Pdfocr.Tesseract4;

public class Program
{
	private static readonly Tesseract4OcrEngineProperties tesseract4OcrEngineProperties = new Tesseract4OcrEngineProperties();
    private static string OUTPUT_PDF = "/myfiles/hello.pdf";
    private const string DEFAULT_RGB_COLOR_PROFILE_PATH = @"profiles\sRGB_CS_profile.icm";
    private static IList<FileInfo> LIST_IMAGES_OCR = new List<FileInfo>
    {
        new FileInfo("invoice_front.jpg")
    };

    static void Main()
    {
		var tesseractReader = new Tesseract4LibOcrEngine(tesseract4OcrEngineProperties);
        tesseract4OcrEngineProperties.SetPathToTessData(new FileInfo(TESS_DATA_FOLDER));

		var properties = new OcrPdfCreatorProperties();
        properties.SetPdfLang("en"); //we need to define a language to make it PDF/A compliant

        var ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties);
        using (var writer = new PdfWriter(OUTPUT_PDF))
        {
            ocrPdfCreator.CreatePdfA(LIST_IMAGES_OCR, writer, GetRgbPdfOutputIntent()).Close();
        }
    }

	static PdfOutputIntent GetRgbPdfOutputIntent()
    {
       Stream @is = new FileStream(DEFAULT_RGB_COLOR_PROFILE_PATH, FileMode.Open, FileAccess.Read);
       return new PdfOutputIntent("", "", "", "sRGB IEC61966-2.1", @is);
    }
}