pdfOCR: How to OCR an Image to PDF/A-3u?
With our open-source tool, pdfOCR, it is possible to OCR an image to a PDF/A-compliant document (more specifically, PDF/A-3u) with just a few lines of code.
Don't forget to set TESS_DATA_FOLDER in the code below to the path of your Tesseract data (tessdata) folder. You can always find trained models here.
JAVA
JAVA
import com.itextpdf.kernel.pdf.PdfOutputIntent;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.pdfocr.OcrPdfCreator;
import com.itextpdf.pdfocr.OcrPdfCreatorProperties;
import com.itextpdf.pdfocr.tesseract4.Tesseract4LibOcrEngine;
import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.util.Arrays;
import java.util.List;
/**
 * Minimal pdfOCR example: runs Tesseract 4 OCR over a list of images and
 * writes the recognised result to a PDF/A document.
 */
public class JDoodle {
    // Placeholder: replace with the directory that contains your Tesseract
    // trained data (*.traineddata) files before running the example.
    private static final String TESS_DATA_FOLDER = "path/to/tessdata";

    static final Tesseract4OcrEngineProperties tesseract4OcrEngineProperties = new Tesseract4OcrEngineProperties();

    // Images to OCR, the PDF to produce, and the ICC profile used for the output intent.
    private static final List<File> LIST_IMAGES_OCR = Arrays.asList(new File("invoice_front.jpg"));
    private static final String OUTPUT_PDF = "/myfiles/hello.pdf";
    private static final String DEFAULT_RGB_COLOR_PROFILE_PATH = "profiles/sRGB_CS_profile.icm";

    /**
     * Runs OCR on {@code LIST_IMAGES_OCR} and writes the PDF/A result to {@code OUTPUT_PDF}.
     *
     * @param args unused
     * @throws IOException if the output file cannot be written or the ICC profile cannot be opened
     */
    public static void main(String[] args) throws IOException {
        // Finish configuring the properties before they are handed to the engine.
        tesseract4OcrEngineProperties.setPathToTessData(new File(TESS_DATA_FOLDER));
        final Tesseract4LibOcrEngine tesseractReader = new Tesseract4LibOcrEngine(tesseract4OcrEngineProperties);

        OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties();
        properties.setPdfLang("en"); // we need to define a language to make it PDF/A compliant

        OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties);
        try (PdfWriter writer = new PdfWriter(OUTPUT_PDF)) {
            // Closing the returned document flushes it to the writer.
            ocrPdfCreator.createPdfA(LIST_IMAGES_OCR, writer, getRGBPdfOutputIntent()).close();
        }
    }

    /**
     * Builds the sRGB output intent from the ICC profile at
     * {@code DEFAULT_RGB_COLOR_PROFILE_PATH}.
     *
     * @return an output intent labelled "sRGB IEC61966-2.1"
     * @throws FileNotFoundException if the ICC profile file cannot be opened
     */
    public static PdfOutputIntent getRGBPdfOutputIntent() throws FileNotFoundException {
        // try-with-resources closes the profile stream, which the original leaked.
        // PdfOutputIntent reads the profile while it is being constructed, so the
        // stream is no longer needed once the constructor returns.
        try (InputStream is = new FileInputStream(DEFAULT_RGB_COLOR_PROFILE_PATH)) {
            return new PdfOutputIntent("", "",
                    "", "sRGB IEC61966-2.1", is);
        } catch (FileNotFoundException e) {
            throw e;
        } catch (IOException e) {
            // Only the implicit close() can raise a plain IOException here; wrap it
            // so the declared checked exception of this method stays unchanged.
            throw new UncheckedIOException(e);
        }
    }
}
C#
C#
using System.Collections.Generic;
using System.IO;
using iText.Kernel.Pdf;
using iText.Pdfocr;
using iText.Pdfocr.Tesseract4;
/// <summary>
/// Minimal pdfOCR example: runs Tesseract 4 OCR over a list of images and
/// writes the recognised result to a PDF/A document.
/// </summary>
public class Program
{
    // Placeholder: replace with the directory that contains your Tesseract
    // trained data (*.traineddata) files before running the example.
    private const string TESS_DATA_FOLDER = "path/to/tessdata";

    private static readonly Tesseract4OcrEngineProperties tesseract4OcrEngineProperties = new Tesseract4OcrEngineProperties();

    // The PDF to produce, the ICC profile used for the output intent, and the images to OCR.
    private static readonly string OUTPUT_PDF = "/myfiles/hello.pdf";
    private const string DEFAULT_RGB_COLOR_PROFILE_PATH = @"profiles\sRGB_CS_profile.icm";
    private static readonly IList<FileInfo> LIST_IMAGES_OCR = new List<FileInfo>
    {
        new FileInfo("invoice_front.jpg")
    };

    /// <summary>
    /// Runs OCR on <c>LIST_IMAGES_OCR</c> and writes the PDF/A result to <c>OUTPUT_PDF</c>.
    /// </summary>
    static void Main()
    {
        // Finish configuring the properties before they are handed to the engine.
        tesseract4OcrEngineProperties.SetPathToTessData(new FileInfo(TESS_DATA_FOLDER));
        var tesseractReader = new Tesseract4LibOcrEngine(tesseract4OcrEngineProperties);

        var properties = new OcrPdfCreatorProperties();
        properties.SetPdfLang("en"); //we need to define a language to make it PDF/A compliant

        var ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties);
        using (var writer = new PdfWriter(OUTPUT_PDF))
        {
            // Closing the returned document flushes it to the writer.
            ocrPdfCreator.CreatePdfA(LIST_IMAGES_OCR, writer, GetRgbPdfOutputIntent()).Close();
        }
    }

    /// <summary>
    /// Builds the sRGB output intent from the ICC profile at
    /// <c>DEFAULT_RGB_COLOR_PROFILE_PATH</c>.
    /// </summary>
    /// <returns>An output intent labelled "sRGB IEC61966-2.1".</returns>
    static PdfOutputIntent GetRgbPdfOutputIntent()
    {
        // The using block disposes the profile stream, which the original leaked.
        // PdfOutputIntent reads the profile while it is being constructed, so the
        // stream is no longer needed once the constructor returns.
        using (Stream @is = new FileStream(DEFAULT_RGB_COLOR_PROFILE_PATH, FileMode.Open, FileAccess.Read))
        {
            return new PdfOutputIntent("", "", "", "sRGB IEC61966-2.1", @is);
        }
    }
}