By default, pdfOCR merges the recognized text into the image that just got processed, but you may want to keep this information separated. To do this, all you need is under the OcrPdfCreatorProperties (Java/.NET) class.

With it, you can define:

  • If you want a separate text layer (either of the two options below will trigger the creation of a text layer)
    • by defining its name (Java/.NET)
    • by defining its color (Java/.NET) - bear in mind that if you do not define this parameter, the text will be transparent
  • If you want a separate image layer

Here's a quick example with all bells and whistles turned on (all previously listed options being used):

Don't forget to set TESS_DATA_FOLDER in the code below to the path of your Tesseract data. You can always find trained models here.

import com.itextpdf.kernel.pdf.PdfWriter; import com.itextpdf.pdfocr.OcrPdfCreator; import com.itextpdf.pdfocr.tesseract4.Tesseract4LibOcrEngine; import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties; import com.itextpdf.pdfocr.OcrPdfCreatorProperties; import; import; import java.util.Arrays; import java.util.List; public class JDoodle { static final Tesseract4OcrEngineProperties tesseract4OcrEngineProperties = new Tesseract4OcrEngineProperties(); private static List LIST_IMAGES_OCR = Arrays.asList(new File("invoice_front.jpg")); private static String OUTPUT_PDF = "/myfiles/hello.pdf"; public static void main(String[] args) throws IOException { final Tesseract4LibOcrEngine tesseractReader = new Tesseract4LibOcrEngine(tesseract4OcrEngineProperties); tesseract4OcrEngineProperties.setPathToTessData(new File(TESS_DATA_FOLDER)); OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); properties.setTextLayerName("text"); properties.setImageLayerName("image"); properties.setTextColor(DeviceRgb.RED); OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties); try (PdfWriter writer = new PdfWriter(OUTPUT_PDF)) { ocrPdfCreator.createPdf(LIST_IMAGES_OCR, writer).close(); } } }
using System.Collections.Generic;
using System.IO;
using iText.Kernel.Colors;
using iText.Kernel.Pdf;
using iText.Pdfocr;
using iText.Pdfocr.Tesseract4;

/// <summary>
/// .NET sample: OCRs an image into a PDF whose recognized text and source image
/// are placed on separate, named layers.
/// </summary>
public class Program
{
    // Placeholder: replace with the folder that holds your Tesseract trained data.
    private static readonly string TESS_DATA_FOLDER = "/path/to/tessdata";

    private static readonly Tesseract4OcrEngineProperties tesseract4OcrEngineProperties =
        new Tesseract4OcrEngineProperties();

    // Destination of the generated PDF.
    private static string OUTPUT_PDF = "/myfiles/hello.pdf";

    // Images to run through OCR.
    private static IList<FileInfo> LIST_IMAGES_OCR = new List<FileInfo>
    {
        new FileInfo("invoice_front.jpg")
    };

    /// <summary>
    /// Runs OCR on <c>LIST_IMAGES_OCR</c> and writes the result to <c>OUTPUT_PDF</c>.
    /// </summary>
    static void Main()
    {
        // Configure the tessdata location before the properties are handed to the engine.
        tesseract4OcrEngineProperties.SetPathToTessData(new FileInfo(TESS_DATA_FOLDER));
        var tesseractReader = new Tesseract4LibOcrEngine(tesseract4OcrEngineProperties);

        var properties = new OcrPdfCreatorProperties();
        // Naming the text layer (or giving it a color) requests a separate text layer.
        properties.SetTextLayerName("text");
        // Naming the image layer requests a separate image layer.
        properties.SetImageLayerName("image");
        // Without an explicit color the recognized text is transparent.
        properties.SetTextColor(DeviceRgb.RED);

        var ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties);
        using (var writer = new PdfWriter(OUTPUT_PDF))
        {
            // CreatePdf returns the document; close it so the content is flushed to the writer.
            ocrPdfCreator.CreatePdf(LIST_IMAGES_OCR, writer).Close();
        }
    }
}