Parsing PDFs

Examples written in answer to questions such as:

Click How to use a text extraction strategy after applying a location extraction strategy?

parsecustom

JAVA

/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2023 Apryse Group NV
    Authors: Apryse Software.

    For more information, please contact iText Software at this address:
    sales@itextpdf.com
 */
package com.itextpdf.samples.sandbox.parse;

import com.itextpdf.kernel.font.PdfFont;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.canvas.parser.EventType;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter;
import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredEventListener;
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.FileOutputStream;
import java.io.Writer;
import java.io.BufferedWriter;

public class ParseCustom {
    public static final String DEST = "./target/txt/parse_custom.txt";

    public static final String SRC = "./src/main/resources/pdfs/nameddestinations.pdf";

    public static void main(String[] args) throws IOException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();

        new ParseCustom().manipulatePdf(DEST);
    }

    protected void manipulatePdf(String dest) throws IOException {
        PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));

        Rectangle rect = new Rectangle(36, 750, 523, 56);
        CustomFontFilter fontFilter = new CustomFontFilter(rect);
        FilteredEventListener listener = new FilteredEventListener();

        // Create a text extraction renderer
        LocationTextExtractionStrategy extractionStrategy = listener
                .attachEventListener(new LocationTextExtractionStrategy(), fontFilter);

        // Note: If you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.reset()
        PdfCanvasProcessor parser = new PdfCanvasProcessor(listener);
        parser.processPageContent(pdfDoc.getFirstPage());

        // Get the resultant text after applying the custom filter
        String actualText = extractionStrategy.getResultantText();

        pdfDoc.close();

        // See the resultant text in the console
        System.out.println(actualText);

        try (Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dest)))) {
            writer.write(actualText);
        }
    }

    /*
     * The custom filter filters only the text of which the font name ends with Bold or Oblique.
     */
    protected class CustomFontFilter extends TextRegionEventFilter {
        public CustomFontFilter(Rectangle filterRect) {
            super(filterRect);
        }

        @Override
        public boolean accept(IEventData data, EventType type) {
            if (type.equals(EventType.RENDER_TEXT)) {
                TextRenderInfo renderInfo = (TextRenderInfo) data;
                PdfFont font = renderInfo.getFont();
                if (null != font) {
                    String fontName = font.getFontProgram().getFontNames().getFontName();
                    return fontName.endsWith("Bold") || fontName.endsWith("Oblique");
                }
            }

            return false;
        }
    }
}

C#

using System;
using System.IO;
using iText.Kernel.Font;
using iText.Kernel.Geom;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Data;
using iText.Kernel.Pdf.Canvas.Parser.Filter;
using iText.Kernel.Pdf.Canvas.Parser.Listener;

namespace iText.Samples.Sandbox.Parse
{
    public class ParseCustom
    {
        public static readonly String DEST = "results/txt/parse_custom.txt";

        public static readonly String SRC = "../../../resources/pdfs/nameddestinations.pdf";

        public static void Main(String[] args)
        {
            FileInfo file = new FileInfo(DEST);
            file.Directory.Create();

            new ParseCustom().ManipulatePdf(DEST);
        }

        public virtual void ManipulatePdf(String dest)
        {
            PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));

            Rectangle rect = new Rectangle(36, 750, 523, 56);
            CustomFontFilter fontFilter = new CustomFontFilter(rect);
            FilteredEventListener listener = new FilteredEventListener();

            // Create a text extraction renderer
            LocationTextExtractionStrategy extractionStrategy = listener
                .AttachEventListener(new LocationTextExtractionStrategy(), fontFilter);

            // Note: If you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.reset()
            new PdfCanvasProcessor(listener).ProcessPageContent(pdfDoc.GetFirstPage());

            // Get the resultant text after applying the custom filter
            String actualText = extractionStrategy.GetResultantText();

            pdfDoc.Close();

            // See the resultant text in the console
            Console.Out.WriteLine(actualText);

            using (StreamWriter writer = new StreamWriter(dest))
            {
                writer.Write(actualText);
            }
        }

        // The custom filter filters only the text of which the font name ends with Bold or Oblique.
        protected class CustomFontFilter : TextRegionEventFilter
        {
            public CustomFontFilter(Rectangle filterRect)
                : base(filterRect)
            {
            }

            public override bool Accept(IEventData data, EventType type)
            {
                if (type.Equals(EventType.RENDER_TEXT))
                {
                    TextRenderInfo renderInfo = (TextRenderInfo) data;
                    PdfFont font = renderInfo.GetFont();
                    if (null != font)
                    {
                        String fontName = font.GetFontProgram().GetFontNames().GetFontName();
                        return fontName.EndsWith("Bold") || fontName.EndsWith("Oblique");
                    }
                }

                return false;
            }
        }
    }
}

parseczech

JAVA

/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2023 Apryse Group NV
    Authors: Apryse Software.

    For more information, please contact iText Software at this address:
    sales@itextpdf.com
 */
package com.itextpdf.samples.sandbox.parse;

import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

public class ParseCzech {
    public static final String DEST = "./target/txt/czech.txt";

    public static final String SRC = "./src/main/resources/pdfs/czech.pdf";

    public static void main(String[] args) throws IOException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();

        new ParseCzech().manipulatePdf(DEST);
    }

    protected void manipulatePdf(String dest) throws IOException {
        PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));

        // Create a text extraction renderer
        LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();

        // Note: if you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.reset()
        PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
        parser.processPageContent(pdfDoc.getFirstPage());

        byte[] content = strategy.getResultantText().getBytes("UTF-8");
        try (FileOutputStream stream = new FileOutputStream(dest)) {
            stream.write(content);
        }

        pdfDoc.close();
    }
}

C#

using System;
using System.IO;
using System.Text;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Listener;

namespace iText.Samples.Sandbox.Parse
{
    public class ParseCzech
    {
        public static readonly String DEST = "results/txt/czech.txt";

        public static readonly String SRC = "../../../resources/pdfs/czech.pdf";

        public static void Main(String[] args)
        {
            FileInfo file = new FileInfo(DEST);
            file.Directory.Create();

            new ParseCzech().ManipulatePdf(DEST);
        }

        protected virtual void ManipulatePdf(String dest)
        {
            PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));

            // Create a text extraction renderer
            LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();

            // Note: if you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.Reset()
            PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
            parser.ProcessPageContent(pdfDoc.GetFirstPage());

            byte[] array = Encoding.UTF8.GetBytes(strategy.GetResultantText());
            using (FileStream stream = new FileStream(dest, FileMode.Create))
            {
                stream.Write(array, 0, array.Length);
            }

            pdfDoc.Close();
        }
    }
}