Parsing PDFs
Examples written in answer to questions such as:
parsecustom
JAVA
JAVA
/*
This file is part of the iText (R) project.
Copyright (c) 1998-2023 Apryse Group NV
Authors: Apryse Software.
For more information, please contact iText Software at this address:
sales@itextpdf.com
*/
package com.itextpdf.samples.sandbox.parse;
import com.itextpdf.kernel.font.PdfFont;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.canvas.parser.EventType;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter;
import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredEventListener;
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.FileOutputStream;
import java.io.Writer;
import java.io.BufferedWriter;
/**
 * Sample that extracts text from the first page of {@link #SRC} through a custom
 * event filter and writes the extracted text to the console and to {@link #DEST}.
 */
public class ParseCustom {
    public static final String DEST = "./target/txt/parse_custom.txt";
    public static final String SRC = "./src/main/resources/pdfs/nameddestinations.pdf";

    /**
     * Creates the output directory (if needed) and runs the sample.
     *
     * @param args command-line arguments; not used
     * @throws IOException if the source cannot be read or the destination cannot be written
     */
    public static void main(String[] args) throws IOException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new ParseCustom().manipulatePdf(DEST);
    }

    /**
     * Parses the first page of {@link #SRC}, keeps the text accepted by
     * {@link CustomFontFilter}, prints it and writes it to {@code dest}.
     *
     * @param dest path of the text file to create
     * @throws IOException if reading the PDF or writing the text file fails
     */
    protected void manipulatePdf(String dest) throws IOException {
        String actualText;

        // try-with-resources guarantees the document is closed even when parsing throws
        try (PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC))) {
            Rectangle rect = new Rectangle(36, 750, 523, 56);
            CustomFontFilter fontFilter = new CustomFontFilter(rect);
            FilteredEventListener listener = new FilteredEventListener();

            // Create a text extraction renderer
            LocationTextExtractionStrategy extractionStrategy = listener
                    .attachEventListener(new LocationTextExtractionStrategy(), fontFilter);

            // Note: If you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.reset()
            PdfCanvasProcessor parser = new PdfCanvasProcessor(listener);
            parser.processPageContent(pdfDoc.getFirstPage());

            // Get the resultant text after applying the custom filter
            actualText = extractionStrategy.getResultantText();
        }

        // See the resultant text in the console
        System.out.println(actualText);

        // Write with an explicit charset so the output does not depend on the platform default
        try (Writer writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(dest), "UTF-8"))) {
            writer.write(actualText);
        }
    }

    /*
     * The custom filter accepts only text whose font name ends with Bold or Oblique.
     * NOTE(review): accept() is fully overridden without calling super.accept(), so the
     * rectangle handed to the constructor is not consulted by this override - confirm
     * whether the region check is intentionally skipped.
     */
    protected class CustomFontFilter extends TextRegionEventFilter {
        public CustomFontFilter(Rectangle filterRect) {
            super(filterRect);
        }

        /**
         * @param data event data; cast to {@link TextRenderInfo} for RENDER_TEXT events
         * @param type kind of the event being filtered
         * @return {@code true} only for RENDER_TEXT events whose font name ends with
         *         "Bold" or "Oblique"; {@code false} otherwise, including when the font
         *         or its name is unavailable
         */
        @Override
        public boolean accept(IEventData data, EventType type) {
            if (EventType.RENDER_TEXT != type) {
                return false;
            }

            TextRenderInfo renderInfo = (TextRenderInfo) data;
            PdfFont font = renderInfo.getFont();
            if (font == null) {
                return false;
            }

            String fontName = font.getFontProgram().getFontNames().getFontName();
            // A font without a name cannot match either suffix
            return fontName != null
                    && (fontName.endsWith("Bold") || fontName.endsWith("Oblique"));
        }
    }
}
C#
C#
using System;
using System.IO;
using iText.Kernel.Font;
using iText.Kernel.Geom;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Data;
using iText.Kernel.Pdf.Canvas.Parser.Filter;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
namespace iText.Samples.Sandbox.Parse
{
/// <summary>
/// Sample that extracts text from the first page of <see cref="SRC"/> through a custom
/// event filter and writes the extracted text to the console and to <see cref="DEST"/>.
/// </summary>
public class ParseCustom
{
    public static readonly String DEST = "results/txt/parse_custom.txt";
    public static readonly String SRC = "../../../resources/pdfs/nameddestinations.pdf";

    /// <summary>Creates the output directory (if needed) and runs the sample.</summary>
    /// <param name="args">Command-line arguments; not used.</param>
    public static void Main(String[] args)
    {
        FileInfo file = new FileInfo(DEST);
        file.Directory.Create();
        new ParseCustom().ManipulatePdf(DEST);
    }

    /// <summary>
    /// Parses the first page of <see cref="SRC"/>, keeps the text accepted by
    /// <see cref="CustomFontFilter"/>, prints it and writes it to <paramref name="dest"/>.
    /// </summary>
    /// <param name="dest">Path of the text file to create.</param>
    public virtual void ManipulatePdf(String dest)
    {
        String actualText;

        // The using statement guarantees the document is closed even when parsing throws
        using (PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC)))
        {
            Rectangle rect = new Rectangle(36, 750, 523, 56);
            CustomFontFilter fontFilter = new CustomFontFilter(rect);
            FilteredEventListener listener = new FilteredEventListener();

            // Create a text extraction renderer
            LocationTextExtractionStrategy extractionStrategy = listener
                .AttachEventListener(new LocationTextExtractionStrategy(), fontFilter);

            // Note: If you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.Reset()
            new PdfCanvasProcessor(listener).ProcessPageContent(pdfDoc.GetFirstPage());

            // Get the resultant text after applying the custom filter
            actualText = extractionStrategy.GetResultantText();
        }

        // See the resultant text in the console
        Console.Out.WriteLine(actualText);

        using (StreamWriter writer = new StreamWriter(dest))
        {
            writer.Write(actualText);
        }
    }

    // The custom filter accepts only text whose font name ends with Bold or Oblique.
    // NOTE(review): Accept() is fully overridden without calling base.Accept(), so the
    // rectangle handed to the constructor is not consulted by this override - confirm
    // whether the region check is intentionally skipped.
    protected class CustomFontFilter : TextRegionEventFilter
    {
        public CustomFontFilter(Rectangle filterRect)
            : base(filterRect)
        {
        }

        /// <summary>Decides whether a parser event is passed on to the attached listener.</summary>
        /// <param name="data">Event data; cast to TextRenderInfo for RENDER_TEXT events.</param>
        /// <param name="type">Kind of the event being filtered.</param>
        /// <returns>
        /// <c>true</c> only for RENDER_TEXT events whose font name ends with "Bold" or
        /// "Oblique"; <c>false</c> otherwise, including when the font or its name is unavailable.
        /// </returns>
        public override bool Accept(IEventData data, EventType type)
        {
            if (!type.Equals(EventType.RENDER_TEXT))
            {
                return false;
            }

            TextRenderInfo renderInfo = (TextRenderInfo) data;
            PdfFont font = renderInfo.GetFont();
            if (font == null)
            {
                return false;
            }

            String fontName = font.GetFontProgram().GetFontNames().GetFontName();
            if (fontName == null)
            {
                // A font without a name cannot match either suffix
                return false;
            }

            // Font names are identifiers, not linguistic text: compare ordinally so the
            // result does not depend on the current culture
            return fontName.EndsWith("Bold", StringComparison.Ordinal)
                || fontName.EndsWith("Oblique", StringComparison.Ordinal);
        }
    }
}
}
parseczech
JAVA
JAVA
/*
This file is part of the iText (R) project.
Copyright (c) 1998-2023 Apryse Group NV
Authors: Apryse Software.
For more information, please contact iText Software at this address:
sales@itextpdf.com
*/
package com.itextpdf.samples.sandbox.parse;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
/**
 * Sample that extracts the text of the first page of {@link #SRC} and writes it,
 * encoded as UTF-8, to {@link #DEST}.
 */
public class ParseCzech {
    public static final String DEST = "./target/txt/czech.txt";
    public static final String SRC = "./src/main/resources/pdfs/czech.pdf";

    /**
     * Creates the output directory (if needed) and runs the sample.
     *
     * @param args command-line arguments; not used
     * @throws IOException if the source cannot be read or the destination cannot be written
     */
    public static void main(String[] args) throws IOException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        new ParseCzech().manipulatePdf(DEST);
    }

    /**
     * Extracts the text of the first page of {@link #SRC} and writes it to {@code dest}.
     *
     * @param dest path of the text file to create
     * @throws IOException if reading the PDF or writing the text file fails
     */
    protected void manipulatePdf(String dest) throws IOException {
        // try-with-resources guarantees the document is closed even when parsing or writing throws
        try (PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC))) {
            // Create a text extraction renderer
            LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();

            // Note: if you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.reset()
            PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
            parser.processPageContent(pdfDoc.getFirstPage());

            // Encode explicitly as UTF-8 so the output does not depend on the platform default
            byte[] content = strategy.getResultantText().getBytes("UTF-8");
            try (FileOutputStream stream = new FileOutputStream(dest)) {
                stream.write(content);
            }
        }
    }
}
C#
C#
using System;
using System.IO;
using System.Text;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
namespace iText.Samples.Sandbox.Parse
{
/// <summary>
/// Sample that extracts the text of the first page of <see cref="SRC"/> and writes it,
/// encoded as UTF-8, to <see cref="DEST"/>.
/// </summary>
public class ParseCzech
{
    public static readonly String DEST = "results/txt/czech.txt";
    public static readonly String SRC = "../../../resources/pdfs/czech.pdf";

    /// <summary>Creates the output directory (if needed) and runs the sample.</summary>
    /// <param name="args">Command-line arguments; not used.</param>
    public static void Main(String[] args)
    {
        FileInfo file = new FileInfo(DEST);
        file.Directory.Create();
        new ParseCzech().ManipulatePdf(DEST);
    }

    /// <summary>
    /// Extracts the text of the first page of <see cref="SRC"/> and writes it to
    /// <paramref name="dest"/>.
    /// </summary>
    /// <param name="dest">Path of the text file to create; an existing file is overwritten.</param>
    protected virtual void ManipulatePdf(String dest)
    {
        // The using statement guarantees the document is closed even when parsing or writing throws
        using (PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC)))
        {
            // Create a text extraction renderer
            LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();

            // Note: if you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.Reset()
            PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
            parser.ProcessPageContent(pdfDoc.GetFirstPage());

            // Encode explicitly as UTF-8 and write the bytes in one call
            byte[] array = Encoding.UTF8.GetBytes(strategy.GetResultantText());
            File.WriteAllBytes(dest, array);
        }
    }
}
}