Parsing PDFs
Examples written in answer to questions such as:
parsecustom
JAVA
JAVA
/*
* Example written by Bruno Lowagie in answer to:
* http://stackoverflow.com/questions/24506830/can-we-use-text-extraction-strategy-after-applying-location-extraction-strategy
*/
package sandbox.parse;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.FilteredTextRenderListener;
import com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.pdf.parser.RegionTextRenderFilter;
import com.itextpdf.text.pdf.parser.RenderFilter;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
import java.io.IOException;
public class ParseCustom {
public static final String SRC = "resources/pdfs/nameddestinations.pdf";
class FontRenderFilter extends RenderFilter {
public boolean allowText(TextRenderInfo renderInfo) {
String font = renderInfo.getFont().getPostscriptFontName();
return font.endsWith("Bold") || font.endsWith("Oblique");
}
}
public static void main(String[] args) throws IOException, DocumentException {
new ParseCustom().parse(SRC);
}
public void parse(String filename) throws IOException {
PdfReader reader = new PdfReader(filename);
Rectangle rect = new Rectangle(36, 750, 559, 806);
RenderFilter regionFilter = new RegionTextRenderFilter(rect);
FontRenderFilter fontFilter = new FontRenderFilter();
TextExtractionStrategy strategy = new FilteredTextRenderListener(
new LocationTextExtractionStrategy(), regionFilter, fontFilter);
System.out.println(PdfTextExtractor.getTextFromPage(reader, 1, strategy));
reader.close();
}
}
parseczech
JAVA
JAVA
/*
* Example written by Bruno Lowagie in answer to:
* http://stackoverflow.com/questions/26670919/itextsharp-diacritic-chars
*/
package sandbox.parse;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
/**
*
* @author Bruno Lowagie (iText Software)
*/
public class ParseCzech {
public static final String SRC = "resources/pdfs/czech.pdf";
public static final String DEST = "results/parse/czech.txt";
public static void main(String[] args) throws IOException, DocumentException {
File file = new File(DEST);
file.getParentFile().mkdirs();
new ParseCzech().parse(SRC);
}
public void parse(String filename) throws IOException {
PdfReader reader = new PdfReader(filename);
FileOutputStream fos = new FileOutputStream(DEST);
for (int page = 1; page <= 1; page++) {
fos.write(PdfTextExtractor.getTextFromPage(reader, page).getBytes("UTF-8"));
}
fos.flush();
fos.close();
}
}