|
@@ -0,0 +1,51 @@
|
|
|
+package com.simuwang.daq.utils;
|
|
|
+
|
|
|
+import org.apache.pdfbox.Loader;
|
|
|
+import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
|
|
+import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
+import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
+import org.apache.pdfbox.pdmodel.PDPageTree;
|
|
|
+import org.apache.pdfbox.text.PDFTextStripperByArea;
|
|
|
+
|
|
|
+import java.awt.geom.Rectangle2D;
|
|
|
+import java.io.IOException;
|
|
|
+import java.util.List;
|
|
|
+
|
|
|
+public class ReportParseUtil {
|
|
|
+ public static void main(String[] args) throws IOException {
|
|
|
+ try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf"))) {
|
|
|
+ PDPageTree pages = document.getPages();
|
|
|
+ for (int i = 0; i < pages.getCount(); i++) {
|
|
|
+ PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
|
+ stripper.setSortByPosition(true);
|
|
|
+
|
|
|
+ // 定义每个区域
|
|
|
+ defineAreas(stripper);
|
|
|
+
|
|
|
+ // 提取文本
|
|
|
+ PDPage page = document.getPage(i);
|
|
|
+ stripper.extractRegions(page);
|
|
|
+
|
|
|
+ List<String> regions = stripper.getRegions();
|
|
|
+ processRegions(stripper, regions);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private static void defineAreas(PDFTextStripperByArea stripper) {
|
|
|
+ // 定义区域,位置左上角作为原点,横坐标往右为x轴,纵坐标往下为y轴
|
|
|
+ stripper.addRegion("header", new Rectangle2D.Float(0, 0, 612, 180));
|
|
|
+ stripper.addRegion("content-survey", new Rectangle2D.Float(0, 180, 612, 180));
|
|
|
+ stripper.addRegion("content-current-fund", new Rectangle2D.Float(0, 360, 612, 240));
|
|
|
+ stripper.addRegion("content-sub-fund1", new Rectangle2D.Float(0, 600, 612, 100));
|
|
|
+ stripper.addRegion("content-sub-fund2", new Rectangle2D.Float(0, 700, 612, 150));
|
|
|
+ stripper.addRegion("footer", new Rectangle2D.Float(0, 850, 612, 30));
|
|
|
+ }
|
|
|
+
|
|
|
+ private static void processRegions(PDFTextStripperByArea stripper, List<String> regions) {
|
|
|
+ for (String region : regions) {
|
|
|
+ String text = stripper.getTextForRegion(region);
|
|
|
+ System.out.println(text);
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|