浏览代码

feat:pdf报告模板解析开始

wangzaijun 7 月之前
父节点
当前提交
2f066a2580

+ 13 - 0
pom.xml

@@ -26,6 +26,7 @@
         <apache-shiro.version>2.0.1</apache-shiro.version>
         <java-jwt.version>4.4.0</java-jwt.version>
         <jjwt.version>0.12.6</jjwt.version>
+        <apahce-pdfbox.version>3.0.3</apahce-pdfbox.version>
     </properties>
 
     <modules>
@@ -152,6 +153,18 @@
                 <version>${jjwt.version}</version>
             </dependency>
 
+            <!-- PDF parsing (Apache PDFBox). Both artifacts take their version from the
+                 apahce-pdfbox.version property.
+                 NOTE(review): "apahce" looks like a misspelling of "apache"; renaming it means changing
+                 the property declaration and both references below at the same time. -->
+            <dependency>
+                <groupId>org.apache.pdfbox</groupId>
+                <artifactId>pdfbox</artifactId>
+                <version>${apahce-pdfbox.version}</version>
+            </dependency>
+            <dependency>
+                <groupId>org.apache.pdfbox</groupId>
+                <artifactId>pdfbox-io</artifactId>
+                <version>${apahce-pdfbox.version}</version>
+            </dependency>
+
             <!-- 内部模块 -->
             <dependency>
                 <groupId>com.simuwang</groupId>

+ 8 - 0
service-daq/pom.xml

@@ -17,5 +17,13 @@
             <groupId>com.simuwang</groupId>
             <artifactId>service-base</artifactId>
         </dependency>
+        <!-- PDF parsing (Apache PDFBox). No version is declared here; presumably it is managed by the parent pom (TODO confirm). -->
+        <dependency>
+            <groupId>org.apache.pdfbox</groupId>
+            <artifactId>pdfbox</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.pdfbox</groupId>
+            <artifactId>pdfbox-io</artifactId>
+        </dependency>
     </dependencies>
 </project>

+ 13 - 0
service-daq/src/main/java/com/simuwang/daq/service/ReportParser.java

@@ -0,0 +1,13 @@
+package com.simuwang.daq.service;
+
+/**
+ * Contract for report template parsers.
+ *
+ * @author wangzaijun
+ * @date 2024/9/9 19:18
+ * @description Report template parser; planned to support PDF, Word and other formats.
+ */
+public interface ReportParser {
+    /**
+     * Parsing entry point.
+     *
+     * <p>The method takes no arguments and returns nothing, so how an implementation
+     * receives its input and exposes its result is not defined by this interface
+     * (NOTE(review): presumably supplied by the implementing class; confirm once
+     * implementations exist).
+     */
+    void parse();
+}

二进制
service-daq/src/main/java/com/simuwang/daq/utils/2061834.pdf


+ 51 - 0
service-daq/src/main/java/com/simuwang/daq/utils/ReportParseUtil.java

@@ -0,0 +1,51 @@
+package com.simuwang.daq.utils;
+
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDPageTree;
+import org.apache.pdfbox.text.PDFTextStripperByArea;
+
+import java.awt.geom.Rectangle2D;
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Scratch driver for PDF report parsing: loads a PDF with PDFBox, cuts every page into a
+ * fixed set of horizontal bands and prints the text found in each band to standard output.
+ */
+public class ReportParseUtil {
+    /**
+     * Sample report used when no path is given on the command line. This is an absolute path
+     * on the original author's machine and is kept only so the previous no-argument
+     * invocation keeps working there.
+     */
+    private static final String DEFAULT_PDF_PATH = "D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf";
+
+    /**
+     * Extracts and prints the region text of every page of a PDF.
+     *
+     * @param args optional; {@code args[0]}, when present, is the path of the PDF to read,
+     *             otherwise {@link #DEFAULT_PDF_PATH} is used
+     * @throws IOException if the file cannot be opened or its content cannot be processed
+     */
+    public static void main(String[] args) throws IOException {
+        // Allow the file to be chosen by the caller instead of being tied to one workstation.
+        String pdfPath = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
+
+        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(pdfPath))) {
+            PDPageTree pages = document.getPages();
+            // The page tree is iterable, so walk it directly rather than by index.
+            for (PDPage page : pages) {
+                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
+                stripper.setSortByPosition(true);
+
+                // Register the regions to extract.
+                defineAreas(stripper);
+
+                // Extract the text of every registered region from this page.
+                stripper.extractRegions(page);
+
+                List<String> regions = stripper.getRegions();
+                processRegions(stripper, regions);
+            }
+        }
+    }
+
+    /**
+     * Registers the named page bands on the given stripper.
+     *
+     * <p>Coordinates use the top-left corner of the page as origin, with x growing to the
+     * right and y growing downwards. Every band is 612 units wide and the bands stack from
+     * y = 0 to y = 880.
+     * NOTE(review): the band boundaries look tuned to the sample report's layout; confirm
+     * they fit the page size of other reports.
+     *
+     * @param stripper the stripper that receives the region definitions
+     */
+    private static void defineAreas(PDFTextStripperByArea stripper) {
+        stripper.addRegion("header", new Rectangle2D.Float(0, 0, 612, 180));
+        stripper.addRegion("content-survey", new Rectangle2D.Float(0, 180, 612, 180));
+        stripper.addRegion("content-current-fund", new Rectangle2D.Float(0, 360, 612, 240));
+        stripper.addRegion("content-sub-fund1", new Rectangle2D.Float(0, 600, 612, 100));
+        stripper.addRegion("content-sub-fund2", new Rectangle2D.Float(0, 700, 612, 150));
+        stripper.addRegion("footer", new Rectangle2D.Float(0, 850, 612, 30));
+    }
+
+    /**
+     * Prints the extracted text of each region, in the order the region names are given.
+     *
+     * @param stripper the stripper on which {@code extractRegions} has already been called
+     * @param regions  the region names to print
+     */
+    private static void processRegions(PDFTextStripperByArea stripper, List<String> regions) {
+        for (String region : regions) {
+            String text = stripper.getTextForRegion(region);
+            System.out.println(text);
+        }
+    }
+}