Explorar o código

feat:重写pdf表格提取的方法

wangzaijun hai 7 meses
pai
achega
cc98dbc670

+ 1 - 1
service-base/src/main/java/com/simuwang/base/common/conts/Constants.java

@@ -10,7 +10,7 @@ import cn.hutool.core.util.StrUtil;
  */
 public class Constants {
     public static final String EMPTY = StrUtil.EMPTY;
-    public static final String WATERMARK_REPLACE = "+_+";
+    public static final String WATERMARK_REPLACE = System.lineSeparator();
 
     public static final long DEFAULT_SERIAL_ID = 999L;
 

+ 7 - 1
service-daq/src/main/java/com/simuwang/daq/components/CustomPDFTextStripper.java

@@ -3,8 +3,10 @@ package com.simuwang.daq.components;
 import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.collection.ListUtil;
 import com.simuwang.base.common.conts.Constants;
+import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.TextPosition;
+import technology.tabula.TextStripper;
 
 import java.io.IOException;
 import java.util.List;
@@ -15,7 +17,11 @@ import java.util.stream.Collectors;
  * @date 2024/9/12 14:00
  * @description 自定义的文本去水印方法,发现水印基本是旋转文字并且比报告内其他文字都大
  */
-public class CustomPDFTextStripper extends PDFTextStripper {
+public class CustomPDFTextStripper extends TextStripper {
+    public CustomPDFTextStripper(PDDocument document, int pageNumber) throws IOException {
+        super(document, pageNumber);
+    }
+
     @Override
     protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
         // 水印文字基本都是有角度的,统计有旋转角度的文字宽度

+ 3 - 6
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractPDReportParser.java

@@ -14,10 +14,7 @@ import com.simuwang.daq.components.report.parser.AbstractReportParser;
 import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
 import org.apache.pdfbox.pdmodel.PDDocument;
-import technology.tabula.ObjectExtractor;
-import technology.tabula.Page;
-import technology.tabula.PageIterator;
-import technology.tabula.Table;
+import technology.tabula.*;
 import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
 
 import java.io.IOException;
@@ -42,7 +39,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
         // 解析报告名称和表格
         String reportName = null;
         try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
-            CustomPDFTextStripper stripper = new CustomPDFTextStripper();
+            CustomPDFTextStripper stripper = new CustomPDFTextStripper(document, 0);
             stripper.setSortByPosition(true);
             String text = stripper.getText(document).replace(Constants.WATERMARK_REPLACE, Constants.EMPTY);
             List<String> textList = StrUtil.split(text, System.lineSeparator());
@@ -55,7 +52,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
             }
             // 解析所有表格
             SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
-            PageIterator pageIterator = new ObjectExtractor(document).extract();
+            PageIterator pageIterator = new CustomObjectExtractor(document).extract();
             while (pageIterator.hasNext()) {
                 Page page = pageIterator.next();
                 List<Table> tables = extractionAlgorithm.extract(page);

+ 59 - 0
service-daq/src/main/java/technology/tabula/CustomObjectExtractor.java

@@ -0,0 +1,59 @@
+package technology.tabula;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+
+import java.io.IOException;
+
+/**
+ * {@link ObjectExtractor} subclass that supplies its own {@code extractPage} implementation.
+ * The class is declared in the {@code technology.tabula} package; presumably this is so it can
+ * use tabula types such as {@code ObjectExtractorStreamEngine} and {@code TextStripper} directly
+ * (TODO confirm their visibility against the tabula version in use).
+ *
+ * @author wangzaijun
+ * @date 2024/9/30 11:15
+ * @description Overridden object extractor.
+ */
+public class CustomObjectExtractor extends ObjectExtractor {
+    // Own reference to the document; extractPage reads it for page lookup and page building.
+    private final PDDocument pdfDocument;
+
+    /**
+     * Creates an extractor over the given document.
+     *
+     * @param pdfDocument the PDF document whose pages will be extracted; also passed to the parent
+     */
+    public CustomObjectExtractor(PDDocument pdfDocument) {
+        super(pdfDocument);
+        this.pdfDocument = pdfDocument;
+    }
+
+    /**
+     * Builds a tabula {@link Page} for a single page of the document.
+     *
+     * @param pageNumber 1-based page number; must lie in {@code [1, getNumberOfPages()]}
+     * @return a page populated with dimensions, rotation, rulings, text elements, minimum
+     *         character width/height and the spatial index gathered from the PDF page
+     * @throws IOException declared for the stream-engine and text-stripper processing calls
+     * @throws IndexOutOfBoundsException if {@code pageNumber} is outside the document's page range
+     */
+    @Override
+    protected Page extractPage(Integer pageNumber) throws IOException {
+        // Reject page numbers outside 1..N before touching the document.
+        if (pageNumber > pdfDocument.getNumberOfPages() || pageNumber < 1) {
+            throw new java.lang.IndexOutOfBoundsException("Page number does not exist.");
+        }
+        // PDDocument#getPage takes a zero-based index, hence the -1.
+        PDPage page = pdfDocument.getPage(pageNumber - 1);
+
+        // Run the stream engine over the page; its rulings field is read below when building the Page.
+        ObjectExtractorStreamEngine streamEngine = new ObjectExtractorStreamEngine(page);
+        streamEngine.processPage(page);
+
+        // NOTE(review): this instantiates the stock TextStripper, not CustomPDFTextStripper, so any
+        // watermark filtering implemented there is presumably not applied here — confirm intent.
+        TextStripper textStripper = new TextStripper(pdfDocument, pageNumber);
+        textStripper.process();
+
+        // Sort the collected text elements in place using tabula's ILL_DEFINED_ORDER comparator.
+        Utils.sort(textStripper.getTextElements(), Rectangle.ILL_DEFINED_ORDER);
+
+        float width, height;
+        int rotation = page.getRotation();
+        // For quarter-turn rotations the crop box's height and width are swapped.
+        if (Math.abs(rotation) == 90 || Math.abs(rotation) == 270) {
+            width = page.getCropBox().getHeight();
+            height = page.getCropBox().getWidth();
+        } else {
+            width = page.getCropBox().getWidth();
+            height = page.getCropBox().getHeight();
+        }
+
+        // Assemble the Page from the dimensions, rulings and text metrics gathered above.
+        return Page.Builder.newInstance()
+                .withPageDims(PageDims.of(0, 0, width, height))
+                .withRotation(rotation)
+                .withNumber(pageNumber)
+                .withPdPage(page)
+                .withPdDocument(pdfDocument)
+                .withRulings(streamEngine.rulings)
+                .withTextElements(textStripper.getTextElements())
+                .withMinCharWidth(textStripper.getMinCharWidth())
+                .withMinCharHeight(textStripper.getMinCharHeight())
+                .withIndex(textStripper.getSpatialIndex())
+                .build();
+    }
+}