9 月之前 · d082a45189
--- a/1.pdf
+++ b/1.pdf
--- a/pom.xml
+++ b/pom.xml
@@ -27,6 +27,7 @@
 
				         <java-jwt.version>4.4.0</java-jwt.version>
			
 
				         <jjwt.version>0.12.6</jjwt.version>
			
 
				         <apahce-pdfbox.version>3.0.3</apahce-pdfbox.version>
			
 
				+        <tabula.version>1.0.5</tabula.version>
			
 
				     </properties>
			
 
				 
			
 
				     <modules>
			
@@ -158,11 +159,31 @@
 
				                 <groupId>org.apache.pdfbox</groupId>
			
 
				                 <artifactId>pdfbox</artifactId>
			
 
				                 <version>${apahce-pdfbox.version}</version>
			
 
				+                <exclusions>
			
 
				+                    <exclusion>
			
 
				+                        <groupId>org.slf4j</groupId>
			
 
				+                        <artifactId>slf4j-simple</artifactId>
			
 
				+                    </exclusion>
			
 
				+                </exclusions>
			
 
				             </dependency>
			
 
				             <dependency>
			
 
				-                <groupId>org.apache.pdfbox</groupId>
			
 
				-                <artifactId>pdfbox-io</artifactId>
			
 
				-                <version>${apahce-pdfbox.version}</version>
			
 
				+                <groupId>technology.tabula</groupId>
			
 
				+                <artifactId>tabula</artifactId>
			
 
				+                <version>${tabula.version}</version>
			
 
				+                <exclusions>
			
 
				+                    <exclusion>
			
 
				+                        <groupId>org.apache.pdfbox</groupId>
			
 
				+                        <artifactId>pdfbox</artifactId>
			
 
				+                    </exclusion>
			
 
				+                    <exclusion>
			
 
				+                        <groupId>org.apache.pdfbox</groupId>
			
 
				+                        <artifactId>pdfbox-io</artifactId>
			
 
				+                    </exclusion>
			
 
				+                    <exclusion>
			
 
				+                        <groupId>org.slf4j</groupId>
			
 
				+                        <artifactId>slf4j-simple</artifactId>
			
 
				+                    </exclusion>
			
 
				+                </exclusions>
			
 
				             </dependency>
			
 
				 
			
 
				             <!-- 内部模块 -->
			
--- a/service-base/pom.xml
+++ b/service-base/pom.xml
@@ -174,30 +174,6 @@
 
				             <groupId>org.springframework.boot</groupId>
			
 
				             <artifactId>spring-boot-starter-quartz</artifactId>
			
 
				         </dependency>
			
 
				-
			
 
				-        <dependency>
			
 
				-            <groupId>org.apache.pdfbox</groupId>
			
 
				-            <artifactId>pdfbox</artifactId>
			
 
				-            <version>3.0.1</version>
			
 
				-            <exclusions>
			
 
				-                <exclusion>
			
 
				-                    <groupId>org.slf4j</groupId>
			
 
				-                    <artifactId>slf4j-simple</artifactId>
			
 
				-                </exclusion>
			
 
				-            </exclusions>
			
 
				-        </dependency>
			
 
				-
			
 
				-        <dependency>
			
 
				-            <groupId>technology.tabula</groupId>
			
 
				-            <artifactId>tabula</artifactId>
			
 
				-            <version>1.0.5</version>
			
 
				-            <exclusions>
			
 
				-                <exclusion>
			
 
				-                    <groupId>org.slf4j</groupId>
			
 
				-                    <artifactId>slf4j-simple</artifactId>
			
 
				-                </exclusion>
			
 
				-            </exclusions>
			
 
				-        </dependency>
			
 
				     </dependencies>
			
 
				 
			
 
				 <!--    <build>-->
			
--- a/service-daq/pom.xml
+++ b/service-daq/pom.xml
@@ -24,8 +24,8 @@
 
				             <artifactId>pdfbox</artifactId>
			
 
				         </dependency>
			
 
				         <dependency>
			
 
				-            <groupId>org.apache.pdfbox</groupId>
			
 
				-            <artifactId>pdfbox-io</artifactId>
			
 
				+            <groupId>technology.tabula</groupId>
			
 
				+            <artifactId>tabula</artifactId>
			
 
				         </dependency>
			
 
				     </dependencies>
			
 
				 
			
--- a/service-daq/src/main/java/com/simuwang/daq/components/AbstractReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/AbstractReportParser.java
@@ -0,0 +1,181 @@
 
				+package com.simuwang.daq.components;
			
 
				+
			
 
				+import cn.hutool.core.exceptions.ExceptionUtil;
			
 
				+import cn.hutool.core.map.MapUtil;
			
 
				+import cn.hutool.core.util.StrUtil;
			
 
				+import com.simuwang.daq.dto.ReportFundInfo;
			
 
				+import com.simuwang.daq.dto.ReportInfo;
			
 
				+import org.slf4j.Logger;
			
 
				+import org.slf4j.LoggerFactory;
			
 
				+import org.springframework.util.StopWatch;
			
 
				+
			
 
				+import java.io.IOException;
			
 
				+import java.util.*;
			
 
				+import java.util.concurrent.TimeUnit;
			
 
				+import java.util.regex.Matcher;
			
 
				+import java.util.regex.Pattern;
			
 
				+
			
 
				+public abstract class AbstractReportParser implements ReportParser {
			
 
				+    protected final Logger logger = LoggerFactory.getLogger(this.getClass());
			
 
				+    protected String filepath;
			
 
				+    protected Map<String, List<String>> watermarkListMap;
			
 
				+
			
 
				+    @Override
			
 
				+    public void parse(Integer fileId, String filepath, String watermarkName) {
			
 
				+        StopWatch watch = new StopWatch();
			
 
				+        watch.start();
			
 
				+        if (this.logger.isInfoEnabled()) {
			
 
				+            this.logger.info("报告{} 开始解析！", filepath);
			
 
				+        }
			
 
				+        this.filepath = filepath;
			
 
				+        this.watermarkListMap = this.generateWatermarkMap(watermarkName);
			
 
				+        try {
			
 
				+            this.initParse();
			
 
				+            ReportInfo reportInfo = this.parseReportInfo(fileId);
			
 
				+            ReportFundInfo reportFundInfo = this.parseBaseInfo();
			
 
				+            this.parseExtInfo();
			
 
				+        } catch (Exception e) {
			
 
				+            this.logger.error("报告{} 解析错误\n{}", filepath, ExceptionUtil.stacktraceToString(e));
			
 
				+        }
			
 
				+        this.saveResult();
			
 
				+        watch.stop();
			
 
				+        if (this.logger.isInfoEnabled()) {
			
 
				+            this.logger.info("报告{} 解析结束！耗时：{}s", filepath, watch.getTotalTime(TimeUnit.SECONDS));
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+    protected abstract void initParse() throws IOException;
			
 
				+
			
 
				+    protected abstract ReportInfo parseReportInfo(Integer fileId);
			
 
				+
			
 
				+    protected abstract ReportFundInfo parseBaseInfo();
			
 
				+
			
 
				+    protected abstract void parseExtInfo();
			
 
				+
			
 
				+    protected abstract void saveResult();
			
 
				+
			
 
				+    private Map<String, List<String>> generateWatermarkMap(String watermarkName) {
			
 
				+        Map<String, List<String>> result = MapUtil.newHashMap(32);
			
 
				+        // 生成水印列表
			
 
				+        String text = watermarkName;
			
 
				+        text = text.replaceAll("[()]", ""); // 移除括号
			
 
				+        List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
			
 
				+        Collections.reverse(textList);
			
 
				+        StringBuilder sb = new StringBuilder(textList.size());
			
 
				+        for (String ch : textList) {
			
 
				+            sb.append(ch);
			
 
				+        }
			
 
				+        String joinedText = sb.toString();
			
 
				+
			
 
				+        // 基本水印列表
			
 
				+        List<String> wkList = new ArrayList<>();
			
 
				+        for (String ch : textList) {
			
 
				+            wkList.add(ch + "\r\n");
			
 
				+            wkList.add("\r\n" + ch);
			
 
				+        }
			
 
				+
			
 
				+        // 查找数字
			
 
				+        List<String> matches = findDigits(watermarkName);
			
 
				+        if (!matches.isEmpty()) {
			
 
				+            for (String match : matches) {
			
 
				+                wkList.add("\r\n" + match);
			
 
				+                wkList.add(match + "\r\n");
			
 
				+            }
			
 
				+        }
			
 
				+        wkList.add("-");
			
 
				+        wkList.add("【");
			
 
				+        wkList.add("】");
			
 
				+        wkList.add("\r");
			
 
				+        wkList.add("\n");
			
 
				+        wkList.add("\r\n");
			
 
				+
			
 
				+        String noNumberText = removeDigits(joinedText);
			
 
				+
			
 
				+        // 生成不同字段的水印列表
			
 
				+        result.put("report_name", new ArrayList<>(wkList));
			
 
				+        result.get("report_name").addAll(convertStringToList("有限公司"));
			
 
				+
			
 
				+        result.put("less", new ArrayList<>(wkList));
			
 
				+
			
 
				+        result.put("more", new ArrayList<>(wkList));
			
 
				+        result.get("more").addAll(convertStringToList(noNumberText));
			
 
				+
			
 
				+        result.put("leverage", new ArrayList<>(wkList));
			
 
				+        result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
			
 
				+
			
 
				+        result.put("base_info", new ArrayList<>(wkList));
			
 
				+        result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
			
 
				+
			
 
				+        result.put("industry", new ArrayList<>(wkList));
			
 
				+        result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
			
 
				+
			
 
				+        result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
			
 
				+        return result;
			
 
				+    }
			
 
				+
			
 
				+    private List<String> findDigits(String text) {
			
 
				+        List<String> digits = new ArrayList<>();
			
 
				+        Pattern pattern = Pattern.compile("\\d");
			
 
				+        Matcher matcher = pattern.matcher(text);
			
 
				+        while (matcher.find()) {
			
 
				+            digits.add(matcher.group());
			
 
				+        }
			
 
				+        return digits;
			
 
				+    }
			
 
				+
			
 
				+    private String removeDigits(String text) {
			
 
				+        return text.replaceAll("\\d", "");
			
 
				+    }
			
 
				+
			
 
				+    private String removeKeywords(String text, String... keywords) {
			
 
				+        for (String keyword : keywords) {
			
 
				+            text = text.replaceAll(keyword, "");
			
 
				+        }
			
 
				+        return text;
			
 
				+    }
			
 
				+
			
 
				+    private List<String> convertStringToList(String text) {
			
 
				+        List<String> charList = new ArrayList<>();
			
 
				+        for (char c : text.toCharArray()) {
			
 
				+            charList.add(c + "");
			
 
				+        }
			
 
				+        return charList;
			
 
				+    }
			
 
				+
			
 
				+    protected String processString(List<String> wmList, String string) {
			
 
				+        if (StrUtil.isBlank(string)) {
			
 
				+            return null;
			
 
				+        }
			
 
				+        // 生成正则表达式模式
			
 
				+        String pat = String.join("|", wmList);
			
 
				+        // 使用正则表达式移除wmList中的元素
			
 
				+        string = removeMatches(string, pat);
			
 
				+        // 替换中文括号为英文括号
			
 
				+        string = string.replace("（", "(").replace("）", ")");
			
 
				+        // 移除空格
			
 
				+        string = string.replace(" ", "");
			
 
				+        // 如果字符串以括号开头，则移除第一个字符
			
 
				+        if (startsWithParenthesis(string)) {
			
 
				+            string = string.substring(1);
			
 
				+        }
			
 
				+
			
 
				+        return string;
			
 
				+    }
			
 
				+
			
 
				+    private String removeMatches(String input, String pattern) {
			
 
				+        // 编译正则表达式
			
 
				+        Pattern compiledPattern = Pattern.compile(pattern);
			
 
				+        // 创建Matcher对象
			
 
				+        Matcher matcher = compiledPattern.matcher(input);
			
 
				+        // 使用replaceAll方法替换所有匹配到的字符为空字符串
			
 
				+        return matcher.replaceAll("");
			
 
				+    }
			
 
				+
			
 
				+    private boolean startsWithParenthesis(String input) {
			
 
				+        // 匹配以括号开头的字符串
			
 
				+        Pattern pattern = Pattern.compile("^[()].*");
			
 
				+        Matcher matcher = pattern.matcher(input);
			
 
				+        return matcher.find();
			
 
				+    }
			
 
				+}
			
--- a/service-daq/src/main/java/com/simuwang/daq/components/CustomPDFTextStripper.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/CustomPDFTextStripper.java
@@ -0,0 +1,42 @@
 
				+package com.simuwang.daq.components;
			
 
				+
			
 
				+import cn.hutool.core.collection.CollUtil;
			
 
				+import cn.hutool.core.collection.ListUtil;
			
 
				+import org.apache.pdfbox.text.PDFTextStripper;
			
 
				+import org.apache.pdfbox.text.TextPosition;
			
 
				+import org.apache.pdfbox.util.Matrix;
			
 
				+
			
 
				+import java.io.IOException;
			
 
				+import java.util.List;
			
 
				+
			
 
				+/**
			
 
				+ * @author wangzaijun
			
 
				+ * @date 2024/9/12 14:00
			
 
				+ * @description 自定义的文本去水印方法，发现水印基本是旋转文字并且比报告内其他文字都大
			
 
				+ */
			
 
				+public class CustomPDFTextStripper extends PDFTextStripper {
			
 
				+    private final float[] watermarkWidth = {0f};
			
 
				+
			
 
				+    @Override
			
 
				+    protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
			
 
				+        List<String> newTexts = ListUtil.list(false);
			
 
				+        for (TextPosition textPosition : textPositions) {
			
 
				+            Matrix textMatrix = textPosition.getTextMatrix();
			
 
				+            float col = textMatrix.getValue(0, 1);
			
 
				+            float width = textPosition.getWidth();
			
 
				+            if (col == 0.) {
			
 
				+                if (width < watermarkWidth[0]) {
			
 
				+                    newTexts.add(textPosition.getUnicode());
			
 
				+                }
			
 
				+            } else {
			
 
				+                if (width > watermarkWidth[0]) {
			
 
				+                    watermarkWidth[0] = width;
			
 
				+                }
			
 
				+                newTexts.add("++");
			
 
				+            }
			
 
				+        }
			
 
				+        if (CollUtil.isNotEmpty(newTexts)) {
			
 
				+            super.writeString(String.join("", newTexts));
			
 
				+        }
			
 
				+    }
			
 
				+}
			
--- a/service-daq/src/main/java/com/simuwang/daq/components/PDMonthlyReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/PDMonthlyReportParser.java
@@ -0,0 +1,258 @@
 
				+package com.simuwang.daq.components;
			
 
				+
			
 
				+import cn.hutool.core.collection.CollUtil;
			
 
				+import cn.hutool.core.collection.ListUtil;
			
 
				+import cn.hutool.core.map.MapUtil;
			
 
				+import cn.hutool.core.util.ReflectUtil;
			
 
				+import cn.hutool.core.util.StrUtil;
			
 
				+import com.simuwang.base.common.exception.APIException;
			
 
				+import com.simuwang.base.mapper.EmailFieldMappingMapper;
			
 
				+import com.simuwang.base.pojo.dos.EmailFieldMappingDO;
			
 
				+import com.simuwang.daq.dto.ReportFundInfo;
			
 
				+import com.simuwang.daq.dto.ReportInfo;
			
 
				+import com.smppw.common.pojo.ValueLabelVO;
			
 
				+import org.apache.pdfbox.Loader;
			
 
				+import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
			
 
				+import org.apache.pdfbox.pdmodel.PDDocument;
			
 
				+import org.springframework.stereotype.Component;
			
 
				+import technology.tabula.*;
			
 
				+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
			
 
				+
			
 
				+import java.io.IOException;
			
 
				+import java.sql.Struct;
			
 
				+import java.util.Calendar;
			
 
				+import java.util.List;
			
 
				+import java.util.Map;
			
 
				+import java.util.regex.Matcher;
			
 
				+import java.util.regex.Pattern;
			
 
				+import java.util.stream.Collectors;
			
 
				+
			
 
				+/**
			
 
				+ * @author wangzaijun
			
 
				+ * @date 2024/9/11 16:19
			
 
				+ * @description pdf格式的月报解析
			
 
				+ */
			
 
				+@Component("monthly-report:pdf")
			
 
				+public class PDMonthlyReportParser extends AbstractReportParser {
			
 
				+    private final EmailFieldMappingMapper fieldMappingMapper;
			
 
				+    private String reportName = null;
			
 
				+    private Table baseInfoTable = null;
			
 
				+    private List<Table> extNavTables = ListUtil.list(true);
			
 
				+    private List<ValueLabelVO> fieldMapper = null;
			
 
				+
			
 
				+    public PDMonthlyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
			
 
				+        this.fieldMappingMapper = fieldMappingMapper;
			
 
				+    }
			
 
				+
			
 
				+    @Override
			
 
				+    protected void initParse() throws IOException {
			
 
				+        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(this.filepath))) {
			
 
				+            CustomPDFTextStripper stripper = new CustomPDFTextStripper();
			
 
				+            stripper.setSortByPosition(true);
			
 
				+            String text = stripper.getText(document);
			
 
				+            text = text.replace("++\r\n", "").replace("++", "");
			
 
				+            List<String> textList = StrUtil.split(text, "\r\n");
			
 
				+            if (CollUtil.isNotEmpty(textList)) {
			
 
				+                List<String> wkList = this.watermarkListMap.get("report_name");
			
 
				+                String name = this.processString(wkList, textList.get(0));
			
 
				+                this.reportName = this.matchReportName(name);
			
 
				+                if (StrUtil.isBlank(this.reportName)) {
			
 
				+                    throw new APIException("未匹配到报告名称");
			
 
				+                }
			
 
				+            }
			
 
				+
			
 
				+            SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
			
 
				+            PageIterator pageIterator = new ObjectExtractor(document).extract();
			
 
				+            while (pageIterator.hasNext()) {
			
 
				+                Page page = pageIterator.next();
			
 
				+                List<Table> tables = extractionAlgorithm.extract(page);
			
 
				+                tables = tables.stream().distinct().collect(Collectors.toList());
			
 
				+                for (Table table : tables) {
			
 
				+                    int colCount = table.getColCount();
			
 
				+                    if (colCount == 4) {
			
 
				+                        this.baseInfoTable = table;
			
 
				+                    } else if (colCount >= 5) {
			
 
				+                        this.extNavTables.add(table);
			
 
				+                    }
			
 
				+                }
			
 
				+            }
			
 
				+        }
			
 
				+        List<EmailFieldMappingDO> emailFieldMapping = this.fieldMappingMapper.getEmailFieldMapping();
			
 
				+        if (CollUtil.isNotEmpty(emailFieldMapping)) {
			
 
				+            this.fieldMapper = emailFieldMapping.stream().map(e -> new ValueLabelVO(e.getCode(), e.getName())).collect(Collectors.toList());
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    @Override
			
 
				+    protected ReportInfo parseReportInfo(Integer fileId) {
			
 
				+        ReportInfo reportInfo = new ReportInfo();
			
 
				+        reportInfo.setFileId(fileId);
			
 
				+        reportInfo.setReportName(this.reportName);
			
 
				+        reportInfo.setReportType(this.matchReportType(this.reportName));
			
 
				+        reportInfo.setReportDate(this.matchReportDate(this.reportName));
			
 
				+        return reportInfo;
			
 
				+    }
			
 
				+
			
 
				+    @Override
			
 
				+    protected ReportFundInfo parseBaseInfo() {
			
 
				+        Table baseInfoTable = this.baseInfoTable;
			
 
				+        if (baseInfoTable == null) {
			
 
				+            throw new APIException("未解析到基本信息表格");
			
 
				+        }
			
 
				+        Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
			
 
				+        for (int i = 0; i < baseInfoTable.getRows().size(); i++) {
			
 
				+            List<RectangularTextContainer> cols = baseInfoTable.getRows().get(i);
			
 
				+            for (int j = 0; j < 2; j++) {
			
 
				+                baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
			
 
				+            }
			
 
				+        }
			
 
				+        // 匹配字段清洗字段
			
 
				+        ReportFundInfo reportFundInfo = new ReportFundInfo();
			
 
				+        baseInfoMap.forEach((k, v) -> {
			
 
				+            String fieldValue = StrUtil.toStringOrNull(v);
			
 
				+            if (fieldValue.contains("-")) {
			
 
				+                fieldValue = null;
			
 
				+            }
			
 
				+            if (fieldValue != null) {
			
 
				+                fieldValue = fieldValue.replace("\r", "");
			
 
				+            }
			
 
				+            for (ValueLabelVO vo : this.fieldMapper) {
			
 
				+                String fieldName = vo.getValue();
			
 
				+                List<String> labels = StrUtil.split(vo.getLabel(), ",");
			
 
				+                if (labels.contains(k)) {
			
 
				+                    ReflectUtil.setFieldValue(reportFundInfo, fieldName, fieldValue);
			
 
				+                    break;
			
 
				+                }
			
 
				+                for (String label : labels) {
			
 
				+                    if (k.contains(label)) {
			
 
				+                        ReflectUtil.setFieldValue(reportFundInfo, fieldName, fieldValue);
			
 
				+                        break;
			
 
				+                    }
			
 
				+                }
			
 
				+            }
			
 
				+        });
			
 
				+        return reportFundInfo;
			
 
				+    }
			
 
				+
			
 
				+    @Override
			
 
				+    protected void parseExtInfo() {
			
 
				+
			
 
				+    }
			
 
				+
			
 
				+    @Override
			
 
				+    protected void saveResult() {
			
 
				+
			
 
				+    }
			
 
				+
			
 
				+    /**
			
 
				+     * 匹配报告日期
			
 
				+     *
			
 
				+     * @param string 文本内容
			
 
				+     * @return 报告日期
			
 
				+     */
			
 
				+    private String matchReportDate(String string) {
			
 
				+        if (string == null) {
			
 
				+            return null;
			
 
				+        }
			
 
				+
			
 
				+        // 编译正则表达式模式
			
 
				+        Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度");  // 2023年XXX3季度
			
 
				+        Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");  // 2023-12-31
			
 
				+        Pattern pat3 = Pattern.compile("(2\\d{3})年年度");  // 2023年年度
			
 
				+        Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月");  // 2023年12月
			
 
				+
			
 
				+        // 创建Matcher对象
			
 
				+        Matcher matcher1 = pat1.matcher(string);
			
 
				+        Matcher matcher2 = pat2.matcher(string);
			
 
				+        Matcher matcher3 = pat3.matcher(string);
			
 
				+        Matcher matcher4 = pat4.matcher(string);
			
 
				+
			
 
				+        // 尝试匹配
			
 
				+        if (matcher1.find()) {
			
 
				+            String year = matcher1.group(1);
			
 
				+            String quarter = matcher1.group(2);
			
 
				+            return switch (quarter) {
			
 
				+                case "一", "1" -> year + "-03-31";
			
 
				+                case "二", "2" -> year + "-06-30";
			
 
				+                case "三", "3" -> year + "-09-30";
			
 
				+                case "四", "4" -> year + "-12-31";
			
 
				+                default -> null;
			
 
				+            };
			
 
				+        } else if (matcher2.find()) {
			
 
				+            return matcher2.group();
			
 
				+        } else if (matcher3.find()) {
			
 
				+            return matcher3.group(1) + "-12-31";
			
 
				+        } else if (matcher4.find()) {
			
 
				+            String year = matcher4.group(1);
			
 
				+            String month = matcher4.group(2);
			
 
				+            int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
			
 
				+            return year + "-" + padZero(month) + "-" + padZero(lastDayOfMonth + "");
			
 
				+        } else {
			
 
				+            return null;
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    /**
			
 
				+     * 匹配报告类型，如“季度”、“年度”
			
 
				+     *
			
 
				+     * @param string 输入字符串
			
 
				+     * @return 匹配到的报告类型子字符串，如果没有匹配到则返回null
			
 
				+     */
			
 
				+    private String matchReportType(String string) {
			
 
				+        if (string == null) {
			
 
				+            return null;
			
 
				+        }
			
 
				+
			
 
				+        // 编译正则表达式模式
			
 
				+        Pattern pattern = Pattern.compile("月|季度|年度");
			
 
				+
			
 
				+        // 创建Matcher对象
			
 
				+        Matcher matcher = pattern.matcher(string);
			
 
				+
			
 
				+        // 尝试匹配
			
 
				+        if (matcher.find()) {
			
 
				+            return matcher.group();
			
 
				+        } else {
			
 
				+            return null;
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    private String matchReportName(String text) {
			
 
				+        if (StrUtil.isBlank(text)) {
			
 
				+            return null;
			
 
				+        }
			
 
				+        // 编译正则表达式模式
			
 
				+        Pattern pat1 = Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?");
			
 
				+        Pattern pat2 = Pattern.compile("私募.*披露年度报[告表](（\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}）)?");
			
 
				+        Pattern pat3 = Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}?");
			
 
				+
			
 
				+        // 创建Matcher对象
			
 
				+        Matcher matcher1 = pat1.matcher(text);
			
 
				+        Matcher matcher2 = pat2.matcher(text);
			
 
				+        Matcher matcher3 = pat3.matcher(text);
			
 
				+
			
 
				+        // 尝试匹配
			
 
				+        String reportName;
			
 
				+        if (matcher1.find()) {
			
 
				+            reportName = matcher1.group();
			
 
				+        } else if (matcher2.find()) {
			
 
				+            reportName = matcher2.group();
			
 
				+        } else if (matcher3.find()) {
			
 
				+            reportName = matcher3.group();
			
 
				+        } else {
			
 
				+            return null;
			
 
				+        }
			
 
				+        return reportName.replace("（", "(").replace("）", ")");
			
 
				+    }
			
 
				+
			
 
				+    private int getLastDayOfMonth(int year, int month) {
			
 
				+        Calendar calendar = Calendar.getInstance();
			
 
				+        calendar.set(Calendar.YEAR, year);
			
 
				+        calendar.set(Calendar.MONTH, month - 1); // Calendar.MONTH 是从0开始的
			
 
				+        return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
			
 
				+    }
			
 
				+
			
 
				+    private String padZero(String number) {
			
 
				+        return String.format("%02d", Integer.parseInt(number));
			
 
				+    }
			
 
				+}
			
--- a/service-daq/src/main/java/com/simuwang/daq/components/ReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/ReportParser.java
@@ -0,0 +1,18 @@
 
				+package com.simuwang.daq.components;
			
 
				+
			
 
				+/**
			
 
				+ * @author wangzaijun
			
 
				+ * @date 2024/9/9 19:18
			
 
				+ * @description 报告模板解析器，计划支持pdf、word等
			
 
				+ */
			
 
				+public interface ReportParser {
			
 
				+    /**
			
 
				+     * 报告模板解析接口
			
 
				+     * 扩展支持月报、季报和年报，解析文件格式支持pdf、word和excel
			
 
				+     *
			
 
				+     * @param fileId        文件id
			
 
				+     * @param filepath      文件路径
			
 
				+     * @param watermarkName 生成水印
			
 
				+     */
			
 
				+    void parse(Integer fileId, String filepath, String watermarkName);
			
 
				+}
			
--- a/service-daq/src/main/java/com/simuwang/daq/dto/ReportFileType.java
+++ b/service-daq/src/main/java/com/simuwang/daq/dto/ReportFileType.java
@@ -0,0 +1,18 @@
 
				+package com.simuwang.daq.dto;
			
 
				+
			
 
				+public enum ReportFileType {
			
 
				+    PDF("pdf"),
			
 
				+    DOCX("docx"),
			
 
				+    DOC("doc"),
			
 
				+    XLSX("xlsx");
			
 
				+
			
 
				+    private final String suffix;
			
 
				+
			
 
				+    ReportFileType(String suffix) {
			
 
				+        this.suffix = suffix;
			
 
				+    }
			
 
				+
			
 
				+    public String getSuffix() {
			
 
				+        return suffix;
			
 
				+    }
			
 
				+}
			
--- a/service-daq/src/main/java/com/simuwang/daq/dto/ReportFundInfo.java
+++ b/service-daq/src/main/java/com/simuwang/daq/dto/ReportFundInfo.java
@@ -0,0 +1,291 @@
 
				+package com.simuwang.daq.dto;
			
 
				+
			
 
				+/**
				+ * Fund information parsed from a report.
				+ * <p>
				+ * Plain mutable data holder: every property has a getter and a setter, and
				+ * {@link #toString()} lists all properties so an instance can be printed for diagnostics.
				+ *
				+ * @author wangzaijun
				+ * @date 2024/9/12 15:34
				+ */
				+public class ReportFundInfo {
				+    private String fundName;
				+    private String cFundName;
				+    /**
				+     * Whether the fund is a tiered fund
				+     */
				+    private Integer istiered;
				+    /**
				+     * Registration (filing) code
				+     */
				+    private String registerNumber;
				+    private String trustName;
				+    private String custodianName;
				+    private String advisorName;
				+    /**
				+     * Operation mode: open-ended or closed-ended
				+     */
				+    private String operationType;
				+    private String fundType;
				+    /**
				+     * Inception date
				+     */
				+    private String inceptionDate;
				+    private String sharePerAsset;
				+    private String investmentObjective;
				+    private String fundStrategyDescription;
				+    private String secondaryBenchmark;
				+    private String riskReturnDesc;
				+    private String realizedIncome;
				+    private String profit;
				+    private String fundAssetSize;
				+    private String nav;
				+    private String initTotalShares;
				+    private String subscription;
				+    private String redemption;
				+    private String split;
				+    /**
				+     * Description of the leverage information
				+     */
				+    private String leverageNote;
				+    /**
				+     * Leverage ratio
				+     */
				+    private String leverage;
				+    private String remark;
				+    private String industryTrend;
				+    private String fundManager;
				+    /**
				+     * Whether reviewed by the custodian
				+     */
				+    private String reviewed;
				+
				+    public String getFundName() {
				+        return fundName;
				+    }
				+
				+    public void setFundName(String fundName) {
				+        this.fundName = fundName;
				+    }
				+
				+    public String getcFundName() {
				+        return cFundName;
				+    }
				+
				+    public void setcFundName(String cFundName) {
				+        this.cFundName = cFundName;
				+    }
				+
				+    public Integer getIstiered() {
				+        return istiered;
				+    }
				+
				+    public void setIstiered(Integer istiered) {
				+        this.istiered = istiered;
				+    }
				+
				+    public String getRegisterNumber() {
				+        return registerNumber;
				+    }
				+
				+    public void setRegisterNumber(String registerNumber) {
				+        this.registerNumber = registerNumber;
				+    }
				+
				+    public String getTrustName() {
				+        return trustName;
				+    }
				+
				+    public void setTrustName(String trustName) {
				+        this.trustName = trustName;
				+    }
				+
				+    public String getCustodianName() {
				+        return custodianName;
				+    }
				+
				+    public void setCustodianName(String custodianName) {
				+        this.custodianName = custodianName;
				+    }
				+
				+    public String getAdvisorName() {
				+        return advisorName;
				+    }
				+
				+    public void setAdvisorName(String advisorName) {
				+        this.advisorName = advisorName;
				+    }
				+
				+    public String getOperationType() {
				+        return operationType;
				+    }
				+
				+    public void setOperationType(String operationType) {
				+        this.operationType = operationType;
				+    }
				+
				+    public String getFundType() {
				+        return fundType;
				+    }
				+
				+    public void setFundType(String fundType) {
				+        this.fundType = fundType;
				+    }
				+
				+    public String getInceptionDate() {
				+        return inceptionDate;
				+    }
				+
				+    public void setInceptionDate(String inceptionDate) {
				+        this.inceptionDate = inceptionDate;
				+    }
				+
				+    public String getSharePerAsset() {
				+        return sharePerAsset;
				+    }
				+
				+    public void setSharePerAsset(String sharePerAsset) {
				+        this.sharePerAsset = sharePerAsset;
				+    }
				+
				+    public String getInvestmentObjective() {
				+        return investmentObjective;
				+    }
				+
				+    public void setInvestmentObjective(String investmentObjective) {
				+        this.investmentObjective = investmentObjective;
				+    }
				+
				+    public String getFundStrategyDescription() {
				+        return fundStrategyDescription;
				+    }
				+
				+    public void setFundStrategyDescription(String fundStrategyDescription) {
				+        this.fundStrategyDescription = fundStrategyDescription;
				+    }
				+
				+    public String getSecondaryBenchmark() {
				+        return secondaryBenchmark;
				+    }
				+
				+    public void setSecondaryBenchmark(String secondaryBenchmark) {
				+        this.secondaryBenchmark = secondaryBenchmark;
				+    }
				+
				+    public String getRiskReturnDesc() {
				+        return riskReturnDesc;
				+    }
				+
				+    public void setRiskReturnDesc(String riskReturnDesc) {
				+        this.riskReturnDesc = riskReturnDesc;
				+    }
				+
				+    public String getRealizedIncome() {
				+        return realizedIncome;
				+    }
				+
				+    public void setRealizedIncome(String realizedIncome) {
				+        this.realizedIncome = realizedIncome;
				+    }
				+
				+    public String getProfit() {
				+        return profit;
				+    }
				+
				+    public void setProfit(String profit) {
				+        this.profit = profit;
				+    }
				+
				+    public String getFundAssetSize() {
				+        return fundAssetSize;
				+    }
				+
				+    public void setFundAssetSize(String fundAssetSize) {
				+        this.fundAssetSize = fundAssetSize;
				+    }
				+
				+    public String getNav() {
				+        return nav;
				+    }
				+
				+    public void setNav(String nav) {
				+        this.nav = nav;
				+    }
				+
				+    public String getInitTotalShares() {
				+        return initTotalShares;
				+    }
				+
				+    public void setInitTotalShares(String initTotalShares) {
				+        this.initTotalShares = initTotalShares;
				+    }
				+
				+    public String getSubscription() {
				+        return subscription;
				+    }
				+
				+    public void setSubscription(String subscription) {
				+        this.subscription = subscription;
				+    }
				+
				+    public String getRedemption() {
				+        return redemption;
				+    }
				+
				+    public void setRedemption(String redemption) {
				+        this.redemption = redemption;
				+    }
				+
				+    public String getSplit() {
				+        return split;
				+    }
				+
				+    public void setSplit(String split) {
				+        this.split = split;
				+    }
				+
				+    public String getLeverageNote() {
				+        return leverageNote;
				+    }
				+
				+    public void setLeverageNote(String leverageNote) {
				+        this.leverageNote = leverageNote;
				+    }
				+
				+    public String getLeverage() {
				+        return leverage;
				+    }
				+
				+    public void setLeverage(String leverage) {
				+        this.leverage = leverage;
				+    }
				+
				+    public String getRemark() {
				+        return remark;
				+    }
				+
				+    public void setRemark(String remark) {
				+        this.remark = remark;
				+    }
				+
				+    public String getIndustryTrend() {
				+        return industryTrend;
				+    }
				+
				+    public void setIndustryTrend(String industryTrend) {
				+        this.industryTrend = industryTrend;
				+    }
				+
				+    public String getFundManager() {
				+        return fundManager;
				+    }
				+
				+    public void setFundManager(String fundManager) {
				+        this.fundManager = fundManager;
				+    }
				+
				+    public String getReviewed() {
				+        return reviewed;
				+    }
				+
				+    public void setReviewed(String reviewed) {
				+        this.reviewed = reviewed;
				+    }
				+
				+    /**
				+     * Lists every property as {@code name=value}, so printing an instance shows the parsed
				+     * content instead of the default identity-hash form.
				+     *
				+     * @return text of the form {@code ReportFundInfo{fundName=..., ..., reviewed=...}}
				+     */
				+    @Override
				+    public String toString() {
				+        return "ReportFundInfo{"
				+                + "fundName=" + fundName
				+                + ", cFundName=" + cFundName
				+                + ", istiered=" + istiered
				+                + ", registerNumber=" + registerNumber
				+                + ", trustName=" + trustName
				+                + ", custodianName=" + custodianName
				+                + ", advisorName=" + advisorName
				+                + ", operationType=" + operationType
				+                + ", fundType=" + fundType
				+                + ", inceptionDate=" + inceptionDate
				+                + ", sharePerAsset=" + sharePerAsset
				+                + ", investmentObjective=" + investmentObjective
				+                + ", fundStrategyDescription=" + fundStrategyDescription
				+                + ", secondaryBenchmark=" + secondaryBenchmark
				+                + ", riskReturnDesc=" + riskReturnDesc
				+                + ", realizedIncome=" + realizedIncome
				+                + ", profit=" + profit
				+                + ", fundAssetSize=" + fundAssetSize
				+                + ", nav=" + nav
				+                + ", initTotalShares=" + initTotalShares
				+                + ", subscription=" + subscription
				+                + ", redemption=" + redemption
				+                + ", split=" + split
				+                + ", leverageNote=" + leverageNote
				+                + ", leverage=" + leverage
				+                + ", remark=" + remark
				+                + ", industryTrend=" + industryTrend
				+                + ", fundManager=" + fundManager
				+                + ", reviewed=" + reviewed
				+                + '}';
				+    }
				+}
			
--- a/service-daq/src/main/java/com/simuwang/daq/dto/ReportInfo.java
+++ b/service-daq/src/main/java/com/simuwang/daq/dto/ReportInfo.java
@@ -0,0 +1,54 @@
 
				+package com.simuwang.daq.dto;
			
 
				+
			
 
				+/**
				+ * Basic information about a report.
				+ *
				+ * @author wangzaijun
				+ * @date 2024/9/11 17:57
				+ */
				+public class ReportInfo {
				+    private Integer fileId;
				+    /**
				+     * Report name
				+     */
				+    private String reportName;
				+    /**
				+     * Report type (monthly, quarterly, annual)
				+     */
				+    private String reportType;
				+    /**
				+     * Report date
				+     */
				+    private String reportDate;
				+
				+    public Integer getFileId() {
				+        return this.fileId;
				+    }
				+
				+    public void setFileId(Integer fileId) {
				+        this.fileId = fileId;
				+    }
				+
				+    public String getReportName() {
				+        return this.reportName;
				+    }
				+
				+    public void setReportName(String reportName) {
				+        this.reportName = reportName;
				+    }
				+
				+    public String getReportType() {
				+        return this.reportType;
				+    }
				+
				+    public void setReportType(String reportType) {
				+        this.reportType = reportType;
				+    }
				+
				+    public String getReportDate() {
				+        return this.reportDate;
				+    }
				+
				+    public void setReportDate(String reportDate) {
				+        this.reportDate = reportDate;
				+    }
				+}
			
--- a/service-daq/src/main/java/com/simuwang/daq/service/ReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/service/ReportParser.java
@@ -1,13 +0,0 @@
 
				-package com.simuwang.daq.service;
			
 
				-
			
 
				-/**
			
 
				- * @author wangzaijun
			
 
				- * @date 2024/9/9 19:18
			
 
				- * @description 报告模板解析器，计划支持pdf、word等
			
 
				- */
			
 
				-public interface ReportParser {
			
 
				-    /**
			
 
				-     * 解析接口
			
 
				-     */
			
 
				-    void parse();
			
 
				-}
			
--- a/service-daq/src/main/java/com/simuwang/daq/utils/12931.pdf
+++ b/service-daq/src/main/java/com/simuwang/daq/utils/12931.pdf
--- a/service-daq/src/main/java/com/simuwang/daq/utils/12932.pdf
+++ b/service-daq/src/main/java/com/simuwang/daq/utils/12932.pdf
--- a/service-daq/src/main/java/com/simuwang/daq/utils/12933.pdf
+++ b/service-daq/src/main/java/com/simuwang/daq/utils/12933.pdf
--- a/service-daq/src/main/java/com/simuwang/daq/utils/14655.pdf
+++ b/service-daq/src/main/java/com/simuwang/daq/utils/14655.pdf
--- a/service-daq/src/main/java/com/simuwang/daq/utils/14916.pdf
+++ b/service-daq/src/main/java/com/simuwang/daq/utils/14916.pdf
--- a/service-daq/src/main/java/com/simuwang/daq/utils/15654.pdf
+++ b/service-daq/src/main/java/com/simuwang/daq/utils/15654.pdf
--- a/service-daq/src/main/java/com/simuwang/daq/utils/15655.pdf
+++ b/service-daq/src/main/java/com/simuwang/daq/utils/15655.pdf
--- a/service-daq/src/main/java/com/simuwang/daq/utils/17847.pdf
+++ b/service-daq/src/main/java/com/simuwang/daq/utils/17847.pdf
--- a/service-daq/src/main/java/com/simuwang/daq/utils/17850.pdf
+++ b/service-daq/src/main/java/com/simuwang/daq/utils/17850.pdf
--- a/service-daq/src/main/java/com/simuwang/daq/utils/ReportParseUtil.java
+++ b/service-daq/src/main/java/com/simuwang/daq/utils/ReportParseUtil.java
@@ -1,51 +1,348 @@
 
				 package com.simuwang.daq.utils;
			
 
				 
			
 
				+import cn.hutool.core.collection.CollUtil;
			
 
				+import cn.hutool.core.collection.ListUtil;
			
 
				+import cn.hutool.core.map.MapUtil;
			
 
				+import cn.hutool.core.util.ReflectUtil;
			
 
				+import cn.hutool.core.util.StrUtil;
			
 
				+import com.simuwang.daq.components.CustomPDFTextStripper;
			
 
				+import com.simuwang.daq.dto.ReportFundInfo;
			
 
				+import com.smppw.common.pojo.ValueLabelVO;
			
 
				 import org.apache.pdfbox.Loader;
			
 
				+import org.apache.pdfbox.contentstream.PDFStreamEngine;
			
 
				+import org.apache.pdfbox.contentstream.operator.text.ShowText;
			
 
				+import org.apache.pdfbox.cos.COSName;
			
 
				 import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
			
 
				 import org.apache.pdfbox.pdmodel.PDDocument;
			
 
				 import org.apache.pdfbox.pdmodel.PDPage;
			
 
				-import org.apache.pdfbox.pdmodel.PDPageTree;
			
 
				+import org.apache.pdfbox.pdmodel.PDResources;
			
 
				+import org.apache.pdfbox.pdmodel.common.PDStream;
			
 
				+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
			
 
				+import org.apache.pdfbox.pdmodel.graphics.PDXObject;
			
 
				+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
			
 
				+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
			
 
				+import org.apache.pdfbox.text.PDFTextStripper;
			
 
				 import org.apache.pdfbox.text.PDFTextStripperByArea;
			
 
				+import org.apache.pdfbox.text.TextPosition;
			
 
				+import org.apache.pdfbox.util.Matrix;
			
 
				+import technology.tabula.*;
			
 
				+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
			
 
				 
			
 
				 import java.awt.geom.Rectangle2D;
			
 
				 import java.io.IOException;
			
 
				-import java.util.List;
			
 
				+import java.util.*;
			
 
				+import java.util.regex.Matcher;
			
 
				+import java.util.regex.Pattern;
			
 
				+import java.util.stream.Collectors;
			
 
				 
			
 
				 public class ReportParseUtil {
			
 
				+    /**
				+     * Ad-hoc driver for trying out the PDF parsing steps.
				+     * <p>
				+     * Loads one PDF from a hard-coded path, extracts its text with {@code CustomPDFTextStripper}
				+     * and prints the first line, then runs tabula's {@code SpreadsheetExtractionAlgorithm} over
				+     * every page and maps each 4-column table into a {@code ReportFundInfo}, which is printed.
				+     * <p>
				+     * NOTE(review): the PDF path is an absolute path on a developer machine, so this only runs there.
				+     *
				+     * @param args not read
				+     * @throws IOException propagated from loading the document or extracting its text
				+     */
				     public static void main(String[] args) throws IOException {
				+        // Each entry pairs a ReportFundInfo field name (value) with the table label(s) (label)
				+        // that feed it; the label is split on "," further down, so it may hold several aliases.
				+        List<ValueLabelVO> fieldMapper = ListUtil.list(false);
				+        fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
				+        fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
				+        fieldMapper.add(new ValueLabelVO("operationType", "基金运作方式"));
				+        fieldMapper.add(new ValueLabelVO("fundType", "基金类别"));
				+        fieldMapper.add(new ValueLabelVO("inceptionDate", "基金成立日期"));
				+        fieldMapper.add(new ValueLabelVO("trustName", "基金托管人"));
				+        fieldMapper.add(new ValueLabelVO("custodianName", "基金管理人"));
				+        fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
				+        fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
				+
				+        // Only referenced by the commented-out region-stripping code below.
				+        Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业（有限合伙）", null);
				+        List<String> watermarks = watermarkMap.get("less");
				+
				+//        System.out.println(watermarks);
				+//        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
				         try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf"))) {
				-            PDPageTree pages = document.getPages();
				-            for (int i = 0; i < pages.getCount(); i++) {
				-                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
				-                stripper.setSortByPosition(true);
				+//            PDFTextStripper stripper = new PDFTextStripper();
				+//            stripper.setSortByPosition(true);
				+//            String allText = stripper.getText(document);
				+//            List<String> textList = StrUtil.split(allText, "\r\n");
				+//            System.out.println(textList);
				 
				-                // 定义每个区域
				-                defineAreas(stripper);
				+            PDFTextStripper textStripper = new CustomPDFTextStripper();
				+            textStripper.setSortByPosition(true);
				+            String text1 = textStripper.getText(document);
				+            // Drops every "+" (together with a directly following CRLF) from the extracted text;
				+            // presumably "+" is a marker produced by CustomPDFTextStripper - TODO confirm.
				+            text1 = text1.replace("+\r\n", "").replace("+","");
				+            List<String> textList = StrUtil.split(text1, "\r\n");
				+            System.out.println(textList.get(0));
				 
				-                // 提取文本
				-                PDPage page = document.getPage(i);
				-                stripper.extractRegions(page);
				+//            for (PDPage page : document.getPages()) {
				+//
				+////                PDResources resources = page.getResources();
				+////                Map<COSName, PDImageXObject> imageXObjectMap = findImageWatermark(page);
				+////                Iterator<COSName> iterator = resources.getXObjectNames().iterator();
				+////                while (iterator.hasNext()) {
				+////                    COSName next = iterator.next();
				+////                    if (imageXObjectMap.containsKey(next)) {
				+////                        iterator.remove();
				+////                    }
				+////                }
				+////                removeTextWatermark(page);
				+//
				+//                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
				+//                stripper.setSortByPosition(true);
				+//                stripper.addRegion("page", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
				+//                stripper.extractRegions(page);
				+//                for (String region : stripper.getRegions()) {
				+//                    String text = stripper.getTextForRegion(region);
				+//                    String res = processString(watermarks, text);
				+//                    System.out.println("原数据：" + text + ", 去除水印后数据：" + res);
				+//                }
				+//            }
				+//            document.save(new File("./1.pdf"));
				 
				-                List<String> regions = stripper.getRegions();
				-                processRegions(stripper, regions);
				+            SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
				+            PageIterator pageIterator = new ObjectExtractor(document).extract();
				+            while (pageIterator.hasNext()) {
				+                Page page = pageIterator.next();
				+                List<Table> tables = extractionAlgorithm.extract(page);
				+                tables = tables.stream().distinct().collect(Collectors.toList());
				+                for (Table table : tables) {
				+                    // Only 4-column tables are read: each row is taken as two (label, value)
				+                    // pairs, cells 0/1 and cells 2/3.
				+                    if (table.getColCount() == 4) {
				+                        Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
				+                        for (int i = 0; i < table.getRows().size(); i++) {
				+                            List<RectangularTextContainer> cols = table.getRows().get(i);
				+                            // NOTE(review): assumes every row really holds 4 cells; cols.get would
				+                            // throw on a shorter row - confirm against tabula's row model.
				+                            for (int j = 0; j < 2; j++) {
				+                                baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
				+                            }
				+                        }
				+                        ReportFundInfo reportFundInfo = new ReportFundInfo();
				+                        baseInfoMap.forEach((k, v) -> {
				+                            for (ValueLabelVO vo : fieldMapper) {
				+                                String fieldName = vo.getValue();
				+                                List<String> labels = StrUtil.split(vo.getLabel(), ",");
				+                                // Exact label match: set the field and stop scanning the mapper.
				+                                if (labels.contains(k)) {
				+                                    ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
				+                                    break;
				+                                }
				+                                // Fallback: the table label merely contains a mapper label.
				+                                for (String label : labels) {
				+                                    if (k.contains(label)) {
				+                                        ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
				+                                        // NOTE(review): this break only leaves the inner label loop; the
				+                                        // outer fieldMapper loop keeps going, so a later mapper entry can
				+                                        // also receive this value - confirm that is intended.
				+                                        break;
				+                                    }
				+                                }
				+                            }
				+                        });
				+                        System.out.println(reportFundInfo);
				+                    }
				+                }
				             }
				         }
				     }
			
 
				 
			
 
				-    private static void defineAreas(PDFTextStripperByArea stripper) {
			
 
				-        // 定义区域，位置左上角作为原点，横坐标往右为x轴，纵坐标往下为y轴
			
 
				-        stripper.addRegion("header", new Rectangle2D.Float(0, 0, 612, 180));
			
 
				-        stripper.addRegion("content-survey", new Rectangle2D.Float(0, 180, 612, 180));
			
 
				-        stripper.addRegion("content-current-fund", new Rectangle2D.Float(0, 360, 612, 240));
			
 
				-        stripper.addRegion("content-sub-fund1", new Rectangle2D.Float(0, 600, 612, 100));
			
 
				-        stripper.addRegion("content-sub-fund2", new Rectangle2D.Float(0, 700, 612, 150));
			
 
				-        stripper.addRegion("footer", new Rectangle2D.Float(0, 850, 612, 30));
			
 
				+    /**
				+     * Finds image watermarks.
				+     * <p>
				+     * Walks every XObject name in the page's resources and wraps that XObject's stream in a
				+     * new {@code PDImageXObject}. No type check is made before wrapping, so the result is not
				+     * limited to XObjects that are images.
				+     * NOTE(review): confirm whether non-image XObjects are meant to be included.
				+     *
				+     * @param page page whose resources are scanned
				+     * @return map from XObject name to the {@code PDImageXObject} built from that XObject's stream;
				+     *         entries whose wrapping threw are left out
				+     * @throws IOException propagated from {@code resources.getXObject}, which is called outside the try block
				+     */
				+    public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
				+        Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
				+        PDResources resources = page.getResources();
				+        Iterable<COSName> xObjectNames = resources.getXObjectNames();
				+        for (COSName xObjectName : xObjectNames) {
				+            PDXObject xObject = resources.getXObject(xObjectName);
				+            PDStream stream = xObject.getStream();
				+            PDImageXObject imageXObject = null;
				+            try {
				+                imageXObject = new PDImageXObject(stream, resources);
				+            } catch (Exception e) {
				+                // Best effort: the failure is only printed and this XObject is skipped.
				+                e.printStackTrace();
				+            }
				+            if (imageXObject != null) {
				+                watermarkMap.put(xObjectName, imageXObject);
				+            }
				+        }
				+        return watermarkMap;
				     }
			
 
				 
			
 
				-    private static void processRegions(PDFTextStripperByArea stripper, List<String> regions) {
			
 
				-        for (String region : regions) {
			
 
				-            String text = stripper.getTextForRegion(region);
			
 
				-            System.out.println(text);
			
 
				+    /**
				+     * Builds, per field category, the list of strings to strip from extracted text as watermark residue.
				+     * <p>
				+     * The three inputs (blank ones replaced by fallback literals) are concatenated, ASCII
				+     * parentheses are removed and the remaining characters are de-duplicated through a
				+     * {@code HashSet}. The base list holds every such character twice, once followed by
				+     * {@code "\r\n"} and once preceded by it, then the digits of the fund name in the same two
				+     * forms, then {@code "-"}, {@code "【"}, {@code "】"}, {@code "\r"} and {@code "\r\n"}.
				+     * Every map entry except "market_value" starts from its own copy of the base list; most
				+     * of them append further single characters.
				+     *
				+     * @param fundName       fund name; "私募证券投资基金" is used when blank
				+     * @param trustName      "资产管理有限公司" is used when blank
				+     * @param registerNumber an empty string is used when blank
				+     * @return map with the keys "report_name", "less", "more", "leverage", "base_info",
				+     *         "industry" and "market_value"
				+     */
				+    private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
				+        Map<String, List<String>> result = MapUtil.newHashMap(32);
				+        // Build the watermark lists
				+
				+        fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
				+        trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
				+        registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
				+        String text = fundName + trustName + registerNumber;
				+        // NOTE(review): the character class only covers ASCII "(" and ")"; full-width "（" / "）"
				+        // stay in the text and end up in the character list - confirm that is intended.
				+        text = text.replaceAll("[()]", ""); // strip parentheses
				+        // NOTE(review): HashSet iteration order is unspecified, so the reversal below does not
				+        // restore any order of the original text - confirm what the reverse is meant to achieve.
				+        List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
				+        Collections.reverse(textList);
				+        StringBuilder sb = new StringBuilder(textList.size());
				+        for (String ch : textList) {
				+            sb.append(ch);
				+        }
				+        String joinedText = sb.toString();
				+
				+        // Base watermark list: each character glued to a CRLF on either side
				+        List<String> wkList = new ArrayList<>();
				+        for (String ch : textList) {
				+            wkList.add(ch + "\r\n");
				+            wkList.add("\r\n" + ch);
				+        }
				+
				+        // Find the digits of the fund name (one entry per digit occurrence)
				+        List<String> matches = findDigits(fundName);
				+        if (!matches.isEmpty()) {
				+            for (String match : matches) {
				+                wkList.add("\r\n" + match);
				+                wkList.add(match + "\r\n");
				+            }
				         }
				+        wkList.add("-");
				+        wkList.add("【");
				+        wkList.add("】");
				+        wkList.add("\r");
				+        wkList.add("\r\n");
				+
				+        String noNumberText = removeDigits(joinedText);
				+
				+        // Build the watermark list of each field category
				+        result.put("report_name", new ArrayList<>(wkList));
				+        result.get("report_name").addAll(convertStringToList("有限公司"));
				+
				+        result.put("less", new ArrayList<>(wkList));
				+
				+        result.put("more", new ArrayList<>(wkList));
				+        result.get("more").addAll(convertStringToList(noNumberText));
				+
				+        // NOTE(review): removeKeywords removes each keyword as one contiguous piece. noNumberText is
				+        // built from de-duplicated characters in set order, so the multi-character keywords here
				+        // and for "industry" only match if those characters happen to be adjacent - confirm
				+        // whether per-character removal was intended.
				+        result.put("leverage", new ArrayList<>(wkList));
				+        result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
				+
				+        // Unlike the other categories, this one filters the original concatenated text (digits
				+        // and duplicates included), not noNumberText.
				+        result.put("base_info", new ArrayList<>(wkList));
				+        result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
				+
				+        result.put("industry", new ArrayList<>(wkList));
				+        result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
				+
				+        // "market_value" does not start from the base list; it only holds a line feed.
				+        result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
				+        return result;
				     }
			
 
				+
			
 
				+    /**
				+     * Collects every ASCII digit of the text, in order of appearance, one list entry per digit.
				+     *
				+     * @param text text to scan
				+     * @return the digits found, each as a one-character string; empty when there are none
				+     */
				+    private static List<String> findDigits(String text) {
				+        List<String> found = new ArrayList<>();
				+        for (int i = 0; i < text.length(); i++) {
				+            char current = text.charAt(i);
				+            // Same set as the regex \d without UNICODE_CHARACTER_CLASS: '0'..'9' only.
				+            if (current >= '0' && current <= '9') {
				+                found.add(String.valueOf(current));
				+            }
				+        }
				+        return found;
				+    }
			
 
				+
			
 
				+    /**
				+     * Returns the text with every ASCII digit removed.
				+     *
				+     * @param text text to filter
				+     * @return the text without the characters '0'..'9'
				+     */
				+    private static String removeDigits(String text) {
				+        StringBuilder kept = new StringBuilder(text.length());
				+        for (char current : text.toCharArray()) {
				+            if (current < '0' || current > '9') {
				+                kept.append(current);
				+            }
				+        }
				+        return kept.toString();
				+    }
			
 
				+
			
 
				+    /**
				+     * Removes every occurrence of each keyword from the text, in the order the keywords are given.
				+     * <p>
				+     * Keywords are matched literally. They used to be passed to {@code String.replaceAll},
				+     * which reads them as regular expressions, so a keyword such as "." removed every
				+     * character and one such as "+" threw a {@code PatternSyntaxException}.
				+     *
				+     * @param text     text to filter
				+     * @param keywords literal substrings to remove; null or empty entries are skipped
				+     * @return the text with all keyword occurrences removed
				+     */
				+    private static String removeKeywords(String text, String... keywords) {
				+        for (String keyword : keywords) {
				+            if (keyword == null || keyword.isEmpty()) {
				+                continue;
				+            }
				+            text = text.replace(keyword, "");
				+        }
				+        return text;
				+    }
			
 
				+
			
 
				+    /**
				+     * Splits the text into a list holding one string per character.
				+     * <p>
				+     * Iterates by Unicode code point, so a supplementary character (stored as a surrogate
				+     * pair) stays one entry instead of being split into two unpaired surrogates. For text
				+     * made only of BMP characters the result is one entry per {@code char}, as before.
				+     *
				+     * @param text text to split
				+     * @return the characters of the text in order; empty for an empty text
				+     */
				+    private static List<String> convertStringToList(String text) {
				+        List<String> charList = new ArrayList<>(text.length());
				+        text.codePoints().forEach(codePoint -> charList.add(new String(Character.toChars(codePoint))));
				+        return charList;
				+    }
			
 
				+
			
 
				+    /**
				+     * Strips watermark fragments from a piece of extracted text and normalises it.
				+     * <p>
				+     * Steps, in order: remove every occurrence of the given watermark strings, convert
				+     * full-width parentheses to ASCII ones, remove spaces, and drop the first character if
				+     * it is a parenthesis.
				+     * <p>
				+     * Watermark strings are matched literally. They used to be joined into a regular
				+     * expression unescaped, so an entry containing a metacharacter either matched unrelated
				+     * text (".") or made the pattern invalid ("*", "+", "(").
				+     * <p>
				+     * NOTE(review): alternatives are tried in list order, so a "\r" entry listed before
				+     * "\r\n" wins on a bare CRLF and leaves the "\n" behind. That order is kept as is here.
				+     *
				+     * @param wmList watermark strings to remove; null or empty entries are skipped
				+     * @param string text to clean
				+     * @return the cleaned text
				+     */
				+    public static String processString(List<String> wmList, String string) {
				+        // Build one alternation of quoted literals, keeping the caller's order.
				+        String pat = wmList.stream()
				+                .filter(wm -> wm != null && !wm.isEmpty())
				+                .map(Pattern::quote)
				+                .collect(Collectors.joining("|"));
				+        // An empty alternation would match nothing useful, so skip the regex pass entirely.
				+        if (!pat.isEmpty()) {
				+            string = removeMatches(string, pat);
				+        }
				+        // Replace full-width parentheses with ASCII ones
				+        string = string.replace("（", "(").replace("）", ")");
				+        // Remove spaces
				+        string = string.replace(" ", "");
				+        // Drop the first character when the text starts with a parenthesis
				+        if (startsWithParenthesis(string)) {
				+            string = string.substring(1);
				+        }
				+
				+        return string;
				+    }
				+
				+    /**
				+     * Removes every match of a regular expression from the input.
				+     *
				+     * @param input   text to filter
				+     * @param pattern regular expression whose matches are removed
				+     * @return the input without the matched parts
				+     */
				+    private static String removeMatches(String input, String pattern) {
				+        return Pattern.compile(pattern).matcher(input).replaceAll("");
				+    }
				+
				+    /**
				+     * Tells whether the input begins with an ASCII "(" or ")".
				+     *
				+     * @param input text to test
				+     * @return true when the first character is "(" or ")"; false for an empty text
				+     */
				+    private static boolean startsWithParenthesis(String input) {
				+        if (input.isEmpty()) {
				+            return false;
				+        }
				+        char first = input.charAt(0);
				+        return first == '(' || first == ')';
				+    }
			
 
				+
			
 
				+//    public static void removeTextWatermark(PDPage page) throws IOException {
			
 
				+//        PDResources resources = page.getResources();
			
 
				+////        if (StrUtil.isAllBlank(fundName, trustName)) {
			
 
				+////            return;
			
 
				+////        }
			
 
				+//        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
			
 
				+//        stripper.setSortByPosition(true);
			
 
				+//        stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
			
 
				+//        stripper.extractRegions(page);
			
 
				+//
			
 
				+//        PDFStreamEngine engine = new PDFTextStripper();
			
 
				+//        engine.addOperator(new SetMatrix(stripper));
			
 
				+//
			
 
				+//    }
			
 
				+//
			
 
				+//    private static void processResources(PDResources resources) throws IOException {
			
 
				+//        for (COSName name : resources.getXObjectNames()) {
			
 
				+//            PDXObject xobject = resources.getXObject(name);
			
 
				+//            if (xobject instanceof PDFormXObject) {
			
 
				+//                PDFormXObject formXObject = (PDFormXObject) xobject;
			
 
				+//                writeTokensToStream(formXObject.getContentStream(),
			
 
				+//                        createTokensWithoutText(formXObject));
			
 
				+//                processResources(formXObject.getResources());
			
 
				+//            }
			
 
				+//        }
			
 
				+//        for (COSName name : resources.getPatternNames()) {
			
 
				+//            PDAbstractPattern pattern = resources.getPattern(name);
			
 
				+//            if (pattern instanceof PDTilingPattern) {
			
 
				+//                PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
			
 
				+//                writeTokensToStream(tilingPattern.getContentStream(),
			
 
				+//                        createTokensWithoutText(tilingPattern));
			
 
				+//                processResources(tilingPattern.getResources());
			
 
				+//            }
			
 
				+//        }
			
 
				+//    }
			
 
				+//
			
 
				+//    private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
			
 
				+//        try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
			
 
				+//            ContentStreamWriter writer = new ContentStreamWriter(out);
			
 
				+//            writer.writeTokens(newTokens);
			
 
				+//        }
			
 
				+//    }
			
 
				+//
			
 
				+//    private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
			
 
				+//        PDFStreamParser parser = new PDFStreamParser(contentStream);
			
 
				+//        Object token = parser.parseNextToken();
			
 
				+//        List<Object> newTokens = new ArrayList<>();
			
 
				+//        while (token != null) {
			
 
				+//            if (token instanceof Operator op) {
			
 
				+//                String opName = op.getName();
			
 
				+//                if (OperatorName.SET_MATRIX.equals(opName)) {
			
 
				+//                    // remove the argument to this operator
			
 
				+//                    newTokens.remove(newTokens.size() - 1);
			
 
				+//
			
 
				+//                    token = parser.parseNextToken();
			
 
				+//                    continue;
			
 
				+//                }
			
 
				+//            }
			
 
				+//            newTokens.add(token);
			
 
				+//            token = parser.parseNextToken();
			
 
				+//        }
			
 
				+//        return newTokens;
			
 
				+//    }
			
 
				 }