Parcourir la source

feat:支持定期报告解析

wangzaijun il y a 2 semaines
Parent
commit
4cc01ff6ba

Fichier diff supprimé car celui-ci est trop grand
+ 113 - 15
mo-daq/db/init.sql


+ 25 - 13
mo-daq/src/main/java/com/smppw/modaq/application/components/CustomPDFTextStripper.java

@@ -9,8 +9,6 @@ import org.apache.pdfbox.text.TextPosition;
 import java.io.IOException;
 import java.util.List;
 
-import static com.smppw.modaq.common.conts.Constants.WATERMARK_REPLACE;
-
 /**
  * @author wangzaijun
  * @date 2024/9/12 14:00
@@ -18,6 +16,24 @@ import static com.smppw.modaq.common.conts.Constants.WATERMARK_REPLACE;
  * @see CustomTabulaTextStripper 区别于表格文字去水印的实现
  */
 public class CustomPDFTextStripper extends PDFTextStripper {
+    private final boolean sortByPosition;
+    private final String wordSeparator;
+
+    public CustomPDFTextStripper(boolean sortByPosition, String wordSeparator) {
+        this.sortByPosition = sortByPosition;
+        this.wordSeparator = wordSeparator;
+    }
+
+    @Override
+    public boolean getSortByPosition() {
+        return sortByPosition;
+    }
+
+    @Override
+    public String getWordSeparator() {
+        return wordSeparator;
+    }
+
     @Override
     protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
         // 水印文字基本都是有角度的,统计有旋转角度的文字高度
@@ -30,21 +46,17 @@ public class CustomPDFTextStripper extends PDFTextStripper {
             // 如果旋转角度除90的余数大于0.1就说明是水印文字
             if (degrees % 90. > 0.1) {
                 rotationTexts.add(textPosition);
-                newTexts.add(WATERMARK_REPLACE);
             } else {
-                newTexts.add(textPosition.getUnicode());
+                String unicode = textPosition.getUnicode();
+                if (StrUtil.isBlank(unicode)) {
+                    continue;
+                }
+                newTexts.add(unicode);
             }
         }
         // 集合为空表示text的内容没有水印影响,直接输出该内容
-        if (CollUtil.isEmpty(rotationTexts)) {
-            super.writeString(text);
-            return;
-        }
-        // 如果全是水印文字则直接去除
-        if (textPositions.size() == rotationTexts.size()) {
-            super.writeString(WATERMARK_REPLACE);
-            return;
+        if (CollUtil.isNotEmpty(newTexts)) {
+            super.writeString(String.join(StrUtil.EMPTY, newTexts));
         }
-        super.writeString(String.join(StrUtil.EMPTY, newTexts));
     }
 }

+ 1 - 2
mo-daq/src/main/java/com/smppw/modaq/application/components/CustomTabulaTextStripper.java

@@ -15,7 +15,6 @@ import technology.tabula.Utils;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Objects;
 
 /**
  * @author wangzaijun
@@ -80,7 +79,7 @@ public class CustomTabulaTextStripper extends TextStripper {
             float h = textPosition.getHeightDir();
 
             if (c.equals(NBSP)) { // replace non-breaking space for space
-                c = " ";
+                c = "";
             }
 
             // 文字没有旋转角度,并且水印字体大小没有包含当前文字时说明是正常文字

+ 7 - 9
mo-daq/src/main/java/com/smppw/modaq/application/components/ReportParseUtils.java

@@ -195,7 +195,7 @@ public final class ReportParseUtils {
             }
             if (replaceParentheses) {
                 // 正则表达式匹配中文括号及其内容,并替换为空字符串
-                fieldValue = Pattern.compile("[(|(][^)]*[)|)]").matcher(fieldValue).replaceAll(StrUtil.EMPTY);
+                fieldValue = Pattern.compile("\\([^)]*\\)").matcher(fieldValue).replaceAll(StrUtil.EMPTY);
             }
         }
         // 如果仅有 “-” 该字段值为null
@@ -297,12 +297,11 @@ public final class ReportParseUtils {
     public static ReportType matchReportType(String string) {
         // 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报
         ReportType reportType = null;
-//        if (StrUtil.containsAny(string, ReportType.QUARTERLY.getPatterns())) {
-//            reportType = ReportType.QUARTERLY;
-//        } else if (StrUtil.containsAny(string, ReportType.ANNUALLY.getPatterns())) {
-//            reportType = ReportType.ANNUALLY;
-//        } else
-        if (StrUtil.containsAny(string, ReportType.MONTHLY.getPatterns())) {
+        if (StrUtil.containsAny(string, ReportType.QUARTERLY.getPatterns())) {
+            reportType = ReportType.QUARTERLY;
+        } else if (StrUtil.containsAny(string, ReportType.ANNUALLY.getPatterns())) {
+            reportType = ReportType.ANNUALLY;
+        } else if (StrUtil.containsAny(string, ReportType.MONTHLY.getPatterns())) {
             reportType = ReportType.MONTHLY;
 //        } else if (StrUtil.containsAny(string, ReportType.WEEKLY.getPatterns())) {
 //            reportType = ReportType.WEEKLY;
@@ -408,8 +407,7 @@ public final class ReportParseUtils {
         // 解析报告和表格
         try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(filepath))) {
             // 识别所有文字(去水印后的)
-            CustomPDFTextStripper stripper = new CustomPDFTextStripper();
-            stripper.setSortByPosition(true);
+            CustomPDFTextStripper stripper = new CustomPDFTextStripper(true, "");
             String text = stripper.getText(document).replace(Constants.WATERMARK_REPLACE, StrUtil.EMPTY);
             textList = StrUtil.split(text, System.lineSeparator());
             textList.removeIf(StrUtil::isBlank);

+ 7 - 11
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/pdf/AbstractPDReportParser.java

@@ -10,7 +10,6 @@ import cn.hutool.json.JSONUtil;
 import com.smppw.modaq.application.components.CustomPDFTextStripper;
 import com.smppw.modaq.application.components.ReportParseUtils;
 import com.smppw.modaq.application.components.report.parser.AbstractReportParser;
-import com.smppw.modaq.common.conts.Constants;
 import com.smppw.modaq.common.enums.ReportParseStatus;
 import com.smppw.modaq.common.enums.ReportType;
 import com.smppw.modaq.common.exception.ReportParseException;
@@ -65,9 +64,8 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
         // 解析报告和表格
         try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(filepath))) {
             // 识别所有文字(去水印后的)
-            CustomPDFTextStripper stripper = new CustomPDFTextStripper();
-            stripper.setSortByPosition(true);
-            String text = stripper.getText(document).replace(Constants.WATERMARK_REPLACE, StrUtil.EMPTY);
+            CustomPDFTextStripper stripper = new CustomPDFTextStripper(true, StrUtil.EMPTY);
+            String text = stripper.getText(document);
             this.textList = StrUtil.split(text, System.lineSeparator());
             this.textList.removeIf(StrUtil::isBlank);
             if (this.textList.isEmpty()) {
@@ -188,15 +186,10 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
         List<DTO> dtos = ListUtil.list(true);
         // 信息表格字段和值映射
         List<Map<String, Object>> infos = ListUtil.list(true);
-        Map<String, Object> infoMap = null;
         for (Table table : tables) {
+            Map<String, Object> infoMap = MapUtil.newHashMap(16);
             Map<String, Object> temp = function.apply(table);
             for (String key : temp.keySet()) {
-                // 如果infoMap为null,先声明然后放在infos中
-                if (infoMap == null) {
-                    infoMap = MapUtil.newHashMap(16);
-                    infos.add(infoMap);
-                }
                 // 如果infoMap中包含了该key时,先放infos中然后重新声明新map对象
                 if (infoMap.containsKey(key)) {
                     infos.add(new HashMap<>(infoMap));
@@ -205,6 +198,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
                     infoMap.put(key, temp.get(key));
                 }
             }
+            infos.add(infoMap);
         }
         // 分级基金匹配
         List<String> levels = ReportParseUtils.matchTieredFund(String.join(",", this.textList));
@@ -213,7 +207,9 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
             if (dto == null) {
                 continue;
             }
-            dto.setLevel(levels.get(i));
+            if (levels.size() > i) {
+                dto.setLevel(levels.get(i));
+            }
             dtos.add(dto);
         }
         return dtos;

+ 33 - 3
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/pdf/PDAnnuallyReportParser.java

@@ -41,6 +41,10 @@ public class PDAnnuallyReportParser extends PDQuarterlyReportParser<AnnuallyRepo
 
     @Override
     protected void initTableInfo(List<Table> tables) {
+        Map<Integer, List<Table>> spanningPageFinancialIndicatorsTableMap = MapUtil.newHashMap(8, true);
+        Map<Integer, List<Table>> spanningPageShareChangeTableMap = MapUtil.newHashMap(8, true);
+        int fi = 0;
+        int sci = 0;
         for (int i = 0; i < tables.size(); i++) {
             Table table = tables.get(i);
             if (i <= 1) {
@@ -50,14 +54,34 @@ public class PDAnnuallyReportParser extends PDQuarterlyReportParser<AnnuallyRepo
             // 用表格的第一列的数据判断是否主要财务指标数据
             List<String> texts = this.getTableColTexts(table, 0);
             if (CollUtil.containsAny(texts, ReportParseUtils.FINANCIAL_INDICATORS_COLUMN_NAMES)) {
-                this.financialIndicatorsTables.add(table);
+                if (table.getRowCount() == 10) {
+                    fi++;
+                    this.financialIndicatorsTables.add(table);
+                } else {
+                    List<Table> tempList = spanningPageFinancialIndicatorsTableMap.getOrDefault(fi, ListUtil.list(true));
+                    tempList.add(table);
+                    spanningPageFinancialIndicatorsTableMap.putIfAbsent(fi, tempList);
+                    if (tempList.size() == 2) {
+                        fi++;
+                    }
+                }
                 continue;
             }
             int colCount = table.getColCount();
             if (colCount == 2) {
                 // 用表格的第一列的数据判断是否份额变动记录
                 if (CollUtil.containsAny(texts, ReportParseUtils.SHARE_CHANGE_COLUMN_NAMES)) {
-                    this.shareChangeTables.add(table);
+                    if (table.getRowCount() == 5) {
+                        sci++;
+                        this.shareChangeTables.add(table);
+                    } else {
+                        List<Table> tempList = spanningPageShareChangeTableMap.getOrDefault(sci, ListUtil.list(true));
+                        tempList.add(table);
+                        spanningPageShareChangeTableMap.putIfAbsent(sci, tempList);
+                        if (tempList.size() == 2) {
+                            sci++;
+                        }
+                    }
                 }
             } else if (colCount == 4) {
                 // 用表格的第二列的数据判断是否行业配置数据(内地)
@@ -79,6 +103,10 @@ public class PDAnnuallyReportParser extends PDQuarterlyReportParser<AnnuallyRepo
                 }
             }
         }
+        // 跨页的财务信息记录表(包括表头一共有10行)
+        this.handleSpanningPageTables(this.financialIndicatorsTables, spanningPageFinancialIndicatorsTableMap);
+        // 跨页的份额变动记录表(包括表头一共有5行)
+        this.handleSpanningPageTables(this.shareChangeTables, spanningPageShareChangeTableMap);
     }
 
     @Override
@@ -127,7 +155,9 @@ public class PDAnnuallyReportParser extends PDQuarterlyReportParser<AnnuallyRepo
                 }
                 ReportFinancialIndicatorsDTO dto = new ReportFinancialIndicatorsDTO(fileId);
                 this.buildInfo(infoMap, dto);
-                dto.setLevel(levels.get(k));
+                if (levels.size() > k) {
+                    dto.setLevel(levels.get(k));
+                }
                 dtos.add(dto);
             }
         }

+ 23 - 2
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/pdf/PDMonthlyReportParser.java

@@ -2,6 +2,8 @@ package com.smppw.modaq.application.components.report.parser.pdf;
 
 import cn.hutool.core.collection.ListUtil;
 import cn.hutool.core.map.MapUtil;
+import cn.hutool.core.util.StrUtil;
+import com.smppw.modaq.application.components.ReportParseUtils;
 import com.smppw.modaq.application.components.report.parser.ReportParserConstant;
 import com.smppw.modaq.domain.dto.report.*;
 import com.smppw.modaq.domain.mapper.EmailFieldMappingMapper;
@@ -49,10 +51,24 @@ public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportD
             }
             if (colCount == 4) {
                 this.fundInfoTable = table;
-            } else if (colCount >= 5) {
+            } else if (colCount == 5 && rowCount == 2) {
                 this.extNavTables.add(table);
             }
         }
+        // 跨页表格合并处理
+        List<Table> spanningPageTables = tables.stream().filter(e -> e.getRowCount() == 1)
+                .filter(e -> e.getColCount() == 5).toList();
+        for (int i = 0; i < spanningPageTables.size(); i = i + 2) {
+            if (i + 1 >= spanningPageTables.size()) {
+                continue;
+            }
+            Table keyTable = spanningPageTables.get(i);
+            Table valueTable = spanningPageTables.get(i + 1);
+            for (int j = 0; j < valueTable.getColCount(); j++) {
+                keyTable.add(valueTable.getCell(0, j), 1, j);
+            }
+            this.extNavTables.add(keyTable);
+        }
     }
 
     @Override
@@ -76,8 +92,13 @@ public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportD
         List<ReportNetReportDTO> dtos = this.buildLevelDto(reportInfo.getFileId(), this.extNavTables,
                 ReportNetReportDTO.class, t -> {
                     Map<String, Object> extInfoMap = MapUtil.newHashMap(16);
+                    // 限制只能两行数据
+                    if (t.getRowCount() != 2) {
+                        return extInfoMap;
+                    }
                     for (int i = 0; i < t.getColCount(); i++) {
-                        String key = t.getCell(0, i).getText();
+                        String key = ReportParseUtils.cleaningValue(t.getCell(0, i).getText());
+                        key = StrUtil.subBefore(key, "(", false);
                         String value = t.getCell(1, i).getText();
                         extInfoMap.put(key, value);
                     }

+ 19 - 2
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/pdf/PDQuarterlyReportParser.java

@@ -109,7 +109,8 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
             @SuppressWarnings("all")
             List<RectangularTextContainer> cols = fundInfoTable.getRows().get(i);
             for (int j = 0; j < 1; j++) {
-                baseInfoMap.put(cols.get(j).getText(), cols.get(j + 1).getText());
+                String key = ReportParseUtils.cleaningValue(cols.get(j).getText());
+                baseInfoMap.put(key, cols.get(j + 1).getText());
             }
         }
         return baseInfoMap;
@@ -121,7 +122,7 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
         Function<Table, Map<String, Object>> function = t -> {
             Map<String, Object> extInfoMap = MapUtil.newHashMap(16);
             for (int i = 0; i < t.getRowCount(); i++) {
-                String key = t.getCell(i, 0).getText();
+                String key = ReportParseUtils.cleaningValue(t.getCell(i, 0).getText());
                 String value = t.getCell(i, 1).getText();
                 extInfoMap.put(key, value);
             }
@@ -281,4 +282,20 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
         }
         return details;
     }
+
+    /**
+     * Merges tables that were split across a page break and inserts each merged table into
+     * {@code tables} at the index recorded for it.
+     * <p>
+     * Used for both the financial-indicators tables and the share-change tables: each map entry
+     * holds the fragments of one logical table, first fragment first, keyed by the position the
+     * merged table should occupy in {@code tables}.
+     *
+     * @param tables               target list that receives the merged tables (modified in place)
+     * @param spanningPageTableMap target index -> fragments of one table, in page order
+     */
+    protected void handleSpanningPageTables(List<Table> tables, Map<Integer, List<Table>> spanningPageTableMap) {
+        for (Map.Entry<Integer, List<Table>> entry : spanningPageTableMap.entrySet()) {
+            List<Table> fragments = entry.getValue();
+            if (fragments == null || fragments.isEmpty()) {
+                continue;
+            }
+            // The first fragment becomes the merged table; a fragment whose continuation was never
+            // found is kept as-is instead of failing on a missing second element.
+            Table master = fragments.get(0);
+            int rowOffset = master.getRowCount();
+            for (Table fragment : fragments.subList(1, fragments.size())) {
+                int fragmentRows = fragment.getRowCount();
+                for (int j = 0; j < fragmentRows; j++) {
+                    for (int k = 0; k < fragment.getColCount(); k++) {
+                        // Append the continuation rows below the rows already merged.
+                        master.add(fragment.getCell(j, k), rowOffset + j, k);
+                    }
+                }
+                rowOffset += fragmentRows;
+            }
+            // Clamp the index: List.add(int, E) throws when the index is greater than the size.
+            int index = Math.max(0, Math.min(entry.getKey(), tables.size()));
+            tables.add(index, master);
+        }
+    }
 }

+ 83 - 0
mo-daq/src/main/java/com/smppw/modaq/domain/dto/report/ReportFundInfoDTO.java

@@ -4,6 +4,8 @@ import com.smppw.modaq.domain.entity.report.ReportFundInfoDO;
 import lombok.Getter;
 import lombok.Setter;
 
+import java.util.Objects;
+
 /**
  * @author wangzaijun
  * @date 2024/9/26 16:47
@@ -32,6 +34,67 @@ public class ReportFundInfoDTO extends BaseReportDTO<ReportFundInfoDO> {
      */
     private String currency;
 
+    /**
+     * 投资顾问
+     */
+    private String advisorName;
+    /**
+     * 基金托管人
+     */
+    private String custodianName;
+    /**
+     * 基金经理描述
+     */
+    private String fundManager;
+    /**
+     * 投资策略
+     */
+    private String fundStrategyDescription;
+    /**
+     * 基金成立日期
+     */
+    private String inceptionDate;
+    /**
+     * 行业趋势
+     */
+    private String industryTrend;
+    /**
+     * 投资目标
+     */
+    private String investmentObjective;
+    /**
+     * 杠杆比例
+     */
+    private String leverage;
+    /**
+     * 杠杆比例描述
+     */
+    private String leverageNote;
+    /**
+     * 基金运作方式
+     */
+    private String operationType;
+    /**
+     * 备案编码
+     */
+    private String registerNumber;
+    /**
+     * 风险收益特征
+     */
+    private String riskReturnDesc;
+    /**
+     * 业绩比较基准
+     */
+    private String secondaryBenchmark;
+    /**
+     * 基金到期日期
+     */
+    private String dueDate;
+    /**
+     * 信息披露报告是否经托管机构复核
+     */
+    private String isReviewed;
+
     public ReportFundInfoDTO() {
         super();
     }
@@ -48,6 +111,22 @@ public class ReportFundInfoDTO extends BaseReportDTO<ReportFundInfoDO> {
         entity.setCompanyName(this.companyName);
         entity.setCurrency(this.currency);
         entity.setFundName(this.fundName);
+        entity.setAdvisorName(this.advisorName);
+        entity.setCustodianName(this.custodianName);
+        entity.setFundManager(this.fundManager);
+        entity.setFundName(this.fundName);
+        entity.setFundStrategyDescription(this.fundStrategyDescription);
+        entity.setInceptionDate(this.toDate(this.inceptionDate));
+        entity.setIndustryTrend(this.industryTrend);
+        entity.setInvestmentObjective(this.investmentObjective);
+        entity.setLeverage(this.toBigDecimal(this.leverage));
+        entity.setLeverageNote(this.leverageNote);
+        entity.setOperationType(this.operationType);
+        entity.setRegisterNumber(this.registerNumber);
+        entity.setRiskReturnDesc(this.riskReturnDesc);
+        entity.setSecondaryBenchmark(this.secondaryBenchmark);
+        entity.setDueDate(this.toDate(this.dueDate));
+        entity.setReviewed(Objects.equals("是", this.isReviewed) ? 1 : 0);
         this.initEntity(entity);
         return entity;
     }
@@ -56,8 +135,12 @@ public class ReportFundInfoDTO extends BaseReportDTO<ReportFundInfoDO> {
     public String toString() {
         return "{" +
                 super.toString() +
+                ", advisorName='" + advisorName + '\'' +
                 ", fundName='" + fundName + '\'' +
+                ", inceptionDate='" + inceptionDate + '\'' +
+                ", registerNumber='" + registerNumber + '\'' +
                 ", companyName='" + companyName + '\'' +
+                ", dueDate='" + dueDate + '\'' +
                 '}';
     }
 }

+ 66 - 0
mo-daq/src/main/java/com/smppw/modaq/domain/entity/report/ReportFundInfoDO.java

@@ -1,9 +1,13 @@
 package com.smppw.modaq.domain.entity.report;
 
+import com.baomidou.mybatisplus.annotation.TableField;
 import com.baomidou.mybatisplus.annotation.TableName;
 import lombok.Getter;
 import lombok.Setter;
 
+import java.math.BigDecimal;
+import java.util.Date;
+
 /**
  * @author wangzaijun
  * @date 2024/9/26 16:47
@@ -32,4 +36,66 @@ public class ReportFundInfoDO extends BaseReportDO {
      * 基金交易使用的货币种类
      */
     private String currency;
+
+    /**
+     * 投资顾问
+     */
+    private String advisorName;
+    /**
+     * 基金托管人
+     */
+    private String custodianName;
+    /**
+     * 基金经理描述
+     */
+    private String fundManager;
+    /**
+     * 投资策略
+     */
+    private String fundStrategyDescription;
+    /**
+     * 基金成立日期
+     */
+    private Date inceptionDate;
+    /**
+     * 行业趋势
+     */
+    private String industryTrend;
+    /**
+     * 投资目标
+     */
+    private String investmentObjective;
+    /**
+     * 杠杆比例
+     */
+    private BigDecimal leverage;
+    /**
+     * 杠杆比例描述
+     */
+    private String leverageNote;
+    /**
+     * 基金运作方式
+     */
+    private String operationType;
+    /**
+     * 备案编码
+     */
+    private String registerNumber;
+    /**
+     * 风险收益特征
+     */
+    private String riskReturnDesc;
+    /**
+     * 业绩比较基准
+     */
+    private String secondaryBenchmark;
+    /**
+     * 基金到期日期
+     */
+    private Date dueDate;
+    /**
+     * 信息披露报告是否经托管机构复核
+     */
+    @TableField(value = "reviewed")
+    private Integer reviewed;
 }