Explorar o código

fix:修复报告解析的问题

wangzaijun hai 6 meses
pai
achega
c78e247b35

+ 0 - 12
service-base/src/main/java/com/simuwang/base/common/enums/ReportType.java

@@ -2,10 +2,6 @@ package com.simuwang.base.common.enums;
 
 import lombok.Getter;
 
-import java.util.Arrays;
-import java.util.List;
-import java.util.stream.Collectors;
-
 @Getter
 public enum ReportType {
     MONTHLY(0, "月报", new String[]{"月", "月度", "月报"}),
@@ -21,12 +17,4 @@ public enum ReportType {
         this.label = label;
         this.patterns = patterns;
     }
-
-    public static String getAllPatterns() {
-        return String.join("|", patterns());
-    }
-
-    public static List<String> patterns() {
-        return Arrays.stream(ReportType.values()).flatMap(e -> Arrays.stream(e.getPatterns())).collect(Collectors.toList());
-    }
 }

+ 2 - 2
service-base/src/main/java/com/simuwang/base/pojo/dto/report/BaseReportDTO.java

@@ -70,8 +70,8 @@ public abstract class BaseReportDTO<T extends BaseReportDO> implements Serializa
             return null;
         }
         try {
-            // 移除所有非数字和“.”字符
-            String cleanedInput = input.trim().replaceAll("[^\\d.]", "");
+            // 替换掉数字分位的逗号
+            String cleanedInput = input.trim().replaceAll(",", "");
             // 创建BigDecimal对象
             return new BigDecimal(cleanedInput);
         } catch (NumberFormatException ignored) {

+ 5 - 5
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportParseStatus.java

@@ -4,11 +4,11 @@ import com.smppw.common.pojo.enums.status.StatusCode;
 
 public enum ReportParseStatus implements StatusCode {
     PARSE_FAIL(21000, "定期报告解析错误:{}"),
-    NOT_A_REPORT(21001, "不是定期报告"),
-    REPORT_IS_SCAN(21002, "报告为扫描件"),
-    NO_SUPPORT_TEMPLATE(21003, "不支持的报告文件格式"),
-    NOT_A_FIXED_FORMAT(21004, "不是基协统一格式"),
-    PARSE_FUND_INFO_FAIL(21010, "没有解析到报告中的基金基本信息"),
+    NOT_A_REPORT(21001, "[{}] 不是定期报告"),
+    REPORT_IS_SCAN(21002, "报告[{}] 为扫描件"),
+    NO_SUPPORT_TEMPLATE(21003, "报告[{}] 是不支持的文件格式"),
+    NOT_A_FIXED_FORMAT(21004, "报告[{}] 不是基协统一格式"),
+    PARSE_FUND_INFO_FAIL(21010, "报告[{}] 没有解析到基金基本信息"),
     ;
     private final int code;
     private final String msg;

+ 42 - 26
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractPDReportParser.java

@@ -21,9 +21,10 @@ import technology.tabula.Table;
 import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
 
 import java.io.IOException;
+import java.util.Calendar;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Objects;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 
@@ -53,6 +54,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
         this.textList = null;
         // 初始化
         this.init();
+        String filename = params.getFilename();
         // 解析报告和表格
         try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
             // 识别所有文字(去水印后的)
@@ -62,11 +64,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
             this.textList = StrUtil.split(text, System.lineSeparator());
             this.textList.removeIf(StrUtil::isBlank);
             if (this.textList.isEmpty()) {
-                throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN);
-            }
-            // 报告名称和类型一般在第一第二行
-            if (ReportParseUtils.matchReportType(this.textList.get(0)) == null && ReportParseUtils.matchReportType(this.textList.get(1)) == null) {
-                throw new ReportParseException(ReportParseStatus.NOT_A_REPORT);
+                throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN, filename);
             }
             // 解析所有表格(单元格字符去水印)
             List<Table> tables = ListUtil.list(true);
@@ -78,7 +76,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
                 tables.addAll(extractionAlgorithm.extract(page));
             }
             if (tables.isEmpty()) {
-                throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN);
+                throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN, filename);
             }
             this.initTableInfo(tables);
         }
@@ -96,7 +94,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
             throw e;
         } catch (Exception e) {
             this.logger.warn("报告解析错误:{}", ExceptionUtil.stacktraceToString(e));
-            throw new ReportParseException(ReportParseStatus.NOT_A_FIXED_FORMAT);
+            throw new ReportParseException(ReportParseStatus.NOT_A_FIXED_FORMAT, filename);
         }
     }
 
@@ -116,10 +114,11 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
     protected ReportFundInfoDTO buildFundInfo(ReportParserParams params) {
         Table fundInfoTable = this.fundInfoTable;
         if (fundInfoTable == null) {
-            throw new ReportParseException(ReportParseStatus.PARSE_FUND_INFO_FAIL);
+            throw new ReportParseException(ReportParseStatus.PARSE_FUND_INFO_FAIL, params.getFilename());
         }
         // 基金基本信息映射
-        return this.buildDto(params.getFileId(), fundInfoTable, ReportFundInfoDTO.class, this::parseFundInfo);
+        Map<String, Object> extInfoMap = this.parseFundInfo(fundInfoTable);
+        return this.buildDto(params.getFileId(), ReportFundInfoDTO.class, extInfoMap);
     }
 
     /**
@@ -161,7 +160,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
     }
 
     /**
-     * 构建只有两列表格的dto数据对象,如果有分级基金时
+     * 构建只有两列表格的dto数据对象,如果有分级基金时(并且一个表格可能跨页)
      *
      * @param <DTO>    泛型对象
      * @param fileId   文件id
@@ -172,17 +171,37 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
      */
     protected <DTO extends BaseReportLevelDTO<?>> List<DTO> buildLevelDto(Integer fileId, List<Table> tables, Class<DTO> clazz,
                                                                           Function<Table, Map<String, Object>> function) {
-        // 映射转换
-        List<DTO> dtos = tables.stream().filter(Objects::nonNull)
-                .map(e -> this.buildDto(fileId, e, clazz, function)).collect(Collectors.toList());
+        List<DTO> dtos = ListUtil.list(true);
+        // 信息表格字段和值映射
+        List<Map<String, Object>> infos = ListUtil.list(true);
+        Map<String, Object> infoMap = null;
+        for (Table table : tables) {
+            Map<String, Object> temp = function.apply(table);
+            for (String key : temp.keySet()) {
+                // 如果infoMap为null,先声明然后放在infos中
+                if (infoMap == null) {
+                    infoMap = MapUtil.newHashMap(16);
+                    infos.add(infoMap);
+                }
+                // 如果infoMap中包含了该key时,先放infos中然后重新声明新map对象
+                if (infoMap.containsKey(key)) {
+                    infos.add(new HashMap<>(infoMap));
+                    infoMap = MapUtil.newHashMap(16);
+                } else {
+                    infoMap.put(key, temp.get(key));
+                }
+            }
+        }
         // 分级基金匹配
         List<String> levels = ReportParseUtils.matchTieredFund(String.join(",", this.textList));
         levels.add(0, "母基金");
-        for (int i = 0; i < dtos.size(); i++) {
-            if (levels.size() <= i) {
+        for (int i = 0; i < infos.size(); i++) {
+            DTO dto = this.buildDto(fileId, clazz, infos.get(i));
+            if (dto == null) {
                 continue;
             }
-            dtos.get(i).setLevel(levels.get(i));
+            dto.setLevel(levels.get(i));
+            dtos.add(dto);
         }
         return dtos;
     }
@@ -190,20 +209,17 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
     /**
      * 构建只有两列表格的dto数据对象
      *
-     * @param <DTO>    泛型对象
-     * @param fileId   文件id
-     * @param table    表格
-     * @param clazz    泛型对象
-     * @param function 表格转换的函数
+     * @param <DTO>   泛型对象
+     * @param fileId  文件id
+     * @param clazz   泛型对象
+     * @param infoMap 表格字段和值的映射
      * @return /
      */
-    private <DTO extends BaseReportDTO<?>> DTO buildDto(Integer fileId, Table table, Class<DTO> clazz,
-                                                        Function<Table, Map<String, Object>> function) {
+    private <DTO extends BaseReportDTO<?>> DTO buildDto(Integer fileId, Class<DTO> clazz, Map<String, Object> infoMap) {
         try {
-            Map<String, Object> extInfoMap = function == null ? MapUtil.empty() : function.apply(table);
             DTO dto = clazz.getDeclaredConstructor().newInstance();
             dto.setFileId(fileId);
-            this.buildInfo(extInfoMap, dto);
+            this.buildInfo(infoMap, dto);
             return dto;
         } catch (Exception ignored) {
         }

+ 4 - 21
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDAnnuallyReportParser.java

@@ -21,18 +21,6 @@ import java.util.function.Function;
  */
 @Component(ReportParserConstant.PARSER_PDF_ANNUALLY)
 public class PDAnnuallyReportParser extends PDQuarterlyReportParser<AnnuallyReportData> {
-    private static final List<String> FINANCIAL_INDICATORS_COLUMN_NAMES = ListUtil.list(false);
-
-    static {
-        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末基金净资产");
-        FINANCIAL_INDICATORS_COLUMN_NAMES.add("报告期期末单位净值");
-        FINANCIAL_INDICATORS_COLUMN_NAMES.add("本期利润");
-        FINANCIAL_INDICATORS_COLUMN_NAMES.add("本期已实现收益");
-        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末可供分配利润");
-        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末可供分配基金份额利润");
-        FINANCIAL_INDICATORS_COLUMN_NAMES.add("基金份额累计净值增长率");
-    }
-
     private List<Table> fundInfoTables;
 
     public PDAnnuallyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
@@ -104,14 +92,9 @@ public class PDAnnuallyReportParser extends PDQuarterlyReportParser<AnnuallyRepo
     }
 
     @Override
-    protected AnnuallyReportData buildExtData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO fundInfo,
-                                              List<ReportShareChangeDTO> shareChanges,
-                                              List<ReportAssetAllocationDTO> assetAllocations,
-                                              List<ReportInvestmentIndustryDTO> investmentIndustries,
-                                              Function<Table, Map<String, Object>> function) {
-        // 处理财务指标
-        List<ReportFinancialIndicatorsDTO> financialIndicators = this.buildFinancialIndicatorsInfo(reportInfo.getFileId());
-        // 返回数据构建
+    protected AnnuallyReportData buildReportData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO fundInfo,
+                                                 List<ReportShareChangeDTO> shareChanges, List<ReportFinancialIndicatorsDTO> financialIndicators,
+                                                 List<ReportAssetAllocationDTO> assetAllocations, List<ReportInvestmentIndustryDTO> investmentIndustries) {
         AnnuallyReportData reportData = new AnnuallyReportData(reportInfo, fundInfo);
         reportData.setShareChange(shareChanges);
         reportData.setFinancialIndicators(financialIndicators);
@@ -125,7 +108,7 @@ public class PDAnnuallyReportParser extends PDQuarterlyReportParser<AnnuallyRepo
         // todo 数据清洗
     }
 
-    private List<ReportFinancialIndicatorsDTO> buildFinancialIndicatorsInfo(Integer fileId) {
+    protected List<ReportFinancialIndicatorsDTO> buildFinancialIndicatorsInfo(Integer fileId, Function<Table, Map<String, Object>> function) {
         List<ReportFinancialIndicatorsDTO> dtos = ListUtil.list(false);
         // 分级基金
         List<String> levels = ReportParseUtils.matchTieredFund(String.join(",", this.textList));

+ 3 - 1
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDMonthlyReportParser.java

@@ -22,7 +22,7 @@ import java.util.Map;
  */
 @Component(ReportParserConstant.PARSER_PDF_MONTHLY)
 public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportData> {
-    private final List<Table> extNavTables = ListUtil.list(true);
+    private List<Table> extNavTables;
 
     public PDMonthlyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
         super(fieldMappingMapper);
@@ -35,6 +35,8 @@ public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportD
 
     @Override
     protected void initTableInfo(List<Table> tables) {
+        // 这里初始化
+        this.extNavTables = ListUtil.list(true);
         // 一般月报是固定的模板,4列表格是基金基本信息,其他5列的表格是月净值
         for (Table table : tables) {
             int colCount = table.getColCount();

+ 46 - 19
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDQuarterlyReportParser.java

@@ -16,6 +16,7 @@ import java.awt.geom.Rectangle2D;
 import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.function.Function;
 
 /**
@@ -27,8 +28,18 @@ import java.util.function.Function;
 public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends AbstractPDReportParser<T> {
     protected static final List<String> INDUSTRY_COLUMN_NAMES = ListUtil.list(false);
     protected static final List<String> SHARE_CHANGE_COLUMN_NAMES = ListUtil.list(false);
+    protected static final List<String> FINANCIAL_INDICATORS_COLUMN_NAMES = ListUtil.list(false);
 
     static {
+        // 财务指标
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末基金净资产");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("报告期期末单位净值");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("本期利润");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("本期已实现收益");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末可供分配利润");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末可供分配基金份额利润");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("基金份额累计净值增长率");
+
         // 中国证监会行业标准
         INDUSTRY_COLUMN_NAMES.add("农、林、牧、渔业");
         INDUSTRY_COLUMN_NAMES.add("采矿业");
@@ -107,7 +118,7 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
                 // 主要财务指标或份额变动
                 if (CollUtil.containsAny(texts, SHARE_CHANGE_COLUMN_NAMES)) {
                     this.shareChangeTables.add(table);
-                } else {
+                } else if (CollUtil.containsAny(texts, FINANCIAL_INDICATORS_COLUMN_NAMES)) {
                     this.financialIndicatorsTables.add(table);
                 }
             } else if (colCount == 4) {
@@ -156,30 +167,42 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
         List<ReportShareChangeDTO> shareChanges = this.buildLevelDto(fileId, this.shareChangeTables,
                 ReportShareChangeDTO.class, function);
         // 主要财务指标
-        List<ReportFinancialIndicatorsDTO> financialIndicators = this.buildLevelDto(fileId, this.financialIndicatorsTables,
-                ReportFinancialIndicatorsDTO.class, function);
+        List<ReportFinancialIndicatorsDTO> financialIndicators = this.buildFinancialIndicatorsInfo(fileId, function);
         // 资产配置
         List<ReportAssetAllocationDTO> assetAllocations = this.buildAssetAllocationInfo(fileId);
         // 行业配置
         List<ReportInvestmentIndustryDTO> investmentIndustries = this.buildInvestmentIndustryInfo(fileId);
         // 返回数据构建
-        QuarterlyReportData reportData = new QuarterlyReportData(reportInfo, fundInfo);
-        reportData.setShareChange(shareChanges);
-        reportData.setFinancialIndicators(financialIndicators);
-        reportData.setAssetAllocation(assetAllocations);
-        reportData.setInvestmentIndustry(investmentIndustries);
-        return this.buildExtData(reportInfo, fundInfo, shareChanges, assetAllocations, investmentIndustries, function);
+        return this.buildReportData(reportInfo, fundInfo, shareChanges, financialIndicators, assetAllocations, investmentIndustries);
     }
 
-    protected T buildExtData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO fundInfo,
-                             List<ReportShareChangeDTO> shareChanges,
-                             List<ReportAssetAllocationDTO> assetAllocations,
-                             List<ReportInvestmentIndustryDTO> investmentIndustries,
-                             Function<Table, Map<String, Object>> function) {
-        Integer fileId = reportInfo.getFileId();
-        // 主要财务指标
-        List<ReportFinancialIndicatorsDTO> financialIndicators = this.buildLevelDto(fileId, this.financialIndicatorsTables,
-                ReportFinancialIndicatorsDTO.class, function);
+    /**
+     * 主要财务指标数据构建(包括分级基金,并且一个表格可能跨页)
+     *
+     * @param fileId   文件id
+     * @param function 字段映射关系
+     * @return /
+     */
+    protected List<ReportFinancialIndicatorsDTO> buildFinancialIndicatorsInfo(Integer fileId, Function<Table, Map<String, Object>> function) {
+        return this.buildLevelDto(fileId, this.financialIndicatorsTables, ReportFinancialIndicatorsDTO.class, function);
+    }
+
+    /**
+     * 子类重写,防止cast异常
+     *
+     * @param reportInfo           报告基本信息
+     * @param fundInfo             基金基本信息
+     * @param shareChanges         份额变动
+     * @param financialIndicators  基本财务指标
+     * @param assetAllocations     资产配置
+     * @param investmentIndustries 行业配置
+     * @return /
+     */
+    protected T buildReportData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO fundInfo,
+                                List<ReportShareChangeDTO> shareChanges,
+                                List<ReportFinancialIndicatorsDTO> financialIndicators,
+                                List<ReportAssetAllocationDTO> assetAllocations,
+                                List<ReportInvestmentIndustryDTO> investmentIndustries) {
         QuarterlyReportData reportData = new QuarterlyReportData(reportInfo, fundInfo);
         reportData.setShareChange(shareChanges);
         reportData.setFinancialIndicators(financialIndicators);
@@ -213,9 +236,13 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
                 if (StrUtil.containsAny(text, "序号", "行业类别")) {
                     continue;
                 }
+                String industryName = ReportParseUtils.cleaningValue(table.getCell(i, j).getText());
+                if (StrUtil.isBlank(industryName) || Objects.equals("合计", industryName)) {
+                    continue;
+                }
                 ReportInvestmentIndustryDTO dto = new ReportInvestmentIndustryDTO(fileId);
                 dto.setInvestType(investType);
-                dto.setIndustryName(ReportParseUtils.cleaningValue(table.getCell(i, j).getText()));
+                dto.setIndustryName(industryName);
                 dto.setMarketValue(ReportParseUtils.cleaningValue(table.getCell(i, j + 1).getText()));
                 dto.setRatio(ReportParseUtils.cleaningValue(table.getCell(i, j + 2).getText()));
                 dtos.add(dto);

+ 7 - 3
service-daq/src/main/java/com/simuwang/daq/service/EmailParseService.java

@@ -225,9 +225,13 @@ public class EmailParseService {
         }
         // 报告邮件有一条失败就表示整个邮件解析失败
         if (Objects.equals(EmailTypeConst.REPORT_EMAIL_TYPE, emailType) && CollUtil.isNotEmpty(dataList)) {
-            failReason = dataList.stream().filter(e -> !Objects.equals(1, e.getStatus()))
-                    .findFirst().map(ParseResult::getMsg).orElse(null);
-            emailParseStatus = failReason != null ? EmailParseStatusConst.FAIL : EmailParseStatusConst.SUCCESS;
+            long sucNum = dataList.stream().filter(e -> Objects.equals(1, e.getStatus())).count();
+            if (sucNum > 0) {
+                emailParseStatus = EmailParseStatusConst.SUCCESS;
+            } else {
+                emailParseStatus = EmailParseStatusConst.FAIL;
+                failReason = dataList.stream().map(ParseResult::getMsg).collect(Collectors.joining("/"));
+            }
         }
         emailParseInfoMapper.updateParseStatus(emailId, emailParseStatus, failReason);
     }

+ 2 - 2
service-deploy/src/test/java/com/simuwang/ApplicationTest.java

@@ -45,8 +45,8 @@ public class ApplicationTest {
     @Test
     public void reportTest() {
         MailboxInfoDTO emailInfoDTO = this.buildMailbox();
-        Date startDate = DateUtil.parse("2024-10-11 08:30:30", DateConst.YYYY_MM_DD_HH_MM_SS);
-        Date endDate = DateUtil.parse("2024-10-11 09:59:30", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date startDate = DateUtil.parse("2024-10-12 17:42:30", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date endDate = DateUtil.parse("2024-10-12 17:59:30", DateConst.YYYY_MM_DD_HH_MM_SS);
         try {
             emailParseService.parseEmail(emailInfoDTO, startDate, endDate);
         } catch (Exception e) {