wangzaijun 3 tygodni temu
rodzic
commit
ff8979021e

+ 4 - 5
mo-daq/src/main/java/com/smppw/modaq/application/components/OCRReportParser.java

@@ -24,8 +24,8 @@ public class OCRReportParser {
         RESULT_SCHEMA_MAP.put("产品代码", "");
         RESULT_SCHEMA_MAP.put("是否有红色印章", "");
         RESULT_SCHEMA_MAP.put("是否有电话", "");
-        RESULT_SCHEMA_MAP.put("是否有地址", "");
-        RESULT_SCHEMA_MAP.put("是否有关注我们", "");
+//        RESULT_SCHEMA_MAP.put("是否有地址", "");
+//        RESULT_SCHEMA_MAP.put("是否有关注我们", "");
     }
 
     public OCRParseData parse(String filename, String ocrApi, String ocrImgUrl) throws ReportParseException {
@@ -43,8 +43,6 @@ public class OCRReportParser {
             String fundCode = this.cleanData(jsonObject.getStr("产品代码"));
             String seals = this.cleanData(jsonObject.getStr("是否有红色印章"));
             String phone = this.cleanData(jsonObject.getStr("是否有电话"));
-            String addr = this.cleanData(jsonObject.getStr("是否有地址"));
-            String withme = this.cleanData(jsonObject.getStr("是否有关注我们"));
             OCRParseData res = new OCRParseData();
             if (StrUtil.isNotBlank(fundName) && fundName.contains("基金") && !fundName.contains("公司")) {
                 res.setFundName(fundName);
@@ -55,7 +53,7 @@ public class OCRReportParser {
             if (StrUtil.isNotBlank(seals)) {
                 res.setWithSeals(true);
             }
-            if (StrUtil.isNotBlank(phone) || StrUtil.isNotBlank(addr) || StrUtil.isNotBlank(withme)) {
+            if (StrUtil.isNotBlank(phone)) {
                 res.setWithContacts(true);
             }
             return res;
@@ -86,6 +84,7 @@ public class OCRReportParser {
         if (value == null) {
             return null;
         }
+        // 识别到多个基金
         if (value.contains("、") || value.contains(",")) {
             return value.replaceAll("、,", ",");
         }

+ 19 - 0
mo-daq/src/main/java/com/smppw/modaq/application/components/ReportParseUtils.java

@@ -19,6 +19,10 @@ import java.util.stream.Collectors;
 
 public final class ReportParseUtils {
     /**
+     * 基金基本信息表格列名称
+     */
+    public static final List<String> FUND_INFO_COLUMN_NAMES = ListUtil.list(false);
+    /**
      * 行业配置的表格列名称
      */
     public static final List<String> INDUSTRY_COLUMN_NAMES = ListUtil.list(false);
@@ -36,6 +40,21 @@ public final class ReportParseUtils {
     public static final Map<String, String> ASSET_ALLOCATION_TYPE_MAPPER = MapUtil.newHashMap(32, true);
 
     static {
+        FUND_INFO_COLUMN_NAMES.add("基金名称");
+        FUND_INFO_COLUMN_NAMES.add("基金编码");
+        FUND_INFO_COLUMN_NAMES.add("基金运作方式");
+        FUND_INFO_COLUMN_NAMES.add("基金成立日期");
+        FUND_INFO_COLUMN_NAMES.add("基金管理人");
+        FUND_INFO_COLUMN_NAMES.add("基金托管人");
+        FUND_INFO_COLUMN_NAMES.add("投资顾问");
+        FUND_INFO_COLUMN_NAMES.add("期末基金总份额/期末基金实缴总额");
+        FUND_INFO_COLUMN_NAMES.add("基金到期日期");
+        FUND_INFO_COLUMN_NAMES.add("投资目标");
+        FUND_INFO_COLUMN_NAMES.add("投资策略");
+        FUND_INFO_COLUMN_NAMES.add("业绩比较基准");
+        FUND_INFO_COLUMN_NAMES.add("风险收益特征");
+        FUND_INFO_COLUMN_NAMES.add("信息披露报告是否经托管机构复核");
+
         // 财务指标
         FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末基金净资产");
         FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末基金资产净值");

+ 5 - 2
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/ai/AIQuarterlyReportParser.java

@@ -32,7 +32,7 @@ public class AIQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
                 识别文件中的基金基本情况、投资组合情况,
                 投资组合情况包含期末基金资产组合情况、报告期末按行业分类的股票投资组合,
                 报告期末按行业分类的股票投资组合又包含报告期末按行业分类的境内股票投资组合、报告期末按行业分类的港股通投资股票投资组合,
-                要求准确识别金额等小数的位数,结果用json返回
+                结果用json返回
                 """;
     }
 
@@ -86,7 +86,10 @@ public class AIQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
 
         List<ReportAssetAllocationDTO> dtos = ListUtil.list(false);
         for (Map<String, Object> allocation : allocationList) {
-            String detail = ReportParseUtils.cleaningValue(allocation.get("项目"));
+            String detail = ReportParseUtils.cleaningValue(allocation.get("子项"));
+            if (StrUtil.isBlank(detail)) {
+                detail = ReportParseUtils.cleaningValue(allocation.get("项目"));
+            }
             if (!ReportParseUtils.ASSET_ALLOCATION_TYPE_MAPPER.containsKey(detail)) {
                 continue;
             }

+ 6 - 8
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/pdf/PDAnnuallyReportParser.java

@@ -45,23 +45,21 @@ public class PDAnnuallyReportParser extends PDQuarterlyReportParser<AnnuallyRepo
         Map<Integer, List<Table>> spanningPageShareChangeTableMap = MapUtil.newHashMap(8, true);
         int fi = 0;
         int sci = 0;
-        for (int i = 0; i < tables.size(); i++) {
-            Table table = tables.get(i);
-            if (i <= 1) {
-                this.fundInfoTables.add(table);
-                continue;
-            }
-            // 用表格的第一列的数据判断是否主要财务指标数据
+        for (Table table : tables) {
+            int colCount = table.getColCount();
             List<String> texts = this.getTableColTexts(table, 0);
+            // 用表格的第一列的数据判断是否主要财务指标数据
             if (CollUtil.containsAny(texts, ReportParseUtils.FINANCIAL_INDICATORS_COLUMN_NAMES)) {
                 this.splitTables(table, 10, fi, this.financialIndicatorsTables, spanningPageFinancialIndicatorsTableMap);
                 continue;
             }
-            int colCount = table.getColCount();
             if (colCount == 2) {
                 // 用表格的第一列的数据判断是否份额变动记录
                 if (CollUtil.containsAny(texts, ReportParseUtils.SHARE_CHANGE_COLUMN_NAMES)) {
                     this.splitTables(table, 5, sci, this.shareChangeTables, spanningPageShareChangeTableMap);
+                } else if (CollUtil.containsAny(texts, ReportParseUtils.FUND_INFO_COLUMN_NAMES)) {
+                    // 基金基本信息
+                    this.fundInfoTables.add(table);
                 }
             } else if (colCount == 4) {
                 // 用表格的第二列的数据判断是否行业配置数据(内地)

+ 6 - 4
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/pdf/PDQuarterlyReportParser.java

@@ -65,11 +65,13 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
             if (colCount == 0 && rowCount == 0) {
                 continue;
             }
-            if (rowCount == 13 && colCount == 2) {
-                this.fundInfoTable = table;
-            } else if (colCount == 2) {
+            if (colCount == 2) {
                 // 用表格的第一列的数据判断是否份额变动记录
                 List<String> texts = this.getTableColTexts(table, 0);
+                if (CollUtil.containsAny(texts, ReportParseUtils.FUND_INFO_COLUMN_NAMES)) {
+                    this.fundInfoTable = table;
+                    continue;
+                }
                 // 主要财务指标或份额变动
                 if (CollUtil.containsAny(texts, ReportParseUtils.SHARE_CHANGE_COLUMN_NAMES)) {
                     sci = this.splitTables(table, 5, sci, this.shareChangeTables, spanningPageShareChangeTableMap);
@@ -234,7 +236,7 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
                 // x坐标升序(防止部分行乱序问题)
                 row.sort(Comparator.comparing(Rectangle2D.Float::getX));
                 // 金额、市值,有时是 “备注#金额”的格式
-                String marketValueAndRemark = ReportParseUtils.cleaningValue(row.get(2).getText());
+                String marketValueAndRemark = ReportParseUtils.cleaningValue(row.get(2).getText(), false);
                 // 资产明细
                 String detail = ReportParseUtils.cleaningValue(row.get(1).getText(), false);
                 if (!ReportParseUtils.ASSET_ALLOCATION_TYPE_MAPPER.containsKey(detail)) {

+ 1 - 1
mo-daq/src/main/java/com/smppw/modaq/common/enums/ReportType.java

@@ -6,7 +6,7 @@ import lombok.Getter;
 public enum ReportType {
     // 最后识别的类型
     OTHER(-2, "其他报告", new String[]{"公告", "通知", "告知函", "意见征询函", "说明函",
-            "清算报告", "邀请函", "观点", "预警", "投研报告", "公示", "回顾", "风险提示函", "说明", "合同变更"}),
+            "清算报告", "邀请函", "观点", "预警", "投研报告", "公示", "回顾", "风险提示函", "说明", "合同变更", "生效函"}),
 
     LETTER(-1, "交易流水确认函", new String[]{"确认单", "确认函", "交易确认数据",
             "赎回确认", "申购确认", "分红确认", "确认表", "交易确认", "确认"}),

+ 5 - 4
mo-daq/src/main/java/com/smppw/modaq/domain/service/EmailParseService.java

@@ -34,7 +34,6 @@ import com.smppw.modaq.domain.entity.EmailParseInfoDO;
 import com.smppw.modaq.domain.mapper.EmailFileInfoMapper;
 import com.smppw.modaq.domain.mapper.EmailParseInfoMapper;
 import com.smppw.modaq.infrastructure.util.ArchiveUtil;
-import com.smppw.modaq.infrastructure.util.DateUtils;
 import com.smppw.modaq.infrastructure.util.PdfUtil;
 import jakarta.mail.*;
 import jakarta.mail.internet.MimeUtility;
@@ -306,7 +305,7 @@ public class EmailParseService {
                 EmailZipFileDTO dto = iterator.next();
                 String filename = dto.getFilename();
                 // 删除复核函或基金合同
-                if (filename.contains("复核函") || filename.contains("基金合同")) {
+                if (filename.contains("复核函") || (filename.contains("基金合同") && !filename.contains("合同变更"))) {
                     log.warn("邮件{} 中的报告{} 是复核函或基金合同,不用解析上传。", emailTitle, filename);
                     iterator.remove();
                 }
@@ -553,11 +552,13 @@ public class EmailParseService {
             // ocr 识别的结果优先级更高
             if (reportData.getFundInfo() != null && parseRes != null) {
                 if (StrUtil.isBlank(reportData.getFundInfo().getFundName())
-                        || !Objects.equals(reportData.getFundInfo().getFundName(), parseRes.getFundName())) {
+                        || (StrUtil.isNotBlank(parseRes.getFundName())
+                        && !Objects.equals(reportData.getFundInfo().getFundName(), parseRes.getFundName()))) {
                     reportData.getFundInfo().setFundName(parseRes.getFundName());
                 }
                 if (StrUtil.isBlank(reportData.getFundInfo().getFundCode())
-                        || !Objects.equals(reportData.getFundInfo().getFundCode(), parseRes.getFundCode())) {
+                        || (StrUtil.isNotBlank(parseRes.getFundCode())
+                        && !Objects.equals(reportData.getFundInfo().getFundCode(), parseRes.getFundCode()))) {
                     reportData.getFundInfo().setFundCode(parseRes.getFundCode());
                 }
             }

+ 3 - 3
mo-daq/src/test/java/com/smppw/modaq/MoDaqApplicationTests.java

@@ -37,9 +37,9 @@ public class MoDaqApplicationTests {
 
     @Test
     public void reportTest() {
-        MailboxInfoDTO emailInfoDTO = this.buildMailbox("***@simuwang.com", "***");
-        Date startDate = DateUtil.parse("2025-06-05 09:00:00", DateConst.YYYY_MM_DD_HH_MM_SS);
-        Date endDate = DateUtil.parse("2025-06-05 17:58:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        MailboxInfoDTO emailInfoDTO = this.buildMailbox("*@simuwang.com", "**");
+        Date startDate = DateUtil.parse("2025-06-05 17:02:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date endDate = DateUtil.parse("2025-06-05 17:05:00", DateConst.YYYY_MM_DD_HH_MM_SS);
         try {
             List<String> folderNames = ListUtil.list(false);
 //            folderNames.add("其他文件夹/报告公告");