Browse Source

报告日期结果解析优化

wangzaijun 3 weeks ago
parent
commit
d80d0e06b2

+ 3 - 4
mo-daq/src/main/java/com/smppw/modaq/application/components/OCRReportParser.java

@@ -32,9 +32,9 @@ public class OCRReportParser {
         Map<String, Object> paramsMap = MapUtil.newHashMap(4);
         paramsMap.put("image_url", ocrImgUrl);
         paramsMap.put("result_schema", JSONUtil.toJsonStr(RESULT_SCHEMA_MAP));
-        String body = null;
+        OCRParseData res = new OCRParseData();
         try {
-            body = HttpUtil.get(ocrApi, paramsMap);
+            String body = HttpUtil.get(ocrApi, paramsMap);
             JSONObject jsonResult = JSONUtil.parseObj(body);
             String content = StrUtil.split(jsonResult.getStr("content"), "```").get(1);
             String aiParserContent = "{" + StrUtil.subAfter(content, "{", false) + "}";
@@ -43,7 +43,6 @@ public class OCRReportParser {
             String fundCode = this.cleanData(jsonObject.getStr("产品代码"));
             String seals = this.cleanData(jsonObject.getStr("是否有红色印章"));
             String phone = this.cleanData(jsonObject.getStr("是否有电话"));
-            OCRParseData res = new OCRParseData();
             if (StrUtil.isNotBlank(fundName) && fundName.contains("基金") && !fundName.contains("公司")) {
                 res.setFundName(fundName);
             }
@@ -64,7 +63,7 @@ public class OCRReportParser {
             throw new ReportParseException(ReportParseStatus.SYSTEM_ERROR);
         } finally {
             if (logger.isInfoEnabled()) {
-                this.logger.info("报告{} OCR识别参数{},OCR识别结果:{}", filename, paramsMap, body);
+                this.logger.info("报告{} OCR识别参数{},OCR识别结果:{}", filename, paramsMap, res);
             }
         }
     }

+ 24 - 28
mo-daq/src/main/java/com/smppw/modaq/application/components/ReportParseUtils.java

@@ -4,6 +4,7 @@ import cn.hutool.core.collection.ListUtil;
 import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.NumberUtil;
 import cn.hutool.core.util.StrUtil;
+import com.smppw.modaq.application.util.EmailUtil;
 import com.smppw.modaq.common.conts.EmailTypeConst;
 import com.smppw.modaq.common.conts.PatternConsts;
 import com.smppw.modaq.common.enums.ReportType;
@@ -372,7 +373,7 @@ public final class ReportParseUtils {
         if (matcher.find()) {
             return matcher.group();
         }
-        // 匹配不了时
+        // 严格日期匹配不了时
         matcher = PatternConsts.DAY_PATTERN.matcher(text);
         if (matcher.find()) {
             String date = matcher.group();
@@ -381,7 +382,7 @@ public final class ReportParseUtils {
             }
             return date;
         }
-        // 其他报告的日期
+        // 其他报告的日期(匹配到日)
         if (ReportType.OTHER.equals(reportType)) {
             matcher = PatternConsts.MONTHLY_PATTERN.matcher(text);
             if (matcher.find()) {
@@ -389,21 +390,6 @@ public final class ReportParseUtils {
                 int month = Integer.parseInt(matcher.group(2));
                 return formatMonthEnd(year, month);
             }
-            matcher = PatternConsts.QUARTERLY_PATTERN.matcher(text);
-            if (matcher.find()) {
-                String year = matcher.group(1);
-                return switch (matcher.group(2)) {
-                    case "一", "1" -> year + "-03-31";
-                    case "二", "2" -> year + "-06-30";
-                    case "三", "3" -> year + "-09-30";
-                    case "四", "4" -> year + "-12-31";
-                    default -> null;
-                };
-            }
-            matcher = PatternConsts.ANNUALLY_PATTERN.matcher(text);
-            if (matcher.find()) {
-                return matcher.group(1) + "-12-31";
-            }
         }
         return null;
     }
@@ -449,16 +435,16 @@ public final class ReportParseUtils {
         return reportType;
     }
 
-    private static int getLastDayOfMonth(int year, int month) {
-        Calendar calendar = Calendar.getInstance();
-        calendar.set(Calendar.YEAR, year);
-        calendar.set(Calendar.MONTH, month - 1); // Calendar.MONTH 是从0开始的
-        return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
-    }
-
-    private static String padZero(String number) {
-        return String.format("%02d", Integer.parseInt(number));
-    }
+//    private static int getLastDayOfMonth(int year, int month) {
+//        Calendar calendar = Calendar.getInstance();
+//        calendar.set(Calendar.YEAR, year);
+//        calendar.set(Calendar.MONTH, month - 1); // Calendar.MONTH 是从0开始的
+//        return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
+//    }
+//
+//    private static String padZero(String number) {
+//        return String.format("%02d", Integer.parseInt(number));
+//    }
 
     /**
      * 计算指定月份的最后一天
@@ -579,7 +565,17 @@ public final class ReportParseUtils {
 //        reportType = matchReportType(emailType, text);
 //        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
 
-        String date = "2025-06-04 11:36:43";
+        text = "三希赤霄二号月报2025.05.png";
+        emailType = EmailUtil.getEmailTypeBySubject(text);
+        reportType = matchReportType(emailType, text);
+        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+
+        text = "第一创业2025年合同变更公告.png";
+        emailType = EmailUtil.getEmailTypeBySubject(text);
+        reportType = matchReportType(emailType, text);
+        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+
+        String date = "2025年6月6日";
         String input = ReportParseUtils.cleaningValue(date, false);
         Date date1 = DateUtils.toDate(input);
         System.out.println(date1);

+ 19 - 0
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/ai/AbstractAIReportParser.java

@@ -11,11 +11,14 @@ import com.smppw.modaq.application.components.report.parser.AbstractReportParser
 import com.smppw.modaq.common.conts.PatternConsts;
 import com.smppw.modaq.common.enums.ReportParseStatus;
 import com.smppw.modaq.common.exception.ReportParseException;
+import com.smppw.modaq.domain.dto.report.ReportBaseInfoDTO;
 import com.smppw.modaq.domain.dto.report.ReportData;
 import com.smppw.modaq.domain.dto.report.ReportParserParams;
 import com.smppw.modaq.domain.mapper.EmailFieldMappingMapper;
+import com.smppw.modaq.infrastructure.util.DateUtils;
 import org.springframework.beans.factory.annotation.Value;
 
+import java.util.Date;
 import java.util.Map;
 
 /**
@@ -76,6 +79,22 @@ public abstract class AbstractAIReportParser<T extends ReportData> extends Abstr
     protected abstract boolean isSupportAIParse();
 
     /**
+     * Builds the report base info through the parent implementation and, when the
+     * parent leaves the report date unset, fills it from the "报告日期" entry of
+     * {@code allInfoMap}.
+     * <p>
+     * Per the original author's note, the parent derives the date from the report
+     * name and this override falls back to the parsed report content.
+     * NOTE(review): the parent implementation is not visible here - confirm.
+     *
+     * @param params parser parameters, forwarded unchanged to the parent implementation
+     * @return the parent's base info, with the report date backfilled when it was {@code null}
+     */
+    @Override
+    protected ReportBaseInfoDTO buildReportInfo(ReportParserParams params) {
+        ReportBaseInfoDTO baseInfo = super.buildReportInfo(params);
+        if (baseInfo.getReportDate() != null) {
+            // The parent already produced a report date; keep it as is.
+            return baseInfo;
+        }
+        String rawReportDate = MapUtil.getStr(this.allInfoMap, "报告日期");
+        Date parsedDate = DateUtils.toDate(rawReportDate);
+        baseInfo.setReportDate(parsedDate);
+        return baseInfo;
+    }
+
+    /**
      * 处理ai解析结果,方便构建结构化对象
      *
      * @param result ai解析结果

+ 3 - 3
mo-daq/src/main/java/com/smppw/modaq/common/conts/PatternConsts.java

@@ -17,15 +17,15 @@ public class PatternConsts {
     /**
      * Month-level report date pattern.
      * <p>
      * Matches a year in the range 2020-2039 (group 1), any run of the separators
      * '.', '/', '年', '_' or '-', a one- or two-digit month (group 2) and an
      * optional trailing '月'. The '+' line below adds '.' to the accepted separators.
      * <p>
      * NOTE(review): unlike the two day-level patterns below, this field is not
      * declared {@code final} - confirm whether reassignment is intended.
      */
-    public static Pattern MONTHLY_PATTERN = Pattern.compile("(20[23]\\d)[/年_-]*(\\d{1,2})月?");
+    public static Pattern MONTHLY_PATTERN = Pattern.compile("(20[23]\\d)[./年_-]*(\\d{1,2})月?");
     /**
      * Strict year-month-day pattern.
      * <p>
      * Matches a year in the range 2020-2039 (group 1), a one- or two-digit month
      * (group 2) and a one- or two-digit day (group 3), each pair optionally joined
      * by separator characters, followed by any number of '日' / '号' characters.
      * The negative lookahead rejects a match that is immediately followed by
      * another digit. The '+' line below adds '.' to the accepted separators.
      */
-    public static final Pattern STRICT_DAY_PATTERN = Pattern.compile("(20[23]\\d)[/年_-]*(\\d{1,2})[/月_-]*(\\d{1,2})[日号]*(?!\\d)");
+    public static final Pattern STRICT_DAY_PATTERN = Pattern.compile("(20[23]\\d)[./年_-]*(\\d{1,2})[./月_-]*(\\d{1,2})[日号]*(?!\\d)");
     /**
      * Loose year-month-day pattern.
      * <p>
      * Same year / month / day groups as the strict pattern, but the day must be
      * followed by at least one character that is '日', '号' or a digit, so extra
      * trailing digits become part of the match instead of rejecting it.
      * The '+' line below adds '.' to the accepted separators.
      */
-    public static final Pattern DAY_PATTERN = Pattern.compile("(20[23]\\d)[/年_-]*(\\d{1,2})[/月_-]*(\\d{1,2})[日号\\d]+");
+    public static final Pattern DAY_PATTERN = Pattern.compile("(20[23]\\d)[./年_-]*(\\d{1,2})[./月_-]*(\\d{1,2})[日号\\d]+");
 
     /**
      * 基金编码的正则表达式

+ 2 - 2
mo-daq/src/main/java/com/smppw/modaq/common/enums/ReportType.java

@@ -6,9 +6,9 @@ import lombok.Getter;
 public enum ReportType {
     // 最后识别的类型
     OTHER(-2, "其他报告",
-            new String[]{"公告", "通知", "告知函", "意见征询函", "说明函",
+            new String[]{"公告", "通知", "告知函", "意见征询函", "说明函", "简报",
                     "清算报告", "邀请函", "观点", "预警", "投研报告", "公示", "回顾",
-                    "风险提示函", "说明", "合同变更", "生效函", "投资报告", "投资者月报"}),
+                    "风险提示函", "说明", "合同变更", "生效函", "投资报告", "投资者月报", "运行报告"}),
 
     LETTER(-1, "交易流水确认函",
             new String[]{"确认单", "确认函", "交易确认数据",

+ 10 - 0
mo-daq/src/main/java/com/smppw/modaq/domain/dto/report/OCRParseData.java

@@ -26,4 +26,14 @@ public class OCRParseData {
      * 观点报告是否存在联系人信息(可能包含联系电话、地址等敏感信息)(报告尾页识别)
      */
     private Boolean withContacts;
+
+    /**
+     * Renders the fund name, fund code, seal flag and contact flag as one compact,
+     * brace-delimited line; unset fields appear as the text {@code null}.
+     *
+     * @return the single-line text form of this OCR result
+     */
+    @Override
+    public String toString() {
+        StringBuilder text = new StringBuilder("{");
+        text.append("fundName='").append(fundName).append('\'');
+        text.append(", fundCode='").append(fundCode).append('\'');
+        text.append(", withSeals=").append(withSeals);
+        text.append(", withContacts=").append(withContacts);
+        return text.append('}').toString();
+    }
 }

+ 6 - 4
mo-daq/src/main/java/com/smppw/modaq/domain/dto/report/ReportFundInfoDTO.java

@@ -18,7 +18,6 @@ public class ReportFundInfoDTO extends BaseReportDTO<ReportFundInfoDO> {
     /**
      * 基金的名称
      */
-    @Getter
     private String fundName;
 
     /**
@@ -118,6 +117,10 @@ public class ReportFundInfoDTO extends BaseReportDTO<ReportFundInfoDO> {
         return ReportParseUtils.matchFundCode(this.fundCode);
     }
 
+    /**
+     * Returns the fund name with leading and trailing whitespace trimmed and every
+     * "- " (hyphen followed by a space) sequence removed.
+     *
+     * @return the cleaned fund name, or {@code null} when no fund name is set
+     */
+    public String getFundName() {
+        if (this.fundName == null) {
+            return null;
+        }
+        String trimmed = this.fundName.trim();
+        // "- " has no regex metacharacters, so a literal replace gives the same result as replaceAll.
+        return trimmed.replace("- ", "");
+    }
+
     public ReportFundInfoDTO() {
         super();
     }
@@ -133,11 +136,10 @@ public class ReportFundInfoDTO extends BaseReportDTO<ReportFundInfoDO> {
         entity.setFundCode(this.getFundCode());
         entity.setCompanyName(this.companyName);
         entity.setCurrency(this.currency);
-        entity.setFundName(this.fundName);
+        entity.setFundName(this.getFundName());
         entity.setAdvisorName(this.advisorName);
         entity.setCustodianName(this.custodianName);
         entity.setFundManager(this.fundManager);
-        entity.setFundName(this.fundName);
         entity.setFundStrategyDescription(this.fundStrategyDescription);
         entity.setInceptionDate(DateUtils.toDate(this.inceptionDate));
         entity.setIndustryTrend(this.industryTrend);
@@ -158,7 +160,7 @@ public class ReportFundInfoDTO extends BaseReportDTO<ReportFundInfoDO> {
     public String toString() {
         return "{" +
                 super.toString() +
-                ", fundName='" + this.fundName + '\'' +
+                ", fundName='" + this.getFundName() + '\'' +
                 ", fundCode='" + this.fundCode + '\'' +
                 ", matchFundCode='" + this.getFundCode() + '\'' +
                 ", companyName='" + this.companyName + '\'' +

+ 7 - 12
mo-daq/src/main/java/com/smppw/modaq/domain/service/EmailParseService.java

@@ -352,7 +352,7 @@ public class EmailParseService {
                 continue;
             }
             if (log.isInfoEnabled()) {
-                log.info("邮件{} 还有报告待解析:{}", emailTitle, dtos);
+                log.info("邮件{} 还有报告待解析:\n{}", emailTitle, dtos);
             }
 
             Integer emailId = emailDto.getEmailId();
@@ -438,7 +438,7 @@ public class EmailParseService {
                 String output = FileUtil.getParent(filepath, 1) + File.separator + "image";
                 images = PdfUtil.convertFirstAndLastPagesToPng(filepath, FileUtil.file(output), 300);
                 if (log.isDebugEnabled()) {
-                    log.debug("报告[{}] 生成的图片地址是:{}", fileName, images);
+                    log.debug("报告[{}] 生成的图片地址是:\n{}", fileName, images);
                 }
             } catch (Exception e) {
                 log.warn("报告[{}] 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
@@ -512,7 +512,7 @@ public class EmailParseService {
                     result.setMsg(StrUtil.format(ReportParseStatus.PARSE_FAIL.getMsg(), e.getMessage()));
                 }
                 if (log.isInfoEnabled()) {
-                    log.info("报告{} AI解析结束!", fileName);
+                    log.info("报告{} AI解析结束!结果是:{}", fileName, reportData);
                 }
             }
             // ocr信息提取
@@ -560,8 +560,7 @@ public class EmailParseService {
             }
         }
         // 用首页识别基金名称、产品代码和报告日期
-        if ((reportData.getBaseInfo() != null && reportData.getBaseInfo().getReportDate() == null)
-                || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundName()))
+        if ((reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundName()))
                 || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundCode()))) {
             // 首页和尾页不相等时解析首页的数据
             if (images.size() != 1) {
@@ -571,16 +570,12 @@ public class EmailParseService {
                     log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
                 }
             }
-            // ocr 识别的结果优先级更高
+            // ocr 识别的结果
             if (reportData.getFundInfo() != null && parseRes != null) {
-                if (StrUtil.isBlank(reportData.getFundInfo().getFundName())
-                        || (StrUtil.isNotBlank(parseRes.getFundName())
-                        && !Objects.equals(reportData.getFundInfo().getFundName(), parseRes.getFundName()))) {
+                if (StrUtil.isBlank(reportData.getFundInfo().getFundName())) {
                     reportData.getFundInfo().setFundName(parseRes.getFundName());
                 }
-                if (StrUtil.isBlank(reportData.getFundInfo().getFundCode())
-                        || (StrUtil.isNotBlank(parseRes.getFundCode())
-                        && !Objects.equals(reportData.getFundInfo().getFundCode(), parseRes.getFundCode()))) {
+                if (StrUtil.isBlank(reportData.getFundInfo().getFundCode())) {
                     reportData.getFundInfo().setFundCode(parseRes.getFundCode());
                 }
             }

+ 1 - 0
mo-daq/src/main/java/com/smppw/modaq/infrastructure/util/DateUtils.java

@@ -36,6 +36,7 @@ public class DateUtils {
                         DatePattern.NORM_DATE_PATTERN,
                         "yyyy/MM/dd",
                         "yyyy_MM_dd",
+                        "yyyy.MM.dd",
                         "yyyy年MM月dd");
             } catch (Exception ignored) {
             }

+ 3 - 3
mo-daq/src/test/java/com/smppw/modaq/MoDaqApplicationTests.java

@@ -37,9 +37,9 @@ public class MoDaqApplicationTests {
 
     @Test
     public void reportTest() {
-        MailboxInfoDTO emailInfoDTO = this.buildMailbox("8@simuwang.com", "8");
-        Date startDate = DateUtil.parse("2025-06-05 17:02:00", DateConst.YYYY_MM_DD_HH_MM_SS);
-        Date endDate = DateUtil.parse("2025-06-05 17:05:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        MailboxInfoDTO emailInfoDTO = this.buildMailbox("*@simuwang.com", "*");
+        Date startDate = DateUtil.parse("2025-06-06 15:28:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date endDate = DateUtil.parse("2025-06-06 17:05:00", DateConst.YYYY_MM_DD_HH_MM_SS);
         try {
             List<String> folderNames = ListUtil.list(false);
 //            folderNames.add("其他文件夹/报告公告");