فهرست منبع

fix:报告类型的识别逻辑补充,特殊报告定义为月报

wangzaijun 2 هفته پیش
والد
کامیت
0e65ec9158

+ 22 - 0
mo-daq/src/main/java/com/smppw/modaq/application/components/ReportParseUtils.java

@@ -20,6 +20,11 @@ import java.util.stream.Collectors;
 
 
 public final class ReportParseUtils {
+    public static final Set<String> MANAGER_KEYWORDS = Set.of( // substrings that mark a manager-version report; tested with containsAny
+            "管理人", "公司版", "投资者月报", "运行报告", "月策略",
+            "投资者报告", "投资报告", "投资月报", "月度简报", "运行月报"
+    );
+
     /**
      * 基金基本信息表格列名称
      */
@@ -433,9 +438,26 @@ public final class ReportParseUtils {
             // 特殊的月报(当季度->年度->月度报告无法识别时)
             reportType = ReportType.MONTHLY;
         }
+
+        // 特殊月报
+        if (ReportParseUtils.containsAny(text, MANAGER_KEYWORDS) || text.contains("定期报告")) {
+            reportType = ReportType.MONTHLY;
+        }
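+        // Other reports.
+        // NOTE(review): this check is not chained to the branch above, so any text containing "报告" —
+        // including "定期报告" and the "…报告" entries of MANAGER_KEYWORDS just mapped to MONTHLY —
+        // is reassigned to OTHER here; confirm this override is intended.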
+        if (text.contains("报告")) {
+            reportType = ReportType.OTHER;
+        }
         return reportType;
     }
 
+    // Utility: reports whether input contains at least one of the given keywords as a substring.
+    public static boolean containsAny(String input, Set<String> keywords) {
+        if (StrUtil.isBlank(input)) { // blank input (per StrUtil.isBlank) yields false before any keyword is tested
+            return false;
+        }
+        return keywords.stream().anyMatch(input::contains); // true as soon as one keyword is found in input
+    }
+
 //    private static int getLastDayOfMonth(int year, int month) {
 //        Calendar calendar = Calendar.getInstance();
 //        calendar.set(Calendar.YEAR, year);

+ 31 - 32
mo-daq/src/main/java/com/smppw/modaq/domain/service/EmailParseService.java

@@ -69,10 +69,7 @@ public class EmailParseService {
 
     // 常量定义:统一管理关键词
     private static final Set<String> AMAC_KEYWORDS = Set.of("协会", "信披");
-    private static final Set<String> MANAGER_KEYWORDS = Set.of(
-            "管理人", "公司版", "投资者月报", "运行报告", "月策略",
-            "投资者报告", "投资报告", "投资月报", "月度简报", "运行月报"
-    );
+
     private static final Set<String> EXCLUDE_PATH_KEYWORDS = Set.of("公司及协会版", "公司和协会版");
 
     // 扩展支持的 MIME 类型
@@ -178,6 +175,16 @@ public class EmailParseService {
                     continue;
                 }
                 Integer type = EmailUtil.getEmailTypeBySubject(emailTitle + emailFile.getFilename());
+                // 特殊月报
+                if (Objects.equals(EmailTypeConst.NAV_EMAIL_TYPE, type)
+                        && (ReportParseUtils.containsAny(emailTitle, ReportParseUtils.MANAGER_KEYWORDS)
+                        || emailTitle.contains("定期报告"))) {
+                    type = EmailTypeConst.REPORT_EMAIL_TYPE;
+                }
+                // 其他报告
+                if (Objects.equals(EmailTypeConst.NAV_EMAIL_TYPE, type)) {
+                    type = EmailTypeConst.REPORT_OTHER_TYPE;
+                }
                 emailFile.setEmailType(type);
             }
 
@@ -519,10 +526,10 @@ public class EmailParseService {
                 File outputFile = FileUtil.file(FileUtil.getParent(output, 1));
                 images = PdfUtil.convertFirstAndLastPagesToPng(filepath, outputFile, 300);
                 if (log.isDebugEnabled()) {
-                    log.debug("报告[{}] 生成的图片地址是:\n{}", fileName, images);
+                    log.debug("报告{} 生成的图片地址是:\n{}", fileName, images);
                 }
             } catch (Exception e) {
-                log.warn("报告[{}] 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
+                log.warn("报告{} 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
             }
         } else if (Objects.equals(ReportParserFileType.IMG, fileType)) {
             try {
@@ -553,8 +560,8 @@ public class EmailParseService {
                 result = new ParseResult<>(1, "报告解析成功", reportData);
             }
         } catch (ReportParseException e) {
-            log.warn("解析失败:{}", StrUtil.format(e.getMsg(), fileName));
             result = new ParseResult<>(e.getCode(), StrUtil.format(e.getMsg(), fileName), null);
+            log.warn("解析失败:{}", result.getMsg());
             if (e instanceof NotSupportReportException) {
                 notSupportFile = true;
             }
@@ -565,7 +572,7 @@ public class EmailParseService {
             // 如果解析结果是空的就用AI工具解析一次
             if (reportData == null && !notSupportFile) {
                 if (log.isInfoEnabled()) {
-                    log.info("报告{} 是周报或管理人月报或其他类型,用AI解析器解析", fileName);
+                    log.info("报告{} 是周报或管理人月报或其他类型或解析失败,用AI解析器解析", fileName);
                 }
                 try {
                     if (!isAmac && CollUtil.isNotEmpty(images)) {
@@ -576,8 +583,8 @@ public class EmailParseService {
                     reportData = instance.parse(params);
                     result = new ParseResult<>(1, "报告解析成功--AI", reportData);
                 } catch (ReportParseException e) {
-                    log.warn("AI解析失败:{}", StrUtil.format(e.getMsg(), fileName));
                     result = new ParseResult<>(e.getCode(), StrUtil.format(e.getMsg(), fileName), null);
+                    log.warn("AI解析失败:{}", result.getMsg());
                 } catch (Exception e) {
                     log.warn("AI解析错误:{}", ExceptionUtil.stacktraceToString(e));
                     result = new ParseResult<>(ReportParseStatus.PARSE_FAIL, null, e.getMessage());
@@ -612,10 +619,10 @@ public class EmailParseService {
     public ReportMonthlyType determineReportType(String emailTitle, String fileName,
                                                  String filepath, List<String> images) {
         // 1. 优先根据文件名判断
-        if (containsAny(fileName, AMAC_KEYWORDS)) {
+        if (ReportParseUtils.containsAny(fileName, AMAC_KEYWORDS)) {
             return ReportMonthlyType.AMAC;
         }
-        if (containsAny(fileName, MANAGER_KEYWORDS)) {
+        if (ReportParseUtils.containsAny(fileName, ReportParseUtils.MANAGER_KEYWORDS)) {
             return ReportMonthlyType.MANAGER;
         }
         if (StrUtil.isNotBlank(ReportParseUtils.matchFundCode(fileName))) {
@@ -624,21 +631,21 @@ public class EmailParseService {
         // 2. 根据文件路径判断
         List<String> pathSegments = StrUtil.split(filepath, File.separator);
         for (String segment : pathSegments) {
-            boolean isExcluded = containsAny(segment, EXCLUDE_PATH_KEYWORDS);
-            if (!isExcluded && containsAny(segment, AMAC_KEYWORDS)) {
+            boolean isExcluded = ReportParseUtils.containsAny(segment, EXCLUDE_PATH_KEYWORDS);
+            if (!isExcluded && ReportParseUtils.containsAny(segment, AMAC_KEYWORDS)) {
                 return ReportMonthlyType.AMAC;
             }
-            if (!isExcluded && containsAny(segment, MANAGER_KEYWORDS)) {
+            if (!isExcluded && ReportParseUtils.containsAny(segment, ReportParseUtils.MANAGER_KEYWORDS)) {
                 return ReportMonthlyType.MANAGER;
             }
         }
         // 3. 根据邮件主题判断
-        boolean isAmacEmail = containsAny(emailTitle, AMAC_KEYWORDS)
+        boolean isAmacEmail = ReportParseUtils.containsAny(emailTitle, AMAC_KEYWORDS)
                 && !emailTitle.contains("公司及协会版");
         if (isAmacEmail) {
             return ReportMonthlyType.AMAC;
         }
-        if (containsAny(emailTitle, MANAGER_KEYWORDS)) {
+        if (ReportParseUtils.containsAny(emailTitle, ReportParseUtils.MANAGER_KEYWORDS)) {
             return ReportMonthlyType.MANAGER;
         }
         // 4.ocr 提取“曲线”、“基金份额”等关键字,如果有曲线则是管理人,如果有估值日期则是协会
@@ -652,14 +659,6 @@ public class EmailParseService {
         return ReportMonthlyType.FAILED;
     }
 
-    // 工具方法:检查字符串是否包含任意关键词
-    private boolean containsAny(String input, Set<String> keywords) {
-        if (StrUtil.isBlank(input)) {
-            return false;
-        }
-        return keywords.stream().anyMatch(input::contains);
-    }
-
     /**
      * ocr 提取信息(包括首页的基金名称或报告日期,尾页的印章或联系人等信息)
      *
@@ -735,7 +734,7 @@ public class EmailParseService {
             ReportWriter<ReportData> instance = this.reportWriterFactory.getInstance(reportType);
             instance.write(reportData);
         } catch (Exception e) {
-            log.error("报告{}结果保存失败\n{}", fileName, ExceptionUtil.stacktraceToString(e));
+            log.error("报告{} 结果保存失败 {}", fileName, ExceptionUtil.stacktraceToString(e));
         } finally {
             writeWatch.stop();
             if (log.isInfoEnabled()) {
@@ -824,10 +823,10 @@ public class EmailParseService {
 
         Map<String, List<EmailContentInfoDTO>> result = MapUtil.newHashMap(128);
         try {
-            if (log.isInfoEnabled()) {
+            if (log.isDebugEnabled()) {
                 Folder[] list = store.getDefaultFolder().list("*");
                 List<String> names = Arrays.stream(list).map(Folder::getFullName).toList();
-                log.info("获取所有邮箱文件夹:{}", names);
+                log.debug("获取所有邮箱文件夹:{}", names);
             }
 
             for (String folderName : folderNames) {
@@ -910,7 +909,7 @@ public class EmailParseService {
                 } else if (content instanceof Part part) {
                     this.rePart(emailAddress, emailTitle, emailDate, part, dtos);
                 } else {
-                    log.warn("{} 不支持的邮件数据 {}", folderName, emailTitle);
+                    log.warn("{} 邮件{} 获取不了附件", folderName, emailTitle);
                 }
                 if (CollUtil.isEmpty(dtos)) {
                     log.warn("{} 邮件{} 没有获取到附件", folderName, emailTitle);
@@ -922,9 +921,9 @@ public class EmailParseService {
                 });
                 emailMessageMap.put(IdUtil.simpleUUID(), dtos);
             } catch (Exception e) {
-                log.error("{} 获取邮箱的邮件{} 报错,堆栈信息:{}", folderName, emailTitle, ExceptionUtil.stacktraceToString(e));
+                log.error("{} 邮件{} 下载报错 {}", folderName, emailTitle, ExceptionUtil.stacktraceToString(e));
             } finally {
-                if (log.isInfoEnabled()) {
+                if (CollUtil.isNotEmpty(dtos) && log.isInfoEnabled()) {
                     log.info("{} 邮件{} 下载完成,总计耗时{} ms,文件内容如下\n {}", folderName,
                             emailTitle, System.currentTimeMillis() - start, dtos);
                 }
@@ -953,8 +952,8 @@ public class EmailParseService {
         String disposition = part.getDisposition();
         String contentType = part.getContentType();
 
-        String[] att_files = new String[]{Constants.ARCHIVE_7Z, Constants.ARCHIVE_RAR,
-                Constants.ARCHIVE_ZIP, Constants.FILE_PDF, Constants.FILE_DOCX, Constants.FILE_JPG, Constants.FILE_PNG};
+        String[] att_files = new String[]{Constants.ARCHIVE_7Z, Constants.ARCHIVE_RAR, Constants.ARCHIVE_ZIP,
+                Constants.FILE_PDF, Constants.FILE_DOCX, Constants.FILE_JPG, Constants.FILE_PNG};
         boolean attachmentFlag = StrUtil.endWithAny(fileName, att_files);
         boolean isAttachment = attachmentFlag
                 || Part.ATTACHMENT.equalsIgnoreCase(disposition)

+ 3 - 3
mo-daq/src/test/java/com/smppw/modaq/MoDaqApplicationTests.java

@@ -41,9 +41,9 @@ public class MoDaqApplicationTests {
 
     @Test
     public void reportTest() {
-        MailboxInfoDTO emailInfoDTO = this.buildMailbox("*@simuwang.com", "*");
-        Date startDate = DateUtil.parse("2025-06-11 10:05:00", DateConst.YYYY_MM_DD_HH_MM_SS);
-        Date endDate = DateUtil.parse("2025-06-11 10:06:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        MailboxInfoDTO emailInfoDTO = this.buildMailbox("**@simuwang.com", "**");
+        Date startDate = DateUtil.parse("2025-06-12 13:56:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date endDate = DateUtil.parse("2025-06-12 17:06:00", DateConst.YYYY_MM_DD_HH_MM_SS);
         try {
             List<String> folderNames = ListUtil.list(false);
 //            folderNames.add("其他文件夹/报告公告");