Kaynağa Gözat

fix:报告日期正则匹配规则优化+已存在的报告追加邮件主题条件

wangzaijun 1 ay önce
ebeveyn
işleme
0e30c2b32d

+ 152 - 87
mo-daq/src/main/java/com/smppw/modaq/application/components/ReportParseUtils.java

@@ -3,16 +3,17 @@ package com.smppw.modaq.application.components;
 import cn.hutool.core.collection.ListUtil;
 import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.StrUtil;
+import com.smppw.modaq.application.util.EmailUtil;
 import com.smppw.modaq.common.conts.EmailTypeConst;
+import com.smppw.modaq.common.conts.PatternConsts;
 import com.smppw.modaq.common.enums.ReportType;
-import com.smppw.modaq.common.exception.ReportParseException;
 import com.smppw.modaq.domain.dto.report.ReportAssetAllocationDTO;
-import jakarta.mail.internet.MimeUtility;
 
-import java.io.IOException;
 import java.time.YearMonth;
-import java.util.*;
-import java.util.function.Function;
+import java.util.Calendar;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
@@ -36,51 +37,6 @@ public final class ReportParseUtils {
      */
     public static final Map<String, String> ASSET_ALLOCATION_TYPE_MAPPER = MapUtil.newHashMap(32, true);
 
-    // 预编译所有正则表达式(线程安全)
-    private static final Map<Pattern, Function<Matcher, String>> PATTERNS = new LinkedHashMap<>() {{
-        // 季度报告(最高优先级)
-        put(Pattern.compile("(20\\d{2})[^\\d]*([一二三四1234])季"), matcher -> {
-            String year = matcher.group(1);
-            return switch (matcher.group(2)) {
-                case "一", "1" -> year + "-03-31";
-                case "二", "2" -> year + "-06-30";
-                case "三", "3" -> year + "-09-30";
-                case "四", "4" -> year + "-12-31";
-                default -> null;
-            };
-        });
-
-        // 明确日期格式(次优先级)
-        put(Pattern.compile("\\d{4}-\\d{2}-\\d{2}"), Matcher::group);
-
-        // 年度报告(合并相似正则)
-        put(Pattern.compile("(20\\d{2})年(度|年度)"), matcher -> matcher.group(1) + "-12-31");
-
-        // 月份相关格式(统一处理)
-        put(Pattern.compile("(20\\d{2})年[_-]?(\\d{1,2})月"), matcher -> {
-            int year = Integer.parseInt(matcher.group(1));
-            int month = Integer.parseInt(matcher.group(2));
-            return formatMonthEnd(year, month);
-        });
-
-        // 紧凑格式(如202312月)
-        put(Pattern.compile("(\\d{4})(\\d{2})月"), matcher -> {
-            int year = Integer.parseInt(matcher.group(1));
-            int month = Integer.parseInt(matcher.group(2));
-            return formatMonthEnd(year, month);
-        });
-
-        // 纯数字格式(最低优先级)
-        put(Pattern.compile("(?<!\\d)\\d{4}(0[1-9]|1[0-2])(?!\\d)"), matcher -> {
-            String group = matcher.group();
-            int year = Integer.parseInt(group.substring(0, 4));
-            int month = Integer.parseInt(group.substring(4, 6));
-            return formatMonthEnd(year, month);
-        });
-        put(Pattern.compile("(?<!\\d)\\d{4}(0[1-9]|1[0-2])(0[1-9]|[12]\\d|3[01])(?!\\d)"), Matcher::group);
-    }};
-
-
     static {
         // 财务指标
         FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末基金净资产");
@@ -300,8 +256,7 @@ public final class ReportParseUtils {
             return matches;
         }
         // 使用正则表达式查找匹配项
-        Pattern pattern = Pattern.compile("[A-F]级|基金[A-F]");
-        Matcher matcher = pattern.matcher(text);
+        Matcher matcher = PatternConsts.FUND_LEVEL_PATTERN.matcher(text);
         // 收集所有匹配项
         while (matcher.find()) {
             matches.add(matcher.group());
@@ -325,8 +280,7 @@ public final class ReportParseUtils {
      */
     public static String matchFundLevel(String text) {
         // 使用正则表达式查找匹配项
-        Pattern pattern = Pattern.compile("[A-F]级|基金[A-F]");
-        Matcher matcher = pattern.matcher(text);
+        Matcher matcher = PatternConsts.FUND_LEVEL_PATTERN.matcher(text);
         String result = null;
         while (matcher.find()) {
             result = matcher.group();
@@ -349,8 +303,7 @@ public final class ReportParseUtils {
         if (StrUtil.isBlank(text)) {
             return null;
         }
-        Pattern pattern = Pattern.compile("S[A-Z0-9]{5}");
-        Matcher matcher = pattern.matcher(text);
+        Matcher matcher = PatternConsts.FUND_CODE_PATTERN.matcher(text);
         if (matcher.find()) {
             return matcher.group();
         }
@@ -358,31 +311,61 @@ public final class ReportParseUtils {
     }
 
     /**
-     * 匹配报告日期
+     * Resolves the report date from a report name, trying only the date patterns that apply
+     * to the given report type; {@code ReportType.OTHER} tries every pattern in turn
+     * (quarterly, annual, monthly, then exact day).
      *
-     * @param text 文本内容
-     * @return 报告日期
+     * @param reportType report type that selects the patterns to try; {@code null} yields {@code null}
+     * @param text       report name (or other text) to search; blank yields {@code null}
+     * @return the quarter end, year end or exact day as {@code yyyy-MM-dd}, the result of
+     *         {@code formatMonthEnd} for a monthly match, or {@code null} when nothing matches
      */
-    public static String matchReportDate(String text) {
-        return Optional.ofNullable(text)
-                .flatMap(str -> PATTERNS.entrySet().stream()
-                        .map(entry -> {
-                            Matcher matcher = entry.getKey().matcher(str);
-                            return matcher.find() ? entry.getValue().apply(matcher) : null;
-                        })
-                        .filter(Objects::nonNull)
-                        .findFirst())
-                .orElse(null);
+    public static String matchReportDate(ReportType reportType, String text) {
+        if (StrUtil.isBlank(text) || reportType == null) {
+            return null;
+        }
+        text = StrUtil.trim(text);
+        if (ReportType.QUARTERLY.equals(reportType) || ReportType.OTHER.equals(reportType)) {
+            Matcher matcher = PatternConsts.QUARTERLY_PATTERN.matcher(text);
+            if (matcher.find()) {
+                String year = matcher.group(1);
+                // A quarter is reported as its last calendar day.
+                return switch (matcher.group(2)) {
+                    case "一", "1" -> year + "-03-31";
+                    case "二", "2" -> year + "-06-30";
+                    case "三", "3" -> year + "-09-30";
+                    case "四", "4" -> year + "-12-31";
+                    default -> null;
+                };
+            }
+        }
+        if (ReportType.ANNUALLY.equals(reportType) || ReportType.OTHER.equals(reportType)) {
+            Matcher matcher = PatternConsts.ANNUALLY_PATTERN.matcher(text);
+            if (matcher.find()) {
+                // An annual report is dated at the end of its year.
+                return matcher.group(1) + "-12-31";
+            }
+        }
+        if (ReportType.MONTHLY.equals(reportType) || ReportType.OTHER.equals(reportType)) {
+            Matcher matcher = PatternConsts.MONTHLY_PATTERN.matcher(text);
+            if (matcher.find()) {
+                // NOTE(review): the pattern does not range-check the month (00 or 13+ can be
+                // captured); confirm that formatMonthEnd copes with such values.
+                int year = Integer.parseInt(matcher.group(1));
+                int month = Integer.parseInt(matcher.group(2));
+                return formatMonthEnd(year, month);
+            }
+        }
+        if (ReportType.WEEKLY.equals(reportType) || ReportType.LETTER.equals(reportType) || ReportType.OTHER.equals(reportType)) {
+            Matcher matcher = PatternConsts.STRICT_DAY_PATTERN.matcher(text);
+            if (matcher.find()) {
+                // Rebuild the date from the captured groups instead of returning the raw match:
+                // the match can be compact ("20250418") or carry separators and suffixes
+                // ("2025_04_07-", "2024年04月12号"), unlike the yyyy-MM-dd of the other branches.
+                String month = matcher.group(2);
+                String day = matcher.group(3);
+                return matcher.group(1)
+                        + "-" + (month.length() < 2 ? "0" + month : month)
+                        + "-" + (day.length() < 2 ? "0" + day : day);
+            }
+        }
+        return null;
     }
 
     /**
      * 匹配报告类型,如“季度”、“年度”
      *
      * @param emailType 邮件类型
-     * @param string    输入字符串
+     * @param text      输入字符串
      * @return 匹配到的报告类型子字符串,如果没有匹配到则返回null
      */
-    public static ReportType matchReportType(Integer emailType, String string) {
+    public static ReportType matchReportType(Integer emailType, String text) {
         if (emailType == null) {
             return null;
         }
@@ -391,26 +374,26 @@ public final class ReportParseUtils {
         boolean isAmac = Objects.equals(EmailTypeConst.REPORT_EMAIL_TYPE, emailType);
         if (!isAmac) {
             if (Objects.equals(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE, emailType)
-                    || StrUtil.containsAny(string, ReportType.LETTER.getPatterns())) {
+                    || StrUtil.containsAny(text, ReportType.LETTER.getPatterns())) {
                 reportType = ReportType.LETTER;
-            } else if (StrUtil.containsAny(string, ReportType.WEEKLY.getPatterns())) {
+            } else if (StrUtil.containsAny(text, ReportType.WEEKLY.getPatterns())) {
                 reportType = ReportType.WEEKLY;
-            } else if (StrUtil.containsAny(string, ReportType.OTHER.getPatterns())) {
+            } else if (StrUtil.containsAny(text, ReportType.OTHER.getPatterns())) {
                 reportType = ReportType.OTHER;
             }
             return reportType;
         }
         // 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报
-        Pattern pattern = Pattern.compile("(\\d{1,2})月");  // xxxx_xx月
-        Matcher matcher = pattern.matcher(string);
-        if (StrUtil.containsAny(string, ReportType.QUARTERLY.getPatterns())) {
+        Matcher monthMatcher = PatternConsts.MONTHLY_PATTERN.matcher(text);
+        Matcher dayMatcher = PatternConsts.DAY_PATTERN.matcher(text);
+        if (StrUtil.containsAny(text, ReportType.QUARTERLY.getPatterns())) {
             reportType = ReportType.QUARTERLY;
-        } else if (StrUtil.containsAny(string, ReportType.ANNUALLY.getPatterns())) {
+        } else if (StrUtil.containsAny(text, ReportType.ANNUALLY.getPatterns())) {
             reportType = ReportType.ANNUALLY;
-        } else if (StrUtil.containsAny(string, ReportType.MONTHLY.getPatterns())) {
+        } else if (StrUtil.containsAny(text, ReportType.MONTHLY.getPatterns())) {
             reportType = ReportType.MONTHLY;
-        } else if (matcher.find()) {
-            // 特殊的月报(当季度->年度->月度报告无法识别时,如果包含“\\d{1,2}月”就说明也是月报
+        } else if (monthMatcher.find() && !dayMatcher.find()) {
+            // 特殊的月报(当季度->年度->月度报告无法识别时)
             reportType = ReportType.MONTHLY;
         }
         return reportType;
@@ -439,12 +422,94 @@ public final class ReportParseUtils {
         }
     }
 
-    public static void main(String[] args) throws IOException, ReportParseException {
-        String s = """
-                =?utf-8?b?5oGS5aSp5Y2D6LGh5LqM5pyf56eB5Yuf6K+B5Yi45oqV6LWE5Z+66YeRLeaBkg==?=
-                 天千象二期私募证券投资基金2025年4月月度报告-20250512.pdf
-                """;
-        String s1 = MimeUtility.decodeText(s);
-        System.out.println(s1);
+    /**
+     * Ad-hoc manual check: for two sample file names, resolves the email type, the report type
+     * and the report date and prints them. The initial values of the three locals are
+     * overwritten before use; the commented-out blocks are earlier sample inputs.
+     *
+     * @param args ignored
+     */
+    public static void main(String[] args) {
+        String text = "私募基金2024年04月度报告";
+        Integer emailType = 1;
+        ReportType reportType = null;
+//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
+////        System.out.println(matchReportType(3, text));
+////
+////        text = "私募基金202404月度报告";
+////        System.out.println(matchReportDate(text));
+////        System.out.println(matchReportType(3, text));
+////
+////        text = "私募基金2024_04月度报告";
+////        System.out.println(matchReportDate(text));
+////        System.out.println(matchReportType(3, text));
+////
+////        text = "私募基金2024_4月度报告";
+////        System.out.println(matchReportDate(text));
+////        System.out.println(matchReportType(3, text));
+////
+////        text = "私募基金2024-04月度报告";
+////        System.out.println(matchReportDate(text));
+////        System.out.println(matchReportType(3, text));
+////
+////        text = "私募基金2024_04月";
+////        System.out.println(matchReportDate(text));
+////        System.out.println(matchReportType(3, text));
+//
+//        text = "私募基金2024年04月12号周报";
+//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
+////        System.out.println(matchReportType(3, text));
+////        System.out.println(matchReportType(5, text));
+//
+//        text = "私募基金20240412确认函";
+//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
+////        System.out.println(matchReportType(3, text));
+//
+//        text = "私募基金2024041201预警";
+//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
+////        System.out.println(matchReportType(3, text));
+//
+//        text = "私募基金_202404";
+//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
+////        System.out.println(matchReportType(3, text));
+//
+//        text = "(水印)SSA404_月报_天演金选沪深300指数增强1号私募证券投资基金_2025年_4月.pdf";
+//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
+
+//        text = "【报告披露】稳博创新一号私募证券投资基金_2024年年报_20241231.pdf";
+//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
+//
+//        text = "大岩市场中性2号私募证券投资基金_2025年_4月_月报.pdf";
+//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
+
+//        text = "投资策略调整.pdf";
+//        emailType = EmailUtil.getEmailTypeBySubject(text);
+//        System.out.println(emailType + "," + matchReportType(emailType, text));
+
+//        text = "SSH640_天演恒心精选3号私募证券投资基金(SSH640SM200010202504).pdf";
+//        emailType = EmailUtil.getEmailTypeBySubject(text);
+//        reportType = matchReportType(emailType, text);
+//        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+//
+//        text = "【复胜周度观点】 2025_04_07-2025_04_11.pdf";
+//        emailType = EmailUtil.getEmailTypeBySubject(text);
+//        reportType = matchReportType(emailType, text);
+//        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+//
+//        text = "交睿宏观配置2号私募证券投资基金B类_孙怡乐_20250429_073248861_赎回确认单.pdf";
+//        emailType = EmailUtil.getEmailTypeBySubject(text);
+//        reportType = matchReportType(emailType, text);
+//        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+//
+//        text = "远澜宏观周报20250418.pdf";
+//        emailType = EmailUtil.getEmailTypeBySubject(text);
+//        reportType = matchReportType(emailType, text);
+//        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+
+
+        text = "官方公众号市场周报披露250430(1).docx";
+        emailType = EmailUtil.getEmailTypeBySubject(text);
+        reportType = matchReportType(emailType, text);
+        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+
+
+        text = "涌津涌赢11号私募证券投资基金-月度报告-25250508.pdf";
+        emailType = EmailUtil.getEmailTypeBySubject(text);
+        reportType = matchReportType(emailType, text);
+        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+
     }
 }

+ 1 - 1
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/AbstractReportParser.java

@@ -169,7 +169,7 @@ public abstract class AbstractReportParser<T extends ReportData> implements Repo
         ReportBaseInfoDTO reportInfo = new ReportBaseInfoDTO(fileId);
         reportInfo.setReportName(reportName);
         reportInfo.setReportType(params.getReportType().name());
-        reportInfo.setReportDate(ReportParseUtils.matchReportDate(reportName));
+        reportInfo.setReportDate(ReportParseUtils.matchReportDate(params.getReportType(), reportName));
         return reportInfo;
     }
 }

+ 5 - 6
mo-daq/src/main/java/com/smppw/modaq/application/service/EmailParseApiServiceImpl.java

@@ -154,11 +154,11 @@ public class EmailParseApiServiceImpl implements EmailParseApiService {
         }
 //        // 邮件字段识别映射表
 //        Map<String, List<String>> emailFieldMap = emailParseService.getEmailFieldMapping();
-        // 邮件类型配置
-        Map<Integer, List<String>> emailTypeMap = emailParseService.getEmailType();
+//        // 邮件类型配置
+//        Map<Integer, List<String>> emailTypeMap = emailParseService.getEmailType();
 
         // 解析流程
-        List<EmailContentInfoDTO> emailContentInfoDTOList = buildEmailContentInfoDTO(emailId, emailParseInfoDO, emailFileInfoDOList, emailTypeMap);
+        List<EmailContentInfoDTO> emailContentInfoDTOList = buildEmailContentInfoDTO(emailId, emailParseInfoDO, emailFileInfoDOList);
 
 //        List<EmailFundNavDTO> emailFundNavDTOList = CollUtil.newArrayList();
         Map<EmailContentInfoDTO, List<EmailZipFileDTO>> emailZipFileMap = MapUtil.newHashMap();
@@ -231,8 +231,7 @@ public class EmailParseApiServiceImpl implements EmailParseApiService {
 
     private List<EmailContentInfoDTO> buildEmailContentInfoDTO(Integer emailId,
                                                                EmailParseInfoDO emailParseInfoDO,
-                                                               List<EmailFileInfoDO> emailFileInfoDOList,
-                                                               Map<Integer, List<String>> emailTypeMap) {
+                                                               List<EmailFileInfoDO> emailFileInfoDOList) {
         List<EmailContentInfoDTO> emailContentInfoDTOList = CollUtil.newArrayList();
         String emailDate = DateUtil.format(emailParseInfoDO.getEmailDate(), DateConst.YYYY_MM_DD_HH_MM_SS);
         String parseDate = DateUtil.format(new Date(), DateConst.YYYY_MM_DD_HH_MM_SS);
@@ -247,7 +246,7 @@ public class EmailParseApiServiceImpl implements EmailParseApiService {
             contentInfoDTO.setParseDate(parseDate);
             contentInfoDTO.setFileName(fileInfoDO.getFileName());
             contentInfoDTO.setFilePath(fileInfoDO.getFilePath());
-            Integer emailType = EmailUtil.getEmailTypeBySubject(emailParseInfoDO.getEmailTitle(), emailTypeMap);
+            Integer emailType = EmailUtil.getEmailTypeBySubject(emailParseInfoDO.getEmailTitle());
             contentInfoDTO.setEmailType(emailType);
             String emailContent = readHtmlFileContent(fileInfoDO.getFilePath());
             contentInfoDTO.setEmailContent(emailContent);

+ 74 - 12
mo-daq/src/main/java/com/smppw/modaq/application/util/EmailUtil.java

@@ -1,10 +1,14 @@
 package com.smppw.modaq.application.util;
 
+import cn.hutool.core.collection.CollUtil;
+import cn.hutool.core.collection.ListUtil;
 import cn.hutool.core.exceptions.ExceptionUtil;
 import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.StrUtil;
 import cn.hutool.extra.mail.JakartaUserPassAuthenticator;
 import com.smppw.modaq.common.conts.EmailTypeConst;
+import com.smppw.modaq.common.conts.PatternConsts;
+import com.smppw.modaq.common.enums.ReportType;
 import com.smppw.modaq.domain.dto.MailboxInfoDTO;
 import com.sun.mail.imap.IMAPStore;
 import jakarta.mail.MessagingException;
@@ -16,9 +20,11 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.UnsupportedEncodingException;
-import java.util.*;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
 import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 /**
  * @author mozuwen
@@ -251,14 +257,34 @@ public class EmailUtil {
 //        return emailContentInfoDTO;
 //    }
 
+    /**
+     * Builds the email-type to subject-keyword table. Entries are inserted in the order
+     * confirmation letter, weekly report, other report, periodic report.
+     *
+     * @return a new map on every call; the second argument {@code true} passed to
+     *         {@code MapUtil.newHashMap} requests a linked (insertion-ordered) map
+     */
+    public static Map<Integer, List<String>> getEmailType() {
+        // Periodic reports share one email type: quarterly + annual + monthly keywords.
+        List<String> periodicKeywords = ListUtil.list(true);
+        CollUtil.addAll(periodicKeywords, ReportType.QUARTERLY.getPatterns());
+        CollUtil.addAll(periodicKeywords, ReportType.ANNUALLY.getPatterns());
+        CollUtil.addAll(periodicKeywords, ReportType.MONTHLY.getPatterns());
+
+        Map<Integer, List<String>> keywordsByType = MapUtil.newHashMap(3, true);
+        keywordsByType.put(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE, ListUtil.toList(ReportType.LETTER.getPatterns()));
+        keywordsByType.put(EmailTypeConst.REPORT_WEEKLY_TYPE, ListUtil.toList(ReportType.WEEKLY.getPatterns()));
+        keywordsByType.put(EmailTypeConst.REPORT_OTHER_TYPE, ListUtil.toList(ReportType.OTHER.getPatterns()));
+        keywordsByType.put(EmailTypeConst.REPORT_EMAIL_TYPE, periodicKeywords);
+        return keywordsByType;
+    }
+
     /**
      * 判断邮件是否符合解析条件
      *
-     * @param subject      邮件主题
-     * @param emailTypeMap 邮件类型识别规则映射表
+     * @param subject 邮件主题
      * @return 邮件类型:1-净值,2-估值表,3-定期报告 -> 兜底为净值类型
      */
-    public static Integer getEmailTypeBySubject(String subject, Map<Integer, List<String>> emailTypeMap) {
+    public static Integer getEmailTypeBySubject(String subject) {
+        Map<Integer, List<String>> emailTypeMap = getEmailType();
         if (MapUtil.isEmpty(emailTypeMap) || StrUtil.isBlank(subject)) {
             return EmailTypeConst.NAV_EMAIL_TYPE;
         }
@@ -268,13 +294,12 @@ public class EmailUtil {
                     return emailTypeEntry.getKey();
                 }
             }
-            if (Objects.equals(EmailTypeConst.REPORT_EMAIL_TYPE, emailTypeEntry.getKey())) {
-                Pattern pattern = Pattern.compile("(\\d{1,2})月");  // xxxx_xx月
-                Matcher matcher = pattern.matcher(subject);
-                if (matcher.find()) {
-                    return EmailTypeConst.REPORT_EMAIL_TYPE;
-                }
-            }
+        }
+        // 特殊月报识别规则(没有月报或月度等关键字的月报)
+        Matcher monthMatcher = PatternConsts.MONTHLY_PATTERN.matcher(subject);
+        Matcher dayMatcher = PatternConsts.DAY_PATTERN.matcher(subject);
+        if (monthMatcher.find() && !dayMatcher.find()) {
+            return EmailTypeConst.REPORT_EMAIL_TYPE;
         }
         return EmailTypeConst.NAV_EMAIL_TYPE;
     }
@@ -396,4 +421,41 @@ public class EmailUtil {
 //        }
 //        return null;
 //    }
+
+    /**
+     * Ad-hoc manual check: prints the email type that {@code getEmailTypeBySubject} resolves
+     * for a handful of sample subjects.
+     * NOTE(review): the commented-out samples call matchReportDate/matchReportType, which this
+     * change set declares in ReportParseUtils; they look like leftovers copied from its main.
+     *
+     * @param args ignored
+     */
+    public static void main(String[] args) {
+        String text = "私募基金2024年04月度报告";
+        System.out.println(getEmailTypeBySubject(text));
+//
+//        text = "私募基金202404月度报告";
+//        System.out.println(matchReportDate(text));
+//        System.out.println(matchReportType(3, text));
+//
+//        text = "私募基金2024_04月度报告";
+//        System.out.println(matchReportDate(text));
+//        System.out.println(matchReportType(3, text));
+//
+//        text = "私募基金2024_4月度报告";
+//        System.out.println(matchReportDate(text));
+//        System.out.println(matchReportType(3, text));
+//
+//        text = "私募基金2024-04月度报告";
+//        System.out.println(matchReportDate(text));
+//        System.out.println(matchReportType(3, text));
+//
+//        text = "私募基金2024_04月";
+//        System.out.println(matchReportDate(text));
+//        System.out.println(matchReportType(3, text));
+
+        text = "私募基金2024年04月12号";
+        System.out.println(getEmailTypeBySubject(text));
+
+        text = "私募基金20240412";
+        System.out.println(getEmailTypeBySubject(text));
+
+        text = "私募基金2024041201";
+        System.out.println(getEmailTypeBySubject(text));
+
+        text = "私募基金_202404";
+        System.out.println(getEmailTypeBySubject(text));
+    }
 }

+ 35 - 0
mo-daq/src/main/java/com/smppw/modaq/common/conts/PatternConsts.java

@@ -0,0 +1,35 @@
+package com.smppw.modaq.common.conts;
+
+import java.util.regex.Pattern;
+
+/**
+ * Pre-compiled regular expressions shared by the report-name and email-subject parsers.
+ * <p>
+ * All date patterns only accept a year of the form {@code 20[23]\d} (2020-2039).
+ */
+public final class PatternConsts {
+    /**
+     * Quarterly report: group 1 is the year and group 2 the quarter (一二三四 or 1-4)
+     * written directly before "季"; any run of non-digits may separate the two.
+     */
+    public static final Pattern QUARTERLY_PATTERN = Pattern.compile("(20[23]\\d)\\D*([一二三四1234])季");
+    /**
+     * Annual report: group 1 is the year, followed by "年度", "年年度" or "年年报".
+     */
+    public static final Pattern ANNUALLY_PATTERN = Pattern.compile("(20[23]\\d)年(度|年度|年报)");
+    /**
+     * Monthly report: group 1 is the year and group 2 a one- or two-digit month, optionally
+     * separated by 年, "_" or "-" and optionally followed by 月. The month is not range-checked.
+     */
+    public static final Pattern MONTHLY_PATTERN = Pattern.compile("(20[23]\\d)[年_-]*(\\d{1,2})月?");
+    /**
+     * Strict year-month-day date: groups 1-3 are year, month and day. An optional trailing
+     * 日, 号, "_" or "-" is part of the match, and the match must not be followed by a digit.
+     */
+    public static final Pattern STRICT_DAY_PATTERN = Pattern.compile("(20[23]\\d)[年_-]*(\\d{1,2})[月_-]*(\\d{1,2})[日号_-]?(?!\\d)");
+    /**
+     * Loose year-month-day date: groups 1-3 are year, month and day, and the day must be
+     * followed by at least one 日, 号 or digit, so it also matches inside longer digit runs.
+     */
+    public static final Pattern DAY_PATTERN = Pattern.compile("(20[23]\\d)[年_-]*(\\d{1,2})[月_-]*(\\d{1,2})[日号\\d]+");
+
+    /**
+     * Fund code: the letter "S" followed by five upper-case letters or digits.
+     */
+    public static final Pattern FUND_CODE_PATTERN = Pattern.compile("S[A-Z0-9]{5}");
+
+    /**
+     * Fund level marker: a letter A-F followed by "级", or "基金" followed by a letter A-F.
+     */
+    public static final Pattern FUND_LEVEL_PATTERN = Pattern.compile("[A-F]级|基金[A-F]");
+
+    /** Constants holder; not meant to be instantiated. */
+    private PatternConsts() {
+    }
+}

+ 10 - 5
mo-daq/src/main/java/com/smppw/modaq/domain/dto/EmailZipFileDTO.java

@@ -2,21 +2,25 @@ package com.smppw.modaq.domain.dto;
 
 import cn.hutool.core.io.FileUtil;
 import lombok.Getter;
+import lombok.Setter;
 
 @Getter
 public class EmailZipFileDTO {
+    private final String emailTitle;
     private final String filename;
-//    private final String originalName;
     private final String filepath;
-    private final Integer emailType;
+    @Setter
+    private Integer emailType;
 
-    public EmailZipFileDTO(String filepath, Integer emailType) {
+    public EmailZipFileDTO(String emailTitle, String filepath, Integer emailType) {
+        this.emailTitle = emailTitle;
         this.filepath = filepath;
         this.emailType = emailType;
         this.filename = FileUtil.getName(filepath);
     }
 
-    public EmailZipFileDTO(String filepath, String filename, Integer emailType) {
+    public EmailZipFileDTO(String emailTitle, String filepath, String filename, Integer emailType) {
+        this.emailTitle = emailTitle;
         this.filepath = filepath;
         this.emailType = emailType;
         this.filename = filename;
@@ -25,7 +29,8 @@ public class EmailZipFileDTO {
     @Override
     public String toString() {
         return "EmailZipFileDTO{" +
-                "filename='" + filename + '\'' +
+                "emailTitle='" + emailTitle + '\'' +
+                ", filename='" + filename + '\'' +
                 ", filepath='" + filepath + '\'' +
                 ", emailType=" + emailType +
                 '}';

+ 4 - 4
mo-daq/src/main/java/com/smppw/modaq/domain/mapper/EmailFileInfoMapper.java

@@ -36,11 +36,11 @@ public interface EmailFileInfoMapper {
                               @Param("aiParse") Boolean aiParse,
                               @Param("aiFileId") String aiFileId);
 
-    int getLetterFilenameSuccessCount(@Param("filename") String filename);
+    int getLetterFilenameSuccessCount(@Param("emailTitle") String emailTitle, @Param("filename") String filename);
 
-    int getAmacFilenameSuccessCount(@Param("filename") String filename);
+    int getAmacFilenameSuccessCount(@Param("emailTitle") String emailTitle, @Param("filename") String filename);
 
-    int getWeeklyFilenameSuccessCount(@Param("filename") String filename);
+    int getWeeklyFilenameSuccessCount(@Param("emailTitle") String emailTitle, @Param("filename") String filename);
 
-    int getOtherFilenameSuccessCount(@Param("filename") String filename);
+    int getOtherFilenameSuccessCount(@Param("emailTitle") String emailTitle, @Param("filename") String filename);
 }

+ 90 - 96
mo-daq/src/main/java/com/smppw/modaq/domain/service/EmailParseService.java

@@ -202,23 +202,25 @@ public class EmailParseService {
         List<EmailZipFileDTO> resultList = ListUtil.list(false);
         Integer emailType = emailContentInfoDTO.getEmailType();
         String filepath = emailContentInfoDTO.getFilePath();
+        String emailTitle = emailContentInfoDTO.getEmailTitle();
 
         if (ExcelUtil.isZip(filepath)) {
-            handleCompressedFiles(filepath, ".zip", emailType, resultList);
+            handleCompressedFiles(emailTitle, filepath, ".zip", emailType, resultList);
         } else if (ExcelUtil.isRAR(filepath)) {
-            handleCompressedFiles(filepath, ".rar", emailType, resultList);
+            handleCompressedFiles(emailTitle, filepath, ".rar", emailType, resultList);
         }
 
         // 文件中的类型判断
         if (emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) {
-            emailType = EmailUtil.getEmailTypeBySubject(emailContentInfoDTO.getFileName(), this.getEmailType());
+            emailType = EmailUtil.getEmailTypeBySubject(emailContentInfoDTO.getFileName());
             emailContentInfoDTO.setEmailType(emailType);
         }
 
         return resultList;
     }
 
-    private void handleCompressedFiles(String filepath, String extension, Integer emailType, List<EmailZipFileDTO> resultList) throws Exception {
+    private void handleCompressedFiles(String emailTitle, String filepath, String extension,
+                                       Integer emailType, List<EmailZipFileDTO> resultList) throws Exception {
         String destPath = getDestinationPath(filepath, extension);
         log.info("压缩包地址:{}, 解压后文件地址:{}", filepath, destPath);
 
@@ -240,20 +242,20 @@ public class EmailParseService {
         for (String dir : extractedDirs) {
             // 如果邮件类型不满足解析条件则重新根据文件名判断
             if (emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) {
-                emailType = EmailUtil.getEmailTypeBySubject(dir, this.getEmailType());
+                emailType = EmailUtil.getEmailTypeBySubject(dir);
             }
             File file = new File(dir);
             if (file.isDirectory()) {
                 String[] subDirs = file.list();
                 if (subDirs != null) {
                     for (String subDir : subDirs) {
-                        resultList.add(new EmailZipFileDTO(subDir, emailType));
+                        resultList.add(new EmailZipFileDTO(emailTitle, subDir, emailType));
                     }
                 } else {
                     log.warn("目录 {} 下无文件", dir);
                 }
             } else {
-                resultList.add(new EmailZipFileDTO(dir, emailType));
+                resultList.add(new EmailZipFileDTO(emailTitle, dir, emailType));
             }
         }
     }
@@ -270,89 +272,101 @@ public class EmailParseService {
         // python 报告解析接口结果
         List<ParseResult<ReportData>> dataList = ListUtil.list(false);
         for (Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>> entry : emailZipFileMap.entrySet()) {
-            EmailContentInfoDTO emailContentInfoDTO = entry.getKey();
-            if (emailContentInfoDTO.getFileName() != null && emailContentInfoDTO.getFileName().endsWith(".html")) {
+            EmailContentInfoDTO emailDto = entry.getKey();
+            if (emailDto.getFileName() != null && emailDto.getFileName().endsWith(".html")) {
                 continue;
             }
+            String emailTitle = emailDto.getEmailTitle();
 
             // 待解析文件数据处理,不支持已存在的文件重复解析
             List<EmailZipFileDTO> dtos = ListUtil.list(false);
             List<EmailZipFileDTO> zipFiles = entry.getValue();
             if (CollUtil.isEmpty(zipFiles)) {
-                dtos.add(new EmailZipFileDTO(emailContentInfoDTO.getFilePath(), emailContentInfoDTO.getFileName(), emailContentInfoDTO.getEmailType()));
+                dtos.add(new EmailZipFileDTO(emailTitle, emailDto.getFilePath(), emailDto.getFileName(), emailDto.getEmailType()));
             } else {
                 dtos.addAll(zipFiles);
             }
-
-            String emailTitle = emailContentInfoDTO.getEmailTitle();
-            // 数据库已存在的数据过滤
-            Iterator<EmailZipFileDTO> iterator = dtos.iterator();
-            while (iterator.hasNext()) {
-                EmailZipFileDTO dto = iterator.next();
-                Integer emailType = dto.getEmailType();
+            // 重新判断类型
+            for (EmailZipFileDTO dto : dtos) {
                 String filename = dto.getFilename();
-                int count = 0;
-                if (Objects.equals(emailType, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) {
-                    // 确认单
-                    count = this.emailFileInfoMapper.getLetterFilenameSuccessCount(filename);
-                } else if (Objects.equals(emailType, EmailTypeConst.REPORT_EMAIL_TYPE)) {
-                    // 定期报告
-                    count = this.emailFileInfoMapper.getAmacFilenameSuccessCount(filename);
-                } else if (Objects.equals(emailType, EmailTypeConst.REPORT_WEEKLY_TYPE)) {
-                    // 管理人周报
-                    count = this.emailFileInfoMapper.getWeeklyFilenameSuccessCount(filename);
-                } else if (Objects.equals(emailType, EmailTypeConst.REPORT_OTHER_TYPE)) {
-                    // 其他报告
-                    count = this.emailFileInfoMapper.getOtherFilenameSuccessCount(filename);
-                } else {
-                    log.info("邮件{} 类型{} 不支持解析。", emailTitle, emailType);
-                    iterator.remove();
-                }
-                if (count > 0) {
-                    iterator.remove();
-                    log.info("邮件{} 附件{} 已存在解析成功的记录,不用重新解析。", emailTitle, filename);
-                }
-            }
-
-            if (CollUtil.isEmpty(dtos)) {
-                log.info("邮件{} 所有文件都已经解析成功过,不能重复解析了", emailTitle);
-                continue;
-            }
-
-            Integer emailId = emailContentInfoDTO.getEmailId();
-            EmailParseInfoDO emailParseInfoDO = buildEmailParseInfo(emailId, emailAddress, emailContentInfoDTO);
-            emailParseInfoDO.setEmailKey(emailKey);
-            emailId = saveEmailParseInfo(emailParseInfoDO);
-            if (emailId == null) {
-                continue;
+                Integer emailType = EmailUtil.getEmailTypeBySubject(filename);
+                dto.setEmailType(emailType);
             }
 
-            for (EmailZipFileDTO zipFile : dtos) {
-                EmailFileInfoDO emailFile = saveEmailFileInfo(emailId, zipFile.getFilename(), zipFile.getFilepath());
-                // 解析并保存报告
-                ParseResult<ReportData> parseResult = this.parseReportAndHandleResult(emailTitle, emailFile, zipFile);
-                dataList.add(parseResult);
+            for (EmailZipFileDTO dto : dtos) {
+                String filename = dto.getFilename();
+                Integer emailType = dto.getEmailType();
+                ReportType reportType = ReportParseUtils.matchReportType(emailType, filename);
+                String reportDate = ReportParseUtils.matchReportDate(reportType, filename);
+                System.out.println(filename + ",emailType=" + emailType + ",reportType=" + reportType + ",reportDate=" + reportDate);
             }
 
-            String failReason = null;
-            int emailParseStatus = EmailParseStatusConst.SUCCESS;
-            // 报告邮件有一条失败就表示整个邮件解析失败
-            if (CollUtil.isNotEmpty(dataList)) {
-                // ai解析结果
-                List<ReportData> aiParaseList = dataList.stream().map(ParseResult::getData)
-                        .filter(Objects::nonNull).filter(e -> Objects.equals(true, e.getAiParse())).toList();
-                if (CollUtil.isNotEmpty(aiParaseList)) {
-                    for (ReportData data : aiParaseList) {
-                        this.emailFileInfoMapper.updateAiParseByFileId(data.getBaseInfo().getFileId(), data.getAiParse(), data.getAiFileId());
-                    }
-                }
-                long failNum = dataList.stream().filter(e -> !Objects.equals(EmailParseStatusConst.SUCCESS, e.getStatus())).count();
-                if (failNum > 0) {
-                    emailParseStatus = EmailParseStatusConst.FAIL;
-                    failReason = dataList.stream().map(ParseResult::getMsg).collect(Collectors.joining(";"));
-                }
-            }
-            emailParseInfoMapper.updateParseStatus(emailId, emailParseStatus, failReason);
+//            // 数据库已存在的数据过滤
+//            Iterator<EmailZipFileDTO> iterator = dtos.iterator();
+//            while (iterator.hasNext()) {
+//                EmailZipFileDTO dto = iterator.next();
+//                Integer emailType = dto.getEmailType();
+//                String filename = dto.getFilename();
+//                int count = 0;
+//                if (Objects.equals(emailType, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) {
+//                    // 确认单
+//                    count = this.emailFileInfoMapper.getLetterFilenameSuccessCount(emailTitle, filename);
+//                } else if (Objects.equals(emailType, EmailTypeConst.REPORT_EMAIL_TYPE)) {
+//                    // 定期报告
+//                    count = this.emailFileInfoMapper.getAmacFilenameSuccessCount(emailTitle, filename);
+//                } else if (Objects.equals(emailType, EmailTypeConst.REPORT_WEEKLY_TYPE)) {
+//                    // 管理人周报
+//                    count = this.emailFileInfoMapper.getWeeklyFilenameSuccessCount(emailTitle, filename);
+//                } else if (Objects.equals(emailType, EmailTypeConst.REPORT_OTHER_TYPE)) {
+//                    // 其他报告
+//                    count = this.emailFileInfoMapper.getOtherFilenameSuccessCount(emailTitle, filename);
+//                } else {
+//                    log.info("邮件{} 类型{} 不支持解析。", emailTitle, emailType);
+//                    iterator.remove();
+//                }
+//                if (count > 0) {
+//                    iterator.remove();
+//                    log.info("邮件{} 附件{} 已存在解析成功的记录,不用重新解析。", emailTitle, filename);
+//                }
+//            }
+//            if (CollUtil.isEmpty(dtos)) {
+//                log.info("邮件{} 所有文件都已经解析成功过,不能重复解析了", emailTitle);
+//                continue;
+//            }
+//
+//            Integer emailId = emailDto.getEmailId();
+//            EmailParseInfoDO emailParseInfoDO = this.buildEmailParseInfo(emailId, emailAddress, emailDto);
+//            emailParseInfoDO.setEmailKey(emailKey);
+//            emailId = this.saveEmailParseInfo(emailParseInfoDO);
+//            if (emailId == null) {
+//                continue;
+//            }
+//            for (EmailZipFileDTO zipFile : dtos) {
+//                EmailFileInfoDO emailFile = this.saveEmailFileInfo(emailId, zipFile.getFilename(), zipFile.getFilepath());
+//                // 解析并保存报告
+//                ParseResult<ReportData> parseResult = this.parseReportAndHandleResult(emailTitle, emailFile, zipFile);
+//                dataList.add(parseResult);
+//            }
+//
+//            String failReason = null;
+//            int emailParseStatus = EmailParseStatusConst.SUCCESS;
+//            // 报告邮件有一条失败就表示整个邮件解析失败
+//            if (CollUtil.isNotEmpty(dataList)) {
+//                // ai解析结果
+//                List<ReportData> aiParaseList = dataList.stream().map(ParseResult::getData)
+//                        .filter(Objects::nonNull).filter(e -> Objects.equals(true, e.getAiParse())).toList();
+//                if (CollUtil.isNotEmpty(aiParaseList)) {
+//                    for (ReportData data : aiParaseList) {
+//                        this.emailFileInfoMapper.updateAiParseByFileId(data.getBaseInfo().getFileId(), data.getAiParse(), data.getAiFileId());
+//                    }
+//                }
+//                long failNum = dataList.stream().filter(e -> !Objects.equals(EmailParseStatusConst.SUCCESS, e.getStatus())).count();
+//                if (failNum > 0) {
+//                    emailParseStatus = EmailParseStatusConst.FAIL;
+//                    failReason = dataList.stream().map(ParseResult::getMsg).collect(Collectors.joining(";"));
+//                }
+//            }
+//            emailParseInfoMapper.updateParseStatus(emailId, emailParseStatus, failReason);
         }
     }
 
@@ -548,26 +562,6 @@ public class EmailParseService {
         return emailParseInfoDO;
     }
 
-    public Map<Integer, List<String>> getEmailType() { // Builds a map: email-type constant -> keyword patterns taken from the matching ReportType value(s).
-        Map<Integer, List<String>> emailTypeMap = MapUtil.newHashMap(3, true);
-        // 1. Confirmation letters
-        emailTypeMap.put(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE,
-                ListUtil.toList(ReportType.LETTER.getPatterns()));
-        // 2. Weekly reports
-        emailTypeMap.put(EmailTypeConst.REPORT_WEEKLY_TYPE,
-                ListUtil.toList(ReportType.WEEKLY.getPatterns()));
-        // 3. Other reports
-        emailTypeMap.put(EmailTypeConst.REPORT_OTHER_TYPE,
-                ListUtil.toList(ReportType.OTHER.getPatterns()));
-        // 4. Periodic reports: quarterly, annual and monthly patterns are merged (in that order) under one email type
-        List<String> types = ListUtil.list(true);
-        CollUtil.addAll(types, ReportType.QUARTERLY.getPatterns());
-        CollUtil.addAll(types, ReportType.ANNUALLY.getPatterns());
-        CollUtil.addAll(types, ReportType.MONTHLY.getPatterns());
-        emailTypeMap.put(EmailTypeConst.REPORT_EMAIL_TYPE, types);
-        return emailTypeMap;
-    }
-
     /**
      * 读取邮件
      *
@@ -651,7 +645,7 @@ public class EmailParseService {
                     continue;
                 }
                 senderEmail = getSenderEmail(message);
-                emailType = EmailUtil.getEmailTypeBySubject(emailTitle, this.getEmailType());
+                emailType = EmailUtil.getEmailTypeBySubject(emailTitle);
                 if (emailType == null) {
                     log.warn("{} 邮件不满足解析条件 -> 邮件主题:{},邮件日期:{}", folderName, emailTitle, emailDateStr);
                     continue;

+ 4 - 0
mo-daq/src/main/resources/mapper/EmailFileInfoMapper.xml

@@ -238,6 +238,7 @@
     <select id="getLetterFilenameSuccessCount" resultType="int">
         select count(1)
         from mo_email_file_info a
+                 join mo_email_parse_info e on e.id = a.email_id and e.email_title = #{emailTitle}
                  join mo_report_base_info b on b.file_id = a.id and b.report_type = 'LETTER'
                  join mo_report_fund_transaction c on a.id = c.file_id and c.fund_name is not null
                  join mo_report_investor_info d on a.id = d.file_id and d.investor_name is not null
@@ -247,6 +248,7 @@
     <select id="getAmacFilenameSuccessCount" resultType="int">
         select count(1)
         from mo_email_file_info a
+                 join mo_email_parse_info e on e.id = a.email_id and e.email_title = #{emailTitle}
                  join mo_report_base_info b on b.file_id = a.id and b.report_type in ('MONTHLY', 'QUARTERLY', 'ANNUALLY')
                  join mo_report_fund_info c on a.id = c.file_id and (c.fund_name is not null or c.fund_code is not null)
         where a.file_name = #{filename} and a.isvalid = 1
@@ -255,6 +257,7 @@
     <select id="getWeeklyFilenameSuccessCount" resultType="int">
         select count(1)
         from mo_email_file_info a
+                 join mo_email_parse_info e on e.id = a.email_id and e.email_title = #{emailTitle}
                  join mo_report_base_info b on b.file_id = a.id and b.report_type = 'WEEKLY'
                  join mo_report_fund_info c on a.id = c.file_id and (c.fund_name is not null or c.fund_code is not null)
         where a.file_name = #{filename} and a.isvalid = 1
@@ -263,6 +266,7 @@
     <!--
          Counts mo_email_file_info rows with isvalid = 1 whose file_name equals #{filename},
          whose parent mo_email_parse_info row has email_title equal to #{emailTitle},
          and that have a mo_report_base_info row with report_type 'OTHER'.
     -->
     <select id="getOtherFilenameSuccessCount" resultType="int">
         select count(1)
         from mo_email_file_info a
+                 join mo_email_parse_info e on e.id = a.email_id and e.email_title = #{emailTitle}
                  join mo_report_base_info b on b.file_id = a.id and b.report_type = 'OTHER'
         where a.file_name = #{filename} and a.isvalid = 1
     </select>

+ 4 - 4
mo-daq/src/test/java/com/smppw/modaq/MoDaqApplicationTests.java

@@ -37,15 +37,15 @@ public class MoDaqApplicationTests {
 
     @Test
     public void reportTest() {
-        MailboxInfoDTO emailInfoDTO = this.buildMailbox("wangzaijun@simuwang.com", "WZJ2twy1314");
-        Date startDate = DateUtil.parse("2025-05-13 16:50:00", DateConst.YYYY_MM_DD_HH_MM_SS);
-        Date endDate = DateUtil.parse("2025-05-13 16:53:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        MailboxInfoDTO emailInfoDTO = this.buildMailbox("***@simuwang.com", "***");
+        Date startDate = DateUtil.parse("2025-05-15 11:40:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date endDate = DateUtil.parse("2025-05-15 16:53:00", DateConst.YYYY_MM_DD_HH_MM_SS);
         try {
             List<String> folderNames = ListUtil.list(false);
 //            folderNames.add("其他文件夹/报告公告");
             folderNames.add("INBOX");
             emailParseService.parseEmail(emailInfoDTO, startDate, endDate,
-                    folderNames, ListUtil.of(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE));
+                    folderNames, EmailTypeConst.REPORT_EMAIL_TYPES);
         } catch (Exception e) {
             throw new RuntimeException(e);
         }