Selaa lähdekoodia

fix:周报的ai提示词优化+报告日期正则匹配方法优化

wangzaijun 1 kuukausi sitten
vanhempi
commit
a42b242bba

+ 71 - 70
mo-daq/src/main/java/com/smppw/modaq/application/components/ReportParseUtils.java

@@ -10,10 +10,9 @@ import com.smppw.modaq.domain.dto.report.ReportAssetAllocationDTO;
 import jakarta.mail.internet.MimeUtility;
 
 import java.io.IOException;
-import java.util.Calendar;
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
+import java.time.YearMonth;
+import java.util.*;
+import java.util.function.Function;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
@@ -37,6 +36,51 @@ public final class ReportParseUtils {
      */
     public static final Map<String, String> ASSET_ALLOCATION_TYPE_MAPPER = MapUtil.newHashMap(32, true);
 
+    // Precompiled regexes (Pattern objects are thread-safe), each mapped to a function that turns a successful match into a date string; LinkedHashMap keeps insertion order, so entries iterate from highest to lowest priority
+    private static final Map<Pattern, Function<Matcher, String>> PATTERNS = new LinkedHashMap<>() {{
+        // Quarterly report (highest priority): year, any non-digit characters, then a quarter number followed by 季 -> quarter-end date
+        put(Pattern.compile("(20\\d{2})[^\\d]*([一二三四1234])季"), matcher -> {
+            String year = matcher.group(1);
+            return switch (matcher.group(2)) {
+                case "一", "1" -> year + "-03-31";
+                case "二", "2" -> year + "-06-30";
+                case "三", "3" -> year + "-09-30";
+                case "四", "4" -> year + "-12-31";
+                default -> null; // not reachable through the regex above, which only admits the eight values listed
+            };
+        });
+
+        // Explicit dddd-dd-dd date (second priority), returned exactly as matched without validation
+        put(Pattern.compile("\\d{4}-\\d{2}-\\d{2}"), Matcher::group);
+
+        // Annual report: "年度" and "年年度" merged into a single regex -> December 31 of that year
+        put(Pattern.compile("(20\\d{2})年(度|年度)"), matcher -> matcher.group(1) + "-12-31");
+
+        // Year-month formats handled uniformly (e.g. 2023年12月, 2023年_12月, 2023年-12月) -> last day of that month
+        put(Pattern.compile("(20\\d{2})年[_-]?(\\d{1,2})月"), matcher -> {
+            int year = Integer.parseInt(matcher.group(1));
+            int month = Integer.parseInt(matcher.group(2));
+            return formatMonthEnd(year, month);
+        });
+
+        // Compact format (e.g. 202312月) -> last day of that month
+        put(Pattern.compile("(\\d{4})(\\d{2})月"), matcher -> {
+            int year = Integer.parseInt(matcher.group(1));
+            int month = Integer.parseInt(matcher.group(2));
+            return formatMonthEnd(year, month);
+        });
+
+        // Pure-digit formats (lowest priority); the lookarounds require that no digit touches either end of the match
+        put(Pattern.compile("(?<!\\d)\\d{4}(0[1-9]|1[0-2])(?!\\d)"), matcher -> { // six digits, yyyyMM
+            String group = matcher.group();
+            int year = Integer.parseInt(group.substring(0, 4));
+            int month = Integer.parseInt(group.substring(4, 6));
+            return formatMonthEnd(year, month);
+        });
+        put(Pattern.compile("(?<!\\d)\\d{4}(0[1-9]|1[0-2])(0[1-9]|[12]\\d|3[01])(?!\\d)"), Matcher::group); // eight digits, yyyyMMdd, returned as matched (no dashes inserted)
+    }};
+
+
     static {
         // 财务指标
         FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末基金净资产");
@@ -316,74 +360,19 @@ public final class ReportParseUtils {
     /**
      * Extract the report date from the given text.
      *
-     * @param string text to search
+     * @param text text to search
      * @return the report date, or null when the text is null or no pattern yields a date
      */
-    public static String matchReportDate(String string) {
-        if (string == null) {
-            return null;
-        }
-        // Compile the regular-expression patterns
-        Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季");  // e.g. 2023年XXX3季(度\报)
-        Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");  // e.g. 2023-12-31
-        Pattern pat3 = Pattern.compile("(2\\d{3})年年度");  // e.g. 2023年年度
-        Pattern pat6 = Pattern.compile("(2\\d{3})年度");  // e.g. 2023年度
-        Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月");  // e.g. 2023年12月
-        Pattern pat7 = Pattern.compile("(\\d{4})年_(\\d{1,2})月");  // e.g. 2023年_12月
-        Pattern pat8 = Pattern.compile("(\\d{4})(\\d{2})月");  // e.g. 202312月
-        Pattern pat5 = Pattern.compile("(?<!\\d)\\d{4}(0[1-9]|1[0-2])(0[1-9]|[12]\\d|3[01])(?!\\d)");  // e.g. 20231231
-        // Create the Matcher objects
-        Matcher matcher1 = pat1.matcher(string);
-        Matcher matcher2 = pat2.matcher(string);
-        Matcher matcher3 = pat3.matcher(string);
-        Matcher matcher6 = pat6.matcher(string);
-        Matcher matcher4 = pat4.matcher(string);
-        Matcher matcher7 = pat7.matcher(string);
-        Matcher matcher8 = pat8.matcher(string);
-        Matcher matcher5 = pat5.matcher(string);
-        // Try the patterns in priority order
-        if (matcher1.find()) {
-            String year = matcher1.group(1);
-            String quarter = matcher1.group(2);
-            return switch (quarter) {
-                case "一", "1" -> year + "-03-31";
-                case "二", "2" -> year + "-06-30";
-                case "三", "3" -> year + "-09-30";
-                case "四", "4" -> year + "-12-31";
-                default -> null;
-            };
-        } else if (matcher2.find()) {
-            return matcher2.group();
-        } else if (matcher3.find()) {
-            return matcher3.group(1) + "-12-31";
-        } else if (matcher6.find()) {
-            return matcher6.group(1) + "-12-31";
-        } else {
-            // Month formats 4, 7 and 8 take precedence (8 is checked first, then 4, then 7); format 5 is used only when none of them match; null when nothing matches
-            boolean m4 = matcher4.find();
-            boolean m7 = matcher7.find();
-            boolean m8 = matcher8.find();
-            if (m4 || m7 || m8) {
-                String year;
-                String month;
-                if (m8) {
-                    year = matcher8.group(1);
-                    month = matcher8.group(2);
-                } else if (m4) {
-                    year = matcher4.group(1);
-                    month = matcher4.group(2);
-                } else {
-                    year = matcher7.group(1);
-                    month = matcher7.group(2);
-                }
-                int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
-                return year + "-" + padZero(month) + "-" + padZero(lastDayOfMonth + "");
-            } else if (matcher5.find()) {
-                return matcher5.group();
-            } else {
-                return null;
-            }
-        }
+    public static String matchReportDate(String text) {
+        return Optional.ofNullable(text) // null text -> empty Optional -> null result
+                .flatMap(str -> PATTERNS.entrySet().stream() // entries are visited in the map's iteration order
+                        .map(entry -> {
+                            Matcher matcher = entry.getKey().matcher(str);
+                            return matcher.find() ? entry.getValue().apply(matcher) : null; // null when the pattern is absent or its handler yields no date
+                        })
+                        .filter(result -> result != null) // drop the misses so a later pattern can still supply the date
+                        .findFirst()) // the first pattern that produced a date wins
+                .orElse(null);
     }
 
     /**
@@ -436,6 +425,18 @@ public final class ReportParseUtils {
         return String.format("%02d", Integer.parseInt(number));
     }
 
+    /**
+     * Formats the last day of the given month as "year-MM-dd" (month and day zero-padded to two digits); returns null when the year/month pair is rejected.
+     */
+    private static String formatMonthEnd(int year, int month) {
+        try {
+            YearMonth ym = YearMonth.of(year, month);
+            return String.format("%d-%02d-%02d", year, month, ym.lengthOfMonth());
+        } catch (Exception e) { // invalid month (e.g. month=13) yields null; NOTE(review): YearMonth.of is documented to throw DateTimeException, so this catch is broader than needed
+            return null;
+        }
+    }
+
     public static void main(String[] args) throws IOException, ReportParseException {
         String s = """
                 =?utf-8?b?5oGS5aSp5Y2D6LGh5LqM5pyf56eB5Yuf6K+B5Yi45oqV6LWE5Z+66YeRLeaBkg==?=

+ 2 - 4
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/ai/AIWeeklyReportParser.java

@@ -22,10 +22,8 @@ public class AIWeeklyReportParser extends AbstractAIReportParser<WeeklyReportDat
     @Override
     protected String prompt() { // returns the Chinese instruction text: identify fund name, fund code (regex S[A-Z0-9]{5}), fund manager, report date and contact info, answered as JSON
         return """
-                识别文件中的基金名称、基金编码、基金管理人和报告日期,
-                并且解析文件中的联系人等信息,
-                基金编码的正则表达式是`S[A-Z0-9]{5}`,联系人信息包含电话、传真、邮箱、地址和二维码,
-                如果日期是区间段则取截止日期,如果无法识别就返回空字符串,结果用json返回
+                识别报告中的基金名称、基金编码、基金管理人、报告日期和联系人信息,基金编码的正则表达式是`S[A-Z0-9]{5}`,如果没有联系人信息则返回空字符串,
+                结果用json返回
                 """;
     }
 

+ 9 - 9
mo-daq/src/main/java/com/smppw/modaq/application/task/ParseSchedulerTask.java

@@ -31,15 +31,15 @@ public class ParseSchedulerTask {
 
     @PostConstruct
     public void executeOnStartup() { // the one-off email parse below is commented out in this revision, leaving the method body empty
-        try {
-            // Periodic reports: fetch emails from the 我的文件夹.报告公告 folder; NOTE(review): the call below passes 其他文件夹/报告公告 - confirm which folder is intended
-            this.emailParseApiService.parseEmail(
-                    DateUtil.parseDateTime("2025-05-12 10:24:00"),
-                    DateUtil.parseDateTime("2025-05-12 10:25:00"),
-                    ListUtil.of("其他文件夹/报告公告"), EmailTypeConst.REPORT_EMAIL_TYPES);
-        } catch (Exception e) {
-            logger.error(ExceptionUtil.getMessage(e));
-        }
+//        try {
+//            // Periodic reports: fetch emails from the 我的文件夹.报告公告 folder; NOTE(review): the call below passes 其他文件夹/报告公告 - confirm which folder is intended
+//            this.emailParseApiService.parseEmail(
+//                    DateUtil.parseDateTime("2025-05-12 10:24:00"),
+//                    DateUtil.parseDateTime("2025-05-12 10:25:00"),
+//                    ListUtil.of("其他文件夹/报告公告"), EmailTypeConst.REPORT_EMAIL_TYPES);
+//        } catch (Exception e) {
+//            logger.error(ExceptionUtil.getMessage(e));
+//        }
     }
 
     /**