|
@@ -3,16 +3,17 @@ package com.smppw.modaq.application.components;
|
|
import cn.hutool.core.collection.ListUtil;
|
|
import cn.hutool.core.collection.ListUtil;
|
|
import cn.hutool.core.map.MapUtil;
|
|
import cn.hutool.core.map.MapUtil;
|
|
import cn.hutool.core.util.StrUtil;
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
|
+import com.smppw.modaq.application.util.EmailUtil;
|
|
import com.smppw.modaq.common.conts.EmailTypeConst;
|
|
import com.smppw.modaq.common.conts.EmailTypeConst;
|
|
|
|
+import com.smppw.modaq.common.conts.PatternConsts;
|
|
import com.smppw.modaq.common.enums.ReportType;
|
|
import com.smppw.modaq.common.enums.ReportType;
|
|
-import com.smppw.modaq.common.exception.ReportParseException;
|
|
|
|
import com.smppw.modaq.domain.dto.report.ReportAssetAllocationDTO;
|
|
import com.smppw.modaq.domain.dto.report.ReportAssetAllocationDTO;
|
|
-import jakarta.mail.internet.MimeUtility;
|
|
|
|
|
|
|
|
-import java.io.IOException;
|
|
|
|
import java.time.YearMonth;
|
|
import java.time.YearMonth;
|
|
-import java.util.*;
|
|
|
|
-import java.util.function.Function;
|
|
|
|
|
|
+import java.util.Calendar;
|
|
|
|
+import java.util.List;
|
|
|
|
+import java.util.Map;
|
|
|
|
+import java.util.Objects;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
import java.util.regex.Pattern;
|
|
import java.util.stream.Collectors;
|
|
import java.util.stream.Collectors;
|
|
@@ -36,51 +37,6 @@ public final class ReportParseUtils {
|
|
*/
|
|
*/
|
|
public static final Map<String, String> ASSET_ALLOCATION_TYPE_MAPPER = MapUtil.newHashMap(32, true);
|
|
public static final Map<String, String> ASSET_ALLOCATION_TYPE_MAPPER = MapUtil.newHashMap(32, true);
|
|
|
|
|
|
- // 预编译所有正则表达式(线程安全)
|
|
|
|
- private static final Map<Pattern, Function<Matcher, String>> PATTERNS = new LinkedHashMap<>() {{
|
|
|
|
- // 季度报告(最高优先级)
|
|
|
|
- put(Pattern.compile("(20\\d{2})[^\\d]*([一二三四1234])季"), matcher -> {
|
|
|
|
- String year = matcher.group(1);
|
|
|
|
- return switch (matcher.group(2)) {
|
|
|
|
- case "一", "1" -> year + "-03-31";
|
|
|
|
- case "二", "2" -> year + "-06-30";
|
|
|
|
- case "三", "3" -> year + "-09-30";
|
|
|
|
- case "四", "4" -> year + "-12-31";
|
|
|
|
- default -> null;
|
|
|
|
- };
|
|
|
|
- });
|
|
|
|
-
|
|
|
|
- // 明确日期格式(次优先级)
|
|
|
|
- put(Pattern.compile("\\d{4}-\\d{2}-\\d{2}"), Matcher::group);
|
|
|
|
-
|
|
|
|
- // 年度报告(合并相似正则)
|
|
|
|
- put(Pattern.compile("(20\\d{2})年(度|年度)"), matcher -> matcher.group(1) + "-12-31");
|
|
|
|
-
|
|
|
|
- // 月份相关格式(统一处理)
|
|
|
|
- put(Pattern.compile("(20\\d{2})年[_-]?(\\d{1,2})月"), matcher -> {
|
|
|
|
- int year = Integer.parseInt(matcher.group(1));
|
|
|
|
- int month = Integer.parseInt(matcher.group(2));
|
|
|
|
- return formatMonthEnd(year, month);
|
|
|
|
- });
|
|
|
|
-
|
|
|
|
- // 紧凑格式(如202312月)
|
|
|
|
- put(Pattern.compile("(\\d{4})(\\d{2})月"), matcher -> {
|
|
|
|
- int year = Integer.parseInt(matcher.group(1));
|
|
|
|
- int month = Integer.parseInt(matcher.group(2));
|
|
|
|
- return formatMonthEnd(year, month);
|
|
|
|
- });
|
|
|
|
-
|
|
|
|
- // 纯数字格式(最低优先级)
|
|
|
|
- put(Pattern.compile("(?<!\\d)\\d{4}(0[1-9]|1[0-2])(?!\\d)"), matcher -> {
|
|
|
|
- String group = matcher.group();
|
|
|
|
- int year = Integer.parseInt(group.substring(0, 4));
|
|
|
|
- int month = Integer.parseInt(group.substring(4, 6));
|
|
|
|
- return formatMonthEnd(year, month);
|
|
|
|
- });
|
|
|
|
- put(Pattern.compile("(?<!\\d)\\d{4}(0[1-9]|1[0-2])(0[1-9]|[12]\\d|3[01])(?!\\d)"), Matcher::group);
|
|
|
|
- }};
|
|
|
|
-
|
|
|
|
-
|
|
|
|
static {
|
|
static {
|
|
// 财务指标
|
|
// 财务指标
|
|
FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末基金净资产");
|
|
FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末基金净资产");
|
|
@@ -300,8 +256,7 @@ public final class ReportParseUtils {
|
|
return matches;
|
|
return matches;
|
|
}
|
|
}
|
|
// 使用正则表达式查找匹配项
|
|
// 使用正则表达式查找匹配项
|
|
- Pattern pattern = Pattern.compile("[A-F]级|基金[A-F]");
|
|
|
|
- Matcher matcher = pattern.matcher(text);
|
|
|
|
|
|
+ Matcher matcher = PatternConsts.FUND_LEVEL_PATTERN.matcher(text);
|
|
// 收集所有匹配项
|
|
// 收集所有匹配项
|
|
while (matcher.find()) {
|
|
while (matcher.find()) {
|
|
matches.add(matcher.group());
|
|
matches.add(matcher.group());
|
|
@@ -325,8 +280,7 @@ public final class ReportParseUtils {
|
|
*/
|
|
*/
|
|
public static String matchFundLevel(String text) {
|
|
public static String matchFundLevel(String text) {
|
|
// 使用正则表达式查找匹配项
|
|
// 使用正则表达式查找匹配项
|
|
- Pattern pattern = Pattern.compile("[A-F]级|基金[A-F]");
|
|
|
|
- Matcher matcher = pattern.matcher(text);
|
|
|
|
|
|
+ Matcher matcher = PatternConsts.FUND_LEVEL_PATTERN.matcher(text);
|
|
String result = null;
|
|
String result = null;
|
|
while (matcher.find()) {
|
|
while (matcher.find()) {
|
|
result = matcher.group();
|
|
result = matcher.group();
|
|
@@ -349,8 +303,7 @@ public final class ReportParseUtils {
|
|
if (StrUtil.isBlank(text)) {
|
|
if (StrUtil.isBlank(text)) {
|
|
return null;
|
|
return null;
|
|
}
|
|
}
|
|
- Pattern pattern = Pattern.compile("S[A-Z0-9]{5}");
|
|
|
|
- Matcher matcher = pattern.matcher(text);
|
|
|
|
|
|
+ Matcher matcher = PatternConsts.FUND_CODE_PATTERN.matcher(text);
|
|
if (matcher.find()) {
|
|
if (matcher.find()) {
|
|
return matcher.group();
|
|
return matcher.group();
|
|
}
|
|
}
|
|
@@ -358,31 +311,61 @@ public final class ReportParseUtils {
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
/**
|
|
- * 匹配报告日期
|
|
|
|
|
|
+ * 根据报告类型和报告名称匹配报告日期
|
|
*
|
|
*
|
|
- * @param text 文本内容
|
|
|
|
|
|
+ * @param reportType 报告类型
|
|
|
|
+ * @param text 文本内容
|
|
* @return 报告日期
|
|
* @return 报告日期
|
|
*/
|
|
*/
|
|
- public static String matchReportDate(String text) {
|
|
|
|
- return Optional.ofNullable(text)
|
|
|
|
- .flatMap(str -> PATTERNS.entrySet().stream()
|
|
|
|
- .map(entry -> {
|
|
|
|
- Matcher matcher = entry.getKey().matcher(str);
|
|
|
|
- return matcher.find() ? entry.getValue().apply(matcher) : null;
|
|
|
|
- })
|
|
|
|
- .filter(Objects::nonNull)
|
|
|
|
- .findFirst())
|
|
|
|
- .orElse(null);
|
|
|
|
|
|
+ public static String matchReportDate(ReportType reportType, String text) {
|
|
|
|
+ if (StrUtil.isBlank(text) || reportType == null) {
|
|
|
|
+ return null;
|
|
|
|
+ }
|
|
|
|
+ text = StrUtil.trim(text);
|
|
|
|
+ if (ReportType.QUARTERLY.equals(reportType) || ReportType.OTHER.equals(reportType)) {
|
|
|
|
+ Matcher matcher = PatternConsts.QUARTERLY_PATTERN.matcher(text);
|
|
|
|
+ if (matcher.find()) {
|
|
|
|
+ String year = matcher.group(1);
|
|
|
|
+ return switch (matcher.group(2)) {
|
|
|
|
+ case "一", "1" -> year + "-03-31";
|
|
|
|
+ case "二", "2" -> year + "-06-30";
|
|
|
|
+ case "三", "3" -> year + "-09-30";
|
|
|
|
+ case "四", "4" -> year + "-12-31";
|
|
|
|
+ default -> null;
|
|
|
|
+ };
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if (ReportType.ANNUALLY.equals(reportType) || ReportType.OTHER.equals(reportType)) {
|
|
|
|
+ Matcher matcher = PatternConsts.ANNUALLY_PATTERN.matcher(text);
|
|
|
|
+ if (matcher.find()) {
|
|
|
|
+ return matcher.group(1) + "-12-31";
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if (ReportType.MONTHLY.equals(reportType) || ReportType.OTHER.equals(reportType)) {
|
|
|
|
+ Matcher matcher = PatternConsts.MONTHLY_PATTERN.matcher(text);
|
|
|
|
+ if (matcher.find()) {
|
|
|
|
+ int year = Integer.parseInt(matcher.group(1));
|
|
|
|
+ int month = Integer.parseInt(matcher.group(2));
|
|
|
|
+ return formatMonthEnd(year, month);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if (ReportType.WEEKLY.equals(reportType) || ReportType.LETTER.equals(reportType) || ReportType.OTHER.equals(reportType)) {
|
|
|
|
+ Matcher matcher = PatternConsts.STRICT_DAY_PATTERN.matcher(text);
|
|
|
|
+ if (matcher.find()) {
|
|
|
|
+ return matcher.group();
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ return null;
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
/**
|
|
* 匹配报告类型,如“季度”、“年度”
|
|
* 匹配报告类型,如“季度”、“年度”
|
|
*
|
|
*
|
|
* @param emailType 邮件类型
|
|
* @param emailType 邮件类型
|
|
- * @param string 输入字符串
|
|
|
|
|
|
+ * @param text 输入字符串
|
|
* @return 匹配到的报告类型,如果没有匹配到则返回null
|
|
* @return 匹配到的报告类型,如果没有匹配到则返回null
|
|
*/
|
|
*/
|
|
- public static ReportType matchReportType(Integer emailType, String string) {
|
|
|
|
|
|
+ public static ReportType matchReportType(Integer emailType, String text) {
|
|
if (emailType == null) {
|
|
if (emailType == null) {
|
|
return null;
|
|
return null;
|
|
}
|
|
}
|
|
@@ -391,26 +374,26 @@ public final class ReportParseUtils {
|
|
boolean isAmac = Objects.equals(EmailTypeConst.REPORT_EMAIL_TYPE, emailType);
|
|
boolean isAmac = Objects.equals(EmailTypeConst.REPORT_EMAIL_TYPE, emailType);
|
|
if (!isAmac) {
|
|
if (!isAmac) {
|
|
if (Objects.equals(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE, emailType)
|
|
if (Objects.equals(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE, emailType)
|
|
- || StrUtil.containsAny(string, ReportType.LETTER.getPatterns())) {
|
|
|
|
|
|
+ || StrUtil.containsAny(text, ReportType.LETTER.getPatterns())) {
|
|
reportType = ReportType.LETTER;
|
|
reportType = ReportType.LETTER;
|
|
- } else if (StrUtil.containsAny(string, ReportType.WEEKLY.getPatterns())) {
|
|
|
|
|
|
+ } else if (StrUtil.containsAny(text, ReportType.WEEKLY.getPatterns())) {
|
|
reportType = ReportType.WEEKLY;
|
|
reportType = ReportType.WEEKLY;
|
|
- } else if (StrUtil.containsAny(string, ReportType.OTHER.getPatterns())) {
|
|
|
|
|
|
+ } else if (StrUtil.containsAny(text, ReportType.OTHER.getPatterns())) {
|
|
reportType = ReportType.OTHER;
|
|
reportType = ReportType.OTHER;
|
|
}
|
|
}
|
|
return reportType;
|
|
return reportType;
|
|
}
|
|
}
|
|
// 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报
|
|
// 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报
|
|
- Pattern pattern = Pattern.compile("(\\d{1,2})月"); // xxxx_xx月
|
|
|
|
- Matcher matcher = pattern.matcher(string);
|
|
|
|
- if (StrUtil.containsAny(string, ReportType.QUARTERLY.getPatterns())) {
|
|
|
|
|
|
+ Matcher monthMatcher = PatternConsts.MONTHLY_PATTERN.matcher(text);
|
|
|
|
+ Matcher dayMatcher = PatternConsts.DAY_PATTERN.matcher(text);
|
|
|
|
+ if (StrUtil.containsAny(text, ReportType.QUARTERLY.getPatterns())) {
|
|
reportType = ReportType.QUARTERLY;
|
|
reportType = ReportType.QUARTERLY;
|
|
- } else if (StrUtil.containsAny(string, ReportType.ANNUALLY.getPatterns())) {
|
|
|
|
|
|
+ } else if (StrUtil.containsAny(text, ReportType.ANNUALLY.getPatterns())) {
|
|
reportType = ReportType.ANNUALLY;
|
|
reportType = ReportType.ANNUALLY;
|
|
- } else if (StrUtil.containsAny(string, ReportType.MONTHLY.getPatterns())) {
|
|
|
|
|
|
+ } else if (StrUtil.containsAny(text, ReportType.MONTHLY.getPatterns())) {
|
|
reportType = ReportType.MONTHLY;
|
|
reportType = ReportType.MONTHLY;
|
|
- } else if (matcher.find()) {
|
|
|
|
- // 特殊的月报(当季度->年度->月度报告无法识别时,如果包含“\\d{1,2}月”就说明也是月报)
|
|
|
|
|
|
+ } else if (monthMatcher.find() && !dayMatcher.find()) {
|
|
|
|
+ // 特殊的月报(当季度->年度->月度报告无法识别时)
|
|
reportType = ReportType.MONTHLY;
|
|
reportType = ReportType.MONTHLY;
|
|
}
|
|
}
|
|
return reportType;
|
|
return reportType;
|
|
@@ -439,12 +422,94 @@ public final class ReportParseUtils {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
- public static void main(String[] args) throws IOException, ReportParseException {
|
|
|
|
- String s = """
|
|
|
|
- =?utf-8?b?5oGS5aSp5Y2D6LGh5LqM5pyf56eB5Yuf6K+B5Yi45oqV6LWE5Z+66YeRLeaBkg==?=
|
|
|
|
- 天千象二期私募证券投资基金2025年4月月度报告-20250512.pdf
|
|
|
|
- """;
|
|
|
|
- String s1 = MimeUtility.decodeText(s);
|
|
|
|
- System.out.println(s1);
|
|
|
|
|
|
+ public static void main(String[] args) {
|
|
|
|
+ String text = "私募基金2024年04月度报告";
|
|
|
|
+ Integer emailType = 1;
|
|
|
|
+ ReportType reportType = null;
|
|
|
|
+// System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
|
|
|
|
+//// System.out.println(matchReportType(3, text));
|
|
|
|
+////
|
|
|
|
+//// text = "私募基金202404月度报告";
|
|
|
|
+//// System.out.println(matchReportDate(text));
|
|
|
|
+//// System.out.println(matchReportType(3, text));
|
|
|
|
+////
|
|
|
|
+//// text = "私募基金2024_04月度报告";
|
|
|
|
+//// System.out.println(matchReportDate(text));
|
|
|
|
+//// System.out.println(matchReportType(3, text));
|
|
|
|
+////
|
|
|
|
+//// text = "私募基金2024_4月度报告";
|
|
|
|
+//// System.out.println(matchReportDate(text));
|
|
|
|
+//// System.out.println(matchReportType(3, text));
|
|
|
|
+////
|
|
|
|
+//// text = "私募基金2024-04月度报告";
|
|
|
|
+//// System.out.println(matchReportDate(text));
|
|
|
|
+//// System.out.println(matchReportType(3, text));
|
|
|
|
+////
|
|
|
|
+//// text = "私募基金2024_04月";
|
|
|
|
+//// System.out.println(matchReportDate(text));
|
|
|
|
+//// System.out.println(matchReportType(3, text));
|
|
|
|
+//
|
|
|
|
+// text = "私募基金2024年04月12号周报";
|
|
|
|
+// System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
|
|
|
|
+//// System.out.println(matchReportType(3, text));
|
|
|
|
+//// System.out.println(matchReportType(5, text));
|
|
|
|
+//
|
|
|
|
+// text = "私募基金20240412确认函";
|
|
|
|
+// System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
|
|
|
|
+//// System.out.println(matchReportType(3, text));
|
|
|
|
+//
|
|
|
|
+// text = "私募基金2024041201预警";
|
|
|
|
+// System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
|
|
|
|
+//// System.out.println(matchReportType(3, text));
|
|
|
|
+//
|
|
|
|
+// text = "私募基金_202404";
|
|
|
|
+// System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
|
|
|
|
+//// System.out.println(matchReportType(3, text));
|
|
|
|
+//
|
|
|
|
+// text = "(水印)SSA404_月报_天演金选沪深300指数增强1号私募证券投资基金_2025年_4月.pdf";
|
|
|
|
+// System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
|
|
|
|
+
|
|
|
|
+// text = "【报告披露】稳博创新一号私募证券投资基金_2024年年报_20241231.pdf";
|
|
|
|
+// System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
|
|
|
|
+//
|
|
|
|
+// text = "大岩市场中性2号私募证券投资基金_2025年_4月_月报.pdf";
|
|
|
|
+// System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
|
|
|
|
+
|
|
|
|
+// text = "投资策略调整.pdf";
|
|
|
|
+// emailType = EmailUtil.getEmailTypeBySubject(text);
|
|
|
|
+// System.out.println(emailType + "," + matchReportType(emailType, text));
|
|
|
|
+
|
|
|
|
+// text = "SSH640_天演恒心精选3号私募证券投资基金(SSH640SM200010202504).pdf";
|
|
|
|
+// emailType = EmailUtil.getEmailTypeBySubject(text);
|
|
|
|
+// reportType = matchReportType(emailType, text);
|
|
|
|
+// System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
|
|
|
|
+//
|
|
|
|
+// text = "【复胜周度观点】 2025_04_07-2025_04_11.pdf";
|
|
|
|
+// emailType = EmailUtil.getEmailTypeBySubject(text);
|
|
|
|
+// reportType = matchReportType(emailType, text);
|
|
|
|
+// System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
|
|
|
|
+//
|
|
|
|
+// text = "交睿宏观配置2号私募证券投资基金B类_孙怡乐_20250429_073248861_赎回确认单.pdf";
|
|
|
|
+// emailType = EmailUtil.getEmailTypeBySubject(text);
|
|
|
|
+// reportType = matchReportType(emailType, text);
|
|
|
|
+// System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
|
|
|
|
+//
|
|
|
|
+// text = "远澜宏观周报20250418.pdf";
|
|
|
|
+// emailType = EmailUtil.getEmailTypeBySubject(text);
|
|
|
|
+// reportType = matchReportType(emailType, text);
|
|
|
|
+// System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ text = "官方公众号市场周报披露250430(1).docx";
|
|
|
|
+ emailType = EmailUtil.getEmailTypeBySubject(text);
|
|
|
|
+ reportType = matchReportType(emailType, text);
|
|
|
|
+ System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ text = "涌津涌赢11号私募证券投资基金-月度报告-25250508.pdf";
|
|
|
|
+ emailType = EmailUtil.getEmailTypeBySubject(text);
|
|
|
|
+ reportType = matchReportType(emailType, text);
|
|
|
|
+ System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
|
|
|
|
+
|
|
}
|
|
}
|
|
}
|
|
}
|