|
@@ -67,6 +67,14 @@ public class EmailParseService {
|
|
// public static final int stepSize = 10000;
|
|
// public static final int stepSize = 10000;
|
|
private static final Logger log = LoggerFactory.getLogger(EmailParseService.class);
|
|
private static final Logger log = LoggerFactory.getLogger(EmailParseService.class);
|
|
|
|
|
|
|
|
+ // 常量定义:统一管理关键词
|
|
|
|
+ private static final Set<String> AMAC_KEYWORDS = Set.of("协会", "信披");
|
|
|
|
+ private static final Set<String> MANAGER_KEYWORDS = Set.of(
|
|
|
|
+ "管理人", "公司版", "投资者月报", "运行报告", "月策略",
|
|
|
|
+ "投资者报告", "投资报告", "投资月报", "月度简报", "运行月报"
|
|
|
|
+ );
|
|
|
|
+ private static final Set<String> EXCLUDE_PATH_KEYWORDS = Set.of("公司及协会版", "公司和协会版");
|
|
|
|
+
|
|
// 扩展支持的 MIME 类型
|
|
// 扩展支持的 MIME 类型
|
|
private static final Set<String> attachmentMimePrefixes = Set.of(
|
|
private static final Set<String> attachmentMimePrefixes = Set.of(
|
|
"application/pdf",
|
|
"application/pdf",
|
|
@@ -525,22 +533,24 @@ public class EmailParseService {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ // ocr识别月报是否管理人版或协会版
|
|
|
|
+ ReportMonthlyType monthlyType = ReportMonthlyType.NO_NEED;
|
|
|
|
+ if (ReportType.MONTHLY == reportType) {
|
|
|
|
+ monthlyType = this.determineReportType(emailTitle, fileName, filepath, images);
|
|
|
|
+ }
|
|
|
|
+ boolean isAmac = reportType == ReportType.ANNUALLY || reportType == ReportType.QUARTERLY
|
|
|
|
+ || (reportType == ReportType.MONTHLY && ReportMonthlyType.AMAC == monthlyType);
|
|
// 不支持解析的格式文件
|
|
// 不支持解析的格式文件
|
|
boolean notSupportFile = false;
|
|
boolean notSupportFile = false;
|
|
// 解析报告
|
|
// 解析报告
|
|
ReportData reportData = null;
|
|
ReportData reportData = null;
|
|
ReportParserParams params = new ReportParserParams(fileId, fileName, filepath, reportType);
|
|
ReportParserParams params = new ReportParserParams(fileId, fileName, filepath, reportType);
|
|
- StopWatch parserWatch = new StopWatch();
|
|
|
|
- parserWatch.start();
|
|
|
|
|
|
+ long start = System.currentTimeMillis();
|
|
try {
|
|
try {
|
|
- if (reportType != ReportType.OTHER && reportType != ReportType.WEEKLY) {
|
|
|
|
|
|
+ if (isAmac) {
|
|
ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, fileType);
|
|
ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, fileType);
|
|
reportData = instance.parse(params);
|
|
reportData = instance.parse(params);
|
|
result = new ParseResult<>(1, "报告解析成功", reportData);
|
|
result = new ParseResult<>(1, "报告解析成功", reportData);
|
|
- } else {
|
|
|
|
- if (log.isInfoEnabled()) {
|
|
|
|
- log.info("报告{} 是周报或其他类型,直接用AI解析器解析", fileName);
|
|
|
|
- }
|
|
|
|
}
|
|
}
|
|
} catch (ReportParseException e) {
|
|
} catch (ReportParseException e) {
|
|
log.warn("解析失败:{}", StrUtil.format(e.getMsg(), fileName));
|
|
log.warn("解析失败:{}", StrUtil.format(e.getMsg(), fileName));
|
|
@@ -554,17 +564,13 @@ public class EmailParseService {
|
|
} finally {
|
|
} finally {
|
|
// 如果解析结果是空的就用AI工具解析一次
|
|
// 如果解析结果是空的就用AI工具解析一次
|
|
if (reportData == null && !notSupportFile) {
|
|
if (reportData == null && !notSupportFile) {
|
|
- if (reportType == ReportType.QUARTERLY || reportType == ReportType.ANNUALLY) {
|
|
|
|
- if (log.isInfoEnabled()) {
|
|
|
|
- log.info("报告{} 开始AI解析......", fileName);
|
|
|
|
- }
|
|
|
|
- } else if (CollUtil.isNotEmpty(images)) {
|
|
|
|
- filepath = images.get(0);
|
|
|
|
- if (log.isInfoEnabled()) {
|
|
|
|
- log.info("报告{} 用首页图片{} 开始AI解析......", fileName, filepath);
|
|
|
|
- }
|
|
|
|
|
|
+ if (log.isInfoEnabled()) {
|
|
|
|
+ log.info("报告{} 是周报或管理人月报或其他类型,用AI解析器解析", fileName);
|
|
}
|
|
}
|
|
try {
|
|
try {
|
|
|
|
+ if (!isAmac && CollUtil.isNotEmpty(images)) {
|
|
|
|
+ filepath = images.get(0);
|
|
|
|
+ }
|
|
params = new ReportParserParams(fileId, fileName, filepath, reportType);
|
|
params = new ReportParserParams(fileId, fileName, filepath, reportType);
|
|
ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, ReportParserFileType.AI);
|
|
ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, ReportParserFileType.AI);
|
|
reportData = instance.parse(params);
|
|
reportData = instance.parse(params);
|
|
@@ -576,20 +582,18 @@ public class EmailParseService {
|
|
log.warn("AI解析错误:{}", ExceptionUtil.stacktraceToString(e));
|
|
log.warn("AI解析错误:{}", ExceptionUtil.stacktraceToString(e));
|
|
result = new ParseResult<>(ReportParseStatus.PARSE_FAIL, null, e.getMessage());
|
|
result = new ParseResult<>(ReportParseStatus.PARSE_FAIL, null, e.getMessage());
|
|
}
|
|
}
|
|
- if (log.isInfoEnabled()) {
|
|
|
|
- log.info("报告{} AI解析结束!结果是:{}", fileName, reportData);
|
|
|
|
- }
|
|
|
|
|
|
+ }
|
|
|
|
+ if (log.isInfoEnabled()) {
|
|
|
|
+ log.info("报告{} 用ocr补充解析结果。补充前的结果是:\n{}", fileName, reportData);
|
|
}
|
|
}
|
|
// ocr信息提取(印章、联系人、基金名称和产品代码)
|
|
// ocr信息提取(印章、联系人、基金名称和产品代码)
|
|
this.ocrReportData(reportType, reportData, fileName, images);
|
|
this.ocrReportData(reportType, reportData, fileName, images);
|
|
- // ocr识别月报是否管理人版或协会版
|
|
|
|
- ReportMonthlyType monthlyType = this.extractMonthlyType(reportType, emailTitle, fileName, filepath, images);
|
|
|
|
|
|
+ // 设置月报类型
|
|
if (reportData != null && reportData.getBaseInfo() != null) {
|
|
if (reportData != null && reportData.getBaseInfo() != null) {
|
|
reportData.getBaseInfo().setMonthlyType(monthlyType.getType());
|
|
reportData.getBaseInfo().setMonthlyType(monthlyType.getType());
|
|
}
|
|
}
|
|
- parserWatch.stop();
|
|
|
|
if (log.isInfoEnabled()) {
|
|
if (log.isInfoEnabled()) {
|
|
- log.info("报告{}解析结果为{},耗时{}ms", fileName, reportData, parserWatch.getTotalTimeMillis());
|
|
|
|
|
|
+ log.info("报告{} 解析耗时{}ms,结果是:\n{}", fileName, (System.currentTimeMillis() - start), reportData);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// 保存报告解析结果
|
|
// 保存报告解析结果
|
|
@@ -600,53 +604,44 @@ public class EmailParseService {
|
|
/**
|
|
/**
|
|
* 判断月报类型(管理人版还是协会版)
|
|
* 判断月报类型(管理人版还是协会版)
|
|
*
|
|
*
|
|
- * @param reportType 报告类型
|
|
|
|
* @param emailTitle 邮件主题
|
|
* @param emailTitle 邮件主题
|
|
* @param fileName 报告名称
|
|
* @param fileName 报告名称
|
|
* @param filepath 报告路径
|
|
* @param filepath 报告路径
|
|
* @param images 报告的第一页和尾页图片地址(主要用于ocr提取关键信息)
|
|
* @param images 报告的第一页和尾页图片地址(主要用于ocr提取关键信息)
|
|
*/
|
|
*/
|
|
- private ReportMonthlyType extractMonthlyType(ReportType reportType, String emailTitle,
|
|
|
|
- String fileName, String filepath, List<String> images) {
|
|
|
|
- if (ReportType.MONTHLY != reportType) {
|
|
|
|
- return ReportMonthlyType.NO_NEED;
|
|
|
|
- }
|
|
|
|
- // 1.依据报告名称判断
|
|
|
|
- if (fileName.contains("协会")) {
|
|
|
|
- return ReportMonthlyType.AMAC;
|
|
|
|
- }
|
|
|
|
- String fundCode = ReportParseUtils.matchFundCode(fileName);
|
|
|
|
- if (StrUtil.isNotBlank(fundCode)) {
|
|
|
|
|
|
+ public ReportMonthlyType determineReportType(String emailTitle, String fileName,
|
|
|
|
+ String filepath, List<String> images) {
|
|
|
|
+ // 1. 优先根据文件名判断
|
|
|
|
+ if (containsAny(fileName, AMAC_KEYWORDS)) {
|
|
return ReportMonthlyType.AMAC;
|
|
return ReportMonthlyType.AMAC;
|
|
}
|
|
}
|
|
- if (fileName.contains("管理人") || fileName.contains("公司版")
|
|
|
|
- || fileName.contains("投资者月报") || fileName.contains("运行报告")
|
|
|
|
- || fileName.contains("投资者报告") || fileName.contains("投资报告")
|
|
|
|
- || fileName.contains("投资月报") || fileName.contains("月度简报")) {
|
|
|
|
|
|
+ if (containsAny(fileName, MANAGER_KEYWORDS)) {
|
|
return ReportMonthlyType.MANAGER;
|
|
return ReportMonthlyType.MANAGER;
|
|
}
|
|
}
|
|
- // 2.依据文件路径判断
|
|
|
|
- List<String> paths = StrUtil.split(filepath, File.separator);
|
|
|
|
- for (String pathSplit : paths) {
|
|
|
|
- boolean ncam = !pathSplit.contains("公司及协会版") && !pathSplit.contains("公司和协会版");
|
|
|
|
- if (ncam && pathSplit.contains("协会")) {
|
|
|
|
|
|
+ if (StrUtil.isNotBlank(ReportParseUtils.matchFundCode(fileName))) {
|
|
|
|
+ return ReportMonthlyType.AMAC;
|
|
|
|
+ }
|
|
|
|
+ // 2. 根据文件路径判断
|
|
|
|
+ List<String> pathSegments = StrUtil.split(filepath, File.separator);
|
|
|
|
+ for (String segment : pathSegments) {
|
|
|
|
+ boolean isExcluded = containsAny(segment, EXCLUDE_PATH_KEYWORDS);
|
|
|
|
+ if (!isExcluded && containsAny(segment, AMAC_KEYWORDS)) {
|
|
return ReportMonthlyType.AMAC;
|
|
return ReportMonthlyType.AMAC;
|
|
}
|
|
}
|
|
- if (ncam && (pathSplit.contains("管理人") || pathSplit.contains("公司版"))) {
|
|
|
|
|
|
+ if (!isExcluded && containsAny(segment, MANAGER_KEYWORDS)) {
|
|
return ReportMonthlyType.MANAGER;
|
|
return ReportMonthlyType.MANAGER;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- // 3.依据主题判断
|
|
|
|
- if ((emailTitle.contains("协会") || emailTitle.contains("信披")) && !emailTitle.contains("公司及协会版")) {
|
|
|
|
|
|
+ // 3. 根据邮件主题判断
|
|
|
|
+ boolean isAmacEmail = containsAny(emailTitle, AMAC_KEYWORDS)
|
|
|
|
+ && !emailTitle.contains("公司及协会版");
|
|
|
|
+ if (isAmacEmail) {
|
|
return ReportMonthlyType.AMAC;
|
|
return ReportMonthlyType.AMAC;
|
|
}
|
|
}
|
|
- if (emailTitle.contains("管理人") || emailTitle.contains("公司版")
|
|
|
|
- || emailTitle.contains("投资者月报") || emailTitle.contains("运行报告")
|
|
|
|
- || emailTitle.contains("投资者报告") || emailTitle.contains("投资报告")
|
|
|
|
- || emailTitle.contains("投资月报") || emailTitle.contains("月度简报")) {
|
|
|
|
|
|
+ if (containsAny(emailTitle, MANAGER_KEYWORDS)) {
|
|
return ReportMonthlyType.MANAGER;
|
|
return ReportMonthlyType.MANAGER;
|
|
}
|
|
}
|
|
- // 4.ocr 提取“曲线”、“基金份额”等关键字,如果有曲线则是管理人,如果有基金份额则是协会
|
|
|
|
|
|
+ // 4.ocr 提取“曲线”、“基金份额”等关键字,如果有曲线则是管理人,如果有估值日期则是协会
|
|
if (CollUtil.isNotEmpty(images)) {
|
|
if (CollUtil.isNotEmpty(images)) {
|
|
try {
|
|
try {
|
|
return new OCRReportParser().parseMonthlyType(fileName, this.ocrParserUrl, images.get(0));
|
|
return new OCRReportParser().parseMonthlyType(fileName, this.ocrParserUrl, images.get(0));
|
|
@@ -657,6 +652,14 @@ public class EmailParseService {
|
|
return ReportMonthlyType.FAILED;
|
|
return ReportMonthlyType.FAILED;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ // 工具方法:检查字符串是否包含任意关键词
|
|
|
|
+ private boolean containsAny(String input, Set<String> keywords) {
|
|
|
|
+ if (StrUtil.isBlank(input)) {
|
|
|
|
+ return false;
|
|
|
|
+ }
|
|
|
|
+ return keywords.stream().anyMatch(input::contains);
|
|
|
|
+ }
|
|
|
|
+
|
|
/**
|
|
/**
|
|
* ocr 提取信息(包括首页的基金名称或报告日期,尾页的印章或联系人等信息)
|
|
* ocr 提取信息(包括首页的基金名称或报告日期,尾页的印章或联系人等信息)
|
|
*
|
|
*
|
|
@@ -679,7 +682,7 @@ public class EmailParseService {
|
|
String imageUrl = images.size() == 1 ? images.get(0) : images.get(1);
|
|
String imageUrl = images.size() == 1 ? images.get(0) : images.get(1);
|
|
parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, imageUrl);
|
|
parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, imageUrl);
|
|
} catch (Exception e) {
|
|
} catch (Exception e) {
|
|
- log.error("报告{} OCR识别印章和联系人出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
|
|
|
|
|
|
+ log.error("报告{} OCR识别印章和联系人出错:{}", fileName, e.getMessage());
|
|
}
|
|
}
|
|
// ocr识别尾页是否包含印章和联系人信息
|
|
// ocr识别尾页是否包含印章和联系人信息
|
|
if (parseRes != null) {
|
|
if (parseRes != null) {
|
|
@@ -692,25 +695,25 @@ public class EmailParseService {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- // 用首页识别基金名称、产品代码和报告日期
|
|
|
|
- if ((reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundName()))
|
|
|
|
- || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundCode()))) {
|
|
|
|
- // 首页和尾页不相等时解析首页的数据
|
|
|
|
- if (images.size() != 1) {
|
|
|
|
- try {
|
|
|
|
- parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0));
|
|
|
|
- } catch (Exception e) {
|
|
|
|
- log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
|
|
|
|
- }
|
|
|
|
|
|
+ // 首页和尾页不相等时解析首页的数据
|
|
|
|
+ if (images.size() != 1) {
|
|
|
|
+ try {
|
|
|
|
+ parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0));
|
|
|
|
+ } catch (Exception e) {
|
|
|
|
+ log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, e.getMessage());
|
|
}
|
|
}
|
|
- // ocr 识别的结果
|
|
|
|
- if (reportData.getFundInfo() != null && parseRes != null) {
|
|
|
|
- if (StrUtil.isBlank(reportData.getFundInfo().getFundName())) {
|
|
|
|
- reportData.getFundInfo().setFundName(parseRes.getFundName());
|
|
|
|
- }
|
|
|
|
- if (StrUtil.isBlank(reportData.getFundInfo().getFundCode())) {
|
|
|
|
- reportData.getFundInfo().setFundCode(parseRes.getFundCode());
|
|
|
|
- }
|
|
|
|
|
|
+ }
|
|
|
|
+ // 用首页识别基金名称、产品代码和基金管理人
|
|
|
|
+ if (reportData.getFundInfo() != null && parseRes != null) {
|
|
|
|
+ if (StrUtil.isBlank(reportData.getFundInfo().getFundName())) {
|
|
|
|
+ reportData.getFundInfo().setFundName(parseRes.getFundName());
|
|
|
|
+ }
|
|
|
|
+ if (StrUtil.isBlank(reportData.getFundInfo().getFundCode())) {
|
|
|
|
+ reportData.getFundInfo().setFundCode(parseRes.getFundCode());
|
|
|
|
+ }
|
|
|
|
+ if (StrUtil.isBlank(reportData.getFundInfo().getCompanyName())
|
|
|
|
+ || !reportData.getFundInfo().getCompanyName().contains("有限公司")) {
|
|
|
|
+ reportData.getFundInfo().setCompanyName(parseRes.getCompanyName());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -889,14 +892,14 @@ public class EmailParseService {
|
|
log.warn("{} 邮件不满足解析条件 -> 邮件主题:{},邮件日期:{}", folderName, emailTitle, emailDateStr);
|
|
log.warn("{} 邮件不满足解析条件 -> 邮件主题:{},邮件日期:{}", folderName, emailTitle, emailDateStr);
|
|
continue;
|
|
continue;
|
|
}
|
|
}
|
|
- // 成功解析的邮件不用重复下载
|
|
|
|
- Integer okNum = this.emailParseInfoMapper.countEmailByInfoAndStatus(emailTitle, senderEmail, emailAddress, emailDateStr);
|
|
|
|
- if (okNum > 0) {
|
|
|
|
- if (log.isInfoEnabled()) {
|
|
|
|
- log.info("{} 邮件{} 已经存在解析完成的记录,不要重复下载了。", folderName, emailTitle);
|
|
|
|
- }
|
|
|
|
- continue;
|
|
|
|
- }
|
|
|
|
|
|
+// // 成功解析的邮件不用重复下载
|
|
|
|
+// Integer okNum = this.emailParseInfoMapper.countEmailByInfoAndStatus(emailTitle, senderEmail, emailAddress, emailDateStr);
|
|
|
|
+// if (okNum > 0) {
|
|
|
|
+// if (log.isInfoEnabled()) {
|
|
|
|
+// log.info("{} 邮件{} 已经存在解析完成的记录,不要重复下载了。", folderName, emailTitle);
|
|
|
|
+// }
|
|
|
|
+// continue;
|
|
|
|
+// }
|
|
if (log.isInfoEnabled()) {
|
|
if (log.isInfoEnabled()) {
|
|
log.info("{} 邮件{} 基本信息获取完成,开始下载附件!邮件日期:{}", folderName, emailTitle, emailDateStr);
|
|
log.info("{} 邮件{} 基本信息获取完成,开始下载附件!邮件日期:{}", folderName, emailTitle, emailDateStr);
|
|
}
|
|
}
|
|
@@ -959,7 +962,7 @@ public class EmailParseService {
|
|
StrUtil.startWithIgnoreCase(contentType, prefix)
|
|
StrUtil.startWithIgnoreCase(contentType, prefix)
|
|
));
|
|
));
|
|
if (!isAttachment) {
|
|
if (!isAttachment) {
|
|
- log.warn("邮件 {} 未检测到{}类型的附件 (fileName={}, disposition={}, contentType={})",
|
|
|
|
|
|
+ log.warn("邮件{} 未检测到{}类型的附件 (fileName={}, disposition={}, contentType={})",
|
|
subject, att_files, fileName, disposition, contentType);
|
|
subject, att_files, fileName, disposition, contentType);
|
|
return;
|
|
return;
|
|
}
|
|
}
|
|
@@ -976,9 +979,8 @@ public class EmailParseService {
|
|
Files.copy(is, saveFile.toPath());
|
|
Files.copy(is, saveFile.toPath());
|
|
}
|
|
}
|
|
} else {
|
|
} else {
|
|
- FileUtil.del(saveFile);
|
|
|
|
- try (InputStream is = part.getInputStream()) {
|
|
|
|
- Files.copy(is, saveFile.toPath());
|
|
|
|
|
|
+ if (log.isInfoEnabled()) {
|
|
|
|
+ log.info("邮件{} 已下载过附件:{},不用重新下载了。", subject, saveFile.toPath());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
EmailContentInfoDTO emailContentInfoDTO = new EmailContentInfoDTO();
|
|
EmailContentInfoDTO emailContentInfoDTO = new EmailContentInfoDTO();
|