|
@@ -14,9 +14,11 @@ import com.smppw.modaq.application.components.report.parser.ReportParserFactory;
|
|
|
import com.smppw.modaq.application.components.report.writer.ReportWriter;
|
|
|
import com.smppw.modaq.application.components.report.writer.ReportWriterFactory;
|
|
|
import com.smppw.modaq.application.util.EmailUtil;
|
|
|
+import com.smppw.modaq.common.conts.Constants;
|
|
|
import com.smppw.modaq.common.conts.DateConst;
|
|
|
import com.smppw.modaq.common.conts.EmailParseStatusConst;
|
|
|
import com.smppw.modaq.common.conts.EmailTypeConst;
|
|
|
+import com.smppw.modaq.common.enums.ReportMonthlyType;
|
|
|
import com.smppw.modaq.common.enums.ReportParseStatus;
|
|
|
import com.smppw.modaq.common.enums.ReportParserFileType;
|
|
|
import com.smppw.modaq.common.enums.ReportType;
|
|
@@ -142,7 +144,8 @@ public class EmailParseService {
|
|
|
log.warn("未采集到正文或附件");
|
|
|
continue;
|
|
|
}
|
|
|
- log.info("开始解析邮件数据 -> 邮件主题:{},邮件日期:{}", emailContentInfoDTOList.get(0).getEmailTitle(), emailContentInfoDTOList.get(0).getEmailDate());
|
|
|
+ log.info("开始解析邮件数据 -> 邮件主题:{},邮件日期:{}",
|
|
|
+ emailContentInfoDTOList.get(0).getEmailTitle(), emailContentInfoDTOList.get(0).getEmailDate());
|
|
|
Map<EmailContentInfoDTO, List<EmailZipFileDTO>> emailZipFileMap = MapUtil.newHashMap();
|
|
|
for (EmailContentInfoDTO emailDto : emailContentInfoDTOList) {
|
|
|
// 正文不用解压附件
|
|
@@ -206,11 +209,11 @@ public class EmailParseService {
|
|
|
String emailTitle = emailContentInfoDTO.getEmailTitle();
|
|
|
|
|
|
if (ArchiveUtil.isZip(filepath)) {
|
|
|
- this.handleCompressedFiles(emailTitle, filepath, ".zip", emailType, resultList);
|
|
|
+ this.handleCompressedFiles(emailTitle, filepath, Constants.ARCHIVE_ZIP, emailType, resultList);
|
|
|
} else if (ArchiveUtil.isRAR(filepath)) {
|
|
|
- this.handleCompressedFiles(emailTitle, filepath, ".rar", emailType, resultList);
|
|
|
+ this.handleCompressedFiles(emailTitle, filepath, Constants.ARCHIVE_RAR, emailType, resultList);
|
|
|
} else if (ArchiveUtil.is7z(filepath)) {
|
|
|
- this.handleCompressedFiles(emailTitle, filepath, ".7z", emailType, resultList);
|
|
|
+ this.handleCompressedFiles(emailTitle, filepath, Constants.ARCHIVE_7Z, emailType, resultList);
|
|
|
} else {
|
|
|
// 不是压缩包时
|
|
|
EmailZipFileDTO dto = new EmailZipFileDTO(emailTitle, emailContentInfoDTO);
|
|
@@ -289,7 +292,7 @@ public class EmailParseService {
|
|
|
List<ParseResult<ReportData>> dataList = ListUtil.list(false);
|
|
|
for (Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>> entry : emailZipFileMap.entrySet()) {
|
|
|
EmailContentInfoDTO emailDto = entry.getKey();
|
|
|
- if (emailDto.getFileName() != null && emailDto.getFileName().endsWith(".html")) {
|
|
|
+ if (emailDto.getFileName() != null && emailDto.getFileName().endsWith(Constants.FILE_HTML)) {
|
|
|
continue;
|
|
|
}
|
|
|
String emailTitle = emailDto.getEmailTitle();
|
|
@@ -312,8 +315,8 @@ public class EmailParseService {
|
|
|
}
|
|
|
// 如果压缩包里面既有pdf又有其他格式的文件,说明其他格式的文件是不需要解析的
|
|
|
List<String> exts = dtos.stream().map(EmailZipFileDTO::getExtName).distinct().toList();
|
|
|
- if (exts.contains("pdf") && exts.size() > 1) {
|
|
|
- dtos.removeIf(e -> !Objects.equals("pdf", e.getExtName()));
|
|
|
+ if (exts.contains(Constants.FILE_PDF) && exts.size() > 1) {
|
|
|
+ dtos.removeIf(e -> !Objects.equals(Constants.FILE_PDF, e.getExtName()));
|
|
|
}
|
|
|
// 移除逻辑
|
|
|
Iterator<EmailZipFileDTO> removeIterator = dtos.iterator();
|
|
@@ -390,7 +393,8 @@ public class EmailParseService {
|
|
|
.filter(Objects::nonNull).filter(e -> Objects.equals(true, e.getAiParse())).toList();
|
|
|
if (CollUtil.isNotEmpty(aiParaseList)) {
|
|
|
for (ReportData data : aiParaseList) {
|
|
|
- this.emailFileInfoMapper.updateAiParseByFileId(data.getBaseInfo().getFileId(), data.getAiParse(), data.getAiFileId());
|
|
|
+ this.emailFileInfoMapper.updateAiParseByFileId(data.getBaseInfo().getFileId(),
|
|
|
+ data.getAiParse(), data.getAiFileId());
|
|
|
}
|
|
|
}
|
|
|
long failNum = dataList.stream().filter(e -> !Objects.equals(EmailParseStatusConst.SUCCESS, e.getStatus())).count();
|
|
@@ -403,13 +407,15 @@ public class EmailParseService {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- private ParseResult<ReportData> parseReportAndHandleResult(String emailTitle, EmailFileInfoDO emailFileInfo, EmailZipFileDTO zipFile) {
|
|
|
+ private ParseResult<ReportData> parseReportAndHandleResult(String emailTitle,
|
|
|
+ EmailFileInfoDO emailFileInfo,
|
|
|
+ EmailZipFileDTO zipFile) {
|
|
|
Integer emailType = zipFile.getEmailType();
|
|
|
String fileName = zipFile.getFilename();
|
|
|
String filepath = zipFile.getFilepath();
|
|
|
ParseResult<ReportData> result = new ParseResult<>();
|
|
|
boolean reportFlag = emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType);
|
|
|
- if (reportFlag || StrUtil.isBlank(fileName) || fileName.endsWith(".html")) {
|
|
|
+ if (reportFlag || StrUtil.isBlank(fileName) || fileName.endsWith(Constants.FILE_HTML)) {
|
|
|
result.setStatus(ReportParseStatus.NOT_A_REPORT.getCode());
|
|
|
result.setMsg(StrUtil.format(ReportParseStatus.NOT_A_REPORT.getMsg(), fileName));
|
|
|
log.error(result.getMsg());
|
|
@@ -424,9 +430,7 @@ public class EmailParseService {
|
|
|
}
|
|
|
}
|
|
|
// 解析器--根据文件后缀获取对应解析器,解析不了就用AI来解析
|
|
|
- ReportParserFileType fileType;
|
|
|
- String fileSuffix = StrUtil.subAfter(fileName, ".", true);
|
|
|
- fileType = ReportParserFileType.getBySuffix(fileSuffix);
|
|
|
+ ReportParserFileType fileType = ReportParserFileType.getBySuffix(zipFile.getExtName());
|
|
|
// 不支持的格式
|
|
|
if (fileType == null) {
|
|
|
result.setStatus(ReportParseStatus.NO_SUPPORT_TEMPLATE.getCode());
|
|
@@ -527,8 +531,13 @@ public class EmailParseService {
|
|
|
log.info("报告{} AI解析结束!结果是:{}", fileName, reportData);
|
|
|
}
|
|
|
}
|
|
|
- // ocr信息提取
|
|
|
+ // ocr信息提取(印章、联系人、基金名称和产品代码)
|
|
|
this.ocrReportData(reportType, reportData, fileName, images);
|
|
|
+ // ocr识别月报是否管理人版或协会版
|
|
|
+ ReportMonthlyType monthlyType = this.extractMonthlyType(reportType, emailTitle, fileName, filepath, images);
|
|
|
+ if (reportData != null && reportData.getBaseInfo() != null) {
|
|
|
+ reportData.getBaseInfo().setMonthlyType(monthlyType.getType());
|
|
|
+ }
|
|
|
parserWatch.stop();
|
|
|
if (log.isInfoEnabled()) {
|
|
|
log.info("报告{}解析结果为{},耗时{}ms", fileName, reportData, parserWatch.getTotalTimeMillis());
|
|
@@ -540,13 +549,70 @@ public class EmailParseService {
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
+ * 判断月报类型(管理人版还是协会版)
|
|
|
+ *
|
|
|
+ * @param reportType 报告类型
|
|
|
+ * @param emailTitle 邮件主题
|
|
|
+ * @param fileName 报告名称
|
|
|
+ * @param filepath 报告路径
|
|
|
+ * @param images 报告的第一页和尾页图片地址(主要用于ocr提取关键信息)
|
|
|
+ */
|
|
|
+ private ReportMonthlyType extractMonthlyType(ReportType reportType, String emailTitle,
|
|
|
+ String fileName, String filepath, List<String> images) {
|
|
|
+ if (ReportType.MONTHLY != reportType) {
|
|
|
+ return ReportMonthlyType.NO_NEED;
|
|
|
+ }
|
|
|
+ // 1.依据报告名称判断
|
|
|
+ if (fileName.contains("协会")) {
|
|
|
+ return ReportMonthlyType.AMAC;
|
|
|
+ }
|
|
|
+ String fundCode = ReportParseUtils.matchFundCode(fileName);
|
|
|
+ if (StrUtil.isNotBlank(fundCode)) {
|
|
|
+ return ReportMonthlyType.AMAC;
|
|
|
+ }
|
|
|
+ if (fileName.contains("管理人") || fileName.contains("公司版")
|
|
|
+ || fileName.contains("投资者月报") || fileName.contains("运行报告")
|
|
|
+ || fileName.contains("投资者报告") || fileName.contains("投资报告")
|
|
|
+ || fileName.contains("投资月报") || fileName.contains("月度简报")) {
|
|
|
+ return ReportMonthlyType.MANAGER;
|
|
|
+ }
|
|
|
+ // 2.依据文件路径判断
|
|
|
+ List<String> paths = StrUtil.split(filepath, File.separator);
|
|
|
+ for (String pathSplit : paths) {
|
|
|
+ boolean ncam = !pathSplit.contains("公司及协会版") && !pathSplit.contains("公司和协会版");
|
|
|
+ if (ncam && pathSplit.contains("协会")) {
|
|
|
+ return ReportMonthlyType.AMAC;
|
|
|
+ }
|
|
|
+ if (ncam && (pathSplit.contains("管理人") || pathSplit.contains("公司版"))) {
|
|
|
+ return ReportMonthlyType.MANAGER;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // 3.依据主题判断
|
|
|
+ if ((emailTitle.contains("协会") || emailTitle.contains("信披")) && !emailTitle.contains("公司及协会版")) {
|
|
|
+ return ReportMonthlyType.AMAC;
|
|
|
+ }
|
|
|
+ if (emailTitle.contains("管理人") || emailTitle.contains("公司版")
|
|
|
+ || emailTitle.contains("投资者月报") || emailTitle.contains("运行报告")
|
|
|
+ || emailTitle.contains("投资者报告") || emailTitle.contains("投资报告")
|
|
|
+ || emailTitle.contains("投资月报") || emailTitle.contains("月度简报")) {
|
|
|
+ return ReportMonthlyType.MANAGER;
|
|
|
+ }
|
|
|
+ // 4.ocr 提取“曲线”、“基金份额”等关键字,如果有曲线则是管理人,如果有基金份额则是协会
|
|
|
+ if (CollUtil.isNotEmpty(images)) {
|
|
|
+ return new OCRReportParser().parseMonthlyType(fileName, this.ocrParserUrl, images.get(0));
|
|
|
+ }
|
|
|
+ return ReportMonthlyType.FAILED;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
* ocr 提取信息(包括首页的基金名称或报告日期,尾页的印章或联系人等信息)
|
|
|
*
|
|
|
* @param reportData 报告解析结果
|
|
|
* @param fileName 报告名称
|
|
|
* @param images 报告的收益和尾页png图片
|
|
|
*/
|
|
|
- private void ocrReportData(ReportType reportType, ReportData reportData, String fileName, List<String> images) {
|
|
|
+ private void ocrReportData(ReportType reportType, ReportData reportData,
|
|
|
+ String fileName, List<String> images) {
|
|
|
if (reportData == null || CollUtil.isEmpty(images)) {
|
|
|
return;
|
|
|
}
|
|
@@ -659,15 +725,15 @@ public class EmailParseService {
|
|
|
}
|
|
|
|
|
|
private EmailParseInfoDO buildEmailParseInfo(Integer emailId, String emailAddress,
|
|
|
- EmailContentInfoDTO emailContentInfoDTO, long totalSize) {
|
|
|
+ EmailContentInfoDTO emailDto, long totalSize) {
|
|
|
EmailParseInfoDO emailParseInfoDO = new EmailParseInfoDO();
|
|
|
emailParseInfoDO.setId(emailId);
|
|
|
- emailParseInfoDO.setSenderEmail(emailContentInfoDTO.getSenderEmail());
|
|
|
+ emailParseInfoDO.setSenderEmail(emailDto.getSenderEmail());
|
|
|
emailParseInfoDO.setEmail(emailAddress);
|
|
|
- emailParseInfoDO.setEmailDate(DateUtil.parse(emailContentInfoDTO.getEmailDate(), DateConst.YYYY_MM_DD_HH_MM_SS));
|
|
|
- emailParseInfoDO.setParseDate(emailContentInfoDTO.getParseDate() == null ? null : DateUtil.parseDate(emailContentInfoDTO.getParseDate()));
|
|
|
- emailParseInfoDO.setEmailTitle(emailContentInfoDTO.getEmailTitle());
|
|
|
- emailParseInfoDO.setEmailType(emailContentInfoDTO.getEmailType());
|
|
|
+ emailParseInfoDO.setEmailDate(DateUtil.parse(emailDto.getEmailDate(), DateConst.YYYY_MM_DD_HH_MM_SS));
|
|
|
+ emailParseInfoDO.setParseDate(emailDto.getParseDate() == null ? null : DateUtil.parseDate(emailDto.getParseDate()));
|
|
|
+ emailParseInfoDO.setEmailTitle(emailDto.getEmailTitle());
|
|
|
+ emailParseInfoDO.setEmailType(emailDto.getEmailType());
|
|
|
emailParseInfoDO.setParseStatus(EmailParseStatusConst.SUCCESS);
|
|
|
emailParseInfoDO.setAttrSize(totalSize);
|
|
|
emailParseInfoDO.setIsvalid(1);
|
|
@@ -824,15 +890,17 @@ public class EmailParseService {
|
|
|
String disposition = part.getDisposition();
|
|
|
String contentType = part.getContentType();
|
|
|
|
|
|
- boolean attachmentFlag = StrUtil.endWithAny(fileName, ".zip", ".rar", ".pdf", ".png", ".jpg", ".docx", ".7z");
|
|
|
+ String[] att_files = new String[]{Constants.ARCHIVE_7Z, Constants.ARCHIVE_RAR,
|
|
|
+ Constants.ARCHIVE_ZIP, Constants.FILE_PDF, Constants.FILE_DOCX, Constants.FILE_JPG, Constants.FILE_PNG};
|
|
|
+ boolean attachmentFlag = StrUtil.endWithAny(fileName, att_files);
|
|
|
boolean isAttachment = attachmentFlag
|
|
|
|| Part.ATTACHMENT.equalsIgnoreCase(disposition)
|
|
|
|| (contentType != null && attachmentMimePrefixes.stream().anyMatch(prefix ->
|
|
|
StrUtil.startWithIgnoreCase(contentType, prefix)
|
|
|
));
|
|
|
if (!isAttachment) {
|
|
|
- log.warn("邮件 {} 未检测到pdf/zip/rar/7z/png/jpg/docx类型的附件 (fileName={}, disposition={}, contentType={})",
|
|
|
- subject, fileName, disposition, contentType);
|
|
|
+ log.warn("邮件 {} 未检测到{}类型的附件 (fileName={}, disposition={}, contentType={})",
|
|
|
+ subject, att_files, fileName, disposition, contentType);
|
|
|
return;
|
|
|
}
|
|
|
|
|
@@ -840,7 +908,7 @@ public class EmailParseService {
|
|
|
String filePath = path + File.separator + account + File.separator + emailDateStr + File.separator;
|
|
|
// 压缩包重名时的后面的压缩包会覆盖前面压缩包的问题(不考虑普通文件)
|
|
|
String emailDate = DateUtil.format(sendDate, DateConst.YYYYMMDDHHMMSS24);
|
|
|
- String realName = (fileName.endsWith(".zip") || fileName.endsWith(".rar") || fileName.endsWith(".7z")) ? emailDate + fileName : fileName;
|
|
|
+ String realName = ArchiveUtil.isArchive(fileName) ? emailDate + fileName : fileName;
|
|
|
File saveFile = FileUtil.file(filePath + realName);
|
|
|
if (!saveFile.exists()) {
|
|
|
if (!saveFile.getParentFile().exists()) {
|