package com.smppw.modaq.domain.service; import cn.hutool.core.collection.CollUtil; import cn.hutool.core.collection.ListUtil; import cn.hutool.core.date.DateUtil; import cn.hutool.core.exceptions.ExceptionUtil; import cn.hutool.core.io.FileUtil; import cn.hutool.core.map.MapUtil; import cn.hutool.core.util.IdUtil; import cn.hutool.core.util.StrUtil; import com.smppw.modaq.application.components.OCRReportParser; import com.smppw.modaq.application.components.ReportParseUtils; import com.smppw.modaq.application.components.report.parser.ReportParser; import com.smppw.modaq.application.components.report.parser.ReportParserFactory; import com.smppw.modaq.application.components.report.writer.ReportWriter; import com.smppw.modaq.application.components.report.writer.ReportWriterFactory; import com.smppw.modaq.application.util.EmailUtil; import com.smppw.modaq.common.conts.Constants; import com.smppw.modaq.common.conts.DateConst; import com.smppw.modaq.common.conts.EmailParseStatusConst; import com.smppw.modaq.common.conts.EmailTypeConst; import com.smppw.modaq.common.enums.ReportMonthlyType; import com.smppw.modaq.common.enums.ReportParseStatus; import com.smppw.modaq.common.enums.ReportParserFileType; import com.smppw.modaq.common.enums.ReportType; import com.smppw.modaq.common.exception.NotSupportReportException; import com.smppw.modaq.common.exception.ReportParseException; import com.smppw.modaq.domain.dto.*; import com.smppw.modaq.domain.dto.report.*; import com.smppw.modaq.domain.dto.report.ocr.OCRLetterParseData; import com.smppw.modaq.domain.dto.report.ocr.OCRParseData; import com.smppw.modaq.domain.entity.EmailFileInfoDO; import com.smppw.modaq.domain.entity.EmailParseInfoDO; import com.smppw.modaq.domain.mapper.EmailFileInfoMapper; import com.smppw.modaq.domain.mapper.EmailParseInfoMapper; import com.smppw.modaq.infrastructure.util.ArchiveUtil; import com.smppw.modaq.infrastructure.util.PdfUtil; import jakarta.mail.*; import jakarta.mail.internet.MimeUtility; import jakarta.mail.search.ComparisonTerm; import jakarta.mail.search.ReceivedDateTerm; import jakarta.mail.search.SearchTerm; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; import org.springframework.util.StopWatch; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; /** * @author mozuwen * @date 2024-09-04 * @description 邮件解析服务 */ @Service public class EmailParseService { // public static final int stepSize = 10000; private static final Logger log = LoggerFactory.getLogger(EmailParseService.class); // 常量定义:统一管理关键词 private static final Set AMAC_KEYWORDS = Set.of("协会", "信披"); private static final Set EXCLUDE_PATH_KEYWORDS = Set.of("公司及协会版", "公司和协会版"); // 扩展支持的 MIME 类型 private static final Set attachmentMimePrefixes = Set.of( "application/pdf", "application/zip", "application/x-zip-compressed", "application/rar", "application/x-rar-compressed", "application/octet-stream" // 按需添加其他类型... ); private final EmailParseInfoMapper emailParseInfoMapper; private final EmailFileInfoMapper emailFileInfoMapper; /* 报告解析和入库的方法 */ private final ReportParserFactory reportParserFactory; private final ReportWriterFactory reportWriterFactory; @Value("${email.file.path}") private String path; @Value("${email.report.ocr-parser-url}") private String ocrParserUrl; @Value("${email.read-write-seen:true}") private boolean readWriteSeen; public EmailParseService(EmailParseInfoMapper emailParseInfoMapper, EmailFileInfoMapper emailFileInfoMapper, ReportParserFactory reportParserFactory, ReportWriterFactory reportWriterFactory) { this.emailParseInfoMapper = emailParseInfoMapper; this.emailFileInfoMapper = emailFileInfoMapper; this.reportParserFactory = reportParserFactory; this.reportWriterFactory = reportWriterFactory; } /** * 解析指定邮箱指定时间范围内的邮件 * * @param mailboxInfoDTO 邮箱配置信息 * @param startDate 邮件起始日期(yyyy-MM-dd HH:mm:ss) * @param endDate 邮件截止日期(yyyy-MM-dd HH:mm:ss, 为null,将解析邮件日期小于等于startDate的当天邮件) * @param emailTypes 当前任务支持的邮件类型,默认支持确认单 */ public void parseEmail(MailboxInfoDTO mailboxInfoDTO, Date startDate, Date endDate, List folderNames, List emailTypes) { if (CollUtil.isEmpty(emailTypes)) { emailTypes = ListUtil.of(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE); } if (log.isInfoEnabled()) { log.info("开始邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO, DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS)); } Map> emailContentMap; try { emailContentMap = this.realEmail(mailboxInfoDTO, startDate, endDate, folderNames); } catch (Exception e) { log.error("采集邮件失败 -> 邮箱配置信息:{},堆栈信息:{}", mailboxInfoDTO, ExceptionUtil.stacktraceToString(e)); return; } if (MapUtil.isEmpty(emailContentMap)) { log.warn("未采集到邮件 -> 邮箱配置信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO, DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS)); return; } for (Map.Entry> emailEntry : emailContentMap.entrySet()) { List emailContentInfoDTOList = emailEntry.getValue(); if (CollUtil.isEmpty(emailContentInfoDTOList)) { log.warn("未采集到正文或附件"); continue; } EmailContentInfoDTO dto = emailContentInfoDTOList.get(0); String emailTitle = dto.getEmailTitle(); log.info("开始解析邮件数据 -> 邮件主题:{},邮件日期:{}", emailTitle, dto.getEmailDate()); List emailFileList = ListUtil.list(false); EmailInfoDTO emailInfo = new EmailInfoDTO(dto, emailFileList); for (EmailContentInfoDTO emailDto : emailContentInfoDTOList) { // 正文不用解压附件 if (emailDto.getFileName() != null && emailDto.getFileName().endsWith(Constants.FILE_HTML)) { continue; } try { emailFileList.addAll(this.parseZipEmail(emailDto)); } catch (IOException e) { log.error("压缩包解压失败:{}", ExceptionUtil.stacktraceToString(e)); EmailParseInfoDO fail = buildEmailParseInfo(mailboxInfoDTO.getAccount(), dto.getEmailType(), emailInfo, emailDto.getFileSize()); fail.setFailReason("压缩包解压失败"); fail.setParseStatus(EmailParseStatusConst.FAIL); fail.setEmailKey(emailEntry.getKey()); this.emailParseInfoMapper.insert(fail); } catch (Exception e) { log.error("堆栈信息:{}", ExceptionUtil.stacktraceToString(e)); } } // 重新判断类型 for (EmailZipFileDTO emailFile : emailFileList) { if (EmailTypeConst.SUPPORT_NO_OTHER_TYPES.contains(emailFile.getEmailType())) { continue; } Integer type = EmailUtil.getEmailTypeBySubject(emailTitle + emailFile.getFilename()); // 特殊月报 if ((Objects.equals(EmailTypeConst.NAV_EMAIL_TYPE, type) || Objects.equals(EmailTypeConst.REPORT_OTHER_TYPE, type)) && (ReportParseUtils.containsAny(emailTitle, ReportParseUtils.MANAGER_KEYWORDS) || emailTitle.contains("定期报告"))) { type = EmailTypeConst.REPORT_EMAIL_TYPE; } // 其他报告 if (Objects.equals(EmailTypeConst.NAV_EMAIL_TYPE, type)) { type = EmailTypeConst.REPORT_OTHER_TYPE; } emailFile.setEmailType(type); } Iterator entryIterator = emailFileList.iterator(); while (entryIterator.hasNext()) { EmailZipFileDTO entry = entryIterator.next(); if (!emailTypes.contains(entry.getEmailType())) { log.warn("当前邮件{} 文件{} 的类型{} 不在支持的任务类型{} 中,不用执行解析逻辑。", entry.getEmailTitle(), entry.getFilepath(), entry.getEmailType(), emailTypes); entryIterator.remove(); } } // 保存相关信息 -> 邮件信息表,邮件文件表,邮件净值表,邮件规模表,基金净值表 saveRelatedTable(emailEntry.getKey(), mailboxInfoDTO.getAccount(), emailInfo); log.info("结束邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", emailEntry.getValue(), DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS)); } } /** * 解压压缩包,如果不是压缩包需转换 * * @param emailContentInfoDTO 邮件信息 * @return 解压后的文件列表 * @throws IOException / */ public List parseZipEmail(EmailContentInfoDTO emailContentInfoDTO) throws IOException { List resultList = ListUtil.list(false); Integer emailType = emailContentInfoDTO.getEmailType(); String filepath = emailContentInfoDTO.getFilePath(); String emailTitle = emailContentInfoDTO.getEmailTitle(); if (ArchiveUtil.isArchive(filepath)) { this.handleCompressedFiles(emailTitle, filepath, emailType, resultList); } else { // 不是压缩包时 EmailZipFileDTO dto = new EmailZipFileDTO(emailTitle, emailContentInfoDTO); resultList.add(dto); } // 文件中的类型判断 if (emailType == null || !EmailTypeConst.SUPPORT_NO_OTHER_TYPES.contains(emailType)) { emailType = EmailUtil.getEmailTypeBySubject(emailContentInfoDTO.getFileName()); emailContentInfoDTO.setEmailType(emailType); } if (CollUtil.isNotEmpty(resultList)) { for (EmailZipFileDTO dto : resultList) { dto.setEmailType(emailType); } } return resultList; } /** * 解压压缩包并把压缩包里面的所有文件放在resultList中 * * @param emailTitle 邮件主题 * @param filepath 压缩包路径 * @param emailType 邮件解析类型 * @param resultList 解压结果列表 * @throws IOException / */ private void handleCompressedFiles(String emailTitle, String filepath, Integer emailType, List resultList) throws IOException { String parent = FileUtil.getParent(filepath, 2); String destPath = parent + File.separator + "archive" + File.separator + FileUtil.mainName(filepath); File destFile = new File(destPath); if (!destFile.exists()) { if (!destFile.mkdirs()) { throw new IOException("无法创建目标目录: " + destPath); } } List extractedDirs; if (ArchiveUtil.isZip(filepath)) { extractedDirs = ArchiveUtil.extractCompressedFiles(filepath, destPath); } else if (ArchiveUtil.isRAR(filepath) || ArchiveUtil.is7z(filepath)) { // 7z和rar压缩包解压 extractedDirs = ArchiveUtil.extractRar5(filepath, destPath); } else { return; } for (String dir : extractedDirs) { // 如果邮件类型不满足解析条件则重新根据文件名判断 if (emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) { emailType = EmailUtil.getEmailTypeBySubject(dir); } File file = new File(dir); if (file.isDirectory()) { String[] subDirs = file.list(); if (subDirs != null) { for (String subDir : subDirs) { resultList.add(new EmailZipFileDTO(emailTitle, subDir, emailType)); } } else { log.warn("目录 {} 下无文件", dir); } } else { resultList.add(new EmailZipFileDTO(emailTitle, dir, emailType)); } } } /** * 邮件附件解析并保存结果数据 * * @param emailKey 没封邮件的uuid * @param emailAddress 发送人地址 * @param emailInfo 邮件信息 */ public void saveRelatedTable(String emailKey, String emailAddress, EmailInfoDTO emailInfo) { // 附件文件检查 Long totalSize = this.checkEmailFileInfo(emailInfo); if (totalSize == null) { return; } // 解析并保存数据 List> dataList = ListUtil.list(true); Integer emailId = this.parseResults(null, emailKey, emailAddress, totalSize, emailInfo, dataList); String failReason = null; int emailParseStatus = EmailParseStatusConst.SUCCESS; // 报告邮件有一条失败就表示整个邮件解析失败 if (CollUtil.isNotEmpty(dataList)) { // ai解析结果 List aiParaseList = dataList.stream().map(ParseResult::getData) .filter(Objects::nonNull).filter(e -> Objects.equals(true, e.getAiParse())).toList(); if (CollUtil.isNotEmpty(aiParaseList)) { for (ReportData data : aiParaseList) { this.emailFileInfoMapper.updateAiParseByFileId(data.getBaseInfo().getFileId(), data.getAiParse(), data.getAiFileId()); } } long failNum = dataList.stream().filter(e -> !Objects.equals(EmailParseStatusConst.SUCCESS, e.getStatus())).count(); if (failNum > 0) { emailParseStatus = EmailParseStatusConst.FAIL; failReason = dataList.stream().map(ParseResult::getMsg).collect(Collectors.joining(";")); } } this.emailParseInfoMapper.updateParseStatus(emailId, emailParseStatus, failReason); } /** * 上传文件解析并返回解析状态 * * @param params 上传文件路径 * @return / */ public List uploadReportResults(UploadReportParams params) { List> dataList = ListUtil.list(false); List reportInfos = params.getReportInfos(); List dtos = ListUtil.list(false); for (UploadReportParams.ReportInfo e : reportInfos) { String reportPath = e.getReportPath(); if (ArchiveUtil.isArchive(reportPath)) { try { this.handleCompressedFiles(params.getTitle(), reportPath, e.getReportType(), dtos); } catch (Exception ex) { log.warn("报告{} 压缩包解压失败:{}", reportPath, ExceptionUtil.stacktraceToString(ex)); ReportData reportData = new ReportData.DefaultReportData(); reportData.setReportPath(reportPath); dataList.add(new ParseResult<>(ReportParseStatus.ARCHIVE_FAIL, reportData)); } } else { dtos.add(new EmailZipFileDTO(params.getTitle(), reportPath, e.getReportType())); } } EmailInfoDTO emailInfo = new EmailInfoDTO(params.getTitle(), dtos); // 附件文件检查 Long totalSize = this.checkEmailFileInfo(emailInfo); if (totalSize == null) { return null; } this.parseResults(-1, null, null, totalSize, emailInfo, dataList); List resultList = ListUtil.list(false); for (ParseResult result : dataList) { ReportData data = result.getData(); resultList.add(new UploadReportResult(data.getReportPath(), result.getStatus(), result.getMsg())); } return resultList; } /** * 邮件信息前置处理,在解析操作执行之前的过滤逻辑和校验逻辑。返回所有附件大小汇总 * * @param emailInfo 邮件信息(包含所有解压后的文件) * @return 所有附件大小汇总,为null说明没有文件需要上传 */ private Long checkEmailFileInfo(EmailInfoDTO emailInfo) { String emailTitle = emailInfo.getEmailTitle(); List dtos = emailInfo.getEmailFileList(); // 如果压缩包里面既有pdf又有其他格式的文件,说明其他格式的文件是不需要解析的 List exts = dtos.stream().map(EmailZipFileDTO::getExtName).distinct().toList(); if (exts.contains(Constants.FILE_PDF) && exts.size() > 1) { dtos.removeIf(e -> !Objects.equals(Constants.FILE_PDF, e.getExtName())); } // 移除逻辑 Iterator removeIterator = dtos.iterator(); while (removeIterator.hasNext()) { EmailZipFileDTO dto = removeIterator.next(); String filename = dto.getFilename(); // 删除复核函或基金合同 if (filename.contains("复核函") || (filename.contains("基金合同") && !filename.contains("合同变更"))) { log.warn("邮件{} 中的报告{} 是复核函或基金合同,不用解析上传。", emailTitle, filename); removeIterator.remove(); } // 不支持的类型 Integer type = dto.getEmailType(); if (!EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(type)) { log.info("邮件{} 类型{} 不支持解析。", emailTitle, type); removeIterator.remove(); } } // 数据库已存在的数据过滤(邮件主题+报告名称+附件大小,压缩包文件大小汇总) long totalSize = dtos.stream().map(EmailZipFileDTO::getFileSize).reduce(0L, Long::sum); Iterator iterator = dtos.iterator(); while (iterator.hasNext()) { EmailZipFileDTO dto = iterator.next(); String filename = dto.getFilename(); Integer type = dto.getEmailType(); int count = 0; if (Objects.equals(type, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) { // 确认单 count = this.emailFileInfoMapper.getLetterFilenameSuccessCount(emailTitle, filename); } else if (Objects.equals(type, EmailTypeConst.REPORT_EMAIL_TYPE)) { // 定期报告 count = this.emailFileInfoMapper.getAmacFilenameSuccessCount(emailTitle, filename, totalSize); } else if (Objects.equals(type, EmailTypeConst.REPORT_WEEKLY_TYPE)) { // 管理人周报 count = this.emailFileInfoMapper.getWeeklyFilenameSuccessCount(emailTitle, filename, totalSize); } else if (Objects.equals(type, EmailTypeConst.REPORT_OTHER_TYPE)) { // 其他报告 count = this.emailFileInfoMapper.getOtherFilenameSuccessCount(emailTitle, filename, totalSize); } if (count > 0) { iterator.remove(); log.info("邮件{} 报告{} 已存在解析成功的记录,不用重新解析。", emailTitle, filename); } } if (CollUtil.isEmpty(dtos)) { log.info("邮件{} 所有文件都已经解析成功过,不能重复解析了", emailTitle); return null; } if (log.isInfoEnabled()) { log.info("邮件{} 还有报告待解析:\n{}", emailTitle, dtos); } return totalSize; } /** * 邮件信息保存+附件解析 * * @param emailId 邮件ID,上传解析时一定是-1 * @param emailKey 邮件uuid(邮箱下载解析时) * @param emailAddress 接收人地址(邮箱下载解析时) * @param totalSize 所有附件大小汇总 * @param emailInfo 邮件信息,包含附件 * @param resultList 解析结果 * @return 邮件数据ID */ private Integer parseResults(Integer emailId, String emailKey, String emailAddress, long totalSize, EmailInfoDTO emailInfo, List> resultList) { String emailTitle = emailInfo.getEmailTitle(); List dtos = emailInfo.getEmailFileList(); if (emailId == null) { // 保存邮件信息 Integer emailType = dtos.get(0).getEmailType(); EmailParseInfoDO emailParseInfoDO = this.buildEmailParseInfo(emailAddress, emailType, emailInfo, totalSize); emailParseInfoDO.setEmailKey(emailKey); emailId = this.saveEmailParseInfo(emailParseInfoDO); } // 解析邮件报告 for (EmailZipFileDTO zipFile : dtos) { EmailFileInfoDO emailFile = this.saveEmailFileInfo(emailId, zipFile.getFilename(), zipFile.getFilepath()); // 解析并保存报告 ParseResult parseResult = this.parseReportAndHandleResult(emailTitle, emailFile.getId(), zipFile); if (!Objects.equals(1, parseResult.getStatus())) { log.error(parseResult.getMsg()); } if (parseResult.getData() == null) { parseResult.setData(new ReportData.DefaultReportData()); } parseResult.getData().setReportPath(zipFile.getFilepath()); resultList.add(parseResult); } return emailId; } /** * 解析报告并保存解析结果 * * @param emailTitle 邮件主题 * @param fileId 当前文件数据库ID * @param zipFile 当前报告的路径信息 * @return / */ private ParseResult parseReportAndHandleResult(String emailTitle, Integer fileId, EmailZipFileDTO zipFile) { Integer emailType = zipFile.getEmailType(); String fileName = zipFile.getFilename(); String filepath = zipFile.getFilepath(); ParseResult result = new ParseResult<>(); boolean reportFlag = emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType); if (reportFlag || StrUtil.isBlank(fileName) || fileName.endsWith(Constants.FILE_HTML)) { return new ParseResult<>(ReportParseStatus.NOT_A_REPORT, null, fileName); } // 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报 ReportType reportType = ReportParseUtils.matchReportType(emailType, fileName); if (reportType == null) { reportType = ReportParseUtils.matchReportType(emailType, emailTitle); if (log.isDebugEnabled()) { log.debug("报告{} 根据邮件主题{} 重新识别的类型是:{}", fileName, emailTitle, reportType); } } // 解析器--根据文件后缀获取对应解析器,解析不了就用AI来解析 ReportParserFileType fileType = ReportParserFileType.getBySuffix(zipFile.getExtName()); // 不支持的格式 if (fileType == null) { return new ParseResult<>(ReportParseStatus.NO_SUPPORT_TEMPLATE, null, fileName); } // 不是定期报告的判断逻辑放在不支持的格式下面 if (reportType == null) { return new ParseResult<>(ReportParseStatus.NOT_A_REPORT, null, fileName); } // docx转pdf if (Objects.equals(ReportParserFileType.WORD, fileType)) { try { String outputFile = FileUtil.getParent(filepath, 1) + File.separator + FileUtil.mainName(fileName) + ".pdf"; PdfUtil.convertDocxToPdf(filepath, outputFile); filepath = outputFile; } catch (Exception e) { log.warn("报告{} 转换为pdf失败:{}", fileName, ExceptionUtil.stacktraceToString(e)); } } // 首页和尾页转为png图片,首页用来识别基金名称和基金代码、尾页用来识别印章和联系人 List images = ListUtil.list(true); if (Objects.equals(ReportParserFileType.PDF, fileType)) { try { String output = filepath.replaceAll("archive|original", "image"); File outputFile = FileUtil.file(FileUtil.getParent(output, 1)); images = PdfUtil.convertFirstAndLastPagesToPng(filepath, outputFile, 300); if (log.isDebugEnabled()) { log.debug("报告{} 生成的图片地址是:\n{}", fileName, images); } } catch (Exception e) { log.warn("报告{} 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e)); } } else if (Objects.equals(ReportParserFileType.IMG, fileType)) { try { String outputFile = PdfUtil.compressAndSave(filepath); images.add(outputFile); } catch (IOException e) { log.error("报告{} 图片压缩失败,{}", fileName, ExceptionUtil.stacktraceToString(e)); } } // ocr识别月报是否管理人版或协会版 ReportMonthlyType monthlyType = ReportMonthlyType.NO_NEED; if (ReportType.MONTHLY == reportType) { monthlyType = this.determineReportType(emailTitle, fileName, filepath, images); } boolean isAmac = reportType == ReportType.ANNUALLY || reportType == ReportType.QUARTERLY || (reportType == ReportType.MONTHLY && ReportMonthlyType.AMAC == monthlyType); // 不支持解析的格式文件 boolean notSupportFile = false; // 解析报告 ReportData reportData = null; ReportParserParams params = new ReportParserParams(fileId, fileName, filepath, reportType); long start = System.currentTimeMillis(); try { if (isAmac || reportType == ReportType.LETTER) { ReportParser instance = this.reportParserFactory.getInstance(reportType, fileType); reportData = instance.parse(params); result = new ParseResult<>(1, "报告解析成功", reportData); } } catch (ReportParseException e) { result = new ParseResult<>(e.getCode(), StrUtil.format(e.getMsg(), fileName), null); log.warn("解析失败:{}", result.getMsg()); if (e instanceof NotSupportReportException) { notSupportFile = true; } } catch (Exception e) { log.warn("解析错误:{}", ExceptionUtil.stacktraceToString(e)); result = new ParseResult<>(ReportParseStatus.PARSE_FAIL, null, e.getMessage()); } finally { // 如果解析结果是空的就用AI工具解析一次 if (reportData == null && !notSupportFile) { if (log.isInfoEnabled()) { log.info("报告{} 是周报或管理人月报或其他类型或解析失败,用AI解析器解析", fileName); } try { if (!isAmac && CollUtil.isNotEmpty(images)) { filepath = images.get(0); } params = new ReportParserParams(fileId, fileName, filepath, reportType); ReportParser instance = this.reportParserFactory.getInstance(reportType, ReportParserFileType.AI); reportData = instance.parse(params); result = new ParseResult<>(1, "报告解析成功--AI", reportData); } catch (ReportParseException e) { result = new ParseResult<>(e.getCode(), StrUtil.format(e.getMsg(), fileName), null); log.warn("AI解析失败:{}", result.getMsg()); } catch (Exception e) { log.warn("AI解析错误:{}", ExceptionUtil.stacktraceToString(e)); result = new ParseResult<>(ReportParseStatus.PARSE_FAIL, null, e.getMessage()); } } if (reportData != null && reportData.getBaseInfo() != null) { // 设置月报类型 reportData.getBaseInfo().setMonthlyType(monthlyType.getType()); // 当报告日期还是空时设置为今天的前一天 if (reportData.getBaseInfo().getReportDate() == null) { Date date = DateUtil.offsetDay(new Date(), -1); reportData.getBaseInfo().setReportDate(date); } } // ocr信息提取(印章、联系人、基金名称和产品代码) this.ocrReportData(reportType, reportData, fileName, images); if (log.isInfoEnabled()) { log.info("报告{} 解析耗时{}ms,结果是:\n{}", fileName, (System.currentTimeMillis() - start), reportData); } } // 保存报告解析结果 this.saveReportData(reportData, reportType, fileName); return result; } /** * 判断月报类型(管理人版还是协会版) * * @param emailTitle 邮件主题 * @param fileName 报告名称 * @param filepath 报告路径 * @param images 报告的第一页和尾页图片地址(主要用于ocr提取关键信息) */ public ReportMonthlyType determineReportType(String emailTitle, String fileName, String filepath, List images) { // 1. 优先根据文件名判断 if (ReportParseUtils.containsAny(fileName, AMAC_KEYWORDS)) { return ReportMonthlyType.AMAC; } if (ReportParseUtils.containsAny(fileName, ReportParseUtils.MANAGER_KEYWORDS)) { return ReportMonthlyType.MANAGER; } // if (StrUtil.isNotBlank(ReportParseUtils.matchFundCode(fileName))) { // return ReportMonthlyType.AMAC; // } // 2. 根据文件路径判断 List pathSegments = StrUtil.split(filepath, File.separator); for (String segment : pathSegments) { boolean isExcluded = ReportParseUtils.containsAny(segment, EXCLUDE_PATH_KEYWORDS); if (!isExcluded && ReportParseUtils.containsAny(segment, AMAC_KEYWORDS)) { return ReportMonthlyType.AMAC; } if (!isExcluded && ReportParseUtils.containsAny(segment, ReportParseUtils.MANAGER_KEYWORDS)) { return ReportMonthlyType.MANAGER; } } // 3. 根据邮件主题判断 boolean isAmacEmail = ReportParseUtils.containsAny(emailTitle, AMAC_KEYWORDS) && !emailTitle.contains("公司及协会版"); if (isAmacEmail) { return ReportMonthlyType.AMAC; } if (ReportParseUtils.containsAny(emailTitle, ReportParseUtils.MANAGER_KEYWORDS)) { return ReportMonthlyType.MANAGER; } // 4.ocr 提取“曲线”、“基金份额”等关键字,如果有曲线则是管理人,如果有估值日期则是协会 if (CollUtil.isNotEmpty(images)) { try { return new OCRReportParser().parseMonthlyType(fileName, this.ocrParserUrl, images.get(0)); } catch (Exception ignored) { return ReportMonthlyType.FAILED; } } return ReportMonthlyType.FAILED; } /** * ocr 提取信息(包括首页的基金名称或报告日期,尾页的印章或联系人等信息) * * @param reportData 报告解析结果 * @param fileName 报告名称 * @param images 报告的收益和尾页png图片 */ private void ocrReportData(ReportType reportType, ReportData reportData, String fileName, List images) { if (reportData == null || CollUtil.isEmpty(images)) { return; } if (log.isInfoEnabled()) { log.info("报告{} 用ocr补充解析结果。补充前的结果是:\n{}", fileName, reportData); } // 报告才识别尾页的印章和联系人,确认单不识别尾页 if (ReportType.LETTER != reportType) { OCRParseData parseRes = null; try { // 首页和尾页相等时只读首页 String imageUrl = images.size() == 1 ? images.get(0) : images.get(1); parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, imageUrl); } catch (Exception e) { log.error("报告{} OCR识别印章和联系人出错:{}", fileName, e.getMessage()); } // ocr识别尾页是否包含印章和联系人信息 if (parseRes != null) { if (reportData.getBaseInfo() != null) { reportData.getBaseInfo().setWithSeals(parseRes.getWithSeals()); reportData.getBaseInfo().setWithContacts(parseRes.getWithContacts()); if (fileName.contains("用印") && !Objects.equals(true, reportData.getBaseInfo().getWithSeals())) { reportData.getBaseInfo().setWithSeals(true); } } } // 首页和尾页不相等时解析首页的数据 if (images.size() != 1) { try { parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0)); } catch (Exception e) { log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, e.getMessage()); } } // 用首页识别基金名称、产品代码和基金管理人 if (reportData.getFundInfo() != null && parseRes != null) { if (StrUtil.isBlank(reportData.getFundInfo().getFundName())) { reportData.getFundInfo().setFundName(parseRes.getFundName()); } if (StrUtil.isBlank(reportData.getFundInfo().getFundCode())) { reportData.getFundInfo().setFundCode(parseRes.getFundCode()); } if (StrUtil.isBlank(reportData.getFundInfo().getCompanyName()) || !reportData.getFundInfo().getCompanyName().contains("有限公司")) { reportData.getFundInfo().setCompanyName(parseRes.getCompanyName()); } } } else { // 确认单AI解析失败时重新用OCR识别 LetterReportData letterReportData = (LetterReportData) reportData; if (letterReportData.wasFailed()) { OCRLetterParseData parseRes = null; try { parseRes = new OCRReportParser().parseLetterData(fileName, this.ocrParserUrl, images.get(0)); } catch (Exception e) { log.error("报告{} OCR提取确认单关键信息出错:{}", fileName, e.getMessage()); } if (parseRes == null) { return; } if (letterReportData.getFundInfo() != null) { letterReportData.getFundInfo().setFundName(parseRes.getFundName()); letterReportData.getFundInfo().setFundCode(parseRes.getFundCode()); } if (letterReportData.getInvestorInfo() == null) { letterReportData.setInvestorInfo(new ReportInvestorInfoDTO()); } letterReportData.getInvestorInfo().setInvestorName(parseRes.getInvestorName()); letterReportData.getInvestorInfo().setCertificateNumber(parseRes.getCertificateNumber()); letterReportData.getInvestorInfo().setTradingAccount(parseRes.getTradingAccount()); letterReportData.getInvestorInfo().setFundAccount(parseRes.getFundAccount()); letterReportData.getInvestorInfo().setCertificateType(parseRes.getCertificateType()); if (letterReportData.getFundTransaction() == null) { letterReportData.setFundTransaction(new ReportFundTransactionDTO()); } letterReportData.getFundTransaction().setTransactionType(parseRes.getTransactionType()); letterReportData.getFundTransaction().setApplyDate(parseRes.getApplyDate()); letterReportData.getFundTransaction().setApplyShare(parseRes.getApplyShare()); letterReportData.getFundTransaction().setApplyAmount(parseRes.getApplyAmount()); letterReportData.getFundTransaction().setHoldingDate(parseRes.getHoldingDate()); letterReportData.getFundTransaction().setAmount(parseRes.getAmount()); letterReportData.getFundTransaction().setShare(parseRes.getShare()); letterReportData.getFundTransaction().setNav(parseRes.getNav()); } } } /** * 保存报告解析结果 * * @param reportData 报告解析结果 * @param reportType 报告类型 * @param fileName 报告名称 */ private void saveReportData(ReportData reportData, ReportType reportType, String fileName) { if (reportData == null) { return; } StopWatch writeWatch = new StopWatch(); writeWatch.start(); try { ReportWriter instance = this.reportWriterFactory.getInstance(reportType); instance.write(reportData); } catch (Exception e) { log.error("报告{} 结果保存失败 {}", fileName, ExceptionUtil.stacktraceToString(e)); } finally { writeWatch.stop(); if (log.isInfoEnabled()) { log.info("报告{}解析结果保存完成,耗时{}ms", fileName, writeWatch.getTotalTimeMillis()); } } } private EmailFileInfoDO saveEmailFileInfo(Integer emailId, String fileName, String filePath) { EmailFileInfoDO emailFileInfoDO = buildEmailFileInfoDO(emailId, fileName, filePath); emailFileInfoDO.setAiFileId(null); if (emailFileInfoDO.getId() != null) { emailFileInfoMapper.updateTimeById(null, new Date()); return emailFileInfoDO; } emailFileInfoMapper.insert(emailFileInfoDO); return emailFileInfoDO; } private EmailFileInfoDO buildEmailFileInfoDO(Integer emailId, String fileName, String filePath) { EmailFileInfoDO emailFileInfoDO = new EmailFileInfoDO(); emailFileInfoDO.setId(null); emailFileInfoDO.setEmailId(emailId); emailFileInfoDO.setFileName(fileName); emailFileInfoDO.setFilePath(filePath); emailFileInfoDO.setIsvalid(1); emailFileInfoDO.setCreatorId(0); emailFileInfoDO.setCreateTime(new Date()); emailFileInfoDO.setUpdaterId(0); emailFileInfoDO.setUpdateTime(new Date()); return emailFileInfoDO; } private Integer saveEmailParseInfo(EmailParseInfoDO emailParseInfoDO) { if (emailParseInfoDO == null) { return null; } // 重新邮件功能 -> 修改解析时间和更新时间 if (emailParseInfoDO.getId() != null) { emailParseInfoMapper.updateParseTime(emailParseInfoDO.getId(), emailParseInfoDO.getParseDate()); return emailParseInfoDO.getId(); } emailParseInfoMapper.insert(emailParseInfoDO); return emailParseInfoDO.getId(); } private EmailParseInfoDO buildEmailParseInfo(String emailAddress, Integer emailType, EmailInfoDTO emailInfo, long totalSize) { EmailParseInfoDO emailParseInfoDO = new EmailParseInfoDO(); emailParseInfoDO.setId(null); emailParseInfoDO.setSenderEmail(emailInfo.getSenderEmail()); emailParseInfoDO.setEmail(emailAddress); emailParseInfoDO.setEmailDate(DateUtil.parse(emailInfo.getEmailDate(), DateConst.YYYY_MM_DD_HH_MM_SS)); emailParseInfoDO.setParseDate(new Date()); emailParseInfoDO.setEmailTitle(emailInfo.getEmailTitle()); emailParseInfoDO.setEmailType(emailType); emailParseInfoDO.setParseStatus(EmailParseStatusConst.SUCCESS); emailParseInfoDO.setAttrSize(totalSize); emailParseInfoDO.setIsvalid(1); emailParseInfoDO.setCreatorId(0); emailParseInfoDO.setCreateTime(new Date()); emailParseInfoDO.setUpdaterId(0); emailParseInfoDO.setUpdateTime(new Date()); return emailParseInfoDO; } /** * 读取邮件 * * @param mailboxInfoDTO 邮箱配置信息 * @param startDate 邮件起始日期 * @param endDate 邮件截止日期(为null,将解析邮件日期小于等于startDate的当天邮件) * @return 读取到的邮件信息 * @throws Exception 异常信息 */ private Map> realEmail(MailboxInfoDTO mailboxInfoDTO, Date startDate, Date endDate, List folderNames) throws Exception { if (CollUtil.isEmpty(folderNames)) { folderNames = ListUtil.toList("INBOX"); } Store store = EmailUtil.getStoreNew(mailboxInfoDTO); if (store == null) { return MapUtil.newHashMap(4); } Map> result = MapUtil.newHashMap(128); try { if (log.isDebugEnabled()) { Folder[] list = store.getDefaultFolder().list("*"); List names = Arrays.stream(list).map(Folder::getFullName).toList(); log.debug("获取所有邮箱文件夹:{}", names); } for (String folderName : folderNames) { try { Map> temp = this.getFolderEmail(mailboxInfoDTO, startDate, endDate, store, folderName); if (MapUtil.isNotEmpty(temp)) { result.putAll(temp); } } catch (Exception e) { log.warn("文件夹{} 邮件获取失败:{}", folderName, ExceptionUtil.stacktraceToString(e)); } } } catch (Exception e) { log.error("邮件获取失败:{}", ExceptionUtil.stacktraceToString(e)); } finally { store.close(); } return result; } private Map> getFolderEmail(MailboxInfoDTO mailboxInfoDTO, Date startDate, Date endDate, Store store, String folderName) throws MessagingException { // 默认读取收件箱的邮件 Folder folder = store.getFolder(folderName); folder.open(this.readWriteSeen ? Folder.READ_WRITE : Folder.READ_ONLY); Message[] messages = getEmailMessage(folder, mailboxInfoDTO.getProtocol(), startDate); if (messages == null || messages.length == 0) { log.warn("{} 获取不到邮件 -> 邮箱信息:{},开始时间:{},结束时间:{}", folderName, mailboxInfoDTO, startDate, endDate); return MapUtil.newHashMap(); } String emailAddress = mailboxInfoDTO.getAccount(); Map> emailMessageMap = MapUtil.newHashMap(); for (Message message : messages) { long start = System.currentTimeMillis(); List dtos = CollUtil.newArrayList(); String emailTitle = message.getSubject(); if (this.readWriteSeen && isMessageRead(message)) { log.warn("{} 邮件{} 已读,不用重新下载解析!", folderName, emailTitle); continue; } try { Date emailDate = message.getSentDate(); String emailDateStr = DateUtil.format(emailDate, DateConst.YYYY_MM_DD_HH_MM_SS); if (log.isInfoEnabled()) { log.info("{} 邮件{} 数据获取中,邮件时间:{}", folderName, emailTitle, emailDateStr); } boolean isNotParseConditionSatisfied = emailDate == null || (endDate != null && emailDate.compareTo(endDate) > 0) || (startDate != null && emailDate.compareTo(startDate) < 0); if (isNotParseConditionSatisfied) { String st = DateUtil.formatDateTime(startDate); String ed = DateUtil.formatDateTime(endDate); log.warn("{} 邮件{} 发送时间{}不在区间内【{} ~ {}】", folderName, emailTitle, emailDateStr, st, ed); continue; } String senderEmail = getSenderEmail(message); Integer emailType = EmailUtil.getEmailTypeBySubject(emailTitle); if (emailType == null) { log.warn("{} 邮件不满足解析条件 -> 邮件主题:{},邮件日期:{}", folderName, emailTitle, emailDateStr); continue; } // // 成功解析的邮件不用重复下载 // Integer okNum = this.emailParseInfoMapper.countEmailByInfoAndStatus(emailTitle, senderEmail, emailAddress, emailDateStr); // if (okNum > 0) { // if (log.isInfoEnabled()) { // log.info("{} 邮件{} 已经存在解析完成的记录,不要重复下载了。", folderName, emailTitle); // } // continue; // } if (log.isInfoEnabled()) { log.info("{} 邮件{} 基本信息获取完成,开始下载附件!邮件日期:{}", folderName, emailTitle, emailDateStr); } Object content = message.getContent(); if (content instanceof Multipart multipart) { this.reMultipart(emailAddress, emailTitle, emailDate, multipart, dtos); } else if (content instanceof Part part) { this.rePart(emailAddress, emailTitle, emailDate, part, dtos); } else { log.warn("{} 邮件{} 获取不了附件", folderName, emailTitle); } if (CollUtil.isEmpty(dtos)) { log.warn("{} 邮件{} 没有获取到附件", folderName, emailTitle); continue; } dtos.forEach(e -> { e.setEmailType(emailType); e.setSenderEmail(senderEmail); }); emailMessageMap.put(IdUtil.simpleUUID(), dtos); } catch (Exception e) { log.error("{} 邮件{} 下载报错 {}", folderName, emailTitle, ExceptionUtil.stacktraceToString(e)); } finally { if (CollUtil.isNotEmpty(dtos) && log.isInfoEnabled()) { log.info("{} 邮件{} 下载完成,总计耗时{} ms,文件内容如下\n {}", folderName, emailTitle, System.currentTimeMillis() - start, dtos); } } } if (this.readWriteSeen) { // 设置已读标志 folder.setFlags(messages, new Flags(Flags.Flag.SEEN), true); } folder.close(false); return emailMessageMap; } private void rePart(String account, String subject, Date sendDate, Part part, List emailContentInfoDTOList) throws Exception { String fileName = EmailUtil.decodeFileName(part); if (StrUtil.isBlank(fileName)) { return; } if (fileName.contains("\"") || fileName.contains("\n")) { fileName = fileName.replaceAll("\"", "").replaceAll("\n", ""); } if (fileName.contains("=?")) { fileName = MimeUtility.decodeText(fileName); } String disposition = part.getDisposition(); String contentType = part.getContentType(); String[] att_files = new String[]{Constants.ARCHIVE_7Z, Constants.ARCHIVE_RAR, Constants.ARCHIVE_ZIP, Constants.FILE_PDF, Constants.FILE_DOCX, Constants.FILE_JPG, Constants.FILE_PNG}; boolean attachmentFlag = StrUtil.endWithAny(fileName, att_files); boolean isAttachment = attachmentFlag || Part.ATTACHMENT.equalsIgnoreCase(disposition) || (contentType != null && attachmentMimePrefixes.stream().anyMatch(prefix -> StrUtil.startWithIgnoreCase(contentType, prefix) )); if (!isAttachment) { log.warn("邮件{} 未检测到{}类型的附件 (fileName={}, disposition={}, contentType={})", subject, att_files, fileName, disposition, contentType); return; } File saveFile = this.generateSavePath(account, sendDate, fileName); if (!saveFile.exists()) { if (!saveFile.getParentFile().exists()) { boolean mkdirs = saveFile.getParentFile().mkdirs(); if (!mkdirs) { log.warn("file path mkdir failed."); } } try (InputStream is = part.getInputStream()) { Files.copy(is, saveFile.toPath()); } } else { if (log.isInfoEnabled()) { log.info("邮件{} 已下载过附件:{},不用重新下载了。", subject, saveFile.toPath()); } } EmailContentInfoDTO emailContentInfoDTO = new EmailContentInfoDTO(); emailContentInfoDTO.setFileName(fileName); emailContentInfoDTO.setFileSize(part.getSize()); emailContentInfoDTO.setFilePath(saveFile.getAbsolutePath()); emailContentInfoDTO.setEmailAddress(account); emailContentInfoDTO.setEmailTitle(subject); emailContentInfoDTO.setEmailDate(DateUtil.format(sendDate, DateConst.YYYY_MM_DD_HH_MM_SS)); emailContentInfoDTOList.add(emailContentInfoDTO); } public File generateSavePath(String account, Date sendDate, String fileName) { String emailDateStr = DateUtil.format(sendDate, DateConst.YYYYMMDD); String filePath = this.path + File.separator + account + File.separator + emailDateStr + File.separator + "original" + File.separator; // 压缩包重名时的后面的压缩包会覆盖前面压缩包的问题(不考虑普通文件) String emailDate = DateUtil.format(sendDate, DateConst.YYYYMMDDHHMMSS24); String realName = ArchiveUtil.isArchive(fileName) ? emailDate + fileName : fileName; return FileUtil.file(filePath + realName); } private void reMultipart(String account, String subject, Date emailDate, Multipart multipart, List emailContentInfoDTOList) throws Exception { for (int i = 0; i < multipart.getCount(); i++) { Part bodyPart = multipart.getBodyPart(i); Object content = bodyPart.getContent(); if (content instanceof String) { if (log.isDebugEnabled()) { log.debug("邮件{} 获取的正文不做解析,内容是 {}", subject, content); } continue; } if (content instanceof Multipart mp) { this.reMultipart(account, subject, emailDate, mp, emailContentInfoDTOList); } else { this.rePart(account, subject, emailDate, bodyPart, emailContentInfoDTOList); } } } private String getSenderEmail(Message message) { Address[] senderAddress; try { senderAddress = message.getFrom(); if (senderAddress == null || senderAddress.length == 0) { return null; } // 此时的address是含有编码(MIME编码方式)后的文本和实际的邮件地址 String address = ""; for (Address from : senderAddress) { if (StrUtil.isNotBlank(from.toString())) { address = from.toString(); break; } } // 正则表达式匹配邮件地址 Pattern pattern = Pattern.compile("<(\\S+)>"); Matcher matcher = pattern.matcher(address); if (matcher.find()) { return matcher.group(1); } } catch (MessagingException e) { log.error(e.getMessage(), e); } return null; } private Message[] getEmailMessage(Folder folder, String protocol, Date startDate) { try { if (protocol.contains("imap")) { // 获取邮件日期大于等于startDate的邮件(搜索条件只支持按天) SearchTerm startDateTerm = new ReceivedDateTerm(ComparisonTerm.GE, startDate); return folder.search(startDateTerm); } else { return folder.getMessages(); } } catch (MessagingException e) { throw new RuntimeException(e); } } /** * 检查邮件是否已读 * * @param message 邮件对象 * @return true表示已读,false表示未读 * @throws MessagingException 如果访问邮件标志时出错 */ private boolean isMessageRead(Message message) throws MessagingException { // 获取邮件的所有标志 Flags flags = message.getFlags(); // 检查是否包含 SEEN 标志 return flags.contains(Flags.Flag.SEEN); } }