package com.smppw.modaq.domain.service;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.collection.ListUtil;
import cn.hutool.core.date.DateUtil;
import cn.hutool.core.exceptions.ExceptionUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.map.MapUtil;
import cn.hutool.core.util.StrUtil;
import com.smppw.modaq.application.components.OCRReportParser;
import com.smppw.modaq.application.components.ReportParseUtils;
import com.smppw.modaq.application.components.report.parser.ReportParser;
import com.smppw.modaq.application.components.report.parser.ReportParserFactory;
import com.smppw.modaq.application.components.report.writer.ReportWriter;
import com.smppw.modaq.application.components.report.writer.ReportWriterFactory;
import com.smppw.modaq.application.util.EmailUtil;
import com.smppw.modaq.common.conts.DateConst;
import com.smppw.modaq.common.conts.EmailParseStatusConst;
import com.smppw.modaq.common.conts.EmailTypeConst;
import com.smppw.modaq.common.enums.ReportParseStatus;
import com.smppw.modaq.common.enums.ReportParserFileType;
import com.smppw.modaq.common.enums.ReportType;
import com.smppw.modaq.common.exception.NotSupportReportException;
import com.smppw.modaq.common.exception.ReportParseException;
import com.smppw.modaq.domain.dto.EmailContentInfoDTO;
import com.smppw.modaq.domain.dto.EmailZipFileDTO;
import com.smppw.modaq.domain.dto.MailboxInfoDTO;
import com.smppw.modaq.domain.dto.report.OCRParseData;
import com.smppw.modaq.domain.dto.report.ParseResult;
import com.smppw.modaq.domain.dto.report.ReportData;
import com.smppw.modaq.domain.dto.report.ReportParserParams;
import com.smppw.modaq.domain.entity.EmailFileInfoDO;
import com.smppw.modaq.domain.entity.EmailParseInfoDO;
import com.smppw.modaq.domain.mapper.EmailFileInfoMapper;
import com.smppw.modaq.domain.mapper.EmailParseInfoMapper;
import com.smppw.modaq.infrastructure.util.ArchiveUtil;
import com.smppw.modaq.infrastructure.util.DateUtils;
import com.smppw.modaq.infrastructure.util.PdfUtil;
import jakarta.mail.*;
import jakarta.mail.internet.MimeUtility;
import jakarta.mail.search.ComparisonTerm;
import jakarta.mail.search.ReceivedDateTerm;
import jakarta.mail.search.SearchTerm;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.util.StopWatch;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Email parsing service: downloads the attachments of the emails in a mailbox, unpacks archives,
 * runs the report parsers (with an AI fallback and OCR enrichment) on every file and records the
 * outcome through the email / file mappers.
 *
 * @author mozuwen
 * @date 2024-09-04
 * @description Email parsing service
 */
@Service
public class EmailParseService {
    // public static final int stepSize = 10000;
    private static final Logger log = LoggerFactory.getLogger(EmailParseService.class);

    // Content-type prefixes that are treated as attachments even without an "attachment" disposition
    private static final Set<String> attachmentMimePrefixes = Set.of(
            "application/pdf",
            "application/zip",
            "application/x-zip-compressed",
            "application/rar",
            "application/x-rar-compressed"
            // add further types here when needed...
    );

    // Extracts the address between angle brackets from a rendered "Name <address>" sender string
    private static final Pattern SENDER_ADDRESS_PATTERN = Pattern.compile("<(\\S+)>");

    // private final EmailFieldMappingMapper emailFieldMapper;
    private final EmailParseInfoMapper emailParseInfoMapper;
    private final EmailFileInfoMapper emailFileInfoMapper;
    /* report parsing and persistence */
    private final ReportParserFactory reportParserFactory;
    private final ReportWriterFactory reportWriterFactory;

    // Root directory under which downloaded attachments are stored
    @Value("${email.file.path}")
    private String path;

    // URL handed to OCRReportParser for every OCR request
    @Value("${email.report.ocr-parser-url}")
    private String ocrParserUrl;

    public EmailParseService(EmailParseInfoMapper emailParseInfoMapper,
                             EmailFileInfoMapper emailFileInfoMapper,
                             ReportParserFactory reportParserFactory,
                             ReportWriterFactory reportWriterFactory) {
        this.emailParseInfoMapper = emailParseInfoMapper;
        this.emailFileInfoMapper = emailFileInfoMapper;
        this.reportParserFactory = reportParserFactory;
        this.reportWriterFactory = reportWriterFactory;
    }

    /**
     * Parses the emails of one mailbox that fall into the given date range.
     *
     * @param mailboxInfoDTO mailbox configuration
     * @param startDate      lower bound of the email date (inclusive); {@code null} means no lower bound
     * @param endDate        upper bound of the email date (inclusive); {@code null} means no upper bound
     * @param folderNames    mailbox folders to read; the reader falls back to "INBOX" when empty
     * @param emailTypes     email types handled by the current task; defaults to
     *                       {@code EmailTypeConst.REPORT_LETTER_EMAIL_TYPE} when empty
     */
    public void parseEmail(MailboxInfoDTO mailboxInfoDTO, Date startDate, Date endDate,
                           List<String> folderNames, List<Integer> emailTypes) {
        if (CollUtil.isEmpty(emailTypes)) {
            emailTypes = ListUtil.of(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE);
        }
        if (log.isInfoEnabled()) {
            log.info("开始邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO,
                    DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS),
                    DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
        }
        Map<String, List<EmailContentInfoDTO>> emailContentMap;
        try {
            emailContentMap = this.realEmail(mailboxInfoDTO, startDate, endDate, folderNames);
        } catch (Exception e) {
            log.error("采集邮件失败 -> 邮箱配置信息:{},堆栈信息:{}", mailboxInfoDTO, ExceptionUtil.stacktraceToString(e));
            return;
        }
        if (MapUtil.isEmpty(emailContentMap)) {
            log.warn("未采集到邮件 -> 邮箱配置信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO,
                    DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS),
                    DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
            return;
        }
        for (Map.Entry<String, List<EmailContentInfoDTO>> emailEntry : emailContentMap.entrySet()) {
            List<EmailContentInfoDTO> emailContentInfoDTOList = emailEntry.getValue();
            if (CollUtil.isEmpty(emailContentInfoDTOList)) {
                log.warn("未采集到正文或附件");
                continue;
            }
            log.info("开始解析邮件数据 -> 邮件主题:{},邮件日期:{}",
                    emailContentInfoDTOList.get(0).getEmailTitle(), emailContentInfoDTOList.get(0).getEmailDate());

            // Attachment -> files extracted from it (empty when the attachment is not an archive)
            Map<EmailContentInfoDTO, List<EmailZipFileDTO>> emailZipFileMap = MapUtil.newHashMap();
            for (EmailContentInfoDTO emailContentInfoDTO : emailContentInfoDTOList) {
                // The HTML body is never an archive, so nothing has to be extracted
                if (emailContentInfoDTO.getFileName() != null && emailContentInfoDTO.getFileName().endsWith(".html")) {
                    emailZipFileMap.put(emailContentInfoDTO, ListUtil.empty());
                    continue;
                }
                try {
                    List<EmailZipFileDTO> fundNavDTOList = this.parseZipEmail(emailContentInfoDTO);
                    emailZipFileMap.put(emailContentInfoDTO, fundNavDTOList);
                } catch (IOException e) {
                    // Extraction failed: record the email as failed and move on to the next attachment
                    log.error("压缩包解压失败:{}", ExceptionUtil.stacktraceToString(e));
                    EmailParseInfoDO fail = buildEmailParseInfo(null, mailboxInfoDTO.getAccount(), emailContentInfoDTO);
                    fail.setFailReason("压缩包解压失败");
                    fail.setParseStatus(EmailParseStatusConst.FAIL);
                    fail.setEmailKey(emailEntry.getKey());
                    this.emailParseInfoMapper.insert(fail);
                } catch (Exception e) {
                    log.error("堆栈信息:{}", ExceptionUtil.stacktraceToString(e));
                }
            }

            // Drop every attachment none of whose types (own type + extracted file types) is handled by this task
            Iterator<Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>>> entryIterator =
                    emailZipFileMap.entrySet().iterator();
            while (entryIterator.hasNext()) {
                Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>> entry = entryIterator.next();
                EmailContentInfoDTO key = entry.getKey();
                List<EmailZipFileDTO> dtos = entry.getValue();

                List<Integer> types = ListUtil.list(false);
                types.add(key.getEmailType());
                if (CollUtil.isNotEmpty(dtos)) {
                    List<Integer> list = dtos.stream().map(EmailZipFileDTO::getEmailType).distinct().toList();
                    CollUtil.addAllIfNotContains(types, list);
                }
                boolean flag = false;
                for (Integer type : types) {
                    if (emailTypes.contains(type)) {
                        flag = true;
                        break;
                    }
                }
                if (!flag) {
                    log.warn("当前邮件{} 的类型{} 不在支持的任务类型{} 中,不用执行解析逻辑。", key, types, emailTypes);
                    entryIterator.remove();
                }
            }

            // Persist the email record and file records, then parse and store every report
            saveRelatedTable(emailEntry.getKey(), mailboxInfoDTO.getAccount(), emailZipFileMap);
            log.info("结束邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", emailEntry.getValue(),
                    DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS),
                    DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
        }
    }

    /**
     * Extracts a zip/rar attachment and returns one DTO per extracted file. Attachments that are
     * neither zip nor rar yield an empty list. When the attachment's email type is missing or not in
     * {@code EmailTypeConst.SUPPORT_EMAIL_TYPES}, the type is re-derived from the attachment file name,
     * written back to the attachment and applied to every extracted file.
     *
     * @param emailContentInfoDTO downloaded attachment
     * @return extracted files, possibly empty
     * @throws IOException when the extraction directory cannot be created or extraction fails
     */
    public List<EmailZipFileDTO> parseZipEmail(EmailContentInfoDTO emailContentInfoDTO) throws IOException {
        List<EmailZipFileDTO> resultList = ListUtil.list(false);
        Integer emailType = emailContentInfoDTO.getEmailType();
        String filepath = emailContentInfoDTO.getFilePath();
        String emailTitle = emailContentInfoDTO.getEmailTitle();
        int fileSize = emailContentInfoDTO.getFileSize();

        if (ArchiveUtil.isZip(filepath)) {
            handleCompressedFiles(emailTitle, filepath, ".zip", emailType, fileSize, resultList);
        } else if (ArchiveUtil.isRAR(filepath)) {
            handleCompressedFiles(emailTitle, filepath, ".rar", emailType, fileSize, resultList);
        }

        // Re-derive the type from the attachment name when the current one is unusable
        if (emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) {
            emailType = EmailUtil.getEmailTypeBySubject(emailContentInfoDTO.getFileName());
            emailContentInfoDTO.setEmailType(emailType);
        }

        if (CollUtil.isNotEmpty(resultList)) {
            for (EmailZipFileDTO dto : resultList) {
                dto.setEmailType(emailType);
            }
            if (log.isInfoEnabled()) {
                log.info("当前邮件{} 所有解压缩文件解压完成:{}", emailTitle, resultList);
            }
        }
        return resultList;
    }

    /**
     * Extracts the archive next to itself (directory named after the archive without its extension)
     * and appends one DTO per extracted file to {@code resultList}. An extracted directory contributes
     * its direct children.
     *
     * @throws IOException when the destination directory cannot be created or extraction fails
     */
    private void handleCompressedFiles(String emailTitle, String filepath, String extension, Integer emailType,
                                       int fileSize, List<EmailZipFileDTO> resultList) throws IOException {
        String destPath = getDestinationPath(filepath, extension);
        File destFile = new File(destPath);
        if (!destFile.exists() && !destFile.mkdirs()) {
            throw new IOException("无法创建目标目录: " + destPath);
        }

        List<String> extractedDirs;
        if (ArchiveUtil.isZip(filepath)) {
            extractedDirs = ArchiveUtil.extractCompressedFiles(filepath, destPath);
        } else if (ArchiveUtil.isRAR(filepath)) {
            extractedDirs = ArchiveUtil.extractRar5(filepath, destPath);
        } else {
            return;
        }
        if (CollUtil.isEmpty(extractedDirs)) {
            return;
        }

        for (String dir : extractedDirs) {
            // Fall back to the extracted file name when the email type does not qualify for parsing
            if (emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) {
                emailType = EmailUtil.getEmailTypeBySubject(dir);
            }
            File file = new File(dir);
            if (!file.isDirectory()) {
                resultList.add(new EmailZipFileDTO(emailTitle, dir, fileSize, emailType));
                continue;
            }
            String[] children = file.list();
            if (children == null) {
                log.warn("目录 {} 下无文件", dir);
                continue;
            }
            for (String child : children) {
                // File.list() yields bare names; resolve them against the directory so that the DTO
                // carries a usable path, like the non-directory branch above
                resultList.add(new EmailZipFileDTO(emailTitle, new File(file, child).getPath(), fileSize, emailType));
            }
        }
    }

    /**
     * Builds the extraction directory: the archive's parent directory plus the archive file name with
     * the last {@code extension.length()} characters removed.
     */
    private String getDestinationPath(String filepath, String extension) {
        Path archive = Paths.get(filepath);
        String fileName = archive.getFileName().toString();
        String baseName = fileName.substring(0, fileName.length() - extension.length());
        return archive.getParent().resolve(baseName).toString();
    }

    /**
     * For every attachment: filters out files that must not be parsed (cover letters / fund contracts,
     * unsupported types, files that already have a successful record), stores the email and file
     * records, parses each remaining file and finally writes the resulting status back to the email
     * record.
     *
     * @param emailKey        key identifying the email the attachments belong to
     * @param emailAddress    mailbox account
     * @param emailZipFileMap attachment -> files extracted from it (empty for non-archives)
     */
    public void saveRelatedTable(String emailKey, String emailAddress,
                                 Map<EmailContentInfoDTO, List<EmailZipFileDTO>> emailZipFileMap) {
        // Parse results. NOTE(review): this list is shared by all attachments handled in this call, so
        // the status written for a later attachment also reflects failures of earlier ones — confirm
        // that this is intended.
        List<ParseResult<ReportData>> dataList = ListUtil.list(false);
        for (Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>> entry : emailZipFileMap.entrySet()) {
            EmailContentInfoDTO emailDto = entry.getKey();
            if (emailDto.getFileName() != null && emailDto.getFileName().endsWith(".html")) {
                continue;
            }
            String emailTitle = emailDto.getEmailTitle();

            // Files to parse: the extracted files, or the attachment itself when it was not an archive
            List<EmailZipFileDTO> dtos = ListUtil.list(false);
            List<EmailZipFileDTO> zipFiles = entry.getValue();
            if (CollUtil.isEmpty(zipFiles)) {
                dtos.add(new EmailZipFileDTO(emailTitle, emailDto));
            } else {
                dtos.addAll(zipFiles);
            }

            // Re-derive the type from "title + file name" for files whose type is not a specific one
            for (EmailZipFileDTO dto : dtos) {
                if (!EmailTypeConst.SUPPORT_NO_OTHER_TYPES.contains(dto.getEmailType())) {
                    Integer emailType = EmailUtil.getEmailTypeBySubject(emailTitle + dto.getFilename());
                    dto.setEmailType(emailType);
                }
            }

            // Filter out files that must not be parsed again
            Iterator<EmailZipFileDTO> iterator = dtos.iterator();
            while (iterator.hasNext()) {
                EmailZipFileDTO dto = iterator.next();
                String filename = dto.getFilename();
                // Cover letters and fund contracts are never parsed
                if (filename.contains("复核函") || filename.contains("基金合同")) {
                    log.warn("邮件{} 中的报告{} 是复核函或基金合同,不用解析上传。", emailTitle, filename);
                    iterator.remove();
                    // Already removed: a second remove() below would throw IllegalStateException
                    continue;
                }

                Integer emailType = dto.getEmailType();
                int fileSize = dto.getFileSize();
                int count;
                if (Objects.equals(emailType, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) {
                    // confirmation letter
                    count = this.emailFileInfoMapper.getLetterFilenameSuccessCount(emailTitle, filename);
                } else if (Objects.equals(emailType, EmailTypeConst.REPORT_EMAIL_TYPE)) {
                    // periodic report
                    count = this.emailFileInfoMapper.getAmacFilenameSuccessCount(emailTitle, filename, fileSize);
                } else if (Objects.equals(emailType, EmailTypeConst.REPORT_WEEKLY_TYPE)) {
                    // manager weekly report
                    count = this.emailFileInfoMapper.getWeeklyFilenameSuccessCount(emailTitle, filename, fileSize);
                } else if (Objects.equals(emailType, EmailTypeConst.REPORT_OTHER_TYPE)) {
                    // other report
                    count = this.emailFileInfoMapper.getOtherFilenameSuccessCount(emailTitle, filename, fileSize);
                } else {
                    log.info("邮件{} 类型{} 不支持解析。", emailTitle, emailType);
                    iterator.remove();
                    continue;
                }
                if (count > 0) {
                    iterator.remove();
                    log.info("邮件{} 附件{} 已存在解析成功的记录,不用重新解析。", emailTitle, filename);
                }
            }
            if (CollUtil.isEmpty(dtos)) {
                log.info("邮件{} 所有文件都已经解析成功过,不能重复解析了", emailTitle);
                continue;
            }

            Integer emailId = emailDto.getEmailId();
            EmailParseInfoDO emailParseInfoDO = this.buildEmailParseInfo(emailId, emailAddress, emailDto);
            emailParseInfoDO.setEmailKey(emailKey);
            emailId = this.saveEmailParseInfo(emailParseInfoDO);
            if (emailId == null) {
                continue;
            }

            for (EmailZipFileDTO zipFile : dtos) {
                EmailFileInfoDO emailFile = this.saveEmailFileInfo(emailId, zipFile.getFilename(), zipFile.getFilepath());
                // Parse the report and store its data
                ParseResult<ReportData> parseResult = this.parseReportAndHandleResult(emailTitle, emailFile, zipFile);
                dataList.add(parseResult);
            }

            String failReason = null;
            int emailParseStatus = EmailParseStatusConst.SUCCESS;
            // A single failed report marks the whole email as failed
            if (CollUtil.isNotEmpty(dataList)) {
                // Results produced by the AI parser
                List<ReportData> aiParaseList = dataList.stream()
                        .map(ParseResult::getData)
                        .filter(Objects::nonNull)
                        .filter(e -> Objects.equals(true, e.getAiParse()))
                        .toList();
                for (ReportData data : aiParaseList) {
                    this.emailFileInfoMapper.updateAiParseByFileId(
                            data.getBaseInfo().getFileId(), data.getAiParse(), data.getAiFileId());
                }
                long failNum = dataList.stream()
                        .filter(e -> !Objects.equals(EmailParseStatusConst.SUCCESS, e.getStatus()))
                        .count();
                if (failNum > 0) {
                    emailParseStatus = EmailParseStatusConst.FAIL;
                    failReason = dataList.stream().map(ParseResult::getMsg).collect(Collectors.joining(";"));
                }
            }
            emailParseInfoMapper.updateParseStatus(emailId, emailParseStatus, failReason);
        }
    }

    /**
     * Parses one report file: determines the report type and the file type, runs the matching parser,
     * falls back to the AI parser when that yields nothing, enriches the result through OCR and stores
     * the parsed data.
     *
     * @param emailTitle    subject of the email, used as a fallback for report-type detection
     * @param emailFileInfo stored file record; its id is passed on to the parsers
     * @param zipFile       file to parse
     * @return status, message and (when parsing succeeded) the parsed data
     */
    private ParseResult<ReportData> parseReportAndHandleResult(String emailTitle, EmailFileInfoDO emailFileInfo,
                                                               EmailZipFileDTO zipFile) {
        Integer emailType = zipFile.getEmailType();
        String fileName = zipFile.getFilename();
        String filepath = zipFile.getFilepath();
        ParseResult<ReportData> result = new ParseResult<>();

        boolean reportFlag = emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType);
        if (reportFlag || StrUtil.isBlank(fileName) || fileName.endsWith(".html")) {
            return reject(result, ReportParseStatus.NOT_A_REPORT, fileName);
        }

        // Report type: detected from the file name first, from the email subject as a fallback
        ReportType reportType = ReportParseUtils.matchReportType(emailType, fileName);
        if (reportType == null) {
            reportType = ReportParseUtils.matchReportType(emailType, emailTitle);
            if (log.isDebugEnabled()) {
                log.debug("报告{} 根据邮件主题{} 重新识别的类型是:{}", fileName, emailTitle, reportType);
            }
        }

        // Parser selection is driven by the file suffix; the AI parser is the fallback
        String fileSuffix = StrUtil.subAfter(fileName, ".", true);
        ReportParserFileType fileType = ReportParserFileType.getBySuffix(fileSuffix);
        // Unsupported file format
        if (fileType == null) {
            return reject(result, ReportParseStatus.NO_SUPPORT_TEMPLATE, fileName);
        }
        // The "not a report" check deliberately comes after the unsupported-format check
        if (reportType == null) {
            return reject(result, ReportParseStatus.NOT_A_REPORT, fileName);
        }

        Integer fileId = emailFileInfo.getId();
        // First and last page as PNG: the first page is used for fund name / code, the last page for
        // seals and contacts
        List<String> images = ListUtil.list(true);
        if (Objects.equals(ReportParserFileType.PDF, fileType)) {
            try {
                String output = FileUtil.getParent(filepath, 1) + File.separator + "image";
                images = PdfUtil.convertFirstAndLastPagesToPng(filepath, FileUtil.file(output), 300);
                if (log.isDebugEnabled()) {
                    log.debug("报告[{}] 生成的图片地址是:{}", fileName, images);
                }
            } catch (Exception e) {
                log.warn("报告[{}] 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
            }
        } else if (Objects.equals(ReportParserFileType.IMG, fileType)) {
            images.add(filepath);
        }

        // Set when the regular parser reports that the file cannot be parsed at all
        boolean notSupportFile = false;
        ReportData reportData = null;
        ReportParserParams params = new ReportParserParams(fileId, fileName, filepath, reportType);
        StopWatch parserWatch = new StopWatch();
        parserWatch.start();
        try {
            if (reportType != ReportType.OTHER && reportType != ReportType.WEEKLY) {
                var instance = this.reportParserFactory.getInstance(reportType, fileType);
                reportData = instance.parse(params);
                result.setStatus(1);
                result.setMsg("报告解析成功");
                result.setData(reportData);
            } else {
                if (log.isInfoEnabled()) {
                    log.info("报告{} 是周报或其他类型,直接用AI解析器解析", fileName);
                }
            }
        } catch (ReportParseException e) {
            log.error("解析失败:{}", StrUtil.format(e.getMsg(), fileName));
            result.setStatus(e.getCode());
            result.setMsg(StrUtil.format(e.getMsg(), fileName));
            if (e instanceof NotSupportReportException) {
                notSupportFile = true;
            }
        } catch (Exception e) {
            log.error("解析错误:{}", ExceptionUtil.stacktraceToString(e));
            result.setStatus(ReportParseStatus.PARSE_FAIL.getCode());
            result.setMsg(StrUtil.format(ReportParseStatus.PARSE_FAIL.getMsg(), e.getMessage()));
        } finally {
            // No result from the regular parser: give the AI parser one attempt
            if (reportData == null && !notSupportFile) {
                if (reportType == ReportType.QUARTERLY || reportType == ReportType.ANNUALLY) {
                    if (log.isInfoEnabled()) {
                        log.info("报告{} 开始AI解析......", fileName);
                    }
                } else if (CollUtil.isNotEmpty(images)) {
                    // Other report types are sent to the AI parser as their first-page image
                    filepath = images.get(0);
                    if (log.isInfoEnabled()) {
                        log.info("报告{} 用首页图片{} 开始AI解析......", fileName, filepath);
                    }
                }
                try {
                    params = new ReportParserParams(fileId, fileName, filepath, reportType);
                    var instance = this.reportParserFactory.getInstance(reportType, ReportParserFileType.AI);
                    reportData = instance.parse(params);
                    result.setStatus(1);
                    result.setMsg("报告解析成功--AI");
                    result.setData(reportData);
                } catch (ReportParseException e) {
                    log.error("AI解析失败:{}", StrUtil.format(e.getMsg(), fileName));
                    result.setStatus(e.getCode());
                    result.setMsg(StrUtil.format(e.getMsg(), fileName));
                } catch (Exception e) {
                    log.error("AI解析错误:{}", ExceptionUtil.stacktraceToString(e));
                    result.setStatus(ReportParseStatus.PARSE_FAIL.getCode());
                    result.setMsg(StrUtil.format(ReportParseStatus.PARSE_FAIL.getMsg(), e.getMessage()));
                }
                if (log.isInfoEnabled()) {
                    log.info("报告{} AI解析结束!", fileName);
                }
            }
            // OCR enrichment
            this.ocrReportData(reportType, reportData, fileName, images);
            parserWatch.stop();
            if (log.isInfoEnabled()) {
                log.info("报告{}解析结果为{},耗时{}ms", fileName, reportData, parserWatch.getTotalTimeMillis());
            }
        }
        // Store the parsed data
        this.saveReportData(reportData, reportType, fileName);
        return result;
    }

    /**
     * Fills {@code result} with the given rejection status (message formatted with the file name),
     * logs the message as an error and returns {@code result}.
     */
    private static ParseResult<ReportData> reject(ParseResult<ReportData> result, ReportParseStatus status,
                                                  String fileName) {
        result.setStatus(status.getCode());
        result.setMsg(StrUtil.format(status.getMsg(), fileName));
        log.error(result.getMsg());
        return result;
    }

    /**
     * Enriches the parsed data through OCR: seals / contacts from the last page (not for confirmation
     * letters) and, when the report date, fund name or fund code is missing, fund name and fund code
     * from the first page.
     *
     * @param reportType report type
     * @param reportData parsed data; nothing happens when {@code null}
     * @param fileName   report file name
     * @param images     PNG images of the first page and, when it differs, the last page
     */
    private void ocrReportData(ReportType reportType, ReportData reportData, String fileName, List<String> images) {
        if (reportData == null || CollUtil.isEmpty(images)) {
            return;
        }
        boolean singleImage = images.size() == 1;

        OCRParseData tailRes = null;
        // Only reports get their last page checked for seals and contacts; confirmation letters do not
        if (ReportType.LETTER != reportType) {
            try {
                // With a single image the first page is also the last page
                String imageUrl = singleImage ? images.get(0) : images.get(1);
                tailRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, imageUrl);
            } catch (Exception e) {
                log.error("报告{} OCR识别印章和联系人出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
            }
            if (tailRes != null && reportData.getBaseInfo() != null) {
                reportData.getBaseInfo().setWithSeals(tailRes.getWithSeals());
                reportData.getBaseInfo().setWithContacts(tailRes.getWithContacts());
                // A file name containing "用印" marks the report as sealed even if OCR did not find a seal
                if (fileName.contains("用印") && !Objects.equals(true, reportData.getBaseInfo().getWithSeals())) {
                    reportData.getBaseInfo().setWithSeals(true);
                }
            }
        }

        boolean needFirstPage =
                (reportData.getBaseInfo() != null && reportData.getBaseInfo().getReportDate() == null)
                        || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundName()))
                        || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundCode()));
        if (!needFirstPage) {
            return;
        }

        // Reuse the result above only when it really came from the first page; never apply a
        // last-page result as first-page data
        OCRParseData firstPageRes = singleImage ? tailRes : null;
        if (firstPageRes == null) {
            try {
                firstPageRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0));
            } catch (Exception e) {
                log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
            }
        }
        if (reportData.getFundInfo() == null || firstPageRes == null) {
            return;
        }
        // The OCR value wins over the parsed value, but a blank OCR value must not erase parsed data
        if (StrUtil.isNotBlank(firstPageRes.getFundName())) {
            reportData.getFundInfo().setFundName(firstPageRes.getFundName());
        }
        if (StrUtil.isNotBlank(firstPageRes.getFundCode())) {
            reportData.getFundInfo().setFundCode(firstPageRes.getFundCode());
        }
    }

    /**
     * Stores the parsed report data through the writer registered for the report type. Failures are
     * logged and not propagated.
     *
     * @param reportData parsed data; nothing happens when {@code null}
     * @param reportType report type
     * @param fileName   report file name
     */
    private void saveReportData(ReportData reportData, ReportType reportType, String fileName) {
        if (reportData == null) {
            return;
        }
        StopWatch writeWatch = new StopWatch();
        writeWatch.start();
        try {
            var instance = this.reportWriterFactory.getInstance(reportType);
            instance.write(reportData);
        } catch (Exception e) {
            log.error("报告{}结果保存失败\n{}", fileName, ExceptionUtil.stacktraceToString(e));
        } finally {
            writeWatch.stop();
            if (log.isInfoEnabled()) {
                log.info("报告{}解析结果保存完成,耗时{}ms", fileName, writeWatch.getTotalTimeMillis());
            }
        }
    }

    /**
     * Stores the file record of one report and returns it (with its id populated by the mapper on
     * insert — presumably via generated keys; verify against the mapper configuration).
     */
    private EmailFileInfoDO saveEmailFileInfo(Integer emailId, String fileName, String filePath) {
        EmailFileInfoDO emailFileInfoDO = buildEmailFileInfoDO(emailId, fileName, filePath);
        emailFileInfoDO.setAiFileId(null);
        if (emailFileInfoDO.getId() != null) {
            // Existing record: only refresh its update time (must target that record's id)
            emailFileInfoMapper.updateTimeById(emailFileInfoDO.getId(), new Date());
            return emailFileInfoDO;
        }
        emailFileInfoMapper.insert(emailFileInfoDO);
        return emailFileInfoDO;
    }

    /** Builds a new, not yet persisted file record (id {@code null}) for the given email. */
    private EmailFileInfoDO buildEmailFileInfoDO(Integer emailId, String fileName, String filePath) {
        Date now = new Date();
        EmailFileInfoDO emailFileInfoDO = new EmailFileInfoDO();
        emailFileInfoDO.setId(null);
        emailFileInfoDO.setEmailId(emailId);
        emailFileInfoDO.setFileName(fileName);
        emailFileInfoDO.setFilePath(filePath);
        emailFileInfoDO.setIsvalid(1);
        emailFileInfoDO.setCreatorId(0);
        emailFileInfoDO.setCreateTime(now);
        emailFileInfoDO.setUpdaterId(0);
        emailFileInfoDO.setUpdateTime(new Date());
        return emailFileInfoDO;
    }

    /**
     * Stores the email record: updates the parse time of an existing record (re-parse) or inserts a
     * new one.
     *
     * @return id of the record, or {@code null} when no record was given
     */
    private Integer saveEmailParseInfo(EmailParseInfoDO emailParseInfoDO) {
        if (emailParseInfoDO == null) {
            return null;
        }
        // Re-parse of a known email -> only the parse time is updated
        if (emailParseInfoDO.getId() != null) {
            emailParseInfoMapper.updateParseTime(emailParseInfoDO.getId(), emailParseInfoDO.getParseDate());
            return emailParseInfoDO.getId();
        }
        emailParseInfoMapper.insert(emailParseInfoDO);
        return emailParseInfoDO.getId();
    }

    /** Builds the email record for one attachment; the status starts as SUCCESS. */
    private EmailParseInfoDO buildEmailParseInfo(Integer emailId, String emailAddress,
                                                 EmailContentInfoDTO emailContentInfoDTO) {
        EmailParseInfoDO emailParseInfoDO = new EmailParseInfoDO();
        emailParseInfoDO.setId(emailId);
        emailParseInfoDO.setSenderEmail(emailContentInfoDTO.getSenderEmail());
        emailParseInfoDO.setEmail(emailAddress);
        emailParseInfoDO.setEmailDate(DateUtil.parse(emailContentInfoDTO.getEmailDate(), DateConst.YYYY_MM_DD_HH_MM_SS));
        emailParseInfoDO.setParseDate(emailContentInfoDTO.getParseDate() == null
                ? null : DateUtil.parseDate(emailContentInfoDTO.getParseDate()));
        emailParseInfoDO.setEmailTitle(emailContentInfoDTO.getEmailTitle());
        emailParseInfoDO.setEmailType(emailContentInfoDTO.getEmailType());
        emailParseInfoDO.setParseStatus(EmailParseStatusConst.SUCCESS);
        emailParseInfoDO.setAttrSize(emailContentInfoDTO.getFileSize());
        emailParseInfoDO.setIsvalid(1);
        emailParseInfoDO.setCreatorId(0);
        emailParseInfoDO.setCreateTime(new Date());
        emailParseInfoDO.setUpdaterId(0);
        emailParseInfoDO.setUpdateTime(new Date());
        return emailParseInfoDO;
    }

    /**
     * Reads the emails of the given folders.
     *
     * @param mailboxInfoDTO mailbox configuration
     * @param startDate      lower bound of the email date
     * @param endDate        upper bound of the email date; {@code null} means no upper bound
     * @param folderNames    folders to read; "INBOX" when empty
     * @return generated email key -> downloaded attachments of that email
     * @throws Exception when the mail store cannot be obtained
     */
    private Map<String, List<EmailContentInfoDTO>> realEmail(MailboxInfoDTO mailboxInfoDTO, Date startDate,
                                                             Date endDate, List<String> folderNames) throws Exception {
        if (CollUtil.isEmpty(folderNames)) {
            folderNames = ListUtil.toList("INBOX");
        }
        Store store = EmailUtil.getStoreNew(mailboxInfoDTO);
        if (store == null) {
            return MapUtil.newHashMap(4);
        }
        Map<String, List<EmailContentInfoDTO>> result = MapUtil.newHashMap(128);
        try {
            if (log.isInfoEnabled()) {
                Folder[] list = store.getDefaultFolder().list("*");
                List<String> names = Arrays.stream(list).map(Folder::getFullName).toList();
                log.info("获取所有邮箱文件夹:{}", names);
            }
            for (String folderName : folderNames) {
                try {
                    Map<String, List<EmailContentInfoDTO>> temp =
                            this.getFolderEmail(mailboxInfoDTO, startDate, endDate, store, folderName);
                    if (MapUtil.isNotEmpty(temp)) {
                        result.putAll(temp);
                    }
                } catch (Exception e) {
                    // One unreadable folder must not prevent the others from being read
                    log.warn("文件夹{} 邮件获取失败:{}", folderName, ExceptionUtil.stacktraceToString(e));
                }
            }
        } catch (Exception e) {
            log.error("邮件获取失败:{}", ExceptionUtil.stacktraceToString(e));
        } finally {
            try {
                store.close();
            } catch (MessagingException e) {
                // A failing close must not discard the emails that were already downloaded
                log.warn("邮箱连接关闭失败:{}", ExceptionUtil.stacktraceToString(e));
            }
        }
        return result;
    }

    /**
     * Reads one folder: keeps the messages whose sent date lies within [startDate, endDate] and whose
     * subject maps to an email type, and downloads their attachments. A message that cannot be read
     * is logged and skipped. The folder is always closed before returning.
     *
     * @return generated email key -> downloaded attachments of that email
     * @throws MessagingException when the folder cannot be opened
     */
    private Map<String, List<EmailContentInfoDTO>> getFolderEmail(MailboxInfoDTO mailboxInfoDTO, Date startDate,
                                                                  Date endDate, Store store, String folderName)
            throws MessagingException {
        Folder folder = store.getFolder(folderName);
        folder.open(Folder.READ_ONLY);
        try {
            Message[] messages = getEmailMessage(folder, mailboxInfoDTO.getProtocol(), startDate);
            if (messages == null || messages.length == 0) {
                log.warn("{} 获取不到邮件 -> 邮箱信息:{},开始时间:{},结束时间:{}", folderName, mailboxInfoDTO, startDate, endDate);
                return MapUtil.newHashMap();
            }
            Map<String, List<EmailContentInfoDTO>> emailMessageMap = MapUtil.newHashMap();
            for (Message message : messages) {
                long start = System.currentTimeMillis();
                List<EmailContentInfoDTO> emailContentInfoDTOList = CollUtil.newArrayList();
                String uuidKey = UUID.randomUUID().toString().replaceAll("-", "");
                String emailTitle = null;
                try {
                    emailTitle = message.getSubject();
                    Date emailDate = message.getSentDate();
                    String emailDateStr = DateUtil.format(emailDate, DateConst.YYYY_MM_DD_HH_MM_SS);
                    if (log.isInfoEnabled()) {
                        log.info("{} 邮件{} 数据获取中,邮件时间:{}", folderName, emailTitle, emailDateStr);
                    }
                    boolean isNotParseConditionSatisfied = emailDate == null
                            || (endDate != null && emailDate.compareTo(endDate) > 0)
                            || (startDate != null && emailDate.compareTo(startDate) < 0);
                    if (isNotParseConditionSatisfied) {
                        String st = DateUtil.formatDateTime(startDate);
                        String ed = DateUtil.formatDateTime(endDate);
                        log.warn("{} 邮件[{}]日期{}不在区间内【{} ~ {}】", folderName, emailTitle, emailDateStr, st, ed);
                        continue;
                    }
                    final String senderEmail = getSenderEmail(message);
                    final Integer emailType = EmailUtil.getEmailTypeBySubject(emailTitle);
                    if (emailType == null) {
                        log.warn("{} 邮件不满足解析条件 -> 邮件主题:{},邮件日期:{}", folderName, emailTitle, emailDateStr);
                        continue;
                    }
                    if (log.isInfoEnabled()) {
                        log.info("{} 邮件{} 基本信息获取完成,开始下载附件!邮件日期:{}", folderName, emailTitle, emailDateStr);
                    }
                    Object content = message.getContent();
                    if (content instanceof Multipart multipart) {
                        this.reMultipart(mailboxInfoDTO.getAccount(), emailTitle, emailDate, multipart, emailContentInfoDTOList);
                    } else if (content instanceof Part part) {
                        this.rePart(mailboxInfoDTO.getAccount(), emailTitle, emailDate, part, emailContentInfoDTOList);
                    } else {
                        log.warn("{} 不支持的邮件数据 {}", folderName, emailTitle);
                    }
                    if (CollUtil.isNotEmpty(emailContentInfoDTOList)) {
                        emailContentInfoDTOList.forEach(e -> {
                            e.setEmailType(emailType);
                            e.setSenderEmail(senderEmail);
                        });
                        emailMessageMap.put(uuidKey, emailContentInfoDTOList);
                    }
                    if (log.isInfoEnabled()) {
                        log.info("{} 邮件{} 下载完成,总计耗时{} ms,文件内容如下\n {}", folderName, emailTitle,
                                System.currentTimeMillis() - start, emailContentInfoDTOList);
                    }
                } catch (Exception e) {
                    log.error("{} 获取邮箱的邮件{} 报错,堆栈信息:{}", folderName, emailTitle, ExceptionUtil.stacktraceToString(e));
                }
            }
            return emailMessageMap;
        } finally {
            closeFolderQuietly(folder, folderName);
        }
    }

    /** Closes the folder without expunging; a failure is logged and not propagated. */
    private void closeFolderQuietly(Folder folder, String folderName) {
        try {
            if (folder.isOpen()) {
                folder.close(false);
            }
        } catch (Exception e) {
            log.warn("文件夹{} 关闭失败:{}", folderName, ExceptionUtil.stacktraceToString(e));
        }
    }

    /**
     * Downloads one body part when it is an attachment (disposition "attachment" or a content type
     * starting with one of {@code attachmentMimePrefixes}) and appends its description to the list.
     * The file is stored under {@code <path>/<account>/<yyyyMMdd>/}; an existing file of the same name
     * is replaced. Zip/rar files get the send timestamp as a name prefix so that equally named
     * archives of different emails do not overwrite each other.
     *
     * @throws Exception when the part cannot be read or the file cannot be written
     */
    private void rePart(String account, String subject, Date sendDate, Part part,
                        List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
        String fileName = EmailUtil.decodeFileName(part);
        if (StrUtil.isBlank(fileName)) {
            log.warn("邮件{} 附件文件名是空的,不做下载!", subject);
            return;
        }
        if (fileName.contains("=?")) {
            fileName = MimeUtility.decodeText(fileName);
        }
        // The name is supplied by the sender: keep only its last path segment so that it can neither
        // leave the target directory nor address the directory itself
        String safeName = FileUtil.getName(fileName);
        if (StrUtil.isBlank(safeName) || ".".equals(safeName) || "..".equals(safeName)) {
            log.warn("邮件{} 附件文件名{} 不合法,不做下载!", subject, fileName);
            return;
        }
        fileName = safeName;

        String disposition = part.getDisposition();
        String contentType = part.getContentType();
        boolean isAttachment = Part.ATTACHMENT.equalsIgnoreCase(disposition)
                || (contentType != null && attachmentMimePrefixes.stream()
                .anyMatch(prefix -> StrUtil.startWithIgnoreCase(contentType, prefix)));
        if (!isAttachment) {
            log.warn("邮件 {} 未检测到pdf/zip/rar类型的附件 (fileName={}, disposition={}, contentType={})",
                    subject, fileName, disposition, contentType);
            return;
        }

        String emailDateStr = DateUtil.format(sendDate, DateConst.YYYYMMDD);
        String filePath = path + File.separator + account + File.separator + emailDateStr + File.separator;
        // Equally named archives would overwrite each other (plain files are not considered)
        String emailDate = DateUtil.format(sendDate, DateConst.YYYYMMDDHHMMSS24);
        String realName = (fileName.endsWith(".zip") || fileName.endsWith(".rar")) ? emailDate + fileName : fileName;

        File saveFile = FileUtil.file(filePath + realName);
        if (saveFile.exists()) {
            FileUtil.del(saveFile);
        } else if (!saveFile.getParentFile().exists() && !saveFile.getParentFile().mkdirs()) {
            log.warn("file path mkdir failed.");
        }
        try (InputStream is = part.getInputStream()) {
            Files.copy(is, saveFile.toPath());
        }

        EmailContentInfoDTO emailContentInfoDTO = new EmailContentInfoDTO();
        emailContentInfoDTO.setFileName(fileName);
        emailContentInfoDTO.setFileSize(part.getSize());
        emailContentInfoDTO.setFilePath(saveFile.getAbsolutePath());
        emailContentInfoDTO.setEmailAddress(account);
        emailContentInfoDTO.setEmailTitle(subject);
        emailContentInfoDTO.setEmailDate(DateUtil.format(sendDate, DateConst.YYYY_MM_DD_HH_MM_SS));
        emailContentInfoDTOList.add(emailContentInfoDTO);
    }

    /**
     * Walks a multipart recursively: textual parts are skipped, nested multiparts are descended into
     * and every other part is handed to {@link #rePart}.
     */
    private void reMultipart(String account, String subject, Date emailDate, Multipart multipart,
                             List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
        int partCount = multipart.getCount();
        for (int i = 0; i < partCount; i++) {
            Part bodyPart = multipart.getBodyPart(i);
            Object content = bodyPart.getContent();
            if (content instanceof String) {
                if (log.isDebugEnabled()) {
                    log.debug("邮件{} 获取的正文不做解析,内容是 {}", subject, content);
                }
                continue;
            }
            if (content instanceof Multipart mp) {
                this.reMultipart(account, subject, emailDate, mp, emailContentInfoDTOList);
            } else {
                this.rePart(account, subject, emailDate, bodyPart, emailContentInfoDTOList);
            }
        }
    }

    /**
     * Returns the sender address: the text between angle brackets of the first non-blank "From" entry.
     * NOTE(review): a sender rendered without angle brackets yields {@code null} — confirm that this
     * is acceptable for callers.
     *
     * @return the address, or {@code null} when there is no sender, no bracketed address, or reading
     * the header fails
     */
    private String getSenderEmail(Message message) {
        try {
            Address[] senderAddress = message.getFrom();
            if (senderAddress == null || senderAddress.length == 0) {
                return null;
            }
            // The rendered address holds the (MIME-encoded) display name plus the actual address
            String address = "";
            for (Address from : senderAddress) {
                if (StrUtil.isNotBlank(from.toString())) {
                    address = from.toString();
                    break;
                }
            }
            Matcher matcher = SENDER_ADDRESS_PATTERN.matcher(address);
            if (matcher.find()) {
                return matcher.group(1);
            }
        } catch (MessagingException e) {
            log.error(e.getMessage(), e);
        }
        return null;
    }

    /**
     * Fetches the candidate messages of a folder. For IMAP with a start date only the messages
     * received on or after that date are requested (the search term works with day precision);
     * otherwise all messages are returned and the caller filters by date.
     *
     * @throws RuntimeException wrapping the {@link MessagingException} raised by the folder
     */
    private Message[] getEmailMessage(Folder folder, String protocol, Date startDate) {
        try {
            if (protocol != null && protocol.contains("imap") && startDate != null) {
                SearchTerm startDateTerm = new ReceivedDateTerm(ComparisonTerm.GE, startDate);
                return folder.search(startDateTerm);
            }
            return folder.getMessages();
        } catch (MessagingException e) {
            throw new RuntimeException(e);
        }
    }
}