12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285 |
- package com.smppw.modaq.domain.service;
- import cn.hutool.core.collection.CollUtil;
- import cn.hutool.core.collection.ListUtil;
- import cn.hutool.core.date.DateUtil;
- import cn.hutool.core.exceptions.ExceptionUtil;
- import cn.hutool.core.io.FileUtil;
- import cn.hutool.core.map.MapUtil;
- import cn.hutool.core.util.IdUtil;
- import cn.hutool.core.util.StrUtil;
- import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
- import com.baomidou.mybatisplus.core.toolkit.Wrappers;
- import com.smppw.modaq.application.components.OCRReportParser;
- import com.smppw.modaq.application.components.ReportParseUtils;
- import com.smppw.modaq.application.components.report.parser.ReportParser;
- import com.smppw.modaq.application.components.report.parser.ReportParserFactory;
- import com.smppw.modaq.application.components.report.writer.ReportWriter;
- import com.smppw.modaq.application.components.report.writer.ReportWriterFactory;
- import com.smppw.modaq.application.util.EmailUtil;
- import com.smppw.modaq.common.conts.Constants;
- import com.smppw.modaq.common.conts.DateConst;
- import com.smppw.modaq.common.conts.EmailParseStatusConst;
- import com.smppw.modaq.common.conts.EmailTypeConst;
- import com.smppw.modaq.common.enums.ReportMonthlyType;
- import com.smppw.modaq.common.enums.ReportParseStatus;
- import com.smppw.modaq.common.enums.ReportParserFileType;
- import com.smppw.modaq.common.enums.ReportType;
- import com.smppw.modaq.common.exception.NotSupportReportException;
- import com.smppw.modaq.common.exception.ReportParseException;
- import com.smppw.modaq.domain.dto.*;
- import com.smppw.modaq.domain.dto.report.*;
- import com.smppw.modaq.domain.dto.report.ocr.OCRLetterParseData;
- import com.smppw.modaq.domain.dto.report.ocr.OCRParseData;
- import com.smppw.modaq.domain.entity.EmailFileInfoDO;
- import com.smppw.modaq.domain.entity.EmailParseInfoDO;
- import com.smppw.modaq.domain.entity.TgEmailConfigDO;
- import com.smppw.modaq.domain.mapper.EmailFileInfoMapper;
- import com.smppw.modaq.domain.mapper.EmailParseInfoMapper;
- import com.smppw.modaq.domain.mapper.TgEmailConfigMapper;
- import com.smppw.modaq.infrastructure.util.ArchiveUtil;
- import com.smppw.modaq.infrastructure.util.ConvertUtil;
- import com.smppw.modaq.infrastructure.util.PdfUtil;
- import jakarta.mail.*;
- import jakarta.mail.internet.MimeUtility;
- import jakarta.mail.search.ComparisonTerm;
- import jakarta.mail.search.ReceivedDateTerm;
- import jakarta.mail.search.SearchTerm;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- import org.springframework.beans.factory.annotation.Value;
- import org.springframework.http.MediaType;
- import org.springframework.stereotype.Service;
- import org.springframework.util.StopWatch;
- import java.io.File;
- import java.io.IOException;
- import java.io.InputStream;
- import java.nio.file.Files;
- import java.util.*;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import java.util.stream.Collectors;
- /**
- * @author mozuwen
- * @date 2024-09-04
- * @description 邮件解析服务
- */
- @Service
- public class EmailParseService {
- // public static final int stepSize = 10000;
- private static final Logger log = LoggerFactory.getLogger(EmailParseService.class);
- // Keyword constants, kept in one place. They are used when deciding the monthly-report edition.
- private static final Set<String> AMAC_KEYWORDS = Set.of("协会", "信披");
- private static final Set<String> EXCLUDE_PATH_KEYWORDS = Set.of("公司及协会版", "公司和协会版");
- // Attachment MIME types (extended list). NOTE(review): not referenced in the visible part of the file - confirm usage.
- private static final Set<String> attachmentMimePrefixes = Set.of(
- "application/pdf",
- "application/zip",
- "application/x-zip-compressed",
- "application/rar",
- "application/x-rar-compressed",
- "application/octet-stream"
- // add further types as needed...
- );
- private static final List<String> TG_EMAIL_LIST = ListUtil.list(false); // filled by init() from the TgEmailConfig table
- private final TgEmailConfigMapper tgEmailConfigMapper;
- private final EmailParseInfoMapper emailParseInfoMapper;
- private final EmailFileInfoMapper emailFileInfoMapper;
- /* Factories for report parsing and for persisting parsed reports */
- private final ReportParserFactory reportParserFactory;
- private final ReportWriterFactory reportWriterFactory;
- @Value("${email.file.path}")
- private String path;
- @Value("${email.report.ocr-parser-url}")
- private String ocrParserUrl; // passed to OCRReportParser as the OCR service address
- @Value("${email.read-write-seen:true}")
- private boolean readWriteSeen;
/**
 * Wires the mappers and report factories into the service and then loads the configured
 * sender addresses through {@link #init()}.
 *
 * @param tgEmailConfigMapper  mapper used by {@link #init()} to read the configured addresses
 * @param emailParseInfoMapper mapper for the mail parse-info records
 * @param emailFileInfoMapper  mapper for the mail file-info records
 * @param reportParserFactory  factory that supplies report parsers
 * @param reportWriterFactory  factory that supplies report writers
 */
public EmailParseService(TgEmailConfigMapper tgEmailConfigMapper,
                         EmailParseInfoMapper emailParseInfoMapper,
                         EmailFileInfoMapper emailFileInfoMapper,
                         ReportParserFactory reportParserFactory,
                         ReportWriterFactory reportWriterFactory) {
    // Report factories.
    this.reportWriterFactory = reportWriterFactory;
    this.reportParserFactory = reportParserFactory;

    // Persistence mappers.
    this.emailFileInfoMapper = emailFileInfoMapper;
    this.emailParseInfoMapper = emailParseInfoMapper;
    this.tgEmailConfigMapper = tgEmailConfigMapper;

    // Every dependency is assigned at this point, so the address list can be loaded.
    init();
}
- public void init() {
- LambdaQueryWrapper<TgEmailConfigDO> wrapper = Wrappers.lambdaQuery(TgEmailConfigDO.class);
- List<TgEmailConfigDO> dataList = this.tgEmailConfigMapper.selectList(wrapper);
- for (TgEmailConfigDO temp : dataList) {
- TG_EMAIL_LIST.add(temp.getEmail());
- }
- }
- /**
- * 解析指定邮箱指定时间范围内的邮件
- *
- * @param mailboxInfoDTO 邮箱配置信息
- * @param startDate 邮件起始日期(yyyy-MM-dd HH:mm:ss)
- * @param endDate 邮件截止日期(yyyy-MM-dd HH:mm:ss, 为null,将解析邮件日期小于等于startDate的当天邮件)
- * @param emailTypes 当前任务支持的邮件类型,默认支持确认单
- */
- public void parseEmail(MailboxInfoDTO mailboxInfoDTO,
- Date startDate, Date endDate,
- List<String> folderNames,
- List<Integer> emailTypes) {
- if (CollUtil.isEmpty(emailTypes)) {
- emailTypes = ListUtil.of(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE);
- }
- if (log.isInfoEnabled()) {
- log.info("开始邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO,
- DateUtil.formatDateTime(startDate), DateUtil.formatDateTime(endDate));
- }
- Map<String, List<EmailContentInfoDTO>> emailContentMap;
- try {
- emailContentMap = this.realEmail(mailboxInfoDTO, startDate, endDate, folderNames);
- } catch (Exception e) {
- log.error("采集邮件失败 -> 邮箱配置信息:{},堆栈信息:{}", mailboxInfoDTO, ExceptionUtil.stacktraceToString(e));
- return;
- }
- if (MapUtil.isEmpty(emailContentMap)) {
- log.warn("未采集到邮件 -> 邮箱配置信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO,
- DateUtil.formatDateTime(startDate), DateUtil.formatDateTime(endDate));
- return;
- }
- for (Map.Entry<String, List<EmailContentInfoDTO>> emailEntry : emailContentMap.entrySet()) {
- List<EmailContentInfoDTO> dtoList = emailEntry.getValue();
- if (CollUtil.isEmpty(dtoList)) {
- log.warn("未采集到正文或附件");
- continue;
- }
- EmailContentInfoDTO dto = dtoList.get(0);
- String emailTitle = dto.getEmailTitle();
- if (log.isInfoEnabled()) {
- log.info("开始解析邮件数据 -> 邮件主题:{},邮件日期:{}", emailTitle, dto.getEmailDate());
- }
- Long totalSize = dtoList.stream().map(EmailContentInfoDTO::getFileSize).reduce(0L, Long::sum);
- String errMsg = null;
- int status = 1;
- List<EmailZipFileDTO> emailFileList = ListUtil.list(false);
- EmailInfoDTO emailInfo = new EmailInfoDTO(dto, emailFileList);
- if (dto.getEmailContent() != null && dto.getEmailContent().contains("超大附件列表")) {
- status = 0;
- errMsg = "邮件中存在超大附件,需要手动处理该邮件";
- } else {
- for (EmailContentInfoDTO emailDto : dtoList) {
- // 正文不用解压附件
- if (emailDto.getFileName() != null && emailDto.getFileName().endsWith(Constants.FILE_HTML)) {
- continue;
- }
- try {
- emailFileList.addAll(this.parseZipEmail(emailDto));
- } catch (IOException e) {
- log.error("邮件{} 压缩包解压失败:{}", emailTitle, ExceptionUtil.stacktraceToString(e));
- EmailZipFileDTO zipFileDTO = new EmailZipFileDTO(emailTitle, emailDto);
- zipFileDTO.setParseStatus(0);
- zipFileDTO.setFailReason("压缩包解压失败");
- emailFileList.add(zipFileDTO);
- } catch (Exception e) {
- log.error("邮件{} 堆栈信息:{}", emailTitle, ExceptionUtil.stacktraceToString(e));
- }
- }
- // 重新判断类型
- this.recheckEmailType(emailTitle, emailFileList);
- Iterator<EmailZipFileDTO> entryIterator = emailFileList.iterator();
- while (entryIterator.hasNext()) {
- EmailZipFileDTO entry = entryIterator.next();
- if (!emailTypes.contains(entry.getEmailType())) {
- log.warn("当前邮件{} 中的报告{} 的类型{} 不在支持的任务类型{} 中,不用执行解析逻辑。",
- entry.getEmailTitle(), entry.getFilename(), entry.getEmailType(), emailTypes);
- entryIterator.remove();
- }
- ReportParserFileType fileType = ReportParserFileType.getBySuffix(entry.getExtName());
- if (fileType == null) {
- log.warn("当前邮件{} 中的文件{} 是不支持的文件格式{} 中,不用执行解析逻辑。",
- entry.getEmailTitle(), entry.getFilepath(), entry.getExtName());
- entryIterator.remove();
- }
- }
- }
- // 保存邮件信息
- EmailParseInfoDO emailDo = this.buildEmailParseInfo(mailboxInfoDTO.getAccount(), emailInfo, totalSize);
- emailDo.setEmailKey(emailEntry.getKey());
- emailDo.setParseStatus(status);
- emailDo.setFailReason(errMsg);
- Integer emailId = this.saveEmailParseInfo(emailDo);
- // 保存附件(解压后的)
- for (EmailZipFileDTO zipFile : emailFileList) {
- EmailFileInfoDO emailFile = this.saveEmailFileInfo(emailId, zipFile);
- if (!Objects.equals(1, zipFile.getParseStatus()) || StrUtil.isNotBlank(zipFile.getFailReason())) {
- emailFile.setParseStatus(zipFile.getParseStatus());
- emailFile.setFailReason(zipFile.getFailReason());
- }
- zipFile.setFileId(emailFile.getId());
- }
- if (CollUtil.isNotEmpty(emailFileList)) {
- // 保存相关信息 -> 邮件信息表,邮件文件表,邮件净值表,邮件规模表,基金净值表
- this.saveRelatedTable(emailId, emailInfo);
- log.info("结束邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", emailEntry.getValue(),
- DateUtil.formatDateTime(startDate), DateUtil.formatDateTime(endDate));
- }
- }
- }
- /**
- * 解压压缩包,如果不是压缩包需转换
- *
- * @param emailContentInfoDTO 邮件信息
- * @return 解压后的文件列表
- * @throws IOException /
- */
- public List<EmailZipFileDTO> parseZipEmail(EmailContentInfoDTO emailContentInfoDTO) throws IOException {
- List<EmailZipFileDTO> resultList = ListUtil.list(false);
- Integer emailType = emailContentInfoDTO.getEmailType();
- String filepath = emailContentInfoDTO.getFilePath();
- String emailTitle = emailContentInfoDTO.getEmailTitle();
- if (ArchiveUtil.isArchive(filepath)) {
- this.handleCompressedFiles(emailTitle, filepath, emailType, resultList);
- } else {
- // 不是压缩包时
- EmailZipFileDTO dto = new EmailZipFileDTO(emailTitle, emailContentInfoDTO);
- resultList.add(dto);
- }
- // 文件中的类型判断
- if (emailType == null || !EmailTypeConst.SUPPORT_NO_OTHER_TYPES.contains(emailType)) {
- emailType = EmailUtil.getEmailTypeBySubject(emailContentInfoDTO.getFileName());
- emailContentInfoDTO.setEmailType(emailType);
- }
- if (CollUtil.isNotEmpty(resultList)) {
- for (EmailZipFileDTO dto : resultList) {
- dto.setEmailType(emailType);
- }
- }
- return resultList;
- }
- /**
- * 解压压缩包并把压缩包里面的所有文件放在resultList中
- *
- * @param emailTitle 邮件主题
- * @param filepath 压缩包路径
- * @param emailType 邮件解析类型
- * @param resultList 解压结果列表
- * @throws IOException /
- */
- private void handleCompressedFiles(String emailTitle,
- String filepath,
- Integer emailType,
- List<EmailZipFileDTO> resultList) throws IOException {
- if (!ArchiveUtil.isArchive(filepath)) {
- return;
- }
- String output = filepath.replaceAll("original", "archive");
- String destPath = FileUtil.getParent(output, 1) + File.separator + FileUtil.mainName(output);
- File destFile = FileUtil.file(destPath);
- if (!destFile.exists()) {
- if (!destFile.mkdirs()) {
- throw new IOException("无法创建目标目录: " + destPath);
- }
- }
- List<String> extractedDirs;
- if (ArchiveUtil.isZip(filepath)) {
- extractedDirs = ArchiveUtil.extractCompressedFiles(filepath, destPath);
- } else if (ArchiveUtil.isRAR(filepath) || ArchiveUtil.is7z(filepath)) {
- // 7z和rar压缩包解压
- extractedDirs = ArchiveUtil.extractRar5(filepath, destPath);
- } else {
- return;
- }
- for (String dir : extractedDirs) {
- // 如果邮件类型不满足解析条件则重新根据文件名判断
- if (emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) {
- emailType = EmailUtil.getEmailTypeBySubject(dir);
- }
- File file = new File(dir);
- if (file.isDirectory()) {
- String[] subDirs = file.list();
- if (subDirs != null) {
- for (String subDir : subDirs) {
- resultList.add(new EmailZipFileDTO(emailTitle, subDir, emailType));
- }
- } else {
- log.warn("目录 {} 下无文件", dir);
- }
- } else {
- resultList.add(new EmailZipFileDTO(emailTitle, dir, emailType));
- }
- }
- }
- /**
- * 邮件附件解析并保存结果数据
- *
- * @param emailId 邮件数据ID
- * @param emailInfo 邮件信息
- */
- public void saveRelatedTable(Integer emailId, EmailInfoDTO emailInfo) {
- // 解析并保存数据
- List<ParseResult<ReportData>> dataList = ListUtil.list(true);
- this.parseAndUpdateResult(emailId, emailInfo, dataList);
- }
- private List<EmailFileInfoDO> buildEmailFileInfo(List<ParseResult<ReportData>> dataList) {
- List<EmailFileInfoDO> entityList = ListUtil.list(false);
- for (ParseResult<ReportData> result : dataList) {
- EmailFileInfoDO entity = new EmailFileInfoDO();
- entity.setId(result.getData().getBaseInfo().getFileId());
- entity.setParseStatus(result.getStatus());
- entity.setFailReason(result.getMsg());
- entity.setAiParse(result.getData().getAiParse());
- entityList.add(entity);
- }
- return entityList;
- }
- /**
- * 上传文件解析并返回解析状态
- *
- * @param params 上传文件路径
- * @return /
- */
- public List<UploadReportResult> uploadReportResults(UploadReportParams params) {
- String emailTitle = params.getTitle();
- List<UploadReportParams.ReportInfo> reportInfos = params.getReportInfos();
- List<EmailZipFileDTO> dtos = ListUtil.list(false);
- for (UploadReportParams.ReportInfo e : reportInfos) {
- EmailZipFileDTO zipFileDTO = new EmailZipFileDTO(emailTitle, e);
- String reportPath = e.getReportPath();
- if (ArchiveUtil.isArchive(reportPath)) {
- try {
- this.handleCompressedFiles(emailTitle, reportPath, e.getReportType(), dtos);
- } catch (Exception ex) {
- log.warn("报告{} 压缩包解压失败:{}", reportPath, ExceptionUtil.stacktraceToString(ex));
- zipFileDTO.setParseStatus(0);
- zipFileDTO.setFailReason("压缩包解压失败");
- dtos.add(zipFileDTO);
- }
- } else {
- dtos.add(zipFileDTO);
- }
- }
- // 重新判断类型
- this.recheckEmailType(emailTitle, dtos);
- EmailInfoDTO emailInfo = new EmailInfoDTO(emailTitle, dtos);
- Long totalSize = dtos.stream().map(EmailZipFileDTO::getFileSize).reduce(0L, Long::sum);
- EmailParseInfoDO emailDo = this.buildEmailParseInfo("upload", emailInfo, totalSize);
- Integer emailId = this.saveEmailParseInfo(emailDo);
- for (EmailZipFileDTO zipFile : dtos) {
- EmailFileInfoDO emailFile = this.saveEmailFileInfo(emailId, zipFile);
- zipFile.setFileId(emailFile.getId());
- }
- // 解析并处理解析结果
- List<ParseResult<ReportData>> dataList = ListUtil.list(false);
- this.parseAndUpdateResult(emailId, emailInfo, dataList);
- // 解析结果转换
- List<UploadReportResult> resultList = ListUtil.list(false);
- for (ParseResult<ReportData> result : dataList) {
- ReportData data = result.getData();
- resultList.add(new UploadReportResult(data.getBaseInfo().getFileId(),
- data.getBaseInfo().getReportName(), result.getStatus(), result.getMsg()));
- }
- return resultList;
- }
- private void parseAndUpdateResult(Integer emailId,
- EmailInfoDTO emailInfo,
- List<ParseResult<ReportData>> dataList) {
- this.parseResults(emailInfo, dataList);
- String failReason = null;
- int emailParseStatus = EmailParseStatusConst.SUCCESS;
- // 报告邮件有一条失败就表示整个邮件解析失败
- if (CollUtil.isNotEmpty(dataList)) {
- List<EmailFileInfoDO> entityList = this.buildEmailFileInfo(dataList);
- this.emailFileInfoMapper.batchUpdateByFileId(entityList);
- long failNum = dataList.stream().filter(e -> !Objects.equals(EmailParseStatusConst.SUCCESS, e.getStatus())).count();
- if (failNum > 0) {
- emailParseStatus = EmailParseStatusConst.FAIL;
- failReason = dataList.stream().map(ParseResult::getMsg).collect(Collectors.joining(";"));
- }
- }
- this.emailParseInfoMapper.updateParseStatus(emailId, emailParseStatus, failReason);
- }
- /**
- * 重新校验邮件附件的类型(用邮件主题+附件名称)
- *
- * @param emailTitle 邮件主题
- * @param dtos 所有附件
- */
- private void recheckEmailType(String emailTitle, List<EmailZipFileDTO> dtos) {
- for (EmailZipFileDTO emailFile : dtos) {
- if (EmailTypeConst.SUPPORT_NO_OTHER_TYPES.contains(emailFile.getEmailType())
- || !Objects.equals(1, emailFile.getParseStatus())) {
- continue;
- }
- Integer type = EmailUtil.getEmailTypeBySubject(emailTitle + emailFile.getFilename());
- // 特殊月报
- if ((Objects.equals(EmailTypeConst.NAV_EMAIL_TYPE, type) || Objects.equals(EmailTypeConst.REPORT_OTHER_TYPE, type))
- && ReportParseUtils.containsAny(emailTitle, ReportParseUtils.MONTHLY_REPORT_KEYWORDS)) {
- type = EmailTypeConst.REPORT_EMAIL_TYPE;
- }
- if (EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(type)) {
- emailFile.setEmailType(type);
- }
- }
- }
- /**
- * 邮件信息前置处理,在解析操作执行之前的过滤逻辑和校验逻辑。返回所有附件大小汇总
- *
- * @param emailTitle 邮件信息(包含所有解压后的文件)
- * @param dtos 邮件信息(包含所有解压后的文件)
- */
- private void checkEmailFileInfo(String emailTitle, List<EmailZipFileDTO> dtos) {
- // 如果压缩包里面既有pdf又有其他格式的文件,说明其他格式的文件是不需要解析的
- List<String> exts = dtos.stream().map(EmailZipFileDTO::getExtName).distinct().toList();
- if (exts.contains(Constants.FILE_PDF) && exts.size() > 1) {
- dtos.removeIf(e -> !Objects.equals(Constants.FILE_PDF, e.getExtName()));
- }
- // 移除逻辑
- Iterator<EmailZipFileDTO> removeIterator = dtos.iterator();
- while (removeIterator.hasNext()) {
- EmailZipFileDTO dto = removeIterator.next();
- String filename = dto.getFilename();
- // 删除复核函或基金合同
- if (filename.contains("复核函") || (filename.contains("基金合同") && !filename.contains("合同变更"))) {
- log.warn("邮件{} 中的报告{} 是复核函或基金合同,不用解析上传。", emailTitle, filename);
- removeIterator.remove();
- }
- // 不支持的类型
- Integer type = dto.getEmailType();
- if (!EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(type)) {
- log.info("邮件{} 类型{} 不支持解析。", emailTitle, type);
- removeIterator.remove();
- }
- }
- // 数据库已存在的数据过滤(邮件主题+报告名称+附件大小,压缩包文件大小汇总)
- long totalSize = dtos.stream().map(EmailZipFileDTO::getFileSize).reduce(0L, Long::sum);
- Iterator<EmailZipFileDTO> iterator = dtos.iterator();
- while (iterator.hasNext()) {
- EmailZipFileDTO dto = iterator.next();
- String filename = dto.getFilename();
- Integer type = dto.getEmailType();
- int count = 0;
- if (Objects.equals(type, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) {
- // 确认单
- count = this.emailFileInfoMapper.getLetterFilenameSuccessCount(emailTitle, filename);
- } else if (Objects.equals(type, EmailTypeConst.REPORT_EMAIL_TYPE)) {
- // 定期报告
- count = this.emailFileInfoMapper.getAmacFilenameSuccessCount(emailTitle, filename, totalSize);
- } else if (Objects.equals(type, EmailTypeConst.REPORT_WEEKLY_TYPE)) {
- // 管理人周报
- count = this.emailFileInfoMapper.getWeeklyFilenameSuccessCount(emailTitle, filename, totalSize);
- } else if (Objects.equals(type, EmailTypeConst.REPORT_OTHER_TYPE)) {
- // 其他报告
- count = this.emailFileInfoMapper.getOtherFilenameSuccessCount(emailTitle, filename, totalSize);
- }
- if (count > 0) {
- iterator.remove();
- log.info("邮件{} 报告{} 已存在解析成功的记录,不用重新解析。", emailTitle, filename);
- }
- }
- if (CollUtil.isEmpty(dtos)) {
- log.info("邮件{} 经校验没有需要解析的报告文件。", emailTitle);
- return;
- }
- if (log.isInfoEnabled()) {
- log.info("邮件{} 还有报告待解析:\n{}", emailTitle, dtos);
- }
- }
- /**
- * 邮件信息保存+附件解析
- *
- * @param emailInfo 邮件信息,包含附件
- * @param resultList 解析结果
- */
- private void parseResults(EmailInfoDTO emailInfo,
- List<ParseResult<ReportData>> resultList) {
- String emailTitle = emailInfo.getEmailTitle();
- // 拷贝一个新的集合,方便操作不影响原集合
- List<EmailZipFileDTO> dtos = ListUtil.toList(emailInfo.getEmailFileList());
- if (CollUtil.isEmpty(dtos)) {
- return;
- }
- // 附件文件检查
- this.checkEmailFileInfo(emailTitle, dtos);
- // 解析邮件报告
- for (EmailZipFileDTO zipFile : dtos) {
- // 解析并保存报告
- ParseResult<ReportData> parseResult = this.parseReportAndHandleResult(emailTitle, emailInfo.getSenderEmail(), zipFile);
- if (!Objects.equals(1, parseResult.getStatus())) {
- log.error(parseResult.getMsg());
- }
- if (parseResult.getData() == null) {
- parseResult.setData(new ReportData.DefaultReportData());
- }
- resultList.add(parseResult);
- }
- }
- /**
- * Parses a single report file and stores the parse result.
- *
- * Flow, as implemented below: reject files that are not reports (missing/unsupported mail type,
- * blank name or an HTML body file); resolve the report type from the file name and, failing
- * that, from the mail subject; reject unsupported suffixes; convert Word files to PDF; render
- * the first/last page of a PDF to images (or compress an image file); determine the edition of
- * monthly reports; run the regular parser for annual, quarterly, AMAC-edition monthly reports
- * and letters; fall back to the AI parser when no data was produced and the file was not
- * reported as unsupported; enrich the data through OCR; finally call saveReportData.
- *
- * @param emailTitle mail subject
- * @param senderEmail sender address of the mail, forwarded to the OCR enrichment step
- * @param zipFile the report file to parse (file id, mail type, name, path, suffix, PDF password)
- * @return the parse result; its data is null on the first early "not a report" return
- */
- private ParseResult<ReportData> parseReportAndHandleResult(String emailTitle,
- String senderEmail,
- EmailZipFileDTO zipFile) {
- Integer fileId = zipFile.getFileId();
- Integer emailType = zipFile.getEmailType();
- String reportName = zipFile.getFilename();
- String filepath = zipFile.getFilepath();
- ParseResult<ReportData> result = new ParseResult<>();
- boolean reportFlag = emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType); // true when the mail type is missing or unsupported
- if (reportFlag || StrUtil.isBlank(reportName) || reportName.endsWith(Constants.FILE_HTML)) {
- return new ParseResult<>(ReportParseStatus.NOT_A_REPORT, null, reportName);
- }
- // Report type detection. Original note: quarterly first, then annual, then monthly - that ordering lives in ReportParseUtils.matchReportType (not visible here).
- ReportType reportType = ReportParseUtils.matchReportType(emailType, reportName);
- if (reportType == null) {
- reportType = ReportParseUtils.matchReportType(emailType, emailTitle);
- if (log.isDebugEnabled()) {
- log.debug("报告{} 根据邮件主题{} 重新识别的类型是:{}", reportName, emailTitle, reportType);
- }
- }
- // Parser selection: the file suffix decides the parser family; the AI parser is the fallback when that parser yields nothing.
- ReportParserFileType fileType = ReportParserFileType.getBySuffix(zipFile.getExtName());
- // Unsupported file format.
- if (fileType == null) {
- ReportData reportData = this.buildNvlReportData(fileId, reportType, reportName);
- return new ParseResult<>(ReportParseStatus.NO_SUPPORT_TEMPLATE, reportData, reportName);
- }
- // The "not a periodic report" check is deliberately placed after the unsupported-format check.
- if (reportType == null) {
- ReportData reportData = this.buildNvlReportData(fileId, ReportType.OTHER, reportName);
- return new ParseResult<>(ReportParseStatus.NOT_A_REPORT, reportData, reportName);
- }
- // Convert docx to pdf; on failure the original path is kept (best effort). Note that fileType itself is not changed.
- if (Objects.equals(ReportParserFileType.WORD, fileType)) {
- try {
- String outputFile = FileUtil.getParent(filepath, 1) + File.separator + FileUtil.mainName(reportName) + ".pdf";
- PdfUtil.convertDocxToPdf(filepath, outputFile);
- filepath = outputFile;
- } catch (Exception e) {
- log.warn("报告{} 转换为pdf失败:{}", reportName, ExceptionUtil.stacktraceToString(e));
- }
- }
- // Render the first and last page to PNG: the first page is used to recognise fund name and fund code, the last page to recognise seal and contact.
- List<String> images = ListUtil.list(true);
- if (Objects.equals(ReportParserFileType.PDF, fileType)) {
- try {
- String output = filepath.replaceAll("archive|original", "image");
- File outputFile = FileUtil.file(FileUtil.getParent(output, 1));
- images = PdfUtil.convertFirstAndLastPagesToPng(filepath, outputFile, 300, zipFile.getPdfPwd());
- if (log.isDebugEnabled()) {
- log.debug("报告{} 生成的图片地址是:\n{}", reportName, images);
- }
- } catch (Exception e) {
- log.warn("报告{} 生成图片失败:{}", reportName, ExceptionUtil.stacktraceToString(e));
- }
- } else if (Objects.equals(ReportParserFileType.IMG, fileType)) {
- try {
- String outputFile = PdfUtil.compressAndSave(filepath);
- images.add(outputFile);
- } catch (IOException e) {
- log.error("报告{} 图片压缩失败,{}", reportName, ExceptionUtil.stacktraceToString(e));
- }
- }
- // Decide whether a monthly report is the manager edition or the AMAC edition (may use OCR).
- ReportMonthlyType monthlyType = ReportMonthlyType.NO_NEED;
- if (ReportType.MONTHLY == reportType) {
- monthlyType = this.determineReportType(emailTitle, reportName, filepath, images);
- }
- boolean isAmac = reportType == ReportType.ANNUALLY || reportType == ReportType.QUARTERLY
- || (reportType == ReportType.MONTHLY && ReportMonthlyType.AMAC == monthlyType);
- // Set when the parser reports the file as unsupported (NotSupportReportException); it suppresses the AI fallback.
- boolean notSupportFile = false;
- // Parse the report.
- ReportData reportData = null;
- ReportParserParams params = new ReportParserParams(fileId, reportName, filepath, reportType);
- long start = System.currentTimeMillis();
- try {
- if (isAmac || reportType == ReportType.LETTER) {
- ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, fileType);
- reportData = instance.parse(params);
- result = new ParseResult<>(1, "报告解析成功", reportData);
- }
- } catch (ReportParseException e) {
- result = new ParseResult<>(e.getCode(), StrUtil.format(e.getMsg(), reportName), null);
- log.warn("解析失败:{}", result.getMsg());
- if (e instanceof NotSupportReportException) {
- notSupportFile = true;
- }
- } catch (Exception e) {
- log.warn("解析错误:{}", ExceptionUtil.stacktraceToString(e));
- result = new ParseResult<>(ReportParseStatus.PARSE_FAIL, null, e.getMessage());
- } finally {
- // If no data was produced, try once more with the AI parser.
- if (reportData == null && !notSupportFile) {
- if (log.isInfoEnabled()) {
- log.info("报告{} 是周报或管理人月报或其他类型或解析失败,用AI解析器解析", reportName);
- }
- try {
- if (!isAmac && CollUtil.isNotEmpty(images)) {
- filepath = images.get(0); // non-AMAC reports are sent to the AI parser as the first-page image
- }
- params = new ReportParserParams(fileId, reportName, filepath, reportType);
- ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, ReportParserFileType.AI);
- reportData = instance.parse(params);
- result = new ParseResult<>(1, "报告解析成功--AI", reportData);
- } catch (ReportParseException e) {
- result = new ParseResult<>(e.getCode(), StrUtil.format(e.getMsg(), reportName), null);
- log.warn("AI解析失败:{}", result.getMsg());
- } catch (Exception e) {
- log.warn("AI解析错误:{}", ExceptionUtil.stacktraceToString(e));
- result = new ParseResult<>(ReportParseStatus.PARSE_FAIL, null, e.getMessage());
- }
- }
- // Still no data: build a replacement ReportData object so the following steps have something to work on.
- if (reportData == null) {
- reportData = this.buildNvlReportData(fileId, reportType, reportName);
- }
- if (reportData.getBaseInfo() != null) {
- // Record the monthly-report edition.
- reportData.getBaseInfo().setMonthlyType(monthlyType.getType());
- // When the report date is still missing, default it to yesterday.
- if (reportData.getBaseInfo().getReportDate() == null) {
- reportData.getBaseInfo().setReportDate(DateUtil.offsetDay(new Date(), -1));
- }
- }
- // OCR enrichment (seal, contact, fund name and product code).
- reportData = this.ocrReportData(fileId, reportType, monthlyType, reportData, reportName, senderEmail, images);
- result.setData(reportData);
- if (log.isInfoEnabled()) {
- log.info("报告{} 解析耗时{}ms,结果是:{}", reportName, (System.currentTimeMillis() - start), reportData);
- }
- }
- // Store the parse result of the report.
- this.saveReportData(reportData, reportType, reportName);
- return result;
- }
- /**
- * 判断月报类型(管理人版还是协会版)
- *
- * @param emailTitle 邮件主题
- * @param fileName 报告名称
- * @param filepath 报告路径
- * @param images 报告的第一页和尾页图片地址(主要用于ocr提取关键信息)
- */
- public ReportMonthlyType determineReportType(String emailTitle, String fileName,
- String filepath, List<String> images) {
- // 1. 优先根据文件名判断
- if (ReportParseUtils.containsAny(fileName, AMAC_KEYWORDS)) {
- return ReportMonthlyType.AMAC;
- }
- if (ReportParseUtils.containsAny(fileName, ReportParseUtils.MANAGER_KEYWORDS)) {
- return ReportMonthlyType.MANAGER;
- }
- // if (StrUtil.isNotBlank(ReportParseUtils.matchFundCode(fileName))) {
- // return ReportMonthlyType.AMAC;
- // }
- // 2. 根据文件路径判断
- List<String> pathSegments = StrUtil.split(filepath, File.separator);
- for (String segment : pathSegments) {
- boolean isExcluded = ReportParseUtils.containsAny(segment, EXCLUDE_PATH_KEYWORDS);
- if (!isExcluded && ReportParseUtils.containsAny(segment, AMAC_KEYWORDS)) {
- return ReportMonthlyType.AMAC;
- }
- if (!isExcluded && ReportParseUtils.containsAny(segment, ReportParseUtils.MANAGER_KEYWORDS)) {
- return ReportMonthlyType.MANAGER;
- }
- }
- // 3. 根据邮件主题判断
- boolean isAmacEmail = ReportParseUtils.containsAny(emailTitle, AMAC_KEYWORDS)
- && !emailTitle.contains("公司及协会版");
- if (isAmacEmail) {
- return ReportMonthlyType.AMAC;
- }
- if (ReportParseUtils.containsAny(emailTitle, ReportParseUtils.MANAGER_KEYWORDS)) {
- return ReportMonthlyType.MANAGER;
- }
- // 4.ocr 提取“曲线”、“基金份额”等关键字,如果有曲线则是管理人,如果有估值日期则是协会
- if (CollUtil.isNotEmpty(images)) {
- try {
- return new OCRReportParser().parseMonthlyType(fileName, this.ocrParserUrl, images.get(0));
- } catch (Exception ignored) {
- return ReportMonthlyType.FAILED;
- }
- }
- return ReportMonthlyType.FAILED;
- }
- /**
- * ocr 提取信息(包括首页的基金名称或报告日期,尾页的印章或联系人等信息)
- *
- * @param fileId 文件表ID
- * @param reportType 报告类型
- * @param monthlyType 月报类型
- * @param reportData 报告解析结果
- * @param fileName 报告名称
- * @param senderEmail 邮件发送人邮箱
- * @param images 报告的收益和尾页png图片
- */
- private ReportData ocrReportData(Integer fileId,
- ReportType reportType,
- ReportMonthlyType monthlyType,
- ReportData reportData,
- String fileName,
- String senderEmail,
- List<String> images) {
- if (CollUtil.isEmpty(images)) {
- return reportData;
- }
- // 报告才识别尾页的印章和联系人,确认单不识别尾页
- if (ReportType.LETTER != reportType) {
- if (log.isInfoEnabled()) {
- log.info("报告{} 用ocr补充解析结果。补充前的结果是:{}", fileName, reportData);
- }
- OCRParseData parseRes = null;
- try {
- // 首页和尾页相等时只读首页
- String imageUrl = images.size() == 1 ? images.get(0) : images.get(1);
- parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, imageUrl);
- } catch (Exception e) {
- log.error("报告{} OCR识别印章和联系人出错:{}", fileName, e.getMessage());
- }
- // ocr识别尾页是否包含印章和联系人信息
- if (parseRes != null && reportData.getBaseInfo() != null) {
- // 协会报告才设置印章标识
- boolean isAmac = reportType == ReportType.ANNUALLY || reportType == ReportType.QUARTERLY
- || (reportType == ReportType.MONTHLY && ReportMonthlyType.AMAC == monthlyType);
- if (isAmac) {
- if (TG_EMAIL_LIST.contains(senderEmail)) {
- reportData.getBaseInfo().setWithSeals(true);
- } else {
- reportData.getBaseInfo().setWithSeals(parseRes.getWithSeals());
- if (fileName.contains("用印") && !Objects.equals(true, reportData.getBaseInfo().getWithSeals())) {
- reportData.getBaseInfo().setWithSeals(true);
- }
- }
- } else {
- // 管理人报告才设置联系人标识
- reportData.getBaseInfo().setWithContacts(parseRes.getWithContacts());
- }
- }
- // 首页和尾页不相等时解析首页的数据
- if (images.size() != 1 || parseRes == null) {
- try {
- parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0));
- } catch (Exception e) {
- log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, e.getMessage());
- }
- }
- // 用首页识别基金名称、产品代码和基金管理人
- if (reportData.getFundInfo() != null && parseRes != null) {
- if (StrUtil.isBlank(reportData.getFundInfo().getFundName())) {
- reportData.getFundInfo().setFundName(parseRes.getFundName());
- }
- if (StrUtil.isBlank(reportData.getFundInfo().getFundCode())) {
- reportData.getFundInfo().setFundCode(parseRes.getFundCode());
- }
- if (StrUtil.isBlank(reportData.getFundInfo().getCompanyName())
- || !reportData.getFundInfo().getCompanyName().contains("有限公司")) {
- reportData.getFundInfo().setCompanyName(parseRes.getCompanyName());
- }
- }
- reportData.setAiParse(true);
- return reportData;
- }
- // 确认单AI解析失败时重新用OCR识别
- if (!reportData.wasFailed()) {
- return reportData;
- }
- if (log.isInfoEnabled()) {
- log.info("确认单报告{} 用ocr补充解析结果。补充前的结果是:{}", fileName, reportData);
- }
- LetterReportData letterReportData = (LetterReportData) reportData;
- OCRLetterParseData parseRes = null;
- try {
- parseRes = new OCRReportParser().parseLetterData(fileName, this.ocrParserUrl, images.get(0));
- } catch (Exception e) {
- log.error("确认单报告{} OCR提取确认单关键信息出错:{}", fileName, e.getMessage());
- }
- if (parseRes == null) {
- return reportData;
- }
- if (letterReportData.getFundInfo() != null) {
- letterReportData.getFundInfo().setFundName(parseRes.getFundName());
- letterReportData.getFundInfo().setFundCode(parseRes.getFundCode());
- }
- // 投资者信息
- if (letterReportData.getInvestorInfo() == null) {
- letterReportData.setInvestorInfo(new ReportInvestorInfoDTO(fileId));
- }
- letterReportData.getInvestorInfo().setInvestorName(parseRes.getInvestorName());
- letterReportData.getInvestorInfo().setCertificateNumber(parseRes.getCertificateNumber());
- letterReportData.getInvestorInfo().setTradingAccount(parseRes.getTradingAccount());
- letterReportData.getInvestorInfo().setFundAccount(parseRes.getFundAccount());
- letterReportData.getInvestorInfo().setCertificateType(parseRes.getCertificateType());
- // 交易流水
- if (letterReportData.getTransaction() == null) {
- letterReportData.setTransaction(new ReportFundTransactionDTO(fileId));
- }
- letterReportData.getTransaction().setTransactionType(parseRes.getTransactionType());
- letterReportData.getTransaction().setApplyDate(parseRes.getApplyDate());
- letterReportData.getTransaction().setApplyShare(parseRes.getApplyShare());
- letterReportData.getTransaction().setApplyAmount(parseRes.getApplyAmount());
- letterReportData.getTransaction().setHoldingDate(parseRes.getHoldingDate());
- letterReportData.getTransaction().setAmount(parseRes.getAmount());
- letterReportData.getTransaction().setShare(parseRes.getShare());
- letterReportData.getTransaction().setNav(parseRes.getNav());
- letterReportData.setAiParse(true);
- return letterReportData;
- }
- /**
- * 当reportData==null时重新构建一个新对象
- *
- * @param fileId 文件ID
- * @param reportType 报告类型
- * @param reportName 报告名称
- * @return /
- */
- private ReportData buildNvlReportData(Integer fileId, ReportType reportType, String reportName) {
- ReportData reportData = null;
- if (reportType == null) {
- reportType = ReportType.OTHER;
- }
- ReportBaseInfoDTO baseInfo = new ReportBaseInfoDTO(fileId);
- baseInfo.setReportName(reportName);
- baseInfo.setReportType(reportType.name());
- String reportDate = ReportParseUtils.matchReportDate(reportType, reportName);
- baseInfo.setReportDate(ConvertUtil.toDate(reportDate));
- ReportFundInfoDTO fundInfo = new ReportFundInfoDTO(fileId);
- if (ReportType.ANNUALLY == reportType) {
- reportData = new AnnuallyReportData(baseInfo, fundInfo);
- } else if (ReportType.QUARTERLY == reportType) {
- reportData = new QuarterlyReportData(baseInfo, fundInfo);
- } else if (ReportType.MONTHLY == reportType) {
- reportData = new MonthlyReportData(baseInfo, fundInfo);
- } else if (ReportType.WEEKLY == reportType) {
- reportData = new WeeklyReportData(baseInfo, fundInfo);
- } else if (ReportType.OTHER == reportType) {
- reportData = new ReportData.DefaultReportData(baseInfo, fundInfo);
- } else if (ReportType.LETTER == reportType) {
- reportData = new LetterReportData(baseInfo, fundInfo);
- }
- return reportData;
- }
- /**
- * 保存报告解析结果
- *
- * @param reportData 报告解析结果
- * @param reportType 报告类型
- * @param fileName 报告名称
- */
- private void saveReportData(ReportData reportData, ReportType reportType, String fileName) {
- if (reportData == null) {
- return;
- }
- StopWatch writeWatch = new StopWatch();
- writeWatch.start();
- try {
- ReportWriter<ReportData> instance = this.reportWriterFactory.getInstance(reportType);
- instance.write(reportData);
- } catch (Exception e) {
- log.error("报告{} 结果保存失败 {}", fileName, ExceptionUtil.stacktraceToString(e));
- } finally {
- writeWatch.stop();
- if (log.isInfoEnabled()) {
- log.info("报告{} 解析结果保存完成,耗时{}ms", fileName, writeWatch.getTotalTimeMillis());
- }
- }
- }
- private EmailFileInfoDO saveEmailFileInfo(Integer emailId, EmailZipFileDTO zipFile) {
- String filename = zipFile.getFilename();
- String filepath = zipFile.getFilepath();
- EmailFileInfoDO emailFileInfoDO = this.buildEmailFileInfoDO(emailId, filename, filepath);
- if (!Objects.equals(1, zipFile.getParseStatus()) || StrUtil.isNotBlank(zipFile.getFailReason())) {
- emailFileInfoDO.setParseStatus(zipFile.getParseStatus());
- emailFileInfoDO.setFailReason(zipFile.getFailReason());
- }
- if (emailFileInfoDO.getId() != null) {
- this.emailFileInfoMapper.updateTimeById(emailFileInfoDO.getId(), new Date());
- return emailFileInfoDO;
- }
- LambdaQueryWrapper<EmailFileInfoDO> wrapper = Wrappers.lambdaQuery(EmailFileInfoDO.class)
- .eq(EmailFileInfoDO::getEmailId, emailId)
- .eq(EmailFileInfoDO::getFileName, filename)
- .eq(EmailFileInfoDO::getFilePath, filepath);
- List<EmailFileInfoDO> tempList = this.emailFileInfoMapper.selectList(wrapper);
- if (CollUtil.isNotEmpty(tempList)) {
- return tempList.get(0);
- }
- this.emailFileInfoMapper.insertById(emailFileInfoDO);
- return emailFileInfoDO;
- }
- private EmailFileInfoDO buildEmailFileInfoDO(Integer emailId, String fileName, String filePath) {
- EmailFileInfoDO emailFileInfoDO = new EmailFileInfoDO();
- emailFileInfoDO.setId(null);
- emailFileInfoDO.setEmailId(emailId);
- emailFileInfoDO.setFileName(fileName);
- emailFileInfoDO.setFilePath(filePath);
- emailFileInfoDO.setIsvalid(1);
- emailFileInfoDO.setCreatorId(0);
- emailFileInfoDO.setCreateTime(new Date());
- emailFileInfoDO.setUpdaterId(0);
- emailFileInfoDO.setUpdateTime(new Date());
- return emailFileInfoDO;
- }
- private Integer saveEmailParseInfo(EmailParseInfoDO emailParseInfoDO) {
- if (emailParseInfoDO == null) {
- return null;
- }
- // 重新邮件功能 -> 修改解析时间和更新时间
- if (emailParseInfoDO.getId() != null) {
- this.emailParseInfoMapper.updateParseTime(emailParseInfoDO.getId(), emailParseInfoDO.getParseDate());
- return emailParseInfoDO.getId();
- }
- LambdaQueryWrapper<EmailParseInfoDO> wrapper = Wrappers.lambdaQuery(EmailParseInfoDO.class)
- .eq(EmailParseInfoDO::getEmailTitle, emailParseInfoDO.getEmailTitle())
- .eq(EmailParseInfoDO::getSenderEmail, emailParseInfoDO.getSenderEmail())
- .eq(EmailParseInfoDO::getEmailDate, emailParseInfoDO.getEmailDate())
- .eq(EmailParseInfoDO::getEmail, emailParseInfoDO.getEmail())
- .orderByDesc(EmailParseInfoDO::getId);
- List<EmailParseInfoDO> tempList = this.emailParseInfoMapper.selectList(wrapper);
- if (CollUtil.isNotEmpty(tempList)) {
- this.emailParseInfoMapper.update(emailParseInfoDO, wrapper);
- return tempList.get(0).getId();
- }
- this.emailParseInfoMapper.insertAndId(emailParseInfoDO);
- return emailParseInfoDO.getId();
- }
- private EmailParseInfoDO buildEmailParseInfo(String emailAddress, EmailInfoDTO emailInfo, long totalSize) {
- EmailParseInfoDO emailParseInfoDO = new EmailParseInfoDO();
- emailParseInfoDO.setId(null);
- emailParseInfoDO.setSenderEmail(emailInfo.getSenderEmail());
- emailParseInfoDO.setEmail(emailAddress);
- emailParseInfoDO.setEmailDate(DateUtil.parse(emailInfo.getEmailDate(), DateConst.YYYY_MM_DD_HH_MM_SS));
- emailParseInfoDO.setParseDate(new Date());
- emailParseInfoDO.setEmailTitle(emailInfo.getEmailTitle());
- emailParseInfoDO.setEmailContent(emailInfo.getEmailContent());
- emailParseInfoDO.setParseStatus(EmailParseStatusConst.SUCCESS);
- emailParseInfoDO.setAttrSize(totalSize);
- emailParseInfoDO.setIsvalid(1);
- emailParseInfoDO.setCreatorId(0);
- emailParseInfoDO.setCreateTime(new Date());
- emailParseInfoDO.setUpdaterId(0);
- emailParseInfoDO.setUpdateTime(new Date());
- return emailParseInfoDO;
- }
- /**
- * 读取邮件
- *
- * @param mailboxInfoDTO 邮箱配置信息
- * @param startDate 邮件起始日期
- * @param endDate 邮件截止日期(为null,将解析邮件日期小于等于startDate的当天邮件)
- * @return 读取到的邮件信息
- * @throws Exception 异常信息
- */
- private Map<String, List<EmailContentInfoDTO>> realEmail(MailboxInfoDTO mailboxInfoDTO,
- Date startDate, Date endDate,
- List<String> folderNames) throws Exception {
- if (CollUtil.isEmpty(folderNames)) {
- folderNames = ListUtil.toList("INBOX");
- }
- Store store = EmailUtil.getStoreNew(mailboxInfoDTO);
- if (store == null) {
- return MapUtil.newHashMap(4);
- }
- Map<String, List<EmailContentInfoDTO>> result = MapUtil.newHashMap(128);
- try {
- if (log.isDebugEnabled()) {
- Folder[] list = store.getDefaultFolder().list("*");
- List<String> names = Arrays.stream(list).map(Folder::getFullName).toList();
- log.debug("获取所有邮箱文件夹:{}", names);
- }
- for (String folderName : folderNames) {
- try {
- Map<String, List<EmailContentInfoDTO>> temp = this.getFolderEmail(mailboxInfoDTO,
- startDate, endDate, store, folderName);
- if (MapUtil.isNotEmpty(temp)) {
- result.putAll(temp);
- }
- } catch (Exception e) {
- log.warn("文件夹{} 邮件获取失败:{}", folderName, ExceptionUtil.stacktraceToString(e));
- }
- }
- } catch (Exception e) {
- log.error("邮件获取失败:{}", ExceptionUtil.stacktraceToString(e));
- } finally {
- store.close();
- }
- return result;
- }
- private Map<String, List<EmailContentInfoDTO>> getFolderEmail(MailboxInfoDTO mailboxInfoDTO,
- Date startDate, Date endDate,
- Store store, String folderName) throws MessagingException {
- // 默认读取收件箱的邮件
- Folder folder = store.getFolder(folderName);
- folder.open(this.readWriteSeen ? Folder.READ_WRITE : Folder.READ_ONLY);
- Message[] messages = getEmailMessage(folder, mailboxInfoDTO.getProtocol(), startDate);
- if (messages == null || messages.length == 0) {
- log.warn("{} 获取不到邮件 -> 邮箱信息:{},开始时间:{},结束时间:{}", folderName, mailboxInfoDTO, startDate, endDate);
- return MapUtil.newHashMap();
- }
- String emailAddress = mailboxInfoDTO.getAccount();
- Map<String, List<EmailContentInfoDTO>> emailMessageMap = MapUtil.newHashMap();
- for (Message message : messages) {
- long start = System.currentTimeMillis();
- List<EmailContentInfoDTO> dtos = CollUtil.newArrayList();
- String emailTitle = message.getSubject();
- if (this.readWriteSeen && isMessageRead(message)) {
- log.warn("{} 邮件{} 已读,不用重新下载解析!", folderName, emailTitle);
- continue;
- }
- try {
- Date emailDate = message.getSentDate();
- String emailDateStr = DateUtil.format(emailDate, DateConst.YYYY_MM_DD_HH_MM_SS);
- if (log.isInfoEnabled()) {
- log.info("{} 邮件{} 数据获取中,邮件时间:{}", folderName, emailTitle, emailDateStr);
- }
- boolean isNotParseConditionSatisfied = emailDate == null
- || (endDate != null && emailDate.compareTo(endDate) > 0)
- || (startDate != null && emailDate.compareTo(startDate) < 0);
- if (isNotParseConditionSatisfied) {
- String st = DateUtil.formatDateTime(startDate);
- String ed = DateUtil.formatDateTime(endDate);
- log.warn("{} 邮件{} 发送时间{}不在区间内【{} ~ {}】", folderName, emailTitle, emailDateStr, st, ed);
- continue;
- }
- String senderEmail = getSenderEmail(message);
- Integer emailType = EmailUtil.getEmailTypeBySubject(emailTitle);
- if (emailType == null) {
- log.warn("{} 邮件不满足解析条件 -> 邮件主题:{},邮件日期:{}", folderName, emailTitle, emailDateStr);
- continue;
- }
- // // 成功解析的邮件不用重复下载
- // Integer okNum = this.emailParseInfoMapper.countEmailByInfoAndStatus(emailTitle, senderEmail, emailAddress, emailDateStr);
- // if (okNum > 0) {
- // if (log.isInfoEnabled()) {
- // log.info("{} 邮件{} 已经存在解析完成的记录,不要重复下载了。", folderName, emailTitle);
- // }
- // continue;
- // }
- if (log.isInfoEnabled()) {
- log.info("{} 邮件{} 基本信息获取完成,开始下载附件!邮件日期:{}", folderName, emailTitle, emailDateStr);
- }
- Object messageContent = message.getContent();
- String[] contents = new String[]{null};
- if (messageContent instanceof Multipart multipart) {
- this.reMultipart(emailAddress, emailTitle, emailDate, multipart, contents, dtos);
- } else {
- log.warn("{} 邮件{} 获取不了附件", folderName, emailTitle);
- }
- if (CollUtil.isEmpty(dtos)) {
- log.warn("{} 邮件{} 没有获取到附件", folderName, emailTitle);
- continue;
- }
- dtos.forEach(e -> {
- e.setEmailType(emailType);
- e.setSenderEmail(senderEmail);
- e.setEmailContent(contents[0]);
- });
- emailMessageMap.put(IdUtil.simpleUUID(), dtos);
- } catch (Exception e) {
- log.error("{} 邮件{} 下载报错 {}", folderName, emailTitle, ExceptionUtil.stacktraceToString(e));
- } finally {
- if (CollUtil.isNotEmpty(dtos) && log.isInfoEnabled()) {
- log.info("{} 邮件{} 下载完成,总计耗时{} ms,文件内容如下\n {}", folderName,
- emailTitle, System.currentTimeMillis() - start, dtos);
- }
- }
- }
- if (this.readWriteSeen) {
- // 设置已读标志
- folder.setFlags(messages, new Flags(Flags.Flag.SEEN), true);
- }
- folder.close(false);
- return emailMessageMap;
- }
- private void rePart(String account, String subject, Date sendDate, Part part,
- List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
- String fileName = EmailUtil.decodeFileName(part);
- if (StrUtil.isBlank(fileName)) {
- return;
- }
- if (fileName.contains("\"") || fileName.contains("\n")) {
- fileName = fileName.replaceAll("\"", "").replaceAll("\n", "");
- }
- if (fileName.contains("=?")) {
- fileName = MimeUtility.decodeText(fileName);
- }
- String disposition = part.getDisposition();
- String contentType = part.getContentType();
- String[] att_files = new String[]{Constants.ARCHIVE_7Z, Constants.ARCHIVE_RAR, Constants.ARCHIVE_ZIP,
- Constants.FILE_PDF, Constants.FILE_DOCX, Constants.FILE_JPG, Constants.FILE_PNG};
- boolean attachmentFlag = StrUtil.endWithAny(fileName, att_files);
- boolean isAttachment = attachmentFlag
- || Part.ATTACHMENT.equalsIgnoreCase(disposition)
- || (contentType != null && attachmentMimePrefixes.stream().anyMatch(prefix ->
- StrUtil.startWithIgnoreCase(contentType, prefix)
- ));
- if (!isAttachment) {
- log.warn("邮件{} 未检测到{}类型的附件 (fileName={}, disposition={}, contentType={})",
- subject, att_files, fileName, disposition, contentType);
- return;
- }
- File saveFile = this.generateSavePath(account, sendDate, fileName);
- if (!saveFile.exists()) {
- if (!saveFile.getParentFile().exists()) {
- boolean mkdirs = saveFile.getParentFile().mkdirs();
- if (!mkdirs) {
- log.warn("file path mkdir failed.");
- }
- }
- try (InputStream is = part.getInputStream()) {
- Files.copy(is, saveFile.toPath());
- }
- } else {
- if (log.isInfoEnabled()) {
- log.info("邮件{} 已下载过附件:{},不用重新下载了。", subject, saveFile.toPath());
- }
- }
- EmailContentInfoDTO emailContentInfoDTO = new EmailContentInfoDTO();
- emailContentInfoDTO.setFileName(fileName);
- emailContentInfoDTO.setFileSize(part.getSize());
- emailContentInfoDTO.setFilePath(saveFile.getAbsolutePath());
- emailContentInfoDTO.setEmailAddress(account);
- emailContentInfoDTO.setEmailTitle(subject);
- emailContentInfoDTO.setEmailDate(DateUtil.format(sendDate, DateConst.YYYY_MM_DD_HH_MM_SS));
- emailContentInfoDTOList.add(emailContentInfoDTO);
- }
- public File generateSavePath(String account, Date sendDate, String fileName) {
- String emailDateStr = DateUtil.format(sendDate, DateConst.YYYYMMDD);
- String filePath = this.path + File.separator + account + File.separator +
- emailDateStr + File.separator + "original" + File.separator;
- // 压缩包重名时的后面的压缩包会覆盖前面压缩包的问题(不考虑普通文件)
- String emailDate = DateUtil.format(sendDate, DateConst.YYYYMMDDHHMMSS24);
- String realName = ArchiveUtil.isArchive(fileName) ? emailDate + fileName : fileName;
- return FileUtil.file(filePath + realName);
- }
- private void reMultipart(String account, String subject, Date emailDate,
- Multipart multipart, String[] contents,
- List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
- for (int i = 0; i < multipart.getCount(); i++) {
- Part bodyPart = multipart.getBodyPart(i);
- Object bodyPartContent = bodyPart.getContent();
- if (bodyPartContent instanceof String) {
- if (log.isDebugEnabled()) {
- log.debug("邮件{} 获取的正文不做解析,内容是 {}", subject, bodyPartContent);
- }
- if (StrUtil.startWithIgnoreCase(bodyPart.getContentType(), MediaType.TEXT_HTML_VALUE)) {
- contents[0] = bodyPartContent.toString();
- }
- continue;
- }
- if (bodyPartContent instanceof Multipart mp) {
- this.reMultipart(account, subject, emailDate, mp, contents, emailContentInfoDTOList);
- } else {
- this.rePart(account, subject, emailDate, bodyPart, emailContentInfoDTOList);
- }
- }
- }
- private String getSenderEmail(Message message) {
- Address[] senderAddress;
- try {
- senderAddress = message.getFrom();
- if (senderAddress == null || senderAddress.length == 0) {
- return null;
- }
- // 此时的address是含有编码(MIME编码方式)后的文本和实际的邮件地址
- String address = "";
- for (Address from : senderAddress) {
- if (StrUtil.isNotBlank(from.toString())) {
- address = from.toString();
- break;
- }
- }
- // 正则表达式匹配邮件地址
- Pattern pattern = Pattern.compile("<(\\S+)>");
- Matcher matcher = pattern.matcher(address);
- if (matcher.find()) {
- return matcher.group(1);
- }
- } catch (MessagingException e) {
- log.error(e.getMessage(), e);
- }
- return null;
- }
- private Message[] getEmailMessage(Folder folder, String protocol, Date startDate) {
- try {
- if (protocol.contains("imap")) {
- // 获取邮件日期大于等于startDate的邮件(搜索条件只支持按天)
- SearchTerm startDateTerm = new ReceivedDateTerm(ComparisonTerm.GE, startDate);
- return folder.search(startDateTerm);
- } else {
- return folder.getMessages();
- }
- } catch (MessagingException e) {
- throw new RuntimeException(e);
- }
- }
- /**
- * 检查邮件是否已读
- *
- * @param message 邮件对象
- * @return true表示已读,false表示未读
- * @throws MessagingException 如果访问邮件标志时出错
- */
- private boolean isMessageRead(Message message) throws MessagingException {
- // 获取邮件的所有标志
- Flags flags = message.getFlags();
- // 检查是否包含 SEEN 标志
- return flags.contains(Flags.Flag.SEEN);
- }
- }
|