EmailParseService.java 55 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129
  1. package com.smppw.modaq.domain.service;
  2. import cn.hutool.core.collection.CollUtil;
  3. import cn.hutool.core.collection.ListUtil;
  4. import cn.hutool.core.date.DateUtil;
  5. import cn.hutool.core.exceptions.ExceptionUtil;
  6. import cn.hutool.core.io.FileUtil;
  7. import cn.hutool.core.map.MapUtil;
  8. import cn.hutool.core.util.IdUtil;
  9. import cn.hutool.core.util.StrUtil;
  10. import com.smppw.modaq.application.components.ReportParseUtils;
  11. import com.smppw.modaq.application.components.report.parser.ReportParser;
  12. import com.smppw.modaq.application.components.report.parser.ReportParserFactory;
  13. import com.smppw.modaq.application.components.report.writer.ReportWriter;
  14. import com.smppw.modaq.application.components.report.writer.ReportWriterFactory;
  15. import com.smppw.modaq.application.util.EmailUtil;
  16. import com.smppw.modaq.common.conts.Constants;
  17. import com.smppw.modaq.common.conts.DateConst;
  18. import com.smppw.modaq.common.conts.EmailParseStatusConst;
  19. import com.smppw.modaq.common.conts.EmailTypeConst;
  20. import com.smppw.modaq.common.enums.ReportMonthlyType;
  21. import com.smppw.modaq.common.enums.ReportParseStatus;
  22. import com.smppw.modaq.common.enums.ReportParserFileType;
  23. import com.smppw.modaq.common.enums.ReportType;
  24. import com.smppw.modaq.common.exception.NotSupportReportException;
  25. import com.smppw.modaq.common.exception.ReportParseException;
  26. import com.smppw.modaq.domain.dto.*;
  27. import com.smppw.modaq.domain.dto.report.*;
  28. import com.smppw.modaq.domain.dto.report.ocr.OCRLetterParseData;
  29. import com.smppw.modaq.domain.dto.report.ocr.OCRParseData;
  30. import com.smppw.modaq.domain.dto.report.ParseResult;
  31. import com.smppw.modaq.domain.dto.report.ReportData;
  32. import com.smppw.modaq.domain.dto.report.ReportParserParams;
  33. import com.smppw.modaq.domain.entity.EmailFileInfoDO;
  34. import com.smppw.modaq.domain.entity.EmailParseInfoDO;
  35. import com.smppw.modaq.domain.mapper.EmailFileInfoMapper;
  36. import com.smppw.modaq.domain.mapper.EmailParseInfoMapper;
  37. import com.smppw.modaq.infrastructure.util.ArchiveUtil;
  38. import com.smppw.modaq.infrastructure.util.PdfUtil;
  39. import jakarta.mail.*;
  40. import jakarta.mail.internet.MimeUtility;
  41. import jakarta.mail.search.ComparisonTerm;
  42. import jakarta.mail.search.ReceivedDateTerm;
  43. import jakarta.mail.search.SearchTerm;
  44. import org.slf4j.Logger;
  45. import org.slf4j.LoggerFactory;
  46. import org.springframework.beans.factory.annotation.Value;
  47. import org.springframework.stereotype.Service;
  48. import org.springframework.util.StopWatch;
  49. import java.io.File;
  50. import java.io.IOException;
  51. import java.io.InputStream;
  52. import java.nio.file.Files;
  53. import java.util.*;
  54. import java.util.regex.Matcher;
  55. import java.util.regex.Pattern;
  56. import java.util.stream.Collectors;
  57. /**
  58. * @author mozuwen
  59. * @date 2024-09-04
  60. * @description 邮件解析服务
  61. */
  62. @Service
  63. public class EmailParseService {
  64. // public static final int stepSize = 10000;
  65. private static final Logger log = LoggerFactory.getLogger(EmailParseService.class);
  66. // Constant definitions: keyword sets kept in one place
  67. private static final Set<String> AMAC_KEYWORDS = Set.of("协会", "信披");
  68. private static final Set<String> EXCLUDE_PATH_KEYWORDS = Set.of("公司及协会版", "公司和协会版");
  69. // Extended set of supported attachment MIME types
  70. private static final Set<String> attachmentMimePrefixes = Set.of(
  71. "application/pdf",
  72. "application/zip",
  73. "application/x-zip-compressed",
  74. "application/rar",
  75. "application/x-rar-compressed",
  76. "application/octet-stream"
  77. // Add further types here as needed...
  78. );
  79. private final EmailParseInfoMapper emailParseInfoMapper;
  80. private final EmailFileInfoMapper emailFileInfoMapper;
  81. /* Factories for report parsing and for persisting parsed reports */
  82. private final ReportParserFactory reportParserFactory;
  83. private final ReportWriterFactory reportWriterFactory;
  84. @Value("${email.file.path}")
  85. private String path;
  86. // @Value("${email.report.ocr-parser-url}")
  87. // private String ocrParserUrl; // NOTE(review): ocrReportData still reads this.ocrParserUrl - confirm that method is meant to be disabled as well
  88. @Value("${email.read-write-seen:true}")
  89. private boolean readWriteSeen;
  /**
   * Creates the service from its collaborators.
   *
   * @param emailParseInfoMapper mapper for mail-level parse records
   * @param emailFileInfoMapper  mapper for per-file records
   * @param reportParserFactory  factory that supplies report parsers
   * @param reportWriterFactory  factory that supplies report writers
   */
  public EmailParseService(EmailParseInfoMapper emailParseInfoMapper,
                           EmailFileInfoMapper emailFileInfoMapper,
                           ReportParserFactory reportParserFactory,
                           ReportWriterFactory reportWriterFactory) {
    // Factories first, then mappers; the assignments are independent of each other.
    this.reportParserFactory = reportParserFactory;
    this.reportWriterFactory = reportWriterFactory;
    this.emailFileInfoMapper = emailFileInfoMapper;
    this.emailParseInfoMapper = emailParseInfoMapper;
  }
  99. /**
  100. * 解析指定邮箱指定时间范围内的邮件
  101. *
  102. * @param mailboxInfoDTO 邮箱配置信息
  103. * @param startDate 邮件起始日期(yyyy-MM-dd HH:mm:ss)
  104. * @param endDate 邮件截止日期(yyyy-MM-dd HH:mm:ss, 为null,将解析邮件日期小于等于startDate的当天邮件)
  105. * @param emailTypes 当前任务支持的邮件类型,默认支持确认单
  106. */
  107. public void parseEmail(MailboxInfoDTO mailboxInfoDTO,
  108. Date startDate, Date endDate,
  109. List<String> folderNames, List<Integer> emailTypes) {
  110. if (CollUtil.isEmpty(emailTypes)) {
  111. emailTypes = ListUtil.of(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE);
  112. }
  113. if (log.isInfoEnabled()) {
  114. log.info("开始邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO, DateUtil.format(startDate,
  115. DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  116. }
  117. Map<String, List<EmailContentInfoDTO>> emailContentMap;
  118. try {
  119. emailContentMap = this.realEmail(mailboxInfoDTO, startDate, endDate, folderNames);
  120. } catch (Exception e) {
  121. log.error("采集邮件失败 -> 邮箱配置信息:{},堆栈信息:{}", mailboxInfoDTO, ExceptionUtil.stacktraceToString(e));
  122. return;
  123. }
  124. if (MapUtil.isEmpty(emailContentMap)) {
  125. log.warn("未采集到邮件 -> 邮箱配置信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO,
  126. DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  127. return;
  128. }
  129. for (Map.Entry<String, List<EmailContentInfoDTO>> emailEntry : emailContentMap.entrySet()) {
  130. List<EmailContentInfoDTO> emailContentInfoDTOList = emailEntry.getValue();
  131. if (CollUtil.isEmpty(emailContentInfoDTOList)) {
  132. log.warn("未采集到正文或附件");
  133. continue;
  134. }
  135. EmailContentInfoDTO dto = emailContentInfoDTOList.get(0);
  136. String emailTitle = dto.getEmailTitle();
  137. log.info("开始解析邮件数据 -> 邮件主题:{},邮件日期:{}", emailTitle, dto.getEmailDate());
  138. List<EmailZipFileDTO> emailFileList = ListUtil.list(false);
  139. EmailInfoDTO emailInfo = new EmailInfoDTO(dto, emailFileList);
  140. for (EmailContentInfoDTO emailDto : emailContentInfoDTOList) {
  141. // 正文不用解压附件
  142. if (emailDto.getFileName() != null && emailDto.getFileName().endsWith(Constants.FILE_HTML)) {
  143. continue;
  144. }
  145. try {
  146. emailFileList.addAll(this.parseZipEmail(emailDto));
  147. } catch (IOException e) {
  148. log.error("压缩包解压失败:{}", ExceptionUtil.stacktraceToString(e));
  149. EmailParseInfoDO fail = buildEmailParseInfo(mailboxInfoDTO.getAccount(),
  150. dto.getEmailType(), emailInfo, emailDto.getFileSize());
  151. fail.setFailReason("压缩包解压失败");
  152. fail.setParseStatus(EmailParseStatusConst.FAIL);
  153. fail.setEmailKey(emailEntry.getKey());
  154. this.emailParseInfoMapper.insert(fail);
  155. } catch (Exception e) {
  156. log.error("堆栈信息:{}", ExceptionUtil.stacktraceToString(e));
  157. }
  158. }
  159. // 重新判断类型
  160. for (EmailZipFileDTO emailFile : emailFileList) {
  161. if (EmailTypeConst.SUPPORT_NO_OTHER_TYPES.contains(emailFile.getEmailType())) {
  162. continue;
  163. }
  164. Integer type = EmailUtil.getEmailTypeBySubject(emailTitle + emailFile.getFilename());
  165. // 特殊月报
  166. if ((Objects.equals(EmailTypeConst.NAV_EMAIL_TYPE, type)
  167. || Objects.equals(EmailTypeConst.REPORT_OTHER_TYPE, type))
  168. && (ReportParseUtils.containsAny(emailTitle, ReportParseUtils.MANAGER_KEYWORDS)
  169. || emailTitle.contains("定期报告"))) {
  170. type = EmailTypeConst.REPORT_EMAIL_TYPE;
  171. }
  172. // 其他报告
  173. if (Objects.equals(EmailTypeConst.NAV_EMAIL_TYPE, type)) {
  174. type = EmailTypeConst.REPORT_OTHER_TYPE;
  175. }
  176. emailFile.setEmailType(type);
  177. }
  178. Iterator<EmailZipFileDTO> entryIterator = emailFileList.iterator();
  179. while (entryIterator.hasNext()) {
  180. EmailZipFileDTO entry = entryIterator.next();
  181. if (!emailTypes.contains(entry.getEmailType())) {
  182. log.warn("当前邮件{} 文件{} 的类型{} 不在支持的任务类型{} 中,不用执行解析逻辑。",
  183. entry.getEmailTitle(), entry.getFilepath(), entry.getEmailType(), emailTypes);
  184. entryIterator.remove();
  185. }
  186. }
  187. // 保存相关信息 -> 邮件信息表,邮件文件表,邮件净值表,邮件规模表,基金净值表
  188. saveRelatedTable(emailEntry.getKey(), mailboxInfoDTO.getAccount(), emailInfo);
  189. log.info("结束邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", emailEntry.getValue(),
  190. DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  191. }
  192. }
  193. /**
  194. * 解压压缩包,如果不是压缩包需转换
  195. *
  196. * @param emailContentInfoDTO 邮件信息
  197. * @return 解压后的文件列表
  198. * @throws IOException /
  199. */
  200. public List<EmailZipFileDTO> parseZipEmail(EmailContentInfoDTO emailContentInfoDTO) throws IOException {
  201. List<EmailZipFileDTO> resultList = ListUtil.list(false);
  202. Integer emailType = emailContentInfoDTO.getEmailType();
  203. String filepath = emailContentInfoDTO.getFilePath();
  204. String emailTitle = emailContentInfoDTO.getEmailTitle();
  205. if (ArchiveUtil.isArchive(filepath)) {
  206. this.handleCompressedFiles(emailTitle, filepath, emailType, resultList);
  207. } else {
  208. // 不是压缩包时
  209. EmailZipFileDTO dto = new EmailZipFileDTO(emailTitle, emailContentInfoDTO);
  210. resultList.add(dto);
  211. }
  212. // 文件中的类型判断
  213. if (emailType == null || !EmailTypeConst.SUPPORT_NO_OTHER_TYPES.contains(emailType)) {
  214. emailType = EmailUtil.getEmailTypeBySubject(emailContentInfoDTO.getFileName());
  215. emailContentInfoDTO.setEmailType(emailType);
  216. }
  217. if (CollUtil.isNotEmpty(resultList)) {
  218. for (EmailZipFileDTO dto : resultList) {
  219. dto.setEmailType(emailType);
  220. }
  221. }
  222. return resultList;
  223. }
  224. /**
  225. * 解压压缩包并把压缩包里面的所有文件放在resultList中
  226. *
  227. * @param emailTitle 邮件主题
  228. * @param filepath 压缩包路径
  229. * @param emailType 邮件解析类型
  230. * @param resultList 解压结果列表
  231. * @throws IOException /
  232. */
  233. private void handleCompressedFiles(String emailTitle,
  234. String filepath,
  235. Integer emailType,
  236. List<EmailZipFileDTO> resultList) throws IOException {
  237. String parent = FileUtil.getParent(filepath, 2);
  238. String destPath = parent + File.separator + "archive" + File.separator + FileUtil.mainName(filepath);
  239. File destFile = new File(destPath);
  240. if (!destFile.exists()) {
  241. if (!destFile.mkdirs()) {
  242. throw new IOException("无法创建目标目录: " + destPath);
  243. }
  244. }
  245. List<String> extractedDirs;
  246. if (ArchiveUtil.isZip(filepath)) {
  247. extractedDirs = ArchiveUtil.extractCompressedFiles(filepath, destPath);
  248. } else if (ArchiveUtil.isRAR(filepath) || ArchiveUtil.is7z(filepath)) {
  249. // 7z和rar压缩包解压
  250. extractedDirs = ArchiveUtil.extractRar5(filepath, destPath);
  251. } else {
  252. return;
  253. }
  254. for (String dir : extractedDirs) {
  255. // 如果邮件类型不满足解析条件则重新根据文件名判断
  256. if (emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) {
  257. emailType = EmailUtil.getEmailTypeBySubject(dir);
  258. }
  259. File file = new File(dir);
  260. if (file.isDirectory()) {
  261. String[] subDirs = file.list();
  262. if (subDirs != null) {
  263. for (String subDir : subDirs) {
  264. resultList.add(new EmailZipFileDTO(emailTitle, subDir, emailType));
  265. }
  266. } else {
  267. log.warn("目录 {} 下无文件", dir);
  268. }
  269. } else {
  270. resultList.add(new EmailZipFileDTO(emailTitle, dir, emailType));
  271. }
  272. }
  273. }
  274. /**
  275. * 邮件附件解析并保存结果数据
  276. *
  277. * @param emailKey 没封邮件的uuid
  278. * @param emailAddress 发送人地址
  279. * @param emailInfo 邮件信息
  280. */
  281. public void saveRelatedTable(String emailKey, String emailAddress, EmailInfoDTO emailInfo) {
  282. // 附件文件检查
  283. Long totalSize = this.checkEmailFileInfo(emailInfo);
  284. if (totalSize == null) {
  285. return;
  286. }
  287. // 解析并保存数据
  288. List<ParseResult<ReportData>> dataList = ListUtil.list(true);
  289. Integer emailId = this.parseResults(null, emailKey, emailAddress, totalSize, emailInfo, dataList);
  290. String failReason = null;
  291. int emailParseStatus = EmailParseStatusConst.SUCCESS;
  292. // 报告邮件有一条失败就表示整个邮件解析失败
  293. if (CollUtil.isNotEmpty(dataList)) {
  294. // ai解析结果
  295. List<ReportData> aiParaseList = dataList.stream().map(ParseResult::getData)
  296. .filter(Objects::nonNull).filter(e -> Objects.equals(true, e.getAiParse())).toList();
  297. if (CollUtil.isNotEmpty(aiParaseList)) {
  298. for (ReportData data : aiParaseList) {
  299. this.emailFileInfoMapper.updateAiParseByFileId(data.getBaseInfo().getFileId(),
  300. data.getAiParse(), data.getAiFileId());
  301. }
  302. }
  303. long failNum = dataList.stream().filter(e -> !Objects.equals(EmailParseStatusConst.SUCCESS, e.getStatus())).count();
  304. if (failNum > 0) {
  305. emailParseStatus = EmailParseStatusConst.FAIL;
  306. failReason = dataList.stream().map(ParseResult::getMsg).collect(Collectors.joining(";"));
  307. }
  308. }
  309. this.emailParseInfoMapper.updateParseStatus(emailId, emailParseStatus, failReason);
  310. }
  311. /**
  312. * 上传文件解析并返回解析状态
  313. *
  314. * @param params 上传文件路径
  315. * @return /
  316. */
  317. public List<UploadReportResult> uploadReportResults(UploadReportParams params) {
  318. List<ParseResult<ReportData>> dataList = ListUtil.list(false);
  319. List<UploadReportParams.ReportInfo> reportInfos = params.getReportInfos();
  320. List<EmailZipFileDTO> dtos = ListUtil.list(false);
  321. for (UploadReportParams.ReportInfo e : reportInfos) {
  322. String reportPath = e.getReportPath();
  323. if (ArchiveUtil.isArchive(reportPath)) {
  324. try {
  325. this.handleCompressedFiles(params.getTitle(), reportPath, e.getReportType(), dtos);
  326. } catch (Exception ex) {
  327. log.warn("报告{} 压缩包解压失败:{}", reportPath, ExceptionUtil.stacktraceToString(ex));
  328. ReportData reportData = new ReportData.DefaultReportData();
  329. reportData.setReportPath(reportPath);
  330. dataList.add(new ParseResult<>(ReportParseStatus.ARCHIVE_FAIL, reportData));
  331. }
  332. } else {
  333. dtos.add(new EmailZipFileDTO(params.getTitle(), reportPath, e.getReportType()));
  334. }
  335. }
  336. EmailInfoDTO emailInfo = new EmailInfoDTO(params.getTitle(), dtos);
  337. // 附件文件检查
  338. Long totalSize = this.checkEmailFileInfo(emailInfo);
  339. if (totalSize == null) {
  340. return null;
  341. }
  342. this.parseResults(-1, null, null, totalSize, emailInfo, dataList);
  343. List<UploadReportResult> resultList = ListUtil.list(false);
  344. for (ParseResult<ReportData> result : dataList) {
  345. ReportData data = result.getData();
  346. resultList.add(new UploadReportResult(data.getReportPath(), result.getStatus(), result.getMsg()));
  347. }
  348. return resultList;
  349. }
  350. /**
  351. * 邮件信息前置处理,在解析操作执行之前的过滤逻辑和校验逻辑。返回所有附件大小汇总
  352. *
  353. * @param emailInfo 邮件信息(包含所有解压后的文件)
  354. * @return 所有附件大小汇总,为null说明没有文件需要上传
  355. */
  356. private Long checkEmailFileInfo(EmailInfoDTO emailInfo) {
  357. String emailTitle = emailInfo.getEmailTitle();
  358. List<EmailZipFileDTO> dtos = emailInfo.getEmailFileList();
  359. // 如果压缩包里面既有pdf又有其他格式的文件,说明其他格式的文件是不需要解析的
  360. List<String> exts = dtos.stream().map(EmailZipFileDTO::getExtName).distinct().toList();
  361. if (exts.contains(Constants.FILE_PDF) && exts.size() > 1) {
  362. dtos.removeIf(e -> !Objects.equals(Constants.FILE_PDF, e.getExtName()));
  363. }
  364. // 移除逻辑
  365. Iterator<EmailZipFileDTO> removeIterator = dtos.iterator();
  366. while (removeIterator.hasNext()) {
  367. EmailZipFileDTO dto = removeIterator.next();
  368. String filename = dto.getFilename();
  369. // 删除复核函或基金合同
  370. if (filename.contains("复核函") || (filename.contains("基金合同") && !filename.contains("合同变更"))) {
  371. log.warn("邮件{} 中的报告{} 是复核函或基金合同,不用解析上传。", emailTitle, filename);
  372. removeIterator.remove();
  373. }
  374. // 不支持的类型
  375. Integer type = dto.getEmailType();
  376. if (!EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(type)) {
  377. log.info("邮件{} 类型{} 不支持解析。", emailTitle, type);
  378. removeIterator.remove();
  379. }
  380. }
  381. // 数据库已存在的数据过滤(邮件主题+报告名称+附件大小,压缩包文件大小汇总)
  382. long totalSize = dtos.stream().map(EmailZipFileDTO::getFileSize).reduce(0L, Long::sum);
  383. Iterator<EmailZipFileDTO> iterator = dtos.iterator();
  384. while (iterator.hasNext()) {
  385. EmailZipFileDTO dto = iterator.next();
  386. String filename = dto.getFilename();
  387. Integer type = dto.getEmailType();
  388. int count = 0;
  389. if (Objects.equals(type, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) {
  390. // 确认单
  391. count = this.emailFileInfoMapper.getLetterFilenameSuccessCount(emailTitle, filename);
  392. } else if (Objects.equals(type, EmailTypeConst.REPORT_EMAIL_TYPE)) {
  393. // 定期报告
  394. count = this.emailFileInfoMapper.getAmacFilenameSuccessCount(emailTitle, filename, totalSize);
  395. } else if (Objects.equals(type, EmailTypeConst.REPORT_WEEKLY_TYPE)) {
  396. // 管理人周报
  397. count = this.emailFileInfoMapper.getWeeklyFilenameSuccessCount(emailTitle, filename, totalSize);
  398. } else if (Objects.equals(type, EmailTypeConst.REPORT_OTHER_TYPE)) {
  399. // 其他报告
  400. count = this.emailFileInfoMapper.getOtherFilenameSuccessCount(emailTitle, filename, totalSize);
  401. }
  402. if (count > 0) {
  403. iterator.remove();
  404. log.info("邮件{} 报告{} 已存在解析成功的记录,不用重新解析。", emailTitle, filename);
  405. }
  406. }
  407. if (CollUtil.isEmpty(dtos)) {
  408. log.info("邮件{} 所有文件都已经解析成功过,不能重复解析了", emailTitle);
  409. return null;
  410. }
  411. if (log.isInfoEnabled()) {
  412. log.info("邮件{} 还有报告待解析:\n{}", emailTitle, dtos);
  413. }
  414. return totalSize;
  415. }
  416. /**
  417. * 邮件信息保存+附件解析
  418. *
  419. * @param emailId 邮件ID,上传解析时一定是-1
  420. * @param emailKey 邮件uuid(邮箱下载解析时)
  421. * @param emailAddress 接收人地址(邮箱下载解析时)
  422. * @param totalSize 所有附件大小汇总
  423. * @param emailInfo 邮件信息,包含附件
  424. * @param resultList 解析结果
  425. * @return 邮件数据ID
  426. */
  427. private Integer parseResults(Integer emailId,
  428. String emailKey,
  429. String emailAddress,
  430. long totalSize,
  431. EmailInfoDTO emailInfo,
  432. List<ParseResult<ReportData>> resultList) {
  433. String emailTitle = emailInfo.getEmailTitle();
  434. List<EmailZipFileDTO> dtos = emailInfo.getEmailFileList();
  435. if (emailId == null) {
  436. // 保存邮件信息
  437. Integer emailType = dtos.get(0).getEmailType();
  438. EmailParseInfoDO emailParseInfoDO = this.buildEmailParseInfo(emailAddress, emailType, emailInfo, totalSize);
  439. emailParseInfoDO.setEmailKey(emailKey);
  440. emailId = this.saveEmailParseInfo(emailParseInfoDO);
  441. }
  442. // 解析邮件报告
  443. for (EmailZipFileDTO zipFile : dtos) {
  444. EmailFileInfoDO emailFile = this.saveEmailFileInfo(emailId, zipFile.getFilename(), zipFile.getFilepath());
  445. // 解析并保存报告
  446. ParseResult<ReportData> parseResult = this.parseReportAndHandleResult(emailTitle, emailFile.getId(), zipFile);
  447. if (!Objects.equals(1, parseResult.getStatus())) {
  448. log.error(parseResult.getMsg());
  449. }
  450. if (parseResult.getData() == null) {
  451. parseResult.setData(new ReportData.DefaultReportData());
  452. }
  453. parseResult.getData().setReportPath(zipFile.getFilepath());
  454. resultList.add(parseResult);
  455. }
  456. return emailId;
  457. }
  458. /**
  459. * 解析报告并保存解析结果
  460. *
  461. * @param emailTitle 邮件主题
  462. * @param fileId 当前文件数据库ID
  463. * @param zipFile 当前报告的路径信息
  464. * @return /
  465. */
  466. private ParseResult<ReportData> parseReportAndHandleResult(String emailTitle,
  467. Integer fileId,
  468. EmailZipFileDTO zipFile) {
  469. Integer emailType = zipFile.getEmailType();
  470. String fileName = zipFile.getFilename();
  471. String filepath = zipFile.getFilepath();
  472. ParseResult<ReportData> result = new ParseResult<>();
  473. boolean reportFlag = emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType);
  474. if (reportFlag || StrUtil.isBlank(fileName) || fileName.endsWith(Constants.FILE_HTML)) {
  475. return new ParseResult<>(ReportParseStatus.NOT_A_REPORT, null, fileName);
  476. }
  477. // 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报
  478. ReportType reportType = ReportParseUtils.matchReportType(emailType, fileName);
  479. if (reportType == null) {
  480. reportType = ReportParseUtils.matchReportType(emailType, emailTitle);
  481. if (log.isDebugEnabled()) {
  482. log.debug("报告{} 根据邮件主题{} 重新识别的类型是:{}", fileName, emailTitle, reportType);
  483. }
  484. }
  485. // 解析器--根据文件后缀获取对应解析器,解析不了就用AI来解析
  486. ReportParserFileType fileType = ReportParserFileType.getBySuffix(zipFile.getExtName());
  487. // 不支持的格式
  488. if (fileType == null) {
  489. return new ParseResult<>(ReportParseStatus.NO_SUPPORT_TEMPLATE, null, fileName);
  490. }
  491. // 不是定期报告的判断逻辑放在不支持的格式下面
  492. if (reportType == null) {
  493. return new ParseResult<>(ReportParseStatus.NOT_A_REPORT, null, fileName);
  494. }
  495. // // docx转pdf
  496. // if (Objects.equals(ReportParserFileType.WORD, fileType)) {
  497. // try {
  498. // String outputFile = FileUtil.getParent(filepath, 1) + File.separator + FileUtil.mainName(fileName) + ".pdf";
  499. // PdfUtil.convertDocxToPdf(filepath, outputFile);
  500. // filepath = outputFile;
  501. // } catch (Exception e) {
  502. // log.warn("报告{} 转换为pdf失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
  503. // }
  504. // }
  505. // 首页和尾页转为png图片,首页用来识别基金名称和基金代码、尾页用来识别印章和联系人
  506. List<String> images = ListUtil.list(true);
  507. if (Objects.equals(ReportParserFileType.PDF, fileType)) {
  508. try {
  509. String output = filepath.replaceAll("archive|original", "image");
  510. File outputFile = FileUtil.file(FileUtil.getParent(output, 1));
  511. images = PdfUtil.convertFirstAndLastPagesToPng(filepath, outputFile, 300);
  512. if (log.isDebugEnabled()) {
  513. log.debug("报告{} 生成的图片地址是:\n{}", fileName, images);
  514. }
  515. } catch (Exception e) {
  516. log.warn("报告{} 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
  517. }
  518. // } else if (Objects.equals(ReportParserFileType.IMG, fileType)) {
  519. // try {
  520. // String outputFile = PdfUtil.compressAndSave(filepath);
  521. // images.add(outputFile);
  522. // } catch (IOException e) {
  523. // log.error("报告{} 图片压缩失败,{}", fileName, ExceptionUtil.stacktraceToString(e));
  524. // }
  525. }
  526. // ocr识别月报是否管理人版或协会版
  527. ReportMonthlyType monthlyType = ReportMonthlyType.NO_NEED;
  528. // if (ReportType.MONTHLY == reportType) {
  529. // monthlyType = this.determineReportType(emailTitle, fileName, filepath, images);
  530. // }
  531. // boolean isAmac = reportType == ReportType.ANNUALLY || reportType == ReportType.QUARTERLY
  532. // || (reportType == ReportType.MONTHLY && ReportMonthlyType.AMAC == monthlyType);
  533. boolean isAmac = false;
  534. // 不支持解析的格式文件
  535. boolean notSupportFile = false;
  536. // 解析报告
  537. ReportData reportData = null;
  538. ReportParserParams params = new ReportParserParams(fileId, fileName, filepath, reportType);
  539. long start = System.currentTimeMillis();
  540. try {
  541. if (isAmac || reportType == ReportType.LETTER) {
  542. ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, fileType);
  543. reportData = instance.parse(params);
  544. result = new ParseResult<>(1, "报告解析成功", reportData);
  545. }
  546. } catch (ReportParseException e) {
  547. result = new ParseResult<>(e.getCode(), StrUtil.format(e.getMsg(), fileName), null);
  548. log.warn("解析失败:{}", result.getMsg());
  549. if (e instanceof NotSupportReportException) {
  550. notSupportFile = true;
  551. }
  552. } catch (Exception e) {
  553. log.warn("解析错误:{}", ExceptionUtil.stacktraceToString(e));
  554. result = new ParseResult<>(ReportParseStatus.PARSE_FAIL, null, e.getMessage());
  555. } finally {
  556. // 如果解析结果是空的就用AI工具解析一次
  557. if (reportData == null && !notSupportFile) {
  558. if (log.isInfoEnabled()) {
  559. log.info("报告{} 是周报或管理人月报或其他类型或解析失败,用AI解析器解析", fileName);
  560. }
  561. try {
  562. if (!isAmac && CollUtil.isNotEmpty(images)) {
  563. filepath = images.get(0);
  564. }
  565. params = new ReportParserParams(fileId, fileName, filepath, reportType);
  566. ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, ReportParserFileType.AI);
  567. reportData = instance.parse(params);
  568. result = new ParseResult<>(1, "报告解析成功--AI", reportData);
  569. } catch (ReportParseException e) {
  570. result = new ParseResult<>(e.getCode(), StrUtil.format(e.getMsg(), fileName), null);
  571. log.warn("AI解析失败:{}", result.getMsg());
  572. } catch (Exception e) {
  573. log.warn("AI解析错误:{}", ExceptionUtil.stacktraceToString(e));
  574. result = new ParseResult<>(ReportParseStatus.PARSE_FAIL, null, e.getMessage());
  575. }
  576. }
  577. if (log.isInfoEnabled()) {
  578. log.info("报告{} 用ocr补充解析结果。补充前的结果是:\n{}", fileName, reportData);
  579. }
  580. // // ocr信息提取(印章、联系人、基金名称和产品代码)
  581. // this.ocrReportData(reportType, reportData, fileName, images);
  582. // 设置月报类型
  583. if (reportData != null && reportData.getBaseInfo() != null) {
  584. reportData.getBaseInfo().setMonthlyType(monthlyType.getType());
  585. }
  586. if (log.isInfoEnabled()) {
  587. log.info("报告{} 解析耗时{}ms,结果是:\n{}", fileName, (System.currentTimeMillis() - start), reportData);
  588. }
  589. }
  590. // 保存报告解析结果
  591. this.saveReportData(reportData, reportType, fileName);
  592. return result;
  593. }
  594. // /**
  595. // * 判断月报类型(管理人版还是协会版)
  596. // *
  597. // * @param emailTitle 邮件主题
  598. // * @param fileName 报告名称
  599. // * @param filepath 报告路径
  600. // * @param images 报告的第一页和尾页图片地址(主要用于ocr提取关键信息)
  601. // */
  602. // public ReportMonthlyType determineReportType(String emailTitle, String fileName,
  603. // String filepath, List<String> images) {
  604. // // 1. 优先根据文件名判断
  605. // if (ReportParseUtils.containsAny(fileName, AMAC_KEYWORDS)) {
  606. // return ReportMonthlyType.AMAC;
  607. // }
  608. // if (ReportParseUtils.containsAny(fileName, ReportParseUtils.MANAGER_KEYWORDS)) {
  609. // return ReportMonthlyType.MANAGER;
  610. // }
  611. //// if (StrUtil.isNotBlank(ReportParseUtils.matchFundCode(fileName))) {
  612. //// return ReportMonthlyType.AMAC;
  613. //// }
  614. // // 2. 根据文件路径判断
  615. // List<String> pathSegments = StrUtil.split(filepath, File.separator);
  616. // for (String segment : pathSegments) {
  617. // boolean isExcluded = ReportParseUtils.containsAny(segment, EXCLUDE_PATH_KEYWORDS);
  618. // if (!isExcluded && ReportParseUtils.containsAny(segment, AMAC_KEYWORDS)) {
  619. // return ReportMonthlyType.AMAC;
  620. // }
  621. // if (!isExcluded && ReportParseUtils.containsAny(segment, ReportParseUtils.MANAGER_KEYWORDS)) {
  622. // return ReportMonthlyType.MANAGER;
  623. // }
  624. // }
  625. // // 3. 根据邮件主题判断
  626. // boolean isAmacEmail = ReportParseUtils.containsAny(emailTitle, AMAC_KEYWORDS)
  627. // && !emailTitle.contains("公司及协会版");
  628. // if (isAmacEmail) {
  629. // return ReportMonthlyType.AMAC;
  630. // }
  631. // if (ReportParseUtils.containsAny(emailTitle, ReportParseUtils.MANAGER_KEYWORDS)) {
  632. // return ReportMonthlyType.MANAGER;
  633. // }
  634. // // 4.ocr 提取“曲线”、“基金份额”等关键字,如果有曲线则是管理人,如果有估值日期则是协会
  635. // if (CollUtil.isNotEmpty(images)) {
  636. // try {
  637. // return new OCRReportParser().parseMonthlyType(fileName, this.ocrParserUrl, images.get(0));
  638. // } catch (Exception ignored) {
  639. // return ReportMonthlyType.FAILED;
  640. // }
  641. // }
  642. // return ReportMonthlyType.FAILED;
  643. // }
  644. /**
  645. * ocr 提取信息(包括首页的基金名称或报告日期,尾页的印章或联系人等信息)
  646. *
  647. * @param reportData 报告解析结果
  648. * @param fileName 报告名称
  649. * @param images 报告的收益和尾页png图片
  650. */
  651. private void ocrReportData(ReportType reportType,
  652. ReportData reportData,
  653. String fileName,
  654. List<String> images) {
  655. if (reportData == null || CollUtil.isEmpty(images)) {
  656. return;
  657. }
  658. // 报告才识别尾页的印章和联系人,确认单不识别尾页
  659. if (ReportType.LETTER != reportType) {
  660. OCRParseData parseRes = null;
  661. try {
  662. // 首页和尾页相等时只读首页
  663. String imageUrl = images.size() == 1 ? images.get(0) : images.get(1);
  664. parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, imageUrl);
  665. } catch (Exception e) {
  666. log.error("报告{} OCR识别印章和联系人出错:{}", fileName, e.getMessage());
  667. }
  668. // ocr识别尾页是否包含印章和联系人信息
  669. if (parseRes != null) {
  670. if (reportData.getBaseInfo() != null) {
  671. reportData.getBaseInfo().setWithSeals(parseRes.getWithSeals());
  672. reportData.getBaseInfo().setWithContacts(parseRes.getWithContacts());
  673. if (fileName.contains("用印") && !Objects.equals(true, reportData.getBaseInfo().getWithSeals())) {
  674. reportData.getBaseInfo().setWithSeals(true);
  675. }
  676. }
  677. }
  678. // 首页和尾页不相等时解析首页的数据
  679. if (images.size() != 1) {
  680. try {
  681. parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0));
  682. } catch (Exception e) {
  683. log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, e.getMessage());
  684. }
  685. }
  686. // 用首页识别基金名称、产品代码和基金管理人
  687. if (reportData.getFundInfo() != null && parseRes != null) {
  688. if (StrUtil.isBlank(reportData.getFundInfo().getFundName())) {
  689. reportData.getFundInfo().setFundName(parseRes.getFundName());
  690. }
  691. if (StrUtil.isBlank(reportData.getFundInfo().getFundCode())) {
  692. reportData.getFundInfo().setFundCode(parseRes.getFundCode());
  693. }
  694. if (StrUtil.isBlank(reportData.getFundInfo().getCompanyName())
  695. || !reportData.getFundInfo().getCompanyName().contains("有限公司")) {
  696. reportData.getFundInfo().setCompanyName(parseRes.getCompanyName());
  697. }
  698. }
  699. } else {
  700. // 确认单AI解析失败时重新用OCR识别
  701. LetterReportData letterReportData = (LetterReportData) reportData;
  702. if (letterReportData.wasFailed()) {
  703. OCRLetterParseData parseRes = null;
  704. try {
  705. parseRes = new OCRReportParser().parseLetterData(fileName, this.ocrParserUrl, images.get(0));
  706. } catch (Exception e) {
  707. log.error("报告{} OCR提取确认单关键信息出错:{}", fileName, e.getMessage());
  708. }
  709. if (parseRes == null) {
  710. return;
  711. }
  712. if (letterReportData.getFundInfo() != null) {
  713. letterReportData.getFundInfo().setFundName(parseRes.getFundName());
  714. letterReportData.getFundInfo().setFundCode(parseRes.getFundCode());
  715. }
  716. if (letterReportData.getInvestorInfo() == null) {
  717. letterReportData.setInvestorInfo(new ReportInvestorInfoDTO());
  718. }
  719. letterReportData.getInvestorInfo().setInvestorName(parseRes.getInvestorName());
  720. letterReportData.getInvestorInfo().setCertificateNumber(parseRes.getCertificateNumber());
  721. letterReportData.getInvestorInfo().setTradingAccount(parseRes.getTradingAccount());
  722. letterReportData.getInvestorInfo().setFundAccount(parseRes.getFundAccount());
  723. letterReportData.getInvestorInfo().setCertificateType(parseRes.getCertificateType());
  724. if (letterReportData.getFundTransaction() == null) {
  725. letterReportData.setFundTransaction(new ReportFundTransactionDTO());
  726. }
  727. letterReportData.getFundTransaction().setTransactionType(parseRes.getTransactionType());
  728. letterReportData.getFundTransaction().setApplyDate(parseRes.getApplyDate());
  729. letterReportData.getFundTransaction().setApplyShare(parseRes.getApplyShare());
  730. letterReportData.getFundTransaction().setApplyAmount(parseRes.getApplyAmount());
  731. letterReportData.getFundTransaction().setHoldingDate(parseRes.getHoldingDate());
  732. letterReportData.getFundTransaction().setAmount(parseRes.getAmount());
  733. letterReportData.getFundTransaction().setShare(parseRes.getShare());
  734. letterReportData.getFundTransaction().setNav(parseRes.getNav());
  735. }
  736. }
  737. }
  738. /**
  739. * 保存报告解析结果
  740. *
  741. * @param reportData 报告解析结果
  742. * @param reportType 报告类型
  743. * @param fileName 报告名称
  744. */
  745. private void saveReportData(ReportData reportData, ReportType reportType, String fileName) {
  746. if (reportData == null) {
  747. return;
  748. }
  749. StopWatch writeWatch = new StopWatch();
  750. writeWatch.start();
  751. try {
  752. ReportWriter<ReportData> instance = this.reportWriterFactory.getInstance(reportType);
  753. instance.write(reportData);
  754. } catch (Exception e) {
  755. log.error("报告{} 结果保存失败 {}", fileName, ExceptionUtil.stacktraceToString(e));
  756. } finally {
  757. writeWatch.stop();
  758. if (log.isInfoEnabled()) {
  759. log.info("报告{}解析结果保存完成,耗时{}ms", fileName, writeWatch.getTotalTimeMillis());
  760. }
  761. }
  762. }
  763. private EmailFileInfoDO saveEmailFileInfo(Integer emailId, String fileName, String filePath) {
  764. EmailFileInfoDO emailFileInfoDO = buildEmailFileInfoDO(emailId, fileName, filePath);
  765. emailFileInfoDO.setAiFileId(null);
  766. if (emailFileInfoDO.getId() != null) {
  767. emailFileInfoMapper.updateTimeById(null, new Date());
  768. return emailFileInfoDO;
  769. }
  770. emailFileInfoMapper.insert(emailFileInfoDO);
  771. return emailFileInfoDO;
  772. }
  773. private EmailFileInfoDO buildEmailFileInfoDO(Integer emailId, String fileName, String filePath) {
  774. EmailFileInfoDO emailFileInfoDO = new EmailFileInfoDO();
  775. emailFileInfoDO.setId(null);
  776. emailFileInfoDO.setEmailId(emailId);
  777. emailFileInfoDO.setFileName(fileName);
  778. emailFileInfoDO.setFilePath(filePath);
  779. emailFileInfoDO.setIsvalid(1);
  780. emailFileInfoDO.setCreatorId(0);
  781. emailFileInfoDO.setCreateTime(new Date());
  782. emailFileInfoDO.setUpdaterId(0);
  783. emailFileInfoDO.setUpdateTime(new Date());
  784. return emailFileInfoDO;
  785. }
  786. private Integer saveEmailParseInfo(EmailParseInfoDO emailParseInfoDO) {
  787. if (emailParseInfoDO == null) {
  788. return null;
  789. }
  790. // 重新邮件功能 -> 修改解析时间和更新时间
  791. if (emailParseInfoDO.getId() != null) {
  792. emailParseInfoMapper.updateParseTime(emailParseInfoDO.getId(), emailParseInfoDO.getParseDate());
  793. return emailParseInfoDO.getId();
  794. }
  795. emailParseInfoMapper.insert(emailParseInfoDO);
  796. return emailParseInfoDO.getId();
  797. }
  798. private EmailParseInfoDO buildEmailParseInfo(String emailAddress, Integer emailType,
  799. EmailInfoDTO emailInfo, long totalSize) {
  800. EmailParseInfoDO emailParseInfoDO = new EmailParseInfoDO();
  801. emailParseInfoDO.setId(null);
  802. emailParseInfoDO.setSenderEmail(emailInfo.getSenderEmail());
  803. emailParseInfoDO.setEmail(emailAddress);
  804. emailParseInfoDO.setEmailDate(DateUtil.parse(emailInfo.getEmailDate(), DateConst.YYYY_MM_DD_HH_MM_SS));
  805. emailParseInfoDO.setParseDate(new Date());
  806. emailParseInfoDO.setEmailTitle(emailInfo.getEmailTitle());
  807. emailParseInfoDO.setEmailType(emailType);
  808. emailParseInfoDO.setParseStatus(EmailParseStatusConst.SUCCESS);
  809. emailParseInfoDO.setAttrSize(totalSize);
  810. emailParseInfoDO.setIsvalid(1);
  811. emailParseInfoDO.setCreatorId(0);
  812. emailParseInfoDO.setCreateTime(new Date());
  813. emailParseInfoDO.setUpdaterId(0);
  814. emailParseInfoDO.setUpdateTime(new Date());
  815. return emailParseInfoDO;
  816. }
  817. /**
  818. * 读取邮件
  819. *
  820. * @param mailboxInfoDTO 邮箱配置信息
  821. * @param startDate 邮件起始日期
  822. * @param endDate 邮件截止日期(为null,将解析邮件日期小于等于startDate的当天邮件)
  823. * @return 读取到的邮件信息
  824. * @throws Exception 异常信息
  825. */
  826. private Map<String, List<EmailContentInfoDTO>> realEmail(MailboxInfoDTO mailboxInfoDTO,
  827. Date startDate, Date endDate,
  828. List<String> folderNames) throws Exception {
  829. if (CollUtil.isEmpty(folderNames)) {
  830. folderNames = ListUtil.toList("INBOX");
  831. }
  832. Store store = EmailUtil.getStoreNew(mailboxInfoDTO);
  833. if (store == null) {
  834. return MapUtil.newHashMap(4);
  835. }
  836. Map<String, List<EmailContentInfoDTO>> result = MapUtil.newHashMap(128);
  837. try {
  838. if (log.isDebugEnabled()) {
  839. Folder[] list = store.getDefaultFolder().list("*");
  840. List<String> names = Arrays.stream(list).map(Folder::getFullName).toList();
  841. log.debug("获取所有邮箱文件夹:{}", names);
  842. }
  843. for (String folderName : folderNames) {
  844. try {
  845. Map<String, List<EmailContentInfoDTO>> temp = this.getFolderEmail(mailboxInfoDTO,
  846. startDate, endDate, store, folderName);
  847. if (MapUtil.isNotEmpty(temp)) {
  848. result.putAll(temp);
  849. }
  850. } catch (Exception e) {
  851. log.warn("文件夹{} 邮件获取失败:{}", folderName, ExceptionUtil.stacktraceToString(e));
  852. }
  853. }
  854. } catch (Exception e) {
  855. log.error("邮件获取失败:{}", ExceptionUtil.stacktraceToString(e));
  856. } finally {
  857. store.close();
  858. }
  859. return result;
  860. }
  861. private Map<String, List<EmailContentInfoDTO>> getFolderEmail(MailboxInfoDTO mailboxInfoDTO,
  862. Date startDate, Date endDate,
  863. Store store, String folderName) throws MessagingException {
  864. // 默认读取收件箱的邮件
  865. Folder folder = store.getFolder(folderName);
  866. folder.open(this.readWriteSeen ? Folder.READ_WRITE : Folder.READ_ONLY);
  867. Message[] messages = getEmailMessage(folder, mailboxInfoDTO.getProtocol(), startDate);
  868. if (messages == null || messages.length == 0) {
  869. log.warn("{} 获取不到邮件 -> 邮箱信息:{},开始时间:{},结束时间:{}", folderName, mailboxInfoDTO, startDate, endDate);
  870. return MapUtil.newHashMap();
  871. }
  872. String emailAddress = mailboxInfoDTO.getAccount();
  873. Map<String, List<EmailContentInfoDTO>> emailMessageMap = MapUtil.newHashMap();
  874. for (Message message : messages) {
  875. long start = System.currentTimeMillis();
  876. List<EmailContentInfoDTO> dtos = CollUtil.newArrayList();
  877. String emailTitle = message.getSubject();
  878. if (this.readWriteSeen && isMessageRead(message)) {
  879. log.warn("{} 邮件{} 已读,不用重新下载解析!", folderName, emailTitle);
  880. continue;
  881. }
  882. try {
  883. Date emailDate = message.getSentDate();
  884. String emailDateStr = DateUtil.format(emailDate, DateConst.YYYY_MM_DD_HH_MM_SS);
  885. if (log.isInfoEnabled()) {
  886. log.info("{} 邮件{} 数据获取中,邮件时间:{}", folderName, emailTitle, emailDateStr);
  887. }
  888. boolean isNotParseConditionSatisfied = emailDate == null
  889. || (endDate != null && emailDate.compareTo(endDate) > 0)
  890. || (startDate != null && emailDate.compareTo(startDate) < 0);
  891. if (isNotParseConditionSatisfied) {
  892. String st = DateUtil.formatDateTime(startDate);
  893. String ed = DateUtil.formatDateTime(endDate);
  894. log.warn("{} 邮件{} 发送时间{}不在区间内【{} ~ {}】", folderName, emailTitle, emailDateStr, st, ed);
  895. continue;
  896. }
  897. String senderEmail = getSenderEmail(message);
  898. Integer emailType = EmailUtil.getEmailTypeBySubject(emailTitle);
  899. if (emailType == null) {
  900. log.warn("{} 邮件不满足解析条件 -> 邮件主题:{},邮件日期:{}", folderName, emailTitle, emailDateStr);
  901. continue;
  902. }
  903. // // 成功解析的邮件不用重复下载
  904. // Integer okNum = this.emailParseInfoMapper.countEmailByInfoAndStatus(emailTitle, senderEmail, emailAddress, emailDateStr);
  905. // if (okNum > 0) {
  906. // if (log.isInfoEnabled()) {
  907. // log.info("{} 邮件{} 已经存在解析完成的记录,不要重复下载了。", folderName, emailTitle);
  908. // }
  909. // continue;
  910. // }
  911. if (log.isInfoEnabled()) {
  912. log.info("{} 邮件{} 基本信息获取完成,开始下载附件!邮件日期:{}", folderName, emailTitle, emailDateStr);
  913. }
  914. Object content = message.getContent();
  915. if (content instanceof Multipart multipart) {
  916. this.reMultipart(emailAddress, emailTitle, emailDate, multipart, dtos);
  917. } else if (content instanceof Part part) {
  918. this.rePart(emailAddress, emailTitle, emailDate, part, dtos);
  919. } else {
  920. log.warn("{} 邮件{} 获取不了附件", folderName, emailTitle);
  921. }
  922. if (CollUtil.isEmpty(dtos)) {
  923. log.warn("{} 邮件{} 没有获取到附件", folderName, emailTitle);
  924. continue;
  925. }
  926. dtos.forEach(e -> {
  927. e.setEmailType(emailType);
  928. e.setSenderEmail(senderEmail);
  929. });
  930. emailMessageMap.put(IdUtil.simpleUUID(), dtos);
  931. } catch (Exception e) {
  932. log.error("{} 邮件{} 下载报错 {}", folderName, emailTitle, ExceptionUtil.stacktraceToString(e));
  933. } finally {
  934. if (CollUtil.isNotEmpty(dtos) && log.isInfoEnabled()) {
  935. log.info("{} 邮件{} 下载完成,总计耗时{} ms,文件内容如下\n {}", folderName,
  936. emailTitle, System.currentTimeMillis() - start, dtos);
  937. }
  938. }
  939. }
  940. if (this.readWriteSeen) {
  941. // 设置已读标志
  942. folder.setFlags(messages, new Flags(Flags.Flag.SEEN), true);
  943. }
  944. folder.close(false);
  945. return emailMessageMap;
  946. }
  947. private void rePart(String account, String subject, Date sendDate, Part part,
  948. List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
  949. String fileName = EmailUtil.decodeFileName(part);
  950. if (StrUtil.isBlank(fileName)) {
  951. return;
  952. }
  953. if (fileName.contains("\"") || fileName.contains("\n")) {
  954. fileName = fileName.replaceAll("\"", "").replaceAll("\n", "");
  955. }
  956. if (fileName.contains("=?")) {
  957. fileName = MimeUtility.decodeText(fileName);
  958. }
  959. String disposition = part.getDisposition();
  960. String contentType = part.getContentType();
  961. String[] att_files = new String[]{Constants.ARCHIVE_7Z, Constants.ARCHIVE_RAR, Constants.ARCHIVE_ZIP,
  962. Constants.FILE_PDF, Constants.FILE_DOCX, Constants.FILE_JPG, Constants.FILE_PNG};
  963. boolean attachmentFlag = StrUtil.endWithAny(fileName, att_files);
  964. boolean isAttachment = attachmentFlag
  965. || Part.ATTACHMENT.equalsIgnoreCase(disposition)
  966. || (contentType != null && attachmentMimePrefixes.stream().anyMatch(prefix ->
  967. StrUtil.startWithIgnoreCase(contentType, prefix)
  968. ));
  969. if (!isAttachment) {
  970. log.warn("邮件{} 未检测到{}类型的附件 (fileName={}, disposition={}, contentType={})",
  971. subject, att_files, fileName, disposition, contentType);
  972. return;
  973. }
  974. File saveFile = this.generateSavePath(account, sendDate, fileName);
  975. if (!saveFile.exists()) {
  976. if (!saveFile.getParentFile().exists()) {
  977. boolean mkdirs = saveFile.getParentFile().mkdirs();
  978. if (!mkdirs) {
  979. log.warn("file path mkdir failed.");
  980. }
  981. }
  982. try (InputStream is = part.getInputStream()) {
  983. Files.copy(is, saveFile.toPath());
  984. }
  985. } else {
  986. if (log.isInfoEnabled()) {
  987. log.info("邮件{} 已下载过附件:{},不用重新下载了。", subject, saveFile.toPath());
  988. }
  989. }
  990. EmailContentInfoDTO emailContentInfoDTO = new EmailContentInfoDTO();
  991. emailContentInfoDTO.setFileName(fileName);
  992. emailContentInfoDTO.setFileSize(part.getSize());
  993. emailContentInfoDTO.setFilePath(saveFile.getAbsolutePath());
  994. emailContentInfoDTO.setEmailAddress(account);
  995. emailContentInfoDTO.setEmailTitle(subject);
  996. emailContentInfoDTO.setEmailDate(DateUtil.format(sendDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  997. emailContentInfoDTOList.add(emailContentInfoDTO);
  998. }
  999. public File generateSavePath(String account, Date sendDate, String fileName) {
  1000. String emailDateStr = DateUtil.format(sendDate, DateConst.YYYYMMDD);
  1001. String filePath = this.path + File.separator + account + File.separator +
  1002. emailDateStr + File.separator + "original" + File.separator;
  1003. // 压缩包重名时的后面的压缩包会覆盖前面压缩包的问题(不考虑普通文件)
  1004. String emailDate = DateUtil.format(sendDate, DateConst.YYYYMMDDHHMMSS24);
  1005. String realName = ArchiveUtil.isArchive(fileName) ? emailDate + fileName : fileName;
  1006. return FileUtil.file(filePath + realName);
  1007. }
  1008. private void reMultipart(String account, String subject, Date emailDate, Multipart multipart,
  1009. List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
  1010. for (int i = 0; i < multipart.getCount(); i++) {
  1011. Part bodyPart = multipart.getBodyPart(i);
  1012. Object content = bodyPart.getContent();
  1013. if (content instanceof String) {
  1014. if (log.isDebugEnabled()) {
  1015. log.debug("邮件{} 获取的正文不做解析,内容是 {}", subject, content);
  1016. }
  1017. continue;
  1018. }
  1019. if (content instanceof Multipart mp) {
  1020. this.reMultipart(account, subject, emailDate, mp, emailContentInfoDTOList);
  1021. } else {
  1022. this.rePart(account, subject, emailDate, bodyPart, emailContentInfoDTOList);
  1023. }
  1024. }
  1025. }
  1026. private String getSenderEmail(Message message) {
  1027. Address[] senderAddress;
  1028. try {
  1029. senderAddress = message.getFrom();
  1030. if (senderAddress == null || senderAddress.length == 0) {
  1031. return null;
  1032. }
  1033. // 此时的address是含有编码(MIME编码方式)后的文本和实际的邮件地址
  1034. String address = "";
  1035. for (Address from : senderAddress) {
  1036. if (StrUtil.isNotBlank(from.toString())) {
  1037. address = from.toString();
  1038. break;
  1039. }
  1040. }
  1041. // 正则表达式匹配邮件地址
  1042. Pattern pattern = Pattern.compile("<(\\S+)>");
  1043. Matcher matcher = pattern.matcher(address);
  1044. if (matcher.find()) {
  1045. return matcher.group(1);
  1046. }
  1047. } catch (MessagingException e) {
  1048. log.error(e.getMessage(), e);
  1049. }
  1050. return null;
  1051. }
  1052. private Message[] getEmailMessage(Folder folder, String protocol, Date startDate) {
  1053. try {
  1054. if (protocol.contains("imap")) {
  1055. // 获取邮件日期大于等于startDate的邮件(搜索条件只支持按天)
  1056. SearchTerm startDateTerm = new ReceivedDateTerm(ComparisonTerm.GE, startDate);
  1057. return folder.search(startDateTerm);
  1058. } else {
  1059. return folder.getMessages();
  1060. }
  1061. } catch (MessagingException e) {
  1062. throw new RuntimeException(e);
  1063. }
  1064. }
  1065. /**
  1066. * 检查邮件是否已读
  1067. *
  1068. * @param message 邮件对象
  1069. * @return true表示已读,false表示未读
  1070. * @throws MessagingException 如果访问邮件标志时出错
  1071. */
  1072. private boolean isMessageRead(Message message) throws MessagingException {
  1073. // 获取邮件的所有标志
  1074. Flags flags = message.getFlags();
  1075. // 检查是否包含 SEEN 标志
  1076. return flags.contains(Flags.Flag.SEEN);
  1077. }
  1078. }