EmailParseService.java 55 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131
  1. package com.smppw.modaq.domain.service;
  2. import cn.hutool.core.collection.CollUtil;
  3. import cn.hutool.core.collection.ListUtil;
  4. import cn.hutool.core.date.DateUtil;
  5. import cn.hutool.core.exceptions.ExceptionUtil;
  6. import cn.hutool.core.io.FileUtil;
  7. import cn.hutool.core.map.MapUtil;
  8. import cn.hutool.core.util.IdUtil;
  9. import cn.hutool.core.util.StrUtil;
  10. import com.smppw.modaq.application.components.OCRReportParser;
  11. import com.smppw.modaq.application.components.ReportParseUtils;
  12. import com.smppw.modaq.application.components.report.parser.ReportParser;
  13. import com.smppw.modaq.application.components.report.parser.ReportParserFactory;
  14. import com.smppw.modaq.application.components.report.writer.ReportWriter;
  15. import com.smppw.modaq.application.components.report.writer.ReportWriterFactory;
  16. import com.smppw.modaq.application.util.EmailUtil;
  17. import com.smppw.modaq.common.conts.Constants;
  18. import com.smppw.modaq.common.conts.DateConst;
  19. import com.smppw.modaq.common.conts.EmailParseStatusConst;
  20. import com.smppw.modaq.common.conts.EmailTypeConst;
  21. import com.smppw.modaq.common.enums.ReportMonthlyType;
  22. import com.smppw.modaq.common.enums.ReportParseStatus;
  23. import com.smppw.modaq.common.enums.ReportParserFileType;
  24. import com.smppw.modaq.common.enums.ReportType;
  25. import com.smppw.modaq.common.exception.NotSupportReportException;
  26. import com.smppw.modaq.common.exception.ReportParseException;
  27. import com.smppw.modaq.domain.dto.*;
  28. import com.smppw.modaq.domain.dto.report.*;
  29. import com.smppw.modaq.domain.dto.report.ocr.OCRLetterParseData;
  30. import com.smppw.modaq.domain.dto.report.ocr.OCRParseData;
  31. import com.smppw.modaq.domain.entity.EmailFileInfoDO;
  32. import com.smppw.modaq.domain.entity.EmailParseInfoDO;
  33. import com.smppw.modaq.domain.mapper.EmailFileInfoMapper;
  34. import com.smppw.modaq.domain.mapper.EmailParseInfoMapper;
  35. import com.smppw.modaq.infrastructure.util.ArchiveUtil;
  36. import com.smppw.modaq.infrastructure.util.PdfUtil;
  37. import jakarta.mail.*;
  38. import jakarta.mail.internet.MimeUtility;
  39. import jakarta.mail.search.ComparisonTerm;
  40. import jakarta.mail.search.ReceivedDateTerm;
  41. import jakarta.mail.search.SearchTerm;
  42. import org.slf4j.Logger;
  43. import org.slf4j.LoggerFactory;
  44. import org.springframework.beans.factory.annotation.Value;
  45. import org.springframework.stereotype.Service;
  46. import org.springframework.util.StopWatch;
  47. import java.io.File;
  48. import java.io.IOException;
  49. import java.io.InputStream;
  50. import java.nio.file.Files;
  51. import java.util.*;
  52. import java.util.regex.Matcher;
  53. import java.util.regex.Pattern;
  54. import java.util.stream.Collectors;
  55. /**
  56. * @author mozuwen
  57. * @date 2024-09-04
  58. * @description 邮件解析服务
  59. */
  60. @Service
  61. public class EmailParseService {
  62. // public static final int stepSize = 10000;
  63. private static final Logger log = LoggerFactory.getLogger(EmailParseService.class);
  64. // Constant definitions: keywords are managed in one place
  65. private static final Set<String> AMAC_KEYWORDS = Set.of("协会", "信披");
  66. private static final Set<String> EXCLUDE_PATH_KEYWORDS = Set.of("公司及协会版", "公司和协会版");
  67. // Extended set of supported attachment MIME types
  68. private static final Set<String> attachmentMimePrefixes = Set.of(
  69. "application/pdf",
  70. "application/zip",
  71. "application/x-zip-compressed",
  72. "application/rar",
  73. "application/x-rar-compressed",
  74. "application/octet-stream"
  75. // Add other types as needed...
  76. );
  77. private final EmailParseInfoMapper emailParseInfoMapper;
  78. private final EmailFileInfoMapper emailFileInfoMapper;
  79. /* Factories for report parsing and for persisting parsed reports */
  80. private final ReportParserFactory reportParserFactory;
  81. private final ReportWriterFactory reportWriterFactory;
  82. @Value("${email.file.path}")
  83. private String path;
  84. @Value("${email.report.ocr-parser-url}")
  85. private String ocrParserUrl;
  86. @Value("${email.read-write-seen:true}")
  87. private boolean readWriteSeen;
  /**
   * Wires the service with its persistence mappers and the report parser/writer factories.
   *
   * @param emailParseInfoMapper mapper stored in {@code emailParseInfoMapper}
   * @param emailFileInfoMapper mapper stored in {@code emailFileInfoMapper}
   * @param reportParserFactory factory stored in {@code reportParserFactory}
   * @param reportWriterFactory factory stored in {@code reportWriterFactory}
   */
  public EmailParseService(
      EmailParseInfoMapper emailParseInfoMapper,
      EmailFileInfoMapper emailFileInfoMapper,
      ReportParserFactory reportParserFactory,
      ReportWriterFactory reportWriterFactory) {
    // Factories first, then mappers; plain field assignments, so the order is irrelevant.
    this.reportParserFactory = reportParserFactory;
    this.reportWriterFactory = reportWriterFactory;
    this.emailParseInfoMapper = emailParseInfoMapper;
    this.emailFileInfoMapper = emailFileInfoMapper;
  }
  97. /**
  98. * 解析指定邮箱指定时间范围内的邮件
  99. *
  100. * @param mailboxInfoDTO 邮箱配置信息
  101. * @param startDate 邮件起始日期(yyyy-MM-dd HH:mm:ss)
  102. * @param endDate 邮件截止日期(yyyy-MM-dd HH:mm:ss, 为null,将解析邮件日期小于等于startDate的当天邮件)
  103. * @param emailTypes 当前任务支持的邮件类型,默认支持确认单
  104. */
  105. public void parseEmail(MailboxInfoDTO mailboxInfoDTO,
  106. Date startDate, Date endDate,
  107. List<String> folderNames, List<Integer> emailTypes) {
  108. if (CollUtil.isEmpty(emailTypes)) {
  109. emailTypes = ListUtil.of(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE);
  110. }
  111. if (log.isInfoEnabled()) {
  112. log.info("开始邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO, DateUtil.format(startDate,
  113. DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  114. }
  115. Map<String, List<EmailContentInfoDTO>> emailContentMap;
  116. try {
  117. emailContentMap = this.realEmail(mailboxInfoDTO, startDate, endDate, folderNames);
  118. } catch (Exception e) {
  119. log.error("采集邮件失败 -> 邮箱配置信息:{},堆栈信息:{}", mailboxInfoDTO, ExceptionUtil.stacktraceToString(e));
  120. return;
  121. }
  122. if (MapUtil.isEmpty(emailContentMap)) {
  123. log.warn("未采集到邮件 -> 邮箱配置信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO,
  124. DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  125. return;
  126. }
  127. for (Map.Entry<String, List<EmailContentInfoDTO>> emailEntry : emailContentMap.entrySet()) {
  128. List<EmailContentInfoDTO> emailContentInfoDTOList = emailEntry.getValue();
  129. if (CollUtil.isEmpty(emailContentInfoDTOList)) {
  130. log.warn("未采集到正文或附件");
  131. continue;
  132. }
  133. EmailContentInfoDTO dto = emailContentInfoDTOList.get(0);
  134. String emailTitle = dto.getEmailTitle();
  135. log.info("开始解析邮件数据 -> 邮件主题:{},邮件日期:{}", emailTitle, dto.getEmailDate());
  136. List<EmailZipFileDTO> emailFileList = ListUtil.list(false);
  137. EmailInfoDTO emailInfo = new EmailInfoDTO(dto, emailFileList);
  138. for (EmailContentInfoDTO emailDto : emailContentInfoDTOList) {
  139. // 正文不用解压附件
  140. if (emailDto.getFileName() != null && emailDto.getFileName().endsWith(Constants.FILE_HTML)) {
  141. continue;
  142. }
  143. try {
  144. emailFileList.addAll(this.parseZipEmail(emailDto));
  145. } catch (IOException e) {
  146. log.error("压缩包解压失败:{}", ExceptionUtil.stacktraceToString(e));
  147. EmailParseInfoDO fail = buildEmailParseInfo(mailboxInfoDTO.getAccount(),
  148. dto.getEmailType(), emailInfo, emailDto.getFileSize());
  149. fail.setFailReason("压缩包解压失败");
  150. fail.setParseStatus(EmailParseStatusConst.FAIL);
  151. fail.setEmailKey(emailEntry.getKey());
  152. this.emailParseInfoMapper.insert(fail);
  153. } catch (Exception e) {
  154. log.error("堆栈信息:{}", ExceptionUtil.stacktraceToString(e));
  155. }
  156. }
  157. // 重新判断类型
  158. for (EmailZipFileDTO emailFile : emailFileList) {
  159. if (EmailTypeConst.SUPPORT_NO_OTHER_TYPES.contains(emailFile.getEmailType())) {
  160. continue;
  161. }
  162. Integer type = EmailUtil.getEmailTypeBySubject(emailTitle + emailFile.getFilename());
  163. // 特殊月报
  164. if ((Objects.equals(EmailTypeConst.NAV_EMAIL_TYPE, type)
  165. || Objects.equals(EmailTypeConst.REPORT_OTHER_TYPE, type))
  166. && (ReportParseUtils.containsAny(emailTitle, ReportParseUtils.MANAGER_KEYWORDS)
  167. || emailTitle.contains("定期报告"))) {
  168. type = EmailTypeConst.REPORT_EMAIL_TYPE;
  169. }
  170. // 其他报告
  171. if (Objects.equals(EmailTypeConst.NAV_EMAIL_TYPE, type)) {
  172. type = EmailTypeConst.REPORT_OTHER_TYPE;
  173. }
  174. emailFile.setEmailType(type);
  175. }
  176. Iterator<EmailZipFileDTO> entryIterator = emailFileList.iterator();
  177. while (entryIterator.hasNext()) {
  178. EmailZipFileDTO entry = entryIterator.next();
  179. if (!emailTypes.contains(entry.getEmailType())) {
  180. log.warn("当前邮件{} 文件{} 的类型{} 不在支持的任务类型{} 中,不用执行解析逻辑。",
  181. entry.getEmailTitle(), entry.getFilepath(), entry.getEmailType(), emailTypes);
  182. entryIterator.remove();
  183. }
  184. }
  185. // 保存相关信息 -> 邮件信息表,邮件文件表,邮件净值表,邮件规模表,基金净值表
  186. saveRelatedTable(emailEntry.getKey(), mailboxInfoDTO.getAccount(), emailInfo);
  187. log.info("结束邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", emailEntry.getValue(),
  188. DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  189. }
  190. }
  191. /**
  192. * 解压压缩包,如果不是压缩包需转换
  193. *
  194. * @param emailContentInfoDTO 邮件信息
  195. * @return 解压后的文件列表
  196. * @throws IOException /
  197. */
  198. public List<EmailZipFileDTO> parseZipEmail(EmailContentInfoDTO emailContentInfoDTO) throws IOException {
  199. List<EmailZipFileDTO> resultList = ListUtil.list(false);
  200. Integer emailType = emailContentInfoDTO.getEmailType();
  201. String filepath = emailContentInfoDTO.getFilePath();
  202. String emailTitle = emailContentInfoDTO.getEmailTitle();
  203. if (ArchiveUtil.isArchive(filepath)) {
  204. this.handleCompressedFiles(emailTitle, filepath, emailType, resultList);
  205. } else {
  206. // 不是压缩包时
  207. EmailZipFileDTO dto = new EmailZipFileDTO(emailTitle, emailContentInfoDTO);
  208. resultList.add(dto);
  209. }
  210. // 文件中的类型判断
  211. if (emailType == null || !EmailTypeConst.SUPPORT_NO_OTHER_TYPES.contains(emailType)) {
  212. emailType = EmailUtil.getEmailTypeBySubject(emailContentInfoDTO.getFileName());
  213. emailContentInfoDTO.setEmailType(emailType);
  214. }
  215. if (CollUtil.isNotEmpty(resultList)) {
  216. for (EmailZipFileDTO dto : resultList) {
  217. dto.setEmailType(emailType);
  218. }
  219. }
  220. return resultList;
  221. }
  222. /**
  223. * 解压压缩包并把压缩包里面的所有文件放在resultList中
  224. *
  225. * @param emailTitle 邮件主题
  226. * @param filepath 压缩包路径
  227. * @param emailType 邮件解析类型
  228. * @param resultList 解压结果列表
  229. * @throws IOException /
  230. */
  231. private void handleCompressedFiles(String emailTitle,
  232. String filepath,
  233. Integer emailType,
  234. List<EmailZipFileDTO> resultList) throws IOException {
  235. String parent = FileUtil.getParent(filepath, 2);
  236. String destPath = parent + File.separator + "archive" + File.separator + FileUtil.mainName(filepath);
  237. File destFile = new File(destPath);
  238. if (!destFile.exists()) {
  239. if (!destFile.mkdirs()) {
  240. throw new IOException("无法创建目标目录: " + destPath);
  241. }
  242. }
  243. List<String> extractedDirs;
  244. if (ArchiveUtil.isZip(filepath)) {
  245. extractedDirs = ArchiveUtil.extractCompressedFiles(filepath, destPath);
  246. } else if (ArchiveUtil.isRAR(filepath) || ArchiveUtil.is7z(filepath)) {
  247. // 7z和rar压缩包解压
  248. extractedDirs = ArchiveUtil.extractRar5(filepath, destPath);
  249. } else {
  250. return;
  251. }
  252. for (String dir : extractedDirs) {
  253. // 如果邮件类型不满足解析条件则重新根据文件名判断
  254. if (emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) {
  255. emailType = EmailUtil.getEmailTypeBySubject(dir);
  256. }
  257. File file = new File(dir);
  258. if (file.isDirectory()) {
  259. String[] subDirs = file.list();
  260. if (subDirs != null) {
  261. for (String subDir : subDirs) {
  262. resultList.add(new EmailZipFileDTO(emailTitle, subDir, emailType));
  263. }
  264. } else {
  265. log.warn("目录 {} 下无文件", dir);
  266. }
  267. } else {
  268. resultList.add(new EmailZipFileDTO(emailTitle, dir, emailType));
  269. }
  270. }
  271. }
  272. /**
  273. * 邮件附件解析并保存结果数据
  274. *
  275. * @param emailKey 没封邮件的uuid
  276. * @param emailAddress 发送人地址
  277. * @param emailInfo 邮件信息
  278. */
  279. public void saveRelatedTable(String emailKey, String emailAddress, EmailInfoDTO emailInfo) {
  280. // 附件文件检查
  281. Long totalSize = this.checkEmailFileInfo(emailInfo);
  282. if (totalSize == null) {
  283. return;
  284. }
  285. // 解析并保存数据
  286. List<ParseResult<ReportData>> dataList = ListUtil.list(true);
  287. Integer emailId = this.parseResults(null, emailKey, emailAddress, totalSize, emailInfo, dataList);
  288. String failReason = null;
  289. int emailParseStatus = EmailParseStatusConst.SUCCESS;
  290. // 报告邮件有一条失败就表示整个邮件解析失败
  291. if (CollUtil.isNotEmpty(dataList)) {
  292. // ai解析结果
  293. List<ReportData> aiParaseList = dataList.stream().map(ParseResult::getData)
  294. .filter(Objects::nonNull).filter(e -> Objects.equals(true, e.getAiParse())).toList();
  295. if (CollUtil.isNotEmpty(aiParaseList)) {
  296. for (ReportData data : aiParaseList) {
  297. this.emailFileInfoMapper.updateAiParseByFileId(data.getBaseInfo().getFileId(),
  298. data.getAiParse(), data.getAiFileId());
  299. }
  300. }
  301. long failNum = dataList.stream().filter(e -> !Objects.equals(EmailParseStatusConst.SUCCESS, e.getStatus())).count();
  302. if (failNum > 0) {
  303. emailParseStatus = EmailParseStatusConst.FAIL;
  304. failReason = dataList.stream().map(ParseResult::getMsg).collect(Collectors.joining(";"));
  305. }
  306. }
  307. this.emailParseInfoMapper.updateParseStatus(emailId, emailParseStatus, failReason);
  308. }
  309. /**
  310. * 上传文件解析并返回解析状态
  311. *
  312. * @param params 上传文件路径
  313. * @return /
  314. */
  315. public List<UploadReportResult> uploadReportResults(UploadReportParams params) {
  316. List<ParseResult<ReportData>> dataList = ListUtil.list(false);
  317. List<UploadReportParams.ReportInfo> reportInfos = params.getReportInfos();
  318. List<EmailZipFileDTO> dtos = ListUtil.list(false);
  319. for (UploadReportParams.ReportInfo e : reportInfos) {
  320. String reportPath = e.getReportPath();
  321. if (ArchiveUtil.isArchive(reportPath)) {
  322. try {
  323. this.handleCompressedFiles(params.getTitle(), reportPath, e.getReportType(), dtos);
  324. } catch (Exception ex) {
  325. log.warn("报告{} 压缩包解压失败:{}", reportPath, ExceptionUtil.stacktraceToString(ex));
  326. ReportData reportData = new ReportData.DefaultReportData();
  327. reportData.setReportPath(reportPath);
  328. dataList.add(new ParseResult<>(ReportParseStatus.ARCHIVE_FAIL, reportData));
  329. }
  330. } else {
  331. dtos.add(new EmailZipFileDTO(params.getTitle(), reportPath, e.getReportType()));
  332. }
  333. }
  334. EmailInfoDTO emailInfo = new EmailInfoDTO(params.getTitle(), dtos);
  335. // 附件文件检查
  336. Long totalSize = this.checkEmailFileInfo(emailInfo);
  337. if (totalSize == null) {
  338. return null;
  339. }
  340. this.parseResults(-1, null, null, totalSize, emailInfo, dataList);
  341. List<UploadReportResult> resultList = ListUtil.list(false);
  342. for (ParseResult<ReportData> result : dataList) {
  343. ReportData data = result.getData();
  344. resultList.add(new UploadReportResult(data.getReportPath(), result.getStatus(), result.getMsg()));
  345. }
  346. return resultList;
  347. }
  348. /**
  349. * 邮件信息前置处理,在解析操作执行之前的过滤逻辑和校验逻辑。返回所有附件大小汇总
  350. *
  351. * @param emailInfo 邮件信息(包含所有解压后的文件)
  352. * @return 所有附件大小汇总,为null说明没有文件需要上传
  353. */
  354. private Long checkEmailFileInfo(EmailInfoDTO emailInfo) {
  355. String emailTitle = emailInfo.getEmailTitle();
  356. List<EmailZipFileDTO> dtos = emailInfo.getEmailFileList();
  357. // 如果压缩包里面既有pdf又有其他格式的文件,说明其他格式的文件是不需要解析的
  358. List<String> exts = dtos.stream().map(EmailZipFileDTO::getExtName).distinct().toList();
  359. if (exts.contains(Constants.FILE_PDF) && exts.size() > 1) {
  360. dtos.removeIf(e -> !Objects.equals(Constants.FILE_PDF, e.getExtName()));
  361. }
  362. // 移除逻辑
  363. Iterator<EmailZipFileDTO> removeIterator = dtos.iterator();
  364. while (removeIterator.hasNext()) {
  365. EmailZipFileDTO dto = removeIterator.next();
  366. String filename = dto.getFilename();
  367. // 删除复核函或基金合同
  368. if (filename.contains("复核函") || (filename.contains("基金合同") && !filename.contains("合同变更"))) {
  369. log.warn("邮件{} 中的报告{} 是复核函或基金合同,不用解析上传。", emailTitle, filename);
  370. removeIterator.remove();
  371. }
  372. // 不支持的类型
  373. Integer type = dto.getEmailType();
  374. if (!EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(type)) {
  375. log.info("邮件{} 类型{} 不支持解析。", emailTitle, type);
  376. removeIterator.remove();
  377. }
  378. }
  379. // 数据库已存在的数据过滤(邮件主题+报告名称+附件大小,压缩包文件大小汇总)
  380. long totalSize = dtos.stream().map(EmailZipFileDTO::getFileSize).reduce(0L, Long::sum);
  381. Iterator<EmailZipFileDTO> iterator = dtos.iterator();
  382. while (iterator.hasNext()) {
  383. EmailZipFileDTO dto = iterator.next();
  384. String filename = dto.getFilename();
  385. Integer type = dto.getEmailType();
  386. int count = 0;
  387. if (Objects.equals(type, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) {
  388. // 确认单
  389. count = this.emailFileInfoMapper.getLetterFilenameSuccessCount(emailTitle, filename);
  390. } else if (Objects.equals(type, EmailTypeConst.REPORT_EMAIL_TYPE)) {
  391. // 定期报告
  392. count = this.emailFileInfoMapper.getAmacFilenameSuccessCount(emailTitle, filename, totalSize);
  393. } else if (Objects.equals(type, EmailTypeConst.REPORT_WEEKLY_TYPE)) {
  394. // 管理人周报
  395. count = this.emailFileInfoMapper.getWeeklyFilenameSuccessCount(emailTitle, filename, totalSize);
  396. } else if (Objects.equals(type, EmailTypeConst.REPORT_OTHER_TYPE)) {
  397. // 其他报告
  398. count = this.emailFileInfoMapper.getOtherFilenameSuccessCount(emailTitle, filename, totalSize);
  399. }
  400. if (count > 0) {
  401. iterator.remove();
  402. log.info("邮件{} 报告{} 已存在解析成功的记录,不用重新解析。", emailTitle, filename);
  403. }
  404. }
  405. if (CollUtil.isEmpty(dtos)) {
  406. log.info("邮件{} 所有文件都已经解析成功过,不能重复解析了", emailTitle);
  407. return null;
  408. }
  409. if (log.isInfoEnabled()) {
  410. log.info("邮件{} 还有报告待解析:\n{}", emailTitle, dtos);
  411. }
  412. return totalSize;
  413. }
  414. /**
  415. * 邮件信息保存+附件解析
  416. *
  417. * @param emailId 邮件ID,上传解析时一定是-1
  418. * @param emailKey 邮件uuid(邮箱下载解析时)
  419. * @param emailAddress 接收人地址(邮箱下载解析时)
  420. * @param totalSize 所有附件大小汇总
  421. * @param emailInfo 邮件信息,包含附件
  422. * @param resultList 解析结果
  423. * @return 邮件数据ID
  424. */
  425. private Integer parseResults(Integer emailId,
  426. String emailKey,
  427. String emailAddress,
  428. long totalSize,
  429. EmailInfoDTO emailInfo,
  430. List<ParseResult<ReportData>> resultList) {
  431. String emailTitle = emailInfo.getEmailTitle();
  432. List<EmailZipFileDTO> dtos = emailInfo.getEmailFileList();
  433. if (emailId == null) {
  434. // 保存邮件信息
  435. Integer emailType = dtos.get(0).getEmailType();
  436. EmailParseInfoDO emailParseInfoDO = this.buildEmailParseInfo(emailAddress, emailType, emailInfo, totalSize);
  437. emailParseInfoDO.setEmailKey(emailKey);
  438. emailId = this.saveEmailParseInfo(emailParseInfoDO);
  439. }
  440. // 解析邮件报告
  441. for (EmailZipFileDTO zipFile : dtos) {
  442. EmailFileInfoDO emailFile = this.saveEmailFileInfo(emailId, zipFile.getFilename(), zipFile.getFilepath());
  443. // 解析并保存报告
  444. ParseResult<ReportData> parseResult = this.parseReportAndHandleResult(emailTitle, emailFile.getId(), zipFile);
  445. if (!Objects.equals(1, parseResult.getStatus())) {
  446. log.error(parseResult.getMsg());
  447. }
  448. if (parseResult.getData() == null) {
  449. parseResult.setData(new ReportData.DefaultReportData());
  450. }
  451. parseResult.getData().setReportPath(zipFile.getFilepath());
  452. resultList.add(parseResult);
  453. }
  454. return emailId;
  455. }
  456. /**
  457. * 解析报告并保存解析结果
  458. *
  459. * @param emailTitle 邮件主题
  460. * @param fileId 当前文件数据库ID
  461. * @param zipFile 当前报告的路径信息
  462. * @return /
  463. */
  464. private ParseResult<ReportData> parseReportAndHandleResult(String emailTitle,
  465. Integer fileId,
  466. EmailZipFileDTO zipFile) {
  467. Integer emailType = zipFile.getEmailType();
  468. String fileName = zipFile.getFilename();
  469. String filepath = zipFile.getFilepath();
  470. ParseResult<ReportData> result = new ParseResult<>();
  471. boolean reportFlag = emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType);
  472. if (reportFlag || StrUtil.isBlank(fileName) || fileName.endsWith(Constants.FILE_HTML)) {
  473. return new ParseResult<>(ReportParseStatus.NOT_A_REPORT, null, fileName);
  474. }
  475. // 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报
  476. ReportType reportType = ReportParseUtils.matchReportType(emailType, fileName);
  477. if (reportType == null) {
  478. reportType = ReportParseUtils.matchReportType(emailType, emailTitle);
  479. if (log.isDebugEnabled()) {
  480. log.debug("报告{} 根据邮件主题{} 重新识别的类型是:{}", fileName, emailTitle, reportType);
  481. }
  482. }
  483. // 解析器--根据文件后缀获取对应解析器,解析不了就用AI来解析
  484. ReportParserFileType fileType = ReportParserFileType.getBySuffix(zipFile.getExtName());
  485. // 不支持的格式
  486. if (fileType == null) {
  487. return new ParseResult<>(ReportParseStatus.NO_SUPPORT_TEMPLATE, null, fileName);
  488. }
  489. // 不是定期报告的判断逻辑放在不支持的格式下面
  490. if (reportType == null) {
  491. return new ParseResult<>(ReportParseStatus.NOT_A_REPORT, null, fileName);
  492. }
  493. // docx转pdf
  494. if (Objects.equals(ReportParserFileType.WORD, fileType)) {
  495. try {
  496. String outputFile = FileUtil.getParent(filepath, 1) + File.separator + FileUtil.mainName(fileName) + ".pdf";
  497. PdfUtil.convertDocxToPdf(filepath, outputFile);
  498. filepath = outputFile;
  499. } catch (Exception e) {
  500. log.warn("报告{} 转换为pdf失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
  501. }
  502. }
  503. // 首页和尾页转为png图片,首页用来识别基金名称和基金代码、尾页用来识别印章和联系人
  504. List<String> images = ListUtil.list(true);
  505. if (Objects.equals(ReportParserFileType.PDF, fileType)) {
  506. try {
  507. String output = filepath.replaceAll("archive|original", "image");
  508. File outputFile = FileUtil.file(FileUtil.getParent(output, 1));
  509. images = PdfUtil.convertFirstAndLastPagesToPng(filepath, outputFile, 300);
  510. if (log.isDebugEnabled()) {
  511. log.debug("报告{} 生成的图片地址是:\n{}", fileName, images);
  512. }
  513. } catch (Exception e) {
  514. log.warn("报告{} 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
  515. }
  516. } else if (Objects.equals(ReportParserFileType.IMG, fileType)) {
  517. try {
  518. String outputFile = PdfUtil.compressAndSave(filepath);
  519. images.add(outputFile);
  520. } catch (IOException e) {
  521. log.error("报告{} 图片压缩失败,{}", fileName, ExceptionUtil.stacktraceToString(e));
  522. }
  523. }
  524. // ocr识别月报是否管理人版或协会版
  525. ReportMonthlyType monthlyType = ReportMonthlyType.NO_NEED;
  526. if (ReportType.MONTHLY == reportType) {
  527. monthlyType = this.determineReportType(emailTitle, fileName, filepath, images);
  528. }
  529. boolean isAmac = reportType == ReportType.ANNUALLY || reportType == ReportType.QUARTERLY
  530. || (reportType == ReportType.MONTHLY && ReportMonthlyType.AMAC == monthlyType);
  531. // 不支持解析的格式文件
  532. boolean notSupportFile = false;
  533. // 解析报告
  534. ReportData reportData = null;
  535. ReportParserParams params = new ReportParserParams(fileId, fileName, filepath, reportType);
  536. long start = System.currentTimeMillis();
  537. try {
  538. if (isAmac || reportType == ReportType.LETTER) {
  539. ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, fileType);
  540. reportData = instance.parse(params);
  541. result = new ParseResult<>(1, "报告解析成功", reportData);
  542. }
  543. } catch (ReportParseException e) {
  544. result = new ParseResult<>(e.getCode(), StrUtil.format(e.getMsg(), fileName), null);
  545. log.warn("解析失败:{}", result.getMsg());
  546. if (e instanceof NotSupportReportException) {
  547. notSupportFile = true;
  548. }
  549. } catch (Exception e) {
  550. log.warn("解析错误:{}", ExceptionUtil.stacktraceToString(e));
  551. result = new ParseResult<>(ReportParseStatus.PARSE_FAIL, null, e.getMessage());
  552. } finally {
  553. // 如果解析结果是空的就用AI工具解析一次
  554. if (reportData == null && !notSupportFile) {
  555. if (log.isInfoEnabled()) {
  556. log.info("报告{} 是周报或管理人月报或其他类型或解析失败,用AI解析器解析", fileName);
  557. }
  558. try {
  559. if (!isAmac && CollUtil.isNotEmpty(images)) {
  560. filepath = images.get(0);
  561. }
  562. params = new ReportParserParams(fileId, fileName, filepath, reportType);
  563. ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, ReportParserFileType.AI);
  564. reportData = instance.parse(params);
  565. result = new ParseResult<>(1, "报告解析成功--AI", reportData);
  566. } catch (ReportParseException e) {
  567. result = new ParseResult<>(e.getCode(), StrUtil.format(e.getMsg(), fileName), null);
  568. log.warn("AI解析失败:{}", result.getMsg());
  569. } catch (Exception e) {
  570. log.warn("AI解析错误:{}", ExceptionUtil.stacktraceToString(e));
  571. result = new ParseResult<>(ReportParseStatus.PARSE_FAIL, null, e.getMessage());
  572. }
  573. }
  574. if (reportData != null && reportData.getBaseInfo() != null) {
  575. // 设置月报类型
  576. reportData.getBaseInfo().setMonthlyType(monthlyType.getType());
  577. // 当报告日期还是空时设置为今天的前一天
  578. if (reportData.getBaseInfo().getReportDate() == null) {
  579. Date date = DateUtil.offsetDay(new Date(), -1);
  580. reportData.getBaseInfo().setReportDate(date);
  581. }
  582. }
  583. // ocr信息提取(印章、联系人、基金名称和产品代码)
  584. this.ocrReportData(reportType, reportData, fileName, images);
  585. if (log.isInfoEnabled()) {
  586. log.info("报告{} 解析耗时{}ms,结果是:\n{}", fileName, (System.currentTimeMillis() - start), reportData);
  587. }
  588. }
  589. // 保存报告解析结果
  590. this.saveReportData(reportData, reportType, fileName);
  591. return result;
  592. }
  593. /**
  594. * 判断月报类型(管理人版还是协会版)
  595. *
  596. * @param emailTitle 邮件主题
  597. * @param fileName 报告名称
  598. * @param filepath 报告路径
  599. * @param images 报告的第一页和尾页图片地址(主要用于ocr提取关键信息)
  600. */
  601. public ReportMonthlyType determineReportType(String emailTitle, String fileName,
  602. String filepath, List<String> images) {
  603. // 1. 优先根据文件名判断
  604. if (ReportParseUtils.containsAny(fileName, AMAC_KEYWORDS)) {
  605. return ReportMonthlyType.AMAC;
  606. }
  607. if (ReportParseUtils.containsAny(fileName, ReportParseUtils.MANAGER_KEYWORDS)) {
  608. return ReportMonthlyType.MANAGER;
  609. }
  610. // if (StrUtil.isNotBlank(ReportParseUtils.matchFundCode(fileName))) {
  611. // return ReportMonthlyType.AMAC;
  612. // }
  613. // 2. 根据文件路径判断
  614. List<String> pathSegments = StrUtil.split(filepath, File.separator);
  615. for (String segment : pathSegments) {
  616. boolean isExcluded = ReportParseUtils.containsAny(segment, EXCLUDE_PATH_KEYWORDS);
  617. if (!isExcluded && ReportParseUtils.containsAny(segment, AMAC_KEYWORDS)) {
  618. return ReportMonthlyType.AMAC;
  619. }
  620. if (!isExcluded && ReportParseUtils.containsAny(segment, ReportParseUtils.MANAGER_KEYWORDS)) {
  621. return ReportMonthlyType.MANAGER;
  622. }
  623. }
  624. // 3. 根据邮件主题判断
  625. boolean isAmacEmail = ReportParseUtils.containsAny(emailTitle, AMAC_KEYWORDS)
  626. && !emailTitle.contains("公司及协会版");
  627. if (isAmacEmail) {
  628. return ReportMonthlyType.AMAC;
  629. }
  630. if (ReportParseUtils.containsAny(emailTitle, ReportParseUtils.MANAGER_KEYWORDS)) {
  631. return ReportMonthlyType.MANAGER;
  632. }
  633. // 4.ocr 提取“曲线”、“基金份额”等关键字,如果有曲线则是管理人,如果有估值日期则是协会
  634. if (CollUtil.isNotEmpty(images)) {
  635. try {
  636. return new OCRReportParser().parseMonthlyType(fileName, this.ocrParserUrl, images.get(0));
  637. } catch (Exception ignored) {
  638. return ReportMonthlyType.FAILED;
  639. }
  640. }
  641. return ReportMonthlyType.FAILED;
  642. }
  643. /**
  644. * ocr 提取信息(包括首页的基金名称或报告日期,尾页的印章或联系人等信息)
  645. *
  646. * @param reportData 报告解析结果
  647. * @param fileName 报告名称
  648. * @param images 报告的收益和尾页png图片
  649. */
  650. private void ocrReportData(ReportType reportType,
  651. ReportData reportData,
  652. String fileName,
  653. List<String> images) {
  654. if (reportData == null || CollUtil.isEmpty(images)) {
  655. return;
  656. }
  657. if (log.isInfoEnabled()) {
  658. log.info("报告{} 用ocr补充解析结果。补充前的结果是:\n{}", fileName, reportData);
  659. }
  660. // 报告才识别尾页的印章和联系人,确认单不识别尾页
  661. if (ReportType.LETTER != reportType) {
  662. OCRParseData parseRes = null;
  663. try {
  664. // 首页和尾页相等时只读首页
  665. String imageUrl = images.size() == 1 ? images.get(0) : images.get(1);
  666. parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, imageUrl);
  667. } catch (Exception e) {
  668. log.error("报告{} OCR识别印章和联系人出错:{}", fileName, e.getMessage());
  669. }
  670. // ocr识别尾页是否包含印章和联系人信息
  671. if (parseRes != null) {
  672. if (reportData.getBaseInfo() != null) {
  673. reportData.getBaseInfo().setWithSeals(parseRes.getWithSeals());
  674. reportData.getBaseInfo().setWithContacts(parseRes.getWithContacts());
  675. if (fileName.contains("用印") && !Objects.equals(true, reportData.getBaseInfo().getWithSeals())) {
  676. reportData.getBaseInfo().setWithSeals(true);
  677. }
  678. }
  679. }
  680. // 首页和尾页不相等时解析首页的数据
  681. if (images.size() != 1) {
  682. try {
  683. parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0));
  684. } catch (Exception e) {
  685. log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, e.getMessage());
  686. }
  687. }
  688. // 用首页识别基金名称、产品代码和基金管理人
  689. if (reportData.getFundInfo() != null && parseRes != null) {
  690. if (StrUtil.isBlank(reportData.getFundInfo().getFundName())) {
  691. reportData.getFundInfo().setFundName(parseRes.getFundName());
  692. }
  693. if (StrUtil.isBlank(reportData.getFundInfo().getFundCode())) {
  694. reportData.getFundInfo().setFundCode(parseRes.getFundCode());
  695. }
  696. if (StrUtil.isBlank(reportData.getFundInfo().getCompanyName())
  697. || !reportData.getFundInfo().getCompanyName().contains("有限公司")) {
  698. reportData.getFundInfo().setCompanyName(parseRes.getCompanyName());
  699. }
  700. }
  701. } else {
  702. // 确认单AI解析失败时重新用OCR识别
  703. LetterReportData letterReportData = (LetterReportData) reportData;
  704. if (letterReportData.wasFailed()) {
  705. OCRLetterParseData parseRes = null;
  706. try {
  707. parseRes = new OCRReportParser().parseLetterData(fileName, this.ocrParserUrl, images.get(0));
  708. } catch (Exception e) {
  709. log.error("报告{} OCR提取确认单关键信息出错:{}", fileName, e.getMessage());
  710. }
  711. if (parseRes == null) {
  712. return;
  713. }
  714. if (letterReportData.getFundInfo() != null) {
  715. letterReportData.getFundInfo().setFundName(parseRes.getFundName());
  716. letterReportData.getFundInfo().setFundCode(parseRes.getFundCode());
  717. }
  718. if (letterReportData.getInvestorInfo() == null) {
  719. letterReportData.setInvestorInfo(new ReportInvestorInfoDTO());
  720. }
  721. letterReportData.getInvestorInfo().setInvestorName(parseRes.getInvestorName());
  722. letterReportData.getInvestorInfo().setCertificateNumber(parseRes.getCertificateNumber());
  723. letterReportData.getInvestorInfo().setTradingAccount(parseRes.getTradingAccount());
  724. letterReportData.getInvestorInfo().setFundAccount(parseRes.getFundAccount());
  725. letterReportData.getInvestorInfo().setCertificateType(parseRes.getCertificateType());
  726. if (letterReportData.getFundTransaction() == null) {
  727. letterReportData.setFundTransaction(new ReportFundTransactionDTO());
  728. }
  729. letterReportData.getFundTransaction().setTransactionType(parseRes.getTransactionType());
  730. letterReportData.getFundTransaction().setApplyDate(parseRes.getApplyDate());
  731. letterReportData.getFundTransaction().setApplyShare(parseRes.getApplyShare());
  732. letterReportData.getFundTransaction().setApplyAmount(parseRes.getApplyAmount());
  733. letterReportData.getFundTransaction().setHoldingDate(parseRes.getHoldingDate());
  734. letterReportData.getFundTransaction().setAmount(parseRes.getAmount());
  735. letterReportData.getFundTransaction().setShare(parseRes.getShare());
  736. letterReportData.getFundTransaction().setNav(parseRes.getNav());
  737. }
  738. }
  739. }
  740. /**
  741. * 保存报告解析结果
  742. *
  743. * @param reportData 报告解析结果
  744. * @param reportType 报告类型
  745. * @param fileName 报告名称
  746. */
  747. private void saveReportData(ReportData reportData, ReportType reportType, String fileName) {
  748. if (reportData == null) {
  749. return;
  750. }
  751. StopWatch writeWatch = new StopWatch();
  752. writeWatch.start();
  753. try {
  754. ReportWriter<ReportData> instance = this.reportWriterFactory.getInstance(reportType);
  755. instance.write(reportData);
  756. } catch (Exception e) {
  757. log.error("报告{} 结果保存失败 {}", fileName, ExceptionUtil.stacktraceToString(e));
  758. } finally {
  759. writeWatch.stop();
  760. if (log.isInfoEnabled()) {
  761. log.info("报告{}解析结果保存完成,耗时{}ms", fileName, writeWatch.getTotalTimeMillis());
  762. }
  763. }
  764. }
  765. private EmailFileInfoDO saveEmailFileInfo(Integer emailId, String fileName, String filePath) {
  766. EmailFileInfoDO emailFileInfoDO = buildEmailFileInfoDO(emailId, fileName, filePath);
  767. emailFileInfoDO.setAiFileId(null);
  768. if (emailFileInfoDO.getId() != null) {
  769. emailFileInfoMapper.updateTimeById(null, new Date());
  770. return emailFileInfoDO;
  771. }
  772. emailFileInfoMapper.insert(emailFileInfoDO);
  773. return emailFileInfoDO;
  774. }
  775. private EmailFileInfoDO buildEmailFileInfoDO(Integer emailId, String fileName, String filePath) {
  776. EmailFileInfoDO emailFileInfoDO = new EmailFileInfoDO();
  777. emailFileInfoDO.setId(null);
  778. emailFileInfoDO.setEmailId(emailId);
  779. emailFileInfoDO.setFileName(fileName);
  780. emailFileInfoDO.setFilePath(filePath);
  781. emailFileInfoDO.setIsvalid(1);
  782. emailFileInfoDO.setCreatorId(0);
  783. emailFileInfoDO.setCreateTime(new Date());
  784. emailFileInfoDO.setUpdaterId(0);
  785. emailFileInfoDO.setUpdateTime(new Date());
  786. return emailFileInfoDO;
  787. }
  788. private Integer saveEmailParseInfo(EmailParseInfoDO emailParseInfoDO) {
  789. if (emailParseInfoDO == null) {
  790. return null;
  791. }
  792. // 重新邮件功能 -> 修改解析时间和更新时间
  793. if (emailParseInfoDO.getId() != null) {
  794. emailParseInfoMapper.updateParseTime(emailParseInfoDO.getId(), emailParseInfoDO.getParseDate());
  795. return emailParseInfoDO.getId();
  796. }
  797. emailParseInfoMapper.insert(emailParseInfoDO);
  798. return emailParseInfoDO.getId();
  799. }
  800. private EmailParseInfoDO buildEmailParseInfo(String emailAddress, Integer emailType,
  801. EmailInfoDTO emailInfo, long totalSize) {
  802. EmailParseInfoDO emailParseInfoDO = new EmailParseInfoDO();
  803. emailParseInfoDO.setId(null);
  804. emailParseInfoDO.setSenderEmail(emailInfo.getSenderEmail());
  805. emailParseInfoDO.setEmail(emailAddress);
  806. emailParseInfoDO.setEmailDate(DateUtil.parse(emailInfo.getEmailDate(), DateConst.YYYY_MM_DD_HH_MM_SS));
  807. emailParseInfoDO.setParseDate(new Date());
  808. emailParseInfoDO.setEmailTitle(emailInfo.getEmailTitle());
  809. emailParseInfoDO.setEmailType(emailType);
  810. emailParseInfoDO.setParseStatus(EmailParseStatusConst.SUCCESS);
  811. emailParseInfoDO.setAttrSize(totalSize);
  812. emailParseInfoDO.setIsvalid(1);
  813. emailParseInfoDO.setCreatorId(0);
  814. emailParseInfoDO.setCreateTime(new Date());
  815. emailParseInfoDO.setUpdaterId(0);
  816. emailParseInfoDO.setUpdateTime(new Date());
  817. return emailParseInfoDO;
  818. }
  819. /**
  820. * 读取邮件
  821. *
  822. * @param mailboxInfoDTO 邮箱配置信息
  823. * @param startDate 邮件起始日期
  824. * @param endDate 邮件截止日期(为null,将解析邮件日期小于等于startDate的当天邮件)
  825. * @return 读取到的邮件信息
  826. * @throws Exception 异常信息
  827. */
  828. private Map<String, List<EmailContentInfoDTO>> realEmail(MailboxInfoDTO mailboxInfoDTO,
  829. Date startDate, Date endDate,
  830. List<String> folderNames) throws Exception {
  831. if (CollUtil.isEmpty(folderNames)) {
  832. folderNames = ListUtil.toList("INBOX");
  833. }
  834. Store store = EmailUtil.getStoreNew(mailboxInfoDTO);
  835. if (store == null) {
  836. return MapUtil.newHashMap(4);
  837. }
  838. Map<String, List<EmailContentInfoDTO>> result = MapUtil.newHashMap(128);
  839. try {
  840. if (log.isDebugEnabled()) {
  841. Folder[] list = store.getDefaultFolder().list("*");
  842. List<String> names = Arrays.stream(list).map(Folder::getFullName).toList();
  843. log.debug("获取所有邮箱文件夹:{}", names);
  844. }
  845. for (String folderName : folderNames) {
  846. try {
  847. Map<String, List<EmailContentInfoDTO>> temp = this.getFolderEmail(mailboxInfoDTO,
  848. startDate, endDate, store, folderName);
  849. if (MapUtil.isNotEmpty(temp)) {
  850. result.putAll(temp);
  851. }
  852. } catch (Exception e) {
  853. log.warn("文件夹{} 邮件获取失败:{}", folderName, ExceptionUtil.stacktraceToString(e));
  854. }
  855. }
  856. } catch (Exception e) {
  857. log.error("邮件获取失败:{}", ExceptionUtil.stacktraceToString(e));
  858. } finally {
  859. store.close();
  860. }
  861. return result;
  862. }
  863. private Map<String, List<EmailContentInfoDTO>> getFolderEmail(MailboxInfoDTO mailboxInfoDTO,
  864. Date startDate, Date endDate,
  865. Store store, String folderName) throws MessagingException {
  866. // 默认读取收件箱的邮件
  867. Folder folder = store.getFolder(folderName);
  868. folder.open(this.readWriteSeen ? Folder.READ_WRITE : Folder.READ_ONLY);
  869. Message[] messages = getEmailMessage(folder, mailboxInfoDTO.getProtocol(), startDate);
  870. if (messages == null || messages.length == 0) {
  871. log.warn("{} 获取不到邮件 -> 邮箱信息:{},开始时间:{},结束时间:{}", folderName, mailboxInfoDTO, startDate, endDate);
  872. return MapUtil.newHashMap();
  873. }
  874. String emailAddress = mailboxInfoDTO.getAccount();
  875. Map<String, List<EmailContentInfoDTO>> emailMessageMap = MapUtil.newHashMap();
  876. for (Message message : messages) {
  877. long start = System.currentTimeMillis();
  878. List<EmailContentInfoDTO> dtos = CollUtil.newArrayList();
  879. String emailTitle = message.getSubject();
  880. if (this.readWriteSeen && isMessageRead(message)) {
  881. log.warn("{} 邮件{} 已读,不用重新下载解析!", folderName, emailTitle);
  882. continue;
  883. }
  884. try {
  885. Date emailDate = message.getSentDate();
  886. String emailDateStr = DateUtil.format(emailDate, DateConst.YYYY_MM_DD_HH_MM_SS);
  887. if (log.isInfoEnabled()) {
  888. log.info("{} 邮件{} 数据获取中,邮件时间:{}", folderName, emailTitle, emailDateStr);
  889. }
  890. boolean isNotParseConditionSatisfied = emailDate == null
  891. || (endDate != null && emailDate.compareTo(endDate) > 0)
  892. || (startDate != null && emailDate.compareTo(startDate) < 0);
  893. if (isNotParseConditionSatisfied) {
  894. String st = DateUtil.formatDateTime(startDate);
  895. String ed = DateUtil.formatDateTime(endDate);
  896. log.warn("{} 邮件{} 发送时间{}不在区间内【{} ~ {}】", folderName, emailTitle, emailDateStr, st, ed);
  897. continue;
  898. }
  899. String senderEmail = getSenderEmail(message);
  900. Integer emailType = EmailUtil.getEmailTypeBySubject(emailTitle);
  901. if (emailType == null) {
  902. log.warn("{} 邮件不满足解析条件 -> 邮件主题:{},邮件日期:{}", folderName, emailTitle, emailDateStr);
  903. continue;
  904. }
  905. // // 成功解析的邮件不用重复下载
  906. // Integer okNum = this.emailParseInfoMapper.countEmailByInfoAndStatus(emailTitle, senderEmail, emailAddress, emailDateStr);
  907. // if (okNum > 0) {
  908. // if (log.isInfoEnabled()) {
  909. // log.info("{} 邮件{} 已经存在解析完成的记录,不要重复下载了。", folderName, emailTitle);
  910. // }
  911. // continue;
  912. // }
  913. if (log.isInfoEnabled()) {
  914. log.info("{} 邮件{} 基本信息获取完成,开始下载附件!邮件日期:{}", folderName, emailTitle, emailDateStr);
  915. }
  916. Object content = message.getContent();
  917. if (content instanceof Multipart multipart) {
  918. this.reMultipart(emailAddress, emailTitle, emailDate, multipart, dtos);
  919. } else if (content instanceof Part part) {
  920. this.rePart(emailAddress, emailTitle, emailDate, part, dtos);
  921. } else {
  922. log.warn("{} 邮件{} 获取不了附件", folderName, emailTitle);
  923. }
  924. if (CollUtil.isEmpty(dtos)) {
  925. log.warn("{} 邮件{} 没有获取到附件", folderName, emailTitle);
  926. continue;
  927. }
  928. dtos.forEach(e -> {
  929. e.setEmailType(emailType);
  930. e.setSenderEmail(senderEmail);
  931. });
  932. emailMessageMap.put(IdUtil.simpleUUID(), dtos);
  933. } catch (Exception e) {
  934. log.error("{} 邮件{} 下载报错 {}", folderName, emailTitle, ExceptionUtil.stacktraceToString(e));
  935. } finally {
  936. if (CollUtil.isNotEmpty(dtos) && log.isInfoEnabled()) {
  937. log.info("{} 邮件{} 下载完成,总计耗时{} ms,文件内容如下\n {}", folderName,
  938. emailTitle, System.currentTimeMillis() - start, dtos);
  939. }
  940. }
  941. }
  942. if (this.readWriteSeen) {
  943. // 设置已读标志
  944. folder.setFlags(messages, new Flags(Flags.Flag.SEEN), true);
  945. }
  946. folder.close(false);
  947. return emailMessageMap;
  948. }
  949. private void rePart(String account, String subject, Date sendDate, Part part,
  950. List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
  951. String fileName = EmailUtil.decodeFileName(part);
  952. if (StrUtil.isBlank(fileName)) {
  953. return;
  954. }
  955. if (fileName.contains("\"") || fileName.contains("\n")) {
  956. fileName = fileName.replaceAll("\"", "").replaceAll("\n", "");
  957. }
  958. if (fileName.contains("=?")) {
  959. fileName = MimeUtility.decodeText(fileName);
  960. }
  961. String disposition = part.getDisposition();
  962. String contentType = part.getContentType();
  963. String[] att_files = new String[]{Constants.ARCHIVE_7Z, Constants.ARCHIVE_RAR, Constants.ARCHIVE_ZIP,
  964. Constants.FILE_PDF, Constants.FILE_DOCX, Constants.FILE_JPG, Constants.FILE_PNG};
  965. boolean attachmentFlag = StrUtil.endWithAny(fileName, att_files);
  966. boolean isAttachment = attachmentFlag
  967. || Part.ATTACHMENT.equalsIgnoreCase(disposition)
  968. || (contentType != null && attachmentMimePrefixes.stream().anyMatch(prefix ->
  969. StrUtil.startWithIgnoreCase(contentType, prefix)
  970. ));
  971. if (!isAttachment) {
  972. log.warn("邮件{} 未检测到{}类型的附件 (fileName={}, disposition={}, contentType={})",
  973. subject, att_files, fileName, disposition, contentType);
  974. return;
  975. }
  976. File saveFile = this.generateSavePath(account, sendDate, fileName);
  977. if (!saveFile.exists()) {
  978. if (!saveFile.getParentFile().exists()) {
  979. boolean mkdirs = saveFile.getParentFile().mkdirs();
  980. if (!mkdirs) {
  981. log.warn("file path mkdir failed.");
  982. }
  983. }
  984. try (InputStream is = part.getInputStream()) {
  985. Files.copy(is, saveFile.toPath());
  986. }
  987. } else {
  988. if (log.isInfoEnabled()) {
  989. log.info("邮件{} 已下载过附件:{},不用重新下载了。", subject, saveFile.toPath());
  990. }
  991. }
  992. EmailContentInfoDTO emailContentInfoDTO = new EmailContentInfoDTO();
  993. emailContentInfoDTO.setFileName(fileName);
  994. emailContentInfoDTO.setFileSize(part.getSize());
  995. emailContentInfoDTO.setFilePath(saveFile.getAbsolutePath());
  996. emailContentInfoDTO.setEmailAddress(account);
  997. emailContentInfoDTO.setEmailTitle(subject);
  998. emailContentInfoDTO.setEmailDate(DateUtil.format(sendDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  999. emailContentInfoDTOList.add(emailContentInfoDTO);
  1000. }
  1001. public File generateSavePath(String account, Date sendDate, String fileName) {
  1002. String emailDateStr = DateUtil.format(sendDate, DateConst.YYYYMMDD);
  1003. String filePath = this.path + File.separator + account + File.separator +
  1004. emailDateStr + File.separator + "original" + File.separator;
  1005. // 压缩包重名时的后面的压缩包会覆盖前面压缩包的问题(不考虑普通文件)
  1006. String emailDate = DateUtil.format(sendDate, DateConst.YYYYMMDDHHMMSS24);
  1007. String realName = ArchiveUtil.isArchive(fileName) ? emailDate + fileName : fileName;
  1008. return FileUtil.file(filePath + realName);
  1009. }
  1010. private void reMultipart(String account, String subject, Date emailDate, Multipart multipart,
  1011. List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
  1012. for (int i = 0; i < multipart.getCount(); i++) {
  1013. Part bodyPart = multipart.getBodyPart(i);
  1014. Object content = bodyPart.getContent();
  1015. if (content instanceof String) {
  1016. if (log.isDebugEnabled()) {
  1017. log.debug("邮件{} 获取的正文不做解析,内容是 {}", subject, content);
  1018. }
  1019. continue;
  1020. }
  1021. if (content instanceof Multipart mp) {
  1022. this.reMultipart(account, subject, emailDate, mp, emailContentInfoDTOList);
  1023. } else {
  1024. this.rePart(account, subject, emailDate, bodyPart, emailContentInfoDTOList);
  1025. }
  1026. }
  1027. }
  1028. private String getSenderEmail(Message message) {
  1029. Address[] senderAddress;
  1030. try {
  1031. senderAddress = message.getFrom();
  1032. if (senderAddress == null || senderAddress.length == 0) {
  1033. return null;
  1034. }
  1035. // 此时的address是含有编码(MIME编码方式)后的文本和实际的邮件地址
  1036. String address = "";
  1037. for (Address from : senderAddress) {
  1038. if (StrUtil.isNotBlank(from.toString())) {
  1039. address = from.toString();
  1040. break;
  1041. }
  1042. }
  1043. // 正则表达式匹配邮件地址
  1044. Pattern pattern = Pattern.compile("<(\\S+)>");
  1045. Matcher matcher = pattern.matcher(address);
  1046. if (matcher.find()) {
  1047. return matcher.group(1);
  1048. }
  1049. } catch (MessagingException e) {
  1050. log.error(e.getMessage(), e);
  1051. }
  1052. return null;
  1053. }
  1054. private Message[] getEmailMessage(Folder folder, String protocol, Date startDate) {
  1055. try {
  1056. if (protocol.contains("imap")) {
  1057. // 获取邮件日期大于等于startDate的邮件(搜索条件只支持按天)
  1058. SearchTerm startDateTerm = new ReceivedDateTerm(ComparisonTerm.GE, startDate);
  1059. return folder.search(startDateTerm);
  1060. } else {
  1061. return folder.getMessages();
  1062. }
  1063. } catch (MessagingException e) {
  1064. throw new RuntimeException(e);
  1065. }
  1066. }
  1067. /**
  1068. * 检查邮件是否已读
  1069. *
  1070. * @param message 邮件对象
  1071. * @return true表示已读,false表示未读
  1072. * @throws MessagingException 如果访问邮件标志时出错
  1073. */
  1074. private boolean isMessageRead(Message message) throws MessagingException {
  1075. // 获取邮件的所有标志
  1076. Flags flags = message.getFlags();
  1077. // 检查是否包含 SEEN 标志
  1078. return flags.contains(Flags.Flag.SEEN);
  1079. }
  1080. }