EmailParseService.java 46 KB


  1. package com.smppw.modaq.domain.service;
  2. import cn.hutool.core.collection.CollUtil;
  3. import cn.hutool.core.collection.ListUtil;
  4. import cn.hutool.core.date.DateUtil;
  5. import cn.hutool.core.exceptions.ExceptionUtil;
  6. import cn.hutool.core.io.FileUtil;
  7. import cn.hutool.core.map.MapUtil;
  8. import cn.hutool.core.util.StrUtil;
  9. import com.smppw.modaq.application.components.OCRReportParser;
  10. import com.smppw.modaq.application.components.ReportParseUtils;
  11. import com.smppw.modaq.application.components.report.parser.ReportParser;
  12. import com.smppw.modaq.application.components.report.parser.ReportParserFactory;
  13. import com.smppw.modaq.application.components.report.writer.ReportWriter;
  14. import com.smppw.modaq.application.components.report.writer.ReportWriterFactory;
  15. import com.smppw.modaq.application.util.EmailUtil;
  16. import com.smppw.modaq.common.conts.DateConst;
  17. import com.smppw.modaq.common.conts.EmailParseStatusConst;
  18. import com.smppw.modaq.common.conts.EmailTypeConst;
  19. import com.smppw.modaq.common.enums.ReportParseStatus;
  20. import com.smppw.modaq.common.enums.ReportParserFileType;
  21. import com.smppw.modaq.common.enums.ReportType;
  22. import com.smppw.modaq.common.exception.NotSupportReportException;
  23. import com.smppw.modaq.common.exception.ReportParseException;
  24. import com.smppw.modaq.domain.dto.EmailContentInfoDTO;
  25. import com.smppw.modaq.domain.dto.EmailZipFileDTO;
  26. import com.smppw.modaq.domain.dto.MailboxInfoDTO;
  27. import com.smppw.modaq.domain.dto.report.OCRParseData;
  28. import com.smppw.modaq.domain.dto.report.ParseResult;
  29. import com.smppw.modaq.domain.dto.report.ReportData;
  30. import com.smppw.modaq.domain.dto.report.ReportParserParams;
  31. import com.smppw.modaq.domain.entity.EmailFileInfoDO;
  32. import com.smppw.modaq.domain.entity.EmailParseInfoDO;
  33. import com.smppw.modaq.domain.mapper.EmailFileInfoMapper;
  34. import com.smppw.modaq.domain.mapper.EmailParseInfoMapper;
  35. import com.smppw.modaq.infrastructure.util.ArchiveUtil;
  36. import com.smppw.modaq.infrastructure.util.PdfUtil;
  37. import jakarta.mail.*;
  38. import jakarta.mail.internet.MimeUtility;
  39. import jakarta.mail.search.ComparisonTerm;
  40. import jakarta.mail.search.ReceivedDateTerm;
  41. import jakarta.mail.search.SearchTerm;
  42. import org.slf4j.Logger;
  43. import org.slf4j.LoggerFactory;
  44. import org.springframework.beans.factory.annotation.Value;
  45. import org.springframework.stereotype.Service;
  46. import org.springframework.util.StopWatch;
  47. import java.io.File;
  48. import java.io.IOException;
  49. import java.io.InputStream;
  50. import java.nio.file.Files;
  51. import java.nio.file.Path;
  52. import java.nio.file.Paths;
  53. import java.util.*;
  54. import java.util.regex.Matcher;
  55. import java.util.regex.Pattern;
  56. import java.util.stream.Collectors;
  57. /**
  58. * @author mozuwen
  59. * @date 2024-09-04
  60. * @description 邮件解析服务
  61. */
  62. @Service
  63. public class EmailParseService {
  64. // public static final int stepSize = 10000;
  65. private static final Logger log = LoggerFactory.getLogger(EmailParseService.class);
  66. // 扩展支持的 MIME 类型
  67. private static final Set<String> attachmentMimePrefixes = Set.of(
  68. "application/pdf",
  69. "application/zip",
  70. "application/x-zip-compressed",
  71. "application/rar",
  72. "application/x-rar-compressed",
  73. "application/octet-stream"
  74. // 按需添加其他类型...
  75. );
  76. // private final EmailFieldMappingMapper emailFieldMapper;
  77. private final EmailParseInfoMapper emailParseInfoMapper;
  78. private final EmailFileInfoMapper emailFileInfoMapper;
  79. /* 报告解析和入库的方法 */
  80. private final ReportParserFactory reportParserFactory;
  81. private final ReportWriterFactory reportWriterFactory;
  82. @Value("${email.file.path}")
  83. private String path;
  84. @Value("${email.report.ocr-parser-url}")
  85. private String ocrParserUrl;
  86. @Value("${email.read-write-seen:true}")
  87. private boolean readWriteSeen;
  /**
   * Creates the service from its collaborators.
   *
   * @param emailParseInfoMapper mapper used for mail-level parse records
   * @param emailFileInfoMapper mapper used for per-file records
   * @param reportParserFactory factory that supplies report parsers
   * @param reportWriterFactory factory that supplies report writers
   */
  public EmailParseService(
      EmailParseInfoMapper emailParseInfoMapper,
      EmailFileInfoMapper emailFileInfoMapper,
      ReportParserFactory reportParserFactory,
      ReportWriterFactory reportWriterFactory) {
    // Factories first, then mappers; the assignments are independent of each other.
    this.reportWriterFactory = reportWriterFactory;
    this.reportParserFactory = reportParserFactory;
    this.emailFileInfoMapper = emailFileInfoMapper;
    this.emailParseInfoMapper = emailParseInfoMapper;
  }
  97. /**
  98. * 解析指定邮箱指定时间范围内的邮件
  99. *
  100. * @param mailboxInfoDTO 邮箱配置信息
  101. * @param startDate 邮件起始日期(yyyy-MM-dd HH:mm:ss)
  102. * @param endDate 邮件截止日期(yyyy-MM-dd HH:mm:ss, 为null,将解析邮件日期小于等于startDate的当天邮件)
  103. * @param emailTypes 当前任务支持的邮件类型,默认支持确认单
  104. */
  105. public void parseEmail(MailboxInfoDTO mailboxInfoDTO,
  106. Date startDate, Date endDate,
  107. List<String> folderNames, List<Integer> emailTypes) {
  108. if (CollUtil.isEmpty(emailTypes)) {
  109. emailTypes = ListUtil.of(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE);
  110. }
  111. if (log.isInfoEnabled()) {
  112. log.info("开始邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO, DateUtil.format(startDate,
  113. DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  114. }
  115. Map<String, List<EmailContentInfoDTO>> emailContentMap;
  116. try {
  117. emailContentMap = this.realEmail(mailboxInfoDTO, startDate, endDate, folderNames);
  118. } catch (Exception e) {
  119. log.error("采集邮件失败 -> 邮箱配置信息:{},堆栈信息:{}", mailboxInfoDTO, ExceptionUtil.stacktraceToString(e));
  120. return;
  121. }
  122. if (MapUtil.isEmpty(emailContentMap)) {
  123. log.warn("未采集到邮件 -> 邮箱配置信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO,
  124. DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  125. return;
  126. }
  127. for (Map.Entry<String, List<EmailContentInfoDTO>> emailEntry : emailContentMap.entrySet()) {
  128. List<EmailContentInfoDTO> emailContentInfoDTOList = emailEntry.getValue();
  129. if (CollUtil.isEmpty(emailContentInfoDTOList)) {
  130. log.warn("未采集到正文或附件");
  131. continue;
  132. }
  133. log.info("开始解析邮件数据 -> 邮件主题:{},邮件日期:{}", emailContentInfoDTOList.get(0).getEmailTitle(), emailContentInfoDTOList.get(0).getEmailDate());
  134. Map<EmailContentInfoDTO, List<EmailZipFileDTO>> emailZipFileMap = MapUtil.newHashMap();
  135. for (EmailContentInfoDTO emailDto : emailContentInfoDTOList) {
  136. // 正文不用解压附件
  137. if (emailDto.getFileName() != null && emailDto.getFileName().endsWith(".html")) {
  138. emailZipFileMap.put(emailDto, ListUtil.list(false));
  139. continue;
  140. }
  141. try {
  142. List<EmailZipFileDTO> tempList = emailZipFileMap.getOrDefault(emailDto, ListUtil.list(false));
  143. tempList.addAll(this.parseZipEmail(emailDto));
  144. emailZipFileMap.put(emailDto, tempList);
  145. } catch (IOException e) {
  146. log.error("压缩包解压失败:{}", ExceptionUtil.stacktraceToString(e));
  147. EmailParseInfoDO fail = buildEmailParseInfo(null, mailboxInfoDTO.getAccount(), emailDto, emailDto.getFileSize());
  148. fail.setFailReason("压缩包解压失败");
  149. fail.setParseStatus(EmailParseStatusConst.FAIL);
  150. fail.setEmailKey(emailEntry.getKey());
  151. this.emailParseInfoMapper.insert(fail);
  152. } catch (Exception e) {
  153. log.error("堆栈信息:{}", ExceptionUtil.stacktraceToString(e));
  154. }
  155. }
  156. Iterator<Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>>> entryIterator = emailZipFileMap.entrySet().iterator();
  157. while (entryIterator.hasNext()) {
  158. Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>> entry = entryIterator.next();
  159. EmailContentInfoDTO key = entry.getKey();
  160. List<EmailZipFileDTO> dtos = entry.getValue();
  161. List<Integer> types = ListUtil.list(false);
  162. types.add(key.getEmailType());
  163. if (CollUtil.isNotEmpty(dtos)) {
  164. List<Integer> list = dtos.stream().map(EmailZipFileDTO::getEmailType).distinct().toList();
  165. CollUtil.addAllIfNotContains(types, list);
  166. }
  167. boolean flag = false;
  168. for (Integer type : types) {
  169. if (emailTypes.contains(type)) {
  170. flag = true;
  171. break;
  172. }
  173. }
  174. if (!flag) {
  175. log.warn("当前邮件{} 的类型{} 不在支持的任务类型{} 中,不用执行解析逻辑。", key, types, emailTypes);
  176. entryIterator.remove();
  177. }
  178. }
  179. // 保存相关信息 -> 邮件信息表,邮件文件表,邮件净值表,邮件规模表,基金净值表
  180. saveRelatedTable(emailEntry.getKey(), mailboxInfoDTO.getAccount(), emailZipFileMap);
  181. log.info("结束邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", emailEntry.getValue(),
  182. DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  183. }
  184. }
  185. public List<EmailZipFileDTO> parseZipEmail(EmailContentInfoDTO emailContentInfoDTO) throws IOException {
  186. List<EmailZipFileDTO> resultList = ListUtil.list(false);
  187. Integer emailType = emailContentInfoDTO.getEmailType();
  188. String filepath = emailContentInfoDTO.getFilePath();
  189. String emailTitle = emailContentInfoDTO.getEmailTitle();
  190. if (ArchiveUtil.isZip(filepath)) {
  191. handleCompressedFiles(emailTitle, filepath, ".zip", emailType, resultList);
  192. } else if (ArchiveUtil.isRAR(filepath)) {
  193. handleCompressedFiles(emailTitle, filepath, ".rar", emailType, resultList);
  194. } else {
  195. // 不是压缩包时
  196. EmailZipFileDTO dto = new EmailZipFileDTO(emailTitle, emailContentInfoDTO);
  197. resultList.add(dto);
  198. }
  199. // 文件中的类型判断
  200. if (emailType == null || !EmailTypeConst.SUPPORT_NO_OTHER_TYPES.contains(emailType)) {
  201. emailType = EmailUtil.getEmailTypeBySubject(emailContentInfoDTO.getFileName());
  202. emailContentInfoDTO.setEmailType(emailType);
  203. }
  204. if (CollUtil.isNotEmpty(resultList)) {
  205. for (EmailZipFileDTO dto : resultList) {
  206. dto.setEmailType(emailType);
  207. }
  208. if (log.isInfoEnabled()) {
  209. log.info("当前邮件{} 所有解压缩文件解压完成:{}", emailTitle, resultList);
  210. }
  211. }
  212. return resultList;
  213. }
  214. private void handleCompressedFiles(String emailTitle, String filepath, String extension,
  215. Integer emailType, List<EmailZipFileDTO> resultList) throws IOException {
  216. String destPath = getDestinationPath(filepath, extension);
  217. File destFile = new File(destPath);
  218. if (!destFile.exists()) {
  219. if (!destFile.mkdirs()) {
  220. throw new IOException("无法创建目标目录: " + destPath);
  221. }
  222. }
  223. List<String> extractedDirs;
  224. if (ArchiveUtil.isZip(filepath)) {
  225. extractedDirs = ArchiveUtil.extractCompressedFiles(filepath, destPath);
  226. } else if (ArchiveUtil.isRAR(filepath)) {
  227. extractedDirs = ArchiveUtil.extractRar5(filepath, destPath);
  228. } else {
  229. return;
  230. }
  231. for (String dir : extractedDirs) {
  232. // 如果邮件类型不满足解析条件则重新根据文件名判断
  233. if (emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) {
  234. emailType = EmailUtil.getEmailTypeBySubject(dir);
  235. }
  236. File file = new File(dir);
  237. if (file.isDirectory()) {
  238. String[] subDirs = file.list();
  239. if (subDirs != null) {
  240. for (String subDir : subDirs) {
  241. resultList.add(new EmailZipFileDTO(emailTitle, subDir, emailType));
  242. }
  243. } else {
  244. log.warn("目录 {} 下无文件", dir);
  245. }
  246. } else {
  247. resultList.add(new EmailZipFileDTO(emailTitle, dir, emailType));
  248. }
  249. }
  250. }
  251. private String getDestinationPath(String filepath, String extension) {
  252. Path path = Paths.get(filepath);
  253. String fileName = path.getFileName().toString();
  254. String baseName = fileName.substring(0, fileName.length() - extension.length());
  255. return path.getParent().resolve(baseName).toString();
  256. }
  257. public void saveRelatedTable(String emailKey, String emailAddress,
  258. Map<EmailContentInfoDTO, List<EmailZipFileDTO>> emailZipFileMap) {
  259. // python 报告解析接口结果
  260. List<ParseResult<ReportData>> dataList = ListUtil.list(false);
  261. for (Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>> entry : emailZipFileMap.entrySet()) {
  262. EmailContentInfoDTO emailDto = entry.getKey();
  263. if (emailDto.getFileName() != null && emailDto.getFileName().endsWith(".html")) {
  264. continue;
  265. }
  266. String emailTitle = emailDto.getEmailTitle();
  267. // 待解析文件数据处理,不支持已存在的文件重复解析
  268. List<EmailZipFileDTO> dtos = ListUtil.list(false);
  269. List<EmailZipFileDTO> zipFiles = entry.getValue();
  270. if (CollUtil.isEmpty(zipFiles)) {
  271. dtos.add(new EmailZipFileDTO(emailTitle, emailDto));
  272. } else {
  273. dtos.addAll(zipFiles);
  274. zipFiles.clear();
  275. }
  276. // 重新判断类型
  277. for (EmailZipFileDTO dto : dtos) {
  278. if (!EmailTypeConst.SUPPORT_NO_OTHER_TYPES.contains(dto.getEmailType())) {
  279. Integer emailType = EmailUtil.getEmailTypeBySubject(emailTitle + dto.getFilename());
  280. dto.setEmailType(emailType);
  281. }
  282. }
  283. // 如果压缩包里面既有pdf又有其他格式的文件,说明其他格式的文件是不需要解析的
  284. List<String> exts = dtos.stream().map(EmailZipFileDTO::getExtName).distinct().toList();
  285. if (exts.contains("pdf") && exts.size() > 1) {
  286. dtos.removeIf(e -> !Objects.equals("pdf", e.getExtName()));
  287. }
  288. // 移除逻辑
  289. Iterator<EmailZipFileDTO> removeIterator = dtos.iterator();
  290. while (removeIterator.hasNext()) {
  291. EmailZipFileDTO dto = removeIterator.next();
  292. String filename = dto.getFilename();
  293. // 删除复核函或基金合同
  294. if (filename.contains("复核函") || (filename.contains("基金合同") && !filename.contains("合同变更"))) {
  295. log.warn("邮件{} 中的报告{} 是复核函或基金合同,不用解析上传。", emailTitle, filename);
  296. removeIterator.remove();
  297. }
  298. // 不支持的类型
  299. Integer emailType = dto.getEmailType();
  300. if (!EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) {
  301. log.info("邮件{} 类型{} 不支持解析。", emailTitle, emailType);
  302. removeIterator.remove();
  303. }
  304. }
  305. // 数据库已存在的数据过滤(邮件主题+报告名称+附件大小,压缩包文件大小汇总)
  306. long totalSize = dtos.stream().map(EmailZipFileDTO::getFileSize).reduce(0L, Long::sum);
  307. Iterator<EmailZipFileDTO> iterator = dtos.iterator();
  308. while (iterator.hasNext()) {
  309. EmailZipFileDTO dto = iterator.next();
  310. String filename = dto.getFilename();
  311. Integer emailType = dto.getEmailType();
  312. int count = 0;
  313. if (Objects.equals(emailType, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) {
  314. // 确认单
  315. count = this.emailFileInfoMapper.getLetterFilenameSuccessCount(emailTitle, filename);
  316. } else if (Objects.equals(emailType, EmailTypeConst.REPORT_EMAIL_TYPE)) {
  317. // 定期报告
  318. count = this.emailFileInfoMapper.getAmacFilenameSuccessCount(emailTitle, filename, totalSize);
  319. } else if (Objects.equals(emailType, EmailTypeConst.REPORT_WEEKLY_TYPE)) {
  320. // 管理人周报
  321. count = this.emailFileInfoMapper.getWeeklyFilenameSuccessCount(emailTitle, filename, totalSize);
  322. } else if (Objects.equals(emailType, EmailTypeConst.REPORT_OTHER_TYPE)) {
  323. // 其他报告
  324. count = this.emailFileInfoMapper.getOtherFilenameSuccessCount(emailTitle, filename, totalSize);
  325. }
  326. if (count > 0) {
  327. iterator.remove();
  328. log.info("邮件{} 报告{} 已存在解析成功的记录,不用重新解析。", emailTitle, filename);
  329. }
  330. }
  331. if (CollUtil.isEmpty(dtos)) {
  332. log.info("邮件{} 所有文件都已经解析成功过,不能重复解析了", emailTitle);
  333. continue;
  334. }
  335. if (log.isInfoEnabled()) {
  336. log.info("邮件{} 还有报告待解析:\n{}", emailTitle, dtos);
  337. }
  338. Integer emailId = emailDto.getEmailId();
  339. EmailParseInfoDO emailParseInfoDO = this.buildEmailParseInfo(emailId, emailAddress, emailDto, totalSize);
  340. emailParseInfoDO.setEmailKey(emailKey);
  341. emailId = this.saveEmailParseInfo(emailParseInfoDO);
  342. if (emailId == null) {
  343. continue;
  344. }
  345. for (EmailZipFileDTO zipFile : dtos) {
  346. EmailFileInfoDO emailFile = this.saveEmailFileInfo(emailId, zipFile.getFilename(), zipFile.getFilepath());
  347. // 解析并保存报告
  348. ParseResult<ReportData> parseResult = this.parseReportAndHandleResult(emailTitle, emailFile, zipFile);
  349. dataList.add(parseResult);
  350. }
  351. String failReason = null;
  352. int emailParseStatus = EmailParseStatusConst.SUCCESS;
  353. // 报告邮件有一条失败就表示整个邮件解析失败
  354. if (CollUtil.isNotEmpty(dataList)) {
  355. // ai解析结果
  356. List<ReportData> aiParaseList = dataList.stream().map(ParseResult::getData)
  357. .filter(Objects::nonNull).filter(e -> Objects.equals(true, e.getAiParse())).toList();
  358. if (CollUtil.isNotEmpty(aiParaseList)) {
  359. for (ReportData data : aiParaseList) {
  360. this.emailFileInfoMapper.updateAiParseByFileId(data.getBaseInfo().getFileId(), data.getAiParse(), data.getAiFileId());
  361. }
  362. }
  363. long failNum = dataList.stream().filter(e -> !Objects.equals(EmailParseStatusConst.SUCCESS, e.getStatus())).count();
  364. if (failNum > 0) {
  365. emailParseStatus = EmailParseStatusConst.FAIL;
  366. failReason = dataList.stream().map(ParseResult::getMsg).collect(Collectors.joining(";"));
  367. }
  368. }
  369. emailParseInfoMapper.updateParseStatus(emailId, emailParseStatus, failReason);
  370. }
  371. }
  372. private ParseResult<ReportData> parseReportAndHandleResult(String emailTitle, EmailFileInfoDO emailFileInfo, EmailZipFileDTO zipFile) {
  373. Integer emailType = zipFile.getEmailType();
  374. String fileName = zipFile.getFilename();
  375. String filepath = zipFile.getFilepath();
  376. ParseResult<ReportData> result = new ParseResult<>();
  377. boolean reportFlag = emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType);
  378. if (reportFlag || StrUtil.isBlank(fileName) || fileName.endsWith(".html")) {
  379. result.setStatus(ReportParseStatus.NOT_A_REPORT.getCode());
  380. result.setMsg(StrUtil.format(ReportParseStatus.NOT_A_REPORT.getMsg(), fileName));
  381. log.error(result.getMsg());
  382. return result;
  383. }
  384. // 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报
  385. ReportType reportType = ReportParseUtils.matchReportType(emailType, fileName);
  386. if (reportType == null) {
  387. reportType = ReportParseUtils.matchReportType(emailType, emailTitle);
  388. if (log.isDebugEnabled()) {
  389. log.debug("报告{} 根据邮件主题{} 重新识别的类型是:{}", fileName, emailTitle, reportType);
  390. }
  391. }
  392. // 解析器--根据文件后缀获取对应解析器,解析不了就用AI来解析
  393. ReportParserFileType fileType;
  394. String fileSuffix = StrUtil.subAfter(fileName, ".", true);
  395. fileType = ReportParserFileType.getBySuffix(fileSuffix);
  396. // 不支持的格式
  397. if (fileType == null) {
  398. result.setStatus(ReportParseStatus.NO_SUPPORT_TEMPLATE.getCode());
  399. result.setMsg(StrUtil.format(ReportParseStatus.NO_SUPPORT_TEMPLATE.getMsg(), fileName));
  400. log.error(result.getMsg());
  401. return result;
  402. }
  403. // 不是定期报告的判断逻辑放在不支持的格式下面
  404. if (reportType == null) {
  405. result.setStatus(ReportParseStatus.NOT_A_REPORT.getCode());
  406. result.setMsg(StrUtil.format(ReportParseStatus.NOT_A_REPORT.getMsg(), fileName));
  407. log.error(result.getMsg());
  408. return result;
  409. }
  410. Integer fileId = emailFileInfo.getId();
  411. // 首页和尾页转为png图片,首页用来识别基金名称和基金代码、尾页用来识别印章和联系人
  412. List<String> images = ListUtil.list(true);
  413. if (Objects.equals(ReportParserFileType.PDF, fileType)) {
  414. try {
  415. String output = FileUtil.getParent(filepath, 1) + File.separator + "image";
  416. images = PdfUtil.convertFirstAndLastPagesToPng(filepath, FileUtil.file(output), 300);
  417. if (log.isDebugEnabled()) {
  418. log.debug("报告[{}] 生成的图片地址是:\n{}", fileName, images);
  419. }
  420. } catch (Exception e) {
  421. log.warn("报告[{}] 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
  422. }
  423. } else if (Objects.equals(ReportParserFileType.IMG, fileType)) {
  424. try {
  425. String outputFile = PdfUtil.compressAndSave(filepath);
  426. images.add(outputFile);
  427. } catch (IOException e) {
  428. log.error("报告{} 图片压缩失败,{}", fileName, ExceptionUtil.stacktraceToString(e));
  429. }
  430. }
  431. // 不支持解析的格式文件
  432. boolean notSupportFile = false;
  433. // 解析报告
  434. ReportData reportData = null;
  435. ReportParserParams params = new ReportParserParams(fileId, fileName, filepath, reportType);
  436. StopWatch parserWatch = new StopWatch();
  437. parserWatch.start();
  438. try {
  439. if (reportType != ReportType.OTHER && reportType != ReportType.WEEKLY) {
  440. ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, fileType);
  441. reportData = instance.parse(params);
  442. result.setStatus(1);
  443. result.setMsg("报告解析成功");
  444. result.setData(reportData);
  445. } else {
  446. if (log.isInfoEnabled()) {
  447. log.info("报告{} 是周报或其他类型,直接用AI解析器解析", fileName);
  448. }
  449. }
  450. } catch (ReportParseException e) {
  451. log.error("解析失败:{}", StrUtil.format(e.getMsg(), fileName));
  452. result.setStatus(e.getCode());
  453. result.setMsg(StrUtil.format(e.getMsg(), fileName));
  454. if (e instanceof NotSupportReportException) {
  455. notSupportFile = true;
  456. }
  457. } catch (Exception e) {
  458. log.error("解析错误:{}", ExceptionUtil.stacktraceToString(e));
  459. result.setStatus(ReportParseStatus.PARSE_FAIL.getCode());
  460. result.setMsg(StrUtil.format(ReportParseStatus.PARSE_FAIL.getMsg(), e.getMessage()));
  461. } finally {
  462. // 如果解析结果是空的就用AI工具解析一次
  463. if (reportData == null && !notSupportFile) {
  464. if (reportType == ReportType.QUARTERLY || reportType == ReportType.ANNUALLY) {
  465. if (log.isInfoEnabled()) {
  466. log.info("报告{} 开始AI解析......", fileName);
  467. }
  468. } else if (CollUtil.isNotEmpty(images)) {
  469. filepath = images.get(0);
  470. if (log.isInfoEnabled()) {
  471. log.info("报告{} 用首页图片{} 开始AI解析......", fileName, filepath);
  472. }
  473. }
  474. try {
  475. params = new ReportParserParams(fileId, fileName, filepath, reportType);
  476. ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, ReportParserFileType.AI);
  477. reportData = instance.parse(params);
  478. result.setStatus(1);
  479. result.setMsg("报告解析成功--AI");
  480. result.setData(reportData);
  481. } catch (ReportParseException e) {
  482. log.error("AI解析失败:{}", StrUtil.format(e.getMsg(), fileName));
  483. result.setStatus(e.getCode());
  484. result.setMsg(StrUtil.format(e.getMsg(), fileName));
  485. } catch (Exception e) {
  486. log.error("AI解析错误:{}", ExceptionUtil.stacktraceToString(e));
  487. result.setStatus(ReportParseStatus.PARSE_FAIL.getCode());
  488. result.setMsg(StrUtil.format(ReportParseStatus.PARSE_FAIL.getMsg(), e.getMessage()));
  489. }
  490. if (log.isInfoEnabled()) {
  491. log.info("报告{} AI解析结束!结果是:{}", fileName, reportData);
  492. }
  493. }
  494. // ocr信息提取
  495. this.ocrReportData(reportType, reportData, fileName, images);
  496. parserWatch.stop();
  497. if (log.isInfoEnabled()) {
  498. log.info("报告{}解析结果为{},耗时{}ms", fileName, reportData, parserWatch.getTotalTimeMillis());
  499. }
  500. }
  501. // 保存报告解析结果
  502. this.saveReportData(reportData, reportType, fileName);
  503. return result;
  504. }
  505. /**
  506. * ocr 提取信息(包括首页的基金名称或报告日期,尾页的印章或联系人等信息)
  507. *
  508. * @param reportData 报告解析结果
  509. * @param fileName 报告名称
  510. * @param images 报告的收益和尾页png图片
  511. */
  512. private void ocrReportData(ReportType reportType, ReportData reportData, String fileName, List<String> images) {
  513. if (reportData == null || CollUtil.isEmpty(images)) {
  514. return;
  515. }
  516. OCRParseData parseRes = null;
  517. // 报告才识别尾页的印章和联系人,确认单不识别尾页
  518. if (ReportType.LETTER != reportType) {
  519. try {
  520. // 首页和尾页相等时只读首页
  521. String imageUrl = images.size() == 1 ? images.get(0) : images.get(1);
  522. parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, imageUrl);
  523. } catch (Exception e) {
  524. log.error("报告{} OCR识别印章和联系人出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
  525. }
  526. // ocr识别尾页是否包含印章和联系人信息
  527. if (parseRes != null) {
  528. if (reportData.getBaseInfo() != null) {
  529. reportData.getBaseInfo().setWithSeals(parseRes.getWithSeals());
  530. reportData.getBaseInfo().setWithContacts(parseRes.getWithContacts());
  531. if (fileName.contains("用印") && !Objects.equals(true, reportData.getBaseInfo().getWithSeals())) {
  532. reportData.getBaseInfo().setWithSeals(true);
  533. }
  534. }
  535. }
  536. }
  537. // 用首页识别基金名称、产品代码和报告日期
  538. if ((reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundName()))
  539. || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundCode()))) {
  540. // 首页和尾页不相等时解析首页的数据
  541. if (images.size() != 1) {
  542. try {
  543. parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0));
  544. } catch (Exception e) {
  545. log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
  546. }
  547. }
  548. // ocr 识别的结果
  549. if (reportData.getFundInfo() != null && parseRes != null) {
  550. if (StrUtil.isBlank(reportData.getFundInfo().getFundName())) {
  551. reportData.getFundInfo().setFundName(parseRes.getFundName());
  552. }
  553. if (StrUtil.isBlank(reportData.getFundInfo().getFundCode())) {
  554. reportData.getFundInfo().setFundCode(parseRes.getFundCode());
  555. }
  556. }
  557. }
  558. }
  559. /**
  560. * 保存报告解析结果
  561. *
  562. * @param reportData 报告解析结果
  563. * @param reportType 报告类型
  564. * @param fileName 报告名称
  565. */
  566. private void saveReportData(ReportData reportData, ReportType reportType, String fileName) {
  567. if (reportData == null) {
  568. return;
  569. }
  570. StopWatch writeWatch = new StopWatch();
  571. writeWatch.start();
  572. try {
  573. ReportWriter<ReportData> instance = this.reportWriterFactory.getInstance(reportType);
  574. instance.write(reportData);
  575. } catch (Exception e) {
  576. log.error("报告{}结果保存失败\n{}", fileName, ExceptionUtil.stacktraceToString(e));
  577. } finally {
  578. writeWatch.stop();
  579. if (log.isInfoEnabled()) {
  580. log.info("报告{}解析结果保存完成,耗时{}ms", fileName, writeWatch.getTotalTimeMillis());
  581. }
  582. }
  583. }
  584. private EmailFileInfoDO saveEmailFileInfo(Integer emailId, String fileName, String filePath) {
  585. EmailFileInfoDO emailFileInfoDO = buildEmailFileInfoDO(emailId, fileName, filePath);
  586. emailFileInfoDO.setAiFileId(null);
  587. if (emailFileInfoDO.getId() != null) {
  588. emailFileInfoMapper.updateTimeById(null, new Date());
  589. return emailFileInfoDO;
  590. }
  591. emailFileInfoMapper.insert(emailFileInfoDO);
  592. return emailFileInfoDO;
  593. }
  594. private EmailFileInfoDO buildEmailFileInfoDO(Integer emailId, String fileName, String filePath) {
  595. EmailFileInfoDO emailFileInfoDO = new EmailFileInfoDO();
  596. emailFileInfoDO.setId(null);
  597. emailFileInfoDO.setEmailId(emailId);
  598. emailFileInfoDO.setFileName(fileName);
  599. emailFileInfoDO.setFilePath(filePath);
  600. emailFileInfoDO.setIsvalid(1);
  601. emailFileInfoDO.setCreatorId(0);
  602. emailFileInfoDO.setCreateTime(new Date());
  603. emailFileInfoDO.setUpdaterId(0);
  604. emailFileInfoDO.setUpdateTime(new Date());
  605. return emailFileInfoDO;
  606. }
  607. private Integer saveEmailParseInfo(EmailParseInfoDO emailParseInfoDO) {
  608. if (emailParseInfoDO == null) {
  609. return null;
  610. }
  611. // 重新邮件功能 -> 修改解析时间和更新时间
  612. if (emailParseInfoDO.getId() != null) {
  613. emailParseInfoMapper.updateParseTime(emailParseInfoDO.getId(), emailParseInfoDO.getParseDate());
  614. return emailParseInfoDO.getId();
  615. }
  616. emailParseInfoMapper.insert(emailParseInfoDO);
  617. return emailParseInfoDO.getId();
  618. }
  619. private EmailParseInfoDO buildEmailParseInfo(Integer emailId, String emailAddress,
  620. EmailContentInfoDTO emailContentInfoDTO, long totalSize) {
  621. EmailParseInfoDO emailParseInfoDO = new EmailParseInfoDO();
  622. emailParseInfoDO.setId(emailId);
  623. emailParseInfoDO.setSenderEmail(emailContentInfoDTO.getSenderEmail());
  624. emailParseInfoDO.setEmail(emailAddress);
  625. emailParseInfoDO.setEmailDate(DateUtil.parse(emailContentInfoDTO.getEmailDate(), DateConst.YYYY_MM_DD_HH_MM_SS));
  626. emailParseInfoDO.setParseDate(emailContentInfoDTO.getParseDate() == null ? null : DateUtil.parseDate(emailContentInfoDTO.getParseDate()));
  627. emailParseInfoDO.setEmailTitle(emailContentInfoDTO.getEmailTitle());
  628. emailParseInfoDO.setEmailType(emailContentInfoDTO.getEmailType());
  629. emailParseInfoDO.setParseStatus(EmailParseStatusConst.SUCCESS);
  630. emailParseInfoDO.setAttrSize(totalSize);
  631. emailParseInfoDO.setIsvalid(1);
  632. emailParseInfoDO.setCreatorId(0);
  633. emailParseInfoDO.setCreateTime(new Date());
  634. emailParseInfoDO.setUpdaterId(0);
  635. emailParseInfoDO.setUpdateTime(new Date());
  636. return emailParseInfoDO;
  637. }
  638. /**
  639. * 读取邮件
  640. *
  641. * @param mailboxInfoDTO 邮箱配置信息
  642. * @param startDate 邮件起始日期
  643. * @param endDate 邮件截止日期(为null,将解析邮件日期小于等于startDate的当天邮件)
  644. * @return 读取到的邮件信息
  645. * @throws Exception 异常信息
  646. */
  647. private Map<String, List<EmailContentInfoDTO>> realEmail(MailboxInfoDTO mailboxInfoDTO,
  648. Date startDate, Date endDate,
  649. List<String> folderNames) throws Exception {
  650. if (CollUtil.isEmpty(folderNames)) {
  651. folderNames = ListUtil.toList("INBOX");
  652. }
  653. Store store = EmailUtil.getStoreNew(mailboxInfoDTO);
  654. if (store == null) {
  655. return MapUtil.newHashMap(4);
  656. }
  657. Map<String, List<EmailContentInfoDTO>> result = MapUtil.newHashMap(128);
  658. try {
  659. if (log.isInfoEnabled()) {
  660. Folder[] list = store.getDefaultFolder().list("*");
  661. List<String> names = Arrays.stream(list).map(Folder::getFullName).toList();
  662. log.info("获取所有邮箱文件夹:{}", names);
  663. }
  664. for (String folderName : folderNames) {
  665. try {
  666. Map<String, List<EmailContentInfoDTO>> temp = this.getFolderEmail(mailboxInfoDTO,
  667. startDate, endDate, store, folderName);
  668. if (MapUtil.isNotEmpty(temp)) {
  669. result.putAll(temp);
  670. }
  671. } catch (Exception e) {
  672. log.warn("文件夹{} 邮件获取失败:{}", folderName, ExceptionUtil.stacktraceToString(e));
  673. }
  674. }
  675. } catch (Exception e) {
  676. log.error("邮件获取失败:{}", ExceptionUtil.stacktraceToString(e));
  677. } finally {
  678. store.close();
  679. }
  680. return result;
  681. }
  682. private Map<String, List<EmailContentInfoDTO>> getFolderEmail(MailboxInfoDTO mailboxInfoDTO,
  683. Date startDate, Date endDate,
  684. Store store, String folderName) throws MessagingException {
  685. // 默认读取收件箱的邮件
  686. Folder folder = store.getFolder(folderName);
  687. folder.open(this.readWriteSeen ? Folder.READ_WRITE : Folder.READ_ONLY);
  688. Message[] messages = getEmailMessage(folder, mailboxInfoDTO.getProtocol(), startDate);
  689. if (messages == null || messages.length == 0) {
  690. log.warn("{} 获取不到邮件 -> 邮箱信息:{},开始时间:{},结束时间:{}", folderName, mailboxInfoDTO, startDate, endDate);
  691. return MapUtil.newHashMap();
  692. }
  693. Map<String, List<EmailContentInfoDTO>> emailMessageMap = MapUtil.newHashMap();
  694. for (Message message : messages) {
  695. long start = System.currentTimeMillis();
  696. List<EmailContentInfoDTO> emailContentInfoDTOList = CollUtil.newArrayList();
  697. String uuidKey = UUID.randomUUID().toString().replaceAll("-", "");
  698. Integer emailType;
  699. String senderEmail;
  700. String emailTitle = message.getSubject();
  701. if (this.readWriteSeen && isMessageRead(message)) {
  702. log.warn("{} 邮件{} 已读,不用重新下载解析!", folderName, emailTitle);
  703. continue;
  704. }
  705. try {
  706. Date emailDate = message.getSentDate();
  707. String emailDateStr = DateUtil.format(emailDate, DateConst.YYYY_MM_DD_HH_MM_SS);
  708. if (log.isInfoEnabled()) {
  709. log.info("{} 邮件{} 数据获取中,邮件时间:{}", folderName, emailTitle, emailDateStr);
  710. }
  711. boolean isNotParseConditionSatisfied = emailDate == null
  712. || (endDate != null && emailDate.compareTo(endDate) > 0)
  713. || (startDate != null && emailDate.compareTo(startDate) < 0);
  714. if (isNotParseConditionSatisfied) {
  715. String st = DateUtil.formatDateTime(startDate);
  716. String ed = DateUtil.formatDateTime(endDate);
  717. log.warn("{} 邮件[{}]日期{}不在区间内【{} ~ {}】", folderName, emailTitle, emailDateStr, st, ed);
  718. continue;
  719. }
  720. senderEmail = getSenderEmail(message);
  721. emailType = EmailUtil.getEmailTypeBySubject(emailTitle);
  722. if (emailType == null) {
  723. log.warn("{} 邮件不满足解析条件 -> 邮件主题:{},邮件日期:{}", folderName, emailTitle, emailDateStr);
  724. continue;
  725. }
  726. if (log.isInfoEnabled()) {
  727. log.info("{} 邮件{} 基本信息获取完成,开始下载附件!邮件日期:{}", folderName, emailTitle, emailDateStr);
  728. }
  729. Object content = message.getContent();
  730. if (content instanceof Multipart multipart) {
  731. this.reMultipart(mailboxInfoDTO.getAccount(), emailTitle, emailDate, multipart, emailContentInfoDTOList);
  732. } else if (content instanceof Part part) {
  733. this.rePart(mailboxInfoDTO.getAccount(), emailTitle, emailDate, part, emailContentInfoDTOList);
  734. } else {
  735. log.warn("{} 不支持的邮件数据 {}", folderName, emailTitle);
  736. }
  737. if (CollUtil.isNotEmpty(emailContentInfoDTOList)) {
  738. emailContentInfoDTOList.forEach(e -> {
  739. e.setEmailType(emailType);
  740. e.setSenderEmail(senderEmail);
  741. });
  742. emailMessageMap.put(uuidKey, emailContentInfoDTOList);
  743. }
  744. if (log.isInfoEnabled()) {
  745. log.info("{} 邮件{} 下载完成,总计耗时{} ms,文件内容如下\n {}", folderName,
  746. emailTitle, System.currentTimeMillis() - start, emailContentInfoDTOList);
  747. }
  748. } catch (Exception e) {
  749. log.error("{} 获取邮箱的邮件{} 报错,堆栈信息:{}", folderName, emailTitle, ExceptionUtil.stacktraceToString(e));
  750. }
  751. }
  752. if (this.readWriteSeen) {
  753. // 设置已读标志
  754. folder.setFlags(messages, new Flags(Flags.Flag.SEEN), true);
  755. }
  756. folder.close(false);
  757. return emailMessageMap;
  758. }
  759. private void rePart(String account, String subject, Date sendDate, Part part,
  760. List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
  761. String fileName = EmailUtil.decodeFileName(part);
  762. if (StrUtil.isBlank(fileName)) {
  763. log.warn("邮件{} 附件文件名是空的,不做下载!", subject);
  764. return;
  765. }
  766. if (fileName.contains("=?")) {
  767. fileName = MimeUtility.decodeText(fileName);
  768. }
  769. String disposition = part.getDisposition();
  770. String contentType = part.getContentType();
  771. boolean attachmentFlag = StrUtil.endWithAny(fileName, ".zip", ".rar", ".pdf", ".png", ".jpg", ".docx");
  772. boolean isAttachment = attachmentFlag
  773. || Part.ATTACHMENT.equalsIgnoreCase(disposition)
  774. || (contentType != null && attachmentMimePrefixes.stream().anyMatch(prefix ->
  775. StrUtil.startWithIgnoreCase(contentType, prefix)
  776. ));
  777. if (!isAttachment) {
  778. log.warn("邮件 {} 未检测到pdf/zip/rar/png/jpg/docx类型的附件 (fileName={}, disposition={}, contentType={})",
  779. subject, fileName, disposition, contentType);
  780. return;
  781. }
  782. String emailDateStr = DateUtil.format(sendDate, DateConst.YYYYMMDD);
  783. String filePath = path + File.separator + account + File.separator + emailDateStr + File.separator;
  784. // 压缩包重名时的后面的压缩包会覆盖前面压缩包的问题(不考虑普通文件)
  785. String emailDate = DateUtil.format(sendDate, DateConst.YYYYMMDDHHMMSS24);
  786. String realName = (fileName.endsWith(".zip") || fileName.endsWith(".rar")) ? emailDate + fileName : fileName;
  787. File saveFile = FileUtil.file(filePath + realName);
  788. if (!saveFile.exists()) {
  789. if (!saveFile.getParentFile().exists()) {
  790. boolean mkdirs = saveFile.getParentFile().mkdirs();
  791. if (!mkdirs) {
  792. log.warn("file path mkdir failed.");
  793. }
  794. }
  795. try (InputStream is = part.getInputStream()) {
  796. Files.copy(is, saveFile.toPath());
  797. }
  798. } else {
  799. FileUtil.del(saveFile);
  800. try (InputStream is = part.getInputStream()) {
  801. Files.copy(is, saveFile.toPath());
  802. }
  803. }
  804. EmailContentInfoDTO emailContentInfoDTO = new EmailContentInfoDTO();
  805. emailContentInfoDTO.setFileName(fileName);
  806. emailContentInfoDTO.setFileSize(part.getSize());
  807. emailContentInfoDTO.setFilePath(saveFile.getAbsolutePath());
  808. emailContentInfoDTO.setEmailAddress(account);
  809. emailContentInfoDTO.setEmailTitle(subject);
  810. emailContentInfoDTO.setEmailDate(DateUtil.format(sendDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  811. emailContentInfoDTOList.add(emailContentInfoDTO);
  812. }
  813. private void reMultipart(String account, String subject, Date emailDate, Multipart multipart,
  814. List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
  815. for (int i = 0; i < multipart.getCount(); i++) {
  816. Part bodyPart = multipart.getBodyPart(i);
  817. Object content = bodyPart.getContent();
  818. if (content instanceof String) {
  819. if (log.isDebugEnabled()) {
  820. log.debug("邮件{} 获取的正文不做解析,内容是 {}", subject, content);
  821. }
  822. continue;
  823. }
  824. if (content instanceof Multipart mp) {
  825. this.reMultipart(account, subject, emailDate, mp, emailContentInfoDTOList);
  826. } else {
  827. this.rePart(account, subject, emailDate, bodyPart, emailContentInfoDTOList);
  828. }
  829. }
  830. }
  831. private String getSenderEmail(Message message) {
  832. Address[] senderAddress;
  833. try {
  834. senderAddress = message.getFrom();
  835. if (senderAddress == null || senderAddress.length == 0) {
  836. return null;
  837. }
  838. // 此时的address是含有编码(MIME编码方式)后的文本和实际的邮件地址
  839. String address = "";
  840. for (Address from : senderAddress) {
  841. if (StrUtil.isNotBlank(from.toString())) {
  842. address = from.toString();
  843. break;
  844. }
  845. }
  846. // 正则表达式匹配邮件地址
  847. Pattern pattern = Pattern.compile("<(\\S+)>");
  848. Matcher matcher = pattern.matcher(address);
  849. if (matcher.find()) {
  850. return matcher.group(1);
  851. }
  852. } catch (MessagingException e) {
  853. log.error(e.getMessage(), e);
  854. }
  855. return null;
  856. }
  857. private Message[] getEmailMessage(Folder folder, String protocol, Date startDate) {
  858. try {
  859. if (protocol.contains("imap")) {
  860. // 获取邮件日期大于等于startDate的邮件(搜索条件只支持按天)
  861. SearchTerm startDateTerm = new ReceivedDateTerm(ComparisonTerm.GE, startDate);
  862. return folder.search(startDateTerm);
  863. } else {
  864. return folder.getMessages();
  865. }
  866. } catch (MessagingException e) {
  867. throw new RuntimeException(e);
  868. }
  869. }
  870. /**
  871. * 检查邮件是否已读
  872. *
  873. * @param message 邮件对象
  874. * @return true表示已读,false表示未读
  875. * @throws MessagingException 如果访问邮件标志时出错
  876. */
  877. private boolean isMessageRead(Message message) throws MessagingException {
  878. // 获取邮件的所有标志
  879. Flags flags = message.getFlags();
  880. // 检查是否包含 SEEN 标志
  881. return flags.contains(Flags.Flag.SEEN);
  882. }
  883. }