EmailParseService.java 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889
  1. package com.smppw.modaq.domain.service;
  2. import cn.hutool.core.collection.CollUtil;
  3. import cn.hutool.core.collection.ListUtil;
  4. import cn.hutool.core.date.DateUtil;
  5. import cn.hutool.core.exceptions.ExceptionUtil;
  6. import cn.hutool.core.io.FileUtil;
  7. import cn.hutool.core.map.MapUtil;
  8. import cn.hutool.core.util.StrUtil;
  9. import com.smppw.modaq.application.components.OCRReportParser;
  10. import com.smppw.modaq.application.components.ReportParseUtils;
  11. import com.smppw.modaq.application.components.report.parser.ReportParser;
  12. import com.smppw.modaq.application.components.report.parser.ReportParserFactory;
  13. import com.smppw.modaq.application.components.report.writer.ReportWriter;
  14. import com.smppw.modaq.application.components.report.writer.ReportWriterFactory;
  15. import com.smppw.modaq.application.util.EmailUtil;
  16. import com.smppw.modaq.common.conts.DateConst;
  17. import com.smppw.modaq.common.conts.EmailParseStatusConst;
  18. import com.smppw.modaq.common.conts.EmailTypeConst;
  19. import com.smppw.modaq.common.enums.ReportParseStatus;
  20. import com.smppw.modaq.common.enums.ReportParserFileType;
  21. import com.smppw.modaq.common.enums.ReportType;
  22. import com.smppw.modaq.common.exception.NotSupportReportException;
  23. import com.smppw.modaq.common.exception.ReportParseException;
  24. import com.smppw.modaq.domain.dto.EmailContentInfoDTO;
  25. import com.smppw.modaq.domain.dto.EmailZipFileDTO;
  26. import com.smppw.modaq.domain.dto.MailboxInfoDTO;
  27. import com.smppw.modaq.domain.dto.report.OCRParseData;
  28. import com.smppw.modaq.domain.dto.report.ParseResult;
  29. import com.smppw.modaq.domain.dto.report.ReportData;
  30. import com.smppw.modaq.domain.dto.report.ReportParserParams;
  31. import com.smppw.modaq.domain.entity.EmailFileInfoDO;
  32. import com.smppw.modaq.domain.entity.EmailParseInfoDO;
  33. import com.smppw.modaq.domain.mapper.EmailFileInfoMapper;
  34. import com.smppw.modaq.domain.mapper.EmailParseInfoMapper;
  35. import com.smppw.modaq.infrastructure.util.ArchiveUtil;
  36. import com.smppw.modaq.infrastructure.util.DateUtils;
  37. import com.smppw.modaq.infrastructure.util.PdfUtil;
  38. import jakarta.mail.*;
  39. import jakarta.mail.internet.MimeUtility;
  40. import jakarta.mail.search.ComparisonTerm;
  41. import jakarta.mail.search.ReceivedDateTerm;
  42. import jakarta.mail.search.SearchTerm;
  43. import org.slf4j.Logger;
  44. import org.slf4j.LoggerFactory;
  45. import org.springframework.beans.factory.annotation.Value;
  46. import org.springframework.stereotype.Service;
  47. import org.springframework.util.StopWatch;
  48. import java.io.File;
  49. import java.io.IOException;
  50. import java.io.InputStream;
  51. import java.nio.file.Files;
  52. import java.nio.file.Path;
  53. import java.nio.file.Paths;
  54. import java.util.*;
  55. import java.util.regex.Matcher;
  56. import java.util.regex.Pattern;
  57. import java.util.stream.Collectors;
  58. /**
  59. * @author mozuwen
  60. * @date 2024-09-04
  61. * @description 邮件解析服务
  62. */
  63. @Service
  64. public class EmailParseService {
  // public static final int stepSize = 10000;
  private static final Logger log = LoggerFactory.getLogger(EmailParseService.class);
  // Supported MIME types (extend as needed): PDF documents and ZIP/RAR archives.
  // NOTE(review): presumably matched against attachment content types; the consumer is not in this part of the file.
  private static final Set<String> attachmentMimePrefixes = Set.of(
      "application/pdf",
      "application/zip",
      "application/x-zip-compressed",
      "application/rar",
      "application/x-rar-compressed"
      // add other types as needed...
  );
  // private final EmailFieldMappingMapper emailFieldMapper;
  // Mapper for e-mail parse records (insert, parse-time and parse-status updates).
  private final EmailParseInfoMapper emailParseInfoMapper;
  // Mapper for the file records that belong to a parsed e-mail.
  private final EmailFileInfoMapper emailFileInfoMapper;
  /* Factories for parsing reports and for writing the parsed results to storage. */
  private final ReportParserFactory reportParserFactory;
  private final ReportWriterFactory reportWriterFactory;
  // Injected from the "email.file.path" property.
  @Value("${email.file.path}")
  private String path;
  // Injected from the "email.report.ocr-parser-url" property; passed to OCRReportParser.
  @Value("${email.report.ocr-parser-url}")
  private String ocrParserUrl;
  /**
   * Creates the service with the collaborators it needs.
   *
   * @param emailParseInfoMapper mapper for e-mail parse records
   * @param emailFileInfoMapper mapper for e-mail file records
   * @param reportParserFactory factory that supplies report parsers
   * @param reportWriterFactory factory that supplies report writers
   */
  public EmailParseService(
      EmailParseInfoMapper emailParseInfoMapper,
      EmailFileInfoMapper emailFileInfoMapper,
      ReportParserFactory reportParserFactory,
      ReportWriterFactory reportWriterFactory) {
    // Report handling.
    this.reportWriterFactory = reportWriterFactory;
    this.reportParserFactory = reportParserFactory;
    // Persistence.
    this.emailFileInfoMapper = emailFileInfoMapper;
    this.emailParseInfoMapper = emailParseInfoMapper;
  }
  95. /**
  96. * 解析指定邮箱指定时间范围内的邮件
  97. *
  98. * @param mailboxInfoDTO 邮箱配置信息
  99. * @param startDate 邮件起始日期(yyyy-MM-dd HH:mm:ss)
  100. * @param endDate 邮件截止日期(yyyy-MM-dd HH:mm:ss, 为null,将解析邮件日期小于等于startDate的当天邮件)
  101. * @param emailTypes 当前任务支持的邮件类型,默认支持确认单
  102. */
  103. public void parseEmail(MailboxInfoDTO mailboxInfoDTO,
  104. Date startDate, Date endDate,
  105. List<String> folderNames, List<Integer> emailTypes) {
  106. if (CollUtil.isEmpty(emailTypes)) {
  107. emailTypes = ListUtil.of(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE);
  108. }
  109. if (log.isInfoEnabled()) {
  110. log.info("开始邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO, DateUtil.format(startDate,
  111. DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  112. }
  113. Map<String, List<EmailContentInfoDTO>> emailContentMap;
  114. try {
  115. emailContentMap = this.realEmail(mailboxInfoDTO, startDate, endDate, folderNames);
  116. } catch (Exception e) {
  117. log.error("采集邮件失败 -> 邮箱配置信息:{},堆栈信息:{}", mailboxInfoDTO, ExceptionUtil.stacktraceToString(e));
  118. return;
  119. }
  120. if (MapUtil.isEmpty(emailContentMap)) {
  121. log.warn("未采集到邮件 -> 邮箱配置信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO,
  122. DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  123. return;
  124. }
  125. for (Map.Entry<String, List<EmailContentInfoDTO>> emailEntry : emailContentMap.entrySet()) {
  126. List<EmailContentInfoDTO> emailContentInfoDTOList = emailEntry.getValue();
  127. if (CollUtil.isEmpty(emailContentInfoDTOList)) {
  128. log.warn("未采集到正文或附件");
  129. continue;
  130. }
  131. log.info("开始解析邮件数据 -> 邮件主题:{},邮件日期:{}", emailContentInfoDTOList.get(0).getEmailTitle(), emailContentInfoDTOList.get(0).getEmailDate());
  132. Map<EmailContentInfoDTO, List<EmailZipFileDTO>> emailZipFileMap = MapUtil.newHashMap();
  133. for (EmailContentInfoDTO emailContentInfoDTO : emailContentInfoDTOList) {
  134. // 正文不用解压附件
  135. if (emailContentInfoDTO.getFileName() != null && emailContentInfoDTO.getFileName().endsWith(".html")) {
  136. emailZipFileMap.put(emailContentInfoDTO, ListUtil.empty());
  137. continue;
  138. }
  139. try {
  140. List<EmailZipFileDTO> fundNavDTOList = this.parseZipEmail(emailContentInfoDTO);
  141. emailZipFileMap.put(emailContentInfoDTO, fundNavDTOList);
  142. } catch (IOException e) {
  143. log.error("压缩包解压失败:{}", ExceptionUtil.stacktraceToString(e));
  144. EmailParseInfoDO fail = buildEmailParseInfo(null, mailboxInfoDTO.getAccount(), emailContentInfoDTO);
  145. fail.setFailReason("压缩包解压失败");
  146. fail.setParseStatus(EmailParseStatusConst.FAIL);
  147. fail.setEmailKey(emailEntry.getKey());
  148. this.emailParseInfoMapper.insert(fail);
  149. } catch (Exception e) {
  150. log.error("堆栈信息:{}", ExceptionUtil.stacktraceToString(e));
  151. }
  152. }
  153. Iterator<Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>>> entryIterator = emailZipFileMap.entrySet().iterator();
  154. while (entryIterator.hasNext()) {
  155. Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>> entry = entryIterator.next();
  156. EmailContentInfoDTO key = entry.getKey();
  157. List<EmailZipFileDTO> dtos = entry.getValue();
  158. List<Integer> types = ListUtil.list(false);
  159. types.add(key.getEmailType());
  160. if (CollUtil.isNotEmpty(dtos)) {
  161. List<Integer> list = dtos.stream().map(EmailZipFileDTO::getEmailType).distinct().toList();
  162. CollUtil.addAllIfNotContains(types, list);
  163. }
  164. boolean flag = false;
  165. for (Integer type : types) {
  166. if (emailTypes.contains(type)) {
  167. flag = true;
  168. break;
  169. }
  170. }
  171. if (!flag) {
  172. log.warn("当前邮件{} 的类型{} 不在支持的任务类型{} 中,不用执行解析逻辑。", key, types, emailTypes);
  173. entryIterator.remove();
  174. }
  175. }
  176. // 保存相关信息 -> 邮件信息表,邮件文件表,邮件净值表,邮件规模表,基金净值表
  177. saveRelatedTable(emailEntry.getKey(), mailboxInfoDTO.getAccount(), emailZipFileMap);
  178. log.info("结束邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", emailEntry.getValue(),
  179. DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  180. }
  181. }
  182. public List<EmailZipFileDTO> parseZipEmail(EmailContentInfoDTO emailContentInfoDTO) throws IOException {
  183. List<EmailZipFileDTO> resultList = ListUtil.list(false);
  184. Integer emailType = emailContentInfoDTO.getEmailType();
  185. String filepath = emailContentInfoDTO.getFilePath();
  186. String emailTitle = emailContentInfoDTO.getEmailTitle();
  187. int fileSize = emailContentInfoDTO.getFileSize();
  188. if (ArchiveUtil.isZip(filepath)) {
  189. handleCompressedFiles(emailTitle, filepath, ".zip", emailType, fileSize, resultList);
  190. } else if (ArchiveUtil.isRAR(filepath)) {
  191. handleCompressedFiles(emailTitle, filepath, ".rar", emailType, fileSize, resultList);
  192. }
  193. // 文件中的类型判断
  194. if (emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) {
  195. emailType = EmailUtil.getEmailTypeBySubject(emailContentInfoDTO.getFileName());
  196. emailContentInfoDTO.setEmailType(emailType);
  197. }
  198. if (CollUtil.isNotEmpty(resultList)) {
  199. for (EmailZipFileDTO dto : resultList) {
  200. dto.setEmailType(emailType);
  201. }
  202. if (log.isInfoEnabled()) {
  203. log.info("当前邮件{} 所有解压缩文件解压完成:{}", emailTitle, resultList);
  204. }
  205. }
  206. return resultList;
  207. }
  208. private void handleCompressedFiles(String emailTitle, String filepath, String extension,
  209. Integer emailType, int fileSize, List<EmailZipFileDTO> resultList) throws IOException {
  210. String destPath = getDestinationPath(filepath, extension);
  211. File destFile = new File(destPath);
  212. if (!destFile.exists()) {
  213. if (!destFile.mkdirs()) {
  214. throw new IOException("无法创建目标目录: " + destPath);
  215. }
  216. }
  217. List<String> extractedDirs;
  218. if (ArchiveUtil.isZip(filepath)) {
  219. extractedDirs = ArchiveUtil.extractCompressedFiles(filepath, destPath);
  220. } else if (ArchiveUtil.isRAR(filepath)) {
  221. extractedDirs = ArchiveUtil.extractRar5(filepath, destPath);
  222. } else {
  223. return;
  224. }
  225. for (String dir : extractedDirs) {
  226. // 如果邮件类型不满足解析条件则重新根据文件名判断
  227. if (emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) {
  228. emailType = EmailUtil.getEmailTypeBySubject(dir);
  229. }
  230. File file = new File(dir);
  231. if (file.isDirectory()) {
  232. String[] subDirs = file.list();
  233. if (subDirs != null) {
  234. for (String subDir : subDirs) {
  235. resultList.add(new EmailZipFileDTO(emailTitle, subDir, fileSize, emailType));
  236. }
  237. } else {
  238. log.warn("目录 {} 下无文件", dir);
  239. }
  240. } else {
  241. resultList.add(new EmailZipFileDTO(emailTitle, dir, fileSize, emailType));
  242. }
  243. }
  244. }
  245. private String getDestinationPath(String filepath, String extension) {
  246. Path path = Paths.get(filepath);
  247. String fileName = path.getFileName().toString();
  248. String baseName = fileName.substring(0, fileName.length() - extension.length());
  249. return path.getParent().resolve(baseName).toString();
  250. }
  251. public void saveRelatedTable(String emailKey, String emailAddress,
  252. Map<EmailContentInfoDTO, List<EmailZipFileDTO>> emailZipFileMap) {
  253. // python 报告解析接口结果
  254. List<ParseResult<ReportData>> dataList = ListUtil.list(false);
  255. for (Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>> entry : emailZipFileMap.entrySet()) {
  256. EmailContentInfoDTO emailDto = entry.getKey();
  257. if (emailDto.getFileName() != null && emailDto.getFileName().endsWith(".html")) {
  258. continue;
  259. }
  260. String emailTitle = emailDto.getEmailTitle();
  261. // 待解析文件数据处理,不支持已存在的文件重复解析
  262. List<EmailZipFileDTO> dtos = ListUtil.list(false);
  263. List<EmailZipFileDTO> zipFiles = entry.getValue();
  264. if (CollUtil.isEmpty(zipFiles)) {
  265. dtos.add(new EmailZipFileDTO(emailTitle, emailDto));
  266. } else {
  267. dtos.addAll(zipFiles);
  268. }
  269. // 重新判断类型
  270. for (EmailZipFileDTO dto : dtos) {
  271. if (!EmailTypeConst.SUPPORT_NO_OTHER_TYPES.contains(dto.getEmailType())) {
  272. Integer emailType = EmailUtil.getEmailTypeBySubject(emailTitle + dto.getFilename());
  273. dto.setEmailType(emailType);
  274. }
  275. }
  276. // 数据库已存在的数据过滤(邮件主题+报告名称+附件大小)
  277. Iterator<EmailZipFileDTO> iterator = dtos.iterator();
  278. while (iterator.hasNext()) {
  279. EmailZipFileDTO dto = iterator.next();
  280. String filename = dto.getFilename();
  281. // 删除复核函或基金合同
  282. if (filename.contains("复核函") || filename.contains("基金合同")) {
  283. log.warn("邮件{} 中的报告{} 是复核函或基金合同,不用解析上传。", emailTitle, filename);
  284. iterator.remove();
  285. }
  286. Integer emailType = dto.getEmailType();
  287. int fileSize = dto.getFileSize();
  288. int count = 0;
  289. if (Objects.equals(emailType, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) {
  290. // 确认单
  291. count = this.emailFileInfoMapper.getLetterFilenameSuccessCount(emailTitle, filename);
  292. } else if (Objects.equals(emailType, EmailTypeConst.REPORT_EMAIL_TYPE)) {
  293. // 定期报告
  294. count = this.emailFileInfoMapper.getAmacFilenameSuccessCount(emailTitle, filename, fileSize);
  295. } else if (Objects.equals(emailType, EmailTypeConst.REPORT_WEEKLY_TYPE)) {
  296. // 管理人周报
  297. count = this.emailFileInfoMapper.getWeeklyFilenameSuccessCount(emailTitle, filename, fileSize);
  298. } else if (Objects.equals(emailType, EmailTypeConst.REPORT_OTHER_TYPE)) {
  299. // 其他报告
  300. count = this.emailFileInfoMapper.getOtherFilenameSuccessCount(emailTitle, filename, fileSize);
  301. } else {
  302. log.info("邮件{} 类型{} 不支持解析。", emailTitle, emailType);
  303. iterator.remove();
  304. }
  305. if (count > 0) {
  306. iterator.remove();
  307. log.info("邮件{} 附件{} 已存在解析成功的记录,不用重新解析。", emailTitle, filename);
  308. }
  309. }
  310. if (CollUtil.isEmpty(dtos)) {
  311. log.info("邮件{} 所有文件都已经解析成功过,不能重复解析了", emailTitle);
  312. continue;
  313. }
  314. Integer emailId = emailDto.getEmailId();
  315. EmailParseInfoDO emailParseInfoDO = this.buildEmailParseInfo(emailId, emailAddress, emailDto);
  316. emailParseInfoDO.setEmailKey(emailKey);
  317. emailId = this.saveEmailParseInfo(emailParseInfoDO);
  318. if (emailId == null) {
  319. continue;
  320. }
  321. for (EmailZipFileDTO zipFile : dtos) {
  322. EmailFileInfoDO emailFile = this.saveEmailFileInfo(emailId, zipFile.getFilename(), zipFile.getFilepath());
  323. // 解析并保存报告
  324. ParseResult<ReportData> parseResult = this.parseReportAndHandleResult(emailTitle, emailFile, zipFile);
  325. dataList.add(parseResult);
  326. }
  327. String failReason = null;
  328. int emailParseStatus = EmailParseStatusConst.SUCCESS;
  329. // 报告邮件有一条失败就表示整个邮件解析失败
  330. if (CollUtil.isNotEmpty(dataList)) {
  331. // ai解析结果
  332. List<ReportData> aiParaseList = dataList.stream().map(ParseResult::getData)
  333. .filter(Objects::nonNull).filter(e -> Objects.equals(true, e.getAiParse())).toList();
  334. if (CollUtil.isNotEmpty(aiParaseList)) {
  335. for (ReportData data : aiParaseList) {
  336. this.emailFileInfoMapper.updateAiParseByFileId(data.getBaseInfo().getFileId(), data.getAiParse(), data.getAiFileId());
  337. }
  338. }
  339. long failNum = dataList.stream().filter(e -> !Objects.equals(EmailParseStatusConst.SUCCESS, e.getStatus())).count();
  340. if (failNum > 0) {
  341. emailParseStatus = EmailParseStatusConst.FAIL;
  342. failReason = dataList.stream().map(ParseResult::getMsg).collect(Collectors.joining(";"));
  343. }
  344. }
  345. emailParseInfoMapper.updateParseStatus(emailId, emailParseStatus, failReason);
  346. }
  347. }
  348. private ParseResult<ReportData> parseReportAndHandleResult(String emailTitle, EmailFileInfoDO emailFileInfo, EmailZipFileDTO zipFile) {
  349. Integer emailType = zipFile.getEmailType();
  350. String fileName = zipFile.getFilename();
  351. String filepath = zipFile.getFilepath();
  352. ParseResult<ReportData> result = new ParseResult<>();
  353. boolean reportFlag = emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType);
  354. if (reportFlag || StrUtil.isBlank(fileName) || fileName.endsWith(".html")) {
  355. result.setStatus(ReportParseStatus.NOT_A_REPORT.getCode());
  356. result.setMsg(StrUtil.format(ReportParseStatus.NOT_A_REPORT.getMsg(), fileName));
  357. log.error(result.getMsg());
  358. return result;
  359. }
  360. // 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报
  361. ReportType reportType = ReportParseUtils.matchReportType(emailType, fileName);
  362. if (reportType == null) {
  363. reportType = ReportParseUtils.matchReportType(emailType, emailTitle);
  364. if (log.isDebugEnabled()) {
  365. log.debug("报告{} 根据邮件主题{} 重新识别的类型是:{}", fileName, emailTitle, reportType);
  366. }
  367. }
  368. // 解析器--根据文件后缀获取对应解析器,解析不了就用AI来解析
  369. ReportParserFileType fileType;
  370. String fileSuffix = StrUtil.subAfter(fileName, ".", true);
  371. fileType = ReportParserFileType.getBySuffix(fileSuffix);
  372. // 不支持的格式
  373. if (fileType == null) {
  374. result.setStatus(ReportParseStatus.NO_SUPPORT_TEMPLATE.getCode());
  375. result.setMsg(StrUtil.format(ReportParseStatus.NO_SUPPORT_TEMPLATE.getMsg(), fileName));
  376. log.error(result.getMsg());
  377. return result;
  378. }
  379. // 不是定期报告的判断逻辑放在不支持的格式下面
  380. if (reportType == null) {
  381. result.setStatus(ReportParseStatus.NOT_A_REPORT.getCode());
  382. result.setMsg(StrUtil.format(ReportParseStatus.NOT_A_REPORT.getMsg(), fileName));
  383. log.error(result.getMsg());
  384. return result;
  385. }
  386. Integer fileId = emailFileInfo.getId();
  387. // 首页和尾页转为png图片,首页用来识别基金名称和基金代码、尾页用来识别印章和联系人
  388. List<String> images = ListUtil.list(true);
  389. if (Objects.equals(ReportParserFileType.PDF, fileType)) {
  390. try {
  391. String output = FileUtil.getParent(filepath, 1) + File.separator + "image";
  392. images = PdfUtil.convertFirstAndLastPagesToPng(filepath, FileUtil.file(output), 300);
  393. if (log.isDebugEnabled()) {
  394. log.debug("报告[{}] 生成的图片地址是:{}", fileName, images);
  395. }
  396. } catch (Exception e) {
  397. log.warn("报告[{}] 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
  398. }
  399. } else if (Objects.equals(ReportParserFileType.IMG, fileType)) {
  400. images.add(filepath);
  401. }
  402. // 不支持解析的格式文件
  403. boolean notSupportFile = false;
  404. // 解析报告
  405. ReportData reportData = null;
  406. ReportParserParams params = new ReportParserParams(fileId, fileName, filepath, reportType);
  407. StopWatch parserWatch = new StopWatch();
  408. parserWatch.start();
  409. try {
  410. if (reportType != ReportType.OTHER && reportType != ReportType.WEEKLY) {
  411. ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, fileType);
  412. reportData = instance.parse(params);
  413. result.setStatus(1);
  414. result.setMsg("报告解析成功");
  415. result.setData(reportData);
  416. } else {
  417. if (log.isInfoEnabled()) {
  418. log.info("报告{} 是周报或其他类型,直接用AI解析器解析", fileName);
  419. }
  420. }
  421. } catch (ReportParseException e) {
  422. log.error("解析失败:{}", StrUtil.format(e.getMsg(), fileName));
  423. result.setStatus(e.getCode());
  424. result.setMsg(StrUtil.format(e.getMsg(), fileName));
  425. if (e instanceof NotSupportReportException) {
  426. notSupportFile = true;
  427. }
  428. } catch (Exception e) {
  429. log.error("解析错误:{}", ExceptionUtil.stacktraceToString(e));
  430. result.setStatus(ReportParseStatus.PARSE_FAIL.getCode());
  431. result.setMsg(StrUtil.format(ReportParseStatus.PARSE_FAIL.getMsg(), e.getMessage()));
  432. } finally {
  433. // 如果解析结果是空的就用AI工具解析一次
  434. if (reportData == null && !notSupportFile) {
  435. if (reportType == ReportType.QUARTERLY || reportType == ReportType.ANNUALLY) {
  436. if (log.isInfoEnabled()) {
  437. log.info("报告{} 开始AI解析......", fileName);
  438. }
  439. } else if (CollUtil.isNotEmpty(images)) {
  440. filepath = images.get(0);
  441. if (log.isInfoEnabled()) {
  442. log.info("报告{} 用首页图片{} 开始AI解析......", fileName, filepath);
  443. }
  444. }
  445. try {
  446. params = new ReportParserParams(fileId, fileName, filepath, reportType);
  447. ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, ReportParserFileType.AI);
  448. reportData = instance.parse(params);
  449. result.setStatus(1);
  450. result.setMsg("报告解析成功--AI");
  451. result.setData(reportData);
  452. } catch (ReportParseException e) {
  453. log.error("AI解析失败:{}", StrUtil.format(e.getMsg(), fileName));
  454. result.setStatus(e.getCode());
  455. result.setMsg(StrUtil.format(e.getMsg(), fileName));
  456. } catch (Exception e) {
  457. log.error("AI解析错误:{}", ExceptionUtil.stacktraceToString(e));
  458. result.setStatus(ReportParseStatus.PARSE_FAIL.getCode());
  459. result.setMsg(StrUtil.format(ReportParseStatus.PARSE_FAIL.getMsg(), e.getMessage()));
  460. }
  461. if (log.isInfoEnabled()) {
  462. log.info("报告{} AI解析结束!", fileName);
  463. }
  464. }
  465. // ocr信息提取
  466. this.ocrReportData(reportData, fileName, images);
  467. parserWatch.stop();
  468. if (log.isInfoEnabled()) {
  469. log.info("报告{}解析结果为{},耗时{}ms", fileName, reportData, parserWatch.getTotalTimeMillis());
  470. }
  471. }
  472. // 保存报告解析结果
  473. this.saveReportData(reportData, reportType, fileName);
  474. return result;
  475. }
  476. /**
  477. * ocr 提取信息(包括首页的基金名称或报告日期,尾页的印章或联系人等信息)
  478. *
  479. * @param reportData 报告解析结果
  480. * @param fileName 报告名称
  481. * @param images 报告的收益和尾页png图片
  482. */
  483. private void ocrReportData(ReportData reportData, String fileName, List<String> images) {
  484. if (reportData == null || CollUtil.isEmpty(images)) {
  485. return;
  486. }
  487. OCRParseData parseRes = null;
  488. try {
  489. // 首页和尾页相等时只读首页
  490. String imageUrl = images.size() == 1 ? images.get(0) : images.get(1);
  491. parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, imageUrl);
  492. } catch (Exception e) {
  493. log.error("报告{} OCR识别印章和联系人出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
  494. }
  495. // ocr识别尾页是否包含印章和联系人信息
  496. if (parseRes != null) {
  497. if (reportData.getBaseInfo() != null) {
  498. Date reportDate = DateUtils.toDate(parseRes.getReportDate());
  499. if (reportData.getBaseInfo().getReportDate() == null && reportDate != null) {
  500. reportData.getBaseInfo().setReportDate(reportDate);
  501. }
  502. reportData.getBaseInfo().setWithSeals(parseRes.getWithSeals());
  503. reportData.getBaseInfo().setWithContacts(parseRes.getWithContacts());
  504. if (fileName.contains("用印") && !Objects.equals(true, reportData.getBaseInfo().getWithSeals())) {
  505. reportData.getBaseInfo().setWithSeals(true);
  506. }
  507. }
  508. }
  509. // 用首页识别基金名称、产品代码和报告日期
  510. if ((reportData.getBaseInfo() != null && reportData.getBaseInfo().getReportDate() == null)
  511. || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundName()))
  512. || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundCode()))) {
  513. // 首页和尾页不相等时解析首页的数据
  514. if (images.size() != 1) {
  515. try {
  516. parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0));
  517. } catch (Exception e) {
  518. log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
  519. }
  520. }
  521. if (reportData.getBaseInfo() != null && parseRes != null) {
  522. Date reportDate = DateUtils.toDate(parseRes.getReportDate());
  523. if (reportDate != null && reportData.getBaseInfo().getReportDate() == null) {
  524. reportData.getBaseInfo().setReportDate(reportDate);
  525. }
  526. }
  527. // ocr 识别的结果优先级更高
  528. if (reportData.getFundInfo() != null && parseRes != null) {
  529. if (StrUtil.isBlank(reportData.getFundInfo().getFundName())
  530. || !Objects.equals(reportData.getFundInfo().getFundName(), parseRes.getFundName())) {
  531. reportData.getFundInfo().setFundName(parseRes.getFundName());
  532. }
  533. if (StrUtil.isBlank(reportData.getFundInfo().getFundCode())
  534. || !Objects.equals(reportData.getFundInfo().getFundCode(), parseRes.getFundCode())) {
  535. reportData.getFundInfo().setFundCode(parseRes.getFundCode());
  536. }
  537. }
  538. }
  539. }
  540. /**
  541. * 保存报告解析结果
  542. *
  543. * @param reportData 报告解析结果
  544. * @param reportType 报告类型
  545. * @param fileName 报告名称
  546. */
  547. private void saveReportData(ReportData reportData, ReportType reportType, String fileName) {
  548. if (reportData == null) {
  549. return;
  550. }
  551. StopWatch writeWatch = new StopWatch();
  552. writeWatch.start();
  553. try {
  554. ReportWriter<ReportData> instance = this.reportWriterFactory.getInstance(reportType);
  555. instance.write(reportData);
  556. } catch (Exception e) {
  557. log.error("报告{}结果保存失败\n{}", fileName, ExceptionUtil.stacktraceToString(e));
  558. } finally {
  559. writeWatch.stop();
  560. if (log.isInfoEnabled()) {
  561. log.info("报告{}解析结果保存完成,耗时{}ms", fileName, writeWatch.getTotalTimeMillis());
  562. }
  563. }
  564. }
  565. private EmailFileInfoDO saveEmailFileInfo(Integer emailId, String fileName, String filePath) {
  566. EmailFileInfoDO emailFileInfoDO = buildEmailFileInfoDO(emailId, fileName, filePath);
  567. emailFileInfoDO.setAiFileId(null);
  568. if (emailFileInfoDO.getId() != null) {
  569. emailFileInfoMapper.updateTimeById(null, new Date());
  570. return emailFileInfoDO;
  571. }
  572. emailFileInfoMapper.insert(emailFileInfoDO);
  573. return emailFileInfoDO;
  574. }
  575. private EmailFileInfoDO buildEmailFileInfoDO(Integer emailId, String fileName, String filePath) {
  576. EmailFileInfoDO emailFileInfoDO = new EmailFileInfoDO();
  577. emailFileInfoDO.setId(null);
  578. emailFileInfoDO.setEmailId(emailId);
  579. emailFileInfoDO.setFileName(fileName);
  580. emailFileInfoDO.setFilePath(filePath);
  581. emailFileInfoDO.setIsvalid(1);
  582. emailFileInfoDO.setCreatorId(0);
  583. emailFileInfoDO.setCreateTime(new Date());
  584. emailFileInfoDO.setUpdaterId(0);
  585. emailFileInfoDO.setUpdateTime(new Date());
  586. return emailFileInfoDO;
  587. }
  588. private Integer saveEmailParseInfo(EmailParseInfoDO emailParseInfoDO) {
  589. if (emailParseInfoDO == null) {
  590. return null;
  591. }
  592. // 重新邮件功能 -> 修改解析时间和更新时间
  593. if (emailParseInfoDO.getId() != null) {
  594. emailParseInfoMapper.updateParseTime(emailParseInfoDO.getId(), emailParseInfoDO.getParseDate());
  595. return emailParseInfoDO.getId();
  596. }
  597. emailParseInfoMapper.insert(emailParseInfoDO);
  598. return emailParseInfoDO.getId();
  599. }
  600. private EmailParseInfoDO buildEmailParseInfo(Integer emailId, String emailAddress, EmailContentInfoDTO emailContentInfoDTO) {
  601. EmailParseInfoDO emailParseInfoDO = new EmailParseInfoDO();
  602. emailParseInfoDO.setId(emailId);
  603. emailParseInfoDO.setSenderEmail(emailContentInfoDTO.getSenderEmail());
  604. emailParseInfoDO.setEmail(emailAddress);
  605. emailParseInfoDO.setEmailDate(DateUtil.parse(emailContentInfoDTO.getEmailDate(), DateConst.YYYY_MM_DD_HH_MM_SS));
  606. emailParseInfoDO.setParseDate(emailContentInfoDTO.getParseDate() == null ? null : DateUtil.parseDate(emailContentInfoDTO.getParseDate()));
  607. emailParseInfoDO.setEmailTitle(emailContentInfoDTO.getEmailTitle());
  608. emailParseInfoDO.setEmailType(emailContentInfoDTO.getEmailType());
  609. emailParseInfoDO.setParseStatus(EmailParseStatusConst.SUCCESS);
  610. emailParseInfoDO.setAttrSize(emailContentInfoDTO.getFileSize());
  611. emailParseInfoDO.setIsvalid(1);
  612. emailParseInfoDO.setCreatorId(0);
  613. emailParseInfoDO.setCreateTime(new Date());
  614. emailParseInfoDO.setUpdaterId(0);
  615. emailParseInfoDO.setUpdateTime(new Date());
  616. return emailParseInfoDO;
  617. }
  618. /**
  619. * 读取邮件
  620. *
  621. * @param mailboxInfoDTO 邮箱配置信息
  622. * @param startDate 邮件起始日期
  623. * @param endDate 邮件截止日期(为null,将解析邮件日期小于等于startDate的当天邮件)
  624. * @return 读取到的邮件信息
  625. * @throws Exception 异常信息
  626. */
  627. private Map<String, List<EmailContentInfoDTO>> realEmail(MailboxInfoDTO mailboxInfoDTO,
  628. Date startDate, Date endDate,
  629. List<String> folderNames) throws Exception {
  630. if (CollUtil.isEmpty(folderNames)) {
  631. folderNames = ListUtil.toList("INBOX");
  632. }
  633. Store store = EmailUtil.getStoreNew(mailboxInfoDTO);
  634. if (store == null) {
  635. return MapUtil.newHashMap(4);
  636. }
  637. Map<String, List<EmailContentInfoDTO>> result = MapUtil.newHashMap(128);
  638. try {
  639. if (log.isInfoEnabled()) {
  640. Folder[] list = store.getDefaultFolder().list("*");
  641. List<String> names = Arrays.stream(list).map(Folder::getFullName).toList();
  642. log.info("获取所有邮箱文件夹:{}", names);
  643. }
  644. for (String folderName : folderNames) {
  645. try {
  646. Map<String, List<EmailContentInfoDTO>> temp = this.getFolderEmail(mailboxInfoDTO,
  647. startDate, endDate, store, folderName);
  648. if (MapUtil.isNotEmpty(temp)) {
  649. result.putAll(temp);
  650. }
  651. } catch (Exception e) {
  652. log.warn("文件夹{} 邮件获取失败:{}", folderName, ExceptionUtil.stacktraceToString(e));
  653. }
  654. }
  655. } catch (Exception e) {
  656. log.error("邮件获取失败:{}", ExceptionUtil.stacktraceToString(e));
  657. } finally {
  658. store.close();
  659. }
  660. return result;
  661. }
  662. private Map<String, List<EmailContentInfoDTO>> getFolderEmail(MailboxInfoDTO mailboxInfoDTO,
  663. Date startDate, Date endDate,
  664. Store store, String folderName) throws MessagingException {
  665. // 默认读取收件箱的邮件
  666. Folder folder = store.getFolder(folderName);
  667. folder.open(Folder.READ_ONLY);
  668. Message[] messages = getEmailMessage(folder, mailboxInfoDTO.getProtocol(), startDate);
  669. if (messages == null || messages.length == 0) {
  670. log.warn("{} 获取不到邮件 -> 邮箱信息:{},开始时间:{},结束时间:{}", folderName, mailboxInfoDTO, startDate, endDate);
  671. return MapUtil.newHashMap();
  672. }
  673. Map<String, List<EmailContentInfoDTO>> emailMessageMap = MapUtil.newHashMap();
  674. for (Message message : messages) {
  675. long start = System.currentTimeMillis();
  676. List<EmailContentInfoDTO> emailContentInfoDTOList = CollUtil.newArrayList();
  677. String uuidKey = UUID.randomUUID().toString().replaceAll("-", "");
  678. Integer emailType;
  679. String senderEmail;
  680. String emailTitle = message.getSubject();
  681. try {
  682. Date emailDate = message.getSentDate();
  683. String emailDateStr = DateUtil.format(emailDate, DateConst.YYYY_MM_DD_HH_MM_SS);
  684. if (log.isInfoEnabled()) {
  685. log.info("{} 邮件{} 数据获取中,邮件时间:{}", folderName, emailTitle, emailDateStr);
  686. }
  687. boolean isNotParseConditionSatisfied = emailDate == null
  688. || (endDate != null && emailDate.compareTo(endDate) > 0)
  689. || (startDate != null && emailDate.compareTo(startDate) < 0);
  690. if (isNotParseConditionSatisfied) {
  691. String st = DateUtil.formatDateTime(startDate);
  692. String ed = DateUtil.formatDateTime(endDate);
  693. log.warn("{} 邮件[{}]日期{}不在区间内【{} ~ {}】", folderName, emailTitle, emailDateStr, st, ed);
  694. continue;
  695. }
  696. senderEmail = getSenderEmail(message);
  697. emailType = EmailUtil.getEmailTypeBySubject(emailTitle);
  698. if (emailType == null) {
  699. log.warn("{} 邮件不满足解析条件 -> 邮件主题:{},邮件日期:{}", folderName, emailTitle, emailDateStr);
  700. continue;
  701. }
  702. if (log.isInfoEnabled()) {
  703. log.info("{} 邮件{} 基本信息获取完成,开始下载附件!邮件日期:{}", folderName, emailTitle, emailDateStr);
  704. }
  705. Object content = message.getContent();
  706. if (content instanceof Multipart multipart) {
  707. this.reMultipart(mailboxInfoDTO.getAccount(), emailTitle, emailDate, multipart, emailContentInfoDTOList);
  708. } else if (content instanceof Part part) {
  709. this.rePart(mailboxInfoDTO.getAccount(), emailTitle, emailDate, part, emailContentInfoDTOList);
  710. } else {
  711. log.warn("{} 不支持的邮件数据 {}", folderName, emailTitle);
  712. }
  713. if (CollUtil.isNotEmpty(emailContentInfoDTOList)) {
  714. emailContentInfoDTOList.forEach(e -> {
  715. e.setEmailType(emailType);
  716. e.setSenderEmail(senderEmail);
  717. });
  718. emailMessageMap.put(uuidKey, emailContentInfoDTOList);
  719. }
  720. if (log.isInfoEnabled()) {
  721. log.info("{} 邮件{} 下载完成,总计耗时{} ms,文件内容如下\n {}", folderName,
  722. emailTitle, System.currentTimeMillis() - start, emailContentInfoDTOList);
  723. }
  724. } catch (Exception e) {
  725. log.error("{} 获取邮箱的邮件{} 报错,堆栈信息:{}", folderName, emailTitle, ExceptionUtil.stacktraceToString(e));
  726. }
  727. }
  728. folder.close(false);
  729. return emailMessageMap;
  730. }
  731. private void rePart(String account, String subject, Date sendDate, Part part,
  732. List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
  733. String fileName = EmailUtil.decodeFileName(part);
  734. if (StrUtil.isBlank(fileName)) {
  735. log.warn("邮件{} 附件文件名是空的,不做下载!", subject);
  736. return;
  737. }
  738. if (fileName.contains("=?")) {
  739. fileName = MimeUtility.decodeText(fileName);
  740. }
  741. String disposition = part.getDisposition();
  742. String contentType = part.getContentType();
  743. boolean isAttachment = Part.ATTACHMENT.equalsIgnoreCase(disposition)
  744. || (contentType != null && attachmentMimePrefixes.stream().anyMatch(prefix ->
  745. StrUtil.startWithIgnoreCase(contentType, prefix)
  746. ));
  747. if (!isAttachment) {
  748. log.warn("邮件 {} 未检测到pdf/zip/rar类型的附件 (fileName={}, disposition={}, contentType={})",
  749. subject, fileName, disposition, contentType);
  750. return;
  751. }
  752. String emailDateStr = DateUtil.format(sendDate, DateConst.YYYYMMDD);
  753. String filePath = path + File.separator + account + File.separator + emailDateStr + File.separator;
  754. File saveFile = FileUtil.file(filePath + fileName);
  755. if (!saveFile.exists()) {
  756. if (!saveFile.getParentFile().exists()) {
  757. boolean mkdirs = saveFile.getParentFile().mkdirs();
  758. if (!mkdirs) {
  759. log.warn("file path mkdir failed.");
  760. }
  761. }
  762. try (InputStream is = part.getInputStream()) {
  763. Files.copy(is, saveFile.toPath());
  764. }
  765. } else {
  766. FileUtil.del(saveFile);
  767. try (InputStream is = part.getInputStream()) {
  768. Files.copy(is, saveFile.toPath());
  769. }
  770. }
  771. EmailContentInfoDTO emailContentInfoDTO = new EmailContentInfoDTO();
  772. emailContentInfoDTO.setFileName(fileName);
  773. emailContentInfoDTO.setFileSize(part.getSize());
  774. emailContentInfoDTO.setFilePath(saveFile.getAbsolutePath());
  775. emailContentInfoDTO.setEmailAddress(account);
  776. emailContentInfoDTO.setEmailTitle(subject);
  777. emailContentInfoDTO.setEmailDate(DateUtil.format(sendDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  778. emailContentInfoDTOList.add(emailContentInfoDTO);
  779. }
  780. private void reMultipart(String account, String subject, Date emailDate, Multipart multipart,
  781. List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
  782. for (int i = 0; i < multipart.getCount(); i++) {
  783. Part bodyPart = multipart.getBodyPart(i);
  784. Object content = bodyPart.getContent();
  785. if (content instanceof String) {
  786. if (log.isDebugEnabled()) {
  787. log.debug("邮件{} 获取的正文不做解析,内容是 {}", subject, content);
  788. }
  789. continue;
  790. }
  791. if (content instanceof Multipart mp) {
  792. this.reMultipart(account, subject, emailDate, mp, emailContentInfoDTOList);
  793. } else {
  794. this.rePart(account, subject, emailDate, bodyPart, emailContentInfoDTOList);
  795. }
  796. }
  797. }
  798. private String getSenderEmail(Message message) {
  799. Address[] senderAddress;
  800. try {
  801. senderAddress = message.getFrom();
  802. if (senderAddress == null || senderAddress.length == 0) {
  803. return null;
  804. }
  805. // 此时的address是含有编码(MIME编码方式)后的文本和实际的邮件地址
  806. String address = "";
  807. for (Address from : senderAddress) {
  808. if (StrUtil.isNotBlank(from.toString())) {
  809. address = from.toString();
  810. break;
  811. }
  812. }
  813. // 正则表达式匹配邮件地址
  814. Pattern pattern = Pattern.compile("<(\\S+)>");
  815. Matcher matcher = pattern.matcher(address);
  816. if (matcher.find()) {
  817. return matcher.group(1);
  818. }
  819. } catch (MessagingException e) {
  820. log.error(e.getMessage(), e);
  821. }
  822. return null;
  823. }
  824. private Message[] getEmailMessage(Folder folder, String protocol, Date startDate) {
  825. try {
  826. if (protocol.contains("imap")) {
  827. // 获取邮件日期大于等于startDate的邮件(搜索条件只支持按天)
  828. SearchTerm startDateTerm = new ReceivedDateTerm(ComparisonTerm.GE, startDate);
  829. return folder.search(startDateTerm);
  830. } else {
  831. return folder.getMessages();
  832. }
  833. } catch (MessagingException e) {
  834. throw new RuntimeException(e);
  835. }
  836. }
  837. }