EmailParseService.java 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796
  1. package com.smppw.modaq.domain.service;
  2. import cn.hutool.core.collection.CollUtil;
  3. import cn.hutool.core.collection.ListUtil;
  4. import cn.hutool.core.date.DateUtil;
  5. import cn.hutool.core.exceptions.ExceptionUtil;
  6. import cn.hutool.core.map.MapUtil;
  7. import cn.hutool.core.util.StrUtil;
  8. import com.smppw.modaq.application.components.ReportParseUtils;
  9. import com.smppw.modaq.application.components.report.parser.ReportParser;
  10. import com.smppw.modaq.application.components.report.parser.ReportParserFactory;
  11. import com.smppw.modaq.application.components.report.writer.ReportWriter;
  12. import com.smppw.modaq.application.components.report.writer.ReportWriterFactory;
  13. import com.smppw.modaq.application.util.EmailUtil;
  14. import com.smppw.modaq.common.conts.DateConst;
  15. import com.smppw.modaq.common.conts.EmailParseStatusConst;
  16. import com.smppw.modaq.common.conts.EmailTypeConst;
  17. import com.smppw.modaq.common.enums.ReportParseStatus;
  18. import com.smppw.modaq.common.enums.ReportParserFileType;
  19. import com.smppw.modaq.common.enums.ReportType;
  20. import com.smppw.modaq.common.exception.NotSupportReportException;
  21. import com.smppw.modaq.common.exception.ReportParseException;
  22. import com.smppw.modaq.domain.dto.EmailContentInfoDTO;
  23. import com.smppw.modaq.domain.dto.EmailZipFileDTO;
  24. import com.smppw.modaq.domain.dto.MailboxInfoDTO;
  25. import com.smppw.modaq.domain.dto.report.ParseResult;
  26. import com.smppw.modaq.domain.dto.report.ReportData;
  27. import com.smppw.modaq.domain.dto.report.ReportParserParams;
  28. import com.smppw.modaq.domain.entity.EmailFileInfoDO;
  29. import com.smppw.modaq.domain.entity.EmailParseInfoDO;
  30. import com.smppw.modaq.domain.mapper.EmailFileInfoMapper;
  31. import com.smppw.modaq.domain.mapper.EmailParseInfoMapper;
  32. import com.smppw.modaq.infrastructure.util.ExcelUtil;
  33. import com.smppw.modaq.infrastructure.util.FileUtil;
  34. import jakarta.mail.*;
  35. import jakarta.mail.internet.MimeUtility;
  36. import jakarta.mail.search.ComparisonTerm;
  37. import jakarta.mail.search.ReceivedDateTerm;
  38. import jakarta.mail.search.SearchTerm;
  39. import org.apache.commons.compress.archivers.ArchiveException;
  40. import org.slf4j.Logger;
  41. import org.slf4j.LoggerFactory;
  42. import org.springframework.beans.factory.annotation.Value;
  43. import org.springframework.stereotype.Service;
  44. import org.springframework.util.StopWatch;
  45. import java.io.File;
  46. import java.io.IOException;
  47. import java.nio.file.Path;
  48. import java.nio.file.Paths;
  49. import java.util.*;
  50. import java.util.regex.Matcher;
  51. import java.util.regex.Pattern;
  52. import java.util.stream.Collectors;
  53. /**
  54. * @author mozuwen
  55. * @date 2024-09-04
  56. * @description 邮件解析服务
  57. */
  58. @Service
  59. public class EmailParseService {
  60. // public static final int stepSize = 10000;
  61. private static final Logger log = LoggerFactory.getLogger(EmailParseService.class);
  62. // MIME type prefixes that mark a mail part as a downloadable attachment (matched case-insensitively in rePart)
  63. private static final Set<String> attachmentMimePrefixes = Set.of(
  64. "application/pdf",
  65. "application/zip",
  66. "application/x-zip-compressed",
  67. "application/rar",
  68. "application/x-rar-compressed"
  69. // add further types here as needed...
  70. );
  71. // private final EmailFieldMappingMapper emailFieldMapper;
  72. private final EmailParseInfoMapper emailParseInfoMapper;
  73. private final EmailFileInfoMapper emailFileInfoMapper;
  74. /* factories that supply the report parsers and the report writers */
  75. private final ReportParserFactory reportParserFactory;
  76. private final ReportWriterFactory reportWriterFactory;
  77. @Value("${email.file.path}")
  78. private String path; // root directory under which rePart stores downloaded attachments
  /**
   * Wires the service with its persistence mappers and the report parser/writer factories.
   *
   * @param emailParseInfoMapper mapper used to insert and update email parse records
   * @param emailFileInfoMapper mapper used to insert and update email file records
   * @param reportParserFactory factory that supplies the report parsers
   * @param reportWriterFactory factory that supplies the report writers
   */
  public EmailParseService(EmailParseInfoMapper emailParseInfoMapper,
      EmailFileInfoMapper emailFileInfoMapper,
      ReportParserFactory reportParserFactory,
      ReportWriterFactory reportWriterFactory) {
    // Factories first, then mappers; the assignments are independent of each other.
    this.reportParserFactory = reportParserFactory;
    this.reportWriterFactory = reportWriterFactory;
    this.emailFileInfoMapper = emailFileInfoMapper;
    this.emailParseInfoMapper = emailParseInfoMapper;
  }
  88. /**
  89. * 解析指定邮箱指定时间范围内的邮件
  90. *
  91. * @param mailboxInfoDTO 邮箱配置信息
  92. * @param startDate 邮件起始日期(yyyy-MM-dd HH:mm:ss)
  93. * @param endDate 邮件截止日期(yyyy-MM-dd HH:mm:ss, 为null,将解析邮件日期小于等于startDate的当天邮件)
  94. * @param emailTypes 当前任务支持的邮件类型,默认支持确认单
  95. */
  96. public void parseEmail(MailboxInfoDTO mailboxInfoDTO,
  97. Date startDate, Date endDate,
  98. List<String> folderNames, List<Integer> emailTypes) {
  99. if (CollUtil.isEmpty(emailTypes)) {
  100. emailTypes = ListUtil.of(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE);
  101. }
  102. if (log.isInfoEnabled()) {
  103. log.info("开始邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO, DateUtil.format(startDate,
  104. DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  105. }
  106. Map<String, List<EmailContentInfoDTO>> emailContentMap;
  107. try {
  108. emailContentMap = realEmail(mailboxInfoDTO, startDate, endDate, folderNames);
  109. } catch (Exception e) {
  110. log.error("采集邮件失败 -> 邮箱配置信息:{},堆栈信息:{}", mailboxInfoDTO, ExceptionUtil.stacktraceToString(e));
  111. return;
  112. }
  113. if (MapUtil.isEmpty(emailContentMap)) {
  114. log.warn("未采集到邮件 -> 邮箱配置信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO,
  115. DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  116. return;
  117. }
  118. for (Map.Entry<String, List<EmailContentInfoDTO>> emailEntry : emailContentMap.entrySet()) {
  119. List<EmailContentInfoDTO> emailContentInfoDTOList = emailEntry.getValue();
  120. if (CollUtil.isEmpty(emailContentInfoDTOList)) {
  121. log.warn("未采集到正文或附件");
  122. continue;
  123. }
  124. log.info("开始解析邮件数据 -> 邮件主题:{},邮件日期:{}", emailContentInfoDTOList.get(0).getEmailTitle(), emailContentInfoDTOList.get(0).getEmailDate());
  125. Map<EmailContentInfoDTO, List<EmailZipFileDTO>> emailZipFileMap = MapUtil.newHashMap();
  126. for (EmailContentInfoDTO emailContentInfoDTO : emailContentInfoDTOList) {
  127. // 正文不用解压附件
  128. if (emailContentInfoDTO.getFileName() != null && emailContentInfoDTO.getFileName().endsWith(".html")) {
  129. emailZipFileMap.put(emailContentInfoDTO, ListUtil.empty());
  130. continue;
  131. }
  132. try {
  133. List<EmailZipFileDTO> fundNavDTOList = parseZipEmail(emailContentInfoDTO);
  134. emailZipFileMap.put(emailContentInfoDTO, fundNavDTOList);
  135. } catch (IOException | ArchiveException e) {
  136. log.error("压缩包解压失败:{}", ExceptionUtil.stacktraceToString(e));
  137. EmailParseInfoDO fail = buildEmailParseInfo(null, mailboxInfoDTO.getAccount(), emailContentInfoDTO);
  138. fail.setFailReason("压缩包解压失败");
  139. fail.setParseStatus(EmailParseStatusConst.FAIL);
  140. fail.setEmailKey(emailEntry.getKey());
  141. this.emailParseInfoMapper.insert(fail);
  142. } catch (Exception e) {
  143. log.error("堆栈信息:{}", ExceptionUtil.stacktraceToString(e));
  144. }
  145. }
  146. Iterator<Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>>> entryIterator = emailZipFileMap.entrySet().iterator();
  147. while (entryIterator.hasNext()) {
  148. Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>> entry = entryIterator.next();
  149. EmailContentInfoDTO key = entry.getKey();
  150. String emailTitle = key.getEmailTitle();
  151. List<EmailZipFileDTO> dtos = entry.getValue();
  152. List<Integer> types = ListUtil.list(false);
  153. types.add(key.getEmailType());
  154. if (CollUtil.isNotEmpty(dtos)) {
  155. Iterator<EmailZipFileDTO> iterator = dtos.iterator();
  156. while (iterator.hasNext()) {
  157. EmailZipFileDTO dto = iterator.next();
  158. String filename = dto.getFilename();
  159. if (filename != null && filename.contains("复核函")) {
  160. log.warn("邮件{} 附件中的压缩文件{} 是复核函,不用解析上传。", emailTitle, filename);
  161. iterator.remove();
  162. }
  163. }
  164. List<Integer> list = dtos.stream().map(EmailZipFileDTO::getEmailType).distinct().toList();
  165. CollUtil.addAllIfNotContains(types, list);
  166. }
  167. boolean flag = false;
  168. for (Integer type : types) {
  169. if (emailTypes.contains(type)) {
  170. flag = true;
  171. break;
  172. }
  173. }
  174. if (!flag) {
  175. log.warn("当前邮件{} 的类型{} 不在支持的任务类型{} 中,不用执行解析逻辑。", key, types, emailTypes);
  176. entryIterator.remove();
  177. }
  178. }
  179. // 保存相关信息 -> 邮件信息表,邮件文件表,邮件净值表,邮件规模表,基金净值表
  180. saveRelatedTable(emailEntry.getKey(), mailboxInfoDTO.getAccount(), emailZipFileMap);
  181. log.info("结束邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", emailEntry.getValue(),
  182. DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  183. }
  184. }
  185. public List<EmailZipFileDTO> parseZipEmail(EmailContentInfoDTO emailContentInfoDTO) throws Exception {
  186. List<EmailZipFileDTO> resultList = ListUtil.list(false);
  187. Integer emailType = emailContentInfoDTO.getEmailType();
  188. String filepath = emailContentInfoDTO.getFilePath();
  189. String emailTitle = emailContentInfoDTO.getEmailTitle();
  190. if (ExcelUtil.isZip(filepath)) {
  191. handleCompressedFiles(emailTitle, filepath, ".zip", emailType, resultList);
  192. } else if (ExcelUtil.isRAR(filepath)) {
  193. handleCompressedFiles(emailTitle, filepath, ".rar", emailType, resultList);
  194. }
  195. // 文件中的类型判断
  196. if (emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) {
  197. emailType = EmailUtil.getEmailTypeBySubject(emailContentInfoDTO.getFileName());
  198. emailContentInfoDTO.setEmailType(emailType);
  199. }
  200. return resultList;
  201. }
  202. private void handleCompressedFiles(String emailTitle, String filepath, String extension,
  203. Integer emailType, List<EmailZipFileDTO> resultList) throws Exception {
  204. String destPath = getDestinationPath(filepath, extension);
  205. log.info("压缩包地址:{}, 解压后文件地址:{}", filepath, destPath);
  206. File destFile = new File(destPath);
  207. if (!destFile.exists()) {
  208. if (!destFile.mkdirs()) {
  209. throw new IOException("无法创建目标目录: " + destPath);
  210. }
  211. }
  212. List<String> extractedDirs;
  213. if (ExcelUtil.isZip(filepath)) {
  214. extractedDirs = ExcelUtil.extractCompressedFiles(filepath, destPath);
  215. } else if (ExcelUtil.isRAR(filepath)) {
  216. extractedDirs = ExcelUtil.extractRar5(filepath, destPath);
  217. } else {
  218. return;
  219. }
  220. for (String dir : extractedDirs) {
  221. // 如果邮件类型不满足解析条件则重新根据文件名判断
  222. if (emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) {
  223. emailType = EmailUtil.getEmailTypeBySubject(dir);
  224. }
  225. File file = new File(dir);
  226. if (file.isDirectory()) {
  227. String[] subDirs = file.list();
  228. if (subDirs != null) {
  229. for (String subDir : subDirs) {
  230. resultList.add(new EmailZipFileDTO(emailTitle, subDir, emailType));
  231. }
  232. } else {
  233. log.warn("目录 {} 下无文件", dir);
  234. }
  235. } else {
  236. resultList.add(new EmailZipFileDTO(emailTitle, dir, emailType));
  237. }
  238. }
  239. }
  240. private String getDestinationPath(String filepath, String extension) {
  241. Path path = Paths.get(filepath);
  242. String fileName = path.getFileName().toString();
  243. String baseName = fileName.substring(0, fileName.length() - extension.length());
  244. return path.getParent().resolve(baseName).toString();
  245. }
  246. public void saveRelatedTable(String emailKey, String emailAddress,
  247. Map<EmailContentInfoDTO, List<EmailZipFileDTO>> emailZipFileMap) {
  248. // python 报告解析接口结果
  249. List<ParseResult<ReportData>> dataList = ListUtil.list(false);
  250. for (Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>> entry : emailZipFileMap.entrySet()) {
  251. EmailContentInfoDTO emailDto = entry.getKey();
  252. if (emailDto.getFileName() != null && emailDto.getFileName().endsWith(".html")) {
  253. continue;
  254. }
  255. String emailTitle = emailDto.getEmailTitle();
  256. // 待解析文件数据处理,不支持已存在的文件重复解析
  257. List<EmailZipFileDTO> dtos = ListUtil.list(false);
  258. List<EmailZipFileDTO> zipFiles = entry.getValue();
  259. if (CollUtil.isEmpty(zipFiles)) {
  260. dtos.add(new EmailZipFileDTO(emailTitle, emailDto.getFilePath(), emailDto.getFileName(), emailDto.getEmailType()));
  261. } else {
  262. dtos.addAll(zipFiles);
  263. }
  264. // 重新判断类型
  265. for (EmailZipFileDTO dto : dtos) {
  266. Integer emailType = EmailUtil.getEmailTypeBySubject(emailTitle + dto.getFilename());
  267. dto.setEmailType(emailType);
  268. }
  269. // 数据库已存在的数据过滤
  270. Iterator<EmailZipFileDTO> iterator = dtos.iterator();
  271. while (iterator.hasNext()) {
  272. EmailZipFileDTO dto = iterator.next();
  273. Integer emailType = dto.getEmailType();
  274. String filename = dto.getFilename();
  275. int count = 0;
  276. if (Objects.equals(emailType, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) {
  277. // 确认单
  278. count = this.emailFileInfoMapper.getLetterFilenameSuccessCount(emailTitle, filename);
  279. } else if (Objects.equals(emailType, EmailTypeConst.REPORT_EMAIL_TYPE)) {
  280. // 定期报告
  281. count = this.emailFileInfoMapper.getAmacFilenameSuccessCount(emailTitle, filename);
  282. } else if (Objects.equals(emailType, EmailTypeConst.REPORT_WEEKLY_TYPE)) {
  283. // 管理人周报
  284. count = this.emailFileInfoMapper.getWeeklyFilenameSuccessCount(emailTitle, filename);
  285. } else if (Objects.equals(emailType, EmailTypeConst.REPORT_OTHER_TYPE)) {
  286. // 其他报告
  287. count = this.emailFileInfoMapper.getOtherFilenameSuccessCount(emailTitle, filename);
  288. } else {
  289. log.info("邮件{} 类型{} 不支持解析。", emailTitle, emailType);
  290. iterator.remove();
  291. }
  292. if (count > 0) {
  293. iterator.remove();
  294. log.info("邮件{} 附件{} 已存在解析成功的记录,不用重新解析。", emailTitle, filename);
  295. }
  296. }
  297. if (CollUtil.isEmpty(dtos)) {
  298. log.info("邮件{} 所有文件都已经解析成功过,不能重复解析了", emailTitle);
  299. continue;
  300. }
  301. Integer emailId = emailDto.getEmailId();
  302. EmailParseInfoDO emailParseInfoDO = this.buildEmailParseInfo(emailId, emailAddress, emailDto);
  303. emailParseInfoDO.setEmailKey(emailKey);
  304. emailId = this.saveEmailParseInfo(emailParseInfoDO);
  305. if (emailId == null) {
  306. continue;
  307. }
  308. for (EmailZipFileDTO zipFile : dtos) {
  309. EmailFileInfoDO emailFile = this.saveEmailFileInfo(emailId, zipFile.getFilename(), zipFile.getFilepath());
  310. // 解析并保存报告
  311. ParseResult<ReportData> parseResult = this.parseReportAndHandleResult(emailTitle, emailFile, zipFile);
  312. dataList.add(parseResult);
  313. }
  314. String failReason = null;
  315. int emailParseStatus = EmailParseStatusConst.SUCCESS;
  316. // 报告邮件有一条失败就表示整个邮件解析失败
  317. if (CollUtil.isNotEmpty(dataList)) {
  318. // ai解析结果
  319. List<ReportData> aiParaseList = dataList.stream().map(ParseResult::getData)
  320. .filter(Objects::nonNull).filter(e -> Objects.equals(true, e.getAiParse())).toList();
  321. if (CollUtil.isNotEmpty(aiParaseList)) {
  322. for (ReportData data : aiParaseList) {
  323. this.emailFileInfoMapper.updateAiParseByFileId(data.getBaseInfo().getFileId(), data.getAiParse(), data.getAiFileId());
  324. }
  325. }
  326. long failNum = dataList.stream().filter(e -> !Objects.equals(EmailParseStatusConst.SUCCESS, e.getStatus())).count();
  327. if (failNum > 0) {
  328. emailParseStatus = EmailParseStatusConst.FAIL;
  329. failReason = dataList.stream().map(ParseResult::getMsg).collect(Collectors.joining(";"));
  330. }
  331. }
  332. emailParseInfoMapper.updateParseStatus(emailId, emailParseStatus, failReason);
  333. }
  334. }
  335. private ParseResult<ReportData> parseReportAndHandleResult(String emailTitle, EmailFileInfoDO emailFileInfo, EmailZipFileDTO zipFile) {
  336. Integer emailType = zipFile.getEmailType();
  337. String fileName = zipFile.getFilename();
  338. String filepath = zipFile.getFilepath();
  339. ParseResult<ReportData> result = new ParseResult<>();
  340. boolean reportFlag = emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType);
  341. if (reportFlag || StrUtil.isBlank(fileName) || fileName.endsWith(".html")) {
  342. result.setStatus(ReportParseStatus.NOT_A_REPORT.getCode());
  343. result.setMsg(StrUtil.format(ReportParseStatus.NOT_A_REPORT.getMsg(), fileName));
  344. log.error(result.getMsg());
  345. return result;
  346. }
  347. // 基金代码、备案编码
  348. String registerNumber = ReportParseUtils.matchFundCode(fileName);
  349. // 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报
  350. ReportType reportType = ReportParseUtils.matchReportType(emailType, fileName);
  351. if (reportType == null) {
  352. reportType = ReportParseUtils.matchReportType(emailType, emailTitle);
  353. if (log.isDebugEnabled()) {
  354. log.debug("报告{} 根据邮件主题{} 重新识别的类型是:{}", fileName, emailTitle, reportType);
  355. }
  356. }
  357. // 解析器--根据文件后缀获取对应解析器,解析不了就用AI来解析
  358. ReportParserFileType fileType;
  359. String fileSuffix = StrUtil.subAfter(fileName, ".", true);
  360. fileType = ReportParserFileType.getBySuffix(fileSuffix);
  361. // 不支持的格式
  362. if (fileType == null) {
  363. result.setStatus(ReportParseStatus.NO_SUPPORT_TEMPLATE.getCode());
  364. result.setMsg(StrUtil.format(ReportParseStatus.NO_SUPPORT_TEMPLATE.getMsg(), fileName));
  365. log.error(result.getMsg());
  366. return result;
  367. }
  368. // 不是定期报告的判断逻辑放在不支持的格式下面
  369. if (reportType == null) {
  370. result.setStatus(ReportParseStatus.NOT_A_REPORT.getCode());
  371. result.setMsg(StrUtil.format(ReportParseStatus.NOT_A_REPORT.getMsg(), fileName));
  372. log.error(result.getMsg());
  373. return result;
  374. }
  375. Integer fileId = emailFileInfo.getId();
  376. String aiFileId = emailFileInfo.getAiFileId();
  377. // 不支持解析的格式文件
  378. boolean notSupportFile = false;
  379. // 解析报告
  380. ReportData reportData = null;
  381. StopWatch parserWatch = new StopWatch();
  382. parserWatch.start();
  383. try {
  384. if (StrUtil.isBlank(aiFileId) && reportType != ReportType.OTHER && reportType != ReportType.WEEKLY) {
  385. ReportParserParams params = ReportParserParams.builder().fileId(fileId).filename(fileName).filepath(filepath)
  386. .registerNumber(registerNumber).reportType(reportType).build();
  387. ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, fileType);
  388. reportData = instance.parse(params);
  389. result.setStatus(1);
  390. result.setMsg("报告解析成功");
  391. result.setData(reportData);
  392. } else {
  393. if (reportType == ReportType.OTHER || reportType == ReportType.WEEKLY) {
  394. if (log.isInfoEnabled()) {
  395. log.info("报告{} 是周报或其他类型,直接用AI解析器解析", fileName);
  396. }
  397. } else {
  398. if (log.isInfoEnabled()) {
  399. log.info("报告{} 是已经存在ai解析记录,上传过文件{},直接跳转到AI解析器进行解析", fileName, aiFileId);
  400. }
  401. }
  402. }
  403. } catch (ReportParseException e) {
  404. log.error("解析失败:{}", StrUtil.format(e.getMsg(), fileName));
  405. result.setStatus(e.getCode());
  406. result.setMsg(StrUtil.format(e.getMsg(), fileName));
  407. if (e instanceof NotSupportReportException) {
  408. notSupportFile = true;
  409. }
  410. } catch (Exception e) {
  411. log.error("解析错误:{}", ExceptionUtil.stacktraceToString(e));
  412. result.setStatus(ReportParseStatus.PARSE_FAIL.getCode());
  413. result.setMsg(StrUtil.format(ReportParseStatus.PARSE_FAIL.getMsg(), e.getMessage()));
  414. } finally {
  415. // 如果解析结果是空的就用AI工具解析一次
  416. if (reportData == null && !notSupportFile) {
  417. if (log.isInfoEnabled()) {
  418. log.info("报告{} 开始AI解析......", fileName);
  419. }
  420. ReportParserParams params = ReportParserParams.builder().fileId(fileId).filename(fileName).filepath(filepath)
  421. .registerNumber(registerNumber).reportType(reportType).aiFileId(aiFileId).build();
  422. ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, ReportParserFileType.AI);
  423. try {
  424. reportData = instance.parse(params);
  425. result.setStatus(1);
  426. result.setMsg("报告解析成功--AI");
  427. result.setData(reportData);
  428. } catch (ReportParseException e) {
  429. log.error("AI解析失败:{}", StrUtil.format(e.getMsg(), fileName));
  430. result.setStatus(e.getCode());
  431. result.setMsg(StrUtil.format(e.getMsg(), fileName));
  432. } catch (Exception e) {
  433. log.error("AI解析错误:{}", ExceptionUtil.stacktraceToString(e));
  434. result.setStatus(ReportParseStatus.PARSE_FAIL.getCode());
  435. result.setMsg(StrUtil.format(ReportParseStatus.PARSE_FAIL.getMsg(), e.getMessage()));
  436. }
  437. if (log.isInfoEnabled()) {
  438. log.info("报告{} AI解析结束!", fileName);
  439. }
  440. }
  441. parserWatch.stop();
  442. if (log.isInfoEnabled()) {
  443. log.info("报告{}解析结果为{},耗时{}ms", fileName, reportData, parserWatch.getTotalTimeMillis());
  444. }
  445. }
  446. // 保存报告解析结果
  447. if (reportData != null) {
  448. StopWatch writeWatch = new StopWatch();
  449. writeWatch.start();
  450. try {
  451. ReportWriter<ReportData> instance = this.reportWriterFactory.getInstance(reportType);
  452. instance.write(reportData);
  453. } catch (Exception e) {
  454. log.error("报告{}结果保存失败\n{}", fileName, ExceptionUtil.stacktraceToString(e));
  455. } finally {
  456. writeWatch.stop();
  457. if (log.isInfoEnabled()) {
  458. log.info("报告{}解析结果保存完成,耗时{}ms", fileName, writeWatch.getTotalTimeMillis());
  459. }
  460. }
  461. }
  462. return result;
  463. }
  464. private EmailFileInfoDO saveEmailFileInfo(Integer emailId, String fileName, String filePath) {
  465. EmailFileInfoDO emailFileInfoDO = buildEmailFileInfoDO(emailId, fileName, filePath);
  466. emailFileInfoDO.setAiFileId(null);
  467. if (emailFileInfoDO.getId() != null) {
  468. emailFileInfoMapper.updateTimeById(null, new Date());
  469. return emailFileInfoDO;
  470. }
  471. emailFileInfoMapper.insert(emailFileInfoDO);
  472. return emailFileInfoDO;
  473. }
  474. private EmailFileInfoDO buildEmailFileInfoDO(Integer emailId, String fileName, String filePath) {
  475. EmailFileInfoDO emailFileInfoDO = new EmailFileInfoDO();
  476. emailFileInfoDO.setId(null);
  477. emailFileInfoDO.setEmailId(emailId);
  478. emailFileInfoDO.setFileName(fileName);
  479. emailFileInfoDO.setFilePath(filePath);
  480. emailFileInfoDO.setIsvalid(1);
  481. emailFileInfoDO.setCreatorId(0);
  482. emailFileInfoDO.setCreateTime(new Date());
  483. emailFileInfoDO.setUpdaterId(0);
  484. emailFileInfoDO.setUpdateTime(new Date());
  485. return emailFileInfoDO;
  486. }
  487. private Integer saveEmailParseInfo(EmailParseInfoDO emailParseInfoDO) {
  488. if (emailParseInfoDO == null) {
  489. return null;
  490. }
  491. // 重新邮件功能 -> 修改解析时间和更新时间
  492. if (emailParseInfoDO.getId() != null) {
  493. emailParseInfoMapper.updateParseTime(emailParseInfoDO.getId(), emailParseInfoDO.getParseDate());
  494. return emailParseInfoDO.getId();
  495. }
  496. // // 根据邮件发送人、邮件地址、邮箱日期、主题找到是否已经存在的记录(不管是否成功),已存在就不解析了
  497. // EmailParseInfoDO temp = this.emailParseInfoMapper.searchEmail(emailParseInfoDO);
  498. // if (temp != null) {
  499. // return null;
  500. // }
  501. emailParseInfoMapper.insert(emailParseInfoDO);
  502. return emailParseInfoDO.getId();
  503. }
  504. private EmailParseInfoDO buildEmailParseInfo(Integer emailId, String emailAddress, EmailContentInfoDTO emailContentInfoDTO) {
  505. EmailParseInfoDO emailParseInfoDO = new EmailParseInfoDO();
  506. emailParseInfoDO.setId(emailId);
  507. emailParseInfoDO.setSenderEmail(emailContentInfoDTO.getSenderEmail());
  508. emailParseInfoDO.setEmail(emailAddress);
  509. emailParseInfoDO.setEmailDate(DateUtil.parse(emailContentInfoDTO.getEmailDate(), DateConst.YYYY_MM_DD_HH_MM_SS));
  510. emailParseInfoDO.setParseDate(emailContentInfoDTO.getParseDate() == null ? null : DateUtil.parseDate(emailContentInfoDTO.getParseDate()));
  511. emailParseInfoDO.setEmailTitle(emailContentInfoDTO.getEmailTitle());
  512. emailParseInfoDO.setEmailType(emailContentInfoDTO.getEmailType());
  513. emailParseInfoDO.setParseStatus(EmailParseStatusConst.SUCCESS);
  514. emailParseInfoDO.setAttrSize(emailContentInfoDTO.getFileSize());
  515. emailParseInfoDO.setIsvalid(1);
  516. emailParseInfoDO.setCreatorId(0);
  517. emailParseInfoDO.setCreateTime(new Date());
  518. emailParseInfoDO.setUpdaterId(0);
  519. emailParseInfoDO.setUpdateTime(new Date());
  520. return emailParseInfoDO;
  521. }
  522. /**
  523. * 读取邮件
  524. *
  525. * @param mailboxInfoDTO 邮箱配置信息
  526. * @param startDate 邮件起始日期
  527. * @param endDate 邮件截止日期(为null,将解析邮件日期小于等于startDate的当天邮件)
  528. * @return 读取到的邮件信息
  529. * @throws Exception 异常信息
  530. */
  531. private Map<String, List<EmailContentInfoDTO>> realEmail(MailboxInfoDTO mailboxInfoDTO,
  532. Date startDate, Date endDate,
  533. List<String> folderNames) throws Exception {
  534. if (CollUtil.isEmpty(folderNames)) {
  535. folderNames = ListUtil.toList("INBOX");
  536. }
  537. Store store = EmailUtil.getStoreNew(mailboxInfoDTO);
  538. if (store == null) {
  539. return MapUtil.newHashMap(4);
  540. }
  541. Map<String, List<EmailContentInfoDTO>> result = MapUtil.newHashMap(128);
  542. try {
  543. if (log.isInfoEnabled()) {
  544. Folder[] list = store.getDefaultFolder().list("*");
  545. List<String> names = Arrays.stream(list).map(Folder::getFullName).toList();
  546. log.info("获取所有邮箱文件夹:{}", names);
  547. }
  548. for (String folderName : folderNames) {
  549. try {
  550. Map<String, List<EmailContentInfoDTO>> temp = this.getFolderEmail(mailboxInfoDTO,
  551. startDate, endDate, store, folderName);
  552. if (MapUtil.isNotEmpty(temp)) {
  553. result.putAll(temp);
  554. }
  555. } catch (Exception e) {
  556. log.warn("文件夹{} 邮件获取失败:{}", folderName, ExceptionUtil.stacktraceToString(e));
  557. }
  558. }
  559. } catch (Exception e) {
  560. log.error("邮件获取失败:{}", ExceptionUtil.stacktraceToString(e));
  561. } finally {
  562. store.close();
  563. }
  564. return result;
  565. }
  566. private Map<String, List<EmailContentInfoDTO>> getFolderEmail(MailboxInfoDTO mailboxInfoDTO,
  567. Date startDate, Date endDate,
  568. Store store, String folderName) throws MessagingException {
  569. // 默认读取收件箱的邮件
  570. Folder folder = store.getFolder(folderName);
  571. folder.open(Folder.READ_ONLY);
  572. Message[] messages = getEmailMessage(folder, mailboxInfoDTO.getProtocol(), startDate);
  573. if (messages == null || messages.length == 0) {
  574. log.warn("{} 获取不到邮件 -> 邮箱信息:{},开始时间:{},结束时间:{}", folderName, mailboxInfoDTO, startDate, endDate);
  575. return MapUtil.newHashMap();
  576. }
  577. Map<String, List<EmailContentInfoDTO>> emailMessageMap = MapUtil.newHashMap();
  578. for (Message message : messages) {
  579. long start = System.currentTimeMillis();
  580. List<EmailContentInfoDTO> emailContentInfoDTOList = CollUtil.newArrayList();
  581. String uuidKey = UUID.randomUUID().toString().replaceAll("-", "");
  582. Integer emailType;
  583. String senderEmail;
  584. String emailTitle = null;
  585. try {
  586. emailTitle = message.getSubject();
  587. Date emailDate = message.getSentDate();
  588. String emailDateStr = DateUtil.format(emailDate, DateConst.YYYY_MM_DD_HH_MM_SS);
  589. if (log.isInfoEnabled()) {
  590. log.info("{} 邮件{} 数据获取中,邮件时间:{}", folderName, emailTitle, emailDateStr);
  591. }
  592. boolean isNotParseConditionSatisfied = emailDate == null
  593. || (endDate != null && emailDate.compareTo(endDate) > 0)
  594. || (startDate != null && emailDate.compareTo(startDate) < 0);
  595. if (isNotParseConditionSatisfied) {
  596. log.warn("{} 邮件{} 没有日期{} 或者 邮件日期不在区间内【{} ~ {}】", folderName, emailTitle, emailDate, startDate, endDate);
  597. continue;
  598. }
  599. senderEmail = getSenderEmail(message);
  600. emailType = EmailUtil.getEmailTypeBySubject(emailTitle);
  601. if (emailType == null) {
  602. log.warn("{} 邮件不满足解析条件 -> 邮件主题:{},邮件日期:{}", folderName, emailTitle, emailDateStr);
  603. continue;
  604. }
  605. if (log.isInfoEnabled()) {
  606. log.info("{} 邮件{} 基本信息获取完成,开始下载附件!邮件日期:{}", folderName, emailTitle, emailDateStr);
  607. }
  608. Object content = message.getContent();
  609. if (content instanceof Multipart multipart) {
  610. this.reMultipart(mailboxInfoDTO.getAccount(), emailTitle, emailDate, multipart, emailContentInfoDTOList);
  611. } else if (content instanceof Part part) {
  612. this.rePart(mailboxInfoDTO.getAccount(), emailTitle, emailDate, part, emailContentInfoDTOList);
  613. } else {
  614. log.warn("{} 不支持的邮件数据 {}", folderName, emailTitle);
  615. }
  616. if (CollUtil.isNotEmpty(emailContentInfoDTOList)) {
  617. emailContentInfoDTOList.forEach(e -> {
  618. e.setEmailType(emailType);
  619. e.setSenderEmail(senderEmail);
  620. });
  621. emailMessageMap.put(uuidKey, emailContentInfoDTOList);
  622. }
  623. if (log.isInfoEnabled() && emailTitle != null) {
  624. log.info("{} 邮件{} 下载完成,总计耗时{} ms,文件内容如下\n {}", folderName,
  625. emailTitle, System.currentTimeMillis() - start, emailContentInfoDTOList);
  626. }
  627. } catch (Exception e) {
  628. log.error("{} 获取邮箱的邮件{} 报错,堆栈信息:{}", folderName, emailTitle, ExceptionUtil.stacktraceToString(e));
  629. }
  630. }
  631. folder.close(false);
  632. return emailMessageMap;
  633. }
  634. private void rePart(String account, String subject, Date sendDate, Part part,
  635. List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
  636. String fileName = EmailUtil.decodeFileName(part);
  637. if (StrUtil.isBlank(fileName)) {
  638. log.warn("邮件{} 附件文件名是空的,不做下载!", subject);
  639. return;
  640. }
  641. if (fileName.contains("=?")) {
  642. fileName = MimeUtility.decodeText(fileName);
  643. }
  644. String disposition = part.getDisposition();
  645. String contentType = part.getContentType();
  646. boolean isAttachment = Part.ATTACHMENT.equalsIgnoreCase(disposition)
  647. || (contentType != null && attachmentMimePrefixes.stream().anyMatch(prefix ->
  648. StrUtil.startWithIgnoreCase(contentType, prefix)
  649. ));
  650. if (!isAttachment) {
  651. log.warn("邮件 {} 未检测到pdf/zip/rar类型的附件 (fileName={}, disposition={}, contentType={})",
  652. subject, fileName, disposition, contentType);
  653. return;
  654. }
  655. String emailDate = DateUtil.format(sendDate, DateConst.YYYYMMDDHHMMSS24);
  656. String emailDateStr = DateUtil.format(sendDate, DateConst.YYYYMMDD);
  657. String filePath = path + File.separator + account + File.separator + emailDateStr + File.separator;
  658. String realPath = filePath + emailDate + fileName;
  659. File saveFile = cn.hutool.core.io.FileUtil.file(realPath);
  660. if (!saveFile.exists()) {
  661. if (!saveFile.getParentFile().exists()) {
  662. boolean mkdirs = saveFile.getParentFile().mkdirs();
  663. if (!mkdirs) {
  664. log.warn("file path mkdir failed.");
  665. }
  666. }
  667. FileUtil.saveFile(saveFile, part);
  668. } else {
  669. cn.hutool.core.io.FileUtil.del(saveFile);
  670. FileUtil.saveFile(saveFile, part);
  671. }
  672. EmailContentInfoDTO emailContentInfoDTO = new EmailContentInfoDTO();
  673. emailContentInfoDTO.setFileName(fileName);
  674. emailContentInfoDTO.setFileSize(part.getSize());
  675. emailContentInfoDTO.setFilePath(saveFile.getAbsolutePath());
  676. emailContentInfoDTO.setEmailAddress(account);
  677. emailContentInfoDTO.setEmailTitle(subject);
  678. emailContentInfoDTO.setEmailDate(DateUtil.format(sendDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  679. emailContentInfoDTOList.add(emailContentInfoDTO);
  680. }
  681. private void reMultipart(String account, String subject, Date emailDate, Multipart multipart,
  682. List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
  683. for (int i = 0; i < multipart.getCount(); i++) {
  684. Part bodyPart = multipart.getBodyPart(i);
  685. Object content = bodyPart.getContent();
  686. if (content instanceof String) {
  687. if (log.isDebugEnabled()) {
  688. log.debug("邮件{} 获取的正文不做解析,内容是 {}", subject, content);
  689. }
  690. continue;
  691. }
  692. if (content instanceof Multipart mp) {
  693. this.reMultipart(account, subject, emailDate, mp, emailContentInfoDTOList);
  694. } else {
  695. this.rePart(account, subject, emailDate, bodyPart, emailContentInfoDTOList);
  696. }
  697. }
  698. }
  699. private String getSenderEmail(Message message) {
  700. Address[] senderAddress;
  701. try {
  702. senderAddress = message.getFrom();
  703. if (senderAddress == null || senderAddress.length == 0) {
  704. return null;
  705. }
  706. // 此时的address是含有编码(MIME编码方式)后的文本和实际的邮件地址
  707. String address = "";
  708. for (Address from : senderAddress) {
  709. if (StrUtil.isNotBlank(from.toString())) {
  710. address = from.toString();
  711. break;
  712. }
  713. }
  714. // 正则表达式匹配邮件地址
  715. Pattern pattern = Pattern.compile("<(\\S+)>");
  716. Matcher matcher = pattern.matcher(address);
  717. if (matcher.find()) {
  718. return matcher.group(1);
  719. }
  720. // //说明匹配不到,直接获取sender
  721. // Address sender = message.getSender();
  722. // if (sender == null) {
  723. // return address;
  724. // }
  725. // String senderEmail = sender.toString();
  726. // log.info("senderEmail:" + senderEmail + "====================");
  727. // if (senderEmail.contains("<") && senderEmail.contains(">") && senderEmail.indexOf("<") < senderEmail.indexOf(">")) {
  728. // senderEmail = senderEmail.substring(senderEmail.indexOf("<") + 1, senderEmail.length() - 1);
  729. // }
  730. // return senderEmail;
  731. } catch (MessagingException e) {
  732. log.error(e.getMessage(), e);
  733. }
  734. return null;
  735. }
  736. private Message[] getEmailMessage(Folder folder, String protocol, Date startDate) {
  737. try {
  738. if (protocol.contains("imap")) {
  739. // 获取邮件日期大于等于startDate的邮件(搜索条件只支持按天)
  740. SearchTerm startDateTerm = new ReceivedDateTerm(ComparisonTerm.GE, startDate);
  741. return folder.search(startDateTerm);
  742. } else {
  743. return folder.getMessages();
  744. }
  745. } catch (MessagingException e) {
  746. throw new RuntimeException(e);
  747. }
  748. }
  749. }