EmailParseService.java 61 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273
  1. package com.smppw.modaq.domain.service;
  2. import cn.hutool.core.collection.CollUtil;
  3. import cn.hutool.core.collection.ListUtil;
  4. import cn.hutool.core.date.DateUtil;
  5. import cn.hutool.core.exceptions.ExceptionUtil;
  6. import cn.hutool.core.io.FileUtil;
  7. import cn.hutool.core.map.MapUtil;
  8. import cn.hutool.core.util.IdUtil;
  9. import cn.hutool.core.util.StrUtil;
  10. import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
  11. import com.baomidou.mybatisplus.core.toolkit.Wrappers;
  12. import com.smppw.modaq.application.components.OCRReportParser;
  13. import com.smppw.modaq.application.components.ReportParseUtils;
  14. import com.smppw.modaq.application.components.report.parser.ReportParser;
  15. import com.smppw.modaq.application.components.report.parser.ReportParserFactory;
  16. import com.smppw.modaq.application.components.report.writer.ReportWriter;
  17. import com.smppw.modaq.application.components.report.writer.ReportWriterFactory;
  18. import com.smppw.modaq.application.util.EmailUtil;
  19. import com.smppw.modaq.common.conts.Constants;
  20. import com.smppw.modaq.common.conts.DateConst;
  21. import com.smppw.modaq.common.conts.EmailParseStatusConst;
  22. import com.smppw.modaq.common.conts.EmailTypeConst;
  23. import com.smppw.modaq.common.enums.ReportMonthlyType;
  24. import com.smppw.modaq.common.enums.ReportParseStatus;
  25. import com.smppw.modaq.common.enums.ReportParserFileType;
  26. import com.smppw.modaq.common.enums.ReportType;
  27. import com.smppw.modaq.common.exception.NotSupportReportException;
  28. import com.smppw.modaq.common.exception.ReportParseException;
  29. import com.smppw.modaq.domain.dto.*;
  30. import com.smppw.modaq.domain.dto.report.*;
  31. import com.smppw.modaq.domain.dto.report.ocr.OCRLetterParseData;
  32. import com.smppw.modaq.domain.dto.report.ocr.OCRParseData;
  33. import com.smppw.modaq.domain.entity.EmailFileInfoDO;
  34. import com.smppw.modaq.domain.entity.EmailParseInfoDO;
  35. import com.smppw.modaq.domain.entity.TgEmailConfigDO;
  36. import com.smppw.modaq.domain.mapper.EmailFileInfoMapper;
  37. import com.smppw.modaq.domain.mapper.EmailParseInfoMapper;
  38. import com.smppw.modaq.domain.mapper.TgEmailConfigMapper;
  39. import com.smppw.modaq.infrastructure.util.ArchiveUtil;
  40. import com.smppw.modaq.infrastructure.util.ConvertUtil;
  41. import com.smppw.modaq.infrastructure.util.PdfUtil;
  42. import jakarta.mail.*;
  43. import jakarta.mail.internet.MimeUtility;
  44. import jakarta.mail.search.ComparisonTerm;
  45. import jakarta.mail.search.ReceivedDateTerm;
  46. import jakarta.mail.search.SearchTerm;
  47. import org.slf4j.Logger;
  48. import org.slf4j.LoggerFactory;
  49. import org.springframework.beans.factory.annotation.Value;
  50. import org.springframework.http.MediaType;
  51. import org.springframework.stereotype.Service;
  52. import org.springframework.util.StopWatch;
  53. import java.io.File;
  54. import java.io.IOException;
  55. import java.io.InputStream;
  56. import java.nio.file.Files;
  57. import java.util.*;
  58. import java.util.regex.Matcher;
  59. import java.util.regex.Pattern;
  60. import java.util.stream.Collectors;
  61. /**
  62. * @author mozuwen
  63. * @date 2024-09-04
  64. * @description 邮件解析服务
  65. */
  66. @Service
  67. public class EmailParseService {
  // public static final int stepSize = 10000;
  private static final Logger log = LoggerFactory.getLogger(EmailParseService.class);
  // Constants: keyword sets are kept in one place.
  // Keywords that make determineReportType classify a monthly report as the AMAC version when
  // they occur in the file name, a path segment or the email subject.
  private static final Set<String> AMAC_KEYWORDS = Set.of("协会", "信披");
  // Path segments containing one of these keywords are ignored by the path-based check in
  // determineReportType.
  private static final Set<String> EXCLUDE_PATH_KEYWORDS = Set.of("公司及协会版", "公司和协会版");
  // Extended list of supported MIME types.
  // NOTE(review): presumably matched as prefixes against attachment content types; the usage is
  // not visible in this part of the file - confirm.
  private static final Set<String> attachmentMimePrefixes = Set.of(
      "application/pdf",
      "application/zip",
      "application/x-zip-compressed",
      "application/rar",
      "application/x-rar-compressed",
      "application/octet-stream"
      // Add further types as needed...
  );
  // Email addresses read from the TgEmailConfigDO table; filled by init().
  private static final List<String> TG_EMAIL_LIST = ListUtil.list(false);
  private final TgEmailConfigMapper tgEmailConfigMapper;
  private final EmailParseInfoMapper emailParseInfoMapper;
  private final EmailFileInfoMapper emailFileInfoMapper;
  /* Factories for report parsing and for persisting the parsed reports. */
  private final ReportParserFactory reportParserFactory;
  private final ReportWriterFactory reportWriterFactory;
  // Injected from the "email.file.path" property.
  @Value("${email.file.path}")
  private String path;
  // Injected from the "email.report.ocr-parser-url" property.
  @Value("${email.report.ocr-parser-url}")
  private String ocrParserUrl;
  // Injected from the "email.read-write-seen" property; defaults to true when the property is absent.
  @Value("${email.read-write-seen:true}")
  private boolean readWriteSeen;
  /**
   * Stores the injected mappers and factories and then calls {@link #init()} to load the
   * configured email addresses.
   *
   * @param tgEmailConfigMapper mapper queried by {@link #init()}
   * @param emailParseInfoMapper mapper for the email parse records
   * @param emailFileInfoMapper mapper for the email file records
   * @param reportParserFactory factory providing report parsers
   * @param reportWriterFactory factory providing report writers
   */
  public EmailParseService(TgEmailConfigMapper tgEmailConfigMapper,
      EmailParseInfoMapper emailParseInfoMapper,
      EmailFileInfoMapper emailFileInfoMapper,
      ReportParserFactory reportParserFactory,
      ReportWriterFactory reportWriterFactory) {
    // Factories first, mappers second; init() needs tgEmailConfigMapper, so it runs last.
    this.reportWriterFactory = reportWriterFactory;
    this.reportParserFactory = reportParserFactory;
    this.emailFileInfoMapper = emailFileInfoMapper;
    this.emailParseInfoMapper = emailParseInfoMapper;
    this.tgEmailConfigMapper = tgEmailConfigMapper;
    init();
  }
  108. public void init() {
  109. LambdaQueryWrapper<TgEmailConfigDO> wrapper = Wrappers.lambdaQuery(TgEmailConfigDO.class);
  110. List<TgEmailConfigDO> dataList = this.tgEmailConfigMapper.selectList(wrapper);
  111. for (TgEmailConfigDO temp : dataList) {
  112. TG_EMAIL_LIST.add(temp.getEmail());
  113. }
  114. }
  115. /**
  116. * 解析指定邮箱指定时间范围内的邮件
  117. *
  118. * @param mailboxInfoDTO 邮箱配置信息
  119. * @param startDate 邮件起始日期(yyyy-MM-dd HH:mm:ss)
  120. * @param endDate 邮件截止日期(yyyy-MM-dd HH:mm:ss, 为null,将解析邮件日期小于等于startDate的当天邮件)
  121. * @param emailTypes 当前任务支持的邮件类型,默认支持确认单
  122. */
  123. public void parseEmail(MailboxInfoDTO mailboxInfoDTO,
  124. Date startDate, Date endDate,
  125. List<String> folderNames,
  126. List<Integer> emailTypes) {
  127. if (CollUtil.isEmpty(emailTypes)) {
  128. emailTypes = ListUtil.of(EmailTypeConst.REPORT_LETTER_EMAIL_TYPE);
  129. }
  130. if (log.isInfoEnabled()) {
  131. log.info("开始邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO, DateUtil.format(startDate,
  132. DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  133. }
  134. Map<String, List<EmailContentInfoDTO>> emailContentMap;
  135. try {
  136. emailContentMap = this.realEmail(mailboxInfoDTO, startDate, endDate, folderNames);
  137. } catch (Exception e) {
  138. log.error("采集邮件失败 -> 邮箱配置信息:{},堆栈信息:{}", mailboxInfoDTO, ExceptionUtil.stacktraceToString(e));
  139. return;
  140. }
  141. if (MapUtil.isEmpty(emailContentMap)) {
  142. log.warn("未采集到邮件 -> 邮箱配置信息:{},开始时间:{},结束时间:{}", mailboxInfoDTO,
  143. DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  144. return;
  145. }
  146. for (Map.Entry<String, List<EmailContentInfoDTO>> emailEntry : emailContentMap.entrySet()) {
  147. List<EmailContentInfoDTO> emailContentInfoDTOList = emailEntry.getValue();
  148. if (CollUtil.isEmpty(emailContentInfoDTOList)) {
  149. log.warn("未采集到正文或附件");
  150. continue;
  151. }
  152. EmailContentInfoDTO dto = emailContentInfoDTOList.get(0);
  153. String emailTitle = dto.getEmailTitle();
  154. if (log.isInfoEnabled()) {
  155. log.info("开始解析邮件数据 -> 邮件主题:{},邮件日期:{}", emailTitle, dto.getEmailDate());
  156. }
  157. Long totalSize = emailContentInfoDTOList.stream().map(EmailContentInfoDTO::getFileSize).reduce(0L, Long::sum);
  158. String errMsg = null;
  159. int status = 1;
  160. List<EmailZipFileDTO> emailFileList = ListUtil.list(false);
  161. EmailInfoDTO emailInfo = new EmailInfoDTO(dto, emailFileList);
  162. if (dto.getEmailContent() != null && dto.getEmailContent().contains("超大附件列表")) {
  163. status = 0;
  164. errMsg = "邮件中存在超大附件,需要手动处理该邮件";
  165. } else {
  166. for (EmailContentInfoDTO emailDto : emailContentInfoDTOList) {
  167. // 正文不用解压附件
  168. if (emailDto.getFileName() != null && emailDto.getFileName().endsWith(Constants.FILE_HTML)) {
  169. continue;
  170. }
  171. try {
  172. emailFileList.addAll(this.parseZipEmail(emailDto));
  173. } catch (IOException e) {
  174. log.error("邮件{} 压缩包解压失败:{}", emailTitle, ExceptionUtil.stacktraceToString(e));
  175. status = 0;
  176. errMsg = "压缩包解压失败";
  177. } catch (Exception e) {
  178. log.error("邮件{} 堆栈信息:{}", emailTitle, ExceptionUtil.stacktraceToString(e));
  179. status = 0;
  180. errMsg = "内部错误";
  181. }
  182. }
  183. // 重新判断类型
  184. this.recheckEmailType(emailTitle, emailFileList);
  185. Iterator<EmailZipFileDTO> entryIterator = emailFileList.iterator();
  186. while (entryIterator.hasNext()) {
  187. EmailZipFileDTO entry = entryIterator.next();
  188. if (!emailTypes.contains(entry.getEmailType())) {
  189. log.warn("当前邮件{} 中的报告{} 的类型{} 不在支持的任务类型{} 中,不用执行解析逻辑。",
  190. entry.getEmailTitle(), entry.getFilename(), entry.getEmailType(), emailTypes);
  191. entryIterator.remove();
  192. }
  193. ReportParserFileType fileType = ReportParserFileType.getBySuffix(entry.getExtName());
  194. if (fileType == null) {
  195. log.warn("当前邮件{} 中的文件{} 是不支持的文件格式{} 中,不用执行解析逻辑。",
  196. entry.getEmailTitle(), entry.getFilepath(), entry.getExtName());
  197. entryIterator.remove();
  198. }
  199. }
  200. }
  201. // 保存邮件信息
  202. EmailParseInfoDO emailDo = this.buildEmailParseInfo(mailboxInfoDTO.getAccount(), emailInfo, totalSize);
  203. emailDo.setEmailKey(emailEntry.getKey());
  204. emailDo.setParseStatus(status);
  205. emailDo.setFailReason(errMsg);
  206. Integer emailId = this.saveEmailParseInfo(emailDo);
  207. // 保存附件(解压后的)
  208. for (EmailZipFileDTO zipFile : emailFileList) {
  209. EmailFileInfoDO emailFile = this.saveEmailFileInfo(emailId, zipFile.getFilename(), zipFile.getFilepath());
  210. zipFile.setFileId(emailFile.getId());
  211. }
  212. if (CollUtil.isNotEmpty(emailFileList)) {
  213. // 保存相关信息 -> 邮件信息表,邮件文件表,邮件净值表,邮件规模表,基金净值表
  214. this.saveRelatedTable(emailId, emailInfo);
  215. log.info("结束邮件解析 -> 邮箱信息:{},开始时间:{},结束时间:{}", emailEntry.getValue(),
  216. DateUtil.format(startDate, DateConst.YYYY_MM_DD_HH_MM_SS), DateUtil.format(endDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  217. }
  218. }
  219. }
  220. /**
  221. * 解压压缩包,如果不是压缩包需转换
  222. *
  223. * @param emailContentInfoDTO 邮件信息
  224. * @return 解压后的文件列表
  225. * @throws IOException /
  226. */
  227. public List<EmailZipFileDTO> parseZipEmail(EmailContentInfoDTO emailContentInfoDTO) throws IOException {
  228. List<EmailZipFileDTO> resultList = ListUtil.list(false);
  229. Integer emailType = emailContentInfoDTO.getEmailType();
  230. String filepath = emailContentInfoDTO.getFilePath();
  231. String emailTitle = emailContentInfoDTO.getEmailTitle();
  232. if (ArchiveUtil.isArchive(filepath)) {
  233. this.handleCompressedFiles(emailTitle, filepath, emailType, resultList);
  234. } else {
  235. // 不是压缩包时
  236. EmailZipFileDTO dto = new EmailZipFileDTO(emailTitle, emailContentInfoDTO);
  237. resultList.add(dto);
  238. }
  239. // 文件中的类型判断
  240. if (emailType == null || !EmailTypeConst.SUPPORT_NO_OTHER_TYPES.contains(emailType)) {
  241. emailType = EmailUtil.getEmailTypeBySubject(emailContentInfoDTO.getFileName());
  242. emailContentInfoDTO.setEmailType(emailType);
  243. }
  244. if (CollUtil.isNotEmpty(resultList)) {
  245. for (EmailZipFileDTO dto : resultList) {
  246. dto.setEmailType(emailType);
  247. }
  248. }
  249. return resultList;
  250. }
  251. /**
  252. * 解压压缩包并把压缩包里面的所有文件放在resultList中
  253. *
  254. * @param emailTitle 邮件主题
  255. * @param filepath 压缩包路径
  256. * @param emailType 邮件解析类型
  257. * @param resultList 解压结果列表
  258. * @throws IOException /
  259. */
  260. private void handleCompressedFiles(String emailTitle,
  261. String filepath,
  262. Integer emailType,
  263. List<EmailZipFileDTO> resultList) throws IOException {
  264. if (!ArchiveUtil.isArchive(filepath)) {
  265. return;
  266. }
  267. String output = filepath.replaceAll("original", "archive");
  268. String destPath = FileUtil.getParent(output, 1) + File.separator + FileUtil.mainName(output);
  269. File destFile = FileUtil.file(destPath);
  270. if (!destFile.exists()) {
  271. if (!destFile.mkdirs()) {
  272. throw new IOException("无法创建目标目录: " + destPath);
  273. }
  274. }
  275. List<String> extractedDirs;
  276. if (ArchiveUtil.isZip(filepath)) {
  277. extractedDirs = ArchiveUtil.extractCompressedFiles(filepath, destPath);
  278. } else if (ArchiveUtil.isRAR(filepath) || ArchiveUtil.is7z(filepath)) {
  279. // 7z和rar压缩包解压
  280. extractedDirs = ArchiveUtil.extractRar5(filepath, destPath);
  281. } else {
  282. return;
  283. }
  284. for (String dir : extractedDirs) {
  285. // 如果邮件类型不满足解析条件则重新根据文件名判断
  286. if (emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType)) {
  287. emailType = EmailUtil.getEmailTypeBySubject(dir);
  288. }
  289. File file = new File(dir);
  290. if (file.isDirectory()) {
  291. String[] subDirs = file.list();
  292. if (subDirs != null) {
  293. for (String subDir : subDirs) {
  294. resultList.add(new EmailZipFileDTO(emailTitle, subDir, emailType));
  295. }
  296. } else {
  297. log.warn("目录 {} 下无文件", dir);
  298. }
  299. } else {
  300. resultList.add(new EmailZipFileDTO(emailTitle, dir, emailType));
  301. }
  302. }
  303. }
  304. /**
  305. * 邮件附件解析并保存结果数据
  306. *
  307. * @param emailId 邮件数据ID
  308. * @param emailInfo 邮件信息
  309. */
  310. public void saveRelatedTable(Integer emailId, EmailInfoDTO emailInfo) {
  311. // 解析并保存数据
  312. List<ParseResult<ReportData>> dataList = ListUtil.list(true);
  313. this.parseAndUpdateResult(emailId, emailInfo, dataList);
  314. }
  315. private List<EmailFileInfoDO> buildEmailFileInfo(List<ParseResult<ReportData>> dataList) {
  316. List<EmailFileInfoDO> entityList = ListUtil.list(false);
  317. for (ParseResult<ReportData> result : dataList) {
  318. EmailFileInfoDO entity = new EmailFileInfoDO();
  319. entity.setId(result.getData().getBaseInfo().getFileId());
  320. entity.setParseStatus(result.getStatus());
  321. entity.setFailReason(result.getMsg());
  322. entity.setAiParse(result.getData().getAiParse());
  323. entityList.add(entity);
  324. }
  325. return entityList;
  326. }
  327. /**
  328. * 上传文件解析并返回解析状态
  329. *
  330. * @param params 上传文件路径
  331. * @return /
  332. */
  333. public List<UploadReportResult> uploadReportResults(UploadReportParams params) {
  334. List<ParseResult<ReportData>> dataList = ListUtil.list(false);
  335. String emailTitle = params.getTitle();
  336. List<UploadReportParams.ReportInfo> reportInfos = params.getReportInfos();
  337. List<EmailZipFileDTO> dtos = ListUtil.list(false);
  338. for (UploadReportParams.ReportInfo e : reportInfos) {
  339. String reportPath = e.getReportPath();
  340. if (ArchiveUtil.isArchive(reportPath)) {
  341. try {
  342. this.handleCompressedFiles(emailTitle, reportPath, e.getReportType(), dtos);
  343. } catch (Exception ex) {
  344. log.warn("报告{} 压缩包解压失败:{}", reportPath, ExceptionUtil.stacktraceToString(ex));
  345. ReportData reportData = new ReportData.DefaultReportData();
  346. dataList.add(new ParseResult<>(ReportParseStatus.ARCHIVE_FAIL, reportData));
  347. }
  348. } else {
  349. dtos.add(new EmailZipFileDTO(emailTitle, reportPath, e.getReportType()));
  350. }
  351. }
  352. // 重新判断类型
  353. this.recheckEmailType(emailTitle, dtos);
  354. EmailInfoDTO emailInfo = new EmailInfoDTO(emailTitle, dtos);
  355. Long totalSize = dtos.stream().map(EmailZipFileDTO::getFileSize).reduce(0L, Long::sum);
  356. EmailParseInfoDO emailDo = this.buildEmailParseInfo("upload", emailInfo, totalSize);
  357. Integer emailId = this.saveEmailParseInfo(emailDo);
  358. for (EmailZipFileDTO zipFile : dtos) {
  359. EmailFileInfoDO emailFile = this.saveEmailFileInfo(emailId, zipFile.getFilename(), zipFile.getFilepath());
  360. zipFile.setFileId(emailFile.getId());
  361. }
  362. // 解析并处理解析结果
  363. this.parseAndUpdateResult(emailId, emailInfo, dataList);
  364. // 解析结果转换
  365. List<UploadReportResult> resultList = ListUtil.list(false);
  366. for (ParseResult<ReportData> result : dataList) {
  367. ReportData data = result.getData();
  368. resultList.add(new UploadReportResult(data.getBaseInfo().getFileId(),
  369. data.getBaseInfo().getReportName(), result.getStatus(), result.getMsg()));
  370. }
  371. return resultList;
  372. }
  373. private void parseAndUpdateResult(Integer emailId, EmailInfoDTO emailInfo, List<ParseResult<ReportData>> dataList) {
  374. this.parseResults(emailInfo, dataList);
  375. String failReason = null;
  376. int emailParseStatus = EmailParseStatusConst.SUCCESS;
  377. // 报告邮件有一条失败就表示整个邮件解析失败
  378. if (CollUtil.isNotEmpty(dataList)) {
  379. List<EmailFileInfoDO> entityList = this.buildEmailFileInfo(dataList);
  380. this.emailFileInfoMapper.batchUpdateByFileId(entityList);
  381. long failNum = dataList.stream().filter(e -> !Objects.equals(EmailParseStatusConst.SUCCESS, e.getStatus())).count();
  382. if (failNum > 0) {
  383. emailParseStatus = EmailParseStatusConst.FAIL;
  384. failReason = dataList.stream().map(ParseResult::getMsg).collect(Collectors.joining(";"));
  385. }
  386. }
  387. this.emailParseInfoMapper.updateParseStatus(emailId, emailParseStatus, failReason);
  388. }
  389. /**
  390. * 重新校验邮件附件的类型(用邮件主题+附件名称)
  391. *
  392. * @param emailTitle 邮件主题
  393. * @param dtos 所有附件
  394. */
  395. private void recheckEmailType(String emailTitle, List<EmailZipFileDTO> dtos) {
  396. for (EmailZipFileDTO emailFile : dtos) {
  397. if (EmailTypeConst.SUPPORT_NO_OTHER_TYPES.contains(emailFile.getEmailType())) {
  398. continue;
  399. }
  400. Integer type = EmailUtil.getEmailTypeBySubject(emailTitle + emailFile.getFilename());
  401. // 特殊月报
  402. if ((Objects.equals(EmailTypeConst.NAV_EMAIL_TYPE, type) || Objects.equals(EmailTypeConst.REPORT_OTHER_TYPE, type))
  403. && ReportParseUtils.containsAny(emailTitle, ReportParseUtils.MONTHLY_REPORT_KEYWORDS)) {
  404. type = EmailTypeConst.REPORT_EMAIL_TYPE;
  405. }
  406. if (EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(type)) {
  407. emailFile.setEmailType(type);
  408. }
  409. }
  410. }
  411. /**
  412. * 邮件信息前置处理,在解析操作执行之前的过滤逻辑和校验逻辑。返回所有附件大小汇总
  413. *
  414. * @param emailTitle 邮件信息(包含所有解压后的文件)
  415. * @param dtos 邮件信息(包含所有解压后的文件)
  416. */
  417. private void checkEmailFileInfo(String emailTitle, List<EmailZipFileDTO> dtos) {
  418. // 如果压缩包里面既有pdf又有其他格式的文件,说明其他格式的文件是不需要解析的
  419. List<String> exts = dtos.stream().map(EmailZipFileDTO::getExtName).distinct().toList();
  420. if (exts.contains(Constants.FILE_PDF) && exts.size() > 1) {
  421. dtos.removeIf(e -> !Objects.equals(Constants.FILE_PDF, e.getExtName()));
  422. }
  423. // 移除逻辑
  424. Iterator<EmailZipFileDTO> removeIterator = dtos.iterator();
  425. while (removeIterator.hasNext()) {
  426. EmailZipFileDTO dto = removeIterator.next();
  427. String filename = dto.getFilename();
  428. // 删除复核函或基金合同
  429. if (filename.contains("复核函") || (filename.contains("基金合同") && !filename.contains("合同变更"))) {
  430. log.warn("邮件{} 中的报告{} 是复核函或基金合同,不用解析上传。", emailTitle, filename);
  431. removeIterator.remove();
  432. }
  433. // 不支持的类型
  434. Integer type = dto.getEmailType();
  435. if (!EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(type)) {
  436. log.info("邮件{} 类型{} 不支持解析。", emailTitle, type);
  437. removeIterator.remove();
  438. }
  439. }
  440. // 数据库已存在的数据过滤(邮件主题+报告名称+附件大小,压缩包文件大小汇总)
  441. long totalSize = dtos.stream().map(EmailZipFileDTO::getFileSize).reduce(0L, Long::sum);
  442. Iterator<EmailZipFileDTO> iterator = dtos.iterator();
  443. while (iterator.hasNext()) {
  444. EmailZipFileDTO dto = iterator.next();
  445. String filename = dto.getFilename();
  446. Integer type = dto.getEmailType();
  447. int count = 0;
  448. if (Objects.equals(type, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) {
  449. // 确认单
  450. count = this.emailFileInfoMapper.getLetterFilenameSuccessCount(emailTitle, filename);
  451. } else if (Objects.equals(type, EmailTypeConst.REPORT_EMAIL_TYPE)) {
  452. // 定期报告
  453. count = this.emailFileInfoMapper.getAmacFilenameSuccessCount(emailTitle, filename, totalSize);
  454. } else if (Objects.equals(type, EmailTypeConst.REPORT_WEEKLY_TYPE)) {
  455. // 管理人周报
  456. count = this.emailFileInfoMapper.getWeeklyFilenameSuccessCount(emailTitle, filename, totalSize);
  457. } else if (Objects.equals(type, EmailTypeConst.REPORT_OTHER_TYPE)) {
  458. // 其他报告
  459. count = this.emailFileInfoMapper.getOtherFilenameSuccessCount(emailTitle, filename, totalSize);
  460. }
  461. if (count > 0) {
  462. iterator.remove();
  463. log.info("邮件{} 报告{} 已存在解析成功的记录,不用重新解析。", emailTitle, filename);
  464. }
  465. }
  466. if (CollUtil.isEmpty(dtos)) {
  467. log.info("邮件{} 所有文件都已经解析成功过,不能重复解析了", emailTitle);
  468. return;
  469. }
  470. if (log.isInfoEnabled()) {
  471. log.info("邮件{} 还有报告待解析:\n{}", emailTitle, dtos);
  472. }
  473. }
  474. /**
  475. * 邮件信息保存+附件解析
  476. *
  477. * @param emailInfo 邮件信息,包含附件
  478. * @param resultList 解析结果
  479. */
  480. private void parseResults(EmailInfoDTO emailInfo,
  481. List<ParseResult<ReportData>> resultList) {
  482. String emailTitle = emailInfo.getEmailTitle();
  483. List<EmailZipFileDTO> dtos = ListUtil.toList(emailInfo.getEmailFileList());
  484. if (CollUtil.isEmpty(dtos)) {
  485. return;
  486. }
  487. // 附件文件检查
  488. this.checkEmailFileInfo(emailTitle, dtos);
  489. // 解析邮件报告
  490. for (EmailZipFileDTO zipFile : dtos) {
  491. // 解析并保存报告
  492. ParseResult<ReportData> parseResult = this.parseReportAndHandleResult(emailTitle, emailInfo.getSenderEmail(), zipFile);
  493. if (!Objects.equals(1, parseResult.getStatus())) {
  494. log.error(parseResult.getMsg());
  495. }
  496. if (parseResult.getData() == null) {
  497. parseResult.setData(new ReportData.DefaultReportData());
  498. }
  499. resultList.add(parseResult);
  500. }
  501. }
  /**
   * Parses one report file and stores the parse result.
   *
   * <p>Flow: validate type and file name, detect the report type (file name first, then email
   * subject), prepare images for OCR, run the template parser for AMAC reports and letters, fall
   * back to the AI parser when that produced no data, enrich the data via OCR and finally save
   * it.
   *
   * @param emailTitle email subject
   * @param senderEmail sender address, passed on to the OCR enrichment step
   * @param zipFile path information of the current report
   * @return the parse result; {@code NOT_A_REPORT} or {@code NO_SUPPORT_TEMPLATE} results are
   *     returned early without parsing or saving
   */
  private ParseResult<ReportData> parseReportAndHandleResult(String emailTitle,
      String senderEmail,
      EmailZipFileDTO zipFile) {
    Integer fileId = zipFile.getFileId();
    Integer emailType = zipFile.getEmailType();
    String fileName = zipFile.getFilename();
    String filepath = zipFile.getFilepath();
    ParseResult<ReportData> result = new ParseResult<>();
    // Reject files with a missing/unsupported email type, a blank name, or an HTML body file.
    boolean reportFlag = emailType == null || !EmailTypeConst.SUPPORT_EMAIL_TYPES.contains(emailType);
    if (reportFlag || StrUtil.isBlank(fileName) || fileName.endsWith(Constants.FILE_HTML)) {
      return new ParseResult<>(ReportParseStatus.NOT_A_REPORT, null, fileName);
    }
    // Type detection --- per the original note: quarterly first, then annual, monthly last.
    // The file name is tried first; the email subject is the fallback.
    ReportType reportType = ReportParseUtils.matchReportType(emailType, fileName);
    if (reportType == null) {
      reportType = ReportParseUtils.matchReportType(emailType, emailTitle);
      if (log.isDebugEnabled()) {
        log.debug("报告{} 根据邮件主题{} 重新识别的类型是:{}", fileName, emailTitle, reportType);
      }
    }
    // Parser selection -- by file suffix; the AI parser is used when the regular one cannot parse.
    ReportParserFileType fileType = ReportParserFileType.getBySuffix(zipFile.getExtName());
    // Unsupported file format.
    if (fileType == null) {
      ReportData reportData = this.buildNvlReportData(fileId, reportType, null, fileName);
      return new ParseResult<>(ReportParseStatus.NO_SUPPORT_TEMPLATE, reportData, fileName);
    }
    // The "not a periodic report" check deliberately comes after the unsupported-format check.
    if (reportType == null) {
      ReportData reportData = this.buildNvlReportData(fileId, ReportType.OTHER, null, fileName);
      return new ParseResult<>(ReportParseStatus.NOT_A_REPORT, reportData, fileName);
    }
    // docx -> pdf. On failure the original path is kept and parsing continues.
    // fileType is not updated here, so the converted file does not enter the PDF image branch.
    if (Objects.equals(ReportParserFileType.WORD, fileType)) {
      try {
        String outputFile = FileUtil.getParent(filepath, 1) + File.separator + FileUtil.mainName(fileName) + ".pdf";
        PdfUtil.convertDocxToPdf(filepath, outputFile);
        filepath = outputFile;
      } catch (Exception e) {
        log.warn("报告{} 转换为pdf失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
      }
    }
    // First and last page are rendered to PNG: the first page is used to recognise fund name and
    // fund code, the last page to recognise the seal and the contact person.
    List<String> images = ListUtil.list(true);
    if (Objects.equals(ReportParserFileType.PDF, fileType)) {
      try {
        // Images are written next to the source, under an "image" path instead of
        // "archive"/"original". 300 is presumably the render DPI - TODO confirm.
        String output = filepath.replaceAll("archive|original", "image");
        File outputFile = FileUtil.file(FileUtil.getParent(output, 1));
        images = PdfUtil.convertFirstAndLastPagesToPng(filepath, outputFile, 300, zipFile.getPdfPwd());
        if (log.isDebugEnabled()) {
          log.debug("报告{} 生成的图片地址是:\n{}", fileName, images);
        }
      } catch (Exception e) {
        log.warn("报告{} 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
      }
    } else if (Objects.equals(ReportParserFileType.IMG, fileType)) {
      try {
        String outputFile = PdfUtil.compressAndSave(filepath);
        images.add(outputFile);
      } catch (IOException e) {
        log.error("报告{} 图片压缩失败,{}", fileName, ExceptionUtil.stacktraceToString(e));
      }
    }
    // Decide whether a monthly report is the manager version or the AMAC version.
    ReportMonthlyType monthlyType = ReportMonthlyType.NO_NEED;
    if (ReportType.MONTHLY == reportType) {
      monthlyType = this.determineReportType(emailTitle, fileName, filepath, images);
    }
    // Annual and quarterly reports always count as AMAC; monthly ones only in the AMAC version.
    boolean isAmac = reportType == ReportType.ANNUALLY || reportType == ReportType.QUARTERLY
        || (reportType == ReportType.MONTHLY && ReportMonthlyType.AMAC == monthlyType);
    // Set when the parser reports a file format it does not support; suppresses the AI fallback.
    boolean notSupportFile = false;
    // Parse the report.
    ReportData reportData = null;
    ReportParserParams params = new ReportParserParams(fileId, fileName, filepath, reportType);
    long start = System.currentTimeMillis();
    try {
      // Only AMAC reports and letters go through the template parser; everything else leaves
      // reportData null and is handled by the AI parser in the finally block.
      if (isAmac || reportType == ReportType.LETTER) {
        ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, fileType);
        reportData = instance.parse(params);
        result = new ParseResult<>(1, "报告解析成功", reportData);
      }
    } catch (ReportParseException e) {
      result = new ParseResult<>(e.getCode(), StrUtil.format(e.getMsg(), fileName), null);
      log.warn("解析失败:{}", result.getMsg());
      if (e instanceof NotSupportReportException) {
        notSupportFile = true;
      }
    } catch (Exception e) {
      log.warn("解析错误:{}", ExceptionUtil.stacktraceToString(e));
      result = new ParseResult<>(ReportParseStatus.PARSE_FAIL, null, e.getMessage());
    } finally {
      // If there is no parse result yet, try once with the AI parser.
      if (reportData == null && !notSupportFile) {
        if (log.isInfoEnabled()) {
          log.info("报告{} 是周报或管理人月报或其他类型或解析失败,用AI解析器解析", fileName);
        }
        try {
          // Non-AMAC reports are parsed from the first generated image when one exists.
          if (!isAmac && CollUtil.isNotEmpty(images)) {
            filepath = images.get(0);
          }
          params = new ReportParserParams(fileId, fileName, filepath, reportType);
          ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, ReportParserFileType.AI);
          reportData = instance.parse(params);
          result = new ParseResult<>(1, "报告解析成功--AI", reportData);
        } catch (ReportParseException e) {
          result = new ParseResult<>(e.getCode(), StrUtil.format(e.getMsg(), fileName), null);
          log.warn("AI解析失败:{}", result.getMsg());
        } catch (Exception e) {
          log.warn("AI解析错误:{}", ExceptionUtil.stacktraceToString(e));
          result = new ParseResult<>(ReportParseStatus.PARSE_FAIL, null, e.getMessage());
        }
      }
      // Build a replacement reportData object when reportData is still null.
      reportData = this.buildNvlReportData(fileId, reportType, reportData, fileName);
      if (reportData.getBaseInfo() != null) {
        // Record the monthly report type.
        reportData.getBaseInfo().setMonthlyType(monthlyType.getType());
        // If the report date is still missing, default it to yesterday.
        if (reportData.getBaseInfo().getReportDate() == null) {
          reportData.getBaseInfo().setReportDate(DateUtil.offsetDay(new Date(), -1));
        }
      }
      // OCR extraction (seal, contact person, fund name and product code).
      reportData = this.ocrReportData(fileId, reportType, monthlyType, reportData, fileName, senderEmail, images);
      result.setData(reportData);
      if (log.isInfoEnabled()) {
        log.info("报告{} 解析耗时{}ms,结果是:{}", fileName, (System.currentTimeMillis() - start), reportData);
      }
    }
    // Store the parsed report data.
    this.saveReportData(reportData, reportType, fileName);
    return result;
  }
  643. /**
  644. * 判断月报类型(管理人版还是协会版)
  645. *
  646. * @param emailTitle 邮件主题
  647. * @param fileName 报告名称
  648. * @param filepath 报告路径
  649. * @param images 报告的第一页和尾页图片地址(主要用于ocr提取关键信息)
  650. */
  651. public ReportMonthlyType determineReportType(String emailTitle, String fileName,
  652. String filepath, List<String> images) {
  653. // 1. 优先根据文件名判断
  654. if (ReportParseUtils.containsAny(fileName, AMAC_KEYWORDS)) {
  655. return ReportMonthlyType.AMAC;
  656. }
  657. if (ReportParseUtils.containsAny(fileName, ReportParseUtils.MANAGER_KEYWORDS)) {
  658. return ReportMonthlyType.MANAGER;
  659. }
  660. // if (StrUtil.isNotBlank(ReportParseUtils.matchFundCode(fileName))) {
  661. // return ReportMonthlyType.AMAC;
  662. // }
  663. // 2. 根据文件路径判断
  664. List<String> pathSegments = StrUtil.split(filepath, File.separator);
  665. for (String segment : pathSegments) {
  666. boolean isExcluded = ReportParseUtils.containsAny(segment, EXCLUDE_PATH_KEYWORDS);
  667. if (!isExcluded && ReportParseUtils.containsAny(segment, AMAC_KEYWORDS)) {
  668. return ReportMonthlyType.AMAC;
  669. }
  670. if (!isExcluded && ReportParseUtils.containsAny(segment, ReportParseUtils.MANAGER_KEYWORDS)) {
  671. return ReportMonthlyType.MANAGER;
  672. }
  673. }
  674. // 3. 根据邮件主题判断
  675. boolean isAmacEmail = ReportParseUtils.containsAny(emailTitle, AMAC_KEYWORDS)
  676. && !emailTitle.contains("公司及协会版");
  677. if (isAmacEmail) {
  678. return ReportMonthlyType.AMAC;
  679. }
  680. if (ReportParseUtils.containsAny(emailTitle, ReportParseUtils.MANAGER_KEYWORDS)) {
  681. return ReportMonthlyType.MANAGER;
  682. }
  683. // 4.ocr 提取“曲线”、“基金份额”等关键字,如果有曲线则是管理人,如果有估值日期则是协会
  684. if (CollUtil.isNotEmpty(images)) {
  685. try {
  686. return new OCRReportParser().parseMonthlyType(fileName, this.ocrParserUrl, images.get(0));
  687. } catch (Exception ignored) {
  688. return ReportMonthlyType.FAILED;
  689. }
  690. }
  691. return ReportMonthlyType.FAILED;
  692. }
  693. /**
  694. * ocr 提取信息(包括首页的基金名称或报告日期,尾页的印章或联系人等信息)
  695. *
  696. * @param fileId 文件表ID
  697. * @param reportType 报告类型
  698. * @param monthlyType 月报类型
  699. * @param reportData 报告解析结果
  700. * @param fileName 报告名称
  701. * @param senderEmail 邮件发送人邮箱
  702. * @param images 报告的收益和尾页png图片
  703. */
  704. private ReportData ocrReportData(Integer fileId,
  705. ReportType reportType,
  706. ReportMonthlyType monthlyType,
  707. ReportData reportData,
  708. String fileName,
  709. String senderEmail,
  710. List<String> images) {
  711. if (CollUtil.isEmpty(images)) {
  712. return reportData;
  713. }
  714. // 报告才识别尾页的印章和联系人,确认单不识别尾页
  715. if (ReportType.LETTER != reportType) {
  716. if (log.isInfoEnabled()) {
  717. log.info("报告{} 用ocr补充解析结果。补充前的结果是:{}", fileName, reportData);
  718. }
  719. OCRParseData parseRes = null;
  720. try {
  721. // 首页和尾页相等时只读首页
  722. String imageUrl = images.size() == 1 ? images.get(0) : images.get(1);
  723. parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, imageUrl);
  724. } catch (Exception e) {
  725. log.error("报告{} OCR识别印章和联系人出错:{}", fileName, e.getMessage());
  726. }
  727. // ocr识别尾页是否包含印章和联系人信息
  728. if (parseRes != null && reportData.getBaseInfo() != null) {
  729. // 协会报告才设置印章标识
  730. boolean isAmac = reportType == ReportType.ANNUALLY || reportType == ReportType.QUARTERLY
  731. || (reportType == ReportType.MONTHLY && ReportMonthlyType.AMAC == monthlyType);
  732. if (isAmac) {
  733. if (TG_EMAIL_LIST.contains(senderEmail)) {
  734. reportData.getBaseInfo().setWithSeals(true);
  735. } else {
  736. reportData.getBaseInfo().setWithSeals(parseRes.getWithSeals());
  737. if (fileName.contains("用印") && !Objects.equals(true, reportData.getBaseInfo().getWithSeals())) {
  738. reportData.getBaseInfo().setWithSeals(true);
  739. }
  740. }
  741. } else {
  742. // 管理人报告才设置联系人标识
  743. reportData.getBaseInfo().setWithContacts(parseRes.getWithContacts());
  744. }
  745. }
  746. // 首页和尾页不相等时解析首页的数据
  747. if (images.size() != 1 || parseRes == null) {
  748. try {
  749. parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0));
  750. } catch (Exception e) {
  751. log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, e.getMessage());
  752. }
  753. }
  754. // 用首页识别基金名称、产品代码和基金管理人
  755. if (reportData.getFundInfo() != null && parseRes != null) {
  756. if (StrUtil.isBlank(reportData.getFundInfo().getFundName())) {
  757. reportData.getFundInfo().setFundName(parseRes.getFundName());
  758. }
  759. if (StrUtil.isBlank(reportData.getFundInfo().getFundCode())) {
  760. reportData.getFundInfo().setFundCode(parseRes.getFundCode());
  761. }
  762. if (StrUtil.isBlank(reportData.getFundInfo().getCompanyName())
  763. || !reportData.getFundInfo().getCompanyName().contains("有限公司")) {
  764. reportData.getFundInfo().setCompanyName(parseRes.getCompanyName());
  765. }
  766. }
  767. reportData.setAiParse(true);
  768. return reportData;
  769. }
  770. // 确认单AI解析失败时重新用OCR识别
  771. if (!reportData.wasFailed()) {
  772. return reportData;
  773. }
  774. if (log.isInfoEnabled()) {
  775. log.info("确认单报告{} 用ocr补充解析结果。补充前的结果是:{}", fileName, reportData);
  776. }
  777. LetterReportData letterReportData = (LetterReportData) reportData;
  778. OCRLetterParseData parseRes = null;
  779. try {
  780. parseRes = new OCRReportParser().parseLetterData(fileName, this.ocrParserUrl, images.get(0));
  781. } catch (Exception e) {
  782. log.error("确认单报告{} OCR提取确认单关键信息出错:{}", fileName, e.getMessage());
  783. }
  784. if (parseRes == null) {
  785. return reportData;
  786. }
  787. if (letterReportData.getFundInfo() != null) {
  788. letterReportData.getFundInfo().setFundName(parseRes.getFundName());
  789. letterReportData.getFundInfo().setFundCode(parseRes.getFundCode());
  790. }
  791. // 投资者信息
  792. if (letterReportData.getInvestorInfo() == null) {
  793. letterReportData.setInvestorInfo(new ReportInvestorInfoDTO(fileId));
  794. }
  795. letterReportData.getInvestorInfo().setInvestorName(parseRes.getInvestorName());
  796. letterReportData.getInvestorInfo().setCertificateNumber(parseRes.getCertificateNumber());
  797. letterReportData.getInvestorInfo().setTradingAccount(parseRes.getTradingAccount());
  798. letterReportData.getInvestorInfo().setFundAccount(parseRes.getFundAccount());
  799. letterReportData.getInvestorInfo().setCertificateType(parseRes.getCertificateType());
  800. // 交易流水
  801. if (letterReportData.getTransaction() == null) {
  802. letterReportData.setTransaction(new ReportFundTransactionDTO(fileId));
  803. }
  804. letterReportData.getTransaction().setTransactionType(parseRes.getTransactionType());
  805. letterReportData.getTransaction().setApplyDate(parseRes.getApplyDate());
  806. letterReportData.getTransaction().setApplyShare(parseRes.getApplyShare());
  807. letterReportData.getTransaction().setApplyAmount(parseRes.getApplyAmount());
  808. letterReportData.getTransaction().setHoldingDate(parseRes.getHoldingDate());
  809. letterReportData.getTransaction().setAmount(parseRes.getAmount());
  810. letterReportData.getTransaction().setShare(parseRes.getShare());
  811. letterReportData.getTransaction().setNav(parseRes.getNav());
  812. letterReportData.setAiParse(true);
  813. return letterReportData;
  814. }
  815. /**
  816. * 当reportData==null时重新构建一个新对象
  817. *
  818. * @param fileId 文件ID
  819. * @param reportType 报告类型
  820. * @param reportData 解析结果对象
  821. * @param fileName 报告名称
  822. * @return /
  823. */
  824. private ReportData buildNvlReportData(Integer fileId,
  825. ReportType reportType,
  826. ReportData reportData,
  827. String fileName) {
  828. if (reportData != null) {
  829. return reportData;
  830. }
  831. if (reportType == null) {
  832. reportType = ReportType.OTHER;
  833. }
  834. ReportBaseInfoDTO baseInfo = new ReportBaseInfoDTO(fileId);
  835. baseInfo.setReportName(fileName);
  836. baseInfo.setReportType(reportType.name());
  837. String reportDate = ReportParseUtils.matchReportDate(reportType, fileName);
  838. baseInfo.setReportDate(ConvertUtil.toDate(reportDate));
  839. ReportFundInfoDTO fundInfo = new ReportFundInfoDTO(fileId);
  840. if (ReportType.ANNUALLY == reportType) {
  841. reportData = new AnnuallyReportData(baseInfo, fundInfo);
  842. } else if (ReportType.QUARTERLY == reportType) {
  843. reportData = new QuarterlyReportData(baseInfo, fundInfo);
  844. } else if (ReportType.MONTHLY == reportType) {
  845. reportData = new MonthlyReportData(baseInfo, fundInfo);
  846. } else if (ReportType.WEEKLY == reportType) {
  847. reportData = new WeeklyReportData(baseInfo, fundInfo);
  848. } else if (ReportType.OTHER == reportType) {
  849. reportData = new ReportData.DefaultReportData(baseInfo, fundInfo);
  850. } else if (ReportType.LETTER == reportType) {
  851. reportData = new LetterReportData(baseInfo, fundInfo);
  852. }
  853. return reportData;
  854. }
  855. /**
  856. * 保存报告解析结果
  857. *
  858. * @param reportData 报告解析结果
  859. * @param reportType 报告类型
  860. * @param fileName 报告名称
  861. */
  862. private void saveReportData(ReportData reportData, ReportType reportType, String fileName) {
  863. if (reportData == null) {
  864. return;
  865. }
  866. StopWatch writeWatch = new StopWatch();
  867. writeWatch.start();
  868. try {
  869. ReportWriter<ReportData> instance = this.reportWriterFactory.getInstance(reportType);
  870. instance.write(reportData);
  871. } catch (Exception e) {
  872. log.error("报告{} 结果保存失败 {}", fileName, ExceptionUtil.stacktraceToString(e));
  873. } finally {
  874. writeWatch.stop();
  875. if (log.isInfoEnabled()) {
  876. log.info("报告{}解析结果保存完成,耗时{}ms", fileName, writeWatch.getTotalTimeMillis());
  877. }
  878. }
  879. }
  880. private EmailFileInfoDO saveEmailFileInfo(Integer emailId, String fileName, String filePath) {
  881. EmailFileInfoDO emailFileInfoDO = buildEmailFileInfoDO(emailId, fileName, filePath);
  882. // emailFileInfoDO.setAiFileId(null);
  883. if (emailFileInfoDO.getId() != null) {
  884. this.emailFileInfoMapper.updateTimeById(emailFileInfoDO.getId(), new Date());
  885. return emailFileInfoDO;
  886. }
  887. LambdaQueryWrapper<EmailFileInfoDO> wrapper = Wrappers.lambdaQuery(EmailFileInfoDO.class)
  888. .eq(EmailFileInfoDO::getEmailId, emailId)
  889. .eq(EmailFileInfoDO::getFileName, fileName)
  890. .eq(EmailFileInfoDO::getFilePath, filePath);
  891. List<EmailFileInfoDO> tempList = this.emailFileInfoMapper.selectList(wrapper);
  892. if (CollUtil.isNotEmpty(tempList)) {
  893. return tempList.get(0);
  894. }
  895. this.emailFileInfoMapper.insertById(emailFileInfoDO);
  896. return emailFileInfoDO;
  897. }
  898. private EmailFileInfoDO buildEmailFileInfoDO(Integer emailId, String fileName, String filePath) {
  899. EmailFileInfoDO emailFileInfoDO = new EmailFileInfoDO();
  900. emailFileInfoDO.setId(null);
  901. emailFileInfoDO.setEmailId(emailId);
  902. emailFileInfoDO.setFileName(fileName);
  903. emailFileInfoDO.setFilePath(filePath);
  904. emailFileInfoDO.setIsvalid(1);
  905. emailFileInfoDO.setCreatorId(0);
  906. emailFileInfoDO.setCreateTime(new Date());
  907. emailFileInfoDO.setUpdaterId(0);
  908. emailFileInfoDO.setUpdateTime(new Date());
  909. return emailFileInfoDO;
  910. }
  911. private Integer saveEmailParseInfo(EmailParseInfoDO emailParseInfoDO) {
  912. if (emailParseInfoDO == null) {
  913. return null;
  914. }
  915. // 重新邮件功能 -> 修改解析时间和更新时间
  916. if (emailParseInfoDO.getId() != null) {
  917. this.emailParseInfoMapper.updateParseTime(emailParseInfoDO.getId(), emailParseInfoDO.getParseDate());
  918. return emailParseInfoDO.getId();
  919. }
  920. LambdaQueryWrapper<EmailParseInfoDO> wrapper = Wrappers.lambdaQuery(EmailParseInfoDO.class)
  921. .eq(EmailParseInfoDO::getEmailTitle, emailParseInfoDO.getEmailTitle())
  922. .eq(EmailParseInfoDO::getSenderEmail, emailParseInfoDO.getSenderEmail())
  923. .eq(EmailParseInfoDO::getEmailDate, emailParseInfoDO.getEmailDate())
  924. .eq(EmailParseInfoDO::getEmail, emailParseInfoDO.getEmail())
  925. .orderByDesc(EmailParseInfoDO::getId);
  926. List<EmailParseInfoDO> tempList = this.emailParseInfoMapper.selectList(wrapper);
  927. if (CollUtil.isNotEmpty(tempList)) {
  928. this.emailParseInfoMapper.update(emailParseInfoDO, wrapper);
  929. return tempList.get(0).getId();
  930. }
  931. this.emailParseInfoMapper.insertAndId(emailParseInfoDO);
  932. return emailParseInfoDO.getId();
  933. }
  934. private EmailParseInfoDO buildEmailParseInfo(String emailAddress, EmailInfoDTO emailInfo, long totalSize) {
  935. EmailParseInfoDO emailParseInfoDO = new EmailParseInfoDO();
  936. emailParseInfoDO.setId(null);
  937. emailParseInfoDO.setSenderEmail(emailInfo.getSenderEmail());
  938. emailParseInfoDO.setEmail(emailAddress);
  939. emailParseInfoDO.setEmailDate(DateUtil.parse(emailInfo.getEmailDate(), DateConst.YYYY_MM_DD_HH_MM_SS));
  940. emailParseInfoDO.setParseDate(new Date());
  941. emailParseInfoDO.setEmailTitle(emailInfo.getEmailTitle());
  942. emailParseInfoDO.setParseStatus(EmailParseStatusConst.SUCCESS);
  943. emailParseInfoDO.setAttrSize(totalSize);
  944. emailParseInfoDO.setIsvalid(1);
  945. emailParseInfoDO.setCreatorId(0);
  946. emailParseInfoDO.setCreateTime(new Date());
  947. emailParseInfoDO.setUpdaterId(0);
  948. emailParseInfoDO.setUpdateTime(new Date());
  949. return emailParseInfoDO;
  950. }
  951. /**
  952. * 读取邮件
  953. *
  954. * @param mailboxInfoDTO 邮箱配置信息
  955. * @param startDate 邮件起始日期
  956. * @param endDate 邮件截止日期(为null,将解析邮件日期小于等于startDate的当天邮件)
  957. * @return 读取到的邮件信息
  958. * @throws Exception 异常信息
  959. */
  960. private Map<String, List<EmailContentInfoDTO>> realEmail(MailboxInfoDTO mailboxInfoDTO,
  961. Date startDate, Date endDate,
  962. List<String> folderNames) throws Exception {
  963. if (CollUtil.isEmpty(folderNames)) {
  964. folderNames = ListUtil.toList("INBOX");
  965. }
  966. Store store = EmailUtil.getStoreNew(mailboxInfoDTO);
  967. if (store == null) {
  968. return MapUtil.newHashMap(4);
  969. }
  970. Map<String, List<EmailContentInfoDTO>> result = MapUtil.newHashMap(128);
  971. try {
  972. if (log.isDebugEnabled()) {
  973. Folder[] list = store.getDefaultFolder().list("*");
  974. List<String> names = Arrays.stream(list).map(Folder::getFullName).toList();
  975. log.debug("获取所有邮箱文件夹:{}", names);
  976. }
  977. for (String folderName : folderNames) {
  978. try {
  979. Map<String, List<EmailContentInfoDTO>> temp = this.getFolderEmail(mailboxInfoDTO,
  980. startDate, endDate, store, folderName);
  981. if (MapUtil.isNotEmpty(temp)) {
  982. result.putAll(temp);
  983. }
  984. } catch (Exception e) {
  985. log.warn("文件夹{} 邮件获取失败:{}", folderName, ExceptionUtil.stacktraceToString(e));
  986. }
  987. }
  988. } catch (Exception e) {
  989. log.error("邮件获取失败:{}", ExceptionUtil.stacktraceToString(e));
  990. } finally {
  991. store.close();
  992. }
  993. return result;
  994. }
  995. private Map<String, List<EmailContentInfoDTO>> getFolderEmail(MailboxInfoDTO mailboxInfoDTO,
  996. Date startDate, Date endDate,
  997. Store store, String folderName) throws MessagingException {
  998. // 默认读取收件箱的邮件
  999. Folder folder = store.getFolder(folderName);
  1000. folder.open(this.readWriteSeen ? Folder.READ_WRITE : Folder.READ_ONLY);
  1001. Message[] messages = getEmailMessage(folder, mailboxInfoDTO.getProtocol(), startDate);
  1002. if (messages == null || messages.length == 0) {
  1003. log.warn("{} 获取不到邮件 -> 邮箱信息:{},开始时间:{},结束时间:{}", folderName, mailboxInfoDTO, startDate, endDate);
  1004. return MapUtil.newHashMap();
  1005. }
  1006. String emailAddress = mailboxInfoDTO.getAccount();
  1007. Map<String, List<EmailContentInfoDTO>> emailMessageMap = MapUtil.newHashMap();
  1008. for (Message message : messages) {
  1009. long start = System.currentTimeMillis();
  1010. List<EmailContentInfoDTO> dtos = CollUtil.newArrayList();
  1011. String emailTitle = message.getSubject();
  1012. if (this.readWriteSeen && isMessageRead(message)) {
  1013. log.warn("{} 邮件{} 已读,不用重新下载解析!", folderName, emailTitle);
  1014. continue;
  1015. }
  1016. try {
  1017. Date emailDate = message.getSentDate();
  1018. String emailDateStr = DateUtil.format(emailDate, DateConst.YYYY_MM_DD_HH_MM_SS);
  1019. if (log.isInfoEnabled()) {
  1020. log.info("{} 邮件{} 数据获取中,邮件时间:{}", folderName, emailTitle, emailDateStr);
  1021. }
  1022. boolean isNotParseConditionSatisfied = emailDate == null
  1023. || (endDate != null && emailDate.compareTo(endDate) > 0)
  1024. || (startDate != null && emailDate.compareTo(startDate) < 0);
  1025. if (isNotParseConditionSatisfied) {
  1026. String st = DateUtil.formatDateTime(startDate);
  1027. String ed = DateUtil.formatDateTime(endDate);
  1028. log.warn("{} 邮件{} 发送时间{}不在区间内【{} ~ {}】", folderName, emailTitle, emailDateStr, st, ed);
  1029. continue;
  1030. }
  1031. String senderEmail = getSenderEmail(message);
  1032. Integer emailType = EmailUtil.getEmailTypeBySubject(emailTitle);
  1033. if (emailType == null) {
  1034. log.warn("{} 邮件不满足解析条件 -> 邮件主题:{},邮件日期:{}", folderName, emailTitle, emailDateStr);
  1035. continue;
  1036. }
  1037. // // 成功解析的邮件不用重复下载
  1038. // Integer okNum = this.emailParseInfoMapper.countEmailByInfoAndStatus(emailTitle, senderEmail, emailAddress, emailDateStr);
  1039. // if (okNum > 0) {
  1040. // if (log.isInfoEnabled()) {
  1041. // log.info("{} 邮件{} 已经存在解析完成的记录,不要重复下载了。", folderName, emailTitle);
  1042. // }
  1043. // continue;
  1044. // }
  1045. if (log.isInfoEnabled()) {
  1046. log.info("{} 邮件{} 基本信息获取完成,开始下载附件!邮件日期:{}", folderName, emailTitle, emailDateStr);
  1047. }
  1048. Object messageContent = message.getContent();
  1049. String[] contents = new String[]{null};
  1050. if (messageContent instanceof Multipart multipart) {
  1051. this.reMultipart(emailAddress, emailTitle, emailDate, multipart, contents, dtos);
  1052. } else {
  1053. log.warn("{} 邮件{} 获取不了附件", folderName, emailTitle);
  1054. }
  1055. if (CollUtil.isEmpty(dtos)) {
  1056. log.warn("{} 邮件{} 没有获取到附件", folderName, emailTitle);
  1057. continue;
  1058. }
  1059. dtos.forEach(e -> {
  1060. e.setEmailType(emailType);
  1061. e.setSenderEmail(senderEmail);
  1062. e.setEmailContent(contents[0]);
  1063. });
  1064. emailMessageMap.put(IdUtil.simpleUUID(), dtos);
  1065. } catch (Exception e) {
  1066. log.error("{} 邮件{} 下载报错 {}", folderName, emailTitle, ExceptionUtil.stacktraceToString(e));
  1067. } finally {
  1068. if (CollUtil.isNotEmpty(dtos) && log.isInfoEnabled()) {
  1069. log.info("{} 邮件{} 下载完成,总计耗时{} ms,文件内容如下\n {}", folderName,
  1070. emailTitle, System.currentTimeMillis() - start, dtos);
  1071. }
  1072. }
  1073. }
  1074. if (this.readWriteSeen) {
  1075. // 设置已读标志
  1076. folder.setFlags(messages, new Flags(Flags.Flag.SEEN), true);
  1077. }
  1078. folder.close(false);
  1079. return emailMessageMap;
  1080. }
  1081. private void rePart(String account, String subject, Date sendDate, Part part,
  1082. List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
  1083. String fileName = EmailUtil.decodeFileName(part);
  1084. if (StrUtil.isBlank(fileName)) {
  1085. return;
  1086. }
  1087. if (fileName.contains("\"") || fileName.contains("\n")) {
  1088. fileName = fileName.replaceAll("\"", "").replaceAll("\n", "");
  1089. }
  1090. if (fileName.contains("=?")) {
  1091. fileName = MimeUtility.decodeText(fileName);
  1092. }
  1093. String disposition = part.getDisposition();
  1094. String contentType = part.getContentType();
  1095. String[] att_files = new String[]{Constants.ARCHIVE_7Z, Constants.ARCHIVE_RAR, Constants.ARCHIVE_ZIP,
  1096. Constants.FILE_PDF, Constants.FILE_DOCX, Constants.FILE_JPG, Constants.FILE_PNG};
  1097. boolean attachmentFlag = StrUtil.endWithAny(fileName, att_files);
  1098. boolean isAttachment = attachmentFlag
  1099. || Part.ATTACHMENT.equalsIgnoreCase(disposition)
  1100. || (contentType != null && attachmentMimePrefixes.stream().anyMatch(prefix ->
  1101. StrUtil.startWithIgnoreCase(contentType, prefix)
  1102. ));
  1103. if (!isAttachment) {
  1104. log.warn("邮件{} 未检测到{}类型的附件 (fileName={}, disposition={}, contentType={})",
  1105. subject, att_files, fileName, disposition, contentType);
  1106. return;
  1107. }
  1108. File saveFile = this.generateSavePath(account, sendDate, fileName);
  1109. if (!saveFile.exists()) {
  1110. if (!saveFile.getParentFile().exists()) {
  1111. boolean mkdirs = saveFile.getParentFile().mkdirs();
  1112. if (!mkdirs) {
  1113. log.warn("file path mkdir failed.");
  1114. }
  1115. }
  1116. try (InputStream is = part.getInputStream()) {
  1117. Files.copy(is, saveFile.toPath());
  1118. }
  1119. } else {
  1120. if (log.isInfoEnabled()) {
  1121. log.info("邮件{} 已下载过附件:{},不用重新下载了。", subject, saveFile.toPath());
  1122. }
  1123. }
  1124. EmailContentInfoDTO emailContentInfoDTO = new EmailContentInfoDTO();
  1125. emailContentInfoDTO.setFileName(fileName);
  1126. emailContentInfoDTO.setFileSize(part.getSize());
  1127. emailContentInfoDTO.setFilePath(saveFile.getAbsolutePath());
  1128. emailContentInfoDTO.setEmailAddress(account);
  1129. emailContentInfoDTO.setEmailTitle(subject);
  1130. emailContentInfoDTO.setEmailDate(DateUtil.format(sendDate, DateConst.YYYY_MM_DD_HH_MM_SS));
  1131. emailContentInfoDTOList.add(emailContentInfoDTO);
  1132. }
  1133. public File generateSavePath(String account, Date sendDate, String fileName) {
  1134. String emailDateStr = DateUtil.format(sendDate, DateConst.YYYYMMDD);
  1135. String filePath = this.path + File.separator + account + File.separator +
  1136. emailDateStr + File.separator + "original" + File.separator;
  1137. // 压缩包重名时的后面的压缩包会覆盖前面压缩包的问题(不考虑普通文件)
  1138. String emailDate = DateUtil.format(sendDate, DateConst.YYYYMMDDHHMMSS24);
  1139. String realName = ArchiveUtil.isArchive(fileName) ? emailDate + fileName : fileName;
  1140. return FileUtil.file(filePath + realName);
  1141. }
  1142. private void reMultipart(String account, String subject, Date emailDate,
  1143. Multipart multipart, String[] contents,
  1144. List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
  1145. for (int i = 0; i < multipart.getCount(); i++) {
  1146. Part bodyPart = multipart.getBodyPart(i);
  1147. Object bodyPartContent = bodyPart.getContent();
  1148. if (bodyPartContent instanceof String) {
  1149. if (log.isDebugEnabled()) {
  1150. log.debug("邮件{} 获取的正文不做解析,内容是 {}", subject, bodyPartContent);
  1151. }
  1152. if (StrUtil.startWithIgnoreCase(bodyPart.getContentType(), MediaType.TEXT_PLAIN_VALUE)) {
  1153. contents[0] = bodyPartContent.toString();
  1154. }
  1155. continue;
  1156. }
  1157. if (bodyPartContent instanceof Multipart mp) {
  1158. this.reMultipart(account, subject, emailDate, mp, contents, emailContentInfoDTOList);
  1159. } else {
  1160. this.rePart(account, subject, emailDate, bodyPart, emailContentInfoDTOList);
  1161. }
  1162. }
  1163. }
  1164. private String getSenderEmail(Message message) {
  1165. Address[] senderAddress;
  1166. try {
  1167. senderAddress = message.getFrom();
  1168. if (senderAddress == null || senderAddress.length == 0) {
  1169. return null;
  1170. }
  1171. // 此时的address是含有编码(MIME编码方式)后的文本和实际的邮件地址
  1172. String address = "";
  1173. for (Address from : senderAddress) {
  1174. if (StrUtil.isNotBlank(from.toString())) {
  1175. address = from.toString();
  1176. break;
  1177. }
  1178. }
  1179. // 正则表达式匹配邮件地址
  1180. Pattern pattern = Pattern.compile("<(\\S+)>");
  1181. Matcher matcher = pattern.matcher(address);
  1182. if (matcher.find()) {
  1183. return matcher.group(1);
  1184. }
  1185. } catch (MessagingException e) {
  1186. log.error(e.getMessage(), e);
  1187. }
  1188. return null;
  1189. }
  1190. private Message[] getEmailMessage(Folder folder, String protocol, Date startDate) {
  1191. try {
  1192. if (protocol.contains("imap")) {
  1193. // 获取邮件日期大于等于startDate的邮件(搜索条件只支持按天)
  1194. SearchTerm startDateTerm = new ReceivedDateTerm(ComparisonTerm.GE, startDate);
  1195. return folder.search(startDateTerm);
  1196. } else {
  1197. return folder.getMessages();
  1198. }
  1199. } catch (MessagingException e) {
  1200. throw new RuntimeException(e);
  1201. }
  1202. }
  1203. /**
  1204. * 检查邮件是否已读
  1205. *
  1206. * @param message 邮件对象
  1207. * @return true表示已读,false表示未读
  1208. * @throws MessagingException 如果访问邮件标志时出错
  1209. */
  1210. private boolean isMessageRead(Message message) throws MessagingException {
  1211. // 获取邮件的所有标志
  1212. Flags flags = message.getFlags();
  1213. // 检查是否包含 SEEN 标志
  1214. return flags.contains(Flags.Flag.SEEN);
  1215. }
  1216. }