Ver código fonte

feat:识别文件中的印章和联系人

wangzaijun 3 semanas atrás
pai
commit
088af4336d

+ 1 - 1
mo-daq-openai/web/route.py

@@ -115,7 +115,7 @@ async def parse_image(image_url: str,
                 ]
             }
         ])
-    return completion.choices[0].message.content
+    return {"content": completion.choices[0].message.content}
 
 
 #  读取本地文件,并编码为 Base64 格式

+ 78 - 0
mo-daq/src/main/java/com/smppw/modaq/application/components/OCRReportParser.java

@@ -0,0 +1,78 @@
+package com.smppw.modaq.application.components;
+
+import cn.hutool.core.io.IORuntimeException;
+import cn.hutool.core.map.MapUtil;
+import cn.hutool.core.util.StrUtil;
+import cn.hutool.http.HttpUtil;
+import cn.hutool.json.JSONObject;
+import cn.hutool.json.JSONUtil;
+import com.smppw.modaq.common.enums.ReportParseStatus;
+import com.smppw.modaq.common.exception.ReportParseException;
+import com.smppw.modaq.domain.dto.report.OCRParseData;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Map;
+
+/**
+ * Sends one page image to the OCR HTTP service and maps the recognised fields
+ * into an {@link OCRParseData}.
+ * <p>
+ * The service response is expected to be a JSON object whose {@code content} field
+ * holds the model reply; the reply normally wraps a JSON object in a Markdown code fence.
+ */
+public class OCRReportParser {
+    private final Logger logger = LoggerFactory.getLogger(this.getClass());
+
+    /** Field template sent to the OCR service as {@code result_schema}; values are placeholders. */
+    private static final Map<String, Object> RESULT_SCHEMA_MAP = MapUtil.newHashMap(8);
+
+    static {
+        RESULT_SCHEMA_MAP.put("基金名称", "");
+        RESULT_SCHEMA_MAP.put("产品代码", "");
+        RESULT_SCHEMA_MAP.put("报告日期", "");
+        RESULT_SCHEMA_MAP.put("是否有红色印章", "");
+        RESULT_SCHEMA_MAP.put("是否有电话", "");
+        RESULT_SCHEMA_MAP.put("是否有地址", "");
+        RESULT_SCHEMA_MAP.put("是否有关注我们", "");
+    }
+
+    /**
+     * Runs OCR on a single image and extracts fund name, product code, report date,
+     * and the seal / contact flags.
+     *
+     * @param filename  report file name, used only in log messages
+     * @param ocrApi    URL of the OCR endpoint
+     * @param ocrImgUrl location of the image, passed to the service as {@code image_url}
+     * @return the recognised data; fields that were not recognised stay {@code null}
+     * @throws ReportParseException with {@code AI_NOT_FOUND} when the HTTP call fails with an
+     *                              {@link IORuntimeException}, or {@code SYSTEM_ERROR} for any other failure
+     */
+    public OCRParseData parse(String filename, String ocrApi, String ocrImgUrl) throws ReportParseException {
+        Map<String, Object> paramsMap = MapUtil.newHashMap(4);
+        paramsMap.put("image_url", ocrImgUrl);
+        paramsMap.put("result_schema", JSONUtil.toJsonStr(RESULT_SCHEMA_MAP));
+        String body = null;
+        try {
+            body = HttpUtil.get(ocrApi, paramsMap);
+            JSONObject jsonResult = JSONUtil.parseObj(body);
+            JSONObject jsonObject = JSONUtil.parseObj(extractJson(jsonResult.getStr("content")));
+            String fundName = jsonObject.getStr("基金名称");
+            String fundCode = jsonObject.getStr("产品代码");
+            String reportDate = jsonObject.getStr("报告日期");
+            String seals = jsonObject.getStr("是否有红色印章");
+            String phone = jsonObject.getStr("是否有电话");
+            String addr = jsonObject.getStr("是否有地址");
+            String withme = jsonObject.getStr("是否有关注我们");
+            OCRParseData res = new OCRParseData();
+            if (StrUtil.isNotBlank(reportDate)) {
+                res.setReportDate(reportDate);
+            }
+            if (StrUtil.isNotBlank(fundName)) {
+                res.setFundName(fundName);
+            }
+            if (StrUtil.isNotBlank(fundCode)) {
+                res.setFundCode(fundCode);
+            }
+            // The flag fields are yes/no questions: an explicit negative answer must not set the flag.
+            if (isAffirmative(seals)) {
+                res.setWithSeals(true);
+            }
+            if (isAffirmative(phone) || isAffirmative(addr) || isAffirmative(withme)) {
+                res.setWithContacts(true);
+            }
+            return res;
+        } catch (IORuntimeException e) {
+            // Log the cause before translating it, otherwise the transport failure is lost.
+            this.logger.warn("报告{} OCR服务请求失败:{}", filename, e.getMessage());
+            throw new ReportParseException(ReportParseStatus.AI_NOT_FOUND);
+        } catch (Exception e) {
+            this.logger.warn("报告{} OCR识别错误:{}", filename, e.getMessage());
+            throw new ReportParseException(ReportParseStatus.SYSTEM_ERROR);
+        } finally {
+            if (logger.isInfoEnabled()) {
+                this.logger.info("报告{} OCR识别参数{},OCR识别结果:{}", filename, paramsMap, body);
+            }
+        }
+    }
+
+    /**
+     * Cuts the JSON object out of the model reply.
+     * <p>
+     * Takes the text inside the first Markdown code fence when one is present, otherwise the
+     * whole reply, and keeps what lies between the first <code>{</code> and the last <code>}</code>.
+     *
+     * @param content raw {@code content} field of the OCR response
+     * @return a JSON object string, {@code "{}"} when the reply contains no opening brace
+     * @throws IllegalStateException when {@code content} is blank
+     */
+    private static String extractJson(String content) {
+        if (StrUtil.isBlank(content)) {
+            throw new IllegalStateException("OCR response has no content");
+        }
+        var parts = StrUtil.split(content, "```");
+        String fenced = parts.size() > 1 ? parts.get(1) : content;
+        String inner = StrUtil.subAfter(fenced, "{", false);
+        // Drop the object's own closing brace (and anything after it) before re-wrapping,
+        // so the result does not end in a doubled brace.
+        inner = StrUtil.subBefore(inner, "}", true);
+        return "{" + inner + "}";
+    }
+
+    /**
+     * Interprets the answer to a yes/no schema field.
+     *
+     * @param answer value returned for the field, may be {@code null}
+     * @return {@code false} for blank values and explicit negatives, {@code true} for anything else
+     */
+    private static boolean isAffirmative(String answer) {
+        if (StrUtil.isBlank(answer)) {
+            return false;
+        }
+        return switch (answer.trim().toLowerCase(java.util.Locale.ROOT)) {
+            case "否", "无", "没有", "不存在", "false", "no", "none", "null", "0" -> false;
+            default -> true;
+        };
+    }
+}

+ 0 - 48
mo-daq/src/main/java/com/smppw/modaq/application/components/ReportParseUtils.java

@@ -460,54 +460,6 @@ public final class ReportParseUtils {
         String text = "私募基金2024年04月度报告";
         Integer emailType = 1;
         ReportType reportType = null;
-//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
-////        System.out.println(matchReportType(3, text));
-////
-////        text = "私募基金202404月度报告";
-////        System.out.println(matchReportDate(text));
-////        System.out.println(matchReportType(3, text));
-////
-////        text = "私募基金2024_04月度报告";
-////        System.out.println(matchReportDate(text));
-////        System.out.println(matchReportType(3, text));
-////
-////        text = "私募基金2024_4月度报告";
-////        System.out.println(matchReportDate(text));
-////        System.out.println(matchReportType(3, text));
-////
-////        text = "私募基金2024-04月度报告";
-////        System.out.println(matchReportDate(text));
-////        System.out.println(matchReportType(3, text));
-////
-////        text = "私募基金2024_04月";
-////        System.out.println(matchReportDate(text));
-////        System.out.println(matchReportType(3, text));
-//
-//        text = "私募基金2024年04月12号周报";
-//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
-////        System.out.println(matchReportType(3, text));
-////        System.out.println(matchReportType(5, text));
-//
-//        text = "私募基金20240412确认函";
-//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
-////        System.out.println(matchReportType(3, text));
-//
-//        text = "私募基金2024041201预警";
-//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
-////        System.out.println(matchReportType(3, text));
-//
-//        text = "私募基金_202404";
-//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
-////        System.out.println(matchReportType(3, text));
-//
-//        text = "(水印)SSA404_月报_天演金选沪深300指数增强1号私募证券投资基金_2025年_4月.pdf";
-//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
-
-//        text = "【报告披露】稳博创新一号私募证券投资基金_2024年年报_20241231.pdf";
-//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
-//
-//        text = "大岩市场中性2号私募证券投资基金_2025年_4月_月报.pdf";
-//        System.out.println(matchReportDate(matchReportType(EmailUtil.getEmailTypeBySubject(text), text), text));
 
 //        text = "投资策略调整.pdf";
 //        emailType = EmailUtil.getEmailTypeBySubject(text);

+ 7 - 9
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/ai/AIMonthlyReportParser.java

@@ -24,20 +24,18 @@ public class AIMonthlyReportParser extends AbstractAIReportParser<MonthlyReportD
     @Override
     protected String prompt() {
         return """
-                识别报告中的基金名称、基金编码和联系人信息,基金编码的正则表达式是`S[A-Z0-9]{5}`,如果没有联系人信息则返回空字符串,
-                结果用json返回
+                识别文件中的基金名称、基金管理人、产品代码和报告日期,产品代码的正则表达式是`S[A-Z0-9]{5}`,如果无法识别就返回空字符,结果用json返回
                 """;
     }
 
     @Override
     protected MonthlyReportData parseExtInfoAndSetData(ReportBaseInfoDTO reportInfo,
                                                        ReportFundInfoDTO fundInfo) throws ReportParseException {
-        MonthlyReportData reportData = new MonthlyReportData(reportInfo, fundInfo);
-        Object contact = this.allInfoMap.get("联系人信息");
-        if (ObjUtil.isNotEmpty(contact)) {
-            // 有联系人就不要净值数据了
-            reportInfo.setWithContacts(true);
-        }
+//        Object contact = this.allInfoMap.get("联系人信息");
+//        if (ObjUtil.isNotEmpty(contact)) {
+//            // 有联系人就不要净值数据了
+//            reportInfo.setWithContacts(true);
+//        }
         // AI 就不解析净值数据了
 //        List<ReportNetReportDTO> dtos = ListUtil.list(true);
 //        for (Map.Entry<String, Object> entry : this.allInfoMap.entrySet()) {
@@ -54,7 +52,7 @@ public class AIMonthlyReportParser extends AbstractAIReportParser<MonthlyReportD
 //            dtos.add(dto);
 //        }
 //        reportData.setNetReport(dtos);
-        return reportData;
+        return new MonthlyReportData(reportInfo, fundInfo);
     }
 
     @Override

+ 7 - 7
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/ai/AIOtherReportParser.java

@@ -6,8 +6,6 @@ import com.smppw.modaq.domain.dto.report.*;
 import com.smppw.modaq.domain.mapper.EmailFieldMappingMapper;
 import org.springframework.stereotype.Component;
 
-import java.util.Objects;
-
 /**
  * 其他格式的报告(只解析报告基本信息、基金基本信息)
  */
@@ -19,7 +17,9 @@ public class AIOtherReportParser extends AbstractAIReportParser<ReportData> {
 
     @Override
     protected String prompt() {
-        return "识别文件中的基金名称、基金管理人、基金托管人、产品代码和报告日期,并且判断文件中是否存在联系人等信息,如果无法识别就返回空字符,结果用json返回";
+        return """
+                识别文件中的基金名称、基金管理人、产品代码和报告日期,产品代码的正则表达式是`S[A-Z0-9]{5}`,如果无法识别就返回空字符,结果用json返回
+                """;
     }
 
     @Override
@@ -30,10 +30,10 @@ public class AIOtherReportParser extends AbstractAIReportParser<ReportData> {
     @Override
     protected ReportData parseExtInfoAndSetData(ReportBaseInfoDTO reportInfo,
                                                 ReportFundInfoDTO fundInfo) throws ReportParseException {
-        Object contact = this.allInfoMap.get("联系人信息");
-        if (Objects.equals("存在", contact)) {
-            reportInfo.setWithContacts(true);
-        }
+//        Object contact = this.allInfoMap.get("联系人信息");
+//        if (Objects.equals("存在", contact)) {
+//            reportInfo.setWithContacts(true);
+//        }
         return new MonthlyReportData(reportInfo, fundInfo);
     }
 

+ 5 - 7
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/ai/AIWeeklyReportParser.java

@@ -1,6 +1,5 @@
 package com.smppw.modaq.application.components.report.parser.ai;
 
-import cn.hutool.core.util.ObjUtil;
 import com.smppw.modaq.application.components.report.parser.ReportParserConstant;
 import com.smppw.modaq.common.exception.ReportParseException;
 import com.smppw.modaq.domain.dto.report.ReportBaseInfoDTO;
@@ -22,8 +21,7 @@ public class AIWeeklyReportParser extends AbstractAIReportParser<WeeklyReportDat
     @Override
     protected String prompt() {
         return """
-                识别报告中的基金名称、基金编码、基金管理人、报告日期和联系人信息,基金编码的正则表达式是`S[A-Z0-9]{5}`,如果没有联系人信息则返回空字符串,
-                结果用json返回
+                识别文件中的基金名称、基金管理人、产品代码和报告日期,产品代码的正则表达式是`S[A-Z0-9]{5}`,如果无法识别就返回空字符,结果用json返回
                 """;
     }
 
@@ -35,10 +33,10 @@ public class AIWeeklyReportParser extends AbstractAIReportParser<WeeklyReportDat
     @Override
     protected WeeklyReportData parseExtInfoAndSetData(ReportBaseInfoDTO reportInfo,
                                                       ReportFundInfoDTO fundInfo) throws ReportParseException {
-        Object contact = this.allInfoMap.get("联系人信息");
-        if (ObjUtil.isNotEmpty(contact)) {
-            reportInfo.setWithContacts(true);
-        }
+//        Object contact = this.allInfoMap.get("联系人信息");
+//        if (ObjUtil.isNotEmpty(contact)) {
+//            reportInfo.setWithContacts(true);
+//        }
         return new WeeklyReportData(reportInfo, fundInfo);
     }
 

+ 4 - 2
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/ai/AbstractAIReportParser.java

@@ -1,6 +1,7 @@
 package com.smppw.modaq.application.components.report.parser.ai;
 
 import cn.hutool.core.exceptions.ExceptionUtil;
+import cn.hutool.core.io.IORuntimeException;
 import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.StrUtil;
 import cn.hutool.http.HttpUtil;
@@ -99,7 +100,6 @@ public abstract class AbstractAIReportParser<T extends ReportData> extends Abstr
         String filename = params.getFilename();
         Map<String, Object> paramsMap = MapUtil.newHashMap(4);
         paramsMap.put("filepath", params.getFilepath());
-        paramsMap.put("file_id", params.getAiFileId());
         String prompt = this.prompt();
         if (StrUtil.isNotBlank(prompt)) {
             paramsMap.put("user_msg", prompt);
@@ -117,9 +117,11 @@ public abstract class AbstractAIReportParser<T extends ReportData> extends Abstr
         } catch (ReportParseException e) {
             this.logger.warn("{} ai解析失败,解析结果{},错误原因:{}", filename, body, ExceptionUtil.stacktraceToString(e));
             throw e;
+        } catch (IORuntimeException e) {
+            throw new ReportParseException(ReportParseStatus.AI_NOT_FOUND);
         } catch (Exception e) {
             this.logger.warn("报告{} 在AI解析时报错:{}", filename, ExceptionUtil.stacktraceToString(e));
-            throw new ReportParseException(ReportParseStatus.AI_NOT_FOUND);
+            throw new ReportParseException(ReportParseStatus.SYSTEM_ERROR);
         } finally {
             if (logger.isInfoEnabled()) {
                 this.logger.info("报告{} AI解析参数{},AI解析结果:{}", filename, paramsMap, body);

+ 2 - 0
mo-daq/src/main/java/com/smppw/modaq/common/enums/ReportParseStatus.java

@@ -1,6 +1,8 @@
 package com.smppw.modaq.common.enums;
 
 public enum ReportParseStatus implements StatusCode {
+    SYSTEM_ERROR(20001, "系统异常"),
+
     AI_NOT_FOUND(20009, "AI资源找不到"),
     NO_SUPPORT_AI(20010, "报告[{}]不支持AI解析"),
     PARSE_FAIL(21000, "定期报告或交易确认单解析错误:{}"),

+ 14 - 2
mo-daq/src/main/java/com/smppw/modaq/domain/dto/EmailZipFileDTO.java

@@ -9,21 +9,32 @@ public class EmailZipFileDTO {
     private final String emailTitle;
     private final String filename;
     private final String filepath;
+    private final int fileSize;
     @Setter
     private Integer emailType;
 
-    public EmailZipFileDTO(String emailTitle, String filepath, Integer emailType) {
+    public EmailZipFileDTO(String emailTitle, String filepath, int fileSize, Integer emailType) {
         this.emailTitle = emailTitle;
         this.filepath = filepath;
         this.emailType = emailType;
+        this.fileSize = fileSize;
         this.filename = FileUtil.getName(filepath);
     }
 
-    public EmailZipFileDTO(String emailTitle, String filepath, String filename, Integer emailType) {
+    public EmailZipFileDTO(String emailTitle, String filepath, String filename, int fileSize, Integer emailType) {
         this.emailTitle = emailTitle;
         this.filepath = filepath;
         this.emailType = emailType;
         this.filename = filename;
+        this.fileSize = fileSize;
+    }
+
+    public EmailZipFileDTO(String emailTitle, EmailContentInfoDTO emailDto) {
+        this.emailTitle = emailTitle;
+        this.filepath = emailDto.getFilePath();
+        this.emailType = emailDto.getEmailType();
+        this.filename = emailDto.getFileName();
+        this.fileSize = emailDto.getFileSize();
     }
 
     @Override
@@ -33,6 +44,7 @@ public class EmailZipFileDTO {
                 ", filename='" + filename + '\'' +
                 ", filepath='" + filepath + '\'' +
                 ", emailType=" + emailType +
+                ", fileSize=" + fileSize +
                 '}';
     }
 }

+ 29 - 0
mo-daq/src/main/java/com/smppw/modaq/domain/dto/report/OCRParseData.java

@@ -0,0 +1,29 @@
+package com.smppw.modaq.domain.dto.report;
+
+import lombok.Getter;
+import lombok.Setter;
+
+@Setter
+@Getter
+public class OCRParseData {
+    /**
+     * Fund name (can only be recognised from the report's first page).
+     */
+    private String fundName;
+    /**
+     * Product code (can only be recognised from the report's first page).
+     */
+    private String fundCode;
+    /**
+     * Report date, kept as the raw recognised text.
+     */
+    private String reportDate;
+    /**
+     * Whether the report carries a seal (recognised from the report's last page).
+     */
+    private Boolean withSeals;
+    /**
+     * Whether the opinion report contains contact information (may include sensitive data such as phone numbers or addresses); recognised from the report's last page.
+     */
+    private Boolean withContacts;
+}

+ 3 - 12
mo-daq/src/main/java/com/smppw/modaq/domain/dto/report/ReportParserParams.java

@@ -1,11 +1,11 @@
 package com.smppw.modaq.domain.dto.report;
 
 import com.smppw.modaq.common.enums.ReportType;
-import lombok.*;
+import lombok.AllArgsConstructor;
+import lombok.Getter;
+import lombok.ToString;
 
 @Getter
-@Builder
-@NoArgsConstructor
 @AllArgsConstructor
 @ToString
 public class ReportParserParams {
@@ -24,16 +24,7 @@ public class ReportParserParams {
      */
     private String filepath;
     /**
-     * 备案编码
-     */
-    private String registerNumber;
-    /**
      * 报告类型
      */
     private ReportType reportType;
-
-    /**
-     * 上次传输的ai解析上传文件ID
-     */
-    private String aiFileId;
 }

+ 11 - 4
mo-daq/src/main/java/com/smppw/modaq/domain/mapper/EmailFileInfoMapper.java

@@ -36,11 +36,18 @@ public interface EmailFileInfoMapper {
                               @Param("aiParse") Boolean aiParse,
                               @Param("aiFileId") String aiFileId);
 
-    int getLetterFilenameSuccessCount(@Param("emailTitle") String emailTitle, @Param("filename") String filename);
+    int getLetterFilenameSuccessCount(@Param("emailTitle") String emailTitle,
+                                      @Param("filename") String filename);
 
-    int getAmacFilenameSuccessCount(@Param("emailTitle") String emailTitle, @Param("filename") String filename);
+    int getAmacFilenameSuccessCount(@Param("emailTitle") String emailTitle,
+                                    @Param("filename") String filename,
+                                    @Param("fileSize") int fileSize);
 
-    int getWeeklyFilenameSuccessCount(@Param("emailTitle") String emailTitle, @Param("filename") String filename);
+    int getWeeklyFilenameSuccessCount(@Param("emailTitle") String emailTitle,
+                                      @Param("filename") String filename,
+                                      @Param("fileSize") int fileSize);
 
-    int getOtherFilenameSuccessCount(@Param("emailTitle") String emailTitle, @Param("filename") String filename);
+    int getOtherFilenameSuccessCount(@Param("emailTitle") String emailTitle,
+                                     @Param("filename") String filename,
+                                     @Param("fileSize") int fileSize);
 }

+ 88 - 29
mo-daq/src/main/java/com/smppw/modaq/domain/service/EmailParseService.java

@@ -7,6 +7,7 @@ import cn.hutool.core.exceptions.ExceptionUtil;
 import cn.hutool.core.io.FileUtil;
 import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.StrUtil;
+import com.smppw.modaq.application.components.OCRReportParser;
 import com.smppw.modaq.application.components.ReportParseUtils;
 import com.smppw.modaq.application.components.report.parser.ReportParser;
 import com.smppw.modaq.application.components.report.parser.ReportParserFactory;
@@ -24,6 +25,7 @@ import com.smppw.modaq.common.exception.ReportParseException;
 import com.smppw.modaq.domain.dto.EmailContentInfoDTO;
 import com.smppw.modaq.domain.dto.EmailZipFileDTO;
 import com.smppw.modaq.domain.dto.MailboxInfoDTO;
+import com.smppw.modaq.domain.dto.report.OCRParseData;
 import com.smppw.modaq.domain.dto.report.ParseResult;
 import com.smppw.modaq.domain.dto.report.ReportData;
 import com.smppw.modaq.domain.dto.report.ReportParserParams;
@@ -32,6 +34,7 @@ import com.smppw.modaq.domain.entity.EmailParseInfoDO;
 import com.smppw.modaq.domain.mapper.EmailFileInfoMapper;
 import com.smppw.modaq.domain.mapper.EmailParseInfoMapper;
 import com.smppw.modaq.infrastructure.util.ArchiveUtil;
+import com.smppw.modaq.infrastructure.util.DateUtils;
 import com.smppw.modaq.infrastructure.util.PdfUtil;
 import jakarta.mail.*;
 import jakarta.mail.internet.MimeUtility;
@@ -87,6 +90,8 @@ public class EmailParseService {
     @Value("${email.file.path}")
     private String path;
 
+    @Value("${email.report.ocr-parser-url}")
+    private String ocrParserUrl;
 
     public EmailParseService(EmailParseInfoMapper emailParseInfoMapper,
                              EmailFileInfoMapper emailFileInfoMapper,
@@ -205,11 +210,12 @@ public class EmailParseService {
         Integer emailType = emailContentInfoDTO.getEmailType();
         String filepath = emailContentInfoDTO.getFilePath();
         String emailTitle = emailContentInfoDTO.getEmailTitle();
+        int fileSize = emailContentInfoDTO.getFileSize();
 
         if (ArchiveUtil.isZip(filepath)) {
-            handleCompressedFiles(emailTitle, filepath, ".zip", emailType, resultList);
+            handleCompressedFiles(emailTitle, filepath, ".zip", emailType, fileSize, resultList);
         } else if (ArchiveUtil.isRAR(filepath)) {
-            handleCompressedFiles(emailTitle, filepath, ".rar", emailType, resultList);
+            handleCompressedFiles(emailTitle, filepath, ".rar", emailType, fileSize, resultList);
         }
 
         // 文件中的类型判断
@@ -231,7 +237,7 @@ public class EmailParseService {
     }
 
     private void handleCompressedFiles(String emailTitle, String filepath, String extension,
-                                       Integer emailType, List<EmailZipFileDTO> resultList) throws IOException {
+                                       Integer emailType, int fileSize, List<EmailZipFileDTO> resultList) throws IOException {
         String destPath = getDestinationPath(filepath, extension);
 
         File destFile = new File(destPath);
@@ -259,13 +265,13 @@ public class EmailParseService {
                 String[] subDirs = file.list();
                 if (subDirs != null) {
                     for (String subDir : subDirs) {
-                        resultList.add(new EmailZipFileDTO(emailTitle, subDir, emailType));
+                        resultList.add(new EmailZipFileDTO(emailTitle, subDir, fileSize, emailType));
                     }
                 } else {
                     log.warn("目录 {} 下无文件", dir);
                 }
             } else {
-                resultList.add(new EmailZipFileDTO(emailTitle, dir, emailType));
+                resultList.add(new EmailZipFileDTO(emailTitle, dir, fileSize, emailType));
             }
         }
     }
@@ -292,7 +298,7 @@ public class EmailParseService {
             List<EmailZipFileDTO> dtos = ListUtil.list(false);
             List<EmailZipFileDTO> zipFiles = entry.getValue();
             if (CollUtil.isEmpty(zipFiles)) {
-                dtos.add(new EmailZipFileDTO(emailTitle, emailDto.getFilePath(), emailDto.getFileName(), emailDto.getEmailType()));
+                dtos.add(new EmailZipFileDTO(emailTitle, emailDto));
             } else {
                 dtos.addAll(zipFiles);
             }
@@ -304,25 +310,26 @@ public class EmailParseService {
                 }
             }
 
-            // 数据库已存在的数据过滤
+            // 数据库已存在的数据过滤(邮件主题+报告名称+附件大小)
             Iterator<EmailZipFileDTO> iterator = dtos.iterator();
             while (iterator.hasNext()) {
                 EmailZipFileDTO dto = iterator.next();
                 Integer emailType = dto.getEmailType();
                 String filename = dto.getFilename();
+                int fileSize = dto.getFileSize();
                 int count = 0;
                 if (Objects.equals(emailType, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) {
                     // 确认单
                     count = this.emailFileInfoMapper.getLetterFilenameSuccessCount(emailTitle, filename);
                 } else if (Objects.equals(emailType, EmailTypeConst.REPORT_EMAIL_TYPE)) {
                     // 定期报告
-                    count = this.emailFileInfoMapper.getAmacFilenameSuccessCount(emailTitle, filename);
+                    count = this.emailFileInfoMapper.getAmacFilenameSuccessCount(emailTitle, filename, fileSize);
                 } else if (Objects.equals(emailType, EmailTypeConst.REPORT_WEEKLY_TYPE)) {
                     // 管理人周报
-                    count = this.emailFileInfoMapper.getWeeklyFilenameSuccessCount(emailTitle, filename);
+                    count = this.emailFileInfoMapper.getWeeklyFilenameSuccessCount(emailTitle, filename, fileSize);
                 } else if (Objects.equals(emailType, EmailTypeConst.REPORT_OTHER_TYPE)) {
                     // 其他报告
-                    count = this.emailFileInfoMapper.getOtherFilenameSuccessCount(emailTitle, filename);
+                    count = this.emailFileInfoMapper.getOtherFilenameSuccessCount(emailTitle, filename, fileSize);
                 } else {
                     log.info("邮件{} 类型{} 不支持解析。", emailTitle, emailType);
                     iterator.remove();
@@ -385,8 +392,6 @@ public class EmailParseService {
             log.error(result.getMsg());
             return result;
         }
-        // 基金代码、备案编码
-        String registerNumber = ReportParseUtils.matchFundCode(fileName);
         // 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报
         ReportType reportType = ReportParseUtils.matchReportType(emailType, fileName);
         if (reportType == null) {
@@ -414,15 +419,18 @@ public class EmailParseService {
             return result;
         }
         Integer fileId = emailFileInfo.getId();
-        String aiFileId = emailFileInfo.getAiFileId();
 
         // 首页和尾页转为png图片,首页用来识别基金名称和基金代码、尾页用来识别印章和联系人
         List<String> images = ListUtil.empty();
         try {
             String output = FileUtil.getParent(filepath, 1) + File.separator + "image";
             images = PdfUtil.convertFirstAndLastPagesToPng(filepath, FileUtil.file(output), 300);
-            if (log.isInfoEnabled()) {
-                log.info("报告[{}] 生成的图片地址是:{}", fileName, images);
+            if (log.isDebugEnabled()) {
+                log.debug("报告[{}] 生成的图片地址是:{}", fileName, images);
+            }
+            // 首页和尾页相等
+            if (images.size() == 1) {
+                images.add(images.get(0));
             }
         } catch (Exception e) {
             log.warn("报告[{}] 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
@@ -432,26 +440,19 @@ public class EmailParseService {
         boolean notSupportFile = false;
         // 解析报告
         ReportData reportData = null;
+        ReportParserParams params = new ReportParserParams(fileId, fileName, filepath, reportType);
         StopWatch parserWatch = new StopWatch();
         parserWatch.start();
         try {
-            if (StrUtil.isBlank(aiFileId) && reportType != ReportType.OTHER && reportType != ReportType.WEEKLY) {
-                ReportParserParams params = ReportParserParams.builder().fileId(fileId).filename(fileName).filepath(filepath)
-                        .registerNumber(registerNumber).reportType(reportType).build();
+            if (reportType != ReportType.OTHER && reportType != ReportType.WEEKLY) {
                 ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, fileType);
                 reportData = instance.parse(params);
                 result.setStatus(1);
                 result.setMsg("报告解析成功");
                 result.setData(reportData);
             } else {
-                if (reportType == ReportType.OTHER || reportType == ReportType.WEEKLY) {
-                    if (log.isInfoEnabled()) {
-                        log.info("报告{} 是周报或其他类型,直接用AI解析器解析", fileName);
-                    }
-                } else {
-                    if (log.isInfoEnabled()) {
-                        log.info("报告{} 是已经存在ai解析记录,上传过文件{},直接跳转到AI解析器进行解析", fileName, aiFileId);
-                    }
+                if (log.isInfoEnabled()) {
+                    log.info("报告{} 是周报或其他类型,直接用AI解析器解析", fileName);
                 }
             }
         } catch (ReportParseException e) {
@@ -478,10 +479,9 @@ public class EmailParseService {
                         log.info("报告{} 用首页图片{} 开始AI解析......", fileName, filepath);
                     }
                 }
-                ReportParserParams params = ReportParserParams.builder().fileId(fileId).filename(fileName).filepath(filepath)
-                        .registerNumber(registerNumber).reportType(reportType).aiFileId(aiFileId).build();
-                ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, ReportParserFileType.AI);
                 try {
+                    params = new ReportParserParams(fileId, fileName, filepath, reportType);
+                    ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, ReportParserFileType.AI);
                     reportData = instance.parse(params);
                     result.setStatus(1);
                     result.setMsg("报告解析成功--AI");
@@ -499,6 +499,8 @@ public class EmailParseService {
                     log.info("报告{} AI解析结束!", fileName);
                 }
             }
+            // ocr信息提取
+            this.ocrReportData(reportData, fileName, images);
             parserWatch.stop();
             if (log.isInfoEnabled()) {
                 log.info("报告{}解析结果为{},耗时{}ms", fileName, reportData, parserWatch.getTotalTimeMillis());
@@ -510,6 +512,63 @@ public class EmailParseService {
     }
 
     /**
+     * Enriches a parsed report with OCR results: seal / contact flags from the last page,
+     * and fund name, product code or report date from the first page when they are still missing.
+     * <p>
+     * OCR failures are logged and otherwise ignored; the method does nothing unless exactly
+     * two images are supplied.
+     *
+     * @param reportData parsed report to enrich, may be {@code null}
+     * @param fileName   report file name
+     * @param images     PNG images of the report's first page (index 0) and last page (index 1)
+     */
+    private void ocrReportData(ReportData reportData, String fileName, List<String> images) {
+        if (reportData == null || CollUtil.isEmpty(images) || images.size() != 2) {
+            return;
+        }
+        // Last page: seal and contact detection.
+        OCRParseData tailRes = null;
+        try {
+            tailRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(1));
+        } catch (Exception e) {
+            log.error("报告{} OCR识别印章和联系人出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
+        }
+        if (tailRes != null && reportData.getBaseInfo() != null) {
+            Date reportDate = DateUtils.toDate(tailRes.getReportDate());
+            if (reportData.getBaseInfo().getReportDate() == null && reportDate != null) {
+                reportData.getBaseInfo().setReportDate(reportDate);
+            }
+            reportData.getBaseInfo().setWithSeals(tailRes.getWithSeals());
+            reportData.getBaseInfo().setWithContacts(tailRes.getWithContacts());
+            // A file name containing "用印" counts as sealed even if OCR did not see a seal.
+            if (fileName.contains("用印") && !Objects.equals(true, reportData.getBaseInfo().getWithSeals())) {
+                reportData.getBaseInfo().setWithSeals(true);
+            }
+        }
+        // First page: only consulted when report date, fund name or product code is still missing.
+        if ((reportData.getBaseInfo() != null && reportData.getBaseInfo().getReportDate() == null)
+                || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundName()))
+                || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundCode()))) {
+            // Use a separate variable: if this call fails, the last-page result must not be
+            // mistaken for first-page data.
+            OCRParseData firstRes = null;
+            try {
+                firstRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0));
+            } catch (Exception e) {
+                log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
+            }
+            if (firstRes == null) {
+                return;
+            }
+            if (reportData.getBaseInfo() != null) {
+                Date reportDate = DateUtils.toDate(firstRes.getReportDate());
+                // NOTE(review): this overwrites an already-set report date, unlike the last-page
+                // branch above which only fills a missing one — confirm that is intended.
+                if (reportDate != null) {
+                    reportData.getBaseInfo().setReportDate(reportDate);
+                }
+            }
+            if (reportData.getFundInfo() != null) {
+                if (StrUtil.isBlank(reportData.getFundInfo().getFundName())) {
+                    reportData.getFundInfo().setFundName(firstRes.getFundName());
+                }
+                if (StrUtil.isBlank(reportData.getFundInfo().getFundCode())) {
+                    reportData.getFundInfo().setFundCode(firstRes.getFundCode());
+                }
+            }
+        }
+    }
+
+    /**
      * 保存报告解析结果
      *
      * @param reportData 报告解析结果

+ 4 - 1
mo-daq/src/main/resources/application.yml

@@ -51,4 +51,7 @@ email:
   file:
     path: /home/wwwroot/mo_report_file
   report:
-    ai-parser-url: http://localhost:8088/upload-filepath
+    # ai解析远程地址
+    ai-parser-url: http://localhost:8088/upload-filepath
+    # ocr文字识别接口地址
+    ocr-parser-url: http://localhost:8088/parse-img

+ 3 - 3
mo-daq/src/main/resources/mapper/EmailFileInfoMapper.xml

@@ -248,7 +248,7 @@
     <select id="getAmacFilenameSuccessCount" resultType="int">
         select count(1)
         from mo_email_file_info a
-                 join mo_email_parse_info e on e.id = a.email_id and e.email_title = #{emailTitle}
+                 join mo_email_parse_info e on e.id = a.email_id and e.email_title = #{emailTitle} and e.attr_size = #{fileSize}
                  join mo_report_base_info b on b.file_id = a.id and b.report_type in ('MONTHLY', 'QUARTERLY', 'ANNUALLY')
                  join mo_report_fund_info c on a.id = c.file_id and (c.fund_name is not null or c.fund_code is not null)
         where a.file_name = #{filename} and a.isvalid = 1
@@ -257,7 +257,7 @@
     <select id="getWeeklyFilenameSuccessCount" resultType="int">
         select count(1)
         from mo_email_file_info a
-                 join mo_email_parse_info e on e.id = a.email_id and e.email_title = #{emailTitle}
+                 join mo_email_parse_info e on e.id = a.email_id and e.email_title = #{emailTitle} and e.attr_size = #{fileSize}
                  join mo_report_base_info b on b.file_id = a.id and b.report_type = 'WEEKLY'
                  join mo_report_fund_info c on a.id = c.file_id and (c.fund_name is not null or c.fund_code is not null)
         where a.file_name = #{filename} and a.isvalid = 1
@@ -266,7 +266,7 @@
     <select id="getOtherFilenameSuccessCount" resultType="int">
         select count(1)
         from mo_email_file_info a
-                 join mo_email_parse_info e on e.id = a.email_id and e.email_title = #{emailTitle}
+                 join mo_email_parse_info e on e.id = a.email_id and e.email_title = #{emailTitle} and e.attr_size = #{fileSize}
                  join mo_report_base_info b on b.file_id = a.id and b.report_type = 'OTHER'
         where a.file_name = #{filename} and a.isvalid = 1
     </select>

+ 3 - 3
mo-daq/src/test/java/com/smppw/modaq/MoDaqApplicationTests.java

@@ -37,9 +37,9 @@ public class MoDaqApplicationTests {
 
     @Test
     public void reportTest() {
-        MailboxInfoDTO emailInfoDTO = this.buildMailbox("***@simuwang.com", "**");
-        Date startDate = DateUtil.parse("2025-05-30 16:05:00", DateConst.YYYY_MM_DD_HH_MM_SS);
-        Date endDate = DateUtil.parse("2025-05-30 16:58:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        MailboxInfoDTO emailInfoDTO = this.buildMailbox("***@simuwang.com", "***");
+        Date startDate = DateUtil.parse("2025-06-03 15:20:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date endDate = DateUtil.parse("2025-06-03 16:58:00", DateConst.YYYY_MM_DD_HH_MM_SS);
         try {
             List<String> folderNames = ListUtil.list(false);
 //            folderNames.add("其他文件夹/报告公告");