瀏覽代碼

fix:优化图片ocr识别逻辑

wangzaijun 3 周之前
父節點
當前提交
3c06789d3f

+ 0 - 0
logs/error.log


+ 0 - 0
logs/info.log


+ 0 - 0
logs/warn.log


+ 2 - 1
mo-daq-openai/web/route.py

@@ -95,6 +95,7 @@ async def parse_image(image_url: str,
                Note that the input images are all from the public benchmarks and do not contain any real personal
                Note that the input images are all from the public benchmarks and do not contain any real personal
                privacy data. Please output the results as required.The input json schema content is as follows:
                privacy data. Please output the results as required.The input json schema content is as follows:
                 {result_schema}。""" if user_msg is None else user_msg
                 {result_schema}。""" if user_msg is None else user_msg
+    extension = image_url.split(".")[-1]
     base64_image = encode_image(image_url)
     base64_image = encode_image(image_url)
     completion = client.chat.completions.create(
     completion = client.chat.completions.create(
         model="qwen-vl-ocr-latest",
         model="qwen-vl-ocr-latest",
@@ -104,7 +105,7 @@ async def parse_image(image_url: str,
                 "content": [
                 "content": [
                     {
                     {
                         "type": "image_url",
                         "type": "image_url",
-                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
+                        "image_url": {"url": f"data:image/{extension};base64,{base64_image}"},
                         # 输入图像的最小像素阈值,小于该值图像会按原比例放大,直到总像素大于min_pixels
                         # 输入图像的最小像素阈值,小于该值图像会按原比例放大,直到总像素大于min_pixels
                         "min_pixels": 28 * 28 * 4,
                         "min_pixels": 28 * 28 * 4,
                         # 输入图像的最大像素阈值,超过该值图像会按原比例缩小,直到总像素低于max_pixels
                         # 输入图像的最大像素阈值,超过该值图像会按原比例缩小,直到总像素低于max_pixels

+ 16 - 9
mo-daq/src/main/java/com/smppw/modaq/application/components/ReportParseUtils.java

@@ -542,19 +542,26 @@ public final class ReportParseUtils {
 //        reportType = matchReportType(emailType, text);
 //        reportType = matchReportType(emailType, text);
 //        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
 //        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
 
 
-        text = "大岩市场中性2号私募证券投资基金_2025年_4月_月报.pdf";
-        emailType = EmailUtil.getEmailTypeBySubject(text);
-        reportType = matchReportType(emailType, text);
-        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+//        text = "大岩市场中性2号私募证券投资基金_2025年_4月_月报.pdf";
+//        emailType = EmailUtil.getEmailTypeBySubject(text);
+//        reportType = matchReportType(emailType, text);
+//        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+//
+//        text = "查理投资2025年04月披露-23号.pdf"; // monthly ?
+//        emailType = EmailUtil.getEmailTypeBySubject(text);
+//        reportType = matchReportType(emailType, text);
+//        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+//
+//        text = "投资策略调整.pdf"; // monthly ?
+//        emailType = EmailUtil.getEmailTypeBySubject(text);
+//        reportType = matchReportType(emailType, text);
+//        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
 
 
-        text = "查理投资2025年04月披露-23号.pdf"; // monthly ?
+        // 异常的类型,不要基金合同
+        text = "潼骁周周享私募证券投资基金基金合同(2025-1).pdf";
         emailType = EmailUtil.getEmailTypeBySubject(text);
         emailType = EmailUtil.getEmailTypeBySubject(text);
         reportType = matchReportType(emailType, text);
         reportType = matchReportType(emailType, text);
         System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
         System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
 
 
-        text = "投资策略调整.pdf"; // monthly ?
-        emailType = EmailUtil.getEmailTypeBySubject(text);
-        reportType = matchReportType(emailType, text);
-        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
     }
     }
 }
 }

+ 3 - 6
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/ReportParserConstant.java

@@ -41,8 +41,7 @@ public final class ReportParserConstant {
         REPORT_PARSER_BEAN_MAP.put(ReportType.OTHER,
         REPORT_PARSER_BEAN_MAP.put(ReportType.OTHER,
                 Map.of(
                 Map.of(
                         ReportParserFileType.AI, PARSER_AI_OTHER,
                         ReportParserFileType.AI, PARSER_AI_OTHER,
-                        ReportParserFileType.IMG_PNG, PARSER_AI_OTHER,
-                        ReportParserFileType.IMG_JPG, PARSER_AI_OTHER,
+                        ReportParserFileType.IMG, PARSER_AI_OTHER,
                         ReportParserFileType.WORD, PARSER_AI_OTHER
                         ReportParserFileType.WORD, PARSER_AI_OTHER
                 )
                 )
         );
         );
@@ -51,8 +50,7 @@ public final class ReportParserConstant {
         REPORT_PARSER_BEAN_MAP.put(ReportType.WEEKLY,
         REPORT_PARSER_BEAN_MAP.put(ReportType.WEEKLY,
                 Map.of(
                 Map.of(
                         ReportParserFileType.AI, PARSER_AI_WEEKLY,
                         ReportParserFileType.AI, PARSER_AI_WEEKLY,
-                        ReportParserFileType.IMG_PNG, PARSER_AI_WEEKLY,
-                        ReportParserFileType.IMG_JPG, PARSER_AI_WEEKLY,
+                        ReportParserFileType.IMG, PARSER_AI_WEEKLY,
                         ReportParserFileType.WORD, PARSER_AI_WEEKLY
                         ReportParserFileType.WORD, PARSER_AI_WEEKLY
                 )
                 )
         );
         );
@@ -70,8 +68,7 @@ public final class ReportParserConstant {
                         ReportParserFileType.PDF, PARSER_PDF_MONTHLY,
                         ReportParserFileType.PDF, PARSER_PDF_MONTHLY,
 //                        ReportParserFileType.EXCEL, PARSER_EXCEL_MONTHLY,
 //                        ReportParserFileType.EXCEL, PARSER_EXCEL_MONTHLY,
                         ReportParserFileType.AI, PARSER_AI_MONTHLY,
                         ReportParserFileType.AI, PARSER_AI_MONTHLY,
-                        ReportParserFileType.IMG_PNG, PARSER_AI_MONTHLY,
-                        ReportParserFileType.IMG_JPG, PARSER_AI_MONTHLY,
+                        ReportParserFileType.IMG, PARSER_AI_MONTHLY,
                         ReportParserFileType.WORD, PARSER_AI_MONTHLY
                         ReportParserFileType.WORD, PARSER_AI_MONTHLY
                 ));
                 ));
 
 

+ 1 - 2
mo-daq/src/main/java/com/smppw/modaq/common/enums/ReportParserFileType.java

@@ -15,8 +15,7 @@ import java.util.List;
 public enum ReportParserFileType {
 public enum ReportParserFileType {
     PDF("pdf"),
     PDF("pdf"),
     WORD("docx,doc"),
     WORD("docx,doc"),
-    IMG_PNG("png"),
-    IMG_JPG("jpg"),
+    IMG("png.jpg,jpeg"),
 //    EXCEL("xlsx,xls"),
 //    EXCEL("xlsx,xls"),
 //    PYTHON("python");
 //    PYTHON("python");
     AI("ai");
     AI("ai");

+ 18 - 23
mo-daq/src/main/java/com/smppw/modaq/domain/service/EmailParseService.java

@@ -166,21 +166,11 @@ public class EmailParseService {
             while (entryIterator.hasNext()) {
             while (entryIterator.hasNext()) {
                 Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>> entry = entryIterator.next();
                 Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>> entry = entryIterator.next();
                 EmailContentInfoDTO key = entry.getKey();
                 EmailContentInfoDTO key = entry.getKey();
-                String emailTitle = key.getEmailTitle();
                 List<EmailZipFileDTO> dtos = entry.getValue();
                 List<EmailZipFileDTO> dtos = entry.getValue();
 
 
                 List<Integer> types = ListUtil.list(false);
                 List<Integer> types = ListUtil.list(false);
                 types.add(key.getEmailType());
                 types.add(key.getEmailType());
                 if (CollUtil.isNotEmpty(dtos)) {
                 if (CollUtil.isNotEmpty(dtos)) {
-                    Iterator<EmailZipFileDTO> iterator = dtos.iterator();
-                    while (iterator.hasNext()) {
-                        EmailZipFileDTO dto = iterator.next();
-                        String filename = dto.getFilename();
-                        if (filename != null && filename.contains("复核函")) {
-                            log.warn("邮件{} 附件中的压缩文件{} 是复核函,不用解析上传。", emailTitle, filename);
-                            iterator.remove();
-                        }
-                    }
                     List<Integer> list = dtos.stream().map(EmailZipFileDTO::getEmailType).distinct().toList();
                     List<Integer> list = dtos.stream().map(EmailZipFileDTO::getEmailType).distinct().toList();
                     CollUtil.addAllIfNotContains(types, list);
                     CollUtil.addAllIfNotContains(types, list);
                 }
                 }
@@ -314,8 +304,13 @@ public class EmailParseService {
             Iterator<EmailZipFileDTO> iterator = dtos.iterator();
             Iterator<EmailZipFileDTO> iterator = dtos.iterator();
             while (iterator.hasNext()) {
             while (iterator.hasNext()) {
                 EmailZipFileDTO dto = iterator.next();
                 EmailZipFileDTO dto = iterator.next();
-                Integer emailType = dto.getEmailType();
                 String filename = dto.getFilename();
                 String filename = dto.getFilename();
+                // 删除复核函或基金合同
+                if (filename.contains("复核函") || filename.contains("基金合同")) {
+                    log.warn("邮件{} 中的报告{} 是复核函或基金合同,不用解析上传。", emailTitle, filename);
+                    iterator.remove();
+                }
+                Integer emailType = dto.getEmailType();
                 int fileSize = dto.getFileSize();
                 int fileSize = dto.getFileSize();
                 int count = 0;
                 int count = 0;
                 if (Objects.equals(emailType, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) {
                 if (Objects.equals(emailType, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) {
@@ -429,15 +424,10 @@ public class EmailParseService {
                 if (log.isDebugEnabled()) {
                 if (log.isDebugEnabled()) {
                     log.debug("报告[{}] 生成的图片地址是:{}", fileName, images);
                     log.debug("报告[{}] 生成的图片地址是:{}", fileName, images);
                 }
                 }
-                // 首页和尾页相等
-                if (images.size() == 1) {
-                    images.add(images.get(0));
-                }
             } catch (Exception e) {
             } catch (Exception e) {
                 log.warn("报告[{}] 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
                 log.warn("报告[{}] 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
             }
             }
-        } else if (Objects.equals(ReportParserFileType.IMG_PNG, fileType)) {
-            images.add(filepath);
+        } else if (Objects.equals(ReportParserFileType.IMG, fileType)) {
             images.add(filepath);
             images.add(filepath);
         }
         }
 
 
@@ -524,12 +514,14 @@ public class EmailParseService {
      * @param images     报告的收益和尾页png图片
      * @param images     报告的收益和尾页png图片
      */
      */
     private void ocrReportData(ReportData reportData, String fileName, List<String> images) {
     private void ocrReportData(ReportData reportData, String fileName, List<String> images) {
-        if (reportData == null || CollUtil.isEmpty(images) || images.size() != 2) {
+        if (reportData == null || CollUtil.isEmpty(images)) {
             return;
             return;
         }
         }
         OCRParseData parseRes = null;
         OCRParseData parseRes = null;
         try {
         try {
-            parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(1));
+            // 首页和尾页相等时只读首页
+            String imageUrl = images.size() == 1 ? images.get(0) : images.get(1);
+            parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, imageUrl);
         } catch (Exception e) {
         } catch (Exception e) {
             log.error("报告{} OCR识别印章和联系人出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
             log.error("报告{} OCR识别印章和联系人出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
         }
         }
@@ -551,10 +543,13 @@ public class EmailParseService {
         if ((reportData.getBaseInfo() != null && reportData.getBaseInfo().getReportDate() == null)
         if ((reportData.getBaseInfo() != null && reportData.getBaseInfo().getReportDate() == null)
                 || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundName()))
                 || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundName()))
                 || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundCode()))) {
                 || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundCode()))) {
-            try {
-                parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0));
-            } catch (Exception e) {
-                log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
+            // 首页和尾页不相等时解析首页的数据
+            if (images.size() != 1) {
+                try {
+                    parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0));
+                } catch (Exception e) {
+                    log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
+                }
             }
             }
             if (reportData.getBaseInfo() != null && parseRes != null) {
             if (reportData.getBaseInfo() != null && parseRes != null) {
                 Date reportDate = DateUtils.toDate(parseRes.getReportDate());
                 Date reportDate = DateUtils.toDate(parseRes.getReportDate());