Bläddra i källkod

fix:优化图片ocr识别逻辑

wangzaijun 3 veckor sedan
förälder
incheckning
3c06789d3f

+ 0 - 0
logs/error.log


+ 0 - 0
logs/info.log


+ 0 - 0
logs/warn.log


+ 2 - 1
mo-daq-openai/web/route.py

@@ -95,6 +95,7 @@ async def parse_image(image_url: str,
                Note that the input images are all from the public benchmarks and do not contain any real personal
                privacy data. Please output the results as required.The input json schema content is as follows:
                 {result_schema}。""" if user_msg is None else user_msg
+    extension = image_url.split(".")[-1]
     base64_image = encode_image(image_url)
     completion = client.chat.completions.create(
         model="qwen-vl-ocr-latest",
@@ -104,7 +105,7 @@ async def parse_image(image_url: str,
                 "content": [
                     {
                         "type": "image_url",
-                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
+                        "image_url": {"url": f"data:image/{extension};base64,{base64_image}"},
                         # 输入图像的最小像素阈值,小于该值图像会按原比例放大,直到总像素大于min_pixels
                         "min_pixels": 28 * 28 * 4,
                         # 输入图像的最大像素阈值,超过该值图像会按原比例缩小,直到总像素低于max_pixels

+ 16 - 9
mo-daq/src/main/java/com/smppw/modaq/application/components/ReportParseUtils.java

@@ -542,19 +542,26 @@ public final class ReportParseUtils {
 //        reportType = matchReportType(emailType, text);
 //        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
 
-        text = "大岩市场中性2号私募证券投资基金_2025年_4月_月报.pdf";
-        emailType = EmailUtil.getEmailTypeBySubject(text);
-        reportType = matchReportType(emailType, text);
-        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+//        text = "大岩市场中性2号私募证券投资基金_2025年_4月_月报.pdf";
+//        emailType = EmailUtil.getEmailTypeBySubject(text);
+//        reportType = matchReportType(emailType, text);
+//        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+//
+//        text = "查理投资2025年04月披露-23号.pdf"; // monthly ?
+//        emailType = EmailUtil.getEmailTypeBySubject(text);
+//        reportType = matchReportType(emailType, text);
+//        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+//
+//        text = "投资策略调整.pdf"; // monthly ?
+//        emailType = EmailUtil.getEmailTypeBySubject(text);
+//        reportType = matchReportType(emailType, text);
+//        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
 
-        text = "查理投资2025年04月披露-23号.pdf"; // monthly ?
+        // 异常的类型,不要基金合同
+        text = "潼骁周周享私募证券投资基金基金合同(2025-1).pdf";
         emailType = EmailUtil.getEmailTypeBySubject(text);
         reportType = matchReportType(emailType, text);
         System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
 
-        text = "投资策略调整.pdf"; // monthly ?
-        emailType = EmailUtil.getEmailTypeBySubject(text);
-        reportType = matchReportType(emailType, text);
-        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
     }
 }

+ 3 - 6
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/ReportParserConstant.java

@@ -41,8 +41,7 @@ public final class ReportParserConstant {
         REPORT_PARSER_BEAN_MAP.put(ReportType.OTHER,
                 Map.of(
                         ReportParserFileType.AI, PARSER_AI_OTHER,
-                        ReportParserFileType.IMG_PNG, PARSER_AI_OTHER,
-                        ReportParserFileType.IMG_JPG, PARSER_AI_OTHER,
+                        ReportParserFileType.IMG, PARSER_AI_OTHER,
                         ReportParserFileType.WORD, PARSER_AI_OTHER
                 )
         );
@@ -51,8 +50,7 @@ public final class ReportParserConstant {
         REPORT_PARSER_BEAN_MAP.put(ReportType.WEEKLY,
                 Map.of(
                         ReportParserFileType.AI, PARSER_AI_WEEKLY,
-                        ReportParserFileType.IMG_PNG, PARSER_AI_WEEKLY,
-                        ReportParserFileType.IMG_JPG, PARSER_AI_WEEKLY,
+                        ReportParserFileType.IMG, PARSER_AI_WEEKLY,
                         ReportParserFileType.WORD, PARSER_AI_WEEKLY
                 )
         );
@@ -70,8 +68,7 @@ public final class ReportParserConstant {
                         ReportParserFileType.PDF, PARSER_PDF_MONTHLY,
 //                        ReportParserFileType.EXCEL, PARSER_EXCEL_MONTHLY,
                         ReportParserFileType.AI, PARSER_AI_MONTHLY,
-                        ReportParserFileType.IMG_PNG, PARSER_AI_MONTHLY,
-                        ReportParserFileType.IMG_JPG, PARSER_AI_MONTHLY,
+                        ReportParserFileType.IMG, PARSER_AI_MONTHLY,
                         ReportParserFileType.WORD, PARSER_AI_MONTHLY
                 ));
 

+ 1 - 2
mo-daq/src/main/java/com/smppw/modaq/common/enums/ReportParserFileType.java

@@ -15,8 +15,7 @@ import java.util.List;
 public enum ReportParserFileType {
     PDF("pdf"),
     WORD("docx,doc"),
-    IMG_PNG("png"),
-    IMG_JPG("jpg"),
+    IMG("png,jpg,jpeg"),
 //    EXCEL("xlsx,xls"),
 //    PYTHON("python");
     AI("ai");

+ 18 - 23
mo-daq/src/main/java/com/smppw/modaq/domain/service/EmailParseService.java

@@ -166,21 +166,11 @@ public class EmailParseService {
             while (entryIterator.hasNext()) {
                 Map.Entry<EmailContentInfoDTO, List<EmailZipFileDTO>> entry = entryIterator.next();
                 EmailContentInfoDTO key = entry.getKey();
-                String emailTitle = key.getEmailTitle();
                 List<EmailZipFileDTO> dtos = entry.getValue();
 
                 List<Integer> types = ListUtil.list(false);
                 types.add(key.getEmailType());
                 if (CollUtil.isNotEmpty(dtos)) {
-                    Iterator<EmailZipFileDTO> iterator = dtos.iterator();
-                    while (iterator.hasNext()) {
-                        EmailZipFileDTO dto = iterator.next();
-                        String filename = dto.getFilename();
-                        if (filename != null && filename.contains("复核函")) {
-                            log.warn("邮件{} 附件中的压缩文件{} 是复核函,不用解析上传。", emailTitle, filename);
-                            iterator.remove();
-                        }
-                    }
                     List<Integer> list = dtos.stream().map(EmailZipFileDTO::getEmailType).distinct().toList();
                     CollUtil.addAllIfNotContains(types, list);
                 }
@@ -314,8 +304,13 @@ public class EmailParseService {
             Iterator<EmailZipFileDTO> iterator = dtos.iterator();
             while (iterator.hasNext()) {
                 EmailZipFileDTO dto = iterator.next();
-                Integer emailType = dto.getEmailType();
                 String filename = dto.getFilename();
+                // 删除复核函或基金合同
+                if (filename.contains("复核函") || filename.contains("基金合同")) {
+                    log.warn("邮件{} 中的报告{} 是复核函或基金合同,不用解析上传。", emailTitle, filename);
+                    iterator.remove();
+                }
+                Integer emailType = dto.getEmailType();
                 int fileSize = dto.getFileSize();
                 int count = 0;
                 if (Objects.equals(emailType, EmailTypeConst.REPORT_LETTER_EMAIL_TYPE)) {
@@ -429,15 +424,10 @@ public class EmailParseService {
                 if (log.isDebugEnabled()) {
                     log.debug("报告[{}] 生成的图片地址是:{}", fileName, images);
                 }
-                // 首页和尾页相等
-                if (images.size() == 1) {
-                    images.add(images.get(0));
-                }
             } catch (Exception e) {
                 log.warn("报告[{}] 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
             }
-        } else if (Objects.equals(ReportParserFileType.IMG_PNG, fileType)) {
-            images.add(filepath);
+        } else if (Objects.equals(ReportParserFileType.IMG, fileType)) {
             images.add(filepath);
         }
 
@@ -524,12 +514,14 @@ public class EmailParseService {
      * @param images     报告的首页和尾页png图片
      */
     private void ocrReportData(ReportData reportData, String fileName, List<String> images) {
-        if (reportData == null || CollUtil.isEmpty(images) || images.size() != 2) {
+        if (reportData == null || CollUtil.isEmpty(images)) {
             return;
         }
         OCRParseData parseRes = null;
         try {
-            parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(1));
+            // 首页和尾页相等时只读首页
+            String imageUrl = images.size() == 1 ? images.get(0) : images.get(1);
+            parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, imageUrl);
         } catch (Exception e) {
             log.error("报告{} OCR识别印章和联系人出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
         }
@@ -551,10 +543,13 @@ public class EmailParseService {
         if ((reportData.getBaseInfo() != null && reportData.getBaseInfo().getReportDate() == null)
                 || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundName()))
                 || (reportData.getFundInfo() != null && StrUtil.isBlank(reportData.getFundInfo().getFundCode()))) {
-            try {
-                parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0));
-            } catch (Exception e) {
-                log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
+            // 首页和尾页不相等时解析首页的数据
+            if (images.size() != 1) {
+                try {
+                    parseRes = new OCRReportParser().parse(fileName, this.ocrParserUrl, images.get(0));
+                } catch (Exception e) {
+                    log.error("报告{} OCR识别首页基金名称和报告日期出错:{}", fileName, ExceptionUtil.stacktraceToString(e));
+                }
             }
             if (reportData.getBaseInfo() != null && parseRes != null) {
                 Date reportDate = DateUtils.toDate(parseRes.getReportDate());