Explorar el Código

fix:优化AI提示词json中不能有注释,优化一个邮件中存在多种报告类型时只要pdf格式

wangzaijun hace 3 semanas
padre
commit
352291a7b7

BIN
compressed_image.jpg


+ 2 - 2
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/ai/AIMonthlyReportParser.java

@@ -1,6 +1,5 @@
 package com.smppw.modaq.application.components.report.parser.ai;
 
-import cn.hutool.core.util.ObjUtil;
 import com.smppw.modaq.application.components.report.parser.ReportParserConstant;
 import com.smppw.modaq.common.exception.ReportParseException;
 import com.smppw.modaq.domain.dto.report.MonthlyReportData;
@@ -24,7 +23,8 @@ public class AIMonthlyReportParser extends AbstractAIReportParser<MonthlyReportD
     @Override
     protected String prompt() {
         return """
-                识别文件中的基金名称、基金管理人、产品代码和报告日期,产品代码的正则表达式是`S[A-Z0-9]{5}`,如果无法识别就返回空字符,结果用json返回
+                识别文件中的基金名称、基金管理人、产品代码和报告日期,产品代码的正则表达式是`S[A-Z0-9]{5}`;
+                如果无法识别就返回空字符,结果用json返回,json中不要注释
                 """;
     }
 

+ 2 - 1
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/ai/AIOtherReportParser.java

@@ -18,7 +18,8 @@ public class AIOtherReportParser extends AbstractAIReportParser<ReportData> {
     @Override
     protected String prompt() {
         return """
-                识别文件中的基金名称、基金管理人、产品代码和报告日期,产品代码的正则表达式是`S[A-Z0-9]{5}`,如果无法识别就返回空字符,结果用json返回
+                识别文件中的基金名称、基金管理人、产品代码和报告日期,产品代码的正则表达式是`S[A-Z0-9]{5}`;
+                如果无法识别就返回空字符,结果用json返回,json中不要注释
                 """;
     }
 

+ 1 - 1
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/ai/AIQuarterlyReportParser.java

@@ -32,7 +32,7 @@ public class AIQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
                 识别文件中的基金基本情况、投资组合情况,
                 投资组合情况包含期末基金资产组合情况、报告期末按行业分类的股票投资组合,
                 报告期末按行业分类的股票投资组合又包含报告期末按行业分类的境内股票投资组合、报告期末按行业分类的港股通投资股票投资组合,
-                结果用json返回
+                结果用json返回,json中不要注释
                 """;
     }
 

+ 2 - 1
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/ai/AIWeeklyReportParser.java

@@ -21,7 +21,8 @@ public class AIWeeklyReportParser extends AbstractAIReportParser<WeeklyReportDat
     @Override
     protected String prompt() {
         return """
-                识别文件中的基金名称、基金管理人、产品代码和报告日期,产品代码的正则表达式是`S[A-Z0-9]{5}`,如果无法识别就返回空字符,结果用json返回
+                识别文件中的基金名称、基金管理人、产品代码和报告日期,产品代码的正则表达式是`S[A-Z0-9]{5}`;
+                如果无法识别就返回空字符,结果用json返回,json中不要注释
                 """;
     }
 

+ 19 - 1
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/ai/AbstractAIReportParser.java

@@ -8,6 +8,7 @@ import cn.hutool.http.HttpUtil;
 import cn.hutool.json.JSONObject;
 import cn.hutool.json.JSONUtil;
 import com.smppw.modaq.application.components.report.parser.AbstractReportParser;
+import com.smppw.modaq.common.conts.PatternConsts;
 import com.smppw.modaq.common.enums.ReportParseStatus;
 import com.smppw.modaq.common.exception.ReportParseException;
 import com.smppw.modaq.domain.dto.report.ReportData;
@@ -96,6 +97,22 @@ public abstract class AbstractAIReportParser<T extends ReportData> extends Abstr
         this.allInfoMap = MapUtil.newHashMap(128);
     }
 
+    /**
+     * Removes line ({@code //}) and block comments from a JSON string.
+     * <p>
+     * {@link PatternConsts#JSON_COMMENT_PATTERN} matches either a double-quoted string
+     * literal (capture group 1) or a comment. String literals are copied through
+     * unchanged, comments are replaced with nothing. Afterwards the leading whitespace
+     * of every line is removed (which also drops blank lines) and the result is trimmed.
+     *
+     * @param json JSON text that may contain comments
+     * @return the same text with comments, leading indentation and blank lines removed
+     */
+    protected String removeJsonComments(String json) {
+        String withoutComments = PatternConsts.JSON_COMMENT_PATTERN.matcher(json).replaceAll(mr -> {
+            String literal = mr.group(1);
+            if (literal == null) {
+                // Not a string literal, so the match is a comment: drop it
+                return "";
+            }
+            // The returned value is interpreted as a replacement template, where '\' and '$'
+            // are special. Quote it so escapes such as \" or a literal '$' survive untouched.
+            return java.util.regex.Matcher.quoteReplacement(literal);
+        });
+        return withoutComments
+                .replaceAll("(?m)^\\s+", "")  // strip leading whitespace / blank lines
+                .trim();
+    }
+
     private void parseFileContent(ReportParserParams params) {
         String filename = params.getFilename();
         Map<String, Object> paramsMap = MapUtil.newHashMap(4);
@@ -112,7 +129,8 @@ public abstract class AbstractAIReportParser<T extends ReportData> extends Abstr
             String content = StrUtil.split(jsonResult.getStr("content"), "```").get(1);
             String aiParserContent = "{" + StrUtil.subAfter(content, "{", false) + "}";
             if (StrUtil.isNotBlank(aiParserContent)) {
-                this.handleAiResult(aiParserContent);
+                String result = this.removeJsonComments(aiParserContent);
+                this.handleAiResult(result);
             }
         } catch (ReportParseException e) {
             this.logger.warn("{} ai解析失败,解析结果{},错误原因:{}", filename, body, ExceptionUtil.stacktraceToString(e));

+ 8 - 0
mo-daq/src/main/java/com/smppw/modaq/common/conts/PatternConsts.java

@@ -36,4 +36,12 @@ public class PatternConsts {
      * 分级基金级别正则匹配
      */
     public static final Pattern FUND_LEVEL_PATTERN = Pattern.compile("[A-F]级|基金[A-F]");
+
+    /**
+     * Tokenizer used to strip comments from JSON-like text.
+     * <ul>
+     *   <li>group 1: a complete double-quoted string literal. Every backslash escape is
+     *       consumed as a pair, so an escaped quote does not end the literal and an
+     *       escaped backslash directly before the closing quote does not hide it.</li>
+     *   <li>no group: a {@code //} line comment or a block comment.</li>
+     * </ul>
+     * The string alternative comes first so that comment markers inside a string
+     * (for example the {@code //} of a URL) are never treated as comments.
+     */
+    public static final Pattern JSON_COMMENT_PATTERN = Pattern.compile(
+            "(\"(?:\\\\.|[^\"\\\\])*\")" +   // double-quoted string literal (kept by callers)
+                    "|//.*" +                    // line comment, up to the end of the line
+                    "|/\\*[\\s\\S]*?\\*/"        // block comment; a plain character class avoids deep regex recursion
+    );
 }

+ 1 - 1
mo-daq/src/main/java/com/smppw/modaq/common/enums/ReportType.java

@@ -8,7 +8,7 @@ public enum ReportType {
     OTHER(-2, "其他报告",
             new String[]{"公告", "通知", "告知函", "意见征询函", "说明函",
                     "清算报告", "邀请函", "观点", "预警", "投研报告", "公示", "回顾",
-                    "风险提示函", "说明", "合同变更", "生效函", "投资报告"}),
+                    "风险提示函", "说明", "合同变更", "生效函", "投资报告", "投资者月报"}),
 
     LETTER(-1, "交易流水确认函",
             new String[]{"确认单", "确认函", "交易确认数据",

+ 5 - 8
mo-daq/src/main/java/com/smppw/modaq/domain/dto/EmailContentInfoDTO.java

@@ -81,17 +81,16 @@ public class EmailContentInfoDTO implements Serializable {
         return Objects.equals(emailAddress, that.emailAddress)
                 && Objects.equals(emailTitle, that.emailTitle)
                 && Objects.equals(emailDate, that.emailDate)
-                && Objects.equals(fileName, that.fileName)
-                && Objects.equals(filePath, that.filePath)
-                && Objects.equals(fileSize, that.fileSize)
+//                && Objects.equals(fileName, that.fileName)
+//                && Objects.equals(filePath, that.filePath)
+//                && Objects.equals(fileSize, that.fileSize)
                 && Objects.equals(emailType, that.emailType)
                 && Objects.equals(senderEmail, that.senderEmail);
     }
 
     @Override
     public int hashCode() {
-        return Objects.hash(emailAddress, emailTitle, emailDate,
-                fileName, filePath, emailType, senderEmail, fileSize);
+        return Objects.hash(emailAddress, emailTitle, emailDate, emailType, senderEmail);
     }
 
     @Override
@@ -100,10 +99,8 @@ public class EmailContentInfoDTO implements Serializable {
                 "emailAddress='" + emailAddress + '\'' +
                 ", emailTitle='" + emailTitle + '\'' +
                 ", emailDate='" + emailDate + '\'' +
-                ", fileName='" + fileName + '\'' +
-                ", filePath='" + filePath + '\'' +
+                ", senderEmail='" + senderEmail + '\'' +
                 ", emailType=" + emailType +
-                ", fileSize=" + fileSize +
                 '}';
     }
 }

+ 5 - 0
mo-daq/src/main/java/com/smppw/modaq/domain/dto/EmailZipFileDTO.java

@@ -13,12 +13,15 @@ public class EmailZipFileDTO {
     @Setter
     private Integer emailType;
 
+    private final String extName;
+
     public EmailZipFileDTO(String emailTitle, String filepath, int fileSize, Integer emailType) {
         this.emailTitle = emailTitle;
         this.filepath = filepath;
         this.emailType = emailType;
         this.fileSize = fileSize;
         this.filename = FileUtil.getName(filepath);
+        this.extName = FileUtil.extName(filepath);
     }
 
     public EmailZipFileDTO(String emailTitle, String filepath, String filename, int fileSize, Integer emailType) {
@@ -27,6 +30,7 @@ public class EmailZipFileDTO {
         this.emailType = emailType;
         this.filename = filename;
         this.fileSize = fileSize;
+        this.extName = FileUtil.extName(filepath);
     }
 
     public EmailZipFileDTO(String emailTitle, EmailContentInfoDTO emailDto) {
@@ -35,6 +39,7 @@ public class EmailZipFileDTO {
         this.emailType = emailDto.getEmailType();
         this.filename = emailDto.getFileName();
         this.fileSize = emailDto.getFileSize();
+        this.extName = FileUtil.extName(filepath);
     }
 
     @Override

+ 28 - 6
mo-daq/src/main/java/com/smppw/modaq/domain/service/EmailParseService.java

@@ -37,7 +37,9 @@ import com.smppw.modaq.infrastructure.util.ArchiveUtil;
 import com.smppw.modaq.infrastructure.util.PdfUtil;
 import jakarta.mail.*;
 import jakarta.mail.internet.MimeUtility;
-import jakarta.mail.search.*;
+import jakarta.mail.search.ComparisonTerm;
+import jakarta.mail.search.ReceivedDateTerm;
+import jakarta.mail.search.SearchTerm;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.beans.factory.annotation.Value;
@@ -144,12 +146,13 @@ public class EmailParseService {
             for (EmailContentInfoDTO emailContentInfoDTO : emailContentInfoDTOList) {
                 // 正文不用解压附件
                 if (emailContentInfoDTO.getFileName() != null && emailContentInfoDTO.getFileName().endsWith(".html")) {
-                    emailZipFileMap.put(emailContentInfoDTO, ListUtil.empty());
+                    emailZipFileMap.put(emailContentInfoDTO, ListUtil.list(false));
                     continue;
                 }
                 try {
-                    List<EmailZipFileDTO> fundNavDTOList = this.parseZipEmail(emailContentInfoDTO);
-                    emailZipFileMap.put(emailContentInfoDTO, fundNavDTOList);
+                    List<EmailZipFileDTO> tempList = emailZipFileMap.getOrDefault(emailContentInfoDTO, ListUtil.list(false));
+                    tempList.addAll(this.parseZipEmail(emailContentInfoDTO));
+                    emailZipFileMap.put(emailContentInfoDTO, tempList);
                 } catch (IOException e) {
                     log.error("压缩包解压失败:{}", ExceptionUtil.stacktraceToString(e));
                     EmailParseInfoDO fail = buildEmailParseInfo(null, mailboxInfoDTO.getAccount(), emailContentInfoDTO);
@@ -206,6 +209,10 @@ public class EmailParseService {
             handleCompressedFiles(emailTitle, filepath, ".zip", emailType, fileSize, resultList);
         } else if (ArchiveUtil.isRAR(filepath)) {
             handleCompressedFiles(emailTitle, filepath, ".rar", emailType, fileSize, resultList);
+        } else {
+            // 不是压缩包时
+            EmailZipFileDTO dto = new EmailZipFileDTO(emailTitle, emailContentInfoDTO);
+            resultList.add(dto);
         }
 
         // 文件中的类型判断
@@ -291,6 +298,7 @@ public class EmailParseService {
                 dtos.add(new EmailZipFileDTO(emailTitle, emailDto));
             } else {
                 dtos.addAll(zipFiles);
+                zipFiles.clear();
             }
             // 重新判断类型
             for (EmailZipFileDTO dto : dtos) {
@@ -299,6 +307,11 @@ public class EmailParseService {
                     dto.setEmailType(emailType);
                 }
             }
+            // 如果压缩包里面既有pdf又有其他格式的文件,说明其他格式的文件是不需要解析的
+            List<String> exts = dtos.stream().map(EmailZipFileDTO::getExtName).distinct().toList();
+            if (exts.contains("pdf") && exts.size() > 1) {
+                dtos.removeIf(e -> !Objects.equals("pdf", e.getExtName()));
+            }
 
             // 数据库已存在的数据过滤(邮件主题+报告名称+附件大小)
             Iterator<EmailZipFileDTO> iterator = dtos.iterator();
@@ -331,13 +344,16 @@ public class EmailParseService {
                 }
                 if (count > 0) {
                     iterator.remove();
-                    log.info("邮件{} 附件{} 已存在解析成功的记录,不用重新解析。", emailTitle, filename);
+                    log.info("邮件{} 报告{} 已存在解析成功的记录,不用重新解析。", emailTitle, filename);
                 }
             }
             if (CollUtil.isEmpty(dtos)) {
                 log.info("邮件{} 所有文件都已经解析成功过,不能重复解析了", emailTitle);
                 continue;
             }
+            if (log.isInfoEnabled()) {
+                log.info("邮件{} 还有报告待解析:{}", emailTitle, dtos);
+            }
 
             Integer emailId = emailDto.getEmailId();
             EmailParseInfoDO emailParseInfoDO = this.buildEmailParseInfo(emailId, emailAddress, emailDto);
@@ -428,7 +444,12 @@ public class EmailParseService {
                 log.warn("报告[{}] 生成图片失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
             }
         } else if (Objects.equals(ReportParserFileType.IMG, fileType)) {
-            images.add(filepath);
+            try {
+                String outputFile = PdfUtil.compressAndSave(filepath);
+                images.add(outputFile);
+            } catch (IOException e) {
+                log.error("报告{} 图片压缩失败,{}", fileName, ExceptionUtil.stacktraceToString(e));
+            }
         }
 
         // 不支持解析的格式文件
@@ -895,6 +916,7 @@ public class EmailParseService {
 
     /**
      * 检查邮件是否已读
+     *
      * @param message 邮件对象
      * @return true表示已读,false表示未读
      * @throws MessagingException 如果访问邮件标志时出错

+ 152 - 8
mo-daq/src/main/java/com/smppw/modaq/infrastructure/util/PdfUtil.java

@@ -8,9 +8,12 @@ import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.rendering.PDFRenderer;
 
 import javax.imageio.ImageIO;
+import java.awt.*;
 import java.awt.image.BufferedImage;
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
 import java.util.List;
 
 public class PdfUtil {
@@ -40,17 +43,15 @@ public class PdfUtil {
             PDFRenderer renderer = new PDFRenderer(document);
 
             // 始终处理首页(页码从1开始)
-            generatedImages.add(
-                    renderPage(renderer, 0, baseName + ".png", outputDir, dpi)
-            );
+            String firstImg = renderPage(renderer, 0, baseName + ".png", outputDir, dpi);
+            generatedImages.add(compressAndSave(firstImg));
 
             // 处理尾页(当总页数 > 1 时)
             if (totalPages > 1) {
-                generatedImages.add(
-                        renderPage(renderer, totalPages - 1,
-                                baseName + "_footer.png",
-                                outputDir, dpi)
-                );
+                String lastImg = renderPage(renderer, totalPages - 1,
+                        baseName + "_footer.png",
+                        outputDir, dpi);
+                generatedImages.add(compressAndSave(lastImg));
             }
         }
 
@@ -67,4 +68,147 @@ public class PdfUtil {
         ImageIO.write(image, "PNG", outputFile);
         return outputFile.getAbsolutePath();
     }
+
+    /**
+     * Tells whether an image is larger than the allowed size on at least one side.
+     *
+     * @param image   image to inspect
+     * @param maxSize maximum number of pixels allowed per side
+     * @return {@code true} if the width or the height is greater than {@code maxSize}
+     */
+    private static boolean isResolutionExceeded(BufferedImage image, int maxSize) {
+        int longestSide = Math.max(image.getWidth(), image.getHeight());
+        return longestSide > maxSize;
+    }
+
+//    /**
+//     * 压缩图片并转换为Base64
+//     *
+//     * @param inputFile 输入图片文件
+//     * @param maxSize   最大允许尺寸(单边像素数)
+//     * @param quality   压缩质量 (0.0-1.0)
+//     * @param format    输出格式 ("jpg", "png"等)
+//     * @return Base64编码的图片数据
+//     */
+//    public static String compressAndConvertToBase64(File inputFile, int maxSize, float quality, String format)
+//            throws IOException {
+//
+//        // 读取原始图片
+//        BufferedImage originalImage = ImageIO.read(inputFile);
+//
+//        // 检查分辨率是否超出限制
+//        if (!isResolutionExceeded(originalImage, maxSize)) {
+//            System.out.println("图片尺寸符合要求,无需压缩");
+//        }
+//
+//        // 计算新尺寸(保持宽高比)
+//        int originalWidth = originalImage.getWidth();
+//        int originalHeight = originalImage.getHeight();
+//        double ratio = (double) maxSize / Math.max(originalWidth, originalHeight);
+//        int newWidth = (int) (originalWidth * ratio);
+//        int newHeight = (int) (originalHeight * ratio);
+//
+//        // 创建缩放后的图片
+//        Image scaledImage = originalImage.getScaledInstance(newWidth, newHeight, Image.SCALE_SMOOTH);
+//        BufferedImage outputImage = new BufferedImage(newWidth, newHeight, BufferedImage.TYPE_INT_RGB);
+//
+//        // 绘制缩放后的图片
+//        Graphics2D g2d = outputImage.createGraphics();
+//        g2d.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BILINEAR);
+//        g2d.drawImage(scaledImage, 0, 0, null);
+//        g2d.dispose();
+//
+//        // 转换为Base64
+//        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+//        ImageIO.write(outputImage, format, baos);
+//        byte[] imageBytes = baos.toByteArray();
+//
+//        return Base64.getEncoder().encodeToString(imageBytes);
+//    }
+
+    public static String compressAndSave(String inputFile) throws IOException {
+        return compressAndSave(FileUtil.file(inputFile));
+    }
+
+    /**
+     * Shrinks the image in place when a side exceeds 8192 pixels, writing it back in the
+     * format named by the file's own extension.
+     *
+     * @param inputFile image to process
+     * @return absolute path of the resulting image file
+     * @throws IOException if reading or writing the image fails
+     */
+    public static String compressAndSave(File inputFile) throws IOException {
+        String extension = FileUtil.extName(inputFile);
+        // A null output file makes the four-argument overload work on the input file itself
+        return compressAndSave(inputFile, null, 8192, extension);
+    }
+
+    /**
+     * Downscales an image so that neither side exceeds {@code maxSize} pixels and saves it.
+     * <p>
+     * If the image already fits, nothing is re-encoded: the input is copied to
+     * {@code outputFile} when one is given, otherwise the input path is returned as is.
+     * If it does not fit, the image is scaled with its aspect ratio preserved and written
+     * to {@code outputFile}, or over {@code inputFile} when {@code outputFile} is {@code null}.
+     * The scaled image is drawn onto an RGB canvas, so an alpha channel is not kept.
+     *
+     * @param inputFile  image to read
+     * @param outputFile destination file, or {@code null} to overwrite the input
+     * @param maxSize    maximum allowed width and height in pixels
+     * @param format     ImageIO format name used when writing (e.g. "png", "jpg")
+     * @return absolute path of the file holding the resulting image
+     * @throws IOException if the input cannot be decoded as an image, if no ImageIO writer
+     *                     accepts {@code format}, or if reading/writing fails
+     */
+    public static String compressAndSave(File inputFile, File outputFile, int maxSize, String format)
+            throws IOException {
+
+        BufferedImage originalImage = ImageIO.read(inputFile);
+        if (originalImage == null) {
+            // ImageIO.read returns null instead of throwing when no reader understands the file
+            throw new IOException("Unsupported or corrupt image file: " + inputFile);
+        }
+
+        if (!isResolutionExceeded(originalImage, maxSize)) {
+            if (outputFile != null) {
+                // Small enough already: plain copy, no re-encoding
+                Files.copy(inputFile.toPath(), outputFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
+                return outputFile.getAbsolutePath();
+            }
+            return inputFile.getAbsolutePath();
+        }
+        if (outputFile == null) {
+            outputFile = inputFile;
+        }
+
+        int originalWidth = originalImage.getWidth();
+        int originalHeight = originalImage.getHeight();
+        double ratio = (double) maxSize / Math.max(originalWidth, originalHeight);
+        // Clamp to 1 px: truncation can yield 0 for extreme aspect ratios, which BufferedImage rejects
+        int newWidth = Math.max(1, (int) (originalWidth * ratio));
+        int newHeight = Math.max(1, (int) (originalHeight * ratio));
+
+        Image scaledImage = originalImage.getScaledInstance(newWidth, newHeight, Image.SCALE_SMOOTH);
+        BufferedImage outputImage = new BufferedImage(newWidth, newHeight, BufferedImage.TYPE_INT_RGB);
+
+        Graphics2D g2d = outputImage.createGraphics();
+        try {
+            g2d.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BILINEAR);
+            g2d.drawImage(scaledImage, 0, 0, null);
+        } finally {
+            g2d.dispose();
+        }
+
+        // ImageIO.write reports a missing writer through its return value, not an exception
+        if (!ImageIO.write(outputImage, format, outputFile)) {
+            throw new IOException("No ImageIO writer available for format: " + format);
+        }
+        return outputFile.getAbsolutePath();
+    }
+
+    /**
+     * Manual check for {@link #compressAndSave(File, File, int, String)}: shrinks one image
+     * when a side exceeds 8192 pixels and prints its size before and after.
+     *
+     * @param args optional; {@code args[0]} is the image to read and {@code args[1]} the file
+     *             to write. Without arguments the original developer-local sample path is
+     *             used; without {@code args[1]} the input file is overwritten in place.
+     */
+    public static void main(String[] args) {
+        // Kept as the default so that running without arguments behaves as before
+        String samplePath = "D:\\home\\wwwroot\\mo_report_file\\wangzaijun@simuwang.com\\20250605\\image\\泓湖泓福积极配置2期私募证券投资基金-周报-20250530.png";
+        File inputFile = new File(args.length > 0 ? args[0] : samplePath);
+        File outputFile = args.length > 1 ? new File(args[1]) : inputFile;
+        int maxSize = 8192; // OpenAI limit
+
+        try {
+            // 1. Check whether the image exceeds the limit
+            BufferedImage image = ImageIO.read(inputFile);
+            if (image == null) {
+                // ImageIO.read returns null when the file is not a decodable image
+                throw new IOException("Unsupported or corrupt image file: " + inputFile);
+            }
+            if (!isResolutionExceeded(image, maxSize)) {
+                System.out.println("图片尺寸符合要求");
+                return;
+            }
+            System.out.println("图片超出尺寸限制,需要压缩");
+            System.out.println("原始尺寸: " + image.getWidth() + "x" + image.getHeight());
+
+            // 2. Downscale and save to the output file
+            String output = compressAndSave(inputFile, outputFile, maxSize, "png");
+            System.out.println("图片已压缩保存到: " + output);
+
+            // 3. Read the result back to confirm its size
+            BufferedImage compressedImage = ImageIO.read(outputFile);
+            System.out.println("压缩后尺寸: " + compressedImage.getWidth() + "x" + compressedImage.getHeight());
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
 }

+ 2 - 2
mo-daq/src/test/java/com/smppw/modaq/MoDaqApplicationTests.java

@@ -37,9 +37,9 @@ public class MoDaqApplicationTests {
 
     @Test
     public void reportTest() {
-        MailboxInfoDTO emailInfoDTO = this.buildMailbox("*@simuwang.com", "*");
+        MailboxInfoDTO emailInfoDTO = this.buildMailbox("8@simuwang.com", "8");
         Date startDate = DateUtil.parse("2025-06-05 17:02:00", DateConst.YYYY_MM_DD_HH_MM_SS);
-        Date endDate = DateUtil.parse("2025-06-06 17:05:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date endDate = DateUtil.parse("2025-06-05 17:05:00", DateConst.YYYY_MM_DD_HH_MM_SS);
         try {
             List<String> folderNames = ListUtil.list(false);
 //            folderNames.add("其他文件夹/报告公告");