Browse Source

fix:修复报告类型被错误识别问题

wangzaijun 2 weeks ago
parent
commit
3d109b3a41

+ 17 - 0
mo-daq/pom.xml

@@ -146,6 +146,23 @@
             </exclusions>
         </dependency>
 
+        <!-- docx 转 pdf依赖 -->
+        <dependency>
+            <groupId>org.docx4j</groupId>
+            <artifactId>docx4j-core</artifactId>
+            <version>11.5.3</version> <!-- 检查最新版本 -->
+        </dependency>
+        <dependency>
+            <groupId>org.docx4j</groupId>
+            <artifactId>docx4j-export-fo</artifactId>
+            <version>11.5.3</version>
+        </dependency>
+        <dependency>
+            <groupId>org.docx4j</groupId>
+            <artifactId>docx4j-JAXB-ReferenceImpl</artifactId>
+            <version>11.5.3</version>
+        </dependency>
+
 <!--        &lt;!&ndash; 通义千问 ai &ndash;&gt;-->
 <!--        <dependency>-->
 <!--            <groupId>com.alibaba</groupId>-->

+ 1 - 1
mo-daq/src/main/java/com/smppw/modaq/application/components/OCRReportParser.java

@@ -24,7 +24,7 @@ public class OCRReportParser {
         Map<String, Object> paramsMap = MapUtil.newHashMap(4);
         paramsMap.put("image_url", ocrImgUrl);
         paramsMap.put("user_msg", """
-                请帮我判断报告的类型,判断依据是:如果有业绩曲线则为管理人版,如果有基金概况和净值月报则为协会版,都不满足是返回null。
+                请帮我判断报告的类型,判断依据是:如果有基金概况和净值月报则为协会版,如果有业绩曲线或者基金概况和净值月报都没有则为管理人版,都不满足是返回null。
                 返回数据格式以json方式输出,格式为:{"报告类型":""}
                 """);
         ReportMonthlyType res = ReportMonthlyType.FAILED;

+ 16 - 19
mo-daq/src/main/java/com/smppw/modaq/application/components/ReportParseUtils.java

@@ -184,7 +184,10 @@ public final class ReportParseUtils {
      * @param mvAndRemark 市值或备注
      * @param dtos        结果数据
      */
-    public static void buildAssetAllocation(Integer fileId, String detail, String mvAndRemark, List<ReportAssetAllocationDTO> dtos) {
+    public static void buildAssetAllocation(Integer fileId,
+                                            String detail,
+                                            String mvAndRemark,
+                                            List<ReportAssetAllocationDTO> dtos) {
         if (StrUtil.isBlank(mvAndRemark)) {
             return;
         }
@@ -426,27 +429,16 @@ public final class ReportParseUtils {
             return reportType;
         }
         // 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报
-        Matcher monthMatcher = PatternConsts.MONTHLY_PATTERN.matcher(text);
-        Matcher dayMatcher = PatternConsts.DAY_PATTERN.matcher(text);
         if (StrUtil.containsAny(text, ReportType.QUARTERLY.getPatterns())) {
             reportType = ReportType.QUARTERLY;
         } else if (StrUtil.containsAny(text, ReportType.ANNUALLY.getPatterns())) {
             reportType = ReportType.ANNUALLY;
         } else if (StrUtil.containsAny(text, ReportType.MONTHLY.getPatterns())) {
             reportType = ReportType.MONTHLY;
-        } else if (monthMatcher.find() && !dayMatcher.find()) {
+        } else if (ReportParseUtils.containsAny(text, ReportParseUtils.MANAGER_KEYWORDS) || text.contains("定期报告")) {
             // 特殊的月报(当季度->年度->月度报告无法识别时)
             reportType = ReportType.MONTHLY;
         }
-
-        // 特殊月报
-        if (ReportParseUtils.containsAny(text, MANAGER_KEYWORDS) || text.contains("定期报告")) {
-            reportType = ReportType.MONTHLY;
-        }
-        // 其他报告
-        if (text.contains("报告")) {
-            reportType = ReportType.OTHER;
-        }
         return reportType;
     }
 
@@ -592,12 +584,17 @@ public final class ReportParseUtils {
         emailType = EmailUtil.getEmailTypeBySubject(text);
         reportType = matchReportType(emailType, text);
         System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
-//
-//        text = "第一创业2025年合同变更公告.png";
-//        emailType = EmailUtil.getEmailTypeBySubject(text);
-//        reportType = matchReportType(emailType, text);
-//        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
-//
+
+        text = "第一创业2025年合同变更公告.png";
+        emailType = EmailUtil.getEmailTypeBySubject(text);
+        reportType = matchReportType(emailType, text);
+        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+
+        text = "SGH866_致衍梧桐树2号私募证券投资基金_2025年04月份月度报告.pdf";
+        emailType = EmailUtil.getEmailTypeBySubject(text);
+        reportType = matchReportType(emailType, text);
+        System.out.println(emailType + ",reportType=" + reportType + ",reportDate=" + matchReportDate(reportType, text));
+
         String date = "2025年6月6日";
         String input = ReportParseUtils.cleaningValue(date, false);
         Date date1 = DateUtils.toDate(input);

+ 16 - 5
mo-daq/src/main/java/com/smppw/modaq/domain/service/EmailParseService.java

@@ -176,9 +176,10 @@ public class EmailParseService {
                 }
                 Integer type = EmailUtil.getEmailTypeBySubject(emailTitle + emailFile.getFilename());
                 // 特殊月报
-                if (Objects.equals(EmailTypeConst.NAV_EMAIL_TYPE, type)
+                if ((Objects.equals(EmailTypeConst.NAV_EMAIL_TYPE, type)
+                          || Objects.equals(EmailTypeConst.REPORT_OTHER_TYPE, type))
                         && (ReportParseUtils.containsAny(emailTitle, ReportParseUtils.MANAGER_KEYWORDS)
-                        || emailTitle.contains("定期报告"))) {
+                          || emailTitle.contains("定期报告"))) {
                     type = EmailTypeConst.REPORT_EMAIL_TYPE;
                 }
                 // 其他报告
@@ -518,6 +519,16 @@ public class EmailParseService {
             return new ParseResult<>(ReportParseStatus.NOT_A_REPORT, null, fileName);
         }
 
+        // docx转pdf
+        if (Objects.equals(ReportParserFileType.WORD, fileType)) {
+            try {
+                String outputFile = FileUtil.getParent(filepath, 1) + File.separator + FileUtil.mainName(fileName) + ".pdf";
+                PdfUtil.convertDocxToPdf(filepath, outputFile);
+                filepath = outputFile;
+            } catch (Exception e) {
+                log.warn("报告{} 转换为pdf失败:{}", fileName, ExceptionUtil.stacktraceToString(e));
+            }
+        }
         // 首页和尾页转为png图片,首页用来识别基金名称和基金代码、尾页用来识别印章和联系人
         List<String> images = ListUtil.list(true);
         if (Objects.equals(ReportParserFileType.PDF, fileType)) {
@@ -625,9 +636,9 @@ public class EmailParseService {
         if (ReportParseUtils.containsAny(fileName, ReportParseUtils.MANAGER_KEYWORDS)) {
             return ReportMonthlyType.MANAGER;
         }
-        if (StrUtil.isNotBlank(ReportParseUtils.matchFundCode(fileName))) {
-            return ReportMonthlyType.AMAC;
-        }
+//        if (StrUtil.isNotBlank(ReportParseUtils.matchFundCode(fileName))) {
+//            return ReportMonthlyType.AMAC;
+//        }
         // 2. 根据文件路径判断
         List<String> pathSegments = StrUtil.split(filepath, File.separator);
         for (String segment : pathSegments) {

+ 44 - 2
mo-daq/src/main/java/com/smppw/modaq/infrastructure/util/PdfUtil.java

@@ -6,17 +6,59 @@ import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.rendering.PDFRenderer;
+import org.docx4j.Docx4J;
+import org.docx4j.openpackaging.packages.OpcPackage;
+import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
 
 import javax.imageio.ImageIO;
 import java.awt.*;
 import java.awt.image.BufferedImage;
-import java.io.File;
-import java.io.IOException;
+import java.io.*;
 import java.nio.file.Files;
 import java.nio.file.StandardCopyOption;
 import java.util.List;
 
 public class PdfUtil {
+
+    /**
+     * Runs cheap sanity checks on a file before it is handed to docx4j.
+     * <p>
+     * Checks, in order: the file exists, is readable and non-empty; it starts with the
+     * ZIP signature bytes {@code 'P','K'}; it can be opened as a ZIP archive; and the
+     * archive contains a {@code [Content_Types].xml} entry. Nothing here verifies that
+     * the package is a Word document specifically.
+     *
+     * @param path path of the file to check
+     * @throws FileNotFoundException if the file does not exist
+     * @throws IOException           if any other check fails; when the archive cannot be
+     *                               opened, the underlying exception is kept as the cause
+     */
+    public static void validateDocx(String path) throws Exception {
+        File file = new File(path);
+
+        // Basic checks: existence, readability, non-empty.
+        if (!file.exists()) {
+            throw new FileNotFoundException("文件不存在");
+        }
+        if (!file.canRead()) {
+            throw new IOException("无读取权限");
+        }
+        if (file.length() == 0) {
+            throw new IOException("文件为空");
+        }
+
+        // File signature check.
+        try (InputStream is = new FileInputStream(file)) {
+            byte[] header = new byte[4];
+            // readNBytes keeps reading until 4 bytes or EOF; a single read() may return fewer.
+            if (is.readNBytes(header, 0, header.length) < header.length) {
+                throw new IOException("文件过小");
+            }
+            if (header[0] != 0x50 || header[1] != 0x4B) { // "PK"
+                throw new IOException("不是ZIP格式文件");
+            }
+        }
+
+        // Try to open as ZIP. Only failures to open/close the archive are reported as
+        // "invalid ZIP"; the missing-entry error is raised outside the try so it is not
+        // re-wrapped under that message.
+        boolean hasContentTypes;
+        try (java.util.zip.ZipFile zip = new java.util.zip.ZipFile(file)) {
+            hasContentTypes = zip.getEntry("[Content_Types].xml") != null;
+        } catch (IOException e) {
+            throw new IOException("无效的ZIP格式: " + e.getMessage(), e);
+        }
+        if (!hasContentTypes) {
+            throw new IOException("缺少[Content_Types].xml");
+        }
+    }
+
+    /**
+     * Converts a Word (docx) file to PDF using docx4j.
+     * <p>
+     * The input is validated and loaded before the output file is opened, so a
+     * rejected or unloadable input does not create or truncate {@code output}.
+     *
+     * @param input  path of the docx file to convert
+     * @param output path the PDF is written to
+     * @throws Exception if validation fails, the package cannot be loaded, it is not a
+     *                   WordprocessingML package, or the PDF export fails
+     */
+    public static void convertDocxToPdf(String input, String output) throws Exception {
+        validateDocx(input);
+
+        OpcPackage opc = OpcPackage.load(new File(input));
+        if (!(opc instanceof WordprocessingMLPackage)) {
+            throw new IOException("不是WordprocessingML文档");
+        }
+
+        try (OutputStream os = new FileOutputStream(output)) {
+            Docx4J.toPDF((WordprocessingMLPackage) opc, os);
+        }
+    }
+
     /**
      * 将 PDF 的首页和尾页转换为 PNG 图片
      *

+ 2 - 2
mo-daq/src/test/java/com/smppw/modaq/MoDaqApplicationTests.java

@@ -42,8 +42,8 @@ public class MoDaqApplicationTests {
     @Test
     public void reportTest() {
         MailboxInfoDTO emailInfoDTO = this.buildMailbox("**@simuwang.com", "**");
-        Date startDate = DateUtil.parse("2025-06-12 13:56:00", DateConst.YYYY_MM_DD_HH_MM_SS);
-        Date endDate = DateUtil.parse("2025-06-12 17:06:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date startDate = DateUtil.parse("2025-06-12 13:54:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date endDate = DateUtil.parse("2025-06-12 13:57:00", DateConst.YYYY_MM_DD_HH_MM_SS);
         try {
             List<String> folderNames = ListUtil.list(false);
 //            folderNames.add("其他文件夹/报告公告");