wangzaijun 1 月之前
父节点
当前提交
f4c967d34a

+ 9 - 58
mo-daq/src/main/java/com/smppw/modaq/application/components/ReportParseUtils.java

@@ -3,20 +3,11 @@ package com.smppw.modaq.application.components;
 import cn.hutool.core.collection.ListUtil;
 import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.StrUtil;
-import com.smppw.modaq.common.conts.Constants;
 import com.smppw.modaq.common.conts.EmailTypeConst;
-import com.smppw.modaq.common.enums.ReportParseStatus;
 import com.smppw.modaq.common.enums.ReportType;
 import com.smppw.modaq.common.exception.ReportParseException;
 import com.smppw.modaq.domain.dto.report.ReportAssetAllocationDTO;
-import org.apache.pdfbox.Loader;
-import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import technology.tabula.CustomObjectExtractor;
-import technology.tabula.Page;
-import technology.tabula.PageIterator;
-import technology.tabula.Table;
-import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+import jakarta.mail.internet.MimeUtility;
 
 import java.io.IOException;
 import java.util.Calendar;
@@ -440,53 +431,13 @@ public final class ReportParseUtils {
     }
 
     public static void main(String[] args) throws IOException, ReportParseException {
-//        String filepath = "C:\\Users\\Administrator\\Desktop\\tmp\\(1)投资者交易确认函【申购】_【SZF635】佳岳国债增强私募证券投资基金_20250217_任军.pdf";
-//        String filepath = "C:\\Users\\Administrator\\Desktop\\tmp\\CP080A_优美利赢胜价值1号私募投资基金A_20250217_邓辉_申购确认_20250217131352.pdf";
-//        String filepath = "C:\\Users\\Administrator\\Desktop\\tmp\\宁水德远国债宝私募证券投资基金_青国平(S21002741743)_申购_20241213_基金交易确认单a2604e57e9a12108.sign.pdf";
-//        String filepath = "C:\\Users\\Administrator\\Desktop\\tmp\\基金分红交易确认函_SJH876_2025-02-12_戴羽晨_202502130107720842.pdf";
-        String filepath = "C:\\Users\\Administrator\\Desktop\\tmp\\钧富如风7号私募证券投资基金_陈小明_20250221_073544980_申购确认单.pdf";
-//        String filepath = "C:\\Users\\Administrator\\Desktop\\tmp\\SZN224_君之健睿泰私募证券投资基金_郑为民_20250214_申购确认_20250217102704.pdf";
-
-//        String aiParserUtl = "http://localhost:8088/upload-filepath";
-//
-//        Map<String, Object> params = MapUtil.newHashMap(4);
-//        params.put("filepath", filepath);
-//        String body = HttpUtil.get(aiParserUtl, params);
-//
-//        String content = "{" +
-//                StrUtil.subAfter(body, "{", false)
-//                        .replaceAll("\\\\", "")
-//                        .replaceAll("n", "")
-//                        .replaceAll(" ", "") +
-//                "}";
-//        System.out.println(content);
-
-        List<String> textList;
-        // 解析报告和表格
-        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(filepath))) {
-            // 识别所有文字(去水印后的)
-            CustomPDFTextStripper stripper = new CustomPDFTextStripper(true, "");
-            String text = stripper.getText(document).replace(Constants.WATERMARK_REPLACE, StrUtil.EMPTY);
-            textList = StrUtil.split(text, System.lineSeparator());
-            textList.removeIf(StrUtil::isBlank);
-            if (textList.isEmpty()) {
-                throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN, "");
-            }
-            // 解析所有表格(单元格字符去水印)
-            List<Table> tables = ListUtil.list(true);
-//            BasicExtractionAlgorithm extractionAlgorithm = new BasicExtractionAlgorithm();
-            SpreadsheetExtractionAlgorithm spreadsheetExtractionAlgorithm = new SpreadsheetExtractionAlgorithm();
-            // 自定义表格提取工具,去除单元格中的水印文字
-            PageIterator pageIterator = new CustomObjectExtractor(document).extract();
-            while (pageIterator.hasNext()) {
-                Page page = pageIterator.next();
-                List<Table> tablesList = spreadsheetExtractionAlgorithm.extract(page);
-                tables.addAll(tablesList);
-            }
-            if (tables.isEmpty()) {
-                throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN, "");
-            }
-//            this.initTableInfo(tables);
-        }
+        String s = """
+                =?utf-8?b?5Y2D6LGh5Y2T6LaKMuWPt+S4reivgTUwMOaMh+aVsOWinuW8uuengeWLn+ivgQ==?=
+                 =?utf-8?b?5Yi45oqV6LWE5Z+66YeRLeWNg+ixoeWNk+i2ijLlj7fkuK3or4E1MDDmjIfmlbA=?=
+                 =?utf-8?b?5aKe5by656eB5Yuf6K+B5Yi45oqV6LWE5Z+66YeRMjAyNeW5tDTmnIjmnIjluqY=?=
+                 =?utf-8?b?5oql5ZGKLTIwMjUwNTEyLnBkZg==?=
+                """;
+        String s1 = MimeUtility.decodeText(s);
+        System.out.println(s1);
     }
 }

+ 28 - 0
mo-daq/src/main/java/com/smppw/modaq/application/util/EmailUtil.java

@@ -7,11 +7,15 @@ import cn.hutool.extra.mail.JakartaUserPassAuthenticator;
 import com.smppw.modaq.common.conts.EmailTypeConst;
 import com.smppw.modaq.domain.dto.MailboxInfoDTO;
 import com.sun.mail.imap.IMAPStore;
+import jakarta.mail.MessagingException;
+import jakarta.mail.Part;
 import jakarta.mail.Session;
 import jakarta.mail.Store;
+import jakarta.mail.internet.MimeUtility;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.UnsupportedEncodingException;
 import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -28,6 +32,30 @@ public class EmailUtil {
     private static final String POP3 = "pop3";
     private static final String IMAP = "imap";
 
+    /**
+     * Finds an RFC 2231 extended parameter of the form {@code filename*=charset'language'value}
+     * anywhere inside a header value. Group 1 is the charset (may be empty), group 2 is the
+     * percent-encoded value. Continuation parameters such as {@code filename*0*=} are deliberately
+     * not matched.
+     */
+    private static final Pattern RFC2231_FILENAME_PARAM = Pattern.compile(
+            "(?:^|;)\\s*filename\\*\\s*=\\s*\"?([^'\";\\s]*)'[^']*'([^\";\\s]+)",
+            Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Resolves a human-readable attachment file name for the given MIME part.
+     * <p>
+     * Resolution order:
+     * <ol>
+     *   <li>an RFC 2231 {@code filename*} parameter found in any {@code Content-Disposition}
+     *       header (e.g. {@code attachment; filename*=utf-8''%E4%B8%AD%E6%96%87.txt});</li>
+     *   <li>the value of {@link Part#getFileName()}, RFC 2047-decoded when it still contains
+     *       encoded-words (e.g. {@code =?utf-8?B?5Lit?= =?utf-8?B?5paH?=.txt});</li>
+     *   <li>the value of {@link Part#getFileName()} unchanged.</li>
+     * </ol>
+     *
+     * @param part the MIME part whose file name is wanted
+     * @return the decoded file name, or {@code null} when the part carries no file name
+     * @throws UnsupportedEncodingException if an RFC 2047 encoded-word uses an unsupported charset
+     * @throws MessagingException           if the part's headers cannot be read
+     */
+    public static String decodeFileName(Part part) throws UnsupportedEncodingException, MessagingException {
+        String filename = part.getFileName();
+
+        // Prefer the RFC 2231 extended parameter. The header value normally starts with the
+        // disposition type ("attachment; ..."), so the parameter has to be searched for inside
+        // the value rather than expected at its beginning.
+        String[] values = part.getHeader("Content-Disposition");
+        if (values != null) {
+            for (String value : values) {
+                String extended = decodeExtendedFileName(value);
+                if (extended != null) {
+                    return extended;
+                }
+            }
+        }
+
+        // RFC 2047 encoded-words, possibly split into several segments.
+        if (filename != null && filename.contains("=?")) {
+            return MimeUtility.decodeText(filename);
+        }
+
+        return filename;
+    }
+
+    /**
+     * Extracts and percent-decodes the {@code filename*} parameter of one header value.
+     *
+     * @param headerValue a raw {@code Content-Disposition} header value, may be {@code null}
+     * @return the decoded name, or {@code null} when the parameter is absent or cannot be decoded
+     */
+    private static String decodeExtendedFileName(String headerValue) {
+        if (headerValue == null) {
+            return null;
+        }
+        Matcher matcher = RFC2231_FILENAME_PARAM.matcher(headerValue);
+        if (!matcher.find()) {
+            return null;
+        }
+        String charset = matcher.group(1).isEmpty() ? "UTF-8" : MimeUtility.javaCharset(matcher.group(1));
+        // URLDecoder turns '+' into a space (form encoding); RFC 2231 does not, so protect it first.
+        String encoded = matcher.group(2).replace("+", "%2B");
+        try {
+            return java.net.URLDecoder.decode(encoded, charset);
+        } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+            // Unknown charset or malformed %-sequence: report "not found" so the caller falls back
+            // to Part#getFileName() instead of failing the whole attachment.
+            log.warn("Cannot decode RFC 2231 filename parameter '{}': {}", headerValue, e.getMessage());
+            return null;
+        }
+    }
+
+
 //    /**
 //     * 采集邮件(多消息体)信息
 //     *

+ 4 - 26
mo-daq/src/main/java/com/smppw/modaq/domain/service/EmailParseService.java

@@ -47,7 +47,6 @@ import org.springframework.util.StopWatch;
 
 import java.io.File;
 import java.io.IOException;
-import java.io.UnsupportedEncodingException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.*;
@@ -685,11 +684,14 @@ public class EmailParseService {
 
     private void rePart(String account, String subject, Date sendDate, Part part,
                         List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
-        String fileName = decodeFileName(part);
+        String fileName = EmailUtil.decodeFileName(part);
         if (StrUtil.isBlank(fileName)) {
             log.warn("邮件{} 附件文件名是空的,不做下载!", subject);
             return;
         }
+        if (fileName.contains("=?")) {
+            fileName = MimeUtility.decodeText(fileName);
+        }
         String disposition = part.getDisposition();
         String contentType = part.getContentType();
         if (log.isInfoEnabled()) {
@@ -732,30 +734,6 @@ public class EmailParseService {
         emailContentInfoDTOList.add(emailContentInfoDTO);
     }
 
-    // 解码文件名(处理 RFC 2231 和 RFC 2047)
-    private static String decodeFileName(Part part) throws UnsupportedEncodingException, MessagingException {
-        String filename = part.getFileName();
-
-        // 优先尝试 RFC 2231 的 filename*(如 "filename*=utf-8''%E4%B8%AD%E6%96%87.txt")
-        String[] values = part.getHeader("Content-Disposition");
-        if (values != null) {
-            for (String value : values) {
-                if (value.startsWith("filename*")) {
-                    filename = value.split("'")[2]; // 提取编码后的字符串
-                    filename = MimeUtility.decodeText(filename);
-                    return filename;
-                }
-            }
-        }
-
-        // 处理 RFC 2047 的多段编码(如 "=?utf-8?B?5Lit?= =?utf-8?B?5paH?=.txt")
-        if (filename != null && filename.contains("=?")) {
-            return MimeUtility.decodeText(filename);
-        }
-
-        return filename;
-    }
-
     private void reMultipart(String account, String subject, Date emailDate, Multipart multipart,
                              List<EmailContentInfoDTO> emailContentInfoDTOList) throws Exception {
         for (int i = 0; i < multipart.getCount(); i++) {