1 mēnesi atpakaļ · 1d1ec649e0
--- a/mo-daq/src/main/java/com/smppw/modaq/infrastructure/util/ExcelUtil.java
+++ b/mo-daq/src/main/java/com/smppw/modaq/infrastructure/util/ExcelUtil.java
@@ -4,6 +4,7 @@ import cn.hutool.core.collection.CollUtil;
 
				 import cn.hutool.core.collection.ListUtil;
			
 
				 import cn.hutool.core.io.FileUtil;
			
 
				 import cn.hutool.core.util.StrUtil;
			
 
				+import com.smppw.modaq.common.enums.ReportType;
			
 
				 import net.sf.sevenzipjbinding.ExtractOperationResult;
			
 
				 import net.sf.sevenzipjbinding.IInArchive;
			
 
				 import net.sf.sevenzipjbinding.SevenZip;
			
@@ -20,12 +21,14 @@ import org.apache.commons.compress.archivers.zip.ZipFile;
 
				 import org.apache.commons.io.IOUtils;
			
 
				 
			
 
				 import java.io.*;
			
 
				+import java.nio.charset.Charset;
			
 
				 import java.nio.file.Files;
			
 
				 import java.nio.file.Path;
			
 
				 import java.nio.file.Paths;
			
 
				 import java.util.Arrays;
			
 
				 import java.util.Enumeration;
			
 
				 import java.util.List;
			
 
				+import java.util.regex.Pattern;
			
 
				 
			
 
				 public class ExcelUtil {
			
 
				     // 候选编码列表（按常见顺序排列）
			
@@ -36,6 +39,24 @@ public class ExcelUtil {
 
				             "ISO-8859-1" // 默认回退
			
 
				     );
			
 
				 
			
 
				+    private static final Pattern ILLEGAL_CHARS_PATTERN =
			
 
				+            Pattern.compile("[\\\\:*?\"<>|\\x00-\\x1F]"); // includes control characters
			
 
				+
			
 
				+    // Unicode block ranges (inclusive start/end) for Chinese (CJK) characters
			
 
				+    private static final int[][] CJK_BLOCKS = {
			
 
				+            {0x4E00, 0x9FFF},    // CJK Unified Ideographs (basic Han characters)
			
 
				+            {0x3400, 0x4DBF},    // CJK Extension A (rare characters)
			
 
				+            {0x20000, 0x2A6DF},  // CJK Extension B (very rare characters); above U+FFFF, so it can only match a code point, never a single char value
			
 
				+            {0x2A700, 0x2B73F},  // CJK Extension C; also above U+FFFF
			
 
				+            {0x3000, 0x303F}     // CJK symbols and punctuation
			
 
				+    };
			
 
				+
			
 
				+    // Range of frequently used Chinese characters (said to cover about 99% of everyday usage)
			
 
				+    private static final int[][] COMMON_CHINESE_BLOCKS = {
			
 
				+            {0x4E00, 0x9FA5},    // General Standard Chinese Characters table (8105 characters) - NOTE(review): this range spans 20,902 code points; confirm the intended figure
			
 
				+            {0x3000, 0x303F}     // common punctuation
			
 
				+    };
			
 
				+
			
 
				     public static boolean isExcel(String fileName) {
			
 
				         return StrUtil.isNotBlank(fileName) && (fileName.endsWith("xls") || fileName.endsWith("xlsx") || fileName.endsWith("XLS") || fileName.endsWith("XLSX"));
			
 
				     }
			
@@ -106,8 +127,124 @@ public class ExcelUtil {
 
				         }
			
 
				 
			
 
				         return filePathList;
			
 
				+//        return extractCompressedFiles(zipFilePath, destFilePath, 0, new HashSet<>());
			
 
				     }
			
 
				 
			
 
				+//    /**
			
 
				+//     * 递归解压压缩文件（支持嵌套）
			
 
				+//     *
			
 
				+//     * @param zipFilePath 压缩文件路径
			
 
				+//     * @param destDir     目标目录
			
 
				+//     * @param depth       当前递归深度（防止无限递归）
			
 
				+//     * @param processed   已处理的文件哈希（防止重复解压）
			
 
				+//     */
			
 
				+//    private static List<String> extractCompressedFiles(String zipFilePath, String destDir, int depth, Set<String> processed) throws IOException, ArchiveException {
			
 
				+//        // 防御：限制递归深度防止栈溢出
			
 
				+//        if (depth > 4) {
			
 
				+//            throw new ArchiveException("Maximum recursion depth (4) exceeded");
			
 
				+//        }
			
 
				+//
			
 
				+//        String encoding = detectEncoding(zipFilePath); // 编码检测
			
 
				+//        if (encoding == null) {
			
 
				+//            encoding = "GBK";
			
 
				+//        }
			
 
				+//        List<String> extractedFiles = ListUtil.list(false);
			
 
				+//        File destDirFile = FileUtil.mkdir(destDir);
			
 
				+//
			
 
				+//        try (InputStream fis = new BufferedInputStream(Files.newInputStream(Paths.get(zipFilePath)));
			
 
				+//             ArchiveInputStream<?> ais = createArchiveInputStream(fis, encoding)) {
			
 
				+//            ArchiveEntry entry;
			
 
				+//            while ((entry = ais.getNextEntry()) != null) {
			
 
				+//                String entryName = sanitizeEntryName(entry.getName()); // 清理非法字符
			
 
				+//                if (entryName.startsWith("__MACOSX/")) {
			
 
				+//                    continue;
			
 
				+//                }
			
 
				+//                String zipFilename = FileUtil.getName(destDir);
			
 
				+//                if (zipFilename.contains("确认") && !entryName.contains("确认")) {
			
 
				+//                    String ext = FileUtil.extName(entryName);
			
 
				+//                    entryName = StrUtil.subBefore(entryName, ".", true);
			
 
				+//                    entryName = entryName + "_确认单." + ext;
			
 
				+//                }
			
 
				+//                File entryFile = new File(destDirFile, entryName);
			
 
				+//                if (entry.isDirectory()) {
			
 
				+//                    Files.createDirectories(entryFile.toPath());
			
 
				+//                } else {
			
 
				+//                    extractSingleFile(ais, entryFile);
			
 
				+//                    if (isArchiveFile(entryFile)) {  // 判断是否为压缩文件
			
 
				+//                        extractedFiles.addAll(processNestedArchive(entryFile, destDir, depth, processed));
			
 
				+//                    } else {
			
 
				+//                        extractedFiles.add(entryFile.getAbsolutePath());
			
 
				+//                    }
			
 
				+//                }
			
 
				+//            }
			
 
				+//        } catch (Exception e) {
			
 
				+//            handleArchiveException(e, zipFilePath, destDir, extractedFiles, encoding);
			
 
				+//        }
			
 
				+//        return extractedFiles;
			
 
				+//    }
			
 
				+//
			
 
				+//    // ----------- 辅助方法 -----------
			
 
				+//    private static ArchiveInputStream<?> createArchiveInputStream(InputStream is, String encoding) throws ArchiveException {
			
 
				+//        return new ArchiveStreamFactory(encoding)
			
 
				+//                .createArchiveInputStream(ArchiveStreamFactory.detect(is), is);
			
 
				+//    }
			
 
				+//
			
 
				+//    private static void extractSingleFile(ArchiveInputStream<?> ais, File targetFile) throws IOException {
			
 
				+//        try (OutputStream fos = Files.newOutputStream(targetFile.toPath())) {
			
 
				+//            IOUtils.copy(ais, fos);
			
 
				+//        } finally {
			
 
				+//            if (ais != null) {
			
 
				+//                ais.close();
			
 
				+//            }
			
 
				+//        }
			
 
				+//    }
			
 
				+//
			
 
				+//    private static List<String> processNestedArchive(File archiveFile, String parentDir, int depth, Set<String> processed) throws IOException, ArchiveException {
			
 
				+//        // 强制关闭所有可能关联的流（防御性编程）
			
 
				+//        System.gc(); // 触发垃圾回收释放潜在未关闭的流
			
 
				+//
			
 
				+//        String fileHash = DigestUtil.md5Hex(new FileInputStream(archiveFile)); // 文件哈希校验（需实现）
			
 
				+//        if (processed.contains(fileHash)) {
			
 
				+//            return ListUtil.empty(); // 避免重复解压相同文件
			
 
				+//        }
			
 
				+//        processed.add(fileHash);
			
 
				+//
			
 
				+//        String nestedDestDir = parentDir + File.separator + FileUtil.mainName(archiveFile.getName());
			
 
				+//        List<String> nestedFiles = extractCompressedFiles(
			
 
				+//                archiveFile.getAbsolutePath(),
			
 
				+//                nestedDestDir,
			
 
				+//                depth + 1,
			
 
				+//                processed
			
 
				+//        );
			
 
				+//
			
 
				+//        Files.delete(archiveFile.toPath()); // 删除原压缩包
			
 
				+//        return nestedFiles;
			
 
				+//    }
			
 
				+//
			
 
				+//    private static boolean isArchiveFile(File file) {
			
 
				+//        try (InputStream is = Files.newInputStream(file.toPath());
			
 
				+//             BufferedInputStream bis = new BufferedInputStream(is)) {
			
 
				+//            ArchiveStreamFactory.detect(bis); // 通过文件头检测
			
 
				+//            return true;
			
 
				+//        } catch (Exception e) {
			
 
				+//            return false;
			
 
				+//        }
			
 
				+//    }
			
 
				+//
			
 
				+//    // 清理非法路径字符（防御路径穿越攻击）
			
 
				+//    private static String sanitizeEntryName(String name) {
			
 
				+//        return name.replaceAll("[:\\\\/*\"?|<>]", "_");
			
 
				+//    }
			
 
				+//
			
 
				+//    // 统一异常处理
			
 
				+//    private static void handleArchiveException(Exception e, String zipFile, String destDir, List<String> extractedFiles, String encoding) throws ArchiveException, IOException {
			
 
				+//        if (e.getMessage().contains("split")) {
			
 
				+//            extractedFiles.addAll(extractSplitZip(zipFile, destDir, encoding));
			
 
				+//        } else {
			
 
				+//            throw new ArchiveException("Failed to extract " + zipFile, e);
			
 
				+//        }
			
 
				+//    }
			
 
				+
			
 
				     public static List<String> extractSplitZip(String zipFilePath, String destFilePath, String encoding) throws IOException {
			
 
				         List<String> resultList = ListUtil.list(false);
			
 
				         File file = new File(zipFilePath);
			
@@ -200,7 +337,7 @@ public class ExcelUtil {
 
				                 if (entry == null) continue; // 空压缩包
			
 
				 
			
 
				                 String fileName = entry.getName();
			
 
				-                if (!hasInvalidCharacters(fileName)) {
			
 
				+                if (!isLikelyGarbled(fileName, encoding)) {
			
 
				                     return encoding; // 找到有效编码
			
 
				                 }
			
 
				             } catch (Exception e) {
			
@@ -210,10 +347,86 @@ public class ExcelUtil {
 
				         return null;
			
 
				     }
			
 
				 
			
 
				-    // 检查文件名是否包含无效字符（如替换符）
			
 
				-    private static boolean hasInvalidCharacters(String fileName) {
			
 
				-        // 检查常见乱码符号：�或连续问号
			
 
				-        return fileName.contains("�") || fileName.matches(".*\\?{2,}.*");
			
 
				+    public static boolean isLikelyGarbled(String fileName, String encoding) {
			
 
				+        // 基础检查：非法字符、替换符、连续问号
			
 
				+        if (ILLEGAL_CHARS_PATTERN.matcher(fileName).find() ||
			
 
				+                fileName.contains("�") ||
			
 
				+                fileName.matches(".*\\?{2,}.*")) {
			
 
				+            return true;
			
 
				+        }
			
 
				+
			
 
				+        // 编码一致性检查（假设系统使用 UTF-8）
			
 
				+        if (!isEncodingConsistent(fileName, Charset.forName(encoding))) {
			
 
				+            return true;
			
 
				+        }
			
 
				+
			
 
				+        // 中文字符乱码检测
			
 
				+        return hasLowMeaningfulness(fileName) || isLikelyGarbledWithRareChars(fileName);
			
 
				+    }
			
 
				+
			
 
				+    private static boolean isEncodingConsistent(String text, Charset expectedCharset) {
			
 
				+        // 将文本按预期编码转换为字节，再解码验证一致性
			
 
				+        byte[] bytes = text.getBytes(expectedCharset);
			
 
				+        String redecoded = new String(bytes, expectedCharset);
			
 
				+        return text.equals(redecoded);
			
 
				+    }
			
 
				+
			
 
				+    public static boolean isLikelyGarbledWithRareChars(String text) {
			
 
				+        int totalChars = text.length();
			
 
				+        if (totalChars == 0) return false;
			
 
				+
			
 
				+        int commonCount = 0;
			
 
				+        int rareCJKCount = 0;
			
 
				+
			
 
				+        for (char c : text.toCharArray()) {
			
 
				+            // 判断是否属于任何CJK区块
			
 
				+            boolean isCJK = isInUnicodeBlocks(c, CJK_BLOCKS);
			
 
				+            // 判断是否属于高频汉字区
			
 
				+            boolean isCommon = isInUnicodeBlocks(c, COMMON_CHINESE_BLOCKS);
			
 
				+
			
 
				+            if (isCJK && !isCommon) {
			
 
				+                rareCJKCount++; // 统计生僻CJK字符
			
 
				+            } else if (isCommon) {
			
 
				+                commonCount++;
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        // 规则1：生僻CJK占比超过50%且总CJK字符占比高
			
 
				+        boolean rule1 = (rareCJKCount > 0) &&
			
 
				+                (rareCJKCount * 2 > totalChars) &&
			
 
				+                ((commonCount + rareCJKCount) * 1.0 / totalChars > 0.7);
			
 
				+
			
 
				+        // 规则2：高频字占比极低（<20%）但CJK总占比高（编码错误特征）
			
 
				+        boolean rule2 = (commonCount * 1.0 / totalChars < 0.2) &&
			
 
				+                ((commonCount + rareCJKCount) * 1.0 / totalChars > 0.6);
			
 
				+
			
 
				+        return rule1 || rule2;
			
 
				+    }
			
 
				+
			
 
				+    // 辅助方法：判断字符是否在指定Unicode区块内
			
 
				+    private static boolean isInUnicodeBlocks(char c, int[][] blocks) {
			
 
				+        for (int[] block : blocks) {
			
 
				+            if ((int) c >= block[0] && (int) c <= block[1]) {
			
 
				+                return true;
			
 
				+            }
			
 
				+        }
			
 
				+        return false;
			
 
				+    }
			
 
				+
			
 
				+    // 上下文合理性检测
			
 
				+    private static boolean hasLowMeaningfulness(String text) {
			
 
				+        // 假设正常文本应包含常见停用词（的、是、在等）
			
 
				+        List<String> commonWords = ListUtil.list(false);
			
 
				+        commonWords.addAll(ListUtil.toList(ReportType.ANNUALLY.getPatterns()));
			
 
				+        commonWords.addAll(ListUtil.toList(ReportType.QUARTERLY.getPatterns()));
			
 
				+        commonWords.addAll(ListUtil.toList(ReportType.MONTHLY.getPatterns()));
			
 
				+        commonWords.addAll(ListUtil.toList(ReportType.LETTER.getPatterns()));
			
 
				+        commonWords.addAll(ListUtil.toList(ReportType.WEEKLY.getPatterns()));
			
 
				+        commonWords.addAll(ListUtil.toList(ReportType.OTHER.getPatterns()));
			
 
				+        for (String word : commonWords) {
			
 
				+            if (text.contains(word)) return false;
			
 
				+        }
			
 
				+        return true;
			
 
				     }
			
 
				 
			
 
				     public static void main(String[] args) throws Exception {
			
--- a/mo-daq/src/test/java/com/smppw/modaq/MoDaqApplicationTests.java
+++ b/mo-daq/src/test/java/com/smppw/modaq/MoDaqApplicationTests.java
@@ -37,9 +37,9 @@ public class MoDaqApplicationTests {
 
				 
			
 
				     @Test
			
 
				     public void reportTest() {
			
 
				-        MailboxInfoDTO emailInfoDTO = this.buildMailbox("***@simuwang.com", "***");
			
 
				-        Date startDate = DateUtil.parse("2025-05-15 11:40:00", DateConst.YYYY_MM_DD_HH_MM_SS);
			
 
				-        Date endDate = DateUtil.parse("2025-05-15 16:53:00", DateConst.YYYY_MM_DD_HH_MM_SS);
			
 
				+        MailboxInfoDTO emailInfoDTO = this.buildMailbox("**@simuwang.com", "**");
			
 
				+        Date startDate = DateUtil.parse("2025-05-22 11:55:00", DateConst.YYYY_MM_DD_HH_MM_SS);
			
 
				+        Date endDate = DateUtil.parse("2025-05-22 11:58:00", DateConst.YYYY_MM_DD_HH_MM_SS);
			
 
				         try {
			
 
				             List<String> folderNames = ListUtil.list(false);
			
 
				 //            folderNames.add("其他文件夹/报告公告");