فهرست منبع

fix:修复附件名称字符编码识别错误问题

wangzaijun 1 ماه پیش
والد
کامیت
1d1ec649e0

+ 218 - 5
mo-daq/src/main/java/com/smppw/modaq/infrastructure/util/ExcelUtil.java

@@ -4,6 +4,7 @@ import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.collection.ListUtil;
 import cn.hutool.core.io.FileUtil;
 import cn.hutool.core.util.StrUtil;
+import com.smppw.modaq.common.enums.ReportType;
 import net.sf.sevenzipjbinding.ExtractOperationResult;
 import net.sf.sevenzipjbinding.IInArchive;
 import net.sf.sevenzipjbinding.SevenZip;
@@ -20,12 +21,14 @@ import org.apache.commons.compress.archivers.zip.ZipFile;
 import org.apache.commons.io.IOUtils;
 
 import java.io.*;
+import java.nio.charset.Charset;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.Enumeration;
 import java.util.List;
+import java.util.regex.Pattern;
 
 public class ExcelUtil {
     // 候选编码列表(按常见顺序排列)
@@ -36,6 +39,24 @@ public class ExcelUtil {
             "ISO-8859-1" // 默认回退
     );
 
+    private static final Pattern ILLEGAL_CHARS_PATTERN =
+            Pattern.compile("[\\\\:*?\"<>|\\x00-\\x1F]"); // Matches a backslash, : * ? " < > | or a control character U+0000-U+001F; '/' is not in the set
+
+    // Inclusive [start, end] code point ranges that the garbled-name heuristic treats as CJK
+    private static final int[][] CJK_BLOCKS = {
+            {0x4E00, 0x9FFF},    // CJK Unified Ideographs (basic Han characters)
+            {0x3400, 0x4DBF},    // CJK Extension A (rare characters)
+            {0x20000, 0x2A6DF},  // CJK Extension B (very rare); above U+FFFF, so a single 16-bit Java char can never fall in this range
+            {0x2A700, 0x2B73F},  // CJK Extension C; also above U+FFFF
+            {0x3000, 0x303F}     // CJK symbols and punctuation
+    };
+
+    // Ranges counted as "common" Chinese characters (original author's estimate: about 99% of everyday usage - not verified here)
+    private static final int[][] COMMON_CHINESE_BLOCKS = {
+            {0x4E00, 0x9FA5},    // Original note: "General Standard Chinese Characters table (8105 chars)"; NOTE(review): this range spans 20,902 code points - confirm intent
+            {0x3000, 0x303F}     // Common punctuation
+    };
+
     public static boolean isExcel(String fileName) { // True when fileName is non-blank and ends with xls, xlsx, XLS or XLSX
         return StrUtil.isNotBlank(fileName) && (fileName.endsWith("xls") || fileName.endsWith("xlsx") || fileName.endsWith("XLS") || fileName.endsWith("XLSX")); // NOTE(review): case-sensitive suffix match with no leading dot, so "Xls" is rejected and "axls" is accepted
     }
@@ -106,8 +127,124 @@ public class ExcelUtil {
         }
 
         return filePathList;
+//        return extractCompressedFiles(zipFilePath, destFilePath, 0, new HashSet<>());
     }
 
+//    /**
+//     * 递归解压压缩文件(支持嵌套)
+//     *
+//     * @param zipFilePath 压缩文件路径
+//     * @param destDir     目标目录
+//     * @param depth       当前递归深度(防止无限递归)
+//     * @param processed   已处理的文件哈希(防止重复解压)
+//     */
+//    private static List<String> extractCompressedFiles(String zipFilePath, String destDir, int depth, Set<String> processed) throws IOException, ArchiveException {
+//        // 防御:限制递归深度防止栈溢出
+//        if (depth > 4) {
+//            throw new ArchiveException("Maximum recursion depth (4) exceeded");
+//        }
+//
+//        String encoding = detectEncoding(zipFilePath); // 编码检测
+//        if (encoding == null) {
+//            encoding = "GBK";
+//        }
+//        List<String> extractedFiles = ListUtil.list(false);
+//        File destDirFile = FileUtil.mkdir(destDir);
+//
+//        try (InputStream fis = new BufferedInputStream(Files.newInputStream(Paths.get(zipFilePath)));
+//             ArchiveInputStream<?> ais = createArchiveInputStream(fis, encoding)) {
+//            ArchiveEntry entry;
+//            while ((entry = ais.getNextEntry()) != null) {
+//                String entryName = sanitizeEntryName(entry.getName()); // 清理非法字符
+//                if (entryName.startsWith("__MACOSX/")) {
+//                    continue;
+//                }
+//                String zipFilename = FileUtil.getName(destDir);
+//                if (zipFilename.contains("确认") && !entryName.contains("确认")) {
+//                    String ext = FileUtil.extName(entryName);
+//                    entryName = StrUtil.subBefore(entryName, ".", true);
+//                    entryName = entryName + "_确认单." + ext;
+//                }
+//                File entryFile = new File(destDirFile, entryName);
+//                if (entry.isDirectory()) {
+//                    Files.createDirectories(entryFile.toPath());
+//                } else {
+//                    extractSingleFile(ais, entryFile);
+//                    if (isArchiveFile(entryFile)) {  // 判断是否为压缩文件
+//                        extractedFiles.addAll(processNestedArchive(entryFile, destDir, depth, processed));
+//                    } else {
+//                        extractedFiles.add(entryFile.getAbsolutePath());
+//                    }
+//                }
+//            }
+//        } catch (Exception e) {
+//            handleArchiveException(e, zipFilePath, destDir, extractedFiles, encoding);
+//        }
+//        return extractedFiles;
+//    }
+//
+//    // ----------- 辅助方法 -----------
+//    private static ArchiveInputStream<?> createArchiveInputStream(InputStream is, String encoding) throws ArchiveException {
+//        return new ArchiveStreamFactory(encoding)
+//                .createArchiveInputStream(ArchiveStreamFactory.detect(is), is);
+//    }
+//
+//    private static void extractSingleFile(ArchiveInputStream<?> ais, File targetFile) throws IOException {
+//        try (OutputStream fos = Files.newOutputStream(targetFile.toPath())) {
+//            IOUtils.copy(ais, fos);
+//        } finally {
+//            if (ais != null) {
+//                ais.close();
+//            }
+//        }
+//    }
+//
+//    private static List<String> processNestedArchive(File archiveFile, String parentDir, int depth, Set<String> processed) throws IOException, ArchiveException {
+//        // 强制关闭所有可能关联的流(防御性编程)
+//        System.gc(); // 触发垃圾回收释放潜在未关闭的流
+//
+//        String fileHash = DigestUtil.md5Hex(new FileInputStream(archiveFile)); // 文件哈希校验(需实现)
+//        if (processed.contains(fileHash)) {
+//            return ListUtil.empty(); // 避免重复解压相同文件
+//        }
+//        processed.add(fileHash);
+//
+//        String nestedDestDir = parentDir + File.separator + FileUtil.mainName(archiveFile.getName());
+//        List<String> nestedFiles = extractCompressedFiles(
+//                archiveFile.getAbsolutePath(),
+//                nestedDestDir,
+//                depth + 1,
+//                processed
+//        );
+//
+//        Files.delete(archiveFile.toPath()); // 删除原压缩包
+//        return nestedFiles;
+//    }
+//
+//    private static boolean isArchiveFile(File file) {
+//        try (InputStream is = Files.newInputStream(file.toPath());
+//             BufferedInputStream bis = new BufferedInputStream(is)) {
+//            ArchiveStreamFactory.detect(bis); // 通过文件头检测
+//            return true;
+//        } catch (Exception e) {
+//            return false;
+//        }
+//    }
+//
+//    // 清理非法路径字符(防御路径穿越攻击)
+//    private static String sanitizeEntryName(String name) {
+//        return name.replaceAll("[:\\\\/*\"?|<>]", "_");
+//    }
+//
+//    // 统一异常处理
+//    private static void handleArchiveException(Exception e, String zipFile, String destDir, List<String> extractedFiles, String encoding) throws ArchiveException, IOException {
+//        if (e.getMessage().contains("split")) {
+//            extractedFiles.addAll(extractSplitZip(zipFile, destDir, encoding));
+//        } else {
+//            throw new ArchiveException("Failed to extract " + zipFile, e);
+//        }
+//    }
+
     public static List<String> extractSplitZip(String zipFilePath, String destFilePath, String encoding) throws IOException {
         List<String> resultList = ListUtil.list(false);
         File file = new File(zipFilePath);
@@ -200,7 +337,7 @@ public class ExcelUtil {
                 if (entry == null) continue; // 空压缩包
 
                 String fileName = entry.getName();
-                if (!hasInvalidCharacters(fileName)) {
+                if (!isLikelyGarbled(fileName, encoding)) {
                     return encoding; // 找到有效编码
                 }
             } catch (Exception e) {
@@ -210,10 +347,86 @@ public class ExcelUtil {
         return null;
     }
 
-    // 检查文件名是否包含无效字符(如替换符)
-    private static boolean hasInvalidCharacters(String fileName) {
-        // 检查常见乱码符号:�或连续问号
-        return fileName.contains("�") || fileName.matches(".*\\?{2,}.*");
+    public static boolean isLikelyGarbled(String fileName, String encoding) { // Heuristic: true when fileName, as decoded with the candidate encoding, looks mis-decoded
+        // Basic checks: illegal characters, the U+FFFD replacement character, or a run of question marks. NOTE(review): ILLEGAL_CHARS_PATTERN already contains '?', so a single '?' returns true and the "two or more" check adds nothing.
+        if (ILLEGAL_CHARS_PATTERN.matcher(fileName).find() ||
+                fileName.contains("�") ||
+                fileName.matches(".*\\?{2,}.*")) {
+            return true;
+        }
+
+        // Round-trip check against the charset named by the encoding parameter (not a fixed UTF-8 assumption); Charset.forName throws an unchecked exception for an illegal or unsupported name
+        if (!isEncodingConsistent(fileName, Charset.forName(encoding))) {
+            return true;
+        }
+
+        // Final step: garbled when the name contains no known report keyword, or when its CJK character mix looks abnormal
+        return hasLowMeaningfulness(fileName) || isLikelyGarbledWithRareChars(fileName);
+    }
+
+    private static boolean isEncodingConsistent(String text, Charset expectedCharset) { // True when text survives an encode/decode round trip through expectedCharset
+        // Encode the text with the expected charset, decode the bytes again, and compare with the input; characters the charset cannot represent are replaced during encoding, which makes the comparison fail
+        byte[] bytes = text.getBytes(expectedCharset);
+        String redecoded = new String(bytes, expectedCharset);
+        return text.equals(redecoded);
+    }
+
+    public static boolean isLikelyGarbledWithRareChars(String text) { // Heuristic: true when the ratio of rare to common CJK characters suggests a wrong decoding
+        int totalChars = text.length(); // length in UTF-16 code units, not code points
+        if (totalChars == 0) return false; // empty text is never flagged
+
+        int commonCount = 0;
+        int rareCJKCount = 0;
+
+        for (char c : text.toCharArray()) { // iterates UTF-16 units; surrogate halves match none of the configured ranges
+            // Is the char inside any of the CJK_BLOCKS ranges?
+            boolean isCJK = isInUnicodeBlocks(c, CJK_BLOCKS);
+            // Is the char inside the "common" Chinese ranges?
+            boolean isCommon = isInUnicodeBlocks(c, COMMON_CHINESE_BLOCKS);
+
+            if (isCJK && !isCommon) {
+                rareCJKCount++; // CJK char outside the common ranges
+            } else if (isCommon) {
+                commonCount++;
+            }
+        }
+
+        // Rule 1: rare CJK chars are more than half of all chars, and all counted CJK chars together exceed 70%
+        boolean rule1 = (rareCJKCount > 0) &&
+                (rareCJKCount * 2 > totalChars) &&
+                ((commonCount + rareCJKCount) * 1.0 / totalChars > 0.7);
+
+        // Rule 2: common chars are under 20% of all chars while all counted CJK chars exceed 60% (treated as a sign of a wrong decoding)
+        boolean rule2 = (commonCount * 1.0 / totalChars < 0.2) &&
+                ((commonCount + rareCJKCount) * 1.0 / totalChars > 0.6);
+
+        return rule1 || rule2;
+    }
+
+    // Helper: returns true when c lies inside any inclusive [start, end] range in blocks; c is a 16-bit char, so ranges above 0xFFFF can never match
+    private static boolean isInUnicodeBlocks(char c, int[][] blocks) {
+        for (int[] block : blocks) {
+            if ((int) c >= block[0] && (int) c <= block[1]) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Context plausibility check: returns true when text contains none of the known report-type keywords
+    private static boolean hasLowMeaningfulness(String text) {
+        // The original comment spoke of common stop words, but the list is built only from the patterns of the ReportType constants below
+        List<String> commonWords = ListUtil.list(false); // rebuilt on every call
+        commonWords.addAll(ListUtil.toList(ReportType.ANNUALLY.getPatterns()));
+        commonWords.addAll(ListUtil.toList(ReportType.QUARTERLY.getPatterns()));
+        commonWords.addAll(ListUtil.toList(ReportType.MONTHLY.getPatterns()));
+        commonWords.addAll(ListUtil.toList(ReportType.LETTER.getPatterns()));
+        commonWords.addAll(ListUtil.toList(ReportType.WEEKLY.getPatterns()));
+        commonWords.addAll(ListUtil.toList(ReportType.OTHER.getPatterns()));
+        for (String word : commonWords) {
+            if (text.contains(word)) return false; // plain substring match, not a regex match
+        }
+        return true;
     }
 
     public static void main(String[] args) throws Exception {

+ 3 - 3
mo-daq/src/test/java/com/smppw/modaq/MoDaqApplicationTests.java

@@ -37,9 +37,9 @@ public class MoDaqApplicationTests {
 
     @Test
     public void reportTest() {
-        MailboxInfoDTO emailInfoDTO = this.buildMailbox("***@simuwang.com", "***");
-        Date startDate = DateUtil.parse("2025-05-15 11:40:00", DateConst.YYYY_MM_DD_HH_MM_SS);
-        Date endDate = DateUtil.parse("2025-05-15 16:53:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        MailboxInfoDTO emailInfoDTO = this.buildMailbox("**@simuwang.com", "**");
+        Date startDate = DateUtil.parse("2025-05-22 11:55:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date endDate = DateUtil.parse("2025-05-22 11:58:00", DateConst.YYYY_MM_DD_HH_MM_SS);
         try {
             List<String> folderNames = ListUtil.list(false);
 //            folderNames.add("其他文件夹/报告公告");