package com.smppw.modaq.infrastructure.util; import cn.hutool.core.collection.ListUtil; import cn.hutool.core.io.FileUtil; import cn.hutool.core.util.StrUtil; import com.smppw.modaq.common.conts.Constants; import com.smppw.modaq.common.enums.ReportType; import net.sf.sevenzipjbinding.*; import net.sf.sevenzipjbinding.impl.RandomAccessFileInStream; import net.sf.sevenzipjbinding.simple.ISimpleInArchive; import net.sf.sevenzipjbinding.simple.ISimpleInArchiveItem; import org.apache.commons.compress.archivers.ArchiveEntry; import org.apache.commons.compress.archivers.ArchiveInputStream; import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.io.IOUtils; import java.io.*; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; import java.util.Enumeration; import java.util.List; import java.util.regex.Pattern; public class ArchiveUtil { // 候选编码列表(按常见顺序排列) private static final List CANDIDATE_ENCODINGS = Arrays.asList( "GBK", // 中文环境常用 "UTF-8", // 标准编码 "GB2312", // 旧版中文 "ISO-8859-1" // 默认回退 ); private static final Pattern ILLEGAL_CHARS_PATTERN = Pattern.compile("[\\\\:*?\"<>|\\x00-\\x1F]"); // 包含控制字符 // Unicode 中文字符区块定义 private static final int[][] CJK_BLOCKS = { {0x4E00, 0x9FFF}, // CJK Unified Ideographs (基本汉字) {0x3400, 0x4DBF}, // CJK Extension A (生僻字) {0x20000, 0x2A6DF}, // CJK Extension B (极生僻字) {0x2A700, 0x2B73F}, // CJK Extension C {0x3000, 0x303F} // CJK标点符号 }; // 常见高频汉字范围(覆盖约99%日常用字) private static final int[][] COMMON_CHINESE_BLOCKS = { {0x4E00, 0x9FA5}, // 通用规范汉字表(8105字) {0x3000, 0x303F} // 常用标点 }; public static boolean isArchive(String fileName) { return isZip(fileName) || is7z(fileName) || isRAR(fileName); } public static boolean isZip(String fileName) { return StrUtil.isNotBlank(fileName) && StrUtil.endWithIgnoreCase(fileName, Constants.ARCHIVE_ZIP); } public static boolean is7z(String fileName) { return StrUtil.isNotBlank(fileName) && StrUtil.endWithIgnoreCase(fileName, Constants.ARCHIVE_7Z); } public static boolean isRAR(String fileName) { return StrUtil.isNotBlank(fileName) && StrUtil.endWithIgnoreCase(fileName, Constants.ARCHIVE_RAR); } public static List extractCompressedFiles(String zipFilePath, String destFilePath) throws IOException { File destFile = FileUtil.file(destFilePath); if (!destFile.exists()) { Files.createDirectories(destFile.toPath()); } String encoding = detectEncoding(zipFilePath); if (encoding == null) { encoding = "GBK"; } return decompressZip(zipFilePath, destFilePath, 2, encoding); } public static List extractRar5(String rarFilePath, String outputDir) throws IOException { try { // 初始化 SevenZipJBinding 本地库 SevenZip.initSevenZipFromPlatformJAR(); } catch (SevenZipNativeInitializationException ignored) { } RandomAccessFile randomAccessFile = null; IInArchive inArchive = null; List resultList = ListUtil.list(false); try { // 打开 RAR 文件 randomAccessFile = new RandomAccessFile(rarFilePath, "r"); inArchive = SevenZip.openInArchive(null, new RandomAccessFileInStream(randomAccessFile)); // 获取压缩包中的文件列表 ISimpleInArchive simpleInArchive = inArchive.getSimpleInterface(); for (ISimpleInArchiveItem item : simpleInArchive.getArchiveItems()) { if (!item.isFolder()) { resultList.add(extractItem(item, outputDir)); } } } finally { // 释放资源 if (inArchive != null) { inArchive.close(); } if (randomAccessFile != null) { randomAccessFile.close(); } } return resultList; } private static String extractItem(ISimpleInArchiveItem item, String outputDir) throws SevenZipException { String filePath = outputDir + File.separator + item.getPath(); File outputFile = FileUtil.file(filePath); // 创建父目录 File parentDir = outputFile.getParentFile(); if (!parentDir.exists() && !parentDir.mkdirs()) { throw new SevenZipException("无法创建目录: " + parentDir.getAbsolutePath()); } // 提取文件内容 try (FileOutputStream fos = new FileOutputStream(outputFile)) { ExtractOperationResult result = item.extractSlow(data -> { try { fos.write(data); return data.length; // 返回写入的字节数 } catch (IOException e) { throw new SevenZipException("写入文件失败", e); } }); if (result != ExtractOperationResult.OK) { throw new SevenZipException("解压失败: " + result); } } catch (IOException e) { throw new SevenZipException("文件操作失败", e); } return outputFile.getAbsolutePath(); } // 检测压缩包编码 private static String detectEncoding(String zipPath) { for (String encoding : CANDIDATE_ENCODINGS) { try (BufferedInputStream fis = new BufferedInputStream(new FileInputStream(zipPath)); ArchiveInputStream ais = new ArchiveStreamFactory() .createArchiveInputStream(ArchiveStreamFactory.detect(fis), fis, encoding)) { ArchiveEntry entry = ais.getNextEntry(); if (entry == null) continue; // 空压缩包 String fileName = entry.getName(); if (!isLikelyGarbled(fileName, encoding)) { return encoding; // 找到有效编码 } } catch (Exception e) { // 编码不支持或文件错误,继续尝试下一个 } } return null; } public static boolean isLikelyGarbled(String fileName, String encoding) { // 基础检查:非法字符、替换符、连续问号 if (ILLEGAL_CHARS_PATTERN.matcher(fileName).find() || fileName.contains("�") || fileName.matches(".*\\?{2,}.*")) { return true; } // 编码一致性检查(假设系统使用 UTF-8) if (!isEncodingConsistent(fileName, Charset.forName(encoding))) { return true; } // 中文字符乱码检测 return hasLowMeaningfulness(fileName) || isLikelyGarbledWithRareChars(fileName); } private static boolean isEncodingConsistent(String text, Charset expectedCharset) { // 将文本按预期编码转换为字节,再解码验证一致性 byte[] bytes = text.getBytes(expectedCharset); String redecoded = new String(bytes, expectedCharset); return text.equals(redecoded); } public static boolean isLikelyGarbledWithRareChars(String text) { int totalChars = text.length(); if (totalChars == 0) return false; int commonCount = 0; int rareCJKCount = 0; for (char c : text.toCharArray()) { // 判断是否属于任何CJK区块 boolean isCJK = isInUnicodeBlocks(c, CJK_BLOCKS); // 判断是否属于高频汉字区 boolean isCommon = isInUnicodeBlocks(c, COMMON_CHINESE_BLOCKS); if (isCJK && !isCommon) { rareCJKCount++; // 统计生僻CJK字符 } else if (isCommon) { commonCount++; } } // 规则1:生僻CJK占比超过50%且总CJK字符占比高 boolean rule1 = (rareCJKCount > 0) && (rareCJKCount * 2 > totalChars) && ((commonCount + rareCJKCount) * 1.0 / totalChars > 0.7); // 规则2:高频字占比极低(<20%)但CJK总占比高(编码错误特征) boolean rule2 = (commonCount * 1.0 / totalChars < 0.2) && ((commonCount + rareCJKCount) * 1.0 / totalChars > 0.6); return rule1 || rule2; } // 辅助方法:判断字符是否在指定Unicode区块内 private static boolean isInUnicodeBlocks(char c, int[][] blocks) { for (int[] block : blocks) { if ((int) c >= block[0] && (int) c <= block[1]) { return true; } } return false; } // 上下文合理性检测 private static boolean hasLowMeaningfulness(String text) { // 假设正常文本应包含常见停用词(的、是、在等) List commonWords = ListUtil.list(false); commonWords.add("基金"); commonWords.addAll(ListUtil.toList(ReportType.ANNUALLY.getPatterns())); commonWords.addAll(ListUtil.toList(ReportType.QUARTERLY.getPatterns())); commonWords.addAll(ListUtil.toList(ReportType.MONTHLY.getPatterns())); commonWords.addAll(ListUtil.toList(ReportType.LETTER.getPatterns())); commonWords.addAll(ListUtil.toList(ReportType.WEEKLY.getPatterns())); commonWords.addAll(ListUtil.toList(ReportType.OTHER.getPatterns())); for (String word : commonWords) { if (text.contains(word)) return false; } return true; } /** * 递归解压 ZIP 文件(含嵌套深度限制) * * @param zipFile 输入的 ZIP 文件 * @param outputDir 解压目标根目录 * @param maxDepth 最大嵌套深度(例如 3 表示允许 parent/nest1/nest2.zip) * @return 所有解压后的文件路径(格式:parent.zip/nest1/file.txt) */ public static List decompressZip(String zipFile, String outputDir, int maxDepth, String encoding) throws IOException { return decompressZip(FileUtil.file(zipFile), FileUtil.file(outputDir), maxDepth, encoding); } /** * 递归解压 ZIP 文件(含嵌套深度限制) * * @param zipFile 输入的 ZIP 文件 * @param outputDir 解压目标根目录 * @param maxDepth 最大嵌套深度(例如 3 表示允许 parent.zip/nest1.zip/nest2.zip) * @return 所有解压后的文件路径(格式:parent.zip/nest1.zip/file.txt) */ public static List decompressZip(File zipFile, File outputDir, int maxDepth, String encoding) throws IOException { if (maxDepth < 0) { throw new IllegalArgumentException("最大嵌套深度不能小于 0"); } List decompressedFiles = ListUtil.list(false); decompressZipRecursive(zipFile, outputDir, "", 0, maxDepth, encoding, decompressedFiles); return decompressedFiles; } /** * 递归解压核心逻辑 */ private static void decompressZipRecursive( File currentZip, File rootOutputDir, String nestedPath, int currentDepth, int maxDepth, String encoding, List decompressedFiles) throws IOException { // 1. 超过最大深度时停止处理嵌套 ZIP if (currentDepth > maxDepth) { return; } // 2. 创建当前 ZIP 的解压目录(跟压缩包目录已经处理过,就不要追加到文件目录中了) String currentZipName = FileUtil.mainName(currentZip); String currentNestedPath = nestedPath.isEmpty() ? "" : nestedPath + File.separator + currentZipName; File currentOutputDir = new File(rootOutputDir, currentNestedPath); FileUtil.mkdir(currentOutputDir); // 3. 解压当前 ZIP,支持最多10个分卷的解压 try (ZipFile zip = ZipFile.builder().setFile(currentZip).setCharset(encoding).setMaxNumberOfDisks(10).get()) { Enumeration entries = zip.getEntries(); while (entries.hasMoreElements()) { ZipArchiveEntry entry = entries.nextElement(); String name = entry.getName(); if (name.startsWith("__MACOSX/")) { continue; } Path entryPath = Paths.get(currentOutputDir.getAbsolutePath(), name); // 处理目录 if (entry.isDirectory()) { Files.createDirectories(entryPath); continue; } // 写入文件 Files.createDirectories(entryPath.getParent()); try (InputStream is = zip.getInputStream(entry); OutputStream os = new FileOutputStream(entryPath.toFile())) { IOUtils.copy(is, os); } // 4. 递归处理嵌套 ZIP(深度+1) if (isZip(name) && currentDepth < maxDepth) { File nestedZipFile = entryPath.toFile(); decompressZipRecursive( nestedZipFile, rootOutputDir, currentNestedPath, currentDepth + 1, // 深度递增 maxDepth, encoding, decompressedFiles ); Files.delete(nestedZipFile.toPath()); } else { // 记录路径 decompressedFiles.add(entryPath.toString()); } } } } public static void main(String[] args) throws Exception { String zipFilePath = "D:\\Documents\\新报告解析\\基协报告\\排排网代销-宏锡5月报告(公司及协会版).7z"; String destFilePath = "D:\\Documents\\新报告解析\\基协报告\\rar"; List strings = extractRar5(zipFilePath, destFilePath); for (String string : strings) { System.out.println(string); } // List fileList = extractCompressedFiles(zipFilePath, destFilePath); // for (String s : fileList) { // System.out.println(s); // } String currentZip = "D:\\Documents\\新报告解析\\基协报告\\排排网代销-宏锡5月报告(公司及协会版).7z"; List files = decompressZip(currentZip, "D:\\Documents\\新报告解析\\基协报告\\zip\\", 2, "utf-8"); System.out.println("解压后的文件路径:"); files.forEach(System.out::println); } }