123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373 |
- package com.smppw.modaq.infrastructure.util;
- import cn.hutool.core.collection.ListUtil;
- import cn.hutool.core.io.FileUtil;
- import cn.hutool.core.util.StrUtil;
- import com.smppw.modaq.common.conts.Constants;
- import com.smppw.modaq.common.enums.ReportType;
- import net.sf.sevenzipjbinding.*;
- import net.sf.sevenzipjbinding.impl.RandomAccessFileInStream;
- import net.sf.sevenzipjbinding.simple.ISimpleInArchive;
- import net.sf.sevenzipjbinding.simple.ISimpleInArchiveItem;
- import org.apache.commons.compress.archivers.ArchiveEntry;
- import org.apache.commons.compress.archivers.ArchiveInputStream;
- import org.apache.commons.compress.archivers.ArchiveStreamFactory;
- import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
- import org.apache.commons.compress.archivers.zip.ZipFile;
- import org.apache.commons.io.IOUtils;
- import java.io.*;
- import java.nio.charset.Charset;
- import java.nio.file.Files;
- import java.nio.file.Path;
- import java.nio.file.Paths;
- import java.util.Arrays;
- import java.util.Enumeration;
- import java.util.List;
- import java.util.regex.Pattern;
- public class ArchiveUtil {
- // 候选编码列表(按常见顺序排列)
- private static final List<String> CANDIDATE_ENCODINGS = Arrays.asList(
- "GBK", // 中文环境常用
- "UTF-8", // 标准编码
- "GB2312", // 旧版中文
- "ISO-8859-1" // 默认回退
- );
- private static final Pattern ILLEGAL_CHARS_PATTERN =
- Pattern.compile("[\\\\:*?\"<>|\\x00-\\x1F]"); // 包含控制字符
- // Unicode 中文字符区块定义
- private static final int[][] CJK_BLOCKS = {
- {0x4E00, 0x9FFF}, // CJK Unified Ideographs (基本汉字)
- {0x3400, 0x4DBF}, // CJK Extension A (生僻字)
- {0x20000, 0x2A6DF}, // CJK Extension B (极生僻字)
- {0x2A700, 0x2B73F}, // CJK Extension C
- {0x3000, 0x303F} // CJK标点符号
- };
- // 常见高频汉字范围(覆盖约99%日常用字)
- private static final int[][] COMMON_CHINESE_BLOCKS = {
- {0x4E00, 0x9FA5}, // 通用规范汉字表(8105字)
- {0x3000, 0x303F} // 常用标点
- };
- public static boolean isArchive(String fileName) {
- return isZip(fileName) || is7z(fileName) || isRAR(fileName);
- }
- public static boolean isZip(String fileName) {
- return StrUtil.isNotBlank(fileName) && StrUtil.endWithIgnoreCase(fileName, Constants.ARCHIVE_ZIP);
- }
- public static boolean is7z(String fileName) {
- return StrUtil.isNotBlank(fileName) && StrUtil.endWithIgnoreCase(fileName, Constants.ARCHIVE_7Z);
- }
- public static boolean isRAR(String fileName) {
- return StrUtil.isNotBlank(fileName) && StrUtil.endWithIgnoreCase(fileName, Constants.ARCHIVE_RAR);
- }
- public static List<String> extractCompressedFiles(String zipFilePath, String destFilePath) throws IOException {
- File destFile = FileUtil.file(destFilePath);
- if (!destFile.exists()) {
- Files.createDirectories(destFile.toPath());
- }
- String encoding = detectEncoding(zipFilePath);
- if (encoding == null) {
- encoding = "GBK";
- }
- return decompressZip(zipFilePath, destFilePath, 2, encoding);
- }
- public static List<String> extractRar5(String rarFilePath, String outputDir) throws IOException {
- try {
- // 初始化 SevenZipJBinding 本地库
- SevenZip.initSevenZipFromPlatformJAR();
- } catch (SevenZipNativeInitializationException ignored) {
- }
- RandomAccessFile randomAccessFile = null;
- IInArchive inArchive = null;
- List<String> resultList = ListUtil.list(false);
- try {
- // 打开 RAR 文件
- randomAccessFile = new RandomAccessFile(rarFilePath, "r");
- inArchive = SevenZip.openInArchive(null, new RandomAccessFileInStream(randomAccessFile));
- // 获取压缩包中的文件列表
- ISimpleInArchive simpleInArchive = inArchive.getSimpleInterface();
- for (ISimpleInArchiveItem item : simpleInArchive.getArchiveItems()) {
- if (!item.isFolder()) {
- resultList.add(extractItem(item, outputDir));
- }
- }
- } finally {
- // 释放资源
- if (inArchive != null) {
- inArchive.close();
- }
- if (randomAccessFile != null) {
- randomAccessFile.close();
- }
- }
- return resultList;
- }
- private static String extractItem(ISimpleInArchiveItem item, String outputDir) throws SevenZipException {
- String filePath = outputDir + File.separator + item.getPath();
- File outputFile = FileUtil.file(filePath);
- // 创建父目录
- File parentDir = outputFile.getParentFile();
- if (!parentDir.exists() && !parentDir.mkdirs()) {
- throw new SevenZipException("无法创建目录: " + parentDir.getAbsolutePath());
- }
- // 提取文件内容
- try (FileOutputStream fos = new FileOutputStream(outputFile)) {
- ExtractOperationResult result = item.extractSlow(data -> {
- try {
- fos.write(data);
- return data.length; // 返回写入的字节数
- } catch (IOException e) {
- throw new SevenZipException("写入文件失败", e);
- }
- });
- if (result != ExtractOperationResult.OK) {
- throw new SevenZipException("解压失败: " + result);
- }
- } catch (IOException e) {
- throw new SevenZipException("文件操作失败", e);
- }
- return outputFile.getAbsolutePath();
- }
- // 检测压缩包编码
- private static String detectEncoding(String zipPath) {
- for (String encoding : CANDIDATE_ENCODINGS) {
- try (BufferedInputStream fis = new BufferedInputStream(new FileInputStream(zipPath));
- ArchiveInputStream<? extends ArchiveEntry> ais = new ArchiveStreamFactory()
- .createArchiveInputStream(ArchiveStreamFactory.detect(fis), fis, encoding)) {
- ArchiveEntry entry = ais.getNextEntry();
- if (entry == null) continue; // 空压缩包
- String fileName = entry.getName();
- if (!isLikelyGarbled(fileName, encoding)) {
- return encoding; // 找到有效编码
- }
- } catch (Exception e) {
- // 编码不支持或文件错误,继续尝试下一个
- }
- }
- return null;
- }
- public static boolean isLikelyGarbled(String fileName, String encoding) {
- // 基础检查:非法字符、替换符、连续问号
- if (ILLEGAL_CHARS_PATTERN.matcher(fileName).find() ||
- fileName.contains("�") ||
- fileName.matches(".*\\?{2,}.*")) {
- return true;
- }
- // 编码一致性检查(假设系统使用 UTF-8)
- if (!isEncodingConsistent(fileName, Charset.forName(encoding))) {
- return true;
- }
- // 中文字符乱码检测
- return hasLowMeaningfulness(fileName) || isLikelyGarbledWithRareChars(fileName);
- }
- private static boolean isEncodingConsistent(String text, Charset expectedCharset) {
- // 将文本按预期编码转换为字节,再解码验证一致性
- byte[] bytes = text.getBytes(expectedCharset);
- String redecoded = new String(bytes, expectedCharset);
- return text.equals(redecoded);
- }
- public static boolean isLikelyGarbledWithRareChars(String text) {
- int totalChars = text.length();
- if (totalChars == 0) return false;
- int commonCount = 0;
- int rareCJKCount = 0;
- for (char c : text.toCharArray()) {
- // 判断是否属于任何CJK区块
- boolean isCJK = isInUnicodeBlocks(c, CJK_BLOCKS);
- // 判断是否属于高频汉字区
- boolean isCommon = isInUnicodeBlocks(c, COMMON_CHINESE_BLOCKS);
- if (isCJK && !isCommon) {
- rareCJKCount++; // 统计生僻CJK字符
- } else if (isCommon) {
- commonCount++;
- }
- }
- // 规则1:生僻CJK占比超过50%且总CJK字符占比高
- boolean rule1 = (rareCJKCount > 0) &&
- (rareCJKCount * 2 > totalChars) &&
- ((commonCount + rareCJKCount) * 1.0 / totalChars > 0.7);
- // 规则2:高频字占比极低(<20%)但CJK总占比高(编码错误特征)
- boolean rule2 = (commonCount * 1.0 / totalChars < 0.2) &&
- ((commonCount + rareCJKCount) * 1.0 / totalChars > 0.6);
- return rule1 || rule2;
- }
- // 辅助方法:判断字符是否在指定Unicode区块内
- private static boolean isInUnicodeBlocks(char c, int[][] blocks) {
- for (int[] block : blocks) {
- if ((int) c >= block[0] && (int) c <= block[1]) {
- return true;
- }
- }
- return false;
- }
- // 上下文合理性检测
- private static boolean hasLowMeaningfulness(String text) {
- // 假设正常文本应包含常见停用词(的、是、在等)
- List<String> commonWords = ListUtil.list(false);
- commonWords.add("基金");
- commonWords.addAll(ListUtil.toList(ReportType.ANNUALLY.getPatterns()));
- commonWords.addAll(ListUtil.toList(ReportType.QUARTERLY.getPatterns()));
- commonWords.addAll(ListUtil.toList(ReportType.MONTHLY.getPatterns()));
- commonWords.addAll(ListUtil.toList(ReportType.LETTER.getPatterns()));
- commonWords.addAll(ListUtil.toList(ReportType.WEEKLY.getPatterns()));
- commonWords.addAll(ListUtil.toList(ReportType.OTHER.getPatterns()));
- for (String word : commonWords) {
- if (text.contains(word)) return false;
- }
- return true;
- }
- /**
- * 递归解压 ZIP 文件(含嵌套深度限制)
- *
- * @param zipFile 输入的 ZIP 文件
- * @param outputDir 解压目标根目录
- * @param maxDepth 最大嵌套深度(例如 3 表示允许 parent/nest1/nest2.zip)
- * @return 所有解压后的文件路径(格式:parent.zip/nest1/file.txt)
- */
- public static List<String> decompressZip(String zipFile, String outputDir, int maxDepth, String encoding) throws IOException {
- return decompressZip(FileUtil.file(zipFile), FileUtil.file(outputDir), maxDepth, encoding);
- }
- /**
- * 递归解压 ZIP 文件(含嵌套深度限制)
- *
- * @param zipFile 输入的 ZIP 文件
- * @param outputDir 解压目标根目录
- * @param maxDepth 最大嵌套深度(例如 3 表示允许 parent.zip/nest1.zip/nest2.zip)
- * @return 所有解压后的文件路径(格式:parent.zip/nest1.zip/file.txt)
- */
- public static List<String> decompressZip(File zipFile, File outputDir, int maxDepth, String encoding) throws IOException {
- if (maxDepth < 0) {
- throw new IllegalArgumentException("最大嵌套深度不能小于 0");
- }
- List<String> decompressedFiles = ListUtil.list(false);
- decompressZipRecursive(zipFile, outputDir, "", 0, maxDepth, encoding, decompressedFiles);
- return decompressedFiles;
- }
- /**
- * 递归解压核心逻辑
- */
- private static void decompressZipRecursive(
- File currentZip,
- File rootOutputDir,
- String nestedPath,
- int currentDepth,
- int maxDepth,
- String encoding,
- List<String> decompressedFiles) throws IOException {
- // 1. 超过最大深度时停止处理嵌套 ZIP
- if (currentDepth > maxDepth) {
- return;
- }
- // 2. 创建当前 ZIP 的解压目录(跟压缩包目录已经处理过,就不要追加到文件目录中了)
- String currentZipName = FileUtil.mainName(currentZip);
- String currentNestedPath = nestedPath.isEmpty()
- ? ""
- : nestedPath + File.separator + currentZipName;
- File currentOutputDir = new File(rootOutputDir, currentNestedPath);
- FileUtil.mkdir(currentOutputDir);
- // 3. 解压当前 ZIP,支持最多10个分卷的解压
- try (ZipFile zip = ZipFile.builder().setFile(currentZip).setCharset(encoding).setMaxNumberOfDisks(10).get()) {
- Enumeration<ZipArchiveEntry> entries = zip.getEntries();
- while (entries.hasMoreElements()) {
- ZipArchiveEntry entry = entries.nextElement();
- String name = entry.getName();
- if (name.startsWith("__MACOSX/")) {
- continue;
- }
- Path entryPath = Paths.get(currentOutputDir.getAbsolutePath(), name);
- // 处理目录
- if (entry.isDirectory()) {
- Files.createDirectories(entryPath);
- continue;
- }
- // 写入文件
- Files.createDirectories(entryPath.getParent());
- try (InputStream is = zip.getInputStream(entry);
- OutputStream os = new FileOutputStream(entryPath.toFile())) {
- IOUtils.copy(is, os);
- }
- // 4. 递归处理嵌套 ZIP(深度+1)
- if (isZip(name) && currentDepth < maxDepth) {
- File nestedZipFile = entryPath.toFile();
- decompressZipRecursive(
- nestedZipFile,
- rootOutputDir,
- currentNestedPath,
- currentDepth + 1, // 深度递增
- maxDepth,
- encoding,
- decompressedFiles
- );
- Files.delete(nestedZipFile.toPath());
- } else {
- // 记录路径
- decompressedFiles.add(entryPath.toString());
- }
- }
- }
- }
- public static void main(String[] args) throws Exception {
- String zipFilePath = "D:\\Documents\\新报告解析\\基协报告\\排排网代销-宏锡5月报告(公司及协会版).7z";
- String destFilePath = "D:\\Documents\\新报告解析\\基协报告\\rar";
- List<String> strings = extractRar5(zipFilePath, destFilePath);
- for (String string : strings) {
- System.out.println(string);
- }
- // List<String> fileList = extractCompressedFiles(zipFilePath, destFilePath);
- // for (String s : fileList) {
- // System.out.println(s);
- // }
- String currentZip = "D:\\Documents\\新报告解析\\基协报告\\排排网代销-宏锡5月报告(公司及协会版).7z";
- List<String> files = decompressZip(currentZip, "D:\\Documents\\新报告解析\\基协报告\\zip\\", 2, "utf-8");
- System.out.println("解压后的文件路径:");
- files.forEach(System.out::println);
- }
- }
|