ArchiveUtil.java 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
  1. package com.smppw.modaq.infrastructure.util;
  2. import cn.hutool.core.collection.ListUtil;
  3. import cn.hutool.core.io.FileUtil;
  4. import cn.hutool.core.util.StrUtil;
  5. import com.smppw.modaq.common.conts.Constants;
  6. import com.smppw.modaq.common.enums.ReportType;
  7. import net.sf.sevenzipjbinding.*;
  8. import net.sf.sevenzipjbinding.impl.RandomAccessFileInStream;
  9. import net.sf.sevenzipjbinding.simple.ISimpleInArchive;
  10. import net.sf.sevenzipjbinding.simple.ISimpleInArchiveItem;
  11. import org.apache.commons.compress.archivers.ArchiveEntry;
  12. import org.apache.commons.compress.archivers.ArchiveInputStream;
  13. import org.apache.commons.compress.archivers.ArchiveStreamFactory;
  14. import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
  15. import org.apache.commons.compress.archivers.zip.ZipFile;
  16. import org.apache.commons.io.IOUtils;
  17. import java.io.*;
  18. import java.nio.charset.Charset;
  19. import java.nio.file.Files;
  20. import java.nio.file.Path;
  21. import java.nio.file.Paths;
  22. import java.util.Arrays;
  23. import java.util.Enumeration;
  24. import java.util.List;
  25. import java.util.regex.Pattern;
  26. public class ArchiveUtil {
  27. // 候选编码列表(按常见顺序排列)
  28. private static final List<String> CANDIDATE_ENCODINGS = Arrays.asList(
  29. "GBK", // 中文环境常用
  30. "UTF-8", // 标准编码
  31. "GB2312", // 旧版中文
  32. "ISO-8859-1" // 默认回退
  33. );
  34. private static final Pattern ILLEGAL_CHARS_PATTERN =
  35. Pattern.compile("[\\\\:*?\"<>|\\x00-\\x1F]"); // 包含控制字符
  36. // Unicode 中文字符区块定义
  37. private static final int[][] CJK_BLOCKS = {
  38. {0x4E00, 0x9FFF}, // CJK Unified Ideographs (基本汉字)
  39. {0x3400, 0x4DBF}, // CJK Extension A (生僻字)
  40. {0x20000, 0x2A6DF}, // CJK Extension B (极生僻字)
  41. {0x2A700, 0x2B73F}, // CJK Extension C
  42. {0x3000, 0x303F} // CJK标点符号
  43. };
  44. // 常见高频汉字范围(覆盖约99%日常用字)
  45. private static final int[][] COMMON_CHINESE_BLOCKS = {
  46. {0x4E00, 0x9FA5}, // 通用规范汉字表(8105字)
  47. {0x3000, 0x303F} // 常用标点
  48. };
  49. public static boolean isArchive(String fileName) {
  50. return isZip(fileName) || is7z(fileName) || isRAR(fileName);
  51. }
  52. public static boolean isZip(String fileName) {
  53. return StrUtil.isNotBlank(fileName) && StrUtil.endWithIgnoreCase(fileName, Constants.ARCHIVE_ZIP);
  54. }
  55. public static boolean is7z(String fileName) {
  56. return StrUtil.isNotBlank(fileName) && StrUtil.endWithIgnoreCase(fileName, Constants.ARCHIVE_7Z);
  57. }
  58. public static boolean isRAR(String fileName) {
  59. return StrUtil.isNotBlank(fileName) && StrUtil.endWithIgnoreCase(fileName, Constants.ARCHIVE_RAR);
  60. }
  61. public static List<String> extractCompressedFiles(String zipFilePath, String destFilePath) throws IOException {
  62. File destFile = FileUtil.file(destFilePath);
  63. if (!destFile.exists()) {
  64. Files.createDirectories(destFile.toPath());
  65. }
  66. String encoding = detectEncoding(zipFilePath);
  67. if (encoding == null) {
  68. encoding = "GBK";
  69. }
  70. return decompressZip(zipFilePath, destFilePath, 2, encoding);
  71. }
  72. public static List<String> extractRar5(String rarFilePath, String outputDir) throws IOException {
  73. try {
  74. // 初始化 SevenZipJBinding 本地库
  75. SevenZip.initSevenZipFromPlatformJAR();
  76. } catch (SevenZipNativeInitializationException ignored) {
  77. }
  78. RandomAccessFile randomAccessFile = null;
  79. IInArchive inArchive = null;
  80. List<String> resultList = ListUtil.list(false);
  81. try {
  82. // 打开 RAR 文件
  83. randomAccessFile = new RandomAccessFile(rarFilePath, "r");
  84. inArchive = SevenZip.openInArchive(null, new RandomAccessFileInStream(randomAccessFile));
  85. // 获取压缩包中的文件列表
  86. ISimpleInArchive simpleInArchive = inArchive.getSimpleInterface();
  87. for (ISimpleInArchiveItem item : simpleInArchive.getArchiveItems()) {
  88. if (!item.isFolder()) {
  89. resultList.add(extractItem(item, outputDir));
  90. }
  91. }
  92. } finally {
  93. // 释放资源
  94. if (inArchive != null) {
  95. inArchive.close();
  96. }
  97. if (randomAccessFile != null) {
  98. randomAccessFile.close();
  99. }
  100. }
  101. return resultList;
  102. }
  103. private static String extractItem(ISimpleInArchiveItem item, String outputDir) throws SevenZipException {
  104. String filePath = outputDir + File.separator + item.getPath();
  105. File outputFile = FileUtil.file(filePath);
  106. // 创建父目录
  107. File parentDir = outputFile.getParentFile();
  108. if (!parentDir.exists() && !parentDir.mkdirs()) {
  109. throw new SevenZipException("无法创建目录: " + parentDir.getAbsolutePath());
  110. }
  111. // 提取文件内容
  112. try (FileOutputStream fos = new FileOutputStream(outputFile)) {
  113. ExtractOperationResult result = item.extractSlow(data -> {
  114. try {
  115. fos.write(data);
  116. return data.length; // 返回写入的字节数
  117. } catch (IOException e) {
  118. throw new SevenZipException("写入文件失败", e);
  119. }
  120. });
  121. if (result != ExtractOperationResult.OK) {
  122. throw new SevenZipException("解压失败: " + result);
  123. }
  124. } catch (IOException e) {
  125. throw new SevenZipException("文件操作失败", e);
  126. }
  127. return outputFile.getAbsolutePath();
  128. }
  129. // 检测压缩包编码
  130. private static String detectEncoding(String zipPath) {
  131. for (String encoding : CANDIDATE_ENCODINGS) {
  132. try (BufferedInputStream fis = new BufferedInputStream(new FileInputStream(zipPath));
  133. ArchiveInputStream<? extends ArchiveEntry> ais = new ArchiveStreamFactory()
  134. .createArchiveInputStream(ArchiveStreamFactory.detect(fis), fis, encoding)) {
  135. ArchiveEntry entry = ais.getNextEntry();
  136. if (entry == null) continue; // 空压缩包
  137. String fileName = entry.getName();
  138. if (!isLikelyGarbled(fileName, encoding)) {
  139. return encoding; // 找到有效编码
  140. }
  141. } catch (Exception e) {
  142. // 编码不支持或文件错误,继续尝试下一个
  143. }
  144. }
  145. return null;
  146. }
  147. public static boolean isLikelyGarbled(String fileName, String encoding) {
  148. // 基础检查:非法字符、替换符、连续问号
  149. if (ILLEGAL_CHARS_PATTERN.matcher(fileName).find() ||
  150. fileName.contains("�") ||
  151. fileName.matches(".*\\?{2,}.*")) {
  152. return true;
  153. }
  154. // 编码一致性检查(假设系统使用 UTF-8)
  155. if (!isEncodingConsistent(fileName, Charset.forName(encoding))) {
  156. return true;
  157. }
  158. // 中文字符乱码检测
  159. return hasLowMeaningfulness(fileName) || isLikelyGarbledWithRareChars(fileName);
  160. }
  161. private static boolean isEncodingConsistent(String text, Charset expectedCharset) {
  162. // 将文本按预期编码转换为字节,再解码验证一致性
  163. byte[] bytes = text.getBytes(expectedCharset);
  164. String redecoded = new String(bytes, expectedCharset);
  165. return text.equals(redecoded);
  166. }
  167. public static boolean isLikelyGarbledWithRareChars(String text) {
  168. int totalChars = text.length();
  169. if (totalChars == 0) return false;
  170. int commonCount = 0;
  171. int rareCJKCount = 0;
  172. for (char c : text.toCharArray()) {
  173. // 判断是否属于任何CJK区块
  174. boolean isCJK = isInUnicodeBlocks(c, CJK_BLOCKS);
  175. // 判断是否属于高频汉字区
  176. boolean isCommon = isInUnicodeBlocks(c, COMMON_CHINESE_BLOCKS);
  177. if (isCJK && !isCommon) {
  178. rareCJKCount++; // 统计生僻CJK字符
  179. } else if (isCommon) {
  180. commonCount++;
  181. }
  182. }
  183. // 规则1:生僻CJK占比超过50%且总CJK字符占比高
  184. boolean rule1 = (rareCJKCount > 0) &&
  185. (rareCJKCount * 2 > totalChars) &&
  186. ((commonCount + rareCJKCount) * 1.0 / totalChars > 0.7);
  187. // 规则2:高频字占比极低(<20%)但CJK总占比高(编码错误特征)
  188. boolean rule2 = (commonCount * 1.0 / totalChars < 0.2) &&
  189. ((commonCount + rareCJKCount) * 1.0 / totalChars > 0.6);
  190. return rule1 || rule2;
  191. }
  192. // 辅助方法:判断字符是否在指定Unicode区块内
  193. private static boolean isInUnicodeBlocks(char c, int[][] blocks) {
  194. for (int[] block : blocks) {
  195. if ((int) c >= block[0] && (int) c <= block[1]) {
  196. return true;
  197. }
  198. }
  199. return false;
  200. }
  201. // 上下文合理性检测
  202. private static boolean hasLowMeaningfulness(String text) {
  203. // 假设正常文本应包含常见停用词(的、是、在等)
  204. List<String> commonWords = ListUtil.list(false);
  205. commonWords.add("基金");
  206. commonWords.addAll(ListUtil.toList(ReportType.ANNUALLY.getPatterns()));
  207. commonWords.addAll(ListUtil.toList(ReportType.QUARTERLY.getPatterns()));
  208. commonWords.addAll(ListUtil.toList(ReportType.MONTHLY.getPatterns()));
  209. commonWords.addAll(ListUtil.toList(ReportType.LETTER.getPatterns()));
  210. commonWords.addAll(ListUtil.toList(ReportType.WEEKLY.getPatterns()));
  211. commonWords.addAll(ListUtil.toList(ReportType.OTHER.getPatterns()));
  212. for (String word : commonWords) {
  213. if (text.contains(word)) return false;
  214. }
  215. return true;
  216. }
  217. /**
  218. * 递归解压 ZIP 文件(含嵌套深度限制)
  219. *
  220. * @param zipFile 输入的 ZIP 文件
  221. * @param outputDir 解压目标根目录
  222. * @param maxDepth 最大嵌套深度(例如 3 表示允许 parent/nest1/nest2.zip)
  223. * @return 所有解压后的文件路径(格式:parent.zip/nest1/file.txt)
  224. */
  225. public static List<String> decompressZip(String zipFile, String outputDir, int maxDepth, String encoding) throws IOException {
  226. return decompressZip(FileUtil.file(zipFile), FileUtil.file(outputDir), maxDepth, encoding);
  227. }
  228. /**
  229. * 递归解压 ZIP 文件(含嵌套深度限制)
  230. *
  231. * @param zipFile 输入的 ZIP 文件
  232. * @param outputDir 解压目标根目录
  233. * @param maxDepth 最大嵌套深度(例如 3 表示允许 parent.zip/nest1.zip/nest2.zip)
  234. * @return 所有解压后的文件路径(格式:parent.zip/nest1.zip/file.txt)
  235. */
  236. public static List<String> decompressZip(File zipFile, File outputDir, int maxDepth, String encoding) throws IOException {
  237. if (maxDepth < 0) {
  238. throw new IllegalArgumentException("最大嵌套深度不能小于 0");
  239. }
  240. List<String> decompressedFiles = ListUtil.list(false);
  241. decompressZipRecursive(zipFile, outputDir, "", 0, maxDepth, encoding, decompressedFiles);
  242. return decompressedFiles;
  243. }
  244. /**
  245. * 递归解压核心逻辑
  246. */
  247. private static void decompressZipRecursive(
  248. File currentZip,
  249. File rootOutputDir,
  250. String nestedPath,
  251. int currentDepth,
  252. int maxDepth,
  253. String encoding,
  254. List<String> decompressedFiles) throws IOException {
  255. // 1. 超过最大深度时停止处理嵌套 ZIP
  256. if (currentDepth > maxDepth) {
  257. return;
  258. }
  259. // 2. 创建当前 ZIP 的解压目录(跟压缩包目录已经处理过,就不要追加到文件目录中了)
  260. String currentZipName = FileUtil.mainName(currentZip);
  261. String currentNestedPath = nestedPath.isEmpty()
  262. ? ""
  263. : nestedPath + File.separator + currentZipName;
  264. File currentOutputDir = new File(rootOutputDir, currentNestedPath);
  265. FileUtil.mkdir(currentOutputDir);
  266. // 3. 解压当前 ZIP,支持最多10个分卷的解压
  267. try (ZipFile zip = ZipFile.builder().setFile(currentZip).setCharset(encoding).setMaxNumberOfDisks(10).get()) {
  268. Enumeration<ZipArchiveEntry> entries = zip.getEntries();
  269. while (entries.hasMoreElements()) {
  270. ZipArchiveEntry entry = entries.nextElement();
  271. String name = entry.getName();
  272. if (name.startsWith("__MACOSX/")) {
  273. continue;
  274. }
  275. Path entryPath = Paths.get(currentOutputDir.getAbsolutePath(), name);
  276. // 处理目录
  277. if (entry.isDirectory()) {
  278. Files.createDirectories(entryPath);
  279. continue;
  280. }
  281. // 写入文件
  282. Files.createDirectories(entryPath.getParent());
  283. try (InputStream is = zip.getInputStream(entry);
  284. OutputStream os = new FileOutputStream(entryPath.toFile())) {
  285. IOUtils.copy(is, os);
  286. }
  287. // 4. 递归处理嵌套 ZIP(深度+1)
  288. if (isZip(name) && currentDepth < maxDepth) {
  289. File nestedZipFile = entryPath.toFile();
  290. decompressZipRecursive(
  291. nestedZipFile,
  292. rootOutputDir,
  293. currentNestedPath,
  294. currentDepth + 1, // 深度递增
  295. maxDepth,
  296. encoding,
  297. decompressedFiles
  298. );
  299. Files.delete(nestedZipFile.toPath());
  300. } else {
  301. // 记录路径
  302. decompressedFiles.add(entryPath.toString());
  303. }
  304. }
  305. }
  306. }
  307. public static void main(String[] args) throws Exception {
  308. String zipFilePath = "D:\\Documents\\新报告解析\\基协报告\\排排网代销-宏锡5月报告(公司及协会版).7z";
  309. String destFilePath = "D:\\Documents\\新报告解析\\基协报告\\rar";
  310. List<String> strings = extractRar5(zipFilePath, destFilePath);
  311. for (String string : strings) {
  312. System.out.println(string);
  313. }
  314. // List<String> fileList = extractCompressedFiles(zipFilePath, destFilePath);
  315. // for (String s : fileList) {
  316. // System.out.println(s);
  317. // }
  318. String currentZip = "D:\\Documents\\新报告解析\\基协报告\\排排网代销-宏锡5月报告(公司及协会版).7z";
  319. List<String> files = decompressZip(currentZip, "D:\\Documents\\新报告解析\\基协报告\\zip\\", 2, "utf-8");
  320. System.out.println("解压后的文件路径:");
  321. files.forEach(System.out::println);
  322. }
  323. }