|
@@ -4,6 +4,7 @@ import cn.hutool.core.collection.CollUtil;
|
|
|
import cn.hutool.core.collection.ListUtil;
|
|
|
import cn.hutool.core.io.FileUtil;
|
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
+import com.smppw.modaq.common.enums.ReportType;
|
|
|
import net.sf.sevenzipjbinding.ExtractOperationResult;
|
|
|
import net.sf.sevenzipjbinding.IInArchive;
|
|
|
import net.sf.sevenzipjbinding.SevenZip;
|
|
@@ -20,12 +21,14 @@ import org.apache.commons.compress.archivers.zip.ZipFile;
|
|
|
import org.apache.commons.io.IOUtils;
|
|
|
|
|
|
import java.io.*;
|
|
|
+import java.nio.charset.Charset;
|
|
|
import java.nio.file.Files;
|
|
|
import java.nio.file.Path;
|
|
|
import java.nio.file.Paths;
|
|
|
import java.util.Arrays;
|
|
|
import java.util.Enumeration;
|
|
|
import java.util.List;
|
|
|
+import java.util.regex.Pattern;
|
|
|
|
|
|
public class ExcelUtil {
|
|
|
// 候选编码列表(按常见顺序排列)
|
|
@@ -36,6 +39,24 @@ public class ExcelUtil {
|
|
|
"ISO-8859-1" // 默认回退
|
|
|
);
|
|
|
|
|
|
+ private static final Pattern ILLEGAL_CHARS_PATTERN =
|
|
|
+ Pattern.compile("[\\\\:*?\"<>|\\x00-\\x1F]"); // matches backslash, colon, asterisk, question mark, double quote, angle brackets, pipe, and control characters 0x00-0x1F
|
|
|
+
|
|
|
+ // Inclusive Unicode code point ranges that are treated as CJK text
|
|
|
+ private static final int[][] CJK_BLOCKS = {
|
|
|
+ {0x4E00, 0x9FFF}, // CJK Unified Ideographs (basic Han characters)
|
|
|
+ {0x3400, 0x4DBF}, // CJK Extension A (rare characters)
|
|
|
+ {0x20000, 0x2A6DF}, // CJK Extension B (very rare characters); supplementary plane, above the 16-bit char range
|
|
|
+ {0x2A700, 0x2B73F}, // CJK Extension C; supplementary plane, above the 16-bit char range
|
|
|
+ {0x3000, 0x303F} // CJK Symbols and Punctuation
|
|
|
+ };
|
|
|
+
|
|
|
+ // Ranges treated as commonly used Chinese characters (original note: covers roughly 99% of everyday usage; not verified here)
|
|
|
+ private static final int[][] COMMON_CHINESE_BLOCKS = {
|
|
|
+ {0x4E00, 0x9FA5}, // original note: Table of General Standard Chinese Characters (8105 chars). NOTE(review): this range spans 20,902 code points; confirm the intent
|
|
|
+ {0x3000, 0x303F} // common punctuation (the CJK Symbols and Punctuation range)
|
|
|
+ };
|
|
|
+
|
|
|
public static boolean isExcel(String fileName) {
|
|
|
return StrUtil.isNotBlank(fileName) && (fileName.endsWith("xls") || fileName.endsWith("xlsx") || fileName.endsWith("XLS") || fileName.endsWith("XLSX"));
|
|
|
}
|
|
@@ -106,8 +127,124 @@ public class ExcelUtil {
|
|
|
}
|
|
|
|
|
|
return filePathList;
|
|
|
+// return extractCompressedFiles(zipFilePath, destFilePath, 0, new HashSet<>());
|
|
|
}
|
|
|
|
|
|
+// /**
|
|
|
+// * 递归解压压缩文件(支持嵌套)
|
|
|
+// *
|
|
|
+// * @param zipFilePath 压缩文件路径
|
|
|
+// * @param destDir 目标目录
|
|
|
+// * @param depth 当前递归深度(防止无限递归)
|
|
|
+// * @param processed 已处理的文件哈希(防止重复解压)
|
|
|
+// */
|
|
|
+// private static List<String> extractCompressedFiles(String zipFilePath, String destDir, int depth, Set<String> processed) throws IOException, ArchiveException {
|
|
|
+// // 防御:限制递归深度防止栈溢出
|
|
|
+// if (depth > 4) {
|
|
|
+// throw new ArchiveException("Maximum recursion depth (4) exceeded");
|
|
|
+// }
|
|
|
+//
|
|
|
+// String encoding = detectEncoding(zipFilePath); // 编码检测
|
|
|
+// if (encoding == null) {
|
|
|
+// encoding = "GBK";
|
|
|
+// }
|
|
|
+// List<String> extractedFiles = ListUtil.list(false);
|
|
|
+// File destDirFile = FileUtil.mkdir(destDir);
|
|
|
+//
|
|
|
+// try (InputStream fis = new BufferedInputStream(Files.newInputStream(Paths.get(zipFilePath)));
|
|
|
+// ArchiveInputStream<?> ais = createArchiveInputStream(fis, encoding)) {
|
|
|
+// ArchiveEntry entry;
|
|
|
+// while ((entry = ais.getNextEntry()) != null) {
|
|
|
+// String entryName = sanitizeEntryName(entry.getName()); // 清理非法字符
|
|
|
+// if (entryName.startsWith("__MACOSX/")) {
|
|
|
+// continue;
|
|
|
+// }
|
|
|
+// String zipFilename = FileUtil.getName(destDir);
|
|
|
+// if (zipFilename.contains("确认") && !entryName.contains("确认")) {
|
|
|
+// String ext = FileUtil.extName(entryName);
|
|
|
+// entryName = StrUtil.subBefore(entryName, ".", true);
|
|
|
+// entryName = entryName + "_确认单." + ext;
|
|
|
+// }
|
|
|
+// File entryFile = new File(destDirFile, entryName);
|
|
|
+// if (entry.isDirectory()) {
|
|
|
+// Files.createDirectories(entryFile.toPath());
|
|
|
+// } else {
|
|
|
+// extractSingleFile(ais, entryFile);
|
|
|
+// if (isArchiveFile(entryFile)) { // 判断是否为压缩文件
|
|
|
+// extractedFiles.addAll(processNestedArchive(entryFile, destDir, depth, processed));
|
|
|
+// } else {
|
|
|
+// extractedFiles.add(entryFile.getAbsolutePath());
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+// } catch (Exception e) {
|
|
|
+// handleArchiveException(e, zipFilePath, destDir, extractedFiles, encoding);
|
|
|
+// }
|
|
|
+// return extractedFiles;
|
|
|
+// }
|
|
|
+//
|
|
|
+// // ----------- 辅助方法 -----------
|
|
|
+// private static ArchiveInputStream<?> createArchiveInputStream(InputStream is, String encoding) throws ArchiveException {
|
|
|
+// return new ArchiveStreamFactory(encoding)
|
|
|
+// .createArchiveInputStream(ArchiveStreamFactory.detect(is), is);
|
|
|
+// }
|
|
|
+//
|
|
|
+// private static void extractSingleFile(ArchiveInputStream<?> ais, File targetFile) throws IOException {
|
|
|
+// try (OutputStream fos = Files.newOutputStream(targetFile.toPath())) {
|
|
|
+// IOUtils.copy(ais, fos);
|
|
|
+// } finally {
|
|
|
+// if (ais != null) {
|
|
|
+// ais.close();
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+// private static List<String> processNestedArchive(File archiveFile, String parentDir, int depth, Set<String> processed) throws IOException, ArchiveException {
|
|
|
+// // 强制关闭所有可能关联的流(防御性编程)
|
|
|
+// System.gc(); // 触发垃圾回收释放潜在未关闭的流
|
|
|
+//
|
|
|
+// String fileHash = DigestUtil.md5Hex(new FileInputStream(archiveFile)); // 文件哈希校验(需实现)
|
|
|
+// if (processed.contains(fileHash)) {
|
|
|
+// return ListUtil.empty(); // 避免重复解压相同文件
|
|
|
+// }
|
|
|
+// processed.add(fileHash);
|
|
|
+//
|
|
|
+// String nestedDestDir = parentDir + File.separator + FileUtil.mainName(archiveFile.getName());
|
|
|
+// List<String> nestedFiles = extractCompressedFiles(
|
|
|
+// archiveFile.getAbsolutePath(),
|
|
|
+// nestedDestDir,
|
|
|
+// depth + 1,
|
|
|
+// processed
|
|
|
+// );
|
|
|
+//
|
|
|
+// Files.delete(archiveFile.toPath()); // 删除原压缩包
|
|
|
+// return nestedFiles;
|
|
|
+// }
|
|
|
+//
|
|
|
+// private static boolean isArchiveFile(File file) {
|
|
|
+// try (InputStream is = Files.newInputStream(file.toPath());
|
|
|
+// BufferedInputStream bis = new BufferedInputStream(is)) {
|
|
|
+// ArchiveStreamFactory.detect(bis); // 通过文件头检测
|
|
|
+// return true;
|
|
|
+// } catch (Exception e) {
|
|
|
+// return false;
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+// // 清理非法路径字符(防御路径穿越攻击)
|
|
|
+// private static String sanitizeEntryName(String name) {
|
|
|
+// return name.replaceAll("[:\\\\/*\"?|<>]", "_");
|
|
|
+// }
|
|
|
+//
|
|
|
+// // 统一异常处理
|
|
|
+// private static void handleArchiveException(Exception e, String zipFile, String destDir, List<String> extractedFiles, String encoding) throws ArchiveException, IOException {
|
|
|
+// if (e.getMessage().contains("split")) {
|
|
|
+// extractedFiles.addAll(extractSplitZip(zipFile, destDir, encoding));
|
|
|
+// } else {
|
|
|
+// throw new ArchiveException("Failed to extract " + zipFile, e);
|
|
|
+// }
|
|
|
+// }
|
|
|
+
|
|
|
public static List<String> extractSplitZip(String zipFilePath, String destFilePath, String encoding) throws IOException {
|
|
|
List<String> resultList = ListUtil.list(false);
|
|
|
File file = new File(zipFilePath);
|
|
@@ -200,7 +337,7 @@ public class ExcelUtil {
|
|
|
if (entry == null) continue; // 空压缩包
|
|
|
|
|
|
String fileName = entry.getName();
|
|
|
- if (!hasInvalidCharacters(fileName)) {
|
|
|
+ if (!isLikelyGarbled(fileName, encoding)) {
|
|
|
return encoding; // 找到有效编码
|
|
|
}
|
|
|
} catch (Exception e) {
|
|
@@ -210,10 +347,86 @@ public class ExcelUtil {
|
|
|
return null;
|
|
|
}
|
|
|
|
|
|
- // 检查文件名是否包含无效字符(如替换符)
|
|
|
- private static boolean hasInvalidCharacters(String fileName) {
|
|
|
- // 检查常见乱码符号:�或连续问号
|
|
|
- return fileName.contains("�") || fileName.matches(".*\\?{2,}.*");
|
|
|
+ public static boolean isLikelyGarbled(String fileName, String encoding) {
|
|
|
+ // 基础检查:非法字符、替换符、连续问号
|
|
|
+ if (ILLEGAL_CHARS_PATTERN.matcher(fileName).find() ||
|
|
|
+ fileName.contains("�") ||
|
|
|
+ fileName.matches(".*\\?{2,}.*")) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 编码一致性检查(假设系统使用 UTF-8)
|
|
|
+ if (!isEncodingConsistent(fileName, Charset.forName(encoding))) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 中文字符乱码检测
|
|
|
+ return hasLowMeaningfulness(fileName) || isLikelyGarbledWithRareChars(fileName);
|
|
|
+ }
|
|
|
+
|
|
|
+ private static boolean isEncodingConsistent(String text, Charset expectedCharset) {
|
|
|
+ // 将文本按预期编码转换为字节,再解码验证一致性
|
|
|
+ byte[] bytes = text.getBytes(expectedCharset);
|
|
|
+ String redecoded = new String(bytes, expectedCharset);
|
|
|
+ return text.equals(redecoded);
|
|
|
+ }
|
|
|
+
|
|
|
+ public static boolean isLikelyGarbledWithRareChars(String text) {
|
|
|
+ int totalChars = text.length();
|
|
|
+ if (totalChars == 0) return false;
|
|
|
+
|
|
|
+ int commonCount = 0;
|
|
|
+ int rareCJKCount = 0;
|
|
|
+
|
|
|
+ for (char c : text.toCharArray()) {
|
|
|
+ // 判断是否属于任何CJK区块
|
|
|
+ boolean isCJK = isInUnicodeBlocks(c, CJK_BLOCKS);
|
|
|
+ // 判断是否属于高频汉字区
|
|
|
+ boolean isCommon = isInUnicodeBlocks(c, COMMON_CHINESE_BLOCKS);
|
|
|
+
|
|
|
+ if (isCJK && !isCommon) {
|
|
|
+ rareCJKCount++; // 统计生僻CJK字符
|
|
|
+ } else if (isCommon) {
|
|
|
+ commonCount++;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 规则1:生僻CJK占比超过50%且总CJK字符占比高
|
|
|
+ boolean rule1 = (rareCJKCount > 0) &&
|
|
|
+ (rareCJKCount * 2 > totalChars) &&
|
|
|
+ ((commonCount + rareCJKCount) * 1.0 / totalChars > 0.7);
|
|
|
+
|
|
|
+ // 规则2:高频字占比极低(<20%)但CJK总占比高(编码错误特征)
|
|
|
+ boolean rule2 = (commonCount * 1.0 / totalChars < 0.2) &&
|
|
|
+ ((commonCount + rareCJKCount) * 1.0 / totalChars > 0.6);
|
|
|
+
|
|
|
+ return rule1 || rule2;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 辅助方法:判断字符是否在指定Unicode区块内
|
|
|
+ private static boolean isInUnicodeBlocks(char c, int[][] blocks) {
|
|
|
+ for (int[] block : blocks) {
|
|
|
+ if ((int) c >= block[0] && (int) c <= block[1]) {
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 上下文合理性检测
|
|
|
+ private static boolean hasLowMeaningfulness(String text) {
|
|
|
+ // 假设正常文本应包含常见停用词(的、是、在等)
|
|
|
+ List<String> commonWords = ListUtil.list(false);
|
|
|
+ commonWords.addAll(ListUtil.toList(ReportType.ANNUALLY.getPatterns()));
|
|
|
+ commonWords.addAll(ListUtil.toList(ReportType.QUARTERLY.getPatterns()));
|
|
|
+ commonWords.addAll(ListUtil.toList(ReportType.MONTHLY.getPatterns()));
|
|
|
+ commonWords.addAll(ListUtil.toList(ReportType.LETTER.getPatterns()));
|
|
|
+ commonWords.addAll(ListUtil.toList(ReportType.WEEKLY.getPatterns()));
|
|
|
+ commonWords.addAll(ListUtil.toList(ReportType.OTHER.getPatterns()));
|
|
|
+ for (String word : commonWords) {
|
|
|
+ if (text.contains(word)) return false;
|
|
|
+ }
|
|
|
+ return true;
|
|
|
}
|
|
|
|
|
|
public static void main(String[] args) throws Exception {
|