|
@@ -1,50 +1,62 @@
|
|
|
-package com.simuwang.daq.utils;
|
|
|
-
|
|
|
-import cn.hutool.core.map.MapUtil;
|
|
|
-import cn.hutool.core.util.StrUtil;
|
|
|
-import cn.hutool.http.HttpUtil;
|
|
|
-import cn.hutool.json.JSONObject;
|
|
|
-import cn.hutool.json.JSONUtil;
|
|
|
-import com.simuwang.base.pojo.dto.report.PythonResult;
|
|
|
-import com.simuwang.daq.components.PythonReportConverter;
|
|
|
-import org.apache.pdfbox.cos.COSName;
|
|
|
-import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
-import org.apache.pdfbox.pdmodel.PDResources;
|
|
|
-import org.apache.pdfbox.pdmodel.common.PDStream;
|
|
|
-import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
|
|
-import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
|
|
-
|
|
|
-import java.io.IOException;
|
|
|
-import java.util.*;
|
|
|
-import java.util.regex.Matcher;
|
|
|
-import java.util.regex.Pattern;
|
|
|
-
|
|
|
-public class ReportParseUtil {
|
|
|
- public static void main(String[] args) throws IOException {
|
|
|
- String fileName = "SJM970_排排精选进取一号私募证券投资基金_2022年第4季度报告.pdf";
|
|
|
- Pattern pattern = Pattern.compile("S(?:[A-Z]{0}[0-9]{5}|[A-Z][0-9]{4}|[A-Z]{2}[0-9]{3}|[A-Z]{3}[0-9]{2})");
|
|
|
- Matcher matcher = pattern.matcher(fileName);
|
|
|
- String registerNumber = null;
|
|
|
- if (matcher.find()) {
|
|
|
- registerNumber = matcher.group();
|
|
|
- }
|
|
|
-
|
|
|
- int type = 1;
|
|
|
- String baseUrl = "http://192.168.0.81:8088";
|
|
|
- String api = "/api/v1/parse/amac_report";
|
|
|
- Map<String, Object> params = MapUtil.newHashMap(16);
|
|
|
- params.put("file_id", 111112);
|
|
|
- params.put("file_path", "E:/workproject/fastparse/src/fastparse/static/reports/quarterly_report/13445.pdf");
|
|
|
- params.put("register_number", registerNumber);
|
|
|
- params.put("file_type", type);
|
|
|
- params.put("file_name", fileName);
|
|
|
- params.put("fund_name", null);
|
|
|
- params.put("trust_name", null);
|
|
|
- String body = HttpUtil.post(baseUrl + api, JSONUtil.toJsonStr(params));
|
|
|
- JSONObject obj = JSONUtil.parseObj(body);
|
|
|
- PythonResult<?> result = PythonReportConverter.convert(obj, type);
|
|
|
- System.out.println(result);
|
|
|
-
|
|
|
+//package com.simuwang.daq.utils;
|
|
|
+//
|
|
|
+//import cn.hutool.core.collection.ListUtil;
|
|
|
+//import cn.hutool.core.map.MapUtil;
|
|
|
+//import cn.hutool.core.util.ReflectUtil;
|
|
|
+//import cn.hutool.core.util.StrUtil;
|
|
|
+//import cn.hutool.http.HttpUtil;
|
|
|
+//import cn.hutool.json.JSONObject;
|
|
|
+//import cn.hutool.json.JSONUtil;
|
|
|
+//import com.simuwang.base.pojo.dto.report.PythonResult;
|
|
|
+//import com.simuwang.daq.components.CustomPDFTextStripper;
|
|
|
+//import com.simuwang.daq.components.PythonReportConverter;
|
|
|
+//import com.simuwang.daq.dto.ReportFundInfo;
|
|
|
+//import com.smppw.common.pojo.ValueLabelVO;
|
|
|
+//import org.apache.pdfbox.Loader;
|
|
|
+//import org.apache.pdfbox.cos.COSName;
|
|
|
+//import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
|
|
+//import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
+//import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
+//import org.apache.pdfbox.pdmodel.PDResources;
|
|
|
+//import org.apache.pdfbox.pdmodel.common.PDStream;
|
|
|
+//import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
|
|
+//import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
|
|
+//import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
+//import technology.tabula.*;
|
|
|
+//import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
|
|
+//
|
|
|
+//import java.io.IOException;
|
|
|
+//import java.util.*;
|
|
|
+//import java.util.regex.Matcher;
|
|
|
+//import java.util.regex.Pattern;
|
|
|
+//import java.util.stream.Collectors;
|
|
|
+//
|
|
|
+//public class ReportParseUtil {
|
|
|
+// public static void main(String[] args) throws IOException {
|
|
|
+//// String fileName = "SJM970_排排精选进取一号私募证券投资基金_2022年第4季度报告.pdf";
|
|
|
+//// Pattern pattern = Pattern.compile("S(?:[A-Z]{0}[0-9]{5}|[A-Z][0-9]{4}|[A-Z]{2}[0-9]{3}|[A-Z]{3}[0-9]{2})");
|
|
|
+//// Matcher matcher = pattern.matcher(fileName);
|
|
|
+//// String registerNumber = null;
|
|
|
+//// if (matcher.find()) {
|
|
|
+//// registerNumber = matcher.group();
|
|
|
+//// }
|
|
|
+////
|
|
|
+//// int type = 1;
|
|
|
+//// String baseUrl = "http://192.168.0.81:8088";
|
|
|
+//// String api = "/api/v1/parse/amac_report";
|
|
|
+//// Map<String, Object> params = MapUtil.newHashMap(16);
|
|
|
+//// params.put("file_id", 111112);
|
|
|
+//// params.put("file_path", "E:/workproject/fastparse/src/fastparse/static/reports/quarterly_report/13445.pdf");
|
|
|
+//// params.put("register_number", registerNumber);
|
|
|
+//// params.put("file_type", type);
|
|
|
+//// params.put("file_name", fileName);
|
|
|
+//// params.put("fund_name", null);
|
|
|
+//// params.put("trust_name", null);
|
|
|
+//// String body = HttpUtil.post(baseUrl + api, JSONUtil.toJsonStr(params));
|
|
|
+//// JSONObject obj = JSONUtil.parseObj(body);
|
|
|
+//// PythonResult<?> result = PythonReportConverter.convert(obj, type);
|
|
|
+//// System.out.println(result);
|
|
|
+//
|
|
|
// List<ValueLabelVO> fieldMapper = ListUtil.list(false);
|
|
|
// fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
|
|
|
// fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
|
|
@@ -137,220 +149,220 @@ public class ReportParseUtil {
|
|
|
// }
|
|
|
// }
|
|
|
// }
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 找图片水印
|
|
|
- *
|
|
|
- * @param page
|
|
|
- * @return
|
|
|
- * @throws IOException
|
|
|
- */
|
|
|
- public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
|
|
|
- Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
|
|
|
- PDResources resources = page.getResources();
|
|
|
- Iterable<COSName> xObjectNames = resources.getXObjectNames();
|
|
|
- for (COSName xObjectName : xObjectNames) {
|
|
|
- PDXObject xObject = resources.getXObject(xObjectName);
|
|
|
- PDStream stream = xObject.getStream();
|
|
|
- PDImageXObject imageXObject = null;
|
|
|
- try {
|
|
|
- imageXObject = new PDImageXObject(stream, resources);
|
|
|
- } catch (Exception e) {
|
|
|
- e.printStackTrace();
|
|
|
- }
|
|
|
- if (imageXObject != null) {
|
|
|
- watermarkMap.put(xObjectName, imageXObject);
|
|
|
- }
|
|
|
- }
|
|
|
- return watermarkMap;
|
|
|
- }
|
|
|
-
|
|
|
- private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
|
|
|
- Map<String, List<String>> result = MapUtil.newHashMap(32);
|
|
|
- // 生成水印列表
|
|
|
-
|
|
|
- fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
|
|
|
- trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
|
|
|
- registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
|
|
|
- String text = fundName + trustName + registerNumber;
|
|
|
- text = text.replaceAll("[()]", ""); // 移除括号
|
|
|
- List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
|
|
|
- Collections.reverse(textList);
|
|
|
- StringBuilder sb = new StringBuilder(textList.size());
|
|
|
- for (String ch : textList) {
|
|
|
- sb.append(ch);
|
|
|
- }
|
|
|
- String joinedText = sb.toString();
|
|
|
-
|
|
|
- // 基本水印列表
|
|
|
- List<String> wkList = new ArrayList<>();
|
|
|
- for (String ch : textList) {
|
|
|
- wkList.add(ch + "\r\n");
|
|
|
- wkList.add("\r\n" + ch);
|
|
|
- }
|
|
|
-
|
|
|
- // 查找数字
|
|
|
- List<String> matches = findDigits(fundName);
|
|
|
- if (!matches.isEmpty()) {
|
|
|
- for (String match : matches) {
|
|
|
- wkList.add("\r\n" + match);
|
|
|
- wkList.add(match + "\r\n");
|
|
|
- }
|
|
|
- }
|
|
|
- wkList.add("-");
|
|
|
- wkList.add("【");
|
|
|
- wkList.add("】");
|
|
|
- wkList.add("\r");
|
|
|
- wkList.add("\r\n");
|
|
|
-
|
|
|
- String noNumberText = removeDigits(joinedText);
|
|
|
-
|
|
|
- // 生成不同字段的水印列表
|
|
|
- result.put("report_name", new ArrayList<>(wkList));
|
|
|
- result.get("report_name").addAll(convertStringToList("有限公司"));
|
|
|
-
|
|
|
- result.put("less", new ArrayList<>(wkList));
|
|
|
-
|
|
|
- result.put("more", new ArrayList<>(wkList));
|
|
|
- result.get("more").addAll(convertStringToList(noNumberText));
|
|
|
-
|
|
|
- result.put("leverage", new ArrayList<>(wkList));
|
|
|
- result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
|
|
|
-
|
|
|
- result.put("base_info", new ArrayList<>(wkList));
|
|
|
- result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
|
|
|
-
|
|
|
- result.put("industry", new ArrayList<>(wkList));
|
|
|
- result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
|
|
|
-
|
|
|
- result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
|
|
|
- return result;
|
|
|
- }
|
|
|
-
|
|
|
- private static List<String> findDigits(String text) {
|
|
|
- List<String> digits = new ArrayList<>();
|
|
|
- Pattern pattern = Pattern.compile("\\d");
|
|
|
- Matcher matcher = pattern.matcher(text);
|
|
|
- while (matcher.find()) {
|
|
|
- digits.add(matcher.group());
|
|
|
- }
|
|
|
- return digits;
|
|
|
- }
|
|
|
-
|
|
|
- private static String removeDigits(String text) {
|
|
|
- return text.replaceAll("\\d", "");
|
|
|
- }
|
|
|
-
|
|
|
- private static String removeKeywords(String text, String... keywords) {
|
|
|
- for (String keyword : keywords) {
|
|
|
- text = text.replaceAll(keyword, "");
|
|
|
- }
|
|
|
- return text;
|
|
|
- }
|
|
|
-
|
|
|
- private static List<String> convertStringToList(String text) {
|
|
|
- List<String> charList = new ArrayList<>();
|
|
|
- for (char c : text.toCharArray()) {
|
|
|
- charList.add(c + "");
|
|
|
- }
|
|
|
- return charList;
|
|
|
- }
|
|
|
-
|
|
|
- public static String processString(List<String> wmList, String string) {
|
|
|
- // 生成正则表达式模式
|
|
|
- String pat = String.join("|", wmList);
|
|
|
- // 使用正则表达式移除wmList中的元素
|
|
|
- string = removeMatches(string, pat);
|
|
|
- // 替换中文括号为英文括号
|
|
|
- string = string.replace("(", "(").replace(")", ")");
|
|
|
- // 移除空格
|
|
|
- string = string.replace(" ", "");
|
|
|
- // 如果字符串以括号开头,则移除第一个字符
|
|
|
- if (startsWithParenthesis(string)) {
|
|
|
- string = string.substring(1);
|
|
|
- }
|
|
|
-
|
|
|
- return string;
|
|
|
- }
|
|
|
-
|
|
|
- private static String removeMatches(String input, String pattern) {
|
|
|
- // 编译正则表达式
|
|
|
- Pattern compiledPattern = Pattern.compile(pattern);
|
|
|
- // 创建Matcher对象
|
|
|
- Matcher matcher = compiledPattern.matcher(input);
|
|
|
- // 使用replaceAll方法替换所有匹配到的字符为空字符串
|
|
|
- return matcher.replaceAll("");
|
|
|
- }
|
|
|
-
|
|
|
- private static boolean startsWithParenthesis(String input) {
|
|
|
- // 匹配以括号开头的字符串
|
|
|
- Pattern pattern = Pattern.compile("^[()].*");
|
|
|
- Matcher matcher = pattern.matcher(input);
|
|
|
- return matcher.find();
|
|
|
- }
|
|
|
-
|
|
|
-// public static void removeTextWatermark(PDPage page) throws IOException {
|
|
|
+// }
|
|
|
+//
|
|
|
+// /**
|
|
|
+// * 找图片水印
|
|
|
+// *
|
|
|
+// * @param page
|
|
|
+// * @return
|
|
|
+// * @throws IOException
|
|
|
+// */
|
|
|
+// public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
|
|
|
+// Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
|
|
|
// PDResources resources = page.getResources();
|
|
|
-//// if (StrUtil.isAllBlank(fundName, trustName)) {
|
|
|
-//// return;
|
|
|
-//// }
|
|
|
-// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
|
-// stripper.setSortByPosition(true);
|
|
|
-// stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
|
|
|
-// stripper.extractRegions(page);
|
|
|
+// Iterable<COSName> xObjectNames = resources.getXObjectNames();
|
|
|
+// for (COSName xObjectName : xObjectNames) {
|
|
|
+// PDXObject xObject = resources.getXObject(xObjectName);
|
|
|
+// PDStream stream = xObject.getStream();
|
|
|
+// PDImageXObject imageXObject = null;
|
|
|
+// try {
|
|
|
+// imageXObject = new PDImageXObject(stream, resources);
|
|
|
+// } catch (Exception e) {
|
|
|
+// e.printStackTrace();
|
|
|
+// }
|
|
|
+// if (imageXObject != null) {
|
|
|
+// watermarkMap.put(xObjectName, imageXObject);
|
|
|
+// }
|
|
|
+// }
|
|
|
+// return watermarkMap;
|
|
|
+// }
|
|
|
//
|
|
|
-// PDFStreamEngine engine = new PDFTextStripper();
|
|
|
-// engine.addOperator(new SetMatrix(stripper));
|
|
|
+// private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
|
|
|
+// Map<String, List<String>> result = MapUtil.newHashMap(32);
|
|
|
+// // 生成水印列表
|
|
|
//
|
|
|
-// }
|
|
|
+// fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
|
|
|
+// trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
|
|
|
+// registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
|
|
|
+// String text = fundName + trustName + registerNumber;
|
|
|
+// text = text.replaceAll("[()]", ""); // 移除括号
|
|
|
+// List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
|
|
|
+// Collections.reverse(textList);
|
|
|
+// StringBuilder sb = new StringBuilder(textList.size());
|
|
|
+// for (String ch : textList) {
|
|
|
+// sb.append(ch);
|
|
|
+// }
|
|
|
+// String joinedText = sb.toString();
|
|
|
//
|
|
|
-// private static void processResources(PDResources resources) throws IOException {
|
|
|
-// for (COSName name : resources.getXObjectNames()) {
|
|
|
-// PDXObject xobject = resources.getXObject(name);
|
|
|
-// if (xobject instanceof PDFormXObject) {
|
|
|
-// PDFormXObject formXObject = (PDFormXObject) xobject;
|
|
|
-// writeTokensToStream(formXObject.getContentStream(),
|
|
|
-// createTokensWithoutText(formXObject));
|
|
|
-// processResources(formXObject.getResources());
|
|
|
-// }
|
|
|
+// // 基本水印列表
|
|
|
+// List<String> wkList = new ArrayList<>();
|
|
|
+// for (String ch : textList) {
|
|
|
+// wkList.add(ch + "\r\n");
|
|
|
+// wkList.add("\r\n" + ch);
|
|
|
// }
|
|
|
-// for (COSName name : resources.getPatternNames()) {
|
|
|
-// PDAbstractPattern pattern = resources.getPattern(name);
|
|
|
-// if (pattern instanceof PDTilingPattern) {
|
|
|
-// PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
|
|
|
-// writeTokensToStream(tilingPattern.getContentStream(),
|
|
|
-// createTokensWithoutText(tilingPattern));
|
|
|
-// processResources(tilingPattern.getResources());
|
|
|
+//
|
|
|
+// // 查找数字
|
|
|
+// List<String> matches = findDigits(fundName);
|
|
|
+// if (!matches.isEmpty()) {
|
|
|
+// for (String match : matches) {
|
|
|
+// wkList.add("\r\n" + match);
|
|
|
+// wkList.add(match + "\r\n");
|
|
|
// }
|
|
|
// }
|
|
|
+// wkList.add("-");
|
|
|
+// wkList.add("【");
|
|
|
+// wkList.add("】");
|
|
|
+// wkList.add("\r");
|
|
|
+// wkList.add("\r\n");
|
|
|
+//
|
|
|
+// String noNumberText = removeDigits(joinedText);
|
|
|
+//
|
|
|
+// // 生成不同字段的水印列表
|
|
|
+// result.put("report_name", new ArrayList<>(wkList));
|
|
|
+// result.get("report_name").addAll(convertStringToList("有限公司"));
|
|
|
+//
|
|
|
+// result.put("less", new ArrayList<>(wkList));
|
|
|
+//
|
|
|
+// result.put("more", new ArrayList<>(wkList));
|
|
|
+// result.get("more").addAll(convertStringToList(noNumberText));
|
|
|
+//
|
|
|
+// result.put("leverage", new ArrayList<>(wkList));
|
|
|
+// result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
|
|
|
+//
|
|
|
+// result.put("base_info", new ArrayList<>(wkList));
|
|
|
+// result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
|
|
|
+//
|
|
|
+// result.put("industry", new ArrayList<>(wkList));
|
|
|
+// result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
|
|
|
+//
|
|
|
+// result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
|
|
|
+// return result;
|
|
|
// }
|
|
|
//
|
|
|
-// private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
|
|
|
-// try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
|
|
|
-// ContentStreamWriter writer = new ContentStreamWriter(out);
|
|
|
-// writer.writeTokens(newTokens);
|
|
|
+// private static List<String> findDigits(String text) {
|
|
|
+// List<String> digits = new ArrayList<>();
|
|
|
+// Pattern pattern = Pattern.compile("\\d");
|
|
|
+// Matcher matcher = pattern.matcher(text);
|
|
|
+// while (matcher.find()) {
|
|
|
+// digits.add(matcher.group());
|
|
|
// }
|
|
|
+// return digits;
|
|
|
// }
|
|
|
//
|
|
|
-// private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
|
|
|
-// PDFStreamParser parser = new PDFStreamParser(contentStream);
|
|
|
-// Object token = parser.parseNextToken();
|
|
|
-// List<Object> newTokens = new ArrayList<>();
|
|
|
-// while (token != null) {
|
|
|
-// if (token instanceof Operator op) {
|
|
|
-// String opName = op.getName();
|
|
|
-// if (OperatorName.SET_MATRIX.equals(opName)) {
|
|
|
-// // remove the argument to this operator
|
|
|
-// newTokens.remove(newTokens.size() - 1);
|
|
|
+// private static String removeDigits(String text) {
|
|
|
+// return text.replaceAll("\\d", "");
|
|
|
+// }
|
|
|
//
|
|
|
-// token = parser.parseNextToken();
|
|
|
-// continue;
|
|
|
-// }
|
|
|
-// }
|
|
|
-// newTokens.add(token);
|
|
|
-// token = parser.parseNextToken();
|
|
|
+// private static String removeKeywords(String text, String... keywords) {
|
|
|
+// for (String keyword : keywords) {
|
|
|
+// text = text.replaceAll(keyword, "");
|
|
|
+// }
|
|
|
+// return text;
|
|
|
+// }
|
|
|
+//
|
|
|
+// private static List<String> convertStringToList(String text) {
|
|
|
+// List<String> charList = new ArrayList<>();
|
|
|
+// for (char c : text.toCharArray()) {
|
|
|
+// charList.add(c + "");
|
|
|
+// }
|
|
|
+// return charList;
|
|
|
+// }
|
|
|
+//
|
|
|
+// public static String processString(List<String> wmList, String string) {
|
|
|
+// // 生成正则表达式模式
|
|
|
+// String pat = String.join("|", wmList);
|
|
|
+// // 使用正则表达式移除wmList中的元素
|
|
|
+// string = removeMatches(string, pat);
|
|
|
+// // 替换中文括号为英文括号
|
|
|
+// string = string.replace("(", "(").replace(")", ")");
|
|
|
+// // 移除空格
|
|
|
+// string = string.replace(" ", "");
|
|
|
+// // 如果字符串以括号开头,则移除第一个字符
|
|
|
+// if (startsWithParenthesis(string)) {
|
|
|
+// string = string.substring(1);
|
|
|
// }
|
|
|
-// return newTokens;
|
|
|
+//
|
|
|
+// return string;
|
|
|
+// }
|
|
|
+//
|
|
|
+// private static String removeMatches(String input, String pattern) {
|
|
|
+// // 编译正则表达式
|
|
|
+// Pattern compiledPattern = Pattern.compile(pattern);
|
|
|
+// // 创建Matcher对象
|
|
|
+// Matcher matcher = compiledPattern.matcher(input);
|
|
|
+// // 使用replaceAll方法替换所有匹配到的字符为空字符串
|
|
|
+// return matcher.replaceAll("");
|
|
|
// }
|
|
|
-}
|
|
|
+//
|
|
|
+// private static boolean startsWithParenthesis(String input) {
|
|
|
+// // 匹配以括号开头的字符串
|
|
|
+// Pattern pattern = Pattern.compile("^[()].*");
|
|
|
+// Matcher matcher = pattern.matcher(input);
|
|
|
+// return matcher.find();
|
|
|
+// }
|
|
|
+//
|
|
|
+//// public static void removeTextWatermark(PDPage page) throws IOException {
|
|
|
+//// PDResources resources = page.getResources();
|
|
|
+////// if (StrUtil.isAllBlank(fundName, trustName)) {
|
|
|
+////// return;
|
|
|
+////// }
|
|
|
+//// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
|
+//// stripper.setSortByPosition(true);
|
|
|
+//// stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
|
|
|
+//// stripper.extractRegions(page);
|
|
|
+////
|
|
|
+//// PDFStreamEngine engine = new PDFTextStripper();
|
|
|
+//// engine.addOperator(new SetMatrix(stripper));
|
|
|
+////
|
|
|
+//// }
|
|
|
+////
|
|
|
+//// private static void processResources(PDResources resources) throws IOException {
|
|
|
+//// for (COSName name : resources.getXObjectNames()) {
|
|
|
+//// PDXObject xobject = resources.getXObject(name);
|
|
|
+//// if (xobject instanceof PDFormXObject) {
|
|
|
+//// PDFormXObject formXObject = (PDFormXObject) xobject;
|
|
|
+//// writeTokensToStream(formXObject.getContentStream(),
|
|
|
+//// createTokensWithoutText(formXObject));
|
|
|
+//// processResources(formXObject.getResources());
|
|
|
+//// }
|
|
|
+//// }
|
|
|
+//// for (COSName name : resources.getPatternNames()) {
|
|
|
+//// PDAbstractPattern pattern = resources.getPattern(name);
|
|
|
+//// if (pattern instanceof PDTilingPattern) {
|
|
|
+//// PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
|
|
|
+//// writeTokensToStream(tilingPattern.getContentStream(),
|
|
|
+//// createTokensWithoutText(tilingPattern));
|
|
|
+//// processResources(tilingPattern.getResources());
|
|
|
+//// }
|
|
|
+//// }
|
|
|
+//// }
|
|
|
+////
|
|
|
+//// private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
|
|
|
+//// try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
|
|
|
+//// ContentStreamWriter writer = new ContentStreamWriter(out);
|
|
|
+//// writer.writeTokens(newTokens);
|
|
|
+//// }
|
|
|
+//// }
|
|
|
+////
|
|
|
+//// private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
|
|
|
+//// PDFStreamParser parser = new PDFStreamParser(contentStream);
|
|
|
+//// Object token = parser.parseNextToken();
|
|
|
+//// List<Object> newTokens = new ArrayList<>();
|
|
|
+//// while (token != null) {
|
|
|
+//// if (token instanceof Operator op) {
|
|
|
+//// String opName = op.getName();
|
|
|
+//// if (OperatorName.SET_MATRIX.equals(opName)) {
|
|
|
+//// // remove the argument to this operator
|
|
|
+//// newTokens.remove(newTokens.size() - 1);
|
|
|
+////
|
|
|
+//// token = parser.parseNextToken();
|
|
|
+//// continue;
|
|
|
+//// }
|
|
|
+//// }
|
|
|
+//// newTokens.add(token);
|
|
|
+//// token = parser.parseNextToken();
|
|
|
+//// }
|
|
|
+//// return newTokens;
|
|
|
+//// }
|
|
|
+//}
|