|
@@ -1,348 +1,348 @@
|
|
|
-package com.simuwang.daq.utils;
|
|
|
-
|
|
|
-import cn.hutool.core.collection.CollUtil;
|
|
|
-import cn.hutool.core.collection.ListUtil;
|
|
|
-import cn.hutool.core.map.MapUtil;
|
|
|
-import cn.hutool.core.util.ReflectUtil;
|
|
|
-import cn.hutool.core.util.StrUtil;
|
|
|
-import com.simuwang.daq.components.CustomPDFTextStripper;
|
|
|
-import com.simuwang.daq.dto.ReportFundInfo;
|
|
|
-import com.smppw.common.pojo.ValueLabelVO;
|
|
|
-import org.apache.pdfbox.Loader;
|
|
|
-import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
|
|
-import org.apache.pdfbox.contentstream.operator.text.ShowText;
|
|
|
-import org.apache.pdfbox.cos.COSName;
|
|
|
-import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
|
|
-import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
-import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
-import org.apache.pdfbox.pdmodel.PDResources;
|
|
|
-import org.apache.pdfbox.pdmodel.common.PDStream;
|
|
|
-import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
|
|
|
-import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
|
|
-import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
|
|
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
|
|
-import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
-import org.apache.pdfbox.text.PDFTextStripperByArea;
|
|
|
-import org.apache.pdfbox.text.TextPosition;
|
|
|
-import org.apache.pdfbox.util.Matrix;
|
|
|
-import technology.tabula.*;
|
|
|
-import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
|
|
-
|
|
|
-import java.awt.geom.Rectangle2D;
|
|
|
-import java.io.IOException;
|
|
|
-import java.util.*;
|
|
|
-import java.util.regex.Matcher;
|
|
|
-import java.util.regex.Pattern;
|
|
|
-import java.util.stream.Collectors;
|
|
|
-
|
|
|
-public class ReportParseUtil {
|
|
|
- public static void main(String[] args) throws IOException {
|
|
|
- List<ValueLabelVO> fieldMapper = ListUtil.list(false);
|
|
|
- fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
|
|
|
- fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
|
|
|
- fieldMapper.add(new ValueLabelVO("operationType", "基金运作方式"));
|
|
|
- fieldMapper.add(new ValueLabelVO("fundType", "基金类别"));
|
|
|
- fieldMapper.add(new ValueLabelVO("inceptionDate", "基金成立日期"));
|
|
|
- fieldMapper.add(new ValueLabelVO("trustName", "基金托管人"));
|
|
|
- fieldMapper.add(new ValueLabelVO("custodianName", "基金管理人"));
|
|
|
- fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
|
|
|
- fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
|
|
|
-
|
|
|
- Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
|
|
|
- List<String> watermarks = watermarkMap.get("less");
|
|
|
-
|
|
|
-// System.out.println(watermarks);
|
|
|
-// try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
|
|
|
- try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf"))) {
|
|
|
-// PDFTextStripper stripper = new PDFTextStripper();
|
|
|
-// stripper.setSortByPosition(true);
|
|
|
-// String allText = stripper.getText(document);
|
|
|
-// List<String> textList = StrUtil.split(allText, "\r\n");
|
|
|
-// System.out.println(textList);
|
|
|
-
|
|
|
- PDFTextStripper textStripper = new CustomPDFTextStripper();
|
|
|
- textStripper.setSortByPosition(true);
|
|
|
- String text1 = textStripper.getText(document);
|
|
|
- text1 = text1.replace("+\r\n", "").replace("+","");
|
|
|
- List<String> textList = StrUtil.split(text1, "\r\n");
|
|
|
- System.out.println(textList.get(0));
|
|
|
-
|
|
|
-// for (PDPage page : document.getPages()) {
|
|
|
+//package com.simuwang.daq.utils;
|
|
|
//
|
|
|
-//// PDResources resources = page.getResources();
|
|
|
-//// Map<COSName, PDImageXObject> imageXObjectMap = findImageWatermark(page);
|
|
|
-//// Iterator<COSName> iterator = resources.getXObjectNames().iterator();
|
|
|
-//// while (iterator.hasNext()) {
|
|
|
-//// COSName next = iterator.next();
|
|
|
-//// if (imageXObjectMap.containsKey(next)) {
|
|
|
-//// iterator.remove();
|
|
|
-//// }
|
|
|
+//import cn.hutool.core.collection.CollUtil;
|
|
|
+//import cn.hutool.core.collection.ListUtil;
|
|
|
+//import cn.hutool.core.map.MapUtil;
|
|
|
+//import cn.hutool.core.util.ReflectUtil;
|
|
|
+//import cn.hutool.core.util.StrUtil;
|
|
|
+//import com.simuwang.daq.components.CustomPDFTextStripper;
|
|
|
+//import com.simuwang.daq.dto.ReportFundInfo;
|
|
|
+//import com.smppw.common.pojo.ValueLabelVO;
|
|
|
+//import org.apache.pdfbox.Loader;
|
|
|
+//import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
|
|
+//import org.apache.pdfbox.contentstream.operator.text.ShowText;
|
|
|
+//import org.apache.pdfbox.cos.COSName;
|
|
|
+//import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
|
|
+//import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
+//import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
+//import org.apache.pdfbox.pdmodel.PDResources;
|
|
|
+//import org.apache.pdfbox.pdmodel.common.PDStream;
|
|
|
+//import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
|
|
|
+//import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
|
|
+//import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
|
|
+//import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
|
|
+//import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
+//import org.apache.pdfbox.text.PDFTextStripperByArea;
|
|
|
+//import org.apache.pdfbox.text.TextPosition;
|
|
|
+//import org.apache.pdfbox.util.Matrix;
|
|
|
+//import technology.tabula.*;
|
|
|
+//import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
|
|
+//
|
|
|
+//import java.awt.geom.Rectangle2D;
|
|
|
+//import java.io.IOException;
|
|
|
+//import java.util.*;
|
|
|
+//import java.util.regex.Matcher;
|
|
|
+//import java.util.regex.Pattern;
|
|
|
+//import java.util.stream.Collectors;
|
|
|
+//
|
|
|
+//public class ReportParseUtil {
|
|
|
+// public static void main(String[] args) throws IOException {
|
|
|
+// List<ValueLabelVO> fieldMapper = ListUtil.list(false);
|
|
|
+// fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
|
|
|
+// fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
|
|
|
+// fieldMapper.add(new ValueLabelVO("operationType", "基金运作方式"));
|
|
|
+// fieldMapper.add(new ValueLabelVO("fundType", "基金类别"));
|
|
|
+// fieldMapper.add(new ValueLabelVO("inceptionDate", "基金成立日期"));
|
|
|
+// fieldMapper.add(new ValueLabelVO("trustName", "基金托管人"));
|
|
|
+// fieldMapper.add(new ValueLabelVO("custodianName", "基金管理人"));
|
|
|
+// fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
|
|
|
+// fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
|
|
|
+//
|
|
|
+// Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
|
|
|
+// List<String> watermarks = watermarkMap.get("less");
|
|
|
+//
|
|
|
+//// System.out.println(watermarks);
|
|
|
+//// try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
|
|
|
+// try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf"))) {
|
|
|
+//// PDFTextStripper stripper = new PDFTextStripper();
|
|
|
+//// stripper.setSortByPosition(true);
|
|
|
+//// String allText = stripper.getText(document);
|
|
|
+//// List<String> textList = StrUtil.split(allText, "\r\n");
|
|
|
+//// System.out.println(textList);
|
|
|
+//
|
|
|
+// PDFTextStripper textStripper = new CustomPDFTextStripper();
|
|
|
+// textStripper.setSortByPosition(true);
|
|
|
+// String text1 = textStripper.getText(document);
|
|
|
+// text1 = text1.replace("+\r\n", "").replace("+","");
|
|
|
+// List<String> textList = StrUtil.split(text1, "\r\n");
|
|
|
+// System.out.println(textList.get(0));
|
|
|
+//
|
|
|
+//// for (PDPage page : document.getPages()) {
|
|
|
+////
|
|
|
+////// PDResources resources = page.getResources();
|
|
|
+////// Map<COSName, PDImageXObject> imageXObjectMap = findImageWatermark(page);
|
|
|
+////// Iterator<COSName> iterator = resources.getXObjectNames().iterator();
|
|
|
+////// while (iterator.hasNext()) {
|
|
|
+////// COSName next = iterator.next();
|
|
|
+////// if (imageXObjectMap.containsKey(next)) {
|
|
|
+////// iterator.remove();
|
|
|
+////// }
|
|
|
+////// }
|
|
|
+////// removeTextWatermark(page);
|
|
|
+////
|
|
|
+//// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
|
+//// stripper.setSortByPosition(true);
|
|
|
+//// stripper.addRegion("page", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
|
|
|
+//// stripper.extractRegions(page);
|
|
|
+//// for (String region : stripper.getRegions()) {
|
|
|
+//// String text = stripper.getTextForRegion(region);
|
|
|
+//// String res = processString(watermarks, text);
|
|
|
+//// System.out.println("原数据:" + text + ", 去除水印后数据:" + res);
|
|
|
//// }
|
|
|
-//// removeTextWatermark(page);
|
|
|
+//// }
|
|
|
+//// document.save(new File("./1.pdf"));
|
|
|
//
|
|
|
-// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
|
-// stripper.setSortByPosition(true);
|
|
|
-// stripper.addRegion("page", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
|
|
|
-// stripper.extractRegions(page);
|
|
|
-// for (String region : stripper.getRegions()) {
|
|
|
-// String text = stripper.getTextForRegion(region);
|
|
|
-// String res = processString(watermarks, text);
|
|
|
-// System.out.println("原数据:" + text + ", 去除水印后数据:" + res);
|
|
|
+// SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
|
|
|
+// PageIterator pageIterator = new ObjectExtractor(document).extract();
|
|
|
+// while (pageIterator.hasNext()) {
|
|
|
+// Page page = pageIterator.next();
|
|
|
+// List<Table> tables = extractionAlgorithm.extract(page);
|
|
|
+// tables = tables.stream().distinct().collect(Collectors.toList());
|
|
|
+// for (Table table : tables) {
|
|
|
+// if (table.getColCount() == 4) {
|
|
|
+// Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
|
|
|
+// for (int i = 0; i < table.getRows().size(); i++) {
|
|
|
+// List<RectangularTextContainer> cols = table.getRows().get(i);
|
|
|
+// for (int j = 0; j < 2; j++) {
|
|
|
+// baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
|
|
|
+// }
|
|
|
+// }
|
|
|
+// ReportFundInfo reportFundInfo = new ReportFundInfo();
|
|
|
+// baseInfoMap.forEach((k, v) -> {
|
|
|
+// for (ValueLabelVO vo : fieldMapper) {
|
|
|
+// String fieldName = vo.getValue();
|
|
|
+// List<String> labels = StrUtil.split(vo.getLabel(), ",");
|
|
|
+// if (labels.contains(k)) {
|
|
|
+// ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
|
|
|
+// break;
|
|
|
+// }
|
|
|
+// for (String label : labels) {
|
|
|
+// if (k.contains(label)) {
|
|
|
+// ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
|
|
|
+// break;
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+// });
|
|
|
+// System.out.println(reportFundInfo);
|
|
|
+// }
|
|
|
// }
|
|
|
// }
|
|
|
-// document.save(new File("./1.pdf"));
|
|
|
-
|
|
|
- SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
|
|
|
- PageIterator pageIterator = new ObjectExtractor(document).extract();
|
|
|
- while (pageIterator.hasNext()) {
|
|
|
- Page page = pageIterator.next();
|
|
|
- List<Table> tables = extractionAlgorithm.extract(page);
|
|
|
- tables = tables.stream().distinct().collect(Collectors.toList());
|
|
|
- for (Table table : tables) {
|
|
|
- if (table.getColCount() == 4) {
|
|
|
- Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
|
|
|
- for (int i = 0; i < table.getRows().size(); i++) {
|
|
|
- List<RectangularTextContainer> cols = table.getRows().get(i);
|
|
|
- for (int j = 0; j < 2; j++) {
|
|
|
- baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
|
|
|
- }
|
|
|
- }
|
|
|
- ReportFundInfo reportFundInfo = new ReportFundInfo();
|
|
|
- baseInfoMap.forEach((k, v) -> {
|
|
|
- for (ValueLabelVO vo : fieldMapper) {
|
|
|
- String fieldName = vo.getValue();
|
|
|
- List<String> labels = StrUtil.split(vo.getLabel(), ",");
|
|
|
- if (labels.contains(k)) {
|
|
|
- ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
|
|
|
- break;
|
|
|
- }
|
|
|
- for (String label : labels) {
|
|
|
- if (k.contains(label)) {
|
|
|
- ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
|
|
|
- break;
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- });
|
|
|
- System.out.println(reportFundInfo);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 找图片水印
|
|
|
- *
|
|
|
- * @param page
|
|
|
- * @return
|
|
|
- * @throws IOException
|
|
|
- */
|
|
|
- public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
|
|
|
- Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
|
|
|
- PDResources resources = page.getResources();
|
|
|
- Iterable<COSName> xObjectNames = resources.getXObjectNames();
|
|
|
- for (COSName xObjectName : xObjectNames) {
|
|
|
- PDXObject xObject = resources.getXObject(xObjectName);
|
|
|
- PDStream stream = xObject.getStream();
|
|
|
- PDImageXObject imageXObject = null;
|
|
|
- try {
|
|
|
- imageXObject = new PDImageXObject(stream, resources);
|
|
|
- } catch (Exception e) {
|
|
|
- e.printStackTrace();
|
|
|
- }
|
|
|
- if (imageXObject != null) {
|
|
|
- watermarkMap.put(xObjectName, imageXObject);
|
|
|
- }
|
|
|
- }
|
|
|
- return watermarkMap;
|
|
|
- }
|
|
|
-
|
|
|
- private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
|
|
|
- Map<String, List<String>> result = MapUtil.newHashMap(32);
|
|
|
- // 生成水印列表
|
|
|
-
|
|
|
- fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
|
|
|
- trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
|
|
|
- registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
|
|
|
- String text = fundName + trustName + registerNumber;
|
|
|
- text = text.replaceAll("[()]", ""); // 移除括号
|
|
|
- List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
|
|
|
- Collections.reverse(textList);
|
|
|
- StringBuilder sb = new StringBuilder(textList.size());
|
|
|
- for (String ch : textList) {
|
|
|
- sb.append(ch);
|
|
|
- }
|
|
|
- String joinedText = sb.toString();
|
|
|
-
|
|
|
- // 基本水印列表
|
|
|
- List<String> wkList = new ArrayList<>();
|
|
|
- for (String ch : textList) {
|
|
|
- wkList.add(ch + "\r\n");
|
|
|
- wkList.add("\r\n" + ch);
|
|
|
- }
|
|
|
-
|
|
|
- // 查找数字
|
|
|
- List<String> matches = findDigits(fundName);
|
|
|
- if (!matches.isEmpty()) {
|
|
|
- for (String match : matches) {
|
|
|
- wkList.add("\r\n" + match);
|
|
|
- wkList.add(match + "\r\n");
|
|
|
- }
|
|
|
- }
|
|
|
- wkList.add("-");
|
|
|
- wkList.add("【");
|
|
|
- wkList.add("】");
|
|
|
- wkList.add("\r");
|
|
|
- wkList.add("\r\n");
|
|
|
-
|
|
|
- String noNumberText = removeDigits(joinedText);
|
|
|
-
|
|
|
- // 生成不同字段的水印列表
|
|
|
- result.put("report_name", new ArrayList<>(wkList));
|
|
|
- result.get("report_name").addAll(convertStringToList("有限公司"));
|
|
|
-
|
|
|
- result.put("less", new ArrayList<>(wkList));
|
|
|
-
|
|
|
- result.put("more", new ArrayList<>(wkList));
|
|
|
- result.get("more").addAll(convertStringToList(noNumberText));
|
|
|
-
|
|
|
- result.put("leverage", new ArrayList<>(wkList));
|
|
|
- result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
|
|
|
-
|
|
|
- result.put("base_info", new ArrayList<>(wkList));
|
|
|
- result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
|
|
|
-
|
|
|
- result.put("industry", new ArrayList<>(wkList));
|
|
|
- result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
|
|
|
-
|
|
|
- result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
|
|
|
- return result;
|
|
|
- }
|
|
|
-
|
|
|
- private static List<String> findDigits(String text) {
|
|
|
- List<String> digits = new ArrayList<>();
|
|
|
- Pattern pattern = Pattern.compile("\\d");
|
|
|
- Matcher matcher = pattern.matcher(text);
|
|
|
- while (matcher.find()) {
|
|
|
- digits.add(matcher.group());
|
|
|
- }
|
|
|
- return digits;
|
|
|
- }
|
|
|
-
|
|
|
- private static String removeDigits(String text) {
|
|
|
- return text.replaceAll("\\d", "");
|
|
|
- }
|
|
|
-
|
|
|
- private static String removeKeywords(String text, String... keywords) {
|
|
|
- for (String keyword : keywords) {
|
|
|
- text = text.replaceAll(keyword, "");
|
|
|
- }
|
|
|
- return text;
|
|
|
- }
|
|
|
-
|
|
|
- private static List<String> convertStringToList(String text) {
|
|
|
- List<String> charList = new ArrayList<>();
|
|
|
- for (char c : text.toCharArray()) {
|
|
|
- charList.add(c + "");
|
|
|
- }
|
|
|
- return charList;
|
|
|
- }
|
|
|
-
|
|
|
- public static String processString(List<String> wmList, String string) {
|
|
|
- // 生成正则表达式模式
|
|
|
- String pat = String.join("|", wmList);
|
|
|
- // 使用正则表达式移除wmList中的元素
|
|
|
- string = removeMatches(string, pat);
|
|
|
- // 替换中文括号为英文括号
|
|
|
- string = string.replace("(", "(").replace(")", ")");
|
|
|
- // 移除空格
|
|
|
- string = string.replace(" ", "");
|
|
|
- // 如果字符串以括号开头,则移除第一个字符
|
|
|
- if (startsWithParenthesis(string)) {
|
|
|
- string = string.substring(1);
|
|
|
- }
|
|
|
-
|
|
|
- return string;
|
|
|
- }
|
|
|
-
|
|
|
- private static String removeMatches(String input, String pattern) {
|
|
|
- // 编译正则表达式
|
|
|
- Pattern compiledPattern = Pattern.compile(pattern);
|
|
|
- // 创建Matcher对象
|
|
|
- Matcher matcher = compiledPattern.matcher(input);
|
|
|
- // 使用replaceAll方法替换所有匹配到的字符为空字符串
|
|
|
- return matcher.replaceAll("");
|
|
|
- }
|
|
|
-
|
|
|
- private static boolean startsWithParenthesis(String input) {
|
|
|
- // 匹配以括号开头的字符串
|
|
|
- Pattern pattern = Pattern.compile("^[()].*");
|
|
|
- Matcher matcher = pattern.matcher(input);
|
|
|
- return matcher.find();
|
|
|
- }
|
|
|
-
|
|
|
-// public static void removeTextWatermark(PDPage page) throws IOException {
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+// /**
|
|
|
+// * 找图片水印
|
|
|
+// *
|
|
|
+// * @param page
|
|
|
+// * @return
|
|
|
+// * @throws IOException
|
|
|
+// */
|
|
|
+// public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
|
|
|
+// Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
|
|
|
// PDResources resources = page.getResources();
|
|
|
-//// if (StrUtil.isAllBlank(fundName, trustName)) {
|
|
|
-//// return;
|
|
|
-//// }
|
|
|
-// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
|
-// stripper.setSortByPosition(true);
|
|
|
-// stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
|
|
|
-// stripper.extractRegions(page);
|
|
|
+// Iterable<COSName> xObjectNames = resources.getXObjectNames();
|
|
|
+// for (COSName xObjectName : xObjectNames) {
|
|
|
+// PDXObject xObject = resources.getXObject(xObjectName);
|
|
|
+// PDStream stream = xObject.getStream();
|
|
|
+// PDImageXObject imageXObject = null;
|
|
|
+// try {
|
|
|
+// imageXObject = new PDImageXObject(stream, resources);
|
|
|
+// } catch (Exception e) {
|
|
|
+// e.printStackTrace();
|
|
|
+// }
|
|
|
+// if (imageXObject != null) {
|
|
|
+// watermarkMap.put(xObjectName, imageXObject);
|
|
|
+// }
|
|
|
+// }
|
|
|
+// return watermarkMap;
|
|
|
+// }
|
|
|
//
|
|
|
-// PDFStreamEngine engine = new PDFTextStripper();
|
|
|
-// engine.addOperator(new SetMatrix(stripper));
|
|
|
+// private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
|
|
|
+// Map<String, List<String>> result = MapUtil.newHashMap(32);
|
|
|
+// // 生成水印列表
|
|
|
//
|
|
|
-// }
|
|
|
+// fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
|
|
|
+// trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
|
|
|
+// registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
|
|
|
+// String text = fundName + trustName + registerNumber;
|
|
|
+// text = text.replaceAll("[()]", ""); // 移除括号
|
|
|
+// List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
|
|
|
+// Collections.reverse(textList);
|
|
|
+// StringBuilder sb = new StringBuilder(textList.size());
|
|
|
+// for (String ch : textList) {
|
|
|
+// sb.append(ch);
|
|
|
+// }
|
|
|
+// String joinedText = sb.toString();
|
|
|
//
|
|
|
-// private static void processResources(PDResources resources) throws IOException {
|
|
|
-// for (COSName name : resources.getXObjectNames()) {
|
|
|
-// PDXObject xobject = resources.getXObject(name);
|
|
|
-// if (xobject instanceof PDFormXObject) {
|
|
|
-// PDFormXObject formXObject = (PDFormXObject) xobject;
|
|
|
-// writeTokensToStream(formXObject.getContentStream(),
|
|
|
-// createTokensWithoutText(formXObject));
|
|
|
-// processResources(formXObject.getResources());
|
|
|
-// }
|
|
|
+// // 基本水印列表
|
|
|
+// List<String> wkList = new ArrayList<>();
|
|
|
+// for (String ch : textList) {
|
|
|
+// wkList.add(ch + "\r\n");
|
|
|
+// wkList.add("\r\n" + ch);
|
|
|
// }
|
|
|
-// for (COSName name : resources.getPatternNames()) {
|
|
|
-// PDAbstractPattern pattern = resources.getPattern(name);
|
|
|
-// if (pattern instanceof PDTilingPattern) {
|
|
|
-// PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
|
|
|
-// writeTokensToStream(tilingPattern.getContentStream(),
|
|
|
-// createTokensWithoutText(tilingPattern));
|
|
|
-// processResources(tilingPattern.getResources());
|
|
|
+//
|
|
|
+// // 查找数字
|
|
|
+// List<String> matches = findDigits(fundName);
|
|
|
+// if (!matches.isEmpty()) {
|
|
|
+// for (String match : matches) {
|
|
|
+// wkList.add("\r\n" + match);
|
|
|
+// wkList.add(match + "\r\n");
|
|
|
// }
|
|
|
// }
|
|
|
+// wkList.add("-");
|
|
|
+// wkList.add("【");
|
|
|
+// wkList.add("】");
|
|
|
+// wkList.add("\r");
|
|
|
+// wkList.add("\r\n");
|
|
|
+//
|
|
|
+// String noNumberText = removeDigits(joinedText);
|
|
|
+//
|
|
|
+// // 生成不同字段的水印列表
|
|
|
+// result.put("report_name", new ArrayList<>(wkList));
|
|
|
+// result.get("report_name").addAll(convertStringToList("有限公司"));
|
|
|
+//
|
|
|
+// result.put("less", new ArrayList<>(wkList));
|
|
|
+//
|
|
|
+// result.put("more", new ArrayList<>(wkList));
|
|
|
+// result.get("more").addAll(convertStringToList(noNumberText));
|
|
|
+//
|
|
|
+// result.put("leverage", new ArrayList<>(wkList));
|
|
|
+// result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
|
|
|
+//
|
|
|
+// result.put("base_info", new ArrayList<>(wkList));
|
|
|
+// result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
|
|
|
+//
|
|
|
+// result.put("industry", new ArrayList<>(wkList));
|
|
|
+// result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
|
|
|
+//
|
|
|
+// result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
|
|
|
+// return result;
|
|
|
+// }
|
|
|
+//
|
|
|
+// private static List<String> findDigits(String text) {
|
|
|
+// List<String> digits = new ArrayList<>();
|
|
|
+// Pattern pattern = Pattern.compile("\\d");
|
|
|
+// Matcher matcher = pattern.matcher(text);
|
|
|
+// while (matcher.find()) {
|
|
|
+// digits.add(matcher.group());
|
|
|
+// }
|
|
|
+// return digits;
|
|
|
// }
|
|
|
//
|
|
|
-// private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
|
|
|
-// try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
|
|
|
-// ContentStreamWriter writer = new ContentStreamWriter(out);
|
|
|
-// writer.writeTokens(newTokens);
|
|
|
+// private static String removeDigits(String text) {
|
|
|
+// return text.replaceAll("\\d", "");
|
|
|
+// }
|
|
|
+//
|
|
|
+// private static String removeKeywords(String text, String... keywords) {
|
|
|
+// for (String keyword : keywords) {
|
|
|
+// text = text.replaceAll(keyword, "");
|
|
|
// }
|
|
|
+// return text;
|
|
|
// }
|
|
|
//
|
|
|
-// private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
|
|
|
-// PDFStreamParser parser = new PDFStreamParser(contentStream);
|
|
|
-// Object token = parser.parseNextToken();
|
|
|
-// List<Object> newTokens = new ArrayList<>();
|
|
|
-// while (token != null) {
|
|
|
-// if (token instanceof Operator op) {
|
|
|
-// String opName = op.getName();
|
|
|
-// if (OperatorName.SET_MATRIX.equals(opName)) {
|
|
|
-// // remove the argument to this operator
|
|
|
-// newTokens.remove(newTokens.size() - 1);
|
|
|
+// private static List<String> convertStringToList(String text) {
|
|
|
+// List<String> charList = new ArrayList<>();
|
|
|
+// for (char c : text.toCharArray()) {
|
|
|
+// charList.add(c + "");
|
|
|
+// }
|
|
|
+// return charList;
|
|
|
+// }
|
|
|
//
|
|
|
-// token = parser.parseNextToken();
|
|
|
-// continue;
|
|
|
-// }
|
|
|
-// }
|
|
|
-// newTokens.add(token);
|
|
|
-// token = parser.parseNextToken();
|
|
|
+// public static String processString(List<String> wmList, String string) {
|
|
|
+// // 生成正则表达式模式
|
|
|
+// String pat = String.join("|", wmList);
|
|
|
+// // 使用正则表达式移除wmList中的元素
|
|
|
+// string = removeMatches(string, pat);
|
|
|
+// // 替换中文括号为英文括号
|
|
|
+// string = string.replace("(", "(").replace(")", ")");
|
|
|
+// // 移除空格
|
|
|
+// string = string.replace(" ", "");
|
|
|
+// // 如果字符串以括号开头,则移除第一个字符
|
|
|
+// if (startsWithParenthesis(string)) {
|
|
|
+// string = string.substring(1);
|
|
|
// }
|
|
|
-// return newTokens;
|
|
|
+//
|
|
|
+// return string;
|
|
|
+// }
|
|
|
+//
|
|
|
+// private static String removeMatches(String input, String pattern) {
|
|
|
+// // 编译正则表达式
|
|
|
+// Pattern compiledPattern = Pattern.compile(pattern);
|
|
|
+// // 创建Matcher对象
|
|
|
+// Matcher matcher = compiledPattern.matcher(input);
|
|
|
+// // 使用replaceAll方法替换所有匹配到的字符为空字符串
|
|
|
+// return matcher.replaceAll("");
|
|
|
// }
|
|
|
-}
|
|
|
+//
|
|
|
+// private static boolean startsWithParenthesis(String input) {
|
|
|
+// // 匹配以括号开头的字符串
|
|
|
+// Pattern pattern = Pattern.compile("^[()].*");
|
|
|
+// Matcher matcher = pattern.matcher(input);
|
|
|
+// return matcher.find();
|
|
|
+// }
|
|
|
+//
|
|
|
+//// public static void removeTextWatermark(PDPage page) throws IOException {
|
|
|
+//// PDResources resources = page.getResources();
|
|
|
+////// if (StrUtil.isAllBlank(fundName, trustName)) {
|
|
|
+////// return;
|
|
|
+////// }
|
|
|
+//// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
|
+//// stripper.setSortByPosition(true);
|
|
|
+//// stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
|
|
|
+//// stripper.extractRegions(page);
|
|
|
+////
|
|
|
+//// PDFStreamEngine engine = new PDFTextStripper();
|
|
|
+//// engine.addOperator(new SetMatrix(stripper));
|
|
|
+////
|
|
|
+//// }
|
|
|
+////
|
|
|
+//// private static void processResources(PDResources resources) throws IOException {
|
|
|
+//// for (COSName name : resources.getXObjectNames()) {
|
|
|
+//// PDXObject xobject = resources.getXObject(name);
|
|
|
+//// if (xobject instanceof PDFormXObject) {
|
|
|
+//// PDFormXObject formXObject = (PDFormXObject) xobject;
|
|
|
+//// writeTokensToStream(formXObject.getContentStream(),
|
|
|
+//// createTokensWithoutText(formXObject));
|
|
|
+//// processResources(formXObject.getResources());
|
|
|
+//// }
|
|
|
+//// }
|
|
|
+//// for (COSName name : resources.getPatternNames()) {
|
|
|
+//// PDAbstractPattern pattern = resources.getPattern(name);
|
|
|
+//// if (pattern instanceof PDTilingPattern) {
|
|
|
+//// PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
|
|
|
+//// writeTokensToStream(tilingPattern.getContentStream(),
|
|
|
+//// createTokensWithoutText(tilingPattern));
|
|
|
+//// processResources(tilingPattern.getResources());
|
|
|
+//// }
|
|
|
+//// }
|
|
|
+//// }
|
|
|
+////
|
|
|
+//// private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
|
|
|
+//// try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
|
|
|
+//// ContentStreamWriter writer = new ContentStreamWriter(out);
|
|
|
+//// writer.writeTokens(newTokens);
|
|
|
+//// }
|
|
|
+//// }
|
|
|
+////
|
|
|
+//// private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
|
|
|
+//// PDFStreamParser parser = new PDFStreamParser(contentStream);
|
|
|
+//// Object token = parser.parseNextToken();
|
|
|
+//// List<Object> newTokens = new ArrayList<>();
|
|
|
+//// while (token != null) {
|
|
|
+//// if (token instanceof Operator op) {
|
|
|
+//// String opName = op.getName();
|
|
|
+//// if (OperatorName.SET_MATRIX.equals(opName)) {
|
|
|
+//// // remove the argument to this operator
|
|
|
+//// newTokens.remove(newTokens.size() - 1);
|
|
|
+////
|
|
|
+//// token = parser.parseNextToken();
|
|
|
+//// continue;
|
|
|
+//// }
|
|
|
+//// }
|
|
|
+//// newTokens.add(token);
|
|
|
+//// token = parser.parseNextToken();
|
|
|
+//// }
|
|
|
+//// return newTokens;
|
|
|
+//// }
|
|
|
+//}
|