|
@@ -1,51 +1,348 @@
|
|
|
package com.simuwang.daq.utils;
|
|
|
|
|
|
+import cn.hutool.core.collection.CollUtil;
|
|
|
+import cn.hutool.core.collection.ListUtil;
|
|
|
+import cn.hutool.core.map.MapUtil;
|
|
|
+import cn.hutool.core.util.ReflectUtil;
|
|
|
+import cn.hutool.core.util.StrUtil;
|
|
|
+import com.simuwang.daq.components.CustomPDFTextStripper;
|
|
|
+import com.simuwang.daq.dto.ReportFundInfo;
|
|
|
+import com.smppw.common.pojo.ValueLabelVO;
|
|
|
import org.apache.pdfbox.Loader;
|
|
|
+import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
|
|
+import org.apache.pdfbox.contentstream.operator.text.ShowText;
|
|
|
+import org.apache.pdfbox.cos.COSName;
|
|
|
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
-import org.apache.pdfbox.pdmodel.PDPageTree;
|
|
|
+import org.apache.pdfbox.pdmodel.PDResources;
|
|
|
+import org.apache.pdfbox.pdmodel.common.PDStream;
|
|
|
+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
|
|
|
+import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
|
|
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
|
|
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
|
|
+import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
|
|
+import org.apache.pdfbox.text.TextPosition;
|
|
|
+import org.apache.pdfbox.util.Matrix;
|
|
|
+import technology.tabula.*;
|
|
|
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
|
|
|
|
|
import java.awt.geom.Rectangle2D;
|
|
|
import java.io.IOException;
|
|
|
-import java.util.List;
|
|
|
+import java.util.*;
|
|
|
+import java.util.regex.Matcher;
|
|
|
+import java.util.regex.Pattern;
|
|
|
+import java.util.stream.Collectors;
|
|
|
|
|
|
public class ReportParseUtil {
|
|
|
public static void main(String[] args) throws IOException {
|
|
|
+ List<ValueLabelVO> fieldMapper = ListUtil.list(false);
|
|
|
+ fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
|
|
|
+ fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
|
|
|
+ fieldMapper.add(new ValueLabelVO("operationType", "基金运作方式"));
|
|
|
+ fieldMapper.add(new ValueLabelVO("fundType", "基金类别"));
|
|
|
+ fieldMapper.add(new ValueLabelVO("inceptionDate", "基金成立日期"));
|
|
|
+ fieldMapper.add(new ValueLabelVO("trustName", "基金托管人"));
|
|
|
+ fieldMapper.add(new ValueLabelVO("custodianName", "基金管理人"));
|
|
|
+ fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
|
|
|
+ fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
|
|
|
+
|
|
|
+ Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
|
|
|
+ List<String> watermarks = watermarkMap.get("less");
|
|
|
+
|
|
|
+// System.out.println(watermarks);
|
|
|
+// try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
|
|
|
try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf"))) {
|
|
|
- PDPageTree pages = document.getPages();
|
|
|
- for (int i = 0; i < pages.getCount(); i++) {
|
|
|
- PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
|
- stripper.setSortByPosition(true);
|
|
|
+// PDFTextStripper stripper = new PDFTextStripper();
|
|
|
+// stripper.setSortByPosition(true);
|
|
|
+// String allText = stripper.getText(document);
|
|
|
+// List<String> textList = StrUtil.split(allText, "\r\n");
|
|
|
+// System.out.println(textList);
|
|
|
|
|
|
- // 定义每个区域
|
|
|
- defineAreas(stripper);
|
|
|
+ PDFTextStripper textStripper = new CustomPDFTextStripper();
|
|
|
+ textStripper.setSortByPosition(true);
|
|
|
+ String text1 = textStripper.getText(document);
|
|
|
+ text1 = text1.replace("+\r\n", "").replace("+","");
|
|
|
+ List<String> textList = StrUtil.split(text1, "\r\n");
|
|
|
+ System.out.println(textList.get(0));
|
|
|
|
|
|
- // 提取文本
|
|
|
- PDPage page = document.getPage(i);
|
|
|
- stripper.extractRegions(page);
|
|
|
+// for (PDPage page : document.getPages()) {
|
|
|
+//
|
|
|
+//// PDResources resources = page.getResources();
|
|
|
+//// Map<COSName, PDImageXObject> imageXObjectMap = findImageWatermark(page);
|
|
|
+//// Iterator<COSName> iterator = resources.getXObjectNames().iterator();
|
|
|
+//// while (iterator.hasNext()) {
|
|
|
+//// COSName next = iterator.next();
|
|
|
+//// if (imageXObjectMap.containsKey(next)) {
|
|
|
+//// iterator.remove();
|
|
|
+//// }
|
|
|
+//// }
|
|
|
+//// removeTextWatermark(page);
|
|
|
+//
|
|
|
+// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
|
+// stripper.setSortByPosition(true);
|
|
|
+// stripper.addRegion("page", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
|
|
|
+// stripper.extractRegions(page);
|
|
|
+// for (String region : stripper.getRegions()) {
|
|
|
+// String text = stripper.getTextForRegion(region);
|
|
|
+// String res = processString(watermarks, text);
|
|
|
+// System.out.println("原数据:" + text + ", 去除水印后数据:" + res);
|
|
|
+// }
|
|
|
+// }
|
|
|
+// document.save(new File("./1.pdf"));
|
|
|
|
|
|
- List<String> regions = stripper.getRegions();
|
|
|
- processRegions(stripper, regions);
|
|
|
+ SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
|
|
|
+ PageIterator pageIterator = new ObjectExtractor(document).extract();
|
|
|
+ while (pageIterator.hasNext()) {
|
|
|
+ Page page = pageIterator.next();
|
|
|
+ List<Table> tables = extractionAlgorithm.extract(page);
|
|
|
+ tables = tables.stream().distinct().collect(Collectors.toList());
|
|
|
+ for (Table table : tables) {
|
|
|
+ if (table.getColCount() == 4) {
|
|
|
+ Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
|
|
|
+ for (int i = 0; i < table.getRows().size(); i++) {
|
|
|
+ List<RectangularTextContainer> cols = table.getRows().get(i);
|
|
|
+ for (int j = 0; j < 2; j++) {
|
|
|
+ baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ ReportFundInfo reportFundInfo = new ReportFundInfo();
|
|
|
+ baseInfoMap.forEach((k, v) -> {
|
|
|
+ for (ValueLabelVO vo : fieldMapper) {
|
|
|
+ String fieldName = vo.getValue();
|
|
|
+ List<String> labels = StrUtil.split(vo.getLabel(), ",");
|
|
|
+ if (labels.contains(k)) {
|
|
|
+ ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ for (String label : labels) {
|
|
|
+ if (k.contains(label)) {
|
|
|
+ ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ });
|
|
|
+ System.out.println(reportFundInfo);
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- private static void defineAreas(PDFTextStripperByArea stripper) {
|
|
|
- // 定义区域,位置左上角作为原点,横坐标往右为x轴,纵坐标往下为y轴
|
|
|
- stripper.addRegion("header", new Rectangle2D.Float(0, 0, 612, 180));
|
|
|
- stripper.addRegion("content-survey", new Rectangle2D.Float(0, 180, 612, 180));
|
|
|
- stripper.addRegion("content-current-fund", new Rectangle2D.Float(0, 360, 612, 240));
|
|
|
- stripper.addRegion("content-sub-fund1", new Rectangle2D.Float(0, 600, 612, 100));
|
|
|
- stripper.addRegion("content-sub-fund2", new Rectangle2D.Float(0, 700, 612, 150));
|
|
|
- stripper.addRegion("footer", new Rectangle2D.Float(0, 850, 612, 30));
|
|
|
+ /**
|
|
|
+ * 找图片水印
|
|
|
+ *
|
|
|
+ * @param page
|
|
|
+ * @return
|
|
|
+ * @throws IOException
|
|
|
+ */
|
|
|
+ public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
|
|
|
+ Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
|
|
|
+ PDResources resources = page.getResources();
|
|
|
+ Iterable<COSName> xObjectNames = resources.getXObjectNames();
|
|
|
+ for (COSName xObjectName : xObjectNames) {
|
|
|
+ PDXObject xObject = resources.getXObject(xObjectName);
|
|
|
+ PDStream stream = xObject.getStream();
|
|
|
+ PDImageXObject imageXObject = null;
|
|
|
+ try {
|
|
|
+ imageXObject = new PDImageXObject(stream, resources);
|
|
|
+ } catch (Exception e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ if (imageXObject != null) {
|
|
|
+ watermarkMap.put(xObjectName, imageXObject);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return watermarkMap;
|
|
|
}
|
|
|
|
|
|
- private static void processRegions(PDFTextStripperByArea stripper, List<String> regions) {
|
|
|
- for (String region : regions) {
|
|
|
- String text = stripper.getTextForRegion(region);
|
|
|
- System.out.println(text);
|
|
|
+ private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
|
|
|
+ Map<String, List<String>> result = MapUtil.newHashMap(32);
|
|
|
+ // 生成水印列表
|
|
|
+
|
|
|
+ fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
|
|
|
+ trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
|
|
|
+ registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
|
|
|
+ String text = fundName + trustName + registerNumber;
|
|
|
+ text = text.replaceAll("[()]", ""); // 移除括号
|
|
|
+ List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
|
|
|
+ Collections.reverse(textList);
|
|
|
+ StringBuilder sb = new StringBuilder(textList.size());
|
|
|
+ for (String ch : textList) {
|
|
|
+ sb.append(ch);
|
|
|
+ }
|
|
|
+ String joinedText = sb.toString();
|
|
|
+
|
|
|
+ // 基本水印列表
|
|
|
+ List<String> wkList = new ArrayList<>();
|
|
|
+ for (String ch : textList) {
|
|
|
+ wkList.add(ch + "\r\n");
|
|
|
+ wkList.add("\r\n" + ch);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 查找数字
|
|
|
+ List<String> matches = findDigits(fundName);
|
|
|
+ if (!matches.isEmpty()) {
|
|
|
+ for (String match : matches) {
|
|
|
+ wkList.add("\r\n" + match);
|
|
|
+ wkList.add(match + "\r\n");
|
|
|
+ }
|
|
|
}
|
|
|
+ wkList.add("-");
|
|
|
+ wkList.add("【");
|
|
|
+ wkList.add("】");
|
|
|
+ wkList.add("\r");
|
|
|
+ wkList.add("\r\n");
|
|
|
+
|
|
|
+ String noNumberText = removeDigits(joinedText);
|
|
|
+
|
|
|
+ // 生成不同字段的水印列表
|
|
|
+ result.put("report_name", new ArrayList<>(wkList));
|
|
|
+ result.get("report_name").addAll(convertStringToList("有限公司"));
|
|
|
+
|
|
|
+ result.put("less", new ArrayList<>(wkList));
|
|
|
+
|
|
|
+ result.put("more", new ArrayList<>(wkList));
|
|
|
+ result.get("more").addAll(convertStringToList(noNumberText));
|
|
|
+
|
|
|
+ result.put("leverage", new ArrayList<>(wkList));
|
|
|
+ result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
|
|
|
+
|
|
|
+ result.put("base_info", new ArrayList<>(wkList));
|
|
|
+ result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
|
|
|
+
|
|
|
+ result.put("industry", new ArrayList<>(wkList));
|
|
|
+ result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
|
|
|
+
|
|
|
+ result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
|
|
|
+ return result;
|
|
|
}
|
|
|
+
|
|
|
+ private static List<String> findDigits(String text) {
|
|
|
+ List<String> digits = new ArrayList<>();
|
|
|
+ Pattern pattern = Pattern.compile("\\d");
|
|
|
+ Matcher matcher = pattern.matcher(text);
|
|
|
+ while (matcher.find()) {
|
|
|
+ digits.add(matcher.group());
|
|
|
+ }
|
|
|
+ return digits;
|
|
|
+ }
|
|
|
+
|
|
|
+ private static String removeDigits(String text) {
|
|
|
+ return text.replaceAll("\\d", "");
|
|
|
+ }
|
|
|
+
|
|
|
+ private static String removeKeywords(String text, String... keywords) {
|
|
|
+ for (String keyword : keywords) {
|
|
|
+ text = text.replaceAll(keyword, "");
|
|
|
+ }
|
|
|
+ return text;
|
|
|
+ }
|
|
|
+
|
|
|
+ private static List<String> convertStringToList(String text) {
|
|
|
+ List<String> charList = new ArrayList<>();
|
|
|
+ for (char c : text.toCharArray()) {
|
|
|
+ charList.add(c + "");
|
|
|
+ }
|
|
|
+ return charList;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static String processString(List<String> wmList, String string) {
|
|
|
+ // 生成正则表达式模式
|
|
|
+ String pat = String.join("|", wmList);
|
|
|
+ // 使用正则表达式移除wmList中的元素
|
|
|
+ string = removeMatches(string, pat);
|
|
|
+ // 替换中文括号为英文括号
|
|
|
+ string = string.replace("(", "(").replace(")", ")");
|
|
|
+ // 移除空格
|
|
|
+ string = string.replace(" ", "");
|
|
|
+ // 如果字符串以括号开头,则移除第一个字符
|
|
|
+ if (startsWithParenthesis(string)) {
|
|
|
+ string = string.substring(1);
|
|
|
+ }
|
|
|
+
|
|
|
+ return string;
|
|
|
+ }
|
|
|
+
|
|
|
+ private static String removeMatches(String input, String pattern) {
|
|
|
+ // 编译正则表达式
|
|
|
+ Pattern compiledPattern = Pattern.compile(pattern);
|
|
|
+ // 创建Matcher对象
|
|
|
+ Matcher matcher = compiledPattern.matcher(input);
|
|
|
+ // 使用replaceAll方法替换所有匹配到的字符为空字符串
|
|
|
+ return matcher.replaceAll("");
|
|
|
+ }
|
|
|
+
|
|
|
+ private static boolean startsWithParenthesis(String input) {
|
|
|
+ // 匹配以括号开头的字符串
|
|
|
+ Pattern pattern = Pattern.compile("^[()].*");
|
|
|
+ Matcher matcher = pattern.matcher(input);
|
|
|
+ return matcher.find();
|
|
|
+ }
|
|
|
+
|
|
|
+// public static void removeTextWatermark(PDPage page) throws IOException {
|
|
|
+// PDResources resources = page.getResources();
|
|
|
+//// if (StrUtil.isAllBlank(fundName, trustName)) {
|
|
|
+//// return;
|
|
|
+//// }
|
|
|
+// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
|
+// stripper.setSortByPosition(true);
|
|
|
+// stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
|
|
|
+// stripper.extractRegions(page);
|
|
|
+//
|
|
|
+// PDFStreamEngine engine = new PDFTextStripper();
|
|
|
+// engine.addOperator(new SetMatrix(stripper));
|
|
|
+//
|
|
|
+// }
|
|
|
+//
|
|
|
+// private static void processResources(PDResources resources) throws IOException {
|
|
|
+// for (COSName name : resources.getXObjectNames()) {
|
|
|
+// PDXObject xobject = resources.getXObject(name);
|
|
|
+// if (xobject instanceof PDFormXObject) {
|
|
|
+// PDFormXObject formXObject = (PDFormXObject) xobject;
|
|
|
+// writeTokensToStream(formXObject.getContentStream(),
|
|
|
+// createTokensWithoutText(formXObject));
|
|
|
+// processResources(formXObject.getResources());
|
|
|
+// }
|
|
|
+// }
|
|
|
+// for (COSName name : resources.getPatternNames()) {
|
|
|
+// PDAbstractPattern pattern = resources.getPattern(name);
|
|
|
+// if (pattern instanceof PDTilingPattern) {
|
|
|
+// PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
|
|
|
+// writeTokensToStream(tilingPattern.getContentStream(),
|
|
|
+// createTokensWithoutText(tilingPattern));
|
|
|
+// processResources(tilingPattern.getResources());
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+// private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
|
|
|
+// try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
|
|
|
+// ContentStreamWriter writer = new ContentStreamWriter(out);
|
|
|
+// writer.writeTokens(newTokens);
|
|
|
+// }
|
|
|
+// }
|
|
|
+//
|
|
|
+// private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
|
|
|
+// PDFStreamParser parser = new PDFStreamParser(contentStream);
|
|
|
+// Object token = parser.parseNextToken();
|
|
|
+// List<Object> newTokens = new ArrayList<>();
|
|
|
+// while (token != null) {
|
|
|
+// if (token instanceof Operator op) {
|
|
|
+// String opName = op.getName();
|
|
|
+// if (OperatorName.SET_MATRIX.equals(opName)) {
|
|
|
+// // remove the argument to this operator
|
|
|
+// newTokens.remove(newTokens.size() - 1);
|
|
|
+//
|
|
|
+// token = parser.parseNextToken();
|
|
|
+// continue;
|
|
|
+// }
|
|
|
+// }
|
|
|
+// newTokens.add(token);
|
|
|
+// token = parser.parseNextToken();
|
|
|
+// }
|
|
|
+// return newTokens;
|
|
|
+// }
|
|
|
}
|