123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348 |
- //package com.simuwang.daq.utils;
- //
- //import cn.hutool.core.collection.CollUtil;
- //import cn.hutool.core.collection.ListUtil;
- //import cn.hutool.core.map.MapUtil;
- //import cn.hutool.core.util.ReflectUtil;
- //import cn.hutool.core.util.StrUtil;
- //import com.simuwang.daq.components.CustomPDFTextStripper;
- //import com.simuwang.daq.dto.ReportFundInfo;
- //import com.smppw.common.pojo.ValueLabelVO;
- //import org.apache.pdfbox.Loader;
- //import org.apache.pdfbox.contentstream.PDFStreamEngine;
- //import org.apache.pdfbox.contentstream.operator.text.ShowText;
- //import org.apache.pdfbox.cos.COSName;
- //import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
- //import org.apache.pdfbox.pdmodel.PDDocument;
- //import org.apache.pdfbox.pdmodel.PDPage;
- //import org.apache.pdfbox.pdmodel.PDResources;
- //import org.apache.pdfbox.pdmodel.common.PDStream;
- //import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
- //import org.apache.pdfbox.pdmodel.graphics.PDXObject;
- //import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
- //import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
- //import org.apache.pdfbox.text.PDFTextStripper;
- //import org.apache.pdfbox.text.PDFTextStripperByArea;
- //import org.apache.pdfbox.text.TextPosition;
- //import org.apache.pdfbox.util.Matrix;
- //import technology.tabula.*;
- //import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
- //
- //import java.awt.geom.Rectangle2D;
- //import java.io.IOException;
- //import java.util.*;
- //import java.util.regex.Matcher;
- //import java.util.regex.Pattern;
- //import java.util.stream.Collectors;
- //
- //public class ReportParseUtil {
- // public static void main(String[] args) throws IOException {
- // List<ValueLabelVO> fieldMapper = ListUtil.list(false);
- // fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
- // fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
- // fieldMapper.add(new ValueLabelVO("operationType", "基金运作方式"));
- // fieldMapper.add(new ValueLabelVO("fundType", "基金类别"));
- // fieldMapper.add(new ValueLabelVO("inceptionDate", "基金成立日期"));
- // fieldMapper.add(new ValueLabelVO("trustName", "基金托管人"));
- // fieldMapper.add(new ValueLabelVO("custodianName", "基金管理人"));
- // fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
- // fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
- //
- // Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
- // List<String> watermarks = watermarkMap.get("less");
- //
- //// System.out.println(watermarks);
- //// try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
- // try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf"))) {
- //// PDFTextStripper stripper = new PDFTextStripper();
- //// stripper.setSortByPosition(true);
- //// String allText = stripper.getText(document);
- //// List<String> textList = StrUtil.split(allText, "\r\n");
- //// System.out.println(textList);
- //
- // PDFTextStripper textStripper = new CustomPDFTextStripper();
- // textStripper.setSortByPosition(true);
- // String text1 = textStripper.getText(document);
- // text1 = text1.replace("+\r\n", "").replace("+","");
- // List<String> textList = StrUtil.split(text1, "\r\n");
- // System.out.println(textList.get(0));
- //
- //// for (PDPage page : document.getPages()) {
- ////
- ////// PDResources resources = page.getResources();
- ////// Map<COSName, PDImageXObject> imageXObjectMap = findImageWatermark(page);
- ////// Iterator<COSName> iterator = resources.getXObjectNames().iterator();
- ////// while (iterator.hasNext()) {
- ////// COSName next = iterator.next();
- ////// if (imageXObjectMap.containsKey(next)) {
- ////// iterator.remove();
- ////// }
- ////// }
- ////// removeTextWatermark(page);
- ////
- //// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
- //// stripper.setSortByPosition(true);
- //// stripper.addRegion("page", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
- //// stripper.extractRegions(page);
- //// for (String region : stripper.getRegions()) {
- //// String text = stripper.getTextForRegion(region);
- //// String res = processString(watermarks, text);
- //// System.out.println("原数据:" + text + ", 去除水印后数据:" + res);
- //// }
- //// }
- //// document.save(new File("./1.pdf"));
- //
- // SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
- // PageIterator pageIterator = new ObjectExtractor(document).extract();
- // while (pageIterator.hasNext()) {
- // Page page = pageIterator.next();
- // List<Table> tables = extractionAlgorithm.extract(page);
- // tables = tables.stream().distinct().collect(Collectors.toList());
- // for (Table table : tables) {
- // if (table.getColCount() == 4) {
- // Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
- // for (int i = 0; i < table.getRows().size(); i++) {
- // List<RectangularTextContainer> cols = table.getRows().get(i);
- // for (int j = 0; j < 2; j++) {
- // baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
- // }
- // }
- // ReportFundInfo reportFundInfo = new ReportFundInfo();
- // baseInfoMap.forEach((k, v) -> {
- // for (ValueLabelVO vo : fieldMapper) {
- // String fieldName = vo.getValue();
- // List<String> labels = StrUtil.split(vo.getLabel(), ",");
- // if (labels.contains(k)) {
- // ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
- // break;
- // }
- // for (String label : labels) {
- // if (k.contains(label)) {
- // ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
- // break;
- // }
- // }
- // }
- // });
- // System.out.println(reportFundInfo);
- // }
- // }
- // }
- // }
- // }
- //
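- // /**
- // * Finds image watermarks (找图片水印): collects the XObjects in the page's resources
- // * for which a PDImageXObject can be constructed.
- // *
- // * @param page the PDF page whose resources are scanned
- // * @return map from XObject name to the PDImageXObject built from that XObject's stream
- // * @throws IOException if an XObject cannot be loaded from the page resources
- // */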
- // public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
- // Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
- // PDResources resources = page.getResources();
- // Iterable<COSName> xObjectNames = resources.getXObjectNames();
- // for (COSName xObjectName : xObjectNames) {
- // PDXObject xObject = resources.getXObject(xObjectName);
- // PDStream stream = xObject.getStream();
- // PDImageXObject imageXObject = null;
- // try {
- // imageXObject = new PDImageXObject(stream, resources);
- // } catch (Exception e) {
- // e.printStackTrace();
- // }
- // if (imageXObject != null) {
- // watermarkMap.put(xObjectName, imageXObject);
- // }
- // }
- // return watermarkMap;
- // }
- //
- // private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
- // Map<String, List<String>> result = MapUtil.newHashMap(32);
- // // 生成水印列表
- //
- // fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
- // trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
- // registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
- // String text = fundName + trustName + registerNumber;
- // text = text.replaceAll("[()]", ""); // 移除括号
- // List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
- // Collections.reverse(textList);
- // StringBuilder sb = new StringBuilder(textList.size());
- // for (String ch : textList) {
- // sb.append(ch);
- // }
- // String joinedText = sb.toString();
- //
- // // 基本水印列表
- // List<String> wkList = new ArrayList<>();
- // for (String ch : textList) {
- // wkList.add(ch + "\r\n");
- // wkList.add("\r\n" + ch);
- // }
- //
- // // 查找数字
- // List<String> matches = findDigits(fundName);
- // if (!matches.isEmpty()) {
- // for (String match : matches) {
- // wkList.add("\r\n" + match);
- // wkList.add(match + "\r\n");
- // }
- // }
- // wkList.add("-");
- // wkList.add("【");
- // wkList.add("】");
- // wkList.add("\r");
- // wkList.add("\r\n");
- //
- // String noNumberText = removeDigits(joinedText);
- //
- // // 生成不同字段的水印列表
- // result.put("report_name", new ArrayList<>(wkList));
- // result.get("report_name").addAll(convertStringToList("有限公司"));
- //
- // result.put("less", new ArrayList<>(wkList));
- //
- // result.put("more", new ArrayList<>(wkList));
- // result.get("more").addAll(convertStringToList(noNumberText));
- //
- // result.put("leverage", new ArrayList<>(wkList));
- // result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
- //
- // result.put("base_info", new ArrayList<>(wkList));
- // result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
- //
- // result.put("industry", new ArrayList<>(wkList));
- // result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
- //
- // result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
- // return result;
- // }
- //
- // private static List<String> findDigits(String text) {
- // List<String> digits = new ArrayList<>();
- // Pattern pattern = Pattern.compile("\\d");
- // Matcher matcher = pattern.matcher(text);
- // while (matcher.find()) {
- // digits.add(matcher.group());
- // }
- // return digits;
- // }
- //
- // private static String removeDigits(String text) {
- // return text.replaceAll("\\d", "");
- // }
- //
- // private static String removeKeywords(String text, String... keywords) {
- // for (String keyword : keywords) {
- // text = text.replaceAll(keyword, "");
- // }
- // return text;
- // }
- //
- // private static List<String> convertStringToList(String text) {
- // List<String> charList = new ArrayList<>();
- // for (char c : text.toCharArray()) {
- // charList.add(c + "");
- // }
- // return charList;
- // }
- //
- // public static String processString(List<String> wmList, String string) {
- // // 生成正则表达式模式
- // String pat = String.join("|", wmList);
- // // 使用正则表达式移除wmList中的元素
- // string = removeMatches(string, pat);
- // // 替换中文括号为英文括号
- // string = string.replace("(", "(").replace(")", ")");
- // // 移除空格
- // string = string.replace(" ", "");
- // // 如果字符串以括号开头,则移除第一个字符
- // if (startsWithParenthesis(string)) {
- // string = string.substring(1);
- // }
- //
- // return string;
- // }
- //
- // private static String removeMatches(String input, String pattern) {
- // // 编译正则表达式
- // Pattern compiledPattern = Pattern.compile(pattern);
- // // 创建Matcher对象
- // Matcher matcher = compiledPattern.matcher(input);
- // // 使用replaceAll方法替换所有匹配到的字符为空字符串
- // return matcher.replaceAll("");
- // }
- //
- // private static boolean startsWithParenthesis(String input) {
- // // 匹配以括号开头的字符串
- // Pattern pattern = Pattern.compile("^[()].*");
- // Matcher matcher = pattern.matcher(input);
- // return matcher.find();
- // }
- //
- //// public static void removeTextWatermark(PDPage page) throws IOException {
- //// PDResources resources = page.getResources();
- ////// if (StrUtil.isAllBlank(fundName, trustName)) {
- ////// return;
- ////// }
- //// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
- //// stripper.setSortByPosition(true);
- //// stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
- //// stripper.extractRegions(page);
- ////
- //// PDFStreamEngine engine = new PDFTextStripper();
- //// engine.addOperator(new SetMatrix(stripper));
- ////
- //// }
- ////
- //// private static void processResources(PDResources resources) throws IOException {
- //// for (COSName name : resources.getXObjectNames()) {
- //// PDXObject xobject = resources.getXObject(name);
- //// if (xobject instanceof PDFormXObject) {
- //// PDFormXObject formXObject = (PDFormXObject) xobject;
- //// writeTokensToStream(formXObject.getContentStream(),
- //// createTokensWithoutText(formXObject));
- //// processResources(formXObject.getResources());
- //// }
- //// }
- //// for (COSName name : resources.getPatternNames()) {
- //// PDAbstractPattern pattern = resources.getPattern(name);
- //// if (pattern instanceof PDTilingPattern) {
- //// PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
- //// writeTokensToStream(tilingPattern.getContentStream(),
- //// createTokensWithoutText(tilingPattern));
- //// processResources(tilingPattern.getResources());
- //// }
- //// }
- //// }
- ////
- //// private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
- //// try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
- //// ContentStreamWriter writer = new ContentStreamWriter(out);
- //// writer.writeTokens(newTokens);
- //// }
- //// }
- ////
- //// private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
- //// PDFStreamParser parser = new PDFStreamParser(contentStream);
- //// Object token = parser.parseNextToken();
- //// List<Object> newTokens = new ArrayList<>();
- //// while (token != null) {
- //// if (token instanceof Operator op) {
- //// String opName = op.getName();
- //// if (OperatorName.SET_MATRIX.equals(opName)) {
- //// // remove the argument to this operator
- //// newTokens.remove(newTokens.size() - 1);
- ////
- //// token = parser.parseNextToken();
- //// continue;
- //// }
- //// }
- //// newTokens.add(token);
- //// token = parser.parseNextToken();
- //// }
- //// return newTokens;
- //// }
- //}
|