//package com.simuwang.daq.utils; // //import cn.hutool.core.collection.CollUtil; //import cn.hutool.core.collection.ListUtil; //import cn.hutool.core.map.MapUtil; //import cn.hutool.core.util.ReflectUtil; //import cn.hutool.core.util.StrUtil; //import com.simuwang.daq.components.CustomPDFTextStripper; //import com.simuwang.daq.dto.ReportFundInfo; //import com.smppw.common.pojo.ValueLabelVO; //import org.apache.pdfbox.Loader; //import org.apache.pdfbox.contentstream.PDFStreamEngine; //import org.apache.pdfbox.contentstream.operator.text.ShowText; //import org.apache.pdfbox.cos.COSName; //import org.apache.pdfbox.io.RandomAccessReadBufferedFile; //import org.apache.pdfbox.pdmodel.PDDocument; //import org.apache.pdfbox.pdmodel.PDPage; //import org.apache.pdfbox.pdmodel.PDResources; //import org.apache.pdfbox.pdmodel.common.PDStream; //import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot; //import org.apache.pdfbox.pdmodel.graphics.PDXObject; //import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; //import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; //import org.apache.pdfbox.text.PDFTextStripper; //import org.apache.pdfbox.text.PDFTextStripperByArea; //import org.apache.pdfbox.text.TextPosition; //import org.apache.pdfbox.util.Matrix; //import technology.tabula.*; //import technology.tabula.extractors.SpreadsheetExtractionAlgorithm; // //import java.awt.geom.Rectangle2D; //import java.io.IOException; //import java.util.*; //import java.util.regex.Matcher; //import java.util.regex.Pattern; //import java.util.stream.Collectors; // //public class ReportParseUtil { // public static void main(String[] args) throws IOException { // List fieldMapper = ListUtil.list(false); // fieldMapper.add(new ValueLabelVO("fundName", "基金名称")); // fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码")); // fieldMapper.add(new ValueLabelVO("operationType", "基金运作方式")); // fieldMapper.add(new ValueLabelVO("fundType", "基金类别")); // fieldMapper.add(new ValueLabelVO("inceptionDate", "基金成立日期")); // fieldMapper.add(new ValueLabelVO("trustName", "基金托管人")); // fieldMapper.add(new ValueLabelVO("custodianName", "基金管理人")); // fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问")); // fieldMapper.add(new ValueLabelVO("reviewed", "复核")); // // Map> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null); // List watermarks = watermarkMap.get("less"); // //// System.out.println(watermarks); //// try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) { // try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf"))) { //// PDFTextStripper stripper = new PDFTextStripper(); //// stripper.setSortByPosition(true); //// String allText = stripper.getText(document); //// List textList = StrUtil.split(allText, "\r\n"); //// System.out.println(textList); // // PDFTextStripper textStripper = new CustomPDFTextStripper(); // textStripper.setSortByPosition(true); // String text1 = textStripper.getText(document); // text1 = text1.replace("+\r\n", "").replace("+",""); // List textList = StrUtil.split(text1, "\r\n"); // System.out.println(textList.get(0)); // //// for (PDPage page : document.getPages()) { //// ////// PDResources resources = page.getResources(); ////// Map imageXObjectMap = findImageWatermark(page); ////// Iterator iterator = resources.getXObjectNames().iterator(); ////// while (iterator.hasNext()) { ////// COSName next = iterator.next(); ////// if (imageXObjectMap.containsKey(next)) { ////// iterator.remove(); ////// } ////// } ////// removeTextWatermark(page); //// //// PDFTextStripperByArea stripper = new PDFTextStripperByArea(); //// stripper.setSortByPosition(true); //// stripper.addRegion("page", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight())); //// stripper.extractRegions(page); //// for (String region : stripper.getRegions()) { //// String text = stripper.getTextForRegion(region); //// String res = processString(watermarks, text); //// System.out.println("原数据:" + text + ", 去除水印后数据:" + res); //// } //// } //// document.save(new File("./1.pdf")); // // SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm(); // PageIterator pageIterator = new ObjectExtractor(document).extract(); // while (pageIterator.hasNext()) { // Page page = pageIterator.next(); // List tables = extractionAlgorithm.extract(page); // tables = tables.stream().distinct().collect(Collectors.toList()); // for (Table table : tables) { // if (table.getColCount() == 4) { // Map baseInfoMap = MapUtil.newHashMap(32); // for (int i = 0; i < table.getRows().size(); i++) { // List cols = table.getRows().get(i); // for (int j = 0; j < 2; j++) { // baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText()); // } // } // ReportFundInfo reportFundInfo = new ReportFundInfo(); // baseInfoMap.forEach((k, v) -> { // for (ValueLabelVO vo : fieldMapper) { // String fieldName = vo.getValue(); // List labels = StrUtil.split(vo.getLabel(), ","); // if (labels.contains(k)) { // ReflectUtil.setFieldValue(reportFundInfo, fieldName, v); // break; // } // for (String label : labels) { // if (k.contains(label)) { // ReflectUtil.setFieldValue(reportFundInfo, fieldName, v); // break; // } // } // } // }); // System.out.println(reportFundInfo); // } // } // } // } // } // // /** // * 找图片水印 // * // * @param page // * @return // * @throws IOException // */ // public static Map findImageWatermark(PDPage page) throws IOException { // Map watermarkMap = MapUtil.newHashMap(); // PDResources resources = page.getResources(); // Iterable xObjectNames = resources.getXObjectNames(); // for (COSName xObjectName : xObjectNames) { // PDXObject xObject = resources.getXObject(xObjectName); // PDStream stream = xObject.getStream(); // PDImageXObject imageXObject = null; // try { // imageXObject = new PDImageXObject(stream, resources); // } catch (Exception e) { // e.printStackTrace(); // } // if (imageXObject != null) { // watermarkMap.put(xObjectName, imageXObject); // } // } // return watermarkMap; // } // // private static Map> generateWatermarkListMap(String fundName, String trustName, String registerNumber) { // Map> result = MapUtil.newHashMap(32); // // 生成水印列表 // // fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金"; // trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司"; // registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : ""; // String text = fundName + trustName + registerNumber; // text = text.replaceAll("[()]", ""); // 移除括号 // List textList = new ArrayList<>(new HashSet<>(convertStringToList(text))); // Collections.reverse(textList); // StringBuilder sb = new StringBuilder(textList.size()); // for (String ch : textList) { // sb.append(ch); // } // String joinedText = sb.toString(); // // // 基本水印列表 // List wkList = new ArrayList<>(); // for (String ch : textList) { // wkList.add(ch + "\r\n"); // wkList.add("\r\n" + ch); // } // // // 查找数字 // List matches = findDigits(fundName); // if (!matches.isEmpty()) { // for (String match : matches) { // wkList.add("\r\n" + match); // wkList.add(match + "\r\n"); // } // } // wkList.add("-"); // wkList.add("【"); // wkList.add("】"); // wkList.add("\r"); // wkList.add("\r\n"); // // String noNumberText = removeDigits(joinedText); // // // 生成不同字段的水印列表 // result.put("report_name", new ArrayList<>(wkList)); // result.get("report_name").addAll(convertStringToList("有限公司")); // // result.put("less", new ArrayList<>(wkList)); // // result.put("more", new ArrayList<>(wkList)); // result.get("more").addAll(convertStringToList(noNumberText)); // // result.put("leverage", new ArrayList<>(wkList)); // result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产"))); // // result.put("base_info", new ArrayList<>(wkList)); // result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期"))); // // result.put("industry", new ArrayList<>(wkList)); // result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产"))); // // result.put("market_value", new ArrayList<>(Collections.singletonList("\n"))); // return result; // } // // private static List findDigits(String text) { // List digits = new ArrayList<>(); // Pattern pattern = Pattern.compile("\\d"); // Matcher matcher = pattern.matcher(text); // while (matcher.find()) { // digits.add(matcher.group()); // } // return digits; // } // // private static String removeDigits(String text) { // return text.replaceAll("\\d", ""); // } // // private static String removeKeywords(String text, String... keywords) { // for (String keyword : keywords) { // text = text.replaceAll(keyword, ""); // } // return text; // } // // private static List convertStringToList(String text) { // List charList = new ArrayList<>(); // for (char c : text.toCharArray()) { // charList.add(c + ""); // } // return charList; // } // // public static String processString(List wmList, String string) { // // 生成正则表达式模式 // String pat = String.join("|", wmList); // // 使用正则表达式移除wmList中的元素 // string = removeMatches(string, pat); // // 替换中文括号为英文括号 // string = string.replace("(", "(").replace(")", ")"); // // 移除空格 // string = string.replace(" ", ""); // // 如果字符串以括号开头,则移除第一个字符 // if (startsWithParenthesis(string)) { // string = string.substring(1); // } // // return string; // } // // private static String removeMatches(String input, String pattern) { // // 编译正则表达式 // Pattern compiledPattern = Pattern.compile(pattern); // // 创建Matcher对象 // Matcher matcher = compiledPattern.matcher(input); // // 使用replaceAll方法替换所有匹配到的字符为空字符串 // return matcher.replaceAll(""); // } // // private static boolean startsWithParenthesis(String input) { // // 匹配以括号开头的字符串 // Pattern pattern = Pattern.compile("^[()].*"); // Matcher matcher = pattern.matcher(input); // return matcher.find(); // } // //// public static void removeTextWatermark(PDPage page) throws IOException { //// PDResources resources = page.getResources(); ////// if (StrUtil.isAllBlank(fundName, trustName)) { ////// return; ////// } //// PDFTextStripperByArea stripper = new PDFTextStripperByArea(); //// stripper.setSortByPosition(true); //// stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight())); //// stripper.extractRegions(page); //// //// PDFStreamEngine engine = new PDFTextStripper(); //// engine.addOperator(new SetMatrix(stripper)); //// //// } //// //// private static void processResources(PDResources resources) throws IOException { //// for (COSName name : resources.getXObjectNames()) { //// PDXObject xobject = resources.getXObject(name); //// if (xobject instanceof PDFormXObject) { //// PDFormXObject formXObject = (PDFormXObject) xobject; //// writeTokensToStream(formXObject.getContentStream(), //// createTokensWithoutText(formXObject)); //// processResources(formXObject.getResources()); //// } //// } //// for (COSName name : resources.getPatternNames()) { //// PDAbstractPattern pattern = resources.getPattern(name); //// if (pattern instanceof PDTilingPattern) { //// PDTilingPattern tilingPattern = (PDTilingPattern) pattern; //// writeTokensToStream(tilingPattern.getContentStream(), //// createTokensWithoutText(tilingPattern)); //// processResources(tilingPattern.getResources()); //// } //// } //// } //// //// private static void writeTokensToStream(PDStream newContents, List newTokens) throws IOException { //// try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) { //// ContentStreamWriter writer = new ContentStreamWriter(out); //// writer.writeTokens(newTokens); //// } //// } //// //// private static List createTokensWithoutText(PDContentStream contentStream) throws IOException { //// PDFStreamParser parser = new PDFStreamParser(contentStream); //// Object token = parser.parseNextToken(); //// List newTokens = new ArrayList<>(); //// while (token != null) { //// if (token instanceof Operator op) { //// String opName = op.getName(); //// if (OperatorName.SET_MATRIX.equals(opName)) { //// // remove the argument to this operator //// newTokens.remove(newTokens.size() - 1); //// //// token = parser.parseNextToken(); //// continue; //// } //// } //// newTokens.add(token); //// token = parser.parseNextToken(); //// } //// return newTokens; //// } //}