wangzaijun
/
data-daq
forkattu lähteestä Tech2/data-daq


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
							//package com.simuwang.daq.utils;
//
//import cn.hutool.core.collection.CollUtil;
//import cn.hutool.core.collection.ListUtil;
//import cn.hutool.core.map.MapUtil;
//import cn.hutool.core.util.ReflectUtil;
//import cn.hutool.core.util.StrUtil;
//import com.simuwang.daq.components.CustomPDFTextStripper;
//import com.simuwang.daq.dto.ReportFundInfo;
//import com.smppw.common.pojo.ValueLabelVO;
//import org.apache.pdfbox.Loader;
//import org.apache.pdfbox.contentstream.PDFStreamEngine;
//import org.apache.pdfbox.contentstream.operator.text.ShowText;
//import org.apache.pdfbox.cos.COSName;
//import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
//import org.apache.pdfbox.pdmodel.PDDocument;
//import org.apache.pdfbox.pdmodel.PDPage;
//import org.apache.pdfbox.pdmodel.PDResources;
//import org.apache.pdfbox.pdmodel.common.PDStream;
//import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
//import org.apache.pdfbox.pdmodel.graphics.PDXObject;
//import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
//import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
//import org.apache.pdfbox.text.PDFTextStripper;
//import org.apache.pdfbox.text.PDFTextStripperByArea;
//import org.apache.pdfbox.text.TextPosition;
//import org.apache.pdfbox.util.Matrix;
//import technology.tabula.*;
//import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
//
//import java.awt.geom.Rectangle2D;
//import java.io.IOException;
//import java.util.*;
//import java.util.regex.Matcher;
//import java.util.regex.Pattern;
//import java.util.stream.Collectors;
//
//public class ReportParseUtil {
//    /**
//     * Ad-hoc manual driver (currently disabled together with the rest of this file).
//     * <p>
//     * Loads one PDF from a hard-coded local path, prints the first line of its extracted text,
//     * then runs Tabula's spreadsheet extraction on every page and, for each four-column table,
//     * copies the label/value pairs into a {@code ReportFundInfo} via reflection and prints it.
//     *
//     * @param args unused
//     * @throws IOException declared for the PDF loading / text extraction calls below
//     */
//    public static void main(String[] args) throws IOException {
//        // value = ReportFundInfo field name, label = label text looked up in the PDF table (see the forEach below).
//        List<ValueLabelVO> fieldMapper = ListUtil.list(false);
//        fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
//        fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
//        fieldMapper.add(new ValueLabelVO("operationType", "基金运作方式"));
//        fieldMapper.add(new ValueLabelVO("fundType", "基金类别"));
//        fieldMapper.add(new ValueLabelVO("inceptionDate", "基金成立日期"));
//        fieldMapper.add(new ValueLabelVO("trustName", "基金托管人"));
//        fieldMapper.add(new ValueLabelVO("custodianName", "基金管理人"));
//        fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
//        fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
//
//        // "watermarks" is only referenced by the double-commented per-page region extraction further down.
//        Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业（有限合伙）", null);
//        List<String> watermarks = watermarkMap.get("less");
//
////        System.out.println(watermarks);
////        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
//        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf"))) {
////            PDFTextStripper stripper = new PDFTextStripper();
////            stripper.setSortByPosition(true);
////            String allText = stripper.getText(document);
////            List<String> textList = StrUtil.split(allText, "\r\n");
////            System.out.println(textList);
//
//            PDFTextStripper textStripper = new CustomPDFTextStripper();
//            textStripper.setSortByPosition(true);
//            String text1 = textStripper.getText(document);
//            // Remove every "+\r\n" sequence first, then any remaining "+", before splitting into lines.
//            // NOTE(review): presumably "+" is a marker emitted by CustomPDFTextStripper - confirm against that class.
//            text1 = text1.replace("+\r\n", "").replace("+","");
//            List<String> textList = StrUtil.split(text1, "\r\n");
//            System.out.println(textList.get(0));
//
////            for (PDPage page : document.getPages()) {
////
//////                PDResources resources = page.getResources();
//////                Map<COSName, PDImageXObject> imageXObjectMap = findImageWatermark(page);
//////                Iterator<COSName> iterator = resources.getXObjectNames().iterator();
//////                while (iterator.hasNext()) {
//////                    COSName next = iterator.next();
//////                    if (imageXObjectMap.containsKey(next)) {
//////                        iterator.remove();
//////                    }
//////                }
//////                removeTextWatermark(page);
////
////                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
////                stripper.setSortByPosition(true);
////                stripper.addRegion("page", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
////                stripper.extractRegions(page);
////                for (String region : stripper.getRegions()) {
////                    String text = stripper.getTextForRegion(region);
////                    String res = processString(watermarks, text);
////                    System.out.println("原数据：" + text + ", 去除水印后数据：" + res);
////                }
////            }
////            document.save(new File("./1.pdf"));
//
//            SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
//            PageIterator pageIterator = new ObjectExtractor(document).extract();
//            while (pageIterator.hasNext()) {
//                Page page = pageIterator.next();
//                List<Table> tables = extractionAlgorithm.extract(page);
//                tables = tables.stream().distinct().collect(Collectors.toList());
//                for (Table table : tables) {
//                    // Only four-column tables are processed; each row is read as two
//                    // label/value pairs: columns 0-1 and columns 2-3.
//                    if (table.getColCount() == 4) {
//                        Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
//                        for (int i = 0; i < table.getRows().size(); i++) {
//                            List<RectangularTextContainer> cols = table.getRows().get(i);
//                            for (int j = 0; j < 2; j++) {
//                                baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
//                            }
//                        }
//                        ReportFundInfo reportFundInfo = new ReportFundInfo();
//                        baseInfoMap.forEach((k, v) -> {
//                            for (ValueLabelVO vo : fieldMapper) {
//                                String fieldName = vo.getValue();
//                                List<String> labels = StrUtil.split(vo.getLabel(), ",");
//                                // Exact label match: set the field and stop scanning the mappers for this key.
//                                if (labels.contains(k)) {
//                                    ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
//                                    break;
//                                }
//                                // Fallback: the table key merely contains one of the labels.
//                                for (String label : labels) {
//                                    if (k.contains(label)) {
//                                        ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
//                                        // This break only leaves the inner label loop; the outer mapper loop
//                                        // continues, so a later mapper may also match the same key.
//                                        break;
//                                    }
//                                }
//                            }
//                        });
//                        System.out.println(reportFundInfo);
//                    }
//                }
//            }
//        }
//    }
//
//    /**
//     * Finds image watermarks on a page.
//     * <p>
//     * Every XObject in the page resources whose stream can be wrapped in a
//     * {@code PDImageXObject} is returned; no further check is made that the image
//     * actually is a watermark.
//     *
//     * @param page the PDF page whose resources are scanned
//     * @return map from XObject name to the image built from that XObject's stream
//     * @throws IOException declared for the resource lookups
//     */
//    public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
//        Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
//        PDResources resources = page.getResources();
//        Iterable<COSName> xObjectNames = resources.getXObjectNames();
//        for (COSName xObjectName : xObjectNames) {
//            PDXObject xObject = resources.getXObject(xObjectName);
//            PDStream stream = xObject.getStream();
//            PDImageXObject imageXObject = null;
//            // Best effort: a stream that cannot be turned into an image is logged to stderr and skipped.
//            try {
//                imageXObject = new PDImageXObject(stream, resources);
//            } catch (Exception e) {
//                e.printStackTrace();
//            }
//            if (imageXObject != null) {
//                watermarkMap.put(xObjectName, imageXObject);
//            }
//        }
//        return watermarkMap;
//    }
//
//    /**
//     * Builds the per-field lists of watermark fragments that are later stripped from extracted text.
//     * <p>
//     * The three inputs are concatenated, ASCII parentheses are removed, and the result is broken
//     * into distinct single characters. From those characters (plus the digits of the fund name and
//     * a few fixed separators) a base list is built; each returned entry starts from a copy of that
//     * base list and appends extra single characters specific to the field.
//     *
//     * @param fundName       fund name; a generic placeholder string is used when blank
//     * @param trustName      company name; a generic placeholder string is used when blank
//     * @param registerNumber registration number; the empty string is used when blank
//     * @return map with the keys {@code report_name}, {@code less}, {@code more}, {@code leverage},
//     *         {@code base_info}, {@code industry} and {@code market_value}
//     */
//    private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
//        Map<String, List<String>> result = MapUtil.newHashMap(32);
//        // Build the watermark lists
//
//        fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
//        trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
//        registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
//        String text = fundName + trustName + registerNumber;
//        // Only the ASCII characters '(' and ')' are matched here; full-width parentheses stay in the text.
//        text = text.replaceAll("[()]", ""); // remove parentheses
//        // De-duplicate through a HashSet: the resulting character order is the set's iteration
//        // order (then reversed), not the order in which the characters appear in the text.
//        List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
//        Collections.reverse(textList);
//        StringBuilder sb = new StringBuilder(textList.size());
//        for (String ch : textList) {
//            sb.append(ch);
//        }
//        String joinedText = sb.toString();
//
//        // Base watermark list: every distinct character followed by, and preceded by, a CRLF
//        List<String> wkList = new ArrayList<>();
//        for (String ch : textList) {
//            wkList.add(ch + "\r\n");
//            wkList.add("\r\n" + ch);
//        }
//
//        // Find the digits in the fund name
//        List<String> matches = findDigits(fundName);
//        if (!matches.isEmpty()) {
//            for (String match : matches) {
//                wkList.add("\r\n" + match);
//                wkList.add(match + "\r\n");
//            }
//        }
//        wkList.add("-");
//        wkList.add("【");
//        wkList.add("】");
//        wkList.add("\r");
//        wkList.add("\r\n");
//
//        String noNumberText = removeDigits(joinedText);
//
//        // Build the watermark list for each field
//        result.put("report_name", new ArrayList<>(wkList));
//        result.get("report_name").addAll(convertStringToList("有限公司"));
//
//        result.put("less", new ArrayList<>(wkList));
//
//        result.put("more", new ArrayList<>(wkList));
//        result.get("more").addAll(convertStringToList(noNumberText));
//
//        // NOTE(review): removeKeywords removes the keyword as one contiguous sequence; noNumberText is
//        // built from de-duplicated characters in set order, so a multi-character keyword may never
//        // occur in it - confirm whether per-character removal was intended (same for "industry" below).
//        result.put("leverage", new ArrayList<>(wkList));
//        result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
//
//        result.put("base_info", new ArrayList<>(wkList));
//        result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
//
//        result.put("industry", new ArrayList<>(wkList));
//        result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
//
//        // Unlike the other entries this list does not start from the base list; it holds a single "\n".
//        result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
//        return result;
//    }
//
//    /**
//     * Collects every match of the regex {@code \d} in the given text, in order of appearance.
//     * Multi-digit numbers are therefore returned as separate one-digit strings.
//     *
//     * @param text text to scan
//     * @return list of single-digit strings; empty when the text contains no digit
//     */
//    private static List<String> findDigits(String text) {
//        List<String> digits = new ArrayList<>();
//        Pattern pattern = Pattern.compile("\\d");
//        Matcher matcher = pattern.matcher(text);
//        while (matcher.find()) {
//            digits.add(matcher.group());
//        }
//        return digits;
//    }
//
//    /**
//     * Returns the text with every match of the regex {@code \d} removed.
//     *
//     * @param text input text
//     * @return the text without digits
//     */
//    private static String removeDigits(String text) {
//        return text.replaceAll("\\d", "");
//    }
//
//    /**
//     * Removes every occurrence of each keyword from the text, one keyword after another.
//     * Each keyword is passed to {@code String.replaceAll}, so it is interpreted as a regular
//     * expression, not as a literal string.
//     *
//     * @param text     input text
//     * @param keywords patterns to remove, applied in the given order
//     * @return the text after all removals
//     */
//    private static String removeKeywords(String text, String... keywords) {
//        for (String keyword : keywords) {
//            text = text.replaceAll(keyword, "");
//        }
//        return text;
//    }
//
//    /**
//     * Splits the text into a list holding one single-character string per {@code char},
//     * in the original order (duplicates are kept).
//     *
//     * @param text input text
//     * @return list of one-{@code char} strings; empty for an empty input
//     */
//    private static List<String> convertStringToList(String text) {
//        List<String> charList = new ArrayList<>();
//        for (char c : text.toCharArray()) {
//            charList.add(c + "");
//        }
//        return charList;
//    }
//
//    /**
//     * Strips watermark fragments from a piece of extracted text and normalises it.
//     * <p>
//     * Steps, in order: remove everything matching any entry of {@code wmList}, convert
//     * full-width parentheses to ASCII ones, remove space characters, and drop a single
//     * leading ASCII parenthesis if present.
//     *
//     * @param wmList watermark fragments; joined with {@code |} into one regular expression
//     * @param string text to clean
//     * @return the cleaned text
//     */
//    public static String processString(List<String> wmList, String string) {
//        // Build the regular-expression pattern.
//        // NOTE(review): entries are joined as-is, without Pattern.quote, so an entry containing a
//        // regex metacharacter is interpreted as regex syntax rather than literally.
//        String pat = String.join("|", wmList);
//        // Remove everything that matches an entry of wmList
//        string = removeMatches(string, pat);
//        // Replace full-width (Chinese) parentheses with ASCII parentheses
//        string = string.replace("（", "(").replace("）", ")");
//        // Remove spaces
//        string = string.replace(" ", "");
//        // If the string starts with a parenthesis, drop that first character
//        if (startsWithParenthesis(string)) {
//            string = string.substring(1);
//        }
//
//        return string;
//    }
//
//    /**
//     * Removes every match of the given regular expression from the input.
//     *
//     * @param input   text to process
//     * @param pattern regular expression, compiled on every call
//     * @return the input with all matches replaced by the empty string
//     */
//    private static String removeMatches(String input, String pattern) {
//        // Compile the regular expression
//        Pattern compiledPattern = Pattern.compile(pattern);
//        // Create the Matcher
//        Matcher matcher = compiledPattern.matcher(input);
//        // Replace every match with the empty string
//        return matcher.replaceAll("");
//    }
//
//    /**
//     * Tells whether the input begins with an ASCII {@code (} or {@code )}.
//     *
//     * @param input text to test
//     * @return {@code true} when the first character is an ASCII parenthesis
//     */
//    private static boolean startsWithParenthesis(String input) {
//        // Match strings that start with a parenthesis
//        Pattern pattern = Pattern.compile("^[()].*");
//        Matcher matcher = pattern.matcher(input);
//        return matcher.find();
//    }
//
////    /**
////     * Unfinished draft (double-commented). Extracts a "watermark" region covering the whole
////     * media box of the page and registers a {@code SetMatrix} operator on a fresh
////     * {@code PDFTextStripper}; nothing is removed from or written back to the page here.
////     * The local {@code resources} is read but not used.
////     *
////     * @param page page to process
////     * @throws IOException declared for the region extraction
////     */
////    public static void removeTextWatermark(PDPage page) throws IOException {
////        PDResources resources = page.getResources();
//////        if (StrUtil.isAllBlank(fundName, trustName)) {
//////            return;
//////        }
////        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
////        stripper.setSortByPosition(true);
////        stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
////        stripper.extractRegions(page);
////
////        PDFStreamEngine engine = new PDFTextStripper();
////        engine.addOperator(new SetMatrix(stripper));
////
////    }
////
////    /**
////     * Recursively rewrites the content streams reachable from the given resources:
////     * each form XObject and each tiling pattern gets its stream replaced by the token list
////     * produced by {@code createTokensWithoutText}, and its own resources are processed in turn.
////     *
////     * @param resources resource dictionary to walk
////     * @throws IOException declared for stream parsing and writing
////     */
////    private static void processResources(PDResources resources) throws IOException {
////        for (COSName name : resources.getXObjectNames()) {
////            PDXObject xobject = resources.getXObject(name);
////            if (xobject instanceof PDFormXObject) {
////                PDFormXObject formXObject = (PDFormXObject) xobject;
////                writeTokensToStream(formXObject.getContentStream(),
////                        createTokensWithoutText(formXObject));
////                processResources(formXObject.getResources());
////            }
////        }
////        for (COSName name : resources.getPatternNames()) {
////            PDAbstractPattern pattern = resources.getPattern(name);
////            if (pattern instanceof PDTilingPattern) {
////                PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
////                writeTokensToStream(tilingPattern.getContentStream(),
////                        createTokensWithoutText(tilingPattern));
////                processResources(tilingPattern.getResources());
////            }
////        }
////    }
////
////    /**
////     * Overwrites the given stream with the supplied tokens, written through a
////     * Flate-encoded output stream that is closed when writing finishes.
////     *
////     * @param newContents stream to overwrite
////     * @param newTokens   content-stream tokens to write
////     * @throws IOException declared for the stream write
////     */
////    private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
////        try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
////            ContentStreamWriter writer = new ContentStreamWriter(out);
////            writer.writeTokens(newTokens);
////        }
////    }
////
////    /**
////     * Parses the content stream and returns its tokens with every {@code SET_MATRIX}
////     * operator dropped, together with the single token that precedes it.
////     * <p>
////     * NOTE(review): despite the method name, only the {@code SET_MATRIX} operator is filtered;
////     * other text operators are kept.
////     *
////     * @param contentStream stream to parse
////     * @return the filtered token list
////     * @throws IOException declared for token parsing
////     */
////    private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
////        PDFStreamParser parser = new PDFStreamParser(contentStream);
////        Object token = parser.parseNextToken();
////        List<Object> newTokens = new ArrayList<>();
////        while (token != null) {
////            if (token instanceof Operator op) {
////                String opName = op.getName();
////                if (OperatorName.SET_MATRIX.equals(opName)) {
////                    // remove the argument to this operator
////                    // NOTE(review): only one preceding token is removed; if the operator takes several
////                    // operands the others stay in the output - confirm. This also assumes at least
////                    // one token has already been collected.
////                    newTokens.remove(newTokens.size() - 1);
////
////                    token = parser.parseNextToken();
////                    continue;
////                }
////            }
////            newTokens.add(token);
////            token = parser.parseNextToken();
////        }
////        return newTokens;
////    }
//}