浏览代码

feat:删除pdf报告的部分模板

wangzaijun 7 月之前
父节点
当前提交
aa207b1742

+ 0 - 0
1.pdf


二进制
service-daq/src/main/java/com/simuwang/daq/utils/12932.pdf


二进制
service-daq/src/main/java/com/simuwang/daq/utils/12933.pdf


二进制
service-daq/src/main/java/com/simuwang/daq/utils/14655.pdf


二进制
service-daq/src/main/java/com/simuwang/daq/utils/14916.pdf


二进制
service-daq/src/main/java/com/simuwang/daq/utils/15654.pdf


二进制
service-daq/src/main/java/com/simuwang/daq/utils/15655.pdf


二进制
service-daq/src/main/java/com/simuwang/daq/utils/17847.pdf


二进制
service-daq/src/main/java/com/simuwang/daq/utils/17850.pdf


+ 328 - 328
service-daq/src/main/java/com/simuwang/daq/utils/ReportParseUtil.java

@@ -1,348 +1,348 @@
-package com.simuwang.daq.utils;
-
-import cn.hutool.core.collection.CollUtil;
-import cn.hutool.core.collection.ListUtil;
-import cn.hutool.core.map.MapUtil;
-import cn.hutool.core.util.ReflectUtil;
-import cn.hutool.core.util.StrUtil;
-import com.simuwang.daq.components.CustomPDFTextStripper;
-import com.simuwang.daq.dto.ReportFundInfo;
-import com.smppw.common.pojo.ValueLabelVO;
-import org.apache.pdfbox.Loader;
-import org.apache.pdfbox.contentstream.PDFStreamEngine;
-import org.apache.pdfbox.contentstream.operator.text.ShowText;
-import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.PDResources;
-import org.apache.pdfbox.pdmodel.common.PDStream;
-import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
-import org.apache.pdfbox.pdmodel.graphics.PDXObject;
-import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
-import org.apache.pdfbox.text.PDFTextStripper;
-import org.apache.pdfbox.text.PDFTextStripperByArea;
-import org.apache.pdfbox.text.TextPosition;
-import org.apache.pdfbox.util.Matrix;
-import technology.tabula.*;
-import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
-
-import java.awt.geom.Rectangle2D;
-import java.io.IOException;
-import java.util.*;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-
-public class ReportParseUtil {
-    public static void main(String[] args) throws IOException {
-        List<ValueLabelVO> fieldMapper = ListUtil.list(false);
-        fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
-        fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
-        fieldMapper.add(new ValueLabelVO("operationType", "基金运作方式"));
-        fieldMapper.add(new ValueLabelVO("fundType", "基金类别"));
-        fieldMapper.add(new ValueLabelVO("inceptionDate", "基金成立日期"));
-        fieldMapper.add(new ValueLabelVO("trustName", "基金托管人"));
-        fieldMapper.add(new ValueLabelVO("custodianName", "基金管理人"));
-        fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
-        fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
-
-        Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
-        List<String> watermarks = watermarkMap.get("less");
-
-//        System.out.println(watermarks);
-//        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
-        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf"))) {
-//            PDFTextStripper stripper = new PDFTextStripper();
-//            stripper.setSortByPosition(true);
-//            String allText = stripper.getText(document);
-//            List<String> textList = StrUtil.split(allText, "\r\n");
-//            System.out.println(textList);
-
-            PDFTextStripper textStripper = new CustomPDFTextStripper();
-            textStripper.setSortByPosition(true);
-            String text1 = textStripper.getText(document);
-            text1 = text1.replace("+\r\n", "").replace("+","");
-            List<String> textList = StrUtil.split(text1, "\r\n");
-            System.out.println(textList.get(0));
-
-//            for (PDPage page : document.getPages()) {
+//package com.simuwang.daq.utils;
 //
-////                PDResources resources = page.getResources();
-////                Map<COSName, PDImageXObject> imageXObjectMap = findImageWatermark(page);
-////                Iterator<COSName> iterator = resources.getXObjectNames().iterator();
-////                while (iterator.hasNext()) {
-////                    COSName next = iterator.next();
-////                    if (imageXObjectMap.containsKey(next)) {
-////                        iterator.remove();
-////                    }
+//import cn.hutool.core.collection.CollUtil;
+//import cn.hutool.core.collection.ListUtil;
+//import cn.hutool.core.map.MapUtil;
+//import cn.hutool.core.util.ReflectUtil;
+//import cn.hutool.core.util.StrUtil;
+//import com.simuwang.daq.components.CustomPDFTextStripper;
+//import com.simuwang.daq.dto.ReportFundInfo;
+//import com.smppw.common.pojo.ValueLabelVO;
+//import org.apache.pdfbox.Loader;
+//import org.apache.pdfbox.contentstream.PDFStreamEngine;
+//import org.apache.pdfbox.contentstream.operator.text.ShowText;
+//import org.apache.pdfbox.cos.COSName;
+//import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
+//import org.apache.pdfbox.pdmodel.PDDocument;
+//import org.apache.pdfbox.pdmodel.PDPage;
+//import org.apache.pdfbox.pdmodel.PDResources;
+//import org.apache.pdfbox.pdmodel.common.PDStream;
+//import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
+//import org.apache.pdfbox.pdmodel.graphics.PDXObject;
+//import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+//import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
+//import org.apache.pdfbox.text.PDFTextStripper;
+//import org.apache.pdfbox.text.PDFTextStripperByArea;
+//import org.apache.pdfbox.text.TextPosition;
+//import org.apache.pdfbox.util.Matrix;
+//import technology.tabula.*;
+//import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+//
+//import java.awt.geom.Rectangle2D;
+//import java.io.IOException;
+//import java.util.*;
+//import java.util.regex.Matcher;
+//import java.util.regex.Pattern;
+//import java.util.stream.Collectors;
+//
+//public class ReportParseUtil {
+//    public static void main(String[] args) throws IOException {
+//        List<ValueLabelVO> fieldMapper = ListUtil.list(false);
+//        fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
+//        fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
+//        fieldMapper.add(new ValueLabelVO("operationType", "基金运作方式"));
+//        fieldMapper.add(new ValueLabelVO("fundType", "基金类别"));
+//        fieldMapper.add(new ValueLabelVO("inceptionDate", "基金成立日期"));
+//        fieldMapper.add(new ValueLabelVO("trustName", "基金托管人"));
+//        fieldMapper.add(new ValueLabelVO("custodianName", "基金管理人"));
+//        fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
+//        fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
+//
+//        Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
+//        List<String> watermarks = watermarkMap.get("less");
+//
+////        System.out.println(watermarks);
+////        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
+//        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf"))) {
+////            PDFTextStripper stripper = new PDFTextStripper();
+////            stripper.setSortByPosition(true);
+////            String allText = stripper.getText(document);
+////            List<String> textList = StrUtil.split(allText, "\r\n");
+////            System.out.println(textList);
+//
+//            PDFTextStripper textStripper = new CustomPDFTextStripper();
+//            textStripper.setSortByPosition(true);
+//            String text1 = textStripper.getText(document);
+//            text1 = text1.replace("+\r\n", "").replace("+","");
+//            List<String> textList = StrUtil.split(text1, "\r\n");
+//            System.out.println(textList.get(0));
+//
+////            for (PDPage page : document.getPages()) {
+////
+//////                PDResources resources = page.getResources();
+//////                Map<COSName, PDImageXObject> imageXObjectMap = findImageWatermark(page);
+//////                Iterator<COSName> iterator = resources.getXObjectNames().iterator();
+//////                while (iterator.hasNext()) {
+//////                    COSName next = iterator.next();
+//////                    if (imageXObjectMap.containsKey(next)) {
+//////                        iterator.remove();
+//////                    }
+//////                }
+//////                removeTextWatermark(page);
+////
+////                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
+////                stripper.setSortByPosition(true);
+////                stripper.addRegion("page", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
+////                stripper.extractRegions(page);
+////                for (String region : stripper.getRegions()) {
+////                    String text = stripper.getTextForRegion(region);
+////                    String res = processString(watermarks, text);
+////                    System.out.println("原数据:" + text + ", 去除水印后数据:" + res);
 ////                }
-////                removeTextWatermark(page);
+////            }
+////            document.save(new File("./1.pdf"));
 //
-//                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
-//                stripper.setSortByPosition(true);
-//                stripper.addRegion("page", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
-//                stripper.extractRegions(page);
-//                for (String region : stripper.getRegions()) {
-//                    String text = stripper.getTextForRegion(region);
-//                    String res = processString(watermarks, text);
-//                    System.out.println("原数据:" + text + ", 去除水印后数据:" + res);
+//            SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
+//            PageIterator pageIterator = new ObjectExtractor(document).extract();
+//            while (pageIterator.hasNext()) {
+//                Page page = pageIterator.next();
+//                List<Table> tables = extractionAlgorithm.extract(page);
+//                tables = tables.stream().distinct().collect(Collectors.toList());
+//                for (Table table : tables) {
+//                    if (table.getColCount() == 4) {
+//                        Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
+//                        for (int i = 0; i < table.getRows().size(); i++) {
+//                            List<RectangularTextContainer> cols = table.getRows().get(i);
+//                            for (int j = 0; j < 2; j++) {
+//                                baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
+//                            }
+//                        }
+//                        ReportFundInfo reportFundInfo = new ReportFundInfo();
+//                        baseInfoMap.forEach((k, v) -> {
+//                            for (ValueLabelVO vo : fieldMapper) {
+//                                String fieldName = vo.getValue();
+//                                List<String> labels = StrUtil.split(vo.getLabel(), ",");
+//                                if (labels.contains(k)) {
+//                                    ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
+//                                    break;
+//                                }
+//                                for (String label : labels) {
+//                                    if (k.contains(label)) {
+//                                        ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
+//                                        break;
+//                                    }
+//                                }
+//                            }
+//                        });
+//                        System.out.println(reportFundInfo);
+//                    }
 //                }
 //            }
-//            document.save(new File("./1.pdf"));
-
-            SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
-            PageIterator pageIterator = new ObjectExtractor(document).extract();
-            while (pageIterator.hasNext()) {
-                Page page = pageIterator.next();
-                List<Table> tables = extractionAlgorithm.extract(page);
-                tables = tables.stream().distinct().collect(Collectors.toList());
-                for (Table table : tables) {
-                    if (table.getColCount() == 4) {
-                        Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
-                        for (int i = 0; i < table.getRows().size(); i++) {
-                            List<RectangularTextContainer> cols = table.getRows().get(i);
-                            for (int j = 0; j < 2; j++) {
-                                baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
-                            }
-                        }
-                        ReportFundInfo reportFundInfo = new ReportFundInfo();
-                        baseInfoMap.forEach((k, v) -> {
-                            for (ValueLabelVO vo : fieldMapper) {
-                                String fieldName = vo.getValue();
-                                List<String> labels = StrUtil.split(vo.getLabel(), ",");
-                                if (labels.contains(k)) {
-                                    ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
-                                    break;
-                                }
-                                for (String label : labels) {
-                                    if (k.contains(label)) {
-                                        ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
-                                        break;
-                                    }
-                                }
-                            }
-                        });
-                        System.out.println(reportFundInfo);
-                    }
-                }
-            }
-        }
-    }
-
-    /**
-     * 找图片水印
-     *
-     * @param page
-     * @return
-     * @throws IOException
-     */
-    public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
-        Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
-        PDResources resources = page.getResources();
-        Iterable<COSName> xObjectNames = resources.getXObjectNames();
-        for (COSName xObjectName : xObjectNames) {
-            PDXObject xObject = resources.getXObject(xObjectName);
-            PDStream stream = xObject.getStream();
-            PDImageXObject imageXObject = null;
-            try {
-                imageXObject = new PDImageXObject(stream, resources);
-            } catch (Exception e) {
-                e.printStackTrace();
-            }
-            if (imageXObject != null) {
-                watermarkMap.put(xObjectName, imageXObject);
-            }
-        }
-        return watermarkMap;
-    }
-
-    private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
-        Map<String, List<String>> result = MapUtil.newHashMap(32);
-        // 生成水印列表
-
-        fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
-        trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
-        registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
-        String text = fundName + trustName + registerNumber;
-        text = text.replaceAll("[()]", ""); // 移除括号
-        List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
-        Collections.reverse(textList);
-        StringBuilder sb = new StringBuilder(textList.size());
-        for (String ch : textList) {
-            sb.append(ch);
-        }
-        String joinedText = sb.toString();
-
-        // 基本水印列表
-        List<String> wkList = new ArrayList<>();
-        for (String ch : textList) {
-            wkList.add(ch + "\r\n");
-            wkList.add("\r\n" + ch);
-        }
-
-        // 查找数字
-        List<String> matches = findDigits(fundName);
-        if (!matches.isEmpty()) {
-            for (String match : matches) {
-                wkList.add("\r\n" + match);
-                wkList.add(match + "\r\n");
-            }
-        }
-        wkList.add("-");
-        wkList.add("【");
-        wkList.add("】");
-        wkList.add("\r");
-        wkList.add("\r\n");
-
-        String noNumberText = removeDigits(joinedText);
-
-        // 生成不同字段的水印列表
-        result.put("report_name", new ArrayList<>(wkList));
-        result.get("report_name").addAll(convertStringToList("有限公司"));
-
-        result.put("less", new ArrayList<>(wkList));
-
-        result.put("more", new ArrayList<>(wkList));
-        result.get("more").addAll(convertStringToList(noNumberText));
-
-        result.put("leverage", new ArrayList<>(wkList));
-        result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
-
-        result.put("base_info", new ArrayList<>(wkList));
-        result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
-
-        result.put("industry", new ArrayList<>(wkList));
-        result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
-
-        result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
-        return result;
-    }
-
-    private static List<String> findDigits(String text) {
-        List<String> digits = new ArrayList<>();
-        Pattern pattern = Pattern.compile("\\d");
-        Matcher matcher = pattern.matcher(text);
-        while (matcher.find()) {
-            digits.add(matcher.group());
-        }
-        return digits;
-    }
-
-    private static String removeDigits(String text) {
-        return text.replaceAll("\\d", "");
-    }
-
-    private static String removeKeywords(String text, String... keywords) {
-        for (String keyword : keywords) {
-            text = text.replaceAll(keyword, "");
-        }
-        return text;
-    }
-
-    private static List<String> convertStringToList(String text) {
-        List<String> charList = new ArrayList<>();
-        for (char c : text.toCharArray()) {
-            charList.add(c + "");
-        }
-        return charList;
-    }
-
-    public static String processString(List<String> wmList, String string) {
-        // 生成正则表达式模式
-        String pat = String.join("|", wmList);
-        // 使用正则表达式移除wmList中的元素
-        string = removeMatches(string, pat);
-        // 替换中文括号为英文括号
-        string = string.replace("(", "(").replace(")", ")");
-        // 移除空格
-        string = string.replace(" ", "");
-        // 如果字符串以括号开头,则移除第一个字符
-        if (startsWithParenthesis(string)) {
-            string = string.substring(1);
-        }
-
-        return string;
-    }
-
-    private static String removeMatches(String input, String pattern) {
-        // 编译正则表达式
-        Pattern compiledPattern = Pattern.compile(pattern);
-        // 创建Matcher对象
-        Matcher matcher = compiledPattern.matcher(input);
-        // 使用replaceAll方法替换所有匹配到的字符为空字符串
-        return matcher.replaceAll("");
-    }
-
-    private static boolean startsWithParenthesis(String input) {
-        // 匹配以括号开头的字符串
-        Pattern pattern = Pattern.compile("^[()].*");
-        Matcher matcher = pattern.matcher(input);
-        return matcher.find();
-    }
-
-//    public static void removeTextWatermark(PDPage page) throws IOException {
+//        }
+//    }
+//
+//    /**
+//     * 找图片水印
+//     *
+//     * @param page
+//     * @return
+//     * @throws IOException
+//     */
+//    public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
+//        Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
 //        PDResources resources = page.getResources();
-////        if (StrUtil.isAllBlank(fundName, trustName)) {
-////            return;
-////        }
-//        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
-//        stripper.setSortByPosition(true);
-//        stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
-//        stripper.extractRegions(page);
+//        Iterable<COSName> xObjectNames = resources.getXObjectNames();
+//        for (COSName xObjectName : xObjectNames) {
+//            PDXObject xObject = resources.getXObject(xObjectName);
+//            PDStream stream = xObject.getStream();
+//            PDImageXObject imageXObject = null;
+//            try {
+//                imageXObject = new PDImageXObject(stream, resources);
+//            } catch (Exception e) {
+//                e.printStackTrace();
+//            }
+//            if (imageXObject != null) {
+//                watermarkMap.put(xObjectName, imageXObject);
+//            }
+//        }
+//        return watermarkMap;
+//    }
 //
-//        PDFStreamEngine engine = new PDFTextStripper();
-//        engine.addOperator(new SetMatrix(stripper));
+//    private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
+//        Map<String, List<String>> result = MapUtil.newHashMap(32);
+//        // 生成水印列表
 //
-//    }
+//        fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
+//        trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
+//        registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
+//        String text = fundName + trustName + registerNumber;
+//        text = text.replaceAll("[()]", ""); // 移除括号
+//        List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
+//        Collections.reverse(textList);
+//        StringBuilder sb = new StringBuilder(textList.size());
+//        for (String ch : textList) {
+//            sb.append(ch);
+//        }
+//        String joinedText = sb.toString();
 //
-//    private static void processResources(PDResources resources) throws IOException {
-//        for (COSName name : resources.getXObjectNames()) {
-//            PDXObject xobject = resources.getXObject(name);
-//            if (xobject instanceof PDFormXObject) {
-//                PDFormXObject formXObject = (PDFormXObject) xobject;
-//                writeTokensToStream(formXObject.getContentStream(),
-//                        createTokensWithoutText(formXObject));
-//                processResources(formXObject.getResources());
-//            }
+//        // 基本水印列表
+//        List<String> wkList = new ArrayList<>();
+//        for (String ch : textList) {
+//            wkList.add(ch + "\r\n");
+//            wkList.add("\r\n" + ch);
 //        }
-//        for (COSName name : resources.getPatternNames()) {
-//            PDAbstractPattern pattern = resources.getPattern(name);
-//            if (pattern instanceof PDTilingPattern) {
-//                PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
-//                writeTokensToStream(tilingPattern.getContentStream(),
-//                        createTokensWithoutText(tilingPattern));
-//                processResources(tilingPattern.getResources());
+//
+//        // 查找数字
+//        List<String> matches = findDigits(fundName);
+//        if (!matches.isEmpty()) {
+//            for (String match : matches) {
+//                wkList.add("\r\n" + match);
+//                wkList.add(match + "\r\n");
 //            }
 //        }
+//        wkList.add("-");
+//        wkList.add("【");
+//        wkList.add("】");
+//        wkList.add("\r");
+//        wkList.add("\r\n");
+//
+//        String noNumberText = removeDigits(joinedText);
+//
+//        // 生成不同字段的水印列表
+//        result.put("report_name", new ArrayList<>(wkList));
+//        result.get("report_name").addAll(convertStringToList("有限公司"));
+//
+//        result.put("less", new ArrayList<>(wkList));
+//
+//        result.put("more", new ArrayList<>(wkList));
+//        result.get("more").addAll(convertStringToList(noNumberText));
+//
+//        result.put("leverage", new ArrayList<>(wkList));
+//        result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
+//
+//        result.put("base_info", new ArrayList<>(wkList));
+//        result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
+//
+//        result.put("industry", new ArrayList<>(wkList));
+//        result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
+//
+//        result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
+//        return result;
+//    }
+//
+//    private static List<String> findDigits(String text) {
+//        List<String> digits = new ArrayList<>();
+//        Pattern pattern = Pattern.compile("\\d");
+//        Matcher matcher = pattern.matcher(text);
+//        while (matcher.find()) {
+//            digits.add(matcher.group());
+//        }
+//        return digits;
 //    }
 //
-//    private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
-//        try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
-//            ContentStreamWriter writer = new ContentStreamWriter(out);
-//            writer.writeTokens(newTokens);
+//    private static String removeDigits(String text) {
+//        return text.replaceAll("\\d", "");
+//    }
+//
+//    private static String removeKeywords(String text, String... keywords) {
+//        for (String keyword : keywords) {
+//            text = text.replaceAll(keyword, "");
 //        }
+//        return text;
 //    }
 //
-//    private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
-//        PDFStreamParser parser = new PDFStreamParser(contentStream);
-//        Object token = parser.parseNextToken();
-//        List<Object> newTokens = new ArrayList<>();
-//        while (token != null) {
-//            if (token instanceof Operator op) {
-//                String opName = op.getName();
-//                if (OperatorName.SET_MATRIX.equals(opName)) {
-//                    // remove the argument to this operator
-//                    newTokens.remove(newTokens.size() - 1);
+//    private static List<String> convertStringToList(String text) {
+//        List<String> charList = new ArrayList<>();
+//        for (char c : text.toCharArray()) {
+//            charList.add(c + "");
+//        }
+//        return charList;
+//    }
 //
-//                    token = parser.parseNextToken();
-//                    continue;
-//                }
-//            }
-//            newTokens.add(token);
-//            token = parser.parseNextToken();
+//    public static String processString(List<String> wmList, String string) {
+//        // 生成正则表达式模式
+//        String pat = String.join("|", wmList);
+//        // 使用正则表达式移除wmList中的元素
+//        string = removeMatches(string, pat);
+//        // 替换中文括号为英文括号
+//        string = string.replace("(", "(").replace(")", ")");
+//        // 移除空格
+//        string = string.replace(" ", "");
+//        // 如果字符串以括号开头,则移除第一个字符
+//        if (startsWithParenthesis(string)) {
+//            string = string.substring(1);
 //        }
-//        return newTokens;
+//
+//        return string;
+//    }
+//
+//    private static String removeMatches(String input, String pattern) {
+//        // 编译正则表达式
+//        Pattern compiledPattern = Pattern.compile(pattern);
+//        // 创建Matcher对象
+//        Matcher matcher = compiledPattern.matcher(input);
+//        // 使用replaceAll方法替换所有匹配到的字符为空字符串
+//        return matcher.replaceAll("");
 //    }
-}
+//
+//    private static boolean startsWithParenthesis(String input) {
+//        // 匹配以括号开头的字符串
+//        Pattern pattern = Pattern.compile("^[()].*");
+//        Matcher matcher = pattern.matcher(input);
+//        return matcher.find();
+//    }
+//
+////    public static void removeTextWatermark(PDPage page) throws IOException {
+////        PDResources resources = page.getResources();
+//////        if (StrUtil.isAllBlank(fundName, trustName)) {
+//////            return;
+//////        }
+////        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
+////        stripper.setSortByPosition(true);
+////        stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
+////        stripper.extractRegions(page);
+////
+////        PDFStreamEngine engine = new PDFTextStripper();
+////        engine.addOperator(new SetMatrix(stripper));
+////
+////    }
+////
+////    private static void processResources(PDResources resources) throws IOException {
+////        for (COSName name : resources.getXObjectNames()) {
+////            PDXObject xobject = resources.getXObject(name);
+////            if (xobject instanceof PDFormXObject) {
+////                PDFormXObject formXObject = (PDFormXObject) xobject;
+////                writeTokensToStream(formXObject.getContentStream(),
+////                        createTokensWithoutText(formXObject));
+////                processResources(formXObject.getResources());
+////            }
+////        }
+////        for (COSName name : resources.getPatternNames()) {
+////            PDAbstractPattern pattern = resources.getPattern(name);
+////            if (pattern instanceof PDTilingPattern) {
+////                PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
+////                writeTokensToStream(tilingPattern.getContentStream(),
+////                        createTokensWithoutText(tilingPattern));
+////                processResources(tilingPattern.getResources());
+////            }
+////        }
+////    }
+////
+////    private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
+////        try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
+////            ContentStreamWriter writer = new ContentStreamWriter(out);
+////            writer.writeTokens(newTokens);
+////        }
+////    }
+////
+////    private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
+////        PDFStreamParser parser = new PDFStreamParser(contentStream);
+////        Object token = parser.parseNextToken();
+////        List<Object> newTokens = new ArrayList<>();
+////        while (token != null) {
+////            if (token instanceof Operator op) {
+////                String opName = op.getName();
+////                if (OperatorName.SET_MATRIX.equals(opName)) {
+////                    // remove the argument to this operator
+////                    newTokens.remove(newTokens.size() - 1);
+////
+////                    token = parser.parseNextToken();
+////                    continue;
+////                }
+////            }
+////            newTokens.add(token);
+////            token = parser.parseNextToken();
+////        }
+////        return newTokens;
+////    }
+//}