Ver código fonte

fix:删除测试的报告文件

wangzaijun 8 meses atrás
pai
commit
7df4d7a35d

BIN
service-daq/src/main/java/com/simuwang/daq/utils/12931.pdf


BIN
service-daq/src/main/java/com/simuwang/daq/utils/2061834.pdf


+ 0 - 40
service-daq/src/main/java/com/simuwang/daq/utils/ExcelReportParseUtil.java

@@ -1,40 +0,0 @@
-package com.simuwang.daq.utils;
-
-import cn.hutool.core.collection.ListUtil;
-import com.alibaba.excel.EasyExcel;
-import com.alibaba.excel.read.builder.ExcelReaderBuilder;
-import com.simuwang.base.pojo.dto.report.CustomExcelTable;
-import com.simuwang.base.pojo.dto.report.SimpleTable;
-import com.simuwang.daq.components.CustomExcelMultiSheetListener;
-
-import java.util.List;
-
-public class ExcelReportParseUtil { // Ad-hoc driver: reads one hard-coded Excel report through EasyExcel and prints the tables the listener collected
-    public static final String filepath = "D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\SEW698_旭日盛德价值成长一期私募证券投资基金_2023年第4季度报告.xlsx"; // Absolute Windows path; the only input this class reads
-
-    public static void main(String[] args) { // args is not used
-        List<CustomExcelTable> customExcelTables = ListUtil.list(true); // Table descriptors handed to the reader below
-        customExcelTables.add(new CustomExcelTable("fundInfo", "基金基本情况", 2)); // Args: key, title text, then one or two ints - meaning of the ints is not visible here; TODO confirm against CustomExcelTable
-        customExcelTables.add(new CustomExcelTable("financialIndicators", "主要财务指标", 5, 6));
-        customExcelTables.add(new CustomExcelTable("financialIndicators", "级基金主要财务指标", 5, 6)); // Same key as the previous entry, different title text
-        customExcelTables.add(new CustomExcelTable("assetAllocation", "期末基金资产组合情况", 3));
-        customExcelTables.add(new CustomExcelTable("investmentIndustry", "按行业分类的", 4));
-        customExcelTables.add(new CustomExcelTable("investmentIndustry", "报告期末按行业分类的港股通投资股票投资组合", 3)); // Same key as the previous entry, different title text and int
-        customExcelTables.add(new CustomExcelTable("shareChange", "基金份额变动情况", 3, 6));
-        customExcelTables.add(new CustomExcelTable("shareChange", "级基金份额变动情况", 3, 6)); // Same key as the previous entry, different title text
-
-        CustomExcelMultiSheetListener readListener = new CustomExcelMultiSheetListener();
-        ExcelReaderBuilder readerBuilder = EasyExcel.read(filepath); // Reader is bound to the hard-coded file above
-        readerBuilder.sheet(); // Return value is not used here
-        // No header row
-        readerBuilder.headRowNumber(0);
-        readerBuilder.customObject(customExcelTables); // Passes the descriptors as the reader's custom object - presumably consumed by the listener; verify
-        readerBuilder.registerReadListener(readListener);
-        readerBuilder.doReadAll(); // Triggers the read
-
-        List<SimpleTable> tables = readListener.getTables(); // Whatever the listener accumulated during the read
-        for (SimpleTable table : tables) {
-            System.out.println(table); // Dump each table to stdout
-        }
-    }
-}

BIN
service-daq/src/main/java/com/simuwang/daq/utils/PB0001私募月报(证券投资).xlsx


BIN
service-daq/src/main/java/com/simuwang/daq/utils/PB0003私募年报(证券投资).xlsx


+ 0 - 370
service-daq/src/main/java/com/simuwang/daq/utils/ReportParseUtil.java

@@ -1,370 +0,0 @@
-//package com.simuwang.daq.utils;
-//
-//import cn.hutool.core.collection.ListUtil;
-//import cn.hutool.core.map.MapUtil;
-//import cn.hutool.core.util.ReflectUtil;
-//import cn.hutool.core.util.StrUtil;
-//import cn.hutool.http.HttpUtil;
-//import cn.hutool.json.JSONObject;
-//import cn.hutool.json.JSONUtil;
-//import com.simuwang.base.common.conts.Constants;
-//import com.simuwang.base.pojo.dto.report.PythonResult;
-//import com.simuwang.base.pojo.dto.report.ReportFundInfoDTO;
-//import com.simuwang.daq.components.CustomPDFTextStripper;
-//import com.simuwang.daq.components.PythonReportConverter;
-//import com.smppw.common.pojo.ValueLabelVO;
-//import org.apache.pdfbox.Loader;
-//import org.apache.pdfbox.cos.COSName;
-//import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
-//import org.apache.pdfbox.pdmodel.PDDocument;
-//import org.apache.pdfbox.pdmodel.PDPage;
-//import org.apache.pdfbox.pdmodel.PDResources;
-//import org.apache.pdfbox.pdmodel.common.PDStream;
-//import org.apache.pdfbox.pdmodel.graphics.PDXObject;
-//import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
-//import org.apache.pdfbox.text.PDFTextStripper;
-//import technology.tabula.*;
-//import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
-//
-//import java.io.IOException;
-//import java.util.*;
-//import java.util.regex.Matcher;
-//import java.util.regex.Pattern;
-//import java.util.stream.Collectors;
-//
-//public class ReportParseUtil {
-//    public static void main(String[] args) throws IOException {
-////        String fileName = "SJM970_排排精选进取一号私募证券投资基金_2022年第4季度报告.pdf";
-////        Pattern pattern = Pattern.compile("S(?:[A-Z]{0}[0-9]{5}|[A-Z][0-9]{4}|[A-Z]{2}[0-9]{3}|[A-Z]{3}[0-9]{2})");
-////        Matcher matcher = pattern.matcher(fileName);
-////        String registerNumber = null;
-////        if (matcher.find()) {
-////            registerNumber = matcher.group();
-////        }
-////
-////        int type = 1;
-////        String baseUrl = "http://192.168.0.81:8088";
-////        String api = "/api/v1/parse/amac_report";
-////        Map<String, Object> params = MapUtil.newHashMap(16);
-////        params.put("file_id", 111112);
-////        params.put("file_path", "E:/workproject/fastparse/src/fastparse/static/reports/quarterly_report/13445.pdf");
-////        params.put("register_number", registerNumber);
-////        params.put("file_type", type);
-////        params.put("file_name", fileName);
-////        params.put("fund_name", null);
-////        params.put("trust_name", null);
-////        String body = HttpUtil.post(baseUrl + api, JSONUtil.toJsonStr(params));
-////        JSONObject obj = JSONUtil.parseObj(body);
-////        PythonResult<?> result = PythonReportConverter.convert(obj, type);
-////        System.out.println(result);
-//
-//        List<ValueLabelVO> fieldMapper = ListUtil.list(false);
-//        fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
-//        fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
-//        fieldMapper.add(new ValueLabelVO("operationType", "基金运作方式"));
-//        fieldMapper.add(new ValueLabelVO("fundType", "基金类别"));
-//        fieldMapper.add(new ValueLabelVO("inceptionDate", "基金成立日期"));
-//        fieldMapper.add(new ValueLabelVO("trustName", "基金托管人"));
-//        fieldMapper.add(new ValueLabelVO("custodianName", "基金管理人"));
-//        fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
-//        fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
-//
-////        Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
-////        List<String> watermarks = watermarkMap.get("less");
-//
-////        System.out.println(watermarks);
-////        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
-//        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("C:\\Users\\Administrator\\Desktop\\self\\新报告解析\\基协报告\\季报\\SVP311_私募基金季报PDF_国恩回报6号增强私募证券投资基金_2024年06月30日.pdf"))) {
-////            PDFTextStripper stripper = new PDFTextStripper();
-////            stripper.setSortByPosition(true);
-////            String allText = stripper.getText(document);
-////            List<String> textList = StrUtil.split(allText, "\r\n");
-////            System.out.println(textList);
-//
-//            PDFTextStripper textStripper = new CustomPDFTextStripper();
-//            textStripper.setSortByPosition(true);
-//            String text1 = textStripper.getText(document);
-//            text1 = text1.replace(Constants.WATERMARK_REPLACE, Constants.EMPTY);
-//            List<String> textList = StrUtil.split(text1, System.lineSeparator());
-//            textList.removeIf(StrUtil::isBlank);
-//            System.out.println(textList.get(0));
-//
-////            for (PDPage page : document.getPages()) {
-////
-//////                PDResources resources = page.getResources();
-//////                Map<COSName, PDImageXObject> imageXObjectMap = findImageWatermark(page);
-//////                Iterator<COSName> iterator = resources.getXObjectNames().iterator();
-//////                while (iterator.hasNext()) {
-//////                    COSName next = iterator.next();
-//////                    if (imageXObjectMap.containsKey(next)) {
-//////                        iterator.remove();
-//////                    }
-//////                }
-//////                removeTextWatermark(page);
-////
-////                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
-////                stripper.setSortByPosition(true);
-////                stripper.addRegion("page", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
-////                stripper.extractRegions(page);
-////                for (String region : stripper.getRegions()) {
-////                    String text = stripper.getTextForRegion(region);
-////                    String res = processString(watermarks, text);
-////                    System.out.println("原数据:" + text + ", 去除水印后数据:" + res);
-////                }
-////            }
-////            document.save(new File("./1.pdf"));
-//
-//            SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
-//            PageIterator pageIterator = new ObjectExtractor(document).extract();
-//            while (pageIterator.hasNext()) {
-//                Page page = pageIterator.next();
-//                List<Table> tables = extractionAlgorithm.extract(page);
-//                tables = tables.stream().distinct().collect(Collectors.toList());
-//                for (Table table : tables) {
-//                    if (table.getColCount() == 4) {
-//                        Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
-//                        for (int i = 0; i < table.getRows().size(); i++) {
-//                            List<RectangularTextContainer> cols = table.getRows().get(i);
-//                            for (int j = 0; j < 2; j++) {
-//                                baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
-//                            }
-//                        }
-//                        ReportFundInfoDTO reportFundInfo = new ReportFundInfoDTO();
-//                        baseInfoMap.forEach((k, v) -> {
-//                            for (ValueLabelVO vo : fieldMapper) {
-//                                String fieldName = vo.getValue();
-//                                List<String> labels = StrUtil.split(vo.getLabel(), ",");
-//                                if (labels.contains(k)) {
-//                                    ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
-//                                    break;
-//                                }
-//                                for (String label : labels) {
-//                                    if (k.contains(label)) {
-//                                        ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
-//                                        break;
-//                                    }
-//                                }
-//                            }
-//                        });
-//                        System.out.println(reportFundInfo);
-//                    }
-//                }
-//            }
-//        }
-//    }
-//
-//    /**
-//     * 找图片水印
-//     *
-//     * @param page
-//     * @return
-//     * @throws IOException
-//     */
-//    public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
-//        Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
-//        PDResources resources = page.getResources();
-//        Iterable<COSName> xObjectNames = resources.getXObjectNames();
-//        for (COSName xObjectName : xObjectNames) {
-//            PDXObject xObject = resources.getXObject(xObjectName);
-//            PDStream stream = xObject.getStream();
-//            PDImageXObject imageXObject = null;
-//            try {
-//                imageXObject = new PDImageXObject(stream, resources);
-//            } catch (Exception e) {
-//                e.printStackTrace();
-//            }
-//            if (imageXObject != null) {
-//                watermarkMap.put(xObjectName, imageXObject);
-//            }
-//        }
-//        return watermarkMap;
-//    }
-//
-//    private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
-//        Map<String, List<String>> result = MapUtil.newHashMap(32);
-//        // 生成水印列表
-//
-//        fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
-//        trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
-//        registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
-//        String text = fundName + trustName + registerNumber;
-//        text = text.replaceAll("[()]", ""); // 移除括号
-//        List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
-//        Collections.reverse(textList);
-//        StringBuilder sb = new StringBuilder(textList.size());
-//        for (String ch : textList) {
-//            sb.append(ch);
-//        }
-//        String joinedText = sb.toString();
-//
-//        // 基本水印列表
-//        List<String> wkList = new ArrayList<>();
-//        for (String ch : textList) {
-//            wkList.add(ch + "\r\n");
-//            wkList.add("\r\n" + ch);
-//        }
-//
-//        // 查找数字
-//        List<String> matches = findDigits(fundName);
-//        if (!matches.isEmpty()) {
-//            for (String match : matches) {
-//                wkList.add("\r\n" + match);
-//                wkList.add(match + "\r\n");
-//            }
-//        }
-//        wkList.add("-");
-//        wkList.add("【");
-//        wkList.add("】");
-//        wkList.add("\r");
-//        wkList.add("\r\n");
-//
-//        String noNumberText = removeDigits(joinedText);
-//
-//        // 生成不同字段的水印列表
-//        result.put("report_name", new ArrayList<>(wkList));
-//        result.get("report_name").addAll(convertStringToList("有限公司"));
-//
-//        result.put("less", new ArrayList<>(wkList));
-//
-//        result.put("more", new ArrayList<>(wkList));
-//        result.get("more").addAll(convertStringToList(noNumberText));
-//
-//        result.put("leverage", new ArrayList<>(wkList));
-//        result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
-//
-//        result.put("base_info", new ArrayList<>(wkList));
-//        result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
-//
-//        result.put("industry", new ArrayList<>(wkList));
-//        result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
-//
-//        result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
-//        return result;
-//    }
-//
-//    private static List<String> findDigits(String text) {
-//        List<String> digits = new ArrayList<>();
-//        Pattern pattern = Pattern.compile("\\d");
-//        Matcher matcher = pattern.matcher(text);
-//        while (matcher.find()) {
-//            digits.add(matcher.group());
-//        }
-//        return digits;
-//    }
-//
-//    private static String removeDigits(String text) {
-//        return text.replaceAll("\\d", "");
-//    }
-//
-//    private static String removeKeywords(String text, String... keywords) {
-//        for (String keyword : keywords) {
-//            text = text.replaceAll(keyword, "");
-//        }
-//        return text;
-//    }
-//
-//    private static List<String> convertStringToList(String text) {
-//        List<String> charList = new ArrayList<>();
-//        for (char c : text.toCharArray()) {
-//            charList.add(c + "");
-//        }
-//        return charList;
-//    }
-//
-//    public static String processString(List<String> wmList, String string) {
-//        // 生成正则表达式模式
-//        String pat = String.join("|", wmList);
-//        // 使用正则表达式移除wmList中的元素
-//        string = removeMatches(string, pat);
-//        // 替换中文括号为英文括号
-//        string = string.replace("(", "(").replace(")", ")");
-//        // 移除空格
-//        string = string.replace(" ", "");
-//        // 如果字符串以括号开头,则移除第一个字符
-//        if (startsWithParenthesis(string)) {
-//            string = string.substring(1);
-//        }
-//
-//        return string;
-//    }
-//
-//    private static String removeMatches(String input, String pattern) {
-//        // 编译正则表达式
-//        Pattern compiledPattern = Pattern.compile(pattern);
-//        // 创建Matcher对象
-//        Matcher matcher = compiledPattern.matcher(input);
-//        // 使用replaceAll方法替换所有匹配到的字符为空字符串
-//        return matcher.replaceAll("");
-//    }
-//
-//    private static boolean startsWithParenthesis(String input) {
-//        // 匹配以括号开头的字符串
-//        Pattern pattern = Pattern.compile("^[()].*");
-//        Matcher matcher = pattern.matcher(input);
-//        return matcher.find();
-//    }
-//
-////    public static void removeTextWatermark(PDPage page) throws IOException {
-////        PDResources resources = page.getResources();
-//////        if (StrUtil.isAllBlank(fundName, trustName)) {
-//////            return;
-//////        }
-////        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
-////        stripper.setSortByPosition(true);
-////        stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
-////        stripper.extractRegions(page);
-////
-////        PDFStreamEngine engine = new PDFTextStripper();
-////        engine.addOperator(new SetMatrix(stripper));
-////
-////    }
-////
-////    private static void processResources(PDResources resources) throws IOException {
-////        for (COSName name : resources.getXObjectNames()) {
-////            PDXObject xobject = resources.getXObject(name);
-////            if (xobject instanceof PDFormXObject) {
-////                PDFormXObject formXObject = (PDFormXObject) xobject;
-////                writeTokensToStream(formXObject.getContentStream(),
-////                        createTokensWithoutText(formXObject));
-////                processResources(formXObject.getResources());
-////            }
-////        }
-////        for (COSName name : resources.getPatternNames()) {
-////            PDAbstractPattern pattern = resources.getPattern(name);
-////            if (pattern instanceof PDTilingPattern) {
-////                PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
-////                writeTokensToStream(tilingPattern.getContentStream(),
-////                        createTokensWithoutText(tilingPattern));
-////                processResources(tilingPattern.getResources());
-////            }
-////        }
-////    }
-////
-////    private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
-////        try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
-////            ContentStreamWriter writer = new ContentStreamWriter(out);
-////            writer.writeTokens(newTokens);
-////        }
-////    }
-////
-////    private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
-////        PDFStreamParser parser = new PDFStreamParser(contentStream);
-////        Object token = parser.parseNextToken();
-////        List<Object> newTokens = new ArrayList<>();
-////        while (token != null) {
-////            if (token instanceof Operator op) {
-////                String opName = op.getName();
-////                if (OperatorName.SET_MATRIX.equals(opName)) {
-////                    // remove the argument to this operator
-////                    newTokens.remove(newTokens.size() - 1);
-////
-////                    token = parser.parseNextToken();
-////                    continue;
-////                }
-////            }
-////            newTokens.add(token);
-////            token = parser.parseNextToken();
-////        }
-////        return newTokens;
-////    }
-//}

BIN
service-daq/src/main/java/com/simuwang/daq/utils/SEW698_旭日盛德价值成长一期私募证券投资基金_2023年第4季度报告.xlsx


BIN
service-daq/src/main/java/com/simuwang/daq/utils/ST8112_瑞富兴康稳健成长一号私募证券投资基金_信息披露月报.xlsx