Procházet zdrojové kódy

feat:月报解析先停止,开启用户管理功能开发

wangzaijun před 7 měsíci
rodič
revize
d082a45189
22 změnil soubory, kde provedl 1210 přidání a 67 odebrání
  1. 0 0
      1.pdf
  2. 24 3
      pom.xml
  3. 0 24
      service-base/pom.xml
  4. 2 2
      service-daq/pom.xml
  5. 181 0
      service-daq/src/main/java/com/simuwang/daq/components/AbstractReportParser.java
  6. 42 0
      service-daq/src/main/java/com/simuwang/daq/components/CustomPDFTextStripper.java
  7. 258 0
      service-daq/src/main/java/com/simuwang/daq/components/PDMonthlyReportParser.java
  8. 18 0
      service-daq/src/main/java/com/simuwang/daq/components/ReportParser.java
  9. 18 0
      service-daq/src/main/java/com/simuwang/daq/dto/ReportFileType.java
  10. 291 0
      service-daq/src/main/java/com/simuwang/daq/dto/ReportFundInfo.java
  11. 54 0
      service-daq/src/main/java/com/simuwang/daq/dto/ReportInfo.java
  12. 0 13
      service-daq/src/main/java/com/simuwang/daq/service/ReportParser.java
  13. binární
      service-daq/src/main/java/com/simuwang/daq/utils/12931.pdf
  14. binární
      service-daq/src/main/java/com/simuwang/daq/utils/12932.pdf
  15. binární
      service-daq/src/main/java/com/simuwang/daq/utils/12933.pdf
  16. binární
      service-daq/src/main/java/com/simuwang/daq/utils/14655.pdf
  17. binární
      service-daq/src/main/java/com/simuwang/daq/utils/14916.pdf
  18. binární
      service-daq/src/main/java/com/simuwang/daq/utils/15654.pdf
  19. binární
      service-daq/src/main/java/com/simuwang/daq/utils/15655.pdf
  20. binární
      service-daq/src/main/java/com/simuwang/daq/utils/17847.pdf
  21. binární
      service-daq/src/main/java/com/simuwang/daq/utils/17850.pdf
  22. 322 25
      service-daq/src/main/java/com/simuwang/daq/utils/ReportParseUtil.java

+ 0 - 0
1.pdf


+ 24 - 3
pom.xml

@@ -27,6 +27,7 @@
         <java-jwt.version>4.4.0</java-jwt.version>
         <jjwt.version>0.12.6</jjwt.version>
         <apahce-pdfbox.version>3.0.3</apahce-pdfbox.version>
+        <tabula.version>1.0.5</tabula.version>
     </properties>
 
     <modules>
@@ -158,11 +159,31 @@
                 <groupId>org.apache.pdfbox</groupId>
                 <artifactId>pdfbox</artifactId>
                 <version>${apahce-pdfbox.version}</version>
+                <exclusions>
+                    <exclusion>
+                        <groupId>org.slf4j</groupId>
+                        <artifactId>slf4j-simple</artifactId>
+                    </exclusion>
+                </exclusions>
             </dependency>
             <dependency>
-                <groupId>org.apache.pdfbox</groupId>
-                <artifactId>pdfbox-io</artifactId>
-                <version>${apahce-pdfbox.version}</version>
+                <groupId>technology.tabula</groupId>
+                <artifactId>tabula</artifactId>
+                <version>${tabula.version}</version>
+                <exclusions>
+                    <exclusion>
+                        <groupId>org.apache.pdfbox</groupId>
+                        <artifactId>pdfbox</artifactId>
+                    </exclusion>
+                    <exclusion>
+                        <groupId>org.apache.pdfbox</groupId>
+                        <artifactId>pdfbox-io</artifactId>
+                    </exclusion>
+                    <exclusion>
+                        <groupId>org.slf4j</groupId>
+                        <artifactId>slf4j-simple</artifactId>
+                    </exclusion>
+                </exclusions>
             </dependency>
 
             <!-- 内部模块 -->

+ 0 - 24
service-base/pom.xml

@@ -174,30 +174,6 @@
             <groupId>org.springframework.boot</groupId>
             <artifactId>spring-boot-starter-quartz</artifactId>
         </dependency>
-
-        <dependency>
-            <groupId>org.apache.pdfbox</groupId>
-            <artifactId>pdfbox</artifactId>
-            <version>3.0.1</version>
-            <exclusions>
-                <exclusion>
-                    <groupId>org.slf4j</groupId>
-                    <artifactId>slf4j-simple</artifactId>
-                </exclusion>
-            </exclusions>
-        </dependency>
-
-        <dependency>
-            <groupId>technology.tabula</groupId>
-            <artifactId>tabula</artifactId>
-            <version>1.0.5</version>
-            <exclusions>
-                <exclusion>
-                    <groupId>org.slf4j</groupId>
-                    <artifactId>slf4j-simple</artifactId>
-                </exclusion>
-            </exclusions>
-        </dependency>
     </dependencies>
 
 <!--    <build>-->

+ 2 - 2
service-daq/pom.xml

@@ -24,8 +24,8 @@
             <artifactId>pdfbox</artifactId>
         </dependency>
         <dependency>
-            <groupId>org.apache.pdfbox</groupId>
-            <artifactId>pdfbox-io</artifactId>
+            <groupId>technology.tabula</groupId>
+            <artifactId>tabula</artifactId>
         </dependency>
     </dependencies>
 

+ 181 - 0
service-daq/src/main/java/com/simuwang/daq/components/AbstractReportParser.java

@@ -0,0 +1,181 @@
+package com.simuwang.daq.components;
+
+import cn.hutool.core.exceptions.ExceptionUtil;
+import cn.hutool.core.map.MapUtil;
+import cn.hutool.core.util.StrUtil;
+import com.simuwang.daq.dto.ReportFundInfo;
+import com.simuwang.daq.dto.ReportInfo;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.util.StopWatch;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Template-method skeleton shared by report parsers.
+ * <p>
+ * {@link #parse(Integer, String, String)} stores the file path, derives the per-field watermark
+ * fragment lists from the watermark name and then runs the parsing steps supplied by subclasses.
+ * The file path and the watermark lists live in instance fields, so a single instance must not
+ * run two parses at the same time.
+ */
+public abstract class AbstractReportParser implements ReportParser {
+    /** Matches one decimal digit; compiled once instead of on every call. */
+    private static final Pattern DIGIT_PATTERN = Pattern.compile("\\d");
+
+    protected final Logger logger = LoggerFactory.getLogger(this.getClass());
+    /** Path of the report being parsed; assigned at the start of every {@code parse} call. */
+    protected String filepath;
+    /** Watermark fragments to strip from extracted text, keyed by the field group they apply to. */
+    protected Map<String, List<String>> watermarkListMap;
+
+    /**
+     * Runs the full parsing pipeline for one report file and logs the elapsed time.
+     * Exceptions thrown by the parsing steps are logged and not propagated.
+     *
+     * @param fileId        file id, handed to {@link #parseReportInfo(Integer)}
+     * @param filepath      path of the file to parse
+     * @param watermarkName watermark text used to build the fragment lists; {@code null} is treated as empty
+     */
+    @Override
+    public void parse(Integer fileId, String filepath, String watermarkName) {
+        StopWatch watch = new StopWatch();
+        watch.start();
+        if (this.logger.isInfoEnabled()) {
+            this.logger.info("报告{} 开始解析!", filepath);
+        }
+        this.filepath = filepath;
+        this.watermarkListMap = this.generateWatermarkMap(watermarkName);
+        try {
+            this.initParse();
+            // The two results are not consumed here yet; subclasses keep whatever they need for saveResult().
+            ReportInfo reportInfo = this.parseReportInfo(fileId);
+            ReportFundInfo reportFundInfo = this.parseBaseInfo();
+            this.parseExtInfo();
+        } catch (Exception e) {
+            this.logger.error("报告{} 解析错误\n{}", filepath, ExceptionUtil.stacktraceToString(e));
+        }
+        // NOTE(review): saveResult() also runs after a failed parse - confirm that this is intended.
+        this.saveResult();
+        watch.stop();
+        if (this.logger.isInfoEnabled()) {
+            this.logger.info("报告{} 解析结束!耗时:{}s", filepath, watch.getTotalTime(TimeUnit.SECONDS));
+        }
+    }
+
+
+    /** Prepares the parser for the file at {@link #filepath}. */
+    protected abstract void initParse() throws IOException;
+
+    /** Builds the report-level information for the given file id. */
+    protected abstract ReportInfo parseReportInfo(Integer fileId);
+
+    /** Builds the fund information of the report. */
+    protected abstract ReportFundInfo parseBaseInfo();
+
+    /** Parses any additional information of the report. */
+    protected abstract void parseExtInfo();
+
+    /** Persists the parsed result; called after the parsing steps, even when they failed. */
+    protected abstract void saveResult();
+
+    /**
+     * Builds the watermark fragment lists for every field group.
+     *
+     * @param watermarkName watermark text; {@code null} is treated as an empty string
+     * @return map from field-group key to the fragments that should be removed from that group's text
+     */
+    private Map<String, List<String>> generateWatermarkMap(String watermarkName) {
+        // Guard against a missing watermark name instead of failing with a NullPointerException.
+        String rawName = watermarkName == null ? "" : watermarkName;
+        Map<String, List<String>> result = MapUtil.newHashMap(32);
+        // Remove parentheses, both the half-width and the full-width forms.
+        String text = rawName.replaceAll("[()\uFF08\uFF09]", "");
+        // Distinct characters of the name, in reversed set-iteration order.
+        List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
+        Collections.reverse(textList);
+        String joinedText = String.join("", textList);
+
+        // Base list: every character glued to a line break on either side.
+        List<String> wkList = new ArrayList<>();
+        for (String ch : textList) {
+            wkList.add(ch + "\r\n");
+            wkList.add("\r\n" + ch);
+        }
+
+        // Every digit occurrence of the original name, again glued to a line break.
+        for (String match : findDigits(rawName)) {
+            wkList.add("\r\n" + match);
+            wkList.add(match + "\r\n");
+        }
+        wkList.add("-");
+        wkList.add("【");
+        wkList.add("】");
+        wkList.add("\r");
+        wkList.add("\n");
+        wkList.add("\r\n");
+
+        String noNumberText = removeDigits(joinedText);
+
+        // Per-field lists: the base list plus field-specific single characters.
+        result.put("report_name", new ArrayList<>(wkList));
+        result.get("report_name").addAll(convertStringToList("有限公司"));
+
+        result.put("less", new ArrayList<>(wkList));
+
+        result.put("more", new ArrayList<>(wkList));
+        result.get("more").addAll(convertStringToList(noNumberText));
+
+        result.put("leverage", new ArrayList<>(wkList));
+        result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
+
+        result.put("base_info", new ArrayList<>(wkList));
+        result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
+
+        result.put("industry", new ArrayList<>(wkList));
+        result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
+
+        result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
+        return result;
+    }
+
+    /** Returns every single digit found in {@code text}, in order of appearance (duplicates kept). */
+    private List<String> findDigits(String text) {
+        List<String> digits = new ArrayList<>();
+        Matcher matcher = DIGIT_PATTERN.matcher(text);
+        while (matcher.find()) {
+            digits.add(matcher.group());
+        }
+        return digits;
+    }
+
+    /** Returns {@code text} with all digits removed. */
+    private String removeDigits(String text) {
+        return DIGIT_PATTERN.matcher(text).replaceAll("");
+    }
+
+    /** Removes every occurrence of each keyword from {@code text}; keywords are literals, not regexes. */
+    private String removeKeywords(String text, String... keywords) {
+        for (String keyword : keywords) {
+            text = text.replace(keyword, "");
+        }
+        return text;
+    }
+
+    /** Splits {@code text} into a list of one-character strings. */
+    private List<String> convertStringToList(String text) {
+        List<String> charList = new ArrayList<>(text.length());
+        for (char c : text.toCharArray()) {
+            charList.add(String.valueOf(c));
+        }
+        return charList;
+    }
+
+    /**
+     * Cleans a piece of extracted text: removes the given watermark fragments, converts full-width
+     * parentheses to half-width ones, removes spaces and drops a leading parenthesis.
+     *
+     * @param wmList fragments to remove, matched literally; may be {@code null} or empty
+     * @param string text to clean
+     * @return the cleaned text, or {@code null} when {@code string} is blank
+     */
+    protected String processString(List<String> wmList, String string) {
+        if (StrUtil.isBlank(string)) {
+            return null;
+        }
+        // Fragments come from arbitrary watermark text, so each one is quoted before being used in a regex.
+        String pat = buildLiteralAlternation(wmList);
+        if (!pat.isEmpty()) {
+            string = removeMatches(string, pat);
+        }
+        // Convert full-width (Chinese) parentheses to half-width ones.
+        string = string.replace("\uFF08", "(").replace("\uFF09", ")");
+        // Remove spaces.
+        string = string.replace(" ", "");
+        // Drop the first character when the text starts with a parenthesis.
+        if (startsWithParenthesis(string)) {
+            string = string.substring(1);
+        }
+
+        return string;
+    }
+
+    /** Joins the non-empty fragments into a regex alternation in which every fragment is a quoted literal. */
+    private String buildLiteralAlternation(List<String> fragments) {
+        if (fragments == null) {
+            return "";
+        }
+        StringBuilder alternation = new StringBuilder();
+        for (String fragment : fragments) {
+            if (fragment == null || fragment.isEmpty()) {
+                continue;
+            }
+            if (alternation.length() > 0) {
+                alternation.append('|');
+            }
+            alternation.append(Pattern.quote(fragment));
+        }
+        return alternation.toString();
+    }
+
+    /** Removes every match of the regex {@code pattern} from {@code input}. */
+    private String removeMatches(String input, String pattern) {
+        return Pattern.compile(pattern).matcher(input).replaceAll("");
+    }
+
+    /** Tells whether {@code input} starts with a half-width parenthesis. */
+    private boolean startsWithParenthesis(String input) {
+        if (input.isEmpty()) {
+            return false;
+        }
+        char first = input.charAt(0);
+        return first == '(' || first == ')';
+    }
+}

+ 42 - 0
service-daq/src/main/java/com/simuwang/daq/components/CustomPDFTextStripper.java

@@ -0,0 +1,42 @@
+package com.simuwang.daq.components;
+
+import cn.hutool.core.collection.CollUtil;
+import cn.hutool.core.collection.ListUtil;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.pdfbox.util.Matrix;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * @author wangzaijun
+ * @date 2024/9/12 14:00
+ * @description Custom text extraction that strips watermarks; watermarks were observed to be mostly
+ * rotated text that is larger than any other text in the report
+ */
+public class CustomPDFTextStripper extends PDFTextStripper {
+    /** Widest rotated glyph seen so far by this stripper instance; starts at zero. */
+    private float widestRotatedGlyph = 0f;
+
+    /**
+     * Writes a filtered version of the given text run.
+     * Glyphs whose text matrix entry (0, 1) is not zero are replaced by the marker {@code "++"} and
+     * raise the width threshold; the remaining glyphs are kept only when they are narrower than the
+     * current threshold. Nothing is written when no glyph produced output.
+     * <p>
+     * NOTE(review): the threshold starts at 0, so unrotated glyphs handled before the first rotated
+     * glyph are dropped - confirm that this is intended.
+     *
+     * @param text          the text run as assembled by the parent class (not used directly)
+     * @param textPositions the glyphs that make up the run
+     */
+    @Override
+    protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
+        StringBuilder kept = new StringBuilder();
+        boolean producedOutput = false;
+        for (TextPosition glyph : textPositions) {
+            Matrix matrix = glyph.getTextMatrix();
+            boolean rotated = !(matrix.getValue(0, 1) == 0.);
+            float glyphWidth = glyph.getWidth();
+            if (rotated) {
+                if (glyphWidth > this.widestRotatedGlyph) {
+                    this.widestRotatedGlyph = glyphWidth;
+                }
+                kept.append("++");
+                producedOutput = true;
+                continue;
+            }
+            if (glyphWidth < this.widestRotatedGlyph) {
+                kept.append(glyph.getUnicode());
+                producedOutput = true;
+            }
+        }
+        if (producedOutput) {
+            super.writeString(kept.toString());
+        }
+    }
+}

+ 258 - 0
service-daq/src/main/java/com/simuwang/daq/components/PDMonthlyReportParser.java

@@ -0,0 +1,258 @@
+package com.simuwang.daq.components;
+
+import cn.hutool.core.collection.CollUtil;
+import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.map.MapUtil;
+import cn.hutool.core.util.ReflectUtil;
+import cn.hutool.core.util.StrUtil;
+import com.simuwang.base.common.exception.APIException;
+import com.simuwang.base.mapper.EmailFieldMappingMapper;
+import com.simuwang.base.pojo.dos.EmailFieldMappingDO;
+import com.simuwang.daq.dto.ReportFundInfo;
+import com.simuwang.daq.dto.ReportInfo;
+import com.smppw.common.pojo.ValueLabelVO;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.springframework.stereotype.Component;
+import technology.tabula.*;
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+
+import java.io.IOException;
+import java.sql.Struct;
+import java.util.Calendar;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * @author wangzaijun
+ * @date 2024/9/11 16:19
+ * @description Parser for monthly reports in PDF format.
+ * <p>
+ * The results of {@link #initParse()} are kept in instance fields and are reset at the start of every
+ * parse. No scope annotation is declared here, so Spring's default (singleton) scope applies and the
+ * same instance is reused for every file; it must therefore not run two parses concurrently.
+ */
+@Component("monthly-report:pdf")
+public class PDMonthlyReportParser extends AbstractReportParser {
+    // Report-date patterns, tried in this order.
+    private static final Pattern QUARTER_DATE = Pattern.compile("(2\\d{3}).*([一二三四1234])季度");  // 2023年XXX3季度
+    private static final Pattern FULL_DATE = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");  // 2023-12-31
+    private static final Pattern ANNUAL_DATE = Pattern.compile("(2\\d{3})年年度");  // 2023年年度
+    private static final Pattern MONTH_DATE = Pattern.compile("(\\d{4})年(\\d{1,2})月");  // 2023年12月
+    private static final Pattern REPORT_TYPE = Pattern.compile("月|季度|年度");
+    // Report-name patterns, tried in this order.
+    private static final Pattern REPORT_NAME_1 = Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?");
+    private static final Pattern REPORT_NAME_2 = Pattern.compile("私募.*披露年度报[告表]((\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}))?");
+    private static final Pattern REPORT_NAME_3 = Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}?");
+
+    private final EmailFieldMappingMapper fieldMappingMapper;
+    private String reportName = null;
+    private Table baseInfoTable = null;
+    private List<Table> extNavTables = ListUtil.list(true);
+    private List<ValueLabelVO> fieldMapper = null;
+
+    public PDMonthlyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
+        this.fieldMappingMapper = fieldMappingMapper;
+    }
+
+    /**
+     * Extracts the report name from the first text line of the PDF, collects its tables and loads the
+     * field mapping.
+     *
+     * @throws IOException  when the PDF cannot be read
+     * @throws APIException when text was extracted but no report name could be matched in its first line
+     */
+    @Override
+    protected void initParse() throws IOException {
+        // Forget everything from the previous file so that stale tables cannot leak into this parse.
+        this.reportName = null;
+        this.baseInfoTable = null;
+        this.extNavTables.clear();
+        this.fieldMapper = null;
+
+        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(this.filepath))) {
+            CustomPDFTextStripper stripper = new CustomPDFTextStripper();
+            stripper.setSortByPosition(true);
+            String text = stripper.getText(document);
+            // "++" is the marker the custom stripper writes in place of rotated glyphs.
+            text = text.replace("++\r\n", "").replace("++", "");
+            List<String> textList = StrUtil.split(text, "\r\n");
+            if (CollUtil.isNotEmpty(textList)) {
+                List<String> wkList = this.watermarkListMap.get("report_name");
+                String name = this.processString(wkList, textList.get(0));
+                this.reportName = this.matchReportName(name);
+                if (StrUtil.isBlank(this.reportName)) {
+                    throw new APIException("未匹配到报告名称");
+                }
+            }
+
+            SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
+            PageIterator pageIterator = new ObjectExtractor(document).extract();
+            while (pageIterator.hasNext()) {
+                Page page = pageIterator.next();
+                List<Table> tables = extractionAlgorithm.extract(page);
+                tables = tables.stream().distinct().collect(Collectors.toList());
+                for (Table table : tables) {
+                    int colCount = table.getColCount();
+                    // A 4-column table is taken as the base-info table (the last one wins);
+                    // tables with 5 or more columns are collected separately.
+                    if (colCount == 4) {
+                        this.baseInfoTable = table;
+                    } else if (colCount >= 5) {
+                        this.extNavTables.add(table);
+                    }
+                }
+            }
+        }
+        List<EmailFieldMappingDO> emailFieldMapping = this.fieldMappingMapper.getEmailFieldMapping();
+        if (CollUtil.isNotEmpty(emailFieldMapping)) {
+            this.fieldMapper = emailFieldMapping.stream().map(e -> new ValueLabelVO(e.getCode(), e.getName())).collect(Collectors.toList());
+        }
+    }
+
+    /** Builds the report information from the file id and the report name matched in {@link #initParse()}. */
+    @Override
+    protected ReportInfo parseReportInfo(Integer fileId) {
+        ReportInfo reportInfo = new ReportInfo();
+        reportInfo.setFileId(fileId);
+        reportInfo.setReportName(this.reportName);
+        reportInfo.setReportType(this.matchReportType(this.reportName));
+        reportInfo.setReportDate(this.matchReportDate(this.reportName));
+        return reportInfo;
+    }
+
+    /**
+     * Reads the 4-column base-info table as two key/value pairs per row and copies the values into a
+     * {@link ReportFundInfo} according to the field mapping.
+     *
+     * @return the populated fund information; empty when no field mapping is available
+     * @throws APIException when no base-info table was found
+     */
+    @Override
+    protected ReportFundInfo parseBaseInfo() {
+        Table baseInfoTable = this.baseInfoTable;
+        if (baseInfoTable == null) {
+            throw new APIException("未解析到基本信息表格");
+        }
+        Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
+        for (List<RectangularTextContainer> cols : baseInfoTable.getRows()) {
+            // Cells (0,1) and (2,3) form the key/value pairs; rows with fewer cells contribute what they have.
+            for (int j = 0; j < 2 && j * 2 + 1 < cols.size(); j++) {
+                baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
+            }
+        }
+        // Match the table keys against the mapping and clean the values.
+        ReportFundInfo reportFundInfo = new ReportFundInfo();
+        if (CollUtil.isEmpty(this.fieldMapper)) {
+            this.logger.warn("报告{} 未配置字段映射", this.filepath);
+            return reportFundInfo;
+        }
+        for (Map.Entry<String, Object> entry : baseInfoMap.entrySet()) {
+            String k = entry.getKey();
+            if (StrUtil.isBlank(k)) {
+                continue;
+            }
+            String fieldValue = StrUtil.toStringOrNull(entry.getValue());
+            // NOTE(review): any value containing "-" is discarded, which also drops dates such as
+            // 2023-01-01 and negative numbers - confirm whether only the placeholder "-" was meant.
+            if (fieldValue != null && fieldValue.contains("-")) {
+                fieldValue = null;
+            }
+            if (fieldValue != null) {
+                fieldValue = fieldValue.replace("\r", "");
+            }
+            for (ValueLabelVO vo : this.fieldMapper) {
+                String fieldName = vo.getValue();
+                List<String> labels = StrUtil.split(vo.getLabel(), ",");
+                // An exact label match ends the search for this key.
+                if (labels.contains(k)) {
+                    ReflectUtil.setFieldValue(reportFundInfo, fieldName, fieldValue);
+                    break;
+                }
+                // A partial match sets the field but the search continues with the next mapping entry.
+                for (String label : labels) {
+                    if (k.contains(label)) {
+                        ReflectUtil.setFieldValue(reportFundInfo, fieldName, fieldValue);
+                        break;
+                    }
+                }
+            }
+        }
+        return reportFundInfo;
+    }
+
+    /** Not implemented yet. */
+    @Override
+    protected void parseExtInfo() {
+
+    }
+
+    /** Not implemented yet. */
+    @Override
+    protected void saveResult() {
+
+    }
+
+    /**
+     * Matches the report date.
+     *
+     * @param string text to search
+     * @return the date as yyyy-MM-dd (period end for quarter, year and month forms), or {@code null}
+     * when nothing matches or the matched month is not in 1..12
+     */
+    private String matchReportDate(String string) {
+        if (string == null) {
+            return null;
+        }
+
+        Matcher quarterMatcher = QUARTER_DATE.matcher(string);
+        if (quarterMatcher.find()) {
+            String year = quarterMatcher.group(1);
+            String quarter = quarterMatcher.group(2);
+            return switch (quarter) {
+                case "一", "1" -> year + "-03-31";
+                case "二", "2" -> year + "-06-30";
+                case "三", "3" -> year + "-09-30";
+                case "四", "4" -> year + "-12-31";
+                default -> null;
+            };
+        }
+        Matcher fullDateMatcher = FULL_DATE.matcher(string);
+        if (fullDateMatcher.find()) {
+            return fullDateMatcher.group();
+        }
+        Matcher annualMatcher = ANNUAL_DATE.matcher(string);
+        if (annualMatcher.find()) {
+            return annualMatcher.group(1) + "-12-31";
+        }
+        Matcher monthMatcher = MONTH_DATE.matcher(string);
+        if (monthMatcher.find()) {
+            String year = monthMatcher.group(1);
+            String month = monthMatcher.group(2);
+            int monthNumber = Integer.parseInt(month);
+            if (monthNumber < 1 || monthNumber > 12) {
+                return null;
+            }
+            int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), monthNumber);
+            return year + "-" + padZero(month) + "-" + padZero(lastDayOfMonth + "");
+        }
+        return null;
+    }
+
+    /**
+     * Matches the report type keyword ("月", "季度" or "年度").
+     *
+     * @param string input text
+     * @return the first matching keyword, or {@code null} when none is found
+     */
+    private String matchReportType(String string) {
+        if (string == null) {
+            return null;
+        }
+        Matcher matcher = REPORT_TYPE.matcher(string);
+        return matcher.find() ? matcher.group() : null;
+    }
+
+    /**
+     * Extracts the report name from a cleaned text line using the three name patterns in order.
+     *
+     * @param text cleaned first line of the report
+     * @return the matched name with full-width parentheses converted to half-width ones, or
+     * {@code null} when the text is blank or nothing matches
+     */
+    private String matchReportName(String text) {
+        if (StrUtil.isBlank(text)) {
+            return null;
+        }
+        String reportName = null;
+        for (Pattern candidate : List.of(REPORT_NAME_1, REPORT_NAME_2, REPORT_NAME_3)) {
+            Matcher matcher = candidate.matcher(text);
+            if (matcher.find()) {
+                reportName = matcher.group();
+                break;
+            }
+        }
+        if (reportName == null) {
+            return null;
+        }
+        // Convert full-width parentheses to half-width ones.
+        return reportName.replace("\uFF08", "(").replace("\uFF09", ")");
+    }
+
+    /**
+     * Returns the number of days of the given month.
+     * Uses java.time so that the result does not depend on the current date.
+     *
+     * @param year  calendar year
+     * @param month month of year, 1..12
+     */
+    private int getLastDayOfMonth(int year, int month) {
+        return java.time.YearMonth.of(year, month).lengthOfMonth();
+    }
+
+    /** Formats a numeric string with at least two digits, padding with a leading zero. */
+    private String padZero(String number) {
+        return String.format("%02d", Integer.parseInt(number));
+    }
+}

+ 18 - 0
service-daq/src/main/java/com/simuwang/daq/components/ReportParser.java

@@ -0,0 +1,18 @@
+package com.simuwang.daq.components;
+
+/**
+ * @author wangzaijun
+ * @date 2024/9/9 19:18
+ * @description Report template parser; planned to support pdf, word and other formats
+ */
+public interface ReportParser {
+    /**
+     * Report template parsing entry point.
+     * To be extended to monthly, quarterly and annual reports; pdf, word and excel are the intended file formats.
+     *
+     * @param fileId        file id
+     * @param filepath      file path
+     * @param watermarkName name used to generate the watermark
+     */
+    void parse(Integer fileId, String filepath, String watermarkName);
+}

+ 18 - 0
service-daq/src/main/java/com/simuwang/daq/dto/ReportFileType.java

@@ -0,0 +1,18 @@
+package com.simuwang.daq.dto;
+
+/**
+ * Report file types, each paired with its lower-case suffix string.
+ */
+public enum ReportFileType {
+    PDF("pdf"),
+    DOCX("docx"),
+    DOC("doc"),
+    XLSX("xlsx");
+
+    /** Suffix of this file type, without a leading dot. */
+    private final String suffix;
+
+    ReportFileType(String fileSuffix) {
+        suffix = fileSuffix;
+    }
+
+    /**
+     * @return the suffix passed to the constant's constructor
+     */
+    public String getSuffix() {
+        return this.suffix;
+    }
+}

+ 291 - 0
service-daq/src/main/java/com/simuwang/daq/dto/ReportFundInfo.java

@@ -0,0 +1,291 @@
+package com.simuwang.daq.dto;
+
+/**
+ * @author wangzaijun
+ * @date 2024/9/12 15:34
+ * @description Fund information obtained from report parsing.
+ * Plain mutable bean: every field has a getter and a setter and no further logic.
+ */
+public class ReportFundInfo {
+    private String fundName;
+    private String cFundName;
+    /** Whether the fund is a tiered (structured) fund. */
+    private Integer istiered;
+    /** Registration (filing) code. */
+    private String registerNumber;
+    private String trustName;
+    private String custodianName;
+    private String advisorName;
+    /** Operation mode: open-ended or closed-ended. */
+    private String operationType;
+    private String fundType;
+    /** Inception date. */
+    private String inceptionDate;
+    private String sharePerAsset;
+    private String investmentObjective;
+    private String fundStrategyDescription;
+    private String secondaryBenchmark;
+    private String riskReturnDesc;
+    private String realizedIncome;
+    private String profit;
+    private String fundAssetSize;
+    private String nav;
+    private String initTotalShares;
+    private String subscription;
+    private String redemption;
+    private String split;
+    /** Description of the leverage information. */
+    private String leverageNote;
+    /** Leverage ratio. */
+    private String leverage;
+    private String remark;
+    private String industryTrend;
+    private String fundManager;
+    /** Whether the report was reviewed by the custodian. */
+    private String reviewed;
+
+    // ---- accessors, in field declaration order ----
+
+    public String getFundName() { return this.fundName; }
+    public void setFundName(String fundName) { this.fundName = fundName; }
+
+    public String getcFundName() { return this.cFundName; }
+    public void setcFundName(String cFundName) { this.cFundName = cFundName; }
+
+    public Integer getIstiered() { return this.istiered; }
+    public void setIstiered(Integer istiered) { this.istiered = istiered; }
+
+    public String getRegisterNumber() { return this.registerNumber; }
+    public void setRegisterNumber(String registerNumber) { this.registerNumber = registerNumber; }
+
+    public String getTrustName() { return this.trustName; }
+    public void setTrustName(String trustName) { this.trustName = trustName; }
+
+    public String getCustodianName() { return this.custodianName; }
+    public void setCustodianName(String custodianName) { this.custodianName = custodianName; }
+
+    public String getAdvisorName() { return this.advisorName; }
+    public void setAdvisorName(String advisorName) { this.advisorName = advisorName; }
+
+    public String getOperationType() { return this.operationType; }
+    public void setOperationType(String operationType) { this.operationType = operationType; }
+
+    public String getFundType() { return this.fundType; }
+    public void setFundType(String fundType) { this.fundType = fundType; }
+
+    public String getInceptionDate() { return this.inceptionDate; }
+    public void setInceptionDate(String inceptionDate) { this.inceptionDate = inceptionDate; }
+
+    public String getSharePerAsset() { return this.sharePerAsset; }
+    public void setSharePerAsset(String sharePerAsset) { this.sharePerAsset = sharePerAsset; }
+
+    public String getInvestmentObjective() { return this.investmentObjective; }
+    public void setInvestmentObjective(String investmentObjective) { this.investmentObjective = investmentObjective; }
+
+    public String getFundStrategyDescription() { return this.fundStrategyDescription; }
+    public void setFundStrategyDescription(String fundStrategyDescription) { this.fundStrategyDescription = fundStrategyDescription; }
+
+    public String getSecondaryBenchmark() { return this.secondaryBenchmark; }
+    public void setSecondaryBenchmark(String secondaryBenchmark) { this.secondaryBenchmark = secondaryBenchmark; }
+
+    public String getRiskReturnDesc() { return this.riskReturnDesc; }
+    public void setRiskReturnDesc(String riskReturnDesc) { this.riskReturnDesc = riskReturnDesc; }
+
+    public String getRealizedIncome() { return this.realizedIncome; }
+    public void setRealizedIncome(String realizedIncome) { this.realizedIncome = realizedIncome; }
+
+    public String getProfit() { return this.profit; }
+    public void setProfit(String profit) { this.profit = profit; }
+
+    public String getFundAssetSize() { return this.fundAssetSize; }
+    public void setFundAssetSize(String fundAssetSize) { this.fundAssetSize = fundAssetSize; }
+
+    public String getNav() { return this.nav; }
+    public void setNav(String nav) { this.nav = nav; }
+
+    public String getInitTotalShares() { return this.initTotalShares; }
+    public void setInitTotalShares(String initTotalShares) { this.initTotalShares = initTotalShares; }
+
+    public String getSubscription() { return this.subscription; }
+    public void setSubscription(String subscription) { this.subscription = subscription; }
+
+    public String getRedemption() { return this.redemption; }
+    public void setRedemption(String redemption) { this.redemption = redemption; }
+
+    public String getSplit() { return this.split; }
+    public void setSplit(String split) { this.split = split; }
+
+    public String getLeverageNote() { return this.leverageNote; }
+    public void setLeverageNote(String leverageNote) { this.leverageNote = leverageNote; }
+
+    public String getLeverage() { return this.leverage; }
+    public void setLeverage(String leverage) { this.leverage = leverage; }
+
+    public String getRemark() { return this.remark; }
+    public void setRemark(String remark) { this.remark = remark; }
+
+    public String getIndustryTrend() { return this.industryTrend; }
+    public void setIndustryTrend(String industryTrend) { this.industryTrend = industryTrend; }
+
+    public String getFundManager() { return this.fundManager; }
+    public void setFundManager(String fundManager) { this.fundManager = fundManager; }
+
+    public String getReviewed() { return this.reviewed; }
+    public void setReviewed(String reviewed) { this.reviewed = reviewed; }
+}

+ 54 - 0
service-daq/src/main/java/com/simuwang/daq/dto/ReportInfo.java

@@ -0,0 +1,54 @@
+package com.simuwang.daq.dto;
+
+/**
+ * @author wangzaijun
+ * @date 2024/9/11 17:57
+ * @description Basic information of a report.
+ * Plain mutable bean: every field has a getter and a setter and no further logic.
+ */
+public class ReportInfo {
+    private Integer fileId;
+    /** Report name. */
+    private String reportName;
+    /** Report type (month, quarter, year). */
+    private String reportType;
+    /** Report date. */
+    private String reportDate;
+
+    public Integer getFileId() { return this.fileId; }
+    public void setFileId(Integer fileId) { this.fileId = fileId; }
+
+    public String getReportName() { return this.reportName; }
+    public void setReportName(String reportName) { this.reportName = reportName; }
+
+    public String getReportType() { return this.reportType; }
+    public void setReportType(String reportType) { this.reportType = reportType; }
+
+    public String getReportDate() { return this.reportDate; }
+    public void setReportDate(String reportDate) { this.reportDate = reportDate; }
+}

+ 0 - 13
service-daq/src/main/java/com/simuwang/daq/service/ReportParser.java

@@ -1,13 +0,0 @@
-package com.simuwang.daq.service;
-
-/**
- * @author wangzaijun
- * @date 2024/9/9 19:18
- * @description Report template parser; planned to support PDF, Word, etc.
- */
-public interface ReportParser {
-    /**
-     * Parsing entry point
-     */
-    void parse();
-}

binární
service-daq/src/main/java/com/simuwang/daq/utils/12931.pdf


binární
service-daq/src/main/java/com/simuwang/daq/utils/12932.pdf


binární
service-daq/src/main/java/com/simuwang/daq/utils/12933.pdf


binární
service-daq/src/main/java/com/simuwang/daq/utils/14655.pdf


binární
service-daq/src/main/java/com/simuwang/daq/utils/14916.pdf


binární
service-daq/src/main/java/com/simuwang/daq/utils/15654.pdf


binární
service-daq/src/main/java/com/simuwang/daq/utils/15655.pdf


binární
service-daq/src/main/java/com/simuwang/daq/utils/17847.pdf


binární
service-daq/src/main/java/com/simuwang/daq/utils/17850.pdf


+ 322 - 25
service-daq/src/main/java/com/simuwang/daq/utils/ReportParseUtil.java

@@ -1,51 +1,348 @@
 package com.simuwang.daq.utils;
 
+import cn.hutool.core.collection.CollUtil;
+import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.map.MapUtil;
+import cn.hutool.core.util.ReflectUtil;
+import cn.hutool.core.util.StrUtil;
+import com.simuwang.daq.components.CustomPDFTextStripper;
+import com.simuwang.daq.dto.ReportFundInfo;
+import com.smppw.common.pojo.ValueLabelVO;
 import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.contentstream.PDFStreamEngine;
+import org.apache.pdfbox.contentstream.operator.text.ShowText;
+import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.PDPageTree;
+import org.apache.pdfbox.pdmodel.PDResources;
+import org.apache.pdfbox.pdmodel.common.PDStream;
+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
+import org.apache.pdfbox.pdmodel.graphics.PDXObject;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
+import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.PDFTextStripperByArea;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.pdfbox.util.Matrix;
+import technology.tabula.*;
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
 
 import java.awt.geom.Rectangle2D;
 import java.io.IOException;
-import java.util.List;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
 
 public class ReportParseUtil {
+    /**
+     * Ad-hoc driver for experimenting with report parsing. It loads one PDF from a
+     * hard-coded absolute path, prints the first line of text produced by
+     * {@link CustomPDFTextStripper}, then runs Tabula's
+     * {@link SpreadsheetExtractionAlgorithm} over every page and, for each
+     * four-column table, copies its label/value cell pairs onto a
+     * {@link ReportFundInfo} via reflection and prints that object.
+     *
+     * @param args unused
+     * @throws IOException if the PDF cannot be loaded or read
+     */
     public static void main(String[] args) throws IOException {
+        // Each entry pairs a ReportFundInfo field name (value) with the table label(s) to look for (label);
+        // the label is split on "," further down, so one entry may carry several alternatives.
+        // NOTE(review): trustName is paired with the label "基金托管人" and custodianName with "基金管理人" — confirm this pairing is intended.
+        List<ValueLabelVO> fieldMapper = ListUtil.list(false);
+        fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
+        fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
+        fieldMapper.add(new ValueLabelVO("operationType", "基金运作方式"));
+        fieldMapper.add(new ValueLabelVO("fundType", "基金类别"));
+        fieldMapper.add(new ValueLabelVO("inceptionDate", "基金成立日期"));
+        fieldMapper.add(new ValueLabelVO("trustName", "基金托管人"));
+        fieldMapper.add(new ValueLabelVO("custodianName", "基金管理人"));
+        fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
+        fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
+
+        // Only the "less" list is fetched, and it is referenced solely by the commented-out region extraction below.
+        Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
+        List<String> watermarks = watermarkMap.get("less");
+
+//        System.out.println(watermarks);
+//        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
         try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf"))) {
-            PDPageTree pages = document.getPages();
-            for (int i = 0; i < pages.getCount(); i++) {
-                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
-                stripper.setSortByPosition(true);
+//            PDFTextStripper stripper = new PDFTextStripper();
+//            stripper.setSortByPosition(true);
+//            String allText = stripper.getText(document);
+//            List<String> textList = StrUtil.split(allText, "\r\n");
+//            System.out.println(textList);
 
-                // Define each region
-                defineAreas(stripper);
+            PDFTextStripper textStripper = new CustomPDFTextStripper();
+            textStripper.setSortByPosition(true);
+            String text1 = textStripper.getText(document);
+            // Strip "+" followed by a line break, then any remaining "+";
+            // presumably "+" is a marker emitted by CustomPDFTextStripper — TODO confirm against that class.
+            text1 = text1.replace("+\r\n", "").replace("+","");
+            // Lines are split on "\r\n" only, so text using bare "\n" separators stays as one element.
+            List<String> textList = StrUtil.split(text1, "\r\n");
+            System.out.println(textList.get(0));
 
-                // Extract the text
-                PDPage page = document.getPage(i);
-                stripper.extractRegions(page);
+//            for (PDPage page : document.getPages()) {
+//
+////                PDResources resources = page.getResources();
+////                Map<COSName, PDImageXObject> imageXObjectMap = findImageWatermark(page);
+////                Iterator<COSName> iterator = resources.getXObjectNames().iterator();
+////                while (iterator.hasNext()) {
+////                    COSName next = iterator.next();
+////                    if (imageXObjectMap.containsKey(next)) {
+////                        iterator.remove();
+////                    }
+////                }
+////                removeTextWatermark(page);
+//
+//                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
+//                stripper.setSortByPosition(true);
+//                stripper.addRegion("page", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
+//                stripper.extractRegions(page);
+//                for (String region : stripper.getRegions()) {
+//                    String text = stripper.getTextForRegion(region);
+//                    String res = processString(watermarks, text);
+//                    System.out.println("原数据:" + text + ", 去除水印后数据:" + res);
+//                }
+//            }
+//            document.save(new File("./1.pdf"));
 
-                List<String> regions = stripper.getRegions();
-                processRegions(stripper, regions);
+            // Table extraction: run Tabula's spreadsheet algorithm over every page of the document.
+            SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
+            PageIterator pageIterator = new ObjectExtractor(document).extract();
+            while (pageIterator.hasNext()) {
+                Page page = pageIterator.next();
+                List<Table> tables = extractionAlgorithm.extract(page);
+                tables = tables.stream().distinct().collect(Collectors.toList());
+                for (Table table : tables) {
+                    // Only four-column tables are handled; each row is read as two (label, value) pairs.
+                    if (table.getColCount() == 4) {
+                        Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
+                        for (int i = 0; i < table.getRows().size(); i++) {
+                            List<RectangularTextContainer> cols = table.getRows().get(i);
+                            // Cells 0/1 form the first pair and cells 2/3 the second; a later duplicate label overwrites the earlier value.
+                            for (int j = 0; j < 2; j++) {
+                                baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
+                            }
+                        }
+                        ReportFundInfo reportFundInfo = new ReportFundInfo();
+                        baseInfoMap.forEach((k, v) -> {
+                            for (ValueLabelVO vo : fieldMapper) {
+                                String fieldName = vo.getValue();
+                                List<String> labels = StrUtil.split(vo.getLabel(), ",");
+                                // Exact label match: set the field and stop scanning the remaining mappings.
+                                if (labels.contains(k)) {
+                                    ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
+                                    break;
+                                }
+                                // Fallback: the cell text merely contains the label. This break leaves only the
+                                // inner loop, so the outer loop goes on to test the remaining mappings as well.
+                                for (String label : labels) {
+                                    if (k.contains(label)) {
+                                        ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
+                                        break;
+                                    }
+                                }
+                            }
+                        });
+                        System.out.println(reportFundInfo);
+                    }
+                }
             }
         }
     }
 
-    private static void defineAreas(PDFTextStripperByArea stripper) {
-        // Define the regions; the origin is the top-left corner, x grows to the right and y grows downward
-        stripper.addRegion("header", new Rectangle2D.Float(0, 0, 612, 180));
-        stripper.addRegion("content-survey", new Rectangle2D.Float(0, 180, 612, 180));
-        stripper.addRegion("content-current-fund", new Rectangle2D.Float(0, 360, 612, 240));
-        stripper.addRegion("content-sub-fund1", new Rectangle2D.Float(0, 600, 612, 100));
-        stripper.addRegion("content-sub-fund2", new Rectangle2D.Float(0, 700, 612, 150));
-        stripper.addRegion("footer", new Rectangle2D.Float(0, 850, 612, 30));
+    /**
+     * Collects the image XObjects declared in a page's resources, keyed by their
+     * resource name. These are the candidates for image watermarks; the method does
+     * not itself decide which of them actually are watermarks.
+     *
+     * <p>Only entries that PDFBox resolves to a {@link PDImageXObject} are returned.
+     * Other XObject kinds (for example form XObjects) and names that resolve to
+     * nothing are skipped rather than being force-wrapped as images.
+     *
+     * @param page the page whose resources are inspected
+     * @return a mutable map from XObject name to image; empty when the page has no
+     *         resources or declares no image XObjects
+     * @throws IOException if PDFBox fails to read one of the XObjects
+     */
+    public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
+        Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
+        PDResources resources = page.getResources();
+        // A page without a resource dictionary has nothing to scan.
+        if (resources == null) {
+            return watermarkMap;
+        }
+        for (COSName xObjectName : resources.getXObjectNames()) {
+            PDXObject xObject = resources.getXObject(xObjectName);
+            // instanceof is false for null, so unresolved names are skipped as well.
+            if (xObject instanceof PDImageXObject) {
+                watermarkMap.put(xObjectName, (PDImageXObject) xObject);
+            }
+        }
+        return watermarkMap;
     }
 
-    private static void processRegions(PDFTextStripperByArea stripper, List<String> regions) {
-        for (String region : regions) {
-            String text = stripper.getTextForRegion(region);
-            System.out.println(text);
+    /**
+     * Builds, per report section, the list of string fragments to strip as watermark residue.
+     * The result is keyed by "report_name", "less", "more", "leverage", "base_info",
+     * "industry" and "market_value"; {@code processString} joins such a list with "|"
+     * into a regular expression.
+     *
+     * <p>Every list except "market_value" starts from a common base: each distinct character
+     * of the combined input text and each digit of the fund name, once followed and once
+     * preceded by "\r\n", plus "-", "【", "】", "\r" and "\r\n". Section-specific single
+     * characters are appended on top of that base.
+     *
+     * @param fundName       fund name; "私募证券投资基金" is used when blank
+     * @param trustName      company name; "资产管理有限公司" is used when blank
+     * @param registerNumber registration number; an empty string is used when blank
+     * @return a new mutable map from section key to its mutable fragment list
+     */
+    private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
+        Map<String, List<String>> result = MapUtil.newHashMap(32);
+        // Generate the watermark lists
+
+        fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
+        trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
+        registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
+        String text = fundName + trustName + registerNumber;
+        text = text.replaceAll("[()]", ""); // remove parentheses
+        // De-duplicate the characters through a HashSet: the resulting order is the set's
+        // iteration order (then reversed), not the order in which characters appear in the text.
+        List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
+        Collections.reverse(textList);
+        StringBuilder sb = new StringBuilder(textList.size());
+        for (String ch : textList) {
+            sb.append(ch);
+        }
+        String joinedText = sb.toString();
+
+        // Base watermark list
+        List<String> wkList = new ArrayList<>();
+        for (String ch : textList) {
+            wkList.add(ch + "\r\n");
+            wkList.add("\r\n" + ch);
+        }
+
+        // Find the digits
+        // The fund name is part of text, so these digits are already covered by the loop above
+        // and the entries added here are duplicates.
+        List<String> matches = findDigits(fundName);
+        if (!matches.isEmpty()) {
+            for (String match : matches) {
+                wkList.add("\r\n" + match);
+                wkList.add(match + "\r\n");
+            }
         }
+        wkList.add("-");
+        wkList.add("【");
+        wkList.add("】");
+        wkList.add("\r");
+        wkList.add("\r\n");
+
+        String noNumberText = removeDigits(joinedText);
+
+        // Build the watermark lists for the individual fields
+        result.put("report_name", new ArrayList<>(wkList));
+        result.get("report_name").addAll(convertStringToList("有限公司"));
+
+        result.put("less", new ArrayList<>(wkList));
+
+        result.put("more", new ArrayList<>(wkList));
+        result.get("more").addAll(convertStringToList(noNumberText));
+
+        // NOTE(review): noNumberText holds distinct characters in hash order, so the multi-character
+        // keyword below only matches if those characters happen to end up adjacent — confirm the intent.
+        result.put("leverage", new ArrayList<>(wkList));
+        result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
+
+        // base_info starts from the full text (duplicates and digits included) minus the listed keywords.
+        result.put("base_info", new ArrayList<>(wkList));
+        result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
+
+        // NOTE(review): same caveat as for "leverage" regarding the multi-character keyword.
+        result.put("industry", new ArrayList<>(wkList));
+        result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
+
+        // market_value does not use the base list; only "\n" is stripped.
+        result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
+        return result;
     }
+
+    /**
+     * Lists every digit matched by the regex {@code \d} in the given text, one
+     * single-character string per digit, in order of appearance (repeats included).
+     *
+     * @param text the text to scan; must not be {@code null}
+     * @return a new mutable list of digits, empty when the text contains none
+     */
+    private static List<String> findDigits(String text) {
+        Matcher digitMatcher = Pattern.compile("\\d").matcher(text);
+        List<String> found = new ArrayList<>();
+        // Each successful find() advances to the next single digit.
+        while (digitMatcher.find()) {
+            found.add(digitMatcher.group());
+        }
+        return found;
+    }
+
+    /**
+     * Returns the text with every character matched by the regex {@code \d} removed.
+     *
+     * @param text the text to clean; must not be {@code null}
+     * @return the text without digits
+     */
+    private static String removeDigits(String text) {
+        Matcher digitMatcher = Pattern.compile("\\d").matcher(text);
+        return digitMatcher.replaceAll("");
+    }
+
+    /**
+     * Removes every occurrence of each keyword from the text. Keywords are applied
+     * one after another in the order given, so a later keyword operates on the
+     * result left by the earlier ones.
+     *
+     * @param text     the text to clean; must not be {@code null}
+     * @param keywords literal substrings to delete
+     * @return the text with all keyword occurrences removed
+     */
+    private static String removeKeywords(String text, String... keywords) {
+        for (String keyword : keywords) {
+            // String.replace matches literally; replaceAll would interpret the keyword as a
+            // regex, so a keyword such as "(" would throw and "." would wipe the whole text.
+            text = text.replace(keyword, "");
+        }
+        return text;
+    }
+
+    /**
+     * Splits the text into a list of one-character strings, in order.
+     *
+     * <p>The split is done per Unicode code point, so a character outside the Basic
+     * Multilingual Plane (stored as a surrogate pair) stays a single element instead
+     * of being cut into two unusable halves.
+     *
+     * @param text the text to split; must not be {@code null}
+     * @return a new mutable list holding one string per code point, empty for empty text
+     */
+    private static List<String> convertStringToList(String text) {
+        List<String> charList = new ArrayList<>(text.length());
+        int index = 0;
+        while (index < text.length()) {
+            // charCount is 2 for a valid surrogate pair and 1 otherwise (including lone surrogates).
+            int width = Character.charCount(text.codePointAt(index));
+            charList.add(text.substring(index, index + width));
+            index += width;
+        }
+        return charList;
+    }
+
+    /**
+     * Strips watermark fragments from a piece of extracted text and normalizes it.
+     *
+     * <p>Steps, in order: remove every occurrence of any entry of {@code wmList};
+     * convert full-width parentheses to ASCII ones; remove all space characters;
+     * finally drop the first character if the result starts with a parenthesis.
+     *
+     * @param wmList watermark fragments to remove; each entry is matched literally
+     * @param string the text to clean; must not be {@code null}
+     * @return the cleaned text
+     */
+    public static String processString(List<String> wmList, String string) {
+        // Build an alternation of the fragments. Each one is quoted so that characters such as
+        // "(", "+" or "." inside a fragment are matched literally instead of breaking the regex.
+        String pat = wmList.stream().map(Pattern::quote).collect(Collectors.joining("|"));
+        // Remove every fragment of wmList from the text
+        string = removeMatches(string, pat);
+        // Replace full-width (Chinese) parentheses with ASCII ones; Unicode escapes keep the two
+        // look-alike characters distinguishable regardless of how this file is encoded.
+        string = string.replace("\uFF08", "(").replace("\uFF09", ")");
+        // Remove spaces
+        string = string.replace(" ", "");
+        // If the text starts with a parenthesis, drop that first character
+        if (startsWithParenthesis(string)) {
+            string = string.substring(1);
+        }
+
+        return string;
+    }
+
+    /**
+     * Deletes every match of a regular expression from the input.
+     *
+     * @param input   the text to clean; must not be {@code null}
+     * @param pattern the regular expression whose matches are removed
+     * @return the input with all matches replaced by the empty string
+     */
+    private static String removeMatches(String input, String pattern) {
+        // Compile, match and blank out all hits in a single chain.
+        return Pattern.compile(pattern).matcher(input).replaceAll("");
+    }
+
+    /**
+     * Tells whether the input begins with an opening or closing parenthesis.
+     *
+     * @param input the text to test; must not be {@code null}
+     * @return {@code true} if the first character is "(" or ")", {@code false} otherwise
+     *         (including for the empty string)
+     */
+    private static boolean startsWithParenthesis(String input) {
+        // The pattern is anchored with ^, so find() can only succeed at the very start of the input.
+        return Pattern.compile("^[()].*").matcher(input).find();
+    }
+
+//    public static void removeTextWatermark(PDPage page) throws IOException {
+//        PDResources resources = page.getResources();
+////        if (StrUtil.isAllBlank(fundName, trustName)) {
+////            return;
+////        }
+//        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
+//        stripper.setSortByPosition(true);
+//        stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
+//        stripper.extractRegions(page);
+//
+//        PDFStreamEngine engine = new PDFTextStripper();
+//        engine.addOperator(new SetMatrix(stripper));
+//
+//    }
+//
+//    private static void processResources(PDResources resources) throws IOException {
+//        for (COSName name : resources.getXObjectNames()) {
+//            PDXObject xobject = resources.getXObject(name);
+//            if (xobject instanceof PDFormXObject) {
+//                PDFormXObject formXObject = (PDFormXObject) xobject;
+//                writeTokensToStream(formXObject.getContentStream(),
+//                        createTokensWithoutText(formXObject));
+//                processResources(formXObject.getResources());
+//            }
+//        }
+//        for (COSName name : resources.getPatternNames()) {
+//            PDAbstractPattern pattern = resources.getPattern(name);
+//            if (pattern instanceof PDTilingPattern) {
+//                PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
+//                writeTokensToStream(tilingPattern.getContentStream(),
+//                        createTokensWithoutText(tilingPattern));
+//                processResources(tilingPattern.getResources());
+//            }
+//        }
+//    }
+//
+//    private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
+//        try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
+//            ContentStreamWriter writer = new ContentStreamWriter(out);
+//            writer.writeTokens(newTokens);
+//        }
+//    }
+//
+//    private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
+//        PDFStreamParser parser = new PDFStreamParser(contentStream);
+//        Object token = parser.parseNextToken();
+//        List<Object> newTokens = new ArrayList<>();
+//        while (token != null) {
+//            if (token instanceof Operator op) {
+//                String opName = op.getName();
+//                if (OperatorName.SET_MATRIX.equals(opName)) {
+//                    // remove the argument to this operator
+//                    newTokens.remove(newTokens.size() - 1);
+//
+//                    token = parser.parseNextToken();
+//                    continue;
+//                }
+//            }
+//            newTokens.add(token);
+//            token = parser.parseNextToken();
+//        }
+//        return newTokens;
+//    }
 }