Pārlūkot izejas kodu

feat:excel格式的定期报告解析开始

wangzaijun 6 mēneši atpakaļ
vecāks
revīzija
42904e0456
16 mainītis faili ar 619 papildinājumiem un 201 dzēšanām
  1. 3 7
      service-base/src/main/java/com/simuwang/base/common/enums/ReportParserFileType.java
  2. 7 1
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/BaseReportDTO.java
  3. 7 1
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportData.java
  4. 97 0
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/SimpleTable.java
  5. 55 0
      service-daq/src/main/java/com/simuwang/daq/components/CustomExcelMultiSheetListener.java
  6. 166 0
      service-daq/src/main/java/com/simuwang/daq/components/ReportParseUtils.java
  7. 33 34
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/AbstractReportParser.java
  8. 12 24
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParserConstant.java
  9. 46 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/excel/AbstractExcelReportParser.java
  10. 33 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/excel/ExcelMonthlyReportParser.java
  11. 119 122
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractPDReportParser.java
  12. 5 4
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDAnnuallyReportParser.java
  13. 9 8
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDQuarterlyReportParser.java
  14. 27 0
      service-daq/src/main/java/com/simuwang/daq/utils/ExcelReportParseUtil.java
  15. BIN
      service-daq/src/main/java/com/simuwang/daq/utils/PB0001私募月报(证券投资).xlsx
  16. BIN
      service-daq/src/main/java/com/simuwang/daq/utils/PB0003私募年报(证券投资).xlsx

+ 3 - 7
service-base/src/main/java/com/simuwang/base/common/enums/ReportParserFileType.java

@@ -1,7 +1,5 @@
 package com.simuwang.base.common.enums;
 
-import cn.hutool.core.util.StrUtil;
-
 import java.util.Arrays;
 
 /**
@@ -11,10 +9,8 @@ import java.util.Arrays;
  */
 public enum ReportParserFileType {
     PDF("pdf"),
-    DOCX("docx"),
-    DOC("doc"),
-    XLSX("xlsx"),
-    XLS("xls"),
+    WORD("docx,doc"),
+    EXCEL("xlsx,xls"),
     PYTHON("python");
 
     private final String suffix;
@@ -25,7 +21,7 @@ public enum ReportParserFileType {
 
     public static ReportParserFileType getBySuffix(String suffix) {
         return Arrays.stream(ReportParserFileType.values())
-                .filter(e -> StrUtil.equals(e.getSuffix(), suffix)).findFirst().orElse(null);
+                .filter(e -> e.getSuffix().contains(suffix)).findFirst().orElse(null);
     }
 
     public String getSuffix() {

+ 7 - 1
service-base/src/main/java/com/simuwang/base/pojo/dto/report/BaseReportDTO.java

@@ -3,10 +3,13 @@ package com.simuwang.base.pojo.dto.report;
 import cn.hutool.core.date.DatePattern;
 import cn.hutool.core.date.DateUtil;
 import cn.hutool.core.util.StrUtil;
+import com.simuwang.base.common.conts.Constants;
 import com.simuwang.base.pojo.dos.report.BaseReportDO;
 import lombok.Getter;
 import lombok.Setter;
 
+import java.io.Serial;
+import java.io.Serializable;
 import java.math.BigDecimal;
 import java.util.Date;
 
@@ -17,7 +20,10 @@ import java.util.Date;
  */
 @Setter
 @Getter
-public abstract class BaseReportDTO<T extends BaseReportDO> {
+public abstract class BaseReportDTO<T extends BaseReportDO> implements Serializable {
+    @Serial
+    private static final long serialVersionUID = Constants.DEFAULT_SERIAL_ID;
+
     private Integer fileId;
 
     public BaseReportDTO() {

+ 7 - 1
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportData.java

@@ -1,9 +1,13 @@
 package com.simuwang.base.pojo.dto.report;
 
+import com.simuwang.base.common.conts.Constants;
 import com.simuwang.base.common.enums.ReportType;
 import lombok.Getter;
 import lombok.Setter;
 
+import java.io.Serial;
+import java.io.Serializable;
+
 /**
  * @author wangzaijun
  * @date 2024/9/29 9:32
@@ -11,7 +15,9 @@ import lombok.Setter;
  */
 @Setter
 @Getter
-public abstract class ReportData {
+public abstract class ReportData implements Serializable {
+    @Serial
+    private static final long serialVersionUID = Constants.DEFAULT_SERIAL_ID;
     /**
      * 报告基本信息
      */

+ 97 - 0
service-base/src/main/java/com/simuwang/base/pojo/dto/report/SimpleTable.java

@@ -0,0 +1,97 @@
+package com.simuwang.base.pojo.dto.report;
+
+import cn.hutool.core.collection.ListUtil;
+import com.simuwang.base.common.conts.Constants;
+
+import java.io.Serial;
+import java.io.Serializable;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Simple custom table object: a title plus its data rows. Used when several tables are
+ * parsed out of one Excel worksheet.
+ * <p>
+ * The class implements {@link Iterable}, so the rows can be walked with a for-each loop.
+ *
+ * @author wangzaijun
+ * @date 2024/10/12 11:00
+ */
+public class SimpleTable implements Serializable, Iterable<List<String>> {
+    @Serial
+    private static final long serialVersionUID = Constants.DEFAULT_SERIAL_ID;
+    /**
+     * Table title.
+     */
+    private final String title;
+    /**
+     * Table data rows; each row is the list of its cell values.
+     */
+    private final List<List<String>> rows;
+
+    /**
+     * Creates a table with the given title and no rows.
+     *
+     * @param title table title
+     */
+    public SimpleTable(String title) {
+        this.title = title;
+        this.rows = ListUtil.list(true);
+    }
+
+    /**
+     * Appends a row to the end of the table. The list is stored as passed in, not copied.
+     *
+     * @param row cell values of the new row
+     */
+    public void addRow(List<String> row) {
+        rows.add(row);
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    /**
+     * Returns the internal row list itself (not a copy); changes made through it affect this table.
+     *
+     * @return the live list of rows
+     */
+    public List<List<String>> getRows() {
+        return rows;
+    }
+
+    /**
+     * Returns an iterator that walks the table row by row, in insertion order.
+     *
+     * @return row iterator
+     */
+    @Override
+    public Iterator<List<String>> iterator() {
+        return new TableIterator(rows);
+    }
+
+    /**
+     * Returns the content of a single cell.
+     *
+     * @param row    zero-based row index
+     * @param column zero-based column index within that row
+     * @return cell content
+     * @throws IndexOutOfBoundsException if the row or column index is outside the table
+     */
+    public String getCell(int row, int column) {
+        if (row < 0 || row >= rows.size() || column < 0 || column >= rows.get(row).size()) {
+            throw new IndexOutOfBoundsException("Invalid row or column index");
+        }
+        return rows.get(row).get(column);
+    }
+
+    @Override
+    public String toString() {
+        return "SimpleTable{" +
+                "title='" + title + '\'' +
+                ", rows=" + rows +
+                '}';
+    }
+
+    /**
+     * Internal index-based row iterator. Removal is not supported (the default
+     * {@link Iterator#remove()} is inherited).
+     */
+    private static class TableIterator implements Iterator<List<String>> {
+        private final List<List<String>> rows;
+        private int currentIndex = 0;
+
+        public TableIterator(List<List<String>> rows) {
+            this.rows = rows;
+        }
+
+        @Override
+        public boolean hasNext() {
+            return currentIndex < rows.size();
+        }
+
+        @Override
+        public List<String> next() {
+            // The Iterator contract requires NoSuchElementException once the rows are exhausted.
+            if (!hasNext()) {
+                throw new java.util.NoSuchElementException("No more rows");
+            }
+            return rows.get(currentIndex++);
+        }
+    }
+}

+ 55 - 0
service-daq/src/main/java/com/simuwang/daq/components/CustomExcelMultiSheetListener.java

@@ -0,0 +1,55 @@
+package com.simuwang.daq.components;
+
+import cn.hutool.core.collection.ListUtil;
+import com.alibaba.excel.context.AnalysisContext;
+import com.alibaba.excel.event.AnalysisEventListener;
+import com.alibaba.excel.read.metadata.holder.ReadSheetHolder;
+import com.simuwang.base.pojo.dto.report.SimpleTable;
+
+import java.util.LinkedHashMap;
+import java.util.List;
+
+/**
+ * Custom EasyExcel listener that reads every sheet of a workbook and splits the rows
+ * into {@link SimpleTable}s.
+ * <p>
+ * A new table starts whenever the cleaned value read under key "1" equals one of the
+ * expected titles supplied through the read context's custom object. Each row from the
+ * title row onwards is appended to that table until the next title row is met.
+ *
+ * @author wangzaijun
+ * @date 2024/10/12 9:17
+ */
+public class CustomExcelMultiSheetListener extends AnalysisEventListener<LinkedHashMap<String, Object>> {
+    /** Tables collected so far, in the order their title rows were encountered. */
+    private final List<SimpleTable> tables = ListUtil.list(false);
+    /** Table currently being filled; null until the first title row has been seen. */
+    private SimpleTable table;
+
+    public List<SimpleTable> getTables() {
+        return tables;
+    }
+
+    /**
+     * Drops all collected tables and forgets the table currently being filled, so later
+     * rows are not appended to a table that is no longer reachable through {@link #getTables()}.
+     */
+    public void cleanTables() {
+        this.tables.clear();
+        this.table = null;
+    }
+
+    /**
+     * Handles one row read from the workbook.
+     *
+     * @param row             cell values of the row, looked up by column key
+     * @param analysisContext read context; its custom object must be the list of expected table titles
+     */
+    @Override
+    public void invoke(LinkedHashMap<String, Object> row, AnalysisContext analysisContext) {
+        ReadSheetHolder sheetHolder = analysisContext.readSheetHolder();
+        String sheetName = sheetHolder.getSheetName();
+        // The first sheet is ignored when its name marks it as the cover ("封面").
+        if (sheetName.contains("封面") && sheetHolder.getSheetNo() == 0) {
+            return;
+        }
+        @SuppressWarnings("unchecked")
+        List<String> tableTitles = (List<String>) analysisContext.getCustom();
+        // NOTE(review): EasyExcel's model-less read normally delivers rows keyed by Integer
+        // column index - confirm that String keys such as "1" actually resolve here.
+        String title = ReportParseUtils.cleaningValue(row.get("1"));
+        // Null titles are filtered first: some immutable List implementations throw on contains(null).
+        if (title != null && tableTitles.contains(title)) {
+            this.table = new SimpleTable(title);
+            this.tables.add(table);
+        }
+        // Rows that appear before the first recognised title belong to no table; skip them
+        // instead of failing with a NullPointerException.
+        if (this.table == null) {
+            return;
+        }
+
+        // Column "0" is deliberately not copied; values are taken from key "1" upwards.
+        List<String> tableRow = ListUtil.list(true);
+        for (int i = 1; i < row.size(); i++) {
+            tableRow.add(ReportParseUtils.cleaningValue(row.get(String.valueOf(i))));
+        }
+        this.table.addRow(tableRow);
+    }
+
+    @Override
+    public void doAfterAllAnalysed(AnalysisContext analysisContext) {
+        // Nothing to do once reading has finished; callers fetch the result via getTables().
+    }
+}

+ 166 - 0
service-daq/src/main/java/com/simuwang/daq/components/ReportParseUtils.java

@@ -0,0 +1,166 @@
+package com.simuwang.daq.components;
+
+import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.util.StrUtil;
+import com.simuwang.base.common.enums.ReportType;
+
+import java.util.Calendar;
+import java.util.List;
+import java.util.Objects;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+public final class ReportParseUtils {
+    /**
+     * Cleans a raw value with removal of parenthesised content switched on.
+     * Shorthand for {@code cleaningValue(value, true)}.
+     *
+     * @param value raw value to clean; may be null
+     * @return the cleaned string, or null when the cleaned result is blank
+     */
+    public static String cleaningValue(Object value) {
+        final boolean replaceParentheses = true;
+        return cleaningValue(value, replaceParentheses);
+    }
+
+    /**
+     * Performs a simple clean-up of a value and converts it to a string.
+     * <p>
+     * Carriage returns and spaces are removed; optionally, bracketed segments matched by
+     * the regex below are removed as well. A result that is exactly "-" or blank is
+     * reported as null.
+     *
+     * @param value              the data to clean; may be null
+     * @param replaceParentheses whether parenthesised content should be stripped
+     * @return the cleaned string, or null when nothing but blank text or "-" remains
+     */
+    public static String cleaningValue(Object value, boolean replaceParentheses) {
+        String fieldValue = StrUtil.toStringOrNull(value);
+        if (!StrUtil.isNullOrUndefined(fieldValue)) {
+            // Replace special characters; spaces are replaced with the empty string.
+            // NOTE(review): the ";" -> ";" replacement is a no-op as written here; it presumably
+            // once mapped a full-width semicolon to an ASCII one - verify the literals in the repository.
+            fieldValue = fieldValue
+                    .replace("\r", StrUtil.EMPTY)
+                    .replace(";", ";")
+                    .replaceAll(" ", StrUtil.EMPTY);
+            if (replaceParentheses) {
+                // Regex intended to match Chinese (full-width) parentheses together with their
+                // content and replace the match with the empty string.
+                // NOTE(review): as written, the character classes contain only "(" / ")" and a
+                // literal "|", so "|" also acts as a delimiter - confirm this is intended.
+                fieldValue = Pattern.compile("[(|(][^)]*[)|)]").matcher(fieldValue).replaceAll(StrUtil.EMPTY);
+            }
+        }
+        // A value consisting only of "-" is treated as null.
+        if (Objects.equals("-", fieldValue)) {
+            fieldValue = null;
+        }
+        return StrUtil.isBlank(fieldValue) ? null : fieldValue;
+    }
+
+    /**
+     * Finds the tiered (share-class) fund labels mentioned in a text.
+     * <p>
+     * Occurrences of a letter A-F followed by "级", or "基金" followed by a letter A-F, are
+     * located; the letters are de-duplicated, sorted alphabetically and returned as
+     * "A级", "B级", and so on.
+     *
+     * @param text text content; may be null or blank
+     * @return the tier labels in alphabetical order; empty when the text is blank or has no match
+     */
+    public static List<String> matchTieredFund(String text) {
+        if (StrUtil.isBlank(text)) {
+            return ListUtil.list(false);
+        }
+        // Walk through every match and keep only its tier letter.
+        List<String> letters = ListUtil.list(false);
+        Matcher tierMatcher = Pattern.compile("[A-F]级|基金[A-F]").matcher(text);
+        while (tierMatcher.find()) {
+            letters.add(tierMatcher.group().replaceAll("[^A-F]", ""));
+        }
+        // De-duplicate, order alphabetically and turn each letter into its label.
+        return letters.stream()
+                .distinct()
+                .sorted()
+                .map(tierLetter -> tierLetter + "级")
+                .collect(Collectors.toList());
+    }
+
+    /**
+     * Extracts the report date from a text such as a report file name.
+     * <p>
+     * The formats are tried in this order and the first hit wins:
+     * quarter ("2023年XXX3季度", mapped to the quarter's last day), dashed date ("2023-12-31",
+     * returned as found), compact date ("20231231", returned as found), annual
+     * ("2023年年度" or "2023年度", mapped to 12-31 of that year) and finally year-month
+     * ("2023年12月", mapped to the last day of that month).
+     *
+     * @param string text content; may be null
+     * @return the report date, or null when the text is null or no format matches
+     */
+    public static String matchReportDate(String string) {
+        if (string == null) {
+            return null;
+        }
+        // Quarter, e.g. 2023年XXX3季度
+        Matcher quarterly = Pattern.compile("(2\\d{3}).*([一二三四1234])季度").matcher(string);
+        if (quarterly.find()) {
+            String year = quarterly.group(1);
+            switch (quarterly.group(2)) {
+                case "一":
+                case "1":
+                    return year + "-03-31";
+                case "二":
+                case "2":
+                    return year + "-06-30";
+                case "三":
+                case "3":
+                    return year + "-09-30";
+                case "四":
+                case "4":
+                    return year + "-12-31";
+                default:
+                    return null;
+            }
+        }
+        // Dashed date, e.g. 2023-12-31
+        Matcher dashed = Pattern.compile("\\d{4}-\\d{2}-\\d{2}").matcher(string);
+        if (dashed.find()) {
+            return dashed.group();
+        }
+        // Compact date, e.g. 20231231
+        Matcher compact = Pattern.compile("\\d{4}\\d{2}\\d{2}").matcher(string);
+        if (compact.find()) {
+            return compact.group();
+        }
+        // Annual, e.g. 2023年年度
+        Matcher annualLong = Pattern.compile("(2\\d{3})年年度").matcher(string);
+        if (annualLong.find()) {
+            return annualLong.group(1) + "-12-31";
+        }
+        // Annual, e.g. 2023年度
+        Matcher annualShort = Pattern.compile("(2\\d{3})年度").matcher(string);
+        if (annualShort.find()) {
+            return annualShort.group(1) + "-12-31";
+        }
+        // Year and month, e.g. 2023年12月
+        Matcher monthly = Pattern.compile("(\\d{4})年(\\d{1,2})月").matcher(string);
+        if (!monthly.find()) {
+            return null;
+        }
+        String year = monthly.group(1);
+        String month = monthly.group(2);
+        int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
+        return year + "-" + padZero(month) + "-" + padZero(String.valueOf(lastDayOfMonth));
+    }
+
+    /**
+     * Finds the report-type keyword (for example "季度" or "年度") in a text.
+     * <p>
+     * The regex is taken from {@code ReportType.getAllPatterns()}.
+     *
+     * @param string input text; may be null
+     * @return the first matched substring, or null when the input is null or nothing matches
+     */
+    public static String matchReportType(String string) {
+        if (string == null) {
+            return null;
+        }
+        // One regex covering the recognition patterns of all report types.
+        Matcher typeMatcher = Pattern.compile(ReportType.getAllPatterns()).matcher(string);
+        return typeMatcher.find() ? typeMatcher.group() : null;
+    }
+
+    /**
+     * Returns the number of days in the given month.
+     *
+     * @param year  calendar year
+     * @param month month of the year, 1-12
+     * @return the last day-of-month of that month
+     */
+    private static int getLastDayOfMonth(int year, int month) {
+        Calendar calendar = Calendar.getInstance();
+        // Start from a cleared calendar pinned to day 1. Otherwise today's day-of-month is kept,
+        // and on the 29th-31st a shorter target month rolls over leniently into the next month,
+        // yielding the wrong maximum (e.g. 31 for February).
+        calendar.clear();
+        calendar.set(year, month - 1, 1); // Calendar.MONTH is zero-based
+        return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
+    }
+
+    /**
+     * Formats a numeric string as a decimal with at least two digits, left-padded with zero.
+     *
+     * @param number decimal integer text
+     * @return the zero-padded number
+     * @throws NumberFormatException if the text is not a parsable integer
+     */
+    private static String padZero(String number) {
+        int parsed = Integer.parseInt(number);
+        return String.format("%02d", parsed);
+    }
+}

+ 33 - 34
service-daq/src/main/java/com/simuwang/daq/components/report/parser/AbstractReportParser.java

@@ -7,13 +7,12 @@ import cn.hutool.core.util.StrUtil;
 import com.simuwang.base.mapper.EmailFieldMappingMapper;
 import com.simuwang.base.pojo.dos.EmailFieldMappingDO;
 import com.simuwang.base.pojo.dto.report.ReportData;
+import com.simuwang.daq.components.ReportParseUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.util.List;
 import java.util.Map;
-import java.util.Objects;
-import java.util.regex.Pattern;
 
 /**
  * @author wangzaijun
@@ -70,8 +69,8 @@ public abstract class AbstractReportParser<T extends ReportData> implements Repo
             return;
         }
         for (Map.Entry<String, Object> entry : extInfoMap.entrySet()) {
-            String k = this.cleaningValue(entry.getKey());
-            String fieldValue = this.cleaningValue(entry.getValue());
+            String k = ReportParseUtils.cleaningValue(entry.getKey());
+            String fieldValue = ReportParseUtils.cleaningValue(entry.getValue());
             String fieldName = this.fieldMapper.get(k);
             if (StrUtil.isBlank(fieldName)) {
                 continue;
@@ -84,34 +83,34 @@ public abstract class AbstractReportParser<T extends ReportData> implements Repo
         }
     }
 
-    protected String cleaningValue(Object value) {
-        return this.cleaningValue(value, true);
-    }
-
-    /**
-     * 数据简单清洗,并全部转为字符串类型
-     *
-     * @param value              待清洗的数据
-     * @param replaceParentheses 是否替换圆括号
-     * @return /
-     */
-    protected String cleaningValue(Object value, boolean replaceParentheses) {
-        String fieldValue = StrUtil.toStringOrNull(value);
-        if (!StrUtil.isNullOrUndefined(fieldValue)) {
-            // 特殊字符替换,空格替换为空字符
-            fieldValue = fieldValue
-                    .replace("\r", StrUtil.EMPTY)
-                    .replace(";", ";")
-                    .replaceAll(" ", StrUtil.EMPTY);
-            if (replaceParentheses) {
-                // 正则表达式匹配中文括号及其内容,并替换为空字符串
-                fieldValue = Pattern.compile("[(|(][^)]*[)|)]").matcher(fieldValue).replaceAll(StrUtil.EMPTY);
-            }
-        }
-        // 如果仅有 “-” 该字段值为null
-        if (Objects.equals("-", fieldValue)) {
-            fieldValue = null;
-        }
-        return StrUtil.isBlank(fieldValue) ? null : fieldValue;
-    }
+//    protected String cleaningValue(Object value) {
+//        return this.cleaningValue(value, true);
+//    }
+//
+//    /**
+//     * 数据简单清洗,并全部转为字符串类型
+//     *
+//     * @param value              待清洗的数据
+//     * @param replaceParentheses 是否替换圆括号
+//     * @return /
+//     */
+//    protected String cleaningValue(Object value, boolean replaceParentheses) {
+//        String fieldValue = StrUtil.toStringOrNull(value);
+//        if (!StrUtil.isNullOrUndefined(fieldValue)) {
+//            // 特殊字符替换,空格替换为空字符
+//            fieldValue = fieldValue
+//                    .replace("\r", StrUtil.EMPTY)
+//                    .replace(";", ";")
+//                    .replaceAll(" ", StrUtil.EMPTY);
+//            if (replaceParentheses) {
+//                // 正则表达式匹配中文括号及其内容,并替换为空字符串
+//                fieldValue = Pattern.compile("[(|(][^)]*[)|)]").matcher(fieldValue).replaceAll(StrUtil.EMPTY);
+//            }
+//        }
+//        // 如果仅有 “-” 该字段值为null
+//        if (Objects.equals("-", fieldValue)) {
+//            fieldValue = null;
+//        }
+//        return StrUtil.isBlank(fieldValue) ? null : fieldValue;
+//    }
 }

+ 12 - 24
service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParserConstant.java

@@ -15,53 +15,41 @@ public final class ReportParserConstant {
     public static final Map<ReportType, Map<ReportParserFileType, String>> REPORT_PARSER_BEAN_MAP = MapUtil.newHashMap(8);
 
     public static final String PARSER_PDF_MONTHLY = "report-parser:pdf:monthly";
-    public static final String PARSER_DOC_MONTHLY = "report-parser:doc:monthly";
-    public static final String PARSER_DOCX_MONTHLY = "report-parser:docx:monthly";
-    public static final String PARSER_XLSX_MONTHLY = "report-parser:xlsx:monthly";
-    public static final String PARSER_XLS_MONTHLY = "report-parser:xls:monthly";
+    public static final String PARSER_WORD_MONTHLY = "report-parser:word:monthly";
+    public static final String PARSER_EXCEL_MONTHLY = "report-parser:excel:monthly";
     public static final String PARSER_PYTHON_MONTHLY = "report-parser:python:monthly";
 
     public static final String PARSER_PDF_QUARTERLY = "report-parser:pdf:quarterly";
-    public static final String PARSER_DOC_QUARTERLY = "report-parser:doc:quarterly";
-    public static final String PARSER_DOCX_QUARTERLY = "report-parser:docx:quarterly";
-    public static final String PARSER_XLSX_QUARTERLY = "report-parser:xlsx:quarterly";
-    public static final String PARSER_XLS_QUARTERLY = "report-parser:xls:quarterly";
+    public static final String PARSER_WORD_QUARTERLY = "report-parser:word:quarterly";
+    public static final String PARSER_EXCEL_QUARTERLY = "report-parser:excel:quarterly";
     public static final String PARSER_PYTHON_QUARTERLY = "report-parser:python:quarterly";
 
     public static final String PARSER_PDF_ANNUALLY = "report-parser:pdf:annually";
-    public static final String PARSER_DOC_ANNUALLY = "report-parser:doc:annually";
-    public static final String PARSER_DOCX_ANNUALLY = "report-parser:docx:annually";
-    public static final String PARSER_XLSX_ANNUALLY = "report-parser:xlsx:annually";
-    public static final String PARSER_XLS_ANNUALLY = "report-parser:xls:annually";
+    public static final String PARSER_WORD_ANNUALLY = "report-parser:word:annually";
+    public static final String PARSER_EXCEL_ANNUALLY = "report-parser:excel:annually";
     public static final String PARSER_PYTHON_ANNUALLY = "report-parser:python:annually";
 
     static {
         REPORT_PARSER_BEAN_MAP.put(ReportType.MONTHLY,
                 Map.of(ReportParserFileType.PDF, PARSER_PDF_MONTHLY,
-                        ReportParserFileType.DOC, PARSER_DOC_MONTHLY,
-                        ReportParserFileType.DOCX, PARSER_DOCX_MONTHLY,
-                        ReportParserFileType.XLSX, PARSER_XLSX_MONTHLY,
-                        ReportParserFileType.XLS, PARSER_XLS_MONTHLY,
+                        ReportParserFileType.WORD, PARSER_WORD_MONTHLY,
+                        ReportParserFileType.EXCEL, PARSER_EXCEL_MONTHLY,
 
                         ReportParserFileType.PYTHON, PARSER_PYTHON_MONTHLY
                 ));
 
         REPORT_PARSER_BEAN_MAP.put(ReportType.QUARTERLY,
                 Map.of(ReportParserFileType.PDF, PARSER_PDF_QUARTERLY,
-                        ReportParserFileType.DOC, PARSER_DOC_QUARTERLY,
-                        ReportParserFileType.DOCX, PARSER_DOCX_QUARTERLY,
-                        ReportParserFileType.XLSX, PARSER_XLSX_QUARTERLY,
-                        ReportParserFileType.XLS, PARSER_XLS_QUARTERLY,
+                        ReportParserFileType.WORD, PARSER_WORD_QUARTERLY,
+                        ReportParserFileType.EXCEL, PARSER_EXCEL_QUARTERLY,
 
                         ReportParserFileType.PYTHON, PARSER_PYTHON_QUARTERLY
                 ));
 
         REPORT_PARSER_BEAN_MAP.put(ReportType.ANNUALLY,
                 Map.of(ReportParserFileType.PDF, PARSER_PDF_ANNUALLY,
-                        ReportParserFileType.DOC, PARSER_DOC_ANNUALLY,
-                        ReportParserFileType.DOCX, PARSER_DOCX_ANNUALLY,
-                        ReportParserFileType.XLSX, PARSER_XLSX_ANNUALLY,
-                        ReportParserFileType.XLS, PARSER_XLS_ANNUALLY,
+                        ReportParserFileType.WORD, PARSER_WORD_ANNUALLY,
+                        ReportParserFileType.EXCEL, PARSER_EXCEL_ANNUALLY,
 
                         ReportParserFileType.PYTHON, PARSER_PYTHON_ANNUALLY
                 ));

+ 46 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/excel/AbstractExcelReportParser.java

@@ -0,0 +1,46 @@
+package com.simuwang.daq.components.report.parser.excel;
+
+import com.alibaba.excel.EasyExcel;
+import com.alibaba.excel.read.builder.ExcelReaderBuilder;
+import com.simuwang.base.common.exception.ReportParseException;
+import com.simuwang.base.mapper.EmailFieldMappingMapper;
+import com.simuwang.base.pojo.dto.report.ReportData;
+import com.simuwang.base.pojo.dto.report.ReportParserParams;
+import com.simuwang.base.pojo.dto.report.SimpleTable;
+import com.simuwang.daq.components.CustomExcelMultiSheetListener;
+import com.simuwang.daq.components.report.parser.AbstractReportParser;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Base class for report parsers that read Excel files.
+ * <p>
+ * The workbook is read with EasyExcel through a {@link CustomExcelMultiSheetListener}, which
+ * splits the rows into {@link SimpleTable}s based on the titles returned by
+ * {@link #tableTitles()}. Subclasses turn those tables into report data in
+ * {@link #excelParse(List, ReportParserParams)}.
+ *
+ * @param <T> type of report data produced
+ */
+public abstract class AbstractExcelReportParser<T extends ReportData> extends AbstractReportParser<T> {
+    public AbstractExcelReportParser(EmailFieldMappingMapper fieldMappingMapper) {
+        super(fieldMappingMapper);
+    }
+
+    /**
+     * Reads all sheets of the file at {@code params.getFilepath()} and parses the extracted tables.
+     *
+     * @param params parser parameters; the file path is read from it
+     * @return whatever {@link #excelParse(List, ReportParserParams)} returns
+     */
+    @Override
+    public T parse(ReportParserParams params) throws IOException, ReportParseException {
+        this.init();
+        // Configure the Excel read.
+        // No sheet() call here: in EasyExcel, sheet() builds an additional ExcelReader that would
+        // be neither used nor closed, while doReadAll() builds, runs and closes its own reader.
+        CustomExcelMultiSheetListener excelListener = new CustomExcelMultiSheetListener();
+        ExcelReaderBuilder readerBuilder = EasyExcel.read(params.getFilepath());
+        // The expected table titles travel to the listener as the read context's custom object.
+        readerBuilder.customObject(this.tableTitles());
+        readerBuilder.registerReadListener(excelListener);
+        readerBuilder.doReadAll();
+        // Tables extracted by the listener.
+        List<SimpleTable> tables = excelListener.getTables();
+        T reportData = this.excelParse(tables, params);
+        // NOTE(review): this clears the very list handed to excelParse - implementations must not
+        // keep a reference to it in the returned data.
+        excelListener.cleanTables();
+        return reportData;
+    }
+
+    @Override
+    protected void cleaningReportData(T reportData) {
+        // cleaning.
+    }
+
+    /**
+     * @return the titles that mark the start of a table in the workbook
+     */
+    protected abstract List<String> tableTitles();
+
+    /**
+     * Converts the extracted tables into report data.
+     *
+     * @param tables tables extracted from the workbook, in reading order
+     * @param params parser parameters passed to {@link #parse(ReportParserParams)}
+     * @return the parsed report data
+     */
+    protected abstract T excelParse(List<SimpleTable> tables, ReportParserParams params);
+}

+ 33 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/excel/ExcelMonthlyReportParser.java

@@ -0,0 +1,33 @@
+package com.simuwang.daq.components.report.parser.excel;
+
+import cn.hutool.core.collection.ListUtil;
+import com.simuwang.base.mapper.EmailFieldMappingMapper;
+import com.simuwang.base.pojo.dto.report.MonthlyReportData;
+import com.simuwang.base.pojo.dto.report.ReportParserParams;
+import com.simuwang.base.pojo.dto.report.SimpleTable;
+import com.simuwang.daq.components.report.parser.ReportParserConstant;
+import org.springframework.stereotype.Component;
+
+import java.util.List;
+
+/**
+ * Excel parser for monthly reports, registered as a Spring bean under the name
+ * {@link ReportParserConstant#PARSER_EXCEL_MONTHLY}.
+ * <p>
+ * Only the table titles are defined so far; the parsing itself is not implemented yet.
+ */
+@Component(ReportParserConstant.PARSER_EXCEL_MONTHLY)
+public class ExcelMonthlyReportParser extends AbstractExcelReportParser<MonthlyReportData> {
+    public ExcelMonthlyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
+        super(fieldMappingMapper);
+    }
+
+    /**
+     * @return the titles that mark the start of each table in a monthly report workbook
+     */
+    @Override
+    protected List<String> tableTitles() {
+        List<String> monthlyTitles = ListUtil.of("基金概况", "净值月报", "级基金净值表");
+        return monthlyTitles;
+    }
+
+    /**
+     * Not implemented yet: always returns null.
+     */
+    @Override
+    protected MonthlyReportData excelParse(List<SimpleTable> tables, ReportParserParams params) {
+        return null;
+    }
+
+    @Override
+    protected void cleaningReportData(MonthlyReportData reportData) {
+        // todo: data cleaning
+    }
+}

+ 119 - 122
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractPDReportParser.java

@@ -5,11 +5,11 @@ import cn.hutool.core.exceptions.ExceptionUtil;
 import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.StrUtil;
 import com.simuwang.base.common.conts.Constants;
-import com.simuwang.base.common.enums.ReportType;
 import com.simuwang.base.common.exception.ReportParseException;
 import com.simuwang.base.mapper.EmailFieldMappingMapper;
 import com.simuwang.base.pojo.dto.report.*;
 import com.simuwang.daq.components.CustomPDFTextStripper;
+import com.simuwang.daq.components.ReportParseUtils;
 import com.simuwang.daq.components.report.parser.AbstractReportParser;
 import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
@@ -21,13 +21,10 @@ import technology.tabula.Table;
 import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
 
 import java.io.IOException;
-import java.util.Calendar;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
 import java.util.function.Function;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 /**
@@ -68,7 +65,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
                 throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN);
             }
             // 报告名称和类型一般在第一第二行
-            if (this.matchReportType(this.textList.get(0)) == null && this.matchReportType(this.textList.get(1)) == null) {
+            if (ReportParseUtils.matchReportType(this.textList.get(0)) == null && ReportParseUtils.matchReportType(this.textList.get(1)) == null) {
                 throw new ReportParseException(ReportParseStatus.NOT_A_REPORT);
             }
             // 解析所有表格(单元格字符去水印)
@@ -158,8 +155,8 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
         String reportName = params.getFilename();
         ReportBaseInfoDTO reportInfo = new ReportBaseInfoDTO(fileId);
         reportInfo.setReportName(reportName);
-        reportInfo.setReportType(this.matchReportType(reportName));
-        reportInfo.setReportDate(this.matchReportDate(reportName));
+        reportInfo.setReportType(ReportParseUtils.matchReportType(reportName));
+        reportInfo.setReportDate(ReportParseUtils.matchReportDate(reportName));
         return reportInfo;
     }
 
@@ -179,7 +176,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
         List<DTO> dtos = tables.stream().filter(Objects::nonNull)
                 .map(e -> this.buildDto(fileId, e, clazz, function)).collect(Collectors.toList());
         // 分级基金匹配
-        List<String> levels = this.matchTieredFund(String.join(",", this.textList));
+        List<String> levels = ReportParseUtils.matchTieredFund(String.join(",", this.textList));
         levels.add(0, "母基金");
         for (int i = 0; i < dtos.size(); i++) {
             if (levels.size() <= i) {
@@ -213,118 +210,118 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
         return null;
     }
 
-    /**
-     * 匹配分级基金名称
-     *
-     * @param text 文本内容
-     * @return /
-     */
-    protected List<String> matchTieredFund(String text) {
-        List<String> matches = ListUtil.list(false);
-        if (StrUtil.isBlank(text)) {
-            return matches;
-        }
-        // 使用正则表达式查找匹配项
-        Pattern pattern = Pattern.compile("[A-F]级|基金[A-F]");
-        Matcher matcher = pattern.matcher(text);
-        // 收集所有匹配项
-        while (matcher.find()) {
-            matches.add(matcher.group());
-        }
-        // 提取字母并按字母顺序排序
-        return matches.stream()
-                .map(s -> s.replaceAll("[^A-F]", ""))
-                .distinct()
-                .sorted()
-                .map(letter -> letter + "级")
-                .collect(Collectors.toList());
-    }
-
-    /**
-     * 匹配报告日期
-     *
-     * @param string 文本内容
-     * @return 报告日期
-     */
-    private String matchReportDate(String string) {
-        if (string == null) {
-            return null;
-        }
-        // 编译正则表达式模式
-        Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度");  // 2023年XXX3季度
-        Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");  // 2023-12-31
-        Pattern pat3 = Pattern.compile("(2\\d{3})年年度");  // 2023年年度
-        Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月");  // 2023年12月
-        Pattern pat5 = Pattern.compile("\\d{4}\\d{2}\\d{2}");  // 20231231
-        Pattern pat6 = Pattern.compile("(2\\d{3})年度");  // 2023年度
-        // 创建Matcher对象
-        Matcher matcher1 = pat1.matcher(string);
-        Matcher matcher2 = pat2.matcher(string);
-        Matcher matcher3 = pat3.matcher(string);
-        Matcher matcher4 = pat4.matcher(string);
-        Matcher matcher5 = pat5.matcher(string);
-        Matcher matcher6 = pat6.matcher(string);
-        // 尝试匹配
-        if (matcher1.find()) {
-            String year = matcher1.group(1);
-            String quarter = matcher1.group(2);
-            return switch (quarter) {
-                case "一", "1" -> year + "-03-31";
-                case "二", "2" -> year + "-06-30";
-                case "三", "3" -> year + "-09-30";
-                case "四", "4" -> year + "-12-31";
-                default -> null;
-            };
-        } else if (matcher2.find()) {
-            return matcher2.group();
-        } else if (matcher5.find()) {
-            return matcher5.group();
-        } else if (matcher3.find()) {
-            return matcher3.group(1) + "-12-31";
-        } else if (matcher6.find()) {
-            return matcher6.group(1) + "-12-31";
-        } else if (matcher4.find()) {
-            String year = matcher4.group(1);
-            String month = matcher4.group(2);
-            int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
-            return year + "-" + padZero(month) + "-" + padZero(lastDayOfMonth + "");
-        } else {
-            return null;
-        }
-    }
-
-    /**
-     * 匹配报告类型,如“季度”、“年度”
-     *
-     * @param string 输入字符串
-     * @return 匹配到的报告类型子字符串,如果没有匹配到则返回null
-     */
-    private String matchReportType(String string) {
-        if (string == null) {
-            return null;
-        }
-        // 所有报告的正则识别方式
-        String patterns = ReportType.getAllPatterns();
-        // 编译正则表达式模式
-        Pattern pattern = Pattern.compile(patterns);
-        // 创建Matcher对象
-        Matcher matcher = pattern.matcher(string);
-        // 尝试匹配
-        if (matcher.find()) {
-            return matcher.group();
-        } else {
-            return null;
-        }
-    }
-
-    private int getLastDayOfMonth(int year, int month) {
-        Calendar calendar = Calendar.getInstance();
-        calendar.set(Calendar.YEAR, year);
-        calendar.set(Calendar.MONTH, month - 1); // Calendar.MONTH 是从0开始的
-        return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
-    }
-
-    private String padZero(String number) {
-        return String.format("%02d", Integer.parseInt(number));
-    }
+//    /**
+//     * 匹配分级基金名称
+//     *
+//     * @param text 文本内容
+//     * @return /
+//     */
+//    protected List<String> matchTieredFund(String text) {
+//        List<String> matches = ListUtil.list(false);
+//        if (StrUtil.isBlank(text)) {
+//            return matches;
+//        }
+//        // 使用正则表达式查找匹配项
+//        Pattern pattern = Pattern.compile("[A-F]级|基金[A-F]");
+//        Matcher matcher = pattern.matcher(text);
+//        // 收集所有匹配项
+//        while (matcher.find()) {
+//            matches.add(matcher.group());
+//        }
+//        // 提取字母并按字母顺序排序
+//        return matches.stream()
+//                .map(s -> s.replaceAll("[^A-F]", ""))
+//                .distinct()
+//                .sorted()
+//                .map(letter -> letter + "级")
+//                .collect(Collectors.toList());
+//    }
+//
+//    /**
+//     * 匹配报告日期
+//     *
+//     * @param string 文本内容
+//     * @return 报告日期
+//     */
+//    private String matchReportDate(String string) {
+//        if (string == null) {
+//            return null;
+//        }
+//        // 编译正则表达式模式
+//        Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度");  // 2023年XXX3季度
+//        Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");  // 2023-12-31
+//        Pattern pat3 = Pattern.compile("(2\\d{3})年年度");  // 2023年年度
+//        Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月");  // 2023年12月
+//        Pattern pat5 = Pattern.compile("\\d{4}\\d{2}\\d{2}");  // 20231231
+//        Pattern pat6 = Pattern.compile("(2\\d{3})年度");  // 2023年度
+//        // 创建Matcher对象
+//        Matcher matcher1 = pat1.matcher(string);
+//        Matcher matcher2 = pat2.matcher(string);
+//        Matcher matcher3 = pat3.matcher(string);
+//        Matcher matcher4 = pat4.matcher(string);
+//        Matcher matcher5 = pat5.matcher(string);
+//        Matcher matcher6 = pat6.matcher(string);
+//        // 尝试匹配
+//        if (matcher1.find()) {
+//            String year = matcher1.group(1);
+//            String quarter = matcher1.group(2);
+//            return switch (quarter) {
+//                case "一", "1" -> year + "-03-31";
+//                case "二", "2" -> year + "-06-30";
+//                case "三", "3" -> year + "-09-30";
+//                case "四", "4" -> year + "-12-31";
+//                default -> null;
+//            };
+//        } else if (matcher2.find()) {
+//            return matcher2.group();
+//        } else if (matcher5.find()) {
+//            return matcher5.group();
+//        } else if (matcher3.find()) {
+//            return matcher3.group(1) + "-12-31";
+//        } else if (matcher6.find()) {
+//            return matcher6.group(1) + "-12-31";
+//        } else if (matcher4.find()) {
+//            String year = matcher4.group(1);
+//            String month = matcher4.group(2);
+//            int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
+//            return year + "-" + padZero(month) + "-" + padZero(lastDayOfMonth + "");
+//        } else {
+//            return null;
+//        }
+//    }
+//
+//    /**
+//     * 匹配报告类型,如“季度”、“年度”
+//     *
+//     * @param string 输入字符串
+//     * @return 匹配到的报告类型子字符串,如果没有匹配到则返回null
+//     */
+//    private String matchReportType(String string) {
+//        if (string == null) {
+//            return null;
+//        }
+//        // 所有报告的正则识别方式
+//        String patterns = ReportType.getAllPatterns();
+//        // 编译正则表达式模式
+//        Pattern pattern = Pattern.compile(patterns);
+//        // 创建Matcher对象
+//        Matcher matcher = pattern.matcher(string);
+//        // 尝试匹配
+//        if (matcher.find()) {
+//            return matcher.group();
+//        } else {
+//            return null;
+//        }
+//    }
+//
+//    private int getLastDayOfMonth(int year, int month) {
+//        Calendar calendar = Calendar.getInstance();
+//        calendar.set(Calendar.YEAR, year);
+//        calendar.set(Calendar.MONTH, month - 1); // Calendar.MONTH 是从0开始的
+//        return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
+//    }
+//
+//    private String padZero(String number) {
+//        return String.format("%02d", Integer.parseInt(number));
+//    }
 }

+ 5 - 4
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDAnnuallyReportParser.java

@@ -5,6 +5,7 @@ import cn.hutool.core.collection.ListUtil;
 import cn.hutool.core.map.MapUtil;
 import com.simuwang.base.mapper.EmailFieldMappingMapper;
 import com.simuwang.base.pojo.dto.report.*;
+import com.simuwang.daq.components.ReportParseUtils;
 import com.simuwang.daq.components.report.parser.ReportParserConstant;
 import org.springframework.stereotype.Component;
 import technology.tabula.Table;
@@ -127,7 +128,7 @@ public class PDAnnuallyReportParser extends PDQuarterlyReportParser<AnnuallyRepo
     private List<ReportFinancialIndicatorsDTO> buildFinancialIndicatorsInfo(Integer fileId) {
         List<ReportFinancialIndicatorsDTO> dtos = ListUtil.list(false);
         // 分级基金
-        List<String> levels = this.matchTieredFund(String.join(",", this.textList));
+        List<String> levels = ReportParseUtils.matchTieredFund(String.join(",", this.textList));
         levels.add(0, "母基金");
         // 假设这里可能存在分级基金,不存在表格跨页
         for (int k = 0; k < this.financialIndicatorsTables.size(); k++) {
@@ -135,14 +136,14 @@ public class PDAnnuallyReportParser extends PDQuarterlyReportParser<AnnuallyRepo
             int colCount = table.getColCount();
             for (int j = 1; j < colCount; j++) {
                 Map<String, Object> infoMap = MapUtil.newHashMap(16);
-                String year = this.cleaningValue(table.getCell(0, j).getText());
+                String year = ReportParseUtils.cleaningValue(table.getCell(0, j).getText());
                 infoMap.put("年度", year);
                 for (int i = 0; i < table.getRowCount(); i++) {
-                    String columnName = this.cleaningValue(table.getCell(i, 0).getText());
+                    String columnName = ReportParseUtils.cleaningValue(table.getCell(i, 0).getText());
                     if (!CollUtil.contains(FINANCIAL_INDICATORS_COLUMN_NAMES, columnName)) {
                         continue;
                     }
-                    String value = this.cleaningValue(table.getCell(i, j).getText());
+                    String value = ReportParseUtils.cleaningValue(table.getCell(i, j).getText());
                     infoMap.put(columnName, value);
                 }
                 ReportFinancialIndicatorsDTO dto = new ReportFinancialIndicatorsDTO(fileId);

+ 9 - 8
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDQuarterlyReportParser.java

@@ -6,6 +6,7 @@ import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.StrUtil;
 import com.simuwang.base.mapper.EmailFieldMappingMapper;
 import com.simuwang.base.pojo.dto.report.*;
+import com.simuwang.daq.components.ReportParseUtils;
 import com.simuwang.daq.components.report.parser.ReportParserConstant;
 import org.springframework.stereotype.Component;
 import technology.tabula.RectangularTextContainer;
@@ -208,15 +209,15 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
             int j = colCount == 4 ? 1 : 0;
             // 按行遍历
             for (int i = 0; i < table.getRowCount(); i++) {
-                String text = this.cleaningValue(table.getCell(i, 0).getText());
+                String text = ReportParseUtils.cleaningValue(table.getCell(i, 0).getText());
                 if (StrUtil.containsAny(text, "序号", "行业类别")) {
                     continue;
                 }
                 ReportInvestmentIndustryDTO dto = new ReportInvestmentIndustryDTO(fileId);
                 dto.setInvestType(investType);
-                dto.setIndustryName(this.cleaningValue(table.getCell(i, j).getText()));
-                dto.setMarketValue(this.cleaningValue(table.getCell(i, j + 1).getText()));
-                dto.setRatio(this.cleaningValue(table.getCell(i, j + 2).getText()));
+                dto.setIndustryName(ReportParseUtils.cleaningValue(table.getCell(i, j).getText()));
+                dto.setMarketValue(ReportParseUtils.cleaningValue(table.getCell(i, j + 1).getText()));
+                dto.setRatio(ReportParseUtils.cleaningValue(table.getCell(i, j + 2).getText()));
                 dtos.add(dto);
             }
         }
@@ -238,17 +239,17 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
                 // x坐标升序(防止部分行乱序问题)
                 row.sort(Comparator.comparing(Rectangle2D.Float::getX));
                 // 大类
-                String type = this.cleaningValue(row.get(0).getText());
+                String type = ReportParseUtils.cleaningValue(row.get(0).getText());
                 if (StrUtil.isNotBlank(type)) {
                     assetType = type;
                 }
                 // 金额、市值,有时是 “备注#金额”的格式
-                String marketValueAndRemark = this.cleaningValue(row.get(2).getText());
+                String marketValueAndRemark = ReportParseUtils.cleaningValue(row.get(2).getText());
                 if (StrUtil.isBlank(marketValueAndRemark) || StrUtil.isBlank(assetType)) {
                     continue;
                 }
                 // 资产明细
-                String detail = this.cleaningValue(row.get(1).getText(), false);
+                String detail = ReportParseUtils.cleaningValue(row.get(1).getText(), false);
                 if (StrUtil.contains(marketValueAndRemark, "#")) {
                     // 有#表示有备注,而且可能有多个,多个用分号分隔的.
                     List<String> marketValueAndRemarks = StrUtil.split(marketValueAndRemark, ";");
@@ -286,7 +287,7 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
     protected List<String> getTableColTexts(Table table, Integer col) {
         List<String> details = ListUtil.list(false);
         for (@SuppressWarnings("all") List<RectangularTextContainer> row : table.getRows()) {
-            String detail = this.cleaningValue(row.get(col).getText(), false);
+            String detail = ReportParseUtils.cleaningValue(row.get(col).getText(), false);
             if (StrUtil.isNotBlank(detail)) {
                 details.add(detail);
             }

+ 27 - 0
service-daq/src/main/java/com/simuwang/daq/utils/ExcelReportParseUtil.java

@@ -0,0 +1,27 @@
+package com.simuwang.daq.utils;
+
+import cn.hutool.core.collection.ListUtil;
+import com.alibaba.excel.EasyExcel;
+import com.alibaba.excel.read.builder.ExcelReaderBuilder;
+import com.simuwang.base.pojo.dto.report.SimpleTable;
+import com.simuwang.daq.components.CustomExcelMultiSheetListener;
+
+import java.util.List;
+
+/**
+ * Ad-hoc driver that reads all sheets of a periodic-report workbook with EasyExcel and
+ * prints every table collected by {@link CustomExcelMultiSheetListener}.
+ */
+public class ExcelReportParseUtil {
+    /**
+     * Default workbook location, used when no path is supplied on the command line.
+     * NOTE(review): this is an absolute path on a single developer machine; on any other
+     * machine pass the workbook path as the first program argument instead.
+     */
+    public static final String filepath = "D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\PB0001私募月报(证券投资).xlsx";
+
+    /**
+     * Reads the workbook and prints each parsed table to standard output.
+     *
+     * @param args optional; {@code args[0]}, when present and non-blank, is used as the
+     *             workbook path instead of {@link #filepath}
+     */
+    public static void main(String[] args) {
+        // Allow the workbook to be chosen at launch so the driver is usable off the author's machine.
+        boolean pathSupplied = args != null && args.length > 0 && args[0] != null && !args[0].isBlank();
+        String path = pathSupplied ? args[0] : filepath;
+
+        CustomExcelMultiSheetListener readListener = new CustomExcelMultiSheetListener();
+        ExcelReaderBuilder readerBuilder = EasyExcel.read(path);
+        // readerBuilder.sheet() is intentionally not called: doReadAll() below reads every sheet
+        // itself, and sheet() would build a second reader whose result is never used or closed.
+        // presumably these are the sheet names the listener should collect — confirm against
+        // CustomExcelMultiSheetListener
+        readerBuilder.customObject(ListUtil.toList("基金概况", "净值月报", "级基金净值表"));
+        readerBuilder.registerReadListener(readListener);
+        readerBuilder.doReadAll();
+
+        List<SimpleTable> tables = readListener.getTables();
+        for (SimpleTable table : tables) {
+            System.out.println(table);
+        }
+    }
+}

BIN
service-daq/src/main/java/com/simuwang/daq/utils/PB0001私募月报(证券投资).xlsx


BIN
service-daq/src/main/java/com/simuwang/daq/utils/PB0003私募年报(证券投资).xlsx