123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258 |
- package com.simuwang.daq.components;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.collection.ListUtil;
import cn.hutool.core.map.MapUtil;
import cn.hutool.core.util.ReflectUtil;
import cn.hutool.core.util.StrUtil;
import com.simuwang.base.common.exception.APIException;
import com.simuwang.base.mapper.EmailFieldMappingMapper;
import com.simuwang.base.pojo.dos.EmailFieldMappingDO;
import com.simuwang.daq.dto.ReportFundInfo;
import com.simuwang.daq.dto.ReportInfo;
import com.smppw.common.pojo.ValueLabelVO;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Component;
import technology.tabula.*;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;

import java.io.IOException;
import java.sql.Struct;
import java.time.YearMonth;
import java.util.Calendar;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
- /**
- * @author wangzaijun
- * @date 2024/9/11 16:19
- * @description pdf格式的月报解析
- */
- @Component("monthly-report:pdf")
- public class PDMonthlyReportParser extends AbstractReportParser {
- private final EmailFieldMappingMapper fieldMappingMapper;
- private String reportName = null;
- private Table baseInfoTable = null;
- private List<Table> extNavTables = ListUtil.list(true);
- private List<ValueLabelVO> fieldMapper = null;
- public PDMonthlyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
- this.fieldMappingMapper = fieldMappingMapper;
- }
- @Override
- protected void initParse() throws IOException {
- try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(this.filepath))) {
- CustomPDFTextStripper stripper = new CustomPDFTextStripper();
- stripper.setSortByPosition(true);
- String text = stripper.getText(document);
- text = text.replace("++\r\n", "").replace("++", "");
- List<String> textList = StrUtil.split(text, "\r\n");
- if (CollUtil.isNotEmpty(textList)) {
- List<String> wkList = this.watermarkListMap.get("report_name");
- String name = this.processString(wkList, textList.get(0));
- this.reportName = this.matchReportName(name);
- if (StrUtil.isBlank(this.reportName)) {
- throw new APIException("未匹配到报告名称");
- }
- }
- SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
- PageIterator pageIterator = new ObjectExtractor(document).extract();
- while (pageIterator.hasNext()) {
- Page page = pageIterator.next();
- List<Table> tables = extractionAlgorithm.extract(page);
- tables = tables.stream().distinct().collect(Collectors.toList());
- for (Table table : tables) {
- int colCount = table.getColCount();
- if (colCount == 4) {
- this.baseInfoTable = table;
- } else if (colCount >= 5) {
- this.extNavTables.add(table);
- }
- }
- }
- }
- List<EmailFieldMappingDO> emailFieldMapping = this.fieldMappingMapper.getEmailFieldMapping();
- if (CollUtil.isNotEmpty(emailFieldMapping)) {
- this.fieldMapper = emailFieldMapping.stream().map(e -> new ValueLabelVO(e.getCode(), e.getName())).collect(Collectors.toList());
- }
- }
- @Override
- protected ReportInfo parseReportInfo(Integer fileId) {
- ReportInfo reportInfo = new ReportInfo();
- reportInfo.setFileId(fileId);
- reportInfo.setReportName(this.reportName);
- reportInfo.setReportType(this.matchReportType(this.reportName));
- reportInfo.setReportDate(this.matchReportDate(this.reportName));
- return reportInfo;
- }
- @Override
- protected ReportFundInfo parseBaseInfo() {
- Table baseInfoTable = this.baseInfoTable;
- if (baseInfoTable == null) {
- throw new APIException("未解析到基本信息表格");
- }
- Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
- for (int i = 0; i < baseInfoTable.getRows().size(); i++) {
- List<RectangularTextContainer> cols = baseInfoTable.getRows().get(i);
- for (int j = 0; j < 2; j++) {
- baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
- }
- }
- // 匹配字段清洗字段
- ReportFundInfo reportFundInfo = new ReportFundInfo();
- baseInfoMap.forEach((k, v) -> {
- String fieldValue = StrUtil.toStringOrNull(v);
- if (fieldValue.contains("-")) {
- fieldValue = null;
- }
- if (fieldValue != null) {
- fieldValue = fieldValue.replace("\r", "");
- }
- for (ValueLabelVO vo : this.fieldMapper) {
- String fieldName = vo.getValue();
- List<String> labels = StrUtil.split(vo.getLabel(), ",");
- if (labels.contains(k)) {
- ReflectUtil.setFieldValue(reportFundInfo, fieldName, fieldValue);
- break;
- }
- for (String label : labels) {
- if (k.contains(label)) {
- ReflectUtil.setFieldValue(reportFundInfo, fieldName, fieldValue);
- break;
- }
- }
- }
- });
- return reportFundInfo;
- }
- @Override
- protected void parseExtInfo() {
- }
- @Override
- protected void saveResult() {
- }
- /**
- * 匹配报告日期
- *
- * @param string 文本内容
- * @return 报告日期
- */
- private String matchReportDate(String string) {
- if (string == null) {
- return null;
- }
- // 编译正则表达式模式
- Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度"); // 2023年XXX3季度
- Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}"); // 2023-12-31
- Pattern pat3 = Pattern.compile("(2\\d{3})年年度"); // 2023年年度
- Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月"); // 2023年12月
- // 创建Matcher对象
- Matcher matcher1 = pat1.matcher(string);
- Matcher matcher2 = pat2.matcher(string);
- Matcher matcher3 = pat3.matcher(string);
- Matcher matcher4 = pat4.matcher(string);
- // 尝试匹配
- if (matcher1.find()) {
- String year = matcher1.group(1);
- String quarter = matcher1.group(2);
- return switch (quarter) {
- case "一", "1" -> year + "-03-31";
- case "二", "2" -> year + "-06-30";
- case "三", "3" -> year + "-09-30";
- case "四", "4" -> year + "-12-31";
- default -> null;
- };
- } else if (matcher2.find()) {
- return matcher2.group();
- } else if (matcher3.find()) {
- return matcher3.group(1) + "-12-31";
- } else if (matcher4.find()) {
- String year = matcher4.group(1);
- String month = matcher4.group(2);
- int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
- return year + "-" + padZero(month) + "-" + padZero(lastDayOfMonth + "");
- } else {
- return null;
- }
- }
- /**
- * 匹配报告类型,如“季度”、“年度”
- *
- * @param string 输入字符串
- * @return 匹配到的报告类型子字符串,如果没有匹配到则返回null
- */
- private String matchReportType(String string) {
- if (string == null) {
- return null;
- }
- // 编译正则表达式模式
- Pattern pattern = Pattern.compile("月|季度|年度");
- // 创建Matcher对象
- Matcher matcher = pattern.matcher(string);
- // 尝试匹配
- if (matcher.find()) {
- return matcher.group();
- } else {
- return null;
- }
- }
- private String matchReportName(String text) {
- if (StrUtil.isBlank(text)) {
- return null;
- }
- // 编译正则表达式模式
- Pattern pat1 = Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?");
- Pattern pat2 = Pattern.compile("私募.*披露年度报[告表]((\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}))?");
- Pattern pat3 = Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}?");
- // 创建Matcher对象
- Matcher matcher1 = pat1.matcher(text);
- Matcher matcher2 = pat2.matcher(text);
- Matcher matcher3 = pat3.matcher(text);
- // 尝试匹配
- String reportName;
- if (matcher1.find()) {
- reportName = matcher1.group();
- } else if (matcher2.find()) {
- reportName = matcher2.group();
- } else if (matcher3.find()) {
- reportName = matcher3.group();
- } else {
- return null;
- }
- return reportName.replace("(", "(").replace(")", ")");
- }
- private int getLastDayOfMonth(int year, int month) {
- Calendar calendar = Calendar.getInstance();
- calendar.set(Calendar.YEAR, year);
- calendar.set(Calendar.MONTH, month - 1); // Calendar.MONTH 是从0开始的
- return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
- }
- private String padZero(String number) {
- return String.format("%02d", Integer.parseInt(number));
- }
- }
|