package com.simuwang.daq.components; import cn.hutool.core.collection.CollUtil; import cn.hutool.core.collection.ListUtil; import cn.hutool.core.map.MapUtil; import cn.hutool.core.util.ReflectUtil; import cn.hutool.core.util.StrUtil; import com.simuwang.base.common.exception.APIException; import com.simuwang.base.mapper.EmailFieldMappingMapper; import com.simuwang.base.pojo.dos.EmailFieldMappingDO; import com.simuwang.daq.dto.ReportFundInfo; import com.simuwang.daq.dto.ReportInfo; import com.smppw.common.pojo.ValueLabelVO; import org.apache.pdfbox.Loader; import org.apache.pdfbox.io.RandomAccessReadBufferedFile; import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.stereotype.Component; import technology.tabula.*; import technology.tabula.extractors.SpreadsheetExtractionAlgorithm; import java.io.IOException; import java.sql.Struct; import java.util.Calendar; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; /** * @author wangzaijun * @date 2024/9/11 16:19 * @description pdf格式的月报解析 */ @Component("monthly-report:pdf") public class PDMonthlyReportParser extends AbstractReportParser { private final EmailFieldMappingMapper fieldMappingMapper; private String reportName = null; private Table baseInfoTable = null; private List extNavTables = ListUtil.list(true); private List fieldMapper = null; public PDMonthlyReportParser(EmailFieldMappingMapper fieldMappingMapper) { this.fieldMappingMapper = fieldMappingMapper; } @Override protected void initParse() throws IOException { try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(this.filepath))) { CustomPDFTextStripper stripper = new CustomPDFTextStripper(); stripper.setSortByPosition(true); String text = stripper.getText(document); text = text.replace("++\r\n", "").replace("++", ""); List textList = StrUtil.split(text, "\r\n"); if (CollUtil.isNotEmpty(textList)) { List wkList = this.watermarkListMap.get("report_name"); String name = this.processString(wkList, textList.get(0)); this.reportName = this.matchReportName(name); if (StrUtil.isBlank(this.reportName)) { throw new APIException("未匹配到报告名称"); } } SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm(); PageIterator pageIterator = new ObjectExtractor(document).extract(); while (pageIterator.hasNext()) { Page page = pageIterator.next(); List
tables = extractionAlgorithm.extract(page); tables = tables.stream().distinct().collect(Collectors.toList()); for (Table table : tables) { int colCount = table.getColCount(); if (colCount == 4) { this.baseInfoTable = table; } else if (colCount >= 5) { this.extNavTables.add(table); } } } } List emailFieldMapping = this.fieldMappingMapper.getEmailFieldMapping(); if (CollUtil.isNotEmpty(emailFieldMapping)) { this.fieldMapper = emailFieldMapping.stream().map(e -> new ValueLabelVO(e.getCode(), e.getName())).collect(Collectors.toList()); } } @Override protected ReportInfo parseReportInfo(Integer fileId) { ReportInfo reportInfo = new ReportInfo(); reportInfo.setFileId(fileId); reportInfo.setReportName(this.reportName); reportInfo.setReportType(this.matchReportType(this.reportName)); reportInfo.setReportDate(this.matchReportDate(this.reportName)); return reportInfo; } @Override protected ReportFundInfo parseBaseInfo() { Table baseInfoTable = this.baseInfoTable; if (baseInfoTable == null) { throw new APIException("未解析到基本信息表格"); } Map baseInfoMap = MapUtil.newHashMap(32); for (int i = 0; i < baseInfoTable.getRows().size(); i++) { List cols = baseInfoTable.getRows().get(i); for (int j = 0; j < 2; j++) { baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText()); } } // 匹配字段清洗字段 ReportFundInfo reportFundInfo = new ReportFundInfo(); baseInfoMap.forEach((k, v) -> { String fieldValue = StrUtil.toStringOrNull(v); if (fieldValue.contains("-")) { fieldValue = null; } if (fieldValue != null) { fieldValue = fieldValue.replace("\r", ""); } for (ValueLabelVO vo : this.fieldMapper) { String fieldName = vo.getValue(); List labels = StrUtil.split(vo.getLabel(), ","); if (labels.contains(k)) { ReflectUtil.setFieldValue(reportFundInfo, fieldName, fieldValue); break; } for (String label : labels) { if (k.contains(label)) { ReflectUtil.setFieldValue(reportFundInfo, fieldName, fieldValue); break; } } } }); return reportFundInfo; } @Override protected void parseExtInfo() { } @Override protected void saveResult() { } /** * 匹配报告日期 * * @param string 文本内容 * @return 报告日期 */ private String matchReportDate(String string) { if (string == null) { return null; } // 编译正则表达式模式 Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度"); // 2023年XXX3季度 Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}"); // 2023-12-31 Pattern pat3 = Pattern.compile("(2\\d{3})年年度"); // 2023年年度 Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月"); // 2023年12月 // 创建Matcher对象 Matcher matcher1 = pat1.matcher(string); Matcher matcher2 = pat2.matcher(string); Matcher matcher3 = pat3.matcher(string); Matcher matcher4 = pat4.matcher(string); // 尝试匹配 if (matcher1.find()) { String year = matcher1.group(1); String quarter = matcher1.group(2); return switch (quarter) { case "一", "1" -> year + "-03-31"; case "二", "2" -> year + "-06-30"; case "三", "3" -> year + "-09-30"; case "四", "4" -> year + "-12-31"; default -> null; }; } else if (matcher2.find()) { return matcher2.group(); } else if (matcher3.find()) { return matcher3.group(1) + "-12-31"; } else if (matcher4.find()) { String year = matcher4.group(1); String month = matcher4.group(2); int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month)); return year + "-" + padZero(month) + "-" + padZero(lastDayOfMonth + ""); } else { return null; } } /** * 匹配报告类型,如“季度”、“年度” * * @param string 输入字符串 * @return 匹配到的报告类型子字符串,如果没有匹配到则返回null */ private String matchReportType(String string) { if (string == null) { return null; } // 编译正则表达式模式 Pattern pattern = Pattern.compile("月|季度|年度"); // 创建Matcher对象 Matcher matcher = pattern.matcher(string); // 尝试匹配 if (matcher.find()) { return matcher.group(); } else { return null; } } private String matchReportName(String text) { if (StrUtil.isBlank(text)) { return null; } // 编译正则表达式模式 Pattern pat1 = Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?"); Pattern pat2 = Pattern.compile("私募.*披露年度报[告表]((\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}))?"); Pattern pat3 = Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}?"); // 创建Matcher对象 Matcher matcher1 = pat1.matcher(text); Matcher matcher2 = pat2.matcher(text); Matcher matcher3 = pat3.matcher(text); // 尝试匹配 String reportName; if (matcher1.find()) { reportName = matcher1.group(); } else if (matcher2.find()) { reportName = matcher2.group(); } else if (matcher3.find()) { reportName = matcher3.group(); } else { return null; } return reportName.replace("(", "(").replace(")", ")"); } private int getLastDayOfMonth(int year, int month) { Calendar calendar = Calendar.getInstance(); calendar.set(Calendar.YEAR, year); calendar.set(Calendar.MONTH, month - 1); // Calendar.MONTH 是从0开始的 return calendar.getActualMaximum(Calendar.DAY_OF_MONTH); } private String padZero(String number) { return String.format("%02d", Integer.parseInt(number)); } }