package com.simuwang.daq.components;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.collection.ListUtil;
import cn.hutool.core.map.MapUtil;
import cn.hutool.core.util.ReflectUtil;
import cn.hutool.core.util.StrUtil;
import com.simuwang.base.common.exception.APIException;
import com.simuwang.base.mapper.EmailFieldMappingMapper;
import com.simuwang.base.pojo.dos.EmailFieldMappingDO;
import com.simuwang.daq.dto.ReportFundInfo;
import com.simuwang.daq.dto.ReportInfo;
import com.smppw.common.pojo.ValueLabelVO;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Component;
import technology.tabula.*;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
import java.io.IOException;
import java.sql.Struct;
import java.util.Calendar;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Parser for monthly reports delivered as PDF files.
 *
 * <p>{@link #initParse()} reads the report title from the first line of the extracted text and
 * collects the tables found on every page; the remaining template methods turn that state into
 * {@link ReportInfo} and {@link ReportFundInfo} objects.
 *
 * <p>The parse state is kept in instance fields, so a single instance must not parse two files at
 * the same time. The state is reset at the start of every {@link #initParse()} call.
 *
 * @author wangzaijun
 * @date 2024/9/11 16:19
 */
@Component("monthly-report:pdf")
public class PDMonthlyReportParser extends AbstractReportParser {
    /** e.g. "2023年XXX3季度": year followed (anywhere later) by a quarter number. */
    private static final Pattern QUARTER_DATE = Pattern.compile("(2\\d{3}).*([一二三四1234])季度");
    /** e.g. "2023-12-31". */
    private static final Pattern ISO_DATE = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");
    /** e.g. "2023年年度". */
    private static final Pattern ANNUAL_DATE = Pattern.compile("(2\\d{3})年年度");
    /** e.g. "2023年12月". */
    private static final Pattern MONTH_DATE = Pattern.compile("(\\d{4})年(\\d{1,2})月");
    /** Report type keywords: month, quarter, year. */
    private static final Pattern REPORT_TYPE = Pattern.compile("月|季度|年度");
    /** Title ending in a compact date such as "...月报2023.12" or "...报告20231231". */
    private static final Pattern NAME_WITH_COMPACT_DATE =
            Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?");
    /**
     * Annual disclosure report title, optionally followed by a date range that may be wrapped in
     * ASCII or full-width (U+FF08 / U+FF09) parentheses.
     */
    private static final Pattern NAME_ANNUAL_DISCLOSURE = Pattern.compile(
            "私募.*披露年度报[告表]([(\\uFF08]?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}[)\\uFF09]?)?");
    /**
     * Title followed by a date range.
     * NOTE(review): every input this matches is also matched by NAME_WITH_COMPACT_DATE, which is
     * tried first, so this pattern is currently never reached. The order is kept as it was;
     * confirm whether the range is meant to be part of the report name.
     */
    private static final Pattern NAME_WITH_DATE_RANGE =
            Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}");

    private final EmailFieldMappingMapper fieldMappingMapper;
    /** Report title matched from the first text line of the PDF; null until parsed. */
    private String reportName = null;
    /** The last table with exactly four columns (label/value, label/value). */
    private Table baseInfoTable = null;
    /** Tables with five or more columns; collected here but not consumed in this class. */
    private final List<Table> extNavTables = ListUtil.list(true);
    /** Mapping entries: value = target field name, label = comma separated source labels. */
    private List<ValueLabelVO> fieldMapper = null;

    public PDMonthlyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
        this.fieldMappingMapper = fieldMappingMapper;
    }

    /**
     * Loads the PDF at {@code filepath}, extracts the report name and the tables, and loads the
     * label-to-field mapping.
     *
     * @throws IOException  if the PDF cannot be read
     * @throws APIException if text was extracted but no report name could be matched in it
     */
    @Override
    protected void initParse() throws IOException {
        // Drop whatever a previous run left behind so a reused instance never mixes two files.
        this.reportName = null;
        this.baseInfoTable = null;
        this.extNavTables.clear();
        this.fieldMapper = null;

        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(this.filepath))) {
            this.reportName = this.readReportName(document);
            this.collectTables(document);
        }

        List<EmailFieldMappingDO> emailFieldMapping = this.fieldMappingMapper.getEmailFieldMapping();
        if (CollUtil.isNotEmpty(emailFieldMapping)) {
            this.fieldMapper = emailFieldMapping.stream()
                    .map(e -> new ValueLabelVO(e.getCode(), e.getName()))
                    .collect(Collectors.toList());
        }
    }

    /**
     * Extracts the document text and matches the report name in its first line.
     *
     * @return the matched report name, or null when no text lines were extracted
     * @throws APIException if there is text but the first line yields no report name
     */
    private String readReportName(PDDocument document) throws IOException {
        CustomPDFTextStripper stripper = new CustomPDFTextStripper();
        stripper.setSortByPosition(true);
        String text = stripper.getText(document).replace("++\r\n", "").replace("++", "");
        List<String> lines = StrUtil.split(text, "\r\n");
        if (CollUtil.isEmpty(lines)) {
            return null;
        }
        String title = this.processString(this.watermarkListMap.get("report_name"), lines.get(0));
        String name = this.matchReportName(title);
        if (StrUtil.isBlank(name)) {
            throw new APIException("未匹配到报告名称");
        }
        return name;
    }

    /** Walks every page and sorts the detected tables by column count. */
    private void collectTables(PDDocument document) {
        SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
        PageIterator pageIterator = new ObjectExtractor(document).extract();
        while (pageIterator.hasNext()) {
            Page page = pageIterator.next();
            extractionAlgorithm.extract(page).stream().distinct().forEach(table -> {
                int colCount = table.getColCount();
                if (colCount == 4) {
                    // When several four-column tables exist the last one wins.
                    this.baseInfoTable = table;
                } else if (colCount >= 5) {
                    this.extNavTables.add(table);
                }
            });
        }
    }

    /**
     * Builds the report header from the parsed report name.
     *
     * @param fileId id of the file being parsed
     * @return report info whose type and date are null when they cannot be matched in the name
     */
    @Override
    protected ReportInfo parseReportInfo(Integer fileId) {
        ReportInfo reportInfo = new ReportInfo();
        reportInfo.setFileId(fileId);
        reportInfo.setReportName(this.reportName);
        reportInfo.setReportType(this.matchReportType(this.reportName));
        reportInfo.setReportDate(this.matchReportDate(this.reportName));
        return reportInfo;
    }

    /**
     * Reads the four-column base-info table as label/value pairs and copies the values into a
     * {@link ReportFundInfo} by reflection, using the label-to-field mapping.
     *
     * @return the populated fund info; no field is set when no mapping is available
     * @throws APIException if no four-column table was found by {@link #initParse()}
     */
    @Override
    protected ReportFundInfo parseBaseInfo() {
        Table table = this.baseInfoTable;
        if (table == null) {
            throw new APIException("未解析到基本信息表格");
        }
        // Each row holds two pairs: columns 0/1 and columns 2/3.
        Map<String, String> baseInfoMap = MapUtil.newHashMap(32);
        for (List<RectangularTextContainer> cols : table.getRows()) {
            for (int j = 0; j < 2 && j * 2 + 1 < cols.size(); j++) {
                baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
            }
        }

        List<ValueLabelVO> mappings = this.fieldMapper == null ? List.of() : this.fieldMapper;
        ReportFundInfo reportFundInfo = new ReportFundInfo();
        baseInfoMap.forEach((label, rawValue) -> {
            if (label == null) {
                return;
            }
            this.assignField(reportFundInfo, mappings, label, this.cleanValue(rawValue));
        });
        return reportFundInfo;
    }

    /**
     * Cleans one table value.
     * NOTE(review): any value containing "-" becomes null, which also discards hyphenated values
     * such as ISO dates. This is kept as it was; confirm whether only a bare "-" placeholder was
     * meant.
     */
    private String cleanValue(String rawValue) {
        if (rawValue == null || rawValue.contains("-")) {
            return null;
        }
        return rawValue.replace("\r", "");
    }

    /**
     * Writes {@code value} into every field whose mapping matches {@code key}. An exact label
     * match ends the scan; a substring match sets the field and the scan continues with the next
     * mapping entry.
     * NOTE(review): confirm whether a substring match should also end the scan.
     */
    private void assignField(ReportFundInfo target, List<ValueLabelVO> mappings, String key, String value) {
        for (ValueLabelVO vo : mappings) {
            String fieldName = vo.getValue();
            List<String> labels = StrUtil.split(vo.getLabel(), ",");
            if (labels.contains(key)) {
                ReflectUtil.setFieldValue(target, fieldName, value);
                break;
            }
            for (String label : labels) {
                // An empty label is contained in every key, so it must not count as a match.
                if (StrUtil.isNotEmpty(label) && key.contains(label)) {
                    ReflectUtil.setFieldValue(target, fieldName, value);
                    break;
                }
            }
        }
    }

    /** Currently a no-op in this parser. */
    @Override
    protected void parseExtInfo() {
    }

    /** Currently a no-op in this parser. */
    @Override
    protected void saveResult() {
    }

    /**
     * Matches the report date in a piece of text.
     *
     * <p>Tried in order: quarter (mapped to the quarter's last day), a literal yyyy-MM-dd date,
     * annual (mapped to 12-31), year and month (mapped to the month's last day).
     *
     * @param string text to search
     * @return the date as yyyy-MM-dd, or null when nothing matches or the month is not 1-12
     */
    private String matchReportDate(String string) {
        if (string == null) {
            return null;
        }
        Matcher quarterly = QUARTER_DATE.matcher(string);
        if (quarterly.find()) {
            String year = quarterly.group(1);
            return switch (quarterly.group(2)) {
                case "一", "1" -> year + "-03-31";
                case "二", "2" -> year + "-06-30";
                case "三", "3" -> year + "-09-30";
                case "四", "4" -> year + "-12-31";
                default -> null;
            };
        }
        Matcher isoDate = ISO_DATE.matcher(string);
        if (isoDate.find()) {
            return isoDate.group();
        }
        Matcher annual = ANNUAL_DATE.matcher(string);
        if (annual.find()) {
            return annual.group(1) + "-12-31";
        }
        Matcher monthly = MONTH_DATE.matcher(string);
        if (monthly.find()) {
            String year = monthly.group(1);
            String month = monthly.group(2);
            int monthNumber = Integer.parseInt(month);
            if (monthNumber < 1 || monthNumber > 12) {
                // "0月" or "13月" is not a date; do not fabricate one.
                return null;
            }
            int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), monthNumber);
            return year + "-" + padZero(month) + "-" + padZero(String.valueOf(lastDayOfMonth));
        }
        return null;
    }

    /**
     * Matches the report type keyword (month, quarter or year) in a piece of text.
     *
     * @param string text to search
     * @return the first keyword found, or null when there is none
     */
    private String matchReportType(String string) {
        if (string == null) {
            return null;
        }
        Matcher matcher = REPORT_TYPE.matcher(string);
        return matcher.find() ? matcher.group() : null;
    }

    /**
     * Matches the report name in a line of text and normalises full-width parentheses
     * (U+FF08 / U+FF09) to ASCII ones.
     *
     * @param text line to search
     * @return the report name, or null when the text is blank or nothing matches
     */
    private String matchReportName(String text) {
        if (StrUtil.isBlank(text)) {
            return null;
        }
        for (Pattern pattern : List.of(NAME_WITH_COMPACT_DATE, NAME_ANNUAL_DISCLOSURE, NAME_WITH_DATE_RANGE)) {
            Matcher matcher = pattern.matcher(text);
            if (matcher.find()) {
                return matcher.group().replace('\uFF08', '(').replace('\uFF09', ')');
            }
        }
        return null;
    }

    /**
     * Returns the number of days in the given month.
     *
     * @param year  calendar year
     * @param month month of the year, 1-12
     */
    private int getLastDayOfMonth(int year, int month) {
        Calendar calendar = Calendar.getInstance();
        // Start from a cleared calendar pinned to day 1: keeping today's day-of-month (29-31)
        // would roll a shorter target month over into the next one.
        calendar.clear();
        calendar.set(year, month - 1, 1); // Calendar.MONTH is zero-based
        return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
    }

    /** Formats a numeric string with at least two digits, e.g. "3" becomes "03". */
    private String padZero(String number) {
        return String.format("%02d", Integer.parseInt(number));
    }
}