|
@@ -22,9 +22,9 @@ import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
|
|
|
|
|
import java.io.IOException;
|
|
import java.io.IOException;
|
|
import java.util.Calendar;
|
|
import java.util.Calendar;
|
|
|
|
+import java.util.HashMap;
|
|
import java.util.List;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import java.util.Map;
|
|
-import java.util.Objects;
|
|
|
|
import java.util.function.Function;
|
|
import java.util.function.Function;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
import java.util.regex.Pattern;
|
|
@@ -56,6 +56,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
this.textList = null;
|
|
this.textList = null;
|
|
// 初始化
|
|
// 初始化
|
|
this.init();
|
|
this.init();
|
|
|
|
+ String filename = params.getFilename();
|
|
// 解析报告和表格
|
|
// 解析报告和表格
|
|
try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
|
|
try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
|
|
// 识别所有文字(去水印后的)
|
|
// 识别所有文字(去水印后的)
|
|
@@ -65,11 +66,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
this.textList = StrUtil.split(text, System.lineSeparator());
|
|
this.textList = StrUtil.split(text, System.lineSeparator());
|
|
this.textList.removeIf(StrUtil::isBlank);
|
|
this.textList.removeIf(StrUtil::isBlank);
|
|
if (this.textList.isEmpty()) {
|
|
if (this.textList.isEmpty()) {
|
|
- throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN);
|
|
|
|
- }
|
|
|
|
- // 报告名称和类型一般在第一第二行
|
|
|
|
- if (this.matchReportType(this.textList.get(0)) == null && this.matchReportType(this.textList.get(1)) == null) {
|
|
|
|
- throw new ReportParseException(ReportParseStatus.NOT_A_REPORT);
|
|
|
|
|
|
+ throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN, filename);
|
|
}
|
|
}
|
|
// 解析所有表格(单元格字符去水印)
|
|
// 解析所有表格(单元格字符去水印)
|
|
List<Table> tables = ListUtil.list(true);
|
|
List<Table> tables = ListUtil.list(true);
|
|
@@ -81,7 +78,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
tables.addAll(extractionAlgorithm.extract(page));
|
|
tables.addAll(extractionAlgorithm.extract(page));
|
|
}
|
|
}
|
|
if (tables.isEmpty()) {
|
|
if (tables.isEmpty()) {
|
|
- throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN);
|
|
|
|
|
|
+ throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN, filename);
|
|
}
|
|
}
|
|
this.initTableInfo(tables);
|
|
this.initTableInfo(tables);
|
|
}
|
|
}
|
|
@@ -99,7 +96,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
throw e;
|
|
throw e;
|
|
} catch (Exception e) {
|
|
} catch (Exception e) {
|
|
this.logger.warn("报告解析错误:{}", ExceptionUtil.stacktraceToString(e));
|
|
this.logger.warn("报告解析错误:{}", ExceptionUtil.stacktraceToString(e));
|
|
- throw new ReportParseException(ReportParseStatus.NOT_A_FIXED_FORMAT);
|
|
|
|
|
|
+ throw new ReportParseException(ReportParseStatus.NOT_A_FIXED_FORMAT, filename);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
@@ -119,10 +116,11 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
protected ReportFundInfoDTO buildFundInfo(ReportParserParams params) {
|
|
protected ReportFundInfoDTO buildFundInfo(ReportParserParams params) {
|
|
Table fundInfoTable = this.fundInfoTable;
|
|
Table fundInfoTable = this.fundInfoTable;
|
|
if (fundInfoTable == null) {
|
|
if (fundInfoTable == null) {
|
|
- throw new ReportParseException(ReportParseStatus.PARSE_FUND_INFO_FAIL);
|
|
|
|
|
|
+ throw new ReportParseException(ReportParseStatus.PARSE_FUND_INFO_FAIL, params.getFilename());
|
|
}
|
|
}
|
|
// 基金基本信息映射
|
|
// 基金基本信息映射
|
|
- return this.buildDto(params.getFileId(), fundInfoTable, ReportFundInfoDTO.class, this::parseFundInfo);
|
|
|
|
|
|
+ Map<String, Object> extInfoMap = this.parseFundInfo(fundInfoTable);
|
|
|
|
+ return this.buildDto(params.getFileId(), ReportFundInfoDTO.class, extInfoMap);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
/**
|
|
@@ -164,7 +162,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
/**
|
|
- * 构建只有两列表格的dto数据对象,如果有分级基金时
|
|
|
|
|
|
+ * 构建只有两列表格的dto数据对象列表;有分级基金时按级别分别构建并设置级别(同一个表格可能跨页)
|
|
*
|
|
*
|
|
* @param <DTO> 泛型对象
|
|
* @param <DTO> 泛型对象
|
|
* @param fileId 文件id
|
|
* @param fileId 文件id
|
|
@@ -175,17 +173,37 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
*/
|
|
*/
|
|
protected <DTO extends BaseReportLevelDTO<?>> List<DTO> buildLevelDto(Integer fileId, List<Table> tables, Class<DTO> clazz,
|
|
protected <DTO extends BaseReportLevelDTO<?>> List<DTO> buildLevelDto(Integer fileId, List<Table> tables, Class<DTO> clazz,
|
|
Function<Table, Map<String, Object>> function) {
|
|
Function<Table, Map<String, Object>> function) {
|
|
- // 映射转换
|
|
|
|
- List<DTO> dtos = tables.stream().filter(Objects::nonNull)
|
|
|
|
- .map(e -> this.buildDto(fileId, e, clazz, function)).collect(Collectors.toList());
|
|
|
|
|
|
+ List<DTO> dtos = ListUtil.list(true);
|
|
|
|
+ // 信息表格字段和值映射
|
|
|
|
+ List<Map<String, Object>> infos = ListUtil.list(true);
|
|
|
|
+ Map<String, Object> infoMap = null;
|
|
|
|
+ for (Table table : tables) {
|
|
|
|
+ Map<String, Object> temp = function.apply(table);
|
|
|
|
+ for (String key : temp.keySet()) {
|
|
|
|
+ // 如果infoMap为null,先声明然后放在infos中
|
|
|
|
+ if (infoMap == null) {
|
|
|
|
+ infoMap = MapUtil.newHashMap(16);
|
|
|
|
+ infos.add(infoMap);
|
|
|
|
+ }
|
|
|
|
+ // 如果infoMap中包含了该key时,先放infos中然后重新声明新map对象
|
|
|
|
+ if (infoMap.containsKey(key)) {
|
|
|
|
+ infos.add(new HashMap<>(infoMap));
|
|
|
|
+ infoMap = MapUtil.newHashMap(16);
|
|
|
|
+ } else {
|
|
|
|
+ infoMap.put(key, temp.get(key));
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
// 分级基金匹配
|
|
// 分级基金匹配
|
|
List<String> levels = this.matchTieredFund(String.join(",", this.textList));
|
|
List<String> levels = this.matchTieredFund(String.join(",", this.textList));
|
|
levels.add(0, "母基金");
|
|
levels.add(0, "母基金");
|
|
- for (int i = 0; i < dtos.size(); i++) {
|
|
|
|
- if (levels.size() <= i) {
|
|
|
|
|
|
+ for (int i = 0; i < infos.size(); i++) {
|
|
|
|
+ DTO dto = this.buildDto(fileId, clazz, infos.get(i));
|
|
|
|
+ if (dto == null) {
|
|
continue;
|
|
continue;
|
|
}
|
|
}
|
|
- dtos.get(i).setLevel(levels.get(i));
|
|
|
|
|
|
+ dto.setLevel(levels.get(i));
|
|
|
|
+ dtos.add(dto);
|
|
}
|
|
}
|
|
return dtos;
|
|
return dtos;
|
|
}
|
|
}
|
|
@@ -193,20 +211,17 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
/**
|
|
/**
|
|
* 构建只有两列表格的dto数据对象
|
|
* 构建只有两列表格的dto数据对象
|
|
*
|
|
*
|
|
- * @param <DTO> 泛型对象
|
|
|
|
- * @param fileId 文件id
|
|
|
|
- * @param table 表格
|
|
|
|
- * @param clazz 泛型对象
|
|
|
|
- * @param function 表格转换的函数
|
|
|
|
|
|
+ * @param <DTO> 泛型对象
|
|
|
|
+ * @param fileId 文件id
|
|
|
|
+ * @param clazz 泛型对象
|
|
|
|
+ * @param infoMap 表格字段和值的映射
|
|
* @return /
|
|
* @return /
|
|
*/
|
|
*/
|
|
- private <DTO extends BaseReportDTO<?>> DTO buildDto(Integer fileId, Table table, Class<DTO> clazz,
|
|
|
|
- Function<Table, Map<String, Object>> function) {
|
|
|
|
|
|
+ private <DTO extends BaseReportDTO<?>> DTO buildDto(Integer fileId, Class<DTO> clazz, Map<String, Object> infoMap) {
|
|
try {
|
|
try {
|
|
- Map<String, Object> extInfoMap = function == null ? MapUtil.empty() : function.apply(table);
|
|
|
|
DTO dto = clazz.getDeclaredConstructor().newInstance();
|
|
DTO dto = clazz.getDeclaredConstructor().newInstance();
|
|
dto.setFileId(fileId);
|
|
dto.setFileId(fileId);
|
|
- this.buildInfo(extInfoMap, dto);
|
|
|
|
|
|
+ this.buildInfo(infoMap, dto);
|
|
return dto;
|
|
return dto;
|
|
} catch (Exception ignored) {
|
|
} catch (Exception ignored) {
|
|
}
|
|
}
|
|
@@ -303,18 +318,14 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
if (string == null) {
|
|
if (string == null) {
|
|
return null;
|
|
return null;
|
|
}
|
|
}
|
|
- // 所有报告的正则识别方式
|
|
|
|
- String patterns = ReportType.getAllPatterns();
|
|
|
|
- // 编译正则表达式模式
|
|
|
|
- Pattern pattern = Pattern.compile(patterns);
|
|
|
|
- // 创建Matcher对象
|
|
|
|
- Matcher matcher = pattern.matcher(string);
|
|
|
|
- // 尝试匹配
|
|
|
|
- if (matcher.find()) {
|
|
|
|
- return matcher.group();
|
|
|
|
- } else {
|
|
|
|
- return null;
|
|
|
|
|
|
+ // 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报
|
|
|
|
+ ReportType reportType = ReportType.MONTHLY;
|
|
|
|
+ if (StrUtil.containsAny(string, ReportType.QUARTERLY.getPatterns())) {
|
|
|
|
+ reportType = ReportType.QUARTERLY;
|
|
|
|
+ } else if (StrUtil.containsAny(string, ReportType.ANNUALLY.getPatterns())) {
|
|
|
|
+ reportType = ReportType.ANNUALLY;
|
|
}
|
|
}
|
|
|
|
+ return reportType.getLabel();
|
|
}
|
|
}
|
|
|
|
|
|
private int getLastDayOfMonth(int year, int month) {
|
|
private int getLastDayOfMonth(int year, int month) {
|