PDMonthlyReportParser.java 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. package com.simuwang.daq.components;
  import cn.hutool.core.collection.CollUtil;
  import cn.hutool.core.collection.ListUtil;
  import cn.hutool.core.map.MapUtil;
  import cn.hutool.core.util.ReflectUtil;
  import cn.hutool.core.util.StrUtil;
  import com.simuwang.base.common.exception.APIException;
  import com.simuwang.base.mapper.EmailFieldMappingMapper;
  import com.simuwang.base.pojo.dos.EmailFieldMappingDO;
  import com.simuwang.daq.dto.ReportFundInfo;
  import com.simuwang.daq.dto.ReportInfo;
  import com.smppw.common.pojo.ValueLabelVO;
  import org.apache.pdfbox.Loader;
  import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
  import org.apache.pdfbox.pdmodel.PDDocument;
  import org.springframework.stereotype.Component;
  import technology.tabula.*;
  import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
  import java.io.IOException;
  import java.sql.Struct;
  import java.time.YearMonth;
  import java.util.Calendar;
  import java.util.LinkedHashMap;
  import java.util.List;
  import java.util.Map;
  import java.util.regex.Matcher;
  import java.util.regex.Pattern;
  import java.util.stream.Collectors;
  27. /**
  28. * @author wangzaijun
  29. * @date 2024/9/11 16:19
  30. * @description pdf格式的月报解析
  31. */
  32. @Component("monthly-report:pdf")
  33. public class PDMonthlyReportParser extends AbstractReportParser {
  34. private final EmailFieldMappingMapper fieldMappingMapper;
  35. private String reportName = null;
  36. private Table baseInfoTable = null;
  37. private List<Table> extNavTables = ListUtil.list(true);
  38. private List<ValueLabelVO> fieldMapper = null;
  39. public PDMonthlyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
  40. this.fieldMappingMapper = fieldMappingMapper;
  41. }
  42. @Override
  43. protected void initParse() throws IOException {
  44. try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(this.filepath))) {
  45. CustomPDFTextStripper stripper = new CustomPDFTextStripper();
  46. stripper.setSortByPosition(true);
  47. String text = stripper.getText(document);
  48. text = text.replace("++\r\n", "").replace("++", "");
  49. List<String> textList = StrUtil.split(text, "\r\n");
  50. if (CollUtil.isNotEmpty(textList)) {
  51. List<String> wkList = this.watermarkListMap.get("report_name");
  52. String name = this.processString(wkList, textList.get(0));
  53. this.reportName = this.matchReportName(name);
  54. if (StrUtil.isBlank(this.reportName)) {
  55. throw new APIException("未匹配到报告名称");
  56. }
  57. }
  58. SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
  59. PageIterator pageIterator = new ObjectExtractor(document).extract();
  60. while (pageIterator.hasNext()) {
  61. Page page = pageIterator.next();
  62. List<Table> tables = extractionAlgorithm.extract(page);
  63. tables = tables.stream().distinct().collect(Collectors.toList());
  64. for (Table table : tables) {
  65. int colCount = table.getColCount();
  66. if (colCount == 4) {
  67. this.baseInfoTable = table;
  68. } else if (colCount >= 5) {
  69. this.extNavTables.add(table);
  70. }
  71. }
  72. }
  73. }
  74. List<EmailFieldMappingDO> emailFieldMapping = this.fieldMappingMapper.getEmailFieldMapping();
  75. if (CollUtil.isNotEmpty(emailFieldMapping)) {
  76. this.fieldMapper = emailFieldMapping.stream().map(e -> new ValueLabelVO(e.getCode(), e.getName())).collect(Collectors.toList());
  77. }
  78. }
  79. @Override
  80. protected ReportInfo parseReportInfo(Integer fileId) {
  81. ReportInfo reportInfo = new ReportInfo();
  82. reportInfo.setFileId(fileId);
  83. reportInfo.setReportName(this.reportName);
  84. reportInfo.setReportType(this.matchReportType(this.reportName));
  85. reportInfo.setReportDate(this.matchReportDate(this.reportName));
  86. return reportInfo;
  87. }
  88. @Override
  89. protected ReportFundInfo parseBaseInfo() {
  90. Table baseInfoTable = this.baseInfoTable;
  91. if (baseInfoTable == null) {
  92. throw new APIException("未解析到基本信息表格");
  93. }
  94. Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
  95. for (int i = 0; i < baseInfoTable.getRows().size(); i++) {
  96. List<RectangularTextContainer> cols = baseInfoTable.getRows().get(i);
  97. for (int j = 0; j < 2; j++) {
  98. baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
  99. }
  100. }
  101. // 匹配字段清洗字段
  102. ReportFundInfo reportFundInfo = new ReportFundInfo();
  103. baseInfoMap.forEach((k, v) -> {
  104. String fieldValue = StrUtil.toStringOrNull(v);
  105. if (fieldValue.contains("-")) {
  106. fieldValue = null;
  107. }
  108. if (fieldValue != null) {
  109. fieldValue = fieldValue.replace("\r", "");
  110. }
  111. for (ValueLabelVO vo : this.fieldMapper) {
  112. String fieldName = vo.getValue();
  113. List<String> labels = StrUtil.split(vo.getLabel(), ",");
  114. if (labels.contains(k)) {
  115. ReflectUtil.setFieldValue(reportFundInfo, fieldName, fieldValue);
  116. break;
  117. }
  118. for (String label : labels) {
  119. if (k.contains(label)) {
  120. ReflectUtil.setFieldValue(reportFundInfo, fieldName, fieldValue);
  121. break;
  122. }
  123. }
  124. }
  125. });
  126. return reportFundInfo;
  127. }
  128. @Override
  129. protected void parseExtInfo() {
  130. }
  131. @Override
  132. protected void saveResult() {
  133. }
  134. /**
  135. * 匹配报告日期
  136. *
  137. * @param string 文本内容
  138. * @return 报告日期
  139. */
  140. private String matchReportDate(String string) {
  141. if (string == null) {
  142. return null;
  143. }
  144. // 编译正则表达式模式
  145. Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度"); // 2023年XXX3季度
  146. Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}"); // 2023-12-31
  147. Pattern pat3 = Pattern.compile("(2\\d{3})年年度"); // 2023年年度
  148. Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月"); // 2023年12月
  149. // 创建Matcher对象
  150. Matcher matcher1 = pat1.matcher(string);
  151. Matcher matcher2 = pat2.matcher(string);
  152. Matcher matcher3 = pat3.matcher(string);
  153. Matcher matcher4 = pat4.matcher(string);
  154. // 尝试匹配
  155. if (matcher1.find()) {
  156. String year = matcher1.group(1);
  157. String quarter = matcher1.group(2);
  158. return switch (quarter) {
  159. case "一", "1" -> year + "-03-31";
  160. case "二", "2" -> year + "-06-30";
  161. case "三", "3" -> year + "-09-30";
  162. case "四", "4" -> year + "-12-31";
  163. default -> null;
  164. };
  165. } else if (matcher2.find()) {
  166. return matcher2.group();
  167. } else if (matcher3.find()) {
  168. return matcher3.group(1) + "-12-31";
  169. } else if (matcher4.find()) {
  170. String year = matcher4.group(1);
  171. String month = matcher4.group(2);
  172. int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
  173. return year + "-" + padZero(month) + "-" + padZero(lastDayOfMonth + "");
  174. } else {
  175. return null;
  176. }
  177. }
  178. /**
  179. * 匹配报告类型,如“季度”、“年度”
  180. *
  181. * @param string 输入字符串
  182. * @return 匹配到的报告类型子字符串,如果没有匹配到则返回null
  183. */
  184. private String matchReportType(String string) {
  185. if (string == null) {
  186. return null;
  187. }
  188. // 编译正则表达式模式
  189. Pattern pattern = Pattern.compile("月|季度|年度");
  190. // 创建Matcher对象
  191. Matcher matcher = pattern.matcher(string);
  192. // 尝试匹配
  193. if (matcher.find()) {
  194. return matcher.group();
  195. } else {
  196. return null;
  197. }
  198. }
  199. private String matchReportName(String text) {
  200. if (StrUtil.isBlank(text)) {
  201. return null;
  202. }
  203. // 编译正则表达式模式
  204. Pattern pat1 = Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?");
  205. Pattern pat2 = Pattern.compile("私募.*披露年度报[告表]((\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}))?");
  206. Pattern pat3 = Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}?");
  207. // 创建Matcher对象
  208. Matcher matcher1 = pat1.matcher(text);
  209. Matcher matcher2 = pat2.matcher(text);
  210. Matcher matcher3 = pat3.matcher(text);
  211. // 尝试匹配
  212. String reportName;
  213. if (matcher1.find()) {
  214. reportName = matcher1.group();
  215. } else if (matcher2.find()) {
  216. reportName = matcher2.group();
  217. } else if (matcher3.find()) {
  218. reportName = matcher3.group();
  219. } else {
  220. return null;
  221. }
  222. return reportName.replace("(", "(").replace(")", ")");
  223. }
  224. private int getLastDayOfMonth(int year, int month) {
  225. Calendar calendar = Calendar.getInstance();
  226. calendar.set(Calendar.YEAR, year);
  227. calendar.set(Calendar.MONTH, month - 1); // Calendar.MONTH 是从0开始的
  228. return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
  229. }
  230. private String padZero(String number) {
  231. return String.format("%02d", Integer.parseInt(number));
  232. }
  233. }