浏览代码

fix:多页带表头PDF解析优化

chenjianhua 3 周之前
父节点
当前提交
6ce2c3bc61
共有 1 个文件被更改,包括 90 次插入和 1 次删除
  1. 90 1
      service-daq/src/main/java/com/simuwang/daq/service/NavEmailParser.java

+ 90 - 1
service-daq/src/main/java/com/simuwang/daq/service/NavEmailParser.java

@@ -176,9 +176,98 @@ public class NavEmailParser extends AbstractEmailParser {
         if (StrUtil.isBlank(excelFilePath)) {
             return CollUtil.newArrayList();
         }
-        return parseExcelFile(excelFilePath, emailFieldMap);
+        return parseExcelFileFrom(excelFilePath, emailFieldMap);
+    }
+
+    private List<EmailFundNavDTO> parseExcelFileFrom(String filePath, Map<String, List<String>> emailFieldMap) { // Builds the EmailFundNavDTO list for the file at filePath: load sheet -> locate header cells -> parse rows -> attach validation failure reasons.
+        Sheet sheet = ExcelUtil.getFirstSheet(filePath);
+        if (sheet == null) {
+            log.info("获取不到有效的sheet页面,文件路径:{}", filePath);
+            return CollUtil.newArrayList(); // no sheet available: the caller gets an empty list, not null
+        }
+        // 1. Locate the positions (row, column) of the header fields
+        Map<String, Pair<Integer, Integer>> fieldPositionMap = getFieldPositionFromPdf(sheet, emailFieldMap);
+        if (MapUtil.isEmpty(fieldPositionMap)) {
+            log.warn("找不到文件表头字段 -> 文件:{}", filePath);
+            return CollUtil.newArrayList(); // no header field matched: nothing to parse
+        }
+        // 2. Parse the net-value data in the sheet
+        List<EmailFundNavDTO> emailFundNavDTOList = parseSheetData(filePath, sheet, fieldPositionMap, null); // NOTE(review): the meaning of the trailing null argument is defined by parseSheetData, which is not visible here
+        // 3. Validate the net-value data format and record the reason on entries that fail validation
+        if (CollUtil.isNotEmpty(emailFundNavDTOList)) {
+            emailFundNavDTOList.forEach(e -> e.setFailReason(super.checkDataFailReason(e))); // every entry gets whatever checkDataFailReason returns for it
+        }
+        return emailFundNavDTOList; // returned as produced by parseSheetData (NOTE(review): may be null if parseSheetData can return null — confirm)
+    }
+
+    private Map<String, Pair<Integer, Integer>> getFieldPositionFromPdf(Sheet sheet, Map<String, List<String>> emailFieldMap) { // Scans every cell of the sheet, collects all cells whose text matches a field via fieldMatch, then reduces them to one (row, column) position per field.
+        Map<String, List<FieldPositionDTO>> tempFieldPositionMap = MapUtil.newHashMap(); // field key -> every matching header cell found, in scan order
+        int lastRowNum = sheet.getLastRowNum();
+        for (int rowNum = 0; rowNum <= lastRowNum; rowNum++) { // NOTE(review): inclusive bound — presumably getLastRowNum is the index of the last row; confirm against the Sheet API in use
+            Row sheetRow = sheet.getRow(rowNum);
+            if (sheetRow == null) {
+                continue; // skip rows that do not exist in the sheet
+            }
+            int lastCellNum = sheetRow.getLastCellNum();
+            for (int cellNum = 0; cellNum < lastCellNum; cellNum++) { // NOTE(review): exclusive bound — presumably getLastCellNum is last index plus one; confirm
+                Cell cell = sheetRow.getCell(cellNum);
+                if (cell == null) {
+                    continue; // skip missing cells
+                }
+                String cellValue = ExcelUtil.getCellValue(cell);
+                // Strip non-Chinese characters (NOT_CONVERT_FIELD_LIST is passed to the helper; presumably values listed there are exempt — confirm)
+                String newCellValue = StringUtil.retainChineseCharacters(cellValue, NOT_CONVERT_FIELD_LIST);
+                String field = fieldMatch(newCellValue, emailFieldMap);
+                if (StrUtil.isNotBlank(field)) { // a non-blank result means the cell text matched a field
+                    List<FieldPositionDTO> fieldPositionDTOList = tempFieldPositionMap.getOrDefault(field, new ArrayList<>());
+                    fieldPositionDTOList.add(new FieldPositionDTO(newCellValue, Pair.of(rowNum, cellNum))); // keep the cleaned text alongside its (row, column) position
+                    tempFieldPositionMap.put(field, fieldPositionDTOList);
+                }
+            }
+        }
+        // Decide whether this is the share-class fund net-value file format (two registration-code fields present at the same time)
+        return handlerFieldPositionFromPdf(tempFieldPositionMap);
+    }
+
+    private Map<String, Pair<Integer, Integer>> handlerFieldPositionFromPdf(Map<String, List<FieldPositionDTO>> tempFieldPositionMap) { // Reduces the per-field candidate lists to a single (row, column) position per field.
+        Map<String, Pair<Integer, Integer>> fieldPositionMap = MapUtil.newHashMap();
+        boolean hasParentField = tempFieldPositionMap.keySet().stream().anyMatch(e -> e.contains("parent")); // true when any matched field key contains "parent"
+        for (Map.Entry<String, List<FieldPositionDTO>> entry : tempFieldPositionMap.entrySet()) {
+            String field = entry.getKey();
+            List<FieldPositionDTO> fieldPositionDTOList = entry.getValue();
+            int size = fieldPositionDTOList.size();
+            if (size == 1) {
+                fieldPositionMap.put(field, fieldPositionDTOList.get(0).getPair()); // exactly one candidate: use it
+                continue;
+            }
+            if ((!hasParentField && size > 1)) { // several candidates and no "parent" field was matched anywhere
+                if (EmailFieldConst.REGISTER_NUMBER.equals(field)) {
+                    Pair<Integer, Integer> pair = fieldPositionDTOList.stream()
+                            .filter(e -> !e.getFieldValue().contains("协会") && !e.getFieldValue().contains("备案")).map(FieldPositionDTO::getPair).findFirst().orElse(null); // first candidate whose text contains neither "协会" nor "备案"; null when every candidate contains one of them
+                    fieldPositionMap.put(field, pair); // NOTE(review): pair may be null here, so the map can hold a null value for REGISTER_NUMBER — confirm callers tolerate that
+                } else {
+                    fieldPositionMap.put(field, fieldPositionDTOList.get(0).getPair()); // any other field: first occurrence in scan order
+                }
+                continue;
+            }
+            if ((hasParentField && size > 1)) {
+                fieldPositionMap.put(field, fieldPositionDTOList.get(0).getPair()); // a "parent" field exists: always take the first occurrence
+            }
+        }
+
+        // Case where the parent fund's code is missing
+        if (hasParentField && fieldPositionMap.get(EmailFieldConst.PARENT_REGISTER_NUMBER) == null) {
+            List<FieldPositionDTO> fieldPositionDTOS = tempFieldPositionMap.get(EmailFieldConst.REGISTER_NUMBER); // fall back to the REGISTER_NUMBER candidates
+            if (CollUtil.isNotEmpty(fieldPositionDTOS)) {
+                Pair<Integer, Integer> parentRegisterNumberPair = fieldPositionDTOS.stream()
+                        .filter(e -> e.getFieldValue().contains("协会") || e.getFieldValue().contains("备案")).map(FieldPositionDTO::getPair).findFirst().orElse(null); // first candidate whose text contains "协会" or "备案"; null when none does
+                fieldPositionMap.put(EmailFieldConst.PARENT_REGISTER_NUMBER, parentRegisterNumberPair); // NOTE(review): may store null under PARENT_REGISTER_NUMBER — confirm callers tolerate that
+            }
+        }
+        return fieldPositionMap;
     }
 
+
     /**
      * 解析邮件excel附件
      *