1
0

ReportParseUtil.java 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. //package com.simuwang.daq.utils;
  2. //
  3. //import cn.hutool.core.collection.CollUtil;
  4. //import cn.hutool.core.collection.ListUtil;
  5. //import cn.hutool.core.map.MapUtil;
  6. //import cn.hutool.core.util.ReflectUtil;
  7. //import cn.hutool.core.util.StrUtil;
  8. //import com.simuwang.daq.components.CustomPDFTextStripper;
  9. //import com.simuwang.daq.dto.ReportFundInfo;
  10. //import com.smppw.common.pojo.ValueLabelVO;
  11. //import org.apache.pdfbox.Loader;
  12. //import org.apache.pdfbox.contentstream.PDFStreamEngine;
  13. //import org.apache.pdfbox.contentstream.operator.text.ShowText;
  14. //import org.apache.pdfbox.cos.COSName;
  15. //import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
  16. //import org.apache.pdfbox.pdmodel.PDDocument;
  17. //import org.apache.pdfbox.pdmodel.PDPage;
  18. //import org.apache.pdfbox.pdmodel.PDResources;
  19. //import org.apache.pdfbox.pdmodel.common.PDStream;
  20. //import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
  21. //import org.apache.pdfbox.pdmodel.graphics.PDXObject;
  22. //import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
  23. //import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
  24. //import org.apache.pdfbox.text.PDFTextStripper;
  25. //import org.apache.pdfbox.text.PDFTextStripperByArea;
  26. //import org.apache.pdfbox.text.TextPosition;
  27. //import org.apache.pdfbox.util.Matrix;
  28. //import technology.tabula.*;
  29. //import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
  30. //
  31. //import java.awt.geom.Rectangle2D;
  32. //import java.io.IOException;
  33. //import java.util.*;
  34. //import java.util.regex.Matcher;
  35. //import java.util.regex.Pattern;
  36. //import java.util.stream.Collectors;
  37. //
  38. //public class ReportParseUtil {
  39. // public static void main(String[] args) throws IOException {
  40. // List<ValueLabelVO> fieldMapper = ListUtil.list(false);
  41. // fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
  42. // fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
  43. // fieldMapper.add(new ValueLabelVO("operationType", "基金运作方式"));
  44. // fieldMapper.add(new ValueLabelVO("fundType", "基金类别"));
  45. // fieldMapper.add(new ValueLabelVO("inceptionDate", "基金成立日期"));
  46. // fieldMapper.add(new ValueLabelVO("trustName", "基金托管人"));
  47. // fieldMapper.add(new ValueLabelVO("custodianName", "基金管理人"));
  48. // fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
  49. // fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
  50. //
  51. // Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
  52. // List<String> watermarks = watermarkMap.get("less");
  53. //
  54. //// System.out.println(watermarks);
  55. //// try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
  56. // try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf"))) {
  57. //// PDFTextStripper stripper = new PDFTextStripper();
  58. //// stripper.setSortByPosition(true);
  59. //// String allText = stripper.getText(document);
  60. //// List<String> textList = StrUtil.split(allText, "\r\n");
  61. //// System.out.println(textList);
  62. //
  63. // PDFTextStripper textStripper = new CustomPDFTextStripper();
  64. // textStripper.setSortByPosition(true);
  65. // String text1 = textStripper.getText(document);
  66. // text1 = text1.replace("+\r\n", "").replace("+","");
  67. // List<String> textList = StrUtil.split(text1, "\r\n");
  68. // System.out.println(textList.get(0));
  69. //
  70. //// for (PDPage page : document.getPages()) {
  71. ////
  72. ////// PDResources resources = page.getResources();
  73. ////// Map<COSName, PDImageXObject> imageXObjectMap = findImageWatermark(page);
  74. ////// Iterator<COSName> iterator = resources.getXObjectNames().iterator();
  75. ////// while (iterator.hasNext()) {
  76. ////// COSName next = iterator.next();
  77. ////// if (imageXObjectMap.containsKey(next)) {
  78. ////// iterator.remove();
  79. ////// }
  80. ////// }
  81. ////// removeTextWatermark(page);
  82. ////
  83. //// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
  84. //// stripper.setSortByPosition(true);
  85. //// stripper.addRegion("page", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
  86. //// stripper.extractRegions(page);
  87. //// for (String region : stripper.getRegions()) {
  88. //// String text = stripper.getTextForRegion(region);
  89. //// String res = processString(watermarks, text);
  90. //// System.out.println("原数据:" + text + ", 去除水印后数据:" + res);
  91. //// }
  92. //// }
  93. //// document.save(new File("./1.pdf"));
  94. //
  95. // SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
  96. // PageIterator pageIterator = new ObjectExtractor(document).extract();
  97. // while (pageIterator.hasNext()) {
  98. // Page page = pageIterator.next();
  99. // List<Table> tables = extractionAlgorithm.extract(page);
  100. // tables = tables.stream().distinct().collect(Collectors.toList());
  101. // for (Table table : tables) {
  102. // if (table.getColCount() == 4) {
  103. // Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
  104. // for (int i = 0; i < table.getRows().size(); i++) {
  105. // List<RectangularTextContainer> cols = table.getRows().get(i);
  106. // for (int j = 0; j < 2; j++) {
  107. // baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
  108. // }
  109. // }
  110. // ReportFundInfo reportFundInfo = new ReportFundInfo();
  111. // baseInfoMap.forEach((k, v) -> {
  112. // for (ValueLabelVO vo : fieldMapper) {
  113. // String fieldName = vo.getValue();
  114. // List<String> labels = StrUtil.split(vo.getLabel(), ",");
  115. // if (labels.contains(k)) {
  116. // ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
  117. // break;
  118. // }
  119. // for (String label : labels) {
  120. // if (k.contains(label)) {
  121. // ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
  122. // break;
  123. // }
  124. // }
  125. // }
  126. // });
  127. // System.out.println(reportFundInfo);
  128. // }
  129. // }
  130. // }
  131. // }
  132. // }
  133. //
  134. // /**
  135. // * 找图片水印
  136. // *
  137. // * @param page
  138. // * @return
  139. // * @throws IOException
  140. // */
  141. // public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
  142. // Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
  143. // PDResources resources = page.getResources();
  144. // Iterable<COSName> xObjectNames = resources.getXObjectNames();
  145. // for (COSName xObjectName : xObjectNames) {
  146. // PDXObject xObject = resources.getXObject(xObjectName);
  147. // PDStream stream = xObject.getStream();
  148. // PDImageXObject imageXObject = null;
  149. // try {
  150. // imageXObject = new PDImageXObject(stream, resources);
  151. // } catch (Exception e) {
  152. // e.printStackTrace();
  153. // }
  154. // if (imageXObject != null) {
  155. // watermarkMap.put(xObjectName, imageXObject);
  156. // }
  157. // }
  158. // return watermarkMap;
  159. // }
  160. //
  161. // private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
  162. // Map<String, List<String>> result = MapUtil.newHashMap(32);
  163. // // 生成水印列表
  164. //
  165. // fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
  166. // trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
  167. // registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
  168. // String text = fundName + trustName + registerNumber;
  169. // text = text.replaceAll("[()]", ""); // 移除括号
  170. // List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
  171. // Collections.reverse(textList);
  172. // StringBuilder sb = new StringBuilder(textList.size());
  173. // for (String ch : textList) {
  174. // sb.append(ch);
  175. // }
  176. // String joinedText = sb.toString();
  177. //
  178. // // 基本水印列表
  179. // List<String> wkList = new ArrayList<>();
  180. // for (String ch : textList) {
  181. // wkList.add(ch + "\r\n");
  182. // wkList.add("\r\n" + ch);
  183. // }
  184. //
  185. // // 查找数字
  186. // List<String> matches = findDigits(fundName);
  187. // if (!matches.isEmpty()) {
  188. // for (String match : matches) {
  189. // wkList.add("\r\n" + match);
  190. // wkList.add(match + "\r\n");
  191. // }
  192. // }
  193. // wkList.add("-");
  194. // wkList.add("【");
  195. // wkList.add("】");
  196. // wkList.add("\r");
  197. // wkList.add("\r\n");
  198. //
  199. // String noNumberText = removeDigits(joinedText);
  200. //
  201. // // 生成不同字段的水印列表
  202. // result.put("report_name", new ArrayList<>(wkList));
  203. // result.get("report_name").addAll(convertStringToList("有限公司"));
  204. //
  205. // result.put("less", new ArrayList<>(wkList));
  206. //
  207. // result.put("more", new ArrayList<>(wkList));
  208. // result.get("more").addAll(convertStringToList(noNumberText));
  209. //
  210. // result.put("leverage", new ArrayList<>(wkList));
  211. // result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
  212. //
  213. // result.put("base_info", new ArrayList<>(wkList));
  214. // result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
  215. //
  216. // result.put("industry", new ArrayList<>(wkList));
  217. // result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
  218. //
  219. // result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
  220. // return result;
  221. // }
  222. //
  223. // private static List<String> findDigits(String text) {
  224. // List<String> digits = new ArrayList<>();
  225. // Pattern pattern = Pattern.compile("\\d");
  226. // Matcher matcher = pattern.matcher(text);
  227. // while (matcher.find()) {
  228. // digits.add(matcher.group());
  229. // }
  230. // return digits;
  231. // }
  232. //
  233. // private static String removeDigits(String text) {
  234. // return text.replaceAll("\\d", "");
  235. // }
  236. //
  237. // private static String removeKeywords(String text, String... keywords) {
  238. // for (String keyword : keywords) {
  239. // text = text.replaceAll(keyword, "");
  240. // }
  241. // return text;
  242. // }
  243. //
  244. // private static List<String> convertStringToList(String text) {
  245. // List<String> charList = new ArrayList<>();
  246. // for (char c : text.toCharArray()) {
  247. // charList.add(c + "");
  248. // }
  249. // return charList;
  250. // }
  251. //
  252. // public static String processString(List<String> wmList, String string) {
  253. // // 生成正则表达式模式
  254. // String pat = String.join("|", wmList);
  255. // // 使用正则表达式移除wmList中的元素
  256. // string = removeMatches(string, pat);
  257. // // 替换中文括号为英文括号
  258. // string = string.replace("（", "(").replace("）", ")");
  259. // // 移除空格
  260. // string = string.replace(" ", "");
  261. // // 如果字符串以括号开头,则移除第一个字符
  262. // if (startsWithParenthesis(string)) {
  263. // string = string.substring(1);
  264. // }
  265. //
  266. // return string;
  267. // }
  268. //
  269. // private static String removeMatches(String input, String pattern) {
  270. // // 编译正则表达式
  271. // Pattern compiledPattern = Pattern.compile(pattern);
  272. // // 创建Matcher对象
  273. // Matcher matcher = compiledPattern.matcher(input);
  274. // // 使用replaceAll方法替换所有匹配到的字符为空字符串
  275. // return matcher.replaceAll("");
  276. // }
  277. //
  278. // private static boolean startsWithParenthesis(String input) {
  279. // // 匹配以括号开头的字符串
  280. // Pattern pattern = Pattern.compile("^[()].*");
  281. // Matcher matcher = pattern.matcher(input);
  282. // return matcher.find();
  283. // }
  284. //
  285. //// public static void removeTextWatermark(PDPage page) throws IOException {
  286. //// PDResources resources = page.getResources();
  287. ////// if (StrUtil.isAllBlank(fundName, trustName)) {
  288. ////// return;
  289. ////// }
  290. //// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
  291. //// stripper.setSortByPosition(true);
  292. //// stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
  293. //// stripper.extractRegions(page);
  294. ////
  295. //// PDFStreamEngine engine = new PDFTextStripper();
  296. //// engine.addOperator(new SetMatrix(stripper));
  297. ////
  298. //// }
  299. ////
  300. //// private static void processResources(PDResources resources) throws IOException {
  301. //// for (COSName name : resources.getXObjectNames()) {
  302. //// PDXObject xobject = resources.getXObject(name);
  303. //// if (xobject instanceof PDFormXObject) {
  304. //// PDFormXObject formXObject = (PDFormXObject) xobject;
  305. //// writeTokensToStream(formXObject.getContentStream(),
  306. //// createTokensWithoutText(formXObject));
  307. //// processResources(formXObject.getResources());
  308. //// }
  309. //// }
  310. //// for (COSName name : resources.getPatternNames()) {
  311. //// PDAbstractPattern pattern = resources.getPattern(name);
  312. //// if (pattern instanceof PDTilingPattern) {
  313. //// PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
  314. //// writeTokensToStream(tilingPattern.getContentStream(),
  315. //// createTokensWithoutText(tilingPattern));
  316. //// processResources(tilingPattern.getResources());
  317. //// }
  318. //// }
  319. //// }
  320. ////
  321. //// private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
  322. //// try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
  323. //// ContentStreamWriter writer = new ContentStreamWriter(out);
  324. //// writer.writeTokens(newTokens);
  325. //// }
  326. //// }
  327. ////
  328. //// private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
  329. //// PDFStreamParser parser = new PDFStreamParser(contentStream);
  330. //// Object token = parser.parseNextToken();
  331. //// List<Object> newTokens = new ArrayList<>();
  332. //// while (token != null) {
  333. //// if (token instanceof Operator op) {
  334. //// String opName = op.getName();
  335. //// if (OperatorName.SET_MATRIX.equals(opName)) {
  336. //// // remove the argument to this operator
  337. //// newTokens.remove(newTokens.size() - 1);
  338. ////
  339. //// token = parser.parseNextToken();
  340. //// continue;
  341. //// }
  342. //// }
  343. //// newTokens.add(token);
  344. //// token = parser.parseNextToken();
  345. //// }
  346. //// return newTokens;
  347. //// }
  348. //}