|
@@ -0,0 +1,59 @@
|
|
|
+package technology.tabula;
|
|
|
+
|
|
|
+import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
+import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
+
|
|
|
+import java.io.IOException;
|
|
|
+
|
|
|
+/**
|
|
|
+ * Rewritten variant of {@link ObjectExtractor} that overrides {@code extractPage}.
|
|
|
+ *
|
|
|
+ * @author wangzaijun
|
|
|
+ * @date 2024/9/30 11:15
|
|
|
+ */
|
|
|
+public class CustomObjectExtractor extends ObjectExtractor {
|
|
|
+    /** Document that pages are read from; the same instance is passed to the superclass. */
|
|
|
+    private final PDDocument pdfDocument;
|
|
|
+
|
|
|
+    /**
|
|
|
+     * @param pdfDocument document whose pages will be extracted
|
|
|
+     */
|
|
|
+    public CustomObjectExtractor(PDDocument pdfDocument) {
|
|
|
+        super(pdfDocument);
|
|
|
+        this.pdfDocument = pdfDocument;
|
|
|
+    }
|
|
|
+
|
|
|
+    /**
|
|
|
+     * Assembles the {@link Page} for one page of the document from its rulings,
|
|
|
+     * its text elements (passed through {@code Utils.sort} first) and rotation-aware dimensions.
|
|
|
+     *
|
|
|
+     * @param pageNumber 1-based number of the page to extract
|
|
|
+     * @return the page built from the collected data
|
|
|
+     * @throws IndexOutOfBoundsException if {@code pageNumber} is below 1 or above the page count
|
|
|
+     * @throws IOException declared by the signature; presumably raised by the processing calls
|
|
|
+     */
|
|
|
+    @Override
|
|
|
+    protected Page extractPage(Integer pageNumber) throws IOException {
|
|
|
+        boolean pageExists = pageNumber <= pdfDocument.getNumberOfPages() && pageNumber >= 1;
|
|
|
+        if (!pageExists) {
|
|
|
+            throw new java.lang.IndexOutOfBoundsException("Page number does not exist.");
|
|
|
+        }
|
|
|
+
|
|
|
+        // The document is indexed from 0, the caller counts from 1.
|
|
|
+        PDPage pdPage = pdfDocument.getPage(pageNumber - 1);
|
|
|
+
|
|
|
+        ObjectExtractorStreamEngine engine = new ObjectExtractorStreamEngine(pdPage);
|
|
|
+        engine.processPage(pdPage);
|
|
|
+
|
|
|
+        TextStripper stripper = new TextStripper(pdfDocument, pageNumber);
|
|
|
+        stripper.process();
|
|
|
+        Utils.sort(stripper.getTextElements(), Rectangle.ILL_DEFINED_ORDER);
|
|
|
+
|
|
|
+        int rotation = pdPage.getRotation();
|
|
|
+        int turn = Math.abs(rotation);
|
|
|
+        // A quarter turn (90 or 270 degrees) swaps the crop box's width and height.
|
|
|
+        boolean sideways = turn == 90 || turn == 270;
|
|
|
+        float width = sideways ? pdPage.getCropBox().getHeight() : pdPage.getCropBox().getWidth();
|
|
|
+        float height = sideways ? pdPage.getCropBox().getWidth() : pdPage.getCropBox().getHeight();
|
|
|
+
|
|
|
+        return Page.Builder.newInstance()
|
|
|
+                .withPageDims(PageDims.of(0, 0, width, height))
|
|
|
+                .withRotation(rotation)
|
|
|
+                .withNumber(pageNumber)
|
|
|
+                .withPdPage(pdPage)
|
|
|
+                .withPdDocument(pdfDocument)
|
|
|
+                .withRulings(engine.rulings)
|
|
|
+                .withTextElements(stripper.getTextElements())
|
|
|
+                .withMinCharWidth(stripper.getMinCharWidth())
|
|
|
+                .withMinCharHeight(stripper.getMinCharHeight())
|
|
|
+                .withIndex(stripper.getSpatialIndex())
|
|
|
+                .build();
|
|
|
+    }
|
|
|
+}
|