Add automatic HTML outline with the iText5 library, also

flyingsaucerproject · Nov 22, 2016 · b702730 · b702730
1 parent d6eda87
commit b702730
Show file tree

Hide file tree

Showing 2 changed files with 257 additions and 13 deletions.
diff --git a/flying-saucer-pdf-itext5/src/main/java/org/xhtmlrenderer/pdf/HTMLOutline.java b/flying-saucer-pdf-itext5/src/main/java/org/xhtmlrenderer/pdf/HTMLOutline.java
@@ -0,0 +1,304 @@
+/*
+ * {{{ header & license
+ * Copyright (c) 2016 Stanimir Stamenkov
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * }}}
+ */
+package org.xhtmlrenderer.pdf;
+
+import java.util.IdentityHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.traversal.DocumentTraversal;
+import org.w3c.dom.traversal.NodeFilter;
+import org.w3c.dom.traversal.TreeWalker;
+
+import org.xhtmlrenderer.pdf.ITextOutputDevice.Bookmark;
+import org.xhtmlrenderer.render.Box;
+
+/**
+ * Generates PDF bookmarks from the heading structure of an HTML document.
+ * <p>
+ * Each instance is one node of the outline tree being built and owns the
+ * {@link Bookmark} created for it; see {@link #generate(Element, Box)}.</p>
+ */
+class HTMLOutline {
+
+    /** Tag names {@code h1} to {@code h6}; group 1 captures the level digit. */
+    private static final Pattern HEADING =
+            Pattern.compile("h([1-6])", Pattern.CASE_INSENSITIVE);
+
+    /** <a href="https://www.w3.org/TR/html51/sections.html#sectioning-roots">sectioning roots</a> */
+    private static final Pattern ROOT =
+            Pattern.compile("blockquote|details|fieldset|figure|td",
+                            Pattern.CASE_INSENSITIVE);
+
+    /** Whitespace runs, collapsed to a single space in bookmark names. */
+    private static final Pattern WS = Pattern.compile("\\s+");
+
+    /** Maximum length, in {@code char}s, of a bookmark name. */
+    private static final int MAX_NAME_LENGTH = 200;
+
+    /** Enclosing outline node; {@code null} for the synthetic root. */
+    private final HTMLOutline parent;
+
+    /** Outline level of this node; {@code 0} for the synthetic root. */
+    private final int level;
+
+    /** Bookmark representing this node in the generated outline. */
+    private final Bookmark bookmark;
+
+    /** Creates the synthetic level 0 root that collects the top-level bookmarks. */
+    private HTMLOutline() {
+        this(0, "root", null);
+    }
+
+    /**
+     * Creates an outline node and, when a parent is given, adds its bookmark
+     * as a child of the parent's bookmark.
+     */
+    private HTMLOutline(int level, String name, HTMLOutline parent) {
+        this.level = level;
+        this.bookmark = new Bookmark(name, "");
+        this.parent = parent;
+        if (parent != null) {
+            parent.bookmark.addChild(bookmark);
+        }
+    }
+
+    /**
+     * Creates a bookmark list of the document outline generated for the given
+     * element context (usually the root document element).
+     * <p>
+     * The current algorithm is simpler than the one suggested in the HTML5
+     * specification in that it is not affected by
+     * <a href="https://www.w3.org/TR/html51/dom.html#sectioning-content">sectioning
+     * content</a> but just by the heading level.  For
+     * <a href="https://www.w3.org/TR/html51/sections.html#example-d42b7aaf">example</a>:</p>
+     * <pre>
+     * &lt;body>
+     *   &lt;h1>Foo&lt;/h1>
+     *   &lt;h3>Bar&lt;/h3>
+     *   &lt;blockquote>
+     *     &lt;h5>Bla&lt;/h5>
+     *   &lt;/blockquote>
+     *   &lt;p>Baz&lt;/p>
+     *   &lt;h2>Quux&lt;/h2>
+     *   &lt;section>
+     *     &lt;h3>Thud&lt;/h3>
+     *   &lt;/section>
+     *   &lt;h4>Grunt&lt;/h4>
+     * &lt;/body></pre>
+     * <p>
+     * Should generate outline as:</p>
+     * <ol>
+     * <li>Foo
+     *   <ol>
+     *   <li>Bar</li>
+     *   <li>Quux</li>
+     *   <li>Thud</li>
+     *   <li>Grunt</li>
+     *   </ol></li>
+     * </ol>
+     * <p>
+     * But it generates outline as:</p>
+     * <ol>
+     * <li>Foo
+     *   <ol>
+     *   <li>Bar</li>
+     *   <li>Quux
+     *     <ol>
+     *     <li>Thud
+     *       <ol>
+     *       <li>Grunt</li>
+     *       </ol></li>
+     *     </ol></li>
+     *   </ol></li>
+     * </ol>
+     *
+     * <h4>Example document customizations</h4>
+     *
+     * <h5>Include non-heading element as bookmark (level 4)</h5>
+     * <pre>
+     * &lt;strong data-pdf-bookmark="4">Foo bar&lt;/strong></pre>
+     *
+     * <h5>Specify bookmark name</h5>
+     * <pre>
+     * &lt;tr data-pdf-bookmark="5" data-pdf-bookmark-name="Bar baz">...&lt;/tr></pre>
+     *
+     * <h5>Exclude individual heading from bookmarks</h5>
+     * <pre>
+     * &lt;h3 data-pdf-bookmark="none">Baz qux&lt;/h3></pre>
+     *
+     * <h5>Prevent automatic bookmarks for the whole of the document</h5>
+     * <pre>
+     * &lt;html data-pdf-bookmark="exclude">...&lt;/html></pre>
+     *
+     * @param   context  the top element a sectioning outline would be generated for.
+     * @param   box  box hierarchy the outline bookmarks would get mapped into.
+     * @return  Bookmarks of the outline generated for the given element context.
+     * @see     <a href="https://www.w3.org/TR/html51/sections.html#creating-an-outline">Creating an outline</a>
+     */
+    public static List<Bookmark> generate(Element context, Box box) {
+        TreeWalker walker = NestedSectioningFilter.walker(context);
+
+        HTMLOutline root = new HTMLOutline();
+        HTMLOutline current = root;
+        // Identity keys: a bookmark belongs to the very DOM node visited here.
+        Map<Element,Bookmark> map = new IdentityHashMap<Element,Bookmark>();
+
+        for (Element element = NestedSectioningFilter.first(walker);
+                element != null; element = (Element) walker.nextNode()) {
+            int level;
+            try {
+                level = Integer.parseInt(getOutlineLevel(element));
+                if (level < 1) {
+                    continue; // Illegal value
+                }
+            } catch (NumberFormatException e) {
+                continue; // Invalid value
+            }
+
+            String name = getBookmarkName(element);
+
+            // Climb to the nearest node of a lower level.  The root (level 0)
+            // stops the loop, as the levels accepted above are at least 1.
+            while (current.level >= level) {
+                current = current.parent;
+            }
+            current = new HTMLOutline(level, name, current);
+            map.put(element, current.bookmark);
+        }
+        initBoxRefs(map, box);
+        return root.bookmark.getChildren();
+    } // generate(Element, Box) : List<Bookmark>
+
+    /**
+     * Walks the box tree depth-first and stores in each bookmark the first
+     * box visited for its element.  A bookmark that already has a box is
+     * left alone, so that when several boxes report the same element the
+     * bookmark keeps the first of them rather than the last.
+     */
+    private static void initBoxRefs(Map<Element,Bookmark> map, Box box) {
+        Bookmark bookmark = map.get(box.getElement());
+        if (bookmark != null && bookmark.getBox() == null) {
+            bookmark.setBox(box);
+        }
+        for (int i = 0, len = box.getChildCount(); i < len; i++) {
+            initBoxRefs(map, box.getChild(i));
+        }
+    }
+
+    /**
+     * Returns the bookmark name for the given element: the value of its
+     * {@code data-pdf-bookmark-name} attribute or, when that is blank, the
+     * element's text content.  Whitespace runs are collapsed to single spaces
+     * and the result is cut to at most {@link #MAX_NAME_LENGTH} chars.
+     */
+    private static String getBookmarkName(Element element) {
+        String name = element.getAttribute("data-pdf-bookmark-name").trim();
+        if (name.isEmpty()) {
+            name = element.getTextContent();
+        }
+        name = WS.matcher(name.trim()).replaceAll(" ");
+        if (name.length() > MAX_NAME_LENGTH) {
+            int end = MAX_NAME_LENGTH;
+            if (Character.isHighSurrogate(name.charAt(end - 1))) {
+                end--; // Don't cut a surrogate pair in half
+            }
+            name = name.substring(0, end);
+        }
+        return name;
+    }
+
+    /**
+     * Returns the outline level token for the given element.  A non-blank
+     * {@code data-pdf-bookmark} attribute is returned as is (trimmed).
+     * Otherwise {@code h1} to {@code h6} give {@code "1"} to {@code "6"},
+     * sectioning roots give {@code "exclude"} and any other element gives
+     * {@code "none"}.
+     */
+    static String getOutlineLevel(Element element) {
+        String bookmark = element.getAttribute("data-pdf-bookmark").trim();
+        if (bookmark.isEmpty()) {
+            Matcher heading = HEADING.matcher(element.getTagName());
+            if (heading.matches()) {
+                bookmark = heading.group(1);
+            } else if (ROOT.matcher(element.getTagName()).matches()) {
+                bookmark = "exclude";
+            } else {
+                bookmark = "none";
+            }
+        }
+        return bookmark;
+    }
+
+
+    /**
+     * Filter selecting the elements that take part in the outline:
+     * {@code "none"} skips just the element, {@code "exclude"} rejects the
+     * element together with its descendants, anything else is accepted.
+     * <p>
+     * It has to drive a {@link TreeWalker}: a {@code NodeIterator} treats
+     * {@code FILTER_REJECT} like {@code FILTER_SKIP} and would still visit
+     * the descendants of excluded elements.</p>
+     */
+    private static class NestedSectioningFilter implements NodeFilter {
+
+        static final NestedSectioningFilter INSTANCE = new NestedSectioningFilter();
+
+        static TreeWalker walker(Element root) {
+            return ((DocumentTraversal) root.getOwnerDocument())
+                    .createTreeWalker(root, SHOW_ELEMENT, INSTANCE, true);
+        }
+
+        /**
+         * Returns the first element to visit, or {@code null} if there is
+         * none.  {@code TreeWalker.nextNode()} starts after the root and never
+         * returns it, so the root itself is put through the filter here.
+         */
+        static Element first(TreeWalker walker) {
+            Element root = (Element) walker.getRoot();
+            switch (INSTANCE.acceptNode(root)) {
+            case FILTER_ACCEPT:
+                return root;
+            case FILTER_SKIP:
+                return (Element) walker.nextNode();
+            default: // FILTER_REJECT: the whole tree is excluded
+                return null;
+            }
+        }
+
+        @Override
+        public short acceptNode(Node n) {
+            String outlineLevel = getOutlineLevel((Element) n);
+            if (outlineLevel.equalsIgnoreCase("none")) {
+                return FILTER_SKIP;
+            }
+            return outlineLevel.equalsIgnoreCase("exclude")
+                    ? FILTER_REJECT
+                    : FILTER_ACCEPT;
+        }
+
+    } // class NestedSectioningFilter
+
+
+} // class HTMLOutline
diff --git a/flying-saucer-pdf-itext5/src/main/java/org/xhtmlrenderer/pdf/ITextOutputDevice.java b/flying-saucer-pdf-itext5/src/main/java/org/xhtmlrenderer/pdf/ITextOutputDevice.java
@@ -20,7 +20,6 @@
 package org.xhtmlrenderer.pdf;
 
 import java.awt.BasicStroke;
-import java.awt.Color;
 import java.awt.Point;
 import java.awt.Rectangle;
 import java.awt.Shape;
@@ -35,8 +34,6 @@
 import java.awt.geom.Point2D;
 import java.io.IOException;
 import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URL;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
@@ -66,7 +63,6 @@
 import org.xhtmlrenderer.pdf.ITextFontResolver.FontDescription;
 import org.xhtmlrenderer.render.AbstractOutputDevice;
 import org.xhtmlrenderer.render.BlockBox;
-import org.xhtmlrenderer.render.BorderPainter;
 import org.xhtmlrenderer.render.Box;
 import org.xhtmlrenderer.render.FSFont;
 import org.xhtmlrenderer.render.InlineLayoutBox;
@@ -906,6 +902,9 @@ public void finish(RenderingContext c, Box root) {
     }
 
     private void writeOutline(RenderingContext c, Box root) {
+        if (_bookmarks.isEmpty()) {
+            _bookmarks = HTMLOutline.generate(root.getElement(), root);
+        }
         if (_bookmarks.size() > 0) {
             _writer.setViewerPreferences(PdfWriter.PageModeUseOutlines);
             writeBookmarks(c, root, _writer.getRootOutline(), _bookmarks);
@@ -931,15 +930,16 @@ private int getPageRefY(Box box) {
     private void writeBookmark(RenderingContext c, Box root, PdfOutline parent, Bookmark bookmark) {
         String href = bookmark.getHRef();
         PdfDestination target = null;
+        Box box = bookmark.getBox();
         if (href.length() > 0 && href.charAt(0) == '#') {
-            Box box = _sharedContext.getBoxById(href.substring(1));
-            if (box != null) {
-                PageBox page = root.getLayer().getPage(c, getPageRefY(box));
-                int distanceFromTop = page.getMarginBorderPadding(c, CalculatedStyle.TOP);
-                distanceFromTop += box.getAbsY() - page.getTop();
-                target = new PdfDestination(PdfDestination.XYZ, 0, normalizeY(distanceFromTop / _dotsPerPoint), 0);
-                target.addPage(_writer.getPageReference(_startPageNo + page.getPageNo() + 1));
-            }
+            box = _sharedContext.getBoxById(href.substring(1));
+        }
+        if (box != null) {
+            PageBox page = root.getLayer().getPage(c, getPageRefY(box));
+            int distanceFromTop = page.getMarginBorderPadding(c, CalculatedStyle.TOP);
+            distanceFromTop += box.getAbsY() - page.getTop();
+            target = new PdfDestination(PdfDestination.XYZ, 0, normalizeY(distanceFromTop / _dotsPerPoint), 0);
+            target.addPage(_writer.getPageReference(_startPageNo + page.getPageNo() + 1));
         }
         if (target == null) {
             target = _defaultDestination;
@@ -980,9 +980,10 @@ private void loadBookmark(Bookmark parent, Element bookmark) {
         }
     }
 
-    private static class Bookmark {
+    static class Bookmark {
         private String _name;
         private String _HRef;
+        private Box    _box;
 
         private List _children;
 
@@ -994,6 +995,14 @@ public Bookmark(String name, String href) {
             _HRef = href;
         }
 
+        public Box getBox() { // Box stored via setBox(Box); null if none was stored
+            return _box;
+        }
+
+        public void setBox(Box box) { // Stores the box later returned by getBox()
+            _box = box;
+        }
+
         public String getHRef() {
             return _HRef;
         }