From d6eda87209a3a78df7700abaf57b89b7a62f4763 Mon Sep 17 00:00:00 2001 From: Stanimir Stamenkov Date: Mon, 21 Nov 2016 09:34:39 +0200 Subject: [PATCH 1/2] Add license file comment; Clean and polish Use separate values for "no bookmark" and "exclude outline". Expand a bit on documentation. --- .../org/xhtmlrenderer/pdf/HTMLOutline.java | 200 +++++++++++++----- 1 file changed, 145 insertions(+), 55 deletions(-) diff --git a/flying-saucer-pdf/src/main/java/org/xhtmlrenderer/pdf/HTMLOutline.java b/flying-saucer-pdf/src/main/java/org/xhtmlrenderer/pdf/HTMLOutline.java index acdeab8f1..5f23f8c4c 100644 --- a/flying-saucer-pdf/src/main/java/org/xhtmlrenderer/pdf/HTMLOutline.java +++ b/flying-saucer-pdf/src/main/java/org/xhtmlrenderer/pdf/HTMLOutline.java @@ -1,6 +1,24 @@ +/* + * {{{ header & license + * Copyright (c) 2016 Stanimir Stamenkov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * }}} + */ package org.xhtmlrenderer.pdf; -import java.util.ArrayList; import java.util.IdentityHashMap; import java.util.List; import java.util.Map; @@ -21,11 +39,18 @@ class HTMLOutline { private static final Pattern HEADING = Pattern.compile("h([1-6])", Pattern.CASE_INSENSITIVE); + /** sectioning roots */ + private static final Pattern ROOT = + Pattern.compile("blockquote|details|fieldset|figure|td", + Pattern.CASE_INSENSITIVE); + private static final Pattern WS = Pattern.compile("\\s+"); + private static final int MAX_NAME_LENGTH = 200; + private final HTMLOutline parent; - private final int rank; + private final int level; private final Bookmark bookmark; @@ -33,9 +58,9 @@ private HTMLOutline() { this(0, "root", null); } - private HTMLOutline(int rank, String title, HTMLOutline parent) { - this.rank = rank; - this.bookmark = new Bookmark(title, ""); + private HTMLOutline(int level, String name, HTMLOutline parent) { + this.level = level; + this.bookmark = new Bookmark(name, ""); this.parent = parent; if (parent != null) { parent.bookmark.addChild(bookmark); @@ -43,30 +68,80 @@ private HTMLOutline(int rank, String title, HTMLOutline parent) { } /** - * Include non-heading element as bookmark: - *
-     * <strong data-pdf-bookmark="4">...</strong>
+ * Creates a bookmark list of the document outline generated for the given + * element context (usually the root document element). *

- * Specify bookmark name:

+ * The current algorithm is simpler than the one suggested in the HTML5 + * specification in that it is not affected by + * sectioning + * content but only by the heading level. For + * example:

*
-     * <tr data-pdf-bookmark="4" data-pdf-bookmark-name="Bar baz">...</tr>
+ * <body> + * <h1>Foo</h1> + * <h3>Bar</h3> + * <blockquote> + * <h5>Bla</h5> + * </blockquote> + * <p>Baz</p> + * <h2>Quux</h2> + * <section> + * <h3>Thud</h3> + * </section> + * <h4>Grunt</h4> + * </body> *

- * Exclude individual heading from bookmarks:

+ * Should generate the following outline:

+ *
    + *
  1. Foo + *
      + *
    1. Bar
    2. + *
    3. Quux
    4. + *
    5. Thud
    6. + *
    7. Grunt
    8. + *
  2. + *
+ *

+ * But it generates the following outline:

+ *
    + *
  1. Foo + *
      + *
    1. Bar
    2. + *
    3. Quux + *
        + *
      1. Thud + *
          + *
        1. Grunt
        2. + *
      2. + *
    4. + *
  2. + *
+ * + *

Example document customizations

+ * + *
Include non-heading element as bookmark (level 4)
+ *
+     * <strong data-pdf-bookmark="4">Foo bar</strong>
+ * + *
Specify bookmark name
+ *
+     * <tr data-pdf-bookmark="5" data-pdf-bookmark-name="Bar baz">...</tr>
+ * + *
Exclude individual heading from bookmarks
*
      * <h3 data-pdf-bookmark="none">Baz qux</h3>
- *

- * Prevent automatic bookmarks for the whole of the document:

+ * + *
Prevent automatic bookmarks for the whole of the document
*
-     * <html data-pdf-bookmark="none">...</html>
+ * <html data-pdf-bookmark="exclude">...</html> + * + * @param context the top element a sectioning outline would be generated for; + * @param box box hierarchy the outline bookmarks would get mapped into. + * @return Bookmarks of the outline generated for the given element context. + * @see Creating an outline */ public static List generate(Element context, Box box) { - if (context.getAttribute("data-pdf-bookmark").trim().equalsIgnoreCase("none")) { - return new ArrayList(0); - } - - NodeIterator iterator = ((DocumentTraversal) context.getOwnerDocument()) - .createNodeIterator(context, NodeFilter.SHOW_ELEMENT, - NestedSectioningFilter.INSTANCE, true); + NodeIterator iterator = NestedSectioningFilter.iterator(context); HTMLOutline root = new HTMLOutline(); HTMLOutline current = root; @@ -74,49 +149,63 @@ public static List generate(Element context, Box box) { for (Element element = (Element) iterator.nextNode(); element != null; element = (Element) iterator.nextNode()) { - String bookmark = element.getAttribute("data-pdf-bookmark").trim(); - Matcher matcher = HEADING.matcher(element.getTagName()); - if (bookmark.isEmpty()) { - bookmark = matcher.matches() ? 
matcher.group(1) : "none"; - } - if (bookmark.equalsIgnoreCase("none")) { - continue; - } - - int rank; + int level; try { - rank = Integer.parseInt(bookmark); - if (rank < 1) { + level = Integer.parseInt(getOutlineLevel(element)); + if (level < 1) { continue; // Illegal value } } catch (NumberFormatException e) { continue; // Invalid value } - String name = element.getAttribute("data-pdf-bookmark-name").trim(); - if (name.isEmpty()) { - name = element.getTextContent(); - } - name = WS.matcher(name.trim()).replaceAll(" "); + String name = getBookmarkName(element); - while (current.rank >= rank) { + while (current.level >= level) { current = current.parent; } - current = new HTMLOutline(rank, name, current); + current = new HTMLOutline(level, name, current); map.put(element, current.bookmark); } - initBoxPositions(map, box); + initBoxRefs(map, box); return root.bookmark.getChildren(); - } + } // generate(Element, Box) : List - private static void initBoxPositions(Map map, Box box) { + private static void initBoxRefs(Map map, Box box) { Bookmark bookmark = map.get(box.getElement()); if (bookmark != null) { bookmark.setBox(box); } for (int i = 0, len = box.getChildCount(); i < len; i++) { - initBoxPositions(map, box.getChild(i)); + initBoxRefs(map, box.getChild(i)); + } + } + + private static String getBookmarkName(Element element) { + String name = element.getAttribute("data-pdf-bookmark-name").trim(); + if (name.isEmpty()) { + name = element.getTextContent(); + } + name = WS.matcher(name.trim()).replaceAll(" "); + if (name.length() > MAX_NAME_LENGTH) { + name = name.substring(0, MAX_NAME_LENGTH); } + return name; + } + + static String getOutlineLevel(Element element) { + String bookmark = element.getAttribute("data-pdf-bookmark").trim(); + if (bookmark.isEmpty()) { + Matcher heading = HEADING.matcher(element.getTagName()); + if (heading.matches()) { + bookmark = heading.group(1); + } else if (ROOT.matcher(element.getTagName()).matches()) { + bookmark = "exclude"; + 
} else { + bookmark = "none"; + } + } + return bookmark; } @@ -124,22 +213,23 @@ private static class NestedSectioningFilter implements NodeFilter { static final NestedSectioningFilter INSTANCE = new NestedSectioningFilter(); - // https://www.w3.org/TR/html51/sections.html#sectioning-roots - private static final Pattern ROOTS = Pattern - .compile("blockquote|details|fieldset|figure|td", - Pattern.CASE_INSENSITIVE); + static NodeIterator iterator(Element root) { + return ((DocumentTraversal) root.getOwnerDocument()) + .createNodeIterator(root, SHOW_ELEMENT, INSTANCE, true); + } @Override public short acceptNode(Node n) { - if (((Element) n).getAttribute("data-pdf-bookmark").equalsIgnoreCase("none")) { - return FILTER_REJECT; + String outlineLevel = getOutlineLevel((Element) n); + if (outlineLevel.equalsIgnoreCase("none")) { + return FILTER_SKIP; } - // REVISIT: May be use another control "data-pdf-bookmark" value - // to indicate force traversing into "blockquote" and similar. - return ROOTS.matcher(n.getNodeName()).matches() ? FILTER_REJECT - : FILTER_ACCEPT; + return outlineLevel.equalsIgnoreCase("exclude") + ? 
FILTER_REJECT + : FILTER_ACCEPT; } } // class NestedSectioningFilter -} + +} // class HTMLOutline From b7027302bb5388da8d9d9f012b2b08324271766e Mon Sep 17 00:00:00 2001 From: Stanimir Stamenkov Date: Tue, 22 Nov 2016 04:32:53 +0200 Subject: [PATCH 2/2] Add automatic HTML outline with the iText5 library, also --- .../org/xhtmlrenderer/pdf/HTMLOutline.java | 235 ++++++++++++++++++ .../xhtmlrenderer/pdf/ITextOutputDevice.java | 35 ++- 2 files changed, 257 insertions(+), 13 deletions(-) create mode 100644 flying-saucer-pdf-itext5/src/main/java/org/xhtmlrenderer/pdf/HTMLOutline.java diff --git a/flying-saucer-pdf-itext5/src/main/java/org/xhtmlrenderer/pdf/HTMLOutline.java b/flying-saucer-pdf-itext5/src/main/java/org/xhtmlrenderer/pdf/HTMLOutline.java new file mode 100644 index 000000000..5f23f8c4c --- /dev/null +++ b/flying-saucer-pdf-itext5/src/main/java/org/xhtmlrenderer/pdf/HTMLOutline.java @@ -0,0 +1,235 @@ +/* + * {{{ header & license + * Copyright (c) 2016 Stanimir Stamenkov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * }}} + */ +package org.xhtmlrenderer.pdf; + +import java.util.IdentityHashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.traversal.DocumentTraversal; +import org.w3c.dom.traversal.NodeFilter; +import org.w3c.dom.traversal.NodeIterator; + +import org.xhtmlrenderer.pdf.ITextOutputDevice.Bookmark; +import org.xhtmlrenderer.render.Box; + +class HTMLOutline { + + private static final Pattern HEADING = + Pattern.compile("h([1-6])", Pattern.CASE_INSENSITIVE); + + /** sectioning roots */ + private static final Pattern ROOT = + Pattern.compile("blockquote|details|fieldset|figure|td", + Pattern.CASE_INSENSITIVE); + + private static final Pattern WS = Pattern.compile("\\s+"); + + private static final int MAX_NAME_LENGTH = 200; + + private final HTMLOutline parent; + + private final int level; + + private final Bookmark bookmark; + + private HTMLOutline() { + this(0, "root", null); + } + + private HTMLOutline(int level, String name, HTMLOutline parent) { + this.level = level; + this.bookmark = new Bookmark(name, ""); + this.parent = parent; + if (parent != null) { + parent.bookmark.addChild(bookmark); + } + } + + /** + * Creates a bookmark list of the document outline generated for the given + * element context (usually the root document element). + *

+ * The current algorithm is simpler than the one suggested in the HTML5 + * specification in that it is not affected by + * sectioning + * content but only by the heading level. For + * example:

+ *
+     * <body>
+     *   <h1>Foo</h1>
+     *   <h3>Bar</h3>
+     *   <blockquote>
+     *     <h5>Bla</h5>
+     *   </blockquote>
+     *   <p>Baz</p>
+     *   <h2>Quux</h2>
+     *   <section>
+     *     <h3>Thud</h3>
+     *   </section>
+     *   <h4>Grunt</h4>
+     * </body>
+ *

+ * Should generate the following outline:

+ *
    + *
  1. Foo + *
      + *
    1. Bar
    2. + *
    3. Quux
    4. + *
    5. Thud
    6. + *
    7. Grunt
    8. + *
  2. + *
+ *

+ * But it generates the following outline:

+ *
    + *
  1. Foo + *
      + *
    1. Bar
    2. + *
    3. Quux + *
        + *
      1. Thud + *
          + *
        1. Grunt
        2. + *
      2. + *
    4. + *
  2. + *
+ * + *

Example document customizations

+ * + *
Include non-heading element as bookmark (level 4)
+ *
+     * <strong data-pdf-bookmark="4">Foo bar</strong>
+ * + *
Specify bookmark name
+ *
+     * <tr data-pdf-bookmark="5" data-pdf-bookmark-name="Bar baz">...</tr>
+ * + *
Exclude individual heading from bookmarks
+ *
+     * <h3 data-pdf-bookmark="none">Baz qux</h3>
+ * + *
Prevent automatic bookmarks for the whole of the document
+ *
+     * <html data-pdf-bookmark="exclude">...</html>
+ * + * @param context the top element a sectioning outline would be generated for; + * @param box box hierarchy the outline bookmarks would get mapped into. + * @return Bookmarks of the outline generated for the given element context. + * @see Creating an outline + */ + public static List generate(Element context, Box box) { + NodeIterator iterator = NestedSectioningFilter.iterator(context); + + HTMLOutline root = new HTMLOutline(); + HTMLOutline current = root; + Map map = new IdentityHashMap(); + + for (Element element = (Element) iterator.nextNode(); + element != null; element = (Element) iterator.nextNode()) { + int level; + try { + level = Integer.parseInt(getOutlineLevel(element)); + if (level < 1) { + continue; // Illegal value + } + } catch (NumberFormatException e) { + continue; // Invalid value + } + + String name = getBookmarkName(element); + + while (current.level >= level) { + current = current.parent; + } + current = new HTMLOutline(level, name, current); + map.put(element, current.bookmark); + } + initBoxRefs(map, box); + return root.bookmark.getChildren(); + } // generate(Element, Box) : List + + private static void initBoxRefs(Map map, Box box) { + Bookmark bookmark = map.get(box.getElement()); + if (bookmark != null) { + bookmark.setBox(box); + } + for (int i = 0, len = box.getChildCount(); i < len; i++) { + initBoxRefs(map, box.getChild(i)); + } + } + + private static String getBookmarkName(Element element) { + String name = element.getAttribute("data-pdf-bookmark-name").trim(); + if (name.isEmpty()) { + name = element.getTextContent(); + } + name = WS.matcher(name.trim()).replaceAll(" "); + if (name.length() > MAX_NAME_LENGTH) { + name = name.substring(0, MAX_NAME_LENGTH); + } + return name; + } + + static String getOutlineLevel(Element element) { + String bookmark = element.getAttribute("data-pdf-bookmark").trim(); + if (bookmark.isEmpty()) { + Matcher heading = HEADING.matcher(element.getTagName()); + if (heading.matches()) { + bookmark = 
heading.group(1); + } else if (ROOT.matcher(element.getTagName()).matches()) { + bookmark = "exclude"; + } else { + bookmark = "none"; + } + } + return bookmark; + } + + + private static class NestedSectioningFilter implements NodeFilter { + + static final NestedSectioningFilter INSTANCE = new NestedSectioningFilter(); + + static NodeIterator iterator(Element root) { + return ((DocumentTraversal) root.getOwnerDocument()) + .createNodeIterator(root, SHOW_ELEMENT, INSTANCE, true); + } + + @Override + public short acceptNode(Node n) { + String outlineLevel = getOutlineLevel((Element) n); + if (outlineLevel.equalsIgnoreCase("none")) { + return FILTER_SKIP; + } + return outlineLevel.equalsIgnoreCase("exclude") + ? FILTER_REJECT + : FILTER_ACCEPT; + } + + } // class NestedSectioningFilter + + +} // class HTMLOutline diff --git a/flying-saucer-pdf-itext5/src/main/java/org/xhtmlrenderer/pdf/ITextOutputDevice.java b/flying-saucer-pdf-itext5/src/main/java/org/xhtmlrenderer/pdf/ITextOutputDevice.java index 59f199192..77b02357b 100644 --- a/flying-saucer-pdf-itext5/src/main/java/org/xhtmlrenderer/pdf/ITextOutputDevice.java +++ b/flying-saucer-pdf-itext5/src/main/java/org/xhtmlrenderer/pdf/ITextOutputDevice.java @@ -20,7 +20,6 @@ package org.xhtmlrenderer.pdf; import java.awt.BasicStroke; -import java.awt.Color; import java.awt.Point; import java.awt.Rectangle; import java.awt.Shape; @@ -35,8 +34,6 @@ import java.awt.geom.Point2D; import java.io.IOException; import java.net.URI; -import java.net.URISyntaxException; -import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -66,7 +63,6 @@ import org.xhtmlrenderer.pdf.ITextFontResolver.FontDescription; import org.xhtmlrenderer.render.AbstractOutputDevice; import org.xhtmlrenderer.render.BlockBox; -import org.xhtmlrenderer.render.BorderPainter; import org.xhtmlrenderer.render.Box; import org.xhtmlrenderer.render.FSFont; import org.xhtmlrenderer.render.InlineLayoutBox; @@ -906,6 
+902,9 @@ public void finish(RenderingContext c, Box root) { } private void writeOutline(RenderingContext c, Box root) { + if (_bookmarks.isEmpty()) { + _bookmarks = HTMLOutline.generate(root.getElement(), root); + } if (_bookmarks.size() > 0) { _writer.setViewerPreferences(PdfWriter.PageModeUseOutlines); writeBookmarks(c, root, _writer.getRootOutline(), _bookmarks); @@ -931,15 +930,16 @@ private int getPageRefY(Box box) { private void writeBookmark(RenderingContext c, Box root, PdfOutline parent, Bookmark bookmark) { String href = bookmark.getHRef(); PdfDestination target = null; + Box box = bookmark.getBox(); if (href.length() > 0 && href.charAt(0) == '#') { - Box box = _sharedContext.getBoxById(href.substring(1)); - if (box != null) { - PageBox page = root.getLayer().getPage(c, getPageRefY(box)); - int distanceFromTop = page.getMarginBorderPadding(c, CalculatedStyle.TOP); - distanceFromTop += box.getAbsY() - page.getTop(); - target = new PdfDestination(PdfDestination.XYZ, 0, normalizeY(distanceFromTop / _dotsPerPoint), 0); - target.addPage(_writer.getPageReference(_startPageNo + page.getPageNo() + 1)); - } + box = _sharedContext.getBoxById(href.substring(1)); + } + if (box != null) { + PageBox page = root.getLayer().getPage(c, getPageRefY(box)); + int distanceFromTop = page.getMarginBorderPadding(c, CalculatedStyle.TOP); + distanceFromTop += box.getAbsY() - page.getTop(); + target = new PdfDestination(PdfDestination.XYZ, 0, normalizeY(distanceFromTop / _dotsPerPoint), 0); + target.addPage(_writer.getPageReference(_startPageNo + page.getPageNo() + 1)); } if (target == null) { target = _defaultDestination; @@ -980,9 +980,10 @@ private void loadBookmark(Bookmark parent, Element bookmark) { } } - private static class Bookmark { + static class Bookmark { private String _name; private String _HRef; + private Box _box; private List _children; @@ -994,6 +995,14 @@ public Bookmark(String name, String href) { _HRef = href; } + public Box getBox() { + return _box; + } + + 
public void setBox(Box box) { + _box = box; + } + public String getHRef() { return _HRef; }