inline_parser.dart

// Copyright (c) 2011, the Dart project authors.  Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.

/// Maintains the internal state needed to parse inline span elements in
/// markdown.
class InlineParser {
  static List<InlineSyntax> get syntaxes() {
    // Lazy initialize.
    if (_syntaxes == null) {
      _syntaxes = <InlineSyntax>[
        new AutolinkSyntax(),
        new LinkSyntax(),
        // "*" surrounded by spaces is left alone.
        new TextSyntax(@' \* '),
        // "_" surrounded by spaces is left alone.
        new TextSyntax(@' _ '),
        // Leave already-encoded HTML entities alone. Ensures we don't turn
        // "&amp;" into "&amp;amp;"
        new TextSyntax(@'&[#a-zA-Z0-9]*;'),
        // Encode "&".
        new TextSyntax(@'&', sub: '&amp;'),
        // Encode "<". (Why not encode ">" too? Gruber is toying with us.)
        new TextSyntax(@'<', sub: '&lt;'),
        // Parse "**strong**" tags.
        new TagSyntax(@'\*\*', tag: 'strong'),
        // Parse "__strong__" tags.
        new TagSyntax(@'__', tag: 'strong'),
        // Parse "*emphasis*" tags.
        new TagSyntax(@'\*', tag: 'em'),
        // Parse "_emphasis_" tags.
        // TODO(rnystrom): Underscores in the middle of a word should not be
        // parsed as emphasis like_in_this.
        new TagSyntax(@'_', tag: 'em'),
        // Parse inline code within double backticks: "``code``".
        new CodeSyntax(@'``\s?((?:.|\n)*?)\s?``'),
        // Parse inline code within backticks: "`code`".
        new CodeSyntax(@'`([^`]*)`')
      ];
    }

    return _syntaxes;
  }

  static List<InlineSyntax> _syntaxes;

  /// The string of markdown being parsed.
  final String source;

  /// The markdown document this parser is parsing.
  final Document document;

  /// The current read position.
  int pos = 0;

  /// Starting position of the last unconsumed text.
  int start = 0;

  final List<TagState> _stack;

  InlineParser(this.source, this.document)
    : _stack = <TagState>[];

  List<Node> parse() {
    // Make a fake top tag to hold the results.
    _stack.add(new TagState(0, 0, null));

    while (!isDone) {
      bool matched = false;

      // See if any of the current tags on the stack match. We don't allow tags
      // of the same kind to nest, so this takes priority over other possible // matches.
      for (int i = _stack.length - 1; i > 0; i--) {
        if (_stack[i].tryMatch(this)) {
          matched = true;
          break;
        }
      }
      if (matched) continue;

      // See if the current text matches any defined markdown syntax.
      for (final syntax in syntaxes) {
        if (syntax.tryMatch(this)) {
          matched = true;
          break;
        }
      }
      if (matched) continue;

      // If we got here, it's just text.
      advanceBy(1);
    }

    // Unwind any unmatched tags and get the results.
    return _stack[0].close(this, null);
  }

  writeText() {
    writeTextRange(start, pos);
    start = pos;
  }

  writeTextRange(int start, int end) {
    if (end > start) {
      final text = source.substring(start, end);
      final nodes = _stack.last().children;

      // If the previous node is text too, just append.
      if ((nodes.length > 0) && (nodes.last() is Text)) {
        final newNode = new Text('${nodes.last().text}$text');
        nodes[nodes.length - 1] = newNode;
      } else {
        nodes.add(new Text(text));
      }
    }
  }

  addNode(Node node) {
    _stack.last().children.add(node);
  }

  // TODO(rnystrom): Only need this because RegExp doesn't let you start
  // searching from a given offset.
  String get currentSource() => source.substring(pos, source.length);

  bool get isDone() => pos == source.length;

  void advanceBy(int length) {
    pos += length;
  }

  void consume(int length) {
    pos += length;
    start = pos;
  }
}

/// Represents one kind of markdown tag that can be parsed.
class InlineSyntax {
  final RegExp pattern;

  InlineSyntax(String pattern)
    : pattern = new RegExp(pattern, true);
  // TODO(rnystrom): Should use named arg for RegExp multiLine.

  bool tryMatch(InlineParser parser) {
    final startMatch = pattern.firstMatch(parser.currentSource);
    if ((startMatch != null) && (startMatch.start() == 0)) {
      // Write any existing plain text up to this point.
      parser.writeText();

      if (onMatch(parser, startMatch)) {
        parser.consume(startMatch[0].length);
      }
      return true;
    }
    return false;
  }

  abstract bool onMatch(InlineParser parser, Match match);
}

/// Matches stuff that should just be passed through as straight text.
class TextSyntax extends InlineSyntax {
  String substitute;
  TextSyntax(String pattern, [String sub])
    : super(pattern),
      substitute = sub;

  bool onMatch(InlineParser parser, Match match) {
    if (substitute == null) {
      // Just use the original matched text.
      parser.advanceBy(match[0].length);
      return false;
    }

    // Insert the substitution.
    parser.addNode(new Text(substitute));
    return true;
  }
}

/// Matches autolinks like `<http://foo.com>`.
class AutolinkSyntax extends InlineSyntax {
  AutolinkSyntax()
    : super(@'<((http|https|ftp)://[^>]*)>');
  // TODO(rnystrom): Make case insensitive.

  bool onMatch(InlineParser parser, Match match) {
    final url = match[1];

    final anchor = new Element.text('a', escapeHtml(url));
    anchor.attributes['href'] = url;
    parser.addNode(anchor);

    return true;
  }
}

/// Matches syntax that has a pair of tags and becomes an element, like `*` for
/// `<em>`. Allows nested tags.
class TagSyntax extends InlineSyntax {
  final RegExp endPattern;
  final String tag;

  TagSyntax(String pattern, [String tag, String end = null])
    : super(pattern),
      endPattern = new RegExp((end != null) ? end : pattern, true),
      tag = tag;
    // TODO(rnystrom): Doing this.field doesn't seem to work with named args.
    // TODO(rnystrom): Should use named arg for RegExp multiLine.

  bool onMatch(InlineParser parser, Match match) {
    parser._stack.add(new TagState(parser.pos,
      parser.pos + match[0].length, this));
    return true;
  }

  bool onMatchEnd(InlineParser parser, Match match, TagState state) {
    parser.addNode(new Element(tag, state.children));
    return true;
  }
}

/// Matches inline links like `[blah] [id]` and `[blah] (url)`.
class LinkSyntax extends TagSyntax {
  /// The regex for the end of a link needs to handle both reference style and
  /// inline styles as well as optional titles for inline links. To make that
  /// a bit more palatable, this breaks it into pieces.
  static get linkPattern() {
    final refLink    = @'\s?\[([^\]]*)\]';        // "[id]" reflink id.
    final title      = @'(?:[ ]*"([^"]+)"|)';     // Optional title in quotes.
    final inlineLink = '\\s?\\(([^ )]+)$title\\)'; // "(url "title")" link.
    return '\](?:($refLink|$inlineLink)|)';

    // The groups matched by this are:
    // 1: Will be non-empty if it's either a ref or inline link. Will be empty
    //    if it's just a bare pair of square brackets with nothing after them.
    // 2: Contains the id inside [] for a reference-style link.
    // 3: Contains the URL for an inline link.
    // 4: Contains the title, if present, for an inline link.
  }

  LinkSyntax()
    : super(@'\[', end: linkPattern);

  bool onMatchEnd(InlineParser parser, Match match, TagState state) {
    var url;
    var title;

    // If we didn't match refLink or inlineLink, then it means there was
    // nothing after the first square bracket, so it isn't a normal markdown
    // link at all. Instead, we allow users of the library to specify a special
    // resolver function ([setImplicitLinkResolver]) that may choose to handle
    // this. Otherwise, it's just treated as plain text.
    if ((match[1] == null) || (match[1] == '')) {
      if (_implicitLinkResolver == null) return false;

      // Only allow implicit links if the content is just text.
      // TODO(rnystrom): Do we want to relax this?
      if (state.children.length != 1) return false;
      if (state.children[0] is! Text) return false;

      Text link = state.children[0];

      // See if we have a resolver that will generate a link for us.
      final node = _implicitLinkResolver(link.text);
      if (node == null) return false;

      parser.addNode(node);
      return true;
    }

    if ((match[3] != null) && (match[3] != '')) {
      // Inline link like [foo](url).
      url = match[3];
      title = match[4];

      // For whatever reason, markdown allows angle-bracketed URLs here.
      if (url.startsWith('<') && url.endsWith('>')) {
        url = url.substring(1, url.length - 1);
      }
    } else {
      // Reference link like [foo] [bar].
      var id = match[2];
      if (id == '') {
        // The id is empty ("[]") so infer it from the contents.
        id = parser.source.substring(state.startPos + 1, parser.pos);
      }

      // Look up the link.
      final link = parser.document.refLinks[id];
      // If it's an unknown link just emit plaintext.
      if (link == null) return false;

      url = link.url;
      title = link.title;
    }

    final anchor = new Element('a', state.children);
    anchor.attributes['href'] = escapeHtml(url);
    if ((title != null) && (title != '')) {
      anchor.attributes['title'] = escapeHtml(title);
    }

    parser.addNode(anchor);
    return true;
  }
}

/// Matches backtick-enclosed inline code blocks.
class CodeSyntax extends InlineSyntax {
  CodeSyntax(String pattern)
    : super(pattern);

  bool onMatch(InlineParser parser, Match match) {
    parser.addNode(new Element.text('code', escapeHtml(match[1])));
    return true;
  }
}

/// Keeps track of a currently open tag while it is being parsed. The parser
/// maintains a stack of these so it can handle nested tags.
class TagState {
  /// The point in the original source where this tag started.
  int startPos;

  /// The point in the original source where open tag ended.
  int endPos;

  /// The syntax that created this node.
  final TagSyntax syntax;

  /// The children of this node. Will be `null` for text nodes.
  final List<Node> children;

  TagState(this.startPos, this.endPos, this.syntax)
    : children = <Node>[];

  /// Attempts to close this tag by matching the current text against its end
  /// pattern.
  bool tryMatch(InlineParser parser) {
    Match endMatch = syntax.endPattern.firstMatch(parser.currentSource);
    if ((endMatch != null) && (endMatch.start() == 0)) {
      // Close the tag.
      close(parser, endMatch);
      return true;
    }

    return false;
  }

  /// Pops this tag off the stack, completes it, and adds it to the output.
  /// Will discard any unmatched tags that happen to be above it on the stack.
  /// If this is the last node in the stack, returns its children.
  List<Node> close(InlineParser parser, Match endMatch) {
    // If there are unclosed tags on top of this one when it's closed, that
    // means they are mismatched. Mismatched tags are treated as plain text in
    // markdown. So for each tag above this one, we write its start tag as text
    // and then adds its children to this one's children.
    int index = parser._stack.indexOf(this);

    // Remove the unmatched children.
    final unmatchedTags = parser._stack.getRange(index + 1,
        parser._stack.length - index - 1);
    parser._stack.removeRange(index + 1, parser._stack.length - index - 1);

    // Flatten them out onto this tag.
    for (final unmatched in unmatchedTags) {
      // Write the start tag as text.
      parser.writeTextRange(unmatched.startPos, unmatched.endPos);

      // Bequeath its children unto this tag.
      children.addAll(unmatched.children);
    }

    // Pop this off the stack.
    parser.writeText();
    parser._stack.removeLast();

    // If the stack is empty now, this is the special "results" node.
    if (parser._stack.length == 0) return children;

    // We are still parsing, so add this to its parent's children.
    if (syntax.onMatchEnd(parser, endMatch, this)) {
      parser.consume(endMatch[0].length);
    } else {
      // Didn't close correctly so revert to text.
      parser.start = startPos;
      parser.advanceBy(endMatch[0].length);
    }

    return null;
  }
}