diff --git a/dev-guide.md b/dev-guide.md index 8d6438a..881f1e0 100644 --- a/dev-guide.md +++ b/dev-guide.md @@ -3,13 +3,25 @@ This document describes usage of mistletoe and its API from the developer's point of view. -Understanding the AST ---------------------- +Understanding the AST and the tokens +------------------------------------ When a markdown document gets parsed by mistletoe, the result is represented -as an "abstract syntax tree" (AST), stored in an instance of `Document`. -This object contains a hierarchy of -all the various tokens which were recognized during the parsing process. +as an _abstract syntax tree (AST)_, stored in an instance of `Document`. +This object contains a hierarchy of all the various _tokens_ which were recognized +during the parsing process, for example, `Paragraph`, `Heading`, and `RawText`. + +The tokens which represent a line or a block of lines in the input markdown +are called _block tokens_. Examples include `List`, `Paragraph`, `ThematicBreak`, +and also the `Document` itself. + +The tokens which represent the actual content within a block are called _span tokens_, +or, in CommonMark terminology, _inline tokens_. +In this category you will find tokens like `RawText`, `Link`, and `Emphasis`. + +Block tokens may have block tokens, span tokens, or no tokens at all as children +in the AST; this depends on the type of token. Span tokens may *only* have span +tokens as children. 
In order to see what exactly gets parsed, one can simply use the `AstRenderer` on a given markdown input, for example: @@ -36,9 +48,11 @@ Then we will get this JSON output from the AST renderer: { "type": "Document", "footnotes": {}, + "line_number": 1, "children": [ { "type": "Heading", + "line_number": 1, "level": 1, "children": [ { @@ -49,6 +63,7 @@ Then we will get this JSON output from the AST renderer: }, { "type": "Paragraph", + "line_number": 3, "children": [ { "type": "RawText", @@ -58,6 +73,7 @@ Then we will get this JSON output from the AST renderer: }, { "type": "Heading", + "line_number": 5, "level": 1, "children": [ { @@ -68,6 +84,7 @@ Then we will get this JSON output from the AST renderer: }, { "type": "Paragraph", + "line_number": 7, "children": [ { "type": "Link", @@ -86,12 +103,25 @@ Then we will get this JSON output from the AST renderer: } ``` -When passing this tree to a renderer, it is recursively traversed +### Line numbers + +mistletoe records the starting line of all block tokens that it encounters during +parsing and stores it as the `line_number` attribute of each token. +(This feature is not available for span tokens yet.) + +Rendering +--------- +Sometimes all you need is the information from the AST. But more often, you'll +want to take that information and turn it into some other format like HTML. +This is called _rendering_. mistletoe provides a set of built-in renderers for +different formats, and it's also possible to define your own renderer. + +When passing an AST to a renderer, the tree is recursively traversed and methods corresponding to individual token types get called on the renderer in order to create the output in the desired format. -Creating a custom renderer --------------------------- +Creating a custom token and renderer +------------------------------------ Here's an example of how to add GitHub-style wiki links to the parsing process, and provide a renderer for this new token. 
@@ -245,7 +275,8 @@ For more info, take a look at the `base_renderer` module in mistletoe. The docstrings might give you a more granular idea of customizing mistletoe to your needs. -## Markdown to Markdown +Markdown to Markdown parsing-and-rendering +------------------------------------------ Suppose you have some Markdown that you want to process and then output as Markdown again. Thanks to the text-like nature of Markdown, it is often @@ -254,12 +285,11 @@ example, if you want to replace a text fragment in the plain text, but not in the embedded code samples, then the search-and-replace approach won't work. In this case you can use mistletoe's `MarkdownRenderer`: -1. Parse Markdown to an AST tree (usually held in a `Document` token). -2. Make modifications to the AST tree. +1. Parse Markdown to an AST (usually held in a `Document` token). +2. Make modifications to the AST. 3. Render back to Markdown using `MarkdownRenderer.render()`. -Here is an example of how you can make text replacements in selected parts -of the AST: +Here is an example of how you can replace text in selected parts of the AST: ```python import mistletoe @@ -296,7 +326,8 @@ with open("README.md", "r") as fin: print(md) ``` -If you're making large changes, so that the formatting of the document is -affected, then it can be useful to also have the text reflowed. This can -be done by specifying a `max_line_length` parameter in the call to the -`MarkdownRenderer` constructor. +The `MarkdownRenderer` can also reflow the text in the document to a given +maximum line length. And it can do so while preserving the formatting of code +blocks and other tokens where line breaks matter. To use this feature, +specify a `max_line_length` parameter in the call to the `MarkdownRenderer` +constructor. 
diff --git a/mistletoe/block_token.py b/mistletoe/block_token.py index 470a49e..a86ac51 100644 --- a/mistletoe/block_token.py +++ b/mistletoe/block_token.py @@ -109,6 +109,8 @@ class BlockToken(token.Token): children (list): inner tokens. line_number (int): starting line (1-based). """ + repr_attributes = ("line_number",) + def __init__(self, lines, tokenize_func): self.children = tokenize_func(lines) @@ -139,6 +141,7 @@ def __init__(self, lines): lines = lines.splitlines(keepends=True) lines = [line if line.endswith('\n') else '{}\n'.format(line) for line in lines] self.footnotes = {} + self.line_number = 1 token._root_node = self self.children = tokenize(lines) token._root_node = None @@ -152,7 +155,7 @@ class Heading(BlockToken): Attributes: level (int): heading level. """ - repr_attributes = ("level",) + repr_attributes = BlockToken.repr_attributes + ("level",) pattern = re.compile(r' {0,3}(#{1,6})(?:\n|\s+?(.*?)(\n|\s+?#+\s*?$))') level = 0 content = '' @@ -193,7 +196,7 @@ class SetextHeading(BlockToken): Attributes: level (int): heading level. """ - repr_attributes = ("level",) + repr_attributes = BlockToken.repr_attributes + ("level",) def __init__(self, lines): self.underline = lines.pop().rstrip() @@ -352,7 +355,7 @@ class BlockCode(BlockToken): Attributes: language (str): always the empty string. """ - repr_attributes = ("language",) + repr_attributes = BlockToken.repr_attributes + ("language",) def __init__(self, lines): self.language = '' self.children = (span_token.RawText(''.join(lines).strip('\n')+'\n'),) @@ -408,7 +411,7 @@ class CodeFence(BlockToken): Attributes: language (str): language of code block (default to empty). """ - repr_attributes = ("language",) + repr_attributes = BlockToken.repr_attributes + ("language",) pattern = re.compile(r'( {0,3})(`{3,}|~{3,})( *(\S*)[^\n]*)') _open_info = None @@ -468,7 +471,7 @@ class List(BlockToken): loose (bool): whether the list is loose. 
start (NoneType or int): None if unordered, starting number if ordered. """ - repr_attributes = ("loose", "start") + repr_attributes = BlockToken.repr_attributes + ("loose", "start") pattern = re.compile(r' {0,3}(?:\d{0,9}[.)]|[+\-*])(?:[ \t]*$|[ \t]+)') def __init__(self, matches): self.children = [ListItem(*match) for match in matches] @@ -539,7 +542,7 @@ class ListItem(BlockToken): for continuation lines. loose (bool): whether the list is loose. """ - repr_attributes = ("leader", "indentation", "prepend", "loose") + repr_attributes = BlockToken.repr_attributes + ("leader", "indentation", "prepend", "loose") pattern = re.compile(r'( {0,3})(\d{0,9}[.)]|[+\-*])($|\s+)') continuation_pattern = re.compile(r'([ \t]*)(\S.*\n|\n)') @@ -684,7 +687,7 @@ class Table(BlockToken): header: header row (TableRow). column_align (list): align options for each column (default to [None]). """ - repr_attributes = ("column_align",) + repr_attributes = BlockToken.repr_attributes + ("column_align",) interrupt_paragraph = True def __init__(self, match): @@ -760,7 +763,7 @@ class TableRow(BlockToken): Attributes: row_align (list): align options for each column (default to [None]). """ - repr_attributes = ("row_align",) + repr_attributes = BlockToken.repr_attributes + ("row_align",) # Note: Python regex requires fixed-length look-behind, # so we cannot use a more precise alternative: r"(? 
Foo") - self._check_repr_matches(doc.children[0], "block_token.Quote with 1 child") + self._check_repr_matches(doc.children[0], "block_token.Quote with 1 child line_number=1") def test_paragraph(self): doc = Document("Foo") - self._check_repr_matches(doc.children[0], "block_token.Paragraph with 1 child") + self._check_repr_matches(doc.children[0], "block_token.Paragraph with 1 child line_number=1") def test_blockcode(self): doc = Document("Foo\n\n\tBar\n\nBaz") - self._check_repr_matches(doc.children[1], "block_token.BlockCode with 1 child language=''") + self._check_repr_matches(doc.children[1], "block_token.BlockCode with 1 child line_number=3 language=''") def test_codefence(self): doc = Document("""```python\nprint("Hello, World!"\n```""") - self._check_repr_matches(doc.children[0], "block_token.CodeFence with 1 child language='python'") + self._check_repr_matches(doc.children[0], "block_token.CodeFence with 1 child line_number=1 language='python'") def test_unordered_list(self): doc = Document("* Foo\n* Bar\n* Baz") - self._check_repr_matches(doc.children[0], "block_token.List with 3 children loose=False start=None") - self._check_repr_matches(doc.children[0].children[0], "block_token.ListItem with 1 child leader='*' indentation=0 prepend=2 loose=False") + self._check_repr_matches(doc.children[0], "block_token.List with 3 children line_number=1 loose=False start=None") + self._check_repr_matches(doc.children[0].children[0], "block_token.ListItem with 1 child line_number=1 leader='*' indentation=0 prepend=2 loose=False") def test_ordered_list(self): doc = Document("1. Foo\n2. Bar\n3. Baz") - self._check_repr_matches(doc.children[0], "block_token.List with 3 children loose=False start=1") - self._check_repr_matches(doc.children[0].children[0], "block_token.ListItem with 1 child leader='1.' 
indentation=0 prepend=3 loose=False") + self._check_repr_matches(doc.children[0], "block_token.List with 3 children line_number=1 loose=False start=1") + self._check_repr_matches(doc.children[0].children[0], "block_token.ListItem with 1 child line_number=1 leader='1.' indentation=0 prepend=3 loose=False") def test_table(self): doc = Document("| Foo | Bar | Baz |\n|:--- |:---:| ---:|\n| Foo | Bar | Baz |\n") - self._check_repr_matches(doc.children[0], "block_token.Table with 1 child column_align=[None, 0, 1]") - self._check_repr_matches(doc.children[0].children[0], "block_token.TableRow with 3 children row_align=[None, 0, 1]") - self._check_repr_matches(doc.children[0].children[0].children[0], "block_token.TableCell with 1 child align=None") + self._check_repr_matches(doc.children[0], "block_token.Table with 1 child line_number=1 column_align=[None, 0, 1]") + self._check_repr_matches(doc.children[0].children[0], "block_token.TableRow with 3 children line_number=3 row_align=[None, 0, 1]") + self._check_repr_matches(doc.children[0].children[0].children[0], "block_token.TableCell with 1 child line_number=3 align=None") def test_thematicbreak(self): doc = Document("Foo\n\n---\n\nBar\n") - self._check_repr_matches(doc.children[1], "block_token.ThematicBreak") + self._check_repr_matches(doc.children[1], "block_token.ThematicBreak line_number=3") # No test for ``Footnote`` @@ -70,7 +70,7 @@ def test_htmlblock(self): doc = Document("
\nFoo\n
\n") finally: block_token.reset_tokens() - self._check_repr_matches(doc.children[0], "block_token.HtmlBlock with 1 child") + self._check_repr_matches(doc.children[0], "block_token.HtmlBlock with 1 child line_number=1") self._check_repr_matches(doc.children[0].children[0], "span_token.RawText content='
\\nFoo\\n
'") # Span tokens