Assign line numbers to all block tokens during parsing. Added unit tests.
miyuchina · Aug 14, 2023 · 639e1fd
1 parent 21cb864
commit 639e1fd
Show file tree

Hide file tree

Showing 6 changed files with 126 additions and 25 deletions.
diff --git a/mistletoe/block_token.py b/mistletoe/block_token.py
@@ -107,6 +107,7 @@ class BlockToken(token.Token):
 
     Attributes:
         children (list): inner tokens.
+        line_number (int): starting line (1-based).
     """
     def __init__(self, lines, tokenize_func):
         self.children = tokenize_func(lines)
@@ -680,16 +681,17 @@ class Table(BlockToken):
     repr_attributes = ("column_align",)
     interrupt_paragraph = True
 
-    def __init__(self, lines):
+    def __init__(self, match):
+        lines, start_line = match
         if '---' in lines[1]:
             self.column_align = [self.parse_align(column)
                     for column in self.split_delimiter(lines[1])]
-            self.header = TableRow(lines[0], self.column_align)
-            self.children = [TableRow(line, self.column_align) for line in lines[2:]]
+            self.header = TableRow(lines[0], start_line, self.column_align)
+            self.children = [TableRow(line, start_line + offset, self.column_align) for offset, line in enumerate(lines[2:], start=2)]
         else:
             # note: not reachable, because read() guarantees the presence of three dashes
             self.column_align = [None]
-            self.children = [TableRow(line) for line in lines]
+            self.children = [TableRow(line, start_line + offset) for offset, line in enumerate(lines)]
 
     @staticmethod
     def split_delimiter(delimiter):
@@ -733,12 +735,13 @@ def check_interrupts_paragraph(cls, lines):
     def read(lines):
         anchor = lines.get_pos()
         line_buffer = [next(lines)]
+        start_line = lines.line_number()
         while lines.peek() is not None and '|' in lines.peek():
             line_buffer.append(next(lines))
         if len(line_buffer) < 2 or '---' not in line_buffer[1]:
             lines.set_pos(anchor)
             return None
-        return line_buffer
+        return line_buffer, start_line
 
 
 class TableRow(BlockToken):
@@ -757,10 +760,11 @@ class TableRow(BlockToken):
     split_pattern = re.compile(r"(?<!\\)\|")
     escaped_pipe_pattern = re.compile(r"(?<!\\)(\\\\)*\\\|")
 
-    def __init__(self, line, row_align=None):
+    def __init__(self, line, line_number, row_align=None):
         self.row_align = row_align or [None]
+        self.line_number = line_number
         cells = filter(None, self.split_pattern.split(line.strip()))
-        self.children = [TableCell(self.escaped_pipe_pattern.sub('\\1|', cell.strip()) if cell else '', align)
+        self.children = [TableCell(self.escaped_pipe_pattern.sub('\\1|', cell.strip()) if cell else '', line_number, align)
                          for cell, align in zip_longest(cells, self.row_align)]
 
 
@@ -775,8 +779,9 @@ class TableCell(BlockToken):
         align (bool): align option for current cell (default to None).
     """
     repr_attributes = ("align",)
-    def __init__(self, content, align=None):
+    def __init__(self, content, line_number, align=None):
         self.align = align
+        self.line_number = line_number
         super().__init__(content, span_token.tokenize_inner)
 
 

diff --git a/mistletoe/block_tokenizer.py b/mistletoe/block_tokenizer.py
@@ -79,9 +79,10 @@ def tokenize_block(iterable, token_types, start_line=1):
     while line is not None:
         for token_type in token_types:
             if token_type.start(line):
+                line_number = lines.line_number() + 1
                 result = token_type.read(lines)
                 if result is not None:
-                    parse_buffer.append((token_type, result))
+                    parse_buffer.append((token_type, result, line_number))
                     break
         else:  # unmatched newlines
             next(lines)
@@ -99,9 +100,10 @@ def make_tokens(parse_buffer):
     and span-level parsing is started here.
     """
     tokens = []
-    for token_type, result in parse_buffer:
+    for token_type, result, line_number in parse_buffer:
         token = token_type(result)
         if token is not None:
+            token.line_number = line_number
             tokens.append(token)
     return tokens
 

diff --git a/mistletoe/markdown_renderer.py b/mistletoe/markdown_renderer.py
@@ -51,7 +51,7 @@ def __init__(self, match):
 class LinkReferenceDefinitionBlock(block_token.Footnote):
     """
     A sequence of link reference definitions.
-    This is a container block token. Its children are link reference definition tokens.
+    This is a leaf block token. Its children are link reference definition tokens.
 
     This class inherits from `Footnote` and modifies the behavior of the constructor,
     to keep the tokens in the AST.

diff --git a/test/samples/line_numbers.md b/test/samples/line_numbers.md
@@ -0,0 +1,37 @@
+# Test document for line numbers
+
+See test_line_numbers.py.
+Every number written as inline code should match the line number of its nearest
+parent block token. Same with titles of link reference definitions.
+
+## Heading `7`
+
+Basic paragraph.
+
+> Block quote (with a paragraph inside)
+> It's the start line that counts! `11`
+> * a list inside the quote
+> * with two items `14`
+>
+> Still the same block quote, but another paragraph `16`
+
+1. List item `18`
+2. Nested list
+   * item
+   * another item `21`
+3. List item with a nested `22`
+   > block quote `23`
+
+| Table `25` | Columns |
+| ---------- | ------- |
+| ?          | ! `27`  |
+
+Paragraph with <p>inline HTML</p> `29`
+
+Setext
+heading `31`
+------------
+
+Paragraph with [ref] to a link reference definition.
+
+[ref]: /url (37)
diff --git a/test/test_block_token.py b/test/test_block_token.py
@@ -390,7 +390,7 @@ def test_match(self):
             self.assertTrue(hasattr(token, 'header'))
             self.assertEqual(token.column_align, [None, None, None])
             token.children
-            calls = [call(line, [None, None, None]) for line in lines[:1]+lines[2:]]
+            calls = [call(line, line_number, [None, None, None]) for line_number, line in enumerate(lines, start=1) if line_number != 2]
             mock.assert_has_calls(calls)
 
     def test_easy_table(self):
@@ -403,7 +403,7 @@ def test_easy_table(self):
             self.assertTrue(hasattr(token, 'header'))
             self.assertEqual(token.column_align, [1, None])
             token.children
-            calls = [call(line, [1, None]) for line in lines[:1] + lines[2:]]
+            calls = [call(line, line_number, [1, None]) for line_number, line in enumerate(lines, start=1) if line_number != 2]
             mock.assert_has_calls(calls)
 
     def test_not_easy_table(self):
@@ -433,44 +433,44 @@ class TestTableRow(unittest.TestCase):
     def test_match(self):
         with patch('mistletoe.block_token.TableCell') as mock:
             line = '| cell 1 | cell 2 |\n'
-            token = block_token.TableRow(line)
+            token = block_token.TableRow(line, 10)
             self.assertEqual(token.row_align, [None])
-            mock.assert_has_calls([call('cell 1', None), call('cell 2', None)])
+            mock.assert_has_calls([call('cell 1', 10, None), call('cell 2', 10, None)])
 
     def test_easy_table_row(self):
         with patch('mistletoe.block_token.TableCell') as mock:
             line = 'cell 1 | cell 2\n'
-            token = block_token.TableRow(line)
+            token = block_token.TableRow(line, 10)
             self.assertEqual(token.row_align, [None])
-            mock.assert_has_calls([call('cell 1', None), call('cell 2', None)])
+            mock.assert_has_calls([call('cell 1', 10, None), call('cell 2', 10, None)])
 
     def test_short_row(self):
         with patch('mistletoe.block_token.TableCell') as mock:
             line = '| cell 1 |\n'
-            token = block_token.TableRow(line, [None, None])
+            token = block_token.TableRow(line, 10, [None, None])
             self.assertEqual(token.row_align, [None, None])
-            mock.assert_has_calls([call('cell 1', None), call('', None)])
+            mock.assert_has_calls([call('cell 1', 10, None), call('', 10, None)])
 
     def test_escaped_pipe_in_cell(self):
         with patch('mistletoe.block_token.TableCell') as mock:
             line = '| pipe: `\\|` | cell 2\n'
-            token = block_token.TableRow(line, [None, None])
+            token = block_token.TableRow(line, 10, [None, None])
             self.assertEqual(token.row_align, [None, None])
-            mock.assert_has_calls([call('pipe: `|`', None), call('cell 2', None)])
+            mock.assert_has_calls([call('pipe: `|`', 10, None), call('cell 2', 10, None)])
 
     @unittest.skip('Even GitHub fails in here, workaround: always put a space before `|`')
     def test_not_really_escaped_pipe_in_cell(self):
         with patch('mistletoe.block_token.TableCell') as mock:
             line = '|ending with a \\\\|cell 2\n'
-            token = block_token.TableRow(line, [None, None])
+            token = block_token.TableRow(line, 10, [None, None])
             self.assertEqual(token.row_align, [None, None])
-            mock.assert_has_calls([call('ending with a \\\\', None), call('cell 2', None)])
+            mock.assert_has_calls([call('ending with a \\\\', 10, None), call('cell 2', 10, None)])
 
 
 class TestTableCell(TestToken):
     def test_match(self):
-        token = block_token.TableCell('cell 2')
-        self._test_token(token, 'cell 2', align=None)
+        token = block_token.TableCell('cell 2', 13)
+        self._test_token(token, 'cell 2', line_number=13, align=None)
 
 
 class TestFootnote(unittest.TestCase):

diff --git a/test/test_line_numbers.py b/test/test_line_numbers.py
@@ -0,0 +1,57 @@
+import unittest
+
+import mistletoe.block_token as block_token
+import mistletoe.span_token as span_token
+from mistletoe.markdown_renderer import (
+    LinkReferenceDefinition,
+    LinkReferenceDefinitionBlock,
+)
+
+
+class TestLineNumbers(unittest.TestCase):
+    def setUp(self) -> None:
+        block_token.add_token(block_token.HTMLBlock)
+        span_token.add_token(span_token.HTMLSpan)
+        block_token.remove_token(block_token.Footnote)
+        block_token.add_token(LinkReferenceDefinitionBlock)
+        return super().setUp()
+
+    def tearDown(self) -> None:
+        span_token.reset_tokens()
+        block_token.reset_tokens()
+        return super().tearDown()
+
+    def test_main(self):
+        # see line_numbers.md for a description of how the test works.
+        NUMBER_OF_LINE_NUMBERS_TO_BE_CHECKED = 13
+        with open("test/samples/line_numbers.md", "r") as fin:
+            document = block_token.Document(fin)
+        count = self.check_line_numbers(document)
+        self.assertEqual(count, NUMBER_OF_LINE_NUMBERS_TO_BE_CHECKED)
+
+    def check_line_numbers(self, token: block_token.BlockToken):
+        """Check the line number on the given block token and its children, if possible."""
+        count = 0
+        line_number = self.get_expected_line_number(token)
+        if line_number:
+            self.assertEqual(token.line_number, line_number)
+            count += 1
+
+        if isinstance(token, block_token.Table):
+            count += self.check_line_numbers(token.header)
+
+        for child in token.children:
+            if isinstance(child, block_token.BlockToken):
+                count += self.check_line_numbers(child)
+
+        return count
+
+    def get_expected_line_number(self, token: block_token.BlockToken):
+        # the expected line number, if it exists, should be wrapped in an inline
+        # code token and be an immediate child of the token.
+        # or it could be the title of a link reference definition.
+        for child in token.children:
+            if isinstance(child, span_token.InlineCode):
+                return int(child.children[0].content)
+            if isinstance(child, LinkReferenceDefinition):
+                return int(child.title)