From 639e1fdb50005e5c46992d5b634246db12f3da09 Mon Sep 17 00:00:00 2001 From: Anders Kaplan Date: Thu, 22 Jun 2023 22:34:35 +0200 Subject: [PATCH] Assign line numbers to all block tokens during parsing. Added unit tests. --- mistletoe/block_token.py | 21 ++++++++----- mistletoe/block_tokenizer.py | 6 ++-- mistletoe/markdown_renderer.py | 2 +- test/samples/line_numbers.md | 37 ++++++++++++++++++++++ test/test_block_token.py | 28 ++++++++--------- test/test_line_numbers.py | 57 ++++++++++++++++++++++++++++++++++ 6 files changed, 126 insertions(+), 25 deletions(-) create mode 100644 test/samples/line_numbers.md create mode 100644 test/test_line_numbers.py diff --git a/mistletoe/block_token.py b/mistletoe/block_token.py index 5b12bfac..c20da1fa 100644 --- a/mistletoe/block_token.py +++ b/mistletoe/block_token.py @@ -107,6 +107,7 @@ class BlockToken(token.Token): Attributes: children (list): inner tokens. + line_number (int): starting line (1-based). """ def __init__(self, lines, tokenize_func): self.children = tokenize_func(lines) @@ -680,16 +681,17 @@ class Table(BlockToken): repr_attributes = ("column_align",) interrupt_paragraph = True - def __init__(self, lines): + def __init__(self, match): + lines, start_line = match if '---' in lines[1]: self.column_align = [self.parse_align(column) for column in self.split_delimiter(lines[1])] - self.header = TableRow(lines[0], self.column_align) - self.children = [TableRow(line, self.column_align) for line in lines[2:]] + self.header = TableRow(lines[0], start_line, self.column_align) + self.children = [TableRow(line, start_line + offset, self.column_align) for offset, line in enumerate(lines[2:], start=2)] else: # note: not reachable, because read() guarantees the presence of three dashes self.column_align = [None] - self.children = [TableRow(line) for line in lines] + self.children = [TableRow(line, start_line + offset) for offset, line in enumerate(lines)] @staticmethod def split_delimiter(delimiter): @@ -733,12 +735,13 @@ def check_interrupts_paragraph(cls, lines): def read(lines): anchor = lines.get_pos() line_buffer = [next(lines)] + start_line = lines.line_number() while lines.peek() is not None and '|' in lines.peek(): line_buffer.append(next(lines)) if len(line_buffer) < 2 or '---' not in line_buffer[1]: lines.set_pos(anchor) return None - return line_buffer + return line_buffer, start_line class TableRow(BlockToken): @@ -757,10 +760,11 @@ class TableRow(BlockToken): split_pattern = re.compile(r"(? Block quote (with a paragraph inside) +> It's the start line that counts! `11` +> * a list inside the quote +> * with two items `14` +> +> Still the same block quote, but another paragraph `16` + +1. List item `18` +2. Nested list + * item + * another item `21` +3. List item with a nested `22` + > block quote `23` + +| Table `25` | Columns | +| ---------- | ------- | +| ? | ! `27` | + +Paragraph with

inline HTML

`29` + +Setext +heading `31` +------------ + +Paragraph with [ref] to a link reference definition. + +[ref]: /url (37) diff --git a/test/test_block_token.py b/test/test_block_token.py index 99901f2f..324ad0e2 100644 --- a/test/test_block_token.py +++ b/test/test_block_token.py @@ -390,7 +390,7 @@ def test_match(self): self.assertTrue(hasattr(token, 'header')) self.assertEqual(token.column_align, [None, None, None]) token.children - calls = [call(line, [None, None, None]) for line in lines[:1]+lines[2:]] + calls = [call(line, line_number, [None, None, None]) for line_number, line in enumerate(lines, start=1) if line_number != 2] mock.assert_has_calls(calls) def test_easy_table(self): @@ -403,7 +403,7 @@ def test_easy_table(self): self.assertTrue(hasattr(token, 'header')) self.assertEqual(token.column_align, [1, None]) token.children - calls = [call(line, [1, None]) for line in lines[:1] + lines[2:]] + calls = [call(line, line_number, [1, None]) for line_number, line in enumerate(lines, start=1) if line_number != 2] mock.assert_has_calls(calls) def test_not_easy_table(self): @@ -433,44 +433,44 @@ class TestTableRow(unittest.TestCase): def test_match(self): with patch('mistletoe.block_token.TableCell') as mock: line = '| cell 1 | cell 2 |\n' - token = block_token.TableRow(line) + token = block_token.TableRow(line, 10) self.assertEqual(token.row_align, [None]) - mock.assert_has_calls([call('cell 1', None), call('cell 2', None)]) + mock.assert_has_calls([call('cell 1', 10, None), call('cell 2', 10, None)]) def test_easy_table_row(self): with patch('mistletoe.block_token.TableCell') as mock: line = 'cell 1 | cell 2\n' - token = block_token.TableRow(line) + token = block_token.TableRow(line, 10) self.assertEqual(token.row_align, [None]) - mock.assert_has_calls([call('cell 1', None), call('cell 2', None)]) + mock.assert_has_calls([call('cell 1', 10, None), call('cell 2', 10, None)]) def test_short_row(self): with patch('mistletoe.block_token.TableCell') as mock: line = '| cell 1 |\n' - token = block_token.TableRow(line, [None, None]) + token = block_token.TableRow(line, 10, [None, None]) self.assertEqual(token.row_align, [None, None]) - mock.assert_has_calls([call('cell 1', None), call('', None)]) + mock.assert_has_calls([call('cell 1', 10, None), call('', 10, None)]) def test_escaped_pipe_in_cell(self): with patch('mistletoe.block_token.TableCell') as mock: line = '| pipe: `\\|` | cell 2\n' - token = block_token.TableRow(line, [None, None]) + token = block_token.TableRow(line, 10, [None, None]) self.assertEqual(token.row_align, [None, None]) - mock.assert_has_calls([call('pipe: `|`', None), call('cell 2', None)]) + mock.assert_has_calls([call('pipe: `|`', 10, None), call('cell 2', 10, None)]) @unittest.skip('Even GitHub fails in here, workaround: always put a space before `|`') def test_not_really_escaped_pipe_in_cell(self): with patch('mistletoe.block_token.TableCell') as mock: line = '|ending with a \\\\|cell 2\n' - token = block_token.TableRow(line, [None, None]) + token = block_token.TableRow(line, 10, [None, None]) self.assertEqual(token.row_align, [None, None]) - mock.assert_has_calls([call('ending with a \\\\', None), call('cell 2', None)]) + mock.assert_has_calls([call('ending with a \\\\', 10, None), call('cell 2', 10, None)]) class TestTableCell(TestToken): def test_match(self): - token = block_token.TableCell('cell 2') - self._test_token(token, 'cell 2', align=None) + token = block_token.TableCell('cell 2', 13) + self._test_token(token, 'cell 2', line_number=13, align=None) class TestFootnote(unittest.TestCase): diff --git a/test/test_line_numbers.py b/test/test_line_numbers.py new file mode 100644 index 00000000..897939b3 --- /dev/null +++ b/test/test_line_numbers.py @@ -0,0 +1,57 @@ +import unittest + +import mistletoe.block_token as block_token +import mistletoe.span_token as span_token +from mistletoe.markdown_renderer import ( + LinkReferenceDefinition, + LinkReferenceDefinitionBlock, +) + + +class TestLineNumbers(unittest.TestCase): + def setUp(self) -> None: + block_token.add_token(block_token.HTMLBlock) + span_token.add_token(span_token.HTMLSpan) + block_token.remove_token(block_token.Footnote) + block_token.add_token(LinkReferenceDefinitionBlock) + return super().setUp() + + def tearDown(self) -> None: + span_token.reset_tokens() + block_token.reset_tokens() + return super().tearDown() + + def test_main(self): + # see line_numbers.md for a description of how the test works. + NUMBER_OF_LINE_NUMBERS_TO_BE_CHECKED = 13 + with open("test/samples/line_numbers.md", "r") as fin: + document = block_token.Document(fin) + count = self.check_line_numbers(document) + self.assertEqual(count, NUMBER_OF_LINE_NUMBERS_TO_BE_CHECKED) + + def check_line_numbers(self, token: block_token.BlockToken): + """Check the line number on the given block token and its children, if possible.""" + count = 0 + line_number = self.get_expected_line_number(token) + if line_number: + self.assertEqual(token.line_number, line_number) + count += 1 + + if isinstance(token, block_token.Table): + count += self.check_line_numbers(token.header) + + for child in token.children: + if isinstance(child, block_token.BlockToken): + count += self.check_line_numbers(child) + + return count + + def get_expected_line_number(self, token: block_token.BlockToken): + # the expected line number, if it exists, should be wrapped in an inline + # code token and be an immediate child of the token. + # or it could be the title of a link reference definition. + for child in token.children: + if isinstance(child, span_token.InlineCode): + return int(child.children[0].content) + if isinstance(child, LinkReferenceDefinition): + return int(child.title)