From 1b9d7e4501bfc807d24e5bb20f93709cf98f1e05 Mon Sep 17 00:00:00 2001 From: Mathias Kende Date: Sat, 30 Mar 2024 22:22:59 +0100 Subject: [PATCH] Initial support for the GitHub table syntax. For now, leading and trailing pipes are always required. --- .aspelldict | 3 +- lib/Markdown/Perl.pm | 17 +++++ lib/Markdown/Perl/BlockParser.pm | 105 +++++++++++++++++++++---------- t/901-github-test-suite.t | 2 +- 4 files changed, 92 insertions(+), 35 deletions(-) diff --git a/.aspelldict b/.aspelldict index f3f9ee5..967f10c 100644 --- a/.aspelldict +++ b/.aspelldict @@ -1,4 +1,4 @@ -personal_ws-1.1 en 168 +personal_ws-1.1 en 169 CDATA CommonMark CounterClockwiseContourIntegral @@ -77,6 +77,7 @@ figcaption fj frameset gc +gcx gfm github gx diff --git a/lib/Markdown/Perl.pm b/lib/Markdown/Perl.pm index f0ea8c2..4c04b15 100644 --- a/lib/Markdown/Perl.pm +++ b/lib/Markdown/Perl.pm @@ -152,6 +152,23 @@ sub _emit_html { .join("\n
  • ", map { $this->_emit_html(!$loose, 'list', $linkrefs, @{$_->{content}}) } @{$b->{items}}) ."
  • \n\n"; + } elsif ($b->{type} eq 'table') { + $out .= ''; + if (@{$b->{content}{table}}) { + $out .= ''; + for my $l (@{$b->{content}{table}}) { + $out .= ''; + } + $out .= ''; + } + $out .= '
    '; + $out .= join('', + map { $this->_render_inlines($linkrefs, $_) } @{$b->{content}{headers}}); + $out .= '
    '; + $out .= join('', map { $this->_render_inlines($linkrefs, $_) } @{$l}); + $out .= '
    '; + } else { + confess 'Unexpected block type when rendering HTML output: '.$b->{type}; } } # Note: a final new line should always be appended to $out. This is not diff --git a/lib/Markdown/Perl/BlockParser.pm b/lib/Markdown/Perl/BlockParser.pm index eb36a9d..1b2c017 100644 --- a/lib/Markdown/Perl/BlockParser.pm +++ b/lib/Markdown/Perl/BlockParser.pm @@ -63,6 +63,8 @@ sub AUTOLOAD { ## no critic (ProhibitAutoloading, RequireArgUnpacking) return $this->{pmarkdown}->$AUTOLOAD(@_); } +my $eol_re = qr/ \r\n | \n | \r /x; + sub next_line { my ($this) = @_; # When we are forcing a line, we don’t recompute the line_ending, but it @@ -71,7 +73,7 @@ sub next_line { return delete $this->{forced_line} if exists $this->{forced_line}; return if pos($this->{md}) == length($this->{md}); $this->{last_pos} = pos($this->{md}); - $this->{md} =~ m/\G([^\n\r]*)(\r\n|\n|\r)?/g or confess 'Should not happen'; + $this->{md} =~ m/\G([^\n\r]*)(${eol_re})?/g or confess 'Should not happen'; my ($t, $e) = ($1, $2); if ($1 =~ /^[ \t]+$/) { $this->{line_ending} = $t.($e // '') if $this->get_preserve_white_lines; @@ -649,23 +651,18 @@ sub _do_link_reference_definition { $this->redo_line(); my $start_pos = $this->get_pos(); - # We consume the prefix of enclosing blocks until we find the marker that we - # know is there. This won’t work if we accept task list markers in the - # future. - # This also won’t work to consume markers of subsequent lines of the link - # reference definition. - # TODO: fix these two bugs above (hard! — although in practice the only - # prefix character that can exist are '>' at the beginning of the line, so - # we could try to count them, we don’t even need to count spaces for the lists - # because the link definition is considered to be paragraph continuation text - # by cmark, the spec seems to accept any number of additional spaces too). + # We consume the continuation prefix of enclosing blocks. Note that in the big + # regex we allow any number of space after the continuation because it’s what + # cmark does. my $cont = $this->{continuation_re}; - $this->{md} =~ m/\G${cont}/g; + confess 'Unexpected regex match failure' unless $this->{md} =~ m/\G${cont}/g; # TODO: # - Support for escaped or balanced parenthesis in naked destination # - break this up in smaller pieces and test them independently. # - The need to disable ProhibitUnusedCapture seems to be buggy... + # - most of the regex parses only \n and not other eol sequence. The regex + # should either be fixed or the entry be normalized. ## no critic (ProhibitComplexRegexes, ProhibitUnusedCapture) if ( $this->{md} =~ m/\G @@ -722,28 +719,70 @@ sub _do_link_reference_definition { # https://github.github.com/gfm/#tables-extension- sub _do_table_block { my ($this) = @_; - return; - # # TODO: add an option to prevent interrupting a paragraph with a table (and - # # make it be true for pmarkdown, but not for github where tables can interrupt - # # a paragraph). - # return unless $l =~ m/^ {0,3}\|/; - # my $init_pos = $this->get_pos(); - # $this->redo_line(); - # my $start_pos = $this->get_pos(); - - # # See the note in the link_reference parsing for this approach. Note that, - # # as opposed to what happens for links, subsequent lines can have at most - # # 3 more spaces than the initial one with the GitHub implementation (but not - # # some other GFM implementations). - # $this->{md} =~ m/\G.*?\|/g; - - # # TODO: - # # - break this up in smaller pieces and test them independently. - # ## no critic (ProhibitComplexRegexes) - # if ($this->{md} =~ m/\G/x) { } - - # return; + # TODO: add an option to prevent interrupting a paragraph with a table (and + # make it be true for pmarkdown, but not for github where tables can interrupt + # a paragraph). + # TODO: github supports omitting the first | even on the first line when we + # are not interrupting a paragraph and when subsequent the delimiter line has + # more than one dash per cell. + return unless $l =~ m/^ {0,3}\|/; + my $init_pos = $this->get_pos(); + $this->redo_line(); + + my $table = $this->_parse_table_structure(); + if (!$table) { + $this->set_pos($init_pos); + return; + } + + $this->_add_block({type => 'table', content => $table}); + + return 1; +} + +sub _parse_table_structure { + my ($this) = @_; + + # A regexp that matches no back-slashes or an even number of them, so that the + # next character cannot be escaped. + my $e = qr/(?{continuation_re}; + confess 'Unexpected regex match failure' unless $this->{md} =~ m/\G${cont}/g; + + # Now we consume the initial | marking the beginning of the table that we know + # is here because of the initial match against $l in _do_table_block. + confess 'Unexpected missing table markers' unless $this->{md} =~ m/\G {0,3}\|/g; + + # We parse the header row + my @headers = $this->{md} =~ m/\G [ \t]* (.*? [ \t]* $e) \| /gcx; + return unless @headers; + + # We consume the end of line that must happen after the headers. + return unless $this->{md} =~ m/\G [ \t]* ${eol_re} ${cont} \ {0,3} \|? /gx; + + my @separators = $this->{md} =~ m/\G [ \t]* ( :? -+ :? [ \t]* $e) \| /gcx; + return unless @separators == @headers; + + # We consume the end of line that must happen after the headers. + return unless $this->{md} =~ m/\G [ \t]* (:? ${eol_re} | $ ) /gx; + + # And now we try to read as many lines as possible + my @table; + while (1) { + last unless $this->{md} =~ m/\G ${cont} \ {0,3} \| /gcx; + my @cells = $this->{md} =~ m/\G [ \t]* (.*? [ \t]* $e) \| /gcx; + # We consume the end of line that must happen after the cells. + return unless $this->{md} =~ m/\G [ \t]* (:? ${eol_re} | $ ) /gx; + push @table, \@cells; + } + + return {headers => \@headers, separators => \@separators, table => \@table}; } # https://spec.commonmark.org/0.30/#paragraphs diff --git a/t/901-github-test-suite.t b/t/901-github-test-suite.t index aba9df8..5d0b9db 100644 --- a/t/901-github-test-suite.t +++ b/t/901-github-test-suite.t @@ -9,7 +9,7 @@ use CmarkTest; use Test2::V0; # TODO: remove these todos. -my %opt = (todo => [198 .. 202, 204, 205], +my %opt = (todo => [199 .. 202, 204, 205], # These are bugs in the GitHub spec, not in our implementation. All # of these have been tested to be buggy in the real cmark-gfm # implementation.