Skip to content

Commit

Permalink
Applies a workaround for parser bug dealing with inline comments.
Browse files Browse the repository at this point in the history
This tries to split off inline comments from content that immediately
follows them.  This works around bug
pulldown-cmark/pulldown-cmark#712.
  • Loading branch information
dyoo committed Sep 5, 2023
1 parent 08efd0b commit c9c53cb
Showing 1 changed file with 112 additions and 29 deletions.
141 changes: 112 additions & 29 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
//! how to use the supplied `mdbook` plugins.
use polib::catalog::Catalog;
use pulldown_cmark::{Event, LinkType, Tag};
use pulldown_cmark::{CowStr, Event, LinkType, Tag};
use pulldown_cmark_to_cmark::{cmark_resume_with_options, Options, State};
use regex::Regex;
use std::sync::OnceLock;
Expand Down Expand Up @@ -90,6 +90,38 @@ pub fn extract_events<'a>(text: &'a str, state: Option<State<'static>>) -> Vec<(
}
}

fn expand_event(event: Event) -> Vec<Event> {
match event {
Event::SoftBreak => vec![Event::Text(" ".into())],
// Shortcut links like "[foo]" end up as "[foo]"
// in output. By changing them to a reference
// link, the link is expanded on the fly and the
// output becomes self-contained.
Event::Start(tag @ Tag::Link(..) | tag @ Tag::Image(..)) => {
vec![Event::Start(expand_shortcut_link(tag))]
}
Event::End(tag @ Tag::Link(..) | tag @ Tag::Image(..)) => {
vec![Event::End(expand_shortcut_link(tag))]
}
Event::Html(text) => {
// WARNING: the following is a hack to try to
// work around the behavior of the parser when
// it runs into HTML.
match split_first_html_comment(&text) {
Some((comment, after_comment)) => {
if !after_comment.trim().is_empty() {
vec![Event::Html(comment), Event::Text(after_comment)]
} else {
vec![Event::Html(comment)]
}
}
None => vec![Event::Html(text)],
}
}
_ => vec![event],
}
}

match state {
// If we're in a code block, we disable the normal parsing and
// return lines of text. This matches the behavior of the
Expand All @@ -102,28 +134,47 @@ pub fn extract_events<'a>(text: &'a str, state: Option<State<'static>>) -> Vec<(
// Otherwise, we parse the text line normally.
_ => new_cmark_parser(text, None)
.into_offset_iter()
.map(|(event, range)| {
.flat_map(|(event, range)| {
let lineno = offsets.partition_point(|&o| o < range.start) + 1;
let event = match event {
Event::SoftBreak => Event::Text(" ".into()),
// Shortcut links like "[foo]" end up as "[foo]"
// in output. By changing them to a reference
// link, the link is expanded on the fly and the
// output becomes self-contained.
Event::Start(tag @ Tag::Link(..) | tag @ Tag::Image(..)) => {
Event::Start(expand_shortcut_link(tag))
}
Event::End(tag @ Tag::Link(..) | tag @ Tag::Image(..)) => {
Event::End(expand_shortcut_link(tag))
}
_ => event,
};
(lineno, event)
expand_event(event)
.into_iter()
.map(move |event| (lineno, event))
.collect::<Vec<_>>()
})
.collect(),
}
}

/// Given a html string that begins with an HTML comment, splits the
/// string into two parts: the HTML comment, followed by the content
/// following the comment. This is to workaround an issue:
/// https://github.com/raphlinus/pulldown-cmark/issues/712
fn split_first_html_comment<'a>(html: &CowStr<'a>) -> Option<(CowStr<'a>, CowStr<'a>)> {
const COMMENT_START: &str = "<!--";
const COMMENT_END: &str = "-->";

let start_index = html.find(COMMENT_START)?;
if start_index != 0 {
return None;
}

let comment_start_index = COMMENT_START.len();
let comment_end_index =
html[comment_start_index..].find(COMMENT_END)? + comment_start_index + COMMENT_END.len();

// Slice and preserve CowStr::Borrowed state when possible:
match html {
CowStr::Borrowed(b) => {
let (comment, rest) = b.split_at(comment_end_index);
Some((comment.into(), rest.into()))
}
_ => {
let (comment, rest) = html.split_at(comment_end_index);
Some((comment.to_owned().into(), rest.to_owned().into()))
}
}
}

/// Markdown events grouped by type.
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum Group<'a> {
Expand Down Expand Up @@ -657,14 +708,55 @@ mod tests {
assert_eq!(
extract_events("<!--- mdbook-xgettext:skip -->\nHello", None),
vec![
(1, Html("<!--- mdbook-xgettext:skip -->\n".into())),
(1, Html("<!--- mdbook-xgettext:skip -->".into())),
(2, Start(Paragraph)),
(2, Text("Hello".into())),
(2, End(Paragraph)),
]
);
}

#[test]
fn split_first_html_comment_empty() {
assert_eq!(split_first_html_comment(&"".into()), None);
}

#[test]
fn split_first_html_comment_no_comment() {
assert_eq!(split_first_html_comment(&"not-a-comment".into()), None);
}

#[test]
fn split_first_html_comment_comment_alone_() {
let html = "<!-- hello -->";
assert_eq!(
split_first_html_comment(&CowStr::Borrowed(&html)),
Some((CowStr::Borrowed(&html), "".into()))
);
}

#[test]
fn split_first_html_comment_() {
let html = "<!-- blah --> after comment";
let (left_chunk, right_chunk) = html.split_at(13);
// Sanity check to make sure we are looking at the expected substrings
assert_eq!(left_chunk, "<!-- blah -->");
assert_eq!(right_chunk, " after comment");

assert_eq!(
split_first_html_comment(&CowStr::Borrowed(&html)),
Some((CowStr::Borrowed(left_chunk), CowStr::Borrowed(right_chunk)))
);
}

#[test]
fn split_first_html_comment_ignore_nonleading_comment() {
assert_eq!(
split_first_html_comment(&"pause <!-- blah --> after comment".into()),
None
);
}

#[test]
fn extract_messages_empty() {
assert_extract_messages("", vec![]);
Expand Down Expand Up @@ -1170,23 +1262,14 @@ But *this* should!",
}

#[test]
fn extract_messages_skipping_inline_second_item_buggy() {
// This isn't great: we lose text following a HTML comment.
// Very similar to the failure mode of the
// `extract_messages_details` test.
//
// The root cause appears to be a bug in the Markdown parser
// because it's not separating HTML element from text that
// immediately follows it.
//
// Related: https://github.com/raphlinus/pulldown-cmark/issues/712
fn extract_messages_skipping_inline_second_item() {
assert_extract_messages(
"
* A
* <!--- mdbook-xgettext:skip --> B
* C
",
vec![(2, "A")],
vec![(2, "A"), (4, "C")],
);
}

Expand Down

0 comments on commit c9c53cb

Please sign in to comment.