Skip to content

Commit

Permalink
fix(parser/html): misc minor parsing fixes (#4852)
Browse files Browse the repository at this point in the history
  • Loading branch information
dyc3 authored Jan 8, 2025
1 parent f064fd4 commit ec02891
Show file tree
Hide file tree
Showing 10 changed files with 314 additions and 11 deletions.
6 changes: 4 additions & 2 deletions crates/biome_html_formatter/src/html/auxiliary/element.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,10 @@ impl FormatNodeRule<HtmlElement> for FormatHtmlElement {
f,
)?;
if should_be_verbatim {
format_verbatim_skipped(children.syntax()).fmt(f)?;
write!(f, [hard_line_break()])?;
write!(
f,
[&block_indent(&format_verbatim_skipped(children.syntax()))]
)?;
} else {
let format_children = FormatHtmlElementList::default()
.with_options(FormatHtmlElementListOptions {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ Indent script and style: false
-----

```html
<pre>___ ___ ___ ___
<pre>
___ ___ ___ ___
/\ \ ___ /\ \ /\__\ /\ \
/::\ \ /\ \ /::\ \ /::| | /::\ \
/:/\:\ \ \:\ \ /:/\:\ \ /:|:| | /:/\:\ \
Expand Down
41 changes: 36 additions & 5 deletions crates/biome_html_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,12 @@ impl<'src> HtmlLexer<'src> {
b'=' => self.consume_byte(T![=]),
b'!' => self.consume_byte(T![!]),
b'\'' | b'"' => self.consume_string_literal(current),
// TODO: differentiate between attribute names and identifiers
_ if is_identifier_byte(current) || is_attribute_name_byte(current) => {
_ if self.current_kind == T![<] && is_tag_name_byte(current) => {
// tag names must immediately follow a `<`
// https://html.spec.whatwg.org/multipage/syntax.html#start-tags
self.consume_tag_name(current)
}
_ if self.current_kind != T![<] && is_attribute_name_byte(current) => {
self.consume_identifier(current, false)
}
_ => {
Expand Down Expand Up @@ -104,7 +108,7 @@ impl<'src> HtmlLexer<'src> {
b'>' => self.consume_byte(T![>]),
b'!' => self.consume_byte(T![!]),
b'\'' | b'"' => self.consume_string_literal(current),
_ if is_identifier_byte(current) || is_attribute_name_byte(current) => {
_ if is_tag_name_byte(current) || is_attribute_name_byte(current) => {
self.consume_identifier(current, true)
}
_ => self.consume_unexpected_character(),
Expand Down Expand Up @@ -193,7 +197,7 @@ impl<'src> HtmlLexer<'src> {
self.advance_byte_or_char(first);

while let Some(byte) = self.current_byte() {
if is_identifier_byte(byte) || is_attribute_name_byte(byte) {
if is_attribute_name_byte(byte) {
if len < BUFFER_SIZE {
buffer[len] = byte;
len += 1;
Expand All @@ -212,6 +216,32 @@ impl<'src> HtmlLexer<'src> {
}
}

/// Consumes a run of tag-name bytes and returns `HTML_LITERAL`.
///
/// `first` is the byte at the lexer's current position; it is stepped over
/// with `advance_byte_or_char` before the remaining bytes are scanned.
/// Scanning stops at the first byte for which `is_tag_name_byte` is false,
/// or at the end of input.
///
/// The token kind is always `HTML_LITERAL`, so the consumed bytes are not
/// copied into a scratch buffer.
fn consume_tag_name(&mut self, first: u8) -> HtmlSyntaxKind {
    self.assert_current_char_boundary();

    // Step past the first character of the name.
    self.advance_byte_or_char(first);

    // Keep advancing while the current byte is still part of the tag name.
    while let Some(byte) = self.current_byte() {
        if !is_tag_name_byte(byte) {
            break;
        }
        self.advance(1);
    }

    HTML_LITERAL
}

fn consume_string_literal(&mut self, quote: u8) -> HtmlSyntaxKind {
self.assert_current_char_boundary();
let start = self.text_position();
Expand Down Expand Up @@ -554,8 +584,9 @@ impl<'src> Lexer<'src> for HtmlLexer<'src> {
}
}

fn is_identifier_byte(byte: u8) -> bool {
/// Returns `true` when `byte` is an ASCII letter (`a-z`, `A-Z`) or an ASCII
/// digit (`0-9`), and `false` for every other byte.
///
/// References:
/// - <https://html.spec.whatwg.org/#elements-2>
/// - <https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name>
fn is_tag_name_byte(byte: u8) -> bool {
    matches!(byte, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9')
}

Expand Down
8 changes: 5 additions & 3 deletions crates/biome_html_parser/src/syntax/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ fn parse_element(p: &mut HtmlParser) -> ParsedSyntax {
loop {
ElementList.parse_list(p);
if let Some(mut closing) =
parse_closing_element(p).or_add_diagnostic(p, expected_closing_tag)
parse_closing_tag(p).or_add_diagnostic(p, expected_closing_tag)
{
if !closing.text(p).contains(opening_tag_name.as_str()) {
p.error(expected_matching_closing_tag(p, closing.range(p)).into_diagnostic(p));
Expand All @@ -130,7 +130,7 @@ fn parse_element(p: &mut HtmlParser) -> ParsedSyntax {
}
}

fn parse_closing_element(p: &mut HtmlParser) -> ParsedSyntax {
fn parse_closing_tag(p: &mut HtmlParser) -> ParsedSyntax {
if !p.at(T![<]) || !p.nth_at(1, T![/]) {
return Absent;
}
Expand Down Expand Up @@ -271,7 +271,9 @@ fn parse_comment(p: &mut HtmlParser) -> ParsedSyntax {
}
let m = p.start();
p.bump_with_context(T![<!--], HtmlLexContext::Comment);
p.bump_with_context(HTML_LITERAL, HtmlLexContext::Comment);
while !p.at(T![-->]) && !p.at(EOF) {
p.bump_with_context(HTML_LITERAL, HtmlLexContext::Comment);
}
p.expect(T![-->]);
Present(m.complete(p, HTML_COMMENT))
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<div><</div>
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
---
source: crates/biome_html_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```html
<div><</div>
```


## AST

```
HtmlRoot {
bom_token: missing (optional),
directive: missing (optional),
html: HtmlElementList [
HtmlElement {
opening_element: HtmlOpeningElement {
l_angle_token: L_ANGLE@0..1 "<" [] [],
name: HtmlName {
value_token: HTML_LITERAL@1..4 "div" [] [],
},
attributes: HtmlAttributeList [],
r_angle_token: R_ANGLE@4..5 ">" [] [],
},
children: HtmlElementList [
HtmlElement {
opening_element: HtmlOpeningElement {
l_angle_token: L_ANGLE@5..6 "<" [] [],
name: missing (required),
attributes: HtmlAttributeList [],
r_angle_token: missing (required),
},
children: HtmlElementList [],
closing_element: HtmlClosingElement {
l_angle_token: L_ANGLE@6..7 "<" [] [],
slash_token: SLASH@7..8 "/" [] [],
name: HtmlName {
value_token: HTML_LITERAL@8..11 "div" [] [],
},
r_angle_token: R_ANGLE@11..12 ">" [] [],
},
},
],
closing_element: missing (required),
},
],
eof_token: EOF@12..13 "" [Newline("\n")] [],
}
```

## CST

```
0: HTML_ROOT@0..13
0: (empty)
1: (empty)
2: HTML_ELEMENT_LIST@0..12
0: HTML_ELEMENT@0..12
0: HTML_OPENING_ELEMENT@0..5
0: L_ANGLE@0..1 "<" [] []
1: HTML_NAME@1..4
0: HTML_LITERAL@1..4 "div" [] []
2: HTML_ATTRIBUTE_LIST@4..4
3: R_ANGLE@4..5 ">" [] []
1: HTML_ELEMENT_LIST@5..12
0: HTML_ELEMENT@5..12
0: HTML_OPENING_ELEMENT@5..6
0: L_ANGLE@5..6 "<" [] []
1: (empty)
2: HTML_ATTRIBUTE_LIST@6..6
3: (empty)
1: HTML_ELEMENT_LIST@6..6
2: HTML_CLOSING_ELEMENT@6..12
0: L_ANGLE@6..7 "<" [] []
1: SLASH@7..8 "/" [] []
2: HTML_NAME@8..11
0: HTML_LITERAL@8..11 "div" [] []
3: R_ANGLE@11..12 ">" [] []
2: (empty)
3: EOF@12..13 "" [Newline("\n")] []
```

## Diagnostics

```
child-no-tag-name.html:1:7 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
× Expected an element name but instead found '<'.
> 1 │ <div><</div>
│ ^
2 │
i Expected an element name here.
> 1 │ <div><</div>
│ ^
2 │
child-no-tag-name.html:2:1 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
× Expected a closing tag but instead found the end of the file.
1 │ <div><</div>
> 2 │
i Expected a closing tag here.
1 │ <div><</div>
> 2 │
```
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<div>
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
---
source: crates/biome_html_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```html
<div>
```
## AST
```
HtmlRoot {
bom_token: missing (optional),
directive: missing (optional),
html: HtmlElementList [
HtmlElement {
opening_element: HtmlOpeningElement {
l_angle_token: L_ANGLE@0..1 "<" [] [],
name: HtmlName {
value_token: HTML_LITERAL@1..4 "div" [] [],
},
attributes: HtmlAttributeList [],
r_angle_token: R_ANGLE@4..5 ">" [] [],
},
children: HtmlElementList [],
closing_element: missing (required),
},
],
eof_token: EOF@5..6 "" [Newline("\n")] [],
}
```
## CST
```
0: HTML_ROOT@0..6
0: (empty)
1: (empty)
2: HTML_ELEMENT_LIST@0..5
0: HTML_ELEMENT@0..5
0: HTML_OPENING_ELEMENT@0..5
0: L_ANGLE@0..1 "<" [] []
1: HTML_NAME@1..4
0: HTML_LITERAL@1..4 "div" [] []
2: HTML_ATTRIBUTE_LIST@4..4
3: R_ANGLE@4..5 ">" [] []
1: HTML_ELEMENT_LIST@5..5
2: (empty)
3: EOF@5..6 "" [Newline("\n")] []
```
## Diagnostics
```
missing-close-tag-2.html:2:1 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
× Expected a closing tag but instead found the end of the file.
1 │ <div>
> 2 │
i Expected a closing tag here.
1 │ <div>
> 2 │
```
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<
Loading

0 comments on commit ec02891

Please sign in to comment.