fix(parser/html): misc minor parsing fixes (#4852)

biomejs · Jan 8, 2025 · ec02891 · ec02891
1 parent f064fd4
commit ec02891
Show file tree

Hide file tree

Showing 10 changed files with 314 additions and 11 deletions.
diff --git a/crates/biome_html_formatter/src/html/auxiliary/element.rs b/crates/biome_html_formatter/src/html/auxiliary/element.rs
@@ -113,8 +113,10 @@ impl FormatNodeRule<HtmlElement> for FormatHtmlElement {
             f,
         )?;
         if should_be_verbatim {
-            format_verbatim_skipped(children.syntax()).fmt(f)?;
-            write!(f, [hard_line_break()])?;
+            write!(
+                f,
+                [&block_indent(&format_verbatim_skipped(children.syntax()))]
+            )?;
         } else {
             let format_children = FormatHtmlElementList::default()
                 .with_options(FormatHtmlElementListOptions {

diff --git a/crates/biome_html_formatter/tests/specs/html/elements/pre.html.snap b/crates/biome_html_formatter/tests/specs/html/elements/pre.html.snap
@@ -40,7 +40,8 @@ Indent script and style: false
 -----
 
 ```html
-<pre>___                       ___           ___           ___
+<pre>
+	___                       ___           ___           ___
 	/\  \          ___        /\  \         /\__\         /\  \
  /::\  \        /\  \      /::\  \       /::|  |       /::\  \
 /:/\:\  \       \:\  \    /:/\:\  \     /:|:|  |      /:/\:\  \

diff --git a/crates/biome_html_parser/src/lexer/mod.rs b/crates/biome_html_parser/src/lexer/mod.rs
@@ -60,8 +60,12 @@ impl<'src> HtmlLexer<'src> {
             b'=' => self.consume_byte(T![=]),
             b'!' => self.consume_byte(T![!]),
             b'\'' | b'"' => self.consume_string_literal(current),
-            // TODO: differentiate between attribute names and identifiers
-            _ if is_identifier_byte(current) || is_attribute_name_byte(current) => {
+            _ if self.current_kind == T![<] && is_tag_name_byte(current) => {
+                // tag names must immediately follow a `<`
+                // https://html.spec.whatwg.org/multipage/syntax.html#start-tags
+                self.consume_tag_name(current)
+            }
+            _ if self.current_kind != T![<] && is_attribute_name_byte(current) => {
                 self.consume_identifier(current, false)
             }
             _ => {
@@ -104,7 +108,7 @@ impl<'src> HtmlLexer<'src> {
             b'>' => self.consume_byte(T![>]),
             b'!' => self.consume_byte(T![!]),
             b'\'' | b'"' => self.consume_string_literal(current),
-            _ if is_identifier_byte(current) || is_attribute_name_byte(current) => {
+            _ if is_tag_name_byte(current) || is_attribute_name_byte(current) => {
                 self.consume_identifier(current, true)
             }
             _ => self.consume_unexpected_character(),
@@ -193,7 +197,7 @@ impl<'src> HtmlLexer<'src> {
         self.advance_byte_or_char(first);
 
         while let Some(byte) = self.current_byte() {
-            if is_identifier_byte(byte) || is_attribute_name_byte(byte) {
+            if is_attribute_name_byte(byte) {
                 if len < BUFFER_SIZE {
                     buffer[len] = byte;
                     len += 1;
@@ -212,6 +216,32 @@ impl<'src> HtmlLexer<'src> {
         }
     }
 
+    fn consume_tag_name(&mut self, first: u8) -> HtmlSyntaxKind {
+        self.assert_current_char_boundary();
+
+        const BUFFER_SIZE: usize = 14;
+        let mut buffer = [0u8; BUFFER_SIZE];
+        buffer[0] = first;
+        let mut len = 1;
+
+        self.advance_byte_or_char(first);
+
+        while let Some(byte) = self.current_byte() {
+            if is_tag_name_byte(byte) {
+                if len < BUFFER_SIZE {
+                    buffer[len] = byte;
+                    len += 1;
+                }
+
+                self.advance(1)
+            } else {
+                break;
+            }
+        }
+
+        HTML_LITERAL
+    }
+
     fn consume_string_literal(&mut self, quote: u8) -> HtmlSyntaxKind {
         self.assert_current_char_boundary();
         let start = self.text_position();
@@ -554,8 +584,9 @@ impl<'src> Lexer<'src> for HtmlLexer<'src> {
     }
 }
 
-fn is_identifier_byte(byte: u8) -> bool {
+fn is_tag_name_byte(byte: u8) -> bool {
     // https://html.spec.whatwg.org/#elements-2
+    // https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name
     byte.is_ascii_alphanumeric()
 }
 

diff --git a/crates/biome_html_parser/src/syntax/mod.rs b/crates/biome_html_parser/src/syntax/mod.rs
@@ -114,7 +114,7 @@ fn parse_element(p: &mut HtmlParser) -> ParsedSyntax {
         loop {
             ElementList.parse_list(p);
             if let Some(mut closing) =
-                parse_closing_element(p).or_add_diagnostic(p, expected_closing_tag)
+                parse_closing_tag(p).or_add_diagnostic(p, expected_closing_tag)
             {
                 if !closing.text(p).contains(opening_tag_name.as_str()) {
                     p.error(expected_matching_closing_tag(p, closing.range(p)).into_diagnostic(p));
@@ -130,7 +130,7 @@ fn parse_element(p: &mut HtmlParser) -> ParsedSyntax {
     }
 }
 
-fn parse_closing_element(p: &mut HtmlParser) -> ParsedSyntax {
+fn parse_closing_tag(p: &mut HtmlParser) -> ParsedSyntax {
     if !p.at(T![<]) || !p.nth_at(1, T![/]) {
         return Absent;
     }
@@ -271,7 +271,9 @@ fn parse_comment(p: &mut HtmlParser) -> ParsedSyntax {
     }
     let m = p.start();
     p.bump_with_context(T![<!--], HtmlLexContext::Comment);
-    p.bump_with_context(HTML_LITERAL, HtmlLexContext::Comment);
+    while !p.at(T![-->]) && !p.at(EOF) {
+        p.bump_with_context(HTML_LITERAL, HtmlLexContext::Comment);
+    }
     p.expect(T![-->]);
     Present(m.complete(p, HTML_COMMENT))
 }
diff --git a/crates/biome_html_parser/tests/html_specs/error/element/child-no-tag-name.html b/crates/biome_html_parser/tests/html_specs/error/element/child-no-tag-name.html
@@ -0,0 +1 @@
+<div><</div>
diff --git a/crates/biome_html_parser/tests/html_specs/error/element/child-no-tag-name.html.snap b/crates/biome_html_parser/tests/html_specs/error/element/child-no-tag-name.html.snap
@@ -0,0 +1,119 @@
+---
+source: crates/biome_html_parser/tests/spec_test.rs
+expression: snapshot
+---
+## Input
+
+```html
+<div><</div>
+
+```
+
+
+## AST
+
+```
+HtmlRoot {
+    bom_token: missing (optional),
+    directive: missing (optional),
+    html: HtmlElementList [
+        HtmlElement {
+            opening_element: HtmlOpeningElement {
+                l_angle_token: L_ANGLE@0..1 "<" [] [],
+                name: HtmlName {
+                    value_token: HTML_LITERAL@1..4 "div" [] [],
+                },
+                attributes: HtmlAttributeList [],
+                r_angle_token: R_ANGLE@4..5 ">" [] [],
+            },
+            children: HtmlElementList [
+                HtmlElement {
+                    opening_element: HtmlOpeningElement {
+                        l_angle_token: L_ANGLE@5..6 "<" [] [],
+                        name: missing (required),
+                        attributes: HtmlAttributeList [],
+                        r_angle_token: missing (required),
+                    },
+                    children: HtmlElementList [],
+                    closing_element: HtmlClosingElement {
+                        l_angle_token: L_ANGLE@6..7 "<" [] [],
+                        slash_token: SLASH@7..8 "/" [] [],
+                        name: HtmlName {
+                            value_token: HTML_LITERAL@8..11 "div" [] [],
+                        },
+                        r_angle_token: R_ANGLE@11..12 ">" [] [],
+                    },
+                },
+            ],
+            closing_element: missing (required),
+        },
+    ],
+    eof_token: EOF@12..13 "" [Newline("\n")] [],
+}
+```
+
+## CST
+
+```
+0: HTML_ROOT@0..13
+  0: (empty)
+  1: (empty)
+  2: HTML_ELEMENT_LIST@0..12
+    0: HTML_ELEMENT@0..12
+      0: HTML_OPENING_ELEMENT@0..5
+        0: L_ANGLE@0..1 "<" [] []
+        1: HTML_NAME@1..4
+          0: HTML_LITERAL@1..4 "div" [] []
+        2: HTML_ATTRIBUTE_LIST@4..4
+        3: R_ANGLE@4..5 ">" [] []
+      1: HTML_ELEMENT_LIST@5..12
+        0: HTML_ELEMENT@5..12
+          0: HTML_OPENING_ELEMENT@5..6
+            0: L_ANGLE@5..6 "<" [] []
+            1: (empty)
+            2: HTML_ATTRIBUTE_LIST@6..6
+            3: (empty)
+          1: HTML_ELEMENT_LIST@6..6
+          2: HTML_CLOSING_ELEMENT@6..12
+            0: L_ANGLE@6..7 "<" [] []
+            1: SLASH@7..8 "/" [] []
+            2: HTML_NAME@8..11
+              0: HTML_LITERAL@8..11 "div" [] []
+            3: R_ANGLE@11..12 ">" [] []
+      2: (empty)
+  3: EOF@12..13 "" [Newline("\n")] []
+
+```
+
+## Diagnostics
+
+```
+child-no-tag-name.html:1:7 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+  × Expected an element name but instead found '<'.
+  
+  > 1 │ <div><</div>
+      │       ^
+    2 │ 
+  
+  i Expected an element name here.
+  
+  > 1 │ <div><</div>
+      │       ^
+    2 │ 
+  
+child-no-tag-name.html:2:1 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+  × Expected a closing tag but instead found the end of the file.
+  
+    1 │ <div><</div>
+  > 2 │ 
+      │ 
+  
+  i Expected a closing tag here.
+  
+    1 │ <div><</div>
+  > 2 │ 
+      │ 
+  
+```
diff --git a/crates/biome_html_parser/tests/html_specs/error/element/missing-close-tag-2.html b/crates/biome_html_parser/tests/html_specs/error/element/missing-close-tag-2.html
@@ -0,0 +1 @@
+<div>
diff --git a/crates/biome_html_parser/tests/html_specs/error/element/missing-close-tag-2.html.snap b/crates/biome_html_parser/tests/html_specs/error/element/missing-close-tag-2.html.snap
@@ -0,0 +1,74 @@
+---
+source: crates/biome_html_parser/tests/spec_test.rs
+expression: snapshot
+---
+## Input
+
+```html
+<div>
+
+```
+
+
+## AST
+
+```
+HtmlRoot {
+    bom_token: missing (optional),
+    directive: missing (optional),
+    html: HtmlElementList [
+        HtmlElement {
+            opening_element: HtmlOpeningElement {
+                l_angle_token: L_ANGLE@0..1 "<" [] [],
+                name: HtmlName {
+                    value_token: HTML_LITERAL@1..4 "div" [] [],
+                },
+                attributes: HtmlAttributeList [],
+                r_angle_token: R_ANGLE@4..5 ">" [] [],
+            },
+            children: HtmlElementList [],
+            closing_element: missing (required),
+        },
+    ],
+    eof_token: EOF@5..6 "" [Newline("\n")] [],
+}
+```
+
+## CST
+
+```
+0: HTML_ROOT@0..6
+  0: (empty)
+  1: (empty)
+  2: HTML_ELEMENT_LIST@0..5
+    0: HTML_ELEMENT@0..5
+      0: HTML_OPENING_ELEMENT@0..5
+        0: L_ANGLE@0..1 "<" [] []
+        1: HTML_NAME@1..4
+          0: HTML_LITERAL@1..4 "div" [] []
+        2: HTML_ATTRIBUTE_LIST@4..4
+        3: R_ANGLE@4..5 ">" [] []
+      1: HTML_ELEMENT_LIST@5..5
+      2: (empty)
+  3: EOF@5..6 "" [Newline("\n")] []
+
+```
+
+## Diagnostics
+
+```
+missing-close-tag-2.html:2:1 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+  × Expected a closing tag but instead found the end of the file.
+  
+    1 │ <div>
+  > 2 │ 
+      │ 
+  
+  i Expected a closing tag here.
+  
+    1 │ <div>
+  > 2 │ 
+      │ 
+  
+```
diff --git a/crates/biome_html_parser/tests/html_specs/error/element/solo-no-tag-name.html b/crates/biome_html_parser/tests/html_specs/error/element/solo-no-tag-name.html
@@ -0,0 +1 @@
+<