From 0fe81d899054ac4031a32f461bcfee2b9942a5fd Mon Sep 17 00:00:00 2001 From: WATANABE Yuki Date: Mon, 2 Dec 2024 01:26:39 +0900 Subject: [PATCH] Report syntax errors in escaped strings --- yash-syntax/CHANGELOG.md | 13 ++ yash-syntax/src/parser/error.rs | 66 ++++++-- yash-syntax/src/parser/lex/escape.rs | 238 +++++++++++++++++++++++---- 3 files changed, 276 insertions(+), 41 deletions(-) diff --git a/yash-syntax/CHANGELOG.md b/yash-syntax/CHANGELOG.md index 0a7bc869..7707e092 100644 --- a/yash-syntax/CHANGELOG.md +++ b/yash-syntax/CHANGELOG.md @@ -19,6 +19,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - The `DollarSingleQuote` variant is added to the `syntax::WordUnit` enum. - The `EscapeUnit` enum and `EscapedString` struct are added to the `syntax` module. + - The `escape_unit` and `escaped_string` methods are added to the + `parser::lex::Lexer` struct. + - The following error variants are added to `parser::SyntaxError`: + - `IncompleteControlBackslashEscape` + - `IncompleteControlEscape` + - `IncompleteEscape` + - `IncompleteHexEscape` + - `IncompleteLongUnicodeEscape` + - `IncompleteShortUnicodeEscape` + - `InvalidControlEscape` + - `InvalidEscape` + - `UnclosedDollarSingleQuote` + - `UnicodeEscapeOutOfRange` - In the `syntax::MaybeLiteral` trait, the `extend_if_literal` method is replaced with the `extend_literal` method, which now takes a mutable reference to an `Extend` object, instead of an ownership of it. The method may diff --git a/yash-syntax/src/parser/error.rs b/yash-syntax/src/parser/error.rs index 70115412..c5c731ef 100644 --- a/yash-syntax/src/parser/error.rs +++ b/yash-syntax/src/parser/error.rs @@ -30,22 +30,28 @@ use thiserror::Error; #[error("{}", self.message())] #[non_exhaustive] pub enum SyntaxError { + /// A backslash is at the end of the input. + IncompleteEscape, + /// A backslash is not followed by a character that makes a valid escape. + InvalidEscape, /// A `(` lacks a closing `)`. 
UnclosedParen { opening_location: Location }, - /// A modifier does not have a valid form in a parameter expansion. - InvalidModifier, - /// A braced parameter expansion has both a prefix and suffix modifier. - MultipleModifier, /// A single quotation lacks a closing `'`. UnclosedSingleQuote { opening_location: Location }, /// A double quotation lacks a closing `"`. UnclosedDoubleQuote { opening_location: Location }, + /// A `$'` lacks a closing `'`. + UnclosedDollarSingleQuote { opening_location: Location }, /// A parameter expansion lacks a closing `}`. UnclosedParam { opening_location: Location }, /// A parameter expansion lacks a name. EmptyParam, /// A parameter expansion has an invalid name. InvalidParam, + /// A modifier does not have a valid form in a parameter expansion. + InvalidModifier, + /// A braced parameter expansion has both a prefix and suffix modifier. + MultipleModifier, /// A command substitution started with `$(` but lacks a closing `)`. UnclosedCommandSubstitution { opening_location: Location }, /// A command substitution started with `` ` `` but lacks a closing `` ` ``. @@ -161,6 +167,22 @@ pub enum SyntaxError { MissingCommandAfterBar, /// There is a redundant token. RedundantToken, + /// A control escape (`\c...`) is incomplete in a dollar-single-quoted string. + IncompleteControlEscape, + /// A control-backslash escape (`\c\\`) is incomplete in a dollar-single-quoted string. + IncompleteControlBackslashEscape, + /// A control escape (`\c...`) does not have a valid control character. + InvalidControlEscape, + /// An octal escape is out of range (greater than `\377`) in a dollar-single-quoted string. + OctalEscapeOutOfRange, + /// A hexadecimal escape (`\x...`) is incomplete in a dollar-single-quoted string. + IncompleteHexEscape, + /// A Unicode escape (`\u...`) is incomplete in a dollar-single-quoted string. + IncompleteShortUnicodeEscape, + /// A Unicode escape (`\U...`) is incomplete in a dollar-single-quoted string.
+ IncompleteLongUnicodeEscape, + /// A Unicode escape (`\u...` or `\U...`) is out of range in a dollar-single-quoted string. + UnicodeEscapeOutOfRange, } impl SyntaxError { @@ -169,14 +191,17 @@ impl SyntaxError { pub fn message(&self) -> &'static str { use SyntaxError::*; match self { + IncompleteEscape => "The backslash is escaping nothing", + InvalidEscape => "The backslash escape is invalid", UnclosedParen { .. } => "The parenthesis is not closed", - InvalidModifier => "The parameter expansion contains a malformed modifier", - MultipleModifier => "A suffix modifier cannot be used together with a prefix modifier", UnclosedSingleQuote { .. } => "The single quote is not closed", UnclosedDoubleQuote { .. } => "The double quote is not closed", + UnclosedDollarSingleQuote { .. } => "The dollar single quote is not closed", UnclosedParam { .. } => "The parameter expansion is not closed", EmptyParam => "The parameter name is missing", InvalidParam => "The parameter name is invalid", + InvalidModifier => "The parameter expansion contains a malformed modifier", + MultipleModifier => "A suffix modifier cannot be used together with a prefix modifier", UnclosedCommandSubstitution { .. } => "The command substitution is not closed", UnclosedBackquote { .. } => "The backquote is not closed", UnclosedArith { .. 
} => "The arithmetic expansion is not closed", @@ -233,6 +258,15 @@ impl SyntaxError { MissingCommandAfterBang => "A command is missing after `!`", MissingCommandAfterBar => "A command is missing after `|`", RedundantToken => "There is a redundant token", + IncompleteControlEscape => "The control escape is incomplete", + IncompleteControlBackslashEscape => "The control-backslash escape is incomplete", + InvalidControlEscape => "The control escape is invalid", + OctalEscapeOutOfRange => "The octal escape is out of range", + IncompleteHexEscape => "The hexadecimal escape is incomplete", + IncompleteShortUnicodeEscape | IncompleteLongUnicodeEscape => { + "The Unicode escape is incomplete" + } + UnicodeEscapeOutOfRange => "The Unicode escape is out of range", } } @@ -241,6 +275,8 @@ impl SyntaxError { pub fn label(&self) -> &'static str { use SyntaxError::*; match self { + IncompleteEscape => "expected an escaped character after the backslash", + InvalidEscape => "invalid escape sequence", UnclosedParen { .. } | UnclosedCommandSubstitution { .. } | UnclosedArrayValue { .. } @@ -262,13 +298,13 @@ impl SyntaxError { | MissingCommandAfterBar => "expected a command", InvalidForValue | MissingCaseSubject | InvalidCaseSubject | MissingPattern | InvalidPattern => "expected a word", - InvalidModifier => "broken modifier", - MultipleModifier => "conflicting modifier", - UnclosedSingleQuote { .. } => "expected `'`", + UnclosedSingleQuote { .. } | UnclosedDollarSingleQuote { .. } => "expected `'`", UnclosedDoubleQuote { .. } => "expected `\"`", UnclosedParam { .. } | UnclosedGrouping { .. } => "expected `}`", EmptyParam => "expected a parameter name", InvalidParam => "not a valid named or positional parameter", + InvalidModifier => "broken modifier", + MultipleModifier => "conflicting modifier", UnclosedBackquote { .. } => "expected '`'", UnclosedArith { .. 
} => "expected `))`", InvalidCommandToken => "does not begin a valid command", @@ -301,6 +337,14 @@ impl SyntaxError { DoubleNegation => "only one `!` allowed", BangAfterBar => "`!` not allowed here", RedundantToken => "unexpected token", + IncompleteControlEscape => r"expected a control character after `\c`", + IncompleteControlBackslashEscape => r"expected another backslash after `\c\`", + InvalidControlEscape => "not a valid control character", + OctalEscapeOutOfRange => r"expected a value between \0 and \377", + IncompleteHexEscape => r"expected a hexadecimal digit after `\x`", + IncompleteShortUnicodeEscape => r"expected a hexadecimal digit after `\u`", + IncompleteLongUnicodeEscape => r"expected a hexadecimal digit after `\U`", + UnicodeEscapeOutOfRange => "not a valid Unicode scalar value", } } @@ -315,7 +359,9 @@ impl SyntaxError { | UnclosedArrayValue { opening_location } => { Some((opening_location, "the opening parenthesis was here")) } - UnclosedSingleQuote { opening_location } | UnclosedDoubleQuote { opening_location } => { + UnclosedSingleQuote { opening_location } + | UnclosedDoubleQuote { opening_location } + | UnclosedDollarSingleQuote { opening_location } => { Some((opening_location, "the opening quote was here")) } UnclosedParam { opening_location } => { diff --git a/yash-syntax/src/parser/lex/escape.rs b/yash-syntax/src/parser/lex/escape.rs index f9eddbe3..188ee1c6 100644 --- a/yash-syntax/src/parser/lex/escape.rs +++ b/yash-syntax/src/parser/lex/escape.rs @@ -18,6 +18,7 @@ use super::core::Lexer; use crate::parser::core::Result; +use crate::parser::{Error, SyntaxError}; use crate::syntax::EscapeUnit::{self, *}; use crate::syntax::EscapedString; @@ -64,13 +65,16 @@ impl Lexer<'_> { let Some(c1) = self.peek_char().await? else { return Ok(None); }; + let start_index = self.index(); self.consume_char(); if c1 != '\\' { return Ok(Some(Literal(c1))); } let Some(c2) = self.peek_char().await? 
else { - todo!("return error: missing escape character"); + let cause = SyntaxError::IncompleteEscape.into(); + let location = self.location().await?.clone(); + return Err(Error { cause, location }); }; self.consume_char(); match c2 { @@ -88,14 +92,19 @@ impl Lexer<'_> { 'v' => Ok(Some(VerticalTab)), 'c' => { + let start_index = self.index(); let Some(c3) = self.peek_char().await? else { - todo!("return error: missing control character"); + let cause = SyntaxError::IncompleteControlEscape.into(); + let location = self.location().await?.clone(); + return Err(Error { cause, location }); }; self.consume_char(); match c3.to_ascii_uppercase() { '\\' => { let Some('\\') = self.peek_char().await? else { - todo!("return error: missing control character"); + let cause = SyntaxError::IncompleteControlBackslashEscape.into(); + let location = self.location().await?.clone(); + return Err(Error { cause, location }); }; self.consume_char(); Ok(Some(Control(0x1C))) @@ -103,13 +112,19 @@ impl Lexer<'_> { c3 @ ('\u{3F}'..'\u{60}') => Ok(Some(Control(c3 as u8 ^ 0x40))), - _ => todo!("return error: unknown control character {c3:?}"), + _ => { + let cause = SyntaxError::InvalidControlEscape.into(); + let location = self.location_range(start_index..self.index()); + Err(Error { cause, location }) + } } } 'x' => { let Some(value) = self.hex_digits(2).await? else { - todo!("return error: missing hexadecimal digit"); + let cause = SyntaxError::IncompleteHexEscape.into(); + let location = self.location().await?.clone(); + return Err(Error { cause, location }); }; // TODO Reject a third hexadecimal digit in POSIX mode Ok(Some(Hex(value as u8))) @@ -117,22 +132,40 @@ impl Lexer<'_> { 'u' => { let Some(value) = self.hex_digits(4).await? 
else { - todo!("return error: missing hexadecimal digit"); + let cause = SyntaxError::IncompleteShortUnicodeEscape.into(); + let location = self.location().await?.clone(); + return Err(Error { cause, location }); }; - Ok(Some(Unicode(char::from_u32(value).expect("todo")))) + if let Some(c) = char::from_u32(value) { + Ok(Some(Unicode(c))) + } else { + let cause = SyntaxError::UnicodeEscapeOutOfRange.into(); + let location = self.location_range(start_index..self.index()); + Err(Error { cause, location }) + } } 'U' => { let Some(value) = self.hex_digits(8).await? else { - todo!("return error: missing hexadecimal digit"); + let cause = SyntaxError::IncompleteLongUnicodeEscape.into(); + let location = self.location().await?.clone(); + return Err(Error { cause, location }); }; - Ok(Some(Unicode(char::from_u32(value).expect("todo")))) + if let Some(c) = char::from_u32(value) { + Ok(Some(Unicode(c))) + } else { + let cause = SyntaxError::UnicodeEscapeOutOfRange.into(); + let location = self.location_range(start_index..self.index()); + Err(Error { cause, location }) + } } _ => { // Consume at most 3 octal digits (including c2) let Some(mut value) = c2.to_digit(8) else { - todo!("return error: unknown escape character {c2:?}"); + let cause = SyntaxError::InvalidEscape.into(); + let location = self.location_range(start_index..self.index()); + return Err(Error { cause, location }); }; for _ in 0..2 { let Some(digit) = self.peek_char().await? 
else { @@ -144,7 +177,13 @@ impl Lexer<'_> { value = value * 8 + digit; self.consume_char(); } - Ok(Some(Octal(value as u8))) + if let Ok(value) = value.try_into() { + Ok(Some(Octal(value))) + } else { + let cause = SyntaxError::OctalEscapeOutOfRange.into(); + let location = self.location_range(start_index..self.index()); + Err(Error { cause, location }) + } } } } @@ -203,9 +242,10 @@ impl Lexer<'_> { let is_single_quote = |c| c == '\''; // Consume the opening single quote - if self.consume_char_if(is_single_quote).await?.is_none() { + let Some(quote) = self.consume_char_if(is_single_quote).await? else { return Ok(None); - } + }; + let opening_location = quote.location.clone(); let content = self.escaped_string(is_single_quote).await?; @@ -215,7 +255,9 @@ impl Lexer<'_> { self.consume_char(); Ok(Some(content)) } else { - todo!("return error: missing closing quote"); + let cause = SyntaxError::UnclosedDollarSingleQuote { opening_location }.into(); + let location = self.location().await?.clone(); + Err(Error { cause, location }) } } } @@ -223,6 +265,7 @@ impl Lexer<'_> { #[cfg(test)] mod tests { use super::*; + use crate::parser::ErrorCause; use crate::source::Source; use assert_matches::assert_matches; use futures_util::FutureExt; @@ -270,9 +313,17 @@ mod tests { } #[test] - #[ignore = "not implemented"] fn escape_unit_incomplete_escapes() { - todo!() + let mut lexer = Lexer::from_memory(r"\", Source::Unknown); + let error = lexer.escape_unit().now_or_never().unwrap().unwrap_err(); + assert_matches!( + error.cause, + ErrorCause::Syntax(SyntaxError::IncompleteEscape) + ); + assert_eq!(*error.location.code.value.borrow(), r"\"); + assert_eq!(error.location.code.start_line_number.get(), 1); + assert_eq!(*error.location.code.source, Source::Unknown); + assert_eq!(error.location.range, 1..1); } #[test] @@ -292,20 +343,67 @@ mod tests { } #[test] - #[ignore = "not implemented"] - fn escape_unit_incomplete_control_escapes() { - todo!() + fn 
escape_unit_incomplete_control_escape() { + let mut lexer = Lexer::from_memory(r"\c", Source::Unknown); + let error = lexer.escape_unit().now_or_never().unwrap().unwrap_err(); + assert_matches!( + error.cause, + ErrorCause::Syntax(SyntaxError::IncompleteControlEscape) + ); + assert_eq!(*error.location.code.value.borrow(), r"\c"); + assert_eq!(error.location.code.start_line_number.get(), 1); + assert_eq!(*error.location.code.source, Source::Unknown); + assert_eq!(error.location.range, 2..2); + } + + #[test] + fn escape_unit_incomplete_control_backslash_escapes() { + let mut lexer = Lexer::from_memory(r"\c\", Source::Unknown); + let error = lexer.escape_unit().now_or_never().unwrap().unwrap_err(); + assert_matches!( + error.cause, + ErrorCause::Syntax(SyntaxError::IncompleteControlBackslashEscape) + ); + assert_eq!(*error.location.code.value.borrow(), r"\c\"); + assert_eq!(error.location.code.start_line_number.get(), 1); + assert_eq!(*error.location.code.source, Source::Unknown); + assert_eq!(error.location.range, 3..3); + + let mut lexer = Lexer::from_memory(r"\c\a", Source::Unknown); + let error = lexer.escape_unit().now_or_never().unwrap().unwrap_err(); + assert_matches!( + error.cause, + ErrorCause::Syntax(SyntaxError::IncompleteControlBackslashEscape) + ); + assert_eq!(*error.location.code.value.borrow(), r"\c\a"); + assert_eq!(error.location.code.start_line_number.get(), 1); + assert_eq!(*error.location.code.source, Source::Unknown); + assert_eq!(error.location.range, 3..4); + } + + #[test] + fn escape_unit_unknown_control_escape() { + let mut lexer = Lexer::from_memory(r"\c!`", Source::Unknown); + let error = lexer.escape_unit().now_or_never().unwrap().unwrap_err(); + assert_matches!( + error.cause, + ErrorCause::Syntax(SyntaxError::InvalidControlEscape) + ); + assert_eq!(*error.location.code.value.borrow(), r"\c!`"); + assert_eq!(error.location.code.start_line_number.get(), 1); + assert_eq!(*error.location.code.source, Source::Unknown); + 
assert_eq!(error.location.range, 2..3); } #[test] fn escape_unit_octal_escapes() { - let mut lexer = Lexer::from_memory(r"\0\07\177\0123", Source::Unknown); + let mut lexer = Lexer::from_memory(r"\0\07\234\0123", Source::Unknown); let result = lexer.escape_unit().now_or_never().unwrap().unwrap(); assert_eq!(result, Some(Octal(0o0))); let result = lexer.escape_unit().now_or_never().unwrap().unwrap(); assert_eq!(result, Some(Octal(0o7))); let result = lexer.escape_unit().now_or_never().unwrap().unwrap(); - assert_eq!(result, Some(Octal(0o177))); + assert_eq!(result, Some(Octal(0o234))); let result = lexer.escape_unit().now_or_never().unwrap().unwrap(); assert_eq!(result, Some(Octal(0o12))); // At most 3 octal digits are consumed @@ -323,11 +421,17 @@ mod tests { } #[test] - #[ignore = "not implemented"] fn escape_unit_non_byte_octal_escape() { - let mut lexer = Lexer::from_memory(r"\700", Source::Unknown); - let result = lexer.escape_unit().now_or_never().unwrap(); - todo!("should be an error: {result:?}"); + let mut lexer = Lexer::from_memory(r"\400", Source::Unknown); + let error = lexer.escape_unit().now_or_never().unwrap().unwrap_err(); + assert_matches!( + error.cause, + ErrorCause::Syntax(SyntaxError::OctalEscapeOutOfRange) + ); + assert_eq!(*error.location.code.value.borrow(), r"\400"); + assert_eq!(error.location.code.start_line_number.get(), 1); + assert_eq!(*error.location.code.source, Source::Unknown); + assert_eq!(error.location.range, 0..4); } #[test] @@ -349,9 +453,17 @@ mod tests { } #[test] - #[ignore = "not implemented"] fn escape_unit_incomplete_hexadecimal_escape() { - todo!() + let mut lexer = Lexer::from_memory(r"\x", Source::Unknown); + let error = lexer.escape_unit().now_or_never().unwrap().unwrap_err(); + assert_matches!( + error.cause, + ErrorCause::Syntax(SyntaxError::IncompleteHexEscape) + ); + assert_eq!(*error.location.code.value.borrow(), r"\x"); + assert_eq!(error.location.code.start_line_number.get(), 1); + 
assert_eq!(*error.location.code.source, Source::Unknown); + assert_eq!(error.location.range, 2..2); } #[test] @@ -374,12 +486,55 @@ mod tests { } #[test] - #[ignore = "not implemented"] fn escape_unit_incomplete_unicode_escapes() { - todo!() + let mut lexer = Lexer::from_memory(r"\u", Source::Unknown); + let error = lexer.escape_unit().now_or_never().unwrap().unwrap_err(); + assert_matches!( + error.cause, + ErrorCause::Syntax(SyntaxError::IncompleteShortUnicodeEscape) + ); + assert_eq!(*error.location.code.value.borrow(), r"\u"); + assert_eq!(error.location.code.start_line_number.get(), 1); + assert_eq!(*error.location.code.source, Source::Unknown); + assert_eq!(error.location.range, 2..2); + + let mut lexer = Lexer::from_memory(r"\U", Source::Unknown); + let error = lexer.escape_unit().now_or_never().unwrap().unwrap_err(); + assert_matches!( + error.cause, + ErrorCause::Syntax(SyntaxError::IncompleteLongUnicodeEscape) + ); + assert_eq!(*error.location.code.value.borrow(), r"\U"); + assert_eq!(error.location.code.start_line_number.get(), 1); + assert_eq!(*error.location.code.source, Source::Unknown); + assert_eq!(error.location.range, 2..2); } - // TODO escape_unit_invalid_unicode_escapes + #[test] + fn escape_unit_invalid_unicode_escapes() { + // U+D800 is not a valid Unicode scalar value + let mut lexer = Lexer::from_memory(r"\uD800", Source::Unknown); + let error = lexer.escape_unit().now_or_never().unwrap().unwrap_err(); + assert_matches!( + error.cause, + ErrorCause::Syntax(SyntaxError::UnicodeEscapeOutOfRange) + ); + assert_eq!(*error.location.code.value.borrow(), r"\uD800"); + assert_eq!(error.location.code.start_line_number.get(), 1); + assert_eq!(*error.location.code.source, Source::Unknown); + assert_eq!(error.location.range, 0..6); + } + + #[test] + fn escape_unit_unknown_escape() { + let mut lexer = Lexer::from_memory(r"\!", Source::Unknown); + let error = lexer.escape_unit().now_or_never().unwrap().unwrap_err(); + assert_matches!(error.cause, 
ErrorCause::Syntax(SyntaxError::InvalidEscape)); + assert_eq!(*error.location.code.value.borrow(), r"\!"); + assert_eq!(error.location.code.start_line_number.get(), 1); + assert_eq!(*error.location.code.source, Source::Unknown); + assert_eq!(error.location.range, 0..2); + } // TODO Reject non-portable escapes in POSIX mode @@ -448,5 +603,26 @@ mod tests { assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some('x'))); } - // TODO single_quoted_escaped_string_unclosed + #[test] + fn single_quoted_escaped_string_unclosed() { + let mut lexer = Lexer::from_memory("'foo", Source::Unknown); + let error = lexer + .single_quoted_escaped_string() + .now_or_never() + .unwrap() + .unwrap_err(); + assert_matches!( + error.cause, + ErrorCause::Syntax(SyntaxError::UnclosedDollarSingleQuote { opening_location }) => { + assert_eq!(*opening_location.code.value.borrow(), "'foo"); + assert_eq!(opening_location.code.start_line_number.get(), 1); + assert_eq!(*opening_location.code.source, Source::Unknown); + assert_eq!(opening_location.range, 0..1); + } + ); + assert_eq!(*error.location.code.value.borrow(), "'foo"); + assert_eq!(error.location.code.start_line_number.get(), 1); + assert_eq!(*error.location.code.source, Source::Unknown); + assert_eq!(error.location.range, 4..4); + } }