Skip to content

Commit

Permalink
Parse Unicode escapes
Browse files Browse the repository at this point in the history
  • Loading branch information
magicant committed Dec 1, 2024
1 parent dc6be06 commit 76931d1
Showing 1 changed file with 55 additions and 9 deletions.
64 changes: 55 additions & 9 deletions yash-syntax/src/parser/lex/escape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,36 @@ use crate::syntax::EscapedString;

impl Lexer<'_> {
/// Parses a hexadecimal digit.
async fn hex_digit(&mut self) -> Result<Option<u8>> {
async fn hex_digit(&mut self) -> Result<Option<u32>> {
if let Some(c) = self.peek_char().await? {
if let Some(digit) = c.to_digit(16) {
self.consume_char();
return Ok(Some(digit as u8));
return Ok(Some(digit));
}
}
Ok(None)
}

/// Parses a sequence of hexadecimal digits.
///
/// This function consumes up to `count` hexadecimal digits and returns the
/// value as a single number. If fewer than `count` digits are found, the
/// function returns the value of the digits found so far. If no digits are
/// found, the function returns `Ok(None)`.
async fn hex_digits(&mut self, count: usize) -> Result<Option<u32>> {
let Some(digit) = self.hex_digit().await? else {
return Ok(None);
};
let mut value = digit;
for _ in 1..count {
let Some(digit) = self.hex_digit().await? else {
break;
};
value = value << 4 | digit;
}
Ok(Some(value))
}

/// Parses an escape unit.
///
/// This function tests if the next character is an escape sequence and
Expand Down Expand Up @@ -88,14 +108,25 @@ impl Lexer<'_> {
}

'x' => {
let Some(digit1) = self.hex_digit().await? else {
let Some(value) = self.hex_digits(2).await? else {
todo!("return error: missing hexadecimal digit");
};
let Some(digit2) = self.hex_digit().await? else {
return Ok(Some(Hex(digit1)));
};
// TODO Reject a third hexadecimal digit in POSIX mode
Ok(Some(Hex(digit1 << 4 | digit2)))
Ok(Some(Hex(value as u8)))
}

'u' => {
let Some(value) = self.hex_digits(4).await? else {
todo!("return error: missing hexadecimal digit");
};
Ok(Some(Unicode(char::from_u32(value).expect("todo"))))
}

'U' => {
let Some(value) = self.hex_digits(8).await? else {
todo!("return error: missing hexadecimal digit");
};
Ok(Some(Unicode(char::from_u32(value).expect("todo"))))
}

_ => {
Expand Down Expand Up @@ -324,9 +355,22 @@ mod tests {
}

#[test]
#[ignore = "not implemented"]
fn escape_unit_unicode_escapes() {
todo!()
let mut lexer = Lexer::from_memory(r"\u20\u4B9d0", Source::Unknown);
let result = lexer.escape_unit().now_or_never().unwrap().unwrap();
assert_eq!(result, Some(Unicode('\u{20}')));
let result = lexer.escape_unit().now_or_never().unwrap().unwrap();
assert_eq!(result, Some(Unicode('\u{4B9D}')));
// At most 4 hexadecimal digits are consumed
assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some('0')));

let mut lexer = Lexer::from_memory(r"\U42\U0001f4A9b", Source::Unknown);
let result = lexer.escape_unit().now_or_never().unwrap().unwrap();
assert_eq!(result, Some(Unicode('\u{42}')));
let result = lexer.escape_unit().now_or_never().unwrap().unwrap();
assert_eq!(result, Some(Unicode('\u{1F4A9}')));
// At most 8 hexadecimal digits are consumed
assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some('b')));
}

#[test]
Expand All @@ -335,6 +379,8 @@ mod tests {
todo!()
}

// TODO escape_unit_invalid_unicode_escapes

// TODO Reject non-portable escapes in POSIX mode

#[test]
Expand Down

0 comments on commit 76931d1

Please sign in to comment.