From 579c455ff65730e3cf7dd46e5f2ef4ce08904e9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Juli=C3=A1n=20Espina?= Date: Tue, 7 Jan 2025 11:17:38 -0600 Subject: [PATCH] Move methods of `JsString` to `JsStr` (#4106) --- core/string/src/lib.rs | 647 ++++++++++++++------------------------- core/string/src/str.rs | 295 +++++++++++++++++- core/string/src/tests.rs | 10 +- 3 files changed, 521 insertions(+), 431 deletions(-) diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index b64004a008f..eb206dfd92f 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -42,7 +42,6 @@ use std::{ cell::Cell, convert::Infallible, hash::{Hash, Hasher}, - iter::Peekable, mem::ManuallyDrop, process::abort, ptr::{self, addr_of, addr_of_mut, NonNull}, @@ -290,6 +289,235 @@ impl<'a> IntoIterator for &'a JsString { } } +impl JsString { + /// Create an iterator over the [`JsString`]. + #[inline] + #[must_use] + pub fn iter(&self) -> Iter<'_> { + self.as_str().iter() + } + + /// Create an iterator over overlapping subslices of length size. + #[inline] + #[must_use] + pub fn windows(&self, size: usize) -> Windows<'_> { + self.as_str().windows(size) + } + + /// Decodes a [`JsString`] into a [`String`], replacing invalid data with its escaped representation + /// in 4 digit hexadecimal. + #[inline] + #[must_use] + pub fn to_std_string_escaped(&self) -> String { + self.display_escaped().to_string() + } + + /// Decodes a [`JsString`] into a [`String`], replacing invalid data with the + /// replacement character U+FFFD. + #[inline] + #[must_use] + pub fn to_std_string_lossy(&self) -> String { + self.display_lossy().to_string() + } + + /// Decodes a [`JsString`] into a [`String`], returning an error if the string contains unpaired + /// surrogates. + /// + /// # Errors + /// + /// [`FromUtf16Error`][std::string::FromUtf16Error] if it contains any invalid data. 
+ #[inline] + pub fn to_std_string(&self) -> Result { + self.as_str().to_std_string() + } + + /// Decodes a [`JsString`] into an iterator of [`Result`], returning surrogates as + /// errors. + #[inline] + pub fn to_std_string_with_surrogates(&self) -> impl Iterator> + '_ { + self.as_str().to_std_string_with_surrogates() + } + + /// Maps the valid segments of an UTF16 string and leaves the unpaired surrogates unchanged. + #[inline] + #[must_use] + pub fn map_valid_segments(&self, mut f: F) -> Self + where + F: FnMut(String) -> String, + { + let mut text = Vec::new(); + + for part in self.to_std_string_with_surrogates() { + match part { + Ok(string) => text.extend(f(string).encode_utf16()), + Err(surr) => text.push(surr), + } + } + + Self::from(&text[..]) + } + + /// Gets an iterator of all the Unicode codepoints of a [`JsString`]. + #[inline] + pub fn code_points(&self) -> impl Iterator + Clone + '_ { + self.as_str().code_points() + } + + /// Abstract operation `StringIndexOf ( string, searchValue, fromIndex )` + /// + /// Note: Instead of returning an isize with `-1` as the "not found" value, we make use of the + /// type system and return [Option]\ with [`None`] as the "not found" value. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// + /// [spec]: https://tc39.es/ecma262/#sec-stringindexof + #[inline] + #[must_use] + pub fn index_of(&self, search_value: JsStr<'_>, from_index: usize) -> Option { + self.as_str().index_of(search_value, from_index) + } + + /// Abstract operation `CodePointAt( string, position )`. + /// + /// The abstract operation `CodePointAt` takes arguments `string` (a String) and `position` (a + /// non-negative integer) and returns a Record with fields `[[CodePoint]]` (a code point), + /// `[[CodeUnitCount]]` (a positive integer), and `[[IsUnpairedSurrogate]]` (a Boolean). 
It + /// interprets string as a sequence of UTF-16 encoded code points, as described in 6.1.4, and reads + /// from it a single code point starting with the code unit at index `position`. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// + /// [spec]: https://tc39.es/ecma262/#sec-codepointat + /// + /// # Panics + /// + /// If `position` is greater than or equal to the length of the string. + #[inline] + #[must_use] + pub fn code_point_at(&self, position: usize) -> CodePoint { + self.as_str().code_point_at(position) + } + + /// Abstract operation `StringToNumber ( str )` + /// + /// More information: + /// - [ECMAScript reference][spec] + /// + /// [spec]: https://tc39.es/ecma262/#sec-stringtonumber + #[inline] + #[must_use] + pub fn to_number(&self) -> f64 { + self.as_str().to_number() + } + + /// Get the length of the [`JsString`]. + #[inline] + #[must_use] + pub fn len(&self) -> usize { + self.as_str().len() + } + + /// Return true if the [`JsString`] is empty. + #[inline] + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Convert the [`JsString`] into a [`Vec`]. + #[inline] + #[must_use] + pub fn to_vec(&self) -> Vec { + self.as_str().to_vec() + } + + /// Check if the [`JsString`] contains a byte. + #[inline] + #[must_use] + pub fn contains(&self, element: u8) -> bool { + self.as_str().contains(element) + } + + /// Trim whitespace from the start and end of the [`JsString`]. + #[inline] + #[must_use] + pub fn trim(&self) -> JsStr<'_> { + self.as_str().trim() + } + + /// Trim whitespace from the start of the [`JsString`]. + #[inline] + #[must_use] + pub fn trim_start(&self) -> JsStr<'_> { + self.as_str().trim_start() + } + + /// Trim whitespace from the end of the [`JsString`]. + #[inline] + #[must_use] + pub fn trim_end(&self) -> JsStr<'_> { + self.as_str().trim_end() + } + + /// Get the element at the given index, [`None`] otherwise. 
+ #[inline] + #[must_use] + pub fn get<'a, I>(&'a self, index: I) -> Option + where + I: JsSliceIndex<'a>, + { + self.as_str().get(index) + } + + /// Returns an element or subslice depending on the type of index, without doing bounds check. + /// + /// # Safety + /// + /// Caller must ensure the index is not out of bounds + #[inline] + #[must_use] + pub unsafe fn get_unchecked<'a, I>(&'a self, index: I) -> I::Value + where + I: JsSliceIndex<'a>, + { + // SAFETY: Caller must ensure the index is not out of bounds + unsafe { self.as_str().get_unchecked(index) } + } + + /// Get the element at the given index. + /// + /// # Panics + /// + /// If the index is out of bounds. + #[inline] + #[must_use] + pub fn get_expect<'a, I>(&'a self, index: I) -> I::Value + where + I: JsSliceIndex<'a>, + { + self.as_str().get_expect(index) + } + + /// Gets a displayable escaped string. This may be faster and has fewer + /// allocations than `format!("{}", str.to_std_string_escaped())` when + /// displaying. + #[inline] + #[must_use] + pub fn display_escaped(&self) -> JsStrDisplayEscaped<'_> { + self.as_str().display_escaped() + } + + /// Gets a displayable lossy string. This may be faster and has fewer + /// allocations than `format!("{}", str.to_std_string_lossy())` when displaying. + #[inline] + #[must_use] + pub fn display_lossy(&self) -> JsStrDisplayLossy<'_> { + self.as_str().display_lossy() + } +} + impl JsString { /// Create a [`JsString`] from a static js string. #[must_use] @@ -304,20 +532,6 @@ impl JsString { } } - /// Create an iterator over the [`JsString`]. - #[inline] - #[must_use] - pub fn iter(&self) -> Iter<'_> { - Iter::new(self.as_str()) - } - - /// Create an iterator over overlapping subslices of length size. 
- #[inline] - #[must_use] - pub fn windows(&self, size: usize) -> Windows<'_> { - Windows::new(self.as_str(), size) - } - /// Obtains the underlying [`&[u16]`][slice] slice of a [`JsString`] #[inline] #[must_use] @@ -448,278 +662,6 @@ impl JsString { StaticJsStrings::get_string(&string.as_str()).unwrap_or(string) } - /// Decodes a [`JsString`] into a [`String`], replacing invalid data with its escaped representation - /// in 4 digit hexadecimal. - #[inline] - #[must_use] - pub fn to_std_string_escaped(&self) -> String { - self.to_string_escaped() - } - - /// Decodes a [`JsString`] into a [`String`], replacing invalid data with the - /// replacement character U+FFFD. - #[inline] - #[must_use] - pub fn to_std_string_lossy(&self) -> String { - self.code_points() - .map(|cp| match cp { - CodePoint::Unicode(c) => c, - CodePoint::UnpairedSurrogate(_) => '\u{FFFD}', - }) - .collect() - } - - /// Decodes a [`JsString`] into a [`String`], returning - /// - /// # Errors - /// - /// [`FromUtf16Error`][std::string::FromUtf16Error] if it contains any invalid data. - #[inline] - pub fn to_std_string(&self) -> Result { - match self.as_str().variant() { - JsStrVariant::Latin1(v) => Ok(v.iter().copied().map(char::from).collect()), - JsStrVariant::Utf16(v) => String::from_utf16(v), - } - } - - /// Decodes a [`JsString`] into an iterator of [`Result`], returning surrogates as - /// errors. 
- #[inline] - pub fn to_std_string_with_surrogates(&self) -> impl Iterator> + '_ { - struct WideStringDecoderIterator { - codepoints: Peekable, - } - - impl WideStringDecoderIterator { - fn new(iterator: I) -> Self { - Self { - codepoints: iterator.peekable(), - } - } - } - - impl Iterator for WideStringDecoderIterator - where - I: Iterator, - { - type Item = Result; - - fn next(&mut self) -> Option { - let cp = self.codepoints.next()?; - let char = match cp { - CodePoint::Unicode(c) => c, - CodePoint::UnpairedSurrogate(surr) => return Some(Err(surr)), - }; - - let mut string = String::from(char); - - loop { - let Some(cp) = self.codepoints.peek().and_then(|cp| match cp { - CodePoint::Unicode(c) => Some(*c), - CodePoint::UnpairedSurrogate(_) => None, - }) else { - break; - }; - - string.push(cp); - - self.codepoints - .next() - .expect("should exist by the check above"); - } - - Some(Ok(string)) - } - } - - WideStringDecoderIterator::new(self.code_points()) - } - - /// Maps the valid segments of an UTF16 string and leaves the unpaired surrogates unchanged. - #[inline] - #[must_use] - pub fn map_valid_segments(&self, mut f: F) -> Self - where - F: FnMut(String) -> String, - { - let mut text = Vec::new(); - - for part in self.to_std_string_with_surrogates() { - match part { - Ok(string) => text.extend(f(string).encode_utf16()), - Err(surr) => text.push(surr), - } - } - - Self::from(&text[..]) - } - - /// Gets an iterator of all the Unicode codepoints of a [`JsString`]. - #[inline] - pub fn code_points(&self) -> impl Iterator + Clone + '_ { - self.as_str().code_points() - } - - /// Abstract operation `StringIndexOf ( string, searchValue, fromIndex )` - /// - /// Note: Instead of returning an isize with `-1` as the "not found" value, we make use of the - /// type system and return [Option]\ with [`None`] as the "not found" value. 
- /// - /// More information: - /// - [ECMAScript reference][spec] - /// - /// [spec]: https://tc39.es/ecma262/#sec-stringindexof - #[inline] - #[must_use] - pub fn index_of(&self, search_value: JsStr<'_>, from_index: usize) -> Option { - // 1. Assert: Type(string) is String. - // 2. Assert: Type(searchValue) is String. - // 3. Assert: fromIndex is a non-negative integer. - - // 4. Let len be the length of string. - let len = self.len(); - - // 5. If searchValue is the empty String and fromIndex ≤ len, return fromIndex. - if search_value.is_empty() { - return if from_index <= len { - Some(from_index) - } else { - None - }; - } - - // 6. Let searchLen be the length of searchValue. - // 7. For each integer i starting with fromIndex such that i ≤ len - searchLen, in ascending order, do - // a. Let candidate be the substring of string from i to i + searchLen. - // b. If candidate is the same sequence of code units as searchValue, return i. - // 8. Return -1. - self.windows(search_value.len()) - .skip(from_index) - .position(|s| s == search_value) - .map(|i| i + from_index) - } - - /// Abstract operation `CodePointAt( string, position )`. - /// - /// The abstract operation `CodePointAt` takes arguments `string` (a String) and `position` (a - /// non-negative integer) and returns a Record with fields `[[CodePoint]]` (a code point), - /// `[[CodeUnitCount]]` (a positive integer), and `[[IsUnpairedSurrogate]]` (a Boolean). It - /// interprets string as a sequence of UTF-16 encoded code points, as described in 6.1.4, and reads - /// from it a single code point starting with the code unit at index `position`. - /// - /// More information: - /// - [ECMAScript reference][spec] - /// - /// [spec]: https://tc39.es/ecma262/#sec-codepointat - /// - /// # Panics - /// - /// If `position` is smaller than size of string. - #[inline] - #[must_use] - pub fn code_point_at(&self, position: usize) -> CodePoint { - // 1. Let size be the length of string. - let size = self.len(); - - // 2. 
Assert: position ≥ 0 and position < size. - // position >= 0 ensured by position: usize - assert!(position < size); - - match self.as_str().variant() { - JsStrVariant::Latin1(v) => { - let code_point = v.get(position).expect("Already checked the size"); - CodePoint::Unicode(*code_point as char) - } - // 3. Let first be the code unit at index position within string. - // 4. Let cp be the code point whose numeric value is that of first. - // 5. If first is not a leading surrogate or trailing surrogate, then - // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: false }. - // 6. If first is a trailing surrogate or position + 1 = size, then - // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }. - // 7. Let second be the code unit at index position + 1 within string. - // 8. If second is not a trailing surrogate, then - // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }. - // 9. Set cp to ! UTF16SurrogatePairToCodePoint(first, second). - JsStrVariant::Utf16(v) => { - // We can skip the checks and instead use the `char::decode_utf16` function to take care of that for us. - let code_point = v - .get(position..=position + 1) - .unwrap_or(&v[position..=position]); - - match char::decode_utf16(code_point.iter().copied()) - .next() - .expect("code_point always has a value") - { - Ok(c) => CodePoint::Unicode(c), - Err(e) => CodePoint::UnpairedSurrogate(e.unpaired_surrogate()), - } - } - } - } - - /// Abstract operation `StringToNumber ( str )` - /// - /// More information: - /// - [ECMAScript reference][spec] - /// - /// [spec]: https://tc39.es/ecma262/#sec-stringtonumber - #[inline] - #[must_use] - pub fn to_number(&self) -> f64 { - // 1. Let text be ! StringToCodePoints(str). - // 2. Let literal be ParseText(text, StringNumericLiteral). - let Ok(string) = self.to_std_string() else { - // 3. If literal is a List of errors, return NaN. 
- return f64::NAN; - }; - // 4. Return StringNumericValue of literal. - let string = string.trim_matches(is_trimmable_whitespace); - match string { - "" => return 0.0, - "-Infinity" => return f64::NEG_INFINITY, - "Infinity" | "+Infinity" => return f64::INFINITY, - _ => {} - } - - let mut s = string.bytes(); - let base = match (s.next(), s.next()) { - (Some(b'0'), Some(b'b' | b'B')) => Some(2), - (Some(b'0'), Some(b'o' | b'O')) => Some(8), - (Some(b'0'), Some(b'x' | b'X')) => Some(16), - // Make sure that no further variants of "infinity" are parsed. - (Some(b'i' | b'I'), _) => { - return f64::NAN; - } - _ => None, - }; - - // Parse numbers that begin with `0b`, `0o` and `0x`. - if let Some(base) = base { - let string = &string[2..]; - if string.is_empty() { - return f64::NAN; - } - - // Fast path - if let Ok(value) = u32::from_str_radix(string, base) { - return f64::from(value); - } - - // Slow path - let mut value: f64 = 0.0; - for c in s { - if let Some(digit) = char::from(c).to_digit(base) { - value = value.mul_add(f64::from(base), f64::from(digit)); - } else { - return f64::NAN; - } - } - return value; - } - - fast_float2::parse(string).unwrap_or(f64::NAN) - } - /// Allocates a new [`RawJsString`] with an internal capacity of `str_len` chars. /// /// # Panics @@ -843,58 +785,6 @@ impl JsString { Self::from_slice_skip_interning(string) } - /// Get the length of the [`JsString`]. - #[inline] - #[must_use] - pub fn len(&self) -> usize { - self.as_str().len() - } - - /// Return true if the [`JsString`] is emtpy. - #[inline] - #[must_use] - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Convert the [`JsString`] into a [`Vec`]. - #[inline] - #[must_use] - pub fn to_vec(&self) -> Vec { - self.as_str().to_vec() - } - - /// Check if the [`JsString`] contains a byte. 
- #[inline] - #[must_use] - pub fn contains(&self, element: u8) -> bool { - match self.as_str().variant() { - JsStrVariant::Latin1(v) => v.contains(&element), - JsStrVariant::Utf16(v) => v.contains(&u16::from(element)), - } - } - - /// Trim whitespace from the start and end of the [`JsString`]. - #[inline] - #[must_use] - pub fn trim(&self) -> JsStr<'_> { - self.as_str().trim() - } - - /// Trim whitespace from the start of the [`JsString`]. - #[inline] - #[must_use] - pub fn trim_start(&self) -> JsStr<'_> { - self.as_str().trim_start() - } - - /// Trim whitespace from the end of the [`JsString`]. - #[inline] - #[must_use] - pub fn trim_end(&self) -> JsStr<'_> { - self.as_str().trim_end() - } - /// Check if the [`JsString`] is static. #[inline] #[must_use] @@ -902,45 +792,6 @@ impl JsString { self.refcount().is_none() } - /// Get the element a the given index, [`None`] otherwise. - #[inline] - #[must_use] - pub fn get<'a, I>(&'a self, index: I) -> Option - where - I: JsSliceIndex<'a>, - { - I::get(self.as_str(), index) - } - - /// Returns an element or subslice depending on the type of index, without doing bounds check. - /// - /// # Safety - /// - /// Caller must ensure the index is not out of bounds - #[inline] - #[must_use] - pub unsafe fn get_unchecked<'a, I>(&'a self, index: I) -> I::Value - where - I: JsSliceIndex<'a>, - { - // SAFETY: Caller must ensure the index is not out of bounds - unsafe { I::get_unchecked(self.as_str(), index) } - } - - /// Get the element a the given index. - /// - /// # Panics - /// - /// If the index is out of bounds. - #[inline] - #[must_use] - pub fn get_expect<'a, I>(&'a self, index: I) -> I::Value - where - I: JsSliceIndex<'a>, - { - self.get(index).expect("Index out of bounds") - } - /// Gets the number of `JsString`s which point to this allocation. #[inline] #[must_use] @@ -961,23 +812,6 @@ impl JsString { UnwrappedTagged::Tag(_inner) => None, } } - - /// Gets a displayable escaped string. 
This may be faster and has fewer - /// allocations than `format!("{}", str.to_string_escaped())` when - /// displaying. - #[inline] - #[must_use] - pub fn display_escaped(&self) -> JsStrDisplayEscaped<'_> { - JsStrDisplayEscaped::from(self.as_str()) - } - - /// Gets a displayable lossy string. This may be faster and has fewer - /// allocations than `format!("{}", str.to_string_lossy())` when displaying. - #[inline] - #[must_use] - pub fn display_lossy(&self) -> JsStrDisplayLossy<'_> { - JsStrDisplayLossy::from(self.as_str()) - } } impl Clone for JsString { @@ -1076,17 +910,10 @@ impl Drop for JsString { } } -impl ToStringEscaped for JsString { - #[inline] - fn to_string_escaped(&self) -> String { - format!("{}", self.display_escaped()) - } -} - impl std::fmt::Debug for JsString { #[inline] fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.to_std_string_escaped().fmt(f) + self.as_str().fmt(f) } } @@ -1283,17 +1110,3 @@ impl FromStr for JsString { Ok(Self::from(s)) } } - -/// Utility trait that adds a `UTF-16` escaped representation to every [`[u16]`][slice]. -pub(crate) trait ToStringEscaped { - /// Decodes `self` as an `UTF-16` encoded string, escaping any unpaired surrogates by its - /// codepoint value. - fn to_string_escaped(&self) -> String; -} - -impl ToStringEscaped for [u16] { - #[inline] - fn to_string_escaped(&self) -> String { - JsString::from(self).to_string_escaped() - } -} diff --git a/core/string/src/str.rs b/core/string/src/str.rs index c0c49f44f27..8ab75409108 100644 --- a/core/string/src/str.rs +++ b/core/string/src/str.rs @@ -1,4 +1,7 @@ -use crate::{is_trimmable_whitespace, is_trimmable_whitespace_latin1, CodePoint, Iter}; +use crate::{ + display::{JsStrDisplayEscaped, JsStrDisplayLossy}, + is_trimmable_whitespace, is_trimmable_whitespace_latin1, CodePoint, Iter, +}; use std::{ hash::{Hash, Hasher}, slice::SliceIndex, @@ -47,7 +50,7 @@ pub enum JsStrVariant<'a> { } /// This is equivalent to Rust's `&str`. 
-#[derive(Debug, Clone, Copy)] +#[derive(Clone, Copy)] pub struct JsStr<'a> { inner: JsStrVariant<'a>, } @@ -191,6 +194,20 @@ impl<'a> JsStr<'a> { I::get(self, index) } + /// Get the element at the given index. + /// + /// # Panics + /// + /// If the index is out of bounds. + #[inline] + #[must_use] + pub fn get_expect(&self, index: I) -> I::Value + where + I: JsSliceIndex<'a>, + { + self.get(index).expect("Index out of bounds") + } + /// Returns an element or subslice depending on the type of index, without doing bounds check. /// /// # Safety @@ -235,23 +252,276 @@ impl<'a> JsStr<'a> { m >= n && needle == self.get(m - n..).expect("already checked size") } - /// Gets an iterator of all the Unicode codepoints of a [`JsStr`], replacing - /// unpaired surrogates with the replacement character. This is faster than - /// using [`Self::code_points`]. + /// Abstract operation `StringIndexOf ( string, searchValue, fromIndex )` + /// + /// Note: Instead of returning an isize with `-1` as the "not found" value, we make use of the + /// type system and return [Option]\ with [`None`] as the "not found" value. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// + /// [spec]: https://tc39.es/ecma262/#sec-stringindexof #[inline] - pub(crate) fn code_points_lossy(self) -> impl Iterator + 'a { - char::decode_utf16(self.iter()).map(|res| res.unwrap_or('\u{FFFD}')) + #[must_use] + pub fn index_of(&self, search_value: JsStr<'_>, from_index: usize) -> Option { + // 1. Assert: Type(string) is String. + // 2. Assert: Type(searchValue) is String. + // 3. Assert: fromIndex is a non-negative integer. + + // 4. Let len be the length of string. + let len = self.len(); + + // 5. If searchValue is the empty String and fromIndex ≤ len, return fromIndex. + if search_value.is_empty() { + return if from_index <= len { + Some(from_index) + } else { + None + }; + } + + // 6. Let searchLen be the length of searchValue. + // 7. 
For each integer i starting with fromIndex such that i ≤ len - searchLen, in ascending order, do + // a. Let candidate be the substring of string from i to i + searchLen. + // b. If candidate is the same sequence of code units as searchValue, return i. + // 8. Return -1. + self.windows(search_value.len()) + .skip(from_index) + .position(|s| s == search_value) + .map(|i| i + from_index) + } + + /// Abstract operation `CodePointAt( string, position )`. + /// + /// The abstract operation `CodePointAt` takes arguments `string` (a String) and `position` (a + /// non-negative integer) and returns a Record with fields `[[CodePoint]]` (a code point), + /// `[[CodeUnitCount]]` (a positive integer), and `[[IsUnpairedSurrogate]]` (a Boolean). It + /// interprets string as a sequence of UTF-16 encoded code points, as described in 6.1.4, and reads + /// from it a single code point starting with the code unit at index `position`. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// + /// [spec]: https://tc39.es/ecma262/#sec-codepointat + /// + /// # Panics + /// + /// If `position` is greater than or equal to the length of the string. + #[inline] + #[must_use] + pub fn code_point_at(&self, position: usize) -> CodePoint { + // 1. Let size be the length of string. + let size = self.len(); + + // 2. Assert: position ≥ 0 and position < size. + // position >= 0 ensured by position: usize + assert!(position < size); + + match self.variant() { + JsStrVariant::Latin1(v) => { + let code_point = v.get(position).expect("Already checked the size"); + CodePoint::Unicode(*code_point as char) + } + // 3. Let first be the code unit at index position within string. + // 4. Let cp be the code point whose numeric value is that of first. + // 5. If first is not a leading surrogate or trailing surrogate, then + // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: false }. + // 6. If first is a trailing surrogate or position + 1 = size, then + // a. 
Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }. + // 7. Let second be the code unit at index position + 1 within string. + // 8. If second is not a trailing surrogate, then + // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }. + // 9. Set cp to ! UTF16SurrogatePairToCodePoint(first, second). + JsStrVariant::Utf16(v) => { + // We can skip the checks and instead use the `char::decode_utf16` function to take care of that for us. + let code_point = v + .get(position..=position + 1) + .unwrap_or(&v[position..=position]); + + match char::decode_utf16(code_point.iter().copied()) + .next() + .expect("code_point always has a value") + { + Ok(c) => CodePoint::Unicode(c), + Err(e) => CodePoint::UnpairedSurrogate(e.unpaired_surrogate()), + } + } + } + } + + /// Abstract operation `StringToNumber ( str )` + /// + /// More information: + /// - [ECMAScript reference][spec] + /// + /// [spec]: https://tc39.es/ecma262/#sec-stringtonumber + #[inline] + #[must_use] + pub fn to_number(&self) -> f64 { + // 1. Let text be ! StringToCodePoints(str). + // 2. Let literal be ParseText(text, StringNumericLiteral). + let Ok(string) = self.to_std_string() else { + // 3. If literal is a List of errors, return NaN. + return f64::NAN; + }; + // 4. Return StringNumericValue of literal. + let string = string.trim_matches(is_trimmable_whitespace); + match string { + "" => return 0.0, + "-Infinity" => return f64::NEG_INFINITY, + "Infinity" | "+Infinity" => return f64::INFINITY, + _ => {} + } + + let mut s = string.bytes(); + let base = match (s.next(), s.next()) { + (Some(b'0'), Some(b'b' | b'B')) => Some(2), + (Some(b'0'), Some(b'o' | b'O')) => Some(8), + (Some(b'0'), Some(b'x' | b'X')) => Some(16), + // Make sure that no further variants of "infinity" are parsed. + (Some(b'i' | b'I'), _) => { + return f64::NAN; + } + _ => None, + }; + + // Parse numbers that begin with `0b`, `0o` and `0x`. 
+ if let Some(base) = base { + let string = &string[2..]; + if string.is_empty() { + return f64::NAN; + } + + // Fast path + if let Ok(value) = u32::from_str_radix(string, base) { + return f64::from(value); + } + + // Slow path + let mut value: f64 = 0.0; + for c in s { + if let Some(digit) = char::from(c).to_digit(base) { + value = value.mul_add(f64::from(base), f64::from(digit)); + } else { + return f64::NAN; + } + } + return value; + } + + fast_float2::parse(string).unwrap_or(f64::NAN) } /// Gets an iterator of all the Unicode codepoints of a [`JsStr`]. - /// This is not optimized for Latin1 strings. + // TODO: optimize for Latin1 strings. #[inline] - pub(crate) fn code_points(self) -> impl Iterator + Clone + 'a { + pub fn code_points(&self) -> impl Iterator + Clone + 'a { char::decode_utf16(self.iter()).map(|res| match res { Ok(c) => CodePoint::Unicode(c), Err(e) => CodePoint::UnpairedSurrogate(e.unpaired_surrogate()), }) } + + /// Checks if the [`JsStr`] contains a byte. + #[inline] + #[must_use] + pub fn contains(&self, element: u8) -> bool { + match self.variant() { + JsStrVariant::Latin1(v) => v.contains(&element), + JsStrVariant::Utf16(v) => v.contains(&u16::from(element)), + } + } + + /// Gets an iterator of all the Unicode codepoints of a [`JsStr`], replacing + /// unpaired surrogates with the replacement character. This is faster than + /// using [`Self::code_points`]. + #[inline] + pub fn code_points_lossy(self) -> impl Iterator + 'a { + char::decode_utf16(self.iter()).map(|res| res.unwrap_or('\u{FFFD}')) + } + + /// Decodes a [`JsStr`] into an iterator of [`Result`], returning surrogates as + /// errors. 
+ #[inline] + #[allow(clippy::missing_panics_doc)] + pub fn to_std_string_with_surrogates(&self) -> impl Iterator> + 'a { + let mut iter = self.code_points().peekable(); + + std::iter::from_fn(move || { + let cp = iter.next()?; + let char = match cp { + CodePoint::Unicode(c) => c, + CodePoint::UnpairedSurrogate(surr) => return Some(Err(surr)), + }; + + let mut string = String::from(char); + + loop { + let Some(cp) = iter.peek().and_then(|cp| match cp { + CodePoint::Unicode(c) => Some(*c), + CodePoint::UnpairedSurrogate(_) => None, + }) else { + break; + }; + + string.push(cp); + + iter.next().expect("should exist by the check above"); + } + + Some(Ok(string)) + }) + } + + /// Decodes a [`JsStr`] into a [`String`], returning an error if it contains any invalid data. + /// + /// # Errors + /// + /// [`FromUtf16Error`][std::string::FromUtf16Error] if it contains any invalid data. + #[inline] + pub fn to_std_string(&self) -> Result { + match self.variant() { + JsStrVariant::Latin1(v) => Ok(v.iter().copied().map(char::from).collect()), + JsStrVariant::Utf16(v) => String::from_utf16(v), + } + } + + /// Decodes a [`JsStr`] into a [`String`], replacing invalid data with its escaped representation + /// in 4 digit hexadecimal. + #[inline] + #[must_use] + pub fn to_std_string_escaped(&self) -> String { + self.display_escaped().to_string() + } + + /// Decodes a [`JsStr`] into a [`String`], replacing invalid data with the + /// replacement character U+FFFD. + #[inline] + #[must_use] + pub fn to_std_string_lossy(&self) -> String { + self.display_lossy().to_string() + } + + /// Gets a displayable escaped string. + /// + /// This may be faster and has fewer + /// allocations than `format!("{}", str.to_std_string_escaped())` when + /// displaying. + #[inline] + #[must_use] + pub fn display_escaped(&self) -> JsStrDisplayEscaped<'a> { + JsStrDisplayEscaped::from(*self) + } + + /// Gets a displayable lossy string. 
+ /// + /// This may be faster and has fewer + /// allocations than `format!("{}", str.to_std_string_lossy())` when displaying. + #[inline] + #[must_use] + pub fn display_lossy(&self) -> JsStrDisplayLossy<'a> { + JsStrDisplayLossy::from(*self) + } } impl Hash for JsStr<'_> { @@ -341,6 +611,13 @@ impl<'a> PartialEq> for [u16] { } } +impl std::fmt::Debug for JsStr<'_> { + #[inline] + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.to_std_string_escaped().fmt(f) + } +} + pub trait JsSliceIndex<'a>: SliceIndex<[u8]> + SliceIndex<[u16]> { type Value; diff --git a/core/string/src/tests.rs b/core/string/src/tests.rs index 043f07a0c09..7717c4d4e8f 100644 --- a/core/string/src/tests.rs +++ b/core/string/src/tests.rs @@ -4,7 +4,7 @@ use std::hash::{BuildHasher, BuildHasherDefault, Hash}; use crate::{ CommonJsStringBuilder, JsStr, JsString, Latin1JsStringBuilder, StaticJsString, StaticJsStrings, - ToStringEscaped, Utf16JsStringBuilder, + Utf16JsStringBuilder, }; use rustc_hash::FxHasher; @@ -178,21 +178,21 @@ fn conversion_to_known_static_js_string() { } #[test] -fn to_string_escaped() { +fn to_std_string_escaped() { assert_eq!( - JsString::from("Hello, \u{1D49E} world!").to_string_escaped(), + JsString::from("Hello, \u{1D49E} world!").to_std_string_escaped(), "Hello, \u{1D49E} world!" ); assert_eq!( - JsString::from("Hello, world!").to_string_escaped(), + JsString::from("Hello, world!").to_std_string_escaped(), "Hello, world!" ); // 15 should not be escaped. let unpaired_surrogates: [u16; 3] = [0xDC58, 0xD83C, 0x0015]; assert_eq!( - JsString::from(&unpaired_surrogates).to_string_escaped(), + JsString::from(&unpaired_surrogates).to_std_string_escaped(), "\\uDC58\\uD83C\u{15}" ); }