diff --git a/scripts/unicode.py b/scripts/unicode.py
index f40e411..109bb38 100755
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@@ -19,6 +19,7 @@
 # Since this should not require frequent updates, we just store this
 # out-of-line and check the tables.rs and normalization_tests.rs files into git.
 import collections
+import re
 import urllib.request
 
 UNICODE_VERSION = "15.1.0"
@@ -66,6 +67,8 @@ class UnicodeData(object):
     def __init__(self):
         self._load_unicode_data()
+        self._load_default_ignorable_marks()
+
         self.norm_props = self._load_norm_props()
         self.norm_tests = self._load_norm_tests()
@@ -100,6 +103,11 @@ def _load_unicode_data(self):
         self.general_category_mark = []
         self.general_category_public_assigned = []
 
+        # Characters that cannot be part of a combining character sequence:
+        # control characters, format characters other than ZWJ and ZWNJ,
+        # the line and paragraph separators, and noncharacters.
+        self.not_in_ccs = []
+
         assigned_start = 0;
         prev_char_int = -1;
         prev_name = "";
@@ -125,6 +133,9 @@ def _load_unicode_data(self):
             if category == 'M' or 'M' in expanded_categories.get(category, []):
                 self.general_category_mark.append(char_int)
 
+            if category in ['Cc', 'Cf', 'Zl', 'Zp'] and char_int not in [0x200C, 0x200D]:
+                self.not_in_ccs.append(char_int)
+
             assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
             if category not in ['Co', 'Cs']:
                 if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
@@ -135,6 +146,44 @@ def _load_unicode_data(self):
 
         self.general_category_public_assigned.append((assigned_start, prev_char_int))
 
+        # Noncharacters can never be part of a combining character sequence either.
+        for i in range(0xFDD0, 0xFDF0):
+            self.not_in_ccs.append(i)
+        for prefix in range(0, 0x11):
+            shifted = prefix << 16
+            self.not_in_ccs.append(shifted | 0xFFFE)
+            self.not_in_ccs.append(shifted | 0xFFFF)
+
+        self.not_in_ccs.sort()
+
+    def _load_default_ignorable_marks(self):
+        default_ignorable_cps = set()
+
+        single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
+        multiple = re.compile(
+            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
+        )
+
+        for line in self._fetch("DerivedCoreProperties.txt").splitlines():
+            raw_data = None  # (low, high)
+            if match := single.match(line):
+                raw_data = (match.group(1), match.group(1))
+            elif match := multiple.match(line):
+                raw_data = (match.group(1), match.group(2))
+            else:
+                continue
+            low = int(raw_data[0], 16)
+            high = int(raw_data[1], 16)
+            for cp in range(low, high + 1):
+                default_ignorable_cps.add(cp)
+
+        self.default_ignorable_marks = []
+        for cp in self.general_category_mark:
+            if cp in default_ignorable_cps:
+                self.default_ignorable_marks.append(cp)
+
+        self.default_ignorable_marks.sort()
+
     def _load_cjk_compat_ideograph_variants(self):
         for line in self._fetch("StandardizedVariants.txt").splitlines():
             strip_comments = line.split('#', 1)[0].strip()
@@ -454,7 +503,7 @@ def gen_combining_mark(general_category_mark, out):
 
 def gen_public_assigned(general_category_public_assigned, out):
     # This could be done as a hash but the table is somewhat small.
-    out.write("#[inline]\n")
+    out.write("\n#[inline]\n")
     out.write("pub fn is_public_assigned(c: char) -> bool {\n")
     out.write("    match c {\n")
@@ -476,6 +525,66 @@ def gen_public_assigned(general_category_public_assigned, out):
     out.write("}\n")
     out.write("\n")
 
+def gen_not_in_ccs(not_in_ccs, out):
+    # List of codepoints to list of ranges
+    range_list = []
+    for cp in not_in_ccs:
+        if len(range_list) != 0 and range_list[-1][1] == cp - 1:
+            range_list[-1] = (range_list[-1][0], cp)
+        else:
+            range_list.append((cp, cp))
+
+    out.write("\n#[inline]\n")
+    out.write("pub fn not_in_ccs(c: char) -> bool {\n")
+    out.write("    match c {\n")
+
+    start = True
+    for first, last in range_list:
+        if start:
+            out.write("        ")
+            start = False
+        else:
+            out.write("\n        | ")
+        if first == last:
+            out.write("'\\u{%s}'" % hexify(first))
+        else:
+            out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
+    out.write(" => true,\n")
+
+    out.write("        _ => false,\n")
+    out.write("    }\n")
+    out.write("}\n")
+
+def gen_default_ignorable_mark(default_ignorable_marks, out):
+    # List of codepoints to list of ranges
+    range_list = []
+    for cp in default_ignorable_marks:
+        if len(range_list) != 0 and range_list[-1][1] == cp - 1:
+            range_list[-1] = (range_list[-1][0], cp)
+        else:
+            range_list.append((cp, cp))
+
+    out.write("\n#[inline]\n")
+    out.write("pub fn is_default_ignorable_mark(c: char) -> bool {\n")
+    out.write("    match c {\n")
+
+    start = True
+    for first, last in range_list:
+        if start:
+            out.write("        ")
+            start = False
+        else:
+            out.write("\n        | ")
+        if first == last:
+            out.write("'\\u{%s}'" % hexify(first))
+        else:
+            out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
+    out.write(" => true,\n")
+
+    out.write("        _ => false,\n")
+    out.write("    }\n")
+    out.write("}\n")
+
 def gen_stream_safe(leading, trailing, out):
     # This could be done as a hash but the table is very small.
     out.write("#[inline]\n")
@@ -602,6 +711,10 @@ def minimal_perfect_hash(d):
         gen_public_assigned(data.general_category_public_assigned, out)
         out.write("\n")
 
+        gen_not_in_ccs(data.not_in_ccs, out)
+
+        gen_default_ignorable_mark(data.default_ignorable_marks, out)
+
         gen_nfc_qc(data.norm_props, out)
         out.write("\n")
 
diff --git a/src/correct_ccs.rs b/src/correct_ccs.rs
new file mode 100644
index 0000000..7395b88
--- /dev/null
+++ b/src/correct_ccs.rs
@@ -0,0 +1,177 @@
+#[cfg(not(feature = "std"))]
+use alloc::collections::VecDeque;
+use core::iter::FusedIterator;
+#[cfg(feature = "std")]
+use std::collections::VecDeque;
+
+use crate::{lookups, tables};
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum CcsKind {
+    /// A CCS base character (graphic character other than combining mark).
+    Base,
+
+    /// A combining character other than a `Default_Ignorable_Code_Point`.
+    NonIgnorableCombining,
+
+    /// A default-ignorable combining character, ZWJ, or ZWNJ.
+    IgnorableCombining,
+}
+
+impl CcsKind {
+    fn of(c: char) -> Option<Self> {
+        if c == '\u{200C}' || c == '\u{200D}' {
+            // ZWNJ || ZWJ
+            Some(CcsKind::IgnorableCombining)
+        } else if lookups::is_combining_mark(c) {
+            if tables::is_default_ignorable_mark(c) {
+                Some(CcsKind::IgnorableCombining)
+            } else {
+                Some(CcsKind::NonIgnorableCombining)
+            }
+        } else if tables::not_in_ccs(c) {
+            None
+        } else {
+            Some(CcsKind::Base)
+        }
+    }
+}
+
+/// An iterator over the string that corrects
+/// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
+/// by inserting U+00A0 NO-BREAK SPACE in front of them.
+///
+/// For the purposes of this iterator, private use characters,
+/// as well as unassigned codepoints other than noncharacters,
+/// are considered valid base characters,
+/// so combining character sequences that start with such will not be modified.
+///
+/// In addition, combining character sequences that consist entirely of `Default_Ignorable_Code_Point`s
+/// will not be modified. (Because of this, this iterator may buffer up to the entire length of its input;
+/// it is *not* "stream-safe" *even if* used with [`StreamSafe`][crate::StreamSafe]).
+#[derive(Clone, Debug)]
+pub struct CorrectDefectiveCcs<I> {
+    /// Whether the last character emitted was part of a CCS.
+    in_ccs: bool,
+    buffer: VecDeque<Option<char>>,
+    /// Whether the last character in `buffer` is part of a CCS.
+    /// (Updated only when `in_ccs` is set from false to true).
+    end_of_buffer_in_ccs: bool,
+    iter: I,
+}
+
+impl<I: Iterator<Item = char>> Iterator for CorrectDefectiveCcs<I> {
+    type Item = char;
+
+    fn next(&mut self) -> Option<char> {
+        if self.in_ccs {
+            if let Some(c) = self.buffer.pop_front() {
+                // Empty buffer
+
+                if self.buffer.is_empty() {
+                    self.in_ccs = self.end_of_buffer_in_ccs;
+                }
+                c
+            } else {
+                // Forward from inner iterator
+
+                let c = self.iter.next();
+                if c.map_or(true, tables::not_in_ccs) {
+                    self.in_ccs = false;
+                }
+                c
+            }
+        } else {
+            if self.buffer.is_empty() {
+                // We don't have a buffer of default ignorable combining characters built up
+
+                let c = self.iter.next()?;
+                match CcsKind::of(c) {
+                    // Character not in CCS, just forward it
+                    None => return Some(c),
+
+                    // Character starts non-defective CCS,
+                    // label ourselves as in CCS and forward it
+                    Some(CcsKind::Base) => {
+                        self.in_ccs = true;
+                        return Some(c);
+                    }
+
+                    // Character starts defective CCS and is not default-ignorable.
+                    // Put it in the buffer to emit on next iteration,
+                    // mark ourselves as in CCS,
+                    // and emit NO-BREAK SPACE
+                    Some(CcsKind::NonIgnorableCombining) => {
+                        self.in_ccs = true;
+                        self.end_of_buffer_in_ccs = true;
+                        self.buffer.push_back(Some(c));
+                        return Some('\u{00A0}'); // NO-BREAK SPACE
+                    }
+
+                    // Character starts defective CCS and is default-ignorable.
+                    // Put it in the buffer, and fall through to loop below
+                    // to find out whether we emit a NO-BREAK SPACE first.
+                    Some(CcsKind::IgnorableCombining) => {
+                        self.buffer.push_back(Some(c));
+                    }
+                }
+            }
+
+            loop {
+                // We do have a buffer of default ignorable combining characters built up,
+                // and we need to figure out whether to emit a NO-BREAK SPACE first.
+
+                let c = self.iter.next();
+                match c.and_then(CcsKind::of) {
+                    // Inner iterator yielded character outside CCS (or `None`).
+                    // Emit the built-up buffer with no leading NO-BREAK SPACE.
+                    None => {
+                        self.in_ccs = true;
+                        self.end_of_buffer_in_ccs = false;
+                        let ret = self.buffer.pop_front().unwrap();
+                        self.buffer.push_back(c);
+                        return ret;
+                    }
+
+                    // Inner iterator yielded character that starts a new CCS.
+                    // Emit the built-up buffer with no leading NO-BREAK SPACE.
+                    Some(CcsKind::Base) => {
+                        self.in_ccs = true;
+                        self.end_of_buffer_in_ccs = true;
+                        let ret = self.buffer.pop_front().unwrap();
+                        self.buffer.push_back(c);
+                        return ret;
+                    }
+
+                    // Inner iterator yielded non-ignorable combining character.
+                    // Emit the built-up buffer with leading NO-BREAK SPACE.
+                    Some(CcsKind::NonIgnorableCombining) => {
+                        self.in_ccs = true;
+                        self.end_of_buffer_in_ccs = true;
+                        self.buffer.push_back(c);
+                        return Some('\u{00A0}'); // NO-BREAK SPACE
+                    }
+
+                    // Inner iterator yielded ignorable combining character.
+                    // Add it to the buffer, don't emit anything.
+                    Some(CcsKind::IgnorableCombining) => {
+                        self.buffer.push_back(c);
+                    }
+                }
+            }
+        }
+    }
+}
+
+impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for CorrectDefectiveCcs<I> {}
+
+impl<I: Iterator<Item = char>> CorrectDefectiveCcs<I> {
+    pub(crate) fn new(iter: I) -> Self {
+        Self {
+            in_ccs: false,
+            buffer: VecDeque::new(),
+            end_of_buffer_in_ccs: false,
+            iter,
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 8cf4c4a..78519d2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -52,6 +52,7 @@ extern crate core;
 
 extern crate tinyvec;
 
+pub use crate::correct_ccs::CorrectDefectiveCcs;
 pub use crate::decompose::Decompositions;
 pub use crate::quick_check::{
     is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
@@ -64,6 +65,7 @@ pub use crate::stream_safe::StreamSafe;
 pub use crate::tables::UNICODE_VERSION;
 use core::{option, str::Chars};
 
+mod correct_ccs;
 mod decompose;
 mod lookups;
 mod normalize;
@@ -130,6 +132,19 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
     /// An Iterator over the string with Conjoining Grapheme Joiner characters
     /// inserted according to the Stream-Safe Text Process (UAX15-D4)
     fn stream_safe(self) -> StreamSafe<I>;
+
+    /// An iterator over the string with
+    /// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
+    /// corrected via the insertion of U+00A0 NO-BREAK SPACE.
+    ///
+    /// Sequences starting with a private use character or an unassigned codepoint that is not a noncharacter
+    /// are not corrected. Additionally, combining character sequences consisting entirely of
+    /// [default-ignorable code points](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I8.1.40715)
+    /// are also left untouched. Handling this last case may require the iterator
+    /// to buffer up to the entire length of the input;
+    /// this iterator is therefore *not* "stream safe"
+    /// *even if* used in combination with [`stream_safe()`][UnicodeNormalization::stream_safe].
+    fn correct_defective_ccs(self) -> CorrectDefectiveCcs<I>;
 }
 
 impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
@@ -162,6 +177,11 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
     fn stream_safe(self) -> StreamSafe<Chars<'a>> {
         StreamSafe::new(self.chars())
     }
+
+    #[inline]
+    fn correct_defective_ccs(self) -> CorrectDefectiveCcs<Chars<'a>> {
+        CorrectDefectiveCcs::new(self.chars())
+    }
 }
 
 impl UnicodeNormalization<option::IntoIter<char>> for char {
@@ -194,6 +214,11 @@ impl UnicodeNormalization<option::IntoIter<char>> for char {
     fn stream_safe(self) -> StreamSafe<option::IntoIter<char>> {
         StreamSafe::new(Some(self).into_iter())
     }
+
+    #[inline]
+    fn correct_defective_ccs(self) -> CorrectDefectiveCcs<option::IntoIter<char>> {
+        CorrectDefectiveCcs::new(Some(self).into_iter())
+    }
 }
 
 impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
@@ -226,4 +251,9 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
     fn stream_safe(self) -> StreamSafe<I> {
         StreamSafe::new(self)
     }
+
+    #[inline]
+    fn correct_defective_ccs(self) -> CorrectDefectiveCcs<I> {
+        CorrectDefectiveCcs::new(self)
+    }
 }
diff --git a/src/tables.rs b/src/tables.rs
index 6c00bee..f563fb2 100644
--- a/src/tables.rs
+++ b/src/tables.rs
@@ -33570,6 +33570,7 @@ pub(crate) const COMBINING_MARK_KV: &[u32] = &[
 ];
 
+
 #[inline]
 pub fn is_public_assigned(c: char) -> bool {
     match c {
@@ -34284,6 +34285,68 @@ pub fn is_public_assigned(c: char) -> bool {
     }
 }
 
+
+#[inline]
+pub fn not_in_ccs(c: char) -> bool {
+    match c {
+        '\u{0000}'..='\u{001F}'
+        | '\u{007F}'..='\u{009F}'
+        | '\u{00AD}'
+        | '\u{0600}'..='\u{0605}'
+        | '\u{061C}'
+        | '\u{06DD}'
+        | '\u{070F}'
+        | '\u{0890}'..='\u{0891}'
+        | '\u{08E2}'
+        | '\u{180E}'
+        | '\u{200B}'
+        | '\u{200E}'..='\u{200F}'
+        | '\u{2028}'..='\u{202E}'
+        | '\u{2060}'..='\u{2064}'
+        | '\u{2066}'..='\u{206F}'
+        | '\u{FDD0}'..='\u{FDEF}'
+        | '\u{FEFF}'
+        | '\u{FFF9}'..='\u{FFFB}'
+        | '\u{FFFE}'..='\u{FFFF}'
+        | '\u{110BD}'
+        | '\u{110CD}'
+        | '\u{13430}'..='\u{1343F}'
+        | '\u{1BCA0}'..='\u{1BCA3}'
+        | '\u{1D173}'..='\u{1D17A}'
+        | '\u{1FFFE}'..='\u{1FFFF}'
+        | '\u{2FFFE}'..='\u{2FFFF}'
+        | '\u{3FFFE}'..='\u{3FFFF}'
+        | '\u{4FFFE}'..='\u{4FFFF}'
+        | '\u{5FFFE}'..='\u{5FFFF}'
+        | '\u{6FFFE}'..='\u{6FFFF}'
+        | '\u{7FFFE}'..='\u{7FFFF}'
+        | '\u{8FFFE}'..='\u{8FFFF}'
+        | '\u{9FFFE}'..='\u{9FFFF}'
+        | '\u{AFFFE}'..='\u{AFFFF}'
+        | '\u{BFFFE}'..='\u{BFFFF}'
+        | '\u{CFFFE}'..='\u{CFFFF}'
+        | '\u{DFFFE}'..='\u{DFFFF}'
+        | '\u{E0001}'
+        | '\u{E0020}'..='\u{E007F}'
+        | '\u{EFFFE}'..='\u{EFFFF}'
+        | '\u{FFFFE}'..='\u{FFFFF}'
+        | '\u{10FFFE}'..='\u{10FFFF}' => true,
+        _ => false,
+    }
+}
+
+#[inline]
+pub fn is_default_ignorable_mark(c: char) -> bool {
+    match c {
+        '\u{034F}'
+        | '\u{17B4}'..='\u{17B5}'
+        | '\u{180B}'..='\u{180D}'
+        | '\u{180F}'
+        | '\u{FE00}'..='\u{FE0F}'
+        | '\u{E0100}'..='\u{E01EF}' => true,
+        _ => false,
+    }
+}
 #[inline]
 #[allow(ellipsis_inclusive_range_patterns)]
 pub fn qc_nfc(c: char) -> IsNormalized {
diff --git a/tests/correct_defective_ccs.rs b/tests/correct_defective_ccs.rs
new file mode 100644
index 0000000..552b712
--- /dev/null
+++ b/tests/correct_defective_ccs.rs
@@ -0,0 +1,29 @@
+use unicode_normalization::UnicodeNormalization;
+
+macro_rules! check_ccs {
+    ($input: expr, $expected_out: expr) => {
+        assert_eq!(
+            $input.correct_defective_ccs().collect::<String>(),
+            $expected_out
+        )
+    };
+}
+
+#[test]
+fn defective_ccs() {
+    check_ccs!("", "");
+    check_ccs!("abcde", "abcde");
+    check_ccs!("a\u{0301}bcde", "a\u{0301}bcde");
+    check_ccs!("\u{0301}bcde", "\u{00A0}\u{0301}bcde");
+    check_ccs!("\u{200C}\u{0301}bcde", "\u{00A0}\u{200C}\u{0301}bcde");
+    check_ccs!("\u{200C}bcde", "\u{200C}bcde");
+    check_ccs!("\u{180F}bcde", "\u{180F}bcde");
+    check_ccs!("\u{FFFF}\u{0301}bcde", "\u{FFFF}\u{00A0}\u{0301}bcde");
+    check_ccs!("\u{10FFFD}\u{0301}bcde", "\u{10FFFD}\u{0301}bcde");
+    check_ccs!("\u{180F}\u{180F}\u{180F}", "\u{180F}\u{180F}\u{180F}");
+    check_ccs!("\u{180F}\u{180F}\u{180F}a", "\u{180F}\u{180F}\u{180F}a");
+    check_ccs!(
+        "\u{180F}\u{180F}\u{180F}\u{0301}",
+        "\u{00A0}\u{180F}\u{180F}\u{180F}\u{0301}"
+    );
+}
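
The sketch below is not part of the patch: it is a minimal usage example, assuming the patch is applied to the unicode-normalization crate, showing how the new `correct_defective_ccs()` adapter is called and how it chains with the crate's existing iterator adapters. The expected strings mirror cases from the new `tests/correct_defective_ccs.rs` above.

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    // A string that already starts with a base character passes through unchanged.
    let ok: String = "a\u{0301}bcde".correct_defective_ccs().collect();
    assert_eq!(ok, "a\u{0301}bcde");

    // A defective CCS (leading U+0301 COMBINING ACUTE ACCENT) gets a
    // U+00A0 NO-BREAK SPACE inserted in front of it.
    let fixed: String = "\u{0301}bcde".correct_defective_ccs().collect();
    assert_eq!(fixed, "\u{00A0}\u{0301}bcde");

    // The adapter is an ordinary `Iterator<Item = char>`, so it can be chained
    // with the other adapters from this crate; U+00A0 followed by U+0301 has no
    // precomposed form, so NFC leaves the corrected sequence as-is here.
    let composed: String = "\u{0301}bcde".correct_defective_ccs().nfc().collect();
    assert_eq!(composed, "\u{00A0}\u{0301}bcde");
}
```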