Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add API to correct defective combining character sequences
Browse files Browse the repository at this point in the history
Jules-Bertholet committed Mar 2, 2024
1 parent a6a221a commit b83714b
Showing 5 changed files with 413 additions and 1 deletion.
115 changes: 114 additions & 1 deletion scripts/unicode.py
Original file line number Diff line number Diff line change
@@ -19,6 +19,7 @@
# Since this should not require frequent updates, we just store this
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
import collections
import re
import urllib.request

UNICODE_VERSION = "15.1.0"
@@ -66,6 +67,8 @@
class UnicodeData(object):
def __init__(self):
self._load_unicode_data()
self._load_default_ignorable_marks()

self.norm_props = self._load_norm_props()
self.norm_tests = self._load_norm_tests()

@@ -100,6 +103,11 @@ def _load_unicode_data(self):
self.general_category_mark = []
self.general_category_public_assigned = []

# Characters that cannot be part of a combining character sequence:
# control characters, format characters other than ZWJ and ZWNJ,
# the line and paragraph separators, and noncharacters.
self.not_in_ccs = []

assigned_start = 0;
prev_char_int = -1;
prev_name = "";
@@ -125,6 +133,9 @@ def _load_unicode_data(self):
if category == 'M' or 'M' in expanded_categories.get(category, []):
self.general_category_mark.append(char_int)

if category in ['Cc', 'Cf', 'Zl', 'Zp'] and char_int not in [0x200C, 0x200D]:
self.not_in_ccs.append(char_int)

assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
if category not in ['Co', 'Cs']:
if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
@@ -135,6 +146,44 @@ def _load_unicode_data(self):

self.general_category_public_assigned.append((assigned_start, prev_char_int))

# Mark noncharacters as nongraphic
for i in range(0xFDD0, 0xFDF0):
self.not_in_ccs.append(i)
for prefix in range(0, 0x11):
shifted = prefix << 16
self.not_in_ccs.append(shifted | 0xFFFE)
self.not_in_ccs.append(shifted | 0xFFFF)

self.not_in_ccs.sort()

def _load_default_ignorable_marks(self):
    """Populate `self.default_ignorable_marks`.

    Reads `DerivedCoreProperties.txt` through `self._fetch`, gathers every
    code point listed there as `Default_Ignorable_Code_Point` (both the
    single-code-point and the `XXXX..YYYY` range forms), and stores the
    sorted subset of `self.general_category_mark` that carries the property.
    """
    single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
    multiple = re.compile(
        r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
    )

    ignorable = set()
    for entry in self._fetch("DerivedCoreProperties.txt").splitlines():
        hit = single.match(entry)
        if hit is not None:
            # A lone code point is treated as a one-element range.
            first_hex = last_hex = hit.group(1)
        else:
            hit = multiple.match(entry)
            if hit is None:
                # Comment, blank line, or a different property: not relevant.
                continue
            first_hex, last_hex = hit.group(1), hit.group(2)
        # The upper bound in the data file is inclusive.
        ignorable.update(range(int(first_hex, 16), int(last_hex, 16) + 1))

    self.default_ignorable_marks = sorted(
        cp for cp in self.general_category_mark if cp in ignorable
    )

def _load_cjk_compat_ideograph_variants(self):
for line in self._fetch("StandardizedVariants.txt").splitlines():
strip_comments = line.split('#', 1)[0].strip()
@@ -454,7 +503,7 @@ def gen_combining_mark(general_category_mark, out):

def gen_public_assigned(general_category_public_assigned, out):
# This could be done as a hash but the table is somewhat small.
out.write("#[inline]\n")
out.write("\n#[inline]\n")
out.write("pub fn is_public_assigned(c: char) -> bool {\n")
out.write(" match c {\n")

@@ -476,6 +525,66 @@ def gen_public_assigned(general_category_public_assigned, out):
out.write("}\n")
out.write("\n")

def gen_not_in_ccs(not_in_ccs, out):
    """Write the Rust predicate `not_in_ccs(c: char) -> bool` to `out`.

    `not_in_ccs` is an iterable of integer code points; neighbouring entries
    that differ by exactly one are merged into a single inclusive range
    pattern. `out` only needs a `write(str)` method.
    """
    # Fold the code point list into inclusive (first, last) runs.
    runs = []
    for cp in not_in_ccs:
        if runs and cp == runs[-1][1] + 1:
            runs[-1] = (runs[-1][0], cp)
        else:
            runs.append((cp, cp))

    out.write("\n#[inline]\n")
    out.write("pub fn not_in_ccs(c: char) -> bool {\n")
    out.write(" match c {\n")

    # The first alternative gets plain leading padding; every later one
    # starts on a new line and is joined with `|`.
    separator = " "
    for first, last in runs:
        out.write(separator)
        separator = "\n | "
        if first == last:
            out.write("'\\u{%s}'" % hexify(first))
        else:
            out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
    out.write(" => true,\n")

    out.write(" _ => false,\n")
    out.write(" }\n")
    out.write("}\n")

def gen_default_ignorable_mark(default_ignorable_marks, out):
    """Write the Rust predicate `is_default_ignorable_mark(c: char) -> bool` to `out`.

    `default_ignorable_marks` is an iterable of integer code points;
    neighbouring entries that differ by exactly one are merged into a single
    inclusive range pattern. `out` only needs a `write(str)` method.
    """
    # Fold the code point list into inclusive (first, last) runs.
    runs = []
    for cp in default_ignorable_marks:
        if runs and cp == runs[-1][1] + 1:
            runs[-1] = (runs[-1][0], cp)
        else:
            runs.append((cp, cp))

    out.write("\n#[inline]\n")
    out.write("pub fn is_default_ignorable_mark(c: char) -> bool {\n")
    out.write(" match c {\n")

    # The first alternative gets plain leading padding; every later one
    # starts on a new line and is joined with `|`.
    separator = " "
    for first, last in runs:
        out.write(separator)
        separator = "\n | "
        if first == last:
            out.write("'\\u{%s}'" % hexify(first))
        else:
            out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
    out.write(" => true,\n")

    out.write(" _ => false,\n")
    out.write(" }\n")
    out.write("}\n")

def gen_stream_safe(leading, trailing, out):
# This could be done as a hash but the table is very small.
out.write("#[inline]\n")
@@ -602,6 +711,10 @@ def minimal_perfect_hash(d):
gen_public_assigned(data.general_category_public_assigned, out)
out.write("\n")

gen_not_in_ccs(data.not_in_ccs, out)

gen_default_ignorable_mark(data.default_ignorable_marks, out)

gen_nfc_qc(data.norm_props, out)
out.write("\n")

177 changes: 177 additions & 0 deletions src/correct_ccs.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#[cfg(not(feature = "std"))]
use alloc::collections::VecDeque;
use core::iter::FusedIterator;
#[cfg(feature = "std")]
use std::collections::VecDeque;

use crate::{lookups, tables};

/// The role a character can play inside a combining character sequence (CCS).
///
/// Characters that cannot take part in a CCS at all have no variant here;
/// `CcsKind::of` reports those as `None`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum CcsKind {
/// A CCS base character (graphic character other than combining mark).
Base,

/// A combining character other than a `Default_Ignorable_Code_Point`.
NonIgnorableCombining,

/// A default-ignorable combining character, ZWJ, or ZWNJ.
IgnorableCombining,
}

impl CcsKind {
    /// Classifies `c` by the role it can play in a combining character sequence.
    ///
    /// Returns `None` when `c` is neither ZWNJ/ZWJ nor a combining mark and
    /// `tables::not_in_ccs` reports that it cannot be part of a CCS.
    fn of(c: char) -> Option<Self> {
        match c {
            // ZWNJ (U+200C) and ZWJ (U+200D) are checked before any table lookup.
            '\u{200C}' | '\u{200D}' => Some(CcsKind::IgnorableCombining),

            // Combining marks are split by whether they are default-ignorable.
            _ if lookups::is_combining_mark(c) => {
                let kind = if tables::is_default_ignorable_mark(c) {
                    CcsKind::IgnorableCombining
                } else {
                    CcsKind::NonIgnorableCombining
                };
                Some(kind)
            }

            // Not a combining character and excluded from CCSs altogether.
            _ if tables::not_in_ccs(c) => None,

            // Everything else is accepted as a base character.
            _ => Some(CcsKind::Base),
        }
    }
}

/// An iterator over the string that corrects
/// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
/// by inserting U+00A0 NO-BREAK SPACE in front of them.
///
/// For the purposes of this iterator, private use characters,
/// as well as unassigned codepoints other than noncharacters,
/// are considered valid base characters,
/// so combining character sequences that start with such will not be modified.
///
/// In addition, combining character sequences that consist entirely of `Default_Ignorable_Code_Point`s
/// will not be modified. (Because of this, this iterator may buffer up to the entire length of its input;
/// it is *not* "stream-safe" *even if* used with [`StreamSafe`][crate::StreamSafe]).
#[derive(Clone, Debug)]
pub struct CorrectDefectiveCcs<I> {
/// Whether the last character emitted was part of a CCS.
in_ccs: bool,
/// Items already pulled from `iter` but not yet emitted, oldest first.
/// An entry is `None` when the inner iterator reported the end of its input
/// while earlier characters were still waiting in the buffer.
buffer: VecDeque<Option<char>>,
/// Whether the last character in `buffer` is part of a CCS.
/// (Updated only when `in_ccs` is set from false to true).
/// Copied into `in_ccs` once the buffer has been drained.
end_of_buffer_in_ccs: bool,
/// The wrapped character iterator.
iter: I,
}

impl<I: Iterator<Item = char>> Iterator for CorrectDefectiveCcs<I> {
    type Item = char;

    /// Yields the next output character, inserting U+00A0 NO-BREAK SPACE in
    /// front of a combining character sequence that lacks a base character,
    /// unless that sequence consists only of default-ignorable characters.
    fn next(&mut self) -> Option<Self::Item> {
        const NO_BREAK_SPACE: char = '\u{00A0}';

        if self.in_ccs {
            return match self.buffer.pop_front() {
                // Still draining previously buffered items. A buffered `None`
                // (recorded end of input) is handed out just like a character.
                Some(pending) => {
                    if self.buffer.is_empty() {
                        self.in_ccs = self.end_of_buffer_in_ccs;
                    }
                    pending
                }

                // Nothing buffered: pass the inner iterator straight through.
                // End of input or a character that cannot be in a CCS closes
                // the current sequence.
                None => {
                    let forwarded = self.iter.next();
                    let leaves_ccs = match forwarded {
                        Some(ch) => tables::not_in_ccs(ch),
                        None => true,
                    };
                    if leaves_ccs {
                        self.in_ccs = false;
                    }
                    forwarded
                }
            };
        }

        if self.buffer.is_empty() {
            // No default-ignorable combining characters are pending yet.
            let first = self.iter.next()?;
            match CcsKind::of(first) {
                // Outside any CCS: forward untouched.
                None => return Some(first),

                // A base character opens a well-formed CCS.
                Some(CcsKind::Base) => {
                    self.in_ccs = true;
                    return Some(first);
                }

                // A non-ignorable combining character without a base:
                // emit the NO-BREAK SPACE now and the character itself on the
                // following call.
                Some(CcsKind::NonIgnorableCombining) => {
                    self.in_ccs = true;
                    self.end_of_buffer_in_ccs = true;
                    self.buffer.push_back(Some(first));
                    return Some(NO_BREAK_SPACE);
                }

                // A default-ignorable combining character without a base:
                // hold it back until the lookahead below decides whether a
                // NO-BREAK SPACE is required.
                Some(CcsKind::IgnorableCombining) => self.buffer.push_back(Some(first)),
            }
        }

        // The buffer is non-empty here (either it already was, or the arm
        // above just pushed into it). Look ahead until something other than a
        // default-ignorable combining character shows up.
        loop {
            let upcoming = self.iter.next();
            match upcoming.and_then(CcsKind::of) {
                // Another ignorable combining character: keep collecting.
                Some(CcsKind::IgnorableCombining) => self.buffer.push_back(upcoming),

                // A non-ignorable combining character follows, so the whole
                // buffered run gets a leading NO-BREAK SPACE.
                Some(CcsKind::NonIgnorableCombining) => {
                    self.in_ccs = true;
                    self.end_of_buffer_in_ccs = true;
                    self.buffer.push_back(upcoming);
                    return Some(NO_BREAK_SPACE);
                }

                // End of input, a character outside any CCS (`None`), or a
                // base character starting a new CCS (`Some(Base)`): emit the
                // buffered run as-is. Only a base character leaves the end of
                // the buffer inside a CCS.
                boundary => {
                    self.in_ccs = true;
                    self.end_of_buffer_in_ccs = boundary.is_some();
                    let head = self.buffer.pop_front().unwrap();
                    self.buffer.push_back(upcoming);
                    return head;
                }
            }
        }
    }
}

// Marker impl: the adapter is declared fused whenever the wrapped iterator is.
impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for CorrectDefectiveCcs<I> {}

impl<I> CorrectDefectiveCcs<I> {
pub(crate) fn new(iter: I) -> Self {
Self {
in_ccs: false,
buffer: VecDeque::new(),
end_of_buffer_in_ccs: false,
iter,
}
}
}
Loading

0 comments on commit b83714b

Please sign in to comment.