ICU-13219 add -u-dx- support to BreakIterator

ICU-13219 Fix ICU-13219 add -u-dx- support to BreakIterator ICU-13219 Fix ICU-13219 update
unicode-org · Oct 21, 2024 · ba1260f · ba1260f
1 parent 99ca2ad
commit ba1260f
Show file tree

Hide file tree

Showing 12 changed files with 401 additions and 30 deletions.
diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp
@@ -55,7 +55,7 @@ U_NAMESPACE_BEGIN
 // -------------------------------------
 
 BreakIterator*
-BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
+BreakIterator::buildInstance(const Locale& loc, const char *type, bool checkDX, UErrorCode &status)
 {
     char fnbuff[256];
     char ext[4]={'\0'};
@@ -116,8 +116,22 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
         return nullptr;
     }
 
-    // Create a RuleBasedBreakIterator
-    result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status);
+    {
+        const char* dxs = nullptr;
+        CharString dxsValue; // keep on the stack till we no longer need dxs.
+        // For a word or line instance (checkDX is true), try to read the "dx" locale keyword value.
+        if (checkDX) {
+            UErrorCode dxsStatus = U_ZERO_ERROR;
+            CharStringByteSink dxsSink(&dxsValue);
+            loc.getKeywordValue("dx", dxsSink, dxsStatus);
+            if (U_SUCCESS(dxsStatus) && dxsValue.length() > 0) {
+                dxs = dxsValue.data();
+            }
+        }
+
+        // Create a RuleBasedBreakIterator
+        result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, dxs, status);
+    }
 
     // If there is a result, set the valid locale and actual locale, and the kind
     if (U_SUCCESS(status) && result != nullptr) {
@@ -421,14 +435,14 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
     case UBRK_CHARACTER:
         {
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER);
-            result = BreakIterator::buildInstance(loc, "grapheme", status);
+            result = BreakIterator::buildInstance(loc, "grapheme", false, status);
             UTRACE_EXIT_STATUS(status);
         }
         break;
     case UBRK_WORD:
         {
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD);
-            result = BreakIterator::buildInstance(loc, "word", status);
+            result = BreakIterator::buildInstance(loc, "word", true, status);
             UTRACE_EXIT_STATUS(status);
         }
         break;
@@ -451,7 +465,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
                     uprv_strcat(lb_lw, value.data());
                 }
             }
-            result = BreakIterator::buildInstance(loc, lb_lw, status);
+            result = BreakIterator::buildInstance(loc, lb_lw, true, status);
 
             UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw);
             UTRACE_EXIT_STATUS(status);
@@ -460,7 +474,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
     case UBRK_SENTENCE:
         {
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE);
-            result = BreakIterator::buildInstance(loc, "sentence", status);
+            result = BreakIterator::buildInstance(loc, "sentence", false, status);
 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
             char ssKeyValue[kKeyValueLenMax] = {0};
             UErrorCode kvStatus = U_ZERO_ERROR;
@@ -479,7 +493,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
     case UBRK_TITLE:
         {
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE);
-            result = BreakIterator::buildInstance(loc, "title", status);
+            result = BreakIterator::buildInstance(loc, "title", false, status);
             UTRACE_EXIT_STATUS(status);
         }
         break;

diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp
@@ -25,6 +25,7 @@
 #include "unicode/uchriter.h"
 #include "unicode/uclean.h"
 #include "unicode/udata.h"
+#include "unicode/uniset.h"
 
 #include "brkeng.h"
 #include "ucln_cmn.h"
@@ -89,9 +90,45 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
 //
 //-------------------------------------------------------------------------------
 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking,
+                                               const char* dxs,
         UErrorCode &status) : RuleBasedBreakIterator(udm, status)
 {
     fIsPhraseBreaking = isPhraseBreaking;
+    if (U_FAILURE(status)) {
+        return;
+    }
+    if (dxs != nullptr) {
+        size_t length = uprv_strlen(dxs);
+        // The value must be 4-letter script codes joined by '-', so its length % 5 must be 4.
+        if (length % 5 != 4) {
+            status = U_ILLEGAL_ARGUMENT_ERROR;
+            return;
+        }
+        // The code Zyyy (Common) can be specified to exclude all scripts, if
+        // and only if it is the only SCRIPT_CODE value specified. If it is
+        // not the only script code, Zyyy has the normal meaning: excluding
+        // Script_Extension=Common.
+        if (uprv_strcmp(dxs, "zyyy") == 0) {
+            fDX = (new UnicodeSet(UnicodeSet::MIN_VALUE, UnicodeSet::MAX_VALUE))->freeze();
+            return;
+        }
+        size_t items = 1 + length / 5;
+        // Change from "thai" to "[[:scx=thai:]]" or
+        // "thai-arab" to "[[:scx=thai:][:scx=arab:]]"
+        UnicodeString udxs(u'[');
+        for (size_t i = 0; i < items; i++) {
+            if (i > 0 && dxs[i*5-1] != u'-') {
+                status = U_ILLEGAL_ARGUMENT_ERROR;
+                return;
+            }
+            // Dictionary-based break iterators will ignore each character whose
+            // Script_Extension value set intersects with the DX value set.
+            udxs.append(u"[:scx=", -1)
+                .append(UnicodeString(dxs + i * 5, 4, US_INV))
+                .append(u":]", -1);
+        }
+        fDX = (new UnicodeSet(udxs.append(u']'), status))->freeze();
+    }
 }
 
 //
@@ -198,7 +235,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator()
  * Simple Constructor with an error code.
  * Handles common initialization for all other constructors.
  */
-RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
+RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) : fDX(nullptr) {
     UErrorCode ec = U_ZERO_ERROR;
     if (status == nullptr) {
         status = &ec;
@@ -212,6 +249,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
     }
     fDictionaryCache = lpDictionaryCache.orphan();
     fBreakCache = lpBreakCache.orphan();
+    fDX = nullptr;
 
 #ifdef RBBI_DEBUG
     static UBool debugInitDone = false;
@@ -261,6 +299,9 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() {
     delete fDictionaryCache;
     fDictionaryCache = nullptr;
 
+    delete fDX;
+    fDX = nullptr;
+
     delete fLanguageBreakEngines;
     fLanguageBreakEngines = nullptr;
 
@@ -333,6 +374,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
     //       the assumption that the current position is on a rule boundary.
     fBreakCache->reset(fPosition, fRuleStatusIndex);
     fDictionaryCache->reset();
+    fDX = (that.fDX == nullptr) ? nullptr : that.fDX->cloneAsThawed();
 
     return *this;
 }
@@ -381,11 +423,15 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
         return false;
     }
 
+    // Unequal if only one has fDX or the sets differ. NOTE(review): if exactly one is null, the test below dereferences a null pointer.
+    if (!((that2.fDX == nullptr && fDX == nullptr) || *that2.fDX == *fDX)) {
+        return false;
+    }
     if (that2.fData == fData ||
         (fData != nullptr && that2.fData != nullptr && *that2.fData == *fData)) {
             // The two break iterators are using the same rules.
             return true;
-        }
+    }
     return false;
 }
 
@@ -1298,6 +1344,10 @@ RuleBasedBreakIterator::getRules() const {
     }
 }
 
+bool RuleBasedBreakIterator::excludedFromDictionaryBreak(int32_t c) {
+    return fDX != nullptr && fDX->contains(c);
+}
+
 U_NAMESPACE_END
 
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp
@@ -156,17 +156,21 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
             break;
         }
 
-        // We now have a dictionary character. Get the appropriate language object
-        // to deal with it.
-        const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
-            c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
-
-        // Ask the language object if there are any breaks. It will add them to the cache and
-        // leave the text pointer on the other side of its range, ready to search for the next one.
-        if (lbe != nullptr) {
-            foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
+        // We now have a dictionary character.
+        // If dx (dictionary break script exclusions) covers it, step past this code point without consulting a break engine.
+        if (fBI->excludedFromDictionaryBreak(c)) {
+            utext_next32(text);
+        } else {
+            // Get the appropriate language object to deal with it.
+            const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
+                c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
+
+            // Ask the language object if there are any breaks. It will add them to the cache and
+            // leave the text pointer on the other side of its range, ready to search for the next one.
+            if (lbe != nullptr) {
+                foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
+            }
         }
-
         // Reload the loop variables for the next go-round
         c = utext_current32(text);
         category = ucptrie_get(fBI->fData->fTrie, c);

diff --git a/icu4c/source/common/unicode/brkiter.h b/icu4c/source/common/unicode/brkiter.h
@@ -622,7 +622,7 @@ class U_COMMON_API BreakIterator : public UObject {
     virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;
 
  private:
-    static BreakIterator* buildInstance(const Locale& loc, const char *type, UErrorCode& status);
+    static BreakIterator* buildInstance(const Locale& loc, const char *type, bool checkDX, UErrorCode& status);
     static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
     static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
 

diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h
@@ -17,6 +17,7 @@
 #define RBBI_H
 
 #include "unicode/utypes.h"
+#include "unicode/uniset.h"
 
 #if U_SHOW_CPLUSPLUS_API
 
@@ -42,6 +43,7 @@ struct RBBIDataHeader;
 class  RBBIDataWrapper;
 class  UnhandledEngine;
 class  UStack;
+class  UnicodeSet;
 
 
 #ifndef U_HIDE_INTERNAL_API
@@ -221,6 +223,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
      */
     UBool fIsPhraseBreaking = false;
 
+    /**
+     * Set of characters excluded from dictionary-based breaking (built from the "dx" keyword value); nullptr when there are no exclusions.
+     */
+    UnicodeSet* fDX = nullptr;
+private:
+
     //=======================================================================
     // constructors
     //=======================================================================
@@ -246,11 +254,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
      *        which will be responsible for closing it when it is no longer needed.
      * @param status Information on any errors encountered.
      * @param isPhraseBreaking true if phrase based breaking is required, otherwise false.
+     * @param dxs nullptr, or the "dx" (dictionary break script exclusions) value: 4-letter script codes joined by '-', e.g. "thai-arab".
      * @see udata_open
      * @see #getBinaryRules
      * @internal (private)
      */
-    RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);
+    RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, const char* dxs, UErrorCode &status);
 
     /** @internal */
     friend class RBBIRuleBuilder;
@@ -764,6 +773,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
      * signature)
      */
 
+    /*
+     * Check whether the character should be excluded from dictionary-based text breaking.
+     * @internal (private)
+     */
+    bool excludedFromDictionaryBreak(int32_t c);
+
     typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
 
     template<typename RowType, PTrieFunc trieFunc>