Skip to content

Commit

Permalink
ICU-13219 add -u-dx- support to BreakIterator
Browse files Browse the repository at this point in the history
ICU-13219 Fix

ICU-13219 add -u-dx- support to BreakIterator

ICU-13219 Fix

ICU-13219 update
  • Loading branch information
FrankYFTang committed Oct 21, 2024
1 parent 99ca2ad commit ba1260f
Show file tree
Hide file tree
Showing 12 changed files with 401 additions and 30 deletions.
30 changes: 22 additions & 8 deletions icu4c/source/common/brkiter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ U_NAMESPACE_BEGIN
// -------------------------------------

BreakIterator*
BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
BreakIterator::buildInstance(const Locale& loc, const char *type, bool checkDX, UErrorCode &status)
{
char fnbuff[256];
char ext[4]={'\0'};
Expand Down Expand Up @@ -116,8 +116,22 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
return nullptr;
}

// Create a RuleBasedBreakIterator
result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status);
{
const char* dxs = nullptr;
CharString dxsValue; // keep on the stack till we no longer need dxs.
// If it is word or line instance, try to get the value for dx
if (checkDX) {
UErrorCode dxsStatus = U_ZERO_ERROR;
CharStringByteSink dxsSink(&dxsValue);
loc.getKeywordValue("dx", dxsSink, dxsStatus);
if (U_SUCCESS(dxsStatus) && dxsValue.length() > 0) {
dxs = dxsValue.data();
}
}

// Create a RuleBasedBreakIterator
result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, dxs, status);
}

// If there is a result, set the valid locale and actual locale, and the kind
if (U_SUCCESS(status) && result != nullptr) {
Expand Down Expand Up @@ -421,14 +435,14 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
case UBRK_CHARACTER:
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER);
result = BreakIterator::buildInstance(loc, "grapheme", status);
result = BreakIterator::buildInstance(loc, "grapheme", false, status);
UTRACE_EXIT_STATUS(status);
}
break;
case UBRK_WORD:
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD);
result = BreakIterator::buildInstance(loc, "word", status);
result = BreakIterator::buildInstance(loc, "word", true, status);
UTRACE_EXIT_STATUS(status);
}
break;
Expand All @@ -451,7 +465,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
uprv_strcat(lb_lw, value.data());
}
}
result = BreakIterator::buildInstance(loc, lb_lw, status);
result = BreakIterator::buildInstance(loc, lb_lw, true, status);

UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw);
UTRACE_EXIT_STATUS(status);
Expand All @@ -460,7 +474,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
case UBRK_SENTENCE:
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE);
result = BreakIterator::buildInstance(loc, "sentence", status);
result = BreakIterator::buildInstance(loc, "sentence", false, status);
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
char ssKeyValue[kKeyValueLenMax] = {0};
UErrorCode kvStatus = U_ZERO_ERROR;
Expand All @@ -479,7 +493,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
case UBRK_TITLE:
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE);
result = BreakIterator::buildInstance(loc, "title", status);
result = BreakIterator::buildInstance(loc, "title", false, status);
UTRACE_EXIT_STATUS(status);
}
break;
Expand Down
54 changes: 52 additions & 2 deletions icu4c/source/common/rbbi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "unicode/uchriter.h"
#include "unicode/uclean.h"
#include "unicode/udata.h"
#include "unicode/uniset.h"

#include "brkeng.h"
#include "ucln_cmn.h"
Expand Down Expand Up @@ -89,9 +90,45 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
//
//-------------------------------------------------------------------------------
RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking,
                                               const char* dxs,
                                               UErrorCode &status) : RuleBasedBreakIterator(udm, status)
{
    // Builds an iterator from a memory image, then records the phrase-breaking
    // flag and, when dxs is non-null, the "dictionary break script exclusions"
    // set (fDX).
    //
    // dxs is expected to be a list of 4-letter script codes joined by '-'
    // (for example "thai" or "thai-arab"). On a malformed value, status is set
    // to U_ILLEGAL_ARGUMENT_ERROR; on allocation failure, to
    // U_MEMORY_ALLOCATION_ERROR. In every failure case fDX is left as nullptr,
    // so no half-built set is ever published.
    fIsPhraseBreaking = isPhraseBreaking;
    if (U_FAILURE(status) || dxs == nullptr) {
        return;
    }

    const size_t length = uprv_strlen(dxs);
    // N codes of 4 characters plus N-1 separators gives 5N-1 characters.
    if (length % 5 != 4) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // The code Zyyy (Common) can be specified to exclude all scripts, if
    // and only if it is the only SCRIPT_CODE value specified. If it is
    // not the only script code, Zyyy has the normal meaning: excluding
    // Script_Extension=Common.
    if (uprv_strcmp(dxs, "zyyy") == 0) {
        UnicodeSet* everything = new UnicodeSet(UnicodeSet::MIN_VALUE, UnicodeSet::MAX_VALUE);
        if (everything == nullptr) {
            // Check the allocation before touching the object.
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        everything->freeze();
        fDX = everything;
        return;
    }

    // Change from "thai" to "[[:scx=thai:]]" or
    // "thai-arab" to "[[:scx=thai:][:scx=arab:]]"
    const size_t items = 1 + length / 5;
    UnicodeString udxs(u'[');
    for (size_t i = 0; i < items; i++) {
        // Every code after the first must be preceded by a '-' separator.
        if (i > 0 && dxs[i * 5 - 1] != '-') {
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        // Dictionary-based break iterators will ignore each character whose
        // Script_Extension value set intersects with the DX value set.
        udxs.append(u"[:scx=", -1)
            .append(UnicodeString(dxs + i * 5, 4, US_INV))
            .append(u":]", -1);
    }
    udxs.append(u']');

    UnicodeSet* excluded = new UnicodeSet(udxs, status);
    if (excluded == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    if (U_FAILURE(status)) {
        // The pattern did not parse (e.g. unknown script code); discard the
        // unusable set instead of storing it.
        delete excluded;
        return;
    }
    excluded->freeze();
    fDX = excluded;
}

//
Expand Down Expand Up @@ -198,7 +235,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator()
* Simple Constructor with an error code.
* Handles common initialization for all other constructors.
*/
RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) : fDX(nullptr) {
UErrorCode ec = U_ZERO_ERROR;
if (status == nullptr) {
status = &ec;
Expand All @@ -212,6 +249,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) {
}
fDictionaryCache = lpDictionaryCache.orphan();
fBreakCache = lpBreakCache.orphan();
fDX = nullptr;

#ifdef RBBI_DEBUG
static UBool debugInitDone = false;
Expand Down Expand Up @@ -261,6 +299,9 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() {
delete fDictionaryCache;
fDictionaryCache = nullptr;

delete fDX;
fDX = nullptr;

delete fLanguageBreakEngines;
fLanguageBreakEngines = nullptr;

Expand Down Expand Up @@ -333,6 +374,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
// the assumption that the current position is on a rule boundary.
fBreakCache->reset(fPosition, fRuleStatusIndex);
fDictionaryCache->reset();
fDX = (that.fDX == nullptr) ? nullptr : that.fDX->cloneAsThawed();

return *this;
}
Expand Down Expand Up @@ -381,11 +423,15 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
return false;
}

// If only one has fDX or they are not equal
if (!((that2.fDX == nullptr && fDX == nullptr) || *that2.fDX == *fDX)) {
return false;
}
if (that2.fData == fData ||
(fData != nullptr && that2.fData != nullptr && *that2.fData == *fData)) {
// The two break iterators are using the same rules.
return true;
}
}
return false;
}

Expand Down Expand Up @@ -1298,6 +1344,10 @@ RuleBasedBreakIterator::getRules() const {
}
}

// Reports whether code point c belongs to the dictionary-break exclusion set.
// With no exclusion set configured (fDX is null) nothing is excluded.
bool RuleBasedBreakIterator::excludedFromDictionaryBreak(int32_t c) {
    if (fDX == nullptr) {
        return false;
    }
    return fDX->contains(c);
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
24 changes: 14 additions & 10 deletions icu4c/source/common/rbbi_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,17 +156,21 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
break;
}

// We now have a dictionary character. Get the appropriate language object
// to deal with it.
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));

// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != nullptr) {
foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
// We now have a dictionary character.
// Handle dx (Dictionary break script exclusions) first if needed
if (fBI->excludedFromDictionaryBreak(c)) {
utext_next32(text);
} else {
// Get the appropriate language object to deal with it.
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));

// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != nullptr) {
foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
}
}

// Reload the loop variables for the next go-round
c = utext_current32(text);
category = ucptrie_get(fBI->fData->fTrie, c);
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/common/unicode/brkiter.h
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,7 @@ class U_COMMON_API BreakIterator : public UObject {
virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;

private:
static BreakIterator* buildInstance(const Locale& loc, const char *type, UErrorCode& status);
static BreakIterator* buildInstance(const Locale& loc, const char *type, bool checkDX, UErrorCode& status);
static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);

Expand Down
17 changes: 16 additions & 1 deletion icu4c/source/common/unicode/rbbi.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#define RBBI_H

#include "unicode/utypes.h"
#include "unicode/uniset.h"

#if U_SHOW_CPLUSPLUS_API

Expand All @@ -42,6 +43,7 @@ struct RBBIDataHeader;
class RBBIDataWrapper;
class UnhandledEngine;
class UStack;
class UnicodeSet;


#ifndef U_HIDE_INTERNAL_API
Expand Down Expand Up @@ -221,6 +223,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
*/
UBool fIsPhraseBreaking = false;

/**
* A UnicodeSet for Dictionary Break Exclusion.
*/
UnicodeSet* fDX = nullptr;
private:

//=======================================================================
// constructors
//=======================================================================
Expand All @@ -246,11 +254,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
* which will be responsible for closing it when it is no longer needed.
* @param status Information on any errors encountered.
* @param isPhraseBreaking true if phrase based breaking is required, otherwise false.
* @param dxs nullptr or a string to denote "Dictionary break script exclusions".
* @see udata_open
* @see #getBinaryRules
* @internal (private)
*/
RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);
RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, const char* dxs, UErrorCode &status);

/** @internal */
friend class RBBIRuleBuilder;
Expand Down Expand Up @@ -764,6 +773,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
* signature)
*/

/*
* Returns whether the given character should be excluded from dictionary-based break processing.
* @internal (private)
*/
bool excludedFromDictionaryBreak(int32_t c);

typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);

template<typename RowType, PTrieFunc trieFunc>
Expand Down
Loading

0 comments on commit ba1260f

Please sign in to comment.