diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp index 4d945cc17e2b..e0699d7f9af3 100644 --- a/icu4c/source/common/brkiter.cpp +++ b/icu4c/source/common/brkiter.cpp @@ -55,7 +55,7 @@ U_NAMESPACE_BEGIN // ------------------------------------- BreakIterator* -BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status) +BreakIterator::buildInstance(const Locale& loc, const char *type, bool checkDX, UErrorCode &status) { char fnbuff[256]; char ext[4]={'\0'}; @@ -116,8 +116,22 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st return nullptr; } - // Create a RuleBasedBreakIterator - result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status); + { + const char* dxs = nullptr; + CharString dxsValue; // keep on the stack till we no longer need dxs. + // If it is word or line instance, try to get the value for dx + if (checkDX) { + UErrorCode dxsStatus = U_ZERO_ERROR; + CharStringByteSink dxsSink(&dxsValue); + loc.getKeywordValue("dx", dxsSink, dxsStatus); + if (U_SUCCESS(dxsStatus) && dxsValue.length() > 0) { + dxs = dxsValue.data(); + } + } + + // Create a RuleBasedBreakIterator + result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, dxs, status); + } // If there is a result, set the valid locale and actual locale, and the kind if (U_SUCCESS(status) && result != nullptr) { @@ -421,14 +435,14 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) case UBRK_CHARACTER: { UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER); - result = BreakIterator::buildInstance(loc, "grapheme", status); + result = BreakIterator::buildInstance(loc, "grapheme", false, status); UTRACE_EXIT_STATUS(status); } break; case UBRK_WORD: { UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD); - result = BreakIterator::buildInstance(loc, "word", status); + result = BreakIterator::buildInstance(loc, "word", true, status); UTRACE_EXIT_STATUS(status); } 
break; @@ -451,7 +465,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) uprv_strcat(lb_lw, value.data()); } } - result = BreakIterator::buildInstance(loc, lb_lw, status); + result = BreakIterator::buildInstance(loc, lb_lw, true, status); UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw); UTRACE_EXIT_STATUS(status); @@ -460,7 +474,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) case UBRK_SENTENCE: { UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE); - result = BreakIterator::buildInstance(loc, "sentence", status); + result = BreakIterator::buildInstance(loc, "sentence", false, status); #if !UCONFIG_NO_FILTERED_BREAK_ITERATION char ssKeyValue[kKeyValueLenMax] = {0}; UErrorCode kvStatus = U_ZERO_ERROR; @@ -479,7 +493,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) case UBRK_TITLE: { UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE); - result = BreakIterator::buildInstance(loc, "title", status); + result = BreakIterator::buildInstance(loc, "title", false, status); UTRACE_EXIT_STATUS(status); } break; diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 069af2153711..8b672cdf488f 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -25,6 +25,7 @@ #include "unicode/uchriter.h" #include "unicode/uclean.h" #include "unicode/udata.h" +#include "unicode/uniset.h" #include "brkeng.h" #include "ucln_cmn.h" @@ -89,9 +90,45 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode // //------------------------------------------------------------------------------- RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking, + const char* dxs, UErrorCode &status) : RuleBasedBreakIterator(udm, status) { fIsPhraseBreaking = isPhraseBreaking; + if (U_FAILURE(status)) { + return; + } + if (dxs != nullptr) { + size_t length = uprv_strlen(dxs); + // The value should be a list of 4 letter script 
codes joined by '-'. + if (length % 5 != 4) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + // The code Zyyy (Common) can be specified to exclude all scripts, if + // and only if it is the only SCRIPT_CODE value specified. If it is + // not the only script code, Zyyy has the normal meaning: excluding + // Script_Extension=Common. + if (uprv_strcmp(dxs, "zyyy") == 0) { + fDX = (new UnicodeSet(UnicodeSet::MIN_VALUE, UnicodeSet::MAX_VALUE))->freeze(); + return; + } + size_t items = 1 + length / 5; + // Change from "thai" to "[[:scx=thai:]]" or + // "thai-arab" to "[[:scx=thai:][:scx=arab:]]" + UnicodeString udxs(u'['); + for (size_t i = 0; i < items; i++) { + if (i > 0 && dxs[i*5-1] != u'-') { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + // Dictionary-based break iterators will ignore each character whose + // Script_Extension value set intersects with the DX value set. + udxs.append(u"[:scx=", -1) + .append(UnicodeString(dxs + i * 5, 4, US_INV)) + .append(u":]", -1); + } + fDX = (new UnicodeSet(udxs.append(u']'), status))->freeze(); + } } // @@ -198,7 +235,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator() * Simple Constructor with an error code. * Handles common initialization for all other constructors. 
*/ -RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) { +RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) : fDX(nullptr) { UErrorCode ec = U_ZERO_ERROR; if (status == nullptr) { status = &ec; @@ -212,6 +249,7 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UErrorCode *status) { } fDictionaryCache = lpDictionaryCache.orphan(); fBreakCache = lpBreakCache.orphan(); + fDX = nullptr; #ifdef RBBI_DEBUG static UBool debugInitDone = false; @@ -261,6 +299,9 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() { delete fDictionaryCache; fDictionaryCache = nullptr; + delete fDX; + fDX = nullptr; + delete fLanguageBreakEngines; fLanguageBreakEngines = nullptr; @@ -333,6 +374,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { // the assumption that the current position is on a rule boundary. fBreakCache->reset(fPosition, fRuleStatusIndex); fDictionaryCache->reset(); + if (this != &that) { delete fDX; fDX = (that.fDX == nullptr) ? nullptr : that.fDX->cloneAsThawed(); } // free the set this object already owns before taking a private copy, so assignment does not leak it return *this; } @@ -381,11 +423,15 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const { return false; } + // Unequal if only one side has fDX, or both do and the sets differ; a null fDX is never dereferenced. + if ((fDX == nullptr) != (that2.fDX == nullptr) || (fDX != nullptr && !(*that2.fDX == *fDX))) { + return false; + } if (that2.fData == fData || (fData != nullptr && that2.fData != nullptr && *that2.fData == *fData)) { // The two break iterators are using the same rules. 
return true; - } + } return false; } @@ -1298,6 +1344,10 @@ RuleBasedBreakIterator::getRules() const { } } +bool RuleBasedBreakIterator::excludedFromDictionaryBreak(int32_t c) { + return fDX != nullptr && fDX->contains(c); +} + U_NAMESPACE_END #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp index 3ef030cb9195..5ebda56f1dae 100644 --- a/icu4c/source/common/rbbi_cache.cpp +++ b/icu4c/source/common/rbbi_cache.cpp @@ -156,17 +156,21 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo break; } - // We now have a dictionary character. Get the appropriate language object - // to deal with it. - const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine( - c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status)); - - // Ask the language object if there are any breaks. It will add them to the cache and - // leave the text pointer on the other side of its range, ready to search for the next one. - if (lbe != nullptr) { - foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status); + // We now have a dictionary character. + // Handle dx (Dictionary break script exclusions) first if needed + if (fBI->excludedFromDictionaryBreak(c)) { + utext_next32(text); + } else { + // Get the appropriate language object to deal with it. + const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine( + c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status)); + + // Ask the language object if there are any breaks. It will add them to the cache and + // leave the text pointer on the other side of its range, ready to search for the next one. 
+ if (lbe != nullptr) { + foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status); + } } - // Reload the loop variables for the next go-round c = utext_current32(text); category = ucptrie_get(fBI->fData->fTrie, c); diff --git a/icu4c/source/common/unicode/brkiter.h b/icu4c/source/common/unicode/brkiter.h index 30c59c4a94ac..98c0d24f5754 100644 --- a/icu4c/source/common/unicode/brkiter.h +++ b/icu4c/source/common/unicode/brkiter.h @@ -622,7 +622,7 @@ class U_COMMON_API BreakIterator : public UObject { virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0; private: - static BreakIterator* buildInstance(const Locale& loc, const char *type, UErrorCode& status); + static BreakIterator* buildInstance(const Locale& loc, const char *type, bool checkDX, UErrorCode& status); static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status); static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status); diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h index 659e3e46b352..621e7a9db8ba 100644 --- a/icu4c/source/common/unicode/rbbi.h +++ b/icu4c/source/common/unicode/rbbi.h @@ -17,6 +17,7 @@ #define RBBI_H #include "unicode/utypes.h" +#include "unicode/uniset.h" #if U_SHOW_CPLUSPLUS_API @@ -42,6 +43,7 @@ struct RBBIDataHeader; class RBBIDataWrapper; class UnhandledEngine; class UStack; +class UnicodeSet; #ifndef U_HIDE_INTERNAL_API @@ -221,6 +223,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator { */ UBool fIsPhraseBreaking = false; + /** + * A UnicodeSet for Dictionary Break Exclusion. 
+ */ + UnicodeSet* fDX = nullptr; +private: + //======================================================================= // constructors //======================================================================= @@ -246,11 +254,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator { * which will be responsible for closing it when it is no longer needed. * @param status Information on any errors encountered. * @param isPhraseBreaking true if phrase based breaking is required, otherwise false. + * @param dxs nullptr, or the "dx" (dictionary break script exclusions) value: 4-letter script codes joined by '-'. * @see udata_open * @see #getBinaryRules * @internal (private) */ - RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status); + RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, const char* dxs, UErrorCode &status); /** @internal */ friend class RBBIRuleBuilder; @@ -764,6 +773,12 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator { * signature) */ + /* + * Returns true if the character should be excluded from dictionary-based text breaking. 
+ * @internal (private) + */ + bool excludedFromDictionaryBreak(int32_t c); + typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32); template diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 31897a19ba04..f6af43076e21 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -111,6 +111,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha TESTCASE_AUTO(TestLineBreaks); TESTCASE_AUTO(TestSentBreaks); TESTCASE_AUTO(TestExtended); + TESTCASE_AUTO(TestDXLineBreaks); + TESTCASE_AUTO(TestDXWordBreaks); #endif #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO TESTCASE_AUTO(TestMonkey); @@ -4025,6 +4027,99 @@ void RBBITest::TestLineBreaks() #endif } +void RBBITest::TestDXLineBreaks() +{ +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + UnicodeString text(u"abcde 一二三四五六七八九十อิสราเอลโชว์คลิป"); + std::vector expected{ 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32 }; + Locale locale("ja-u-dx-hani-thai"); + UErrorCode status = U_ZERO_ERROR; + std::unique_ptr bi(BreakIterator::createLineInstance(locale, status)); + TEST_ASSERT_SUCCESS(status); + if (U_FAILURE(status)) { + return; + } + bi->setText(text); + int32_t c = bi->first(); + std::vector actuals; + do { + actuals.push_back(c); + } while ((c = bi->next()) != BreakIterator::DONE ); + + assertEquals(WHERE, + static_cast(expected.size()), + static_cast(actuals.size())); + if (expected.size() == actuals.size()) { + for (size_t i = 0; i < expected.size(); i++) { + assertEquals(WHERE, expected[i], actuals[i]); + } + } + + bi->setText(UnicodeString(u"aaอออaaaaaอออ aaaa")); + c = bi->first(); + actuals.clear(); + do { + actuals.push_back(c); + } while ((c = bi->next()) != BreakIterator::DONE ); + std::vector expected2{ 0, 17, 21 }; + assertEquals(WHERE, + static_cast(expected2.size()), + static_cast(actuals.size())); + if (expected2.size() == actuals.size()) { + for (size_t i = 0; i < expected2.size(); i++) 
{ + assertEquals(WHERE, expected2[i], actuals[i]); + } + } + +#endif +} + +void RBBITest::TestDXWordBreaks() +{ +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + UnicodeString text(u"abcde 一二三四五六七八九十อิสราเอลโชว์คลิป"); + Locale locale("ja-u-dx-hani-thai"); + std::vector expected{ 0, 5, 6, 16, 32 }; + UErrorCode status = U_ZERO_ERROR; + std::unique_ptr bi(BreakIterator::createWordInstance(locale, status)); + TEST_ASSERT_SUCCESS(status); + if (U_FAILURE(status)) { + return; + } + bi->setText(text); + int32_t c = bi->first(); + std::vector actuals; + do { + actuals.push_back(c); + } while ((c = bi->next()) != BreakIterator::DONE ); + + assertEquals(WHERE, + static_cast(expected.size()), + static_cast(actuals.size())); + if (expected.size() == actuals.size()) { + for (size_t i = 0; i < expected.size(); i++) { + assertEquals(WHERE, expected[i], actuals[i]); + } + } + + bi->setText(UnicodeString(u"aaอออaaaaaอออ aaaa")); + c = bi->first(); + actuals.clear(); + do { + actuals.push_back(c); + } while ((c = bi->next()) != BreakIterator::DONE ); + std::vector expected2{ 0, 13, 17, 21 }; + assertEquals(WHERE, + static_cast(expected2.size()), + static_cast(actuals.size())); + if (expected2.size() == actuals.size()) { + for (size_t i = 0; i < expected2.size(); i++) { + assertEquals(WHERE, expected2[i], actuals[i]); + } + } +#endif +} + void RBBITest::TestSentBreaks() { #if !UCONFIG_NO_REGULAR_EXPRESSIONS diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index 9638caf6a277..8f51d2038b46 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -107,6 +107,8 @@ class RBBITest: public IntlTest { void TestBug22602(); void TestBug22636(); + void TestDXLineBreaks(); + void TestDXWordBreaks(); #if U_ENABLE_TRACING void TestTraceCreateCharacter(); void TestTraceCreateWord(); diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 7d77588ef977..ed5800258da3 100644 --- 
a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1525,6 +1525,32 @@ Bangkok)• # #################################################################################### +# -u-dx (exclude script) +# + + + +# Should no longer break at the dictionary points - it's not Thai language +# Short Test +•โอํน •อะไป •จู่วาม •โล่น• +•โอํน •อะไป •จู่วาม •โล่น •เปี่ยร •อะลู่วาง •แมะ, •ปาย •อัน •แบ็จ •อะโจํน •ซา •เมาะ. •อัน •ฮะบืน •ตะ •เวี่ยะ •ตะ •งี่ยาน, •อัน •ฮะบืน •อีว •อะปายฮ.• + +# Should no longer break at the dictionary points - it's not the Thai language +•โอํน<200> •อะไป<200> •จู่วาม<200> •โล่น<200> •เปี่ยร<200> •อะลู่วาง<200> •แมะ<200>,• •ปาย<200> •อัน<200> •แบ็จ<200> •อะโจํน<200> •ซา<200> •เมาะ<200>.• •อัน<200> •ฮะบืน<200> •ตะ<200> •เวี่ยะ<200> •ตะ<200> •งี่ยาน<200>,• •อัน<200> •ฮะบืน<200> •อีว<200> •อะปายฮ<200>.• + + + + +# Should no longer break at the dictionary points - it's not Thai language +•โอํน •อะไป •จู่วาม •โล่น •เปี่ยร •อะลู่วาง •แมะ, •ปาย •อัน •แบ็จ •อะโจํน •ซา •เมาะ. •อัน •ฮะบืน •ตะ •เวี่ยะ •ตะ •งี่ยาน, •อัน •ฮะบืน •อีว •อะปายฮ.• + +# Should no longer break at the dictionary points - it's not the Thai language +•โอํน<200> •อะไป<200> •จู่วาม<200> •โล่น<200> •เปี่ยร<200> •อะลู่วาง<200> •แมะ<200>,• •ปาย<200> •อัน<200> •แบ็จ<200> •อะโจํน<200> •ซา<200> •เมาะ<200>.• •อัน<200> •ฮะบืน<200> •ตะ<200> •เวี่ยะ<200> •ตะ<200> •งี่ยาน<200>,• •อัน<200> •ฮะบืน<200> •อีว<200> •อะปายฮ<200>.• + + # Japanese line break tailoring test @@ -2222,4 +2248,3 @@ Bangkok)• •Anmerkung: •„White“ •bzw. •‚白•人‘ •– •in •der •Amtlichen •Statistik• •« Complex »« chaining » • •« .618 »• # Interaction with the ICU tailoring to break before such numbers. 
- diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIteratorFactory.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIteratorFactory.java index c78f36ed638b..0b8c02fcc55b 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIteratorFactory.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/BreakIteratorFactory.java @@ -156,12 +156,17 @@ private static BreakIterator createBreakInstance(ULocale locale, int kind) { throw new MissingResourceException(e.toString(),"",""); } + // Dictionary Break Exclusion + String dxValue = null; + if (kind == BreakIterator.KIND_LINE || kind == BreakIterator.KIND_WORD) { + dxValue = locale.getUnicodeLocaleType("dx"); + } // // Create a normal RuleBasedBreakIterator. // try { boolean isPhraseBreaking = (brkfname != null) && brkfname.contains("phrase"); - iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes, isPhraseBreaking); + iter = RuleBasedBreakIterator.getInstanceFromCompiledRules(bytes, isPhraseBreaking, dxValue); } catch (IOException e) { // Shouldn't be possible to get here. 
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java index 9d54bd1aaa4c..167beb53e30c 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java @@ -20,7 +20,7 @@ import java.nio.ByteBuffer; import java.text.CharacterIterator; import java.util.MissingResourceException; -import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.Objects; import com.ibm.icu.impl.CharacterIteration; import com.ibm.icu.impl.ICUBinary; @@ -39,6 +39,7 @@ import com.ibm.icu.lang.UProperty; import com.ibm.icu.lang.UScript; import com.ibm.icu.util.CodePointTrie; +import com.ibm.icu.text.UnicodeSet; /** * Rule Based Break Iterator @@ -95,17 +96,72 @@ public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is * * @param bytes a buffer supplying the compiled binary rules. * @param phraseBreaking a flag indicating if phrase breaking is required. + * @param dxValues Dictionary break script exclusions. * @throws IOException if there is an error while reading the rules from the buffer. * @see #compileRules(String, OutputStream) * @internal */ /* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules( - ByteBuffer bytes, boolean phraseBreaking) throws IOException { + ByteBuffer bytes, boolean phraseBreaking, String dxValues) throws IOException { RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes); instance.fPhraseBreaking = phraseBreaking; + instance.fDX = makeExcludedDictionaryBreakUnicodeSet(dxValues); return instance; } + /** + * Create a UnicodeSet for the Dictionary Break Script Exclusions; returns null when dxs is null. + * @param dxs Dictionary break script exclusions, a string of 4-letter script codes joined by "-". 
+ * @internal + */ + private static UnicodeSet makeExcludedDictionaryBreakUnicodeSet( + String dxs) { + if (dxs == null) { + return null; + } + if (dxs.equals("zyyy")) { + // The code Zyyy (Common) can be specified to exclude all scripts, + // if and only if it is the only SCRIPT_CODE value specified. + // If it is not the only script code, Zyyy has the normal meaning: + // excluding Script_Extension=Common. + return UnicodeSet.ALL_CODE_POINTS; + } + if (dxs.length() % 5 != 4) { + throw new IllegalArgumentException("Incorrect value for dx key: " + dxs); + } + // Change from "thai" to "[[:scx=thai:]]" or "thai-arab" to "[[:scx=thai:][:scx=arab:]]" + StringBuilder builder = new StringBuilder("["); + int items = 1 + (dxs.length() / 5); + for (int i = 0; i < items; i++) { + if (i > 0 && dxs.charAt(i*5-1) != '-') { + throw new IllegalArgumentException("Incorrect value for dx key: " + dxs); + } + // Dictionary-based break iterators will ignore each character whose + // Script_Extension value set intersects with the DX value set. + builder.append("[:scx=").append(dxs.substring(i*5, i*5+4)).append(":]"); + } + builder.append("]"); + // The length and '-' position checks above reject misplaced separators, + // e.g. dxs "abc-defgh" from "en-u-dx-abc-defgh" fails the charAt check. + // A well-shaped but unknown code such as "abcd" yields "[[:scx=abcd:]]", which + // the UnicodeSet constructor is expected to reject with IllegalArgumentException. + return (new UnicodeSet(builder.toString())).freeze(); + } + + /** + * Returns true if the character should be excluded from dictionary-based text breaking. + */ + private boolean excludedFromDictionaryBreak(int c) { + return hasDictionaryExclusion() && fDX.contains(c); + } + + /** + * Returns true if a dictionary exclusion set (fDX) has been configured. + */ + private boolean hasDictionaryExclusion() { + return fDX != null; + } + /** * Create a break iterator from a precompiled set of break rules. 
* @@ -173,6 +229,7 @@ public Object clone() { result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; result.fBreakCache = result.new BreakCache(fBreakCache); result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); + result.fDX = fDX; // fDX could be shared w/ other instance return result; } @@ -199,6 +256,9 @@ public boolean equals(Object that) { (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { return false; } + if (!Objects.equals(fDX, other.fDX)) { + return false; + } if (fText == null && other.fText == null) { return true; } @@ -298,6 +358,10 @@ public int hashCode() */ private boolean fPhraseBreaking = false; + /** + * UnicodeSet for Dictionary break script exclusions. + */ + protected UnicodeSet fDX = null; /** * Counter for the number of characters encountered with the "dictionary" @@ -879,7 +943,7 @@ else if (mode == RBBI_RUN) { category = (short) trie.get(c); // Check for categories that require word dictionary handling. - if (category >= dictStart) { + if (category >= dictStart && !excludedFromDictionaryBreak(c)) { fDictionaryCharCount++; } @@ -1224,9 +1288,8 @@ void populateDictionary(int startPos, int endPos, // Ask the language object if there are any breaks. It will add them to the cache and // leave the text pointer on the other side of its range, ready to search for the next one. 
if (lbe != null) { - foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking); + foundBreakCount += lbe.findBreaks(fText, current, rangeEnd, fBreaks, fPhraseBreaking); } - // Reload the loop variables for the next go-round c = CharacterIteration.current32(fText); category = (short)fRData.fTrie.get(c); diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java index 0c60e946222f..afacc523693c 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITest.java @@ -8,6 +8,9 @@ */ package com.ibm.icu.dev.test.rbbi; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetSpanner; + import java.text.CharacterIterator; import java.util.ArrayList; import java.util.Arrays; @@ -1077,4 +1080,74 @@ public int randomStringIndex() { assertEquals("preceding" + idx, fns.expectedPreceding(idx), bi.preceding(idx)); } } + + private List GetResult(BreakIterator brk) { + List out = new ArrayList(); + int c = brk.first(); + do { + out.add(c); + } while ((c = brk.next()) != BreakIterator.DONE); + return out; + } + + @Test + public void TestDXLineBreaks() { + String text = "abcde 一二三四五六七八九十อิสราเอลโชว์คลิป"; + BreakIterator brk = BreakIterator.getLineInstance(ULocale.forLanguageTag("ja-u-dx-hani-thai")); + brk.setText(text); + List expected = new ArrayList(Arrays.asList( + 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32)); + List actuals = GetResult(brk); + + assertEquals("-u-dx- is not working", expected, actuals); + + brk.setText("aaอออaaaaaอออ aaaa"); + + expected = new ArrayList(Arrays.asList(0, 17, 21)); + actuals = GetResult(brk); + assertEquals("-u-dx- is not working", expected, actuals); + } + @Test + public void TestDXWordBreaks() { + String text = "abcde 一二三四五六七八九十อิสราเอลโชว์คลิป"; + BreakIterator brk = 
BreakIterator.getWordInstance(ULocale.forLanguageTag("ja-u-dx-hani-thai")); + brk.setText(text); + List actuals = GetResult(brk); + List expected = new ArrayList(Arrays.asList( + 0, 5, 6, 16, 32 )); + assertEquals("-u-dx- is not working", expected, actuals); + + brk.setText("aaอออaaaaaอออ aaaa"); + actuals = GetResult(brk); + expected = new ArrayList(Arrays.asList(0, 13, 17, 21)); + assertEquals("-u-dx- is not working", expected, actuals); + + String [] testCases = { + "列列パルス列列パルス列列", + "パルス列列パルス列列パルス", + "イスラエルとイスラム組織ハマスが戦闘を休止するなか、", + "衛星データの分析でつかの間の平穏が訪れていることが分かった。", + "パレスチナ自治区ガザやイスラエルでは目立った熱異常は検知されず、", + "大規模な衝突は起きていないもようだ。", + "イスラエルへのロケットの飛来を知らせる防空警報の発令も途絶え、", + "国際機関はガザへの人道支援物資の搬入を急いでいる。" + }; + UnicodeSetSpanner uss = new UnicodeSetSpanner(new UnicodeSet("[:scx=Kana:]")); + brk = BreakIterator.getWordInstance(ULocale.forLanguageTag("ja-u-dx-kana")); + for (String test : testCases) { + String kanaAsA = uss.replaceFrom(test, "A"); + System.out.println(test); + System.out.println(kanaAsA); + + brk.setText(kanaAsA); + expected = GetResult(brk); + + brk.setText(test); + actuals = GetResult(brk); + + assertEquals("-u-dx- is not working for '" + + test + "': ", expected, actuals); + } + + } } diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 7d77588ef977..ed5800258da3 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -1525,6 +1525,32 @@ Bangkok)• # #################################################################################### +# -u-dx (exclude script) +# + + + +# Should no longer break at the dictionary points - it's not Thai language +# Short Test +•โอํน •อะไป •จู่วาม •โล่น• +•โอํน •อะไป •จู่วาม •โล่น •เปี่ยร •อะลู่วาง •แมะ, •ปาย •อัน •แบ็จ •อะโจํน •ซา •เมาะ. 
•อัน •ฮะบืน •ตะ •เวี่ยะ •ตะ •งี่ยาน, •อัน •ฮะบืน •อีว •อะปายฮ.• + +# Should no longer break at the dictionary points - it's not the Thai language +•โอํน<200> •อะไป<200> •จู่วาม<200> •โล่น<200> •เปี่ยร<200> •อะลู่วาง<200> •แมะ<200>,• •ปาย<200> •อัน<200> •แบ็จ<200> •อะโจํน<200> •ซา<200> •เมาะ<200>.• •อัน<200> •ฮะบืน<200> •ตะ<200> •เวี่ยะ<200> •ตะ<200> •งี่ยาน<200>,• •อัน<200> •ฮะบืน<200> •อีว<200> •อะปายฮ<200>.• + + + + +# Should no longer break at the dictionary points - it's not Thai language +•โอํน •อะไป •จู่วาม •โล่น •เปี่ยร •อะลู่วาง •แมะ, •ปาย •อัน •แบ็จ •อะโจํน •ซา •เมาะ. •อัน •ฮะบืน •ตะ •เวี่ยะ •ตะ •งี่ยาน, •อัน •ฮะบืน •อีว •อะปายฮ.• + +# Should no longer break at the dictionary points - it's not the Thai language +•โอํน<200> •อะไป<200> •จู่วาม<200> •โล่น<200> •เปี่ยร<200> •อะลู่วาง<200> •แมะ<200>,• •ปาย<200> •อัน<200> •แบ็จ<200> •อะโจํน<200> •ซา<200> •เมาะ<200>.• •อัน<200> •ฮะบืน<200> •ตะ<200> •เวี่ยะ<200> •ตะ<200> •งี่ยาน<200>,• •อัน<200> •ฮะบืน<200> •อีว<200> •อะปายฮ<200>.• + + # Japanese line break tailoring test @@ -2222,4 +2248,3 @@ Bangkok)• •Anmerkung: •„White“ •bzw. •‚白•人‘ •– •in •der •Amtlichen •Statistik• •« Complex »« chaining » • •« .618 »• # Interaction with the ICU tailoring to break before such numbers. -