diff --git a/.travis.yml b/.travis.yml index 71a4d16c6..ed387a48c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,6 +11,7 @@ addons: - libssl-dev - libhttp-parser-dev - libssh2-1-dev + - libicu-dev - gcc-8 - g++-8 diff --git a/CMakeLists.txt b/CMakeLists.txt index 7bc654615..e11531bf0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,6 +65,12 @@ ENDIF () find_package(Boost REQUIRED COMPONENTS locale system) +if (NOT CMAKE_SYSTEM_NAME MATCHES "Windows") + find_package(ICU REQUIRED COMPONENTS uc) + include_directories(${ICU_INCLUDE_DIRS}) + link_directories(${ICU_LIBRARY_DIRS}) +endif() + ExternalProject_Add(GTest PREFIX "external" URL "https://github.com/google/googletest/archive/release-1.8.1.tar.gz" @@ -389,12 +395,12 @@ ENDIF () # Build tests. add_executable (libloot_internals_tests ${LIBLOOT_SRC} ${LIBLOOT_HEADERS} ${LOOT_TESTS_SRC} ${LOOT_TESTS_HEADERS}) add_dependencies (libloot_internals_tests esplugin libgit2 libloadorder loot-condition-interpreter spdlog yaml-cpp GTest testing-metadata testing-plugins) -target_link_libraries(libloot_internals_tests ${Boost_LIBRARIES} ${LIBGIT2_LIBRARIES} ${ESPLUGIN_LIBRARIES} ${LIBLOADORDER_LIBRARIES} ${LOOT_LIBS} ${LCI_LIBRARIES} ${YAML_CPP_LIBRARIES} ${GTEST_LIBRARIES}) +target_link_libraries(libloot_internals_tests ${Boost_LIBRARIES} ${LIBGIT2_LIBRARIES} ${ESPLUGIN_LIBRARIES} ${LIBLOADORDER_LIBRARIES} ${LOOT_LIBS} ${LCI_LIBRARIES} ${YAML_CPP_LIBRARIES} ${GTEST_LIBRARIES} ${ICU_LIBRARIES}) # Build API. add_library (loot ${LIBLOOT_SRC} ${LIBLOOT_HEADERS}) add_dependencies (loot esplugin libgit2 libloadorder loot-condition-interpreter spdlog yaml-cpp) -target_link_libraries(loot ${Boost_LIBRARIES} ${LIBGIT2_LIBRARIES} ${ESPLUGIN_LIBRARIES} ${LIBLOADORDER_LIBRARIES} ${LOOT_LIBS} ${LCI_LIBRARIES} ${YAML_CPP_LIBRARIES}) +target_link_libraries(loot ${Boost_LIBRARIES} ${LIBGIT2_LIBRARIES} ${ESPLUGIN_LIBRARIES} ${LIBLOADORDER_LIBRARIES} ${LOOT_LIBS} ${LCI_LIBRARIES} ${YAML_CPP_LIBRARIES} ${ICU_LIBRARIES}) # Build API tests. 
add_executable (libloot_tests ${LIBLOOT_TESTS_SRC} ${LIBLOOT_TESTS_HEADERS}) diff --git a/src/api/helpers/text.cpp b/src/api/helpers/text.cpp index 00928fa81..9abacd81b 100644 --- a/src/api/helpers/text.cpp +++ b/src/api/helpers/text.cpp @@ -30,7 +30,8 @@ #ifdef _WIN32 #include "windows.h" #else -#include <boost/locale.hpp> +#include <unicode/uchar.h> +#include <unicode/unistr.h> #endif using std::regex; @@ -114,7 +115,6 @@ std::optional ExtractVersion(const std::string& text) { return std::nullopt; } - #ifdef _WIN32 std::wstring ToWinWide(const std::string& str) { size_t len = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), str.length(), 0, 0); @@ -125,10 +125,10 @@ std::wstring ToWinWide(const std::string& str) { std::string FromWinWide(const std::wstring& wstr) { size_t len = WideCharToMultiByte( - CP_UTF8, 0, wstr.c_str(), wstr.length(), NULL, 0, NULL, NULL); + CP_UTF8, 0, wstr.c_str(), wstr.length(), NULL, 0, NULL, NULL); std::string str(len, 0); WideCharToMultiByte( - CP_UTF8, 0, wstr.c_str(), wstr.length(), &str[0], len, NULL, NULL); + CP_UTF8, 0, wstr.c_str(), wstr.length(), &str[0], len, NULL, NULL); return str; } #endif @@ -138,20 +138,23 @@ int CompareFilenames(const std::string& lhs, const std::string& rhs) { // On Windows, use CompareStringOrdinal as that will perform case conversion // using the operating system uppercase table information, which (I think) // will give results that match the filesystem, and is not locale-dependent. 
- int result = CompareStringOrdinal(ToWinWide(lhs).c_str(), -1, ToWinWide(rhs).c_str(), -1, true); + int result = CompareStringOrdinal( + ToWinWide(lhs).c_str(), -1, ToWinWide(rhs).c_str(), -1, true); switch (result) { case CSTR_LESS_THAN: - return -1; + return -1; case CSTR_EQUAL: - return 0; + return 0; case CSTR_GREATER_THAN: - return 1; + return 1; default: - throw std::invalid_argument("One of the filenames to compare was invalid."); + throw std::invalid_argument( + "One of the filenames to compare was invalid."); } #else - using boost::locale::to_upper; - return to_upper(lhs).compare(to_upper(rhs)); + auto unicodeLhs = icu::UnicodeString::fromUTF8(lhs); + auto unicodeRhs = icu::UnicodeString::fromUTF8(rhs); + return unicodeLhs.caseCompare(unicodeRhs, U_FOLD_CASE_DEFAULT); #endif } @@ -161,7 +164,11 @@ std::string NormalizeFilename(const std::string& filename) { CharUpperBuffW(&wideString[0], wideString.length()); return FromWinWide(wideString); #else - return boost::locale::to_upper(filename); + std::string normalizedFilename; + icu::UnicodeString::fromUTF8(filename) + .foldCase(U_FOLD_CASE_DEFAULT) + .toUTF8String(normalizedFilename); + return normalizedFilename; #endif } } diff --git a/src/api/helpers/text.h b/src/api/helpers/text.h index a8c90416d..24149444f 100644 --- a/src/api/helpers/text.h +++ b/src/api/helpers/text.h @@ -38,10 +38,16 @@ std::optional ExtractVersion(const std::string& text); // Compare strings as if they're filenames, respecting filesystem case // insensitivity on Windows. Returns -1 if lhs < rhs, 0 if lhs == rhs, and 1 if -// lhs > rhs. +// lhs > rhs. The comparison may give different results on Linux, but is still +// locale-invariant. int CompareFilenames(const std::string& lhs, const std::string& rhs); -// Uppercase the given filename using an invariant locale on Windows. +// Normalize the given filename in a way that is locale-invariant. 
On Windows, +// this uppercases the filename according to the same case mapping rules as used +// by the filesystem. On Linux, case folding is used and gives results that are +// different but hopefully still consistent enough with the behaviour on Windows +// that the normalized filenames distinguish characters in a similar way to the +// Windows filesystem. std::string NormalizeFilename(const std::string& filename); } diff --git a/src/tests/api/internals/helpers/text_test.h b/src/tests/api/internals/helpers/text_test.h index fa0c9837b..32bc596c7 100644 --- a/src/tests/api/internals/helpers/text_test.h +++ b/src/tests/api/internals/helpers/text_test.h @@ -177,7 +177,6 @@ TEST(ExtractVersion, shouldPreferVersionPrefixedNumbersOverVPrefixedNumber) { EXPECT_EQ("1.0", text.value()); } -#ifdef _WIN32 // MSVC interprets source files in the default code page, so // for me u8"\xC3\x9C" != u8"\u00DC", which is a lot of fun. // To avoid insanity, write non-ASCII characters as \uXXXX escapes. @@ -188,14 +187,22 @@ TEST(ExtractVersion, shouldPreferVersionPrefixedNumbersOverVPrefixedNumber) { // \u0131 is turkish 'ı' TEST(CompareFilenames, shouldBeCaseInsensitiveAndLocaleInvariant) { + // ICU sees all three greek rhos as case-insensitively equal, unlike Windows. + // A small enough deviation that it should hopefully be insignificant. + EXPECT_EQ(0, CompareFilenames("i", "I")); EXPECT_EQ(-1, CompareFilenames("i", u8"\u0130")); EXPECT_EQ(-1, CompareFilenames("i", u8"\u0131")); EXPECT_EQ(-1, CompareFilenames("I", u8"\u0130")); EXPECT_EQ(-1, CompareFilenames("I", u8"\u0131")); EXPECT_EQ(-1, CompareFilenames(u8"\u0130", u8"\u0131")); +#ifdef _WIN32 EXPECT_EQ(1, CompareFilenames(u8"\u03f1", u8"\u03a1")); EXPECT_EQ(1, CompareFilenames(u8"\u03f1", u8"\u03c1")); +#else + EXPECT_EQ(0, CompareFilenames(u8"\u03f1", u8"\u03a1")); + EXPECT_EQ(0, CompareFilenames(u8"\u03f1", u8"\u03c1")); +#endif EXPECT_EQ(0, CompareFilenames(u8"\u03a1", u8"\u03c1")); // Set locale to Turkish. 
@@ -207,8 +214,13 @@ TEST(CompareFilenames, shouldBeCaseInsensitiveAndLocaleInvariant) { EXPECT_EQ(-1, CompareFilenames("I", u8"\u0130")); EXPECT_EQ(-1, CompareFilenames("I", u8"\u0131")); EXPECT_EQ(-1, CompareFilenames(u8"\u0130", u8"\u0131")); +#ifdef _WIN32 EXPECT_EQ(1, CompareFilenames(u8"\u03f1", u8"\u03a1")); EXPECT_EQ(1, CompareFilenames(u8"\u03f1", u8"\u03c1")); +#else + EXPECT_EQ(0, CompareFilenames(u8"\u03f1", u8"\u03a1")); + EXPECT_EQ(0, CompareFilenames(u8"\u03f1", u8"\u03c1")); +#endif EXPECT_EQ(0, CompareFilenames(u8"\u03a1", u8"\u03c1")); // Set locale to Greek. @@ -220,14 +232,20 @@ TEST(CompareFilenames, shouldBeCaseInsensitiveAndLocaleInvariant) { EXPECT_EQ(-1, CompareFilenames("I", u8"\u0130")); EXPECT_EQ(-1, CompareFilenames("I", u8"\u0131")); EXPECT_EQ(-1, CompareFilenames(u8"\u0130", u8"\u0131")); +#ifdef _WIN32 EXPECT_EQ(1, CompareFilenames(u8"\u03f1", u8"\u03a1")); EXPECT_EQ(1, CompareFilenames(u8"\u03f1", u8"\u03c1")); +#else + EXPECT_EQ(0, CompareFilenames(u8"\u03f1", u8"\u03a1")); + EXPECT_EQ(0, CompareFilenames(u8"\u03f1", u8"\u03c1")); +#endif EXPECT_EQ(0, CompareFilenames(u8"\u03a1", u8"\u03c1")); // Reset locale. std::locale::global(boost::locale::generator().generate("")); } +#ifdef _WIN32 TEST(NormalizeFilename, shouldUppercaseStringsAndBeLocaleInvariant) { EXPECT_EQ("I", NormalizeFilename("i")); EXPECT_EQ("I", NormalizeFilename("I")); @@ -262,6 +280,46 @@ TEST(NormalizeFilename, shouldUppercaseStringsAndBeLocaleInvariant) { // Reset locale. std::locale::global(boost::locale::generator().generate("")); } +#else +TEST(NormalizeFilename, shouldCaseFoldStringsAndBeLocaleInvariant) { + // ICU folds all greek rhos to the lowercase rho, unlike Windows. The result + // for uppercase turkish i is different from Windows but functionally + // equivalent. + // A small enough deviation that it should hopefully be insignificant. 
+ + EXPECT_EQ("i", NormalizeFilename("i")); + EXPECT_EQ("i", NormalizeFilename("I")); + EXPECT_EQ(u8"i\u0307", NormalizeFilename(u8"\u0130")); + EXPECT_EQ(u8"\u0131", NormalizeFilename(u8"\u0131")); + EXPECT_EQ(u8"\u03c1", NormalizeFilename(u8"\u03f1")); + EXPECT_EQ(u8"\u03c1", NormalizeFilename(u8"\u03a1")); + EXPECT_EQ(u8"\u03c1", NormalizeFilename(u8"\u03c1")); + + // Set locale to Turkish. + std::locale::global(boost::locale::generator().generate("tr_TR.UTF-8")); + + EXPECT_EQ("i", NormalizeFilename("i")); + EXPECT_EQ("i", NormalizeFilename("I")); + EXPECT_EQ(u8"i\u0307", NormalizeFilename(u8"\u0130")); + EXPECT_EQ(u8"\u0131", NormalizeFilename(u8"\u0131")); + EXPECT_EQ(u8"\u03c1", NormalizeFilename(u8"\u03f1")); + EXPECT_EQ(u8"\u03c1", NormalizeFilename(u8"\u03a1")); + EXPECT_EQ(u8"\u03c1", NormalizeFilename(u8"\u03c1")); + + // Set locale to Greek. + std::locale::global(boost::locale::generator().generate("el_GR.UTF-8")); + + EXPECT_EQ("i", NormalizeFilename("i")); + EXPECT_EQ("i", NormalizeFilename("I")); + EXPECT_EQ(u8"i\u0307", NormalizeFilename(u8"\u0130")); + EXPECT_EQ(u8"\u0131", NormalizeFilename(u8"\u0131")); + EXPECT_EQ(u8"\u03c1", NormalizeFilename(u8"\u03f1")); + EXPECT_EQ(u8"\u03c1", NormalizeFilename(u8"\u03a1")); + EXPECT_EQ(u8"\u03c1", NormalizeFilename(u8"\u03c1")); + + // Reset locale. + std::locale::global(boost::locale::generator().generate("")); +} #endif } }