Skip to content

Commit

Permalink
Use ICU to compare / normalize filenames on Linux
Browse files Browse the repository at this point in the history
It gives results that are locale-invariant and closer to results on
Windows (though not identical).
  • Loading branch information
Ortham committed May 25, 2019
1 parent a301369 commit 306c57e
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 17 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ addons:
- libssl-dev
- libhttp-parser-dev
- libssh2-1-dev
- libicu-dev
- gcc-8
- g++-8

Expand Down
10 changes: 8 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@ ENDIF ()

find_package(Boost REQUIRED COMPONENTS locale system)

# ICU is only required on non-Windows platforms, where it is used to compare
# and normalize filenames; the Windows code path uses Win32 APIs instead.
if (NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
    find_package(ICU REQUIRED COMPONENTS uc)
    # Variable references must use the ${NAME} form: a bare $NAME is passed
    # through as literal text rather than being expanded.
    include_directories(${ICU_INCLUDE_DIRS})
    link_directories(${ICU_LIBRARY_DIRS})
endif()

ExternalProject_Add(GTest
PREFIX "external"
URL "https://github.com/google/googletest/archive/release-1.8.1.tar.gz"
Expand Down Expand Up @@ -389,12 +395,12 @@ ENDIF ()
# Build tests.
add_executable (libloot_internals_tests ${LIBLOOT_SRC} ${LIBLOOT_HEADERS} ${LOOT_TESTS_SRC} ${LOOT_TESTS_HEADERS})
add_dependencies (libloot_internals_tests esplugin libgit2 libloadorder loot-condition-interpreter spdlog yaml-cpp GTest testing-metadata testing-plugins)
target_link_libraries(libloot_internals_tests ${Boost_LIBRARIES} ${LIBGIT2_LIBRARIES} ${ESPLUGIN_LIBRARIES} ${LIBLOADORDER_LIBRARIES} ${LOOT_LIBS} ${LCI_LIBRARIES} ${YAML_CPP_LIBRARIES} ${GTEST_LIBRARIES})
target_link_libraries(libloot_internals_tests ${Boost_LIBRARIES} ${LIBGIT2_LIBRARIES} ${ESPLUGIN_LIBRARIES} ${LIBLOADORDER_LIBRARIES} ${LOOT_LIBS} ${LCI_LIBRARIES} ${YAML_CPP_LIBRARIES} ${GTEST_LIBRARIES} ${ICU_LIBRARIES})

# Build API.
add_library (loot ${LIBLOOT_SRC} ${LIBLOOT_HEADERS})
add_dependencies (loot esplugin libgit2 libloadorder loot-condition-interpreter spdlog yaml-cpp)
target_link_libraries(loot ${Boost_LIBRARIES} ${LIBGIT2_LIBRARIES} ${ESPLUGIN_LIBRARIES} ${LIBLOADORDER_LIBRARIES} ${LOOT_LIBS} ${LCI_LIBRARIES} ${YAML_CPP_LIBRARIES})
target_link_libraries(loot ${Boost_LIBRARIES} ${LIBGIT2_LIBRARIES} ${ESPLUGIN_LIBRARIES} ${LIBLOADORDER_LIBRARIES} ${LOOT_LIBS} ${LCI_LIBRARIES} ${YAML_CPP_LIBRARIES} ${ICU_LIBRARIES})

# Build API tests.
add_executable (libloot_tests ${LIBLOOT_TESTS_SRC} ${LIBLOOT_TESTS_HEADERS})
Expand Down
31 changes: 19 additions & 12 deletions src/api/helpers/text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
#ifdef _WIN32
#include "windows.h"
#else
#include <boost/locale.hpp>
#include <unicode/uchar.h>
#include <unicode/unistr.h>
#endif

using std::regex;
Expand Down Expand Up @@ -114,7 +115,6 @@ std::optional<std::string> ExtractVersion(const std::string& text) {
return std::nullopt;
}


#ifdef _WIN32
std::wstring ToWinWide(const std::string& str) {
size_t len = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), str.length(), 0, 0);
Expand All @@ -125,10 +125,10 @@ std::wstring ToWinWide(const std::string& str) {

std::string FromWinWide(const std::wstring& wstr) {
size_t len = WideCharToMultiByte(
CP_UTF8, 0, wstr.c_str(), wstr.length(), NULL, 0, NULL, NULL);
CP_UTF8, 0, wstr.c_str(), wstr.length(), NULL, 0, NULL, NULL);
std::string str(len, 0);
WideCharToMultiByte(
CP_UTF8, 0, wstr.c_str(), wstr.length(), &str[0], len, NULL, NULL);
CP_UTF8, 0, wstr.c_str(), wstr.length(), &str[0], len, NULL, NULL);
return str;
}
#endif
Expand All @@ -138,20 +138,23 @@ int CompareFilenames(const std::string& lhs, const std::string& rhs) {
// On Windows, use CompareStringOrdinal as that will perform case conversion
// using the operating system uppercase table information, which (I think)
// will give results that match the filesystem, and is not locale-dependent.
int result = CompareStringOrdinal(ToWinWide(lhs).c_str(), -1, ToWinWide(rhs).c_str(), -1, true);
int result = CompareStringOrdinal(
ToWinWide(lhs).c_str(), -1, ToWinWide(rhs).c_str(), -1, true);
switch (result) {
case CSTR_LESS_THAN:
return -1;
return -1;
case CSTR_EQUAL:
return 0;
return 0;
case CSTR_GREATER_THAN:
return 1;
return 1;
default:
throw std::invalid_argument("One of the filenames to compare was invalid.");
throw std::invalid_argument(
"One of the filenames to compare was invalid.");
}
#else
using boost::locale::to_upper;
return to_upper(lhs).compare(to_upper(rhs));
auto unicodeLhs = UnicodeString::fromUTF8(lhs);
auto unicodeRhs = UnicodeString::fromUTF8(rhs);
return unicodeLhs.caseCompare(unicodeRhs, U_FOLD_CASE_DEFAULT);
#endif
}

Expand All @@ -161,7 +164,11 @@ std::string NormalizeFilename(const std::string& filename) {
CharUpperBuffW(&wideString[0], wideString.length());
return FromWinWide(wideString);
#else
return boost::locale::to_upper(filename);
std::string normalizedFilename;
UnicodeString::fromUTF8(filename)
.foldCase(U_FOLD_CASE_DEFAULT)
.toUTF8String(normalizedFilename);
return normalizedFilename;
#endif
}
}
10 changes: 8 additions & 2 deletions src/api/helpers/text.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,16 @@ std::optional<std::string> ExtractVersion(const std::string& text);

// Compare strings as if they're filenames, respecting filesystem case
// insensitivity on Windows. Returns -1 if lhs < rhs, 0 if lhs == rhs, and 1 if
// lhs > rhs.
// lhs > rhs. The comparison may give different results on Linux, but is still
// locale-invariant.
int CompareFilenames(const std::string& lhs, const std::string& rhs);

// Uppercase the given filename using an invariant locale on Windows.
// Normalize the given filename in a way that is locale-invariant. On Windows,
// this uppercases the filename according to the same case mapping rules as used
// by the filesystem. On Linux, case folding is used and gives results that are
// different but hopefully still consistent enough with the behaviour on Windows
// that the normalized filenames distinguish characters in a similar way to the
// Windows filesystem.
std::string NormalizeFilename(const std::string& filename);
}

Expand Down
60 changes: 59 additions & 1 deletion src/tests/api/internals/helpers/text_test.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,6 @@ TEST(ExtractVersion, shouldPreferVersionPrefixedNumbersOverVPrefixedNumber) {
EXPECT_EQ("1.0", text.value());
}

#ifdef _WIN32
// MSVC interprets source files in the default code page, so
// for me u8"\xC3\x9C" != u8"\u00DC", which is a lot of fun.
// To avoid insanity, write non-ASCII characters as \uXXXX escapes.
Expand All @@ -188,14 +187,22 @@ TEST(ExtractVersion, shouldPreferVersionPrefixedNumbersOverVPrefixedNumber) {
// \u0131 is Turkish dotless 'ı'

TEST(CompareFilenames, shouldBeCaseInsensitiveAndLocaleInvariant) {
// ICU sees all three greek rhos as case-insensitively equal, unlike Windows.
// A small enough deviation that it should hopefully be insignificant.

EXPECT_EQ(0, CompareFilenames("i", "I"));
EXPECT_EQ(-1, CompareFilenames("i", u8"\u0130"));
EXPECT_EQ(-1, CompareFilenames("i", u8"\u0131"));
EXPECT_EQ(-1, CompareFilenames("I", u8"\u0130"));
EXPECT_EQ(-1, CompareFilenames("I", u8"\u0131"));
EXPECT_EQ(-1, CompareFilenames(u8"\u0130", u8"\u0131"));
#ifdef _WIN32
EXPECT_EQ(1, CompareFilenames(u8"\u03f1", u8"\u03a1"));
EXPECT_EQ(1, CompareFilenames(u8"\u03f1", u8"\u03c1"));
#else
EXPECT_EQ(0, CompareFilenames(u8"\u03f1", u8"\u03a1"));
EXPECT_EQ(0, CompareFilenames(u8"\u03f1", u8"\u03c1"));
#endif
EXPECT_EQ(0, CompareFilenames(u8"\u03a1", u8"\u03c1"));

// Set locale to Turkish.
Expand All @@ -207,8 +214,13 @@ TEST(CompareFilenames, shouldBeCaseInsensitiveAndLocaleInvariant) {
EXPECT_EQ(-1, CompareFilenames("I", u8"\u0130"));
EXPECT_EQ(-1, CompareFilenames("I", u8"\u0131"));
EXPECT_EQ(-1, CompareFilenames(u8"\u0130", u8"\u0131"));
#ifdef _WIN32
EXPECT_EQ(1, CompareFilenames(u8"\u03f1", u8"\u03a1"));
EXPECT_EQ(1, CompareFilenames(u8"\u03f1", u8"\u03c1"));
#else
EXPECT_EQ(0, CompareFilenames(u8"\u03f1", u8"\u03a1"));
EXPECT_EQ(0, CompareFilenames(u8"\u03f1", u8"\u03c1"));
#endif
EXPECT_EQ(0, CompareFilenames(u8"\u03a1", u8"\u03c1"));

// Set locale to Greek.
Expand All @@ -220,14 +232,20 @@ TEST(CompareFilenames, shouldBeCaseInsensitiveAndLocaleInvariant) {
EXPECT_EQ(-1, CompareFilenames("I", u8"\u0130"));
EXPECT_EQ(-1, CompareFilenames("I", u8"\u0131"));
EXPECT_EQ(-1, CompareFilenames(u8"\u0130", u8"\u0131"));
#ifdef _WIN32
EXPECT_EQ(1, CompareFilenames(u8"\u03f1", u8"\u03a1"));
EXPECT_EQ(1, CompareFilenames(u8"\u03f1", u8"\u03c1"));
#else
EXPECT_EQ(0, CompareFilenames(u8"\u03f1", u8"\u03a1"));
EXPECT_EQ(0, CompareFilenames(u8"\u03f1", u8"\u03c1"));
#endif
EXPECT_EQ(0, CompareFilenames(u8"\u03a1", u8"\u03c1"));

// Reset locale.
std::locale::global(boost::locale::generator().generate(""));
}

#ifdef _WIN32
TEST(NormalizeFilename, shouldUppercaseStringsAndBeLocaleInvariant) {
EXPECT_EQ("I", NormalizeFilename("i"));
EXPECT_EQ("I", NormalizeFilename("I"));
Expand Down Expand Up @@ -262,6 +280,46 @@ TEST(NormalizeFilename, shouldUppercaseStringsAndBeLocaleInvariant) {
// Reset locale.
std::locale::global(boost::locale::generator().generate(""));
}
#else
TEST(NormalizeFilename, shouldCaseFoldStringsAndBeLocaleInvariant) {
  // Unlike Windows, ICU case folding maps every Greek rho variant to the
  // lowercase rho. The Turkish dotted capital I also folds to a different
  // result than on Windows, but one that is functionally equivalent.
  // These deviations are small enough that they should hopefully be
  // insignificant.

  // The expected folded output must be the same whichever global locale is
  // active, so the same set of expectations is run under each locale.
  const auto expectFoldedFilenames = [] {
    EXPECT_EQ("i", NormalizeFilename("i"));
    EXPECT_EQ("i", NormalizeFilename("I"));
    EXPECT_EQ(u8"i\u0307", NormalizeFilename(u8"\u0130"));
    EXPECT_EQ(u8"\u0131", NormalizeFilename(u8"\u0131"));
    EXPECT_EQ(u8"\u03c1", NormalizeFilename(u8"\u03f1"));
    EXPECT_EQ(u8"\u03c1", NormalizeFilename(u8"\u03a1"));
    EXPECT_EQ(u8"\u03c1", NormalizeFilename(u8"\u03c1"));
  };

  // Replaces the global C++ locale with the one generated for the given name.
  const auto useGlobalLocale = [](const char* localeName) {
    std::locale::global(boost::locale::generator().generate(localeName));
  };

  // First pass: whatever global locale is active on entry to the test.
  expectFoldedFilenames();

  // Second pass: Turkish locale.
  useGlobalLocale("tr_TR.UTF-8");
  expectFoldedFilenames();

  // Third pass: Greek locale.
  useGlobalLocale("el_GR.UTF-8");
  expectFoldedFilenames();

  // Reset locale.
  useGlobalLocale("");
}
#endif
}
}
Expand Down

0 comments on commit 306c57e

Please sign in to comment.