From d1d903d2d9ecae599fcb288b20c13e9593c9498d Mon Sep 17 00:00:00 2001 From: Sakkyoi Cheng <22865542+sakkyoi@users.noreply.github.com> Date: Mon, 12 Aug 2024 03:45:46 +0800 Subject: [PATCH] Implement Unicode Support closes #93 * fix full-width characters issue (https://github.com/magmax/python-inquirer/issues/432) * Fix the issue that raises a bunch of OSError exceptions in the test script (https://github.com/magmax/python-readchar/issues/93) * Fix UnicodeEncodeError when inputting emojis * add tests for new unicode support --- readchar/_win_read.py | 29 ++++++++++++++++++++--------- tests/windows/conftest.py | 13 ++++++------- tests/windows/test_readchar.py | 15 +++++++++++++++ tests/windows/test_readkey.py | 19 +++++++++++++++++++ 4 files changed, 60 insertions(+), 16 deletions(-) diff --git a/readchar/_win_read.py b/readchar/_win_read.py index c3c51c7..f58464d 100644 --- a/readchar/_win_read.py +++ b/readchar/_win_read.py @@ -4,27 +4,38 @@ def readchar() -> str: - """Reads a single character from the input stream. + """Reads a single wide (UTF-16) character from the input stream. Blocks until a character is available.""" - # manual byte decoding because some bytes in windows are not utf-8 encodable. - return chr(int.from_bytes(msvcrt.getch(), "big")) + # read a single wide character from the input + return msvcrt.getwch() def readkey() -> str: """Reads the next keypress. 
If an escaped key is pressed, the full sequence is read and returned as noted in `_win_key.py`.""" + # read first character ch = readchar() + # keys like CTRL+C should cause an interrupt if ch in config.INTERRUPT_KEYS: raise KeyboardInterrupt - # if it is a normal character: - if ch not in "\x00\xe0": - return ch + # parse special multi character keys (see key module) + # https://learn.microsoft.com/cpp/c-runtime-library/reference/getch-getwch#remarks + if ch in "\x00\xe0": + # read the second half + # we always return the 0x00 prefix, this avoids duplications in the key module + ch = "\x00" + readchar() - # if it is a scpeal key, read second half: - ch2 = readchar() + # parse unicode surrogates + # https://docs.python.org/3/c-api/unicode.html#c.Py_UNICODE_IS_SURROGATE + if "\uD800" <= ch <= "\uDFFF": + ch += readchar() - return "\x00" + ch2 + # combine the characters into a single utf-16 encoded string. + # this prevents the character from being treated as a surrogate pair again. + ch = ch.encode("utf-16", errors="surrogatepass").decode("utf-16") + + return ch diff --git a/tests/windows/conftest.py b/tests/windows/conftest.py index 39817fc..492d7d7 100644 --- a/tests/windows/conftest.py +++ b/tests/windows/conftest.py @@ -3,10 +3,6 @@ import pytest -if sys.platform in ("win32", "cygwin"): - import msvcrt - - # ignore all tests in this folder if not on windows def pytest_ignore_collect(path, config): if sys.platform not in ("win32", "cygwin"): @@ -14,10 +10,13 @@ def pytest_ignore_collect(path, config): @pytest.fixture -def patched_stdin(): +def patched_stdin(monkeypatch): class mocked_stdin: def push(self, string): - for c in string: - msvcrt.ungetch(ord(c).to_bytes(1, "big")) + # Create an iterator from the string + characters = iter(string) + + # Patch msvcrt.getwch to return the next character from the iterator. 
+ monkeypatch.setattr("msvcrt.getwch", lambda: next(characters)) return mocked_stdin() diff --git a/tests/windows/test_readchar.py b/tests/windows/test_readchar.py index a6fa882..b8a8303 100644 --- a/tests/windows/test_readchar.py +++ b/tests/windows/test_readchar.py @@ -62,3 +62,18 @@ def test_controlCharacters(seq, key, patched_stdin): def test_CTRL_Characters(seq, key, patched_stdin): patched_stdin.push(seq) assert key == readchar() + + +@pytest.mark.parametrize( + ["seq", "key"], + [ + ("\xe4", "ä"), + ("\xe1", "á"), + ("\xe5", "å"), + ("\xdf", "ß"), + ("\u304c", "が"), + ], +) +def test_Unicode_Characters(seq, key, patched_stdin): + patched_stdin.push(seq) + assert key == readchar() diff --git a/tests/windows/test_readkey.py b/tests/windows/test_readkey.py index 8cfe53f..f2baafd 100644 --- a/tests/windows/test_readkey.py +++ b/tests/windows/test_readkey.py @@ -65,3 +65,22 @@ def test_navigationKeys(seq, key, patched_stdin): def test_functionKeys(seq, key, patched_stdin): patched_stdin.push(seq) assert key == readkey() + + +@pytest.mark.parametrize( + ["seq", "key"], + [ + ("\ud83d\ude00", "😀"), + ("\ud83d\ude18", "😘"), + ("\ud83d\ude09", "😉"), + ("\ud83d\udc4d", "👍"), + ("\ud83d\udc35", "🐵"), + ("\ud83c\udf47", "🍇"), + ("\ud83c\udf83", "🎃"), + ("\ud83d\udc53", "👓"), + ("\ud83c\udfc1", "🏁"), + ], +) +def test_UnicodeSurrogates(seq, key, patched_stdin): + patched_stdin.push(seq) + assert key == readkey()