From d1d903d2d9ecae599fcb288b20c13e9593c9498d Mon Sep 17 00:00:00 2001
From: Sakkyoi Cheng <22865542+sakkyoi@users.noreply.github.com>
Date: Mon, 12 Aug 2024 03:45:46 +0800
Subject: [PATCH] Implement Unicode Support

closes #93

* Fix handling of full-width characters (https://github.com/magmax/python-inquirer/issues/432)

* Fix the issue that raises a bunch of OSError exceptions in the test script (https://github.com/magmax/python-readchar/issues/93)

* Fix UnicodeEncodeError when inputting emojis

* Add tests for the new Unicode support
---
 readchar/_win_read.py          | 29 ++++++++++++++++++++---------
 tests/windows/conftest.py      | 13 ++++++-------
 tests/windows/test_readchar.py | 15 +++++++++++++++
 tests/windows/test_readkey.py  | 19 +++++++++++++++++++
 4 files changed, 60 insertions(+), 16 deletions(-)

diff --git a/readchar/_win_read.py b/readchar/_win_read.py
index c3c51c7..f58464d 100644
--- a/readchar/_win_read.py
+++ b/readchar/_win_read.py
@@ -4,27 +4,38 @@
 
 
 def readchar() -> str:
-    """Reads a single character from the input stream.
+    """Reads a single wide (UTF-16) character from the input stream.
     Blocks until a character is available."""
 
-    # manual byte decoding because some bytes in windows are not utf-8 encodable.
-    return chr(int.from_bytes(msvcrt.getch(), "big"))
+    # read a single wide character from the input
+    return msvcrt.getwch()
 
 
 def readkey() -> str:
     """Reads the next keypress. If an escaped key is pressed, the full
     sequence is read and returned as noted in `_win_key.py`."""
 
+    # read first character
     ch = readchar()
 
+    # keys like CTRL+C should cause an interrupt
     if ch in config.INTERRUPT_KEYS:
         raise KeyboardInterrupt
 
-    # if it is a normal character:
-    if ch not in "\x00\xe0":
-        return ch
+    # parse special multi character keys (see key module)
+    # https://learn.microsoft.com/cpp/c-runtime-library/reference/getch-getwch#remarks
+    if ch in "\x00\xe0":
+        # read the second half
+        # we always return the 0x00 prefix; this avoids duplications in the key module
+        ch = "\x00" + readchar()
 
-    # if it is a scpeal key, read second half:
-    ch2 = readchar()
+    # parse unicode surrogates
+    # https://docs.python.org/3/c-api/unicode.html#c.Py_UNICODE_IS_SURROGATE
+    if "\uD800" <= ch <= "\uDFFF":
+        ch += readchar()
 
-    return "\x00" + ch2
+        # round-trip through UTF-16 to merge the pair into one code point;
+        # otherwise the result would remain two separate lone surrogates.
+        ch = ch.encode("utf-16", errors="surrogatepass").decode("utf-16")
+
+    return ch
diff --git a/tests/windows/conftest.py b/tests/windows/conftest.py
index 39817fc..492d7d7 100644
--- a/tests/windows/conftest.py
+++ b/tests/windows/conftest.py
@@ -3,10 +3,6 @@
 import pytest
 
 
-if sys.platform in ("win32", "cygwin"):
-    import msvcrt
-
-
 # ignore all tests in this folder if not on windows
 def pytest_ignore_collect(path, config):
     if sys.platform not in ("win32", "cygwin"):
@@ -14,10 +10,13 @@ def pytest_ignore_collect(path, config):
 
 
 @pytest.fixture
-def patched_stdin():
+def patched_stdin(monkeypatch):
     class mocked_stdin:
         def push(self, string):
-            for c in string:
-                msvcrt.ungetch(ord(c).to_bytes(1, "big"))
+            # Create an iterator from the string
+            characters = iter(string)
+
+            # Patch msvcrt.getwch to return the next character from the iterator.
+            monkeypatch.setattr("msvcrt.getwch", lambda: next(characters))
 
     return mocked_stdin()
diff --git a/tests/windows/test_readchar.py b/tests/windows/test_readchar.py
index a6fa882..b8a8303 100644
--- a/tests/windows/test_readchar.py
+++ b/tests/windows/test_readchar.py
@@ -62,3 +62,18 @@ def test_controlCharacters(seq, key, patched_stdin):
 def test_CTRL_Characters(seq, key, patched_stdin):
     patched_stdin.push(seq)
     assert key == readchar()
+
+
+@pytest.mark.parametrize(
+    ["seq", "key"],
+    [
+        ("\xe4", "ä"),
+        ("\xe1", "á"),
+        ("\xe5", "å"),
+        ("\xdf", "ß"),
+        ("\u304c", "が"),
+    ],
+)
+def test_Unicode_Characters(seq, key, patched_stdin):
+    patched_stdin.push(seq)
+    assert key == readchar()
diff --git a/tests/windows/test_readkey.py b/tests/windows/test_readkey.py
index 8cfe53f..f2baafd 100644
--- a/tests/windows/test_readkey.py
+++ b/tests/windows/test_readkey.py
@@ -65,3 +65,22 @@ def test_navigationKeys(seq, key, patched_stdin):
 def test_functionKeys(seq, key, patched_stdin):
     patched_stdin.push(seq)
     assert key == readkey()
+
+
+@pytest.mark.parametrize(
+    ["seq", "key"],
+    [
+        ("\ud83d\ude00", "😀"),
+        ("\ud83d\ude18", "😘"),
+        ("\ud83d\ude09", "😉"),
+        ("\ud83d\udc4d", "👍"),
+        ("\ud83d\udc35", "🐵"),
+        ("\ud83c\udf47", "🍇"),
+        ("\ud83c\udf83", "🎃"),
+        ("\ud83d\udc53", "👓"),
+        ("\ud83c\udfc1", "🏁"),
+    ],
+)
+def test_UnicodeSurrogates(seq, key, patched_stdin):
+    patched_stdin.push(seq)
+    assert key == readkey()