use encoding-guessing heuristic

lunakv · Nov 15, 2022 · 491dd7d · 491dd7d
1 parent 366be61
commit 491dd7d
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 26 deletions.
diff --git a/README.md b/README.md
@@ -10,19 +10,18 @@ It is used to power the [Academy Ruins](https://github.com/lunakv/academyruins)
 ### Prerequisites
 - Python 3.10 or later
 - PostgreSQL 14
+- (optional) [Pushover](https://pushover.net/) account
 
 ### Installation
-0. (recommended) Set up a virtual Python environment for the repository.
-1. Set up Postgres and create a user and database for your API.
-2. (optional) Set up a [Pushover](https://pushover.net/) account and create an API token for the app.
-3. `cp .env_EXAMPLE .env`
-4. Adjust the values in your `.env` file according to your local configuration
-5. Install [Poetry](https://python-poetry.org/), either globally or inside your venv.
-6. (Inside your venv) run `poetry install`
-7. (Inside your venv) run `./update_schema.sh` to load the current schema into the database.
+1. Install the [Poetry](https://python-poetry.org/docs/#installation) package manager
+2. `poetry install`
+3. Start Postgres and create a user and database for your API.
+4. `cp .env_EXAMPLE .env`
+5. Adjust the values in your `.env` file according to your local configuration
+6. `poetry run ./update_schema.sh` to load the current schema into the database.
 
 ### Run
-`python devstart.py`
+`poetry run python devstart.py`
 
 The API server will start on port 8000 by default.
 
@@ -35,4 +34,4 @@ This project uses the [Black](https://black.readthedocs.io/en/stable/) code form
 The full API docs are available at https://api.academyruins.com/docs
 
 ## Data
-All data used by the site, both raw and processed, is periodically backed up to public a [Backblaze B2](https://www.backblaze.com/b2/) (S3 compatible) bucket. If you need to access that data for some reason, you can send me a message through one of the channels specified [on the site](https://academyruins.com/about) and request access to the bucket. 
+All data used by the site, both raw and processed, is periodically backed up to a public [Backblaze B2](https://www.backblaze.com/b2/) (S3 compatible) bucket. You can view the contents of this bucket at <https://backup.academyruins.com>. If you need bulk/programmatic access to that data for some reason, send me a message through one of the channels specified [on the site](https://academyruins.com/about). 
diff --git a/app/parsing/refresh_cr.py b/app/parsing/refresh_cr.py
@@ -1,31 +1,60 @@
 import datetime
 import re
-
 import requests
 
 from app.database import operations as ops
 from app.database.db import SessionLocal
 from app.parsing.difftool.diffmaker import CRDiffMaker
+from app.utils import notifier
 from app.utils.logger import logger
 from . import extract_cr
 from ..resources import static_paths as paths
 from ..resources.cache import KeywordCache, GlossaryCache
 
 
-def download_cr(uri):
+def get_response_text(response: requests.Response) -> str | None:
+    """
+    Since WotC can't just decide on a consistent character encoding for its text files, and I don't really care for
+    manually changing it every other set, this method performs a simple heuristic to decide which encodings should be
+    tried out.
+
+    It runs through a list of common encodings and checks whether the file
+    a) starts with the phrase "Magic: The Gathering", and
+    b) contains some properly encoded common phrases that I don't expect to disappear from the CR anytime soon.
+
+    It also re-formats the text by replacing all line endings with just LF and removing a BOM if present.
+    """
+    encodings = [response.encoding, "UTF-8", "UTF-16BE", "WINDOWS-1252", "UTF-16LE", "ISO-8859-1"]
+
+    starting_phrase = "Magic: The Gathering"
+    # some phrases with non-ASCII diacritics (mostly Arabian Nights card names)
+    phrases = ["Magic: The Gathering®", "™", "Ring of Ma’rûf", "Dandân", "Ghazbán Ogre"]
+
+    bom = re.compile("^\ufeff")  # matches a byte-order mark at the very start of the text
+
+    for encoding in encodings:
+        response.encoding = encoding  # candidate encoding for the response.text read below
+        text = response.text
+        text = bom.sub("", text)  # drop the BOM first so the startswith check can match
+        if text.startswith(starting_phrase) and all((phrase in text) for phrase in phrases):
+            return text.replace("\r\n", "\n").replace("\r", "\n")  # CRLF first, then any lone CR
+
+    return None  # no candidate encoding passed both checks
+
+
+def download_cr(uri: str) -> tuple[str, str] | None:
     response = requests.get(uri)
     if not response.ok:
-        logger.error(f"Couldn't download CR from link (code {response.status_code}). Tried link: {uri}")
-        return
-
-    # WotC refuses to use UTF-8, and the autodetection falsely returns ISO-8859-1, but they actually use WINDOWS-1252
-    response.encoding = "WINDOWS-1252"
-    text = response.text
-    # replace CR and CRLF with LF
-    text = text.replace("\r\n", "\n").replace("\r", "\n")
-    # remove BOM
-    bom = re.compile("^\ufeff")
-    text = re.sub(bom, "", text)
+        msg = f"Couldn't download CR from link (code {response.status_code}). Tried link: {uri}"
+        logger.error(msg)
+        notifier.notify(msg, "CR parsing error", uri, "Tried link")
+        return None
+
+    text = get_response_text(response)
+    if text is None:
+        logger.error("Couldn't determine encoding for new CR")
+        notifier.notify("Couldn't determine encoding for new CR", "CR parsing error")
+        return None
 
     # save to file
     file_name = "cr-" + datetime.date.today().isoformat() + ".txt"
@@ -45,9 +74,12 @@ async def refresh_cr(link):
                 link = ops.get_redirect(session, "cr")
 
             current_cr = ops.get_current_cr(session)
-            new_text, file_name = download_cr(link)
-            result = await extract_cr.extract(new_text)
+            new_cr = download_cr(link)
+            if new_cr is None:
+                return
+            new_text, file_name = new_cr
 
+            result = await extract_cr.extract(new_text)
             diff_result = CRDiffMaker().diff(current_cr, result["rules"])
             # TODO add to database instead?
             KeywordCache().replace(result["keywords"])

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -27,6 +27,7 @@ thefuzz = {extras = ["speedup"], version = "^0.19.0"}
 [tool.poetry.group.dev.dependencies]
 python-dotenv = "^0.21.0"
 black = "^22.8.0"
+alembic = "^1.8.1"
 
 [build-system]
 requires = ["poetry-core"]