Commit

update rest of the files
lunakv committed Sep 27, 2022
1 parent f0d25c6 commit 8592ba1
Showing 8 changed files with 132 additions and 335 deletions.
13 changes: 13 additions & 0 deletions app/database/operations.py
@@ -133,3 +133,16 @@ def get_cr_filename(db: Session, code: str) -> str | None:

def get_doc_filename(db: Session, date: datetime.date, table: Type[Base]) -> str | None:
return db.execute(select(table.file_name).where(table.creation_day == date)).scalar_one_or_none()


def set_pending_cr_and_diff(db: Session, new_rules: dict, new_diff: list, file_name: str):
new_cr = PendingCr(creation_day=datetime.date.today(), data=new_rules, file_name=file_name)
curr_cr_id: int = db.execute(select(Cr.id).order_by(Cr.creation_day.desc())).scalars().first()
new_diff = PendingCrDiff(creation_day=datetime.date.today(), source_id=curr_cr_id, dest=new_cr, changes=new_diff)
db.add(new_cr)
db.add(new_diff)


def upload_doc(db: Session, file_name: str, kind: Type[Base]):
new_doc = kind(creation_day=datetime.date.today(), file_name=file_name)
db.add(new_doc)
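
Note that these helpers only stage objects with db.add() and never commit; the caller owns the transaction. A minimal sketch of the calling pattern used elsewhere in this commit (SessionLocal comes from app.database.db; the file name here is made up for illustration):

import datetime

from app.database import operations as ops
from app.database.db import SessionLocal
from app.database.models import Mtr

# Hypothetical file name, used only for illustration.
file_name = f"mtr-{datetime.date.today().isoformat()}.pdf"

with SessionLocal() as session:
    with session.begin():  # commits on successful exit, rolls back on an exception
        ops.upload_doc(session, file_name, Mtr)

The same session-plus-begin() pattern appears in cr_scraper.py, refresh_cr.py, and refresh_docs.py below.
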
51 changes: 27 additions & 24 deletions app/parsing/cr_scraper.py
@@ -3,8 +3,9 @@
import requests
from bs4 import BeautifulSoup

from app.database import operations as ops
from app.database.db import SessionLocal
from app.utils.logger import logger
from ..utils import db
from ..utils.notifier import notify_scrape_error, notify_new_cr

rules_page_uri = "https://magic.wizards.com/en/rules/"
@@ -15,29 +16,31 @@ def is_txt_link(tag):


async def scrape_rules_page():
pending = await db.get_pending("cr")
if pending:
logger.debug("New CR redirect already pending, skipping scrape")
return

response = requests.get(rules_page_uri)
if response.status_code != requests.codes.ok:
notify_scrape_error(f"Couldn't fetch rules page (code {response.status_code})")
return

soup = BeautifulSoup(response.text, "html.parser")
txt_links = soup.find_all(is_txt_link)
if len(txt_links) != 1:
notify_scrape_error(f"Wrong number of TXT links found! (expected 1, got {len(txt_links)})")
return

href = txt_links[0]["href"]
href = href.replace(" ", "%20") # the last path segment sometimes has a space (kinda hacky, but whatever)

current = await db.get_redirect("cr")
if href != current:
await db.set_pending("cr", href)
notify_new_cr(href)
with SessionLocal() as session:
with session.begin():
pending = ops.get_pending_redirect(session, "cr")
if pending:
logger.debug("New CR redirect already pending, skipping scrape")
return

response = requests.get(rules_page_uri)
if response.status_code != requests.codes.ok:
notify_scrape_error(f"Couldn't fetch rules page (code {response.status_code})")
return

soup = BeautifulSoup(response.text, "html.parser")
txt_links = soup.find_all(is_txt_link)
if len(txt_links) != 1:
notify_scrape_error(f"Wrong number of TXT links found! (expected 1, got {len(txt_links)})")
return

href = txt_links[0]["href"]
href = href.replace(" ", "%20") # the last path segment sometimes has a space (kinda hacky, but whatever)

current = ops.get_redirect(session, "cr")
if href != current:
ops.set_pending(session, "cr", href)
notify_new_cr(href)


if __name__ == "__main__":
109 changes: 57 additions & 52 deletions app/parsing/docs_scraper.py
@@ -3,9 +3,12 @@

import hjson
import requests
from sqlalchemy.orm import Session

from app.database import operations as ops
from app.database.db import SessionLocal
from app.database.models import PendingRedirect
from app.utils.logger import logger
from ..utils import db
from ..utils.notifier import notify_scrape_error, notify_new_doc

docs_page_uri = "https://wpn.wizards.com/en/rules-documents/"
@@ -77,66 +80,68 @@ def get_doc_link(title, objects):
return link


async def set_broken():
await db.set_pending("__broken__", datetime.now().isoformat())
async def set_broken(session: Session):
ops.set_pending(session, "__broken__", datetime.now().isoformat())


# once an error is detected, retry only once per day instead of once per hour
async def can_scrape():
link = await db.get_pending("__broken__")
async def can_scrape(session: Session):
link: PendingRedirect | None = session.get(PendingRedirect, "__broken__")
if not link:
return True
broken_date = datetime.fromisoformat(link)
broken_date = datetime.fromisoformat(link.link)
return (datetime.now() - broken_date).days > 0


async def scrape_docs_page():
if not (await can_scrape()):
logger.info("Skipping broken scrape, retry moved to daily")
return

pending = {}
for id, _ in docs:
p = await db.get_pending(id)
if p:
pending[id] = p

if len(pending) == len(docs):
logger.debug("All policy docs already pending, skipping scrape")
return

response = requests.get(docs_page_uri)
if response.status_code != requests.codes.ok:
notify_scrape_error(f"Couldn't fetch WPN docs page (code {response.status_code})")
logger.error("Couldn't fetch WPN docs page: %s", response.reason)
await set_broken()
return

text = response.text
objects = parse_nuxt_object(text)
if not objects:
await set_broken()
return

found = {}
for id, title in docs:
f = get_doc_link(title, objects)
if f:
found[id] = f

if len(found) != len(docs):
# not all links were found correctly, so we don't wanna update anything to be safe
notify_scrape_error(f"Couldn't find links for all WPN documents")
logger.error("Couldn't find links for all WPN documents")
logger.error(found)
await set_broken()
return

for id, _ in docs:
current = await db.get_redirect(id)
if current != found[id] and (id not in pending or pending[id] != found[id]):
await db.set_pending(id, found[id])
notify_new_doc(found[id], id)
with SessionLocal() as session:
with session.begin():
if not (await can_scrape(session)):
logger.info("Skipping broken scrape, retry moved to daily")
return

pending = {}
for id, _ in docs:
p = ops.get_pending_redirect(session, id)
if p:
pending[id] = p

if len(pending) == len(docs):
logger.debug("All policy docs already pending, skipping scrape")
return

response = requests.get(docs_page_uri)
if response.status_code != requests.codes.ok:
notify_scrape_error(f"Couldn't fetch WPN docs page (code {response.status_code})")
logger.error("Couldn't fetch WPN docs page: %s", response.reason)
await set_broken(session)
return

text = response.text
objects = parse_nuxt_object(text)
if not objects:
await set_broken(session)
return

found = {}
for id, title in docs:
f = get_doc_link(title, objects)
if f:
found[id] = f

if len(found) != len(docs):
# not all links were found correctly, so we don't wanna update anything to be safe
notify_scrape_error(f"Couldn't find links for all WPN documents")
logger.error("Couldn't find links for all WPN documents")
logger.error(found)
await set_broken(session)
return

for id, _ in docs:
current = ops.get_redirect(session, id)
if current != found[id] and (id not in pending or pending[id] != found[id]):
ops.set_pending(session, id, found[id])
notify_new_doc(found[id], id)


if __name__ == "__main__":
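
The "__broken__" sentinel is stored as a PendingRedirect whose link field holds an ISO timestamp, and can_scrape only permits another attempt once a full 24 hours have elapsed, which is what turns the hourly scrape into a daily retry after a failure. A small sketch of that gating arithmetic with made-up timestamps:

from datetime import datetime

# Hypothetical timestamps, purely to illustrate the retry window.
broken_at = datetime(2022, 9, 26, 8, 0)

print((datetime(2022, 9, 27, 7, 0) - broken_at).days > 0)  # False: only 23 hours elapsed
print((datetime(2022, 9, 27, 9, 0) - broken_at).days > 0)  # True: more than 24 hours elapsed
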
16 changes: 8 additions & 8 deletions app/parsing/keyword_def.py
@@ -1,6 +1,6 @@
import re
from .extract_cr import keyword_action_regex, keyword_regex
from ..utils import db
from ..database import operations as ops

# is just rule definition (ends with a number) - we want subrules
definition = r".*\d$"
@@ -18,28 +18,28 @@ def should_skip(rule):
)


async def get_keyword_definition(rule_id):
async def get_keyword_definition(db, rule_id):
"""Keyword rules are not very useful in isolation. For example, 702.3 just says 'Defender'. To get the actual
definition, we need to go to the sub-rules. Most of the time, the first sub-rule has the definition,
but sometimes it doesn't (for example 702.3a just says 'Defender is a static ability.' which isn't particularly
useful). This method uses a simple regex heuristic to find the sub-rule that's most likely to be a keyword's
definition."""
rule = await db.fetch_rule(rule_id)
rule = ops.get_rule(db, rule_id)
while should_skip(rule):
next_rule = rule["navigation"]["nextRule"]
if not next_rule:
break # stop at the end of the road
next_rule = await db.fetch_rule(next_rule)
next_rule = ops.get_rule(db, next_rule)
if re.match(definition, next_rule["ruleNumber"]):
break # stop at the end of the rule
rule = next_rule
return rule


async def get_best_rule(rule_id):
async def get_best_rule(db, rule_id):
if re.fullmatch(keyword_action_regex, rule_id):
return await db.fetch_rule(rule_id + "a")
return ops.get_rule(db, rule_id + "a")
elif re.fullmatch(keyword_regex, rule_id):
return await get_keyword_definition(rule_id)
return await get_keyword_definition(db, rule_id)
else:
return await db.fetch_rule(rule_id) # TODO optimize connections
return ops.get_rule(db, rule_id) # TODO optimize connections
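
As a usage sketch (mirroring the router change in app/routers/rule.py below), callers now pass the database session explicitly; 702.3 is the Defender example from the docstring, and for such a keyword rule the helper walks the sub-rules until one escapes the skip heuristic:

import asyncio

from app.database.db import SessionLocal
from app.parsing.keyword_def import get_best_rule

async def defender_definition():
    # 702.3 is the keyword rule used as the example in the docstring above.
    with SessionLocal() as session:
        rule = await get_best_rule(session, "702.3")
        return rule["ruleNumber"], rule["ruleText"]

print(asyncio.run(defender_definition()))
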
29 changes: 16 additions & 13 deletions app/parsing/refresh_cr.py
@@ -3,12 +3,13 @@

import requests

from app.database import operations as ops
from app.database.db import SessionLocal
from app.parsing.difftool.diffmaker import CRDiffMaker
from app.utils.logger import logger
from . import extract_cr
from ..resources import static_paths as paths
from ..resources.cache import KeywordCache, GlossaryCache
from ..utils import db


def download_cr(uri):
@@ -37,18 +38,20 @@ def download_cr(uri):


async def refresh_cr(link):
if link is None:
link = await db.get_redirect("cr")

current_cr = await db.fetch_current_cr()
new_text, file_name = download_cr(link)
result = await extract_cr.extract(new_text)

diff_result = CRDiffMaker().diff(current_cr, result["rules"])
# TODO add to database instead?
KeywordCache().replace(result["keywords"])
GlossaryCache().replace(result["glossary"])
await db.upload_cr_and_diff(result["rules"], diff_result.diff, file_name)
with SessionLocal() as session:
with session.begin():
if link is None:
link = ops.get_redirect(session, "cr")

current_cr = ops.get_current_cr(session)
new_text, file_name = download_cr(link)
result = await extract_cr.extract(new_text)

diff_result = CRDiffMaker().diff(current_cr, result["rules"])
# TODO add to database instead?
KeywordCache().replace(result["keywords"])
GlossaryCache().replace(result["glossary"])
ops.set_pending_cr_and_diff(session, result["rules"], diff_result.diff, file_name)


if __name__ == "__main__":
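
A minimal sketch of driving the refresh manually; with link=None the function falls back to the stored "cr" redirect, as the code above shows (an asyncio entry point is assumed here, since the original __main__ block is collapsed):

import asyncio

from app.parsing.refresh_cr import refresh_cr

# None means: look up the current "cr" redirect via ops.get_redirect(session, "cr"),
# then download, diff against the current CR, and stage the pending rows.
asyncio.run(refresh_cr(None))
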
13 changes: 10 additions & 3 deletions app/parsing/refresh_docs.py
@@ -1,12 +1,15 @@
from datetime import date
from typing import Literal

from ..database import operations as ops
from ..database.db import SessionLocal
from ..database.models import Ipg, Mtr
from ..resources import static_paths as paths
import requests

from ..utils import db


async def download_doc(link: str, kind: Literal["mtr", "ipg"]):

directory = paths.docs_dir + "/" + kind
filename = kind + "-" + date.today().isoformat() + ".pdf"
filepath = directory + "/" + filename
@@ -17,4 +20,8 @@ async def download_doc(link: str, kind: Literal["mtr", "ipg"]):
for chunk in r.iter_content(chunk_size=None):
fd.write(chunk)

await db.upload_doc(filename, kind)
docType = Mtr if kind == "mtr" else Ipg

with SessionLocal() as session:
with session.begin():
ops.upload_doc(session, filename, docType)
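
A similar sketch for the document download path; the URL is made up, and the call streams the PDF under paths.docs_dir/mtr/ before recording it with ops.upload_doc in a fresh session:

import asyncio

from app.parsing.refresh_docs import download_doc

# Hypothetical link; any direct PDF URL is streamed to disk chunk by chunk.
asyncio.run(download_doc("https://example.com/documents/mtr.pdf", "mtr"))
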
2 changes: 1 addition & 1 deletion app/routers/rule.py
@@ -53,7 +53,7 @@ async def get_rule(
return {"detail": "Rule not found", "ruleNumber": rule_id}

if not exact_match:
rule = await get_best_rule(rule_id)
rule = await get_best_rule(db, rule_id)
return {"ruleNumber": rule["ruleNumber"], "ruleText": rule["ruleText"]}

