Commit

update rest of the files
lunakv committed Sep 27, 2022
1 parent f0d25c6 commit 8592ba1
Showing 8 changed files with 132 additions and 335 deletions.
13 changes: 13 additions & 0 deletions app/database/operations.py
@@ -133,3 +133,16 @@ def get_cr_filename(db: Session, code: str) -> str | None:

def get_doc_filename(db: Session, date: datetime.date, table: Type[Base]) -> str | None:
return db.execute(select(table.file_name).where(table.creation_day == date)).scalar_one_or_none()


def set_pending_cr_and_diff(db: Session, new_rules: dict, new_diff: list, file_name: str):
new_cr = PendingCr(creation_day=datetime.date.today(), data=new_rules, file_name=file_name)
curr_cr_id: int = db.execute(select(Cr.id).order_by(Cr.creation_day.desc())).scalars().first()
new_diff = PendingCrDiff(creation_day=datetime.date.today(), source_id=curr_cr_id, dest=new_cr, changes=new_diff)
db.add(new_cr)
db.add(new_diff)


def upload_doc(db: Session, file_name: str, kind: Type[Base]):
new_doc = kind(creation_day=datetime.date.today(), file_name=file_name)
db.add(new_doc)
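
Note that these helpers only stage objects with db.add() and never commit; the caller owns the transaction. A minimal sketch of the calling pattern used elsewhere in this commit (SessionLocal comes from app.database.db; the file name here is made up for illustration):

import datetime

from app.database import operations as ops
from app.database.db import SessionLocal
from app.database.models import Mtr

# Hypothetical file name, used only for illustration.
file_name = f"mtr-{datetime.date.today().isoformat()}.pdf"

with SessionLocal() as session:
    with session.begin():  # commits on successful exit, rolls back on an exception
        ops.upload_doc(session, file_name, Mtr)

The same session-plus-begin() pattern appears in cr_scraper.py, refresh_cr.py, and refresh_docs.py below.
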
51 changes: 27 additions & 24 deletions app/parsing/cr_scraper.py
@@ -3,8 +3,9 @@
import requests
from bs4 import BeautifulSoup

from app.database import operations as ops
from app.database.db import SessionLocal
from app.utils.logger import logger
from ..utils import db
from ..utils.notifier import notify_scrape_error, notify_new_cr

rules_page_uri = "https://magic.wizards.com/en/rules/"
@@ -15,29 +16,31 @@ def is_txt_link(tag):


async def scrape_rules_page():
pending = await db.get_pending("cr")
if pending:
logger.debug("New CR redirect already pending, skipping scrape")
return

response = requests.get(rules_page_uri)
if response.status_code != requests.codes.ok:
notify_scrape_error(f"Couldn't fetch rules page (code {response.status_code})")
return

soup = BeautifulSoup(response.text, "html.parser")
txt_links = soup.find_all(is_txt_link)
if len(txt_links) != 1:
notify_scrape_error(f"Wrong number of TXT links found! (expected 1, got {len(txt_links)})")
return

href = txt_links[0]["href"]
href = href.replace(" ", "%20") # the last path segment sometimes has a space (kinda hacky, but whatever)

current = await db.get_redirect("cr")
if href != current:
await db.set_pending("cr", href)
notify_new_cr(href)
with SessionLocal() as session:
with session.begin():
pending = ops.get_pending_redirect(session, "cr")
if pending:
logger.debug("New CR redirect already pending, skipping scrape")
return

response = requests.get(rules_page_uri)
if response.status_code != requests.codes.ok:
notify_scrape_error(f"Couldn't fetch rules page (code {response.status_code})")
return

soup = BeautifulSoup(response.text, "html.parser")
txt_links = soup.find_all(is_txt_link)
if len(txt_links) != 1:
notify_scrape_error(f"Wrong number of TXT links found! (expected 1, got {len(txt_links)})")
return

href = txt_links[0]["href"]
href = href.replace(" ", "%20") # the last path segment sometimes has a space (kinda hacky, but whatever)

current = ops.get_redirect(session, "cr")
if href != current:
ops.set_pending(session, "cr", href)
notify_new_cr(href)


if __name__ == "__main__":
109 changes: 57 additions & 52 deletions app/parsing/docs_scraper.py
@@ -3,9 +3,12 @@

import hjson
import requests
from sqlalchemy.orm import Session

from app.database import operations as ops
from app.database.db import SessionLocal
from app.database.models import PendingRedirect
from app.utils.logger import logger
from ..utils import db
from ..utils.notifier import notify_scrape_error, notify_new_doc

docs_page_uri = "https://wpn.wizards.com/en/rules-documents/"
@@ -77,66 +80,68 @@ def get_doc_link(title, objects):
return link


async def set_broken():
await db.set_pending("__broken__", datetime.now().isoformat())
async def set_broken(session: Session):
ops.set_pending(session, "__broken__", datetime.now().isoformat())


# once an error is detected, retry only once per day instead of once per hour
async def can_scrape():
link = await db.get_pending("__broken__")
async def can_scrape(session: Session):
link: PendingRedirect | None = session.get(PendingRedirect, "__broken__")
if not link:
return True
broken_date = datetime.fromisoformat(link)
broken_date = datetime.fromisoformat(link.link)
return (datetime.now() - broken_date).days > 0


async def scrape_docs_page():
if not (await can_scrape()):
logger.info("Skipping broken scrape, retry moved to daily")
return

pending = {}
for id, _ in docs:
p = await db.get_pending(id)
if p:
pending[id] = p

if len(pending) == len(docs):
logger.debug("All policy docs already pending, skipping scrape")
return

response = requests.get(docs_page_uri)
if response.status_code != requests.codes.ok:
notify_scrape_error(f"Couldn't fetch WPN docs page (code {response.status_code})")
logger.error("Couldn't fetch WPN docs page: %s", response.reason)
await set_broken()
return

text = response.text
objects = parse_nuxt_object(text)
if not objects:
await set_broken()
return

found = {}
for id, title in docs:
f = get_doc_link(title, objects)
if f:
found[id] = f

if len(found) != len(docs):
# not all links were found correctly, so we don't wanna update anything to be safe
notify_scrape_error(f"Couldn't find links for all WPN documents")
logger.error("Couldn't find links for all WPN documents")
logger.error(found)
await set_broken()
return

for id, _ in docs:
current = await db.get_redirect(id)
if current != found[id] and (id not in pending or pending[id] != found[id]):
await db.set_pending(id, found[id])
notify_new_doc(found[id], id)
with SessionLocal() as session:
with session.begin():
if not (await can_scrape(session)):
logger.info("Skipping broken scrape, retry moved to daily")
return

pending = {}
for id, _ in docs:
p = ops.get_pending_redirect(session, id)
if p:
pending[id] = p

if len(pending) == len(docs):
logger.debug("All policy docs already pending, skipping scrape")
return

response = requests.get(docs_page_uri)
if response.status_code != requests.codes.ok:
notify_scrape_error(f"Couldn't fetch WPN docs page (code {response.status_code})")
logger.error("Couldn't fetch WPN docs page: %s", response.reason)
await set_broken(session)
return

text = response.text
objects = parse_nuxt_object(text)
if not objects:
await set_broken(session)
return

found = {}
for id, title in docs:
f = get_doc_link(title, objects)
if f:
found[id] = f

if len(found) != len(docs):
# not all links were found correctly, so we don't wanna update anything to be safe
notify_scrape_error(f"Couldn't find links for all WPN documents")
logger.error("Couldn't find links for all WPN documents")
logger.error(found)
await set_broken(session)
return

for id, _ in docs:
current = ops.get_redirect(session, id)
if current != found[id] and (id not in pending or pending[id] != found[id]):
ops.set_pending(session, id, found[id])
notify_new_doc(found[id], id)


if __name__ == "__main__":
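
The "__broken__" sentinel is stored as a PendingRedirect whose link field holds an ISO timestamp, and can_scrape only permits another attempt once a full 24 hours have elapsed, which is what turns the hourly scrape into a daily retry after a failure. A small sketch of that gating arithmetic with made-up timestamps:

from datetime import datetime

# Hypothetical timestamps, purely to illustrate the retry window.
broken_at = datetime(2022, 9, 26, 8, 0)

print((datetime(2022, 9, 27, 7, 0) - broken_at).days > 0)  # False: only 23 hours elapsed
print((datetime(2022, 9, 27, 9, 0) - broken_at).days > 0)  # True: more than 24 hours elapsed
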
16 changes: 8 additions & 8 deletions app/parsing/keyword_def.py
@@ -1,6 +1,6 @@
import re
from .extract_cr import keyword_action_regex, keyword_regex
from ..utils import db
from ..database import operations as ops

# is just rule definition (ends with a number) - we want subrules
definition = r".*\d$"
@@ -18,28 +18,28 @@ def should_skip(rule):
)


async def get_keyword_definition(rule_id):
async def get_keyword_definition(db, rule_id):
"""Keyword rules are not very useful in isolation. For example, 702.3 just says 'Defender'. To get the actual
definition, we need to go to the sub-rules. Most of the time, the first sub-rule has the definition,
but sometimes it doesn't (for example 702.3a just says 'Defender is a static ability.' which isn't particularly
useful). This method uses a simple regex heuristic to find the sub-rule that's most likely to be a keyword's
definition."""
rule = await db.fetch_rule(rule_id)
rule = ops.get_rule(db, rule_id)
while should_skip(rule):
next_rule = rule["navigation"]["nextRule"]
if not next_rule:
break # stop at the end of the road
next_rule = await db.fetch_rule(next_rule)
next_rule = ops.get_rule(db, next_rule)
if re.match(definition, next_rule["ruleNumber"]):
break # stop at the end of the rule
rule = next_rule
return rule


async def get_best_rule(rule_id):
async def get_best_rule(db, rule_id):
if re.fullmatch(keyword_action_regex, rule_id):
return await db.fetch_rule(rule_id + "a")
return ops.get_rule(db, rule_id + "a")
elif re.fullmatch(keyword_regex, rule_id):
return await get_keyword_definition(rule_id)
return await get_keyword_definition(db, rule_id)
else:
return await db.fetch_rule(rule_id) # TODO optimize connections
return ops.get_rule(db, rule_id) # TODO optimize connections
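
As a usage sketch (mirroring the router change in app/routers/rule.py below), callers now pass the database session explicitly; 702.3 is the Defender example from the docstring, and for such a keyword rule the helper walks the sub-rules until one escapes the skip heuristic:

import asyncio

from app.database.db import SessionLocal
from app.parsing.keyword_def import get_best_rule

async def defender_definition():
    # 702.3 is the keyword rule used as the example in the docstring above.
    with SessionLocal() as session:
        rule = await get_best_rule(session, "702.3")
        return rule["ruleNumber"], rule["ruleText"]

print(asyncio.run(defender_definition()))
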
29 changes: 16 additions & 13 deletions app/parsing/refresh_cr.py
@@ -3,12 +3,13 @@

import requests

from app.database import operations as ops
from app.database.db import SessionLocal
from app.parsing.difftool.diffmaker import CRDiffMaker
from app.utils.logger import logger
from . import extract_cr
from ..resources import static_paths as paths
from ..resources.cache import KeywordCache, GlossaryCache
from ..utils import db


def download_cr(uri):
@@ -37,18 +38,20 @@ def download_cr(uri):


async def refresh_cr(link):
if link is None:
link = await db.get_redirect("cr")

current_cr = await db.fetch_current_cr()
new_text, file_name = download_cr(link)
result = await extract_cr.extract(new_text)

diff_result = CRDiffMaker().diff(current_cr, result["rules"])
# TODO add to database instead?
KeywordCache().replace(result["keywords"])
GlossaryCache().replace(result["glossary"])
await db.upload_cr_and_diff(result["rules"], diff_result.diff, file_name)
with SessionLocal() as session:
with session.begin():
if link is None:
link = ops.get_redirect(session, "cr")

current_cr = ops.get_current_cr(session)
new_text, file_name = download_cr(link)
result = await extract_cr.extract(new_text)

diff_result = CRDiffMaker().diff(current_cr, result["rules"])
# TODO add to database instead?
KeywordCache().replace(result["keywords"])
GlossaryCache().replace(result["glossary"])
ops.set_pending_cr_and_diff(session, result["rules"], diff_result.diff, file_name)


if __name__ == "__main__":
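
A minimal sketch of driving the refresh manually; with link=None the function falls back to the stored "cr" redirect, as the code above shows (an asyncio entry point is assumed here, since the original __main__ block is collapsed):

import asyncio

from app.parsing.refresh_cr import refresh_cr

# None means: look up the current "cr" redirect via ops.get_redirect(session, "cr"),
# then download, diff against the current CR, and stage the pending rows.
asyncio.run(refresh_cr(None))
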
13 changes: 10 additions & 3 deletions app/parsing/refresh_docs.py
@@ -1,12 +1,15 @@
from datetime import date
from typing import Literal

from ..database import operations as ops
from ..database.db import SessionLocal
from ..database.models import Ipg, Mtr
from ..resources import static_paths as paths
import requests

from ..utils import db


async def download_doc(link: str, kind: Literal["mtr", "ipg"]):

directory = paths.docs_dir + "/" + kind
filename = kind + "-" + date.today().isoformat() + ".pdf"
filepath = directory + "/" + filename
@@ -17,4 +20,8 @@ async def download_doc(link: str, kind: Literal["mtr", "ipg"]):
for chunk in r.iter_content(chunk_size=None):
fd.write(chunk)

await db.upload_doc(filename, kind)
docType = Mtr if kind == "mtr" else Ipg

with SessionLocal() as session:
with session.begin():
ops.upload_doc(session, filename, docType)
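
A similar sketch for the document download path; the URL is made up, and the call streams the PDF under paths.docs_dir/mtr/ before recording it with ops.upload_doc in a fresh session:

import asyncio

from app.parsing.refresh_docs import download_doc

# Hypothetical link; any direct PDF URL is streamed to disk chunk by chunk.
asyncio.run(download_doc("https://example.com/documents/mtr.pdf", "mtr"))
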
2 changes: 1 addition & 1 deletion app/routers/rule.py
@@ -53,7 +53,7 @@ async def get_rule(
return {"detail": "Rule not found", "ruleNumber": rule_id}

if not exact_match:
rule = await get_best_rule(rule_id)
rule = await get_best_rule(db, rule_id)
return {"ruleNumber": rule["ruleNumber"], "ruleText": rule["ruleText"]}

