Skip to content

Commit

Permalink
Moves scripts to dwdsmor.build package
Browse files Browse the repository at this point in the history
  • Loading branch information
gremid committed Nov 22, 2024
1 parent 7b53e22 commit e711f53
Show file tree
Hide file tree
Showing 8 changed files with 192 additions and 150 deletions.
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,3 @@ manifest.xml
/build/
/lib/
/releases/

/test/UD_German-HDT-r2.12/
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ For lemmatisation:

>>> import dwdsmor
>>> lemmatizer = dwdsmor.lemmatizer()
>>> assert lemmatizer("getestet", "+V") == "testen"
>>> assert lemmatizer("getestet", "+ADJ") == "getestet"
>>> assert lemmatizer("getestet", {"+V"}) == "testen"
>>> assert lemmatizer("getestet", {"+ADJ"}) == "getestet"

## Development
Expand Down
35 changes: 24 additions & 11 deletions dwdsmor/automaton.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@
"Generator",
"Analyzer",
"Automata",
"load_from_hub",
"save_to_hub",
"automata",
"Lemmatizer",
"lemmatizer",
"load_from_hub",
"save_to_hub",
]

import csv
Expand All @@ -32,10 +33,12 @@

from dotenv import dotenv_values
from huggingface_hub import ModelHubMixin, create_tag, snapshot_download
from huggingface_hub.utils import disable_progress_bars, enable_progress_bars
from sfst_transduce import CompactTransducer, Transducer

from dwdsmor.traversal import Traversal
from dwdsmor.version import __version__
from .log import logger
from .traversal import Traversal
from .version import __version__

config = {
**dotenv_values(".env.shared"),
Expand Down Expand Up @@ -73,11 +76,11 @@ def detect_root_dir() -> Path:
"""
Detect local automata directory if none is specified.
1. Try to get it from an environment variable ``DWDSMOR_AUTOMATA_ROOT``.
1. Try to get it from an environment variable ``DWDSMOR_AUTOMATA_DIR``.
2. Try to detect a development environment and get it from there.
3. Raise ``AutomataDirNotFound``.
"""
root = config.get("DWDSMOR_AUTOMATA_ROOT")
root = config.get("DWDSMOR_AUTOMATA_DIR")
if root:
return Path(root)
root = detect_dev_root_dir()
Expand Down Expand Up @@ -204,15 +207,20 @@ def _save_pretrained(self, save_directory: Path) -> None:
copy(self.root_dir / metadata_filename, save_directory)


default_repo_id = "gremid/dwdsmor-dev" # FIXME: "zentrum-lexikographie/dwdsmor-open"
default_repo_id = config.get("DWDSMOR_HF_REPO_ID", "zentrum-lexikographie/dwdsmor-open")


def load_from_hub(
repo_id: Optional[str] = None, *args, revision: Optional[str] = None, **kwargs
):
repo_id = repo_id or default_repo_id
revision = revision or f"v{__version__}"
return Automata.from_pretrained(repo_id, *args, revision=revision, **kwargs)
logger.debug("Load automata from Huggingface repo %s @ %s", repo_id, revision)
try:
disable_progress_bars()
return Automata.from_pretrained(repo_id, *args, revision=revision, **kwargs)
finally:
enable_progress_bars()


def save_to_hub(automata, repo_id, *args, tag: Optional[str] = None, **kwargs):
Expand All @@ -226,10 +234,15 @@ def automata(automata_location: Optional[str] = None, *args, **kwargs):
if automata_location is not None:
path = Path(automata_location)
if path.is_dir():
logger.debug("Load automata from local dir '%s'", str(path))
return Automata(path)
if automata_location is None:
try:
return Automata(detect_root_dir())
detected_dir = detect_root_dir()
logger.debug(
"Load automata from detected local dir '%s'", str(detected_dir)
)
return Automata(detected_dir)
except AutomataDirNotFound:
pass
return load_from_hub(automata_location, *args, **kwargs)
Expand All @@ -239,9 +252,9 @@ class Lemmatizer:
def __init__(self, automata, automaton_type="lemma"):
self.analyzer = automata.analyzer(automaton_type)

def __call__(self, word, pos=None):
def __call__(self, word, pos_set=None):
for traversal in self.analyzer.analyze(word):
if pos is None or pos == traversal.pos:
if pos_set is None or traversal.pos in pos_set:
return traversal.analysis


Expand Down
37 changes: 15 additions & 22 deletions scripts/build → dwdsmor/build/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,26 @@
import argparse
import csv
from datetime import datetime
import logging
import lzma
import os
from pathlib import Path
import subprocess
import sys

from dwdsmor.tag import all_tags
from dwdsmor.traversal import Traversal
from ..log import logger
from ..tag import all_tags
from ..traversal import Traversal


project_dir = Path(__file__).parent.parent
project_dir = Path(".").resolve()

xsl_dir = project_dir / "share"
grammar_dir = project_dir / "grammar"
lexicon_dir = project_dir / "lexicon"
build_dir = project_dir / "build"

logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
stream=sys.stderr,
level=(logging.DEBUG if os.getenv("DEBUG") else logging.INFO),
)


def run(cmd, output_file=None, log_file=None, **args):
"Run a subprocess, optionally redirecting stdout/stderr to files."
logging.debug("! %s", cmd)
logger.debug("! %s", cmd)

args["capture_output"] = (
args["capture_output"]
Expand Down Expand Up @@ -125,10 +116,10 @@ def build_lexicon(edition_dir, force=False):
lex_log = edition_build_dir / "lex.log"

if not force and is_current(lex_txt, sources):
logging.debug("Skip building lexicon '%s'", edition_name)
logger.debug("Skip building lexicon '%s'", edition_name)
return False

logging.info("Building lexicon '%s'", edition_name)
logger.info("Building lexicon '%s'", edition_name)
edition_build_dir.mkdir(parents=True, exist_ok=True)
saxon(
xsl_dir / "dwds2manifest.xsl",
Expand Down Expand Up @@ -174,10 +165,10 @@ def build_automaton(edition_dir, automaton_type, force=False):
and is_current(automaton_a, sources)
and is_current(automaton_ca, sources)
):
logging.debug("Skip building automaton '%s/%s'", edition_name, automaton_type)
logger.debug("Skip building automaton '%s/%s'", edition_name, automaton_type)
return False

logging.info("Building automaton '%s/%s'", edition_name, automaton_type)
logger.info("Building automaton '%s/%s'", edition_name, automaton_type)
automaton_src = grammar_dir / f"{edition_name}-{automaton_type}.fst"
try:
edition_build_dir.mkdir(parents=True, exist_ok=True)
Expand Down Expand Up @@ -211,12 +202,12 @@ def build_traversals(edition_dir, automaton_type, force=False):
automaton_traversal = edition_build_dir / f"{automaton_type}.csv.lzma"

if not force and is_current(automaton_traversal, [automaton_a]):
logging.debug(
logger.debug(
"Skip building full traversal of '%s/%s'", edition_name, automaton_type
)
return False

logging.info("Building full traversal of '%s/%s'", edition_name, automaton_type)
logger.info("Building full traversal of '%s/%s'", edition_name, automaton_type)
with subprocess.Popen(
["fst-generate", automaton_a.as_posix()],
encoding="utf-8",
Expand Down Expand Up @@ -255,7 +246,9 @@ def stamp_build(edition_dir):


if __name__ == "__main__":
arg_parser = argparse.ArgumentParser(description="Build DWDSmor automata.")
arg_parser = argparse.ArgumentParser(
prog=__package__, description="Build DWDSmor automata."
)
arg_parser.add_argument(
"editions", help="Editions to build (all by default)", nargs="*"
)
Expand All @@ -281,7 +274,7 @@ def stamp_build(edition_dir):
assert edition_dir.is_dir()
edition_name = edition_dir.name
if not has_sources(edition_dir):
logging.info("Skipping edition '%s' without sources", edition_name)
logger.info("Skipping edition '%s' without sources", edition_name)
continue
edition_built = build_lexicon(edition_dir, force=args.force)
for automaton_type in automaton_types:
Expand Down
Loading

0 comments on commit e711f53

Please sign in to comment.