Skip to content

Commit

Permalink
Add a pipeline for parsing mimosa papers
Browse files Browse the repository at this point in the history
  • Loading branch information
rafelafrance committed Sep 26, 2024
1 parent 182a0d3 commit 0c0ebc3
Show file tree
Hide file tree
Showing 11 changed files with 129 additions and 31 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.11'
python-version: '3.12'
- name: Install dependencies
run: |
make install
Expand Down
36 changes: 15 additions & 21 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,41 +1,35 @@
.PHONY: test install dev venv clean activate base
.ONESHELL:

VENV=.venv
PY_VER=python3.11
PYTHON=./$(VENV)/bin/$(PY_VER)
PIP_INSTALL=$(PYTHON) -m pip install
SPACY_MODEL=$(PYTHON) -m spacy download en_core_web_md

test: activate
export MOCK_TRAITER=1
$(PYTHON) -m unittest discover
./.venv/bin/python3.12 -m unittest discover
export MOCK_TRAITER=0

install: venv activate base
$(PIP_INSTALL) git+https://github.com/rafelafrance/common_utils.git@main#egg=common_utils
$(PIP_INSTALL) git+https://github.com/rafelafrance/spell-well.git@main#egg=spell-well
$(PIP_INSTALL) git+https://github.com/rafelafrance/traiter.git@master#egg=traiter
$(PIP_INSTALL) .
$(SPACY_MODEL)
./.venv/bin/python3.12 -m pip install git+https://github.com/rafelafrance/common_utils.git@main#egg=common_utils
./.venv/bin/python3.12 -m pip install git+https://github.com/rafelafrance/spell-well.git@main#egg=spell-well
./.venv/bin/python3.12 -m pip install git+https://github.com/rafelafrance/traiter.git@master#egg=traiter
./.venv/bin/python3.12 -m pip install .
./.venv/bin/python3.12 -m spacy download en_core_web_md

dev: venv activate base
$(PIP_INSTALL) -e ../../misc/common_utils
$(PIP_INSTALL) -e ../../misc/spell-well
$(PIP_INSTALL) -e ../../traiter/traiter
$(PIP_INSTALL) -e .[dev]
$(SPACY_MODEL)
./.venv/bin/python3.12 -m pip install -e ../../misc/common_utils
./.venv/bin/python3.12 -m pip install -e ../../misc/spell-well
./.venv/bin/python3.12 -m pip install -e ../../traiter/traiter
./.venv/bin/python3.12 -m pip install -e .[dev]
./.venv/bin/python3.12 -m spacy download en_core_web_md
pre-commit install

activate:
. $(VENV)/bin/activate
. .venv/bin/activate

base:
$(PIP_INSTALL) -U pip setuptools wheel
./.venv/bin/python3.12 -m pip install -U pip setuptools wheel

venv:
test -d $(VENV) || $(PY_VER) -m venv $(VENV)
test -d .venv || python3.12 -m venv .venv

clean:
rm -r $(VENV)
rm -r .venv
find -iname "*.pyc" -delete
1 change: 0 additions & 1 deletion flora/parse_treatments.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def write_json(treatments, json_dir):

def parse_args() -> argparse.Namespace:
arg_parser = argparse.ArgumentParser(
fromfile_prefix_chars="@",
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent(
"""
Expand Down
4 changes: 2 additions & 2 deletions flora/pylib/labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@
from flora.pylib.label import Label
from flora.pylib.rules import terms as p_terms

from . import pipeline
from .pipelines import flora_pipeline


class Labels:
def __init__(self, args):
self.labels: list[Label] = self.get_labels(args)
self.nlp = pipeline.build()
self.nlp = flora_pipeline.build()
self.image_paths = self.get_image_paths(args)
self.vocabulary: set = self.get_vocabulary()

Expand Down
Empty file.
File renamed without changes.
105 changes: 105 additions & 0 deletions flora/pylib/pipelines/mimosa_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import spacy
from traiter.pylib.pipes import extensions, sentence, tokenizer
from traiter.pylib.rules.date_ import Date
from traiter.pylib.rules.elevation import Elevation
from traiter.pylib.rules.habitat import Habitat
from traiter.pylib.rules.lat_long import LatLong

from flora.pylib.rules import delete_missing, delete_too_far, post_process
from flora.pylib.rules.color import Color
from flora.pylib.rules.count import Count
from flora.pylib.rules.duration import Duration
from flora.pylib.rules.flower_location import FlowerLocation
from flora.pylib.rules.flower_morphology import FlowerMorphology
from flora.pylib.rules.habit import Habit
from flora.pylib.rules.leaf_duration import LeafDuration
from flora.pylib.rules.leaf_folding import LeafFolding
from flora.pylib.rules.margin import Margin
from flora.pylib.rules.morphology import Morphology
from flora.pylib.rules.name import Name
from flora.pylib.rules.odor import Odor
from flora.pylib.rules.part import Part
from flora.pylib.rules.part_linker import PartLinker
from flora.pylib.rules.part_location import PartLocation
from flora.pylib.rules.part_location_linker import PartLocationLinker
from flora.pylib.rules.plant_duration import PlantDuration
from flora.pylib.rules.range import Range
from flora.pylib.rules.reproduction import Reproduction
from flora.pylib.rules.sex import Sex
from flora.pylib.rules.sex_linker import SexLinker
from flora.pylib.rules.shape import Shape
from flora.pylib.rules.size import Size
from flora.pylib.rules.subpart import Subpart
from flora.pylib.rules.subpart_linker import SubpartLinker
from flora.pylib.rules.surface import Surface
from flora.pylib.rules.taxon import Taxon
from flora.pylib.rules.taxon_like import TaxonLike
from flora.pylib.rules.taxon_like_linker import TaxonLikeLinker
from flora.pylib.rules.venation import Venation
from flora.pylib.rules.woodiness import Woodiness

# from traiter.pylib.pipes import debug


def build():
    """Assemble and return the spaCy pipeline defined in this module.

    The function registers the traiter extensions, loads the
    ``en_core_web_md`` model with its ``ner`` component excluded, installs
    the traiter tokenizer, inserts the sentence splitter ahead of the
    parser, and then attaches each rule's pipe to the model.

    Rules are attached in the order listed below. Keep that sequence intact
    when editing: ``Name`` and ``Taxon`` are given ``overwrite`` lists that
    name other entity types, so they presumably rely on those rules having
    been added earlier (NOTE(review): confirm against the rule classes).

    Returns:
        The configured spaCy ``nlp`` object.
    """
    model_name = "en_core_web_md"

    extensions.add_extensions()

    nlp = spacy.load(model_name, exclude=["ner"])
    tokenizer.setup_tokenizer(nlp)
    nlp.add_pipe(
        sentence.SENTENCES,
        config={"base_model": model_name},
        before="parser",
    )

    # Rules attached before Name, none of which take extra arguments.
    before_name = (
        Date,
        Part,
        Subpart,
        Elevation,
        LatLong,
        Color,
        Habitat,
        Duration,
        FlowerLocation,
        FlowerMorphology,
        LeafDuration,
        LeafFolding,
        Morphology,
        Odor,
        PlantDuration,
        Reproduction,
        Sex,
        Venation,
        Woodiness,
    )
    for rule in before_name:
        rule.pipe(nlp)

    Name.pipe(nlp, overwrite=["subpart", "color", "admin_unit"])

    # Rules attached between Name and Taxon.
    before_taxon = (Range, Size, Count, Habit, Margin, Shape, Surface)
    for rule in before_taxon:
        rule.pipe(nlp)

    Taxon.pipe(nlp, extend=2, overwrite=["habitat", "color"])

    # Remaining rules, then the linkers, then the clean-up modules.
    after_taxon = (
        PartLocation,
        TaxonLike,
        PartLinker,
        SubpartLinker,
        SexLinker,
        PartLocationLinker,
        TaxonLikeLinker,
        delete_missing,
        delete_too_far,
        post_process,
    )
    for rule in after_taxon:
        rule.pipe(nlp)

    return nlp
File renamed without changes.
4 changes: 2 additions & 2 deletions flora/pylib/treatments.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@

from flora.pylib.treatment import Treatment

from . import pipeline
from .pipelines import flora_pipeline


class Treatments:
def __init__(self, treatment_dir, limit, offset):
self.treatments: list[Treatment] = self.get_treatments(
treatment_dir, limit, offset
)
self.nlp = pipeline.build()
self.nlp = flora_pipeline.build()

@staticmethod
def get_treatments(treatment_dir, limit, offset):
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ readme = "README.md"
description = ""
license = {file = "LICENSE"}
authors = [{name="Raphael LaFrance", email="[email protected]"}]
requires-python = ">=3.11"
requires-python = ">=3.12"
dependencies = [
"Jinja2",
"pandas",
Expand Down Expand Up @@ -39,7 +39,7 @@ add-taxa = "flora.util_add_taxon_terms:main"
py-modules = []

[tool.ruff]
target-version = "py311"
target-version = "py312"

show-fixes = true

Expand Down
4 changes: 2 additions & 2 deletions tests/setup.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import traiter.pylib.darwin_core as t_dwc
from traiter.pylib.util import compress

from flora.pylib import pipeline
from flora.pylib.pipelines import flora_pipeline

PIPELINE = pipeline.build()
PIPELINE = flora_pipeline.build()


def parse(text: str) -> list:
Expand Down

0 comments on commit 0c0ebc3

Please sign in to comment.