Merge pull request #71 from teddygroves/remove_tests_option
Make tests folder non-optional
teddygroves authored Jan 25, 2024
2 parents 25ebe6f + c74b818 commit a34d5af
Showing 42 changed files with 105,827 additions and 109,632 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
bibat/examples/**/idata/
.tox
__pycache__
.ipynb_checkpoints/
6 changes: 3 additions & 3 deletions README.rst
@@ -174,20 +174,20 @@ and are as follows:
- scipy
- scikit-learn
- toml
- pytest
- black

In addition, the following Python packages may be installed, depending on how
the user answers bibat's wizard:

- pytest
- black
- sphinx

Target project dependencies: Cmdstan
------------------------------------

Bibat will attempt to install `cmdstan
<https://mc-stan.org/docs/cmdstan-guide/index.html>`__, the command line
interface to Stan, when you run the commands :code:`make env` or :code:`make analysis`
from the root of the target project.

If bibat fails to install cmdstan, please raise an issue! The relevant
5 changes: 0 additions & 5 deletions bibat/cli.py
@@ -51,11 +51,6 @@
["Quarto", "Sphinx", "No docs"],
default="Quarto",
),
WizardStr(
"create_tests_directory",
"Would you like to create a tests directory?",
default="y",
),
WizardStr(
"create_dotgithub_directory",
"Would you like to create a .github directory?",
31 changes: 19 additions & 12 deletions bibat/cookiecutter.json
@@ -1,14 +1,21 @@
{
"project_name": "project_name",
"repo_name": "{{ cookiecutter.project_name.lower().replace(' ', '_') }}",
"author_name": "Your name (or your organization/company/team)",
"author_email": "Author email",
"coc_contact": "Code of conduct contact",
"description": "A short description of the project.",
"open_source_license": ["MIT", "BSD-3-Clause", "No license file"],
"docs_format": ["Quarto", "Sphinx", "No docs"],
"create_tests_directory": "y",
"create_dotgithub_directory": "y",
"bibat_version": "unknown version",
"n_iter_default": "1000"
"project_name": "project_name",
"repo_name": "{{ cookiecutter.project_name.lower().replace(' ', '_') }}",
"author_name": "Your name (or your organization/company/team)",
"author_email": "Author email",
"coc_contact": "Code of conduct contact",
"description": "A short description of the project.",
"open_source_license": [
"MIT",
"BSD-3-Clause",
"No license file"
],
"docs_format": [
"Quarto",
"Sphinx",
"No docs"
],
"create_dotgithub_directory": "y",
"bibat_version": "unknown version",
"n_iter_default": "1000"
}
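The `repo_name` default above is a Jinja expression that cookiecutter evaluates at render time. Its effect can be sketched in plain Python (the function name is illustrative, not part of bibat):

```python
def default_repo_name(project_name: str) -> str:
    """Mimic the Jinja default:
    {{ cookiecutter.project_name.lower().replace(' ', '_') }}
    """
    return project_name.lower().replace(" ", "_")


print(default_repo_name("My Bayesian Analysis"))  # my_bayesian_analysis
```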
2 changes: 1 addition & 1 deletion bibat/examples/baseball/LICENSE
@@ -1,6 +1,6 @@

The MIT License (MIT)
Copyright (c) 2023, Teddy Groves
Copyright (c) 2024, Teddy Groves

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

20 changes: 13 additions & 7 deletions bibat/examples/baseball/Makefile
@@ -1,4 +1,4 @@
.PHONY: clean-inferences clean-plots clean-stan clean-all analysis env docs clean-docs
.PHONY: clean-inferences clean-plots clean-stan clean-all analysis env test docs clean-docs

ENV_MARKER = .venv/.bibat.marker
ACTIVATE_VENV = .venv/bin/activate
@@ -9,10 +9,10 @@ REPORT_STEM = docs/report
QUARTO_EXTENSIONS_FOLDER = $(DOCS_DIR)/_extensions

ifeq ($(OS),Windows_NT)
INSTALL_CMDSTAN_FLAGS = --compiler
INSTALL_CMDSTAN_FLAGS = --version git:develop --cores 2
ACTIVATE_VENV = .venv/Scripts/activate
else
INSTALL_CMDSTAN_FLAGS =
INSTALL_CMDSTAN_FLAGS = --cores 2
endif

env: $(ENV_MARKER)
@@ -31,14 +31,20 @@ docs: $(ENV_MARKER) $(QUARTO_EXTENSIONS_FOLDER)
$(ENV_MARKER): $(ACTIVATE_VENV) $(CMDSTAN)
. $(ACTIVATE_VENV) && (\
python -m pip install --upgrade pip; \
python -m pip install -e .; \
install_cmdstan $(INSTALL_CMDSTAN_FLAGS); \
python -m pip install -e .; \
python -m cmdstanpy.install_cmdstan $(INSTALL_CMDSTAN_FLAGS); \
touch $@ ; \
)

test: $(ENV_MARKER)
. $(ACTIVATE_VENV) && ( \
python -m pip install -e .'[dev]'; \
python -m pytest || exit 1; \
)

analysis: $(ENV_MARKER)
. $(ACTIVATE_VENV) && (\
python $(SRC)/prepare_data.py || exit 1; \
. $(ACTIVATE_VENV) && ( \
python $(SRC)/data_preparation.py || exit 1; \
python $(SRC)/sample.py || exit 1; \
jupyter execute $(SRC)/investigate.ipynb || exit 1; \
)
23 changes: 21 additions & 2 deletions bibat/examples/baseball/README.md
@@ -7,7 +7,7 @@ Comparison of distributions for modelling baseball hitting

To run the analysis, run the command `make analysis` from the project root. This
will install a fresh virtual environment if one doesn't exist already, activate
it and install python dependencies and cmdstan, then run the analysis with the
it, install python dependencies and cmdstan and then run the analysis with the
following commands:

- `python baseball/prepare_data.py`
@@ -21,5 +21,24 @@ First make sure you have installed [quarto](https://quarto.org/).
Now run this command from the project root:

```
make docs
$ make docs
```




# How to run tests

From the project root, either run

```
$ make test
```

or

```
$ source .venv/bin/activate
$ pip install -e .'[dev]'
$ python -m pytest
```
113 changes: 56 additions & 57 deletions bibat/examples/baseball/baseball/data_preparation.py
@@ -1,28 +1,26 @@
"""Provides functions prepare_data_x.
"""Provides function `prepare_data` and runs it.
These functions should take in a dataframe of measurements and return a
PreparedData object.
This function should run some other functions with names `prepare_data_x`, which
each take in a dataframe of measurements and return a PreparedData object.
"""
import json
import os
from io import StringIO
from pathlib import Path
from typing import Any

import pandas as pd
import pandera as pa
from pandera.typing import DataFrame, Series
from pydantic import BaseModel
from pandera.typing.common import DataFrameBase
from pydantic import BaseModel, field_serializer, field_validator

from baseball import util

NAME_FILE = "name.txt"
COORDS_FILE = "coords.json"
MEASUREMENTS_FILE = "measurements.csv"
N_CV_FOLDS = 10

HERE = os.path.dirname(__file__)
DATA_DIR = os.path.join(HERE, "..", "data")
RAW_DIR = os.path.join(DATA_DIR, "raw")
PREPARED_DIR = os.path.join(DATA_DIR, "prepared")
HERE = Path(__file__).parent
RAW_DIR = HERE / ".." / "data" / "raw"
PREPARED_DIR = HERE / ".." / "data" / "prepared"
RAW_DATA_FILES = {
"2006": [os.path.join(RAW_DIR, "2006.csv")],
"bdb": [
@@ -33,28 +31,6 @@
}


def prepare_data():
"""Run main function."""
print("Reading raw data...")
raw_data = {
k: [pd.read_csv(file, index_col=None) for file in v]
for k, v in RAW_DATA_FILES.items()
}
data_preparation_functions_to_run = {
"2006": prepare_data_2006,
"bdb": prepare_data_bdb,
}
print("Preparing data...")
for name, dpf in data_preparation_functions_to_run.items():
print(f"Running data preparation function {dpf.__name__}...")
prepared_data = dpf(*raw_data[name])
output_dir = os.path.join(PREPARED_DIR, prepared_data.name)
print(f"\twriting files to {output_dir}")
if not os.path.exists(PREPARED_DIR):
os.mkdir(PREPARED_DIR)
write_prepared_data(prepared_data, output_dir)


class MeasurementsDF(pa.SchemaModel):
"""A PreparedData should have a measurements dataframe like this.
@@ -72,7 +48,28 @@ class PreparedData(BaseModel, arbitrary_types_allowed=True):

name: str
coords: util.CoordDict
measurements: DataFrame[MeasurementsDF]
measurements: Any

@field_validator("measurements")
def validate_measurements(cls, v: Any) -> DataFrameBase[MeasurementsDF]:
"""Validate the measurements field."""
if isinstance(v, str):
v = pd.read_json(StringIO(v))
return MeasurementsDF.validate(v)

@field_serializer("measurements")
def serialize_measurements(
self, measurements: DataFrame[MeasurementsDF], _info
):
"""Serialise the measurements field."""
return measurements.to_json()


def load_prepared_data(path_to_data: str) -> PreparedData:
"""Load a dataset."""
with open(path_to_data) as f:
raw = json.load(f)
return PreparedData(**raw)
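The rewritten model validates measurements on the way in (accepting either a dataframe or a JSON string) and serialises them back to a JSON string on the way out, so a whole `PreparedData` round-trips through one JSON file. A stdlib-only sketch of that round-trip idea, with pydantic and pandera stripped out and illustrative names throughout:

```python
import json


class PreparedDataSketch:
    """Stand-in for PreparedData: same validate/serialise round-trip,
    no pydantic or pandera."""

    def __init__(self, name, coords, measurements):
        self.name = name
        self.coords = coords
        # Mirror the field_validator: accept parsed rows or a JSON string.
        if isinstance(measurements, str):
            measurements = json.loads(measurements)
        self.measurements = measurements

    def model_dump_json(self) -> str:
        # Mirror the field_serializer: embed measurements as a JSON string.
        return json.dumps(
            {
                "name": self.name,
                "coords": self.coords,
                "measurements": json.dumps(self.measurements),
            }
        )

    @classmethod
    def from_json(cls, raw: str) -> "PreparedDataSketch":
        # Mirror load_prepared_data: one file, one model.
        return cls(**json.loads(raw))


pd_obj = PreparedDataSketch("2006", {"player": ["a", "b"]}, [{"y": 1, "K": 10}])
roundtrip = PreparedDataSketch.from_json(pd_obj.model_dump_json())
print(roundtrip.measurements)  # [{'y': 1, 'K': 10}]
```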


def prepare_data_2006(measurements_raw: pd.DataFrame) -> PreparedData:
@@ -162,26 +159,28 @@ def filter_batters(df: pd.DataFrame):
)


def load_prepared_data(directory: str) -> PreparedData:
"""Load prepared data from files in directory."""
with open(os.path.join(directory, COORDS_FILE), "r") as f:
coords = json.load(f)
with open(os.path.join(directory, NAME_FILE), "r") as f:
name = f.read()
measurements = pd.read_csv(os.path.join(directory, MEASUREMENTS_FILE))
return PreparedData(
name=name,
coords=coords,
measurements=DataFrame[MeasurementsDF](measurements),
)
def prepare_data():
"""Run main function."""
print("Reading raw data...")
raw_data = {
k: [pd.read_csv(file, index_col=None) for file in v]
for k, v in RAW_DATA_FILES.items()
}
data_preparation_functions_to_run = {
"2006": prepare_data_2006,
"bdb": prepare_data_bdb,
}
print("Preparing data...")
for name, dpf in data_preparation_functions_to_run.items():
print(f"Running data preparation function {dpf.__name__}...")
prepared_data = dpf(*raw_data[name])
output_file = os.path.join(PREPARED_DIR, prepared_data.name + ".json")
print(f"\twriting files to {output_file}")
if not os.path.exists(PREPARED_DIR):
os.mkdir(PREPARED_DIR)
with open(output_file, "w") as f:
f.write(prepared_data.model_dump_json())


def write_prepared_data(prepped: PreparedData, directory):
"""Write prepared data files to a directory."""
if not os.path.exists(directory):
os.mkdir(directory)
prepped.measurements.to_csv(os.path.join(directory, MEASUREMENTS_FILE))
with open(os.path.join(directory, COORDS_FILE), "w") as f:
json.dump(prepped.coords, f)
with open(os.path.join(directory, NAME_FILE), "w") as f:
f.write(prepped.name)
if __name__ == "__main__":
prepare_data()
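The reordered `prepare_data` keeps its dispatch pattern: a dict mapping dataset names to preparation functions, each applied to the matching list of raw tables. The shape of that loop, with toy stand-ins for the dataframes and the real `prepare_data_2006` / `prepare_data_bdb` functions:

```python
# Toy stand-ins: lists instead of dataframes, sum/max instead of the real
# preparation functions from data_preparation.py.
raw_data = {"2006": [[1, 2, 3]], "bdb": [[4, 5]]}


def prepare_2006(table):
    return sum(table)


def prepare_bdb(table):
    return max(table)


dpfs = {"2006": prepare_2006, "bdb": prepare_bdb}
# Each dataset's list of raw tables is unpacked into its preparation
# function, mirroring dpf(*raw_data[name]) in the diff.
prepared = {name: dpf(*raw_data[name]) for name, dpf in dpfs.items()}
print(prepared)  # {'2006': 6, 'bdb': 5}
```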
6 changes: 3 additions & 3 deletions bibat/examples/baseball/baseball/fetch_data.py
@@ -7,11 +7,11 @@
URLS = {
"2006": "https://raw.githubusercontent.com/stan-dev/"
"example-models/master/knitr/pool-binary-trials/baseball-hits-2006.csv",
"bdb-main": "https://raw.githubusercontent.com/chadwickbureau/"
"bdb-main": "https://raw.githubusercontent.com/cbwinslow/"
"baseballdatabank/master/core/Batting.csv",
"bdb-post": "https://raw.githubusercontent.com/chadwickbureau/"
"bdb-post": "https://raw.githubusercontent.com/cbwinslow/"
"baseballdatabank/master/core/BattingPost.csv",
"bdb-apps": "https://raw.githubusercontent.com/chadwickbureau/"
"bdb-apps": "https://raw.githubusercontent.com/cbwinslow/"
"baseballdatabank/master/core/Appearances.csv",
}
OUT_FILES = {
10 changes: 6 additions & 4 deletions bibat/examples/baseball/baseball/fitting_mode.py
@@ -47,7 +47,7 @@ class FittingMode(BaseModel):
name: str
idata_target: IdataTarget
fit: Callable[
[CmdStanModel, Dict, Dict[str, str]], Union[CmdStanMCMC, xr.Dataset]
[CmdStanModel, Dict, Dict[str, str]], Union[CmdStanMCMC, xr.DataArray]
]


@@ -136,10 +136,12 @@ def fit_kfold(model: CmdStanModel, input_dict: dict, kwargs) -> xr.DataArray:
return xr.concat(lliks_by_fold, dim="llik_dim_0").sortby("llik_dim_0")


prior_mode = FittingMode(name="prior", idata_target="prior", fit=fit_prior)
prior_mode = FittingMode(
name="prior", idata_target=IdataTarget.prior, fit=fit_prior
)
posterior_mode = FittingMode(
name="posterior", idata_target="posterior", fit=fit_posterior
name="posterior", idata_target=IdataTarget.posterior, fit=fit_posterior
)
kfold_mode = FittingMode(
name="kfold", idata_target="log_likelihood", fit=fit_kfold
name="kfold", idata_target=IdataTarget.log_likelihood, fit=fit_kfold
)
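The switch from raw strings to `IdataTarget` members means an invalid target fails at construction time rather than deep inside fitting. A stdlib sketch of why a str-valued Enum helps (this `IdataTarget` is a stand-in mirroring the one in `fitting_mode.py`):

```python
from enum import Enum


class IdataTarget(str, Enum):
    """Stand-in for the enum in fitting_mode.py."""

    prior = "prior"
    posterior = "posterior"
    log_likelihood = "log_likelihood"


# Enum lookup accepts the underlying string but rejects typos loudly.
assert IdataTarget("posterior") is IdataTarget.posterior
try:
    IdataTarget("postrior")
except ValueError as e:
    print(f"rejected: {e}")
```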