From 1fcda57e941e680cb7976edd883a73d55633d8f3 Mon Sep 17 00:00:00 2001 From: Jelle Teijema Date: Thu, 18 Apr 2024 15:10:09 +0200 Subject: [PATCH] Refactor of template files (#55) * Adds a base class for templates and moves duplicate code to the base class * passes wordcloud flag to docs renderer * adds allow overwrite flag * adds a catch for using the wrong template with the wrong template class * cleans up name passing (name is now only in the template class, or the actual template, no more random strings) * cleans up valid template checker * cleans up the console output * format the documents using ruff * add an extra line to the filehandler writer, that way we can remove all the double empty lines in all templates * update the workflow * add a config DEFAULT object * add prohibited arguments to templates * refactor platform detection code * refactor fp_template code * n_runs only adds a _{{ run }} to the filename if n_runs is more than 1 --- .github/workflows/ci-workflow.yml | 66 +++-- .github/workflows/pythonpackage.yml | 4 +- README.md | 7 +- asreviewcontrib/makita/entrypoint.py | 270 +++++++----------- asreviewcontrib/makita/template_arfi.py | 155 ++++------ asreviewcontrib/makita/template_base.py | 139 +++++++++ asreviewcontrib/makita/template_basic.py | 148 ++++------ asreviewcontrib/makita/template_multimodel.py | 173 ++++------- .../makita/templates/doc_README.md.template | 11 +- .../templates/script_get_plot.py.template | 30 +- ...script_get_settings_from_state.py.template | 19 +- .../script_merge_descriptives.py.template | 14 +- .../script_merge_metrics.py.template | 24 +- .../templates/script_merge_tds.py.template | 42 ++- ...plit_data_with_multiple_labels.py.template | 1 - .../templates/template_arfi.txt.template | 2 +- .../templates/template_basic.txt.template | 8 +- .../template_multimodel.txt.template | 8 +- asreviewcontrib/makita/utils.py | 31 +- pyproject.toml | 18 +- 20 files changed, 542 insertions(+), 628 deletions(-) create mode 100644 asreviewcontrib/makita/template_base.py diff --git a/.github/workflows/ci-workflow.yml b/.github/workflows/ci-workflow.yml index 5c257afa..fe0dce24 100644 --- a/.github/workflows/ci-workflow.yml +++ b/.github/workflows/ci-workflow.yml @@ -4,34 +4,40 @@ jobs: test-template-and-lint: strategy: matrix: - os: [macos-latest, windows-latest, ubuntu-latest] + os: [windows-latest, ubuntu-latest] + python-version: ['3.8', '3.12'] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@master - - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: ${{ matrix.python-version }} architecture: 'x64' - - name: Install makita + - name: Cache Python packages + uses: actions/cache@v4 + with: + path: | + ${{ runner.os == 'Windows' && 'C:\users\runneradmin\appdata\local\pip\cache' || '~/.cache/pip' }} + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + - name: Install dependencies run: | - pip install . - - name: Install ruff + pip install . ruff scitree asreview-datatools asreview-insights synergy-dataset + - name: Lint python with ruff run: | - pip install ruff + ruff check . 
+ - name: Create directories using Python + run: python -c "import os; [os.makedirs(path, exist_ok=True) for path in ['./tmp/basic/data-test', './tmp/arfi/data', './tmp/multimodel/data', './tmp/scripts', './tmp/synergy/data']]" - name: set up environment run: | - mkdir tmp - cd tmp - mkdir -p basic/data - mkdir -p arfi/data - mkdir -p multimodel/data - cp ../.github/workflows/test_data/labels.csv basic/data/labels.csv - cp ../.github/workflows/test_data/labels.csv arfi/data/labels.csv - cp ../.github/workflows/test_data/labels.csv multimodel/data/labels.csv - - name: Test makita templates + cp .github/workflows/test_data/labels.csv ./tmp/basic/data-test/labels.csv + cp .github/workflows/test_data/labels.csv ./tmp/arfi/data/labels.csv + cp .github/workflows/test_data/labels.csv ./tmp/multimodel/data/labels.csv + - name: Render makita templates run: | cd tmp/basic - asreview makita template basic | tee output.txt + asreview makita template basic --classifier nb --feature_extractor tfidf --query_strategy max --n_runs 1 -s data-test -o output-test --init_seed 1 --model_seed 2 --skip_wordclouds --overwrite --instances_per_query 2 --stop_if min --balance_strategy double | tee output.txt grep -q "ERROR" output.txt && exit 1 || true cd ../arfi asreview makita template arfi | tee output.txt @@ -39,16 +45,20 @@ jobs: cd ../multimodel asreview makita template multimodel | tee output.txt grep -q "ERROR" output.txt && exit 1 || true - - name: Run ShellCheck + - name: Render makita scripts + run: | + asreview makita add-script --all -o ./tmp/scripts | tee output.txt + grep -q "ERROR" output.txt && exit 1 || true + - name: Run SciTree if: ${{ matrix.os != 'windows-latest' }} - uses: ludeeus/action-shellcheck@master - with: - scandir: './tmp' - env: - SHELLCHECK_OPTS: -e SC2148 - - name: Generate makita scripts run: | - asreview makita add-script --all - - name: Lint python with ruff + cd ./tmp/ + scitree + - name: Execute basic template jobs file + if: ${{ matrix.os != 'windows-latest' }} run: | - ruff . + cd tmp/synergy + synergy_dataset get -d van_de_Schoot_2018 -o ./data -l + asreview makita template basic --instances_per_query 100 --skip_wordclouds --overwrite --n_runs 2 + sh jobs.sh + scitree diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index effda0e0..ab84a01d 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -13,9 +13,9 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.x' - name: Install dependencies diff --git a/README.md b/README.md index 5ba7da9d..2ea17c3c 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,7 @@ optional arguments: --platform PLATFORM Platform to run jobs: Windows, Darwin, Linux. Default: the system of rendering templates. --n_runs N_RUNS Number of runs. Default: 1. --no_wordclouds Disables the generation of wordclouds. + --overwrite Automatically accepts all overwrite requests. --classifier CLASSIFIER Classifier to use. Default: nb. --feature_extractor FEATURE_EXTRACTOR Feature_extractor to use. Default: tfidf. --query_strategy QUERY_STRATEGY Query strategy to use. Default: max. @@ -148,6 +149,7 @@ optional arguments: --platform PLATFORM Platform to run jobs: Windows, Darwin, Linux. Default: the system of rendering templates. --n_priors N_PRIORS Number of priors. Default: 10. --no_wordclouds Disables the generation of wordclouds. 
+ --overwrite Automatically accepts all overwrite requests.
 --classifier CLASSIFIER Classifier to use. Default: nb.
 --feature_extractor FEATURE_EXTRACTOR Feature_extractor to use. Default: tfidf.
 --query_strategy QUERY_STRATEGY Query strategy to use. Default: max.
@@ -175,18 +177,19 @@ optional arguments:
 --platform PLATFORM Platform to run jobs: Windows, Darwin, Linux. Default: the system of rendering templates.
 --n_runs N_RUNS Number of runs. Default: 1.
 --no_wordclouds Disables the generation of wordclouds.
+ --overwrite Automatically accepts all overwrite requests.
 --instances_per_query INSTANCES_PER_QUERY Number of instances per query. Default: 1.
 --stop_if STOP_IF The number of label actions to simulate. Default 'min' will stop simulating when all relevant records are found.
 --classifiers CLASSIFIERS Classifiers to use Default: ['logistic', 'nb', 'rf', 'svm']
 --feature_extractors FEATURE_EXTRACTOR Feature extractors to use Default: ['doc2vec', 'sbert', 'tfidf']
 --query_strategies QUERY_STRATEGY Query strategies to use Default: ['max']
- --balancing_strategies BALANCE_STRATEGY Balance strategies to use Default: ['double']
+ --balance_strategies BALANCE_STRATEGY Balance strategies to use Default: ['double']
 --impossible_models IMPOSSIBLE_MODELS Model combinations to exclude Default: ['nb,doc2vec', 'nb,sbert']
 ```
 
 If you want to specify certain combinations of classifiers and feature extractors that should and should not be used, you can use the `--classifiers`,
-`--feature_extractors`, `--query_strategies`, `--balancing_strategies` and `--impossible_models` option. For instance, if you
+`--feature_extractors`, `--query_strategies`, `--balance_strategies` and `--impossible_models` options. For instance, if you
 want to exclude the combinations of `nb` with `doc2vec` and `logistic` with `tfidf`, use the following command:
diff --git a/asreviewcontrib/makita/entrypoint.py b/asreviewcontrib/makita/entrypoint.py
index 84444d4e..4b8dd9e0 100644
--- a/asreviewcontrib/makita/entrypoint.py
+++ b/asreviewcontrib/makita/entrypoint.py
@@ -2,34 +2,15 @@
 import os
 from pathlib import Path
 
+from asreview import config as ASREVIEW_CONFIG
 from asreview.entry_points import BaseEntryPoint
+from asreview.utils import _entry_points
 
 from asreviewcontrib.makita import __version__
 from asreviewcontrib.makita.config import TEMPLATES_FP
-from asreviewcontrib.makita.template_arfi import render_jobs_arfi
-from asreviewcontrib.makita.template_basic import render_jobs_basic
-from asreviewcontrib.makita.template_multimodel import render_jobs_multimodel
 from asreviewcontrib.makita.utils import FileHandler
 
 
-def _get_template_fp(name):
-    return Path(TEMPLATES_FP, f"template_{name}.txt.template")
-
-
-def _is_valid_template(fp):
-    if fp and Path(fp).is_file():
-        return True
-    else:
-        raise ValueError(f"Template {fp} not found")
-
-
-def _shell_to_batch(job):
-    job = f"@ echo off\nCOLOR E0{job}"
-    job = job.replace("#", "::")
-    job = job.replace("/", "\\")
-    return job
-
-
 class MakitaEntryPoint(BaseEntryPoint):
     # backward compat?
     description = "Makita functionality for ASReview datasets."
@@ -67,13 +48,13 @@ def execute(self, argv):  # noqa: C901
             "--init_seed",
             type=int,
             default=535,
-            help="Seed of the priors. Seed is set to 535 by default.",
+            help="Seed of the priors. 535 by default.",
         )
         parser_template.add_argument(
             "--model_seed",
             type=int,
             default=165,
-            help="Seed of the models. Seed is set to 165 by default.",
+            help="Seed of the models. 165 by default.",
         )
         parser_template.add_argument(
             "--template", type=str, help="Overwrite template with template file path."
         )
         parser_template.add_argument(
             "--platform",
             type=str,
             help="Platform to run jobs: Windows, Darwin, Linux. "
             "Default: the system of rendering templates.",
         )
+        parser_template.add_argument(
+            "--instances_per_query",
+            type=int,
+            default=ASREVIEW_CONFIG.DEFAULT_N_INSTANCES,
+            help="Number of instances per query.",
+        )
+        parser_template.add_argument(
+            "--stop_if",
+            type=str,
+            default="min",
+            help="The number of label actions to simulate. Default 'min' stops "
+            "simulating when all relevant records are found.",
+        )
         parser_template.add_argument(
             "--n_runs",
             type=int,
-            default=1,
-            help="Number of runs. Only for templates 'basic' and 'multimodel'. "
-            "Default: 1.",
+            help="Number of runs. Only for templates 'basic' and 'multimodel'.",
         )
         parser_template.add_argument(
             "--n_priors",
             type=int,
-            default=10,
-            help="Number of priors. Only for template 'arfi'. "
-            "Default: 10.",
+            help="Number of priors. Only for template 'arfi'.",
         )
         parser_template.add_argument(
-            "--no_wordclouds",
-            action="store_false",
-            help="Disables the generation of wordclouds. "
+            "--skip_wordclouds",
+            action="store_true",
+            help="Disables the generation of wordclouds.",
+        )
+        parser_template.add_argument(
+            "--overwrite",
+            action="store_true",
+            help="Overwrite existing files in the output folder.",
         )
         parser_template.add_argument(
             "--classifier",
             type=str,
-            default="nb",
-            help="Classifier to use. Only for template 'basic' and 'arfi'. "
-            "Default: nb.",
+            help="Classifier to use. Only for templates 'basic' and 'arfi'.",
         )
         parser_template.add_argument(
             "--feature_extractor",
             type=str,
-            default="tfidf",
-            help="Feature_extractor to use. Only for template 'basic' and 'arfi'. "
-            "Default: tfidf.",
+            help="Feature extractor to use. Only for templates 'basic' and 'arfi'.",
         )
         parser_template.add_argument(
             "--query_strategy",
             type=str,
-            default="max",
-            help="Query strategy to use. "
-            "Default: max.",
+            help="Query strategy to use. Only for templates 'basic' and 'arfi'.",
         )
         parser_template.add_argument(
             "--balance_strategy",
             type=str,
-            default="double",
-            help="Balance strategy to use. "
-            "Default: double.",
-        )
-        parser_template.add_argument(
-            "--instances_per_query",
-            type=int,
-            default=1,
-            help="Number of instances per query. "
-            "Default: 1.",
-        )
-        parser_template.add_argument(
-            "--stop_if",
-            type=str,
-            default="min",
-            help="The number of label actions to simulate. "
-            "Default 'min' will stop simulating when all relevant records are found.",
+            help="Balance strategy to use. Only for templates 'basic' and 'arfi'.",
         )
         parser_template.add_argument(
             "--classifiers",
             nargs="+",
-            default=["logistic", "nb", "rf", "svm"],
-            help="Classifiers to use. Only for template 'multimodel'. "
-            "Default: ['logistic', 'nb', 'rf', 'svm']",
+            help="Classifiers to use. Only for template 'multimodel'.",
         )
         parser_template.add_argument(
             "--feature_extractors",
             nargs="+",
-            default=["doc2vec", "sbert", "tfidf"],
-            help="Feature extractors to use. Only for template 'multimodel'. "
-            "Default: ['doc2vec', 'sbert', 'tfidf']",
+            help="Feature extractors to use. Only for template 'multimodel'.",
         )
         parser_template.add_argument(
             "--query_strategies",
             nargs="+",
-            default=["max"],
-            help="Query strategies to use. Only for template 'multimodel'. "
-            "Default: ['max']",
+            help="Query strategies to use. Only for template 'multimodel'.
", ) parser_template.add_argument( - "--balancing_strategies", + "--balance_strategies", nargs="+", - default=["double"], - help="Balancing strategies to use. Only for template 'multimodel'. " - "Default: ['double']", + help="Balancing strategies to use. Only for template 'multimodel'. ", ) parser_template.add_argument( "--impossible_models", nargs="+", - default=["nb,doc2vec", "nb,sbert"], - help="Model combinations to exclude. Only for template 'multimodel'. " - "Default: ['nb,doc2vec', 'nb,sbert']", + help="Model combinations to exclude. Only for template 'multimodel'.", ) parser_template.set_defaults(func=self._template_cli) @@ -191,11 +153,14 @@ def execute(self, argv): # noqa: C901 "--all", "-a", action="store_true", help="Add all scripts." ) parser_script.add_argument( - "-o", type=str, default="scripts", help="Location of the scripts folder." + "-o", + type=str, + default="scripts", + help="Location of the scripts folder.", ) parser_script.set_defaults(func=self._add_script_cli) - # parse the args and call whatever function was selected + # parse the args and call the selected function args = parser.parse_args(argv) args.func(args) @@ -208,13 +173,24 @@ def _template_cli(self, args): def _template(self, args): """Generate a template.""" - # backwards compatibility for 'multiple_models' - if args.name == "multiple_models": - args.name = "multimodel" + # lowercase name + args.name = args.name.lower() + + # check if args.name is in _entry_points + if args.name not in _entry_points(group="asreview.makita.templates").names: + raise ValueError(f"Template {args.name} not found.") - # check if a custom template is used, otherwise use the default template - fp_template = args.template or (args.name and _get_template_fp(args.name)) - _is_valid_template(fp_template) + # if a custom template is provided, check if it exists + if args.template: + fp_template = Path(args.template) + if not fp_template.is_file(): + raise ValueError(f"Custom template {args.template} not found") + print( + f"\033[33mRendering custom template {args.template} using {args.name}.\u001b[0m\n" # noqa: E501 + ) + else: + fp_template = None + print(f"\033[33mRendering template {args.name}.\u001b[0m\n") # load datasets datasets = ( @@ -225,94 +201,62 @@ def _template(self, args): # throw exception if no datasets are found if len(datasets) == 0: - raise ValueError("No datasets found in the specified folder.") + raise ValueError("No datasets found in the selected data folder.") # create output folder Path(args.o).parent.mkdir(parents=True, exist_ok=True) - if args.name in ["basic"]: - # render jobs - job = render_jobs_basic( - datasets, - output_folder=Path(args.o), - create_wordclouds=args.no_wordclouds, - n_runs=args.n_runs, - init_seed=args.init_seed, - model_seed=args.model_seed, - classifier=args.classifier, - feature_extractor=args.feature_extractor, - query_strategy=args.query_strategy, - balance_strategy=args.balance_strategy, - instances_per_query=args.instances_per_query, - stop_if=args.stop_if, - fp_template=fp_template, - job_file=args.job_file, - platform_sys=args.platform, - ) - - elif args.name in ["arfi"]: - # render jobs - job = render_jobs_arfi( - datasets, - output_folder=Path(args.o), - create_wordclouds=args.no_wordclouds, - n_priors=args.n_priors, - init_seed=args.init_seed, - model_seed=args.model_seed, - classifier=args.classifier, - feature_extractor=args.feature_extractor, - query_strategy=args.query_strategy, - balance_strategy=args.balance_strategy, - instances_per_query=args.instances_per_query, - 
stop_if=args.stop_if, - fp_template=fp_template, - job_file=args.job_file, - platform_sys=args.platform, - ) - - elif args.name in ["multimodel"]: - # render jobs - job = render_jobs_multimodel( - datasets, - output_folder=Path(args.o), - create_wordclouds=args.no_wordclouds, - n_runs=args.n_runs, - init_seed=args.init_seed, - model_seed=args.model_seed, - all_classifiers=args.classifiers, - all_feature_extractors=args.feature_extractors, - all_query_strategies=args.query_strategies, - all_balancing_strategies=args.balancing_strategies, - impossible_models=args.impossible_models, - instances_per_query=args.instances_per_query, - stop_if=args.stop_if, - fp_template=fp_template, - job_file=args.job_file, - platform_sys=args.platform, - ) - - else: - # render jobs - job = render_jobs_basic( - datasets, - output_folder=Path(args.o), - init_seed=args.init_seed, - model_seed=args.model_seed, - fp_template=fp_template, - job_file=args.job_file, - platform_sys=args.platform, - ) - - if args.platform == "Windows" or (args.platform is None and os.name == "nt"): - job = _shell_to_batch(job) - job_file = "jobs.bat" if args.job_file is None else args.job_file - else: - job_file = "jobs.sh" if args.job_file is None else args.job_file + # get job file + if args.job_file is None: + if args.platform == "Windows" or ( + args.platform is None and os.name == "nt" + ): # noqa: E501 + args.job_file = "jobs.bat" + else: + args.job_file = "jobs.sh" + + # load template + template = _entry_points(group="asreview.makita.templates")[args.name].load() + + keys_of_interest = [ + "skip_wordclouds", + "overwrite", + "n_runs", + "n_priors", + "init_seed", + "model_seed", + "classifier", + "feature_extractor", + "query_strategy", + "balance_strategy", + "classifiers", + "feature_extractors", + "query_strategies", + "balance_strategies", + "impossible_models", + "instances_per_query", + "stop_if", + "job_file", + ] + + job = template( + datasets=datasets, + fp_template=fp_template, + output_folder=Path(args.o), + scripts_folder=Path("scripts"), + **{key: vars(args)[key] for key in keys_of_interest if key in vars(args)}, + ).render() + + # convert shell to batch if needed + if args.job_file.endswith(".bat"): + job = f"@ echo off\nCOLOR E0{job}" + job = job.replace("#", "::") + job = job.replace("/", "\\") # store result in output folder - with open(job_file, "w") as f: + with open(args.job_file, "w") as f: f.write(job) - print(f"Rendered template {args.name} and saved to {job_file}") + print(f"Rendered template {args.name} and saved to {args.job_file}") def _add_script_cli(self, args): try: diff --git a/asreviewcontrib/makita/template_arfi.py b/asreviewcontrib/makita/template_arfi.py index 9506fec0..800a453d 100644 --- a/asreviewcontrib/makita/template_arfi.py +++ b/asreviewcontrib/makita/template_arfi.py @@ -1,114 +1,69 @@ """Render ARFI template.""" -import os -import platform -from pathlib import Path - import numpy as np -from asreview import ASReviewData -from cfgtemplater.config_template import ConfigTemplate - -from asreviewcontrib.makita import __version__ -from asreviewcontrib.makita.utils import FileHandler -from asreviewcontrib.makita.utils import check_filename_dataset - - -def render_jobs_arfi( - datasets, - output_folder="output", - scripts_folder="scripts", - create_wordclouds=True, - n_priors=10, - init_seed=535, - model_seed=165, - classifier="nb", - feature_extractor="tfidf", - query_strategy="max", - balance_strategy="double", - instances_per_query=1, - stop_if='min', - fp_template=None, - job_file=None, - 
platform_sys=None, -): - """Render jobs.""" - - if not platform_sys: - platform_sys = platform.system() - if not job_file: - job_file = "jobs.bat" if os.name == "nt" else "jobs.sh" - - params = [] - - # initialize file handler - file_handler = FileHandler() - - # generate params for all simulations - for i, fp_dataset in enumerate(sorted(datasets)): - check_filename_dataset(fp_dataset) - - # render priors - priors = _get_priors(fp_dataset, init_seed=init_seed + i, n_priors=n_priors) - - # params for single dataset - params.append( - { - "input_file": fp_dataset.as_posix(), - "input_file_stem": fp_dataset.stem, - "priors": priors, - "model_seed": model_seed + i, - } +from asreview import config as ASREVIEW_CONFIG +from asreview.data import ASReviewData + +from asreviewcontrib.makita.template_base import TemplateBase + + +class TemplateARFI(TemplateBase): + template_file = "template_arfi.txt.template" + + def __init__( + self, + classifier, + feature_extractor, + query_strategy, + n_priors, + **kwargs, + ): + self.classifier = classifier + self.feature_extractor = feature_extractor + self.query_strategy = query_strategy + self.n_priors = n_priors + super().__init__(**kwargs) + + def get_dataset_specific_params(self, index, fp_dataset): + """Prepare dataset-specific parameters. These parameters are provided to the + template once for each dataset.""" + + n_priors = self.n_priors if self.n_priors is not None else 10 + + priors = _get_priors( + fp_dataset, init_seed=self.init_seed + index, n_priors=n_priors ) + return { + "input_file": fp_dataset.as_posix(), + "input_file_stem": fp_dataset.stem, + "priors": priors, + "model_seed": self.model_seed + index, + } - # Instantiate a ConfigTemplate object, initializing a Jinja2 environment and - # setting up template variables and extensions. - template = ConfigTemplate(fp_template) - - # render scripts - if template.scripts is not None: - for s in template.scripts: - t_script = file_handler.render_file_from_template( - s, "script", output_folder=output_folder - ) - export_fp = Path(scripts_folder, s) - file_handler.add_file(t_script, export_fp) - - # render docs - if template.docs is not None: - for s in template.docs: - t_docs = file_handler.render_file_from_template( - s, - "doc", - datasets=datasets, - template_name=template.name if template.name == "ARFI" else "custom", - template_name_long=template.name_long, - template_scripts=template.scripts, - output_folder=output_folder, - job_file=job_file, - ) - file_handler.add_file(t_docs, s) - - # print summary to console - file_handler.print_summary() - - # render file and return - return template.render( - { + def get_template_specific_params(self, params): + """Prepare template-specific parameters. 
These parameters are provided to the
+        template only once."""
+
+        # set default values if not provided
+        classifier = self.classifier if self.classifier is not None else ASREVIEW_CONFIG.DEFAULT_MODEL  # noqa: E501
+        feature_extractor = self.feature_extractor if self.feature_extractor is not None else ASREVIEW_CONFIG.DEFAULT_FEATURE_EXTRACTION  # noqa: E501
+        query_strategy = self.query_strategy if self.query_strategy is not None else ASREVIEW_CONFIG.DEFAULT_QUERY_STRATEGY  # noqa: E501
+        balance_strategy = self.balance_strategy if self.balance_strategy is not None else ASREVIEW_CONFIG.DEFAULT_BALANCE_STRATEGY  # noqa: E501
+
+        return {
             "datasets": params,
-            "create_wordclouds": create_wordclouds,
+            "skip_wordclouds": self.skip_wordclouds,
             "classifier": classifier,
             "feature_extractor": feature_extractor,
             "query_strategy": query_strategy,
             "balance_strategy": balance_strategy,
-            "instances_per_query": instances_per_query,
-            "stop_if": stop_if,
-            "init_seed": init_seed,
-            "output_folder": output_folder,
-            "scripts_folder": scripts_folder,
-            "platform": platform_sys,
-            "version": __version__,
+            "instances_per_query": self.instances_per_query,
+            "stop_if": self.stop_if,
+            "init_seed": self.init_seed,
+            "output_folder": self.output_folder,
+            "scripts_folder": self.scripts_folder,
+            "version": self.__version__,
         }
-    )
 
 
 def _get_priors(dataset, init_seed, n_priors):
diff --git a/asreviewcontrib/makita/template_base.py b/asreviewcontrib/makita/template_base.py
new file mode 100644
index 00000000..c0fa83d9
--- /dev/null
+++ b/asreviewcontrib/makita/template_base.py
@@ -0,0 +1,139 @@
+"""Rendering base class for templates."""
+
+from pathlib import Path
+
+from cfgtemplater.config_template import ConfigTemplate
+
+from asreviewcontrib.makita import __version__
+from asreviewcontrib.makita.config import TEMPLATES_FP
+from asreviewcontrib.makita.utils import FileHandler
+
+
+class TemplateBase:
+    template_file = ""
+
+    def __init__(
+        self,
+        datasets,
+        fp_template,
+        output_folder,
+        scripts_folder,
+        skip_wordclouds,
+        overwrite,
+        init_seed,
+        model_seed,
+        balance_strategy,
+        instances_per_query,
+        stop_if,
+        job_file,
+        **kwargs,
+    ):
+        self.datasets = datasets
+        self.output_folder = output_folder
+        self.scripts_folder = scripts_folder
+        self.skip_wordclouds = skip_wordclouds
+        self.init_seed = init_seed
+        self.model_seed = model_seed
+        self.balance_strategy = balance_strategy
+        self.instances_per_query = instances_per_query
+        self.stop_if = stop_if
+        self.job_file = job_file
+        self.file_handler = FileHandler(overwrite)
+        self.__version__ = __version__
+
+        self.template = ConfigTemplate(
+            fp_template if fp_template is not None else self.get_template_file()
+        )
+
+        for param in kwargs:
+            if kwargs[param] is not None:
+                raise ValueError(f"{param} should not be set for this template.")
+
+    def get_template_file(self):
+        return Path(TEMPLATES_FP, self.template_file)
+
+    def get_dataset_specific_params(self, index, fp_dataset):
+        """Prepare dataset-specific parameters. These parameters are provided to the
+        template once for each dataset."""
+
+        raise NotImplementedError(
+            "Subclasses should implement this method to prepare dataset-specific parameters."  # noqa: E501
+        )
+
+    def get_template_specific_params(self, params):
+        """Prepare template-specific parameters.
These parameters are provided to the + template only once.""" + + raise NotImplementedError( + "Subclasses should implement this method to prepare template-specific parameters." # noqa: E501 + ) + + def render_scripts(self, scripts: list): + """Render scripts.""" + + for s in scripts: + t_script = self.file_handler.render_file_from_template( + s, "script", output_folder=self.output_folder + ) + export_fp = Path(self.scripts_folder, s) + self.file_handler.add_file(t_script, export_fp) + + def render_docs(self, docs: list): + """Render docs.""" + + for s in docs: + t_docs = self.file_handler.render_file_from_template( + s, + "doc", + datasets=self.datasets, + template_name=self.template.name, + template_name_long=self.template.name_long, + template_scripts=self.template.scripts, + skip_wordclouds=self.skip_wordclouds, + output_folder=self.output_folder, + job_file=self.job_file, + ) + self.file_handler.add_file(t_docs, s) + + def render(self): + """Render template.""" + + # render scripts + if self.template.scripts: + self.render_scripts(self.template.scripts) + + # render docs + if self.template.docs: + self.render_docs(self.template.docs) + + # collect dynamic parameters + params = [] + for i, fp_dataset in enumerate(sorted(self.datasets)): + if " " in Path(fp_dataset).stem: + raise ValueError( + f"Dataset filename '{fp_dataset}' cannot contain whitespace." + ) # noqa + fp_dataset = Path(fp_dataset) + params.append(self.get_dataset_specific_params(i, fp_dataset)) + + try: + rendered_output = self.template.render( + self.get_template_specific_params(params) + ) + except TypeError as e: + if "'StrictUndefined' object cannot be interpreted as an integer" in str(e): + print("\033[31mERROR: A rendering exception occurred -", e) + print( + "The rendering process failed due to an attempt to use an undefined variable where an integer was expected." 
# noqa: E501 + ) + print( + "\033[33mPlease check your template for variables that are not properly defined or passed in.\033[0m" # noqa: E501 + ) + exit(1) + else: + raise e + + self.file_handler.print_summary() + return rendered_output diff --git a/asreviewcontrib/makita/template_basic.py b/asreviewcontrib/makita/template_basic.py index 38643176..e1edb76d 100644 --- a/asreviewcontrib/makita/template_basic.py +++ b/asreviewcontrib/makita/template_basic.py @@ -1,108 +1,60 @@ """Render basic template.""" -import os -import platform -from pathlib import Path - -from cfgtemplater.config_template import ConfigTemplate - -from asreviewcontrib.makita import __version__ -from asreviewcontrib.makita.utils import FileHandler -from asreviewcontrib.makita.utils import check_filename_dataset - - -def render_jobs_basic( - datasets, - output_folder="output", - scripts_folder="scripts", - create_wordclouds=True, - n_runs=1, - init_seed=535, - model_seed=165, - classifier="nb", - feature_extractor="tfidf", - query_strategy="max", - balance_strategy="double", - instances_per_query=1, - stop_if='min', - fp_template=None, - job_file=None, - platform_sys=None, -): - """Render jobs.""" - - if not platform_sys: - platform_sys = platform.system() - if not job_file: - job_file = "jobs.bat" if os.name == "nt" else "jobs.sh" - - params = [] - - # initialize file handler - file_handler = FileHandler() - - # generate params for all simulations - for i, fp_dataset in enumerate(sorted(datasets)): - check_filename_dataset(fp_dataset) - - fp_dataset = Path(fp_dataset) - - # params for single dataset - params.append( - { - "input_file": fp_dataset.as_posix(), - "input_file_stem": fp_dataset.stem, - "model_seed": model_seed + i, - "init_seed": init_seed, - "n_runs": n_runs, - } - ) - - # Instantiate a ConfigTemplate object, initializing a Jinja2 environment and - # setting up template variables and extensions. - template = ConfigTemplate(fp_template) - - # render scripts - if template.scripts is not None: - for s in template.scripts: - t_script = file_handler.render_file_from_template( - s, "script", output_folder=output_folder - ) - export_fp = Path(scripts_folder, s) - file_handler.add_file(t_script, export_fp) +from asreview import config as ASREVIEW_CONFIG + +from asreviewcontrib.makita.template_base import TemplateBase + + +class TemplateBasic(TemplateBase): + template_file = "template_basic.txt.template" + + def __init__( + self, + classifier, + feature_extractor, + query_strategy, + n_runs, + **kwargs, + ): + self.classifier = classifier + self.feature_extractor = feature_extractor + self.query_strategy = query_strategy + self.n_runs = n_runs + super().__init__(**kwargs) + + def get_dataset_specific_params(self, index, fp_dataset): + """Prepare dataset-specific parameters. These parameters are provided to the + template once for each dataset.""" + + return { + "input_file": fp_dataset.as_posix(), + "input_file_stem": fp_dataset.stem, + "model_seed": self.model_seed + index, + "init_seed": self.init_seed, + } - # render docs - if template.docs is not None: - for s in template.docs: - t_docs = file_handler.render_file_from_template( - s, - "doc", - datasets=datasets, - template_name=template.name if template.name == "basic" else "custom", - template_name_long=template.name_long, - template_scripts=template.scripts, - output_folder=output_folder, - job_file=job_file, - ) - file_handler.add_file(t_docs, s) + def get_template_specific_params(self, params): + """Prepare template-specific parameters. 
These parameters are provided to the + template only once.""" - # print summary to console - file_handler.print_summary() + # set default values if not provided + classifier = self.classifier if self.classifier is not None else ASREVIEW_CONFIG.DEFAULT_MODEL # noqa: E501 + feature_extractor = self.feature_extractor if self.feature_extractor is not None else ASREVIEW_CONFIG.DEFAULT_FEATURE_EXTRACTION # noqa: E501 + query_strategy = self.query_strategy if self.query_strategy is not None else ASREVIEW_CONFIG.DEFAULT_QUERY_STRATEGY # noqa: E501 + balance_strategy = self.balance_strategy if self.balance_strategy is not None else ASREVIEW_CONFIG.DEFAULT_BALANCE_STRATEGY # noqa: E501 + n_runs = self.n_runs if self.n_runs is not None else 1 - # render file and return - return template.render( - { - "datasets": params, - "create_wordclouds": create_wordclouds, + return { "classifier": classifier, "feature_extractor": feature_extractor, "query_strategy": query_strategy, "balance_strategy": balance_strategy, - "instances_per_query": instances_per_query, - "stop_if": stop_if, - "output_folder": output_folder, - "scripts_folder": scripts_folder, - "platform_sys": platform_sys, - "version": __version__, + "n_runs": n_runs, + "datasets": params, + "skip_wordclouds": self.skip_wordclouds, + "instances_per_query": self.instances_per_query, + "stop_if": self.stop_if, + "output_folder": self.output_folder, + "scripts_folder": self.scripts_folder, + "version": self.__version__, } - ) diff --git a/asreviewcontrib/makita/template_multimodel.py b/asreviewcontrib/makita/template_multimodel.py index e830e458..60ce53a4 100644 --- a/asreviewcontrib/makita/template_multimodel.py +++ b/asreviewcontrib/makita/template_multimodel.py @@ -1,127 +1,66 @@ """Render multimodel template.""" -import os -import platform -from pathlib import Path - -from cfgtemplater.config_template import ConfigTemplate - -from asreviewcontrib.makita import __version__ -from asreviewcontrib.makita.utils import FileHandler -from asreviewcontrib.makita.utils import check_filename_dataset - - -def render_jobs_multimodel( - datasets, - output_folder="output", - n_runs=1, - scripts_folder="scripts", - create_wordclouds=True, - init_seed=535, - model_seed=165, - all_classifiers=None, - all_feature_extractors=None, - all_query_strategies=None, - all_balancing_strategies=None, - impossible_models=None, - instances_per_query=1, - stop_if='min', - fp_template=None, - job_file=None, - platform_sys=None, -): - if all_classifiers is None: - all_classifiers = ["logistic", "nb", "rf", "svm"] - - if all_feature_extractors is None: - all_feature_extractors = ["doc2vec", "sbert", "tfidf"] - - if all_query_strategies is None: - all_query_strategies = ["max"] - - if all_balancing_strategies is None: - all_balancing_strategies = ["double"] - - if impossible_models is None: - impossible_models = ["nb,doc2vec", "nb,sbert"] - - - """Render jobs.""" - if not platform_sys: - platform_sys = platform.system() - if not job_file: - job_file = "jobs.bat" if os.name == "nt" else "jobs.sh" - - params = [] - - # initialize file handler - file_handler = FileHandler() - - # generate params for all simulations - for i, fp_dataset in enumerate(sorted(datasets)): - check_filename_dataset(fp_dataset) - - fp_dataset = Path(fp_dataset) - - # params for single dataset - params.append( - { - "input_file": fp_dataset.as_posix(), - "input_file_stem": fp_dataset.stem, - "model_seed": model_seed + i, - "init_seed": init_seed, - } - ) - - # Instantiate a ConfigTemplate object, initializing a 
Jinja2 environment and - # setting up template variables and extensions. - template = ConfigTemplate(fp_template) - - # render scripts - if template.scripts is not None: - for s in template.scripts: - t_script = file_handler.render_file_from_template( - s, "script", output_folder=output_folder - ) - export_fp = Path(scripts_folder, s) - file_handler.add_file(t_script, export_fp) +from asreview import config as ASREVIEW_CONFIG + +from asreviewcontrib.makita.template_base import TemplateBase + + +class TemplateMultiModel(TemplateBase): + template_file = "template_multimodel.txt.template" + + def __init__( + self, + classifiers, + feature_extractors, + query_strategies, + balance_strategies, + impossible_models, + n_runs, + **kwargs, + ): + self.n_runs = n_runs + self.all_classifiers = classifiers + self.all_feature_extractors = feature_extractors + self.all_query_strategies = query_strategies + self.all_balance_strategies = balance_strategies + self.impossible_models = impossible_models + + super().__init__(**kwargs) + + def get_dataset_specific_params(self, index, fp_dataset): + """Prepare dataset-specific parameters. These parameters are provided to the + template once for each dataset.""" + + return { + "input_file": fp_dataset.as_posix(), + "input_file_stem": fp_dataset.stem, + "model_seed": self.model_seed + index, + "init_seed": self.init_seed, + } - # render docs - if template.docs is not None: - for s in template.docs: - t_docs = file_handler.render_file_from_template( - s, - "doc", - datasets=datasets, - template_name=template.name - if template.name == "multimodel" - else "custom", - template_name_long=template.name_long, - template_scripts=template.scripts, - output_folder=output_folder, - job_file=job_file, - ) - file_handler.add_file(t_docs, s) + def get_template_specific_params(self, params): + """Prepare template-specific parameters. 
These parameters are provided to the
+        template only once."""
 
-    # print summary to console
-    file_handler.print_summary()
+        all_classifiers = self.all_classifiers if self.all_classifiers is not None else ["logistic", "nb", "rf", "svm"]  # noqa: E501
+        all_feature_extractors = self.all_feature_extractors if self.all_feature_extractors is not None else ["doc2vec", "sbert", "tfidf"]  # noqa: E501
+        all_query_strategies = self.all_query_strategies if self.all_query_strategies is not None else [ASREVIEW_CONFIG.DEFAULT_QUERY_STRATEGY]  # noqa: E501
+        all_balance_strategies = self.all_balance_strategies if self.all_balance_strategies is not None else [ASREVIEW_CONFIG.DEFAULT_BALANCE_STRATEGY]  # noqa: E501
+        impossible_models = [i.split(",") for i in self.impossible_models] if self.impossible_models is not None else [['nb', 'doc2vec'], ['nb', 'sbert']]  # noqa: E501
+        n_runs = self.n_runs if self.n_runs is not None else 1
 
-    # render file and return
-    return template.render(
-        {
+        return {
             "datasets": params,
-            "create_wordclouds": create_wordclouds,
-            "instances_per_query": instances_per_query,
-            "stop_if": stop_if,
-            "output_folder": output_folder,
+            "skip_wordclouds": self.skip_wordclouds,
+            "instances_per_query": self.instances_per_query,
+            "stop_if": self.stop_if,
+            "output_folder": self.output_folder,
             "n_runs": n_runs,
-            "scripts_folder": scripts_folder,
-            "platform": platform_sys,
-            "version": __version__,
-            "all_query_strategies": all_query_strategies,
+            "scripts_folder": self.scripts_folder,
+            "version": self.__version__,
             "all_classifiers": all_classifiers,
             "all_feature_extractors": all_feature_extractors,
-            "all_balancing_strategies": all_balancing_strategies,
-            "impossible_models": [i.split(",") for i in impossible_models],
+            "all_query_strategies": all_query_strategies,
+            "all_balance_strategies": all_balance_strategies,
+            "impossible_models": impossible_models,
         }
-    )
diff --git a/asreviewcontrib/makita/templates/doc_README.md.template b/asreviewcontrib/makita/templates/doc_README.md.template
index df9667a8..2dc3e989 100644
--- a/asreviewcontrib/makita/templates/doc_README.md.template
+++ b/asreviewcontrib/makita/templates/doc_README.md.template
@@ -13,13 +13,13 @@ This project depends on Python 3.7 or later (python.org/download), and [ASReview
 
 ```sh
 pip install asreview>=1.0 asreview-insights>=1.1.2 asreview-datatools
 ```
-
-If wordcloud images are required, install the following dependencies.
+{% if not skip_wordclouds %}
+For generating wordclouds, install the following dependencies.
 
```sh pip install asreview-wordcloud ``` - +{% endif %} ## Data The performance on the following datasets is evaluated: @@ -70,7 +70,6 @@ The following files are found in this project: | └── 📜metrics_summary.xlsx └── 📂figures{% for dataset in datasets %} ├── 📈plot_recall_{{ dataset.stem }}.png{% endfor %}{% for dataset in datasets %} - ├── 📈wordcloud_{{ dataset.stem }}.png +{% if not skip_wordclouds %} ├── 📈wordcloud_{{ dataset.stem }}.png ├── 📈wordcloud_relevant_{{ dataset.stem }}.png - └── 📈wordcloud_irrelevant_{{ dataset.stem }}.png{% endfor %} -{%endif %} + └── 📈wordcloud_irrelevant_{{ dataset.stem }}.png{%endif %}{% endfor %}{%endif %} \ No newline at end of file diff --git a/asreviewcontrib/makita/templates/script_get_plot.py.template b/asreviewcontrib/makita/templates/script_get_plot.py.template index 1108a226..4faf4a8a 100644 --- a/asreviewcontrib/makita/templates/script_get_plot.py.template +++ b/asreviewcontrib/makita/templates/script_get_plot.py.template @@ -34,10 +34,13 @@ def _set_legend(ax, state, legend_option, label_to_line, state_file): label = state_file.stem elif legend_option == "model": label = " - ".join( - [metadata["settings"]["model"], - metadata["settings"]["feature_extraction"], - metadata["settings"]["balance_strategy"], - metadata["settings"]["query_strategy"]]) + [ + metadata["settings"]["model"], + metadata["settings"]["feature_extraction"], + metadata["settings"]["balance_strategy"], + metadata["settings"]["query_strategy"], + ] + ) elif legend_option == "classifier": label = metadata["settings"]["model"] else: @@ -82,27 +85,22 @@ def get_plot_from_states(states, filename, legend=None): _set_legend(ax, state, legend, label_to_line, state_file) if legend: - ax.legend(loc=4, prop={'size': 8}) + ax.legend(loc=4, prop={"size": 8}) fig.savefig(str(filename)) if __name__ == "__main__": - parser = argparse.ArgumentParser( description="Generate an ASReview plot from the found state files." 
) + parser.add_argument("-s", type=str, help="States location") + parser.add_argument("-o", type=str, help="Output location") parser.add_argument( - "-s", - type=str, - help="States location") - parser.add_argument( - "-o", + "--show_legend", + "-l", type=str, - help="Output location") - parser.add_argument( - "--show_legend", "-l", - type=str, - help="Add a legend to the plot, based on the given parameter.") + help="Add a legend to the plot, based on the given parameter.", + ) args = parser.parse_args() # load states diff --git a/asreviewcontrib/makita/templates/script_get_settings_from_state.py.template b/asreviewcontrib/makita/templates/script_get_settings_from_state.py.template index d3f2f8c0..88bb47cc 100644 --- a/asreviewcontrib/makita/templates/script_get_settings_from_state.py.template +++ b/asreviewcontrib/makita/templates/script_get_settings_from_state.py.template @@ -36,23 +36,13 @@ def get_settings_from_state(state): return state.settings.to_dict() -if __name__ == '__main__': - - parser = argparse.ArgumentParser( - description='Convert ASReview state file to CSV' - ) - parser.add_argument( - 's', - type=str, - help='State file location') - parser.add_argument( - 'o', - type=str, - help='Export file location (json)') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert ASReview state file to CSV") + parser.add_argument("s", type=str, help="State file location") + parser.add_argument("o", type=str, help="Export file location (json)") args = parser.parse_args() with open_state(args.s) as state: - result = get_settings_from_state(state) # store result in output folder @@ -60,4 +50,3 @@ if __name__ == '__main__': with open(Path(args.o), "w") as f: json.dump(result, f) - diff --git a/asreviewcontrib/makita/templates/script_merge_descriptives.py.template b/asreviewcontrib/makita/templates/script_merge_descriptives.py.template index eb5930f8..2b2992c5 100644 --- a/asreviewcontrib/makita/templates/script_merge_descriptives.py.template +++ b/asreviewcontrib/makita/templates/script_merge_descriptives.py.template @@ -35,10 +35,10 @@ def create_table_descriptives(datasets): for ds in datasets: with open(ds) as f: - data = json.load(f)['data']['items'] + data = json.load(f)["data"]["items"] values = {} for item in data: - values[item['id']] = item['value'] + values[item["id"]] = item["value"] stats.append(values) df = pd.DataFrame(stats, index=[Path(ds).name for ds in datasets]) @@ -46,7 +46,6 @@ def create_table_descriptives(datasets): if __name__ == "__main__": - parser = argparse.ArgumentParser( description="Merge descriptives of multiple files into single table." 
) @@ -54,12 +53,14 @@ if __name__ == "__main__": "-s", type=str, default="{{ output_folder }}/simulation/*/descriptives/", - help="Datasets location") + help="Datasets location", + ) parser.add_argument( "-o", type=str, default="{{ output_folder }}/tables/data_descriptives_all.csv", - help="Output table location") + help="Output table location", + ) args = parser.parse_args() # load datasets @@ -75,5 +76,4 @@ if __name__ == "__main__": # store result in output folder Path(args.o).parent.mkdir(parents=True, exist_ok=True) result.to_csv(Path(args.o)) - result.to_excel(Path(args.o).with_suffix('.xlsx')) - + result.to_excel(Path(args.o).with_suffix(".xlsx")) diff --git a/asreviewcontrib/makita/templates/script_merge_metrics.py.template b/asreviewcontrib/makita/templates/script_merge_metrics.py.template index 83bd7efb..512857bb 100644 --- a/asreviewcontrib/makita/templates/script_merge_metrics.py.template +++ b/asreviewcontrib/makita/templates/script_merge_metrics.py.template @@ -33,25 +33,24 @@ def create_table_state_metrics(metric_files): for metric in metric_files: with open(metric) as f: - data = json.load(f)['data']['items'] + data = json.load(f)["data"]["items"] values = {} - values['file_name'] = Path(metric).name + values["file_name"] = Path(metric).name for item in data: - if item['id'] == 'td': + if item["id"] == "td": continue # check if value is a list - if item['value'] is not None and isinstance(item['value'], list): - for value in item['value']: - values[item['id'] + "_" + str(value[0])] = value[1] + if item["value"] is not None and isinstance(item["value"], list): + for value in item["value"]: + values[item["id"] + "_" + str(value[0])] = value[1] else: - values[item['id']] = item['value'] + values[item["id"]] = item["value"] metrics.append(values) return pd.DataFrame(metrics) if __name__ == "__main__": - parser = argparse.ArgumentParser( description="Merge metrics of multiple states into single table." 
) @@ -59,12 +58,14 @@ if __name__ == "__main__": "-s", type=str, default="{{ output_folder }}/simulation/*/metrics/", - help="states location") + help="states location", + ) parser.add_argument( "-o", type=str, default="{{ output_folder }}/tables/metrics_sim_all.csv", - help="Output table location") + help="Output table location", + ) args = parser.parse_args() # load metric files @@ -80,5 +81,4 @@ if __name__ == "__main__": # store result in output folder Path(args.o).parent.mkdir(parents=True, exist_ok=True) result.to_csv(Path(args.o)) - result.to_excel(Path(args.o).with_suffix('.xlsx')) - + result.to_excel(Path(args.o).with_suffix(".xlsx")) diff --git a/asreviewcontrib/makita/templates/script_merge_tds.py.template b/asreviewcontrib/makita/templates/script_merge_tds.py.template index 28af7cc1..17971a1b 100644 --- a/asreviewcontrib/makita/templates/script_merge_tds.py.template +++ b/asreviewcontrib/makita/templates/script_merge_tds.py.template @@ -36,25 +36,28 @@ def create_table_state_tds(metrics): for metric in metrics: with open(metric) as f: - i = next(filter(lambda x: x['id'] == 'td', json.load(f)['data']['items']))['value'] # noqa + i = next(filter(lambda x: x["id"] == "td", json.load(f)["data"]["items"]))[ + "value" + ] values.extend((item[0], item[1], file_counter) for item in i) file_counter += 1 - df = pd.DataFrame(values, columns=['record_id', 'td', 'metric_file']) - pivoted = df.pivot_table(index='record_id', - columns='metric_file', - values='td', - aggfunc='first', - fill_value=nan) - pivoted.columns = [f'td_sim_{col}' for col in pivoted.columns] + df = pd.DataFrame(values, columns=["record_id", "td", "metric_file"]) + pivoted = df.pivot_table( + index="record_id", + columns="metric_file", + values="td", + aggfunc="first", + fill_value=nan, + ) + pivoted.columns = [f"td_sim_{col}" for col in pivoted.columns] return pivoted def get_atd_values(df): + df["record_atd"] = df.mean(axis=1) - df['record_atd'] = df.mean(axis=1) - - df.loc['average_simulation_TD'] = df.iloc[:, :-1].mean(axis=0) + df.loc["average_simulation_TD"] = df.iloc[:, :-1].mean(axis=0) return df @@ -63,16 +66,8 @@ if __name__ == "__main__": parser = argparse.ArgumentParser( description="Merge tds of multiple metrics into single table." 
) - parser.add_argument( - "-s", - type=str, - required=True, - help="metrics location") - parser.add_argument( - "-o", - type=str, - required=True, - help="Output table location") + parser.add_argument("-s", type=str, required=True, help="metrics location") + parser.add_argument("-o", type=str, required=True, help="Output table location") args = parser.parse_args() # load metric files @@ -83,7 +78,7 @@ if __name__ == "__main__": raise FileNotFoundError("No metrics found in " + args.s) # check if output file has .csv extension - if Path(args.o).suffix != '.csv': + if Path(args.o).suffix != ".csv": raise ValueError("Output file should have .csv extension") td_table = create_table_state_tds(metric_files) @@ -92,5 +87,4 @@ if __name__ == "__main__": # store table Path(args.o).parent.mkdir(parents=True, exist_ok=True) atd_table.to_csv(Path(args.o)) - atd_table.to_excel(Path(args.o).with_suffix('.xlsx')) - + atd_table.to_excel(Path(args.o).with_suffix(".xlsx")) diff --git a/asreviewcontrib/makita/templates/script_split_data_with_multiple_labels.py.template b/asreviewcontrib/makita/templates/script_split_data_with_multiple_labels.py.template index 6c13418b..a631c4c8 100644 --- a/asreviewcontrib/makita/templates/script_split_data_with_multiple_labels.py.template +++ b/asreviewcontrib/makita/templates/script_split_data_with_multiple_labels.py.template @@ -104,4 +104,3 @@ if __name__ == '__main__': args = parser.parse_args() etl(args.s, args.o, split=args.split, suffix=args.suffix) - diff --git a/asreviewcontrib/makita/templates/template_arfi.txt.template b/asreviewcontrib/makita/templates/template_arfi.txt.template index 182ee5ad..99a7bead 100644 --- a/asreviewcontrib/makita/templates/template_arfi.txt.template +++ b/asreviewcontrib/makita/templates/template_arfi.txt.template @@ -36,7 +36,7 @@ mkdir {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/metrics # Collect descriptives about the dataset mkdir {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/descriptives python -m asreview data describe {{ dataset.input_file }} -o {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/descriptives/data_stats_{{ dataset.input_file_stem }}.json -{% if create_wordclouds %} +{% if not skip_wordclouds %} # Generate wordcloud visualizations of all datasets python -m asreview wordcloud {{ dataset.input_file }} -o {{ output_folder }}/figures/wordcloud_{{ dataset.input_file_stem }}.png --width 800 --height 500 diff --git a/asreviewcontrib/makita/templates/template_basic.txt.template b/asreviewcontrib/makita/templates/template_basic.txt.template index 7be33cb2..734e072f 100644 --- a/asreviewcontrib/makita/templates/template_basic.txt.template +++ b/asreviewcontrib/makita/templates/template_basic.txt.template @@ -38,7 +38,7 @@ mkdir {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/metrics # Collect descriptives about the dataset {{ dataset.input_file_stem }} mkdir {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/descriptives python -m asreview data describe {{ dataset.input_file }} -o {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/descriptives/data_stats_{{ dataset.input_file_stem }}.json -{% if create_wordclouds %} +{% if not skip_wordclouds %} # Generate wordcloud visualizations of all datasets python -m asreview wordcloud {{ dataset.input_file }} -o {{ output_folder }}/figures/wordcloud_{{ dataset.input_file_stem }}.png --width 800 --height 500 @@ -48,9 +48,9 @@ python -m asreview wordcloud {{ dataset.input_file }} -o {{ output_folder }}/fig # 
Simulate runs mkdir {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/state_files -{% for run in range(dataset.n_runs) %} -python -m asreview simulate {{ dataset.input_file }} -s {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/state_files/sim_{{ dataset.input_file_stem }}_{{ run }}.asreview --init_seed {{ dataset.init_seed + run }} --seed {{ dataset.model_seed + run }} -m {{ classifier }} -e {{ feature_extractor }} -q {{ query_strategy }} -b {{ balance_strategy }} --n_instances {{ instances_per_query }} --stop_if {{ stop_if }} -python -m asreview metrics {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/state_files/sim_{{ dataset.input_file_stem }}_{{ run }}.asreview -o {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/metrics/metrics_sim_{{ dataset.input_file_stem }}_{{ run }}.json +{% for run in range(n_runs) %} +python -m asreview simulate {{ dataset.input_file }} -s {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/state_files/sim_{{ dataset.input_file_stem }}{{ "_{}".format(run) if n_runs > 1 else "" }}.asreview --init_seed {{ dataset.init_seed + run }} --seed {{ dataset.model_seed + run }} -m {{ classifier }} -e {{ feature_extractor }} -q {{ query_strategy }} -b {{ balance_strategy }} --n_instances {{ instances_per_query }} --stop_if {{ stop_if }} +python -m asreview metrics {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/state_files/sim_{{ dataset.input_file_stem }}{{ "_{}".format(run) if n_runs > 1 else "" }}.asreview -o {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/metrics/metrics_sim_{{ dataset.input_file_stem }}{{ "_{}".format(run) if n_runs > 1 else "" }}.json {% endfor %} # Generate plot and tables for dataset diff --git a/asreviewcontrib/makita/templates/template_multimodel.txt.template b/asreviewcontrib/makita/templates/template_multimodel.txt.template index c1a318d3..f0b8a949 100644 --- a/asreviewcontrib/makita/templates/template_multimodel.txt.template +++ b/asreviewcontrib/makita/templates/template_multimodel.txt.template @@ -35,7 +35,7 @@ mkdir {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/metrics # Collect descriptives about the dataset {{ dataset.input_file_stem }} mkdir {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/descriptives python -m asreview data describe {{ dataset.input_file }} -o {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/descriptives/data_stats_{{ dataset.input_file_stem }}.json -{% if create_wordclouds %} +{% if not skip_wordclouds %} # Generate wordcloud visualizations of all datasets python -m asreview wordcloud {{ dataset.input_file }} -o {{ output_folder }}/figures/wordcloud_{{ dataset.input_file_stem }}.png --width 800 --height 500 @@ -48,15 +48,15 @@ mkdir {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/state_files {% for classifier in all_classifiers %} {% for feature_extraction in all_feature_extractors %} {% for query_strategy in all_query_strategies %} -{% for balance_strategy in all_balancing_strategies %} +{% for balance_strategy in all_balance_strategies %} {% set temp = [] %}{{ temp.append(classifier)|default("", True) }}{{ temp.append(feature_extraction)|default("", True) }} {% if temp in impossible_models %} # Skipped {{ classifier }} + {{ feature_extraction }} + {{ query_strategy}} model {% else %}# Classifier = {{ classifier }}, Feature extractor = {{ feature_extraction }}, Query strategy = {{ query_strategy }}, Balance strategy = {{balance_strategy}} {% for run in range(n_runs) %} -python -m 
asreview simulate {{ dataset.input_file }} -s {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/state_files/sim_{{ dataset.input_file_stem }}_{{ classifier }}_{{ feature_extraction }}_{{ query_strategy }}_{{balance_strategy}}_{{ run }}.asreview --model {{ classifier }} --query_strategy {{query_strategy}} --balance_strategy {{balance_strategy}} --feature_extraction {{ feature_extraction }} --init_seed {{ dataset.init_seed + run }} --seed {{ dataset.model_seed }} -q {{ query_strategy }} -b {{ balance_strategy }} --n_instances {{ instances_per_query }} --stop_if {{ stop_if }} -python -m asreview metrics {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/state_files/sim_{{ dataset.input_file_stem }}_{{ classifier }}_{{ feature_extraction }}_{{ query_strategy }}_{{balance_strategy}}_{{ run }}.asreview -o {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/metrics/metrics_sim_{{ dataset.input_file_stem }}_{{ classifier }}_{{ feature_extraction }}_{{ query_strategy }}_{{balance_strategy}}_{{ run }}.json +python -m asreview simulate {{ dataset.input_file }} -s {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/state_files/sim_{{ dataset.input_file_stem }}_{{ classifier }}_{{ feature_extraction }}_{{ query_strategy }}_{{ balance_strategy }}{{ "_{}".format(run) if n_runs > 1 else "" }}.asreview --model {{ classifier }} --query_strategy {{query_strategy}} --feature_extraction {{ feature_extraction }} --init_seed {{ dataset.init_seed + run }} --seed {{ dataset.model_seed }} -q {{ query_strategy }} -b {{ balance_strategy }} --n_instances {{ instances_per_query }} --stop_if {{ stop_if }} +python -m asreview metrics {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/state_files/sim_{{ dataset.input_file_stem }}_{{ classifier }}_{{ feature_extraction }}_{{ query_strategy }}_{{ balance_strategy }}{{ "_{}".format(run) if n_runs > 1 else "" }}.asreview -o {{ output_folder }}/simulation/{{ dataset.input_file_stem }}/metrics/metrics_sim_{{ dataset.input_file_stem }}_{{ classifier }}_{{ feature_extraction }}_{{ query_strategy }}_{{ balance_strategy }}{{ "_{}".format(run) if n_runs > 1 else "" }}.json {% endfor %}{% endif %} {% endfor %} {% endfor %} diff --git a/asreviewcontrib/makita/utils.py b/asreviewcontrib/makita/utils.py index 4169248e..d818413d 100644 --- a/asreviewcontrib/makita/utils.py +++ b/asreviewcontrib/makita/utils.py @@ -12,9 +12,9 @@ class FileHandler: scripts. """ - def __init__(self): - self.overwrite_all = False - self.total_files = 0 + def __init__(self, allow_overwrite=False): + self.overwrite_all = allow_overwrite + self._total_files = 0 def add_file(self, content, export_fp): """ @@ -50,16 +50,18 @@ def allow_overwrite(): with open(export_fp, "w") as f: f.write(content) + f.write("\n") - print(f"Added {export_fp}") - self.total_files += 1 + print(f"Created {export_fp}") + + self._total_files += 1 def print_summary(self): """ Print the total number of files created by the FileHandler object. 
""" - print(f"{self.total_files} file(s) created.") + print(f"\n{self._total_files} file(s) created.") def render_file_from_template(self, name, file_type, **kwargs): """ @@ -78,25 +80,8 @@ def render_file_from_template(self, name, file_type, **kwargs): "version": __version__, } - print(f"Loading {file_type} {name}") - # open template with open(Path(TEMPLATES_FP, f"{file_type}_{name}.template")) as f: template = Template(f.read()) return template.render({**params, **kwargs}) - - -def check_filename_dataset(fp): - """ - Check if the filename of the dataset contains any whitespace. - - Args: - fp (str): The file path of the dataset. - - Raises: - ValueError: If the filename of the dataset contains whitespace. - """ - - if " " in Path(fp).stem: - raise ValueError(f"Dataset filename '{fp}' cannot contain whitespace.") diff --git a/pyproject.toml b/pyproject.toml index 5c10c86e..6af6bad4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,10 +12,11 @@ classifiers = [ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11" + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12" ] license = {text = "MIT"} -dependencies = ["asreview", "jinja2", "cfgtemplater"] +dependencies = ["asreview>=1,<2", "jinja2", "cfgtemplater"] dynamic = ["version"] requires-python = ">=3.7" @@ -26,6 +27,12 @@ repository = "https://github.com/asreview/asreview-makita" [project.entry-points."asreview.entry_points"] makita = "asreviewcontrib.makita.entrypoint:MakitaEntryPoint" +[project.entry-points."asreview.makita.templates"] +basic = "asreviewcontrib.makita.template_basic:TemplateBasic" +arfi = "asreviewcontrib.makita.template_arfi:TemplateARFI" +multimodel = "asreviewcontrib.makita.template_multimodel:TemplateMultiModel" +multiple_models = "asreviewcontrib.makita.template_multimodel:TemplateMultiModel" + [project.optional-dependencies] lint = ["ruff"] test = ["pytest"] @@ -41,7 +48,8 @@ packages = ["asreviewcontrib"] write_to = "asreviewcontrib/makita/_version.py" [tool.ruff] -select = ["E", "F", "UP", "I", "B"] +lint.select = ["E", "F", "UP", "I", "B"] +include = ["**/*.py", "**/*.py.template"] -[tool.ruff.isort] -force-single-line = true +[tool.ruff.lint.isort] +force-single-line = true \ No newline at end of file