diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c749d08..48f7f833 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,15 @@ ## [Unreleased] ### Features +* Save Experiment Description files in YAML (from JSON) + * Makes Description files easier for humans to read + * Fixes issues with certain hyperparameter types ignored by JSON, such as dicts with non-string + keys, and tuples + * If you hate the new YAML Descriptions and want to go back to JSON, you can set + `settings.G.description_format` to "json" + * Consider opening [an issue](https://github.com/HunterMcGushion/hyperparameter_hunter/issues), + telling me this was a dumb decision. Feedback is very much appreciated (honestly) + * Be warned that reverting to "json" means the above-noted issues could occur * Enabled optimization of tuple values via [`Categorical`](https://hyperparameter-hunter.readthedocs.io/en/stable/source/hyperparameter_hunter.space.html#hyperparameter_hunter.space.dimensions.Categorical) * This can be used with Keras to search over different `kernel_size` values for `Conv2D` or `pool_size` values for `MaxPooling2D`, for example: diff --git a/hyperparameter_hunter/i_o/exceptions.py b/hyperparameter_hunter/i_o/exceptions.py index 964adb62..2100a438 100644 --- a/hyperparameter_hunter/i_o/exceptions.py +++ b/hyperparameter_hunter/i_o/exceptions.py @@ -130,11 +130,6 @@ def __init__(self, candidate, template): super(IncompatibleCandidateError, self).__init__(message) -class ContinueRemap(Exception): - def __str__(self): - return "Just keep doing what you were doing" - - ################################################## # Deprecation Warnings ################################################## diff --git a/hyperparameter_hunter/i_o/recorders.py b/hyperparameter_hunter/i_o/recorders.py index cd982f53..2b11da9b 100644 --- a/hyperparameter_hunter/i_o/recorders.py +++ b/hyperparameter_hunter/i_o/recorders.py @@ -13,15 +13,18 @@ from hyperparameter_hunter.i_o.exceptions import EnvironmentInactiveError, EnvironmentInvalidError from hyperparameter_hunter.i_o.leaderboards import GlobalLeaderboard from hyperparameter_hunter.settings import G -from hyperparameter_hunter.utils.file_utils import write_json, add_to_json, make_dirs, read_json -from hyperparameter_hunter.utils.file_utils import RetryMakeDirs +from hyperparameter_hunter.utils.file_utils import ( + add_to_json, + RetryMakeDirs, + write_json, + write_yaml, +) from hyperparameter_hunter.utils.general_utils import subdict ################################################## # Import Miscellaneous Assets ################################################## from abc import ABCMeta, abstractmethod -from collections import OrderedDict from platform import node import shutil from sys import exc_info @@ -124,7 +127,8 @@ def __init__(self, file_blacklist=None, extra_recorders=None): an Experiment. The contents of `extra_recorders` are blacklisted in the same way as normal `recorders`. That is, if `file_blacklist` contains the `result_path_key` of a recorder in `extra_recorders`, that recorder is blacklisted""" - # WARNING: Take care if modifying the order/contents of :attr:`recorders`. 
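#################### Illustrative sketch for the CHANGELOG entry above ####################
# Hedged example (not taken from this diff) of optimizing tuple values via `Categorical` with Keras.
# Following the library's Keras examples, Dimensions are placed directly inside `build_fn`, which an
# OptPro then receives through `forge_experiment`. Layer sizes and input shape here are made up
from hyperparameter_hunter import Categorical
from keras.layers import Conv2D, Dense, Flatten, MaxPooling2D
from keras.models import Sequential


def build_fn(input_shape=(28, 28, 1)):
    model = Sequential()
    model.add(
        Conv2D(
            32,
            kernel_size=Categorical([(1, 1), (3, 3), (5, 5)]),  # Tuple values searched directly
            activation="relu",
            input_shape=input_shape,
        )
    )
    model.add(MaxPooling2D(pool_size=Categorical([(2, 2), (3, 3)])))  # Also works for `pool_size`
    model.add(Flatten())
    model.add(Dense(10, activation="softmax"))
    model.compile(optimizer="adam", loss="categorical_crossentropy")
    return model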
See :meth:`save_result` documentation for info + # WARNING: Take care if modifying the order/contents of :attr:`recorders` + # See :meth:`save_result` documentation for info self.recorders = [ TestedKeyRecorder, LeaderboardEntryRecorder, @@ -193,7 +197,7 @@ class DescriptionRecorder(BaseRecorder): "cross_experiment_key", "last_evaluation_results", "stat_aggregates", - # 'train_features', + # "train_features", "source_script", "notes", "model_initializer", @@ -204,25 +208,22 @@ class DescriptionRecorder(BaseRecorder): ] def format_result(self): - """Format an OrderedDict containing the Experiment's identifying attributes, results, + """Format a dict containing the Experiment's identifying attributes, results, hyperparameters used, and other stats or information that may be useful""" - self.result = OrderedDict( - [ - ("experiment_id", self.experiment_id), - ("algorithm_name", self.algorithm_name), - ("module_name", self.module_name), - ("hyperparameter_key", self.hyperparameter_key.key), - ("cross_experiment_key", self.cross_experiment_key.key), - ("final_evaluations", self.last_evaluation_results), - ("hyperparameters", self.hyperparameter_key.parameters), - ("cross_experiment_parameters", self.cross_experiment_key.parameters), - ("train_features", None), # TODO: Record the column features in train df - ("platform", node()), - ("source_script", self.source_script), - ("notes", self.notes or ""), - ("aggregates", self.stat_aggregates), - ] - ) + self.result = dict() + self.result["experiment_id"] = self.experiment_id + self.result["algorithm_name"] = self.algorithm_name + self.result["module_name"] = self.module_name + self.result["hyperparameter_key"] = self.hyperparameter_key.key + self.result["cross_experiment_key"] = self.cross_experiment_key.key + self.result["final_evaluations"] = self.last_evaluation_results + self.result["hyperparameters"] = self.hyperparameter_key.parameters + self.result["cross_experiment_parameters"] = self.cross_experiment_key.parameters + self.result["train_features"] = None # TODO: Record the column features in train df + self.result["platform"] = node() + self.result["source_script"] = self.source_script + self.result["notes"] = self.notes or "" + self.result["aggregates"] = self.stat_aggregates #################### Filter Hyperparameters' model_init_params #################### self.result["hyperparameters"]["model_init_params"] = subdict( @@ -230,22 +231,29 @@ def format_result(self): ) def save_result(self): - """Save the Experiment description as a .json file, named after :attr:`experiment_id`. If - :attr:`do_full_save` is a callable and returns False when given the description object, the - result recording loop will be broken, and the remaining result files will not be saved + """Save the Experiment Description as a .yaml/.json file, named after :attr:`experiment_id`. + If :attr:`do_full_save` is a callable and returns False when given the description object, + the result recording loop will be broken, and the remaining result files will not be saved Returns ------- - 'break' - This string will be returned if :attr:`do_full_save` is a callable and returns False - when given the description object. 
This is the signal for - :class:`recorders.RecorderList` to stop recording result files""" - try: - write_json(f"{self.result_path}/{self.experiment_id}.json", self.result, do_clear=False) - except FileNotFoundError: - make_dirs(self.result_path, exist_ok=False) - write_json(f"{self.result_path}/{self.experiment_id}.json", self.result, do_clear=False) - + "break", or None + "break" is returned if :attr:`do_full_save` is callable and returns False when given the + Description (:attr:`result`). This is the signal for :class:`recorders.RecorderList` to + stop saving files. Otherwise, nothing is returned, continuing the recording process + + See Also + -------- + :attr:`hyperparameter_hunter.settings.G.description_format` + Dictates whether to save Description as a .yaml file (default), or .json""" + if G.description_format == "yaml": + write_yaml(f"{self.result_path}/{self.experiment_id}.yaml", self.result) + elif G.description_format == "json": + write_json(f"{self.result_path}/{self.experiment_id}.json", self.result) + else: + raise ValueError(f"Unexpected `G.description_format`: {G.description_format}") + + #################### Decide Whether to Kill Recorder Loop #################### if (self.do_full_save is not None) and (not self.do_full_save(self.result)): G.warn("Breaking result-saving loop early! Remaining result files will not be saved") return "break" @@ -415,24 +423,3 @@ def format_result(self): def save_result(self): """Save the updated leaderboard file""" self.result.save(path=self.result_paths["unsorted_id_leaderboard"]) - - -class YAMLDescriptionRecorder(BaseRecorder): - result_path_key = "yaml_description" - required_attributes = ["result_paths", "experiment_id"] - - def format_result(self): - pass - - def save_result(self): - from yaml import dump - - self.result = read_json(f"{self.result_paths['description']}/{self.experiment_id}.json") - - make_dirs(self.result_path, exist_ok=True) - with open(f"{self.result_path}/{self.experiment_id}.yml", "w+") as f: - dump(self.result, f, default_flow_style=False, width=200) - - -if __name__ == "__main__": - pass diff --git a/hyperparameter_hunter/i_o/result_reader.py b/hyperparameter_hunter/i_o/result_reader.py index d9e42fad..91221016 100644 --- a/hyperparameter_hunter/i_o/result_reader.py +++ b/hyperparameter_hunter/i_o/result_reader.py @@ -1,3 +1,11 @@ +"""This module handles reading and processing saved Experiment result files and determining their +match status to guidelines/search space + +Related +------- +:mod:`hyperparameter_hunter.optimization.protocol_core` + OptPros use :class:`ResultFinder` to identify saved Experiment results that fit within the + current guidelines/search space""" ################################################## # Import Own Assets ################################################## @@ -550,9 +558,8 @@ def find(self): providing an updated "feature_engineer" value for compatible candidates to use. 
Specifics are documented in :meth:`does_match_feature_engineer`""" for exp_id in self.experiment_ids: - description_path = f"{self.descriptions_dir}/{exp_id}.json" # TODO: Get `description` from `get_scored_params` - Take whatever value `sort` needs - params, score = get_scored_params(description_path, self.target_metric) + params, score = get_scored_params(self.descriptions_dir, exp_id, self.target_metric) #################### Match Init Params #################### self.does_match_init_params_space(exp_id, params["model_init_params"], score) @@ -929,14 +936,14 @@ def _visit(path, key, value): ################################################## # Utilities ################################################## -def has_experiment_result_file(results_dir, experiment_id, result_type=None): - """Check if the specified result files exist in `results_dir` for Experiment `experiment_id` +def has_experiment_result_file(results_dir, exp_id, result_type=None): + """Check if the specified result files exist in `results_dir` for Experiment `exp_id` Parameters ---------- results_dir: String HyperparameterHunterAssets directory in which to search for Experiment result files - experiment_id: String, or BaseExperiment + exp_id: String, or BaseExperiment ID of the Experiment whose result files should be searched for in `results_dir`. If not string, should be an instance of a descendant of :class:`~hyperparameter_hunter.experiments.BaseExperiment` with an "experiment_id" attribute @@ -953,8 +960,13 @@ def has_experiment_result_file(results_dir, experiment_id, result_type=None): ------- Boolean True if all result files specified by `result_type` exist in `results_dir` for the - Experiment specified by `experiment_id`. Else, False""" - experiment_id = experiment_id if isinstance(experiment_id, str) else experiment_id.experiment_id + Experiment specified by `exp_id`. 
Else, False""" + exp_id = exp_id if isinstance(exp_id, str) else exp_id.experiment_id + + if results_dir.endswith("HyperparameterHunterAssets"): + exp_dir = Path(results_dir) / "Experiments" + else: + exp_dir = Path(results_dir) / "HyperparameterHunterAssets" / "Experiments" #################### Format `result_type` #################### if not result_type: @@ -972,25 +984,20 @@ def has_experiment_result_file(results_dir, experiment_id, result_type=None): result_type = [result_type] for subdir in result_type: - #################### Select Result File Suffix #################### + #################### Select Result File Suffixes #################### if subdir == "Descriptions": - suffix = ".json" + suffixes = (".yaml", ".yml", ".json") elif subdir == "Heartbeats": - suffix = ".log" + suffixes = (".log",) elif subdir == "ScriptBackups": - suffix = ".py" + suffixes = (".py",) elif subdir.startswith("Predictions"): - suffix = ".csv" + suffixes = (".csv",) else: raise ValueError(f"Cannot resolve suffix for subdir `result_type`: {subdir}") #################### Check "Experiments" Directory #################### - if results_dir.endswith("HyperparameterHunterAssets"): - experiments_dir = Path(results_dir) / "Experiments" - else: - experiments_dir = Path(results_dir) / "HyperparameterHunterAssets" / "Experiments" - - if not (experiments_dir / subdir / f"{experiment_id}{suffix}").exists(): + if not any((exp_dir / subdir / f"{exp_id}{suffix}").exists() for suffix in suffixes): return False return True diff --git a/hyperparameter_hunter/optimization/protocol_core.py b/hyperparameter_hunter/optimization/protocol_core.py index 0762f2d6..457352a3 100644 --- a/hyperparameter_hunter/optimization/protocol_core.py +++ b/hyperparameter_hunter/optimization/protocol_core.py @@ -8,7 +8,7 @@ Defines the optimization classes that are intended for direct use. 
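#################### Usage sketch for `has_experiment_result_file` ####################
# Hedged example (directory and ID are placeholders): with the suffix tuples above, a Description
# saved as ".yaml", ".yml", or ".json" all satisfy the "Descriptions" check
from hyperparameter_hunter.i_o.result_reader import has_experiment_result_file

if has_experiment_result_file(
    "HyperparameterHunterAssets", "my_experiment_id", result_type=["Descriptions", "Heartbeats"]
):
    print("Description and Heartbeat found, regardless of Description format")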
All classes defined in :mod:`hyperparameter_hunter.optimization.backends.skopt.protocols` should be descendants of :class:`~hyperparameter_hunter.optimization.protocol_core.BaseOptPro` -:mod:`hyperparameter_hunter.result_reader` +:mod:`hyperparameter_hunter.i_o.result_reader` Used to locate result files for Experiments that are similar to the current optimization constraints, and produce data to learn from in the case of :class:`SKOptPro` :mod:`hyperparameter_hunter.space` @@ -393,17 +393,17 @@ def forge_experiment( #################### Deal with Keras #################### if self.module_name == "keras": - reusable_build_fn, reusable_wrapper_params, dummy_layers, dummy_compile_params = keras_prep_workflow( + build_fn, wrapper_params, dummy_layers, dummy_compile_params = keras_prep_workflow( self.model_initializer, self.model_init_params["build_fn"], self.model_extra_params, self.source_script, ) - self.model_init_params = dict(build_fn=reusable_build_fn) - self.model_extra_params = reusable_wrapper_params + self.model_init_params = dict(build_fn=build_fn) # Reusable + self.model_extra_params = wrapper_params # Reusable self.dummy_layers = dummy_layers self.dummy_compile_params = dummy_compile_params - # FLAG: Deal with capitalization conflicts when comparing similar experiments: `optimizer`='Adam' vs 'adam' + # FLAG: Handle `optimizer` capitalization conflicts: `optimizer`="Adam" vs "adam" self.set_dimensions() diff --git a/hyperparameter_hunter/settings.py b/hyperparameter_hunter/settings.py index 2d415f6a..22bf9a2c 100644 --- a/hyperparameter_hunter/settings.py +++ b/hyperparameter_hunter/settings.py @@ -94,6 +94,9 @@ class G(object): target, which is the same form as the original target data. Continuing the example of label-encoded target data, and an :class:`feature_engineering.EngineerStep` to one-hot encode the target, in this case, label-encoded predictions will be saved. + description_format: {"yaml", "json"}, default="yaml" + How to save Experiment Description files. See + :meth:`hyperparameter_hunter.i_o.recorders.RecorderList.__init__` priority_callbacks: Tuple Intended for internal use only. 
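#################### Usage sketch: reverting Descriptions to JSON ####################
# Hedged example of the `description_format` setting documented above: set it before running
# Experiments. Note the CHANGELOG caveat that JSON mangles tuples and non-string dict keys
from hyperparameter_hunter.settings import G

G.description_format = "json"  # Default is "yaml"; any other value raises ValueError on save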
The contents of this tuple are inserted at the front of an Experiment's list of callback bases via :class:`experiment_core.ExperimentMeta`, ahead of @@ -116,11 +119,13 @@ class G(object): #################### Miscellaneous Settings #################### save_transformed_predictions = False + description_format = "yaml" #################### Internal Settings #################### priority_callbacks = tuple() - #################### Standard Logging Set by :class:`environment.Environment` #################### + #################### Standard Logging #################### + # Set by :class:`environment.Environment` @staticmethod def log(content, *args, **kwargs): """Set in :meth:`environment.Environment.initialize_reporting` to the updated version of @@ -139,7 +144,8 @@ def warn(content, *args, **kwargs): :meth:`reporting.ReportingHandler.warn`""" warnings.warn(content, *args, **kwargs) - #################### Optimization Logging Set by :class:`protocol_core.BaseOptPro` #################### + #################### Optimization Logging #################### + # Set by :class:`protocol_core.BaseOptPro` log_ = print debug_ = print warn_ = warnings.warn diff --git a/hyperparameter_hunter/utils/file_utils.py b/hyperparameter_hunter/utils/file_utils.py index c407499c..3f881084 100644 --- a/hyperparameter_hunter/utils/file_utils.py +++ b/hyperparameter_hunter/utils/file_utils.py @@ -13,176 +13,15 @@ import os import os.path import pandas as pd +from pathlib import Path +from ruamel.yaml import YAML import simplejson as json -from typing import Union +from typing import Any, List, Tuple, Union import wrapt ################################################## -# JSON File Functions -################################################## -def default_json_write(obj): - """Convert values that are not JSON-friendly to a more acceptable type - - Parameters - ---------- - obj: Object - The object that is expected to be of a type that is incompatible with JSON files - - Returns - ------- - Object - The value of `obj` after being cast to a type accepted by JSON - - Raises - ------ - TypeError - If the type of `obj` is unhandled - - Examples - -------- - >>> assert default_json_write(np.array([1, 2, 3])) == [1, 2, 3] - >>> assert default_json_write(np.int8(32)) == 32 - >>> assert np.isclose(default_json_write(np.float16(3.14)), 3.14, atol=0.001) - >>> assert default_json_write(pd.Index(["a", "b", "c"])) == ["a", "b", "c"] - >>> assert default_json_write((1, 2)) == {"__tuple__": [1, 2]} - >>> default_json_write(object()) # doctest: +ELLIPSIS - Traceback (most recent call last): - File "file_utils.py", line ?, in default_json_write - TypeError: is not JSON serializable""" - #################### Builtin Types #################### - if isinstance(obj, tuple): - return {"__tuple__": list(obj)} - #################### NumPy Types #################### - if isinstance(obj, np.ndarray): - return obj.tolist() - if isinstance(obj, np.integer): - return int(obj) - if isinstance(obj, np.floating): - return float(obj) - #################### Pandas Types #################### - if isinstance(obj, pd.Index): - return list(obj) - - raise TypeError(f"{obj!r} is not JSON serializable") - - -def hook_json_read(obj): - """Hook function to decode JSON objects during reading - - Parameters - ---------- - obj: Object - JSON object to process, or return unchanged - - Returns - ------- - Object - If `obj` contains the key "__tuple__", its value is cast to a tuple and returned. 
Else, - `obj` is returned unchanged - - Examples - -------- - >>> assert hook_json_read({"__tuple__": [1, 2]}) == (1, 2) - >>> assert hook_json_read({"__tuple__": (1, 2)}) == (1, 2) - >>> assert hook_json_read({"a": "foo", "b": 42}) == {"a": "foo", "b": 42} - """ - if "__tuple__" in obj: - return tuple(obj["__tuple__"]) - return obj - - -def write_json(file_path, data, do_clear=False): - """Write `data` to the JSON file specified by `file_path`, optionally clearing the file before - adding `data` - - Parameters - ---------- - file_path: String - The target .json file path to which `data` will be written - data: Object - The content to save at the .json file given by `file_path` - do_clear: Boolean, default=False - If True, the contents of the file at `file_path` will be cleared before saving `data`""" - if do_clear is True: - clear_file(file_path) - - with open(file_path, "w") as f: - json.dump(data, f, default=default_json_write, tuple_as_array=False) - - -def read_json(file_path, np_arr=False): - """Get the contents of the .json file located at `file_path` - - Parameters - ---------- - file_path: String - The path of the .json file to be read - np_arr: Boolean, default=False - If True, the contents read from `file_path` will be cast to a numpy array before returning - - Returns - ------- - content: Object - The contents of the .json file located at `file_path`""" - content = json.loads(open(file_path).read(), object_hook=hook_json_read) - - if np_arr is True: - return np.array(content) - - return content - - -def add_to_json(file_path, data_to_add, key=None, condition=None, default=None, append_value=False): - """Append `data_to_add` to the contents of the .json file specified by `file_path` - - Parameters - ---------- - file_path: String - The target .json file path to which `data_to_add` will be added and saved - data_to_add: Object - The data to add to the contents of the .json file given by `file_path` - key: String, or None, default=None - If None, the original contents of the file at `file_path` should not be of type dict. If - string, the original content at `file_path` is expected to be a dict, and `data_to_add` will - be added to the original dict under the key `key`. Therefore, `key` is expected to be a - unique key to the original dict contents of `file_path`, unless `append_value` is True - condition: Callable, or None, default=None - If callable, will be given the original contents of the .json file at `file_path` as input, - and should return a boolean value. If `condition(original_data)` is truthy, `data_to_add` - will be added to the contents of the file at `file_path` as usual. Otherwise, `data_to_add` - will not be added to the file, and the contents at `file_path` will remain unchanged. If - `condition` is None, it will be treated as having been truthy, and will proceed to append - `data_to_add` to the target file - default: Object, or None, default=None - If the attempt to read the original content at `file_path` raises a `FileNotFoundError` and - `default` is not None, `default` will be used as the original data for the file. 
Otherwise, - the error will be raised - append_value: Boolean, default=False - If True and the original data at `file_path` is a dict, then `data_to_add` will be appended - as a list to the value of the original data at key `key`""" - try: - original_data = read_json(file_path) - except FileNotFoundError: - if default is not None: - original_data = default - else: - raise - - if condition is None or original_data is None or condition(original_data): - if key is None and isinstance(original_data, list): - original_data.append(data_to_add) - elif isinstance(key, str) and isinstance(original_data, dict): - if append_value is True: - original_data[key] = original_data[key] + [data_to_add] - else: - original_data[key] = data_to_add - - write_json(file_path, original_data) - - -################################################## -# General File Functions +# General File Utilities/Decorators ################################################## def make_dirs(name, mode=0o0777, exist_ok=False): """Permissive version of `os.makedirs` that gives full permissions by default @@ -216,7 +55,8 @@ def clear_file(file_path): class RetryMakeDirs(object): def __init__(self): """Execute decorated callable, but if `OSError` is raised, call :func:`make_dirs` on the - directory specified by the exception, then recall the decorated callable again + directory specified by the exception, then recall the decorated callable again. This also + works with operations on files, in which case the file's parent directories are created Examples -------- @@ -240,6 +80,8 @@ def __call__(self, wrapped, instance, args, kwargs): try: return wrapped(*args, **kwargs) except OSError as _ex: + # TODO: Add ability to check `kwargs` for value dictating whether to call `make_dirs` + # - Provide name or index (if arg) of value to check in `RetryMakeDirs.__init__` if _ex.filename: make_dirs(os.path.split(_ex.filename)[0], exist_ok=True) return wrapped(*args, **kwargs) @@ -333,6 +175,240 @@ def __call__(self, wrapped, instance, args, kwargs): return wrapped(*args, **kwargs) +################################################## +# JSON File Utilities +################################################## +def default_json_write(obj): + """Convert values that are not JSON-friendly to a more acceptable type + + Parameters + ---------- + obj: Object + Object that is expected to be of a type that is incompatible with JSON files + + Returns + ------- + Object + Value of `obj` after being cast to a type accepted by JSON + + Raises + ------ + TypeError + If the type of `obj` is unhandled + + Examples + -------- + >>> assert default_json_write(np.array([1, 2, 3])) == [1, 2, 3] + >>> assert default_json_write(np.int8(32)) == 32 + >>> assert np.isclose(default_json_write(np.float16(3.14)), 3.14, atol=0.001) + >>> assert default_json_write(pd.Index(["a", "b", "c"])) == ["a", "b", "c"] + >>> assert default_json_write((1, 2)) == {"__tuple__": [1, 2]} + >>> default_json_write(object()) # doctest: +ELLIPSIS + Traceback (most recent call last): + File "file_utils.py", line ?, in default_json_write + TypeError: is not JSON serializable""" + #################### Builtin Types #################### + if isinstance(obj, tuple): + return {"__tuple__": list(obj)} + #################### NumPy Types #################### + if isinstance(obj, np.ndarray): + return obj.tolist() + if isinstance(obj, np.integer): + return int(obj) + if isinstance(obj, np.floating): + return float(obj) + #################### Pandas Types #################### + if isinstance(obj, pd.Index): + 
return list(obj) + + raise TypeError(f"{obj!r} is not JSON serializable") + + +def hook_json_read(obj): + """Hook function to decode JSON objects during reading + + Parameters + ---------- + obj: Object + JSON object to process, or return unchanged + + Returns + ------- + Object + If `obj` contains the key "__tuple__", its value is cast to a tuple and returned. Else, + `obj` is returned unchanged + + Examples + -------- + >>> assert hook_json_read({"__tuple__": [1, 2]}) == (1, 2) + >>> assert hook_json_read({"__tuple__": (1, 2)}) == (1, 2) + >>> assert hook_json_read({"a": "foo", "b": 42}) == {"a": "foo", "b": 42} + """ + if "__tuple__" in obj: + return tuple(obj["__tuple__"]) + return obj + + +def read_json(file_path: str) -> object: + """Get the contents of the .json file located at `file_path` + + Parameters + ---------- + file_path: String + Path to the .json file to be read + + Returns + ------- + content: Object + The contents of the .json file located at `file_path`""" + content = json.loads(open(file_path).read(), object_hook=hook_json_read) + return content + + +@RetryMakeDirs() +def write_json(file_path: str, data: Any): + """Write `data` to the JSON file specified by `file_path` + + Parameters + ---------- + file_path: String + Target .json file path to which `data` will be written + data: Object + Content to save in the .json file given by `file_path`""" + with open(file_path, "w") as f: + json.dump(data, f, default=default_json_write, tuple_as_array=False) + + +def add_to_json(file_path, data_to_add, key=None, condition=None, default=None, append_value=False): + """Append `data_to_add` to the contents of the .json file specified by `file_path` + + Parameters + ---------- + file_path: String + The target .json file path to which `data_to_add` will be added and saved + data_to_add: Object + The data to add to the contents of the .json file given by `file_path` + key: String, or None, default=None + If None, the original contents of the file at `file_path` should not be of type dict. If + string, the original content at `file_path` is expected to be a dict, and `data_to_add` will + be added to the original dict under the key `key`. Therefore, `key` is expected to be a + unique key to the original dict contents of `file_path`, unless `append_value` is True + condition: Callable, or None, default=None + If callable, will be given the original contents of the .json file at `file_path` as input, + and should return a boolean value. If `condition(original_data)` is truthy, `data_to_add` + will be added to the contents of the file at `file_path` as usual. Otherwise, `data_to_add` + will not be added to the file, and the contents at `file_path` will remain unchanged. If + `condition` is None, it will be treated as having been truthy, and will proceed to append + `data_to_add` to the target file + default: Object, or None, default=None + If the attempt to read the original content at `file_path` raises a `FileNotFoundError` and + `default` is not None, `default` will be used as the original data for the file. 
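#################### Round-trip sketch for the JSON helpers above ####################
# Hedged example (temporary path is illustrative): tuples survive a write/read cycle because
# `default_json_write` wraps them as {"__tuple__": [...]} and `hook_json_read` unwraps them on load.
# Since `write_json` is decorated with `@RetryMakeDirs()`, missing parent directories are created
from hyperparameter_hunter.utils.file_utils import read_json, write_json

params = {"kernel_size": (3, 3), "n_estimators": 100}
write_json("/tmp/hh_example/description.json", params)
assert read_json("/tmp/hh_example/description.json") == {"kernel_size": (3, 3), "n_estimators": 100}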
Otherwise, + the error will be raised + append_value: Boolean, default=False + If True and the original data at `file_path` is a dict, then `data_to_add` will be appended + as a list to the value of the original data at key `key`""" + try: + original_data = read_json(file_path) + except FileNotFoundError: + if default is not None: + original_data = default + else: + raise + + if condition is None or original_data is None or condition(original_data): + if key is None and isinstance(original_data, list): + original_data.append(data_to_add) + elif isinstance(key, str) and isinstance(original_data, dict): + if append_value is True: + original_data[key] = original_data[key] + [data_to_add] + else: + original_data[key] = data_to_add + + write_json(file_path, original_data) + + +################################################## +# YAML File Utilities +################################################## +# Extra Representers used in the default HH Ruamel YAML instance +_RUAMEL_REPRESENTERS: List[Tuple[type, callable]] = [ + (np.ndarray, lambda dumper, data: dumper.represent_list(data.tolist())), + (np.float64, lambda dumper, data: dumper.represent_float(float(data))), + (np.int64, lambda dumper, data: dumper.represent_int(int(data))), + (tuple, lambda dumper, data: dumper.represent_sequence("tag:yaml.org,2002:python/tuple", data)), + (str, lambda dumper, data: dumper.represent_scalar("tag:yaml.org,2002:str", data, style='"')), +] +# Extra Constructors used in the default HH Ruamel YAML instance +_RUAMEL_CONSTRUCTORS: List[Tuple[str, callable]] = [ + ("tag:yaml.org,2002:python/tuple", lambda loader, node: tuple(loader.construct_sequence(node))) +] + + +def get_ruamel_instance() -> YAML: + """Get the default :class:`ruamel.yaml.YAML` instance used for dumping/loading YAML files + + Returns + ------- + yml: YAML + :class:`ruamel.yaml.YAML` instance configured for HyperparameterHunter, outfitted with + additional Ruamel Representers to properly format non-standard data types""" + #################### Prepare Ruamel YAML Instance #################### + yml = YAML(typ="safe") + yml.default_flow_style = None + yml.sort_base_mapping_type_on_output = False # False retains original mapping order + yml.top_level_colon_align = True # Make it easier to see top-level elements + yml.width = 100 + + #################### Add Auxiliary Ruamel Representers/Constructors #################### + for (data_type, representer) in _RUAMEL_REPRESENTERS: + yml.representer.add_representer(data_type, representer) + + for (tag, constructor) in _RUAMEL_CONSTRUCTORS: + yml.constructor.add_constructor(tag, constructor) + + return yml + + +def read_yaml(file_path: Union[str, Path], yml: YAML = None) -> object: + """Get the contents of the .yaml file located at `file_path` + + Parameters + ---------- + file_path: String, or Path + Path to the .yaml file to be read + yml: YAML (optional) + :class:`ruamel.yaml.YAML` instance used to load data from `file_path`. 
If not given, the + result of :func:`get_ruamel_instance` is used + + Returns + ------- + Object + Contents of the .yaml file located at `file_path`""" + file_path = Path(file_path) + yml = get_ruamel_instance() if yml is None else yml + return yml.load(file_path) + + +@RetryMakeDirs() +def write_yaml(file_path: Union[str, Path], data: Any, yml: YAML = None): + """Write `data` to the YAML file specified by `file_path` + + Parameters + ---------- + file_path: String, or Path + Target .yaml file path to which `data` will be written + data: Object + Content to save in the .yaml file given by `file_path` + yml: YAML (optional) + :class:`ruamel.yaml.YAML` instance used to dump `data` to `file_path`. If not given, the + result of :func:`get_ruamel_instance` is used""" + file_path = Path(file_path) + yml = get_ruamel_instance() if yml is None else yml + + with open(file_path, "w+") as f: + yml.dump(data, f) + + ################################################## # Display Utilities ################################################## diff --git a/hyperparameter_hunter/utils/optimization_utils.py b/hyperparameter_hunter/utils/optimization_utils.py index deb9e2ba..4050c4c5 100644 --- a/hyperparameter_hunter/utils/optimization_utils.py +++ b/hyperparameter_hunter/utils/optimization_utils.py @@ -10,11 +10,10 @@ ################################################## # Import Own Assets ################################################## -from hyperparameter_hunter.i_o.exceptions import ContinueRemap from hyperparameter_hunter.keys.hashing import make_hash_sha256 from hyperparameter_hunter.space.dimensions import Real, Integer, Categorical, RejectedOptional from hyperparameter_hunter.utils.boltons_utils import get_path, remap -from hyperparameter_hunter.utils.file_utils import read_json +from hyperparameter_hunter.utils.file_utils import read_json, read_yaml from hyperparameter_hunter.utils.general_utils import extra_enter_attrs try: @@ -27,6 +26,7 @@ ################################################## from contextlib import suppress import pandas as pd +from pathlib import Path ################################################## @@ -89,13 +89,42 @@ def get_ids_by( return matching_ids -def get_scored_params(experiment_description_path, target_metric, get_description=False): +def find_experiment_description(description_dir: str, experiment_id: str) -> dict: + """Locate and return the Description file contents for `experiment_id`. 
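#################### Round-trip sketch for the YAML helpers above ####################
# Hedged example (temporary path is illustrative): the extra Ruamel representers/constructors keep
# tuples intact, while NumPy arrays and scalars are written as plain lists and numbers. Like
# `write_json`, `write_yaml` is wrapped in `@RetryMakeDirs()` for missing parent directories
import numpy as np

from hyperparameter_hunter.utils.file_utils import read_yaml, write_yaml

data = {"pool_size": (2, 2), "learning_rate": np.float64(0.1), "hidden_sizes": np.array([64, 32])}
write_yaml("/tmp/hh_example/description.yaml", data)
loaded = read_yaml("/tmp/hh_example/description.yaml")
assert loaded["pool_size"] == (2, 2)
assert loaded["hidden_sizes"] == [64, 32]  # NumPy array comes back as a plain list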
Assumes the Description + file extension to be in {".yaml", ".yml", ".json"}, and checks for files in that order, + returning the first one found + + Parameters + ---------- + description_dir: String + Path to a directory containing the Description files of saved Experiments + experiment_id: String + ID of the saved Experiment whose Description should be returned + + Returns + ------- + Dict + Experiment Description file contents""" + description_path = Path(description_dir) / experiment_id # Extension unknown right now + + for (extension, reader) in [(".yaml", read_yaml), (".yml", read_yaml), (".json", read_json)]: + try: + return reader(description_path.with_suffix(extension)) + except FileNotFoundError: + continue + + raise ValueError(f"Expected YAML/JSON `description_path`, not {description_path}") + + +def get_scored_params(description_dir, experiment_id, target_metric, get_description=False): """Retrieve the hyperparameters of a completed Experiment, along with its performance evaluation Parameters ---------- - experiment_description_path: String - The path to an Experiment's description .json file + description_dir: String + Path to a directory containing the Description files of saved Experiments + experiment_id: String + ID of the saved Experiment whose Description should be returned target_metric: Tuple A path denoting the metric to be used. If tuple, the first value should be one of ['oof', 'holdout', 'in_fold'], and the second value should be the name of a metric supplied in @@ -111,7 +140,7 @@ def get_scored_params(experiment_description_path, target_metric, get_descriptio A dict of the hyperparameters used by the Experiment evaluation: Float Value of the Experiment's `target_metric`""" - description = read_json(file_path=experiment_description_path) + description = find_experiment_description(description_dir, experiment_id) evaluation = get_path(description["final_evaluations"], target_metric) all_hyperparameters = description["hyperparameters"] @@ -165,59 +194,6 @@ def does_fit_in_space(root, space): return dimension_subset(root, space.names()) in space -def visit_feature_engineer(path, key, value): - """Helper to be used within a `visit` function intended for a `remap`-like function - - Parameters - ---------- - path: Tuple - The path of keys that leads to `key` - key: String - The parameter name - value: Object - The value of the parameter `key` - - Returns - ------- - False if the value represents a dataset, or tuple of (`key`, ). If neither of - these are returned, a `ContinueRemap` exception is raised - - Raises - ------ - ContinueRemap - If a value is not returned by `visit_function_engineer`. For proper functioning, this raised - `ContinueRemap` is assumed to be handled by the calling `visit` function. 
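#################### Usage sketch for the updated `get_scored_params` ####################
# Hedged example (directory, ID, and metric are placeholders): the Description file is now located
# by Experiment ID, with its extension (.yaml/.yml/.json) resolved via `find_experiment_description`
from hyperparameter_hunter.utils.optimization_utils import get_scored_params

params, score = get_scored_params(
    "HyperparameterHunterAssets/Experiments/Descriptions",  # Directory containing Description files
    "my_experiment_id",  # Saved Experiment ID; the file extension no longer matters
    ("oof", "roc_auc_score"),  # `target_metric` path into the Description's "final_evaluations"
)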
Usually, the - `except` block for `ContinueRemap` will simply continue execution of `visit` - - Examples - -------- - >>> visit_feature_engineer(("feature_engineer",), "datasets", dict()) - False - >>> visit_feature_engineer(("feature_engineer", "steps"), "f", lambda _: _) # pytest: +ELLIPSIS - ('f', '...') - >>> visit_feature_engineer(("feature_engineer", "steps"), "foo", lambda _: _) - Traceback (most recent call last): - File "optimization_utils.py", line ?, in visit_feature_engineer - hyperparameter_hunter.i_o.exceptions.ContinueRemap: Just keep doing what you were doing - >>> visit_feature_engineer(("feature_engineer",), "foo", dict()) - Traceback (most recent call last): - File "optimization_utils.py", line ?, in visit_feature_engineer - hyperparameter_hunter.i_o.exceptions.ContinueRemap: Just keep doing what you were doing - >>> visit_feature_engineer(("foo",), "bar", dict()) - Traceback (most recent call last): - File "optimization_utils.py", line ?, in visit_feature_engineer - hyperparameter_hunter.i_o.exceptions.ContinueRemap: Just keep doing what you were doing""" - if path and path[0] == "feature_engineer": - # Drop dataset hashes - if key in ("datasets", "original_hashes", "updated_hashes") and isinstance(value, dict): - return False - # Ensure `EngineerStep.f` is hashed - with suppress(IndexError): - if path[1] == "steps" and key == "f" and callable(value): - return key, make_hash_sha256(value) - raise ContinueRemap - - def get_choice_dimensions(params, iter_attrs=None): """List all elements in the nested structure `params` that are hyperparameter space choices diff --git a/setup.py b/setup.py index 7b2a26ad..fc7f6e1d 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ def readme(): "nbformat", "numpy", "pandas", + "ruamel.yaml", "scikit-learn", "scikit-optimize", "scipy", diff --git a/tests/smoke_tests/test_general.py b/tests/smoke_tests/test_general.py index fca2c6c5..c221cba3 100644 --- a/tests/smoke_tests/test_general.py +++ b/tests/smoke_tests/test_general.py @@ -4,10 +4,7 @@ from hyperparameter_hunter import Environment, CVExperiment, Real, Integer, Categorical from hyperparameter_hunter import BayesianOptPro, ExtraTreesOptPro, lambda_callback from hyperparameter_hunter.callbacks.recipes import confusion_matrix_oof, confusion_matrix_holdout -from hyperparameter_hunter.i_o.recorders import ( - YAMLDescriptionRecorder, - UnsortedIDLeaderboardRecorder, -) +from hyperparameter_hunter.i_o.recorders import UnsortedIDLeaderboardRecorder from hyperparameter_hunter.i_o.result_reader import has_experiment_result_file from hyperparameter_hunter.utils.learning_utils import ( get_toy_classification_data, @@ -135,14 +132,7 @@ def env_4(): @pytest.fixture( scope="function", autouse=False, - params=[ - [], - [(UnsortedIDLeaderboardRecorder, "Leaderboards/UnsortedIDLeaderboard.csv")], - [ - (UnsortedIDLeaderboardRecorder, "Leaderboards/UnsortedIDLeaderboard.csv"), - (YAMLDescriptionRecorder, "Experiments/YAMLDescriptions"), - ], - ], + params=[[], [(UnsortedIDLeaderboardRecorder, "Leaderboards/UnsortedIDLeaderboard.csv")]], ) def env_5(request): return Environment( diff --git a/tests/test_space/test_space.py b/tests/test_space/test_space.py index 75667391..a219c028 100644 --- a/tests/test_space/test_space.py +++ b/tests/test_space/test_space.py @@ -2,16 +2,43 @@ # Import Own Assets ################################################## from hyperparameter_hunter import Real, Categorical, Integer -from hyperparameter_hunter.feature_engineering import EngineerStep +from 
hyperparameter_hunter import Environment, CVExperiment, BayesianOptPro, EngineerStep from hyperparameter_hunter.space.dimensions import RejectedOptional from hyperparameter_hunter.space.space_core import Space +from hyperparameter_hunter.utils.learning_utils import get_iris_data ################################################## # Import Miscellaneous Assets ################################################## +from os import makedirs import pytest +from shutil import rmtree from sys import maxsize +################################################## +# Import Learning Assets +################################################## +from sklearn.ensemble import RandomForestClassifier + +################################################## +# Global Settings +################################################## +assets_dir = "hyperparameter_hunter/__TEST__HyperparameterHunterAssets__" +# assets_dir = "hyperparameter_hunter/HyperparameterHunterAssets" + + +@pytest.fixture(scope="function", autouse=False) +def hh_assets(): + """Construct a temporary HyperparameterHunterAssets directory that exists only for the duration + of the tests contained in each function, before it and its contents are deleted""" + temp_assets_path = assets_dir + try: + makedirs(temp_assets_path) + except FileExistsError: + rmtree(temp_assets_path) + makedirs(temp_assets_path) + yield + ################################################## # `Space.rvs` with `Categorical` Strings @@ -295,3 +322,68 @@ def test_get_by_name_use_location(space, name, expected): ################################################## def test_rejected_optional_repr(): assert "{!r}".format(RejectedOptional()) == "RejectedOptional()" + + +################################################## +# Nested Dimension Optimization Matching Tests +################################################## +# Regression tests to ensure proper Experiment result matching when optimizing `Dimension` s nested +# inside other structures. See https://github.com/HunterMcGushion/hyperparameter_hunter/issues/183 + + +@pytest.fixture() +def env_iris(): + env = Environment( + train_dataset=get_iris_data(), + results_path=assets_dir, + target_column="species", + metrics=["hamming_loss"], + cv_params=dict(n_splits=5, shuffle=True, random_state=32), + ) + return env + + +def get_nested_dict_rfc_opt_pro() -> BayesianOptPro: + """Get a :class:`BayesianOptPro` instance, forged with Dimensions in a nested `class_weight` + dict under `model_init_params`--for `RandomForestClassifier`""" + opt = BayesianOptPro(iterations=2, random_state=32, n_initial_points=1) + opt.forge_experiment( + model_initializer=RandomForestClassifier, + model_init_params=dict( + n_estimators=Integer(5, 100), + # Below `class_weight` is object under test + class_weight={0: Categorical([1, 3]), 1: Categorical([1, 4]), 2: Integer(1, 9)}, + ), + ) + return opt + + +def test_nested_dict_matching_exp(env_iris): + """Test that individual values in a `class_weight` dict can be optimized and matched with + compatible saved Experiment results. 
See HH issue #183 (linked above) for details"""
+    # Experiment, whose saved results should be matched by `opt`
+    exp = CVExperiment(
+        RandomForestClassifier, dict(n_estimators=10, class_weight={0: 1, 1: 1, 2: 1})
+    )
+
+    # OptPro, whose Dimensions should match with results of `exp`
+    opt = get_nested_dict_rfc_opt_pro()
+    opt.go()
+
+    # Check that `opt` matched with `exp`
+    assert exp.experiment_id in [_[2] for _ in opt.similar_experiments]
+
+
+def test_nested_dict_matching_opt(env_iris):
+    """Test that individual values in a `class_weight` dict can be optimized and matched with
+    compatible saved OptPro results. See HH issue #183 (linked above) for details"""
+    # First OptPro, whose Dimensions should match with below `opt_1`
+    opt_0 = get_nested_dict_rfc_opt_pro()
+    opt_0.go()
+
+    # Second OptPro, identical to `opt_0`, whose Dimensions should match with results of `opt_0`
+    opt_1 = get_nested_dict_rfc_opt_pro()
+    opt_1.go()
+
+    # Assert `opt_1` matched with all Experiments executed by `opt_0`
+    assert len(opt_1.similar_experiments) == opt_0.successful_iterations