Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add conda env and $TMPDIR checks to Dataset class #155

Merged
merged 2 commits into from
Feb 13, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 54 additions & 4 deletions global_scripts/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,50 @@ def init_retries(self, retries: int, retry_delay: int, save_settings: bool=False
return retries, retry_delay


def _check_env_and_run(self, correct_env: str):
    """
    Warn about a suspicious runtime environment, then run self.main()

    Two advisory checks are made. Each one only logs a warning through the
    logger returned by self.get_logger(); neither is meant to stop the run:

    - the active conda environment ($CONDA_DEFAULT_ENV) should equal correct_env
    - $TMPDIR should be set, should resolve to an existing path, and should
      not be /local or anything inside /local

    Parameters:
        correct_env: name of the conda environment the dataset expects
    """
    logger = self.get_logger()

    try:
        # CONDA_DEFAULT_ENV should just be the name of the current env
        current_env = os.environ["CONDA_DEFAULT_ENV"]
    except KeyError:
        # KeyError if there is no such environment variable
        logger.warning("No conda environment detected! Have you loaded the anaconda module and activated an environment?")
    except Exception:
        # don't kill the program if something else goes wrong, but let
        # KeyboardInterrupt / SystemExit propagate (a bare except would not)
        logger.warning("Unable to detect current conda environment")
    else:
        # test if the current env is the one we wanted
        if current_env != correct_env:
            logger.warning(f"Your conda environment is {current_env} instead of the expected {correct_env}")

    try:
        # $TMPDIR is the default temporary directory that deployments use to store and execute code
        # is $TMPDIR set, and can we resolve it (find it on the filesystem)?
        tmp_dir = Path(os.environ["TMPDIR"]).resolve(strict=True)
    except KeyError:
        # KeyError if there is no such environment variable
        logger.warning("No $TMPDIR environment variable found!")
    except FileNotFoundError:
        # when we tried to resolve the path, the folder wasn't found on filesystem
        logger.warning("$TMPDIR path not found!")
    except (OSError, RuntimeError):
        # any other resolution failure (a path component is a regular file,
        # permission denied, symlink loop, ...) is still only worth a warning
        logger.warning("Unable to resolve $TMPDIR path!")
    else:
        # /local points to local storage on W&M HPC
        slash_local = Path("/local").resolve()
        # tmp_dir is already fully resolved, so its parents can be compared
        # directly; also flag the case where $TMPDIR is /local itself
        if tmp_dir == slash_local or slash_local in tmp_dir.parents:
            logger.warning("$TMPDIR in /local, deployments won't be accessible to compute nodes.")

    # run the dataset (self.main() should be defined in child class instance)
    self.main()


def run(
self,
backend: Optional[str]=None,
Expand All @@ -359,13 +403,19 @@ def run(
logger_level=logging.INFO,
retries: int=3,
retry_delay: int=5,
conda_env: str = "geodata38",
**kwargs):
"""
Run a dataset
Calls self.main() with a backend e.g. "prefect"
Initializes class variables and chosen backend
This is how Datasets should usually be run
Eventually calls _check_env_and_run(), starting dataset (see below)
"""

# no matter what happens, this is our ticket to run the actual dataset
# every backend calls this after initializing
launch = lambda: self._check_env_and_run(conda_env)

self.init_retries(retries, retry_delay, save_settings=True)

self.log_dir = Path(log_dir)
Expand Down Expand Up @@ -406,7 +456,7 @@ def run(

@flow(task_runner=tr, name=self.name)
def prefect_main_wrapper():
self.main()
launch()

prefect_main_wrapper()

Expand All @@ -425,14 +475,14 @@ def prefect_main_wrapper():
self.backend = "mpi"
self.mpi_max_workers = max_workers

self.main()
launch()

elif backend == "local" or backend is None:
if run_parallel:
self.backend = "concurrent"
else:
self.backend = "serial"
self.main()
launch()

else:
raise ValueError(f"Backend {backend} not recognized.")