From 4a94e93ec457efc90de0acd8a5146e0b68a0f9f3 Mon Sep 17 00:00:00 2001 From: Technici4n <13494793+Technici4n@users.noreply.github.com> Date: Tue, 17 Dec 2024 15:17:16 +0100 Subject: [PATCH] Increase robustness: version check, prevent precompilation with MPI, automatically precompile in the workchain --- pyproject.toml | 1 + src/aiida_dftk/calculations.py | 72 ++++++++++++++++++++++++---- src/aiida_dftk/parsers.py | 11 +++++ src/aiida_dftk/workflows/base.py | 46 ++++++++++++++++-- tests/conftest.py | 17 +++++-- tests/julia_environment/.gitignore | 3 +- tests/julia_environment/Project.toml | 5 -- 7 files changed, 133 insertions(+), 22 deletions(-) delete mode 100644 tests/julia_environment/Project.toml diff --git a/pyproject.toml b/pyproject.toml index 6575095..82c9d03 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,7 @@ aiida-dftk = 'aiida_dftk.cli:cmd_root' [project.entry-points.'aiida.calculations'] 'dftk' = 'aiida_dftk.calculations:DftkCalculation' +'dftk.precompile' = 'aiida_dftk.calculations:PrecompileCalculation' [project.entry-points.'aiida.parsers'] 'dftk' = 'aiida_dftk.parsers:DftkParser' diff --git a/src/aiida_dftk/calculations.py b/src/aiida_dftk/calculations.py index 5724415..677628a 100644 --- a/src/aiida_dftk/calculations.py +++ b/src/aiida_dftk/calculations.py @@ -4,15 +4,18 @@ import os import json import typing as ty -from pathlib import Path from aiida import orm from aiida.common import datastructures, exceptions -from aiida.engine import CalcJob +from aiida.engine import CalcJob, ExitCode from aiida_pseudo.data.pseudo import UpfData from pymatgen.core import units +_AIIDA_DFTK_MIN_VERSION = "0.1.9" # inclusive +_AIIDA_DFTK_MAX_VERSION = "0.2" # exclusive + + class DftkCalculation(CalcJob): """`CalcJob` implementation for DFTK.""" @@ -62,6 +65,8 @@ def define(cls, spec): spec.exit_code(501, 'ERROR_SCF_OUT_OF_WALLTIME',message='The SCF was interuptted due to out of walltime. Non-recovarable error.') spec.exit_code(502, 'ERROR_POSTSCF_OUT_OF_WALLTIME',message='The POSTSCF was interuptted due to out of walltime.') spec.exit_code(503, 'ERROR_BANDS_CONVERGENCE_NOT_REACHED', message='The BANDS minimization cycle did not converge.') + # Significant errors but calculation can be used to restart + spec.exit_code(400, 'ERROR_PACKAGE_IMPORT_FAILED', message="Failed to import AiiDA DFTK or write first log message. Typically indicates an environment issue.") # Outputs spec.output('output_parameters', valid_type=orm.Dict, help='output parameters') @@ -170,12 +175,6 @@ def _generate_inputdata( return data, local_copy_pseudo_list - def _generate_cmdline_params(self) -> ty.List[str]: - # Define the command based on the input settings - cmd_params = [] - cmd_params.extend(['-e', 'using AiidaDFTK; AiidaDFTK.run()', self.metadata.options.input_filename]) - return cmd_params - def _generate_retrieve_list(self, parameters: orm.Dict) -> list: """Generate the list of files to retrieve based on the type of calculation requested in the input parameters. @@ -188,6 +187,7 @@ def _generate_retrieve_list(self, parameters: orm.Dict) -> list: f"{item['$function']}.json" if item['$function'] == 'compute_bands' else f"{item['$function']}.hdf5" for item in parameters['postscf'] ] + retrieve_list.append('errors.log') retrieve_list.append('timings.json') retrieve_list.append(f'{self.SCFRES_SUMMARY_NAME}') return retrieve_list @@ -232,7 +232,12 @@ def prepare_for_submission(self, folder): remote_copy_list.append(checkpointfile_info) # prepare command line parameters - cmdline_params = self._generate_cmdline_params() + cmdline_params = [ + # Precompilation under MPI generally deadlocks. Make sure everything is already precompiled. + '--compiled-modules=strict', + '-e', f'using AiidaDFTK; AiidaDFTK.run(; min_version=v"{_AIIDA_DFTK_MIN_VERSION}", max_version=v"{_AIIDA_DFTK_MAX_VERSION}")', + self.metadata.options.input_filename + ] # prepare retrieve list retrieve_list = self._generate_retrieve_list(self.inputs.parameters) @@ -251,3 +256,52 @@ def prepare_for_submission(self, folder): calcinfo.local_copy_list = local_copy_list return calcinfo + +class PrecompileCalculation(CalcJob): + """Calcjob implementation to precompile AiidaDFTK.""" + + _SUCCESS_PRINT = "Import successful" + + @classmethod + def define(cls, spec): + """Define the process specification.""" + super().define(spec) + + options = spec.inputs['metadata']['options'] + options['resources'].default = {'num_machines': 1, 'num_mpiprocs_per_machine': 1} + + def prepare_for_submission(self, folder): + # Set up the `CodeInfo` to pass to `CalcInfo` + codeinfo = datastructures.CodeInfo() + codeinfo.code_uuid = self.inputs.code.uuid + codeinfo.cmdline_params = [ + '-e', f'using AiidaDFTK; println("{self._SUCCESS_PRINT}")' + ] + codeinfo.withmpi = False + + # Set up the `CalcInfo` so AiiDA knows what to do with everything + calcinfo = datastructures.CalcInfo() + calcinfo.codes_info = [codeinfo] + + return calcinfo + + # Easier to override the parse method than to write a parser. + def parse(self, *args, **kwargs): + exit_code = super().parse(*args, **kwargs) + if exit_code.status != 0: + return exit_code + + retrieved = self.node.outputs.retrieved + filename_stdout = self.node.get_option('scheduler_stdout') + + if filename_stdout is None: + self.report('could not determine `stdout` filename because `scheduler_stdout` option was not set.') + else: + try: + scheduler_stdout = retrieved.base.repository.get_object_content(filename_stdout, mode='r') + if self._SUCCESS_PRINT in scheduler_stdout: + return ExitCode(0) + except FileNotFoundError: + self.report(f'could not parse scheduler output: the `{filename_stdout}` file is missing') + + return self.exit_codes.ERROR_UNSPECIFIED diff --git a/src/aiida_dftk/parsers.py b/src/aiida_dftk/parsers.py index 335fef0..908cb86 100644 --- a/src/aiida_dftk/parsers.py +++ b/src/aiida_dftk/parsers.py @@ -40,6 +40,17 @@ def parse(self, **kwargs): else: return self.exit_codes.ERROR_POSTSCF_OUT_OF_WALLTIME + # Check error file + try: + errors_log = self.retrieved.base.repository.get_object_content("errors.log") + if "Imports succeeded" not in errors_log: + return self.exit_codes.ERROR_PACKAGE_IMPORT_FAILED + except FileNotFoundError: + return self.exit_codes.ERROR_PACKAGE_IMPORT_FAILED + + if "Finished successfully" not in errors_log: + return self.exit_codes.ERROR_UNSPECIFIED + # Check retrieve list to know which files the calculation is expected to have produced. try: self._parse_optional_result( diff --git a/src/aiida_dftk/workflows/base.py b/src/aiida_dftk/workflows/base.py index 8c591c3..5867379 100644 --- a/src/aiida_dftk/workflows/base.py +++ b/src/aiida_dftk/workflows/base.py @@ -2,12 +2,13 @@ """Base DFTK WorkChain implementation.""" from aiida import orm from aiida.common import AttributeDict, exceptions -from aiida.engine import BaseRestartWorkChain, ProcessHandlerReport, process_handler, while_ +from aiida.engine import BaseRestartWorkChain, ProcessHandlerReport, process_handler, while_, if_, ToContext from aiida.plugins import CalculationFactory from aiida_dftk.utils import create_kpoints_from_distance, validate_and_prepare_pseudos_inputs DftkCalculation = CalculationFactory('dftk') +PrecompileCalculation = CalculationFactory('dftk.precompile') class DftkBaseWorkChain(BaseRestartWorkChain): @@ -15,6 +16,8 @@ class DftkBaseWorkChain(BaseRestartWorkChain): _process_class = DftkCalculation + _attempted_precompilation_extra = "attempted_precompilation" + @classmethod def define(cls, spec): """Define the process specification.""" @@ -44,7 +47,12 @@ def define(cls, spec): while_(cls.should_run_process)( cls.prepare_process, cls.run_process, - cls.inspect_process, + if_(cls.should_attempt_precompilation)( + cls.run_precompilation, + cls.inspect_precompilation, + ).else_( + cls.inspect_process, + ) ), cls.results, ) @@ -59,6 +67,8 @@ def define(cls, spec): message='Neither the `options` nor `automatic_parallelization` input was specified.') spec.exit_code(204, 'ERROR_INVALID_INPUT_RESOURCES_UNDERSPECIFIED', message='The `metadata.options` did not specify both `resources.num_machines` and `max_wallclock_seconds`.') + spec.exit_code(300, 'ERROR_PRECOMPILATION_FAILURE', + message='Failed to precompile AiidaDFTK. Typically indicates an environment issue.') def setup(self): """Call the `setup` of the `BaseRestartWorkChain` and then create the inputs dictionary in `self.ctx.inputs`. @@ -146,7 +156,37 @@ def report_error_handled(self, calculation, action): :param action: a string message with the action taken """ self.report(f'{calculation.process_label}<{calculation.pk}> failed with exit status {calculation.exit_status}: {calculation.exit_message}') - self.report(f'Action taken: {action}') + self.report(f'action taken: {action}') + + def should_attempt_precompilation(self): + if self.node.base.extras.get(self._attempted_precompilation_extra, False): + return False + node = self.ctx.children[self.ctx.iteration - 1] + return node.exit_code == DftkCalculation.exit_codes.ERROR_PACKAGE_IMPORT_FAILED + + def run_precompilation(self): + self.node.base.extras.set(self._attempted_precompilation_extra, True) + calculation = self.ctx.children[self.ctx.iteration - 1] + self.report_error_handled(calculation, 'attempting to precompile') + + node = self.submit(PrecompileCalculation, inputs={ + "code": calculation.inputs.code, + }) + self.report(f'launching {node.process_label}<{node.pk}>') + + return ToContext(precompile_job=node) + + def inspect_precompilation(self): + calculation = self.ctx.precompile_job + + if calculation.exit_status != 0: + self.report(f'{calculation.process_label}<{calculation.pk}> failed with exit status {calculation.exit_status}: {calculation.exit_message}') + self.report(f'the issue can be diagnosed by running `verdi process report {calculation.pk}` and checking the logs.') + self.report('aborting workchain') + return self.exit_codes.ERROR_PRECOMPILATION_FAILURE + + self.report(f'{calculation.process_label}<{calculation.pk}> succeeded. Resuming workchain iterations.') + return None @process_handler(priority=500, exit_codes=[DftkCalculation.exit_codes.ERROR_SCF_CONVERGENCE_NOT_REACHED]) def handle_scf_convergence_not_reached(self, _): diff --git a/tests/conftest.py b/tests/conftest.py index ae526cf..82ac732 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,19 +1,30 @@ import logging -import os +from pathlib import Path import pytest +import aiida_dftk.calculations + pytest_plugins = 'aiida.manage.tests.pytest_fixtures' _LOGGER = logging.getLogger(__name__) -_julia_project_path = os.path.join(__file__, "..", "julia_environment") +_julia_project_path = Path(__file__).parent / "julia_environment" def pytest_sessionstart(): """Instantiates the test Julia environment before any test runs.""" import subprocess + with open(_julia_project_path / "Project.toml", "w") as f: + f.write(f""" +[deps] +AiidaDFTK = "26386dbc-b74b-4d9a-b75a-41d28ada84fc" + +[compat] +AiidaDFTK = "{aiida_dftk.calculations._AIIDA_DFTK_MIN_VERSION}" +""") + # Pkg.Registry.add() seems necessary for GitHub Actions - subprocess.run(['julia', f'--project={_julia_project_path}', '-e', 'using Pkg; Pkg.Registry.add(); Pkg.resolve(); Pkg.precompile();'], check=True) + subprocess.run(['julia', f'--project={_julia_project_path}', '-e', 'using Pkg; Pkg.Registry.add(); Pkg.resolve();'], check=True) @pytest.fixture diff --git a/tests/julia_environment/.gitignore b/tests/julia_environment/.gitignore index 36d8c6a..f59ec20 100644 --- a/tests/julia_environment/.gitignore +++ b/tests/julia_environment/.gitignore @@ -1,2 +1 @@ -LocalPreferences.toml -Manifest.toml \ No newline at end of file +* \ No newline at end of file diff --git a/tests/julia_environment/Project.toml b/tests/julia_environment/Project.toml deleted file mode 100644 index 629b17f..0000000 --- a/tests/julia_environment/Project.toml +++ /dev/null @@ -1,5 +0,0 @@ -[deps] -AiidaDFTK = "26386dbc-b74b-4d9a-b75a-41d28ada84fc" - -[compat] -AiidaDFTK = "0.1" \ No newline at end of file