diff --git a/MANIFEST.in b/MANIFEST.in index 845905c022..d64dfc817f 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,4 @@ include aiida/cmdline/templates/*.tpl -include aiida/manage/backup/backup_info.json.tmpl include aiida/manage/configuration/schema/*.json include setup.json include AUTHORS.txt diff --git a/aiida/backends/djsite/db/migrations/0014_add_node_uuid_unique_constraint.py b/aiida/backends/djsite/db/migrations/0014_add_node_uuid_unique_constraint.py index 8d125f2196..0261e5b12c 100644 --- a/aiida/backends/djsite/db/migrations/0014_add_node_uuid_unique_constraint.py +++ b/aiida/backends/djsite/db/migrations/0014_add_node_uuid_unique_constraint.py @@ -27,7 +27,7 @@ def verify_node_uuid_uniqueness(_, __): :raises: IntegrityError if database contains nodes with duplicate UUIDS. """ - from aiida.manage.database.integrity.duplicate_uuid import verify_uuid_uniqueness + from aiida.backends.general.migrations.utils import verify_uuid_uniqueness verify_uuid_uniqueness(table='db_dbnode') diff --git a/aiida/backends/djsite/db/migrations/0046_add_node_repository_metadata.py b/aiida/backends/djsite/db/migrations/0046_add_node_repository_metadata.py new file mode 100644 index 0000000000..82167f9436 --- /dev/null +++ b/aiida/backends/djsite/db/migrations/0046_add_node_repository_metadata.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +# pylint: disable=invalid-name,too-few-public-methods +"""Migration to add the `repository_metadata` JSONB column.""" + +# pylint: disable=no-name-in-module,import-error +import django.contrib.postgres.fields.jsonb +from django.db import migrations +from aiida.backends.djsite.db.migrations import upgrade_schema_version + +REVISION = '1.0.46' +DOWN_REVISION = '1.0.45' + + +class Migration(migrations.Migration): + """Migration to add the `repository_metadata` JSONB column.""" + + dependencies = [ + ('db', '0045_dbgroup_extras'), + ] + + operations = [ + migrations.AddField( + model_name='dbnode', + name='repository_metadata', + field=django.contrib.postgres.fields.jsonb.JSONField(null=True), + ), + upgrade_schema_version(REVISION, DOWN_REVISION), + ] diff --git a/aiida/backends/djsite/db/migrations/0047_migrate_repository.py b/aiida/backends/djsite/db/migrations/0047_migrate_repository.py new file mode 100644 index 0000000000..118a870ba0 --- /dev/null +++ b/aiida/backends/djsite/db/migrations/0047_migrate_repository.py @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. 
# +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +# pylint: disable=invalid-name,too-few-public-methods +"""Migrate the file repository to the new disk object store based implementation.""" +# pylint: disable=no-name-in-module,import-error +from django.core.exceptions import ObjectDoesNotExist +from django.db import migrations + +from aiida.backends.djsite.db.migrations import upgrade_schema_version +from aiida.backends.general.migrations import utils +from aiida.cmdline.utils import echo + +REVISION = '1.0.47' +DOWN_REVISION = '1.0.46' + +REPOSITORY_UUID_KEY = 'repository|uuid' + + +def migrate_repository(apps, schema_editor): + """Migrate the repository.""" + # pylint: disable=too-many-locals + import json + from tempfile import NamedTemporaryFile + from aiida.common.progress_reporter import set_progress_bar_tqdm, get_progress_reporter + from aiida.manage.configuration import get_profile + + DbNode = apps.get_model('db', 'DbNode') + + profile = get_profile() + node_count = DbNode.objects.count() + missing_node_uuids = [] + missing_repo_folder = [] + shard_count = 256 + + set_progress_bar_tqdm() + + with get_progress_reporter()(total=shard_count, desc='Migrating file repository') as progress: + for i in range(shard_count): + + shard = '%.2x' % i # noqa flynt + progress.set_description_str(f'Migrating file repository: shard {shard}') + + mapping_node_repository_metadata, missing_sub_repo_folder = utils.migrate_legacy_repository( + node_count, shard + ) + + if missing_sub_repo_folder: + missing_repo_folder.extend(missing_sub_repo_folder) + del missing_sub_repo_folder + + if mapping_node_repository_metadata is None: + continue + + for node_uuid, repository_metadata in mapping_node_repository_metadata.items(): + + # If `repository_metadata` is `{}` or `None`, we skip it, as we can leave the column default `null`. + if not repository_metadata: + continue + + try: + # This can happen if the node was deleted but the repo folder wasn't, or the repo folder just never + # corresponded to an actual node. In any case, we don't want to fail but just log the warning. + node = DbNode.objects.get(uuid=node_uuid) + except ObjectDoesNotExist: + missing_node_uuids.append((node_uuid, repository_metadata)) + else: + node.repository_metadata = repository_metadata + node.save() + + del mapping_node_repository_metadata + progress.update() + + # Store the UUID of the repository container in the `DbSetting` table. Note that for new databases, the profile + # setup will already have stored the UUID and so it should be skipped, or an exception for a duplicate key will be + # raised. This migration step is only necessary for existing databases that are migrated. 
+ container_id = profile.get_repository_container().container_id + with schema_editor.connection.cursor() as cursor: + cursor.execute( + f""" + INSERT INTO db_dbsetting (key, val, description, time) + VALUES ('repository|uuid', to_json('{container_id}'::text), 'Repository UUID', current_timestamp) + ON CONFLICT (key) DO NOTHING; + """ + ) + + if not profile.is_test_profile: + + if missing_node_uuids: + prefix = 'migration-repository-missing-nodes-' + with NamedTemporaryFile(prefix=prefix, suffix='.json', dir='.', mode='w+', delete=False) as handle: + json.dump(missing_node_uuids, handle) + echo.echo_warning( + '\nDetected node repository folders for nodes that do not exist in the database. The UUIDs of ' + f'those nodes have been written to a log file: {handle.name}' + ) + + if missing_repo_folder: + prefix = 'migration-repository-missing-subfolder-' + with NamedTemporaryFile(prefix=prefix, suffix='.json', dir='.', mode='w+', delete=False) as handle: + json.dump(missing_repo_folder, handle) + echo.echo_warning( + '\nDetected repository folders that were missing the required subfolder `path` or `raw_input`.' + f' The paths of those nodes have been written to a log file: {handle.name}' + ) + + # If there were no nodes, most likely a new profile, there is no need to print the warning + if node_count: + import pathlib + echo.echo_warning( + '\nMigrated file repository to the new disk object store. The old repository has not been deleted ' + f'out of safety and can be found at {pathlib.Path(profile.repository_path, "repository")}.' + ) + + +class Migration(migrations.Migration): + """Migrate the file repository to the new disk object store based implementation.""" + + dependencies = [ + ('db', '0046_add_node_repository_metadata'), + ] + + operations = [ + migrations.RunPython(migrate_repository, reverse_code=migrations.RunPython.noop), + upgrade_schema_version(REVISION, DOWN_REVISION), + ] diff --git a/aiida/backends/djsite/db/migrations/__init__.py b/aiida/backends/djsite/db/migrations/__init__.py index da2065cbaf..c9bf861176 100644 --- a/aiida/backends/djsite/db/migrations/__init__.py +++ b/aiida/backends/djsite/db/migrations/__init__.py @@ -21,7 +21,7 @@ class DeserializationException(AiidaException): pass -LATEST_MIGRATION = '0045_dbgroup_extras' +LATEST_MIGRATION = '0047_migrate_repository' def _update_schema_version(version, apps, _): diff --git a/aiida/backends/djsite/db/models.py b/aiida/backends/djsite/db/models.py index 3ccfc33c2a..30b7b142b7 100644 --- a/aiida/backends/djsite/db/models.py +++ b/aiida/backends/djsite/db/models.py @@ -127,6 +127,7 @@ class DbNode(m.Model): attributes = JSONField(default=dict, null=True) # JSON Extras extras = JSONField(default=dict, null=True) + repository_metadata = JSONField(null=True) objects = m.Manager() # Return aiida Node instances or their subclasses instead of DbNode instances diff --git a/aiida/backends/djsite/manager.py b/aiida/backends/djsite/manager.py index 81c12c2dfe..c3911dc10c 100644 --- a/aiida/backends/djsite/manager.py +++ b/aiida/backends/djsite/manager.py @@ -84,7 +84,7 @@ def get_schema_generation_database(self): from django.db.utils import ProgrammingError from aiida.manage.manager import get_manager - backend = get_manager()._load_backend(schema_check=False) # pylint: disable=protected-access + backend = get_manager()._load_backend(schema_check=False, repository_check=False) # pylint: disable=protected-access try: result = backend.execute_raw(r"""SELECT tval FROM db_dbsetting WHERE key =
'schema_generation';""") @@ -104,7 +104,7 @@ def get_schema_version_database(self): from django.db.utils import ProgrammingError from aiida.manage.manager import get_manager - backend = get_manager()._load_backend(schema_check=False) # pylint: disable=protected-access + backend = get_manager()._load_backend(schema_check=False, repository_check=False) # pylint: disable=protected-access try: result = backend.execute_raw(r"""SELECT tval FROM db_dbsetting WHERE key = 'db|schemaversion';""") @@ -129,7 +129,7 @@ def _migrate_database_generation(self): from aiida.manage.manager import get_manager super()._migrate_database_generation() - backend = get_manager()._load_backend(schema_check=False) # pylint: disable=protected-access + backend = get_manager()._load_backend(schema_check=False, repository_check=False) # pylint: disable=protected-access backend.execute_raw(r"""DELETE FROM django_migrations WHERE app = 'db';""") backend.execute_raw( r"""INSERT INTO django_migrations (app, name, applied) VALUES ('db', '0001_initial', NOW());""" diff --git a/aiida/backends/general/migrations/utils.py b/aiida/backends/general/migrations/utils.py index fd1e8c69dc..1a10788c3e 100644 --- a/aiida/backends/general/migrations/utils.py +++ b/aiida/backends/general/migrations/utils.py @@ -9,17 +9,288 @@ ########################################################################### # pylint: disable=invalid-name """Various utils that should be used during migrations and migrations tests because the AiiDA ORM cannot be used.""" - import datetime -import errno +import functools +import io import os +import pathlib import re +import typing import numpy +from disk_objectstore import Container +from disk_objectstore.utils import LazyOpener -from aiida.common import json +from aiida.common import exceptions, json +from aiida.repository.backend import AbstractRepositoryBackend +from aiida.repository.common import File, FileType +from aiida.repository.repository import Repository ISOFORMAT_DATETIME_REGEX = re.compile(r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+(\+\d{2}:\d{2})?$') +REGEX_SHARD_SUB_LEVEL = re.compile(r'^[0-9a-f]{2}$') +REGEX_SHARD_FINAL_LEVEL = re.compile(r'^[0-9a-f-]{32}$') + + +class LazyFile(File): + """Subclass of `File` where `key` also allows `LazyOpener` in addition to a string. + + This subclass is necessary because the migration will be storing instances of `LazyOpener` as the `key` which should + normally only be a string. This subclass updates the `key` type check to allow this. 
+ """ + + def __init__( + self, + name: str = '', + file_type: FileType = FileType.DIRECTORY, + key: typing.Union[str, None, LazyOpener] = None, + objects: typing.Dict[str, 'File'] = None + ): + # pylint: disable=super-init-not-called + if not isinstance(name, str): + raise TypeError('name should be a string.') + + if not isinstance(file_type, FileType): + raise TypeError('file_type should be an instance of `FileType`.') + + if key is not None and not isinstance(key, (str, LazyOpener)): + raise TypeError('key should be `None`, a string or a `LazyOpener`.') + + if objects is not None and any([not isinstance(obj, self.__class__) for obj in objects.values()]): + raise TypeError('objects should be `None` or a dictionary of `File` instances.') + + if file_type == FileType.DIRECTORY and key is not None: + raise ValueError('an object of type `FileType.DIRECTORY` cannot define a key.') + + if file_type == FileType.FILE and objects is not None: + raise ValueError('an object of type `FileType.FILE` cannot define any objects.') + + self._name = name + self._file_type = file_type + self._key = key + self._objects = objects or {} + + +class MigrationRepository(Repository): + """Subclass of `Repository` that uses `LazyFile` instead of `File` as its file class.""" + + _file_cls = LazyFile + + +class NoopRepositoryBackend(AbstractRepositoryBackend): + """Implementation of the ``AbstractRepositoryBackend`` where all write operations are no-ops. + + This repository backend is used with the ``Repository`` interface to build repository metadata, but instead of + actually writing the content of the current repository to disk elsewhere, it will simply open a lazy file opener. + In a subsequent step, all these streams are passed to the new Disk Object Store that will write their content + directly to pack files for optimal efficiency. + """ + + def put_object_from_filelike(self, handle: io.BufferedIOBase) -> str: + """Store the byte contents of a file in the repository. + + :param handle: filelike object with the byte content to be stored. + :return: the generated fully qualified identifier for the object within the repository. + :raises TypeError: if the handle is not a byte stream. + """ + return LazyOpener(handle.name) + + def has_object(self, key: str) -> bool: + """Return whether the repository has an object with the given key. + + :param key: fully qualified identifier for the object within the repository. + :return: True if the object exists, False otherwise. + """ + raise NotImplementedError() + + +def migrate_legacy_repository(node_count, shard=None): + """Migrate the legacy file repository to the new disk object store and return mapping of repository metadata. + + The format of the return value will be a dictionary where the keys are the UUIDs of the nodes whose repository + folder contents have been migrated to the disk object store. The values are the repository metadata that contain + the keys for the generated files with which the files in the disk object store can be retrieved. The format of the + repository metadata follows exactly what is normally generated by the ORM. + + This implementation consciously uses the ``Repository`` interface in order to not have to rewrite the logic that + builds the nested repository metadata based on the contents of a folder on disk. The advantage is that in this way + it is guaranteed that the exact same repository metadata is generated as it would have during normal operation.
+ However, if the ``Repository`` interface or its implementation ever changes, it is possible that this solution will + have to be adapted and the significant parts of the implementation will have to be copy pasted here. + + :return: mapping of node UUIDs onto the new repository metadata. + :raises `~aiida.common.exceptions.DatabaseMigrationError`: in case the container of the migrated repository already + exists or if the repository does not exist but the database contains at least one node. + """ + # pylint: disable=too-many-locals + from aiida.manage.configuration import get_profile + + profile = get_profile() + backend = NoopRepositoryBackend() + repository = MigrationRepository(backend=backend) + + # Initialize the new container: don't go through the profile, because that will not check if it already exists + filepath = pathlib.Path(profile.repository_path) / 'container' + basepath = pathlib.Path(profile.repository_path) / 'repository' / 'node' + container = Container(filepath) + + if not basepath.is_dir(): + # If the database is empty, this is a new profile and so it is normal the repo folder doesn't exist. We simply + # return as there is nothing to migrate + if profile.is_test_profile or node_count == 0: + return None, None + + raise exceptions.DatabaseMigrationError( + f'the file repository `{basepath}` does not exist but the database is not empty, it contains {node_count} ' + 'nodes. Aborting the migration.' + ) + + # When calling this function multiple times, once for each shard, we should only check whether the container has + # already been initialized for the first shard. + if shard is None or shard == '00': + if container.is_initialised and not profile.is_test_profile: + raise exceptions.DatabaseMigrationError( + f'the container {filepath} already exists. If you ran this migration before and it failed simply ' + 'delete this directory and restart the migration.' + ) + + container.init_container(clear=True, **profile.defaults['repository']) + + node_repository_dirpaths, missing_sub_repo_folder = get_node_repository_dirpaths(basepath, shard) + + filepaths = [] + streams = [] + mapping_metadata = {} + + # Loop over all the folders for each node that was found in the existing file repository and generate the repository + # metadata that will have to be stored on the node. Calling `put_object_from_tree` will generate the virtual + # hierarchy in memory, writing the files not actually to disk but opening lazy file handles, and then the call to + # `serialize_repository` serializes the virtual hierarchy into JSON storable dictionary. This will later be stored + # on the nodes in the database, and so it is added to the `mapping_metadata` which will be returned from this + # function. After having constructed the virtual hierarchy, we walk over the contents and take just the files and + # add the value (which is the `LazyOpener`) to the `streams` list as well as its relative path to `filepaths`. 
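+    # For example, for a node with a single file `sub/file.txt`, the metadata built in the loop below takes roughly + # the form `{'o': {'sub': {'o': {'file.txt': {'k': <LazyOpener>}}}}}`; each `k` value (for now a `LazyOpener`) is + # replaced further below with the hashkey generated by the disk object store container.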
+ for node_uuid, node_dirpath in node_repository_dirpaths.items(): + repository.put_object_from_tree(node_dirpath) + metadata = serialize_repository(repository) + mapping_metadata[node_uuid] = metadata + for root, _, filenames in repository.walk(): + for filename in filenames: + parts = list(pathlib.Path(root / filename).parts) + filepaths.append((node_uuid, parts)) + streams.append(functools.reduce(lambda objects, part: objects['o'].get(part), parts, metadata)['k']) + + # Reset the repository to a clean node repository, which removes the internal virtual file hierarchy + repository.reset() + + # Free up the memory of this mapping that is no longer needed and can be big + del node_repository_dirpaths + + hashkeys = container.add_streamed_objects_to_pack(streams, compress=False, open_streams=True) + + # Now all that remains is to go through all the generated repository metadata, stored for each node in the + # `mapping_metadata` and replace the "values" for all the files, which are currently still the `LazyOpener` + # instances, and replace them with the hashkey that was generated from its content by the DOS container. + for hashkey, (node_uuid, parts) in zip(hashkeys, filepaths): + repository_metadata = mapping_metadata[node_uuid] + functools.reduce(lambda objects, part: objects['o'].get(part), parts, repository_metadata)['k'] = hashkey + + del filepaths + del streams + + return mapping_metadata, missing_sub_repo_folder + + +def get_node_repository_dirpaths(basepath, shard=None): + """Return a mapping of node UUIDs onto the path to their current repository folder in the old repository. + + :param basepath: the absolute path of the base folder of the old file repository. + :param shard: optional shard to define which first shard level to check. If `None`, all shard levels are checked. + :return: dictionary of node UUID onto absolute filepath and list of node repo missing one of the two known sub + folders, ``path`` or ``raw_input``, which is unexpected. + :raises `~aiida.common.exceptions.DatabaseMigrationError`: if the repository contains node folders that contain both + the `path` and `raw_input` subdirectories, which should never happen. + """ + # pylint: disable=too-many-branches + from aiida.manage.configuration import get_profile + + profile = get_profile() + mapping = {} + missing_sub_repo_folder = [] + contains_both = [] + + if shard is not None: + + # If the shard is not present in the basepath, there is nothing to do + if shard not in os.listdir(basepath): + return mapping, missing_sub_repo_folder + + shards = [pathlib.Path(basepath) / shard] + else: + shards = basepath.iterdir() + + for shard_one in shards: + + if not REGEX_SHARD_SUB_LEVEL.match(shard_one.name): + continue + + for shard_two in shard_one.iterdir(): + + if not REGEX_SHARD_SUB_LEVEL.match(shard_two.name): + continue + + for shard_three in shard_two.iterdir(): + + if not REGEX_SHARD_FINAL_LEVEL.match(shard_three.name): + continue + + uuid = shard_one.name + shard_two.name + shard_three.name + dirpath = basepath / shard_one / shard_two / shard_three + subdirs = [path.name for path in dirpath.iterdir()] + + path = None + + if 'path' in subdirs and 'raw_input' in subdirs: + # If the `path` is empty, we simply ignore and set `raw_input` to be migrated, otherwise we add + # the entry to `contains_both` which will cause the migration to fail. 
+ if os.listdir(dirpath / 'path'): + contains_both.append(str(dirpath)) + else: + path = dirpath / 'raw_input' + elif 'path' in subdirs: + path = dirpath / 'path' + elif 'raw_input' in subdirs: + path = dirpath / 'raw_input' + else: + missing_sub_repo_folder.append(str(dirpath)) + + if path is not None: + mapping[uuid] = path + + if contains_both and not profile.is_test_profile: + raise exceptions.DatabaseMigrationError( + f'The file repository `{basepath}` contained node repository folders that contained both the `path` as well' + ' as the `raw_input` subfolders. This should not have happened, as the latter is used for calculation job ' + 'nodes, and the former for all other nodes. The migration will be aborted and the paths of the offending ' + 'node folders will be printed below. If you know which of the subpaths is incorrect, you can manually ' + 'delete it and then restart the migration. Here is the list of offending node folders:\n' + + '\n'.join(contains_both) + ) + + return mapping, missing_sub_repo_folder + + +def serialize_repository(repository: Repository) -> dict: + """Serialize the metadata into a JSON-serializable format. + + .. note:: the serialization format is optimized to reduce the size in bytes. + + :return: dictionary with the content metadata. + """ + file_object = repository._directory # pylint: disable=protected-access + if file_object.file_type == FileType.DIRECTORY: + if file_object.objects: + return {'o': {key: obj.serialize() for key, obj in file_object.objects.items()}} + return {} + return {'k': file_object.key} def ensure_repository_folder_created(uuid): @@ -28,12 +299,7 @@ def ensure_repository_folder_created(uuid): :param uuid: UUID of the node """ dirpath = get_node_repository_sub_folder(uuid) - - try: - os.makedirs(dirpath) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise + os.makedirs(dirpath, exist_ok=True) def put_object_from_string(uuid, name, content): @@ -44,7 +310,13 @@ def put_object_from_string(uuid, name, content): :param content: the content to write to the file """ ensure_repository_folder_created(uuid) - filepath = os.path.join(get_node_repository_sub_folder(uuid), name) + basepath = get_node_repository_sub_folder(uuid) + dirname = os.path.dirname(name) + + if dirname: + os.makedirs(os.path.join(basepath, dirname), exist_ok=True) + + filepath = os.path.join(basepath, name) with open(filepath, 'w', encoding='utf-8') as handle: handle.write(content) @@ -62,7 +334,7 @@ def get_object_from_repository(uuid, name): return handle.read() -def get_node_repository_sub_folder(uuid): +def get_node_repository_sub_folder(uuid, subfolder='path'): """Return the absolute path to the sub folder `path` within the repository of the node with the given UUID. 
:param uuid: UUID of the node @@ -73,7 +345,7 @@ def get_node_repository_sub_folder(uuid): uuid = str(uuid) repo_dirpath = os.path.join(get_profile().repository_path, 'repository') - node_dirpath = os.path.join(repo_dirpath, 'node', uuid[:2], uuid[2:4], uuid[4:], 'path') + node_dirpath = os.path.join(repo_dirpath, 'node', uuid[:2], uuid[2:4], uuid[4:], subfolder) return node_dirpath @@ -127,6 +399,15 @@ def load_numpy_array_from_repository(uuid, name): return numpy.load(filepath) +def get_repository_object(hashkey): + """Return the content of an object stored in the disk object store repository for the given hashkey.""" + from aiida.manage.configuration import get_profile + + dirpath_container = os.path.join(get_profile().repository_path, 'container') + container = Container(dirpath_container) + return container.get_object_content(hashkey) + + def recursive_datetime_to_isoformat(value): """Convert all datetime objects in the given value to string representations in ISO format. @@ -147,3 +428,99 @@ def recursive_datetime_to_isoformat(value): def dumps_json(dictionary): """Transforms all datetime object into isoformat and then returns the JSON.""" return json.dumps(recursive_datetime_to_isoformat(dictionary)) + + +def get_duplicate_uuids(table): + """Retrieve rows with duplicate UUIDS. + + :param table: database table with uuid column, e.g. 'db_dbnode' + :return: list of tuples of (id, uuid) of rows with duplicate UUIDs + """ + from aiida.manage.manager import get_manager + backend = get_manager().get_backend() + return backend.query_manager.get_duplicate_uuids(table=table) + + +def verify_uuid_uniqueness(table): + """Check whether database table contains rows with duplicate UUIDS. + + :param table: Database table with uuid column, e.g. 'db_dbnode' + :type str: + + :raises: IntegrityError if table contains rows with duplicate UUIDS. + """ + duplicates = get_duplicate_uuids(table=table) + + if duplicates: + raise exceptions.IntegrityError( + 'Table {table:} contains rows with duplicate UUIDS: run ' + '`verdi database integrity detect-duplicate-uuid -t {table:}` to address the problem'.format(table=table) + ) + + +def apply_new_uuid_mapping(table, mapping): + """Take a mapping of pks to UUIDs and apply it to the given table. + + :param table: database table with uuid column, e.g. 'db_dbnode' + :param mapping: dictionary of UUIDs mapped onto a pk + """ + from aiida.manage.manager import get_manager + backend = get_manager().get_backend() + backend.query_manager.apply_new_uuid_mapping(table, mapping) + + +def deduplicate_uuids(table=None): + """Detect and solve entities with duplicate UUIDs in a given database table. + + Before aiida-core v1.0.0, there was no uniqueness constraint on the UUID column of the node table in the database + and a few other tables as well. This made it possible to store multiple entities with identical UUIDs in the same + table without the database complaining. This bug was fixed in aiida-core=1.0.0 by putting an explicit uniqueness + constraint on UUIDs on the database level. However, this would leave databases created before this patch with + duplicate UUIDs in an inconsistent state. This command will run an analysis to detect duplicate UUIDs in a given + table and solve it by generating new UUIDs. Note that it will not delete or merge any rows. 
+ + :return: list of strings denoting the performed operations + :raises ValueError: if the specified table is invalid + """ + import distutils.dir_util + from collections import defaultdict + + from aiida.common.utils import get_new_uuid + + mapping = defaultdict(list) + + for pk, uuid in get_duplicate_uuids(table=table): + mapping[uuid].append(int(pk)) + + messages = [] + mapping_new_uuid = {} + + for uuid, rows in mapping.items(): + + uuid_ref = None + + for pk in rows: + + # We don't have to change all rows that have the same UUID; the first one can keep the original + if uuid_ref is None: + uuid_ref = uuid + continue + + uuid_new = str(get_new_uuid()) + mapping_new_uuid[pk] = uuid_new + + messages.append(f'updated UUID of {table} row<{pk}> from {uuid_ref} to {uuid_new}') + dirpath_repo_ref = get_node_repository_sub_folder(uuid_ref) + dirpath_repo_new = get_node_repository_sub_folder(uuid_new) + + # First make sure the new repository exists, then copy the contents of the ref into the new. We use the + # somewhat unknown `distutils.dir_util` method since it copies just the contents, as we want. + os.makedirs(dirpath_repo_new, exist_ok=True) + distutils.dir_util.copy_tree(dirpath_repo_ref, dirpath_repo_new) + + apply_new_uuid_mapping(table, mapping_new_uuid) + + if not messages: + messages = ['no duplicate UUIDs found'] + + return messages diff --git a/aiida/backends/manager.py b/aiida/backends/manager.py index f0fc3101ca..5a57baf141 100644 --- a/aiida/backends/manager.py +++ b/aiida/backends/manager.py @@ -8,7 +8,6 @@ # For further information please visit http://www.aiida.net # ########################################################################### """Module for settings and utilities to determine and set the database schema versions.""" - import abc import collections @@ -61,6 +60,8 @@ After the database schema is migrated to version `{schema_version_reset}` you can reinstall this version of `aiida-core` and migrate the schema generation. """ +REPOSITORY_UUID_KEY = 'repository|uuid' + Setting = collections.namedtuple('Setting', ['key', 'value', 'description', 'time']) @@ -221,6 +222,24 @@ def set_schema_generation_database(self, generation): """ self.get_settings_manager().set(SCHEMA_GENERATION_KEY, generation) + def set_repository_uuid(self, uuid): + """Set the UUID of the repository that is associated with this database. + + :param uuid: the UUID of the repository associated with this database. + """ + self.get_settings_manager().set(REPOSITORY_UUID_KEY, uuid, description='Repository UUID') + + def get_repository_uuid(self): + """Return the UUID of the repository that is associated with this database. + + :return: the UUID of the repository associated with this database or None if it doesn't exist. + """ + try: + setting = self.get_settings_manager().get(REPOSITORY_UUID_KEY) + return setting.value + except exceptions.NotExistent: + return None + def validate_schema(self, profile): """Validate that the current database generation and schema are up-to-date with that of the code. diff --git a/aiida/backends/sqlalchemy/migrations/versions/1feaea71bd5a_migrate_repository.py b/aiida/backends/sqlalchemy/migrations/versions/1feaea71bd5a_migrate_repository.py new file mode 100644 index 0000000000..dc300428e4 --- /dev/null +++ b/aiida/backends/sqlalchemy/migrations/versions/1feaea71bd5a_migrate_repository.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- +# pylint: disable=invalid-name,no-member +"""Migrate the file repository to the new disk object store based implementation.
+ + Revision ID: 1feaea71bd5a + Revises: 7536a82b2cc4 + Create Date: 2020-10-01 15:05:49.271958 + + """ + from alembic import op + from sqlalchemy import Integer, cast + from sqlalchemy.dialects.postgresql import UUID, JSONB + from sqlalchemy.sql import table, column, select, func, text + + from aiida.backends.general.migrations import utils + from aiida.cmdline.utils import echo + + # revision identifiers, used by Alembic. + revision = '1feaea71bd5a' + down_revision = '7536a82b2cc4' + branch_labels = None + depends_on = None + + + def upgrade(): + """Migrations for the upgrade.""" + # pylint: disable=too-many-locals + import json + from tempfile import NamedTemporaryFile + from aiida.common.progress_reporter import set_progress_bar_tqdm, get_progress_reporter + from aiida.manage.configuration import get_profile + + connection = op.get_bind() + + DbNode = table( + 'db_dbnode', + column('id', Integer), + column('uuid', UUID), + column('repository_metadata', JSONB), + ) + + profile = get_profile() + node_count = connection.execute(select([func.count()]).select_from(DbNode)).scalar() + missing_repo_folder = [] + shard_count = 256 + + set_progress_bar_tqdm() + + with get_progress_reporter()(total=shard_count, desc='Migrating file repository') as progress: + for i in range(shard_count): + + shard = '%.2x' % i # noqa flynt + progress.set_description_str(f'Migrating file repository: shard {shard}') + + mapping_node_repository_metadata, missing_sub_repo_folder = utils.migrate_legacy_repository( + node_count, shard + ) + + if missing_sub_repo_folder: + missing_repo_folder.extend(missing_sub_repo_folder) + del missing_sub_repo_folder + + if mapping_node_repository_metadata is None: + continue + + for node_uuid, repository_metadata in mapping_node_repository_metadata.items(): + + # If `repository_metadata` is `{}` or `None`, we skip it, as we can leave the column default `null`. + if not repository_metadata: + continue + + value = cast(repository_metadata, JSONB) + connection.execute(DbNode.update().where(DbNode.c.uuid == node_uuid).values(repository_metadata=value)) + + del mapping_node_repository_metadata + progress.update() + + # Store the UUID of the repository container in the `DbSetting` table. Note that for new databases, the profile + # setup will already have stored the UUID and so it should be skipped, or an exception for a duplicate key will be + # raised. This migration step is only necessary for existing databases that are migrated. + container_id = profile.get_repository_container().container_id + statement = text( + f""" + INSERT INTO db_dbsetting (key, val, description) + VALUES ('repository|uuid', to_json('{container_id}'::text), 'Repository UUID') + ON CONFLICT (key) DO NOTHING; + """ + ) + connection.execute(statement) + + if not profile.is_test_profile: + + if missing_repo_folder: + prefix = 'migration-repository-missing-subfolder-' + with NamedTemporaryFile(prefix=prefix, suffix='.json', dir='.', mode='w+', delete=False) as handle: + json.dump(missing_repo_folder, handle) + echo.echo_warning( + 'Detected repository folders that were missing the required subfolder `path` or `raw_input`. ' + f'The paths of those nodes have been written to a log file: {handle.name}' + ) + + # If there were no nodes, most likely a new profile, there is no need to print the warning + if node_count: + import pathlib + echo.echo_warning( + 'Migrated file repository to the new disk object store.
The old repository has not been deleted out' + f' of safety and can be found at {pathlib.Path(get_profile().repository_path, "repository")}.' + ) + + +def downgrade(): + """Migrations for the downgrade.""" diff --git a/aiida/backends/sqlalchemy/migrations/versions/7536a82b2cc4_add_node_repository_metadata.py b/aiida/backends/sqlalchemy/migrations/versions/7536a82b2cc4_add_node_repository_metadata.py new file mode 100644 index 0000000000..8e8c6d3e94 --- /dev/null +++ b/aiida/backends/sqlalchemy/migrations/versions/7536a82b2cc4_add_node_repository_metadata.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +# pylint: disable=invalid-name,no-member +"""Migration to add the `repository_metadata` JSONB column. + +Revision ID: 7536a82b2cc4 +Revises: 0edcdd5a30f0 +Create Date: 2020-07-09 11:32:39.924151 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = '7536a82b2cc4' +down_revision = '0edcdd5a30f0' +branch_labels = None +depends_on = None + + +def upgrade(): + """Migrations for the upgrade.""" + op.add_column('db_dbnode', sa.Column('repository_metadata', postgresql.JSONB(astext_type=sa.Text()), nullable=True)) + + +def downgrade(): + """Migrations for the downgrade.""" + op.drop_column('db_dbnode', 'repository_metadata') diff --git a/aiida/backends/sqlalchemy/models/node.py b/aiida/backends/sqlalchemy/models/node.py index efe18bc979..aee04e38b2 100644 --- a/aiida/backends/sqlalchemy/models/node.py +++ b/aiida/backends/sqlalchemy/models/node.py @@ -41,6 +41,7 @@ class DbNode(Base): mtime = Column(DateTime(timezone=True), default=timezone.now, onupdate=timezone.now) attributes = Column(JSONB) extras = Column(JSONB) + repository_metadata = Column(JSONB) dbcomputer_id = Column( Integer, diff --git a/aiida/cmdline/commands/cmd_database.py b/aiida/cmdline/commands/cmd_database.py index 6da318ea34..cd287af561 100644 --- a/aiida/cmdline/commands/cmd_database.py +++ b/aiida/cmdline/commands/cmd_database.py @@ -59,7 +59,7 @@ def database_migrate(force): if force: try: backend.migrate() - except exceptions.ConfigurationError as exception: + except (exceptions.ConfigurationError, exceptions.DatabaseMigrationError) as exception: echo.echo_critical(str(exception)) return @@ -88,7 +88,7 @@ def database_migrate(force): else: try: backend.migrate() - except exceptions.ConfigurationError as exception: + except (exceptions.ConfigurationError, exceptions.DatabaseMigrationError) as exception: echo.echo_critical(str(exception)) else: echo.echo_success('migration completed') diff --git a/aiida/cmdline/commands/cmd_devel.py b/aiida/cmdline/commands/cmd_devel.py index 381fd46045..e58dfff18c 100644 --- a/aiida/cmdline/commands/cmd_devel.py +++ b/aiida/cmdline/commands/cmd_devel.py @@ -108,10 +108,3 @@ def devel_play(): import webbrowser webbrowser.open_new('http://upload.wikimedia.org/wikipedia/commons/3/32/Triumphal_March_from_Aida.ogg') - - -@verdi_devel.command() -def configure_backup(): - """Configure backup of the repository folder.""" - from 
aiida.manage.backup.backup_setup import BackupSetup - BackupSetup().run() diff --git a/aiida/cmdline/commands/cmd_node.py b/aiida/cmdline/commands/cmd_node.py index 1be70e7ee4..b7ac1e14d2 100644 --- a/aiida/cmdline/commands/cmd_node.py +++ b/aiida/cmdline/commands/cmd_node.py @@ -58,7 +58,7 @@ def repo_cat(node, relative_path): @verdi_node_repo.command('ls') @arguments.NODE() -@click.argument('relative_path', type=str, default='.') +@click.argument('relative_path', type=str, required=False) @click.option('-c', '--color', 'color', flag_value=True, help='Use different color for folders and files.') @with_dbenv() def repo_ls(node, relative_path, color): diff --git a/aiida/cmdline/commands/cmd_setup.py b/aiida/cmdline/commands/cmd_setup.py index 241e048bb8..2017161e9f 100644 --- a/aiida/cmdline/commands/cmd_setup.py +++ b/aiida/cmdline/commands/cmd_setup.py @@ -78,7 +78,8 @@ def setup( # Migrate the database echo.echo_info('migrating the database.') - backend = get_manager()._load_backend(schema_check=False) # pylint: disable=protected-access + manager = get_manager() + backend = manager._load_backend(schema_check=False) # pylint: disable=protected-access try: backend.migrate() @@ -89,6 +90,27 @@ def setup( else: echo.echo_success('database migration completed.') + # Retrieve the repository UUID from the database. If set, this means this database is associated with the repository + # with that UUID and we have to make sure that the provided repository corresponds to it. + backend_manager = manager.get_backend_manager() + repository_uuid_database = backend_manager.get_repository_uuid() + repository_uuid_profile = profile.get_repository_container().container_id + + # If database contains no repository UUID, it should be a clean database so associate it with the repository + if repository_uuid_database is None: + backend_manager.set_repository_uuid(repository_uuid_profile) + + # Otherwise, if the database UUID does not match that of the repository, it means they do not belong together. Note + # that if a new repository path was specified, which does not yet contain a container, the call to retrieve the + # repo by `get_repository_container` will initialize the container and generate a UUID. This guarantees that if a + # non-empty database is configured with an empty repository path, this check will hit. + elif repository_uuid_database != repository_uuid_profile: + echo.echo_critical( + f'incompatible database and repository configured:\n' + f'Database `{db_name}` is associated with the repository with UUID `{repository_uuid_database}`\n' + f'However, the configured repository `{repository}` has UUID `{repository_uuid_profile}`.' 
+ ) + # Optionally setting configuration default user settings config.set_option('autofill.user.email', email, override=False) config.set_option('autofill.user.first_name', first_name, override=False) diff --git a/aiida/common/exceptions.py b/aiida/common/exceptions.py index 72909d73e8..fbae709d02 100644 --- a/aiida/common/exceptions.py +++ b/aiida/common/exceptions.py @@ -17,7 +17,7 @@ 'PluginInternalError', 'ValidationError', 'ConfigurationError', 'ProfileConfigurationError', 'MissingConfigurationError', 'ConfigurationVersionError', 'IncompatibleDatabaseSchema', 'DbContentError', 'InputValidationError', 'FeatureNotAvailable', 'FeatureDisabled', 'LicensingException', 'TestsNotAllowedError', - 'UnsupportedSpeciesError', 'TransportTaskException', 'OutputParsingError', 'HashingError' + 'UnsupportedSpeciesError', 'TransportTaskException', 'OutputParsingError', 'HashingError', 'DatabaseMigrationError' ) @@ -186,6 +186,10 @@ class IncompatibleDatabaseSchema(ConfigurationError): """Raised when the database schema is incompatible with that of the code.""" +class DatabaseMigrationError(AiidaException): + """Raised if a critical error is encountered during a database migration.""" + + class DbContentError(AiidaException): """ Raised when the content of the DB is not valid. diff --git a/aiida/engine/daemon/execmanager.py b/aiida/engine/daemon/execmanager.py index 02dc638a99..9e1e3459e3 100644 --- a/aiida/engine/daemon/execmanager.py +++ b/aiida/engine/daemon/execmanager.py @@ -291,7 +291,12 @@ def upload_calculation( relpath = os.path.normpath(os.path.relpath(filepath, folder.abspath)) if relpath not in provenance_exclude_list: with open(filepath, 'rb') as handle: - node._repository.put_object_from_filelike(handle, relpath, 'wb', force=True) # pylint: disable=protected-access + node._repository.put_object_from_filelike(handle, relpath) # pylint: disable=protected-access + + # Since the node is already stored, we cannot use the normal repository interface since it will raise a + # `ModificationNotAllowed` error. To bypass it, we go straight to the underlying repository instance to store the + # files, however, this means we have to manually update the node's repository metadata. + node._update_repository_metadata() # pylint: disable=protected-access if not dry_run: # Make sure that attaching the `remote_folder` with a link is the last thing we do. This gives the biggest diff --git a/aiida/manage/backup/__init__.py b/aiida/manage/backup/__init__.py deleted file mode 100644 index 2776a55f97..0000000000 --- a/aiida/manage/backup/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# -*- coding: utf-8 -*- -########################################################################### -# Copyright (c), The AiiDA team. All rights reserved. # -# This file is part of the AiiDA code. # -# # -# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # -# For further information on the license, see the LICENSE.txt file # -# For further information please visit http://www.aiida.net # -########################################################################### diff --git a/aiida/manage/backup/backup_base.py b/aiida/manage/backup/backup_base.py deleted file mode 100644 index a643699a4c..0000000000 --- a/aiida/manage/backup/backup_base.py +++ /dev/null @@ -1,423 +0,0 @@ -# -*- coding: utf-8 -*- -########################################################################### -# Copyright (c), The AiiDA team. All rights reserved. # -# This file is part of the AiiDA code. 
# -# # -# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # -# For further information on the license, see the LICENSE.txt file # -# For further information please visit http://www.aiida.net # -########################################################################### -"""Base abstract Backup class for all backends.""" -import datetime -import os -import logging -import shutil - -from abc import ABC, abstractmethod -from dateutil.parser import parse - -from aiida.common import json -from aiida.common import timezone as dtimezone - - -class AbstractBackup(ABC): - """ - This class handles the backup of the AiiDA repository that is referenced - by the current AiiDA database. The backup will start from the - given backup timestamp (*oldest_object_backedup*) or the date of the - oldest node/workflow object found and it will periodically backup - (in periods of *periodicity* days) until the ending date of the backup - specified by *end_date_of_backup* or *days_to_backup*. - """ - - # Keys in the dictionary loaded by the JSON file - OLDEST_OBJECT_BK_KEY = 'oldest_object_backedup' - BACKUP_DIR_KEY = 'backup_dir' - DAYS_TO_BACKUP_KEY = 'days_to_backup' - END_DATE_OF_BACKUP_KEY = 'end_date_of_backup' - PERIODICITY_KEY = 'periodicity' - BACKUP_LENGTH_THRESHOLD_KEY = 'backup_length_threshold' - - # Backup parameters that will be populated by the JSON file - - # Where did the last backup stop - _oldest_object_bk = None - # The destination directory of the backup - _backup_dir = None - - # How many days to backup - _days_to_backup = None - # Until what date we should backup - _end_date_of_backup = None - - # How many consecutive days to backup in one round. - _periodicity = None - - # The threshold (in hours) between the oldest object to be backed up - # and the end of the backup. If the difference is bellow this threshold - # the backup should not start. - _backup_length_threshold = None - - # The end of the backup dates (or days) until the end are translated to - # the following internal variable containing the end date - _internal_end_date_of_backup = None - - _additional_back_time_mins = None - - _ignore_backup_dir_existence_check = False # pylint: disable=invalid-name - - def __init__(self, backup_info_filepath, additional_back_time_mins): - - # The path to the JSON file with the backup information - self._backup_info_filepath = backup_info_filepath - - self._additional_back_time_mins = additional_back_time_mins - - # Configuring the logging - logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s') - - # The logger of the backup script - self._logger = logging.getLogger('aiida.aiida_backup') - - def _read_backup_info_from_file(self, backup_info_file_name): - """ - This method reads the backup information from the given file and - passes the dictionary to the method responsible for the initialization - of the needed class variables. - """ - backup_variables = None - - with open(backup_info_file_name, 'r', encoding='utf8') as backup_info_file: - try: - backup_variables = json.load(backup_info_file) - except ValueError: - self._logger.error('Could not parse file %s', backup_info_file_name) - raise BackupError(f'Could not parse file {backup_info_file_name}') - - self._read_backup_info_from_dict(backup_variables) - - def _read_backup_info_from_dict(self, backup_variables): # pylint: disable=too-many-branches,too-many-statements - """ - This method reads the backup information from the given dictionary and - sets the needed class variables. 
- """ - # Setting the oldest backup date. This will be used as start of - # the new backup procedure. - # - # If the oldest backup date is not set, then find the oldest - # creation timestamp and set it as the oldest backup date. - if backup_variables.get(self.OLDEST_OBJECT_BK_KEY) is None: - query_node_res = self._query_first_node() - - if not query_node_res: - self._logger.error('The oldest modification date was not found.') - raise BackupError('The oldest modification date was not found.') - - oldest_timestamps = [] - if query_node_res: - oldest_timestamps.append(query_node_res[0].ctime) - - self._oldest_object_bk = min(oldest_timestamps) - self._logger.info( - 'Setting the oldest modification date to the creation date of the oldest object ' - '(%s)', self._oldest_object_bk - ) - - # If the oldest backup date is not None then try to parse it - else: - try: - self._oldest_object_bk = parse(backup_variables.get(self.OLDEST_OBJECT_BK_KEY)) - if self._oldest_object_bk.tzinfo is None: - curr_timezone = dtimezone.get_current_timezone() - self._oldest_object_bk = dtimezone.get_current_timezone().localize(self._oldest_object_bk) - self._logger.info( - 'No timezone defined in the oldest modification date timestamp. Setting current timezone (%s).', - curr_timezone.zone - ) - # If it is not parsable... - except ValueError: - self._logger.error('We did not manage to parse the start timestamp of the last backup.') - raise - - # Setting the backup directory & normalizing it - self._backup_dir = os.path.normpath(backup_variables.get(self.BACKUP_DIR_KEY)) - if (not self._ignore_backup_dir_existence_check and not os.path.isdir(self._backup_dir)): - self._logger.error('The given backup directory does not exist.') - raise BackupError('The given backup directory does not exist.') - - # You can not set an end-of-backup date and end days from the backup - # that you should stop. - if ( - backup_variables.get(self.DAYS_TO_BACKUP_KEY) is not None and - backup_variables.get(self.END_DATE_OF_BACKUP_KEY) is not None - ): - self._logger.error('Only one end of backup date can be set.') - raise BackupError('Only one backup end can be set (date or days from backup start.') - - # Check if there is an end-of-backup date - elif backup_variables.get(self.END_DATE_OF_BACKUP_KEY) is not None: - try: - self._end_date_of_backup = parse(backup_variables.get(self.END_DATE_OF_BACKUP_KEY)) - - if self._end_date_of_backup.tzinfo is None: - curr_timezone = dtimezone.get_current_timezone() - self._end_date_of_backup = \ - curr_timezone.localize( - self._end_date_of_backup) - self._logger.info( - 'No timezone defined in the end date of backup timestamp. Setting current timezone (%s).', - curr_timezone.zone - ) - - self._internal_end_date_of_backup = self._end_date_of_backup - except ValueError: - self._logger.error('The end date of the backup could not be parsed correctly') - raise - - # Check if there is defined a days to backup - elif backup_variables.get(self.DAYS_TO_BACKUP_KEY) is not None: - try: - self._days_to_backup = int(backup_variables.get(self.DAYS_TO_BACKUP_KEY)) - self._internal_end_date_of_backup = ( - self._oldest_object_bk + datetime.timedelta(days=self._days_to_backup) - ) - except ValueError: - self._logger.error('The days to backup should be an integer') - raise - # If the backup end is not set, then the ending date remains open - - # Parse the backup periodicity. 
- try: - self._periodicity = int(backup_variables.get(self.PERIODICITY_KEY)) - except ValueError: - self._logger.error('The backup _periodicity should be an integer') - raise - - # Parse the backup length threshold - try: - hours_th = int(backup_variables.get(self.BACKUP_LENGTH_THRESHOLD_KEY)) - self._backup_length_threshold = datetime.timedelta(hours=hours_th) - except ValueError: - self._logger.error('The backup length threshold should be an integer') - raise - - def _dictionarize_backup_info(self): - """ - This dictionarises the backup information and returns the dictionary. - """ - backup_variables = { - self.OLDEST_OBJECT_BK_KEY: str(self._oldest_object_bk), - self.BACKUP_DIR_KEY: self._backup_dir, - self.DAYS_TO_BACKUP_KEY: self._days_to_backup, - self.END_DATE_OF_BACKUP_KEY: None if self._end_date_of_backup is None else str(self._end_date_of_backup), - self.PERIODICITY_KEY: self._periodicity, - self.BACKUP_LENGTH_THRESHOLD_KEY: int(self._backup_length_threshold.total_seconds() // 3600) - } - - return backup_variables - - def _store_backup_info(self, backup_info_file_name): - """ - This method writes the backup variables dictionary to a file with the - given filename. - """ - backup_variables = self._dictionarize_backup_info() - with open(backup_info_file_name, 'wb') as backup_info_file: - json.dump(backup_variables, backup_info_file) - - def _find_files_to_backup(self): - """ - Query the database for nodes that were created after the - the start of the last backup. Return a query set. - """ - # Go a bit further back to avoid any rounding problems. Set the - # smallest timestamp to be backed up. - start_of_backup = (self._oldest_object_bk - datetime.timedelta(minutes=self._additional_back_time_mins)) - - # Find the end of backup for this round using the given _periodicity. - backup_end_for_this_round = (self._oldest_object_bk + datetime.timedelta(days=self._periodicity)) - - # If the end of the backup is after the given end by the user, - # adapt it accordingly - if ( - self._internal_end_date_of_backup is not None and - backup_end_for_this_round > self._internal_end_date_of_backup - ): - backup_end_for_this_round = self._internal_end_date_of_backup - - # If the end of the backup is after the current time, adapt the end accordingly - now_timestamp = datetime.datetime.now(dtimezone.get_current_timezone()) - if backup_end_for_this_round > now_timestamp: - self._logger.info( - 'We can not backup until %s. We will backup until now (%s).', backup_end_for_this_round, now_timestamp - ) - backup_end_for_this_round = now_timestamp - - # Check if the backup length is below the backup length threshold - if backup_end_for_this_round - start_of_backup < \ - self._backup_length_threshold: - self._logger.info('Backup (timestamp) length is below the given threshold. 
Backup finished') - return -1, None - - # Construct the queries & query sets - query_sets = self._get_query_sets(start_of_backup, backup_end_for_this_round) - - # Set the new start of the backup - self._oldest_object_bk = backup_end_for_this_round - - # Check if threshold is 0 - if self._backup_length_threshold == datetime.timedelta(hours=0): - return -2, query_sets - - return 0, query_sets - - @staticmethod - def _get_repository_path(): - from aiida.manage.configuration import get_profile - return get_profile().repository_path - - def _backup_needed_files(self, query_sets): - """Perform backup of a minimum-set of files""" - - repository_path = os.path.normpath(self._get_repository_path()) - - parent_dir_set = set() - copy_counter = 0 - - dir_no_to_copy = 0 - - for query_set in query_sets: - dir_no_to_copy += self._get_query_set_length(query_set) - - self._logger.info('Start copying %s directories', dir_no_to_copy) - - last_progress_print = datetime.datetime.now() - percent_progress = 0 - - for query_set in query_sets: - for item in self._get_query_set_iterator(query_set): - source_dir = self._get_source_directory(item) - - # Get the relative directory without the / which - # separates the repository_path from the relative_dir. - relative_dir = source_dir[(len(repository_path) + 1):] - destination_dir = os.path.join(self._backup_dir, relative_dir) - - # Remove the destination directory if it already exists - if os.path.exists(destination_dir): - shutil.rmtree(destination_dir) - - # Copy the needed directory - try: - shutil.copytree(source_dir, destination_dir, True, None) - except EnvironmentError as why: - self._logger.warning( - 'Problem copying directory %s to %s. More information: %s (Error no: %s)', source_dir, - destination_dir, why.strerror, why.errno - ) - # Raise envEr - - # Extract the needed parent directories - AbstractBackup._extract_parent_dirs(relative_dir, parent_dir_set) - copy_counter += 1 - log_msg = 'Copied %.0f directories [%s] (%3.0f/100)' - - if ( - self._logger.getEffectiveLevel() <= logging.INFO and - (datetime.datetime.now() - last_progress_print).seconds > 60 - ): - last_progress_print = datetime.datetime.now() - percent_progress = copy_counter * 100 / dir_no_to_copy - self._logger.info(log_msg, copy_counter, item.__class__.__name__, percent_progress) - - if ( - self._logger.getEffectiveLevel() <= logging.INFO and percent_progress < - (copy_counter * 100 / dir_no_to_copy) - ): - percent_progress = (copy_counter * 100 / dir_no_to_copy) - last_progress_print = datetime.datetime.now() - self._logger.info(log_msg, copy_counter, item.__class__.__name__, percent_progress) - - self._logger.info('%.0f directories copied', copy_counter) - - self._logger.info('Start setting permissions') - perm_counter = 0 - for tmp_rel_path in parent_dir_set: - try: - shutil.copystat( - os.path.join(repository_path, tmp_rel_path), os.path.join(self._backup_dir, tmp_rel_path) - ) - except OSError as why: - self._logger.warning( - 'Problem setting permissions to directory %s.', os.path.join(self._backup_dir, tmp_rel_path) - ) - self._logger.warning(os.path.join(repository_path, tmp_rel_path)) - self._logger.warning('More information: %s (Error no: %s)', why.strerror, why.errno) - perm_counter += 1 - - self._logger.info('Set correct permissions to %.0f directories.', perm_counter) - - self._logger.info('End of backup.') - self._logger.info('Backed up objects with modification timestamp less or equal to %s.', self._oldest_object_bk) - - @staticmethod - def _extract_parent_dirs(given_rel_dir, 
parent_dir_set): - """ - This method extracts the parent directories of the givenDir - and populates the parent_dir_set. - """ - sub_paths = given_rel_dir.split('/') - - temp_path = '' - for sub_path in sub_paths: - temp_path += f'{sub_path}/' - parent_dir_set.add(temp_path) - - return parent_dir_set - - def run(self): - """Run the backup""" - while True: - self._read_backup_info_from_file(self._backup_info_filepath) - item_sets_to_backup = self._find_files_to_backup() - if item_sets_to_backup[0] == -1: - break - self._backup_needed_files(item_sets_to_backup[1]) - self._store_backup_info(self._backup_info_filepath) - if item_sets_to_backup[0] == -2: - self._logger.info('Threshold is 0. Backed up one round and exiting.') - break - - @abstractmethod - def _query_first_node(self): - """Query first node""" - - @abstractmethod - def _get_query_set_length(self, query_set): - """Get query set length""" - - @abstractmethod - def _get_query_sets(self, start_of_backup, backup_end_for_this_round): - """Get query set""" - - @abstractmethod - def _get_query_set_iterator(self, query_set): - """Get query set iterator""" - - @abstractmethod - def _get_source_directory(self, item): - """Get source directory of item - :param self: - :return: - """ - - -class BackupError(Exception): - """General backup error""" - - def __init__(self, value, *args, **kwargs): - super().__init__(*args, **kwargs) - self._value = value - - def __str__(self): - return repr(self._value) diff --git a/aiida/manage/backup/backup_general.py b/aiida/manage/backup/backup_general.py deleted file mode 100644 index 1ec59796ee..0000000000 --- a/aiida/manage/backup/backup_general.py +++ /dev/null @@ -1,89 +0,0 @@ -# -*- coding: utf-8 -*- -########################################################################### -# Copyright (c), The AiiDA team. All rights reserved. # -# This file is part of the AiiDA code. # -# # -# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # -# For further information on the license, see the LICENSE.txt file # -# For further information please visit http://www.aiida.net # -########################################################################### -"""Backup implementation for any backend (using the QueryBuilder).""" -# pylint: disable=no-member - -import os - -from aiida.orm import Node -from aiida.manage.backup.backup_base import AbstractBackup, BackupError -from aiida.common.folders import RepositoryFolder -from aiida.orm.utils._repository import Repository - - -class Backup(AbstractBackup): - """Backup for any backend""" - - def _query_first_node(self): - """Query first node - :return: The first Node object (return specific subclass thereof). - :rtype: :class:`~aiida.orm.nodes.node.Node` - """ - return Node.objects.find(order_by='ctime')[:1] - - def _get_query_set_length(self, query_set): - """Get query set length""" - return query_set.count() - - def _get_query_sets(self, start_of_backup, backup_end_for_this_round): - """Get Nodes and Worflows query set from start to end of backup. - - :param start_of_backup: datetime object with start datetime of Node modification times for backup. - :param backup_end_for_this_round: datetime object with end datetime of Node modification times for backup this - round. - - :return: List of QueryBuilder queries/query. 
- :rtype: :class:`~aiida.orm.querybuilder.QueryBuilder` - """ - mtime_interval = {'mtime': {'and': [{'>=': str(start_of_backup)}, {'<=': str(backup_end_for_this_round)}]}} - query_set = Node.objects.query() - query_set.add_filter(Node, mtime_interval) - - return [query_set] - - def _get_query_set_iterator(self, query_set): - """Get query set iterator - - :param query_set: QueryBuilder object - :type query_set: :class:`~aiida.orm.querybuilder.QueryBuilder` - - :return: Generator, returning the results of the QueryBuilder query. - :rtype: list - - :raises `~aiida.manage.backup.backup_base.BackupError`: if the number of yielded items in the list from - iterall() is more than 1. - """ - for item in query_set.iterall(): - yield_len = len(item) - if yield_len == 1: - yield item[0] - else: - msg = 'Unexpected number of items in list yielded from QueryBuilder.iterall(): %s' - self._logger.error(msg, yield_len) - raise BackupError(msg % yield_len) - - def _get_source_directory(self, item): - """Retrieve the node repository folder - - :param item: Subclasses of Node. - :type item: :class:`~aiida.orm.nodes.node.Node` - - :return: Normalized path to the Node's repository folder. - :rtype: str - """ - # pylint: disable=protected-access - if isinstance(item, Node): - source_dir = os.path.normpath(RepositoryFolder(section=Repository._section_name, uuid=item.uuid).abspath) - else: - # Raise exception - msg = 'Unexpected item type to backup: %s' - self._logger.error(msg, type(item)) - raise BackupError(msg % type(item)) - return source_dir diff --git a/aiida/manage/backup/backup_info.json.tmpl b/aiida/manage/backup/backup_info.json.tmpl deleted file mode 100644 index 33c5e37a6c..0000000000 --- a/aiida/manage/backup/backup_info.json.tmpl +++ /dev/null @@ -1 +0,0 @@ -{"backup_length_threshold": 1, "periodicity": 2, "oldest_object_backedup": null, "end_date_of_backup": null, "days_to_backup": null, "backup_dir": "/scratch/backup_dest/backup_script_dest/"} diff --git a/aiida/manage/backup/backup_setup.py b/aiida/manage/backup/backup_setup.py deleted file mode 100644 index 264e6b1ac2..0000000000 --- a/aiida/manage/backup/backup_setup.py +++ /dev/null @@ -1,256 +0,0 @@ -# -*- coding: utf-8 -*- -########################################################################### -# Copyright (c), The AiiDA team. All rights reserved. # -# This file is part of the AiiDA code. # -# # -# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # -# For further information on the license, see the LICENSE.txt file # -# For further information please visit http://www.aiida.net # -########################################################################### -"""Class to backup an AiiDA instance profile.""" - -import datetime -import logging -import os -import shutil -import stat -import sys - -from aiida.common import json -from aiida.manage import configuration -from aiida.manage.backup.backup_base import AbstractBackup -from aiida.manage.configuration.settings import AIIDA_CONFIG_FOLDER - -from aiida.manage.backup import backup_utils as utils - - -class BackupSetup: - """ - This class setups the main backup script related information & files like:: - - - the backup parameter file. It also allows the user to set it up by answering questions. - - the backup folders. - - the script that initiates the backup. 
- """ - - def __init__(self): - # The backup directory names - self._conf_backup_folder_rel = f'backup_{configuration.PROFILE.name}' - self._file_backup_folder_rel = 'backup_dest' - - # The backup configuration file (& template) names - self._backup_info_filename = 'backup_info.json' - self._backup_info_tmpl_filename = 'backup_info.json.tmpl' - - # The name of the script that initiates the backup - self._script_filename = 'start_backup.py' - - # Configuring the logging - logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s') - - # The logger of the backup script - self._logger = logging.getLogger('aiida_backup_setup') - - @staticmethod - def construct_backup_variables(file_backup_folder_abs): - """Construct backup variables.""" - backup_variables = {} - - # Setting the oldest backup timestamp - oldest_object_bk = utils.ask_question( - 'Please provide the oldest backup timestamp ' - '(e.g. 2014-07-18 13:54:53.688484+00:00): ', datetime.datetime, True - ) - - if oldest_object_bk is None: - backup_variables[AbstractBackup.OLDEST_OBJECT_BK_KEY] = None - else: - backup_variables[AbstractBackup.OLDEST_OBJECT_BK_KEY] = str(oldest_object_bk) - - # Setting the backup directory - backup_variables[AbstractBackup.BACKUP_DIR_KEY] = file_backup_folder_abs - - # Setting the days_to_backup - backup_variables[AbstractBackup.DAYS_TO_BACKUP_KEY - ] = utils.ask_question('Please provide the number of days to backup: ', int, True) - - # Setting the end date - end_date_of_backup_key = utils.ask_question( - 'Please provide the end date of the backup (e.g. 2014-07-18 13:54:53.688484+00:00): ', datetime.datetime, - True - ) - if end_date_of_backup_key is None: - backup_variables[AbstractBackup.END_DATE_OF_BACKUP_KEY] = None - else: - backup_variables[AbstractBackup.END_DATE_OF_BACKUP_KEY] = str(end_date_of_backup_key) - - # Setting the backup periodicity - backup_variables[AbstractBackup.PERIODICITY_KEY - ] = utils.ask_question('Please provide the periodicity (in days): ', int, False) - - # Setting the backup threshold - backup_variables[AbstractBackup.BACKUP_LENGTH_THRESHOLD_KEY - ] = utils.ask_question('Please provide the backup threshold (in hours): ', int, False) - - return backup_variables - - def create_dir(self, question, dir_path): - """Create the directories for the backup folder and return its path.""" - final_path = utils.query_string(question, dir_path) - - if not os.path.exists(final_path): - if utils.query_yes_no(f"The path {final_path} doesn't exist. Should it be created?", 'yes'): - try: - os.makedirs(final_path) - except OSError: - self._logger.error('Error creating the path %s.', final_path) - raise - return final_path - - @staticmethod - def print_info(): - """Write a string with information to stdout.""" - info_str = \ -"""Variables to set up in the JSON file ------------------------------------- - - * ``periodicity`` (in days): The backup runs periodically for a number of days - defined in the periodicity variable. The purpose of this variable is to limit - the backup to run only on a few number of days and therefore to limit the - number of files that are backed up at every round. e.g. ``"periodicity": 2`` - Example: if you have files in the AiiDA repositories created in the past 30 - days, and periodicity is 15, the first run will backup the files of the first - 15 days; a second run of the script will backup the next 15 days, completing - the backup (if it is run within the same day). Further runs will only backup - newer files, if they are created. 
- - * ``oldest_object_backedup`` (timestamp or null): This is the timestamp of the - oldest object that was backed up. If you are not aware of this value or if it - is the first time that you start a backup up for this repository, then set - this value to ``null``. Then the script will search the creation date of the - oldest node object in the database and it will start - the backup from that date. E.g. ``"oldest_object_backedup": - "2015-07-20 11:13:08.145804+02:00"`` - - * ``end_date_of_backup``: If set, the backup script will backup files that - have a modification date until the value specified by this variable. If not - set, the ending of the backup will be set by the following variable - (``days_to_backup``) which specifies how many days to backup from the start - of the backup. If none of these variables are set (``end_date_of_backup`` - and ``days_to_backup``), then the end date of backup is set to the current - date. E.g. ``"end_date_of_backup": null`` or ``"end_date_of_backup": - "2015-07-20 11:13:08.145804+02:00"`` - - * ``days_to_backup``: If set, you specify how many days you will backup from - the starting date of your backup. If it set to ``null`` and also - ``end_date_of_backup`` is set to ``null``, then the end date of the backup - is set to the current date. You can not set ``days_to_backup`` - & ``end_date_of_backup`` at the same time (it will lead to an error). - E.g. ``"days_to_backup": null`` or ``"days_to_backup": 5`` - - * ``backup_length_threshold`` (in hours): The backup script runs in rounds and - on every round it backs-up a number of days that are controlled primarily by - ``periodicity`` and also by ``end_date_of_backup`` / ``days_to_backup``, - for the last backup round. The ``backup_length_threshold`` specifies the - lowest acceptable round length. This is important for the end of the backup. - - * ``backup_dir``: The destination directory of the backup. e.g. - ``"backup_dir": "/scratch/aiida_user/backup_script_dest"`` -""" - sys.stdout.write(info_str) - - def run(self): - """Run the backup.""" - conf_backup_folder_abs = self.create_dir( - 'Please provide the backup folder by providing the full path.', - os.path.join(os.path.expanduser(AIIDA_CONFIG_FOLDER), self._conf_backup_folder_rel) - ) - - file_backup_folder_abs = self.create_dir( - 'Please provide the destination folder of the backup (normally in ' - 'the previously provided backup folder).', - os.path.join(conf_backup_folder_abs, self._file_backup_folder_rel) - ) - - # The template backup configuration file - template_conf_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), self._backup_info_tmpl_filename) - - # Copy the sample configuration file to the backup folder - try: - shutil.copy(template_conf_path, conf_backup_folder_abs) - except OSError: - self._logger.error( - 'Error copying the file %s to the directory %s', template_conf_path, conf_backup_folder_abs - ) - raise - - if utils.query_yes_no( - 'A sample configuration file was copied to {}. 
' - 'Would you like to see the configuration parameters explanation?'.format(conf_backup_folder_abs), - default='yes' - ): - self.print_info() - - # Construct the path to the backup configuration file - final_conf_filepath = os.path.join(conf_backup_folder_abs, self._backup_info_filename) - - # If the backup parameters are configured now - if utils.query_yes_no('Would you like to configure the backup configuration file now?', default='yes'): - - # Ask questions to properly setup the backup variables - backup_variables = self.construct_backup_variables(file_backup_folder_abs) - - with open(final_conf_filepath, 'wb') as backup_info_file: - json.dump(backup_variables, backup_info_file) - # If the backup parameters are configured manually - else: - sys.stdout.write( - f'Please rename the file {self._backup_info_tmpl_filename} ' + - f'found in {conf_backup_folder_abs} to ' + f'{self._backup_info_filename} and ' + - 'change the backup parameters accordingly.\n' - ) - sys.stdout.write( - 'Please adapt the startup script accordingly to point to the ' + - 'correct backup configuration file. For the moment, it points ' + - f'to {os.path.join(conf_backup_folder_abs, self._backup_info_filename)}\n' - ) - - script_content = \ -f"""#!/usr/bin/env python -import logging -from aiida.manage.configuration import load_profile - -load_profile(profile='{configuration.PROFILE.name}') - -from aiida.manage.backup.backup_general import Backup - -# Create the backup instance -backup_inst = Backup(backup_info_filepath="{final_conf_filepath}", additional_back_time_mins=2) - -# Define the backup logging level -backup_inst._logger.setLevel(logging.INFO) - -# Start the backup -backup_inst.run() -""" - - # Script full path - script_path = os.path.join(conf_backup_folder_abs, self._script_filename) - - # Write the contents to the script - with open(script_path, 'w', encoding='utf8') as script_file: - script_file.write(script_content) - - # Set the right permissions - try: - statistics = os.stat(script_path) - os.chmod(script_path, statistics.st_mode | stat.S_IEXEC) - except OSError: - self._logger.error('Problem setting the right permissions to the script %s.', script_path) - raise - - sys.stdout.write('Backup setup completed.\n') - - -if __name__ == '__main__': - BackupSetup().run() diff --git a/aiida/manage/backup/backup_utils.py b/aiida/manage/backup/backup_utils.py deleted file mode 100644 index b00b1c7320..0000000000 --- a/aiida/manage/backup/backup_utils.py +++ /dev/null @@ -1,126 +0,0 @@ -# -*- coding: utf-8 -*- -########################################################################### -# Copyright (c), The AiiDA team. All rights reserved. # -# This file is part of the AiiDA code. # -# # -# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # -# For further information on the license, see the LICENSE.txt file # -# For further information please visit http://www.aiida.net # -########################################################################### -# pylint: disable=redefined-builtin -"""Utilities for the backup functionality.""" - -import datetime -import sys - -import dateutil - - -def ask_question(question, reply_type, allow_none_as_answer=True): - """ - This method asks a specific question, tries to parse the given reply - and then it verifies the parsed answer. - :param question: The question to be asked. - :param reply_type: The type of the expected answer (int, datetime etc). It - is needed for the parsing of the answer. - :param allow_none_as_answer: Allow empty answers? 
- :return: The parsed reply. - """ - final_answer = None - - while True: - answer = query_string(question, '') - - # If the reply is empty - if not answer: - if not allow_none_as_answer: - continue - # Otherwise, try to parse it - else: - try: - if reply_type == int: - final_answer = int(answer) - elif reply_type == float: - final_answer = float(answer) - elif reply_type == datetime.datetime: - final_answer = dateutil.parser.parse(answer) - else: - raise ValueError - # If it is not parsable... - except ValueError: - sys.stdout.write(f'The given value could not be parsed. Type expected: {reply_type}\n') - # If the timestamp could not have been parsed, - # ask again the same question. - continue - - if query_yes_no(f'{final_answer} was parsed. Is it correct?', default='yes'): - break - return final_answer - - -def query_yes_no(question, default='yes'): - """Ask a yes/no question via input() and return their answer. - - "question" is a string that is presented to the user. - "default" is the presumed answer if the user just hits . - It must be "yes" (the default), "no" or None (meaning - an answer is required of the user). - - The "answer" return value is True for "yes" or False for "no". - """ - valid = {'yes': True, 'y': True, 'ye': True, 'no': False, 'n': False} - if default is None: - prompt = ' [y/n] ' - elif default == 'yes': - prompt = ' [Y/n] ' - elif default == 'no': - prompt = ' [y/N] ' - else: - raise ValueError(f"invalid default answer: '{default}'") - - while True: - choice = input(question + prompt).lower() - if default is not None and not choice: - return valid[default] - - if choice in valid: - return valid[choice] - - sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n") - - -def query_string(question, default): - """ - Asks a question (with the option to have a default, predefined answer, - and depending on the default answer and the answer of the user the - following options are available: - - If the user replies (with a non empty answer), then his answer is - returned. - - If the default answer is None then the user has to reply with a non-empty - answer. - - If the default answer is not None, then it is returned if the user gives - an empty answer. In the case of empty default answer and empty reply from - the user, None is returned. - :param question: The question that we want to ask the user. - :param default: The default answer (if there is any) to the question asked. - :return: The returned reply. - """ - - if default is None or not default: - prompt = '' - else: - prompt = f' [{default}]' - - while True: - reply = input(question + prompt) - if default is not None and not reply: - # If the default answer is an empty string. 
- if not default: - return None - - return default - - if reply: - return reply - - sys.stdout.write('Please provide a non empty answer.\n') diff --git a/aiida/manage/configuration/__init__.py b/aiida/manage/configuration/__init__.py index 568fa9992e..a22620573e 100644 --- a/aiida/manage/configuration/__init__.py +++ b/aiida/manage/configuration/__init__.py @@ -50,16 +50,12 @@ def load_profile(profile=None): if PROFILE and (profile is None or PROFILE.name is profile): return PROFILE - profile = get_config().get_profile(profile) + PROFILE = get_config().get_profile(profile) - if BACKEND_UUID is not None and BACKEND_UUID != profile.uuid: + if BACKEND_UUID is not None and BACKEND_UUID != PROFILE.uuid: # Once the switching of profiles with different backends becomes possible, the backend has to be reset properly raise InvalidOperation('cannot switch profile because backend of another profile is already loaded') - # Set the global variable and make sure the repository is configured - PROFILE = profile - PROFILE.configure_repository() - # Reconfigure the logging to make sure that profile specific logging configuration options are taken into account. # Note that we do not configure with `with_orm=True` because that will force the backend to be loaded. This should # instead be done lazily in `Manager._load_backend`. @@ -98,8 +94,8 @@ def load_config(create=False): try: config = Config.from_file(filepath) - except ValueError: - raise exceptions.ConfigurationError(f'configuration file {filepath} contains invalid JSON') + except ValueError as exc: + raise exceptions.ConfigurationError(f'configuration file {filepath} contains invalid JSON') from exc _merge_deprecated_cache_yaml(config, filepath) @@ -274,4 +270,4 @@ def load_documentation_profile(): config = {'default_profile': profile_name, 'profiles': {profile_name: profile}} PROFILE = Profile(profile_name, profile, from_config=True) CONFIG = Config(handle.name, config) - get_manager()._load_backend(schema_check=False) # pylint: disable=protected-access + get_manager()._load_backend(schema_check=False, repository_check=False) # pylint: disable=protected-access diff --git a/aiida/manage/configuration/profile.py b/aiida/manage/configuration/profile.py index 593302116a..7b4265201f 100644 --- a/aiida/manage/configuration/profile.py +++ b/aiida/manage/configuration/profile.py @@ -10,12 +10,17 @@ """AiiDA profile related code""" import collections import os +from typing import TYPE_CHECKING from aiida.common import exceptions +from aiida.common.lang import classproperty from .options import parse_option from .settings import DAEMON_DIR, DAEMON_LOG_DIR +if TYPE_CHECKING: + from disk_objectstore import Container + __all__ = ('Profile',) CIRCUS_PID_FILE_TEMPLATE = os.path.join(DAEMON_DIR, 'circus-{}.pid') @@ -74,6 +79,18 @@ class Profile: # pylint: disable=too-many-public-methods KEY_REPOSITORY_URI: 'repository_uri', } + @classproperty + def defaults(cls): # pylint: disable=no-self-use,no-self-argument + """Return the dictionary of default values for profile settings.""" + return { + 'repository': { + 'pack_size_target': 4 * 1024 * 1024 * 1024, + 'loose_prefix_len': 2, + 'hash_type': 'sha256', + 'compression_algorithm': 'zlib+1' + } + } + @classmethod def contains_unknown_keys(cls, dictionary): """Return whether the profile dictionary contains any unsupported keys. 
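The ``get_repository_container`` method added in the next hunk consumes this ``defaults['repository']`` dictionary when a profile's disk-objectstore container is first initialised. A minimal sketch of that flow, using only the ``disk_objectstore`` calls that appear in this change set (the filesystem path is made up for the example; a real profile resolves it from ``Profile.repository_path``):

import os

from disk_objectstore import Container

# Hypothetical location used for illustration only.
repository_path = '/tmp/aiida_repository_example'
container = Container(os.path.join(repository_path, 'container'))

if not container.is_initialised:
    # The keyword arguments mirror ``Profile.defaults['repository']`` defined above.
    container.init_container(
        clear=True,
        pack_size_target=4 * 1024 * 1024 * 1024,
        loose_prefix_len=2,
        hash_type='sha256',
        compression_algorithm='zlib+1',
    )

# The container UUID is what ties the file repository to a particular database.
print(container.container_id)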
@@ -110,6 +127,21 @@ def __init__(self, name, attributes, from_config=False): # Currently, whether a profile is a test profile is solely determined by its name starting with 'test_' self._test_profile = bool(self.name.startswith('test_')) + def get_repository_container(self) -> 'Container': + """Return the container of the profile's file repository. + + :return: the profile's file repository container. + """ + from disk_objectstore import Container + + filepath = os.path.join(self.repository_path, 'container') + container = Container(filepath) + + if not container.is_initialised: + container.init_container(clear=True, **self.defaults['repository']) # pylint: disable=unsubscriptable-object + + return container + @property def uuid(self): """Return the profile uuid. @@ -351,18 +383,6 @@ def get_rmq_url(self): **self.broker_parameters ) - def configure_repository(self): - """Validates the configured repository and in the case of a file system repo makes sure the folder exists.""" - import errno - - try: - os.makedirs(self.repository_path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise exceptions.ConfigurationError( - f'could not create the configured repository `{self.repository_path}`: {str(exception)}' - ) - @property def filepaths(self): """Return the filepaths used by this profile. diff --git a/aiida/manage/database/integrity/duplicate_uuid.py b/aiida/manage/database/integrity/duplicate_uuid.py index de581b2341..503efd3ebb 100644 --- a/aiida/manage/database/integrity/duplicate_uuid.py +++ b/aiida/manage/database/integrity/duplicate_uuid.py @@ -71,7 +71,6 @@ def deduplicate_uuids(table=None, dry_run=True): from collections import defaultdict from aiida.common.utils import get_new_uuid - from aiida.orm.utils._repository import Repository if table not in TABLES_UUID_DEDUPLICATION: raise ValueError(f"invalid table {table}: choose from {', '.join(TABLES_UUID_DEDUPLICATION)}") @@ -102,10 +101,6 @@ def deduplicate_uuids(table=None, dry_run=True): messages.append(f'would update UUID of {table} row<{pk}> from {uuid_ref} to {uuid_new}') else: messages.append(f'updated UUID of {table} row<{pk}> from {uuid_ref} to {uuid_new}') - repo_ref = Repository(uuid_ref, True, 'path') - repo_new = Repository(uuid_new, False, 'path') - repo_new.put_object_from_tree(repo_ref._get_base_folder().abspath) # pylint: disable=protected-access - repo_new.store() if not dry_run: apply_new_uuid_mapping(table, mapping_new_uuid) diff --git a/aiida/manage/manager.py b/aiida/manage/manager.py index 8f8bdfd1f1..0fe9f85343 100644 --- a/aiida/manage/manager.py +++ b/aiida/manage/manager.py @@ -99,17 +99,18 @@ def unload_backend(self) -> None: manager.reset_backend_environment() self._backend = None - def _load_backend(self, schema_check: bool = True) -> 'Backend': + def _load_backend(self, schema_check: bool = True, repository_check: bool = True) -> 'Backend': """Load the backend for the currently configured profile and return it. .. note:: this will reconstruct the `Backend` instance in `self._backend` so the preferred method to load the backend is to call `get_backend` which will create it only when not yet instantiated. - :param schema_check: force a database schema check if the database environment has not yet been loaded - :return: the database backend - + :param schema_check: force a database schema check if the database environment has not yet been loaded. + :param repository_check: force a check that the database is associated with the repository that is configured + for the current profile. 
+ :return: the database backend. """ - from aiida.backends import BACKEND_DJANGO, BACKEND_SQLA + from aiida.backends import BACKEND_DJANGO, BACKEND_SQLA, get_backend_manager from aiida.common import ConfigurationError, InvalidOperation from aiida.common.log import configure_logging from aiida.manage import configuration @@ -124,13 +125,30 @@ def _load_backend(self, schema_check: bool = True) -> 'Backend': if configuration.BACKEND_UUID is not None and configuration.BACKEND_UUID != profile.uuid: raise InvalidOperation('cannot load backend because backend of another profile is already loaded') + backend_manager = get_backend_manager(profile.database_backend) + # Do NOT reload the backend environment if already loaded, simply reload the backend instance after if configuration.BACKEND_UUID is None: - from aiida.backends import get_backend_manager - backend_manager = get_backend_manager(profile.database_backend) backend_manager.load_backend_environment(profile, validate_schema=schema_check) configuration.BACKEND_UUID = profile.uuid + # Perform the check on the repository compatibility. Since this is new functionality and the stability is not + # yet known, we issue a warning in the case the repo and database are incompatible. In the future this might + # then become an exception once we have verified that it is working reliably. + if repository_check and not profile.is_test_profile: + repository_uuid_config = profile.get_repository_container().container_id + repository_uuid_database = backend_manager.get_repository_uuid() + + from aiida.cmdline.utils import echo + if repository_uuid_config != repository_uuid_database: + echo.echo_warning( + f'the database and repository configured for profile `{profile.name}` are incompatible:\n\n' + f'Repository UUID in profile: {repository_uuid_config}\n' + f'Repository UUID in database: {repository_uuid_database}\n\n' + 'Using a database with an incompatible repository will prevent AiiDA from functioning properly.\n' + 'Please make sure that the configuration of your profile is correct.\n' + ) + backend_type = profile.database_backend # Can only import the backend classes after the backend has been loaded diff --git a/aiida/orm/implementation/nodes.py b/aiida/orm/implementation/nodes.py index 09f2b60132..2a07ff7393 100644 --- a/aiida/orm/implementation/nodes.py +++ b/aiida/orm/implementation/nodes.py @@ -96,6 +96,22 @@ def description(self, value): """ self._dbmodel.description = value + @property + def repository_metadata(self): + """Return the node repository metadata. + + :return: the repository metadata + """ + return self._dbmodel.repository_metadata + + @repository_metadata.setter + def repository_metadata(self, value): + """Set the repository metadata. + + :param value: the new value to set + """ + self._dbmodel.repository_metadata = value + @abc.abstractproperty def computer(self): """Return the computer of this node. 
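The repository compatibility check added to ``Manager._load_backend`` above can also be reproduced by hand, which may help when interpreting the new warning. A hedged sketch, assuming a configured default profile whose backend environment can be loaded, and using only calls that appear in this change set:

from aiida import load_profile
from aiida.backends import get_backend_manager
from aiida.manage.configuration import get_profile
from aiida.manage.manager import get_manager

load_profile()
get_manager().get_backend()  # make sure the backend environment is loaded

profile = get_profile()
backend_manager = get_backend_manager(profile.database_backend)

# UUID of the container configured for the profile versus the UUID recorded in the database.
uuid_config = profile.get_repository_container().container_id
uuid_database = backend_manager.get_repository_uuid()

if uuid_config != uuid_database:
    print(f'profile `{profile.name}` points at a repository that does not belong to its database')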
diff --git a/aiida/orm/nodes/data/array/array.py b/aiida/orm/nodes/data/array/array.py index b4d00079d4..3d1553e4c5 100644 --- a/aiida/orm/nodes/data/array/array.py +++ b/aiida/orm/nodes/data/array/array.py @@ -169,7 +169,7 @@ def set_array(self, name, array): handle.seek(0) # Write the numpy array to the repository, keeping the byte representation - self.put_object_from_filelike(handle, f'{name}.npy', mode='wb', encoding=None) + self.put_object_from_filelike(handle, f'{name}.npy') # Store the array name and shape for querying purposes self.set_attribute(f'{self.array_prefix}{name}', list(array.shape)) diff --git a/aiida/orm/nodes/data/cif.py b/aiida/orm/nodes/data/cif.py index 6873a94f10..b59e4c4764 100644 --- a/aiida/orm/nodes/data/cif.py +++ b/aiida/orm/nodes/data/cif.py @@ -418,9 +418,7 @@ def get_ase(self, **kwargs): if not kwargs and self._ase: return self.ase with self.open() as handle: - cif = CifData.read_cif(handle, **kwargs) - - return cif + return CifData.read_cif(handle, **kwargs) def set_ase(self, aseatoms): """ @@ -473,7 +471,8 @@ def set_values(self, values): with Capturing(): tmpf.write(values.WriteOut()) tmpf.flush() - self.set_file(tmpf.name) + tmpf.seek(0) + self.set_file(tmpf) self._values = values @@ -785,10 +784,6 @@ def _prepare_cif(self, **kwargs): # pylint: disable=unused-argument If parsed values are present, a CIF string is created and written to file. If no parsed values are present, the CIF string is read from file. """ - if self._values and not self.is_stored: - # Note: this overwrites the CIF file! - self.set_values(self._values) - with self.open(mode='rb') as handle: return handle.read(), {} diff --git a/aiida/orm/nodes/data/code.py b/aiida/orm/nodes/data/code.py index d96924a5cf..b3bade861e 100644 --- a/aiida/orm/nodes/data/code.py +++ b/aiida/orm/nodes/data/code.py @@ -93,7 +93,7 @@ def set_files(self, files): for filename in files: if os.path.isfile(filename): with open(filename, 'rb') as handle: - self.put_object_from_filelike(handle, os.path.split(filename)[1], 'wb', encoding=None) + self.put_object_from_filelike(handle, os.path.split(filename)[1]) def __str__(self): local_str = 'Local' if self.is_local() else 'Remote' diff --git a/aiida/orm/nodes/data/data.py b/aiida/orm/nodes/data/data.py index 872192b48e..dd6c7af6c0 100644 --- a/aiida/orm/nodes/data/data.py +++ b/aiida/orm/nodes/data/data.py @@ -8,7 +8,6 @@ # For further information please visit http://www.aiida.net # ########################################################################### """Module with `Node` sub class `Data` to be used as a base class for data structures.""" - from aiida.common import exceptions from aiida.common.links import LinkType from aiida.common.lang import override @@ -49,26 +48,23 @@ def __copy__(self): def __deepcopy__(self, memo): """ - Create a clone of the Data node by pipiong through to the clone method and return the result. + Create a clone of the Data node by piping through to the clone method and return the result. :returns: an unstored clone of this Data node """ return self.clone() def clone(self): - """ - Create a clone of the Data node. + """Create a clone of the Data node. 
:returns: an unstored clone of this Data node """ - # pylint: disable=no-member import copy backend_clone = self.backend_entity.clone() clone = self.__class__.from_backend_entity(backend_clone) - clone.reset_attributes(copy.deepcopy(self.attributes)) - clone.put_object_from_tree(self._repository._get_base_folder().abspath) # pylint: disable=protected-access + clone._repository.clone(self._repository) # pylint: disable=protected-access return clone diff --git a/aiida/orm/nodes/data/singlefile.py b/aiida/orm/nodes/data/singlefile.py index eecc0484d3..c647ca93c5 100644 --- a/aiida/orm/nodes/data/singlefile.py +++ b/aiida/orm/nodes/data/singlefile.py @@ -8,6 +8,7 @@ # For further information please visit http://www.aiida.net # ########################################################################### """Data class that can be used to store a single file in its repository.""" +import contextlib import inspect import os import warnings @@ -57,34 +58,19 @@ def filename(self): """ return self.get_attribute('filename') - def open(self, path=None, mode='r', key=None): + @contextlib.contextmanager + def open(self, path=None, mode='r'): """Return an open file handle to the content of this data node. - .. deprecated:: 1.4.0 - Keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead. - - .. deprecated:: 1.4.0 - Starting from `v2.0.0` this will raise if not used in a context manager. - :param path: the relative path of the object within the repository. - :param key: optional key within the repository, by default is the `filename` set in the attributes :param mode: the mode with which to open the file handle (default: read mode) :return: a file handle """ - from ..node import WarnWhenNotEntered - if key is not None: - if path is not None: - raise ValueError('cannot specify both `path` and `key`.') - warnings.warn( - 'keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead.', - AiidaDeprecationWarning - ) # pylint: disable=no-member - path = key - if path is None: path = self.filename - return WarnWhenNotEntered(self._repository.open(path, mode=mode), repr(self)) + with super().open(path, mode=mode) as handle: + yield handle def get_content(self): """Return the content of the single file stored for this data node. @@ -130,7 +116,7 @@ def set_file(self, file, filename=None): pass if is_filelike: - self.put_object_from_filelike(file, key, mode='wb') + self.put_object_from_filelike(file, key) else: self.put_object_from_file(file, key) diff --git a/aiida/orm/nodes/data/upf.py b/aiida/orm/nodes/data/upf.py index 0c2a481b75..7617138d6b 100644 --- a/aiida/orm/nodes/data/upf.py +++ b/aiida/orm/nodes/data/upf.py @@ -185,6 +185,7 @@ def parse_upf(fname, check_filename=True): If check_filename is True, raise a ParsingError exception if the filename does not start with the element name. 
""" + # pylint: disable=too-many-branches import os from aiida.common.exceptions import ParsingError @@ -195,10 +196,13 @@ def parse_upf(fname, check_filename=True): try: upf_contents = fname.read() - fname = fname.name except AttributeError: - with open(fname, encoding='utf8') as handle: + with open(fname) as handle: upf_contents = handle.read() + else: + if check_filename: + raise ValueError('cannot use filelike objects when `check_filename=True`, use a filepath instead.') + fname = 'file.txt' match = REGEX_UPF_VERSION.search(upf_contents) if match: @@ -305,8 +309,13 @@ def store(self, *args, **kwargs): # pylint: disable=signature-differs if self.is_stored: return self + # Do not check the filename because it will fail since we are passing in a handle, which doesn't have a filename + # and so `parse_upf` will raise. The reason we have to pass in a handle is because this is the repository does + # not allow to get an absolute filepath. Anyway, the filename was already checked in `set_file` when the file + # was set for the first time. All the logic in this method is duplicated in `store` and `_validate` and badly + # needs to be refactored, but that is for another time. with self.open(mode='r') as handle: - parsed_data = parse_upf(handle) + parsed_data = parse_upf(handle, check_filename=False) # Open in binary mode which is required for generating the md5 checksum with self.open(mode='rb') as handle: @@ -397,8 +406,13 @@ def _validate(self): super()._validate() + # Do not check the filename because it will fail since we are passing in a handle, which doesn't have a filename + # and so `parse_upf` will raise. The reason we have to pass in a handle is because this is the repository does + # not allow to get an absolute filepath. Anyway, the filename was already checked in `set_file` when the file + # was set for the first time. All the logic in this method is duplicated in `store` and `_validate` and badly + # needs to be refactored, but that is for another time. 
with self.open(mode='r') as handle: - parsed_data = parse_upf(handle) + parsed_data = parse_upf(handle, check_filename=False) # Open in binary mode which is required for generating the md5 checksum with self.open(mode='rb') as handle: diff --git a/aiida/orm/nodes/node.py b/aiida/orm/nodes/node.py index a30a1d1135..67ffb749f9 100644 --- a/aiida/orm/nodes/node.py +++ b/aiida/orm/nodes/node.py @@ -9,12 +9,13 @@ ########################################################################### # pylint: disable=too-many-lines,too-many-arguments """Package for node ORM classes.""" +import copy import datetime import importlib from logging import Logger +import typing import warnings -import traceback -from typing import Any, Dict, IO, Iterator, List, Optional, Sequence, Tuple, Type, Union +from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Type, Union from typing import TYPE_CHECKING from uuid import UUID @@ -26,7 +27,6 @@ from aiida.common.warnings import AiidaDeprecationWarning from aiida.manage.manager import get_manager from aiida.orm.utils.links import LinkManager, LinkTriple -from aiida.orm.utils._repository import Repository from aiida.orm.utils.node import AbstractNodeMeta from aiida.orm import autogroup @@ -36,6 +36,7 @@ from ..entities import Collection as EntityCollection from ..querybuilder import QueryBuilder from ..users import User +from .repository import NodeRepositoryMixin if TYPE_CHECKING: from aiida.repository import File @@ -47,62 +48,7 @@ _NO_DEFAULT = tuple() # type: ignore[var-annotated] -class WarnWhenNotEntered: - """Temporary wrapper to warn when `Node.open` is called outside of a context manager.""" - - def __init__(self, fileobj: Union[IO[str], IO[bytes]], name: str) -> None: - self._fileobj: Union[IO[str], IO[bytes]] = fileobj - self._name = name - self._was_entered = False - - def _warn_if_not_entered(self, method) -> None: - """Fire a warning if the object wrapper has not yet been entered.""" - if not self._was_entered: - msg = f'\nThe method `{method}` was called on the return value of `{self._name}.open()`' + \ - ' outside of a context manager.\n' + \ - 'Please wrap this call inside `with .open(): ...` to silence this warning. ' + \ - 'This will raise an exception, starting from `aiida-core==2.0.0`.\n' - - try: - caller = traceback.format_stack()[-3] - except Exception: # pylint: disable=broad-except - msg += 'Could not determine the line of code responsible for triggering this warning.' 
- else: - msg += f'The offending call comes from:\n{caller}' - - warnings.warn(msg, AiidaDeprecationWarning) # pylint: disable=no-member - - def __enter__(self) -> Union[IO[str], IO[bytes]]: - self._was_entered = True - return self._fileobj.__enter__() - - def __exit__(self, *args: Any) -> None: - self._fileobj.__exit__(*args) - - def __getattr__(self, key: str): - if key == '_fileobj': - return self._fileobj - return getattr(self._fileobj, key) - - def __del__(self) -> None: - self._warn_if_not_entered('del') - - def __iter__(self) -> Iterator[Union[str, bytes]]: - return self._fileobj.__iter__() - - def __next__(self) -> Union[str, bytes]: - return self._fileobj.__next__() - - def read(self, *args: Any, **kwargs: Any) -> Union[str, bytes]: - self._warn_if_not_entered('read') - return self._fileobj.read(*args, **kwargs) - - def close(self, *args: Any, **kwargs: Any) -> None: - self._warn_if_not_entered('close') - return self._fileobj.close(*args, **kwargs) # type: ignore[call-arg] - - -class Node(Entity, EntityAttributesMixin, EntityExtrasMixin, metaclass=AbstractNodeMeta): +class Node(Entity, NodeRepositoryMixin, EntityAttributesMixin, EntityExtrasMixin, metaclass=AbstractNodeMeta): """ Base class for all nodes in AiiDA. @@ -139,9 +85,7 @@ def delete(self, node_id: int) -> None: if node.get_outgoing().all(): raise exceptions.InvalidOperation(f'cannot delete Node<{node.pk}> because it has outgoing links') - repository = node._repository # pylint: disable=protected-access self._backend.nodes.delete(node_id) - repository.erase(force=True) # This will be set by the metaclass call _logger: Optional[Logger] = None @@ -156,16 +100,12 @@ def delete(self, node_id: int) -> None: # Flag that determines whether the class can be cached. _cachable = False - # Base path within the repository where to put objects by default - _repository_base_path = 'path' - # Flag that determines whether the class can be stored. _storable = False _unstorable_message = 'only Data, WorkflowNode, CalculationNode or their subclasses can be stored' # These are to be initialized in the `initialization` method _incoming_cache: Optional[List[LinkTriple]] = None - _repository: Optional[Repository] = None @classmethod def from_backend_entity(cls, backend_entity: 'BackendNode') -> 'Node': @@ -237,9 +177,6 @@ def initialize(self) -> None: # A cache of incoming links represented as a list of LinkTriples instances self._incoming_cache = list() - # Calls the initialisation from the RepositoryMixin - self._repository = Repository(uuid=self.uuid, is_stored=self.is_stored, base_path=self._repository_base_path) - def _validate(self) -> bool: """Check if the attributes and files retrieved from the database are valid. @@ -345,6 +282,22 @@ def description(self, value: str) -> None: """ self.backend_entity.description = value + @property + def repository_metadata(self) -> typing.Dict: + """Return the node repository metadata. + + :return: the repository metadata + """ + return self.backend_entity.repository_metadata or {} + + @repository_metadata.setter + def repository_metadata(self, value): + """Set the repository metadata. + + :param value: the new value to set + """ + self.backend_entity.repository_metadata = value + @property def computer(self) -> Optional[Computer]: """Return the computer of this node. 
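For illustration, the behaviour of the new ``repository_metadata`` property on ``Node``: while the node is unstored the backend value is still ``None`` and the property falls back to an empty dictionary; only once the node is stored does it contain the serialized file hierarchy. A minimal sketch, assuming a loaded profile (the file name and content are arbitrary):

import io

from aiida import load_profile, orm

load_profile()

node = orm.Data()
assert node.repository_metadata == {}  # backend value is ``None``, the property returns ``{}``

node.put_object_from_filelike(io.BytesIO(b'some content'), 'example.txt')
node.store()

# After storing, the property holds the metadata that was serialized from the repository.
print(node.repository_metadata)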
@@ -410,341 +363,6 @@ def mtime(self) -> datetime.datetime: """ return self.backend_entity.mtime - def list_objects(self, path: Optional[str] = None, key: Optional[str] = None) -> List['File']: - """Return a list of the objects contained in this repository, optionally in the given sub directory. - - .. deprecated:: 1.4.0 - Keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead. - - :param path: the relative path of the object within the repository. - :param key: fully qualified identifier for the object within the repository - :return: a list of `File` named tuples representing the objects present in directory with the given path - :raises FileNotFoundError: if the `path` does not exist in the repository of this node - """ - assert self._repository is not None, 'repository not initialised' - - if key is not None: - if path is not None: - raise ValueError('cannot specify both `path` and `key`.') - warnings.warn( - 'keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead.', - AiidaDeprecationWarning - ) # pylint: disable=no-member - path = key - - return self._repository.list_objects(path) - - def list_object_names(self, path: Optional[str] = None, key: Optional[str] = None) -> List[str]: - """Return a list of the object names contained in this repository, optionally in the given sub directory. - - .. deprecated:: 1.4.0 - Keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead. - - :param path: the relative path of the object within the repository. - :param key: fully qualified identifier for the object within the repository - - """ - assert self._repository is not None, 'repository not initialised' - - if key is not None: - if path is not None: - raise ValueError('cannot specify both `path` and `key`.') - warnings.warn( - 'keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead.', - AiidaDeprecationWarning - ) # pylint: disable=no-member - path = key - - return self._repository.list_object_names(path) - - def open(self, path: Optional[str] = None, mode: str = 'r', key: Optional[str] = None) -> WarnWhenNotEntered: - """Open a file handle to the object with the given path. - - .. deprecated:: 1.4.0 - Keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead. - - .. deprecated:: 1.4.0 - Starting from `v2.0.0` this will raise if not used in a context manager. - - :param path: the relative path of the object within the repository. - :param key: fully qualified identifier for the object within the repository - :param mode: the mode under which to open the handle - """ - assert self._repository is not None, 'repository not initialised' - - if key is not None: - if path is not None: - raise ValueError('cannot specify both `path` and `key`.') - warnings.warn( - 'keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead.', - AiidaDeprecationWarning - ) # pylint: disable=no-member - path = key - - if path is None: - raise TypeError("open() missing 1 required positional argument: 'path'") - - if mode not in ['r', 'rb']: - warnings.warn("from v2.0 only the modes 'r' and 'rb' will be accepted", AiidaDeprecationWarning) # pylint: disable=no-member - - return WarnWhenNotEntered(self._repository.open(path, mode), repr(self)) - - def get_object(self, path: Optional[str] = None, key: Optional[str] = None) -> 'File': - """Return the object with the given path. - - .. deprecated:: 1.4.0 - Keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead. 
- - :param path: the relative path of the object within the repository. - :param key: fully qualified identifier for the object within the repository - :return: a `File` named tuple - """ - assert self._repository is not None, 'repository not initialised' - - if key is not None: - if path is not None: - raise ValueError('cannot specify both `path` and `key`.') - warnings.warn( - 'keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead.', - AiidaDeprecationWarning - ) # pylint: disable=no-member - path = key - - if path is None: - raise TypeError("get_object() missing 1 required positional argument: 'path'") - - return self._repository.get_object(path) - - def get_object_content(self, - path: Optional[str] = None, - mode: str = 'r', - key: Optional[str] = None) -> Union[str, bytes]: - """Return the content of a object with the given path. - - .. deprecated:: 1.4.0 - Keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead. - - :param path: the relative path of the object within the repository. - :param key: fully qualified identifier for the object within the repository - """ - assert self._repository is not None, 'repository not initialised' - - if key is not None: - if path is not None: - raise ValueError('cannot specify both `path` and `key`.') - warnings.warn( - 'keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead.', - AiidaDeprecationWarning - ) # pylint: disable=no-member - path = key - - if path is None: - raise TypeError("get_object_content() missing 1 required positional argument: 'path'") - - if mode not in ['r', 'rb']: - warnings.warn("from v2.0 only the modes 'r' and 'rb' will be accepted", AiidaDeprecationWarning) # pylint: disable=no-member - - return self._repository.get_object_content(path, mode) - - def put_object_from_tree( - self, - filepath: str, - path: Optional[str] = None, - contents_only: bool = True, - force: bool = False, - key: Optional[str] = None - ) -> None: - """Store a new object under `path` with the contents of the directory located at `filepath` on this file system. - - .. warning:: If the repository belongs to a stored node, a `ModificationNotAllowed` exception will be raised. - This check can be avoided by using the `force` flag, but this should be used with extreme caution! - - .. deprecated:: 1.4.0 - First positional argument `path` has been deprecated and renamed to `filepath`. - - .. deprecated:: 1.4.0 - Keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead. - - .. deprecated:: 1.4.0 - Keyword `force` is deprecated and will be removed in `v2.0.0`. - - .. deprecated:: 1.4.0 - Keyword `contents_only` is deprecated and will be removed in `v2.0.0`. - - :param filepath: absolute path of directory whose contents to copy to the repository - :param path: the relative path of the object within the repository. - :param key: fully qualified identifier for the object within the repository - :param contents_only: boolean, if True, omit the top level directory of the path and only copy its contents. 
- :param force: boolean, if True, will skip the mutability check - :raises aiida.common.ModificationNotAllowed: if repository is immutable and `force=False` - """ - assert self._repository is not None, 'repository not initialised' - - if force: - warnings.warn('the `force` keyword is deprecated and will be removed in `v2.0.0`.', AiidaDeprecationWarning) # pylint: disable=no-member - - if contents_only is False: - warnings.warn( - 'the `contents_only` keyword is deprecated and will be removed in `v2.0.0`.', AiidaDeprecationWarning - ) # pylint: disable=no-member - - if key is not None: - if path is not None: - raise ValueError('cannot specify both `path` and `key`.') - warnings.warn( - 'keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead.', - AiidaDeprecationWarning - ) # pylint: disable=no-member - path = key - - self._repository.put_object_from_tree(filepath, path, contents_only, force) - - def put_object_from_file( - self, - filepath: str, - path: Optional[str] = None, - mode: Optional[str] = None, - encoding: Optional[str] = None, - force: bool = False, - key: Optional[str] = None - ) -> None: - """Store a new object under `path` with contents of the file located at `filepath` on this file system. - - .. warning:: If the repository belongs to a stored node, a `ModificationNotAllowed` exception will be raised. - This check can be avoided by using the `force` flag, but this should be used with extreme caution! - - .. deprecated:: 1.4.0 - First positional argument `path` has been deprecated and renamed to `filepath`. - - .. deprecated:: 1.4.0 - Keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead. - - .. deprecated:: 1.4.0 - Keyword `force` is deprecated and will be removed in `v2.0.0`. - - :param filepath: absolute path of file whose contents to copy to the repository - :param path: the relative path where to store the object in the repository. - :param key: fully qualified identifier for the object within the repository - :param mode: the file mode with which the object will be written - Deprecated: will be removed in `v2.0.0` - :param encoding: the file encoding with which the object will be written - Deprecated: will be removed in `v2.0.0` - :param force: boolean, if True, will skip the mutability check - :raises aiida.common.ModificationNotAllowed: if repository is immutable and `force=False` - """ - assert self._repository is not None, 'repository not initialised' - - # Note that the defaults of `mode` and `encoding` had to be change to `None` from `w` and `utf-8` resptively, in - # order to detect when they were being passed such that the deprecation warning can be emitted. The defaults did - # not make sense and so ignoring them is justified, since the side-effect of this function, a file being copied, - # will continue working the same. - if force: - warnings.warn('the `force` keyword is deprecated and will be removed in `v2.0.0`.', AiidaDeprecationWarning) # pylint: disable=no-member - - if mode is not None: - warnings.warn('the `mode` argument is deprecated and will be removed in `v2.0.0`.', AiidaDeprecationWarning) # pylint: disable=no-member - - if encoding is not None: - warnings.warn( # pylint: disable=no-member - 'the `encoding` argument is deprecated and will be removed in `v2.0.0`', AiidaDeprecationWarning - ) - - if key is not None: - if path is not None: - raise ValueError('cannot specify both `path` and `key`.') - warnings.warn( - 'keyword `key` is deprecated and will be removed in `v2.0.0`. 
Use `path` instead.', - AiidaDeprecationWarning - ) # pylint: disable=no-member - path = key - - if path is None: - raise TypeError("put_object_from_file() missing 1 required positional argument: 'path'") - - self._repository.put_object_from_file(filepath, path, mode, encoding, force) - - def put_object_from_filelike( - self, - handle: IO[Any], - path: Optional[str] = None, - mode: str = 'w', - encoding: str = 'utf8', - force: bool = False, - key: Optional[str] = None - ) -> None: - """Store a new object under `path` with contents of filelike object `handle`. - - .. warning:: If the repository belongs to a stored node, a `ModificationNotAllowed` exception will be raised. - This check can be avoided by using the `force` flag, but this should be used with extreme caution! - - .. deprecated:: 1.4.0 - Keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead. - - .. deprecated:: 1.4.0 - Keyword `force` is deprecated and will be removed in `v2.0.0`. - - :param handle: filelike object with the content to be stored - :param path: the relative path where to store the object in the repository. - :param key: fully qualified identifier for the object within the repository - :param mode: the file mode with which the object will be written - :param encoding: the file encoding with which the object will be written - :param force: boolean, if True, will skip the mutability check - :raises aiida.common.ModificationNotAllowed: if repository is immutable and `force=False` - """ - assert self._repository is not None, 'repository not initialised' - - if force: - warnings.warn('the `force` keyword is deprecated and will be removed in `v2.0.0`.', AiidaDeprecationWarning) # pylint: disable=no-member - - if key is not None: - if path is not None: - raise ValueError('cannot specify both `path` and `key`.') - warnings.warn( - 'keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead.', - AiidaDeprecationWarning - ) # pylint: disable=no-member - path = key - - if path is None: - raise TypeError("put_object_from_filelike() missing 1 required positional argument: 'path'") - - self._repository.put_object_from_filelike(handle, path, mode, encoding, force) - - def delete_object(self, path: Optional[str] = None, force: bool = False, key: Optional[str] = None) -> None: - """Delete the object from the repository. - - .. warning:: If the repository belongs to a stored node, a `ModificationNotAllowed` exception will be raised. - This check can be avoided by using the `force` flag, but this should be used with extreme caution! - - .. deprecated:: 1.4.0 - Keyword `key` is deprecated and will be removed in `v2.0.0`. Use `path` instead. - - .. deprecated:: 1.4.0 - Keyword `force` is deprecated and will be removed in `v2.0.0`. - - :param key: fully qualified identifier for the object within the repository - :param force: boolean, if True, will skip the mutability check - :raises aiida.common.ModificationNotAllowed: if repository is immutable and `force=False` - """ - assert self._repository is not None, 'repository not initialised' - - if force: - warnings.warn('the `force` keyword is deprecated and will be removed in `v2.0.0`.', AiidaDeprecationWarning) # pylint: disable=no-member - - if key is not None: - if path is not None: - raise ValueError('cannot specify both `path` and `key`.') - warnings.warn( - 'keyword `key` is deprecated and will be removed in `v2.0.0`. 
Use `path` instead.', - AiidaDeprecationWarning - ) # pylint: disable=no-member - path = key - - if path is None: - raise TypeError("delete_object() missing 1 required positional argument: 'path'") - - self._repository.delete_object(path, force) - def add_comment(self, content: str, user: Optional[User] = None) -> Comment: """Add a new comment. @@ -1087,20 +705,23 @@ def _store(self, with_transaction: bool = True, clean: bool = True) -> 'Node': :param with_transaction: if False, do not use a transaction because the caller will already have opened one. :param clean: boolean, if True, will clean the attributes and extras before attempting to store """ - assert self._repository is not None, 'repository not initialised' + from aiida.repository import Repository + from aiida.repository.backend import DiskObjectStoreRepositoryBackend, SandboxRepositoryBackend - # First store the repository folder such that if this fails, there won't be an incomplete node in the database. - # On the flipside, in the case that storing the node does fail, the repository will now have an orphaned node - # directory which will have to be cleaned manually sometime. - self._repository.store() + # Only if the backend repository is a sandbox do we have to clone its contents to the permanent repository. + if isinstance(self._repository.backend, SandboxRepositoryBackend): + profile = get_manager().get_profile() + assert profile is not None, 'profile not loaded' + backend = DiskObjectStoreRepositoryBackend(container=profile.get_repository_container()) + repository = Repository(backend=backend) + repository.clone(self._repository) + # Swap the sandbox repository for the new permanent repository instance which should delete the sandbox + self._repository_instance = repository - try: - links = self._incoming_cache - self._backend_entity.store(links, with_transaction=with_transaction, clean=clean) - except Exception: - # I put back the files in the sandbox folder since the transaction did not succeed - self._repository.restore() - raise + self.repository_metadata = self._repository.serialize() + + links = self._incoming_cache + self._backend_entity.store(links, with_transaction=with_transaction, clean=clean) self._incoming_cache = list() self._backend_entity.set_extra(_HASH_EXTRA_KEY, self.get_hash()) @@ -1121,11 +742,18 @@ def verify_are_parents_stored(self) -> None: ) def _store_from_cache(self, cache_node: 'Node', with_transaction: bool) -> None: - """Store this node from an existing cache node.""" - assert self._repository is not None, 'repository not initialised' - assert cache_node._repository is not None, 'cache repository not initialised' # pylint: disable=protected-access + """Store this node from an existing cache node. + + .. note:: + With the current implementation of the backend repository, which automatically deduplicates the content that + it contains, we do not have to copy the contents of the source node. Since the content should be exactly + equal, the repository will already contain it and there is nothing to copy. We simply replace the current + ``repository`` instance with a clone of that of the source node, which does not actually copy any files. 
+ + """ from aiida.orm.utils.mixins import Sealable + from aiida.repository import Repository assert self.node_type == cache_node.node_type # Make sure the node doesn't have any RETURN links @@ -1135,17 +763,13 @@ def _store_from_cache(self, cache_node: 'Node', with_transaction: bool) -> None: self.label = cache_node.label self.description = cache_node.description + # Make sure to reinitialize the repository instance of the clone to that of the source node. + self._repository: Repository = copy.copy(cache_node._repository) # pylint: disable=protected-access + for key, value in cache_node.attributes.items(): if key != Sealable.SEALED_KEY: self.set_attribute(key, value) - # The erase() removes the current content of the sandbox folder. - # If this was not done, the content of the sandbox folder could - # become mangled when copying over the content of the cache - # source repository folder. - self._repository.erase() - self.put_object_from_tree(cache_node._repository._get_base_folder().abspath) # pylint: disable=protected-access - self._store(with_transaction=with_transaction, clean=False) self._add_outputs_from_cache(cache_node) self.set_extra('_aiida_cached_from', cache_node.uuid) @@ -1199,7 +823,7 @@ def _get_objects_to_hash(self) -> List[Any]: for key, val in self.attributes_items() if key not in self._hash_ignored_attributes and key not in self._updatable_attributes # pylint: disable=unsupported-membership-test }, - self._repository._get_base_folder(), # pylint: disable=protected-access + self._repository.hash(), self.computer.uuid if self.computer is not None else None ] return objects diff --git a/aiida/orm/nodes/process/calculation/calcjob.py b/aiida/orm/nodes/process/calculation/calcjob.py index ccfa5d921a..85bcb3020b 100644 --- a/aiida/orm/nodes/process/calculation/calcjob.py +++ b/aiida/orm/nodes/process/calculation/calcjob.py @@ -17,7 +17,6 @@ from aiida.common.datastructures import CalcJobState from aiida.common.lang import classproperty from aiida.common.links import LinkType -from aiida.common.folders import Folder from aiida.common.warnings import AiidaDeprecationWarning from .calculation import CalculationNode @@ -51,9 +50,6 @@ class CalcJobNode(CalculationNode): SCHEDULER_LAST_JOB_INFO_KEY = 'last_job_info' SCHEDULER_DETAILED_JOB_INFO_KEY = 'detailed_job_info' - # Base path within the repository where to put objects by default - _repository_base_path = 'raw_input' - # An optional entry point for a CalculationTools instance _tools = None @@ -156,24 +152,6 @@ def get_builder_restart(self) -> 'ProcessBuilder': return builder - @property - def _raw_input_folder(self) -> Folder: - """ - Get the input folder object. - - :return: the input folder object. 
- :raise: NotExistent: if the raw folder hasn't been created yet - """ - from aiida.common.exceptions import NotExistent - - assert self._repository is not None, 'repository not initialised' - - return_folder = self._repository._get_base_folder() # pylint: disable=protected-access - if return_folder.exists(): - return return_folder - - raise NotExistent('the `_raw_input_folder` has not yet been created') - def get_option(self, name: str) -> Optional[Any]: """ Retun the value of an option that was set for this CalcJobNode diff --git a/aiida/orm/nodes/process/process.py b/aiida/orm/nodes/process/process.py index 63409b4857..c98e36f5bb 100644 --- a/aiida/orm/nodes/process/process.py +++ b/aiida/orm/nodes/process/process.py @@ -482,13 +482,14 @@ def is_valid_cache(self) -> bool: """ if not (super().is_valid_cache and self.is_finished): return False + try: process_class = self.process_class except ValueError as exc: self.logger.warning(f"Not considering {self} for caching, '{exc!r}' when accessing its process class.") return False - # For process functions, the `process_class` does not have an - # is_valid_cache attribute + + # For process functions, the `process_class` does not have an is_valid_cache attribute try: is_valid_cache_func = process_class.is_valid_cache except AttributeError: diff --git a/aiida/orm/nodes/repository.py b/aiida/orm/nodes/repository.py new file mode 100644 index 0000000000..248f3d4764 --- /dev/null +++ b/aiida/orm/nodes/repository.py @@ -0,0 +1,215 @@ +# -*- coding: utf-8 -*- +"""Interface to the file repository of a node instance.""" +import contextlib +import io +import tempfile +import typing + +from aiida.common import exceptions +from aiida.repository import Repository, File +from aiida.repository.backend import DiskObjectStoreRepositoryBackend, SandboxRepositoryBackend + +__all__ = ('NodeRepositoryMixin',) + + +class NodeRepositoryMixin: + """Interface to the file repository of a node instance. + + This is the compatibility layer between the `Node` class and the `Repository` class. The repository in principle has + no concept of immutability, so it is implemented here. Any mutating operations will raise a `ModificationNotAllowed` + exception if the node is stored. Otherwise the operation is just forwarded to the repository instance. + + The repository instance keeps an internal mapping of the file hierarchy that it maintains, starting from an empty + hierarchy if the instance was constructed normally, or from a specific hierarchy if reconstructred through the + ``Repository.from_serialized`` classmethod. This is only the case for stored nodes, because unstored nodes do not + have any files yet when they are constructed. Once the node get's stored, the repository is asked to serialize its + metadata contents which is then stored in the ``repository_metadata`` attribute of the node in the database. This + layer explicitly does not update the metadata of the node on a mutation action. The reason is that for stored nodes + these actions are anyway forbidden and for unstored nodes, the final metadata will be stored in one go, once the + node is stored, so there is no need to keep updating the node metadata intermediately. Note that this does mean that + ``repository_metadata`` does not give accurate information as long as the node is not yet stored. 
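+
+    A minimal usage sketch (``node`` is an illustrative, unstored node instance and the file names are made up)::
+
+        import io
+        node.put_object_from_filelike(io.BytesIO(b'content'), 'relative/file.txt')
+        node.store()
+        node.get_object_content('relative/file.txt', mode='rb')  # returns b'content'
+        node.delete_object('relative/file.txt')  # raises ModificationNotAllowed, since the node is now stored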
+ """ + + _repository_instance = None + + def _update_repository_metadata(self): + """Refresh the repository metadata of the node if it is stored and the decorated method returns successfully.""" + if self.is_stored: + self.repository_metadata = self._repository.serialize() + + @property + def _repository(self) -> Repository: + """Return the repository instance, lazily constructing it if necessary. + + .. note:: this property is protected because a node's repository should not be accessed outside of its scope. + + :return: the file repository instance. + """ + if self._repository_instance is None: + if self.is_stored: + from aiida.manage.manager import get_manager + container = get_manager().get_profile().get_repository_container() + backend = DiskObjectStoreRepositoryBackend(container=container) + serialized = self.repository_metadata + self._repository_instance = Repository.from_serialized(backend=backend, serialized=serialized) + else: + self._repository_instance = Repository(backend=SandboxRepositoryBackend()) + + return self._repository_instance + + @_repository.setter + def _repository(self, repository: Repository) -> None: + """Set a new repository instance, deleting the current reference if it has been initialized. + + :param repository: the new repository instance to set. + """ + if self._repository_instance is not None: + del self._repository_instance + + self._repository_instance = repository + + def repository_serialize(self) -> typing.Dict: + """Serialize the metadata of the repository content into a JSON-serializable format. + + :return: dictionary with the content metadata. + """ + return self._repository.serialize() + + def check_mutability(self): + """Check if the node is mutable. + + :raises `~aiida.common.exceptions.ModificationNotAllowed`: when the node is stored and therefore immutable. + """ + if self.is_stored: + raise exceptions.ModificationNotAllowed('the node is stored and therefore the repository is immutable.') + + def list_objects(self, path: str = None) -> typing.List[File]: + """Return a list of the objects contained in this repository sorted by name, optionally in given sub directory. + + :param path: the relative path where to store the object in the repository. + :return: a list of `File` named tuples representing the objects present in directory with the given key. + :raises TypeError: if the path is not a string and relative path. + :raises FileNotFoundError: if no object exists for the given path. + :raises NotADirectoryError: if the object at the given path is not a directory. + """ + return self._repository.list_objects(path) + + def list_object_names(self, path: str = None) -> typing.List[str]: + """Return a sorted list of the object names contained in this repository, optionally in the given sub directory. + + :param path: the relative path where to store the object in the repository. + :return: a list of `File` named tuples representing the objects present in directory with the given key. + :raises TypeError: if the path is not a string and relative path. + :raises FileNotFoundError: if no object exists for the given path. + :raises NotADirectoryError: if the object at the given path is not a directory. + """ + return self._repository.list_object_names(path) + + @contextlib.contextmanager + def open(self, path: str, mode='r') -> io.BufferedReader: + """Open a file handle to an object stored under the given key. + + .. note:: this should only be used to open a handle to read an existing file. 
To write a new file use the method + ``put_object_from_filelike`` instead. + + :param path: the relative path of the object within the repository. + :return: yield a byte stream object. + :raises TypeError: if the path is not a string and relative path. + :raises FileNotFoundError: if the file does not exist. + :raises IsADirectoryError: if the object is a directory and not a file. + :raises OSError: if the file could not be opened. + """ + if mode not in ['r', 'rb']: + raise ValueError(f'the mode {mode} is not supported.') + + with self._repository.open(path) as handle: + if 'b' not in mode: + yield io.StringIO(handle.read().decode('utf-8')) + else: + yield handle + + def get_object_content(self, path: str, mode='r') -> typing.Union[str, bytes]: + """Return the content of a object identified by key. + + :param key: fully qualified identifier for the object within the repository. + :raises TypeError: if the path is not a string and relative path. + :raises FileNotFoundError: if the file does not exist. + :raises IsADirectoryError: if the object is a directory and not a file. + :raises OSError: if the file could not be opened. + """ + if mode not in ['r', 'rb']: + raise ValueError(f'the mode {mode} is not supported.') + + if 'b' not in mode: + return self._repository.get_object_content(path).decode('utf-8') + + return self._repository.get_object_content(path) + + def put_object_from_filelike(self, handle: io.BufferedReader, path: str): + """Store the byte contents of a file in the repository. + + :param handle: filelike object with the byte content to be stored. + :param path: the relative path where to store the object in the repository. + :raises TypeError: if the path is not a string and relative path. + :raises `~aiida.common.exceptions.ModificationNotAllowed`: when the node is stored and therefore immutable. + """ + self.check_mutability() + + if isinstance(handle, io.StringIO): + handle = io.BytesIO(handle.read().encode('utf-8')) + + if isinstance(handle, tempfile._TemporaryFileWrapper): # pylint: disable=protected-access + if 'b' in handle.file.mode: + handle = io.BytesIO(handle.read()) + else: + handle = io.BytesIO(handle.read().encode('utf-8')) + + self._repository.put_object_from_filelike(handle, path) + self._update_repository_metadata() + + def put_object_from_file(self, filepath: str, path: str): + """Store a new object under `path` with contents of the file located at `filepath` on the local file system. + + :param filepath: absolute path of file whose contents to copy to the repository + :param path: the relative path where to store the object in the repository. + :raises TypeError: if the path is not a string and relative path, or the handle is not a byte stream. + :raises `~aiida.common.exceptions.ModificationNotAllowed`: when the node is stored and therefore immutable. + """ + self.check_mutability() + self._repository.put_object_from_file(filepath, path) + self._update_repository_metadata() + + def put_object_from_tree(self, filepath: str, path: str = None): + """Store the entire contents of `filepath` on the local file system in the repository with under given `path`. + + :param filepath: absolute path of the directory whose contents to copy to the repository. + :param path: the relative path where to store the objects in the repository. + :raises TypeError: if the path is not a string and relative path. + :raises `~aiida.common.exceptions.ModificationNotAllowed`: when the node is stored and therefore immutable. 
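+
+        A short sketch of typical usage (the source directory path is illustrative)::
+
+            node.put_object_from_tree('/some/local/directory', 'destination')
+            node.list_object_names('destination')  # names of the copied objects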
+ """ + self.check_mutability() + self._repository.put_object_from_tree(filepath, path) + self._update_repository_metadata() + + def delete_object(self, path: str): + """Delete the object from the repository. + + :param key: fully qualified identifier for the object within the repository. + :raises TypeError: if the path is not a string and relative path. + :raises FileNotFoundError: if the file does not exist. + :raises IsADirectoryError: if the object is a directory and not a file. + :raises OSError: if the file could not be deleted. + :raises `~aiida.common.exceptions.ModificationNotAllowed`: when the node is stored and therefore immutable. + """ + self.check_mutability() + self._repository.delete_object(path) + self._update_repository_metadata() + + def erase(self): + """Delete all objects from the repository. + + :raises `~aiida.common.exceptions.ModificationNotAllowed`: when the node is stored and therefore immutable. + """ + self.check_mutability() + self._repository.erase() + self._update_repository_metadata() diff --git a/aiida/orm/utils/_repository.py b/aiida/orm/utils/_repository.py deleted file mode 100644 index 7b4c400acf..0000000000 --- a/aiida/orm/utils/_repository.py +++ /dev/null @@ -1,304 +0,0 @@ -# -*- coding: utf-8 -*- -########################################################################### -# Copyright (c), The AiiDA team. All rights reserved. # -# This file is part of the AiiDA code. # -# # -# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # -# For further information on the license, see the LICENSE.txt file # -# For further information please visit http://www.aiida.net # -########################################################################### -"""Class that represents the repository of a `Node` instance. - -.. deprecated:: 1.4.0 - This module has been deprecated and will be removed in `v2.0.0`. - -""" -import os -import warnings - -from aiida.common import exceptions -from aiida.common.folders import RepositoryFolder, SandboxFolder -from aiida.common.warnings import AiidaDeprecationWarning -from aiida.repository import File, FileType - - -class Repository: - """Class that represents the repository of a `Node` instance. - - .. deprecated:: 1.4.0 - This class has been deprecated and will be removed in `v2.0.0`. - """ - - # Name to be used for the Repository section - _section_name = 'node' - - def __init__(self, uuid, is_stored, base_path=None): - self._is_stored = is_stored - self._base_path = base_path - self._temp_folder = None - self._repo_folder = RepositoryFolder(section=self._section_name, uuid=uuid) - - def __del__(self): - """Clean the sandboxfolder if it was instantiated.""" - if getattr(self, '_temp_folder', None) is not None: - self._temp_folder.erase() - - def validate_mutability(self): - """Raise if the repository is immutable. - - :raises aiida.common.ModificationNotAllowed: if repository is marked as immutable because the corresponding node - is stored - """ - if self._is_stored: - raise exceptions.ModificationNotAllowed('cannot modify the repository after the node has been stored') - - @staticmethod - def validate_object_key(key): - """Validate the key of an object. - - :param key: an object key in the repository - :raises ValueError: if the key is not a valid object key - """ - if key and os.path.isabs(key): - raise ValueError('the key must be a relative path') - - def list_objects(self, key=None): - """Return a list of the objects contained in this repository, optionally in the given sub directory. 
- - :param key: fully qualified identifier for the object within the repository - :return: a list of `File` named tuples representing the objects present in directory with the given key - """ - folder = self._get_base_folder() - - if key: - folder = folder.get_subfolder(key) - - objects = [] - - for filename in folder.get_content_list(): - if os.path.isdir(os.path.join(folder.abspath, filename)): - objects.append(File(filename, FileType.DIRECTORY)) - else: - objects.append(File(filename, FileType.FILE)) - - return sorted(objects, key=lambda x: x.name) - - def list_object_names(self, key=None): - """Return a list of the object names contained in this repository, optionally in the given sub directory. - - :param key: fully qualified identifier for the object within the repository - :return: a list of `File` named tuples representing the objects present in directory with the given key - """ - return [entry.name for entry in self.list_objects(key)] - - def open(self, key, mode='r'): - """Open a file handle to an object stored under the given key. - - :param key: fully qualified identifier for the object within the repository - :param mode: the mode under which to open the handle - """ - return open(self._get_base_folder().get_abs_path(key), mode=mode) - - def get_object(self, key): - """Return the object identified by key. - - :param key: fully qualified identifier for the object within the repository - :return: a `File` named tuple representing the object located at key - :raises IOError: if no object with the given key exists - """ - self.validate_object_key(key) - - try: - directory, filename = key.rsplit(os.sep, 1) - except ValueError: - directory, filename = None, key - - folder = self._get_base_folder() - - if directory: - folder = folder.get_subfolder(directory) - - filepath = os.path.join(folder.abspath, filename) - - if os.path.isdir(filepath): - return File(filename, FileType.DIRECTORY) - - if os.path.isfile(filepath): - return File(filename, FileType.FILE) - - raise IOError(f'object {key} does not exist') - - def get_object_content(self, key, mode='r'): - """Return the content of a object identified by key. - - :param key: fully qualified identifier for the object within the repository - :param mode: the mode under which to open the handle - """ - with self.open(key, mode=mode) as handle: - return handle.read() - - def put_object_from_tree(self, path, key=None, contents_only=True, force=False): - """Store a new object under `key` with the contents of the directory located at `path` on this file system. - - .. warning:: If the repository belongs to a stored node, a `ModificationNotAllowed` exception will be raised. - This check can be avoided by using the `force` flag, but this should be used with extreme caution! - - :param path: absolute path of directory whose contents to copy to the repository - :param key: fully qualified identifier for the object within the repository - :param contents_only: boolean, if True, omit the top level directory of the path and only copy its contents. 
- :param force: boolean, if True, will skip the mutability check - :raises aiida.common.ModificationNotAllowed: if repository is immutable and `force=False` - """ - if not force: - self.validate_mutability() - - self.validate_object_key(key) - - if not os.path.isabs(path): - raise ValueError('the `path` must be an absolute path') - - folder = self._get_base_folder() - - if key: - folder = folder.get_subfolder(key, create=True) - - if contents_only: - for entry in os.listdir(path): - folder.insert_path(os.path.join(path, entry)) - else: - folder.insert_path(path) - - def put_object_from_file(self, path, key, mode=None, encoding=None, force=False): - """Store a new object under `key` with contents of the file located at `path` on this file system. - - .. warning:: If the repository belongs to a stored node, a `ModificationNotAllowed` exception will be raised. - This check can be avoided by using the `force` flag, but this should be used with extreme caution! - - :param path: absolute path of file whose contents to copy to the repository - :param key: fully qualified identifier for the object within the repository - :param mode: the file mode with which the object will be written - Deprecated: will be removed in `v2.0.0` - :param encoding: the file encoding with which the object will be written - Deprecated: will be removed in `v2.0.0` - :param force: boolean, if True, will skip the mutability check - :raises aiida.common.ModificationNotAllowed: if repository is immutable and `force=False` - """ - # pylint: disable=unused-argument,no-member - # Note that the defaults of `mode` and `encoding` had to be change to `None` from `w` and `utf-8` resptively, in - # order to detect when they were being passed such that the deprecation warning can be emitted. The defaults did - # not make sense and so ignoring them is justified, since the side-effect of this function, a file being copied, - # will continue working the same. - if mode is not None: - warnings.warn('the `mode` argument is deprecated and will be removed in `v2.0.0`', AiidaDeprecationWarning) - - if encoding is not None: - warnings.warn( - 'the `encoding` argument is deprecated and will be removed in `v2.0.0`', AiidaDeprecationWarning - ) - - if not force: - self.validate_mutability() - - self.validate_object_key(key) - - with open(path, mode='rb') as handle: - self.put_object_from_filelike(handle, key, mode='wb', encoding=None) - - def put_object_from_filelike(self, handle, key, mode='w', encoding='utf8', force=False): - """Store a new object under `key` with contents of filelike object `handle`. - - .. warning:: If the repository belongs to a stored node, a `ModificationNotAllowed` exception will be raised. - This check can be avoided by using the `force` flag, but this should be used with extreme caution! 
- - :param handle: filelike object with the content to be stored - :param key: fully qualified identifier for the object within the repository - :param mode: the file mode with which the object will be written - :param encoding: the file encoding with which the object will be written - :param force: boolean, if True, will skip the mutability check - :raises aiida.common.ModificationNotAllowed: if repository is immutable and `force=False` - """ - if not force: - self.validate_mutability() - - self.validate_object_key(key) - - folder = self._get_base_folder() - - while os.sep in key: - basepath, key = key.split(os.sep, 1) - folder = folder.get_subfolder(basepath, create=True) - - folder.create_file_from_filelike(handle, key, mode=mode, encoding=encoding) - - def delete_object(self, key, force=False): - """Delete the object from the repository. - - .. warning:: If the repository belongs to a stored node, a `ModificationNotAllowed` exception will be raised. - This check can be avoided by using the `force` flag, but this should be used with extreme caution! - - :param key: fully qualified identifier for the object within the repository - :param force: boolean, if True, will skip the mutability check - :raises aiida.common.ModificationNotAllowed: if repository is immutable and `force=False` - """ - if not force: - self.validate_mutability() - - self.validate_object_key(key) - - self._get_base_folder().remove_path(key) - - def erase(self, force=False): - """Delete the repository folder. - - .. warning:: If the repository belongs to a stored node, a `ModificationNotAllowed` exception will be raised. - This check can be avoided by using the `force` flag, but this should be used with extreme caution! - - :param force: boolean, if True, will skip the mutability check - :raises aiida.common.ModificationNotAllowed: if repository is immutable and `force=False` - """ - if not force: - self.validate_mutability() - - self._get_base_folder().erase() - - def store(self): - """Store the contents of the sandbox folder into the repository folder.""" - if self._is_stored: - raise exceptions.ModificationNotAllowed('repository is already stored') - - self._repo_folder.replace_with_folder(self._get_temp_folder().abspath, move=True, overwrite=True) - self._is_stored = True - - def restore(self): - """Move the contents from the repository folder back into the sandbox folder.""" - if not self._is_stored: - raise exceptions.ModificationNotAllowed('repository is not yet stored') - - self._temp_folder.replace_with_folder(self._repo_folder.abspath, move=True, overwrite=True) - self._is_stored = False - - def _get_base_folder(self): - """Return the base sub folder in the repository. - - :return: a Folder object. - """ - if self._is_stored: - folder = self._repo_folder - else: - folder = self._get_temp_folder() - - if self._base_path is not None: - folder = folder.get_subfolder(self._base_path, reset_limit=True) - folder.create() - - return folder - - def _get_temp_folder(self): - """Return the temporary sandbox folder. - - :return: a SandboxFolder object mapping the node in the repository. 
- """ - if self._temp_folder is None: - self._temp_folder = SandboxFolder() - - return self._temp_folder diff --git a/aiida/orm/utils/mixins.py b/aiida/orm/utils/mixins.py index 3c12048fb6..75bc1aa6f9 100644 --- a/aiida/orm/utils/mixins.py +++ b/aiida/orm/utils/mixins.py @@ -8,8 +8,9 @@ # For further information please visit http://www.aiida.net # ########################################################################### """Mixin classes for ORM classes.""" - import inspect +import io +import tempfile from aiida.common import exceptions from aiida.common.lang import override @@ -56,7 +57,7 @@ def store_source_info(self, func): try: source_file_path = inspect.getsourcefile(func) with open(source_file_path, 'rb') as handle: - self.put_object_from_filelike(handle, self.FUNCTION_SOURCE_FILE_PATH, mode='wb', encoding=None) + self.put_object_from_filelike(handle, self.FUNCTION_SOURCE_FILE_PATH) except (IOError, OSError): pass @@ -123,6 +124,14 @@ class Sealable: def _updatable_attributes(cls): # pylint: disable=no-self-argument return (cls.SEALED_KEY,) + def check_mutability(self): + """Check if the node is mutable. + + :raises `~aiida.common.exceptions.ModificationNotAllowed`: when the node is sealed and therefore immutable. + """ + if self.is_stored: + raise exceptions.ModificationNotAllowed('the node is sealed and therefore the repository is immutable.') + def validate_incoming(self, source, link_type, link_label): """Validate adding a link of the given type from a given node to ourself. @@ -196,3 +205,67 @@ def delete_attribute(self, key): raise exceptions.ModificationNotAllowed(f'`{key}` is not an updatable attribute') self.backend_entity.delete_attribute(key) + + @override + def put_object_from_filelike(self, handle: io.BufferedReader, path: str): + """Store the byte contents of a file in the repository. + + :param handle: filelike object with the byte content to be stored. + :param path: the relative path where to store the object in the repository. + :raises TypeError: if the path is not a string and relative path. + :raises aiida.common.exceptions.ModificationNotAllowed: when the node is sealed and therefore immutable. + """ + self.check_mutability() + + if isinstance(handle, io.StringIO): + handle = io.BytesIO(handle.read().encode('utf-8')) + + if isinstance(handle, tempfile._TemporaryFileWrapper): # pylint: disable=protected-access + if 'b' in handle.file.mode: + handle = io.BytesIO(handle.read()) + else: + handle = io.BytesIO(handle.read().encode('utf-8')) + + self._repository.put_object_from_filelike(handle, path) + self._update_repository_metadata() + + @override + def put_object_from_file(self, filepath: str, path: str): + """Store a new object under `path` with contents of the file located at `filepath` on the local file system. + + :param filepath: absolute path of file whose contents to copy to the repository + :param path: the relative path where to store the object in the repository. + :raises TypeError: if the path is not a string and relative path, or the handle is not a byte stream. + :raises aiida.common.exceptions.ModificationNotAllowed: when the node is sealed and therefore immutable. + """ + self.check_mutability() + self._repository.put_object_from_file(filepath, path) + self._update_repository_metadata() + + @override + def put_object_from_tree(self, filepath: str, path: str = None): + """Store the entire contents of `filepath` on the local file system in the repository with under given `path`. 
+ + :param filepath: absolute path of the directory whose contents to copy to the repository. + :param path: the relative path where to store the objects in the repository. + :raises TypeError: if the path is not a string and relative path. + :raises aiida.common.exceptions.ModificationNotAllowed: when the node is sealed and therefore immutable. + """ + self.check_mutability() + self._repository.put_object_from_tree(filepath, path) + self._update_repository_metadata() + + @override + def delete_object(self, path: str): + """Delete the object from the repository. + + :param key: fully qualified identifier for the object within the repository. + :raises TypeError: if the path is not a string and relative path. + :raises FileNotFoundError: if the file does not exist. + :raises IsADirectoryError: if the object is a directory and not a file. + :raises OSError: if the file could not be deleted. + :raises aiida.common.exceptions.ModificationNotAllowed: when the node is sealed and therefore immutable. + """ + self.check_mutability() + self._repository.delete_object(path) + self._update_repository_metadata() diff --git a/aiida/orm/utils/repository.py b/aiida/orm/utils/repository.py deleted file mode 100644 index 0b15b17af5..0000000000 --- a/aiida/orm/utils/repository.py +++ /dev/null @@ -1,31 +0,0 @@ -# -*- coding: utf-8 -*- -########################################################################### -# Copyright (c), The AiiDA team. All rights reserved. # -# This file is part of the AiiDA code. # -# # -# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # -# For further information on the license, see the LICENSE.txt file # -# For further information please visit http://www.aiida.net # -########################################################################### -# pylint: disable=unused-import -"""Module shadowing original in order to print deprecation warning only when external code uses it.""" -import warnings - -from aiida.common import exceptions -from aiida.common.folders import RepositoryFolder, SandboxFolder -from aiida.common.warnings import AiidaDeprecationWarning -from aiida.repository import File, FileType -from ._repository import Repository as _Repository - -warnings.warn( - 'this module is deprecated and will be removed in `v2.0.0`. 
' - '`File` and `FileType` should be imported from `aiida.repository`.', AiidaDeprecationWarning -) - - -class Repository(_Repository): - """Class shadowing original class in order to print deprecation warning when external code uses it.""" - - def __init__(self, *args, **kwargs): - warnings.warn('This class has been deprecated and will be removed in `v2.0.0`.', AiidaDeprecationWarning) # pylint: disable=no-member""" - super().__init__(*args, **kwargs) diff --git a/aiida/repository/__init__.py b/aiida/repository/__init__.py index 1ccf31a99e..2f8ec80902 100644 --- a/aiida/repository/__init__.py +++ b/aiida/repository/__init__.py @@ -9,6 +9,8 @@ ########################################################################### """Module with resources dealing with the file repository.""" # pylint: disable=undefined-variable +from .backend import * from .common import * +from .repository import * -__all__ = (common.__all__) +__all__ = (backend.__all__ + common.__all__ + repository.__all__) diff --git a/aiida/repository/backend/__init__.py b/aiida/repository/backend/__init__.py new file mode 100644 index 0000000000..b20d6ca9fc --- /dev/null +++ b/aiida/repository/backend/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# pylint: disable=undefined-variable +"""Module for file repository backend implementations.""" +from .abstract import * +from .disk_object_store import * +from .sandbox import * + +__all__ = (abstract.__all__ + disk_object_store.__all__ + sandbox.__all__) diff --git a/aiida/repository/backend/abstract.py b/aiida/repository/backend/abstract.py new file mode 100644 index 0000000000..fdb8733938 --- /dev/null +++ b/aiida/repository/backend/abstract.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +"""Class that defines the abstract interface for an object repository. + +The scope of this class is intentionally very narrow. Any backend implementation should merely provide the methods to +store binary blobs, or "objects", and return a string-based key that unique identifies the object that was just created. +This key should then be able to be used to retrieve the bytes of the corresponding object or to delete it. +""" +import abc +import contextlib +import io +import pathlib +import typing + +__all__ = ('AbstractRepositoryBackend',) + + +class AbstractRepositoryBackend(metaclass=abc.ABCMeta): + """Class that defines the abstract interface for an object repository. + + The repository backend only deals with raw bytes, both when creating new objects as well as when returning a stream + or the content of an existing object. The encoding and decoding of the byte content should be done by the client + upstream. The file repository backend is also not expected to keep any kind of file hierarchy but must be assumed + to be a simple flat data store. When files are created in the file object repository, the implementation will return + a string-based key with which the content of the stored object can be addressed. This key is guaranteed to be unique + and persistent. Persisting the key or mapping it onto a virtual file hierarchy is again up to the client upstream. + """ + + @staticmethod + def is_readable_byte_stream(handle): + return hasattr(handle, 'read') and hasattr(handle, 'mode') and 'b' in handle.mode + + def put_object_from_filelike(self, handle: io.BufferedIOBase) -> str: + """Store the byte contents of a file in the repository. + + :param handle: filelike object with the byte content to be stored. + :return: the generated fully qualified identifier for the object within the repository. 
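+
+        .. note:: this base implementation only validates that ``handle`` is a readable byte stream; concrete
+            backends are expected to call ``super().put_object_from_filelike(handle)`` first and then store the
+            content, returning the generated key.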
+ """ + if not isinstance(handle, io.BytesIO) and not self.is_readable_byte_stream(handle): + raise TypeError(f'handle does not seem to be a byte stream: {type(handle)}.') + + def put_object_from_file(self, filepath: typing.Union[str, pathlib.Path]) -> str: + """Store a new object with contents of the file located at `filepath` on this file system. + + :param filepath: absolute path of file whose contents to copy to the repository. + :return: the generated fully qualified identifier for the object within the repository. + :raises TypeError: if the handle is not a byte stream. + """ + with open(filepath, mode='rb') as handle: + return self.put_object_from_filelike(handle) + + @abc.abstractmethod + def has_object(self, key: str) -> bool: + """Return whether the repository has an object with the given key. + + :param key: fully qualified identifier for the object within the repository. + :return: True if the object exists, False otherwise. + """ + + @contextlib.contextmanager + def open(self, key: str) -> io.BufferedIOBase: + """Open a file handle to an object stored under the given key. + + .. note:: this should only be used to open a handle to read an existing file. To write a new file use the method + ``put_object_from_filelike`` instead. + + :param key: fully qualified identifier for the object within the repository. + :return: yield a byte stream object. + :raise FileNotFoundError: if the file does not exist. + :raise OSError: if the file could not be opened. + """ + if not self.has_object(key): + raise FileNotFoundError(f'object with key `{key}` does not exist.') + + def get_object_content(self, key: str) -> bytes: + """Return the content of a object identified by key. + + :param key: fully qualified identifier for the object within the repository. + :raise FileNotFoundError: if the file does not exist. + :raise OSError: if the file could not be opened. + """ + with self.open(key) as handle: + return handle.read() + + def delete_object(self, key: str): + """Delete the object from the repository. + + :param key: fully qualified identifier for the object within the repository. + :raise FileNotFoundError: if the file does not exist. + :raise OSError: if the file could not be deleted. + """ + if not self.has_object(key): + raise FileNotFoundError(f'object with key `{key}` does not exist.') diff --git a/aiida/repository/backend/disk_object_store.py b/aiida/repository/backend/disk_object_store.py new file mode 100644 index 0000000000..cdae34c0d0 --- /dev/null +++ b/aiida/repository/backend/disk_object_store.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +"""Implementation of the ``AbstractRepositoryBackend`` using the ``disk-objectstore`` as the backend.""" +import contextlib +import io + +from disk_objectstore import Container + +from aiida.common.lang import type_check + +from .abstract import AbstractRepositoryBackend + +__all__ = ('DiskObjectStoreRepositoryBackend',) + + +class DiskObjectStoreRepositoryBackend(AbstractRepositoryBackend): + """Implementation of the ``AbstractRepositoryBackend`` using the ``disk-object-store`` as the backend.""" + + def __init__(self, container): + type_check(container, Container) + self._container = container + + @property + def container(self): + return self._container + + def put_object_from_filelike(self, handle: io.BufferedIOBase) -> str: + """Store the byte contents of a file in the repository. + + :param handle: filelike object with the byte content to be stored. + :return: the generated fully qualified identifier for the object within the repository. 
+ :raises TypeError: if the handle is not a byte stream. + """ + super().put_object_from_filelike(handle) + return self.container.add_object(handle.read()) + + def has_object(self, key: str) -> bool: + """Return whether the repository has an object with the given key. + + :param key: fully qualified identifier for the object within the repository. + :return: True if the object exists, False otherwise. + """ + return self.container.has_object(key) + + @contextlib.contextmanager + def open(self, key: str) -> io.BufferedIOBase: + """Open a file handle to an object stored under the given key. + + .. note:: this should only be used to open a handle to read an existing file. To write a new file use the method + ``put_object_from_filelike`` instead. + + :param key: fully qualified identifier for the object within the repository. + :return: yield a byte stream object. + :raise FileNotFoundError: if the file does not exist. + :raise OSError: if the file could not be opened. + """ + super().open(key) + + with self.container.get_object_stream(key) as handle: + yield handle + + def delete_object(self, key: str): + """Delete the object from the repository. + + :param key: fully qualified identifier for the object within the repository. + :raise FileNotFoundError: if the file does not exist. + :raise OSError: if the file could not be deleted. + """ + super().delete_object(key) + self.container.delete_objects([key]) diff --git a/aiida/repository/backend/sandbox.py b/aiida/repository/backend/sandbox.py new file mode 100644 index 0000000000..ef4df9f849 --- /dev/null +++ b/aiida/repository/backend/sandbox.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +"""Implementation of the ``AbstractRepositoryBackend`` using a sandbox folder on disk as the backend.""" +import contextlib +import io +import os +import shutil +import uuid + +from .abstract import AbstractRepositoryBackend + +__all__ = ('SandboxRepositoryBackend',) + + +class SandboxRepositoryBackend(AbstractRepositoryBackend): + """Implementation of the ``AbstractRepositoryBackend`` using a sandbox folder on disk as the backend.""" + + def __init__(self): + self._sandbox = None + + def __del__(self): + """Delete the entire sandbox folder if it was instantiated and still exists.""" + if getattr(self, '_sandbox', None) is not None: + try: + shutil.rmtree(self.sandbox.abspath) + except FileNotFoundError: + pass + + @property + def sandbox(self): + """Return the sandbox instance of this repository.""" + from aiida.common.folders import SandboxFolder + + if self._sandbox is None: + self._sandbox = SandboxFolder() + + return self._sandbox + + def put_object_from_filelike(self, handle: io.BufferedIOBase) -> str: + """Store the byte contents of a file in the repository. + + :param handle: filelike object with the byte content to be stored. + :return: the generated fully qualified identifier for the object within the repository. + :raises TypeError: if the handle is not a byte stream. + """ + super().put_object_from_filelike(handle) + + key = str(uuid.uuid4()) + filepath = os.path.join(self.sandbox.abspath, key) + + with open(filepath, 'wb') as target: + shutil.copyfileobj(handle, target) + + return key + + def has_object(self, key: str) -> bool: + """Return whether the repository has an object with the given key. + + :param key: fully qualified identifier for the object within the repository. + :return: True if the object exists, False otherwise. 
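+
+        A round-trip sketch (the key is generated by the backend, so its value is not known in advance)::
+
+            import io
+            backend = SandboxRepositoryBackend()
+            key = backend.put_object_from_filelike(io.BytesIO(b'content'))
+            backend.has_object(key)          # True
+            backend.get_object_content(key)  # b'content'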
+ """ + return key in os.listdir(self.sandbox.abspath) + + @contextlib.contextmanager + def open(self, key: str) -> io.BufferedIOBase: + """Open a file handle to an object stored under the given key. + + .. note:: this should only be used to open a handle to read an existing file. To write a new file use the method + ``put_object_from_filelike`` instead. + + :param key: fully qualified identifier for the object within the repository. + :return: yield a byte stream object. + :raise FileNotFoundError: if the file does not exist. + :raise OSError: if the file could not be opened. + """ + super().open(key) + + with self.sandbox.open(key, mode='rb') as handle: + yield handle + + def delete_object(self, key: str): + """Delete the object from the repository. + + :param key: fully qualified identifier for the object within the repository. + :raise FileNotFoundError: if the file does not exist. + :raise OSError: if the file could not be deleted. + """ + super().delete_object(key) + os.remove(os.path.join(self.sandbox.abspath, key)) diff --git a/aiida/repository/common.py b/aiida/repository/common.py index f9dee05b0c..ac61b57f20 100644 --- a/aiida/repository/common.py +++ b/aiida/repository/common.py @@ -7,14 +7,11 @@ # For further information on the license, see the LICENSE.txt file # # For further information please visit http://www.aiida.net # ########################################################################### -# pylint: disable=redefined-builtin """Module with resources common to the repository.""" import enum -import warnings +import typing -from aiida.common.warnings import AiidaDeprecationWarning - -__all__ = ('File', 'FileType') +__all__ = ('FileType', 'File') class FileType(enum.Enum): @@ -24,59 +21,103 @@ class FileType(enum.Enum): FILE = 1 -class File: +class File(): """Data class representing a file object.""" - def __init__(self, name: str = '', file_type: FileType = FileType.DIRECTORY, type=None): - """ - - .. deprecated:: 1.4.0 - The argument `type` has been deprecated and will be removed in `v2.0.0`, use `file_type` instead. - """ - if type is not None: - warnings.warn( - 'argument `type` is deprecated and will be removed in `v2.0.0`. Use `file_type` instead.', - AiidaDeprecationWarning - ) # pylint: disable=no-member""" - file_type = type - + def __init__( + self, + name: str = '', + file_type: FileType = FileType.DIRECTORY, + key: typing.Union[str, None] = None, + objects: typing.Dict[str, 'File'] = None + ): if not isinstance(name, str): raise TypeError('name should be a string.') if not isinstance(file_type, FileType): raise TypeError('file_type should be an instance of `FileType`.') + if key is not None and not isinstance(key, str): + raise TypeError('key should be `None` or a string.') + + if objects is not None and any([not isinstance(obj, self.__class__) for obj in objects.values()]): + raise TypeError('objects should be `None` or a dictionary of `File` instances.') + + if file_type == FileType.DIRECTORY and key is not None: + raise ValueError('an object of type `FileType.DIRECTORY` cannot define a key.') + + if file_type == FileType.FILE and objects is not None: + raise ValueError('an object of type `FileType.FILE` cannot define any objects.') + self._name = name self._file_type = file_type + self._key = key + self._objects = objects or {} + + @classmethod + def from_serialized(cls, serialized: dict, name='') -> 'File': + """Construct a new instance from a serialized instance. + + :param serialized: the serialized instance. + :return: the reconstructed file object. 
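+
+        For example, the following serialized form, which uses the compact keys ``o`` for objects and ``k`` for
+        the backend key (the key value shown is illustrative), represents a directory containing a single file::
+
+            File.from_serialized({'o': {'file.txt': {'k': 'abc123'}}})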
+ """ + if 'k' in serialized: + file_type = FileType.FILE + key = serialized['k'] + objects = None + else: + file_type = FileType.DIRECTORY + key = None + objects = {name: File.from_serialized(obj, name) for name, obj in serialized.get('o', {}).items()} + + instance = cls.__new__(cls) + instance.__init__(name, file_type, key, objects) + return instance + + def serialize(self) -> dict: + """Serialize the metadata into a JSON-serializable format. + + .. note:: the serialization format is optimized to reduce the size in bytes. + + :return: dictionary with the content metadata. + """ + if self.file_type == FileType.DIRECTORY: + if self.objects: + return {'o': {key: obj.serialize() for key, obj in self.objects.items()}} + return {} + return {'k': self.key} @property def name(self) -> str: """Return the name of the file object.""" return self._name - @property - def type(self) -> FileType: - """Return the file type of the file object. - - .. deprecated:: 1.4.0 - Will be removed in `v2.0.0`, use `file_type` instead. - """ - warnings.warn('property is deprecated, use `file_type` instead', AiidaDeprecationWarning) # pylint: disable=no-member""" - return self.file_type - @property def file_type(self) -> FileType: """Return the file type of the file object.""" return self._file_type - def __iter__(self): - """Iterate over the properties.""" - warnings.warn( - '`File` has changed from named tuple into class and from `v2.0.0` will no longer be iterable', - AiidaDeprecationWarning - ) - yield self.name - yield self.file_type - - def __eq__(self, other): - return self.file_type == other.file_type and self.name == other.name + @property + def key(self) -> typing.Union[str, None]: + """Return the key of the file object.""" + return self._key + + @property + def objects(self) -> typing.Dict[str, 'File']: + """Return the objects of the file object.""" + return self._objects + + def __eq__(self, other) -> bool: + """Return whether this instance is equal to another file object instance.""" + if not isinstance(other, self.__class__): + return False + + equal_attributes = all([getattr(self, key) == getattr(other, key) for key in ['name', 'file_type', 'key']]) + equal_object_keys = sorted(self.objects) == sorted(other.objects) + equal_objects = equal_object_keys and all([obj == other.objects[key] for key, obj in self.objects.items()]) + + return equal_attributes and equal_objects + + def __repr__(self): + args = (self.name, self.file_type.value, self.key, self.objects.items()) + return 'File'.format(*args) diff --git a/aiida/repository/repository.py b/aiida/repository/repository.py new file mode 100644 index 0000000000..be0b2cbb1b --- /dev/null +++ b/aiida/repository/repository.py @@ -0,0 +1,444 @@ +# -*- coding: utf-8 -*- +"""Module for the implementation of a file repository.""" +import contextlib +import io +import pathlib +import typing + +from aiida.common.hashing import make_hash +from aiida.common.lang import type_check + +from .backend import AbstractRepositoryBackend, SandboxRepositoryBackend +from .common import File, FileType + +__all__ = ('Repository',) + +FilePath = typing.Union[str, pathlib.Path] + + +class Repository: + """File repository. + + This class provides an interface to a backend file repository instance, but unlike the backend repository, this + class keeps a reference of the virtual file hierarchy. 
This means that through this interface, a client can create + files and directories with a file hierarchy, just as they would on a local file system, except it is completely + virtual as the files are stored by the backend which can store them in a completely flat structure. This also means + that the internal virtual hierarchy of a ``Repository`` instance does not necessarily represent all the files that + are stored by repository backend. The repository exposes a mere subset of all the file objects stored in the + backend. This is why object deletion is also implemented as a soft delete, by default, where the files are just + removed from the internal virtual hierarchy, but not in the actual backend. This is because those objects can be + referenced by other instances. + """ + + # pylint: disable=too-many-public-methods + + _backend = None + _file_cls = File + + def __init__(self, backend: AbstractRepositoryBackend = None): + """Construct a new instance with empty metadata. + + :param backend: instance of repository backend to use to actually store the file objects. By default, an + instance of the ``SandboxRepositoryBackend`` will be created. + """ + if backend is None: + backend = SandboxRepositoryBackend() + + self.set_backend(backend) + self.reset() + + @classmethod + def from_serialized(cls, backend: AbstractRepositoryBackend, serialized: typing.Dict) -> 'Repository': + """Construct an instance where the metadata is initialized from the serialized content. + + :param backend: instance of repository backend to use to actually store the file objects. + """ + instance = cls.__new__(cls) + instance.__init__(backend) + + if serialized: + for name, obj in serialized['o'].items(): + instance.get_directory().objects[name] = cls._file_cls.from_serialized(obj, name) + + return instance + + def reset(self): + self._directory = self._file_cls() + + def serialize(self) -> typing.Dict: + """Serialize the metadata into a JSON-serializable format. + + :return: dictionary with the content metadata. + """ + return self._directory.serialize() + + def hash(self) -> str: + """Generate a hash of the repository's contents. + + .. warning:: this will read the content of all file objects contained within the virtual hierarchy into memory. + + :return: the hash representing the contents of the repository. + """ + objects = {} + for root, dirnames, filenames in self.walk(): + objects['__dirnames__'] = dirnames + for filename in filenames: + with self.open(root / filename) as handle: + objects[str(root / filename)] = handle.read() + + return make_hash(objects) + + @staticmethod + def _pre_process_path(path: FilePath = None) -> typing.Union[pathlib.Path, None]: + """Validate and convert the path to instance of ``pathlib.Path``. + + This should be called by every method of this class before doing anything, such that it can safely assume that + the path is a ``pathlib.Path`` object, which makes path manipulation a lot easier. + + :param path: the path as a ``pathlib.Path`` object or `None`. + :raises TypeError: if the type of path was not a str nor a ``pathlib.Path`` instance. + """ + if path is None: + return pathlib.Path() + + if isinstance(path, str): + path = pathlib.Path(path) + + if not isinstance(path, pathlib.Path): + raise TypeError('path is not of type `str` nor `pathlib.Path`.') + + if path.is_absolute(): + raise TypeError(f'path `{path}` is not a relative path.') + + return path + + @property + def backend(self) -> AbstractRepositoryBackend: + """Return the current repository backend. 
+ + :return: the repository backend. + """ + return self._backend + + def set_backend(self, backend: AbstractRepositoryBackend): + """Set the backend for this repository. + + :param backend: the repository backend. + :raises TypeError: if the type of the backend is invalid. + """ + type_check(backend, AbstractRepositoryBackend) + self._backend = backend + + def _insert_file(self, path: pathlib.Path, key: str): + """Insert a new file object in the object mapping. + + .. note:: this assumes the path is a valid relative path, so should be checked by the caller. + + :param path: the relative path where to store the object in the repository. + :param key: fully qualified identifier for the object within the repository. + """ + if path.parent: + directory = self.create_directory(path.parent) + else: + directory = self.get_directory + + directory.objects[path.name] = self._file_cls(path.name, FileType.FILE, key) + + def create_directory(self, path: FilePath) -> File: + """Create a new directory with the given path. + + :param path: the relative path of the directory. + :return: the created directory. + :raises TypeError: if the path is not a string or ``Path``, or is an absolute path. + """ + if path is None: + raise TypeError('path cannot be `None`.') + + path = self._pre_process_path(path) + directory = self._directory + + for part in path.parts: + if part not in directory.objects: + directory.objects[part] = self._file_cls(part) + + directory = directory.objects[part] + + return directory + + def get_hash_keys(self) -> typing.List[str]: + """Return the hash keys of all file objects contained within this repository. + + :return: list of file object hash keys. + """ + hash_keys = [] + + def add_hash_keys(keys, objects): + """Recursively add keys of all file objects to the keys list.""" + for obj in objects.values(): + if obj.file_type == FileType.FILE and obj.key is not None: + keys.append(obj.key) + elif obj.file_type == FileType.DIRECTORY: + add_hash_keys(keys, obj.objects) + + add_hash_keys(hash_keys, self._directory.objects) + + return hash_keys + + def get_object(self, path: FilePath = None) -> File: + """Return the object at the given path. + + :param path: the relative path where to store the object in the repository. + :return: the `File` representing the object located at the given relative path. + :raises TypeError: if the path is not a string or ``Path``, or is an absolute path. + :raises FileNotFoundError: if no object exists for the given path. + """ + path = self._pre_process_path(path) + file_object = self._directory + + if not path.parts: + return file_object + + for part in path.parts: + if part not in file_object.objects: + raise FileNotFoundError(f'object with path `{path}` does not exist.') + + file_object = file_object.objects[part] + + return file_object + + def get_directory(self, path: FilePath = None) -> File: + """Return the directory object at the given path. + + :param path: the relative path of the directory. + :return: the `File` representing the object located at the given relative path. + :raises TypeError: if the path is not a string or ``Path``, or is an absolute path. + :raises FileNotFoundError: if no object exists for the given path. + :raises NotADirectoryError: if the object at the given path is not a directory. 
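+
+        For example (a sketch on a freshly constructed repository instance)::
+
+            repository.create_directory('nested/path')
+            repository.get_directory('nested/path').file_type  # FileType.DIRECTORY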
+ """ + file_object = self.get_object(path) + + if file_object.file_type != FileType.DIRECTORY: + raise NotADirectoryError(f'object with path `{path}` is not a directory.') + + return file_object + + def get_file(self, path: FilePath) -> File: + """Return the file object at the given path. + + :param path: the relative path of the file object. + :return: the `File` representing the object located at the given relative path. + :raises TypeError: if the path is not a string or ``Path``, or is an absolute path. + :raises FileNotFoundError: if no object exists for the given path. + :raises IsADirectoryError: if the object at the given path is not a directory. + """ + if path is None: + raise TypeError('path cannot be `None`.') + + path = self._pre_process_path(path) + + file_object = self.get_object(path) + + if file_object.file_type != FileType.FILE: + raise IsADirectoryError(f'object with path `{path}` is not a file.') + + return file_object + + def list_objects(self, path: FilePath = None) -> typing.List[File]: + """Return a list of the objects contained in this repository sorted by name, optionally in given sub directory. + + :param path: the relative path of the directory. + :return: a list of `File` named tuples representing the objects present in directory with the given path. + :raises TypeError: if the path is not a string or ``Path``, or is an absolute path. + :raises FileNotFoundError: if no object exists for the given path. + :raises NotADirectoryError: if the object at the given path is not a directory. + """ + directory = self.get_directory(path) + return sorted(directory.objects.values(), key=lambda obj: obj.name) + + def list_object_names(self, path: FilePath = None) -> typing.List[str]: + """Return a sorted list of the object names contained in this repository, optionally in the given sub directory. + + :param path: the relative path of the directory. + :return: a list of `File` named tuples representing the objects present in directory with the given path. + :raises TypeError: if the path is not a string or ``Path``, or is an absolute path. + :raises FileNotFoundError: if no object exists for the given path. + :raises NotADirectoryError: if the object at the given path is not a directory. + """ + return [entry.name for entry in self.list_objects(path)] + + def put_object_from_filelike(self, handle: io.BufferedReader, path: FilePath): + """Store the byte contents of a file in the repository. + + :param handle: filelike object with the byte content to be stored. + :param path: the relative path where to store the object in the repository. + :raises TypeError: if the path is not a string or ``Path``, or is an absolute path. + """ + path = self._pre_process_path(path) + key = self.backend.put_object_from_filelike(handle) + self._insert_file(path, key) + + def put_object_from_file(self, filepath: FilePath, path: FilePath): + """Store a new object under `path` with contents of the file located at `filepath` on the local file system. + + :param filepath: absolute path of file whose contents to copy to the repository + :param path: the relative path where to store the object in the repository. + :raises TypeError: if the path is not a string and relative path, or the handle is not a byte stream. + """ + with open(filepath, 'rb') as handle: + self.put_object_from_filelike(handle, path) + + def put_object_from_tree(self, filepath: FilePath, path: FilePath = None): + """Store the entire contents of `filepath` on the local file system in the repository with under given `path`. 
+    def put_object_from_tree(self, filepath: FilePath, path: FilePath = None):
+        """Store the entire contents of `filepath` on the local file system in the repository under the given `path`.
+
+        :param filepath: absolute path of the directory whose contents to copy to the repository.
+        :param path: the relative path where to store the objects in the repository.
+        :raises TypeError: if the filepath is not a string or ``Path``, or is a relative path.
+        :raises TypeError: if the path is not a string or ``Path``, or is an absolute path.
+        """
+        import os
+
+        path = self._pre_process_path(path)
+
+        if isinstance(filepath, str):
+            filepath = pathlib.Path(filepath)
+
+        if not isinstance(filepath, pathlib.Path):
+            raise TypeError(f'filepath `{filepath}` is not of type `str` nor `pathlib.Path`.')
+
+        if not filepath.is_absolute():
+            raise TypeError(f'filepath `{filepath}` is not an absolute path.')
+
+        # Explicitly create the base directory if specified by `path`, just in case `filepath` contains no file objects.
+        if path.parts:
+            self.create_directory(path)
+
+        for root, dirnames, filenames in os.walk(filepath):
+
+            root = pathlib.Path(root)
+
+            for dirname in dirnames:
+                self.create_directory(path / root.relative_to(filepath) / dirname)
+
+            for filename in filenames:
+                self.put_object_from_file(root / filename, path / root.relative_to(filepath) / filename)
+
+    def is_empty(self) -> bool:
+        """Return whether the repository is empty.
+
+        :return: True if the repository contains no file objects.
+        """
+        return not self._directory.objects
+
+    def has_object(self, path: FilePath) -> bool:
+        """Return whether the repository has an object with the given path.
+
+        :param path: the relative path of the object within the repository.
+        :return: True if the object exists, False otherwise.
+        :raises TypeError: if the path is not a string or ``Path``, or is an absolute path.
+        """
+        try:
+            self.get_object(path)
+        except FileNotFoundError:
+            return False
+        else:
+            return True
+
+    @contextlib.contextmanager
+    def open(self, path: FilePath) -> io.BufferedReader:
+        """Open a file handle to an object stored under the given path.
+
+        .. note:: this should only be used to open a handle to read an existing file. To write a new file use the method
+            ``put_object_from_filelike`` instead.
+
+        :param path: the relative path of the object within the repository.
+        :return: yield a byte stream object.
+        :raises TypeError: if the path is not a string or ``Path``, or is an absolute path.
+        :raises FileNotFoundError: if the file does not exist.
+        :raises IsADirectoryError: if the object is a directory and not a file.
+        :raises OSError: if the file could not be opened.
+        """
+        with self.backend.open(self.get_file(path).key) as handle:
+            yield handle
+
+    def get_object_content(self, path: FilePath) -> bytes:
+        """Return the content of an object identified by path.
+
+        :param path: the relative path of the object within the repository.
+        :return: the byte content of the object.
+        :raises TypeError: if the path is not a string or ``Path``, or is an absolute path.
+        :raises FileNotFoundError: if the file does not exist.
+        :raises IsADirectoryError: if the object is a directory and not a file.
+        :raises OSError: if the file could not be opened.
+        """
+        return self.backend.get_object_content(self.get_file(path).key)
+
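
    # Editorial aside, not part of the patch: reading content back through the backend, assuming the
    # same `repository` and the `inputs/aiida.in` object from the previous sketch.
    #
    #   assert repository.has_object('inputs/aiida.in')                  # True once the file was stored
    #   content = repository.get_object_content('inputs/aiida.in')       # raw bytes, fetched via the backend key
    #   with repository.open('inputs/aiida.in') as handle:               # or stream it through a handle
    #       assert handle.read() == content
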
+    def delete_object(self, path: FilePath, hard_delete: bool = False):
+        """Soft delete the object from the repository.
+
+        .. note:: can only delete file objects, but not directories.
+
+        :param path: the relative path of the object within the repository.
+        :param hard_delete: when true, not only remove the file from the internal mapping but also call through to the
+            ``delete_object`` method of the actual repository backend.
+        :raises TypeError: if the path is not a string or ``Path``, or is an absolute path.
+        :raises FileNotFoundError: if the file does not exist.
+        :raises IsADirectoryError: if the object is a directory and not a file.
+        :raises OSError: if the file could not be deleted.
+        """
+        path = self._pre_process_path(path)
+        file_object = self.get_object(path)
+
+        if file_object.file_type == FileType.DIRECTORY:
+            raise IsADirectoryError(f'object with path `{path}` is a directory.')
+
+        if hard_delete:
+            self.backend.delete_object(file_object.key)
+
+        directory = self.get_directory(path.parent)
+        directory.objects.pop(path.name)
+
+    def erase(self):
+        """Delete all objects from the repository.
+
+        .. important:: this intentionally does not call through to any ``erase`` method of the backend, because unlike
+            this class, the backend does not just store the objects of a single node, but potentially of a lot of other
+            nodes. Therefore, we manually delete all file objects and then simply reset the internal file hierarchy.
+        """
+        for hash_key in self.get_hash_keys():
+            self.backend.delete_object(hash_key)
+        self.reset()
+
+    def clone(self, source: 'Repository'):
+        """Clone the contents of another repository instance."""
+        if not isinstance(source, Repository):
+            raise TypeError('source is not an instance of `Repository`.')
+
+        for root, dirnames, filenames in source.walk():
+            for dirname in dirnames:
+                self.create_directory(root / dirname)
+            for filename in filenames:
+                with source.open(root / filename) as handle:
+                    self.put_object_from_filelike(handle, root / filename)
+
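
    # Editorial aside, not part of the patch: `clone` copies the full hierarchy of another repository
    # by walking it, and `walk` below yields relative `pathlib.Path` roots. The second backend here is
    # hypothetical; the repositories are assumed to be constructed as in the earlier sketches.
    #
    #   target = Repository(backend=other_backend)    # `other_backend` is a second, hypothetical backend
    #   target.clone(repository)                      # copies every directory and file object
    #   for root, dirnames, filenames in target.walk():
    #       ...                                       # subdirectories are yielded before their parent directory
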
+    def walk(self, path: FilePath = None) -> typing.Iterator[typing.Tuple[pathlib.Path, typing.List[str], typing.List[str]]]:
+        """Walk over the directories and files contained within this repository.
+
+        .. note:: the order of the dirname and filename lists that are returned is not necessarily sorted. This is in
+            line with the ``os.walk`` implementation where the order depends on the underlying file system used.
+
+        :param path: the relative path of the directory within the repository whose contents to walk.
+        :return: tuples of root, dirnames and filenames just like ``os.walk``, with the exception that the root path is
+            always relative with respect to the repository root instead of an absolute path, and that it is an instance
+            of ``pathlib.Path`` instead of a normal string.
+        """
+        path = self._pre_process_path(path)
+
+        directory = self.get_directory(path)
+        dirnames = [obj.name for obj in directory.objects.values() if obj.file_type == FileType.DIRECTORY]
+        filenames = [obj.name for obj in directory.objects.values() if obj.file_type == FileType.FILE]
+
+        if dirnames:
+            for dirname in dirnames:
+                yield from self.walk(path / dirname)
+
+        yield path, dirnames, filenames
diff --git a/aiida/restapi/translator/nodes/node.py b/aiida/restapi/translator/nodes/node.py
index 8b8e4d3d2c..a2beb59869 100644
--- a/aiida/restapi/translator/nodes/node.py
+++ b/aiida/restapi/translator/nodes/node.py
@@ -466,7 +466,7 @@ def get_repo_list(node, filename=''):
         """
         try:
             flist = node.list_objects(filename)
-        except IOError:
+        except NotADirectoryError:
             raise RestInputValidationError(f'{filename} is not a directory in this repository')
         response = []
         for fobj in flist:
@@ -487,7 +487,7 @@ def get_repo_contents(node, filename=''):
             try:
                 data = node.get_object_content(filename, mode='rb')
                 return data
-            except IOError:
+            except FileNotFoundError:
                 raise RestInputValidationError('No such file is present')
         raise RestValidationError('filename is not provided')
diff --git a/aiida/tools/dbimporters/baseclasses.py b/aiida/tools/dbimporters/baseclasses.py
index 64db13dac1..f978755a7e 100644
--- a/aiida/tools/dbimporters/baseclasses.py
+++ b/aiida/tools/dbimporters/baseclasses.py
@@ -333,8 +333,8 @@ def get_upf_node(self, store=False):
 
         # Prefixing with an ID in order to start file name with the name
         # of the described element.
-        with tempfile.NamedTemporaryFile(mode='w+', prefix=self.source['id']) as handle:
-            handle.write(self.contents)
+        with tempfile.NamedTemporaryFile(mode='w+b', prefix=self.source['id']) as handle:
+            handle.write(self.contents.encode('utf-8'))
             handle.flush()
             upfnode = UpfData(file=handle.name, source=self.source)
diff --git a/aiida/tools/graph/deletions.py b/aiida/tools/graph/deletions.py
index b151f7d3c8..26b6bd81a2 100644
--- a/aiida/tools/graph/deletions.py
+++ b/aiida/tools/graph/deletions.py
@@ -15,7 +15,7 @@
 from aiida.backends.utils import delete_nodes_and_connections
 from aiida.common.log import AIIDA_LOGGER
 from aiida.common.warnings import AiidaDeprecationWarning
-from aiida.orm import Group, Node, QueryBuilder, load_node
+from aiida.orm import Group, Node, QueryBuilder
 from aiida.tools.graph.graph_traversers import get_nodes_delete
 
 __all__ = ('DELETE_LOGGER', 'delete_nodes', 'delete_group_nodes')
@@ -123,20 +123,8 @@ def _missing_callback(_pks: Iterable[int]):
     if not pks_set_to_delete:
         return (pks_set_to_delete, True)
 
-    # Recover the list of folders to delete before actually deleting the nodes. I will delete the folders only later,
-    # so that if there is a problem during the deletion of the nodes in the DB, I don't delete the folders
-    repositories = [load_node(pk)._repository for pk in pks_set_to_delete]  # pylint: disable=protected-access
-
     DELETE_LOGGER.info('Starting node deletion...')
     delete_nodes_and_connections(pks_set_to_delete)
-
-    DELETE_LOGGER.info('Nodes deleted from database, deleting files from the repository now...')
-
-    # If we are here, we managed to delete the entries from the DB.
- # I can now delete the folders - for repository in repositories: - repository.erase(force=True) - DELETE_LOGGER.info('Deletion of nodes completed.') return (pks_set_to_delete, True) diff --git a/aiida/tools/importexport/archive/migrations/__init__.py b/aiida/tools/importexport/archive/migrations/__init__.py index b2d0c76de2..436597e677 100644 --- a/aiida/tools/importexport/archive/migrations/__init__.py +++ b/aiida/tools/importexport/archive/migrations/__init__.py @@ -8,8 +8,7 @@ # For further information please visit http://www.aiida.net # ########################################################################### """Migration archive files from old export versions to the newest, used by `verdi export migrate` command.""" -from pathlib import Path -from typing import Any, Callable, Dict, Tuple, Union +from typing import Callable, Dict, Tuple from aiida.tools.importexport.archive.common import CacheFolder @@ -22,6 +21,7 @@ from .v07_to_v08 import migrate_v7_to_v8 from .v08_to_v09 import migrate_v8_to_v9 from .v09_to_v10 import migrate_v9_to_v10 +from .v10_to_v11 import migrate_v10_to_v11 # version from -> version to, function which acts on the cache folder _vtype = Dict[str, Tuple[str, Callable[[CacheFolder], None]]] @@ -34,5 +34,6 @@ '0.6': ('0.7', migrate_v6_to_v7), '0.7': ('0.8', migrate_v7_to_v8), '0.8': ('0.9', migrate_v8_to_v9), - '0.9': ('0.10', migrate_v9_to_v10) + '0.9': ('0.10', migrate_v9_to_v10), + '0.10': ('0.11', migrate_v10_to_v11), } diff --git a/aiida/tools/importexport/archive/migrations/v03_to_v04.py b/aiida/tools/importexport/archive/migrations/v03_to_v04.py index b0c3fc97df..5440f77305 100644 --- a/aiida/tools/importexport/archive/migrations/v03_to_v04.py +++ b/aiida/tools/importexport/archive/migrations/v03_to_v04.py @@ -342,14 +342,12 @@ def migration_trajectory_symbols_to_attribute(data: dict, folder: CacheFolder): """Apply migrations: 0026 - REV. 1.0.26 and 0027 - REV. 1.0.27 Create the symbols attribute from the repository array for all `TrajectoryData` nodes. """ - from aiida.tools.importexport.common.config import NODES_EXPORT_SUBFOLDER - path = folder.get_path(flush=False) for node_id, content in data['export_data'].get('Node', {}).items(): if content.get('type', '') == 'node.data.array.trajectory.TrajectoryData.': uuid = content['uuid'] - symbols_path = path.joinpath(NODES_EXPORT_SUBFOLDER, uuid[0:2], uuid[2:4], uuid[4:], 'path', 'symbols.npy') + symbols_path = path.joinpath('nodes', uuid[0:2], uuid[2:4], uuid[4:], 'path', 'symbols.npy') symbols = np.load(os.path.abspath(symbols_path)).tolist() symbols_path.unlink() # Update 'node_attributes' diff --git a/aiida/tools/importexport/archive/migrations/v10_to_v11.py b/aiida/tools/importexport/archive/migrations/v10_to_v11.py new file mode 100644 index 0000000000..9ed2260fa4 --- /dev/null +++ b/aiida/tools/importexport/archive/migrations/v10_to_v11.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. # +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Migration from v0.10 to v0.11, used by `verdi export migrate` command. + +This migration deals with the file repository. 
+In the old version, the files of each node were stored in the archive's `nodes` folder, sharded by node UUID
+in a `raw_input` or `path` subfolder. In the new version, all files are written to a single disk object store
+container and each node references them through the new `repository_metadata` field.
+"""
+import os
+import shutil
+
+from aiida.tools.importexport.archive.common import CacheFolder
+from .utils import verify_metadata_version, update_metadata
+
+
+def migrate_repository(metadata, data, folder):
+    """Migrate the file repository to a disk object store container."""
+    from disk_objectstore import Container
+    from aiida.repository import Repository, File
+    from aiida.repository.backend import DiskObjectStoreRepositoryBackend
+
+    container = Container(os.path.join(folder.get_path(), 'container'))
+    container.init_container()
+    backend = DiskObjectStoreRepositoryBackend(container=container)
+    repository = Repository(backend=backend)
+
+    for values in data.get('export_data', {}).get('Node', {}).values():
+        uuid = values['uuid']
+        dirpath_calc = os.path.join(folder.get_path(), 'nodes', uuid[:2], uuid[2:4], uuid[4:], 'raw_input')
+        dirpath_data = os.path.join(folder.get_path(), 'nodes', uuid[:2], uuid[2:4], uuid[4:], 'path')
+
+        if os.path.isdir(dirpath_calc):
+            dirpath = dirpath_calc
+        elif os.path.isdir(dirpath_data):
+            dirpath = dirpath_data
+        else:
+            raise AssertionError('node repository contains neither `raw_input` nor `path` subfolder.')
+
+        if not os.listdir(dirpath):
+            continue
+
+        repository.put_object_from_tree(dirpath)
+        values['repository_metadata'] = repository.serialize()
+        # Artificially reset the internal virtual hierarchy so the next node starts from an empty repository
+        repository._directory = File()  # pylint: disable=protected-access
+
+    container.pack_all_loose(compress=False)
+    shutil.rmtree(os.path.join(folder.get_path(), 'nodes'))
+
+    metadata['all_fields_info']['Node']['repository_metadata'] = {}
+
+
+def migrate_v10_to_v11(folder: CacheFolder):
+    """Migration of export files from v0.10 to v0.11."""
+    old_version = '0.10'
+    new_version = '0.11'
+
+    _, metadata = folder.load_json('metadata.json')
+
+    verify_metadata_version(metadata, old_version)
+    update_metadata(metadata, new_version)
+
+    _, data = folder.load_json('data.json')
+
+    # Apply migrations
+    migrate_repository(metadata, data, folder)
+
+    folder.write_json('metadata.json', metadata)
+    folder.write_json('data.json', data)
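
For orientation, the new `'0.10': ('0.11', migrate_v10_to_v11)` entry added above is dispatched through the
version map in `aiida/tools/importexport/archive/migrations/__init__.py`. The sketch below shows how such a
chained migration is typically driven; the name `MIGRATE_FUNCTIONS` for that map is an assumption (the
variable name sits outside the hunk context shown here) and the snippet is illustrative, not part of this patch.

    from aiida.tools.importexport.archive.common import CacheFolder
    from aiida.tools.importexport.archive.migrations import MIGRATE_FUNCTIONS  # assumed name of the version map

    def migrate_archive(folder: CacheFolder, current: str, target: str = '0.11'):
        """Apply the in-place migrations one after the other until the target version is reached."""
        while current != target:
            current, migration = MIGRATE_FUNCTIONS[current]
            migration(folder)  # for '0.10' this calls migrate_v10_to_v11, repacking files into the container
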
diff --git a/aiida/tools/importexport/archive/readers.py b/aiida/tools/importexport/archive/readers.py
index 65da299ab2..190e654b86 100644
--- a/aiida/tools/importexport/archive/readers.py
+++ b/aiida/tools/importexport/archive/readers.py
@@ -10,24 +10,23 @@
 """Archive reader classes."""
 from abc import ABC, abstractmethod
 import json
-import os
 from pathlib import Path
 import tarfile
 from types import TracebackType
-from typing import Any, Callable, cast, Dict, Iterable, Iterator, List, Optional, Set, Tuple, Type
+from typing import Any, Callable, cast, Dict, Iterator, List, Optional, Set, Tuple, Type
 import zipfile
 
 from distutils.version import StrictVersion
 from archive_path import TarPath, ZipPath, read_file_in_tar, read_file_in_zip
+from disk_objectstore import Container
 
 from aiida.common.log import AIIDA_LOGGER
 from aiida.common.exceptions import InvalidOperation
-from aiida.common.folders import Folder, SandboxFolder
-from aiida.tools.importexport.common.config import EXPORT_VERSION, ExportFileFormat, NODES_EXPORT_SUBFOLDER
+from aiida.common.folders import SandboxFolder
+from aiida.tools.importexport.common.config import EXPORT_VERSION, ExportFileFormat
 from aiida.tools.importexport.common.exceptions import (CorruptArchive, IncompatibleArchiveVersionError)
 from aiida.tools.importexport.archive.common import (ArchiveMetadata, null_callback)
 from aiida.tools.importexport.common.config import NODE_ENTITY_NAME, GROUP_ENTITY_NAME
-from aiida.tools.importexport.common.utils import export_shard_uuid
 
 __all__ = (
     'ArchiveReaderAbstract',
@@ -184,33 +183,8 @@ def iter_link_data(self) -> Iterator[dict]:
        """Iterate over links: {'input': , 'output': , 'label':