Skip to content

Commit

Permalink
Merge pull request #4345 from sphuber/fix/3445/disk-object-store-repo…
Browse files Browse the repository at this point in the history
…sitory

New file repository implementation based on a disk object store
  • Loading branch information
sphuber authored Apr 28, 2021
2 parents 4d90f37 + 0e250e6 commit 16b326b
Show file tree
Hide file tree
Showing 132 changed files with 8,073 additions and 3,437 deletions.
1 change: 0 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
include aiida/cmdline/templates/*.tpl
include aiida/manage/backup/backup_info.json.tmpl
include aiida/manage/configuration/schema/*.json
include setup.json
include AUTHORS.txt
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def verify_node_uuid_uniqueness(_, __):
:raises: IntegrityError if database contains nodes with duplicate UUIDS.
"""
from aiida.manage.database.integrity.duplicate_uuid import verify_uuid_uniqueness
from aiida.backends.general.migrations.utils import verify_uuid_uniqueness
verify_uuid_uniqueness(table='db_dbnode')


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
###########################################################################
# Copyright (c), The AiiDA team. All rights reserved. #
# This file is part of the AiiDA code. #
# #
# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core #
# For further information on the license, see the LICENSE.txt file #
# For further information please visit http://www.aiida.net #
###########################################################################
# pylint: disable=invalid-name,too-few-public-methods
"""Migration to add the `repository_metadata` JSONB column."""

# pylint: disable=no-name-in-module,import-error
import django.contrib.postgres.fields.jsonb
from django.db import migrations
from aiida.backends.djsite.db.migrations import upgrade_schema_version

REVISION = '1.0.46'
DOWN_REVISION = '1.0.45'


class Migration(migrations.Migration):
    """Schema migration adding the nullable `repository_metadata` JSONB column to the `dbnode` model."""

    # Applied directly on top of the previous migration in the `db` app.
    dependencies = [('db', '0045_dbgroup_extras')]

    operations = [
        # The field is declared with `null=True`, so no default has to be provided for existing rows.
        migrations.AddField(
            'dbnode',
            'repository_metadata',
            field=django.contrib.postgres.fields.jsonb.JSONField(null=True),
        ),
        # Paired with the structural change so the stored schema version goes from DOWN_REVISION to REVISION.
        upgrade_schema_version(REVISION, DOWN_REVISION),
    ]
132 changes: 132 additions & 0 deletions aiida/backends/djsite/db/migrations/0047_migrate_repository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-
###########################################################################
# Copyright (c), The AiiDA team. All rights reserved. #
# This file is part of the AiiDA code. #
# #
# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core #
# For further information on the license, see the LICENSE.txt file #
# For further information please visit http://www.aiida.net #
###########################################################################
# pylint: disable=invalid-name,too-few-public-methods
"""Migrate the file repository to the new disk object store based implementation."""
# pylint: disable=no-name-in-module,import-error
from django.core.exceptions import ObjectDoesNotExist
from django.db import migrations

from aiida.backends.djsite.db.migrations import upgrade_schema_version
from aiida.backends.general.migrations import utils
from aiida.cmdline.utils import echo

REVISION = '1.0.47'
DOWN_REVISION = '1.0.46'

REPOSITORY_UUID_KEY = 'repository|uuid'


def _write_json_log(prefix, payload):
    """Dump ``payload`` as JSON into a new persistent file in the current directory and return its name.

    :param prefix: prefix for the name of the generated file.
    :param payload: JSON-serializable object to write.
    :return: the name of the file that was written, as reported by the file handle.
    """
    import json
    from tempfile import NamedTemporaryFile

    # `delete=False` keeps the file around after the handle is closed, since the user is pointed to it afterwards.
    with NamedTemporaryFile(prefix=prefix, suffix='.json', dir='.', mode='w+', delete=False) as handle:
        json.dump(payload, handle)

    return handle.name


def migrate_repository(apps, schema_editor):
    """Migrate the legacy file repository and record the resulting metadata on the nodes.

    The legacy repository is processed in 256 shards (``00`` through ``ff``). For each shard the mapping returned by
    ``utils.migrate_legacy_repository`` is written to the ``repository_metadata`` column of the matching nodes.
    Afterwards the container UUID of the profile's repository is stored in the ``db_dbsetting`` table, unless a row
    with that key already exists. For non-test profiles, warnings are echoed (and JSON log files are written to the
    current working directory) for repository folders without a matching node and for folders reported as missing
    their expected subfolder.

    :param apps: the Django app registry passed to ``RunPython`` callables, used to obtain the ``DbNode`` model.
    :param schema_editor: the Django schema editor, whose connection is used to execute the raw SQL statement.
    """
    # pylint: disable=too-many-locals
    import pathlib
    from aiida.common.progress_reporter import set_progress_bar_tqdm, get_progress_reporter
    from aiida.manage.configuration import get_profile

    DbNode = apps.get_model('db', 'DbNode')

    profile = get_profile()
    node_count = DbNode.objects.count()
    missing_node_uuids = []
    missing_repo_folder = []
    shard_count = 256

    set_progress_bar_tqdm()

    with get_progress_reporter()(total=shard_count, desc='Migrating file repository') as progress:
        for i in range(shard_count):

            # Two-digit, zero-padded lowercase hexadecimal shard name: '00', '01', ..., 'ff'.
            shard = '%.2x' % i  # noqa flynt
            progress.set_description_str(f'Migrating file repository: shard {shard}')

            mapping_node_repository_metadata, missing_sub_repo_folder = utils.migrate_legacy_repository(
                node_count, shard
            )

            if missing_sub_repo_folder:
                missing_repo_folder.extend(missing_sub_repo_folder)
                del missing_sub_repo_folder

            # A `None` mapping means there is nothing to store for this shard. Note that the progress bar is
            # deliberately left untouched in that case, as in the original implementation.
            if mapping_node_repository_metadata is None:
                continue

            for node_uuid, repository_metadata in mapping_node_repository_metadata.items():

                # If `repository_metadata` is `{}` or `None`, we skip it, as we can leave the column default `null`.
                if not repository_metadata:
                    continue

                try:
                    node = DbNode.objects.get(uuid=node_uuid)
                except ObjectDoesNotExist:
                    # This can happen if the node was deleted but the repo folder wasn't, or the repo folder just
                    # never corresponded to an actual node. In any case, we don't want to fail but just log it.
                    missing_node_uuids.append((node_uuid, repository_metadata))
                else:
                    node.repository_metadata = repository_metadata
                    node.save()

            # Release the (potentially large) mapping before the next shard is processed.
            del mapping_node_repository_metadata
            progress.update()

    # Store the UUID of the repository container in the `DbSetting` table. Note that for new databases, the profile
    # setup will already have stored the UUID and so it should be skipped, or an exception for a duplicate key will be
    # raised. This migration step is only necessary for existing databases that are migrated.
    container_id = profile.get_repository_container().container_id
    with schema_editor.connection.cursor() as cursor:
        # The values are bound as query parameters instead of being formatted into the statement, so quoting is
        # handled by the database driver. The key comes from the module constant rather than a duplicated literal.
        cursor.execute(
            """
            INSERT INTO db_dbsetting (key, val, description, time)
            VALUES (%s, to_json(%s::text), 'Repository UUID', current_timestamp)
            ON CONFLICT (key) DO NOTHING;
            """,
            [REPOSITORY_UUID_KEY, str(container_id)],
        )

    if not profile.is_test_profile:

        if missing_node_uuids:
            filename = _write_json_log('migration-repository-missing-nodes-', missing_node_uuids)
            echo.echo_warning(
                '\nDetected node repository folders for nodes that do not exist in the database. The UUIDs of '
                f'those nodes have been written to a log file: {filename}'
            )

        if missing_repo_folder:
            filename = _write_json_log('migration-repository-missing-subfolder-', missing_repo_folder)
            echo.echo_warning(
                '\nDetected repository folders that were missing the required subfolder `path` or `raw_input`.'
                f' The paths of those nodes repository folders have been written to a log file: {filename}'
            )

        # If there were no nodes, most likely a new profile, there is no need to print the warning
        if node_count:
            echo.echo_warning(
                '\nMigrated file repository to the new disk object store. The old repository has not been deleted '
                f'out of safety and can be found at {pathlib.Path(profile.repository_path, "repository")}.'
            )


class Migration(migrations.Migration):
    """Data migration moving the file repository to the new disk object store based implementation."""

    # Requires the `repository_metadata` column added by the preceding migration.
    dependencies = [('db', '0046_add_node_repository_metadata')]

    operations = [
        # The reverse direction is a no-op: unapplying this step runs no code.
        migrations.RunPython(code=migrate_repository, reverse_code=migrations.RunPython.noop),
        # Paired with the data migration so the stored schema version goes from DOWN_REVISION to REVISION.
        upgrade_schema_version(REVISION, DOWN_REVISION),
    ]
2 changes: 1 addition & 1 deletion aiida/backends/djsite/db/migrations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class DeserializationException(AiidaException):
pass


LATEST_MIGRATION = '0045_dbgroup_extras'
LATEST_MIGRATION = '0047_migrate_repository'


def _update_schema_version(version, apps, _):
Expand Down
1 change: 1 addition & 0 deletions aiida/backends/djsite/db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ class DbNode(m.Model):
attributes = JSONField(default=dict, null=True)
# JSON Extras
extras = JSONField(default=dict, null=True)
repository_metadata = JSONField(null=True)

objects = m.Manager()
# Return aiida Node instances or their subclasses instead of DbNode instances
Expand Down
6 changes: 3 additions & 3 deletions aiida/backends/djsite/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def get_schema_generation_database(self):
from django.db.utils import ProgrammingError
from aiida.manage.manager import get_manager

backend = get_manager()._load_backend(schema_check=False) # pylint: disable=protected-access
backend = get_manager()._load_backend(schema_check=False, repository_check=False) # pylint: disable=protected-access

try:
result = backend.execute_raw(r"""SELECT tval FROM db_dbsetting WHERE key = 'schema_generation';""")
Expand All @@ -104,7 +104,7 @@ def get_schema_version_database(self):
from django.db.utils import ProgrammingError
from aiida.manage.manager import get_manager

backend = get_manager()._load_backend(schema_check=False) # pylint: disable=protected-access
backend = get_manager()._load_backend(schema_check=False, repository_check=False) # pylint: disable=protected-access

try:
result = backend.execute_raw(r"""SELECT tval FROM db_dbsetting WHERE key = 'db|schemaversion';""")
Expand All @@ -129,7 +129,7 @@ def _migrate_database_generation(self):
from aiida.manage.manager import get_manager
super()._migrate_database_generation()

backend = get_manager()._load_backend(schema_check=False) # pylint: disable=protected-access
backend = get_manager()._load_backend(schema_check=False, repository_check=False) # pylint: disable=protected-access
backend.execute_raw(r"""DELETE FROM django_migrations WHERE app = 'db';""")
backend.execute_raw(
r"""INSERT INTO django_migrations (app, name, applied) VALUES ('db', '0001_initial', NOW());"""
Expand Down
Loading

1 comment on commit 16b326b

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark 'pytest-benchmarks:ubuntu-18.04,django'.
Benchmark result of this commit is worse than the previous benchmark result exceeding threshold 2.

Benchmark suite Current: 16b326b Previous: 4d90f37 Ratio
tests/benchmark/test_nodes.py::test_store_with_object 94.22083026314499 iter/sec (stddev: 0.0011289) 210.5721856480447 iter/sec (stddev: 0.00077759) 2.23

This comment was automatically generated by workflow using github-action-benchmark.

CC: @chrisjsewell @giovannipizzi

Please sign in to comment.