From 0f1d9d43e2b36281b5975e69e90083898c8a1b90 Mon Sep 17 00:00:00 2001 From: manukala6 Date: Fri, 6 Sep 2024 17:04:47 -0400 Subject: [PATCH 01/20] GTC-2898 Modify metadata fields --- .../orm/migrations/versions/e8a7dc28a874_.py | 39 +++++++++++++++++++ app/models/orm/mixins.py | 3 +- app/models/pydantic/metadata.py | 6 ++- tests/crud/test_versions.py | 2 +- tests/routes/datasets/test_versions.py | 9 +++-- tests/utils.py | 3 +- tests_v2/fixtures/metadata/version.py | 3 +- 7 files changed, 56 insertions(+), 9 deletions(-) create mode 100644 app/models/orm/migrations/versions/e8a7dc28a874_.py diff --git a/app/models/orm/migrations/versions/e8a7dc28a874_.py b/app/models/orm/migrations/versions/e8a7dc28a874_.py new file mode 100644 index 000000000..8bcccbf2c --- /dev/null +++ b/app/models/orm/migrations/versions/e8a7dc28a874_.py @@ -0,0 +1,39 @@ +"""empty message + +Revision ID: e8a7dc28a874 +Revises: d767b6dd2c4c +Create Date: 2024-09-06 20:37:11.512231 + +""" +from alembic import op +import sqlalchemy as sa +import sqlalchemy_utils +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = 'e8a7dc28a874' +down_revision = 'd767b6dd2c4c' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('dataset_metadata', sa.Column('spatial_resolution', sa.Numeric(), nullable=True)) + op.add_column('dataset_metadata', sa.Column('resolution_description', sa.String(), nullable=True)) + op.drop_column('dataset_metadata', 'resolution') + op.add_column('version_metadata', sa.Column('spatial_resolution', sa.Numeric(), nullable=True)) + op.add_column('version_metadata', sa.Column('resolution_description', sa.String(), nullable=True)) + op.drop_column('version_metadata', 'resolution') + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column('version_metadata', sa.Column('resolution', sa.NUMERIC(), autoincrement=False, nullable=True)) + op.drop_column('version_metadata', 'resolution_description') + op.drop_column('version_metadata', 'spatial_resolution') + op.add_column('dataset_metadata', sa.Column('resolution', sa.NUMERIC(), autoincrement=False, nullable=True)) + op.drop_column('dataset_metadata', 'resolution_description') + op.drop_column('dataset_metadata', 'spatial_resolution') + # ### end Alembic commands ### diff --git a/app/models/orm/mixins.py b/app/models/orm/mixins.py index 7577a77ac..c1cf744db 100644 --- a/app/models/orm/mixins.py +++ b/app/models/orm/mixins.py @@ -3,7 +3,8 @@ class MetadataMixin: title = db.Column(db.String) - resolution = db.Column(db.Numeric) + spatial_resolution = db.Column(db.Numeric) + resolution_description = db.Column(db.String) geographic_coverage = db.Column(db.String) update_frequency = db.Column(db.String) citation = db.Column(db.String) diff --git a/app/models/pydantic/metadata.py b/app/models/pydantic/metadata.py index f17d44b6d..01f5a7308 100644 --- a/app/models/pydantic/metadata.py +++ b/app/models/pydantic/metadata.py @@ -11,7 +11,8 @@ class CommonMetadata(BaseModel): - resolution: Optional[Union[int, float]] + spatial_resolution: Optional[Union[int, float]] + resolution_description: Optional[str] geographic_coverage: Optional[str] update_frequency: Optional[str] scale: Optional[str] @@ -21,7 +22,8 @@ class Config: schema_extra = { "examples": [ { - "resolution": 10, + "spatial_resolution": 10, + "resolution_description": "10 meters", "geographic_coverage": "Amazon Basin", "update_frequency": "Updated daily, image revisit time every 5 days", "scale": "regional", diff --git a/tests/crud/test_versions.py b/tests/crud/test_versions.py index 673c1f8ad..4dbf28d86 100644 --- a/tests/crud/test_versions.py +++ b/tests/crud/test_versions.py @@ -115,7 +115,7 @@ async def test_versions(app): metadata=metadata.dict(by_alias=True), 
change_log=[logs.dict(by_alias=True)], ) - assert row.metadata.resolution == version_metadata["resolution"] + assert row.metadata.spatial_resolution == version_metadata["spatial_resolution"] assert row.change_log[0]["date_time"] == json.loads(logs.json())["date_time"] assert row.change_log[0]["status"] == logs.dict(by_alias=True)["status"] assert row.change_log[0]["message"] == logs.dict(by_alias=True)["message"] diff --git a/tests/routes/datasets/test_versions.py b/tests/routes/datasets/test_versions.py index 2fbd1b3d9..0c3b084f9 100755 --- a/tests/routes/datasets/test_versions.py +++ b/tests/routes/datasets/test_versions.py @@ -56,8 +56,11 @@ async def test_versions(async_client: AsyncClient): assert version_data["data"]["dataset"] == dataset assert version_data["data"]["version"] == version assert ( - version_data["data"]["metadata"]["resolution"] == version_metadata["resolution"] + version_data["data"]["metadata"]["spatial_resolution"] == version_metadata["spatial_resolution"] ) + assert ( + version_data["data"]["metadata"]["resolution_description"] == version_metadata["resolution_description"] + ) assert ( version_data["data"]["metadata"]["content_date_range"]["start_date"] == version_metadata["content_date_range"]["start_date"] @@ -180,8 +183,8 @@ async def test_version_metadata(async_client: AsyncClient): assert response.status_code == 201 assert ( - response.json()["data"]["metadata"]["resolution"] - == version_metadata["resolution"] + response.json()["data"]["metadata"]["spatial_resolution"] + == version_metadata["spatial_resolution"] ) assert ( response.json()["data"]["metadata"]["content_date_range"] diff --git a/tests/utils.py b/tests/utils.py index c3dba73e6..a9a4075a0 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -32,7 +32,8 @@ version_metadata = { "content_date_range": {"start_date": "2000-01-01", "end_date": "2021-01-01"}, "last_update": "2020-01-03", - "resolution": 10, + "spatial_resolution": 10, + "resolution_description": "10 meters", } 
asset_metadata = { diff --git a/tests_v2/fixtures/metadata/version.py b/tests_v2/fixtures/metadata/version.py index c05f4e274..63a81369c 100644 --- a/tests_v2/fixtures/metadata/version.py +++ b/tests_v2/fixtures/metadata/version.py @@ -1,5 +1,6 @@ VERSION_METADATA = { "content_date_range": {"start_date": "2000-01-01", "end_date": "2021-01-01"}, "last_update": "2020-01-03", - "resolution": 10, + "spatial_resolution": 10, + "resolution_description": "10 meters", } From 786ab414dece874c2c8f82056906edfef275074d Mon Sep 17 00:00:00 2001 From: manukala6 Date: Tue, 10 Sep 2024 10:00:31 -0400 Subject: [PATCH 02/20] GTC-2898 Correct migration script --- .../{e8a7dc28a874_.py => ab98e958dc3b_.py} | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) rename app/models/orm/migrations/versions/{e8a7dc28a874_.py => ab98e958dc3b_.py} (52%) diff --git a/app/models/orm/migrations/versions/e8a7dc28a874_.py b/app/models/orm/migrations/versions/ab98e958dc3b_.py similarity index 52% rename from app/models/orm/migrations/versions/e8a7dc28a874_.py rename to app/models/orm/migrations/versions/ab98e958dc3b_.py index 8bcccbf2c..5ceca4ec2 100644 --- a/app/models/orm/migrations/versions/e8a7dc28a874_.py +++ b/app/models/orm/migrations/versions/ab98e958dc3b_.py @@ -1,8 +1,8 @@ -"""empty message +"""update resolution metadata fields -Revision ID: e8a7dc28a874 +Revision ID: ab98e958dc3b Revises: d767b6dd2c4c -Create Date: 2024-09-06 20:37:11.512231 +Create Date: 2024-09-10 13:49:58.066058 """ from alembic import op @@ -11,7 +11,7 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision = 'e8a7dc28a874' +revision = 'ab98e958dc3b' down_revision = 'd767b6dd2c4c' branch_labels = None depends_on = None @@ -19,21 +19,17 @@ def upgrade(): # ### commands auto generated by Alembic - please adjust! 
### - op.add_column('dataset_metadata', sa.Column('spatial_resolution', sa.Numeric(), nullable=True)) + op.alter_column('dataset_metadata', 'resolution', nullable=False, new_column_name='spatial_resolution') op.add_column('dataset_metadata', sa.Column('resolution_description', sa.String(), nullable=True)) - op.drop_column('dataset_metadata', 'resolution') - op.add_column('version_metadata', sa.Column('spatial_resolution', sa.Numeric(), nullable=True)) + op.alter_column('version_metadata', 'resolution', nullable=False, new_column_name='spatial_resolution') op.add_column('version_metadata', sa.Column('resolution_description', sa.String(), nullable=True)) - op.drop_column('version_metadata', 'resolution') # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### - op.add_column('version_metadata', sa.Column('resolution', sa.NUMERIC(), autoincrement=False, nullable=True)) + op.alter_column('dataset_metadata', 'spatial_resolution', nullable=False, new_column_name='resolution') op.drop_column('version_metadata', 'resolution_description') - op.drop_column('version_metadata', 'spatial_resolution') - op.add_column('dataset_metadata', sa.Column('resolution', sa.NUMERIC(), autoincrement=False, nullable=True)) + op.alter_column('version_metadata', 'spatial_resolution', nullable=False, new_column_name='resolution') op.drop_column('dataset_metadata', 'resolution_description') - op.drop_column('dataset_metadata', 'spatial_resolution') # ### end Alembic commands ### From ba03419cec6150171e43a2d7a99a266fc12c1a47 Mon Sep 17 00:00:00 2001 From: manukala6 Date: Tue, 10 Sep 2024 10:46:45 -0400 Subject: [PATCH 03/20] GTC-2898 Allow nullable resolution fields --- .../{ab98e958dc3b_.py => ef3392e8e054_.py} | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) rename app/models/orm/migrations/versions/{ab98e958dc3b_.py => ef3392e8e054_.py} (71%) diff --git a/app/models/orm/migrations/versions/ab98e958dc3b_.py 
b/app/models/orm/migrations/versions/ef3392e8e054_.py similarity index 71% rename from app/models/orm/migrations/versions/ab98e958dc3b_.py rename to app/models/orm/migrations/versions/ef3392e8e054_.py index 5ceca4ec2..05360d4fe 100644 --- a/app/models/orm/migrations/versions/ab98e958dc3b_.py +++ b/app/models/orm/migrations/versions/ef3392e8e054_.py @@ -1,8 +1,8 @@ """update resolution metadata fields -Revision ID: ab98e958dc3b +Revision ID: ef3392e8e054 Revises: d767b6dd2c4c -Create Date: 2024-09-10 13:49:58.066058 +Create Date: 2024-09-10 14:19:43.424752 """ from alembic import op @@ -11,7 +11,7 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision = 'ab98e958dc3b' +revision = 'ef3392e8e054' down_revision = 'd767b6dd2c4c' branch_labels = None depends_on = None @@ -19,17 +19,17 @@ def upgrade(): # ### commands auto generated by Alembic - please adjust! ### - op.alter_column('dataset_metadata', 'resolution', nullable=False, new_column_name='spatial_resolution') + op.alter_column('dataset_metadata', 'resolution', nullable=True, new_column_name='spatial_resolution') op.add_column('dataset_metadata', sa.Column('resolution_description', sa.String(), nullable=True)) - op.alter_column('version_metadata', 'resolution', nullable=False, new_column_name='spatial_resolution') + op.alter_column('version_metadata', 'resolution', nullable=True, new_column_name='spatial_resolution') op.add_column('version_metadata', sa.Column('resolution_description', sa.String(), nullable=True)) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! 
### - op.alter_column('dataset_metadata', 'spatial_resolution', nullable=False, new_column_name='resolution') + op.alter_column('dataset_metadata', 'spatial_resolution', nullable=True, new_column_name='resolution') op.drop_column('version_metadata', 'resolution_description') - op.alter_column('version_metadata', 'spatial_resolution', nullable=False, new_column_name='resolution') + op.alter_column('version_metadata', 'spatial_resolution', nullable=True, new_column_name='resolution') op.drop_column('dataset_metadata', 'resolution_description') # ### end Alembic commands ### From 95747d7317c66ef5cc35174465f2b6e36d8d77c0 Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Mon, 23 Sep 2024 13:02:41 +0300 Subject: [PATCH 04/20] create cog creation queue with on-demand ec2 instances --- app/models/pydantic/jobs.py | 5 +-- app/settings/globals.py | 1 + terraform/data.tf | 3 +- terraform/main.tf | 31 +++++++++++++++++++ terraform/modules/batch/main.tf | 9 +++++- terraform/modules/batch/outputs.tf | 6 +++- terraform/modules/batch/variables.tf | 1 + .../templates/container_definition.json.tmpl | 4 +++ 8 files changed, 55 insertions(+), 5 deletions(-) diff --git a/app/models/pydantic/jobs.py b/app/models/pydantic/jobs.py index 295358005..a44030595 100644 --- a/app/models/pydantic/jobs.py +++ b/app/models/pydantic/jobs.py @@ -4,6 +4,7 @@ from ...settings.globals import ( AURORA_JOB_QUEUE, + COG_JOB_QUEUE, DATA_LAKE_JOB_QUEUE, DEFAULT_JOB_DURATION, GDAL_PYTHON_JOB_DEFINITION, @@ -138,9 +139,9 @@ class PixETLJob(Job): class GDALCOGJob(Job): - """Use for creating COG files using GDAL Python docker in PixETL queue.""" + """Use for creating COG files using GDAL Python docker in COG queue.""" - job_queue = PIXETL_JOB_QUEUE + job_queue = COG_JOB_QUEUE job_definition = GDAL_PYTHON_JOB_DEFINITION vcpus = 8 memory = 64000 diff --git a/app/settings/globals.py b/app/settings/globals.py index c72037fe7..df73dbf94 100644 --- a/app/settings/globals.py +++ b/app/settings/globals.py @@ -116,6 
+116,7 @@ MAX_MEM = config("MAX_MEM", cast=int, default=760000) PIXETL_JOB_DEFINITION = config("PIXETL_JOB_DEFINITION", cast=str) PIXETL_JOB_QUEUE = config("PIXETL_JOB_QUEUE", cast=str) +COG_JOB_QUEUE = config("COG_JOB_QUEUE", cast=str) PIXETL_CORES = config("PIXETL_CORES", cast=int, default=48) PIXETL_MAX_MEM = config("PIXETL_MAX_MEM", cast=int, default=380000) PIXETL_DEFAULT_RESAMPLING = config( diff --git a/terraform/data.tf b/terraform/data.tf index d3748b037..ba7a49cb4 100644 --- a/terraform/data.tf +++ b/terraform/data.tf @@ -68,6 +68,7 @@ data "template_file" "container_definition" { tile_cache_job_queue = module.batch_job_queues.tile_cache_job_queue_arn pixetl_job_definition = module.batch_job_queues.pixetl_job_definition_arn pixetl_job_queue = module.batch_job_queues.pixetl_job_queue_arn + cog_job_queue = module.batch_job_queues.cog_job_queue_arn raster_analysis_lambda_name = "raster-analysis-tiled_raster_analysis-default" raster_analysis_sfn_arn = data.terraform_remote_state.raster_analysis_lambda.outputs.raster_analysis_state_machine_arn service_url = local.service_url @@ -190,4 +191,4 @@ data "template_file" "step_function_policy" { vars = { raster_analysis_state_machine_arn = data.terraform_remote_state.raster_analysis_lambda.outputs.raster_analysis_state_machine_arn } -} \ No newline at end of file +} diff --git a/terraform/main.tf b/terraform/main.tf index 263d70da0..f9df92ad0 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -183,12 +183,43 @@ module "batch_data_lake_writer" { compute_environment_name = "data_lake_writer" } +module "batch_cog_creator" { + source = "git::https://github.com/wri/gfw-terraform-modules.git//terraform/modules/compute_environment?ref=v0.4.2.3" + ecs_role_policy_arns = [ + aws_iam_policy.query_batch_jobs.arn, + aws_iam_policy.s3_read_only.arn, + data.terraform_remote_state.core.outputs.iam_policy_s3_write_data-lake_arn, + data.terraform_remote_state.core.outputs.secrets_postgresql-reader_policy_arn, + 
data.terraform_remote_state.core.outputs.secrets_postgresql-writer_policy_arn, + data.terraform_remote_state.core.outputs.secrets_read-gfw-gee-export_policy_arn + ] + key_pair = var.key_pair + max_vcpus = var.data_lake_max_vcpus + project = local.project + security_group_ids = [ + data.terraform_remote_state.core.outputs.default_security_group_id, + data.terraform_remote_state.core.outputs.postgresql_security_group_id + ] + subnets = data.terraform_remote_state.core.outputs.private_subnet_ids + suffix = local.name_suffix + tags = local.batch_tags + use_ephemeral_storage = true + launch_type = "EC2" + instance_types = [ + "r6id.large", "r6id.xlarge", "r6id.2xlarge", "r6id.4xlarge", "r6id.8xlarge", "r6id.12xlarge", "r6id.16xlarge", "r6id.24xlarge", + "r5ad.large", "r5ad.xlarge", "r5ad.2xlarge", "r5ad.4xlarge", "r5ad.8xlarge", "r5ad.12xlarge", "r5ad.16xlarge", "r5ad.24xlarge", + "r5d.large", "r5d.xlarge", "r5d.2xlarge", "r5d.4xlarge", "r5d.8xlarge", "r5d.12xlarge", "r5d.16xlarge", "r5d.24xlarge" + ] + compute_environment_name = "cog_creator" +} + module "batch_job_queues" { source = "./modules/batch" aurora_compute_environment_arn = module.batch_aurora_writer.arn data_lake_compute_environment_arn = module.batch_data_lake_writer.arn pixetl_compute_environment_arn = module.batch_data_lake_writer.arn tile_cache_compute_environment_arn = module.batch_data_lake_writer.arn + cog_compute_environment_arn = module.cog_creator.arn environment = var.environment name_suffix = local.name_suffix project = local.project diff --git a/terraform/modules/batch/main.tf b/terraform/modules/batch/main.tf index 3433ccf4c..e62c38214 100644 --- a/terraform/modules/batch/main.tf +++ b/terraform/modules/batch/main.tf @@ -52,6 +52,13 @@ resource "aws_batch_job_queue" "pixetl" { depends_on = [var.pixetl_compute_environment_arn] } +resource "aws_batch_job_queue" "cog" { + name = substr("${var.project}-cog-job-queue${var.name_suffix}", 0, 64) + state = "ENABLED" + priority = 1 + 
compute_environments = [var.cog_compute_environment_arn] + depends_on = [var.pixetl_compute_environment_arn] +} resource "aws_batch_job_definition" "tile_cache" { name = substr("${var.project}-tile_cache${var.name_suffix}", 0, 64) @@ -190,4 +197,4 @@ data "template_file" "ecs-task_assume" { vars = { service = "ecs-tasks" } -} \ No newline at end of file +} diff --git a/terraform/modules/batch/outputs.tf b/terraform/modules/batch/outputs.tf index 9d91ec956..863bb90cd 100644 --- a/terraform/modules/batch/outputs.tf +++ b/terraform/modules/batch/outputs.tf @@ -38,6 +38,10 @@ output "pixetl_job_queue_arn" { value = aws_batch_job_queue.pixetl.arn } +output "cog_job_queue_arn" { + value = aws_batch_job_queue.cog.arn +} + output "tile_cache_job_definition_arn" { value = aws_batch_job_definition.tile_cache.arn } @@ -48,4 +52,4 @@ output "tile_cache_job_definition" { output "tile_cache_job_queue_arn" { value = aws_batch_job_queue.tile_cache.arn -} \ No newline at end of file +} diff --git a/terraform/modules/batch/variables.tf b/terraform/modules/batch/variables.tf index 3d6e72aae..7c03c859f 100644 --- a/terraform/modules/batch/variables.tf +++ b/terraform/modules/batch/variables.tf @@ -2,6 +2,7 @@ variable "project" { type = string } variable "name_suffix" { type = string } variable "aurora_compute_environment_arn" { type = string } variable "data_lake_compute_environment_arn" { type = string } +variable "cog_compute_environment_arn" { type = string } variable "tile_cache_compute_environment_arn" { type = string } variable "pixetl_compute_environment_arn" { type = string } variable "gdal_repository_url" { type = string } diff --git a/terraform/templates/container_definition.json.tmpl b/terraform/templates/container_definition.json.tmpl index f031b29ca..d62cb9c47 100644 --- a/terraform/templates/container_definition.json.tmpl +++ b/terraform/templates/container_definition.json.tmpl @@ -73,6 +73,10 @@ "name": "PIXETL_JOB_QUEUE", "value": "${pixetl_job_queue}" }, + { + 
"name": "COG_JOB_QUEUE", + "value": "${cog_job_queue}" + }, { "name": "API_URL", "value": "${service_url}" From 0f767205270c2da804e6a0e35ecec88f30247966 Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Mon, 23 Sep 2024 13:37:24 +0300 Subject: [PATCH 05/20] store ec2 instances in a variable --- terraform/main.tf | 24 ++++++++---------------- terraform/variables.tf | 10 ++++++++++ 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/terraform/main.tf b/terraform/main.tf index f9df92ad0..17c5f92a2 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -174,12 +174,8 @@ module "batch_data_lake_writer" { tags = local.batch_tags use_ephemeral_storage = true # SPOT is actually the default, this is just a placeholder until GTC-1791 is done - launch_type = "SPOT" - instance_types = [ - "r6id.large", "r6id.xlarge", "r6id.2xlarge", "r6id.4xlarge", "r6id.8xlarge", "r6id.12xlarge", "r6id.16xlarge", "r6id.24xlarge", - "r5ad.large", "r5ad.xlarge", "r5ad.2xlarge", "r5ad.4xlarge", "r5ad.8xlarge", "r5ad.12xlarge", "r5ad.16xlarge", "r5ad.24xlarge", - "r5d.large", "r5d.xlarge", "r5d.2xlarge", "r5d.4xlarge", "r5d.8xlarge", "r5d.12xlarge", "r5d.16xlarge", "r5d.24xlarge" - ] + launch_type = "SPOT" + instance_types = var.data_lake_writer_instance_types compute_environment_name = "data_lake_writer" } @@ -200,16 +196,12 @@ module "batch_cog_creator" { data.terraform_remote_state.core.outputs.default_security_group_id, data.terraform_remote_state.core.outputs.postgresql_security_group_id ] - subnets = data.terraform_remote_state.core.outputs.private_subnet_ids - suffix = local.name_suffix - tags = local.batch_tags - use_ephemeral_storage = true - launch_type = "EC2" - instance_types = [ - "r6id.large", "r6id.xlarge", "r6id.2xlarge", "r6id.4xlarge", "r6id.8xlarge", "r6id.12xlarge", "r6id.16xlarge", "r6id.24xlarge", - "r5ad.large", "r5ad.xlarge", "r5ad.2xlarge", "r5ad.4xlarge", "r5ad.8xlarge", "r5ad.12xlarge", "r5ad.16xlarge", "r5ad.24xlarge", - "r5d.large", "r5d.xlarge", 
"r5d.2xlarge", "r5d.4xlarge", "r5d.8xlarge", "r5d.12xlarge", "r5d.16xlarge", "r5d.24xlarge" - ] + subnets = data.terraform_remote_state.core.outputs.private_subnet_ids + suffix = local.name_suffix + tags = local.batch_tags + use_ephemeral_storage = true + launch_type = "EC2" + instance_types = var.data_lake_writer_instance_types compute_environment_name = "cog_creator" } diff --git a/terraform/variables.tf b/terraform/variables.tf index 8b56728b9..142736b0f 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -156,3 +156,13 @@ variable "api_gateway_url" { description = "The invoke url of the API Gateway stage" default = "" } + +variable "data_lake_writer_instance_types" { + type = list(string) + description = "memory optimized EC2 instances with local NVMe SSDs for data lake writer batche queues" + default = [ + "r6id.large", "r6id.xlarge", "r6id.2xlarge", "r6id.4xlarge", "r6id.8xlarge", "r6id.12xlarge", "r6id.16xlarge", "r6id.24xlarge", + "r5ad.large", "r5ad.xlarge", "r5ad.2xlarge", "r5ad.4xlarge", "r5ad.8xlarge", "r5ad.12xlarge", "r5ad.16xlarge", "r5ad.24xlarge", + "r5d.large", "r5d.xlarge", "r5d.2xlarge", "r5d.4xlarge", "r5d.8xlarge", "r5d.12xlarge", "r5d.16xlarge", "r5d.24xlarge" + ] +} From 79a7d2e04225a2c3a5415111f0e6bb9107cb1ace Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Mon, 23 Sep 2024 16:59:56 +0300 Subject: [PATCH 06/20] cog -> cogify --- app/models/pydantic/jobs.py | 4 ++-- app/settings/globals.py | 2 +- docker-compose.dev.yml | 1 + docker-compose.prod.yml | 1 + docker-compose.test.yml | 1 + terraform/data.tf | 2 +- terraform/modules/batch/outputs.tf | 2 +- terraform/templates/container_definition.json.tmpl | 4 ++-- 8 files changed, 10 insertions(+), 7 deletions(-) diff --git a/app/models/pydantic/jobs.py b/app/models/pydantic/jobs.py index a44030595..52bc0e1bb 100644 --- a/app/models/pydantic/jobs.py +++ b/app/models/pydantic/jobs.py @@ -4,7 +4,7 @@ from ...settings.globals import ( AURORA_JOB_QUEUE, - COG_JOB_QUEUE, + 
COGIFY_JOB_QUEUE, DATA_LAKE_JOB_QUEUE, DEFAULT_JOB_DURATION, GDAL_PYTHON_JOB_DEFINITION, @@ -141,7 +141,7 @@ class PixETLJob(Job): class GDALCOGJob(Job): """Use for creating COG files using GDAL Python docker in COG queue.""" - job_queue = COG_JOB_QUEUE + job_queue = COGIFY_JOB_QUEUE job_definition = GDAL_PYTHON_JOB_DEFINITION vcpus = 8 memory = 64000 diff --git a/app/settings/globals.py b/app/settings/globals.py index df73dbf94..9d3fa4156 100644 --- a/app/settings/globals.py +++ b/app/settings/globals.py @@ -116,7 +116,7 @@ MAX_MEM = config("MAX_MEM", cast=int, default=760000) PIXETL_JOB_DEFINITION = config("PIXETL_JOB_DEFINITION", cast=str) PIXETL_JOB_QUEUE = config("PIXETL_JOB_QUEUE", cast=str) -COG_JOB_QUEUE = config("COG_JOB_QUEUE", cast=str) +COGIFY_JOB_QUEUE = config("COGIFY_JOB_QUEUE", cast=str) PIXETL_CORES = config("PIXETL_CORES", cast=int, default=48) PIXETL_MAX_MEM = config("PIXETL_MAX_MEM", cast=int, default=380000) PIXETL_DEFAULT_RESAMPLING = config( diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 8a60157cf..b755f2eb6 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -39,6 +39,7 @@ services: - TILE_CACHE_CLUSTER=tile_cache_cluster - TILE_CACHE_SERVICE=tile_cache_service - PIXETL_JOB_QUEUE=pixetl_jq + - COGIFY_JOB_QUEUE=cogify_jq - API_URL=http://app_dev:80 - RASTER_ANALYSIS_LAMBDA_NAME=raster-analysis-tiled_raster_analysis-default - RW_API_URL=https://staging-api.resourcewatch.org diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 0f764358a..7f13c9f1e 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -34,6 +34,7 @@ services: - DATA_LAKE_JOB_QUEUE=data_lake_jq - TILE_CACHE_JOB_QUEUE=tile_cache_jq - PIXETL_JOB_QUEUE=pixetl_jq + - COGIFY_JOB_QUEUE=cogify_jq - RASTER_ANALYSIS_LAMBDA_NAME=raster_analysis - API_URL="http://app_dev:80" - RW_API_URL=https://api.resourcewatch.org diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 1c9c155ba..ba287eff3 100644 --- 
a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -49,6 +49,7 @@ services: - TILE_CACHE_CLUSTER=tile_cache_cluster - TILE_CACHE_SERVICE=tile_cache_service - PIXETL_JOB_QUEUE=pixetl_jq + - COGIFY_JOB_QUEUE=cogify_jq - PIXETL_CORES=1 - MAX_CORES=1 - NUM_PROCESSES=1 diff --git a/terraform/data.tf b/terraform/data.tf index ba7a49cb4..4064a2c88 100644 --- a/terraform/data.tf +++ b/terraform/data.tf @@ -68,7 +68,7 @@ data "template_file" "container_definition" { tile_cache_job_queue = module.batch_job_queues.tile_cache_job_queue_arn pixetl_job_definition = module.batch_job_queues.pixetl_job_definition_arn pixetl_job_queue = module.batch_job_queues.pixetl_job_queue_arn - cog_job_queue = module.batch_job_queues.cog_job_queue_arn + cogify_job_queue = module.batch_job_queues.cogify_job_queue_arn raster_analysis_lambda_name = "raster-analysis-tiled_raster_analysis-default" raster_analysis_sfn_arn = data.terraform_remote_state.raster_analysis_lambda.outputs.raster_analysis_state_machine_arn service_url = local.service_url diff --git a/terraform/modules/batch/outputs.tf b/terraform/modules/batch/outputs.tf index 863bb90cd..b4d458b71 100644 --- a/terraform/modules/batch/outputs.tf +++ b/terraform/modules/batch/outputs.tf @@ -38,7 +38,7 @@ output "pixetl_job_queue_arn" { value = aws_batch_job_queue.pixetl.arn } -output "cog_job_queue_arn" { +output "cogify_job_queue_arn" { value = aws_batch_job_queue.cog.arn } diff --git a/terraform/templates/container_definition.json.tmpl b/terraform/templates/container_definition.json.tmpl index d62cb9c47..b24963a12 100644 --- a/terraform/templates/container_definition.json.tmpl +++ b/terraform/templates/container_definition.json.tmpl @@ -74,8 +74,8 @@ "value": "${pixetl_job_queue}" }, { - "name": "COG_JOB_QUEUE", - "value": "${cog_job_queue}" + "name": "COGIFY_JOB_QUEUE", + "value": "${cogify_job_queue}" }, { "name": "API_URL", From 9e2e67c8582d77c2bbc535d75af79adf77d90465 Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Mon, 23 
Sep 2024 17:06:44 +0300 Subject: [PATCH 07/20] fix module name --- terraform/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/terraform/main.tf b/terraform/main.tf index 17c5f92a2..8feed191e 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -202,7 +202,7 @@ module "batch_cog_creator" { use_ephemeral_storage = true launch_type = "EC2" instance_types = var.data_lake_writer_instance_types - compute_environment_name = "cog_creator" + compute_environment_name = "batch_cog_creator" } module "batch_job_queues" { @@ -211,7 +211,7 @@ module "batch_job_queues" { data_lake_compute_environment_arn = module.batch_data_lake_writer.arn pixetl_compute_environment_arn = module.batch_data_lake_writer.arn tile_cache_compute_environment_arn = module.batch_data_lake_writer.arn - cog_compute_environment_arn = module.cog_creator.arn + cog_compute_environment_arn = module.batch_cog_creator.arn environment = var.environment name_suffix = local.name_suffix project = local.project From 451eda186d1192c061407741dfca4c38ac0b9c03 Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Mon, 23 Sep 2024 18:45:31 +0300 Subject: [PATCH 08/20] add cogify batch queue to moto --- tests/conftest.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 50af56e3a..b2d8204e7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,6 +33,7 @@ GDAL_PYTHON_JOB_DEFINITION, PIXETL_JOB_DEFINITION, PIXETL_JOB_QUEUE, + COGIFY_JOB_QUEUE, POSTGRESQL_CLIENT_JOB_DEFINITION, TILE_CACHE_BUCKET, TILE_CACHE_JOB_DEFINITION, @@ -167,6 +168,7 @@ def patch_run(self, *k, **kwargs): "s3_writer", subnet_id, sg_id, iam_arn ) pixetl_env = aws_mock.add_compute_environment("pixetl", subnet_id, sg_id, iam_arn) + cogify_env = aws_mock.add_compute_environment("cogify", subnet_id, sg_id, iam_arn) aws_mock.add_job_queue(AURORA_JOB_QUEUE, aurora_writer_env["computeEnvironmentArn"]) aws_mock.add_job_queue( @@ -175,6 +177,7 @@ def patch_run(self, *k, 
**kwargs): aws_mock.add_job_queue(DATA_LAKE_JOB_QUEUE, s3_writer_env["computeEnvironmentArn"]) aws_mock.add_job_queue(TILE_CACHE_JOB_QUEUE, s3_writer_env["computeEnvironmentArn"]) aws_mock.add_job_queue(PIXETL_JOB_QUEUE, pixetl_env["computeEnvironmentArn"]) + aws_mock.add_job_queue(COGIFY_JOB_QUEUE, cogify_env["computeEnvironmentArn"]) aws_mock.add_job_definition(GDAL_PYTHON_JOB_DEFINITION, "batch_gdal-python_test") aws_mock.add_job_definition( From 10d357e7248a602f218c70cc4eb01a5e4f8f8cf3 Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Tue, 24 Sep 2024 12:16:07 +0300 Subject: [PATCH 09/20] generalize on-demand job queue name --- app/models/pydantic/jobs.py | 6 +++--- app/settings/globals.py | 2 +- docker-compose.dev.yml | 2 +- docker-compose.prod.yml | 2 +- docker-compose.test.yml | 2 +- terraform/data.tf | 2 +- terraform/main.tf | 6 +++--- terraform/modules/batch/main.tf | 6 +++--- terraform/modules/batch/outputs.tf | 4 ++-- terraform/modules/batch/variables.tf | 2 +- terraform/templates/container_definition.json.tmpl | 4 ++-- tests/conftest.py | 6 ++++-- 12 files changed, 23 insertions(+), 21 deletions(-) diff --git a/app/models/pydantic/jobs.py b/app/models/pydantic/jobs.py index 52bc0e1bb..643292f2e 100644 --- a/app/models/pydantic/jobs.py +++ b/app/models/pydantic/jobs.py @@ -4,7 +4,7 @@ from ...settings.globals import ( AURORA_JOB_QUEUE, - COGIFY_JOB_QUEUE, + ON_DEMAND_COMPUTE_JOB_QUEUE, DATA_LAKE_JOB_QUEUE, DEFAULT_JOB_DURATION, GDAL_PYTHON_JOB_DEFINITION, @@ -139,9 +139,9 @@ class PixETLJob(Job): class GDALCOGJob(Job): - """Use for creating COG files using GDAL Python docker in COG queue.""" + """Use for creating COG files using GDAL Python docker in on-demand compute queue.""" - job_queue = COGIFY_JOB_QUEUE + job_queue = ON_DEMAND_COMPUTE_JOB_QUEUE job_definition = GDAL_PYTHON_JOB_DEFINITION vcpus = 8 memory = 64000 diff --git a/app/settings/globals.py b/app/settings/globals.py index 9d3fa4156..47d9ac728 100644 --- a/app/settings/globals.py +++ 
b/app/settings/globals.py @@ -116,7 +116,7 @@ MAX_MEM = config("MAX_MEM", cast=int, default=760000) PIXETL_JOB_DEFINITION = config("PIXETL_JOB_DEFINITION", cast=str) PIXETL_JOB_QUEUE = config("PIXETL_JOB_QUEUE", cast=str) -COGIFY_JOB_QUEUE = config("COGIFY_JOB_QUEUE", cast=str) +ON_DEMAND_COMPUTE_JOB_QUEUE = config("ON_DEMAND_COMPUTE_JOB_QUEUE", cast=str) PIXETL_CORES = config("PIXETL_CORES", cast=int, default=48) PIXETL_MAX_MEM = config("PIXETL_MAX_MEM", cast=int, default=380000) PIXETL_DEFAULT_RESAMPLING = config( diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index b755f2eb6..98a0e5f83 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -39,7 +39,7 @@ services: - TILE_CACHE_CLUSTER=tile_cache_cluster - TILE_CACHE_SERVICE=tile_cache_service - PIXETL_JOB_QUEUE=pixetl_jq - - COGIFY_JOB_QUEUE=cogify_jq + - ON_DEMAND_COMPUTE_JOB_QUEUE=cogify_jq - API_URL=http://app_dev:80 - RASTER_ANALYSIS_LAMBDA_NAME=raster-analysis-tiled_raster_analysis-default - RW_API_URL=https://staging-api.resourcewatch.org diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 7f13c9f1e..461ed1749 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -34,7 +34,7 @@ services: - DATA_LAKE_JOB_QUEUE=data_lake_jq - TILE_CACHE_JOB_QUEUE=tile_cache_jq - PIXETL_JOB_QUEUE=pixetl_jq - - COGIFY_JOB_QUEUE=cogify_jq + - ON_DEMAND_COMPUTE_JOB_QUEUE=cogify_jq - RASTER_ANALYSIS_LAMBDA_NAME=raster_analysis - API_URL="http://app_dev:80" - RW_API_URL=https://api.resourcewatch.org diff --git a/docker-compose.test.yml b/docker-compose.test.yml index ba287eff3..0288e1675 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -49,7 +49,7 @@ services: - TILE_CACHE_CLUSTER=tile_cache_cluster - TILE_CACHE_SERVICE=tile_cache_service - PIXETL_JOB_QUEUE=pixetl_jq - - COGIFY_JOB_QUEUE=cogify_jq + - ON_DEMAND_COMPUTE_JOB_QUEUE=cogify_jq - PIXETL_CORES=1 - MAX_CORES=1 - NUM_PROCESSES=1 diff --git a/terraform/data.tf b/terraform/data.tf index 
4064a2c88..d9e2861aa 100644 --- a/terraform/data.tf +++ b/terraform/data.tf @@ -68,7 +68,7 @@ data "template_file" "container_definition" { tile_cache_job_queue = module.batch_job_queues.tile_cache_job_queue_arn pixetl_job_definition = module.batch_job_queues.pixetl_job_definition_arn pixetl_job_queue = module.batch_job_queues.pixetl_job_queue_arn - cogify_job_queue = module.batch_job_queues.cogify_job_queue_arn + on_demand_compute_job_queue = module.batch_job_queues.on_demand_compute_job_queue_arn raster_analysis_lambda_name = "raster-analysis-tiled_raster_analysis-default" raster_analysis_sfn_arn = data.terraform_remote_state.raster_analysis_lambda.outputs.raster_analysis_state_machine_arn service_url = local.service_url diff --git a/terraform/main.tf b/terraform/main.tf index 8feed191e..80d5249ca 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -179,7 +179,7 @@ module "batch_data_lake_writer" { compute_environment_name = "data_lake_writer" } -module "batch_cog_creator" { +module "batch_cogify" { source = "git::https://github.com/wri/gfw-terraform-modules.git//terraform/modules/compute_environment?ref=v0.4.2.3" ecs_role_policy_arns = [ aws_iam_policy.query_batch_jobs.arn, @@ -202,7 +202,7 @@ module "batch_cog_creator" { use_ephemeral_storage = true launch_type = "EC2" instance_types = var.data_lake_writer_instance_types - compute_environment_name = "batch_cog_creator" + compute_environment_name = "batch_cogify" } module "batch_job_queues" { @@ -211,7 +211,7 @@ module "batch_job_queues" { data_lake_compute_environment_arn = module.batch_data_lake_writer.arn pixetl_compute_environment_arn = module.batch_data_lake_writer.arn tile_cache_compute_environment_arn = module.batch_data_lake_writer.arn - cog_compute_environment_arn = module.batch_cog_creator.arn + cogify_compute_environment_arn = module.batch_cogify.arn environment = var.environment name_suffix = local.name_suffix project = local.project diff --git a/terraform/modules/batch/main.tf 
b/terraform/modules/batch/main.tf index e62c38214..de99d49ba 100644 --- a/terraform/modules/batch/main.tf +++ b/terraform/modules/batch/main.tf @@ -52,12 +52,12 @@ resource "aws_batch_job_queue" "pixetl" { depends_on = [var.pixetl_compute_environment_arn] } -resource "aws_batch_job_queue" "cog" { +resource "aws_batch_job_queue" "on_demand" { name = substr("${var.project}-cog-job-queue${var.name_suffix}", 0, 64) state = "ENABLED" priority = 1 - compute_environments = [var.cog_compute_environment_arn] - depends_on = [var.pixetl_compute_environment_arn] + compute_environments = [var.cogify_compute_environment_arn] + depends_on = [var.cogify_compute_environment_arn] } resource "aws_batch_job_definition" "tile_cache" { diff --git a/terraform/modules/batch/outputs.tf b/terraform/modules/batch/outputs.tf index b4d458b71..2962aa93c 100644 --- a/terraform/modules/batch/outputs.tf +++ b/terraform/modules/batch/outputs.tf @@ -38,8 +38,8 @@ output "pixetl_job_queue_arn" { value = aws_batch_job_queue.pixetl.arn } -output "cogify_job_queue_arn" { - value = aws_batch_job_queue.cog.arn +output "on_demand_compute_job_queue_arn" { + value = aws_batch_job_queue.on_demand.arn } output "tile_cache_job_definition_arn" { diff --git a/terraform/modules/batch/variables.tf b/terraform/modules/batch/variables.tf index 7c03c859f..ff2005c07 100644 --- a/terraform/modules/batch/variables.tf +++ b/terraform/modules/batch/variables.tf @@ -2,7 +2,7 @@ variable "project" { type = string } variable "name_suffix" { type = string } variable "aurora_compute_environment_arn" { type = string } variable "data_lake_compute_environment_arn" { type = string } -variable "cog_compute_environment_arn" { type = string } +variable "cogify_compute_environment_arn" { type = string } variable "tile_cache_compute_environment_arn" { type = string } variable "pixetl_compute_environment_arn" { type = string } variable "gdal_repository_url" { type = string } diff --git a/terraform/templates/container_definition.json.tmpl 
b/terraform/templates/container_definition.json.tmpl index b24963a12..e2de276e7 100644 --- a/terraform/templates/container_definition.json.tmpl +++ b/terraform/templates/container_definition.json.tmpl @@ -74,8 +74,8 @@ "value": "${pixetl_job_queue}" }, { - "name": "COGIFY_JOB_QUEUE", - "value": "${cogify_job_queue}" + "name": "ON_DEMAND_COMPUTE_JOB_QUEUE", + "value": "${on_demand_compute_job_queue}" }, { "name": "API_URL", diff --git a/tests/conftest.py b/tests/conftest.py index b2d8204e7..74e415d2d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,7 +33,7 @@ GDAL_PYTHON_JOB_DEFINITION, PIXETL_JOB_DEFINITION, PIXETL_JOB_QUEUE, - COGIFY_JOB_QUEUE, + ON_DEMAND_COMPUTE_JOB_QUEUE, POSTGRESQL_CLIENT_JOB_DEFINITION, TILE_CACHE_BUCKET, TILE_CACHE_JOB_DEFINITION, @@ -177,7 +177,9 @@ def patch_run(self, *k, **kwargs): aws_mock.add_job_queue(DATA_LAKE_JOB_QUEUE, s3_writer_env["computeEnvironmentArn"]) aws_mock.add_job_queue(TILE_CACHE_JOB_QUEUE, s3_writer_env["computeEnvironmentArn"]) aws_mock.add_job_queue(PIXETL_JOB_QUEUE, pixetl_env["computeEnvironmentArn"]) - aws_mock.add_job_queue(COGIFY_JOB_QUEUE, cogify_env["computeEnvironmentArn"]) + aws_mock.add_job_queue( + ON_DEMAND_COMPUTE_JOB_QUEUE, cogify_env["computeEnvironmentArn"] + ) aws_mock.add_job_definition(GDAL_PYTHON_JOB_DEFINITION, "batch_gdal-python_test") aws_mock.add_job_definition( From d9d2473bb2d564708606bc6a5ff2d1ab6ea6d4f0 Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Tue, 24 Sep 2024 12:19:43 +0300 Subject: [PATCH 10/20] update queue name --- terraform/modules/batch/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/modules/batch/main.tf b/terraform/modules/batch/main.tf index de99d49ba..fb5207473 100644 --- a/terraform/modules/batch/main.tf +++ b/terraform/modules/batch/main.tf @@ -53,7 +53,7 @@ resource "aws_batch_job_queue" "pixetl" { } resource "aws_batch_job_queue" "on_demand" { - name = substr("${var.project}-cog-job-queue${var.name_suffix}", 0, 64) + 
name = substr("${var.project}-on-demand-job-queue${var.name_suffix}", 0, 64) state = "ENABLED" priority = 1 compute_environments = [var.cogify_compute_environment_arn] From 720433400f14bf62012ece749d050c7982426d0f Mon Sep 17 00:00:00 2001 From: Dan Scales Date: Sat, 28 Sep 2024 13:08:56 -0700 Subject: [PATCH 11/20] GTC-2958 Tag Docker image from docker hash, so it always exists Currently, we tag the new Docker image with the current Git SHA. But the container_registry module only creates a new Docker image if the docker contents change. So, if the docker contents haven't changed with this Git change, we can have a bug where we reference the app docker via a tag (of the new Git SHA) which doesn't exist. The fix is to use instead as a tag the hash of the docker contents. We use the same hash script that the container_registry module uses. Therefore, we will always be using a tag that exists, either because the container_registry module just pushed a new docker with the new tag, or the docker already exists under the docker hash tag, because the docker contents and hash haven't changed. I noticed a bunch of bugs in the container_registry module's hash.sh script, which I will fix later. One of the main things is that it doesn't ignore comments, so it can match on words in the comments. For that reason, I removed the '# Docker Files' comment, which was causing the Dockerfile itself to be ignored during hashing. 
--- .dockerignore | 1 - terraform/data.tf | 10 +++++++- terraform/main.tf | 8 ++++++- terraform/scripts/hash.sh | 50 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 3 deletions(-) create mode 100755 terraform/scripts/hash.sh diff --git a/.dockerignore b/.dockerignore index f2fec2f03..cd78e8236 100644 --- a/.dockerignore +++ b/.dockerignore @@ -6,7 +6,6 @@ # MyPy .mypy_cache/* -# Docker Files docker-compose.dev.yml docker-compose.prod.yml docker-compose.test.yml diff --git a/terraform/data.tf b/terraform/data.tf index d3748b037..34fd7fd02 100644 --- a/terraform/data.tf +++ b/terraform/data.tf @@ -190,4 +190,12 @@ data "template_file" "step_function_policy" { vars = { raster_analysis_state_machine_arn = data.terraform_remote_state.raster_analysis_lambda.outputs.raster_analysis_state_machine_arn } -} \ No newline at end of file +} + +# Hash of the contents of the FastAPI app docker. The docker commands run in the main +# directory (parent directory of terraform directory), and the Docker file is in the +# same directory. +data "external" "hash" { + program = ["${path.root}/scripts/hash.sh", "${path.root}/../", "."] +} + diff --git a/terraform/main.tf b/terraform/main.tf index 263d70da0..89a321402 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -26,7 +26,13 @@ locals { aurora_instance_class = data.terraform_remote_state.core.outputs.aurora_cluster_instance_class aurora_max_vcpus = local.aurora_instance_class == "db.t3.medium" ? 2 : local.aurora_instance_class == "db.r6g.large" ? 2 : local.aurora_instance_class == "db.r6g.xlarge" ? 4 : local.aurora_instance_class == "db.r6g.2xlarge" ? 8 : local.aurora_instance_class == "db.r6g.4xlarge" ? 16 : local.aurora_instance_class == "db.r6g.8xlarge" ? 32 : local.aurora_instance_class == "db.r6g.16xlarge" ? 64 : local.aurora_instance_class == "db.r5.large" ? 2 : local.aurora_instance_class == "db.r5.xlarge" ? 4 : local.aurora_instance_class == "db.r5.2xlarge" ? 
8 : local.aurora_instance_class == "db.r5.4xlarge" ? 16 : local.aurora_instance_class == "db.r5.8xlarge" ? 32 : local.aurora_instance_class == "db.r5.12xlarge" ? 48 : local.aurora_instance_class == "db.r5.16xlarge" ? 64 : local.aurora_instance_class == "db.r5.24xlarge" ? 96 : "" service_url = var.environment == "dev" ? "http://${module.fargate_autoscaling.lb_dns_name}" : var.service_url - container_tag = substr(var.git_sha, 0, 7) + # The container_registry module only pushes a new Docker image if the docker hash + # computed by its hash.sh script has changed. So, we make the container tag exactly + # be that hash. Therefore, we will know that either the previous docker with the + # same contents and tag will already exist, if nothing has changed in the docker + # image, or the container registry module will push a new docker with the tag we + # want. + container_tag = lookup(data.external.hash.result, "hash") lb_dns_name = coalesce(module.fargate_autoscaling.lb_dns_name, var.lb_dns_name) } diff --git a/terraform/scripts/hash.sh b/terraform/scripts/hash.sh new file mode 100755 index 000000000..a67cc4713 --- /dev/null +++ b/terraform/scripts/hash.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# +# Calculates hash of Docker image source contents +# +# Must be identical to the script that is used by the +# gfw-terraform-modules:terraform/modules/container_registry Terraform module. +# +# Usage: +# +# $ ./hash.sh . +# + +set -e + +pushd () { + command pushd "$@" > /dev/null +} + +popd () { + command popd "$@" > /dev/null +} + +ROOT_DIR=${1:-.} +DOCKER_PATH=${2:-.} +IGNORE="${DOCKER_PATH}/.dockerignore" + +pushd "$ROOT_DIR" + +# Hash all source files of the Docker image +if [ -f "$IGNORE" ]; then + # We don't want to compute hashes for files listed in .dockerignore + # to match regex pattern we need to escape leading . + a=$(printf "! -regex ^./%s.* " `< .dockerignore`) + b=${a//\/.//\\\.} + + file_hashes="$( + find . 
-type f $b -exec md5sum {} \; + )" +else + # Exclude Python cache files, dot files + file_hashes="$( + find . -type f -not -name '*.pyc' -not -path './.**' -exec md5sum {} \; + )" +fi + +popd + +hash="$(echo "$file_hashes" | md5sum | cut -d' ' -f1)" + +echo '{ "hash": "'"$hash"'" }' From c677d25b5f12f374ff43b4f3f31b4ab4e4d88450 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Thu, 3 Oct 2024 15:23:43 -0400 Subject: [PATCH 12/20] Use experimental pixetl --- batch/pixetl.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batch/pixetl.dockerfile b/batch/pixetl.dockerfile index 6c7f44fc3..a9afef12d 100644 --- a/batch/pixetl.dockerfile +++ b/batch/pixetl.dockerfile @@ -1,4 +1,4 @@ -FROM globalforestwatch/pixetl:v1.7.7 +FROM globalforestwatch/pixetl:v1.7.8rc # Copy scripts COPY ./batch/scripts/ /opt/scripts/ From a7ca3a4f674c767c7bf0069d916f32d89dce85f4 Mon Sep 17 00:00:00 2001 From: Daniel Mannarino Date: Thu, 3 Oct 2024 22:21:36 -0400 Subject: [PATCH 13/20] Bump pixetl --- batch/pixetl.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batch/pixetl.dockerfile b/batch/pixetl.dockerfile index a9afef12d..47ab19e79 100644 --- a/batch/pixetl.dockerfile +++ b/batch/pixetl.dockerfile @@ -1,4 +1,4 @@ -FROM globalforestwatch/pixetl:v1.7.8rc +FROM globalforestwatch/pixetl:v1.7.8rc2 # Copy scripts COPY ./batch/scripts/ /opt/scripts/ From 62b7db4adc5a892df2694ffa1ea6fb21d04776fe Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Tue, 8 Oct 2024 11:01:31 +0300 Subject: [PATCH 14/20] give ecs task role permission to run on on-demand compute job queue --- terraform/data.tf | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/terraform/data.tf b/terraform/data.tf index d9e2861aa..dcb8aacd0 100644 --- a/terraform/data.tf +++ b/terraform/data.tf @@ -96,15 +96,16 @@ data "template_file" "container_definition" { data "template_file" "task_batch_policy" { template = 
file("${path.root}/templates/run_batch_policy.json.tmpl") vars = { - aurora_job_definition_arn = module.batch_job_queues.aurora_job_definition_arn - aurora_job_queue_arn = module.batch_job_queues.aurora_job_queue_arn - aurora_job_queue_fast_arn = module.batch_job_queues.aurora_job_queue_fast_arn - data_lake_job_definition_arn = module.batch_job_queues.data_lake_job_definition_arn - data_lake_job_queue_arn = module.batch_job_queues.data_lake_job_queue_arn - tile_cache_job_definition_arn = module.batch_job_queues.tile_cache_job_definition_arn - tile_cache_job_queue_arn = module.batch_job_queues.tile_cache_job_queue_arn - pixetl_job_definition_arn = module.batch_job_queues.pixetl_job_definition_arn - pixetl_job_queue_arn = module.batch_job_queues.pixetl_job_queue_arn + aurora_job_definition_arn = module.batch_job_queues.aurora_job_definition_arn + aurora_job_queue_arn = module.batch_job_queues.aurora_job_queue_arn + aurora_job_queue_fast_arn = module.batch_job_queues.aurora_job_queue_fast_arn + data_lake_job_definition_arn = module.batch_job_queues.data_lake_job_definition_arn + data_lake_job_queue_arn = module.batch_job_queues.data_lake_job_queue_arn + tile_cache_job_definition_arn = module.batch_job_queues.tile_cache_job_definition_arn + tile_cache_job_queue_arn = module.batch_job_queues.tile_cache_job_queue_arn + pixetl_job_definition_arn = module.batch_job_queues.pixetl_job_definition_arn + pixetl_job_queue_arn = module.batch_job_queues.pixetl_job_queue_arn + on_demand_compute_job_queue_arn = module.batch_job_queues.on_demand_compute_job_queue_arn } depends_on = [ module.batch_job_queues.aurora_job_definition, From b26841483559474ea9802104cc6aef768bf67cf8 Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Tue, 8 Oct 2024 11:16:37 +0300 Subject: [PATCH 15/20] add missing variable in template --- terraform/templates/run_batch_policy.json.tmpl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/terraform/templates/run_batch_policy.json.tmpl 
b/terraform/templates/run_batch_policy.json.tmpl index 31c2886a8..a26d9f739 100644 --- a/terraform/templates/run_batch_policy.json.tmpl +++ b/terraform/templates/run_batch_policy.json.tmpl @@ -21,7 +21,9 @@ "${tile_cache_job_definition_arn}", "${pixetl_job_queue_arn}", - "${pixetl_job_definition_arn}" + "${pixetl_job_definition_arn}", + + "{on_demand_compute_job_queue_arn}" ] }, { From 50ab46a6bfea052fb1aa2be3ccbec8a311b30c63 Mon Sep 17 00:00:00 2001 From: Solomon Negusse Date: Tue, 8 Oct 2024 11:24:04 +0300 Subject: [PATCH 16/20] syntax fix --- terraform/templates/run_batch_policy.json.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/templates/run_batch_policy.json.tmpl b/terraform/templates/run_batch_policy.json.tmpl index a26d9f739..044608630 100644 --- a/terraform/templates/run_batch_policy.json.tmpl +++ b/terraform/templates/run_batch_policy.json.tmpl @@ -23,7 +23,7 @@ "${pixetl_job_queue_arn}", "${pixetl_job_definition_arn}", - "{on_demand_compute_job_queue_arn}" + "${on_demand_compute_job_queue_arn}" ] }, { From d930f5f8aab4c3980067098bcebbaf34d526bca8 Mon Sep 17 00:00:00 2001 From: Dan Scales Date: Wed, 16 Oct 2024 16:50:52 -0700 Subject: [PATCH 17/20] Back out use of new pixetl docker using hard links to reduce disk space I ran a full test using the DIST-Alerts, and it seems like the tile files are generally much smaller, so there could be a bug. 
--- batch/pixetl.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batch/pixetl.dockerfile b/batch/pixetl.dockerfile index 47ab19e79..6c7f44fc3 100644 --- a/batch/pixetl.dockerfile +++ b/batch/pixetl.dockerfile @@ -1,4 +1,4 @@ -FROM globalforestwatch/pixetl:v1.7.8rc2 +FROM globalforestwatch/pixetl:v1.7.7 # Copy scripts COPY ./batch/scripts/ /opt/scripts/ From 9ba09adb156a8ee52b919f2272deaa3181b47aaf Mon Sep 17 00:00:00 2001 From: Dan Scales Date: Thu, 24 Oct 2024 09:40:06 -0700 Subject: [PATCH 18/20] Sort the assets by creation time in the version endpoint, just like /assets We made the asset list in the GET version endpoint more useful recently by including the asset-id. But that list is not being sorted by creation time, unlike the list provided by /assets, so the assets are somewhat randomly ordered. So, add the change to sort the asset list of the GET version endpoint by creation time. This ensures that the default asset is always first, etc. --- app/routes/datasets/versions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/app/routes/datasets/versions.py b/app/routes/datasets/versions.py index 89a2f1722..085877ca5 100644 --- a/app/routes/datasets/versions.py +++ b/app/routes/datasets/versions.py @@ -536,6 +536,7 @@ async def _version_response( .where(ORMAsset.dataset == dataset) .where(ORMAsset.version == version) .where(ORMAsset.status == AssetStatus.saved) + .order_by(ORMAsset.created_on) .gino.all() ) data = Version.from_orm(data).dict(by_alias=True) From 674ca770e74f7e36522099f8bda4296ca4fefdf8 Mon Sep 17 00:00:00 2001 From: Dan Scales Date: Thu, 24 Oct 2024 12:01:54 -0700 Subject: [PATCH 19/20] Add doc comments to indicate asset list is sorted by creation time. Updating doc based on good comment from Gary. 
--- app/routes/datasets/asset.py | 3 ++- app/routes/datasets/versions.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/app/routes/datasets/asset.py b/app/routes/datasets/asset.py index 66f81d9f3..862a6eb8a 100644 --- a/app/routes/datasets/asset.py +++ b/app/routes/datasets/asset.py @@ -66,7 +66,8 @@ async def get_version_assets( description="The number of assets per page. Default is `10`.", ), ) -> Union[PaginatedAssetsResponse, AssetsResponse]: - """Get all assets for a given dataset version. + """Get all assets for a given dataset version. The list of assets + is sorted by the creation time of each asset. Will attempt to paginate if `page[size]` or `page[number]` is provided. Otherwise, it will attempt to return the entire list of diff --git a/app/routes/datasets/versions.py b/app/routes/datasets/versions.py index 085877ca5..d25175b5e 100644 --- a/app/routes/datasets/versions.py +++ b/app/routes/datasets/versions.py @@ -81,7 +81,8 @@ async def get_version( *, dv: Tuple[str, str] = Depends(dataset_version_dependency) ) -> VersionResponse: - """Get basic metadata for a given version.""" + """Get basic metadata for a given version. 
The list of assets is sorted by + the creation time of each asset.""" dataset, version = dv row: ORMVersion = await versions.get_version(dataset, version) From d0acafc568bf930808cd54bb3ee6c61051fc46a7 Mon Sep 17 00:00:00 2001 From: manukala6 Date: Thu, 31 Oct 2024 10:28:25 -0700 Subject: [PATCH 20/20] GTC-3007 Add content-date-description metadata field --- .../orm/migrations/versions/604bf4e66c2b_.py | 29 +++++++++++++++++++ app/models/orm/version_metadata.py | 1 + app/models/pydantic/metadata.py | 10 ++++++- tests/utils.py | 1 + tests_v2/fixtures/metadata/version.py | 1 + 5 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 app/models/orm/migrations/versions/604bf4e66c2b_.py diff --git a/app/models/orm/migrations/versions/604bf4e66c2b_.py b/app/models/orm/migrations/versions/604bf4e66c2b_.py new file mode 100644 index 000000000..8eebb37ba --- /dev/null +++ b/app/models/orm/migrations/versions/604bf4e66c2b_.py @@ -0,0 +1,29 @@ +"""Add content_date_description to version_metadata + +Revision ID: 604bf4e66c2b +Revises: ef3392e8e054 +Create Date: 2024-10-31 16:52:56.571782 + +""" +from alembic import op +import sqlalchemy as sa +import sqlalchemy_utils +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = '604bf4e66c2b' +down_revision = 'ef3392e8e054' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('version_metadata', sa.Column('content_date_description', sa.String(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('version_metadata', 'content_date_description') + # ### end Alembic commands ### diff --git a/app/models/orm/version_metadata.py b/app/models/orm/version_metadata.py index f1a09042f..df39cc94e 100644 --- a/app/models/orm/version_metadata.py +++ b/app/models/orm/version_metadata.py @@ -10,6 +10,7 @@ class VersionMetadata(Base, MetadataMixin): version = db.Column(db.String, nullable=False) content_date = db.Column(db.Date) content_start_date = db.Column(db.Date) + content_date_description = db.Column(db.String) content_end_date = db.Column(db.Date) last_update = db.Column(db.Date) description = db.Column(db.String) diff --git a/app/models/pydantic/metadata.py b/app/models/pydantic/metadata.py index 01f5a7308..63e836d81 100644 --- a/app/models/pydantic/metadata.py +++ b/app/models/pydantic/metadata.py @@ -112,7 +112,10 @@ class VersionMetadata(CommonMetadata): None, description="Date range covered by the content", ) - + content_date_description: Optional[str] = Field( + None, + description="Date of content to display", + ) last_update: Optional[date] = Field( None, description="Date the data were last updated", @@ -130,6 +133,7 @@ class Config: "start_date": "2000-01-01", # TODO fix date "end_date": "2021-04-06", }, + "content_date_description": "2000 - present", } ] } @@ -159,6 +163,10 @@ class VersionMetadataUpdate(VersionMetadataIn): None, description="Date range covered by the content", ) + content_date_description: Optional[str] = Field( + None, + description="Date of content to display", + ) last_update: Optional[date] = Field( None, diff --git a/tests/utils.py b/tests/utils.py index a9a4075a0..026fbf2ec 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -31,6 +31,7 @@ version_metadata = { "content_date_range": {"start_date": "2000-01-01", "end_date": "2021-01-01"}, + "content_date_description": "2000 - present", "last_update": "2020-01-03", "spatial_resolution": 10, "resolution_description": "10 meters", diff --git 
a/tests_v2/fixtures/metadata/version.py b/tests_v2/fixtures/metadata/version.py index 63a81369c..07f442d2c 100644 --- a/tests_v2/fixtures/metadata/version.py +++ b/tests_v2/fixtures/metadata/version.py @@ -1,5 +1,6 @@ VERSION_METADATA = { "content_date_range": {"start_date": "2000-01-01", "end_date": "2021-01-01"}, + "content_date_description": "2000 - present", "last_update": "2020-01-03", "spatial_resolution": 10, "resolution_description": "10 meters",