Skip to content

Commit

Permalink
feat(daft): read delta table (#24848)
Browse files Browse the repository at this point in the history
  • Loading branch information
hongbo-miao authored Mar 8, 2025
1 parent 200768a commit 21183da
Show file tree
Hide file tree
Showing 27 changed files with 597 additions and 127 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ jobs:
attempt_delay: 2000

build-hm-prefect-calculate:
name: Build hm-prefect-calculate
name: Build prefect-calculate
needs: detect-changes
if: ${{ needs.detect-changes.outputs.hm-prefect-calculate == 'true' }}
runs-on: ubuntu-24.04
Expand Down Expand Up @@ -465,7 +465,7 @@ jobs:
attempt_delay: 2000

build-hm-prefect-daft-analysis:
name: Build hm-prefect-daft-analysis
name: Build prefect-daft-analysis
needs: detect-changes
if: ${{ needs.detect-changes.outputs.hm-prefect-daft-analysis == 'true' }}
runs-on: ubuntu-24.04
Expand Down Expand Up @@ -501,7 +501,7 @@ jobs:
attempt_delay: 2000

build-hm-prefect-greet:
name: Build hm-prefect-greet
name: Build prefect-greet
needs: detect-changes
if: ${{ needs.detect-changes.outputs.hm-prefect-greet == 'true' }}
runs-on: ubuntu-24.04
Expand Down Expand Up @@ -537,7 +537,7 @@ jobs:
attempt_delay: 2000

build-hm-prefect-print-platform:
name: Build hm-prefect-print-platform
name: Build prefect-print-platform
needs: detect-changes
if: ${{ needs.detect-changes.outputs.hm-prefect-print-platform == 'true' }}
runs-on: ubuntu-24.04
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ module "hm_glue_crawler_motor_data" {
source = "../../../../modules/aws/hm_aws_glue_crawler"
aws_glue_crawler_name = "hm-delta-lake-crawler-iot"
aws_glue_crawler_delta_tables = ["s3://hm-production-bucket/delta-tables/motor_data/"]
aws_glue_database = "production_hm_delta_db"
aws_glue_database = "production_motor_db"
schedule = "cron(40 9 * * ? *)" # Every day at 9:40 UTC https://crontab.cronhub.io/
iam_role_arn = "arn:aws:iam::272394222652:role/service-role/AWSGlueServiceRole-hm"
environment = var.environment
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,8 @@ module "ray_cluster_iam_role" {
ray_cluster_namespace = "${var.environment}-hm-ray-cluster"
amazon_eks_cluster_oidc_provider = module.amazon_eks_cluster.oidc_provider
amazon_eks_cluster_oidc_provider_arn = module.amazon_eks_cluster.oidc_provider_arn
s3_bucket_name = module.amazon_s3_bucket_hm_mlflow.name
mlflow_s3_bucket_name = module.amazon_s3_bucket_hm_mlflow.name
iot_data_s3_bucket_name = "iot-data-bucket"
environment = var.environment
team = var.team
}
Expand All @@ -628,7 +629,7 @@ module "kubernetes_namespace_hm_ray_cluster" {
]
}
# Ray Cluster Valkey - Kubernetes namespace
module "hm_kubernetes_namespace_hm_ray_cluster_valkey" {
module "kubernetes_namespace_hm_ray_cluster_valkey" {
source = "../../../../modules/kubernetes/hm_kubernetes_namespace"
kubernetes_namespace = "${var.environment}-hm-ray-cluster-valkey"
labels = {
Expand Down Expand Up @@ -680,7 +681,7 @@ module "s3_bucket_hm_mimir_ruler" {
environment = var.environment
team = var.team
}
module "hm_mimir_iam_role" {
module "mimir_iam_role" {
providers = { aws = aws.production }
source = "../../../../modules/kubernetes/hm_mimir_iam_role"
mimir_service_account_name = "hm-mimir"
Expand Down Expand Up @@ -733,7 +734,7 @@ module "s3_bucket_hm_loki_ruler" {
environment = var.environment
team = var.team
}
module "hm_loki_iam_role" {
module "loki_iam_role" {
providers = { aws = aws.production }
source = "../../../../modules/kubernetes/hm_loki_iam_role"
loki_service_account_name = "hm-loki"
Expand Down Expand Up @@ -791,7 +792,7 @@ module "s3_bucket_hm_tempo_trace" {
environment = var.environment
team = var.team
}
module "hm_tempo_iam_role" {
module "tempo_iam_role" {
providers = { aws = aws.production }
source = "../../../../modules/kubernetes/hm_tempo_iam_role"
tempo_service_account_name = "hm-tempo"
Expand Down Expand Up @@ -896,7 +897,7 @@ data "aws_secretsmanager_secret_version" "hm_prefect_postgres_secret_version" {
provider = aws.production
secret_id = data.aws_secretsmanager_secret.hm_prefect_postgres_secret.id
}
module "hm_prefect_postgres_security_group" {
module "prefect_postgres_security_group" {
providers = { aws = aws.production }
source = "../../../../modules/aws/hm_amazon_rds_security_group"
amazon_ec2_security_group_name = "${local.prefect_postgres_name}-security-group"
Expand All @@ -905,23 +906,23 @@ module "hm_prefect_postgres_security_group" {
environment = var.environment
team = var.team
}
module "hm_prefect_postgres_subnet_group" {
module "prefect_postgres_subnet_group" {
providers = { aws = aws.production }
source = "../../../../modules/aws/hm_amazon_rds_subnet_group"
subnet_group_name = "${local.prefect_postgres_name}-subnet-group"
subnet_ids = var.amazon_vpc_private_subnet_ids
environment = var.environment
team = var.team
}
module "hm_prefect_postgres_parameter_group" {
module "prefect_postgres_parameter_group" {
providers = { aws = aws.production }
source = "../../../../modules/aws/hm_amazon_rds_parameter_group"
family = "postgres17"
parameter_group_name = "${local.prefect_postgres_name}-parameter-group"
environment = var.environment
team = var.team
}
module "hm_prefect_postgres_instance" {
module "prefect_postgres_instance" {
providers = { aws = aws.production }
source = "../../../../modules/aws/hm_amazon_rds_instance"
amazon_rds_name = local.prefect_postgres_name
Expand All @@ -940,7 +941,7 @@ module "hm_prefect_postgres_instance" {
team = var.team
}
# Prefect Server - Kubernetes namespace
module "hm_kubernetes_namespace_hm_prefect_server" {
module "kubernetes_namespace_hm_prefect_server" {
source = "../../../../modules/kubernetes/hm_kubernetes_namespace"
kubernetes_namespace = "${var.environment}-hm-prefect-server"
labels = {
Expand All @@ -950,8 +951,24 @@ module "hm_kubernetes_namespace_hm_prefect_server" {
module.hm_amazon_eks_cluster
]
}
# Prefect Worker - IAM role
module "prefect_worker_iam_role" {
providers = { aws = aws.production }
source = "../../../../modules/kubernetes/hm_prefect_worker_iam_role"
prefect_worker_service_account_name = "hm-prefect-worker"
prefect_worker_namespace = "${var.environment}-hm-prefect-worker"
amazon_eks_cluster_oidc_provider = module.hm_amazon_eks_cluster.oidc_provider
amazon_eks_cluster_oidc_provider_arn = module.hm_amazon_eks_cluster.oidc_provider_arn
iot_data_s3_bucket_name = "iot-data-bucket"
aws_glue_database_names = [
"${var.environment}_battery_db",
"${var.environment}_motor_db"
]
environment = var.environment
team = var.team
}
# Prefect Worker - Kubernetes namespace
module "hm_kubernetes_namespace_hm_prefect_worker" {
module "kubernetes_namespace_hm_prefect_worker" {
source = "../../../../modules/kubernetes/hm_kubernetes_namespace"
kubernetes_namespace = "${var.environment}-hm-prefect-worker"
labels = {
Expand Down Expand Up @@ -1090,7 +1107,7 @@ module "kubernetes_namespace_hm_kafbat_ui" {

# LiteLLM
# LiteLLM - IAM role
module "hm_litellm_iam_role" {
module "litellm_iam_role" {
providers = { aws = aws.production }
source = "../../../../modules/kubernetes/hm_litellm_iam_role"
litellm_service_account_name = "hm-litellm-service-account"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ resource "aws_iam_role_policy" "eks_cluster_s3_policy" {
}
# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy
resource "aws_iam_role_policy" "iot_data_s3_policy" {
name = "${local.aws_iam_role_name_prefix}IoTDataS3Policy-${var.amazon_eks_cluster_name}"
name = "${local.aws_iam_role_name_prefix}IotDataS3Policy-${var.amazon_eks_cluster_name}"
role = aws_iam_role.s3_csi_driver_mountpoint_role.name
policy = jsonencode({
Version = "2012-10-17"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
}
}
}

data "aws_caller_identity" "current" {}
data "aws_region" "current" {}

locals {
aws_iam_role_name_prefix = "PrefectWorkerRole"
}
# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role
resource "aws_iam_role" "prefect_worker_iam_role" {
name = "${local.aws_iam_role_name_prefix}-${var.prefect_worker_service_account_name}"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Effect = "Allow"
Principal = {
Federated = var.amazon_eks_cluster_oidc_provider_arn
}
Action = "sts:AssumeRoleWithWebIdentity"
Condition = {
StringEquals = {
"${var.amazon_eks_cluster_oidc_provider}:aud" = "sts.amazonaws.com",
"${var.amazon_eks_cluster_oidc_provider}:sub" = "system:serviceaccount:${var.prefect_worker_namespace}:${var.prefect_worker_service_account_name}"
}
}
}
]
})
tags = {
Environment = var.environment
Team = var.team
Name = "${local.aws_iam_role_name_prefix}-${var.prefect_worker_service_account_name}"
}
}
# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy
resource "aws_iam_role_policy" "iot_data_s3_policy" {
name = "${local.aws_iam_role_name_prefix}IotDataS3Policy-${var.prefect_worker_service_account_name}"
role = aws_iam_role.prefect_worker_iam_role.name
policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Effect = "Allow"
Action = [
"s3:GetBucketLocation",
"s3:ListBucket"
]
Resource = [
"arn:aws:s3:::${var.iot_data_s3_bucket_name}"
]
},
{
Effect = "Allow"
Action = [
"s3:GetObject",
"s3:PutObject"
]
Resource = [
"arn:aws:s3:::${var.iot_data_s3_bucket_name}/*"
]
}
]
})
}
# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy
resource "aws_iam_role_policy" "aws_glue_policy" {
name = "${local.aws_iam_role_name_prefix}AwsGluePolicy-${var.prefect_worker_service_account_name}"
role = aws_iam_role.prefect_worker_iam_role.name
policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Effect = "Allow"
Action = [
"glue:GetDatabase",
"glue:GetDatabases",
"glue:GetTable",
"glue:GetTables"
]
Resource = flatten([
"arn:aws:glue:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:catalog",
[for database in var.aws_glue_database_names :
"arn:aws:glue:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:database/${database}"
],
[for database in var.aws_glue_database_names :
"arn:aws:glue:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:table/${database}/*"
]
])
}
]
})
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
variable "prefect_worker_service_account_name" {
type = string
}
variable "prefect_worker_namespace" {
type = string
}
variable "iot_data_s3_bucket_name" {
type = string
}
variable "aws_glue_database_names" {
type = list(string)
}
variable "amazon_eks_cluster_oidc_provider" {
type = string
}
variable "amazon_eks_cluster_oidc_provider_arn" {
type = string
}
variable "environment" {
type = string
}
variable "team" {
type = string
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ locals {
aws_iam_role_name_prefix = "RayClusterRole"
}
# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role
resource "aws_iam_role" "ray_cluster_role" {
resource "aws_iam_role" "ray_cluster_iam_role" {
name = "${local.aws_iam_role_name_prefix}-${var.ray_cluster_service_account_name}"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Expand All @@ -37,9 +37,9 @@ resource "aws_iam_role" "ray_cluster_role" {
}
}
# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy
resource "aws_iam_role_policy" "ray_cluster_role_policy" {
name = "${local.aws_iam_role_name_prefix}Policy-${var.ray_cluster_service_account_name}"
role = aws_iam_role.ray_cluster_role.name
resource "aws_iam_role_policy" "mlflow_s3_policy" {
name = "${local.aws_iam_role_name_prefix}MLflowS3Policy-${var.ray_cluster_service_account_name}"
role = aws_iam_role.ray_cluster_iam_role.name
policy = jsonencode({
Version = "2012-10-17"
Statement = [
Expand All @@ -49,7 +49,36 @@ resource "aws_iam_role_policy" "ray_cluster_role_policy" {
"s3:PutObject"
]
Resource = [
"arn:aws:s3:::${var.s3_bucket_name}/*"
"arn:aws:s3:::${var.mlflow_s3_bucket_name}/*"
]
}
]
})
}
# https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy
resource "aws_iam_role_policy" "iot_data_s3_policy" {
name = "${local.aws_iam_role_name_prefix}IotDataS3Policy-${var.ray_cluster_service_account_name}"
role = aws_iam_role.ray_cluster_iam_role.name
policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Effect = "Allow"
Action = [
"s3:ListBucket"
]
Resource = [
"arn:aws:s3:::${var.iot_data_s3_bucket_name}"
]
},
{
Effect = "Allow"
Action = [
"s3:GetObject",
"s3:PutObject"
]
Resource = [
"arn:aws:s3:::${var.iot_data_s3_bucket_name}/*"
]
}
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@ variable "amazon_eks_cluster_oidc_provider" {
variable "amazon_eks_cluster_oidc_provider_arn" {
type = string
}
variable "s3_bucket_name" {
variable "mlflow_s3_bucket_name" {
type = string
}
variable "iot_data_s3_bucket_name" {
type = string
}
variable "environment" {
Expand Down
6 changes: 3 additions & 3 deletions cloud-platform/aws/aws-glue/justfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ aws-glue-crawler-connection-get:
aws glue get-connection --name=hm-postgres-connection

aws-glue-partition-index-list:
aws glue get-partition-indexes --database-name=production_hm_delta_db --table-name=motor
aws glue get-partition-indexes --database-name=production_motor_db --table-name=motor

aws-glue-partition-index-create:
aws glue create-partition-index --database-name=production_hm_delta_db --table-name=motor --partition-index=Keys=_event_id,IndexName=_event_id_idx
aws glue create-partition-index --database-name=production_motor_db --table-name=motor --partition-index=Keys=_event_id,IndexName=_event_id_idx

aws-glue-partition-index-delete:
aws glue delete-partition-index --database-name=production_hm_delta_db --table-name=motor --index-name=_event_id_idx
aws glue delete-partition-index --database-name=production_motor_db --table-name=motor --index-name=_event_id_idx

create-adsb-db-delta-tables:
bash bin/create_adsb_db_delta_tables.sh
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# https://docs.prefect.io/concepts/deployments/?h=.prefectignore#build-the-deployment
# https://docs.prefect.io/v3/deploy/infrastructure-concepts/store-flow-code#include-or-exclude-files-from-storage

# Anywhere
**/*.aliases
Expand Down
Loading

0 comments on commit 21183da

Please sign in to comment.