Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/sagemaker llms #234

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,18 @@ name: Lint Terraform

on:
push:
branches: [ "main" ]
branches: [ "main", "feat/sagemaker-llms" ]
pull_request:
branches: [ "main" ]
branches: [ "main", "feat/sagemaker-llms" ]

jobs:
test:
lint:
name: Lint Terraform
runs-on: ubuntu-20.04
steps:
- name: "Checkout"
uses: "actions/checkout@v4"

- name: "Install Terraform"
# From https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli,
# but with the addition of programmatically verifying the package signing key, which was
Expand All @@ -39,6 +40,20 @@ jobs:
sudo tee /etc/apt/sources.list.d/hashicorp.list
sudo apt update
sudo apt-get install terraform
- name: "Run linting"

# Install Python 3.12 on the runner.
# setup-python v3 runs on the deprecated Node 16 action runtime; v5 is the
# maintained major version and accepts the same `python-version` input.
- name: Set up Python 3.12
  uses: actions/setup-python@v5
  with:
    python-version: "3.12"

# Install the uv CLI; the Python lint step below invokes ruff through `uv run`.
- name: Install uv
  uses: astral-sh/setup-uv@v3

# Fails if any Terraform file is not in canonical format; -diff prints the
# changes that `terraform fmt` would make.
- name: Run linting on terraform
  run: terraform fmt -check -recursive -diff

# Runs ruff from infra/modules/ so it checks the Python code under that directory.
- name: Run linting on Python code for lambda functions
  working-directory: infra/modules/
  run: uv run ruff check .
5 changes: 5 additions & 0 deletions infra/ecr.tf
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,11 @@ data "aws_ecr_lifecycle_policy_document" "expire_untagged_after_one_day" {
}
}

resource "aws_ecr_repository" "sagemaker" {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@peter-woodcock identified that this can be removed. @isobel-daley-6point6, we can review.

count = var.sagemaker_on ? 1 : 0
name = "${var.prefix}-sagemaker"
}

data "aws_ecr_lifecycle_policy_document" "expire_preview_and_untagged_after_one_day" {
# Match *--prod images, but expire them in 1000 years...
rule {
Expand Down
122 changes: 122 additions & 0 deletions infra/ecs_notebooks_notebook.tf
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@ resource "aws_iam_policy" "notebook_task_execution" {
}

data "aws_iam_policy_document" "notebook_task_execution" {


statement {
actions = [
"logs:CreateLogStream",
Expand All @@ -122,6 +124,41 @@ data "aws_iam_policy_document" "notebook_task_execution" {
]
}

dynamic "statement" {
  # Feature toggle: the conditional yields a one-element list (statement is
  # emitted once) when var.sagemaker_on is true, and an empty list otherwise.
  for_each = var.sagemaker_on ? [1] : []

  content {
    # Not restricted to particular endpoints, configs or models.
    resources = ["*"]

    # Describe/List calls plus asynchronous endpoint invocation.
    actions = [
      "sagemaker:DescribeEndpoint",
      "sagemaker:DescribeEndpointConfig",
      "sagemaker:DescribeModel",
      "sagemaker:InvokeEndpointAsync",
      "sagemaker:ListEndpoints",
      "sagemaker:ListEndpointConfigs",
      "sagemaker:ListModels",
    ]
  }
}

dynamic "statement" {
  # Emitted only when var.sagemaker_on is true (empty list => no statement).
  for_each = var.sagemaker_on ? [1] : []

  content {
    resources = ["*"]

    # The wildcard matches every EC2 action whose name contains "VpcEndpoint".
    # NOTE(review): combined with resources = ["*"] this is broad; confirm
    # which specific actions are actually required and narrow if possible.
    actions = ["ec2:*VpcEndpoint*"]
  }
}

statement {
actions = [
"ecr:GetAuthorizationToken",
Expand Down Expand Up @@ -234,6 +271,41 @@ data "aws_iam_policy_document" "notebook_s3_access_template" {
"${aws_efs_file_system.notebooks.arn}",
]
}

dynamic "statement" {
  # Feature toggle: one statement when var.sagemaker_on is true, none otherwise.
  for_each = var.sagemaker_on ? [1] : []

  content {
    # Applies to all SageMaker resources rather than a named set.
    resources = ["*"]

    # Describe/List calls plus asynchronous endpoint invocation. The same
    # action list appears in other policy documents in this file; keep the
    # copies in sync when editing.
    actions = [
      "sagemaker:DescribeEndpoint",
      "sagemaker:DescribeEndpointConfig",
      "sagemaker:DescribeModel",
      "sagemaker:InvokeEndpointAsync",
      "sagemaker:ListEndpoints",
      "sagemaker:ListEndpointConfigs",
      "sagemaker:ListModels",
    ]
  }
}

dynamic "statement" {
  # Emitted only when var.sagemaker_on is true.
  for_each = var.sagemaker_on ? [1] : []

  content {
    resources = ["*"]

    # Wildcard over all EC2 actions containing "VpcEndpoint" in their name.
    # NOTE(review): confirm notebook roles need this at all, and whether a
    # narrower action list would do.
    actions = ["ec2:*VpcEndpoint*"]
  }
}
}

resource "aws_vpc_endpoint" "s3" {
Expand Down Expand Up @@ -345,6 +417,34 @@ data "aws_iam_policy_document" "aws_vpc_endpoint_s3_notebooks" {
"arn:aws:s3:::amazonlinux.*.amazonaws.com/*",
]
}

dynamic "statement" {
  # Emitted only when var.sagemaker_on is true.
  for_each = var.sagemaker_on ? [1] : []

  content {
    # Read, write, delete and listing on the buckets named below.
    # NOTE(review): these look like the AWS-managed SageMaker JumpStart cache
    # buckets; confirm PutObject/DeleteObject are really needed on them.
    actions = [
      "s3:ListBucket",
      "s3:GetObject",
      "s3:PutObject",
      "s3:DeleteObject",
      "s3:GetBucketLocation",
    ]

    # Both the bucket ARNs and the object ARNs (/*) are listed. The bucket
    # names hard-code the eu-west-2 region.
    resources = [
      "arn:aws:s3:::jumpstart-cache-prod-eu-west-2/*",
      "arn:aws:s3:::jumpstart-private-cache-prod-eu-west-2/*",
      "arn:aws:s3:::jumpstart-cache-prod-eu-west-2",
      "arn:aws:s3:::jumpstart-private-cache-prod-eu-west-2",
    ]

    # Applies to any AWS principal.
    principals {
      type        = "AWS"
      identifiers = ["*"]
    }
  }
}
}

resource "aws_iam_policy" "notebook_task_boundary" {
Expand Down Expand Up @@ -375,6 +475,28 @@ data "aws_iam_policy_document" "jupyterhub_notebook_task_boundary" {
]
}

# Allow all tools users to access SageMaker endpoints.
dynamic "statement" {
  # Included only when var.sagemaker_on is true; otherwise no statement is
  # generated.
  for_each = var.sagemaker_on ? [1] : []

  content {
    # Not limited to specific endpoints, configs or models.
    resources = ["*"]

    # Describe/List calls plus asynchronous endpoint invocation.
    actions = [
      "sagemaker:DescribeEndpoint",
      "sagemaker:DescribeEndpointConfig",
      "sagemaker:DescribeModel",
      "sagemaker:InvokeEndpointAsync",
      "sagemaker:ListEndpoints",
      "sagemaker:ListEndpointConfigs",
      "sagemaker:ListModels",
    ]
  }
}

statement {
actions = [
"s3:ListBucket",
Expand Down
62 changes: 59 additions & 3 deletions infra/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,16 @@ variable "subnets_num_bits" {}
variable "vpc_notebooks_cidr" {}
variable "vpc_notebooks_subnets_num_bits" {}
variable "vpc_datasets_cidr" {}
# Required inputs: neither variable declares a default or a type, so a value
# must be supplied for each.
# NOTE(review): presumably the CIDR block of the SageMaker VPC and the number
# of extra bits used to carve subnets from it; confirm against their usage.
variable "vpc_sagemaker_cidr" {}
variable "vpc_sagemaker_subnets_num_bits" {}

variable "aws_route53_zone" {}
variable "admin_domain" {}
variable "appstream_domain" {}
variable "support_domain" {}

variable "admin_db_instance_class" {}
variable "admin_db_instance_version" {
default = "10.15"
}
variable "admin_db_instance_version" {}
variable "admin_db_instance_allocated_storage" {
type = number
default = 200
Expand Down Expand Up @@ -274,38 +274,94 @@ variable "s3_prefixes_for_external_role_copy" {
default = ["import-data", "export-data"]
}

variable "sagemaker_example_inference_image" { default = "" }

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@peter-woodcock identified that this can be removed. @isobel-daley-6point6, we can review.


# Optional SageMaker-related string inputs; each defaults to an empty string.
variable "sagemaker_models_folder" {
  default = ""
}

variable "hugging_face_model_image" {
  default = ""
}

variable "sagemaker_default_bucket" {
  default = ""
}

variable "teams_webhook_url" {
  default = ""
}

# Optional list inputs. Note that each default is a list containing a single
# empty string, not an empty list.
# NOTE(review): confirm consumers tolerate the "" element; also consider
# whether the webhook values should be marked sensitive.
variable "sagemaker_budget_emails" {
  default = [""]
}

variable "slack_webhook_resource_alerts" {
  default = [""]
}

variable "slack_webhook_cpu_alerts" {
  default = [""]
}

variable "slack_webhook_gpu_alerts" {
  default = [""]
}

variable "slack_webhook_security_alerts" {
  default = [""]
}

variable "slack_webhook_backlog_alerts" {
  default = [""]
}

# SageMaker feature flags. Every flag is a bool and is off unless set.

# Switch referenced by the conditional SageMaker resources and IAM statements
# (`var.sagemaker_on ? ... : ...`).
variable "sagemaker_on" {
  default = false
  type    = bool
}

# Per-model flags.
# NOTE(review): presumably each one toggles deployment of the named model;
# their usage is not visible in this file.
variable "sagemaker_gpt_neo_125m" {
  default = false
  type    = bool
}

variable "sagemaker_flan_t5_780m" {
  default = false
  type    = bool
}

variable "sagemaker_phi_2_3b" {
  default = false
  type    = bool
}

variable "sagemaker_llama_3_3b" {
  default = false
  type    = bool
}

variable "sagemaker_llama_3_3b_instruct" {
  default = false
  type    = bool
}

variable "sagemaker_mistral_7b_instruct" {
  default = false
  type    = bool
}

# Matchbox inputs. All are optional: booleans default to false, strings to "",
# and lists to [].

# Boolean switches.
variable "matchbox_on" {
  default = false
  type    = bool
}

variable "matchbox_dev_mode_on" {
  default = false
  type    = bool
}

# Network settings. Note that the subnet bit count is declared as a string.
variable "vpc_matchbox_cidr" {
  default = ""
  type    = string
}

# Instance lists.
variable "matchbox_instances" {
  default = []
  type    = list(string)
}

variable "matchbox_instances_long" {
  default = []
  type    = list(string)
}

variable "matchbox_db_instance_class" {
  default = ""
  type    = string
}

variable "vpc_matchbox_subnets_num_bits" {
  default = ""
  type    = string
}

variable "matchbox_s3_cache" {
  default = ""
  type    = string
}

variable "matchbox_s3_dev_artefacts" {
type = string
default = ""
Expand Down
21 changes: 21 additions & 0 deletions infra/modules/.bumpversion.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Settings for bump-my-version, which the Makefile's `bump` target invokes.
[tool.bumpversion]

# --- Version string ---------------------------------------------------------
# The current version; this is the value rewritten on each bump.
current_version = "0.1.19"
# Regex with named groups for the three numeric components, and the template
# used to write them back out.
parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)"
serialize = ["{major}.{minor}.{patch}"]

# --- Search and replace -----------------------------------------------------
# Plain-text (non-regex) replacement of the old version with the new one.
search = "{current_version}"
replace = "{new_version}"
regex = false
ignore_missing_version = false
ignore_missing_files = false

# --- Git behaviour ----------------------------------------------------------
# Committing and tagging are both switched off here.
allow_dirty = false
commit = false
commit_args = ""
message = "Bump version: {current_version} → {new_version}"
tag = false
sign_tags = false
tag_name = "v{new_version}"
tag_message = "Bump version: {current_version} → {new_version}"
moveable_tags = []

# --- Hooks ------------------------------------------------------------------
# No hooks are configured.
setup_hooks = []
pre_commit_hooks = []
post_commit_hooks = []
26 changes: 26 additions & 0 deletions infra/modules/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Running `make` with no target runs `help`.
.DEFAULT_GOAL := help
# Recipes are executed with bash rather than the default /bin/sh.
SHELL := /bin/bash

# Self-documenting help. The awk program scans every parsed makefile and prints
# each target whose rule line ends in a `## description` comment, plus any
# `##@ Section` heading lines. Targets documented with a single `#` are not
# matched and therefore not listed.
.PHONY: help
help: ## Show all available commands
	@awk 'BEGIN {FS = ":.*##"; printf "Usage: make \033[36m<target>\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-13s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)

# Bump the patch version recorded in .bumpversion.toml and commit that file.
# Runs `format` first. You must install uv:
# https://docs.astral.sh/uv/getting-started/installation/
#
# The new version is read with shell command substitution ($$(...)) AFTER the
# bump has run. Make expands $(shell ...) in a recipe before any of its lines
# execute, so using it here would report the pre-bump version. The lookup also
# goes through `uv run`, like the bump itself, so bump-my-version does not need
# to be on PATH.
.PHONY: bump
bump: format ## Bump the patch version and commit it (requires uv)
	@echo "You must have committed your changes before running this"
	@uv run bump-my-version bump patch
	@new_version="$$(uv run bump-my-version show current_version)" && \
		echo "Upgraded version to $$new_version" && \
		git add .bumpversion.toml && \
		git commit -m "Upgraded version to $$new_version"

# Rewrite Terraform and Python sources in place: terraform fmt, ruff format,
# then ruff's automatic fixes (including the ones ruff classes as unsafe).
# You must install terraform:
# https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli
#
# The description uses `##` so that the `help` target lists this command.
.PHONY: format
format: ## Auto-format Terraform and Python sources (requires terraform and uv)
	@echo "Enforcing formatting"
	@terraform fmt -recursive
	@uv run ruff format .
	@uv run ruff check --fix-only --unsafe-fixes .

# Read-only checks: ruff lint, then a Terraform formatting check that prints a
# diff of anything not in canonical format. Stops at the first failing check.
# The `##` description makes the target visible in `make help`.
.PHONY: quality
quality: ## Check Python lint and Terraform formatting without changing files
	@uv run ruff check .
	@terraform fmt -check -recursive -diff
Loading
Loading