Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

maap: new cluster #5228

Merged
merged 5 commits into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/deploy-grafana-dashboards.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ jobs:
- cluster_name: jupyter-meets-the-earth
- cluster_name: kitware
- cluster_name: leap
- cluster_name: maap
- cluster_name: nasa-cryo
- cluster_name: nasa-ghg
- cluster_name: nasa-veda
Expand Down
37 changes: 37 additions & 0 deletions config/clusters/maap/cluster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Cluster-level deployment configuration for the "maap" cluster.
name: maap
provider: aws # https://916098889494.signin.aws.amazon.com/console
# AWS settings: the encrypted deployer credentials file and the identity
# (type, name, region) of the EKS cluster to deploy to.
aws:
key: enc-deployer-credentials.secret.json
clusterType: eks
clusterName: maap
region: us-west-2
billing:
paid_by_us: false
# Helm values files (plain and sops-encrypted) used for the support deployment.
support:
helm_chart_values_files:
- support.values.yaml
- enc-support.secret.values.yaml
# No hubs are defined yet; the commented-out entries below are the
# staging and prod hubs to enable later.
hubs:
[]
# Uncomment the lines below once the support infrastructure has been deployed
# and you are ready to add the first hub

# - name: staging
# # Tip: consider changing this to something more human friendly
# display_name: "maap - staging"
# domain: staging.maap.2i2c.cloud
# helm_chart: basehub
# helm_chart_values_files:
# - common.values.yaml
# - staging.values.yaml
# - enc-staging.secret.values.yaml

# - name: prod
# # Tip: consider changing this to something more human friendly
# display_name: "maap - prod"
# domain: prod.maap.2i2c.cloud
# helm_chart: basehub
# helm_chart_values_files:
# - common.values.yaml
# - prod.values.yaml
# - enc-prod.secret.values.yaml
25 changes: 25 additions & 0 deletions config/clusters/maap/enc-deployer-credentials.secret.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"AccessKey": {
"AccessKeyId": "ENC[AES256_GCM,data:JMiFl1UnzusCQNlEOBsYvHa+9Uo=,iv:CC0kCAIAbQXtJE4aWfvXd63FWVSuO9To2L8aKkHRgo4=,tag:r2ZlXvm+UtsVyim0WI0M9Q==,type:str]",
"SecretAccessKey": "ENC[AES256_GCM,data:w6Agme4BM109uRDH2CXIp9ffqeD6xXe/Rw6ed2X8uN42CecK1vamNQ==,iv:7eEROA5OrThNMgq9dsHeVyFFsSUbksmt1kA0f5dBDXA=,tag:5UD9cGGNEKvw20Cril4evw==,type:str]",
"UserName": "ENC[AES256_GCM,data:GcAK1BJTZVmJGoVxeRb4zErA7RA371Y=,iv:6udAmDeSfJ2DO8j+/aINVF4PSjhQs+j5BxBSA2llB9Y=,tag:zYLlltSLTCH01wxrr5mffg==,type:str]"
},
"sops": {
"kms": null,
"gcp_kms": [
{
"resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs",
"created_at": "2024-12-04T12:21:40Z",
"enc": "CiUA4OM7eOtAu8gt5nq+Tr+m64LsqMU7YruHfYzFWFswrGfKO5SgEkkAnGhyNghFbi9rWO0BUsWs199nUCTeQOOebtO8KFEMrbH5bejuZDyjRar2fU3WyUKxlBRuywgZySqZgJ9Ut+LDL+c2LdWZD+Qz"
}
],
"azure_kv": null,
"hc_vault": null,
"age": null,
"lastmodified": "2024-12-04T12:21:41Z",
"mac": "ENC[AES256_GCM,data:kuyRynza4+RG2CGJyYQgUqjLAEZiCrjRvTpR/ciO0yKoRhFzykkbg12J/1y4M4eqlsezvUfyqE+EUtsBaISH1mg8nIuchHi6sRz9XAjQeLX3cwrEPlItH7sUjjGOTbRhcHna+zXVoM2q6gxIpEdNaNq/vPtAKs9TGCRRkw1NfSQ=,iv:RvP7hU6/6kJOBStTO5FEACDPwDA5tBYvjEptdGDRcOA=,tag:as4VS4owv5yZ2c0s+lbZ8A==,type:str]",
"pgp": null,
"unencrypted_suffix": "_unencrypted",
"version": "3.8.1"
}
}
17 changes: 17 additions & 0 deletions config/clusters/maap/enc-support.secret.values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
prometheusIngressAuthSecret:
username: ENC[AES256_GCM,data:1Fs5zwh1wn4/8KWnSoswC/KiW/1jw8CJxUSnOLne6KRI1W9uftsJt43FmRdzQMqsiadc291Jo74/YWBFBC1khw==,iv:ouHNVDQcyfsHQ7zj144fVEfqQX7oIez0uLmCDeO47dw=,tag:MxKMSNP+DVTBdQbBRIxA+Q==,type:str]
password: ENC[AES256_GCM,data:qtItFIiARguwpejHWHBDSoKOl4uilmXgEkC4nBonqqWoCkMBHBDFCAr7qbH+fwep+1+yNUkuDXKJE6l0zp/gqw==,iv:8Pcbr2lulRPc0wPYOtgLez2lBLa+PKfxmd/SA75VLpY=,tag:mzZukJ3yv+IPxxDO22O9Sg==,type:str]
sops:
kms: []
gcp_kms:
- resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs
created_at: "2024-12-04T11:17:12Z"
enc: CiUA4OM7eInxKKOnVMfm7f3ZEMUF8+vdF7TSx3WQo65HugraH6wMEkkAnGhyNpIACP7jUyAu/WPOXEmSwhwAXVaQGCMbgWbeuh0A+qvSUieMHE53t/VCgGa5n0Dnitr/jqchmhNaJQfs4GyoxgF3RbAp
azure_kv: []
hc_vault: []
age: []
lastmodified: "2024-12-04T11:17:12Z"
mac: ENC[AES256_GCM,data:9hrfgDF4tkpynItWcIkFTIGF8GRxeCXm0vcdMwcuNAx4E/vC/WMKxES3LFK2ygNzSljKZ3C76F3ipHjEioognquZQoEZWF22tAcJHFfc1VGa9iR6Dh22z4X33UcEZFELXBDJUPI01YWEOybqx74Khd13Yo8ht61vnUsDEbvEPTY=,iv:EwWG5H90WIEoX1T46DDaSvascSafppbtRvQPW9byerY=,tag:wDIatpNvUyHBzLSqzhabkQ==,type:str]
pgp: []
unencrypted_suffix: _unencrypted
version: 3.8.1
42 changes: 42 additions & 0 deletions config/clusters/maap/support.values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Helm values for the support deployment of the "maap" cluster.

# Enable the auth secret for the Prometheus ingress. The username and password
# themselves are set in enc-support.secret.values.yaml.
prometheusIngressAuthSecret:
enabled: true

# Expose the Prometheus server through an ingress with TLS on
# prometheus.maap.2i2c.cloud.
prometheus:
server:
ingress:
enabled: true
hosts:
- prometheus.maap.2i2c.cloud
tls:
- secretName: prometheus-tls
hosts:
- prometheus.maap.2i2c.cloud

# Serve Grafana on grafana.maap.2i2c.cloud with GitHub login enabled and
# allowed_organizations set to 2i2c-org.
grafana:
grafana.ini:
server:
root_url: https://grafana.maap.2i2c.cloud/
auth.github:
enabled: true
allowed_organizations: 2i2c-org
ingress:
hosts:
- grafana.maap.2i2c.cloud
tls:
- secretName: grafana-tls
hosts:
- grafana.maap.2i2c.cloud

# Enable the aws-ce-grafana-backend component and annotate its service account
# with the IAM role ARN it should use.
# NOTE(review): the role is assumed to already exist in the AWS account
# referenced by the ARN - confirm it is created before deploying.
aws-ce-grafana-backend:
enabled: true
envBasedConfig:
clusterName: maap
serviceAccount:
annotations:
eks.amazonaws.com/role-arn: arn:aws:iam::916098889494:role/aws_ce_grafana_backend_iam_role

# Enable the cluster autoscaler. clusterName and awsRegion are the same values
# as the cluster name and region in cluster.yaml.
cluster-autoscaler:
enabled: true
autoDiscovery:
clusterName: maap
awsRegion: us-west-2
262 changes: 262 additions & 0 deletions eksctl/maap.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
/*
This file is a jsonnet template of an eksctl cluster configuration file,
that is used with the eksctl CLI to both update and initialize an AWS EKS
based cluster.

This file has in turn been generated from eksctl/template.jsonnet which is
relevant to compare with for changes over time.

To use jsonnet to generate an eksctl configuration file from this, do:

jsonnet maap.jsonnet > maap.eksctl.yaml

References:
- https://eksctl.io/usage/schema/
*/
local ng = import "./libsonnet/nodegroup.jsonnet";

// Region and availability zones (AZs) used throughout this file.
//
// clusterRegion is the AWS region set in the cluster's metadata.
local clusterRegion = "us-west-2";
// masterAzs is used as the cluster-level availabilityZones, and by the GPU
// node groups so they can be provisioned in any of these AZs.
local masterAzs = ["us-west-2a", "us-west-2b", "us-west-2c"];
// nodeAz is the single AZ that the core, notebook and dask node groups are
// placed in unless a node definition overrides availabilityZones.
local nodeAz = "us-west-2a";

// Node definitions for notebook nodes, one entry per hub and instance type.
// Each entry is merged on top of our notebook node definition in `nodeGroups`
// below, so a field set here takes precedence over the default given there.
// A `node.kubernetes.io/instance-type` label is added, so pods
// can request a particular kind of node with a nodeSelector.
local notebookNodes =
  // Label and tag that tie a node group to a single hub.
  local forHub(hub) = {
    labels+: { "2i2c/hub-name": hub },
    tags+: { "2i2c:hub-name": hub },
  };
  // Settings shared by every GPU node group.
  local gpu = {
    tags+: {
      "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1",
    },
    taints+: {
      "nvidia.com/gpu": "present:NoSchedule",
    },
    // Allow provisioning GPUs across all AZs, to prevent a situation where
    // all GPUs in a single AZ are in use and no new nodes can be spawned.
    availabilityZones: masterAzs,
  };
  [
    // staging
    { instanceType: "r5.xlarge", namePrefix: "nb-staging" } + forHub("staging"),
    { instanceType: "r5.4xlarge", namePrefix: "nb-staging" } + forHub("staging"),
    { instanceType: "r5.16xlarge", namePrefix: "nb-staging" } + forHub("staging"),
    // prod
    { instanceType: "r5.xlarge", namePrefix: "nb-prod" } + forHub("prod"),
    { instanceType: "r5.4xlarge", namePrefix: "nb-prod" } + forHub("prod"),
    { instanceType: "r5.16xlarge", namePrefix: "nb-prod" } + forHub("prod"),
    // gpus
    { instanceType: "g4dn.xlarge", namePrefix: "gpu-staging" } + forHub("staging") + gpu,
    { instanceType: "g4dn.xlarge", namePrefix: "gpu-prod" } + forHub("prod") + gpu,
  ];

// Node definitions for dask worker nodes. Config here is merged
// with our dask worker node definition, which uses spot instances.
// A `node.kubernetes.io/instance-type` label is set to the name of the
// *first* item in instancesDistribution.instanceTypes, to match
// what we do with notebook nodes. Pods can request a particular
// kind of node with a nodeSelector.
//
// A not yet fully established policy is being developed about using a single
// node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.
local daskNodes =
  // Build the dask worker node definition for one hub.
  local daskPool(prefix, hub) = {
    namePrefix: prefix,
    labels+: { "2i2c/hub-name": hub },
    tags+: { "2i2c:hub-name": hub },
    instancesDistribution+: { instanceTypes: ["r5.4xlarge"] },
  };
  [
    daskPool("dask-staging", "staging"),
    daskPool("dask-prod", "prod"),
  ];


// The eksctl ClusterConfig for the "maap" cluster: cluster metadata, the
// addons to install, and the core, notebook and dask node groups.
{
apiVersion: 'eksctl.io/v1alpha5',
kind: 'ClusterConfig',
metadata+: {
name: "maap",
region: clusterRegion,
version: "1.30",
tags+: {
"ManagedBy": "2i2c",
// Reuse metadata.name so this tag always matches the cluster name.
"2i2c.org/cluster-name": $.metadata.name,
},
},
availabilityZones: masterAzs,
iam: {
withOIDC: true,
},
// If you add an addon to this config, run the create addon command.
//
// eksctl create addon --config-file=maap.eksctl.yaml
//
addons: [
// Every addon defaults to version "latest" and carries the cluster's tags;
// the fields of each addon listed below are merged on top of those defaults.
{ version: "latest", tags: $.metadata.tags } + addon
for addon in
[
{ name: "coredns" },
{ name: "kube-proxy" },
{
// vpc-cni is an Amazon-maintained container networking interface
// (CNI), where a CNI is required for k8s networking. The aws-node
// DaemonSet in kube-system stems from installing this.
//
// Related docs: https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/network-plugins/
// https://docs.aws.amazon.com/eks/latest/userguide/managing-vpc-cni.html
//
name: "vpc-cni",
attachPolicyARNs: ["arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"],
# FIXME: enabling network policy enforcement didn't work as of
# August 2024, what's wrong isn't clear.
#
# configurationValues ref: https://github.com/aws/amazon-vpc-cni-k8s/blob/HEAD/charts/aws-vpc-cni/values.yaml
configurationValues: |||
enableNetworkPolicy: "false"
|||,
},
{
// aws-ebs-csi-driver ensures that our PVCs are bound to PVs that
// couple to AWS EBS based storage, without it expect to see pods
// mounting a PVC failing to schedule and PVC resources that are
// unbound.
//
// Related docs: https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html
//
name: "aws-ebs-csi-driver",
wellKnownPolicies: {
ebsCSIController: true,
},
# configurationValues ref: https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/HEAD/charts/aws-ebs-csi-driver/values.yaml
configurationValues: |||
defaultStorageClass:
enabled: true
|||,
},
]
],
nodeGroups: [
// Stamp the cluster name onto every node group defined below.
n + {clusterName: $.metadata.name} for n in
[
// Core node group: a single group in nodeAz, labelled with the "core"
// node purpose for both JupyterHub and dask.
ng + {
namePrefix: 'core',
nameSuffix: 'a',
nameIncludeInstanceType: false,
availabilityZones: [nodeAz],
ssh: {
publicKeyPath: 'ssh-keys/maap.key.pub'
},
instanceType: "r5.xlarge",
minSize: 1,
maxSize: 6,
labels+: {
"hub.jupyter.org/node-purpose": "core",
"k8s.dask.org/node-purpose": "core",
},
tags+: {
"2i2c:node-purpose": "core"
},
},
] + [
// One notebook node group per entry in notebookNodes. The entry (n) is
// merged last, so its fields (e.g. namePrefix, or availabilityZones for
// the GPU entries) override the defaults given here.
ng + {
namePrefix: 'nb',
availabilityZones: [nodeAz],
minSize: 0,
maxSize: 500,
instanceType: n.instanceType,
ssh: {
publicKeyPath: 'ssh-keys/maap.key.pub'
},
labels+: {
"hub.jupyter.org/node-purpose": "user",
"k8s.dask.org/node-purpose": "scheduler"
},
// NOTE(review): the taint is set with both the `_` and `/` spellings of
// the key; presumably so tolerations using either form match - confirm.
taints+: {
"hub.jupyter.org_dedicated": "user:NoSchedule",
"hub.jupyter.org/dedicated": "user:NoSchedule",
},
tags+: {
"2i2c:node-purpose": "user"
},
} + n for n in notebookNodes
// daskNodes is defined as a list earlier in this file, so the `then` branch
// is always taken here; presumably the null guard is kept from the template
// so that dask node groups can be switched off - confirm.
] + ( if daskNodes != null then
[
// One dask worker node group per entry in daskNodes; as with notebook
// nodes, the entry (n) is merged last and overrides the defaults here.
ng + {
namePrefix: 'dask',
availabilityZones: [nodeAz],
minSize: 0,
maxSize: 500,
ssh: {
publicKeyPath: 'ssh-keys/maap.key.pub'
},
labels+: {
"k8s.dask.org/node-purpose": "worker"
},
// NOTE(review): both the `_` and `/` spellings of the taint key are set,
// as for the notebook node groups - confirm the reason.
taints+: {
"k8s.dask.org_dedicated" : "worker:NoSchedule",
"k8s.dask.org/dedicated" : "worker:NoSchedule",
},
tags+: {
"2i2c:node-purpose": "worker"
},
// On-demand base capacity and percentage are both 0, i.e. no on-demand
// instances are requested; dask workers are meant to run on spot instances.
instancesDistribution+: {
onDemandBaseCapacity: 0,
onDemandPercentageAboveBaseCapacity: 0,
spotAllocationStrategy: "capacity-optimized",
},
} + n for n in daskNodes
] else []
)
]
}
1 change: 1 addition & 0 deletions eksctl/ssh-keys/maap.key.pub
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEKAnc9uvG/u94tT0iBOzgpcIbtzYqn18Mrm0MGGscJc [email protected]
Loading
Loading