diff --git a/README.md b/README.md index 5b3007da..d3a5253b 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ _Please note that we are not affiliates of Hetzner; this is just an open-source - Proper use of the Hetzner private network to minimize latency and remove the need for encryption. - Traefik or Nginx as ingress controller attached to a Hetzner load balancer with Proxy Protocol turned on. - Automatic HA with the default setting of three control-plane nodes and two agent nodes. +- Autoscaling nodes by supporting the [kubernetes autoscaler](https://github.com/kubernetes/autoscaler). - Super-HA: Nodepools for both control-plane and agent nodes can be in different locations. - Possibility to have a single node cluster with a proper ingress controller. - Can use Klipper as an "on-metal" LB instead of the Hetzner LB. @@ -116,6 +117,29 @@ _Once the cluster is up; you can change any nodepool count and even set it to 0 _However, you can freely add other nodepools at the end of each list. And for each nodepools, you can freely increase or decrease the node count (if you want to decrease a nodepool node count make sure you drain the nodes in question before, you can use `terraform show` to identify the node names at the end of the nodepool list, otherwise, if you do not drain the nodes before removing them, it could leave your cluster in a bad state). The only nodepool that needs to have always at least a count of 1 is the first control-plane nodepool._ +### Autoscaling Node Pools + +We are supporting autoscaling node pools by deploying the [k8s cluster autoscaler (CA)](https://github.com/kubernetes/autoscaler). +By default, this feature is disabled. You can control the feature via adding a pool description to the following variable in `kube.tf` (by default this array is empty): + +```terraform +autoscaler_nodepools = [ + { + name = "autoscaler" + server_type = "cpx21" # must be same or better than the control_plane server type (regarding disk size)! 
+ location = "fsn1" + min_nodes = 0 + max_nodes = 5 + } + ] +``` + +By adding at least one map to the array of `autoscaler_nodepools` the feature will be enabled. +The nodes are booted based on a snapshot that is created from the initial control_plane. +So please ensure that the disk of your chosen server type is at least the same size as the one of the first control_plane. + +See the _CA_ documentation for more configuration options. + ## High Availability By default, we have three control planes and three agents configured, with automatic upgrades and reboots of the nodes. @@ -136,7 +160,8 @@ You can copy and modify the [one in the templates](https://github.com/kube-hetzn ### Turning Off Automatic Upgrade -_If you wish to turn off automatic MicroOS upgrades (Important if you are not launching an HA setup which requires at least 3 control-plane nodes), you need to set:_ +_If you wish to turn off automatic MicroOS upgrades (Important if you are not launching an HA setup which requires at least 3 control-plane nodes), you need to set:_ + ```terraform automatically_upgrade_os = false ``` diff --git a/autoscaler-agents.tf b/autoscaler-agents.tf new file mode 100644 index 00000000..8eeda69e --- /dev/null +++ b/autoscaler-agents.tf @@ -0,0 +1,91 @@ +locals { + autoscaler_yaml = length(var.autoscaler_nodepools) == 0 ? 
"" : templatefile( + "${path.module}/templates/autoscaler.yaml.tpl", + { + #cloudinit_config - we have to check if this is necessary, if so we need to recreate it, or somehow extract it from server module, up to a higher level + cloudinit_config = base64encode(data.cloudinit_config.autoscaler-config[0].rendered) + ca_image = var.cluster_autoscaler_image + ca_version = var.cluster_autoscaler_version + ssh_key = local.hcloud_ssh_key_id + ipv4_subnet_id = hcloud_network.k3s.id # for now we use the k3s network, as we cannot reference subnet-ids in autoscaler + snapshot_id = hcloud_snapshot.autoscaler_image[0].id + firewall_id = hcloud_firewall.k3s.id + node_pools = var.autoscaler_nodepools + }) +} + +resource "hcloud_snapshot" "autoscaler_image" { + count = length(var.autoscaler_nodepools) > 0 ? 1 : 0 + + # using control_plane here as this one is always available + server_id = values(module.control_planes)[0].id + description = "Initial snapshot used for autoscaler" + labels = { + autoscaler = "true" + } +} + +resource "null_resource" "configure_autoscaler" { + count = length(var.autoscaler_nodepools) > 0 ? 1 : 0 + + triggers = { + template = local.autoscaler_yaml + } + connection { + user = "root" + private_key = var.ssh_private_key + agent_identity = local.ssh_agent_identity + host = module.control_planes[keys(module.control_planes)[0]].ipv4_address + port = var.ssh_port + } + + # Upload the autoscaler resource definition + provisioner "file" { + content = local.autoscaler_yaml + destination = "/tmp/autoscaler.yaml" + } + + # Create/Apply the definition + provisioner "remote-exec" { + inline = [ + "set -ex", + "kubectl apply -f /tmp/autoscaler.yaml", + ] + } + + depends_on = [ + null_resource.first_control_plane, + hcloud_snapshot.autoscaler_image + ] +} + +data "cloudinit_config" "autoscaler-config" { + count = length(var.autoscaler_nodepools) > 0 ? 1 : 0 + + gzip = true + base64_encode = true + + # Main cloud-config configuration file. 
+ part { + filename = "init.cfg" + content_type = "text/cloud-config" + content = templatefile( + "${path.module}/templates/autoscaler-cloudinit.yaml.tpl", + { + hostname = "autoscaler" + sshPort = var.ssh_port + sshAuthorizedKeys = concat([var.ssh_public_key], var.ssh_additional_public_keys) + dnsServers = var.dns_servers + k3s_channel = var.initial_k3s_channel + k3s_config = yamlencode({ + server = "https://${var.use_control_plane_lb ? hcloud_load_balancer_network.control_plane.*.ip[0] : module.control_planes[keys(module.control_planes)[0]].private_ipv4_address}:6443" + token = random_password.k3s_token.result + kubelet-arg = ["cloud-provider=external", "volume-plugin-dir=/var/lib/kubelet/volumeplugins"] + flannel-iface = "eth1" + node-label = local.default_agent_labels + node-taint = local.default_agent_taints + }) + } + ) + } +} diff --git a/kube.tf.example b/kube.tf.example index 3729ce30..0a5c24ca 100644 --- a/kube.tf.example +++ b/kube.tf.example @@ -145,6 +145,19 @@ module "kube-hetzner" { # You can refine a base domain name to be use in this form of nodename.base_domain for setting the reserve dns inside Hetzner # base_domain = "mycluster.example.com" + # Cluster Autoscaler + # Providing at least one map for the array enables the cluster autoscaler feature, default is disabled + # * Example below: + # autoscaler_nodepools = [ + # { + # name = "autoscaler" + # server_type = "cpx21" # must be same or better than the control_plane server type (regarding disk size)! + # location = "fsn1" + # min_nodes = 0 + # max_nodes = 5 + # } + # ] + # To use local storage on the nodes, you can enable Longhorn, default is "false". 
# enable_longhorn = true diff --git a/modules/host/variables.tf b/modules/host/variables.tf index 2d6e8820..921e50d1 100644 --- a/modules/host/variables.tf +++ b/modules/host/variables.tf @@ -36,7 +36,7 @@ variable "ssh_keys" { } variable "firewall_ids" { - description = "Set of firewal IDs" + description = "Set of firewall IDs" type = set(number) nullable = true } diff --git a/templates/autoscaler-cloudinit.yaml.tpl b/templates/autoscaler-cloudinit.yaml.tpl new file mode 100644 index 00000000..008bc7b9 --- /dev/null +++ b/templates/autoscaler-cloudinit.yaml.tpl @@ -0,0 +1,135 @@ +instance-id: iid-abcde001 + +#cloud-config + +debug: True + +bootcmd: +# uninstall k3s if it exists already in the snapshot +- [/bin/sh, -c, '[ -f /usr/local/bin/k3s-uninstall.sh ] && /usr/local/bin/k3s-uninstall.sh'] + +write_files: + +# Configure the private network interface +- content: | + BOOTPROTO='dhcp' + STARTMODE='auto' + path: /etc/sysconfig/network/ifcfg-eth1 + +# Disable ssh password authentication +- content: | + Port ${sshPort} + PasswordAuthentication no + X11Forwarding no + MaxAuthTries 2 + AllowTcpForwarding no + AllowAgentForwarding no + AuthorizedKeysFile .ssh/authorized_keys + path: /etc/ssh/sshd_config.d/kube-hetzner.conf + +# Set reboot method as "kured" +- content: | + REBOOT_METHOD=kured + path: /etc/transactional-update.conf + +# Create the sshd_t.pp file, which allows custom SSH ports in SELinux via "semodule -i", +# the encoding is binary in base64, created on a test machine with "audit2allow -a -M sshd_t", +# it is only applied when the port is different than 22, see below in the runcmd section. 
+- content: !!binary | + j/98+QEAAAABAAAAEAAAAI3/fPkPAAAAU0UgTGludXggTW9kdWxlAgAAABUAAAABAAAACAAAAAAA + AAAGAAAAc3NoZF90AwAAADEuMEAAAAAAAAAAAAAAAAAAAAAAAAAAAgAAAAIAAAAKAAAAAAAAAAIA + AAABAAAAAQAAAAAAAAB0Y3Bfc29ja2V0CQAAAAEAAABuYW1lX2JpbmQDAAAAAAAAAAEAAAABAAAA + AQAAAAAAAABkaXIFAAAAAQAAAHdyaXRlAQAAAAEAAAAIAAAAAQAAAAAAAABvYmplY3RfckAAAAAA + AAAAAAAAAEAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAABAAAAAQA + AAARAAAAAQAAAAEAAAABAAAAAAAAAEAAAAAAAAAAAAAAAGNocm9ueWRfdmFyX3J1bl90EQAAAAIA + AAABAAAAAQAAAAAAAABAAAAAAAAAAAAAAAB1bnJlc2VydmVkX3BvcnRfdAgAAAADAAAAAQAAAAEA + AAAAAAAAQAAAAAAAAAAAAAAAd2lja2VkX3QGAAAABAAAAAEAAAABAAAAAAAAAEAAAAAAAAAAAAAA + AHNzaGRfdAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAEAAAABAAAAAAAAAAAA + AAACAAAAAQAAAAAAAABAAAAAQAAAAAEAAAAAAAAACAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAQAAA + AEAAAAABAAAAAAAAAAIAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAEAAAACAAAAAQAAAAEAAAAAAAAA + QAAAAEAAAAABAAAAAAAAAAQAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAEAAAABAAAAAAQAAAAAAAAAB + AAAAAAAAAEAAAAAAAAAAAAAAAAAAAAABAAAAAQAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAA + AAAAAAAAQAAAAEAAAAABAAAAAAAAAAMAAAAAAAAAQAAAAAAAAAAAAAAAQAAAAEAAAAABAAAAAAAA + AA8AAAAAAAAAQAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAA + AgAAAEAAAABAAAAAAQAAAAAAAAABAAAAAAAAAEAAAABAAAAAAQAAAAAAAAABAAAAAAAAAEAAAAAA + AAAAAAAAAEAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAEAA + AAAAAAAAAAAAAEAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAKAAAA + dGNwX3NvY2tldAEAAAABAAAAAQAAAAMAAABkaXIBAAAAAQAAAAEAAAABAAAACAAAAG9iamVjdF9y + AgAAAAEAAAABAAAABAAAABEAAABjaHJvbnlkX3Zhcl9ydW5fdAEAAAABAAAAAQAAABEAAAB1bnJl + c2VydmVkX3BvcnRfdAEAAAABAAAAAQAAAAgAAAB3aWNrZWRfdAEAAAABAAAAAQAAAAYAAABzc2hk + X3QBAAAAAQAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA== + path: /etc/selinux/sshd_t.pp + +- owner: root:root + path: /etc/rancher/k3s/config.yaml + encoding: base64 + content: ${base64encode(k3s_config)} + +- owner: root:root + path: 
/root/install-k3s-agent.sh + permissions: '0600' + content: | + #!/bin/sh + set -e + + # old k3s is deleted with bootcmd + + # run installer. Not the best way to serve directly from a public server, but works for now + curl -sfL https://get.k3s.io | INSTALL_K3S_SKIP_START=true INSTALL_K3S_SKIP_SELINUX_RPM=true INSTALL_K3S_CHANNEL=${k3s_channel} INSTALL_K3S_EXEC=agent sh - + + # install selinux module + /sbin/semodule -v -i /usr/share/selinux/packages/k3s.pp + +# Add new authorized keys +ssh_deletekeys: true + +ssh_authorized_keys: +%{ for key in sshAuthorizedKeys ~} + - ${key} +%{ endfor ~} + +# Resize /var, not /, as that's the last partition in MicroOS image. +growpart: + devices: ["/var"] + +# Make sure the hostname is set correctly +hostname: ${hostname} +preserve_hostname: true + +runcmd: + +# ensure that /var uses full available disk size, thanks to btrfs this is easy +- [btrfs, 'filesystem', 'resize', 'max', '/var'] + +%{ if sshPort != 22 } +# SELinux permission for the SSH alternative port. 
+- [semodule, '-vi', '/etc/selinux/sshd_t.pp'] +%{ endif } + +# As above, make sure the hostname is not reset +- [sed, '-i', 's/NETCONFIG_NIS_SETDOMAINNAME="yes"/NETCONFIG_NIS_SETDOMAINNAME="no"/g', /etc/sysconfig/network/config] +- [sed, '-i', 's/DHCLIENT_SET_HOSTNAME="yes"/DHCLIENT_SET_HOSTNAME="no"/g', /etc/sysconfig/network/dhcp] + +%{ if length(dnsServers) > 0 } +# We set the user provided DNS servers, or leave the value empty to default to Hetzners +- [sed, '-i', 's/NETCONFIG_DNS_STATIC_SERVERS=""/NETCONFIG_DNS_STATIC_SERVERS="${join(" ", dnsServers)}"/g', /etc/sysconfig/network/config] +%{ endif } + +# Bounds the amount of logs that can survive on the system +- [sed, '-i', 's/#SystemMaxUse=/SystemMaxUse=3G/g', /etc/systemd/journald.conf] +- [sed, '-i', 's/#MaxRetentionSec=/MaxRetentionSec=1week/g', /etc/systemd/journald.conf] + +# Reduces the default number of snapshots from 2-10 number limit, to 4 and from 4-10 number limit important, to 2 +- [sed, '-i', 's/NUMBER_LIMIT="2-10"/NUMBER_LIMIT="4"/g', /etc/snapper/configs/root] +- [sed, '-i', 's/NUMBER_LIMIT_IMPORTANT="4-10"/NUMBER_LIMIT_IMPORTANT="3"/g', /etc/snapper/configs/root] + +# Disables unneeded services +- [systemctl, 'restart', 'sshd'] +- [systemctl, disable, '--now', 'rebootmgr.service'] + +# install k3s +- [/bin/sh, /root/install-k3s-agent.sh] + +# start k3s-agent service +- [systemctl, 'start', 'k3s-agent'] diff --git a/templates/autoscaler.yaml.tpl b/templates/autoscaler.yaml.tpl new file mode 100644 index 00000000..423a21c0 --- /dev/null +++ b/templates/autoscaler.yaml.tpl @@ -0,0 +1,199 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + k8s-addon: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler + name: cluster-autoscaler + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cluster-autoscaler + labels: + k8s-addon: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +rules: + - apiGroups: [""] + 
resources: ["events", "endpoints"] + verbs: ["create", "patch"] + - apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] + - apiGroups: [""] + resources: ["pods/status"] + verbs: ["update"] + - apiGroups: [""] + resources: ["endpoints"] + resourceNames: ["cluster-autoscaler"] + verbs: ["get", "update"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["watch", "list", "get", "update"] + - apiGroups: [""] + resources: + - "namespaces" + - "pods" + - "services" + - "replicationcontrollers" + - "persistentvolumeclaims" + - "persistentvolumes" + verbs: ["watch", "list", "get"] + - apiGroups: ["extensions"] + resources: ["replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["watch", "list"] + - apiGroups: ["apps"] + resources: ["statefulsets", "replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses", "csinodes", "csistoragecapacities", "csidrivers"] + verbs: ["watch", "list", "get"] + - apiGroups: ["batch", "extensions"] + resources: ["jobs"] + verbs: ["get", "list", "watch", "patch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create"] + - apiGroups: ["coordination.k8s.io"] + resourceNames: ["cluster-autoscaler"] + resources: ["leases"] + verbs: ["get", "update"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + k8s-addon: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +rules: + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["create","list","watch"] + - apiGroups: [""] + resources: ["configmaps"] + resourceNames: ["cluster-autoscaler-status", "cluster-autoscaler-priority-expander"] + verbs: ["delete", "get", "update", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cluster-autoscaler + labels: + k8s-addon: 
cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-autoscaler +subjects: + - kind: ServiceAccount + name: cluster-autoscaler + namespace: kube-system + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + k8s-addon: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: cluster-autoscaler +subjects: + - kind: ServiceAccount + name: cluster-autoscaler + namespace: kube-system + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + app: cluster-autoscaler +spec: + replicas: 1 + selector: + matchLabels: + app: cluster-autoscaler + template: + metadata: + labels: + app: cluster-autoscaler + annotations: + prometheus.io/scrape: 'true' + prometheus.io/port: '8085' + spec: + serviceAccountName: cluster-autoscaler + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + - effect: NoSchedule + key: node-role.kubernetes.io/master + + # Node affinity is used to force cluster-autoscaler to stick + # to the master node. This allows the cluster to reliably downscale + # to zero worker nodes when needed. 
+ affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/master + operator: Exists + containers: + - image: ${ca_image}:${ca_version} + name: cluster-autoscaler + resources: + limits: + cpu: 100m + memory: 300Mi + requests: + cpu: 100m + memory: 300Mi + command: + - ./cluster-autoscaler + - --v=5 + - --cloud-provider=hetzner + - --stderrthreshold=info + %{~ for pool in node_pools ~} + - --nodes=${pool.min_nodes}:${pool.max_nodes}:${pool.server_type}:${pool.location}:${pool.name} + %{~ endfor ~} + env: + - name: HCLOUD_TOKEN + valueFrom: + secretKeyRef: + name: hcloud + key: token + - name: HCLOUD_CLOUD_INIT + value: ${cloudinit_config} + - name: HCLOUD_SSH_KEY + value: '${ssh_key}' + - name: HCLOUD_NETWORK + value: '${ipv4_subnet_id}' + - name: HCLOUD_IMAGE + value: '${snapshot_id}' + - name: HCLOUD_FIREWALL + value: '${firewall_id}' + volumeMounts: + - name: ssl-certs + mountPath: /etc/ssl/certs + readOnly: true + imagePullPolicy: "Always" + volumes: + - name: ssl-certs + hostPath: + path: "/etc/ssl/certs" # right place on MicroOS? diff --git a/variables.tf b/variables.tf index 4e6ea5e0..e8a118cc 100644 --- a/variables.tf +++ b/variables.tf @@ -80,6 +80,30 @@ variable "agent_nodepools" { default = [] } +variable "cluster_autoscaler_image" { + type = string + default = "k8s.gcr.io/autoscaling/cluster-autoscaler" + description = "Image of Kubernetes Cluster Autoscaler for Hetzner Cloud to be used." +} + +variable "cluster_autoscaler_version" { + type = string + default = "v1.25.0" + description = "Version of Kubernetes Cluster Autoscaler for Hetzner Cloud. Should be aligned with Kubernetes version" +} + +variable "autoscaler_nodepools" { + description = "Cluster autoscaler nodepools." 
+ type = list(object({ + name = string + server_type = string + location = string + min_nodes = number + max_nodes = number + })) + default = [] +} + variable "hetzner_ccm_version" { type = string default = null @@ -243,7 +267,6 @@ variable "longhorn_fstype" { } } - variable "longhorn_replica_count" { type = number default = 3