diff --git a/init.tf b/init.tf index 369aa21f..a2e44d0f 100644 --- a/init.tf +++ b/init.tf @@ -88,7 +88,7 @@ resource "null_resource" "kustomization" { "https://raw.githubusercontent.com/rancher/system-upgrade-controller/master/manifests/system-upgrade-controller.yaml", ], var.disable_hetzner_csi ? [] : ["https://raw.githubusercontent.com/hetznercloud/csi-driver/${local.csi_version}/deploy/kubernetes/hcloud-csi.yml"], - local.is_single_node_cluster ? [] : var.traefik_enabled ? ["traefik_config.yaml"] : [], + local.using_klipper_lb ? [] : var.traefik_enabled ? ["traefik_config.yaml"] : [], var.cni_plugin == "calico" ? ["https://projectcalico.docs.tigera.io/manifests/calico.yaml"] : [], var.enable_longhorn ? ["longhorn.yaml"] : [], var.enable_cert_manager || var.enable_rancher ? ["cert-manager.yaml"] : [], @@ -109,7 +109,7 @@ resource "null_resource" "kustomization" { # Upload traefik config provisioner "file" { - content = local.is_single_node_cluster || var.traefik_enabled == false ? "" : templatefile( + content = local.using_klipper_lb || var.traefik_enabled == false ? "" : templatefile( "${path.module}/templates/traefik_config.yaml.tpl", { name = "${var.cluster_name}-traefik" @@ -196,6 +196,7 @@ resource "null_resource" "kustomization" { provisioner "remote-exec" { inline = concat([ "set -ex", + # This ugly hack is here, because terraform serializes the # embedded yaml files with "- |2", when there is more than # one yamldocument in the embedded file. Kustomize does not understand # due to indendation this should not changes the embedded # manifests themselves "sed -i 's/^- |[0-9]\\+$/- |/g' /var/post_install/kustomization.yaml", + + # Wait for k3s to become ready (we check one more time) because in some edge cases, + # the cluster had become unavailable for a few seconds, at this very instant. 
+ <<-EOT + timeout 120 bash < /dev/null)" == "ok" ]]; do + echo "Waiting for the cluster to become ready..." + sleep 2 + done + EOF + EOT + , + + # Ready, set, go for the kustomization "kubectl apply -k /var/post_install", "echo 'Waiting for the system-upgrade-controller deployment to become available...'", "kubectl -n system-upgrade wait --for=condition=available --timeout=120s deployment/system-upgrade-controller", "kubectl -n system-upgrade apply -f /var/post_install/plans.yaml" ], - local.is_single_node_cluster || var.traefik_enabled == false ? [] : [<<-EOT + + local.using_klipper_lb || var.traefik_enabled == false ? [] : [<<-EOT timeout 120 bash < /dev/null)" ]; do echo "Waiting for load-balancer to get an IP..." diff --git a/locals.tf b/locals.tf index dcb9dcfd..48b8380f 100644 --- a/locals.tf +++ b/locals.tf @@ -68,16 +68,18 @@ locals { # if we are in a single cluster config, we use the default klipper lb instead of Hetzner LB control_plane_count = sum([for v in var.control_plane_nodepools : v.count]) agent_count = sum([for v in var.agent_nodepools : v.count]) - is_single_node_cluster = local.control_plane_count + local.agent_count == 1 + is_single_node_cluster = (local.control_plane_count + local.agent_count) == 1 + + using_klipper_lb = var.use_klipper_lb || local.is_single_node_cluster # disable k3s extras - disable_extras = concat(["local-storage"], local.is_single_node_cluster ? [] : ["servicelb"], var.traefik_enabled ? [] : ["traefik"], var.metrics_server_enabled ? [] : ["metrics-server"]) + disable_extras = concat(["local-storage"], local.using_klipper_lb ? [] : ["servicelb"], var.traefik_enabled ? [] : ["traefik"], var.metrics_server_enabled ? [] : ["metrics-server"]) # Default k3s node labels default_agent_labels = concat([], var.automatically_upgrade_k3s ? ["k3s_upgrade=true"] : []) default_control_plane_labels = concat([], var.automatically_upgrade_k3s ? ["k3s_upgrade=true"] : []) - allow_scheduling_on_control_plane = local.is_single_node_cluster ? 
true : var.allow_scheduling_on_control_plane + allow_scheduling_on_control_plane = local.using_klipper_lb ? true : var.allow_scheduling_on_control_plane # Default k3s node taints default_control_plane_taints = concat([], local.allow_scheduling_on_control_plane ? [] : ["node-role.kubernetes.io/master:NoSchedule"]) @@ -201,7 +203,7 @@ locals { "0.0.0.0/0" ] } - ], !local.is_single_node_cluster ? [] : [ + ], !local.using_klipper_lb ? [] : [ # Allow incoming web traffic for single node clusters, because we are using k3s servicelb there, # not an external load-balancer. { diff --git a/main.tf b/main.tf index 56cf19f1..7521984b 100644 --- a/main.tf +++ b/main.tf @@ -60,7 +60,7 @@ resource "hcloud_placement_group" "agent" { } data "hcloud_load_balancer" "traefik" { - count = local.is_single_node_cluster ? 0 : var.traefik_enabled == false ? 0 : 1 + count = local.using_klipper_lb ? 0 : var.traefik_enabled == false ? 0 : 1 name = "${var.cluster_name}-traefik" depends_on = [null_resource.kustomization] diff --git a/output.tf b/output.tf index 1ea34fd0..b7678df4 100644 --- a/output.tf +++ b/output.tf @@ -19,7 +19,7 @@ output "agents_public_ipv4" { output "load_balancer_public_ipv4" { description = "The public IPv4 address of the Hetzner load balancer" - value = local.is_single_node_cluster ? [ + value = local.using_klipper_lb ? [ for obj in module.control_planes : obj.ipv4_address ][0] : var.traefik_enabled == false ? 
null : data.hcloud_load_balancer.traefik[0].ipv4 } diff --git a/terraform.tfvars.example b/terraform.tfvars.example index e50001e3..288275a9 100644 --- a/terraform.tfvars.example +++ b/terraform.tfvars.example @@ -170,10 +170,16 @@ load_balancer_location = "fsn1" # Example: traefik_additional_options = ["--log.level=DEBUG", "--tracing=true"] # traefik_additional_options = [] +# Use the klipper LB, instead of the default Hetzner one, which has the advantage of dropping the cost of the setup, +# but you would need to point your DNS to every schedulable IP in your cluster (usually agents). The default is "false". +# Automatically "true" in the case of a single node cluster. +# use_klipper_lb = true + # If you want to configure a different CNI for k3s, use this flag # possible values: flannel (Default), calico # Cilium or other would be easy to add, you can mirror how Calico was added. PRs are welcome! -# CAVEATS: Calico is not supported for single node setups, because of the following issue https://github.com/k3s-io/klipper-lb/issues/6. +# CAVEATS: Calico is not supported when not using the Hetzner LB (like when use_klipper_lb is set to true or when using a single node cluster), +# because of the following issue https://github.com/k3s-io/klipper-lb/issues/6. # cni_plugin = "calico" # If you want to disable the k3s default network policy controller, use this flag! diff --git a/variables.tf b/variables.tf index 714ce4a2..d7371c2d 100644 --- a/variables.tf +++ b/variables.tf @@ -210,3 +210,9 @@ variable "rancher_registration_manifest_url" { description = "The url of a rancher registration manifest to apply. (see https://rancher.com/docs/rancher/v2.6/en/cluster-provisioning/registered-clusters/)" default = "" } + +variable "use_klipper_lb" { + type = bool + default = false + description = "Use klipper load balancer" +}