Add high availability support for k3s with external database (loft-sh…

…#795) * feat(k3s): add HA support with external database Adjusted the k3s Helm chart to support the HA setup of k3s with an external database. * docs: add description for k3s in HA mode * fix(k3s): create service for STS only when HA mode is disabled * feat(k3s): always use policy/v1 for PodDisruptionBudget * feat(k3s): allow configuration of PodDisruptionBudget Adjusted the Helm template to allow the conditional creation of the PodDisruptionBudget as well as to provide user-defined values. * refactor(k3s): move changes for HA support into existing template In order to prevent the duplication of content, all necessary changes to add HA support were moved into the existing template `statefulset.yaml`. All k3s-specific named templates were added to a dedicated file to keep `_helpers.tpl` consistent across all charts. * feat(k3s): automatically generate server token Added the necessary changes to automatically generate the secret containing the k3s server token - in case no value is supplied. * refactor(k3s): generate k3s tokens with Helm hook Instead of using the `lookup` function, which is known to cause problems with the Helm CLI, ArgoCD, ..., to generate the k3s server token, it is now generated by a Helm pre-install hook. * refactor(k3s): only create secret in pre-install hook * docs: fix link to external datastore page
FabianKramm · Nov 7, 2022 · aed2b16 · aed2b16
1 parent 201de53
commit aed2b16
Show file tree

Hide file tree

Showing 10 changed files with 241 additions and 11 deletions.
diff --git a/charts/k3s/templates/_k3s_helpers.tpl b/charts/k3s/templates/_k3s_helpers.tpl
@@ -0,0 +1,29 @@
+{{/* vim: set filetype=mustache: */}}
+{{/*
+Workload kind used for the k3s control plane: "Deployment" when enableHA is true, otherwise "StatefulSet".
+*/}}
+{{- define "vcluster.k3s.workloadKind" -}}
+{{- .Values.enableHA | ternary "Deployment" "StatefulSet" -}}
+{{- end -}}
+
+{{/*
+Name of the Secret that holds the k3s tokens.
+
+A non-empty serverToken.secretKeyRef.name is returned as-is; otherwise the
+name falls back to "<release name>-tokens".
+*/}}
+{{- define "vcluster.k3s.tokenSecretName" -}}
+{{- $fallback := printf "%s-tokens" .Release.Name -}}
+{{- .Values.serverToken.secretKeyRef.name | default $fallback -}}
+{{- end -}}
+
+{{/*
+Key of the Secret entry that contains the k3s server token.
+
+A non-empty serverToken.secretKeyRef.key is returned as-is; otherwise the
+key falls back to "server-token".
+*/}}
+{{- define "vcluster.k3s.serverTokenKey" -}}
+{{- $fallback := "server-token" -}}
+{{- .Values.serverToken.secretKeyRef.key | default $fallback -}}
+{{- end -}}
diff --git a/charts/k3s/templates/pdb.yaml b/charts/k3s/templates/pdb.yaml
@@ -0,0 +1,27 @@
+{{- /* PodDisruptionBudget for HA setups with more than one replica. An explicit maxUnavailable wins over minAvailable (which has a chart default) because the API accepts only one of the two. */ -}}
+{{- if and (.Values.enableHA) (.Values.podDisruptionBudget.enabled) (gt (int .Values.replicas) 1) -}}
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: {{ .Release.Name }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app: vcluster
+    chart: "{{ .Chart.Name }}-{{ .Chart.Version }}"
+    release: "{{ .Release.Name }}"
+    heritage: "{{ .Release.Service }}"
+  {{- if .Values.globalAnnotations }}
+  annotations:
+{{ toYaml .Values.globalAnnotations | indent 4 }}
+  {{- end }}
+spec:
+  {{- if .Values.podDisruptionBudget.maxUnavailable }}
+  maxUnavailable: {{ .Values.podDisruptionBudget.maxUnavailable }}
+  {{- else if .Values.podDisruptionBudget.minAvailable }}
+  minAvailable: {{ .Values.podDisruptionBudget.minAvailable }}
+  {{- end }}
+  selector:
+    matchLabels:
+      app: vcluster
+      release: {{ .Release.Name }}
+{{- end }}
diff --git a/charts/k3s/templates/pre-install-hook-secret.yaml b/charts/k3s/templates/pre-install-hook-secret.yaml
@@ -0,0 +1,19 @@
+{{- /*
+Pre-install hook Secret holding the k3s server token. Rendered only in HA mode and only when no existing secret is referenced via serverToken.secretKeyRef.
+*/ -}}
+{{- if and .Values.enableHA (not .Values.serverToken.secretKeyRef) }}
+apiVersion: v1
+kind: Secret
+metadata:
+  name: {{ include "vcluster.k3s.tokenSecretName" . | quote }}
+  namespace: {{ .Release.Namespace }}
+  annotations:
+    helm.sh/hook: pre-install
+    helm.sh/hook-weight: "3"
+    # helm.sh/hook-delete-policy: before-hook-creation # Default value
+    helm.sh/resource-policy: keep
+type: Opaque
+data:
+  {{- /* Use the configured token value; fall back to a random 32-character alphanumeric token when it is empty. */}}
+  server-token: {{ .Values.serverToken.value | default (randAlphaNum 32) | b64enc | quote }}
+{{- end }}
diff --git a/charts/k3s/templates/statefulset-service.yaml b/charts/k3s/templates/statefulset-service.yaml
@@ -1,3 +1,4 @@
+{{- if (eq (include "vcluster.k3s.workloadKind" .) "StatefulSet") }}
 apiVersion: v1
 kind: Service
 metadata:
@@ -22,3 +23,4 @@ spec:
   selector:
     app: vcluster
     release: "{{ .Release.Name }}"
+{{- end }}
diff --git a/charts/k3s/templates/statefulset.yaml b/charts/k3s/templates/statefulset.yaml
@@ -1,5 +1,6 @@
+{{- $kind := include "vcluster.k3s.workloadKind" . -}}
 apiVersion: apps/v1
-kind: StatefulSet
+kind: {{ $kind }}
 metadata:
   name: {{ .Release.Name }}
   namespace: {{ .Release.Namespace }}
@@ -17,12 +18,26 @@ metadata:
 {{ toYaml $annotations | indent 4 }}
   {{- end }}
 spec:
+{{- if (eq $kind "StatefulSet") }}
   serviceName: {{ .Release.Name }}-headless
+{{- end }}
   replicas: {{ .Values.replicas }}
+{{- if (and (eq $kind "Deployment") (.Values.enableHA)) }}
+  strategy:
+    rollingUpdate:
+      maxSurge: 1
+    {{- if (eq (int .Values.replicas) 1) }}
+      maxUnavailable: 0
+    {{- else }}
+      maxUnavailable: 1
+    {{- end }}
+    type: RollingUpdate
+{{- end }}
   selector:
     matchLabels:
       app: vcluster
       release: {{ .Release.Name }}
+  {{- if (eq $kind "StatefulSet") }}
   {{- if (hasKey .Values "volumeClaimTemplates") }}
   volumeClaimTemplates:
 {{ toYaml .Values.volumeClaimTemplates | indent 4 }}
@@ -37,6 +52,7 @@ spec:
           requests:
             storage: {{ .Values.storage.size }}
   {{- end }}
+  {{- end }}
   template:
     metadata:
   {{- if .Values.podAnnotations }}
@@ -53,8 +69,42 @@ spec:
       terminationGracePeriodSeconds: 10
       nodeSelector:
 {{ toYaml .Values.nodeSelector | indent 8 }}
+      {{- if .Values.affinity }}
       affinity:
 {{ toYaml .Values.affinity | indent 8 }}
+      {{- else if .Values.enableHA }}
+      affinity:
+        podAntiAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+          # if possible avoid scheduling more than one pod on one node
+          - weight: 100
+            podAffinityTerm:
+              labelSelector:
+                matchExpressions:
+                - key: app
+                  operator: In
+                  values:
+                  - vcluster
+                - key: release
+                  operator: In
+                  values:
+                  - {{ .Release.Name }}
+              topologyKey: "kubernetes.io/hostname"
+          # if possible avoid scheduling pod onto node that is in the same zone as one or more vcluster pods are running
+          - weight: 50
+            podAffinityTerm:
+              labelSelector:
+                matchExpressions:
+                - key: app
+                  operator: In
+                  values:
+                  - vcluster
+                - key: release
+                  operator: In
+                  values:
+                  - {{ .Release.Name }}
+              topologyKey: topology.kubernetes.io/zone
+      {{- end }}
       tolerations:
 {{ toYaml .Values.tolerations | indent 8 }}
       {{- if .Values.serviceAccount.name }}
@@ -130,6 +180,13 @@ spec:
           {{- if .Values.vcluster.env }}
 {{ toYaml .Values.vcluster.env | indent 10 }}
           {{- end }}
+          {{- if .Values.enableHA }}
+          - name: K3S_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: {{ include "vcluster.k3s.tokenSecretName" . | quote }}
+                key: {{ include "vcluster.k3s.serverTokenKey" . | quote }}
+          {{- end }}
           {{- if not .Values.serviceCIDR }}
           - name: SERVICE_CIDR
             valueFrom:
@@ -180,6 +237,11 @@ spec:
           {{- if .Values.syncer.kubeConfigContextName }}
           - --kube-config-context-name={{ .Values.syncer.kubeConfigContextName }}
           {{- end }}
+          {{- if .Values.enableHA }}
+          - --leader-elect=true
+          {{- else }}
+          - --leader-elect=false
+          {{- end }}
           {{- if .Values.ingress.enabled }}
           - --tls-san={{ .Values.ingress.host }}
           {{- end }}

diff --git a/charts/k3s/values.yaml b/charts/k3s/values.yaml
@@ -1,6 +1,10 @@
 # These annotations will be applied to all objects created in this chart
 globalAnnotations: {}
 
+# Whether the control plane is deployed in high availability mode.
+# When enabled, make sure to scale up the replicas and use an external datastore.
+enableHA: false
+
 # DefaultImageRegistry will be prepended to all deployed vcluster images, such as the vcluster pod, coredns etc.. Deployed
 # images within the vcluster will not be rewritten.
 defaultImageRegistry: ""
@@ -223,6 +227,25 @@ podLabels: {}
 annotations: {}
 podAnnotations: {}
 
+# PodDisruptionBudget settings for the vcluster
+# (takes effect only if high availability mode is enabled and more than one replica is created)
+podDisruptionBudget:
+  # Controls whether a PodDisruptionBudget will be created
+  enabled: false
+  minAvailable: 1
+  # maxUnavailable: 1
+
+# k3s token settings for the vcluster
+# (take effect only if high availability mode is enabled)
+serverToken:
+  # Shared secret used to join a k3s server to the cluster. If empty, a random token value will be generated.
+  # (Note that this token is also used to generate the encryption key for important content in the database e.g., bootstrap data)
+  value: ""
+  # Reference to an existing secret key used as value for the k3s server token
+  secretKeyRef: {}
+    # name: ""
+    # key: ""
+
 # Service configurations
 service:
   type: ClusterIP

diff --git a/docs/pages/fragments/high-availability-k3s.mdx b/docs/pages/fragments/high-availability-k3s.mdx
@@ -0,0 +1,61 @@
+### Enabling High Availability
+
+In order to run vcluster with k3s as Kubernetes distribution in high availability mode, the following steps are required:
+
+* create and use an [external datastore](../operator/external-datastore.mdx) (as opposed to the embedded SQLite datastore used in single-server setups)
+* run two or more k3s pods that will serve the Kubernetes API and run other control plane services
+
+First create a `values.yaml` in the following form and make sure to change the connection string in `K3S_DATASTORE_ENDPOINT`:
+
+```
+# Enable HA mode
+enableHA: true
+
+# Scale up k3s replicas
+replicas: 2
+
+# Set external datastore endpoint
+vcluster:
+  env:
+    - name: K3S_DATASTORE_ENDPOINT
+      value: mysql://username:password@tcp(hostname:3306)/database-name
+
+# Disable persistent storage as all data (including bootstrap data) is stored in external datastore
+storage:
+  persistence: false
+
+# Scale up CoreDNS replicas
+coredns:
+  replicas: 2
+```
+
+Then create the vcluster with the following command:
+
+```
+vcluster create ... --connect=false -f values.yaml
+```
+
+Check that vcluster including the control plane is running correctly:
+
+```
+kubectl get pods -n vcluster
+NAME                                                READY   STATUS    RESTARTS   AGE
+coredns-66ffcc6b58-bhk4s-x-kube-system-x-vcluster   1/1     Running   0          21s
+coredns-66ffcc6b58-n7npd-x-kube-system-x-vcluster   1/1     Running   0          21s
+vcluster-54fb5dd76-92szq                            2/2     Running   0          3m1s
+vcluster-54fb5dd76-ntbrh                            2/2     Running   0          3m1s
+```
+
+Now connect to the vcluster:
+
+```
+vcluster connect vcluster -n vcluster
+
+# Then execute in a new terminal
+export KUBECONFIG=kubeconfig.yaml
+kubectl get ns
+...
+```
+
+
+Check the [GitHub repository](https://github.com/loft-sh/vcluster/tree/main/charts/k3s) for all available chart options.
diff --git a/docs/pages/fragments/high-availability.mdx → ...pages/fragments/high-availability-k8s.mdx b/docs/pages/fragments/high-availability.mdx → ...pages/fragments/high-availability-k8s.mdx
@@ -1,3 +1,5 @@
+### Enabling High Availability
+
 In order to run vcluster in high availability mode, create a `values.yaml` in the following form:
 
 ```
@@ -57,11 +59,11 @@ vcluster connect vcluster-1 -n host-namespace-1
 
 # Then execute in a new terminal
 export KUBECONFIG=kubeconfig.yaml
-kubectl get ns  
+kubectl get ns
 ...
 ```
 
-## Enable HA in rootless mode
+### Enable HA in rootless mode
 Rootless mode means running vcluster without root user privileges in container, making host k8s cluster more secure.
 You can find more about rootless mode [here](../operator/restricted-hosts.mdx).
 
@@ -147,4 +149,4 @@ coredns:
       type: RuntimeDefault
 ```
 
-Check the [github repository](https://github.com/loft-sh/vcluster/tree/main/charts/k8s) for all available chart options.
+Check the [github repository](https://github.com/loft-sh/vcluster/tree/main/charts/k8s) for all available chart options.
diff --git a/docs/pages/operator/high-availability.mdx b/docs/pages/operator/high-availability.mdx
@@ -3,10 +3,15 @@ title: High Availability
 sidebar_label: High Availability
 ---
 
-import HighAvailability from '../fragments/high-availability.mdx';
+import HighAvailabilityK3s from '../fragments/high-availability-k3s.mdx';
+import HighAvailabilityK8s from '../fragments/high-availability-k8s.mdx';
 
-vcluster supports high-availability with the vanilla k8s distribution. Single binary distributions such as k0s and k3s are currently not supported for high availability setup of vcluster.
+vcluster supports high availability with k3s as well as with the vanilla k8s distribution. k0s is currently not supported for high availability setups of vcluster.
 
-### Enabling High Availability
+## k3s
 
-<HighAvailability />
+<HighAvailabilityK3s />
+
+## Vanilla k8s
+
+<HighAvailabilityK8s />
diff --git a/docs/pages/operator/other-distributions.mdx b/docs/pages/operator/other-distributions.mdx
@@ -3,7 +3,7 @@ title: Other Kubernetes distributions
 sidebar_label: Other Kubernetes distributions
 ---
 
-import HighAvailability from '../fragments/high-availability.mdx';
+import HighAvailabilityK8s from '../fragments/high-availability-k8s.mdx';
 
 By default, vcluster will use [k3s](https://github.com/k3s-io/k3s) as virtual Kubernetes cluster, which is a highly available, certified Kubernetes distribution designed for production workloads in unattended, resource-constrained, remote locations or inside IoT appliances.
 
@@ -29,7 +29,7 @@ Behind the scenes a different helm chart will be deployed (`vcluster-k0s`), that
 
 ## Vanilla k8s
 
-When choosing this option, vcluster will deploy a separate etcd cluster, kubernetes controller manager and api server alongside the vcluster hypervisor. 
+When choosing this option, vcluster will deploy a separate etcd cluster, kubernetes controller manager and api server alongside the vcluster hypervisor.
 
 In order to use vanilla k8s as backing cluster, create a vcluster with the following command:
 
@@ -47,7 +47,7 @@ Behind the scenes a different helm chart will be deployed (`vcluster-k8s`), that
 
 ### High Available Vanilla k8s
 
-<HighAvailability />
+<HighAvailabilityK8s />
 
 ## Other Distributions