forked from mspnp/aks-baseline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkured.yaml
183 lines (183 loc) · 5.77 KB
/
kured.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# Source: https://github.com/kubereboot/charts/tree/kured-5.2.0/charts/kured (1.14.0)
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kured
rules:
# Allow kured to read spec.unschedulable
# Allow kubectl to drain/uncordon
#
# NB: These permissions are tightly coupled to the bundled version of kubectl; the ones below
# match https://github.com/kubernetes/kubernetes/blob/v1.19.4/staging/src/k8s.io/kubectl/pkg/cmd/drain/drain.go
#
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "patch"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["list","delete","get"]
- apiGroups: ["extensions"]
resources: ["daemonsets"]
verbs: ["get"]
- apiGroups: ["apps"]
resources: ["daemonsets"]
verbs: ["get"]
- apiGroups: [""]
resources: ["pods/eviction"]
verbs: ["create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kured
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kured
subjects:
- kind: ServiceAccount
name: kured
namespace: cluster-baseline-settings
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
namespace: cluster-baseline-settings
name: kured
rules:
# Allow kured to lock/unlock itself
- apiGroups: ["extensions"]
resources: ["daemonsets"]
resourceNames: ["kured"]
verbs: ["update", "patch"]
- apiGroups: ["apps"]
resources: ["daemonsets"]
resourceNames: ["kured"]
verbs: ["update", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
namespace: cluster-baseline-settings
name: kured
subjects:
- kind: ServiceAccount
namespace: cluster-baseline-settings
name: kured
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: kured
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: kured
namespace: cluster-baseline-settings
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: kured # Must match `--ds-name`
namespace: cluster-baseline-settings # Must match `--ds-namespace`
spec:
revisionHistoryLimit: 10
selector:
matchLabels:
app.kubernetes.io/name: kured
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
template:
metadata:
labels:
app.kubernetes.io/name: kured
annotations:
prometheus.io/scrape: "true"
prometheus.io/path: "/metrics"
prometheus.io/port: "8080"
spec:
serviceAccountName: kured
tolerations:
- key: node-role.kubernetes.io/control-plane
effect: NoSchedule
- key: node-role.kubernetes.io/master
effect: NoSchedule
- key: CriticalAddonsOnly
effect: NoSchedule
operator: Equal
value: "true"
hostNetwork: true
hostPID: true # Facilitate entering the host mount namespace via init
restartPolicy: Always
nodeSelector:
kubernetes.io/arch: amd64
kubernetes.io/os: linux
containers:
- name: kured
# PRODUCTION READINESS CHANGE REQUIRED
# This image should be sourced from a non-public container registry, such as the
# one deployed along side of this reference implementation.
# az acr import --source ghcr.io/kubereboot/kured:1.14.0 -n <your-acr-instance-name>
# and then set this to
# image: <your-acr-instance-name>.azurecr.io/kubereboot/kured:1.14.0
image: ghcr.io/kubereboot/kured:1.14.0
imagePullPolicy: IfNotPresent
securityContext:
privileged: true # Give permission to nsenter /proc/1/ns/mnt
resources:
limits:
cpu: 500m
memory: 48Mi
requests:
cpu: 200m
memory: 16Mi
ports:
- containerPort: 8080
name: metrics
env:
# Pass in the name of the node on which this pod is scheduled
# for use with drain/uncordon operations and lock acquisition
- name: KURED_NODE_ID
valueFrom:
fieldRef:
fieldPath: spec.nodeName
command:
- /usr/bin/kured
args:
- --ds-namespace=cluster-baseline-settings
# - --ds-name=kured
# - --reboot-command=/bin/systemctl reboot
# - --force-reboot=false
# - --drain-grace-period=-1
# - --skip-wait-for-delete-timeout=0
# - --drain-timeout=0
# - --period=1h
# - --ds-name=kured
# - --lock-annotation=weave.works/kured-node-lock
# - --lock-ttl=0
# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local
# - --alert-filter-regexp=^RebootRequired$
# - --alert-firing-only=false
# - --reboot-sentinel=/var/run/reboot-required
# - --prefer-no-schedule-taint=""
# - --reboot-sentinel-command=""
# - --slack-hook-url=https://hooks.slack.com/...
# - --slack-username=prod
# - --slack-channel=alerting
# - --notify-url="" # See also shoutrrr url format
# - --message-template-drain=Draining node %s
# - --message-template-reboot=Rebooting node %s
# - --message-template-uncordon=Node %s rebooted & uncordoned successfully!
# - --blocking-pod-selector=runtime=long,cost=expensive
# - --blocking-pod-selector=name=temperamental
# - --blocking-pod-selector=...
# - --reboot-days=sun,mon,tue,wed,thu,fri,sat
# - --reboot-delay=90s
# - --start-time=0:00
# - --end-time=23:59:59
# - --time-zone=UTC
# - --annotate-nodes=false
# - --lock-release-delay=30m
# - --log-format=text