-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathk8s-check.py
160 lines (153 loc) · 6.17 KB
/
k8s-check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/opt/k8s-check/venv/bin/python3
#
# This script performs a basic health check of the k8s environment. It reports the status of:
# - nodes
# - deployments
# - PVCs
#
# If a problem is found, it displays a message and the return code is 1.
#
import argparse
import sys
from kubernetes import client, config, dynamic
from tabulate import tabulate
def _node_status(node):
    """
    Build the comma-separated status string that is displayed for one node.

    The string lists every node condition whose status is "True" (except the
    k3s "EtcdIsVoter" condition), followed by the effect of every taint.
    A healthy, untainted node therefore yields exactly "Ready".
    :param node: node object as returned by the dynamic client
    :return: status string, e.g. "Ready" or "NotReady,NoSchedule"
    """
    parts = []
    # conditions/taints may be absent on the object; treat that as "none".
    for condition in node.status.conditions or []:
        cond_type = condition["type"]
        cond_status = condition["status"]
        if cond_type == "Ready" and cond_status != "True":
            # Ready=False and Ready=Unknown both mean the node is not usable.
            # This replaces whatever was collected so far.
            parts = ["NotReady"]
        elif cond_status == "True" and "EtcdIsVoter" not in cond_type:
            parts.append(cond_type)
    parts.extend(taint["effect"] for taint in node.spec.taints or [])
    return ",".join(parts)


def get_nodes(args):
    """
    Print an overview of all nodes with their status and version.

    Every node whose status string is anything other than exactly "Ready"
    (not ready, an extra condition such as a pressure condition, a taint, or
    no Ready condition at all) marks the cluster as having problems.
    :param args: parsed command line arguments; args.kubeconfig is the path to the kubeconfig
    :return: True if at least one node is not plain "Ready", otherwise False
    """
    # load_kube_config() installs the loaded settings as the default client
    # configuration, which a plain ApiClient() then picks up.
    config.load_kube_config(args.kubeconfig)
    dyn_client = dynamic.DynamicClient(client.ApiClient())
    api = dyn_client.resources.get(api_version="v1", kind="Node")

    error_present = False
    node_info = []
    # The list response already contains the complete node objects, so there
    # is no need to fetch every node again by name.
    for node in api.get().items:
        node_status = _node_status(node)
        if node_status != "Ready":
            error_present = True
        system_info = node.status.nodeInfo
        # kubeProxyVersion is deprecated upstream and may be empty; fall back
        # to the kubelet version in that case.
        version = system_info.kubeProxyVersion or system_info.kubeletVersion
        node_info.append([node.metadata.name, node_status, version])

    print("\n\nOverview of Nodes\n")
    headers = ["Nodename", "status", "K3S Version"]
    print(tabulate(node_info, headers=headers, tablefmt="pretty"))
    return error_present
def get_deployments(args):
    """
    Print an overview of all deployments in all namespaces.

    For every deployment the requested number of replicas is shown next to
    the available and unavailable replica counts. A deployment that reports
    one or more unavailable replicas marks the cluster as having problems.
    :param args: parsed command line arguments; args.kubeconfig is the path to the kubeconfig
    :return: True if at least one deployment has unavailable replicas, otherwise False
    """
    config.load_kube_config(args.kubeconfig)
    deployments = client.AppsV1Api().list_deployment_for_all_namespaces()

    error_present = False
    depl_info = []
    for deployment in deployments.items:
        status = deployment.status
        depl_info.append([
            deployment.metadata.name,
            deployment.metadata.namespace,
            deployment.spec.replicas,
            status.available_replicas,
            status.unavailable_replicas,
        ])
        # None (field absent) and 0 both mean nothing is unavailable.
        if status.unavailable_replicas:
            error_present = True

    headers = ["Name", "Namespace", "Minimum Replicas", "Available Replicas", "Unavailable Replicas"]
    print("\n\nOverview of deployments\n")
    print(tabulate(depl_info, headers=headers, tablefmt="pretty"))
    return error_present
def get_pvc_status(args):
    """
    Print an overview of all PVCs with their status and Longhorn robustness.

    For every PVC the backing Longhorn volume (looked up in the
    "longhorn-system" namespace) is queried for its robustness, actual size
    and reserved size. If a volume is found and its robustness does not
    contain "healthy", the cluster is reported as having problems. PVCs
    without a bound volume, or whose volume is not a Longhorn volume, are
    listed with "Unknown" values and do not count as a problem.
    :param args: parsed command line arguments; args.kubeconfig is the path to the kubeconfig
    :return: True if at least one Longhorn volume is not healthy, otherwise False
    """
    config.load_kube_config(args.kubeconfig)
    core_v1_api = client.CoreV1Api()
    custom_objects_api = client.CustomObjectsApi()

    # List the pods once and index them by (namespace, claim name) instead of
    # listing all pods again for every PVC. A claim can only be mounted by
    # pods in its own namespace, so the namespace is part of the key. If
    # several pods use the same claim, the last one listed wins.
    pod_phase_by_claim = {}
    for pod in core_v1_api.list_pod_for_all_namespaces().items:
        for vol in pod.spec.volumes or []:  # volumes is None for pods without volumes
            claim = vol.persistent_volume_claim
            if claim:
                pod_phase_by_claim[(pod.metadata.namespace, claim.claim_name)] = pod.status.phase

    error_present = False
    pvc_info = []
    for pvc in core_v1_api.list_persistent_volume_claim_for_all_namespaces().items:
        namespace = pvc.metadata.namespace
        pvc_name = pvc.metadata.name
        status = pvc.status.phase
        resources = pvc.spec.resources
        requests = (resources.requests if resources else None) or {}
        storage = requests.get('storage', 'Unknown')
        volume_name = pvc.spec.volume_name

        robustness = "Unknown"
        used_space = "Unknown"
        reserved_space = "Unknown"
        # An unbound PVC has no volume name yet; there is nothing to look up.
        if volume_name:
            try:
                volume = custom_objects_api.get_namespaced_custom_object(
                    group="longhorn.io",
                    version="v1beta1",
                    namespace="longhorn-system",
                    plural="volumes",
                    name=volume_name,
                )
            except client.exceptions.ApiException as exc:
                # 404: the volume is not a Longhorn volume, skip it silently.
                # Anything else is unexpected; report it but keep going.
                if exc.status != 404:
                    print(f"WARNING: could not read Longhorn volume {volume_name}: "
                          f"{exc.status} {exc.reason}", file=sys.stderr)
            else:
                volume_status = volume.get("status") or {}
                volume_spec = volume.get("spec") or {}
                robustness = volume_status.get("robustness", "Unknown")
                used_space = volume_status.get("actualSize", "Unknown")
                reserved_space = volume_spec.get("size", "Unknown")
                if "healthy" not in robustness:
                    error_present = True

        pod_status = pod_phase_by_claim.get((namespace, pvc_name), "Not Bound")
        pvc_info.append([namespace, pvc_name, status, storage, used_space, reserved_space, robustness, pod_status])

    headers = ["Namespace", "PVC Name", "Status", "Requested Storage", "Used Space", "Reserved Space", "Robustness", "Pod Status"]
    print("\n\nOverview of PVCs\n")
    print(tabulate(pvc_info, headers=headers, tablefmt="pretty"))
    return error_present
def do_check(args):
    """
    Run the complete health check: nodes, deployments and PVCs, in that order.

    All three checks are always executed so that every overview is printed,
    even when an earlier check already found a problem.
    :param args: parsed command line arguments, passed on to every check
    :return: 1 if any check reported a problem, otherwise 0
    """
    # A list (not a generator) so that each check runs regardless of the others.
    results = [get_nodes(args), get_deployments(args), get_pvc_status(args)]
    if not any(results):
        return 0
    print("\nThe cluster is not healthy. Errors have been found!!!!!!!!")
    return 1
def main():
    """
    Parse the command line and run the health check.

    :return: 1 if --kubeconfig is missing, otherwise the result of do_check()
    """
    cli = argparse.ArgumentParser(description="Check kubernetes cluster.")
    cli.add_argument(
        "-k",
        "--kubeconfig",
        help="path to the kubeconfig file. Required",
    )
    cli.add_argument(
        "--version",
        action="version",
        version="%(prog)s 0.0.1, November 21, 2024",
    )
    args = cli.parse_args()

    if args.kubeconfig:
        return do_check(args)

    # The option is checked by hand so a missing kubeconfig returns 1.
    print("ERROR: The option --kubeconfig is mandatory. Exiting script")
    return 1
if __name__ == "__main__":
    # Use the result of main() as the exit status of the script.
    sys.exit(main())