From bfe540b6591977c33f5938e225a173e7a68a8dbf Mon Sep 17 00:00:00 2001
From: SteBaum
Date: Mon, 12 Aug 2024 10:52:15 +0200
Subject: [PATCH] feat: node decommissioning

---
 playbooks/decommission/README.md              | 15 ++++++
 .../hdfs_namenode_decomm_datanode.yml         | 14 ++++++
 ...arn_resourcemanager_decomm_nodemanager.yml | 14 ++++++
 .../decommission/hadoop-decommission.yml      | 12 +++++
 playbooks/decommission/hbase-decommission.yml |  8 ++++
 roles/hdfs/namenode/tasks/decommission.yml    | 47 +++++++++++++++++++
 .../resourcemanager/tasks/decommission.yml    | 47 +++++++++++++++++++
 7 files changed, 157 insertions(+)
 create mode 100644 playbooks/decommission/README.md
 create mode 100644 playbooks/decommission/hadoop-components-decommission/hdfs_namenode_decomm_datanode.yml
 create mode 100644 playbooks/decommission/hadoop-components-decommission/yarn_resourcemanager_decomm_nodemanager.yml
 create mode 100644 playbooks/decommission/hadoop-decommission.yml
 create mode 100644 playbooks/decommission/hbase-decommission.yml
 create mode 100644 roles/hdfs/namenode/tasks/decommission.yml
 create mode 100644 roles/yarn/resourcemanager/tasks/decommission.yml

diff --git a/playbooks/decommission/README.md b/playbooks/decommission/README.md
new file mode 100644
index 00000000..85bd049f
--- /dev/null
+++ b/playbooks/decommission/README.md
@@ -0,0 +1,15 @@
+# Node decommissioning
+
+- First decommission the HBase regionserver of the node, for example:
+
+  ```sh
+  ansible-playbook tdp/playbooks/decommission/hbase-decommission.yml -l worker-01
+  ```
+
+- If you want to update and refresh the queue of the Yarn capacity scheduler, change the variables in the Yarn variables file. Then execute the playbook hadoop-decommission which will effectively update and refresh the queue of the yarn capacity scheduler, decommission the Yarn nodemanager and decommission the HDFS datanode by adding its FQDN to the variable `excluded_node_fqdn`, for example:
+
+  ```sh
+  ansible-playbook tdp/playbooks/decommission/hadoop-decommission.yml -e "excluded_node_fqdn=worker-01.novalocal"
+  ```
+
+*NB*: the decommissioning of the HDFS datanode can take several hours depending on the size of the file system.
diff --git a/playbooks/decommission/hadoop-components-decommission/hdfs_namenode_decomm_datanode.yml b/playbooks/decommission/hadoop-components-decommission/hdfs_namenode_decomm_datanode.yml
new file mode 100644
index 00000000..2fc62f1a
--- /dev/null
+++ b/playbooks/decommission/hadoop-components-decommission/hdfs_namenode_decomm_datanode.yml
@@ -0,0 +1,14 @@
+# Copyright 2022 TOSIT.IO
+# SPDX-License-Identifier: Apache-2.0
+
+---
+- name: Hadoop HDFS datanode Decommission
+  hosts: hdfs_nn
+  tasks:
+    - tosit.tdp.resolve:  # noqa unnamed-task
+        node_name: hdfs_namenode
+    - name: Decommission HDFS datanode
+      ansible.builtin.import_role:
+        name: tosit.tdp.hdfs.namenode
+        tasks_from: decommission
+    - ansible.builtin.meta: clear_facts  # noqa unnamed-task
diff --git a/playbooks/decommission/hadoop-components-decommission/yarn_resourcemanager_decomm_nodemanager.yml b/playbooks/decommission/hadoop-components-decommission/yarn_resourcemanager_decomm_nodemanager.yml
new file mode 100644
index 00000000..466ddc7a
--- /dev/null
+++ b/playbooks/decommission/hadoop-components-decommission/yarn_resourcemanager_decomm_nodemanager.yml
@@ -0,0 +1,14 @@
+# Copyright 2022 TOSIT.IO
+# SPDX-License-Identifier: Apache-2.0
+
+---
+- name: Hadoop Yarn resourcemanager decommission
+  hosts: yarn_rm
+  tasks:
+    - tosit.tdp.resolve:  # noqa unnamed-task
+        node_name: yarn_resourcemanager
+    - name: Decommission YARN NM
+      ansible.builtin.import_role:
+        name: tosit.tdp.yarn.resourcemanager
+        tasks_from: decommission
+    - ansible.builtin.meta: clear_facts  # noqa unnamed-task
diff --git a/playbooks/decommission/hadoop-decommission.yml b/playbooks/decommission/hadoop-decommission.yml
new file mode 100644
index 00000000..73813182
--- /dev/null
+++ b/playbooks/decommission/hadoop-decommission.yml
@@ -0,0 +1,12 @@
+# Copyright 2022 TOSIT.IO
+# SPDX-License-Identifier: Apache-2.0
+
+---
+# Pass the FQDN of the node to decommission as a variable, for example:
+# ansible-playbook tdp/playbooks/decommission/hadoop-decommission.yml -e "excluded_node_fqdn=worker-01.novalocal"
+- ansible.builtin.import_playbook: ../utils/yarn_capacity_scheduler.yml
+# Update and refresh the Yarn capacity scheduler queues
+- ansible.builtin.import_playbook: hadoop-components-decommission/yarn_resourcemanager_decomm_nodemanager.yml
+# Decommission Yarn nodemanager
+- ansible.builtin.import_playbook: hadoop-components-decommission/hdfs_namenode_decomm_datanode.yml
+# Decommission HDFS datanode
diff --git a/playbooks/decommission/hbase-decommission.yml b/playbooks/decommission/hbase-decommission.yml
new file mode 100644
index 00000000..c73a3c88
--- /dev/null
+++ b/playbooks/decommission/hbase-decommission.yml
@@ -0,0 +1,8 @@
+# Copyright 2022 TOSIT.IO
+# SPDX-License-Identifier: Apache-2.0
+
+---
+# Limit the run to the host of the regionserver to decommission, for example:
+# ansible-playbook tdp/playbooks/decommission/hbase-decommission.yml -l worker-01
+- ansible.builtin.import_playbook: ../hbase_regionserver_stop.yml
+# Decommission HBase regionserver
diff --git a/roles/hdfs/namenode/tasks/decommission.yml b/roles/hdfs/namenode/tasks/decommission.yml
new file mode 100644
index 00000000..5dda1178
--- /dev/null
+++ b/roles/hdfs/namenode/tasks/decommission.yml
@@ -0,0 +1,47 @@
+# Copyright 2022 TOSIT.IO
+# SPDX-License-Identifier: Apache-2.0
+
+---
+# Decommission the datanode whose FQDN is given in `excluded_node_fqdn`.
+- name: Check that a node to decommission was given
+  ansible.builtin.assert:
+    that:
+      - excluded_node_fqdn is defined
+      - excluded_node_fqdn | length > 0
+    fail_msg: 'Pass the datanode FQDN with -e "excluded_node_fqdn=<fqdn>"'
+
+# NOTE(review): this template is rendered on every run, before the node is
+# appended below - confirm dfs.exclude.j2 keeps nodes excluded by earlier runs.
+- name: Render dfs.exclude file
+  ansible.builtin.template:
+    src: dfs.exclude.j2
+    dest: "{{ hdfs_site['dfs.hosts.exclude'] }}"
+    owner: root
+    group: root
+    mode: "0644"
+
+# Edit the file configured as dfs.hosts.exclude rather than a hard-coded path.
+- name: Update exclude nodes file
+  ansible.builtin.lineinfile:
+    path: "{{ hdfs_site['dfs.hosts.exclude'] }}"
+    line: "{{ excluded_node_fqdn }}"
+    state: present
+
+- name: kinit hdfs NN
+  ansible.builtin.command: kinit -kt /etc/security/keytabs/nn.service.keytab nn/{{ ansible_fqdn }}@{{ realm }}
+  become_user: hdfs
+
+- name: RefreshNodes
+  ansible.builtin.command: /usr/bin/hdfs dfsadmin -refreshNodes
+  become_user: hdfs
+
+# Read-only status query, so never report it as a change.
+- name: Check node status
+  ansible.builtin.command: hdfs dfsadmin -report -decommissioning
+  register: hdfs_output
+  become_user: hdfs
+  changed_when: false
+
+- name: Print output of node status
+  ansible.builtin.debug:
+    var: hdfs_output.stdout
diff --git a/roles/yarn/resourcemanager/tasks/decommission.yml b/roles/yarn/resourcemanager/tasks/decommission.yml
new file mode 100644
index 00000000..ad7d0f89
--- /dev/null
+++ b/roles/yarn/resourcemanager/tasks/decommission.yml
@@ -0,0 +1,47 @@
+# Copyright 2022 TOSIT.IO
+# SPDX-License-Identifier: Apache-2.0
+
+---
+# Decommission the nodemanager whose FQDN is given in `excluded_node_fqdn`.
+- name: Check that a node to decommission was given
+  ansible.builtin.assert:
+    that:
+      - excluded_node_fqdn is defined
+      - excluded_node_fqdn | length > 0
+    fail_msg: 'Pass the nodemanager FQDN with -e "excluded_node_fqdn=<fqdn>"'
+
+# NOTE(review): this template is rendered on every run, before the node is
+# appended below - confirm yarn.exclude.j2 keeps nodes excluded by earlier runs.
+- name: Render yarn.exclude file
+  ansible.builtin.template:
+    src: yarn.exclude.j2
+    dest: "{{ yarn_site['yarn.resourcemanager.nodes.exclude-path'] }}"
+    owner: root
+    group: root
+    mode: "0644"
+
+# Edit the file configured as the exclude-path rather than a hard-coded path.
+- name: Update exclude nodes file
+  ansible.builtin.lineinfile:
+    path: "{{ yarn_site['yarn.resourcemanager.nodes.exclude-path'] }}"
+    line: "{{ excluded_node_fqdn }}"
+    state: present
+
+- name: kinit yarn RM
+  ansible.builtin.command: kinit -kt /etc/security/keytabs/rm.service.keytab rm/{{ ansible_fqdn }}@{{ realm }}
+  become_user: yarn
+
+- name: RefreshNodes
+  ansible.builtin.command: /usr/bin/yarn rmadmin -refreshNodes
+  become_user: yarn
+
+# Read-only status query, so never report it as a change.
+- name: Check node status
+  ansible.builtin.command: yarn node -list -all
+  register: yarn_output
+  become_user: yarn
+  changed_when: false
+
+- name: Print output of node status
+  ansible.builtin.debug:
+    var: yarn_output.stdout