From 705e3aabb59f6c6acb6b639da1ee3a92c4f6304f Mon Sep 17 00:00:00 2001
From: Konstantin Shalygin
Date: Wed, 29 Nov 2023 15:42:51 +0300
Subject: [PATCH] ceph_ec_profile: add support for defining the failure domain

For large EC deployments, for example EC 8+3:
* we need 12 racks
* we have 10 hosts per rack, with 60 OSDs per host
* we definitely need a `rack` failure domain

Setting `crush-failure-domain=rack` in the profile command creates a
CRUSH rule that ensures no two chunks are stored in the same rack.

Signed-off-by: Konstantin Shalygin
---
 library/ceph_ec_profile.py                | 13 ++++++++++++-
 roles/ceph-rgw/tasks/rgw_create_pools.yml |  1 +
 tests/library/test_ceph_ec_profile.py     | 22 +++++++++++++---------
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/library/ceph_ec_profile.py b/library/ceph_ec_profile.py
index 1ac5df38dc..5e35740a31 100644
--- a/library/ceph_ec_profile.py
+++ b/library/ceph_ec_profile.py
@@ -76,6 +76,10 @@
             - Compute coding chunks for each object and store them on
               different OSDs.
         required: true
+    crush_failure_domain:
+        description:
+            - The CRUSH failure domain for data durability (host/rack)
+        required: false
     crush_device_class:
         description:
             - Restrict placement to devices of a specific class (hdd/ssd)
@@ -116,7 +120,7 @@ def get_profile(module, name, cluster='ceph', container_image=None):
     return cmd
 
 
-def create_profile(module, name, k, m, stripe_unit, crush_device_class, cluster='ceph', force=False, container_image=None):  # noqa: E501
+def create_profile(module, name, k, m, stripe_unit, crush_failure_domain, crush_device_class, cluster='ceph', force=False, container_image=None):  # noqa: E501
     '''
     Create a profile
     '''
@@ -124,6 +128,8 @@ def create_profile(module, name, k, m, stripe_unit, crush_device_class, cluster=
     args = ['set', name, 'k={}'.format(k), 'm={}'.format(m)]
     if stripe_unit:
         args.append('stripe_unit={}'.format(stripe_unit))
+    if crush_failure_domain:
+        args.append('crush-failure-domain={}'.format(crush_failure_domain))
     if crush_device_class:
         args.append('crush-device-class={}'.format(crush_device_class))
     if force:
@@ -161,6 +167,7 @@ def run_module():
         stripe_unit=dict(type='str', required=False),
         k=dict(type='str', required=False),
         m=dict(type='str', required=False),
+        crush_failure_domain=dict(type='str', required=False, default=''),
         crush_device_class=dict(type='str', required=False, default=''),
     )
 
@@ -177,6 +184,7 @@ def run_module():
     stripe_unit = module.params.get('stripe_unit')
     k = module.params.get('k')
     m = module.params.get('m')
+    crush_failure_domain = module.params.get('crush_failure_domain')
     crush_device_class = module.params.get('crush_device_class')
 
     if module.check_mode:
@@ -205,6 +213,7 @@ def run_module():
             if current_profile['k'] != k or \
                current_profile['m'] != m or \
                current_profile.get('stripe_unit', stripe_unit) != stripe_unit or \
+               current_profile.get('crush-failure-domain', crush_failure_domain) != crush_failure_domain or \
                current_profile.get('crush-device-class', crush_device_class) != crush_device_class:  # noqa: E501
                 rc, cmd, out, err = exec_command(module,
                                                  create_profile(module,
@@ -212,6 +221,7 @@ def run_module():
                                                                 k,
                                                                 m,
                                                                 stripe_unit,
+                                                                crush_failure_domain,  # noqa: E501
                                                                 crush_device_class,  # noqa: E501
                                                                 cluster,
                                                                 force=True, container_image=container_image))  # noqa: E501
@@ -223,6 +233,7 @@ def run_module():
                                                             k,
                                                             m,
                                                             stripe_unit,  # noqa: E501
+                                                            crush_failure_domain,  # noqa: E501
                                                             crush_device_class,  # noqa: E501
                                                             cluster,
                                                             container_image=container_image))  # noqa: E501
diff --git a/roles/ceph-rgw/tasks/rgw_create_pools.yml b/roles/ceph-rgw/tasks/rgw_create_pools.yml
index bbdac1b4c7..4758a60bcb 100644
--- a/roles/ceph-rgw/tasks/rgw_create_pools.yml
+++ b/roles/ceph-rgw/tasks/rgw_create_pools.yml
@@ -5,6 +5,7 @@
     cluster: "{{ cluster }}"
     k: "{{ item.value.ec_k }}"
     m: "{{ item.value.ec_m }}"
+    crush_failure_domain: "{{ item.value.crush_failure_domain | default(omit) }}"
     crush_device_class: "{{ item.value.ec_crush_device_class | default(omit) }}"
   delegate_to: "{{ groups[mon_group_name][0] }}"
   loop: "{{ rgw_create_pools | dict2items }}"
diff --git a/tests/library/test_ceph_ec_profile.py b/tests/library/test_ceph_ec_profile.py
index 955148f572..f6219364a5 100644
--- a/tests/library/test_ceph_ec_profile.py
+++ b/tests/library/test_ceph_ec_profile.py
@@ -28,15 +28,15 @@ def test_get_profile(self):
         assert ceph_ec_profile.get_profile(self.fake_module,
                                            self.fake_name) == expected_cmd
 
-    @pytest.mark.parametrize("stripe_unit,crush_device_class,force", [(False, None, False),
-                                                                      (32, None, True),
-                                                                      (False, None, True),
-                                                                      (32, None, False),
-                                                                      (False, 'hdd', False),
-                                                                      (32, 'ssd', True),
-                                                                      (False, 'nvme', True),
-                                                                      (32, 'hdd', False)])
-    def test_create_profile(self, stripe_unit, crush_device_class, force):
+    @pytest.mark.parametrize("stripe_unit,crush_failure_domain,crush_device_class,force", [(False, None, None, False),
+                                                                                           (32, None, None, True),
+                                                                                           (False, None, None, True),
+                                                                                           (32, None, None, False),
+                                                                                           (False, 'host', 'hdd', False),
+                                                                                           (32, 'host', 'ssd', True),
+                                                                                           (False, 'host', 'nvme', True),
+                                                                                           (32, 'host', 'hdd', False)])
+    def test_create_profile(self, stripe_unit, crush_failure_domain, crush_device_class, force):
         expected_cmd = [
             self.fake_binary,
             '-n', 'client.admin',
@@ -48,6 +48,8 @@ def test_create_profile(self, stripe_unit, crush_device_class, force):
         ]
         if stripe_unit:
             expected_cmd.append('stripe_unit={}'.format(stripe_unit))
+        if crush_failure_domain:
+            expected_cmd.append('crush-failure-domain={}'.format(crush_failure_domain))
         if crush_device_class:
             expected_cmd.append('crush-device-class={}'.format(crush_device_class))
         if force:
@@ -58,6 +60,7 @@ def test_create_profile(self, stripe_unit, crush_device_class, force):
                                               self.fake_k,
                                               self.fake_m,
                                               stripe_unit,
+                                              crush_failure_domain,
                                               crush_device_class,
                                               self.fake_cluster,
                                               force) == expected_cmd
@@ -85,6 +88,7 @@ def test_state_present_nothing_to_update(self, m_exec_command, m_exit_json, m_fa
             "k": 2,
             "m": 4,
             "stripe_unit": 32,
+            "crush_failure_domain": "host",
         })
         m_exit_json.side_effect = ca_test_common.exit_json
         m_fail_json.side_effect = ca_test_common.fail_json
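
A minimal sketch of how the new key could be supplied through rgw_create_pools,
matching the `item.value.crush_failure_domain | default(omit)` lookup added to
rgw_create_pools.yml above. The pool name, profile name and the k/m/device-class
values below are illustrative assumptions, not part of this patch; `type` and
`ec_profile` are assumed to be the usual EC pool keys of rgw_create_pools:

    rgw_create_pools:
      default.rgw.buckets.data:       # hypothetical pool name
        type: ec                      # assumed EC pool type key
        ec_profile: ec-8-3-rack       # hypothetical profile name
        ec_k: 8
        ec_m: 3
        crush_failure_domain: rack    # key introduced by this patch
        ec_crush_device_class: hdd    # optional, behaviour unchanged

With such an entry, create_profile() above would append k=8, m=3,
crush-failure-domain=rack and crush-device-class=hdd to the erasure-code
profile arguments.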