diff options
-rw-r--r-- | roles/openshift_health_checker/openshift_checks/etcd_volume.py | 58 | ||||
-rw-r--r-- | roles/openshift_health_checker/test/etcd_volume_test.py | 149 |
2 files changed, 207 insertions, 0 deletions
diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py new file mode 100644 index 000000000..7452c9cc1 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -0,0 +1,58 @@ +"""A health check for OpenShift clusters.""" + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var + + +class EtcdVolume(OpenShiftCheck): + """Ensures etcd storage usage does not exceed a given threshold.""" + + name = "etcd_volume" + tags = ["etcd", "health"] + + # Default device usage threshold. Value should be in the range [0, 100]. + default_threshold_percent = 90 + # Where to find ectd data, higher priority first. + supported_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"] + + @classmethod + def is_active(cls, task_vars): + etcd_hosts = get_var(task_vars, "groups", "etcd", default=[]) or get_var(task_vars, "groups", "masters", + default=[]) or [] + is_etcd_host = get_var(task_vars, "ansible_ssh_host") in etcd_hosts + return super(EtcdVolume, cls).is_active(task_vars) and is_etcd_host + + def run(self, tmp, task_vars): + mount_info = self._etcd_mount_info(task_vars) + available = mount_info["size_available"] + total = mount_info["size_total"] + used = total - available + + threshold = get_var( + task_vars, + "etcd_device_usage_threshold_percent", + default=self.default_threshold_percent + ) + + used_percent = 100.0 * used / total + + if used_percent > threshold: + device = mount_info.get("device", "unknown") + mount = mount_info.get("mount", "unknown") + msg = "etcd storage usage ({:.1f}%) is above threshold ({:.1f}%). Device: {}, mount: {}.".format( + used_percent, threshold, device, mount + ) + return {"failed": True, "msg": msg} + + return {"changed": False} + + def _etcd_mount_info(self, task_vars): + ansible_mounts = get_var(task_vars, "ansible_mounts") + mounts = {mnt.get("mount"): mnt for mnt in ansible_mounts} + + for path in self.supported_mount_paths: + if path in mounts: + return mounts[path] + + paths = ', '.join(sorted(mounts)) or 'none' + msg = "Unable to find etcd storage mount point. Paths mounted: {}.".format(paths) + raise OpenShiftCheckException(msg) diff --git a/roles/openshift_health_checker/test/etcd_volume_test.py b/roles/openshift_health_checker/test/etcd_volume_test.py new file mode 100644 index 000000000..917045526 --- /dev/null +++ b/roles/openshift_health_checker/test/etcd_volume_test.py @@ -0,0 +1,149 @@ +import pytest + +from openshift_checks.etcd_volume import EtcdVolume, OpenShiftCheckException + + +@pytest.mark.parametrize('ansible_mounts,extra_words', [ + ([], ['none']), # empty ansible_mounts + ([{'mount': '/mnt'}], ['/mnt']), # missing relevant mount paths +]) +def test_cannot_determine_available_disk(ansible_mounts, extra_words): + task_vars = dict( + ansible_mounts=ansible_mounts, + ) + check = EtcdVolume(execute_module=fake_execute_module) + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.run(tmp=None, task_vars=task_vars) + + for word in 'Unable to find etcd storage mount point'.split() + extra_words: + assert word in str(excinfo.value) + + +@pytest.mark.parametrize('size_limit,ansible_mounts', [ + ( + # if no size limit is specified, expect max usage + # limit to default to 90% of size_total + None, + [{ + 'mount': '/', + 'size_available': 40 * 10**9, + 'size_total': 80 * 10**9 + }], + ), + ( + 1, + [{ + 'mount': '/', + 'size_available': 30 * 10**9, + 'size_total': 30 * 10**9, + }], + ), + ( + 20000000000, + [{ + 'mount': '/', + 'size_available': 20 * 10**9, + 'size_total': 40 * 10**9, + }], + ), + ( + 5000000000, + [{ + # not enough space on / ... + 'mount': '/', + 'size_available': 0, + 'size_total': 0, + }, { + # not enough space on /var/lib ... + 'mount': '/var/lib', + 'size_available': 2 * 10**9, + 'size_total': 21 * 10**9, + }, { + # ... but enough on /var/lib/etcd + 'mount': '/var/lib/etcd', + 'size_available': 36 * 10**9, + 'size_total': 40 * 10**9 + }], + ) +]) +def test_succeeds_with_recommended_disk_space(size_limit, ansible_mounts): + task_vars = dict( + etcd_device_usage_threshold_percent=size_limit, + ansible_mounts=ansible_mounts, + ) + + if task_vars["etcd_device_usage_threshold_percent"] is None: + task_vars.pop("etcd_device_usage_threshold_percent") + + check = EtcdVolume(execute_module=fake_execute_module) + result = check.run(tmp=None, task_vars=task_vars) + + assert not result.get('failed', False) + + +@pytest.mark.parametrize('size_limit_percent,ansible_mounts,extra_words', [ + ( + # if no size limit is specified, expect max usage + # limit to default to 90% of size_total + None, + [{ + 'mount': '/', + 'size_available': 1 * 10**9, + 'size_total': 100 * 10**9, + }], + ['99.0%'], + ), + ( + 70.0, + [{ + 'mount': '/', + 'size_available': 1 * 10**6, + 'size_total': 5 * 10**9, + }], + ['100.0%'], + ), + ( + 40.0, + [{ + 'mount': '/', + 'size_available': 2 * 10**9, + 'size_total': 6 * 10**9, + }], + ['66.7%'], + ), + ( + None, + [{ + # enough space on /var ... + 'mount': '/var', + 'size_available': 20 * 10**9, + 'size_total': 20 * 10**9, + }, { + # .. but not enough on /var/lib + 'mount': '/var/lib', + 'size_available': 1 * 10**9, + 'size_total': 20 * 10**9, + }], + ['95.0%'], + ), +]) +def test_fails_with_insufficient_disk_space(size_limit_percent, ansible_mounts, extra_words): + task_vars = dict( + etcd_device_usage_threshold_percent=size_limit_percent, + ansible_mounts=ansible_mounts, + ) + + if task_vars["etcd_device_usage_threshold_percent"] is None: + task_vars.pop("etcd_device_usage_threshold_percent") + + check = EtcdVolume(execute_module=fake_execute_module) + result = check.run(tmp=None, task_vars=task_vars) + + assert result['failed'] + for word in extra_words: + assert word in result['msg'] + + +def fake_execute_module(*args): + raise AssertionError('this function should not be called') |