From 9698f76b641b7cc5964d2f4f318ea71702aa2245 Mon Sep 17 00:00:00 2001 From: Luke Meyer Date: Wed, 20 Sep 2017 21:35:57 -0400 Subject: health checks: add diagnostics check Also, moved is_first_master method into superclass for reuse. And look at oo_first_master and ansible_host instead of masters and ansible_ssh_host. --- roles/openshift_health_checker/library/ocutil.py | 11 ++-- .../openshift_checks/__init__.py | 17 ++++++ .../openshift_checks/diagnostics.py | 62 ++++++++++++++++++++++ .../openshift_checks/etcd_volume.py | 2 +- .../openshift_checks/logging/logging.py | 8 --- .../test/diagnostics_test.py | 50 +++++++++++++++++ .../test/logging_check_test.py | 8 ++- 7 files changed, 138 insertions(+), 20 deletions(-) create mode 100644 roles/openshift_health_checker/openshift_checks/diagnostics.py create mode 100644 roles/openshift_health_checker/test/diagnostics_test.py diff --git a/roles/openshift_health_checker/library/ocutil.py b/roles/openshift_health_checker/library/ocutil.py index 2e60735d6..c72f4c5b3 100644 --- a/roles/openshift_health_checker/library/ocutil.py +++ b/roles/openshift_health_checker/library/ocutil.py @@ -40,18 +40,17 @@ def main(): module = AnsibleModule( argument_spec=dict( - namespace=dict(type="str", required=True), + namespace=dict(type="str", required=False), config_file=dict(type="str", required=True), cmd=dict(type="str", required=True), extra_args=dict(type="list", default=[]), ), ) - cmd = [ - locate_oc_binary(), - '--config', module.params["config_file"], - '-n', module.params["namespace"], - ] + shlex.split(module.params["cmd"]) + cmd = [locate_oc_binary(), '--config', module.params["config_file"]] + if module.params["namespace"]: + cmd += ['-n', module.params["namespace"]] + cmd += shlex.split(module.params["cmd"]) + module.params["extra_args"] failed = True try: diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py index 28cb53cc5..ce05b44a4 100644 --- a/roles/openshift_health_checker/openshift_checks/__init__.py +++ b/roles/openshift_health_checker/openshift_checks/__init__.py @@ -13,6 +13,7 @@ from importlib import import_module from ansible.module_utils import six from ansible.module_utils.six.moves import reduce # pylint: disable=import-error,redefined-builtin +from ansible.module_utils.six import string_types from ansible.plugins.filter.core import to_bool as ansible_to_bool @@ -110,6 +111,11 @@ class OpenShiftCheck(object): """Returns true if this check applies to the ansible-playbook run.""" return True + def is_first_master(self): + """Determine if running on first master. Returns: bool""" + masters = self.get_var("groups", "oo_first_master", default=None) or [None] + return masters[0] == self.get_var("ansible_host") + @abstractmethod def run(self): """Executes a check against a host and returns a result hash similar to Ansible modules. @@ -282,6 +288,17 @@ class OpenShiftCheck(object): error=error )) + @staticmethod + def normalize(name_list): + """Return a clean list of names. + + The input may be a comma-separated string or a sequence. Leading and + trailing whitespace characters are removed. Empty items are discarded. + """ + if isinstance(name_list, string_types): + name_list = name_list.split(',') + return [name.strip() for name in name_list if name.strip()] + @staticmethod def get_major_minor_version(openshift_image_tag): """Parse and return the deployed version of OpenShift as a tuple.""" diff --git a/roles/openshift_health_checker/openshift_checks/diagnostics.py b/roles/openshift_health_checker/openshift_checks/diagnostics.py new file mode 100644 index 000000000..1cfdc1129 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/diagnostics.py @@ -0,0 +1,62 @@ +""" +A check to run relevant diagnostics via `oc adm diagnostics`. +""" + +import os + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException + + +DIAGNOSTIC_LIST = ( + "AggregatedLogging ClusterRegistry ClusterRoleBindings ClusterRoles " + "ClusterRouter DiagnosticPod NetworkCheck" +).split() + + +class DiagnosticCheck(OpenShiftCheck): + """A check to run relevant diagnostics via `oc adm diagnostics`.""" + + name = "diagnostics" + tags = ["health"] + + def is_active(self): + return super(DiagnosticCheck, self).is_active() and self.is_first_master() + + def run(self): + if self.exec_diagnostic("ConfigContexts"): + # only run the other diagnostics if that one succeeds (otherwise, all will fail) + diagnostics = self.get_var("openshift_check_diagnostics", default=DIAGNOSTIC_LIST) + for diagnostic in self.normalize(diagnostics): + self.exec_diagnostic(diagnostic) + return {} + + def exec_diagnostic(self, diagnostic): + """ + Execute an 'oc adm diagnostics' command on the remote host. + Raises OcNotFound or registers OcDiagFailed. + Returns True on success or False on failure (non-zero rc). + """ + config_base = self.get_var("openshift.common.config_base") + args = { + "config_file": os.path.join(config_base, "master", "admin.kubeconfig"), + "cmd": "adm diagnostics", + "extra_args": [diagnostic], + } + + result = self.execute_module("ocutil", args, save_as_name=diagnostic + ".failure.json") + self.register_file(diagnostic + ".txt", result['result']) + if result.get("failed"): + if result['result'] == '[Errno 2] No such file or directory': + raise OpenShiftCheckException( + "OcNotFound", + "This host is supposed to be a master but does not have the `oc` command where expected.\n" + "Has an installation been run on this host yet?" + ) + + self.register_failure(OpenShiftCheckException( + 'OcDiagFailed', + 'The {diag} diagnostic reported an error:\n' + '{error}'.format(diag=diagnostic, error=result['result']) + )) + return False + return True diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py index e5d93ff3f..79955cb2f 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -16,7 +16,7 @@ class EtcdVolume(OpenShiftCheck): def is_active(self): etcd_hosts = self.get_var("groups", "etcd", default=[]) or self.get_var("groups", "masters", default=[]) or [] - is_etcd_host = self.get_var("ansible_ssh_host") in etcd_hosts + is_etcd_host = self.get_var("ansible_host") in etcd_hosts return super(EtcdVolume, self).is_active() and is_etcd_host def run(self): diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py index 06bdfebf6..05ba73ca1 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/logging.py +++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py @@ -30,14 +30,6 @@ class LoggingCheck(OpenShiftCheck): logging_deployed = self.get_var("openshift_hosted_logging_deploy", convert=bool, default=False) return logging_deployed and super(LoggingCheck, self).is_active() and self.is_first_master() - def is_first_master(self): - """Determine if running on first master. Returns: bool""" - # Note: It would be nice to use membership in oo_first_master group, however for now it - # seems best to avoid requiring that setup and just check this is the first master. - hostname = self.get_var("ansible_ssh_host") or [None] - masters = self.get_var("groups", "masters", default=None) or [None] - return masters[0] == hostname - def run(self): return {} diff --git a/roles/openshift_health_checker/test/diagnostics_test.py b/roles/openshift_health_checker/test/diagnostics_test.py new file mode 100644 index 000000000..800889fa7 --- /dev/null +++ b/roles/openshift_health_checker/test/diagnostics_test.py @@ -0,0 +1,50 @@ +import pytest + +from openshift_checks.diagnostics import DiagnosticCheck, OpenShiftCheckException + + +@pytest.fixture() +def task_vars(): + return dict( + openshift=dict( + common=dict(config_base="/etc/origin/") + ) + ) + + +def test_module_succeeds(task_vars): + check = DiagnosticCheck(lambda *_: {"result": "success"}, task_vars) + check.is_first_master = lambda: True + assert check.is_active() + check.exec_diagnostic("spam") + assert not check.failures + + +def test_oc_not_there(task_vars): + def exec_module(*_): + return {"failed": True, "result": "[Errno 2] No such file or directory"} + + check = DiagnosticCheck(exec_module, task_vars) + with pytest.raises(OpenShiftCheckException) as excinfo: + check.exec_diagnostic("spam") + assert excinfo.value.name == "OcNotFound" + + +def test_module_fails(task_vars): + def exec_module(*_): + return {"failed": True, "result": "something broke"} + + check = DiagnosticCheck(exec_module, task_vars) + check.exec_diagnostic("spam") + assert check.failures and check.failures[0].name == "OcDiagFailed" + + +def test_names_executed(task_vars): + task_vars["openshift_check_diagnostics"] = diagnostics = "ConfigContexts,spam,,eggs" + + def exec_module(module, args, *_): + assert "extra_args" in args + assert args["extra_args"][0] in diagnostics + return {"result": "success"} + + DiagnosticCheck(exec_module, task_vars).run() diff --git a/roles/openshift_health_checker/test/logging_check_test.py b/roles/openshift_health_checker/test/logging_check_test.py index 1a1c190f6..59c703214 100644 --- a/roles/openshift_health_checker/test/logging_check_test.py +++ b/roles/openshift_health_checker/test/logging_check_test.py @@ -98,21 +98,19 @@ def test_oc_failure(problem, expect): assert expect in str(excinfo) -groups_with_first_master = dict(masters=['this-host', 'other-host']) -groups_with_second_master = dict(masters=['other-host', 'this-host']) -groups_not_a_master = dict(masters=['other-host']) +groups_with_first_master = dict(oo_first_master=['this-host']) +groups_not_a_master = dict(oo_first_master=['other-host'], oo_masters=['other-host']) @pytest.mark.parametrize('groups, logging_deployed, is_active', [ (groups_with_first_master, True, True), (groups_with_first_master, False, False), (groups_not_a_master, True, False), - (groups_with_second_master, True, False), (groups_not_a_master, True, False), ]) def test_is_active(groups, logging_deployed, is_active): task_vars = dict( - ansible_ssh_host='this-host', + ansible_host='this-host', groups=groups, openshift_hosted_logging_deploy=logging_deployed, ) -- cgit v1.2.3