Diffstat (limited to 'roles')
22 files changed, 730 insertions, 643 deletions
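Editor's note: the first file changed below reworks how the health-check action plugin aggregates per-check results — each check now records host modifications on a `changed` attribute, and the plugin derives the summary "changed"/"failed" flags after all checks have run rather than inside the loop. A minimal sketch of that aggregation logic (the `summarize` helper name and the simplified result dicts are illustrative, not part of the plugin):

    def summarize(check_results):
        # check_results: {check_name: result_dict}, as built by the action plugin loop
        result = {"checks": check_results}
        # "changed" is true if any single check reported a change on the host
        result["changed"] = any(r.get("changed") for r in check_results.values())
        # "failed" is true if any single check failed; the summary message stays generic
        if any(r.get("failed") for r in check_results.values()):
            result["failed"] = True
            result["msg"] = "One or more checks failed"
        return result

    # e.g. summarize({"docker_storage": {"failed": True}, "etcd_volume": {}})
    # -> {"checks": {...}, "changed": False, "failed": True, "msg": "One or more checks failed"}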
diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py
index 23da53940..05e53333d 100644
--- a/roles/openshift_health_checker/action_plugins/openshift_health_check.py
+++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py
@@ -68,13 +68,15 @@ class ActionModule(ActionBase):
                         msg=str(e),
                     )
 
+            if check.changed:
+                r["changed"] = True
             check_results[check_name] = r
 
-            if r.get("failed", False):
-                result["failed"] = True
-                result["msg"] = "One or more checks failed"
+        result["changed"] = any(r.get("changed") for r in check_results.values())
+        if any(r.get("failed") for r in check_results.values()):
+            result["failed"] = True
+            result["msg"] = "One or more checks failed"
 
-        result["changed"] = any(r.get("changed", False) for r in check_results.values())
         return result
 
     def load_known_checks(self, tmp, task_vars):
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
index 85cbc6224..f26008c9f 100644
--- a/roles/openshift_health_checker/openshift_checks/__init__.py
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -13,8 +13,30 @@ from ansible.module_utils.six.moves import reduce  # pylint: disable=import-error
 
 class OpenShiftCheckException(Exception):
-    """Raised when a check cannot proceed."""
-    pass
+    """Raised when a check encounters a failure condition."""
+
+    def __init__(self, name, msg=None):
+        # msg is for the message the user will see when this is raised.
+        # name is for test code to identify the error without looking at msg text.
+        if msg is None:  # for parameter backward compatibility +            msg = name +            name = self.__class__.__name__ +        self.name = name +        super(OpenShiftCheckException, self).__init__(msg) + + +class OpenShiftCheckExceptionList(OpenShiftCheckException): +    """A container for multiple logging errors that may be detected in one check.""" +    def __init__(self, errors): +        self.errors = errors +        super(OpenShiftCheckExceptionList, self).__init__( +            'OpenShiftCheckExceptionList', +            '\n'.join(str(msg) for msg in errors) +        ) + +    # make iterable +    def __getitem__(self, index): +        return self.errors[index]  @six.add_metaclass(ABCMeta) @@ -35,6 +57,9 @@ class OpenShiftCheck(object):          self.task_vars = task_vars or {}          self.tmp = tmp +        # set to True when the check changes the host, for accurate total "changed" count +        self.changed = False +      @abstractproperty      def name(self):          """The name of this check, usually derived from the class name.""" diff --git a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py index 77180223e..85a922f86 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_image_availability.py +++ b/roles/openshift_health_checker/openshift_checks/docker_image_availability.py @@ -41,11 +41,10 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):          return super(DockerImageAvailability, self).is_active() and has_valid_deployment_type      def run(self): -        msg, failed, changed = self.ensure_dependencies() +        msg, failed = self.ensure_dependencies()          if failed:              return {                  "failed": True, -                "changed": changed,                  "msg": "Some dependencies are required in order to check Docker image availability.\n" + msg              } @@ -54,11 +53,11 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):          # exit early if all images were found locally          if not missing_images: -            return {"changed": changed} +            return {}          registries = self.known_docker_registries()          if not registries: -            return {"failed": True, "msg": "Unable to retrieve any docker registries.", "changed": changed} +            return {"failed": True, "msg": "Unable to retrieve any docker registries."}          available_images = self.available_images(missing_images, registries)          unavailable_images = set(missing_images) - set(available_images) @@ -70,10 +69,9 @@ class DockerImageAvailability(DockerHostMixin, OpenShiftCheck):                      "One or more required Docker images are not available:\n    {}\n"                      "Configured registries: {}"                  ).format(",\n    ".join(sorted(unavailable_images)), ", ".join(registries)), -                "changed": changed,              } -        return {"changed": changed} +        return {}      def required_images(self):          """ diff --git a/roles/openshift_health_checker/openshift_checks/docker_storage.py b/roles/openshift_health_checker/openshift_checks/docker_storage.py index dea15a56e..7ae384bd7 100644 --- a/roles/openshift_health_checker/openshift_checks/docker_storage.py +++ b/roles/openshift_health_checker/openshift_checks/docker_storage.py @@ -43,21 +43,20 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):      ]      def run(self): -    
    msg, failed, changed = self.ensure_dependencies() +        msg, failed = self.ensure_dependencies()          if failed:              return {                  "failed": True, -                "changed": changed,                  "msg": "Some dependencies are required in order to query docker storage on host:\n" + msg              }          # attempt to get the docker info hash from the API          docker_info = self.execute_module("docker_info", {})          if docker_info.get("failed"): -            return {"failed": True, "changed": changed, +            return {"failed": True,                      "msg": "Failed to query Docker API. Is docker running on this host?"}          if not docker_info.get("info"):  # this would be very strange -            return {"failed": True, "changed": changed, +            return {"failed": True,                      "msg": "Docker API query missing info:\n{}".format(json.dumps(docker_info))}          docker_info = docker_info["info"] @@ -68,7 +67,7 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):                  "Detected unsupported Docker storage driver '{driver}'.\n"                  "Supported storage drivers are: {drivers}"              ).format(driver=driver, drivers=', '.join(self.storage_drivers)) -            return {"failed": True, "changed": changed, "msg": msg} +            return {"failed": True, "msg": msg}          # driver status info is a list of tuples; convert to dict and validate based on driver          driver_status = {item[0]: item[1] for item in docker_info.get("DriverStatus", [])} @@ -81,7 +80,6 @@ class DockerStorage(DockerHostMixin, OpenShiftCheck):          if driver in ['overlay', 'overlay2']:              result = self.check_overlay_support(docker_info, driver_status) -        result['changed'] = result.get('changed', False) or changed          return result      def check_devicemapper_support(self, driver_status): diff --git a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py index 28c38504d..ae8460b7e 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_imagedata_size.py @@ -56,7 +56,7 @@ class EtcdImageDataSize(OpenShiftCheck):                      reason = etcdkeysize["module_stderr"]                  msg = msg.format(host=etcd_host, reason=reason) -                return {"failed": True, "changed": False, "msg": msg} +                return {"failed": True, "msg": msg}              if etcdkeysize["size_limit_exceeded"]:                  limit = self._to_gigabytes(etcd_imagedata_size_limit) @@ -65,7 +65,7 @@ class EtcdImageDataSize(OpenShiftCheck):                         "Use the `oadm prune images` command to cleanup unused Docker images.")                  return {"failed": True, "msg": msg.format(host=etcd_host, limit=limit)} -        return {"changed": False} +        return {}      @staticmethod      def _get_etcd_mountpath(ansible_mounts): diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py index da7d0364a..e55d55e91 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -40,7 +40,7 @@ class EtcdVolume(OpenShiftCheck):              )              return {"failed": True, "msg": msg} -        return {"changed": False} +        return {}      
def _etcd_mount_info(self):          ansible_mounts = self.get_var("ansible_mounts") diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py index 32d853d57..b27f97172 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/curator.py +++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py @@ -1,6 +1,6 @@  """Check for an aggregated logging Curator deployment""" -from openshift_checks.logging.logging import LoggingCheck +from openshift_checks.logging.logging import OpenShiftCheckException, LoggingCheck  class Curator(LoggingCheck): @@ -12,27 +12,17 @@ class Curator(LoggingCheck):      def run(self):          """Check various things and gather errors. Returns: result as hash""" -        self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") -        curator_pods, error = self.get_pods_for_component( -            self.logging_namespace, -            "curator", -        ) -        if error: -            return {"failed": True, "changed": False, "msg": error} -        check_error = self.check_curator(curator_pods) - -        if check_error: -            msg = ("The following Curator deployment issue was found:" -                   "{}".format(check_error)) -            return {"failed": True, "changed": False, "msg": msg} - +        curator_pods = self.get_pods_for_component("curator") +        self.check_curator(curator_pods)          # TODO(lmeyer): run it all again for the ops cluster -        return {"failed": False, "changed": False, "msg": 'No problems found with Curator deployment.'} + +        return {}      def check_curator(self, pods):          """Check to see if curator is up and working. Returns: error string"""          if not pods: -            return ( +            raise OpenShiftCheckException( +                "MissingComponentPods",                  "There are no Curator pods for the logging stack,\n"                  "so nothing will prune Elasticsearch indexes.\n"                  "Is Curator correctly deployed?" @@ -40,14 +30,14 @@ class Curator(LoggingCheck):          not_running = self.not_running_pods(pods)          if len(not_running) == len(pods): -            return ( +            raise OpenShiftCheckException( +                "CuratorNotRunning",                  "The Curator pod is not currently in a running state,\n"                  "so Elasticsearch indexes may increase without bound."              )          if len(pods) - len(not_running) > 1: -            return ( +            raise OpenShiftCheckException( +                "TooManyCurators",                  "There is more than one Curator pod running. This should not normally happen.\n"                  "Although this doesn't cause any problems, you may want to investigate."              
) - -        return None diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py index 8bdda1f32..7fc843fd7 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py +++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py @@ -3,6 +3,7 @@  import json  import re +from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList  from openshift_checks.logging.logging import LoggingCheck @@ -15,168 +16,178 @@ class Elasticsearch(LoggingCheck):      def run(self):          """Check various things and gather errors. Returns: result as hash""" -        self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") -        es_pods, error = self.get_pods_for_component( -            self.logging_namespace, -            "es", -        ) -        if error: -            return {"failed": True, "changed": False, "msg": error} -        check_error = self.check_elasticsearch(es_pods) - -        if check_error: -            msg = ("The following Elasticsearch deployment issue was found:" -                   "{}".format(check_error)) -            return {"failed": True, "changed": False, "msg": msg} - +        es_pods = self.get_pods_for_component("es") +        self.check_elasticsearch(es_pods)          # TODO(lmeyer): run it all again for the ops cluster -        return {"failed": False, "changed": False, "msg": 'No problems found with Elasticsearch deployment.'} -    def _not_running_elasticsearch_pods(self, es_pods): -        """Returns: list of pods that are not running, list of errors about non-running pods""" -        not_running = self.not_running_pods(es_pods) -        if not_running: -            return not_running, [( -                'The following Elasticsearch pods are not running:\n' -                '{pods}' -                'These pods will not aggregate logs from their nodes.' -            ).format(pods=''.join( -                "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) -                for pod in not_running -            ))] -        return not_running, [] +        return {}      def check_elasticsearch(self, es_pods): -        """Various checks for elasticsearch. Returns: error string""" -        not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods) -        running_pods = [pod for pod in es_pods if pod not in not_running_pods] +        """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors.""" +        running_pods, errors = self.running_elasticsearch_pods(es_pods)          pods_by_name = {              pod['metadata']['name']: pod for pod in running_pods              # Filter out pods that are not members of a DC              if pod['metadata'].get('labels', {}).get('deploymentconfig')          }          if not pods_by_name: -            return 'No logging Elasticsearch pods were found. Is logging deployed?' 
-        error_msgs += self._check_elasticsearch_masters(pods_by_name) -        error_msgs += self._check_elasticsearch_node_list(pods_by_name) -        error_msgs += self._check_es_cluster_health(pods_by_name) -        error_msgs += self._check_elasticsearch_diskspace(pods_by_name) -        return '\n'.join(error_msgs) +            # nothing running, cannot run the rest of the check +            errors.append(OpenShiftCheckException( +                'NoRunningPods', +                'No logging Elasticsearch pods were found running, so no logs are being aggregated.' +            )) +            raise OpenShiftCheckExceptionList(errors) + +        errors += self.check_elasticsearch_masters(pods_by_name) +        errors += self.check_elasticsearch_node_list(pods_by_name) +        errors += self.check_es_cluster_health(pods_by_name) +        errors += self.check_elasticsearch_diskspace(pods_by_name) +        if errors: +            raise OpenShiftCheckExceptionList(errors) + +    def running_elasticsearch_pods(self, es_pods): +        """Returns: list of running pods, list of errors about non-running pods""" +        not_running = self.not_running_pods(es_pods) +        running_pods = [pod for pod in es_pods if pod not in not_running] +        if not_running: +            return running_pods, [OpenShiftCheckException( +                'PodNotRunning', +                'The following Elasticsearch pods are defined but not running:\n' +                '{pods}'.format(pods=''.join( +                    "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) +                    for pod in not_running +                )) +            )] +        return running_pods, []      @staticmethod      def _build_es_curl_cmd(pod_name, url):          base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"          return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url) -    def _check_elasticsearch_masters(self, pods_by_name): -        """Check that Elasticsearch masters are sane. Returns: list of error strings""" +    def check_elasticsearch_masters(self, pods_by_name): +        """Check that Elasticsearch masters are sane. Returns: list of errors"""          es_master_names = set() -        error_msgs = [] +        errors = []          for pod_name in pods_by_name.keys():              # Compare what each ES node reports as master and compare for split brain              get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master") -            master_name_str = self.exec_oc(self.logging_namespace, get_master_cmd, []) +            master_name_str = self.exec_oc(get_master_cmd, [])              master_names = (master_name_str or '').split(' ')              if len(master_names) > 1:                  es_master_names.add(master_names[1])              else: -                error_msgs.append( -                    'No master? Elasticsearch {pod} returned bad string when asked master name:\n' +                errors.append(OpenShiftCheckException( +                    'NoMasterName', +                    'Elasticsearch {pod} gave unexpected response when asked master name:\n'                      '  {response}'.format(pod=pod_name, response=master_name_str) -                ) +                ))          if not es_master_names: -            error_msgs.append('No logging Elasticsearch masters were found. 
Is logging deployed?') -            return '\n'.join(error_msgs) +            errors.append(OpenShiftCheckException( +                'NoMasterFound', +                'No logging Elasticsearch masters were found.' +            )) +            return errors          if len(es_master_names) > 1: -            error_msgs.append( +            errors.append(OpenShiftCheckException( +                'SplitBrainMasters',                  'Found multiple Elasticsearch masters according to the pods:\n'                  '{master_list}\n'                  'This implies that the masters have "split brain" and are not correctly\n'                  'replicating data for the logging cluster. Log loss is likely to occur.'                  .format(master_list='\n'.join('  ' + master for master in es_master_names)) -            ) +            )) -        return error_msgs +        return errors -    def _check_elasticsearch_node_list(self, pods_by_name): -        """Check that reported ES masters are accounted for by pods. Returns: list of error strings""" +    def check_elasticsearch_node_list(self, pods_by_name): +        """Check that reported ES masters are accounted for by pods. Returns: list of errors"""          if not pods_by_name: -            return ['No logging Elasticsearch masters were found. Is logging deployed?'] +            return [OpenShiftCheckException( +                'MissingComponentPods', +                'No logging Elasticsearch pods were found.' +            )]          # get ES cluster nodes          node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes') -        cluster_node_data = self.exec_oc(self.logging_namespace, node_cmd, []) +        cluster_node_data = self.exec_oc(node_cmd, [])          try:              cluster_nodes = json.loads(cluster_node_data)['nodes']          except (ValueError, KeyError): -            return [ +            return [OpenShiftCheckException( +                'MissingNodeList',                  'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +                  cluster_node_data -            ] +            )]          # Try to match all ES-reported node hosts to known pods. -        error_msgs = [] +        errors = []          for node in cluster_nodes.values():              # Note that with 1.4/3.4 the pod IP may be used as the master name              if not any(node['host'] in (pod_name, pod['status'].get('podIP'))                         for pod_name, pod in pods_by_name.items()): -                error_msgs.append( +                errors.append(OpenShiftCheckException( +                    'EsPodNodeMismatch',                      'The Elasticsearch cluster reports a member node "{node}"\n'                      'that does not correspond to any known ES pod.'.format(node=node['host']) -                ) +                )) -        return error_msgs +        return errors -    def _check_es_cluster_health(self, pods_by_name): +    def check_es_cluster_health(self, pods_by_name):          """Exec into the elasticsearch pods and check the cluster health. 
Returns: list of errors""" -        error_msgs = [] +        errors = []          for pod_name in pods_by_name.keys():              cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true') -            cluster_health_data = self.exec_oc(self.logging_namespace, cluster_health_cmd, []) +            cluster_health_data = self.exec_oc(cluster_health_cmd, [])              try:                  health_res = json.loads(cluster_health_data)                  if not health_res or not health_res.get('status'):                      raise ValueError()              except ValueError: -                error_msgs.append( +                errors.append(OpenShiftCheckException( +                    'BadEsResponse',                      'Could not retrieve cluster health status from logging ES pod "{pod}".\n'                      'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data) -                ) +                ))                  continue              if health_res['status'] not in ['green', 'yellow']: -                error_msgs.append( +                errors.append(OpenShiftCheckException( +                    'EsClusterHealthRed',                      'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name) -                ) +                )) -        return error_msgs +        return errors -    def _check_elasticsearch_diskspace(self, pods_by_name): +    def check_elasticsearch_diskspace(self, pods_by_name):          """          Exec into an ES pod and query the diskspace on the persistent volume.          Returns: list of errors          """ -        error_msgs = [] +        errors = []          for pod_name in pods_by_name.keys():              df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name) -            disk_output = self.exec_oc(self.logging_namespace, df_cmd, []) +            disk_output = self.exec_oc(df_cmd, [])              lines = disk_output.splitlines()              # expecting one header looking like 'IUse% Use%' and one body line              body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'              if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]): -                error_msgs.append( +                errors.append(OpenShiftCheckException( +                    'BadDfResponse',                      'Could not retrieve storage usage from logging ES pod "{pod}".\n'                      'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output) -                ) +                ))                  continue              inode_pct, disk_pct = re.match(body_re, lines[1]).groups()              inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90')              if int(inode_pct) >= int(inode_pct_thresh): -                error_msgs.append( +                errors.append(OpenShiftCheckException( +                    'InodeUsageTooHigh',                      'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'                      '  is {pct}, greater than threshold {limit}.\n'                      '  Note: threshold can be specified in inventory with {param}'.format( @@ -184,10 +195,11 @@ class Elasticsearch(LoggingCheck):                          pct=str(inode_pct),                          limit=str(inode_pct_thresh),                          param='openshift_check_efk_es_inode_pct', -                    )) +                    )))              
disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80')              if int(disk_pct) >= int(disk_pct_thresh): -                error_msgs.append( +                errors.append(OpenShiftCheckException( +                    'DiskUsageTooHigh',                      'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'                      '  is {pct}, greater than threshold {limit}.\n'                      '  Note: threshold can be specified in inventory with {param}'.format( @@ -195,6 +207,6 @@ class Elasticsearch(LoggingCheck):                          pct=str(disk_pct),                          limit=str(disk_pct_thresh),                          param='openshift_check_efk_es_storage_pct', -                    )) +                    ))) -        return error_msgs +        return errors diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py index b3485bf44..3b192a281 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py @@ -2,6 +2,8 @@  import json + +from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList  from openshift_checks.logging.logging import LoggingCheck @@ -12,57 +14,96 @@ class Fluentd(LoggingCheck):      tags = ["health", "logging"]      def run(self): -        """Check various things and gather errors. Returns: result as hash""" +        """Check the Fluentd deployment and raise an error if any problems are found.""" + +        fluentd_pods = self.get_pods_for_component("fluentd") +        self.check_fluentd(fluentd_pods) +        return {} + +    def check_fluentd(self, pods): +        """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found.""" -        self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") -        fluentd_pods, error = super(Fluentd, self).get_pods_for_component( -            self.logging_namespace, -            "fluentd", +        node_selector = self.get_var( +            'openshift_logging_fluentd_nodeselector', +            default='logging-infra-fluentd=true'          ) -        if error: -            return {"failed": True, "changed": False, "msg": error} -        check_error = self.check_fluentd(fluentd_pods) -        if check_error: -            msg = ("The following Fluentd deployment issue was found:" -                   "{}".format(check_error)) -            return {"failed": True, "changed": False, "msg": msg} +        nodes_by_name = self.get_nodes_by_name() +        fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector) -        # TODO(lmeyer): run it all again for the ops cluster -        return {"failed": False, "changed": False, "msg": 'No problems found with Fluentd deployment.'} +        errors = [] +        errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector) +        errors += self.check_nodes_have_fluentd(pods, fluentd_nodes) +        errors += self.check_fluentd_pods_running(pods) + +        # Make sure there are no extra fluentd pods +        if len(pods) > len(fluentd_nodes): +            errors.append(OpenShiftCheckException( +                'TooManyFluentdPods', +                'There are more Fluentd pods running than nodes labeled.\n' +                'This may not cause problems with logging but it likely indicates something wrong.' 
+            )) + +        if errors: +            raise OpenShiftCheckExceptionList(errors) + +    def get_nodes_by_name(self): +        """Retrieve all the node definitions. Returns: dict(name: node)""" +        nodes_json = self.exec_oc("get nodes -o json", []) +        try: +            nodes = json.loads(nodes_json) +        except ValueError:  # no valid json - should not happen +            raise OpenShiftCheckException( +                "BadOcNodeList", +                "Could not obtain a list of nodes to validate fluentd.\n" +                "Output from oc get:\n" + nodes_json +            ) +        if not nodes or not nodes.get('items'):  # also should not happen +            raise OpenShiftCheckException( +                "NoNodesDefined", +                "No nodes appear to be defined according to the API." +            ) +        return { +            node['metadata']['name']: node +            for node in nodes['items'] +        }      @staticmethod -    def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector): -        """Filter to all nodes with fluentd label. Returns dict(name: node), error string""" +    def filter_fluentd_labeled_nodes(nodes_by_name, node_selector): +        """Filter to all nodes with fluentd label. Returns dict(name: node)"""          label, value = node_selector.split('=', 1)          fluentd_nodes = {              name: node for name, node in nodes_by_name.items()              if node['metadata']['labels'].get(label) == value          }          if not fluentd_nodes: -            return None, ( +            raise OpenShiftCheckException( +                'NoNodesLabeled',                  'There are no nodes with the fluentd label {label}.\n' -                'This means no logs will be aggregated from the nodes.' -            ).format(label=node_selector) -        return fluentd_nodes, None +                'This means no logs will be aggregated from the nodes.'.format(label=node_selector) +            ) +        return fluentd_nodes -    def _check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector): -        """Note if nodes are not labeled as expected. Returns: error string""" +    def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector): +        """Note if nodes are not labeled as expected. Returns: error list"""          intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all'])          if not intended_nodes or '--all' in intended_nodes:              intended_nodes = nodes_by_name.keys()          nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())          if nodes_missing_labels: -            return ( +            return [OpenShiftCheckException( +                'NodesUnlabeled',                  'The following nodes are supposed to be labeled with {label} but are not:\n'                  '  {nodes}\n' -                'Fluentd will not aggregate logs from these nodes.' -            ).format(label=node_selector, nodes=', '.join(nodes_missing_labels)) -        return None +                'Fluentd will not aggregate logs from these nodes.'.format( +                    label=node_selector, nodes=', '.join(nodes_missing_labels) +                ))] + +        return []      @staticmethod -    def _check_nodes_have_fluentd(pods, fluentd_nodes): -        """Make sure fluentd is on all the labeled nodes. Returns: error string""" +    def check_nodes_have_fluentd(pods, fluentd_nodes): +        """Make sure fluentd is on all the labeled nodes. 
Returns: error list"""          unmatched_nodes = fluentd_nodes.copy()          node_names_by_label = {              node['metadata']['labels']['kubernetes.io/hostname']: name @@ -82,80 +123,32 @@ class Fluentd(LoggingCheck):              ]:                  unmatched_nodes.pop(name, None)          if unmatched_nodes: -            return ( +            return [OpenShiftCheckException( +                'MissingFluentdPod',                  'The following nodes are supposed to have a Fluentd pod but do not:\n' -                '{nodes}' -                'These nodes will not have their logs aggregated.' -            ).format(nodes=''.join( -                "  {}\n".format(name) -                for name in unmatched_nodes.keys() -            )) -        return None +                '  {nodes}\n' +                'These nodes will not have their logs aggregated.'.format( +                    nodes='\n  '.join(unmatched_nodes.keys()) +                ))] + +        return [] -    def _check_fluentd_pods_running(self, pods): +    def check_fluentd_pods_running(self, pods):          """Make sure all fluentd pods are running. Returns: error string"""          not_running = super(Fluentd, self).not_running_pods(pods)          if not_running: -            return ( +            return [OpenShiftCheckException( +                'FluentdNotRunning',                  'The following Fluentd pods are supposed to be running but are not:\n' -                '{pods}' -                'These pods will not aggregate logs from their nodes.' -            ).format(pods=''.join( -                "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None')) -                for pod in not_running -            )) -        return None - -    def check_fluentd(self, pods): -        """Verify fluentd is running everywhere. Returns: error string""" - -        node_selector = self.get_var( -            'openshift_logging_fluentd_nodeselector', -            default='logging-infra-fluentd=true' -        ) - -        nodes_by_name, error = self.get_nodes_by_name() - -        if error: -            return error -        fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector) -        if error: -            return error - -        error_msgs = [] -        error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector) -        if error: -            error_msgs.append(error) -        error = self._check_nodes_have_fluentd(pods, fluentd_nodes) -        if error: -            error_msgs.append(error) -        error = self._check_fluentd_pods_running(pods) -        if error: -            error_msgs.append(error) - -        # Make sure there are no extra fluentd pods -        if len(pods) > len(fluentd_nodes): -            error_msgs.append( -                'There are more Fluentd pods running than nodes labeled.\n' -                'This may not cause problems with logging but it likely indicates something wrong.' -            ) - -        return '\n'.join(error_msgs) - -    def get_nodes_by_name(self): -        """Retrieve all the node definitions. Returns: dict(name: node), error string""" -        nodes_json = self.exec_oc( -            self.logging_namespace, -            "get nodes -o json", -            [] -        ) -        try: -            nodes = json.loads(nodes_json) -        except ValueError:  # no valid json - should not happen -            return None, "Could not obtain a list of nodes to validate fluentd. 
Output from oc get:\n" + nodes_json -        if not nodes or not nodes.get('items'):  # also should not happen -            return None, "No nodes appear to be defined according to the API." -        return { -            node['metadata']['name']: node -            for node in nodes['items'] -        }, None +                '  {pods}\n' +                'These pods will not aggregate logs from their nodes.'.format( +                    pods='\n'.join( +                        "  {name} ({host})".format( +                            name=pod['metadata']['name'], +                            host=pod['spec'].get('host', 'None') +                        ) +                        for pod in not_running +                    ) +                ))] + +        return [] diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py index 0970f0a63..d783e6760 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py +++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py @@ -24,7 +24,6 @@ class FluentdConfig(LoggingCheck):      def run(self):          """Check that Fluentd has running pods, and that its logging config matches Docker's logging config.""" -        self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace)          config_error = self.check_logging_config()          if config_error:              msg = ("The following Fluentd logging configuration problem was found:" @@ -120,19 +119,13 @@ class FluentdConfig(LoggingCheck):      def running_fluentd_pods(self):          """Return a list of running fluentd pods.""" -        fluentd_pods, error = self.get_pods_for_component( -            self.logging_namespace, -            "fluentd", -        ) -        if error: -            msg = 'Unable to retrieve any pods for the "fluentd" logging component: {}'.format(error) -            raise OpenShiftCheckException(msg) +        fluentd_pods = self.get_pods_for_component("fluentd")          running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running']          if not running_fluentd_pods: -            msg = ('No Fluentd pods were found to be in the "Running" state. ' -                   'At least one Fluentd pod is required in order to perform this check.') - -            raise OpenShiftCheckException(msg) +            raise OpenShiftCheckException( +                'No Fluentd pods were found to be in the "Running" state. ' +                'At least one Fluentd pod is required in order to perform this check.' +            )          return running_fluentd_pods diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py index efb14ab42..3b1cf8baa 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py +++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py @@ -12,7 +12,7 @@ except ImportError:      from urllib.error import HTTPError, URLError      import urllib.request as urllib2 -from openshift_checks.logging.logging import LoggingCheck +from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException  class Kibana(LoggingCheck): @@ -24,25 +24,12 @@ class Kibana(LoggingCheck):      def run(self):          """Check various things and gather errors. 
Returns: result as hash""" -        self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging") -        kibana_pods, error = self.get_pods_for_component( -            self.logging_namespace, -            "kibana", -        ) -        if error: -            return {"failed": True, "changed": False, "msg": error} -        check_error = self.check_kibana(kibana_pods) - -        if not check_error: -            check_error = self._check_kibana_route() - -        if check_error: -            msg = ("The following Kibana deployment issue was found:" -                   "{}".format(check_error)) -            return {"failed": True, "changed": False, "msg": msg} - +        kibana_pods = self.get_pods_for_component("kibana") +        self.check_kibana(kibana_pods) +        self.check_kibana_route()          # TODO(lmeyer): run it all again for the ops cluster -        return {"failed": False, "changed": False, "msg": 'No problems found with Kibana deployment.'} + +        return {}      def _verify_url_internal(self, url):          """ @@ -65,7 +52,7 @@ class Kibana(LoggingCheck):      def _verify_url_external(url):          """          Try to reach a URL from ansible control host. -        Returns: success (bool), reason (for failure) +        Raise an OpenShiftCheckException if anything goes wrong.          """          # This actually checks from the ansible control host, which may or may not          # really be "external" to the cluster. @@ -91,130 +78,149 @@ class Kibana(LoggingCheck):          return None      def check_kibana(self, pods): -        """Check to see if Kibana is up and working. Returns: error string.""" +        """Check to see if Kibana is up and working. Raises OpenShiftCheckException if not."""          if not pods: -            return "There are no Kibana pods deployed, so no access to the logging UI." +            raise OpenShiftCheckException( +                "MissingComponentPods", +                "There are no Kibana pods deployed, so no access to the logging UI." +            )          not_running = self.not_running_pods(pods)          if len(not_running) == len(pods): -            return "No Kibana pod is in a running state, so there is no access to the logging UI." +            raise OpenShiftCheckException( +                "NoRunningPods", +                "No Kibana pod is in a running state, so there is no access to the logging UI." +            )          elif not_running: -            return ( +            raise OpenShiftCheckException( +                "PodNotRunning",                  "The following Kibana pods are not currently in a running state:\n" -                "{pods}" -                "However at least one is, so service may not be impacted." -            ).format(pods="".join("  " + pod['metadata']['name'] + "\n" for pod in not_running)) - -        return None +                "  {pods}\n" +                "However at least one is, so service may not be impacted.".format( +                    pods="\n  ".join(pod['metadata']['name'] for pod in not_running) +                ) +            )      def _get_kibana_url(self):          """          Get kibana route or report error. 
-        Returns: url (or empty), reason for failure +        Returns: url          """          # Get logging url -        get_route = self.exec_oc( -            self.logging_namespace, -            "get route logging-kibana -o json", -            [], -        ) +        get_route = self.exec_oc("get route logging-kibana -o json", [])          if not get_route: -            return None, 'no_route_exists' +            raise OpenShiftCheckException( +                'no_route_exists', +                'No route is defined for Kibana in the logging namespace,\n' +                'so the logging stack is not accessible. Is logging deployed?\n' +                'Did something remove the logging-kibana route?' +            ) -        route = json.loads(get_route) +        try: +            route = json.loads(get_route) +            # check that the route has been accepted by a router +            ingress = route["status"]["ingress"] +        except (ValueError, KeyError): +            raise OpenShiftCheckException( +                'get_route_failed', +                '"oc get route" returned an unexpected response:\n' + get_route +            ) -        # check that the route has been accepted by a router -        ingress = route["status"]["ingress"]          # ingress can be null if there is no router, or empty if not routed          if not ingress or not ingress[0]: -            return None, 'route_not_accepted' +            raise OpenShiftCheckException( +                'route_not_accepted', +                'The logging-kibana route is not being routed by any router.\n' +                'Is the router deployed and working?' +            )          host = route.get("spec", {}).get("host")          if not host: -            return None, 'route_missing_host' +            raise OpenShiftCheckException( +                'route_missing_host', +                'The logging-kibana route has no hostname defined,\n' +                'which should never happen. Did something alter its definition?' +            ) -        return 'https://{}/'.format(host), None +        return 'https://{}/'.format(host) -    def _check_kibana_route(self): +    def check_kibana_route(self):          """          Check to see if kibana route is up and working. -        Returns: error string +        Raises exception if not.          """ -        known_errors = dict( -            no_route_exists=( -                'No route is defined for Kibana in the logging namespace,\n' -                'so the logging stack is not accessible. Is logging deployed?\n' -                'Did something remove the logging-kibana route?' -            ), -            route_not_accepted=( -                'The logging-kibana route is not being routed by any router.\n' -                'Is the router deployed and working?' -            ), -            route_missing_host=( -                'The logging-kibana route has no hostname defined,\n' -                'which should never happen. Did something alter its definition?' -            ), -        ) -        kibana_url, error = self._get_kibana_url() -        if not kibana_url: -            return known_errors.get(error, error) +        kibana_url = self._get_kibana_url()          # first, check that kibana is reachable from the master.          
error = self._verify_url_internal(kibana_url)          if error:              if 'urlopen error [Errno 111] Connection refused' in error: -                error = ( +                raise OpenShiftCheckException( +                    'FailedToConnectInternal',                      'Failed to connect from this master to Kibana URL {url}\n' -                    'Is kibana running, and is at least one router routing to it?' -                ).format(url=kibana_url) +                    'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url) +                )              elif 'urlopen error [Errno -2] Name or service not known' in error: -                error = ( +                raise OpenShiftCheckException( +                    'FailedToResolveInternal',                      'Failed to connect from this master to Kibana URL {url}\n'                      'because the hostname does not resolve.\n' -                    'Is DNS configured for the Kibana hostname?' -                ).format(url=kibana_url) +                    'Is DNS configured for the Kibana hostname?'.format(url=kibana_url) +                )              elif 'Status code was not' in error: -                error = ( +                raise OpenShiftCheckException( +                    'WrongReturnCodeInternal',                      'A request from this master to the Kibana URL {url}\n'                      'did not return the correct status code (302).\n'                      'This could mean that Kibana is malfunctioning, the hostname is\n'                      'resolving incorrectly, or other network issues. The output was:\n' -                    '  {error}' -                ).format(url=kibana_url, error=error) -            return 'Error validating the logging Kibana route:\n' + error +                    '  {error}'.format(url=kibana_url, error=error) +                ) +            raise OpenShiftCheckException( +                'MiscRouteErrorInternal', +                'Error validating the logging Kibana route internally:\n' + error +            )          # in production we would like the kibana route to work from outside the          # cluster too; but that may not be the case, so allow disabling just this part. -        if not self.get_var("openshift_check_efk_kibana_external", default=True): -            return None +        if self.get_var("openshift_check_efk_kibana_external", default="True").lower() != "true": +            return          error = self._verify_url_external(kibana_url) -        if error: -            if 'urlopen error [Errno 111] Connection refused' in error: -                error = ( -                    'Failed to connect from the Ansible control host to Kibana URL {url}\n' -                    'Is the router for the Kibana hostname exposed externally?' -                ).format(url=kibana_url) -            elif 'urlopen error [Errno -2] Name or service not known' in error: -                error = ( -                    'Failed to resolve the Kibana hostname in {url}\n' -                    'from the Ansible control host.\n' -                    'Is DNS configured to resolve this Kibana hostname externally?' 
-                ).format(url=kibana_url) -            elif 'Expected success (200)' in error: -                error = ( -                    'A request to Kibana at {url}\n' -                    'returned the wrong error code:\n' -                    '  {error}\n' -                    'This could mean that Kibana is malfunctioning, the hostname is\n' -                    'resolving incorrectly, or other network issues.' -                ).format(url=kibana_url, error=error) -            error = ( -                'Error validating the logging Kibana route:\n{error}\n' -                'To disable external Kibana route validation, set in your inventory:\n' -                '  openshift_check_efk_kibana_external=False' -            ).format(error=error) -            return error -        return None + +        if not error: +            return + +        error_fmt = ( +            'Error validating the logging Kibana route:\n{error}\n' +            'To disable external Kibana route validation, set the variable:\n' +            '  openshift_check_efk_kibana_external=False' +        ) +        if 'urlopen error [Errno 111] Connection refused' in error: +            msg = ( +                'Failed to connect from the Ansible control host to Kibana URL {url}\n' +                'Is the router for the Kibana hostname exposed externally?' +            ).format(url=kibana_url) +            raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg)) +        elif 'urlopen error [Errno -2] Name or service not known' in error: +            msg = ( +                'Failed to resolve the Kibana hostname in {url}\n' +                'from the Ansible control host.\n' +                'Is DNS configured to resolve this Kibana hostname externally?' +            ).format(url=kibana_url) +            raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg)) +        elif 'Expected success (200)' in error: +            msg = ( +                'A request to Kibana at {url}\n' +                'returned the wrong error code:\n' +                '  {error}\n' +                'This could mean that Kibana is malfunctioning, the hostname is\n' +                'resolving incorrectly, or other network issues.' +            ).format(url=kibana_url, error=error) +            raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg)) +        raise OpenShiftCheckException( +            'MiscRouteError', +            'Error validating the logging Kibana route externally:\n' + error +        ) diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py index 43ba6c406..3b7c39760 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/logging.py +++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py @@ -8,6 +8,16 @@ import os  from openshift_checks import OpenShiftCheck, OpenShiftCheckException +class MissingComponentPods(OpenShiftCheckException): +    """Raised when a component has no pods in the namespace.""" +    pass + + +class CouldNotUseOc(OpenShiftCheckException): +    """Raised when ocutil has a failure running oc.""" +    pass + +  class LoggingCheck(OpenShiftCheck):      """Base class for OpenShift aggregated logging component checks""" @@ -15,7 +25,6 @@ class LoggingCheck(OpenShiftCheck):      # run by itself.      
name = "logging" -    logging_namespace = "logging"      def is_active(self):          logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False) @@ -32,22 +41,24 @@ class LoggingCheck(OpenShiftCheck):      def run(self):          return {} -    def get_pods_for_component(self, namespace, logging_component): -        """Get all pods for a given component. Returns: list of pods for component, error string""" +    def get_pods_for_component(self, logging_component): +        """Get all pods for a given component. Returns: list of pods."""          pod_output = self.exec_oc( -            namespace,              "get pods -l component={} -o json".format(logging_component),              [],          )          try: -            pods = json.loads(pod_output) -            if not pods or not pods.get('items'): +            pods = json.loads(pod_output)  # raises ValueError if deserialize fails +            if not pods or not pods.get('items'):  # also a broken response, treat the same                  raise ValueError()          except ValueError: -            # successful run but non-parsing data generally means there were no pods in the namespace -            return None, 'No pods were found for the "{}" logging component.'.format(logging_component) +            # successful run but non-parsing data generally means there were no pods to be found +            raise MissingComponentPods( +                'There are no "{}" component pods in the "{}" namespace.\n' +                'Is logging deployed?'.format(logging_component, self.logging_namespace()) +            ) -        return pods['items'], None +        return pods['items']      @staticmethod      def not_running_pods(pods): @@ -63,15 +74,19 @@ class LoggingCheck(OpenShiftCheck):              )          ] -    def exec_oc(self, namespace="logging", cmd_str="", extra_args=None): +    def logging_namespace(self): +        """Returns the namespace in which logging is configured to deploy.""" +        return self.get_var("openshift_logging_namespace", default="logging") + +    def exec_oc(self, cmd_str="", extra_args=None):          """          Execute an 'oc' command in the remote host.          Returns: output of command and namespace, -        or raises OpenShiftCheckException on error +        or raises CouldNotUseOc on error          """          config_base = self.get_var("openshift", "common", "config_base")          args = { -            "namespace": namespace, +            "namespace": self.logging_namespace(),              "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),              "cmd": cmd_str,              "extra_args": list(extra_args) if extra_args else [], @@ -79,17 +94,16 @@ class LoggingCheck(OpenShiftCheck):          result = self.execute_module("ocutil", args)          if result.get("failed"): -            msg = ( -                'Unexpected error using `oc` to validate the logging stack components.\n' -                'Error executing `oc {cmd}`:\n' -                '{error}' -            ).format(cmd=args['cmd'], error=result['result']) -              if result['result'] == '[Errno 2] No such file or directory': -                msg = ( +                raise CouldNotUseOc(                      "This host is supposed to be a master but does not have the `oc` command where expected.\n"                      "Has an installation been run on this host yet?"                  
) -            raise OpenShiftCheckException(msg) + +            raise CouldNotUseOc( +                'Unexpected error using `oc` to validate the logging stack components.\n' +                'Error executing `oc {cmd}`:\n' +                '{error}'.format(cmd=args['cmd'], error=result['result']) +            )          return result.get("result", "") diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py index b24e88e05..d781db649 100644 --- a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py +++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py @@ -19,8 +19,6 @@ class LoggingIndexTime(LoggingCheck):      name = "logging_index_time"      tags = ["health", "logging"] -    logging_namespace = "logging" -      def run(self):          """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs."""          try: @@ -28,29 +26,25 @@ class LoggingIndexTime(LoggingCheck):                  self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS)              )          except ValueError: -            return { -                "failed": True, -                "msg": ('Invalid value provided for "openshift_check_logging_index_timeout_seconds". ' -                        'Value must be an integer representing an amount in seconds.'), -            } +            raise OpenShiftCheckException( +                'InvalidTimeout', +                'Invalid value provided for "openshift_check_logging_index_timeout_seconds". ' +                'Value must be an integer representing an amount in seconds.' +            )          running_component_pods = dict()          # get all component pods -        self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace)          for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']): -            pods, error = self.get_pods_for_component(self.logging_namespace, component) - -            if error: -                msg = 'Unable to retrieve pods for the {} logging component: {}' -                return {"failed": True, "changed": False, "msg": msg.format(name, error)} - +            pods = self.get_pods_for_component(component)              running_pods = self.running_pods(pods)              if not running_pods: -                msg = ('No {} pods in the "Running" state were found.' -                       'At least one pod is required in order to perform this check.') -                return {"failed": True, "changed": False, "msg": msg.format(name)} +                raise OpenShiftCheckException( +                    component + 'NoRunningPods', +                    'No {} pods in the "Running" state were found.' +                    'At least one pod is required in order to perform this check.'.format(name) +                )              running_component_pods[component] = running_pods @@ -65,8 +59,11 @@ class LoggingIndexTime(LoggingCheck):          interval = 1          while not self.query_es_from_es(es_pod, uuid):              if time.time() + interval > deadline: -                msg = "expecting match in Elasticsearch for message with uuid {}, but no matches were found after {}s." 
-                raise OpenShiftCheckException(msg.format(uuid, timeout_secs)) +                raise OpenShiftCheckException( +                    "NoMatchFound", +                    "expecting match in Elasticsearch for message with uuid {}, " +                    "but no matches were found after {}s.".format(uuid, timeout_secs) +                )              time.sleep(interval)      def curl_kibana_with_uuid(self, kibana_pod): @@ -76,22 +73,23 @@ class LoggingIndexTime(LoggingCheck):          exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}"          exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid) -        error_str = self.exec_oc(self.logging_namespace, exec_cmd, []) +        error_str = self.exec_oc(exec_cmd, [])          try:              error_code = json.loads(error_str)["statusCode"] -        except KeyError: -            msg = ('invalid response returned from Kibana request (Missing "statusCode" key):\n' -                   'Command: {}\nResponse: {}').format(exec_cmd, error_str) -            raise OpenShiftCheckException(msg) -        except ValueError: -            msg = ('invalid response returned from Kibana request (Non-JSON output):\n' -                   'Command: {}\nResponse: {}').format(exec_cmd, error_str) -            raise OpenShiftCheckException(msg) +        except (KeyError, ValueError): +            raise OpenShiftCheckException( +                'kibanaInvalidResponse', +                'invalid response returned from Kibana request:\n' +                'Command: {}\nResponse: {}'.format(exec_cmd, error_str) +            )          if error_code != 404: -            msg = 'invalid error code returned from Kibana request. Expecting error code "404", but got "{}" instead.' 
-            raise OpenShiftCheckException(msg.format(error_code)) +            raise OpenShiftCheckException( +                'kibanaInvalidReturnCode', +                'invalid error code returned from Kibana request.\n' +                'Expecting error code "404", but got "{}" instead.'.format(error_code) +            )          return uuid @@ -105,17 +103,18 @@ class LoggingIndexTime(LoggingCheck):              "--key /etc/elasticsearch/secret/admin-key "              "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"          ) -        exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace, uuid=uuid) -        result = self.exec_oc(self.logging_namespace, exec_cmd, []) +        exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid) +        result = self.exec_oc(exec_cmd, [])          try:              count = json.loads(result)["count"] -        except KeyError: -            msg = 'invalid response from Elasticsearch query:\n"{}"\nMissing "count" key:\n{}' -            raise OpenShiftCheckException(msg.format(exec_cmd, result)) -        except ValueError: -            msg = 'invalid response from Elasticsearch query:\n"{}"\nNon-JSON output:\n{}' -            raise OpenShiftCheckException(msg.format(exec_cmd, result)) +        except (KeyError, ValueError): +            raise OpenShiftCheckException( +                'esInvalidResponse', +                'Invalid response from Elasticsearch query:\n' +                '  {}\n' +                'Response was:\n{}'.format(exec_cmd, result) +            )          return count diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py index 3b2c64e6a..e9bae60a3 100644 --- a/roles/openshift_health_checker/openshift_checks/mixins.py +++ b/roles/openshift_health_checker/openshift_checks/mixins.py @@ -29,10 +29,10 @@ class DockerHostMixin(object):          """          Ensure that docker-related packages exist, but not on atomic hosts          (which would not be able to install but should already have them). -        Returns: msg, failed, changed +        Returns: msg, failed          """          if self.get_var("openshift", "common", "is_atomic"): -            return "", False, False +            return "", False          # NOTE: we would use the "package" module but it's actually an action plugin          # and it's not clear how to invoke one of those. 
This is about the same anyway: @@ -49,5 +49,5 @@ class DockerHostMixin(object):                  "    {deps}\n{msg}"              ).format(deps=',\n    '.join(self.dependencies), msg=msg)          failed = result.get("failed", False) or result.get("rc", 0) != 0 -        changed = result.get("changed", False) -        return msg, failed, changed +        self.changed = result.get("changed", False) +        return msg, failed diff --git a/roles/openshift_health_checker/test/action_plugin_test.py b/roles/openshift_health_checker/test/action_plugin_test.py index 2d068be3d..f5161d6f5 100644 --- a/roles/openshift_health_checker/test/action_plugin_test.py +++ b/roles/openshift_health_checker/test/action_plugin_test.py @@ -6,7 +6,7 @@ from openshift_health_check import ActionModule, resolve_checks  from openshift_checks import OpenShiftCheckException -def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, run_exception=None): +def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, run_exception=None, changed=False):      """Returns a new class that is compatible with OpenShiftCheck for testing."""      _name, _tags = name, tags @@ -14,6 +14,7 @@ def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, ru      class FakeCheck(object):          name = _name          tags = _tags or [] +        changed = False          def __init__(self, execute_module=None, task_vars=None, tmp=None):              pass @@ -22,6 +23,7 @@ def fake_check(name='fake_check', tags=None, is_active=True, run_return=None, ru              return is_active          def run(self): +            self.changed = changed              if run_exception is not None:                  raise run_exception              return run_return @@ -135,14 +137,15 @@ def test_action_plugin_run_check_ok(plugin, task_vars, monkeypatch):  def test_action_plugin_run_check_changed(plugin, task_vars, monkeypatch): -    check_return_value = {'ok': 'test', 'changed': True} -    check_class = fake_check(run_return=check_return_value) +    check_return_value = {'ok': 'test'} +    check_class = fake_check(run_return=check_return_value, changed=True)      monkeypatch.setattr(plugin, 'load_known_checks', lambda tmp, task_vars: {'fake_check': check_class()})      monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check'])      result = plugin.run(tmp=None, task_vars=task_vars)      assert result['checks']['fake_check'] == check_return_value +    assert changed(result['checks']['fake_check'])      assert not failed(result)      assert changed(result)      assert not skipped(result) @@ -165,7 +168,7 @@ def test_action_plugin_run_check_fail(plugin, task_vars, monkeypatch):  def test_action_plugin_run_check_exception(plugin, task_vars, monkeypatch):      exception_msg = 'fake check has an exception'      run_exception = OpenShiftCheckException(exception_msg) -    check_class = fake_check(run_exception=run_exception) +    check_class = fake_check(run_exception=run_exception, changed=True)      monkeypatch.setattr(plugin, 'load_known_checks', lambda tmp, task_vars: {'fake_check': check_class()})      monkeypatch.setattr('openshift_health_check.resolve_checks', lambda *args: ['fake_check']) @@ -173,7 +176,8 @@ def test_action_plugin_run_check_exception(plugin, task_vars, monkeypatch):      assert failed(result['checks']['fake_check'], msg_has=exception_msg)      assert failed(result, msg_has=['failed']) -    assert not changed(result) +    assert 
changed(result['checks']['fake_check']) +    assert changed(result)      assert not skipped(result) diff --git a/roles/openshift_health_checker/test/curator_test.py b/roles/openshift_health_checker/test/curator_test.py index ae108c96e..62c680b74 100644 --- a/roles/openshift_health_checker/test/curator_test.py +++ b/roles/openshift_health_checker/test/curator_test.py @@ -1,22 +1,6 @@  import pytest -from openshift_checks.logging.curator import Curator - - -def canned_curator(exec_oc=None): -    """Create a Curator check object with canned exec_oc method""" -    check = Curator("dummy")  # fails if a module is actually invoked -    if exec_oc: -        check._exec_oc = exec_oc -    return check - - -def assert_error(error, expect_error): -    if expect_error: -        assert error -        assert expect_error in error -    else: -        assert not error +from openshift_checks.logging.curator import Curator, OpenShiftCheckException  plain_curator_pod = { @@ -44,25 +28,30 @@ not_running_curator_pod = {  } +def test_get_curator_pods(): +    check = Curator() +    check.get_pods_for_component = lambda *_: [plain_curator_pod] +    result = check.run() +    assert "failed" not in result or not result["failed"] + +  @pytest.mark.parametrize('pods, expect_error', [      (          [], -        "no Curator pods", -    ), -    ( -        [plain_curator_pod], -        None, +        'MissingComponentPods',      ),      (          [not_running_curator_pod], -        "not currently in a running state", +        'CuratorNotRunning',      ),      (          [plain_curator_pod, plain_curator_pod], -        "more than one Curator pod", +        'TooManyCurators',      ),  ]) -def test_get_curator_pods(pods, expect_error): -    check = canned_curator() -    error = check.check_curator(pods) -    assert_error(error, expect_error) +def test_get_curator_pods_fail(pods, expect_error): +    check = Curator() +    check.get_pods_for_component = lambda *_: pods +    with pytest.raises(OpenShiftCheckException) as excinfo: +        check.run() +    assert excinfo.value.name == expect_error diff --git a/roles/openshift_health_checker/test/elasticsearch_test.py b/roles/openshift_health_checker/test/elasticsearch_test.py index 67408609a..09bacd9ac 100644 --- a/roles/openshift_health_checker/test/elasticsearch_test.py +++ b/roles/openshift_health_checker/test/elasticsearch_test.py @@ -1,17 +1,26 @@  import pytest  import json -from openshift_checks.logging.elasticsearch import Elasticsearch +from openshift_checks.logging.elasticsearch import Elasticsearch, OpenShiftCheckExceptionList +  task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin'))) -def assert_error(error, expect_error): -    if expect_error: -        assert error -        assert expect_error in error -    else: -        assert not error +def canned_elasticsearch(task_vars=None, exec_oc=None): +    """Create an Elasticsearch check object with stubbed exec_oc method""" +    check = Elasticsearch(None, task_vars or {}) +    if exec_oc: +        check.exec_oc = exec_oc +    return check + + +def assert_error_in_list(expect_err, errorlist): +    assert any(err.name == expect_err for err in errorlist), "{} in {}".format(str(expect_err), str(errorlist)) + + +def pods_by_name(pods): +    return {pod['metadata']['name']: pod for pod in pods}  plain_es_pod = { @@ -19,6 +28,7 @@ plain_es_pod = {          "labels": {"component": "es", "deploymentconfig": "logging-es"},          "name": "logging-es",      }, +    "spec": {},      "status": {    
      "conditions": [{"status": "True", "type": "Ready"}],          "containerStatuses": [{"ready": True}], @@ -32,6 +42,7 @@ split_es_pod = {          "labels": {"component": "es", "deploymentconfig": "logging-es-2"},          "name": "logging-es-2",      }, +    "spec": {},      "status": {          "conditions": [{"status": "True", "type": "Ready"}],          "containerStatuses": [{"ready": True}], @@ -40,12 +51,28 @@ split_es_pod = {      "_test_master_name_str": "name logging-es-2",  } +unready_es_pod = { +    "metadata": { +        "labels": {"component": "es", "deploymentconfig": "logging-es-3"}, +        "name": "logging-es-3", +    }, +    "spec": {}, +    "status": { +        "conditions": [{"status": "False", "type": "Ready"}], +        "containerStatuses": [{"ready": False}], +        "podIP": "10.10.10.10", +    }, +    "_test_master_name_str": "BAD_NAME_RESPONSE", +} +  def test_check_elasticsearch(): -    assert 'No logging Elasticsearch pods' in Elasticsearch().check_elasticsearch([]) +    with pytest.raises(OpenShiftCheckExceptionList) as excinfo: +        canned_elasticsearch().check_elasticsearch([]) +    assert_error_in_list('NoRunningPods', excinfo.value)      # canned oc responses to match so all the checks pass -    def _exec_oc(ns, cmd, args): +    def exec_oc(cmd, args):          if '_cat/master' in cmd:              return 'name logging-es'          elif '/_nodes' in cmd: @@ -57,35 +84,41 @@ def test_check_elasticsearch():          else:              raise Exception(cmd) -    check = Elasticsearch(None, {}) -    check.exec_oc = _exec_oc -    assert not check.check_elasticsearch([plain_es_pod]) +    check = canned_elasticsearch({}, exec_oc) +    check.get_pods_for_component = lambda *_: [plain_es_pod] +    assert {} == check.run() -def pods_by_name(pods): -    return {pod['metadata']['name']: pod for pod in pods} +def test_check_running_es_pods(): +    pods, errors = Elasticsearch().running_elasticsearch_pods([plain_es_pod, unready_es_pod]) +    assert plain_es_pod in pods +    assert_error_in_list('PodNotRunning', errors) + + +def test_check_elasticsearch_masters(): +    pods = [plain_es_pod] +    check = canned_elasticsearch(task_vars_config_base, lambda *_: plain_es_pod['_test_master_name_str']) +    assert not check.check_elasticsearch_masters(pods_by_name(pods))  @pytest.mark.parametrize('pods, expect_error', [      (          [], -        'No logging Elasticsearch masters', +        'NoMasterFound',      ),      ( -        [plain_es_pod], -        None, +        [unready_es_pod], +        'NoMasterName',      ),      (          [plain_es_pod, split_es_pod], -        'Found multiple Elasticsearch masters', +        'SplitBrainMasters',      ),  ]) -def test_check_elasticsearch_masters(pods, expect_error): +def test_check_elasticsearch_masters_error(pods, expect_error):      test_pods = list(pods) -    check = Elasticsearch(None, task_vars_config_base) -    check.execute_module = lambda cmd, args: {'result': test_pods.pop(0)['_test_master_name_str']} -    errors = check._check_elasticsearch_masters(pods_by_name(pods)) -    assert_error(''.join(errors), expect_error) +    check = canned_elasticsearch(task_vars_config_base, lambda *_: test_pods.pop(0)['_test_master_name_str']) +    assert_error_in_list(expect_error, check.check_elasticsearch_masters(pods_by_name(pods)))  es_node_list = { @@ -95,83 +128,76 @@ es_node_list = {          }}} +def test_check_elasticsearch_node_list(): +    check = canned_elasticsearch(task_vars_config_base, lambda *_: 
json.dumps(es_node_list)) +    assert not check.check_elasticsearch_node_list(pods_by_name([plain_es_pod])) + +  @pytest.mark.parametrize('pods, node_list, expect_error', [      (          [],          {}, -        'No logging Elasticsearch masters', -    ), -    ( -        [plain_es_pod], -        es_node_list, -        None, +        'MissingComponentPods',      ),      (          [plain_es_pod],          {},  # empty list of nodes triggers KeyError -        "Failed to query", +        'MissingNodeList',      ),      (          [split_es_pod],          es_node_list, -        'does not correspond to any known ES pod', +        'EsPodNodeMismatch',      ),  ]) -def test_check_elasticsearch_node_list(pods, node_list, expect_error): -    check = Elasticsearch(None, task_vars_config_base) -    check.execute_module = lambda cmd, args: {'result': json.dumps(node_list)} +def test_check_elasticsearch_node_list_errors(pods, node_list, expect_error): +    check = canned_elasticsearch(task_vars_config_base, lambda cmd, args: json.dumps(node_list)) +    assert_error_in_list(expect_error, check.check_elasticsearch_node_list(pods_by_name(pods))) -    errors = check._check_elasticsearch_node_list(pods_by_name(pods)) -    assert_error(''.join(errors), expect_error) + +def test_check_elasticsearch_cluster_health(): +    test_health_data = [{"status": "green"}] +    check = canned_elasticsearch(exec_oc=lambda *_: json.dumps(test_health_data.pop(0))) +    assert not check.check_es_cluster_health(pods_by_name([plain_es_pod]))  @pytest.mark.parametrize('pods, health_data, expect_error', [      (          [plain_es_pod], -        [{"status": "green"}], -        None, -    ), -    ( -        [plain_es_pod],          [{"no-status": "should bomb"}], -        'Could not retrieve cluster health status', +        'BadEsResponse',      ),      (          [plain_es_pod, split_es_pod],          [{"status": "green"}, {"status": "red"}], -        'Elasticsearch cluster health status is RED', +        'EsClusterHealthRed',      ),  ]) -def test_check_elasticsearch_cluster_health(pods, health_data, expect_error): +def test_check_elasticsearch_cluster_health_errors(pods, health_data, expect_error):      test_health_data = list(health_data) -    check = Elasticsearch(None, task_vars_config_base) -    check.execute_module = lambda cmd, args: {'result': json.dumps(test_health_data.pop(0))} +    check = canned_elasticsearch(exec_oc=lambda *_: json.dumps(test_health_data.pop(0))) +    assert_error_in_list(expect_error, check.check_es_cluster_health(pods_by_name(pods))) -    errors = check._check_es_cluster_health(pods_by_name(pods)) -    assert_error(''.join(errors), expect_error) + +def test_check_elasticsearch_diskspace(): +    check = canned_elasticsearch(exec_oc=lambda *_: 'IUse% Use%\n 3%  4%\n') +    assert not check.check_elasticsearch_diskspace(pods_by_name([plain_es_pod]))  @pytest.mark.parametrize('disk_data, expect_error', [      (          'df: /elasticsearch/persistent: No such file or directory\n', -        'Could not retrieve storage usage', -    ), -    ( -        'IUse% Use%\n 3%  4%\n', -        None, +        'BadDfResponse',      ),      (          'IUse% Use%\n 95%  40%\n', -        'Inode percent usage on the storage volume', +        'InodeUsageTooHigh',      ),      (          'IUse% Use%\n 3%  94%\n', -        'Disk percent usage on the storage volume', +        'DiskUsageTooHigh',      ),  ]) -def test_check_elasticsearch_diskspace(disk_data, expect_error): -    check = Elasticsearch(None, 
task_vars_config_base) -    check.execute_module = lambda cmd, args: {'result': disk_data} - -    errors = check._check_elasticsearch_diskspace(pods_by_name([plain_es_pod])) -    assert_error(''.join(errors), expect_error) +def test_check_elasticsearch_diskspace_errors(disk_data, expect_error): +    check = canned_elasticsearch(exec_oc=lambda *_: disk_data) +    assert_error_in_list(expect_error, check.check_elasticsearch_diskspace(pods_by_name([plain_es_pod]))) diff --git a/roles/openshift_health_checker/test/fluentd_config_test.py b/roles/openshift_health_checker/test/fluentd_config_test.py index 8a2d8b72b..10db253bc 100644 --- a/roles/openshift_health_checker/test/fluentd_config_test.py +++ b/roles/openshift_health_checker/test/fluentd_config_test.py @@ -198,12 +198,9 @@ def test_check_logging_config_master(name, pods, logging_driver, extra_words):          ),      ) -    def get_pods(namespace, logging_component): -        return pods, None -      check = FluentdConfig(execute_module, task_vars)      check.execute_module = execute_module -    check.get_pods_for_component = get_pods +    check.get_pods_for_component = lambda _: pods      error = check.check_logging_config()      assert error is None @@ -283,12 +280,9 @@ def test_check_logging_config_master_failed(name, pods, logging_driver, words):          ),      ) -    def get_pods(namespace, logging_component): -        return pods, None -      check = FluentdConfig(execute_module, task_vars)      check.execute_module = execute_module -    check.get_pods_for_component = get_pods +    check.get_pods_for_component = lambda _: pods      error = check.check_logging_config()      assert error is not None @@ -343,11 +337,8 @@ def test_check_logging_config_master_fails_on_unscheduled_deployment(name, pods,          ),      ) -    def get_pods(namespace, logging_component): -        return pods, None -      check = FluentdConfig(execute_module, task_vars) -    check.get_pods_for_component = get_pods +    check.get_pods_for_component = lambda _: pods      with pytest.raises(OpenShiftCheckException) as error:          check.check_logging_config() diff --git a/roles/openshift_health_checker/test/fluentd_test.py b/roles/openshift_health_checker/test/fluentd_test.py index a84d89cef..e7bf9818b 100644 --- a/roles/openshift_health_checker/test/fluentd_test.py +++ b/roles/openshift_health_checker/test/fluentd_test.py @@ -1,15 +1,11 @@  import pytest  import json -from openshift_checks.logging.fluentd import Fluentd +from openshift_checks.logging.fluentd import Fluentd, OpenShiftCheckExceptionList, OpenShiftCheckException -def assert_error(error, expect_error): -    if expect_error: -        assert error -        assert expect_error in error -    else: -        assert not error +def assert_error_in_list(expect_err, errorlist): +    assert any(err.name == expect_err for err in errorlist), "{} in {}".format(str(expect_err), str(errorlist))  fluentd_pod_node1 = { @@ -57,45 +53,60 @@ fluentd_node3_unlabeled = {  } +def test_get_fluentd_pods(): +    check = Fluentd() +    check.exec_oc = lambda *_: json.dumps(dict(items=[fluentd_node1])) +    check.get_pods_for_component = lambda *_: [fluentd_pod_node1] +    assert not check.run() + +  @pytest.mark.parametrize('pods, nodes, expect_error', [      (          [],          [], -        'No nodes appear to be defined', +        'NoNodesDefined',      ),      (          [],          [fluentd_node3_unlabeled], -        'There are no nodes with the fluentd label', +        'NoNodesLabeled',      ),      (         
 [],          [fluentd_node1, fluentd_node3_unlabeled], -        'Fluentd will not aggregate logs from these nodes.', +        'NodesUnlabeled',      ),      (          [],          [fluentd_node2], -        "nodes are supposed to have a Fluentd pod but do not", +        'MissingFluentdPod',      ),      (          [fluentd_pod_node1, fluentd_pod_node1],          [fluentd_node1], -        'more Fluentd pods running than nodes labeled', +        'TooManyFluentdPods',      ),      (          [fluentd_pod_node2_down],          [fluentd_node2], -        "Fluentd pods are supposed to be running", -    ), -    ( -        [fluentd_pod_node1], -        [fluentd_node1], -        None, +        'FluentdNotRunning',      ),  ]) -def test_get_fluentd_pods(pods, nodes, expect_error): +def test_get_fluentd_pods_errors(pods, nodes, expect_error): +    check = Fluentd() +    check.exec_oc = lambda *_: json.dumps(dict(items=nodes)) + +    with pytest.raises(OpenShiftCheckException) as excinfo: +        check.check_fluentd(pods) +    if isinstance(excinfo.value, OpenShiftCheckExceptionList): +        assert_error_in_list(expect_error, excinfo.value) +    else: +        assert expect_error == excinfo.value.name + + +def test_bad_oc_node_list():      check = Fluentd() -    check.exec_oc = lambda ns, cmd, args: json.dumps(dict(items=nodes)) -    error = check.check_fluentd(pods) -    assert_error(error, expect_error) +    check.exec_oc = lambda *_: "this isn't even json" +    with pytest.raises(OpenShiftCheckException) as excinfo: +        check.get_nodes_by_name() +    assert 'BadOcNodeList' == excinfo.value.name diff --git a/roles/openshift_health_checker/test/kibana_test.py b/roles/openshift_health_checker/test/kibana_test.py index 0bf492511..04a5e89c4 100644 --- a/roles/openshift_health_checker/test/kibana_test.py +++ b/roles/openshift_health_checker/test/kibana_test.py @@ -8,15 +8,7 @@ except ImportError:      from urllib.error import HTTPError, URLError      import urllib.request as urllib2 -from openshift_checks.logging.kibana import Kibana - - -def assert_error(error, expect_error): -    if expect_error: -        assert error -        assert expect_error in error -    else: -        assert not error +from openshift_checks.logging.kibana import Kibana, OpenShiftCheckException  plain_kibana_pod = { @@ -41,39 +33,45 @@ not_running_kibana_pod = {  } +def test_check_kibana(): +    # should run without exception: +    Kibana().check_kibana([plain_kibana_pod]) + +  @pytest.mark.parametrize('pods, expect_error', [      (          [], -        "There are no Kibana pods deployed", -    ), -    ( -        [plain_kibana_pod], -        None, +        "MissingComponentPods",      ),      (          [not_running_kibana_pod], -        "No Kibana pod is in a running state", +        "NoRunningPods",      ),      (          [plain_kibana_pod, not_running_kibana_pod], -        "The following Kibana pods are not currently in a running state", +        "PodNotRunning",      ),  ]) -def test_check_kibana(pods, expect_error): -    check = Kibana() -    error = check.check_kibana(pods) -    assert_error(error, expect_error) +def test_check_kibana_error(pods, expect_error): +    with pytest.raises(OpenShiftCheckException) as excinfo: +        Kibana().check_kibana(pods) +    assert expect_error == excinfo.value.name -@pytest.mark.parametrize('route, expect_url, expect_error', [ +@pytest.mark.parametrize('comment, route, expect_error', [      ( +        "No route returned",          None, -        None, -        
'no_route_exists', +        "no_route_exists",      ), -    # test route with no ingress      ( +        "broken route response", +        {"status": {}}, +        "get_route_failed", +    ), +    ( +        "route with no ingress",          {              "metadata": {                  "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, @@ -86,12 +84,11 @@ def test_check_kibana(pods, expect_error):                  "host": "hostname",              }          }, -        None, -        'route_not_accepted', +        "route_not_accepted",      ), -    # test route with no host      ( +        "route with no host",          {              "metadata": {                  "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, @@ -104,12 +101,21 @@ def test_check_kibana(pods, expect_error):              },              "spec": {},          }, -        None, -        'route_missing_host', +        "route_missing_host",      ), +]) +def test_get_kibana_url_error(comment, route, expect_error): +    check = Kibana() +    check.exec_oc = lambda *_: json.dumps(route) if route else "" + +    with pytest.raises(OpenShiftCheckException) as excinfo: +        check._get_kibana_url() +    assert excinfo.value.name == expect_error -    # test route that looks fine + +@pytest.mark.parametrize('comment, route, expect_url', [      ( +        "test route that looks fine",          {              "metadata": {                  "labels": {"component": "kibana", "deploymentconfig": "logging-kibana"}, @@ -125,62 +131,57 @@ def test_check_kibana(pods, expect_error):              },          },          "https://hostname/", -        None,      ),  ]) -def test_get_kibana_url(route, expect_url, expect_error): +def test_get_kibana_url(comment, route, expect_url):      check = Kibana() -    check.exec_oc = lambda ns, cmd, args: json.dumps(route) if route else "" - -    url, error = check._get_kibana_url() -    if expect_url: -        assert url == expect_url -    else: -        assert not url -    if expect_error: -        assert error == expect_error -    else: -        assert not error +    check.exec_oc = lambda *_: json.dumps(route) +    assert expect_url == check._get_kibana_url()  @pytest.mark.parametrize('exec_result, expect', [      (          'urlopen error [Errno 111] Connection refused', -        'at least one router routing to it?', +        'FailedToConnectInternal',      ),      (          'urlopen error [Errno -2] Name or service not known', -        'DNS configured for the Kibana hostname?', +        'FailedToResolveInternal',      ),      (          'Status code was not [302]: HTTP Error 500: Server error', -        'did not return the correct status code', +        'WrongReturnCodeInternal',      ),      (          'bork bork bork', -        'bork bork bork',  # should pass through +        'MiscRouteErrorInternal',      ),  ])  def test_verify_url_internal_failure(exec_result, expect):      check = Kibana(execute_module=lambda *_: dict(failed=True, msg=exec_result)) -    check._get_kibana_url = lambda: ('url', None) +    check._get_kibana_url = lambda: 'url' -    error = check._check_kibana_route() -    assert_error(error, expect) +    with pytest.raises(OpenShiftCheckException) as excinfo: +        check.check_kibana_route() +    assert expect == excinfo.value.name  @pytest.mark.parametrize('lib_result, expect', [      ( -        HTTPError('url', 500, "it broke", hdrs=None, fp=None), -        'it broke', +        HTTPError('url', 500, 'it broke', hdrs=None, 
fp=None), +        'MiscRouteError', +    ), +    ( +        URLError('urlopen error [Errno 111] Connection refused'), +        'FailedToConnect',      ),      ( -        URLError('it broke'), -        'it broke', +        URLError('urlopen error [Errno -2] Name or service not known'), +        'FailedToResolve',      ),      (          302, -        'returned the wrong error code', +        'WrongReturnCode',      ),      (          200, @@ -204,8 +205,40 @@ def test_verify_url_external_failure(lib_result, expect, monkeypatch):      monkeypatch.setattr(urllib2, 'urlopen', urlopen)      check = Kibana() -    check._get_kibana_url = lambda: ('url', None) +    check._get_kibana_url = lambda: 'url'      check._verify_url_internal = lambda url: None -    error = check._check_kibana_route() -    assert_error(error, expect) +    if not expect: +        check.check_kibana_route() +        return + +    with pytest.raises(OpenShiftCheckException) as excinfo: +        check.check_kibana_route() +    assert expect == excinfo.value.name + + +def test_verify_url_external_skip(): +    check = Kibana(lambda *_: {}, dict(openshift_check_efk_kibana_external="false")) +    check._get_kibana_url = lambda: 'url' +    check.check_kibana_route() + + +# this is kind of silly but it adds coverage for the run() method... +def test_run(): +    pods = ["foo"] +    ran = dict(check_kibana=False, check_route=False) + +    def check_kibana(pod_list): +        ran["check_kibana"] = True +        assert pod_list == pods + +    def check_kibana_route(): +        ran["check_route"] = True + +    check = Kibana() +    check.get_pods_for_component = lambda *_: pods +    check.check_kibana = check_kibana +    check.check_kibana_route = check_kibana_route + +    check.run() +    assert ran["check_kibana"] and ran["check_route"] diff --git a/roles/openshift_health_checker/test/logging_check_test.py b/roles/openshift_health_checker/test/logging_check_test.py index 6f1697ee6..1a1c190f6 100644 --- a/roles/openshift_health_checker/test/logging_check_test.py +++ b/roles/openshift_health_checker/test/logging_check_test.py @@ -1,18 +1,14 @@  import pytest  import json -from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException +from openshift_checks.logging.logging import LoggingCheck, MissingComponentPods, CouldNotUseOc  task_vars_config_base = dict(openshift=dict(common=dict(config_base='/etc/origin'))) -logging_namespace = "logging" - - -def canned_loggingcheck(exec_oc=None): +def canned_loggingcheck(exec_oc=None, execute_module=None):      """Create a LoggingCheck object with canned exec_oc method""" -    check = LoggingCheck()  # fails if a module is actually invoked -    check.logging_namespace = 'logging' +    check = LoggingCheck(execute_module)      if exec_oc:          check.exec_oc = exec_oc      return check @@ -97,8 +93,8 @@ def test_oc_failure(problem, expect):      check = LoggingCheck(execute_module, task_vars_config_base) -    with pytest.raises(OpenShiftCheckException) as excinfo: -        check.exec_oc(logging_namespace, 'get foo', []) +    with pytest.raises(CouldNotUseOc) as excinfo: +        check.exec_oc('get foo', [])      assert expect in str(excinfo) @@ -124,25 +120,32 @@ def test_is_active(groups, logging_deployed, is_active):      assert LoggingCheck(None, task_vars).is_active() == is_active -@pytest.mark.parametrize('pod_output, expect_pods, expect_error', [ +@pytest.mark.parametrize('pod_output, expect_pods', [ +    ( +        json.dumps({'items': [plain_es_pod]}), +        
[plain_es_pod], +    ), +]) +def test_get_pods_for_component(pod_output, expect_pods): +    check = canned_loggingcheck(lambda *_: pod_output) +    pods = check.get_pods_for_component("es") +    assert pods == expect_pods + + +@pytest.mark.parametrize('exec_oc_output, expect_error', [      (          'No resources found.', -        None, -        'No pods were found for the "es"', +        MissingComponentPods,      ),      ( -        json.dumps({'items': [plain_kibana_pod, plain_es_pod, plain_curator_pod, fluentd_pod_node1]}), -        [plain_es_pod], -        None, +        '{"items": null}', +        MissingComponentPods,      ),  ]) -def test_get_pods_for_component(pod_output, expect_pods, expect_error): -    check = canned_loggingcheck(lambda namespace, cmd, args: pod_output) -    pods, error = check.get_pods_for_component( -        logging_namespace, -        "es", -    ) -    assert_error(error, expect_error) +def test_get_pods_for_component_fail(exec_oc_output, expect_error): +    check = canned_loggingcheck(lambda *_: exec_oc_output) +    with pytest.raises(expect_error): +        check.get_pods_for_component("es")  @pytest.mark.parametrize('name, pods, expected_pods', [ @@ -159,7 +162,7 @@ def test_get_pods_for_component(pod_output, expect_pods, expect_error):  ], ids=lambda argvals: argvals[0])  def test_get_not_running_pods_no_container_status(name, pods, expected_pods): -    check = canned_loggingcheck(lambda exec_module, namespace, cmd, args, task_vars: '') +    check = canned_loggingcheck(lambda *_: '')      result = check.not_running_pods(pods)      assert result == expected_pods diff --git a/roles/openshift_health_checker/test/logging_index_time_test.py b/roles/openshift_health_checker/test/logging_index_time_test.py index 178d7cd84..22566b295 100644 --- a/roles/openshift_health_checker/test/logging_index_time_test.py +++ b/roles/openshift_health_checker/test/logging_index_time_test.py @@ -69,7 +69,29 @@ def test_check_running_pods(pods, expect_pods):      assert pods == expect_pods -@pytest.mark.parametrize('name, json_response, uuid, timeout, extra_words', [ +def test_bad_config_param(): +    with pytest.raises(OpenShiftCheckException) as error: +        LoggingIndexTime(task_vars=dict(openshift_check_logging_index_timeout_seconds="foo")).run() +    assert 'InvalidTimeout' == error.value.name + + +def test_no_running_pods(): +    check = LoggingIndexTime() +    check.get_pods_for_component = lambda *_: [not_running_kibana_pod] +    with pytest.raises(OpenShiftCheckException) as error: +        check.run() +    assert 'kibanaNoRunningPods' == error.value.name + + +def test_with_running_pods(): +    check = LoggingIndexTime() +    check.get_pods_for_component = lambda *_: [plain_running_kibana_pod, plain_running_elasticsearch_pod] +    check.curl_kibana_with_uuid = lambda *_: SAMPLE_UUID +    check.wait_until_cmd_or_err = lambda *_: None +    assert not check.run().get("failed") + + +@pytest.mark.parametrize('name, json_response, uuid, timeout', [      (          'valid count in response',          { @@ -77,94 +99,72 @@ def test_check_running_pods(pods, expect_pods):          },          SAMPLE_UUID,          0.001, -        [],      ),  ], ids=lambda argval: argval[0]) -def test_wait_until_cmd_or_err_succeeds(name, json_response, uuid, timeout, extra_words): +def test_wait_until_cmd_or_err_succeeds(name, json_response, uuid, timeout):      check = canned_loggingindextime(lambda *_: json.dumps(json_response))      
check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, uuid, timeout) -@pytest.mark.parametrize('name, json_response, uuid, timeout, extra_words', [ +@pytest.mark.parametrize('name, json_response, timeout, expect_error', [      (          'invalid json response',          {              "invalid_field": 1,          }, -        SAMPLE_UUID,          0.001, -        ["invalid response", "Elasticsearch"], +        'esInvalidResponse',      ),      (          'empty response',          {}, -        SAMPLE_UUID,          0.001, -        ["invalid response", "Elasticsearch"], +        'esInvalidResponse',      ),      (          'valid response but invalid match count',          {              "count": 0,          }, -        SAMPLE_UUID,          0.005, -        ["expecting match", SAMPLE_UUID, "0.005s"], +        'NoMatchFound',      )  ], ids=lambda argval: argval[0]) -def test_wait_until_cmd_or_err(name, json_response, uuid, timeout, extra_words): +def test_wait_until_cmd_or_err(name, json_response, timeout, expect_error):      check = canned_loggingindextime(lambda *_: json.dumps(json_response))      with pytest.raises(OpenShiftCheckException) as error: -        check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, uuid, timeout) +        check.wait_until_cmd_or_err(plain_running_elasticsearch_pod, SAMPLE_UUID, timeout) -    for word in extra_words: -        assert word in str(error) +    assert expect_error == error.value.name -@pytest.mark.parametrize('name, json_response, uuid, extra_words', [ -    ( -        'correct response code, found unique id is returned', -        { -            "statusCode": 404, -        }, -        "sample unique id", -        ["sample unique id"], -    ), -], ids=lambda argval: argval[0]) -def test_curl_kibana_with_uuid(name, json_response, uuid, extra_words): -    check = canned_loggingindextime(lambda *_: json.dumps(json_response)) -    check.generate_uuid = lambda: uuid - -    result = check.curl_kibana_with_uuid(plain_running_kibana_pod) - -    for word in extra_words: -        assert word in result +def test_curl_kibana_with_uuid(): +    check = canned_loggingindextime(lambda *_: json.dumps({"statusCode": 404})) +    check.generate_uuid = lambda: SAMPLE_UUID +    assert SAMPLE_UUID == check.curl_kibana_with_uuid(plain_running_kibana_pod) -@pytest.mark.parametrize('name, json_response, uuid, extra_words', [ +@pytest.mark.parametrize('name, json_response, expect_error', [      (          'invalid json response',          {              "invalid_field": "invalid",          }, -        SAMPLE_UUID, -        ["invalid response returned", 'Missing "statusCode" key'], +        'kibanaInvalidResponse',      ),      (          'wrong error code in response',          {              "statusCode": 500,          }, -        SAMPLE_UUID, -        ["Expecting error code", "500"], +        'kibanaInvalidReturnCode',      ),  ], ids=lambda argval: argval[0]) -def test_failed_curl_kibana_with_uuid(name, json_response, uuid, extra_words): +def test_failed_curl_kibana_with_uuid(name, json_response, expect_error):      check = canned_loggingindextime(lambda *_: json.dumps(json_response)) -    check.generate_uuid = lambda: uuid +    check.generate_uuid = lambda: SAMPLE_UUID      with pytest.raises(OpenShiftCheckException) as error:          check.curl_kibana_with_uuid(plain_running_kibana_pod) -    for word in extra_words: -        assert word in str(error) +    assert expect_error == error.value.name  | 
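The recurring pattern across these hunks is that checks now raise OpenShiftCheckException with a short machine-readable name as the first argument and a human-readable message as the second, and the tests match on the name rather than on message text. A minimal standalone sketch of that pattern, assuming the openshift_checks package is importable as it is in the tests above (DummyCheck and DummyFailure are illustrative names, not part of this patch):

import pytest

from openshift_checks import OpenShiftCheckException


class DummyCheck(object):
    """Illustrative check that fails with a named error."""

    def run(self):
        # name first (stable identifier for tests), message second (for the operator)
        raise OpenShiftCheckException(
            'DummyFailure',
            'Human-readable explanation of what went wrong.',
        )


def test_dummy_check_fails_with_named_error():
    with pytest.raises(OpenShiftCheckException) as excinfo:
        DummyCheck().run()
    # assert on the stable name instead of fragile message text
    assert excinfo.value.name == 'DummyFailure'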
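Checks that collect several problems in one pass (the Elasticsearch and Fluentd checks above) wrap them in OpenShiftCheckExceptionList, and the tests scan the raised list for an expected name. A sketch of that aggregation under the same assumptions; the helper, its arguments, and the 90% thresholds are illustrative, only the error names come from the tests above:

from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList


def gather_storage_errors(disk_pct, inode_pct):
    """Illustrative helper that accumulates named errors instead of failing fast."""
    errors = []
    if disk_pct > 90:
        errors.append(OpenShiftCheckException('DiskUsageTooHigh', 'Disk usage is above the allowed threshold.'))
    if inode_pct > 90:
        errors.append(OpenShiftCheckException('InodeUsageTooHigh', 'Inode usage is above the allowed threshold.'))
    if errors:
        raise OpenShiftCheckExceptionList(errors)


# The tests above check membership the same way:
#     assert any(err.name == expect_err for err in excinfo.value)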
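The changed-reporting rework is the other behavioral shift: ensure_dependencies() no longer returns a changed value; a check records host modifications on self.changed and the action plugin aggregates that flag across all checks. A sketch of what a check that installs a package might look like under this convention; the class, the yum arguments, and the package name are illustrative, not taken from the patch:

from openshift_checks import OpenShiftCheck


class InstallSomethingCheck(OpenShiftCheck):
    """Illustrative check that modifies the host and reports it via self.changed."""

    name = "install_something"
    tags = ["preflight"]

    def run(self):
        result = self.execute_module("yum", {"name": "some-package", "state": "present"})
        # record the change on the check object; the action plugin derives the
        # play-level "changed" status from this flag rather than from the result dict
        self.changed = result.get("changed", False)
        if result.get("failed"):
            return {"failed": True, "msg": result.get("msg", "package install failed")}
        return {}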
