11 files changed, 619 insertions, 0 deletions
diff --git a/roles/openshift_health_checker/README.md b/roles/openshift_health_checker/README.md
new file mode 100644
index 000000000..745e45b50
--- /dev/null
+++ b/roles/openshift_health_checker/README.md
@@ -0,0 +1,43 @@
+OpenShift Health Checker
+========================
+
+This role detects common problems with OpenShift installations or with
+environments prior to install.
+
+Requirements
+------------
+
+* Ansible 2.2+
+
+Role Variables
+--------------
+
+None
+
+Dependencies
+------------
+
+- openshift_facts
+
+Example Playbook
+----------------
+
+```yaml
+---
+- hosts: OSEv3
+  name: run OpenShift health checks
+  roles:
+    - openshift_health_checker
+  post_tasks:
+    - action: openshift_health_check
+```
+
+License
+-------
+
+Apache License Version 2.0
+
+Author Information
+------------------
+
+Customer Success team (dev@lists.openshift.redhat.com)
diff --git a/roles/openshift_health_checker/action_plugins/openshift_health_check.py b/roles/openshift_health_checker/action_plugins/openshift_health_check.py
new file mode 100644
index 000000000..36defde0a
--- /dev/null
+++ b/roles/openshift_health_checker/action_plugins/openshift_health_check.py
@@ -0,0 +1,95 @@
+"""
+Ansible action plugin to execute health checks in OpenShift clusters.
+"""
+# pylint: disable=wrong-import-position,missing-docstring,invalid-name
+import sys
+import os
+
+try:
+    from __main__ import display
+except ImportError:
+    from ansible.utils.display import Display
+    display = Display()
+
+from ansible.plugins.action import ActionBase
+
+# Augment sys.path so that we can import checks from a directory relative to
+# this action plugin.
+sys.path.insert(1, os.path.dirname(os.path.dirname(__file__)))
+
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException  # noqa: E402
+
+
+class ActionModule(ActionBase):
+    """Action plugin that runs the requested OpenShift health checks."""
+
+    def run(self, tmp=None, task_vars=None):
+        """Run the checks named in the task's 'checks' arg; fail if any of them fails."""
+        result = super(ActionModule, self).run(tmp, task_vars)
+
+        if task_vars is None:
+            task_vars = {}
+
+        if "openshift" not in task_vars:
+            result["failed"] = True
+            result["msg"] = "'openshift' is undefined, did 'openshift_facts' run?"
+            return result
+
+        try:
+            known_checks = self.load_known_checks()
+        except OpenShiftCheckException as e:
+            result["failed"] = True
+            result["msg"] = str(e)
+            return result
+
+        requested_checks = set(self._task.args.get("checks", []))
+
+        unknown_checks = requested_checks - set(known_checks)
+        if unknown_checks:
+            result["failed"] = True
+            result["msg"] = (
+                "One or more checks are unknown: {}. "
+                "Make sure there is no typo in the playbook and no files are missing."
+            ).format(", ".join(sorted(unknown_checks)))
+            return result
+
+        result["checks"] = check_results = {}
+
+        # Every requested check is known at this point; sort for a stable order.
+        for check_name in sorted(requested_checks):
+            display.banner("CHECK [{} : {}]".format(check_name, task_vars["ansible_host"]))
+            check = known_checks[check_name]
+
+            try:
+                # is_active() can raise as well (e.g. NotContainerized), so it
+                # must be guarded together with run().
+                if check.is_active(task_vars):
+                    r = check.run(tmp, task_vars)
+                else:
+                    r = {"skipped": True}
+            except OpenShiftCheckException as e:
+                r = {"failed": True, "msg": str(e)}
+
+            check_results[check_name] = r
+
+            if r.get("failed", False):
+                result["failed"] = True
+                result["msg"] = "One or more checks failed"
+
+        return result
+
+    def load_known_checks(self):
+        """Return {name: check instance}; raise on duplicate check names."""
+        known_checks = {}
+        for cls in set(OpenShiftCheck.subclasses()):
+            check_name = cls.name
+            if check_name in known_checks:
+                other_cls = known_checks[check_name].__class__
+                raise OpenShiftCheckException(
+                    "non-unique check name '{}' in: '{}.{}' and '{}.{}'".format(
+                        check_name,
+                        cls.__module__, cls.__name__,
+                        other_cls.__module__, other_cls.__name__))
+            known_checks[check_name] = cls(module_executor=self._execute_module)
+
+        return known_checks
diff --git a/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
new file mode 100644
index 000000000..8caefab15
--- /dev/null
+++ b/roles/openshift_health_checker/callback_plugins/zz_failure_summary.py
@@ -0,0 +1,100 @@
+# vim: expandtab:tabstop=4:shiftwidth=4
+'''
+Ansible callback plugin.
+'''
+
+from pprint import pformat
+
+from ansible.plugins.callback import CallbackBase
+from ansible import constants as C
+from ansible.utils.color import stringc
+
+
+class CallbackModule(CallbackBase):
+    '''
+    This callback plugin stores task results and summarizes failures.
+    The file name is prefixed with `zz_` to make this plugin be loaded last by
+    Ansible, thus making its output the last thing that users see.
+    '''
+
+    CALLBACK_VERSION = 2.0
+    CALLBACK_TYPE = 'aggregate'
+    CALLBACK_NAME = 'failure_summary'
+    CALLBACK_NEEDS_WHITELIST = False
+
+    def __init__(self):
+        super(CallbackModule, self).__init__()
+        self.__failures = []  # one dict(result=..., ignore_errors=...) per failed task
+
+    def v2_runner_on_failed(self, result, ignore_errors=False):
+        super(CallbackModule, self).v2_runner_on_failed(result, ignore_errors)
+        self.__failures.append(dict(result=result, ignore_errors=ignore_errors))
+
+    def v2_playbook_on_stats(self, stats):
+        super(CallbackModule, self).v2_playbook_on_stats(stats)
+        # TODO: update condition to consider a host var or env var to
+        # enable/disable the summary, so that we can control the output from a
+        # play.
+        if self.__failures:
+            self._print_failure_summary()
+
+    def _print_failure_summary(self):
+        '''Print a summary of failed tasks (including ignored failures).'''
+        self._display.display(u'\nFailure summary:\n')
+
+        # TODO: group failures by host or by task. If grouped by host, it is
+        # easy to see all problems of a given host. If grouped by task, it is
+        # easy to see which hosts need the same fix.
+
+        width = len(str(len(self.__failures)))  # digits in the largest item number
+        initial_indent_format = u'  {{:>{width}}}. '.format(width=width)
+        initial_indent_len = len(initial_indent_format.format(0))
+        subsequent_indent = u' ' * initial_indent_len
+        subsequent_extra_indent = u' ' * (initial_indent_len + 10)  # 10 matches the header column width in _format_failure
+
+        for i, failure in enumerate(self.__failures, 1):
+            lines = _format_failure(failure)
+            self._display.display(u'\n{}{}'.format(initial_indent_format.format(i), lines[0]))
+            for line in lines[1:]:
+                line = line.replace(u'\n', u'\n' + subsequent_extra_indent)
+                indented = u'{}{}'.format(subsequent_indent, line)
+                self._display.display(indented)
+
+
+# Reason: disable pylint protected-access because we need to access _*
+#         attributes of a task result to implement this method.
+# Status: permanently disabled unless Ansible's API changes.
+# pylint: disable=protected-access
+def _format_failure(failure):
+    '''Return a list of pretty-formatted unicode lines (host, play, task, message
+    and check details, if any) for a failure. Line separators are not included.'''
+    result = failure['result']
+    host = result._host.get_name()
+    play = _get_play(result._task)
+    if play:
+        play = play.get_name()
+    task = result._task.get_name()
+    msg = result._result.get('msg', u'???')
+    rows = (
+        (u'Host', host),
+        (u'Play', play),
+        (u'Task', task),
+        (u'Message', stringc(msg, C.COLOR_ERROR)),
+    )
+    if 'checks' in result._result:
+        rows += ((u'Details', stringc(pformat(result._result['checks']), C.COLOR_ERROR)),)
+    row_format = u'{:10}{}'  # unicode template so non-ASCII values format on Python 2
+    return [row_format.format(header + u':', body) for header, body in rows]
+
+
+# Reason: disable pylint protected-access because we need to access _*
+#         attributes of obj to implement this function.
+#         This is inspired by ansible.playbook.base.Base.dump_me.
+# Status: permanently disabled unless Ansible's API changes.
+# pylint: disable=protected-access
+def _get_play(obj):
+    '''Return the play owning a task or block, or None if none can be found.'''
+    if hasattr(obj, '_play'):
+        return obj._play
+    parent = getattr(obj, '_parent', None)  # objects without _parent end the search
+    return _get_play(parent) if parent else None
diff --git a/roles/openshift_health_checker/library/aos_version.py b/roles/openshift_health_checker/library/aos_version.py
new file mode 100755
index 000000000..37c8b483c
--- /dev/null
+++ b/roles/openshift_health_checker/library/aos_version.py
@@ -0,0 +1,85 @@
+#!/usr/bin/python
+# vim: expandtab:tabstop=4:shiftwidth=4
+'''
+Ansible module for determining if multiple versions of an OpenShift package are
+available, and if the version requested is available down to the given
+precision.
+
+Multiple versions available suggest that multiple repos are enabled for the
+different versions, which may cause installation problems.
+'''
+
+import yum  # pylint: disable=import-error
+
+from ansible.module_utils.basic import AnsibleModule
+
+
+def main():  # pylint: disable=missing-docstring
+    module = AnsibleModule(
+        argument_spec=dict(
+            version=dict(required=True)
+        ),
+        supports_check_mode=True
+    )
+
+    def bail(error):  # pylint: disable=missing-docstring
+        module.fail_json(msg=error)
+
+    yb = yum.YumBase()  # pylint: disable=invalid-name
+
+    # look up every available build of the expected atomic-openshift packages
+    expected_pkgs = [
+        'atomic-openshift',
+        'atomic-openshift-master',
+        'atomic-openshift-node',
+    ]
+    try:
+        pkgs = yb.pkgSack.returnPackages(patterns=expected_pkgs)
+    except yum.Errors.PackageSackError as e:  # pylint: disable=invalid-name
+        # you only hit this if *none* of the packages are available
+        bail('Unable to find any atomic-openshift packages. \nCheck your subscription and repo settings. \n%s' % e)
+
+    # determine what level of precision we're expecting for the version
+    expected_version = module.params['version']
+    if expected_version.startswith('v'):  # v3.3 => 3.3
+        expected_version = expected_version[1:]
+    num_dots = expected_version.count('.')  # e.g. '3.3' => 1, so two components are compared
+
+    pkgs_by_name_version = {}  # package name => {x.y version: True}
+    pkgs_precise_version_found = {}  # package name => True if some build matches expected_version
+    for pkg in pkgs:
+        # truncate the package version to as many components as the requested one
+        match_version = '.'.join(pkg.version.split('.')[:num_dots + 1])
+        if match_version == expected_version:
+            pkgs_precise_version_found[pkg.name] = True
+        # truncate to x.y so that several minor versions of one package can be detected
+        minor_version = '.'.join(pkg.version.split('.')[:2])
+        if pkg.name not in pkgs_by_name_version:
+            pkgs_by_name_version[pkg.name] = {}
+        pkgs_by_name_version[pkg.name][minor_version] = True
+
+    # see if any packages couldn't be found at requested version
+    # see if any packages are available in more than one minor version
+    not_found = []
+    multi_found = []
+    for name in expected_pkgs:
+        if name not in pkgs_precise_version_found:
+            not_found.append(name)
+        if name in pkgs_by_name_version and len(pkgs_by_name_version[name]) > 1:
+            multi_found.append(name)
+    if not_found:
+        msg = 'Not all of the required packages are available at requested version %s:\n' % expected_version
+        for name in not_found:
+            msg += '  %s\n' % name
+        bail(msg + 'Please check your subscriptions and enabled repositories.')
+    if multi_found:
+        msg = 'Multiple minor versions of these packages are available\n'
+        for name in multi_found:
+            msg += '  %s\n' % name
+        bail(msg + "There should only be one OpenShift version's repository enabled at a time.")
+
+    module.exit_json(changed=False)  # nothing is modified; reaching here means the check passed
+
+
+if __name__ == '__main__':
+    main()
diff --git a/roles/openshift_health_checker/library/check_yum_update.py b/roles/openshift_health_checker/library/check_yum_update.py
new file mode 100755
index 000000000..9bc14fd47
--- /dev/null
+++ b/roles/openshift_health_checker/library/check_yum_update.py
@@ -0,0 +1,105 @@
+#!/usr/bin/python
+# vim: expandtab:tabstop=4:shiftwidth=4
+'''
+Ansible module to test whether a yum update or install will succeed,
+without actually performing it or running yum.
+parameters:
+  packages: (optional) A list of package names to install or update.
+            If omitted, all installed RPMs are considered for updates.
+'''
+
+import sys
+
+import yum  # pylint: disable=import-error
+
+from ansible.module_utils.basic import AnsibleModule
+
+
+def main():  # pylint: disable=missing-docstring,too-many-branches
+    module = AnsibleModule(
+        argument_spec=dict(
+            packages=dict(type='list', default=[])
+        ),
+        supports_check_mode=True
+    )
+
+    def bail(error):  # pylint: disable=missing-docstring
+        module.fail_json(msg=error)
+
+    yb = yum.YumBase()  # pylint: disable=invalid-name
+    # determine if the existing yum configuration is valid
+    try:
+        yb.repos.populateSack(mdtype='metadata', cacheonly=1)
+    # for error of type:
+    #   1. can't reach the repo URL(s)
+    except yum.Errors.NoMoreMirrorsRepoError as e:  # pylint: disable=invalid-name
+        bail('Error getting data from at least one yum repository: %s' % e)
+    #   2. invalid repo definition
+    except yum.Errors.RepoError as e:  # pylint: disable=invalid-name
+        bail('Error with yum repository configuration: %s' % e)
+    #   3. other/unknown
+    #    * just report the problem verbatim
+    except:  # pylint: disable=bare-except; # noqa
+        bail('Unexpected error with yum repository: %s' % sys.exc_info()[1])
+
+    packages = module.params['packages']
+    no_such_pkg = []  # requested packages that yum could not find
+    for pkg in packages:
+        try:
+            yb.install(name=pkg)
+        except yum.Errors.InstallError:
+            no_such_pkg.append(pkg)
+        except:  # pylint: disable=bare-except; # noqa
+            bail('Unexpected error with yum install/update: %s' %
+                 sys.exc_info()[1])
+    if not packages:
+        # no packages requested means test a yum update of everything
+        yb.update()
+    elif no_such_pkg:
+        # wanted specific packages to install but some aren't available
+        user_msg = 'Cannot install all of the necessary packages. Unavailable:\n'
+        for pkg in no_such_pkg:
+            user_msg += '  %s\n' % pkg
+        user_msg += 'You may need to enable one or more yum repositories to make this content available.'
+        bail(user_msg)
+
+    try:
+        txn_result, txn_msgs = yb.buildTransaction()
+    except:  # pylint: disable=bare-except; # noqa
+        bail('Unexpected error during dependency resolution for yum update: \n %s' %
+             sys.exc_info()[1])
+
+    # find out if there are any errors with the update/install
+    if txn_result == 0:  # 'normal exit' meaning there's nothing to install/update
+        pass
+    elif txn_result == 1:  # error with transaction
+        user_msg = 'Could not perform a yum update.\n'
+        if txn_msgs:
+            user_msg += 'Errors from dependency resolution:\n'
+            for msg in txn_msgs:
+                user_msg += '  %s\n' % msg
+            user_msg += 'You should resolve these issues before proceeding with an install.\n'
+            user_msg += 'You may need to remove or downgrade packages or enable/disable yum repositories.'
+        bail(user_msg)
+    # TODO: it would be nice depending on the problem:
+    #   1. dependency for update not found
+    #    * construct the dependency tree
+    #    * find the installed package(s) that required the missing dep
+    #    * determine if any of these packages matter to openshift
+    #    * build helpful error output
+    #   2. conflicts among packages in available content
+    #    * analyze dependency tree and build helpful error output
+    #   3. other/unknown
+    #    * report the problem verbatim
+    #    * add to this list as we come across problems we can clearly diagnose
+    elif txn_result == 2:  # everything resolved fine
+        pass
+    else:
+        bail('Unknown error(s) from dependency resolution. Exit Code: %d:\n%s' %
+             (txn_result, txn_msgs))
+
+    module.exit_json(changed=False)  # the transaction is only built, never run
+
+
+if __name__ == '__main__':
+    main()
diff --git a/roles/openshift_health_checker/meta/main.yml b/roles/openshift_health_checker/meta/main.yml
new file mode 100644
index 000000000..0bbeadd34
--- /dev/null
+++ b/roles/openshift_health_checker/meta/main.yml
@@ -0,0 +1,3 @@
+---
+dependencies:
+  - role: openshift_facts
diff --git a/roles/openshift_health_checker/openshift_checks/__init__.py b/roles/openshift_health_checker/openshift_checks/__init__.py
new file mode 100644
index 000000000..d893ba591
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/__init__.py
@@ -0,0 +1,57 @@
+"""
+Health checks for OpenShift clusters.
+"""
+
+import os
+from abc import ABCMeta, abstractmethod, abstractproperty
+from importlib import import_module
+
+import six
+
+
+class OpenShiftCheckException(Exception):
+    """Raised by a check, or while loading checks, when it cannot proceed."""
+    pass
+
+
+@six.add_metaclass(ABCMeta)
+class OpenShiftCheck(object):
+    """A base class for defining checks for an OpenShift cluster environment."""
+
+    def __init__(self, module_executor):
+        self.module_executor = module_executor  # called as module_executor(module_name, args, tmp, task_vars)
+
+    @abstractproperty
+    def name(self):
+        """The unique name used to request this check, e.g. "package_update"."""
+        return "openshift_check"
+
+    @classmethod
+    def is_active(cls, task_vars):  # pylint: disable=unused-argument
+        """Returns true if this check applies to the ansible-playbook run."""
+        return True
+
+    @abstractmethod
+    def run(self, tmp, task_vars):
+        """Executes a check, normally implemented as a module, and returns its result."""
+        return {}
+
+    @classmethod
+    def subclasses(cls):
+        """Yields every direct and indirect subclass of this class, depth-first."""
+        for subclass in cls.__subclasses__():  # pylint: disable=no-member
+            yield subclass
+            for subclass in subclass.subclasses():
+                yield subclass
+
+
+# Dynamically import all submodules: defining the check classes makes them OpenShiftCheck subclasses.
+
+EXCLUDES = (  # files in this package that are not imported by the loop below
+    "__init__.py",
+    "mixins.py",
+)
+
+for name in os.listdir(os.path.dirname(__file__)):
+    if name.endswith(".py") and name not in EXCLUDES:
+        import_module(__package__ + "." + name[:-3])  # name[:-3] drops the ".py" suffix
diff --git a/roles/openshift_health_checker/openshift_checks/mixins.py b/roles/openshift_health_checker/openshift_checks/mixins.py
new file mode 100644
index 000000000..4e0415944
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/mixins.py
@@ -0,0 +1,24 @@
+# pylint: disable=missing-docstring
+from openshift_checks import OpenShiftCheckException
+
+
+class NotContainerized(object):
+    """Mixin for checks that are only active when not in containerized mode."""
+
+    @classmethod
+    def is_active(cls, task_vars):
+        return (
+            # This mixin is meant to be used with subclasses of
+            # OpenShiftCheck. Pylint disables this by default on mixins,
+            # though it relies on the class name ending in 'mixin'.
+            # pylint: disable=no-member
+            super(NotContainerized, cls).is_active(task_vars) and
+            not cls.is_containerized(task_vars)
+        )
+
+    @staticmethod
+    def is_containerized(task_vars):  # returns task_vars["openshift"]["common"]["is_containerized"]
+        try:
+            return task_vars["openshift"]["common"]["is_containerized"]
+        except (KeyError, TypeError):  # a key is missing or a parent value is not subscriptable
+            raise OpenShiftCheckException("'openshift.common.is_containerized' is undefined")
diff --git a/roles/openshift_health_checker/openshift_checks/package_availability.py b/roles/openshift_health_checker/openshift_checks/package_availability.py
new file mode 100644
index 000000000..4260cbf7c
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/package_availability.py
@@ -0,0 +1,69 @@
+# pylint: disable=missing-docstring
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+from openshift_checks.mixins import NotContainerized
+
+
+class PackageAvailability(NotContainerized, OpenShiftCheck):
+    """Check that required RPM packages are available."""
+
+    name = "package_availability"
+
+    def run(self, tmp, task_vars):
+        try:
+            rpm_prefix = task_vars["openshift"]["common"]["service_type"]
+        except (KeyError, TypeError):
+            raise OpenShiftCheckException("'openshift.common.service_type' is undefined")
+
+        group_names = task_vars.get("group_names", [])
+
+        packages = set()  # a set, so packages needed by both roles are listed once
+
+        if "masters" in group_names:
+            packages.update(self.master_packages(rpm_prefix))
+        if "nodes" in group_names:
+            packages.update(self.node_packages(rpm_prefix))
+        # NOTE(review): an empty list makes check_yum_update test a full update - intended?
+        args = {"packages": sorted(packages)}
+        return self.module_executor("check_yum_update", args, tmp, task_vars)
+
+    @staticmethod
+    def master_packages(rpm_prefix):
+        return [
+            "{rpm_prefix}".format(rpm_prefix=rpm_prefix),
+            "{rpm_prefix}-clients".format(rpm_prefix=rpm_prefix),
+            "{rpm_prefix}-master".format(rpm_prefix=rpm_prefix),
+            "bash-completion",
+            "cockpit-bridge",
+            "cockpit-docker",
+            "cockpit-kubernetes",
+            "cockpit-shell",
+            "cockpit-ws",
+            "etcd",
+            "httpd-tools",
+        ]
+
+    @staticmethod
+    def node_packages(rpm_prefix):
+        return [
+            "{rpm_prefix}".format(rpm_prefix=rpm_prefix),
+            "{rpm_prefix}-node".format(rpm_prefix=rpm_prefix),
+            "{rpm_prefix}-sdn-ovs".format(rpm_prefix=rpm_prefix),
+            "bind",
+            "ceph-common",
+            "dnsmasq",
+            "docker",
+            "firewalld",
+            "flannel",
+            "glusterfs-fuse",
+            "iptables-services",
+            "iptables",
+            "iscsi-initiator-utils",
+            "libselinux-python",
+            "nfs-utils",
+            "ntp",
+            "openssl",
+            "pyparted",
+            "python-httplib2",
+            "PyYAML",
+            "yum-utils",
+        ]
diff --git a/roles/openshift_health_checker/openshift_checks/package_update.py b/roles/openshift_health_checker/openshift_checks/package_update.py
new file mode 100644
index 000000000..316a776f5
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/package_update.py
@@ -0,0 +1,13 @@
+# pylint: disable=missing-docstring
+from openshift_checks import OpenShiftCheck
+from openshift_checks.mixins import NotContainerized
+
+
+class PackageUpdate(NotContainerized, OpenShiftCheck):
+    """Check that there are no conflicts in RPM packages."""
+
+    name = "package_update"
+
+    def run(self, tmp, task_vars):
+        args = {"packages": []}  # empty list: check_yum_update then tests an update of everything
+        return self.module_executor("check_yum_update", args, tmp, task_vars)
diff --git a/roles/openshift_health_checker/openshift_checks/package_version.py b/roles/openshift_health_checker/openshift_checks/package_version.py
new file mode 100644
index 000000000..a473119f3
--- /dev/null
+++ b/roles/openshift_health_checker/openshift_checks/package_version.py
@@ -0,0 +1,25 @@
+# pylint: disable=missing-docstring
+from openshift_checks import OpenShiftCheck, OpenShiftCheckException
+from openshift_checks.mixins import NotContainerized
+
+
+class PackageVersion(NotContainerized, OpenShiftCheck):
+    """Check that available RPM packages match the required versions."""
+
+    name = "package_version"
+
+    @classmethod
+    def is_active(cls, task_vars):  # active only when deployment_type is "openshift-enterprise"
+        return (
+            super(PackageVersion, cls).is_active(task_vars)
+            and task_vars.get("deployment_type") == "openshift-enterprise"
+        )
+
+    def run(self, tmp, task_vars):
+        try:
+            openshift_release = task_vars["openshift_release"]
+        except (KeyError, TypeError):
+            raise OpenShiftCheckException("'openshift_release' is undefined")
+
+        args = {"version": openshift_release}  # aos_version strips a leading "v" itself
+        return self.module_executor("aos_version", args, tmp, task_vars)