diff options
Diffstat (limited to 'roles/os_zabbix')
-rw-r--r-- | roles/os_zabbix/README.md | 40 | ||||
-rw-r--r-- | roles/os_zabbix/defaults/main.yml | 1 | ||||
-rw-r--r-- | roles/os_zabbix/handlers/main.yml | 1 | ||||
-rwxr-xr-x | roles/os_zabbix/library/zbxapi.py | 370 | ||||
-rw-r--r-- | roles/os_zabbix/meta/main.yml | 9 | ||||
-rw-r--r-- | roles/os_zabbix/tasks/main.yml | 81 | ||||
-rw-r--r-- | roles/os_zabbix/vars/main.yml | 1 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_app_zabbix_agent.yml | 23 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_app_zabbix_server.yml | 412 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_docker.yml | 94 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_heartbeat.yml | 13 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_openshift_master.yml | 58 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_openshift_node.yml | 44 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_ops_tools.yml | 23 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_os_linux.yml | 260 |
15 files changed, 1060 insertions, 370 deletions
diff --git a/roles/os_zabbix/README.md b/roles/os_zabbix/README.md new file mode 100644 index 000000000..ac3dc2833 --- /dev/null +++ b/roles/os_zabbix/README.md @@ -0,0 +1,40 @@ +os_zabbix +========= + +Automate zabbix tasks. + +Requirements +------------ + +This requires the openshift_tools rpm be installed for the zbxapi.py library. It can be found here: https://github.com/openshift/openshift-tools under openshift_tools/monitoring/zbxapi.py for now. + +Role Variables +-------------- + +zab_server +zab_username +zab_password + +Dependencies +------------ + +This depeonds on the zbxapi.py library located here: https://github.com/openshift/openshift-tools under openshift_tools/monitoring/zbxapi.py for now. + +Example Playbook +---------------- + + - zbx_host: + server: zab_server + user: zab_user + password: zab_password + name: 'myhost' + +License +------- + +ASL 2.0 + +Author Information +------------------ + +OpenShift operations, Red Hat, Inc diff --git a/roles/os_zabbix/defaults/main.yml b/roles/os_zabbix/defaults/main.yml new file mode 100644 index 000000000..ed97d539c --- /dev/null +++ b/roles/os_zabbix/defaults/main.yml @@ -0,0 +1 @@ +--- diff --git a/roles/os_zabbix/handlers/main.yml b/roles/os_zabbix/handlers/main.yml new file mode 100644 index 000000000..ed97d539c --- /dev/null +++ b/roles/os_zabbix/handlers/main.yml @@ -0,0 +1 @@ +--- diff --git a/roles/os_zabbix/library/zbxapi.py b/roles/os_zabbix/library/zbxapi.py deleted file mode 100755 index b5fa5ee2b..000000000 --- a/roles/os_zabbix/library/zbxapi.py +++ /dev/null @@ -1,370 +0,0 @@ -#!/usr/bin/env python -# vim: expandtab:tabstop=4:shiftwidth=4 -''' - ZabbixAPI ansible module -''' - -# Copyright 2015 Red Hat Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Purpose: An ansible module to communicate with zabbix. -# - -# pylint: disable=line-too-long -# Disabling line length for readability - -import json -import httplib2 -import sys -import os -import re -import copy - -class ZabbixAPIError(Exception): - ''' - ZabbixAPIError - Exists to propagate errors up from the api - ''' - pass - -class ZabbixAPI(object): - ''' - ZabbixAPI class - ''' - classes = { - 'Action': ['create', 'delete', 'get', 'update'], - 'Alert': ['get'], - 'Application': ['create', 'delete', 'get', 'massadd', 'update'], - 'Configuration': ['export', 'import'], - 'Dcheck': ['get'], - 'Dhost': ['get'], - 'Drule': ['copy', 'create', 'delete', 'get', 'isreadable', 'iswritable', 'update'], - 'Dservice': ['get'], - 'Event': ['acknowledge', 'get'], - 'Graph': ['create', 'delete', 'get', 'update'], - 'Graphitem': ['get'], - 'Graphprototype': ['create', 'delete', 'get', 'update'], - 'History': ['get'], - 'Hostgroup': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'massadd', 'massremove', 'massupdate', 'update'], - 'Hostinterface': ['create', 'delete', 'get', 'massadd', 'massremove', 'replacehostinterfaces', 'update'], - 'Host': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'massadd', 'massremove', 'massupdate', 'update'], - 'Hostprototype': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update'], - 'Httptest': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update'], - 'Iconmap': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update'], - 'Image': ['create', 'delete', 'get', 'update'], - 'Item': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update'], - 'Itemprototype': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update'], - 'Maintenance': ['create', 'delete', 'get', 'update'], - 'Map': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update'], - 'Mediatype': ['create', 'delete', 'get', 'update'], - 'Proxy': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update'], - 'Screen': ['create', 'delete', 'get', 'update'], - 'Screenitem': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'update', 'updatebyposition'], - 'Script': ['create', 'delete', 'execute', 'get', 'getscriptsbyhosts', 'update'], - 'Service': ['adddependencies', 'addtimes', 'create', 'delete', 'deletedependencies', 'deletetimes', 'get', 'getsla', 'isreadable', 'iswritable', 'update'], - 'Template': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'massadd', 'massremove', 'massupdate', 'update'], - 'Templatescreen': ['copy', 'create', 'delete', 'get', 'isreadable', 'iswritable', 'update'], - 'Templatescreenitem': ['get'], - 'Trigger': ['adddependencies', 'create', 'delete', 'deletedependencies', 'get', 'isreadable', 'iswritable', 'update'], - 'Triggerprototype': ['create', 'delete', 'get', 'update'], - 'User': ['addmedia', 'create', 'delete', 'deletemedia', 'get', 'isreadable', 'iswritable', 'login', 'logout', 'update', 'updatemedia', 'updateprofile'], - 'Usergroup': ['create', 'delete', 'get', 'isreadable', 'iswritable', 'massadd', 'massupdate', 'update'], - 'Usermacro': ['create', 'createglobal', 'delete', 'deleteglobal', 'get', 'update', 'updateglobal'], - 'Usermedia': ['get'], - } - - def __init__(self, data=None): - if not data: - data = {} - self.server = data.get('server', None) - self.username = data.get('user', None) - self.password = data.get('password', None) - if any([value == None for value in [self.server, self.username, self.password]]): - print 'Please specify zabbix server url, username, and password.' - sys.exit(1) - - self.verbose = data.get('verbose', False) - self.use_ssl = data.has_key('use_ssl') - self.auth = None - - for cname, _ in self.classes.items(): - setattr(self, cname.lower(), getattr(self, cname)(self)) - - # pylint: disable=no-member - # This method does not exist until the metaprogramming executed - # This is permanently disabled. - results = self.user.login(user=self.username, password=self.password) - - if results[0]['status'] == '200': - if results[1].has_key('result'): - self.auth = results[1]['result'] - elif results[1].has_key('error'): - print "Unable to authenticate with zabbix server. {0} ".format(results[1]['error']) - sys.exit(1) - else: - print "Error in call to zabbix. Http status: {0}.".format(results[0]['status']) - sys.exit(1) - - def perform(self, method, rpc_params): - ''' - This method calls your zabbix server. - - It requires the following parameters in order for a proper request to be processed: - jsonrpc - the version of the JSON-RPC protocol used by the API; - the Zabbix API implements JSON-RPC version 2.0; - method - the API method being called; - rpc_params - parameters that will be passed to the API method; - id - an arbitrary identifier of the request; - auth - a user authentication token; since we don't have one yet, it's set to null. - ''' - http_method = "POST" - jsonrpc = "2.0" - rid = 1 - - http = None - if self.use_ssl: - http = httplib2.Http() - else: - http = httplib2.Http(disable_ssl_certificate_validation=True,) - - headers = {} - headers["Content-type"] = "application/json" - - body = { - "jsonrpc": jsonrpc, - "method": method, - "params": rpc_params.get('params', {}), - "id": rid, - 'auth': self.auth, - } - - if method in ['user.login', 'api.version']: - del body['auth'] - - body = json.dumps(body) - - if self.verbose: - print body - print method - print headers - httplib2.debuglevel = 1 - - response, content = http.request(self.server, http_method, body, headers) - - if response['status'] not in ['200', '201']: - raise ZabbixAPIError('Error calling zabbix. Zabbix returned %s' % response['status']) - - if self.verbose: - print response - print content - - try: - content = json.loads(content) - except ValueError as err: - content = {"error": err.message} - - return response, content - - @staticmethod - def meta(cname, method_names): - ''' - This bit of metaprogramming is where the ZabbixAPI subclasses are created. - For each of ZabbixAPI.classes we create a class from the key and methods - from the ZabbixAPI.classes values. We pass a reference to ZabbixAPI class - to each subclass in order for each to be able to call the perform method. - ''' - def meta_method(_class, method_name): - ''' - This meta method allows a class to add methods to it. - ''' - # This template method is a stub method for each of the subclass - # methods. - def template_method(self, params=None, **rpc_params): - ''' - This template method is a stub method for each of the subclass methods. - ''' - if params: - rpc_params['params'] = params - else: - rpc_params['params'] = copy.deepcopy(rpc_params) - - return self.parent.perform(cname.lower()+"."+method_name, rpc_params) - - template_method.__doc__ = \ - "https://www.zabbix.com/documentation/2.4/manual/api/reference/%s/%s" % \ - (cname.lower(), method_name) - template_method.__name__ = method_name - # this is where the template method is placed inside of the subclass - # e.g. setattr(User, "create", stub_method) - setattr(_class, template_method.__name__, template_method) - - # This class call instantiates a subclass. e.g. User - _class = type(cname, - (object,), - {'__doc__': \ - "https://www.zabbix.com/documentation/2.4/manual/api/reference/%s" % cname.lower()}) - def __init__(self, parent): - ''' - This init method gets placed inside of the _class - to allow it to be instantiated. A reference to the parent class(ZabbixAPI) - is passed in to allow each class access to the perform method. - ''' - self.parent = parent - - # This attaches the init to the subclass. e.g. Create - setattr(_class, __init__.__name__, __init__) - # For each of our ZabbixAPI.classes dict values - # Create a method and attach it to our subclass. - # e.g. 'User': ['delete', 'get', 'updatemedia', 'updateprofile', - # 'update', 'iswritable', 'logout', 'addmedia', 'create', - # 'login', 'deletemedia', 'isreadable'], - # User.delete - # User.get - for method_name in method_names: - meta_method(_class, method_name) - # Return our subclass with all methods attached - return _class - -# Attach all ZabbixAPI.classes to ZabbixAPI class through metaprogramming -for _class_name, _method_names in ZabbixAPI.classes.items(): - setattr(ZabbixAPI, _class_name, ZabbixAPI.meta(_class_name, _method_names)) - -def exists(content, key='result'): - ''' Check if key exists in content or the size of content[key] > 0 - ''' - if not content.has_key(key): - return False - - if not content[key]: - return False - - return True - -def diff_content(from_zabbix, from_user): - ''' Compare passed in object to results returned from zabbix - ''' - terms = ['search', 'output', 'groups', 'select', 'expand'] - regex = '(' + '|'.join(terms) + ')' - retval = {} - for key, value in from_user.items(): - if re.findall(regex, key): - continue - - if from_zabbix[key] != str(value): - retval[key] = str(value) - - return retval - -def main(): - ''' - This main method runs the ZabbixAPI Ansible Module - ''' - - module = AnsibleModule( - argument_spec=dict( - server=dict(default='https://localhost/zabbix/api_jsonrpc.php', type='str'), - user=dict(default=None, type='str'), - password=dict(default=None, type='str'), - zbx_class=dict(choices=ZabbixAPI.classes.keys()), - params=dict(), - debug=dict(default=False, type='bool'), - state=dict(default='present', type='str'), - ), - #supports_check_mode=True - ) - - user = module.params.get('user', None) - if not user: - user = os.environ['ZABBIX_USER'] - - passwd = module.params.get('password', None) - if not passwd: - passwd = os.environ['ZABBIX_PASSWORD'] - - - - api_data = { - 'user': user, - 'password': passwd, - 'server': module.params['server'], - 'verbose': module.params['debug'] - } - - if not user or not passwd or not module.params['server']: - module.fail_json(msg='Please specify the user, password, and the zabbix server.') - - zapi = ZabbixAPI(api_data) - - zbx_class = module.params.get('zbx_class') - rpc_params = module.params.get('params', {}) - state = module.params.get('state') - - # Get the instance we are trying to call - zbx_class_inst = zapi.__getattribute__(zbx_class.lower()) - - # perform get - # Get the instance's method we are trying to call - - zbx_action_method = zapi.__getattribute__(zbx_class.capitalize()).__dict__['get'] - _, content = zbx_action_method(zbx_class_inst, rpc_params) - - if state == 'list': - module.exit_json(changed=False, results=content['result'], state="list") - - if state == 'absent': - if not exists(content): - module.exit_json(changed=False, state="absent") - # If we are coming from a query, we need to pass in the correct rpc_params for delete. - # specifically the zabbix class name + 'id' - # if rpc_params is a list then we need to pass it. (list of ids to delete) - idname = zbx_class.lower() + "id" - if not isinstance(rpc_params, list) and content['result'][0].has_key(idname): - rpc_params = [content['result'][0][idname]] - - zbx_action_method = zapi.__getattribute__(zbx_class.capitalize()).__dict__['delete'] - _, content = zbx_action_method(zbx_class_inst, rpc_params) - module.exit_json(changed=True, results=content['result'], state="absent") - - if state == 'present': - # It's not there, create it! - if not exists(content): - zbx_action_method = zapi.__getattribute__(zbx_class.capitalize()).__dict__['create'] - _, content = zbx_action_method(zbx_class_inst, rpc_params) - module.exit_json(changed=True, results=content['result'], state='present') - - # It's there and the same, do nothing! - diff_params = diff_content(content['result'][0], rpc_params) - if not diff_params: - module.exit_json(changed=False, results=content['result'], state="present") - - # Add the id to update with - idname = zbx_class.lower() + "id" - diff_params[idname] = content['result'][0][idname] - - - ## It's there and not the same, update it! - zbx_action_method = zapi.__getattribute__(zbx_class.capitalize()).__dict__['update'] - _, content = zbx_action_method(zbx_class_inst, diff_params) - module.exit_json(changed=True, results=content, state="present") - - module.exit_json(failed=True, - changed=False, - results='Unknown state passed. %s' % state, - state="unknown") - -# pylint: disable=redefined-builtin, unused-wildcard-import, wildcard-import, locally-disabled -# import module snippets. This are required -from ansible.module_utils.basic import * - -main() diff --git a/roles/os_zabbix/meta/main.yml b/roles/os_zabbix/meta/main.yml new file mode 100644 index 000000000..360f5aad2 --- /dev/null +++ b/roles/os_zabbix/meta/main.yml @@ -0,0 +1,9 @@ +--- +galaxy_info: + author: OpenShift + description: ZabbixAPI + company: Red Hat, Inc + license: ASL 2.0 + min_ansible_version: 1.2 +dependencies: +- lib_zabbix diff --git a/roles/os_zabbix/tasks/main.yml b/roles/os_zabbix/tasks/main.yml new file mode 100644 index 000000000..a503b24d7 --- /dev/null +++ b/roles/os_zabbix/tasks/main.yml @@ -0,0 +1,81 @@ +--- +- name: Main List all templates + zbx_template: + zbx_server: "{{ ozb_server }}" + zbx_user: "{{ ozb_user }}" + zbx_password: "{{ ozb_password }}" + state: list + register: templates + +- include_vars: template_heartbeat.yml +- include_vars: template_os_linux.yml +- include_vars: template_docker.yml +- include_vars: template_openshift_master.yml +- include_vars: template_openshift_node.yml +- include_vars: template_ops_tools.yml +- include_vars: template_app_zabbix_server.yml +- include_vars: template_app_zabbix_agent.yml + +- name: Include Template Heartbeat + include: ../../lib_zabbix/tasks/create_template.yml + vars: + template: "{{ g_template_heartbeat }}" + server: "{{ ozb_server }}" + user: "{{ ozb_user }}" + password: "{{ ozb_password }}" + +- name: Include Template os_linux + include: ../../lib_zabbix/tasks/create_template.yml + vars: + template: "{{ g_template_os_linux }}" + server: "{{ ozb_server }}" + user: "{{ ozb_user }}" + password: "{{ ozb_password }}" + +- name: Include Template docker + include: ../../lib_zabbix/tasks/create_template.yml + vars: + template: "{{ g_template_docker }}" + server: "{{ ozb_server }}" + user: "{{ ozb_user }}" + password: "{{ ozb_password }}" + +- name: Include Template Openshift Master + include: ../../lib_zabbix/tasks/create_template.yml + vars: + template: "{{ g_template_openshift_master }}" + server: "{{ ozb_server }}" + user: "{{ ozb_user }}" + password: "{{ ozb_password }}" + +- name: Include Template Openshift Node + include: ../../lib_zabbix/tasks/create_template.yml + vars: + template: "{{ g_template_openshift_node }}" + server: "{{ ozb_server }}" + user: "{{ ozb_user }}" + password: "{{ ozb_password }}" + +- name: Include Template Ops Tools + include: ../../lib_zabbix/tasks/create_template.yml + vars: + template: "{{ g_template_ops_tools }}" + server: "{{ ozb_server }}" + user: "{{ ozb_user }}" + password: "{{ ozb_password }}" + +- name: Include Template App Zabbix Server + include: ../../lib_zabbix/tasks/create_template.yml + vars: + template: "{{ g_template_app_zabbix_server }}" + server: "{{ ozb_server }}" + user: "{{ ozb_user }}" + password: "{{ ozb_password }}" + +- name: Include Template App Zabbix Agent + include: ../../lib_zabbix/tasks/create_template.yml + vars: + template: "{{ g_template_app_zabbix_agent }}" + server: "{{ ozb_server }}" + user: "{{ ozb_user }}" + password: "{{ ozb_password }}" diff --git a/roles/os_zabbix/vars/main.yml b/roles/os_zabbix/vars/main.yml new file mode 100644 index 000000000..ed97d539c --- /dev/null +++ b/roles/os_zabbix/vars/main.yml @@ -0,0 +1 @@ +--- diff --git a/roles/os_zabbix/vars/template_app_zabbix_agent.yml b/roles/os_zabbix/vars/template_app_zabbix_agent.yml new file mode 100644 index 000000000..d636d4822 --- /dev/null +++ b/roles/os_zabbix/vars/template_app_zabbix_agent.yml @@ -0,0 +1,23 @@ +--- +g_template_app_zabbix_agent: + name: Template App Zabbix Agent + zitems: + - key: agent.hostname + applications: + - Zabbix agent + value_type: character + zabbix_type: agent + + - key: agent.ping + applications: + - Zabbix agent + description: The agent always returns 1 for this item. It could be used in combination with nodata() for availability check. + value_type: int + zabbix_type: agent + + ztriggers: + - name: '[Reboot] Zabbix agent on {HOST.NAME} is unreachable for 15 minutes' + description: Zabbix agent is unreachable for 15 minutes. + expression: '{Template App Zabbix Agent:agent.ping.nodata(15m)}=1' + priority: high + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_ping.asciidoc diff --git a/roles/os_zabbix/vars/template_app_zabbix_server.yml b/roles/os_zabbix/vars/template_app_zabbix_server.yml new file mode 100644 index 000000000..43517113b --- /dev/null +++ b/roles/os_zabbix/vars/template_app_zabbix_server.yml @@ -0,0 +1,412 @@ +--- +g_template_app_zabbix_server: + name: Template App Zabbix Server + zitems: + - key: housekeeper_creates + applications: + - Zabbix server + description: A simple count of the number of partition creates output by the housekeeper script. + units: '' + value_type: int + zabbix_type: internal + + - key: housekeeper_drops + applications: + - Zabbix server + description: A simple count of the number of partition drops output by the housekeeper script. + units: '' + value_type: int + zabbix_type: internal + + - key: housekeeper_errors + applications: + - Zabbix server + description: A simple count of the number of errors output by the housekeeper script. + units: '' + value_type: int + zabbix_type: internal + + - key: housekeeper_total + applications: + - Zabbix server + description: A simple count of the total number of lines output by the housekeeper + script. + units: '' + value_type: int + zabbix_type: internal + + - key: zabbix[process,alerter,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,configuration syncer,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,db watchdog,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,discoverer,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,escalator,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,history syncer,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,housekeeper,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,http poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,icmp pinger,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,ipmi poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,java poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,node watcher,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,proxy poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,self-monitoring,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,snmp trapper,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,timer,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,trapper,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[process,unreachable poller,avg,busy] + applications: + - Zabbix server + description: '' + units: '%' + value_type: float + zabbix_type: internal + + - key: zabbix[queue,10m] + applications: + - Zabbix server + description: '' + units: '' + value_type: int + zabbix_type: internal + interval: 600 + + - key: zabbix[queue] + applications: + - Zabbix server + description: '' + units: '' + value_type: int + zabbix_type: internal + interval: 600 + + - key: zabbix[rcache,buffer,pfree] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: internal + + - key: zabbix[wcache,history,pfree] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: internal + + - key: zabbix[wcache,text,pfree] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: internal + + - key: zabbix[wcache,trend,pfree] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: internal + + - key: zabbix[wcache,values] + applications: + - Zabbix server + description: '' + units: '' + value_type: float + zabbix_type: internal + delta: 1 # speed per second + + ztriggers: + - description: "There has been unexpected output while running the housekeeping script\ + \ on the Zabbix. There are only three kinds of lines we expect to see in the output,\ + \ and we've gotten something enw.\r\n\r\nCheck the script's output in /var/lib/zabbix/state\ + \ for more details." + expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}+{Template App Zabbix Server:housekeeper_creates.last(0)}+{Template App Zabbix Server:housekeeper_drops.last(0)}<>{Template App Zabbix Server:housekeeper_total.last(0)}' + name: Unexpected output in Zabbix DB Housekeeping + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_DB_Housekeeping.asciidoc + + - description: An error has occurred during running the housekeeping script on the Zabbix. Check the script's output in /var/lib/zabbix/state for more details. + expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}>0' + name: Errors during Zabbix DB Housekeeping + priority: high + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,alerter,avg,busy].min(600)}>75' + name: Zabbix alerter processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,configuration syncer,avg,busy].min(600)}>75' + name: Zabbix configuration syncer processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,db watchdog,avg,busy].min(600)}>75' + name: Zabbix db watchdog processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,discoverer,avg,busy].min(600)}>75' + name: Zabbix discoverer processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,escalator,avg,busy].min(600)}>75' + name: Zabbix escalator processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,history syncer,avg,busy].min(600)}>75' + name: Zabbix history syncer processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,housekeeper,avg,busy].min(1800)}>75' + name: Zabbix housekeeper processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,http poller,avg,busy].min(600)}>75' + name: Zabbix http poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,icmp pinger,avg,busy].min(600)}>75' + name: Zabbix icmp pinger processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,ipmi poller,avg,busy].min(600)}>75' + name: Zabbix ipmi poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,java poller,avg,busy].min(600)}>75' + name: Zabbix java poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,node watcher,avg,busy].min(600)}>75' + name: Zabbix node watcher processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,poller,avg,busy].min(600)}>75' + name: Zabbix poller processes more than 75% busy + priority: high + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,proxy poller,avg,busy].min(600)}>75' + name: Zabbix proxy poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,self-monitoring,avg,busy].min(600)}>75' + name: Zabbix self-monitoring processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,snmp trapper,avg,busy].min(600)}>75' + name: Zabbix snmp trapper processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: Timer processes usually are busy because they have to process time + based trigger functions + expression: '{Template App Zabbix Server:zabbix[process,timer,avg,busy].min(600)}>75' + name: Zabbix timer processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,trapper,avg,busy].min(600)}>75' + name: Zabbix trapper processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[process,unreachable poller,avg,busy].min(600)}>75' + name: Zabbix unreachable poller processes more than 75% busy + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc + + - description: "This alert generally indicates a performance problem or a problem\ + \ with the zabbix-server or proxy.\r\n\r\nThe first place to check for issues\ + \ is Administration > Queue. Be sure to check the general view and the per-proxy\ + \ view." + expression: '{Template App Zabbix Server:zabbix[queue,10m].min(600)}>1000' + name: More than 1000 items having missing data for more than 10 minutes + priority: high + url: https://github.com/openshift/ops-sop/blob/master/Alerts/data_lost_overview_plugin.asciidoc + + - description: Consider increasing CacheSize in the zabbix_server.conf configuration + file + expression: '{Template App Zabbix Server:zabbix[rcache,buffer,pfree].min(600)}<5' + name: Less than 5% free in the configuration cache + priority: info + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[wcache,history,pfree].min(600)}<25' + name: Less than 25% free in the history cache + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[wcache,text,pfree].min(600)}<25' + name: Less than 25% free in the text history cache + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc + + - description: '' + expression: '{Template App Zabbix Server:zabbix[wcache,trend,pfree].min(600)}<25' + name: Less than 25% free in the trends cache + priority: avg + url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc diff --git a/roles/os_zabbix/vars/template_docker.yml b/roles/os_zabbix/vars/template_docker.yml new file mode 100644 index 000000000..bfabf50c5 --- /dev/null +++ b/roles/os_zabbix/vars/template_docker.yml @@ -0,0 +1,94 @@ +--- +g_template_docker: + name: Template Docker + zitems: + - key: docker.ping + applications: + - Docker Daemon + value_type: int + + - key: docker.info_elapsed_ms + applications: + - Docker Daemon + value_type: int + + - key: docker.storage.is_loopback + applications: + - Docker Storage + value_type: int + + - key: docker.storage.data.space.total + applications: + - Docker Storage + value_type: float + + - key: docker.storage.data.space.used + applications: + - Docker Storage + value_type: float + + - key: docker.storage.data.space.available + applications: + - Docker Storage + value_type: float + + - key: docker.storage.data.space.percent_available + applications: + - Docker Storage + value_type: float + + - key: docker.storage.metadata.space.total + applications: + - Docker Storage + value_type: float + + - key: docker.storage.metadata.space.used + applications: + - Docker Storage + value_type: float + + - key: docker.storage.metadata.space.available + applications: + - Docker Storage + value_type: float + + - key: docker.storage.metadata.space.percent_available + applications: + - Docker Storage + value_type: float + ztriggers: + - name: 'docker.ping failed on {HOST.NAME}' + expression: '{Template Docker:docker.ping.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_ping.asciidoc' + priority: high + + - name: 'Docker storage is using LOOPBACK on {HOST.NAME}' + expression: '{Template Docker:docker.storage.is_loopback.last()}<>0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_loopback.asciidoc' + priority: high + + - name: 'Critically low docker storage data space on {HOST.NAME}' + expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.data.space.available.max(#3)}<5' # < 5% or < 5GB + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc' + priority: high + + - name: 'Critically low docker storage metadata space on {HOST.NAME}' + expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.005' # < 5% or < 5MB + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc' + priority: high + + # Put triggers that depend on other triggers here (deps must be created first) + - name: 'Low docker storage data space on {HOST.NAME}' + expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.data.space.available.max(#3)}<10' # < 10% or < 10GB + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc' + dependencies: + - 'Critically low docker storage data space on {HOST.NAME}' + priority: average + + - name: 'Low docker storage metadata space on {HOST.NAME}' + expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.01' # < 10% or < 10MB + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc' + dependencies: + - 'Critically low docker storage metadata space on {HOST.NAME}' + priority: average + diff --git a/roles/os_zabbix/vars/template_heartbeat.yml b/roles/os_zabbix/vars/template_heartbeat.yml new file mode 100644 index 000000000..8dbe0d0d6 --- /dev/null +++ b/roles/os_zabbix/vars/template_heartbeat.yml @@ -0,0 +1,13 @@ +--- +g_template_heartbeat: + name: Template Heartbeat + zitems: + - name: Heartbeat Ping + applications: + - Heartbeat + key: heartbeat.ping + ztriggers: + - name: 'Heartbeat.ping has failed on {HOST.NAME}' + expression: '{Template Heartbeat:heartbeat.ping.nodata(20m)}=1' + priority: avg + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_node_heartbeat.asciidoc' diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml new file mode 100644 index 000000000..1de4fefbb --- /dev/null +++ b/roles/os_zabbix/vars/template_openshift_master.yml @@ -0,0 +1,58 @@ +--- +g_template_openshift_master: + name: Template Openshift Master + zitems: + - name: create_app + applications: + - Openshift Master + key: create_app + + - key: openshift.master.process.count + description: Shows number of master processes running + type: int + applications: + - Openshift Master + + - key: openshift.master.user.count + description: Shows number of users in a cluster + type: int + applications: + - Openshift Master + + - key: openshift.master.pod.running.count + description: Shows number of pods running + type: int + applications: + - Openshift Master + + - key: openshift.project.counter + description: Shows number of projects on a cluster + type: int + applications: + - Openshift Master + + ztriggers: + - name: 'Application creation has failed on {HOST.NAME}' + expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' + priority: avg + + - name: 'Openshift Master process not running on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + priority: high + + - name: 'Too many Openshift Master processes running on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + priority: high + + - name: 'Number of users for Openshift Master on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.user.count.last()}=0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + priority: info + + - name: 'There are no projects running on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.project.counter.last()}=0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + priority: info diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml new file mode 100644 index 000000000..ce28b1048 --- /dev/null +++ b/roles/os_zabbix/vars/template_openshift_node.yml @@ -0,0 +1,44 @@ +--- +g_template_openshift_node: + name: Template Openshift Node + zitems: + - key: openshift.node.process.count + description: Shows number of OpenShift Node processes running + type: int + applications: + - Openshift Node + + - key: openshift.node.ovs.pids.count + description: Shows number of ovs process ids running + type: int + applications: + - Openshift Node + + - key: openshift.node.ovs.ports.count + description: Shows number of OVS ports defined + type: int + applications: + - Openshift Node + + ztriggers: + - name: 'Openshift Node process not running on {HOST.NAME}' + expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' + priority: high + + - name: 'Too many Openshift Node processes running on {HOST.NAME}' + expression: '{Template Openshift Node:openshift.node.process.count.min(#3)}>1' + url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' + priority: high + + - name: 'OVS may not be running on {HOST.NAME}' + expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last()}<>4' + url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' + priority: high + + - name: 'Number of OVS ports is 0 on {HOST.NAME}' + expression: '{Template Openshift Node:openshift.node.ovs.ports.count.last()}=0' + url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' + priority: high + + diff --git a/roles/os_zabbix/vars/template_ops_tools.yml b/roles/os_zabbix/vars/template_ops_tools.yml new file mode 100644 index 000000000..d1b8a2514 --- /dev/null +++ b/roles/os_zabbix/vars/template_ops_tools.yml @@ -0,0 +1,23 @@ +--- +g_template_ops_tools: + name: Template Operations Tools + zdiscoveryrules: + - name: disc.ops.runner + key: disc.ops.runner + lifetime: 1 + description: "Dynamically register operations runner items" + + zitemprototypes: + - discoveryrule_key: disc.ops.runner + name: "Exit code of ops-runner[{#OSO_COMMAND}]" + key: "disc.ops.runner.command.exitcode[{#OSO_COMMAND}]" + value_type: int + description: "The exit code of the command run from ops-runner" + applications: + - Ops Runner + + ztriggerprototypes: + - name: 'ops-runner[{#OSO_COMMAND}]: non-zero exit code on {HOST.NAME}' + expression: '{Template Operations Tools:disc.ops.runner.command.exitcode[{#OSO_COMMAND}].last()}<>0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_ops_runner_command.asciidoc' + priority: average diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml new file mode 100644 index 000000000..3ae1500bc --- /dev/null +++ b/roles/os_zabbix/vars/template_os_linux.yml @@ -0,0 +1,260 @@ +--- +g_template_os_linux: + name: Template OS Linux + zitems: + - key: kernel.uname.sysname + applications: + - Kernel + value_type: string + + - key: kernel.all.cpu.wait.total + applications: + - Kernel + value_type: float + units: '%' + + - key: kernel.all.cpu.irq.hard + applications: + - Kernel + value_type: float + units: '%' + + - key: kernel.all.cpu.idle + applications: + - Kernel + value_type: float + units: '%' + + - key: kernel.uname.distro + applications: + - Kernel + value_type: string + + - key: kernel.uname.nodename + applications: + - Kernel + value_type: string + + - key: kernel.all.cpu.irq.soft + applications: + - Kernel + value_type: float + units: '%' + + - key: kernel.all.load.15_minute + applications: + - Kernel + value_type: float + + - key: kernel.all.cpu.sys + applications: + - Kernel + value_type: float + units: '%' + + - key: kernel.all.load.5_minute + applications: + - Kernel + value_type: float + + - key: kernel.all.cpu.nice + applications: + - Kernel + value_type: float + units: '%' + + - key: kernel.all.load.1_minute + applications: + - Kernel + value_type: float + + - key: kernel.uname.version + applications: + - Kernel + value_type: string + + - key: kernel.all.uptime + applications: + - Kernel + value_type: int + + - key: kernel.all.cpu.user + applications: + - Kernel + value_type: float + units: '%' + + - key: kernel.uname.machine + applications: + - Kernel + value_type: string + + - key: hinv.ncpu + applications: + - Kernel + value_type: int + + - key: kernel.all.cpu.steal + applications: + - Kernel + value_type: float + units: '%' + + - key: kernel.all.pswitch + applications: + - Kernel + value_type: int + + - key: kernel.uname.release + applications: + - Kernel + value_type: string + + - key: proc.nprocs + applications: + - Kernel + value_type: int + + # Memory Items + - key: mem.freemem + applications: + - Memory + value_type: int + description: "PCP: free system memory metric from /proc/meminfo" + multiplier: 1024 + units: B + + - key: mem.util.bufmem + applications: + - Memory + value_type: int + description: "PCP: Memory allocated for buffer_heads.; I/O buffers metric from /proc/meminfo" + multiplier: 1024 + units: B + + - key: swap.used + applications: + - Memory + value_type: int + description: "PCP: swap used metric from /proc/meminfo" + multiplier: 1024 + units: B + + - key: swap.length + applications: + - Memory + value_type: int + description: "PCP: total swap available metric from /proc/meminfo" + multiplier: 1024 + units: B + + - key: mem.physmem + applications: + - Memory + value_type: int + description: "PCP: The value of this metric corresponds to the \"MemTotal\" field reported by /proc/meminfo. Note that this does not necessarily correspond to actual installed physical memory - there may be areas of the physical address space mapped as ROM in various peripheral devices and the bios may be mirroring certain ROMs in RAM." + multiplier: 1024 + units: B + + - key: swap.free + applications: + - Memory + value_type: int + description: "PCP: swap free metric from /proc/meminfo" + multiplier: 1024 + units: B + + - key: mem.util.available + applications: + - Memory + value_type: int + description: "PCP: The amount of memory that is available for a new workload, without pushing the system into swap. Estimated from MemFree, Active(file), Inactive(file), and SReclaimable, as well as the \"low\" watermarks from /proc/zoneinfo.; available memory from /proc/meminfo" + multiplier: 1024 + units: B + + - key: mem.util.used + applications: + - Memory + value_type: int + description: "PCP: Used memory is the difference between mem.physmem and mem.freemem; used memory metric from /proc/meminfo" + multiplier: 1024 + units: B + + - key: mem.util.cached + applications: + - Memory + value_type: int + description: "PCP: Memory used by the page cache, including buffered file data. This is in-memory cache for files read from the disk (the pagecache) but doesn't include SwapCached.; page cache metric from /proc/meminfo" + multiplier: 1024 + units: B + + zdiscoveryrules: + - name: disc.filesys + key: disc.filesys + lifetime: 1 + description: "Dynamically register the filesystems" + + zitemprototypes: + - discoveryrule_key: disc.filesys + name: "disc.filesys.full.{#OSO_FILESYS}" + key: "disc.filesys.full[{#OSO_FILESYS}]" + value_type: float + description: "PCP filesys.full option. This is the percent full returned from pcp filesys.full" + applications: + - Disk + + - discoveryrule_key: disc.filesys + name: "Percentage of used inodes on {#OSO_FILESYS}" + key: "disc.filesys.inodes.pused[{#OSO_FILESYS}]" + value_type: float + description: "PCP derived value of percentage of used inodes on a filesystem." + applications: + - Disk + + ztriggerprototypes: + - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}' + expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' + priority: warn + + - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}' + expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' + priority: high + + - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}' + expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' + priority: warn + + - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}' + expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>95' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' + priority: high + + ztriggers: + - name: 'Too many TOTAL processes on {HOST.NAME}' + expression: '{Template OS Linux:proc.nprocs.last()}>5000' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc' + priority: warn + + - name: 'Lack of available memory on {HOST.NAME}' + expression: '{Template OS Linux:mem.freemem.last()}<30720000' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_memory.asciidoc' + priority: warn + description: 'Alert on less than 30MegaBytes. This is 30 Million Bytes. 30000 KB x 1024' + + # CPU Utilization # + - name: 'CPU idle less than 5% on {HOST.NAME}' + expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<5 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<5' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc' + priority: average + description: 'CPU is less than 5% idle' + + - name: 'CPU idle less than 10% on {HOST.NAME}' + expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<10 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<10' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc' + priority: warn + description: 'CPU is less than 10% idle' + dependencies: + - 'CPU idle less than 5% on {HOST.NAME}' |