diff options
-rw-r--r-- | roles/os_zabbix/tasks/main.yml | 13 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_openshift_master.yml | 8 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_zagg_server.yml | 36 |
3 files changed, 57 insertions, 0 deletions
diff --git a/roles/os_zabbix/tasks/main.yml b/roles/os_zabbix/tasks/main.yml index d0b307a3d..7552086d4 100644 --- a/roles/os_zabbix/tasks/main.yml +++ b/roles/os_zabbix/tasks/main.yml @@ -37,6 +37,9 @@ - include_vars: template_aws.yml tags: - aws +- include_vars: template_zagg_server.yml + tags: + - zagg_server - name: Include Template Heartbeat include: ../../lib_zabbix/tasks/create_template.yml @@ -137,3 +140,13 @@ password: "{{ ozb_password }}" tags: - aws + +- name: Include Template Zagg Server + include: ../../lib_zabbix/tasks/create_template.yml + vars: + template: "{{ g_template_zagg_server }}" + server: "{{ ozb_server }}" + user: "{{ ozb_user }}" + password: "{{ ozb_password }}" + tags: + - zagg_server diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml index 514d6fd24..a0ba8d104 100644 --- a/roles/os_zabbix/vars/template_openshift_master.yml +++ b/roles/os_zabbix/vars/template_openshift_master.yml @@ -269,6 +269,14 @@ g_template_openshift_master: - 'Openshift Master process not running on {HOST.NAME}' priority: avg + - name: 'Application creation has failed multiple times in the last hour on {HOST.NAME}' + expression: '{Template Openshift Master:create_app.sum(1h)}>3' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + description: The application create loop has failed 4 or more times in the last hour + priority: avg + - name: 'Openshift Master API health check is failing on {HOST.NAME}' expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' diff --git a/roles/os_zabbix/vars/template_zagg_server.yml b/roles/os_zabbix/vars/template_zagg_server.yml new file mode 100644 index 000000000..0e8e53bb7 --- /dev/null +++ b/roles/os_zabbix/vars/template_zagg_server.yml @@ -0,0 +1,36 @@ +--- +g_template_zagg_server: + name: Template Zagg Server + zitems: + - key: zagg.server.metrics.count + applications: + - Zagg Server + value_type: int + + - key: zagg.server.processor.errors + applications: + - Zagg Server + value_type: int + + - key: zagg.server.heartbeat.count + applications: + - Zagg Server + value_type: int + + ztriggers: + - name: 'Error sending metrics on {HOST.NAME}' + expression: '{Template Zagg Server:zagg.server.processor.errors.min(#3)}>0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc' + priority: average + + - name: 'Critically High number of metrics in Zagg queue {HOST.NAME}' + expression: '{Template Zagg Server:zagg.server.metrics.count.min(#3)}>10000' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc' + priority: high + + - name: 'High number of metrics in Zagg queue {HOST.NAME}' + expression: '{Template Zagg Server:zagg.server.metrics.count.min(#3)}>5000' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc' + dependencies: + - 'Critically High number of metrics in Zagg queue {HOST.NAME}' + priority: average |