diff options
author | Thomas Wiest <twiest@users.noreply.github.com> | 2015-12-09 15:49:48 -0500 |
---|---|---|
committer | Thomas Wiest <twiest@users.noreply.github.com> | 2015-12-09 15:49:48 -0500 |
commit | eeb164fae0e6721100c4fcc1717d92bb85b9652c (patch) | |
tree | 70eee046db8012061c178ab4e686650048265564 /roles/os_zabbix/vars | |
parent | 898290cb3aabbc9d98883181877ac857a2fe1faf (diff) | |
parent | 14c69ad397be8ee101ef5b4edfa223d703e67ad0 (diff) | |
download | openshift-eeb164fae0e6721100c4fcc1717d92bb85b9652c.tar.gz openshift-eeb164fae0e6721100c4fcc1717d92bb85b9652c.tar.bz2 openshift-eeb164fae0e6721100c4fcc1717d92bb85b9652c.tar.xz openshift-eeb164fae0e6721100c4fcc1717d92bb85b9652c.zip |
Merge pull request #1048 from twiest/prod
Sync master -> Prod
Diffstat (limited to 'roles/os_zabbix/vars')
-rw-r--r-- | roles/os_zabbix/vars/template_openshift_master.yml | 198 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_os_linux.yml | 22 |
2 files changed, 198 insertions, 22 deletions
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml index 174486e15..514d6fd24 100644 --- a/roles/os_zabbix/vars/template_openshift_master.yml +++ b/roles/os_zabbix/vars/template_openshift_master.yml @@ -7,12 +7,24 @@ g_template_openshift_master: - Openshift Master key: create_app + - key: openshift.master.registry.healthz + description: "Shows the health status of the cluster's docker registry" + type: int + applications: + - Openshift Master + - key: openshift.master.process.count description: Shows number of master processes running type: int applications: - Openshift Master + - key: openshift.master.api.ping + description: "Verify that the Openshift API is up" + type: int + applications: + - Openshift Master + - key: openshift.master.api.healthz description: "Checks the healthz check of the master's api: https://master_host/healthz" type: int @@ -44,12 +56,48 @@ g_template_openshift_master: applications: - Openshift Master + - key: openshift.master.node.count + description: Shows the total number of nodes found in the Openshift Cluster + type: int + applications: + - Openshift Master + - key: openshift.project.count description: Shows number of projects on a cluster type: int applications: - Openshift Master + - key: openshift.master.pv.total.count + description: Total number of Persistent Volumes in the Openshift Cluster + type: int + applications: + - Openshift Master + + - key: openshift.master.pv.available.count + description: Total number of Available Persistent Volumes in the Openshift Cluster + type: int + applications: + - Openshift Master + + - key: openshift.master.pv.released.count + description: Total number of Released Persistent Volumes in the Openshift Cluster + type: int + applications: + - Openshift Master + + - key: openshift.master.pv.bound.count + description: Total number of Bound Persistent Volumes in the Openshift Cluster + type: int + applications: + - Openshift Master + + - key: openshift.master.pv.failed.count + description: Total number of Failed Persistent Volumes in the Openshift Cluster + type: int + applications: + - Openshift Master + - key: openshift.master.etcd.create.success description: Show number of successful create actions type: int @@ -122,17 +170,67 @@ g_template_openshift_master: applications: - Openshift Etcd - ztriggers: - - name: 'Application creation has failed on {HOST.NAME}' - expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' - priority: avg + - key: openshift.master.metric.ping + description: "This check verifies that the https://master/metrics check is alive and communicating properly." + type: int + applications: + - Openshift Master Metrics - - name: 'Openshift Master API health check is failing on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' - priority: high + - key: openshift.master.apiserver.latency.summary.pods.quantile.list.5 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.apiserver.latency.summary.pods.quantile.list.9 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.apiserver.latency.summary.pods.quantile.list.99 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the pod operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the pod operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the pod operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.5 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 50% of the end to end scheduling operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.9 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 90% of the end to end scheduling operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + + - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.99 + description: "Value from https://master/metrics. This is the time, in miliseconds, that 99% of the end to end scheduling operations have taken to completed." + type: int + applications: + - Openshift Master Metrics + ztriggers: - name: 'Openshift Master process not running on {HOST.NAME}' expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' @@ -143,6 +241,16 @@ g_template_openshift_master: url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' priority: high + - name: 'Low number of etcd watchers on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.etcd.watchers.last(#1)}<10 and {Template Openshift Master:openshift.master.etcd.watchers.last(#2)}<10' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc' + priority: avg + + - name: 'Etcd ping failed on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc' + priority: high + - name: 'Number of users for Openshift Master on {HOST.NAME}' expression: '{Template Openshift Master:openshift.master.user.count.last()}=0' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' @@ -153,12 +261,72 @@ g_template_openshift_master: url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' priority: info - - name: 'Low number of etcd watchers on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.etcd.watchers.last(#1)}<10 and {Template Openshift Master:openshift.master.etcd.watchers.last(#2)}<10' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc' + # Put triggers that depend on other triggers here (deps must be created first) + - name: 'Application creation has failed on {HOST.NAME}' + expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' priority: avg - - name: 'Etcd ping failed on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc' + - name: 'Openshift Master API health check is failing on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + priority: high + + - name: 'Openshift Master API PING check is failing on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.api.ping.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' priority: high + + - name: 'Openshift Master metric PING check is failing on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.metric.ping.max(#3)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + priority: avg + + - name: 'Docker Registry check failed on {HOST.NAME}' + expression: '{Template Openshift Master:openshift.master.registry.healthz.max(#2)}<1' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' + dependencies: + - 'Openshift Master process not running on {HOST.NAME}' + priority: high + + zgraphs: + - name: Openshift Master API Server Latency Pods LIST Quantiles + width: 900 + height: 200 + graph_items: + - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.5 + color: red + - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.9 + color: blue + - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.99 + color: orange + + - name: Openshift Master API Server Latency Pods WATCHLIST Quantiles + width: 900 + height: 200 + graph_items: + - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5 + color: red + - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9 + color: blue + - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99 + color: orange + + - name: Openshift Master Scheduler End to End Latency Quantiles + width: 900 + height: 200 + graph_items: + - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.5 + color: red + - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.9 + color: blue + - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.99 + color: orange diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml index 04665be62..c6e557f12 100644 --- a/roles/os_zabbix/vars/template_os_linux.yml +++ b/roles/os_zabbix/vars/template_os_linux.yml @@ -258,26 +258,34 @@ g_template_os_linux: - Network ztriggerprototypes: - - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}' - expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' - priority: warn - - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}' expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: high - - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}' - expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90' + # This has a dependency on the previous trigger + # Trigger Prototypes do not work in 2.4. They will work in Zabbix 3.0 + - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}' + expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: warn + dependencies: + - 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}' - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}' expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>95' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: high + # This has a dependency on the previous trigger + # Trigger Prototypes do not work in 2.4. They will work in Zabbix 3.0 + - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}' + expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90' + url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' + priority: warn + dependencies: + - 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}' + ztriggers: - name: 'Too many TOTAL processes on {HOST.NAME}' expression: '{Template OS Linux:proc.nprocs.last()}>5000' |