summaryrefslogtreecommitdiffstats
path: root/roles/os_zabbix/vars
diff options
context:
space:
mode:
authorKenny Woodson <kwoodson@redhat.com>2015-10-29 11:14:51 -0400
committerKenny Woodson <kwoodson@redhat.com>2015-10-29 11:14:51 -0400
commit9bbaa824da5e1a049cdec1a6523c3841d713386c (patch)
tree93e80f1577ad0f2f5f8931b493c50cd9aa657c77 /roles/os_zabbix/vars
parent15df494fb781dd1509854eeb366e981930b52c22 (diff)
parent16d1bce0be2f8c3942489630adcb7030aecadc55 (diff)
downloadopenshift-9bbaa824da5e1a049cdec1a6523c3841d713386c.tar.gz
openshift-9bbaa824da5e1a049cdec1a6523c3841d713386c.tar.bz2
openshift-9bbaa824da5e1a049cdec1a6523c3841d713386c.tar.xz
openshift-9bbaa824da5e1a049cdec1a6523c3841d713386c.zip
Merge pull request #763 from openshift/master
Merge master into prod.
Diffstat (limited to 'roles/os_zabbix/vars')
-rw-r--r--roles/os_zabbix/vars/main.yml1
-rw-r--r--roles/os_zabbix/vars/template_app_zabbix_agent.yml23
-rw-r--r--roles/os_zabbix/vars/template_app_zabbix_server.yml412
-rw-r--r--roles/os_zabbix/vars/template_docker.yml94
-rw-r--r--roles/os_zabbix/vars/template_heartbeat.yml13
-rw-r--r--roles/os_zabbix/vars/template_openshift_master.yml58
-rw-r--r--roles/os_zabbix/vars/template_openshift_node.yml44
-rw-r--r--roles/os_zabbix/vars/template_ops_tools.yml23
-rw-r--r--roles/os_zabbix/vars/template_os_linux.yml260
9 files changed, 928 insertions, 0 deletions
diff --git a/roles/os_zabbix/vars/main.yml b/roles/os_zabbix/vars/main.yml
new file mode 100644
index 000000000..ed97d539c
--- /dev/null
+++ b/roles/os_zabbix/vars/main.yml
@@ -0,0 +1 @@
+---
diff --git a/roles/os_zabbix/vars/template_app_zabbix_agent.yml b/roles/os_zabbix/vars/template_app_zabbix_agent.yml
new file mode 100644
index 000000000..d636d4822
--- /dev/null
+++ b/roles/os_zabbix/vars/template_app_zabbix_agent.yml
@@ -0,0 +1,23 @@
+---
+g_template_app_zabbix_agent:  # definition of the "Template App Zabbix Agent" template: its items and triggers
+ name: Template App Zabbix Agent
+ zitems:  # item definitions: key, owning application(s), value type and item type
+ - key: agent.hostname
+ applications:
+ - Zabbix agent
+ value_type: character
+ zabbix_type: agent
+
+ - key: agent.ping
+ applications:
+ - Zabbix agent
+ description: The agent always returns 1 for this item. It could be used in combination with nodata() for availability check.
+ value_type: int
+ zabbix_type: agent
+
+ ztriggers:  # trigger definitions; expressions reference the items above as {<template name>:<item key>.<function>}
+ - name: '[Reboot] Zabbix agent on {HOST.NAME} is unreachable for 15 minutes'
+ description: Zabbix agent is unreachable for 15 minutes.
+ expression: '{Template App Zabbix Agent:agent.ping.nodata(15m)}=1'  # no agent.ping value received for 15 minutes
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_ping.asciidoc
diff --git a/roles/os_zabbix/vars/template_app_zabbix_server.yml b/roles/os_zabbix/vars/template_app_zabbix_server.yml
new file mode 100644
index 000000000..43517113b
--- /dev/null
+++ b/roles/os_zabbix/vars/template_app_zabbix_server.yml
@@ -0,0 +1,412 @@
+---
+g_template_app_zabbix_server:
+ name: Template App Zabbix Server
+ zitems:
+ - key: housekeeper_creates
+ applications:
+ - Zabbix server
+ description: A simple count of the number of partition creates output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: internal
+
+ - key: housekeeper_drops
+ applications:
+ - Zabbix server
+ description: A simple count of the number of partition drops output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: internal
+
+ - key: housekeeper_errors
+ applications:
+ - Zabbix server
+ description: A simple count of the number of errors output by the housekeeper script.
+ units: ''
+ value_type: int
+ zabbix_type: internal
+
+ - key: housekeeper_total
+ applications:
+ - Zabbix server
+ description: A simple count of the total number of lines output by the housekeeper
+ script.
+ units: ''
+ value_type: int
+ zabbix_type: internal
+
+ - key: zabbix[process,alerter,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,configuration syncer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,db watchdog,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,discoverer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,escalator,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,history syncer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,housekeeper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,http poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,icmp pinger,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,ipmi poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,java poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,node watcher,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,proxy poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,self-monitoring,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,snmp trapper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,timer,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,trapper,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[process,unreachable poller,avg,busy]
+ applications:
+ - Zabbix server
+ description: ''
+ units: '%'
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[queue,10m]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: int
+ zabbix_type: internal
+ interval: 600
+
+ - key: zabbix[queue]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: int
+ zabbix_type: internal
+ interval: 600
+
+ - key: zabbix[rcache,buffer,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[wcache,history,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[wcache,text,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[wcache,trend,pfree]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+
+ - key: zabbix[wcache,values]
+ applications:
+ - Zabbix server
+ description: ''
+ units: ''
+ value_type: float
+ zabbix_type: internal
+ delta: 1 # speed per second
+
+ ztriggers:
+ - description: "There has been unexpected output while running the housekeeping script\
+ \ on the Zabbix. There are only three kinds of lines we expect to see in the output,\
+ \ and we've gotten something new.\r\n\r\nCheck the script's output in /var/lib/zabbix/state\
+ \ for more details."
+ expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}+{Template App Zabbix Server:housekeeper_creates.last(0)}+{Template App Zabbix Server:housekeeper_drops.last(0)}<>{Template App Zabbix Server:housekeeper_total.last(0)}'  # fires when errors + creates + drops differs from the total line count
+ name: Unexpected output in Zabbix DB Housekeeping
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_DB_Housekeeping.asciidoc
+
+ - description: An error has occurred during running the housekeeping script on the Zabbix. Check the script's output in /var/lib/zabbix/state for more details.
+ expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}>0'
+ name: Errors during Zabbix DB Housekeeping
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,alerter,avg,busy].min(600)}>75'
+ name: Zabbix alerter processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,configuration syncer,avg,busy].min(600)}>75'
+ name: Zabbix configuration syncer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,db watchdog,avg,busy].min(600)}>75'
+ name: Zabbix db watchdog processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,discoverer,avg,busy].min(600)}>75'
+ name: Zabbix discoverer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,escalator,avg,busy].min(600)}>75'
+ name: Zabbix escalator processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,history syncer,avg,busy].min(600)}>75'
+ name: Zabbix history syncer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,housekeeper,avg,busy].min(1800)}>75'
+ name: Zabbix housekeeper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,http poller,avg,busy].min(600)}>75'
+ name: Zabbix http poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,icmp pinger,avg,busy].min(600)}>75'
+ name: Zabbix icmp pinger processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,ipmi poller,avg,busy].min(600)}>75'
+ name: Zabbix ipmi poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,java poller,avg,busy].min(600)}>75'
+ name: Zabbix java poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,node watcher,avg,busy].min(600)}>75'
+ name: Zabbix node watcher processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,poller,avg,busy].min(600)}>75'
+ name: Zabbix poller processes more than 75% busy
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,proxy poller,avg,busy].min(600)}>75'
+ name: Zabbix proxy poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,self-monitoring,avg,busy].min(600)}>75'
+ name: Zabbix self-monitoring processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,snmp trapper,avg,busy].min(600)}>75'
+ name: Zabbix snmp trapper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: Timer processes usually are busy because they have to process time
+ based trigger functions
+ expression: '{Template App Zabbix Server:zabbix[process,timer,avg,busy].min(600)}>75'
+ name: Zabbix timer processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,trapper,avg,busy].min(600)}>75'
+ name: Zabbix trapper processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[process,unreachable poller,avg,busy].min(600)}>75'
+ name: Zabbix unreachable poller processes more than 75% busy
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc
+
+ - description: "This alert generally indicates a performance problem or a problem\
+ \ with the zabbix-server or proxy.\r\n\r\nThe first place to check for issues\
+ \ is Administration > Queue. Be sure to check the general view and the per-proxy\
+ \ view."
+ expression: '{Template App Zabbix Server:zabbix[queue,10m].min(600)}>1000'
+ name: More than 1000 items having missing data for more than 10 minutes
+ priority: high
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/data_lost_overview_plugin.asciidoc
+
+ - description: Consider increasing CacheSize in the zabbix_server.conf configuration
+ file
+ expression: '{Template App Zabbix Server:zabbix[rcache,buffer,pfree].min(600)}<5'
+ name: Less than 5% free in the configuration cache
+ priority: info
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,history,pfree].min(600)}<25'
+ name: Less than 25% free in the history cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,text,pfree].min(600)}<25'
+ name: Less than 25% free in the text history cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
+
+ - description: ''
+ expression: '{Template App Zabbix Server:zabbix[wcache,trend,pfree].min(600)}<25'
+ name: Less than 25% free in the trends cache
+ priority: avg
+ url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc
diff --git a/roles/os_zabbix/vars/template_docker.yml b/roles/os_zabbix/vars/template_docker.yml
new file mode 100644
index 000000000..bfabf50c5
--- /dev/null
+++ b/roles/os_zabbix/vars/template_docker.yml
@@ -0,0 +1,94 @@
+---
+g_template_docker:
+ name: Template Docker
+ zitems:
+ - key: docker.ping
+ applications:
+ - Docker Daemon
+ value_type: int
+
+ - key: docker.info_elapsed_ms
+ applications:
+ - Docker Daemon
+ value_type: int
+
+ - key: docker.storage.is_loopback
+ applications:
+ - Docker Storage
+ value_type: int
+
+ - key: docker.storage.data.space.total
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.data.space.used
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.data.space.available
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.data.space.percent_available
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.metadata.space.total
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.metadata.space.used
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.metadata.space.available
+ applications:
+ - Docker Storage
+ value_type: float
+
+ - key: docker.storage.metadata.space.percent_available
+ applications:
+ - Docker Storage
+ value_type: float
+ ztriggers:  # independent triggers first; triggers with dependencies are grouped at the end
+ - name: 'docker.ping failed on {HOST.NAME}'
+ expression: '{Template Docker:docker.ping.max(#3)}<1'  # all of the last 3 docker.ping samples are below 1
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_ping.asciidoc'
+ priority: high
+
+ - name: 'Docker storage is using LOOPBACK on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.is_loopback.last()}<>0'  # latest is_loopback value is non-zero
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_loopback.asciidoc'
+ priority: high
+
+ - name: 'Critically low docker storage data space on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.data.space.available.max(#3)}<5' # < 5% or < 5GB
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
+ priority: high
+
+ - name: 'Critically low docker storage metadata space on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.005' # < 5% or < 5MB
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
+ priority: high
+
+ # Put triggers that depend on other triggers here (deps must be created first)
+ - name: 'Low docker storage data space on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.data.space.available.max(#3)}<10' # < 10% or < 10GB
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
+ dependencies:  # refers to the critical data-space trigger above by its name
+ - 'Critically low docker storage data space on {HOST.NAME}'
+ priority: average
+
+ - name: 'Low docker storage metadata space on {HOST.NAME}'
+ expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.01' # < 10% or < 10MB
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc'
+ dependencies:  # refers to the critical metadata-space trigger above by its name
+ - 'Critically low docker storage metadata space on {HOST.NAME}'
+ priority: average
+
diff --git a/roles/os_zabbix/vars/template_heartbeat.yml b/roles/os_zabbix/vars/template_heartbeat.yml
new file mode 100644
index 000000000..8dbe0d0d6
--- /dev/null
+++ b/roles/os_zabbix/vars/template_heartbeat.yml
@@ -0,0 +1,13 @@
+---
+g_template_heartbeat:  # definition of the "Template Heartbeat" template: one item and one trigger
+ name: Template Heartbeat
+ zitems:  # item definitions
+ - name: Heartbeat Ping
+ applications:
+ - Heartbeat
+ key: heartbeat.ping
+ ztriggers:  # trigger definitions; the expression references the item above by template name and key
+ - name: 'Heartbeat.ping has failed on {HOST.NAME}'
+ expression: '{Template Heartbeat:heartbeat.ping.nodata(20m)}=1'  # no heartbeat.ping value received for 20 minutes
+ priority: avg
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_node_heartbeat.asciidoc'
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
new file mode 100644
index 000000000..1de4fefbb
--- /dev/null
+++ b/roles/os_zabbix/vars/template_openshift_master.yml
@@ -0,0 +1,58 @@
+---
+g_template_openshift_master:  # definition of the "Template Openshift Master" template: its items and triggers
+ name: Template Openshift Master
+ zitems:  # item definitions
+ - name: create_app
+ applications:
+ - Openshift Master
+ key: create_app
+
+ - key: openshift.master.process.count
+ description: Shows number of master processes running
+ value_type: int  # 'value_type' is the key the other templates' items use for the value type
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.user.count
+ description: Shows number of users in a cluster
+ value_type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.master.pod.running.count
+ description: Shows number of pods running
+ value_type: int
+ applications:
+ - Openshift Master
+
+ - key: openshift.project.counter
+ description: Shows number of projects on a cluster
+ value_type: int
+ applications:
+ - Openshift Master
+
+ ztriggers:  # trigger definitions; expressions reference the items above by template name and key
+ - name: 'Application creation has failed on {HOST.NAME}'
+ expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'  # the two most recent create_app samples both equal 1
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
+ priority: avg
+
+ - name: 'Openshift Master process not running on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1'  # all of the last 3 samples are below 1
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: high
+
+ - name: 'Too many Openshift Master processes running on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1'  # all of the last 3 samples are above 1
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: high
+
+ - name: 'Number of users for Openshift Master on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.master.user.count.last()}=0'  # latest user count is 0
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: info
+
+ - name: 'There are no projects running on {HOST.NAME}'
+ expression: '{Template Openshift Master:openshift.project.counter.last()}=0'  # latest project count is 0
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+ priority: info
diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml
new file mode 100644
index 000000000..ce28b1048
--- /dev/null
+++ b/roles/os_zabbix/vars/template_openshift_node.yml
@@ -0,0 +1,44 @@
+---
+g_template_openshift_node:  # definition of the "Template Openshift Node" template: its items and triggers
+ name: Template Openshift Node
+ zitems:  # item definitions
+ - key: openshift.node.process.count
+ description: Shows number of OpenShift Node processes running
+ value_type: int  # 'value_type' is the key the other templates' items use for the value type
+ applications:
+ - Openshift Node
+
+ - key: openshift.node.ovs.pids.count
+ description: Shows number of ovs process ids running
+ value_type: int
+ applications:
+ - Openshift Node
+
+ - key: openshift.node.ovs.ports.count
+ description: Shows number of OVS ports defined
+ value_type: int
+ applications:
+ - Openshift Node
+
+ ztriggers:  # trigger definitions; SOP links use the 'master' branch like the other templates
+ - name: 'Openshift Node process not running on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1'  # all of the last 3 samples are below 1
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
+ priority: high
+
+ - name: 'Too many Openshift Node processes running on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.process.count.min(#3)}>1'  # all of the last 3 samples are above 1
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
+ priority: high
+
+ - name: 'OVS may not be running on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last()}<>4'  # latest OVS pid count differs from 4
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
+ priority: high
+
+ - name: 'Number of OVS ports is 0 on {HOST.NAME}'
+ expression: '{Template Openshift Node:openshift.node.ovs.ports.count.last()}=0'  # latest OVS port count is 0
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
+ priority: high
+
+
diff --git a/roles/os_zabbix/vars/template_ops_tools.yml b/roles/os_zabbix/vars/template_ops_tools.yml
new file mode 100644
index 000000000..d1b8a2514
--- /dev/null
+++ b/roles/os_zabbix/vars/template_ops_tools.yml
@@ -0,0 +1,23 @@
+---
+g_template_ops_tools:  # definition of the "Template Operations Tools" template: a discovery rule plus item/trigger prototypes
+ name: Template Operations Tools
+ zdiscoveryrules:  # discovery rules; the prototypes below refer to them by key
+ - name: disc.ops.runner
+ key: disc.ops.runner
+ lifetime: 1  # NOTE(review): unit is not shown in this file - presumably days; confirm against the consumer
+ description: "Dynamically register operations runner items"
+
+ zitemprototypes:  # item prototypes; {#OSO_COMMAND} is the discovery macro substituted per discovered command
+ - discoveryrule_key: disc.ops.runner  # matches the key of the discovery rule above
+ name: "Exit code of ops-runner[{#OSO_COMMAND}]"
+ key: "disc.ops.runner.command.exitcode[{#OSO_COMMAND}]"
+ value_type: int
+ description: "The exit code of the command run from ops-runner"
+ applications:
+ - Ops Runner
+
+ ztriggerprototypes:  # trigger prototypes built on the item prototype above
+ - name: 'ops-runner[{#OSO_COMMAND}]: non-zero exit code on {HOST.NAME}'
+ expression: '{Template Operations Tools:disc.ops.runner.command.exitcode[{#OSO_COMMAND}].last()}<>0'  # latest exit code is non-zero
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_ops_runner_command.asciidoc'
+ priority: average
diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml
new file mode 100644
index 000000000..3ae1500bc
--- /dev/null
+++ b/roles/os_zabbix/vars/template_os_linux.yml
@@ -0,0 +1,260 @@
+---
+g_template_os_linux:
+ name: Template OS Linux
+ zitems:
+ - key: kernel.uname.sysname
+ applications:
+ - Kernel
+ value_type: string
+
+ - key: kernel.all.cpu.wait.total
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.all.cpu.irq.hard
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.all.cpu.idle
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.uname.distro
+ applications:
+ - Kernel
+ value_type: string
+
+ - key: kernel.uname.nodename
+ applications:
+ - Kernel
+ value_type: string
+
+ - key: kernel.all.cpu.irq.soft
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.all.load.15_minute
+ applications:
+ - Kernel
+ value_type: float
+
+ - key: kernel.all.cpu.sys
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.all.load.5_minute
+ applications:
+ - Kernel
+ value_type: float
+
+ - key: kernel.all.cpu.nice
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.all.load.1_minute
+ applications:
+ - Kernel
+ value_type: float
+
+ - key: kernel.uname.version
+ applications:
+ - Kernel
+ value_type: string
+
+ - key: kernel.all.uptime
+ applications:
+ - Kernel
+ value_type: int
+
+ - key: kernel.all.cpu.user
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.uname.machine
+ applications:
+ - Kernel
+ value_type: string
+
+ - key: hinv.ncpu
+ applications:
+ - Kernel
+ value_type: int
+
+ - key: kernel.all.cpu.steal
+ applications:
+ - Kernel
+ value_type: float
+ units: '%'
+
+ - key: kernel.all.pswitch
+ applications:
+ - Kernel
+ value_type: int
+
+ - key: kernel.uname.release
+ applications:
+ - Kernel
+ value_type: string
+
+ - key: proc.nprocs
+ applications:
+ - Kernel
+ value_type: int
+
+ # Memory Items
+ - key: mem.freemem
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: free system memory metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ - key: mem.util.bufmem
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: Memory allocated for buffer_heads.; I/O buffers metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ - key: swap.used
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: swap used metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ - key: swap.length
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: total swap available metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ - key: mem.physmem
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: The value of this metric corresponds to the \"MemTotal\" field reported by /proc/meminfo. Note that this does not necessarily correspond to actual installed physical memory - there may be areas of the physical address space mapped as ROM in various peripheral devices and the bios may be mirroring certain ROMs in RAM."
+ multiplier: 1024
+ units: B
+
+ - key: swap.free
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: swap free metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ - key: mem.util.available
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: The amount of memory that is available for a new workload, without pushing the system into swap. Estimated from MemFree, Active(file), Inactive(file), and SReclaimable, as well as the \"low\" watermarks from /proc/zoneinfo.; available memory from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ - key: mem.util.used
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: Used memory is the difference between mem.physmem and mem.freemem; used memory metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ - key: mem.util.cached
+ applications:
+ - Memory
+ value_type: int
+ description: "PCP: Memory used by the page cache, including buffered file data. This is in-memory cache for files read from the disk (the pagecache) but doesn't include SwapCached.; page cache metric from /proc/meminfo"
+ multiplier: 1024
+ units: B
+
+ zdiscoveryrules:
+ - name: disc.filesys
+ key: disc.filesys
+ lifetime: 1
+ description: "Dynamically register the filesystems"
+
+ zitemprototypes:
+ - discoveryrule_key: disc.filesys
+ name: "disc.filesys.full.{#OSO_FILESYS}"
+ key: "disc.filesys.full[{#OSO_FILESYS}]"
+ value_type: float
+ description: "PCP filesys.full option. This is the percent full returned from pcp filesys.full"
+ applications:
+ - Disk
+
+ - discoveryrule_key: disc.filesys
+ name: "Percentage of used inodes on {#OSO_FILESYS}"
+ key: "disc.filesys.inodes.pused[{#OSO_FILESYS}]"
+ value_type: float
+ description: "PCP derived value of percentage of used inodes on a filesystem."
+ applications:
+ - Disk
+
+ ztriggerprototypes:
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: warn
+
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: high
+
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: warn
+
+ - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}'
+ expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>95'
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc'
+ priority: high
+
+ ztriggers:  # static (non-discovered) triggers for the OS Linux template
+ - name: 'Too many TOTAL processes on {HOST.NAME}'
+ expression: '{Template OS Linux:proc.nprocs.last()}>5000'  # latest process count is above 5000
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc'
+ priority: warn
+
+ - name: 'Lack of available memory on {HOST.NAME}'
+ expression: '{Template OS Linux:mem.freemem.last()}<30720000'  # threshold is 30000 * 1024, as the description below explains
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_memory.asciidoc'
+ priority: warn
+ description: 'Alert on less than 30MegaBytes. This is 30 Million Bytes. 30000 KB x 1024'
+
+ # CPU Utilization #
+ - name: 'CPU idle less than 5% on {HOST.NAME}'
+ expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<5 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<5'  # the two most recent idle samples are both below 5
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
+ priority: average
+ description: 'CPU is less than 5% idle'
+
+ - name: 'CPU idle less than 10% on {HOST.NAME}'
+ expression: '{Template OS Linux:kernel.all.cpu.idle.last()}<10 and {Template OS Linux:kernel.all.cpu.idle.last(#2)}<10'  # the two most recent idle samples are both below 10
+ url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc'
+ priority: warn
+ description: 'CPU is less than 10% idle'
+ dependencies:  # refers to the 5% trigger above by its name
+ - 'CPU idle less than 5% on {HOST.NAME}'