diff options
author | Joel Diaz <jdiaz@redhat.com> | 2016-04-08 13:48:13 -0400 |
---|---|---|
committer | Joel Diaz <jdiaz@redhat.com> | 2016-04-15 09:35:54 -0400 |
commit | 3681ab5fb2a39ccb06024e6ad514ad50df21f9d2 (patch) | |
tree | 5d1dd7da5404f431a60241a419155204a7784dcc /roles/os_zabbix | |
parent | 51e390459d7eda90c44d9ffd7f1199a3590beac1 (diff) | |
download | openshift-3681ab5fb2a39ccb06024e6ad514ad50df21f9d2.tar.gz openshift-3681ab5fb2a39ccb06024e6ad514ad50df21f9d2.tar.bz2 openshift-3681ab5fb2a39ccb06024e6ad514ad50df21f9d2.tar.xz openshift-3681ab5fb2a39ccb06024e6ad514ad50df21f9d2.zip |
cleanup roles after roles move to openshift-tools
also removing inventory/multi_inventory*
things left behind unchanged even though they were copied:
playbooks/adhoc/*
roles/dns
roles/kube_nfs_volumes
roles/os_update_latest
Diffstat (limited to 'roles/os_zabbix')
-rw-r--r-- | roles/os_zabbix/README.md | 40 | ||||
-rw-r--r-- | roles/os_zabbix/defaults/main.yml | 1 | ||||
-rw-r--r-- | roles/os_zabbix/handlers/main.yml | 1 | ||||
-rw-r--r-- | roles/os_zabbix/meta/main.yml | 9 | ||||
-rw-r--r-- | roles/os_zabbix/tasks/main.yml | 166 | ||||
-rw-r--r-- | roles/os_zabbix/vars/main.yml | 1 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_app_zabbix_agent.yml | 23 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_app_zabbix_server.yml | 412 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_aws.yml | 25 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_config_loop.yml | 14 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_docker.yml | 116 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_heartbeat.yml | 18 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_openshift_master.yml | 458 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_openshift_node.yml | 70 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_ops_tools.yml | 54 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_os_linux.yml | 314 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_performance_copilot.yml | 14 | ||||
-rw-r--r-- | roles/os_zabbix/vars/template_zagg_server.yml | 46 |
18 files changed, 0 insertions, 1782 deletions
diff --git a/roles/os_zabbix/README.md b/roles/os_zabbix/README.md deleted file mode 100644 index ac3dc2833..000000000 --- a/roles/os_zabbix/README.md +++ /dev/null @@ -1,40 +0,0 @@ -os_zabbix -========= - -Automate zabbix tasks. - -Requirements ------------- - -This requires the openshift_tools rpm be installed for the zbxapi.py library. It can be found here: https://github.com/openshift/openshift-tools under openshift_tools/monitoring/zbxapi.py for now. - -Role Variables --------------- - -zab_server -zab_username -zab_password - -Dependencies ------------- - -This depeonds on the zbxapi.py library located here: https://github.com/openshift/openshift-tools under openshift_tools/monitoring/zbxapi.py for now. - -Example Playbook ----------------- - - - zbx_host: - server: zab_server - user: zab_user - password: zab_password - name: 'myhost' - -License -------- - -ASL 2.0 - -Author Information ------------------- - -OpenShift operations, Red Hat, Inc diff --git a/roles/os_zabbix/defaults/main.yml b/roles/os_zabbix/defaults/main.yml deleted file mode 100644 index ed97d539c..000000000 --- a/roles/os_zabbix/defaults/main.yml +++ /dev/null @@ -1 +0,0 @@ ---- diff --git a/roles/os_zabbix/handlers/main.yml b/roles/os_zabbix/handlers/main.yml deleted file mode 100644 index ed97d539c..000000000 --- a/roles/os_zabbix/handlers/main.yml +++ /dev/null @@ -1 +0,0 @@ ---- diff --git a/roles/os_zabbix/meta/main.yml b/roles/os_zabbix/meta/main.yml deleted file mode 100644 index 360f5aad2..000000000 --- a/roles/os_zabbix/meta/main.yml +++ /dev/null @@ -1,9 +0,0 @@ ---- -galaxy_info: - author: OpenShift - description: ZabbixAPI - company: Red Hat, Inc - license: ASL 2.0 - min_ansible_version: 1.2 -dependencies: -- lib_zabbix diff --git a/roles/os_zabbix/tasks/main.yml b/roles/os_zabbix/tasks/main.yml deleted file mode 100644 index 1c8d88854..000000000 --- a/roles/os_zabbix/tasks/main.yml +++ /dev/null @@ -1,166 +0,0 @@ ---- -- name: Main List all templates - zbx_template: - 
zbx_server: "{{ ozb_server }}" - zbx_user: "{{ ozb_user }}" - zbx_password: "{{ ozb_password }}" - state: list - register: templates - -- include_vars: template_heartbeat.yml - tags: - - heartbeat -- include_vars: template_os_linux.yml - tags: - - linux -- include_vars: template_docker.yml - tags: - - docker -- include_vars: template_openshift_master.yml - tags: - - openshift_master -- include_vars: template_openshift_node.yml - tags: - - openshift_node -- include_vars: template_ops_tools.yml - tags: - - ops_tools -- include_vars: template_app_zabbix_server.yml - tags: - - zabbix_server -- include_vars: template_app_zabbix_agent.yml - tags: - - zabbix_agent -- include_vars: template_performance_copilot.yml - tags: - - pcp -- include_vars: template_aws.yml - tags: - - aws -- include_vars: template_zagg_server.yml - tags: - - zagg_server - -- include_vars: template_config_loop.yml - tags: - - config_loop - -- name: Include Template Heartbeat - include: ../../lib_zabbix/tasks/create_template.yml - vars: - template: "{{ g_template_heartbeat }}" - server: "{{ ozb_server }}" - user: "{{ ozb_user }}" - password: "{{ ozb_password }}" - tags: - - heartbeat - -- name: Include Template os_linux - include: ../../lib_zabbix/tasks/create_template.yml - vars: - template: "{{ g_template_os_linux }}" - server: "{{ ozb_server }}" - user: "{{ ozb_user }}" - password: "{{ ozb_password }}" - tags: - - linux - -- name: Include Template docker - include: ../../lib_zabbix/tasks/create_template.yml - vars: - template: "{{ g_template_docker }}" - server: "{{ ozb_server }}" - user: "{{ ozb_user }}" - password: "{{ ozb_password }}" - tags: - - docker - -- name: Include Template Openshift Master - include: ../../lib_zabbix/tasks/create_template.yml - vars: - template: "{{ g_template_openshift_master }}" - server: "{{ ozb_server }}" - user: "{{ ozb_user }}" - password: "{{ ozb_password }}" - tags: - - openshift_master - -- name: Include Template Openshift Node - include: 
../../lib_zabbix/tasks/create_template.yml - vars: - template: "{{ g_template_openshift_node }}" - server: "{{ ozb_server }}" - user: "{{ ozb_user }}" - password: "{{ ozb_password }}" - tags: - - openshift_node - -- name: Include Template Ops Tools - include: ../../lib_zabbix/tasks/create_template.yml - vars: - template: "{{ g_template_ops_tools }}" - server: "{{ ozb_server }}" - user: "{{ ozb_user }}" - password: "{{ ozb_password }}" - tags: - - ops_tools - -- name: Include Template App Zabbix Server - include: ../../lib_zabbix/tasks/create_template.yml - vars: - template: "{{ g_template_app_zabbix_server }}" - server: "{{ ozb_server }}" - user: "{{ ozb_user }}" - password: "{{ ozb_password }}" - tags: - - zabbix_server - -- name: Include Template App Zabbix Agent - include: ../../lib_zabbix/tasks/create_template.yml - vars: - template: "{{ g_template_app_zabbix_agent }}" - server: "{{ ozb_server }}" - user: "{{ ozb_user }}" - password: "{{ ozb_password }}" - tags: - - zabbix_agent - -- name: Include Template Performance Copilot - include: ../../lib_zabbix/tasks/create_template.yml - vars: - template: "{{ g_template_performance_copilot }}" - server: "{{ ozb_server }}" - user: "{{ ozb_user }}" - password: "{{ ozb_password }}" - tags: - - pcp - -- name: Include Template AWS - include: ../../lib_zabbix/tasks/create_template.yml - vars: - template: "{{ g_template_aws }}" - server: "{{ ozb_server }}" - user: "{{ ozb_user }}" - password: "{{ ozb_password }}" - tags: - - aws - -- name: Include Template Zagg Server - include: ../../lib_zabbix/tasks/create_template.yml - vars: - template: "{{ g_template_zagg_server }}" - server: "{{ ozb_server }}" - user: "{{ ozb_user }}" - password: "{{ ozb_password }}" - tags: - - zagg_server - -- name: Include Template Config Loop - include: ../../lib_zabbix/tasks/create_template.yml - vars: - template: "{{ g_template_config_loop }}" - server: "{{ ozb_server }}" - user: "{{ ozb_user }}" - password: "{{ ozb_password }}" - tags: - - 
config_loop diff --git a/roles/os_zabbix/vars/main.yml b/roles/os_zabbix/vars/main.yml deleted file mode 100644 index ed97d539c..000000000 --- a/roles/os_zabbix/vars/main.yml +++ /dev/null @@ -1 +0,0 @@ ---- diff --git a/roles/os_zabbix/vars/template_app_zabbix_agent.yml b/roles/os_zabbix/vars/template_app_zabbix_agent.yml deleted file mode 100644 index d636d4822..000000000 --- a/roles/os_zabbix/vars/template_app_zabbix_agent.yml +++ /dev/null @@ -1,23 +0,0 @@ ---- -g_template_app_zabbix_agent: - name: Template App Zabbix Agent - zitems: - - key: agent.hostname - applications: - - Zabbix agent - value_type: character - zabbix_type: agent - - - key: agent.ping - applications: - - Zabbix agent - description: The agent always returns 1 for this item. It could be used in combination with nodata() for availability check. - value_type: int - zabbix_type: agent - - ztriggers: - - name: '[Reboot] Zabbix agent on {HOST.NAME} is unreachable for 15 minutes' - description: Zabbix agent is unreachable for 15 minutes. - expression: '{Template App Zabbix Agent:agent.ping.nodata(15m)}=1' - priority: high - url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_ping.asciidoc diff --git a/roles/os_zabbix/vars/template_app_zabbix_server.yml b/roles/os_zabbix/vars/template_app_zabbix_server.yml deleted file mode 100644 index 43517113b..000000000 --- a/roles/os_zabbix/vars/template_app_zabbix_server.yml +++ /dev/null @@ -1,412 +0,0 @@ ---- -g_template_app_zabbix_server: - name: Template App Zabbix Server - zitems: - - key: housekeeper_creates - applications: - - Zabbix server - description: A simple count of the number of partition creates output by the housekeeper script. - units: '' - value_type: int - zabbix_type: internal - - - key: housekeeper_drops - applications: - - Zabbix server - description: A simple count of the number of partition drops output by the housekeeper script. 
- units: '' - value_type: int - zabbix_type: internal - - - key: housekeeper_errors - applications: - - Zabbix server - description: A simple count of the number of errors output by the housekeeper script. - units: '' - value_type: int - zabbix_type: internal - - - key: housekeeper_total - applications: - - Zabbix server - description: A simple count of the total number of lines output by the housekeeper - script. - units: '' - value_type: int - zabbix_type: internal - - - key: zabbix[process,alerter,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,configuration syncer,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,db watchdog,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,discoverer,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,escalator,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,history syncer,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,housekeeper,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,http poller,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,icmp pinger,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,ipmi poller,avg,busy] - applications: - - Zabbix server - description: '' - units: 
'%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,java poller,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,node watcher,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,poller,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,proxy poller,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,self-monitoring,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,snmp trapper,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,timer,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,trapper,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[process,unreachable poller,avg,busy] - applications: - - Zabbix server - description: '' - units: '%' - value_type: float - zabbix_type: internal - - - key: zabbix[queue,10m] - applications: - - Zabbix server - description: '' - units: '' - value_type: int - zabbix_type: internal - interval: 600 - - - key: zabbix[queue] - applications: - - Zabbix server - description: '' - units: '' - value_type: int - zabbix_type: internal - interval: 600 - - - key: zabbix[rcache,buffer,pfree] - applications: - - Zabbix server - description: '' - units: '' - value_type: float - zabbix_type: internal - - - key: zabbix[wcache,history,pfree] - applications: - - Zabbix server - description: '' - 
units: '' - value_type: float - zabbix_type: internal - - - key: zabbix[wcache,text,pfree] - applications: - - Zabbix server - description: '' - units: '' - value_type: float - zabbix_type: internal - - - key: zabbix[wcache,trend,pfree] - applications: - - Zabbix server - description: '' - units: '' - value_type: float - zabbix_type: internal - - - key: zabbix[wcache,values] - applications: - - Zabbix server - description: '' - units: '' - value_type: float - zabbix_type: internal - delta: 1 # speed per second - - ztriggers: - - description: "There has been unexpected output while running the housekeeping script\ - \ on the Zabbix. There are only three kinds of lines we expect to see in the output,\ - \ and we've gotten something enw.\r\n\r\nCheck the script's output in /var/lib/zabbix/state\ - \ for more details." - expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}+{Template App Zabbix Server:housekeeper_creates.last(0)}+{Template App Zabbix Server:housekeeper_drops.last(0)}<>{Template App Zabbix Server:housekeeper_total.last(0)}' - name: Unexpected output in Zabbix DB Housekeeping - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_DB_Housekeeping.asciidoc - - - description: An error has occurred during running the housekeeping script on the Zabbix. Check the script's output in /var/lib/zabbix/state for more details. 
- expression: '{Template App Zabbix Server:housekeeper_errors.last(0)}>0' - name: Errors during Zabbix DB Housekeeping - priority: high - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,alerter,avg,busy].min(600)}>75' - name: Zabbix alerter processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,configuration syncer,avg,busy].min(600)}>75' - name: Zabbix configuration syncer processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,db watchdog,avg,busy].min(600)}>75' - name: Zabbix db watchdog processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,discoverer,avg,busy].min(600)}>75' - name: Zabbix discoverer processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,escalator,avg,busy].min(600)}>75' - name: Zabbix escalator processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,history syncer,avg,busy].min(600)}>75' - name: Zabbix history syncer processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix 
Server:zabbix[process,housekeeper,avg,busy].min(1800)}>75' - name: Zabbix housekeeper processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,http poller,avg,busy].min(600)}>75' - name: Zabbix http poller processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,icmp pinger,avg,busy].min(600)}>75' - name: Zabbix icmp pinger processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,ipmi poller,avg,busy].min(600)}>75' - name: Zabbix ipmi poller processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,java poller,avg,busy].min(600)}>75' - name: Zabbix java poller processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,node watcher,avg,busy].min(600)}>75' - name: Zabbix node watcher processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,poller,avg,busy].min(600)}>75' - name: Zabbix poller processes more than 75% busy - priority: high - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,proxy 
poller,avg,busy].min(600)}>75' - name: Zabbix proxy poller processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,self-monitoring,avg,busy].min(600)}>75' - name: Zabbix self-monitoring processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,snmp trapper,avg,busy].min(600)}>75' - name: Zabbix snmp trapper processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: Timer processes usually are busy because they have to process time - based trigger functions - expression: '{Template App Zabbix Server:zabbix[process,timer,avg,busy].min(600)}>75' - name: Zabbix timer processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,trapper,avg,busy].min(600)}>75' - name: Zabbix trapper processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[process,unreachable poller,avg,busy].min(600)}>75' - name: Zabbix unreachable poller processes more than 75% busy - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/Zabbix_state_check.asciidoc - - - description: "This alert generally indicates a performance problem or a problem\ - \ with the zabbix-server or proxy.\r\n\r\nThe first place to check for issues\ - \ is Administration > Queue. Be sure to check the general view and the per-proxy\ - \ view." 
- expression: '{Template App Zabbix Server:zabbix[queue,10m].min(600)}>1000' - name: More than 1000 items having missing data for more than 10 minutes - priority: high - url: https://github.com/openshift/ops-sop/blob/master/Alerts/data_lost_overview_plugin.asciidoc - - - description: Consider increasing CacheSize in the zabbix_server.conf configuration - file - expression: '{Template App Zabbix Server:zabbix[rcache,buffer,pfree].min(600)}<5' - name: Less than 5% free in the configuration cache - priority: info - url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[wcache,history,pfree].min(600)}<25' - name: Less than 25% free in the history cache - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[wcache,text,pfree].min(600)}<25' - name: Less than 25% free in the text history cache - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc - - - description: '' - expression: '{Template App Zabbix Server:zabbix[wcache,trend,pfree].min(600)}<25' - name: Less than 25% free in the trends cache - priority: avg - url: https://github.com/openshift/ops-sop/blob/master/Alerts/check_cache.asciidoc diff --git a/roles/os_zabbix/vars/template_aws.yml b/roles/os_zabbix/vars/template_aws.yml deleted file mode 100644 index 57832a3fe..000000000 --- a/roles/os_zabbix/vars/template_aws.yml +++ /dev/null @@ -1,25 +0,0 @@ ---- -g_template_aws: - name: Template AWS - zdiscoveryrules: - - name: disc.aws - key: disc.aws - lifetime: 14 - description: "Dynamically register AWS bucket info" - - zitemprototypes: - - discoveryrule_key: disc.aws - name: "S3 bucket size (GB) [{#S3_BUCKET}]" - key: "disc.aws.size[{#S3_BUCKET}]" - value_type: int - description: "Size of S3 bucket" - applications: - - AWS - - - discoveryrule_key: disc.aws - 
name: "S3 bucket object count [{#S3_BUCKET}]" - key: "disc.aws.objects[{#S3_BUCKET}]" - value_type: int - description: "Objects in S3 bucket" - applications: - - AWS diff --git a/roles/os_zabbix/vars/template_config_loop.yml b/roles/os_zabbix/vars/template_config_loop.yml deleted file mode 100644 index 823da1868..000000000 --- a/roles/os_zabbix/vars/template_config_loop.yml +++ /dev/null @@ -1,14 +0,0 @@ ---- -g_template_config_loop: - name: Template Config Loop - zitems: - - key: config_loop.run.exit_code - applications: - - Config Loop - value_type: int - - ztriggers: - - name: 'config_loop.run.exit_code not zero on {HOST.NAME}' - expression: '{Template Config Loop:config_loop.run.exit_code.min(#2)}>0' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_config_loop.asciidoc' - priority: average diff --git a/roles/os_zabbix/vars/template_docker.yml b/roles/os_zabbix/vars/template_docker.yml deleted file mode 100644 index dd13e76f7..000000000 --- a/roles/os_zabbix/vars/template_docker.yml +++ /dev/null @@ -1,116 +0,0 @@ ---- -g_template_docker: - name: Template Docker - zitems: - - key: docker.ping - applications: - - Docker Daemon - value_type: int - - - key: docker.info_elapsed_ms - applications: - - Docker Daemon - value_type: int - - - key: docker.container.dns.resolution - applications: - - Docker Daemon - value_type: int - - - key: docker.container.existing.dns.resolution.failed - applications: - - Docker Daemon - value_type: int - - - key: docker.storage.is_loopback - applications: - - Docker Storage - value_type: int - - - key: docker.storage.data.space.total - applications: - - Docker Storage - value_type: float - - - key: docker.storage.data.space.used - applications: - - Docker Storage - value_type: float - - - key: docker.storage.data.space.available - applications: - - Docker Storage - value_type: float - - - key: docker.storage.data.space.percent_available - applications: - - Docker Storage - value_type: float - - - key: 
docker.storage.metadata.space.total - applications: - - Docker Storage - value_type: float - - - key: docker.storage.metadata.space.used - applications: - - Docker Storage - value_type: float - - - key: docker.storage.metadata.space.available - applications: - - Docker Storage - value_type: float - - - key: docker.storage.metadata.space.percent_available - applications: - - Docker Storage - value_type: float - ztriggers: - - name: 'docker.ping failed on {HOST.NAME}' - expression: '{Template Docker:docker.ping.max(#3)}<1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_ping.asciidoc' - priority: high - - # Re-enable for OpenShift 3.1.1 (https://bugzilla.redhat.com/show_bug.cgi?id=1292971#c6) - - name: 'docker.container.dns.resolution failed on {HOST.NAME}' - expression: '{Template Docker:docker.container.dns.resolution.min(#3)}>0' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_dns.asciidoc' - priority: average - status: disabled - - - name: 'docker.container.existing.dns.resolution.failed on {HOST.NAME}' - expression: '{Template Docker:docker.container.existing.dns.resolution.failed.min(#3)}>0' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_dns.asciidoc' - priority: average - - - name: 'Docker storage is using LOOPBACK on {HOST.NAME}' - expression: '{Template Docker:docker.storage.is_loopback.last()}<>0' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_loopback.asciidoc' - priority: high - - - name: 'Critically low docker storage data space on {HOST.NAME}' - expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.data.space.available.max(#3)}<5' # < 5% or < 5GB - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc' - priority: high - - - name: 'Critically low docker storage metadata space on {HOST.NAME}' - expression: '{Template 
Docker:docker.storage.metadata.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.005' # < 5% or < 5MB - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc' - priority: high - - # Put triggers that depend on other triggers here (deps must be created first) - - name: 'Low docker storage data space on {HOST.NAME}' - expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.data.space.available.max(#3)}<10' # < 10% or < 10GB - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc' - dependencies: - - 'Critically low docker storage data space on {HOST.NAME}' - priority: average - - - name: 'Low docker storage metadata space on {HOST.NAME}' - expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.01' # < 10% or < 10MB - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc' - dependencies: - - 'Critically low docker storage metadata space on {HOST.NAME}' - priority: average - diff --git a/roles/os_zabbix/vars/template_heartbeat.yml b/roles/os_zabbix/vars/template_heartbeat.yml deleted file mode 100644 index ec953c79b..000000000 --- a/roles/os_zabbix/vars/template_heartbeat.yml +++ /dev/null @@ -1,18 +0,0 @@ ---- -g_template_heartbeat: - name: Template Heartbeat - zitems: - - name: Heartbeat Ping - applications: - - Heartbeat - key: heartbeat.ping - ztriggers: - - name: 'Heartbeat.ping has failed on {HOST.NAME}' - expression: '{Template Heartbeat:heartbeat.ping.nodata(20m)}=1' - priority: avg - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_node_heartbeat.asciidoc' - - - name: 'Heartbeat.ping has failed (60 min) on {HOST.NAME}' - expression: '{Template Heartbeat:heartbeat.ping.nodata(60m)}=1' - priority: 
high - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_node_heartbeat.asciidoc' diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml deleted file mode 100644 index a38db9f65..000000000 --- a/roles/os_zabbix/vars/template_openshift_master.yml +++ /dev/null @@ -1,458 +0,0 @@ ---- -g_template_openshift_master: - name: Template Openshift Master - zitems: - - name: openshift.master.app.create - applications: - - Openshift Master - key: openshift.master.app.create - - - key: openshift.master.app.build.create - description: "check the app create with a build process" - value_type: int - applications: - - Openshift Master - - - key: openshift.master.app.create.time - description: "check the time it takes app create with a build process" - value_type: float - applications: - - Openshift Master - - - key: openshift.master.app.build.time - description: "check the time it takes app build" - value_type: float - applications: - - Openshift Master - - - key: openshift.master.process.count - description: Shows number of master processes running - value_type: int - applications: - - Openshift Master - - - key: openshift.master.api.ping - description: "Verify that the Openshift API is up (uses the cluster API URL)" - value_type: int - applications: - - Openshift Master - - - key: openshift.master.local.api.ping - description: "Verify that the Openshift API is up on the host (uses the API URL as the https://127.0.0.1)" - value_type: int - applications: - - Openshift Master - - - key: openshift.master.api.healthz - description: "Checks the healthz check of the master's api: https://<cluster_api_url>/healthz" - value_type: int - data_type: bool - applications: - - Openshift Master - - - key: openshift.master.local.api.healthz - description: "Checks the healthz check of the master's api: https://127.0.0.1/healthz" - value_type: int - data_type: bool - applications: - - Openshift Master - - - key: 
openshift.master.user.count - description: Shows number of users in a cluster - value_type: int - applications: - - Openshift Master - - - key: openshift.master.pod.running.count - description: Shows number of pods running - value_type: int - applications: - - Openshift Master - - - key: openshift.master.pod.user.running.count - description: Shows number of user pods running (non infrastructure pods) - value_type: int - applications: - - Openshift Master - - - key: openshift.master.pod.total.count - description: Shows total number of pods (running and non running) - value_type: int - applications: - - Openshift Master - - - key: openshift.master.node.count - description: Shows the total number of nodes found in the Openshift Cluster - value_type: int - applications: - - Openshift Master - - - key: openshift.project.count - description: Shows number of projects on a cluster - value_type: int - applications: - - Openshift Master - - - key: openshift.master.pv.space.total - description: Shows the total space of pv - value_type: int - applications: - - Openshift Master - - - key: openshift.master.pv.space.available - description: Shows the available space of pv - value_type: int - applications: - - Openshift Master - - - key: openshift.master.pv.total.count - description: Total number of Persistent Volumes in the Openshift Cluster - value_type: int - applications: - - Openshift Master - - - key: openshift.master.pv.available.count - description: Total number of Available Persistent Volumes in the Openshift Cluster - value_type: int - applications: - - Openshift Master - - - key: openshift.master.pv.released.count - description: Total number of Released Persistent Volumes in the Openshift Cluster - value_type: int - applications: - - Openshift Master - - - key: openshift.master.pv.bound.count - description: Total number of Bound Persistent Volumes in the Openshift Cluster - value_type: int - applications: - - Openshift Master - - - key: openshift.master.pv.failed.count 
- description: Total number of Failed Persistent Volumes in the Openshift Cluster - value_type: int - applications: - - Openshift Master - - - key: openshift.master.skydns.port.open - description: State of the SkyDNS port open and listening - value_type: int - applications: - - Openshift Master - - - key: openshift.master.skydns.query - description: SkyDNS can be queried or not - value_type: int - applications: - - Openshift Master - - - key: openshift.master.etcd.create.success - description: Show number of successful create actions - value_type: int - applications: - - Openshift Etcd - - - key: openshift.master.etcd.create.fail - description: Show number of failed create actions - value_type: int - applications: - - Openshift Etcd - - - key: openshift.master.etcd.delete.success - description: Show number of successful delete actions - value_type: int - applications: - - Openshift Etcd - - - key: openshift.master.etcd.delete.fail - description: Show number of failed delete actions - value_type: int - applications: - - Openshift Etcd - - - key: openshift.master.etcd.get.success - description: Show number of successful get actions - value_type: int - applications: - - Openshift Etcd - - - key: openshift.master.etcd.get.fail - description: Show number of failed get actions - value_type: int - applications: - - Openshift Etcd - - - key: openshift.master.etcd.set.success - description: Show number of successful set actions - value_type: int - applications: - - Openshift Etcd - - - key: openshift.master.etcd.set.fail - description: Show number of failed set actions - value_type: int - applications: - - Openshift Etcd - - - key: openshift.master.etcd.update.success - description: Show number of successful update actions - value_type: int - applications: - - Openshift Etcd - - - key: openshift.master.etcd.update.fail - description: Show number of failed update actions - value_type: int - applications: - - Openshift Etcd - - - key: openshift.master.etcd.watchers - 
description: Show number of etcd watchers - value_type: int - applications: - - Openshift Etcd - - - key: openshift.master.etcd.ping - description: etcd ping - value_type: int - applications: - - Openshift Etcd - - - key: openshift.master.metric.ping - description: "This check verifies that the https://master/metrics check is alive and communicating properly." - value_type: int - applications: - - Openshift Master Metrics - - - key: openshift.master.nodesnotready.count - description: "This check shows how many nodes in a cluster are in NotReady state." - value_type: int - applications: - - Openshift Master - - - key: openshift.master.nodesnotschedulable.count - description: "This check shows how many nodes in a cluster are not schedulable." - value_type: int - applications: - - Openshift Master - - - key: openshift.master.apiserver.latency.summary.pods.quantile.list.5 - description: "Value from https://master/metrics. This is the time, in milliseconds, that 50% of the pod LIST operations have taken to complete." - value_type: int - applications: - - Openshift Master Metrics - - - key: openshift.master.apiserver.latency.summary.pods.quantile.list.9 - description: "Value from https://master/metrics. This is the time, in milliseconds, that 90% of the pod LIST operations have taken to complete." - value_type: int - applications: - - Openshift Master Metrics - - - key: openshift.master.apiserver.latency.summary.pods.quantile.list.99 - description: "Value from https://master/metrics. This is the time, in milliseconds, that 99% of the pod LIST operations have taken to complete." - value_type: int - applications: - - Openshift Master Metrics - - - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5 - description: "Value from https://master/metrics. This is the time, in milliseconds, that 50% of the pod WATCHLIST operations have taken to complete."
- value_type: int - applications: - - Openshift Master Metrics - - - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9 - description: "Value from https://master/metrics. This is the time, in milliseconds, that 90% of the pod WATCHLIST operations have taken to complete." - value_type: int - applications: - - Openshift Master Metrics - - - key: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99 - description: "Value from https://master/metrics. This is the time, in milliseconds, that 99% of the pod WATCHLIST operations have taken to complete." - value_type: int - applications: - - Openshift Master Metrics - - - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.5 - description: "Value from https://master/metrics. This is the time, in milliseconds, that 50% of the end to end scheduling operations have taken to complete." - value_type: int - applications: - - Openshift Master Metrics - - - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.9 - description: "Value from https://master/metrics. This is the time, in milliseconds, that 90% of the end to end scheduling operations have taken to complete." - value_type: int - applications: - - Openshift Master Metrics - - - key: openshift.master.scheduler.e2e.scheduling.latency.quantile.99 - description: "Value from https://master/metrics. This is the time, in milliseconds, that 99% of the end to end scheduling operations have taken to complete."
- value_type: int - applications: - - Openshift Master Metrics - - zdiscoveryrules: - - name: disc.pv - key: disc.pv - lifetime: 1 - description: "Dynamically register the Persistent Volumes" - - zitemprototypes: - - discoveryrule_key: disc.pv - name: "disc.pv.count.{#OSO_PV}" - key: "disc.pv.count[{#OSO_PV}]" - value_type: int - description: "Number of PV's of this size" - applications: - - Openshift Master - - - discoveryrule_key: disc.pv - name: "disc.pv.available.{#OSO_PV}" - key: "disc.pv.available[{#OSO_PV}]" - value_type: int - description: "Number of PV's of this size that are available" - applications: - - Openshift Master - - ztriggers: - - name: 'Openshift Master process not running on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' - priority: high - - - name: 'Too many Openshift Master processes running on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' - priority: high - - - name: 'Etcd ping failed on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.etcd.ping.last(#1)}=0 and {Template Openshift Master:openshift.master.etcd.ping.last(#2)}=0' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_etcd.asciidoc' - priority: high - - - name: 'Number of users for Openshift Master on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.user.count.last()}=0' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' - priority: info - - - name: 'There are no projects running on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.project.count.last()}=0' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' - priority: info - - # 
Put triggers that depend on other triggers here (deps must be created first) - - name: 'Application creation has failed on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.app.create.last(#1)}=1 and {Template Openshift Master:openshift.master.app.create.last(#2)}=1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' - dependencies: - - 'Openshift Master process not running on {HOST.NAME}' - priority: avg - - - name: 'Application creation with build has failed on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.app.build.create.last(#1)}=1 and {Template Openshift Master:openshift.master.app.build.create.last(#2)}=1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' - dependencies: - - 'Openshift Master process not running on {HOST.NAME}' - priority: avg - - - name: 'Application creation has failed multiple times in the last hour on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.app.create.sum(1h)}>3' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' - dependencies: - - 'Openshift Master process not running on {HOST.NAME}' - description: The application create loop has failed 4 or more times in the last hour - priority: avg - - - name: 'Application with build creation has failed multiple times in the last 2 hour on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.app.build.create.sum(2h)}>3' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' - dependencies: - - 'Openshift Master process not running on {HOST.NAME}' - description: The application build create loop has failed 4 or more times in the last 2 hours - priority: avg - - - name: 'Openshift Master API health check is failing on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.api.healthz.max(#3)}<1' - url:
'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' - priority: high - - - name: 'Openshift Master Local API health check is failing on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.local.api.healthz.max(#3)}<1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' - dependencies: - - 'Openshift Master process not running on {HOST.NAME}' - priority: high - - - name: 'Openshift Master API PING check is failing on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.api.ping.max(#3)}<1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' - priority: high - - - name: 'Openshift Master Local API PING check is failing on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.local.api.ping.max(#3)}<1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' - dependencies: - - 'Openshift Master process not running on {HOST.NAME}' - priority: high - - - name: 'Openshift Master metric PING check is failing on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.metric.ping.max(#3)}<1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' - dependencies: - - 'Openshift Master process not running on {HOST.NAME}' - priority: avg - - - name: 'SkyDNS port not listening on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.skydns.port.open.max(#3)}<1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' - dependencies: - - 'Openshift Master process not running on {HOST.NAME}' - priority: high - - - name: 'SkyDNS query failed on {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.skydns.query.max(#3)}<1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc' - dependencies: - - 'Openshift 
Master API health check is failing on {HOST.NAME}' - priority: high - - - name: 'Hosts not ready according to {HOST.NAME}' - expression: '{Template Openshift Master:openshift.master.nodesnotready.count.last(#2)}>0' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc' - dependencies: - - 'Openshift Master process not running on {HOST.NAME}' - priority: high - - zgraphs: - - name: Openshift Master API Server Latency Pods LIST Quantiles - width: 900 - height: 200 - graph_items: - - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.5 - color: red - - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.9 - color: blue - - item_name: openshift.master.apiserver.latency.summary.pods.quantile.list.99 - color: orange - - - name: Openshift Master API Server Latency Pods WATCHLIST Quantiles - width: 900 - height: 200 - graph_items: - - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.5 - color: red - - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.9 - color: blue - - item_name: openshift.master.apiserver.latency.summary.pods.quantile.watchlist.99 - color: orange - - - name: Openshift Master Scheduler End to End Latency Quantiles - width: 900 - height: 200 - graph_items: - - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.5 - color: red - - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.9 - color: blue - - item_name: openshift.master.scheduler.e2e.scheduling.latency.quantile.99 - color: orange diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml deleted file mode 100644 index 9f84a2cdf..000000000 --- a/roles/os_zabbix/vars/template_openshift_node.yml +++ /dev/null @@ -1,70 +0,0 @@ ---- -g_template_openshift_node: - name: Template Openshift Node - zitems: - - key: openshift.node.process.count - description: Shows number of OpenShift 
Node processes running - value_type: int - applications: - - Openshift Node - - - key: openshift.node.ovs.pids.count - description: Shows number of ovs process ids running - value_type: int - applications: - - Openshift Node - - - key: openshift.node.ovs.ports.count - description: Shows number of OVS ports defined - value_type: int - applications: - - Openshift Node - - - key: openshift.node.ovs.stray.rules - description: Number of OVS stray rules found/removed - value_type: int - applications: - - Openshift Node - - - key: openshift.node.registry-pods.healthy_pct - description: Shows the percentage of healthy registries in the cluster - value_type: int - applications: - - Openshift Node - - - key: openshift.node.registry.service.ping - description: Ping docker-registry service from node - value_type: int - applications: - - Openshift Node - - ztriggers: - - name: 'One or more Docker Registries is unhealthy according to {HOST.NAME}' - expression: '{Template Openshift Node:openshift.node.registry-pods.healthy_pct.last(#2)}<100 and {Template Openshift Node:openshift.node.registry-pods.healthy_pct.last(#1)}<100' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc' - priority: avg - - - name: 'Docker Registry service is unhealthy according to {HOST.NAME}' - expression: '{Template Openshift Node:openshift.node.registry.service.ping.last(#2)}<1 and {Template Openshift Node:openshift.node.registry.service.ping.last(#1)}<1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_registry.asciidoc' - priority: avg - - - name: 'Openshift Node process not running on {HOST.NAME}' - expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1' - url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' - priority: high - - - name: 'Too many Openshift Node processes running on {HOST.NAME}' - expression: '{Template Openshift Node:openshift.node.process.count.min(#3)}>1' 
- url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' - priority: high - - - name: '[Heal] OVS may not be running on {HOST.NAME}' - expression: '{Template Openshift Node:openshift.node.ovs.pids.count.last(#1)}<>4 and {Template Openshift Node:openshift.node.ovs.pids.count.last(#2)}<>4' - url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' - priority: high - - - name: 'Number of OVS ports is 0 on {HOST.NAME}' - expression: '{Template Openshift Node:openshift.node.ovs.ports.count.last()}=0' - url: 'https://github.com/openshift/ops-sop/blob/node/V3/Alerts/openshift_node.asciidoc' - priority: high diff --git a/roles/os_zabbix/vars/template_ops_tools.yml b/roles/os_zabbix/vars/template_ops_tools.yml deleted file mode 100644 index a0a5a4d03..000000000 --- a/roles/os_zabbix/vars/template_ops_tools.yml +++ /dev/null @@ -1,54 +0,0 @@ ---- -g_template_ops_tools: - name: Template Operations Tools - zdiscoveryrules: - - name: disc.ops.runner - key: disc.ops.runner - lifetime: 1 - description: "Dynamically register operations runner items" - - zitemprototypes: - - discoveryrule_key: disc.ops.runner - name: "Exit code of ops-runner[{#OSO_COMMAND}]" - key: "disc.ops.runner.command.exitcode[{#OSO_COMMAND}]" - value_type: int - description: "The exit code of the command run from ops-runner" - applications: - - Ops Runner - - ztriggerprototypes: - - name: 'ops-runner[{#OSO_COMMAND}]: non-zero exit code on {HOST.NAME}' - expression: '{Template Operations Tools:disc.ops.runner.command.exitcode[{#OSO_COMMAND}].last()}<>0' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_ops_runner_command.asciidoc' - priority: average - - zactions: - - name: 'Remote command for [Heal] triggers' - status: enabled - escalation_time: 60 - conditions_filter: - calculation_type: "and/or" - conditions: - - conditiontype: maintenance status - operator: not in - - conditiontype: trigger name - operator: like - value: 
"[Heal]" - - conditiontype: trigger value - operator: "=" - value: PROBLEM - operations: - - esc_step_from: 1 - esc_step_to: 1 - esc_period: 0 - operationtype: remote command - opcommand: - command: 'ssh -i /etc/openshift_tools/scriptrunner_id_rsa {{ ozb_scriptrunner_user }}@{{ ozb_scriptrunner_bastion_host }} remote-healer --host \"{HOST.NAME}\" --trigger \"{TRIGGER.NAME}\" --trigger-val \"{TRIGGER.VALUE}\"' - execute_on: "zabbix server" - type: 'custom script' - target_hosts: - - target_type: 'zabbix server' - opconditions: - - conditiontype: 'event acknowledged' - operator: '=' - value: 'not acknowledged' diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml deleted file mode 100644 index c6e557f12..000000000 --- a/roles/os_zabbix/vars/template_os_linux.yml +++ /dev/null @@ -1,314 +0,0 @@ ---- -g_template_os_linux: - name: Template OS Linux - zitems: - - key: kernel.uname.sysname - applications: - - Kernel - value_type: string - - - key: kernel.all.cpu.wait.total - applications: - - Kernel - value_type: float - units: '%' - - - key: kernel.all.cpu.irq.hard - applications: - - Kernel - value_type: float - units: '%' - - - key: kernel.all.cpu.idle - applications: - - Kernel - value_type: float - units: '%' - - - key: kernel.uname.distro - applications: - - Kernel - value_type: string - - - key: kernel.uname.nodename - applications: - - Kernel - value_type: string - - - key: kernel.all.cpu.irq.soft - applications: - - Kernel - value_type: float - units: '%' - - - key: kernel.all.load.15_minute - applications: - - Kernel - value_type: float - - - key: kernel.all.cpu.sys - applications: - - Kernel - value_type: float - units: '%' - - - key: kernel.all.load.5_minute - applications: - - Kernel - value_type: float - - - key: kernel.all.cpu.nice - applications: - - Kernel - value_type: float - units: '%' - - - key: kernel.all.load.1_minute - applications: - - Kernel - value_type: float - - - key: kernel.uname.version - 
applications: - - Kernel - value_type: string - - - key: kernel.all.uptime - applications: - - Kernel - value_type: int - - - key: kernel.all.cpu.user - applications: - - Kernel - value_type: float - units: '%' - - - key: kernel.uname.machine - applications: - - Kernel - value_type: string - - - key: hinv.ncpu - applications: - - Kernel - value_type: int - - - key: kernel.all.cpu.steal - applications: - - Kernel - value_type: float - units: '%' - - - key: kernel.all.pswitch - applications: - - Kernel - value_type: int - - - key: kernel.uname.release - applications: - - Kernel - value_type: string - - - key: proc.nprocs - applications: - - Kernel - value_type: int - - # Memory Items - - key: mem.freemem - applications: - - Memory - value_type: int - description: "PCP: free system memory metric from /proc/meminfo" - multiplier: 1024 - units: B - - - key: mem.util.bufmem - applications: - - Memory - value_type: int - description: "PCP: Memory allocated for buffer_heads.; I/O buffers metric from /proc/meminfo" - multiplier: 1024 - units: B - - - key: swap.used - applications: - - Memory - value_type: int - description: "PCP: swap used metric from /proc/meminfo" - multiplier: 1024 - units: B - - - key: swap.length - applications: - - Memory - value_type: int - description: "PCP: total swap available metric from /proc/meminfo" - multiplier: 1024 - units: B - - - key: mem.physmem - applications: - - Memory - value_type: int - description: "PCP: The value of this metric corresponds to the \"MemTotal\" field reported by /proc/meminfo. Note that this does not necessarily correspond to actual installed physical memory - there may be areas of the physical address space mapped as ROM in various peripheral devices and the bios may be mirroring certain ROMs in RAM." 
- multiplier: 1024 - units: B - - - key: swap.free - applications: - - Memory - value_type: int - description: "PCP: swap free metric from /proc/meminfo" - multiplier: 1024 - units: B - - - key: mem.util.available - applications: - - Memory - value_type: int - description: "PCP: The amount of memory that is available for a new workload, without pushing the system into swap. Estimated from MemFree, Active(file), Inactive(file), and SReclaimable, as well as the \"low\" watermarks from /proc/zoneinfo.; available memory from /proc/meminfo" - multiplier: 1024 - units: B - - - key: mem.util.used - applications: - - Memory - value_type: int - description: "PCP: Used memory is the difference between mem.physmem and mem.freemem; used memory metric from /proc/meminfo" - multiplier: 1024 - units: B - - - key: mem.util.cached - applications: - - Memory - value_type: int - description: "PCP: Memory used by the page cache, including buffered file data. This is in-memory cache for files read from the disk (the pagecache) but doesn't include SwapCached.; page cache metric from /proc/meminfo" - multiplier: 1024 - units: B - - zdiscoveryrules: - - name: disc.filesys - key: disc.filesys - lifetime: 1 - description: "Dynamically register the filesystems" - - - name: disc.disk - key: disc.disk - lifetime: 1 - description: "Dynamically register disks on a node" - - - name: disc.network - key: disc.network - lifetime: 1 - description: "Dynamically register network interfaces on a node" - - zitemprototypes: - - discoveryrule_key: disc.filesys - name: "disc.filesys.full.{#OSO_FILESYS}" - key: "disc.filesys.full[{#OSO_FILESYS}]" - value_type: float - description: "PCP filesys.full option. 
This is the percent full returned from pcp filesys.full" - applications: - - Disk - - - discoveryrule_key: disc.filesys - name: "Percentage of used inodes on {#OSO_FILESYS}" - key: "disc.filesys.inodes.pused[{#OSO_FILESYS}]" - value_type: float - description: "PCP derived value of percentage of used inodes on a filesystem." - applications: - - Disk - - - discoveryrule_key: disc.disk - name: "TPS (IOPS) for disk {#OSO_DISK}" - key: "disc.disk.tps[{#OSO_DISK}]" - value_type: int - description: "PCP disk.dev.totals metric measured over a period of time. This shows how many disk transactions per second the disk is using" - applications: - - Disk - - - discoveryrule_key: disc.disk - name: "Percent Utilized for disk {#OSO_DISK}" - key: "disc.disk.putil[{#OSO_DISK}]" - value_type: float - description: "PCP disk.dev.avactive metric measured over a period of time. This is the '%util' in the iostat command" - applications: - - Disk - - - discoveryrule_key: disc.network - name: "Bytes per second IN on network interface {#OSO_NET_INTERFACE}" - key: "disc.network.in.bytes[{#OSO_NET_INTERFACE}]" - value_type: int - units: B - delta: 1 - description: "PCP network.interface.in.bytes metric. This is setup as a delta in Zabbix to measure the speed per second" - applications: - - Network - - - discoveryrule_key: disc.network - name: "Bytes per second OUT on network interface {#OSO_NET_INTERFACE}" - key: "disc.network.out.bytes[{#OSO_NET_INTERFACE}]" - value_type: int - units: B - delta: 1 - description: "PCP network.interface.out.bytes metric. 
This is setup as a delta in Zabbix to measure the speed per second" - applications: - - Network - - ztriggerprototypes: - - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}' - expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>90' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' - priority: high - - # This has a dependency on the previous trigger - # Trigger Prototypes do not work in 2.4. They will work in Zabbix 3.0 - - name: 'Filesystem: {#OSO_FILESYS} has less than 15% free disk space on {HOST.NAME}' - expression: '{Template OS Linux:disc.filesys.full[{#OSO_FILESYS}].last()}>85' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' - priority: warn - dependencies: - - 'Filesystem: {#OSO_FILESYS} has less than 10% free disk space on {HOST.NAME}' - - - name: 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}' - expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>95' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' - priority: high - - # This has a dependency on the previous trigger - # Trigger Prototypes do not work in 2.4. 
They will work in Zabbix 3.0 - - name: 'Filesystem: {#OSO_FILESYS} has less than 10% free inodes on {HOST.NAME}' - expression: '{Template OS Linux:disc.filesys.inodes.pused[{#OSO_FILESYS}].last()}>90' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' - priority: warn - dependencies: - - 'Filesystem: {#OSO_FILESYS} has less than 5% free inodes on {HOST.NAME}' - - ztriggers: - - name: 'Too many TOTAL processes on {HOST.NAME}' - expression: '{Template OS Linux:proc.nprocs.last()}>5000' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc' - priority: warn - - - name: 'Lack of available memory on {HOST.NAME}' - expression: '{Template OS Linux:mem.freemem.last()}<30720000' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_memory.asciidoc' - priority: warn - description: 'Alert on less than 30 MB of free memory. The threshold is 30720000 bytes (30000 KB x 1024).' - - # CPU Utilization # - - name: 'CPU idle less than 5% on {HOST.NAME}' - expression: '{Template OS Linux:kernel.all.cpu.idle.max(#5)}<5' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc' - priority: average - description: 'CPU is less than 5% idle' - - - name: 'CPU idle less than 10% on {HOST.NAME}' - expression: '{Template OS Linux:kernel.all.cpu.idle.max(#5)}<10' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_cpu_idle.asciidoc' - priority: average - description: 'CPU is less than 10% idle' - dependencies: - - 'CPU idle less than 5% on {HOST.NAME}' diff --git a/roles/os_zabbix/vars/template_performance_copilot.yml b/roles/os_zabbix/vars/template_performance_copilot.yml deleted file mode 100644 index b62fa0228..000000000 --- a/roles/os_zabbix/vars/template_performance_copilot.yml +++ /dev/null @@ -1,14 +0,0 @@ ---- -g_template_performance_copilot: - name: Template Performance Copilot - zitems: - - key: pcp.ping - applications: - - Performance Copilot -
value_type: int - - ztriggers: - - name: 'pcp.ping failed on {HOST.NAME}' - expression: '{Template Performance Copilot:pcp.ping.max(#3)}<1' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_pcp_ping.asciidoc' - priority: average diff --git a/roles/os_zabbix/vars/template_zagg_server.yml b/roles/os_zabbix/vars/template_zagg_server.yml deleted file mode 100644 index db5665993..000000000 --- a/roles/os_zabbix/vars/template_zagg_server.yml +++ /dev/null @@ -1,46 +0,0 @@ ---- -g_template_zagg_server: - name: Template Zagg Server - zitems: - - key: zagg.server.metrics.count - applications: - - Zagg Server - value_type: int - - - key: zagg.server.metrics.errors - applications: - - Zagg Server - value_type: int - - - key: zagg.server.heartbeat.errors - applications: - - Zagg Server - value_type: int - - - key: zagg.server.heartbeat.count - applications: - - Zagg Server - value_type: int - - ztriggers: - - name: 'Error processing metrics on {HOST.NAME}' - expression: '{Template Zagg Server:zagg.server.metrics.errors.min(#3)}>0' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc' - priority: average - - - name: 'Error processing heartbeats on {HOST.NAME}' - expression: '{Template Zagg Server:zagg.server.heartbeat.errors.min(#3)}>0' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc' - priority: average - - - name: 'Critically High number of metrics in Zagg queue {HOST.NAME}' - expression: '{Template Zagg Server:zagg.server.metrics.count.min(#3)}>10000' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc' - priority: high - - - name: 'High number of metrics in Zagg queue {HOST.NAME}' - expression: '{Template Zagg Server:zagg.server.metrics.count.min(#3)}>5000' - url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/zagg_server.asciidoc' - dependencies: - - 'Critically High number of metrics in Zagg queue {HOST.NAME}' - priority: 
average |