From 2f0bbb5781b270d88bccd5fcbe90723f9b3a5930 Mon Sep 17 00:00:00 2001 From: Matt Woodson Date: Thu, 10 Sep 2015 11:26:22 -0400 Subject: updated triggers and items to have better descriptions and multipliers --- roles/os_zabbix/vars/template_docker.yml | 12 +- roles/os_zabbix/vars/template_heartbeat.yml | 2 +- roles/os_zabbix/vars/template_openshift_master.yml | 2 +- roles/os_zabbix/vars/template_os_linux.yml | 122 ++++++++++++--------- 4 files changed, 81 insertions(+), 57 deletions(-) diff --git a/roles/os_zabbix/vars/template_docker.yml b/roles/os_zabbix/vars/template_docker.yml index a1cd3519e..395e054de 100644 --- a/roles/os_zabbix/vars/template_docker.yml +++ b/roles/os_zabbix/vars/template_docker.yml @@ -52,35 +52,35 @@ g_template_docker: - Docker Storage value_type: float ztriggers: - - description: 'docker.ping failed on {HOST.NAME}' + - name: 'docker.ping failed on {HOST.NAME}' expression: '{Template Docker:docker.ping.max(#3)}<1' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_ping.asciidoc' priority: high - - description: 'Docker storage is using LOOPBACK on {HOST.NAME}' + - name: 'Docker storage is using LOOPBACK on {HOST.NAME}' expression: '{Template Docker:docker.storage.is_loopback.last()}<>0' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_loopback.asciidoc' priority: high - - description: 'Critically low docker storage data space on {HOST.NAME}' + - name: 'Critically low docker storage data space on {HOST.NAME}' expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.data.space.available.max(#3)}<5' # < 5% or < 5GB url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc' priority: high - - description: 'Critically low docker storage metadata space on {HOST.NAME}' + - name: 'Critically low docker storage metadata space on {HOST.NAME}' expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<5 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.005' # < 5% or < 5MB url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc' priority: high # Put triggers that depend on other triggers here (deps must be created first) - - description: 'Low docker storage data space on {HOST.NAME}' + - name: 'Low docker storage data space on {HOST.NAME}' expression: '{Template Docker:docker.storage.data.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.data.space.available.max(#3)}<10' # < 10% or < 10GB url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc' dependencies: - 'Critically low docker storage data space on {HOST.NAME}' priority: average - - description: 'Low docker storage metadata space on {HOST.NAME}' + - name: 'Low docker storage metadata space on {HOST.NAME}' expression: '{Template Docker:docker.storage.metadata.space.percent_available.max(#3)}<10 or {Template Docker:docker.storage.metadata.space.available.max(#3)}<0.01' # < 10% or < 10MB url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_docker_storage.asciidoc' dependencies: diff --git a/roles/os_zabbix/vars/template_heartbeat.yml b/roles/os_zabbix/vars/template_heartbeat.yml index 798377cd9..8dbe0d0d6 100644 --- a/roles/os_zabbix/vars/template_heartbeat.yml +++ b/roles/os_zabbix/vars/template_heartbeat.yml @@ -7,7 +7,7 @@ g_template_heartbeat: - Heartbeat key: heartbeat.ping ztriggers: - - description: 'Heartbeat.ping has failed on {HOST.NAME}' + - name: 'Heartbeat.ping has failed on {HOST.NAME}' expression: '{Template Heartbeat:heartbeat.ping.nodata(20m)}=1' priority: avg url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_node_heartbeat.asciidoc' diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml index d2c1365b0..728423ac1 100644 --- a/roles/os_zabbix/vars/template_openshift_master.yml +++ b/roles/os_zabbix/vars/template_openshift_master.yml @@ -7,7 +7,7 @@ g_template_openshift_master: - Openshift Master key: create_app ztriggers: - - description: 'Application creation has failed on {HOST.NAME}' + - name: 'Application creation has failed on {HOST.NAME}' expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc' priority: avg diff --git a/roles/os_zabbix/vars/template_os_linux.yml b/roles/os_zabbix/vars/template_os_linux.yml index fad6af807..3173c79b2 100644 --- a/roles/os_zabbix/vars/template_os_linux.yml +++ b/roles/os_zabbix/vars/template_os_linux.yml @@ -52,112 +52,135 @@ g_template_os_linux: - Kernel value_type: float - - key: mem.freemem + - key: kernel.all.cpu.nice applications: - - Memory + - Kernel value_type: int - - key: kernel.all.cpu.nice + - key: kernel.all.load.1_minute applications: - Kernel - value_type: int + value_type: float - - key: mem.util.bufmem + - key: kernel.uname.version applications: - - Memory - value_type: int + - Kernel + value_type: string - - key: swap.used + - key: kernel.all.uptime applications: - - Memory + - Kernel value_type: int - - key: kernel.all.load.1_minute + - key: kernel.all.cpu.user applications: - Kernel - value_type: float + value_type: int - - key: kernel.uname.version + - key: kernel.uname.machine applications: - Kernel value_type: string - - key: swap.length + - key: hinv.ncpu applications: - - Memory + - Kernel value_type: int - - key: mem.physmem + - key: kernel.all.cpu.steal applications: - - Memory + - Kernel value_type: int - - key: kernel.all.uptime + - key: kernel.all.pswitch applications: - Kernel value_type: int - - key: swap.free + - key: kernel.uname.release applications: - - Memory - value_type: int + - Kernel + value_type: string - - key: mem.util.available + - key: proc.nprocs applications: - - Memory + - Kernel value_type: int - - key: mem.util.used + # Memory Items + - key: mem.freemem applications: - Memory value_type: int - description: used memory + description: "PCP: free system memory metric from /proc/meminfo" multiplier: 1024 units: B - - key: kernel.all.cpu.user + - key: mem.util.bufmem applications: - - Kernel + - Memory value_type: int + description: "PCP: Memory allocated for buffer_heads.; I/O buffers metric from /proc/meminfo" + multiplier: 1024 + units: B - - key: kernel.uname.machine + - key: swap.used applications: - - Kernel - value_type: string + - Memory + value_type: int + description: "PCP: swap used metric from /proc/meminfo" + multiplier: 1024 + units: B - - key: hinv.ncpu + - key: swap.length applications: - - Kernel + - Memory value_type: int + description: "PCP: total swap available metric from /proc/meminfo" + multiplier: 1024 + units: B - - key: mem.util.cached + - key: mem.physmem applications: - Memory value_type: int - description: cached memory + description: "PCP: The value of this metric corresponds to the \"MemTotal\" field reported by /proc/meminfo. Note that this does not necessarily correspond to actual installed physical memory - there may be areas of the physical address space mapped as ROM in various peripheral devices and the bios may be mirroring certain ROMs in RAM." multiplier: 1024 units: B - - key: kernel.all.cpu.steal + - key: swap.free applications: - - Kernel + - Memory value_type: int + description: "PCP: swap free metric from /proc/meminfo" + multiplier: 1024 + units: B - - key: kernel.all.pswitch + - key: mem.util.available applications: - - Kernel + - Memory value_type: int + description: "PCP: The amount of memory that is available for a new workload, without pushing the system into swap. Estimated from MemFree, Active(file), Inactive(file), and SReclaimable, as well as the \"low\" watermarks from /proc/zoneinfo.; available memory from /proc/meminfo" + multiplier: 1024 + units: B - - key: kernel.uname.release + - key: mem.util.used applications: - - Kernel - value_type: string + - Memory + value_type: int + description: "PCP: Used memory is the difference between mem.physmem and mem.freemem; used memory metric from /proc/meminfo" + multiplier: 1024 + units: B - - key: proc.nprocs + - key: mem.util.cached applications: - - Kernel + - Memory value_type: int + description: "PCP: Memory used by the page cache, including buffered file data. This is in-memory cache for files read from the disk (the pagecache) but doesn't include SwapCached.; page cache metric from /proc/meminfo" + multiplier: 1024 + units: B + # Disk items - key: filesys.full.xvda2 applications: - Disk @@ -169,32 +192,33 @@ g_template_os_linux: value_type: float ztriggers: - - description: 'Filesystem: / has less than 10% free on {HOST.NAME}' + - name: 'Filesystem: / has less than 10% free on {HOST.NAME}' expression: '{Template OS Linux:filesys.full.xvda2.last()}>90' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: warn - - description: 'Filesystem: / has less than 5% free on {HOST.NAME}' + - name: 'Filesystem: / has less than 5% free on {HOST.NAME}' expression: '{Template OS Linux:filesys.full.xvda2.last()}>95' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: high - - description: 'Filesystem: /var has less than 10% free on {HOST.NAME}' + - name: 'Filesystem: /var has less than 10% free on {HOST.NAME}' expression: '{Template OS Linux:filesys.full.xvda3.last()}>90' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: warn - - description: 'Filesystem: /var has less than 5% free on {HOST.NAME}' + - name: 'Filesystem: /var has less than 5% free on {HOST.NAME}' expression: '{Template OS Linux:filesys.full.xvda3.last()}>95' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_filesys_full.asciidoc' priority: high - - description: 'Too many TOTAL processes on {HOST.NAME}' + - name: 'Too many TOTAL processes on {HOST.NAME}' expression: '{Template OS Linux:proc.nprocs.last()}>5000' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_proc.asciidoc' priority: warn - - description: 'Lack of available memory on {HOST.NAME}' - expression: '{Template OS Linux:mem.freemem.last()}<3000' + - name: 'Lack of available memory on {HOST.NAME}' + expression: '{Template OS Linux:mem.freemem.last()}<30720000' url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_memory.asciidoc' priority: warn + description: 'Alert on less than 30MegaBytes. This is 30 Million Bytes. 30000 KB x 1024' -- cgit v1.2.3