summary | refs | log | tree | commit | diff | stats
path: root/roles/ands_monitor
diff options
context:
space:
mode:
Diffstat (limited to 'roles/ands_monitor')
-rw-r--r-- roles/ands_monitor/tasks/main.yml 13
-rw-r--r-- roles/ands_monitor/templates/cron/maintain.j2 4
-rwxr-xr-x roles/ands_monitor/templates/scripts/check_server_status.sh.j2 11
-rwxr-xr-x roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2 9
-rwxr-xr-x roles/ands_monitor/templates/scripts/clean_rogue_interfaces.sh.j2 18
-rwxr-xr-x roles/ands_monitor/templates/scripts/list_containers.sh.j2 3
-rwxr-xr-x roles/ands_monitor/templates/scripts/maintain.sh.j2 8
7 files changed, 65 insertions, 1 deletions
diff --git a/roles/ands_monitor/tasks/main.yml b/roles/ands_monitor/tasks/main.yml
index ac70d28..8cac4ea 100644
--- a/roles/ands_monitor/tasks/main.yml
+++ b/roles/ands_monitor/tasks/main.yml
@@ -1,3 +1,8 @@
+- name: Install monitoring applications
+ package: name={{item}} state=present
+ with_items:
+ - sysstat
+
- name: Create scripts directory
file: path="{{ ands_script_path }}" state=directory
@@ -7,3 +12,11 @@
script_name: "{{ item | basename | regex_replace('\\.j2','') }}"
with_fileglob:
- "{{ role_path }}/templates/scripts/*.j2"
+
+
+- name: "Deploy cron jobs"
+ template: src="{{ item | quote }}" dest="/etc/cron.d/{{ cron_name }}" owner=root group=root mode=0644
+ vars:
+ cron_name: "{{ item | basename | regex_replace('\\.j2','') }}"
+ with_fileglob:
+ - "{{ role_path }}/templates/cron/*.j2"
diff --git a/roles/ands_monitor/templates/cron/maintain.j2 b/roles/ands_monitor/templates/cron/maintain.j2
new file mode 100644
index 0000000..2c3ce9c
--- /dev/null
+++ b/roles/ands_monitor/templates/cron/maintain.j2
@@ -0,0 +1,4 @@
+SHELL=/bin/bash
+PATH=/sbin:/bin:/usr/sbin:/usr/bin
+MAILTO=csa-darkserv@suren.me
+33 */4 * * * root {{ ands_script_path }}/maintain.sh
diff --git a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
index caa63ce..b02f031 100755
--- a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
+++ b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
@@ -32,3 +32,14 @@ if [ -z "$disks" -o "$disks" -ne 0 ]; then
echo "Not all disks are online:"
/opt/MegaRAID/storcli/storcli64 /c0 show | grep -P "(HDD|SSD)" | grep -v "Onln"
fi
+
+ifaces=$(ovs-vsctl show | grep -oP "could not open network device\s*\Kveth[a-f0-9]+" | wc -l)
+if [ "$ifaces" -gt 50 ]; then
+ echo "Too many rogue interfaces ($ifaces) is registered on OpenVSwitch bridge. It could introduce large delays in pod scheduling..."
+fi
+
+#Check various known problems
+vssize=$(du -sm /var/log/openvswitch/ovs-vswitchd.log | cut -f 1)
+if [ "$vssize" -gt 128 ]; then
+ echo "Current OpenVSwitch log is over $vssize MB. It could indicate some severe problems in pod networking..."
+fi
diff --git a/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2 b/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2
index 0602fcb..7acac5f 100755
--- a/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2
+++ b/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2
@@ -2,7 +2,14 @@
up=$(uptime | cut -d ' ' -f 4- | cut -d ',' -f 1 | sed -re 's/^\s*//')
load=$(uptime | cut -d ' ' -f 4- | cut -d ',' -f 4- | cut -d ':' -f 2 | cut -d ',' -f 3 | sed -re 's/^\s*//')
+#pods=$(oc get pods --all-namespaces -o wide | grep `hostname` | wc -l)
+containers=$(docker ps -q | wc -l)
+#processes=$(ps xa --no-headers | wc -l)
+mem=$(free -t -g | grep "Mem:" | sed -re 's/\s+/ /g' | cut -d ' ' -f 3)
+iops=$(iostat -d | grep -E "^sd" | awk '{s+=$2} END {print s}' | cut -d '.' -f 1)
+net=$(ifstat -n; sleep 0.1; ifstat | grep -E "^(enp|ib)" | awk '{s+=$4+$5} END {print s}'); net=$((net / 100))
disks=$(/opt/MegaRAID/storcli/storcli64 /c0 show | grep -P "(HDD|SSD)" | grep "Onln" | wc -l)
data=`df -lh /mnt/ands | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 4`
-echo -n "1 Up $up \${color gray}/ $disks disks, $data free, load: $load"
+#echo -n "1 Up $up \${color gray}/ $disks disks, $data free, load: $load, pods: $pods"
+echo -en "1 $up\${color gray}, ${disks}/${data}, $(printf %3u ${containers}) c - $(printf %4.1f ${load}), $(printf %3u ${mem}) GB, $(printf %4u ${iops}) IOPS, $(printf %3u ${net}) MB/s"
diff --git a/roles/ands_monitor/templates/scripts/clean_rogue_interfaces.sh.j2 b/roles/ands_monitor/templates/scripts/clean_rogue_interfaces.sh.j2
new file mode 100755
index 0000000..c04ce60
--- /dev/null
+++ b/roles/ands_monitor/templates/scripts/clean_rogue_interfaces.sh.j2
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+ifaces=$(ovs-vsctl show | grep -oP "could not open network device\s*\Kveth[a-f0-9]+")
+[ $? -eq 0 ] || exit
+
+#Find bridge
+#ovs-vsctl list-br
+
+for iface in $ifaces; do
+# echo "$iface"
+
+# Verify that interface is not active
+ ip link show | grep $iface &> /dev/null
+ [ $? -eq 0 ] && continue
+
+ echo "Removing: $iface"
+ ovs-vsctl del-port br0 $iface
+done
diff --git a/roles/ands_monitor/templates/scripts/list_containers.sh.j2 b/roles/ands_monitor/templates/scripts/list_containers.sh.j2
new file mode 100755
index 0000000..bac2884
--- /dev/null
+++ b/roles/ands_monitor/templates/scripts/list_containers.sh.j2
@@ -0,0 +1,3 @@
+#! /bin/bash
+
+kubectl get pods --all-namespaces -o jsonpath='{range .items[*]}{@.metadata.name}{" "}{@.spec.containers[*].image}{" - "}{@.status.containerStatuses[*].containerID}{"\n"}{end}'
diff --git a/roles/ands_monitor/templates/scripts/maintain.sh.j2 b/roles/ands_monitor/templates/scripts/maintain.sh.j2
new file mode 100755
index 0000000..45c9513
--- /dev/null
+++ b/roles/ands_monitor/templates/scripts/maintain.sh.j2
@@ -0,0 +1,8 @@
+#! /bin/bash
+
+# Left-over network interfaces on the OpenVSwitch bridge after pod termination
+ifaces=$(ovs-vsctl show | grep -oP "could not open network device\s*\Kveth[a-f0-9]+" | wc -l)
+if [ $ifaces -gt 25 ]; then
+ echo "Cleaning rogue interfaces ($ifaces) on $(hostname)"
+ {{ ands_script_path }}/clean_rogue_interfaces.sh > /dev/null
+fi