From 4175af7f92ad7357b83ceb56f2a6d42a8243cd80 Mon Sep 17 00:00:00 2001
From: "Suren A. Chilingaryan" <csa@suren.me>
Date: Mon, 29 Jul 2024 22:32:00 +0200
Subject: Update documentation & users

---
 docs/maintenance.txt        |  4 ++++
 docs/problems.txt           | 31 ++++++++++++++++++++++++++++++-
 docs/troubleshooting.txt    | 14 +++++++++++++-
 docs/webservices.txt        |  2 +-
 setup/configs/openshift.yml | 26 ++++++++++++++------------
 setup/users/htpasswd        |  3 +++
 6 files changed, 65 insertions(+), 15 deletions(-)

diff --git a/docs/maintenance.txt b/docs/maintenance.txt
index 9f52e18..c05b10f 100644
--- a/docs/maintenance.txt
+++ b/docs/maintenance.txt
@@ -53,3 +53,7 @@ Unused resources
         oc delete image sha256:04afd4d4a0481e1510f12d6d071f1dceddef27416eb922cf524a61281257c66e
     * Cleaning old dangling images using docker (on all nodes). Tried and as it seems caused no issues to the operation of the cluster.
         docker rmi $(docker images --filter "dangling=true" -q --no-trunc)
+
+ - Cleaning log files over-using inodes, etc.
+    * Volume log files
+        find  /var/lib/origin/openshift.local.volumes/plugins/kubernetes.io/ -name '*.log' -delete
diff --git a/docs/problems.txt b/docs/problems.txt
index 3b652ec..49137aa 100644
--- a/docs/problems.txt
+++ b/docs/problems.txt
@@ -170,4 +170,33 @@ Orphaning / pod termination problems in the logs
   Scenario:
     * Reported on long running pods with persistent volumes (katrin, adai-db)
     * Also seems an unrelated set of the problems.
-    
\ No newline at end of file
+    
+    
+Evicted Pods
+============
+ Pods are evicted if the node running the pod becomes unavailable or does not have enough resources to run the pod.
+ - It is possible to look up which resource is likely triggering the event with
+    > oc describe node ipekatrin2.ipe.kit.edu
+     Type                  Status  LastHeartbeatTime                       LastTransitionTime                      Reason                          Message
+     ----                  ------  -----------------                       ------------------                      ------                          -------
+     OutOfDisk             False   Tue, 05 Apr 2022 03:24:54 +0200         Tue, 21 Dec 2021 19:09:33 +0100         KubeletHasSufficientDisk        kubelet has sufficient disk space available
+     MemoryPressure        False   Tue, 05 Apr 2022 03:24:54 +0200         Tue, 21 Dec 2021 19:09:33 +0100         KubeletHasSufficientMemory      kubelet has sufficient memory available
+     DiskPressure          False   Tue, 05 Apr 2022 03:24:54 +0200         Mon, 04 Apr 2022 10:00:23 +0200         KubeletHasNoDiskPressure        kubelet has no disk pressure
+     Ready                 True    Tue, 05 Apr 2022 03:24:54 +0200         Tue, 21 Dec 2021 19:09:43 +0100         KubeletReady                    kubelet is posting ready status
+  The latest transition is 'DiskPressure', which happened on Apr 04. So, the disk is likely the issue.
+  
+ - DiskPressure eviction
+    * This might happen because a pod is writing too much output to the logs (standard output). These logs are stored under '/var/lib/origin/openshift.local.volumes/pods/...'
+    and, if they grow large, might use all the space in the '/var' file system. OpenShift does not rotate these logs and has no other mechanism to prevent large output from eventually
+    causing space issues. So, pods have to rate-limit output to stdout. Otherwise, we need to find the misbehaving pods which write too much...
+    * Another problem is 'inode' pressure. This can be checked with 'df' and anything above 80% is definitely a sign of a problem
+            df -i
+      The particular folder with lots of inodes can be found with the following command:
+            { find / -xdev -printf '%h\n' | sort | uniq -c | sort -k 1 -n; } 2>/dev/null
+      Likely there would be some openshift-related volume logs in '/var/lib/origin/openshift.local.volumes/plugins/kubernetes.io/'
+      Check particularly cronJob logs mounting volumes, e.g. various 'adei' stuff. Can be cleaned with
+            find  /var/lib/origin/openshift.local.volumes/plugins/kubernetes.io/ -name '*.log' -delete
+
+ - If a resource is unavailable for a long time, the node will become NotReady and all pods will be evicted. However, short-term problems caused by a pod itself are likely to cause eviction of
+ this particular pod only (once the pod is evicted, disk/memory space is reclaimed and logs are deleted). So, it is possible to find the problematic pod by looking at which pod was evicted most frequently.
+ 
diff --git a/docs/troubleshooting.txt b/docs/troubleshooting.txt
index 315f9f4..0621b25 100644
--- a/docs/troubleshooting.txt
+++ b/docs/troubleshooting.txt
@@ -151,8 +151,17 @@ nodes: domino failures
     * This might continue infinitely as one node is gets disconnected after another, pods get rescheduled, and process never stops
     * The only solution is to remove temporarily some pods, e.g. ADEI pods could be easily removed and, then, provivisioned back
 
-pods: very slow scheduling (normal start time in seconds range), failed pods, rogue namespaces, etc...
+pods: failed or very slow scheduling (normal start time in seconds range), failed pods, rogue namespaces, etc...
 ====
+  - LSDF mounts might cause pod-scheduling to fail
+    * It seems OpenShift tries to index (chown+chmod) files on mount and times out if the LSDF volume has too many small files...
+    * Reducing number of files with 'subPath' doesn't help here, but setting more specific 'networkPath' in pv helps
+    * Suggestion is to remove fsGroup from 'dc' definition, but it is added automatically if pods use network volumes,
+    setting volume 'gid' (cifs mount parameters specified in 'mountOptions' in pv definition) to match fsGroup doesn't help either
+    * Timeout seems to be fixed to 2m and is not configurable...
+    * Later versions of OpenShift have the 'fsGroupChangePolicy=OnRootMismatch' parameter, but it is not present in 3.9
+    => Honestly, the solution is unclear besides reducing the number of files or mounting a small share subset with few files
+
  - OpenShift has numerous problems with clean-up resources after the pods. The problems are more likely to happen on the 
  heavily loaded systems: cpu, io, interrputs, etc.
     * This may be indicated in the logs with various errors reporting inability to stop containers/processes, free network
@@ -450,3 +459,6 @@ Various
  - IPMI may cause problems as well. Particularly, the mounted CDrom may start complaining. Easiest is
  just to remove it from the running system with
      echo 1 > /sys/block/sdd/device/delete
+
+ - 'oc get scc' reports the server doesn't have a resource type "scc"
+    Delete (will be restarted) 'apiserver-*' pod in the 'kube-service-catalog' namespace
diff --git a/docs/webservices.txt b/docs/webservices.txt
index 0edfdeb..f8952ca 100644
--- a/docs/webservices.txt
+++ b/docs/webservices.txt
@@ -55,6 +55,6 @@ Updating/Generating certificates for the router
     * New 'router-certs' secret should be created in 'default' namespace. Probably it is better to 
     modify existing secret than delete/create. However, the strings can't just be copied. Easiest way 
     is to create a new secret in temporary namespace:
-        oc -n test secrets new router-certs tls.crt=kaas.pem tls.key=kaas.key
+        oc -n test secrets new router-certs-2022 tls.crt=kaas-chain-20220121.pem tls.key=kaas.key
     and then copy 'tls.crt' and 'tls.key' values over.
     * To reload secret, the 'router' pods should be deleted (and automatically re-created by rc).
diff --git a/setup/configs/openshift.yml b/setup/configs/openshift.yml
index 7ead691..767ac78 100644
--- a/setup/configs/openshift.yml
+++ b/setup/configs/openshift.yml
@@ -23,31 +23,33 @@ ands_openshift_users:
   pdv: { name: "IPE Administation Account" }
   katrin: { name: "KATRIN Project" }
   csa: { name: "Suren A. Chilingaryan", email: "csa@suren.me", uid: "1001", shell: "/bin/bash" }
+  jan: { name: "Jan Behrens", email: "jan.behrens@kit.edu" }
   kopmann: { name: "Andreas Kopmann", email: "kopmann@kit.edu" }
   ntj: { name: "Nicholas Tan Jerome", email: "nicholas.jerome@kit.edu" }
   jonasteufel: { name: "Jonas Teufel", email: "jonseb1998@gmail.com" }
   jalal: { name: "Jalal Mostafa", email: "jalal.mostapha@outlook.com" }
   gil: { name: "Woosik Gil", email: "gil@kit.edu" }
   jhar: { name: "Julius Hartmann", email: "julius.hartmann@kit.edu" }
-
+  bbieringer: { name: "Benedikt Bieringer", email: "benedikt.b@uni-muenster.de" }
 
 ands_openshift_roles:
-  cluster-admin: csa, jalal
-  kaas/admin: csa, jalal, kopmann
-  katrin/admin: katrin, jalal
-  status/admin: katrin, jalal
+  cluster-admin: csa, jalal, jan, ntj
+  kaas/admin: csa, jalal, kopmann, ntj, bbieringer
+  katrin/admin: katrin, jalal, jan, ntj, bbieringer
+  status/admin: katrin, jalal, ntj, bbieringer
   adei/admin: csa
-  adei/view: pdv, kopmann, jalal
-  adei/kaas-maintain: pdv, kopmann, jalal
+  adei/view: pdv, kopmann, jalal, ntj
+  adei/kaas-maintain: pdv, kopmann, jalal, ntj
   adai/admin: csa, kopmann, jalal
   bora/admin: csa, ntj, gil, jalal, katrin, kopmann
   epics/admin: csa, jalal
   wave/admin: csa, ntj
-  services/admin: csa, jalal, katrin
-  web/admin: kopmann, jonasteufel, jalal
-  mon/admin: csa, jalal
-  test/admin: csa, ntj, kopmann, katrin, jalal
-  jupyter/admin: csa, kopmann, jalal, jhar
+  services/admin: csa, jalal, katrin, jan, ntj, bbieringer
+  web/admin: kopmann, jonasteufel, jalal, ntj, bbieringer
+  mon/admin: csa, jalal, ntj
+  test/admin: csa, ntj, kopmann, katrin, jalal, jan
+  jupyter/admin: csa, kopmann, jalal, jhar, ntj
+  hpc/admin: csa, jalal, fawez
 
 ands_repos:
   docker: "{{ ands_info.git_url }}/ands-docker"
diff --git a/setup/users/htpasswd b/setup/users/htpasswd
index 4939761..5788f55 100644
--- a/setup/users/htpasswd
+++ b/setup/users/htpasswd
@@ -1,9 +1,12 @@
 pdv:$apr1$ACvj6uUa$Nm1Vq8hZq3RzTtaYpAHv01
 csa:$apr1$IqEwdnzy$UAdd8ZSFnXommBbj29w3c0
 katrin:$apr1$94lAgTxt$LVOWdwye92nsZVqVT7VaG1
+jan:$2y$05$RJqQQczSaO0sIiYLVI3CI.mKPIOvxnJUTYbUjyXqjJjLQJQinBlQu
 ntj:$apr1$G5/ThWdp$kFLsj/hO9jIYYP.Zab9kC/
 kopmann:$apr1$jU8jCdPh$u7ZUBiT3gzxlf1xPJl6FI.
 jonasteufel:$apr1$2dsiiZ1p$Us/5i8DEt9fxeliGy7L6h/
 jalal:$apr1$hwKRrL2x$RbtSQbfZZqPuvHL9YhCKp.
 gil:$apr1$p2khs49v$7poH4dUbTpCyhEO5JmgLx0
 jhar:$apr1$pDAXDbT4$r2f1SP5D71KplWZKLNi27.
+fawez:$apr1$yqROFhQ9$QdvOPFZ3zAbmtI9Dv53WU.
+bbieringer:$apr1$JqJxhUCR$5KfXdYvZzBFBbNm.mnrtd.
-- 
cgit v1.2.3