summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorOpenShift Bot <eparis+openshiftbot@redhat.com>2017-06-06 16:52:47 -0400
committerGitHub <noreply@github.com>2017-06-06 16:52:47 -0400
commitf734724fbcac6bdd2e9bb9f69058385768ef0cd0 (patch)
tree2b410928549eaa43c4323adc703a44bdaf5a962e
parent67da7e2e4699f9a226a436218723749ebc14ace1 (diff)
parent2d8e3d2b28ce19569c76c56102e9639a6f26b0c2 (diff)
downloadopenshift-f734724fbcac6bdd2e9bb9f69058385768ef0cd0.tar.gz
openshift-f734724fbcac6bdd2e9bb9f69058385768ef0cd0.tar.bz2
openshift-f734724fbcac6bdd2e9bb9f69058385768ef0cd0.tar.xz
openshift-f734724fbcac6bdd2e9bb9f69058385768ef0cd0.zip
Merge pull request #4252 from sdodson/tolerate-node-upgrade-failure
Merged by openshift-bot
-rw-r--r--inventory/byo/hosts.origin.example29
-rw-r--r--inventory/byo/hosts.ose.example29
-rw-r--r--playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml4
-rw-r--r--playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml2
4 files changed, 61 insertions, 3 deletions
diff --git a/inventory/byo/hosts.origin.example b/inventory/byo/hosts.origin.example
index 206ec06c3..b2490638b 100644
--- a/inventory/byo/hosts.origin.example
+++ b/inventory/byo/hosts.origin.example
@@ -799,6 +799,35 @@ openshift_master_identity_providers=[{'name': 'htpasswd_auth', 'login': 'true',
#
#etcd_ca_default_days=1825
+# Upgrade Control
+#
+# By default nodes are upgraded in a serial manner one at a time and all failures
+# are fatal, one set of variables for normal nodes, one set of variables for
+# nodes that are part of control plane as the number of hosts may be different
+# in those two groups.
+#openshift_upgrade_nodes_serial=1
+#openshift_upgrade_nodes_max_fail_percentage=0
+#openshift_upgrade_control_plane_nodes_serial=1
+#openshift_upgrade_control_plane_nodes_max_fail_percentage=0
+#
+# You can specify the number of nodes to upgrade at once. We do not currently
+# attempt to verify that you have capacity to drain this many nodes at once
+# so please be careful when specifying these values. You should also verify that
+# the expected number of nodes are all schedulable and ready before starting an
+# upgrade. If it's not possible to drain the requested nodes the upgrade will
+# stall indefinitely until the drain is successful.
+#
+# If you're upgrading more than one node at a time you can specify the maximum
+# percentage of failure within the batch before the upgrade is aborted. Any
+# nodes that do fail are ignored for the rest of the playbook run and you should
+# take care to investigate the failure and return the node to service so that
+# your cluster.
+#
+# The percentage must exceed the value, this would fail on two failures
+# openshift_upgrade_nodes_serial=4 openshift_upgrade_nodes_max_fail_percentage=49
+# where as this would not
+# openshift_upgrade_nodes_serial=4 openshift_upgrade_nodes_max_fail_percentage=50
+
# host group for masters
[masters]
ose3-master[1:3]-ansible.test.example.com
diff --git a/inventory/byo/hosts.ose.example b/inventory/byo/hosts.ose.example
index 4f777c330..67d53b22d 100644
--- a/inventory/byo/hosts.ose.example
+++ b/inventory/byo/hosts.ose.example
@@ -795,6 +795,35 @@ openshift_master_identity_providers=[{'name': 'htpasswd_auth', 'login': 'true',
#
#etcd_ca_default_days=1825
+# Upgrade Control
+#
+# By default nodes are upgraded in a serial manner one at a time and all failures
+# are fatal, one set of variables for normal nodes, one set of variables for
+# nodes that are part of control plane as the number of hosts may be different
+# in those two groups.
+#openshift_upgrade_nodes_serial=1
+#openshift_upgrade_nodes_max_fail_percentage=0
+#openshift_upgrade_control_plane_nodes_serial=1
+#openshift_upgrade_control_plane_nodes_max_fail_percentage=0
+#
+# You can specify the number of nodes to upgrade at once. We do not currently
+# attempt to verify that you have capacity to drain this many nodes at once
+# so please be careful when specifying these values. You should also verify that
+# the expected number of nodes are all schedulable and ready before starting an
+# upgrade. If it's not possible to drain the requested nodes the upgrade will
+# stall indefinitely until the drain is successful.
+#
+# If you're upgrading more than one node at a time you can specify the maximum
+# percentage of failure within the batch before the upgrade is aborted. Any
+# nodes that do fail are ignored for the rest of the playbook run and you should
+# take care to investigate the failure and return the node to service so that
+# your cluster.
+#
+# The percentage must exceed the value, this would fail on two failures
+# openshift_upgrade_nodes_serial=4 openshift_upgrade_nodes_max_fail_percentage=49
+# where as this would not
+# openshift_upgrade_nodes_serial=4 openshift_upgrade_nodes_max_fail_percentage=50
+
# host group for masters
[masters]
ose3-master[1:3]-ansible.test.example.com
diff --git a/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml b/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
index e10c4c540..b980909eb 100644
--- a/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
+++ b/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
@@ -247,8 +247,8 @@
hosts: oo_masters_to_config:&oo_nodes_to_upgrade
# This var must be set with -e on invocation, as it is not a per-host inventory var
# and is evaluated early. Values such as "20%" can also be used.
- serial: "{{ openshift_upgrade_nodes_serial | default(1) }}"
- any_errors_fatal: true
+ serial: "{{ openshift_upgrade_control_plane_nodes_serial | default(1) }}"
+ max_fail_percentage: "{{ openshift_upgrade_control_plane_nodes_max_fail_percentage | default(0) }}"
pre_tasks:
- name: Load lib_openshift modules
diff --git a/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml b/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
index 4d455fe0a..91dbc2cd4 100644
--- a/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
+++ b/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
@@ -4,7 +4,7 @@
# This var must be set with -e on invocation, as it is not a per-host inventory var
# and is evaluated early. Values such as "20%" can also be used.
serial: "{{ openshift_upgrade_nodes_serial | default(1) }}"
- any_errors_fatal: true
+ max_fail_percentage: "{{ openshift_upgrade_nodes_max_fail_percentage | default(0) }}"
pre_tasks:
- name: Load lib_openshift modules