From 1bc6d4390661fe18bebbc020b2c7b25972e80b41 Mon Sep 17 00:00:00 2001 From: Dan Mace Date: Fri, 7 Oct 2016 16:28:17 -0400 Subject: Retry failed master startup once Master startup can fail when ec2 transparently reallocates the block storage, causing etcd writes to temporarily fail. Retry failures blindly just once to allow time for this transient condition to resolve and for systemd to restart the master (which will eventually succeed). https://github.com/coreos/etcd/issues/3864 https://github.com/openshift/origin/issues/6065 https://github.com/openshift/origin/issues/6447 --- roles/openshift_master/tasks/main.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'roles/openshift_master') diff --git a/roles/openshift_master/tasks/main.yml b/roles/openshift_master/tasks/main.yml index ce2f96723..645871ab4 100644 --- a/roles/openshift_master/tasks/main.yml +++ b/roles/openshift_master/tasks/main.yml @@ -168,10 +168,21 @@ - include: set_loopback_context.yml when: openshift.common.version_gte_3_2_or_1_2 +# TODO: Master startup can fail when ec2 transparently reallocates the block +# storage, causing etcd writes to temporarily fail. Retry failures blindly just +# once to allow time for this transient condition to resolve and for systemd +# to restart the master (which will eventually succeed). 
+# +# https://github.com/coreos/etcd/issues/3864 +# https://github.com/openshift/origin/issues/6065 +# https://github.com/openshift/origin/issues/6447 - name: Start and enable master service: name={{ openshift.common.service_type }}-master enabled=yes state=started when: not openshift_master_ha | bool register: start_result + until: not start_result | failed + retries: 1 + delay: 60 notify: Verify API Server - name: Check for non-HA master service presence -- cgit v1.2.3 From ebef73fe94a0329011ca35ba33272789826ce282 Mon Sep 17 00:00:00 2001 From: Scott Dodson Date: Mon, 10 Oct 2016 09:43:15 -0400 Subject: Apply same pattern to HA master services --- roles/openshift_master/tasks/main.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'roles/openshift_master') diff --git a/roles/openshift_master/tasks/main.yml b/roles/openshift_master/tasks/main.yml index 645871ab4..1a59717c7 100644 --- a/roles/openshift_master/tasks/main.yml +++ b/roles/openshift_master/tasks/main.yml @@ -213,6 +213,9 @@ state: started when: openshift_master_ha | bool and openshift.master.cluster_method == 'native' and inventory_hostname == openshift_master_hosts[0] register: start_result + until: not start_result | failed + retries: 1 + delay: 60 - set_fact: master_api_service_status_changed: "{{ start_result | changed }}" @@ -229,6 +232,9 @@ state: started when: openshift_master_ha | bool and openshift.master.cluster_method == 'native' and inventory_hostname != openshift_master_hosts[0] register: start_result + until: not start_result | failed + retries: 1 + delay: 60 - set_fact: master_api_service_status_changed: "{{ start_result | changed }}" @@ -262,6 +268,9 @@ state: started when: openshift_master_ha | bool and openshift.master.cluster_method == 'native' and inventory_hostname == openshift_master_hosts[0] register: start_result + until: not start_result | failed + retries: 1 + delay: 60 - pause: seconds: 15 @@ -274,6 +283,9 @@ state: started when: openshift_master_ha | bool and 
openshift.master.cluster_method == 'native' and inventory_hostname != openshift_master_hosts[0] register: start_result + until: not start_result | failed + retries: 1 + delay: 60 - set_fact: master_controllers_service_status_changed: "{{ start_result | changed }}" -- cgit v1.2.3