From 4a38bc1bbcb743d94a481c539ccd723efe436da0 Mon Sep 17 00:00:00 2001 From: Steve Milner Date: Thu, 6 Jul 2017 10:33:27 -0400 Subject: Test docker restart with retries 3 delay 30 --- playbooks/adhoc/uninstall.yml | 4 ++++ playbooks/common/openshift-cluster/upgrades/docker/restart.yml | 4 ++++ playbooks/common/openshift-node/restart.yml | 4 ++++ roles/calico/handlers/main.yml | 4 ++++ roles/contiv/tasks/netplugin.yml | 4 ++++ roles/docker/handlers/main.yml | 2 +- roles/flannel/handlers/main.yml | 4 ++++ roles/openshift_node_certificates/handlers/main.yml | 4 ++++ roles/openshift_node_upgrade/tasks/restart.yml | 2 +- 9 files changed, 30 insertions(+), 2 deletions(-) diff --git a/playbooks/adhoc/uninstall.yml b/playbooks/adhoc/uninstall.yml index a1f541712..58b3a7835 100644 --- a/playbooks/adhoc/uninstall.yml +++ b/playbooks/adhoc/uninstall.yml @@ -325,6 +325,10 @@ service: name=docker state=restarted failed_when: false when: not (container_engine | changed) + register: l_docker_restart_docker_in_pb_result + until: not l_docker_restart_docker_in_pb_result | failed + retries: 3 + delay: 30 - hosts: masters become: yes diff --git a/playbooks/common/openshift-cluster/upgrades/docker/restart.yml b/playbooks/common/openshift-cluster/upgrades/docker/restart.yml index 96c729d79..13313377e 100644 --- a/playbooks/common/openshift-cluster/upgrades/docker/restart.yml +++ b/playbooks/common/openshift-cluster/upgrades/docker/restart.yml @@ -1,6 +1,10 @@ --- - name: Restart docker service: name=docker state=restarted + register: l_docker_restart_docker_in_upgrade_result + until: not l_docker_restart_docker_in_upgrade_result | failed + retries: 3 + delay: 30 - name: Update docker facts openshift_facts: diff --git a/playbooks/common/openshift-node/restart.yml b/playbooks/common/openshift-node/restart.yml index 63273cb78..ed2473a43 100644 --- a/playbooks/common/openshift-node/restart.yml +++ b/playbooks/common/openshift-node/restart.yml @@ -11,6 +11,10 @@ service: name: docker state: restarted + register: l_docker_restart_docker_in_node_result + until: not l_docker_restart_docker_in_node_result | failed + retries: 3 + delay: 30 - name: Update docker facts openshift_facts: diff --git a/roles/calico/handlers/main.yml b/roles/calico/handlers/main.yml index 53cecfcc3..67fc0065f 100644 --- a/roles/calico/handlers/main.yml +++ b/roles/calico/handlers/main.yml @@ -8,3 +8,7 @@ systemd: name: "{{ openshift.docker.service_name }}" state: restarted + register: l_docker_restart_docker_in_calico_result + until: not l_docker_restart_docker_in_calico_result | failed + retries: 3 + delay: 30 diff --git a/roles/contiv/tasks/netplugin.yml b/roles/contiv/tasks/netplugin.yml index 0847c92bc..e861a2591 100644 --- a/roles/contiv/tasks/netplugin.yml +++ b/roles/contiv/tasks/netplugin.yml @@ -108,6 +108,10 @@ name: "{{ openshift.docker.service_name }}" state: restarted when: docker_updated|changed + register: l_docker_restart_docker_in_contiv_result + until: not l_docker_restart_docker_in_contiv_result | failed + retries: 3 + delay: 30 - name: Netplugin | Enable Netplugin service: diff --git a/roles/docker/handlers/main.yml b/roles/docker/handlers/main.yml index 3a4f4ba92..6745f4302 100644 --- a/roles/docker/handlers/main.yml +++ b/roles/docker/handlers/main.yml @@ -6,7 +6,7 @@ state: restarted register: r_docker_restart_docker_result until: not r_docker_restart_docker_result | failed - retries: 1 + retries: 3 delay: 30 when: not docker_service_status_changed | default(false) | bool diff --git a/roles/flannel/handlers/main.yml b/roles/flannel/handlers/main.yml index c60c2115a..02f5a5f64 100644 --- a/roles/flannel/handlers/main.yml +++ b/roles/flannel/handlers/main.yml @@ -8,3 +8,7 @@ systemd: name: "{{ openshift.docker.service_name }}" state: restarted + register: l_docker_restart_docker_in_flannel_result + until: not l_docker_restart_docker_in_flannel_result | failed + retries: 3 + delay: 30 diff --git a/roles/openshift_node_certificates/handlers/main.yml b/roles/openshift_node_certificates/handlers/main.yml index 502f80434..4abe8bcaf 100644 --- a/roles/openshift_node_certificates/handlers/main.yml +++ b/roles/openshift_node_certificates/handlers/main.yml @@ -9,3 +9,7 @@ name: "{{ openshift.docker.service_name }}" state: restarted when: not openshift_certificates_redeploy | default(false) | bool + register: l_docker_restart_docker_in_cert_result + until: not l_docker_restart_docker_in_cert_result | failed + retries: 3 + delay: 30 diff --git a/roles/openshift_node_upgrade/tasks/restart.yml b/roles/openshift_node_upgrade/tasks/restart.yml index 6947223af..f228b6e08 100644 --- a/roles/openshift_node_upgrade/tasks/restart.yml +++ b/roles/openshift_node_upgrade/tasks/restart.yml @@ -19,7 +19,7 @@ state: started register: docker_start_result until: not docker_start_result | failed - retries: 1 + retries: 3 delay: 30 - name: Update docker facts -- cgit v1.2.3 From 36a7c5bf5c69511020c516a33ca6b3e57aff485d Mon Sep 17 00:00:00 2001 From: Scott Dodson Date: Thu, 6 Jul 2017 11:28:26 -0400 Subject: Add retries to node restart handlers --- roles/openshift_node/handlers/main.yml | 8 +++++++- roles/openshift_node_upgrade/handlers/main.yml | 12 ++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/roles/openshift_node/handlers/main.yml b/roles/openshift_node/handlers/main.yml index a6bd12d4e..0d4af9f53 100644 --- a/roles/openshift_node/handlers/main.yml +++ b/roles/openshift_node/handlers/main.yml @@ -15,7 +15,13 @@ systemd: name: "{{ openshift.common.service_type }}-node" state: restarted - when: (not skip_node_svc_handlers | default(False) | bool) and not (node_service_status_changed | default(false) | bool) + register: l_openshift_node_restart_node_result + until: not l_openshift_node_restart_node_result | failed + retries: 3 + delay: 30 + when: + - (not skip_node_svc_handlers | default(False) | bool) + - not (node_service_status_changed | default(false) | bool) - name: reload sysctl.conf command: /sbin/sysctl -p diff --git a/roles/openshift_node_upgrade/handlers/main.yml b/roles/openshift_node_upgrade/handlers/main.yml index cb51416d4..c636f6fa3 100644 --- a/roles/openshift_node_upgrade/handlers/main.yml +++ b/roles/openshift_node_upgrade/handlers/main.yml @@ -10,5 +10,13 @@ when: (not skip_node_svc_handlers | default(False) | bool) and openshift.common.is_containerized | bool - name: restart node - systemd: name={{ openshift.common.service_type }}-node state=restarted - when: (not skip_node_svc_handlers | default(False) | bool) and not (node_service_status_changed | default(false) | bool) + systemd: + name: "{{ openshift.common.service_type }}-node" + state: restarted + register: l_openshift_node_upgrade_restart_node_result + until: not l_openshift_node_upgrade_restart_node_result | failed + retries: 3 + delay: 30 + when: + - (not skip_node_svc_handlers | default(False) | bool) + - not (node_service_status_changed | default(false) | bool) -- cgit v1.2.3 From b98dd4f1ac8582dbdee70128151f7e14c68c9b74 Mon Sep 17 00:00:00 2001 From: Scott Dodson Date: Thu, 6 Jul 2017 12:13:38 -0400 Subject: Wrap docker stop in retries --- playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml | 8 +++++++- roles/openshift_node_upgrade/tasks/docker/upgrade.yml | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml b/playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml index 17f8fc6e9..35d000e49 100644 --- a/playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml +++ b/playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml @@ -32,7 +32,13 @@ - debug: var=docker_image_count.stdout when: docker_upgrade_nuke_images is defined and docker_upgrade_nuke_images | bool -- service: name=docker state=stopped +- service: + name: docker + state: stopped + register: l_pb_docker_upgrade_stop_result + until: not l_pb_docker_upgrade_stop_result | failed + retries: 3 + delay: 30 - name: Upgrade Docker package: name=docker{{ '-' + docker_version }} state=present diff --git a/roles/openshift_node_upgrade/tasks/docker/upgrade.yml b/roles/openshift_node_upgrade/tasks/docker/upgrade.yml index 416cf605a..ebe87d6fd 100644 --- a/roles/openshift_node_upgrade/tasks/docker/upgrade.yml +++ b/roles/openshift_node_upgrade/tasks/docker/upgrade.yml @@ -26,7 +26,13 @@ - debug: var=docker_image_count.stdout when: docker_upgrade_nuke_images is defined and docker_upgrade_nuke_images | bool -- service: name=docker state=stopped +- service: + name: docker + state: stopped + register: l_openshift_node_upgrade_docker_stop_result + until: not l_openshift_node_upgrade_docker_stop_result | failed + retries: 3 + delay: 30 - name: Upgrade Docker package: name=docker{{ '-' + docker_version }} state=present -- cgit v1.2.3 From a05fbeb6135864fedfb648644b06702ee1afea68 Mon Sep 17 00:00:00 2001 From: Scott Dodson Date: Mon, 10 Jul 2017 13:47:51 -0400 Subject: Wrap additional service changes in retries --- roles/docker/handlers/main.yml | 1 - roles/docker/tasks/package_docker.yml | 7 +++++-- roles/docker/tasks/systemcontainer_docker.yml | 12 ++++++++++-- roles/openshift_node/handlers/main.yml | 5 +++++ roles/openshift_node/tasks/main.yml | 3 +++ roles/openshift_node_upgrade/handlers/main.yml | 8 +++++++- 6 files changed, 30 insertions(+), 6 deletions(-) diff --git a/roles/docker/handlers/main.yml b/roles/docker/handlers/main.yml index 6745f4302..591367467 100644 --- a/roles/docker/handlers/main.yml +++ b/roles/docker/handlers/main.yml @@ -8,7 +8,6 @@ until: not r_docker_restart_docker_result | failed retries: 3 delay: 30 - when: not docker_service_status_changed | default(false) | bool - name: restart udev diff --git a/roles/docker/tasks/package_docker.yml b/roles/docker/tasks/package_docker.yml index c82d8659a..18e3fe749 100644 --- a/roles/docker/tasks/package_docker.yml +++ b/roles/docker/tasks/package_docker.yml @@ -123,9 +123,12 @@ enabled: yes state: started daemon_reload: yes - register: start_result + register: r_docker_package_docker_start_result + until: not r_docker_package_docker_start_result | failed + retries: 3 + delay: 30 - set_fact: - docker_service_status_changed: start_result | changed + docker_service_status_changed: r_docker_package_docker_start_result | changed - meta: flush_handlers diff --git a/roles/docker/tasks/systemcontainer_docker.yml b/roles/docker/tasks/systemcontainer_docker.yml index d8c5ccfd3..7e861cb8e 100644 --- a/roles/docker/tasks/systemcontainer_docker.yml +++ b/roles/docker/tasks/systemcontainer_docker.yml @@ -46,6 +46,11 @@ state: stopped daemon_reload: yes ignore_errors: True + register: r_docker_systemcontainer_docker_stop_result + until: not r_docker_systemcontainer_docker_stop_result | failed + retries: 3 + delay: 30 + # Set http_proxy, https_proxy, and no_proxy in /etc/atomic.conf # regexp: the line starts with or without #, followed by the string @@ -160,9 +165,12 @@ enabled: yes state: started daemon_reload: yes - register: start_result + register: r_docker_systemcontainer_docker_start_result + until: not r_docker_systemcontainer_docker_start_result | failed + retries: 3 + delay: 30 - set_fact: - docker_service_status_changed: start_result | changed + docker_service_status_changed: r_docker_systemcontainer_docker_start_result | changed - meta: flush_handlers diff --git a/roles/openshift_node/handlers/main.yml b/roles/openshift_node/handlers/main.yml index 0d4af9f53..6b38da7f8 100644 --- a/roles/openshift_node/handlers/main.yml +++ b/roles/openshift_node/handlers/main.yml @@ -4,9 +4,14 @@ name: openvswitch state: restarted when: (not skip_node_svc_handlers | default(False) | bool) and not (ovs_service_status_changed | default(false) | bool) and openshift.common.use_openshift_sdn | bool + register: l_openshift_node_stop_openvswitch_result + until: not l_openshift_node_stop_openvswitch_result | failed + retries: 3 + delay: 30 notify: - restart openvswitch pause + - name: restart openvswitch pause pause: seconds=15 when: (not skip_node_svc_handlers | default(False) | bool) and openshift.common.is_containerized | bool diff --git a/roles/openshift_node/tasks/main.yml b/roles/openshift_node/tasks/main.yml index 573051504..dd47d643b 100644 --- a/roles/openshift_node/tasks/main.yml +++ b/roles/openshift_node/tasks/main.yml @@ -120,6 +120,9 @@ state: started when: openshift.common.is_containerized | bool and openshift.common.use_openshift_sdn | bool register: ovs_start_result + until: not ovs_start_result | failed + retries: 3 + delay: 30 - set_fact: ovs_service_status_changed: "{{ ovs_start_result | changed }}" diff --git a/roles/openshift_node_upgrade/handlers/main.yml b/roles/openshift_node_upgrade/handlers/main.yml index c636f6fa3..110dfe5ce 100644 --- a/roles/openshift_node_upgrade/handlers/main.yml +++ b/roles/openshift_node_upgrade/handlers/main.yml @@ -1,7 +1,13 @@ --- - name: restart openvswitch - systemd: name=openvswitch state=restarted + systemd: + name: openvswitch + state: restarted when: (not skip_node_svc_handlers | default(False) | bool) and not (ovs_service_status_changed | default(false) | bool) and openshift.common.use_openshift_sdn | bool + register: l_openshift_node_upgrade_stop_openvswitch_result + until: not l_openshift_node_upgrade_stop_openvswitch_result | failed + retries: 3 + delay: 30 notify: - restart openvswitch pause -- cgit v1.2.3 From 2c369ea3f7c5e87b84944296ad1e3ae2898f934f Mon Sep 17 00:00:00 2001 From: Scott Dodson Date: Wed, 12 Jul 2017 15:56:32 -0400 Subject: Ensure proper fact evaluation --- roles/docker/tasks/package_docker.yml | 2 +- roles/docker/tasks/systemcontainer_docker.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/roles/docker/tasks/package_docker.yml b/roles/docker/tasks/package_docker.yml index 18e3fe749..5f536005d 100644 --- a/roles/docker/tasks/package_docker.yml +++ b/roles/docker/tasks/package_docker.yml @@ -129,6 +129,6 @@ delay: 30 - set_fact: - docker_service_status_changed: r_docker_package_docker_start_result | changed + docker_service_status_changed: "{{ r_docker_package_docker_start_result | changed }}" - meta: flush_handlers diff --git a/roles/docker/tasks/systemcontainer_docker.yml b/roles/docker/tasks/systemcontainer_docker.yml index 7e861cb8e..57a84bc2c 100644 --- a/roles/docker/tasks/systemcontainer_docker.yml +++ b/roles/docker/tasks/systemcontainer_docker.yml @@ -171,6 +171,6 @@ delay: 30 - set_fact: - docker_service_status_changed: r_docker_systemcontainer_docker_start_result | changed + docker_service_status_changed: "{{ r_docker_systemcontainer_docker_start_result | changed }}" - meta: flush_handlers -- cgit v1.2.3 From 4c8f1c1269aa8fa527816ad63a295f8863d8a6f8 Mon Sep 17 00:00:00 2001 From: Scott Dodson Date: Fri, 14 Jul 2017 16:01:13 -0400 Subject: daemon_reload on node and ovs start At least in my smoke testing of a containerized install i had to manually reload systemd --- roles/openshift_node/tasks/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/roles/openshift_node/tasks/main.yml b/roles/openshift_node/tasks/main.yml index dd47d643b..05721c882 100644 --- a/roles/openshift_node/tasks/main.yml +++ b/roles/openshift_node/tasks/main.yml @@ -118,6 +118,7 @@ name: openvswitch.service enabled: yes state: started + daemon_reload: yes when: openshift.common.is_containerized | bool and openshift.common.use_openshift_sdn | bool register: ovs_start_result until: not ovs_start_result | failed @@ -220,6 +221,7 @@ name: "{{ openshift.common.service_type }}-node" enabled: yes state: started + daemon_reload: yes register: node_start_result until: not node_start_result | failed retries: 1 -- cgit v1.2.3 From 44fb8d5d9825bd5a708062dfe371763566d014e7 Mon Sep 17 00:00:00 2001 From: Scott Dodson Date: Tue, 18 Jul 2017 08:19:27 -0400 Subject: Dump some logs --- roles/openshift_node/tasks/main.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/roles/openshift_node/tasks/main.yml b/roles/openshift_node/tasks/main.yml index 05721c882..879f6c207 100644 --- a/roles/openshift_node/tasks/main.yml +++ b/roles/openshift_node/tasks/main.yml @@ -216,6 +216,7 @@ state: started when: openshift.common.is_containerized | bool + - name: Start and enable node systemd: name: "{{ openshift.common.service_type }}-node" @@ -226,6 +227,16 @@ until: not node_start_result | failed retries: 1 delay: 30 + ignore_errors: true + +- name: Dump logs from node service if it failed + command: journalctl --no-pager -n 100 {{ openshift.common.service_type }}-node + when: node_start_result | failed + +- name: Abort if node failed to start + fail: + msg: Node failed to start please inspect the logs and try again + when: node_start_result | failed - set_fact: node_service_status_changed: "{{ node_start_result | changed }}" -- cgit v1.2.3 From 5a94fe5b074d01a3b16db8a05c47c31e484e5ebe Mon Sep 17 00:00:00 2001 From: Scott Dodson Date: Fri, 14 Jul 2017 12:13:25 -0400 Subject: Add drain retries after 60 second delay --- .../common/openshift-cluster/upgrades/docker/docker_upgrade.yml | 4 ++++ .../common/openshift-cluster/upgrades/upgrade_control_plane.yml | 4 ++++ playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml | 4 ++++ roles/openshift_node_upgrade/README.md | 5 +++++ 4 files changed, 17 insertions(+) diff --git a/playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml b/playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml index 07db071ce..1ed9041d4 100644 --- a/playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml +++ b/playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml @@ -55,6 +55,10 @@ {{ openshift.common.admin_binary }} drain {{ openshift.node.nodename }} --force --delete-local-data --ignore-daemonsets delegate_to: "{{ groups.oo_first_master.0 }}" when: l_docker_upgrade is defined and l_docker_upgrade | bool and inventory_hostname in groups.oo_nodes_to_upgrade + register: l_docker_upgrade_drain_result + until: not l_docker_upgrade_drain_result | failed + retries: 60 + delay: 60 - include: upgrade.yml when: l_docker_upgrade is defined and l_docker_upgrade | bool diff --git a/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml b/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml index b7f089d99..2b2f10aee 100644 --- a/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml +++ b/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml @@ -296,6 +296,10 @@ command: > {{ hostvars[groups.oo_first_master.0].openshift.common.admin_binary }} drain {{ openshift.node.nodename | lower }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig --force --delete-local-data --ignore-daemonsets delegate_to: "{{ groups.oo_first_master.0 }}" + register: l_upgrade_control_plane_drain_result + until: not l_upgrade_control_plane_drain_result | failed + retries: 60 + delay: 60 roles: - lib_openshift diff --git a/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml b/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml index 1d1e440d4..af15ec5b2 100644 --- a/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml +++ b/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml @@ -28,6 +28,10 @@ command: > {{ hostvars[groups.oo_first_master.0].openshift.common.admin_binary }} drain {{ openshift.node.nodename | lower }} --force --delete-local-data --ignore-daemonsets delegate_to: "{{ groups.oo_first_master.0 }}" + register: l_upgrade_nodes_drain_result + until: not l_upgrade_nodes_drain_result | failed + retries: 60 + delay: 60 roles: - lib_openshift diff --git a/roles/openshift_node_upgrade/README.md b/roles/openshift_node_upgrade/README.md index 8b388cc6a..4e6229bfb 100644 --- a/roles/openshift_node_upgrade/README.md +++ b/roles/openshift_node_upgrade/README.md @@ -84,6 +84,11 @@ Including an example of how to use your role (for instance, with variables passe command: > {{ hostvars[groups.oo_first_master.0].openshift.common.admin_binary }} drain {{ openshift.node.nodename | lower }} --force --delete-local-data --ignore-daemonsets delegate_to: "{{ groups.oo_first_master.0 }}" + register: l_docker_upgrade_drain_result + until: not l_docker_upgrade_drain_result | failed + retries: 60 + delay: 60 + roles: - openshift_facts -- cgit v1.2.3