7 files changed, 261 insertions, 1 deletions
diff --git a/playbooks/common/openshift-master/config.yml b/playbooks/common/openshift-master/config.yml
index 4ecdf2a0c..0df03f194 100644
--- a/playbooks/common/openshift-master/config.yml
+++ b/playbooks/common/openshift-master/config.yml
@@ -313,6 +313,7 @@
 
 - name: Configure master instances
   hosts: oo_masters_to_config
+  any_errors_fatal: true
   serial: 1
   vars:
     sync_tmpdir: "{{ hostvars.localhost.g_master_mktemp.stdout }}"
diff --git a/playbooks/common/openshift-master/restart.yml b/playbooks/common/openshift-master/restart.yml
new file mode 100644
index 000000000..fa13a64cb
--- /dev/null
+++ b/playbooks/common/openshift-master/restart.yml
@@ -0,0 +1,154 @@
+---
+# Rolling restart of the OpenShift masters, one host at a time. Depending on
+# openshift_rolling_restart_mode either the master services ('services', the
+# default) or the whole system ('system') is restarted.
+- include: ../openshift-cluster/evaluate_groups.yml
+
+# Reject unsupported restart modes, then record the effective mode and the
+# master cluster method as local facts that the plays below read.
+- name: Validate configuration for rolling restart
+  hosts: oo_masters_to_config
+  roles:
+  - openshift_facts
+  tasks:
+  - fail:
+      msg: "openshift_rolling_restart_mode must be set to either 'services' or 'system'"
+    when: openshift_rolling_restart_mode is defined and openshift_rolling_restart_mode not in ["services", "system"]
+  - openshift_facts:
+      role: "{{ item.role }}"
+      local_facts: "{{ item.local_facts }}"
+    with_items:
+      - role: common
+        local_facts:
+          rolling_restart_mode: "{{ openshift_rolling_restart_mode | default('services') }}"
+      - role: master
+        local_facts:
+          cluster_method: "{{ openshift_master_cluster_method | default(None) }}"
+
+# Create a temp file on localhost, then check every master that will be
+# rebooted for that file. If it exists there, ansible is running on a
+# machine that is about to be restarted, so the operator is warned below.
+- name: Create temp file on localhost
+  hosts: localhost
+  connection: local
+  become: no
+  gather_facts: no
+  tasks:
+  - local_action: command mktemp
+    register: mktemp
+    changed_when: false
+
+- name: Check if temp file exists on any masters
+  hosts: oo_masters_to_config
+  tasks:
+  - stat: path="{{ hostvars.localhost.mktemp.stdout }}"
+    register: exists
+    changed_when: false
+
+- name: Cleanup temp file on localhost
+  hosts: localhost
+  connection: local
+  become: no
+  gather_facts: no
+  tasks:
+  - file: path="{{ hostvars.localhost.mktemp.stdout }}" state=absent
+    changed_when: false
+
+- name: Warn if restarting the system where ansible is running
+  hosts: oo_masters_to_config
+  tasks:
+  - pause:
+      prompt: >
+        Warning: Running playbook from a host that will be restarted!
+        Press CTRL+C and A to abort playbook execution. You may
+        continue by pressing ENTER but the playbook will stop
+        executing once this system restarts and services must be
+        manually verified.
+    when: exists.stat.exists and openshift.common.rolling_restart_mode == 'system'
+  # Mark the host ansible runs on so that it can be restarted last.
+  - set_fact:
+      current_host: "{{ exists.stat.exists }}"
+    when: openshift.common.rolling_restart_mode == 'system'
+
+# For pacemaker clusters, record on each master whether the master service
+# is currently active on that host.
+- name: Determine which masters are currently active
+  hosts: oo_masters_to_config
+  tasks:
+  - name: Check master service status
+    command: >
+      systemctl is-active {{ openshift.common.service_type }}-master
+    register: active_check_output
+    when: openshift.master.cluster_method == 'pacemaker'
+    # Only the two expected states are accepted; anything else fails the task.
+    failed_when: active_check_output.stdout not in ['active', 'inactive']
+    changed_when: false
+  - set_fact:
+      is_active: "{{ active_check_output.stdout == 'active' }}"
+    when: openshift.master.cluster_method == 'pacemaker'
+
+# Build the in-memory groups that determine the restart order further down.
+- name: Evaluate master groups
+  hosts: localhost
+  connection: local
+  become: no
+  tasks:
+  - name: Evaluate oo_active_masters
+    add_host:
+      name: "{{ item }}"
+      groups: oo_active_masters
+      ansible_ssh_user: "{{ g_ssh_user | default(omit) }}"
+      ansible_sudo: "{{ g_sudo | default(omit) }}"
+    with_items: "{{ groups.oo_masters_to_config | default([]) }}"
+    when: (hostvars[item]['is_active'] | default(false)) | bool
+  - name: Evaluate oo_current_masters
+    add_host:
+      name: "{{ item }}"
+      groups: oo_current_masters
+      ansible_ssh_user: "{{ g_ssh_user | default(omit) }}"
+      ansible_sudo: "{{ g_sudo | default(omit) }}"
+    with_items: "{{ groups.oo_masters_to_config | default([]) }}"
+    when: (hostvars[item]['current_host'] | default(false)) | bool
+
+- name: Validate pacemaker cluster
+  hosts: oo_active_masters
+  tasks:
+  - name: Retrieve pcs status
+    command: pcs status
+    register: pcs_status_output
+    changed_when: false
+  - fail:
+      msg: >
+        Pacemaker cluster validation failed. One or more nodes are not online.
+    when: not (pcs_status_output.stdout | validate_pcs_cluster(groups.oo_masters_to_config)) | bool
+
+# Restart order: ordinary masters first, then the active pacemaker master,
+# and finally the master that ansible itself is running on.
+- name: Restart masters
+  hosts: oo_masters_to_config:!oo_active_masters:!oo_current_masters
+  vars:
+    openshift_master_ha: "{{ groups.oo_masters_to_config | length > 1 }}"
+  serial: 1
+  tasks:
+  - include: restart_hosts.yml
+    when: openshift.common.rolling_restart_mode == 'system'
+  - include: restart_services.yml
+    when: openshift.common.rolling_restart_mode == 'services'
+
+- name: Restart active masters
+  hosts: oo_active_masters
+  serial: 1
+  tasks:
+  - include: restart_hosts_pacemaker.yml
+    when: openshift.common.rolling_restart_mode == 'system'
+  - include: restart_services_pacemaker.yml
+    when: openshift.common.rolling_restart_mode == 'services'
+
+- name: Restart current masters
+  hosts: oo_current_masters
+  serial: 1
+  tasks:
+  - include: restart_hosts.yml
+    when: openshift.common.rolling_restart_mode == 'system'
+  - include: restart_services.yml
+    when: openshift.common.rolling_restart_mode == 'services'
diff --git a/playbooks/common/openshift-master/restart_hosts.yml b/playbooks/common/openshift-master/restart_hosts.yml
new file mode 100644
index 000000000..ff206f5a2
--- /dev/null
+++ b/playbooks/common/openshift-master/restart_hosts.yml
@@ -0,0 +1,43 @@
+# Tasks for a rolling 'system' restart of one master: reboot the host and
+# wait until it is reachable again (and, for pacemaker, passes pcs validation).
+- name: Restart master system
+  # https://github.com/ansible/ansible/issues/10616
+  # Started asynchronously with poll: 0 (not waited on); errors are ignored.
+  shell: sleep 2 && shutdown -r now "OpenShift Ansible master rolling restart"
+  async: 1
+  poll: 0
+  ignore_errors: true
+  become: yes
+# When cluster_method != pacemaker we can ensure the api_port is
+# available.
+- name: Wait for master API to come back online
+  become: no
+  local_action:
+    module: wait_for
+      host="{{ inventory_hostname }}"
+      state=started
+      delay=10
+      port="{{ openshift.master.api_port }}"
+  when: openshift.master.cluster_method != 'pacemaker'
+# With pacemaker, wait for SSH (port 22) first and then poll pcs status.
+- name: Wait for master to start
+  become: no
+  local_action:
+    module: wait_for
+      host="{{ inventory_hostname }}"
+      state=started
+      delay=10
+      port=22
+  when: openshift.master.cluster_method == 'pacemaker'
+- name: Wait for master to become available
+  command: pcs status
+  register: pcs_status_output
+  until: pcs_status_output.stdout | validate_pcs_cluster([inventory_hostname]) | bool
+  retries: 15
+  delay: 2
+  changed_when: false
+  when: openshift.master.cluster_method == 'pacemaker'
+- fail:
+    msg: >
+      Pacemaker cluster validation failed {{ inventory_hostname }} is not online.
+  when: openshift.master.cluster_method == 'pacemaker' and not (pcs_status_output.stdout | validate_pcs_cluster([inventory_hostname])) | bool
diff --git a/playbooks/common/openshift-master/restart_hosts_pacemaker.yml b/playbooks/common/openshift-master/restart_hosts_pacemaker.yml
new file mode 100644
index 000000000..c9219e8de
--- /dev/null
+++ b/playbooks/common/openshift-master/restart_hosts_pacemaker.yml
@@ -0,0 +1,30 @@
+# Tasks for a rolling 'system' restart of an active pacemaker master: move
+# the master resource to another master, then reboot this host.
+- name: Fail over master resource
+  # NOTE(review): presumably picks the first master whose is_active is 'False'.
+  command: >
+    pcs resource move master {{ hostvars | oo_select_keys(groups['oo_masters_to_config']) | oo_collect('openshift.common.hostname', {'is_active': 'False'}) | list | first }}
+- name: Wait for master API to come back online
+  become: no
+  local_action:
+    module: wait_for
+      host="{{ openshift.master.cluster_hostname }}"
+      state=started
+      delay=10
+      port="{{ openshift.master.api_port }}"
+- name: Restart master system
+  # https://github.com/ansible/ansible/issues/10616
+  shell: sleep 2 && shutdown -r now "OpenShift Ansible master rolling restart"
+  async: 1
+  poll: 0
+  ignore_errors: true
+  become: yes
+# Poll SSH (port 22); without port= wait_for only sleeps for its timeout.
+- name: Wait for master to start
+  become: no
+  local_action:
+    module: wait_for
+      host="{{ inventory_hostname }}"
+      state=started
+      delay=10
+      port=22
diff --git a/playbooks/common/openshift-master/restart_services.yml b/playbooks/common/openshift-master/restart_services.yml
new file mode 100644
index 000000000..5e539cd65
--- /dev/null
+++ b/playbooks/common/openshift-master/restart_services.yml
@@ -0,0 +1,30 @@
+# Service-level restart of a single master. A non-HA master restarts the
+# combined master service; HA masters that are not pacemaker-managed restart
+# the API service, wait for its port, then restart the controllers service.
+- name: Restart master
+  when: not openshift_master_ha | bool
+  service:
+    state: restarted
+    name: "{{ openshift.common.service_type }}-master"
+- name: Restart master API
+  when: openshift_master_ha | bool and openshift.master.cluster_method != 'pacemaker'
+  service:
+    state: restarted
+    name: "{{ openshift.common.service_type }}-master-api"
+- name: Wait for master API to come back online
+  when: openshift_master_ha | bool and openshift.master.cluster_method != 'pacemaker'
+  become: no
+  local_action:
+    module: wait_for
+      host="{{ inventory_hostname }}"
+      state=started
+      delay=10
+      port="{{ openshift.master.api_port }}"
+- name: Restart master controllers
+  when: openshift_master_ha | bool and openshift.master.cluster_method != 'pacemaker'
+  # Errors are ignored since it is possible that type != simple for
+  # pre-3.1.1 installations.
+  ignore_errors: true
+  service:
+    state: restarted
+    name: "{{ openshift.common.service_type }}-master-controllers"
diff --git a/playbooks/common/openshift-master/restart_services_pacemaker.yml b/playbooks/common/openshift-master/restart_services_pacemaker.yml
new file mode 100644
index 000000000..e738f3fb6
--- /dev/null
+++ b/playbooks/common/openshift-master/restart_services_pacemaker.yml
@@ -0,0 +1,12 @@
+# Service-level restart for a pacemaker master: restart the master resource
+# through pcs, then wait for the API port on the cluster hostname.
+- name: Restart master services
+  command: pcs resource restart master
+- name: Wait for master API to come back online
+  local_action:
+    module: wait_for
+      host="{{ openshift.master.cluster_hostname }}"
+      state=started
+      delay=10
+      port="{{ openshift.master.api_port }}"
+  become: no
diff --git a/playbooks/common/openshift-node/config.yml b/playbooks/common/openshift-node/config.yml
index 483a7768c..336cbed5e 100644
--- a/playbooks/common/openshift-node/config.yml
+++ b/playbooks/common/openshift-node/config.yml
@@ -215,6 +215,23 @@
                          | oo_collect('openshift.common.hostname') }}"
     openshift_node_vars: "{{ hostvars | oo_select_keys(groups['oo_nodes_to_config']) }}"
   pre_tasks:
-
+  # On a host that is both node and master, the master gets restarted after the
+  # node restarts docker, and systemd can take up to 60 seconds to bring the
+  # master back, so containerized installs wait for the API before proceeding.
+  - name: Wait for master API to become available before proceeding
+    when: openshift.common.is_containerized | bool
+    # curl is used because the uri module requires python-httplib2 and a
+    # wait_for port check doesn't provide health information.
+    command: >
+      curl -k --head --silent {{ openshift.master.api_url }}
+    changed_when: false
+    register: api_available_output
+    until: api_available_output.stdout.find("200 OK") != -1
+    delay: 1
+    retries: 120
+  - when: openshift.common.is_containerized | bool and api_available_output.stdout.find("200 OK") == -1
+    fail:
+      msg: >
+        Unable to contact master API at {{ openshift.master.api_url }}
   roles:
   - openshift_manage_node