From dea9abfe22864cf10d85d85370b1633ca18060b6 Mon Sep 17 00:00:00 2001
From: Devan Goodwin
Date: Mon, 14 Dec 2015 15:29:24 -0400
Subject: Implement simple master rolling restarts.

Blocks running ansible on a host that will be restarted.
Can restart just services, or optionally the full system.
---
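For reference: restarts default to bouncing only the master services, and a full
reboot has to be requested explicitly. A minimal sketch of selecting the reboot
path through an extra-vars file (the variable name comes from the validation play
below; the file name and invocation are only examples):

    # restart-mode.yml (example name); run with:
    #   ansible-playbook -i <inventory> playbooks/byo/openshift-cluster/restart.yml -e @restart-mode.yml
    openshift_rolling_restart_mode: system  # 'services' is the default
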
 playbooks/byo/openshift-cluster/restart.yml    |  7 +++
 playbooks/common/openshift-cluster/restart.yml | 78 ++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 playbooks/byo/openshift-cluster/restart.yml
 create mode 100644 playbooks/common/openshift-cluster/restart.yml

diff --git a/playbooks/byo/openshift-cluster/restart.yml b/playbooks/byo/openshift-cluster/restart.yml
new file mode 100644
index 000000000..da0da69a6
--- /dev/null
+++ b/playbooks/byo/openshift-cluster/restart.yml
@@ -0,0 +1,7 @@
+---
+- include: ../../common/openshift-cluster/restart.yml
+  vars:
+    g_etcd_hosts: "{{ groups.etcd | default([]) }}"
+    g_master_hosts: "{{ groups.masters | default([]) }}"
+    g_node_hosts: "{{ groups.nodes | default([]) }}"
+    g_lb_hosts: "{{ groups.lb | default([]) }}"
diff --git a/playbooks/common/openshift-cluster/restart.yml b/playbooks/common/openshift-cluster/restart.yml
new file mode 100644
index 000000000..4117f7297
--- /dev/null
+++ b/playbooks/common/openshift-cluster/restart.yml
@@ -0,0 +1,78 @@
+---
+- include: evaluate_groups.yml
+# TODO: verify this is an HA environment
+# TODO: fork for pacemaker vs haproxy (based on?)
+
+- name: Validate configuration for rolling restart
+  hosts: oo_masters_to_config
+  tasks:
+  - set_fact:
+      openshift_rolling_restart_mode: "{{ openshift_rolling_restart_mode | default('services') }}"
+  - fail:
+      msg: "openshift_rolling_restart_mode must be set to either 'services' or 'system'"
+    when: openshift_rolling_restart_mode is defined and openshift_rolling_restart_mode not in ["services", "system"]
+
+# Creating a temp file on localhost, we then check each system that will
+# be rebooted to see if that file exists, if so we know we're running
+# ansible on a machine that needs a reboot, and we need to error out.
+- name: Create temp file on localhost
+  hosts: localhost
+  connection: local
+  become: no
+  gather_facts: no
+  tasks:
+  - local_action: command mktemp
+    register: mktemp
+    changed_when: False
+
+- name: Check if temp file exists on any masters
+  hosts: oo_masters_to_config
+  tasks:
+  - stat: path="{{ hostvars.localhost.mktemp.stdout }}"
+    register: exists
+
+- name: Cleanup temp file on localhost
+  hosts: localhost
+  connection: local
+  become: no
+  gather_facts: no
+  tasks:
+  - file: path="{{ hostvars.localhost.mktemp.stdout }}" state=absent
+
+- name: Fail if restarting the system where ansible is running
+  hosts: oo_masters_to_config
+  any_errors_fatal: true
+  tasks:
+  - fail: msg="Cannot run playbook on a host that will be restarted."
+    when: exists.stat.exists
+
+- name: Restart Masters
+  hosts: oo_masters_to_config
+  serial: 1
+  roles:
+  - openshift_facts
+  tasks:
+  - name: Restart master system
+    # https://github.com/ansible/ansible/issues/10616
+    shell: sleep 2 && shutdown -r now "OpenShift Ansible master rolling restart"
+    async: 1
+    poll: 0
+    ignore_errors: true
+    become: yes
+    when: openshift_rolling_restart_mode == 'system'
+  - name: Restart master services
+    service:
+      name: "{{ openshift.common.service_type }}-master-api"
+      state: restarted
+    # NOTE: no need to check openshift_master_ha here, we know it must be,
+    # thus the api service is the one we restart.
+    when: openshift_rolling_restart_mode == 'services'
+
+  - name: Wait for master API to come back online
+    become: no
+    local_action:
+      module: wait_for
+      host="{{ inventory_hostname }}"
+      state=started
+      delay=10
+      port=8443 # TODO: should this be made a master host variable?
-- 
cgit v1.2.3


From f9aaa8ac13adf841823f35be594641bdc2ebecac Mon Sep 17 00:00:00 2001
From: Andrew Butcher
Date: Tue, 5 Jan 2016 11:39:22 -0500
Subject: Update rolling restart playbook for pacemaker support.

Replace fail with a warn and prompt if running ansible from a host that
will be rebooted. Re-organize playbooks.
---
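The services/system split now also forks on the master cluster method, which
comes from inventory. A minimal sketch of the two variables that drive the new
behaviour, shown as they might appear in group_vars or an extra-vars file (the
two variable names are taken from the diff below; their placement here is only
an example):

    openshift_master_cluster_method: pacemaker   # anything else takes the non-pacemaker path
    openshift_rolling_restart_mode: services     # or 'system' to reboot each master

The new entry point is playbooks/byo/openshift-master/restart.yml, which wraps
the common restart playbook with cluster_hosts.yml.
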
 playbooks/byo/openshift-cluster/restart.yml        |   7 --
 playbooks/byo/openshift-master/filter_plugins      |   1 +
 playbooks/byo/openshift-master/lookup_plugins      |   1 +
 playbooks/byo/openshift-master/restart.yml         |   4 +
 playbooks/byo/openshift-master/roles               |   1 +
 playbooks/common/openshift-cluster/restart.yml     |  78 -------------
 playbooks/common/openshift-master/restart.yml      | 128 +++++++++++++++++++++
 .../common/openshift-master/restart_hosts.yml      |  28 +++++
 .../openshift-master/restart_hosts_pacemaker.yml   |  25 ++++
 .../common/openshift-master/restart_services.yml   |  27 +++++
 .../restart_services_pacemaker.yml                 |  10 ++
 11 files changed, 225 insertions(+), 85 deletions(-)
 delete mode 100644 playbooks/byo/openshift-cluster/restart.yml
 create mode 120000 playbooks/byo/openshift-master/filter_plugins
 create mode 120000 playbooks/byo/openshift-master/lookup_plugins
 create mode 100644 playbooks/byo/openshift-master/restart.yml
 create mode 120000 playbooks/byo/openshift-master/roles
 delete mode 100644 playbooks/common/openshift-cluster/restart.yml
 create mode 100644 playbooks/common/openshift-master/restart.yml
 create mode 100644 playbooks/common/openshift-master/restart_hosts.yml
 create mode 100644 playbooks/common/openshift-master/restart_hosts_pacemaker.yml
 create mode 100644 playbooks/common/openshift-master/restart_services.yml
 create mode 100644 playbooks/common/openshift-master/restart_services_pacemaker.yml

diff --git a/playbooks/byo/openshift-cluster/restart.yml b/playbooks/byo/openshift-cluster/restart.yml
deleted file mode 100644
index da0da69a6..000000000
--- a/playbooks/byo/openshift-cluster/restart.yml
+++ /dev/null
@@ -1,7 +0,0 @@
----
-- include: ../../common/openshift-cluster/restart.yml
-  vars:
-    g_etcd_hosts: "{{ groups.etcd | default([]) }}"
-    g_master_hosts: "{{ groups.masters | default([]) }}"
-    g_node_hosts: "{{ groups.nodes | default([]) }}"
-    g_lb_hosts: "{{ groups.lb | default([]) }}"
diff --git a/playbooks/byo/openshift-master/filter_plugins b/playbooks/byo/openshift-master/filter_plugins
new file mode 120000
index 000000000..99a95e4ca
--- /dev/null
+++ b/playbooks/byo/openshift-master/filter_plugins
@@ -0,0 +1 @@
+../../../filter_plugins
\ No newline at end of file
diff --git a/playbooks/byo/openshift-master/lookup_plugins b/playbooks/byo/openshift-master/lookup_plugins
new file mode 120000
index 000000000..ac79701db
--- /dev/null
+++ b/playbooks/byo/openshift-master/lookup_plugins
@@ -0,0 +1 @@
+../../../lookup_plugins
\ No newline at end of file
diff --git a/playbooks/byo/openshift-master/restart.yml b/playbooks/byo/openshift-master/restart.yml
new file mode 100644
index 000000000..a78a6aa3d
--- /dev/null
+++ b/playbooks/byo/openshift-master/restart.yml
@@ -0,0 +1,4 @@
+---
+- include: ../../common/openshift-master/restart.yml
+  vars_files:
+  - ../../byo/openshift-cluster/cluster_hosts.yml
diff --git a/playbooks/byo/openshift-master/roles b/playbooks/byo/openshift-master/roles
new file mode 120000
index 000000000..20c4c58cf
--- /dev/null
+++ b/playbooks/byo/openshift-master/roles
@@ -0,0 +1 @@
+../../../roles
\ No newline at end of file
diff --git a/playbooks/common/openshift-cluster/restart.yml b/playbooks/common/openshift-cluster/restart.yml
deleted file mode 100644
index 4117f7297..000000000
--- a/playbooks/common/openshift-cluster/restart.yml
+++ /dev/null
@@ -1,78 +0,0 @@
----
-- include: evaluate_groups.yml
-# TODO: verify this is an HA environment
-# TODO: fork for pacemaker vs haproxy (based on?)
-
-- name: Validate configuration for rolling restart
-  hosts: oo_masters_to_config
-  tasks:
-  - set_fact:
-      openshift_rolling_restart_mode: "{{ openshift_rolling_restart_mode | default('services') }}"
-  - fail:
-      msg: "openshift_rolling_restart_mode must be set to either 'services' or 'system'"
-    when: openshift_rolling_restart_mode is defined and openshift_rolling_restart_mode not in ["services", "system"]
-
-# Creating a temp file on localhost, we then check each system that will
-# be rebooted to see if that file exists, if so we know we're running
-# ansible on a machine that needs a reboot, and we need to error out.
-- name: Create temp file on localhost
-  hosts: localhost
-  connection: local
-  become: no
-  gather_facts: no
-  tasks:
-  - local_action: command mktemp
-    register: mktemp
-    changed_when: False
-
-- name: Check if temp file exists on any masters
-  hosts: oo_masters_to_config
-  tasks:
-  - stat: path="{{ hostvars.localhost.mktemp.stdout }}"
-    register: exists
-
-- name: Cleanup temp file on localhost
-  hosts: localhost
-  connection: local
-  become: no
-  gather_facts: no
-  tasks:
-  - file: path="{{ hostvars.localhost.mktemp.stdout }}" state=absent
-
-- name: Fail if restarting the system where ansible is running
-  hosts: oo_masters_to_config
-  any_errors_fatal: true
-  tasks:
-  - fail: msg="Cannot run playbook on a host that will be restarted."
-    when: exists.stat.exists
-
-- name: Restart Masters
-  hosts: oo_masters_to_config
-  serial: 1
-  roles:
-  - openshift_facts
-  tasks:
-  - name: Restart master system
-    # https://github.com/ansible/ansible/issues/10616
-    shell: sleep 2 && shutdown -r now "OpenShift Ansible master rolling restart"
-    async: 1
-    poll: 0
-    ignore_errors: true
-    become: yes
-    when: openshift_rolling_restart_mode == 'system'
-  - name: Restart master services
-    service:
-      name: "{{ openshift.common.service_type }}-master-api"
-      state: restarted
-    # NOTE: no need to check openshift_master_ha here, we know it must be,
-    # thus the api service is the one we restart.
-    when: openshift_rolling_restart_mode == 'services'
-
-  - name: Wait for master API to come back online
-    become: no
-    local_action:
-      module: wait_for
-      host="{{ inventory_hostname }}"
-      state=started
-      delay=10
-      port=8443 # TODO: should this be made a master host variable?
diff --git a/playbooks/common/openshift-master/restart.yml b/playbooks/common/openshift-master/restart.yml
new file mode 100644
index 000000000..7603f0d61
--- /dev/null
+++ b/playbooks/common/openshift-master/restart.yml
@@ -0,0 +1,128 @@
+---
+- include: ../openshift-cluster/evaluate_groups.yml
+
+- name: Validate configuration for rolling restart
+  hosts: oo_masters_to_config
+  roles:
+  - openshift_facts
+  tasks:
+  - fail:
+      msg: "openshift_rolling_restart_mode must be set to either 'services' or 'system'"
+    when: openshift_rolling_restart_mode is defined and openshift_rolling_restart_mode not in ["services", "system"]
+  - openshift_facts:
+      role: "{{ item.role }}"
+      local_facts: "{{ item.local_facts }}"
+    with_items:
+    - role: common
+      local_facts:
+        rolling_restart_mode: "{{ openshift_rolling_restart_mode | default('services') }}"
+    - role: master
+      local_facts:
+        cluster_method: "{{ openshift_master_cluster_method | default(None) }}"
+
+# Creating a temp file on localhost, we then check each system that will
+# be rebooted to see if that file exists, if so we know we're running
+# ansible on a machine that needs a reboot, and we need to error out.
+- name: Create temp file on localhost
+  hosts: localhost
+  connection: local
+  become: no
+  gather_facts: no
+  tasks:
+  - local_action: command mktemp
+    register: mktemp
+    changed_when: false
+
+- name: Check if temp file exists on any masters
+  hosts: oo_masters_to_config
+  tasks:
+  - stat: path="{{ hostvars.localhost.mktemp.stdout }}"
+    register: exists
+    changed_when: false
+
+- name: Cleanup temp file on localhost
+  hosts: localhost
+  connection: local
+  become: no
+  gather_facts: no
+  tasks:
+  - file: path="{{ hostvars.localhost.mktemp.stdout }}" state=absent
+    changed_when: false
+
+- name: Warn if restarting the system where ansible is running
+  hosts: oo_masters_to_config
+  tasks:
+  - pause:
+      prompt: >
+        Warning: Running playbook from a host that will be restarted!
+        Press CTRL+C and A to abort playbook execution. You may
+        continue by pressing ENTER but the playbook will stop
+        executing once this system restarts and services must be
+        manually verified.
+    when: exists.stat.exists and openshift.common.rolling_restart_mode == 'system'
+  - set_fact:
+      current_host: "{{ exists.stat.exists }}"
+    when: openshift.common.rolling_restart_mode == 'system'
+
+- name: Determine which masters are currently active
+  hosts: oo_masters_to_config
+  tasks:
+  - name: Check master service status
+    command: >
+      systemctl is-active {{ openshift.common.service_type }}-master
+    register: active_check_output
+    when: openshift.master.cluster_method == 'pacemaker'
+    failed_when: active_check_output.stdout not in ['active', 'inactive']
+  - set_fact:
+      is_active: "{{ active_check_output.stdout == 'active' }}"
+    when: openshift.master.cluster_method == 'pacemaker'
+
+- name: Evaluate master groups
+  hosts: localhost
+  become: no
+  tasks:
+  - name: Evaluate oo_active_masters
+    add_host:
+      name: "{{ item }}"
+      groups: oo_active_masters
+      ansible_ssh_user: "{{ g_ssh_user | default(omit) }}"
+      ansible_sudo: "{{ g_sudo | default(omit) }}"
+    with_items: "{{ groups.oo_masters_to_config | default([]) }}"
+    when: (hostvars[item]['is_active'] | default(false)) | bool
+  - name: Evaluate oo_current_masters
+    add_host:
+      name: "{{ item }}"
+      groups: oo_current_masters
+      ansible_ssh_user: "{{ g_ssh_user | default(omit) }}"
+      ansible_sudo: "{{ g_sudo | default(omit) }}"
+    with_items: "{{ groups.oo_masters_to_config | default([]) }}"
+    when: (hostvars[item]['current_host'] | default(false)) | bool
+
+- name: Restart masters
+  hosts: oo_masters_to_config:!oo_active_masters:!oo_current_masters
+  vars:
+    openshift_master_ha: "{{ groups.oo_masters_to_config | length > 1 }}"
+  serial: 1
+  tasks:
+  - include: restart_hosts.yml
+    when: openshift.common.rolling_restart_mode == 'system'
+  - include: restart_services.yml
+    when: openshift.common.rolling_restart_mode == 'services'
+
+- name: Restart active masters
+  hosts: oo_active_masters
+  serial: 1
+  tasks:
+  - include: restart_hosts_pacemaker.yml
+    when: openshift.common.rolling_restart_mode == 'system'
+  - include: restart_services_pacemaker.yml
+    when: openshift.common.rolling_restart_mode == 'services'
+
+- name: Restart current masters
+  hosts: oo_current_masters
+  serial: 1
+  tasks:
+  - include: restart_hosts.yml
+    when: openshift.common.rolling_restart_mode == 'system'
+  - include: restart_services.yml
+    when: openshift.common.rolling_restart_mode == 'services'
diff --git a/playbooks/common/openshift-master/restart_hosts.yml b/playbooks/common/openshift-master/restart_hosts.yml
new file mode 100644
index 000000000..598e1ad63
--- /dev/null
+++ b/playbooks/common/openshift-master/restart_hosts.yml
@@ -0,0 +1,28 @@
+- name: Restart master system
+  # https://github.com/ansible/ansible/issues/10616
+  shell: sleep 2 && shutdown -r now "OpenShift Ansible master rolling restart"
+  async: 1
+  poll: 0
+  ignore_errors: true
+  become: yes
+# When cluster_method != pacemaker we can ensure the api_port is
+# available.
+- name: Wait for master API to come back online
+  become: no
+  local_action:
+    module: wait_for
+    host="{{ inventory_hostname }}"
+    state=started
+    delay=10
+    port="{{ openshift.master.api_port }}"
+  when: openshift.master.cluster_method != 'pacemaker'
+# When cluster_method is pacemaker we can only ensure that the host
+# restarted successfully.
+- name: Wait for master to start
+  become: no
+  local_action:
+    module: wait_for
+    host="{{ inventory_hostname }}"
+    state=started
+    delay=10
+  when: openshift.master.cluster_method == 'pacemaker'
diff --git a/playbooks/common/openshift-master/restart_hosts_pacemaker.yml b/playbooks/common/openshift-master/restart_hosts_pacemaker.yml
new file mode 100644
index 000000000..c9219e8de
--- /dev/null
+++ b/playbooks/common/openshift-master/restart_hosts_pacemaker.yml
@@ -0,0 +1,25 @@
+- name: Fail over master resource
+  command: >
+    pcs resource move master {{ hostvars | oo_select_keys(groups['oo_masters_to_config']) | oo_collect('openshift.common.hostname', {'is_active': 'False'}) | list | first }}
+- name: Wait for master API to come back online
+  become: no
+  local_action:
+    module: wait_for
+    host="{{ openshift.master.cluster_hostname }}"
+    state=started
+    delay=10
+    port="{{ openshift.master.api_port }}"
+- name: Restart master system
+  # https://github.com/ansible/ansible/issues/10616
+  shell: sleep 2 && shutdown -r now "OpenShift Ansible master rolling restart"
+  async: 1
+  poll: 0
+  ignore_errors: true
+  become: yes
+- name: Wait for master to start
+  become: no
+  local_action:
+    module: wait_for
+    host="{{ inventory_hostname }}"
+    state=started
+    delay=10
diff --git a/playbooks/common/openshift-master/restart_services.yml b/playbooks/common/openshift-master/restart_services.yml
new file mode 100644
index 000000000..5e539cd65
--- /dev/null
+++ b/playbooks/common/openshift-master/restart_services.yml
@@ -0,0 +1,27 @@
+- name: Restart master
+  service:
+    name: "{{ openshift.common.service_type }}-master"
+    state: restarted
+  when: not openshift_master_ha | bool
+- name: Restart master API
+  service:
+    name: "{{ openshift.common.service_type }}-master-api"
+    state: restarted
+  when: openshift_master_ha | bool and openshift.master.cluster_method != 'pacemaker'
+- name: Wait for master API to come back online
+  become: no
+  local_action:
+    module: wait_for
+    host="{{ inventory_hostname }}"
+    state=started
+    delay=10
+    port="{{ openshift.master.api_port }}"
+  when: openshift_master_ha | bool and openshift.master.cluster_method != 'pacemaker'
+- name: Restart master controllers
+  service:
+    name: "{{ openshift.common.service_type }}-master-controllers"
+    state: restarted
+  # Ignore errors since it is possible that type != simple for
+  # pre-3.1.1 installations.
+  ignore_errors: true
+  when: openshift_master_ha | bool and openshift.master.cluster_method != 'pacemaker'
diff --git a/playbooks/common/openshift-master/restart_services_pacemaker.yml b/playbooks/common/openshift-master/restart_services_pacemaker.yml
new file mode 100644
index 000000000..e738f3fb6
--- /dev/null
+++ b/playbooks/common/openshift-master/restart_services_pacemaker.yml
@@ -0,0 +1,10 @@
+- name: Restart master services
+  command: pcs resource restart master
+- name: Wait for master API to come back online
+  become: no
+  local_action:
+    module: wait_for
+    host="{{ openshift.master.cluster_hostname }}"
+    state=started
+    delay=10
+    port="{{ openshift.master.api_port }}"
-- 
cgit v1.2.3


From 97be5890e2a34036a22d2d1e2586c83009ae6064 Mon Sep 17 00:00:00 2001
From: Andrew Butcher
Date: Tue, 12 Jan 2016 16:39:06 -0500
Subject: Validate pacemaker cluster members.
---
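The new validate_pcs_cluster filter takes raw "pcs status" output plus a list of
master hostnames and returns true only when every listed master reports Online.
A condensed sketch of how it is used, taken from the restart.yml changes below:

    - name: Retrieve pcs status
      command: pcs status
      register: pcs_status_output
      changed_when: false
    - fail:
        msg: Pacemaker cluster validation failed. One or more nodes are not online.
      when: not (pcs_status_output.stdout | validate_pcs_cluster(groups.oo_masters_to_config)) | bool
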
 filter_plugins/openshift_master.py               | 28 +++++++++++++++++++++-
 playbooks/common/openshift-master/restart.yml    | 13 ++++++++++
 .../common/openshift-master/restart_hosts.yml    | 15 ++++++++++--
 roles/openshift_facts/tasks/main.yml             |  1 +
 4 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/filter_plugins/openshift_master.py b/filter_plugins/openshift_master.py
index 8d7c62ad1..7ababc51a 100644
--- a/filter_plugins/openshift_master.py
+++ b/filter_plugins/openshift_master.py
@@ -463,6 +463,32 @@ class FilterModule(object):
         IdentityProviderBase.validate_idp_list(idp_list)
         return yaml.safe_dump([idp.to_dict() for idp in idp_list], default_flow_style=False)
 
+    @staticmethod
+    def validate_pcs_cluster(data, masters=None):
+        ''' Validates output from "pcs status", ensuring that each master
+            provided is online.
+            Ex: data = ('...',
+                        'PCSD Status:',
+                        'master1.example.com: Online',
+                        'master2.example.com: Online',
+                        'master3.example.com: Online',
+                        '...')
+                masters = ['master1.example.com',
+                           'master2.example.com',
+                           'master3.example.com']
+            returns True
+        '''
+        if not issubclass(type(data), str):
+            raise errors.AnsibleFilterError("|failed expects data is a string")
+        if not issubclass(type(masters), list):
+            raise errors.AnsibleFilterError("|failed expects masters is a list")
+        valid = True
+        for master in masters:
+            if "{0}: Online".format(master) not in data:
+                valid = False
+        return valid
+
     def filters(self):
         ''' returns a mapping of filters to methods '''
-        return {"translate_idps": self.translate_idps}
+        return {"translate_idps": self.translate_idps,
+                "validate_pcs_cluster": self.validate_pcs_cluster}
diff --git a/playbooks/common/openshift-master/restart.yml b/playbooks/common/openshift-master/restart.yml
index 7603f0d61..fa13a64cb 100644
--- a/playbooks/common/openshift-master/restart.yml
+++ b/playbooks/common/openshift-master/restart.yml
@@ -73,6 +73,7 @@
     register: active_check_output
     when: openshift.master.cluster_method == 'pacemaker'
     failed_when: active_check_output.stdout not in ['active', 'inactive']
+    changed_when: false
   - set_fact:
       is_active: "{{ active_check_output.stdout == 'active' }}"
     when: openshift.master.cluster_method == 'pacemaker'
@@ -98,6 +99,18 @@
     with_items: "{{ groups.oo_masters_to_config | default([]) }}"
     when: (hostvars[item]['current_host'] | default(false)) | bool
 
+- name: Validate pacemaker cluster
+  hosts: oo_active_masters
+  tasks:
+  - name: Retrieve pcs status
+    command: pcs status
+    register: pcs_status_output
+    changed_when: false
+  - fail:
+      msg: >
+        Pacemaker cluster validation failed. One or more nodes are not online.
+    when: not (pcs_status_output.stdout | validate_pcs_cluster(groups.oo_masters_to_config)) | bool
+
 - name: Restart masters
   hosts: oo_masters_to_config:!oo_active_masters:!oo_current_masters
   vars:
diff --git a/playbooks/common/openshift-master/restart_hosts.yml b/playbooks/common/openshift-master/restart_hosts.yml
index 598e1ad63..ff206f5a2 100644
--- a/playbooks/common/openshift-master/restart_hosts.yml
+++ b/playbooks/common/openshift-master/restart_hosts.yml
@@ -16,8 +16,6 @@
     delay=10
     port="{{ openshift.master.api_port }}"
   when: openshift.master.cluster_method != 'pacemaker'
-# When cluster_method is pacemaker we can only ensure that the host
-# restarted successfully.
 - name: Wait for master to start
   become: no
   local_action:
@@ -25,4 +23,17 @@
     host="{{ inventory_hostname }}"
     state=started
     delay=10
+    port=22
   when: openshift.master.cluster_method == 'pacemaker'
+- name: Wait for master to become available
+  command: pcs status
+  register: pcs_status_output
+  until: pcs_status_output.stdout | validate_pcs_cluster([inventory_hostname]) | bool
+  retries: 15
+  delay: 2
+  changed_when: false
+  when: openshift.master.cluster_method == 'pacemaker'
+- fail:
+    msg: >
+      Pacemaker cluster validation failed, {{ inventory_hostname }} is not online.
+  when: openshift.master.cluster_method == 'pacemaker' and not (pcs_status_output.stdout | validate_pcs_cluster([inventory_hostname])) | bool
diff --git a/roles/openshift_facts/tasks/main.yml b/roles/openshift_facts/tasks/main.yml
index 87fa99a3b..e40a1b329 100644
--- a/roles/openshift_facts/tasks/main.yml
+++ b/roles/openshift_facts/tasks/main.yml
@@ -10,6 +10,7 @@
   shell: ls /run/ostree-booted
   ignore_errors: yes
   failed_when: false
+  changed_when: false
   register: ostree_output
 
 # Locally setup containerized facts for now
-- 
cgit v1.2.3
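
Taken together, the series restarts masters in three passes so that the
pacemaker-active master and the machine ansible is being run from are disturbed
last. A condensed sketch of the resulting play order (host patterns and serial
settings are copied from the restart playbook above; the comments summarize
behaviour described there):

    - hosts: oo_masters_to_config:!oo_active_masters:!oo_current_masters
      serial: 1   # passive masters first
    - hosts: oo_active_masters
      serial: 1   # pacemaker-active master, failed over with 'pcs resource move' before a reboot
    - hosts: oo_current_masters
      serial: 1   # the host running ansible, if it is also a master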