From 4b5d8d2dc25dbca20be59f3d5d111d737fd865bc Mon Sep 17 00:00:00 2001
From: Scott Dodson
Date: Tue, 1 Aug 2017 12:55:47 -0400
Subject: Switch to migrating one host and forming a new cluster

With large datasets where there are many keys with TTLs the expiry was
creating a data inconsistency problem. The hope is that by performing
the migration once and then forming a new cluster this is avoided.

Fixes https://bugzilla.redhat.com/show_bug.cgi?id=1475351
---
 roles/etcd_migrate/tasks/add_ttls.yml   | 39 ++++++++++++++++++++++++++
 roles/etcd_migrate/tasks/check.yml      |  3 --
 roles/etcd_migrate/tasks/clean_data.yml |  5 ++++
 roles/etcd_migrate/tasks/main.yml       |  4 +--
 roles/etcd_migrate/tasks/migrate.yml    | 51 ++++++++++++---------------------
 5 files changed, 64 insertions(+), 38 deletions(-)
 create mode 100644 roles/etcd_migrate/tasks/add_ttls.yml
 create mode 100644 roles/etcd_migrate/tasks/clean_data.yml

(limited to 'roles/etcd_migrate')

diff --git a/roles/etcd_migrate/tasks/add_ttls.yml b/roles/etcd_migrate/tasks/add_ttls.yml
new file mode 100644
index 000000000..c10465af9
--- /dev/null
+++ b/roles/etcd_migrate/tasks/add_ttls.yml
@@ -0,0 +1,39 @@
+---
+# To be executed on first master
+# Read master-config.yaml so the settings used below can be taken from it.
+- slurp:
+    src: "{{ openshift.common.config_base }}/master/master-config.yaml"
+  register: g_master_config_output
+
+# default() supplies the fallback when a setting is absent from the config.
+- set_fact:
+    accessTokenMaxAgeSeconds: "{{ (g_master_config_output.content|b64decode|from_yaml).oauthConfig.tokenConfig.accessTokenMaxAgeSeconds | default(86400) }}"
+    authorizeTokenMaxAgeSeconds: "{{ (g_master_config_output.content|b64decode|from_yaml).oauthConfig.tokenConfig.authorizeTokenMaxAgeSeconds | default(500) }}"
+    controllerLeaseTTL: "{{ (g_master_config_output.content|b64decode|from_yaml).controllerLeaseTTL | default(30) }}"
+# NOTE: /usr/local/bin may be removed from the PATH by ansible hence why
+# it's added to the environment in this task.
+# item['keys'] must be a subscript: in Jinja2 item.keys resolves to the dict's
+# built-in keys() method rather than to the 'keys' entry of the item.
+- name: Re-introduce leases (as a replacement for key TTLs)
+  command: >
+    oadm migrate etcd-ttl \
+    --cert {{ r_etcd_common_master_peer_cert_file }} \
+    --key {{ r_etcd_common_master_peer_key_file }} \
+    --cacert {{ r_etcd_common_master_peer_ca_file }} \
+    --etcd-address 'https://{{ etcd_peer }}:{{ etcd_client_port }}' \
+    --ttl-keys-prefix {{ item['keys'] }} \
+    --lease-duration {{ item.ttl }}
+  environment:
+    ETCDCTL_API: 3
+    PATH: "/usr/local/bin:/var/usrlocal/bin:{{ ansible_env.PATH }}"
+  with_items:
+    - keys: "/kubernetes.io/events"
+      ttl: "1h"
+    - keys: "/kubernetes.io/masterleases"
+      ttl: "10s"
+    - keys: "/openshift.io/oauth/accesstokens"
+      ttl: "{{ accessTokenMaxAgeSeconds }}s"
+    - keys: "/openshift.io/oauth/authorizetokens"
+      ttl: "{{ authorizeTokenMaxAgeSeconds }}s"
+    - keys: "/openshift.io/leases/controllers"
+      ttl: "{{ controllerLeaseTTL }}s"
diff --git a/roles/etcd_migrate/tasks/check.yml b/roles/etcd_migrate/tasks/check.yml
index b66696b55..0804d9e1c 100644
--- a/roles/etcd_migrate/tasks/check.yml
+++ b/roles/etcd_migrate/tasks/check.yml
@@ -1,7 +1,4 @@
 ---
-- fail:
-    msg: "Currently etcd v3 migration is unsupported while we test it more thoroughly"
-  when: not openshift_enable_unsupported_configurations | default(false) | bool
 
 # Check the cluster is healthy
 - include: check_cluster_health.yml
diff --git a/roles/etcd_migrate/tasks/clean_data.yml b/roles/etcd_migrate/tasks/clean_data.yml
new file mode 100644
index 000000000..95a0e7c0a
--- /dev/null
+++ b/roles/etcd_migrate/tasks/clean_data.yml
@@ -0,0 +1,5 @@
+---
+- name: Remove member data
+  file:
+    path: /var/lib/etcd/member
+    state: absent
diff --git a/roles/etcd_migrate/tasks/main.yml b/roles/etcd_migrate/tasks/main.yml
index 409b0b613..e82f6a6b4 100644
--- a/roles/etcd_migrate/tasks/main.yml
+++ b/roles/etcd_migrate/tasks/main.yml
@@ -1,8 +1,8 @@
 ---
 - name: Fail if invalid r_etcd_migrate_action provided
   fail:
-    msg: "etcd_migrate role can only be called with 'check' or 'migrate' or 'configure'"
-  when: r_etcd_migrate_action not in ['check', 'migrate', 'configure']
+    msg: "etcd_migrate role can only be called with 'check', 'migrate', 'configure', 'add_ttls', or 'clean_data'"
+  when: r_etcd_migrate_action not in ['check', 'migrate', 'configure', 'add_ttls', 'clean_data']
 
 - name: Include main action task file
   include: "{{ r_etcd_migrate_action }}.yml"
diff --git a/roles/etcd_migrate/tasks/migrate.yml b/roles/etcd_migrate/tasks/migrate.yml
index b2cf6d20a..173de77f4 100644
--- a/roles/etcd_migrate/tasks/migrate.yml
+++ b/roles/etcd_migrate/tasks/migrate.yml
@@ -3,62 +3,47 @@
 - set_fact:
     l_etcd_service: "{{ 'etcd_container' if openshift.common.is_containerized else 'etcd' }}"
 
-- name: Disable etcd members
-  service:
-    name: "{{ l_etcd_service }}"
-    state: stopped
-
-# Should we skip all TTL keys? https://bugzilla.redhat.com/show_bug.cgi?id=1389773
 - name: Migrate etcd data
   command: >
     etcdctl migrate --data-dir={{ etcd_data_dir }}
   environment:
     ETCDCTL_API: 3
   register: l_etcdctl_migrate
-
 # TODO(jchaloup): If any of the members fails, we need to restore all members to v2 from the pre-migrate backup
 - name: Check the etcd v2 data are correctly migrated
   fail:
     msg: "Failed to migrate a member"
   when: "'finished transforming keys' not in l_etcdctl_migrate.stdout and 'no v2 keys to migrate' not in l_etcdctl_migrate.stdout"
-
 - name: Migration message
   debug:
     msg: "Etcd migration finished with: {{ l_etcdctl_migrate.stdout }}"
-
-- name: Enable etcd member
-  service:
+# Form a new cluster from the migrated data: ETCD_FORCE_NEW_CLUSTER=true is set
+# only for the first start, then removed and etcd restarted without it.
+- name: Set ETCD_FORCE_NEW_CLUSTER=true on first etcd host
+  lineinfile:
+    line: "ETCD_FORCE_NEW_CLUSTER=true"
+    dest: /etc/etcd/etcd.conf
+- name: Start etcd
+  systemd:
     name: "{{ l_etcd_service }}"
     state: started
+- name: Unset ETCD_FORCE_NEW_CLUSTER=true on first etcd host
+  lineinfile:
+    line: "ETCD_FORCE_NEW_CLUSTER=true"
+    dest: /etc/etcd/etcd.conf
+    state: absent
+- name: Restart first etcd host
+  systemd:
+    name: "{{ l_etcd_service }}"
+    state: restarted
 
-- name: Wait for cluster to become healthy after migration
+- name: Wait for cluster to become healthy after bringing up first member
   command: >
     etcdctl --cert-file {{ etcd_peer_cert_file }} --key-file {{ etcd_peer_key_file }} --ca-file {{ etcd_peer_ca_file }} --endpoint https://{{ etcd_peer }}:{{ etcd_client_port }} cluster-health
   register: l_etcd_migrate_health
   until: l_etcd_migrate_health.rc == 0
   retries: 3
   delay: 30
-  run_once: true
-
-# NOTE: /usr/local/bin may be removed from the PATH by ansible hence why
-# it's added to the environment in this task.
-- name: Re-introduce leases (as a replacement for key TTLs)
-  command: >
-    oadm migrate etcd-ttl \
-    --cert {{ r_etcd_common_master_peer_cert_file }} \
-    --key {{ r_etcd_common_master_peer_key_file }} \
-    --cacert {{ r_etcd_common_master_peer_ca_file }} \
-    --etcd-address 'https://{{ etcd_peer }}:{{ etcd_client_port }}' \
-    --ttl-keys-prefix {{ item }} \
-    --lease-duration 1h
-  environment:
-    ETCDCTL_API: 3
-    PATH: "/usr/local/bin:/var/usrlocal/bin:{{ ansible_env.PATH }}"
-  with_items:
-    - "/kubernetes.io/events"
-    - "/kubernetes.io/masterleases"
-  delegate_to: "{{ groups.oo_first_master[0] }}"
-  run_once: true
 
 - set_fact:
     r_etcd_migrate_success: true
-- 
cgit v1.2.3