summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--playbooks/byo/openshift-etcd/migrate.yml124
-rw-r--r--roles/etcd_migrate/README.md53
-rw-r--r--roles/etcd_migrate/defaults/main.yml3
-rw-r--r--roles/etcd_migrate/meta/main.yml17
-rw-r--r--roles/etcd_migrate/tasks/check.yml55
-rw-r--r--roles/etcd_migrate/tasks/check_cluster_health.yml23
-rw-r--r--roles/etcd_migrate/tasks/check_cluster_status.yml32
-rw-r--r--roles/etcd_migrate/tasks/configure.yml13
-rw-r--r--roles/etcd_migrate/tasks/main.yml25
-rw-r--r--roles/etcd_migrate/tasks/migrate.yml53
10 files changed, 398 insertions, 0 deletions
diff --git a/playbooks/byo/openshift-etcd/migrate.yml b/playbooks/byo/openshift-etcd/migrate.yml
new file mode 100644
index 000000000..fd02e066e
--- /dev/null
+++ b/playbooks/byo/openshift-etcd/migrate.yml
@@ -0,0 +1,124 @@
+---
+# Group setup shared by the plays below. Both includes carry the `always`
+# tag so they still run when a --tags filter is given.
+# NOTE(review): presumably these define the oo_* host groups used later in
+# this playbook - confirm in the included files.
+- include: ../openshift-cluster/initialize_groups.yml
+  tags: [always]
+
+- include: ../../common/openshift-cluster/evaluate_groups.yml
+  tags: [always]
+
+# Run the etcd_migrate role in `check` mode on every etcd host before any
+# data is touched. Fact gathering is left on (the default) because etcd_peer
+# is derived from ansible_default_ipv4.
+- name: Run pre-checks
+  hosts: oo_etcd_to_config
+  tags: [always]
+  roles:
+  - role: etcd_migrate
+    etcd_peer: "{{ ansible_default_ipv4.address }}"
+    r_etcd_migrate_action: check
+
+# TODO(jchaloup): swap std_include for something minimal so the whole
+# playbook runs faster; detecting the OCP version, installing deps, etc.
+# is not needed here.
+- include: ../../common/openshift-cluster/std_include.yml
+  tags: [always]
+
+# Take a "pre-migration" backup on every etcd host through the etcd_common
+# role before anything is modified.
+# NOTE(review): the "Gate on etcd backup" play reads the
+# r_etcd_common_backup_complete fact; presumably it is set by this backup
+# action - confirm in roles/etcd_common.
+- name: Backup v2 data
+  hosts: oo_etcd_to_config
+  gather_facts: false
+  tags: [always]
+  roles:
+  - role: openshift_facts
+  - role: etcd_common
+    r_etcd_common_action: backup
+    r_etcd_common_backup_tag: pre-migration
+    # The pipe lookup runs `date` on the control host, not on the etcd host.
+    r_etcd_common_backup_sufix_name: "{{ lookup('pipe', 'date +%Y%m%d%H%M%S') }}"
+    r_etcd_common_etcd_runtime: "{{ openshift.common.etcd_runtime }}"
+    # True only when the oo_etcd_to_config group is empty or undefined.
+    r_etcd_common_embedded_etcd: "{{ groups.oo_etcd_to_config | default([]) | length == 0 }}"
+
+# Stop the run on the control host unless every etcd host reported a
+# completed backup.
+- name: Gate on etcd backup
+  hosts: localhost
+  connection: local
+  become: false
+  tasks:
+  # Names of the etcd hosts whose r_etcd_common_backup_complete fact is true.
+  - set_fact:
+      etcd_backup_completed: >-
+        {{ hostvars
+        | oo_select_keys(groups.oo_etcd_to_config)
+        | oo_collect('inventory_hostname', {'r_etcd_common_backup_complete': true}) }}
+  # Every member of the group that is missing from the completed list.
+  - set_fact:
+      etcd_backup_failed: "{{ groups.oo_etcd_to_config | difference(etcd_backup_completed) }}"
+  - fail:
+      msg: "Migration cannot continue. The following hosts did not complete etcd backup: {{ etcd_backup_failed | join(',') }}"
+    when: etcd_backup_failed | length > 0
+
+# Record the master service names per host in `master_services` and stop
+# them. The "Start masters after etcd data migration" play reuses the same
+# fact to start the services again.
+- name: Prepare masters for etcd data migration
+  hosts: oo_masters_to_config
+  tasks:
+  # Default: a single "<service_type>-master" service.
+  - set_fact:
+      master_services:
+      - "{{ openshift.common.service_type }}-master"
+  # Overridden with the controllers/api pair when the cluster method is
+  # "native" or the master runs as a system container.
+  - set_fact:
+      master_services:
+      - "{{ openshift.common.service_type }}-master-controllers"
+      - "{{ openshift.common.service_type }}-master-api"
+    when: (openshift_master_cluster_method is defined and openshift_master_cluster_method == "native") or openshift.common.is_master_system_container | bool
+  - debug:
+      msg: "master service name: {{ master_services }}"
+  - name: Stop masters
+    service:
+      name: "{{ item }}"
+      state: stopped
+    with_items: "{{ master_services }}"
+
+# Run the etcd_migrate role in `migrate` mode on every etcd host.
+# Facts are not gathered in this play, so ansible_default_ipv4 has to be
+# known already; the "Run pre-checks" play targets the same hosts with
+# gathering enabled.
+- name: Migrate etcd data from v2 to v3
+  hosts: oo_etcd_to_config
+  gather_facts: false
+  tags: [always]
+  roles:
+  - role: etcd_migrate
+    etcd_peer: "{{ ansible_default_ipv4.address }}"
+    r_etcd_migrate_action: migrate
+
+# Compute, on every master, which etcd hosts did not finish the migration.
+# Nothing fails here: the resulting etcd_migration_failed fact is consumed
+# by the two plays that follow.
+- name: Gate on etcd migration
+  hosts: oo_masters_to_config
+  gather_facts: false
+  tasks:
+  # Names of the etcd hosts whose r_etcd_migrate_success fact is true.
+  - set_fact:
+      etcd_migration_completed: >-
+        {{ hostvars
+        | oo_select_keys(groups.oo_etcd_to_config)
+        | oo_collect('inventory_hostname', {'r_etcd_migrate_success': true}) }}
+  # Every member of the group that is missing from the completed list.
+  - set_fact:
+      etcd_migration_failed: "{{ groups.oo_etcd_to_config | difference(etcd_migration_completed) }}"
+
+# Apply the role's `configure` action only when no etcd host is listed in
+# etcd_migration_failed; otherwise just print a notice. Roles run before
+# tasks within a play, so the debug task acts as the "else" branch.
+- name: Configure masters if etcd data migration is succesfull
+  hosts: oo_masters_to_config
+  roles:
+  - role: etcd_migrate
+    r_etcd_migrate_action: configure
+    when: etcd_migration_failed | length == 0
+  tasks:
+  - debug:
+      msg: "Skipping master re-configuration since migration failed."
+    when: etcd_migration_failed | length > 0
+
+# Bring the master services back up whether or not the migration succeeded,
+# and only then fail the run if any etcd host is listed in
+# etcd_migration_failed.
+- name: Start masters after etcd data migration
+  hosts: oo_masters_to_config
+  tasks:
+  - name: Start master services
+    service:
+      name: "{{ item }}"
+      state: started
+    # Services are started in the reverse of the order they were stopped in.
+    with_items: "{{ master_services[::-1] }}"
+    register: service_status
+    # Sometimes the master-api, resp. master-controllers fails to start for the first time
+    until: service_status.state is defined and service_status.state == "started"
+    retries: 5
+    delay: 10
+  - fail:
+      msg: "Migration failed. The following hosts were not properly migrated: {{ etcd_migration_failed | join(',') }}"
+    when: etcd_migration_failed | length > 0
diff --git a/roles/etcd_migrate/README.md b/roles/etcd_migrate/README.md
new file mode 100644
index 000000000..369e78ff2
--- /dev/null
+++ b/roles/etcd_migrate/README.md
@@ -0,0 +1,53 @@
+etcd_migrate
+============
+
+Offline etcd migration of data from v2 to v3
+
+Requirements
+------------
+
+All consumers of the etcd data are expected to be stopped (i.e. not accessing the data) while the migration runs.
+Otherwise the migrated v3 data can get out of sync with the v2 data, which can result in an unhealthy etcd cluster.
+
+The role itself is responsible for:
+- checking etcd cluster health and raft status before the migration
+- checking of presence of any v3 data (in that case the migration is stopped)
+- migration of v2 data to v3 data (including attaching leases of keys prefixed with "/kubernetes.io/events" and "/kubernetes.io/masterleases" string)
+- validation of migrated data (all v2 keys are present among the v3 keys and are set to the identical value)
+
+The migration itself requires an etcd member to be down in the process. Once the migration is done, the etcd member is started.
+
+Role Variables
+--------------
+
+TBD
+
+Dependencies
+------------
+
+- etcd_common
+- lib_utils
+
+Example Playbook
+----------------
+
+```yaml
+- name: Migrate etcd data from v2 to v3
+ hosts: oo_etcd_to_config
+ gather_facts: no
+ tasks:
+ - include_role:
+ name: etcd_migrate
+ vars:
+ etcd_peer: "{{ ansible_default_ipv4.address }}"
+```
+
+License
+-------
+
+Apache License, Version 2.0
+
+Author Information
+------------------
+
+Jan Chaloupka (jchaloup@redhat.com)
diff --git a/roles/etcd_migrate/defaults/main.yml b/roles/etcd_migrate/defaults/main.yml
new file mode 100644
index 000000000..05cf41fbb
--- /dev/null
+++ b/roles/etcd_migrate/defaults/main.yml
@@ -0,0 +1,3 @@
+---
+# Action the role performs when the caller does not set one.
+# Accepted values: check, migrate, configure
+r_etcd_migrate_action: 'migrate'
diff --git a/roles/etcd_migrate/meta/main.yml b/roles/etcd_migrate/meta/main.yml
new file mode 100644
index 000000000..f3cabbef6
--- /dev/null
+++ b/roles/etcd_migrate/meta/main.yml
@@ -0,0 +1,17 @@
+---
+# Galaxy metadata and role dependencies for etcd_migrate.
+galaxy_info:
+  author: Jan Chaloupka
+  description: Etcd migration
+  company: Red Hat, Inc.
+  license: Apache License, Version 2.0
+  # Quoted so the version stays a string; an unquoted 2.1 is parsed as a
+  # float (and e.g. 2.10 would silently become 2.1).
+  min_ansible_version: "2.1"
+  platforms:
+  - name: EL
+    versions:
+    - 7
+  categories:
+  - cloud
+  - system
+# Roles that are loaded before this one.
+dependencies:
+- role: etcd_common
+- role: lib_utils
diff --git a/roles/etcd_migrate/tasks/check.yml b/roles/etcd_migrate/tasks/check.yml
new file mode 100644
index 000000000..2f07713bc
--- /dev/null
+++ b/roles/etcd_migrate/tasks/check.yml
@@ -0,0 +1,55 @@
+---
+# Check the cluster is healthy
+- include: check_cluster_health.yml
+
+# Check if the member has v3 data already
+# Run the migration only if the data are v2
+- name: Check if there are any v3 data
+  command: >
+    etcdctl --cert {{ etcd_peer_cert_file }} --key {{ etcd_peer_key_file }} --cacert {{ etcd_peer_ca_file }} --endpoints 'https://{{ etcd_peer }}:2379' get "" --from-key --keys-only -w json --limit 1
+  environment:
+    # Environment values are strings; quote to avoid integer typing.
+    ETCDCTL_API: "3"
+  register: l_etcdctl_output
+  # Read-only query, so never report "changed".
+  changed_when: false
+  # Do not fail here: without this the task aborts on a non-zero exit code
+  # and the explicit, more descriptive fail below can never be reached.
+  failed_when: false
+
+- fail:
+    msg: "Unable to get a number of v3 keys"
+  when: (l_etcdctl_output.rc | default(1)) != 0
+
+# The JSON reply is treated as "no v3 data" when it has no `count` key or
+# when `count` is 0.
+- fail:
+    msg: "The etcd has at least one v3 key"
+  when: "'count' in (l_etcdctl_output.stdout | from_json) and (l_etcdctl_output.stdout | from_json)['count'] != 0"
+
+
+# TODO(jchaloup): once the until loop can be used over include/block,
+# remove the repetitive code
+# - until loop not supported over include statement (nor block)
+#   https://github.com/ansible/ansible/issues/17098
+# - with_items not supported over block
+#
+# Until then the status check is unrolled by hand: one unconditional
+# attempt followed by two retries. A retry pauses 5 seconds and re-runs
+# check_cluster_status.yml only while l_etcd_cluster_status_ok is false.
+
+# Attempt 1 (always runs)
+- include: check_cluster_status.yml
+
+# Attempt 2 (only if attempt 1 did not report an ok status)
+- block:
+  - debug:
+      msg: "l_etcd_cluster_status_ok: {{ l_etcd_cluster_status_ok }}"
+
+  - name: Wait a while before another check
+    when: not l_etcd_cluster_status_ok | bool
+    pause:
+      seconds: 5
+
+  - when: not l_etcd_cluster_status_ok | bool
+    include: check_cluster_status.yml
+
+
+# Attempt 3 (only if attempt 2 did not report an ok status)
+- block:
+  - debug:
+      msg: "l_etcd_cluster_status_ok: {{ l_etcd_cluster_status_ok }}"
+
+  - name: Wait a while before another check
+    when: not l_etcd_cluster_status_ok | bool
+    pause:
+      seconds: 5
+
+  - when: not l_etcd_cluster_status_ok | bool
+    include: check_cluster_status.yml
diff --git a/roles/etcd_migrate/tasks/check_cluster_health.yml b/roles/etcd_migrate/tasks/check_cluster_health.yml
new file mode 100644
index 000000000..1abd6a32f
--- /dev/null
+++ b/roles/etcd_migrate/tasks/check_cluster_health.yml
@@ -0,0 +1,23 @@
+---
+# Decide whether this member (etcd_peer) is reported healthy by the v2
+# `cluster-health` command, and fail the host when it is not.
+- name: Check cluster health
+  command: >
+    etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt --endpoint https://{{ etcd_peer }}:2379 cluster-health
+  register: etcd_cluster_health
+  changed_when: false
+  # The exit code is ignored on purpose; the verdict comes from parsing the
+  # command output in the tasks below.
+  failed_when: false
+
+- name: Assume a member is not healthy
+  set_fact:
+    etcd_member_healthy: false
+
+# Flip the flag when an output line reports this peer as healthy.
+# The peer is matched as a URL host ("//<peer>:" or "//[<peer>]:") rather
+# than as a bare substring, so that e.g. 10.0.0.1 does not match a line
+# that belongs to 10.0.0.10. stdout_lines defaults to an empty list in case
+# the command could not be executed at all.
+- name: Get member item health status
+  set_fact:
+    etcd_member_healthy: true
+  with_items: "{{ etcd_cluster_health.stdout_lines | default([]) }}"
+  when: >-
+    ('is healthy' in item) and
+    ((('//' ~ etcd_peer ~ ':') in item) or (('//[' ~ etcd_peer ~ ']:') in item))
+
+- name: Check the etcd cluster health
+  # TODO(jchaloup): should we fail or ask user if he wants to continue? Or just wait until the cluster is healthy?
+  fail:
+    msg: "Etcd member {{ etcd_peer }} is not healthy"
+  when: not etcd_member_healthy
diff --git a/roles/etcd_migrate/tasks/check_cluster_status.yml b/roles/etcd_migrate/tasks/check_cluster_status.yml
new file mode 100644
index 000000000..90fe385c1
--- /dev/null
+++ b/roles/etcd_migrate/tasks/check_cluster_status.yml
@@ -0,0 +1,32 @@
+---
+# Compare the raft index of all etcd members and record, on every host,
+# whether the members are close enough to each other
+# (l_etcd_cluster_status_ok).
+# etcd_peer is provided by the caller of the role.
+- name: Check cluster status
+  command: >
+    etcdctl --cert /etc/etcd/peer.crt --key /etc/etcd/peer.key --cacert /etc/etcd/ca.crt --endpoints 'https://{{ etcd_peer }}:2379' -w json endpoint status
+  environment:
+    # Environment values are strings; quote to avoid integer typing.
+    ETCDCTL_API: "3"
+  register: l_etcd_cluster_status
+  # Read-only query, so never report "changed".
+  changed_when: false
+
+- name: Retrieve raftIndex
+  set_fact:
+    etcd_member_raft_index: "{{ (l_etcd_cluster_status.stdout | from_json)[0]['Status']['raftIndex'] }}"
+
+# Aggregation is done once, on the first etcd host only. The comparison is
+# an exact match: the previous `in` test was a substring check against the
+# host name and could also select a host whose name is contained in the
+# first host's name.
+- block:
+  # http://docs.ansible.com/ansible/playbooks_filters.html#extracting-values-from-containers
+  # Values are coerced to int so that min/max compare numerically even if
+  # a fact was stored as a string.
+  - name: Group all raftIndices into a list
+    set_fact:
+      etcd_members_raft_indices: "{{ groups['oo_etcd_to_config'] | map('extract', hostvars, 'etcd_member_raft_index') | map('int') | list | unique }}"
+
+  # Stores max(raftIndex) - min(raftIndex); the threshold is applied below.
+  - name: Check the minimum and the maximum of raftIndices is at most 1
+    set_fact:
+      etcd_members_raft_indices_diff: "{{ ((etcd_members_raft_indices | max | int) - (etcd_members_raft_indices | min | int)) | int }}"
+
+  - debug:
+      msg: "Raft indices difference: {{ etcd_members_raft_indices_diff }}"
+
+  when: inventory_hostname == groups.oo_etcd_to_config[0]
+
+# The cluster raft status is ok if the difference of the max and min raft index is at most 1
+# Every host reads the difference computed on the first etcd host.
+- name: capture the status
+  set_fact:
+    l_etcd_cluster_status_ok: "{{ hostvars[groups.oo_etcd_to_config[0]]['etcd_members_raft_indices_diff'] | int < 2 }}"
diff --git a/roles/etcd_migrate/tasks/configure.yml b/roles/etcd_migrate/tasks/configure.yml
new file mode 100644
index 000000000..a305d5bf3
--- /dev/null
+++ b/roles/etcd_migrate/tasks/configure.yml
@@ -0,0 +1,13 @@
+---
+# Set two apiServerArguments entries in the master config through yedit,
+# one key per loop iteration. Each value is a single-element list.
+- name: Configure master to use etcd3 storage backend
+  with_items:
+  - key: kubernetesMasterConfig.apiServerArguments.storage-backend
+    value: ['etcd3']
+  - key: kubernetesMasterConfig.apiServerArguments.storage-media-type
+    value: ['application/vnd.kubernetes.protobuf']
+  yedit:
+    src: /etc/origin/master/master-config.yaml
+    key: "{{ item.key }}"
+    value: "{{ item.value }}"
diff --git a/roles/etcd_migrate/tasks/main.yml b/roles/etcd_migrate/tasks/main.yml
new file mode 100644
index 000000000..409b0b613
--- /dev/null
+++ b/roles/etcd_migrate/tasks/main.yml
@@ -0,0 +1,25 @@
+---
+# Entry point of the role: validate r_etcd_migrate_action, then include the
+# task file named after it (check.yml, migrate.yml or configure.yml).
+- name: Fail if invalid r_etcd_migrate_action provided
+  when: r_etcd_migrate_action not in ['check', 'migrate', 'configure']
+  fail:
+    msg: "etcd_migrate role can only be called with 'check' or 'migrate' or 'configure'"
+
+- name: Include main action task file
+  include: "{{ r_etcd_migrate_action }}.yml"
+
+# 2. migrate v2 datadir into v3:
+# ETCDCTL_API=3 ./etcdctl migrate --data-dir=${data_dir} --no-ttl
+# backup the etcd datadir first
+# Provide a way for an operator to specify transformer
+
+# 3. re-configure OpenShift master at /etc/origin/master/master-config.yaml
+# set storage-backend to "etcd3"
+# 4. we could leave the master restart to current logic (there is already the code ready (single vs. HA master))
+
+# Run
+# etcdctl --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt --endpoint https://172.16.186.45:2379 cluster-health
+# to check the cluster health (from the etcdctl.sh aliases file)
+
+# Another assumption:
+# - in order to migrate all etcd v2 data into v3, we need to shut down the cluster (let's verify that on Wednesday meeting)
+# -
diff --git a/roles/etcd_migrate/tasks/migrate.yml b/roles/etcd_migrate/tasks/migrate.yml
new file mode 100644
index 000000000..cb479b0cc
--- /dev/null
+++ b/roles/etcd_migrate/tasks/migrate.yml
@@ -0,0 +1,53 @@
+---
+# Offline migration of one etcd member: stop the member, convert its data
+# directory from v2 to v3, start it again, re-attach leases to the
+# TTL-bearing key prefixes, and finally flag the host as migrated.
+
+# Should this be run in a serial manner?
+- set_fact:
+    # `| bool` guards against the flag arriving as a string.
+    l_etcd_service: "{{ 'etcd_container' if openshift.common.is_containerized | bool else 'etcd' }}"
+
+- name: Disable etcd members
+  service:
+    name: "{{ l_etcd_service }}"
+    state: stopped
+
+# Should we skip all TTL keys? https://bugzilla.redhat.com/show_bug.cgi?id=1389773
+- name: Migrate etcd data
+  command: >
+    etcdctl migrate --data-dir={{ etcd_data_dir }}
+  environment:
+    # Environment values are strings; quote to avoid integer typing.
+    ETCDCTL_API: "3"
+  register: l_etcdctl_migrate
+
+# TODO(jchaloup): If any of the members fails, we need to restore all members to v2 from the pre-migrate backup
+# Success is detected from the command output, not only from the exit code.
+- name: Check the etcd v2 data are correctly migrated
+  fail:
+    msg: "Failed to migrate a member"
+  when: "'finished transforming keys' not in l_etcdctl_migrate.stdout"
+
+# TODO(jchaloup): start the etcd on a different port so no one can access it
+# Once the validation is done
+- name: Enable etcd member
+  service:
+    name: "{{ l_etcd_service }}"
+    state: started
+
+# The folded scalar (>) already joins these lines with spaces, so no
+# backslash continuations are needed.
+- name: Re-introduce leases (as a replacement for key TTLs)
+  command: >
+    oadm migrate etcd-ttl
+    --cert {{ etcd_peer_cert_file }}
+    --key {{ etcd_peer_key_file }}
+    --cacert {{ etcd_peer_ca_file }}
+    --etcd-address 'https://{{ etcd_peer }}:2379'
+    --ttl-keys-prefix {{ item }}
+    --lease-duration 1h
+  environment:
+    ETCDCTL_API: "3"
+  with_items:
+  - "/kubernetes.io/events"
+  - "/kubernetes.io/masterleases"
+
+# Read by the "Gate on etcd migration" play in the migrate playbook.
+# The member was already started above, so the former second
+# "Enable etcd member" task at the end of this file was a no-op and has
+# been dropped.
+- set_fact:
+    r_etcd_migrate_success: true