7 files changed, 72 insertions, 21 deletions
diff --git a/playbooks/byo/openshift-cluster/upgrades/docker/docker_upgrade.yml b/playbooks/byo/openshift-cluster/upgrades/docker/docker_upgrade.yml
index 0d451cf77..1e0a6d4e7 100644
--- a/playbooks/byo/openshift-cluster/upgrades/docker/docker_upgrade.yml
+++ b/playbooks/byo/openshift-cluster/upgrades/docker/docker_upgrade.yml
@@ -18,20 +18,20 @@
 # If a node fails, halt everything, the admin will need to clean up and we
 # don't want to carry on, potentially taking out every node. The playbook can safely be re-run
 # and will not take any action on a node already running the requested docker version.
-- name: Evacuate and upgrade nodes
+- name: Drain and upgrade nodes
   hosts: oo_masters_to_config:oo_nodes_to_upgrade:oo_etcd_to_config
   serial: 1
   any_errors_fatal: true

   tasks:
-  - name: Prepare for Node evacuation
+  - name: Prepare for Node draining
     command: >
       {{ openshift.common.client_binary }} adm manage-node {{ openshift.node.nodename }} --schedulable=false
     delegate_to: "{{ groups.oo_first_master.0 }}"
     when: l_docker_upgrade is defined and l_docker_upgrade | bool and inventory_hostname in groups.oo_nodes_to_upgrade

-  - name: Evacuate Node for Kubelet upgrade
+  - name: Drain Node for Kubelet upgrade
     command: >
-      {{ openshift.common.client_binary }} adm manage-node {{ openshift.node.nodename }} --evacuate --force
+      {{ openshift.common.client_binary }} adm manage-node {{ openshift.node.nodename }} --drain --force
     delegate_to: "{{ groups.oo_first_master.0 }}"
     when: l_docker_upgrade is defined and l_docker_upgrade | bool and inventory_hostname in groups.oo_nodes_to_upgrade
diff --git a/playbooks/common/openshift-cluster/redeploy-certificates.yml b/playbooks/common/openshift-cluster/redeploy-certificates.yml
index 5f008a045..5fc81bf3a 100644
--- a/playbooks/common/openshift-cluster/redeploy-certificates.yml
+++ b/playbooks/common/openshift-cluster/redeploy-certificates.yml
@@ -204,7 +204,7 @@
       cp {{ openshift.common.config_base }}/master//admin.kubeconfig {{ mktemp.stdout }}/admin.kubeconfig
     changed_when: False

-- name: Serially evacuate all nodes to trigger redeployments
+- name: Serially drain all nodes to trigger redeployments
   hosts: oo_nodes_to_config
   serial: 1
   any_errors_fatal: true
@@ -222,7 +222,7 @@
       was_schedulable: "{{ 'unschedulable' not in (node_output.stdout | from_json).spec }}"
     when: openshift_certificates_redeploy_ca | default(false) | bool

-  - name: Prepare for node evacuation
+  - name: Prepare for node draining
     command: >
       {{ openshift.common.client_binary }} adm --config={{ hostvars[groups.oo_first_master.0].mktemp.stdout }}/admin.kubeconfig
       manage-node {{ openshift.node.nodename }}
@@ -230,11 +230,11 @@
     delegate_to: "{{ groups.oo_first_master.0 }}"
     when: openshift_certificates_redeploy_ca | default(false) | bool and was_schedulable | bool

-  - name: Evacuate node
+  - name: Drain node
     command: >
       {{ openshift.common.client_binary }} adm --config={{ hostvars[groups.oo_first_master.0].mktemp.stdout }}/admin.kubeconfig
       manage-node {{ openshift.node.nodename }}
-      --evacuate --force
+      --drain --force
     delegate_to: "{{ groups.oo_first_master.0 }}"
     when: openshift_certificates_redeploy_ca | default(false) | bool and was_schedulable | bool
diff --git a/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml b/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
index cefc7d12b..68b111df4 100644
--- a/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
+++ b/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
@@ -1,5 +1,5 @@
 ---
-- name: Evacuate and upgrade nodes
+- name: Drain and upgrade nodes
   hosts: oo_nodes_to_upgrade
   # This var must be set with -e on invocation, as it is not a per-host inventory var
   # and is evaluated early. Values such as "20%" can also be used.
@@ -39,9 +39,9 @@
       retries: 3
       delay: 1

-  - name: Evacuate Node for Kubelet upgrade
+  - name: Drain Node for Kubelet upgrade
     command: >
-      {{ hostvars[groups.oo_first_master.0].openshift.common.client_binary }} adm manage-node {{ openshift.node.nodename | lower }} --evacuate --force
+      {{ hostvars[groups.oo_first_master.0].openshift.common.client_binary }} adm manage-node {{ openshift.node.nodename | lower }} --drain --force
     delegate_to: "{{ groups.oo_first_master.0 }}"
     when: inventory_hostname in groups.oo_nodes_to_upgrade
diff --git a/roles/openshift_certificate_expiry/README.md b/roles/openshift_certificate_expiry/README.md
index d44438332..a88470bdd 100644
--- a/roles/openshift_certificate_expiry/README.md
+++ b/roles/openshift_certificate_expiry/README.md
@@ -9,7 +9,7 @@ include:
 * Master/Node Service Certificates
 * Router/Registry Service Certificates from etcd secrets
 * Master/Node/Router/Registry/Admin `kubeconfig`s
-* Etcd certificates
+* Etcd certificates (including embedded)

 This role pairs well with the redeploy certificates playbook:
@@ -111,12 +111,16 @@
 There are two top-level keys in the saved JSON results, `data` and
 `summary`.

 The `data` key is a hash where the keys are the names of each host
-examined and the values are the check results for each respective
-host.
+examined and the values are the check results for the certificates
+identified on each respective host.

-The `summary` key is a hash that summarizes the number of certificates
-expiring within the configured warning window and the number of
-already expired certificates.
+The `summary` key is a hash that summarizes the total number of
+certificates:
+
+* examined on the entire cluster
+* OK
+* expiring within the configured warning window
+* already expired

 The example below is abbreviated to save space:
@@ -193,7 +197,9 @@ The example below is abbreviated to save space:
     },
     "summary": {
         "warning": 6,
-        "expired": 0
+        "expired": 0,
+        "total": 7,
+        "ok": 1
     }
 }
 ```
diff --git a/roles/openshift_certificate_expiry/filter_plugins/oo_cert_expiry.py b/roles/openshift_certificate_expiry/filter_plugins/oo_cert_expiry.py
index bedd23fe8..5f102e960 100644
--- a/roles/openshift_certificate_expiry/filter_plugins/oo_cert_expiry.py
+++ b/roles/openshift_certificate_expiry/filter_plugins/oo_cert_expiry.py
@@ -51,9 +51,13 @@ Example playbook usage:

     total_warnings = sum([hostvars[h]['check_results']['summary']['warning'] for h in play_hosts])
     total_expired = sum([hostvars[h]['check_results']['summary']['expired'] for h in play_hosts])
+    total_ok = sum([hostvars[h]['check_results']['summary']['ok'] for h in play_hosts])
+    total_total = sum([hostvars[h]['check_results']['summary']['total'] for h in play_hosts])

     json_result['summary']['warning'] = total_warnings
     json_result['summary']['expired'] = total_expired
+    json_result['summary']['ok'] = total_ok
+    json_result['summary']['total'] = total_total

     return json_result
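
To make the filter plugin's new roll-up concrete: each host's check_results.summary is summed field by field across play_hosts. The per-host figures in the sketch below are invented purely for illustration; only their sums, which match the abbreviated README example, come from this commit.

    # Hypothetical per-host summaries (invented values):
    master1.example.com:
      check_results:
        summary: {warning: 4, expired: 0, ok: 1, total: 5}
    node1.example.com:
      check_results:
        summary: {warning: 2, expired: 0, ok: 0, total: 2}

    # Field-by-field sums over play_hosts become the combined report summary:
    summary: {warning: 6, expired: 0, ok: 1, total: 7}
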
diff --git a/roles/openshift_certificate_expiry/library/openshift_cert_expiry.py b/roles/openshift_certificate_expiry/library/openshift_cert_expiry.py
index e838eb2d4..1fac284f2 100644
--- a/roles/openshift_certificate_expiry/library/openshift_cert_expiry.py
+++ b/roles/openshift_certificate_expiry/library/openshift_cert_expiry.py
@@ -467,7 +467,11 @@ an OpenShift Container Platform cluster

     ######################################################################
     # Check etcd certs
+    #
+    # Two things to check: 'external' etcd, and embedded etcd.
     ######################################################################
+    # FIRST: The 'external' etcd
+    #
     # Some values may be duplicated, make this a set for now so we
     # unique them all
     etcd_certs_to_check = set([])
@@ -506,6 +510,43 @@ an OpenShift Container Platform cluster
             classify_cert(expire_check_result, now, time_remaining, expire_window, etcd_certs)

     ######################################################################
+    # Now the embedded etcd
+    ######################################################################
+    try:
+        with open('/etc/origin/master/master-config.yaml', 'r') as fp:
+            cfg = yaml.load(fp)
+    except IOError:
+        # Not present
+        pass
+    else:
+        if cfg.get('etcdConfig', {}).get('servingInfo', {}).get('certFile', None) is not None:
+            # This is embedded
+            etcd_crt_name = cfg['etcdConfig']['servingInfo']['certFile']
+        else:
+            # Not embedded
+            etcd_crt_name = None
+
+        if etcd_crt_name is not None:
+            # etcd_crt_name is relative to the location of the
+            # master-config.yaml file
+            cfg_path = os.path.dirname(fp.name)
+            etcd_cert = os.path.join(cfg_path, etcd_crt_name)
+            with open(etcd_cert, 'r') as etcd_fp:
+                (cert_subject,
+                 cert_expiry_date,
+                 time_remaining) = load_and_handle_cert(etcd_fp.read(), now)
+
+                expire_check_result = {
+                    'cert_cn': cert_subject,
+                    'path': etcd_fp.name,
+                    'expiry': cert_expiry_date,
+                    'days_remaining': time_remaining.days,
+                    'health': None,
+                }
+
+                classify_cert(expire_check_result, now, time_remaining, expire_window, etcd_certs)
+
+    ######################################################################
     # /Check etcd certs
     ######################################################################

@@ -523,7 +564,7 @@ an OpenShift Container Platform cluster
     ######################################################################
     # First the router certs
     try:
-        router_secrets_raw = subprocess.Popen('oc get secret router-certs -o yaml'.split(),
+        router_secrets_raw = subprocess.Popen('oc get -n default secret router-certs -o yaml'.split(),
                                               stdout=subprocess.PIPE)
         router_ds = yaml.load(router_secrets_raw.communicate()[0])
         router_c = router_ds['data']['tls.crt']
@@ -552,7 +593,7 @@ an OpenShift Container Platform cluster
     ######################################################################
     # Now for registry
     try:
-        registry_secrets_raw = subprocess.Popen('oc get secret registry-certificates -o yaml'.split(),
+        registry_secrets_raw = subprocess.Popen('oc get -n default secret registry-certificates -o yaml'.split(),
                                                 stdout=subprocess.PIPE)
         registry_ds = yaml.load(registry_secrets_raw.communicate()[0])
         registry_c = registry_ds['data']['registry.crt']
diff --git a/roles/openshift_node/README.md b/roles/openshift_node/README.md
index d1920c485..616f44c1d 100644
--- a/roles/openshift_node/README.md
+++ b/roles/openshift_node/README.md
@@ -43,7 +43,7 @@ Currently we support re-labeling nodes but we don't re-schedule running pods nor

 ```
 oadm manage-node --schedulable=false ${NODE}
-oadm manage-node --evacuate ${NODE}
+oadm manage-node --drain ${NODE}
 oadm manage-node --schedulable=true ${NODE}
 ````
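
A note on the embedded-etcd branch added to openshift_cert_expiry.py above: it only runs when /etc/origin/master/master-config.yaml exists and defines etcdConfig.servingInfo.certFile, and that certFile path is resolved relative to the directory holding master-config.yaml. A master running embedded etcd typically carries a stanza along the lines of the sketch below; the field values are illustrative and not taken from this commit. On clusters using external etcd the etcdConfig key is absent, so etcd_crt_name stays None and the extra check is skipped.

    # Illustrative embedded-etcd stanza in /etc/origin/master/master-config.yaml
    etcdConfig:
      address: 127.0.0.1:4001
      peerAddress: 127.0.0.1:7001
      servingInfo:
        bindAddress: 0.0.0.0:4001
        certFile: etcd.server.crt      # relative, resolved against /etc/origin/master/
        clientCA: ca.crt
        keyFile: etcd.server.key
      storageDirectory: /var/lib/origin/openshift.local.etcd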