From 5624c74062f97c2cfaddda6d5d5cb04a10411db4 Mon Sep 17 00:00:00 2001
From: Tyler Smith
Date: Thu, 10 Jan 2019 11:37:27 -0500
Subject: [PATCH] Trigger application reapply on host unlock/delete

- Triggers an application reapply (including override regeneration) on
  node unlock to pick up a newly added node or configuration changes.
- The reapply also triggers on node delete if the host had a compute
  node label, to remove any per-host overrides.
- Turned on the restriction that nodes must be locked to modify labels.
  Added an audit task to sync any labels that were assigned before a
  node was unlocked and are missing from k8s.
- Modified the k8s puppet manifest to only launch kubelet on the
  initial configuration; after that the service file is modified to
  have a dependency on the config gate. This avoids PLEG errors in
  kubernetes due to the node being overwhelmed during boot.

Change-Id: I1d9ca92f451aa322765da43ffcbb1d95f97f92f2
Story: 2004520
Task: 28826
Signed-off-by: Tyler Smith
---
 .../modules/platform/manifests/kubernetes.pp  | 46 +++++++++++-
 .../templates/kube-stx-override.conf.erb      |  2 +
 .../sysinv/sysinv/api/controllers/v1/host.py  | 40 +++++++++++
 .../sysinv/sysinv/api/controllers/v1/label.py |  8 +--
 .../sysinv/sysinv/sysinv/common/constants.py  |  1 +
 .../sysinv/sysinv/sysinv/common/exception.py  |  4 ++
 .../sysinv/sysinv/sysinv/common/kubernetes.py | 15 +++-
 .../sysinv/sysinv/conductor/kube_app.py       | 72 ++++++++++++-------
 .../sysinv/sysinv/sysinv/conductor/manager.py | 39 +++++++++-
 sysinv/sysinv/sysinv/sysinv/helm/neutron.py   |  3 +-
 sysinv/sysinv/sysinv/sysinv/helm/nova.py      |  3 +-
 11 files changed, 197 insertions(+), 36 deletions(-)
 create mode 100644 puppet-manifests/src/modules/platform/templates/kube-stx-override.conf.erb
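Example (not part of the commit): a minimal standalone sketch of the label
sync that the new _audit_kubernetes_labels task performs, written against
the kubernetes Python client directly rather than sysinv's KubeOperator
wrapper. The hostname and label values are illustrative only; the real code
resolves them from the sysinv database and goes through
KubeOperator.kube_patch_node(), as shown in the hunks below.

    from kubernetes import client, config

    def sync_label(hostname, key, value):
        # Assumes the admin kubeconfig laid down by the puppet manifests.
        config.load_kube_config('/etc/kubernetes/admin.conf')
        v1 = client.CoreV1Api()
        for node in v1.list_node().items:
            if node.metadata.name == hostname:
                # Patch only if the label is missing, mirroring the audit.
                if key not in (node.metadata.labels or {}):
                    body = {'metadata': {'labels': {key: value}}}
                    v1.patch_node(hostname, body)
                return True
        # Node not registered with k8s yet; the periodic audit retries later.
        return False

    # e.g. sync_label('compute-0', 'openstack-compute-node', 'enabled')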
diff --git a/puppet-manifests/src/modules/platform/manifests/kubernetes.pp b/puppet-manifests/src/modules/platform/manifests/kubernetes.pp
index f30dbd5da6..319d23b626 100644
--- a/puppet-manifests/src/modules/platform/manifests/kubernetes.pp
+++ b/puppet-manifests/src/modules/platform/manifests/kubernetes.pp
@@ -39,7 +39,6 @@ class platform::kubernetes::kubeadm {
   }
   # Start kubelet.
   -> service { 'kubelet':
-    ensure => 'running',
     enable => true,
   }
   # A seperate enable is required since we have modified the service resource
@@ -124,6 +123,21 @@ class platform::kubernetes::master::init
       command   => "kubectl --kubeconfig=/etc/kubernetes/admin.conf taint node ${::platform::params::hostname} node-role.kubernetes.io/master-", # lint:ignore:140chars
       logoutput => true,
     }
+
+    # Add a dependency to kubelet on config so it doesn't enter a bad state on subsequent boots
+    -> file { '/etc/systemd/system/kubelet.service.d/kube-stx-override.conf':
+      ensure  => file,
+      content => template('platform/kube-stx-override.conf.erb'),
+      owner   => 'root',
+      group   => 'root',
+      mode    => '0644',
+    }
+
+    # Reload systemd
+    -> exec { 'perform systemctl daemon reload for kubelet override':
+      command   => 'systemctl daemon-reload',
+      logoutput => true,
+    }
   } else {
     if str2bool($::is_initial_config) {
       # For subsequent controller installs, install kubernetes using the
@@ -206,6 +220,21 @@
         command   => "kubectl --kubeconfig=/etc/kubernetes/admin.conf taint node ${::platform::params::hostname} node-role.kubernetes.io/master-", # lint:ignore:140chars
         logoutput => true,
       }
+
+      # Add a dependency to kubelet on config so it doesn't enter a bad state on subsequent boots
+      -> file { '/etc/systemd/system/kubelet.service.d/kube-stx-override.conf':
+        ensure  => file,
+        content => template('platform/kube-stx-override.conf.erb'),
+        owner   => 'root',
+        group   => 'root',
+        mode    => '0644',
+      }
+
+      # Reload systemd
+      -> exec { 'perform systemctl daemon reload for kubelet override':
+        command   => 'systemctl daemon-reload',
+        logoutput => true,
+      }
     }
   }
 }
@@ -242,6 +271,21 @@ class platform::kubernetes::worker::init
     logoutput => true,
     unless    => 'test -f /etc/kubernetes/kubelet.conf',
   }
+
+  # Add a dependency to kubelet on config so it doesn't enter a bad state
+  -> file { '/etc/systemd/system/kubelet.service.d/kube-stx-override.conf':
+    ensure  => file,
+    content => template('platform/kube-stx-override.conf.erb'),
+    owner   => 'root',
+    group   => 'root',
+    mode    => '0644',
+  }
+
+  # Reload systemd
+  -> exec { 'perform systemctl daemon reload for kubelet override':
+    command   => 'systemctl daemon-reload',
+    logoutput => true,
+  }
 }
 
 class platform::kubernetes::worker
diff --git a/puppet-manifests/src/modules/platform/templates/kube-stx-override.conf.erb b/puppet-manifests/src/modules/platform/templates/kube-stx-override.conf.erb
new file mode 100644
index 0000000000..69f59043f2
--- /dev/null
+++ b/puppet-manifests/src/modules/platform/templates/kube-stx-override.conf.erb
@@ -0,0 +1,2 @@
+[Unit]
+After=config.service
\ No newline at end of file
diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py
index 3c943e9d42..bddb110039 100644
--- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py
+++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py
@@ -2110,6 +2110,11 @@ class HostController(rest.RestController):
                 ihost_obj['uuid'],
                 ibm_msg_dict)
 
+            # Trigger a system app reapply if the host has been unlocked
+            if (utils.is_kubernetes_config() and patched_ihost.get('action') in
+                    [constants.UNLOCK_ACTION, constants.FORCE_UNLOCK_ACTION]):
+                self._reapply_system_app()
+
         elif mtc_response['status'] is None:
             raise wsme.exc.ClientSideError(
                 _("Timeout waiting for maintenance response. "
" @@ -2341,6 +2346,15 @@ class HostController(rest.RestController): # wait for VIM signal return + openstack_worker = False + if utils.is_kubernetes_config(): + labels = objects.label.get_by_host_id(pecan.request.context, ihost.uuid) + for l in labels: + if (constants.COMPUTE_NODE_LABEL == + str(l.label_key) + '=' + str(l.label_value)): + openstack_worker = True + break + idict = {'operation': constants.DELETE_ACTION, 'uuid': ihost.uuid, 'invprovision': ihost.invprovision} @@ -2464,6 +2478,32 @@ class HostController(rest.RestController): pecan.request.dbapi.ihost_destroy(ihost_id) + # If the host being removed was an openstack worker node, trigger + # a reapply + if openstack_worker: + self._reapply_system_app() + + def _reapply_system_app(self): + try: + db_app = objects.kube_app.get_by_name( + pecan.request.context, constants.HELM_APP_OPENSTACK) + + if db_app.status == constants.APP_APPLY_SUCCESS: + LOG.info( + "Reapplying the %s app" % constants.HELM_APP_OPENSTACK) + db_app.status = constants.APP_APPLY_IN_PROGRESS + db_app.progress = None + db_app.save() + pecan.request.rpcapi.perform_app_apply( + pecan.request.context, db_app) + else: + LOG.info("%s system app is present but not applied, " + "skipping re-apply" % constants.HELM_APP_OPENSTACK) + except exception.KubeAppNotFound: + LOG.info( + "%s system app not present, skipping re-apply" % + constants.HELM_APP_OPENSTACK) + def _check_upgrade_provision_order(self, personality, hostname): LOG.info("_check_upgrade_provision_order personality=%s, hostname=%s" % (personality, hostname)) diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/label.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/label.py index abd7f68ec8..3121eddae3 100644 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/label.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/label.py @@ -288,11 +288,5 @@ class LabelController(rest.RestController): # UTILS ########### def _check_host_locked(host): - - # TODO(ksmith): - # turn this on later - return - - if (utils.is_aio_simplex_host_unlocked(host) or - host.administrative != constants.ADMIN_LOCKED): + if host.administrative != constants.ADMIN_LOCKED: raise wsme.exc.ClientSideError(_("Host must be locked.")) diff --git a/sysinv/sysinv/sysinv/sysinv/common/constants.py b/sysinv/sysinv/sysinv/sysinv/common/constants.py index e86d69b522..666a105bd5 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/constants.py +++ b/sysinv/sysinv/sysinv/sysinv/common/constants.py @@ -123,6 +123,7 @@ HOST_DELETE = 'host_delete' # for personality sub-type validation # Availability AVAILABILITY_AVAILABLE = 'available' +AVAILABILITY_INTEST = 'intest' AVAILABILITY_OFFLINE = 'offline' AVAILABILITY_ONLINE = 'online' AVAILABILITY_DEGRADED = 'degraded' diff --git a/sysinv/sysinv/sysinv/sysinv/common/exception.py b/sysinv/sysinv/sysinv/sysinv/common/exception.py index b675ee862b..30077ae331 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/exception.py +++ b/sysinv/sysinv/sysinv/sysinv/common/exception.py @@ -1170,6 +1170,10 @@ class HostLabelInvalid(Invalid): message = _("Host label is invalid. 
 
 
+class K8sNodeNotFound(NotFound):
+    message = _("Kubernetes Node %(name)s could not be found.")
+
+
 class PickleableException(Exception):
     """
     Pickleable Exception
diff --git a/sysinv/sysinv/sysinv/sysinv/common/kubernetes.py b/sysinv/sysinv/sysinv/sysinv/common/kubernetes.py
index 511f6bff47..7d5abffc54 100644
--- a/sysinv/sysinv/sysinv/sysinv/common/kubernetes.py
+++ b/sysinv/sysinv/sysinv/sysinv/common/kubernetes.py
@@ -51,6 +51,19 @@ class KubeOperator(object):
             if e.status == httplib.UNPROCESSABLE_ENTITY:
                 reason = json.loads(e.body).get('message', "")
                 raise exception.HostLabelInvalid(reason=reason)
+            elif e.status == httplib.NOT_FOUND:
+                raise exception.K8sNodeNotFound(name=name)
+            else:
+                raise
         except Exception as e:
-            LOG.error("Kubernetes exception: %s" % e)
+            LOG.error("Kubernetes exception in kube_patch_node: %s" % e)
+            raise
+
+    def kube_get_nodes(self):
+        try:
+            api_response = self._get_kubernetesclient().list_node()
+            LOG.debug("Response: %s" % api_response)
+            return api_response.items
+        except Exception as e:
+            LOG.error("Kubernetes exception in kube_get_nodes: %s" % e)
+            raise
diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py
index a1177e756d..4055949102 100644
--- a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py
+++ b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py
@@ -532,10 +532,15 @@ class AppOperator(object):
             }
         }
         body['metadata']['labels'].update(label_dict)
-        self._kube.kube_patch_node(hostname, body)
+        try:
+            self._kube.kube_patch_node(hostname, body)
+        except exception.K8sNodeNotFound:
+            pass
 
     def _assign_host_labels(self, hosts, labels):
         for host in hosts:
+            if host.administrative != constants.ADMIN_LOCKED:
+                continue
             for label_str in labels:
                 k, v = label_str.split('=')
                 try:
@@ -557,6 +562,8 @@ class AppOperator(object):
 
     def _remove_host_labels(self, hosts, labels):
         for host in hosts:
+            if host.administrative != constants.ADMIN_LOCKED:
+                continue
             null_labels = {}
             for label_str in labels:
                 lbl_obj = self._find_label(host.uuid, label_str)
@@ -941,43 +948,60 @@ class AppOperator(object):
         if self._make_armada_request_with_monitor(app,
                                                   constants.APP_DELETE_OP):
             if app.system_app:
-                try:
-                    # TODO convert these kubectl commands to use the k8s api
-                    p1 = subprocess.Popen(
-                        ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
-                         'get', 'pvc', '--no-headers', '-n', 'openstack'],
-                        stdout=subprocess.PIPE)
-                    p2 = subprocess.Popen(['awk', '{print $3}'],
-                                          stdin=p1.stdout,
-                                          stdout=subprocess.PIPE)
-                    p3 = subprocess.Popen(
-                        ['xargs', '-i', 'kubectl',
-                         '--kubeconfig=/etc/kubernetes/admin.conf', 'delete',
-                         'pv', '{}', '--wait=false'],
-                        stdin=p2.stdout,
-                        stdout=subprocess.PIPE,
-                        stderr=subprocess.PIPE)
+                # TODO convert these kubectl commands to use the k8s api
+                p1 = subprocess.Popen(
+                    ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
+                     'get', 'pvc', '--no-headers', '-n', 'openstack'],
+                    stdout=subprocess.PIPE)
+                p2 = subprocess.Popen(['awk', '{print $3}'],
+                                      stdin=p1.stdout,
+                                      stdout=subprocess.PIPE)
+                p3 = subprocess.Popen(
+                    ['xargs', '-i', 'kubectl',
+                     '--kubeconfig=/etc/kubernetes/admin.conf', 'delete',
+                     'pv', '{}', '--wait=false'],
+                    stdin=p2.stdout,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE)
 
+                timer = threading.Timer(10, p3.kill)
+                try:
+                    timer.start()
                     p1.stdout.close()
                     p2.stdout.close()
                     out, err = p3.communicate()
-                    if not err:
+                    if out and not err:
                         LOG.info("Persistent Volumes marked for deletion.")
+                    else:
+                        self._abort_operation(app, constants.APP_REMOVE_OP)
+                        LOG.error("Failed to clean up PVs after app removal.")
                 except Exception as e:
+                    self._abort_operation(app, constants.APP_REMOVE_OP)
                     LOG.exception("Failed to clean up PVs after app "
                                   "removal: %s" % e)
+                finally:
+                    timer.cancel()
 
+                p4 = subprocess.Popen(
+                    ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
+                     'delete', 'namespace', 'openstack'],
+                    stdout=subprocess.PIPE)
+                timer2 = threading.Timer(10, p4.kill)
                 try:
-                    p1 = subprocess.Popen(
-                        ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
-                         'delete', 'namespace', 'openstack'],
-                        stdout=subprocess.PIPE)
-                    out, err = p1.communicate()
-                    if not err:
+                    timer2.start()
+                    out, err = p4.communicate()
+                    if out and not err:
                         LOG.info("Openstack namespace delete completed.")
+                    else:
+                        self._abort_operation(app, constants.APP_REMOVE_OP)
+                        LOG.error("Failed to clean up openstack namespace"
+                                  " after app removal.")
                 except Exception as e:
+                    self._abort_operation(app, constants.APP_REMOVE_OP)
                     LOG.exception("Failed to clean up openstack namespace "
                                   "after app removal: %s" % e)
+                finally:
+                    timer2.cancel()
 
             self._update_app_status(app, constants.APP_UPLOAD_SUCCESS)
             LOG.info("Application (%s) remove completed." % app.name)
diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py
index 5f42b95dbf..8ba9dd1934 100644
--- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py
+++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py
@@ -4937,12 +4937,45 @@ class ConductorManager(service.PeriodicService):
         # Audit install states
         self._audit_install_states(hosts)
 
+        # Audit kubernetes node labels
+        self._audit_kubernetes_labels(hosts)
+
         for host in hosts:
             # only audit configured hosts
             if not host.personality:
                 continue
             self._audit_ihost_action(host)
 
+    def _audit_kubernetes_labels(self, hosts):
+        if not utils.is_kubernetes_config(self.dbapi):
+            LOG.debug("_audit_kubernetes_labels skip")
+            return
+
+        LOG.debug("Starting kubernetes label audit")
+        sysinv_labels = self.dbapi.label_get_all()
+        nodes = self._kube.kube_get_nodes()
+
+        for host in hosts:
+            try:
+                for node in nodes:
+                    if host.hostname == node.metadata.name:
+                        node_labels = node.metadata.labels
+                        host_labels = [l for l in sysinv_labels if l.host_id == host.id]
+                        for host_label in host_labels:
+                            if host_label.label_key not in node_labels.keys():
+                                LOG.info("Label audit: creating %s=%s on node %s"
+                                         % (host_label.label_key,
+                                            host_label.label_value, host.hostname))
+                                body = {
+                                    'metadata': {
+                                        'labels': {host_label.label_key: host_label.label_value}
+                                    }
+                                }
+                                self._kube.kube_patch_node(host.hostname, body)
+            except Exception as e:
+                LOG.warning("Failed to sync kubernetes label to host %s: %s" %
+                            (host.hostname, e))
+
     # TODO(CephPoolsDecouple): remove
     @periodic_task.periodic_task(spacing=60)
     def _osd_pool_audit(self, context):
@@ -10530,7 +10563,11 @@
             }
         }
         body['metadata']['labels'].update(label_dict)
-        self._kube.kube_patch_node(host.hostname, body)
+        try:
+            self._kube.kube_patch_node(host.hostname, body)
+        except exception.K8sNodeNotFound:
+            LOG.info("Host %s does not exist in kubernetes yet, label will "
+                     "be added after node's unlock by audit" % host.hostname)
 
     def update_host_memory(self, context, host_uuid):
         try:
diff --git a/sysinv/sysinv/sysinv/sysinv/helm/neutron.py b/sysinv/sysinv/sysinv/sysinv/helm/neutron.py
index 5a2d318bd1..9db377f7fa 100644
--- a/sysinv/sysinv/sysinv/sysinv/helm/neutron.py
+++ b/sysinv/sysinv/sysinv/sysinv/helm/neutron.py
@@ -152,7 +152,8 @@ class NeutronHelm(openstack.OpenstackBaseHelm):
 
         hosts = self.dbapi.ihost_get_list()
         for host in hosts:
-            if (host.invprovision == constants.PROVISIONED):
+            if (host.invprovision in [constants.PROVISIONED,
+                                      constants.PROVISIONING]):
                 if constants.WORKER in utils.get_personalities(host):
                     hostname = str(host.hostname)
diff --git a/sysinv/sysinv/sysinv/sysinv/helm/nova.py b/sysinv/sysinv/sysinv/sysinv/helm/nova.py
index 2e9f2ddce0..2903c86d8c 100644
--- a/sysinv/sysinv/sysinv/sysinv/helm/nova.py
+++ b/sysinv/sysinv/sysinv/sysinv/helm/nova.py
@@ -422,7 +422,8 @@ class NovaHelm(openstack.OpenstackBaseHelm):
 
         hosts = self.dbapi.ihost_get_list()
         for host in hosts:
-            if (host.invprovision == constants.PROVISIONED):
+            if (host.invprovision in [constants.PROVISIONED,
+                                      constants.PROVISIONING]):
                 if constants.WORKER in utils.get_personalities(host):
                     hostname = str(host.hostname)
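Example (not part of the commit): a minimal sketch of the threading.Timer
guard that kube_app.py now wraps around the kubectl cleanup calls, with a
placeholder command and timeout; it kills the child process if it has not
finished within the allotted time and always cancels the timer.

    import subprocess
    import threading

    def run_with_timeout(cmd, timeout=10):
        # Kill the child if it does not finish within `timeout` seconds,
        # mirroring the timer/finally pattern added above.
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        timer = threading.Timer(timeout, proc.kill)
        try:
            timer.start()
            return proc.communicate()
        finally:
            timer.cancel()

    # e.g. run_with_timeout(['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
    #                        'get', 'pvc', '--no-headers', '-n', 'openstack'])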