From 4ea74a99c9148ee5c813a3bec086103c463634e5 Mon Sep 17 00:00:00 2001 From: Bart Wensley Date: Fri, 16 Aug 2019 13:48:09 -0500 Subject: [PATCH] Make VIM tolerant of compute service failures When the VIM detects the nova compute service is down on a worker host, it attempts to migrate instances off that host (by "disabling" the host). However, this isn't possible if the compute service is down. The VIM then fails the instances, which will eventually result in their evacuation (if the host goes offline) or a reboot of the instance (if the compute service recovers). In the containers world, when the libvirt pod is restarted (e.g. when stx-openstack application is re-applied), nova reports that the compute service is down (for a short period of time), which causes the undesirable behaviour described above. The VIM is being updated to not disable the host in this case and instead just raise an alarm to indicate that the compute service has failed. Change-Id: I186d8d76bbcd87405bafec47deb92ec24580640e Closes-Bug: 1833096 Signed-off-by: Bart Wensley (cherry picked from commit a9004988dc37bdd9caefdbcf911472b38c4db5ac) --- .../nfv_vim/host_fsm/_host_state_enabled.py | 7 +++- nfv/nfv-vim/nfv_vim/objects/_host.py | 40 ++++++++++++++----- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/nfv/nfv-vim/nfv_vim/host_fsm/_host_state_enabled.py b/nfv/nfv-vim/nfv_vim/host_fsm/_host_state_enabled.py index a9672d70..c9342c22 100755 --- a/nfv/nfv-vim/nfv_vim/host_fsm/_host_state_enabled.py +++ b/nfv/nfv-vim/nfv_vim/host_fsm/_host_state_enabled.py @@ -53,8 +53,12 @@ class EnabledState(state_machine.State): return HOST_STATE.DISABLING elif HOST_EVENT.TASK_COMPLETED == event: + # Do not disable this host if only the compute service is disabled. + # We will raise an alarm, but there is no way to safely move work + # off the host if the compute service is down. if objects.HOST_SERVICE_STATE.ENABLED != \ - host.host_service_state_aggregate(): + host.host_service_state_aggregate( + ignore_services=[objects.HOST_SERVICES.COMPUTE]): if not host.host_services_locked: DLOG.info("Host services are not enabled on %s. " "Disabling host." % host.name) @@ -62,6 +66,7 @@ class EnabledState(state_machine.State): else: DLOG.info("Host services are not enabled on %s. " "Host services are locked." % host.name) + elif HOST_EVENT.TASK_FAILED == event: DLOG.info("Audit failed for %s." % host.name) diff --git a/nfv/nfv-vim/nfv_vim/objects/_host.py b/nfv/nfv-vim/nfv_vim/objects/_host.py index bbc3c8f7..536aa92e 100755 --- a/nfv/nfv-vim/nfv_vim/objects/_host.py +++ b/nfv/nfv-vim/nfv_vim/objects/_host.py @@ -185,10 +185,12 @@ class Host(ObjectData): """ return self._host_service_state[service] - def host_service_state_aggregate(self): + def host_service_state_aggregate(self, ignore_services=None): """ Returns the overall state of the host services """ + if ignore_services is None: + ignore_services = [] all_enabled = True at_least_one_failed = False for service, service_state in self._host_service_state.items(): @@ -196,6 +198,9 @@ class Host(ObjectData): # there is no query function for that sevice. if service == HOST_SERVICES.CONTAINER: continue + # Ignore services we were told to ignore + if service in ignore_services: + continue all_enabled = all_enabled and \ (service_state == HOST_SERVICE_STATE.ENABLED) at_least_one_failed = at_least_one_failed or \ @@ -758,30 +763,39 @@ class Host(ObjectData): if service is not None: if host_service_state == self._host_service_state[service]: + # No change to the state of the service return self._host_service_state[service] = host_service_state - # Host services logs and alarms only apply to worker hosts - if 'worker' in self.personality: - host_service_state_overall = \ - self.host_service_state_aggregate() - if (HOST_SERVICE_STATE.ENABLED == - host_service_state_overall): + # Host services logs and alarms only apply to the compute service on + # worker hosts + if 'worker' in self.personality and HOST_SERVICES.COMPUTE == service: + if HOST_SERVICE_STATE.ENABLED == host_service_state: self._events = event_log.host_issue_log( self, event_log.EVENT_ID.HOST_SERVICES_ENABLED) alarm.host_clear_alarm(self._alarms) self._alarms[:] = list() - elif (HOST_SERVICE_STATE.DISABLED == - host_service_state_overall): + elif HOST_SERVICE_STATE.DISABLED == host_service_state: + # Always log the disabled compute service self._events = event_log.host_issue_log( self, event_log.EVENT_ID.HOST_SERVICES_DISABLED) + # Clear any previous alarms for this host alarm.host_clear_alarm(self._alarms) self._alarms[:] = list() + # Alarm the disabled compute service if the host is still + # enabled and is not being locked. Alarm it as a failure. + if self.nfvi_host_is_enabled(): + if reason is None: + additional_text = '' + else: + additional_text = ", %s" % reason + self._alarms = alarm.host_raise_alarm( + self, alarm.ALARM_TYPE.HOST_SERVICES_FAILED, + additional_text=additional_text) - elif (HOST_SERVICE_STATE.FAILED == - host_service_state_overall): + elif HOST_SERVICE_STATE.FAILED == host_service_state: if reason is None: additional_text = '' else: @@ -790,6 +804,10 @@ class Host(ObjectData): self._events = event_log.host_issue_log( self, event_log.EVENT_ID.HOST_SERVICES_FAILED, additional_text=additional_text) + # Clear any previous alarms for this host + alarm.host_clear_alarm(self._alarms) + self._alarms[:] = list() + # Alarm the failed compute service self._alarms = alarm.host_raise_alarm( self, alarm.ALARM_TYPE.HOST_SERVICES_FAILED, additional_text=additional_text)