Make VIM tolerant of compute service failures
When the VIM detects the nova compute service is down on a
worker host, it attempts to migrate instances off that host
(by "disabling" the host). However, this isn't possible if
the compute service is down. The VIM then fails the instances,
which will eventually result in their evacuation (if the host
goes offline) or a reboot of the instance (if the compute
service recovers).
In a containerized deployment, when the libvirt pod is restarted
(e.g. when stx-openstack application is re-applied), nova
reports that the compute service is down (for a short period
of time), which causes the undesirable behaviour described
above. The VIM is being updated to not disable the host in
this case and instead just raise an alarm to indicate that
the compute service has failed.
Change-Id: I186d8d76bbcd87405bafec47deb92ec24580640e
Closes-Bug: 1833096
Signed-off-by: Bart Wensley <barton.wensley@windriver.com>
(cherry picked from commit a9004988dc)
This commit is contained in:
parent
ac00a68b22
commit
4ea74a99c9
|
@ -53,8 +53,12 @@ class EnabledState(state_machine.State):
|
|||
return HOST_STATE.DISABLING
|
||||
|
||||
elif HOST_EVENT.TASK_COMPLETED == event:
|
||||
# Do not disable this host if only the compute service is disabled.
|
||||
# We will raise an alarm, but there is no way to safely move work
|
||||
# off the host if the compute service is down.
|
||||
if objects.HOST_SERVICE_STATE.ENABLED != \
|
||||
host.host_service_state_aggregate():
|
||||
host.host_service_state_aggregate(
|
||||
ignore_services=[objects.HOST_SERVICES.COMPUTE]):
|
||||
if not host.host_services_locked:
|
||||
DLOG.info("Host services are not enabled on %s. "
|
||||
"Disabling host." % host.name)
|
||||
|
@ -62,6 +66,7 @@ class EnabledState(state_machine.State):
|
|||
else:
|
||||
DLOG.info("Host services are not enabled on %s. "
|
||||
"Host services are locked." % host.name)
|
||||
|
||||
elif HOST_EVENT.TASK_FAILED == event:
|
||||
DLOG.info("Audit failed for %s." % host.name)
|
||||
|
||||
|
|
|
@ -185,10 +185,12 @@ class Host(ObjectData):
|
|||
"""
|
||||
return self._host_service_state[service]
|
||||
|
||||
def host_service_state_aggregate(self):
|
||||
def host_service_state_aggregate(self, ignore_services=None):
|
||||
"""
|
||||
Returns the overall state of the host services
|
||||
"""
|
||||
if ignore_services is None:
|
||||
ignore_services = []
|
||||
all_enabled = True
|
||||
at_least_one_failed = False
|
||||
for service, service_state in self._host_service_state.items():
|
||||
|
@ -196,6 +198,9 @@ class Host(ObjectData):
|
|||
# there is no query function for that service.
|
||||
if service == HOST_SERVICES.CONTAINER:
|
||||
continue
|
||||
# Ignore services we were told to ignore
|
||||
if service in ignore_services:
|
||||
continue
|
||||
all_enabled = all_enabled and \
|
||||
(service_state == HOST_SERVICE_STATE.ENABLED)
|
||||
at_least_one_failed = at_least_one_failed or \
|
||||
|
@ -758,30 +763,39 @@ class Host(ObjectData):
|
|||
|
||||
if service is not None:
|
||||
if host_service_state == self._host_service_state[service]:
|
||||
# No change to the state of the service
|
||||
return
|
||||
|
||||
self._host_service_state[service] = host_service_state
|
||||
|
||||
# Host services logs and alarms only apply to worker hosts
|
||||
if 'worker' in self.personality:
|
||||
host_service_state_overall = \
|
||||
self.host_service_state_aggregate()
|
||||
if (HOST_SERVICE_STATE.ENABLED ==
|
||||
host_service_state_overall):
|
||||
# Host services logs and alarms only apply to the compute service on
|
||||
# worker hosts
|
||||
if 'worker' in self.personality and HOST_SERVICES.COMPUTE == service:
|
||||
if HOST_SERVICE_STATE.ENABLED == host_service_state:
|
||||
self._events = event_log.host_issue_log(
|
||||
self, event_log.EVENT_ID.HOST_SERVICES_ENABLED)
|
||||
alarm.host_clear_alarm(self._alarms)
|
||||
self._alarms[:] = list()
|
||||
|
||||
elif (HOST_SERVICE_STATE.DISABLED ==
|
||||
host_service_state_overall):
|
||||
elif HOST_SERVICE_STATE.DISABLED == host_service_state:
|
||||
# Always log the disabled compute service
|
||||
self._events = event_log.host_issue_log(
|
||||
self, event_log.EVENT_ID.HOST_SERVICES_DISABLED)
|
||||
# Clear any previous alarms for this host
|
||||
alarm.host_clear_alarm(self._alarms)
|
||||
self._alarms[:] = list()
|
||||
# Alarm the disabled compute service if the host is still
|
||||
# enabled and is not being locked. Alarm it as a failure.
|
||||
if self.nfvi_host_is_enabled():
|
||||
if reason is None:
|
||||
additional_text = ''
|
||||
else:
|
||||
additional_text = ", %s" % reason
|
||||
self._alarms = alarm.host_raise_alarm(
|
||||
self, alarm.ALARM_TYPE.HOST_SERVICES_FAILED,
|
||||
additional_text=additional_text)
|
||||
|
||||
elif (HOST_SERVICE_STATE.FAILED ==
|
||||
host_service_state_overall):
|
||||
elif HOST_SERVICE_STATE.FAILED == host_service_state:
|
||||
if reason is None:
|
||||
additional_text = ''
|
||||
else:
|
||||
|
@ -790,6 +804,10 @@ class Host(ObjectData):
|
|||
self._events = event_log.host_issue_log(
|
||||
self, event_log.EVENT_ID.HOST_SERVICES_FAILED,
|
||||
additional_text=additional_text)
|
||||
# Clear any previous alarms for this host
|
||||
alarm.host_clear_alarm(self._alarms)
|
||||
self._alarms[:] = list()
|
||||
# Alarm the failed compute service
|
||||
self._alarms = alarm.host_raise_alarm(
|
||||
self, alarm.ALARM_TYPE.HOST_SERVICES_FAILED,
|
||||
additional_text=additional_text)
|
||||
|
|
Loading…
Reference in New Issue