# distcloud/distributedcloud/dcmanager/manager/subcloud_audit_manager.py
#
# 414 lines
# 18 KiB
# Python

# Copyright 2017 Ericsson AB.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Copyright (c) 2017-2020 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#
from keystoneauth1 import exceptions as keystone_exceptions
from oslo_config import cfg
from oslo_log import log as logging
from fm_api import constants as fm_const
from fm_api import fm_api
from sysinv.common import constants as sysinv_constants
from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
from dcorch.rpc import client as dcorch_rpc_client
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import exceptions
from dcmanager.common.i18n import _
from dcmanager.common import manager
from dcmanager.db import api as db_api
from dcmanager.manager import scheduler
CONF = cfg.CONF
LOG = logging.getLogger(__name__)

# The dcorch is refreshed with the state of every subcloud roughly once an
# hour; express that period as a number of audit iterations.
SUBCLOUD_STATE_UPDATE_ITERATIONS = (
    dccommon_consts.SECONDS_IN_HOUR / CONF.scheduler.subcloud_audit_interval)
class SubcloudAuditManager(manager.Manager):
    """Manages tasks related to audits.

    A periodic audit starts one worker per eligible subcloud through the
    thread group manager and waits for all of them.  Each worker derives
    the subcloud availability from its service groups, sets or clears the
    subcloud-offline alarm, stores the result in the database and notifies
    dcorch.  Optionally it also audits whether the OpenStack application
    is active in the subcloud.
    """

    def __init__(self, *args, **kwargs):
        """Create the clients and bookkeeping used by the audits.

        :param kwargs: must contain ``subcloud_manager``, the object whose
            ``update_subcloud_endpoint_status`` is called when a subcloud
            goes offline.  A missing key raises ``KeyError``.
        """
        LOG.debug(_('SubcloudAuditManager initialization...'))

        super(SubcloudAuditManager, self).__init__(
            service_name="subcloud_audit_manager")
        self.context = context.get_admin_context()
        self.dcorch_rpc_client = dcorch_rpc_client.EngineClient()
        self.fm_api = fm_api.FaultAPIs()
        self.subcloud_manager = kwargs['subcloud_manager']
        # Keeps track of greenthreads we create to do work.
        self.thread_group_manager = scheduler.ThreadGroupManager(
            thread_pool_size=100)
        # Track workers created for each subcloud.
        self.subcloud_workers = {}
        # Number of audits since last subcloud state update
        self.audit_count = 0

    @staticmethod
    def _openstack_app_active(apps):
        """Return True if the OpenStack helm application is active.

        :param apps: iterable of application objects exposing ``name`` and
            ``active`` attributes.
        """
        return any(app.name == sysinv_constants.HELM_APP_OPENSTACK and
                   app.active for app in apps)

    def periodic_subcloud_audit(self):
        """Audit availability of subclouds."""
        # Blanket catch all exceptions in the audit so that the audit
        # does not die.
        try:
            self._periodic_subcloud_audit_loop()
        except Exception as e:
            LOG.exception(e)

    def _periodic_subcloud_audit_loop(self):
        """Run one audit pass over all subclouds.

        Starts one worker per subcloud whose deploy status is done,
        deploying or deploy-failed, then waits for every worker.  Once
        ``audit_count`` reaches SUBCLOUD_STATE_UPDATE_ITERATIONS the
        workers are asked to push the subcloud state to dcorch even when
        nothing changed, and the counter is reset afterwards.
        """
        # We will be running in our own green thread here.
        LOG.info('Triggered subcloud audit.')
        self.audit_count += 1

        # Determine whether to trigger a state update to each subcloud
        update_subcloud_state = (
            self.audit_count >= SUBCLOUD_STATE_UPDATE_ITERATIONS)

        # Determine whether OpenStack is installed in central cloud
        os_client = OpenStackDriver(region_name=consts.DEFAULT_REGION_NAME,
                                    region_clients=None)
        sysinv_client = SysinvClient(consts.DEFAULT_REGION_NAME,
                                     os_client.keystone_client.session)
        # This could be optimized in the future by attempting to get just the
        # one application. However, sysinv currently treats this as a failure
        # if the application is not installed and generates warning logs, so it
        # would require changes to handle this gracefully.
        openstack_installed = self._openstack_app_active(
            sysinv_client.get_applications())

        audited_deploy_states = (consts.DEPLOY_STATE_DONE,
                                 consts.DEPLOY_STATE_DEPLOYING,
                                 consts.DEPLOY_STATE_DEPLOY_FAILED)
        for subcloud in db_api.subcloud_get_all(self.context):
            if subcloud.deploy_status not in audited_deploy_states:
                LOG.debug("Skip subcloud %s audit, deploy_status: %s",
                          subcloud.name, subcloud.deploy_status)
                continue
            # Create a new greenthread for each subcloud to allow the audits
            # to be done in parallel. If there are not enough greenthreads
            # in the pool, this will block until one becomes available.
            self.subcloud_workers[subcloud.name] = \
                self.thread_group_manager.start(self._audit_subcloud,
                                                subcloud.name,
                                                update_subcloud_state,
                                                openstack_installed)

        # Wait for all greenthreads to complete
        LOG.info('Waiting for subcloud audits to complete.')
        try:
            for thread in self.subcloud_workers.values():
                # A failure in one worker must not prevent us from waiting
                # on the remaining workers.
                try:
                    thread.wait()
                except Exception as e:
                    LOG.exception(e)
        finally:
            # Clear the list of workers before next audit, even on error,
            # so stale workers are never waited on again.
            self.subcloud_workers = {}

        if update_subcloud_state:
            # The periodic state update has been dispatched; start counting
            # towards the next one.
            self.audit_count = 0
        LOG.info('All subcloud audits have completed.')

    def _audit_subcloud(self, subcloud_name, update_subcloud_state,
                        audit_openstack):
        """Audit a single subcloud.

        :param subcloud_name: name of the subcloud (also used as its
            region name when creating clients).
        :param update_subcloud_state: when True and nothing else changed,
            still send the current state of the subcloud to dcorch.
        :param audit_openstack: when True, also audit whether the
            OpenStack application is active in the subcloud and
            add/remove the related sync endpoint types accordingly.
        """
        # Retrieve the subcloud
        try:
            subcloud = db_api.subcloud_get_by_name(self.context, subcloud_name)
        except exceptions.SubcloudNotFound:
            # Possibility subcloud could have been deleted since the list of
            # subclouds to audit was created.
            LOG.info('Ignoring SubcloudNotFound when auditing subcloud %s',
                     subcloud_name)
            return

        # For each subcloud, if at least one service is active in
        # each service of servicegroup-list then declare the subcloud online.
        subcloud_id = subcloud.id
        avail_status_current = subcloud.availability_status
        audit_fail_count = subcloud.audit_fail_count

        # Set defaults to None and disabled so we will still set disabled
        # status if we encounter an error.
        sysinv_client = None
        svc_groups = None
        avail_to_set = consts.AVAILABILITY_OFFLINE

        try:
            os_client = OpenStackDriver(region_name=subcloud_name,
                                        region_clients=None)
            sysinv_client = SysinvClient(subcloud_name,
                                         os_client.keystone_client.session)
        except (keystone_exceptions.EndpointNotFound,
                keystone_exceptions.ConnectFailure,
                keystone_exceptions.ConnectTimeout,
                IndexError):
            if avail_status_current == consts.AVAILABILITY_OFFLINE:
                LOG.info("Identity or Platform endpoint for %s not "
                         "found, ignoring for offline "
                         "subcloud.", subcloud_name)
                return
            else:
                # The subcloud will be marked as offline below.
                LOG.error("Identity or Platform endpoint for online "
                          "subcloud: %s not found.", subcloud_name)
        except Exception as e:
            # Any other failure leaves sysinv_client as None, so the
            # subcloud is treated as offline below.
            LOG.exception(e)

        if sysinv_client:
            # get a list of service groups in the subcloud
            try:
                svc_groups = sysinv_client.get_service_groups()
            except Exception as e:
                svc_groups = None
                LOG.warning('Cannot retrieve service groups for '
                            'subcloud: %s, %s', subcloud_name, e)

        if svc_groups:
            active_sgs = set()
            inactive_sgs = []

            # Collect the names of active service groups and, separately,
            # the names of the non-active ones.
            for sg in svc_groups:
                if sg.state != consts.SERVICE_GROUP_STATUS_ACTIVE:
                    inactive_sgs.append(sg.service_group_name)
                else:
                    active_sgs.add(sg.service_group_name)

            # Create a list of service groups that are only present
            # in non-active list
            inactive_only = [name for name in inactive_sgs
                             if name not in active_sgs]

            # An empty inactive only list and a non-empty active list
            # means we're good to go.
            if not inactive_only and active_sgs:
                avail_to_set = consts.AVAILABILITY_ONLINE
            else:
                LOG.info("Subcloud:%s has non-active "
                         "service groups: %s",
                         subcloud_name, inactive_only)

        if avail_to_set == consts.AVAILABILITY_OFFLINE:
            if audit_fail_count < consts.AVAIL_FAIL_COUNT_MAX:
                audit_fail_count = audit_fail_count + 1

            if (avail_status_current == consts.AVAILABILITY_ONLINE) and \
                    (audit_fail_count < consts.AVAIL_FAIL_COUNT_TO_ALARM):
                # Do not set offline until we have failed audit
                # the requisite number of times
                avail_to_set = consts.AVAILABILITY_ONLINE
        else:
            # In the case of a one off blip, we may need to set the
            # fail count back to 0
            audit_fail_count = 0

        if avail_to_set != avail_status_current:

            if avail_to_set == consts.AVAILABILITY_ONLINE:
                audit_fail_count = 0

            LOG.info('Setting new availability status: %s '
                     'on subcloud: %s',
                     avail_to_set, subcloud_name)

            entity_instance_id = "subcloud=%s" % subcloud_name
            fault = self.fm_api.get_fault(
                fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
                entity_instance_id)

            if fault and (avail_to_set == consts.AVAILABILITY_ONLINE):
                # Back online: clear the existing offline alarm.
                try:
                    self.fm_api.clear_fault(
                        fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
                        entity_instance_id)
                except Exception as e:
                    LOG.exception(e)

            elif not fault and \
                    (avail_to_set == consts.AVAILABILITY_OFFLINE):
                # Going offline and no alarm is raised yet: raise it.
                try:
                    fault = fm_api.Fault(
                        alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
                        alarm_state=fm_const.FM_ALARM_STATE_SET,
                        entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD,
                        entity_instance_id=entity_instance_id,
                        severity=fm_const.FM_ALARM_SEVERITY_CRITICAL,
                        reason_text=('%s is offline' % subcloud_name),
                        alarm_type=fm_const.FM_ALARM_TYPE_0,
                        probable_cause=fm_const.ALARM_PROBABLE_CAUSE_29,
                        proposed_repair_action="Wait for subcloud to "
                                               "become online; if "
                                               "problem persists contact "
                                               "next level of support.",
                        service_affecting=True)

                    self.fm_api.set_fault(fault)

                except Exception as e:
                    LOG.exception(e)

            try:
                updated_subcloud = db_api.subcloud_update(
                    self.context,
                    subcloud_id,
                    management_state=None,
                    availability_status=avail_to_set,
                    software_version=None,
                    description=None, location=None,
                    audit_fail_count=audit_fail_count)
            except exceptions.SubcloudNotFound:
                # slim possibility subcloud could have been deleted since
                # we found it in db, ignore this benign error.
                LOG.info('Ignoring SubcloudNotFound when attempting state'
                         ' update: %s', subcloud_name)
                return

            try:
                self.dcorch_rpc_client.\
                    update_subcloud_states(self.context,
                                           subcloud_name,
                                           updated_subcloud.management_state,
                                           avail_to_set)

                LOG.info('Notifying dcorch, subcloud:%s management: %s, '
                         'availability:%s',
                         subcloud_name,
                         updated_subcloud.management_state,
                         avail_to_set)
            except Exception as e:
                LOG.exception(e)
                LOG.warning('Problem informing dcorch of subcloud '
                            'state change, subcloud: %s', subcloud_name)

            if avail_to_set == consts.AVAILABILITY_OFFLINE:
                # Subcloud is going offline, set all endpoint statuses to
                # unknown.
                try:
                    self.subcloud_manager.update_subcloud_endpoint_status(
                        self.context,
                        subcloud_name=subcloud_name,
                        endpoint_type=None,
                        sync_status=consts.SYNC_STATUS_UNKNOWN)
                except exceptions.SubcloudNotFound:
                    LOG.info('Ignoring SubcloudNotFound when attempting '
                             'sync_status update: %s', subcloud_name)
                    return

        elif audit_fail_count != subcloud.audit_fail_count:
            # Availability is unchanged but the failure counter moved, so
            # only the counter is persisted.
            try:
                db_api.subcloud_update(self.context, subcloud_id,
                                       management_state=None,
                                       availability_status=None,
                                       software_version=None,
                                       description=None, location=None,
                                       audit_fail_count=audit_fail_count)
            except exceptions.SubcloudNotFound:
                # slim possibility subcloud could have been deleted since
                # we found it in db, ignore this benign error.
                LOG.info('Ignoring SubcloudNotFound when attempting '
                         'audit_fail_count update: %s', subcloud_name)
                return

        elif update_subcloud_state:
            # Nothing has changed, but we want to send a state update for this
            # subcloud as an audit. Get the most up-to-date data.
            try:
                subcloud = db_api.subcloud_get_by_name(self.context,
                                                       subcloud_name)
            except exceptions.SubcloudNotFound:
                # The subcloud could have been deleted while it was being
                # audited, ignore this benign error.
                LOG.info('Ignoring SubcloudNotFound when attempting '
                         'periodic state update: %s', subcloud_name)
                return

            try:
                self.dcorch_rpc_client. \
                    update_subcloud_states(self.context,
                                           subcloud_name,
                                           subcloud.management_state,
                                           subcloud.availability_status)
            except Exception as e:
                LOG.exception(e)
                LOG.warning('Problem informing dcorch of subcloud '
                            'state change, subcloud: %s', subcloud_name)

        if audit_openstack and sysinv_client:
            # get a list of installed apps in the subcloud
            try:
                apps = sysinv_client.get_applications()
            except Exception as e:
                LOG.warning('Cannot retrieve installed apps for '
                            'subcloud:%s, %s', subcloud_name, e)
                return

            openstack_installed = subcloud.openstack_installed
            # True when the audit finds the openstack app installed and
            # active in the subcloud.
            openstack_installed_current = self._openstack_app_active(apps)

            dcm_update_func = None
            dco_update_func = None
            if openstack_installed_current and not openstack_installed:
                dcm_update_func = db_api.subcloud_status_create
                # TODO(andy.ning): This RPC will block for the duration of the
                # initial sync. It needs to be made non-blocking.
                dco_update_func = self.dcorch_rpc_client.\
                    add_subcloud_sync_endpoint_type
            elif not openstack_installed_current and openstack_installed:
                dcm_update_func = db_api.subcloud_status_delete
                dco_update_func = self.dcorch_rpc_client.\
                    remove_subcloud_sync_endpoint_type

            if dcm_update_func and dco_update_func:
                endpoint_type_list = dccommon_consts.ENDPOINT_TYPES_LIST_OS
                try:
                    # Notify dcorch to add/remove sync endpoint type list
                    dco_update_func(self.context, subcloud_name,
                                    endpoint_type_list)
                    LOG.info('Notifying dcorch, subcloud: %s new sync'
                             ' endpoint: %s',
                             subcloud_name, endpoint_type_list)

                    # Update subcloud status table by adding/removing
                    # openstack sync endpoint types.
                    for endpoint_type in endpoint_type_list:
                        dcm_update_func(self.context, subcloud_id,
                                        endpoint_type)
                    # Update openstack_installed of subcloud table
                    db_api.subcloud_update(
                        self.context, subcloud_id,
                        openstack_installed=openstack_installed_current)
                except exceptions.SubcloudNotFound:
                    LOG.info('Ignoring SubcloudNotFound when attempting'
                             ' openstack_installed update: %s',
                             subcloud_name)
                except Exception as e:
                    LOG.exception(e)
                    LOG.warning('Problem informing dcorch of subcloud '
                                'sync endpoint type change, subcloud: %s',
                                subcloud_name)