# distcloud/dcmanager/manager/patch_audit_manager.py
#
# (Repository viewer metadata: 249 lines, 10 KiB, Python)

# Copyright 2017 Ericsson AB.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Copyright (c) 2017 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#
from keystoneauth1 import exceptions as keystone_exceptions
from oslo_config import cfg
from oslo_log import log as logging
from dcorch.common import consts as dcorch_consts
from dcorch.drivers.openstack.keystone_v3 import KeystoneClient
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common.i18n import _
from dcmanager.common import manager
from dcmanager.db import api as db_api
from dcmanager.drivers.openstack import patching_v1
from dcmanager.drivers.openstack.patching_v1 import PatchingClient
from dcmanager.drivers.openstack.sysinv_v1 import SysinvClient
# Module-level logger, named after this module.
LOG = logging.getLogger(__name__)

# Seconds that must accumulate between routine patch audits (five minutes).
DEFAULT_PATCH_AUDIT_DELAY_SECONDS = 5 * 60


class PatchAuditManager(manager.Manager):
    """Manages tasks related to patch audits.

    The audit compares the patches reported by each managed, online
    subcloud against the patches applied or committed in RegionOne and
    reports the patching endpoint of that subcloud as in-sync or
    out-of-sync through the subcloud manager.
    """

    # Used to force an audit on the next interval. Kept on the class so
    # that trigger_audit can set it without needing an instance.
    _force_audit = False

    def __init__(self, *args, **kwargs):
        """Initialize the audit manager.

        :param kwargs: must contain 'subcloud_manager', the object whose
            update_subcloud_endpoint_status method receives audit results.
        :raises KeyError: if 'subcloud_manager' is not supplied.
        """
        LOG.info(_('PatchAuditManager initialization...'))

        super(PatchAuditManager, self).__init__(
            service_name="patch_audit_manager")
        self.context = context.get_admin_context()
        self.subcloud_manager = kwargs['subcloud_manager']
        # Start the counter just short of the audit delay so that the first
        # audit runs shortly after start-up instead of a full delay later.
        self.wait_time_passed = DEFAULT_PATCH_AUDIT_DELAY_SECONDS - 25

    @classmethod
    def trigger_audit(cls):
        """Trigger audit at next interval.

        This can be called from outside the audit greenthread.
        """
        cls._force_audit = True

    def periodic_patch_audit(self):
        """Audit patch status of subclouds.

        Audit normally happens every DEFAULT_PATCH_AUDIT_DELAY_SECONDS, but
        can be forced to happen on the next audit interval by calling
        trigger_audit.
        """
        do_audit = False

        if PatchAuditManager._force_audit:
            # Audit has been triggered.
            do_audit = True
        else:
            # This won't be super accurate as we aren't woken up after exactly
            # the interval seconds, but it is good enough for an audit.
            self.wait_time_passed += cfg.CONF.scheduler.patch_audit_interval
            if self.wait_time_passed >= DEFAULT_PATCH_AUDIT_DELAY_SECONDS:
                do_audit = True

        if do_audit:
            # Reset both triggers before auditing, so a trigger_audit call
            # made while the audit is running is honoured next interval.
            self.wait_time_passed = 0
            PatchAuditManager._force_audit = False
            # Blanket catch all exceptions in the audit so that the audit
            # does not die.
            try:
                self._periodic_patch_audit_loop()
            except Exception as e:
                LOG.exception(e)

    @staticmethod
    def _patches_out_of_sync(subcloud_name, subcloud_patches,
                             installed_loads, regionone_patches,
                             applied_patch_ids, committed_patch_ids):
        """Return True if the subcloud's patches differ from RegionOne.

        :param subcloud_name: name used in log messages.
        :param subcloud_patches: mapping of patch id to patch data (with a
            'patchstate' entry) as returned by the subcloud patching query.
        :param installed_loads: software versions present in the subcloud.
        :param regionone_patches: mapping of patch id to patch data (with a
            'sw_version' entry) as returned by the RegionOne patching query.
        :param applied_patch_ids: patch ids applied in the RegionOne repo.
        :param committed_patch_ids: patch ids committed in RegionOne.
        :returns: True when a sync is required, False otherwise.
        """
        out_of_sync = False

        # Check that all patches in this subcloud are in the correct
        # state, based on the state of the patch in RegionOne. For the
        # subcloud, we use the patchstate because we care whether the
        # patch is installed on the hosts.
        for patch_id in subcloud_patches:
            patch_state = subcloud_patches[patch_id]['patchstate']
            if patch_state == patching_v1.PATCH_STATE_APPLIED:
                if patch_id not in applied_patch_ids:
                    if patch_id not in committed_patch_ids:
                        LOG.debug("Patch %s should not be applied in %s",
                                  patch_id, subcloud_name)
                    else:
                        LOG.debug("Patch %s should be committed in %s",
                                  patch_id, subcloud_name)
                    out_of_sync = True
            elif patch_state == patching_v1.PATCH_STATE_COMMITTED:
                if patch_id not in committed_patch_ids:
                    LOG.warning("Patch %s should not be committed in %s",
                                patch_id, subcloud_name)
                    out_of_sync = True
            else:
                # In steady state, all patches should either be applied
                # or committed in each subcloud. Patches in other
                # states mean a sync is required.
                out_of_sync = True

        # Check that all applied or committed patches in RegionOne are
        # present in the subcloud. Only patches for a load that is
        # installed in the subcloud are expected to be there.
        for patch_id in applied_patch_ids + committed_patch_ids:
            sw_version = regionone_patches[patch_id]['sw_version']
            if (sw_version in installed_loads and
                    patch_id not in subcloud_patches):
                LOG.debug("Patch %s missing from %s",
                          patch_id, subcloud_name)
                out_of_sync = True

        return out_of_sync

    def _periodic_patch_audit_loop(self):
        """Audit patch status of subclouds loop.

        Queries RegionOne for the target patch set, then checks every
        managed, online subcloud against it and records the result via
        subcloud_manager.update_subcloud_endpoint_status. A subcloud whose
        clients cannot be created or queried is skipped for this pass.
        Errors from the RegionOne query are not handled here.
        """
        # We are running in our own green thread here.
        LOG.info('Triggered patch audit.')

        try:
            ks_client = KeystoneClient()
        except Exception:
            LOG.warning('Failure initializing KeystoneClient, exiting audit.')
            return

        # First query RegionOne to determine what patches should be applied
        # to the system.
        patching_client = PatchingClient(
            consts.DEFAULT_REGION_NAME, ks_client.session)
        regionone_patches = patching_client.query()
        LOG.debug("regionone_patches: %s", regionone_patches)

        # Build lists of patches that should be applied or committed in all
        # subclouds, based on their state in RegionOne. Check repostate
        # (not patchstate) as we only care if the patch has been applied to
        # the repo (not whether it is installed on the hosts).
        applied_patch_ids = list()
        committed_patch_ids = list()
        for patch_id in regionone_patches:
            repo_state = regionone_patches[patch_id]['repostate']
            if repo_state == patching_v1.PATCH_STATE_APPLIED:
                applied_patch_ids.append(patch_id)
            elif repo_state == patching_v1.PATCH_STATE_COMMITTED:
                committed_patch_ids.append(patch_id)
        LOG.debug("RegionOne applied_patch_ids: %s", applied_patch_ids)
        LOG.debug("RegionOne committed_patch_ids: %s", committed_patch_ids)

        # For each subcloud, check whether the patches match the target.
        for subcloud in db_api.subcloud_get_all(self.context):
            # Only audit patching on subclouds that are managed and online
            if (subcloud.management_state != consts.MANAGEMENT_MANAGED or
                    subcloud.availability_status !=
                    consts.AVAILABILITY_ONLINE):
                continue

            try:
                sc_ks_client = KeystoneClient(subcloud.name)
            except (keystone_exceptions.EndpointNotFound, IndexError) as e:
                # The placeholders must be %s: a bare '%' makes the
                # formatting itself raise and abort the whole audit.
                LOG.warning("Identity endpoint for online subcloud %s "
                            "not found. %s", subcloud.name, e)
                continue

            try:
                patching_client = PatchingClient(subcloud.name,
                                                 sc_ks_client.session)
            except keystone_exceptions.EndpointNotFound:
                LOG.warning("Patching endpoint for online subcloud %s "
                            "not found.", subcloud.name)
                continue

            try:
                sysinv_client = SysinvClient(subcloud.name,
                                             sc_ks_client.session)
            except keystone_exceptions.EndpointNotFound:
                LOG.warning("Sysinv endpoint for online subcloud %s "
                            "not found.", subcloud.name)
                continue

            # Retrieve all the patches that are present in this subcloud.
            try:
                subcloud_patches = patching_client.query()
                LOG.debug("Patches for subcloud %s: %s",
                          subcloud.name, subcloud_patches)
            except Exception:
                LOG.warning('Cannot retrieve patches for subcloud: %s',
                            subcloud.name)
                continue

            # Determine which loads are present in this subcloud. During an
            # upgrade, there will be more than one load installed.
            try:
                loads = sysinv_client.get_loads()
            except Exception:
                LOG.warning('Cannot retrieve loads for subcloud: %s',
                            subcloud.name)
                continue
            installed_loads = [load.software_version for load in loads]

            out_of_sync = self._patches_out_of_sync(
                subcloud.name, subcloud_patches, installed_loads,
                regionone_patches, applied_patch_ids, committed_patch_ids)

            if out_of_sync:
                LOG.debug("Subcloud %s is out-of-sync for patching",
                          subcloud.name)
                sync_status = consts.SYNC_STATUS_OUT_OF_SYNC
            else:
                LOG.debug("Subcloud %s is in-sync for patching",
                          subcloud.name)
                sync_status = consts.SYNC_STATUS_IN_SYNC

            self.subcloud_manager.update_subcloud_endpoint_status(
                self.context,
                subcloud_name=subcloud.name,
                endpoint_type=dcorch_consts.ENDPOINT_TYPE_PATCHING,
                sync_status=sync_status)