452 lines
17 KiB
Python
Executable File
452 lines
17 KiB
Python
Executable File
#
|
|
# Copyright (c) 2018-2020 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
from eventlet.green import subprocess
|
|
import os
|
|
|
|
|
|
from fm_api import fm_api
|
|
|
|
from oslo_log import log
|
|
from sysinv._i18n import _
|
|
from sysinv.common import ceph
|
|
from sysinv.common import constants
|
|
from sysinv.common import kubernetes
|
|
from sysinv.common import utils
|
|
from sysinv.common.fm import fmclient
|
|
from sysinv.common.storage_backend_conf import StorageBackendConfig
|
|
from sysinv.api.controllers.v1 import patch_api
|
|
from sysinv.api.controllers.v1 import vim_api
|
|
|
|
import cgcs_patch.constants as patch_constants
|
|
|
|
LOG = log.getLogger(__name__)
|
|
|
|
|
|
class Health(object):
    """Performs system health checks for maintenance operations.

    Aggregates individual checks (host state, patching, alarms, ceph,
    kubernetes) into pass/fail reports used before upgrades and other
    service-affecting actions.
    """

    # Markers used when rendering each check's pass/fail status in reports.
    SUCCESS_MSG = _('OK')
    FAIL_MSG = _('Fail')

    def __init__(self, dbapi):
        """Initialize the health checker.

        :param dbapi: sysinv database API handle used to query system,
                      host, load and application state.
        """
        self._dbapi = dbapi
        # Helpers for querying ceph cluster status and kubernetes state.
        self._ceph = ceph.CephApiOperator()
        self._kube_operator = kubernetes.KubeOperator()
|
|
|
|
def _check_hosts_provisioned(self, hosts):
    """Split hosts into provisioned ones and a count of the rest.

    A host counts as provisioned only when its invprovision state is
    PROVISIONED and it has a hostname assigned.

    :param hosts: iterable of host records.
    :returns: tuple of (number of unprovisioned hosts,
              list of provisioned host records).
    """
    ready = []
    skipped = 0
    for host in hosts:
        if (host['hostname'] is not None and
                host['invprovision'] == constants.PROVISIONED):
            ready.append(host)
        else:
            skipped += 1

    return skipped, ready
|
|
|
|
def _check_hosts_enabled(self, hosts):
    """Report hosts that are not both unlocked and enabled.

    :param hosts: iterable of host records.
    :returns: tuple of (True when every host passes,
              list of failing hostnames).
    """
    problem_hosts = [
        host.hostname for host in hosts
        if not (host['administrative'] == constants.ADMIN_UNLOCKED and
                host['operational'] == constants.OPERATIONAL_ENABLED)
    ]

    return not problem_hosts, problem_hosts
|
|
|
|
def _check_hosts_config(self, hosts):
|
|
"""Checks that the applied and target config match for each host"""
|
|
config_host_list = []
|
|
for host in hosts:
|
|
if (host.config_target and
|
|
host.config_applied != host.config_target):
|
|
config_host_list.append(host.hostname)
|
|
|
|
success = not config_host_list
|
|
return success, config_host_list
|
|
|
|
def _check_patch_current(self, hosts):
    """Checks that each host known to sysinv is patch current.

    :param hosts: iterable of sysinv host records.
    :returns: tuple of (success, hosts that are not patch current,
              sysinv hosts missing from the patching data).
    """
    system = self._dbapi.isystem_get_one()
    response = patch_api.patch_query_hosts(
        token=None, timeout=60, region_name=system.region_name)

    # Start with every sysinv hostname; each one found in the patching
    # data is crossed off, so leftovers have no patch data at all.
    remaining = [host['hostname'] for host in hosts]
    behind = []
    for patch_host in response['data']:
        name = patch_host['hostname']
        # The patching db may still list recently deleted hosts; skip any
        # host known to patching but not to sysinv.
        if name not in remaining:
            LOG.info('Host %s found in patching but not in sysinv. '
                     'Continuing' % name)
            continue
        remaining.remove(name)
        if not patch_host['patch_current']:
            behind.append(name)

    success = not behind and not remaining
    return success, behind, remaining
|
|
|
|
def _check_alarms(self, context, force=False):
    """Checks that no alarms are active.

    :param context: request context used to talk to FM.
    :param force: when True, only management-affecting alarms fail the
                  check; otherwise any alarm fails it.
    :returns: tuple of (success, count of non-affecting alarms,
              count of management-affecting alarms).
    """
    db_alarms = fmclient(context).alarm.list(include_suppress=True)

    allowed = 0
    affecting = 0
    for db_alarm in db_alarms:
        # Alarm rows may arrive as (alarm, ..., mgmt_affecting) tuples or
        # as alarm objects carrying the attribute directly.
        if isinstance(db_alarm, tuple):
            alarm = db_alarm[0]
            mgmt_affecting = db_alarm[constants.DB_MGMT_AFFECTING]
        else:
            alarm = db_alarm
            mgmt_affecting = db_alarm.mgmt_affecting
        if fm_api.FaultAPIs.alarm_allowed(alarm.severity, mgmt_affecting):
            allowed += 1
        else:
            affecting += 1

    # Only fail on alarms past their affecting threshold when forced;
    # otherwise any alarm at all fails the check.
    success = (affecting == 0) if force else (allowed + affecting == 0)
    return success, allowed, affecting
|
|
|
|
def get_alarms_degrade(self, context, alarm_ignore_list=None,
                       entity_instance_id_filter=""):
    """Return all the alarm ids that cause a degrade.

    :param context: request context used to talk to FM.
    :param alarm_ignore_list: alarm ids to exclude from the result.
    :param entity_instance_id_filter: substring that must appear in the
           alarm's entity_instance_id for it to be considered.
    :returns: list of unique degrade-affecting alarm ids.
    """
    ignored = alarm_ignore_list if alarm_ignore_list is not None else []
    degrade_alarms = []

    for db_alarm in fmclient(context).alarm.list(include_suppress=True):
        # Alarm rows may be (alarm, ...) tuples or alarm objects.
        if isinstance(db_alarm, tuple):
            alarm = db_alarm[0]
            degrade_affecting = db_alarm[constants.DB_DEGRADE_AFFECTING]
        else:
            alarm = db_alarm
            degrade_affecting = db_alarm.degrade_affecting
        # NOTE: degrade_affecting is stored as the string 'True'/'False'.
        if degrade_affecting != 'True':
            continue
        # Skip ignored ids, apply the entity-instance filter, and only
        # record each alarm id once even when it occurs multiple times.
        if (entity_instance_id_filter in alarm.entity_instance_id and
                alarm.alarm_id not in ignored and
                alarm.alarm_id not in degrade_alarms):
            degrade_alarms.append(alarm.alarm_id)

    return degrade_alarms
|
|
|
|
def _check_ceph(self):
    """Checks the ceph health status.

    :returns: result of CephApiOperator.ceph_status_ok() — truthy when
              the ceph cluster reports a healthy status.
    """
    return self._ceph.ceph_status_ok()
|
|
|
|
def _check_license(self, version):
    """Validates the current license is valid for the specified version.

    Delegates to the sm-license-check binary; its exit status decides
    the result.

    :param version: software version to validate the license against.
    :returns: True when the license check passes, False otherwise.
    """
    check_binary = "/usr/bin/sm-license-check"
    license_file = '/etc/platform/.license'
    system = self._dbapi.isystem_get_one()

    # Discard the tool's output; only the exit status matters.
    with open(os.devnull, "w") as fnull:
        try:
            subprocess.check_call(
                [check_binary, license_file, version,
                 system.system_type, system.system_mode],
                stdout=fnull, stderr=fnull)
        except subprocess.CalledProcessError:
            return False

    return True
|
|
|
|
def _check_required_patches(self, patch_list):
    """Validates that each patch provided is applied on the system.

    :param patch_list: patch ids that must be applied or committed.
    :returns: tuple of (True when none are missing,
              list of missing patch ids).
    """
    system = self._dbapi.isystem_get_one()
    response = patch_api.patch_query(token=None, timeout=60,
                                     region_name=system.region_name)
    query_patches = response['pd']

    # A patch satisfies the requirement when applied or committed.
    applied = set()
    for patch_id, patch in query_patches.items():
        if patch.get('patchstate', None) in (patch_constants.APPLIED,
                                             patch_constants.COMMITTED):
            applied.add(patch_id)

    missing_patches = [p for p in patch_list if p not in applied]
    return not missing_patches, missing_patches
|
|
|
|
def _check_running_instances(self, host):
    """Checks that no instances are running on the host.

    :param host: host record with 'uuid' and 'hostname' keys.
    :returns: tuple of (True when no instances are running,
              the instance count reported by VIM).
    """
    vim_resp = vim_api.vim_host_get_instances(
        None,
        host['uuid'],
        host['hostname'],
        constants.VIM_DEFAULT_TIMEOUT_IN_SECS)
    count = vim_resp['instances']

    return count == 0, count
|
|
|
|
def _check_simplex_available_space(self):
    """Ensures there is free space for the backup.

    The actual size check was removed pending an Ansible-based
    replacement; until that lands the check is skipped and must be
    treated as passing.

    :returns: True always (check currently skipped).
    """
    # TODO: Switch this over to use Ansible
    LOG.info("Skip the check of the enough free space.")
    # Bug fix: this method previously fell through and returned None,
    # but the caller uses the return value as the check's success flag
    # — a skipped check must count as a pass, not a failure.
    return True
|
|
|
|
def _check_kube_nodes_ready(self):
|
|
"""Checks that each kubernetes node is ready"""
|
|
fail_node_list = []
|
|
|
|
nodes = self._kube_operator.kube_get_nodes()
|
|
for node in nodes:
|
|
for condition in node.status.conditions:
|
|
if condition.type == "Ready" and condition.status != "True":
|
|
# This node is not ready
|
|
fail_node_list.append(node.metadata.name)
|
|
|
|
success = not fail_node_list
|
|
return success, fail_node_list
|
|
|
|
def _check_kube_control_plane_pods(self):
|
|
"""Checks that each kubernetes control plane pod is ready"""
|
|
fail_pod_list = []
|
|
|
|
pod_ready_status = self._kube_operator.\
|
|
kube_get_control_plane_pod_ready_status()
|
|
|
|
for pod_name, ready_status in pod_ready_status.items():
|
|
if ready_status != "True":
|
|
# This pod is not ready
|
|
fail_pod_list.append(pod_name)
|
|
|
|
success = not fail_pod_list
|
|
return success, fail_pod_list
|
|
|
|
def _check_kube_applications(self):
    """Checks that each kubernetes application is in a valid state.

    :returns: tuple of (True when all applications are valid,
              list of application names in other states).
    """
    # The following states are valid during kubernetes upgrade.
    valid_states = (constants.APP_UPLOAD_SUCCESS,
                    constants.APP_APPLY_SUCCESS,
                    constants.APP_INACTIVE_STATE)

    fail_app_list = [app.name for app in self._dbapi.kube_app_get_all()
                     if app.status not in valid_states]

    return not fail_app_list, fail_app_list
|
|
|
|
def get_system_health(self, context, force=False):
    """Returns the general health of the system

    Checks the following:
    - All hosts are provisioned
    - All hosts are patch current
    - All hosts are unlocked/enabled
    - All hosts having matching configs
    - No management affecting alarms
    - For ceph systems: The storage cluster is healthy
    - All kubernetes nodes are ready
    - All kubernetes control plane pods are ready

    :param context: request context for the FM alarm query.
    :param force: passed through to the alarm check; management
                  affecting alarms still fail when forced.
    :returns: tuple of (overall health boolean, report text).
    """
    def state(ok):
        # Render the pass/fail marker for one report line.
        return Health.SUCCESS_MSG if ok else Health.FAIL_MSG

    hosts = self._dbapi.ihost_get_list()
    output = _('System Health:\n')
    health_ok = True

    unprovisioned_hosts, provisioned_hosts = \
        self._check_hosts_provisioned(hosts)
    success = unprovisioned_hosts == 0
    output += _('All hosts are provisioned: [%s]\n') % state(success)
    if not success:
        output += _('%s Unprovisioned hosts\n') % unprovisioned_hosts
        # Restrict the remaining checks to the provisioned hosts so
        # they can still run.
        hosts = provisioned_hosts
    health_ok = health_ok and success

    success, error_hosts = self._check_hosts_enabled(hosts)
    output += _('All hosts are unlocked/enabled: [%s]\n') % state(success)
    if not success:
        output += _('Locked or disabled hosts: %s\n') \
            % ', '.join(error_hosts)
    health_ok = health_ok and success

    success, error_hosts = self._check_hosts_config(hosts)
    output += _('All hosts have current configurations: [%s]\n') \
        % state(success)
    if not success:
        output += _('Hosts with out of date configurations: %s\n') \
            % ', '.join(error_hosts)
    health_ok = health_ok and success

    success, error_hosts, missing_hosts = self._check_patch_current(hosts)
    output += _('All hosts are patch current: [%s]\n') % state(success)
    if not success:
        if error_hosts:
            output += _('Hosts not patch current: %s\n') \
                % ', '.join(error_hosts)
        if missing_hosts:
            output += _('Hosts without patch data: %s\n') \
                % ', '.join(missing_hosts)
    health_ok = health_ok and success

    # The ceph check only applies when a ceph backend is configured.
    if StorageBackendConfig.has_backend(self._dbapi,
                                        constants.CINDER_BACKEND_CEPH):
        success = self._check_ceph()
        output += _('Ceph Storage Healthy: [%s]\n') % state(success)
        health_ok = health_ok and success

    success, allowed, affecting = self._check_alarms(context, force)
    output += _('No alarms: [%s]\n') % state(success)
    if not success:
        output += _('[%s] alarms found, [%s] of which are management '
                    'affecting\n') % (allowed + affecting, affecting)
    health_ok = health_ok and success

    success, error_nodes = self._check_kube_nodes_ready()
    output += _('All kubernetes nodes are ready: [%s]\n') % state(success)
    if not success:
        output += _('Kubernetes nodes not ready: %s\n') \
            % ', '.join(error_nodes)
    health_ok = health_ok and success

    success, error_nodes = self._check_kube_control_plane_pods()
    output += _('All kubernetes control plane pods are ready: [%s]\n') \
        % state(success)
    if not success:
        output += _('Kubernetes control plane pods not ready: %s\n') \
            % ', '.join(error_nodes)
    health_ok = health_ok and success

    return health_ok, output
|
|
|
|
def get_system_health_upgrade(self, context, force=False):
    """Ensures the system is in a valid state for an upgrade.

    Does a general health check then verifies:
    - a load has been imported
    - the imported load's patch requirements are met
    - the license is valid for the N+1 load
    - (non-simplex CPE) no instances are running on controller-1
    - (simplex) there is free space for the backup

    :returns: tuple of (overall health boolean, report text).
    """
    def state(ok):
        # Render the pass/fail marker for one report line.
        return Health.SUCCESS_MSG if ok else Health.FAIL_MSG

    system_mode = self._dbapi.isystem_get_one().system_mode
    simplex = (system_mode == constants.SYSTEM_MODE_SIMPLEX)

    health_ok, output = self.get_system_health(context, force)
    loads = self._dbapi.load_get_list()
    try:
        imported_load = utils.get_imported_load(loads)
    except Exception as e:
        # Without an imported load none of the upgrade-specific
        # checks can run; report what we have so far.
        LOG.exception(e)
        output += _('No imported load found. Unable to test further\n')
        return health_ok, output

    upgrade_version = imported_load.software_version
    patches = (imported_load.required_patches.split('\n')
               if imported_load.required_patches else [])

    success, missing_patches = self._check_required_patches(patches)
    output += _('Required patches are applied: [%s]\n') % state(success)
    if not success:
        output += _('Patches not applied: %s\n') \
            % ', '.join(missing_patches)
    health_ok = health_ok and success

    success = self._check_license(upgrade_version)
    output += _('License valid for upgrade: [%s]\n') % state(success)
    health_ok = health_ok and success

    if not simplex:
        controller_1 = self._dbapi.ihost_get_by_hostname(
            constants.CONTROLLER_1_HOSTNAME)

        # On CPE (combined controller/worker) systems no instances may
        # be running on controller-1 before the upgrade starts,
        # otherwise the databases will be out of sync after
        # controller-1 is locked.
        if constants.WORKER in controller_1.subfunctions:
            success, running_instances = self._check_running_instances(
                controller_1)
            output += _('No instances running on controller-1: [%s]\n') \
                % state(success)
            if not success:
                output += _('Number of instances on controller-1: %s\n') \
                    % (running_instances)
            health_ok = health_ok and success
    else:
        success = self._check_simplex_available_space()
        output += _('Sufficient free space for upgrade: [%s]\n') \
            % state(success)
        health_ok = health_ok and success

    return health_ok, output
|
|
|
|
def get_system_health_kube_upgrade(self, context, force=False):
    """Ensures the system is in a valid state for a kubernetes upgrade

    Does a general health check then does the following:
    - All kubernetes applications are in a stable state

    :returns: tuple of (overall health boolean, report text).
    """
    health_ok, output = self.get_system_health(context, force)

    success, apps_not_valid = self._check_kube_applications()
    marker = Health.SUCCESS_MSG if success else Health.FAIL_MSG
    output += _(
        'All kubernetes applications are in a valid state: [%s]\n') \
        % marker
    if not success:
        output += _('Kubernetes applications not in a valid state: %s\n') \
            % ', '.join(apps_not_valid)

    return health_ok and success, output
|