Fix subclouds going offline due to auth failure

This update contains the following changes to prevent subclouds
from going offline due to authentication failure:
1. The os region client cache is cleared when a new keystone client
is created. The os region client will be re-created using the new
keystone session.
2. When the user's access info (such as a role id) changes, create a
new keystone client and new os region clients. This can happen after
system controller keystone role ids are synced to subclouds (see the
sketch after this list).
3. Remove get_admin_backup_session, which was only required when
upgrading to stx 4.0.
4. Increase AVAIL_FAIL_COUNT_TO_ALARM to 2, as we don't want to alarm
on the first failure; there are cases where we expect a transient
failure in the subcloud (e.g. a haproxy process restart to update
certificates).
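
Roughly, the intent of changes 1 and 2 is that whenever the keystone session
or the user's access info changes, the per-region client cache is invalidated
so the clients are rebuilt against the new session. Below is a minimal sketch
of that pattern; the class and method names (RegionClientCache,
set_keystone_client, is_token_valid) are hypothetical and simplified, not the
actual OpenStackDriver code.

import collections


class RegionClientCache(object):
    """Hypothetical sketch of the caching pattern described above.

    The real OpenStackDriver keeps similar per-region dictionaries; the
    names and structure here are simplified for illustration only.
    """

    def __init__(self):
        # region name -> dict of cached OS clients (sysinv, fm, ...)
        self.os_clients = collections.defaultdict(dict)
        # region name -> cached keystone token / access info
        self.identity_tokens = {}

    def set_keystone_client(self, region, keystone_client):
        # A new keystone client means a new session, so drop every cached
        # OS region client; they will be re-created against the new
        # session the next time they are requested (change 1).
        self.os_clients[region] = {'keystone': keystone_client}

    def is_token_valid(self, region, current_access_info):
        # If the access info behind the cached token changed (e.g. role ids
        # were re-synced from the system controller), invalidate the cache
        # and report the token as invalid so the caller builds a fresh
        # keystone client and OS region clients (change 2).
        if current_access_info != self.identity_tokens.get(region):
            self.identity_tokens[region] = None
            self.os_clients[region] = collections.defaultdict(dict)
            return False
        return True

A caller that gets False back from is_token_valid would then create a new
keystone client and call set_keystone_client, which is the flow the first two
hunks below implement.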

Tested on DC-6:
1. Adding 50 subclouds twice
2. Soaking the fix over the weekend

Closes-Bug: 1927007

Signed-off-by: Tao Liu <tao.liu@windriver.com>
Change-Id: I86fdc9a2f062409e704bdfac2119dc488123f7de
(cherry picked from commit 17b5505d9e)

@@ -83,6 +83,9 @@ class OpenStackDriver(object):
                 OpenStackDriver.update_region_clients(region_name,
                                                       KEYSTONE_CLIENT_NAME,
                                                       self.keystone_client)
+                # Clear client object cache
+                OpenStackDriver.os_clients_dict[region_name] = \
+                    collections.defaultdict(dict)
             except Exception as exception:
                 LOG.error('keystone_client region %s error: %s' %
                           (region_name, str(exception)))
@@ -185,14 +188,18 @@ class OpenStackDriver(object):
                     OpenStackDriver._identity_tokens[region_name],
                     include_catalog=False)
             if token != OpenStackDriver._identity_tokens[region_name]:
-                LOG.debug("%s: updating token %s to %s" %
+                LOG.debug("%s: AccessInfo changed %s to %s" %
                           (region_name,
                            OpenStackDriver._identity_tokens[region_name],
                            token))
-                OpenStackDriver._identity_tokens[region_name] = token
+                OpenStackDriver._identity_tokens[region_name] = None
+                OpenStackDriver.os_clients_dict[region_name] = \
+                    collections.defaultdict(dict)
+                return False
         except Exception as exception:
-            LOG.info('_is_token_valid handle: %s', str(exception))
+            LOG.info('_is_token_valid handle: region: %s error: %s',
+                     (region_name, str(exception)))
             # Reset the cached dictionary
             OpenStackDriver.os_clients_dict[region_name] = \
                 collections.defaultdict(dict)

@@ -23,7 +23,6 @@
 import collections
 import threading
-from keystoneauth1 import exceptions as keystone_exceptions
 from keystoneauth1 import loading
 from keystoneauth1 import session
@@ -107,10 +106,6 @@ class EndpointCache(object):
                 CONF.endpoint_cache.password,
                 CONF.endpoint_cache.project_name,
                 CONF.endpoint_cache.project_domain_name)
-            # check if the current session is valid and get an admin session
-            # if necessary
-            self.admin_session = EndpointCache.get_admin_backup_session(
-                self.admin_session, CONF.endpoint_cache.username, sc_auth_url)
             self.keystone_client = ks_client.Client(
                 session=self.admin_session,
@@ -140,33 +135,6 @@ class EndpointCache(object):
             auth=user_auth, additional_headers=consts.USER_HEADER,
             timeout=timeout)
 
-    @classmethod
-    def get_admin_backup_session(cls, admin_session, user_name, auth_url):
-        """Validate a session and open an admin session if it fails.
-
-        This method is require to handle an upgrade to stx 4.0 and it
-        can be removed in stx 5.0.
-        """
-        try:
-            admin_session.get_auth_headers()
-        except keystone_exceptions.Unauthorized:
-            # this will only happen briefly during an upgrade to stx 4.0
-            # just until the dcorch has synced the dcmanager user to each
-            # subcloud
-            LOG.info("Failed to authenticate user:%s, use %s user instead"
-                     % (user_name,
-                        CONF.cache.admin_username))
-            admin_session = EndpointCache.get_admin_session(
-                auth_url,
-                CONF.cache.admin_username,
-                CONF.cache.admin_user_domain_name,
-                CONF.cache.admin_password,
-                CONF.cache.admin_tenant,
-                CONF.cache.admin_project_domain_name)
-        return admin_session
-
     @staticmethod
     def _is_central_cloud(region_id):
         central_cloud_regions = [consts.CLOUD_0, consts.VIRTUAL_MASTER_CLOUD]

@@ -75,7 +75,11 @@ ENDPOINT_TYPE = "endpoint_type"
 SERVICE_GROUP_STATUS_ACTIVE = "active"
 
 # Availability fail count
-AVAIL_FAIL_COUNT_TO_ALARM = 1
+# we don't want to alarm first failure since there are
+# cases where we expect a transient failure in the
+# subcloud (e.g. haproxy process restart to update
+# certificates)
+AVAIL_FAIL_COUNT_TO_ALARM = 2
 AVAIL_FAIL_COUNT_MAX = 9999
 
 # Software update strategy types
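
The effect of raising the threshold is that a single transient audit failure
no longer raises the availability alarm or takes the subcloud offline; only a
second consecutive failure does. A rough sketch of that gating follows; the
helper name (handle_failed_audit) and the callback are hypothetical, not the
actual dcmanager audit manager code.

# Illustrative threshold; mirrors the constant changed in the hunk above.
AVAIL_FAIL_COUNT_TO_ALARM = 2


def handle_failed_audit(audit_fail_count, mark_offline_and_alarm):
    """Count consecutive audit failures and only act at the threshold.

    Hypothetical helper for illustration; mark_offline_and_alarm stands in
    for whatever marks the subcloud offline and raises the alarm.
    """
    audit_fail_count += 1
    if audit_fail_count >= AVAIL_FAIL_COUNT_TO_ALARM:
        # The first failure is tolerated (e.g. haproxy restarting to pick
        # up new certificates); a second consecutive failure is treated as
        # a real outage.
        mark_offline_and_alarm()
    return audit_fail_count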

@@ -583,8 +583,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
         audit_fail_count = 1
         self.fake_dcmanager_api.update_subcloud_availability.\
             assert_called_with(mock.ANY, subcloud.name,
-                               consts.AVAILABILITY_OFFLINE,
-                               False, audit_fail_count)
+                               None, False, audit_fail_count)
 
         # Update the DB like dcmanager would do.
         subcloud = db_api.subcloud_update(
@@ -605,23 +604,27 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
         audit_fail_count = audit_fail_count + 1
 
-        # Verify the subcloud availability didn't change, just the fail count
+        # Verify the subcloud goes offline
         self.fake_dcmanager_api.update_subcloud_availability.\
             assert_called_with(mock.ANY, subcloud.name,
                                None, False,
                                audit_fail_count)
 
-        # Verify alarm update is not called
-        self.fake_alarm_aggr.update_alarm_summary.assert_not_called()
+        # Verify alarm update is called only once
+        self.fake_alarm_aggr.update_alarm_summary.assert_called_once_with(
+            subcloud.name, self.fake_openstack_client.fm_client)
 
-        # Verify patch audit is not called
-        self.fake_patch_audit.subcloud_patch_audit.assert_not_called()
+        # Verify patch audit is called only once
+        self.fake_patch_audit.subcloud_patch_audit.assert_called_once_with(
+            subcloud.name, mock.ANY, True)
 
-        # Verify firmware audit is not called
-        self.fake_firmware_audit.subcloud_firmware_audit.assert_not_called()
+        # Verify firmware audit is called
+        self.fake_firmware_audit.subcloud_firmware_audit.assert_called_once_with(
+            subcloud.name, mock.ANY)
 
-        # Verify firmware audit is not called
-        self.fake_kubernetes_audit.subcloud_kubernetes_audit.assert_not_called()
+        # Verify firmware audit is called
+        self.fake_kubernetes_audit.subcloud_kubernetes_audit.assert_called_once_with(
+            subcloud.name, mock.ANY)
 
     def test_audit_subcloud_offline_no_change(self):
         subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')

@@ -193,10 +193,6 @@ class SyncThread(object):
                 config.admin_project_domain_name,
                 timeout=60)
-            if config is cfg.CONF.endpoint_cache:
-                self.sc_admin_session = EndpointCache.get_admin_backup_session(
-                    self.sc_admin_session, config.username, sc_auth_url)
 
     def initial_sync(self):
         # Return True to indicate initial sync success
         return True