From ca30ba60565591791a09a5d4f4f8e4fca57f78d5 Mon Sep 17 00:00:00 2001 From: Victor Romano Date: Fri, 1 Mar 2024 11:28:40 -0300 Subject: [PATCH] Add robustness to keystone endpoint operations Following [1], it was observed on scale deployments that some subclouds failed to bootstrap because of intermittent keystone connections. This commit does the following to fix this problem: - Add a retry to all functions containing keystone operations. - On endpoint deletion, generate a new keystone client before retrying. This is done because the original client was created with RegionOne and 127.0.0.1, which would become invalid upon deletion of the previous keystone endpoints. - Change the order so keystone endpoints get deleted at the end, to avoid the problem described above. - Modify the RC file with the new region_name before deleting endpoints. If the RC file is not modified before deleting keystone RegionOne endpoints, a failure in deletion could result in a system where we can't source credentials. Additionally, this commit changes the dcdbsync endpoint to be created with RegionOne instead of the SystemController region, as this is the correct behavior. Test plan: - PASS: Deploy 250 subclouds in parallel and verify all of them completed bootstrap without errors, progressed to 'complete' and became online. - PASS: Deploy a DC system with 2 system controllers and 1 subcloud. Verify the dcdbsync endpoint was only created in RegionOne in the system controller and in the corresponding region_name on the subcloud. 
[1]: https://review.opendev.org/c/starlingx/config/+/909662 Story: 2011035 Task: 49665 Change-Id: Ic47c399af342f84ddd4d4665a0c561cac67c5587 Signed-off-by: Victor Romano --- .../common/openstack_config_endpoints.py | 59 +++++++++++++++++-- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/sysinv/sysinv/sysinv/sysinv/common/openstack_config_endpoints.py b/sysinv/sysinv/sysinv/sysinv/common/openstack_config_endpoints.py index a5a2f7cff7..96bcc34ada 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/openstack_config_endpoints.py +++ b/sysinv/sysinv/sysinv/sysinv/common/openstack_config_endpoints.py @@ -7,10 +7,13 @@ import copy import os +from oslo_config import cfg from oslo_log import log as logging from sysinv.common import constants +from sysinv.common.retrying import retry from sysinv.conductor import openstack +from sysinv.db import api as dbapi from sysinv.puppet import puppet @@ -135,11 +138,11 @@ SERVICES_WITH_ADITIONAL_SYSTEMCONTROLLER_ENDPOINTS = [ 'patching', 'usm', 'vim', - 'dcmanager', - 'dcdbsync' + 'dcmanager' ] -SERVICES_WITH_ADITIONAL_SUBCLOUD_ENDPOINTS = [ +# DC services where endpoints will be created in RegionOne +SERVICES_WITH_ADITIONAL_DC_ENDPOINTS = [ 'dcdbsync' ] @@ -155,6 +158,7 @@ SERVICES_PORTS_PATHS_MAP = { } +@retry(stop_max_attempt_number=3, wait_fixed=1000) def create_users(keystone, users_to_create): if not users_to_create: LOG.info('No users to create') @@ -177,6 +181,7 @@ def create_users(keystone, users_to_create): LOG.info(f"User {username} successfully created") +@retry(stop_max_attempt_number=3, wait_fixed=1000) def grant_admin_role(keystone, users_to_create, project_name): roles_dict = {role.name: role.id for role in keystone.roles.list()} users_dict = {user.name: user.id for user in keystone.users.list()} @@ -206,6 +211,7 @@ def grant_admin_role(keystone, users_to_create, project_name): LOG.info(f'Granted admin role for user {username}') +@retry(stop_max_attempt_number=3, wait_fixed=1000) def create_services(keystone, 
services_to_create): if not services_to_create: LOG.info('No services to create') @@ -226,6 +232,7 @@ def create_services(keystone, services_to_create): LOG.info(f"Service {service_name} successfully created") +@retry(stop_max_attempt_number=3, wait_fixed=1000) def create_endpoints(keystone, endpoints_to_create): if not endpoints_to_create: LOG.info('No endpoints to create') @@ -267,14 +274,32 @@ def create_endpoints(keystone, endpoints_to_create): f"{region=} was successfully created with {url}") +@retry(stop_max_attempt_number=3, wait_fixed=1000) def delete_regionone_endpoints(keystone): existing_endpoints = keystone.endpoints.list() + existing_services = keystone.services.list() + services_dict = {service.name: service.id for service in existing_services} + keystone_service_id = services_dict['keystone'] + keystone_endpoints = [] + for endpoint in existing_endpoints: if endpoint.region == constants.REGION_ONE_NAME: + if endpoint.service_id == keystone_service_id: + # Register keystone endpoints to delete them at the end + # so previous authentication still works + keystone_endpoints.append(endpoint) + continue + # Deleting non Keystone endpoints keystone.endpoints.delete(endpoint) LOG.info(f'Deleted endpoint {endpoint}') + for endpoint in keystone_endpoints: + # Deleting Keystone endpoints + keystone.endpoints.delete(endpoint) + LOG.info(f'Deleted endpoint {endpoint}') + +@retry(stop_max_attempt_number=3, wait_fixed=1000) def update_region_name_on_rc_file(region_name): with open(RC_FILE_PATH, 'r') as file: lines = file.readlines() @@ -368,9 +393,9 @@ def run_endpoint_config(puppet_operator: puppet.PuppetOperator, services_to_create.extend(ADDITIONAL_SUBCLOUD_SERVICES) services_with_endpoints = copy.deepcopy(SERVICES_WITH_ENDPOINTS) - if is_subcloud: + if is_systemcontroller or is_subcloud: services_with_endpoints.extend( - SERVICES_WITH_ADITIONAL_SUBCLOUD_ENDPOINTS + SERVICES_WITH_ADITIONAL_DC_ENDPOINTS ) users_to_create = [] @@ -386,6 +411,9 @@ def 
run_endpoint_config(puppet_operator: puppet.PuppetOperator, region_name, puppet_plugins_dict) if is_systemcontroller: + # This endpoints will be created in a different region, + # so they need to be added to the endpoints list after + # the RegionOne endpoints list was created endpoints_to_create.extend( build_endpoint_list( SERVICES_WITH_ADITIONAL_SYSTEMCONTROLLER_ENDPOINTS, @@ -407,8 +435,27 @@ def run_endpoint_config(puppet_operator: puppet.PuppetOperator, create_services(keystone, services_to_create) create_endpoints(keystone, endpoints_to_create) if is_subcloud: - delete_regionone_endpoints(keystone) + # Update the rc file with the region name before deleting + # the endpoints so it's still possible to source credentials + # if the deletion failed after keystone RegionOne endpoints + # were deleted update_region_name_on_rc_file(region_name) + try: + delete_regionone_endpoints(keystone) + except Exception: + LOG.warning("Endpoint deletion failed. Generating new " + "keystone client and trying again") + # The keystone service/client is cached in the class object, + # so we create a new instance to get a new client with + # updated region_name and keystone uri + # First we need to set the new region_name and auth_uri in cfg + auth_uri = keystone_plugin.get_identity_uri() + cfg.CONF.set_override("auth_uri", auth_uri, group=openstack.OPENSTACK_CONFIG) + cfg.CONF.set_override("region_name", region_name, group=openstack.OPENSTACK_CONFIG) + db_instance = dbapi.get_instance() + openstack_operator = openstack.OpenStackOperator(db_instance) + keystone = get_keystone_client(openstack_operator) + delete_regionone_endpoints(keystone) # Set new endpoint reconfiguration flag with open(ENDPONTS_RECONFIGURED_FLAG_PATH, 'a'):