Add robustness to keystone endpoint operations

Following [1], it was observed on scale deployments that some subclouds failed to bootstrap because of intermittent keystone connections. This commit does the following to fix this problem: - Add a retry to all functions containing keystone operations. - On endpoint deletion, generate a new keystone client before retrying. This is done because original client was created with RegionOne and 127.0.0.1, which would become invalid upon deletion of previous keystone endpoints. - Change the order so keystone endpoints gets deleted at the end, to avoid the problem described above. - Modify RC file with new region_name before deleting endpoints. If the RC file is not modified before deleting keystone RegionOne endpoints, a failure in deletion could result in a system where we can't source credentials. Additionally, this commit changes the dcdbsync endpoint to be created with RegionOne instead of SystemController region, as this is the correct behavior. Test plan: - PASS: Deploy 250 subclouds in parallel and verify all of them completed bootstrap without errors, progressed to 'complete' and became online. - PASS: Deploy a DC system with 2 system controllers and 1 subcloud. Verify the dcdbsync endpoint was only created in RegionOne in the system controller and on corresponding region_name on the subcloud. [1]: https://review.opendev.org/c/starlingx/config/+/909662 Story: 2011035 Task: 49665 Change-Id: Ic47c399af342f84ddd4d4665a0c561cac67c5587 Signed-off-by: Victor Romano <victor.gluzromano@windriver.com>
2024-03-01 11:28:40 -03:00 · 2024-03-01 11:28:40 -03:00 · ca30ba6056
parent 000415e953
commit ca30ba6056
1 changed files with 53 additions and 6 deletions
--- a/sysinv/sysinv/sysinv/sysinv/common/openstack_config_endpoints.py
+++ b/sysinv/sysinv/sysinv/sysinv/common/openstack_config_endpoints.py
@ -7,10 +7,13 @@
 import copy
 import os

+from oslo_config import cfg
 from oslo_log import log as logging

 from sysinv.common import constants
+from sysinv.common.retrying import retry
 from sysinv.conductor import openstack
+from sysinv.db import api as dbapi
 from sysinv.puppet import puppet


@ -135,11 +138,11 @@ SERVICES_WITH_ADITIONAL_SYSTEMCONTROLLER_ENDPOINTS = [
    'patching',
    'usm',
    'vim',
-    'dcmanager',
-    'dcdbsync'
+    'dcmanager'
 ]

-SERVICES_WITH_ADITIONAL_SUBCLOUD_ENDPOINTS = [
+# DC services where endpoints will be created in RegionOne
+SERVICES_WITH_ADITIONAL_DC_ENDPOINTS = [
    'dcdbsync'
 ]

@ -155,6 +158,7 @@ SERVICES_PORTS_PATHS_MAP = {
 }


+@retry(stop_max_attempt_number=3, wait_fixed=1000)
 def create_users(keystone, users_to_create):
    if not users_to_create:
        LOG.info('No users to create')
@ -177,6 +181,7 @@ def create_users(keystone, users_to_create):
        LOG.info(f"User {username} successfully created")


+@retry(stop_max_attempt_number=3, wait_fixed=1000)
 def grant_admin_role(keystone, users_to_create, project_name):
    roles_dict = {role.name: role.id for role in keystone.roles.list()}
    users_dict = {user.name: user.id for user in keystone.users.list()}
@ -206,6 +211,7 @@ def grant_admin_role(keystone, users_to_create, project_name):
        LOG.info(f'Granted admin role for user {username}')


+@retry(stop_max_attempt_number=3, wait_fixed=1000)
 def create_services(keystone, services_to_create):
    if not services_to_create:
        LOG.info('No services to create')
@ -226,6 +232,7 @@ def create_services(keystone, services_to_create):
        LOG.info(f"Service {service_name} successfully created")


+@retry(stop_max_attempt_number=3, wait_fixed=1000)
 def create_endpoints(keystone, endpoints_to_create):
    if not endpoints_to_create:
        LOG.info('No endpoints to create')
@ -267,14 +274,32 @@ def create_endpoints(keystone, endpoints_to_create):
                     f"{region=} was successfully created with {url}")


+@retry(stop_max_attempt_number=3, wait_fixed=1000)
 def delete_regionone_endpoints(keystone):
    existing_endpoints = keystone.endpoints.list()
+    existing_services = keystone.services.list()
+    services_dict = {service.name: service.id for service in existing_services}
+    keystone_service_id = services_dict['keystone']
+    keystone_endpoints = []
+
    for endpoint in existing_endpoints:
        if endpoint.region == constants.REGION_ONE_NAME:
+            if endpoint.service_id == keystone_service_id:
+                # Register keystone endpoints to delete them at the end
+                # so previous authentication still works
+                keystone_endpoints.append(endpoint)
+                continue
+            # Deleting non Keystone endpoints
            keystone.endpoints.delete(endpoint)
            LOG.info(f'Deleted endpoint {endpoint}')

+    for endpoint in keystone_endpoints:
+        # Deleting Keystone endpoints
+        keystone.endpoints.delete(endpoint)
+        LOG.info(f'Deleted endpoint {endpoint}')

+
+@retry(stop_max_attempt_number=3, wait_fixed=1000)
 def update_region_name_on_rc_file(region_name):
    with open(RC_FILE_PATH, 'r') as file:
        lines = file.readlines()
@ -368,9 +393,9 @@ def run_endpoint_config(puppet_operator: puppet.PuppetOperator,
        services_to_create.extend(ADDITIONAL_SUBCLOUD_SERVICES)

    services_with_endpoints = copy.deepcopy(SERVICES_WITH_ENDPOINTS)
-    if is_subcloud:
+    if is_systemcontroller or is_subcloud:
        services_with_endpoints.extend(
-            SERVICES_WITH_ADITIONAL_SUBCLOUD_ENDPOINTS
+            SERVICES_WITH_ADITIONAL_DC_ENDPOINTS
        )

    users_to_create = []
@ -386,6 +411,9 @@ def run_endpoint_config(puppet_operator: puppet.PuppetOperator,
                                              region_name,
                                              puppet_plugins_dict)
    if is_systemcontroller:
+        # This endpoints will be created in a different region,
+        # so they need to be added to the endpoints list after
+        # the RegionOne endpoints list was created
        endpoints_to_create.extend(
            build_endpoint_list(
                SERVICES_WITH_ADITIONAL_SYSTEMCONTROLLER_ENDPOINTS,
@ -407,8 +435,27 @@ def run_endpoint_config(puppet_operator: puppet.PuppetOperator,
    create_services(keystone, services_to_create)
    create_endpoints(keystone, endpoints_to_create)
    if is_subcloud:
-        delete_regionone_endpoints(keystone)
+        # Update the rc file with the region name before deleting
+        # the endpoints so it's still possible to source credentials
+        # if the deletion failed after keystone RegionOne endpoints
+        # were deleted
        update_region_name_on_rc_file(region_name)
+        try:
+            delete_regionone_endpoints(keystone)
+        except Exception:
+            LOG.warning("Endpoint deletion failed. Generating new "
+                        "keystone client and trying again")
+            # The keystone service/client is cached in the class object,
+            # so we create a new instance to get a new client with
+            # updated region_name and keystone uri
+            # First we need to set the new region_name and auth_uri in cfg
+            auth_uri = keystone_plugin.get_identity_uri()
+            cfg.CONF.set_override("auth_uri", auth_uri, group=openstack.OPENSTACK_CONFIG)
+            cfg.CONF.set_override("region_name", region_name, group=openstack.OPENSTACK_CONFIG)
+            db_instance = dbapi.get_instance()
+            openstack_operator = openstack.OpenStackOperator(db_instance)
+            keystone = get_keystone_client(openstack_operator)
+            delete_regionone_endpoints(keystone)

    # Set new endpoint reconfiguration flag
    with open(ENDPONTS_RECONFIGURED_FLAG_PATH, 'a'):