Add robustness to keystone endpoint operations

Following [1], it was observed in large-scale deployments that some
subclouds failed to bootstrap because of intermittent keystone
connection failures. This commit does the following to fix this problem:

- Add a retry to all functions containing keystone operations.
- On endpoint deletion, generate a new keystone client before
  retrying. This is done because the original client was created with
  RegionOne and 127.0.0.1, which would become invalid upon deletion
  of the previous keystone endpoints.
- Change the order so keystone endpoints get deleted at the end, to
  avoid the problem described above.
- Modify the RC file with the new region_name before deleting
  endpoints. If the RC file is not modified before deleting the
  keystone RegionOne endpoints, a failure in deletion could result in
  a system where we can't source credentials.

Additionally, this commit changes the dcdbsync endpoint to be created
in RegionOne instead of the SystemController region, as this is the
correct behavior.

Test plan:
  - PASS: Deploy 250 subclouds in parallel and verify all of them
          completed bootstrap without errors, progressed to 'complete'
          and became online.
  - PASS: Deploy a DC system with 2 system controllers and 1 subcloud.
          Verify the dcdbsync endpoint was only created in RegionOne
          on the system controller and in the corresponding
          region_name on the subcloud.

[1]: https://review.opendev.org/c/starlingx/config/+/909662

Story: 2011035
Task: 49665

Change-Id: Ic47c399af342f84ddd4d4665a0c561cac67c5587
Signed-off-by: Victor Romano <victor.gluzromano@windriver.com>
This commit is contained in:
Victor Romano 2024-03-01 11:28:40 -03:00
parent 000415e953
commit ca30ba6056
1 changed files with 53 additions and 6 deletions

View File

@ -7,10 +7,13 @@
import copy import copy
import os import os
from oslo_config import cfg
from oslo_log import log as logging from oslo_log import log as logging
from sysinv.common import constants from sysinv.common import constants
from sysinv.common.retrying import retry
from sysinv.conductor import openstack from sysinv.conductor import openstack
from sysinv.db import api as dbapi
from sysinv.puppet import puppet from sysinv.puppet import puppet
@ -135,11 +138,11 @@ SERVICES_WITH_ADITIONAL_SYSTEMCONTROLLER_ENDPOINTS = [
'patching', 'patching',
'usm', 'usm',
'vim', 'vim',
'dcmanager', 'dcmanager'
'dcdbsync'
] ]
SERVICES_WITH_ADITIONAL_SUBCLOUD_ENDPOINTS = [ # DC services where endpoints will be created in RegionOne
SERVICES_WITH_ADITIONAL_DC_ENDPOINTS = [
'dcdbsync' 'dcdbsync'
] ]
@ -155,6 +158,7 @@ SERVICES_PORTS_PATHS_MAP = {
} }
@retry(stop_max_attempt_number=3, wait_fixed=1000)
def create_users(keystone, users_to_create): def create_users(keystone, users_to_create):
if not users_to_create: if not users_to_create:
LOG.info('No users to create') LOG.info('No users to create')
@ -177,6 +181,7 @@ def create_users(keystone, users_to_create):
LOG.info(f"User {username} successfully created") LOG.info(f"User {username} successfully created")
@retry(stop_max_attempt_number=3, wait_fixed=1000)
def grant_admin_role(keystone, users_to_create, project_name): def grant_admin_role(keystone, users_to_create, project_name):
roles_dict = {role.name: role.id for role in keystone.roles.list()} roles_dict = {role.name: role.id for role in keystone.roles.list()}
users_dict = {user.name: user.id for user in keystone.users.list()} users_dict = {user.name: user.id for user in keystone.users.list()}
@ -206,6 +211,7 @@ def grant_admin_role(keystone, users_to_create, project_name):
LOG.info(f'Granted admin role for user {username}') LOG.info(f'Granted admin role for user {username}')
@retry(stop_max_attempt_number=3, wait_fixed=1000)
def create_services(keystone, services_to_create): def create_services(keystone, services_to_create):
if not services_to_create: if not services_to_create:
LOG.info('No services to create') LOG.info('No services to create')
@ -226,6 +232,7 @@ def create_services(keystone, services_to_create):
LOG.info(f"Service {service_name} successfully created") LOG.info(f"Service {service_name} successfully created")
@retry(stop_max_attempt_number=3, wait_fixed=1000)
def create_endpoints(keystone, endpoints_to_create): def create_endpoints(keystone, endpoints_to_create):
if not endpoints_to_create: if not endpoints_to_create:
LOG.info('No endpoints to create') LOG.info('No endpoints to create')
@ -267,14 +274,32 @@ def create_endpoints(keystone, endpoints_to_create):
f"{region=} was successfully created with {url}") f"{region=} was successfully created with {url}")
@retry(stop_max_attempt_number=3, wait_fixed=1000)
def delete_regionone_endpoints(keystone): def delete_regionone_endpoints(keystone):
existing_endpoints = keystone.endpoints.list() existing_endpoints = keystone.endpoints.list()
existing_services = keystone.services.list()
services_dict = {service.name: service.id for service in existing_services}
keystone_service_id = services_dict['keystone']
keystone_endpoints = []
for endpoint in existing_endpoints: for endpoint in existing_endpoints:
if endpoint.region == constants.REGION_ONE_NAME: if endpoint.region == constants.REGION_ONE_NAME:
if endpoint.service_id == keystone_service_id:
# Register keystone endpoints to delete them at the end
# so previous authentication still works
keystone_endpoints.append(endpoint)
continue
# Deleting non Keystone endpoints
keystone.endpoints.delete(endpoint) keystone.endpoints.delete(endpoint)
LOG.info(f'Deleted endpoint {endpoint}') LOG.info(f'Deleted endpoint {endpoint}')
for endpoint in keystone_endpoints:
# Deleting Keystone endpoints
keystone.endpoints.delete(endpoint)
LOG.info(f'Deleted endpoint {endpoint}')
@retry(stop_max_attempt_number=3, wait_fixed=1000)
def update_region_name_on_rc_file(region_name): def update_region_name_on_rc_file(region_name):
with open(RC_FILE_PATH, 'r') as file: with open(RC_FILE_PATH, 'r') as file:
lines = file.readlines() lines = file.readlines()
@ -368,9 +393,9 @@ def run_endpoint_config(puppet_operator: puppet.PuppetOperator,
services_to_create.extend(ADDITIONAL_SUBCLOUD_SERVICES) services_to_create.extend(ADDITIONAL_SUBCLOUD_SERVICES)
services_with_endpoints = copy.deepcopy(SERVICES_WITH_ENDPOINTS) services_with_endpoints = copy.deepcopy(SERVICES_WITH_ENDPOINTS)
if is_subcloud: if is_systemcontroller or is_subcloud:
services_with_endpoints.extend( services_with_endpoints.extend(
SERVICES_WITH_ADITIONAL_SUBCLOUD_ENDPOINTS SERVICES_WITH_ADITIONAL_DC_ENDPOINTS
) )
users_to_create = [] users_to_create = []
@ -386,6 +411,9 @@ def run_endpoint_config(puppet_operator: puppet.PuppetOperator,
region_name, region_name,
puppet_plugins_dict) puppet_plugins_dict)
if is_systemcontroller: if is_systemcontroller:
# These endpoints will be created in a different region,
# so they need to be added to the endpoints list after
# the RegionOne endpoints list has been created
endpoints_to_create.extend( endpoints_to_create.extend(
build_endpoint_list( build_endpoint_list(
SERVICES_WITH_ADITIONAL_SYSTEMCONTROLLER_ENDPOINTS, SERVICES_WITH_ADITIONAL_SYSTEMCONTROLLER_ENDPOINTS,
@ -407,8 +435,27 @@ def run_endpoint_config(puppet_operator: puppet.PuppetOperator,
create_services(keystone, services_to_create) create_services(keystone, services_to_create)
create_endpoints(keystone, endpoints_to_create) create_endpoints(keystone, endpoints_to_create)
if is_subcloud: if is_subcloud:
delete_regionone_endpoints(keystone) # Update the rc file with the region name before deleting
# the endpoints so it's still possible to source credentials
# if the deletion failed after keystone RegionOne endpoints
# were deleted
update_region_name_on_rc_file(region_name) update_region_name_on_rc_file(region_name)
try:
delete_regionone_endpoints(keystone)
except Exception:
LOG.warning("Endpoint deletion failed. Generating new "
"keystone client and trying again")
# The keystone service/client is cached in the class object,
# so we create a new instance to get a new client with
# updated region_name and keystone uri
# First we need to set the new region_name and auth_uri in cfg
auth_uri = keystone_plugin.get_identity_uri()
cfg.CONF.set_override("auth_uri", auth_uri, group=openstack.OPENSTACK_CONFIG)
cfg.CONF.set_override("region_name", region_name, group=openstack.OPENSTACK_CONFIG)
db_instance = dbapi.get_instance()
openstack_operator = openstack.OpenStackOperator(db_instance)
keystone = get_keystone_client(openstack_operator)
delete_regionone_endpoints(keystone)
# Set new endpoint reconfiguration flag # Set new endpoint reconfiguration flag
with open(ENDPONTS_RECONFIGURED_FLAG_PATH, 'a'): with open(ENDPONTS_RECONFIGURED_FLAG_PATH, 'a'):