Fix LDAP issue for DC subcloud

This commit fixes an LDAP authentication issue seen on worker nodes
of a subcloud after a rehoming procedure was performed.

There are two main parts:

1. Since every host of a subcloud authenticates with the system
   controller, we need to reconfigure the LDAP URI across all nodes
   of the system when the system controller network changes (upon
   rehome).  Currently, it is only being reconfigured on controller
   nodes.

2. Currently, the system uses an SNAT rule to allow worker/storage
   nodes to authenticate with the system controller when the admin
   network is in use.  This is because the admin network only exists
   between controller nodes of a distributed cloud.  The SNAT rule
   is needed to allow traffic from the (private) management network
   of the subcloud over the admin network to the system controller
   and back again.  If the admin network is _not_ being used,
   worker/storage nodes of the subcloud can authenticate with the
   system controller, but routes must be installed on the
   worker/storage nodes to facilitate this.  These routes become
   tricky to manage in certain rehoming/network reconfiguration
   scenarios.  This traffic really should be treated in the same
   way as that of the admin network.

This commit addresses the above by:

1. Reconfiguring the ldap_server config across all nodes upon
   system controller network changes.

2. Generalizing the current admin network NAT implementation to
   handle the management network as well.

Test Plan:

IPv4, IPv6 distributed clouds

1. Rehome a subcloud to another system controller and back again
   (mgmt network)
2. Update the subcloud to use the admin network (mgmt -> admin)
3. Rehome the subcloud to another system controller and back again
   (admin network)
4. Update the subcloud to use the mgmt network (admin -> mgmt)

After each of the numbered steps, the following were performed:

a. Ensure the subcloud could become managed, online, and in-sync
b. Ensure the iptables SNAT rules were installed or updated
   appropriately on the subcloud controller nodes.
c. Log into a worker node of the subcloud and ensure sudo commands
   could be issued without LDAP timeout.
d. Log into a worker node with LDAP USER X via console and verify
   that the login succeeds.

In general, tcpdump was also used to ensure the SNAT translation was
actually happening.

Partial-Bug: #2056560

Change-Id: Ia675a4ff3a2cba93e4ef62b27dba91802811e097
Signed-off-by: Steven Webster <steven.webster@windriver.com>
This commit is contained in:
Steven Webster 2024-03-08 08:30:07 -05:00
parent b3b279a36c
commit f8d30588ad
9 changed files with 48 additions and 38 deletions

View File

@ -716,13 +716,6 @@ class AddressPoolController(rest.RestController):
# The admin address pool can be deleted at runtime
admin_network_reconfig = True
if (admin_network_reconfig):
# At runtime, remove the NAT rule that enables worker, storage
# nodes to communicate with the system controller. We must do
# this before deleting the address pool and addresses to obtain
# information about the current admin floating IP and interface.
pecan.request.rpcapi.remove_admin_firewall_config(pecan.request.context)
addresses = pecan.request.dbapi.addresses_get_by_pool(
addrpool.id)
if addresses:

View File

@ -206,11 +206,11 @@ class InterfaceNetworkController(rest.RestController):
ethernet_port_mac = tmp_interface['imac']
_update_host_mgmt_mac(host, ethernet_port_mac)
cutils.perform_distributed_cloud_config(pecan.request.dbapi,
interface_id)
interface_id, host)
elif network_type == constants.NETWORK_TYPE_ADMIN:
pecan.request.rpcapi.update_admin_config(pecan.request.context, host)
cutils.perform_distributed_cloud_config(pecan.request.dbapi,
interface_id)
interface_id, host)
elif network_type == constants.NETWORK_TYPE_OAM:
pecan.request.rpcapi.initialize_oam_config(pecan.request.context, host)

View File

@ -429,6 +429,8 @@ class NetworkController(rest.RestController):
if type == constants.NETWORK_TYPE_SYSTEM_CONTROLLER:
pecan.request.rpcapi.update_ldap_client_config(
pecan.request.context)
pecan.request.rpcapi.update_ldap_nat_config(
pecan.request.context)
elif type == constants.NETWORK_TYPE_SYSTEM_CONTROLLER_OAM:
pecan.request.rpcapi.update_dnsmasq_config(
pecan.request.context)

View File

@ -1739,7 +1739,7 @@ def is_partition_the_last(dbapi, partition):
return True
def perform_distributed_cloud_config(dbapi, mgmt_iface_id):
def perform_distributed_cloud_config(dbapi, mgmt_iface_id, host):
"""
Check if we are running in distributed cloud mode and perform any
necessary configuration.
@ -1796,8 +1796,9 @@ def perform_distributed_cloud_config(dbapi, mgmt_iface_id):
(new_route['network'], new_route['prefix'],
new_route['gateway'], mgmt_iface_id))
elif system.distributed_cloud_role == \
constants.DISTRIBUTED_CLOUD_ROLE_SUBCLOUD:
elif (system.distributed_cloud_role ==
constants.DISTRIBUTED_CLOUD_ROLE_SUBCLOUD and
host['personality'] == constants.CONTROLLER):
# Add the route back to the system controller.
# Assumption is we do not have to do any error checking
# for local & reachable gateway etc, as config_subcloud

View File

@ -3516,7 +3516,8 @@ class ConductorManager(service.PeriodicService):
# Do any potential distributed cloud config
# We do this here where the interface is created.
cutils.perform_distributed_cloud_config(self.dbapi,
new_interface['id'])
new_interface['id'],
ihost)
if port:
values = {'interface_id': port.interface_id}
try:
@ -9471,17 +9472,6 @@ class ConductorManager(service.PeriodicService):
config_uuid,
config_dict)
def remove_admin_firewall_config(self, context):
""" Remove the platform firewall rules associated with the admin network """
personalities = [constants.CONTROLLER]
config_uuid = self._config_update_hosts(context,
personalities)
config_dict = {
"personalities": personalities,
"classes": ['platform::firewall::nat::admin::remove']
}
self._config_apply_runtime_manifest(context, config_uuid, config_dict)
def update_admin_config(self, context, host, disable=False):
"""Update the admin network configuration"""
@ -9512,8 +9502,7 @@ class ConductorManager(service.PeriodicService):
'platform::sm::enable_admin_config::runtime',
'platform::haproxy::runtime',
'openstack::keystone::endpoint::runtime',
'platform::firewall::runtime',
'platform::firewall::nat::admin::runtime']
'platform::firewall::runtime']
}
self._config_apply_runtime_manifest(context, config_uuid, config_dict)
@ -14554,7 +14543,9 @@ class ConductorManager(service.PeriodicService):
def update_ldap_client_config(self, context):
"""Update the LDAP client configuration"""
personalities = [constants.CONTROLLER]
personalities = [constants.CONTROLLER,
constants.WORKER,
constants.STORAGE]
config_uuid = self._config_update_hosts(context, personalities)
config_dict = {
"personalities": personalities,
@ -14563,6 +14554,16 @@ class ConductorManager(service.PeriodicService):
}
self._config_apply_runtime_manifest(context, config_uuid, config_dict)
def update_ldap_nat_config(self, context):
"""Update the LDAP NAT configuration"""
personalities = [constants.CONTROLLER]
config_uuid = self._config_update_hosts(context, personalities)
config_dict = {
"personalities": personalities,
"classes": ['platform::firewall::dc::nat::ldap::runtime']
}
self._config_apply_runtime_manifest(context, config_uuid, config_dict)
def get_controllerfs_lv_sizes(self, context):
system = self.dbapi.isystem_get_one()
system_dc_role = system.get('distributed_cloud_role', None)

View File

@ -828,15 +828,6 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
host=host,
disable=disable))
def remove_admin_firewall_config(self, context):
"""Synchronously, have the conductor remove the admin firewall
configuration.
:param context: request context.
"""
return self.call(context, self.make_msg(
'remove_admin_firewall_config'))
def set_mgmt_network_reconfig_flag(self, context):
"""Synchronously, have the conductor update the mgmt network reconfig flag.
:param context: request context.
@ -2170,6 +2161,17 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
return self.call(context,
self.make_msg('update_ldap_client_config'))
def update_ldap_nat_config(self, context):
"""Synchronously, have a conductor configure LDAP NAT configureation
Does the following tasks:
- Update puppet hiera configuration file and apply run time manifest.
:param context: request context.
"""
return self.call(context,
self.make_msg('update_ldap_nat_config'))
def update_dnsmasq_config(self, context):
"""Synchronously, have a conductor configure the DNS configuration

View File

@ -221,15 +221,21 @@ class TestPostMixin(NetworkTestCase):
update_ldap_client_config = "sysinv.conductor.rpcapi." \
"ConductorAPI." \
"update_ldap_client_config"
update_ldap_nat_config = "sysinv.conductor.rpcapi." \
"ConductorAPI." \
"update_ldap_nat_config"
with mock.patch('sysinv.common.utils.is_initial_config_complete',
lambda: True), \
mock.patch(update_ldap_client_config,
m.update_ldap_client_config):
m.update_ldap_client_config), \
mock.patch(update_ldap_nat_config,
m.update_ldap_nat_config):
self._test_create_network_success(
'system-controller',
constants.NETWORK_TYPE_SYSTEM_CONTROLLER,
self.system_controller_subnet)
m.update_ldap_client_config.assert_called_once()
m.update_ldap_nat_config.assert_called_once()
def test_create_success_pxeboot(self):
self._test_create_network_success(

View File

@ -5020,7 +5020,9 @@ class ManagerTestCase(base.DbTestCase):
p2.start()
self.addCleanup(p2.stop)
self.service.update_ldap_client_config(self.context)
personalities = [constants.CONTROLLER]
personalities = [constants.CONTROLLER,
constants.WORKER,
constants.STORAGE]
config_dict = {
"personalities": personalities,
"classes": ['platform::ldap::client::runtime',

View File

@ -100,6 +100,9 @@ class RPCAPITestCase(base.DbTestCase):
def test_update_ldap_client_config(self):
self._test_rpcapi('update_ldap_client_config', 'call')
def test_update_ldap_nat_config(self):
self._test_rpcapi('update_ldap_nat_config', 'call')
def test_update_dnsmasq_config(self):
self._test_rpcapi('update_dnsmasq_config', 'call')