Increase the timeout to retrieve subcloud cert secret

The current 1-second delay between each of the up to 20 attempts to
retrieve the subcloud's certificate secret can further aggravate
stress scenarios because of the consecutive requests it sends to the
Kubernetes API.

This commit implements a pseudorandom time delay and an exponential
back-off retry for requests to the Kubernetes API, to minimize the
number of sequential requests issued to the API during stress
scenarios involving the parallel addition of many subclouds.

The exponential back-off retry is designed to run for a total maximum
wait of 210s, with a maximum wait of ~38s in the last attempt of the
loop in the worst-case scenario and a minimum wait of 2s in the first
attempt in the best-case scenario. This is intended to adapt the
growth of the wait time per request to a possible stress scenario in
the system.

Test Plan:
PASS: Full build, system install, bootstrap and unlock DC system w/
      unlocked enabled available status. Add a SX subcloud via the
      SystemController and wait until the deploy is complete.
      Observe that the subcloud is online and with in-sync status.
PASS: Deploy 250 subclouds and ensure no failures due to
      certificate creation

Closes-Bug: 2037298

Change-Id: Idddba12cc08c98bda5ef1c44511e525d0188d048
Signed-off-by: Manoel Benedito Neto <Manoel.BeneditoNeto@windriver.com>
This commit is contained in:
Manoel Benedito Neto 2023-09-19 20:16:52 -03:00 committed by Jerry Sun
parent c35aa8f566
commit c3f8d82d9b
1 changed files with 27 additions and 3 deletions

View File

@ -24,6 +24,7 @@ import filecmp
import functools
import json
import os
import random
import shutil
import threading
import time
@ -160,6 +161,11 @@ ENDPOINT_URLS = {
dccommon_consts.ENDPOINT_TYPE_SOFTWARE: "https://{}:5498",
}
# Values for the exponential backoff retry to get subcloud's
# certificate secret.
MAX_ATTEMPTS_TO_GET_INTERMEDIATE_CA_CERT = 15
MIN_WAIT_BEFORE_RETRY_KUBE_REQUEST = 1
# Values present on the overrides file generated during
# subcloud_deploy_create. They should not be deleted from
# the overrides if it's needed to recreate the file.
@ -248,12 +254,30 @@ class SubcloudManager(manager.Manager):
}
kube = kubeoperator.KubeOperator()
# Time delay is set to prevent the aggravation of stress scenarios in
# the system while performing the parallel addition of many subclouds.
delay = random.uniform(0, 10)
time.sleep(delay)
kube.apply_cert_manager_certificate(CERT_NAMESPACE, cert_name, cert)
for count in range(1, 20):
# May wait from ~2s to ~3min30s for the certificate secret to be ready.
# Exponential backoff retry is implemented to define the wait time
# between each attempt to request the certificate secret object.
# wait_per_request = min_wait*2**(retry_times*0.3 + 1) + rand, where
# rand is a random number between 0 and min_wait, with a total maximum
# wait time of 210s, e.g.:
# 1st retry: 1*2**(0*0.3 + 1) + 1, max wait time 3s,
# 2nd retry: 1*2**(1*0.3 + 1) + 1, max wait time ~3.46s,
# ...
# 10th retry: 1*2**(9*0.3 + 1) + 1, max wait time ~13.99s,
# ...
# 15th retry: 1*2**(14*0.3 + 1) + 1, max wait time ~37.76s.
for count in range(MAX_ATTEMPTS_TO_GET_INTERMEDIATE_CA_CERT):
secret = kube.kube_get_secret(secret_name, CERT_NAMESPACE)
wait_per_request = \
MIN_WAIT_BEFORE_RETRY_KUBE_REQUEST * 2 ** (count * 0.3 + 1) \
+ random.uniform(0, MIN_WAIT_BEFORE_RETRY_KUBE_REQUEST)
if not hasattr(secret, 'data'):
time.sleep(1)
time.sleep(wait_per_request)
LOG.debug('Wait for %s ... %s' % (secret_name, count))
continue
@ -264,7 +288,7 @@ class SubcloudManager(manager.Manager):
# ca cert, certificate and key pair are needed and must exist
# for creating an intermediate ca. If not, certificate is not
# ready yet.
time.sleep(1)
time.sleep(wait_per_request)
LOG.debug('Wait for %s ... %s' % (secret_name, count))
continue