438 lines
17 KiB
Python
438 lines
17 KiB
Python
# Copyright (c) 2020-2023 Wind River Systems, Inc.
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
# implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import collections
|
|
from datetime import datetime
|
|
import functools
|
|
import os
|
|
import random
|
|
import re
|
|
import threading
|
|
import time
|
|
|
|
from eventlet.green import subprocess
|
|
from oslo_log import log as logging
|
|
from oslo_utils import timeutils
|
|
|
|
from dccommon import consts
|
|
from dccommon import exceptions
|
|
from dccommon.exceptions import PlaybookExecutionFailed
|
|
from dccommon.exceptions import PlaybookExecutionTimeout
|
|
from dccommon.subprocess_cleanup import kill_subprocess_group
|
|
from dccommon.subprocess_cleanup import SubprocessCleanup
|
|
from dcorch.common.i18n import _
|
|
|
|
# Module-level logger, named after this module.
LOG = logging.getLogger(__name__)
# Names of Ansible variables that carry passwords.
# NOTE(review): not referenced in the visible part of this file; presumably
# consumed by other modules - confirm before removing.
ANSIBLE_PASSWD_PARMS = ['ansible_ssh_pass', 'ansible_become_pass']
# Substrings that mark a command argument as carrying a password. Used by
# _strip_password_from_command() to redact values before a command is logged.
SCRIPT_PASSWD_PARMS = ['sysadmin_password', 'password']

# Gap, in seconds, used to decide whether a given token is about to expire.
# is_token_expiring_soon() picks a random gap from
# range(STALE_TOKEN_DURATION_MIN, STALE_TOKEN_DURATION_MAX,
#       STALE_TOKEN_DURATION_STEP) on every call, so that the early renewal
# is randomized and the new keystone token creation is distributed across
# different audit cycles.

STALE_TOKEN_DURATION_MIN = 300
STALE_TOKEN_DURATION_MAX = 480
STALE_TOKEN_DURATION_STEP = 20

# Exit code returned by the 'timeout' command when the wrapped command
# timed out; run_playbook() compares the subprocess return code against it.
TIMEOUT_EXITCODE = 124

# Software versions less than or equal to this one are treated as CentOS
# releases by is_centos(); anything newer is treated as Debian.
LAST_SW_VERSION_IN_CENTOS = "22.06"
|
|
|
|
|
|
class memoized(object):
    """Decorator.

    Caches a function's return value each time it is called.
    If called later with the same arguments, the cached value is returned
    (not reevaluated).

    Only positional arguments are supported. Calls whose arguments cannot
    be hashed (a list, or a tuple containing a list, for instance) are
    never cached: the wrapped function is simply invoked again.

    WARNING: This function should not be used for class methods since it
    does not provide weak references; thus would prevent the instance from
    being garbage collected.
    """

    def __init__(self, func):
        self.func = func
        # Maps the tuple of positional arguments to the cached return value
        self.cache = {}

    def __call__(self, *args):
        """Return the cached result for args, computing it on first use."""
        try:
            # args is always a tuple, so an isinstance() check against
            # Hashable would always pass; hashing it is the only reliable
            # way to find out whether every element is hashable.
            hash(args)
        except TypeError:
            # uncacheable. a list, for instance.
            # better to not cache than blow up.
            return self.func(*args)
        if args in self.cache:
            return self.cache[args]
        value = self.func(*args)
        self.cache[args] = value
        return value

    def __repr__(self):
        '''Return the function's docstring.

        Falls back to the repr of the wrapped function when it has no
        docstring, because __repr__ must always return a string.
        '''
        return self.func.__doc__ or repr(self.func)

    def __get__(self, obj, objtype):
        '''Support instance methods.'''
        return functools.partial(self.__call__, obj)
|
|
|
|
|
|
class RunAnsible(object):
    """Class to run Ansible playbooks with the abort option

    Approach:

    At the start of the playbook execution, the abort status
    (default value is False) and the subprocess object for the
    specified subcloud are set on the class variable dict (abort_status).
    When the user sends the abort command, the subcloud_manager changes
    the abort status to True and the subprocess is killed.

    If Ansible is currently executing a task that cannot be interrupted,
    a deploy_not_abortable flag is created in the overrides folder by the
    playbook itself, and the abort process will wait for said flag to be
    deleted before killing the subprocess. If the task fails while abort
    is waiting, the playbook_failure flag will indicate to the
    original process to raise PlaybookExecutionFailed.
    """
    # Maps subcloud name -> {'abort': bool, 'subp': subprocess object}.
    # Shared by all instances.
    abort_status = {}
    # Guards every access to abort_status
    lock = threading.Lock()

    def _unregister_subcloud(self, subcloud_name):
        """Drop the abort bookkeeping entry of a subcloud, if present."""
        with RunAnsible.lock:
            RunAnsible.abort_status.pop(subcloud_name, None)

    def run_abort(self, subcloud_name, timeout=600):
        """Set abort status for a subcloud.

        Marks the running playbook of the subcloud as aborted, waits while
        the playbook's "not abortable" flag file exists (at most ``timeout``
        one-second polls), then kills the subprocess group.

        :param subcloud_name: Name of the subcloud
        :param timeout: Timeout in seconds.
        :return: True once the kill has been issued
        :raises KeyError: if no playbook is registered for the subcloud
        """
        with RunAnsible.lock:
            # Fetch the subprocess under the same lock that protects the
            # flag update: exec_playbook() unregisters the entry when the
            # playbook ends, and a lookup done after releasing the lock
            # could hit a KeyError.
            status = RunAnsible.abort_status[subcloud_name]
            status['abort'] = True
            subp = status['subp']
        unabortable_flag = os.path.join(consts.ANSIBLE_OVERRIDES_PATH,
                                        '.%s_deploy_not_abortable' % subcloud_name)
        # The subprocess is killed when the flag disappears or when the
        # timeout runs out, whichever comes first.
        while os.path.exists(unabortable_flag) and timeout > 0:
            time.sleep(1)
            timeout -= 1
        kill_subprocess_group(subp)
        return True

    def exec_playbook(self, log_file, playbook_command, subcloud_name,
                      timeout=None, register_cleanup=True):
        """Run ansible playbook via subprocess.

        :param log_file: Logs output to file
        :param playbook_command: Command to execute, as a sequence of
            argument strings
        :param subcloud_name: Name of the subcloud; key under which the run
            is registered so that run_abort() can find it
        :param timeout: Timeout in seconds. Raises PlaybookExecutionTimeout
            on timeout
        :param register_cleanup: Register the subprocess group for cleanup on
            shutdown, if the underlying service supports cleanup.
        :return: True if an abort was requested for this run, False
            otherwise
        :raises PlaybookExecutionFailed: if the playbook exits with a
            non-zero code and was not aborted, or fails while the
            unabortable flag is still present
        :raises PlaybookExecutionTimeout: if the playbook does not finish
            within ``timeout`` seconds
        """
        exec_env = os.environ.copy()
        exec_env["ANSIBLE_LOG_PATH"] = "/dev/null"

        aborted = False

        if timeout:
            timeout_log_str = " (timeout: %ss)" % timeout
        else:
            timeout_log_str = ''

        with open(log_file, "a+") as f_out_log:
            try:
                logged_playbook_command = \
                    _strip_password_from_command(playbook_command)
                txt = "%s Executing playbook command%s: %s\n" \
                    % (datetime.today().strftime('%Y-%m-%d-%H:%M:%S'),
                       timeout_log_str,
                       logged_playbook_command)
                f_out_log.write(txt)
                f_out_log.flush()

                # Remove unabortable flag created by the playbook
                # if present from previous executions
                unabortable_flag = os.path.join(consts.ANSIBLE_OVERRIDES_PATH,
                                                '.%s_deploy_not_abortable' % subcloud_name)
                if os.path.exists(unabortable_flag):
                    os.remove(unabortable_flag)

                subp = subprocess.Popen(playbook_command,
                                        stdout=f_out_log,
                                        stderr=f_out_log,
                                        env=exec_env,
                                        start_new_session=register_cleanup)
                try:
                    if register_cleanup:
                        SubprocessCleanup.register_subprocess_group(subp)
                    with RunAnsible.lock:
                        RunAnsible.abort_status[subcloud_name] = {
                            'abort': False,
                            'subp': subp}

                    subp.wait(timeout)
                    subp_rc = subp.poll()

                    # There are 5 possible outcomes of the subprocess execution:
                    # 1: Playbook completed (process exited)
                    #    - playbook_failure is False with subp_rc == 0,
                    #      aborted is False, unabortable_flag_exists is False
                    # 2: Playbook was aborted (process killed)
                    #    - playbook_failure is False with subp_rc != 0,
                    #      aborted is True, unabortable_flag_exists is False
                    # 3: Playbook failed (process exited)
                    #    - playbook_failure is True with subp_rc != 0,
                    #      aborted is False, unabortable_flag_exists is False
                    # 4: Playbook failed during unabortable task (process exited)
                    #    - playbook_failure is True with subp_rc != 0,
                    #      aborted is False, unabortable_flag_exists is True
                    # 5: Playbook failed while waiting to be aborted (process exited)
                    #    - playbook_failure is True with subp_rc != 0,
                    #      aborted is True, unabortable_flag_exists is False
                    with RunAnsible.lock:
                        aborted = RunAnsible.abort_status[subcloud_name]['abort']
                    unabortable_flag_exists = os.path.exists(unabortable_flag)
                    playbook_failure = (subp_rc != 0 and
                                        (not aborted or unabortable_flag_exists))

                    # Raise PlaybookExecutionFailed if the playbook fails when
                    # on normal conditions (no abort issued) or fails while
                    # waiting for the unabortable flag to be cleared.
                    if playbook_failure:
                        raise PlaybookExecutionFailed(playbook_cmd=playbook_command)

                except subprocess.TimeoutExpired:
                    kill_subprocess_group(subp)
                    f_out_log.write(
                        "%s TIMEOUT (%ss) - playbook is terminated\n" %
                        (datetime.today().strftime('%Y-%m-%d-%H:%M:%S'), timeout)
                    )
                    raise PlaybookExecutionTimeout(playbook_cmd=playbook_command,
                                                   timeout=timeout)
                finally:
                    # Runs on success, failure and timeout alike, so the
                    # subcloud never stays registered after its playbook ends.
                    f_out_log.flush()
                    if register_cleanup:
                        SubprocessCleanup.unregister_subprocess_group(subp)
                    self._unregister_subcloud(subcloud_name)

            except PlaybookExecutionFailed:
                # Expected failure mode: propagate without extra logging
                raise
            except Exception as ex:
                LOG.error(str(ex))
                raise
        return aborted
|
|
|
|
|
|
def _strip_password_from_command(script_command):
    """Strip out any known password arguments from given command

    Each item of the command that contains one of SCRIPT_PASSWD_PARMS is
    split on whitespace; every password-bearing KEY=VALUE token is cut
    right after its '=' and the tokens are joined back with single spaces.
    Items without a password parameter are returned untouched.

    A value that is passed as a separate token or argument (i.e. not as
    KEY=VALUE) cannot be recognized and is therefore not removed.

    :param script_command: command as an iterable of argument strings
    :return: new list with the password values removed
    """
    def _redact(token):
        matched = [parm for parm in SCRIPT_PASSWD_PARMS if parm in token]
        if not matched:
            return token
        if '=' in token:
            return token[:token.index('=') + 1]
        # No '=' to cut at: this used to raise ValueError and abort the
        # caller. Keep the parameter name and drop whatever follows it
        # inside the same token, so nothing sensitive is logged.
        return token[:min(token.index(parm) + len(parm) for parm in matched)]

    logged_command = []
    for item in script_command:
        if any(parm in item for parm in SCRIPT_PASSWD_PARMS):
            item = ' '.join(_redact(token) for token in item.split())
        logged_command.append(item)
    return logged_command
|
|
|
|
|
|
# TODO(vgluzrom): remove this function and replace all calls
# with RunAnsible class
def run_playbook(log_file, playbook_command,
                 timeout=None, register_cleanup=True):
    """Run ansible playbook via subprocess.

    The command and its output are appended to log_file. The function
    returns None when the playbook exits with code 0.

    log_file: Logs output to file
    playbook_command: Command to execute, as a list of argument strings
    timeout: Timeout in seconds. Raises PlaybookExecutionTimeout on timeout
    register_cleanup: Register the subprocess group for cleanup on shutdown,
                      if the underlying service supports cleanup.

    Raises PlaybookExecutionFailed when the playbook exits with a non-zero
    return code other than the timeout one.
    """
    exec_env = os.environ.copy()
    exec_env["ANSIBLE_LOG_PATH"] = "/dev/null"

    if timeout:
        # Invoke ansible-playbook via the 'timeout' command.
        # Using --kill-after=5s which will force a kill -9 if the process
        # hasn't terminated within 5s:
        timeout_log_str = " (timeout: %ss)" % timeout
        playbook_command = ["/usr/bin/timeout", "--kill-after=5s",
                            "%ss" % timeout] + playbook_command
    else:
        timeout_log_str = ''

    with open(log_file, "a+") as f_out_log:
        try:
            logged_playbook_command = \
                _strip_password_from_command(playbook_command)
            txt = "%s Executing playbook command%s: %s\n" \
                % (datetime.today().strftime('%Y-%m-%d-%H:%M:%S'),
                   timeout_log_str,
                   logged_playbook_command)
            f_out_log.write(txt)
            f_out_log.flush()

            # TODO(kmacleod) future considerations:
            # - In python3, this code can be simplified to use the new
            #   subprocess.run(timeout=val) method or Popen with
            #   subp.wait(timeout=val)
            # - Beginning with ansible 2.10, we can introduce the
            #   ANSIBLE_TASK_TIMEOUT value to set a task-level timeout.
            #   This is not available in our current version of ansible (2.7.5)

            # When cleanup is requested, use the same process group /
            # session for all children. This makes it easier to kill the
            # entire process group on cleanup. start_new_session performs
            # the setsid() call in the child without a preexec_fn hook,
            # which is not safe to use when threads are present; it is
            # also what RunAnsible.exec_playbook() does.
            subp = subprocess.Popen(playbook_command,
                                    stdout=f_out_log,
                                    stderr=f_out_log,
                                    env=exec_env,
                                    start_new_session=register_cleanup)
            try:
                if register_cleanup:
                    SubprocessCleanup.register_subprocess_group(subp)

                subp.communicate()  # wait for process to exit

                if timeout and subp.returncode == TIMEOUT_EXITCODE:
                    f_out_log.write(
                        "%s TIMEOUT (%ss) - playbook is terminated\n" %
                        (datetime.today().strftime('%Y-%m-%d-%H:%M:%S'), timeout)
                    )
                    raise PlaybookExecutionTimeout(playbook_cmd=playbook_command,
                                                   timeout=timeout)
                if subp.returncode != 0:
                    raise PlaybookExecutionFailed(playbook_cmd=playbook_command)
            finally:
                f_out_log.flush()
                if register_cleanup:
                    SubprocessCleanup.unregister_subprocess_group(subp)

        except PlaybookExecutionFailed:
            # Expected failure mode: propagate without extra logging
            raise
        except Exception as ex:
            LOG.error(str(ex))
            raise
|
|
|
|
|
|
def is_token_expiring_soon(token,
                           stale_token_duration_min=STALE_TOKEN_DURATION_MIN,
                           stale_token_duration_max=STALE_TOKEN_DURATION_MAX,
                           stale_token_duration_step=STALE_TOKEN_DURATION_STEP):
    """Tell whether a token is close enough to its expiry to be renewed.

    The renewal window is drawn at random on every call from
    range(stale_token_duration_min, stale_token_duration_max,
    stale_token_duration_step), so that renewals are spread over time.

    :param token: mapping holding the ISO 8601 expiry under 'expires_at'
    :param stale_token_duration_min: smallest window, in seconds
    :param stale_token_duration_max: exclusive upper bound of the window
    :param stale_token_duration_step: step between candidate windows
    :return: True if the token expires within the chosen window
    """
    parsed_expiry = timeutils.parse_isotime(token['expires_at'])
    expiry_time = timeutils.normalize_time(parsed_expiry)
    renewal_window = random.randrange(stale_token_duration_min,
                                      stale_token_duration_max,
                                      stale_token_duration_step)
    return bool(timeutils.is_soon(expiry_time, renewal_window))
|
|
|
|
|
|
def _get_key_from_file(file_contents, key):
|
|
"""Extract value from KEY=VALUE entries.
|
|
|
|
Ignore newline, ignore apostrophe, ignore quotation mark.
|
|
:param file_contents: contents of file
|
|
:param key: key to search
|
|
:return: found value or ''
|
|
"""
|
|
r = re.compile('^{}\=[\'\"]*([^\'\"\n]*)'.format(key), re.MULTILINE)
|
|
match = r.search(file_contents)
|
|
if match:
|
|
return match.group(1)
|
|
else:
|
|
return ''
|
|
|
|
|
|
@memoized
def get_os_release(release_file=consts.OS_RELEASE_FILE):
    """Function to read release information.

    Ignore newline, ignore apostrophe, ignore quotation mark.
    The result is memoized per set of positional arguments, so a given
    release file is read only once; failures are not cached.

    :param release_file: file to read from
    :return: a tuple of (ID, VERSION)
    :raises DCCommonException: if the file cannot be read, if it holds no
        ID entry, or if the ID is not in consts.SUPPORTED_OS_TYPES
    """
    linux_distro = ('', '')

    try:
        with open(release_file, 'r') as f:
            data = f.read()
            linux_distro = (
                _get_key_from_file(data, 'ID'),
                _get_key_from_file(data, 'VERSION'))
    except Exception as e:
        # Interpolate after the translation lookup: the message id handed
        # to _() must be the constant template, not the formatted text.
        raise exceptions.DCCommonException(
            msg=_("Failed to open %s : %s") % (release_file, str(e)))

    if linux_distro[0] == '':
        raise exceptions.DCCommonException(
            msg=_("Could not determine os type from %s") % release_file)

    # Hint: This code is added here to aid future unit test.
    # Probably running unit tests on a non-supported OS (example at
    # time of writing: ubuntu), which is perfect, because code reaching
    # here will fail, and we just identified a place that would split
    # logic between OSs. The failing tests should mock this function
    # (get_os_release) for each supported OS.
    if linux_distro[0] not in consts.SUPPORTED_OS_TYPES:
        raise exceptions.DCCommonException(
            msg=_("Unsupported OS detected %s") % linux_distro[0])

    return linux_distro
|
|
|
|
|
|
def get_os_type(release_file=consts.OS_RELEASE_FILE):
    """Return the OS identifier, i.e. the first item of get_os_release().

    :param release_file: file to read the release information from
    :return: the ID value found in the release file
    """
    release_info = get_os_release(release_file)
    return release_info[0]
|
|
|
|
|
|
def is_debian(software_version=None):
    """Tell whether a software version, or this host, is Debian based.

    With a software_version (e.g. when checking a subcloud or a prestaging
    operation) the answer is the opposite of is_centos() for that version.
    Without one, the OS type of the running instance is checked.

    :param software_version: optional version string to check
    :return: True for Debian, False otherwise
    """
    if not software_version:
        return get_os_type() == consts.OS_DEBIAN
    return not is_centos(software_version)
|
|
|
|
|
|
def is_centos(software_version=None):
    """Check target version or underlying OS type.

    Check either the given software_version (e.g. for checking a subcloud,
    or prestaging operation), or the underlying OS type (for this running
    instance)

    A given version is CentOS when it is not newer than
    LAST_SW_VERSION_IN_CENTOS. Dotted versions are compared number by
    number, so that e.g. "9.12" sorts before "22.06"; versions that are
    not purely numeric fall back to a plain string comparison.

    :param software_version: optional version string such as "22.06"
    :return: True for CentOS, False otherwise
    """
    if not software_version:
        return get_os_type() == consts.OS_CENTOS
    try:
        given = tuple(int(part) for part in software_version.split('.'))
        last_centos = tuple(
            int(part) for part in LAST_SW_VERSION_IN_CENTOS.split('.'))
    except (AttributeError, ValueError):
        # Not a dotted numeric version: keep the historical comparison
        return software_version <= LAST_SW_VERSION_IN_CENTOS
    return given <= last_centos
|
|
|
|
|
|
def get_ssl_cert_ca_file():
    """Return the path of the SSL CA certificate file for the running OS.

    The directory is consts.SSL_CERT_CA_DIR; the file name depends on
    whether is_debian() reports a Debian host.
    """
    ca_dir = consts.SSL_CERT_CA_DIR
    if is_debian():
        ca_file_name = consts.CERT_CA_FILE_DEBIAN
    else:
        ca_file_name = consts.CERT_CA_FILE_CENTOS
    return os.path.join(ca_dir, ca_file_name)
|