# distcloud/distributedcloud/dccommon/utils.py

# 438 lines
# 17 KiB
# Python

# Copyright (c) 2020-2023 Wind River Systems, Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import collections
from datetime import datetime
import functools
import os
import random
import re
import threading
import time
from eventlet.green import subprocess
from oslo_log import log as logging
from oslo_utils import timeutils
from dccommon import consts
from dccommon import exceptions
from dccommon.exceptions import PlaybookExecutionFailed
from dccommon.exceptions import PlaybookExecutionTimeout
from dccommon.subprocess_cleanup import kill_subprocess_group
from dccommon.subprocess_cleanup import SubprocessCleanup
from dcorch.common.i18n import _
# Module-level logger, named after this module.
LOG = logging.getLogger(__name__)
# Names of Ansible variables that carry passwords.
ANSIBLE_PASSWD_PARMS = ['ansible_ssh_pass', 'ansible_become_pass']
# Substrings that mark a command argument as carrying a password;
# _strip_password_from_command blanks the value of any "key=value" token
# containing one of these before the command is written to a log.
SCRIPT_PASSWD_PARMS = ['sysadmin_password', 'password']
# Gap, in seconds, used to determine whether a given token is about to
# expire. These values are the default bounds and step from which
# is_token_expiring_soon picks a random early-renewal duration, so that
# the new keystone creation is distributed across different audit cycles.
STALE_TOKEN_DURATION_MIN = 300
STALE_TOKEN_DURATION_MAX = 480
STALE_TOKEN_DURATION_STEP = 20
# Exit code returned by the 'timeout' command when the time limit is hit;
# run_playbook compares the subprocess return code against it.
TIMEOUT_EXITCODE = 124
# is_centos treats any software version that compares less than or equal
# to this value as a CentOS release.
LAST_SW_VERSION_IN_CENTOS = "22.06"
class memoized(object):
    """Decorator that caches a function's return value per argument tuple.

    If the decorated function is called later with the same positional
    arguments, the cached value is returned (not reevaluated). Calls whose
    arguments cannot be hashed (a list, for instance) are passed straight
    through to the function and are never cached.

    WARNING: This function should not be used for class methods since it
    does not provide weak references; thus would prevent the instance from
    being garbage collected.
    """

    def __init__(self, func):
        self.func = func
        # Maps the positional-argument tuple to the value returned for it
        self.cache = {}

    def __call__(self, *args):
        """Return the cached result for args, computing it on a miss.

        :param args: positional arguments forwarded to the wrapped function
        :return: the wrapped function's return value
        """
        try:
            return self.cache[args]
        except KeyError:
            # Hashable arguments seen for the first time: compute below.
            pass
        except TypeError:
            # Uncacheable: the tuple holds an unhashable value such as a
            # list. Better to not cache than blow up. An isinstance check
            # against Hashable cannot detect this, because a tuple always
            # reports itself as hashable regardless of its contents.
            return self.func(*args)
        value = self.func(*args)
        self.cache[args] = value
        return value

    def __repr__(self):
        """Return the function's docstring, or its repr if it has none."""
        # __repr__ must return a string; __doc__ is None for undocumented
        # functions.
        return self.func.__doc__ or repr(self.func)

    def __get__(self, obj, objtype):
        """Support instance methods."""
        return functools.partial(self.__call__, obj)
class RunAnsible(object):
    """Class to run Ansible playbooks with the abort option

    Approach:

    At the start of the playbook execution, the abort status
    (default value is False) and PID of the subprocess for the
    specified subcloud are set on the class variable dict (abort_status).
    When the user sends the abort command, the subcloud_manager changes
    the abort status to True and the subprocess is killed.

    If Ansible is currently executing a task that cannot be interrupted,
    a deploy_not_abortable flag is created in the overrides folder by the
    playbook itself, and the abort process will wait for said flag to be
    deleted before killing the subprocess. If the task fails while abort
    is waiting, the playbook_failed flag will indicate to the
    original process to raise PlaybookExecutionFailed.
    """
    # Maps subcloud name -> {'abort': bool, 'subp': Popen object}.
    # Class-level, so it is shared by every RunAnsible instance.
    abort_status = {}
    # Guards reads and writes of abort_status
    lock = threading.Lock()

    def _unregister_subcloud(self, subcloud_name):
        """Remove the abort-tracking entry for a subcloud, if present.

        :param subcloud_name: Name of the subcloud
        """
        with RunAnsible.lock:
            RunAnsible.abort_status.pop(subcloud_name, None)

    def run_abort(self, subcloud_name, timeout=600):
        """Set abort status for a subcloud and kill its playbook subprocess.

        :param subcloud_name: Name of the subcloud
        :param timeout: Timeout in seconds. Upper bound on how long to wait
            for the deploy_not_abortable flag file to disappear before the
            subprocess is killed anyway.
        :return: True after kill_subprocess_group has been called
        :raises KeyError: if no playbook is registered for the subcloud
        """
        with RunAnsible.lock:
            # Fetch the subprocess handle under the same lock that sets the
            # abort flag: exec_playbook unregisters the subcloud when the
            # playbook ends, so reading the entry again after releasing the
            # lock could raise a spurious KeyError.
            status = RunAnsible.abort_status[subcloud_name]
            status['abort'] = True
            subp = status['subp']
        unabortable_flag = os.path.join(
            consts.ANSIBLE_OVERRIDES_PATH,
            '.%s_deploy_not_abortable' % subcloud_name)
        # Poll once per second until the playbook clears the flag or the
        # timeout budget is exhausted.
        while os.path.exists(unabortable_flag) and timeout > 0:
            time.sleep(1)
            timeout -= 1
        kill_subprocess_group(subp)
        return True

    def exec_playbook(self, log_file, playbook_command, subcloud_name,
                      timeout=None, register_cleanup=True):
        """Run ansible playbook via subprocess.

        :param log_file: Logs output to file
        :param playbook_command: Command (argument list) to execute
        :param subcloud_name: Name of the subcloud; key under which the
            subprocess is tracked in abort_status
        :param timeout: Timeout in seconds. Raises PlaybookExecutionTimeout
            on timeout
        :param register_cleanup: Register the subprocess group for cleanup on
            shutdown, if the underlying service supports cleanup.
        :return: True if an abort was requested for this subcloud while the
            playbook was running, False otherwise
        :raises PlaybookExecutionFailed: if the playbook exits non-zero and
            either no abort was requested or the unabortable flag exists
        :raises PlaybookExecutionTimeout: if the playbook exceeds timeout
        """
        exec_env = os.environ.copy()
        exec_env["ANSIBLE_LOG_PATH"] = "/dev/null"

        aborted = False

        if timeout:
            timeout_log_str = " (timeout: %ss)" % timeout
        else:
            timeout_log_str = ''

        with open(log_file, "a+") as f_out_log:
            try:
                # Never write passwords to the log file
                logged_playbook_command = \
                    _strip_password_from_command(playbook_command)
                txt = "%s Executing playbook command%s: %s\n" \
                    % (datetime.today().strftime('%Y-%m-%d-%H:%M:%S'),
                       timeout_log_str,
                       logged_playbook_command)
                f_out_log.write(txt)
                f_out_log.flush()

                # Remove unabortable flag created by the playbook
                # if present from previous executions
                unabortable_flag = os.path.join(
                    consts.ANSIBLE_OVERRIDES_PATH,
                    '.%s_deploy_not_abortable' % subcloud_name)
                if os.path.exists(unabortable_flag):
                    os.remove(unabortable_flag)

                subp = subprocess.Popen(playbook_command,
                                        stdout=f_out_log,
                                        stderr=f_out_log,
                                        env=exec_env,
                                        start_new_session=register_cleanup)
                try:
                    if register_cleanup:
                        SubprocessCleanup.register_subprocess_group(subp)
                    with RunAnsible.lock:
                        RunAnsible.abort_status[subcloud_name] = {
                            'abort': False,
                            'subp': subp}

                    subp.wait(timeout)
                    subp_rc = subp.poll()

                    # There are 5 possible outcomes of the subprocess execution:
                    # 1: Playbook completed (process exited)
                    #    - playbook_failure is False with subp_rc == 0,
                    #      aborted is False, unabortable_flag_exists is False
                    # 2: Playbook was aborted (process killed)
                    #    - playbook_failure is False with subp_rc != 0,
                    #      aborted is True, unabortable_flag_exists is False
                    # 3: Playbook failed (process exited)
                    #    - playbook_failure is True with subp_rc != 0,
                    #      aborted is False, unabortable_flag_exists is False
                    # 4: Playbook failed during unabortable task (process exited)
                    #    - playbook_failure is True with subp_rc != 0,
                    #      aborted is False, unabortable_flag_exists is True
                    # 5: Playbook failed while waiting to be aborted (process exited)
                    #    - playbook_failure is True with subp_rc != 0,
                    #      aborted is True, unabortable_flag_exists is False
                    with RunAnsible.lock:
                        aborted = RunAnsible.abort_status[subcloud_name]['abort']
                    unabortable_flag_exists = os.path.exists(unabortable_flag)
                    playbook_failure = (subp_rc != 0 and
                                        (not aborted or unabortable_flag_exists))

                    # Raise PlaybookExecutionFailed if the playbook fails when
                    # on normal conditions (no abort issued) or fails while
                    # waiting for the unabortable flag to be cleared.
                    if playbook_failure:
                        raise PlaybookExecutionFailed(
                            playbook_cmd=playbook_command)

                except subprocess.TimeoutExpired:
                    kill_subprocess_group(subp)
                    f_out_log.write(
                        "%s TIMEOUT (%ss) - playbook is terminated\n" %
                        (datetime.today().strftime('%Y-%m-%d-%H:%M:%S'), timeout)
                    )
                    raise PlaybookExecutionTimeout(playbook_cmd=playbook_command,
                                                   timeout=timeout)
                finally:
                    # Runs on success, failure and timeout alike: stop
                    # tracking the subprocess everywhere it was registered.
                    f_out_log.flush()
                    if register_cleanup:
                        SubprocessCleanup.unregister_subprocess_group(subp)
                    self._unregister_subcloud(subcloud_name)

            except PlaybookExecutionFailed:
                # Expected failure path: propagate without an error log
                raise
            except Exception as ex:
                LOG.error(str(ex))
                raise
        return aborted
def _strip_password_from_command(script_command):
"""Strip out any known password arguments from given command"""
logged_command = list()
for item in script_command:
if not any(parm in item for parm in SCRIPT_PASSWD_PARMS):
logged_command.append(item)
else:
tmpl = item.split()
tmpstr = ''
for tmp in tmpl:
if any(parm in tmp for parm in SCRIPT_PASSWD_PARMS):
tmpstr = tmpstr + tmp[:tmp.index('=') + 1] + ' '
else:
tmpstr = tmpstr + tmp + ' '
tmpstr = tmpstr[:-1]
logged_command.append(tmpstr)
return logged_command
# TODO(vgluzrom): remove this function and replace all calls
# with RunAnsible class
def run_playbook(log_file, playbook_command,
                 timeout=None, register_cleanup=True):
    """Run ansible playbook via subprocess.

    :param log_file: Logs output to file
    :param playbook_command: Command (argument list) to execute
    :param timeout: Timeout in seconds. Raises PlaybookExecutionTimeout
        on timeout
    :param register_cleanup: Register the subprocess group for cleanup on
        shutdown, if the underlying service supports cleanup.
    :raises PlaybookExecutionTimeout: if the 'timeout' wrapper reports that
        the time limit was reached
    :raises PlaybookExecutionFailed: if the command exits with any other
        non-zero code
    """
    exec_env = os.environ.copy()
    exec_env["ANSIBLE_LOG_PATH"] = "/dev/null"

    if timeout:
        # Invoke ansible-playbook via the 'timeout' command.
        # Using --kill-after=5s which will force a kill -9 if the process
        # hasn't terminated within 5s:
        timeout_log_str = " (timeout: %ss)" % timeout
        playbook_command = ["/usr/bin/timeout", "--kill-after=5s",
                            "%ss" % timeout] + playbook_command
    else:
        timeout_log_str = ''

    with open(log_file, "a+") as f_out_log:
        try:
            # Never write passwords to the log file
            logged_playbook_command = \
                _strip_password_from_command(playbook_command)
            txt = "%s Executing playbook command%s: %s\n" \
                % (datetime.today().strftime('%Y-%m-%d-%H:%M:%S'),
                   timeout_log_str,
                   logged_playbook_command)
            f_out_log.write(txt)
            f_out_log.flush()

            # TODO(kmacleod) future considerations:
            # - In python3, this code can be simplified to use the new
            #   subprocess.run(timeout=val) method or Popen with
            #   subp.wait(timeout=val)
            # - Beginning with ansible 2.10, we can introduce the
            #   ANSIBLE_TASK_TIMEOUT value to set a task-level timeout.
            #   This is not available in our current version of ansible (2.7.5)

            # When registering for cleanup, start the child in its own
            # session so the entire process group can be killed on cleanup.
            # start_new_session makes the child call setsid() itself,
            # replacing preexec_fn=os.setsid, which is not safe to use in
            # the presence of threads.
            subp = subprocess.Popen(playbook_command,
                                    stdout=f_out_log,
                                    stderr=f_out_log,
                                    env=exec_env,
                                    start_new_session=register_cleanup)
            try:
                if register_cleanup:
                    SubprocessCleanup.register_subprocess_group(subp)

                subp.communicate()  # wait for process to exit

                if timeout and subp.returncode == TIMEOUT_EXITCODE:
                    f_out_log.write(
                        "%s TIMEOUT (%ss) - playbook is terminated\n" %
                        (datetime.today().strftime('%Y-%m-%d-%H:%M:%S'), timeout)
                    )
                    raise PlaybookExecutionTimeout(playbook_cmd=playbook_command,
                                                   timeout=timeout)
                if subp.returncode != 0:
                    raise PlaybookExecutionFailed(playbook_cmd=playbook_command)
            finally:
                f_out_log.flush()
                if register_cleanup:
                    SubprocessCleanup.unregister_subprocess_group(subp)

        except PlaybookExecutionFailed:
            # Expected failure path: propagate without an error log
            raise
        except Exception as ex:
            LOG.error(str(ex))
            raise
def is_token_expiring_soon(token,
                           stale_token_duration_min=STALE_TOKEN_DURATION_MIN,
                           stale_token_duration_max=STALE_TOKEN_DURATION_MAX,
                           stale_token_duration_step=STALE_TOKEN_DURATION_STEP):
    """Tell whether a token's expiry falls within a randomized window.

    The window length, in seconds, is drawn at random from
    range(stale_token_duration_min, stale_token_duration_max,
    stale_token_duration_step) on every call.

    :param token: mapping with an 'expires_at' entry that is parsed as an
        ISO time string
    :param stale_token_duration_min: lower bound of the window (inclusive)
    :param stale_token_duration_max: upper bound of the window (exclusive)
    :param stale_token_duration_step: spacing between candidate windows
    :return: True if timeutils.is_soon reports the expiry time as within
        the chosen window, False otherwise
    """
    parsed_expiry = timeutils.parse_isotime(token['expires_at'])
    expiry_time = timeutils.normalize_time(parsed_expiry)
    renewal_window = random.randrange(stale_token_duration_min,
                                      stale_token_duration_max,
                                      stale_token_duration_step)
    return True if timeutils.is_soon(expiry_time, renewal_window) else False
def _get_key_from_file(file_contents, key):
"""Extract value from KEY=VALUE entries.
Ignore newline, ignore apostrophe, ignore quotation mark.
:param file_contents: contents of file
:param key: key to search
:return: found value or ''
"""
r = re.compile('^{}\=[\'\"]*([^\'\"\n]*)'.format(key), re.MULTILINE)
match = r.search(file_contents)
if match:
return match.group(1)
else:
return ''
@memoized
def get_os_release(release_file=consts.OS_RELEASE_FILE):
    """Function to read release information.

    Ignore newline, ignore apostrophe, ignore quotation mark.

    The result is memoized per release_file argument.

    :param release_file: file to read from
    :return: a tuple of (ID, VERSION)
    :raises DCCommonException: if the file cannot be read, if it has no ID
        value, or if the ID is not in consts.SUPPORTED_OS_TYPES
    """
    try:
        with open(release_file, 'r') as f:
            data = f.read()
    except Exception as e:
        # Interpolate after the translation lookup so that _() receives
        # the constant message template.
        raise exceptions.DCCommonException(
            msg=_("Failed to open %s : %s") % (release_file, str(e)))

    linux_distro = (
        _get_key_from_file(data, 'ID'),
        _get_key_from_file(data, 'VERSION'))

    if linux_distro[0] == '':
        raise exceptions.DCCommonException(
            msg=_("Could not determine os type from %s") % release_file)

    # Hint: This code is added here to aid future unit test.
    # Probably running unit tests on a non-supported OS (example at
    # time of writing: ubuntu), which is perfect, because code reaching
    # here will fail, and we just identified a place that would split
    # logic between OSs. The failing tests should mock this function
    # (get_os_release) for each supported OS.
    if linux_distro[0] not in consts.SUPPORTED_OS_TYPES:
        raise exceptions.DCCommonException(
            msg=_("Unsupported OS detected %s") % linux_distro[0])

    return linux_distro
def get_os_type(release_file=consts.OS_RELEASE_FILE):
    """Return the ID element of the (ID, VERSION) tuple for release_file.

    :param release_file: file passed through to get_os_release
    :return: the first element of get_os_release's result
    """
    os_id, _os_version = get_os_release(release_file)
    return os_id
def is_debian(software_version=None):
    """Check target version or underlying OS type.

    With a software_version (e.g. when checking a subcloud or a prestaging
    operation) the answer is the negation of is_centos for that version.
    Without one, the OS type of this running instance is compared against
    consts.OS_DEBIAN.

    :param software_version: optional software version to check
    :return: True if considered Debian, False otherwise
    """
    if not software_version:
        return get_os_type() == consts.OS_DEBIAN
    return not is_centos(software_version)
def is_centos(software_version=None):
    """Check target version or underlying OS type.

    With a software_version (e.g. when checking a subcloud or a prestaging
    operation) the version is compared against LAST_SW_VERSION_IN_CENTOS.
    Without one, the OS type of this running instance is compared against
    consts.OS_CENTOS.

    :param software_version: optional software version to check
    :return: True if considered CentOS, False otherwise
    """
    if not software_version:
        return get_os_type() == consts.OS_CENTOS
    # NOTE(review): this is a plain <= on the values as given; if both are
    # strings the comparison is lexicographic, which presumably relies on
    # versions keeping a fixed-width format like "22.06" - confirm.
    return software_version <= LAST_SW_VERSION_IN_CENTOS
def get_ssl_cert_ca_file():
    """Build the path of the SSL CA certificate file for the running OS.

    :return: consts.SSL_CERT_CA_DIR joined with the Debian CA file name
        when is_debian() is true, otherwise with the CentOS CA file name
    """
    ca_dir = consts.SSL_CERT_CA_DIR
    if is_debian():
        ca_file = consts.CERT_CA_FILE_DEBIAN
    else:
        ca_file = consts.CERT_CA_FILE_CENTOS
    return os.path.join(ca_dir, ca_file)