test/automated-pytest-suite/keywords/pm_helper.py


#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import configparser
import datetime
import os.path
import re
import time
from io import StringIO
import pexpect
from consts.auth import Tenant
from consts.timeout import MTCTimeout
from keywords import system_helper, host_helper
from utils.clients.ssh import ControllerClient
from utils.tis_log import LOG
KILL_CMD = 'kill -9'
PROCESS_TYPES = ['sm', 'pmon', 'other']
KILL_PROC_EVENT_FORMAT = {
# documented
# 401.001 Service group <group> state change from <state> to <state> on
# host <host_name>
#
# actual in 2017-02-20_22-01-22
# clear | 400.001 | Service group cloud-services degraded;
# cinder-api(disabled, failed) |\
# service_domain=controller.service_group=cloud-services.host=controller-1
# log | 401.001 | Service group cloud-services state change from
# active-degraded to active on host
# set | 400.001 | Service group cloud-services degraded;
# cinder-api(disabled, failed) |\
# service_domain=controller.service_group=cloud-services.host=controller-1
# 'sm': ('401.001',
# actual in 2017-02-20_22-01-22
# clear 400.001 Service group cloud-services warning; nova-novnc(disabled,
# failed)
# service_domain=controller.service_group=cloud-services.host=controller-0
# 'sm': ('400.001',
# r'Service group ([^\s]+) ([^\s]+);\s*(.*)',
# r'service_domain=controller\.service_group=([^\.]+)\.host=(.*)'),
'sm': {
'event_id': '400.001',
'critical': (
r'Service group ([^\s]+) ([^\s]+);\s*(.*)',
r'service_domain=controller\.service_group=([^\.]+)\.host=(.*)'
),
'major': (
r'Service group ([^\s]+) ([^\s]+);\s*(.*)',
r'service_domain=controller\.service_group=([^\.]+)\.host=(.*)'
),
'minor': (
r'Service group ([^\s]+) ([^\s]+);\s*(.*)',
r'service_domain=controller\.service_group=([^\.]+)\.host=(.*)'
),
},
# set 200.006 controller-1 'acpid' process has failed.
# Auto recovery in progress. host=controller-1.process=acpid minor
'pmon': {
'event_id': '200.006',
# controller-1 critical 'sm' process has failed and could not be
# auto-recovered gracefully.
# Auto- recovery progression by host reboot is required and in
# progress. host=controller-1.process=sm
'critical': (
r'([^\s]+) ([^\s]+) \'([^\']+)\' process has ([^\s]+) and could '
r'not be auto-recovered gracefully. '
r'Auto.recovery progression by host reboot is required and in '
r'progress.',
r'host=([^\.]+)\.process=([^\s]+)'
),
# compute-2 is degraded due to the failure of its 'fsmond' process.
# Auto recovery of this major
# | host=compute-2.process= | major | process is in progress.
'major': (
r'([^\s]+) is ([^\s]+) due to the failure of its \'([^\']+)\' '
r'process. Auto recovery of this ([^\s]+) '
r'process is in progress.',
r'host=([^\.]+)\.process=([^\s]+)'
),
# clear 200.006 compute-2 'mtclogd' process has failed. Auto
# recovery in progress.
# host=compute-2.process=mtclogd minor
# "compute-2 'mtclogd' process has failed. Auto recovery in progress."
# set compute-1 'ntpd' process has failed. Manual recovery is required.
'minor': (
r"([^\s]+) '([^\']+)' process has ([^\s]+)\. [^\s]+ recovery.*",
r'host=([^\.]+)\.process=([^\s]+)'
),
},
}
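# A minimal sketch of how the patterns above are applied to an fm event row
# (the sample reason/entity strings are taken from the comments above, for
# illustration only):
#
#   reason_re, entity_re = KILL_PROC_EVENT_FORMAT['sm']['major']
#   m = re.match(reason_re, 'Service group cloud-services degraded; '
#                           'cinder-api(disabled, failed)')
#   # m.group(1) -> 'cloud-services', m.group(2) -> 'degraded'
#   e = re.match(entity_re, 'service_domain=controller.'
#                           'service_group=cloud-services.host=controller-1')
#   # e.group(1) -> 'cloud-services', e.group(2) -> 'controller-1'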
AVAILABILITY_MAPPINGS = {'active': 'enabled', 'enabled': 'active'}
PMON_PROC_CONF_DIR = '/etc/pmon.d'
def get_pmon_process_info(name, host, conf_file=None, con_ssh=None):
"""
Get process info from its PMON config file
Args:
name (str): name of the PMON process
host (str): host on which the PMON process is running
con_ssh: connection to the active controller
conf_file (str): configuration file for the PMON process
Returns (dict): settings of the process
"""
LOG.info('Get PMON process information for {}'.format(name))
if not conf_file:
file_name = '{}.conf'.format(name)
else:
file_name = conf_file
cmd = 'cat {}'.format(os.path.join(PMON_PROC_CONF_DIR, file_name))
with host_helper.ssh_to_host(host, con_ssh=con_ssh) as con0_ssh:
code, output = con0_ssh.exec_sudo_cmd(cmd)
if 0 != code or not output.strip():
LOG.error(
'Failed to read config file:{}/{} for PMON process:{} on host:{}, '
'code:{}, message:{}'.format(
PMON_PROC_CONF_DIR, file_name, name, host, code, output))
return {}
conf_parser = configparser.ConfigParser()
conf_parser.read_file(StringIO(output))
settings = {}
if 'process' in conf_parser.sections():
settings = {k.strip(): v.split(';')[0].strip() for k, v in
conf_parser.items('process')}
settings['interval'] = int(settings.get('interval', 5))
settings['debounce'] = int(settings.get('debounce', 20))
LOG.debug('process settings:{}'.format(settings))
return settings
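# Usage sketch for get_pmon_process_info ('ntpd' and 'controller-0' are
# illustrative; any process with a conf file under /etc/pmon.d works):
#
#   settings = get_pmon_process_info('ntpd', 'controller-0')
#   monitor_interval = settings['interval']    # parsed as int, default 5
#   debounce = settings['debounce']            # parsed as int, default 20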
def get_ancestor_process(name, host, cmd='', fail_ok=False, retries=5,
retry_interval=3, con_ssh=None):
"""
Get the ancestor of the processes with the given name and command-line if
any.
Args:
name: name of the process
host: host on which to find the process
cmd: executable name
fail_ok: do not raise an exception on error
retries: number of attempts before giving up
retry_interval: seconds to wait before the next retry
con_ssh: ssh connection/client to the active controller
Returns:
pid (int), process id, -1 if there is any error
ppid (int), parent process id, -1 if there is any error
cmdline (str) command line of the process
"""
retries = retries if retries > 1 else 3
retry_interval = retry_interval if retry_interval > 0 else 1
if cmd:
ps_cmd = r'ps -e -oppid,pid,cmd | /usr/bin/grep "{}\|{}" | ' \
r'/usr/bin/grep -v grep | /usr/bin/grep {}'.\
format(name, os.path.basename(cmd), cmd)
else:
ps_cmd = 'ps -e -oppid,pid,cmd | /usr/bin/grep "{}" | /usr/bin/grep ' \
'-v grep'.format(name)
code, output = -1, ''
if fail_ok:
for count in range(retries):
with host_helper.ssh_to_host(host, con_ssh=con_ssh) as con0_ssh:
code, output = con0_ssh.exec_cmd(ps_cmd, fail_ok=True)
if 0 == code and output.strip():
break
LOG.warn('Failed to run cmd:{} on {} at retry:{:02d}, '
'wait:{} seconds and try again'.format(ps_cmd, host, count,
retry_interval))
time.sleep(retry_interval)
else:
with host_helper.ssh_to_host(host, con_ssh=con_ssh) as con0_ssh:
code, output = con0_ssh.exec_cmd(ps_cmd, fail_ok=False)
if not (0 == code and output.strip()):
LOG.error(
'Failed to find process with name:{} and cmd:{}'.format(name, cmd))
return -1, -1, ''
procs = []
ppids = []
for line in output.strip().splitlines():
proc_attr = line.strip().split()
if not proc_attr:
continue
try:
ppid = int(proc_attr[0].strip())
pid = int(proc_attr[1].strip())
cmdline = ' '.join(proc_attr[2:])
LOG.info('ppid={}, pid={}\ncmdline={}'.format(ppid, pid, cmdline))
except (IndexError, ValueError):
LOG.warn(
'Failed to parse ps output?! cmd={}, line={}, output={}'.format(
ps_cmd, line, output.strip()))
continue
if cmd and cmd not in cmdline:
continue
procs.append((pid, ppid, cmdline))
ppids.append(ppid)
if len(procs) <= 0:
LOG.error(
'Could not find process with name:{} and cmd:{}'.format(name, cmd))
return -1, -1, ''
pids = [v[1] for v in procs]
if len(pids) == 1:
LOG.info('procs[0]:{}'.format(procs[0]))
return procs[0]
LOG.warn(
'Multiple ({}) parent processes?, ppids:{}'.format(len(ppids), ppids))
if 1 not in ppids:
LOG.warn(
'Init is not the grandparent process?, ppids:{}'.format(ppids))
for ppid, pid, cmdline in procs:
if pid in ppids and ppid not in pids and 1 != pid:
LOG.info('pid={}, ppid={}, cmdline={}'.format(pid, ppid, cmdline))
return pid, ppid, cmdline
LOG.error(
'Could not find process, procs:{}, ppids:{}, pids:{}'.format(procs,
ppids,
pids))
return -1, -1, ''
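# Usage sketch for get_ancestor_process ('fm-api' and 'controller-0' are
# illustrative values):
#
#   pid, ppid, cmdline = get_ancestor_process('fm-api', 'controller-0',
#                                             fail_ok=True)
#   if pid == -1:
#       pass  # process not found; handle per the test case's needs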
def verify_process_with_pid_file(pid, pid_file, con_ssh=None):
"""
Check if the given PID matches the PID in the specified pid_file
Args:
pid: process id
pid_file: the file containing the process id
con_ssh: ssh connection/client to the host on which the process
resides
Returns:
"""
con_ssh = con_ssh or ControllerClient.get_active_controller()
code, output = con_ssh.exec_sudo_cmd('cat {} | head -n1'.format(pid_file),
fail_ok=False)
LOG.info('code={}, output={}'.format(code, output))
output = output.strip()
if not output or int(output) != pid:
LOG.info('Mismatched PID, expected:<{}>, from pid_file:<{}>, '
'pid_file={}'.format(pid, output, pid_file))
return False
else:
LOG.info(
'OK PID:{} matches with that from pid_file:{}, pid_file={}'.format(
pid, output.strip(), pid_file))
return True
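# Usage sketch for verify_process_with_pid_file (the pid-file path is
# illustrative):
#
#   if not verify_process_with_pid_file(pid, '/var/run/fm-api.pid'):
#       LOG.warn('pid-file is out of sync with the running process')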
def get_process_from_sm(name, con_ssh=None, pid_file='',
expecting_status='enabled-active'):
"""
Get the information for the process from SM, including PID, Name, Current
Status and Pid-File
Args:
name: name of the process
con_ssh: ssh connection/client to the active-controller
pid_file: known pid-file path/name to compare with
expecting_status: expected status of the process
Returns:
pid (int): process id
proc_name (str): process name
impact (str): impact of the process as recorded by SM
actual_status (str): actual/current status of the process
sm_pid_file (str): pid-file in records of SM
"""
con_ssh = con_ssh or ControllerClient.get_active_controller()
cmd = "true; NM={}; sudo sm-dump --impact --pid --pid_file | awk -v " \
"pname=$NM '{{ if ($1 == pname) print }}'; " \
"echo".format(name)
code, output = con_ssh.exec_sudo_cmd(cmd, fail_ok=True)
pid, proc_name, impact, sm_pid_file, actual_status = -1, '', '', '', ''
if 0 != code or not output:
LOG.warn(
'Cannot find the process:{} in SM with error code:\n{}\n'
'output:{}'.format(name, code, output))
return pid, proc_name, impact, sm_pid_file, actual_status
for line in output.splitlines():
if not line.strip():
continue
pid, proc_name, impact, sm_pid_file, actual_status = -1, '', '', '', ''
results_array = line.strip().split()
LOG.info('results_array={}'.format(results_array))
if len(results_array) != 6:
LOG.debug(
'Invalid format from output of sm-dump?! line={}\n'
'cmd={}'.format(line, cmd))
continue
proc_name = results_array[0]
if proc_name != name:
continue
expect_status = results_array[1]
actual_status = results_array[2]
if expect_status != actual_status:
LOG.warn(
'service:{} is not in expected status yet. expected:{}, '
'actual:{}. Retry'.format(
proc_name, expect_status, actual_status))
continue
if actual_status != expecting_status:
LOG.warn(
'service:{} is not in expected status yet. expected:{}, '
'actual:{}. Retry'.format(
proc_name, expecting_status, actual_status))
break
impact = results_array[3]
pid = int(results_array[4].strip())
if results_array[5] != sm_pid_file:
LOG.warn(
'pid_file not matching with that from SM-dump, pid_file={}, '
'sm-dump-pid_file={}'.format(
sm_pid_file, results_array[5]))
sm_pid_file = results_array[5]
if pid_file and sm_pid_file != pid_file:
LOG.warn(
'pid_file differs from input pid_file, pid_file={}, '
'sm-dump-pid_file={}'.format(
pid_file, sm_pid_file))
if sm_pid_file:
if not verify_process_with_pid_file(pid, sm_pid_file,
con_ssh=con_ssh):
LOG.warn(
'pid of service mismatch that from pid-file, pid:{}, '
'pid-file:{}, proc-name:{}'.format(
pid, sm_pid_file, proc_name))
# found
break
if -1 != pid:
host = system_helper.get_active_controller_name()
running, msg = is_process_running(pid, host)
if not running:
LOG.warn(
'Process not existing, name={}, pid={}, msg={}'.format(name,
pid,
msg))
return -1, '', '', '', ''
else:
LOG.info(
'OK, Process is running: name={}, pid={}, output={}'.format(
name, pid, msg))
return pid, proc_name, impact, actual_status, sm_pid_file
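# Usage sketch for get_process_from_sm; it returns a 5-tuple and pid is -1
# when the service cannot be found or is not in the expected SM state
# ('glance-api' is illustrative):
#
#   pid, proc_name, impact, status, pid_file = get_process_from_sm('glance-api')
#   if pid == -1:
#       pass  # not found in sm-dump, or pid-file/status checks failed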
def is_controller_swacted(
prev_active, prev_standby,
swact_start_timeout=MTCTimeout.KILL_PROCESS_SWACT_NOT_START,
swact_complete_timeout=MTCTimeout.KILL_PROCESS_SWACT_COMPLETE,
con_ssh=None):
"""
Wait and check if the active controller on the system was 'swacted' within
the given time period
Args:
prev_active: previous active controller
prev_standby: previous standby controller
swact_start_timeout: check within this time frame if the swacting
started
swact_complete_timeout: check if the swacting (if any) completed in
this time period
con_ssh: ssh connection/client to the current
active-controller
Returns:
"""
LOG.info(
'Check if the controllers started to swact within:{}, and completing '
'swacting within:{}'.format(
swact_start_timeout, swact_complete_timeout))
code = -1
host = prev_active
for retry in range(1, 5):
LOG.info(
'retry{:02d}: checking if swacting triggered, '
'prev-active-controller={}'.format(
retry, prev_active))
code = 0
try:
code, msg = host_helper.wait_for_swact_complete(
host, con_ssh=con_ssh, fail_ok=True,
swact_start_timeout=swact_start_timeout,
swact_complete_timeout=swact_complete_timeout)
if 0 == code:
LOG.info(
'OK, host-swacted, prev-active:{}, prev-standby:{}, '
'code:{}, message:{}'.format(
prev_active, prev_standby, code, msg))
return True
active, standby = system_helper.get_active_standby_controllers()
if active == prev_standby and standby == prev_active:
LOG.info(
'swacted?! prev-active:{} prev-standby:{}, cur active:{}, '
'cur standby:{}'.format(
prev_active, prev_standby, active, standby))
return True
break
except Exception as e:
LOG.warn(
'Error hit, indicating the system is in an unstable state, '
'probably because swacting is in progress. '
'previous active-controller:{}, previous standby-controller:{}'
'\nerror message:{}'.format(prev_active, prev_standby, e))
if retry >= 4:
LOG.error(
'Failing the test after {} retries, the system remains in an '
'unstable state, '
'probably because swacting is in progress. previous '
'active-controller:{}, '
'previous standby-controller:{}\nerror message:{}'.
format(retry, prev_active, prev_standby, e))
raise
time.sleep(10)
return 0 == code
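# Usage sketch for is_controller_swacted (controller names are illustrative):
#
#   swacted = is_controller_swacted('controller-0', 'controller-1',
#                                   swact_start_timeout=30,
#                                   swact_complete_timeout=120)
#   assert swacted, 'Expected a swact after killing the SM-managed process'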
def wait_for_sm_process_events(service, host, target_status, expecting=True,
severity='major',
last_events=None, process_type='sm', timeout=60,
interval=3, con_ssh=None):
if process_type not in KILL_PROC_EVENT_FORMAT:
LOG.error('unknown type of process:{}'.format(process_type))
event_log_id = KILL_PROC_EVENT_FORMAT[process_type]['event_id']
reason_pattern, entity_id_pattern = KILL_PROC_EVENT_FORMAT[process_type][
severity][0:2]
if last_events is not None:
last_event = last_events['values'][0]
start_time = \
last_event[1].replace('-', '').replace('T', ' ').split('.')[0]
else:
start_time = ''
search_keys = {
'Event Log ID': event_log_id,
'Reason Text': reason_pattern,
'Entity Instance ID': entity_id_pattern,
}
expected_availability = target_status.get('availability', None)
matched_events = []
stop_time = time.time() + timeout
if expecting and (service == 'nova-novnc' or service == 'vim-webserver'):
stop_time = time.time() + timeout + 300
interval = 60
retry = 0
while time.time() < stop_time:
retry += 1
matched_events[:] = []
events_table = system_helper.get_events_table(
event_log_id=event_log_id, show_uuid=True,
start=start_time, limit=10, con_ssh=con_ssh, regex=True,
**search_keys)
if not events_table or not events_table['values']:
LOG.warn(
'run{:02d} for process:{}: Empty event table?!\n'
'events_table:{}\nevent_id={}, '
'start={}\nkeys={}, severity={}'.
format(retry, service, events_table, event_log_id, start_time,
search_keys, severity))
continue
for event in events_table['values']:
try:
actual_event_id = event[3].strip()
if actual_event_id != event_log_id:
LOG.warn('Irrelevant event? event-list querying broken?!'
' looking-for-event-id={}, actual-event-id={}, '
'event={}'.
format(event_log_id, actual_event_id, event))
continue
actual_state = event[2]
if actual_state not in ('set', 'clear'):
LOG.info(
'State not matching, expected-state="set" or "clear", '
'actual-state="{}", event={}'.format(
actual_state, event))
continue
actual_reason = event[4].strip()
# ('cloud-services', 'active', 'active-degraded',
# 'controller-0;', ' glance-api(disabled, failed)')
m = re.match(reason_pattern, actual_reason)
if not m:
LOG.info(
'Not matched event:{},\nevent_id={}, start={}, '
'reason_text={}'.format(
event, event_log_id, start_time, reason_pattern))
continue
actual_group_status = m.group(2)
if actual_group_status not in ('active', expected_availability):
LOG.info(
'Group status not matching!, expected-status={}, '
'actual-status={}\nevent={}'.format(
expected_availability, actual_group_status, event))
continue
if 'host={}'.format(host) not in event[5]:
LOG.info(
'Host not matching, expected-host={}, actual-host={}, '
'event={}'.format(
host, event[5], event))
continue
actual_service_name, status = m.group(3).split('(')
service_operational, service_availability = status.split(',')
matched_events.append(dict(
uuid=event[0],
event=event[1:-1],
service=actual_service_name,
serice_operational=service_operational,
service_availability=service_availability.strip().strip(
')'),
group_name=m.group(1),
group_prev_status=m.group(2),
group_status=m.group(3)
))
if not expecting:
LOG.error(
'Found set/clear event while it should NOT\nevent:'
'{}'.format(event))
return -1, tuple(matched_events)
matched_events = list(reversed(matched_events))
if len(matched_events) > 1:
if matched_events[-1]['event'][1] == 'clear' and \
matched_events[-2]['event'][1] == 'set':
LOG.info('OK, found matched events:{}'.format(
matched_events))
return 0, tuple(matched_events)
except IndexError:
LOG.error(
'CLI fm event-list changed its output format?\nsearching '
'keys={}'.format(
search_keys))
raise
LOG.warn(
'No matched event found at try:{}, will sleep {} seconds and retry'
'\nmatched events:\n{}, host={}'.format(retry, interval,
matched_events, host))
time.sleep(interval)
continue
LOG.info('No matched events:\n{}'.format(matched_events))
return -1, tuple()
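# Usage sketch for wait_for_sm_process_events; the target_status dict mirrors
# the one built by _check_status_after_killing_process below (values are
# illustrative):
#
#   code, events = wait_for_sm_process_events(
#       'glance-api', 'controller-0',
#       {'operational': 'enabled', 'availability': 'warning'},
#       expecting=True, severity='major', process_type='sm')
#   # code == 0 means a matching set/clear event pair was found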
def _check_status_after_killing_process(service, host, target_status,
expecting=True, process_type='sm',
last_events=None, con_ssh=None,
auth_info=Tenant.get('admin_platform')):
LOG.info(
'check for process:{} on host:{} expecting status:{}, process_type:'
'{}'.format(service, host, target_status, process_type))
try:
operational, availability = target_status.split('-')
except ValueError as e:
LOG.error('unknown host status:{}, error:{}'.format(target_status, e))
raise
expected = {'operational': operational, 'availability': availability}
if availability == 'warning':
LOG.info('impact:{} meaning: operational={}, availability={}'.format(
target_status, operational, availability))
code, _ = wait_for_sm_process_events(
service,
host,
expected,
expecting=expecting,
last_events=last_events,
process_type=process_type,
con_ssh=con_ssh)
return (0 == code) == expecting
total_wait = 120 if expecting else 30
time.sleep(1)
found = system_helper.wait_for_host_values(host, timeout=total_wait / 2,
con_ssh=con_ssh, fail_ok=True,
auth_info=auth_info, **expected)
if expecting and found:
LOG.debug('OK, process:{} in status:{} as expected.'.format(
service, target_status))
LOG.debug('Next, wait and verify the system recovers')
expected = {'operational': 'enabled', 'availability': 'available'}
return system_helper.wait_for_host_values(
host, timeout=total_wait / 2, con_ssh=con_ssh, auth_info=auth_info,
fail_ok=True, **expected)
# return True
elif not expecting and found:
LOG.error('Unexpected status for process:{}, expected status:{}'.format(
service, expected))
return False
elif not expecting and not found:
LOG.info(
'OK, IMPACT did not happen which is correct. '
'target_status={}'.format(target_status))
return True
elif expecting and not found:
LOG.warn(
'host is not in expected status:{} for service:{}'.format(expected,
service))
code = wait_for_sm_process_events(
service, host, expected, expecting=expecting,
last_events=last_events,
process_type=process_type, con_ssh=con_ssh)[0]
return 0 == code
else:
# should never reach here
pass
def check_impact(impact, service_name, host='', last_events=None,
expecting_impact=False, process_type='sm', con_ssh=None,
timeout=80, **kwargs):
"""
Check if the expected IMPACT happens (or NOT) on the specified host
Args:
impact (str): system behavior to check, including:
swact --- the active controller swacting
enabled-degraded --- the host changed to
'enabled-degraded' status
disabled-failed --- the host changed to
'disabled-failed' status
...
service_name (str): name of the service/process
host (str): the host to check
last_events (dict): the last events before the kill action
expecting_impact (bool): whether the IMPACT should happen
process_type (str): type of the process: sm, pmon, other
con_ssh: ssh connection/client to the active controller
timeout: max seconds to wait for the expected impact
**kwargs:
Returns:
boolean - whether the IMPACT happens as expected
"""
LOG.info(
'Checking impact:{} on host:{} after killing process:{}, '
'process_type={}'.format(
impact, host, service_name, process_type))
prev_active = kwargs.get('active_controller', 'controller-0')
prev_standby = kwargs.get('standby_controller', 'controller-1')
severity = kwargs.get('severity', 'major')
if impact == 'swact':
if expecting_impact:
return is_controller_swacted(prev_active, prev_standby,
con_ssh=con_ssh,
swact_start_timeout=max(timeout / 2,
20),
swact_complete_timeout=timeout)
else:
return not is_controller_swacted(prev_active, prev_standby,
con_ssh=con_ssh,
swact_start_timeout=timeout / 4)
elif impact in ('enabled-degraded', 'enabled-warning'):
return _check_status_after_killing_process(
service_name, host, target_status=impact,
expecting=expecting_impact,
process_type=process_type, last_events=last_events, con_ssh=con_ssh)
elif impact == 'disabled-failed':
if host == prev_active:
LOG.info(
'Killing PMON process:{} on active host:{} will trigger '
'swact. impact:{}, '
'severity:{}'.format(service_name, host, impact, severity))
swacted = is_controller_swacted(prev_active, prev_standby,
con_ssh=con_ssh,
swact_start_timeout=20,
swact_complete_timeout=timeout)
assert swacted, 'Active-controller must be swacted before being ' \
'taken into disabled-failed status'
operational, available = impact.split('-')
expected = {'operational': operational, 'available': available}
reached = system_helper.wait_for_host_values(host, timeout=timeout,
con_ssh=con_ssh,
fail_ok=True, **expected)
if reached and expecting_impact:
LOG.info(
'host {} reached status {} as expected after killing process '
'{}'.format(
host, expected, service_name))
return True
elif not reached and not expecting_impact:
LOG.info(
'host {} DID NOT reach status {} (as expected) after killing '
'process {}'.format(
host, expected, service_name))
return True
else:
LOG.error(
'Host:{} did not get into status:{} in {} seconds, searching '
'for related events'.format(
host, expected, timeout))
# todo: it's better to do this in parallel with process-monitoring
expected = {'operational': 'enabled',
'available': ['available', 'degraded']}
reached = system_helper.wait_for_host_values(host, timeout=timeout,
con_ssh=con_ssh,
fail_ok=True,
**expected)
if reached:
LOG.info(
'Host:{} recovered to status:{} within {} '
'seconds'.format(
host, expected, timeout))
return True
LOG.error(
'Host:{} did not get into status:{} in {} seconds, and there '
'is no related events'.format(
host, expected, timeout))
return False
else:
LOG.warn(
'impact-checker for impact:{} not implemented yet, '
'kwargs:{}'.format(impact, kwargs))
return False
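# Usage sketch for check_impact (values are illustrative; last_events would
# normally be the output of _get_last_events_timestamps() captured just before
# the kill):
#
#   ok = check_impact('swact', 'glance-api', host='controller-0',
#                     expecting_impact=True, process_type='sm',
#                     active_controller='controller-0',
#                     standby_controller='controller-1')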
def get_pmon_process_id(pid_file, host, con_ssh=None):
cmd = 'cat {} 2>/dev/null | head -n1 && echo 2>/dev/null'.format(pid_file)
with host_helper.ssh_to_host(host, con_ssh=con_ssh) as con:
code, output = con.exec_cmd(cmd)
if output.strip():
return int(output.strip())
return -1
def get_process_info(name, cmd='', pid_file='', host='', process_type='sm',
con_ssh=None,
auth_info=Tenant.get('admin_platform')):
"""
Get the information of the process with the specified name
Args:
name (str): name of the process
cmd (str): path of the executable
pid_file (str): path of the file containing the process id
host (str): host on which the process resides
process_type (str): type of service/process, must be one of 'sm',
'pmon', 'other'
con_ssh: ssh connection/client to the active controller
auth_info
Returns:
"""
LOG.info('name:{} cmd={} pid_file={} host={} process_type={}'.format(
name, cmd, pid_file, host, process_type))
active_controller = system_helper.get_active_controller_name(
con_ssh=con_ssh, auth_info=auth_info)
if not host:
host = active_controller
if process_type == 'sm':
LOG.debug(
'to get_process_info for SM process:{} on host:{}'.format(name,
host))
if host != active_controller:
LOG.warn(
'Already swacted? host:{} is not the active controller now. '
'Active controller is {}'.format(
host, active_controller))
pid, name, impact, status, pid_file = get_process_from_sm(
name, con_ssh=con_ssh, pid_file=pid_file)
if status != 'enabled-active':
LOG.warn('SM process is in status:{}, not "enabled-active"'.format(
status))
if 'disabl' in status:
LOG.warn(
'Wrong controller? Or controller already swacted, '
'wait and try on the other controller')
time.sleep(10)
return get_process_from_sm(name, pid_file=pid_file)
return -1, name, impact, status, pid_file
else:
return pid, name, impact, status, pid_file
elif process_type == 'pmon':
pid = get_pmon_process_id(pid_file, host, con_ssh=con_ssh)
LOG.info('Found: PID={} for PMON process:{}'.format(pid, name))
return pid, name
else:
LOG.info('Try to find the process:{} using "ps"'.format(name))
pid = get_ancestor_process(name, host, cmd=cmd, con_ssh=con_ssh)[0]
if -1 == pid:
return -1, ''
return pid, name
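# Usage sketch for get_process_info; note the return shape depends on
# process_type: five items for 'sm', two for 'pmon'/'other' (values are
# illustrative):
#
#   pid, proc_name = get_process_info('ntpd', pid_file='/var/run/ntpd.pid',
#                                     host='compute-1',
#                                     process_type='pmon')[0:2]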
def is_process_running(pid, host, con_ssh=None, retries=3, interval=3):
"""
Check if a process with the given PID exists
Args:
pid (int): process id
host (str): host the process resides
con_ssh: ssh connection/client to the host
retries (int): times to re-try if no process found before return
failure
interval (int): time to wait before next re-try
Returns:
boolean - true if the process exists, false otherwise
msg (str) - the details of the process or error messages
"""
cmd = 'ps -p {}'.format(pid)
for _ in range(retries):
with host_helper.ssh_to_host(host, con_ssh=con_ssh) as host_ssh:
code, output = host_ssh.exec_cmd(cmd, fail_ok=True)
if 0 != code:
LOG.warn(
'Process:{} DOES NOT exist, error:{}'.format(pid, output))
else:
return True, output
time.sleep(interval)
return False, ''
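# Usage sketch for is_process_running (pid and host are illustrative):
#
#   running, details = is_process_running(12345, 'controller-1')
#   if not running:
#       pass  # the killed process has not been restarted (yet)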
def _get_last_events_timestamps(limit=1, event_log_id=None, con_ssh=None,
auth_info=Tenant.get('admin_platform')):
latest_events = system_helper.get_events_table(limit=limit,
event_log_id=event_log_id,
show_uuid=True,
con_ssh=con_ssh,
auth_info=auth_info)
return latest_events
def kill_sm_process_and_verify_impact(name, cmd='', pid_file='', retries=2,
impact='swact', host='controller-0',
interval=20, action_timeout=90,
total_retries=3, process_type='sm',
on_active_controller=True, con_ssh=None,
auth_info=Tenant.get('admin_platform')):
"""
Kill the process with the specified name and verify the system behaviors
as expected
Args:
name (str): name of the process
cmd (str): executable of the process
pid_file (str): file containing process id
retries (int): number of kills after which the
IMPACT is expected to be triggered
impact (str): system behavior including:
swact -- active controller is swacted
enabled-degraded -- the host status will change to
'enabled-degraded'
disabled-failed -- the host status will change to
'disabled-failed'
...
host (str): host to test on
interval (int): least time to wait between kills
action_timeout (int): kills and impact should happen within this
time frame
total_retries (int): total number of retries for whole kill and
wait actions
process_type (str): valid types are: sm, pmon, other
on_active_controller (boolean):
con_ssh: ssh connection/client to the active controller
auth_info
Returns: (pid, host)
pid:
>0 success, the final PID of the process
-1 fail because of impact NOT happening after killing the
process up to threshold times
-2 fail because of impact happening before killing threshold times
-3 fail after try total_retries times
host:
the host tested on
"""
active_controller, standby_controller = \
system_helper.get_active_standby_controllers(con_ssh=con_ssh,
auth_info=auth_info)
if on_active_controller:
LOG.info(
'on active controller: {}, host:{}'.format(active_controller, host))
host = active_controller
con_ssh = con_ssh or ControllerClient.get_active_controller()
LOG.info('on host: {}'.format(host))
if total_retries < 1 or retries < 1:
LOG.error(
'retries/total-retries < 1? retries:{}, total retries:{}'.format(
retries, total_retries))
return None
count = 0
for i in range(1, total_retries + 1):
LOG.info(
'retry:{:02d} kill the process:{} and verify impact:{}'.format(
i, name, impact))
exec_times = []
killed_pids = []
timeout = time.time() + action_timeout * (
retries / 2 if retries > 2 else 1)
while time.time() < timeout:
count += 1
LOG.debug(
'retry{:02d}-{:02d}: getting process info for {} on '
'host:{}'.format(
i, count, name, host))
try:
pid, proc_name = get_process_info(name, cmd=cmd, host=host,
process_type=process_type,
pid_file=pid_file,
con_ssh=con_ssh)[0:2]
except pexpect.exceptions.EOF:
LOG.warn(
'retry{:02d}-{:02d}: Failed to get process id for {} on '
'host:{}, swacted unexpectedly?'.format(
i, count, name, host))
time.sleep(interval / 3.0)
continue
if -1 == pid:
LOG.error(
'retry{:02d}-{:02d}: Failed to get PID for process with '
'name:{}, cmd:{}, '
'wait and retries'.format(i, count, name, cmd))
time.sleep(interval / 3.0)
continue
if killed_pids and pid in killed_pids:
LOG.warn(
'retry{:02d}-{:02d}: No new process re-created, '
'prev-pid={}, cur-pid={}'.format(
i, count, killed_pids[-1], pid))
time.sleep(interval / 3.0)
continue
last_killed_pid = killed_pids[-1] if killed_pids else None
killed_pids.append(pid)
last_kill_time = exec_times[-1] if exec_times else None
exec_times.append(datetime.datetime.utcnow())
latest_events = _get_last_events_timestamps(
event_log_id=KILL_PROC_EVENT_FORMAT[process_type]['event_id'],
limit=10)
LOG.info(
'retry{:02d}-{:02d}: before kill CLI, proc_name={}, pid={}, '
'last_killed_pid={}, last_kill_time={}'.format(
i, count, proc_name, pid, last_killed_pid, last_kill_time))
LOG.info('\tactive-controller={}, standby-controller={}'.format(
active_controller, standby_controller))
kill_cmd = '{} {}'.format(KILL_CMD, pid)
with host_helper.ssh_to_host(host, con_ssh=con_ssh) as con:
code, output = con.exec_sudo_cmd(kill_cmd, fail_ok=True)
if 0 != code:
# it happens occasionally
LOG.error('Failed to kill pid:{}, cmd={}, output=<{}>, '
'at run:{}, already terminated?'.format(
pid, kill_cmd, output, count))
if count < retries:
# IMPACT should not happen yet
if not check_impact(impact, proc_name,
last_events=latest_events,
active_controller=active_controller,
standby_controller=standby_controller,
expecting_impact=False,
process_type=process_type, host=host,
con_ssh=con_ssh):
LOG.error(
'Impact:{} observed unexpectedly, it should happen '
'only after killing {} times, '
'actual killed times:{}'.format(impact, retries, count))
return -2, host
LOG.info(
'retry{:02d}-{:02d}: OK, NO impact as expected, impact={}, '
'will kill it another time'.format(i, count, impact))
time.sleep(max(interval * 1 / 2.0, 5))
else:
no_standby_controller = standby_controller is None
expecting_impact = True if not no_standby_controller else False
if not check_impact(
impact, proc_name, last_events=latest_events,
active_controller=active_controller,
standby_controller=standby_controller,
expecting_impact=expecting_impact,
process_type=process_type, host=host, con_ssh=con_ssh):
LOG.error(
'No impact after killing process {} {} times, while '
'{}'.format(proc_name, count,
('expecting impact' if expecting_impact
else 'not expecting impact')))
return -1, host
LOG.info(
'OK, final retry{:02d}-{:02d}: OK, IMPACT happened '
'(if applicable) as expected, '
'impact={}'.format(i, count, impact))
active_controller, standby_controller = \
system_helper.get_active_standby_controllers(
con_ssh=con_ssh)
LOG.info(
'OK, after impact:{} (tried:{} times), '
'now active-controller={}, standby-controller={}'.format(
impact, count, active_controller, standby_controller))
pid, proc_name = get_process_info(name, cmd=cmd, host=host,
pid_file=pid_file,
process_type=process_type,
con_ssh=con_ssh)[0:2]
return pid, active_controller
return -3, host
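# Usage sketch for kill_sm_process_and_verify_impact; an SM-managed service
# (here 'glance-api', illustrative) is killed 'retries' times and the impact
# is expected only on the final kill:
#
#   pid, host = kill_sm_process_and_verify_impact(
#       'glance-api', retries=2, impact='swact', process_type='sm',
#       on_active_controller=True)
#   assert pid > 0, 'kill/verify sequence failed with code {}'.format(pid)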
def wait_for_sm_dump_services_active(timeout=60, fail_ok=False, con_ssh=None,
auth_info=Tenant.get('admin_platform')):
"""
Wait for all sm-dump services to reach their desired states on the active controller
Args:
timeout:
fail_ok:
con_ssh:
auth_info
Returns:
"""
active_controller = system_helper.get_active_controller_name(
con_ssh=con_ssh, auth_info=auth_info)
return host_helper.wait_for_sm_dump_desired_states(
controller=active_controller, timeout=timeout, fail_ok=fail_ok)
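# Usage sketch for wait_for_sm_dump_services_active, assuming the underlying
# host_helper.wait_for_sm_dump_desired_states() returns a truthy value when
# all services reach their desired state within the timeout:
#
#   assert wait_for_sm_dump_services_active(timeout=120, fail_ok=True), \
#       'Some SM services did not reach their desired state in time'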