integ/ceph/ceph/files/osd-wait-status.py

247 lines
7.5 KiB
Python

#!/usr/bin/python
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# Wait for one or a group of OSDs to match one or a group of statuses
# as reported by "ceph osd tree".
#
# Examples:
# - wait for osd 0 to be up:
# osd-wait-status -o 0 -s up
#
# - wait for osd 0 and osd 1 to be up:
# osd-wait-status -o 0 1 -s up
#
# The amount of time spent waiting for OSDs to match a status can
# be limited by specifying:
#
# - the maximum retry count; the script will fail if the status doesn't
# match the desired one after retry count attempts.
# The interval between attempts is controlled by the "-i" flag.
# Example:
# osd-wait-status -o 0 -s up -c 2 -i 3
# will call "ceph osd tree" once to get the status of osd 0 and if
# it's not "up" then it will try one more time after 3 seconds.
#
# - a deadline as the maximum interval of time the script is looping
# waiting for OSDs to match status. The interval between attempts
# is controlled by the "-i" flag.
# Example:
# osd-wait-status -o 0 -s up -d 10 -i 3
# will call "ceph osd tree" until either osd 0 status is "up" or
# no more than 10 seconds have passed, that's 3-4 attempts depending
# on how much time it takes to run "ceph osd tree"
#
# Status match can be reversed by using "-n" flag.
# Example:
# osd-wait-status -o 0 -n -s up
# waits until osd 0 status is NOT up.
#
# osd-wait-status does not allow matching arbitrary combinations of
# OSDs and statuses. For example: "osd 0 up and osd 1 down" is not
# supported.
#
# Return code is 0 if OSDs match expected status before the
# retry count*interval / deadline limits are reached, and 1 otherwise.
import argparse
import json
import logging
import retrying
import subprocess
import sys
import time
# Location of the ceph command line client used to query OSD state.
CEPH_BINARY_PATH = '/usr/bin/ceph'

# Default for "-i": seconds to wait between two status checks.
RETRY_INTERVAL_SEC = 1

# Sentinel values: a retry count / deadline equal to these means that
# no attempt limit / no time limit is configured.
RETRY_FOREVER = 0
NO_DEADLINE = 0

# Log everything (DEBUG and above) through a dedicated logger.
logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger('osd-wait-status')
class OsdException(Exception):
    """Error raised while querying or matching OSD status.

    The ``restartable`` attribute records whether the failed operation
    is worth attempting again; it defaults to False.
    """

    def __init__(self, message, restartable=False):
        # Record the retry hint first, then let Exception store the message.
        self.restartable = restartable
        super(OsdException, self).__init__(message)
def get_osd_tree():
    """Run "ceph osd tree --format json" and return the decoded output.

    Returns:
        The JSON document printed by the command, decoded with
        json.loads().

    Raises:
        OsdException: if the command cannot be started, if it exits
            with a non-zero return code (this case is flagged as
            restartable) or if its output is not valid JSON.
    """
    command = [CEPH_BINARY_PATH,
               'osd', 'tree', '--format', 'json']
    try:
        p = subprocess.Popen(command,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output, error = p.communicate()
    except OSError as e:
        # The binary could not be executed at all; not restartable.
        raise OsdException(
            ('Command failed: command="{}", '
             'reason="{}"').format(' '.join(command), str(e)))
    if p.returncode != 0:
        # Report stderr as well: it usually carries the actual reason
        # for the failure and used to be dropped from the message.
        raise OsdException(
            ('Command failed: command="{}", '
             'returncode={}, output="{}", error="{}"').format(
                 ' '.join(command),
                 p.returncode,
                 output, error),
            restartable=True)
    try:
        return json.loads(output)
    except ValueError as e:
        raise OsdException(
            ('JSON decode failed: '
             'data="{}", error="{}"').format(
                 output, e))
def osd_match_status(target_osd, target_status,
                     reverse_logic):
    """Check the current status of a group of OSDs against target_status.

    Args:
        target_osd: names of the OSDs to check, compared against the
            "name" field of the nodes reported by get_osd_tree()
            (e.g. "osd.0").
        target_status: collection of statuses considered a match.
        reverse_logic: when true an OSD matches if its status is NOT
            in target_status.

    Returns:
        True if every target OSD matches, False otherwise.

    Raises:
        OsdException: if the status of a target OSD is not reported,
            or if get_osd_tree() fails.
    """
    LOG.info(('Match status: '
              'target_osd={}, '
              'target status={}, '
              'reverse_logic={}').format(
                  target_osd, target_status, reverse_logic))
    tree = get_osd_tree()
    # De-duplicate so that a repeated OSD name is only looked up once.
    wanted = set(target_osd)
    osd_status = {}
    # A missing or null "nodes" entry is handled as an empty tree so it
    # is reported below as an OsdException instead of a TypeError.
    for node in tree.get('nodes') or []:
        name = node.get('name')
        if name in wanted:
            osd_status[name] = node.get('status')
            if len(osd_status) == len(wanted):
                # All requested OSDs were found, stop scanning.
                break
    LOG.info('Current OSD(s) status: {}'.format(osd_status))
    for name in target_osd:
        if name not in osd_status:
            raise OsdException(
                ('Unable to retrieve status '
                 'for "{}"').format(
                     name))
    # Collect the OSDs that have not reached the wanted condition yet,
    # without mutating osd_status while target names are being checked.
    pending = []
    for name, status in osd_status.items():
        in_target = status in target_status
        matched = (not in_target) if reverse_logic else in_target
        if not matched:
            pending.append(name)
    if not pending:
        LOG.info('OSD(s) status target reached.')
        return True
    LOG.info('OSD(s) {}matching status {}: {}'.format(
        '' if reverse_logic else 'not ',
        target_status,
        pending))
    return False
def osd_wait_status(target_osd, target_status,
                    reverse_logic,
                    retry_count, retry_interval,
                    deadline):
    """Repeatedly call osd_match_status() until it reports a match.

    Args:
        target_osd: names of the OSDs to wait for; nothing is done
            when this is empty.
        target_status: statuses passed through to osd_match_status().
        reverse_logic: passed through to osd_match_status().
        retry_count: maximum number of attempts, RETRY_FOREVER for
            no attempt limit.
        retry_interval: seconds between attempts (0 sets no fixed wait).
        deadline: maximum number of seconds to keep trying,
            NO_DEADLINE for no time limit.

    A new attempt is made when osd_match_status() returns False or
    raises an OsdException flagged as restartable. Errors raised by
    the retrying library once the limits are reached (see the
    retrying.RetryError handler in the script body) are propagated.
    """
    def should_retry_result(outcome):
        return outcome is False

    def should_retry_error(err):
        if not isinstance(err, OsdException):
            return False
        return err.restartable

    LOG.info(('Wait options: '
              'target_osd={}, '
              'target_status={}, '
              'reverse_logic={}, '
              'retry_count={}, '
              'retry_interval={}, '
              'deadline={}').format(
                  target_osd, target_status, reverse_logic,
                  retry_count, retry_interval, deadline))

    options = dict(retry_on_result=should_retry_result,
                   retry_on_exception=should_retry_error)
    if retry_count != RETRY_FOREVER:
        options['stop_max_attempt_number'] = retry_count
    # Command line values are in seconds, retrying expects milliseconds.
    if deadline != NO_DEADLINE:
        options['stop_max_delay'] = deadline * 1000
    if retry_interval != 0:
        options['wait_fixed'] = retry_interval * 1000

    # No OSD requested: there is nothing to wait for.
    if len(target_osd) == 0:
        return

    retrier = retrying.Retrying(**options)
    retrier.call(osd_match_status,
                 target_osd, target_status,
                 reverse_logic)
def non_negative_interger(value):
    """Convert a command line argument to an integer that is >= 0.

    Intended to be used as an argparse ``type=`` callable.

    Args:
        value: the raw argument (anything accepted by int()).

    Returns:
        The value converted to int.

    Raises:
        ValueError: if value cannot be converted by int().
        argparse.ArgumentTypeError: if the converted value is negative.
    """
    value = int(value)
    if value < 0:
        # The exception class is ArgumentTypeError; the lowercase
        # spelling does not exist and raised AttributeError instead.
        raise argparse.ArgumentTypeError(
            '{} is a negative integer value'.format(value))
    return value
# Script entry point: parse the command line, wait for the requested
# OSD status and exit with 0 on success or 1 on failure.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Wait for OSD status match')
    parser.add_argument(
        '-o', '--osd',
        nargs='*',
        help='osd id',
        type=non_negative_interger,
        required=True)
    parser.add_argument(
        '-n', '--not',
        dest='reverse_logic',
        help='reverse logic: wait for status NOT to match',
        action='store_true',
        default=False)
    parser.add_argument(
        '-s', '--status',
        nargs='+',
        help='status',
        type=str,
        required=True)
    parser.add_argument(
        '-c', '--retry-count',
        help='retry count',
        type=non_negative_interger,
        default=RETRY_FOREVER)
    parser.add_argument(
        '-i', '--retry-interval',
        help='retry interval (seconds)',
        type=non_negative_interger,
        default=RETRY_INTERVAL_SEC)
    parser.add_argument(
        '-d', '--deadline',
        help='deadline (seconds)',
        type=non_negative_interger,
        default=NO_DEADLINE)
    args = parser.parse_args()
    start = time.time()
    try:
        # OSD ids given on the command line are turned into the
        # "osd.<id>" names that are matched against the OSD tree.
        osd_wait_status(
            ['osd.{}'.format(o) for o in args.osd],
            args.status,
            args.reverse_logic,
            args.retry_count,
            args.retry_interval,
            args.deadline)
        LOG.info('Elapsed time: {:.02f} seconds'.format(
            time.time() - start))
        sys.exit(0)
    except retrying.RetryError as e:
        # Logger.warn() is a deprecated alias; use warning().
        LOG.warning(
            ('Retry error: {}. '
             'Elapsed time: {:.02f} seconds').format(
                 e, time.time() - start))
    except OsdException as e:
        LOG.warning(
            ('OSD wait error: {}. '
             'Elapsed time: {:.02f} seconds').format(
                 e, time.time() - start))
    # Only reached when one of the handlers above logged a failure.
    sys.exit(1)