#!/usr/bin/python
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

import ast
import os
import os.path
import re
import subprocess
import sys

DEVICE_NAME_NVME = "nvme"

#########
# Utils #
#########


def command(arguments, **kwargs):
    """Execute a command and capture stdout, stderr & return code.

    :param arguments: the command and its arguments, as a list
    :param kwargs: extra keyword arguments passed through to subprocess.Popen
    :returns: a tuple (stdout, stderr, return code)
    """
    # The callers run regular expressions built from text strings over the
    # output, so ask for text output unless the caller decides otherwise.
    kwargs.setdefault("universal_newlines", True)
    process = subprocess.Popen(
        arguments,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        **kwargs)

    out, err = process.communicate()
    return out, err, process.returncode


def get_input(arg, valid_keys):
    """Convert the input to a dict and perform basic validation.

    :param arg: string holding a Python literal, with newlines possibly
                encoded as the two characters backslash + 'n'
    :param valid_keys: keys that must all be present in the resulting dict
    :returns: the parsed dict, or None if the input cannot be parsed, is not
              a dict or lacks one of the keys
    """
    json_string = arg.replace("\\n", "\n")
    try:
        input_dict = ast.literal_eval(json_string)
    except Exception:
        # literal_eval raises several unrelated exception types on malformed
        # input; all of them simply mean "invalid input" here.
        return None

    # Any other literal (e.g. a list containing the key names) would pass the
    # membership test below but break the callers, which index by key.
    if not isinstance(input_dict, dict):
        return None
    if not all(k in input_dict for k in valid_keys):
        return None

    return input_dict


def get_partition_uuid(dev):
    """Return the PARTUUID reported by blkid for dev, or None if absent."""
    output, _, _ = command(['blkid', dev])
    match = re.search(r'PARTUUID="(.+?)"', output)
    if match is None:
        return None
    return match.group(1)


def device_path_to_device_node(device_path):
    """Resolve a device path to its device node using 'readlink -f'.

    :returns: the resolved node with trailing whitespace stripped, or None
              if running the helper commands raised an exception
    """
    try:
        # Wait for udev to finish processing events for this device first.
        command(["udevadm", "settle", "-E", device_path])
        out, _, _ = command(["readlink", "-f", device_path])
    except Exception:
        return None

    return out.rstrip()


def _partition_node(disk_node, index):
    """Build the node name of partition number index on disk_node."""
    # NVMe nodes separate the partition number with a 'p'.
    separator = "p" if DEVICE_NAME_NVME in disk_node else ""
    return "{}{}{}".format(disk_node, separator, index)


def _belongs_to_disk(disk_node, node):
    """Tell whether node is disk_node itself or one of its partitions."""
    # A plain substring test would also match other disks sharing the same
    # prefix (e.g. /dev/sda vs /dev/sdaa1).
    pattern = "^" + re.escape(disk_node) + r"(?:p?\d+)?$"
    return re.match(pattern, node) is not None


###########################################
# Manage Journal Disk Partitioning Scheme #
###########################################

DISK_BY_PARTUUID = "/dev/disk/by-partuuid/"
JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106'  # Type of a journal partition


def is_partitioning_correct(disk_path, partition_sizes):
    """Validate the existence and size of journal partitions.

    :param disk_path: device path of the journal disk
    :param partition_sizes: expected partition sizes in MiB, in partition order
    :returns: True if the disk has a GPT table and every expected partition
              has the expected size, False otherwise
    """
    # Obtain the device node from the device path.
    disk_node = device_path_to_device_node(disk_path)

    # Check that partition table format is GPT
    command(["udevadm", "settle", "-E", disk_node])
    output, _, _ = command(["parted", "-s", disk_node, "print"])
    if not re.search('Partition Table: gpt', output):
        print("Format of disk node %s is not GPT, zapping disk" % disk_node)
        return False

    # Check that each partition size matches the one in input
    for partition_index, size in enumerate(partition_sizes, 1):
        partition_node = _partition_node(disk_node, partition_index)

        command(["udevadm", "settle", "-E", partition_node])
        cmd = ["parted", "-s", partition_node, "unit", "MiB", "print"]
        output, _, _ = command(cmd)

        # Accept "<size>MiB" and "<size>.00MiB" only; a bare run of zeros
        # after the size must not match (1024 is not 10240).
        regex = (r"^Disk " + re.escape(partition_node) + r":\s*" +
                 re.escape(str(size)) + r"(?:\.0*)?MiB")
        if not re.search(regex, output, re.MULTILINE):
            print("Journal partition %(node)s size is not %(size)s, "
                  "zapping disk" % {"node": partition_node, "size": size})
            return False

    command(["udevadm", "settle", "-t", "10"])

    return True


def create_partitions(disk_path, partition_sizes):
    """Recreate partitions.

    Creates a new GPT table on the disk, then one partition per entry of
    partition_sizes (sizes in MiB), in order. Exits the process with
    status 1 if the table or a partition cannot be created.
    """
    # Obtain the device node from the device path.
    disk_node = device_path_to_device_node(disk_path)

    # Issue: After creating a new partition table on a device, Udev does not
    # always remove old symlinks (i.e. to previous partitions on that device).
    # Also, even if links are erased before zapping the disk, some of them will
    # be recreated even though there is no partition to back them!
    # Therefore, we have to remove the links AFTER we erase the partition table
    # Issue: DISK_BY_PARTUUID directory is not present at all if there are no
    # GPT partitions on the storage node so nothing to remove in this case
    links = []
    if os.path.isdir(DISK_BY_PARTUUID):
        candidates = [os.path.join(DISK_BY_PARTUUID, name)
                      for name in os.listdir(DISK_BY_PARTUUID)]
        links = [link for link in candidates if os.path.islink(link)]

    # Erase all partitions on current node by creating a new GPT table
    _, err, ret = command(["parted", "-s", disk_node, "mktable", "gpt"])
    if ret:
        print("Error erasing partition table of %(node)s\n"
              "Return code: %(ret)s reason: %(reason)s" %
              {"node": disk_node, "ret": ret, "reason": err})
        sys.exit(1)

    # Erase old symlinks
    for link in links:
        if _belongs_to_disk(disk_node, os.path.realpath(link)):
            os.remove(link)

    # Create partitions in order
    used_space_mib = 1  # leave 1 MB at the beginning of the disk
    for num, size in enumerate(partition_sizes, 1):
        start = used_space_mib
        end = used_space_mib + size
        cmd = ['parted', '-s', disk_node, 'unit', 'mib',
               'mkpart', 'primary',
               str(start), str(end)]
        _, err, ret = command(cmd)
        parms = {"disk_node": disk_node,
                 "start": start,
                 "end": end,
                 "reason": err}
        if ret:
            print("Failed to create partition with "
                  "start=%(start)s, end=%(end)s "
                  "on %(disk_node)s reason: %(reason)s" % parms)
            sys.exit(1)
        # Report success only once parted has actually succeeded.
        print("Created partition from start=%(start)s MiB to end=%(end)s MiB"
              " on %(disk_node)s" % parms)

        # Set partition type to ceph journal
        # noncritical operation, it makes 'ceph-disk list' output correct info
        cmd = ['sgdisk',
               '--change-name={num}:ceph journal'.format(num=num),
               '--typecode={num}:{uuid}'.format(
                   num=num,
                   uuid=JOURNAL_UUID,
               ),
               disk_node]
        _, err, ret = command(cmd)
        if ret:
            print("WARNING: Failed to set partition name and typecode")

        used_space_mib = end


###########################
# Manage Journal Location #
###########################

OSD_PATH = "/var/lib/ceph/osd/"


def mount_data_partition(data_path, osdid):
    """Mount an OSD data partition and return the mounted path.

    The partition is mounted as xfs on OSD_PATH + "ceph-<osdid>" unless the
    output of 'mount' already lists it there. Exits the process with
    status 1 if mounting fails.
    """
    # Obtain the device node from the device path.
    data_node = device_path_to_device_node(data_path)

    mount_path = OSD_PATH + "ceph-" + str(osdid)
    output, _, _ = command(['mount'])
    # Match "<node> on <path> ..." exactly, so that neither a longer node
    # name (sdb1 vs sdb10) nor a longer OSD id (ceph-1 vs ceph-10) counts
    # as already mounted.
    regex = (r"^" + re.escape(data_node) + r"\s+on\s+" +
             re.escape(mount_path) + r"\s")
    if not re.search(regex, output, re.MULTILINE):
        cmd = ['mount', '-t', 'xfs', data_node, mount_path]
        _, _, ret = command(cmd)
        params = {"node": data_node, "path": mount_path}
        if ret:
            print("Failed to mount %(node)s to %(path)s, aborting" % params)
            sys.exit(1)
        print("Mounted %(node)s to %(path)s" % params)
    return mount_path


def is_location_correct(path, journal_path, osdid):
    """Check if location points to the correct device.

    :param path: mount point of the OSD data partition
    :param journal_path: device path of the expected journal partition
    :param osdid: unused
    :returns: True if <path>/journal resolves to the journal device node
    """
    # Obtain the device node from the device path.
    journal_node = device_path_to_device_node(journal_path)

    cur_node = os.path.realpath(path + "/journal")
    return cur_node == journal_node


def fix_location(mount_point, journal_path, osdid):
    """Move the journal to the new partition.

    Re-points the 'journal' symlink under mount_point to the by-partuuid
    link of the journal partition, records the uuid in 'journal_uuid',
    zeroes the beginning of the partition and runs 'ceph-osd --mkjournal'.
    Exits the process with status 1 if the partition uuid cannot be
    obtained or the symlink cannot be created.
    """
    # Obtain the device node from the device path.
    journal_node = device_path_to_device_node(journal_path)

    # Fix symlink
    path = mount_point + "/journal"  # 'journal' symlink path used by ceph-osd
    journal_uuid = get_partition_uuid(journal_node)
    if journal_uuid is None:
        # Without a uuid there is no by-partuuid link to point the journal to.
        print("Failed to get partition uuid of %s" % journal_node)
        sys.exit(1)
    new_target = DISK_BY_PARTUUID + journal_uuid
    params = {"path": path, "target": new_target}
    try:
        if os.path.lexists(path):
            os.unlink(path)  # delete the old symlink
        os.symlink(new_target, path)
        print("Symlink created: %(path)s -> %(target)s" % params)
    except OSError:
        print("Failed to create symlink: %(path)s -> %(target)s" % params)
        sys.exit(1)

    # Fix journal_uuid
    path = mount_point + "/journal_uuid"
    try:
        with open(path, 'w') as f:
            f.write(journal_uuid)
    except (IOError, OSError):
        # The operation is noncritical, it only makes 'ceph-disk list'
        # display complete output. We log and continue.
        params = {"path": path, "uuid": journal_uuid}
        print("WARNING: Failed to set uuid of %(path)s to %(uuid)s" % params)

    # Clean the journal partition
    # even if erasing the partition table, if another journal was present here
    # it's going to be reused. Journals are always bigger than 100MB.
    command(['dd', 'if=/dev/zero', 'of=%s' % journal_node,
             'bs=1M', 'count=100'])

    # Format the journal
    cmd = ['/usr/bin/ceph-osd', '-i', str(osdid),
           '--pid-file', '/var/run/ceph/osd.%s.pid' % osdid,
           '-c', '/etc/ceph/ceph.conf',
           '--cluster', 'ceph',
           '--mkjournal']
    _, err, ret = command(cmd)
    params = {"journal_node": journal_node,
              "osdid": osdid,
              "ret": ret,
              "reason": err}
    if not ret:
        print("Prepared new journal partition: %(journal_node)s "
              "for osd id: %(osdid)s" % params)
    else:
        print("Error initializing journal node: "
              "%(journal_node)s for osd id: %(osdid)s "
              "ceph-osd return code: %(ret)s reason: %(reason)s" % params)


########
# Main #
########

def main(argv):
    """Parse the command line and run the requested operation.

    :param argv: two items; the operation name ("partitions" or "location")
                 followed by a Python dict literal with its parameters
    """
    # parse and validate arguments
    err = False
    partitions = None
    location = None
    if len(argv) != 2:
        err = True
    elif argv[0] == "partitions":
        valid_keys = ['disk_path', 'journals']
        partitions = get_input(argv[1], valid_keys)
        if not partitions:
            err = True
        elif not isinstance(partitions['journals'], list):
            err = True
    elif argv[0] == "location":
        valid_keys = ['data_path', 'journal_path', 'osdid']
        location = get_input(argv[1], valid_keys)
        if not location:
            err = True
        elif not isinstance(location['osdid'], int):
            err = True
    else:
        err = True
    if err:
        print("Command intended for internal use only")
        sys.exit(-1)

    if partitions:
        # Recreate partitions only if the existing ones don't match input
        if not is_partitioning_correct(partitions['disk_path'],
                                       partitions['journals']):
            create_partitions(partitions['disk_path'], partitions['journals'])
        else:
            print("Partition table for %s is correct, "
                  "no need to repartition" %
                  device_path_to_device_node(partitions['disk_path']))
    elif location:
        # we need to have the data partition mounted & we can let it mounted
        mount_point = mount_data_partition(location['data_path'],
                                           location['osdid'])
        # Update journal location only if link point to another partition
        if not is_location_correct(mount_point,
                                   location['journal_path'],
                                   location['osdid']):
            print("Fixing journal location for "
                  "OSD id: %(id)s" % {"id": location['osdid']})
            fix_location(mount_point,
                         location['journal_path'],
                         location['osdid'])
        else:
            print("Journal location for %s is correct, "
                  "no need to change it" % location['data_path'])


# Only run the command line handling when executed as a script, so that the
# module can be imported without side effects.
if __name__ == "__main__":
    main(sys.argv[1:])