From c7a85335e6eff371985a02b9d91bd786c6bf522e Mon Sep 17 00:00:00 2001 From: Alex Kozyrev Date: Thu, 9 Aug 2018 14:01:56 -0400 Subject: [PATCH 01/26] Adding ptp4l and phc2sys processes to the common services list PTP consists of two running services on a system: ptp4l and phc2sys. Adding these two processes to the Engineering data collection tools. Host performance monitoring will take them into account after that. Change-Id: I26cbaeeba79a8a64f5b31e2fc439f333f249b5ef Story: 2002935 Task: 22922 Signed-off-by: Alex Kozyrev --- tools/engtools/hostdata-collectors/centos/build_srpm.data | 2 +- tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/engtools/hostdata-collectors/centos/build_srpm.data b/tools/engtools/hostdata-collectors/centos/build_srpm.data index 81d405878..ac9c374eb 100644 --- a/tools/engtools/hostdata-collectors/centos/build_srpm.data +++ b/tools/engtools/hostdata-collectors/centos/build_srpm.data @@ -1,2 +1,2 @@ SRC_DIR="scripts" -TIS_PATCH_VER=1 +TIS_PATCH_VER=2 diff --git a/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf b/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf index efe1b9b74..563bc2e07 100644 --- a/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf +++ b/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf @@ -74,4 +74,4 @@ STORAGE_SERVICE_LIST=ceph-mon ceph-osd ceph-manager ceph-rest-api RABBITMQ_QUEUE_LIST=notifications.info versioned_notifications.info [CommonServices] -COMMON_SERVICE_LIST=dnsmasq ceilometer-polling haproxy hwmond pmond rmond fsmond sw-patch-agent sysinv-agent syslog-ng hostwd iscsid io-monitor-manager acpid hbsClient logmgmt mtcClient mtcalarmd mtclogd sshd ntpd smartd sm sm-eru sm-watchdog sm-api ceilometer keyring cinder-rtstool tuned polkitd lldpd IPaddr2 dnsmasq systemd-udevd systemd-journald logrotate collectd +COMMON_SERVICE_LIST=dnsmasq ceilometer-polling haproxy hwmond pmond rmond fsmond sw-patch-agent sysinv-agent syslog-ng hostwd iscsid io-monitor-manager acpid hbsClient logmgmt mtcClient mtcalarmd mtclogd sshd ntpd ptp4l phc2sys smartd sm sm-eru sm-watchdog sm-api ceilometer keyring cinder-rtstool tuned polkitd lldpd IPaddr2 dnsmasq systemd-udevd systemd-journald logrotate collectd From c3f90f6ac3b3e6b467f80071971a9a916e6947d6 Mon Sep 17 00:00:00 2001 From: Bin Qian Date: Mon, 9 Jul 2018 09:28:29 -0400 Subject: [PATCH 02/26] SM REST API doc Include documentation of SM REST APIs in restapi-doc package Story: 2002827 Task: 22744 Change-Id: Ie6539dc923e73c5abecf96e712c997dff6944f34 Signed-off-by: Bin Qian --- .../restapi-doc/api-ref-guides/pom.xml | 14 +- .../src/bk-api-ref-smapi-v1.xml | 57 ++++ .../api-ref-guides/src/bk-api-ref.xml | 3 +- restapi-doc/restapi-doc/api-ref/pom.xml | 24 +- .../api-ref/src/docbkx/api-ref-smapi-v1.xml | 30 ++ .../api-ref/src/docbkx/ch_smapi-v1.xml | 104 ++++++ .../src/docbkx/itemizedlist-service-list.xml | 6 +- .../service_group_list-response.json | 81 +++++ .../service_group_show-response.json | 11 + .../v1/api_samples/service_list-response.json | 39 +++ .../service_node_list-response.json | 20 ++ .../service_node_show-response.json | 8 + .../service_parameter_list-response.json | 73 +++++ .../service_parameter_show-response.json | 20 ++ .../v1/api_samples/service_show-response.json | 7 + .../api_samples/smapi-versions-response.json | 24 ++ .../api_samples/versionv1-get-response.json | 59 ++++ .../api-ref/src/wadls/sm-api/v1/common.ent | 244 ++++++++++++++ 
.../src/wadls/sm-api/v1/sm-api-v1.wadl | 298 ++++++++++++++++++ 19 files changed, 1117 insertions(+), 5 deletions(-) create mode 100644 restapi-doc/restapi-doc/api-ref-guides/src/bk-api-ref-smapi-v1.xml create mode 100644 restapi-doc/restapi-doc/api-ref/src/docbkx/api-ref-smapi-v1.xml create mode 100644 restapi-doc/restapi-doc/api-ref/src/docbkx/ch_smapi-v1.xml create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/api_samples/service_group_list-response.json create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/api_samples/service_group_show-response.json create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/api_samples/service_list-response.json create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/api_samples/service_node_list-response.json create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/api_samples/service_node_show-response.json create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/api_samples/service_parameter_list-response.json create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/api_samples/service_parameter_show-response.json create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/api_samples/service_show-response.json create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/api_samples/smapi-versions-response.json create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/api_samples/versionv1-get-response.json create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/common.ent create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/sm-api-v1.wadl diff --git a/restapi-doc/restapi-doc/api-ref-guides/pom.xml b/restapi-doc/restapi-doc/api-ref-guides/pom.xml index 69e30281e..827be63ed 100644 --- a/restapi-doc/restapi-doc/api-ref-guides/pom.xml +++ b/restapi-doc/restapi-doc/api-ref-guides/pom.xml @@ -2,7 +2,7 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + + + + + + + +GET'> +PUT'> +POST'> +DELETE'> +]> + + + Service Management API v1 + API Reference + + + + + + + Wind River + + + + 2018 + Wind River + + Titanium Cloud + + + + Copyright details are filled in by the + template. + + + + + + diff --git a/restapi-doc/restapi-doc/api-ref-guides/src/bk-api-ref.xml b/restapi-doc/restapi-doc/api-ref-guides/src/bk-api-ref.xml index 09cec4d17..34ff32e85 100644 --- a/restapi-doc/restapi-doc/api-ref-guides/src/bk-api-ref.xml +++ b/restapi-doc/restapi-doc/api-ref-guides/src/bk-api-ref.xml @@ -1,6 +1,6 @@ diff --git a/restapi-doc/restapi-doc/api-ref/src/docbkx/api-ref-smapi-v1.xml b/restapi-doc/restapi-doc/api-ref/src/docbkx/api-ref-smapi-v1.xml new file mode 100644 index 000000000..ad1b3eed4 --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/docbkx/api-ref-smapi-v1.xml @@ -0,0 +1,30 @@ + + + + + Titanium Service Management API v1 + + 2018 + Wind River + + + + + + + + + + diff --git a/restapi-doc/restapi-doc/api-ref/src/docbkx/ch_smapi-v1.xml b/restapi-doc/restapi-doc/api-ref/src/docbkx/ch_smapi-v1.xml new file mode 100644 index 000000000..8c83ab76e --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/docbkx/ch_smapi-v1.xml @@ -0,0 +1,104 @@ + + + + SM API v1 + Interact with Service Management + The typical port used for the SM REST API is 7777. + However, proper technique would be to look up the smapi service endpoint in Keystone. + + + + + + +
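As a quick illustration of the endpoint lookup mentioned above (not part of this patch), a client could resolve the SM API URL from the Keystone catalog instead of hard-coding port 7777. The following is a minimal sketch using keystoneauth1; the auth URL and credentials are placeholders, and the service type 'smapi' is an assumption about how the service is registered:

    # Minimal sketch: resolve the SM API endpoint from the Keystone catalog
    # rather than assuming port 7777. All credentials below are placeholders.
    from keystoneauth1.identity import v3
    from keystoneauth1 import session

    auth = v3.Password(auth_url='http://controller:5000/v3',
                       username='admin', password='<password>',
                       project_name='admin',
                       user_domain_name='Default',
                       project_domain_name='Default')
    sess = session.Session(auth=auth)
    # 'smapi' is the assumed service type under which SM registers its endpoint
    sm_url = sess.get_endpoint(service_type='smapi', interface='internal')
    print(sm_url)  # e.g. http://<mgmt-ip>:7777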
+ API versions + + + + + + + + +
+ + + + + +
+ Services + These APIs allow the display of the running services + and their attributes. + + + + + + + + + + + + +
+ + + + + +
+ Service Nodes + These APIs allow the display of the service nodes + and their attributes. + + + + + + + + +
+ + + + + +
+ Service Groups + These APIs allow the display of the service groups + and their attributes. + + + + + + + + +
+ +
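The chapter above documents read-only queries for API versions, services, service nodes, and service groups. A hedged sketch of how a client might walk those collections follows; the base URL, token, and resource paths are illustrative assumptions rather than values taken verbatim from this patch (the authoritative paths are defined by the sm-api-v1.wadl added later in this change).

    # Illustrative only: endpoint, token and paths are assumptions; consult the
    # sm-api-v1.wadl added by this patch for the authoritative resource paths.
    import requests

    SM_API = 'http://127.0.0.1:7777'        # or the endpoint resolved from Keystone
    HEADERS = {'X-Auth-Token': '<token>'}   # placeholder token

    # List the supported API versions at the root resource.
    print(requests.get(SM_API + '/', headers=HEADERS).json())

    # Walk the assumed v1 collections described in the sections above.
    for path in ('/v1/services', '/v1/nodes', '/v1/service_groups'):
        resp = requests.get(SM_API + path, headers=HEADERS)
        resp.raise_for_status()
        print(resp.json())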
diff --git a/restapi-doc/restapi-doc/api-ref/src/docbkx/itemizedlist-service-list.xml b/restapi-doc/restapi-doc/api-ref/src/docbkx/itemizedlist-service-list.xml index 8a6fb0092..2a384c7e2 100644 --- a/restapi-doc/restapi-doc/api-ref/src/docbkx/itemizedlist-service-list.xml +++ b/restapi-doc/restapi-doc/api-ref/src/docbkx/itemizedlist-service-list.xml @@ -1,6 +1,6 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + '> + + + + + + '> + + + + + + '> + + + + + '> + + + + + + '> + + + + + + '> + + + + + + + The universally unique identifier for this object. + + + + + For convenience, resources contain links to themselves. + This allows a client to easily obtain rather than construct + resource URIs. The following types of link relations are + associated with resources: a self link containing a versioned + link to the resource, and a bookmark link containing a permanent + link to a resource that is appropriate for long term storage. + + + '> + + + + + + + Administrative state of the node. + + + + + The operational state of the node. + + + + + The name of the node. + + + + + The operational state of the node + + + + + The availability status of the node. + + + + + The id of the node. + + + '> + + + + + + The type of host that the service is running on. + + + + + The name of the service group. + + + + + The name of the node that the service is running on. + + + + + The state of the service. + + + + + The uuid of the service group. + + + '> + + + + + + + The operational state of the service. + + + + + The id of the service. + + + + + The desired state of the service + + + + + The name of the service. + + + + + The name of the host which the service is running on. + + + '> + + + + GET'> + PUT'> + POST'> + DELETE'> diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/sm-api-v1.wadl b/restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/sm-api-v1.wadl new file mode 100644 index 000000000..bce8aa1d4 --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/sm-api/v1/sm-api-v1.wadl @@ -0,0 +1,298 @@ + + + +%common;]> + + + + + + + + + + + + + + + + + + + + + + + + + + + The unique identifier of an existing service. + + + + + + + + + The name of an existing service. + + + + + + + + + + + + + + The unique identifier of an existing node. + + + + + + + + + + + + + + + The unique identifier of an existing service group. + + + + + + + + + + + + + + + + + + API version details. + + + + + + + + + + + + + + + + + Lists information about all Titanium Cloud SM API versions. + + + + + + + + + &commonFaults; &getFaults; + + + + + Shows details for SM API v1. + + + + + + + + + &commonFaults; &getFaults; + + + + + + + + + + List all services running. + + + + + + + + The list of services. + + + &serviceListShowParameters; + + + + + + + + &commonFaults; &getFaults; + + + + + + Shows the attributes of a specific service. + + + + + + &serviceListShowParameters; + + + + + + + + &commonFaults; &getFaults; + + + + + + + + + + List all controller nodes in the system. + + + + + + + + The list of controller nodes. + + + &serviceNodeListShowParameters; + + + + + + + + &commonFaults; &getFaults; + + + + + + Shows the attributes of a specific node. + + + + + + &serviceNodeListShowParameters; + + + + + + + + &commonFaults; &getFaults; + + + + + + + + + List all service groups in the system. + + + + + + + + The list of service groups. + + + &serviceGroupListShowParameters; + + + + + + + + &commonFaults; &getFaults; + + + + + + Shows the attributes of a specific service group. 
+ + + + + + &serviceGroupListShowParameters; + + + + + + + + &commonFaults; &getFaults; + + + From 21a02906a4e5fe9c863072f338d1183f7c939716 Mon Sep 17 00:00:00 2001 From: Scott Little Date: Mon, 13 Aug 2018 11:27:15 -0400 Subject: [PATCH 03/26] Relocate ceph to stx-integ/ceph/ceph Move content from stx-upstream to stx-integ Packages will be relocated to stx-integ: ceph/ ceph ceph-manager Change-Id: I94570e59993251d72bbbfb8e7b8d1eb0666868b6 Story: 2002801 Task: 22687 Signed-off-by: Scott Little --- ceph/ceph/centos/build_srpm.data | 5 + ceph/ceph/centos/ceph.spec | 1 + ceph/ceph/files/ceph-manage-journal.py | 326 +++++++++++++++++++++++++ 3 files changed, 332 insertions(+) create mode 100644 ceph/ceph/centos/build_srpm.data create mode 120000 ceph/ceph/centos/ceph.spec create mode 100644 ceph/ceph/files/ceph-manage-journal.py diff --git a/ceph/ceph/centos/build_srpm.data b/ceph/ceph/centos/build_srpm.data new file mode 100644 index 000000000..e65b1435f --- /dev/null +++ b/ceph/ceph/centos/build_srpm.data @@ -0,0 +1,5 @@ +SRC_DIR="$CGCS_BASE/git/ceph" +TIS_BASE_SRCREV=fc689aa5ded5941b8ae86374c7124c7d91782973 +TIS_PATCH_VER=GITREVCOUNT +BUILD_IS_BIG=40 +BUILD_IS_SLOW=26 diff --git a/ceph/ceph/centos/ceph.spec b/ceph/ceph/centos/ceph.spec new file mode 120000 index 000000000..5502d2f3f --- /dev/null +++ b/ceph/ceph/centos/ceph.spec @@ -0,0 +1 @@ +../../../../git/ceph/ceph.spec \ No newline at end of file diff --git a/ceph/ceph/files/ceph-manage-journal.py b/ceph/ceph/files/ceph-manage-journal.py new file mode 100644 index 000000000..b3312e0cb --- /dev/null +++ b/ceph/ceph/files/ceph-manage-journal.py @@ -0,0 +1,326 @@ +#!/usr/bin/python +# +# Copyright (c) 2016 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +import ast +import os +import os.path +import re +import subprocess +import sys + + +######### +# Utils # +######### + +def command(arguments, **kwargs): + """ Execute e command and capture stdout, stderr & return code """ + process = subprocess.Popen( + arguments, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + **kwargs) + out, err = process.communicate() + return out, err, process.returncode + + +def get_input(arg, valid_keys): + """Convert the input to a dict and perform basic validation""" + json_string = arg.replace("\\n", "\n") + try: + input_dict = ast.literal_eval(json_string) + if not all(k in input_dict for k in valid_keys): + return None + except Exception: + return None + + return input_dict + + +def get_partition_uuid(dev): + output, _, _ = command(['blkid', dev]) + try: + return re.search('PARTUUID=\"(.+?)\"', output).group(1) + except AttributeError: + return None + + +def device_path_to_device_node(device_path): + try: + output, _, _ = command(["udevadm", "settle", "-E", device_path]) + out, err, retcode = command(["readlink", "-f", device_path]) + out = out.rstrip() + except Exception as e: + return None + + return out + + +########################################### +# Manage Journal Disk Partitioning Scheme # +########################################### + +DISK_BY_PARTUUID = "/dev/disk/by-partuuid/" +JOURNAL_UUID='45b0969e-9b03-4f30-b4c6-b4b80ceff106' # Type of a journal partition + + +def is_partitioning_correct(disk_path, partition_sizes): + """ Validate the existence and size of journal partitions""" + + # Obtain the device node from the device path. 
+ disk_node = device_path_to_device_node(disk_path) + + # Check that partition table format is GPT + output, _, _ = command(["udevadm", "settle", "-E", disk_node]) + output, _, _ = command(["parted", "-s", disk_node, "print"]) + if not re.search('Partition Table: gpt', output): + print "Format of disk node %s is not GPT, zapping disk" % disk_node + return False + + # Check each partition size + partition_index = 1 + for size in partition_sizes: + # Check that each partition size matches the one in input + partition_node = disk_node + str(partition_index) + output, _, _ = command(["udevadm", "settle", "-E", partition_node]) + cmd = ["parted", "-s", partition_node, "unit", "MiB", "print"] + output, _, _ = command(cmd) + + regex = ("^Disk " + str(partition_node) + ":\\s*" + + str(size) + "[\\.0]*MiB") + if not re.search(regex, output, re.MULTILINE): + print ("Journal partition %(node)s size is not %(size)s, " + "zapping disk" % {"node": partition_node, "size": size}) + return False + + partition_index += 1 + + output, _, _ = command(["udevadm", "settle", "-t", "10"]) + return True + + +def create_partitions(disk_path, partition_sizes): + """ Recreate partitions """ + + # Obtain the device node from the device path. + disk_node = device_path_to_device_node(disk_path) + + # Issue: After creating a new partition table on a device, Udev does not + # always remove old symlinks (i.e. to previous partitions on that device). + # Also, even if links are erased before zapping the disk, some of them will + # be recreated even though there is no partition to back them! + # Therefore, we have to remove the links AFTER we erase the partition table + # Issue: DISK_BY_PARTUUID directory is not present at all if there are no + # GPT partitions on the storage node so nothing to remove in this case + links = [] + if os.path.isdir(DISK_BY_PARTUUID): + links = [ os.path.join(DISK_BY_PARTUUID,l) for l in os.listdir(DISK_BY_PARTUUID) + if os.path.islink(os.path.join(DISK_BY_PARTUUID, l)) ] + + # Erase all partitions on current node by creating a new GPT table + _, err, ret = command(["parted", "-s", disk_node, "mktable", "gpt"]) + if ret: + print ("Error erasing partition table of %(node)s\n" + "Return code: %(ret)s reason: %(reason)s" % + {"node": disk_node, "ret": ret, "reason": err}) + exit(1) + + # Erase old symlinks + for l in links: + if disk_node in os.path.realpath(l): + os.remove(l) + + # Create partitions in order + used_space_mib = 1 # leave 1 MB at the beginning of the disk + num = 1 + for size in partition_sizes: + cmd = ['parted', '-s', disk_node, 'unit', 'mib', + 'mkpart', 'primary', + str(used_space_mib), str(used_space_mib + size)] + _, err, ret = command(cmd) + parms = {"disk_node": disk_node, + "start": used_space_mib, + "end": used_space_mib + size, + "reason": err} + print ("Created partition from start=%(start)s MiB to end=%(end)s MiB" + " on %(disk_node)s" % parms) + if ret: + print ("Failed to create partition with " + "start=%(start)s, end=%(end)s " + "on %(disk_node)s reason: %(reason)s" % parms) + exit(1) + # Set partition type to ceph journal + # noncritical operation, it makes 'ceph-disk list' output correct info + cmd = ['sgdisk', + '--change-name={num}:ceph journal'.format(num=num), + '--typecode={num}:{uuid}'.format( + num=num, + uuid=JOURNAL_UUID, + ), + disk_node] + _, err, ret = command(cmd) + if ret: + print ("WARNINIG: Failed to set partition name and typecode") + used_space_mib += size + num += 1 + +########################### +# Manage Journal Location # 
+########################### + +OSD_PATH = "/var/lib/ceph/osd/" + + +def mount_data_partition(data_path, osdid): + """ Mount an OSD data partition and return the mounted path """ + + # Obtain the device node from the device path. + data_node = device_path_to_device_node(data_path) + + mount_path = OSD_PATH + "ceph-" + str(osdid) + output, _, _ = command(['mount']) + regex = "^" + data_node + ".*" + mount_path + if not re.search(regex, output, re.MULTILINE): + cmd = ['mount', '-t', 'xfs', data_node, mount_path] + _, _, ret = command(cmd) + params = {"node": data_node, "path": mount_path} + if ret: + print "Failed to mount %(node)s to %(path), aborting" % params + exit(1) + else: + print "Mounted %(node)s to %(path)s" % params + return mount_path + + +def is_location_correct(path, journal_path, osdid): + """ Check if location points to the correct device """ + + # Obtain the device node from the device path. + journal_node = device_path_to_device_node(journal_path) + + cur_node = os.path.realpath(path + "/journal") + if cur_node == journal_node: + return True + else: + return False + + +def fix_location(mount_point, journal_path, osdid): + """ Move the journal to the new partition """ + + # Obtain the device node from the device path. + journal_node = device_path_to_device_node(journal_path) + + # Fix symlink + path = mount_point + "/journal" # 'journal' symlink path used by ceph-osd + journal_uuid = get_partition_uuid(journal_node) + new_target = DISK_BY_PARTUUID + journal_uuid + params = {"path": path, "target": new_target} + try: + if os.path.lexists(path): + os.unlink(path) # delete the old symlink + os.symlink(new_target, path) + print "Symlink created: %(path)s -> %(target)s" % params + except: + print "Failed to create symlink: %(path)s -> %(target)s" % params + exit(1) + # Fix journal_uuid + path = mount_point + "/journal_uuid" + try: + with open(path, 'w') as f: + f.write(journal_uuid) + except Exception as ex: + # The operation is noncritical, it only makes 'ceph-disk list' + # display complete output. We log and continue. + params = {"path": path, "uuid": journal_uuid} + print "WARNING: Failed to set uuid of %(path)s to %(uuid)s" % params + + # Clean the journal partition + # even if erasing the partition table, if another journal was present here + # it's going to be reused. Journals are always bigger than 100MB. 
+ command(['dd', 'if=/dev/zero', 'of=%s' % journal_node, + 'bs=1M', 'count=100']) + + # Format the journal + cmd = ['/usr/bin/ceph-osd', '-i', str(osdid), + '--pid-file', '/var/run/ceph/osd.%s.pid' % osdid, + '-c', '/etc/ceph/ceph.conf', + '--cluster', 'ceph', + '--mkjournal'] + out, err, ret = command(cmd) + params = {"journal_node": journal_node, + "osdid": osdid, + "ret": ret, + "reason": err} + if not ret: + print ("Prepared new journal partition: %(journal_node)s " + "for osd id: %(osdid)s") % params + else: + print ("Error initializing journal node: " + "%(journal_node)s for osd id: %(osdid)s " + "ceph-osd return code: %(ret)s reason: %(reason)s" % params) + + +######## +# Main # +######## + +def main(argv): + # parse and validate arguments + err = False + partitions = None + location = None + if len(argv) != 2: + err = True + elif argv[0] == "partitions": + valid_keys = ['disk_path', 'journals'] + partitions = get_input(argv[1], valid_keys) + if not partitions: + err = True + elif not isinstance(partitions['journals'], list): + err = True + elif argv[0] == "location": + valid_keys = ['data_path', 'journal_path', 'osdid'] + location = get_input(argv[1], valid_keys) + if not location: + err = True + elif not isinstance(location['osdid'], int): + err = True + else: + err = True + if err: + print "Command intended for internal use only" + exit(-1) + + if partitions: + # Recreate partitions only if the existing ones don't match input + if not is_partitioning_correct(partitions['disk_path'], + partitions['journals']): + create_partitions(partitions['disk_path'], partitions['journals']) + else: + print ("Partition table for %s is correct, " + "no need to repartition" % + device_path_to_device_node(partitions['disk_path'])) + elif location: + # we need to have the data partition mounted & we can let it mounted + mount_point = mount_data_partition(location['data_path'], + location['osdid']) + # Update journal location only if link point to another partition + if not is_location_correct(mount_point, + location['journal_path'], + location['osdid']): + print ("Fixing journal location for " + "OSD id: %(id)s" % {"node": location['data_path'], + "id": location['osdid']}) + fix_location(mount_point, + location['journal_path'], + location['osdid']) + else: + print ("Journal location for %s is correct," + "no need to change it" % location['data_path']) + +main(sys.argv[1:]) From d4a5366f533b17e84302e79d1eef376a88c93f48 Mon Sep 17 00:00:00 2001 From: Erich Cordoba Date: Mon, 18 Jun 2018 11:34:49 -0500 Subject: [PATCH 04/26] Update TIS_BASE_SRCREV missing references Change-Id: I0378b337ed0613d1b5e9b0d46790688594730b91 Signed-off-by: Erich Cordoba Signed-off-by: Scott Little --- ceph/ceph/centos/build_srpm.data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ceph/ceph/centos/build_srpm.data b/ceph/ceph/centos/build_srpm.data index e65b1435f..ca131ddd5 100644 --- a/ceph/ceph/centos/build_srpm.data +++ b/ceph/ceph/centos/build_srpm.data @@ -1,5 +1,5 @@ SRC_DIR="$CGCS_BASE/git/ceph" -TIS_BASE_SRCREV=fc689aa5ded5941b8ae86374c7124c7d91782973 +TIS_BASE_SRCREV=3f07f7ff1a5c7bfa8d0de12c966594d5fb7cf4ec TIS_PATCH_VER=GITREVCOUNT BUILD_IS_BIG=40 BUILD_IS_SLOW=26 From 557dfe686c85c89c00e87b464dc6746feba7718b Mon Sep 17 00:00:00 2001 From: Scott Little Date: Mon, 13 Aug 2018 11:39:37 -0400 Subject: [PATCH 05/26] Relocate ceph-manager to stx-integ/ceph/ceph-manager Move content from stx-upstream to stx-integ Packages will be relocated to stx-integ: ceph/ ceph ceph-manager Change-Id: 
I129faa448e2e52fc82101ae7ebc8ad5688f21523 Story: 2002801 Task: 22687 Signed-off-by: Scott Little --- ceph/ceph-manager/.gitignore | 6 + ceph/ceph-manager/LICENSE | 202 ++++ ceph/ceph-manager/PKG-INFO | 13 + ceph/ceph-manager/centos/build_srpm.data | 3 + ceph/ceph-manager/centos/ceph-manager.spec | 70 ++ ceph/ceph-manager/ceph-manager/LICENSE | 202 ++++ .../ceph-manager/ceph_manager/__init__.py | 5 + .../ceph_manager/cache_tiering.py | 705 ++++++++++++++ .../ceph-manager/ceph_manager/ceph.py | 164 ++++ .../ceph-manager/ceph_manager/constants.py | 107 +++ .../ceph-manager/ceph_manager/exception.py | 130 +++ .../ceph-manager/ceph_manager/i18n.py | 15 + .../ceph-manager/ceph_manager/monitor.py | 893 ++++++++++++++++++ .../ceph-manager/ceph_manager/server.py | 249 +++++ .../ceph_manager/tests/__init__.py | 0 .../ceph_manager/tests/test_cache_flush.py | 309 ++++++ ceph/ceph-manager/ceph-manager/setup.py | 19 + .../ceph-manager/test-requirements.txt | 10 + ceph/ceph-manager/ceph-manager/tox.ini | 29 + .../ceph-manager/files/ceph-manager.logrotate | 11 + ceph/ceph-manager/files/ceph-manager.service | 17 + ceph/ceph-manager/scripts/bin/ceph-manager | 17 + ceph/ceph-manager/scripts/init.d/ceph-manager | 103 ++ 23 files changed, 3279 insertions(+) create mode 100644 ceph/ceph-manager/.gitignore create mode 100644 ceph/ceph-manager/LICENSE create mode 100644 ceph/ceph-manager/PKG-INFO create mode 100644 ceph/ceph-manager/centos/build_srpm.data create mode 100644 ceph/ceph-manager/centos/ceph-manager.spec create mode 100644 ceph/ceph-manager/ceph-manager/LICENSE create mode 100644 ceph/ceph-manager/ceph-manager/ceph_manager/__init__.py create mode 100644 ceph/ceph-manager/ceph-manager/ceph_manager/cache_tiering.py create mode 100644 ceph/ceph-manager/ceph-manager/ceph_manager/ceph.py create mode 100644 ceph/ceph-manager/ceph-manager/ceph_manager/constants.py create mode 100644 ceph/ceph-manager/ceph-manager/ceph_manager/exception.py create mode 100644 ceph/ceph-manager/ceph-manager/ceph_manager/i18n.py create mode 100644 ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py create mode 100644 ceph/ceph-manager/ceph-manager/ceph_manager/server.py create mode 100644 ceph/ceph-manager/ceph-manager/ceph_manager/tests/__init__.py create mode 100644 ceph/ceph-manager/ceph-manager/ceph_manager/tests/test_cache_flush.py create mode 100644 ceph/ceph-manager/ceph-manager/setup.py create mode 100644 ceph/ceph-manager/ceph-manager/test-requirements.txt create mode 100644 ceph/ceph-manager/ceph-manager/tox.ini create mode 100644 ceph/ceph-manager/files/ceph-manager.logrotate create mode 100644 ceph/ceph-manager/files/ceph-manager.service create mode 100644 ceph/ceph-manager/scripts/bin/ceph-manager create mode 100644 ceph/ceph-manager/scripts/init.d/ceph-manager diff --git a/ceph/ceph-manager/.gitignore b/ceph/ceph-manager/.gitignore new file mode 100644 index 000000000..78868598f --- /dev/null +++ b/ceph/ceph-manager/.gitignore @@ -0,0 +1,6 @@ +!.distro +.distro/centos7/rpmbuild/RPMS +.distro/centos7/rpmbuild/SRPMS +.distro/centos7/rpmbuild/BUILD +.distro/centos7/rpmbuild/BUILDROOT +.distro/centos7/rpmbuild/SOURCES/ceph-manager*tar.gz diff --git a/ceph/ceph-manager/LICENSE b/ceph/ceph-manager/LICENSE new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/ceph/ceph-manager/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/ceph/ceph-manager/PKG-INFO b/ceph/ceph-manager/PKG-INFO new file mode 100644 index 000000000..5b6746d87 --- /dev/null +++ b/ceph/ceph-manager/PKG-INFO @@ -0,0 +1,13 @@ +Metadata-Version: 1.1 +Name: ceph-manager +Version: 1.0 +Summary: Handle Ceph API calls and provide status updates via alarms +Home-page: +Author: Windriver +Author-email: info@windriver.com +License: Apache-2.0 + +Description: Handle Ceph API calls and provide status updates via alarms + + +Platform: UNKNOWN diff --git a/ceph/ceph-manager/centos/build_srpm.data b/ceph/ceph-manager/centos/build_srpm.data new file mode 100644 index 000000000..d01510bde --- /dev/null +++ b/ceph/ceph-manager/centos/build_srpm.data @@ -0,0 +1,3 @@ +SRC_DIR="ceph-manager" +COPY_LIST_TO_TAR="files scripts" +TIS_PATCH_VER=4 diff --git a/ceph/ceph-manager/centos/ceph-manager.spec b/ceph/ceph-manager/centos/ceph-manager.spec new file mode 100644 index 000000000..2f54deb5f --- /dev/null +++ b/ceph/ceph-manager/centos/ceph-manager.spec @@ -0,0 +1,70 @@ +Summary: Handle Ceph API calls and provide status updates via alarms +Name: ceph-manager +Version: 1.0 +Release: %{tis_patch_ver}%{?_tis_dist} +License: Apache-2.0 +Group: base +Packager: Wind River +URL: unknown +Source0: %{name}-%{version}.tar.gz + +BuildRequires: python-setuptools +BuildRequires: systemd-units +BuildRequires: systemd-devel +Requires: sysinv + +%description +Handle Ceph API calls and provide status updates via alarms. +Handle sysinv RPC calls for long running Ceph API operations: +- cache tiering enable +- cache tiering disable + +%define local_bindir /usr/bin/ +%define local_etc_initd /etc/init.d/ +%define local_etc_logrotated /etc/logrotate.d/ +%define pythonroot /usr/lib64/python2.7/site-packages + +%define debug_package %{nil} + +%prep +%setup + +%build +%{__python} setup.py build + +%install +%{__python} setup.py install --root=$RPM_BUILD_ROOT \ + --install-lib=%{pythonroot} \ + --prefix=/usr \ + --install-data=/usr/share \ + --single-version-externally-managed + +install -d -m 755 %{buildroot}%{local_etc_initd} +install -p -D -m 700 scripts/init.d/ceph-manager %{buildroot}%{local_etc_initd}/ceph-manager + +install -d -m 755 %{buildroot}%{local_bindir} +install -p -D -m 700 scripts/bin/ceph-manager %{buildroot}%{local_bindir}/ceph-manager + +install -d -m 755 %{buildroot}%{local_etc_logrotated} +install -p -D -m 644 files/ceph-manager.logrotate %{buildroot}%{local_etc_logrotated}/ceph-manager.logrotate + +install -d -m 755 %{buildroot}%{_unitdir} +install -m 644 -p -D files/%{name}.service %{buildroot}%{_unitdir}/%{name}.service + +%clean +rm -rf $RPM_BUILD_ROOT + +# Note: The package name is ceph-manager but the import name is ceph_manager so +# can't use '%{name}'. +%files +%defattr(-,root,root,-) +%doc LICENSE +%{local_bindir}/* +%{local_etc_initd}/* +%{_unitdir}/%{name}.service +%dir %{local_etc_logrotated} +%{local_etc_logrotated}/* +%dir %{pythonroot}/ceph_manager +%{pythonroot}/ceph_manager/* +%dir %{pythonroot}/ceph_manager-%{version}.0-py2.7.egg-info +%{pythonroot}/ceph_manager-%{version}.0-py2.7.egg-info/* diff --git a/ceph/ceph-manager/ceph-manager/LICENSE b/ceph/ceph-manager/ceph-manager/LICENSE new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/ceph/ceph-manager/ceph-manager/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
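One packaging detail from the ceph-manager.spec above is worth keeping in mind when reading the sources that follow: the RPM is named ceph-manager, but the installed Python package is ceph_manager (underscore), placed under the spec's %{pythonroot} (/usr/lib64/python2.7/site-packages). A trivial, hypothetical post-install sanity check:

    # Hypothetical check after installing the ceph-manager RPM:
    # the importable package uses an underscore, not a hyphen.
    import ceph_manager
    print(ceph_manager.__file__)  # expected under /usr/lib64/python2.7/site-packages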
diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/__init__.py b/ceph/ceph-manager/ceph-manager/ceph_manager/__init__.py new file mode 100644 index 000000000..754a8f4ef --- /dev/null +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/__init__.py @@ -0,0 +1,5 @@ +# +# Copyright (c) 2016 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/cache_tiering.py b/ceph/ceph-manager/ceph-manager/ceph_manager/cache_tiering.py new file mode 100644 index 000000000..4e814c3b0 --- /dev/null +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/cache_tiering.py @@ -0,0 +1,705 @@ +# +# Copyright (c) 2016 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +import copy +import contextlib +import functools +import math +import subprocess +import time +import traceback +# noinspection PyUnresolvedReferences +import eventlet +# noinspection PyUnresolvedReferences +from eventlet.semaphore import Semaphore +# noinspection PyUnresolvedReferences +from oslo_log import log as logging +# noinspection PyUnresolvedReferences +from sysinv.conductor.cache_tiering_service_config import ServiceConfig + +from i18n import _LI, _LW, _LE + +import constants +import exception +import ceph + +LOG = logging.getLogger(__name__) +CEPH_POOLS = copy.deepcopy(constants.CEPH_POOLS) + +MAX_WAIT = constants.CACHE_FLUSH_MAX_WAIT_OBJ_COUNT_DECREASE_SEC +MIN_WAIT = constants.CACHE_FLUSH_MIN_WAIT_OBJ_COUNT_DECREASE_SEC + + +class LockOwnership(object): + def __init__(self, sem): + self.sem = sem + + @contextlib.contextmanager + def __call__(self): + try: + yield + finally: + if self.sem: + self.sem.release() + + def transfer(self): + new_lo = LockOwnership(self.sem) + self.sem = None + return new_lo + + +class Lock(object): + + def __init__(self): + self.sem = Semaphore(value=1) + + def try_lock(self): + result = self.sem.acquire(blocking=False) + if result: + return LockOwnership(self.sem) + + +class CacheTiering(object): + + def __init__(self, service): + self.service = service + self.lock = Lock() + # will be unlocked by set_initial_config() + self._init_config_lock = self.lock.try_lock() + self.config = None + self.config_desired = None + self.config_applied = None + self.target_max_bytes = {} + + def set_initial_config(self, config): + with self._init_config_lock(): + LOG.info("Setting Ceph cache tiering initial configuration") + self.config = ServiceConfig.from_dict( + config.get(constants.CACHE_TIERING, {})) or \ + ServiceConfig() + self.config_desired = ServiceConfig.from_dict( + config.get(constants.CACHE_TIERING_DESIRED, {})) or \ + ServiceConfig() + self.config_applied = ServiceConfig.from_dict( + config.get(constants.CACHE_TIERING_APPLIED, {})) or \ + ServiceConfig() + if self.config_desired: + LOG.debug("set_initial_config config_desired %s " % + self.config_desired.to_dict()) + if self.config_applied: + LOG.debug("set_initial_config config_applied %s " % + self.config_applied.to_dict()) + + # Check that previous caching tier operation completed + # successfully or perform recovery + if (self.config_desired and + self.config_applied and + (self.config_desired.cache_enabled != + self.config_applied.cache_enabled)): + if self.config_desired.cache_enabled: + self.enable_cache(self.config_desired.to_dict(), + self.config_applied.to_dict(), + self._init_config_lock.transfer()) + else: + self.disable_cache(self.config_desired.to_dict(), + self.config_applied.to_dict(), + self._init_config_lock.transfer()) + + def is_locked(self): + 
lock_ownership = self.lock.try_lock() + if not lock_ownership: + return True + with lock_ownership(): + return False + + def update_pools_info(self): + global CEPH_POOLS + cfg = self.service.sysinv_conductor.call( + {}, 'get_ceph_pools_config') + CEPH_POOLS = copy.deepcopy(cfg) + LOG.info(_LI("update_pools_info: pools: {}").format(CEPH_POOLS)) + + def enable_cache(self, new_config, applied_config, lock_ownership=None): + new_config = ServiceConfig.from_dict(new_config) + applied_config = ServiceConfig.from_dict(applied_config) + if not lock_ownership: + lock_ownership = self.lock.try_lock() + if not lock_ownership: + raise exception.CephCacheEnableFailure() + with lock_ownership(): + eventlet.spawn(self.do_enable_cache, + new_config, applied_config, + lock_ownership.transfer()) + + def do_enable_cache(self, new_config, applied_config, lock_ownership): + LOG.info(_LI("cache_tiering_enable_cache: " + "new_config={}, applied_config={}").format( + new_config.to_dict(), applied_config.to_dict())) + _unwind_actions = [] + with lock_ownership(): + success = False + _exception = None + try: + self.config_desired.cache_enabled = True + self.update_pools_info() + for pool in CEPH_POOLS: + if (pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or + pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): + object_pool_name = \ + self.service.monitor._get_object_pool_name() + pool['pool_name'] = object_pool_name + + self.cache_pool_create(pool) + _unwind_actions.append( + functools.partial(self.cache_pool_delete, pool)) + for pool in CEPH_POOLS: + if (pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or + pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): + object_pool_name = \ + self.service.monitor._get_object_pool_name() + pool['pool_name'] = object_pool_name + + self.cache_tier_add(pool) + _unwind_actions.append( + functools.partial(self.cache_tier_remove, pool)) + for pool in CEPH_POOLS: + if (pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or + pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): + object_pool_name = \ + self.service.monitor._get_object_pool_name() + pool['pool_name'] = object_pool_name + + self.cache_mode_set(pool, 'writeback') + self.cache_pool_set_config(pool, new_config) + self.cache_overlay_create(pool) + success = True + except Exception as e: + LOG.error(_LE('Failed to enable cache: reason=%s') % + traceback.format_exc()) + for action in reversed(_unwind_actions): + try: + action() + except Exception: + LOG.warn(_LW('Failed cache enable ' + 'unwind action: reason=%s') % + traceback.format_exc()) + success = False + _exception = str(e) + finally: + self.service.monitor.monitor_check_cache_tier(success) + if success: + self.config_applied.cache_enabled = True + self.service.sysinv_conductor.call( + {}, 'cache_tiering_enable_cache_complete', + success=success, exception=_exception, + new_config=new_config.to_dict(), + applied_config=applied_config.to_dict()) + # Run first update of periodic target_max_bytes + self.update_cache_target_max_bytes() + + @contextlib.contextmanager + def ignore_ceph_failure(self): + try: + yield + except exception.CephManagerException: + pass + + def disable_cache(self, new_config, applied_config, lock_ownership=None): + new_config = ServiceConfig.from_dict(new_config) + applied_config = ServiceConfig.from_dict(applied_config) + if not lock_ownership: + lock_ownership = self.lock.try_lock() + if not lock_ownership: + raise 
exception.CephCacheDisableFailure() + with lock_ownership(): + eventlet.spawn(self.do_disable_cache, + new_config, applied_config, + lock_ownership.transfer()) + + def do_disable_cache(self, new_config, applied_config, lock_ownership): + LOG.info(_LI("cache_tiering_disable_cache: " + "new_config={}, applied_config={}").format( + new_config, applied_config)) + with lock_ownership(): + success = False + _exception = None + try: + self.config_desired.cache_enabled = False + for pool in CEPH_POOLS: + if (pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or + pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): + object_pool_name = \ + self.service.monitor._get_object_pool_name() + pool['pool_name'] = object_pool_name + + with self.ignore_ceph_failure(): + self.cache_mode_set( + pool, 'forward') + + for pool in CEPH_POOLS: + if (pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or + pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): + object_pool_name = \ + self.service.monitor._get_object_pool_name() + pool['pool_name'] = object_pool_name + + retries_left = 3 + while True: + try: + self.cache_flush(pool) + break + except exception.CephCacheFlushFailure: + retries_left -= 1 + if not retries_left: + # give up + break + else: + time.sleep(1) + for pool in CEPH_POOLS: + if (pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or + pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): + object_pool_name = \ + self.service.monitor._get_object_pool_name() + pool['pool_name'] = object_pool_name + + with self.ignore_ceph_failure(): + self.cache_overlay_delete(pool) + self.cache_tier_remove(pool) + for pool in CEPH_POOLS: + if (pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or + pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): + object_pool_name = \ + self.service.monitor._get_object_pool_name() + pool['pool_name'] = object_pool_name + + with self.ignore_ceph_failure(): + self.cache_pool_delete(pool) + success = True + except Exception as e: + LOG.warn(_LE('Failed to disable cache: reason=%s') % + traceback.format_exc()) + _exception = str(e) + finally: + self.service.monitor.monitor_check_cache_tier(False) + if success: + self.config_desired.cache_enabled = False + self.config_applied.cache_enabled = False + self.service.sysinv_conductor.call( + {}, 'cache_tiering_disable_cache_complete', + success=success, exception=_exception, + new_config=new_config.to_dict(), + applied_config=applied_config.to_dict()) + + def get_pool_pg_num(self, pool_name): + return self.service.sysinv_conductor.call( + {}, 'get_pool_pg_num', + pool_name=pool_name) + + def cache_pool_create(self, pool): + backing_pool = pool['pool_name'] + cache_pool = backing_pool + '-cache' + pg_num = self.get_pool_pg_num(cache_pool) + if not ceph.osd_pool_exists(self.service.ceph_api, cache_pool): + ceph.osd_pool_create( + self.service.ceph_api, cache_pool, + pg_num, pg_num) + + def cache_pool_delete(self, pool): + cache_pool = pool['pool_name'] + '-cache' + ceph.osd_pool_delete( + self.service.ceph_api, cache_pool) + + def cache_tier_add(self, pool): + backing_pool = pool['pool_name'] + cache_pool = backing_pool + '-cache' + response, body = self.service.ceph_api.osd_tier_add( + backing_pool, cache_pool, + force_nonempty="--force-nonempty", + body='json') + if response.ok: + LOG.info(_LI("Added OSD tier: " + "backing_pool={}, cache_pool={}").format( + backing_pool, cache_pool)) + else: + 
e = exception.CephPoolAddTierFailure( + backing_pool=backing_pool, + cache_pool=cache_pool, + response_status_code=response.status_code, + response_reason=response.reason, + status=body.get('status'), + output=body.get('output')) + LOG.warn(e) + raise e + + def cache_tier_remove(self, pool): + backing_pool = pool['pool_name'] + cache_pool = backing_pool + '-cache' + response, body = self.service.ceph_api.osd_tier_remove( + backing_pool, cache_pool, body='json') + if response.ok: + LOG.info(_LI("Removed OSD tier: " + "backing_pool={}, cache_pool={}").format( + backing_pool, cache_pool)) + else: + e = exception.CephPoolRemoveTierFailure( + backing_pool=backing_pool, + cache_pool=cache_pool, + response_status_code=response.status_code, + response_reason=response.reason, + status=body.get('status'), + output=body.get('output')) + LOG.warn(e) + raise e + + def cache_mode_set(self, pool, mode): + backing_pool = pool['pool_name'] + cache_pool = backing_pool + '-cache' + response, body = self.service.ceph_api.osd_tier_cachemode( + cache_pool, mode, body='json') + if response.ok: + LOG.info(_LI("Set OSD tier cache mode: " + "cache_pool={}, mode={}").format(cache_pool, mode)) + else: + e = exception.CephCacheSetModeFailure( + cache_pool=cache_pool, + mode=mode, + response_status_code=response.status_code, + response_reason=response.reason, + status=body.get('status'), + output=body.get('output')) + LOG.warn(e) + raise e + + def cache_pool_set_config(self, pool, config): + for name, value in config.params.iteritems(): + self.cache_pool_set_param(pool, name, value) + + def cache_pool_set_param(self, pool, name, value): + backing_pool = pool['pool_name'] + cache_pool = backing_pool + '-cache' + ceph.osd_set_pool_param( + self.service.ceph_api, cache_pool, name, value) + + def cache_overlay_create(self, pool): + backing_pool = pool['pool_name'] + cache_pool = backing_pool + '-cache' + response, body = self.service.ceph_api.osd_tier_set_overlay( + backing_pool, cache_pool, body='json') + if response.ok: + LOG.info(_LI("Set OSD tier overlay: " + "backing_pool={}, cache_pool={}").format( + backing_pool, cache_pool)) + else: + e = exception.CephCacheCreateOverlayFailure( + backing_pool=backing_pool, + cache_pool=cache_pool, + response_status_code=response.status_code, + response_reason=response.reason, + status=body.get('status'), + output=body.get('output')) + LOG.warn(e) + raise e + + def cache_overlay_delete(self, pool): + backing_pool = pool['pool_name'] + cache_pool = pool['pool_name'] + response, body = self.service.ceph_api.osd_tier_remove_overlay( + backing_pool, body='json') + if response.ok: + LOG.info(_LI("Removed OSD tier overlay: " + "backing_pool={}").format(backing_pool)) + else: + e = exception.CephCacheDeleteOverlayFailure( + backing_pool=backing_pool, + cache_pool=cache_pool, + response_status_code=response.status_code, + response_reason=response.reason, + status=body.get('status'), + output=body.get('output')) + LOG.warn(e) + raise e + + @staticmethod + def rados_cache_flush_evict_all(pool): + backing_pool = pool['pool_name'] + cache_pool = backing_pool + '-cache' + try: + subprocess.check_call( + ['/usr/bin/rados', '-p', cache_pool, 'cache-flush-evict-all']) + LOG.info(_LI("Flushed OSD cache pool:" + "cache_pool={}").format(cache_pool)) + except subprocess.CalledProcessError as e: + _e = exception.CephCacheFlushFailure( + cache_pool=cache_pool, + return_code=str(e.returncode), + cmd=" ".join(e.cmd), + output=e.output) + LOG.warn(_e) + raise _e + + def cache_flush(self, pool): + 
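        # Flush strategy used below: temporarily drop target_max_objects to 1
        # so Ceph starts evicting objects from the cache pool, then poll
        # 'ceph df' until the cache pool's object count falls under
        # CACHE_FLUSH_OBJECTS_THRESHOLD. The wait interval doubles (up to
        # MAX_WAIT) while the count stops decreasing, and the loop gives up
        # if the count grows; in every case rados cache-flush-evict-all is
        # run at the end to drain whatever objects are left.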
backing_pool = pool['pool_name'] + cache_pool = backing_pool + '-cache' + try: + # set target_max_objects to a small value to force evacuation of + # objects from cache before we use rados cache-flush-evict-all + # WARNING: assuming cache_pool will be deleted after flush so + # we don't have to save/restore the value of target_max_objects + # + self.cache_pool_set_param(pool, 'target_max_objects', 1) + prev_object_count = None + wait_interval = MIN_WAIT + while True: + response, body = self.service.ceph_api.df(body='json') + if not response.ok: + LOG.warn(_LW( + "Failed to retrieve cluster free space stats: " + "status_code=%d, reason=%s") % ( + response.status_code, response.reason)) + break + stats = None + for s in body['output']['pools']: + if s['name'] == cache_pool: + stats = s['stats'] + break + if not stats: + LOG.warn(_LW("Missing pool free space stats: " + "cache_pool=%s") % cache_pool) + break + object_count = stats['objects'] + if object_count < constants.CACHE_FLUSH_OBJECTS_THRESHOLD: + break + if prev_object_count is not None: + delta_objects = object_count - prev_object_count + if delta_objects > 0: + LOG.warn(_LW("Unexpected increase in number " + "of objects in cache pool: " + "cache_pool=%s, prev_object_count=%d, " + "object_count=%d") % ( + cache_pool, prev_object_count, + object_count)) + break + if delta_objects == 0: + wait_interval *= 2 + if wait_interval > MAX_WAIT: + LOG.warn(_LW( + "Cache pool number of objects did not " + "decrease: cache_pool=%s, object_count=%d, " + "wait_interval=%d") % ( + cache_pool, object_count, wait_interval)) + break + else: + wait_interval = MIN_WAIT + time.sleep(wait_interval) + prev_object_count = object_count + except exception.CephPoolSetParamFailure as e: + LOG.warn(e) + finally: + self.rados_cache_flush_evict_all(pool) + + def update_cache_target_max_bytes(self): + "Dynamically compute target_max_bytes of caching pools" + + # Only compute if cache tiering is enabled + if self.config_applied and self.config_desired: + if (not self.config_desired.cache_enabled or + not self.config_applied.cache_enabled): + LOG.debug("Cache tiering disabled, no need to update " + "target_max_bytes.") + return + LOG.debug("Updating target_max_bytes") + + # Get available space + response, body = self.service.ceph_api.osd_df(body='json', + output_method='tree') + if not response.ok: + LOG.warn(_LW( + "Failed to retrieve cluster free space stats: " + "status_code=%d, reason=%s") % ( + response.status_code, response.reason)) + return + + storage_tier_size = 0 + cache_tier_size = 0 + + replication = constants.CEPH_REPLICATION_FACTOR + for node in body['output']['nodes']: + if node['name'] == 'storage-tier': + storage_tier_size = node['kb']*1024/replication + elif node['name'] == 'cache-tier': + cache_tier_size = node['kb']*1024/replication + + if storage_tier_size == 0 or cache_tier_size == 0: + LOG.info("Failed to get cluster size " + "(storage_tier_size=%s, cache_tier_size=%s)," + "retrying on next cycle" % + (storage_tier_size, cache_tier_size)) + return + + # Get available pools + response, body = self.service.ceph_api.osd_lspools(body='json') + if not response.ok: + LOG.warn(_LW( + "Failed to retrieve available pools: " + "status_code=%d, reason=%s") % ( + response.status_code, response.reason)) + return + pools = [p['poolname'] for p in body['output']] + + # Separate backing from caching for easy iteration + backing_pools = [] + caching_pools = [] + for p in pools: + if p.endswith('-cache'): + caching_pools.append(p) + else: + backing_pools.append(p) 
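        # Illustrative walk-through of the computation below, with made-up
        # numbers: two cached backing pools whose quotas amount to 60% and
        # 40% of storage_tier_size. Neither share is zero, so real_100pt
        # stays at 90.0 and total_quota_pt is 100. After normalization the
        # shares become 54 and 36, so the corresponding caching pools get
        # target_max_bytes of 54% and 36% of cache_tier_size, keeping the
        # sum safely below the 100% mark that must never be reached.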
+ LOG.debug("Pools: caching: %s, backing: %s" % (caching_pools, + backing_pools)) + + if not len(caching_pools): + # We do not have caching pools created yet + return + + # Get quota from backing pools that are cached + stats = {} + for p in caching_pools: + backing_name = p.replace('-cache', '') + stats[backing_name] = {} + try: + quota = ceph.osd_pool_get_quota(self.service.ceph_api, + backing_name) + except exception.CephPoolGetQuotaFailure as e: + LOG.warn(_LW( + "Failed to retrieve quota: " + "exception: %s") % str(e)) + return + stats[backing_name]['quota'] = quota['max_bytes'] + stats[backing_name]['quota_pt'] = (quota['max_bytes']*100.0 / + storage_tier_size) + LOG.debug("Quota for pool: %s " + "is: %s B representing %s pt" % + (backing_name, + quota['max_bytes'], + stats[backing_name]['quota_pt'])) + + # target_max_bytes logic: + # - For computing target_max_bytes cache_tier_size must be equal than + # the sum of target_max_bytes of each caching pool + # - target_max_bytes for each caching pool is computed as the + # percentage of quota in corresponding backing pool + # - the caching tiers has to work at full capacity, so if the sum of + # all quotas in the backing tier is different than 100% we need to + # normalize + # - if the quota is zero for any pool we add CACHE_TIERING_MIN_QUOTA + # by default *after* normalization so that we have real minimum + + # We compute the real percentage that need to be normalized after + # ensuring that we have CACHE_TIERING_MIN_QUOTA for each pool with + # a quota of 0 + real_100pt = 90.0 # we start from max and decrease it for each 0 pool + # Note: We must avoid reaching 100% at all costs! and + # cache_target_full_ratio, the Ceph parameter that is supposed to + # protect the cluster against this does not work in Ceph v0.94.6! + # Therefore a value of 90% is better suited for this + for p in caching_pools: + backing_name = p.replace('-cache', '') + if stats[backing_name]['quota_pt'] == 0: + real_100pt -= constants.CACHE_TIERING_MIN_QUOTA + LOG.debug("Quota before normalization for %s is: %s pt" % + (p, stats[backing_name]['quota_pt'])) + + # Compute total percentage of quotas for all backing pools. + # Should be 100% if correctly configured + total_quota_pt = 0 + for p in caching_pools: + backing_name = p.replace('-cache', '') + total_quota_pt += stats[backing_name]['quota_pt'] + LOG.debug("Total quota pt is: %s" % total_quota_pt) + + # Normalize quota pt to 100% (or real_100pt) + if total_quota_pt != 0: # to avoid divide by zero + for p in caching_pools: + backing_name = p.replace('-cache', '') + stats[backing_name]['quota_pt'] = \ + (stats[backing_name]['quota_pt'] * + (real_100pt / total_quota_pt)) + + # Do not allow quota to be 0 for any pool + total = 0 + for p in caching_pools: + backing_name = p.replace('-cache', '') + if stats[backing_name]['quota_pt'] == 0: + stats[backing_name]['quota_pt'] = \ + constants.CACHE_TIERING_MIN_QUOTA + total += stats[backing_name]['quota_pt'] + LOG.debug("Quota after normalization for %s is: %s:" % + (p, stats[backing_name]['quota_pt'])) + + if total > 100: + # Supplementary protection, we really have to avoid going above + # 100%. Note that real_100pt is less than 100% but we still got + # more than 100! + LOG.warn("Total sum of quotas should not go above 100% " + "but is: %s, recalculating in next cycle" % total) + return + LOG.debug("Total sum of quotas is %s pt" % total) + + # Get current target_max_bytes. We cache it to reduce requests + # to ceph-rest-api. 
We are the ones changing it, so not an issue. + for p in caching_pools: + if p not in self.target_max_bytes: + try: + value = ceph.osd_get_pool_param(self.service.ceph_api, p, + constants.TARGET_MAX_BYTES) + except exception.CephPoolGetParamFailure as e: + LOG.warn(e) + return + self.target_max_bytes[p] = value + LOG.debug("Existing target_max_bytes got from " + "Ceph: %s" % self.target_max_bytes) + + # Set TARGET_MAX_BYTES + LOG.debug("storage_tier_size: %s " + "cache_tier_size: %s" % (storage_tier_size, + cache_tier_size)) + for p in caching_pools: + backing_name = p.replace('-cache', '') + s = stats[backing_name] + target_max_bytes = math.floor(s['quota_pt'] * cache_tier_size / + 100.0) + target_max_bytes = int(target_max_bytes) + LOG.debug("New Target max bytes of pool: %s is: %s B" % ( + p, target_max_bytes)) + + # Set the new target_max_bytes only if it changed + if self.target_max_bytes.get(p) == target_max_bytes: + LOG.debug("Target max bytes of pool: %s " + "is already updated" % p) + continue + try: + ceph.osd_set_pool_param(self.service.ceph_api, p, + constants.TARGET_MAX_BYTES, + target_max_bytes) + self.target_max_bytes[p] = target_max_bytes + except exception.CephPoolSetParamFailure as e: + LOG.warn(e) + continue + return diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/ceph.py b/ceph/ceph-manager/ceph-manager/ceph_manager/ceph.py new file mode 100644 index 000000000..dff3c8ab5 --- /dev/null +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/ceph.py @@ -0,0 +1,164 @@ +# +# Copyright (c) 2016-2018 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +import exception +from i18n import _LI +# noinspection PyUnresolvedReferences +from oslo_log import log as logging + + +LOG = logging.getLogger(__name__) + + +def osd_pool_set_quota(ceph_api, pool_name, max_bytes=0, max_objects=0): + """Set the quota for an OSD pool_name + Setting max_bytes or max_objects to 0 will disable that quota param + :param pool_name: OSD pool_name + :param max_bytes: maximum bytes for OSD pool_name + :param max_objects: maximum objects for OSD pool_name + """ + + # Update quota if needed + prev_quota = osd_pool_get_quota(ceph_api, pool_name) + if prev_quota["max_bytes"] != max_bytes: + resp, b = ceph_api.osd_set_pool_quota(pool_name, 'max_bytes', + max_bytes, body='json') + if resp.ok: + LOG.info(_LI("Set OSD pool_name quota: " + "pool_name={}, max_bytes={}").format( + pool_name, max_bytes)) + else: + e = exception.CephPoolSetQuotaFailure( + pool=pool_name, name='max_bytes', + value=max_bytes, reason=resp.reason) + LOG.error(e) + raise e + if prev_quota["max_objects"] != max_objects: + resp, b = ceph_api.osd_set_pool_quota(pool_name, 'max_objects', + max_objects, + body='json') + if resp.ok: + LOG.info(_LI("Set OSD pool_name quota: " + "pool_name={}, max_objects={}").format( + pool_name, max_objects)) + else: + e = exception.CephPoolSetQuotaFailure( + pool=pool_name, name='max_objects', + value=max_objects, reason=resp.reason) + LOG.error(e) + raise e + + +def osd_pool_get_quota(ceph_api, pool_name): + resp, quota = ceph_api.osd_get_pool_quota(pool_name, body='json') + if not resp.ok: + e = exception.CephPoolGetQuotaFailure( + pool=pool_name, reason=resp.reason) + LOG.error(e) + raise e + else: + return {"max_objects": quota["output"]["quota_max_objects"], + "max_bytes": quota["output"]["quota_max_bytes"]} + + +def osd_pool_exists(ceph_api, pool_name): + response, body = ceph_api.osd_pool_get( + pool_name, "pg_num", body='json') + if response.ok: + return True + return False + + 
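# Illustrative sketch only: one way a caller could combine the wrappers in
# this module to make sure a pool exists and carries a byte quota. The
# helper name, pool name, pg count and quota value are hypothetical and not
# part of this module; ceph_api is assumed to be the same cephclient wrapper
# instance that the rest of this patch passes around.
def ensure_pool_with_quota(ceph_api, pool_name, pg_num, max_bytes):
    """Create pool_name if it is missing, then cap it at max_bytes."""
    if not osd_pool_exists(ceph_api, pool_name):
        # osd_pool_create() picks the cache or storage ruleset from the name
        osd_pool_create(ceph_api, pool_name, pg_num, pg_num)
    # osd_pool_set_quota() only issues the API call when the stored value differs
    osd_pool_set_quota(ceph_api, pool_name, max_bytes=max_bytes)

# Example call with hypothetical values: cap a 'volumes' pool at 10 GiB.
# ensure_pool_with_quota(ceph_api, 'volumes', 64, 10 * 1024 ** 3)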
+def osd_pool_create(ceph_api, pool_name, pg_num, pgp_num): + if pool_name.endswith("-cache"): + # ruleset 1: is the ruleset for the cache tier + # Name: cache_tier_ruleset + ruleset = 1 + else: + # ruleset 0: is the default ruleset if no crushmap is loaded or + # the ruleset for the backing tier if loaded: + # Name: storage_tier_ruleset + ruleset = 0 + response, body = ceph_api.osd_pool_create( + pool_name, pg_num, pgp_num, pool_type="replicated", + ruleset=ruleset, body='json') + if response.ok: + LOG.info(_LI("Created OSD pool: " + "pool_name={}, pg_num={}, pgp_num={}, " + "pool_type=replicated, ruleset={}").format( + pool_name, pg_num, pgp_num, ruleset)) + else: + e = exception.CephPoolCreateFailure( + name=pool_name, reason=response.reason) + LOG.error(e) + raise e + + # Explicitly assign the ruleset to the pool on creation since it is + # ignored in the create call + response, body = ceph_api.osd_set_pool_param( + pool_name, "crush_ruleset", ruleset, body='json') + if response.ok: + LOG.info(_LI("Assigned crush ruleset to OS pool: " + "pool_name={}, ruleset={}").format( + pool_name, ruleset)) + else: + e = exception.CephPoolRulesetFailure( + name=pool_name, reason=response.reason) + LOG.error(e) + ceph_api.osd_pool_delete( + pool_name, pool_name, + sure='--yes-i-really-really-mean-it', + body='json') + raise e + + +def osd_pool_delete(ceph_api, pool_name): + """Delete an osd pool + :param pool_name: pool name + """ + response, body = ceph_api.osd_pool_delete( + pool_name, pool_name, + sure='--yes-i-really-really-mean-it', + body='json') + if response.ok: + LOG.info(_LI("Deleted OSD pool {}").format(pool_name)) + else: + e = exception.CephPoolDeleteFailure( + name=pool_name, reason=response.reason) + LOG.warn(e) + raise e + + +def osd_set_pool_param(ceph_api, pool_name, param, value): + response, body = ceph_api.osd_set_pool_param( + pool_name, param, value, + force=None, body='json') + if response.ok: + LOG.info('OSD set pool param: ' + 'pool={}, name={}, value={}'.format( + pool_name, param, value)) + else: + raise exception.CephPoolSetParamFailure( + pool_name=pool_name, + param=param, + value=str(value), + reason=response.reason) + return response, body + + +def osd_get_pool_param(ceph_api, pool_name, param): + response, body = ceph_api.osd_get_pool_param( + pool_name, param, body='json') + if response.ok: + LOG.debug('OSD get pool param: ' + 'pool={}, name={}, value={}'.format( + pool_name, param, body['output'][param])) + else: + raise exception.CephPoolGetParamFailure( + pool_name=pool_name, + param=param, + reason=response.reason) + return body['output'][param] diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/constants.py b/ceph/ceph-manager/ceph-manager/ceph_manager/constants.py new file mode 100644 index 000000000..5b2977430 --- /dev/null +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/constants.py @@ -0,0 +1,107 @@ +# +# Copyright (c) 2016-2018 Wind River Systems, Inc. 
+# +# SPDX-License-Identifier: Apache-2.0 +# + +from i18n import _ +# noinspection PyUnresolvedReferences +from sysinv.common import constants as sysinv_constants + +CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL = \ + sysinv_constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL +CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER = \ + sysinv_constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER +CEPH_POOLS = sysinv_constants.BACKING_POOLS +CEPH_REPLICATION_FACTOR = sysinv_constants.CEPH_REPLICATION_FACTOR_DEFAULT +SERVICE_PARAM_CEPH_CACHE_HIT_SET_TYPE_BLOOM = \ + sysinv_constants.SERVICE_PARAM_CEPH_CACHE_HIT_SET_TYPE_BLOOM +CACHE_TIERING_DEFAULTS = sysinv_constants.CACHE_TIERING_DEFAULTS +TARGET_MAX_BYTES = \ + sysinv_constants.SERVICE_PARAM_CEPH_CACHE_TIER_TARGET_MAX_BYTES + +# Cache tiering section shortener +CACHE_TIERING = \ + sysinv_constants.SERVICE_PARAM_SECTION_CEPH_CACHE_TIER +CACHE_TIERING_DESIRED = \ + sysinv_constants.SERVICE_PARAM_SECTION_CEPH_CACHE_TIER_DESIRED +CACHE_TIERING_APPLIED = \ + sysinv_constants.SERVICE_PARAM_SECTION_CEPH_CACHE_TIER_APPLIED +CACHE_TIERING_SECTIONS = \ + [CACHE_TIERING, CACHE_TIERING_DESIRED, CACHE_TIERING_APPLIED] + +# Cache flush parameters +CACHE_FLUSH_OBJECTS_THRESHOLD = 1000 +CACHE_FLUSH_MIN_WAIT_OBJ_COUNT_DECREASE_SEC = 1 +CACHE_FLUSH_MAX_WAIT_OBJ_COUNT_DECREASE_SEC = 128 + +CACHE_TIERING_MIN_QUOTA = 5 + +FM_ALARM_REASON_MAX_SIZE = 256 + +# TODO this will later change based on parsed health +# clock skew is vm malfunction, mon or osd is equipment mal +ALARM_CAUSE = 'equipment-malfunction' +ALARM_TYPE = 'equipment' + +# Ceph health check interval (in seconds) +CEPH_HEALTH_CHECK_INTERVAL = 60 + +# Ceph health statuses +CEPH_HEALTH_OK = 'HEALTH_OK' +CEPH_HEALTH_WARN = 'HEALTH_WARN' +CEPH_HEALTH_ERR = 'HEALTH_ERR' +CEPH_HEALTH_DOWN = 'CEPH_DOWN' + +# Statuses not reported by Ceph +CEPH_STATUS_CUSTOM = [CEPH_HEALTH_DOWN] + +SEVERITY = {CEPH_HEALTH_DOWN: 'critical', + CEPH_HEALTH_ERR: 'critical', + CEPH_HEALTH_WARN: 'warning'} + +SERVICE_AFFECTING = {CEPH_HEALTH_DOWN: True, + CEPH_HEALTH_ERR: True, + CEPH_HEALTH_WARN: False} + +# TODO this will later change based on parsed health +ALARM_REASON_NO_OSD = _('no OSDs') +ALARM_REASON_OSDS_DOWN = _('OSDs are down') +ALARM_REASON_OSDS_OUT = _('OSDs are out') +ALARM_REASON_OSDS_DOWN_OUT = _('OSDs are down/out') +ALARM_REASON_PEER_HOST_DOWN = _('peer host down') + +REPAIR_ACTION_MAJOR_CRITICAL_ALARM = _( + 'Ensure storage hosts from replication group are unlocked and available.' + 'Check if OSDs of each storage host are up and running.' 
+ 'If problem persists, contact next level of support.') +REPAIR_ACTION = _('If problem persists, contact next level of support.') + +SYSINV_CONDUCTOR_TOPIC = 'sysinv.conductor_manager' +CEPH_MANAGER_TOPIC = 'sysinv.ceph_manager' +SYSINV_CONFIG_FILE = '/etc/sysinv/sysinv.conf' + +# Titanium Cloud version strings +TITANIUM_SERVER_VERSION_16_10 = '16.10' + +CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET = ( + "all OSDs are running jewel or later but the " + "'require_jewel_osds' osdmap flag is not set") + +UPGRADE_COMPLETED = \ + sysinv_constants.UPGRADE_COMPLETED +UPGRADE_ABORTING = \ + sysinv_constants.UPGRADE_ABORTING +UPGRADE_ABORT_COMPLETING = \ + sysinv_constants.UPGRADE_ABORT_COMPLETING +UPGRADE_ABORTING_ROLLBACK = \ + sysinv_constants.UPGRADE_ABORTING_ROLLBACK + +CEPH_FLAG_REQUIRE_JEWEL_OSDS = 'require_jewel_osds' + +# Tiers +CEPH_CRUSH_TIER_SUFFIX = sysinv_constants.CEPH_CRUSH_TIER_SUFFIX +SB_TIER_TYPE_CEPH = sysinv_constants.SB_TIER_TYPE_CEPH +SB_TIER_SUPPORTED = sysinv_constants.SB_TIER_SUPPORTED +SB_TIER_DEFAULT_NAMES = sysinv_constants.SB_TIER_DEFAULT_NAMES +SB_TIER_CEPH_POOLS = sysinv_constants.SB_TIER_CEPH_POOLS diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/exception.py b/ceph/ceph-manager/ceph-manager/ceph_manager/exception.py new file mode 100644 index 000000000..c2d81b8b4 --- /dev/null +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/exception.py @@ -0,0 +1,130 @@ +# +# Copyright (c) 2016-2017 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +# noinspection PyUnresolvedReferences +from i18n import _, _LW +# noinspection PyUnresolvedReferences +from oslo_log import log as logging + + +LOG = logging.getLogger(__name__) + + +class CephManagerException(Exception): + message = _("An unknown exception occurred.") + + def __init__(self, message=None, **kwargs): + self.kwargs = kwargs + if not message: + try: + message = self.message % kwargs + except TypeError: + LOG.warn(_LW('Exception in string format operation')) + for name, value in kwargs.iteritems(): + LOG.error("%s: %s" % (name, value)) + # at least get the core message out if something happened + message = self.message + super(CephManagerException, self).__init__(message) + + +class CephPoolSetQuotaFailure(CephManagerException): + message = _("Error seting the OSD pool " + "quota %(name)s for %(pool)s to %(value)s") \ + + ": %(reason)s" + + +class CephPoolGetQuotaFailure(CephManagerException): + message = _("Error geting the OSD pool quota for %(pool)s") \ + + ": %(reason)s" + + +class CephPoolCreateFailure(CephManagerException): + message = _("Creating OSD pool %(name)s failed: %(reason)s") + + +class CephPoolDeleteFailure(CephManagerException): + message = _("Deleting OSD pool %(name)s failed: %(reason)s") + + +class CephPoolRulesetFailure(CephManagerException): + message = _("Assigning crush ruleset to OSD " + "pool %(name)s failed: %(reason)s") + + +class CephPoolAddTierFailure(CephManagerException): + message = _("Failed to add OSD tier: " + "backing_pool=%(backing_pool)s, cache_pool=%(cache_pool)s, " + "response=%(response_status_code)s:%(response_reason)s, " + "status=%(status)s, output=%(output)s") + + +class CephPoolRemoveTierFailure(CephManagerException): + message = _("Failed to remove tier: " + "backing_pool=%(backing_pool)s, cache_pool=%(cache_pool)s, " + "response=%(response_status_code)s:%(response_reason)s, " + "status=%(status)s, output=%(output)s") + + +class CephCacheSetModeFailure(CephManagerException): + message = _("Failed to set OSD tier cache mode: " + 
"cache_pool=%(cache_pool)s, mode=%(mode)s, " + "response=%(response_status_code)s:%(response_reason)s, " + "status=%(status)s, output=%(output)s") + + +class CephPoolSetParamFailure(CephManagerException): + message = _("Cannot set Ceph OSD pool parameter: " + "pool_name=%(pool_name)s, param=%(param)s, value=%(value)s. " + "Reason: %(reason)s") + + +class CephPoolGetParamFailure(CephManagerException): + message = _("Cannot get Ceph OSD pool parameter: " + "pool_name=%(pool_name)s, param=%(param)s. " + "Reason: %(reason)s") + + +class CephCacheCreateOverlayFailure(CephManagerException): + message = _("Failed to create overlay: " + "backing_pool=%(backing_pool)s, cache_pool=%(cache_pool)s, " + "response=%(response_status_code)s:%(response_reason)s, " + "status=%(status)s, output=%(output)s") + + +class CephCacheDeleteOverlayFailure(CephManagerException): + message = _("Failed to delete overlay: " + "backing_pool=%(backing_pool)s, cache_pool=%(cache_pool)s, " + "response=%(response_status_code)s:%(response_reason)s, " + "status=%(status)s, output=%(output)s") + + +class CephCacheFlushFailure(CephManagerException): + message = _("Failed to flush cache pool: " + "cache_pool=%(cache_pool)s, " + "return_code=%(return_code)s, " + "cmd=%(cmd)s, output=%(output)s") + + +class CephCacheEnableFailure(CephManagerException): + message = _("Cannot enable Ceph cache tier. " + "Reason: cache tiering operation in progress.") + + +class CephCacheDisableFailure(CephManagerException): + message = _("Cannot disable Ceph cache tier. " + "Reason: cache tiering operation in progress.") + + +class CephSetKeyFailure(CephManagerException): + message = _("Error setting the Ceph flag " + "'%(flag)s' %(extra)s: " + "response=%(response_status_code)s:%(response_reason)s, " + "status=%(status)s, output=%(output)s") + + +class CephApiFailure(CephManagerException): + message = _("API failure: " + "call=%(call)s, reason=%(reason)s") diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/i18n.py b/ceph/ceph-manager/ceph-manager/ceph_manager/i18n.py new file mode 100644 index 000000000..67977ceae --- /dev/null +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/i18n.py @@ -0,0 +1,15 @@ +# +# Copyright (c) 2016 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# +import oslo_i18n + +DOMAIN = 'ceph-manager' + +_translators = oslo_i18n.TranslatorFactory(domain=DOMAIN) +_ = _translators.primary + +_LI = _translators.log_info +_LW = _translators.log_warning +_LE = _translators.log_error diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py b/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py new file mode 100644 index 000000000..941e5fc03 --- /dev/null +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py @@ -0,0 +1,893 @@ +# +# Copyright (c) 2013-2018 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +import time + +# noinspection PyUnresolvedReferences +from fm_api import fm_api +# noinspection PyUnresolvedReferences +from fm_api import constants as fm_constants +# noinspection PyUnresolvedReferences +from oslo_log import log as logging + +from sysinv.conductor.cache_tiering_service_config import ServiceConfig + +# noinspection PyProtectedMember +from i18n import _, _LI, _LW, _LE + +import constants +import exception + +LOG = logging.getLogger(__name__) + + +# When upgrading from 16.10 to 17.x Ceph goes from Hammer release +# to Jewel release. 
After all storage nodes are upgraded to 17.x +# the cluster is in HEALTH_WARN until administrator explicitly +# enables require_jewel_osds flag - which signals Ceph that it +# can safely transition from Hammer to Jewel +# +# This class is needed only when upgrading from 16.10 to 17.x +# TODO: remove it after 1st 17.x release +# +class HandleUpgradesMixin(object): + + def __init__(self, service): + self.service = service + self.surpress_require_jewel_osds_warning = False + + def setup(self, config): + self._set_upgrade(self.service.retry_get_software_upgrade_status()) + + def _set_upgrade(self, upgrade): + state = upgrade.get('state') + from_version = upgrade.get('from_version') + if (state + and state != constants.UPGRADE_COMPLETED + and from_version == constants.TITANIUM_SERVER_VERSION_16_10): + LOG.info(_LI("Surpress require_jewel_osds health warning")) + self.surpress_require_jewel_osds_warning = True + + def set_flag_require_jewel_osds(self): + try: + response, body = self.service.ceph_api.osd_set_key( + constants.CEPH_FLAG_REQUIRE_JEWEL_OSDS, + body='json') + LOG.info(_LI("Set require_jewel_osds flag")) + except IOError as e: + raise exception.CephApiFailure( + call="osd_set_key", + reason=e.message) + else: + if not response.ok: + raise exception.CephSetKeyFailure( + flag=constants.CEPH_FLAG_REQUIRE_JEWEL_OSDS, + extra=_("needed to complete upgrade to Jewel"), + response_status_code=response.status_code, + response_reason=response.reason, + status=body.get('status'), + output=body.get('output')) + + def filter_health_status(self, health): + health = self.auto_heal(health) + # filter out require_jewel_osds warning + # + if not self.surpress_require_jewel_osds_warning: + return health + if health['health'] != constants.CEPH_HEALTH_WARN: + return health + if (constants.CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET + not in health['detail']): + return health + return self._remove_require_jewel_osds_warning(health) + + def _remove_require_jewel_osds_warning(self, health): + reasons_list = [] + for reason in health['detail'].split(';'): + reason = reason.strip() + if len(reason) == 0: + continue + if constants.CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET in reason: + continue + reasons_list.append(reason) + if len(reasons_list) == 0: + health = { + 'health': constants.CEPH_HEALTH_OK, + 'detail': ''} + else: + health['detail'] = '; '.join(reasons_list) + return health + + def auto_heal(self, health): + if (health['health'] == constants.CEPH_HEALTH_WARN + and (constants.CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET + in health['detail'])): + try: + upgrade = self.service.get_software_upgrade_status() + except Exception as ex: + LOG.warn(_LW( + "Getting software upgrade status failed " + "with: %s. 
Skip auto-heal attempt " + "(will retry on next ceph status poll).") % str(ex)) + return + state = upgrade.get('state') + # surpress require_jewel_osds in case upgrade is + # in progress but not completed or aborting + if (not self.surpress_require_jewel_osds_warning + and (upgrade.get('from_version') + == constants.TITANIUM_SERVER_VERSION_16_10) + and state not in [ + None, + constants.UPGRADE_COMPLETED, + constants.UPGRADE_ABORTING, + constants.UPGRADE_ABORT_COMPLETING, + constants.UPGRADE_ABORTING_ROLLBACK]): + LOG.info(_LI("Surpress require_jewel_osds health warning")) + self.surpress_require_jewel_osds_warning = True + # set require_jewel_osds in case upgrade is + # not in progress or completed + if (state in [None, constants.UPGRADE_COMPLETED]): + LOG.warn(_LW( + "No upgrade in progress or update completed " + "and require_jewel_osds health warning raised. " + "Set require_jewel_osds flag.")) + self.set_flag_require_jewel_osds() + health = self._remove_require_jewel_osds_warning(health) + LOG.info(_LI("Unsurpress require_jewel_osds health warning")) + self.surpress_require_jewel_osds_warning = False + # unsurpress require_jewel_osds in case upgrade + # is aborting + if (self.surpress_require_jewel_osds_warning + and state in [ + constants.UPGRADE_ABORTING, + constants.UPGRADE_ABORT_COMPLETING, + constants.UPGRADE_ABORTING_ROLLBACK]): + LOG.info(_LI("Unsurpress require_jewel_osds health warning")) + self.surpress_require_jewel_osds_warning = False + return health + + +class Monitor(HandleUpgradesMixin): + + def __init__(self, service): + self.service = service + self.current_ceph_health = "" + self.cache_enabled = False + self.tiers_size = {} + self.known_object_pool_name = None + self.primary_tier_name = constants.SB_TIER_DEFAULT_NAMES[ + constants.SB_TIER_TYPE_CEPH] + constants.CEPH_CRUSH_TIER_SUFFIX + self.cluster_is_up = False + super(Monitor, self).__init__(service) + + def setup(self, config): + self.set_caching_tier_config(config) + super(Monitor, self).setup(config) + + def set_caching_tier_config(self, config): + conf = ServiceConfig().from_dict( + config.get(constants.CACHE_TIERING_APPLIED)) + if conf: + self.cache_enabled = conf.cache_enabled + + def monitor_check_cache_tier(self, enable_flag): + LOG.info(_LI("monitor_check_cache_tier: " + "enable_flag={}".format(enable_flag))) + self.cache_enabled = enable_flag + + def run(self): + # Wait until Ceph cluster is up and we can get the fsid + while True: + self.ceph_get_fsid() + if self.service.entity_instance_id: + break + time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL) + + # Start monitoring ceph status + while True: + self.ceph_poll_status() + self.ceph_poll_quotas() + time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL) + + def ceph_get_fsid(self): + # Check whether an alarm has already been raised + self._get_current_alarms() + if self.current_health_alarm: + LOG.info(_LI("Current alarm: %s") % + str(self.current_health_alarm.__dict__)) + + fsid = self._get_fsid() + if not fsid: + # Raise alarm - it will not have an entity_instance_id + self._report_fault({'health': constants.CEPH_HEALTH_DOWN, + 'detail': 'Ceph cluster is down.'}, + fm_constants.FM_ALARM_ID_STORAGE_CEPH) + else: + # Clear alarm with no entity_instance_id + self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH) + self.service.entity_instance_id = 'cluster=%s' % fsid + + def ceph_poll_status(self): + # get previous data every time in case: + # * daemon restarted + # * alarm was cleared manually but stored as raised in daemon + self._get_current_alarms() + if 
self.current_health_alarm: + LOG.info(_LI("Current alarm: %s") % + str(self.current_health_alarm.__dict__)) + + # get ceph health + health = self._get_health() + LOG.info(_LI("Current Ceph health: " + "%(health)s detail: %(detail)s") % health) + + health = self.filter_health_status(health) + if health['health'] != constants.CEPH_HEALTH_OK: + self._report_fault(health, fm_constants.FM_ALARM_ID_STORAGE_CEPH) + self._report_alarm_osds_health() + else: + self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH) + self.clear_all_major_critical() + + def filter_health_status(self, health): + return super(Monitor, self).filter_health_status(health) + + def ceph_poll_quotas(self): + self._get_current_alarms() + if self.current_quota_alarms: + LOG.info(_LI("Current quota alarms %s") % + self.current_quota_alarms) + + # Get current current size of each tier + previous_tiers_size = self.tiers_size + self.tiers_size = self._get_tiers_size() + + # Make sure any removed tiers have the alarms cleared + for t in (set(previous_tiers_size)-set(self.tiers_size)): + self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE, + "{0}.tier={1}".format( + self.service.entity_instance_id, + t[:-len(constants.CEPH_CRUSH_TIER_SUFFIX)])) + + # Check the quotas on each tier + for tier in self.tiers_size: + # TODO(rchurch): For R6 remove the tier from the default crushmap + # and remove this check. No longer supporting this tier in R5 + if tier == 'cache-tier': + continue + + # Extract the tier name from the crush equivalent + tier_name = tier[:-len(constants.CEPH_CRUSH_TIER_SUFFIX)] + + if self.tiers_size[tier] == 0: + LOG.info(_LI("'%s' tier cluster size not yet available") + % tier_name) + continue + + pools_quota_sum = 0 + if tier == self.primary_tier_name: + for pool in constants.CEPH_POOLS: + if (pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or + pool['pool_name'] == + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): + object_pool_name = self._get_object_pool_name() + if object_pool_name is None: + LOG.error("Rados gateway object data pool does " + "not exist.") + else: + pools_quota_sum += \ + self._get_osd_pool_quota(object_pool_name) + else: + pools_quota_sum += self._get_osd_pool_quota( + pool['pool_name']) + else: + for pool in constants.SB_TIER_CEPH_POOLS: + pool_name = "{0}-{1}".format(pool['pool_name'], tier_name) + pools_quota_sum += self._get_osd_pool_quota(pool_name) + + # Currently, there is only one pool on the addtional tier(s), + # therefore allow a quota of 0 + if (pools_quota_sum != self.tiers_size[tier] and + pools_quota_sum != 0): + self._report_fault( + {'tier_name': tier_name, + 'tier_eid': "{0}.tier={1}".format( + self.service.entity_instance_id, + tier_name)}, + fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE) + else: + self._clear_fault( + fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE, + "{0}.tier={1}".format(self.service.entity_instance_id, + tier_name)) + + # CEPH HELPERS + + def _get_fsid(self): + try: + response, fsid = self.service.ceph_api.fsid( + body='text', timeout=30) + except IOError as e: + LOG.warning(_LW("ceph_api.fsid failed: %s") % str(e.message)) + self.cluster_is_up = False + return None + + if not response.ok: + LOG.warning(_LW("Get fsid failed: %s") % response.reason) + self.cluster_is_up = False + return None + + self.cluster_is_up = True + return fsid.strip() + + def _get_health(self): + try: + # we use text since it has all info + response, body = self.service.ceph_api.health( + body='text', timeout=30) + except IOError as e: + 
LOG.warning(_LW("ceph_api.health failed: %s") % str(e.message)) + self.cluster_is_up = False + return {'health': constants.CEPH_HEALTH_DOWN, + 'detail': 'Ceph cluster is down.'} + + if not response.ok: + LOG.warning(_LW("CEPH health check failed: %s") % response.reason) + health_info = [constants.CEPH_HEALTH_DOWN, response.reason] + self.cluster_is_up = False + else: + health_info = body.split(' ', 1) + self.cluster_is_up = True + + health = health_info[0] + + if len(health_info) > 1: + detail = health_info[1] + else: + detail = health_info[0] + + return {'health': health.strip(), + 'detail': detail.strip()} + + def _get_object_pool_name(self): + if self.known_object_pool_name is None: + response, body = self.service.ceph_api.osd_pool_get( + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL, + "pg_num", + body='json') + + if response.ok: + self.known_object_pool_name = \ + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL + return self.known_object_pool_name + + response, body = self.service.ceph_api.osd_pool_get( + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER, + "pg_num", + body='json') + + if response.ok: + self.known_object_pool_name = \ + constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER + return self.known_object_pool_name + + return self.known_object_pool_name + + def _get_osd_pool_quota(self, pool_name): + try: + resp, quota = self.service.ceph_api.osd_get_pool_quota( + pool_name, body='json') + except IOError: + return 0 + + if not resp.ok: + LOG.error(_LE("Getting the quota for " + "%(name)s pool failed:%(reason)s)") % + {"name": pool_name, "reason": resp.reason}) + return 0 + else: + try: + quota_gib = int(quota["output"]["quota_max_bytes"])/(1024**3) + return quota_gib + except IOError: + return 0 + + # we have two root nodes 'cache-tier' and 'storage-tier' + # to calculate the space that is used by the pools, we must only + # use 'storage-tier' + # this function determines if a certain node is under a certain + # tree + def host_is_in_root(self, search_tree, node, root_name): + if node['type'] == 'root': + if node['name'] == root_name: + return True + else: + return False + return self.host_is_in_root(search_tree, + search_tree[node['parent']], + root_name) + + # The information received from ceph is not properly + # structured for efficient parsing and searching, so + # it must be processed and transformed into a more + # structured form. + # + # Input received from ceph is an array of nodes with the + # following structure: + # [{'id':, 'children':, ....}, + # ...] + # + # We process this array and transform it into a dictionary + # (for efficient access) The transformed "search tree" is a + # dictionary with the following structure: + # { : {'children':} + def _get_tiers_size(self): + try: + resp, body = self.service.ceph_api.osd_df( + body='json', + output_method='tree') + except IOError: + return 0 + if not resp.ok: + LOG.error(_LE("Getting the cluster usage " + "information failed: %(reason)s - " + "%(body)s") % {"reason": resp.reason, + "body": body}) + return {} + + # A node is a crushmap element: root, chassis, host, osd. Create a + # dictionary for the nodes with the key as the id used for efficient + # searching through nodes. 
+ # + # For example: storage-0's node has one child node => OSD 0 + # { + # "id": -4, + # "name": "storage-0", + # "type": "host", + # "type_id": 1, + # "reweight": -1.000000, + # "kb": 51354096, + # "kb_used": 1510348, + # "kb_avail": 49843748, + # "utilization": 2.941047, + # "var": 1.480470, + # "pgs": 0, + # "children": [ + # 0 + # ] + # }, + search_tree = {} + for node in body['output']['nodes']: + search_tree[node['id']] = node + + # Extract the tiers as we will return a dict for the size of each tier + tiers = {k: v for k, v in search_tree.items() if v['type'] == 'root'} + + # For each tier, traverse the heirarchy from the root->chassis->host. + # Sum the host sizes to determine the overall size of the tier + tier_sizes = {} + for tier in tiers.values(): + tier_size = 0 + for chassis_id in tier['children']: + chassis_size = 0 + chassis = search_tree[chassis_id] + for host_id in chassis['children']: + host = search_tree[host_id] + if (chassis_size == 0 or + chassis_size > host['kb']): + chassis_size = host['kb'] + tier_size += chassis_size/(1024 ** 2) + tier_sizes[tier['name']] = tier_size + + return tier_sizes + + # ALARM HELPERS + + @staticmethod + def _check_storage_group(osd_tree, group_id, + hosts, osds, fn_report_alarm): + reasons = set() + degraded_hosts = set() + severity = fm_constants.FM_ALARM_SEVERITY_CRITICAL + for host_id in hosts: + if len(osds[host_id]) == 0: + reasons.add(constants.ALARM_REASON_NO_OSD) + degraded_hosts.add(host_id) + else: + for osd_id in osds[host_id]: + if osd_tree[osd_id]['status'] == 'up': + if osd_tree[osd_id]['reweight'] == 0.0: + reasons.add(constants.ALARM_REASON_OSDS_OUT) + degraded_hosts.add(host_id) + else: + severity = fm_constants.FM_ALARM_SEVERITY_MAJOR + elif osd_tree[osd_id]['status'] == 'down': + reasons.add(constants.ALARM_REASON_OSDS_DOWN) + degraded_hosts.add(host_id) + if constants.ALARM_REASON_OSDS_OUT in reasons \ + and constants.ALARM_REASON_OSDS_DOWN in reasons: + reasons.add(constants.ALARM_REASON_OSDS_DOWN_OUT) + reasons.remove(constants.ALARM_REASON_OSDS_OUT) + if constants.ALARM_REASON_OSDS_DOWN in reasons \ + and constants.ALARM_REASON_OSDS_DOWN_OUT in reasons: + reasons.remove(constants.ALARM_REASON_OSDS_DOWN) + reason = "/".join(list(reasons)) + if severity == fm_constants.FM_ALARM_SEVERITY_CRITICAL: + reason = "{} {}: {}".format( + fm_constants.ALARM_CRITICAL_REPLICATION, + osd_tree[group_id]['name'], + reason) + elif severity == fm_constants.FM_ALARM_SEVERITY_MAJOR: + reason = "{} {}: {}".format( + fm_constants.ALARM_MAJOR_REPLICATION, + osd_tree[group_id]['name'], + reason) + if len(degraded_hosts) == 0: + if len(hosts) < 2: + fn_report_alarm( + osd_tree[group_id]['name'], + "{} {}: {}".format( + fm_constants.ALARM_MAJOR_REPLICATION, + osd_tree[group_id]['name'], + constants.ALARM_REASON_PEER_HOST_DOWN), + fm_constants.FM_ALARM_SEVERITY_MAJOR) + elif len(degraded_hosts) == 1: + fn_report_alarm( + "{}.host={}".format( + osd_tree[group_id]['name'], + osd_tree[list(degraded_hosts)[0]]['name']), + reason, severity) + else: + fn_report_alarm( + osd_tree[group_id]['name'], + reason, severity) + + def _check_storage_tier(self, osd_tree, tier_name, fn_report_alarm): + for tier_id in osd_tree: + if osd_tree[tier_id]['type'] != 'root': + continue + if osd_tree[tier_id]['name'] != tier_name: + continue + for group_id in osd_tree[tier_id]['children']: + if osd_tree[group_id]['type'] != 'chassis': + continue + if not osd_tree[group_id]['name'].startswith('group-'): + continue + hosts = [] + osds = {} + for host_id in 
osd_tree[group_id]['children']: + if osd_tree[host_id]['type'] != 'host': + continue + hosts.append(host_id) + osds[host_id] = [] + for osd_id in osd_tree[host_id]['children']: + if osd_tree[osd_id]['type'] == 'osd': + osds[host_id].append(osd_id) + self._check_storage_group(osd_tree, group_id, hosts, + osds, fn_report_alarm) + break + + def _current_health_alarm_equals(self, reason, severity): + if not self.current_health_alarm: + return False + if getattr(self.current_health_alarm, 'severity', None) != severity: + return False + if getattr(self.current_health_alarm, 'reason_text', None) != reason: + return False + return True + + def _report_alarm_osds_health(self): + response, osd_tree = self.service.ceph_api.osd_tree(body='json') + if not response.ok: + LOG.error(_LE("Failed to retrieve Ceph OSD tree: " + "status_code: %(status_code)s, reason: %(reason)s") % + {"status_code": response.status_code, + "reason": response.reason}) + return + osd_tree = dict([(n['id'], n) for n in osd_tree['output']['nodes']]) + alarms = [] + + self._check_storage_tier(osd_tree, "storage-tier", + lambda *args: alarms.append(args)) + if self.cache_enabled: + self._check_storage_tier(osd_tree, "cache-tier", + lambda *args: alarms.append(args)) + + old_alarms = {} + for alarm_id in [ + fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR, + fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL]: + alarm_list = self.service.fm_api.get_faults_by_id(alarm_id) + if not alarm_list: + continue + for alarm in alarm_list: + if alarm.entity_instance_id not in old_alarms: + old_alarms[alarm.entity_instance_id] = [] + old_alarms[alarm.entity_instance_id].append( + (alarm.alarm_id, alarm.reason_text)) + + for peer_group, reason, severity in alarms: + if self._current_health_alarm_equals(reason, severity): + continue + alarm_critical_major = fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR + if severity == fm_constants.FM_ALARM_SEVERITY_CRITICAL: + alarm_critical_major = ( + fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL) + entity_instance_id = ( + self.service.entity_instance_id + '.peergroup=' + peer_group) + alarm_already_exists = False + if entity_instance_id in old_alarms: + for alarm_id, old_reason in old_alarms[entity_instance_id]: + if (reason == old_reason and + alarm_id == alarm_critical_major): + # if the alarm is exactly the same, we don't need + # to recreate it + old_alarms[entity_instance_id].remove( + (alarm_id, old_reason)) + alarm_already_exists = True + elif (alarm_id == alarm_critical_major): + # if we change just the reason, then we just remove the + # alarm from the list so we don't remove it at the + # end of the function + old_alarms[entity_instance_id].remove( + (alarm_id, old_reason)) + + if (len(old_alarms[entity_instance_id]) == 0): + del old_alarms[entity_instance_id] + + # in case the alarm is exactly the same, we skip the alarm set + if alarm_already_exists is True: + continue + major_repair_action = constants.REPAIR_ACTION_MAJOR_CRITICAL_ALARM + fault = fm_api.Fault( + alarm_id=alarm_critical_major, + alarm_type=fm_constants.FM_ALARM_TYPE_4, + alarm_state=fm_constants.FM_ALARM_STATE_SET, + entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER, + entity_instance_id=entity_instance_id, + severity=severity, + reason_text=reason, + probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15, + proposed_repair_action=major_repair_action, + service_affecting=constants.SERVICE_AFFECTING['HEALTH_WARN']) + alarm_uuid = self.service.fm_api.set_fault(fault) + if alarm_uuid: + LOG.info(_LI( + "Created storage alarm %(alarm_uuid)s - " 
+ "severity: %(severity)s, reason: %(reason)s, " + "service_affecting: %(service_affecting)s") % { + "alarm_uuid": str(alarm_uuid), + "severity": str(severity), + "reason": reason, + "service_affecting": str( + constants.SERVICE_AFFECTING['HEALTH_WARN'])}) + else: + LOG.error(_LE( + "Failed to create storage alarm - " + "severity: %(severity)s, reason: %(reason)s, " + "service_affecting: %(service_affecting)s") % { + "severity": str(severity), + "reason": reason, + "service_affecting": str( + constants.SERVICE_AFFECTING['HEALTH_WARN'])}) + + for entity_instance_id in old_alarms: + for alarm_id, old_reason in old_alarms[entity_instance_id]: + self.service.fm_api.clear_fault(alarm_id, entity_instance_id) + + @staticmethod + def _parse_reason(health): + """ Parse reason strings received from Ceph """ + if health['health'] in constants.CEPH_STATUS_CUSTOM: + # Don't parse reason messages that we added + return "Storage Alarm Condition: %(health)s. %(detail)s" % health + + reasons_lst = health['detail'].split(';') + + parsed_reasons_text = "" + + # Check if PGs have issues - we can't safely store the entire message + # as it tends to be long + for reason in reasons_lst: + if "pgs" in reason: + parsed_reasons_text += "PGs are degraded/stuck or undersized" + break + + # Extract recovery status + parsed_reasons = [r.strip() for r in reasons_lst if 'recovery' in r] + if parsed_reasons: + parsed_reasons_text += ";" + ";".join(parsed_reasons) + + # We need to keep the most important parts of the messages when storing + # them to fm alarms, therefore text between [] brackets is truncated if + # max size is reached. + + # Add brackets, if needed + if len(parsed_reasons_text): + lbracket = " [" + rbracket = "]" + else: + lbracket = "" + rbracket = "" + + msg = {"head": "Storage Alarm Condition: ", + "tail": ". 
Please check 'ceph -s' for more details."} + max_size = constants.FM_ALARM_REASON_MAX_SIZE - \ + len(msg["head"]) - len(msg["tail"]) + + return ( + msg['head'] + + (health['health'] + lbracket + parsed_reasons_text)[:max_size-1] + + rbracket + msg['tail']) + + def _report_fault(self, health, alarm_id): + if alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH: + new_severity = constants.SEVERITY[health['health']] + new_reason_text = self._parse_reason(health) + new_service_affecting = \ + constants.SERVICE_AFFECTING[health['health']] + + # Raise or update alarm if necessary + if ((not self.current_health_alarm) or + (self.current_health_alarm.__dict__['severity'] != + new_severity) or + (self.current_health_alarm.__dict__['reason_text'] != + new_reason_text) or + (self.current_health_alarm.__dict__['service_affecting'] != + str(new_service_affecting))): + + fault = fm_api.Fault( + alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH, + alarm_type=fm_constants.FM_ALARM_TYPE_4, + alarm_state=fm_constants.FM_ALARM_STATE_SET, + entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER, + entity_instance_id=self.service.entity_instance_id, + severity=new_severity, + reason_text=new_reason_text, + probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15, + proposed_repair_action=constants.REPAIR_ACTION, + service_affecting=new_service_affecting) + + alarm_uuid = self.service.fm_api.set_fault(fault) + if alarm_uuid: + LOG.info(_LI( + "Created storage alarm %(alarm_uuid)s - " + "severity: %(severity)s, reason: %(reason)s, " + "service_affecting: %(service_affecting)s") % { + "alarm_uuid": alarm_uuid, + "severity": new_severity, + "reason": new_reason_text, + "service_affecting": new_service_affecting}) + else: + LOG.error(_LE( + "Failed to create storage alarm - " + "severity: %(severity)s, reason: %(reason)s " + "service_affecting: %(service_affecting)s") % { + "severity": new_severity, + "reason": new_reason_text, + "service_affecting": new_service_affecting}) + + # Log detailed reason for later analysis + if (self.current_ceph_health != health['health'] or + self.detailed_health_reason != health['detail']): + LOG.info(_LI("Ceph status changed: %(health)s " + "detailed reason: %(detail)s") % health) + self.current_ceph_health = health['health'] + self.detailed_health_reason = health['detail'] + + elif (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE and + not health['tier_eid'] in self.current_quota_alarms): + + quota_reason_text = ("Quota/Space mismatch for the %s tier. The " + "sum of Ceph pool quotas does not match the " + "tier size." % health['tier_name']) + fault = fm_api.Fault( + alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE, + alarm_state=fm_constants.FM_ALARM_STATE_SET, + entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER, + entity_instance_id=health['tier_eid'], + severity=fm_constants.FM_ALARM_SEVERITY_MINOR, + reason_text=quota_reason_text, + alarm_type=fm_constants.FM_ALARM_TYPE_7, + probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_75, + proposed_repair_action=( + "Update ceph storage pool quotas to use all available " + "cluster space for the %s tier." % health['tier_name']), + service_affecting=False) + + alarm_uuid = self.service.fm_api.set_fault(fault) + if alarm_uuid: + LOG.info(_LI( + "Created storage quota storage alarm %(alarm_uuid)s. " + "Reason: %(reason)s") % { + "alarm_uuid": alarm_uuid, "reason": quota_reason_text}) + else: + LOG.error(_LE("Failed to create quota " + "storage alarm. 
Reason: %s") % quota_reason_text) + + def _clear_fault(self, alarm_id, entity_instance_id=None): + # Only clear alarm if there is one already raised + if (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH and + self.current_health_alarm): + LOG.info(_LI("Clearing health alarm")) + self.service.fm_api.clear_fault( + fm_constants.FM_ALARM_ID_STORAGE_CEPH, + self.service.entity_instance_id) + elif (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE and + entity_instance_id in self.current_quota_alarms): + LOG.info(_LI("Clearing quota alarm with entity_instance_id %s") + % entity_instance_id) + self.service.fm_api.clear_fault( + fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE, + entity_instance_id) + + def clear_critical_alarm(self, group_name): + alarm_list = self.service.fm_api.get_faults_by_id( + fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL) + if alarm_list: + for alarm in range(len(alarm_list)): + group_id = alarm_list[alarm].entity_instance_id.find("group-") + group_instance_name = ( + "group-" + + alarm_list[alarm].entity_instance_id[group_id + 6]) + if group_name == group_instance_name: + self.service.fm_api.clear_fault( + fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL, + alarm_list[alarm].entity_instance_id) + + def clear_all_major_critical(self, group_name=None): + # clear major alarms + alarm_list = self.service.fm_api.get_faults_by_id( + fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR) + if alarm_list: + for alarm in range(len(alarm_list)): + if group_name is not None: + group_id = ( + alarm_list[alarm].entity_instance_id.find("group-")) + group_instance_name = ( + "group-" + + alarm_list[alarm].entity_instance_id[group_id+6]) + if group_name == group_instance_name: + self.service.fm_api.clear_fault( + fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR, + alarm_list[alarm].entity_instance_id) + else: + self.service.fm_api.clear_fault( + fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR, + alarm_list[alarm].entity_instance_id) + # clear critical alarms + alarm_list = self.service.fm_api.get_faults_by_id( + fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL) + if alarm_list: + for alarm in range(len(alarm_list)): + if group_name is not None: + group_id = ( + alarm_list[alarm].entity_instance_id.find("group-")) + group_instance_name = ( + "group-" + + alarm_list[alarm].entity_instance_id[group_id + 6]) + if group_name == group_instance_name: + self.service.fm_api.clear_fault( + fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL, + alarm_list[alarm].entity_instance_id) + else: + self.service.fm_api.clear_fault( + fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL, + alarm_list[alarm].entity_instance_id) + + def _get_current_alarms(self): + """ Retrieve currently raised alarm """ + self.current_health_alarm = self.service.fm_api.get_fault( + fm_constants.FM_ALARM_ID_STORAGE_CEPH, + self.service.entity_instance_id) + quota_faults = self.service.fm_api.get_faults_by_id( + fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE) + if quota_faults: + self.current_quota_alarms = [f.entity_instance_id + for f in quota_faults] + else: + self.current_quota_alarms = [] diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/server.py b/ceph/ceph-manager/ceph-manager/ceph_manager/server.py new file mode 100644 index 000000000..9403a7c2c --- /dev/null +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/server.py @@ -0,0 +1,249 @@ +# vim: tabstop=4 shiftwidth=4 softtabstop=4 +# +# Copyright (c) 2016-2018 Wind River Systems, Inc. 
+# +# SPDX-License-Identifier: Apache-2.0 +# + +# https://chrigl.de/posts/2014/08/27/oslo-messaging-example.html +# http://docs.openstack.org/developer/oslo.messaging/server.html + +import sys + +# noinspection PyUnresolvedReferences +import eventlet +# noinspection PyUnresolvedReferences +import oslo_messaging as messaging +# noinspection PyUnresolvedReferences +from fm_api import fm_api +# noinspection PyUnresolvedReferences +from oslo_config import cfg +# noinspection PyUnresolvedReferences +from oslo_log import log as logging +# noinspection PyUnresolvedReferences +from oslo_service import service +# noinspection PyUnresolvedReferences +from oslo_service.periodic_task import PeriodicTasks +# noinspection PyUnresolvedReferences +from oslo_service import loopingcall + +from sysinv.conductor.cache_tiering_service_config import ServiceConfig + +# noinspection PyUnresolvedReferences +from cephclient import wrapper + +from monitor import Monitor +from cache_tiering import CacheTiering +import exception +import constants + +from i18n import _LI, _LW +from retrying import retry + +eventlet.monkey_patch(all=True) + +CONF = cfg.CONF +CONF.register_opts([ + cfg.StrOpt('sysinv_api_bind_ip', + default='0.0.0.0', + help='IP for the Ceph Manager server to bind to')]) +CONF.logging_default_format_string = ( + '%(asctime)s.%(msecs)03d %(process)d ' + '%(levelname)s %(name)s [-] %(message)s') +logging.register_options(CONF) +logging.setup(CONF, __name__) +LOG = logging.getLogger(__name__) +CONF.rpc_backend = 'rabbit' + + +class RpcEndpoint(PeriodicTasks): + + def __init__(self, service=None): + self.service = service + + def cache_tiering_enable_cache(self, _, new_config, applied_config): + LOG.info(_LI("Enabling cache")) + try: + self.service.cache_tiering.enable_cache( + new_config, applied_config) + except exception.CephManagerException as e: + self.service.sysinv_conductor.call( + {}, 'cache_tiering_enable_cache_complete', + success=False, exception=str(e.message), + new_config=new_config, applied_config=applied_config) + + def cache_tiering_disable_cache(self, _, new_config, applied_config): + LOG.info(_LI("Disabling cache")) + try: + self.service.cache_tiering.disable_cache( + new_config, applied_config) + except exception.CephManagerException as e: + self.service.sysinv_conductor.call( + {}, 'cache_tiering_disable_cache_complete', + success=False, exception=str(e.message), + new_config=new_config, applied_config=applied_config) + + def cache_tiering_operation_in_progress(self, _): + is_locked = self.service.cache_tiering.is_locked() + LOG.info(_LI("Cache tiering operation " + "is in progress: %s") % str(is_locked).lower()) + return is_locked + + def get_primary_tier_size(self, _): + """Get the ceph size for the primary tier. + + returns: an int for the size (in GB) of the tier + """ + + tiers_size = self.service.monitor.tiers_size + primary_tier_size = tiers_size.get( + self.service.monitor.primary_tier_name, 0) + LOG.debug(_LI("Ceph cluster primary tier size: %s GB") % + str(primary_tier_size)) + return primary_tier_size + + def get_tiers_size(self, _): + """Get the ceph cluster tier sizes. + + returns: a dict of sizes (in GB) by tier name + """ + + tiers_size = self.service.monitor.tiers_size + LOG.debug(_LI("Ceph cluster tiers (size in GB): %s") % + str(tiers_size)) + return tiers_size + + def is_cluster_up(self, _): + """Report if the last health check was successful. 
+ + This is an independent view of the cluster accessibility that can be + used by the sysinv conductor to gate ceph API calls which would timeout + and potentially block other operations. + + This view is only updated at the rate the monitor checks for a cluster + uuid or a health check (CEPH_HEALTH_CHECK_INTERVAL) + + returns: boolean True if last health check was successful else False + """ + return self.service.monitor.cluster_is_up + + +# This class is needed only when upgrading from 16.10 to 17.x +# TODO: remove it after 1st 17.x release +# +class SysinvConductorUpgradeApi(object): + def __init__(self): + self.sysinv_conductor = None + super(SysinvConductorUpgradeApi, self).__init__() + + def get_software_upgrade_status(self): + LOG.info(_LI("Getting software upgrade status from sysinv")) + cctxt = self.sysinv_conductor.prepare(timeout=2) + upgrade = cctxt.call({}, 'get_software_upgrade_status') + LOG.info(_LI("Software upgrade status: %s") % str(upgrade)) + return upgrade + + @retry(wait_fixed=1000, + retry_on_exception=lambda exception: + LOG.warn(_LW( + "Getting software upgrade status failed " + "with: %s. Retrying... ") % str(exception)) or True) + def retry_get_software_upgrade_status(self): + return self.get_software_upgrade_status() + + +class Service(SysinvConductorUpgradeApi, service.Service): + + def __init__(self, conf): + super(Service, self).__init__() + self.conf = conf + self.rpc_server = None + self.sysinv_conductor = None + self.ceph_api = None + self.entity_instance_id = '' + self.fm_api = fm_api.FaultAPIs() + self.monitor = Monitor(self) + self.cache_tiering = CacheTiering(self) + self.config = None + self.config_desired = None + self.config_applied = None + + def start(self): + super(Service, self).start() + transport = messaging.get_transport(self.conf) + self.sysinv_conductor = messaging.RPCClient( + transport, + messaging.Target( + topic=constants.SYSINV_CONDUCTOR_TOPIC)) + + self.ceph_api = wrapper.CephWrapper( + endpoint='http://localhost:5001/api/v0.1/') + + # Get initial config from sysinv and send it to + # services that need it before starting them + config = self.get_caching_tier_config() + self.monitor.setup(config) + self.rpc_server = messaging.get_rpc_server( + transport, + messaging.Target(topic=constants.CEPH_MANAGER_TOPIC, + server=self.conf.sysinv_api_bind_ip), + [RpcEndpoint(self)], + executor='eventlet') + self.rpc_server.start() + self.cache_tiering.set_initial_config(config) + eventlet.spawn_n(self.monitor.run) + periodic = loopingcall.FixedIntervalLoopingCall( + self.update_ceph_target_max_bytes) + periodic.start(interval=300) + + def get_caching_tier_config(self): + LOG.info("Getting cache tiering configuration from sysinv") + while True: + # Get initial configuration from sysinv, + # retry until sysinv starts + try: + cctxt = self.sysinv_conductor.prepare(timeout=2) + config = cctxt.call({}, 'cache_tiering_get_config') + for section in config: + if section == constants.CACHE_TIERING: + self.config = ServiceConfig().from_dict( + config[section]) + elif section == constants.CACHE_TIERING_DESIRED: + self.config_desired = ServiceConfig().from_dict( + config[section]) + elif section == constants.CACHE_TIERING_APPLIED: + self.config_applied = ServiceConfig().from_dict( + config[section]) + LOG.info("Cache tiering configs: {}".format(config)) + return config + except Exception as ex: + # In production we should retry on every error until connection + # is reestablished. + LOG.warn("Getting cache tiering configuration failed " + "with: {}. 
Retrying... ".format(str(ex))) + + def stop(self): + try: + self.rpc_server.stop() + self.rpc_server.wait() + except Exception: + pass + super(Service, self).stop() + + def update_ceph_target_max_bytes(self): + try: + self.cache_tiering.update_cache_target_max_bytes() + except Exception as ex: + LOG.exception("Updating Ceph target max bytes failed " + "with: {} retrying on next cycle.".format(str(ex))) + + +def run_service(): + CONF(sys.argv[1:]) + logging.setup(CONF, "ceph-manager") + launcher = service.launch(CONF, Service(CONF), workers=1) + launcher.wait() + + +if __name__ == "__main__": + run_service() diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/tests/__init__.py b/ceph/ceph-manager/ceph-manager/ceph_manager/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/tests/test_cache_flush.py b/ceph/ceph-manager/ceph-manager/ceph_manager/tests/test_cache_flush.py new file mode 100644 index 000000000..2fd265195 --- /dev/null +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/tests/test_cache_flush.py @@ -0,0 +1,309 @@ +# +# Copyright (c) 2016 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +import unittest +import mock + +import subprocess +import math + +from ..cache_tiering import CacheTiering +from ..cache_tiering import LOG as CT_LOG +from ..constants import CACHE_FLUSH_OBJECTS_THRESHOLD +from ..constants import CACHE_FLUSH_MIN_WAIT_OBJ_COUNT_DECREASE_SEC as MIN_WAIT +from ..constants import CACHE_FLUSH_MAX_WAIT_OBJ_COUNT_DECREASE_SEC as MAX_WAIT +from ..exception import CephCacheFlushFailure + + +class TestCacheFlush(unittest.TestCase): + + def setUp(self): + self.service = mock.Mock() + self.ceph_api = mock.Mock() + self.service.ceph_api = self.ceph_api + self.cache_tiering = CacheTiering(self.service) + + @mock.patch('subprocess.check_call') + def test_set_param_fail(self, mock_proc_call): + self.ceph_api.osd_set_pool_param = mock.Mock() + self.ceph_api.osd_set_pool_param.return_value = ( + mock.Mock(ok=False, status_code=500, reason='denied'), + {}) + self.cache_tiering.cache_flush({'pool_name': 'test'}) + mock_proc_call.assert_called_with( + ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) + + @mock.patch('subprocess.check_call') + def test_df_fail(self, mock_proc_call): + self.ceph_api.osd_set_pool_param = mock.Mock() + self.ceph_api.osd_set_pool_param.return_value = ( + mock.Mock(ok=True, status_code=200, reason='OK'), + {}) + self.ceph_api.df = mock.Mock() + self.ceph_api.df.return_value = ( + mock.Mock(ok=False, status_code=500, reason='denied'), + {}) + self.cache_tiering.cache_flush({'pool_name': 'test'}) + self.ceph_api.osd_set_pool_param.assert_called_once_with( + 'test-cache', 'target_max_objects', 1, force=None, body='json') + mock_proc_call.assert_called_with( + ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) + + @mock.patch('subprocess.check_call') + def test_rados_evict_fail_raises(self, mock_proc_call): + mock_proc_call.side_effect = subprocess.CalledProcessError(1, ['cmd']) + self.ceph_api.osd_set_pool_param = mock.Mock() + self.ceph_api.osd_set_pool_param.return_value = ( + mock.Mock(ok=False, status_code=500, reason='denied'), + {}) + self.assertRaises(CephCacheFlushFailure, + self.cache_tiering.cache_flush, + {'pool_name': 'test'}) + mock_proc_call.assert_called_with( + ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) + + @mock.patch('subprocess.check_call') + def test_df_missing_pool(self, mock_proc_call): + 
self.ceph_api.osd_set_pool_param = mock.Mock() + self.ceph_api.osd_set_pool_param.return_value = ( + mock.Mock(ok=True, status_code=200, reason='OK'), + {}) + self.ceph_api.df = mock.Mock() + self.ceph_api.df.return_value = ( + mock.Mock(ok=True, status_code=200, reason='OK'), + {'output': { + 'pools': [ + {'id': 0, + 'name': 'rbd', + 'stats': {'bytes_used': 0, + 'kb_used': 0, + 'max_avail': 9588428800, + 'objects': 0}}]}, + 'status': 'OK'}) + with mock.patch.object(CT_LOG, 'warn') as mock_lw: + self.cache_tiering.cache_flush({'pool_name': 'test'}) + self.ceph_api.df.assert_called_once_with(body='json') + for c in mock_lw.call_args_list: + if 'Missing pool free space' in c[0][0]: + break + else: + self.fail('expected log warning') + self.ceph_api.osd_set_pool_param.assert_called_once_with( + 'test-cache', 'target_max_objects', 1, force=None, body='json') + mock_proc_call.assert_called_with( + ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) + + @mock.patch('subprocess.check_call') + def test_df_objects_empty(self, mock_proc_call): + self.ceph_api.osd_set_pool_param = mock.Mock() + self.ceph_api.osd_set_pool_param.return_value = ( + mock.Mock(ok=True, status_code=200, reason='OK'), + {}) + self.ceph_api.df = mock.Mock() + self.ceph_api.df.return_value = ( + mock.Mock(ok=True, status_code=200, reason='OK'), + {'output': { + 'pools': [ + {'id': 0, + 'name': 'test-cache', + 'stats': {'bytes_used': 0, + 'kb_used': 0, + 'max_avail': 9588428800, + 'objects': 0}}]}, + 'status': 'OK'}) + self.cache_tiering.cache_flush({'pool_name': 'test'}) + self.ceph_api.df.assert_called_once_with(body='json') + self.ceph_api.osd_set_pool_param.assert_called_once_with( + 'test-cache', 'target_max_objects', 1, force=None, body='json') + mock_proc_call.assert_called_with( + ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) + + @mock.patch('time.sleep') + @mock.patch('subprocess.check_call') + def test_df_objects_above_threshold(self, mock_proc_call, mock_time_sleep): + self.ceph_api.osd_set_pool_param = mock.Mock() + self.ceph_api.osd_set_pool_param.return_value = ( + mock.Mock(ok=True, status_code=200, reason='OK'), + {}) + self.ceph_api.df = mock.Mock() + self.ceph_api.df.side_effect = [ + (mock.Mock(ok=True, status_code=200, reason='OK'), + {'output': { + 'pools': [ + {'id': 0, + 'name': 'test-cache', + 'stats': {'bytes_used': 0, + 'kb_used': 0, + 'max_avail': 9588428800, + 'objects': CACHE_FLUSH_OBJECTS_THRESHOLD}}]}, + 'status': 'OK'}), + (mock.Mock(ok=True, status_code=200, reason='OK'), + {'output': { + 'pools': [ + {'id': 0, + 'name': 'test-cache', + 'stats': {'bytes_used': 0, + 'kb_used': 0, + 'max_avail': 9588428800, + 'objects': + CACHE_FLUSH_OBJECTS_THRESHOLD - 1}}]}, + 'status': 'OK'})] + self.cache_tiering.cache_flush({'pool_name': 'test'}) + self.ceph_api.osd_set_pool_param.assert_called_once_with( + 'test-cache', 'target_max_objects', 1, force=None, body='json') + self.ceph_api.df.assert_called_with(body='json') + mock_time_sleep.assert_called_once_with(MIN_WAIT) + mock_proc_call.assert_called_with( + ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) + + @mock.patch('time.sleep') + @mock.patch('subprocess.check_call') + def test_df_objects_interval_increase(self, mock_proc_call, + mock_time_sleep): + self.ceph_api.osd_set_pool_param = mock.Mock() + self.ceph_api.osd_set_pool_param.return_value = ( + mock.Mock(ok=True, status_code=200, reason='OK'), + {}) + self.ceph_api.df = mock.Mock() + self.ceph_api.df.side_effect = [ + (mock.Mock(ok=True, 
status_code=200, reason='OK'), + {'output': { + 'pools': [ + {'id': 0, + 'name': 'test-cache', + 'stats': {'bytes_used': 0, + 'kb_used': 0, + 'max_avail': 9588428800, + 'objects': + CACHE_FLUSH_OBJECTS_THRESHOLD + 1}}]}, + 'status': 'OK'}), + (mock.Mock(ok=True, status_code=200, reason='OK'), + {'output': { + 'pools': [ + {'id': 0, + 'name': 'test-cache', + 'stats': {'bytes_used': 0, + 'kb_used': 0, + 'max_avail': 9588428800, + 'objects': + CACHE_FLUSH_OBJECTS_THRESHOLD + 1}}]}, + 'status': 'OK'}), + (mock.Mock(ok=True, status_code=200, reason='OK'), + {'output': { + 'pools': [ + {'id': 0, + 'name': 'test-cache', + 'stats': {'bytes_used': 0, + 'kb_used': 0, + 'max_avail': 9588428800, + 'objects': + CACHE_FLUSH_OBJECTS_THRESHOLD + 1}}]}, + 'status': 'OK'}), + (mock.Mock(ok=True, status_code=200, reason='OK'), + {'output': { + 'pools': [ + {'id': 0, + 'name': 'test-cache', + 'stats': {'bytes_used': 0, + 'kb_used': 0, + 'max_avail': 9588428800, + 'objects': + CACHE_FLUSH_OBJECTS_THRESHOLD - 1}}]}, + 'status': 'OK'})] + self.cache_tiering.cache_flush({'pool_name': 'test'}) + self.ceph_api.osd_set_pool_param.assert_called_once_with( + 'test-cache', 'target_max_objects', 1, force=None, body='json') + self.ceph_api.df.assert_called_with(body='json') + self.assertEqual([c[0][0] for c in mock_time_sleep.call_args_list], + [MIN_WAIT, + MIN_WAIT * 2, + MIN_WAIT * 4]) + mock_proc_call.assert_called_with( + ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) + + @mock.patch('time.sleep') + @mock.patch('subprocess.check_call') + def test_df_objects_allways_over_threshold(self, mock_proc_call, + mock_time_sleep): + self.ceph_api.osd_set_pool_param = mock.Mock() + self.ceph_api.osd_set_pool_param.return_value = ( + mock.Mock(ok=True, status_code=200, reason='OK'), + {}) + self.ceph_api.df = mock.Mock() + self.ceph_api.df.return_value = ( + mock.Mock(ok=True, status_code=200, reason='OK'), + {'output': { + 'pools': [ + {'id': 0, + 'name': 'test-cache', + 'stats': {'bytes_used': 0, + 'kb_used': 0, + 'max_avail': 9588428800, + 'objects': + CACHE_FLUSH_OBJECTS_THRESHOLD + 1}}]}, + 'status': 'OK'}) + # noinspection PyTypeChecker + mock_time_sleep.side_effect = \ + [None]*int(math.ceil(math.log(float(MAX_WAIT)/MIN_WAIT, 2)) + 1) \ + + [Exception('too many sleeps')] + self.cache_tiering.cache_flush({'pool_name': 'test'}) + self.ceph_api.osd_set_pool_param.assert_called_once_with( + 'test-cache', 'target_max_objects', 1, force=None, body='json') + self.ceph_api.df.assert_called_with(body='json') + expected_sleep = [] + interval = MIN_WAIT + while interval <= MAX_WAIT: + expected_sleep.append(interval) + interval *= 2 + self.assertEqual([c[0][0] for c in mock_time_sleep.call_args_list], + expected_sleep) + mock_proc_call.assert_called_with( + ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) + + @mock.patch('time.sleep') + @mock.patch('subprocess.check_call') + def test_df_objects_increase(self, mock_proc_call, mock_time_sleep): + self.ceph_api.osd_set_pool_param = mock.Mock() + self.ceph_api.osd_set_pool_param.return_value = ( + mock.Mock(ok=True, status_code=200, reason='OK'), + {}) + self.ceph_api.df = mock.Mock() + self.ceph_api.df.side_effect = [ + (mock.Mock(ok=True, status_code=200, reason='OK'), + {'output': { + 'pools': [ + {'id': 0, + 'name': 'test-cache', + 'stats': {'bytes_used': 0, + 'kb_used': 0, + 'max_avail': 9588428800, + 'objects': + CACHE_FLUSH_OBJECTS_THRESHOLD + 1}}]}, + 'status': 'OK'}), + (mock.Mock(ok=True, status_code=200, reason='OK'), + {'output': { + 'pools': 
[ + {'id': 0, + 'name': 'test-cache', + 'stats': {'bytes_used': 0, + 'kb_used': 0, + 'max_avail': 9588428800, + 'objects': + CACHE_FLUSH_OBJECTS_THRESHOLD + 2}}]}, + 'status': 'OK'})] + with mock.patch.object(CT_LOG, 'warn') as mock_lw: + self.cache_tiering.cache_flush({'pool_name': 'test'}) + for c in mock_lw.call_args_list: + if 'Unexpected increase' in c[0][0]: + break + else: + self.fail('expected log warning') + self.ceph_api.df.assert_called_with(body='json') + mock_time_sleep.assert_called_once_with(MIN_WAIT) + self.ceph_api.osd_set_pool_param.assert_called_once_with( + 'test-cache', 'target_max_objects', 1, force=None, body='json') + mock_proc_call.assert_called_with( + ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) diff --git a/ceph/ceph-manager/ceph-manager/setup.py b/ceph/ceph-manager/ceph-manager/setup.py new file mode 100644 index 000000000..40cf5012b --- /dev/null +++ b/ceph/ceph-manager/ceph-manager/setup.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# +# Copyright (c) 2013-2014, 2016 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +import setuptools + +setuptools.setup( + name='ceph_manager', + version='1.0.0', + description='CEPH manager', + license='Apache-2.0', + packages=['ceph_manager'], + entry_points={ + } +) diff --git a/ceph/ceph-manager/ceph-manager/test-requirements.txt b/ceph/ceph-manager/ceph-manager/test-requirements.txt new file mode 100644 index 000000000..1fdf20563 --- /dev/null +++ b/ceph/ceph-manager/ceph-manager/test-requirements.txt @@ -0,0 +1,10 @@ +# The order of packages is significant, because pip processes them in the order +# of appearance. Changing the order has an impact on the overall integration +# process, which may cause wedges in the gate later. + +mock +flake8 +eventlet +pytest +oslo.log +oslo.i18n \ No newline at end of file diff --git a/ceph/ceph-manager/ceph-manager/tox.ini b/ceph/ceph-manager/ceph-manager/tox.ini new file mode 100644 index 000000000..41d3854b2 --- /dev/null +++ b/ceph/ceph-manager/ceph-manager/tox.ini @@ -0,0 +1,29 @@ +# adapted from glance tox.ini + +[tox] +minversion = 1.6 +envlist = py27,pep8 +skipsdist = True +# tox does not work if the path to the workdir is too long, so move it to /tmp +toxworkdir = /tmp/{env:USER}_ceph_manager_tox + +[testenv] +setenv = VIRTUAL_ENV={envdir} +usedevelop = True +install_command = pip install --no-use-wheel -U --force-reinstall {opts} {packages} +deps = -r{toxinidir}/test-requirements.txt +commands = py.test {posargs} +whitelist_externals = bash +passenv = http_proxy HTTP_PROXY https_proxy HTTPS_PROXY no_proxy NO_PROXY + +[testenv:py27] +basepython = python2.7 +setenv = + PYTHONPATH={toxinidir}/../../../../sysinv/recipes-common/sysinv/sysinv:{toxinidir}/../../../../config/recipes-common/tsconfig/tsconfig + +[testenv:pep8] +commands = + flake8 {posargs} + +[flake8] +exclude = .venv,.git,.tox,dist,doc,etc,*glance/locale*,*lib/python*,*egg,build diff --git a/ceph/ceph-manager/files/ceph-manager.logrotate b/ceph/ceph-manager/files/ceph-manager.logrotate new file mode 100644 index 000000000..8d7a16ab1 --- /dev/null +++ b/ceph/ceph-manager/files/ceph-manager.logrotate @@ -0,0 +1,11 @@ +/var/log/ceph-manager.log { + nodateext + size 10M + start 1 + rotate 10 + missingok + notifempty + compress + delaycompress + copytruncate +} diff --git a/ceph/ceph-manager/files/ceph-manager.service b/ceph/ceph-manager/files/ceph-manager.service new file mode 100644 index 000000000..e8bf26cf9 --- /dev/null +++ b/ceph/ceph-manager/files/ceph-manager.service @@ 
-0,0 +1,17 @@
+[Unit]
+Description=Handle Ceph API calls and provide status updates via alarms
+After=ceph.target
+
+[Service]
+Type=forking
+Restart=no
+KillMode=process
+RemainAfterExit=yes
+ExecStart=/etc/rc.d/init.d/ceph-manager start
+ExecStop=/etc/rc.d/init.d/ceph-manager stop
+ExecReload=/etc/rc.d/init.d/ceph-manager reload
+PIDFile=/var/run/ceph/ceph-manager.pid
+
+[Install]
+WantedBy=multi-user.target
+
diff --git a/ceph/ceph-manager/scripts/bin/ceph-manager b/ceph/ceph-manager/scripts/bin/ceph-manager
new file mode 100644
index 000000000..9aa4330db
--- /dev/null
+++ b/ceph/ceph-manager/scripts/bin/ceph-manager
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2016 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+
+import sys
+
+try:
+    from ceph_manager.server import run_service
+except EnvironmentError as e:
+    print >> sys.stderr, "Error importing ceph_manager: ", str(e)
+    sys.exit(1)
+
+run_service()
diff --git a/ceph/ceph-manager/scripts/init.d/ceph-manager b/ceph/ceph-manager/scripts/init.d/ceph-manager
new file mode 100644
index 000000000..88bdddfb8
--- /dev/null
+++ b/ceph/ceph-manager/scripts/init.d/ceph-manager
@@ -0,0 +1,103 @@
+#!/bin/sh
+#
+# Copyright (c) 2013-2014, 2016 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+
+### BEGIN INIT INFO
+# Provides: ceph-manager
+# Required-Start: $ceph
+# Required-Stop: $ceph
+# Default-Start: 2 3 4 5
+# Default-Stop: 0 1 6
+# Short-Description: Daemon for polling ceph status
+# Description: Daemon for polling ceph status
+### END INIT INFO
+
+DESC="ceph-manager"
+DAEMON="/usr/bin/ceph-manager"
+RUNDIR="/var/run/ceph"
+PIDFILE=$RUNDIR/$DESC.pid
+
+CONFIGFILE="/etc/sysinv/sysinv.conf"
+LOGFILE="/var/log/ceph-manager.log"
+
+start()
+{
+    if [ -e $PIDFILE ]; then
+        PIDDIR=/prod/$(cat $PIDFILE)
+        if [ -d ${PIDFILE} ]; then
+            echo "$DESC already running."
+            exit 0
+        else
+            echo "Removing stale PID file $PIDFILE"
+            rm -f $PIDFILE
+        fi
+    fi
+
+    echo -n "Starting $DESC..."
+    mkdir -p $RUNDIR
+    start-stop-daemon --start --quiet \
+        --pidfile ${PIDFILE} --exec ${DAEMON} \
+        --make-pidfile --background \
+        -- --log-file=$LOGFILE --config-file=$CONFIGFILE
+
+    if [ $? -eq 0 ]; then
+        echo "done."
+    else
+        echo "failed."
+        exit 1
+    fi
+}
+
+stop()
+{
+    echo -n "Stopping $DESC..."
+    start-stop-daemon --stop --quiet --pidfile $PIDFILE --retry 60
+    if [ $? -eq 0 ]; then
+        echo "done."
+    else
+        echo "failed."
+    fi
+    rm -f $PIDFILE
+}
+
+status()
+{
+    pid=`cat $PIDFILE 2>/dev/null`
+    if [ -n "$pid" ]; then
+        if ps -p $pid &> /dev/null ; then
+            echo "$DESC is running"
+            exit 0
+        else
+            echo "$DESC is not running but has pid file"
+            exit 1
+        fi
+    fi
+    echo "$DESC is not running"
+    exit 3
+}
+
+case "$1" in
+    start)
+        start
+        ;;
+    stop)
+        stop
+        ;;
+    restart|force-reload|reload)
+        stop
+        start
+        ;;
+    status)
+        status
+        ;;
+    *)
+        echo "Usage: $0 {start|stop|force-reload|restart|reload|status}"
+        exit 1
+        ;;
+esac
+
+exit 0

From 084be04d1c8eb3697f3a92bdf1aa2e204324f7d3 Mon Sep 17 00:00:00 2001
From: Ovidiu Poncea
Date: Wed, 25 Apr 2018 01:01:01 +0300
Subject: [PATCH 06/26] Fix periodic thread that monitors Ceph

A timeout between ceph-manager and sysinv is causing ceph-manager to
stop responding. When ceph-manager detects that the 'require_jewel_osds'
flag needs to be set, it queries sysinv, which for whatever reason (most
likely while handling some final operations) fails to respond in a
reasonable amount of time. This causes an exception in ceph-manager that
breaks the execution of one of its periodic threads.
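For illustration, the resilience pattern this fix applies can be sketched
in isolation as follows. This is a minimal, hypothetical Python sketch
(poll_forever, poll_once and POLL_INTERVAL are placeholder names, not the
actual ceph-manager code); the actual change in the diff below applies the
same idea by wrapping ceph_get_fsid(), ceph_poll_status() and
ceph_poll_quotas() in try/except inside Monitor.run(), so a transient
failure no longer terminates the monitoring loop.

import logging
import time

LOG = logging.getLogger(__name__)
POLL_INTERVAL = 60  # placeholder; ceph-manager uses CEPH_HEALTH_CHECK_INTERVAL


def poll_forever(poll_once, interval=POLL_INTERVAL):
    """Call poll_once() periodically, surviving transient failures."""
    while True:
        try:
            poll_once()
        except Exception:
            # Without this guard a single failed call (e.g. a sysinv RPC
            # timeout) would propagate and terminate the periodic thread.
            LOG.exception("Poll failed, retrying in %ss", interval)
        time.sleep(interval)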
Change-Id: If49f5ffbce4aeac3d50d52f526d1ce905be3cecb Signed-off-by: Kristine Bujold Signed-off-by: Scott Little --- .../ceph-manager/ceph_manager/monitor.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py b/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py index 941e5fc03..51308240e 100644 --- a/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py @@ -112,7 +112,7 @@ class HandleUpgradesMixin(object): "Getting software upgrade status failed " "with: %s. Skip auto-heal attempt " "(will retry on next ceph status poll).") % str(ex)) - return + return health state = upgrade.get('state') # surpress require_jewel_osds in case upgrade is # in progress but not completed or aborting @@ -181,15 +181,23 @@ class Monitor(HandleUpgradesMixin): def run(self): # Wait until Ceph cluster is up and we can get the fsid while True: - self.ceph_get_fsid() + try: + self.ceph_get_fsid() + except Exception: + LOG.exception("Error getting fsid, " + "will retry in %ss" % constants.CEPH_HEALTH_CHECK_INTERVAL) if self.service.entity_instance_id: break time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL) # Start monitoring ceph status while True: - self.ceph_poll_status() - self.ceph_poll_quotas() + try: + self.ceph_poll_status() + self.ceph_poll_quotas() + except Exception: + LOG.exception("Error running periodic monitoring of ceph status, " + "will retry in %ss" % constants.CEPH_HEALTH_CHECK_INTERVAL) time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL) def ceph_get_fsid(self): From 11bcebdaaddd4a004fdd6cd3b8d2a56ef7958c80 Mon Sep 17 00:00:00 2001 From: Robert Church Date: Wed, 4 Apr 2018 13:28:31 -0500 Subject: [PATCH 07/26] Remove Ceph Cache Tiering Ceph Cache Tiering feature is not supported anymore. This commit removes all the code changes associated with the no longer supported Ceph Cache Tiering. This implies: a. cache tiering cannot be configured on system b. no ceph-caching host could be added c. no ceph-backing host could be added d. ceph-caching/ceph-backing personality sub-type won't show up when 'system host-show'/system host-add command is issued e. ceph-caching/ceph-backing personality sub-type won't show up when host is added/listed from horizon Change-Id: Ic8fe14b7ceca5677f560bd2bce8c70a5c5597a7b Signed-off-by: Jack Ding Signed-off-by: Scott Little --- .../ceph_manager/cache_tiering.py | 705 ------------------ .../ceph-manager/ceph_manager/ceph.py | 13 +- .../ceph-manager/ceph_manager/constants.py | 19 +- .../ceph-manager/ceph_manager/exception.py | 54 +- .../ceph-manager/ceph_manager/monitor.py | 23 - .../ceph-manager/ceph_manager/server.py | 71 -- .../ceph_manager/tests/test_cache_flush.py | 309 -------- 7 files changed, 6 insertions(+), 1188 deletions(-) delete mode 100644 ceph/ceph-manager/ceph-manager/ceph_manager/cache_tiering.py delete mode 100644 ceph/ceph-manager/ceph-manager/ceph_manager/tests/test_cache_flush.py diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/cache_tiering.py b/ceph/ceph-manager/ceph-manager/ceph_manager/cache_tiering.py deleted file mode 100644 index 4e814c3b0..000000000 --- a/ceph/ceph-manager/ceph-manager/ceph_manager/cache_tiering.py +++ /dev/null @@ -1,705 +0,0 @@ -# -# Copyright (c) 2016 Wind River Systems, Inc. 
-# -# SPDX-License-Identifier: Apache-2.0 -# - -import copy -import contextlib -import functools -import math -import subprocess -import time -import traceback -# noinspection PyUnresolvedReferences -import eventlet -# noinspection PyUnresolvedReferences -from eventlet.semaphore import Semaphore -# noinspection PyUnresolvedReferences -from oslo_log import log as logging -# noinspection PyUnresolvedReferences -from sysinv.conductor.cache_tiering_service_config import ServiceConfig - -from i18n import _LI, _LW, _LE - -import constants -import exception -import ceph - -LOG = logging.getLogger(__name__) -CEPH_POOLS = copy.deepcopy(constants.CEPH_POOLS) - -MAX_WAIT = constants.CACHE_FLUSH_MAX_WAIT_OBJ_COUNT_DECREASE_SEC -MIN_WAIT = constants.CACHE_FLUSH_MIN_WAIT_OBJ_COUNT_DECREASE_SEC - - -class LockOwnership(object): - def __init__(self, sem): - self.sem = sem - - @contextlib.contextmanager - def __call__(self): - try: - yield - finally: - if self.sem: - self.sem.release() - - def transfer(self): - new_lo = LockOwnership(self.sem) - self.sem = None - return new_lo - - -class Lock(object): - - def __init__(self): - self.sem = Semaphore(value=1) - - def try_lock(self): - result = self.sem.acquire(blocking=False) - if result: - return LockOwnership(self.sem) - - -class CacheTiering(object): - - def __init__(self, service): - self.service = service - self.lock = Lock() - # will be unlocked by set_initial_config() - self._init_config_lock = self.lock.try_lock() - self.config = None - self.config_desired = None - self.config_applied = None - self.target_max_bytes = {} - - def set_initial_config(self, config): - with self._init_config_lock(): - LOG.info("Setting Ceph cache tiering initial configuration") - self.config = ServiceConfig.from_dict( - config.get(constants.CACHE_TIERING, {})) or \ - ServiceConfig() - self.config_desired = ServiceConfig.from_dict( - config.get(constants.CACHE_TIERING_DESIRED, {})) or \ - ServiceConfig() - self.config_applied = ServiceConfig.from_dict( - config.get(constants.CACHE_TIERING_APPLIED, {})) or \ - ServiceConfig() - if self.config_desired: - LOG.debug("set_initial_config config_desired %s " % - self.config_desired.to_dict()) - if self.config_applied: - LOG.debug("set_initial_config config_applied %s " % - self.config_applied.to_dict()) - - # Check that previous caching tier operation completed - # successfully or perform recovery - if (self.config_desired and - self.config_applied and - (self.config_desired.cache_enabled != - self.config_applied.cache_enabled)): - if self.config_desired.cache_enabled: - self.enable_cache(self.config_desired.to_dict(), - self.config_applied.to_dict(), - self._init_config_lock.transfer()) - else: - self.disable_cache(self.config_desired.to_dict(), - self.config_applied.to_dict(), - self._init_config_lock.transfer()) - - def is_locked(self): - lock_ownership = self.lock.try_lock() - if not lock_ownership: - return True - with lock_ownership(): - return False - - def update_pools_info(self): - global CEPH_POOLS - cfg = self.service.sysinv_conductor.call( - {}, 'get_ceph_pools_config') - CEPH_POOLS = copy.deepcopy(cfg) - LOG.info(_LI("update_pools_info: pools: {}").format(CEPH_POOLS)) - - def enable_cache(self, new_config, applied_config, lock_ownership=None): - new_config = ServiceConfig.from_dict(new_config) - applied_config = ServiceConfig.from_dict(applied_config) - if not lock_ownership: - lock_ownership = self.lock.try_lock() - if not lock_ownership: - raise exception.CephCacheEnableFailure() - with lock_ownership(): - 
eventlet.spawn(self.do_enable_cache, - new_config, applied_config, - lock_ownership.transfer()) - - def do_enable_cache(self, new_config, applied_config, lock_ownership): - LOG.info(_LI("cache_tiering_enable_cache: " - "new_config={}, applied_config={}").format( - new_config.to_dict(), applied_config.to_dict())) - _unwind_actions = [] - with lock_ownership(): - success = False - _exception = None - try: - self.config_desired.cache_enabled = True - self.update_pools_info() - for pool in CEPH_POOLS: - if (pool['pool_name'] == - constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or - pool['pool_name'] == - constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): - object_pool_name = \ - self.service.monitor._get_object_pool_name() - pool['pool_name'] = object_pool_name - - self.cache_pool_create(pool) - _unwind_actions.append( - functools.partial(self.cache_pool_delete, pool)) - for pool in CEPH_POOLS: - if (pool['pool_name'] == - constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or - pool['pool_name'] == - constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): - object_pool_name = \ - self.service.monitor._get_object_pool_name() - pool['pool_name'] = object_pool_name - - self.cache_tier_add(pool) - _unwind_actions.append( - functools.partial(self.cache_tier_remove, pool)) - for pool in CEPH_POOLS: - if (pool['pool_name'] == - constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or - pool['pool_name'] == - constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): - object_pool_name = \ - self.service.monitor._get_object_pool_name() - pool['pool_name'] = object_pool_name - - self.cache_mode_set(pool, 'writeback') - self.cache_pool_set_config(pool, new_config) - self.cache_overlay_create(pool) - success = True - except Exception as e: - LOG.error(_LE('Failed to enable cache: reason=%s') % - traceback.format_exc()) - for action in reversed(_unwind_actions): - try: - action() - except Exception: - LOG.warn(_LW('Failed cache enable ' - 'unwind action: reason=%s') % - traceback.format_exc()) - success = False - _exception = str(e) - finally: - self.service.monitor.monitor_check_cache_tier(success) - if success: - self.config_applied.cache_enabled = True - self.service.sysinv_conductor.call( - {}, 'cache_tiering_enable_cache_complete', - success=success, exception=_exception, - new_config=new_config.to_dict(), - applied_config=applied_config.to_dict()) - # Run first update of periodic target_max_bytes - self.update_cache_target_max_bytes() - - @contextlib.contextmanager - def ignore_ceph_failure(self): - try: - yield - except exception.CephManagerException: - pass - - def disable_cache(self, new_config, applied_config, lock_ownership=None): - new_config = ServiceConfig.from_dict(new_config) - applied_config = ServiceConfig.from_dict(applied_config) - if not lock_ownership: - lock_ownership = self.lock.try_lock() - if not lock_ownership: - raise exception.CephCacheDisableFailure() - with lock_ownership(): - eventlet.spawn(self.do_disable_cache, - new_config, applied_config, - lock_ownership.transfer()) - - def do_disable_cache(self, new_config, applied_config, lock_ownership): - LOG.info(_LI("cache_tiering_disable_cache: " - "new_config={}, applied_config={}").format( - new_config, applied_config)) - with lock_ownership(): - success = False - _exception = None - try: - self.config_desired.cache_enabled = False - for pool in CEPH_POOLS: - if (pool['pool_name'] == - constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or - pool['pool_name'] == - constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): - object_pool_name = \ - 
self.service.monitor._get_object_pool_name() - pool['pool_name'] = object_pool_name - - with self.ignore_ceph_failure(): - self.cache_mode_set( - pool, 'forward') - - for pool in CEPH_POOLS: - if (pool['pool_name'] == - constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or - pool['pool_name'] == - constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): - object_pool_name = \ - self.service.monitor._get_object_pool_name() - pool['pool_name'] = object_pool_name - - retries_left = 3 - while True: - try: - self.cache_flush(pool) - break - except exception.CephCacheFlushFailure: - retries_left -= 1 - if not retries_left: - # give up - break - else: - time.sleep(1) - for pool in CEPH_POOLS: - if (pool['pool_name'] == - constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or - pool['pool_name'] == - constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): - object_pool_name = \ - self.service.monitor._get_object_pool_name() - pool['pool_name'] = object_pool_name - - with self.ignore_ceph_failure(): - self.cache_overlay_delete(pool) - self.cache_tier_remove(pool) - for pool in CEPH_POOLS: - if (pool['pool_name'] == - constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or - pool['pool_name'] == - constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER): - object_pool_name = \ - self.service.monitor._get_object_pool_name() - pool['pool_name'] = object_pool_name - - with self.ignore_ceph_failure(): - self.cache_pool_delete(pool) - success = True - except Exception as e: - LOG.warn(_LE('Failed to disable cache: reason=%s') % - traceback.format_exc()) - _exception = str(e) - finally: - self.service.monitor.monitor_check_cache_tier(False) - if success: - self.config_desired.cache_enabled = False - self.config_applied.cache_enabled = False - self.service.sysinv_conductor.call( - {}, 'cache_tiering_disable_cache_complete', - success=success, exception=_exception, - new_config=new_config.to_dict(), - applied_config=applied_config.to_dict()) - - def get_pool_pg_num(self, pool_name): - return self.service.sysinv_conductor.call( - {}, 'get_pool_pg_num', - pool_name=pool_name) - - def cache_pool_create(self, pool): - backing_pool = pool['pool_name'] - cache_pool = backing_pool + '-cache' - pg_num = self.get_pool_pg_num(cache_pool) - if not ceph.osd_pool_exists(self.service.ceph_api, cache_pool): - ceph.osd_pool_create( - self.service.ceph_api, cache_pool, - pg_num, pg_num) - - def cache_pool_delete(self, pool): - cache_pool = pool['pool_name'] + '-cache' - ceph.osd_pool_delete( - self.service.ceph_api, cache_pool) - - def cache_tier_add(self, pool): - backing_pool = pool['pool_name'] - cache_pool = backing_pool + '-cache' - response, body = self.service.ceph_api.osd_tier_add( - backing_pool, cache_pool, - force_nonempty="--force-nonempty", - body='json') - if response.ok: - LOG.info(_LI("Added OSD tier: " - "backing_pool={}, cache_pool={}").format( - backing_pool, cache_pool)) - else: - e = exception.CephPoolAddTierFailure( - backing_pool=backing_pool, - cache_pool=cache_pool, - response_status_code=response.status_code, - response_reason=response.reason, - status=body.get('status'), - output=body.get('output')) - LOG.warn(e) - raise e - - def cache_tier_remove(self, pool): - backing_pool = pool['pool_name'] - cache_pool = backing_pool + '-cache' - response, body = self.service.ceph_api.osd_tier_remove( - backing_pool, cache_pool, body='json') - if response.ok: - LOG.info(_LI("Removed OSD tier: " - "backing_pool={}, cache_pool={}").format( - backing_pool, cache_pool)) - else: - e = exception.CephPoolRemoveTierFailure( - backing_pool=backing_pool, - 
cache_pool=cache_pool, - response_status_code=response.status_code, - response_reason=response.reason, - status=body.get('status'), - output=body.get('output')) - LOG.warn(e) - raise e - - def cache_mode_set(self, pool, mode): - backing_pool = pool['pool_name'] - cache_pool = backing_pool + '-cache' - response, body = self.service.ceph_api.osd_tier_cachemode( - cache_pool, mode, body='json') - if response.ok: - LOG.info(_LI("Set OSD tier cache mode: " - "cache_pool={}, mode={}").format(cache_pool, mode)) - else: - e = exception.CephCacheSetModeFailure( - cache_pool=cache_pool, - mode=mode, - response_status_code=response.status_code, - response_reason=response.reason, - status=body.get('status'), - output=body.get('output')) - LOG.warn(e) - raise e - - def cache_pool_set_config(self, pool, config): - for name, value in config.params.iteritems(): - self.cache_pool_set_param(pool, name, value) - - def cache_pool_set_param(self, pool, name, value): - backing_pool = pool['pool_name'] - cache_pool = backing_pool + '-cache' - ceph.osd_set_pool_param( - self.service.ceph_api, cache_pool, name, value) - - def cache_overlay_create(self, pool): - backing_pool = pool['pool_name'] - cache_pool = backing_pool + '-cache' - response, body = self.service.ceph_api.osd_tier_set_overlay( - backing_pool, cache_pool, body='json') - if response.ok: - LOG.info(_LI("Set OSD tier overlay: " - "backing_pool={}, cache_pool={}").format( - backing_pool, cache_pool)) - else: - e = exception.CephCacheCreateOverlayFailure( - backing_pool=backing_pool, - cache_pool=cache_pool, - response_status_code=response.status_code, - response_reason=response.reason, - status=body.get('status'), - output=body.get('output')) - LOG.warn(e) - raise e - - def cache_overlay_delete(self, pool): - backing_pool = pool['pool_name'] - cache_pool = pool['pool_name'] - response, body = self.service.ceph_api.osd_tier_remove_overlay( - backing_pool, body='json') - if response.ok: - LOG.info(_LI("Removed OSD tier overlay: " - "backing_pool={}").format(backing_pool)) - else: - e = exception.CephCacheDeleteOverlayFailure( - backing_pool=backing_pool, - cache_pool=cache_pool, - response_status_code=response.status_code, - response_reason=response.reason, - status=body.get('status'), - output=body.get('output')) - LOG.warn(e) - raise e - - @staticmethod - def rados_cache_flush_evict_all(pool): - backing_pool = pool['pool_name'] - cache_pool = backing_pool + '-cache' - try: - subprocess.check_call( - ['/usr/bin/rados', '-p', cache_pool, 'cache-flush-evict-all']) - LOG.info(_LI("Flushed OSD cache pool:" - "cache_pool={}").format(cache_pool)) - except subprocess.CalledProcessError as e: - _e = exception.CephCacheFlushFailure( - cache_pool=cache_pool, - return_code=str(e.returncode), - cmd=" ".join(e.cmd), - output=e.output) - LOG.warn(_e) - raise _e - - def cache_flush(self, pool): - backing_pool = pool['pool_name'] - cache_pool = backing_pool + '-cache' - try: - # set target_max_objects to a small value to force evacuation of - # objects from cache before we use rados cache-flush-evict-all - # WARNING: assuming cache_pool will be deleted after flush so - # we don't have to save/restore the value of target_max_objects - # - self.cache_pool_set_param(pool, 'target_max_objects', 1) - prev_object_count = None - wait_interval = MIN_WAIT - while True: - response, body = self.service.ceph_api.df(body='json') - if not response.ok: - LOG.warn(_LW( - "Failed to retrieve cluster free space stats: " - "status_code=%d, reason=%s") % ( - response.status_code, 
response.reason)) - break - stats = None - for s in body['output']['pools']: - if s['name'] == cache_pool: - stats = s['stats'] - break - if not stats: - LOG.warn(_LW("Missing pool free space stats: " - "cache_pool=%s") % cache_pool) - break - object_count = stats['objects'] - if object_count < constants.CACHE_FLUSH_OBJECTS_THRESHOLD: - break - if prev_object_count is not None: - delta_objects = object_count - prev_object_count - if delta_objects > 0: - LOG.warn(_LW("Unexpected increase in number " - "of objects in cache pool: " - "cache_pool=%s, prev_object_count=%d, " - "object_count=%d") % ( - cache_pool, prev_object_count, - object_count)) - break - if delta_objects == 0: - wait_interval *= 2 - if wait_interval > MAX_WAIT: - LOG.warn(_LW( - "Cache pool number of objects did not " - "decrease: cache_pool=%s, object_count=%d, " - "wait_interval=%d") % ( - cache_pool, object_count, wait_interval)) - break - else: - wait_interval = MIN_WAIT - time.sleep(wait_interval) - prev_object_count = object_count - except exception.CephPoolSetParamFailure as e: - LOG.warn(e) - finally: - self.rados_cache_flush_evict_all(pool) - - def update_cache_target_max_bytes(self): - "Dynamically compute target_max_bytes of caching pools" - - # Only compute if cache tiering is enabled - if self.config_applied and self.config_desired: - if (not self.config_desired.cache_enabled or - not self.config_applied.cache_enabled): - LOG.debug("Cache tiering disabled, no need to update " - "target_max_bytes.") - return - LOG.debug("Updating target_max_bytes") - - # Get available space - response, body = self.service.ceph_api.osd_df(body='json', - output_method='tree') - if not response.ok: - LOG.warn(_LW( - "Failed to retrieve cluster free space stats: " - "status_code=%d, reason=%s") % ( - response.status_code, response.reason)) - return - - storage_tier_size = 0 - cache_tier_size = 0 - - replication = constants.CEPH_REPLICATION_FACTOR - for node in body['output']['nodes']: - if node['name'] == 'storage-tier': - storage_tier_size = node['kb']*1024/replication - elif node['name'] == 'cache-tier': - cache_tier_size = node['kb']*1024/replication - - if storage_tier_size == 0 or cache_tier_size == 0: - LOG.info("Failed to get cluster size " - "(storage_tier_size=%s, cache_tier_size=%s)," - "retrying on next cycle" % - (storage_tier_size, cache_tier_size)) - return - - # Get available pools - response, body = self.service.ceph_api.osd_lspools(body='json') - if not response.ok: - LOG.warn(_LW( - "Failed to retrieve available pools: " - "status_code=%d, reason=%s") % ( - response.status_code, response.reason)) - return - pools = [p['poolname'] for p in body['output']] - - # Separate backing from caching for easy iteration - backing_pools = [] - caching_pools = [] - for p in pools: - if p.endswith('-cache'): - caching_pools.append(p) - else: - backing_pools.append(p) - LOG.debug("Pools: caching: %s, backing: %s" % (caching_pools, - backing_pools)) - - if not len(caching_pools): - # We do not have caching pools created yet - return - - # Get quota from backing pools that are cached - stats = {} - for p in caching_pools: - backing_name = p.replace('-cache', '') - stats[backing_name] = {} - try: - quota = ceph.osd_pool_get_quota(self.service.ceph_api, - backing_name) - except exception.CephPoolGetQuotaFailure as e: - LOG.warn(_LW( - "Failed to retrieve quota: " - "exception: %s") % str(e)) - return - stats[backing_name]['quota'] = quota['max_bytes'] - stats[backing_name]['quota_pt'] = (quota['max_bytes']*100.0 / - storage_tier_size) 
- LOG.debug("Quota for pool: %s " - "is: %s B representing %s pt" % - (backing_name, - quota['max_bytes'], - stats[backing_name]['quota_pt'])) - - # target_max_bytes logic: - # - For computing target_max_bytes cache_tier_size must be equal than - # the sum of target_max_bytes of each caching pool - # - target_max_bytes for each caching pool is computed as the - # percentage of quota in corresponding backing pool - # - the caching tiers has to work at full capacity, so if the sum of - # all quotas in the backing tier is different than 100% we need to - # normalize - # - if the quota is zero for any pool we add CACHE_TIERING_MIN_QUOTA - # by default *after* normalization so that we have real minimum - - # We compute the real percentage that need to be normalized after - # ensuring that we have CACHE_TIERING_MIN_QUOTA for each pool with - # a quota of 0 - real_100pt = 90.0 # we start from max and decrease it for each 0 pool - # Note: We must avoid reaching 100% at all costs! and - # cache_target_full_ratio, the Ceph parameter that is supposed to - # protect the cluster against this does not work in Ceph v0.94.6! - # Therefore a value of 90% is better suited for this - for p in caching_pools: - backing_name = p.replace('-cache', '') - if stats[backing_name]['quota_pt'] == 0: - real_100pt -= constants.CACHE_TIERING_MIN_QUOTA - LOG.debug("Quota before normalization for %s is: %s pt" % - (p, stats[backing_name]['quota_pt'])) - - # Compute total percentage of quotas for all backing pools. - # Should be 100% if correctly configured - total_quota_pt = 0 - for p in caching_pools: - backing_name = p.replace('-cache', '') - total_quota_pt += stats[backing_name]['quota_pt'] - LOG.debug("Total quota pt is: %s" % total_quota_pt) - - # Normalize quota pt to 100% (or real_100pt) - if total_quota_pt != 0: # to avoid divide by zero - for p in caching_pools: - backing_name = p.replace('-cache', '') - stats[backing_name]['quota_pt'] = \ - (stats[backing_name]['quota_pt'] * - (real_100pt / total_quota_pt)) - - # Do not allow quota to be 0 for any pool - total = 0 - for p in caching_pools: - backing_name = p.replace('-cache', '') - if stats[backing_name]['quota_pt'] == 0: - stats[backing_name]['quota_pt'] = \ - constants.CACHE_TIERING_MIN_QUOTA - total += stats[backing_name]['quota_pt'] - LOG.debug("Quota after normalization for %s is: %s:" % - (p, stats[backing_name]['quota_pt'])) - - if total > 100: - # Supplementary protection, we really have to avoid going above - # 100%. Note that real_100pt is less than 100% but we still got - # more than 100! - LOG.warn("Total sum of quotas should not go above 100% " - "but is: %s, recalculating in next cycle" % total) - return - LOG.debug("Total sum of quotas is %s pt" % total) - - # Get current target_max_bytes. We cache it to reduce requests - # to ceph-rest-api. We are the ones changing it, so not an issue. 
- for p in caching_pools: - if p not in self.target_max_bytes: - try: - value = ceph.osd_get_pool_param(self.service.ceph_api, p, - constants.TARGET_MAX_BYTES) - except exception.CephPoolGetParamFailure as e: - LOG.warn(e) - return - self.target_max_bytes[p] = value - LOG.debug("Existing target_max_bytes got from " - "Ceph: %s" % self.target_max_bytes) - - # Set TARGET_MAX_BYTES - LOG.debug("storage_tier_size: %s " - "cache_tier_size: %s" % (storage_tier_size, - cache_tier_size)) - for p in caching_pools: - backing_name = p.replace('-cache', '') - s = stats[backing_name] - target_max_bytes = math.floor(s['quota_pt'] * cache_tier_size / - 100.0) - target_max_bytes = int(target_max_bytes) - LOG.debug("New Target max bytes of pool: %s is: %s B" % ( - p, target_max_bytes)) - - # Set the new target_max_bytes only if it changed - if self.target_max_bytes.get(p) == target_max_bytes: - LOG.debug("Target max bytes of pool: %s " - "is already updated" % p) - continue - try: - ceph.osd_set_pool_param(self.service.ceph_api, p, - constants.TARGET_MAX_BYTES, - target_max_bytes) - self.target_max_bytes[p] = target_max_bytes - except exception.CephPoolSetParamFailure as e: - LOG.warn(e) - continue - return diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/ceph.py b/ceph/ceph-manager/ceph-manager/ceph_manager/ceph.py index dff3c8ab5..a143b5775 100644 --- a/ceph/ceph-manager/ceph-manager/ceph_manager/ceph.py +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/ceph.py @@ -73,15 +73,10 @@ def osd_pool_exists(ceph_api, pool_name): def osd_pool_create(ceph_api, pool_name, pg_num, pgp_num): - if pool_name.endswith("-cache"): - # ruleset 1: is the ruleset for the cache tier - # Name: cache_tier_ruleset - ruleset = 1 - else: - # ruleset 0: is the default ruleset if no crushmap is loaded or - # the ruleset for the backing tier if loaded: - # Name: storage_tier_ruleset - ruleset = 0 + # ruleset 0: is the default ruleset if no crushmap is loaded or + # the ruleset for the backing tier if loaded: + # Name: storage_tier_ruleset + ruleset = 0 response, body = ceph_api.osd_pool_create( pool_name, pg_num, pgp_num, pool_type="replicated", ruleset=ruleset, body='json') diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/constants.py b/ceph/ceph-manager/ceph-manager/ceph_manager/constants.py index 5b2977430..ede99b2af 100644 --- a/ceph/ceph-manager/ceph-manager/ceph_manager/constants.py +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/constants.py @@ -12,31 +12,14 @@ CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL = \ sysinv_constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER = \ sysinv_constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER -CEPH_POOLS = sysinv_constants.BACKING_POOLS +CEPH_POOLS = sysinv_constants.CEPH_POOLS CEPH_REPLICATION_FACTOR = sysinv_constants.CEPH_REPLICATION_FACTOR_DEFAULT -SERVICE_PARAM_CEPH_CACHE_HIT_SET_TYPE_BLOOM = \ - sysinv_constants.SERVICE_PARAM_CEPH_CACHE_HIT_SET_TYPE_BLOOM -CACHE_TIERING_DEFAULTS = sysinv_constants.CACHE_TIERING_DEFAULTS -TARGET_MAX_BYTES = \ - sysinv_constants.SERVICE_PARAM_CEPH_CACHE_TIER_TARGET_MAX_BYTES - -# Cache tiering section shortener -CACHE_TIERING = \ - sysinv_constants.SERVICE_PARAM_SECTION_CEPH_CACHE_TIER -CACHE_TIERING_DESIRED = \ - sysinv_constants.SERVICE_PARAM_SECTION_CEPH_CACHE_TIER_DESIRED -CACHE_TIERING_APPLIED = \ - sysinv_constants.SERVICE_PARAM_SECTION_CEPH_CACHE_TIER_APPLIED -CACHE_TIERING_SECTIONS = \ - [CACHE_TIERING, CACHE_TIERING_DESIRED, CACHE_TIERING_APPLIED] # Cache flush parameters CACHE_FLUSH_OBJECTS_THRESHOLD = 1000 
CACHE_FLUSH_MIN_WAIT_OBJ_COUNT_DECREASE_SEC = 1 CACHE_FLUSH_MAX_WAIT_OBJ_COUNT_DECREASE_SEC = 128 -CACHE_TIERING_MIN_QUOTA = 5 - FM_ALARM_REASON_MAX_SIZE = 256 # TODO this will later change based on parsed health diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/exception.py b/ceph/ceph-manager/ceph-manager/ceph_manager/exception.py index c2d81b8b4..3ef078252 100644 --- a/ceph/ceph-manager/ceph-manager/ceph_manager/exception.py +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/exception.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2016-2017 Wind River Systems, Inc. +# Copyright (c) 2016-2018 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -54,27 +54,6 @@ class CephPoolRulesetFailure(CephManagerException): "pool %(name)s failed: %(reason)s") -class CephPoolAddTierFailure(CephManagerException): - message = _("Failed to add OSD tier: " - "backing_pool=%(backing_pool)s, cache_pool=%(cache_pool)s, " - "response=%(response_status_code)s:%(response_reason)s, " - "status=%(status)s, output=%(output)s") - - -class CephPoolRemoveTierFailure(CephManagerException): - message = _("Failed to remove tier: " - "backing_pool=%(backing_pool)s, cache_pool=%(cache_pool)s, " - "response=%(response_status_code)s:%(response_reason)s, " - "status=%(status)s, output=%(output)s") - - -class CephCacheSetModeFailure(CephManagerException): - message = _("Failed to set OSD tier cache mode: " - "cache_pool=%(cache_pool)s, mode=%(mode)s, " - "response=%(response_status_code)s:%(response_reason)s, " - "status=%(status)s, output=%(output)s") - - class CephPoolSetParamFailure(CephManagerException): message = _("Cannot set Ceph OSD pool parameter: " "pool_name=%(pool_name)s, param=%(param)s, value=%(value)s. " @@ -87,37 +66,6 @@ class CephPoolGetParamFailure(CephManagerException): "Reason: %(reason)s") -class CephCacheCreateOverlayFailure(CephManagerException): - message = _("Failed to create overlay: " - "backing_pool=%(backing_pool)s, cache_pool=%(cache_pool)s, " - "response=%(response_status_code)s:%(response_reason)s, " - "status=%(status)s, output=%(output)s") - - -class CephCacheDeleteOverlayFailure(CephManagerException): - message = _("Failed to delete overlay: " - "backing_pool=%(backing_pool)s, cache_pool=%(cache_pool)s, " - "response=%(response_status_code)s:%(response_reason)s, " - "status=%(status)s, output=%(output)s") - - -class CephCacheFlushFailure(CephManagerException): - message = _("Failed to flush cache pool: " - "cache_pool=%(cache_pool)s, " - "return_code=%(return_code)s, " - "cmd=%(cmd)s, output=%(output)s") - - -class CephCacheEnableFailure(CephManagerException): - message = _("Cannot enable Ceph cache tier. " - "Reason: cache tiering operation in progress.") - - -class CephCacheDisableFailure(CephManagerException): - message = _("Cannot disable Ceph cache tier. 
" - "Reason: cache tiering operation in progress.") - - class CephSetKeyFailure(CephManagerException): message = _("Error setting the Ceph flag " "'%(flag)s' %(extra)s: " diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py b/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py index 51308240e..c0960fbd6 100644 --- a/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py @@ -13,8 +13,6 @@ from fm_api import constants as fm_constants # noinspection PyUnresolvedReferences from oslo_log import log as logging -from sysinv.conductor.cache_tiering_service_config import ServiceConfig - # noinspection PyProtectedMember from i18n import _, _LI, _LW, _LE @@ -155,7 +153,6 @@ class Monitor(HandleUpgradesMixin): def __init__(self, service): self.service = service self.current_ceph_health = "" - self.cache_enabled = False self.tiers_size = {} self.known_object_pool_name = None self.primary_tier_name = constants.SB_TIER_DEFAULT_NAMES[ @@ -164,20 +161,8 @@ class Monitor(HandleUpgradesMixin): super(Monitor, self).__init__(service) def setup(self, config): - self.set_caching_tier_config(config) super(Monitor, self).setup(config) - def set_caching_tier_config(self, config): - conf = ServiceConfig().from_dict( - config.get(constants.CACHE_TIERING_APPLIED)) - if conf: - self.cache_enabled = conf.cache_enabled - - def monitor_check_cache_tier(self, enable_flag): - LOG.info(_LI("monitor_check_cache_tier: " - "enable_flag={}".format(enable_flag))) - self.cache_enabled = enable_flag - def run(self): # Wait until Ceph cluster is up and we can get the fsid while True: @@ -262,11 +247,6 @@ class Monitor(HandleUpgradesMixin): # Check the quotas on each tier for tier in self.tiers_size: - # TODO(rchurch): For R6 remove the tier from the default crushmap - # and remove this check. 
No longer supporting this tier in R5 - if tier == 'cache-tier': - continue - # Extract the tier name from the crush equivalent tier_name = tier[:-len(constants.CEPH_CRUSH_TIER_SUFFIX)] @@ -601,9 +581,6 @@ class Monitor(HandleUpgradesMixin): self._check_storage_tier(osd_tree, "storage-tier", lambda *args: alarms.append(args)) - if self.cache_enabled: - self._check_storage_tier(osd_tree, "cache-tier", - lambda *args: alarms.append(args)) old_alarms = {} for alarm_id in [ diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/server.py b/ceph/ceph-manager/ceph-manager/ceph_manager/server.py index 9403a7c2c..c8b96a726 100644 --- a/ceph/ceph-manager/ceph-manager/ceph_manager/server.py +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/server.py @@ -27,13 +27,10 @@ from oslo_service.periodic_task import PeriodicTasks # noinspection PyUnresolvedReferences from oslo_service import loopingcall -from sysinv.conductor.cache_tiering_service_config import ServiceConfig - # noinspection PyUnresolvedReferences from cephclient import wrapper from monitor import Monitor -from cache_tiering import CacheTiering import exception import constants @@ -61,34 +58,6 @@ class RpcEndpoint(PeriodicTasks): def __init__(self, service=None): self.service = service - def cache_tiering_enable_cache(self, _, new_config, applied_config): - LOG.info(_LI("Enabling cache")) - try: - self.service.cache_tiering.enable_cache( - new_config, applied_config) - except exception.CephManagerException as e: - self.service.sysinv_conductor.call( - {}, 'cache_tiering_enable_cache_complete', - success=False, exception=str(e.message), - new_config=new_config, applied_config=applied_config) - - def cache_tiering_disable_cache(self, _, new_config, applied_config): - LOG.info(_LI("Disabling cache")) - try: - self.service.cache_tiering.disable_cache( - new_config, applied_config) - except exception.CephManagerException as e: - self.service.sysinv_conductor.call( - {}, 'cache_tiering_disable_cache_complete', - success=False, exception=str(e.message), - new_config=new_config, applied_config=applied_config) - - def cache_tiering_operation_in_progress(self, _): - is_locked = self.service.cache_tiering.is_locked() - LOG.info(_LI("Cache tiering operation " - "is in progress: %s") % str(is_locked).lower()) - return is_locked - def get_primary_tier_size(self, _): """Get the ceph size for the primary tier. 
@@ -163,7 +132,6 @@ class Service(SysinvConductorUpgradeApi, service.Service): self.entity_instance_id = '' self.fm_api = fm_api.FaultAPIs() self.monitor = Monitor(self) - self.cache_tiering = CacheTiering(self) self.config = None self.config_desired = None self.config_applied = None @@ -181,8 +149,6 @@ class Service(SysinvConductorUpgradeApi, service.Service): # Get initial config from sysinv and send it to # services that need it before starting them - config = self.get_caching_tier_config() - self.monitor.setup(config) self.rpc_server = messaging.get_rpc_server( transport, messaging.Target(topic=constants.CEPH_MANAGER_TOPIC, @@ -190,37 +156,7 @@ class Service(SysinvConductorUpgradeApi, service.Service): [RpcEndpoint(self)], executor='eventlet') self.rpc_server.start() - self.cache_tiering.set_initial_config(config) eventlet.spawn_n(self.monitor.run) - periodic = loopingcall.FixedIntervalLoopingCall( - self.update_ceph_target_max_bytes) - periodic.start(interval=300) - - def get_caching_tier_config(self): - LOG.info("Getting cache tiering configuration from sysinv") - while True: - # Get initial configuration from sysinv, - # retry until sysinv starts - try: - cctxt = self.sysinv_conductor.prepare(timeout=2) - config = cctxt.call({}, 'cache_tiering_get_config') - for section in config: - if section == constants.CACHE_TIERING: - self.config = ServiceConfig().from_dict( - config[section]) - elif section == constants.CACHE_TIERING_DESIRED: - self.config_desired = ServiceConfig().from_dict( - config[section]) - elif section == constants.CACHE_TIERING_APPLIED: - self.config_applied = ServiceConfig().from_dict( - config[section]) - LOG.info("Cache tiering configs: {}".format(config)) - return config - except Exception as ex: - # In production we should retry on every error until connection - # is reestablished. - LOG.warn("Getting cache tiering configuration failed " - "with: {}. Retrying... ".format(str(ex))) def stop(self): try: @@ -230,13 +166,6 @@ class Service(SysinvConductorUpgradeApi, service.Service): pass super(Service, self).stop() - def update_ceph_target_max_bytes(self): - try: - self.cache_tiering.update_cache_target_max_bytes() - except Exception as ex: - LOG.exception("Updating Ceph target max bytes failed " - "with: {} retrying on next cycle.".format(str(ex))) - def run_service(): CONF(sys.argv[1:]) diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/tests/test_cache_flush.py b/ceph/ceph-manager/ceph-manager/ceph_manager/tests/test_cache_flush.py deleted file mode 100644 index 2fd265195..000000000 --- a/ceph/ceph-manager/ceph-manager/ceph_manager/tests/test_cache_flush.py +++ /dev/null @@ -1,309 +0,0 @@ -# -# Copyright (c) 2016 Wind River Systems, Inc. 
-# -# SPDX-License-Identifier: Apache-2.0 -# - -import unittest -import mock - -import subprocess -import math - -from ..cache_tiering import CacheTiering -from ..cache_tiering import LOG as CT_LOG -from ..constants import CACHE_FLUSH_OBJECTS_THRESHOLD -from ..constants import CACHE_FLUSH_MIN_WAIT_OBJ_COUNT_DECREASE_SEC as MIN_WAIT -from ..constants import CACHE_FLUSH_MAX_WAIT_OBJ_COUNT_DECREASE_SEC as MAX_WAIT -from ..exception import CephCacheFlushFailure - - -class TestCacheFlush(unittest.TestCase): - - def setUp(self): - self.service = mock.Mock() - self.ceph_api = mock.Mock() - self.service.ceph_api = self.ceph_api - self.cache_tiering = CacheTiering(self.service) - - @mock.patch('subprocess.check_call') - def test_set_param_fail(self, mock_proc_call): - self.ceph_api.osd_set_pool_param = mock.Mock() - self.ceph_api.osd_set_pool_param.return_value = ( - mock.Mock(ok=False, status_code=500, reason='denied'), - {}) - self.cache_tiering.cache_flush({'pool_name': 'test'}) - mock_proc_call.assert_called_with( - ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) - - @mock.patch('subprocess.check_call') - def test_df_fail(self, mock_proc_call): - self.ceph_api.osd_set_pool_param = mock.Mock() - self.ceph_api.osd_set_pool_param.return_value = ( - mock.Mock(ok=True, status_code=200, reason='OK'), - {}) - self.ceph_api.df = mock.Mock() - self.ceph_api.df.return_value = ( - mock.Mock(ok=False, status_code=500, reason='denied'), - {}) - self.cache_tiering.cache_flush({'pool_name': 'test'}) - self.ceph_api.osd_set_pool_param.assert_called_once_with( - 'test-cache', 'target_max_objects', 1, force=None, body='json') - mock_proc_call.assert_called_with( - ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) - - @mock.patch('subprocess.check_call') - def test_rados_evict_fail_raises(self, mock_proc_call): - mock_proc_call.side_effect = subprocess.CalledProcessError(1, ['cmd']) - self.ceph_api.osd_set_pool_param = mock.Mock() - self.ceph_api.osd_set_pool_param.return_value = ( - mock.Mock(ok=False, status_code=500, reason='denied'), - {}) - self.assertRaises(CephCacheFlushFailure, - self.cache_tiering.cache_flush, - {'pool_name': 'test'}) - mock_proc_call.assert_called_with( - ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) - - @mock.patch('subprocess.check_call') - def test_df_missing_pool(self, mock_proc_call): - self.ceph_api.osd_set_pool_param = mock.Mock() - self.ceph_api.osd_set_pool_param.return_value = ( - mock.Mock(ok=True, status_code=200, reason='OK'), - {}) - self.ceph_api.df = mock.Mock() - self.ceph_api.df.return_value = ( - mock.Mock(ok=True, status_code=200, reason='OK'), - {'output': { - 'pools': [ - {'id': 0, - 'name': 'rbd', - 'stats': {'bytes_used': 0, - 'kb_used': 0, - 'max_avail': 9588428800, - 'objects': 0}}]}, - 'status': 'OK'}) - with mock.patch.object(CT_LOG, 'warn') as mock_lw: - self.cache_tiering.cache_flush({'pool_name': 'test'}) - self.ceph_api.df.assert_called_once_with(body='json') - for c in mock_lw.call_args_list: - if 'Missing pool free space' in c[0][0]: - break - else: - self.fail('expected log warning') - self.ceph_api.osd_set_pool_param.assert_called_once_with( - 'test-cache', 'target_max_objects', 1, force=None, body='json') - mock_proc_call.assert_called_with( - ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) - - @mock.patch('subprocess.check_call') - def test_df_objects_empty(self, mock_proc_call): - self.ceph_api.osd_set_pool_param = mock.Mock() - self.ceph_api.osd_set_pool_param.return_value = ( - 
mock.Mock(ok=True, status_code=200, reason='OK'), - {}) - self.ceph_api.df = mock.Mock() - self.ceph_api.df.return_value = ( - mock.Mock(ok=True, status_code=200, reason='OK'), - {'output': { - 'pools': [ - {'id': 0, - 'name': 'test-cache', - 'stats': {'bytes_used': 0, - 'kb_used': 0, - 'max_avail': 9588428800, - 'objects': 0}}]}, - 'status': 'OK'}) - self.cache_tiering.cache_flush({'pool_name': 'test'}) - self.ceph_api.df.assert_called_once_with(body='json') - self.ceph_api.osd_set_pool_param.assert_called_once_with( - 'test-cache', 'target_max_objects', 1, force=None, body='json') - mock_proc_call.assert_called_with( - ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) - - @mock.patch('time.sleep') - @mock.patch('subprocess.check_call') - def test_df_objects_above_threshold(self, mock_proc_call, mock_time_sleep): - self.ceph_api.osd_set_pool_param = mock.Mock() - self.ceph_api.osd_set_pool_param.return_value = ( - mock.Mock(ok=True, status_code=200, reason='OK'), - {}) - self.ceph_api.df = mock.Mock() - self.ceph_api.df.side_effect = [ - (mock.Mock(ok=True, status_code=200, reason='OK'), - {'output': { - 'pools': [ - {'id': 0, - 'name': 'test-cache', - 'stats': {'bytes_used': 0, - 'kb_used': 0, - 'max_avail': 9588428800, - 'objects': CACHE_FLUSH_OBJECTS_THRESHOLD}}]}, - 'status': 'OK'}), - (mock.Mock(ok=True, status_code=200, reason='OK'), - {'output': { - 'pools': [ - {'id': 0, - 'name': 'test-cache', - 'stats': {'bytes_used': 0, - 'kb_used': 0, - 'max_avail': 9588428800, - 'objects': - CACHE_FLUSH_OBJECTS_THRESHOLD - 1}}]}, - 'status': 'OK'})] - self.cache_tiering.cache_flush({'pool_name': 'test'}) - self.ceph_api.osd_set_pool_param.assert_called_once_with( - 'test-cache', 'target_max_objects', 1, force=None, body='json') - self.ceph_api.df.assert_called_with(body='json') - mock_time_sleep.assert_called_once_with(MIN_WAIT) - mock_proc_call.assert_called_with( - ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) - - @mock.patch('time.sleep') - @mock.patch('subprocess.check_call') - def test_df_objects_interval_increase(self, mock_proc_call, - mock_time_sleep): - self.ceph_api.osd_set_pool_param = mock.Mock() - self.ceph_api.osd_set_pool_param.return_value = ( - mock.Mock(ok=True, status_code=200, reason='OK'), - {}) - self.ceph_api.df = mock.Mock() - self.ceph_api.df.side_effect = [ - (mock.Mock(ok=True, status_code=200, reason='OK'), - {'output': { - 'pools': [ - {'id': 0, - 'name': 'test-cache', - 'stats': {'bytes_used': 0, - 'kb_used': 0, - 'max_avail': 9588428800, - 'objects': - CACHE_FLUSH_OBJECTS_THRESHOLD + 1}}]}, - 'status': 'OK'}), - (mock.Mock(ok=True, status_code=200, reason='OK'), - {'output': { - 'pools': [ - {'id': 0, - 'name': 'test-cache', - 'stats': {'bytes_used': 0, - 'kb_used': 0, - 'max_avail': 9588428800, - 'objects': - CACHE_FLUSH_OBJECTS_THRESHOLD + 1}}]}, - 'status': 'OK'}), - (mock.Mock(ok=True, status_code=200, reason='OK'), - {'output': { - 'pools': [ - {'id': 0, - 'name': 'test-cache', - 'stats': {'bytes_used': 0, - 'kb_used': 0, - 'max_avail': 9588428800, - 'objects': - CACHE_FLUSH_OBJECTS_THRESHOLD + 1}}]}, - 'status': 'OK'}), - (mock.Mock(ok=True, status_code=200, reason='OK'), - {'output': { - 'pools': [ - {'id': 0, - 'name': 'test-cache', - 'stats': {'bytes_used': 0, - 'kb_used': 0, - 'max_avail': 9588428800, - 'objects': - CACHE_FLUSH_OBJECTS_THRESHOLD - 1}}]}, - 'status': 'OK'})] - self.cache_tiering.cache_flush({'pool_name': 'test'}) - self.ceph_api.osd_set_pool_param.assert_called_once_with( - 'test-cache', 
'target_max_objects', 1, force=None, body='json') - self.ceph_api.df.assert_called_with(body='json') - self.assertEqual([c[0][0] for c in mock_time_sleep.call_args_list], - [MIN_WAIT, - MIN_WAIT * 2, - MIN_WAIT * 4]) - mock_proc_call.assert_called_with( - ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) - - @mock.patch('time.sleep') - @mock.patch('subprocess.check_call') - def test_df_objects_allways_over_threshold(self, mock_proc_call, - mock_time_sleep): - self.ceph_api.osd_set_pool_param = mock.Mock() - self.ceph_api.osd_set_pool_param.return_value = ( - mock.Mock(ok=True, status_code=200, reason='OK'), - {}) - self.ceph_api.df = mock.Mock() - self.ceph_api.df.return_value = ( - mock.Mock(ok=True, status_code=200, reason='OK'), - {'output': { - 'pools': [ - {'id': 0, - 'name': 'test-cache', - 'stats': {'bytes_used': 0, - 'kb_used': 0, - 'max_avail': 9588428800, - 'objects': - CACHE_FLUSH_OBJECTS_THRESHOLD + 1}}]}, - 'status': 'OK'}) - # noinspection PyTypeChecker - mock_time_sleep.side_effect = \ - [None]*int(math.ceil(math.log(float(MAX_WAIT)/MIN_WAIT, 2)) + 1) \ - + [Exception('too many sleeps')] - self.cache_tiering.cache_flush({'pool_name': 'test'}) - self.ceph_api.osd_set_pool_param.assert_called_once_with( - 'test-cache', 'target_max_objects', 1, force=None, body='json') - self.ceph_api.df.assert_called_with(body='json') - expected_sleep = [] - interval = MIN_WAIT - while interval <= MAX_WAIT: - expected_sleep.append(interval) - interval *= 2 - self.assertEqual([c[0][0] for c in mock_time_sleep.call_args_list], - expected_sleep) - mock_proc_call.assert_called_with( - ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) - - @mock.patch('time.sleep') - @mock.patch('subprocess.check_call') - def test_df_objects_increase(self, mock_proc_call, mock_time_sleep): - self.ceph_api.osd_set_pool_param = mock.Mock() - self.ceph_api.osd_set_pool_param.return_value = ( - mock.Mock(ok=True, status_code=200, reason='OK'), - {}) - self.ceph_api.df = mock.Mock() - self.ceph_api.df.side_effect = [ - (mock.Mock(ok=True, status_code=200, reason='OK'), - {'output': { - 'pools': [ - {'id': 0, - 'name': 'test-cache', - 'stats': {'bytes_used': 0, - 'kb_used': 0, - 'max_avail': 9588428800, - 'objects': - CACHE_FLUSH_OBJECTS_THRESHOLD + 1}}]}, - 'status': 'OK'}), - (mock.Mock(ok=True, status_code=200, reason='OK'), - {'output': { - 'pools': [ - {'id': 0, - 'name': 'test-cache', - 'stats': {'bytes_used': 0, - 'kb_used': 0, - 'max_avail': 9588428800, - 'objects': - CACHE_FLUSH_OBJECTS_THRESHOLD + 2}}]}, - 'status': 'OK'})] - with mock.patch.object(CT_LOG, 'warn') as mock_lw: - self.cache_tiering.cache_flush({'pool_name': 'test'}) - for c in mock_lw.call_args_list: - if 'Unexpected increase' in c[0][0]: - break - else: - self.fail('expected log warning') - self.ceph_api.df.assert_called_with(body='json') - mock_time_sleep.assert_called_once_with(MIN_WAIT) - self.ceph_api.osd_set_pool_param.assert_called_once_with( - 'test-cache', 'target_max_objects', 1, force=None, body='json') - mock_proc_call.assert_called_with( - ['/usr/bin/rados', '-p', 'test-cache', 'cache-flush-evict-all']) From 415e9d5340790acd0b984a3e0ed8a1e2e103773f Mon Sep 17 00:00:00 2001 From: Robert Church Date: Mon, 21 May 2018 09:10:15 -0500 Subject: [PATCH 08/26] Update upgrade code for removing Ceph Cache Tiering Story: 2002884 Task: 22846 Change-Id: Ia2207eecea4fe4e590f774764a8beea83fa15fa7 Signed-off-by: Don Penney Signed-off-by: Jack Ding Signed-off-by: Scott Little --- .../ceph-manager/ceph_manager/constants.py 
| 2 +- .../ceph-manager/ceph_manager/monitor.py | 44 +++++++++---------- .../ceph-manager/ceph_manager/server.py | 7 +-- 3 files changed, 23 insertions(+), 30 deletions(-) diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/constants.py b/ceph/ceph-manager/ceph-manager/ceph_manager/constants.py index ede99b2af..6cfbba4f8 100644 --- a/ceph/ceph-manager/ceph-manager/ceph_manager/constants.py +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/constants.py @@ -65,7 +65,7 @@ CEPH_MANAGER_TOPIC = 'sysinv.ceph_manager' SYSINV_CONFIG_FILE = '/etc/sysinv/sysinv.conf' # Titanium Cloud version strings -TITANIUM_SERVER_VERSION_16_10 = '16.10' +TITANIUM_SERVER_VERSION_18_03 = '18.03' CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET = ( "all OSDs are running jewel or later but the " diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py b/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py index c0960fbd6..44c56b221 100644 --- a/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py @@ -22,20 +22,18 @@ import exception LOG = logging.getLogger(__name__) -# When upgrading from 16.10 to 17.x Ceph goes from Hammer release -# to Jewel release. After all storage nodes are upgraded to 17.x -# the cluster is in HEALTH_WARN until administrator explicitly -# enables require_jewel_osds flag - which signals Ceph that it -# can safely transition from Hammer to Jewel +# In 18.03 R5, ceph cache tiering was disabled and prevented from being +# re-enabled. When upgrading from 18.03 (R5) to R6 we need to remove the +# cache-tier from the crushmap ceph-cache-tiering # -# This class is needed only when upgrading from 16.10 to 17.x -# TODO: remove it after 1st 17.x release +# This class is needed only when upgrading from R5 to R6 +# TODO: remove it after 1st R6 release # class HandleUpgradesMixin(object): def __init__(self, service): self.service = service - self.surpress_require_jewel_osds_warning = False + self.wait_for_upgrade_complete = False def setup(self, config): self._set_upgrade(self.service.retry_get_software_upgrade_status()) @@ -45,9 +43,10 @@ class HandleUpgradesMixin(object): from_version = upgrade.get('from_version') if (state and state != constants.UPGRADE_COMPLETED - and from_version == constants.TITANIUM_SERVER_VERSION_16_10): - LOG.info(_LI("Surpress require_jewel_osds health warning")) - self.surpress_require_jewel_osds_warning = True + and from_version == constants.TITANIUM_SERVER_VERSION_18_03): + + LOG.info(_LI("Wait for caph upgrade to complete before monitoring cluster.")) + self.wait_for_upgrade_complete = True def set_flag_require_jewel_osds(self): try: @@ -73,7 +72,7 @@ class HandleUpgradesMixin(object): health = self.auto_heal(health) # filter out require_jewel_osds warning # - if not self.surpress_require_jewel_osds_warning: + if not self.wait_for_upgrade_complete: return health if health['health'] != constants.CEPH_HEALTH_WARN: return health @@ -114,17 +113,16 @@ class HandleUpgradesMixin(object): state = upgrade.get('state') # surpress require_jewel_osds in case upgrade is # in progress but not completed or aborting - if (not self.surpress_require_jewel_osds_warning + if (not self.wait_for_upgrade_complete and (upgrade.get('from_version') - == constants.TITANIUM_SERVER_VERSION_16_10) + == constants.TITANIUM_SERVER_VERSION_18_03) and state not in [ None, constants.UPGRADE_COMPLETED, constants.UPGRADE_ABORTING, constants.UPGRADE_ABORT_COMPLETING, constants.UPGRADE_ABORTING_ROLLBACK]): - LOG.info(_LI("Surpress 
require_jewel_osds health warning")) - self.surpress_require_jewel_osds_warning = True + self.wait_for_upgrade_complete = True # set require_jewel_osds in case upgrade is # not in progress or completed if (state in [None, constants.UPGRADE_COMPLETED]): @@ -135,16 +133,14 @@ class HandleUpgradesMixin(object): self.set_flag_require_jewel_osds() health = self._remove_require_jewel_osds_warning(health) LOG.info(_LI("Unsurpress require_jewel_osds health warning")) - self.surpress_require_jewel_osds_warning = False + self.wait_for_upgrade_complete = False # unsurpress require_jewel_osds in case upgrade # is aborting - if (self.surpress_require_jewel_osds_warning - and state in [ - constants.UPGRADE_ABORTING, - constants.UPGRADE_ABORT_COMPLETING, - constants.UPGRADE_ABORTING_ROLLBACK]): - LOG.info(_LI("Unsurpress require_jewel_osds health warning")) - self.surpress_require_jewel_osds_warning = False + if (state in [ + constants.UPGRADE_ABORTING, + constants.UPGRADE_ABORT_COMPLETING, + constants.UPGRADE_ABORTING_ROLLBACK]): + self.wait_for_upgrade_complete = False return health diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/server.py b/ceph/ceph-manager/ceph-manager/ceph_manager/server.py index c8b96a726..72edf406b 100644 --- a/ceph/ceph-manager/ceph-manager/ceph_manager/server.py +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/server.py @@ -97,9 +97,6 @@ class RpcEndpoint(PeriodicTasks): return self.service.monitor.cluster_is_up -# This class is needed only when upgrading from 16.10 to 17.x -# TODO: remove it after 1st 17.x release -# class SysinvConductorUpgradeApi(object): def __init__(self): self.sysinv_conductor = None @@ -113,10 +110,10 @@ class SysinvConductorUpgradeApi(object): return upgrade @retry(wait_fixed=1000, - retry_on_exception=lambda exception: + retry_on_exception=lambda e: LOG.warn(_LW( "Getting software upgrade status failed " - "with: %s. Retrying... ") % str(exception)) or True) + "with: %s. Retrying... 
") % str(e)) or True) def retry_get_software_upgrade_status(self): return self.get_software_upgrade_status() From e907c9a48c14f244b09166fc9dc1bf05754f2a9d Mon Sep 17 00:00:00 2001 From: Jack Ding Date: Mon, 23 Jul 2018 14:42:57 -0400 Subject: [PATCH 09/26] Correct typo: caph should be ceph Change-Id: Ibf10aa998458a3b6a62b8c55fe7d90af057dae41 Signed-off-by: Jack Ding Signed-off-by: Scott Little --- ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py b/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py index 44c56b221..2a13f88a1 100644 --- a/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py @@ -45,7 +45,7 @@ class HandleUpgradesMixin(object): and state != constants.UPGRADE_COMPLETED and from_version == constants.TITANIUM_SERVER_VERSION_18_03): - LOG.info(_LI("Wait for caph upgrade to complete before monitoring cluster.")) + LOG.info(_LI("Wait for ceph upgrade to complete before monitoring cluster.")) self.wait_for_upgrade_complete = True def set_flag_require_jewel_osds(self): From 20cc93425fdec30b0df8bc0bc78f675b6546ba85 Mon Sep 17 00:00:00 2001 From: Scott Little Date: Mon, 13 Aug 2018 13:47:43 -0400 Subject: [PATCH 10/26] Move content from stx-upstream to stx-integ Packages will be relocated to stx-integ: ceph/ ceph ceph-manager Change-Id: I5bc8de8791ef7b31bc529e7f732c7b9aeea4de05 Story: 2002801 Task: 22687 Signed-off-by: Scott Little --- centos_pkg_dirs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/centos_pkg_dirs b/centos_pkg_dirs index a73170e16..7d3e286f1 100644 --- a/centos_pkg_dirs +++ b/centos_pkg_dirs @@ -146,3 +146,5 @@ filesystem/parted security/python-keyring grub/grub2 utilities/build-info +ceph/ceph +ceph/ceph-manager From 72f444f6019b4371e6659bbc20028ac36a96caaf Mon Sep 17 00:00:00 2001 From: Scott Little Date: Wed, 15 Aug 2018 14:11:12 -0400 Subject: [PATCH 11/26] Split image.inc across git repos Currently compiling a new package and adding it to the iso still requires a multi-git update because image.inc is a single centralized file in the root git. It would be better to allow a single git update to add a package. Too allow this, image.inc must be split across the git repos and the build tools must be changed to read/merge those files to arrive at the final package list. Current scheme is to name the image.inc files using this schema. ${distro}_${build_target}_image_${build_type}.inc distro = centos, ... build_target = iso, guest ... build_type = std, rt ... Traditionally build_type=std is omitted from config files, so we instread use ${distro}_${build_target}_image.inc. 
Change-Id: Ic0d076b4537f6edbd0c058c0dab667fea68de647 Story: 2003447 Task: 24649 Depends-On: Ib39b8063e7759842ba15330c68503bfe2dea6e20 Signed-off-by: Scott Little --- centos_guest_image.inc | 48 ++++++ centos_guest_image_rt.inc | 45 ++++++ centos_iso_image.inc | 329 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 422 insertions(+) create mode 100644 centos_guest_image.inc create mode 100644 centos_guest_image_rt.inc create mode 100644 centos_iso_image.inc diff --git a/centos_guest_image.inc b/centos_guest_image.inc new file mode 100644 index 000000000..c68985da3 --- /dev/null +++ b/centos_guest_image.inc @@ -0,0 +1,48 @@ +# List of packages to be included/installed in guest image +# If these have dependencies, they will be pulled in automatically +# + +# build-info +build-info + +# kernel +perf + +# +# Network Drivers +# + +# i40evf-kmod +kmod-i40evf + +# i40e-kmod +kmod-i40e + +# ixgbevf-kmod +kmod-ixgbevf + +# ixgbe-kmod +kmod-ixgbe + +# qat17 +qat17 + +# +# TPM2 libs to enable vTPM on Guest +# + +# tpm-kmod +kmod-tpm + +# tss2 +tss2 + +# tpm2-tools +tpm2-tools + +# +# ima support +# + +# rpm +rpm-plugin-systemd-inhibit diff --git a/centos_guest_image_rt.inc b/centos_guest_image_rt.inc new file mode 100644 index 000000000..c98982df3 --- /dev/null +++ b/centos_guest_image_rt.inc @@ -0,0 +1,45 @@ +# List of packages to be included/installed in RT guest image +# If these have dependencies, they will be pulled in automatically +# + +# build-info +build-info + +# +# Network Drivers +# + +# i40evf-kmod +kmod-i40evf-rt + +# i40e-kmod +kmod-i40e-rt + +# ixgbevf-kmod +kmod-ixgbevf-rt + +# ixgbe-kmod +kmod-ixgbe-rt + +# qat17 +qat17-rt + +# +# TPM2 libs to enable vTPM on Guest +# + +# tpm-kmod +kmod-tpm-rt + +# tss2 +tss2 + +# tpm2-tools +tpm2-tools + +# +# ima support +# + +# rpm +rpm-plugin-systemd-inhibit diff --git a/centos_iso_image.inc b/centos_iso_image.inc new file mode 100644 index 000000000..2e6eb528c --- /dev/null +++ b/centos_iso_image.inc @@ -0,0 +1,329 @@ +# List of packages to be included/installed in ISO +# If these have dependencies, they will be pulled in automatically +# + +# vm-topology +vm-topology + +# namespace-utils +namespace-utils + +# qemu-kvm-ev +qemu-kvm-ev +qemu-img-ev +qemu-kvm-tools-ev + +# nfscheck +nfscheck + +# libvirt +libvirt +libvirt-docs +libvirt-daemon +libvirt-daemon-config-network +libvirt-daemon-config-nwfilter +libvirt-daemon-driver-network +libvirt-daemon-driver-nwfilter +libvirt-daemon-driver-nodedev +libvirt-daemon-driver-secret +libvirt-daemon-driver-storage +libvirt-daemon-driver-qemu +libvirt-daemon-driver-lxc +libvirt-client + +# python-cephclient +python-cephclient + +# python-ryu +python2-ryu +python-ryu-common + +# python-smartpm +python-smartpm + +# lldpd +lldpd + +# nova-utils +nova-utils + +# mlx4-config +mlx4-config + +# wrs-ssl +wrs-ssl + +# tss2 +tss2 + +# tpm2-openssl-engine +tpm2-openssl-engine + +# libtpms +libtpms + +# swtpm +swtpm +swtpm-cuse +swtpm-tools + +# tis-extensions +tis-extensions +tis-extensions-controller + +# python-3parclient +python-3parclient + +# python-lefthandclient +python-lefthandclient + +# collectd-extensions +collectd-extensions + +# influxdb-extensions +influxdb-extensions + +# docker-distribution +docker-distribution + +# helm +helm + +# logmgmt +logmgmt + +# filesystem-scripts +filesystem-scripts + +# io-scheduler +io-scheduler + +# collector +collector + +# platform-util +platform-util +platform-util-noncontroller + +# monitor-tools +monitor-tools + +# e1000e-kmod +kmod-e1000e +kmod-e1000e-rt + +# 
i40e-kmod +kmod-i40e +kmod-i40e-rt + +# ixgbevf-kmod +kmod-ixgbevf + +# ixgbe-kmod +kmod-ixgbe +kmod-ixgbe-rt + +# qat17 +qat17 +qat17-rt + +# tpm-kmod +kmod-tpm +kmod-tpm-rt + +# integrity-kmod +kmod-integrity +kmod-integrity-rt + +# drbd-kernel +kmod-drbd +kmod-drbd-rt + +# rpm +rpm-plugin-systemd-inhibit + +# dpkg +dpkg + +# cgcs-users +cgcs-users + +# ldapscripts +ldapscripts + +# drbd +drbd +drbd-utils +drbd-udev +drbd-pacemaker +drbd-heartbeat +drbd-bash-completion + +# build-info +build-info + +# initscripts +initscripts + +# setup +setup + +# lshell +lshell + +# nss-pam-ldapd +nss-pam-ldapd + +# centos-release +centos-release + +# nfs-utils +nfs-utils + +# dhcp +dhcp +dhclient + +# openssh +openssh +openssh-clients +openssh-server + +# facter +facter + +# vim +vim-enhanced + +# python +python + +# libvirt-python +libvirt-python + +# lighttpd +lighttpd +lighttpd-fastcgi +lighttpd-mod_geoip +lighttpd-mod_mysql_vhost + +# logrotate +logrotate + +# ntp +ntp +ntp-perl +ntpdate + +# pam +pam + +# shadow-utils +shadow-utils + +# syslog-ng +syslog-ng +syslog-ng-libdbi + +# novnc +novnc + +# sudo +sudo + +# net-snmp +net-snmp-utils +net-snmp-libs +net-snmp-python + +# openldap +openldap +openldap-servers +openldap-clients + +# openvswitch +openvswitch + +# libevent +libevent + +# tpm2-tools +tpm2-tools + +# audit +audit + +# kernel +kernel +kernel-tools +kernel-tools-libs +perf +python-perf + +# puppet +puppet + +# puppet-gnocchi +puppet-gnocchi + +# systemd +systemd + +# python-gunicorn +python2-gunicorn + +# tboot +tboot + +# memcached +memcached + +# kubernetes +kubernetes +kubernetes-master +kubernetes-node +kubernetes-kubeadm +kubernetes-client + +# resource-agents +resource-agents + +# bash +bash + +# haproxy +haproxy + +# iscsi-initiator-utils +iscsi-initiator-utils +iscsi-initiator-utils-iscsiuio + +# iptables +iptables +iptables-services +iptables-utils + +# python-psycopg2 +python-psycopg2 + +# dnsmasq +dnsmasq +dnsmasq-utils + +# rsync +rsync + +# parted +parted + +# python-keyring +python-keyring + +# grub2 +grub2-tools +grub2-efi-x64-modules + +# kernel-rt +kernel-rt +kernel-rt-kvm +kernel-rt-tools From aea80fa088419a4d0553c2122f5c96b1e7a946c0 Mon Sep 17 00:00:00 2001 From: Kristine Bujold Date: Wed, 15 Aug 2018 17:11:27 -0400 Subject: [PATCH 12/26] Fix controller swact error caused by drbd resizing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a bug found in ‘provider/logical_volume/lvm.rb’ in method ‘def size=(new_size)’ with the variable ‘nuke_fs_on_resize_failure’. The conditional statement was always returning true and thus wiping bytes at the start of the volume even if the variable was set to false. For example, resizing the extension filesystem with the command ‘system controllerfs-modify’ would cause a dd command to be executed and erase data.
This was seen in the puppet.log Story: 2002990 Task: 23004 Change-Id: I9ce4f9869d8b72549640d1a4181df02490451a88 Signed-off-by: Kristine Bujold --- .../puppet-lvm/centos/build_srpm.data | 2 +- ...ical-statement-for-nuke_fs_on_resize.patch | 45 +++++++++++++++++++ .../puppet-lvm/centos/puppet-lvm.spec | 2 + 3 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 config/puppet-modules/puppet-lvm/centos/files/Fix-the-logical-statement-for-nuke_fs_on_resize.patch diff --git a/config/puppet-modules/puppet-lvm/centos/build_srpm.data b/config/puppet-modules/puppet-lvm/centos/build_srpm.data index 9bfbdd412..6c97141c9 100644 --- a/config/puppet-modules/puppet-lvm/centos/build_srpm.data +++ b/config/puppet-modules/puppet-lvm/centos/build_srpm.data @@ -9,4 +9,4 @@ COPY_LIST="$CGCS_BASE/downloads/puppet/$PREFIX-$MODULE-$GIT_SHA.tar.gz $FILES_BA -TIS_PATCH_VER=4 +TIS_PATCH_VER=5 diff --git a/config/puppet-modules/puppet-lvm/centos/files/Fix-the-logical-statement-for-nuke_fs_on_resize.patch b/config/puppet-modules/puppet-lvm/centos/files/Fix-the-logical-statement-for-nuke_fs_on_resize.patch new file mode 100644 index 000000000..e1796ba3c --- /dev/null +++ b/config/puppet-modules/puppet-lvm/centos/files/Fix-the-logical-statement-for-nuke_fs_on_resize.patch @@ -0,0 +1,45 @@ +From 21d2c4e714611ad08e5aa999e555e1e7591f2717 Mon Sep 17 00:00:00 2001 +From: Kristine Bujold +Date: Thu, 19 Jul 2018 09:02:27 -0400 +Subject: [PATCH 1/1] Patch4: + Fix-the-logical-statement-for-nuke_fs_on_resize_2.patch + +--- + .../puppet/modules/lvm/lib/puppet/provider/logical_volume/lvm.rb | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/packstack/puppet/modules/lvm/lib/puppet/provider/logical_volume/lvm.rb b/packstack/puppet/modules/lvm/lib/puppet/provider/logical_volume/lvm.rb +index 2abfea3..f9b1c66 100755 +--- a/packstack/puppet/modules/lvm/lib/puppet/provider/logical_volume/lvm.rb ++++ b/packstack/puppet/modules/lvm/lib/puppet/provider/logical_volume/lvm.rb +@@ -184,13 +184,15 @@ Puppet::Type.type(:logical_volume).provide :lvm do + exec_cmd('umount', path) + exec_cmd('fsadm', '-y', 'check', path ) + r = exec_cmd('fsadm', '-y', 'resize', path, "#{new_size}k") +- if r[:exit] != 0 and @resource[:nuke_fs_on_resize_failure] ++ if r[:exit] != 0 and [:true, "true", true ].include? @resource[:nuke_fs_on_resize_failure] ++ info( "Failed 'fsadm resize' erase the disk #{r}" ) + exec_cmd('dd', 'if=/dev/zero', "of=#{path}", "bs=512", "count=16", "conv=notrunc") + blkid('-g') + end + r = exec_cmd('lvresize', '-r', '-f', '-L', "#{new_size}k", path) + if r[:exit] != 0 +- if @resource[:nuke_fs_on_resize_failure] ++ if [:true, "true", true ].include? @resource[:nuke_fs_on_resize_failure] ++ info( "Failed 'fsadm resize' erase the disk #{r}" ) + exec_cmd('dd', 'if=/dev/zero', "of=#{path}", "bs=512", "count=16", "conv=notrunc") + blkid('-g') + lvresize( '-f', '-L', "#{new_size}k", path) || fail( "Cannot reduce to size #{new_size} because lvresize failed." ) +@@ -215,7 +217,8 @@ Puppet::Type.type(:logical_volume).provide :lvm do + exec_cmd('umount', path) + exec_cmd('fsadm', '-y', 'check', path ) + r = exec_cmd('fsadm', '-y', 'resize', path, "#{new_size}k") +- if r[:exit] != 0 and @resource[:nuke_fs_on_resize_failure] ++ if r[:exit] != 0 and [:true, "true", true ].include? 
@resource[:nuke_fs_on_resize_failure] ++ info( "Failed 'fsadm resize' erase the disk #{r}" ) + exec_cmd('dd', 'if=/dev/zero', "of=#{path}", "bs=512", "count=16", "conv=notrunc") + blkid('-g') + end +-- +1.8.3.1 + diff --git a/config/puppet-modules/puppet-lvm/centos/puppet-lvm.spec b/config/puppet-modules/puppet-lvm/centos/puppet-lvm.spec index 8f7252092..247dc03c3 100644 --- a/config/puppet-modules/puppet-lvm/centos/puppet-lvm.spec +++ b/config/puppet-modules/puppet-lvm/centos/puppet-lvm.spec @@ -16,6 +16,7 @@ Patch0: 0001-puppet-lvm-kilo-quilt-changes.patch Patch1: 0002-UEFI-pvcreate-fix.patch Patch2: 0003-US94222-Persistent-Dev-Naming.patch Patch3: 0004-extendind-nuke_fs_on_resize_failure-functionality.patch +Patch4: Fix-the-logical-statement-for-nuke_fs_on_resize.patch BuildArch: noarch @@ -34,6 +35,7 @@ A Puppet module for Logical Resource Management (LVM) %patch1 -p1 %patch2 -p1 %patch3 -p1 +%patch4 -p1 %install install -d -m 0755 %{buildroot}/%{_datadir}/puppet/modules/%{module_dir} From 40447bfa0d37e92da34427674cd19063fd9651dd Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Thu, 16 Aug 2018 12:05:42 -0400 Subject: [PATCH 13/26] Decouple Fault Management from stx-config Add FM rest API document Remove the alarm and log APIs from sysinv rest API document Update the collect script with the FM config file new location Story: 2002828 Task: 22747 Depends-On: https://review.openstack.org/#/c/591452/ Change-Id: Ica507af0b6b0dc137bf1e469cdaed678917f24e8 Signed-off-by: Tao Liu --- .../restapi-doc/api-ref-guides/pom.xml | 11 + .../api-ref-guides/src/bk-api-ref-fm-v1.xml | 57 +++ .../api-ref-guides/src/bk-api-ref.xml | 1 + restapi-doc/restapi-doc/api-ref/pom.xml | 11 + .../api-ref/src/docbkx/api-ref-fm-v1.xml | 30 ++ .../api-ref/src/docbkx/ch_fm-api-v1.xml | 110 ++++ .../api-ref/src/docbkx/ch_sysinv-api-v1.xml | 75 +-- .../src/docbkx/itemizedlist-service-list.xml | 4 + .../v1/api_samples/alarm-response.json | 0 .../v1/api_samples/alarm_list-request.json | 1 + .../v1/api_samples/alarm_list-response.json | 0 .../api_samples/alarm_summary-response.json | 0 .../api_samples/event_log_list-request.json | 1 + .../api_samples/event_log_list-response.json | 0 .../api_samples/event_log_show-response.json | 0 .../event_suppression_list-request.json | 1 + .../event_suppression_list-response.json | 0 .../event_suppression_modify-request.json | 0 .../event_suppression_modify-response.json | 0 .../v1/api_samples/fm-versions-response.json | 24 + .../v1/api_samples/version-get-response.json | 24 + .../api_samples/versionv1-get-response.json | 50 ++ .../api-ref/src/wadls/fm-api/v1/common.ent | 436 ++++++++++++++++ .../src/wadls/fm-api/v1/fm-api-v1.wadl | 469 ++++++++++++++++++ .../v1/api_samples/alarm_list-request.json | 1 - .../api_samples/event_log_list-request.json | 1 - .../event_suppression_list-request.json | 1 - .../api_samples/sysinv-versions-response.json | 2 +- .../v1/api_samples/version-get-response.json | 2 +- .../api_samples/versionv1-get-response.json | 30 -- .../src/wadls/sysinv-api/v1/common.ent | 308 ------------ .../wadls/sysinv-api/v1/sysinv-api-v1.wadl | 382 -------------- .../collector/scripts/collect_mask_passwords | 2 +- 33 files changed, 1235 insertions(+), 799 deletions(-) create mode 100644 restapi-doc/restapi-doc/api-ref-guides/src/bk-api-ref-fm-v1.xml create mode 100644 restapi-doc/restapi-doc/api-ref/src/docbkx/api-ref-fm-v1.xml create mode 100644 restapi-doc/restapi-doc/api-ref/src/docbkx/ch_fm-api-v1.xml rename restapi-doc/restapi-doc/api-ref/src/wadls/{sysinv-api => 
fm-api}/v1/api_samples/alarm-response.json (100%) create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/alarm_list-request.json rename restapi-doc/restapi-doc/api-ref/src/wadls/{sysinv-api => fm-api}/v1/api_samples/alarm_list-response.json (100%) rename restapi-doc/restapi-doc/api-ref/src/wadls/{sysinv-api => fm-api}/v1/api_samples/alarm_summary-response.json (100%) create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_log_list-request.json rename restapi-doc/restapi-doc/api-ref/src/wadls/{sysinv-api => fm-api}/v1/api_samples/event_log_list-response.json (100%) rename restapi-doc/restapi-doc/api-ref/src/wadls/{sysinv-api => fm-api}/v1/api_samples/event_log_show-response.json (100%) create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_suppression_list-request.json rename restapi-doc/restapi-doc/api-ref/src/wadls/{sysinv-api => fm-api}/v1/api_samples/event_suppression_list-response.json (100%) rename restapi-doc/restapi-doc/api-ref/src/wadls/{sysinv-api => fm-api}/v1/api_samples/event_suppression_modify-request.json (100%) rename restapi-doc/restapi-doc/api-ref/src/wadls/{sysinv-api => fm-api}/v1/api_samples/event_suppression_modify-response.json (100%) create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/fm-versions-response.json create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/version-get-response.json create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/versionv1-get-response.json create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/common.ent create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/fm-api-v1.wadl delete mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/alarm_list-request.json delete mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_log_list-request.json delete mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_suppression_list-request.json diff --git a/restapi-doc/restapi-doc/api-ref-guides/pom.xml b/restapi-doc/restapi-doc/api-ref-guides/pom.xml index 0047e3735..f2669c716 100644 --- a/restapi-doc/restapi-doc/api-ref-guides/pom.xml +++ b/restapi-doc/restapi-doc/api-ref-guides/pom.xml @@ -154,6 +154,17 @@ SPDX-License-Identifier: Apache-2.0 api-ref-smapi-v1 + + os-api-ref-fm-v1 + + generate-pdf + + generate-sources + + bk-api-ref-fm-v1.xml + api-ref-fm-v1 + + api-ref-guides diff --git a/restapi-doc/restapi-doc/api-ref-guides/src/bk-api-ref-fm-v1.xml b/restapi-doc/restapi-doc/api-ref-guides/src/bk-api-ref-fm-v1.xml new file mode 100644 index 000000000..24ee51992 --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref-guides/src/bk-api-ref-fm-v1.xml @@ -0,0 +1,57 @@ + + + + + + + + + +GET'> +PUT'> +POST'> +DELETE'> +]> + + + Titanium Fault Management API v1 + API Reference + + + + + + + Wind River + + + + 2018 + Wind River + + Titanium Cloud + + + + Copyright details are filled in by the + template. 
+ + + + + + diff --git a/restapi-doc/restapi-doc/api-ref-guides/src/bk-api-ref.xml b/restapi-doc/restapi-doc/api-ref-guides/src/bk-api-ref.xml index 532896db5..299f16579 100644 --- a/restapi-doc/restapi-doc/api-ref-guides/src/bk-api-ref.xml +++ b/restapi-doc/restapi-doc/api-ref-guides/src/bk-api-ref.xml @@ -49,4 +49,5 @@ SPDX-License-Identifier: Apache-2.0 + diff --git a/restapi-doc/restapi-doc/api-ref/pom.xml b/restapi-doc/restapi-doc/api-ref/pom.xml index 49b5f37ba..83b21ee68 100644 --- a/restapi-doc/restapi-doc/api-ref/pom.xml +++ b/restapi-doc/restapi-doc/api-ref/pom.xml @@ -87,6 +87,17 @@ SPDX-License-Identifier: Apache-2.0 bk-dcmanager-api-v1 + + fm-api-v1 + + generate-html + + generate-sources + + api-ref-fm-v1.xml + bk-fm-api-v1 + + os-api-ref-compute-v2-cgcs-ext diff --git a/restapi-doc/restapi-doc/api-ref/src/docbkx/api-ref-fm-v1.xml b/restapi-doc/restapi-doc/api-ref/src/docbkx/api-ref-fm-v1.xml new file mode 100644 index 000000000..fca013489 --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/docbkx/api-ref-fm-v1.xml @@ -0,0 +1,30 @@ + + + + + Titanium Fault Management API v1 + + 2017 + Wind River + + + + + + + + + + diff --git a/restapi-doc/restapi-doc/api-ref/src/docbkx/ch_fm-api-v1.xml b/restapi-doc/restapi-doc/api-ref/src/docbkx/ch_fm-api-v1.xml new file mode 100644 index 000000000..2884f048b --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/docbkx/ch_fm-api-v1.xml @@ -0,0 +1,110 @@ + + + + Fault Management API v1 + The API supports alarm and event collection of the cloud platform itself. + The typical port used for the FM REST API is 18002. + However, proper technique would be to look up the FM service endpoint in Keystone. + + + + + + +
+ API versions + + + + + + + + +
+ + + + + +
+ Alarms + These APIs allow the display of the Active Alarms + in the system. + + + + + + + + + + + + + + +
+ + + + + + +
+ Event Log + These APIs allow the display of the Event Log + in the system. The Event log contains both historical alarms and customer logs. + + + + + + + + +
+ + + + + + +
+ Event Suppression + These APIs allow the display of the Event Suppression state + in the system. + + + + + + + + +
+ +
diff --git a/restapi-doc/restapi-doc/api-ref/src/docbkx/ch_sysinv-api-v1.xml b/restapi-doc/restapi-doc/api-ref/src/docbkx/ch_sysinv-api-v1.xml index 0d0aa76b7..fd01cec26 100644 --- a/restapi-doc/restapi-doc/api-ref/src/docbkx/ch_sysinv-api-v1.xml +++ b/restapi-doc/restapi-doc/api-ref/src/docbkx/ch_sysinv-api-v1.xml @@ -16,8 +16,8 @@ SPDX-License-Identifier: Apache-2.0 SysInv API v1 Manage physical servers with the Titanium System Inventory API. This includes inventory collection and configuration of nodes, ports, interfaces, CPUs, disks, - partitions, memory, and sensors. The API also supports alarm collection for fault - events of the cloud itself as well as configuration of the cloud's SNMP interface. + partitions, memory, and sensors. The API also supports configuration of the + cloud's SNMP interface. The typical port used for the SysInv REST API is 6385. However, proper technique would be to look up the sysinv service endpoint in Keystone. @@ -633,77 +633,6 @@ configuration entity for the system. - - - - -
- Alarms - These APIs allow the display of the Active Alarms - in the system. - - - - - - - - - - - - - - -
- - - - - - -
- Event Log - These APIs allow the display of the Event Log - in the system. The Event log contains both historical alarms and customer logs. - - - - - - - - -
- - - - - - -
- Event Suppression - These APIs allow the display of the Event Suppression state - in the system. - - - - - - - - -
- - diff --git a/restapi-doc/restapi-doc/api-ref/src/docbkx/itemizedlist-service-list.xml b/restapi-doc/restapi-doc/api-ref/src/docbkx/itemizedlist-service-list.xml index 0f568bb4c..4516ec933 100644 --- a/restapi-doc/restapi-doc/api-ref/src/docbkx/itemizedlist-service-list.xml +++ b/restapi-doc/restapi-doc/api-ref/src/docbkx/itemizedlist-service-list.xml @@ -46,4 +46,8 @@ SPDX-License-Identifier: Apache-2.0 Titanium SM API Service API v1 + + Titanium Fault Management Service + API v1 + diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/alarm-response.json b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/alarm-response.json similarity index 100% rename from restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/alarm-response.json rename to restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/alarm-response.json diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/alarm_list-request.json b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/alarm_list-request.json new file mode 100644 index 000000000..306e7969d --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/alarm_list-request.json @@ -0,0 +1 @@ +http://192.168.204.2:18002/v1/alarms?q.field=severity&q.op=eq&q.type=&q.value=major&include_suppress=True diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/alarm_list-response.json b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/alarm_list-response.json similarity index 100% rename from restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/alarm_list-response.json rename to restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/alarm_list-response.json diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/alarm_summary-response.json b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/alarm_summary-response.json similarity index 100% rename from restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/alarm_summary-response.json rename to restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/alarm_summary-response.json diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_log_list-request.json b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_log_list-request.json new file mode 100644 index 000000000..aad8f464b --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_log_list-request.json @@ -0,0 +1 @@ +http://192.168.204.2:18002/v1/event_log?q.field=start&q.field=end&q.op=eq&q.op=eq&q.type=&q.type=&q.value=2014-11-28T16%3A56%3A44&q.value=2014-11-28T16%3A56%3A45&limit=2 diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_log_list-response.json b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_log_list-response.json similarity index 100% rename from restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_log_list-response.json rename to restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_log_list-response.json diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_log_show-response.json b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_log_show-response.json similarity index 100% rename from restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_log_show-response.json rename to 
restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_log_show-response.json diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_suppression_list-request.json b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_suppression_list-request.json new file mode 100644 index 000000000..82603d52b --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_suppression_list-request.json @@ -0,0 +1 @@ +http://192.168.204.2:18002/v1/event_suppression diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_suppression_list-response.json b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_suppression_list-response.json similarity index 100% rename from restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_suppression_list-response.json rename to restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_suppression_list-response.json diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_suppression_modify-request.json b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_suppression_modify-request.json similarity index 100% rename from restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_suppression_modify-request.json rename to restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_suppression_modify-request.json diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_suppression_modify-response.json b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_suppression_modify-response.json similarity index 100% rename from restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_suppression_modify-response.json rename to restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/event_suppression_modify-response.json diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/fm-versions-response.json b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/fm-versions-response.json new file mode 100644 index 000000000..d5d018c38 --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/fm-versions-response.json @@ -0,0 +1,24 @@ +{ + "default_version": { + "id": "v1", + "links": [ + { + "href": "http://192.168.204.2:18002/v1/", + "rel": "self" + } + ] + }, + "versions": [ + { + "id": "v1", + "links": [ + { + "href": "http://192.168.204.2:18002/v1/", + "rel": "self" + } + ] + } + ], + "name": "Fault Management API", + "description": "Fault Management is an OpenStack project which provides REST API services for alarms and logs." +} diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/version-get-response.json b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/version-get-response.json new file mode 100644 index 000000000..d5d018c38 --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/version-get-response.json @@ -0,0 +1,24 @@ +{ + "default_version": { + "id": "v1", + "links": [ + { + "href": "http://192.168.204.2:18002/v1/", + "rel": "self" + } + ] + }, + "versions": [ + { + "id": "v1", + "links": [ + { + "href": "http://192.168.204.2:18002/v1/", + "rel": "self" + } + ] + } + ], + "name": "Fault Management API", + "description": "Fault Management is an OpenStack project which provides REST API services for alarms and logs." 
+} diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/versionv1-get-response.json b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/versionv1-get-response.json new file mode 100644 index 000000000..1746a45c2 --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/api_samples/versionv1-get-response.json @@ -0,0 +1,50 @@ +{ + "media_types": [ + { + "base": "application/json", + "type": "application/vnd.openstack.fm.v1+json" + } + ], + "links": [ + { + "href": "http://192.168.204.2:18002/v1/", + "rel": "self" + }, + { + "href": "http://www.windriver.com/developer/fm/dev/api-spec-v1.html", + "type": "text/html", + "rel": "describedby" + } + ], + "event_log": [ + { + "href": "http://192.168.204.2:18002/v1/event_log/", + "rel": "self" + }, + { + "href": "http://192.168.204.2:18002/event_log/", + "rel": "bookmark" + } + ], + "alarms": [ + { + "href": "http://192.168.204.2:18002/v1/alarms/", + "rel": "self" + }, + { + "href": "http://192.168.204.2:18002/alarms/", + "rel": "bookmark" + } + ], + "event_suppression": [ + { + "href": "http://192.168.204.2:18002/v1/event_suppression/", + "rel": "self" + }, + { + "href": "http://192.168.204.2:18002/event_suppression/", + "rel": "bookmark" + } + ], + "id": "v1" +} diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/common.ent b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/common.ent new file mode 100644 index 000000000..382e7d00d --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/common.ent @@ -0,0 +1,436 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + '> + + + + + + '> + + + + + + '> + + + + + '> + + + + + + '> + + + + + + '> + + + + + + + The universally unique identifier for this object. + + + + + For convenience, resources contain links to themselves. + This allows a client to easily obtain rather than construct + resource URIs. The following types of link relations are + associated with resources: a self link containing a versioned + link to the resource, and a bookmark link containing a permanent + link to a resource that is appropriate for long term storage. + + + + + The time when the object was created. + + + + + The time when the object was last updated. + + + '> + + + + + + + The alarm ID; each type of alarm has a unique ID. Note + the alarm_id and the entity_instance_id uniquely identify + an alarm instance. + + + + + The instance of the object raising alarm. A . separated list + of sub-entity-type=instance-value pairs, representing the containment + structure of the overall entity instance. Note + the alarm_id and the entity_instance_id uniquely identify + an alarm instance. + + + + + The text description of the alarm. + + + + + The severity of the alarm; critical, + major, minor, or warning. + + + + + The time in UTC at which the alarm has last been updated. + + + + + The unique identifier of the alarm. + + + '> + + + + The state of the alarm; set or clear + + + + + Indicates whether the alarm affects the service. + + + + + The proposed action to clear the alarm. + + + + + The type of the alarm. + + + + + The type of the object raising the alarm. A . separated list + of sub-entity-type, representing the containment structure of the + overall entity type. + + + + + The probable cause of the alarm. + + + + + Indicates whether suppression of the specific alarm is allowed. + + + '> + + + + UUID of the system. + + + + + Overall system status based on alarms present; critical, + degraded, or OK. 
+ + + + + Count of critical alarms on the system + + + + + Count of major alarms on the system + + + + + Count of minor alarms on the system + + + + + Count of warnings on the system + + + '> + + + + + + + The event log ID; each type of event log has a unique ID. Note + the event_log_id and the entity_instance_id uniquely identify + an event log instance. + + + + + The state of the event; set, clear or log + + + + + The instance of the object generating the event log. A . separated list + of sub-entity-type=instance-value pairs, representing the containment + structure of the overall entity instance. Note + the event_log_id and the entity_instance_id uniquely identify + an event log instance. + + + + + The text description of the event log. + + + + + The severity of the event log; critical, + major, minor or warning. + + + + + The time in UTC at which the event log has last been updated. + + + + + The unique identifier of the event log. + + + + + The next attribute is the request to use to get the next n + items. It is used to paginate the event log list. + + + '> + + + + The state of the event; set, clear or log + + + + + Indicates whether the event affects the service. + + + + + The proposed action to clear the event. + + + + + The type of the event. + + + + + The type of the object raising the alarm. A . separated list + of sub-entity-type, representing the containment structure of the + overall entity type. + + + + + The probable cause of the event. + + + + + Indicates whether suppression of the specific event is allowed. + + + '> + + + + + + + The alarm ID type (event ID type) that can be suppressed or unsuppressed. + + + + + + The text description of the event type. + + + + + The suppression status for the event ID type; suppressed or unsuppressed + + + '> + + + + GET'> + PUT'> + POST'> + DELETE'> diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/fm-api-v1.wadl b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/fm-api-v1.wadl new file mode 100644 index 000000000..07689fe08 --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/fm-api/v1/fm-api-v1.wadl @@ -0,0 +1,469 @@ + + + +%common;]> + + + + + + + + + + + + + + + + + + + + + + + + + + + The unique identifier of an existing active alarm. + + + + + + + + + + + + + + + + + + + + + + + + The unique identifier of an event log. + + + + + + + + + + + + + + + + + The unique identifier of an event suppression. + + + + + + + + + + + + + + + + API version details. + + + + + + + + + + + + + + + + + Lists information about all Fault Management API versions. + + + + + + + + + &commonFaults; &getFaults; + + + + + Shows details for Fault Management API v1. + + + + + + + + + &commonFaults; &getFaults; + + + + + + + + + + + + + + + Lists all active alarms based on specified query. + The supported query options are alarm_id, entity_type_id, entity_instance_id, + severity and alarm_type. + + + + + + + + This optional parameter when set to true (include_suppress=true) specifies + to include suppressed alarms in output. + + + + + + + + + + + + + + + + The list of active alarms based on the specified query. + + + &alarmListShowParameters; + + + + + + + + &commonFaults; &getFaults; + + + + + + Shows information about a specific alarm. + + + + + + + &alarmListShowParameters; + &alarmDetailShowParameters; + &commonListShowParameters; + + + + + + + + &commonFaults; &getFaults; + + + + + + Deletes a specific alarm. + NOTE Typically this command should NOT be used. 
I.e typically + alarms will be and should be cleared by the system + when the alarm condition clears. This command is only provided + in the event that the alarm has cleared but for some reason the + system has not removed the alarm. + + + + + + + + + + + + + Summarize all active alarms by severity. + + + + + + + + This optional parameter when set to true (include_suppress=true) specifies + to include suppressed alarms in the summations (default false). + + + + + + + + + &alarmSummaryShowParameters; + + + + + + + + &commonFaults; + + + + + + + + + + + + + Lists all event logs (historical alarms and customer logs) based on specified query. The logs + are returned in reverse chronological order. + The supported query options are event_log_id, entity_type_id, entity_instance_id, + severity, event_log_type, start and end. + + + + + + + + This parameter specifies filter rules for the logs to + be returned. + + + + + This parameter specifies the maximum number of event logs to + be returned. + + + + + This optional parameter when set to true (alarms=true) specifies + that only alarm event log records should be returned. + + + + + This optional parameter when set to true (logs=true) specifies + that only customer log records should be returned. + + + + + This optional parameter when set to true (include_suppress=true) specifies + to include suppressed alarms in output. + + + + + + + + + + + + + + + + The list of events log based on the specified query. + + + &eventLogListShowParameters; + + + + + + + + &commonFaults; &getFaults; + + + + + Shows information about a specific event log. + + + + + + + &eventLogListShowParameters; + &commonListShowParameters; + + + + + + + + &commonFaults; &getFaults; + + + + + + + + + + + + + Lists suppressed event id's. + + + + + + + + + The list of suppressed event types. + + + &EventSuppressionListShowParameters; + &commonListShowParameters; + + + + + + + + &commonFaults; &getFaults; + + + + + Modifies the value of an event suppression. + + + + + + + + The suppression status of an event suppression; suppressed or unsuppressed + + + + + + + + + + + + + + + + URIs to the modified event suppression. 
+ + + + &EventSuppressionListShowParameters; + &commonListShowParameters; + + + + + + + + &postPutFaults; + + + + diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/alarm_list-request.json b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/alarm_list-request.json deleted file mode 100644 index 4aa914e46..000000000 --- a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/alarm_list-request.json +++ /dev/null @@ -1 +0,0 @@ -http://192.168.204.2:6385/v1/ialarms?q.field=severity&q.op=eq&q.type=&q.value=major&include_suppress=True diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_log_list-request.json b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_log_list-request.json deleted file mode 100644 index f779ac846..000000000 --- a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_log_list-request.json +++ /dev/null @@ -1 +0,0 @@ -http://192.168.204.2:6385/v1/event_log?q.field=start&q.field=end&q.op=eq&q.op=eq&q.type=&q.type=&q.value=2014-11-28T16%3A56%3A44&q.value=2014-11-28T16%3A56%3A45&limit=2 diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_suppression_list-request.json b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_suppression_list-request.json deleted file mode 100644 index 0de69f46a..000000000 --- a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/event_suppression_list-request.json +++ /dev/null @@ -1 +0,0 @@ -http://192.168.204.2:6385/v1/event_suppression diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/sysinv-versions-response.json b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/sysinv-versions-response.json index 31a632c5b..964df10ec 100644 --- a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/sysinv-versions-response.json +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/sysinv-versions-response.json @@ -19,6 +19,6 @@ ] } ], - "description":"Titanium Cloud System API allows for the management of physical servers. This includes inventory collection and configuration of hosts, ports, interfaces, CPUs, disk, memory, and system configuration. The API also supports alarms and fault collection for the cloud itself as well as the configuration of the cloud's SNMP interface. ", + "description":"Titanium Cloud System API allows for the management of physical servers. This includes inventory collection and configuration of hosts, ports, interfaces, CPUs, disk, memory, and system configuration. The API also supports the configuration of the cloud's SNMP interface. ", "name":"Titanium SysInv API" } diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/version-get-response.json b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/version-get-response.json index 31a632c5b..964df10ec 100644 --- a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/version-get-response.json +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/version-get-response.json @@ -19,6 +19,6 @@ ] } ], - "description":"Titanium Cloud System API allows for the management of physical servers. This includes inventory collection and configuration of hosts, ports, interfaces, CPUs, disk, memory, and system configuration. 
The API also supports alarms and fault collection for the cloud itself as well as the configuration of the cloud's SNMP interface. ", + "description":"Titanium Cloud System API allows for the management of physical servers. This includes inventory collection and configuration of hosts, ports, interfaces, CPUs, disk, memory, and system configuration. The API also supports the configuration of the cloud's SNMP interface. ", "name":"Titanium SysInv API" } diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/versionv1-get-response.json b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/versionv1-get-response.json index dbb2ae95a..2ad1ad8d5 100644 --- a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/versionv1-get-response.json +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/versionv1-get-response.json @@ -136,36 +136,6 @@ "rel":"bookmark" } ], - "ialarms":[ - { - "href":"http://128.224.150.54:6385/v1/ialarms/", - "rel":"self" - }, - { - "href":"http://128.224.150.54:6385/ialarms/", - "rel":"bookmark" - } - ], - "event_log":[ - { - "href":"http://128.224.150.54:6385/v1/event_log/", - "rel":"self" - }, - { - "href":"http://128.224.150.54:6385/event_log/", - "rel":"bookmark" - } - ], - "event_suppression":[ - { - "href":"http://128.224.150.54:6385/v1/event_suppression/", - "rel":"self" - }, - { - "href":"http://128.224.150.54:6385/event_suppression/", - "rel":"bookmark" - } - ], "icommunity":[ { "href":"http://128.224.150.54:6385/v1/icommunity/", diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/common.ent b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/common.ent index 334e56b65..9e3047f2b 100644 --- a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/common.ent +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/common.ent @@ -1538,163 +1538,6 @@ '> - - - - - The alarm ID; each type of alarm has a unique ID. Note - the alarm_id and the entity_instance_id uniquely identify - an alarm instance. - - - - - The instance of the object raising alarm. A . separated list - of sub-entity-type=instance-value pairs, representing the containment - structure of the overall entity instance. Note - the alarm_id and the entity_instance_id uniquely identify - an alarm instance. - - - - - The text description of the alarm. - - - - - The severity of the alarm; critical, - major, minor, or warning. - - - - - The time in UTC at which the alarm has last been updated. - - - - - The unique identifier of the alarm. - - - '> - - - - The state of the alarm; set or clear - - - - - Indicates whether the alarm affects the service. - - - - - The proposed action to clear the alarm. - - - - - The type of the alarm. - - - - - The type of the object raising the alarm. A . separated list - of sub-entity-type, representing the containment structure of the - overall entity type. - - - - - The probable cause of the alarm. - - - - - Indicates whether suppression of the specific alarm is allowed. - - - '> - - - - UUID of the system. - - - - - Overall system status based on alarms present; critical, - degraded, or OK. - - - - - Count of critical alarms on the system - - - - - Count of major alarms on the system - - - - - Count of minor alarms on the system - - - - - Count of warnings on the system - - - - - '> - - - - - - - The event log ID; each type of event log has a unique ID. Note - the event_log_id and the entity_instance_id uniquely identify - an event log instance. 
- - - - - The state of the event; set, clear or log - - - - - The instance of the object generating the event log. A . separated list - of sub-entity-type=instance-value pairs, representing the containment - structure of the overall entity instance. Note - the event_log_id and the entity_instance_id uniquely identify - an event log instance. - - - - - The text description of the event log. - - - - - The severity of the event log; critical, - major, minor or warning. - - - - - The time in UTC at which the event log has last been updated. - - - - - The unique identifier of the event log. - - - - - The next attribute is the request to use to get the next n - items. It is used to paginate the event log list. - - - '> - - - - The state of the event; set, clear or log - - - - - Indicates whether the event affects the service. - - - - - The proposed action to clear the event. - - - - - The type of the event. - - - - - The type of the object raising the alarm. A . separated list - of sub-entity-type, representing the containment structure of the - overall entity type. - - - - - The probable cause of the event. - - - - - Indicates whether suppression of the specific event is allowed. - - - '> - - - - - - The alarm ID type (event ID type) that can be suppressed or unsuppressed. - - - - - - The text description of the event type. - - - - - The suppression status for the event ID type; suppressed or unsuppressed - - - '> - - - - - - - - - - - The unique identifier of an existing active alarm. - - - - - - - - - - - - - - @@ -581,40 +558,6 @@ SPDX-License-Identifier: Apache-2.0 - - - - - - - - - The unique identifier of an event log. - - - - - - - - - - - - - - - - - The unique identifier of an event suppression. - - - - - - - @@ -3616,140 +3559,6 @@ SPDX-License-Identifier: Apache-2.0 - - - - - - - - - - - - Lists all active alarms based on specified query. - The supported query options are alarm_id, entity_type_id, entity_instance_id, - severity and alarm_type. - - - - - - - - This optional parameter when set to true (include_suppress=true) specifies - to include suppressed alarms in output. - - - - - - - - - - - - - - - - The list of active alarms based on the specified query. - - - &alarmListShowParameters; - - - - - - - - &commonFaults; &getFaults; - - - - - - Shows information about a specific alarm. - - - - - - - &alarmListShowParameters; - &alarmDetailShowParameters; - &commonListShowParameters; - - - - - - - - &commonFaults; &getFaults; - - - - - - Deletes a specific alarm. - NOTE Typically this command should NOT be used. I.e typically - alarms will be and should be cleared by the system - when the alarm condition clears. This command is only provided - in the event that the alarm has cleared but for some reason the - system has not removed the alarm. - - - - - - - - - - - - - Summarize all active alarms by severity. - - - - - - - - This optional parameter when set to true (include_suppress=true) specifies - to include suppressed alarms in the summations (default false). - - - - - - - - - &alarmSummaryShowParameters; - - - - - - - - &commonFaults; - - - @@ -4570,197 +4379,6 @@ OAM Controller-1 IP Address. &postPutFaults; - - - - - - - - - - Lists all event logs (historical alarms and customer logs) based on specified query. The logs - are returned in reverse chronological order. - The supported query options are event_log_id, entity_type_id, entity_instance_id, - severity, event_log_type, start and end. - - - - - - - - This parameter specifies filter rules for the logs to - be returned. 
- - - - - This parameter specifies the maximum number of event logs to - be returned. - - - - - This optional parameter when set to true (alarms=true) specifies - that only alarm event log records should be returned. - - - - - This optional parameter when set to true (logs=true) specifies - that only customer log records should be returned. - - - - - This optional parameter when set to true (include_suppress=true) specifies - to include suppressed alarms in output. - - - - - - - - - - - - - - - - The list of events log based on the specified query. - - - &eventLogListShowParameters; - - - - - - - - &commonFaults; &getFaults; - - - - - Shows information about a specific event log. - - - - - - - &eventLogListShowParameters; - &commonListShowParameters; - - - - - - - - &commonFaults; &getFaults; - - - - - - - - - - - - Lists suppressed event id's. - - - - - - - - - The list of suppressed event types. - - - &EventSuppressionListShowParameters; - &commonListShowParameters; - - - - - - - - &commonFaults; &getFaults; - - - - - Modifies the value of an event suppression. - - - - - - - - The suppression status of an event suppression; suppressed or unsuppressed - - - - - - - - - - - - - - - - URIs to the modified event suppression. - - - - &EventSuppressionListShowParameters; - &commonListShowParameters; - - - - - - - - &postPutFaults; - - diff --git a/tools/collector/scripts/collect_mask_passwords b/tools/collector/scripts/collect_mask_passwords index b7f0e2461..0dd5fa9ab 100644 --- a/tools/collector/scripts/collect_mask_passwords +++ b/tools/collector/scripts/collect_mask_passwords @@ -13,7 +13,7 @@ for conffile in \ ${COLLECT_NAME_DIR}/etc/aodh/aodh.conf \ ${COLLECT_NAME_DIR}/etc/ceilometer/ceilometer.conf \ ${COLLECT_NAME_DIR}/etc/cinder/cinder.conf \ - ${COLLECT_NAME_DIR}/etc/fm.conf \ + ${COLLECT_NAME_DIR}/etc/fm/fm.conf \ ${COLLECT_NAME_DIR}/etc/glance/glance-api.conf \ ${COLLECT_NAME_DIR}/etc/glance/glance-registry.conf \ ${COLLECT_NAME_DIR}/etc/heat/heat.conf \ From 15564daeaaec04fc84601d45482af748d6624f55 Mon Sep 17 00:00:00 2001 From: yhu6 Date: Wed, 15 Aug 2018 15:19:37 +0800 Subject: [PATCH 14/26] fix: make libibverbs have newer version to match with mlnx-ofa_kernel Depends-On: https://review.openstack.org/591932 Change-Id: If185772a2963ac307ab541c1d20927f6742baaea Story: 2003443 Signed-off-by: yhu6 --- .../0001-Update-package-versioning-for-TIS-format.patch | 4 ++-- networking/mellanox/libibverbs/centos/srpm_path | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/networking/mellanox/libibverbs/centos/meta_patches/0001-Update-package-versioning-for-TIS-format.patch b/networking/mellanox/libibverbs/centos/meta_patches/0001-Update-package-versioning-for-TIS-format.patch index 7c13df876..2e617d9e6 100644 --- a/networking/mellanox/libibverbs/centos/meta_patches/0001-Update-package-versioning-for-TIS-format.patch +++ b/networking/mellanox/libibverbs/centos/meta_patches/0001-Update-package-versioning-for-TIS-format.patch @@ -16,8 +16,8 @@ index e55433c..74cb4d2 100644 Name: libibverbs Version: 41mlnx1 --Release: OFED.4.2.1.0.6.42120 -+Release: OFED.4.2.1.0.6.42120%{?_tis_dist}.%{tis_patch_ver} +-Release: OFED.4.3.2.1.6.43302 ++Release: OFED.4.3.2.1.6.43302%{?_tis_dist}.%{tis_patch_ver} Summary: A library for direct userspace use of RDMA (InfiniBand/iWARP) hardware Group: System Environment/Libraries diff --git a/networking/mellanox/libibverbs/centos/srpm_path b/networking/mellanox/libibverbs/centos/srpm_path index c58521323..7cd73d918 100644 --- 
a/networking/mellanox/libibverbs/centos/srpm_path +++ b/networking/mellanox/libibverbs/centos/srpm_path @@ -1 +1 @@ -repo:stx/downloads/libibverbs-41mlnx1-OFED.4.2.1.0.6.42120.src.rpm +repo:stx/downloads/libibverbs-41mlnx1-OFED.4.3.2.1.6.43302.src.rpm From ff1ba812c0feab0d8b982cd748b25020f4df8884 Mon Sep 17 00:00:00 2001 From: melissaml Date: Fri, 17 Aug 2018 15:34:13 +0800 Subject: [PATCH 15/26] Remove the duplicated word Change-Id: I68dc653708a33536b69ede4f032457ab951c24dd --- .../centos/patches/US103091-IMA-System-Configuration.patch | 2 +- .../centos/patches/US103091-IMA-System-Configuration.patch | 2 +- monitoring/collectd-extensions/src/mtce_notifier.py | 2 +- .../api-ref/src/wadls/sysinv-api/v1/sysinv-api-v1.wadl | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/kernel-rt/centos/patches/US103091-IMA-System-Configuration.patch b/kernel/kernel-rt/centos/patches/US103091-IMA-System-Configuration.patch index 1e6c79c41..936b2a0db 100644 --- a/kernel/kernel-rt/centos/patches/US103091-IMA-System-Configuration.patch +++ b/kernel/kernel-rt/centos/patches/US103091-IMA-System-Configuration.patch @@ -58,7 +58,7 @@ index d357e7d..f333b29 100644 + +############################################################################### +# -+# We will roll in the IMA X.509 certificate and pull it in the the kernel ++# We will roll in the IMA X.509 certificate and pull it in the kernel +# so that it gets loaded into the _ima keyring during boot. +# +# Ideally, this should have been treated similar to other .x509 certificates diff --git a/kernel/kernel-std/centos/patches/US103091-IMA-System-Configuration.patch b/kernel/kernel-std/centos/patches/US103091-IMA-System-Configuration.patch index 16b4e4f05..382fcc75e 100644 --- a/kernel/kernel-std/centos/patches/US103091-IMA-System-Configuration.patch +++ b/kernel/kernel-std/centos/patches/US103091-IMA-System-Configuration.patch @@ -68,7 +68,7 @@ index 44a82c1..000b9a8 100644 + +############################################################################### +# -+# We will roll in the IMA X.509 certificate and pull it in the the kernel ++# We will roll in the IMA X.509 certificate and pull it in the kernel +# so that it gets loaded into the _ima keyring during boot. +# +# Ideally, this should have been treated similar to other .x509 certificates diff --git a/monitoring/collectd-extensions/src/mtce_notifier.py b/monitoring/collectd-extensions/src/mtce_notifier.py index c18977eab..1ffa88a2a 100755 --- a/monitoring/collectd-extensions/src/mtce_notifier.py +++ b/monitoring/collectd-extensions/src/mtce_notifier.py @@ -58,7 +58,7 @@ FAIL = 1 MTCE_CMD_RX_PORT = 2101 # same state message throttle count. -# ... only send the the degrade message every 'this' number +# ... only send the degrade message every 'this' number # while the state of assert or clear remains the same. ONE_EVERY = 10 diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/sysinv-api-v1.wadl b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/sysinv-api-v1.wadl index cbdd9ab6c..39e5ceb5a 100644 --- a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/sysinv-api-v1.wadl +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/sysinv-api-v1.wadl @@ -3980,7 +3980,7 @@ SPDX-License-Identifier: Apache-2.0 name="community" style="plain" type="xsd:string" > - This parameter specifies the the community of which the trap destination is a member. + This parameter specifies the community of which the trap destination is a member. 
From cf1c5b53fa7a7916221ef8eebd9f77551efd3a40 Mon Sep 17 00:00:00 2001 From: Dean Troyer Date: Fri, 17 Aug 2018 16:00:55 -0500 Subject: [PATCH 16/26] Remove old repo map files Change-Id: I3255e71f48ad592cafd2d63fb8cb982cd38917d5 Signed-off-by: Dean Troyer --- mwa-sparta.map | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 mwa-sparta.map diff --git a/mwa-sparta.map b/mwa-sparta.map deleted file mode 100644 index 78cf86cf3..000000000 --- a/mwa-sparta.map +++ /dev/null @@ -1,20 +0,0 @@ -cgcs/recipes-3rdparty/python|python -cgcs/recipes-base|base -cgcs/common-bsp/recipes-kernel/linux|kernel-std -cgcs/common-bsp/recipes-kernel/linux-rt|kernel-rt -cgcs/recipes-cgi|cgi -cgcs/recipes-connectivity|connectivity -cgcs/recipes-core|core -cgcs/recipes-devtools|devtools -cgcs/recipes-extended|extended -cgcs/recipes-kernel|kernel -cgcs/recipes-networking|networking -cgcs/recipes-power|power -cgcs/recipes-restapi-doc/restapi-doc|restapi-doc -cgcs/recipes-security|security -cgcs/recipes-support|support -avs/drivers/mellanox/libibverbs|mellanox/libibverbs -avs/drivers/mellanox/libmlx4|mellanox/libmlx4 -avs/drivers/mellanox/libmlx5|mellanox/libmlx5 -avs/drivers/mellanox/mlnx-ofa_kernel|mellanox/mlnx-ofa_kernel -avs/drivers/mellanox/rdma-core|mellanox/rdma-core From 08bc309ffefaf616dd8e4746ed9cc477ae7bf2a9 Mon Sep 17 00:00:00 2001 From: Mathieu Godin Date: Tue, 3 Jul 2018 17:02:24 -0400 Subject: [PATCH 17/26] Update API stats data collection Replace the existing implementation of collectApi, which is not ideal as it relies on reading log data for GET/POST requests, with implementation from api-stats.py tool. Only total api, db and rabbit connections stats for services of interest are collected. Individual service pid stats are not collected since they take up storage space and add little value. Gunicorn stats currently represent stats of all gunicorn related services (e.g. panko-api, aodh-api, keystone-public, openstack_dashboard). They will be decomposed in the subsequent commit. Functional tests completed by Mathieu Godin. Change-Id: I8a27fe3374b57d66e35da937a3a250caf78245d3 Story: 2002895 Task: 22858 Signed-off-by: Tee Ngo --- .../scripts/cfg/engtools.conf | 23 ++++- .../scripts/live_stream.py | 94 ++++++++----------- 2 files changed, 63 insertions(+), 54 deletions(-) diff --git a/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf b/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf index 563bc2e07..66aed494e 100644 --- a/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf +++ b/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf @@ -47,10 +47,11 @@ netstats=10 postgres=30 rabbitmq=3600 vswitch=120 +api_requests=5 [AdditionalOptions] # Set this option to Y/N to enable/disable Openstack API GET/POST collection -API_REQUESTS=N +API_REQUESTS=Y # Set this option to Y/N to enable/disable the collection of all services and not just the ones listed below. 
Note that this hasn't been tested thoroughly ALL_SERVICES=N @@ -75,3 +76,23 @@ RABBITMQ_QUEUE_LIST=notifications.info versioned_notifications.info [CommonServices] COMMON_SERVICE_LIST=dnsmasq ceilometer-polling haproxy hwmond pmond rmond fsmond sw-patch-agent sysinv-agent syslog-ng hostwd iscsid io-monitor-manager acpid hbsClient logmgmt mtcClient mtcalarmd mtclogd sshd ntpd ptp4l phc2sys smartd sm sm-eru sm-watchdog sm-api ceilometer keyring cinder-rtstool tuned polkitd lldpd IPaddr2 dnsmasq systemd-udevd systemd-journald logrotate collectd + +[StaticServices] +STATIC_SERVICE_LIST=occtop memtop schedtop top.sh iostat.sh netstats.sh diskstats.sh memstats.sh filestats.sh ceph.sh postgres.sh rabbitmq.sh vswitch.sh + +[OpenStackServices] +OPEN_STACK_SERVICE_LIST=nova cinder aodh ceilometer heat glance ceph horizon keystone puppet sysinv neutron nova_api postgres panko nova_cell0 magnum ironic murano gnocchi + +[SkipList] +SKIP_LIST=ps top sh curl awk wc sleep lsof cut grep ip tail su + +[ExcludeList] +EXCLUDE_LIST=python python2 bash perl sudo init + +[ApiStatsConstantPorts] +DB_PORT_NUMBER=5432 +RABBIT_PORT_NUMBER=5672 + +[ApiStatsServices] +API_STATS_STRUCTURE=gunicorn;gunicorn;5000|sysinv-conductor;sysinv-co ;|neutron-server;neutron-s;9696|nova-conductor;nova-cond ;|sysinv-agent;sysinv-ag;|sysinv-api;sysinv-ap;6385|nova-api;nova-api ;18774|cinder-api;cinder-a;8776|glance-api;glance-a;9292|ceilometer;ceilomete;8777|vim;nfv-vim;4545|heat-api;heat-a;8004|heat-engine;heat-e;8004 + diff --git a/tools/engtools/hostdata-collectors/scripts/live_stream.py b/tools/engtools/hostdata-collectors/scripts/live_stream.py index d96773d39..48325ee75 100644 --- a/tools/engtools/hostdata-collectors/scripts/live_stream.py +++ b/tools/engtools/hostdata-collectors/scripts/live_stream.py @@ -14,6 +14,8 @@ import psutil import fcntl import logging import ConfigParser +import itertools +import six from multiprocessing import Process, cpu_count from subprocess import Popen, PIPE from collections import OrderedDict @@ -1114,60 +1116,37 @@ def collectCpuCount(influx_info, node, ci): except Exception: logging.error("cpu_count collection stopped unexpectedly with error: {}. 
Restarting process...".format(sys.exc_info())) +def countApiStatsServices(lsof_lines, service_port, service_name): + service_count = 0 + for line in lsof_lines: + if service_port is not None and service_name is not None and service_port in line and service_name in line: + service_count += 1 + return service_count -# collect API GET and POST requests/sec -def collectApi(influx_info, node, ci, openstack_svcs): +def collectApiStats(influx_info, node, ci, services): logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) logging.info("api_request data starting collection with a collection interval of {}s".format(ci["cpu_count"])) measurement = "api_requests" tags = {"node": node} - openstack_services = openstack_svcs influx_string = "" + lsof_args = ['lsof', '-Pn', '-i', 'tcp'] while True: try: - fields = {} - tmp = {} - tmp1 = {} - # get initial values - for s in openstack_services: - fields[s] = {"get": 0, "post": 0} - tmp[s] = {"get": 0, "post": 0} - log = "/var/log/{0}/{0}-api.log".format(s) - if os.path.exists(log): - if s == "ceilometer": - p = Popen("awk '/INFO/ && /500/' {} | wc -l".format(log), shell=True, stdout=PIPE) - else: - p = Popen("awk '/INFO/ && /GET/' {} | wc -l".format(log), shell=True, stdout=PIPE) - init_api_get = int(p.stdout.readline()) - tmp[s]["get"] = init_api_get - p.kill() - p = Popen("awk '/INFO/ && /POST/' {} | wc -l".format(log), shell=True, stdout=PIPE) - init_api_post = int(p.stdout.readline()) - tmp[s]["post"] = init_api_post - p.kill() - time.sleep(1) - # get new values - for s in openstack_services: - tmp1[s] = {"get": 0, "post": 0} - log = "/var/log/{0}/{0}-api.log".format(s) - if os.path.exists(log): - if s == "ceilometer": - p = Popen("awk '/INFO/ && /500/' {} | wc -l".format(log), shell=True, stdout=PIPE) - else: - p = Popen("awk '/INFO/ && /GET/' {} | wc -l".format(log), shell=True, stdout=PIPE) - api_get = int(p.stdout.readline()) - tmp1[s]["get"] = api_get - p.kill() - p = Popen("awk '/INFO/ && /POST/' {} | wc -l".format(log), shell=True, stdout=PIPE) - api_post = int(p.stdout.readline()) - tmp1[s]["post"] = api_post - p.kill() - # take difference - for key in fields: - if (key in tmp and key in tmp1) and (tmp1[key]["get"] >= tmp[key]["get"]) and (tmp1[key]["post"] >= tmp[key]["post"]): - fields[key]["get"] = (tmp1[key]["get"] - tmp[key]["get"]) - fields[key]["post"] = (tmp1[key]["post"] - tmp[key]["post"]) - influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "service", key, "get_requests", fields[key]["get"], "post_requests", fields[key]["post"]) + "\n" + fields = {} + lsof_result = Popen(lsof_args, shell=False, stdout=PIPE) + lsof_lines = list() + while True: + line = lsof_result.stdout.readline().strip("\n") + if not line: + break + lsof_lines.append(line) + lsof_result.kill() + for name, service in services.iteritems(): + api_count = countApiStatsServices(lsof_lines, service['api-port'], service['name']) + db_count = countApiStatsServices(lsof_lines, service['db-port'], service['name']) + rabbit_count = countApiStatsServices(lsof_lines, service['rabbit-port'], service['name']) + fields[name] = {"api": api_count, "db": db_count, "rabbit": rabbit_count} + influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "service", name, "api", fields[name]["api"], "db", fields[name]["db"], "rabbit", fields[name]["rabbit"]) + "\n" p = Popen("curl -s -o /dev/null 
'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True) p.communicate() influx_string = "" @@ -1177,7 +1156,6 @@ def collectApi(influx_info, node, ci, openstack_svcs): logging.error("api_request collection stopped unexpectedly with error: {}. Restarting process...".format(sys.exc_info())) time.sleep(3) - # returns the cores dedicated to platform use def getPlatformCores(node, cpe): if cpe is True or node.startswith("compute"): @@ -1347,12 +1325,7 @@ if __name__ == "__main__": common_services = list() services = {} live_svc = ("live_stream.py",) - static_svcs = ("occtop", "memtop", "schedtop", "top.sh", "iostat.sh", "netstats.sh", "diskstats.sh", "memstats.sh", "filestats.sh", "ceph.sh", "postgres.sh", "rabbitmq.sh", "vswitch.sh") collection_intervals = {"memtop": None, "memstats": None, "occtop": None, "schedtop": None, "load_avg": None, "cpu_count": None, "diskstats": None, "iostat": None, "filestats": None, "netstats": None, "postgres": None, "rabbitmq": None, "vswitch": None} - openstack_services = ("nova", "cinder", "aodh", "ceilometer", "heat", "glance", "ceph", "horizon", "keystone", "puppet", "sysinv", "neutron", "nova_api", "postgres", "panko", "nova_cell0", "magnum", "ironic", "murano", "gnocchi") - # memstats, schedtop, and filestats must skip/exclude certain fields when collect_all is enabled. No need to collect this stuff - exclude_list = ("python", "python2", "bash", "perl", "sudo", "init") - skip_list = ("ps", "top", "sh", "", "curl", "awk", "wc", "sleep", "lsof", "cut", "grep", "ip", "tail", "su") duration = None unconverted_duration = "" collect_api_requests = False @@ -1423,12 +1396,27 @@ if __name__ == "__main__": storage_services = tuple(config.get("StorageServices", "STORAGE_SERVICE_LIST").split()) rabbit_services = tuple(config.get("RabbitmqServices", "RABBITMQ_QUEUE_LIST").split()) common_services = tuple(config.get("CommonServices", "COMMON_SERVICE_LIST").split()) + static_svcs = tuple(config.get("StaticServices", "STATIC_SERVICE_LIST").split()) + openstack_services = tuple(config.get("OpenStackServices", "OPEN_STACK_SERVICE_LIST").split()) + skip_list = tuple(config.get("SkipList", "SKIP_LIST").split()) + exclude_list = tuple(config.get("ExcludeList", "EXCLUDE_LIST").split()) # get collection intervals for i in config.options("Intervals"): if config.get("Intervals", i) == "" or config.get("Intervals", i) is None: collection_intervals[i] = None else: collection_intervals[i] = int(config.get("Intervals", i)) + # get api-stats services + DB_PORT_NUMBER = config.get("ApiStatsConstantPorts", "DB_PORT_NUMBER") + RABBIT_PORT_NUMBER = config.get("ApiStatsConstantPorts", "RABBIT_PORT_NUMBER") + SERVICES = OrderedDict() + SERVICES_INFO = tuple(config.get("ApiStatsServices", "API_STATS_STRUCTURE").split('|')) + for service_string in SERVICES_INFO: + service_tuple = tuple(service_string.split(';')) + if service_tuple[2] != "" and service_tuple[2] != None: + SERVICES[service_tuple[0]] = {'name': service_tuple[1], 'db-port': DB_PORT_NUMBER, 'rabbit-port': RABBIT_PORT_NUMBER, 'api-port': service_tuple[2]} + else: + SERVICES[service_tuple[0]] = {'name': service_tuple[1], 'db-port': DB_PORT_NUMBER, 'rabbit-port': RABBIT_PORT_NUMBER, 'api-port': None} except Exception: print "An error has occurred when parsing the engtools.conf configuration file: {}".format(sys.exc_info()) sys.exit(0) @@ -1551,7 +1539,7 @@ if __name__ == "__main__": tasks.append(p) p.start() if collect_api_requests is True and node_type 
== "controller": - p = Process(target=collectApi, args=(influx_info, node, collection_intervals, openstack_services), name="api_requests") + p = Process(target=collectApiStats, args=(influx_info, node, collection_intervals, SERVICES), name="api_requests") tasks.append(p) p.start() From 1e9a7c03d4f6f4fcc6124a6ef48a4806fbb9237a Mon Sep 17 00:00:00 2001 From: Paul-Emile Element Date: Mon, 20 Aug 2018 11:09:04 -0400 Subject: [PATCH 18/26] Add ipv6 support in default lighttpd configuration The installed lighttpd.conf file missed an entry for ipv6 support This modification simply adds the missing configuration entry to enable the system to properly support ipv6 configurations Story: 2002986 Task: 23000 Change-Id: I3551e5cfeb4d31a8fefcbd3f6f1350bb17984053 Signed-off-by: Paul-Emile Element --- base/lighttpd/centos/build_srpm.data | 2 +- base/lighttpd/lighttpd-1.4.35/lighttpd.conf | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/base/lighttpd/centos/build_srpm.data b/base/lighttpd/centos/build_srpm.data index f057a5a58..6d94d3e28 100755 --- a/base/lighttpd/centos/build_srpm.data +++ b/base/lighttpd/centos/build_srpm.data @@ -6,4 +6,4 @@ COPY_LIST="lighttpd-1.4.35/index.html.lighttpd \ lighttpd-1.4.35/lighttpd-csr.conf \ lighttpd-1.4.35/check-content-length.patch \ lighttpd-1.4.35/lighttpd-tpm-support.patch" -TIS_PATCH_VER=5 +TIS_PATCH_VER=6 diff --git a/base/lighttpd/lighttpd-1.4.35/lighttpd.conf b/base/lighttpd/lighttpd-1.4.35/lighttpd.conf index 48ada9b6d..31b294800 100755 --- a/base/lighttpd/lighttpd-1.4.35/lighttpd.conf +++ b/base/lighttpd/lighttpd-1.4.35/lighttpd.conf @@ -243,6 +243,9 @@ $HTTP["url"] !~ "^/(rel-[^/]*|feed|updates|static)/" { # ".cgi" => "/usr/bin/perl" ) # +#### Listen to IPv6 +$SERVER["socket"] == "[::]:80" { } + #### status module #status.status-url = "/server-status" #status.config-url = "/server-config" From 54e1cf85d04df0452be95c8a12dd38af7b741596 Mon Sep 17 00:00:00 2001 From: Paul-Emile Element Date: Tue, 21 Aug 2018 13:33:10 -0400 Subject: [PATCH 19/26] Added 'capabilities' attribute to restapi documentation The generated restapi documentation was missing the description of the 'capabilities' attribute. 
This change update the documentation templates to add the missing entry Version changed from 1.9.0 to 1.9.1 Story: 2003068 Task: 23119 Change-Id: I697824ef9ebc4e25d7da935314f6b038f66a9fcb Signed-off-by: Paul-Emile Element --- restapi-doc/centos/build_srpm.data | 2 +- restapi-doc/centos/restapi-doc.spec | 2 +- restapi-doc/restapi-doc/Makefile | 2 +- restapi-doc/restapi-doc/README.mvn_cache | 5 +- .../src/wadls/sysinv-api/v1/common.ent | 49 +++++++++++++++++++ 5 files changed, 54 insertions(+), 6 deletions(-) diff --git a/restapi-doc/centos/build_srpm.data b/restapi-doc/centos/build_srpm.data index deff7763a..e74282c78 100644 --- a/restapi-doc/centos/build_srpm.data +++ b/restapi-doc/centos/build_srpm.data @@ -3,5 +3,5 @@ COPY_LIST="$SRC_DIR/* \ $CGCS_BASE/downloads/mvn.repo.tgz \ " -TIS_PATCH_VER=26 +TIS_PATCH_VER=27 BUILD_IS_SLOW=3 diff --git a/restapi-doc/centos/restapi-doc.spec b/restapi-doc/centos/restapi-doc.spec index 11655f56b..c4d18f8f2 100644 --- a/restapi-doc/centos/restapi-doc.spec +++ b/restapi-doc/centos/restapi-doc.spec @@ -1,6 +1,6 @@ Summary: RestAPI-Doc Name: restapi-doc -Version: 1.9.0 +Version: 1.9.1 Release: %{tis_patch_ver}%{?_tis_dist} License: Apache-2.0 Group: devel diff --git a/restapi-doc/restapi-doc/Makefile b/restapi-doc/restapi-doc/Makefile index 05e9fbc6f..a8997f0af 100644 --- a/restapi-doc/restapi-doc/Makefile +++ b/restapi-doc/restapi-doc/Makefile @@ -1,5 +1,5 @@ # increment this every release -API_VERSION := "1.9.0" +API_VERSION := "1.9.1" build: @git status > /dev/null ; \ diff --git a/restapi-doc/restapi-doc/README.mvn_cache b/restapi-doc/restapi-doc/README.mvn_cache index d91e0f804..1e2e235f9 100644 --- a/restapi-doc/restapi-doc/README.mvn_cache +++ b/restapi-doc/restapi-doc/README.mvn_cache @@ -5,11 +5,10 @@ Steps to produce mvn.repo.tgz [Maven cache] cd $MY_REPO/stx/stx-integ/restapi-doc/restapi-doc cp Makefile Makefile.backup cp Makefile.mvn_cache Makefile -build_srpms restapi-doc +build-srpms restapi-doc mock -r $MY_BUILD_CFG_STD "FILE_NAME_TO_THE_BUILT_SRPM" -mock -r $MY_BUILD_CFG_STD --copyout /builddir/build/BUILD/restapi-doc-1.6.0/mvn.repo.tgz ~/ +mock -r $MY_BUILD_CFG_STD --copyout /builddir/build/BUILD/restapi-doc-1.9.1/mvn.repo.tgz ~/ cp ~/mvn.repo.tgz $MY_REPO/stx/downloads/ -cd $MY_REPO/stx/downloads/ # only the first time # ln -s ../../../downloads/mvn.repo.tgz mvn.repo.tgz diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/common.ent b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/common.ent index 9e3047f2b..805bb9a24 100644 --- a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/common.ent +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/common.ent @@ -168,6 +168,55 @@ The user-specified location of the cloud system. + + + System capabilities. + + + sdn_enabled : (Boolean) Software Defined Networking enabled. + + + region_config : (Boolean) region selection: + + + true : Secondary region. + + + false : Primary region. + + + + + shared_services : Services provided by Primary region. + + + bm_region : Board Management controller network selection: + + + External : OAM network. + + + Internal : Management network. + + + + + cinder_backend : backend selection for Cinder. + + + vswitch_type : vSwitch selection. + + + security_feature : Selection of Spectre and Meltdown mitigation options. + + + https_enabled : (Boolean) selection of https mode for public URLs. 
+ + + + Date: Mon, 9 Jul 2018 17:33:44 -0400 Subject: [PATCH 20/26] Further enhancement of API stats collection Decompose gunicorn related stats to individual service stats. Add new and missing services to config file. Functional tests completed by Mathieu Godin. Change-Id: Ifcb81aa82a57c2a414fb99d43afc856a07d3846d Story: 2002895 Task: 22858 Signed-off-by: Tee Ngo --- .../scripts/cfg/engtools.conf | 3 +- .../scripts/live_stream.py | 61 +++++++++++-------- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf b/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf index 66aed494e..a6c06eac6 100644 --- a/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf +++ b/tools/engtools/hostdata-collectors/scripts/cfg/engtools.conf @@ -93,6 +93,7 @@ EXCLUDE_LIST=python python2 bash perl sudo init DB_PORT_NUMBER=5432 RABBIT_PORT_NUMBER=5672 +# The api stats data structure has three fields: the name displayed in ps -ef, the name displayed in lsof -Pn -i tcp and the specific api port of the service. [ApiStatsServices] -API_STATS_STRUCTURE=gunicorn;gunicorn;5000|sysinv-conductor;sysinv-co ;|neutron-server;neutron-s;9696|nova-conductor;nova-cond ;|sysinv-agent;sysinv-ag;|sysinv-api;sysinv-ap;6385|nova-api;nova-api ;18774|cinder-api;cinder-a;8776|glance-api;glance-a;9292|ceilometer;ceilomete;8777|vim;nfv-vim;4545|heat-api;heat-a;8004|heat-engine;heat-e;8004 +API_STATS_STRUCTURE=ironic-conductor;ironic-co;|ironic-api;ironic-ap;6485|radosgw-swift;radosgw;8|magnum-conductor;magnum-co;|magnum-api;magnum-ap;9511|murano-api;murano-ap;8082|murano-engine;murano-en;|keystone-public;gunicorn;5000|openstack_dashboard.wsgi;gunicorn;8080|gnocchi-api;gunicorn;8041|aodh-api;gunicorn;8042|panko-api;gunicorn;8977|sysinv-conductor;sysinv-co ;|neutron-server;neutron-s;9696|nova-conductor;nova-cond ;|sysinv-agent;sysinv-ag;|sysinv-api;sysinv-ap;6385|nova-api;nova-api ;18774|cinder-api;cinder-a;8776|glance-api;glance-a;9292|vim;nfv-vim;4545|heat-api;heat-a;8004|heat-engine;heat-e;8004 diff --git a/tools/engtools/hostdata-collectors/scripts/live_stream.py b/tools/engtools/hostdata-collectors/scripts/live_stream.py index 48325ee75..aed8f5520 100644 --- a/tools/engtools/hostdata-collectors/scripts/live_stream.py +++ b/tools/engtools/hostdata-collectors/scripts/live_stream.py @@ -1116,14 +1116,7 @@ def collectCpuCount(influx_info, node, ci): except Exception: logging.error("cpu_count collection stopped unexpectedly with error: {}. 
Restarting process...".format(sys.exc_info())) -def countApiStatsServices(lsof_lines, service_port, service_name): - service_count = 0 - for line in lsof_lines: - if service_port is not None and service_name is not None and service_port in line and service_name in line: - service_count += 1 - return service_count - -def collectApiStats(influx_info, node, ci, services): +def collectApiStats(influx_info, node, ci, services, db_port, rabbit_port): logging.basicConfig(filename="/tmp/livestream.log", filemode="a", format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) logging.info("api_request data starting collection with a collection interval of {}s".format(ci["cpu_count"])) measurement = "api_requests" @@ -1132,19 +1125,39 @@ def collectApiStats(influx_info, node, ci, services): lsof_args = ['lsof', '-Pn', '-i', 'tcp'] while True: try: - fields = {} - lsof_result = Popen(lsof_args, shell=False, stdout=PIPE) - lsof_lines = list() - while True: - line = lsof_result.stdout.readline().strip("\n") - if not line: - break - lsof_lines.append(line) - lsof_result.kill() - for name, service in services.iteritems(): - api_count = countApiStatsServices(lsof_lines, service['api-port'], service['name']) - db_count = countApiStatsServices(lsof_lines, service['db-port'], service['name']) - rabbit_count = countApiStatsServices(lsof_lines, service['rabbit-port'], service['name']) + fields = {} + lsof_result = Popen(lsof_args, shell=False, stdout=PIPE) + lsof_lines = list() + while True: + line = lsof_result.stdout.readline().strip("\n") + if not line: + break + lsof_lines.append(line) + lsof_result.kill() + for name, service in services.iteritems(): + pid_list = list() + check_pid = False + if name == "keystone-public": + check_pid = True + ps_result = Popen("pgrep -f --delimiter=' ' keystone-public", shell=True, stdout=PIPE) + pid_list = ps_result.stdout.readline().strip().split(' ') + ps_result.kill() + elif name == "gnocchi-api": + check_pid = True + ps_result = Popen("pgrep -f --delimiter=' ' gnocchi-api", shell=True, stdout=PIPE) + pid_list = ps_result.stdout.readline().strip().split(' ') + ps_result.kill() + api_count = 0 + db_count = 0 + rabbit_count = 0 + for line in lsof_lines: + if service['name'] is not None and service['name'] in line and (not check_pid or any(pid in line for pid in pid_list)): + if service['api-port'] is not None and service['api-port'] in line: + api_count += 1 + elif db_port is not None and db_port in line: + db_count += 1 + elif rabbit_port is not None and rabbit_port in line: + rabbit_count += 1 fields[name] = {"api": api_count, "db": db_count, "rabbit": rabbit_count} influx_string += "{},'{}'='{}','{}'='{}' '{}'='{}','{}'='{}','{}'='{}'".format(measurement, "node", tags["node"], "service", name, "api", fields[name]["api"], "db", fields[name]["db"], "rabbit", fields[name]["rabbit"]) + "\n" p = Popen("curl -s -o /dev/null 'http://'{}':'{}'/write?db='{}'' --data-binary '{}'".format(influx_info[0], influx_info[1], influx_info[2], influx_string), shell=True) @@ -1414,9 +1427,9 @@ if __name__ == "__main__": for service_string in SERVICES_INFO: service_tuple = tuple(service_string.split(';')) if service_tuple[2] != "" and service_tuple[2] != None: - SERVICES[service_tuple[0]] = {'name': service_tuple[1], 'db-port': DB_PORT_NUMBER, 'rabbit-port': RABBIT_PORT_NUMBER, 'api-port': service_tuple[2]} + SERVICES[service_tuple[0]] = {'name': service_tuple[1], 'api-port': service_tuple[2]} else: - SERVICES[service_tuple[0]] = {'name': service_tuple[1], 'db-port': 
DB_PORT_NUMBER, 'rabbit-port': RABBIT_PORT_NUMBER, 'api-port': None} + SERVICES[service_tuple[0]] = {'name': service_tuple[1], 'api-port': None} except Exception: print "An error has occurred when parsing the engtools.conf configuration file: {}".format(sys.exc_info()) sys.exit(0) @@ -1539,7 +1552,7 @@ if __name__ == "__main__": tasks.append(p) p.start() if collect_api_requests is True and node_type == "controller": - p = Process(target=collectApiStats, args=(influx_info, node, collection_intervals, SERVICES), name="api_requests") + p = Process(target=collectApiStats, args=(influx_info, node, collection_intervals, SERVICES, DB_PORT_NUMBER, RABBIT_PORT_NUMBER), name="api_requests") tasks.append(p) p.start() From 50c14591ca9d6ae2c1951a914591fcfec88cf45c Mon Sep 17 00:00:00 2001 From: Scott Little Date: Fri, 24 Aug 2018 12:24:34 -0400 Subject: [PATCH 21/26] Kernel can no longer build in 10GB tmpfs Problem: The kernel is sporadically reporting build failures due to 'no space on device' when compiled in a 10 GB tmpfs. Solution: Two parts: 1) Increase required size to 11 GB in the build_srpm.data. 2) Modify build system to allow alocation of a 11 GB tmpfs. Change-Id: I48aeff586f71ee5000a99354e33d199a38afec9e Story: 2002835 Task: 24519 Signed-off-by: Scott Little --- kernel/kernel-rt/centos/build_srpm.data | 2 +- kernel/kernel-std/centos/build_srpm.data | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/kernel-rt/centos/build_srpm.data b/kernel/kernel-rt/centos/build_srpm.data index 699bea4d8..ae352ddf0 100644 --- a/kernel/kernel-rt/centos/build_srpm.data +++ b/kernel/kernel-rt/centos/build_srpm.data @@ -1,4 +1,4 @@ COPY_LIST="files/*" TIS_PATCH_VER=43 -BUILD_IS_BIG=10 +BUILD_IS_BIG=11 BUILD_IS_SLOW=12 diff --git a/kernel/kernel-std/centos/build_srpm.data b/kernel/kernel-std/centos/build_srpm.data index 0de47ad03..2789e6f6e 100644 --- a/kernel/kernel-std/centos/build_srpm.data +++ b/kernel/kernel-std/centos/build_srpm.data @@ -1,4 +1,4 @@ COPY_LIST="files/*" TIS_PATCH_VER=36 -BUILD_IS_BIG=10 +BUILD_IS_BIG=11 BUILD_IS_SLOW=12 From 1c261f19555f5d29470d348dea61ff3151bcd5a0 Mon Sep 17 00:00:00 2001 From: Teresa Ho Date: Thu, 12 Jul 2018 11:48:44 -0400 Subject: [PATCH 22/26] Extend sysinv to assign kubernetes labels to nodes Updated restapi documentation for host labels. Change-Id: I5b240177138b3b159f34e1365088986cb516200f Signed-off-by: David Sullivan Story: 2002845 Task: 22793 Depends-On: https://review.openstack.org/#/c/595875/ --- .../api-ref/src/docbkx/ch_sysinv-api-v1.xml | 22 +++++ .../host_label_assign-request.json | 4 + .../api_samples/host_label_list-response.json | 14 +++ .../src/wadls/sysinv-api/v1/common.ent | 26 +++++ .../wadls/sysinv-api/v1/sysinv-api-v1.wadl | 96 +++++++++++++++++++ 5 files changed, 162 insertions(+) create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/host_label_assign-request.json create mode 100644 restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/host_label_list-response.json diff --git a/restapi-doc/restapi-doc/api-ref/src/docbkx/ch_sysinv-api-v1.xml b/restapi-doc/restapi-doc/api-ref/src/docbkx/ch_sysinv-api-v1.xml index fd01cec26..3983dfbf0 100644 --- a/restapi-doc/restapi-doc/api-ref/src/docbkx/ch_sysinv-api-v1.xml +++ b/restapi-doc/restapi-doc/api-ref/src/docbkx/ch_sysinv-api-v1.xml @@ -1358,4 +1358,26 @@ configuration entity for the system. + + + + +
+ Labels + + + + + + + + + + + +
+ diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/host_label_assign-request.json b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/host_label_assign-request.json new file mode 100644 index 000000000..7afc33d53 --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/host_label_assign-request.json @@ -0,0 +1,4 @@ +{ + "key1": "value1", + "key2": "value2" +} diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/host_label_list-response.json b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/host_label_list-response.json new file mode 100644 index 000000000..ba30258d2 --- /dev/null +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/api_samples/host_label_list-response.json @@ -0,0 +1,14 @@ +{ + "labels": [ + { + "uuid": "a7d37730-c58e-4b18-9046-6bd0f4fe03a8", + "host_uuid": "42e30882-ab1a-41b0-9f65-696f6d804888", + "label": "key1=value1" + }, + { + "uuid": "c9d3aca9-d360-406c-80c7-a059404471c1", + "host_uuid": "42e30882-ab1a-41b0-9f65-696f6d804888", + "label": "key2=value2" + } + ] +} diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/common.ent b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/common.ent index 9e3047f2b..61d36de53 100644 --- a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/common.ent +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/common.ent @@ -3903,6 +3903,32 @@ '> + + + + + The universally unique identifier for this object. + + + + + The uuid for the host. + + + + + The label provisioned for the host. + + + '> + GET'> PUT'> diff --git a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/sysinv-api-v1.wadl b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/sysinv-api-v1.wadl index f9450a48d..5c4a9f821 100644 --- a/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/sysinv-api-v1.wadl +++ b/restapi-doc/restapi-doc/api-ref/src/wadls/sysinv-api/v1/sysinv-api-v1.wadl @@ -199,6 +199,11 @@ SPDX-License-Identifier: Apache-2.0 + + + + + @@ -1003,6 +1008,26 @@ SPDX-License-Identifier: Apache-2.0 + + + + + The unique identifier of an existing host. + + + + + + + + + + The unique identifier of an existing host label. + + + + + @@ -7263,4 +7288,75 @@ OAM Controller-1 IP Address. &commonFaults; &getFaults; + + + + + + + + Assign label to a host. + + + + + + + + This parameter specifies the label key value pairs. + + + + + + + + + + + + + + + + + &commonFaults; &postPutFaults; + + + + + List host label. + + + + + + &labelListParameters; + + + + + + + + &commonFaults; &getFaults; + + + + + Remove label from a host. + + + + + + + + + From f8f5d48a4c9097fe4deb7054c817deca29c56e2a Mon Sep 17 00:00:00 2001 From: Matt Peters Date: Mon, 27 Aug 2018 10:06:53 -0500 Subject: [PATCH 23/26] Add Mellanox driver packages to ISO image The Mellanox Linux driver packages are being built but were not included in the ISO image, therefore Mellanox NIC devices were not supported. This update ensures these packages are included in the ISO image. 
Story: 2003104 Task: 24541 Change-Id: I9b843a66bfd7285d6a28a443b7fb35aa8ebdceea Signed-off-by: Matt Peters --- centos_iso_image.inc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/centos_iso_image.inc b/centos_iso_image.inc index 2e6eb528c..3ebf82576 100644 --- a/centos_iso_image.inc +++ b/centos_iso_image.inc @@ -327,3 +327,8 @@ grub2-efi-x64-modules kernel-rt kernel-rt-kvm kernel-rt-tools + +# mellanox drivers +rdma-core +mlnx-ofa_kernel-modules +mlnx-ofa_kernel-rt-modules From 988112868ca21622c522807e0a72718f51420488 Mon Sep 17 00:00:00 2001 From: Matt Peters Date: Fri, 10 Aug 2018 09:39:42 -0500 Subject: [PATCH 24/26] Enable Mellanox PMDs in ovs-dpdk build The Mellanox devices are not currently supported by ovs-dpdk since they are not built into the set of PMDs enabled. Due to external build dependencies, the default DPDK configuration sets them to disabled. This update enables the mlx4 and mlx5 PMDs and adds the necessary build dependencies to the openvswitch.spec file to properly build and link the Mellanox PMDs. Story: 2003104 Task: 23218 Change-Id: I9048b1b39e279261622f5add02c6642fab21e532 Signed-off-by: Matt Peters --- networking/openvswitch/centos/build_srpm.data | 2 +- .../meta_patches/0007-enable-mlx-pmds.patch | 45 +++++++++++++++++++ .../centos/meta_patches/PATCH_ORDER | 1 + 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 networking/openvswitch/centos/meta_patches/0007-enable-mlx-pmds.patch diff --git a/networking/openvswitch/centos/build_srpm.data b/networking/openvswitch/centos/build_srpm.data index 6e36f5d7f..b98158431 100644 --- a/networking/openvswitch/centos/build_srpm.data +++ b/networking/openvswitch/centos/build_srpm.data @@ -1,3 +1,3 @@ COPY_LIST="files/*" -TIS_PATCH_VER=0 +TIS_PATCH_VER=1 BUILD_IS_SLOW=12 diff --git a/networking/openvswitch/centos/meta_patches/0007-enable-mlx-pmds.patch b/networking/openvswitch/centos/meta_patches/0007-enable-mlx-pmds.patch new file mode 100644 index 000000000..a20174ede --- /dev/null +++ b/networking/openvswitch/centos/meta_patches/0007-enable-mlx-pmds.patch @@ -0,0 +1,45 @@ +diff --git a/SOURCES/x86_64-native-linuxapp-gcc-config b/SOURCES/x86_64-native-linuxapp-gcc-config +index f81d420..eab161c 100644 +--- a/SOURCES/x86_64-native-linuxapp-gcc-config ++++ b/SOURCES/x86_64-native-linuxapp-gcc-config +@@ -197,12 +197,12 @@ CONFIG_RTE_LIBRTE_FM10K_DEBUG_DRIVER=n + CONFIG_RTE_LIBRTE_FM10K_RX_OLFLAGS_ENABLE=y + CONFIG_RTE_LIBRTE_FM10K_INC_VECTOR=y + # Compile burst-oriented Mellanox ConnectX-3 (MLX4) PMD +-CONFIG_RTE_LIBRTE_MLX4_PMD=n ++CONFIG_RTE_LIBRTE_MLX4_PMD=y + CONFIG_RTE_LIBRTE_MLX4_DEBUG=n + CONFIG_RTE_LIBRTE_MLX4_DEBUG_BROKEN_VERBS=n + CONFIG_RTE_LIBRTE_MLX4_TX_MP_CACHE=8 + # Compile burst-oriented Mellanox ConnectX-4 & ConnectX-5 (MLX5) PMD +-CONFIG_RTE_LIBRTE_MLX5_PMD=n ++CONFIG_RTE_LIBRTE_MLX5_PMD=y + CONFIG_RTE_LIBRTE_MLX5_DEBUG=n + CONFIG_RTE_LIBRTE_MLX5_TX_MP_CACHE=8 + # Compile burst-oriented Broadcom PMD driver +diff --git a/SPECS/openvswitch.spec b/SPECS/openvswitch.spec +index 29255d5..f392e95 100644 +--- a/SPECS/openvswitch.spec ++++ b/SPECS/openvswitch.spec +@@ -155,6 +155,7 @@ BuildRequires: libcap-ng libcap-ng-devel + %ifarch %{dpdkarches} + # DPDK driver dependencies + BuildRequires: zlib-devel libpcap-devel numactl-devel ++BuildRequires: rdma-core-devel + Requires: python-pyelftools + + # Virtual provide for depending on DPDK-enabled OVS +@@ -356,7 +357,12 @@ cd - + --dpdk \ + < rhel/usr_lib_systemd_system_ovs-vswitchd.service.in \ + > rhel/usr_lib_systemd_system_ovs-vswitchd.service 
+-make %{?_smp_mflags} ++make %{?_smp_mflags} \ ++%if %{with dpdk} ++%ifarch %{dpdkarches} ++ LDFLAGS="-libverbs -lmlx4 -lmlx5" ++%endif ++%endif + + %install + rm -rf $RPM_BUILD_ROOT diff --git a/networking/openvswitch/centos/meta_patches/PATCH_ORDER b/networking/openvswitch/centos/meta_patches/PATCH_ORDER index 12dcf854c..1c6551c98 100644 --- a/networking/openvswitch/centos/meta_patches/PATCH_ORDER +++ b/networking/openvswitch/centos/meta_patches/PATCH_ORDER @@ -4,3 +4,4 @@ 0004-add-pmon-conf-files.patch 0005-log-rotation-config.patch 0006-rpm-check-with-condition.patch +0007-enable-mlx-pmds.patch From 98aa0d5f2bba7ae8a6e791c850856a79c0ca3d67 Mon Sep 17 00:00:00 2001 From: Robert Church Date: Thu, 16 Aug 2018 14:57:27 -0400 Subject: [PATCH 25/26] Enable helm repository and chart upload tool There's a lot going on here but conceptually we're just enabling a local helm repo along with a helper script to install helm charts into the repo. The first item is to configure lighttpd to serve up helm charts as static information (so no proxying) at http://127.0.0.1/helm_charts". This is fairly straightforward, but the files are served out of /www which isn't a replicated filesystem and which is owned by the www user. The helm puppet manifest is modified to create the "helm_charts" directory for the webserver, to generate the initial index file, and to tell helm to add the new repo for the "wrsroot" user. The various commands are run as specific users with specific environment variables, this is key to making everything work as planned. To allow the wrsroot user to upload charts into /www the helm-upload script will re-run itself as the www user. /etc/sudoers.d is modified to allow this without asking for a password. The upload script will copy the specified charts in to /www/pages/helm_charts, and will then regenerate the index.yaml file. The upload script will then try to sync the files over to the other node. To enable this without prompting for a password we modify /etc/rsyncd.conf to allow passwordless syncing into /www/helm_charts. In a future commit we'll need to sync charts with the other controller when booting up, and also configure the local starlingx helm repo on the second controller. 
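For reference, a rough shell sketch of the end state this sets up (the
repo name and chart filename below are illustrative only; the actual
provisioning is done by the helm puppet manifest in the Depends-On
review):

    # one-time setup on the controller (the manifest does the equivalent)
    mkdir -p /www/pages/helm_charts && chown www /www/pages/helm_charts
    sudo -u www /usr/sbin/helm repo index /www/pages/helm_charts
    helm repo add starlingx http://127.0.0.1/helm_charts    # run as wrsroot

    # publishing a packaged chart afterwards (wrsroot, passwordless via sudoers)
    helm-upload example-chart-0.1.0.tgz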
Change-Id: I86a7795decb7833cb22c04e34e298c8d24ed7fa3 Signed-off-by: David Sullivan Story: 2002876 Task: 22831 Depends-On: https://review.openstack.org/596802 --- base/rsync/centos/build_srpm.data | 2 +- base/rsync/files/rsyncd.conf | 6 ++ kubernetes/helm/centos/build_srpm.data | 5 +- kubernetes/helm/centos/files/helm-upload | 79 ++++++++++++++++++++++++ kubernetes/helm/centos/files/helm.sudo | 3 + kubernetes/helm/centos/helm.spec | 13 ++-- 6 files changed, 99 insertions(+), 9 deletions(-) create mode 100644 kubernetes/helm/centos/files/helm-upload create mode 100644 kubernetes/helm/centos/files/helm.sudo diff --git a/base/rsync/centos/build_srpm.data b/base/rsync/centos/build_srpm.data index 2c93764a1..69cb924ed 100644 --- a/base/rsync/centos/build_srpm.data +++ b/base/rsync/centos/build_srpm.data @@ -1,2 +1,2 @@ COPY_LIST="$PKG_BASE/files/rsyncd.conf" -TIS_PATCH_VER=1 +TIS_PATCH_VER=2 diff --git a/base/rsync/files/rsyncd.conf b/base/rsync/files/rsyncd.conf index 8b56742b8..f7a26e1df 100644 --- a/base/rsync/files/rsyncd.conf +++ b/base/rsync/files/rsyncd.conf @@ -49,3 +49,9 @@ read only = yes comment = SSL ca certificate uid = root read only = no + +[helm_charts] + path = /www/pages/helm_charts + comment = Helm chart repo + uid = root + read only = no diff --git a/kubernetes/helm/centos/build_srpm.data b/kubernetes/helm/centos/build_srpm.data index 1d35a996a..d18ca5832 100644 --- a/kubernetes/helm/centos/build_srpm.data +++ b/kubernetes/helm/centos/build_srpm.data @@ -1,7 +1,6 @@ VERSION=2.9.1 TAR_NAME=helm TAR="$TAR_NAME-v$VERSION-linux-amd64.tar.gz" -#COPY_LIST="${CGCS_BASE}/downloads/$TAR ${CGCS_BASE}/downloads/tiller-2.9.1-docker-image.tgz" -COPY_LIST="${CGCS_BASE}/downloads/$TAR" +COPY_LIST="${CGCS_BASE}/downloads/$TAR $FILES_BASE/*" -TIS_PATCH_VER=2 +TIS_PATCH_VER=3 diff --git a/kubernetes/helm/centos/files/helm-upload b/kubernetes/helm/centos/files/helm-upload new file mode 100644 index 000000000..a7f8dcde5 --- /dev/null +++ b/kubernetes/helm/centos/files/helm-upload @@ -0,0 +1,79 @@ +#!/bin/bash + +# +# Copyright (c) 2018 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +# This script takes the names of packaged helm charts as arguments. +# It installs them in the on-node helm chart repository and regenerates +# the repository index. + + +# We want to run as the "www" user and scripts can't be setuid. The +# sudoers permissions are set up to allow wrsroot to run this script +# as the "www" user without a password. +if [ $USER != "www" ]; then + exec sudo -u www $0 $@ +fi + + +RETVAL=0 +REINDEX=0 + +REPO_DIR='/www/pages/helm_charts' + +for FILE in "$@"; do + if [ -r $FILE ]; then + # QUESTION: should we disallow overwriting an existing file? + # The versions are embedded in the filename, so it shouldn't + # cause problems. + cp $FILE $REPO_DIR + if [ $? -ne 0 ]; then + echo Problem adding $FILE to helm chart registry. + RETVAL=1 + else + REINDEX=1 + fi + else + echo Cannot read file ${FILE}. + RETVAL=1 + fi +done + + +# Now re-index the helm repository if we successfully copied in +# any new charts. +if [ $REINDEX -eq 1 ]; then + /usr/sbin/helm repo index $REPO_DIR +fi + +if [ ! -f "/etc/platform/simplex" ]; then + # We're not a one node system, copy the files to the other + # controller if we can + if [ $HOSTNAME == "controller-0" ]; then + TARGET="controller-1" + else + TARGET="controller-0" + fi + + # We've modified etc/rsyncd.conf to allow access to /www/helm_charts + # To avoid races, copy over the index file last. 
+ rsync -acv --exclude=index.yaml ${REPO_DIR}/ rsync://${TARGET}/helm_charts + if [ $? -ne 0 ]; then + echo Problem syncing helm charts to $TARGET + RETVAL=1 + fi + + rsync -acv ${REPO_DIR}/index.yaml rsync://${TARGET}/helm_charts + if [ $? -ne 0 ]; then + echo Problem syncing helm chart index file to $TARGET + RETVAL=1 + fi +fi + +# We also need to sync the helm charts on node startup +# in case they were added while the node was down. + +exit $RETVAL diff --git a/kubernetes/helm/centos/files/helm.sudo b/kubernetes/helm/centos/files/helm.sudo new file mode 100644 index 000000000..48e02bfbb --- /dev/null +++ b/kubernetes/helm/centos/files/helm.sudo @@ -0,0 +1,3 @@ +wrsroot ALL=(www) NOPASSWD: /usr/local/sbin/helm-upload + +Defaults lecture=never, secure_path=/usr/local/bin:/usr/bin:/bin:/usr/local/sbin:/usr/sbin:/sbin diff --git a/kubernetes/helm/centos/helm.spec b/kubernetes/helm/centos/helm.spec index 6afe901ef..f4f56e31c 100644 --- a/kubernetes/helm/centos/helm.spec +++ b/kubernetes/helm/centos/helm.spec @@ -7,7 +7,8 @@ Group: devel Packager: Wind River URL: https://github.com/kubernetes/helm/releases Source0: %{name}-v%{version}-linux-amd64.tar.gz -#Source1: tiller-2.9.1-docker-image.tgz +Source1: helm-upload +Source2: helm.sudo Requires: /bin/bash @@ -20,11 +21,13 @@ Requires: /bin/bash %install install -d %{buildroot}%{_sbindir} install -m 755 ${RPM_BUILD_DIR}/linux-amd64/helm %{buildroot}%{_sbindir}/helm -#install -d %{buildroot}%{_sharedstatedir}/tiller -#install -m 400 %{SOURCE1} %{buildroot}%{_sharedstatedir}/tiller/tiller-2.9.1-docker-image.tgz +install -d %{buildroot}/usr/local/sbin +install -m 755 %{SOURCE1} %{buildroot}/usr/local/sbin/helm-upload +install -d %{buildroot}%{_sysconfdir}/sudoers.d +install -m 440 %{SOURCE2} %{buildroot}%{_sysconfdir}/sudoers.d/helm %files %defattr(-,root,root,-) %{_sbindir}/helm -#%{_sharedstatedir}/tiller/tiller-2.9.1-docker-image.tgz - +/usr/local/sbin/helm-upload +%{_sysconfdir}/sudoers.d/helm From ca1afed9479c894670cf15cb5e566dc218b2eaaa Mon Sep 17 00:00:00 2001 From: zhipengl Date: Mon, 20 Aug 2018 19:01:59 +0800 Subject: [PATCH 26/26] [upstream] Remove stx-integ/virt/libvirt/libvirt-2.0.0 Removing this directory because it is no longer used, as we are currently on libvirt-3.5.0 Story: 2003409 Task: 24550 Change-Id: Ic88c2d54df41c4ca271ef8db482e7226ba37d80f Signed-off-by: zhipengl --- virt/libvirt/libvirt-2.0.0/libvirt.logrotate | 14 -------------- virt/libvirt/libvirt-2.0.0/libvirt.lxc | 15 --------------- virt/libvirt/libvirt-2.0.0/libvirt.qemu | 15 --------------- virt/libvirt/libvirt-2.0.0/libvirt.uml | 15 --------------- 4 files changed, 59 deletions(-) delete mode 100644 virt/libvirt/libvirt-2.0.0/libvirt.logrotate delete mode 100644 virt/libvirt/libvirt-2.0.0/libvirt.lxc delete mode 100644 virt/libvirt/libvirt-2.0.0/libvirt.qemu delete mode 100644 virt/libvirt/libvirt-2.0.0/libvirt.uml diff --git a/virt/libvirt/libvirt-2.0.0/libvirt.logrotate b/virt/libvirt/libvirt-2.0.0/libvirt.logrotate deleted file mode 100644 index a60915995..000000000 --- a/virt/libvirt/libvirt-2.0.0/libvirt.logrotate +++ /dev/null @@ -1,14 +0,0 @@ -/var/log/libvirt/libvirtd.log -{ - nodateext - size 10M - start 1 - rotate 20 - missingok - notifempty - compress - sharedscripts - postrotate - /etc/init.d/syslog reload > /dev/null 2>&1 || true - endscript -} diff --git a/virt/libvirt/libvirt-2.0.0/libvirt.lxc b/virt/libvirt/libvirt-2.0.0/libvirt.lxc deleted file mode 100644 index 81ea6210b..000000000 --- a/virt/libvirt/libvirt-2.0.0/libvirt.lxc +++ /dev/null @@ 
-1,15 +0,0 @@ -/var/log/libvirt/lxc/*.log -{ - nodateext - size 10M - start 1 - rotate 20 - missingok - notifempty - compress - sharedscripts - postrotate - /etc/init.d/syslog reload > /dev/null 2>&1 || true - endscript -} - diff --git a/virt/libvirt/libvirt-2.0.0/libvirt.qemu b/virt/libvirt/libvirt-2.0.0/libvirt.qemu deleted file mode 100644 index 470ef8cda..000000000 --- a/virt/libvirt/libvirt-2.0.0/libvirt.qemu +++ /dev/null @@ -1,15 +0,0 @@ -/var/log/libvirt/qemu/*.log -{ - nodateext - size 10M - start 1 - rotate 4 - missingok - notifempty - compress - sharedscripts - postrotate - /etc/init.d/syslog reload > /dev/null 2>&1 || true - endscript -} - diff --git a/virt/libvirt/libvirt-2.0.0/libvirt.uml b/virt/libvirt/libvirt-2.0.0/libvirt.uml deleted file mode 100644 index 1c26219f0..000000000 --- a/virt/libvirt/libvirt-2.0.0/libvirt.uml +++ /dev/null @@ -1,15 +0,0 @@ -/var/log/libvirt/uml/*.log -{ - nodateext - size 10M - start 1 - rotate 4 - missingok - notifempty - compress - sharedscripts - postrotate - /etc/init.d/syslog reload > /dev/null 2>&1 || true - endscript -} -