Test resize with mem_page_size in flavor

These tests are meant to address issue [1]. This change adds three new
test cases:

 * test_hugepage_resize_large_to_small
 * test_hugepage_resize_size_to_small
 * test_hugepage_resize_size_to_size

All three tests follow the same basic procedure: spawn a guest with a
flavor using hw:mem_page_size=<size_a>, resize the guest to a flavor
with a different hw:mem_page_size=<size_b>, and then resize the guest
back to the original flavor. Throughout the tests, XML checks are
performed to ensure the page size is accurate for the current flavor.
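
Condensed, the shared flow looks roughly like the sketch below, as it
would appear inside a test method of the new test class (helper and
option names are taken from the new test module in this commit; the
specific page sizes are illustrative):

    # Sketch of the shared flow; <size_a>=2048 kB, <size_b>=1048576 kB.
    flavor_a = self.create_flavor(
        ram=str(CONF.whitebox.hugepage_guest_ram_size),
        extra_specs={'hw:mem_page_size': '2048'})
    flavor_b = self.create_flavor(
        ram=str(CONF.whitebox.hugepage_guest_ram_size),
        extra_specs={'hw:mem_page_size': '1048576'})
    server = self.create_test_server(flavor=flavor_a['id'],
                                     wait_until='ACTIVE')
    self.assertEqual(2048, self._get_xml_hugepage_size(server['id']))
    # Resize to the second page size, then back, re-checking the XML
    # after each resize.
    self.resize_server(server['id'], flavor_b['id'])
    self.assertEqual(1048576, self._get_xml_hugepage_size(server['id']))
    self.resize_server(server['id'], flavor_a['id'])
    self.assertEqual(2048, self._get_xml_hugepage_size(server['id']))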

Instead of trying to dynamically determine the hugepage sizes configured
on the computes, a new config option was added to declare the hugepage
sizes available on the hosts. To avoid dynamically calculating guest RAM
sizes based on the available hugepages, a guest RAM option was also
added so users can define the amount of memory to use when spawning
guests.
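
With the defaults added to the devstack settings in this commit (RAM is
in MB, page sizes in kB), the resulting tempest.conf entries would look
something like:

    [whitebox]
    hugepage_guest_ram_size = 1024

    [whitebox-hardware]
    configured_hugepage_sizes = 2048,1048576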

We also need a new job that has multiple hugepage sizes configured. We
cannot use our existing whitebox-devstack-multinode job because it runs
tests that dynamically turn on file-backed memory, which is incompatible
with hugepages. This commit adds tasks to the job setup that configure
hugepages on the compute hosts.
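
With the new job's defaults (num_2M_pages: 512, num_1G_pages: 1), the
templated GRUB line in the playbook below expands to roughly:

    GRUB_CMDLINE_LINUX="<existing args> hugepagesz=2M hugepages=512 hugepagesz=1G hugepages=1 transparent_hugepage=never"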

In our devstack plugin.sh, we set track_instance_changes to True
(devstack defaults it to False) to make sure the scheduler has the
latest information about available huge pages, avoiding a race
wherein instances failed to schedule because our lone 1G page
still appeared to be used by an instance that had actually been fully
deleted.
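
In nova.conf terms, the plugin.sh change below amounts to:

    [filter_scheduler]
    track_instance_changes = True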

[1] https://bugs.launchpad.net/nova/+bug/1831269

Change-Id: I5282df3b20c24a909f3b7bb97214206bc07e5b91
James Parker 2022-01-14 12:59:36 -05:00 committed by Martin Kopec
parent 766ff042bd
commit 071426c223
8 changed files with 289 additions and 16 deletions

View File

@@ -3,7 +3,11 @@
nodes:
- name: controller
label: nested-virt-ubuntu-jammy
- name: compute
# NOTE(artom) We can't name the node 'compute' because that seems to
# take precedence over the 'compute' group in playbooks, so things we
# want to run on all hosts in the 'compute' group would only run on the
# subnode.
- name: compute-host
label: nested-virt-ubuntu-jammy
groups:
# Node where tests are executed and test results collected
@@ -14,11 +18,11 @@
- name: compute
nodes:
- controller
- compute
- compute-host
# Nodes that are not the controller
- name: subnode
nodes:
- compute
- compute-host
# Switch node for multinode networking setup
- name: switch
nodes:
@@ -26,7 +30,7 @@
# Peer nodes for multinode networking setup
- name: peers
nodes:
- compute
- compute-host
- job:
name: whitebox-devstack-multinode
@@ -49,6 +53,7 @@
# open source implementation of UEFI for VMs via the OVMF package. In
# addition to test vTPM hosts need swtpm as well
extra_packages: ovmf,swtpm-tools
tempest_exclude_regex: ^whitebox_tempest_plugin\.api\.compute\.test_hugepages
devstack_localrc:
MAX_COMPUTE_NODES: 2
NOVA_SERVICE_REPORT_INTERVAL: 10
@@ -86,7 +91,6 @@
swtpm_group: swtpm
group-vars:
subnode:
num_hugepages: 2048
devstack_localrc:
LIBVIRT_TYPE: kvm
NOVA_SERVICE_REPORT_INTERVAL: 10
@@ -109,11 +113,24 @@
swtpm_user: swtpm
swtpm_group: swtpm
tempest:
num_hugepages: 512
devstack_plugins:
barbican: https://opendev.org/openstack/barbican.git
whitebox-tempest-plugin: https://opendev.org/openstack/whitebox-tempest-plugin.git
- job:
name: whitebox-devstack-multinode-hugepages
parent: whitebox-devstack-multinode
description: |
Runs the hugepages tests on a deployment that has set up hugepages on the hosts.
vars:
tempest_test_regex: ^whitebox_tempest_plugin\.api\.compute\.test_hugepages
# NOTE(artom) The parent job's exclude regex excludes the hugepages
# tests, so we need to overwrite it here with a regex that matches
# *nothing*.
tempest_exclude_regex: $^
num_2M_pages: 512
num_1G_pages: 1
- job:
name: whitebox-devstack-ceph-multinode
parent: devstack-plugin-ceph-multinode-tempest-py3
@@ -174,3 +191,6 @@
- whitebox-devstack-multinode
- whitebox-devstack-ceph-multinode
- openstack-tox-pep8
experimental:
jobs:
- whitebox-devstack-multinode-hugepages

View File

@@ -19,6 +19,7 @@ function configure {
iniset $TEMPEST_CONFIG whitebox default_video_model $WHITEBOX_DEFAULT_VIDEO_MODEL
iniset $TEMPEST_CONFIG whitebox max_disk_devices_to_attach $WHITEBOX_MAX_DISK_DEVICES_TO_ATTACH
iniset $TEMPEST_CONFIG whitebox nodes_yaml $WHITEBOX_NODES_YAML
iniset $TEMPEST_CONFIG whitebox hugepage_guest_ram_size $WHITEBOX_HUGEPAGE_GUEST_RAM_SIZE
iniset $TEMPEST_CONFIG whitebox-database user $DATABASE_USER
iniset $TEMPEST_CONFIG whitebox-database password $DATABASE_PASSWORD
@@ -27,6 +28,7 @@ function configure {
iniset $TEMPEST_CONFIG whitebox-hardware cpu_topology "$WHITEBOX_CPU_TOPOLOGY"
iniset $TEMPEST_CONFIG whitebox-hardware dedicated_cpus_per_numa "$WHITEBOX_DEDICATED_CPUS_PER_NUMA"
iniset $TEMPEST_CONFIG whitebox-hardware shared_cpus_per_numa "$WHITEBOX_SHARED_CPUS_PER_NUMA"
iniset $TEMPEST_CONFIG whitebox-hardware configured_hugepage_sizes "$WHITEBOX_CONFIGURED_HUGEPAGES"
iniset $TEMPEST_CONFIG compute-feature-enabled virtio_rng "$COMPUTE_FEATURE_VIRTIO_RNG"
iniset $TEMPEST_CONFIG compute-feature-enabled rbd_download "$COMPUTE_FEATURE_RBD_DOWNLOAD"
@@ -39,6 +41,7 @@ function configure {
# https://github.com/openstack/devstack/blob/6b0f055b4ed407f8a190f768d0e654235ac015dd/lib/nova#L46C36-L46C50
iniset $TEMPEST_CONFIG whitebox-nova-compute state_path $DATA_DIR/nova
iniset $NOVA_CONF filter_scheduler track_instance_changes True
}
if [[ "$1" == "stack" ]]; then

View File

@@ -7,10 +7,12 @@ WHITEBOX_RX_QUEUE_SIZE=${WHITEBOX_RX_QUEUE_SIZE:-1024}
WHITEBOX_DEFAULT_VIDEO_MODEL=${WHITEBOX_DEFAULT_VIDEO_MODEL:-'virtio'}
WHITEBOX_MAX_DISK_DEVICES_TO_ATTACH=${WHITEBOX_MAX_DISK_DEVICES_TO_ATTACH:-7}
WHITEBOX_NODES_YAML=${WHITEBOX_NODES_YAML:-'/home/zuul/compute_nodes.yaml'}
WHITEBOX_HUGEPAGE_GUEST_RAM_SIZE=${WHITEBOX_HUGEPAGE_GUEST_RAM_SIZE:-1024}
WHITEBOX_CPU_TOPOLOGY=${WHITEBOX_CPU_TOPOLOGY:-''}
WHITEBOX_DEDICATED_CPUS_PER_NUMA=${WHITEBOX_DEDICATED_CPUS_PER_NUMA:-4}
WHITEBOX_SHARED_CPUS_PER_NUMA=${WHITEBOX_SHARED_CPUS_PER_NUMA:-2}
WHITEBOX_CONFIGURED_HUGEPAGES=${WHITEBOX_CONFIGURED_HUGEPAGES:-'2048,1048576'}
COMPUTE_FEATURE_VIRTIO_RNG=${COMPUTE_FEATURE_VIRTIO_RNG:-'True'}
COMPUTE_FEATURE_RBD_DOWNLOAD=${COMPUTE_FEATURE_RBD_DOWNLOAD:-'False'}

View File

@@ -44,4 +44,45 @@
shell: |
cat /home/zuul/compute_nodes.yaml
run_once: true
delegate_to: controller
- hosts: compute
tasks:
- name: Create hugepages for computes
block:
- name: Append to GRUB command line
lineinfile:
path: /etc/default/grub
state: present
backrefs: yes
regexp: GRUB_CMDLINE_LINUX="([^"]*)"
line: GRUB_CMDLINE_LINUX="\1 hugepagesz=2M hugepages={{ num_2M_pages }} hugepagesz=1G hugepages={{ num_1G_pages }} transparent_hugepage=never"
become: yes
- name: Update grub.cfg
# NOTE(artom) This assumes an Ubuntu host
command: update-grub2
become: yes
- name: Reboot
reboot:
become: yes
- name: (Re-)start the Zuul console streamer after the reboot
# NOTE(artom) The job will still work if we don't do this, but the
# console will get spammed with 'Waiting on logger' messages. See
# https://bugs.launchpad.net/openstack-gate/+bug/1806655 for more
# info.
import_role:
name: start-zuul-console
- name: Add 1G hugetlbfs mount
# The 2M hugetlbfs is mounted automatically by the OS, but we need to
# manually add the 1G mount.
shell: |
mkdir /dev/hugepages1G
mount -t hugetlbfs -o pagesize=1G none /dev/hugepages1G
become: yes
when: num_2M_pages is defined and num_1G_pages is defined
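
Assuming the tasks above ran (Ubuntu hosts with the kernel's standard
hugetlb sysfs layout), the resulting pools can be sanity-checked on the
computes with something like:

    cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages     # expect 512
    cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages  # expect 1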

View File

@@ -435,3 +435,11 @@ class BaseWhiteboxComputeTest(base.BaseV2ComputeAdminTest):
'status = "%s"' % status)
data = cursor.fetchall()
return data[0]['COUNT(*)']
def _get_hugepage_xml_element(self, server_id):
"""Gather and return all instances of the page element from XML element
'memoryBacking/hugepages' in a given server's domain.
"""
root = self.get_server_xml(server_id)
huge_pages = root.findall('.memoryBacking/hugepages/page')
return huge_pages
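
For context, the element this helper matches in the instance's libvirt
domain XML has roughly the following shape (libvirt reports sizes in
KiB, which is why configured_hugepage_sizes and the tests work in kB;
exact attributes may vary by deployment):

    <memoryBacking>
      <hugepages>
        <page size='2048' unit='KiB' nodeset='0'/>
      </hugepages>
    </memoryBacking>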

View File

@@ -612,14 +612,6 @@ class NUMALiveMigrationBase(BasePinningTest):
cpuset = root.find('./vcpu').attrib.get('cpuset', None)
return hardware.parse_cpu_spec(cpuset)
def _get_hugepage_xml_element(self, server_id):
"""Gather and return all instances of the page element from XML element
'memoryBacking/hugepages' in a given server's domain.
"""
root = self.get_server_xml(server_id)
huge_pages = root.findall('.memoryBacking/hugepages/page')
return huge_pages
def _validate_hugepage_elements(self, server_id, pagesize):
"""Analyze the hugepage xml element(s) from a provided instance. Expect
to find only one hugepage element in the domain. Return boolean result

View File

@@ -0,0 +1,196 @@
# Copyright 2022 Red Hat Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from tempest import config
import testtools
from whitebox_tempest_plugin.api.compute import base
from oslo_log import log as logging
CONF = config.CONF
LOG = logging.getLogger(__name__)
class HugePageResize(base.BaseWhiteboxComputeTest):
@classmethod
def skip_checks(cls):
super(HugePageResize, cls).skip_checks()
if len(getattr(CONF.whitebox_hardware,
'configured_hugepage_sizes')) == 0:
msg = "configured_hugepage_sizes in whitebox-hardware is not " \
"present"
raise cls.skipException(msg)
def _get_xml_hugepage_size(self, server_id):
"""Analyze the hugepage xml element(s) from a provided instance. Expect
to find only one hugepage element in the domain. Return boolean result
comparing if the found page size is equal to the expected page size.
"""
huge_pages_list = self._get_hugepage_xml_element(server_id)
self.assertEqual(1, len(huge_pages_list), "Expected to find 1 "
"hugepage XML element on server %s but found %s"
% (server_id, len(huge_pages_list)))
huge_page_xml = huge_pages_list[0]
return int(huge_page_xml.attrib['size'])
def test_hugepage_resize_large_to_small(self):
"""Resize a guest with large hugepages to small hugepages and back
Create a guest using a flavor with hw:mem_page_size:large, resize it
to a flavor with hw:mem_page_size:small, and then resize it back to
the original flavor
"""
flavor_a = self.create_flavor(
ram=str(CONF.whitebox.hugepage_guest_ram_size),
extra_specs={'hw:mem_page_size': 'large'})
server = self.create_test_server(flavor=flavor_a['id'],
wait_until='ACTIVE')
# Cannot assume the exact pagesize of the guest; verify the memory
# backing element is present on the guest and the found size is greater
# than or equal to the smallest potential size configured in the
# environment
large_page_size = self._get_xml_hugepage_size(server['id'])
minimum_pagesize_threshold = \
min(CONF.whitebox_hardware.configured_hugepage_sizes)
self.assertTrue(
large_page_size >= minimum_pagesize_threshold,
"Pagesize found %s should be greater than or equal to pagesize "
"of %s for server %s" %
(large_page_size, minimum_pagesize_threshold, server['id'])
)
# Resize the guest using a flavor with hw:mem_page_size:small; the
# memory backing element should not be present on the guest afterwards,
# so no XML verification is needed
flavor_b = self.create_flavor(
ram=str(CONF.whitebox.hugepage_guest_ram_size),
extra_specs={'hw:mem_page_size': 'small'})
self.resize_server(server['id'], flavor_b['id'])
# Resize instance back to starting flavor size and repeat XML check of
# the guest
self.resize_server(server['id'], flavor_a['id'])
large_page_size = self._get_xml_hugepage_size(server['id'])
self.assertTrue(
large_page_size >= minimum_pagesize_threshold,
"After resizing back to original flavor, pagesize found %s should "
"be greater than or equal to pagesize of %s for server %s" %
(large_page_size, minimum_pagesize_threshold, server['id'])
)
def test_hugepage_resize_size_to_small(self):
"""Resize a guest with a specified hugepage size to small hugepages
Create a guest using a flavor with an explicit hugepage size, based
on what is configured in whitebox_hardware. Resize the guest to a
flavor with hw:mem_page_size:small, and then resize it back to the
original flavor. Repeat this process for every hugepage size configured
in whitebox_hardware.configured_hugepage_sizes
"""
flavor_small = self.create_flavor(
ram=str(CONF.whitebox.hugepage_guest_ram_size),
extra_specs={'hw:mem_page_size': 'small'})
# Create a flavor and launch an instance based on every configured
# hugepage size in the deployment.
for page_size in CONF.whitebox_hardware.configured_hugepage_sizes:
flavor_a = self.create_flavor(
ram=str(CONF.whitebox.hugepage_guest_ram_size),
extra_specs={'hw:mem_page_size': str(page_size)})
server = self.create_test_server(flavor=flavor_a['id'],
wait_until='ACTIVE')
size_found = self._get_xml_hugepage_size(server['id'])
self.assertTrue(
page_size == size_found,
"Expected pagesize of %s not found on server %s instead "
"found %s" % (page_size, server['id'], size_found)
)
# Resize the guest using a flavor with hw:mem_page_size:small;
# memory backing will not be present in the guest, so follow-up
# XML verification is not necessary
self.resize_server(server['id'], flavor_small['id'])
# Resize back to its original size and confirm memory backing
# element is present and has the correct size
self.resize_server(server['id'], flavor_a['id'])
size_found = self._get_xml_hugepage_size(server['id'])
self.assertTrue(
page_size == size_found,
"Expected pagesize of %s not found on server %s after "
"resizing back to original flavor size, instead found %s" %
(page_size, server['id'], size_found)
)
self.delete_server(server['id'])
@testtools.skipUnless(
len(CONF.whitebox_hardware.configured_hugepage_sizes) > 1,
'Need at least 2 configured hugepage sizes to execute test')
def test_hugepage_resize_size_to_size(self):
"""Resize a guest with a specified hugepage size to another size
Create two flavors based on the first two configured hugepage sizes;
both flavors use explicit sizes. Create a server using the first
flavor, resize the guest to the second flavor, and resize it back to
the original flavor
"""
start_size, target_size = \
CONF.whitebox_hardware.configured_hugepage_sizes[:2]
flavor_a = self.create_flavor(
ram=str(CONF.whitebox.hugepage_guest_ram_size),
extra_specs={'hw:mem_page_size': str(start_size)})
server = self.create_test_server(flavor=flavor_a['id'],
wait_until='ACTIVE')
size_found = self._get_xml_hugepage_size(server['id'])
self.assertTrue(
start_size == size_found,
"Expected pagesize of %s not found on server %s instead "
"found %s" % (start_size, server['id'], size_found)
)
flavor_b = self.create_flavor(
ram=str(CONF.whitebox.hugepage_guest_ram_size),
extra_specs={'hw:mem_page_size': str(target_size)})
# Resize to the target size and confirm memory backing element is
# present and has the correct size
self.resize_server(server['id'], flavor_b['id'])
size_found = self._get_xml_hugepage_size(server['id'])
self.assertTrue(
target_size == size_found,
"Expected pagesize of %s not found on server %s after resize "
"instead found %s" % (target_size, server['id'], size_found)
)
# Resize back to its original size and confirm memory backing
# element is present and has the correct size
self.resize_server(server['id'], flavor_a['id'])
size_found = self._get_xml_hugepage_size(server['id'])
self.assertTrue(
start_size == size_found,
"Expected pagesize of %s not found on server %s after resizing "
"back to original flavor size, instead found %s" %
(start_size, server['id'], size_found)
)

View File

@@ -133,7 +133,12 @@ general_opts = [
'libvirt_hw_machine_type',
default='pc',
choices=["pc", "q35"],
help='The machine type configured for the nova computes')
help='The machine type configured for the nova computes'),
cfg.IntOpt(
'hugepage_guest_ram_size',
default=64,
help="RAM size in MB to use when launching the guests backed "
"by hugepages."),
]
nova_compute_group = cfg.OptGroup(
@@ -224,6 +229,12 @@ hardware_opts = [
'<List of CPUs in that node>. For example, if NUMA node 0 has '
'CPUs 0 and 1, and NUMA node 1 has CPUs 2 and 3, the value to '
'set would be `0: [0,1], 1: [2, 3]`.'),
cfg.Opt(
'configured_hugepage_sizes',
type=types.List(types.Integer()),
default=[],
help='List of hugepage sizes configured in the environment, in kB, '
'e.g. 2048,1048576'),
cfg.IntOpt(
'dedicated_cpus_per_numa',
default=0,