hemna · April 29, 2026 19:05
diff --git a/test_graceful_shutdown_tempest.py b/test_graceful_shutdown_tempest.py
 # Copyright 2026 SAP SE
 # All Rights Reserved.
 #
 #    Licensed under the Apache License, Version 2.0 (the "License"); you may
 #    not use this file except in compliance with the License. You may obtain
 #    a copy of the License at
 #
 #         http://www.apache.org/licenses/LICENSE-2.0
 #
 #    Unless required by applicable law or agreed to in writing, software
 #    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 #    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 #    License for the specific language governing permissions and limitations
 #    under the License.

 """Tempest integration tests for Cinder graceful shutdown.

 Tests all volume/snapshot operations interrupted by SIGTERM mid-operation
 to verify graceful shutdown waits for in-flight work to complete.

 Uses FakeSlowVolumeDriver (slow-lvm backend) which injects a 15s delay
 into driver operations. SIGTERM is sent 5s into the delay, leaving 10s
 for the graceful shutdown to wait.

 Requirements:
 - DevStack with cinder running
 - slow-lvm backend configured with FakeSlowVolumeDriver
 - slow_driver_delay = 15
 - slow_driver_operations = create_volume,delete_volume,create_snapshot,
    delete_snapshot,extend_volume,create_cloned_volume,
    create_volume_from_snapshot
 - sudo access for signals and service management

 Run:
    sudo /opt/stack/data/venv/bin/python \
        ~/cinder/cinder/tests/test_graceful_shutdown_tempest.py
 """

 import os
 import signal
 import subprocess
 import time

 import openstack


 # --- Configuration ---
 AUTH_URL = os.environ.get('OS_AUTH_URL', 'http://192.168.1.107/identity/v3')
 USERNAME = os.environ.get('OS_USERNAME', 'admin')
 PASSWORD = os.environ.get('OS_PASSWORD', 'openstack')
 PROJECT = os.environ.get('OS_PROJECT_NAME', 'admin')
 DOMAIN = os.environ.get('OS_USER_DOMAIN_NAME', 'Default')

 SLOW_VOLUME_TYPE = 'slow-lvm'
 FAST_VOLUME_TYPE = 'lvmdriver-1'

 # Time to wait before sending SIGTERM (must be < slow_driver_delay)
 SIGTERM_DELAY = 5

 # Max time to wait for operation to complete after SIGTERM
 MAX_WAIT_AFTER_SIGTERM = 90

 # Service restart timeout
 SERVICE_RESTART_TIMEOUT = 30


 # --- Helpers ---

 def get_connection():
    """Create an OpenStack SDK connection."""
    return openstack.connect(
        auth_url=AUTH_URL,
        username=USERNAME,
        password=PASSWORD,
        project_name=PROJECT,
        user_domain_name=DOMAIN,
        project_domain_name=DOMAIN,
    )


 def get_cinder_volume_pid():
    """Get the PID of the cinder-volume parent process."""
    result = subprocess.run(
        ['pgrep', '-f', 'cinder-volume.*--config-file'],
        capture_output=True, text=True
    )
    pids = result.stdout.strip().split('\n')
    if pids and pids[0]:
        return int(pids[0])
    return None


 def send_sigterm():
    """Send SIGTERM to cinder-volume."""
    pid = get_cinder_volume_pid()
    if pid:
        print(f"    SIGTERM -> PID {pid}")
        os.kill(pid, signal.SIGTERM)
        return pid
    raise RuntimeError("Could not find cinder-volume process")


 def restart_cinder_volume():
    """Restart cinder-volume and wait for it to be active."""
    print("    Restarting cinder-volume...")
    subprocess.run(
        ['sudo', 'systemctl', 'restart', 'devstack@c-vol'],
        check=True, timeout=60
    )
    for i in range(SERVICE_RESTART_TIMEOUT):
        time.sleep(1)
        result = subprocess.run(
            ['sudo', 'systemctl', 'is-active', 'devstack@c-vol'],
            capture_output=True, text=True
        )
        if 'active' in result.stdout:
            print(f"    Service up (took {i+1}s)")
            # Wait extra for backend initialization and init_host cleanup
            # The slow driver may re-process stuck volumes from prior test
            time.sleep(10)
            return True
    raise RuntimeError("cinder-volume did not restart within timeout")


 def wait_for_volume_status(conn, vol_id, target, timeout=MAX_WAIT_AFTER_SIGTERM):
    """Wait for volume to reach target status. Returns (volume, elapsed)."""
    start = time.time()
    while time.time() - start < timeout:
        vol = conn.block_storage.get_volume(vol_id)
        if vol.status == target:
            return vol, time.time() - start
        if vol.status == 'error':
            return vol, time.time() - start
        time.sleep(2)
    raise TimeoutError(
        f"Volume {vol_id} did not reach '{target}' within {timeout}s "
        f"(last: {vol.status})")


 def wait_for_snapshot_status(conn, snap_id, target,
                             timeout=MAX_WAIT_AFTER_SIGTERM):
    """Wait for snapshot to reach target status."""
    start = time.time()
    while time.time() - start < timeout:
        snap = conn.block_storage.get_snapshot(snap_id)
        if snap.status == target:
            return snap, time.time() - start
        if snap.status == 'error':
            return snap, time.time() - start
        time.sleep(2)
    raise TimeoutError(
        f"Snapshot {snap_id} did not reach '{target}' within {timeout}s")


 def wait_for_volume_deleted(conn, vol_id, timeout=MAX_WAIT_AFTER_SIGTERM):
    """Wait for volume to be fully deleted."""
    start = time.time()
    while time.time() - start < timeout:
        try:
            vol = conn.block_storage.get_volume(vol_id)
            if vol.status == 'error':
                return False, time.time() - start
        except Exception:
            return True, time.time() - start
        time.sleep(2)
    raise TimeoutError(f"Volume {vol_id} not deleted within {timeout}s")


 def wait_for_snapshot_deleted(conn, snap_id, timeout=MAX_WAIT_AFTER_SIGTERM):
    """Wait for snapshot to be fully deleted."""
    start = time.time()
    while time.time() - start < timeout:
        try:
            snap = conn.block_storage.get_snapshot(snap_id)
            if snap.status == 'error':
                return False, time.time() - start
        except Exception:
            return True, time.time() - start
        time.sleep(2)
    raise TimeoutError(f"Snapshot {snap_id} not deleted within {timeout}s")


 def create_available_volume(conn, name, vol_type=SLOW_VOLUME_TYPE, size=1):
    """Create a volume and wait for it to be available."""
    vol = conn.block_storage.create_volume(
        size=size, name=name, volume_type=vol_type)
    vol, _ = wait_for_volume_status(conn, vol.id, 'available', timeout=120)
    if vol.status != 'available':
        raise RuntimeError(f"Volume {vol.id} stuck in {vol.status}")
    return vol


 def create_available_snapshot(conn, vol_id, name):
    """Create a snapshot and wait for it to be available."""
    snap = conn.block_storage.create_snapshot(
        volume_id=vol_id, name=name)
    snap, _ = wait_for_snapshot_status(conn, snap.id, 'available', timeout=120)
    if snap.status != 'available':
        raise RuntimeError(f"Snapshot {snap.id} stuck in {snap.status}")
    return snap


 def cleanup_volume(conn, vol_id):
    """Force-delete a volume, ignoring errors."""
    try:
        conn.block_storage.reset_volume_status(
            vol_id, status='available',
            attach_status='detached', migration_status=None)
    except Exception:
        pass
    try:
        conn.block_storage.delete_volume(vol_id, force=True)
    except Exception:
        pass


 def cleanup_snapshot(conn, snap_id):
    """Force-delete a snapshot, ignoring errors."""
    try:
        conn.block_storage.reset_snapshot_status(snap_id, status='available')
    except Exception:
        pass
    try:
        conn.block_storage.delete_snapshot(snap_id)
    except Exception:
        pass



 def drain_residual_volumes(conn, timeout=120):
    """Reset and force-delete all volumes/snapshots in transient states.

    This prevents init_host from spending time cleaning up residue from
    prior tests, which can cause prereq volume creation to time out.
    """
    transient_statuses = ('creating', 'deleting', 'attaching', 'detaching',
                          'extending', 'migrating', 'retyping')

    # Clean snapshots first (they block volume deletion)
    try:
        snapshots = list(conn.block_storage.snapshots(all_projects=True))
        for snap in snapshots:
            if snap.status in transient_statuses or snap.status == 'error':
                try:
                    conn.block_storage.reset_snapshot_status(
                        snap.id, status='available')
                    conn.block_storage.delete_snapshot(snap.id)
                except Exception:
                    pass
    except Exception:
        pass

    # Clean volumes in transient states
    residual_ids = []
    try:
        volumes = list(conn.block_storage.volumes(all_projects=True))
        for vol in volumes:
            if vol.status in transient_statuses:
                try:
                    conn.block_storage.reset_volume_status(
                        vol.id, status='error',
                        attach_status='detached', migration_status=None)
                    conn.block_storage.delete_volume(vol.id, force=True)
                    residual_ids.append(vol.id)
                except Exception:
                    pass
    except Exception:
        pass

    if not residual_ids:
        return

    # Wait for all residual volumes to actually be deleted
    import time as _time
    print(f"    Draining {len(residual_ids)} residual volume(s)...")
    start = _time.time()
    while residual_ids and (_time.time() - start < timeout):
        _time.sleep(3)
        remaining = []
        for vol_id in residual_ids:
            try:
                vol = conn.block_storage.get_volume(vol_id)
                if vol.status not in ('deleted',):
                    remaining.append(vol_id)
            except Exception:
                # 404 = deleted
                pass
        residual_ids = remaining

    if residual_ids:
        print(f"    Warning: {len(residual_ids)} volume(s) not fully drained")
    else:
        print(f"    Drained in {_time.time() - start:.0f}s")



 # --- Test Result ---

 class TestResult:
    def __init__(self, name):
        self.name = name
        self.passed = False
        self.message = ""
        self.duration = 0

    def __str__(self):
        s = "PASS" if self.passed else "FAIL"
        return f"[{s}] {self.name} ({self.duration:.1f}s) - {self.message}"


 # --- Tests ---

 def test_01_volume_create_survives_sigterm():
    """SIGTERM during volume create -> volume reaches 'available'."""
    r = TestResult("volume_create_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None

    try:
        print("\n[01] Volume CREATE survives SIGTERM")
        vol = conn.block_storage.create_volume(
            size=1, name='gs-test-create', volume_type=SLOW_VOLUME_TYPE)
        vol_id = vol.id
        print(f"    Volume {vol_id} creating...")

        time.sleep(SIGTERM_DELAY)
        send_sigterm()

        vol, elapsed = wait_for_volume_status(conn, vol_id, 'available')
        if vol.status == 'available':
            r.passed = True
            r.message = f"available {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"ended in '{vol.status}'"

        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if vol_id:
            cleanup_volume(conn, vol_id)

    r.duration = time.time() - start
    return r


 def test_02_volume_delete_survives_sigterm():
    """SIGTERM during volume delete -> volume fully deleted."""
    r = TestResult("volume_delete_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None

    try:
        print("\n[02] Volume DELETE survives SIGTERM")
        vol = create_available_volume(conn, 'gs-test-delete')
        vol_id = vol.id
        print(f"    Volume {vol_id} available, deleting...")

        conn.block_storage.delete_volume(vol_id)
        time.sleep(SIGTERM_DELAY)
        send_sigterm()

        deleted, elapsed = wait_for_volume_deleted(conn, vol_id)
        if deleted:
            r.passed = True
            r.message = f"deleted {elapsed:.1f}s after SIGTERM"
            vol_id = None  # No cleanup needed
        else:
            r.message = "NOT deleted (stuck in error/deleting)"

        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if vol_id:
            cleanup_volume(conn, vol_id)

    r.duration = time.time() - start
    return r


 def test_03_snapshot_create_survives_sigterm():
    """SIGTERM during snapshot create -> snapshot reaches 'available'."""
    r = TestResult("snapshot_create_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None
    snap_id = None

    try:
        print("\n[03] Snapshot CREATE survives SIGTERM")
        vol = create_available_volume(conn, 'gs-test-snap-parent')
        vol_id = vol.id
        print(f"    Volume {vol_id} available, creating snapshot...")

        snap = conn.block_storage.create_snapshot(
            volume_id=vol_id, name='gs-test-snap')
        snap_id = snap.id

        time.sleep(SIGTERM_DELAY)
        send_sigterm()

        snap, elapsed = wait_for_snapshot_status(conn, snap_id, 'available')
        if snap.status == 'available':
            r.passed = True
            r.message = f"available {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"ended in '{snap.status}'"

        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if snap_id:
            cleanup_snapshot(conn, snap_id)
        time.sleep(5)
        if vol_id:
            cleanup_volume(conn, vol_id)

    r.duration = time.time() - start
    return r


 def test_04_snapshot_delete_survives_sigterm():
    """SIGTERM during snapshot delete -> snapshot fully deleted."""
    r = TestResult("snapshot_delete_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None
    snap_id = None

    try:
        print("\n[04] Snapshot DELETE survives SIGTERM")
        vol = create_available_volume(conn, 'gs-test-snapdel-parent')
        vol_id = vol.id
        snap = create_available_snapshot(conn, vol_id, 'gs-test-snapdel')
        snap_id = snap.id
        print(f"    Snapshot {snap_id} available, deleting...")

        conn.block_storage.delete_snapshot(snap_id)
        time.sleep(SIGTERM_DELAY)
        send_sigterm()

        deleted, elapsed = wait_for_snapshot_deleted(conn, snap_id)
        if deleted:
            r.passed = True
            r.message = f"deleted {elapsed:.1f}s after SIGTERM"
            snap_id = None
        else:
            r.message = "NOT deleted (stuck)"

        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if snap_id:
            cleanup_snapshot(conn, snap_id)
        time.sleep(5)
        if vol_id:
            cleanup_volume(conn, vol_id)

    r.duration = time.time() - start
    return r


 def test_05_volume_extend_survives_sigterm():
    """SIGTERM during volume extend -> volume reaches 'available' at new size."""
    r = TestResult("volume_extend_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None

    try:
        print("\n[05] Volume EXTEND survives SIGTERM")
        vol = create_available_volume(conn, 'gs-test-extend', size=1)
        vol_id = vol.id
        print(f"    Volume {vol_id} available (1GB), extending to 2GB...")

        conn.block_storage.extend_volume(vol_id, size=2)
        time.sleep(SIGTERM_DELAY)
        send_sigterm()

        vol, elapsed = wait_for_volume_status(conn, vol_id, 'available')
        if vol.status == 'available' and vol.size == 2:
            r.passed = True
            r.message = f"available at 2GB, {elapsed:.1f}s after SIGTERM"
        elif vol.status == 'available':
            r.passed = True
            r.message = (f"available (size={vol.size}GB), "
                         f"{elapsed:.1f}s after SIGTERM")
        else:
            r.message = f"ended in '{vol.status}' size={vol.size}"

        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if vol_id:
            cleanup_volume(conn, vol_id)

    r.duration = time.time() - start
    return r


 def test_06_volume_clone_survives_sigterm():
    """SIGTERM during volume clone -> cloned volume reaches 'available'."""
    r = TestResult("volume_clone_survives_sigterm")
    start = time.time()
    conn = get_connection()
    src_vol_id = None
    clone_vol_id = None

    try:
        print("\n[06] Volume CLONE survives SIGTERM")
        src_vol = create_available_volume(conn, 'gs-test-clone-src')
        src_vol_id = src_vol.id
        print(f"    Source {src_vol_id} available, cloning...")

        clone = conn.block_storage.create_volume(
            size=1, name='gs-test-clone-dst',
            volume_type=SLOW_VOLUME_TYPE,
            source_volid=src_vol_id)
        clone_vol_id = clone.id

        time.sleep(SIGTERM_DELAY)
        send_sigterm()

        clone, elapsed = wait_for_volume_status(
            conn, clone_vol_id, 'available')
        if clone.status == 'available':
            r.passed = True
            r.message = f"clone available {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"clone ended in '{clone.status}'"

        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if clone_vol_id:
            cleanup_volume(conn, clone_vol_id)
        time.sleep(5)
        if src_vol_id:
            cleanup_volume(conn, src_vol_id)

    r.duration = time.time() - start
    return r


 def test_07_create_from_snapshot_survives_sigterm():
    """SIGTERM during create-from-snapshot -> volume reaches 'available'."""
    r = TestResult("create_from_snapshot_survives_sigterm")
    start = time.time()
    conn = get_connection()
    src_vol_id = None
    snap_id = None
    new_vol_id = None

    try:
        print("\n[07] Create from SNAPSHOT survives SIGTERM")
        src_vol = create_available_volume(conn, 'gs-test-fromsnap-src')
        src_vol_id = src_vol.id
        snap = create_available_snapshot(conn, src_vol_id, 'gs-test-fromsnap')
        snap_id = snap.id
        print(f"    Snapshot {snap_id} available, creating volume from it...")

        new_vol = conn.block_storage.create_volume(
            size=1, name='gs-test-fromsnap-dst',
            volume_type=SLOW_VOLUME_TYPE,
            snapshot_id=snap_id)
        new_vol_id = new_vol.id

        time.sleep(SIGTERM_DELAY)
        send_sigterm()

        new_vol, elapsed = wait_for_volume_status(
            conn, new_vol_id, 'available')
        if new_vol.status == 'available':
            r.passed = True
            r.message = f"available {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"ended in '{new_vol.status}'"

        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if new_vol_id:
            cleanup_volume(conn, new_vol_id)
        time.sleep(5)
        if snap_id:
            cleanup_snapshot(conn, snap_id)
        time.sleep(5)
        if src_vol_id:
            cleanup_volume(conn, src_vol_id)

    r.duration = time.time() - start
    return r


 def test_08_copy_volume_to_image_survives_sigterm():
    """SIGTERM during copy_volume_to_image -> image upload completes."""
    r = TestResult("copy_volume_to_image_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None
    image_id = None

    try:
        print("\n[08] Copy volume to IMAGE survives SIGTERM")
        # Create volume on slow backend (will delay on copy_volume_to_image)
        vol = create_available_volume(conn, 'gs-test-upload-to-image')
        vol_id = vol.id
        print(f"    Volume {vol_id} available, uploading to image...")

        # Upload volume to image (triggers copy_volume_to_image in driver)
        result = conn.block_storage.post(
            f'/volumes/{vol_id}/action',
            json={
                'os-volume_upload_image': {
                    'image_name': 'gs-test-image-upload',
                    'disk_format': 'raw',
                    'container_format': 'bare',
                }
            }
        )
        body = result.json()
        image_id = body.get('os-volume_upload_image', {}).get('image_id')
        print(f"    Image {image_id} upload started")

        time.sleep(SIGTERM_DELAY)
        send_sigterm()

        # Wait for volume to return to 'available' (upload complete)
        vol, elapsed = wait_for_volume_status(conn, vol_id, 'available')
        if vol.status == 'available':
            r.passed = True
            r.message = f"volume back to 'available' {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"volume ended in '{vol.status}'"

        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        # Clean up image
        if image_id:
            try:
                conn.image.delete_image(image_id)
            except Exception:
                pass
        if vol_id:
            cleanup_volume(conn, vol_id)

    r.duration = time.time() - start
    return r


 def test_09_migrate_volume_survives_sigterm():
    """SIGTERM during volume migration -> volume reaches 'available'."""
    r = TestResult("migrate_volume_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None

    try:
        print("\n[09] Volume MIGRATE survives SIGTERM")
        # Create volume on slow-lvm backend
        vol = create_available_volume(conn, 'gs-test-migrate')
        vol_id = vol.id
        print(f"    Volume {vol_id} available on slow-lvm, migrating to "
              f"lvmdriver-1...")

        # Migrate to the other backend (host-level migration)
        dest_host = 'devstack@lvmdriver-1#lvmdriver-1'
        conn.block_storage.post(
            f'/volumes/{vol_id}/action',
            json={'os-migrate_volume': {'host': dest_host}}
        )

        time.sleep(SIGTERM_DELAY)
        send_sigterm()

        # Wait for volume to return to available (migration complete)
        vol, elapsed = wait_for_volume_status(conn, vol_id, 'available',
                                              timeout=120)
        if vol.status == 'available':
            r.passed = True
            r.message = f"available {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"ended in '{vol.status}'"

        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if vol_id:
            cleanup_volume(conn, vol_id)

    r.duration = time.time() - start
    return r


 def test_10_retype_volume_survives_sigterm():
    """SIGTERM during volume retype -> volume reaches 'available'."""
    r = TestResult("retype_volume_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None

    try:
        print("\n[10] Volume RETYPE survives SIGTERM")
        # Create volume on slow-lvm
        vol = create_available_volume(conn, 'gs-test-retype')
        vol_id = vol.id
        print(f"    Volume {vol_id} available (slow-lvm), retyping to "
              f"lvmdriver-1...")

        # Retype to lvmdriver-1 (with migration)
        conn.block_storage.post(
            f'/volumes/{vol_id}/action',
            json={'os-retype': {'new_type': 'lvmdriver-1',
                                'migration_policy': 'on-demand'}}
        )

        time.sleep(SIGTERM_DELAY)
        send_sigterm()

        # Wait for volume to return to available
        vol, elapsed = wait_for_volume_status(conn, vol_id, 'available',
                                              timeout=120)
        if vol.status == 'available':
            r.passed = True
            r.message = f"available {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"ended in '{vol.status}'"

        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if vol_id:
            cleanup_volume(conn, vol_id)

    r.duration = time.time() - start
    return r


 def test_11_manage_existing_survives_sigterm():
    """SIGTERM during manage_existing -> volume reaches 'available'.

    Creates an LV directly on the VG, then uses Cinder's manage API
    to import it as a volume. Sends SIGTERM mid-manage.
    """
    r = TestResult("manage_existing_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None
    lv_name = 'gs-test-manage-existing'

    try:
        print("\n[11] MANAGE EXISTING survives SIGTERM")
        # Create a raw LV on the VG that slow-lvm uses
        vg_name = 'stack-volumes-lvmdriver-1'
        subprocess.run(
            ['sudo', 'lvcreate', '-L', '1G', '-n', lv_name, vg_name],
            check=True, capture_output=True, timeout=10
        )
        print(f"    Created LV {vg_name}/{lv_name}")

        # Manage it via Cinder API
        # For LVM driver, source-name is just the LV name (not VG/LV)
        result = conn.block_storage.post(
            '/os-volume-manage',
            json={
                'volume': {
                    'host': 'devstack@slow-lvm',
                    'name': 'gs-test-managed',
                    'volume_type': SLOW_VOLUME_TYPE,
                    'ref': {'source-name': lv_name},
                }
            }
        )
        body = result.json()
        vol_id = body.get('volume', {}).get('id')

        if not vol_id:
            r.message = f"Could not initiate manage: {body}"
            r.duration = time.time() - start
            return r

        print(f"    Managing as volume {vol_id}...")
        time.sleep(SIGTERM_DELAY)
        send_sigterm()

        vol, elapsed = wait_for_volume_status(conn, vol_id, 'available',
                                              timeout=90)
        if vol.status == 'available':
            r.passed = True
            r.message = f"available {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"ended in '{vol.status}'"

        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
        # Clean up LV if manage failed
        subprocess.run(
            ['sudo', 'lvremove', '-f',
             f'stack-volumes-lvmdriver-1/{lv_name}'],
            capture_output=True
        )
    finally:
        if vol_id:
            cleanup_volume(conn, vol_id)

    r.duration = time.time() - start
    return r


 def test_12_create_group_survives_sigterm():
    """SIGTERM during group create -> group reaches 'available'.

    Creates a group type, then creates a consistency group on slow-lvm.
    """
    r = TestResult("create_group_survives_sigterm")
    start = time.time()
    conn = get_connection()
    group_id = None
    group_type_id = None

    try:
        print("\n[12] CREATE GROUP survives SIGTERM")

        # Create a group type (avoid reserved CG migration types)
        result = conn.block_storage.get('/group_types',
                                        microversion='3.11')
        group_types = result.json().get('group_types', [])
        # Filter out reserved CG migration group types
        usable_types = [gt for gt in group_types
                        if 'cgsnapshot' not in (gt.get('name') or '').lower()
                        and 'migration' not in
                        (gt.get('description') or '').lower()
                        and (gt.get('name') or '') !=
                        'group_type_for_migration']
        if usable_types:
            group_type_id = usable_types[0]['id']
            print(f"    Using existing group type {group_type_id}")
        else:
            result = conn.block_storage.post(
                '/group_types',
                json={'group_type': {
                    'name': 'gs-test-group-type',
                    'group_specs': {},
                }},
                microversion='3.11'
            )
            group_type_id = result.json()['group_type']['id']
            print(f"    Created group type {group_type_id}")

        # Get slow-lvm volume type ID
        vol_types = list(conn.block_storage.types())
        slow_type_id = None
        for vt in vol_types:
            if vt.name == SLOW_VOLUME_TYPE:
                slow_type_id = vt.id
                break

        if not slow_type_id:
            r.message = "Could not find slow-lvm volume type"
            r.duration = time.time() - start
            return r

        # Create a group
        result = conn.block_storage.post(
            '/groups',
            json={'group': {
                'name': 'gs-test-group',
                'group_type': group_type_id,
                'volume_types': [slow_type_id],
            }},
            microversion='3.13'
        )
        body = result.json()
        group_id = body.get('group', {}).get('id')
        print(f"    Group {group_id} creating...")

        time.sleep(SIGTERM_DELAY)
        send_sigterm()

        # Wait for group to reach available
        start_wait = time.time()
        final_status = None
        while time.time() - start_wait < MAX_WAIT_AFTER_SIGTERM:
            result = conn.block_storage.get(f'/groups/{group_id}',
                                            microversion='3.13')
            group = result.json().get('group', {})
            final_status = group.get('status')
            if final_status == 'available':
                elapsed = time.time() - start_wait
                r.passed = True
                r.message = f"available {elapsed:.1f}s after SIGTERM"
                break
            if final_status == 'error':
                r.message = "group ended in 'error'"
                break
            time.sleep(2)
        else:
            r.message = f"timed out (last status: {final_status})"

        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        # Cleanup group
        if group_id:
            try:
                conn.block_storage.post(
                    f'/groups/{group_id}/action',
                    json={'delete': {'delete-volumes': False}},
                    microversion='3.13'
                )
            except Exception:
                pass

    r.duration = time.time() - start
    return r


 def test_13_multiple_ops_complete_before_exit():
    """SIGTERM with 3 concurrent creates -> all reach 'available'."""
    r = TestResult("multiple_concurrent_ops_survive_sigterm")
    start = time.time()
    conn = get_connection()
    vol_ids = []

    try:
        print("\n[13] Multiple concurrent creates survive SIGTERM")
        for i in range(3):
            vol = conn.block_storage.create_volume(
                size=1, name=f'gs-test-multi-{i}',
                volume_type=SLOW_VOLUME_TYPE)
            vol_ids.append(vol.id)
            print(f"    Created {vol.id}")

        time.sleep(SIGTERM_DELAY)
        send_sigterm()

        statuses = {}
        for vid in vol_ids:
            try:
                v, elapsed = wait_for_volume_status(
                    conn, vid, 'available', timeout=120)
                statuses[vid] = v.status
            except TimeoutError:
                statuses[vid] = 'timeout'

        all_good = all(s == 'available' for s in statuses.values())
        if all_good:
            r.passed = True
            r.message = f"All {len(vol_ids)} volumes available after SIGTERM"
        else:
            r.message = f"Results: {statuses}"

        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        for vid in vol_ids:
            cleanup_volume(conn, vid)

    r.duration = time.time() - start
    return r


 def test_14_service_heartbeat_stops_during_drain():
    """After SIGTERM, service stops heartbeating (appears 'down')."""
    r = TestResult("service_heartbeat_stops_during_drain")
    start = time.time()
    conn = get_connection()
    vol_id = None

    try:
        print("\n[14] Service heartbeat stops during drain")

        # Verify service is up
        services = list(conn.block_storage.services())
        slow_svc = [s for s in services if 'slow-lvm' in (s.host or '')]
        if not slow_svc or slow_svc[0].state != 'up':
            r.message = "slow-lvm not 'up' initially - skipping"
            r.duration = time.time() - start
            return r
        print("    Service is 'up'")

        # Start slow op
        vol = conn.block_storage.create_volume(
            size=1, name='gs-test-heartbeat', volume_type=SLOW_VOLUME_TYPE)
        vol_id = vol.id
        time.sleep(SIGTERM_DELAY)
        send_sigterm()

        # The service_down_time is typically 25s. The service should stop
        # heartbeating immediately, but the scheduler won't mark it down
        # until service_down_time elapses. Check after the op completes.
        vol, _ = wait_for_volume_status(conn, vol_id, 'available')

        # After operation completes and process exits, service should be down
        time.sleep(5)
        services = list(conn.block_storage.services())
        slow_svc = [s for s in services if 'slow-lvm' in (s.host or '')]
        state_after = slow_svc[0].state if slow_svc else 'unknown'
        print(f"    Service state after drain: {state_after}")

        # Restart and verify recovery
        restart_cinder_volume()
        time.sleep(10)
        services = list(conn.block_storage.services())
        slow_svc = [s for s in services if 'slow-lvm' in (s.host or '')]
        state_recovered = slow_svc[0].state if slow_svc else 'unknown'

        if vol.status == 'available' and state_recovered == 'up':
            r.passed = True
            r.message = (f"Op completed, service was '{state_after}' "
                         f"during drain, recovered to 'up'")
        else:
            r.message = (f"vol={vol.status}, state_after={state_after}, "
                         f"recovered={state_recovered}")
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if vol_id:
            cleanup_volume(conn, vol_id)

    r.duration = time.time() - start
    return r


 # --- Main ---

 def main():
    print("=" * 72)
    print("  CINDER GRACEFUL SHUTDOWN - LIVE SIGTERM INTEGRATION TESTS")
    print("=" * 72)
    print(f"  Auth:          {AUTH_URL}")
    print(f"  Slow backend:  {SLOW_VOLUME_TYPE} (15s delay)")
    print(f"  SIGTERM after: {SIGTERM_DELAY}s into operation")
    print(f"  Wait timeout:  {MAX_WAIT_AFTER_SIGTERM}s")
    print("=" * 72)

    tests = [
        test_01_volume_create_survives_sigterm,
        test_02_volume_delete_survives_sigterm,
        test_03_snapshot_create_survives_sigterm,
        test_04_snapshot_delete_survives_sigterm,
        test_05_volume_extend_survives_sigterm,
        test_06_volume_clone_survives_sigterm,
        test_07_create_from_snapshot_survives_sigterm,
        test_08_copy_volume_to_image_survives_sigterm,
        test_09_migrate_volume_survives_sigterm,
        test_10_retype_volume_survives_sigterm,
        test_11_manage_existing_survives_sigterm,
        test_12_create_group_survives_sigterm,
        test_13_multiple_ops_complete_before_exit,
        test_14_service_heartbeat_stops_during_drain,
    ]

    conn = get_connection()
    results = []
    for test_func in tests:
        # Reset and delete any residual volumes from prior test's SIGTERM
        # so init_host doesn't queue them up on the slow backend
        drain_residual_volumes(conn)
        try:
            result = test_func()
        except Exception as e:
            result = TestResult(test_func.__name__)
            result.message = f"Unhandled: {e}"
        results.append(result)
        print(f"\n  => {result}")

    # --- Report ---
    print("\n\n")
    print("=" * 72)
    print("  FINAL REPORT")
    print("=" * 72)

    passed = sum(1 for r in results if r.passed)
    failed = sum(1 for r in results if not r.passed)
    total = len(results)
    total_time = sum(r.duration for r in results)

    print(f"\n  Total: {total} | Passed: {passed} | "
          f"Failed: {failed} | Duration: {total_time:.0f}s\n")
    print("-" * 72)

    for r in results:
        s = "PASS" if r.passed else "FAIL"
        print(f"  [{s}] {r.name}")
        print(f"        {r.message}")
        print()

    print("=" * 72)
    if failed == 0:
        print("  ALL TESTS PASSED")
    else:
        print(f"  {failed} TEST(S) FAILED")
    print("=" * 72)

    return 0 if failed == 0 else 1


 if __name__ == '__main__':
    exit(main())
No results found