Last active
April 29, 2026 19:05
-
-
Save hemna/f12d3441aa232b472f6a8266e9ccb4cc to your computer and use it in GitHub Desktop.
Cinder Graceful Shutdown - Live SIGTERM Integration Tests
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Copyright 2026 SAP SE | |
| # All Rights Reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); you may | |
| # not use this file except in compliance with the License. You may obtain | |
| # a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | |
| # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | |
| # License for the specific language governing permissions and limitations | |
| # under the License. | |
| """Tempest integration tests for Cinder graceful shutdown. | |
| Tests all volume/snapshot operations interrupted by SIGTERM mid-operation | |
| to verify graceful shutdown waits for in-flight work to complete. | |
| Uses FakeSlowVolumeDriver (slow-lvm backend) which injects a 15s delay | |
| into driver operations. SIGTERM is sent 5s into the delay, leaving 10s | |
| for the graceful shutdown to wait. | |
| Requirements: | |
| - DevStack with cinder running | |
| - slow-lvm backend configured with FakeSlowVolumeDriver | |
| - slow_driver_delay = 15 | |
| - slow_driver_operations = create_volume,delete_volume,create_snapshot, | |
| delete_snapshot,extend_volume,create_cloned_volume, | |
| create_volume_from_snapshot | |
| - sudo access for signals and service management | |
| Run: | |
| sudo /opt/stack/data/venv/bin/python \ | |
| ~/cinder/cinder/tests/test_graceful_shutdown_tempest.py | |
| """ | |
| import os | |
| import signal | |
| import subprocess | |
| import time | |
| import openstack | |
| # --- Configuration --- | |
| AUTH_URL = os.environ.get('OS_AUTH_URL', 'http://192.168.1.107/identity/v3') | |
| USERNAME = os.environ.get('OS_USERNAME', 'admin') | |
| PASSWORD = os.environ.get('OS_PASSWORD', 'openstack') | |
| PROJECT = os.environ.get('OS_PROJECT_NAME', 'admin') | |
| DOMAIN = os.environ.get('OS_USER_DOMAIN_NAME', 'Default') | |
| SLOW_VOLUME_TYPE = 'slow-lvm' | |
| FAST_VOLUME_TYPE = 'lvmdriver-1' | |
| # Time to wait before sending SIGTERM (must be < slow_driver_delay) | |
| SIGTERM_DELAY = 5 | |
| # Max time to wait for operation to complete after SIGTERM | |
| MAX_WAIT_AFTER_SIGTERM = 90 | |
| # Service restart timeout | |
| SERVICE_RESTART_TIMEOUT = 30 | |
| # --- Helpers --- | |
| def get_connection(): | |
| """Create an OpenStack SDK connection.""" | |
| return openstack.connect( | |
| auth_url=AUTH_URL, | |
| username=USERNAME, | |
| password=PASSWORD, | |
| project_name=PROJECT, | |
| user_domain_name=DOMAIN, | |
| project_domain_name=DOMAIN, | |
| ) | |
| def get_cinder_volume_pid(): | |
| """Get the PID of the cinder-volume parent process.""" | |
| result = subprocess.run( | |
| ['pgrep', '-f', 'cinder-volume.*--config-file'], | |
| capture_output=True, text=True | |
| ) | |
| pids = result.stdout.strip().split('\n') | |
| if pids and pids[0]: | |
| return int(pids[0]) | |
| return None | |
| def send_sigterm(): | |
| """Send SIGTERM to cinder-volume.""" | |
| pid = get_cinder_volume_pid() | |
| if pid: | |
| print(f" SIGTERM -> PID {pid}") | |
| os.kill(pid, signal.SIGTERM) | |
| return pid | |
| raise RuntimeError("Could not find cinder-volume process") | |
| def restart_cinder_volume(): | |
| """Restart cinder-volume and wait for it to be active.""" | |
| print(" Restarting cinder-volume...") | |
| subprocess.run( | |
| ['sudo', 'systemctl', 'restart', 'devstack@c-vol'], | |
| check=True, timeout=60 | |
| ) | |
| for i in range(SERVICE_RESTART_TIMEOUT): | |
| time.sleep(1) | |
| result = subprocess.run( | |
| ['sudo', 'systemctl', 'is-active', 'devstack@c-vol'], | |
| capture_output=True, text=True | |
| ) | |
| if 'active' in result.stdout: | |
| print(f" Service up (took {i+1}s)") | |
| # Wait extra for backend initialization and init_host cleanup | |
| # The slow driver may re-process stuck volumes from prior test | |
| time.sleep(10) | |
| return True | |
| raise RuntimeError("cinder-volume did not restart within timeout") | |
| def wait_for_volume_status(conn, vol_id, target, timeout=MAX_WAIT_AFTER_SIGTERM): | |
| """Wait for volume to reach target status. Returns (volume, elapsed).""" | |
| start = time.time() | |
| while time.time() - start < timeout: | |
| vol = conn.block_storage.get_volume(vol_id) | |
| if vol.status == target: | |
| return vol, time.time() - start | |
| if vol.status == 'error': | |
| return vol, time.time() - start | |
| time.sleep(2) | |
| raise TimeoutError( | |
| f"Volume {vol_id} did not reach '{target}' within {timeout}s " | |
| f"(last: {vol.status})") | |
| def wait_for_snapshot_status(conn, snap_id, target, | |
| timeout=MAX_WAIT_AFTER_SIGTERM): | |
| """Wait for snapshot to reach target status.""" | |
| start = time.time() | |
| while time.time() - start < timeout: | |
| snap = conn.block_storage.get_snapshot(snap_id) | |
| if snap.status == target: | |
| return snap, time.time() - start | |
| if snap.status == 'error': | |
| return snap, time.time() - start | |
| time.sleep(2) | |
| raise TimeoutError( | |
| f"Snapshot {snap_id} did not reach '{target}' within {timeout}s") | |
| def wait_for_volume_deleted(conn, vol_id, timeout=MAX_WAIT_AFTER_SIGTERM): | |
| """Wait for volume to be fully deleted.""" | |
| start = time.time() | |
| while time.time() - start < timeout: | |
| try: | |
| vol = conn.block_storage.get_volume(vol_id) | |
| if vol.status == 'error': | |
| return False, time.time() - start | |
| except Exception: | |
| return True, time.time() - start | |
| time.sleep(2) | |
| raise TimeoutError(f"Volume {vol_id} not deleted within {timeout}s") | |
| def wait_for_snapshot_deleted(conn, snap_id, timeout=MAX_WAIT_AFTER_SIGTERM): | |
| """Wait for snapshot to be fully deleted.""" | |
| start = time.time() | |
| while time.time() - start < timeout: | |
| try: | |
| snap = conn.block_storage.get_snapshot(snap_id) | |
| if snap.status == 'error': | |
| return False, time.time() - start | |
| except Exception: | |
| return True, time.time() - start | |
| time.sleep(2) | |
| raise TimeoutError(f"Snapshot {snap_id} not deleted within {timeout}s") | |
| def create_available_volume(conn, name, vol_type=SLOW_VOLUME_TYPE, size=1): | |
| """Create a volume and wait for it to be available.""" | |
| vol = conn.block_storage.create_volume( | |
| size=size, name=name, volume_type=vol_type) | |
| vol, _ = wait_for_volume_status(conn, vol.id, 'available', timeout=120) | |
| if vol.status != 'available': | |
| raise RuntimeError(f"Volume {vol.id} stuck in {vol.status}") | |
| return vol | |
| def create_available_snapshot(conn, vol_id, name): | |
| """Create a snapshot and wait for it to be available.""" | |
| snap = conn.block_storage.create_snapshot( | |
| volume_id=vol_id, name=name) | |
| snap, _ = wait_for_snapshot_status(conn, snap.id, 'available', timeout=120) | |
| if snap.status != 'available': | |
| raise RuntimeError(f"Snapshot {snap.id} stuck in {snap.status}") | |
| return snap | |
| def cleanup_volume(conn, vol_id): | |
| """Force-delete a volume, ignoring errors.""" | |
| try: | |
| conn.block_storage.reset_volume_status( | |
| vol_id, status='available', | |
| attach_status='detached', migration_status=None) | |
| except Exception: | |
| pass | |
| try: | |
| conn.block_storage.delete_volume(vol_id, force=True) | |
| except Exception: | |
| pass | |
| def cleanup_snapshot(conn, snap_id): | |
| """Force-delete a snapshot, ignoring errors.""" | |
| try: | |
| conn.block_storage.reset_snapshot_status(snap_id, status='available') | |
| except Exception: | |
| pass | |
| try: | |
| conn.block_storage.delete_snapshot(snap_id) | |
| except Exception: | |
| pass | |
| def drain_residual_volumes(conn, timeout=120): | |
| """Reset and force-delete all volumes/snapshots in transient states. | |
| This prevents init_host from spending time cleaning up residue from | |
| prior tests, which can cause prereq volume creation to time out. | |
| """ | |
| transient_statuses = ('creating', 'deleting', 'attaching', 'detaching', | |
| 'extending', 'migrating', 'retyping') | |
| # Clean snapshots first (they block volume deletion) | |
| try: | |
| snapshots = list(conn.block_storage.snapshots(all_projects=True)) | |
| for snap in snapshots: | |
| if snap.status in transient_statuses or snap.status == 'error': | |
| try: | |
| conn.block_storage.reset_snapshot_status( | |
| snap.id, status='available') | |
| conn.block_storage.delete_snapshot(snap.id) | |
| except Exception: | |
| pass | |
| except Exception: | |
| pass | |
| # Clean volumes in transient states | |
| residual_ids = [] | |
| try: | |
| volumes = list(conn.block_storage.volumes(all_projects=True)) | |
| for vol in volumes: | |
| if vol.status in transient_statuses: | |
| try: | |
| conn.block_storage.reset_volume_status( | |
| vol.id, status='error', | |
| attach_status='detached', migration_status=None) | |
| conn.block_storage.delete_volume(vol.id, force=True) | |
| residual_ids.append(vol.id) | |
| except Exception: | |
| pass | |
| except Exception: | |
| pass | |
| if not residual_ids: | |
| return | |
| # Wait for all residual volumes to actually be deleted | |
| import time as _time | |
| print(f" Draining {len(residual_ids)} residual volume(s)...") | |
| start = _time.time() | |
| while residual_ids and (_time.time() - start < timeout): | |
| _time.sleep(3) | |
| remaining = [] | |
| for vol_id in residual_ids: | |
| try: | |
| vol = conn.block_storage.get_volume(vol_id) | |
| if vol.status not in ('deleted',): | |
| remaining.append(vol_id) | |
| except Exception: | |
| # 404 = deleted | |
| pass | |
| residual_ids = remaining | |
| if residual_ids: | |
| print(f" Warning: {len(residual_ids)} volume(s) not fully drained") | |
| else: | |
| print(f" Drained in {_time.time() - start:.0f}s") | |
| # --- Test Result --- | |
| class TestResult: | |
| def __init__(self, name): | |
| self.name = name | |
| self.passed = False | |
| self.message = "" | |
| self.duration = 0 | |
| def __str__(self): | |
| s = "PASS" if self.passed else "FAIL" | |
| return f"[{s}] {self.name} ({self.duration:.1f}s) - {self.message}" | |
| # --- Tests --- | |
| def test_01_volume_create_survives_sigterm(): | |
| """SIGTERM during volume create -> volume reaches 'available'.""" | |
| r = TestResult("volume_create_survives_sigterm") | |
| start = time.time() | |
| conn = get_connection() | |
| vol_id = None | |
| try: | |
| print("\n[01] Volume CREATE survives SIGTERM") | |
| vol = conn.block_storage.create_volume( | |
| size=1, name='gs-test-create', volume_type=SLOW_VOLUME_TYPE) | |
| vol_id = vol.id | |
| print(f" Volume {vol_id} creating...") | |
| time.sleep(SIGTERM_DELAY) | |
| send_sigterm() | |
| vol, elapsed = wait_for_volume_status(conn, vol_id, 'available') | |
| if vol.status == 'available': | |
| r.passed = True | |
| r.message = f"available {elapsed:.1f}s after SIGTERM" | |
| else: | |
| r.message = f"ended in '{vol.status}'" | |
| restart_cinder_volume() | |
| except Exception as e: | |
| r.message = f"Exception: {e}" | |
| try: | |
| restart_cinder_volume() | |
| except Exception: | |
| pass | |
| finally: | |
| if vol_id: | |
| cleanup_volume(conn, vol_id) | |
| r.duration = time.time() - start | |
| return r | |
| def test_02_volume_delete_survives_sigterm(): | |
| """SIGTERM during volume delete -> volume fully deleted.""" | |
| r = TestResult("volume_delete_survives_sigterm") | |
| start = time.time() | |
| conn = get_connection() | |
| vol_id = None | |
| try: | |
| print("\n[02] Volume DELETE survives SIGTERM") | |
| vol = create_available_volume(conn, 'gs-test-delete') | |
| vol_id = vol.id | |
| print(f" Volume {vol_id} available, deleting...") | |
| conn.block_storage.delete_volume(vol_id) | |
| time.sleep(SIGTERM_DELAY) | |
| send_sigterm() | |
| deleted, elapsed = wait_for_volume_deleted(conn, vol_id) | |
| if deleted: | |
| r.passed = True | |
| r.message = f"deleted {elapsed:.1f}s after SIGTERM" | |
| vol_id = None # No cleanup needed | |
| else: | |
| r.message = "NOT deleted (stuck in error/deleting)" | |
| restart_cinder_volume() | |
| except Exception as e: | |
| r.message = f"Exception: {e}" | |
| try: | |
| restart_cinder_volume() | |
| except Exception: | |
| pass | |
| finally: | |
| if vol_id: | |
| cleanup_volume(conn, vol_id) | |
| r.duration = time.time() - start | |
| return r | |
| def test_03_snapshot_create_survives_sigterm(): | |
| """SIGTERM during snapshot create -> snapshot reaches 'available'.""" | |
| r = TestResult("snapshot_create_survives_sigterm") | |
| start = time.time() | |
| conn = get_connection() | |
| vol_id = None | |
| snap_id = None | |
| try: | |
| print("\n[03] Snapshot CREATE survives SIGTERM") | |
| vol = create_available_volume(conn, 'gs-test-snap-parent') | |
| vol_id = vol.id | |
| print(f" Volume {vol_id} available, creating snapshot...") | |
| snap = conn.block_storage.create_snapshot( | |
| volume_id=vol_id, name='gs-test-snap') | |
| snap_id = snap.id | |
| time.sleep(SIGTERM_DELAY) | |
| send_sigterm() | |
| snap, elapsed = wait_for_snapshot_status(conn, snap_id, 'available') | |
| if snap.status == 'available': | |
| r.passed = True | |
| r.message = f"available {elapsed:.1f}s after SIGTERM" | |
| else: | |
| r.message = f"ended in '{snap.status}'" | |
| restart_cinder_volume() | |
| except Exception as e: | |
| r.message = f"Exception: {e}" | |
| try: | |
| restart_cinder_volume() | |
| except Exception: | |
| pass | |
| finally: | |
| if snap_id: | |
| cleanup_snapshot(conn, snap_id) | |
| time.sleep(5) | |
| if vol_id: | |
| cleanup_volume(conn, vol_id) | |
| r.duration = time.time() - start | |
| return r | |
| def test_04_snapshot_delete_survives_sigterm(): | |
| """SIGTERM during snapshot delete -> snapshot fully deleted.""" | |
| r = TestResult("snapshot_delete_survives_sigterm") | |
| start = time.time() | |
| conn = get_connection() | |
| vol_id = None | |
| snap_id = None | |
| try: | |
| print("\n[04] Snapshot DELETE survives SIGTERM") | |
| vol = create_available_volume(conn, 'gs-test-snapdel-parent') | |
| vol_id = vol.id | |
| snap = create_available_snapshot(conn, vol_id, 'gs-test-snapdel') | |
| snap_id = snap.id | |
| print(f" Snapshot {snap_id} available, deleting...") | |
| conn.block_storage.delete_snapshot(snap_id) | |
| time.sleep(SIGTERM_DELAY) | |
| send_sigterm() | |
| deleted, elapsed = wait_for_snapshot_deleted(conn, snap_id) | |
| if deleted: | |
| r.passed = True | |
| r.message = f"deleted {elapsed:.1f}s after SIGTERM" | |
| snap_id = None | |
| else: | |
| r.message = "NOT deleted (stuck)" | |
| restart_cinder_volume() | |
| except Exception as e: | |
| r.message = f"Exception: {e}" | |
| try: | |
| restart_cinder_volume() | |
| except Exception: | |
| pass | |
| finally: | |
| if snap_id: | |
| cleanup_snapshot(conn, snap_id) | |
| time.sleep(5) | |
| if vol_id: | |
| cleanup_volume(conn, vol_id) | |
| r.duration = time.time() - start | |
| return r | |
| def test_05_volume_extend_survives_sigterm(): | |
| """SIGTERM during volume extend -> volume reaches 'available' at new size.""" | |
| r = TestResult("volume_extend_survives_sigterm") | |
| start = time.time() | |
| conn = get_connection() | |
| vol_id = None | |
| try: | |
| print("\n[05] Volume EXTEND survives SIGTERM") | |
| vol = create_available_volume(conn, 'gs-test-extend', size=1) | |
| vol_id = vol.id | |
| print(f" Volume {vol_id} available (1GB), extending to 2GB...") | |
| conn.block_storage.extend_volume(vol_id, size=2) | |
| time.sleep(SIGTERM_DELAY) | |
| send_sigterm() | |
| vol, elapsed = wait_for_volume_status(conn, vol_id, 'available') | |
| if vol.status == 'available' and vol.size == 2: | |
| r.passed = True | |
| r.message = f"available at 2GB, {elapsed:.1f}s after SIGTERM" | |
| elif vol.status == 'available': | |
| r.passed = True | |
| r.message = (f"available (size={vol.size}GB), " | |
| f"{elapsed:.1f}s after SIGTERM") | |
| else: | |
| r.message = f"ended in '{vol.status}' size={vol.size}" | |
| restart_cinder_volume() | |
| except Exception as e: | |
| r.message = f"Exception: {e}" | |
| try: | |
| restart_cinder_volume() | |
| except Exception: | |
| pass | |
| finally: | |
| if vol_id: | |
| cleanup_volume(conn, vol_id) | |
| r.duration = time.time() - start | |
| return r | |
| def test_06_volume_clone_survives_sigterm(): | |
| """SIGTERM during volume clone -> cloned volume reaches 'available'.""" | |
| r = TestResult("volume_clone_survives_sigterm") | |
| start = time.time() | |
| conn = get_connection() | |
| src_vol_id = None | |
| clone_vol_id = None | |
| try: | |
| print("\n[06] Volume CLONE survives SIGTERM") | |
| src_vol = create_available_volume(conn, 'gs-test-clone-src') | |
| src_vol_id = src_vol.id | |
| print(f" Source {src_vol_id} available, cloning...") | |
| clone = conn.block_storage.create_volume( | |
| size=1, name='gs-test-clone-dst', | |
| volume_type=SLOW_VOLUME_TYPE, | |
| source_volid=src_vol_id) | |
| clone_vol_id = clone.id | |
| time.sleep(SIGTERM_DELAY) | |
| send_sigterm() | |
| clone, elapsed = wait_for_volume_status( | |
| conn, clone_vol_id, 'available') | |
| if clone.status == 'available': | |
| r.passed = True | |
| r.message = f"clone available {elapsed:.1f}s after SIGTERM" | |
| else: | |
| r.message = f"clone ended in '{clone.status}'" | |
| restart_cinder_volume() | |
| except Exception as e: | |
| r.message = f"Exception: {e}" | |
| try: | |
| restart_cinder_volume() | |
| except Exception: | |
| pass | |
| finally: | |
| if clone_vol_id: | |
| cleanup_volume(conn, clone_vol_id) | |
| time.sleep(5) | |
| if src_vol_id: | |
| cleanup_volume(conn, src_vol_id) | |
| r.duration = time.time() - start | |
| return r | |
| def test_07_create_from_snapshot_survives_sigterm(): | |
| """SIGTERM during create-from-snapshot -> volume reaches 'available'.""" | |
| r = TestResult("create_from_snapshot_survives_sigterm") | |
| start = time.time() | |
| conn = get_connection() | |
| src_vol_id = None | |
| snap_id = None | |
| new_vol_id = None | |
| try: | |
| print("\n[07] Create from SNAPSHOT survives SIGTERM") | |
| src_vol = create_available_volume(conn, 'gs-test-fromsnap-src') | |
| src_vol_id = src_vol.id | |
| snap = create_available_snapshot(conn, src_vol_id, 'gs-test-fromsnap') | |
| snap_id = snap.id | |
| print(f" Snapshot {snap_id} available, creating volume from it...") | |
| new_vol = conn.block_storage.create_volume( | |
| size=1, name='gs-test-fromsnap-dst', | |
| volume_type=SLOW_VOLUME_TYPE, | |
| snapshot_id=snap_id) | |
| new_vol_id = new_vol.id | |
| time.sleep(SIGTERM_DELAY) | |
| send_sigterm() | |
| new_vol, elapsed = wait_for_volume_status( | |
| conn, new_vol_id, 'available') | |
| if new_vol.status == 'available': | |
| r.passed = True | |
| r.message = f"available {elapsed:.1f}s after SIGTERM" | |
| else: | |
| r.message = f"ended in '{new_vol.status}'" | |
| restart_cinder_volume() | |
| except Exception as e: | |
| r.message = f"Exception: {e}" | |
| try: | |
| restart_cinder_volume() | |
| except Exception: | |
| pass | |
| finally: | |
| if new_vol_id: | |
| cleanup_volume(conn, new_vol_id) | |
| time.sleep(5) | |
| if snap_id: | |
| cleanup_snapshot(conn, snap_id) | |
| time.sleep(5) | |
| if src_vol_id: | |
| cleanup_volume(conn, src_vol_id) | |
| r.duration = time.time() - start | |
| return r | |
| def test_08_copy_volume_to_image_survives_sigterm(): | |
| """SIGTERM during copy_volume_to_image -> image upload completes.""" | |
| r = TestResult("copy_volume_to_image_survives_sigterm") | |
| start = time.time() | |
| conn = get_connection() | |
| vol_id = None | |
| image_id = None | |
| try: | |
| print("\n[08] Copy volume to IMAGE survives SIGTERM") | |
| # Create volume on slow backend (will delay on copy_volume_to_image) | |
| vol = create_available_volume(conn, 'gs-test-upload-to-image') | |
| vol_id = vol.id | |
| print(f" Volume {vol_id} available, uploading to image...") | |
| # Upload volume to image (triggers copy_volume_to_image in driver) | |
| result = conn.block_storage.post( | |
| f'/volumes/{vol_id}/action', | |
| json={ | |
| 'os-volume_upload_image': { | |
| 'image_name': 'gs-test-image-upload', | |
| 'disk_format': 'raw', | |
| 'container_format': 'bare', | |
| } | |
| } | |
| ) | |
| body = result.json() | |
| image_id = body.get('os-volume_upload_image', {}).get('image_id') | |
| print(f" Image {image_id} upload started") | |
| time.sleep(SIGTERM_DELAY) | |
| send_sigterm() | |
| # Wait for volume to return to 'available' (upload complete) | |
| vol, elapsed = wait_for_volume_status(conn, vol_id, 'available') | |
| if vol.status == 'available': | |
| r.passed = True | |
| r.message = f"volume back to 'available' {elapsed:.1f}s after SIGTERM" | |
| else: | |
| r.message = f"volume ended in '{vol.status}'" | |
| restart_cinder_volume() | |
| except Exception as e: | |
| r.message = f"Exception: {e}" | |
| try: | |
| restart_cinder_volume() | |
| except Exception: | |
| pass | |
| finally: | |
| # Clean up image | |
| if image_id: | |
| try: | |
| conn.image.delete_image(image_id) | |
| except Exception: | |
| pass | |
| if vol_id: | |
| cleanup_volume(conn, vol_id) | |
| r.duration = time.time() - start | |
| return r | |
| def test_09_migrate_volume_survives_sigterm(): | |
| """SIGTERM during volume migration -> volume reaches 'available'.""" | |
| r = TestResult("migrate_volume_survives_sigterm") | |
| start = time.time() | |
| conn = get_connection() | |
| vol_id = None | |
| try: | |
| print("\n[09] Volume MIGRATE survives SIGTERM") | |
| # Create volume on slow-lvm backend | |
| vol = create_available_volume(conn, 'gs-test-migrate') | |
| vol_id = vol.id | |
| print(f" Volume {vol_id} available on slow-lvm, migrating to " | |
| f"lvmdriver-1...") | |
| # Migrate to the other backend (host-level migration) | |
| dest_host = 'devstack@lvmdriver-1#lvmdriver-1' | |
| conn.block_storage.post( | |
| f'/volumes/{vol_id}/action', | |
| json={'os-migrate_volume': {'host': dest_host}} | |
| ) | |
| time.sleep(SIGTERM_DELAY) | |
| send_sigterm() | |
| # Wait for volume to return to available (migration complete) | |
| vol, elapsed = wait_for_volume_status(conn, vol_id, 'available', | |
| timeout=120) | |
| if vol.status == 'available': | |
| r.passed = True | |
| r.message = f"available {elapsed:.1f}s after SIGTERM" | |
| else: | |
| r.message = f"ended in '{vol.status}'" | |
| restart_cinder_volume() | |
| except Exception as e: | |
| r.message = f"Exception: {e}" | |
| try: | |
| restart_cinder_volume() | |
| except Exception: | |
| pass | |
| finally: | |
| if vol_id: | |
| cleanup_volume(conn, vol_id) | |
| r.duration = time.time() - start | |
| return r | |
| def test_10_retype_volume_survives_sigterm(): | |
| """SIGTERM during volume retype -> volume reaches 'available'.""" | |
| r = TestResult("retype_volume_survives_sigterm") | |
| start = time.time() | |
| conn = get_connection() | |
| vol_id = None | |
| try: | |
| print("\n[10] Volume RETYPE survives SIGTERM") | |
| # Create volume on slow-lvm | |
| vol = create_available_volume(conn, 'gs-test-retype') | |
| vol_id = vol.id | |
| print(f" Volume {vol_id} available (slow-lvm), retyping to " | |
| f"lvmdriver-1...") | |
| # Retype to lvmdriver-1 (with migration) | |
| conn.block_storage.post( | |
| f'/volumes/{vol_id}/action', | |
| json={'os-retype': {'new_type': 'lvmdriver-1', | |
| 'migration_policy': 'on-demand'}} | |
| ) | |
| time.sleep(SIGTERM_DELAY) | |
| send_sigterm() | |
| # Wait for volume to return to available | |
| vol, elapsed = wait_for_volume_status(conn, vol_id, 'available', | |
| timeout=120) | |
| if vol.status == 'available': | |
| r.passed = True | |
| r.message = f"available {elapsed:.1f}s after SIGTERM" | |
| else: | |
| r.message = f"ended in '{vol.status}'" | |
| restart_cinder_volume() | |
| except Exception as e: | |
| r.message = f"Exception: {e}" | |
| try: | |
| restart_cinder_volume() | |
| except Exception: | |
| pass | |
| finally: | |
| if vol_id: | |
| cleanup_volume(conn, vol_id) | |
| r.duration = time.time() - start | |
| return r | |
| def test_11_manage_existing_survives_sigterm(): | |
| """SIGTERM during manage_existing -> volume reaches 'available'. | |
| Creates an LV directly on the VG, then uses Cinder's manage API | |
| to import it as a volume. Sends SIGTERM mid-manage. | |
| """ | |
| r = TestResult("manage_existing_survives_sigterm") | |
| start = time.time() | |
| conn = get_connection() | |
| vol_id = None | |
| lv_name = 'gs-test-manage-existing' | |
| try: | |
| print("\n[11] MANAGE EXISTING survives SIGTERM") | |
| # Create a raw LV on the VG that slow-lvm uses | |
| vg_name = 'stack-volumes-lvmdriver-1' | |
| subprocess.run( | |
| ['sudo', 'lvcreate', '-L', '1G', '-n', lv_name, vg_name], | |
| check=True, capture_output=True, timeout=10 | |
| ) | |
| print(f" Created LV {vg_name}/{lv_name}") | |
| # Manage it via Cinder API | |
| # For LVM driver, source-name is just the LV name (not VG/LV) | |
| result = conn.block_storage.post( | |
| '/os-volume-manage', | |
| json={ | |
| 'volume': { | |
| 'host': 'devstack@slow-lvm', | |
| 'name': 'gs-test-managed', | |
| 'volume_type': SLOW_VOLUME_TYPE, | |
| 'ref': {'source-name': lv_name}, | |
| } | |
| } | |
| ) | |
| body = result.json() | |
| vol_id = body.get('volume', {}).get('id') | |
| if not vol_id: | |
| r.message = f"Could not initiate manage: {body}" | |
| r.duration = time.time() - start | |
| return r | |
| print(f" Managing as volume {vol_id}...") | |
| time.sleep(SIGTERM_DELAY) | |
| send_sigterm() | |
| vol, elapsed = wait_for_volume_status(conn, vol_id, 'available', | |
| timeout=90) | |
| if vol.status == 'available': | |
| r.passed = True | |
| r.message = f"available {elapsed:.1f}s after SIGTERM" | |
| else: | |
| r.message = f"ended in '{vol.status}'" | |
| restart_cinder_volume() | |
| except Exception as e: | |
| r.message = f"Exception: {e}" | |
| try: | |
| restart_cinder_volume() | |
| except Exception: | |
| pass | |
| # Clean up LV if manage failed | |
| subprocess.run( | |
| ['sudo', 'lvremove', '-f', | |
| f'stack-volumes-lvmdriver-1/{lv_name}'], | |
| capture_output=True | |
| ) | |
| finally: | |
| if vol_id: | |
| cleanup_volume(conn, vol_id) | |
| r.duration = time.time() - start | |
| return r | |
| def test_12_create_group_survives_sigterm(): | |
| """SIGTERM during group create -> group reaches 'available'. | |
| Creates a group type, then creates a consistency group on slow-lvm. | |
| """ | |
| r = TestResult("create_group_survives_sigterm") | |
| start = time.time() | |
| conn = get_connection() | |
| group_id = None | |
| group_type_id = None | |
| try: | |
| print("\n[12] CREATE GROUP survives SIGTERM") | |
| # Create a group type (avoid reserved CG migration types) | |
| result = conn.block_storage.get('/group_types', | |
| microversion='3.11') | |
| group_types = result.json().get('group_types', []) | |
| # Filter out reserved CG migration group types | |
| usable_types = [gt for gt in group_types | |
| if 'cgsnapshot' not in (gt.get('name') or '').lower() | |
| and 'migration' not in | |
| (gt.get('description') or '').lower() | |
| and (gt.get('name') or '') != | |
| 'group_type_for_migration'] | |
| if usable_types: | |
| group_type_id = usable_types[0]['id'] | |
| print(f" Using existing group type {group_type_id}") | |
| else: | |
| result = conn.block_storage.post( | |
| '/group_types', | |
| json={'group_type': { | |
| 'name': 'gs-test-group-type', | |
| 'group_specs': {}, | |
| }}, | |
| microversion='3.11' | |
| ) | |
| group_type_id = result.json()['group_type']['id'] | |
| print(f" Created group type {group_type_id}") | |
| # Get slow-lvm volume type ID | |
| vol_types = list(conn.block_storage.types()) | |
| slow_type_id = None | |
| for vt in vol_types: | |
| if vt.name == SLOW_VOLUME_TYPE: | |
| slow_type_id = vt.id | |
| break | |
| if not slow_type_id: | |
| r.message = "Could not find slow-lvm volume type" | |
| r.duration = time.time() - start | |
| return r | |
| # Create a group | |
| result = conn.block_storage.post( | |
| '/groups', | |
| json={'group': { | |
| 'name': 'gs-test-group', | |
| 'group_type': group_type_id, | |
| 'volume_types': [slow_type_id], | |
| }}, | |
| microversion='3.13' | |
| ) | |
| body = result.json() | |
| group_id = body.get('group', {}).get('id') | |
| print(f" Group {group_id} creating...") | |
| time.sleep(SIGTERM_DELAY) | |
| send_sigterm() | |
| # Wait for group to reach available | |
| start_wait = time.time() | |
| final_status = None | |
| while time.time() - start_wait < MAX_WAIT_AFTER_SIGTERM: | |
| result = conn.block_storage.get(f'/groups/{group_id}', | |
| microversion='3.13') | |
| group = result.json().get('group', {}) | |
| final_status = group.get('status') | |
| if final_status == 'available': | |
| elapsed = time.time() - start_wait | |
| r.passed = True | |
| r.message = f"available {elapsed:.1f}s after SIGTERM" | |
| break | |
| if final_status == 'error': | |
| r.message = "group ended in 'error'" | |
| break | |
| time.sleep(2) | |
| else: | |
| r.message = f"timed out (last status: {final_status})" | |
| restart_cinder_volume() | |
| except Exception as e: | |
| r.message = f"Exception: {e}" | |
| try: | |
| restart_cinder_volume() | |
| except Exception: | |
| pass | |
| finally: | |
| # Cleanup group | |
| if group_id: | |
| try: | |
| conn.block_storage.post( | |
| f'/groups/{group_id}/action', | |
| json={'delete': {'delete-volumes': False}}, | |
| microversion='3.13' | |
| ) | |
| except Exception: | |
| pass | |
| r.duration = time.time() - start | |
| return r | |
| def test_13_multiple_ops_complete_before_exit(): | |
| """SIGTERM with 3 concurrent creates -> all reach 'available'.""" | |
| r = TestResult("multiple_concurrent_ops_survive_sigterm") | |
| start = time.time() | |
| conn = get_connection() | |
| vol_ids = [] | |
| try: | |
| print("\n[13] Multiple concurrent creates survive SIGTERM") | |
| for i in range(3): | |
| vol = conn.block_storage.create_volume( | |
| size=1, name=f'gs-test-multi-{i}', | |
| volume_type=SLOW_VOLUME_TYPE) | |
| vol_ids.append(vol.id) | |
| print(f" Created {vol.id}") | |
| time.sleep(SIGTERM_DELAY) | |
| send_sigterm() | |
| statuses = {} | |
| for vid in vol_ids: | |
| try: | |
| v, elapsed = wait_for_volume_status( | |
| conn, vid, 'available', timeout=120) | |
| statuses[vid] = v.status | |
| except TimeoutError: | |
| statuses[vid] = 'timeout' | |
| all_good = all(s == 'available' for s in statuses.values()) | |
| if all_good: | |
| r.passed = True | |
| r.message = f"All {len(vol_ids)} volumes available after SIGTERM" | |
| else: | |
| r.message = f"Results: {statuses}" | |
| restart_cinder_volume() | |
| except Exception as e: | |
| r.message = f"Exception: {e}" | |
| try: | |
| restart_cinder_volume() | |
| except Exception: | |
| pass | |
| finally: | |
| for vid in vol_ids: | |
| cleanup_volume(conn, vid) | |
| r.duration = time.time() - start | |
| return r | |
| def test_14_service_heartbeat_stops_during_drain(): | |
| """After SIGTERM, service stops heartbeating (appears 'down').""" | |
| r = TestResult("service_heartbeat_stops_during_drain") | |
| start = time.time() | |
| conn = get_connection() | |
| vol_id = None | |
| try: | |
| print("\n[14] Service heartbeat stops during drain") | |
| # Verify service is up | |
| services = list(conn.block_storage.services()) | |
| slow_svc = [s for s in services if 'slow-lvm' in (s.host or '')] | |
| if not slow_svc or slow_svc[0].state != 'up': | |
| r.message = "slow-lvm not 'up' initially - skipping" | |
| r.duration = time.time() - start | |
| return r | |
| print(" Service is 'up'") | |
| # Start slow op | |
| vol = conn.block_storage.create_volume( | |
| size=1, name='gs-test-heartbeat', volume_type=SLOW_VOLUME_TYPE) | |
| vol_id = vol.id | |
| time.sleep(SIGTERM_DELAY) | |
| send_sigterm() | |
| # The service_down_time is typically 25s. The service should stop | |
| # heartbeating immediately, but the scheduler won't mark it down | |
| # until service_down_time elapses. Check after the op completes. | |
| vol, _ = wait_for_volume_status(conn, vol_id, 'available') | |
| # After operation completes and process exits, service should be down | |
| time.sleep(5) | |
| services = list(conn.block_storage.services()) | |
| slow_svc = [s for s in services if 'slow-lvm' in (s.host or '')] | |
| state_after = slow_svc[0].state if slow_svc else 'unknown' | |
| print(f" Service state after drain: {state_after}") | |
| # Restart and verify recovery | |
| restart_cinder_volume() | |
| time.sleep(10) | |
| services = list(conn.block_storage.services()) | |
| slow_svc = [s for s in services if 'slow-lvm' in (s.host or '')] | |
| state_recovered = slow_svc[0].state if slow_svc else 'unknown' | |
| if vol.status == 'available' and state_recovered == 'up': | |
| r.passed = True | |
| r.message = (f"Op completed, service was '{state_after}' " | |
| f"during drain, recovered to 'up'") | |
| else: | |
| r.message = (f"vol={vol.status}, state_after={state_after}, " | |
| f"recovered={state_recovered}") | |
| except Exception as e: | |
| r.message = f"Exception: {e}" | |
| try: | |
| restart_cinder_volume() | |
| except Exception: | |
| pass | |
| finally: | |
| if vol_id: | |
| cleanup_volume(conn, vol_id) | |
| r.duration = time.time() - start | |
| return r | |
| # --- Main --- | |
| def main(): | |
| print("=" * 72) | |
| print(" CINDER GRACEFUL SHUTDOWN - LIVE SIGTERM INTEGRATION TESTS") | |
| print("=" * 72) | |
| print(f" Auth: {AUTH_URL}") | |
| print(f" Slow backend: {SLOW_VOLUME_TYPE} (15s delay)") | |
| print(f" SIGTERM after: {SIGTERM_DELAY}s into operation") | |
| print(f" Wait timeout: {MAX_WAIT_AFTER_SIGTERM}s") | |
| print("=" * 72) | |
| tests = [ | |
| test_01_volume_create_survives_sigterm, | |
| test_02_volume_delete_survives_sigterm, | |
| test_03_snapshot_create_survives_sigterm, | |
| test_04_snapshot_delete_survives_sigterm, | |
| test_05_volume_extend_survives_sigterm, | |
| test_06_volume_clone_survives_sigterm, | |
| test_07_create_from_snapshot_survives_sigterm, | |
| test_08_copy_volume_to_image_survives_sigterm, | |
| test_09_migrate_volume_survives_sigterm, | |
| test_10_retype_volume_survives_sigterm, | |
| test_11_manage_existing_survives_sigterm, | |
| test_12_create_group_survives_sigterm, | |
| test_13_multiple_ops_complete_before_exit, | |
| test_14_service_heartbeat_stops_during_drain, | |
| ] | |
| conn = get_connection() | |
| results = [] | |
| for test_func in tests: | |
| # Reset and delete any residual volumes from prior test's SIGTERM | |
| # so init_host doesn't queue them up on the slow backend | |
| drain_residual_volumes(conn) | |
| try: | |
| result = test_func() | |
| except Exception as e: | |
| result = TestResult(test_func.__name__) | |
| result.message = f"Unhandled: {e}" | |
| results.append(result) | |
| print(f"\n => {result}") | |
| # --- Report --- | |
| print("\n\n") | |
| print("=" * 72) | |
| print(" FINAL REPORT") | |
| print("=" * 72) | |
| passed = sum(1 for r in results if r.passed) | |
| failed = sum(1 for r in results if not r.passed) | |
| total = len(results) | |
| total_time = sum(r.duration for r in results) | |
| print(f"\n Total: {total} | Passed: {passed} | " | |
| f"Failed: {failed} | Duration: {total_time:.0f}s\n") | |
| print("-" * 72) | |
| for r in results: | |
| s = "PASS" if r.passed else "FAIL" | |
| print(f" [{s}] {r.name}") | |
| print(f" {r.message}") | |
| print() | |
| print("=" * 72) | |
| if failed == 0: | |
| print(" ALL TESTS PASSED") | |
| else: | |
| print(f" {failed} TEST(S) FAILED") | |
| print("=" * 72) | |
| return 0 if failed == 0 else 1 | |
| if __name__ == '__main__': | |
| exit(main()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment