Skip to content

Instantly share code, notes, and snippets.

@hemna
Last active April 29, 2026 19:05
Show Gist options
  • Select an option

  • Save hemna/f12d3441aa232b472f6a8266e9ccb4cc to your computer and use it in GitHub Desktop.

Select an option

Save hemna/f12d3441aa232b472f6a8266e9ccb4cc to your computer and use it in GitHub Desktop.
Cinder Graceful Shutdown - Live SIGTERM Integration Tests
# Copyright 2026 SAP SE
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Tempest integration tests for Cinder graceful shutdown.
Tests all volume/snapshot operations interrupted by SIGTERM mid-operation
to verify graceful shutdown waits for in-flight work to complete.
Uses FakeSlowVolumeDriver (slow-lvm backend) which injects a 15s delay
into driver operations. SIGTERM is sent 5s into the delay, leaving 10s
for the graceful shutdown to wait.
Requirements:
- DevStack with cinder running
- slow-lvm backend configured with FakeSlowVolumeDriver
- slow_driver_delay = 15
- slow_driver_operations = create_volume,delete_volume,create_snapshot,
delete_snapshot,extend_volume,create_cloned_volume,
create_volume_from_snapshot
- sudo access for signals and service management
Run:
sudo /opt/stack/data/venv/bin/python \
~/cinder/cinder/tests/test_graceful_shutdown_tempest.py
"""
import os
import signal
import subprocess
import time
import openstack
# --- Configuration ---
# Keystone endpoint and admin credentials; each is overridable through the
# standard OS_* environment variables (defaults match a local DevStack).
AUTH_URL = os.environ.get('OS_AUTH_URL', 'http://192.168.1.107/identity/v3')
USERNAME = os.environ.get('OS_USERNAME', 'admin')
PASSWORD = os.environ.get('OS_PASSWORD', 'openstack')
PROJECT = os.environ.get('OS_PROJECT_NAME', 'admin')
DOMAIN = os.environ.get('OS_USER_DOMAIN_NAME', 'Default')
# Volume type backed by FakeSlowVolumeDriver (15s per-operation delay).
SLOW_VOLUME_TYPE = 'slow-lvm'
# Regular LVM volume type, used as the migrate/retype destination.
FAST_VOLUME_TYPE = 'lvmdriver-1'
# Time to wait before sending SIGTERM (must be < slow_driver_delay)
SIGTERM_DELAY = 5
# Max time to wait for operation to complete after SIGTERM (seconds)
MAX_WAIT_AFTER_SIGTERM = 90
# Service restart timeout (seconds of 1s polls)
SERVICE_RESTART_TIMEOUT = 30
# --- Helpers ---
def get_connection():
    """Build an authenticated OpenStack SDK connection from module config."""
    auth_kwargs = {
        'auth_url': AUTH_URL,
        'username': USERNAME,
        'password': PASSWORD,
        'project_name': PROJECT,
        'user_domain_name': DOMAIN,
        'project_domain_name': DOMAIN,
    }
    return openstack.connect(**auth_kwargs)
def get_cinder_volume_pid():
    """Return the PID of the cinder-volume parent process, or None.

    pgrep matches the parent's full command line; only the first PID
    listed is used (the parent comes before any forked children).
    """
    completed = subprocess.run(
        ['pgrep', '-f', 'cinder-volume.*--config-file'],
        capture_output=True, text=True,
    )
    first_pid = completed.stdout.strip().split('\n')[0]
    return int(first_pid) if first_pid else None
def send_sigterm():
    """Deliver SIGTERM to the cinder-volume parent process.

    Returns the signalled PID; raises RuntimeError when no process
    matching cinder-volume can be located.
    """
    target = get_cinder_volume_pid()
    if not target:
        raise RuntimeError("Could not find cinder-volume process")
    print(f" SIGTERM -> PID {target}")
    os.kill(target, signal.SIGTERM)
    return target
def restart_cinder_volume():
    """Restart cinder-volume via systemd and wait for it to be active.

    Polls ``systemctl is-active`` once per second for up to
    SERVICE_RESTART_TIMEOUT seconds, then allows extra settle time for
    backend/init_host work. Returns True on success; raises RuntimeError
    if the unit never reports active.
    """
    print(" Restarting cinder-volume...")
    subprocess.run(
        ['sudo', 'systemctl', 'restart', 'devstack@c-vol'],
        check=True, timeout=60
    )
    for i in range(SERVICE_RESTART_TIMEOUT):
        time.sleep(1)
        result = subprocess.run(
            ['sudo', 'systemctl', 'is-active', 'devstack@c-vol'],
            capture_output=True, text=True
        )
        # BUGFIX: 'is-active' can print 'inactive', 'activating' or
        # 'failed'; the substring test `'active' in stdout` matched
        # 'inactive'/'activating' too. Require an exact state match.
        if result.stdout.strip() == 'active':
            print(f" Service up (took {i+1}s)")
            # Wait extra for backend initialization and init_host cleanup
            # The slow driver may re-process stuck volumes from prior test
            time.sleep(10)
            return True
    raise RuntimeError("cinder-volume did not restart within timeout")
def wait_for_volume_status(conn, vol_id, target, timeout=MAX_WAIT_AFTER_SIGTERM):
    """Poll volume ``vol_id`` until it reaches ``target`` status.

    'error' is treated as terminal as well so callers can inspect the
    failure. Returns (volume, elapsed_seconds); raises TimeoutError when
    no terminal state appears within ``timeout`` seconds.
    """
    began = time.time()
    while time.time() - began < timeout:
        vol = conn.block_storage.get_volume(vol_id)
        if vol.status in (target, 'error'):
            return vol, time.time() - began
        time.sleep(2)
    raise TimeoutError(
        f"Volume {vol_id} did not reach '{target}' within {timeout}s "
        f"(last: {vol.status})")
def wait_for_snapshot_status(conn, snap_id, target,
                             timeout=MAX_WAIT_AFTER_SIGTERM):
    """Poll a snapshot until it reaches ``target`` (or lands in 'error').

    Returns (snapshot, elapsed_seconds); raises TimeoutError when
    neither terminal state is observed within ``timeout`` seconds.
    """
    began = time.time()
    while time.time() - began < timeout:
        snap = conn.block_storage.get_snapshot(snap_id)
        if snap.status in (target, 'error'):
            return snap, time.time() - began
        time.sleep(2)
    raise TimeoutError(
        f"Snapshot {snap_id} did not reach '{target}' within {timeout}s")
def wait_for_volume_deleted(conn, vol_id, timeout=MAX_WAIT_AFTER_SIGTERM):
    """Poll until the volume disappears (True) or errors out (False).

    Returns (deleted, elapsed_seconds); raises TimeoutError on timeout.
    """
    began = time.time()
    deadline = began + timeout
    while time.time() < deadline:
        try:
            if conn.block_storage.get_volume(vol_id).status == 'error':
                return False, time.time() - began
        except Exception:
            # Any lookup failure (typically a 404) => volume is gone.
            return True, time.time() - began
        time.sleep(2)
    raise TimeoutError(f"Volume {vol_id} not deleted within {timeout}s")
def wait_for_snapshot_deleted(conn, snap_id, timeout=MAX_WAIT_AFTER_SIGTERM):
    """Poll until the snapshot disappears (True) or errors out (False).

    Returns (deleted, elapsed_seconds); raises TimeoutError on timeout.
    """
    began = time.time()
    deadline = began + timeout
    while time.time() < deadline:
        try:
            if conn.block_storage.get_snapshot(snap_id).status == 'error':
                return False, time.time() - began
        except Exception:
            # Lookup failure (typically a 404) => snapshot is gone.
            return True, time.time() - began
        time.sleep(2)
    raise TimeoutError(f"Snapshot {snap_id} not deleted within {timeout}s")
def create_available_volume(conn, name, vol_type=SLOW_VOLUME_TYPE, size=1):
    """Create a volume and block until it is 'available'.

    Raises RuntimeError if the volume lands in any other terminal state.
    """
    new_vol = conn.block_storage.create_volume(
        size=size, name=name, volume_type=vol_type)
    new_vol, _ = wait_for_volume_status(conn, new_vol.id, 'available',
                                        timeout=120)
    if new_vol.status != 'available':
        raise RuntimeError(f"Volume {new_vol.id} stuck in {new_vol.status}")
    return new_vol
def create_available_snapshot(conn, vol_id, name):
    """Snapshot ``vol_id`` and block until the snapshot is 'available'.

    Raises RuntimeError if the snapshot lands in any other terminal state.
    """
    new_snap = conn.block_storage.create_snapshot(
        volume_id=vol_id, name=name)
    new_snap, _ = wait_for_snapshot_status(conn, new_snap.id, 'available',
                                           timeout=120)
    if new_snap.status != 'available':
        raise RuntimeError(f"Snapshot {new_snap.id} stuck in {new_snap.status}")
    return new_snap
def cleanup_volume(conn, vol_id):
    """Best-effort force delete of a volume.

    First resets the volume to a deletable state, then force-deletes it;
    every step independently swallows failures so cleanup never raises.
    """
    steps = (
        lambda: conn.block_storage.reset_volume_status(
            vol_id, status='available',
            attach_status='detached', migration_status=None),
        lambda: conn.block_storage.delete_volume(vol_id, force=True),
    )
    for step in steps:
        try:
            step()
        except Exception:
            pass
def cleanup_snapshot(conn, snap_id):
    """Best-effort delete of a snapshot.

    Resets the snapshot to 'available' then deletes it; each step
    independently swallows failures so cleanup never raises.
    """
    steps = (
        lambda: conn.block_storage.reset_snapshot_status(
            snap_id, status='available'),
        lambda: conn.block_storage.delete_snapshot(snap_id),
    )
    for step in steps:
        try:
            step()
        except Exception:
            pass
def drain_residual_volumes(conn, timeout=120):
    """Reset and force-delete all volumes/snapshots in transient states.

    This prevents init_host from spending time cleaning up residue from
    prior tests, which can cause prereq volume creation to time out.
    All API errors are swallowed: this is opportunistic cleanup only.
    """
    transient_statuses = ('creating', 'deleting', 'attaching', 'detaching',
                          'extending', 'migrating', 'retyping')
    # Clean snapshots first (they block volume deletion)
    try:
        snapshots = list(conn.block_storage.snapshots(all_projects=True))
        for snap in snapshots:
            if snap.status in transient_statuses or snap.status == 'error':
                try:
                    conn.block_storage.reset_snapshot_status(
                        snap.id, status='available')
                    conn.block_storage.delete_snapshot(snap.id)
                except Exception:
                    pass
    except Exception:
        pass
    # Clean volumes in transient states
    residual_ids = []
    try:
        volumes = list(conn.block_storage.volumes(all_projects=True))
        for vol in volumes:
            if vol.status in transient_statuses:
                try:
                    conn.block_storage.reset_volume_status(
                        vol.id, status='error',
                        attach_status='detached', migration_status=None)
                    conn.block_storage.delete_volume(vol.id, force=True)
                    residual_ids.append(vol.id)
                except Exception:
                    pass
    except Exception:
        pass
    if not residual_ids:
        return
    # Wait for all residual volumes to actually be deleted.
    # (The previous local `import time as _time` shadow import was
    # redundant: `time` is already imported at module level.)
    print(f" Draining {len(residual_ids)} residual volume(s)...")
    start = time.time()
    while residual_ids and (time.time() - start < timeout):
        time.sleep(3)
        remaining = []
        for vol_id in residual_ids:
            try:
                vol = conn.block_storage.get_volume(vol_id)
                if vol.status not in ('deleted',):
                    remaining.append(vol_id)
            except Exception:
                # 404 = deleted
                pass
        residual_ids = remaining
    if residual_ids:
        print(f" Warning: {len(residual_ids)} volume(s) not fully drained")
    else:
        print(f" Drained in {time.time() - start:.0f}s")
# --- Test Result ---
class TestResult:
    """Outcome of one integration test: name, verdict, detail, timing."""

    def __init__(self, name):
        self.name = name
        self.passed = False   # flipped to True only on success
        self.message = ""     # human-readable outcome detail
        self.duration = 0     # wall-clock seconds for the whole test

    def __str__(self):
        verdict = "PASS" if self.passed else "FAIL"
        return f"[{verdict}] {self.name} ({self.duration:.1f}s) - {self.message}"
# --- Tests ---
def test_01_volume_create_survives_sigterm():
    """SIGTERM during volume create -> volume reaches 'available'."""
    r = TestResult("volume_create_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None
    try:
        print("\n[01] Volume CREATE survives SIGTERM")
        vol = conn.block_storage.create_volume(
            size=1, name='gs-test-create', volume_type=SLOW_VOLUME_TYPE)
        vol_id = vol.id
        print(f" Volume {vol_id} creating...")
        # Let the slow driver get mid-operation before signalling.
        time.sleep(SIGTERM_DELAY)
        send_sigterm()
        # Graceful shutdown should let the in-flight create finish.
        vol, elapsed = wait_for_volume_status(conn, vol_id, 'available')
        if vol.status == 'available':
            r.passed = True
            r.message = f"available {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"ended in '{vol.status}'"
        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            # Best-effort restart so later tests start from a live service.
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if vol_id:
            cleanup_volume(conn, vol_id)
        r.duration = time.time() - start
    return r
def test_02_volume_delete_survives_sigterm():
    """SIGTERM during volume delete -> volume fully deleted."""
    r = TestResult("volume_delete_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None
    try:
        print("\n[02] Volume DELETE survives SIGTERM")
        vol = create_available_volume(conn, 'gs-test-delete')
        vol_id = vol.id
        print(f" Volume {vol_id} available, deleting...")
        conn.block_storage.delete_volume(vol_id)
        # Signal mid-delete; shutdown should still finish the delete.
        time.sleep(SIGTERM_DELAY)
        send_sigterm()
        deleted, elapsed = wait_for_volume_deleted(conn, vol_id)
        if deleted:
            r.passed = True
            r.message = f"deleted {elapsed:.1f}s after SIGTERM"
            vol_id = None  # No cleanup needed
        else:
            r.message = "NOT deleted (stuck in error/deleting)"
        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if vol_id:
            cleanup_volume(conn, vol_id)
        r.duration = time.time() - start
    return r
def test_03_snapshot_create_survives_sigterm():
    """SIGTERM during snapshot create -> snapshot reaches 'available'."""
    r = TestResult("snapshot_create_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None
    snap_id = None
    try:
        print("\n[03] Snapshot CREATE survives SIGTERM")
        vol = create_available_volume(conn, 'gs-test-snap-parent')
        vol_id = vol.id
        print(f" Volume {vol_id} available, creating snapshot...")
        snap = conn.block_storage.create_snapshot(
            volume_id=vol_id, name='gs-test-snap')
        snap_id = snap.id
        # Signal mid-snapshot-create.
        time.sleep(SIGTERM_DELAY)
        send_sigterm()
        snap, elapsed = wait_for_snapshot_status(conn, snap_id, 'available')
        if snap.status == 'available':
            r.passed = True
            r.message = f"available {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"ended in '{snap.status}'"
        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        # Delete snapshot before its parent volume; brief pause lets the
        # backend settle between the two deletes.
        if snap_id:
            cleanup_snapshot(conn, snap_id)
            time.sleep(5)
        if vol_id:
            cleanup_volume(conn, vol_id)
        r.duration = time.time() - start
    return r
def test_04_snapshot_delete_survives_sigterm():
    """SIGTERM during snapshot delete -> snapshot fully deleted."""
    r = TestResult("snapshot_delete_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None
    snap_id = None
    try:
        print("\n[04] Snapshot DELETE survives SIGTERM")
        vol = create_available_volume(conn, 'gs-test-snapdel-parent')
        vol_id = vol.id
        snap = create_available_snapshot(conn, vol_id, 'gs-test-snapdel')
        snap_id = snap.id
        print(f" Snapshot {snap_id} available, deleting...")
        conn.block_storage.delete_snapshot(snap_id)
        # Signal mid-snapshot-delete.
        time.sleep(SIGTERM_DELAY)
        send_sigterm()
        deleted, elapsed = wait_for_snapshot_deleted(conn, snap_id)
        if deleted:
            r.passed = True
            r.message = f"deleted {elapsed:.1f}s after SIGTERM"
            snap_id = None
        else:
            r.message = "NOT deleted (stuck)"
        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        # Snapshot first, then parent volume.
        if snap_id:
            cleanup_snapshot(conn, snap_id)
            time.sleep(5)
        if vol_id:
            cleanup_volume(conn, vol_id)
        r.duration = time.time() - start
    return r
def test_05_volume_extend_survives_sigterm():
    """SIGTERM during volume extend -> volume reaches 'available' at new size."""
    r = TestResult("volume_extend_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None
    try:
        print("\n[05] Volume EXTEND survives SIGTERM")
        vol = create_available_volume(conn, 'gs-test-extend', size=1)
        vol_id = vol.id
        print(f" Volume {vol_id} available (1GB), extending to 2GB...")
        conn.block_storage.extend_volume(vol_id, size=2)
        # Signal mid-extend.
        time.sleep(SIGTERM_DELAY)
        send_sigterm()
        vol, elapsed = wait_for_volume_status(conn, vol_id, 'available')
        if vol.status == 'available' and vol.size == 2:
            r.passed = True
            r.message = f"available at 2GB, {elapsed:.1f}s after SIGTERM"
        elif vol.status == 'available':
            # Still counts as a pass: volume survived, even if the size
            # reported did not update to 2GB.
            r.passed = True
            r.message = (f"available (size={vol.size}GB), "
                         f"{elapsed:.1f}s after SIGTERM")
        else:
            r.message = f"ended in '{vol.status}' size={vol.size}"
        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if vol_id:
            cleanup_volume(conn, vol_id)
        r.duration = time.time() - start
    return r
def test_06_volume_clone_survives_sigterm():
    """SIGTERM during volume clone -> cloned volume reaches 'available'."""
    r = TestResult("volume_clone_survives_sigterm")
    start = time.time()
    conn = get_connection()
    src_vol_id = None
    clone_vol_id = None
    try:
        print("\n[06] Volume CLONE survives SIGTERM")
        src_vol = create_available_volume(conn, 'gs-test-clone-src')
        src_vol_id = src_vol.id
        print(f" Source {src_vol_id} available, cloning...")
        # source_volid triggers create_cloned_volume in the driver.
        clone = conn.block_storage.create_volume(
            size=1, name='gs-test-clone-dst',
            volume_type=SLOW_VOLUME_TYPE,
            source_volid=src_vol_id)
        clone_vol_id = clone.id
        time.sleep(SIGTERM_DELAY)
        send_sigterm()
        clone, elapsed = wait_for_volume_status(
            conn, clone_vol_id, 'available')
        if clone.status == 'available':
            r.passed = True
            r.message = f"clone available {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"clone ended in '{clone.status}'"
        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        # Delete the clone before its source volume.
        if clone_vol_id:
            cleanup_volume(conn, clone_vol_id)
            time.sleep(5)
        if src_vol_id:
            cleanup_volume(conn, src_vol_id)
        r.duration = time.time() - start
    return r
def test_07_create_from_snapshot_survives_sigterm():
    """SIGTERM during create-from-snapshot -> volume reaches 'available'."""
    r = TestResult("create_from_snapshot_survives_sigterm")
    start = time.time()
    conn = get_connection()
    src_vol_id = None
    snap_id = None
    new_vol_id = None
    try:
        print("\n[07] Create from SNAPSHOT survives SIGTERM")
        src_vol = create_available_volume(conn, 'gs-test-fromsnap-src')
        src_vol_id = src_vol.id
        snap = create_available_snapshot(conn, src_vol_id, 'gs-test-fromsnap')
        snap_id = snap.id
        print(f" Snapshot {snap_id} available, creating volume from it...")
        # snapshot_id triggers create_volume_from_snapshot in the driver.
        new_vol = conn.block_storage.create_volume(
            size=1, name='gs-test-fromsnap-dst',
            volume_type=SLOW_VOLUME_TYPE,
            snapshot_id=snap_id)
        new_vol_id = new_vol.id
        time.sleep(SIGTERM_DELAY)
        send_sigterm()
        new_vol, elapsed = wait_for_volume_status(
            conn, new_vol_id, 'available')
        if new_vol.status == 'available':
            r.passed = True
            r.message = f"available {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"ended in '{new_vol.status}'"
        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        # Tear down in dependency order: new volume, snapshot, source.
        if new_vol_id:
            cleanup_volume(conn, new_vol_id)
            time.sleep(5)
        if snap_id:
            cleanup_snapshot(conn, snap_id)
            time.sleep(5)
        if src_vol_id:
            cleanup_volume(conn, src_vol_id)
        r.duration = time.time() - start
    return r
def test_08_copy_volume_to_image_survives_sigterm():
    """SIGTERM during copy_volume_to_image -> image upload completes."""
    r = TestResult("copy_volume_to_image_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None
    image_id = None
    try:
        print("\n[08] Copy volume to IMAGE survives SIGTERM")
        # Create volume on slow backend (will delay on copy_volume_to_image)
        vol = create_available_volume(conn, 'gs-test-upload-to-image')
        vol_id = vol.id
        print(f" Volume {vol_id} available, uploading to image...")
        # Upload volume to image (triggers copy_volume_to_image in driver)
        result = conn.block_storage.post(
            f'/volumes/{vol_id}/action',
            json={
                'os-volume_upload_image': {
                    'image_name': 'gs-test-image-upload',
                    'disk_format': 'raw',
                    'container_format': 'bare',
                }
            }
        )
        body = result.json()
        image_id = body.get('os-volume_upload_image', {}).get('image_id')
        print(f" Image {image_id} upload started")
        time.sleep(SIGTERM_DELAY)
        send_sigterm()
        # Wait for volume to return to 'available' (upload complete)
        vol, elapsed = wait_for_volume_status(conn, vol_id, 'available')
        if vol.status == 'available':
            r.passed = True
            r.message = f"volume back to 'available' {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"volume ended in '{vol.status}'"
        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        # Clean up image
        if image_id:
            try:
                conn.image.delete_image(image_id)
            except Exception:
                pass
        if vol_id:
            cleanup_volume(conn, vol_id)
        r.duration = time.time() - start
    return r
def test_09_migrate_volume_survives_sigterm():
    """SIGTERM during volume migration -> volume reaches 'available'."""
    r = TestResult("migrate_volume_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None
    try:
        print("\n[09] Volume MIGRATE survives SIGTERM")
        # Create volume on slow-lvm backend
        vol = create_available_volume(conn, 'gs-test-migrate')
        vol_id = vol.id
        print(f" Volume {vol_id} available on slow-lvm, migrating to "
              f"lvmdriver-1...")
        # Migrate to the other backend (host-level migration)
        dest_host = 'devstack@lvmdriver-1#lvmdriver-1'
        conn.block_storage.post(
            f'/volumes/{vol_id}/action',
            json={'os-migrate_volume': {'host': dest_host}}
        )
        time.sleep(SIGTERM_DELAY)
        send_sigterm()
        # Wait for volume to return to available (migration complete);
        # migration can take longer than a single driver delay.
        vol, elapsed = wait_for_volume_status(conn, vol_id, 'available',
                                              timeout=120)
        if vol.status == 'available':
            r.passed = True
            r.message = f"available {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"ended in '{vol.status}'"
        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if vol_id:
            cleanup_volume(conn, vol_id)
        r.duration = time.time() - start
    return r
def test_10_retype_volume_survives_sigterm():
    """SIGTERM during volume retype -> volume reaches 'available'."""
    r = TestResult("retype_volume_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None
    try:
        print("\n[10] Volume RETYPE survives SIGTERM")
        # Create volume on slow-lvm
        vol = create_available_volume(conn, 'gs-test-retype')
        vol_id = vol.id
        print(f" Volume {vol_id} available (slow-lvm), retyping to "
              f"lvmdriver-1...")
        # Retype to lvmdriver-1 (with migration)
        conn.block_storage.post(
            f'/volumes/{vol_id}/action',
            json={'os-retype': {'new_type': 'lvmdriver-1',
                                'migration_policy': 'on-demand'}}
        )
        time.sleep(SIGTERM_DELAY)
        send_sigterm()
        # Wait for volume to return to available; retype-with-migration
        # may exceed a single driver delay.
        vol, elapsed = wait_for_volume_status(conn, vol_id, 'available',
                                              timeout=120)
        if vol.status == 'available':
            r.passed = True
            r.message = f"available {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"ended in '{vol.status}'"
        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if vol_id:
            cleanup_volume(conn, vol_id)
        r.duration = time.time() - start
    return r
def test_11_manage_existing_survives_sigterm():
    """SIGTERM during manage_existing -> volume reaches 'available'.

    Creates an LV directly on the VG, then uses Cinder's manage API
    to import it as a volume. Sends SIGTERM mid-manage.
    """
    r = TestResult("manage_existing_survives_sigterm")
    start = time.time()
    conn = get_connection()
    vol_id = None
    lv_name = 'gs-test-manage-existing'
    try:
        print("\n[11] MANAGE EXISTING survives SIGTERM")
        # Create a raw LV on the VG that slow-lvm uses
        vg_name = 'stack-volumes-lvmdriver-1'
        subprocess.run(
            ['sudo', 'lvcreate', '-L', '1G', '-n', lv_name, vg_name],
            check=True, capture_output=True, timeout=10
        )
        print(f" Created LV {vg_name}/{lv_name}")
        # Manage it via Cinder API
        # For LVM driver, source-name is just the LV name (not VG/LV)
        result = conn.block_storage.post(
            '/os-volume-manage',
            json={
                'volume': {
                    'host': 'devstack@slow-lvm',
                    'name': 'gs-test-managed',
                    'volume_type': SLOW_VOLUME_TYPE,
                    'ref': {'source-name': lv_name},
                }
            }
        )
        body = result.json()
        vol_id = body.get('volume', {}).get('id')
        if not vol_id:
            # Manage was rejected before the driver got involved.
            r.message = f"Could not initiate manage: {body}"
            r.duration = time.time() - start
            return r
        print(f" Managing as volume {vol_id}...")
        time.sleep(SIGTERM_DELAY)
        send_sigterm()
        vol, elapsed = wait_for_volume_status(conn, vol_id, 'available',
                                              timeout=90)
        if vol.status == 'available':
            r.passed = True
            r.message = f"available {elapsed:.1f}s after SIGTERM"
        else:
            r.message = f"ended in '{vol.status}'"
        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
        # Clean up LV if manage failed
        subprocess.run(
            ['sudo', 'lvremove', '-f',
             f'stack-volumes-lvmdriver-1/{lv_name}'],
            capture_output=True
        )
    finally:
        if vol_id:
            cleanup_volume(conn, vol_id)
        r.duration = time.time() - start
    return r
def test_12_create_group_survives_sigterm():
    """SIGTERM during group create -> group reaches 'available'.

    Creates a group type, then creates a consistency group on slow-lvm.
    """
    r = TestResult("create_group_survives_sigterm")
    start = time.time()
    conn = get_connection()
    group_id = None
    group_type_id = None
    try:
        print("\n[12] CREATE GROUP survives SIGTERM")
        # Create a group type (avoid reserved CG migration types)
        result = conn.block_storage.get('/group_types',
                                        microversion='3.11')
        group_types = result.json().get('group_types', [])
        # Filter out reserved CG migration group types
        usable_types = [gt for gt in group_types
                        if 'cgsnapshot' not in (gt.get('name') or '').lower()
                        and 'migration' not in
                        (gt.get('description') or '').lower()
                        and (gt.get('name') or '') !=
                        'group_type_for_migration']
        if usable_types:
            group_type_id = usable_types[0]['id']
            print(f" Using existing group type {group_type_id}")
        else:
            result = conn.block_storage.post(
                '/group_types',
                json={'group_type': {
                    'name': 'gs-test-group-type',
                    'group_specs': {},
                }},
                microversion='3.11'
            )
            group_type_id = result.json()['group_type']['id']
            print(f" Created group type {group_type_id}")
        # Get slow-lvm volume type ID
        vol_types = list(conn.block_storage.types())
        slow_type_id = None
        for vt in vol_types:
            if vt.name == SLOW_VOLUME_TYPE:
                slow_type_id = vt.id
                break
        if not slow_type_id:
            r.message = "Could not find slow-lvm volume type"
            r.duration = time.time() - start
            return r
        # Create a group
        result = conn.block_storage.post(
            '/groups',
            json={'group': {
                'name': 'gs-test-group',
                'group_type': group_type_id,
                'volume_types': [slow_type_id],
            }},
            microversion='3.13'
        )
        body = result.json()
        group_id = body.get('group', {}).get('id')
        print(f" Group {group_id} creating...")
        time.sleep(SIGTERM_DELAY)
        send_sigterm()
        # Wait for group to reach available; the while/else fires only
        # when the loop exhausts without a break (timeout case).
        start_wait = time.time()
        final_status = None
        while time.time() - start_wait < MAX_WAIT_AFTER_SIGTERM:
            result = conn.block_storage.get(f'/groups/{group_id}',
                                            microversion='3.13')
            group = result.json().get('group', {})
            final_status = group.get('status')
            if final_status == 'available':
                elapsed = time.time() - start_wait
                r.passed = True
                r.message = f"available {elapsed:.1f}s after SIGTERM"
                break
            if final_status == 'error':
                r.message = "group ended in 'error'"
                break
            time.sleep(2)
        else:
            r.message = f"timed out (last status: {final_status})"
        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        # Cleanup group
        if group_id:
            try:
                conn.block_storage.post(
                    f'/groups/{group_id}/action',
                    json={'delete': {'delete-volumes': False}},
                    microversion='3.13'
                )
            except Exception:
                pass
        r.duration = time.time() - start
    return r
def test_13_multiple_ops_complete_before_exit():
    """SIGTERM with 3 concurrent creates -> all reach 'available'."""
    r = TestResult("multiple_concurrent_ops_survive_sigterm")
    start = time.time()
    conn = get_connection()
    vol_ids = []
    try:
        print("\n[13] Multiple concurrent creates survive SIGTERM")
        # Kick off three slow creates back to back so all are in flight
        # when the signal lands.
        for i in range(3):
            vol = conn.block_storage.create_volume(
                size=1, name=f'gs-test-multi-{i}',
                volume_type=SLOW_VOLUME_TYPE)
            vol_ids.append(vol.id)
            print(f" Created {vol.id}")
        time.sleep(SIGTERM_DELAY)
        send_sigterm()
        statuses = {}
        for vid in vol_ids:
            try:
                v, elapsed = wait_for_volume_status(
                    conn, vid, 'available', timeout=120)
                statuses[vid] = v.status
            except TimeoutError:
                statuses[vid] = 'timeout'
        all_good = all(s == 'available' for s in statuses.values())
        if all_good:
            r.passed = True
            r.message = f"All {len(vol_ids)} volumes available after SIGTERM"
        else:
            r.message = f"Results: {statuses}"
        restart_cinder_volume()
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        for vid in vol_ids:
            cleanup_volume(conn, vid)
        r.duration = time.time() - start
    return r
def test_14_service_heartbeat_stops_during_drain():
    """After SIGTERM, service stops heartbeating (appears 'down')."""
    r = TestResult("service_heartbeat_stops_during_drain")
    start = time.time()
    conn = get_connection()
    vol_id = None
    try:
        print("\n[14] Service heartbeat stops during drain")
        # Verify service is up
        services = list(conn.block_storage.services())
        slow_svc = [s for s in services if 'slow-lvm' in (s.host or '')]
        if not slow_svc or slow_svc[0].state != 'up':
            r.message = "slow-lvm not 'up' initially - skipping"
            r.duration = time.time() - start
            return r
        print(" Service is 'up'")
        # Start slow op
        vol = conn.block_storage.create_volume(
            size=1, name='gs-test-heartbeat', volume_type=SLOW_VOLUME_TYPE)
        vol_id = vol.id
        time.sleep(SIGTERM_DELAY)
        send_sigterm()
        # The service_down_time is typically 25s. The service should stop
        # heartbeating immediately, but the scheduler won't mark it down
        # until service_down_time elapses. Check after the op completes.
        vol, _ = wait_for_volume_status(conn, vol_id, 'available')
        # After operation completes and process exits, service should be down
        time.sleep(5)
        services = list(conn.block_storage.services())
        slow_svc = [s for s in services if 'slow-lvm' in (s.host or '')]
        state_after = slow_svc[0].state if slow_svc else 'unknown'
        print(f" Service state after drain: {state_after}")
        # Restart and verify recovery
        restart_cinder_volume()
        time.sleep(10)
        services = list(conn.block_storage.services())
        slow_svc = [s for s in services if 'slow-lvm' in (s.host or '')]
        state_recovered = slow_svc[0].state if slow_svc else 'unknown'
        if vol.status == 'available' and state_recovered == 'up':
            r.passed = True
            r.message = (f"Op completed, service was '{state_after}' "
                         f"during drain, recovered to 'up'")
        else:
            r.message = (f"vol={vol.status}, state_after={state_after}, "
                         f"recovered={state_recovered}")
    except Exception as e:
        r.message = f"Exception: {e}"
        try:
            restart_cinder_volume()
        except Exception:
            pass
    finally:
        if vol_id:
            cleanup_volume(conn, vol_id)
        r.duration = time.time() - start
    return r
# --- Main ---
def main():
    """Run every SIGTERM test in order and print a final report.

    Returns 0 when all tests pass, 1 otherwise (used as process
    exit status by the entry guard).
    """
    print("=" * 72)
    print(" CINDER GRACEFUL SHUTDOWN - LIVE SIGTERM INTEGRATION TESTS")
    print("=" * 72)
    print(f" Auth: {AUTH_URL}")
    print(f" Slow backend: {SLOW_VOLUME_TYPE} (15s delay)")
    print(f" SIGTERM after: {SIGTERM_DELAY}s into operation")
    print(f" Wait timeout: {MAX_WAIT_AFTER_SIGTERM}s")
    print("=" * 72)
    tests = [
        test_01_volume_create_survives_sigterm,
        test_02_volume_delete_survives_sigterm,
        test_03_snapshot_create_survives_sigterm,
        test_04_snapshot_delete_survives_sigterm,
        test_05_volume_extend_survives_sigterm,
        test_06_volume_clone_survives_sigterm,
        test_07_create_from_snapshot_survives_sigterm,
        test_08_copy_volume_to_image_survives_sigterm,
        test_09_migrate_volume_survives_sigterm,
        test_10_retype_volume_survives_sigterm,
        test_11_manage_existing_survives_sigterm,
        test_12_create_group_survives_sigterm,
        test_13_multiple_ops_complete_before_exit,
        test_14_service_heartbeat_stops_during_drain,
    ]
    conn = get_connection()
    results = []
    for test_func in tests:
        # Reset and delete any residual volumes from prior test's SIGTERM
        # so init_host doesn't queue them up on the slow backend
        drain_residual_volumes(conn)
        try:
            result = test_func()
        except Exception as e:
            # Tests normally catch their own exceptions; this guards
            # against anything that escapes so the suite keeps running.
            result = TestResult(test_func.__name__)
            result.message = f"Unhandled: {e}"
        results.append(result)
        print(f"\n => {result}")
    # --- Report ---
    print("\n\n")
    print("=" * 72)
    print(" FINAL REPORT")
    print("=" * 72)
    passed = sum(1 for r in results if r.passed)
    failed = sum(1 for r in results if not r.passed)
    total = len(results)
    total_time = sum(r.duration for r in results)
    print(f"\n Total: {total} | Passed: {passed} | "
          f"Failed: {failed} | Duration: {total_time:.0f}s\n")
    print("-" * 72)
    for r in results:
        s = "PASS" if r.passed else "FAIL"
        print(f" [{s}] {r.name}")
        print(f" {r.message}")
        print()
    print("=" * 72)
    if failed == 0:
        print(" ALL TESTS PASSED")
    else:
        print(f" {failed} TEST(S) FAILED")
    print("=" * 72)
    return 0 if failed == 0 else 1
if __name__ == '__main__':
    # Use sys.exit rather than the builtin exit(): the builtin is injected
    # by the `site` module and is absent under `python -S` or when frozen.
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment