Skip to content

Instantly share code, notes, and snippets.

@nh2
Created May 10, 2026 14:32
Show Gist options
  • Select an option

  • Save nh2/559d40f45a5e1e1cb4201ae7108674ce to your computer and use it in GitHub Desktop.

Select an option

Save nh2/559d40f45a5e1e1cb4201ae7108674ce to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# Made for: https://github.com/TheJJ/ceph-balancer/issues/68
"""
Compute the exact counts involved in the jj-balancer's 0-move situation.
Key discovery: pg_num = 2048 (not 512 as I initially assumed).
Verification: 12288 total shards / 6 (pool size) = 2048 PGs.
The -vvv log confirms: ideal_14T = 153.623 = 12288 / 1168.69 * 14.61089.
"""
# ---------------------------------------------------------------------------
# Cluster model for the EC pool under analysis.
# Every figure here is a literal recorded for this one cluster; the names
# defined below are read by the report sections further down the script.
# ---------------------------------------------------------------------------

# Pool geometry: an EC 4+2 pool stores 6 shards per PG.
# pg_num is 2048 (an earlier revision of this analysis wrongly assumed 512).
pool_size, pg_num = 6, 2048
total_shards = pg_num * pool_size  # 12288

# Candidate OSDs by size class: (count, per-OSD weight).
n_14T, w_14T = 69, 14.61089  # the 14.61089 TiB OSDs
n_20T, w_20T = 8, 20.06789  # the 20.06789 TiB OSDs (DC2)

# Combined weight of every candidate OSD.
total_weight = n_14T * w_14T + n_20T * w_20T

# Weight-proportional ideal: shards per unit of weight, scaled per size class.
pgs_per_weight = total_shards / total_weight
ideal_14T, ideal_20T = pgs_per_weight * w_14T, pgs_per_weight * w_20T
# Report header: pool geometry, CRUSH rule summary, and the per-OSD ideal
# shard counts, emitted as one newline-separated print call.
# NOTE(review): the text below says "6 of 8 HDD datacenters", but the
# `datacenters` table in this script lists 7 entries -- confirm the "8".
print(
    "=" * 80,
    "ANALYSIS: Why jj-balancer produces 0 moves on this EC(4+2) cluster",
    "=" * 80,
    "",
    f"Pool 3 (myprojfs_data_ec): EC(4+2), size={pool_size}, pg_num={pg_num}",
    f"Total shards = size * pg_num = {pool_size} * {pg_num} = {total_shards}",
    "CRUSH rule: chooseleaf_indep 6 type datacenter (from root default~hdd)",
    " -> Each PG picks 6 of 8 HDD datacenters, 1 OSD per DC",
    "",
    f"Candidate HDD OSDs: {n_14T + n_20T} (69 x 14.61 TiB + 8 x 20.07 TiB)",
    f"Total candidate CRUSH weight: {total_weight:.5f}",
    f"pgs_per_weight = {total_shards} / {total_weight:.5f} = {pgs_per_weight:.6f}",
    "",
    "Per-OSD ideal shard count (pool 3):",
    # The "log shows" figures are literals copied into the message, not computed.
    f" 14.61 TiB OSD: {ideal_14T:.5f} (log shows: 153.62321701757597)",
    f" 20.07 TiB OSD: {ideal_20T:.5f} (log shows: 210.99975322340745)",
    "",
    sep="\n",
)
# Per-datacenter OSD inventory for the pool.
# Each row is (datacenter name, per-OSD weight, [(osd id, shard count), ...]).
# The shard counts are described by the author as the PGS column of
# `ceph osd df` for the HDD OSDs.
_DC_ROWS = (
    ("HEL1-DC10", w_14T, [
        (88, 168), (89, 186), (90, 168), (91, 162), (92, 163),
        (93, 176), (94, 167), (95, 175), (96, 154), (97, 173),
    ]),
    ("HEL1-DC2", w_20T, [
        (78, 211), (79, 219), (80, 215), (81, 211),
        (82, 233), (83, 215), (84, 231), (85, 211),
    ]),
    ("HEL1-DC3", w_14T, [
        (26, 193), (27, 194), (28, 192), (29, 204), (30, 196),
        (31, 194), (32, 164), (34, 196), (35, 176),
    ]),
    ("HEL1-DC4", w_14T, [
        (14, 106), (15, 103), (16, 104), (17, 102), (18, 103),
        (19, 108), (20, 101), (21, 100), (22, 100), (23, 102),
        (124, 112), (125, 100), (126, 100), (127, 108), (128, 100),
        (129, 100), (130, 100), (131, 99), (132, 99), (133, 99),
    ]),
    ("HEL1-DC6", w_14T, [
        (100, 174), (101, 176), (102, 161), (103, 160), (104, 174),
        (105, 162), (106, 192), (107, 169), (108, 168), (109, 177),
    ]),
    ("HEL1-DC8", w_14T, [
        (2, 181), (3, 181), (4, 168), (5, 162), (6, 163),
        (7, 166), (8, 166), (9, 159), (10, 162), (11, 164),
    ]),
    ("HEL1-DC9", w_14T, [
        (112, 187), (113, 171), (114, 164), (115, 183), (116, 164),
        (117, 171), (118, 161), (119, 158), (120, 181), (121, 170),
    ]),
)

# Mapping consumed by the report sections below:
#   name -> {"osds": [(osd id, shard count), ...], "weight_per_osd": weight}
datacenters = {
    dc_id: {"osds": osd_list, "weight_per_osd": osd_weight}
    for dc_id, osd_weight, osd_list in _DC_ROWS
}
# Per-datacenter table: for every datacenter (in name order) print its OSD
# count, total weight, actual shard count, weight-proportional ideal, the
# deviation from that ideal, and the per-DC ceiling.
print("=" * 80)
print("Per-datacenter shard distribution")
print("=" * 80)
print()
print(f"{'Datacenter':<12} {'#OSDs':>5} {'DC Weight':>10} {'Shards':>7} {'DC Ideal':>9} {'Dev':>7} {'Ceiling':>7}")
print("-" * 65)
for dc_name in sorted(datacenters):
    dc = datacenters[dc_name]
    n_osds = len(dc["osds"])
    dc_weight = n_osds * dc["weight_per_osd"]
    dc_shards = sum(pgs for _, pgs in dc["osds"])
    dc_ideal = dc_weight * pgs_per_weight
    dc_dev = dc_shards - dc_ideal
    ceiling = pg_num  # max 1 shard per PG per DC
    print(f"{dc_name:<12} {n_osds:>5} {dc_weight:>10.2f} {dc_shards:>7} {dc_ideal:>9.1f} {dc_dev:>+7.1f} {ceiling:>7}")

# Totals row, summed over all datacenters in the table.
total_actual = sum(sum(pgs for _, pgs in dc["osds"]) for dc in datacenters.values())
total_ideal = sum(len(dc["osds"]) * dc["weight_per_osd"] * pgs_per_weight for dc in datacenters.values())
print(f"{'TOTAL':<12} {n_14T+n_20T:>5} {total_weight:>10.2f} {total_actual:>7} {total_ideal:>9.1f}")
print()
# Key insight: compare DC4's weight-proportional ideal with the most shards
# it can hold (one per PG, i.e. pg_num).
# DC4's OSD count and per-OSD weight are read from the `datacenters` table
# rather than repeated as literals, so this section cannot drift from it.
_dc4 = datacenters["HEL1-DC4"]
_dc4_osd_count = len(_dc4["osds"])
_dc4_osd_weight = _dc4["weight_per_osd"]
_dc4_weight = _dc4_osd_count * _dc4_osd_weight

dc4_shards = sum(pgs for _, pgs in _dc4["osds"])
dc4_ideal = _dc4_weight * pgs_per_weight
dc4_ceiling = pg_num

print("=" * 80)
print("KEY INSIGHT: DC4's ideal exceeds the CRUSH ceiling")
print("=" * 80)
print()
print(f"DC4 has {_dc4_osd_count} OSDs x {_dc4_osd_weight} weight = {_dc4_weight:.2f} total weight")
print(f"DC4's weight-proportional ideal: {dc4_ideal:.1f} shards")
print(f"DC4's CRUSH ceiling (1 shard/PG max): {dc4_ceiling} shards")
print(f"DC4's actual shards: {dc4_shards}")
print()
print(f"DC4's ideal ({dc4_ideal:.0f}) exceeds the ceiling ({dc4_ceiling})!")
print("DC4 can NEVER reach its weight-proportional ideal.")
print()
print(f"Deficit: DC4 is {dc4_ideal - dc4_shards:.0f} shards below its 'ideal'")
print(f" (but can only ever get up to {dc4_ceiling}, still {dc4_ideal - dc4_ceiling:.0f} short)")
print()
# Per-OSD view: a handful of hand-picked OSDs, each shown with its actual
# shard count, its ideal count, the deviation, and how that compares to ideal.
print("=" * 80)
print("Per-OSD view: the deadlock")
print("=" * 80)
print()
print(f"{'OSD':<8} {'DC':<12} {'Actual':>6} {'Ideal':>7} {'Dev':>7} {'jj-balancer sees'}")
print("-" * 75)

# (osd name, datacenter, actual shard count, ideal shard count for its size class)
examples = [
    ("osd.29", "HEL1-DC3", 204, ideal_14T),
    ("osd.34", "HEL1-DC3", 196, ideal_14T),
    ("osd.131", "HEL1-DC4", 99, ideal_14T),
    ("osd.21", "HEL1-DC4", 100, ideal_14T),
    ("osd.96", "HEL1-DC10", 154, ideal_14T),
    ("osd.9", "HEL1-DC8", 159, ideal_14T),
    ("osd.82", "HEL1-DC2", 233, ideal_20T),
]
for name, dc, actual, ideal in examples:
    dev = actual - ideal
    # Strictly above ideal is labelled as blocked; anything else as a valid target.
    label = (
        "above ideal -> blocked as target"
        if actual > ideal
        else "below ideal -> valid target"
    )
    print(f"{name:<8} {dc:<12} {actual:>6} {ideal:>7.1f} {dev:>+7.1f} {label}")
print()
# Narrative walk-through of why no move is possible. All figures in this
# section are literals describing this one cluster.
print("=" * 80)
print("THE DEADLOCK EXPLAINED")
print("=" * 80)
print()

# Step 1: an over-full source exists.
print("1. Source: osd.29 (DC3) has 204 shards > ideal 153.6 -> valid source")
print()

# Step 2: every OSD that is below ideal sits in DC4.
print("2. Target candidates below ideal (valid by count):")
print(" ALL are in DC4: osd.131(99), osd.132(99), osd.133(99), osd.21(100)...")
print()

# Step 3: DC4 cannot take the shard because the PG already has one there.
print("3. CRUSH check for moving osd.29's shard to DC4:")
print(" PG 3.4f9 acting set = [95, 29, 126, 109, 8, 115]")
print(" osd.126 is already in DC4!")
print(" -> Cannot place another shard in DC4. CRUSH VIOLATION.")
print()
# The share of PGs that already include DC4 follows from this script's own
# data: the HEL1-DC4 entries of the `datacenters` table sum to 2046 shards,
# and with at most one shard per PG per datacenter that is 2046 of the
# pg_num = 2048 PGs. The previous wording ("~75%") contradicted that figure.
print(" This happens for nearly ALL PGs on DC3 OSDs, because DC4 already")
print(" holds a shard of 2046 of the 2048 PGs (~99.9%).")
print()

# Step 4: every OSD outside DC4 is already at or above its ideal count.
print("4. Fallback targets in other DCs:")
print(" ALL have actual >= ideal (154+ >= 153.6)")
print(" -> Blocked by jj-balancer's destination ideal-count guard")
print()

# Step 5: conclusion.
print("5. Result: 0 valid moves. The balancer is stuck.")
print()
# Root-cause section: restates the ideal formula, then quantifies how many
# shards the datacenters other than DC4 hold above their combined ideal.

# Aggregates over every datacenter except DC4 (table order is preserved).
_other_dcs = [dc for name, dc in datacenters.items() if name != "HEL1-DC4"]
non_dc4_dcs = len(_other_dcs)
non_dc4_osds = sum(len(dc["osds"]) for dc in _other_dcs)
non_dc4_shards = total_shards - dc4_shards
non_dc4_ideal_sum = sum(
    len(dc["osds"]) * dc["weight_per_osd"] * pgs_per_weight
    for dc in _other_dcs
)
_dc4_shortfall = dc4_ideal - dc4_shards
_non_dc4_excess = non_dc4_shards - non_dc4_ideal_sum

print(
    "=" * 80,
    "ROOT CAUSE",
    "=" * 80,
    "",
    "The jj-balancer's `pool_pg_shard_count_ideal()` computes:",
    f" ideal = {total_shards} / {total_weight:.2f} * osd_weight",
    "",
    "This formula assumes shards can be distributed proportionally to weight",
    "across ALL candidate OSDs. But the CRUSH rule constrains placement to",
    "at most 1 shard per PG per datacenter. When a DC's weight-proportional",
    "share exceeds the ceiling (pg_num), the formula's 'ideal' becomes",
    "unachievable, and the excess must be absorbed by other DCs -- pushing",
    "their OSDs above 'ideal' and blocking them as targets.",
    "",
    sep="\n",
)
print(
    "In this cluster:",
    f" DC4 ideal = {dc4_ideal:.0f} shards, ceiling = {dc4_ceiling}, actual = {dc4_shards}",
    f" Shortfall = {_dc4_shortfall:.0f} shards that other DCs must absorb",
    f" Non-DC4: {non_dc4_osds} OSDs in {non_dc4_dcs} DCs, holding {non_dc4_shards} shards, combined ideal = {non_dc4_ideal_sum:.1f}",
    f" Excess above combined ideal: {non_dc4_shards} - {non_dc4_ideal_sum:.1f} = {_non_dc4_excess:.1f}",
    f" Per OSD on average: {_non_dc4_excess / non_dc4_osds:.1f} shards above ideal",
    # The OSD id and count quoted here are literals, not looked up in the table.
    f" Even the least-loaded non-DC4 14T OSD (osd.96 = 154) exceeds ideal ({ideal_14T:.1f})",
    "",
    sep="\n",
)
# Closing section: the recommended jj-balancer flag and why it is safe here.
# Emitted line by line from a fixed tuple of strings.
for _solution_line in (
    "=" * 80,
    "SOLUTION: --ignore-ideal-pgcounts destination",
    "=" * 80,
    "",
    "This flag disables the destination ideal-count guard, allowing moves to",
    "OSDs that are slightly above ideal by count but still less full by bytes.",
    "The built-in Ceph balancer won't fight back because it also can't move",
    "shards out of those DCs (same CRUSH deadlock applies to it too).",
):
    print(_solution_line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment