Created
May 10, 2026 14:32
-
-
Save nh2/559d40f45a5e1e1cb4201ae7108674ce to your computer and use it in GitHub Desktop.
Analysis script for https://github.com/TheJJ/ceph-balancer/issues/68
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Made for: https://github.com/TheJJ/ceph-balancer/issues/68
"""
Compute the exact counts involved in the jj-balancer's 0-move situation.

Key discovery: pg_num = 2048 (not 512 as I initially assumed).
Verification: 12288 total shards / 6 (pool size) = 2048 PGs.
The -vvv log confirms: ideal_14T = 153.623 = 12288 / 1168.69 * 14.61089.
"""
import sys

# Name of the datacenter whose weight-proportional share is singled out below.
DC4_NAME = "HEL1-DC4"

# Pool parameters (corrected)
pool_size = 6  # EC 4+2
pg_num = 2048  # corrected from initial wrong assumption of 512
total_shards = pool_size * pg_num  # 12288

# OSD counts and weights
n_14T = 69  # 14.61089 TiB OSDs
n_20T = 8  # 20.06789 TiB OSDs (DC2)
w_14T = 14.61089
w_20T = 20.06789

total_weight = n_14T * w_14T + n_20T * w_20T
pgs_per_weight = total_shards / total_weight
ideal_14T = pgs_per_weight * w_14T
ideal_20T = pgs_per_weight * w_20T


def _banner(title):
    """Print *title* framed by two 80-column '=' rules, then a blank line."""
    rule = "=" * 80
    print(rule)
    print(title)
    print(rule)
    print()


def _dc_shards(dc):
    """Return the sum of the per-OSD shard counts of one `datacenters` entry."""
    return sum(pgs for _, pgs in dc["osds"])


def _dc_ideal(dc):
    """Return the weight-proportional shard count of one `datacenters` entry."""
    return len(dc["osds"]) * dc["weight_per_osd"] * pgs_per_weight


_banner("ANALYSIS: Why jj-balancer produces 0 moves on this EC(4+2) cluster")
print(f"Pool 3 (myprojfs_data_ec): EC(4+2), size={pool_size}, pg_num={pg_num}")
print(f"Total shards = size * pg_num = {pool_size} * {pg_num} = {total_shards}")
print("CRUSH rule: chooseleaf_indep 6 type datacenter (from root default~hdd)")
# NOTE(review): the `datacenters` table below lists 7 datacenters, while this
# line says 8 -- confirm against the real CRUSH tree before relying on it.
print(" -> Each PG picks 6 of 8 HDD datacenters, 1 OSD per DC")
print()
print(f"Candidate HDD OSDs: {n_14T + n_20T} (69 x 14.61 TiB + 8 x 20.07 TiB)")
print(f"Total candidate CRUSH weight: {total_weight:.5f}")
print(f"pgs_per_weight = {total_shards} / {total_weight:.5f} = {pgs_per_weight:.6f}")
print()
print("Per-OSD ideal shard count (pool 3):")
print(f" 14.61 TiB OSD: {ideal_14T:.5f} (log shows: 153.62321701757597)")
print(f" 20.07 TiB OSD: {ideal_20T:.5f} (log shows: 210.99975322340745)")
print()

# Datacenter data with per-pool shard counts (= PGS from ceph osd df for HDD OSDs)
# Each "osds" entry is an (osd_id, shard_count) pair.
datacenters = {
    "HEL1-DC10": {
        "osds": [
            (88, 168), (89, 186), (90, 168), (91, 162), (92, 163),
            (93, 176), (94, 167), (95, 175), (96, 154), (97, 173),
        ],
        "weight_per_osd": w_14T,
    },
    "HEL1-DC2": {
        "osds": [
            (78, 211), (79, 219), (80, 215), (81, 211),
            (82, 233), (83, 215), (84, 231), (85, 211),
        ],
        "weight_per_osd": w_20T,
    },
    "HEL1-DC3": {
        "osds": [
            (26, 193), (27, 194), (28, 192), (29, 204), (30, 196),
            (31, 194), (32, 164), (34, 196), (35, 176),
        ],
        "weight_per_osd": w_14T,
    },
    "HEL1-DC4": {
        "osds": [
            (14, 106), (15, 103), (16, 104), (17, 102), (18, 103),
            (19, 108), (20, 101), (21, 100), (22, 100), (23, 102),
            (124, 112), (125, 100), (126, 100), (127, 108), (128, 100),
            (129, 100), (130, 100), (131, 99), (132, 99), (133, 99),
        ],
        "weight_per_osd": w_14T,
    },
    "HEL1-DC6": {
        "osds": [
            (100, 174), (101, 176), (102, 161), (103, 160), (104, 174),
            (105, 162), (106, 192), (107, 169), (108, 168), (109, 177),
        ],
        "weight_per_osd": w_14T,
    },
    "HEL1-DC8": {
        "osds": [
            (2, 181), (3, 181), (4, 168), (5, 162), (6, 163),
            (7, 166), (8, 166), (9, 159), (10, 162), (11, 164),
        ],
        "weight_per_osd": w_14T,
    },
    "HEL1-DC9": {
        "osds": [
            (112, 187), (113, 171), (114, 164), (115, 183), (116, 164),
            (117, 171), (118, 161), (119, 158), (120, 181), (121, 170),
        ],
        "weight_per_osd": w_14T,
    },
}

total_actual = sum(_dc_shards(dc) for dc in datacenters.values())
total_ideal = sum(_dc_ideal(dc) for dc in datacenters.values())

# The table above is hand-entered while the totals are derived from the pool
# parameters; later sections mix both sources, so surface any disagreement
# instead of silently printing numbers that do not add up.
_table_osd_count = sum(len(dc["osds"]) for dc in datacenters.values())
if _table_osd_count != n_14T + n_20T:
    print(
        f"WARNING: table lists {_table_osd_count} OSDs, "
        f"parameters say {n_14T + n_20T}",
        file=sys.stderr,
    )
if total_actual != total_shards:
    print(
        f"WARNING: table shards sum to {total_actual}, "
        f"size * pg_num is {total_shards}",
        file=sys.stderr,
    )

_banner("Per-datacenter shard distribution")
print(f"{'Datacenter':<12} {'#OSDs':>5} {'DC Weight':>10} {'Shards':>7} {'DC Ideal':>9} {'Dev':>7} {'Ceiling':>7}")
print("-" * 65)
ceiling = pg_num  # max 1 shard per PG per DC
for dc_name, dc in sorted(datacenters.items()):
    n_osds = len(dc["osds"])
    dc_weight = n_osds * dc["weight_per_osd"]
    dc_shards = _dc_shards(dc)
    dc_ideal = dc_weight * pgs_per_weight
    dc_dev = dc_shards - dc_ideal
    print(f"{dc_name:<12} {n_osds:>5} {dc_weight:>10.2f} {dc_shards:>7} {dc_ideal:>9.1f} {dc_dev:>+7.1f} {ceiling:>7}")
print(f"{'TOTAL':<12} {n_14T+n_20T:>5} {total_weight:>10.2f} {total_actual:>7} {total_ideal:>9.1f}")
print()

# Key insight
# DC4's OSD count and per-OSD weight are read from the table so that the
# figures below cannot drift away from the data they describe.
_dc4 = datacenters[DC4_NAME]
dc4_n_osds = len(_dc4["osds"])
dc4_osd_weight = _dc4["weight_per_osd"]
dc4_shards = _dc_shards(_dc4)
dc4_ideal = dc4_n_osds * dc4_osd_weight * pgs_per_weight
dc4_ceiling = pg_num

_banner("KEY INSIGHT: DC4's ideal exceeds the CRUSH ceiling")
print(f"DC4 has {dc4_n_osds} OSDs x {dc4_osd_weight} weight = {dc4_n_osds * dc4_osd_weight:.2f} total weight")
print(f"DC4's weight-proportional ideal: {dc4_ideal:.1f} shards")
print(f"DC4's CRUSH ceiling (1 shard/PG max): {dc4_ceiling} shards")
print(f"DC4's actual shards: {dc4_shards}")
print()
print(f"DC4's ideal ({dc4_ideal:.0f}) exceeds the ceiling ({dc4_ceiling})!")
print("DC4 can NEVER reach its weight-proportional ideal.")
print()
print(f"Deficit: DC4 is {dc4_ideal - dc4_shards:.0f} shards below its 'ideal'")
print(f" (but can only ever get up to {dc4_ceiling}, still {dc4_ideal - dc4_ceiling:.0f} short)")
print()

# Per-OSD view
_banner("Per-OSD view: the deadlock")
print(f"{'OSD':<8} {'DC':<12} {'Actual':>6} {'Ideal':>7} {'Dev':>7} {'jj-balancer sees'}")
print("-" * 75)

# osd_id -> (datacenter name, shard count, per-OSD ideal), built from the table
# so the example rows reuse the table's numbers instead of re-typing them.
_osd_index = {}
for _name, _dc in datacenters.items():
    _ideal = pgs_per_weight * _dc["weight_per_osd"]
    for _osd_id, _pgs in _dc["osds"]:
        _osd_index[_osd_id] = (_name, _pgs, _ideal)

# Rows are (osd name, datacenter name, actual shards, ideal shards).
examples = [
    (f"osd.{osd_id}", *_osd_index[osd_id])
    for osd_id in (29, 34, 131, 21, 96, 9, 82)
]
for name, dc_label, actual, ideal in examples:
    dev = actual - ideal
    label = (
        "above ideal -> blocked as target"
        if actual > ideal
        else "below ideal -> valid target"
    )
    print(f"{name:<8} {dc_label:<12} {actual:>6} {ideal:>7.1f} {dev:>+7.1f} {label}")
print()

_banner("THE DEADLOCK EXPLAINED")
print("1. Source: osd.29 (DC3) has 204 shards > ideal 153.6 -> valid source")
print()
print("2. Target candidates below ideal (valid by count):")
print(" ALL are in DC4: osd.131(99), osd.132(99), osd.133(99), osd.21(100)...")
print()
print("3. CRUSH check for moving osd.29's shard to DC4:")
print(" PG 3.4f9 acting set = [95, 29, 126, 109, 8, 115]")
print(" osd.126 is already in DC4!")
print(" -> Cannot place another shard in DC4. CRUSH VIOLATION.")
print()
# NOTE(review): the table shows DC4 holding 2046 of a possible 2048 shards,
# i.e. nearly every PG already includes DC4 -- the "~75%" figure below follows
# from the "6-of-8" wording and should be confirmed.
print(" This happens for nearly ALL PGs on DC3 OSDs, because with")
print(" 6-of-8 DC selection, ~75% of PGs already include DC4.")
print()
print("4. Fallback targets in other DCs:")
print(" ALL have actual >= ideal (154+ >= 153.6)")
print(" -> Blocked by jj-balancer's destination ideal-count guard")
print()
print("5. Result: 0 valid moves. The balancer is stuck.")
print()

_banner("ROOT CAUSE")
print("The jj-balancer's `pool_pg_shard_count_ideal()` computes:")
print(f" ideal = {total_shards} / {total_weight:.2f} * osd_weight")
print()
print("This formula assumes shards can be distributed proportionally to weight")
print("across ALL candidate OSDs. But the CRUSH rule constrains placement to")
print("at most 1 shard per PG per datacenter. When a DC's weight-proportional")
print("share exceeds the ceiling (pg_num), the formula's 'ideal' becomes")
print("unachievable, and the excess must be absorbed by other DCs -- pushing")
print("their OSDs above 'ideal' and blocking them as targets.")
print()

_other_dcs = [dc for name, dc in datacenters.items() if name != DC4_NAME]
non_dc4_osds = sum(len(dc["osds"]) for dc in _other_dcs)
non_dc4_dcs = len(_other_dcs)
non_dc4_shards = total_shards - dc4_shards
non_dc4_ideal_sum = sum(_dc_ideal(dc) for dc in _other_dcs)
non_dc4_excess = non_dc4_shards - non_dc4_ideal_sum

# Least-loaded 14T OSD outside DC4, computed from the table rather than quoted.
_least_id, _least_pgs = min(
    (
        osd
        for dc in _other_dcs
        if dc["weight_per_osd"] == w_14T
        for osd in dc["osds"]
    ),
    key=lambda osd: osd[1],
)

print("In this cluster:")
print(f" DC4 ideal = {dc4_ideal:.0f} shards, ceiling = {dc4_ceiling}, actual = {dc4_shards}")
print(f" Shortfall = {dc4_ideal - dc4_shards:.0f} shards that other DCs must absorb")
print(f" Non-DC4: {non_dc4_osds} OSDs in {non_dc4_dcs} DCs, holding {non_dc4_shards} shards, combined ideal = {non_dc4_ideal_sum:.1f}")
print(f" Excess above combined ideal: {non_dc4_shards} - {non_dc4_ideal_sum:.1f} = {non_dc4_excess:.1f}")
print(f" Per OSD on average: {non_dc4_excess / non_dc4_osds:.1f} shards above ideal")
print(f" Even the least-loaded non-DC4 14T OSD (osd.{_least_id} = {_least_pgs}) exceeds ideal ({ideal_14T:.1f})")
print()

_banner("SOLUTION: --ignore-ideal-pgcounts destination")
print("This flag disables the destination ideal-count guard, allowing moves to")
print("OSDs that are slightly above ideal by count but still less full by bytes.")
print("The built-in Ceph balancer won't fight back because it also can't move")
print("shards out of those DCs (same CRUSH deadlock applies to it too).")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment