Created
May 26, 2026 10:06
-
-
Save AmineDiro/83c79911301cd7ce270774ca33ee824f to your computer and use it in GitHub Desktop.
overlap between compute/comm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| -- View `gpu`: one row per slice whose track belongs to a thread named | |
| -- 'stream …'. Exposes the trimmed thread name as `stream`, the slice name | |
| -- as `kernel`, and a derived `end_ts` = ts + dur. | |
| DROP VIEW IF EXISTS gpu; | |
| CREATE VIEW gpu AS | |
| SELECT | |
|     s.id AS id, | |
|     TRIM(t.name) AS stream, | |
|     s.name AS kernel, | |
|     s.category AS cat, | |
|     s.depth AS depth, | |
|     s.ts AS ts, | |
|     s.dur AS dur, | |
|     s.ts + s.dur AS end_ts | |
| FROM slice AS s | |
| INNER JOIN thread_track AS tt | |
|     ON tt.id = s.track_id | |
| INNER JOIN thread AS t | |
|     USING (utid) | |
| -- The filter runs on the raw name; TRIM only affects the exposed column. | |
| WHERE t.name LIKE 'stream %'; | |
| -- 6. Compute ↔ communication overlap (the key question for EP-over-TP) | |
| -- For every positive-duration slice on 'stream 23', measure how much of it | |
| -- is covered by the filtered slices on 'stream 7', then report totals | |
| -- (dur / 1e6 is reported as ms). | |
| -- The stream-7 slices are first merged into disjoint busy intervals: summing | |
| -- pairwise intersections against raw slices counts any nested or mutually | |
| -- overlapping stream-7 slices more than once and can push overlap above dur. | |
| -- NOTE(review): nested slices on 'stream 23' are still counted once per | |
| -- slice in both totals — confirm that stream only carries flat kernels. | |
| WITH | |
| s7_kernels AS ( | |
|     SELECT ts, end_ts | |
|     FROM gpu | |
|     WHERE stream = 'stream 7' | |
|       AND dur > 0 | |
|       AND kernel NOT LIKE 'ProfilerStep%' | |
|       AND kernel NOT LIKE 'cuda%' | |
|       AND kernel NOT GLOB '*[Aa]utograd*' | |
| ), | |
| -- Furthest end_ts reached by any slice that starts earlier in ts order. | |
| s7_running AS ( | |
|     SELECT | |
|         ts, | |
|         end_ts, | |
|         MAX(end_ts) OVER ( | |
|             ORDER BY ts ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING | |
|         ) AS prev_max_end | |
|     FROM s7_kernels | |
| ), | |
| -- A slice opens a new island when it starts after everything before it ended. | |
| s7_islands AS ( | |
|     SELECT | |
|         ts, | |
|         end_ts, | |
|         SUM(CASE WHEN prev_max_end IS NULL OR ts > prev_max_end THEN 1 ELSE 0 END) | |
|             OVER (ORDER BY ts) AS island | |
|     FROM s7_running | |
| ), | |
| -- Disjoint busy intervals of stream 7. | |
| s7_busy AS ( | |
|     SELECT MIN(ts) AS ts, MAX(end_ts) AS end_ts | |
|     FROM s7_islands | |
|     GROUP BY island | |
| ), | |
| s23_kernels AS ( | |
|     SELECT id, ts, dur, end_ts | |
|     FROM gpu | |
|     WHERE stream = 'stream 23' AND dur > 0 | |
| ), | |
| per_s23 AS ( | |
|     SELECT | |
|         c.id, | |
|         c.dur, | |
|         -- Two-argument MAX/MIN are scalar here; the LEFT JOIN keeps slices | |
|         -- with no overlap, and COALESCE turns their NULL sum into 0. | |
|         COALESCE(SUM(MAX(0, MIN(c.end_ts, b.end_ts) - MAX(c.ts, b.ts))), 0) AS overlap | |
|     FROM s23_kernels AS c | |
|     LEFT JOIN s7_busy AS b | |
|         ON b.ts < c.end_ts AND b.end_ts > c.ts | |
|     GROUP BY c.id, c.dur | |
| ) | |
| SELECT | |
|     SUM(dur) / 1e6 AS ep_comm_total_ms, | |
|     SUM(overlap) / 1e6 AS overlapped_ms, | |
|     100.0 * SUM(overlap) / NULLIF(SUM(dur), 0) AS overlap_pct | |
| FROM per_s23; | |
| -- What survived the filter on stream 7? | |
| -- Groups the positive-duration 'stream 7' slices that pass the exclusion | |
| -- filter by (kernel, cat) and shows the 20 groups with the largest summed | |
| -- duration (dur / 1e6 reported as ms). | |
| SELECT | |
|     g.kernel AS kernel, | |
|     g.cat AS cat, | |
|     COUNT(*) AS n, | |
|     SUM(g.dur) / 1e6 AS total_ms | |
| FROM gpu AS g | |
| WHERE g.stream = 'stream 7' | |
|   AND g.dur > 0 | |
|   AND NOT ( | |
|         g.kernel LIKE 'ProfilerStep%' | |
|      OR g.kernel LIKE 'cuda%' | |
|      OR g.kernel GLOB '*[Aa]utograd*' | |
|   ) | |
| GROUP BY g.kernel, g.cat | |
| ORDER BY total_ms DESC | |
| LIMIT 20; | |
| -- What is actually present on stream 7 during the all_reduce window? | |
| -- Lists every positive-duration 'stream 7' slice that intersects at least | |
| -- one positive-duration 'stream 23' slice (dur / 1e3 reported as us). | |
| -- EXISTS is used instead of a comma join so that a stream-7 slice touching | |
| -- several stream-23 slices appears once rather than once per match. | |
| SELECT | |
|     g.kernel AS kernel, | |
|     g.cat AS cat, | |
|     g.depth AS depth, | |
|     g.dur / 1e3 AS dur_us | |
| FROM gpu AS g | |
| WHERE g.stream = 'stream 7' | |
|   AND g.dur > 0 | |
|   AND EXISTS ( | |
|         SELECT 1 | |
|         FROM gpu AS ar | |
|         WHERE ar.stream = 'stream 23' | |
|           AND ar.dur > 0 | |
|           AND g.ts < ar.end_ts | |
|           AND g.end_ts > ar.ts | |
|   ) | |
| -- id breaks ties between slices that start at the same ts. | |
| ORDER BY g.ts, g.id; | |
| ----- | |
| -- 1. All GPU stream slices, with per-kernel classification | |
| -- Same row set as a plain slice/thread_track/thread join restricted to | |
| -- threads named 'stream …', plus `parent_id` and a derived `kind`: | |
| --   'comm'    when the slice name starts with nccl | |
| --   'copy'    when it starts with Memcpy or Memset | |
| --   'compute' for everything else | |
| DROP VIEW IF EXISTS gpu; | |
| CREATE VIEW gpu AS | |
| SELECT | |
|     s.id AS id, | |
|     s.parent_id AS parent_id, | |
|     TRIM(t.name) AS stream, | |
|     s.name AS kernel, | |
|     s.category AS cat, | |
|     s.depth AS depth, | |
|     s.ts AS ts, | |
|     s.dur AS dur, | |
|     s.ts + s.dur AS end_ts, | |
|     CASE | |
|         -- 'nccl%' already matches every name that 'ncclDev%' would match. | |
|         WHEN s.name LIKE 'nccl%' THEN 'comm' | |
|         WHEN s.name LIKE 'Memcpy%' OR s.name LIKE 'Memset%' THEN 'copy' | |
|         ELSE 'compute' | |
|     END AS kind | |
| FROM slice AS s | |
| INNER JOIN thread_track AS tt | |
|     ON tt.id = s.track_id | |
| INNER JOIN thread AS t | |
|     USING (utid) | |
| WHERE t.name LIKE 'stream %'; | |
| -- 2. Leaf-only view: collapses parent/child stacks (nccl:all_reduce wrapper, ProfilerStep, ...) | |
| -- Keeps the positive-duration rows of `gpu` that no other `gpu` row names | |
| -- as its parent. Written as an anti-join: the LEFT JOIN finds children and | |
| -- the IS NULL test keeps only the rows that found none. | |
| DROP VIEW IF EXISTS gpu_leaf; | |
| CREATE VIEW gpu_leaf AS | |
| SELECT | |
|     g.id AS id, | |
|     g.parent_id AS parent_id, | |
|     g.stream AS stream, | |
|     g.kernel AS kernel, | |
|     g.cat AS cat, | |
|     g.depth AS depth, | |
|     g.ts AS ts, | |
|     g.dur AS dur, | |
|     g.end_ts AS end_ts, | |
|     g.kind AS kind | |
| FROM gpu AS g | |
| LEFT JOIN gpu AS child | |
|     ON child.parent_id = g.id | |
| WHERE child.id IS NULL | |
|   AND g.dur > 0; | |
| -- 3. Merged compute timeline: union of all compute kernels across all streams | |
| -- Collapses the 'compute' rows of gpu_leaf into disjoint [ts, end_ts] | |
| -- intervals: rows are walked in ts order and a new interval starts whenever | |
| -- a row begins after the furthest end_ts seen so far. | |
| DROP VIEW IF EXISTS compute_merged; | |
| CREATE VIEW compute_merged AS | |
| WITH | |
| compute_spans AS ( | |
|     SELECT leaf.ts AS ts, leaf.end_ts AS end_ts | |
|     FROM gpu_leaf AS leaf | |
|     WHERE leaf.kind = 'compute' | |
| ), | |
| -- prev_max_end: furthest end among strictly earlier rows (NULL on the first). | |
| with_frontier AS ( | |
|     SELECT | |
|         ts, | |
|         end_ts, | |
|         MAX(end_ts) OVER ( | |
|             ORDER BY ts | |
|             ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING | |
|         ) AS prev_max_end | |
|     FROM compute_spans | |
| ), | |
| -- Running count of interval starts. The RANGE frame (the default, written | |
| -- out here) gives rows with equal ts the same count, so they share a group. | |
| islands AS ( | |
|     SELECT | |
|         ts, | |
|         end_ts, | |
|         SUM(CASE WHEN prev_max_end IS NULL OR ts > prev_max_end THEN 1 ELSE 0 END) | |
|             OVER ( | |
|                 ORDER BY ts | |
|                 RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW | |
|             ) AS island_no | |
|     FROM with_frontier | |
| ) | |
| SELECT | |
|     MIN(ts) AS ts, | |
|     MAX(end_ts) AS end_ts | |
| FROM islands | |
| GROUP BY island_no; | |
| -- 4. Per-comm-slice overlap against the merged compute timeline | |
| -- One row per 'comm' slice of gpu_leaf with `overlap` = total length of its | |
| -- intersection with the compute_merged intervals (0 when it meets none). | |
| DROP VIEW IF EXISTS comm_overlap; | |
| CREATE VIEW comm_overlap AS | |
| WITH comm_slices AS ( | |
|     SELECT id, stream, kernel, dur, ts, end_ts | |
|     FROM gpu_leaf | |
|     WHERE kind = 'comm' | |
| ) | |
| SELECT | |
|     cs.id AS id, | |
|     cs.stream AS stream, | |
|     cs.kernel AS kernel, | |
|     cs.dur AS dur, | |
|     -- Two-argument MAX/MIN are scalar; SUM adds the per-interval pieces. | |
|     COALESCE( | |
|         SUM(MAX(0, MIN(cs.end_ts, busy.end_ts) - MAX(cs.ts, busy.ts))), | |
|         0 | |
|     ) AS overlap | |
| FROM comm_slices AS cs | |
| LEFT JOIN compute_merged AS busy | |
|     ON busy.ts < cs.end_ts | |
|    AND busy.end_ts > cs.ts | |
| GROUP BY cs.id, cs.stream, cs.kernel, cs.dur; | |
| -- Per stream — to confirm streams really are mixed: | |
| -- Sums leaf durations per stream, split by kind (dur / 1e6 reported as ms), | |
| -- busiest stream first. | |
| WITH per_stream AS ( | |
|     SELECT | |
|         stream, | |
|         SUM(CASE kind WHEN 'compute' THEN dur ELSE 0 END) / 1e6 AS compute_ms, | |
|         SUM(CASE kind WHEN 'comm' THEN dur ELSE 0 END) / 1e6 AS comm_ms, | |
|         SUM(CASE kind WHEN 'copy' THEN dur ELSE 0 END) / 1e6 AS copy_ms | |
|     FROM gpu_leaf | |
|     GROUP BY stream | |
| ) | |
| SELECT | |
|     stream, | |
|     compute_ms, | |
|     comm_ms, | |
|     copy_ms | |
| FROM per_stream | |
| ORDER BY compute_ms + comm_ms + copy_ms DESC; | |
| -- Per comm kernel — which collectives hide well vs which are exposed: | |
| -- Aggregates comm_overlap by kernel; exposed time is duration minus overlap | |
| -- (dur / 1e6 reported as ms), most exposed kernel first. | |
| WITH per_kernel AS ( | |
|     SELECT | |
|         kernel, | |
|         COUNT(*) AS n, | |
|         SUM(dur) AS dur_total, | |
|         SUM(overlap) AS overlap_total | |
|     FROM comm_overlap | |
|     GROUP BY kernel | |
| ) | |
| SELECT | |
|     kernel, | |
|     n, | |
|     dur_total / 1e6 AS comm_ms, | |
|     (dur_total - overlap_total) / 1e6 AS exposed_ms, | |
|     100.0 * overlap_total / dur_total AS overlap_pct | |
| FROM per_kernel | |
| ORDER BY exposed_ms DESC; | |
| -- Same exposure breakdown as the per-kernel query, further split by stream. | |
| SELECT | |
|     co.stream AS stream, | |
|     co.kernel AS kernel, | |
|     COUNT(*) AS n, | |
|     SUM(co.dur) / 1e6 AS comm_ms, | |
|     SUM(co.dur - co.overlap) / 1e6 AS exposed_ms, | |
|     100.0 * SUM(co.overlap) / SUM(co.dur) AS overlap_pct | |
| FROM comm_overlap AS co | |
| GROUP BY co.stream, co.kernel | |
| ORDER BY exposed_ms DESC; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment