Skip to content

Instantly share code, notes, and snippets.

@AmineDiro
Created May 26, 2026 10:06
Show Gist options
  • Select an option

  • Save AmineDiro/83c79911301cd7ce270774ca33ee824f to your computer and use it in GitHub Desktop.

Select an option

Save AmineDiro/83c79911301cd7ce270774ca33ee824f to your computer and use it in GitHub Desktop.
overlap between compute/comm
-- View `gpu`: one row per slice living on a thread track whose thread name
-- starts with 'stream '. The thread name (trimmed) is exposed as `stream`,
-- the slice name as `kernel`, and `end_ts` is precomputed as ts + dur so the
-- interval queries below do not have to repeat the arithmetic.
DROP VIEW IF EXISTS gpu;
CREATE VIEW gpu AS
SELECT
    s.id          AS id,
    TRIM(t.name)  AS stream,
    s.name        AS kernel,
    s.category    AS cat,
    s.depth       AS depth,
    s.ts          AS ts,
    s.dur         AS dur,
    s.ts + s.dur  AS end_ts
FROM slice AS s
INNER JOIN thread_track AS tt
    ON tt.id = s.track_id
INNER JOIN thread AS t
    USING (utid)
-- Filter on the raw (untrimmed) thread name, exactly as the join produces it.
WHERE t.name LIKE 'stream %';
-- 6. Compute ↔ communication overlap (the key question for EP-over-TP)
--
-- For every positive-duration slice on 'stream 23' (treated here as the
-- communication stream, per the ep_comm_* column names), measure how much of
-- it is covered by filtered slices on 'stream 7'.
--
-- The stream-7 intervals are first merged into disjoint "busy" islands.
-- Summing pairwise overlaps against the raw slices would count the same
-- wall-clock time once per overlapping stream-7 slice (e.g. nested
-- parent/child slices), letting `overlap` exceed `dur` and `overlap_pct`
-- exceed 100. With disjoint islands each instant is counted at most once.
--
-- NOTE(review): stream-23 slices are still summed individually; if they are
-- nested, the shared time is counted in both numerator and denominator.
-- The /1e6 scaling yields milliseconds assuming ts/dur are nanoseconds
-- (TODO confirm against the trace source).
WITH
s7_kernels AS (
    SELECT ts, end_ts
    FROM gpu
    WHERE stream = 'stream 7'
      AND dur > 0
      AND kernel NOT LIKE 'ProfilerStep%'
      AND kernel NOT LIKE 'cuda%'
      AND kernel NOT GLOB '*[Aa]utograd*'
),
-- For each stream-7 slice: the furthest end_ts reached by any slice that
-- sorts before it (NULL for the very first slice).
s7_running AS (
    SELECT
        ts,
        end_ts,
        MAX(end_ts) OVER (
            ORDER BY ts ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS prev_max_end
    FROM s7_kernels
),
-- A slice opens a new island when it starts strictly after everything seen
-- so far has ended; the running count of such openings is the island number.
-- The default RANGE frame is intentional: slices sharing a ts are peers and
-- therefore always receive the same island number.
s7_islands AS (
    SELECT
        ts,
        end_ts,
        SUM(CASE WHEN prev_max_end IS NULL OR ts > prev_max_end THEN 1 ELSE 0 END)
            OVER (ORDER BY ts) AS island
    FROM s7_running
),
-- One disjoint [ts, end_ts) interval per island.
s7_busy AS (
    SELECT
        MIN(ts)     AS ts,
        MAX(end_ts) AS end_ts
    FROM s7_islands
    GROUP BY island
),
s23_kernels AS (
    SELECT id, ts, dur, end_ts
    FROM gpu
    WHERE stream = 'stream 23'
      AND dur > 0
),
per_s23 AS (
    SELECT
        c.id,
        c.dur,
        -- Intersection length with each busy island; slices with no
        -- overlapping island get NULL from the LEFT JOIN, hence COALESCE.
        COALESCE(SUM(MAX(0, MIN(c.end_ts, b.end_ts) - MAX(c.ts, b.ts))), 0) AS overlap
    FROM s23_kernels c
    LEFT JOIN s7_busy b
        ON b.ts < c.end_ts
       AND b.end_ts > c.ts
    GROUP BY c.id, c.dur
)
SELECT
    SUM(dur) / 1e6                  AS ep_comm_total_ms,
    SUM(overlap) / 1e6              AS overlapped_ms,
    100.0 * SUM(overlap) / SUM(dur) AS overlap_pct
FROM per_s23;
-- What survived the filter on stream 7?
-- Top 20 (kernel, cat) pairs by summed duration among the stream-7 slices
-- that pass the same name filter used by the overlap query.
WITH s7_survivors AS (
    SELECT kernel, cat, dur
    FROM gpu
    WHERE stream = 'stream 7'
      AND dur > 0
      -- Exclude profiler-step markers, cuda* entries and autograd slices.
      AND NOT (
             kernel LIKE 'ProfilerStep%'
          OR kernel LIKE 'cuda%'
          OR kernel GLOB '*[Aa]utograd*'
      )
)
SELECT
    kernel,
    cat,
    COUNT(*)       AS n,
    SUM(dur) / 1e6 AS total_ms
FROM s7_survivors
GROUP BY kernel, cat
ORDER BY total_ms DESC
LIMIT 20;
-- What is actually present on stream 7 during the all_reduce window?
--
-- Lists every positive-duration stream-7 slice that overlaps at least one
-- positive-duration stream-23 slice. A semi-join (EXISTS) is used instead of
-- a cross join so that a stream-7 slice overlapping several stream-23 slices
-- is listed once rather than once per overlapping pair (no stream-23 column
-- is projected, so such repeats would be indistinguishable duplicates).
WITH ar AS (
    SELECT ts, end_ts
    FROM gpu
    WHERE stream = 'stream 23'
      AND dur > 0
)
SELECT
    g.kernel,
    g.cat,
    g.depth,
    g.dur / 1e3 AS dur_us
FROM gpu AS g
WHERE g.stream = 'stream 7'
  AND g.dur > 0
  AND EXISTS (
        SELECT 1
        FROM ar
        WHERE g.ts < ar.end_ts
          AND g.end_ts > ar.ts
  )
-- id breaks ties between slices that start at the same ts.
ORDER BY g.ts, g.id;
-----
-- 1. All GPU stream slices, with per-kernel classification
--
-- Same shape as a plain slice listing for 'stream %' threads, plus:
--   parent_id : the slice's parent, used by the leaf-only view
--   kind      : 'comm' | 'copy' | 'compute', derived from the slice name
DROP VIEW IF EXISTS gpu;
CREATE VIEW gpu AS
SELECT
    s.id          AS id,
    s.parent_id   AS parent_id,
    TRIM(t.name)  AS stream,
    s.name        AS kernel,
    s.category    AS cat,
    s.depth       AS depth,
    s.ts          AS ts,
    s.dur         AS dur,
    s.ts + s.dur  AS end_ts,
    CASE
        -- Note: 'nccl%' already matches everything 'ncclDev%' matches.
        WHEN s.name LIKE 'ncclDev%' OR s.name LIKE 'nccl%'   THEN 'comm'
        WHEN s.name LIKE 'Memcpy%'  OR s.name LIKE 'Memset%' THEN 'copy'
        -- Anything not recognised above is counted as compute.
        ELSE 'compute'
    END           AS kind
FROM slice AS s
INNER JOIN thread_track AS tt
    ON tt.id = s.track_id
INNER JOIN thread AS t
    USING (utid)
WHERE t.name LIKE 'stream %';
-- 2. Leaf-only view: collapses parent/child stacks (nccl:all_reduce wrapper, ProfilerStep, ...)
--
-- Keeps a `gpu` row only if it has positive duration and no other `gpu` row
-- names it as parent, so wrapper slices do not get counted on top of the
-- slices nested inside them.
DROP VIEW IF EXISTS gpu_leaf;
CREATE VIEW gpu_leaf AS
SELECT g.*
FROM gpu AS g
WHERE g.dur > 0
  AND NOT EXISTS (
        SELECT 1
        FROM gpu AS child
        WHERE child.parent_id = g.id
  );
-- 3. Merged compute timeline: union of all compute kernels across all streams
--
-- Collapses overlapping or touching compute leaf intervals into disjoint
-- [ts, end_ts) islands, so later overlap sums count each instant once.
DROP VIEW IF EXISTS compute_merged;
CREATE VIEW compute_merged AS
WITH compute_spans AS (
    SELECT ts, end_ts
    FROM gpu_leaf
    WHERE kind = 'compute'
),
-- Furthest end reached by any span sorting before the current one
-- (NULL for the first span).
with_prev_end AS (
    SELECT
        ts,
        end_ts,
        MAX(end_ts) OVER (
            ORDER BY ts ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
        ) AS prev_max_end
    FROM compute_spans
),
-- A span starting strictly after everything before it has ended opens a new
-- island; the running count of openings is the island number.
islands AS (
    SELECT
        ts,
        end_ts,
        SUM(CASE WHEN prev_max_end IS NULL OR ts > prev_max_end THEN 1 ELSE 0 END)
            OVER (ORDER BY ts) AS island_no
    FROM with_prev_end
)
SELECT
    MIN(ts)     AS ts,
    MAX(end_ts) AS end_ts
FROM islands
GROUP BY island_no;
-- 4. Per-comm-slice overlap against the merged compute timeline
--
-- One row per 'comm' leaf slice, with `overlap` = total time that slice
-- shares with the merged compute intervals (0 when it shares none).
DROP VIEW IF EXISTS comm_overlap;
CREATE VIEW comm_overlap AS
SELECT
    comm.id,
    comm.stream,
    comm.kernel,
    comm.dur,
    -- Intersection length per matching compute interval; an unmatched comm
    -- slice yields NULL through the LEFT JOIN, which COALESCE turns into 0.
    COALESCE(
        SUM(MAX(0, MIN(comm.end_ts, busy.end_ts) - MAX(comm.ts, busy.ts))),
        0
    ) AS overlap
FROM gpu_leaf AS comm
LEFT JOIN compute_merged AS busy
    ON busy.end_ts > comm.ts
   AND busy.ts < comm.end_ts
WHERE comm.kind = 'comm'
GROUP BY comm.id, comm.stream, comm.kernel, comm.dur;
-- Per stream — to confirm streams really are mixed:
-- summed leaf-slice duration per stream, split by kind, busiest stream first.
WITH per_stream AS (
    SELECT
        stream,
        SUM(CASE WHEN kind = 'compute' THEN dur ELSE 0 END) / 1e6 AS compute_ms,
        SUM(CASE WHEN kind = 'comm'    THEN dur ELSE 0 END) / 1e6 AS comm_ms,
        SUM(CASE WHEN kind = 'copy'    THEN dur ELSE 0 END) / 1e6 AS copy_ms
    FROM gpu_leaf
    GROUP BY stream
)
SELECT
    stream,
    compute_ms,
    comm_ms,
    copy_ms
FROM per_stream
ORDER BY (compute_ms + comm_ms + copy_ms) DESC;
-- Per comm kernel — which collectives hide well vs which are exposed:
-- exposed time is the part of a comm slice not covered by compute.
WITH per_kernel AS (
    SELECT
        kernel,
        COUNT(*)           AS n,
        SUM(dur)           AS dur_sum,
        SUM(overlap)       AS overlap_sum,
        SUM(dur - overlap) AS exposed_sum
    FROM comm_overlap
    GROUP BY kernel
)
SELECT
    kernel,
    n,
    dur_sum / 1e6                 AS comm_ms,
    exposed_sum / 1e6             AS exposed_ms,
    100.0 * overlap_sum / dur_sum AS overlap_pct
FROM per_kernel
ORDER BY exposed_ms DESC;
-- Same exposure report, broken down by (stream, kernel) so a collective that
-- runs on several streams is reported separately for each of them.
WITH per_stream_kernel AS (
    SELECT
        stream,
        kernel,
        COUNT(*)           AS n,
        SUM(dur)           AS dur_sum,
        SUM(overlap)       AS overlap_sum,
        SUM(dur - overlap) AS exposed_sum
    FROM comm_overlap
    GROUP BY stream, kernel
)
SELECT
    stream,
    kernel,
    n,
    dur_sum / 1e6                 AS comm_ms,
    exposed_sum / 1e6             AS exposed_ms,
    100.0 * overlap_sum / dur_sum AS overlap_pct
FROM per_stream_kernel
ORDER BY exposed_ms DESC;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment