davidlohr · April 21, 2026 00:39
diff --git a/hmm-snoop.bt b/hmm-snoop.bt
 #!/usr/bin/env bpftrace
 /*
 * hmm-snoop.bt -- comprehensive observer for Linux HMM / CPU-GPU memory
 *
 * Reports all CPU<->device memory interactions that flow through the
 * kernel's HMM (Heterogeneous Memory Management) APIs, without any driver
 * cooperation. Works transparently for any driver that uses the standard
 * migrate_vma_* / hmm_range_fault interface: amdgpu/amdkfd, nouveau,
 * drm/xe (via drm_pagemap), and lib/test_hmm. No kernel patches required.
 *
 * WHAT IS CAPTURED
 * ----------------
 *
 *  Migration lifecycle (migrate_vma_setup -> pages -> finalize):
 *      * CPU -> device (driver prefetch):  flags & SELECT_SYSTEM, no fault_page
 *      * device -> CPU (fault-back):       fault_page != NULL  <-- thrash signal
 *      * device -> CPU (driver evict):     flags & SELECT_DEVICE_*, no fault_page
 *      * compound / PMD-order migrations: flags & MIGRATE_VMA_SELECT_COMPOUND
 *
 *  Per-page PTE traffic (via existing tracepoints):
 *      * set_migration_pte     -- PTE turned into a migration entry
 *      * remove_migration_pte  -- migration entry restored to present/swap
 *
 *  Bulk device-memory teardown (driver unbind):
 *      * migrate_device_range / migrate_device_pfns
 *
 *  Device mirror refresh (hmm_range_fault):
 *      * call count, EBUSY rate (invalidation races = CPU contention)
 *      * EAGAIN / EFAULT / EPERM error breakdown
 *
 *  Device-exclusive entries (GPU atomic ops):
 *      * make_device_exclusive  (driver side: revokes CPU write access)
 *      * restore_exclusive_pte  (CPU side: reclaims access, contention
 *                                signal when ratio ~= 1:1)
 *
 *  Coherent P2P DMA mapping:
 *      * hmm_dma_map_pfn / hmm_dma_unmap_pfn
 *
 *  HMM driver lifecycle:
 *      * mmu_interval_notifier_insert / _remove
 *        (reveals driver attach / detach per process)
 *
 *  Per-device attribution:
 *      * pgmap_owner dimension distinguishes drivers/GPUs in a
 *        multi-device system (amdkfd mdevice, drm_pagemap owner,
 *        nouveau drm->dev, kvmppc_uvmem_pgmap)
 *
 *  EBUSY stack capture:
 *      * Kernel stacks are recorded for every hmm_range_fault -EBUSY
 *        return so you can see WHICH driver is racing the CPU.
 *
 *  MMU notifier invalidations filtered to HMM-relevant events only:
 *      * MMU_NOTIFY_MIGRATE (6) and MMU_NOTIFY_EXCLUSIVE (7)
 *      * (other events -- CLEAR, UNMAP etc. -- are far too hot to track)
 *
 * WHAT THE OUTPUT TELLS YOU
 * -------------------------
 *
 *   Setup:                     number of migration attempts
 *   Collected (cpages):        pages successfully unmapped & ready to copy
 *   cpages < npages:           some pages pinned / unmovable (LONGTERM)
 *   Fault-back events:         CPU touched GPU-resident data -> thrashing
 *   hmm_range_fault -EBUSY:    device mirror raced a CPU PTE mutation
 *   High setup-to-finalize
 *     latency + low cpages:    driver allocation/copy is the bottleneck
 *   Compound migrations:       THP-granularity transfers (good!)
 *   MMU_NOTIFY_MIGRATE >>
 *     actual migrations:       migration is being invoked but falling back
 *
 * USAGE
 * -----
 *
 *   sudo bpftrace hmm-snoop.bt                         # system-wide, until Ctrl-C
 *   sudo bpftrace hmm-snoop.bt 1234                    # filter to tgid 1234
 *   sudo bpftrace hmm-snoop.bt 0 30                    # all tgids, auto-exit after 30s
 *   sudo bpftrace hmm-snoop.bt 1234 60                 # tgid 1234, auto-exit after 60s
 *
 *   # Run a specific workload and trace only its lifetime; bpftrace
 *   # launches the child and stops tracing when it exits:
 *   sudo bpftrace hmm-snoop.bt -c './my_gpu_app --flag'
 *
 *   # Validate the tool path on a dev box by exercising lib/test_hmm:
 *   sudo bpftrace hmm-snoop.bt -c './tools/testing/selftests/mm/hmm-tests'
 *
 * Positional args:  $1 = tgid filter (0 = all, default 0)
 *                   $2 = total seconds before auto-exit (0 = until Ctrl-C, default 0)
 * The periodic summary interval is 5 seconds (edit `interval:s:5` to change).
 *
 * REQUIREMENTS
 * ------------
 *
 *   bpftrace >= 0.14  (struct field access & BTF)
 *   Kernel  >= 5.14   with CONFIG_DEBUG_INFO_BTF=y
 *                     CONFIG_HMM_MIRROR=y
 *                     CONFIG_DEVICE_PRIVATE=y (or =m) for fault-back probes
 *                     CONFIG_KPROBES=y  CONFIG_BPF_EVENTS=y
 *   root privileges.
 *
 * CAVEATS
 * -------
 *
 *  * The canonical, driver-agnostic fault-back signal is
 *    "migrate_vma_setup with args->fault_page != NULL".  We also kprobe
 *    do_huge_pmd_device_private() as a complementary view of the THP
 *    fault-back path.
 *  * Only MMU_NOTIFY_MIGRATE and MMU_NOTIFY_EXCLUSIVE events are tracked;
 *    other mmu_notifier events (CLEAR/UNMAP/...) fire on every munmap and
 *    would drown the output.  The inline filter guarantees near-zero cost
 *    for unrelated invalidations.
 *  * Counters whose probe failed to attach will stay at zero -- bpftrace
 *    prints a warning at startup for each missing symbol.  Check the
 *    startup "probe attached" lines if a column is unexpectedly empty.
 *  * PAGE_SHIFT is hardcoded to 12 (4K pages) in span calculations.  On
 *    architectures with larger base pages, edit ">>12" to ">>16" (64K).
 */

 BEGIN
 {
 	/*
 	 * Runtime configuration taken from the positional parameters:
 	 *   $1 -> tgid to restrict tracing to (0 = every process)
 	 *   $2 -> seconds to run before auto-exit (0 = until Ctrl-C)
 	 * @start_ns is the reference point for all elapsed-time maths in
 	 * the interval probes.
 	 */
 	@filter_tgid = (uint64)$1;
 	@duration_s  = (uint64)$2;
 	@start_ns    = nsecs;

 	/*
 	 * enum migrate_vma_direction bits (include/linux/migrate.h),
 	 * kept in maps so every probe can test them by name.
 	 */
 	@MVMA_SYSTEM     = (uint64)1;	/* MIGRATE_VMA_SELECT_SYSTEM */
 	@MVMA_DEVPRIV    = (uint64)2;	/* MIGRATE_VMA_SELECT_DEVICE_PRIVATE */
 	@MVMA_DEVCOH     = (uint64)4;	/* MIGRATE_VMA_SELECT_DEVICE_COHERENT */
 	@MVMA_COMPOUND   = (uint64)8;	/* MIGRATE_VMA_SELECT_COMPOUND */

 	/* enum mmu_notifier_event values (include/linux/mmu_notifier.h) */
 	@MMU_NOTIFY_MIGRATE   = (uint64)6;
 	@MMU_NOTIFY_EXCLUSIVE = (uint64)7;

 	/*
 	 * Human-readable class labels used as map keys.  All four
 	 * literals are padded to the same width so they line up in the
 	 * printed output and share one string type.
 	 */
 	@C_CPU2DEV      = "cpu->dev  (prefetch)    ";
 	@C_DEV2CPU_FLT  = "dev->cpu  (fault-back)  ";
 	@C_DEV2CPU_DRV  = "dev->cpu  (driver evict)";
 	@C_UNKNOWN      = "unclassified            ";

 	/* Startup banner: optional pieces appear only when configured. */
 	printf("\nhmm-snoop: tracing HMM / CPU-GPU memory activity");
 	if (@filter_tgid != 0) {
 		printf(" for tgid %d", @filter_tgid);
 	}
 	printf(", interval=5s");
 	if (@duration_s != 0) {
 		printf(", duration=%ds", @duration_s);
 	}
 	printf("\nCtrl-C to exit, summary prints on exit.\n\n");
 }

 /* ---------------------------------------------------------------------- */
 /*  migrate_vma_setup()  --  the canonical entry for every HMM migration   */
 /* ---------------------------------------------------------------------- */

 /*
 * Entry: stash the migrate_vma* so we can pair it with both the
 * kretprobe (to read cpages after setup fills it) and finalize (to
 * compute end-to-end latency).  We classify here (flags + fault_page)
 * because the classification is driven by setup-time state.
 *
 * NOTE: arg0 is NOT accessible at kretprobe in bpftrace -- only retval
 * is.  We save the pointer in a per-tid scratch map @setup_arg0[tid]
 * because the kernel holds mmap_lock across the whole setup->finalize
 * lifecycle, so a single thread can only ever have one in-flight
 * migrate_vma_setup() call at a time.
 */
 kprobe:migrate_vma_setup
 /pid == @filter_tgid || @filter_tgid == 0/
 {
 	$mig        = (struct migrate_vma *)arg0;
 	$sel        = $mig->flags;
 	$fault_page = (uint64)$mig->fault_page;
 	$owner      = (uint64)$mig->pgmap_owner;
 	$span       = ($mig->end - $mig->start) >> 12;	/* 4K pages */
 	$kind       = @C_UNKNOWN;

 	/*
 	 * Classify once and bump the matching per-class maps in the
 	 * same branch.  Precedence: a non-NULL fault_page wins, then
 	 * SELECT_SYSTEM, then either SELECT_DEVICE_* bit; anything else
 	 * stays "unclassified" and gets no per-proc / per-device entry.
 	 *
 	 * The per-class (pid, comm) and pgmap_owner maps are separate
 	 * maps instead of one map keyed by (class_str, pid, comm)
 	 * because some bpftrace 0.1x versions store strings inside
 	 * tuple keys unreliably (garbage or empty string slot).
 	 */
 	if ($fault_page != 0) {
 		$kind = @C_DEV2CPU_FLT;		/* driver's migrate_to_ram callback */
 		@mig_dev2cpu_flt_by_proc[pid, comm] = count();
 		@mig_dev2cpu_flt_by_device[$owner] = count();
 	} else if ($sel & @MVMA_SYSTEM) {
 		$kind = @C_CPU2DEV;
 		@mig_cpu2dev_by_proc[pid, comm] = count();
 		@mig_cpu2dev_by_device[$owner] = count();
 	} else if ($sel & (@MVMA_DEVPRIV | @MVMA_DEVCOH)) {
 		$kind = @C_DEV2CPU_DRV;
 		@mig_dev2cpu_drv_by_proc[pid, comm] = count();
 		@mig_dev2cpu_drv_by_device[$owner] = count();
 	}

 	/*
 	 * State keyed on the migrate_vma pointer: it must outlive this
 	 * probe so the pages / finalize probes on the same struct can
 	 * find it again.
 	 */
 	@mig_start_ns[arg0]    = nsecs;
 	@mig_class[arg0]       = $kind;
 	@mig_tgid[arg0]        = pid;
 	@mig_comm[arg0]        = comm;
 	@mig_flags[arg0]       = $sel;
 	@mig_npages[arg0]      = $span;
 	@mig_fault_stash[arg0] = $fault_page;

 	/* Hand the pointer to the kretprobe, which cannot see arg0. */
 	@setup_arg0[tid] = arg0;

 	/* Global and per-class attempt accounting. */
 	@mig_setup_total++;
 	@mig_setup_by_class[$kind] = count();
 	@mig_setup_pages_requested[$kind] = hist($span);
 }

 /*
 * Exit: cpages has been populated by migrate_vma_collect/unmap.
 *
 * The per-pointer state stashed by the entry probe is dropped here in
 * the two cases where no migrate_vma_finalize() can be relied upon to
 * drop it later:
 *
 *   - setup failed (retval != 0): the caller unwinds on error;
 *   - setup succeeded but collected nothing (cpages == 0): callers
 *     commonly bail out right after setup in that case, without ever
 *     calling migrate_vma_pages() / migrate_vma_finalize().
 *
 * Without the second case the @mig_*[arg0] entries leak.  The key is
 * usually a kernel stack address, so a leaked entry can later be
 * matched by an unrelated (possibly tgid-filtered-out) thread whose
 * struct migrate_vma lands on the same address, yielding bogus
 * latencies and mis-attributed cpages.
 *
 * Side effect: a caller that still runs pages/finalize after a
 * zero-page setup is not counted in the pages/finalize statistics.
 * Those calls moved nothing, so no page accounting is lost.
 */
 kretprobe:migrate_vma_setup
 /@setup_arg0[tid]/
 {
 	$arg    = @setup_arg0[tid];
 	$m      = (struct migrate_vma *)$arg;
 	$cpages = $m->cpages;
 	$class  = @mig_class[$arg];
 	$nreq   = @mig_npages[$arg];
 	$flags  = @mig_flags[$arg];
 	delete(@setup_arg0[tid]);

 	/*
 	 * migrate_vma_setup() returns a 32-bit int; cast through int32
 	 * so negative errnos are sign-extended instead of showing up as
 	 * large positive numbers.
 	 */
 	$rc = (int32)retval;
 	@mig_setup_ret[$rc] = count();

 	if ($rc != 0) {
 		@mig_setup_failed[$class] = count();
 	} else {
 		@mig_setup_pages_collected[$class] = hist($cpages);
 		@mig_setup_pages_collected_total[$class] = sum($cpages);

 		/* Unmigratable ratio: pages requested vs pages actually unmapped */
 		if ($nreq > 0 && $cpages < $nreq) {
 			@mig_partial[$class] = count();
 			@mig_unmigratable_pages[$class] = sum($nreq - $cpages);
 		}

 		if ($flags & @MVMA_COMPOUND) {
 			@mig_compound[$class] = count();
 		}
 	}

 	/* Terminal outcomes: nothing further will reference this struct. */
 	if ($rc != 0 || $cpages == 0) {
 		delete(@mig_start_ns[$arg]);
 		delete(@mig_class[$arg]);
 		delete(@mig_tgid[$arg]);
 		delete(@mig_comm[$arg]);
 		delete(@mig_flags[$arg]);
 		delete(@mig_npages[$arg]);
 		delete(@mig_fault_stash[$arg]);
 	}
 }

 /*
 * migrate_vma_pages() -- the copy-commit phase.  We measure setup->pages
 * separately so users can identify where latency comes from (driver
 * alloc/copy between setup and pages vs. finalize overhead after pages).
 */
 kprobe:migrate_vma_pages
 /@mig_start_ns[arg0] != 0/
 {
 	/* Only structs seen by the setup probe reach this point. */
 	$kind     = @mig_class[arg0];
 	$since_us = (nsecs - @mig_start_ns[arg0]) / 1000;

 	/* setup -> pages phase, bucketed per migration class */
 	@mig_pages_phase_us[$kind] = hist($since_us);

 	/* Timestamp used by the finalize probe for the pages -> finalize phase. */
 	@mig_at_pages_ns[arg0] = nsecs;
 }

 /*
 * migrate_vma_finalize() -- the end of the lifecycle.  We compute end-
 * to-end latency (setup->finalize) and record the final cpages (some
 * may have had MIGRATE_PFN_MIGRATE cleared by __migrate_device_pages).
 */
 kprobe:migrate_vma_finalize
 /@mig_start_ns[arg0] != 0/
 {
 	$mig    = (struct migrate_vma *)arg0;
 	$kind   = @mig_class[arg0];
 	$moved  = $mig->cpages;		/* cpages as it stands at finalize */
 	$e2e_us = (nsecs - @mig_start_ns[arg0]) / 1000;

 	/* setup -> finalize latency */
 	@mig_e2e_us[$kind] = hist($e2e_us);

 	/* pages -> finalize latency, only if migrate_vma_pages was seen */
 	if (@mig_at_pages_ns[arg0] != 0) {
 		@mig_finalize_phase_us[$kind] =
 			hist((nsecs - @mig_at_pages_ns[arg0]) / 1000);
 	}

 	@mig_finalize_total++;
 	@mig_finalize_cpages[$kind] = sum($moved);

 	/*
 	 * Attribute the moved pages to the process recorded at setup
 	 * time.  Dispatch repeats the setup-side precedence (fault_page,
 	 * then SELECT_SYSTEM, then SELECT_DEVICE_*) using the stashed
 	 * values, and writes to one map per class rather than a single
 	 * map with a string in its tuple key.
 	 */
 	$sel = @mig_flags[arg0];
 	if (@mig_fault_stash[arg0] != 0) {
 		@mig_dev2cpu_flt_cpages_by_proc[
 			@mig_tgid[arg0], @mig_comm[arg0]] = sum($moved);
 	} else if ($sel & @MVMA_SYSTEM) {
 		@mig_cpu2dev_cpages_by_proc[
 			@mig_tgid[arg0], @mig_comm[arg0]] = sum($moved);
 	} else if ($sel & (@MVMA_DEVPRIV | @MVMA_DEVCOH)) {
 		@mig_dev2cpu_drv_cpages_by_proc[
 			@mig_tgid[arg0], @mig_comm[arg0]] = sum($moved);
 	}

 	/* Lifecycle is over: release every per-pointer entry. */
 	delete(@mig_start_ns[arg0]);
 	delete(@mig_class[arg0]);
 	delete(@mig_tgid[arg0]);
 	delete(@mig_comm[arg0]);
 	delete(@mig_flags[arg0]);
 	delete(@mig_npages[arg0]);
 	delete(@mig_at_pages_ns[arg0]);
 	delete(@mig_fault_stash[arg0]);
 }

 /* ---------------------------------------------------------------------- */
 /*  Bulk device-memory teardown  (driver unbind / chunk reclaim)          */
 /* ---------------------------------------------------------------------- */

 kprobe:migrate_device_range
 /pid == @filter_tgid || @filter_tgid == 0/
 {
 	/* arg2 is recorded as the page count of the torn-down range. */
 	$npages = (uint64)arg2;

 	@bulk_teardown_range++;
 	@bulk_teardown_pages = hist($npages);
 	@bulk_teardown_by_proc[pid, comm] = count();
 }

 /*
 * migrate_device_pfns() was added circa 2024; wildcard so the script
 * still loads on kernels that predate it.
 */
 kprobe:migrate_device_pfns*
 /pid == @filter_tgid || @filter_tgid == 0/
 {
 	/* arg1 is recorded as the page count; shares maps with the range variant. */
 	$npages = (uint64)arg1;

 	@bulk_teardown_pfns++;
 	@bulk_teardown_pages = hist($npages);
 	@bulk_teardown_by_proc[pid, comm] = count();
 }

 /* ---------------------------------------------------------------------- */
 /*  hmm_range_fault()  --  device asks for a CPU mapping snapshot          */
 /* ---------------------------------------------------------------------- */

 kprobe:hmm_range_fault
 /pid == @filter_tgid || @filter_tgid == 0/
 {
 	$range = (struct hmm_range *)arg0;
 	$span  = ($range->end - $range->start) >> 12;	/* 4K pages */
 	$req   = $range->default_flags;

 	/*
 	 * Per-tid stash for the kretprobe: @hrf_ts gates it and feeds
 	 * the latency histogram; all three entries are deleted there.
 	 */
 	@hrf_ts[tid]     = nsecs;
 	@hrf_npages[tid] = $span;
 	@hrf_flags[tid]  = $req;

 	@hrf_total++;
 	@hrf_npages_hist = hist($span);
 	@hrf_by_proc[pid, comm] = count();

 	/*
 	 * Bit 62 of ->default_flags is HMM_PFN_REQ_WRITE (same value as
 	 * HMM_PFN_WRITE, 1UL << (BITS_PER_LONG - 2) on 64-bit): the
 	 * caller wants a write-enabled mirror rather than a read-only one.
 	 */
 	if (($req & 0x4000000000000000) != 0) {
 		@hrf_write++;
 	}
 }

 kretprobe:hmm_range_fault
 /@hrf_ts[tid] != 0/
 {
 	$elapsed_us = (nsecs - @hrf_ts[tid]) / 1000;
 	/*
 	 * The function returns a 32-bit signed int, but bpftrace hands
 	 * us retval as a uint64 whose upper half is zero, so -16 would
 	 * read as 4294967280 under a plain (int64) cast.  Going through
 	 * int32 restores the sign.
 	 */
 	$err = (int32)retval;

 	@hrf_latency_us = hist($elapsed_us);
 	@hrf_ret[$err] = count();

 	if ($err == -16) {		/* -EBUSY -> invalidation race */
 		@hrf_ebusy++;
 		@hrf_ebusy_by_proc[pid, comm] = count();
 		/*
 		 * Record the kernel stack for each EBUSY return: this is
 		 * where the device mirror lost a race against a
 		 * concurrent CPU PTE mutation, and the stack names the
 		 * calling driver (nouveau_dmem, kfd_svm, xe_svm,
 		 * drm_pagemap, ...).  Counts are aggregated per
 		 * (kstack, comm) so identical stacks from one command
 		 * collapse into a single entry for `print()`.
 		 */
 		@hrf_ebusy_stacks[kstack, comm] = count();
 	} else if ($err == -14) {	/* -EFAULT */
 		@hrf_efault++;
 	} else if ($err == -11) {	/* -EAGAIN (rare) */
 		@hrf_eagain++;
 	} else if ($err == -1) {	/* -EPERM */
 		@hrf_eperm++;
 	}

 	/* Release the per-tid entries written by the entry probe. */
 	delete(@hrf_ts[tid]);
 	delete(@hrf_npages[tid]);
 	delete(@hrf_flags[tid]);
 }

 /* ---------------------------------------------------------------------- */
 /*  Device-exclusive entries  (GPU atomic ops)                             */
 /* ---------------------------------------------------------------------- */

 /*
 * Matches both make_device_exclusive() and the older
 * make_device_exclusive_range() so the script works across kernel
 * versions.  Fires when a driver revokes CPU write access to host
 * pages so the GPU can perform atomics.
 */
 kprobe:make_device_exclusive*
 /pid == @filter_tgid || @filter_tgid == 0/
 {
 	/* One hit per call: global total plus per-(pid, comm) breakdown. */
 	@dev_exclusive_total++;
 	@dev_exclusive_by_proc[pid, comm] = count();
 }

 /*
 * restore_exclusive_pte() runs when the CPU touches an address whose
 * PTE was converted to a device-exclusive entry by the driver.  The
 * CPU access is trapped via do_swap_page() which calls
 * remove_device_exclusive_entry() -> restore_exclusive_pte().  Pairing
 * its counter with make_device_exclusive gives you the CPU/GPU atomic
 * contention signal (high restore rate = GPU atomics fighting CPU).
 *
 * restore_exclusive_pte is static; wildcard matches it if present,
 * silent if absent (older kernels without the split-out helper).
 */
 kprobe:restore_exclusive_pte*
 /pid == @filter_tgid || @filter_tgid == 0/
 {
 	/* CPU-side counterpart of the make_device_exclusive counters. */
 	@dev_exclusive_restore++;
 	@dev_exclusive_restore_by_proc[pid, comm] = count();
 }

 /* ---------------------------------------------------------------------- */
 /*  Device mirror-range registration / teardown                            */
 /*                                                                         */
 /*  Every HMM-using driver registers a per-process mmu_interval_notifier  */
 /*  covering the VA range it wants to mirror; counting these reveals HMM  */
 /*  driver lifecycle events (attach, detach, process fork).               */
 /* ---------------------------------------------------------------------- */

 kprobe:mmu_interval_notifier_insert
 /pid == @filter_tgid || @filter_tgid == 0/
 {
 	/* Mirror-range registration: total and per-(pid, comm) counts. */
 	@mmu_interval_insert++;
 	@mmu_interval_insert_by_proc[pid, comm] = count();
 }

 kprobe:mmu_interval_notifier_remove
 /pid == @filter_tgid || @filter_tgid == 0/
 {
 	/* Mirror-range teardown: total and per-(pid, comm) counts. */
 	@mmu_interval_remove++;
 	@mmu_interval_remove_by_proc[pid, comm] = count();
 }

 /* ---------------------------------------------------------------------- */
 /*  HMM DMA-mapping (P2P & direct device I/O)                              */
 /* ---------------------------------------------------------------------- */

 /* hmm_dma_map_pfn / _unmap_pfn are newer APIs (added ~2024) -- wildcard
 * so the script stays loadable on older kernels without them.
 *
 * Both probes apply the same tgid predicate as the other counting
 * probes in this script, so that with a tgid filter active the DMA
 * counters cover the same set of tasks as the hmm_range_fault and
 * migration counters they are printed next to.  Map and unmap are
 * filtered identically so their totals remain comparable.             */
 kprobe:hmm_dma_map_pfn*
 /@filter_tgid == 0 || pid == @filter_tgid/
 {
 	@hmm_dma_map++;
 }

 kprobe:hmm_dma_unmap_pfn*
 /@filter_tgid == 0 || pid == @filter_tgid/
 {
 	@hmm_dma_unmap++;
 }

 /* ---------------------------------------------------------------------- */
 /*  CPU-side fault-in path (complementary view to fault_page != NULL)     */
 /* ---------------------------------------------------------------------- */

 /* Fires for the THP fault-back path (device-private PMD entry).
 * Added with PMD-size device-private support; wildcarded so pre-support
 * kernels still load the script without this probe.                      */
 kprobe:do_huge_pmd_device_private*
 /pid == @filter_tgid || @filter_tgid == 0/
 {
 	/* THP fault-back hits: total and per-(pid, comm) counts. */
 	@cpu_fault_thp++;
 	@cpu_fault_thp_by_proc[pid, comm] = count();
 }

 /* ---------------------------------------------------------------------- */
 /*  MMU notifier  --  filtered to HMM-relevant events only                 */
 /* ---------------------------------------------------------------------- */

 /*
 * __mmu_notifier_invalidate_range_start is extremely hot (fires on every
 * munmap, mprotect, swap-out, fork, ...).  We must filter inline on
 * event in { MMU_NOTIFY_MIGRATE, MMU_NOTIFY_EXCLUSIVE } -- both originate
 * exclusively from HMM-related paths.
 *
 * NOTE: mmu_notifier_invalidate_range_start() is a static inline wrapper;
 * the real kprobable symbol is __mmu_notifier_invalidate_range_start().
 * This only fires for mm's with notifiers registered (GPU driver context),
 * so the overhead is well bounded.
 */
 kprobe:__mmu_notifier_invalidate_range_start
 {
 	$range = (struct mmu_notifier_range *)arg0;
 	$ev    = (uint64)$range->event;

 	/*
 	 * Cheapest test first: the event check rejects everything but
 	 * MIGRATE / EXCLUSIVE before the tgid filter or any further
 	 * struct reads are attempted.
 	 */
 	if ($ev == @MMU_NOTIFY_MIGRATE || $ev == @MMU_NOTIFY_EXCLUSIVE) {
 		if (@filter_tgid == 0 || pid == @filter_tgid) {
 			$npages = ($range->end - $range->start) >> 12;	/* 4K pages */

 			if ($ev == @MMU_NOTIFY_MIGRATE) {
 				@mmu_notify_migrate++;
 				@mmu_notify_migrate_pages = hist($npages);
 				@mmu_notify_migrate_by_proc[pid, comm] = count();
 			} else {
 				/* MMU_NOTIFY_EXCLUSIVE: no per-process breakdown kept */
 				@mmu_notify_exclusive++;
 				@mmu_notify_exclusive_pages = hist($npages);
 			}
 		}
 	}
 }

 /* ---------------------------------------------------------------------- */
 /*  Per-page PTE tracepoints                                               */
 /*                                                                         */
 /*  Asymmetric by kernel design, not a tool bug:                           */
 /*   * set_migration_pte fires only from try_to_migrate_one() in mm/rmap.c.*/
 /*     migrate_device_unmap() calls try_to_migrate() ONLY when a folio was */
 /*     still mapped at unmap time -- which is the case for already-device   */
 /*     pages (dev->cpu fault-back) but *not* for the cpu->device path, where */
 /*     migrate_vma_collect_pmd() replaces PTEs directly with set_pte_at(). */
 /*   * remove_migration_pte fires from remove_migration_ptes() in          */
 /*     mm/migrate.c which __migrate_device_finalize() invokes for every    */
 /*     migration, both directions.                                         */
 /*                                                                         */
 /*  Expect:  remove_migration_pte  >=  set_migration_pte.                  */
 /*  A large imbalance (set << remove) indicates heavy cpu->device traffic.  */
 /* ---------------------------------------------------------------------- */

 tracepoint:migrate:set_migration_pte
 /pid == @filter_tgid || @filter_tgid == 0/
 {
 	/* Tracepoint's order field, bucketed linearly from 0 to 10. */
 	$order = args->order;

 	@pte_set_migration++;
 	@pte_set_migration_order = lhist($order, 0, 10, 1);
 }

 tracepoint:migrate:remove_migration_pte
 /pid == @filter_tgid || @filter_tgid == 0/
 {
 	/* Tracepoint's order field, bucketed linearly from 0 to 10. */
 	$order = args->order;

 	@pte_remove_migration++;
 	@pte_remove_migration_order = lhist($order, 0, 10, 1);
 }

 /* ---------------------------------------------------------------------- */
 /*  Periodic summary                                                       */
 /* ---------------------------------------------------------------------- */

 interval:s:5
 {
 	/*
 	 * Periodic summary, emitted every 5 seconds.  Nothing is cleared
 	 * here, so every figure is cumulative since startup.  The order
 	 * of the statements below is the order of the report.
 	 */

 	/* Whole seconds since BEGIN recorded @start_ns. */
 	$elapsed = (nsecs - @start_ns) / 1000000000;

 	printf("\n=========================================================\n");
 	printf("  hmm-snoop summary at t=%ds", $elapsed);
 	if (@filter_tgid > 0) {
 		printf("  (tgid %d only)", @filter_tgid);
 	}
 	printf("\n=========================================================\n");

 	/* Maps keyed by migration class label. */
 	printf("\n-- Migration lifecycle --\n");
 	printf("%s\n", "migrate_vma_setup attempts:");
 	print(@mig_setup_by_class);
 	printf("%s\n", "migrate_vma_setup -> cpages (pages actually collected):");
 	print(@mig_setup_pages_collected_total);
 	printf("%s\n", "migrate_vma_setup return codes (0=ok):");
 	print(@mig_setup_ret);

 	/* Top-10 (pid, comm) entries per class. */
 	printf("\n-- Top processes -- cpu->dev prefetch --\n");
 	print(@mig_cpu2dev_by_proc, 10);
 	printf("\n-- Top processes -- dev->cpu fault-back --\n");
 	print(@mig_dev2cpu_flt_by_proc, 10);
 	printf("\n-- Top processes -- dev->cpu driver evict --\n");
 	print(@mig_dev2cpu_drv_by_proc, 10);

 	printf("\n-- End-to-end latency (setup->finalize, microseconds) --\n");
 	print(@mig_e2e_us);

 	printf("\n-- Partial / unmigratable (cpages < npages requested) --\n");
 	print(@mig_partial);
 	print(@mig_unmigratable_pages);

 	/*
 	 * The three printf calls below form ONE output line: only the
 	 * last format string ends in a newline.
 	 */
 	printf("\n-- Device mirror (hmm_range_fault) --\n");
 	printf("  total calls: %d", @hrf_total);
 	printf("  -EBUSY (invalidation race): %d", @hrf_ebusy);
 	printf("  -EFAULT: %d   -EAGAIN: %d   -EPERM: %d\n",
 		@hrf_efault, @hrf_eagain, @hrf_eperm);
 	print(@hrf_latency_us);

 	/* Everything from here on is a plain scalar counter. */
 	printf("\n-- MMU notifier events (HMM-scoped) --\n");
 	printf("  MMU_NOTIFY_MIGRATE:   %d\n", @mmu_notify_migrate);
 	printf("  MMU_NOTIFY_EXCLUSIVE: %d\n", @mmu_notify_exclusive);

 	printf("\n-- Per-page PTE traffic --\n");
 	printf("  set_migration_pte:    %d\n", @pte_set_migration);
 	printf("  remove_migration_pte: %d\n", @pte_remove_migration);

 	printf("\n-- Device-exclusive / DMA --\n");
 	printf("  make_device_exclusive:   %d\n", @dev_exclusive_total);
 	printf("  restore_exclusive_pte:   %d  (CPU accesses back-claiming)\n",
 		@dev_exclusive_restore);
 	printf("  hmm_dma_map_pfn:         %d\n", @hmm_dma_map);
 	printf("  hmm_dma_unmap_pfn:       %d\n", @hmm_dma_unmap);

 	printf("\n-- HMM driver lifecycle --\n");
 	printf("  mmu_interval_notifier_insert: %d\n", @mmu_interval_insert);
 	printf("  mmu_interval_notifier_remove: %d\n", @mmu_interval_remove);

 	printf("\n-- Bulk teardown --\n");
 	printf("  migrate_device_range: %d   migrate_device_pfns: %d\n",
 		@bulk_teardown_range, @bulk_teardown_pfns);
 }

 /*
 * Auto-exit after the requested duration.  Run frequently so the exit
 * latency is bounded to ~1s.  Use a separate 1s probe (rather than
 * re-using the 5s summary interval) so "run for 10s" actually exits
 * at ~10s instead of the next 5s summary tick.
 */
 interval:s:1
 /@duration_s != 0/
 {
 	/* Whole seconds since BEGIN; leave once the budget is used up. */
 	$ran_s = (nsecs - @start_ns) / 1000000000;

 	if ($ran_s >= @duration_s) {
 		exit();
 	}
 }

 /* ---------------------------------------------------------------------- */
 /*  Final report on exit                                                   */
 /* ---------------------------------------------------------------------- */

 END
 {
 	/*
 	 * Final report.  Two phases:
 	 *   1. print every map explicitly, under a human-readable
 	 *      heading, in the order of the statements below;
 	 *   2. clear() the maps that were printed (plus scratch and
 	 *      constant maps) so they are not printed a second time when
 	 *      bpftrace dumps the remaining maps at shutdown.
 	 * Scalar counters are intentionally left for that final dump.
 	 */
 	printf("\n\n");
 	printf("=========================================================\n");
 	printf("  hmm-snoop final report\n");
 	printf("=========================================================\n");

 	/* ---- migration lifecycle, keyed by class label ---- */
 	printf("\n--- Migration attempts by class ---\n");
 	print(@mig_setup_by_class);

 	printf("\n--- Pages successfully collected (cpages) by class ---\n");
 	print(@mig_setup_pages_collected);
 	print(@mig_setup_pages_collected_total);

 	printf("\n--- Pages requested (npages) by class ---\n");
 	print(@mig_setup_pages_requested);

 	printf("\n--- End-to-end migration latency (setup->finalize, us) ---\n");
 	print(@mig_e2e_us);

 	printf("\n--- Setup->pages phase latency (us) ---\n");
 	print(@mig_pages_phase_us);

 	printf("\n--- Pages->finalize phase latency (us) ---\n");
 	print(@mig_finalize_phase_us);

 	printf("\n--- Pages successfully migrated at finalize (cpages sum) ---\n");
 	print(@mig_finalize_cpages);

 	printf("\n--- Compound (PMD-order) migrations ---\n");
 	print(@mig_compound);

 	printf("\n--- Migrations where cpages<npages (pinned pages) ---\n");
 	print(@mig_partial);
 	print(@mig_unmigratable_pages);

 	printf("\n--- Failed migrate_vma_setup() by class ---\n");
 	print(@mig_setup_failed);

 	/* ---- per-process views, top 40 (pid, comm) entries each ---- */
 	printf("\n--- Top processes: cpu->dev prefetch starts ---\n");
 	print(@mig_cpu2dev_by_proc, 40);

 	printf("\n--- Top processes: dev->cpu fault-back starts ---\n");
 	print(@mig_dev2cpu_flt_by_proc, 40);

 	printf("\n--- Top processes: dev->cpu driver-evict starts ---\n");
 	print(@mig_dev2cpu_drv_by_proc, 40);

 	printf("\n--- Top processes: cpu->dev cpages actually moved ---\n");
 	print(@mig_cpu2dev_cpages_by_proc, 40);

 	printf("\n--- Top processes: dev->cpu fault-back cpages moved ---\n");
 	print(@mig_dev2cpu_flt_cpages_by_proc, 40);

 	printf("\n--- Top processes: dev->cpu driver-evict cpages moved ---\n");
 	print(@mig_dev2cpu_drv_cpages_by_proc, 40);

 	/* ---- hmm_range_fault ---- */
 	printf("\n--- hmm_range_fault (device mirror refresh) ---\n");
 	printf("  calls:  %d\n", @hrf_total);
 	printf("  write-enabled:  %d\n", @hrf_write);
 	printf("  -EBUSY (inval race, thrash hint): %d\n", @hrf_ebusy);
 	printf("  -EFAULT: %d   -EAGAIN: %d   -EPERM: %d\n",
 		@hrf_efault, @hrf_eagain, @hrf_eperm);
 	print(@hrf_ret);
 	print(@hrf_npages_hist);
 	print(@hrf_latency_us);

 	printf("\n--- hmm_range_fault top callers ---\n");
 	print(@hrf_by_proc, 20);

 	printf("\n--- hmm_range_fault EBUSY top processes ---\n");
 	print(@hrf_ebusy_by_proc, 20);

 	printf("\n--- hmm_range_fault EBUSY top kernel stacks ---\n");
 	printf("  (identifies which driver is racing the CPU)\n");
 	print(@hrf_ebusy_stacks, 10);

 	/* ---- CPU fault path, notifier events, PTE tracepoints ---- */
 	printf("\n--- CPU-side fault paths ---\n");
 	printf("  do_huge_pmd_device_private: %d\n", @cpu_fault_thp);
 	print(@cpu_fault_thp_by_proc, 10);

 	printf("\n--- MMU notifier (HMM-scoped) ---\n");
 	printf("  MMU_NOTIFY_MIGRATE:   %d\n", @mmu_notify_migrate);
 	printf("  MMU_NOTIFY_EXCLUSIVE: %d\n", @mmu_notify_exclusive);
 	print(@mmu_notify_migrate_pages);
 	print(@mmu_notify_exclusive_pages);
 	print(@mmu_notify_migrate_by_proc, 10);

 	printf("\n--- Per-page PTE traffic (tracepoint-driven) ---\n");
 	printf("  set_migration_pte:    %d\n", @pte_set_migration);
 	printf("  remove_migration_pte: %d\n", @pte_remove_migration);
 	printf("  Note: set_migration_pte fires only for already-mapped folios\n");
 	printf("  (typically dev->cpu paths). cpu->dev migrations replace PTEs\n");
 	printf("  directly in migrate_vma_collect_pmd() without try_to_migrate,\n");
 	printf("  so expect  remove >= set  during heavy prefetch activity.\n");
 	print(@pte_set_migration_order);
 	print(@pte_remove_migration_order);

 	/* ---- device-exclusive, driver lifecycle, per-device, DMA, teardown ---- */
 	printf("\n--- Device-exclusive ---\n");
 	printf("  make_device_exclusive total:      %d\n", @dev_exclusive_total);
 	printf("  restore_exclusive_pte total:      %d  (CPU back-claim)\n",
 		@dev_exclusive_restore);
 	print(@dev_exclusive_by_proc, 10);
 	print(@dev_exclusive_restore_by_proc, 10);

 	printf("\n--- HMM driver lifecycle ---\n");
 	printf("  mmu_interval_notifier_insert: %d\n", @mmu_interval_insert);
 	printf("  mmu_interval_notifier_remove: %d\n", @mmu_interval_remove);
 	print(@mmu_interval_insert_by_proc, 10);
 	print(@mmu_interval_remove_by_proc, 10);

 	printf("\n--- Migrations by pgmap_owner (driver/device dimension) ---\n");
 	printf("  pgmap_owner values are kernel pointers to the driver's\n");
 	printf("  device identity: amdkfd mdevice, drm_pagemap->owner,\n");
 	printf("  nouveau drm->dev, kvmppc_uvmem_pgmap, etc.\n");
 	printf("  Cross-reference against /proc/kallsyms or kernel logs.\n");
 	printf("\n  cpu->dev prefetch:\n");
 	print(@mig_cpu2dev_by_device);
 	printf("\n  dev->cpu fault-back:\n");
 	print(@mig_dev2cpu_flt_by_device);
 	printf("\n  dev->cpu driver evict:\n");
 	print(@mig_dev2cpu_drv_by_device);

 	printf("\n--- HMM DMA mapping ---\n");
 	printf("  hmm_dma_map_pfn:   %d\n", @hmm_dma_map);
 	printf("  hmm_dma_unmap_pfn: %d\n", @hmm_dma_unmap);

 	printf("\n--- Bulk teardown ---\n");
 	printf("  migrate_device_range: %d\n", @bulk_teardown_range);
 	printf("  migrate_device_pfns:  %d\n", @bulk_teardown_pfns);
 	print(@bulk_teardown_pages);
 	print(@bulk_teardown_by_proc, 10);

 	/*
 	 * Clear every map we printed explicitly so bpftrace's auto-
 	 * print at shutdown does not emit duplicates.  Also clear the
 	 * scratch / constant maps that would otherwise appear.
 	 */

 	/*
 	 * scratch (per-migration state keyed by migrate_vma pointer, and
 	 * per-tid state for in-flight setup / hmm_range_fault calls)
 	 */
 	clear(@mig_start_ns);   clear(@mig_class);
 	clear(@mig_tgid);       clear(@mig_comm);
 	clear(@mig_flags);      clear(@mig_npages);
 	clear(@mig_at_pages_ns);
 	clear(@mig_fault_stash);
 	clear(@setup_arg0);
 	clear(@hrf_ts);         clear(@hrf_npages);  clear(@hrf_flags);

 	/*
 	 * Scalar counters: we deliberately DO NOT clear() these.  The
 	 * bpftrace auto-print at END emits them as `@name: value` lines
 	 * which is the machine-readable form required by hmm-snoop-plot.py.
 	 * The human-readable "  MMU_NOTIFY_MIGRATE: N" printfs above
 	 * already exist for eyeball review; having both in the output
 	 * is a deliberate convenience.
 	 */

 	/* aggregations (count / sum / hist / lhist maps printed above) */
 	clear(@mig_setup_by_class);
 	clear(@mig_setup_pages_collected);
 	clear(@mig_setup_pages_collected_total);
 	clear(@mig_setup_pages_requested);
 	clear(@mig_setup_ret);
 	clear(@mig_cpu2dev_by_proc);
 	clear(@mig_dev2cpu_flt_by_proc);
 	clear(@mig_dev2cpu_drv_by_proc);
 	clear(@mig_cpu2dev_cpages_by_proc);
 	clear(@mig_dev2cpu_flt_cpages_by_proc);
 	clear(@mig_dev2cpu_drv_cpages_by_proc);
 	clear(@mig_finalize_cpages);
 	clear(@mig_setup_failed);
 	clear(@mig_partial);           clear(@mig_unmigratable_pages);
 	clear(@mig_compound);
 	clear(@mig_e2e_us);            clear(@mig_pages_phase_us);
 	clear(@mig_finalize_phase_us);
 	clear(@hrf_npages_hist);       clear(@hrf_latency_us);
 	clear(@hrf_ret);               clear(@hrf_by_proc);
 	clear(@hrf_ebusy_by_proc);
 	clear(@cpu_fault_thp_by_proc);
 	clear(@mmu_notify_migrate_pages);
 	clear(@mmu_notify_exclusive_pages);
 	clear(@mmu_notify_migrate_by_proc);
 	clear(@pte_set_migration_order);
 	clear(@pte_remove_migration_order);
 	clear(@dev_exclusive_by_proc);
 	clear(@dev_exclusive_restore_by_proc);
 	clear(@mmu_interval_insert_by_proc);
 	clear(@mmu_interval_remove_by_proc);
 	clear(@mig_cpu2dev_by_device);
 	clear(@mig_dev2cpu_flt_by_device);
 	clear(@mig_dev2cpu_drv_by_device);
 	clear(@hrf_ebusy_stacks);
 	clear(@bulk_teardown_pages);   clear(@bulk_teardown_by_proc);

 	/* constants / config (all assigned in BEGIN) */
 	clear(@filter_tgid);    clear(@duration_s);   clear(@start_ns);
 	clear(@MVMA_SYSTEM);    clear(@MVMA_DEVPRIV);
 	clear(@MVMA_DEVCOH);    clear(@MVMA_COMPOUND);
 	clear(@MMU_NOTIFY_MIGRATE);  clear(@MMU_NOTIFY_EXCLUSIVE);
 	clear(@C_CPU2DEV);      clear(@C_DEV2CPU_FLT);
 	clear(@C_DEV2CPU_DRV);  clear(@C_UNKNOWN);
 }
No results found