Created
April 21, 2026 00:39
-
-
Save davidlohr/08923e5148ad0ddae16b4a01e7a6d1dc to your computer and use it in GitHub Desktop.
Linux HMM snooping tool
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bpftrace | |
| /* | |
| * hmm-snoop.bt -- comprehensive observer for Linux HMM / CPU-GPU memory | |
| * | |
| * Reports all CPU<->device memory interactions that flow through the | |
| * kernel's HMM (Heterogeneous Memory Management) APIs, without any driver | |
| * cooperation. Works transparently for any driver that uses the standard | |
| * migrate_vma_* / hmm_range_fault interface: amdgpu/amdkfd, nouveau, | |
| * drm/xe (via drm_pagemap), and lib/test_hmm. No kernel patches required. | |
| * | |
| * WHAT IS CAPTURED | |
| * ---------------- | |
| * | |
| * Migration lifecycle (migrate_vma_setup -> pages -> finalize): | |
| * * CPU -> device (driver prefetch): flags & SELECT_SYSTEM, no fault_page | |
| * * device -> CPU (fault-back): fault_page != NULL <-- thrash signal | |
| * * device -> CPU (driver evict): flags & SELECT_DEVICE_*, no fault_page | |
| * * compound / PMD-order migrations: flags & MIGRATE_VMA_SELECT_COMPOUND | |
| * | |
| * Per-page PTE traffic (via existing tracepoints): | |
| * * set_migration_pte -- PTE turned into a migration entry | |
| * * remove_migration_pte -- migration entry restored to present/swap | |
| * | |
| * Bulk device-memory teardown (driver unbind): | |
| * * migrate_device_range / migrate_device_pfns | |
| * | |
| * Device mirror refresh (hmm_range_fault): | |
| * * call count, EBUSY rate (invalidation races = CPU contention) | |
| * * EAGAIN / EFAULT / EPERM error breakdown | |
| * | |
| * Device-exclusive entries (GPU atomic ops): | |
| * * make_device_exclusive (driver side: revokes CPU write access) | |
| * * restore_exclusive_pte (CPU side: reclaims access, contention | |
| * signal when ratio ~= 1:1) | |
| * | |
| * Coherent P2P DMA mapping: | |
| * * hmm_dma_map_pfn / hmm_dma_unmap_pfn | |
| * | |
| * HMM driver lifecycle: | |
| * * mmu_interval_notifier_insert / _remove | |
| * (reveals driver attach / detach per process) | |
| * | |
| * Per-device attribution: | |
| * * pgmap_owner dimension distinguishes drivers/GPUs in a | |
| * multi-device system (amdkfd mdevice, drm_pagemap owner, | |
| * nouveau drm->dev, kvmppc_uvmem_pgmap) | |
| * | |
| * EBUSY stack capture: | |
| * * Kernel stacks are recorded for every hmm_range_fault -EBUSY | |
| * return so you can see WHICH driver is racing the CPU. | |
| * | |
| * MMU notifier invalidations filtered to HMM-relevant events only: | |
| * * MMU_NOTIFY_MIGRATE (6) and MMU_NOTIFY_EXCLUSIVE (7) | |
| * * (other events -- CLEAR, UNMAP etc. -- are far too hot to track) | |
| * | |
| * WHAT THE OUTPUT TELLS YOU | |
| * ------------------------- | |
| * | |
| * Setup: number of migration attempts | |
| * Collected (cpages): pages successfully unmapped & ready to copy | |
| * cpages < npages: some pages pinned / unmovable (LONGTERM) | |
| * Fault-back events: CPU touched GPU-resident data -> thrashing | |
| * hmm_range_fault -EBUSY: device mirror raced a CPU PTE mutation | |
| * High setup-to-finalize | |
| * latency + low cpages: driver allocation/copy is the bottleneck | |
| * Compound migrations: THP-granularity transfers (good!) | |
| * MMU_NOTIFY_MIGRATE >> | |
| * actual migrations: migration is being invoked but falling back | |
| * | |
| * USAGE | |
| * ----- | |
| * | |
| * sudo bpftrace hmm-snoop.bt # system-wide, until Ctrl-C | |
| * sudo bpftrace hmm-snoop.bt 1234 # filter to tgid 1234 | |
| * sudo bpftrace hmm-snoop.bt 0 30 # all tgids, auto-exit after 30s | |
| * sudo bpftrace hmm-snoop.bt 1234 60 # tgid 1234, auto-exit after 60s | |
| * | |
| * # Run a specific workload and trace only its lifetime; bpftrace | |
| * # launches the child and stops tracing when it exits: | |
| * sudo bpftrace hmm-snoop.bt -c './my_gpu_app --flag' | |
| * | |
| * # Validate the tool path on a dev box by exercising lib/test_hmm: | |
| * sudo bpftrace hmm-snoop.bt -c './tools/testing/selftests/mm/hmm-tests' | |
| * | |
| * Positional args: $1 = tgid filter (0 = all, default 0) | |
| * $2 = total seconds before auto-exit (0 = until Ctrl-C, default 0) | |
| * The periodic summary interval is 5 seconds (edit `interval:s:5` to change). | |
| * | |
| * REQUIREMENTS | |
| * ------------ | |
| * | |
| * bpftrace >= 0.14 (struct field access & BTF) | |
| * Kernel >= 5.14 with CONFIG_DEBUG_INFO_BTF=y | |
| * CONFIG_HMM_MIRROR=y | |
| * CONFIG_DEVICE_PRIVATE=y (or =m) for fault-back probes | |
| * CONFIG_KPROBES=y CONFIG_BPF_EVENTS=y | |
| * root privileges. | |
| * | |
| * CAVEATS | |
| * ------- | |
| * | |
| * * The canonical, driver-agnostic fault-back signal is | |
| * "migrate_vma_setup with args->fault_page != NULL". We also kprobe | |
| * do_huge_pmd_device_private() as a complementary view of the THP | |
| * fault-back path. | |
| * * Only MMU_NOTIFY_MIGRATE and MMU_NOTIFY_EXCLUSIVE events are tracked; | |
| * other mmu_notifier events (CLEAR/UNMAP/...) fire on every munmap and | |
| * would drown the output. The inline filter guarantees near-zero cost | |
| * for unrelated invalidations. | |
| * * Counters whose probe failed to attach will stay at zero -- bpftrace | |
| * prints a warning at startup for each missing symbol. Check the | |
| * startup "probe attached" lines if a column is unexpectedly empty. | |
| * * PAGE_SHIFT is hardcoded to 12 (4K pages) in span calculations. On | |
| * architectures with larger base pages, edit ">>12" to ">>16" (64K). | |
| */ | |
/*
 * BEGIN -- one-time initialisation.
 *
 * Reads the positional parameters, records the start timestamp, publishes
 * the kernel constants and classification labels as scalar maps (so every
 * probe can reference them by name), and prints the startup banner.
 */
BEGIN
{
    /*
     * $1 = tgid filter (0 = trace every process)
     * $2 = total run time in seconds (0 = run until Ctrl-C)
     */
    @filter_tgid = (uint64)$1;
    @duration_s = (uint64)$2;
    @start_ns = nsecs;

    /* Mirror of enum migrate_vma_direction (include/linux/migrate.h) */
    @MVMA_SYSTEM = (uint64)1;    /* MIGRATE_VMA_SELECT_SYSTEM */
    @MVMA_DEVPRIV = (uint64)2;   /* MIGRATE_VMA_SELECT_DEVICE_PRIVATE */
    @MVMA_DEVCOH = (uint64)4;    /* MIGRATE_VMA_SELECT_DEVICE_COHERENT */
    @MVMA_COMPOUND = (uint64)8;  /* MIGRATE_VMA_SELECT_COMPOUND */

    /* Mirror of enum mmu_notifier_event (include/linux/mmu_notifier.h) */
    @MMU_NOTIFY_MIGRATE = (uint64)6;
    @MMU_NOTIFY_EXCLUSIVE = (uint64)7;

    /*
     * Classification labels (used as map keys for readability).
     *
     * All four labels are padded with trailing spaces to the same width
     * (23 characters).  The setup probe assigns different labels to one
     * scratch variable; keeping them the same size gives them the same
     * string type, which avoids string-size type mismatches on older
     * bpftrace releases and keeps the per-class output columns aligned.
     */
    @C_CPU2DEV     = "cpu->dev (prefetch)    ";
    @C_DEV2CPU_FLT = "dev->cpu (fault-back)  ";
    @C_DEV2CPU_DRV = "dev->cpu (driver evict)";
    @C_UNKNOWN     = "unclassified           ";

    /* Startup banner: the optional parts depend on the parameters given. */
    printf("\nhmm-snoop: tracing HMM / CPU-GPU memory activity");
    if (@filter_tgid > 0) {
        printf(" for tgid %d", @filter_tgid);
    }
    printf(", interval=5s");
    if (@duration_s > 0) {
        printf(", duration=%ds", @duration_s);
    }
    printf("\nCtrl-C to exit, summary prints on exit.\n\n");
}
/* ---------------------------------------------------------------------- */
/* migrate_vma_setup() -- the canonical entry for every HMM migration     */
/* ---------------------------------------------------------------------- */

/*
 * Entry probe.  The migration is classified here, from the selection
 * flags and fault_page, because both are setup-time state:
 *
 *   fault_page != NULL                      -> dev->cpu (fault-back)
 *   flags & SELECT_SYSTEM                   -> cpu->dev (prefetch)
 *   flags & (DEVICE_PRIVATE|DEVICE_COHERENT)-> dev->cpu (driver evict)
 *   anything else                           -> unclassified
 *
 * Two kinds of state are stashed:
 *
 *   - per-pointer maps keyed on arg0 (the struct migrate_vma *), so the
 *     data survives setup -> pages -> finalize on the same struct;
 *   - @setup_arg0[tid], a per-thread scratch slot.  arg0 is NOT
 *     accessible from a kretprobe in bpftrace (only retval is), so the
 *     return probe recovers the pointer from here.  The kernel holds
 *     mmap_lock across the setup->finalize lifecycle, so one thread only
 *     ever has a single migrate_vma_setup() call in flight.
 *
 * The page count assumes 4K pages (shift by 12).
 */
kprobe:migrate_vma_setup
/@filter_tgid == 0 || @filter_tgid == pid/
{
    $mv = (struct migrate_vma *)arg0;
    $sel = $mv->flags;
    $fpage = (uint64)$mv->fault_page;
    $owner = (uint64)$mv->pgmap_owner;
    $want = ($mv->end - $mv->start) >> 12;

    /*
     * Classify and bump the per-class attribution maps in one pass.
     *
     * The per-process and per-device (pgmap_owner) counters are three
     * separate maps per dimension instead of one map keyed on
     * (class_str, pid, comm): some bpftrace 0.1x versions store strings
     * inside tuple keys unreliably (garbage bytes or empty), and split
     * maps also give cleaner per-class output.
     */
    $class = @C_UNKNOWN;
    if ($fpage != 0) {
        /* Reached via the driver's migrate_to_ram callback. */
        $class = @C_DEV2CPU_FLT;
        @mig_dev2cpu_flt_by_proc[pid, comm] = count();
        @mig_dev2cpu_flt_by_device[$owner] = count();
    } else if ($sel & @MVMA_SYSTEM) {
        $class = @C_CPU2DEV;
        @mig_cpu2dev_by_proc[pid, comm] = count();
        @mig_cpu2dev_by_device[$owner] = count();
    } else if ($sel & (@MVMA_DEVPRIV | @MVMA_DEVCOH)) {
        $class = @C_DEV2CPU_DRV;
        @mig_dev2cpu_drv_by_proc[pid, comm] = count();
        @mig_dev2cpu_drv_by_device[$owner] = count();
    }

    /* Global and per-class totals. */
    @mig_setup_total++;
    @mig_setup_by_class[$class] = count();
    @mig_setup_pages_requested[$class] = hist($want);

    /* Per-pointer lifecycle state, consumed by the later phases. */
    @mig_start_ns[arg0] = nsecs;
    @mig_class[arg0] = $class;
    @mig_tgid[arg0] = pid;
    @mig_comm[arg0] = comm;
    @mig_flags[arg0] = $sel;
    @mig_npages[arg0] = $want;
    @mig_fault_stash[arg0] = $fpage;

    /* Per-thread scratch so the kretprobe can find the struct again. */
    @setup_arg0[tid] = arg0;
}
/*
 * Return probe for migrate_vma_setup().
 *
 * By now cpages has been populated by the collect/unmap step, so the
 * number of pages actually collected can be compared with the number
 * requested at entry.  When setup fails (non-zero return) the
 * per-pointer state is dropped right away, because no finalize will
 * follow -- the caller unwinds on error.
 *
 * Only fires for threads whose entry probe stashed a pointer in
 * @setup_arg0[tid]; that slot is consumed here.
 */
kretprobe:migrate_vma_setup
/@setup_arg0[tid]/
{
    $key = @setup_arg0[tid];
    $mv = (struct migrate_vma *)$key;
    $got = $mv->cpages;
    $class = @mig_class[$key];
    $want = @mig_npages[$key];
    $sel = @mig_flags[$key];
    delete(@setup_arg0[tid]);

    /*
     * The function returns a 32-bit int but retval is 64 bits wide;
     * casting through int32 makes negative errno values come out
     * negative instead of as large positive numbers.
     */
    $rc = (int32)retval;
    @mig_setup_ret[$rc] = count();

    if ($rc == 0) {
        @mig_setup_pages_collected[$class] = hist($got);
        @mig_setup_pages_collected_total[$class] = sum($got);

        /* Unmigratable ratio: requested vs. actually unmapped pages. */
        if ($want > 0 && $got < $want) {
            @mig_partial[$class] = count();
            @mig_unmigratable_pages[$class] = sum($want - $got);
        }

        if ($sel & @MVMA_COMPOUND) {
            @mig_compound[$class] = count();
        }
    } else {
        /* Failed setup: count it and forget the in-flight migration. */
        @mig_setup_failed[$class] = count();
        delete(@mig_start_ns[$key]);
        delete(@mig_class[$key]);
        delete(@mig_tgid[$key]);
        delete(@mig_comm[$key]);
        delete(@mig_flags[$key]);
        delete(@mig_npages[$key]);
        delete(@mig_fault_stash[$key]);
    }
}
/*
 * migrate_vma_pages() -- the copy-commit phase.
 *
 * Records the setup->pages latency per class and timestamps the call so
 * the finalize probe can report the pages->finalize phase separately.
 * Splitting the two lets users tell driver alloc/copy time (between
 * setup and pages) apart from finalize overhead (after pages).
 *
 * Only fires for migrations whose setup was observed (and passed the
 * tgid filter), i.e. those with an entry in @mig_start_ns.
 */
kprobe:migrate_vma_pages
/@mig_start_ns[arg0]/
{
    $class = @mig_class[arg0];
    $since_setup_us = (nsecs - @mig_start_ns[arg0]) / 1000;

    @mig_pages_phase_us[$class] = hist($since_setup_us);
    @mig_at_pages_ns[arg0] = nsecs;
}
/*
 * migrate_vma_finalize() -- the end of the lifecycle.
 *
 * Computes the end-to-end latency (setup->finalize), the pages->finalize
 * phase latency when migrate_vma_pages() was seen, and records the final
 * cpages value (some pages may have had MIGRATE_PFN_MIGRATE cleared by
 * __migrate_device_pages).  All per-pointer state for this migration is
 * released at the end.
 *
 * Only fires for migrations whose setup was observed.
 */
kprobe:migrate_vma_finalize
/@mig_start_ns[arg0]/
{
    $mv = (struct migrate_vma *)arg0;
    $class = @mig_class[arg0];
    $moved = $mv->cpages;

    $e2e_us = (nsecs - @mig_start_ns[arg0]) / 1000;
    @mig_e2e_us[$class] = hist($e2e_us);

    if (@mig_at_pages_ns[arg0]) {
        @mig_finalize_phase_us[$class] =
            hist((nsecs - @mig_at_pages_ns[arg0]) / 1000);
    }

    @mig_finalize_total++;
    @mig_finalize_cpages[$class] = sum($moved);

    /*
     * Per-class "cpages moved" attribution.  Split into one map per
     * class to avoid string-in-tuple keys (unreliable on some bpftrace
     * versions).  Dispatch uses the fault_page and flags stashed at
     * setup time, with the same precedence as the setup probe.
     */
    $sel = @mig_flags[arg0];
    $owner_tgid = @mig_tgid[arg0];
    $owner_comm = @mig_comm[arg0];
    if (@mig_fault_stash[arg0] != 0) {
        @mig_dev2cpu_flt_cpages_by_proc[$owner_tgid, $owner_comm] =
            sum($moved);
    } else if ($sel & @MVMA_SYSTEM) {
        @mig_cpu2dev_cpages_by_proc[$owner_tgid, $owner_comm] =
            sum($moved);
    } else if ($sel & (@MVMA_DEVPRIV | @MVMA_DEVCOH)) {
        @mig_dev2cpu_drv_cpages_by_proc[$owner_tgid, $owner_comm] =
            sum($moved);
    }

    /* Lifecycle complete: drop every per-pointer entry. */
    delete(@mig_start_ns[arg0]);
    delete(@mig_class[arg0]);
    delete(@mig_tgid[arg0]);
    delete(@mig_comm[arg0]);
    delete(@mig_flags[arg0]);
    delete(@mig_npages[arg0]);
    delete(@mig_at_pages_ns[arg0]);
    delete(@mig_fault_stash[arg0]);
}
/* ---------------------------------------------------------------------- */
/* Bulk device-memory teardown (driver unbind / chunk reclaim)            */
/* ---------------------------------------------------------------------- */

/*
 * migrate_device_range(): count the call, attribute it to the calling
 * process, and histogram arg2, which this script treats as the number
 * of pages being torn down.
 */
kprobe:migrate_device_range
/@filter_tgid == 0 || @filter_tgid == pid/
{
    $pages = (uint64)arg2;

    @bulk_teardown_range++;
    @bulk_teardown_by_proc[pid, comm] = count();
    @bulk_teardown_pages = hist($pages);
}
/*
 * migrate_device_pfns() was added circa 2024; the attach point is a
 * wildcard so the script still loads on kernels that predate it.
 * Shares the page histogram and per-process map with the
 * migrate_device_range probe; here the page count is taken from arg1.
 */
kprobe:migrate_device_pfns*
/@filter_tgid == 0 || @filter_tgid == pid/
{
    $pages = (uint64)arg1;

    @bulk_teardown_pfns++;
    @bulk_teardown_by_proc[pid, comm] = count();
    @bulk_teardown_pages = hist($pages);
}
/* ---------------------------------------------------------------------- */
/* hmm_range_fault() -- device asks for a CPU mapping snapshot            */
/* ---------------------------------------------------------------------- */

/*
 * Entry probe: counts the call, records the requested span in pages
 * (4K pages assumed), attributes it to the calling process, and stashes
 * a per-thread timestamp (plus span and flags) for the return probe.
 */
kprobe:hmm_range_fault
/@filter_tgid == 0 || @filter_tgid == pid/
{
    $range = (struct hmm_range *)arg0;
    $req = $range->default_flags;
    $span = ($range->end - $range->start) >> 12;

    /* Per-thread scratch, released by the kretprobe. */
    @hrf_ts[tid] = nsecs;
    @hrf_npages[tid] = $span;
    @hrf_flags[tid] = $req;

    @hrf_total++;
    @hrf_npages_hist = hist($span);
    @hrf_by_proc[pid, comm] = count();

    /*
     * HMM_PFN_REQ_WRITE == HMM_PFN_WRITE == 1UL << (BITS_PER_LONG - 2),
     * i.e. bit 62 on 64-bit.  When set in ->default_flags the caller is
     * asking for a write-enabled mirror rather than a read-only one.
     */
    if ($req & ((uint64)1 << 62)) {
        @hrf_write++;
    }
}
/*
 * Return probe for hmm_range_fault(): records call latency, the return
 * code distribution, and a per-errno breakdown.  Only fires for threads
 * whose entry was observed (those with a timestamp in @hrf_ts).
 */
kretprobe:hmm_range_fault
/@hrf_ts[tid]/
{
    $took_us = (nsecs - @hrf_ts[tid]) / 1000;

    /*
     * hmm_range_fault() returns a signed 32-bit int, while bpftrace's
     * retval is a uint64 holding the low 32 bits zero-extended.  A plain
     * (int64) cast therefore does NOT sign-extend (-16 would read as
     * 4294967280); casting through int32 does.
     */
    $rc = (int32)retval;

    @hrf_latency_us = hist($took_us);
    @hrf_ret[$rc] = count();

    if ($rc == -16) {
        /* -EBUSY: the mirror lost a race with an invalidation. */
        @hrf_ebusy++;
        @hrf_ebusy_by_proc[pid, comm] = count();
        /*
         * Keep a kernel stack for each EBUSY return: these are the
         * thrash/race events where the device mirror lost a race with a
         * concurrent CPU PTE mutation, and the stack shows WHICH driver
         * (nouveau_dmem, kfd_svm, xe_svm, drm_pagemap, ...) was the
         * caller.  Counting per (kstack, comm) lets print() show the
         * top offenders.
         */
        @hrf_ebusy_stacks[kstack, comm] = count();
    } else if ($rc == -14) {
        /* -EFAULT */
        @hrf_efault++;
    } else if ($rc == -11) {
        /* -EAGAIN (rare) */
        @hrf_eagain++;
    } else if ($rc == -1) {
        /* -EPERM */
        @hrf_eperm++;
    }

    /* Release the per-thread scratch set by the entry probe. */
    delete(@hrf_ts[tid]);
    delete(@hrf_npages[tid]);
    delete(@hrf_flags[tid]);
}
/* ---------------------------------------------------------------------- */
/* Device-exclusive entries (GPU atomic ops)                              */
/* ---------------------------------------------------------------------- */

/*
 * Fires when a driver revokes CPU write access to host pages so the GPU
 * can perform atomics.  The wildcard matches both make_device_exclusive()
 * and the older make_device_exclusive_range(), so the script works
 * across kernel versions.
 */
kprobe:make_device_exclusive*
/@filter_tgid == 0 || @filter_tgid == pid/
{
    @dev_exclusive_by_proc[pid, comm] = count();
    @dev_exclusive_total++;
}
/*
 * restore_exclusive_pte() runs when the CPU touches an address whose PTE
 * the driver converted to a device-exclusive entry.  The CPU access
 * traps through do_swap_page(), which calls
 * remove_device_exclusive_entry() -> restore_exclusive_pte().
 *
 * Compare this counter with the make_device_exclusive one for the
 * CPU/GPU atomic contention signal: a high restore rate means GPU
 * atomics are fighting CPU accesses.
 *
 * The function is static; the wildcard attaches if the symbol exists
 * and is silent otherwise (older kernels without the split-out helper).
 */
kprobe:restore_exclusive_pte*
/@filter_tgid == 0 || @filter_tgid == pid/
{
    @dev_exclusive_restore_by_proc[pid, comm] = count();
    @dev_exclusive_restore++;
}
/* ---------------------------------------------------------------------- */
/* Device mirror-range registration / teardown                            */
/*                                                                        */
/* Every HMM-using driver registers a per-process mmu_interval_notifier   */
/* covering the VA range it wants to mirror; counting these reveals HMM   */
/* driver lifecycle events (attach, detach, process fork).                */
/* ---------------------------------------------------------------------- */

/* Registration side: one hit per mmu_interval_notifier_insert() call. */
kprobe:mmu_interval_notifier_insert
/@filter_tgid == 0 || @filter_tgid == pid/
{
    @mmu_interval_insert_by_proc[pid, comm] = count();
    @mmu_interval_insert++;
}
/* Teardown side: one hit per mmu_interval_notifier_remove() call. */
kprobe:mmu_interval_notifier_remove
/@filter_tgid == 0 || @filter_tgid == pid/
{
    @mmu_interval_remove_by_proc[pid, comm] = count();
    @mmu_interval_remove++;
}
/* ---------------------------------------------------------------------- */
/* HMM DMA-mapping (P2P & direct device I/O)                              */
/* ---------------------------------------------------------------------- */

/*
 * hmm_dma_map_pfn is a newer API (added ~2024); the attach point is a
 * wildcard so the script stays loadable on older kernels without it.
 *
 * The tgid predicate matches every other entry probe in this script, so
 * that a run restricted to one tgid does not report system-wide DMA
 * mapping counts next to per-process figures.
 */
kprobe:hmm_dma_map_pfn*
/@filter_tgid == 0 || pid == @filter_tgid/
{
    @hmm_dma_map++;
}
/*
 * Counterpart of the hmm_dma_map_pfn probe.  hmm_dma_unmap_pfn is a
 * newer API (added ~2024), hence the wildcard attach point.  Carries the
 * same tgid predicate as the other entry probes so the counter honours
 * the requested filter.
 */
kprobe:hmm_dma_unmap_pfn*
/@filter_tgid == 0 || pid == @filter_tgid/
{
    @hmm_dma_unmap++;
}
/* ---------------------------------------------------------------------- */
/* CPU-side fault-in path (complementary view to fault_page != NULL)      */
/* ---------------------------------------------------------------------- */

/*
 * THP fault-back path: fires for a CPU fault on a device-private PMD
 * entry.  The function arrived together with PMD-size device-private
 * support; the wildcard lets kernels without it load the script minus
 * this probe.
 */
kprobe:do_huge_pmd_device_private*
/@filter_tgid == 0 || @filter_tgid == pid/
{
    @cpu_fault_thp_by_proc[pid, comm] = count();
    @cpu_fault_thp++;
}
/* ---------------------------------------------------------------------- */
/* MMU notifier -- filtered to HMM-relevant events only                   */
/* ---------------------------------------------------------------------- */

/*
 * __mmu_notifier_invalidate_range_start is extremely hot (munmap,
 * mprotect, swap-out, fork, ...), so the event type is tested first and
 * everything except MMU_NOTIFY_MIGRATE / MMU_NOTIFY_EXCLUSIVE -- both of
 * which originate exclusively from HMM-related paths -- falls straight
 * through without touching any map.
 *
 * NOTE: mmu_notifier_invalidate_range_start() is a static inline
 * wrapper; the kprobable symbol is the double-underscore variant.  It
 * only runs for mm's that have notifiers registered (GPU driver
 * context), which keeps the overhead well bounded.
 *
 * The span is reported in 4K pages (shift by 12).
 */
kprobe:__mmu_notifier_invalidate_range_start
{
    $range = (struct mmu_notifier_range *)arg0;
    $ev = (uint64)$range->event;

    if ($ev == @MMU_NOTIFY_MIGRATE || $ev == @MMU_NOTIFY_EXCLUSIVE) {
        /* The tgid filter is applied only after the cheap event test. */
        if (@filter_tgid == 0 || pid == @filter_tgid) {
            $pages = ($range->end - $range->start) >> 12;

            if ($ev == @MMU_NOTIFY_MIGRATE) {
                @mmu_notify_migrate++;
                @mmu_notify_migrate_pages = hist($pages);
                @mmu_notify_migrate_by_proc[pid, comm] = count();
            } else {
                @mmu_notify_exclusive++;
                @mmu_notify_exclusive_pages = hist($pages);
            }
        }
    }
}
/* ---------------------------------------------------------------------- */
/* Per-page PTE tracepoints                                               */
/*                                                                        */
/* The two tracepoints are asymmetric by kernel design, not a tool bug:   */
/*  * set_migration_pte fires only from try_to_migrate_one() in           */
/*    mm/rmap.c.  migrate_device_unmap() calls try_to_migrate() ONLY when */
/*    a folio was still mapped at unmap time -- the case for pages that   */
/*    are already on the device (dev->cpu fault-back) but *not* for the   */
/*    cpu->device path, where migrate_vma_collect_pmd() replaces PTEs     */
/*    directly with set_pte_at().                                         */
/*  * remove_migration_pte fires from remove_migration_ptes() in          */
/*    mm/migrate.c, which __migrate_device_finalize() invokes for every   */
/*    migration, in both directions.                                      */
/*                                                                        */
/* Expect: remove_migration_pte >= set_migration_pte.                     */
/* A large imbalance (set << remove) indicates heavy cpu->device traffic. */
/* ---------------------------------------------------------------------- */

/* PTE turned into a migration entry: count it and bucket by order. */
tracepoint:migrate:set_migration_pte
/@filter_tgid == 0 || @filter_tgid == pid/
{
    @pte_set_migration_order = lhist(args->order, 0, 10, 1);
    @pte_set_migration++;
}
/* Migration entry restored to a regular PTE: count it and bucket by order. */
tracepoint:migrate:remove_migration_pte
/@filter_tgid == 0 || @filter_tgid == pid/
{
    @pte_remove_migration_order = lhist(args->order, 0, 10, 1);
    @pte_remove_migration++;
}
/* ---------------------------------------------------------------------- */
/* Periodic summary                                                       */
/* ---------------------------------------------------------------------- */

/*
 * Every 5 seconds, print a condensed snapshot of the counters gathered
 * so far.  Nothing is reset here: all figures are cumulative since
 * start-up.  Change the interval by editing the probe name below.
 */
interval:s:5
{
    $uptime_s = (nsecs - @start_ns) / 1000000000;

    /* Header, with the optional tgid annotation on the title line. */
    printf("\n=========================================================\n hmm-snoop summary at t=%ds", $uptime_s);
    if (@filter_tgid > 0) {
        printf(" (tgid %d only)", @filter_tgid);
    }
    printf("\n=========================================================\n");

    /* Migration lifecycle totals. */
    printf("\n-- Migration lifecycle --\nmigrate_vma_setup attempts:\n");
    print(@mig_setup_by_class);
    printf("migrate_vma_setup -> cpages (pages actually collected):\n");
    print(@mig_setup_pages_collected_total);
    printf("migrate_vma_setup return codes (0=ok):\n");
    print(@mig_setup_ret);

    /* Top ten processes per migration class. */
    printf("\n-- Top processes -- cpu->dev prefetch --\n");
    print(@mig_cpu2dev_by_proc, 10);
    printf("\n-- Top processes -- dev->cpu fault-back --\n");
    print(@mig_dev2cpu_flt_by_proc, 10);
    printf("\n-- Top processes -- dev->cpu driver evict --\n");
    print(@mig_dev2cpu_drv_by_proc, 10);

    printf("\n-- End-to-end latency (setup->finalize, microseconds) --\n");
    print(@mig_e2e_us);

    printf("\n-- Partial / unmigratable (cpages < npages requested) --\n");
    print(@mig_partial);
    print(@mig_unmigratable_pages);

    /* hmm_range_fault: call count and error breakdown on one line. */
    printf("\n-- Device mirror (hmm_range_fault) --\n");
    printf(" total calls: %d -EBUSY (invalidation race): %d -EFAULT: %d -EAGAIN: %d -EPERM: %d\n",
        @hrf_total, @hrf_ebusy, @hrf_efault, @hrf_eagain, @hrf_eperm);
    print(@hrf_latency_us);

    printf("\n-- MMU notifier events (HMM-scoped) --\n MMU_NOTIFY_MIGRATE: %d\n MMU_NOTIFY_EXCLUSIVE: %d\n",
        @mmu_notify_migrate, @mmu_notify_exclusive);

    printf("\n-- Per-page PTE traffic --\n set_migration_pte: %d\n remove_migration_pte: %d\n",
        @pte_set_migration, @pte_remove_migration);

    printf("\n-- Device-exclusive / DMA --\n");
    printf(" make_device_exclusive: %d\n", @dev_exclusive_total);
    printf(" restore_exclusive_pte: %d (CPU accesses back-claiming)\n",
        @dev_exclusive_restore);
    printf(" hmm_dma_map_pfn: %d\n hmm_dma_unmap_pfn: %d\n",
        @hmm_dma_map, @hmm_dma_unmap);

    printf("\n-- HMM driver lifecycle --\n mmu_interval_notifier_insert: %d\n mmu_interval_notifier_remove: %d\n",
        @mmu_interval_insert, @mmu_interval_remove);

    printf("\n-- Bulk teardown --\n migrate_device_range: %d migrate_device_pfns: %d\n",
        @bulk_teardown_range, @bulk_teardown_pfns);
}
/*
 * Auto-exit once the requested duration has elapsed.
 *
 * This is a dedicated 1-second tick rather than a check inside the
 * 5-second summary probe, so the exit latency is bounded to about one
 * second: "run for 10s" ends at ~10s instead of at the next summary.
 * The predicate makes the probe a no-op when no duration was given.
 */
interval:s:1
/@duration_s > 0/
{
    $ran_s = (nsecs - @start_ns) / 1000000000;
    if ($ran_s >= @duration_s) {
        exit();
    }
}
/* ---------------------------------------------------------------------- */
/* Final report on exit                                                   */
/* ---------------------------------------------------------------------- */

/*
 * END -- print the full report, then clear every map that was printed
 * explicitly (plus scratch and constant maps) so bpftrace's automatic
 * map dump at shutdown does not repeat them.
 *
 * Scalar counters are deliberately left in place: the automatic dump
 * emits them as `@name: value` lines, which is the machine-readable form
 * required by hmm-snoop-plot.py.
 *
 * The report includes the migrate_vma_setup() return-code breakdown
 * (@mig_setup_ret).  That map is cleared below, so without an explicit
 * print here its contents would never reach the final output.
 */
END
{
    printf("\n\n");
    printf("=========================================================\n");
    printf(" hmm-snoop final report\n");
    printf("=========================================================\n");

    /* ---- migration lifecycle ---- */
    printf("\n--- Migration attempts by class ---\n");
    print(@mig_setup_by_class);
    printf("\n--- Pages successfully collected (cpages) by class ---\n");
    print(@mig_setup_pages_collected);
    print(@mig_setup_pages_collected_total);
    printf("\n--- Pages requested (npages) by class ---\n");
    print(@mig_setup_pages_requested);
    printf("\n--- End-to-end migration latency (setup->finalize, us) ---\n");
    print(@mig_e2e_us);
    printf("\n--- Setup->pages phase latency (us) ---\n");
    print(@mig_pages_phase_us);
    printf("\n--- Pages->finalize phase latency (us) ---\n");
    print(@mig_finalize_phase_us);
    printf("\n--- Pages successfully migrated at finalize (cpages sum) ---\n");
    print(@mig_finalize_cpages);
    printf("\n--- Compound (PMD-order) migrations ---\n");
    print(@mig_compound);
    printf("\n--- Migrations where cpages<npages (pinned pages) ---\n");
    print(@mig_partial);
    print(@mig_unmigratable_pages);
    printf("\n--- Failed migrate_vma_setup() by class ---\n");
    print(@mig_setup_failed);
    /* Return-code breakdown: printed here because it is cleared below. */
    printf("\n--- migrate_vma_setup() return codes (0=ok) ---\n");
    print(@mig_setup_ret);

    /* ---- per-process attribution ---- */
    printf("\n--- Top processes: cpu->dev prefetch starts ---\n");
    print(@mig_cpu2dev_by_proc, 40);
    printf("\n--- Top processes: dev->cpu fault-back starts ---\n");
    print(@mig_dev2cpu_flt_by_proc, 40);
    printf("\n--- Top processes: dev->cpu driver-evict starts ---\n");
    print(@mig_dev2cpu_drv_by_proc, 40);
    printf("\n--- Top processes: cpu->dev cpages actually moved ---\n");
    print(@mig_cpu2dev_cpages_by_proc, 40);
    printf("\n--- Top processes: dev->cpu fault-back cpages moved ---\n");
    print(@mig_dev2cpu_flt_cpages_by_proc, 40);
    printf("\n--- Top processes: dev->cpu driver-evict cpages moved ---\n");
    print(@mig_dev2cpu_drv_cpages_by_proc, 40);

    /* ---- hmm_range_fault ---- */
    printf("\n--- hmm_range_fault (device mirror refresh) ---\n");
    printf(" calls: %d\n", @hrf_total);
    printf(" write-enabled: %d\n", @hrf_write);
    printf(" -EBUSY (inval race, thrash hint): %d\n", @hrf_ebusy);
    printf(" -EFAULT: %d -EAGAIN: %d -EPERM: %d\n",
        @hrf_efault, @hrf_eagain, @hrf_eperm);
    print(@hrf_ret);
    print(@hrf_npages_hist);
    print(@hrf_latency_us);
    printf("\n--- hmm_range_fault top callers ---\n");
    print(@hrf_by_proc, 20);
    printf("\n--- hmm_range_fault EBUSY top processes ---\n");
    print(@hrf_ebusy_by_proc, 20);
    printf("\n--- hmm_range_fault EBUSY top kernel stacks ---\n");
    printf(" (identifies which driver is racing the CPU)\n");
    print(@hrf_ebusy_stacks, 10);

    /* ---- CPU-side fault path ---- */
    printf("\n--- CPU-side fault paths ---\n");
    printf(" do_huge_pmd_device_private: %d\n", @cpu_fault_thp);
    print(@cpu_fault_thp_by_proc, 10);

    /* ---- MMU notifier ---- */
    printf("\n--- MMU notifier (HMM-scoped) ---\n");
    printf(" MMU_NOTIFY_MIGRATE: %d\n", @mmu_notify_migrate);
    printf(" MMU_NOTIFY_EXCLUSIVE: %d\n", @mmu_notify_exclusive);
    print(@mmu_notify_migrate_pages);
    print(@mmu_notify_exclusive_pages);
    print(@mmu_notify_migrate_by_proc, 10);

    /* ---- per-page PTE tracepoints ---- */
    printf("\n--- Per-page PTE traffic (tracepoint-driven) ---\n");
    printf(" set_migration_pte: %d\n", @pte_set_migration);
    printf(" remove_migration_pte: %d\n", @pte_remove_migration);
    printf(" Note: set_migration_pte fires only for already-mapped folios\n");
    printf(" (typically dev->cpu paths). cpu->dev migrations replace PTEs\n");
    printf(" directly in migrate_vma_collect_pmd() without try_to_migrate,\n");
    printf(" so expect remove >= set during heavy prefetch activity.\n");
    print(@pte_set_migration_order);
    print(@pte_remove_migration_order);

    /* ---- device-exclusive ---- */
    printf("\n--- Device-exclusive ---\n");
    printf(" make_device_exclusive total: %d\n", @dev_exclusive_total);
    printf(" restore_exclusive_pte total: %d (CPU back-claim)\n",
        @dev_exclusive_restore);
    print(@dev_exclusive_by_proc, 10);
    print(@dev_exclusive_restore_by_proc, 10);

    /* ---- driver lifecycle ---- */
    printf("\n--- HMM driver lifecycle ---\n");
    printf(" mmu_interval_notifier_insert: %d\n", @mmu_interval_insert);
    printf(" mmu_interval_notifier_remove: %d\n", @mmu_interval_remove);
    print(@mmu_interval_insert_by_proc, 10);
    print(@mmu_interval_remove_by_proc, 10);

    /* ---- per-device attribution ---- */
    printf("\n--- Migrations by pgmap_owner (driver/device dimension) ---\n");
    printf(" pgmap_owner values are kernel pointers to the driver's\n");
    printf(" device identity: amdkfd mdevice, drm_pagemap->owner,\n");
    printf(" nouveau drm->dev, kvmppc_uvmem_pgmap, etc.\n");
    printf(" Cross-reference against /proc/kallsyms or kernel logs.\n");
    printf("\n cpu->dev prefetch:\n");
    print(@mig_cpu2dev_by_device);
    printf("\n dev->cpu fault-back:\n");
    print(@mig_dev2cpu_flt_by_device);
    printf("\n dev->cpu driver evict:\n");
    print(@mig_dev2cpu_drv_by_device);

    /* ---- DMA mapping and bulk teardown ---- */
    printf("\n--- HMM DMA mapping ---\n");
    printf(" hmm_dma_map_pfn: %d\n", @hmm_dma_map);
    printf(" hmm_dma_unmap_pfn: %d\n", @hmm_dma_unmap);
    printf("\n--- Bulk teardown ---\n");
    printf(" migrate_device_range: %d\n", @bulk_teardown_range);
    printf(" migrate_device_pfns: %d\n", @bulk_teardown_pfns);
    print(@bulk_teardown_pages);
    print(@bulk_teardown_by_proc, 10);

    /*
     * Clear every map printed explicitly above so bpftrace's auto-print
     * at shutdown does not emit duplicates, plus the scratch and
     * constant maps that would otherwise show up in that dump.
     */

    /* scratch (per-migration / per-thread state) */
    clear(@mig_start_ns);
    clear(@mig_class);
    clear(@mig_tgid);
    clear(@mig_comm);
    clear(@mig_flags);
    clear(@mig_npages);
    clear(@mig_at_pages_ns);
    clear(@mig_fault_stash);
    clear(@setup_arg0);
    clear(@hrf_ts);
    clear(@hrf_npages);
    clear(@hrf_flags);

    /*
     * Scalar counters: deliberately NOT cleared.  The auto-print at END
     * emits them as `@name: value` lines, the machine-readable form
     * required by hmm-snoop-plot.py.  The human-readable printfs above
     * exist for eyeball review; having both in the output is a
     * deliberate convenience.
     */

    /* aggregations */
    clear(@mig_setup_by_class);
    clear(@mig_setup_pages_collected);
    clear(@mig_setup_pages_collected_total);
    clear(@mig_setup_pages_requested);
    clear(@mig_setup_ret);
    clear(@mig_cpu2dev_by_proc);
    clear(@mig_dev2cpu_flt_by_proc);
    clear(@mig_dev2cpu_drv_by_proc);
    clear(@mig_cpu2dev_cpages_by_proc);
    clear(@mig_dev2cpu_flt_cpages_by_proc);
    clear(@mig_dev2cpu_drv_cpages_by_proc);
    clear(@mig_finalize_cpages);
    clear(@mig_setup_failed);
    clear(@mig_partial);
    clear(@mig_unmigratable_pages);
    clear(@mig_compound);
    clear(@mig_e2e_us);
    clear(@mig_pages_phase_us);
    clear(@mig_finalize_phase_us);
    clear(@hrf_npages_hist);
    clear(@hrf_latency_us);
    clear(@hrf_ret);
    clear(@hrf_by_proc);
    clear(@hrf_ebusy_by_proc);
    clear(@cpu_fault_thp_by_proc);
    clear(@mmu_notify_migrate_pages);
    clear(@mmu_notify_exclusive_pages);
    clear(@mmu_notify_migrate_by_proc);
    clear(@pte_set_migration_order);
    clear(@pte_remove_migration_order);
    clear(@dev_exclusive_by_proc);
    clear(@dev_exclusive_restore_by_proc);
    clear(@mmu_interval_insert_by_proc);
    clear(@mmu_interval_remove_by_proc);
    clear(@mig_cpu2dev_by_device);
    clear(@mig_dev2cpu_flt_by_device);
    clear(@mig_dev2cpu_drv_by_device);
    clear(@hrf_ebusy_stacks);
    clear(@bulk_teardown_pages);
    clear(@bulk_teardown_by_proc);

    /* constants / config */
    clear(@filter_tgid);
    clear(@duration_s);
    clear(@start_ns);
    clear(@MVMA_SYSTEM);
    clear(@MVMA_DEVPRIV);
    clear(@MVMA_DEVCOH);
    clear(@MVMA_COMPOUND);
    clear(@MMU_NOTIFY_MIGRATE);
    clear(@MMU_NOTIFY_EXCLUSIVE);
    clear(@C_CPU2DEV);
    clear(@C_DEV2CPU_FLT);
    clear(@C_DEV2CPU_DRV);
    clear(@C_UNKNOWN);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment