Created
April 23, 2026 20:03
-
-
Save davidlohr/01ad349da27e18dc6bff4d6481cd3c54 to your computer and use it in GitHub Desktop.
BPF script to profile io_uring workloads
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bpftrace | |
| /* | |
| * io_uring workload insight. | |
| * | |
| * Gives a complete picture of io_uring activity during a workload: | |
| * - ring lifecycle (create, cqring wait) | |
| * - submission & completion volume and per-opcode breakdown | |
| * - per-request latency (submit -> complete) per opcode | |
| * - registration-path activity (buffers/files/etc.), with stacks | |
| * - async-work offload and io_uring_enter traffic | |
| * - fixed-buffer hot path (io_import_reg_buf) count | |
| * | |
| * Usage (as root): | |
| * ./io_uring_register_insight.bt 2>&1 | tee io_uring.log | |
| * (start workload in another terminal) | |
| * (Ctrl-C when done) | |
| * | |
| * Per-second lines show spike behaviour; end-of-run summary shows totals, | |
| * latency histograms, call-stack attribution, and opcode distributions. | |
| * | |
| */ | |
| BEGIN | |
| { | |
| printf("Tracing io_uring. Ctrl-C to stop.\n"); | |
| printf("per-second: submit / complete / async-off | enter | per-buf / batch-reg\n\n"); | |
| } | |
| tracepoint:io_uring:io_uring_create | |
| { | |
| @ring_create_total++; | |
| printf("[lifecycle] ring_create pid=%d comm=%s fd=%d ctx=%p sq=%u cq=%u flags=0x%x\n", | |
| pid, comm, args->fd, args->ctx, | |
| args->sq_entries, args->cq_entries, args->flags); | |
| } | |
| tracepoint:io_uring:io_uring_cqring_wait | |
| { | |
| @cqring_wait_total++; | |
| @cqring_wait_this_sec++; | |
| @cqring_wait_min_events = hist(args->min_events); | |
| } | |
| tracepoint:io_uring:io_uring_submit_req | |
| { | |
| @submit_total++; | |
| @submit_this_sec++; | |
| @submit_opcode[args->opcode] = count(); | |
| @submit_by_pid[pid, comm] = count(); | |
| /* track per-request for submit->complete latency */ | |
| @req_submit_ns[args->req] = nsecs; | |
| @req_submit_op[args->req] = args->opcode; | |
| if (args->sq_thread) { | |
| @submit_sqpoll++; | |
| } | |
| } | |
| tracepoint:io_uring:io_uring_complete | |
| { | |
| @complete_total++; | |
| @complete_this_sec++; | |
| if (args->res >= 0) { | |
| @complete_ok++; | |
| } else { | |
| @complete_err++; | |
| } | |
| if (@req_submit_ns[args->req]) { | |
| $lat_ns = nsecs - @req_submit_ns[args->req]; | |
| $op = @req_submit_op[args->req]; | |
| @op_lat_us[$op] = hist($lat_ns / 1000); | |
| delete(@req_submit_ns[args->req]); | |
| delete(@req_submit_op[args->req]); | |
| } | |
| } | |
| tracepoint:io_uring:io_uring_queue_async_work | |
| { | |
| @async_total++; | |
| @async_this_sec++; | |
| @async_opcode[args->opcode] = count(); | |
| } | |
| /* io_uring_enter is the main syscall that pushes submissions + waits for CQEs */ | |
| tracepoint:syscalls:sys_enter_io_uring_enter | |
| { | |
| @enter_total++; | |
| @enter_this_sec++; | |
| @enter_by_pid[pid, comm] = count(); | |
| } | |
| kprobe:io_import_reg_buf | |
| { | |
| @hot_import_reg_buf++; | |
| } | |
| tracepoint:io_uring:io_uring_register | |
| { | |
| @register_opcode[args->opcode] = count(); | |
| @register_total++; | |
| } | |
| kprobe:io_sqe_buffers_register | |
| { | |
| @batch_reg_total++; | |
| @batch_reg_this_sec++; | |
| @batch_reg_stacks[kstack(6)] = count(); | |
| @batch_reg_start[tid] = nsecs; | |
| } | |
| kretprobe:io_sqe_buffers_register | |
| /@batch_reg_start[tid]/ | |
| { | |
| @batch_reg_lat_us = hist((nsecs - @batch_reg_start[tid]) / 1000); | |
| delete(@batch_reg_start[tid]); | |
| } | |
| kprobe:io_sqe_buffers_unregister | |
| { | |
| @batch_unreg_total++; | |
| } | |
| kprobe:io_sqe_buffer_register | |
| { | |
| @per_buf_total++; | |
| @per_buf_this_sec++; | |
| @per_buf_start[tid] = nsecs; | |
| } | |
| kretprobe:io_sqe_buffer_register | |
| /@per_buf_start[tid]/ | |
| { | |
| @per_buf_lat_us = hist((nsecs - @per_buf_start[tid]) / 1000); | |
| delete(@per_buf_start[tid]); | |
| } | |
| interval:s:1 | |
| { | |
| printf("[%3lld] sub=%-7lu cmp=%-7lu async=%-4lu ent=%-5lu wait=%-4lu | pbuf=%-4lu breg=%-3lu\n", | |
| elapsed / 1000000000, | |
| (uint64)@submit_this_sec, | |
| (uint64)@complete_this_sec, | |
| (uint64)@async_this_sec, | |
| (uint64)@enter_this_sec, | |
| (uint64)@cqring_wait_this_sec, | |
| (uint64)@per_buf_this_sec, | |
| (uint64)@batch_reg_this_sec); | |
| @submit_this_sec = 0; | |
| @complete_this_sec = 0; | |
| @async_this_sec = 0; | |
| @enter_this_sec = 0; | |
| @cqring_wait_this_sec = 0; | |
| @per_buf_this_sec = 0; | |
| @batch_reg_this_sec = 0; | |
| } | |
| END | |
| { | |
| printf("\n=== Ring lifecycle ===\n"); | |
| printf("io_uring_create events: %lu\n", (uint64)@ring_create_total); | |
| printf("io_uring_cqring_wait events: %lu\n", (uint64)@cqring_wait_total); | |
| printf("\n=== SQE / CQE volume ===\n"); | |
| printf("io_uring_submit_req (total SQEs): %lu\n", (uint64)@submit_total); | |
| printf(" of which submitted by sqpoll: %lu\n", (uint64)@submit_sqpoll); | |
| printf("io_uring_complete (total CQEs): %lu\n", (uint64)@complete_total); | |
| printf("io_uring_queue_async_work: %lu\n", (uint64)@async_total); | |
| printf("sys_enter_io_uring_enter calls: %lu\n", (uint64)@enter_total); | |
| printf("\n=== Fixed-buffer hot path (per-I/O) ===\n"); | |
| printf("io_import_reg_buf calls: %lu\n", (uint64)@hot_import_reg_buf); | |
| printf("\n=== Registration ===\n"); | |
| printf("IORING_REGISTER_* syscalls total: %lu\n", (uint64)@register_total); | |
| printf("io_sqe_buffers_register (batch): %lu\n", (uint64)@batch_reg_total); | |
| printf("io_sqe_buffers_unregister: %lu\n", (uint64)@batch_unreg_total); | |
| printf("io_sqe_buffer_register (per-iovec): %lu\n", (uint64)@per_buf_total); | |
| printf("\n=== SQE opcode distribution (submit_req) ===\n"); | |
| print(@submit_opcode); | |
| printf("\n=== Async-offloaded opcode distribution ===\n"); | |
| print(@async_opcode); | |
| printf("\n=== Completion result sign breakdown ===\n"); | |
| printf("ok (res >= 0): %lu\n", (uint64)@complete_ok); | |
| printf("err (res < 0): %lu\n", (uint64)@complete_err); | |
| printf("\n=== Submit/complete per (pid, comm) ===\n"); | |
| print(@submit_by_pid); | |
| printf("\n=== io_uring_enter per (pid, comm) ===\n"); | |
| print(@enter_by_pid); | |
| printf("\n=== Per-opcode submit->complete latency (us) ===\n"); | |
| print(@op_lat_us); | |
| printf("\n=== cqring_wait min_events distribution ===\n"); | |
| print(@cqring_wait_min_events); | |
| printf("\n=== IORING_REGISTER_* opcodes seen ===\n"); | |
| print(@register_opcode); | |
| printf("\n=== io_sqe_buffer_register latency (us) ===\n"); | |
| print(@per_buf_lat_us); | |
| printf("\n=== io_sqe_buffers_register latency (us) ===\n"); | |
| print(@batch_reg_lat_us); | |
| printf("\n=== io_sqe_buffers_register call stacks ===\n"); | |
| print(@batch_reg_stacks); | |
| /* drop internal state */ | |
| clear(@req_submit_ns); | |
| clear(@req_submit_op); | |
| clear(@per_buf_start); | |
| clear(@batch_reg_start); | |
| clear(@submit_this_sec); | |
| clear(@complete_this_sec); | |
| clear(@async_this_sec); | |
| clear(@enter_this_sec); | |
| clear(@cqring_wait_this_sec); | |
| clear(@per_buf_this_sec); | |
| clear(@batch_reg_this_sec); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment