Created
May 11, 2020 07:36
-
-
Save ethercflow/16b661c6219beb67e1550bec4ff207e8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# @lint-avoid-python-3-compatibility-imports | |
from __future__ import print_function | |
from bcc import ArgString, BPF | |
from time import sleep | |
bpf_text = """ | |
#include <uapi/linux/ptrace.h> | |
#include <linux/sched.h> | |
struct data { | |
u64 ts; | |
u32 prev_pid; | |
u32 next_pid; | |
int stack_id; | |
char prev_comm[TASK_COMM_LEN]; | |
char next_comm[TASK_COMM_LEN]; | |
}; | |
BPF_PERF_OUTPUT(events); | |
BPF_STACK_TRACE(stack_traces, 16384); | |
static inline bool h_strcmp(const char *comm) | |
{ | |
char filter[] = "cop_normal"; | |
int len = sizeof(filter) - 1; | |
int i = 0; | |
for (; i < len; ++i) { | |
if (comm[i] != filter[i]) { | |
bpf_trace_printk("not equal, %d\\n", i); | |
return false; | |
} | |
} | |
return true; | |
} | |
TRACEPOINT_PROBE(sched, sched_switch) { | |
struct data data = {}; | |
// if (args->prev_state & (TASK_REPORT_MAX - 1)) | |
// return 0; | |
if (!(args->prev_state & TASK_REPORT_MAX)) | |
return 0; | |
if (!h_strcmp(args->prev_comm)) | |
return 0; | |
data.ts = bpf_ktime_get_ns(); | |
data.prev_pid = args->prev_pid; | |
data.next_pid = args->next_pid; | |
data.stack_id = stack_traces.get_stackid(args, 0); | |
bpf_probe_read(&data.prev_comm, TASK_COMM_LEN, args->prev_comm); | |
bpf_probe_read(&data.next_comm, TASK_COMM_LEN, args->next_comm); | |
events.perf_submit(args, &data, sizeof(data)); | |
return 0; | |
} | |
""" | |
b = BPF(text=bpf_text) | |
stack_traces = b.get_table("stack_traces") | |
# process event | |
def print_event(cpu, data, size): | |
event = b["events"].event(data) | |
print("%-24s %-14.14s %-6s -> %-14.14s %-6s" % ( | |
event.ts, | |
event.prev_comm.decode("utf-8", "replace"), | |
event.prev_pid, | |
event.next_comm.decode("utf-8", "replace"), | |
event.next_pid, | |
)) | |
for addr in stack_traces.walk(event.stack_id): | |
sym = b.ksym(addr, show_offset=True) | |
print("\t%s" % sym) | |
print("") | |
b["events"].open_perf_buffer(print_event, page_cnt=64) | |
while (1): | |
try: | |
sleep(1) | |
b.perf_buffer_poll() | |
except KeyboardInterrupt: | |
exit() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
工具用途:
trace 所有 cop_normal* 线程因被动抢占而让出 cpu 的事件 (注意,这里没有记录因 cpu 饱和导致的被动抢占,若需要包含此情况,则需开启 42、43行,并注释掉 45、46行),并记录时间戳,发生被动抢占时的 kernel stack,和后继线程信息。输出如下:
时间戳解析 (注意小数点位置):
部署方式:
nohup ./ivcsw.py &
注意事项:
开销:
虽然 bpf 本身开销很小,但调度器事件属于高频事件,累加起来会有一定的性能影响,之前在内部 benchmark 平台测试过 runqlat,会产生大约 2% 的性能会退 (实际损耗和负载有关系),因此在客户那边运行时,需要先关注一段时间业务指标。
第 85 行 sleep 1 是为了降低此工具带来的 context switch 开销,可酌情调整;
此工具会产生一定 IO,因此为避免干扰 TiKV,最好放在另外一个盘上;
产生的文件大小与事件频率和运行时长有关,需要关注下,避免爆盘;
若出现事件丢失情况,需酌情增大 22 行或 82 行的 page size;