jart · November 13, 2024 16:18
diff --git a/trace.cpp b/trace.cpp
 // -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
 // vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi

 #define _GNU_SOURCE
 #include <pthread.h>
 #include <stdatomic.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <threads.h>
 #include <unistd.h>

 // One recorded begin/end marker. Slots live in the global g_events array
 // and are written out as JSON objects by llamafile_trace_save().
 struct TraceEvent
 {
    unsigned long long ts; // raw rdtsc() reading taken when the event was recorded
    int pid; // pid override if one was set, otherwise getpid()
    int tid; // tid override if one was set, otherwise gettid()
    const char* name; // pointer is stored, not copied; a null name makes save skip the slot
    const char* cat; // emitted as the "cat" field; pointer is stored, not copied
    char ph; // emitted as the "ph" field; this file records 'B' and 'E'
 };

 // pid override stored as pid + 1, so zero means "use getpid()"
 static int g_pid;
 // set once the out-of-memory warning has been printed
 static atomic_bool g_oom;
 // number of g_events slots handed out so far by llamafile_trace_reserve()
 static atomic_int g_count;
 // index of the next unused slot in the calling thread's reserved batch
 static thread_local int g_id;
 // how many slots remain in the calling thread's reserved batch
 static thread_local int g_ids;
 // per-thread tid override stored as tid + 1, so zero means "use gettid()"
 static thread_local int g_tid;
 // rdtsc() reading captured by the trace_startup() constructor
 static unsigned long g_start_rdtsc;
 // fixed-size event storage shared by all threads
 static struct TraceEvent g_events[1000000];

 // Reads the hardware cycle/timer counter used to timestamp trace events.
 // On x86-64 this is the RDTSC instruction; every other target takes the
 // cntvct_el0 path below.
 static unsigned long
 rdtsc(void)
 {
 #ifdef __x86_64__
    // RDTSC returns the counter split across EDX (high) and EAX (low)
    unsigned lo, hi;
    __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
    unsigned long ticks = hi;
    ticks <<= 32;
    ticks |= lo;
    return ticks;
 #else
    unsigned long ticks;
    __asm__ volatile("mrs %0, cntvct_el0" : "=r"(ticks));
    // NOTE(review): presumably scales the timer toward a TSC-like rate -- confirm
    return 48 * ticks; // the fudge factor
 #endif
 }

 // Reports that the event buffer is exhausted. The warning is printed by
 // whichever caller is first to flip g_oom; everyone else stays silent.
 // Always returns -1 so callers can `return llamafile_trace_oom();`.
 static int
 llamafile_trace_oom(void)
 {
    // cheap relaxed peek first; only fall through to the exchange if unset
    bool already_warned =
      atomic_load_explicit(&g_oom, memory_order_relaxed) ||
      atomic_exchange_explicit(&g_oom, true, memory_order_acq_rel);
    if (!already_warned)
        fprintf(stderr, "warning: ran out of trace event memory\n");
    return -1;
 }

 static int
 llamafile_trace_reserve(int count)
 {
    int id = atomic_load_explicit(&g_count, memory_order_relaxed);
    if (id + count > sizeof(g_events) / sizeof(*g_events))
        return llamafile_trace_oom();
    id = atomic_fetch_add_explicit(&g_count, count, memory_order_acq_rel);
    if (id + count > sizeof(g_events) / sizeof(*g_events))
        return llamafile_trace_oom();
    return id;
 }

 static void
 llamafile_trace_event(int id, const char* name, const char* cat, char ph)
 {
    g_events[id].ts = rdtsc();
    g_events[id].pid = g_pid ? g_pid - 1 : getpid();
    g_events[id].tid = g_tid ? g_tid - 1 : gettid();
    g_events[id].name = name;
    g_events[id].cat = cat;
    g_events[id].ph = ph;
 }

 // Overrides the pid written into subsequently recorded events.
 // The value is kept biased by one so that zero can mean "no override";
 // as a consequence passing -1 stores zero and restores the getpid() path.
 void
 llamafile_trace_set_pid(int pid)
 {
    g_pid = 1 + pid;
 }

 // Overrides the tid written into events recorded by the calling thread
 // (g_tid is thread-local). The value is kept biased by one so that zero
 // can mean "no override"; passing -1 therefore restores the gettid() path.
 void
 llamafile_trace_set_tid(int tid)
 {
    g_tid = 1 + tid;
 }

 // Records a 'B' (begin) event called `name` for the calling thread.
 //
 // Slots are taken from a per-thread batch. A fresh batch of 20 is reserved
 // whenever fewer than two slots are left. If the reservation fails the
 // event is dropped and the thread is left with no slots.
 void
 llamafile_trace_begin(const char* name)
 {
    if (g_ids < 2) {
        g_id = llamafile_trace_reserve(20);
        if (g_id == -1) {
            g_ids = 0;
            return;
        }
        g_ids = 20;
    }
    llamafile_trace_event(g_id, name, "category", 'B');
    ++g_id;
    --g_ids;
 }

 // Records an 'E' (end) event called `name` for the calling thread.
 // Nothing is recorded when the thread has no reserved slots left; this
 // function never reserves new ones.
 void
 llamafile_trace_end(const char* name)
 {
    if (g_ids >= 1) {
        llamafile_trace_event(g_id, name, "category", 'E');
        ++g_id;
        --g_ids;
    }
 }

 // Writes every recorded event to `filename` as a JSON array of objects
 // with "name", "cat", "ph", "ts", "pid" and "tid" fields.
 //
 // Does nothing when no slots were ever reserved. Reserved slots that were
 // never filled in (null name) are skipped. Failure to open the file, or
 // to write it out completely, is reported on stderr.
 static void
 llamafile_trace_save(const char* filename)
 {
    // g_count may sit past the end of g_events when a reservation overshot
    // the buffer, so never walk further than the array itself
    const int capacity = (int)(sizeof(g_events) / sizeof(*g_events));
    int count = atomic_load_explicit(&g_count, memory_order_relaxed);
    if (count > capacity)
        count = capacity;
    if (!count)
        return;
    fprintf(stderr, "saving trace to %s...\n", filename);
    FILE* file = fopen(filename, "w");
    if (!file) {
        perror(filename);
        return;
    }
    fprintf(file, "[\n");
    bool once = false; // becomes true after the first object, to place commas
    for (int i = 0; i < count; i++) {
        if (!g_events[i].name)
            continue;
        if (!once) {
            once = true;
        } else {
            fputs(",\n", file);
        }
        // ts is relative to process start; the /3000. divisor is kept from
        // the original -- NOTE(review): presumably assumes ~3 GHz, confirm
        fprintf(file,
                "{\"name\": \"%s\", \"cat\": \"%s\", \"ph\": \"%c\", "
                "\"ts\": %.3f, \"pid\": %d, \"tid\": %d}",
                g_events[i].name,
                g_events[i].cat,
                g_events[i].ph,
                (g_events[i].ts - g_start_rdtsc) / 3000.,
                g_events[i].pid,
                g_events[i].tid);
    }
    fprintf(file, "\n]\n");
    // a short write may only surface at flush time, so check both
    bool failed = ferror(file) != 0;
    if (fclose(file) != 0)
        failed = true;
    if (failed)
        fprintf(stderr, "%s: error while writing trace\n", filename);
 }

 // Constructor: remembers the counter value at load time so that saved
 // timestamps can be written relative to it.
 __attribute__((__constructor__)) static void
 trace_startup(void)
 {
    const unsigned long origin = rdtsc();
    g_start_rdtsc = origin;
 }

 // Destructor: dumps whatever was recorded to trace.json in the current
 // working directory.
 __attribute__((__destructor__)) static void
 trace_shutdown(void)
 {
    const char* path = "trace.json"; // see chrome://tracing/
    llamafile_trace_save(path);
 }
diff --git a/z-example-usage.cpp b/z-example-usage.cpp
 // -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
 // vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi

 #include "trace.h"

 #include <assert.h>
 #include <pthread.h>
 #include <stdio.h>
 #include <sys/resource.h>
 #include <sys/time.h>
 #include <time.h>

 #define ITERATIONS 100000
 #define THREADS 30

 // shared counter incremented once per worker iteration, under g_locker
 int g_chores;
 // mutex every worker holds around its g_chores increment
 pthread_mutex_t g_locker = PTHREAD_MUTEX_INITIALIZER;

 // Thread entry point: bumps g_chores ITERATIONS times under g_locker,
 // bracketed by a "worker" begin/end trace pair. `arg` is unused.
 void*
 worker(void* arg)
 {
    (void)arg;
    llamafile_trace_begin("worker");
    int remaining = ITERATIONS;
    while (remaining > 0) {
        pthread_mutex_lock(&g_locker);
        g_chores += 1;
        pthread_mutex_unlock(&g_locker);
        --remaining;
    }
    llamafile_trace_end("worker");
    return NULL;
 }

 // Returns a - b as a timeval, borrowing one second when the microsecond
 // part of `a` is smaller than that of `b`.
 struct timeval
 tub(struct timeval a, struct timeval b)
 {
    struct timeval delta = a;
    delta.tv_sec = a.tv_sec - b.tv_sec;
    delta.tv_usec = a.tv_usec - b.tv_usec;
    if (delta.tv_usec < 0) {
        // borrow from the seconds column
        delta.tv_usec += 1000000;
        delta.tv_sec -= 1;
    }
    return delta;
 }

 // Flattens a timeval into a single microsecond count.
 long
 tomicros(struct timeval x)
 {
    const unsigned long whole_seconds = x.tv_sec * 1000000ul;
    return whole_seconds + x.tv_usec;
 }

 // Benchmark driver: pins the process to CPU 0, runs THREADS workers that
 // each increment g_chores ITERATIONS times, then prints wall-clock, user
 // and system time in microseconds.
 //
 // Returns 1 if not every worker thread could be started.
 int
 main(void)
 {
    cpu_set_t x;
    CPU_ZERO(&x);
    CPU_SET(0, &x);
    /* CPU_SET(1, &x); */
    // pinning is best-effort: warn and carry on if it is refused
    if (sched_setaffinity(0, sizeof(x), &x))
        perror("sched_setaffinity");

    struct timeval start;
    gettimeofday(&start, 0);

    pthread_t th[THREADS];
    int started = 0;
    for (; started < THREADS; ++started) {
        // pthread_create returns the error number instead of setting errno
        int rc = pthread_create(&th[started], 0, worker, 0);
        if (rc) {
            fprintf(stderr, "pthread_create: error %d\n", rc);
            break;
        }
    }
    // only join handles that pthread_create actually filled in
    for (int i = 0; i < started; ++i)
        pthread_join(th[i], 0);
    if (started != THREADS)
        return 1;
    assert(g_chores == THREADS * ITERATIONS);

    struct rusage ru;
    struct timeval end;
    gettimeofday(&end, 0);
    getrusage(RUSAGE_SELF, &ru);
    printf("%16ld us real\n"
           "%16ld us user\n"
           "%16ld us sys\n",
           tomicros(tub(end, start)),
           tomicros(ru.ru_utime),
           tomicros(ru.ru_stime));
    return 0;
 }
	// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
	// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi

	#define _GNU_SOURCE
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <threads.h>
	#include <unistd.h>

	// One recorded begin/end marker. Slots live in the global g_events array
	// and are written out as JSON objects by llamafile_trace_save().
	struct TraceEvent
	{
	unsigned long long ts; // raw rdtsc() reading taken when the event was recorded
	int pid; // pid override if one was set, otherwise getpid()
	int tid; // tid override if one was set, otherwise gettid()
	const char* name; // pointer is stored, not copied; a null name makes save skip the slot
	const char* cat; // emitted as the "cat" field; pointer is stored, not copied
	char ph; // emitted as the "ph" field; this file records 'B' and 'E'
	};

	// pid override stored as pid + 1, so zero means "use getpid()"
	static int g_pid;
	// set once the out-of-memory warning has been printed
	static atomic_bool g_oom;
	// number of g_events slots handed out so far by llamafile_trace_reserve()
	static atomic_int g_count;
	// index of the next unused slot in the calling thread's reserved batch
	static thread_local int g_id;
	// how many slots remain in the calling thread's reserved batch
	static thread_local int g_ids;
	// per-thread tid override stored as tid + 1, so zero means "use gettid()"
	static thread_local int g_tid;
	// rdtsc() reading captured by the trace_startup() constructor
	static unsigned long g_start_rdtsc;
	// fixed-size event storage shared by all threads
	static struct TraceEvent g_events[1000000];

	// Reads the hardware cycle/timer counter used to timestamp trace events.
	// On x86-64 this is the RDTSC instruction; every other target takes the
	// cntvct_el0 path below.
	static unsigned long
	rdtsc(void)
	{
	#ifdef __x86_64__
	    // RDTSC returns the counter split across EDX (high) and EAX (low)
	    unsigned ax, dx;
	    __asm__ volatile("rdtsc" : "=a"(ax), "=d"(dx));
	    // plain bitwise OR: the stray backslash before '|' was not valid C
	    return ((unsigned long)dx << 32) | ax;
	#else
	    unsigned long c;
	    __asm__ volatile("mrs %0, cntvct_el0" : "=r"(c));
	    // NOTE(review): presumably scales the timer toward a TSC-like rate -- confirm
	    return c * 48; // the fudge factor
	#endif
	}

	// Reports that the event buffer is exhausted. The warning is printed by
	// whichever caller is first to flip g_oom; everyone else stays silent.
	// Always returns -1 so callers can `return llamafile_trace_oom();`.
	static int
	llamafile_trace_oom(void)
	{
	    // cheap relaxed peek first; only fall through to the exchange if unset
	    bool already_warned =
	      atomic_load_explicit(&g_oom, memory_order_relaxed) ||
	      atomic_exchange_explicit(&g_oom, true, memory_order_acq_rel);
	    if (!already_warned)
	        fprintf(stderr, "warning: ran out of trace event memory\n");
	    return -1;
	}

	static int
	llamafile_trace_reserve(int count)
	{
	int id = atomic_load_explicit(&g_count, memory_order_relaxed);
	if (id + count > sizeof(g_events) / sizeof(*g_events))
	return llamafile_trace_oom();
	id = atomic_fetch_add_explicit(&g_count, count, memory_order_acq_rel);
	if (id + count > sizeof(g_events) / sizeof(*g_events))
	return llamafile_trace_oom();
	return id;
	}

	static void
	llamafile_trace_event(int id, const char* name, const char* cat, char ph)
	{
	g_events[id].ts = rdtsc();
	g_events[id].pid = g_pid ? g_pid - 1 : getpid();
	g_events[id].tid = g_tid ? g_tid - 1 : gettid();
	g_events[id].name = name;
	g_events[id].cat = cat;
	g_events[id].ph = ph;
	}

	// Overrides the pid written into subsequently recorded events.
	// The value is kept biased by one so that zero can mean "no override";
	// as a consequence passing -1 stores zero and restores the getpid() path.
	void
	llamafile_trace_set_pid(int pid)
	{
	    g_pid = 1 + pid;
	}

	// Overrides the tid written into events recorded by the calling thread
	// (g_tid is thread-local). The value is kept biased by one so that zero
	// can mean "no override"; passing -1 therefore restores the gettid() path.
	void
	llamafile_trace_set_tid(int tid)
	{
	    g_tid = 1 + tid;
	}

	// Records a 'B' (begin) event called `name` for the calling thread.
	//
	// Slots are taken from a per-thread batch. A fresh batch of 20 is reserved
	// whenever fewer than two slots are left. If the reservation fails the
	// event is dropped and the thread is left with no slots.
	void
	llamafile_trace_begin(const char* name)
	{
	    if (g_ids < 2) {
	        g_id = llamafile_trace_reserve(20);
	        if (g_id == -1) {
	            g_ids = 0;
	            return;
	        }
	        g_ids = 20;
	    }
	    llamafile_trace_event(g_id, name, "category", 'B');
	    ++g_id;
	    --g_ids;
	}

	// Records an 'E' (end) event called `name` for the calling thread.
	// Nothing is recorded when the thread has no reserved slots left; this
	// function never reserves new ones.
	void
	llamafile_trace_end(const char* name)
	{
	    if (g_ids >= 1) {
	        llamafile_trace_event(g_id, name, "category", 'E');
	        ++g_id;
	        --g_ids;
	    }
	}

	// Writes every recorded event to `filename` as a JSON array of objects
	// with "name", "cat", "ph", "ts", "pid" and "tid" fields.
	//
	// Does nothing when no slots were ever reserved. Reserved slots that were
	// never filled in (null name) are skipped. Failure to open the file, or
	// to write it out completely, is reported on stderr.
	static void
	llamafile_trace_save(const char* filename)
	{
	    // g_count may sit past the end of g_events when a reservation overshot
	    // the buffer, so never walk further than the array itself
	    const int capacity = (int)(sizeof(g_events) / sizeof(*g_events));
	    int count = atomic_load_explicit(&g_count, memory_order_relaxed);
	    if (count > capacity)
	        count = capacity;
	    if (!count)
	        return;
	    fprintf(stderr, "saving trace to %s...\n", filename);
	    FILE* file = fopen(filename, "w");
	    if (!file) {
	        perror(filename);
	        return;
	    }
	    fprintf(file, "[\n");
	    bool once = false; // becomes true after the first object, to place commas
	    for (int i = 0; i < count; i++) {
	        if (!g_events[i].name)
	            continue;
	        if (!once) {
	            once = true;
	        } else {
	            fputs(",\n", file);
	        }
	        // ts is relative to process start; the /3000. divisor is kept from
	        // the original -- NOTE(review): presumably assumes ~3 GHz, confirm
	        fprintf(file,
	                "{\"name\": \"%s\", \"cat\": \"%s\", \"ph\": \"%c\", "
	                "\"ts\": %.3f, \"pid\": %d, \"tid\": %d}",
	                g_events[i].name,
	                g_events[i].cat,
	                g_events[i].ph,
	                (g_events[i].ts - g_start_rdtsc) / 3000.,
	                g_events[i].pid,
	                g_events[i].tid);
	    }
	    fprintf(file, "\n]\n");
	    // a short write may only surface at flush time, so check both
	    bool failed = ferror(file) != 0;
	    if (fclose(file) != 0)
	        failed = true;
	    if (failed)
	        fprintf(stderr, "%s: error while writing trace\n", filename);
	}

	// Constructor: remembers the counter value at load time so that saved
	// timestamps can be written relative to it.
	__attribute__((__constructor__)) static void
	trace_startup(void)
	{
	    const unsigned long origin = rdtsc();
	    g_start_rdtsc = origin;
	}

	// Destructor: dumps whatever was recorded to trace.json in the current
	// working directory.
	__attribute__((__destructor__)) static void
	trace_shutdown(void)
	{
	    const char* path = "trace.json"; // see chrome://tracing/
	    llamafile_trace_save(path);
	}
	// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
	// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi

	#include "trace.h"

	#include <assert.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <sys/resource.h>
	#include <sys/time.h>
	#include <time.h>

	#define ITERATIONS 100000
	#define THREADS 30

	// shared counter incremented once per worker iteration, under g_locker
	int g_chores;
	// mutex every worker holds around its g_chores increment
	pthread_mutex_t g_locker = PTHREAD_MUTEX_INITIALIZER;

	// Thread entry point: bumps g_chores ITERATIONS times under g_locker,
	// bracketed by a "worker" begin/end trace pair. `arg` is unused.
	void*
	worker(void* arg)
	{
	    (void)arg;
	    llamafile_trace_begin("worker");
	    int remaining = ITERATIONS;
	    while (remaining > 0) {
	        pthread_mutex_lock(&g_locker);
	        g_chores += 1;
	        pthread_mutex_unlock(&g_locker);
	        --remaining;
	    }
	    llamafile_trace_end("worker");
	    return NULL;
	}

	// Returns a - b as a timeval, borrowing one second when the microsecond
	// part of `a` is smaller than that of `b`.
	struct timeval
	tub(struct timeval a, struct timeval b)
	{
	    struct timeval delta = a;
	    delta.tv_sec = a.tv_sec - b.tv_sec;
	    delta.tv_usec = a.tv_usec - b.tv_usec;
	    if (delta.tv_usec < 0) {
	        // borrow from the seconds column
	        delta.tv_usec += 1000000;
	        delta.tv_sec -= 1;
	    }
	    return delta;
	}

	// Flattens a timeval into a single microsecond count.
	long
	tomicros(struct timeval x)
	{
	    const unsigned long whole_seconds = x.tv_sec * 1000000ul;
	    return whole_seconds + x.tv_usec;
	}

	// Benchmark driver: pins the process to CPU 0, runs THREADS workers that
	// each increment g_chores ITERATIONS times, then prints wall-clock, user
	// and system time in microseconds.
	//
	// Returns 1 if not every worker thread could be started.
	int
	main(void)
	{
	    cpu_set_t x;
	    CPU_ZERO(&x);
	    CPU_SET(0, &x);
	    /* CPU_SET(1, &x); */
	    // pinning is best-effort: warn and carry on if it is refused
	    if (sched_setaffinity(0, sizeof(x), &x))
	        perror("sched_setaffinity");

	    struct timeval start;
	    gettimeofday(&start, 0);

	    pthread_t th[THREADS];
	    int started = 0;
	    for (; started < THREADS; ++started) {
	        // pthread_create returns the error number instead of setting errno
	        int rc = pthread_create(&th[started], 0, worker, 0);
	        if (rc) {
	            fprintf(stderr, "pthread_create: error %d\n", rc);
	            break;
	        }
	    }
	    // only join handles that pthread_create actually filled in
	    for (int i = 0; i < started; ++i)
	        pthread_join(th[i], 0);
	    if (started != THREADS)
	        return 1;
	    assert(g_chores == THREADS * ITERATIONS);

	    struct rusage ru;
	    struct timeval end;
	    gettimeofday(&end, 0);
	    getrusage(RUSAGE_SELF, &ru);
	    printf("%16ld us real\n"
	           "%16ld us user\n"
	           "%16ld us sys\n",
	           tomicros(tub(end, start)),
	           tomicros(ru.ru_utime),
	           tomicros(ru.ru_stime));
	    return 0;
	}