Skip to content

Instantly share code, notes, and snippets.

@pfn
Created March 24, 2026 21:58
Show Gist options
  • Select an option

  • Save pfn/552d3a23ade52596d45877edb47a68cc to your computer and use it in GitHub Desktop.

Select an option

Save pfn/552d3a23ade52596d45877edb47a68cc to your computer and use it in GitHub Desktop.
My llama-server preload
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <spawn.h>
#include <dlfcn.h>
#include <stdbool.h>
#include <errno.h>
#include <nvml.h>
typedef int (*posix_spawn_t)(pid_t *, const char *, const posix_spawn_file_actions_t *,
const posix_spawnattr_t *, char *const [], char *const []);
int posix_spawn(pid_t *pid, const char *path, const posix_spawn_file_actions_t *file_actions,
const posix_spawnattr_t *attrp, char *const argv[], char *const envp[]) {
posix_spawn_t orig_posix_spawn = (posix_spawn_t)dlsym(RTLD_NEXT, "posix_spawn");
bool is_llama_server = (path && strstr(path, "llama-server"));
bool is_model_launch = false;
if (is_llama_server && argv) {
for (int i = 0; argv[i] != NULL; i++) {
if (strcmp(argv[i], "-m") == 0 || strcmp(argv[i], "--model") == 0) {
is_model_launch = true;
break;
}
}
}
if (is_llama_server && is_model_launch) {
// 1. Get the desired BUFFER from the user (e.g., 24576 for 24GB)
long desired_buffer_mib = 1024;
char *env_val = getenv("LLAMA_ARG_FIT_TARGET");
if (env_val) desired_buffer_mib = atol(env_val);
// 2. Query NVML for current usage
nvmlInit();
nvmlDevice_t device;
nvmlMemory_t memory;
nvmlDeviceGetHandleByIndex(0, &device);
nvmlDeviceGetMemoryInfo(device, &memory);
long used_mib = memory.used / (1024 * 1024);
nvmlShutdown();
// 3. Formula: NEW_FIT_TARGET = (Desired Total Buffer) - (Already Used)
long new_fit_target = desired_buffer_mib - used_mib;
if (new_fit_target < 2048) {
fprintf(stderr, "[LD_PRELOAD] Warning: Current VRAM usage (%ld MiB) already exceeds "
"the desired absolute buffer (%ld MiB). Targeting 2048MiB free after load.\n",
used_mib, desired_buffer_mib);
new_fit_target = 2048;
}
char fit_var[64];
snprintf(fit_var, sizeof(fit_var), "LLAMA_ARG_FIT_TARGET=%ld", new_fit_target);
printf("[fit_target_jit.so: updated to LLAMA_ARG_FIT_TARGET=%ld]\n", new_fit_target);
// 4. Update environment for the child process
int env_count = 0;
while (envp && envp[env_count]) env_count++;
char **new_env = malloc((env_count + 2) * sizeof(char *));
int j = 0;
for (int i = 0; i < env_count; i++) {
if (strncmp(envp[i], "LLAMA_ARG_FIT_TARGET=", 21) != 0) {
new_env[j++] = strdup(envp[i]);
}
}
new_env[j++] = strdup(fit_var);
new_env[j] = NULL;
int result = orig_posix_spawn(pid, path, file_actions, attrp, argv, new_env);
for (int i = 0; i < j; i++) free(new_env[i]);
free(new_env);
return result;
}
return orig_posix_spawn(pid, path, file_actions, attrp, argv, envp);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment