Created
March 24, 2026 21:58
-
-
Save pfn/552d3a23ade52596d45877edb47a68cc to your computer and use it in GitHub Desktop.
My llama-server preload
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #define _GNU_SOURCE | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <string.h> | |
| #include <spawn.h> | |
| #include <dlfcn.h> | |
| #include <stdbool.h> | |
| #include <errno.h> | |
| #include <nvml.h> | |
| typedef int (*posix_spawn_t)(pid_t *, const char *, const posix_spawn_file_actions_t *, | |
| const posix_spawnattr_t *, char *const [], char *const []); | |
| int posix_spawn(pid_t *pid, const char *path, const posix_spawn_file_actions_t *file_actions, | |
| const posix_spawnattr_t *attrp, char *const argv[], char *const envp[]) { | |
| posix_spawn_t orig_posix_spawn = (posix_spawn_t)dlsym(RTLD_NEXT, "posix_spawn"); | |
| bool is_llama_server = (path && strstr(path, "llama-server")); | |
| bool is_model_launch = false; | |
| if (is_llama_server && argv) { | |
| for (int i = 0; argv[i] != NULL; i++) { | |
| if (strcmp(argv[i], "-m") == 0 || strcmp(argv[i], "--model") == 0) { | |
| is_model_launch = true; | |
| break; | |
| } | |
| } | |
| } | |
| if (is_llama_server && is_model_launch) { | |
| // 1. Get the desired BUFFER from the user (e.g., 24576 for 24GB) | |
| long desired_buffer_mib = 1024; | |
| char *env_val = getenv("LLAMA_ARG_FIT_TARGET"); | |
| if (env_val) desired_buffer_mib = atol(env_val); | |
| // 2. Query NVML for current usage | |
| nvmlInit(); | |
| nvmlDevice_t device; | |
| nvmlMemory_t memory; | |
| nvmlDeviceGetHandleByIndex(0, &device); | |
| nvmlDeviceGetMemoryInfo(device, &memory); | |
| long used_mib = memory.used / (1024 * 1024); | |
| nvmlShutdown(); | |
| // 3. Formula: NEW_FIT_TARGET = (Desired Total Buffer) - (Already Used) | |
| long new_fit_target = desired_buffer_mib - used_mib; | |
| if (new_fit_target < 2048) { | |
| fprintf(stderr, "[LD_PRELOAD] Warning: Current VRAM usage (%ld MiB) already exceeds " | |
| "the desired absolute buffer (%ld MiB). Targeting 2048MiB free after load.\n", | |
| used_mib, desired_buffer_mib); | |
| new_fit_target = 2048; | |
| } | |
| char fit_var[64]; | |
| snprintf(fit_var, sizeof(fit_var), "LLAMA_ARG_FIT_TARGET=%ld", new_fit_target); | |
| printf("[fit_target_jit.so: updated to LLAMA_ARG_FIT_TARGET=%ld]\n", new_fit_target); | |
| // 4. Update environment for the child process | |
| int env_count = 0; | |
| while (envp && envp[env_count]) env_count++; | |
| char **new_env = malloc((env_count + 2) * sizeof(char *)); | |
| int j = 0; | |
| for (int i = 0; i < env_count; i++) { | |
| if (strncmp(envp[i], "LLAMA_ARG_FIT_TARGET=", 21) != 0) { | |
| new_env[j++] = strdup(envp[i]); | |
| } | |
| } | |
| new_env[j++] = strdup(fit_var); | |
| new_env[j] = NULL; | |
| int result = orig_posix_spawn(pid, path, file_actions, attrp, argv, new_env); | |
| for (int i = 0; i < j; i++) free(new_env[i]); | |
| free(new_env); | |
| return result; | |
| } | |
| return orig_posix_spawn(pid, path, file_actions, attrp, argv, envp); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment