pfn · March 24, 2026 21:58
diff --git a/fit_target_jit.c b/fit_target_jit.c
 #define _GNU_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <spawn.h>
 #include <dlfcn.h>
 #include <stdbool.h>
 #include <errno.h>
 #include <nvml.h>

 /* Pointer-to-function type with the same parameter list and return type as
  * posix_spawn(); the parameter names are documentation only. */
 typedef int (*posix_spawn_t)(pid_t *pid, const char *path,
                              const posix_spawn_file_actions_t *file_actions,
                              const posix_spawnattr_t *attrp,
                              char *const argv[], char *const envp[]);

 int posix_spawn(pid_t *pid, const char *path, const posix_spawn_file_actions_t *file_actions,
                const posix_spawnattr_t *attrp, char *const argv[], char *const envp[]) {

    posix_spawn_t orig_posix_spawn = (posix_spawn_t)dlsym(RTLD_NEXT, "posix_spawn");

    bool is_llama_server = (path && strstr(path, "llama-server"));
    bool is_model_launch = false;

    if (is_llama_server && argv) {
        for (int i = 0; argv[i] != NULL; i++) {
            if (strcmp(argv[i], "-m") == 0 || strcmp(argv[i], "--model") == 0) {
                is_model_launch = true;
                break;
            }   
        }   
    }   

    if (is_llama_server && is_model_launch) {
        // 1. Get the desired BUFFER from the user (e.g., 24576 for 24GB)
        long desired_buffer_mib = 1024;
        char *env_val = getenv("LLAMA_ARG_FIT_TARGET");
        if (env_val) desired_buffer_mib = atol(env_val);

        // 2. Query NVML for current usage 
        nvmlInit();
        nvmlDevice_t device;
        nvmlMemory_t memory;
        nvmlDeviceGetHandleByIndex(0, &device);
        nvmlDeviceGetMemoryInfo(device, &memory);
        long used_mib = memory.used / (1024 * 1024);
        nvmlShutdown();

        // 3. Formula: NEW_FIT_TARGET = (Desired Total Buffer) - (Already Used)
        long new_fit_target = desired_buffer_mib - used_mib;

        if (new_fit_target < 2048) {
            fprintf(stderr, "[LD_PRELOAD] Warning: Current VRAM usage (%ld MiB) already exceeds "
                            "the desired absolute buffer (%ld MiB). Targeting 2048MiB free after load.\n",
                            used_mib, desired_buffer_mib);
            new_fit_target = 2048;
        }

        char fit_var[64];
        snprintf(fit_var, sizeof(fit_var), "LLAMA_ARG_FIT_TARGET=%ld", new_fit_target);

        printf("[fit_target_jit.so: updated to LLAMA_ARG_FIT_TARGET=%ld]\n", new_fit_target);

        // 4. Update environment for the child process
        int env_count = 0;
        while (envp && envp[env_count]) env_count++;
        char **new_env = malloc((env_count + 2) * sizeof(char *));
        int j = 0;
        for (int i = 0; i < env_count; i++) {
            if (strncmp(envp[i], "LLAMA_ARG_FIT_TARGET=", 21) != 0) {
                new_env[j++] = strdup(envp[i]);
            }
        }
        new_env[j++] = strdup(fit_var);
        new_env[j] = NULL;

        int result = orig_posix_spawn(pid, path, file_actions, attrp, argv, new_env);

        for (int i = 0; i < j; i++) free(new_env[i]);
        free(new_env);
        return result;
    }

    return orig_posix_spawn(pid, path, file_actions, attrp, argv, envp);
 }
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <spawn.h>
	#include <dlfcn.h>
	#include <stdbool.h>
	#include <errno.h>
	#include <nvml.h>

	/* Pointer-to-function type with the same parameter list and return type as
	 * posix_spawn(). The pointer declarators stripped from this copy of the
	 * source are restored so the type can hold a function address. */
	typedef int (*posix_spawn_t)(pid_t *, const char *, const posix_spawn_file_actions_t *,
	                             const posix_spawnattr_t *, char *const [], char *const []);

	int posix_spawn(pid_t pid, const char path, const posix_spawn_file_actions_t *file_actions,
	const posix_spawnattr_t attrp, char const argv[], char *const envp[]) {

	posix_spawn_t orig_posix_spawn = (posix_spawn_t)dlsym(RTLD_NEXT, "posix_spawn");

	bool is_llama_server = (path && strstr(path, "llama-server"));
	bool is_model_launch = false;

	if (is_llama_server && argv) {
	for (int i = 0; argv[i] != NULL; i++) {
	if (strcmp(argv[i], "-m") == 0 \|\| strcmp(argv[i], "--model") == 0) {
	is_model_launch = true;
	break;
	}
	}
	}

	if (is_llama_server && is_model_launch) {
	// 1. Get the desired BUFFER from the user (e.g., 24576 for 24GB)
	long desired_buffer_mib = 1024;
	char *env_val = getenv("LLAMA_ARG_FIT_TARGET");
	if (env_val) desired_buffer_mib = atol(env_val);

	// 2. Query NVML for current usage
	nvmlInit();
	nvmlDevice_t device;
	nvmlMemory_t memory;
	nvmlDeviceGetHandleByIndex(0, &device);
	nvmlDeviceGetMemoryInfo(device, &memory);
	long used_mib = memory.used / (1024 * 1024);
	nvmlShutdown();

	// 3. Formula: NEW_FIT_TARGET = (Desired Total Buffer) - (Already Used)
	long new_fit_target = desired_buffer_mib - used_mib;

	if (new_fit_target < 2048) {
	fprintf(stderr, "[LD_PRELOAD] Warning: Current VRAM usage (%ld MiB) already exceeds "
	"the desired absolute buffer (%ld MiB). Targeting 2048MiB free after load.\n",
	used_mib, desired_buffer_mib);
	new_fit_target = 2048;
	}

	char fit_var[64];
	snprintf(fit_var, sizeof(fit_var), "LLAMA_ARG_FIT_TARGET=%ld", new_fit_target);

	printf("[fit_target_jit.so: updated to LLAMA_ARG_FIT_TARGET=%ld]\n", new_fit_target);

	// 4. Update environment for the child process
	int env_count = 0;
	while (envp && envp[env_count]) env_count++;
	char *new_env = malloc((env_count + 2) sizeof(char *));
	int j = 0;
	for (int i = 0; i < env_count; i++) {
	if (strncmp(envp[i], "LLAMA_ARG_FIT_TARGET=", 21) != 0) {
	new_env[j++] = strdup(envp[i]);
	}
	}
	new_env[j++] = strdup(fit_var);
	new_env[j] = NULL;

	int result = orig_posix_spawn(pid, path, file_actions, attrp, argv, new_env);

	for (int i = 0; i < j; i++) free(new_env[i]);
	free(new_env);
	return result;
	}

	return orig_posix_spawn(pid, path, file_actions, attrp, argv, envp);
	}
No results found