Last active
April 5, 2026 04:15
-
-
Save ddh0/a27277362668fb54e888e0572d407c34 to your computer and use it in GitHub Desktop.
llama.cpp - capture tensors to .npy format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include "arg.h" | |
| #include "common.h" | |
| #include "log.h" | |
| #include "llama.h" | |
| #include "ggml.h" | |
| #include <map> | |
| #include <cstdio> | |
| #include <string> | |
| #include <vector> | |
| #include <fstream> | |
| #include <numeric> | |
| #include <sstream> | |
| #include <iomanip> | |
| #include <algorithm> | |
| #include "../../src/eval-callback-data.h" | |
/**
 * Sanitizes a tensor name to be used as part of a filename.
 *
 * Every character that is invalid in filenames on common OSes
 * (`/ \ : * ? " < > |`) and every ASCII control character (< 0x20, which
 * Windows rejects in file names) is replaced with an underscore. All other
 * bytes, including multi-byte UTF-8 sequences, are left untouched.
 *
 * @param name The original tensor name.
 * @return A sanitized copy of `name` suitable for use in a filename.
 */
static std::string sanitize_filename(const std::string & name) {
    std::string sanitized = name;
    std::replace_if(sanitized.begin(), sanitized.end(), [](char c) {
        // compare as unsigned so that bytes >= 0x80 (UTF-8) are never mistaken for control characters
        if (static_cast<unsigned char>(c) < 0x20) {
            return true;
        }
        switch (c) {
            case '/':
            case '\\':
            case ':':
            case '*':
            case '?':
            case '"':
            case '<':
            case '>':
            case '|':
                return true;
            default:
                return false;
        }
    }, '_');
    return sanitized;
}
| /** | |
| * Maps a GGML type to its corresponding NumPy data type descriptor string. | |
| * @param type The GGML data type. | |
| * @return A string representing the NumPy dtype, or an empty string if unsupported. | |
| */ | |
| static std::string get_npy_descr(ggml_type type) { | |
| switch (type) { | |
| case GGML_TYPE_F32: return "'<f4'"; | |
| case GGML_TYPE_F16: return "'<f2'"; | |
| case GGML_TYPE_I64: return "'<i8'"; | |
| case GGML_TYPE_I32: return "'<i4'"; | |
| case GGML_TYPE_I16: return "'<i2'"; | |
| case GGML_TYPE_I8: return "'<i1'"; | |
| // Note: BF16 is handled by converting to F32 before this function is called. | |
| default: return ""; | |
| } | |
| } | |
| static std::string ggml_ne_string(const ggml_tensor * t); // forward declaration | |
| /** | |
| * Callback to save a tensor's data to disk in NPY (NumPy) format v1.0. | |
| * | |
| * ref: https://numpy.org/doc/stable/reference/generated/numpy.lib.format.html#format-version-1-0 | |
| * | |
| * @param t current tensor | |
| * @param ask when ask is true, we return true if we want to receive the data for this tensor. | |
| * @param user_data a pointer to a `callback_data` struct. | |
| * @return always returns true to continue graph execution. | |
| */ | |
| static bool save_tensor_to_npy(struct ggml_tensor * t, bool ask, void * user_data) { | |
| if (ask) { | |
| // currently only support non-quantized tensors, which we can easily save to NPY format | |
| if (ggml_is_quantized(t->type)) { | |
| return false; | |
| } | |
| ggml_type type_to_check = t->type; | |
| if (type_to_check == GGML_TYPE_BF16) { | |
| type_to_check = GGML_TYPE_F32; // we promise to convert BF16 GGML tensors to F32 GGML tensors before converting/writing | |
| } | |
| return !get_npy_descr(type_to_check).empty(); | |
| } | |
| auto * cb_data = (callback_data *) user_data; | |
| // prepare tensor data (ensure it's contiguous and in a supported format) | |
| ggml_type type_to_save = t->type; | |
| std::vector<uint8_t> data_to_save; | |
| if (t->type == GGML_TYPE_BF16) { | |
| // we are going to convert BF16 data to F32 | |
| type_to_save = GGML_TYPE_F32; | |
| const int64_t n_elems = ggml_nelements(t); | |
| data_to_save.resize(n_elems * sizeof(float)); | |
| // create a temporary buffer for the bf16 data | |
| std::vector<uint8_t> bf16_data(ggml_nbytes(t)); | |
| ggml_backend_tensor_get(t, bf16_data.data(), 0, ggml_nbytes(t)); | |
| // manually convert bf16 to f32 | |
| float * dst_f32 = (float *) data_to_save.data(); | |
| ggml_bf16_t * src_bf16 = (ggml_bf16_t *) bf16_data.data(); | |
| for (int64_t i = 0; i < n_elems; ++i) { | |
| dst_f32[i] = ggml_bf16_to_fp32(src_bf16[i]); | |
| } | |
| } else { | |
| // for other types, get a contiguous copy from the backend. | |
| // this handles GPU --> CPU transfers and non-contiguous tensors automatically. | |
| data_to_save.resize(ggml_nbytes(t)); | |
| ggml_backend_tensor_get(t, data_to_save.data(), 0, ggml_nbytes(t)); | |
| } | |
| // construct file header string | |
| std::string descr = get_npy_descr(type_to_save); | |
| std::string shape_str = "("; | |
| const int n_dims = ggml_n_dims(t); | |
| if (n_dims > 0) { | |
| for (int i = n_dims - 1; i >= 0; --i) { | |
| shape_str += std::to_string(t->ne[i]); | |
| if ((n_dims > 1 && i > 0)) { | |
| shape_str += ", "; | |
| } | |
| } | |
| } | |
| if (n_dims == 1) { | |
| shape_str += ","; | |
| } | |
| shape_str += ")"; | |
| std::string header_dict_nl = "{'descr': " + descr + ", 'fortran_order': False, 'shape': " + shape_str + ", }\n"; | |
| // determine filename | |
| std::string descriptive_name; | |
| auto it = cb_data->tensor_descriptive_names.find(t); | |
| if (it != cb_data->tensor_descriptive_names.end()) { | |
| descriptive_name = it->second; | |
| } else { | |
| descriptive_name = t->name; | |
| } | |
| if (descriptive_name.empty()) { | |
| descriptive_name = "unnamed"; | |
| } | |
| // create file | |
| LOG("%s: saving tensor '%s'\n", __func__, descriptive_name.c_str()); | |
| LOG("%s: -- op: %s, type: %s, shape: [%s]\n", __func__, ggml_op_desc(t), ggml_type_name(t->type), ggml_ne_string(t).c_str()); | |
| std::stringstream ss; | |
| ss << std::setw(4) << std::setfill('0') << cb_data->file_counter++ << "_" << sanitize_filename(descriptive_name) << ".npy"; | |
| std::string filename = ss.str(); | |
| std::ofstream file(filename, std::ios::binary); | |
| if (!file) { | |
| LOG_ERR("%s: -- failed to open file '%s' for writing\n", __func__, filename.c_str()); | |
| return true; | |
| } | |
| // write file header | |
| // magic string and version 1.0 | |
| file.write("\x93NUMPY", 6); | |
| file.put('\x01'); | |
| file.put('\x00'); | |
| // header length and padding calculation | |
| size_t unpadded_len = 10 + header_dict_nl.length(); // magic, version, and length fields | |
| size_t padding = (64 - (unpadded_len % 64)) % 64; | |
| std::string header_padded = header_dict_nl; | |
| header_padded.insert(header_padded.length() - 1, padding, ' '); | |
| uint16_t header_len_val = header_padded.length(); | |
| // write header length (2 bytes, LE) | |
| file.put(header_len_val & 0xFF); | |
| file.put((header_len_val >> 8) & 0xFF); | |
| // write header content | |
| file.write(header_padded.c_str(), header_padded.length()); | |
| // write tensor data | |
| file.write(reinterpret_cast<const char*>(data_to_save.data()), data_to_save.size()); | |
| file.close(); | |
| LOG("%s: -- saved to %s\n", __func__, filename.c_str()); | |
| return true; | |
| } | |
| static std::string ggml_ne_string(const ggml_tensor * t) { | |
| std::string str; | |
| for (int i = 0; i < GGML_MAX_DIMS; ++i) { | |
| str += std::to_string(t->ne[i]); | |
| if (i + 1 < GGML_MAX_DIMS) { | |
| str += ", "; | |
| } | |
| } | |
| return str; | |
| } | |
| static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { | |
| union { | |
| float f; | |
| uint32_t i; | |
| } u; | |
| u.i = (uint32_t)h.bits << 16; | |
| return u.f; | |
| } | |
| static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) { | |
| size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; | |
| float v; | |
| if (type == GGML_TYPE_F16) { | |
| v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); | |
| } else if (type == GGML_TYPE_F32) { | |
| v = *(float *) &data[i]; | |
| } else if (type == GGML_TYPE_I64) { | |
| v = (float) *(int64_t *) &data[i]; | |
| } else if (type == GGML_TYPE_I32) { | |
| v = (float) *(int32_t *) &data[i]; | |
| } else if (type == GGML_TYPE_I16) { | |
| v = (float) *(int16_t *) &data[i]; | |
| } else if (type == GGML_TYPE_I8) { | |
| v = (float) *(int8_t *) &data[i]; | |
| } else if (type == GGML_TYPE_BF16) { | |
| v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]); | |
| } else { | |
| GGML_ABORT("fatal error"); | |
| } | |
| return v; | |
| } | |
| static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { | |
| GGML_ASSERT(n > 0); | |
| float sum = 0; | |
| for (int64_t i3 = 0; i3 < ne[3]; i3++) { | |
| for (int64_t i2 = 0; i2 < ne[2]; i2++) { | |
| for (int64_t i1 = 0; i1 < ne[1]; i1++) { | |
| for (int64_t i0 = 0; i0 < ne[0]; i0++) { | |
| const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3); | |
| sum += v; | |
| } | |
| } | |
| } | |
| } | |
| for (int64_t i3 = 0; i3 < ne[3]; i3++) { | |
| LOG(" [\n"); | |
| for (int64_t i2 = 0; i2 < ne[2]; i2++) { | |
| if (i2 == n && ne[2] > 2*n) { | |
| LOG(" ..., \n"); | |
| i2 = ne[2] - n; | |
| } | |
| LOG(" [\n"); | |
| for (int64_t i1 = 0; i1 < ne[1]; i1++) { | |
| if (i1 == n && ne[1] > 2*n) { | |
| LOG(" ..., \n"); | |
| i1 = ne[1] - n; | |
| } | |
| LOG(" ["); | |
| for (int64_t i0 = 0; i0 < ne[0]; i0++) { | |
| if (i0 == n && ne[0] > 2*n) { | |
| LOG("..., "); | |
| i0 = ne[0] - n; | |
| } | |
| const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3); | |
| LOG("%12.4f", v); | |
| if (i0 < ne[0] - 1) LOG(", "); | |
| } | |
| LOG("],\n"); | |
| } | |
| LOG(" ],\n"); | |
| } | |
| LOG(" ]\n"); | |
| LOG(" sum = %f\n", sum); | |
| } | |
| // TODO: make this abort configurable/optional? | |
| if (std::isnan(sum)) { | |
| LOG_ERR("encountered NaN - aborting\n"); | |
| exit(0); | |
| } | |
| } | |
| /** | |
| * GGML operations callback during the graph execution. | |
| * | |
| * @param t current tensor | |
| * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor | |
| * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection. | |
| * see ggml_backend_sched_eval_callback | |
| * @param user_data user data to pass at each call back | |
| * @return true to receive data or continue the graph, false otherwise | |
| */ | |
| static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { | |
| auto * cb_data = (callback_data *) user_data; | |
| const struct ggml_tensor * src0 = t->src[0]; | |
| const struct ggml_tensor * src1 = t->src[1]; | |
| if (ask) { | |
| return true; // Always retrieve data | |
| } | |
| char src1_str[128] = {0}; | |
| if (src1) { | |
| snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); | |
| } | |
| LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, | |
| t->name, ggml_type_name(t->type), ggml_op_desc(t), | |
| src0->name, ggml_ne_string(src0).c_str(), | |
| src1 ? src1_str : "", | |
| ggml_ne_string(t).c_str()); | |
| // copy the data from the GPU memory if needed | |
| const bool is_host = ggml_backend_buffer_is_host(t->buffer); | |
| if (!is_host) { | |
| auto n_bytes = ggml_nbytes(t); | |
| cb_data->data.resize(n_bytes); | |
| ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); | |
| } | |
| if (!ggml_is_quantized(t->type)) { | |
| uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); | |
| ggml_print_tensor(data, t->type, t->ne, t->nb, 3); | |
| } | |
| return true; | |
| } | |
| static bool run(llama_context * ctx, const common_params & params) { | |
| const llama_model * model = llama_get_model(ctx); | |
| const llama_vocab * vocab = llama_model_get_vocab(model); | |
| const bool add_bos = llama_vocab_get_add_bos(vocab); | |
| std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos); | |
| if (tokens.empty()) { | |
| LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__); | |
| return false; | |
| } | |
| if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { | |
| LOG_ERR("%s : failed to eval\n", __func__); | |
| return false; | |
| } | |
| return true; | |
| } | |
| int main(int argc, char ** argv) { | |
| callback_data cb_data; | |
| common_params params; | |
| if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { | |
| return 1; | |
| } | |
| common_init(); | |
| llama_backend_init(); | |
| llama_numa_init(params.numa); | |
| // pass the callback to the backend scheduler | |
| // it will be executed for each node during the graph computation | |
| params.cb_eval = save_tensor_to_npy; | |
| params.cb_eval_user_data = &cb_data; | |
| params.warmup = false; | |
| // init | |
| common_init_result llama_init = common_init_from_params(params); | |
| llama_model * model = llama_init.model.get(); | |
| llama_context * ctx = llama_init.context.get(); | |
| if (model == nullptr || ctx == nullptr) { | |
| LOG_ERR("%s : failed to init\n", __func__); | |
| return 1; | |
| } | |
| // print system information | |
| { | |
| LOG_INF("\n"); | |
| LOG_INF("%s\n", common_params_get_system_info(params).c_str()); | |
| LOG_INF("\n"); | |
| } | |
| bool OK = run(ctx, params); | |
| if (!OK) { | |
| return 1; | |
| } | |
| LOG("\n"); | |
| llama_perf_context_print(ctx); | |
| llama_backend_free(); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment