#include <assert.h>
#include <inttypes.h>
#include <limits.h>
#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <errno.h>  // errno, EISDIR, ENOENT, ENOTDIR
#include <fcntl.h>  // open
#include <mach/vm_page_size.h>  // vm_page_size
#include <sys/mman.h>  // mmap, PROT_READ, MAP_PRIVATE
#include <sys/stat.h>  // fstat, stat
#include <sys/types.h>  // off_t, ssize_t
#include <unistd.h>  // close

#include <gpt-oss.h>

#include "internal/datatype.h"
#include "internal/kernel-args.h"  // gptoss_expert_prediction
#include "internal/log.h"
#include "internal/uuid.h"
#include "internal/storage.h"
#include "internal/math.h"
#include "internal/model.h"


// Rounds up to the next multiple of the VM page size (identity for aligned values).
static size_t round_up_to_page_size(size_t bytes) {
    const size_t page_size_mask = (size_t) vm_page_size - 1;
    if ((bytes & page_size_mask) != 0) {
        bytes |= page_size_mask;
        bytes += 1;
    }
    return bytes;
}

static size_t round_down_to_page_size(size_t bytes) {
    const size_t page_size_mask = (size_t) vm_page_size - 1;
    return bytes & ~page_size_mask;
}

// Reads exactly `size` bytes from `fd`, retrying on short reads.
static enum gptoss_status read_fd(int fd, void* data, size_t size, const char* path) {
    assert(fd != -1);
    assert(data != NULL);
    assert(size != 0);

    size_t bytes_to_read = size;
    char* current_byte = (char*) data;
    do {
        const ssize_t read_result = read(fd, current_byte, bytes_to_read);
        if (read_result < 0) {
            GPTOSS_LOG_ERROR("reading %zu bytes from file %s failed with error %d",
                size, path, errno);
            return gptoss_status_io_error;
        }
        current_byte += (size_t) read_result;
        bytes_to_read -= (size_t) read_result;
    } while (bytes_to_read != 0);
    return gptoss_status_success;
}

// Hints the kernel (F_RDADVISE) to prefetch the given byte range of the file.
static void prefetch_fd(int fd, size_t offset, size_t size, const char* path) {
    // radvisory.ra_count is int, so we can't prefetch 2GB+ at once
    const size_t prefetch_max = round_down_to_page_size((size_t) INT_MAX);
    do {
        const size_t prefetch_size = math_min(size, prefetch_max);
        const struct radvisory ra = {
            .ra_offset = offset,
            .ra_count = (int) prefetch_size,
        };
        if (fcntl(fd, F_RDADVISE, &ra) == -1) {
            GPTOSS_LOG_WARNING("fcntl(%s, F_RDADVISE, .ra_offset=%zu, .ra_count=%d) failed with error %d\n",
                path, (size_t) ra.ra_offset, ra.ra_count, errno);
            return;
        }
        offset += prefetch_size;
        size -= prefetch_size;
    } while (size != 0);
}

enum gptoss_status GPTOSS_ABI gptoss_model_create_from_file(
    const char* path,
    gptoss_model_t* model_out)
{
    *model_out = NULL;

    enum gptoss_status status = gptoss_status_success;
    struct gptoss_model* model = NULL;
    struct gptoss_tokenizer* tokenizer = NULL;
    int fd = -1;
    size_t file_offset = 0;

    fd = open(path, O_RDONLY);
    if (fd == -1) {
        GPTOSS_LOG_ERROR("open(%s) failed with error %d", path, errno);
        switch (errno) {
            case EISDIR:
            case ENOENT:
            case ENOTDIR:
                status = gptoss_status_invalid_argument;
                break;
            default:
                status = gptoss_status_io_error;
                break;
        }
        goto cleanup;
    }

    struct gptoss_file_header file_header;
    status = read_fd(fd, &file_header, sizeof(file_header), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(file_header);

    if (file_header.magic[0] != 'G' ||
        file_header.magic[1] != 'P' ||
        file_header.magic[2] != 'T' ||
        file_header.magic[3] != '-' ||
        file_header.magic[4] != 'O' ||
        file_header.magic[5] != 'S' ||
        file_header.magic[6] != 'S' ||
        file_header.magic[7] != ' ' ||
        file_header.magic[8] != 'v' ||
        file_header.magic[9] != '1' ||
        file_header.magic[10] != '.' ||
        file_header.magic[11] != '0' ||
        file_header.zero != 0)
    {
        GPTOSS_LOG_ERROR("invalid magic in file %s", path);
        status = gptoss_status_invalid_argument;
        goto cleanup;
    }
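
    // The rest of the file is consumed in the order it is laid out on disk:
    // model UUID, model header, layout UUID, tokenizer UUID, tokenizer header,
    // one UUID per special token, the tokenizer regex and token table, and
    // finally the page-aligned weight data, which is mmapped rather than read.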

    struct gptoss_uuid model_uuid;
    status = read_fd(fd, &model_uuid, sizeof(model_uuid), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(model_uuid);

    if (!gptoss_is_gptoss_model_uuid(&model_uuid)) {
        GPTOSS_LOG_ERROR("unsupported model UUID " UUID_FORMAT, UUID_ARGS(model_uuid));
        status = gptoss_status_invalid_argument;
        goto cleanup;
    }

    struct gptoss_gptoss_model_header model_header;
    status = read_fd(fd, &model_header, sizeof(model_header), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(model_header);

    struct gptoss_uuid layout_uuid;
    status = read_fd(fd, &layout_uuid, sizeof(layout_uuid), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(layout_uuid);

    if (!gptoss_is_applegpu_layout_uuid(&layout_uuid)) {
        GPTOSS_LOG_ERROR("unsupported layout UUID " UUID_FORMAT, UUID_ARGS(layout_uuid));
        status = gptoss_status_invalid_argument;
        goto cleanup;
    }

    const size_t model_size = sizeof(struct gptoss_model) + model_header.num_blocks * sizeof(struct gptoss_metal_buffer);
    model = malloc(model_size);
    if (model == NULL) {
        GPTOSS_LOG_ERROR("failed to allocate %zu bytes for model descriptor", model_size);
        status = gptoss_status_insufficient_memory;
        goto cleanup;
    }
    memset(model, 0, model_size);

    atomic_store_explicit(&model->ref_count, 1, memory_order_relaxed);
    model->context_length = model_header.context_length;
    model->num_blocks = model_header.num_blocks;
    model->num_experts = model_header.num_experts;
    model->num_active_experts = model_header.num_active_experts;
    model->embedding_dim = model_header.embedding_dim;
    model->mlp_dim = model_header.mlp_dim;
    model->swiglu_limit = model_header.swiglu_limit;
    model->head_dim = model_header.head_dim;
    model->num_heads = model_header.num_heads;
    model->num_kv_heads = model_header.num_kv_heads;
    model->attention_window = model_header.attention_window;
    model->rope_theta = model_header.rope_theta;
    model->interpolation_scale = model_header.interpolation_scale;
    model->yarn_offset = model_header.yarn_offset;
    model->yarn_scale = model_header.yarn_scale;
    model->yarn_multiplier = model_header.yarn_multiplier;
    model->rmsnorm_epsilon = model_header.rmsnorm_epsilon;
    model->max_batch_tokens = GPTOSS_DEFAULT_BATCH_SIZE;

    struct gptoss_uuid tokenizer_uuid;
    status = read_fd(fd, &tokenizer_uuid, sizeof(tokenizer_uuid), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(tokenizer_uuid);

    if (!gptoss_is_tiktoken_tokenizer_uuid(&tokenizer_uuid)) {
        GPTOSS_LOG_ERROR("unsupported tokenizer UUID " UUID_FORMAT, UUID_ARGS(tokenizer_uuid));
        status = gptoss_status_invalid_argument;
        goto cleanup;
    }

    struct gptoss_tiktoken_tokenizer_header tokenizer_header;
    status = read_fd(fd, &tokenizer_header, sizeof(tokenizer_header), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(tokenizer_header);

    tokenizer = malloc(sizeof(struct gptoss_tokenizer));
    if (tokenizer == NULL) {
        GPTOSS_LOG_ERROR("failed to allocate %zu bytes for tokenizer descriptor", sizeof(struct gptoss_tokenizer));
        status = gptoss_status_insufficient_memory;
        goto cleanup;
    }
    memset(tokenizer, 0, sizeof(struct gptoss_tokenizer));
    // Initialize all special token IDs to UINT32_MAX (0xFF in all bytes)
    memset(tokenizer->special_token_id, 0xFF, sizeof(tokenizer->special_token_id));

    atomic_store_explicit(&tokenizer->ref_count, 1, memory_order_relaxed);
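
    // The tokenizer payload follows: counts from the header, then one UUID per
    // special token. Recognized special tokens get IDs immediately after the
    // text tokens; unrecognized ones leave their slot at the UINT32_MAX
    // sentinel set above. The `token - 1` index presumably reflects
    // gptoss_special_token_invalid being the zero value of the enum.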
    tokenizer->num_special_tokens = tokenizer_header.num_special_tokens;
    tokenizer->num_text_tokens = tokenizer_header.num_text_tokens;
    model->vocabulary_size = tokenizer_header.num_special_tokens + tokenizer_header.num_text_tokens;

    for (uint32_t t = 0; t < tokenizer_header.num_special_tokens; t++) {
        struct gptoss_uuid token_uuid;
        status = read_fd(fd, &token_uuid, sizeof(token_uuid), path);
        if (status != gptoss_status_success) {
            goto cleanup;
        }
        file_offset += sizeof(token_uuid);

        const enum gptoss_special_token token = gptoss_special_token_decode_uuid(&token_uuid);
        if (token != gptoss_special_token_invalid) {
            tokenizer->special_token_id[token - 1] = tokenizer_header.num_text_tokens + t;
        }
    }

    const size_t tokenizer_start_offset = file_offset;
    const size_t tokenizer_end_offset = tokenizer_start_offset + tokenizer_header.regex_size + tokenizer_header.tokens_size;
    const size_t tokenizer_mapping_start = round_down_to_page_size(tokenizer_start_offset);
    const size_t tokenizer_mapping_size = round_up_to_page_size(tokenizer_end_offset) - tokenizer_mapping_start;
    void* tokenizer_mapping_ptr = mmap(NULL, tokenizer_mapping_size, PROT_READ, MAP_PRIVATE, fd, tokenizer_mapping_start);
    if (tokenizer_mapping_ptr == (void*) -1) {
        GPTOSS_LOG_ERROR("failed to mmap(%s) tokenizer at offset %zu size %zu",
            path, tokenizer_mapping_start, tokenizer_mapping_size);
        status = gptoss_status_io_error;
        goto cleanup;
    }
    tokenizer->mapping_ptr = tokenizer_mapping_ptr;
    tokenizer->mapping_size = tokenizer_mapping_size;
    tokenizer->regex_ptr = (const char*) tokenizer_mapping_ptr + (tokenizer_start_offset - tokenizer_mapping_start);
    tokenizer->tokens_ptr = tokenizer->regex_ptr + tokenizer_header.regex_size;

    if (madvise(tokenizer_mapping_ptr, tokenizer_mapping_size, MADV_RANDOM | MADV_WILLNEED) != 0) {
        GPTOSS_LOG_WARNING("madvise(%s, size=%zu) failed with error %d", path, tokenizer_mapping_size, errno);
    }
    prefetch_fd(fd, tokenizer_mapping_start, tokenizer_mapping_size, path);
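
    // The weight payload begins at the next page boundary after the tokenizer
    // data and extends to the end of the file, so it can be mapped directly
    // (mmap offsets must be page-aligned). MADV_SEQUENTIAL/MADV_WILLNEED and
    // the F_RDADVISE prefetch below hint the kernel to read it ahead of use.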
    struct stat model_stat = {0};
    int stat_result = fstat(fd, &model_stat);
    if (stat_result != 0) {
        GPTOSS_LOG_ERROR("stat(%s) failed with error %d", path, errno);
        status = gptoss_status_io_error;
        goto cleanup;
    }

    const size_t model_mapping_start = round_up_to_page_size(tokenizer_end_offset);
    const size_t model_mapping_size = round_up_to_page_size((size_t) model_stat.st_size) - model_mapping_start;
    void* model_mapping_ptr = mmap(NULL, model_mapping_size, PROT_READ, MAP_PRIVATE, fd, model_mapping_start);
    if (model_mapping_ptr == (void*) -1) {
        GPTOSS_LOG_ERROR("failed to mmap(%s) model weights at offset %zu size %zu",
            path, model_mapping_start, model_mapping_size);
        status = gptoss_status_io_error;
        goto cleanup;
    }
    model->mapping_ptr = model_mapping_ptr;
    model->mapping_size = model_mapping_size;

    if (madvise(model_mapping_ptr, model_mapping_size, MADV_SEQUENTIAL | MADV_WILLNEED) != 0) {
        GPTOSS_LOG_WARNING("madvise(%s, size=%zu) failed with error %d", path, model_mapping_size, errno);
    }
    prefetch_fd(fd, model_mapping_start, model_mapping_size, path);

    // Initialize Metal
    status = gptoss_metal_device_create_system_default(&model->device);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    model->max_threadgroups = model->device.num_cores * 3;
    status = gptoss_metal_command_queue_create(&model->device, &model->command_queue);
    if (status != gptoss_status_success) {
        goto cleanup;
    }

    // Metal kernels
    status = gptoss_metal_library_create_default(&model->device, &model->library);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_bf16_f32_embeddings", &model->bf16_f32_embeddings_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_bf16w_rmsnorm", &model->f32_bf16w_rmsnorm_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_bf16w_matmul", &model->f32_bf16w_matmul_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_bf16w_unembedding", &model->f32_bf16w_unembedding_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_rope", &model->f32_rope_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_mf4w_moe_matmul_swiglu", &model->f32_mf4w_moe_matmul_swiglu_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_mf4w_moe_matmul", &model->f32_mf4w_moe_matmul_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_accumulate_e4", &model->f32_accumulate_e4_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_topk_softmax_e32_k4", &model->f32_topk_softmax_e32_k4_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_topk_softmax_e128_k4", &model->f32_topk_softmax_e128_k4_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_softmax", &model->f32_softmax_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_sdpa_q8_d64", &model->f32_sdpa_q8_d64_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }

    // Weight buffers
    const char* current_ptr = (const char*) model->mapping_ptr;
    const size_t embedding_weight_size = math_round_up_po2(model->vocabulary_size * model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_rmsnorm_gain_offset = embedding_weight_size;
    const size_t rmsnorm_weight_size = math_round_up_po2(model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_qkv_weight_offset = model->attn_rmsnorm_gain_offset + rmsnorm_weight_size;
    const size_t attn_qkv_dim = model->head_dim * (model->num_heads + 2 * model->num_kv_heads);
    const size_t attn_qkv_weight_size = math_round_up_po2(attn_qkv_dim * model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_qkv_bias_offset = model->attn_qkv_weight_offset + attn_qkv_weight_size;
    const size_t attn_qkv_bias_size = math_round_up_po2(attn_qkv_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_sdpa_sink_offset = model->attn_qkv_bias_offset + attn_qkv_bias_size;
    const size_t attn_sink_weight_size = math_round_up_po2(model->num_heads * sizeof(gptoss_bfloat16), 16);
    model->attn_out_weight_offset = model->attn_sdpa_sink_offset + attn_sink_weight_size;
    const size_t attn_out_weight_size = math_round_up_po2(model->embedding_dim * model->num_heads * model->head_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_out_bias_offset = model->attn_out_weight_offset + attn_out_weight_size;
    const size_t attn_out_bias_size = math_round_up_po2(model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->mlp_rmsnorm_gain_offset = model->attn_out_bias_offset + attn_out_bias_size;
    model->mlp_gate_weight_offset = model->mlp_rmsnorm_gain_offset + rmsnorm_weight_size;
    const size_t mlp_gate_weight_size = math_round_up_po2(model->num_experts * model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->mlp_gate_bias_offset = model->mlp_gate_weight_offset + mlp_gate_weight_size;
    const size_t mlp_gate_bias_size = math_round_up_po2(model->num_experts * sizeof(gptoss_bfloat16), 16);
    const size_t per_block_shared_weights_size =
        rmsnorm_weight_size + attn_qkv_weight_size + attn_qkv_bias_size + attn_sink_weight_size +
        attn_out_weight_size + attn_out_bias_size + rmsnorm_weight_size + mlp_gate_weight_size + mlp_gate_bias_size;
    model->rmsnorm_weight_offset = embedding_weight_size + model->num_blocks * per_block_shared_weights_size;
    model->unembedding_weight_offset = model->rmsnorm_weight_offset + rmsnorm_weight_size;
    const size_t unembedding_weight_size = math_round_up_po2(model->vocabulary_size * model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->per_block_shared_weights_size = per_block_shared_weights_size;
    const size_t shared_weights_size = round_up_to_page_size(
        embedding_weight_size + rmsnorm_weight_size + unembedding_weight_size + model->num_blocks * per_block_shared_weights_size);
    status = gptoss_metal_buffer_wrap(&model->device, shared_weights_size, current_ptr, &model->shared_weight_buffer);
    if (status != gptoss_status_success) {
        GPTOSS_LOG_ERROR("failed to map expert-shared weight of size %zu onto a Metal buffer", shared_weights_size);
        goto cleanup;
    }
    current_ptr += shared_weights_size;
    model->weights_size += shared_weights_size;
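
    // Per-expert MoE weights follow the shared weights, one page-aligned slab
    // per transformer block. The /2 and /32 factors suggest 4-bit ("mf4w")
    // quantized weights packed two per byte with one scale per 32 elements,
    // plus bf16 biases; per-expert offsets are kept 16-byte aligned.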
    const size_t mlp_swiglu_weight_block_size = math_round_up_po2(2 * model->mlp_dim * model->embedding_dim / 2, 16);
    model->mlp_swiglu_scale_offset = mlp_swiglu_weight_block_size;
    const size_t mlp_swiglu_weight_scale_size = math_round_up_po2(2 * model->mlp_dim * model->embedding_dim / 32, 16);
    model->mlp_swiglu_bias_offset = model->mlp_swiglu_scale_offset + mlp_swiglu_weight_scale_size;
    const size_t mlp_swiglu_bias_size = math_round_up_po2(2 * model->mlp_dim * sizeof(gptoss_bfloat16), 16);
    model->mlp_out_block_offset = model->mlp_swiglu_bias_offset + mlp_swiglu_bias_size;
    const size_t mlp_out_weight_block_size = math_round_up_po2(model->embedding_dim * model->mlp_dim / 2, 16);
    model->mlp_out_scale_offset = model->mlp_out_block_offset + mlp_out_weight_block_size;
    const size_t mlp_out_weight_scale_size = math_round_up_po2(model->embedding_dim * model->mlp_dim / 32, 16);
    model->mlp_out_bias_offset = model->mlp_out_scale_offset + mlp_out_weight_scale_size;
    const size_t mlp_out_bias_size = math_round_up_po2(model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->per_expert_block_weight_size =
        mlp_swiglu_weight_block_size + mlp_swiglu_weight_scale_size + mlp_swiglu_bias_size +
        mlp_out_weight_block_size + mlp_out_weight_scale_size + mlp_out_bias_size;
    const size_t moe_block_weight_size = round_up_to_page_size(model->num_experts * model->per_expert_block_weight_size);
    for (uint32_t n = 0; n < model->num_blocks; n++) {
        status = gptoss_metal_buffer_wrap(&model->device, moe_block_weight_size, current_ptr, &model->block_weight_buffers[n]);
        if (status != gptoss_status_success) {
            GPTOSS_LOG_ERROR("failed to map block #%" PRIu32 " MoE weight of size %zu onto a Metal buffer",
                n, moe_block_weight_size);
            goto cleanup;
        }
        current_ptr += moe_block_weight_size;
        model->weights_size += moe_block_weight_size;
    }
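
    // Activation buffers below are sized for max_batch_tokens tokens; the
    // SwiGLU and MoE output buffers additionally scale with num_active_experts,
    // one copy per routed expert.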
    // Activation buffers
    status = gptoss_metal_buffer_create(&model->device,
        model->max_batch_tokens * model->embedding_dim * sizeof(float),
        NULL, &model->residual_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device,
        model->max_batch_tokens * model->embedding_dim * sizeof(float),
        NULL, &model->rmsnorm_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device,
        model->max_batch_tokens * model->head_dim * (model->num_heads + 2 * model->num_kv_heads) * sizeof(float),
        NULL, &model->qkv_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device,
        model->max_batch_tokens * model->head_dim * model->num_heads * sizeof(float),
        NULL, &model->sdpa_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device,
        model->max_batch_tokens * model->num_experts * sizeof(float),
        NULL, &model->gate_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device,
        model->max_batch_tokens * model->num_experts * sizeof(struct gptoss_expert_prediction),
        NULL, &model->expert_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device,
        model->max_batch_tokens * model->num_active_experts * model->mlp_dim * sizeof(float),
        NULL, &model->swiglu_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device,
        model->max_batch_tokens * model->num_active_experts * model->embedding_dim * sizeof(float),
        NULL, &model->moe_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }

    model->allocation_size =
        model->residual_activation_buffer.size + model->rmsnorm_activation_buffer.size +
        model->qkv_activation_buffer.size + model->sdpa_activation_buffer.size +
        model->gate_activation_buffer.size + model->expert_activation_buffer.size +
        model->swiglu_activation_buffer.size + model->moe_activation_buffer.size;

    // Commit tokenizer
    model->tokenizer = tokenizer;
    tokenizer = NULL;

    // Commit model
    *model_out = model;
    model = NULL;

cleanup:
    if (fd != -1) {
        close(fd);
        fd = -1;
    }
    gptoss_model_release(model);  // does nothing if model is NULL
    gptoss_tokenizer_release(tokenizer);  // does nothing if tokenizer is NULL
    return status;
}

enum gptoss_status GPTOSS_ABI gptoss_model_get_tokenizer(
    gptoss_model_t model,
    gptoss_tokenizer_t* tokenizer_out)
{
    gptoss_tokenizer_t tokenizer = model->tokenizer;
    atomic_fetch_add_explicit(&tokenizer->ref_count, 1, memory_order_relaxed);
    *tokenizer_out = tokenizer;
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_model_get_max_context_length(
    gptoss_model_t model,
    size_t* max_context_length_out)
{
    *max_context_length_out = model->context_length;
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_model_retain(
    gptoss_model_t model)
{
    atomic_fetch_add_explicit(&model->ref_count, 1, memory_order_relaxed);
    return gptoss_status_success;
}
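
// Releasing drops one reference; the final release tears down Metal objects in
// reverse creation order, unmaps the weight mapping, and zeroes the descriptor
// before freeing it.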
enum gptoss_status GPTOSS_ABI gptoss_model_release(
    gptoss_model_t model)
{
    if (model != NULL) {
        if (atomic_fetch_sub_explicit(&model->ref_count, 1, memory_order_acq_rel) == 1) {
            gptoss_tokenizer_release(model->tokenizer);

            // Activation buffers
            gptoss_metal_buffer_release(&model->residual_activation_buffer);
            gptoss_metal_buffer_release(&model->rmsnorm_activation_buffer);
            gptoss_metal_buffer_release(&model->qkv_activation_buffer);
            gptoss_metal_buffer_release(&model->sdpa_activation_buffer);
            gptoss_metal_buffer_release(&model->gate_activation_buffer);
            gptoss_metal_buffer_release(&model->expert_activation_buffer);
            gptoss_metal_buffer_release(&model->swiglu_activation_buffer);
            gptoss_metal_buffer_release(&model->moe_activation_buffer);

            // Weight buffers
            gptoss_metal_buffer_release(&model->shared_weight_buffer);
            for (uint32_t n = 0; n < model->num_blocks; n++) {
                gptoss_metal_buffer_release(&model->block_weight_buffers[n]);
            }

            // Metal kernels
            gptoss_metal_function_release(&model->bf16_f32_embeddings_fn);
            gptoss_metal_function_release(&model->f32_bf16w_rmsnorm_fn);
            gptoss_metal_function_release(&model->f32_bf16w_matmul_fn);
            gptoss_metal_function_release(&model->f32_bf16w_unembedding_fn);
            gptoss_metal_function_release(&model->f32_rope_fn);
            gptoss_metal_function_release(&model->f32_mf4w_moe_matmul_swiglu_fn);
            gptoss_metal_function_release(&model->f32_mf4w_moe_matmul_fn);
            gptoss_metal_function_release(&model->f32_accumulate_e4_fn);
            gptoss_metal_function_release(&model->f32_topk_softmax_e32_k4_fn);
            gptoss_metal_function_release(&model->f32_topk_softmax_e128_k4_fn);
            gptoss_metal_function_release(&model->f32_softmax_fn);
            gptoss_metal_function_release(&model->f32_sdpa_q8_d64_fn);
            gptoss_metal_library_release(&model->library);
            gptoss_metal_command_queue_release(&model->command_queue);
            gptoss_metal_device_release(&model->device);

            // Weights mapping
            if (model->mapping_ptr != NULL && model->mapping_size != 0) {
                if (munmap(model->mapping_ptr, model->mapping_size) != 0) {
                    GPTOSS_LOG_WARNING("munmap for model weight mapping failed with error %d", errno);
                }
            }

            const size_t model_size = sizeof(struct gptoss_model) + model->num_blocks * sizeof(struct gptoss_metal_buffer);
            memset(model, 0, model_size);
            free(model);
        }
    }
    return gptoss_status_success;
}
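
// A minimal usage sketch (error handling elided; assumes the public entry
// points declared in <gpt-oss.h> match the definitions above; the file path
// is a placeholder):
//
//     gptoss_model_t model = NULL;
//     if (gptoss_model_create_from_file("/path/to/model.bin", &model) == gptoss_status_success) {
//         gptoss_tokenizer_t tokenizer = NULL;
//         gptoss_model_get_tokenizer(model, &tokenizer);
//         /* ... run inference with model + tokenizer ... */
//         gptoss_tokenizer_release(tokenizer);
//         gptoss_model_release(model);
//     }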