Files
Dominik Kundel 243a1b0276 Initial commit
Co-authored-by: Zhuohan Li <zhuohan@openai.com>
Co-authored-by: Maratyszcza <marat@openai.com>
Co-authored-by: Volodymyr Kyrylov <vol@wilab.org.ua>
2025-08-05 08:19:49 -07:00

561 lines
25 KiB
C

#include <assert.h>
#include <errno.h> // errno, EISDIR, ENOENT, ENOTDIR
#include <fcntl.h> // open, fcntl
#include <inttypes.h>
#include <limits.h> // INT_MAX
#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <mach/vm_page_size.h> // vm_page_size
#include <sys/mman.h> // mmap, PROT_READ, MAP_PRIVATE
#include <sys/stat.h> // fstat, stat
#include <sys/types.h> // off_t, ssize_t
#include <unistd.h> // close, read

#include <gpt-oss.h>

#include "internal/datatype.h"
#include "internal/kernel-args.h" // gptoss_expert_prediction
#include "internal/log.h"
#include "internal/math.h"
#include "internal/model.h"
#include "internal/storage.h"
#include "internal/uuid.h"
// Rounds `bytes` up to the next multiple of the VM page size.
// Assumes vm_page_size is a power of two (true on Mach kernels).
static size_t round_up_to_page_size(size_t bytes) {
    const size_t page_size_mask = (size_t) vm_page_size - 1;
    // Adding the mask and clearing the low bits rounds up to the next page
    // boundary; already page-aligned values pass through unchanged.
    return (bytes + page_size_mask) & ~page_size_mask;
}
// Rounds `bytes` down to a multiple of the VM page size by clearing the
// sub-page bits. Assumes vm_page_size is a power of two.
static size_t round_down_to_page_size(size_t bytes) {
    const size_t page_mask = ~((size_t) vm_page_size - 1);
    return bytes & page_mask;
}
// Reads exactly `size` bytes from `fd` into `data`, looping over short reads.
// `path` is used only for error messages.
// Returns gptoss_status_success when all bytes were read, and
// gptoss_status_io_error on a read failure or premature end of file.
static enum gptoss_status read_fd(int fd, void* data, size_t size, const char* path) {
    assert(fd != -1);
    assert(data != NULL);
    assert(size != 0);

    size_t bytes_to_read = size;
    char* current_byte = (char*) data;
    do {
        const ssize_t read_result = read(fd, current_byte, bytes_to_read);
        if (read_result < 0) {
            // Interrupted before any data was transferred: retry the read.
            if (errno == EINTR) {
                continue;
            }
            GPTOSS_LOG_ERROR("reading %zu bytes from file %s failed with error %d",
                size, path, errno);
            return gptoss_status_io_error;
        }
        if (read_result == 0) {
            // EOF before `size` bytes arrived (truncated file). Without this
            // check the loop would spin forever: a zero-byte read neither
            // advances the cursor nor decrements bytes_to_read.
            GPTOSS_LOG_ERROR("reading %zu bytes from file %s failed: unexpected end of file with %zu bytes remaining",
                size, path, bytes_to_read);
            return gptoss_status_io_error;
        }
        current_byte += (size_t) read_result;
        bytes_to_read -= (size_t) read_result;
    } while (bytes_to_read != 0);
    return gptoss_status_success;
}
// Hints the kernel (via the macOS-specific F_RDADVISE fcntl) to read-ahead
// `size` bytes of `fd` starting at `offset`, in chunks small enough for the
// int-typed radvisory.ra_count field. Best-effort: failures are logged as
// warnings and further prefetching is abandoned, never reported to the caller.
static void prefetch_fd(int fd, size_t offset, size_t size, const char* path) {
    // radvisory.ra_count is int, so we can't prefetch 2GB+ at once
    const size_t prefetch_max = round_down_to_page_size((size_t) INT_MAX);
    // `while` rather than `do/while`: a zero-size request must not issue an
    // F_RDADVISE with ra_count == 0.
    while (size != 0) {
        const size_t prefetch_size = math_min(size, prefetch_max);
        const struct radvisory ra = {
            .ra_offset = offset,
            .ra_count = (int) prefetch_size,
        };
        if (fcntl(fd, F_RDADVISE, &ra) == -1) {
            GPTOSS_LOG_WARNING("fcntl(%s, F_RDADVISE, .ra_offset=%zu, .ra_count=%d) failed with error %d\n",
                path, (size_t) ra.ra_offset, ra.ra_count, errno);
            return;
        }
        offset += prefetch_size;
        size -= prefetch_size;
    }
}
// Loads a GPT-OSS model from a file at `path` and, on success, stores a new
// model object with ref_count == 1 in *model_out. The file is parsed as:
// file header (magic) -> model UUID -> model header -> layout UUID ->
// tokenizer UUID -> tokenizer header -> special-token UUIDs -> tokenizer
// regex+tokens blob -> page-aligned weights. The tokenizer blob and weights
// are memory-mapped read-only; ownership of both mappings is transferred to
// the tokenizer and model objects respectively, and all intermediate
// resources are released through the goto-cleanup chain on any failure.
enum gptoss_status GPTOSS_ABI gptoss_model_create_from_file(
    const char* path,
    gptoss_model_t* model_out)
{
    *model_out = NULL;

    enum gptoss_status status = gptoss_status_success;
    struct gptoss_model* model = NULL;
    struct gptoss_tokenizer* tokenizer = NULL;
    int fd = -1;
    // Running offset of the next unread byte; tracks how far the header
    // parsing has advanced so the mmap offsets below can be computed.
    size_t file_offset = 0;

    fd = open(path, O_RDONLY);
    if (fd == -1) {
        GPTOSS_LOG_ERROR("open(%s) failed with error %d", path, errno);
        // "File not found"-style errors are the caller's fault (bad path);
        // everything else is treated as an I/O failure.
        switch (errno) {
            case EISDIR:
            case ENOENT:
            case ENOTDIR:
                status = gptoss_status_invalid_argument;
                break;
            default:
                status = gptoss_status_io_error;
                break;
        }
        goto cleanup;
    }

    struct gptoss_file_header file_header;
    status = read_fd(fd, &file_header, sizeof(file_header), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(file_header);
    // The file must start with the literal magic "GPT-OSS v1.0" followed by a
    // zero field.
    if (file_header.magic[0] != 'G' ||
        file_header.magic[1] != 'P' ||
        file_header.magic[2] != 'T' ||
        file_header.magic[3] != '-' ||
        file_header.magic[4] != 'O' ||
        file_header.magic[5] != 'S' ||
        file_header.magic[6] != 'S' ||
        file_header.magic[7] != ' ' ||
        file_header.magic[8] != 'v' ||
        file_header.magic[9] != '1' ||
        file_header.magic[10] != '.' ||
        file_header.magic[11] != '0' ||
        file_header.zero != 0)
    {
        GPTOSS_LOG_ERROR("invalid magic in file %s", path);
        status = gptoss_status_invalid_argument;
        goto cleanup;
    }

    struct gptoss_uuid model_uuid;
    status = read_fd(fd, &model_uuid, sizeof(model_uuid), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(model_uuid);
    if (!gptoss_is_gptoss_model_uuid(&model_uuid)) {
        GPTOSS_LOG_ERROR("unsupported model UUID " UUID_FORMAT, UUID_ARGS(model_uuid));
        status = gptoss_status_invalid_argument;
        goto cleanup;
    }

    struct gptoss_gptoss_model_header model_header;
    status = read_fd(fd, &model_header, sizeof(model_header), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(model_header);

    struct gptoss_uuid layout_uuid;
    status = read_fd(fd, &layout_uuid, sizeof(layout_uuid), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(layout_uuid);
    // Only the Apple-GPU weight layout is supported by this (Metal) backend.
    if (!gptoss_is_applegpu_layout_uuid(&layout_uuid)) {
        GPTOSS_LOG_ERROR("unsupported layout UUID " UUID_FORMAT, UUID_ARGS(layout_uuid));
        status = gptoss_status_invalid_argument;
        goto cleanup;
    }

    // The model struct carries a trailing array of num_blocks Metal buffer
    // descriptors (one per transformer block), so it is sized dynamically.
    const size_t model_size = sizeof(struct gptoss_model) + model_header.num_blocks * sizeof(struct gptoss_metal_buffer);
    model = malloc(model_size);
    if (model == NULL) {
        GPTOSS_LOG_ERROR("failed to allocate %zu bytes for model descriptor", model_size);
        status = gptoss_status_insufficient_memory;
        goto cleanup;
    }
    memset(model, 0, model_size);
    atomic_store_explicit(&model->ref_count, 1, memory_order_relaxed);
    // Copy the hyperparameters out of the file header.
    model->context_length = model_header.context_length;
    model->num_blocks = model_header.num_blocks;
    model->num_experts = model_header.num_experts;
    model->num_active_experts = model_header.num_active_experts;
    model->embedding_dim = model_header.embedding_dim;
    model->mlp_dim = model_header.mlp_dim;
    model->swiglu_limit = model_header.swiglu_limit;
    model->head_dim = model_header.head_dim;
    model->num_heads = model_header.num_heads;
    model->num_kv_heads = model_header.num_kv_heads;
    model->attention_window = model_header.attention_window;
    model->rope_theta = model_header.rope_theta;
    model->interpolation_scale = model_header.interpolation_scale;
    model->yarn_offset = model_header.yarn_offset;
    model->yarn_scale = model_header.yarn_scale;
    model->yarn_multiplier = model_header.yarn_multiplier;
    model->rmsnorm_epsilon = model_header.rmsnorm_epsilon;

    model->max_batch_tokens = GPTOSS_DEFAULT_BATCH_SIZE;

    struct gptoss_uuid tokenizer_uuid;
    status = read_fd(fd, &tokenizer_uuid, sizeof(tokenizer_uuid), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(tokenizer_uuid);
    if (!gptoss_is_tiktoken_tokenizer_uuid(&tokenizer_uuid)) {
        GPTOSS_LOG_ERROR("unsupported tokenizer UUID " UUID_FORMAT, UUID_ARGS(tokenizer_uuid));
        status = gptoss_status_invalid_argument;
        goto cleanup;
    }

    struct gptoss_tiktoken_tokenizer_header tokenizer_header;
    status = read_fd(fd, &tokenizer_header, sizeof(tokenizer_header), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(tokenizer_header);

    tokenizer = malloc(sizeof(struct gptoss_tokenizer));
    if (tokenizer == NULL) {
        GPTOSS_LOG_ERROR("failed to allocate %zu bytes for tokenizer descriptor", sizeof(struct gptoss_tokenizer));
        status = gptoss_status_insufficient_memory;
        goto cleanup;
    }
    memset(tokenizer, 0, sizeof(struct gptoss_tokenizer));
    // Initialize all special token IDs to UINT32_MAX (0xFF in all bytes)
    memset(tokenizer->special_token_id, 0xFF, sizeof(tokenizer->special_token_id));

    atomic_store_explicit(&tokenizer->ref_count, 1, memory_order_relaxed);
    tokenizer->num_special_tokens = tokenizer_header.num_special_tokens;
    tokenizer->num_text_tokens = tokenizer_header.num_text_tokens;
    model->vocabulary_size = tokenizer_header.num_special_tokens + tokenizer_header.num_text_tokens;

    // Special tokens follow the text tokens in the vocabulary: the t-th
    // special-token UUID maps to token ID num_text_tokens + t. Unrecognized
    // UUIDs are skipped (their slot stays UINT32_MAX from the memset above).
    for (uint32_t t = 0; t < tokenizer_header.num_special_tokens; t++) {
        struct gptoss_uuid token_uuid;
        status = read_fd(fd, &token_uuid, sizeof(token_uuid), path);
        if (status != gptoss_status_success) {
            goto cleanup;
        }
        file_offset += sizeof(token_uuid);
        const enum gptoss_special_token token = gptoss_special_token_decode_uuid(&token_uuid);
        if (token != gptoss_special_token_invalid) {
            // The enum is 1-based (0 is the invalid sentinel), so index with
            // token - 1.
            tokenizer->special_token_id[token - 1] = tokenizer_header.num_text_tokens + t;
        }
    }

    // Map the tokenizer blob (regex followed by token strings). mmap offsets
    // must be page-aligned, so round the start down and re-add the slack when
    // computing the regex pointer.
    const size_t tokenizer_start_offset = file_offset;
    const size_t tokenizer_end_offset = tokenizer_start_offset + tokenizer_header.regex_size + tokenizer_header.tokens_size;
    const size_t tokenizer_mapping_start = round_down_to_page_size(tokenizer_start_offset);
    const size_t tokenizer_mapping_size = round_up_to_page_size(tokenizer_end_offset) - tokenizer_mapping_start;
    void* tokenizer_mapping_ptr = mmap(NULL, tokenizer_mapping_size, PROT_READ, MAP_PRIVATE, fd, tokenizer_mapping_start);
    if (tokenizer_mapping_ptr == (void*) -1) {
        GPTOSS_LOG_ERROR("failed to mmap(%s) tokenizer at offset %zu size %zu",
            path, tokenizer_mapping_start, tokenizer_mapping_size);
        status = gptoss_status_io_error;
        goto cleanup;
    }
    // The tokenizer object now owns the mapping; gptoss_tokenizer_release is
    // expected to munmap it (it is not unmapped in this function's cleanup).
    tokenizer->mapping_ptr = tokenizer_mapping_ptr;
    tokenizer->mapping_size = tokenizer_mapping_size;
    tokenizer->regex_ptr = (const char*) tokenizer_mapping_ptr + (tokenizer_start_offset - tokenizer_mapping_start);
    tokenizer->tokens_ptr = tokenizer->regex_ptr + tokenizer_header.regex_size;
    // Tokenizer lookups touch the blob at random offsets; advise accordingly
    // and kick off a read-ahead. Both hints are best-effort.
    if (madvise(tokenizer_mapping_ptr, tokenizer_mapping_size, MADV_RANDOM | MADV_WILLNEED) != 0) {
        GPTOSS_LOG_WARNING("madvise(%s, size=%zu) failed with error %d", path, tokenizer_mapping_size, errno);
    }
    prefetch_fd(fd, tokenizer_mapping_start, tokenizer_mapping_size, path);

    struct stat model_stat = {0};
    int stat_result = fstat(fd, &model_stat);
    if (stat_result != 0) {
        GPTOSS_LOG_ERROR("stat(%s) failed with error %d", path, errno);
        status = gptoss_status_io_error;
        goto cleanup;
    }

    // The weights occupy the rest of the file, starting at the first page
    // boundary after the tokenizer blob.
    const size_t model_mapping_start = round_up_to_page_size(tokenizer_end_offset);
    const size_t model_mapping_size = round_up_to_page_size((size_t) model_stat.st_size) - model_mapping_start;
    void* model_mapping_ptr = mmap(NULL, model_mapping_size, PROT_READ, MAP_PRIVATE, fd, model_mapping_start);
    if (model_mapping_ptr == (void*) -1) {
        GPTOSS_LOG_ERROR("failed to mmap(%s) model weights at offset %zu size %zu",
            path, model_mapping_start, model_mapping_size);
        status = gptoss_status_io_error;
        goto cleanup;
    }
    // Ownership transfers to the model object; gptoss_model_release unmaps it.
    model->mapping_ptr = model_mapping_ptr;
    model->mapping_size = model_mapping_size;
    // Weights are consumed front-to-back during buffer wrapping below.
    if (madvise(model_mapping_ptr, model_mapping_size, MADV_SEQUENTIAL | MADV_WILLNEED) != 0) {
        GPTOSS_LOG_WARNING("madvise(%s, size=%zu) failed with error %d", path, model_mapping_size, errno);
    }
    prefetch_fd(fd, model_mapping_start, model_mapping_size, path);

    // Initialize Metal
    status = gptoss_metal_device_create_system_default(&model->device);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    // NOTE(review): 3 threadgroups per GPU core appears to be a tuning
    // heuristic — confirm against the kernel dispatch code.
    model->max_threadgroups = model->device.num_cores * 3;
    status = gptoss_metal_command_queue_create(&model->device, &model->command_queue);
    if (status != gptoss_status_success) {
        goto cleanup;
    }

    // Metal kernels: look up every compute function used during inference
    // up-front so missing kernels fail at load time rather than mid-decode.
    status = gptoss_metal_library_create_default(&model->device, &model->library);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_bf16_f32_embeddings", &model->bf16_f32_embeddings_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_bf16w_rmsnorm", &model->f32_bf16w_rmsnorm_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_bf16w_matmul", &model->f32_bf16w_matmul_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_bf16w_unembedding", &model->f32_bf16w_unembedding_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_rope", &model->f32_rope_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_mf4w_moe_matmul_swiglu", &model->f32_mf4w_moe_matmul_swiglu_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_mf4w_moe_matmul", &model->f32_mf4w_moe_matmul_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_accumulate_e4", &model->f32_accumulate_e4_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_topk_softmax_e32_k4", &model->f32_topk_softmax_e32_k4_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_topk_softmax_e128_k4", &model->f32_topk_softmax_e128_k4_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_softmax", &model->f32_softmax_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_sdpa_q8_d64", &model->f32_sdpa_q8_d64_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }

    // Weight buffers: walk the mapped weights and record the byte offset of
    // each tensor. Every tensor size is rounded up to a 16-byte boundary,
    // matching the on-disk Apple-GPU layout.
    const char* current_ptr = (const char*) model->mapping_ptr;
    const size_t embedding_weight_size = math_round_up_po2(model->vocabulary_size * model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_rmsnorm_gain_offset = embedding_weight_size;
    const size_t rmsnorm_weight_size = math_round_up_po2(model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_qkv_weight_offset = model->attn_rmsnorm_gain_offset + rmsnorm_weight_size;
    // Fused QKV projection: Q heads plus one K and one V set per KV head.
    const size_t attn_qkv_dim = model->head_dim * (model->num_heads + 2 * model->num_kv_heads);
    const size_t attn_qkv_weight_size = math_round_up_po2(attn_qkv_dim * model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_qkv_bias_offset = model->attn_qkv_weight_offset + attn_qkv_weight_size;
    const size_t attn_qkv_bias_size = math_round_up_po2(attn_qkv_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_sdpa_sink_offset = model->attn_qkv_bias_offset + attn_qkv_bias_size;
    const size_t attn_sink_weight_size = math_round_up_po2(model->num_heads * sizeof(gptoss_bfloat16), 16);
    model->attn_out_weight_offset = model->attn_sdpa_sink_offset + attn_sink_weight_size;
    const size_t attn_out_weight_size = math_round_up_po2(model->embedding_dim * model->num_heads * model->head_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_out_bias_offset = model->attn_out_weight_offset + attn_out_weight_size;
    const size_t attn_out_bias_size = math_round_up_po2(model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->mlp_rmsnorm_gain_offset = model->attn_out_bias_offset + attn_out_bias_size;
    model->mlp_gate_weight_offset = model->mlp_rmsnorm_gain_offset + rmsnorm_weight_size;
    const size_t mlp_gate_weight_size = math_round_up_po2(model->num_experts * model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->mlp_gate_bias_offset = model->mlp_gate_weight_offset + mlp_gate_weight_size;
    const size_t mlp_gate_bias_size = math_round_up_po2(model->num_experts * sizeof(gptoss_bfloat16), 16);
    // Per-block weights shared across experts (attention + MoE gate); the
    // per-block offsets above are relative to the start of each block's slice.
    const size_t per_block_shared_weights_size =
        rmsnorm_weight_size + attn_qkv_weight_size + attn_qkv_bias_size + attn_sink_weight_size + attn_out_weight_size + attn_out_bias_size +
        rmsnorm_weight_size + mlp_gate_weight_size + mlp_gate_bias_size;
    // Final rmsnorm and unembedding follow all per-block shared weights.
    model->rmsnorm_weight_offset = embedding_weight_size + model->num_blocks * per_block_shared_weights_size;
    model->unembedding_weight_offset = model->rmsnorm_weight_offset + rmsnorm_weight_size;
    const size_t unembedding_weight_size = math_round_up_po2(model->vocabulary_size * model->embedding_dim * sizeof(gptoss_bfloat16), 16);

    model->per_block_shared_weights_size = per_block_shared_weights_size;

    // All expert-shared weights are wrapped into a single no-copy Metal
    // buffer over the mapped file.
    const size_t shared_weights_size =
        round_up_to_page_size(embedding_weight_size + rmsnorm_weight_size + unembedding_weight_size + model->num_blocks * per_block_shared_weights_size);
    status = gptoss_metal_buffer_wrap(&model->device, shared_weights_size, current_ptr, &model->shared_weight_buffer);
    if (status != gptoss_status_success) {
        GPTOSS_LOG_ERROR("failed to map expert-shared weight of size %zu onto a Metal buffer", shared_weights_size);
        goto cleanup;
    }
    current_ptr += shared_weights_size;
    model->weights_size += shared_weights_size;

    // Per-expert MoE weights. The "/2" and "/32" divisors are consistent with
    // the mf4w kernel names: presumably 4-bit quantized weights (two per
    // byte) with one scale per 32 elements — TODO(review): confirm against
    // the kernel sources.
    const size_t mlp_swiglu_weight_block_size = math_round_up_po2(2 * model->mlp_dim * model->embedding_dim / 2, 16);
    model->mlp_swiglu_scale_offset = mlp_swiglu_weight_block_size;
    const size_t mlp_swiglu_weight_scale_size = math_round_up_po2(2 * model->mlp_dim * model->embedding_dim / 32, 16);
    model->mlp_swiglu_bias_offset = model->mlp_swiglu_scale_offset + mlp_swiglu_weight_scale_size;
    const size_t mlp_swiglu_bias_size = math_round_up_po2(2 * model->mlp_dim * sizeof(gptoss_bfloat16), 16);
    model->mlp_out_block_offset = model->mlp_swiglu_bias_offset + mlp_swiglu_bias_size;
    const size_t mlp_out_weight_block_size = math_round_up_po2(model->embedding_dim * model->mlp_dim / 2, 16);
    model->mlp_out_scale_offset = model->mlp_out_block_offset + mlp_out_weight_block_size;
    const size_t mlp_out_weight_scale_size = math_round_up_po2(model->embedding_dim * model->mlp_dim / 32, 16);
    model->mlp_out_bias_offset = model->mlp_out_scale_offset + mlp_out_weight_scale_size;
    const size_t mlp_out_bias_size = math_round_up_po2(model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->per_expert_block_weight_size =
        mlp_swiglu_weight_block_size + mlp_swiglu_weight_scale_size + mlp_swiglu_bias_size + mlp_out_weight_block_size + mlp_out_weight_scale_size + mlp_out_bias_size;
    // One no-copy Metal buffer per transformer block, each covering all
    // experts' weights for that block.
    const size_t moe_block_weight_size = round_up_to_page_size(model->num_experts * model->per_expert_block_weight_size);
    for (uint32_t n = 0; n < model->num_blocks; n++) {
        status = gptoss_metal_buffer_wrap(&model->device, moe_block_weight_size, current_ptr, &model->block_weight_buffers[n]);
        if (status != gptoss_status_success) {
            GPTOSS_LOG_ERROR("failed to map block #%" PRIu32 " MoE weight of size %zu onto a Metal buffer",
                n, moe_block_weight_size);
            goto cleanup;
        }
        current_ptr += moe_block_weight_size;
        model->weights_size += moe_block_weight_size;
    }

    // Activation buffers: device-allocated scratch sized for max_batch_tokens.
    status = gptoss_metal_buffer_create(&model->device, model->max_batch_tokens * model->embedding_dim * sizeof(float), NULL, &model->residual_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, model->max_batch_tokens * model->embedding_dim * sizeof(float), NULL, &model->rmsnorm_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, model->max_batch_tokens * model->head_dim * (model->num_heads + 2 * model->num_kv_heads) * sizeof(float), NULL, &model->qkv_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, model->max_batch_tokens * model->head_dim * model->num_heads * sizeof(float), NULL, &model->sdpa_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, model->max_batch_tokens * model->num_experts * sizeof(float), NULL, &model->gate_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, model->max_batch_tokens * model->num_experts * sizeof(struct gptoss_expert_prediction), NULL, &model->expert_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, model->max_batch_tokens * model->num_active_experts * model->mlp_dim * sizeof(float), NULL, &model->swiglu_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, model->max_batch_tokens * model->num_active_experts * model->embedding_dim * sizeof(float), NULL, &model->moe_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }

    model->allocation_size =
        model->residual_activation_buffer.size + model->rmsnorm_activation_buffer.size +
        model->qkv_activation_buffer.size + model->sdpa_activation_buffer.size +
        model->gate_activation_buffer.size + model->expert_activation_buffer.size + model->swiglu_activation_buffer.size + model->moe_activation_buffer.size;

    // Commit tokenizer: transfer ownership to the model so the cleanup path
    // below no longer releases it.
    model->tokenizer = tokenizer;
    tokenizer = NULL;

    // Commit model
    *model_out = model;
    model = NULL;

cleanup:
    // The fd is only needed during loading; the mmaps stay valid after close.
    if (fd != -1) {
        close(fd);
        fd = -1;
    }
    // On the success path both pointers were nulled above, so these are no-ops.
    gptoss_model_release(model);  // does nothing if model is NULL
    gptoss_tokenizer_release(tokenizer);  // does nothing if tokenizer is NULL
    return status;
}
// Returns the model's tokenizer in *tokenizer_out, handing the caller a new
// strong reference (the caller must balance it with gptoss_tokenizer_release).
enum gptoss_status GPTOSS_ABI gptoss_model_get_tokenizer(
    gptoss_model_t model,
    gptoss_tokenizer_t* tokenizer_out)
{
    gptoss_tokenizer_t shared_tokenizer = model->tokenizer;
    // Relaxed is sufficient for an increment: only atomicity is required.
    atomic_fetch_add_explicit(&shared_tokenizer->ref_count, 1, memory_order_relaxed);
    *tokenizer_out = shared_tokenizer;
    return gptoss_status_success;
}
// Reports the model's maximum context length (fixed at load time from the
// model file header) in *max_context_length_out.
enum gptoss_status GPTOSS_ABI gptoss_model_get_max_context_length(
    gptoss_model_t model,
    size_t* max_context_length_out)
{
    const size_t max_context_length = model->context_length;
    *max_context_length_out = max_context_length;
    return gptoss_status_success;
}
// Takes an additional strong reference on the model.
// Pairs with gptoss_model_release; relaxed ordering suffices for the
// increment because acquiring a reference needs no synchronization.
enum gptoss_status GPTOSS_ABI gptoss_model_retain(
    gptoss_model_t model)
{
    (void) atomic_fetch_add_explicit(&model->ref_count, 1, memory_order_relaxed);
    return gptoss_status_success;
}
// Drops one strong reference on the model; when the last reference is
// released, tears down all owned resources (tokenizer reference, Metal
// buffers/kernels/device objects, the weight file mapping) and frees the
// model struct. Safe to call with NULL.
enum gptoss_status GPTOSS_ABI gptoss_model_release(
    gptoss_model_t model)
{
    if (model != NULL) {
        // acq_rel: the release half publishes this thread's writes to the
        // model; the acquire half (on the thread that hits zero) makes all
        // other threads' writes visible before destruction.
        if (atomic_fetch_sub_explicit(&model->ref_count, 1, memory_order_acq_rel) == 1) {
            gptoss_tokenizer_release(model->tokenizer);

            // Activation buffers
            gptoss_metal_buffer_release(&model->residual_activation_buffer);
            gptoss_metal_buffer_release(&model->rmsnorm_activation_buffer);
            gptoss_metal_buffer_release(&model->qkv_activation_buffer);
            gptoss_metal_buffer_release(&model->sdpa_activation_buffer);
            gptoss_metal_buffer_release(&model->gate_activation_buffer);
            gptoss_metal_buffer_release(&model->expert_activation_buffer);
            gptoss_metal_buffer_release(&model->swiglu_activation_buffer);
            gptoss_metal_buffer_release(&model->moe_activation_buffer);

            // Weight buffers (wrap the file mapping; released before munmap below)
            gptoss_metal_buffer_release(&model->shared_weight_buffer);
            for (uint32_t n = 0; n < model->num_blocks; n++) {
                gptoss_metal_buffer_release(&model->block_weight_buffers[n]);
            }

            // Metal kernels
            gptoss_metal_function_release(&model->bf16_f32_embeddings_fn);
            gptoss_metal_function_release(&model->f32_bf16w_rmsnorm_fn);
            gptoss_metal_function_release(&model->f32_bf16w_matmul_fn);
            gptoss_metal_function_release(&model->f32_bf16w_unembedding_fn);
            gptoss_metal_function_release(&model->f32_rope_fn);
            gptoss_metal_function_release(&model->f32_mf4w_moe_matmul_swiglu_fn);
            gptoss_metal_function_release(&model->f32_mf4w_moe_matmul_fn);
            gptoss_metal_function_release(&model->f32_accumulate_e4_fn);
            gptoss_metal_function_release(&model->f32_topk_softmax_e32_k4_fn);
            gptoss_metal_function_release(&model->f32_topk_softmax_e128_k4_fn);
            gptoss_metal_function_release(&model->f32_softmax_fn);
            gptoss_metal_function_release(&model->f32_sdpa_q8_d64_fn);

            // Library, queue, and device go last: the objects above may hold
            // references into them.
            gptoss_metal_library_release(&model->library);
            gptoss_metal_command_queue_release(&model->command_queue);
            gptoss_metal_device_release(&model->device);

            // Weight file mapping (created by gptoss_model_create_from_file)
            if (model->mapping_ptr != NULL && model->mapping_size != 0) {
                if (munmap(model->mapping_ptr, model->mapping_size) != 0) {
                    GPTOSS_LOG_WARNING("munmap for model weight mapping failed with error %d", errno);
                }
            }

            // Recompute the dynamic struct size (trailing block buffer array)
            // and scrub the memory as a use-after-free tripwire before freeing.
            const size_t model_size = sizeof(struct gptoss_model) + model->num_blocks * sizeof(struct gptoss_metal_buffer);
            memset(model, 0, model_size);
            free(model);
        }
    }
    return gptoss_status_success;
}