Files
Dominik Kundel 243a1b0276 Initial commit
Co-authored-by: Zhuohan Li <zhuohan@openai.com>
Co-authored-by: Maratyszcza <marat@openai.com>
Co-authored-by: Volodymyr Kyrylov <vol@wilab.org.ua>
2025-08-05 08:19:49 -07:00

454 lines
16 KiB
Objective-C

#import <Foundation/Foundation.h>
#import <Metal/Metal.h>
#include <dispatch/dispatch.h>
#include <mach-o/getsect.h>
#include <gpt-oss/types.h>
#include <internal/log.h>
#include <internal/metal.h>
static size_t gptoss_metal_device_get_core_count(id<MTLDevice> device) {
if (!device) {
return 0;
}
const uint64_t target_registry_id = [device registryID];
io_iterator_t it = IO_OBJECT_NULL;
const kern_return_t kr = IOServiceGetMatchingServices(
kIOMainPortDefault,
IOServiceMatching("IOAccelerator"),
&it
);
if (kr != KERN_SUCCESS) {
GPTOSS_LOG_ERROR("failed to find IOAccelerator objects: error %d", kr);
return 0;
}
size_t result = 0;
for (io_object_t obj = IOIteratorNext(it); obj != IO_OBJECT_NULL; obj = IOIteratorNext(it)) {
uint64_t registry_id = 0;
if (IORegistryEntryGetRegistryEntryID(obj, &registry_id) == KERN_SUCCESS &&
registry_id == target_registry_id)
{
// Read "gpu-core-count" from this accelerator node
const CFTypeRef value = IORegistryEntryCreateCFProperty(
obj, CFSTR("gpu-core-count"), kCFAllocatorDefault, 0);
if (value != NULL) {
if (CFGetTypeID(value) == CFNumberGetTypeID()) {
int32_t n = -1;
if (CFNumberGetValue((CFNumberRef) value, kCFNumberSInt32Type, &n) && n > 0) {
result = (size_t) n;
}
}
CFRelease(value);
}
IOObjectRelease(obj);
break;
}
IOObjectRelease(obj);
}
IOObjectRelease(it);
return result;
}
enum gptoss_status gptoss_metal_device_create_system_default(
struct gptoss_metal_device* device_out)
{
id<MTLDevice> device_obj = MTLCreateSystemDefaultDevice();
if (device_obj == nil) {
GPTOSS_LOG_ERROR("failed to create Metal device");
return gptoss_status_unsupported_system;
}
device_out->object = (void*) device_obj;
device_out->num_cores = gptoss_metal_device_get_core_count(device_obj);
device_out->max_buffer_size = (size_t) [device_obj maxBufferLength];
device_out->max_threadgroup_memory = (size_t) [device_obj maxThreadgroupMemoryLength];
const MTLSize max_threadgroup_threads = [device_obj maxThreadsPerThreadgroup];
device_out->max_threadgroup_threads_x = (size_t) max_threadgroup_threads.width;
device_out->max_threadgroup_threads_y = (size_t) max_threadgroup_threads.height;
device_out->max_threadgroup_threads_z = (size_t) max_threadgroup_threads.depth;
return gptoss_status_success;
}
enum gptoss_status gptoss_metal_device_release(
struct gptoss_metal_device* device)
{
if (device->object != NULL) {
id<MTLDevice> device_obj = (id<MTLDevice>) device->object;
[device_obj release];
}
memset(device, 0, sizeof(struct gptoss_metal_device));
return gptoss_status_success;
}
extern const struct mach_header_64 __dso_handle;
enum gptoss_status gptoss_metal_library_create_default(
const struct gptoss_metal_device* device,
struct gptoss_metal_library* library_out)
{
enum gptoss_status status = gptoss_status_success;
id<MTLDevice> device_obj = (id<MTLDevice>) device->object;
id<MTLLibrary> library_obj = nil;
NSError* error_obj = nil;
NSString* error_string_obj = nil;
dispatch_data_t library_blob = NULL;
unsigned long library_size = 0;
uint8_t* library_data = getsectiondata(&__dso_handle, "__METAL", "__shaders", &library_size);
if (library_data != NULL) {
library_blob = dispatch_data_create(library_data, library_size, NULL, DISPATCH_DATA_DESTRUCTOR_DEFAULT);
library_obj = [device_obj newLibraryWithData:library_blob error:&error_obj];
if (library_obj == nil) {
error_string_obj = [error_obj localizedDescription];
GPTOSS_LOG_ERROR("failed to create Metal library: %s", [error_string_obj UTF8String]);
status = gptoss_status_unsupported_system;
goto cleanup;
}
} else {
// Fall-back to loading from the bundle
library_obj = [device_obj newDefaultLibrary];
if (library_obj == nil) {
GPTOSS_LOG_ERROR("failed to create Metal default library");
status = gptoss_status_unsupported_system;
goto cleanup;
}
}
*library_out = (struct gptoss_metal_library) {
.object = (void*) library_obj,
};
cleanup:
if (library_blob != NULL) {
dispatch_release(library_blob);
}
if (error_string_obj != nil) {
[error_string_obj release];
}
if (error_obj != nil) {
[error_obj release];
}
return status;
}
enum gptoss_status gptoss_metal_library_release(
struct gptoss_metal_library* library)
{
if (library->object != NULL) {
id<MTLLibrary> library_obj = (id<MTLLibrary>) library->object;
[library_obj release];
}
memset(library, 0, sizeof(struct gptoss_metal_library));
return gptoss_status_success;
}
enum gptoss_status gptoss_metal_function_create(
const struct gptoss_metal_library* library,
const char* name,
struct gptoss_metal_function* function_out)
{
NSString* name_obj = nil;
NSError* error_obj = nil;
NSString* error_string_obj = nil;
id<MTLFunction> function_obj = nil;
enum gptoss_status status = gptoss_status_success;
id<MTLLibrary> library_obj = (id<MTLLibrary>) library->object;
name_obj = [NSString stringWithUTF8String:name];
function_obj = [library_obj newFunctionWithName:name_obj];
if (function_obj == nil) {
GPTOSS_LOG_ERROR("failed to create Metal function %s", name);
status = gptoss_status_unsupported_system;
goto cleanup;
}
id<MTLDevice> device_obj = [library_obj device];
id<MTLComputePipelineState> pipeline_state_obj = [device_obj newComputePipelineStateWithFunction:function_obj error:&error_obj];
if (pipeline_state_obj == nil) {
error_string_obj = [error_obj localizedDescription];
GPTOSS_LOG_ERROR("failed to create Metal compute pipeline state for function %s: %s",
name, [error_string_obj UTF8String]);
status = gptoss_status_unsupported_system;
goto cleanup;
}
// Commit
function_out->function_object = function_obj;
function_out->pipeline_state_object = pipeline_state_obj;
function_out->max_threadgroup_threads = (size_t) [pipeline_state_obj maxTotalThreadsPerThreadgroup];
function_out->simdgroup_threads = (size_t) [pipeline_state_obj threadExecutionWidth];
function_out->static_threadgroup_memory = (size_t) [pipeline_state_obj staticThreadgroupMemoryLength];
function_obj = nil;
pipeline_state_obj = nil;
cleanup:
if (name_obj != nil) {
[name_obj release];
}
if (function_obj != nil) {
[function_obj release];
}
if (error_string_obj != nil) {
[error_string_obj release];
}
if (error_obj != nil) {
[error_obj release];
}
return status;
}
enum gptoss_status gptoss_metal_function_release(
struct gptoss_metal_function* function)
{
if (function->pipeline_state_object != NULL) {
id<MTLComputePipelineState> pipeline_state_obj = (id<MTLComputePipelineState>) function->pipeline_state_object;
[pipeline_state_obj release];
}
if (function->function_object != NULL) {
id<MTLFunction> function_obj = (id<MTLFunction>) function->function_object;
[function_obj release];
}
memset(function, 0, sizeof(struct gptoss_metal_function));
return gptoss_status_success;
}
enum gptoss_status gptoss_metal_buffer_create(
const struct gptoss_metal_device* device,
size_t size,
const void* data,
struct gptoss_metal_buffer* buffer_out)
{
id<MTLDevice> device_obj = (id<MTLDevice>) device->object;
id<MTLBuffer> buffer_obj = nil;
if (data != NULL) {
buffer_obj = [device_obj newBufferWithBytes:data length:size options:MTLResourceStorageModeShared];
} else {
buffer_obj = [device_obj newBufferWithLength:size options:MTLResourceStorageModeShared];
}
if (buffer_obj == nil) {
GPTOSS_LOG_ERROR("failed to create Metal buffer of size %zu", size);
return gptoss_status_unsupported_system;
}
buffer_out->object = (void*) buffer_obj;
buffer_out->size = size;
buffer_out->ptr = [buffer_obj contents];
return gptoss_status_success;
}
enum gptoss_status gptoss_metal_buffer_wrap(
const struct gptoss_metal_device* device,
size_t size,
const void* data,
struct gptoss_metal_buffer* buffer_out)
{
id<MTLDevice> device_obj = (id<MTLDevice>) device->object;
id<MTLBuffer> buffer_obj = [device_obj newBufferWithBytesNoCopy:(void*) data length:size options:MTLResourceStorageModeShared deallocator:nil];
if (buffer_obj == nil) {
GPTOSS_LOG_ERROR("failed to wrap Metal buffer of size %zu", size);
return gptoss_status_unsupported_system;
}
buffer_out->object = (void*) buffer_obj;
buffer_out->size = size;
buffer_out->ptr = (void*) data;
return gptoss_status_success;
}
enum gptoss_status gptoss_metal_buffer_release(
struct gptoss_metal_buffer* buffer)
{
if (buffer->object != NULL) {
id<MTLBuffer> buffer_obj = (id<MTLBuffer>) buffer->object;
[buffer_obj release];
}
memset(buffer, 0, sizeof(struct gptoss_metal_buffer));
return gptoss_status_success;
}
enum gptoss_status gptoss_metal_command_queue_create(
const struct gptoss_metal_device* device,
struct gptoss_metal_command_queue* command_queue_out)
{
id<MTLDevice> device_obj = (id<MTLDevice>) device->object;
id<MTLCommandQueue> command_queue_obj = [device_obj newCommandQueue];
if (command_queue_obj == nil) {
GPTOSS_LOG_ERROR("failed to create Metal command queue");
return gptoss_status_unsupported_system;
}
command_queue_out->object = (void*) command_queue_obj;
return gptoss_status_success;
}
enum gptoss_status gptoss_metal_command_queue_release(
struct gptoss_metal_command_queue* command_queue)
{
if (command_queue->object != NULL) {
id<MTLCommandQueue> command_queue_obj = (id<MTLCommandQueue>) command_queue->object;
[command_queue_obj release];
}
memset(command_queue, 0, sizeof(struct gptoss_metal_command_queue));
return gptoss_status_success;
}
enum gptoss_status gptoss_metal_command_buffer_create(
const struct gptoss_metal_command_queue* command_queue,
struct gptoss_metal_command_buffer* command_buffer_out)
{
id<MTLCommandQueue> command_queue_obj = (id<MTLCommandQueue>) command_queue->object;
id<MTLCommandBuffer> command_buffer_obj = [command_queue_obj commandBuffer];
if (command_buffer_obj == nil) {
GPTOSS_LOG_ERROR("failed to create Metal command buffer");
return gptoss_status_unsupported_system;
}
[command_buffer_obj retain];
command_buffer_out->object = (void*) command_buffer_obj;
return gptoss_status_success;
}
enum gptoss_status gptoss_metal_command_buffer_encode_fill_buffer(
const struct gptoss_metal_command_buffer* command_buffer,
const struct gptoss_metal_buffer* buffer,
size_t offset,
size_t size,
uint8_t fill_value)
{
if (command_buffer->object == NULL) {
return gptoss_status_invalid_state;
}
if (buffer->object == NULL) {
return gptoss_status_invalid_argument;
}
id<MTLCommandBuffer> command_buffer_obj = (id<MTLCommandBuffer>) command_buffer->object;
id<MTLBuffer> buffer_obj = (id<MTLBuffer>) buffer->object;
id<MTLBlitCommandEncoder> command_encoder_obj = [command_buffer_obj blitCommandEncoder];
const NSRange range = NSMakeRange((NSUInteger) offset, (NSUInteger) size);
[command_encoder_obj fillBuffer:buffer_obj range:range value:fill_value];
[command_encoder_obj endEncoding];
return gptoss_status_success;
}
enum gptoss_status gptoss_metal_command_buffer_encode_copy_buffer(
const struct gptoss_metal_command_buffer* command_buffer,
const struct gptoss_metal_buffer* input_buffer,
size_t input_offset,
const struct gptoss_metal_buffer* output_buffer,
size_t output_offset,
size_t size)
{
if (command_buffer->object == NULL) {
return gptoss_status_invalid_state;
}
if (input_buffer->object == NULL) {
return gptoss_status_invalid_argument;
}
if (output_buffer->object == NULL) {
return gptoss_status_invalid_argument;
}
id<MTLCommandBuffer> command_buffer_obj = (id<MTLCommandBuffer>) command_buffer->object;
id<MTLBuffer> input_buffer_obj = (id<MTLBuffer>) input_buffer->object;
id<MTLBuffer> output_buffer_obj = (id<MTLBuffer>) output_buffer->object;
id<MTLBlitCommandEncoder> command_encoder_obj = [command_buffer_obj blitCommandEncoder];
[command_encoder_obj copyFromBuffer:input_buffer_obj sourceOffset:(NSUInteger) input_offset
toBuffer:output_buffer_obj destinationOffset:(NSUInteger) output_offset
size:(NSUInteger) size];
[command_encoder_obj endEncoding];
return gptoss_status_success;
}
enum gptoss_status gptoss_metal_command_buffer_encode_launch_kernel(
const struct gptoss_metal_command_buffer* command_buffer,
const struct gptoss_metal_function* function,
size_t threadgroup_size_x,
size_t threadgroup_size_y,
size_t threadgroup_size_z,
size_t num_threadgroups_x,
size_t num_threadgroups_y,
size_t num_threadgroups_z,
size_t params_size,
const void* params,
size_t num_buffers,
const struct gptoss_metal_buffer** buffers,
const size_t* buffer_offsets)
{
if (command_buffer->object == NULL || function->pipeline_state_object == NULL) {
return gptoss_status_invalid_state;
}
id<MTLCommandBuffer> command_buffer_obj = (id<MTLCommandBuffer>) command_buffer->object;
id<MTLComputePipelineState> pipeline_state_obj = (id<MTLComputePipelineState>) function->pipeline_state_object;
id<MTLComputeCommandEncoder> command_encoder_obj = [command_buffer_obj computeCommandEncoder];
// Set kernel arguments
[command_encoder_obj setComputePipelineState:pipeline_state_obj];
[command_encoder_obj setBytes:params length:params_size atIndex:0];
for (size_t i = 0; i < num_buffers; ++i) {
id<MTLBuffer> buffer_obj = (id<MTLBuffer>) buffers[i]->object;
const NSUInteger offset = buffer_offsets == NULL ? 0 : (NSUInteger) buffer_offsets[i];
[command_encoder_obj setBuffer:buffer_obj offset:offset atIndex:i + 1];
}
// Dispatch kernel
const MTLSize threadgroup_size = MTLSizeMake(threadgroup_size_x, threadgroup_size_y, threadgroup_size_z);
const MTLSize num_threadgroups = MTLSizeMake(num_threadgroups_x, num_threadgroups_y, num_threadgroups_z);
[command_encoder_obj dispatchThreadgroups:num_threadgroups threadsPerThreadgroup:threadgroup_size];
[command_encoder_obj endEncoding];
return gptoss_status_success;
}
enum gptoss_status gptoss_metal_command_buffer_commit(
const struct gptoss_metal_command_buffer* command_buffer)
{
if (command_buffer->object == NULL) {
return gptoss_status_invalid_state;
}
id<MTLCommandBuffer> command_buffer_obj = (id<MTLCommandBuffer>) command_buffer->object;
[command_buffer_obj commit];
return gptoss_status_success;
}
enum gptoss_status gptoss_metal_command_buffer_wait_completion(
const struct gptoss_metal_command_buffer* command_buffer,
double* elapsed_seconds)
{
if (command_buffer->object == NULL) {
return gptoss_status_invalid_state;
}
id<MTLCommandBuffer> command_buffer_obj = (id<MTLCommandBuffer>) command_buffer->object;
[command_buffer_obj waitUntilCompleted];
if (elapsed_seconds != NULL) {
const CFTimeInterval start_time = [command_buffer_obj GPUStartTime];
const CFTimeInterval end_time = [command_buffer_obj GPUEndTime];
*elapsed_seconds = (double) end_time - (double) start_time;
}
return gptoss_status_success;
}
enum gptoss_status gptoss_metal_command_buffer_release(
struct gptoss_metal_command_buffer* command_buffer)
{
if (command_buffer->object != NULL) {
id<MTLCommandBuffer> command_buffer_obj = (id<MTLCommandBuffer>) command_buffer->object;
[command_buffer_obj release];
}
memset(command_buffer, 0, sizeof(struct gptoss_metal_command_buffer));
return gptoss_status_success;
}