Initial commit
Co-authored-by: Zhuohan Li <zhuohan@openai.com> Co-authored-by: Maratyszcza <marat@openai.com> Co-authored-by: Volodymyr Kyrylov <vol@wilab.org.ua>
This commit is contained in:
95
gpt_oss/metal/benchmark/f32-bf16w-rmsnorm.cc
Normal file
95
gpt_oss/metal/benchmark/f32-bf16w-rmsnorm.cc
Normal file
@@ -0,0 +1,95 @@
|
||||
#include <gpt-oss.h>
|
||||
#include <internal/datatype.h>
|
||||
#include <internal/metal.hpp>
|
||||
#include <internal/metal-kernels.h>
|
||||
|
||||
#include <cstring>
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
|
||||
using gptoss::Check;
|
||||
using namespace gptoss::metal;
|
||||
|
||||
// Epsilon value passed to the rmsnorm kernel launch in the benchmark below.
constexpr float kEpsilon = 1.0e-5f;
|
||||
// Fixed seed handed to the random-fill kernels that initialize the benchmark buffers.
constexpr uint64_t kSeed = UINT64_C(1019827666124465388);
|
||||
|
||||
// Benchmark for the "gptoss_f32_bf16w_rmsnorm" Metal kernel.
//
// state.range(0) supplies the number of channels; the token count is fixed at 1.
// Before timing starts, the f32 input buffer and the bf16 weight buffer are
// filled on the GPU by the random-fill kernels (min = -1, max = 1). Each
// benchmark iteration then encodes a single rmsnorm launch into a fresh command
// buffer and reports the value returned by wait_completion() as the manual
// iteration time.
//
// NOTE: the identifier is spelled "rnsnorm" (sic). It is intentionally left
// unchanged so that the benchmark registration referencing it keeps working.
static void f32_bf16w_rnsnorm(benchmark::State& state) {
    const size_t num_tokens = 1;
    const size_t num_channels = state.range(0);
    // Total number of activations in the input/output buffers.
    const size_t num_elements = num_tokens * num_channels;

    Device device;
    CommandQueue command_queue{device};
    Library library{device};
    Function f32_fill_random_fn{library, "gptoss_f32_fill_random"};
    Function bf16_fill_random_fn{library, "gptoss_bf16_fill_random"};
    Function f32_bf16w_rmsnorm_fn{library, "gptoss_f32_bf16w_rmsnorm"};
    Buffer input_buffer{device, num_elements * sizeof(float)};
    Buffer weight_buffer{device, num_channels * sizeof(gptoss_bfloat16)};
    Buffer output_buffer{device, num_elements * sizeof(float)};

    // One-time setup: populate input and weights, and wait for the GPU to finish
    // so the fill work is not attributed to the first timed iteration.
    {
        CommandBuffer command_buffer{command_queue};

        // Running offset into the random stream so that the two buffers are
        // filled from different positions of the same seeded sequence.
        size_t offset = 0;

        // Fill the whole input buffer (all tokens), not just the first row.
        Check(gptoss_metal_command_buffer_encode_launch_f32_fill_random(
                command_buffer.handle(),
                f32_fill_random_fn.handle(),
                /*threadgroup_size=*/0,
                /*max_threadgroups=*/10,
                /*output_buffer=*/input_buffer.handle(),
                /*output_offset=*/0,
                num_elements, kSeed, offset, /*min=*/-1.0f, /*max=*/1.0f),
            "gptoss_metal_command_buffer_encode_launch_f32_fill_random");
        offset += num_elements;

        Check(gptoss_metal_command_buffer_encode_launch_bf16_fill_random(
                command_buffer.handle(),
                bf16_fill_random_fn.handle(),
                /*threadgroup_size=*/0,
                /*max_threadgroups=*/10,
                /*output_buffer=*/weight_buffer.handle(),
                /*output_offset=*/0,
                num_channels, kSeed, offset, /*min=*/-1.0f, /*max=*/1.0f),
            "gptoss_metal_command_buffer_encode_launch_bf16_fill_random");

        command_buffer.commit();
        command_buffer.wait_completion();
    }

    for (auto _ : state) {
        CommandBuffer command_buffer{command_queue};

        Check(gptoss_metal_command_buffer_encode_launch_f32_bf16w_rmsnorm(
                command_buffer.handle(),
                f32_bf16w_rmsnorm_fn.handle(),
                input_buffer.handle(),
                /*input_offset=*/0,
                weight_buffer.handle(),
                /*weight_offset=*/0,
                output_buffer.handle(),
                /*output_offset=*/0,
                num_tokens,
                num_channels,
                kEpsilon),
            "gptoss_metal_command_buffer_encode_launch_f32_bf16w_rmsnorm");

        command_buffer.commit();
        // The benchmark is registered with manual timing, so the duration
        // reported by wait_completion() is what gets recorded per iteration.
        const double elapsed_seconds = command_buffer.wait_completion();
        state.SetIterationTime(elapsed_seconds);
    }

    // Throughput counters; kIsRate makes the framework report them per second.
    state.counters["elements"] =
        benchmark::Counter(state.iterations() * num_elements,
                           benchmark::Counter::kIsRate);

    // Bytes accounted per iteration: input + weights + output buffer sizes.
    const int64_t bytes_per_iteration = input_buffer.size() + weight_buffer.size() + output_buffer.size();
    state.counters["bytes"] =
        benchmark::Counter(state.iterations() * bytes_per_iteration,
                           benchmark::Counter::kIsRate);
}
|
||||
|
||||
// Register the rmsnorm benchmark with a single argument of 2880 (read by the
// benchmark via state.range(0)), using manually reported iteration times and
// microseconds as the display unit.
BENCHMARK(f32_bf16w_rnsnorm)->Arg(2880)->UseManualTime()->Unit(benchmark::kMicrosecond);
|
||||
|
||||
// Google Benchmark macro that provides the program's main() and runs the registered benchmarks.
BENCHMARK_MAIN();
|
||||
55
gpt_oss/metal/benchmark/f32-random.cc
Normal file
55
gpt_oss/metal/benchmark/f32-random.cc
Normal file
@@ -0,0 +1,55 @@
|
||||
#include <gpt-oss.h>
|
||||
#include <internal/metal.hpp>
|
||||
#include <internal/metal-kernels.h>
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
|
||||
using gptoss::Check;
|
||||
using namespace gptoss::metal;
|
||||
|
||||
// Benchmark for the "gptoss_f32_fill_random" Metal kernel.
//
// state.range(0) is the number of float elements to generate. Every iteration
// encodes one fill launch into a new command buffer, commits it, and records
// the duration returned by wait_completion() as the manual iteration time.
static void f32_fill_random(benchmark::State& state) {
    // Fixed generator parameters used for every launch.
    constexpr uint64_t rng_seed = UINT64_C(1019827666124465388);
    constexpr uint64_t rng_offset = UINT64_C(12345678901234567890);
    const float lower_bound = -1.0f;
    const float upper_bound = 7.0f;

    const size_t element_count = state.range(0);

    Device device;
    CommandQueue command_queue{device};
    Library library{device};
    Function f32_fill_random_fn{library, "gptoss_f32_fill_random"};
    Buffer buffer{device, element_count * sizeof(float)};

    for (auto _ : state) {
        CommandBuffer command_buffer{command_queue};

        Check(gptoss_metal_command_buffer_encode_launch_f32_fill_random(
                command_buffer.handle(),
                f32_fill_random_fn.handle(),
                /*threadgroup_size=*/0,
                /*max_threadgroups=*/120,
                /*output_buffer=*/buffer.handle(),
                /*output_offset=*/0,
                element_count, rng_seed, rng_offset, lower_bound, upper_bound),
            "gptoss_metal_command_buffer_encode_launch_f32_fill_random");

        command_buffer.commit();
        state.SetIterationTime(command_buffer.wait_completion());
    }

    // Report throughput as rates (per second) for both elements and bytes.
    const auto total_iterations = state.iterations();
    const int64_t elements_each = element_count;
    const int64_t bytes_each = element_count * sizeof(float);

    state.counters["elements"] = benchmark::Counter(
        total_iterations * elements_each, benchmark::Counter::kIsRate);
    state.counters["bytes"] = benchmark::Counter(
        total_iterations * bytes_each, benchmark::Counter::kIsRate);
}
|
||||
|
||||
// 2^30 (1,073,741,824); used to express the benchmark's element count.
constexpr int64_t giga = INT64_C(1073741824);
|
||||
// Register the f32 fill benchmark with 2 * giga elements, manually reported
// iteration times, and microseconds as the display unit.
BENCHMARK(f32_fill_random)->Arg(2 * giga)->UseManualTime()->Unit(benchmark::kMicrosecond);
|
||||
|
||||
// Google Benchmark macro that provides the program's main() and runs the registered benchmarks.
BENCHMARK_MAIN();
|
||||
65
gpt_oss/metal/benchmark/mf4-f32-convert.cc
Normal file
65
gpt_oss/metal/benchmark/mf4-f32-convert.cc
Normal file
@@ -0,0 +1,65 @@
|
||||
#include <gpt-oss.h>
|
||||
#include <internal/datatype.h>
|
||||
#include <internal/metal.hpp>
|
||||
#include <internal/metal-kernels.h>
|
||||
|
||||
#include <cstring>
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
|
||||
using gptoss::Check;
|
||||
using namespace gptoss::metal;
|
||||
|
||||
// Benchmark for the "gptoss_mf4_f32_convert" Metal kernel.
//
// state.range(0) is the number of blocks to convert. Each block holds 32
// elements, two elements are packed per byte in the block buffer, and there is
// one scale entry per block. The block and scale buffers are initialized on the
// CPU with constant byte patterns; every iteration then encodes one conversion
// launch and records the duration returned by wait_completion() as the manual
// iteration time.
static void mf4_f32_convert(benchmark::State& state) {
    constexpr size_t kElementsPerBlock = 32;
    constexpr size_t kElementsPerByte = 2;

    const size_t num_blocks = state.range(0);
    const size_t num_elements = num_blocks * kElementsPerBlock;
    const size_t num_bytes = num_elements / kElementsPerByte;
    // Single source of truth for the scale buffer size, so that allocation,
    // initialization and byte accounting always agree on the element type.
    const size_t num_scale_bytes = num_blocks * sizeof(gptoss_float8ue8m0);

    Device device;
    CommandQueue command_queue{device};
    Library library{device};
    Function mf4_f32_convert_fn{library, "gptoss_mf4_f32_convert"};
    Buffer block_buffer{device, num_bytes};
    Buffer scale_buffer{device, num_scale_bytes};
    Buffer output_buffer{device, num_elements * sizeof(float)};

    std::memset(block_buffer.ptr(), 0x91, num_bytes);  // force subnormals
    std::memset(scale_buffer.ptr(), 128, num_scale_bytes);  // scale = 2.0

    for (auto _ : state) {
        CommandBuffer command_buffer{command_queue};

        Check(gptoss_metal_command_buffer_encode_launch_mf4_f32_convert(
                command_buffer.handle(),
                mf4_f32_convert_fn.handle(),
                /*threadgroup_size=*/0,
                /*max_threadgroups=*/120,
                block_buffer.handle(),
                scale_buffer.handle(),
                output_buffer.handle(),
                num_elements),
            "gptoss_metal_command_buffer_encode_launch_mf4_f32_convert");

        command_buffer.commit();
        const double elapsed_seconds = command_buffer.wait_completion();
        state.SetIterationTime(elapsed_seconds);
    }

    // Throughput counters; kIsRate makes the framework report them per second.
    state.counters["blocks"] =
        benchmark::Counter(state.iterations() * num_blocks,
                           benchmark::Counter::kIsRate);

    state.counters["elements"] =
        benchmark::Counter(state.iterations() * num_elements,
                           benchmark::Counter::kIsRate);

    // Bytes accounted per iteration: packed blocks + scales + f32 output.
    const int64_t bytes_per_iteration = num_bytes + num_scale_bytes + num_elements * sizeof(float);
    state.counters["bytes"] =
        benchmark::Counter(state.iterations() * bytes_per_iteration,
                           benchmark::Counter::kIsRate);
}
|
||||
|
||||
// 2^20 (1,048,576); used to express the benchmark's block count.
constexpr int64_t mega = INT64_C(1048576);
|
||||
// Register the conversion benchmark with 256 * mega blocks, manually reported
// iteration times, and microseconds as the display unit.
BENCHMARK(mf4_f32_convert)->Arg(256 * mega)->UseManualTime()->Unit(benchmark::kMicrosecond);
|
||||
|
||||
// Google Benchmark macro that provides the program's main() and runs the registered benchmarks.
BENCHMARK_MAIN();
|
||||
53
gpt_oss/metal/benchmark/u32-random.cc
Normal file
53
gpt_oss/metal/benchmark/u32-random.cc
Normal file
@@ -0,0 +1,53 @@
|
||||
#include <gpt-oss.h>
|
||||
#include <internal/metal.hpp>
|
||||
#include <internal/metal-kernels.h>
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
|
||||
using gptoss::Check;
|
||||
using namespace gptoss::metal;
|
||||
|
||||
// Benchmark for the "gptoss_u32_fill_random" Metal kernel.
//
// state.range(0) is the number of 32-bit unsigned elements to generate. Every
// iteration encodes one fill launch into a new command buffer, commits it, and
// records the duration returned by wait_completion() as the manual iteration
// time.
static void u32_fill_random(benchmark::State& state) {
    const size_t numel = state.range(0);

    Device device;
    CommandQueue command_queue{device};
    Library library{device};
    Function u32_fill_random_fn{library, "gptoss_u32_fill_random"};
    // Size the buffer by the element type this kernel writes (uint32_t), not
    // float; the byte count is the same but the intent is now explicit.
    Buffer buffer{device, numel * sizeof(uint32_t)};

    constexpr uint64_t seed = UINT64_C(1019827666124465388);
    constexpr uint64_t offset = UINT64_C(12345678901234567890);
    for (auto _ : state) {
        CommandBuffer command_buffer{command_queue};

        Check(gptoss_metal_command_buffer_encode_launch_u32_fill_random(
                command_buffer.handle(),
                u32_fill_random_fn.handle(),
                /*threadgroup_size=*/0,
                /*max_threadgroups=*/120,
                /*output_buffer=*/buffer.handle(),
                /*output_offset=*/0,
                numel, seed, offset),
            "gptoss_metal_command_buffer_encode_launch_u32_fill_random");

        command_buffer.commit();
        const double elapsed_seconds = command_buffer.wait_completion();
        state.SetIterationTime(elapsed_seconds);
    }

    // Throughput counters; kIsRate makes the framework report them per second.
    const int64_t elements_per_iteration = numel;
    state.counters["elements"] =
        benchmark::Counter(state.iterations() * elements_per_iteration,
                           benchmark::Counter::kIsRate);

    const int64_t bytes_per_iteration = numel * sizeof(uint32_t);
    state.counters["bytes"] =
        benchmark::Counter(state.iterations() * bytes_per_iteration,
                           benchmark::Counter::kIsRate);
}
|
||||
|
||||
// 2^30 (1,073,741,824); used to express the benchmark's element count.
constexpr int64_t giga = INT64_C(1073741824);
|
||||
// Register the u32 fill benchmark with 2 * giga elements, manually reported
// iteration times, and microseconds as the display unit.
BENCHMARK(u32_fill_random)->Arg(2 * giga)->UseManualTime()->Unit(benchmark::kMicrosecond);
|
||||
|
||||
// Google Benchmark macro that provides the program's main() and runs the registered benchmarks.
BENCHMARK_MAIN();
|
||||
Reference in New Issue
Block a user