* bf16

* tests

* gpt-oss

* enable gptoss for engine

* rough estimate

* convert to mxfp4

* handle safetensors U8

* clamp glu/linear

* update tokenizer

* MXFP4 support

This implements the Open Compute Microscaling (MX) FP4 format
as a tensor type with backend implementations focusing
on mulmat and mulmatid on CPU, CUDA, and Metal.

* Unit tests for MXFP4 support

This exercises various operations and shapes on both CPU and GPU (if detected
on the system)

* cuda graph

* unit test adjustments

* cuda: optimize memory access

Read 4 bytes at a time (8 elements) when performing mul_mat_vec_mxfp4

* mac: fix crash on old macos versions

cblas_sgemm is only supported on v13.3 and up, however bf16 is
only supported on v14+ so we were falling back to ggml-blas and
crashing on bf16 tensors.  Checking for the function being null
seems to be the simplest way to conditionally avoid registering the
backend.

* server: Minimum context length for gptoss

This model requires a minimum context length of 8192 to function
effectively. Users can set higher values through all normal mechanisms
but lower values will be silently reset.

* ggml: Multiply by numParallel for gptoss sliding window

When computing the graph size estimate, the context size is already
multiplied by numParallel so estimates reflect that. However, since
sliding window models use a smaller, fixed context size, they need
to manually take numParallel into account.

* gpt-oss integration

includes harmony parser and thinking levels, etc.

* fix sync

* fix tests

* fix lint

---------

Co-authored-by: Daniel Hiltgen <daniel@ollama.com>
Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Devon Rifkin <drifkin@drifkin.net>
This commit is contained in:
Michael Yang
2025-08-05 12:21:16 -07:00
committed by GitHub
parent 0d38b66502
commit fa7776fd24
56 changed files with 6670 additions and 328 deletions

View File

@@ -276,6 +276,7 @@ type Tensor interface {
Cos(ctx Context) Tensor
Tanh(ctx Context) Tensor
GELU(ctx Context) Tensor
QuickGELU(ctx Context) Tensor
SILU(ctx Context) Tensor
RELU(ctx Context) Tensor
Sigmoid(ctx Context) Tensor
@@ -283,7 +284,7 @@ type Tensor interface {
Reshape(ctx Context, shape ...int) Tensor
View(ctx Context, offset int, shape ...int) Tensor
Permute(ctx Context, shape ...int) Tensor
Contiguous(ctx Context) Tensor
Contiguous(ctx Context, shape ...int) Tensor
Set(ctx Context, t2 Tensor, offset int, strides ...int) Tensor
Pad(ctx Context, shape ...int) Tensor
@@ -468,4 +469,5 @@ const (
DTypeQ80
DTypeQ40
DTypeI32
DTypeMXFP4
)

View File

@@ -239,10 +239,12 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
for _, bt := range bts {
if _, ok := ctxs[bt]; !ok {
// slog.Info("XXX before ggml_init")
ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
no_alloc: true,
})
// slog.Info("XXX after ggml_init")
}
targets[t.source.Name] = append(targets[t.source.Name], t.target)
@@ -541,6 +543,8 @@ func (b *Backend) NewContextSize(n int) ml.Context {
var allocatedBuffers []*C.struct_ggml_backend_buffer
// slog.Info("XXX before ggml_init")
// defer slog.Info("XXX after ggml_init")
return &Context{
b: b,
maxGraphNodes: n,
@@ -708,6 +712,8 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
cdtype = C.GGML_TYPE_Q4_0
case ml.DTypeI32:
cdtype = C.GGML_TYPE_I32
case ml.DTypeMXFP4:
cdtype = C.GGML_TYPE_MXFP4
default:
panic("unsupported dtype")
}
@@ -896,6 +902,8 @@ func (t *Tensor) DType() ml.DType {
return ml.DTypeQ40
case C.GGML_TYPE_I32:
return ml.DTypeI32
case C.GGML_TYPE_MXFP4:
return ml.DTypeMXFP4
default:
return ml.DTypeOther
}
@@ -958,10 +966,35 @@ func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
}
}
func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_cont(ctx.(*Context).ctx, t.t),
// Contiguous adds a ggml "cont" node for t to the graph held by ctx.
//
// With no shape arguments the plain ggml_cont op is used. With one to four
// shape arguments the matching ggml_cont_Nd op is used and the values are
// passed through as that op's dimensions. More than four dimensions panics.
func (t *Tensor) Contiguous(ctx ml.Context, shape ...int) ml.Tensor {
	if len(shape) > 4 {
		panic("unsupported number of dimensions")
	}

	gctx := ctx.(*Context).ctx

	var out *C.struct_ggml_tensor
	switch len(shape) {
	case 0:
		out = C.ggml_cont(gctx, t.t)
	case 1:
		out = C.ggml_cont_1d(gctx, t.t, C.int64_t(shape[0]))
	case 2:
		out = C.ggml_cont_2d(gctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]))
	case 3:
		out = C.ggml_cont_3d(gctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]))
	case 4:
		out = C.ggml_cont_4d(gctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3]))
	}

	return &Tensor{b: t.b, t: out}
}
@@ -1176,11 +1209,18 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
// Default options
opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}}
opts := rope.Options{
Factors: &Tensor{},
OriginalContextLength: 131072,
ExtrapolationFactor: 0.,
AttentionFactor: 1.,
BetaFast: 32.,
BetaSlow: 1.,
}
// Apply any provided options
for _, option := range options {
option(opts)
option(&opts)
}
dequant := t.t
@@ -1200,10 +1240,10 @@ func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase
C.int(opts.OriginalContextLength),
C.float(ropeBase),
C.float(ropeScale),
C.float(0.0),
C.float(1.0),
C.float(32.0),
C.float(1.0),
C.float(opts.ExtrapolationFactor),
C.float(opts.AttentionFactor),
C.float(opts.BetaFast),
C.float(opts.BetaSlow),
),
}
}
@@ -1222,6 +1262,13 @@ func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
}
}
// QuickGELU adds ggml's quick-GELU activation over t to the graph held by
// ctx, using the in-place variant of the op.
func (t *Tensor) QuickGELU(ctx ml.Context) ml.Tensor {
	activated := C.ggml_gelu_quick_inplace(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: activated}
}
func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,
@@ -1350,3 +1397,65 @@ func (t *Tensor) Clamp(ctx ml.Context, min, max float32) ml.Tensor {
t: C.ggml_clamp(ctx.(*Context).ctx, t.t, C.float(min), C.float(max)),
}
}
// FromBytes creates a tensor of the given dtype and shape and, when s is
// non-empty, copies raw bytes from s into the tensor's backend storage.
//
// The element count of s is deliberately not checked against shape, because
// for quantized dtypes bytes do not map one-to-one onto elements. The byte
// length is checked: ggml copies exactly ggml_nbytes(tensor) bytes starting
// at &s[0], so a shorter slice would be read past its end. Such input panics.
func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
	t := c.newTensor(dtype, shape)
	if len(s) == 0 {
		return t
	}

	raw := t.(*Tensor).t
	nbytes := C.ggml_nbytes(raw)
	if C.size_t(len(s)) < nbytes {
		panic("ml: FromBytes data is shorter than the tensor's byte size")
	}

	C.ggml_backend_tensor_set(raw, unsafe.Pointer(&s[0]), 0, nbytes)
	return t
}
// TODO - DRY this out with New if possible

// newTestBackend builds a CPU-only Backend for tests.
//
// It initialises the first CPU device reported by devices(), sets its thread
// count from Threads(runtime.NumCPU()), and creates a scheduler over that
// single backend. The scheduler graph size and maxGraphNodes are both
// max(8192, size). It panics if no CPU device is available.
func newTestBackend(size int) *Backend {
	// Only the first CPU device is used.
	var cpu *C.struct_ggml_backend_device
	for _, d := range devices() {
		if C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU {
			cpu = d
			break
		}
	}
	if cpu == nil {
		panic("no CPU backend device available")
	}

	b := C.ggml_backend_dev_init(cpu, nil)
	bt := C.ggml_backend_get_default_buffer_type(b)
	C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(runtime.NumCPU())))
	// C.ggml_backend_cpu_set_n_threads(b, 1) // DEBUGGING

	schedBackends := []*C.struct_ggml_backend{b}
	schedBufts := []*C.struct_ggml_backend_buffer_type{bt}
	graphSize := max(8192, size)

	return &Backend{
		meta: nil,
		sched: C.ggml_backend_sched_new(
			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
			C.int(len(schedBackends)),
			C.size_t(graphSize),
			false,
			false,
		),
		input:         bt,
		maxGraphNodes: graphSize,
		schedBackends: schedBackends,
		schedBufts:    schedBufts,
	}
}
// newTestContext creates a Context on b for tests. The node budget is
// max(8192, n); the ggml context is sized for that many tensor headers plus
// a graph of the same size, and is created with no_alloc set.
func newTestContext(b *Backend, n int) *Context {
	nodes := max(8192, n)

	memSize := C.size_t(nodes)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(nodes), false)
	params := C.struct_ggml_init_params{
		mem_size: memSize,
		no_alloc: true,
	}

	return &Context{
		b:             b,
		maxGraphNodes: nodes,
		ctx:           C.ggml_init(params),
	}
}

View File

@@ -353,7 +353,7 @@ extern "C" {
GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
// GGML_TYPE_Q4_2 = 4, support has been removed
GGML_TYPE_MXFP4 = 4, // Formerly removed type GGML_TYPE_Q4_2
// GGML_TYPE_Q4_3 = 5, support has been removed
GGML_TYPE_Q5_0 = 6,
GGML_TYPE_Q5_1 = 7,

View File

@@ -505,6 +505,11 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
};
ggml_backend_reg_t ggml_backend_blas_reg(void) {
// MacOS prior to v14 does not include cblas_sgemm - disable this backend if it isn't available
if (&cblas_sgemm == NULL) {
GGML_LOG_INFO("Disabling ggml-blas backend on old MacOS version\n");
return NULL;
}
static struct ggml_backend_reg ggml_backend_blas_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_blas_reg_i,

View File

@@ -417,6 +417,13 @@ typedef struct {
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
// Number of elements stored in one MXFP4 block.
#define MXFP4 32
// One MXFP4 block: a single shared scale byte followed by MXFP4 4-bit
// elements packed two per byte (MXFP4/2 bytes), 17 bytes in total with no
// padding (enforced by the static_assert below).
typedef struct {
uint8_t d; // scale E8M0 float
uint8_t qs[MXFP4 / 2]; // (32) 4 bit elements E2M1 float
} block_mxfp4;
static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + MXFP4/2, "wrong mxfp4 block size/padding");
#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL

View File

@@ -58,6 +58,8 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_mxfp4(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
#ifdef __cplusplus
}
#endif

View File

@@ -362,6 +362,11 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_MXFP4] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_mxfp4,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
};
const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {

View File

@@ -4965,6 +4965,7 @@ void ggml_compute_forward_clamp(
case GGML_TYPE_I32:
case GGML_TYPE_I64:
case GGML_TYPE_F64:
case GGML_TYPE_MXFP4:
case GGML_TYPE_COUNT:
{
GGML_ABORT("fatal error");

View File

@@ -250,3 +250,93 @@ ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, fl
}
return sum = (ggml_float)logf(sum);
}
// Number of elements stored in one MXFP4 block.
#define MXFP4 32
// One MXFP4 block: a shared scale byte followed by MXFP4 4-bit elements
// packed two per byte. The static_assert below guarantees there is no padding.
// NOTE(review): an identical MXFP4/block_mxfp4 definition appears in another
// file of this change; confirm the two are kept in sync.
typedef struct {
uint8_t d; // scale E8M0 float
uint8_t qs[MXFP4 / 2]; // (32) 4 bit elements E2M1 float
} block_mxfp4;
static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + MXFP4/2, "wrong mxfp4 block size/padding");
// Lookup table mapping a 4-bit element code (0..15) to its unscaled value.
// Codes 0..7 are the non-negative values; codes 8..15 repeat them negated,
// i.e. bit 3 of the code acts as the sign.
#define MXFP4_VALS {0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, 0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0}
// Dot product of an MXFP4-quantized row with an f32 vector.
//
//   n   - number of elements; the scalar path asserts n is a multiple of MXFP4,
//         and the SIMD tail loop consumes elements two at a time
//   s   - receives the resulting sum (a single float)
//   vx  - pointer to the row's block_mxfp4 blocks
//   y   - the f32 vector, n elements
//   nrc - must be 1; bs, bx and by are unused
//
// Each block's scale byte is decoded by placing it directly in the exponent
// field of an IEEE-754 float (d << 23). Each 4-bit element is decoded through
// the MXFP4_VALS lookup table; the low nibble of a byte is the even element
// and the high nibble the odd one.
void ggml_vec_dot_mxfp4(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    ggml_float mxfp4_table[] = MXFP4_VALS;

#if defined(GGML_SIMD)
    float sumf = 0.0f;
    const int np = (n & ~(GGML_F32_STEP - 1));
    const block_mxfp4 * GGML_RESTRICT xx = (const block_mxfp4 *) vx;
    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };

    GGML_F32_VEC scalev;
    GGML_F32_VEC ax[GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];
    for (int i = 0; i < np; i += GGML_F32_STEP) { // ARM: +16 AVX512: +64
        for (int j = 0; j < GGML_F32_ARR; j++) { // ARM: 0 .. 4 AVX512: 0 .. 4
            // convert GGML_F32_EPR X elements
            const int ib = (i + j*GGML_F32_EPR) / MXFP4;
            const block_mxfp4 * GGML_RESTRICT x = &xx[ib];
            union {
                uint32_t as_bits;
                float as_value;
            } scale;
            scale.as_bits = (((uint32_t)x->d) << 23);
            scalev = GGML_F32_VEC_SET1(scale.as_value);
            float xf[GGML_F32_EPR]= {0.f};
            // One vector of GGML_F32_EPR elements is decoded starting at this
            // offset inside the block; it must end at or before the block end.
            assert(((i+j*GGML_F32_EPR) % MXFP4) + GGML_F32_EPR <= MXFP4 && "block overrun");
            for (int qi = 0; qi < GGML_F32_EPR/2 ; ++qi) {
                xf[qi*2]   = mxfp4_table[(x->qs[((i+j*GGML_F32_EPR)%MXFP4)/2+qi] & 0xf)];
                xf[qi*2+1] = mxfp4_table[(x->qs[((i+j*GGML_F32_EPR)%MXFP4)/2+qi] & 0xf0) >> 4];
            }

            ax[j] = GGML_F32_VEC_MUL(GGML_F32_VEC_LOAD(xf), scalev);
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
        }
    }
    GGML_F32_VEC_REDUCE(sumf, sum);

    // leftovers
    for (int i = np; i < n; i+=2) {
        const int ib = i / MXFP4;
        const block_mxfp4 * GGML_RESTRICT x = &xx[ib];
        union {
            uint32_t as_bits;
            float as_value;
        } scale;
        scale.as_bits = (((uint32_t)x->d) << 23);
        sumf += y[i]   * scale.as_value * mxfp4_table[(x->qs[(i%MXFP4)/2] & 0xf)];
        sumf += y[i+1] * scale.as_value * mxfp4_table[(x->qs[(i%MXFP4)/2] & 0xf0) >> 4];
    }
#else // defined(GGML_SIMD)
    const int nb = n / MXFP4;
    assert(n % MXFP4 == 0);

    int yi = 0;
    const block_mxfp4 * GGML_RESTRICT xx = (const block_mxfp4 *) vx;
    ggml_float sumf = 0.0;
    for (int ib = 0; ib < nb; ++ib) {
        const block_mxfp4 * GGML_RESTRICT x = &xx[ib + 0];
        union {
            uint32_t as_bits;
            float as_value;
        } scale;
        scale.as_bits = (((uint32_t)x->d) << 23);
        for (int i = 0; i < MXFP4/2; ++i) {
            sumf += mxfp4_table[(x->qs[i] & 0xf)]       * (ggml_float)(scale.as_value) * (ggml_float)(y[ib*MXFP4 + i*2]);
            sumf += mxfp4_table[(x->qs[i] & 0xf0) >> 4] * (ggml_float)(scale.as_value) * (ggml_float)(y[ib*MXFP4 + i*2+1]);
        }
    }
#endif
    *s = sumf;
}

View File

@@ -42,6 +42,8 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_mxfp4(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_silu_f32(const int n, float * y, const float * x);
ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);

View File

@@ -571,6 +571,82 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
}
// MXFP4 dequantize derived from dequantize_block_q4_0
//
// Kernel: expands MXFP4 blocks in vx into dst_t values in yy.
//   vx   - array of block_mxfp4
//   yy   - output, one dst_t per element
//   nb32 - number of 32-element MXFP4 blocks available in vx
// Each CUDA block handles 256 outputs (8 MXFP4 blocks); each thread decodes
// 4 packed bytes, i.e. 8 outputs. Threads whose MXFP4 block index is >= nb32
// exit without writing.
template<typename dst_t>
static __global__ void dequantize_block_mxfp4(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
// Constants describing the half-precision destination encoding used below:
// exponent bias, the bit pattern of 0.5, and the number of mantissa bits.
const uint16_t dst_bias = 15;
const uint16_t dst_0p5 = 0x3800;
const uint16_t dst_m_bits = 10;
const int64_t i = blockIdx.x;
// assume 32 threads
const int64_t tid = threadIdx.x;
// il: which quarter (8 elements) of the MXFP4 block; ir: which of the 8
// MXFP4 blocks covered by this CUDA block.
const int64_t il = tid/8;
const int64_t ir = tid%8;
const int64_t ib = 8*i + ir;
if (ib >= nb32) {
return;
}
const uint64_t offset = 256*i + MXFP4*ir + 8*il;
dst_t * y = yy + offset;
const block_mxfp4 * x = (const block_mxfp4 *)vx + ib;
// Decode the scale byte by placing it in the exponent field of a float.
union {
uint32_t as_bits;
float as_value;
} scale;
scale.as_bits = (((uint32_t)x->d) << 23);
// offset within the block 1/4 chunks (8 items)
const uint8_t * q = x->qs + 4*il;
for (int l = 0; l < 4; ++l) {
// Exponent+mantissa bits (without sign) of the low and high nibble.
uint16_t em0 = q[l] & 0x07;
uint16_t em1 = q[l] & 0x70;
// float16 values
// (iq1m_scale_t is used here only for its u16/f16 views of the same bits)
iq1m_scale_t x0;
iq1m_scale_t x1;
// Move exponent/mantissa to the top of the half's exponent/mantissa field
// and the nibble's sign bit (0x08 / 0x80) to bit 15.
x0.u16 = (em0 << (dst_m_bits - 1)) | ((q[l] & 0x08) << 12);
x1.u16 = (em1 << (dst_m_bits - 5)) | ((q[l] & 0x80) << 8);
// Three cases:
// x is normal and non-zero: Correct bias
if ((em0 & 0x06) != 0) {
x0.u16 = x0.u16 + ((dst_bias - 1) << dst_m_bits);
}
if ((em1 & 0x60) != 0) {
x1.u16 = x1.u16 + ((dst_bias - 1) << dst_m_bits);
}
// x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
if (em0 == 0x01) {
x0.u16 = dst_0p5 | (x0.u16 & 0x8000);
}
if (em1 == 0x10) {
x1.u16 = dst_0p5 | (x1.u16 & 0x8000);
}
// x is zero, do nothing
// XXX it looks correct here - but mulmat still gives bad results...
// printf("i:%lld ir:%lld il:%lld l:%d y_offset:[%3lld +%d] = %f \n",
// i, ir, il, l, 256*i + 32*ir + 4*il, l*2+ 0, scale * float(x0.f16));
// printf("i:%lld ir:%lld il:%lld l:%d y_offset:[%3lld +%d] = %f \n",
// i, ir, il, l, 256*i + 32*ir + 4*il, l*2+ 1, scale * float(x1.f16));
// Low nibble is the even output, high nibble the odd output.
y[l*2] = scale.as_value * float(x0.f16);
y[l*2+1] = scale.as_value * float(x1.f16);
}
}
// Host launcher for dequantize_block_mxfp4 (modelled on dequantize_row_q4_0_cuda).
// Dequantizes k MXFP4 elements from vx into y on the given stream, using one
// 32-thread CUDA block per 256 elements (rounded up).
template<typename dst_t>
static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int n_mxfp4_blocks = k / 32;         // whole 32-element MXFP4 blocks in the input
    const int n_cuda_blocks  = (k + 255) / 256; // grid size: 256 outputs per CUDA block
    dequantize_block_mxfp4<<<n_cuda_blocks, 32, 0, stream>>>(vx, y, n_mxfp4_blocks);
}
template <typename src_t, typename dst_t>
static __global__ void convert_unary(
const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, const int64_t ne02,
@@ -664,6 +740,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
return convert_unary_cont_cuda<float>;
case GGML_TYPE_BF16:
return convert_unary_cont_cuda<nv_bfloat16>;
case GGML_TYPE_MXFP4:
return dequantize_row_mxfp4_cuda;
default:
return nullptr;
}
@@ -713,6 +791,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
return convert_unary_cont_cuda<half>;
case GGML_TYPE_BF16:
return convert_unary_cont_cuda<nv_bfloat16>;
case GGML_TYPE_MXFP4:
return dequantize_row_mxfp4_cuda;
default:
return nullptr;
}

View File

@@ -21,6 +21,7 @@
#include "ggml-cuda/im2col.cuh"
#include "ggml-cuda/mmq.cuh"
#include "ggml-cuda/mmv.cuh"
#include "ggml-cuda/mmvmxfp4.cuh"
#include "ggml-cuda/mmvq.cuh"
#include "ggml-cuda/norm.cuh"
#include "ggml-cuda/opt-step-adamw.cuh"
@@ -1202,7 +1203,7 @@ static void ggml_cuda_op_mul_mat_cublas(
const int cc = ggml_cuda_info().devices[id].cc;
const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT && src0->type != GGML_TYPE_MXFP4;
if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
@@ -1924,7 +1925,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE
&& src0->type != GGML_TYPE_MXFP4;
bool use_mul_mat_vec_mxfp4 = src0->type == GGML_TYPE_MXFP4
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
@@ -1978,6 +1983,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
} else if (use_mul_mat_q) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
} else if (use_mul_mat_vec_mxfp4) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_mxfp4, nullptr);
} else {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
}
@@ -1997,6 +2004,10 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
if (ne2 == 1 && src0->type == GGML_TYPE_MXFP4) {
ggml_cuda_mul_mat_vec_mxfp4(ctx, src0, src1, ids, dst);
return;
}
if (ne2 == 1) {
if (ggml_is_quantized(src0->type)) {
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
@@ -2498,20 +2509,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
#endif
}
// workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
// number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256
&& node->ne[2] == 1
&& node->ne[3] == 1
&& node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false
&& node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) {
// Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
use_cuda_graph = false;
#ifndef NDEBUG
GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
#endif
}
if (node->op == GGML_OP_CPY) {
// Store the pointers which are updated for each token, such that these can be sent
@@ -3056,6 +3053,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_BF16:
case GGML_TYPE_MXFP4:
#ifdef GGML_USE_MUSA
if (a->type == GGML_TYPE_Q3_K) {
return false;

View File

@@ -0,0 +1,307 @@
#include "ggml.h"
#include "common.cuh"
#include "mmvmxfp4.cuh"
// MXFP4 implementation derived from mmv.cu float32 code paths
// Views the same 16 bits either as a half-precision float or as a raw
// integer, so a half value can be assembled with integer bit operations.
typedef union {
half f16;
uint16_t u16;
} f16_t;
// Kernel: matrix-vector product where the matrix x is MXFP4-quantized and the
// vector y is f32. One CUDA block computes one output element dst[row] for
// the (channel, sample) selected by blockIdx.y / blockIdx.z.
//
//   x       - matrix rows as block_mxfp4; the x strides are in block_mxfp4 units
//   y       - f32 vector(s), read two floats at a time
//   ids     - optional expert ids; when non-null the x channel is ids[channel_dst]
//             and the y channel is channel_dst % nchannels_y
//   dst     - f32 output
//   ncols2  - number of column pairs; each pair is one packed byte of x
//
// Requires dynamic shared memory for the cross-warp reduction when
// block_size > warp_size.
template <typename type_acc, int block_size> // TODO type_acc unused - consider bf16 support
static __global__ void mul_mat_vec_mxfp4(
const block_mxfp4 * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
const int64_t ncols2, const int64_t nchannels_y, const int64_t stride_row,
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) {
const int64_t row = blockIdx.x;
const int64_t channel_dst = blockIdx.y;
const int64_t channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio;
const int64_t channel_y = ids ? channel_dst % nchannels_y : channel_dst;
const int64_t sample_dst = blockIdx.z;
const int64_t sample_x = sample_dst / sample_ratio;
const int64_t sample_y = sample_dst;
const int tid = threadIdx.x;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
// Constants describing the half-precision encoding built below:
// exponent bias, the bit pattern of 0.5, and the number of mantissa bits.
const uint16_t dst_bias = 15;
const uint16_t dst_0p5 = 0x3800;
const uint16_t dst_m_bits = 10;
// Advance the base pointers to this block's row / channel / sample.
x += sample_x *stride_sample_x + channel_x *stride_channel_x + row*stride_row;
y += sample_y *stride_sample_y + channel_y *stride_channel_y;
dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst;
const float2 * y2 = (const float2 *) y;
extern __shared__ char data_mmv[]; // allocated in GPU shared memory: warp_size*sizeof(float)
float * buf_iw = (float *) data_mmv;
// Zero the per-warp partial-sum slots before they are used for the
// cross-warp reduction.
if (block_size > warp_size) {
if (tid < warp_size) {
buf_iw[tid] = 0.0f;
}
__syncthreads();
}
float sumf = 0.0f;
// Each thread walks the row in steps of block_size column pairs.
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
// offset0: which MXFP4 block of the row; i: which packed byte inside it.
int offset0 = col2 / (MXFP4/2);
int i = col2 % (MXFP4/2);
const block_mxfp4 *x2 = x+offset0;
// Decode the scale byte by placing it in the exponent field of a float.
union {
uint32_t as_bits;
float as_value;
} scale;
scale.as_bits = (((uint32_t)x2->d) << 23);
// Exponent+mantissa bits (without sign) of the low and high nibble.
uint16_t em0 = x2->qs[i] & 0x07;
uint16_t em1 = x2->qs[i] & 0x70;
// float16 values
f16_t x0;
f16_t x1;
x0.u16 = (em0 << (dst_m_bits - 1)) | ((x2->qs[i] & 0x08) << 12);
x1.u16 = (em1 << (dst_m_bits - 5)) | ((x2->qs[i] & 0x80) << 8);
// Three cases:
// x is normal and non-zero: Correct bias
if ((em0 & 0x06) != 0) {
x0.u16 = x0.u16 + ((dst_bias - 1) << dst_m_bits);
}
if ((em1 & 0x60) != 0) {
x1.u16 = x1.u16 + ((dst_bias - 1) << dst_m_bits);
}
// x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
if (em0 == 0x01) {
x0.u16 = dst_0p5 | (x0.u16 & 0x8000);
}
if (em1 == 0x10) {
x1.u16 = dst_0p5 | (x1.u16 & 0x8000);
}
// x is zero, do nothing
// A NaN scale replaces this thread's partial sum and ends its loop.
if (isnan(scale.as_value)) {
sumf = scale.as_value;
break;
}
const float2 tmpx = {x0.f16, x1.f16};
const float2 tmpy = y2[col2];
sumf += tmpx.x*tmpy.x*scale.as_value;
sumf += tmpx.y*tmpy.y*scale.as_value;
}
// Reduce within each warp, then (if there are several warps) across warps
// through shared memory using the first warp.
sumf = warp_reduce_sum<warp_size>(sumf);
if (block_size > warp_size) {
buf_iw[tid/warp_size] = sumf;
__syncthreads();
if (tid >= warp_size) {
return;
}
sumf = buf_iw[tid];
sumf = warp_reduce_sum<warp_size>(sumf);
}
// Only thread 0 writes the result.
if (tid != 0) {
return;
}
dst[row] = sumf;
}
// Chooses a thread-block size and launches mul_mat_vec_mxfp4 on `stream`.
//
// The block size is a multiple of the device warp size, at most 256 (128 on
// AMD devices older than RDNA1), picked as the smallest size that minimises
// the number of loop iterations each thread performs over the row. The grid
// is (nrows, nchannels_dst, nsamples_dst) and the kernel receives ncols/2
// column pairs. ncols must be even.
template <typename type_acc>
static void launch_mul_mat_vec_cuda_mxfp4(
        const block_mxfp4 * x, const float * y, const int32_t * ids, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
        cudaStream_t stream) {
    GGML_ASSERT(ncols % 2 == 0);
    // GGML_ASSERT(stride_row % 2 == 0); // TODO
    GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
    GGML_ASSERT(       nsamples_dst  % nsamples_x  == 0);

    const int64_t channel_ratio = nchannels_dst / nchannels_x;
    const int64_t sample_ratio  = nsamples_dst  / nsamples_x;

    int device;
    CUDA_CHECK(cudaGetDevice(&device));
    const int warp_size = ggml_cuda_info().devices[device].warp_size;
    const int device_cc = ggml_cuda_info().devices[device].cc;

    const bool    old_amd        = device_cc > GGML_CUDA_CC_OFFSET_AMD && device_cc < GGML_CUDA_CC_RDNA1;
    const int64_t max_block_size = old_amd ? 128 : 256;

    // Start from a single warp and only move to a larger block when it
    // strictly reduces the per-thread iteration count.
    int64_t block_size_best = warp_size;
    int64_t niter_best      = (ncols + 2*warp_size - 1) / (2*warp_size);
    for (int64_t candidate = 2*warp_size; candidate <= max_block_size; candidate += warp_size) {
        const int64_t niter = (ncols + 2*candidate - 1) / (2*candidate);
        if (niter < niter_best) {
            niter_best      = niter;
            block_size_best = candidate;
        }
    }

    const int  shared_bytes = warp_size*sizeof(float);
    const dim3 grid(nrows, nchannels_dst, nsamples_dst);
    const dim3 threads(block_size_best, 1, 1);

    // The block size is a template parameter of the kernel, so every
    // supported size needs its own instantiation.
#define MXFP4_MMV_LAUNCH_CASE(BS)                                                             \
        case BS: {                                                                            \
            mul_mat_vec_mxfp4<type_acc, BS><<<grid, threads, shared_bytes, stream>>>          \
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio,             \
                 stride_channel_x, stride_channel_y, stride_channel_dst,                      \
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);          \
        } break;

    switch (block_size_best) {
        MXFP4_MMV_LAUNCH_CASE(32)
        MXFP4_MMV_LAUNCH_CASE(64)
        MXFP4_MMV_LAUNCH_CASE(96)
        MXFP4_MMV_LAUNCH_CASE(128)
        MXFP4_MMV_LAUNCH_CASE(160)
        MXFP4_MMV_LAUNCH_CASE(192)
        MXFP4_MMV_LAUNCH_CASE(224)
        MXFP4_MMV_LAUNCH_CASE(256)
        default: {
            GGML_ABORT("fatal error");
        } break;
    }

#undef MXFP4_MMV_LAUNCH_CASE
}
// Thin entry point that forwards to launch_mul_mat_vec_cuda_mxfp4 with f32
// accumulation.
//
// `prec` is accepted so callers can pass the requested precision, but it is
// currently ignored: the launcher is always instantiated with float.
static void mul_mat_vec_cuda_mxfp4(
        const block_mxfp4 * x, const float * y, const int32_t * ids, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
        enum ggml_prec prec, cudaStream_t stream) {
    GGML_UNUSED(prec);

    launch_mul_mat_vec_cuda_mxfp4<float>
        (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
         stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
}
// Matrix-vector product for an MXFP4 src0 and f32 src1, writing f32 dst.
// When `ids` is non-null this serves the MUL_MAT_ID layout (ids must be I32
// and ne12 must be 1). Strides passed to the kernel are derived from the
// tensors' byte strides divided by the respective type sizes. The destination
// must have exactly one column (asserted below).
void ggml_cuda_mul_mat_vec_mxfp4(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
GGML_ASSERT( src1->type == GGML_TYPE_F32);
GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
// Declares the ne*/nb* locals used below.
GGML_TENSOR_BINARY_OP_LOCALS;
const size_t ts_src0 = ggml_type_size(src0->type);
const size_t ts_src1 = ggml_type_size(src1->type);
const size_t ts_dst = ggml_type_size(dst->type);
GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.
GGML_ASSERT(ne13 == ne3);
// GGML_ASSERT( nb00 == ts_src0); // TODO adjust for block sizing logic
GGML_ASSERT( nb10 == ts_src1);
GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
GGML_ASSERT( nb0 == ts_dst);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
const float * src1_d = (const float *) src1->data;
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
float * dst_d = (float *) dst->data;
// Convert byte strides to element strides (src0 strides end up in units of
// its type size).
const int64_t stride_row = src0->nb[1] / ts_src0;
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s1 = dst->nb[1] / ts_dst;
const int64_t stride_channel_x = src0->nb[2] / ts_src0;
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s2 = dst->nb[2] / ts_dst;
const int64_t stride_sample_x = src0->nb[3] / ts_src0;
const int64_t stride_sample_y = src1->nb[3] / ts_src1;
const int64_t stride_sample_dst = dst->nb[3] / ts_dst;
const int64_t nsamples_dst = ne3;
const int64_t nsamples_x = ne03;
const int64_t nchannels_x = ne02;
const int64_t nrows = ne01;
const int64_t ncols = ne00;
// For MUL_MAT_ID the memory layout is different than for MUL_MAT:
const int64_t ncols_dst = ids ? ne2 : ne1;
const int64_t nchannels_y = ids ? ne11 : ne12;
const int64_t nchannels_dst = ids ? ne1 : ne2;
const int64_t stride_channel_dst = ids ? s1 : s2;
const int64_t stride_channel_y = ids ? s11 : s12;
GGML_ASSERT(ncols_dst == 1);
const block_mxfp4 * src0_d = (const block_mxfp4 *) src0->data;
mul_mat_vec_cuda_mxfp4(src0_d, src1_d, ids_d, dst_d, ncols, nrows, stride_row,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, ctx.stream());
}
// ggml_cuda_op_mul_mat callback for an MXFP4 src0 times a single f32 column.
//
// Computes rows [row_low, row_high) of the product of the MXFP4 matrix at
// src0_dd_i with the f32 vector at src1_ddf_i, writing to dst_dd_i on
// `stream`. The inputs are treated as one contiguous matrix: a single
// channel and sample, with a row stride of ne00 / MXFP4 blocks.
void ggml_cuda_op_mul_mat_vec_mxfp4(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
    GGML_ASSERT(src1_ncols == 1);

    const int device_cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    const enum ggml_prec prec = fast_fp16_available(device_cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;

    const int64_t ncols          = src0->ne[0];
    const int64_t nrows          = row_high - row_low;
    // ggml_cuda_op provides single, contiguous matrices
    const int64_t blocks_per_row = ncols / MXFP4;

    mul_mat_vec_cuda_mxfp4(
        (const block_mxfp4 *) src0_dd_i, src1_ddf_i, nullptr, dst_dd_i, ncols, nrows, blocks_per_row,
        /* nchannels_x, nchannels_y, nchannels_dst */ 1, 1, 1,
        /* stride_channel_x, stride_channel_y, stride_channel_dst */ 0, 0, 0,
        /* nsamples_x, nsamples_dst */ 1, 1,
        /* stride_sample_x, stride_sample_y, stride_sample_dst */ 0, 0, 0,
        prec, stream);

    GGML_UNUSED(ctx);
    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddq_i);
    GGML_UNUSED(src1_ncols);
    GGML_UNUSED(src1_padded_row_size);
}

View File

@@ -0,0 +1,9 @@
#include "common.cuh"
// Matrix-vector multiplication with MXFP4 src0 operating directly on ggml tensors.
// `ids` may be null; when it is set the MUL_MAT_ID memory layout is used.
void ggml_cuda_mul_mat_vec_mxfp4(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
// Variant taking raw device pointers and a [row_low, row_high) row range of src0.
// Requires F32 src1/dst and src1_ncols == 1.
void ggml_cuda_op_mul_mat_vec_mxfp4(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream);

View File

@@ -421,6 +421,13 @@ typedef struct {
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
// Number of elements stored in one MXFP4 block.
#define MXFP4 32
// One MXFP4 block: a single shared 8-bit scale followed by 32 packed 4-bit
// elements, two per byte (first element of each pair in the low nibble).
typedef struct {
uint8_t d; // scale E8M0 float
uint8_t qs[MXFP4 / 2]; // (32) 4 bit elements E2M1 float
} block_mxfp4;
// The block must be exactly 1 + 16 bytes, i.e. have no padding.
static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + MXFP4/2, "wrong mxfp4 block size/padding");
#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL
@@ -1929,6 +1936,9 @@ GGML_TABLE_END()
#define N_R0_IQ4_XS 2
#define N_SG_IQ4_XS 2
// MXFP4 mat-vec kernel tiling: src0 rows handled per SIMD group (N_R0) and
// SIMD groups per threadgroup (N_SG).
#define N_R0_MXFP4 4
#define N_SG_MXFP4 2
// kernel argument structs
//
// - element counters (e.g. ne00) typically use int32_t to reduce register usage
@@ -4380,16 +4390,16 @@ void mul_vec_q_n_f32_impl(
device const char * src1,
device char * dst,
threadgroup char * shmem,
uint3 tgpig,
ushort tiisg,
ushort sgitg) {
const int nb = args.ne00/QK4_0;
uint3 tgpig, // Threadgroup Position in Grid
ushort tiisg, // Thread Index in SIMD Group
ushort sgitg) { // SIMD Group Index in ThreadGroup
const int nb = args.ne00/QK4_0; // src0->ne[0] / 32
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int im = tgpig.z;
const int first_row = (r0 * nsg + sgitg) * nr0;
const int first_row = (r0 * nsg + sgitg) * nr0; // nsg=2 nr0=4
const uint i12 = im%args.ne12;
const uint i13 = im/args.ne12;
@@ -9222,6 +9232,49 @@ kernel void kernel_mul_mm_id(
}
}
// Dequantizes one half of an MXFP4 block (16 values) into a 4x4 register tile.
//   xb  - source block
//   il  - which half of the block to decode: 0 = first 16 values, 1 = last 16
//   reg - destination tile; values are computed in a float4x4 and cast to type4x4
template <typename type4x4>
void dequantize_mxfp4(device const block_mxfp4 * xb, short il, thread type4x4 & reg) {
float4x4 reg_f;
// Parameters of the float16 encoding the 4-bit elements are expanded into:
// exponent bias, the bit pattern of 0.5, and the number of mantissa bits.
const ushort dst_bias = 15;
const ushort dst_0p5 = 0x3800;
const ushort dst_m_bits = 10;
// The shared scale byte is placed directly into the float32 exponent field,
// then narrowed to half.
// NOTE(review): the narrowing limits the usable scale range to what half can
// represent - confirm this is acceptable for all expected scale values.
const half scale = (half)(as_type<float>(((uint32_t)xb->d) << 23));
// il:0 first 16, il:1 last 16
for (int i = 0; i < 8; i++) {
// Exponent+mantissa bits (3 bits) of the low-nibble and high-nibble elements.
ushort em0 = xb->qs[il*8 + i] & 0x07;
ushort em1 = xb->qs[il*8 + i] & 0x70;
// float16 values
// Move exponent/mantissa bits to the top of the float16 exponent/mantissa
// fields and the sign bit (0x08 / 0x80) to bit 15.
ushort x0 = (em0 << (dst_m_bits - 1)) | ((xb->qs[il*8 + i] & 0x08) << 12);
ushort x1 = (em1 << (dst_m_bits - 5)) | ((xb->qs[il*8 + i] & 0x80) << 8);
// Three cases:
// x is normal and non-zero: Correct bias
if ((em0 & 0x06) != 0) {
x0 = x0 + ((dst_bias - 1) << dst_m_bits);
}
if ((em1 & 0x60) != 0) {
x1 = x1 + ((dst_bias - 1) << dst_m_bits);
}
// x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
if (em0 == 0x01) {
x0 = dst_0p5 | (x0 & 0x8000);
}
if (em1 == 0x10) {
x1 = dst_0p5 | (x1 & 0x8000);
}
// x is zero, do nothing
// NOTE(review): d == 0xFF shifted by 23 is the float32 bit pattern of +Inf,
// not NaN, so this isnan branch looks unreachable - confirm intended
// handling of a NaN scale.
if (isnan(scale)) {
reg_f[i/2][2*(i%2) + 0] = scale;
reg_f[i/2][2*(i%2) + 1] = scale;
} else {
// Byte i of the half-block fills two adjacent entries of the tile.
reg_f[i/2][2*(i%2) + 0] = scale * as_type<half>(x0);
reg_f[i/2][2*(i%2) + 1] = scale * as_type<half>(x1);
}
}
reg = (type4x4) reg_f;
}
#define QK_NL 16
//
@@ -9289,6 +9342,8 @@ template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mul_mm_t kernel_mul_m
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl>;
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs>;
template [[host_name("kernel_mul_mm_mxfp4_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_mxfp4, 2, dequantize_mxfp4>;
//
// indirect matrix-matrix multiplication
//
@@ -9320,6 +9375,8 @@ template [[host_name("kernel_mul_mm_id_iq1_m_f16")]] kernel mul_mm_id kernel_m
template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl>;
template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs>;
template [[host_name("kernel_mul_mm_id_mxfp4_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_mxfp4, 2, dequantize_mxfp4>;
//
// matrix-vector multiplication
@@ -9436,6 +9493,120 @@ kernel void kernel_mul_mv_id(
sgitg);
}
// MXFP4 matrix-vector product, derived from mul_vec_q_n_f32_impl and block_q_n_dot_y.
//
// Each SIMD group accumulates nr0 consecutive src0 rows against one src1 row.
// Two threads share a block: thread tiisg starts at block tiisg/2 and then
// visits every (nw/2)-th block after it, handling the 16-element half of the
// block selected by tiisg%2. Per-row partial sums are combined with simd_sum
// and lane 0 stores the result.
//
// shmem is unused; it is kept so the function has the same parameter list as
// the other mat-vec implementations passed to mmv_fn.
void mul_mv_mxfp4_f32_impl(
        ggml_metal_kargs_mul_mv args,
        device const char * src0,
        device const char * src1,
        device       char * dst,
        threadgroup  char * shmem,
        uint3  tgpig,
        ushort tiisg,
        ushort sgitg) {
    // float16 encoding parameters used when expanding the 4-bit elements:
    // exponent bias, bit pattern of 0.5, number of mantissa bits
    const ushort dst_bias   = 15;
    const ushort dst_0p5    = 0x3800;
    const ushort dst_m_bits = 10;

    const int nr0 = N_R0_MXFP4;
    const int nsg = N_SG_MXFP4;
    const int nw  = N_SIMDWIDTH;

    const int nb = args.ne00/MXFP4; // blocks per src0 row

    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;

    const int first_row = (r0 * nsg + sgitg) * nr0;

    const uint i12 = im%args.ne12;
    const uint i13 = im/args.ne12;

    const uint64_t offset1 = r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const float * y = (device const float *) (src1 + offset1);

    // pointers to src0 rows
    device const block_mxfp4 * ax[nr0];
    for (int row = 0; row < nr0; ++row) {
        const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;

        ax[row] = (device const block_mxfp4 *) ((device char *) src0 + offset0);
    }

    float sumf[nr0] = {0.f};

    const short ix = (tiisg/2);    // first block handled by this thread
    const short il = (tiisg%2)*16; // element offset inside the block: 0 or 16

    device const float * yb = y + ix*MXFP4 + il;

    // each thread in a SIMD group deals with half a block.
    for (int ib = ix; ib < nb; ib += nw/2) {
#pragma unroll
        for (short row = 0; row < nr0; row++) {
            // Processes 16 items
            device const block_mxfp4 * qb_curr = ax[row] + ib;

            // the shared scale byte goes straight into the float32 exponent field
            float d = as_type<float>(((uint32_t)qb_curr->d) << 23);

            // il = 0 or 16 -> first or second 8 bytes of packed elements
            device const uint8_t * qs = qb_curr->qs + il/2;

            for (int i = 0; i < 8; ++i) {
                // exponent+mantissa bits of the low- and high-nibble elements
                ushort em0 = qs[i] & 0x07;
                ushort em1 = qs[i] & 0x70;

                // float16 bit patterns: exponent/mantissa moved into place, sign to bit 15
                ushort x0 = (em0 << (dst_m_bits - 1)) | ((qs[i] & 0x08) << 12);
                ushort x1 = (em1 << (dst_m_bits - 5)) | ((qs[i] & 0x80) << 8);

                // Three cases:
                // x is normal and non-zero: Correct bias
                if ((em0 & 0x06) != 0) {
                    x0 = x0 + ((dst_bias - 1) << dst_m_bits);
                }
                if ((em1 & 0x60) != 0) {
                    x1 = x1 + ((dst_bias - 1) << dst_m_bits);
                }
                // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
                if (em0 == 0x01) {
                    x0 = dst_0p5 | (x0 & 0x8000);
                }
                if (em1 == 0x10) {
                    x1 = dst_0p5 | (x1 & 0x8000);
                }
                // x is zero, do nothing

                if (!isnan(d)) {
                    sumf[row] += yb[i*2]   * as_type<half>(x0) * d
                               + yb[i*2+1] * as_type<half>(x1) * d;
                } else {
                    sumf[row] = d;
                }
            }
        }

        // advance to the same half of the block nw/2 (= 16) blocks further on
        yb += MXFP4 * 16;
    }

    device float * dst_f32 = (device float *) dst + im*args.ne0*args.ne1 + r1*args.ne0;

    for (int row = 0; row < nr0; ++row) {
        const float tot = simd_sum(sumf[row]);

        // only lane 0 writes, and only for rows that exist in src0
        if (tiisg == 0 && first_row + row < args.ne01) {
            dst_f32[first_row + row] = tot;
        }
    }
}
// Kernel entry point for the MXFP4 x F32 matrix-vector product; it forwards all
// arguments unchanged to mul_mv_mxfp4_f32_impl.
[[host_name("kernel_mul_mv_mxfp4_f32")]]
kernel void kernel_mul_mv_mxfp4_f32(
constant ggml_metal_kargs_mul_mv & args,
device const char * src0,
device const char * src1,
device char * dst,
threadgroup char * shmem [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
ushort tiisg[[thread_index_in_simdgroup]],
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
mul_mv_mxfp4_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
}
typedef decltype(kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>) kernel_mul_mv_id_t;
template [[host_name("kernel_mul_mv_id_f32_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>;
@@ -9465,6 +9636,8 @@ template [[host_name("kernel_mul_mv_id_iq2_s_f32")]] kernel kernel_mul_mv_id_t
template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl <N_R0_IQ4_NL, N_SG_IQ4_NL, N_SIMDWIDTH>>>;
template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl <N_R0_IQ4_XS, N_SG_IQ4_XS, N_SIMDWIDTH>>>;
template [[host_name("kernel_mul_mv_id_mxfp4_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_mv_mxfp4_f32_impl>>;
kernel void kernel_pool_2d_max_f32(
device const float * src0,
device float * dst,

View File

@@ -65,6 +65,9 @@
#define N_R0_IQ4_XS 2
#define N_SG_IQ4_XS 2
// MXFP4 mat-vec kernel tiling: src0 rows handled per SIMD group (N_R0) and
// SIMD groups per threadgroup (N_SG).
#define N_R0_MXFP4 4
#define N_SG_MXFP4 2
// kernel argument structs
//
// - element counters (e.g. ne00) typically use int32_t to reduce register usage

View File

@@ -40,6 +40,7 @@ static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
static struct ggml_backend_reg g_ggml_backend_metal_reg;
static struct ggml_backend_device g_ggml_backend_metal_device;
// information about a Metal device
// note: assumes single GPU device - the default one
// TODO: support multiple GPU devices
@@ -209,6 +210,7 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3,
GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4,
@@ -288,6 +290,7 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32,
@@ -310,6 +313,7 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16,
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32,
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16,
@@ -334,6 +338,7 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16,
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16,
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16,
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16,
GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,
GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,
@@ -934,7 +939,7 @@ static id<MTLLibrary> ggml_metal_load_library(id<MTLDevice> device, bool use_bfl
MTLCompileOptions * options = [MTLCompileOptions new];
options.preprocessorMacros = prep;
//[options setFastMathEnabled:false];
metal_library = [device newLibraryWithSource:src options:options error:&error];
@@ -1157,6 +1162,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32, mul_mv_mxfp4_f32, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_2, mul_mv_ext_f16_f32_r1_2, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_3, mul_mv_ext_f16_f32_r1_3, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_EXT_F16_F32_R1_4, mul_mv_ext_f16_f32_r1_4, has_simdgroup_reduction);
@@ -1236,6 +1242,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, mul_mv_id_iq1_m_f32, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, mul_mv_id_iq4_xs_f32, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32, mul_mv_id_mxfp4_f32, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32, mul_mm_bf16_f32, has_simdgroup_mm && use_bfloat);
@@ -1258,6 +1265,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32, mul_mm_mxfp4_f32, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16, mul_mm_id_map0_f16, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32, mul_mm_id_map1_f32, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16, mul_mm_id_f32_f16, has_simdgroup_mm);
@@ -1282,6 +1290,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16, mul_mm_id_iq1_m_f16, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16, mul_mm_id_iq4_nl_f16, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16, mul_mm_id_iq4_xs_f16, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16, mul_mm_id_mxfp4_f16, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, rope_norm_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, rope_norm_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32, rope_multi_f32, true);
@@ -3007,6 +3016,7 @@ static bool ggml_metal_encode_node(
case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
case GGML_TYPE_MXFP4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32 ].pipeline; break;
default: GGML_ABORT("MUL MAT-MAT not implemented");
}
@@ -3212,6 +3222,12 @@ static bool ggml_metal_encode_node(
smem = 32*sizeof(float);
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline;
} break;
case GGML_TYPE_MXFP4:
{
nsg = N_SG_MXFP4;
nr0 = N_R0_MXFP4;
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_MXFP4_F32].pipeline;
} break;
default:
{
GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
@@ -3396,6 +3412,7 @@ static bool ggml_metal_encode_node(
case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16 ].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16 ].pipeline; break;
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16 ].pipeline; break;
case GGML_TYPE_MXFP4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MXFP4_F16 ].pipeline; break;
default: GGML_ABORT("MUL_MAT_ID not implemented");
}
@@ -3607,6 +3624,12 @@ static bool ggml_metal_encode_node(
smem = 32*sizeof(float);
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
} break;
case GGML_TYPE_MXFP4:
{
nsg = N_SG_MXFP4;
nr0 = N_R0_MXFP4;
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_MXFP4_F32].pipeline;
} break;
default:
{
GGML_LOG_ERROR("Asserting on type %d\n", (int)src2t);

View File

@@ -1902,16 +1902,16 @@ void mul_vec_q_n_f32_impl(
device const char * src1,
device char * dst,
threadgroup char * shmem,
uint3 tgpig,
ushort tiisg,
ushort sgitg) {
const int nb = args.ne00/QK4_0;
uint3 tgpig, // Threadgroup Position in Grid
ushort tiisg, // Thread Index in SIMD Group
ushort sgitg) { // SIMD Group Index in ThreadGroup
const int nb = args.ne00/QK4_0; // src0->ne[0] / 32
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int im = tgpig.z;
const int first_row = (r0 * nsg + sgitg) * nr0;
const int first_row = (r0 * nsg + sgitg) * nr0; // nsg=2 nr0=4
const uint i12 = im%args.ne12;
const uint i13 = im/args.ne12;
@@ -6744,6 +6744,49 @@ kernel void kernel_mul_mm_id(
}
}
// Dequantizes one half of an MXFP4 block (16 values) into a 4x4 register tile.
//   xb  - source block
//   il  - which half of the block to decode: 0 = first 16 values, 1 = last 16
//   reg - destination tile; values are computed in a float4x4 and cast to type4x4
template <typename type4x4>
void dequantize_mxfp4(device const block_mxfp4 * xb, short il, thread type4x4 & reg) {
float4x4 reg_f;
// Parameters of the float16 encoding the 4-bit elements are expanded into:
// exponent bias, the bit pattern of 0.5, and the number of mantissa bits.
const ushort dst_bias = 15;
const ushort dst_0p5 = 0x3800;
const ushort dst_m_bits = 10;
// The shared scale byte is placed directly into the float32 exponent field,
// then narrowed to half.
// NOTE(review): the narrowing limits the usable scale range to what half can
// represent - confirm this is acceptable for all expected scale values.
const half scale = (half)(as_type<float>(((uint32_t)xb->d) << 23));
// il:0 first 16, il:1 last 16
for (int i = 0; i < 8; i++) {
// Exponent+mantissa bits (3 bits) of the low-nibble and high-nibble elements.
ushort em0 = xb->qs[il*8 + i] & 0x07;
ushort em1 = xb->qs[il*8 + i] & 0x70;
// float16 values
// Move exponent/mantissa bits to the top of the float16 exponent/mantissa
// fields and the sign bit (0x08 / 0x80) to bit 15.
ushort x0 = (em0 << (dst_m_bits - 1)) | ((xb->qs[il*8 + i] & 0x08) << 12);
ushort x1 = (em1 << (dst_m_bits - 5)) | ((xb->qs[il*8 + i] & 0x80) << 8);
// Three cases:
// x is normal and non-zero: Correct bias
if ((em0 & 0x06) != 0) {
x0 = x0 + ((dst_bias - 1) << dst_m_bits);
}
if ((em1 & 0x60) != 0) {
x1 = x1 + ((dst_bias - 1) << dst_m_bits);
}
// x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
if (em0 == 0x01) {
x0 = dst_0p5 | (x0 & 0x8000);
}
if (em1 == 0x10) {
x1 = dst_0p5 | (x1 & 0x8000);
}
// x is zero, do nothing
// NOTE(review): d == 0xFF shifted by 23 is the float32 bit pattern of +Inf,
// not NaN, so this isnan branch looks unreachable - confirm intended
// handling of a NaN scale.
if (isnan(scale)) {
reg_f[i/2][2*(i%2) + 0] = scale;
reg_f[i/2][2*(i%2) + 1] = scale;
} else {
// Byte i of the half-block fills two adjacent entries of the tile.
reg_f[i/2][2*(i%2) + 0] = scale * as_type<half>(x0);
reg_f[i/2][2*(i%2) + 1] = scale * as_type<half>(x1);
}
}
reg = (type4x4) reg_f;
}
#define QK_NL 16
//
@@ -6811,6 +6854,8 @@ template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mul_mm_t kernel_mul_m
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl>;
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs>;
template [[host_name("kernel_mul_mm_mxfp4_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, block_mxfp4, 2, dequantize_mxfp4>;
//
// indirect matrix-matrix multiplication
//
@@ -6842,6 +6887,8 @@ template [[host_name("kernel_mul_mm_id_iq1_m_f16")]] kernel mul_mm_id kernel_m
template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl>;
template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs>;
template [[host_name("kernel_mul_mm_id_mxfp4_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_mxfp4, 2, dequantize_mxfp4>;
//
// matrix-vector multiplication
@@ -6958,6 +7005,120 @@ kernel void kernel_mul_mv_id(
sgitg);
}
// MXFP4 matrix-vector product, derived from mul_vec_q_n_f32_impl and block_q_n_dot_y.
//
// Each SIMD group accumulates nr0 consecutive src0 rows against one src1 row.
// Two threads share a block: thread tiisg starts at block tiisg/2 and then
// visits every (nw/2)-th block after it, handling the 16-element half of the
// block selected by tiisg%2. Per-row partial sums are combined with simd_sum
// and lane 0 stores the result.
//
// shmem is unused; it is kept so the function has the same parameter list as
// the other mat-vec implementations passed to mmv_fn.
void mul_mv_mxfp4_f32_impl(
        ggml_metal_kargs_mul_mv args,
        device const char * src0,
        device const char * src1,
        device       char * dst,
        threadgroup  char * shmem,
        uint3  tgpig,
        ushort tiisg,
        ushort sgitg) {
    // float16 encoding parameters used when expanding the 4-bit elements:
    // exponent bias, bit pattern of 0.5, number of mantissa bits
    const ushort dst_bias   = 15;
    const ushort dst_0p5    = 0x3800;
    const ushort dst_m_bits = 10;

    const int nr0 = N_R0_MXFP4;
    const int nsg = N_SG_MXFP4;
    const int nw  = N_SIMDWIDTH;

    const int nb = args.ne00/MXFP4; // blocks per src0 row

    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;

    const int first_row = (r0 * nsg + sgitg) * nr0;

    const uint i12 = im%args.ne12;
    const uint i13 = im/args.ne12;

    const uint64_t offset1 = r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const float * y = (device const float *) (src1 + offset1);

    // pointers to src0 rows
    device const block_mxfp4 * ax[nr0];
    for (int row = 0; row < nr0; ++row) {
        const uint64_t offset0 = (first_row + row)*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;

        ax[row] = (device const block_mxfp4 *) ((device char *) src0 + offset0);
    }

    float sumf[nr0] = {0.f};

    const short ix = (tiisg/2);    // first block handled by this thread
    const short il = (tiisg%2)*16; // element offset inside the block: 0 or 16

    device const float * yb = y + ix*MXFP4 + il;

    // each thread in a SIMD group deals with half a block.
    for (int ib = ix; ib < nb; ib += nw/2) {
#pragma unroll
        for (short row = 0; row < nr0; row++) {
            // Processes 16 items
            device const block_mxfp4 * qb_curr = ax[row] + ib;

            // the shared scale byte goes straight into the float32 exponent field
            float d = as_type<float>(((uint32_t)qb_curr->d) << 23);

            // il = 0 or 16 -> first or second 8 bytes of packed elements
            device const uint8_t * qs = qb_curr->qs + il/2;

            for (int i = 0; i < 8; ++i) {
                // exponent+mantissa bits of the low- and high-nibble elements
                ushort em0 = qs[i] & 0x07;
                ushort em1 = qs[i] & 0x70;

                // float16 bit patterns: exponent/mantissa moved into place, sign to bit 15
                ushort x0 = (em0 << (dst_m_bits - 1)) | ((qs[i] & 0x08) << 12);
                ushort x1 = (em1 << (dst_m_bits - 5)) | ((qs[i] & 0x80) << 8);

                // Three cases:
                // x is normal and non-zero: Correct bias
                if ((em0 & 0x06) != 0) {
                    x0 = x0 + ((dst_bias - 1) << dst_m_bits);
                }
                if ((em1 & 0x60) != 0) {
                    x1 = x1 + ((dst_bias - 1) << dst_m_bits);
                }
                // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
                if (em0 == 0x01) {
                    x0 = dst_0p5 | (x0 & 0x8000);
                }
                if (em1 == 0x10) {
                    x1 = dst_0p5 | (x1 & 0x8000);
                }
                // x is zero, do nothing

                if (!isnan(d)) {
                    sumf[row] += yb[i*2]   * as_type<half>(x0) * d
                               + yb[i*2+1] * as_type<half>(x1) * d;
                } else {
                    sumf[row] = d;
                }
            }
        }

        // advance to the same half of the block nw/2 (= 16) blocks further on
        yb += MXFP4 * 16;
    }

    device float * dst_f32 = (device float *) dst + im*args.ne0*args.ne1 + r1*args.ne0;

    for (int row = 0; row < nr0; ++row) {
        const float tot = simd_sum(sumf[row]);

        // only lane 0 writes, and only for rows that exist in src0
        if (tiisg == 0 && first_row + row < args.ne01) {
            dst_f32[first_row + row] = tot;
        }
    }
}
// Kernel entry point for the MXFP4 x F32 matrix-vector product; it forwards all
// arguments unchanged to mul_mv_mxfp4_f32_impl.
[[host_name("kernel_mul_mv_mxfp4_f32")]]
kernel void kernel_mul_mv_mxfp4_f32(
constant ggml_metal_kargs_mul_mv & args,
device const char * src0,
device const char * src1,
device char * dst,
threadgroup char * shmem [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
ushort tiisg[[thread_index_in_simdgroup]],
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
mul_mv_mxfp4_f32_impl(args, src0, src1, dst, shmem, tgpig, tiisg, sgitg);
}
typedef decltype(kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>) kernel_mul_mv_id_t;
template [[host_name("kernel_mul_mv_id_f32_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_impl<float, float4, float, float4>>>;
@@ -6987,6 +7148,8 @@ template [[host_name("kernel_mul_mv_id_iq2_s_f32")]] kernel kernel_mul_mv_id_t
template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl <N_R0_IQ4_NL, N_SG_IQ4_NL, N_SIMDWIDTH>>>;
template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl <N_R0_IQ4_XS, N_SG_IQ4_XS, N_SIMDWIDTH>>>;
template [[host_name("kernel_mul_mv_id_mxfp4_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_mv_mxfp4_f32_impl>>;
kernel void kernel_pool_2d_max_f32(
device const float * src0,
device float * dst,

View File

@@ -4925,6 +4925,144 @@ void quantize_row_iq2_s_ref(const float * GGML_RESTRICT x, block_iq2_s * GGML_RE
quantize_iq2_s(x, y, 1, k, NULL);
}
// =============================== mxfp4 (de)-quantization
// Encodes one float, already divided by the block scale, as a 4-bit E2M1 code:
// bit 3 is the sign, bits 2..1 the exponent (bias 1) and bit 0 the mantissa.
// Rounds to nearest with ties away from zero and saturates at the largest
// magnitude code (0x7).
static inline uint8_t mxfp4_encode_e2m1(float v) {
    static const uint32_t E8_BIAS = 127;
    static const uint32_t E2_BIAS = 1;

    uint32_t bits = 0;
    memcpy(&bits, &v, sizeof(bits));

    const uint32_t sign = bits & 0x80000000;
    uint32_t e = (bits >> 23) & 0xFF;
    uint32_t m = bits & 0x7FFFFF;

    // 0.25 <= x < 0.75 maps to 0.5, a denormal number
    // Move implicit bit 1 at the beginning to mantissa for denormals
    if (e < E8_BIAS) {
        const uint32_t shift = E8_BIAS - (e + 1); // 0..126
        // The shifted value has only 23 significant bits, so any shift of 23 or
        // more is zero. Performing it literally would be undefined behaviour
        // once the count reaches the operand width (32), and on hardware that
        // masks the count it would turn tiny inputs into non-zero codes.
        m = shift < 23 ? (0x400000 | (m >> 1)) >> shift : 0;
    }

    // For normal numbers, we change the bias from 127 to 1, and for subnormals, we keep exponent as 0.
    e = MAX(e, E8_BIAS - E2_BIAS) - (E8_BIAS - E2_BIAS);

    // Combine exponent and mantissa, while saturating
    // rounding nearest with tie breaking up by adding +1 to one bit right of the LSB, then shift right
    const uint32_t mag = MIN((((e << 2) | (m >> 21)) + 1) >> 1, 0x7);

    return (uint8_t)((sign >> 28) | mag);
}

// Reference quantizer: converts k floats (k must be a multiple of MXFP4) into
// k/MXFP4 blocks. Each block stores a power-of-two scale (float32 exponent
// byte) chosen from the block's absolute maximum, plus 32 packed E2M1 codes
// with the even-indexed element of each pair in the low nibble.
void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k) {
    static const int qk = MXFP4;

    assert(k % qk == 0);

    const int nb = k / qk;

    for (int i = 0; i < nb; i++) {
        float amax = 0.0f; // absolute max

        for (int j = 0; j < qk; j++) {
            const float v = x[i*qk + j];
            if (amax < fabsf(v)) {
                amax = fabsf(v);
            }
        }

        // 6.0 is the largest magnitude the element format can represent
        const float dequant_scale = amax / 6.0f;
        uint32_t dequant_scale_exponent = 0;
        memcpy(&dequant_scale_exponent, &dequant_scale, sizeof(dequant_scale_exponent));

        // Rounding up to the next power of two (clears the mantissa)
        dequant_scale_exponent = (dequant_scale_exponent + 0x007FFFFF) & 0x7F800000;
        // Rounding down
        // dequant_scale_exponent = dequant_scale_exponent & 0x7F800000;

        float dequant_scale_rounded = 0.0f;
        memcpy(&dequant_scale_rounded, &dequant_scale_exponent, sizeof(dequant_scale_rounded));

        // An all-zero block has a zero scale; keep the multiplier at 0 so every
        // element encodes as zero instead of dividing by zero.
        float quant_scale = 0.0f;
        if (dequant_scale_rounded != 0.0f) {
            quant_scale = 1.0f / dequant_scale_rounded;
        }

        y[i].d = (uint8_t)(dequant_scale_exponent >> 23);

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t v0 = mxfp4_encode_e2m1(x[i*qk + j*2    ]*quant_scale);
            const uint8_t v1 = mxfp4_encode_e2m1(x[i*qk + j*2 + 1]*quant_scale);

            y[i].qs[j] = (uint8_t)(v0 | (v1 << 4));
        }
    }
}
// Expands k/MXFP4 MXFP4 blocks into k floats (k must be a multiple of MXFP4).
// Each 4-bit E2M1 element is first widened to a float16 bit pattern and then
// multiplied by the block's power-of-two scale. A block whose scale byte is
// the E8M0 NaN code (0xFF) dequantizes to all NaN.
void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % MXFP4 == 0);

    const int nb = k / MXFP4;

    // float16 encoding parameters: exponent bias, bit pattern of 0.5, mantissa bits
    const uint16_t dst_bias   = 15;
    const uint16_t dst_0p5    = 0x3800;
    const uint16_t dst_m_bits = 10;

    for (int i = 0; i < nb; i++) {
        // 0xFF is the NaN encoding of an E8M0 scale. Shifting it into the
        // float32 exponent field would produce +Inf (0x7F800000), which
        // isnan() does not detect, so test the raw byte instead.
        if (x[i].d == 0xFF) {
            for (int j = 0; j < MXFP4; ++j) {
                y[i*MXFP4 + j] = NAN;
            }
            continue;
        }

        // the scale byte is the float32 exponent field of a power of two
        union {
            uint32_t as_bits;
            float as_value;
        } scale;
        scale.as_bits = (((uint32_t)x[i].d) << 23);

        for (int j = 0; j < MXFP4/2; ++j) {
            // exponent+mantissa bits of the low- and high-nibble elements
            uint16_t em0 = x[i].qs[j] & 0x07;
            uint16_t em1 = x[i].qs[j] & 0x70;

            // float16 values: exponent/mantissa moved into place, sign to bit 15
            uint16_t x0 = (em0 << (dst_m_bits - 1)) | ((x[i].qs[j] & 0x08) << 12);
            uint16_t x1 = (em1 << (dst_m_bits - 5)) | ((x[i].qs[j] & 0x80) << 8);

            // Three cases:
            // x is normal and non-zero: Correct bias
            if ((em0 & 0x06) != 0) {
                x0 = x0 + ((dst_bias - 1) << dst_m_bits);
            }
            if ((em1 & 0x60) != 0) {
                x1 = x1 + ((dst_bias - 1) << dst_m_bits);
            }
            // x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
            if (em0 == 0x01) {
                x0 = dst_0p5 | (x0 & 0x8000);
            }
            if (em1 == 0x10) {
                x1 = dst_0p5 | (x1 & 0x8000);
            }
            // x is zero, do nothing

            y[i*MXFP4 + j*2]   = GGML_FP16_TO_FP32(x0)*scale.as_value;
            y[i*MXFP4 + j*2+1] = GGML_FP16_TO_FP32(x1)*scale.as_value;
        }
    }
}
// Quantizes nrow rows of n_per_row floats from src into MXFP4 blocks at dst and
// returns the number of bytes written. quant_weights is accepted for signature
// compatibility with the other quantize_* entry points and is not used.
size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    (void) quant_weights;

    // rows are quantized back to back as a single run of elements
    const int64_t n_elements = (int64_t) nrow * n_per_row;
    quantize_row_mxfp4_ref(src, dst, n_elements);

    const size_t bytes_per_row = ggml_row_size(GGML_TYPE_MXFP4, n_per_row);
    return nrow * bytes_per_row;
}
// =============================== data validation
static bool validate_float(float f, size_t i) {
@@ -5214,7 +5352,9 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
} break;
case GGML_TYPE_MXFP4:
// TODO - anything to validate?
break;
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:

View File

@@ -37,6 +37,8 @@ GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_
GGML_API void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
GGML_API void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k);
// Dequantization
GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -65,6 +67,8 @@ GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, floa
GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
GGML_API void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
@@ -90,6 +94,8 @@ GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTR
GGML_API size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
GGML_API void iq2xs_init_impl(enum ggml_type type);
GGML_API void iq2xs_free_impl(enum ggml_type type);
GGML_API void iq3xs_init_impl(int grid_size);

View File

@@ -589,11 +589,13 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
.to_float = (ggml_to_float_t) dequantize_row_q4_1,
.from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref,
},
[4] = { // GGML_TYPE_Q4_2
.type_name = "DEPRECATED",
.blck_size = 0,
.type_size = 0,
.is_quantized = false,
[GGML_TYPE_MXFP4] = { // formerly deprecated GGML_TYPE_Q4_2
.type_name = "mxfp4",
.blck_size = MXFP4,
.type_size = sizeof(block_mxfp4),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_mxfp4,
.from_float_ref = (ggml_from_float_t) quantize_row_mxfp4_ref,
},
[5] = { // GGML_TYPE_Q4_3
.type_name = "DEPRECATED",
@@ -6446,6 +6448,7 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_MXFP4: result = quantize_mxfp4 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_F16:
{
size_t elemsize = sizeof(ggml_fp16_t);

View File

@@ -0,0 +1,60 @@
package ggml
import (
"bytes"
"log/slog"
"os"
"slices"
"testing"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/ml"
)
// TestMain installs a stderr logger at the level reported by envconfig as the
// slog default, then runs the package's tests and exits with their status.
func TestMain(m *testing.M) {
	logger := logutil.NewLogger(os.Stderr, envconfig.LogLevel())
	slog.SetDefault(logger)

	code := m.Run()
	os.Exit(code)
}
// setup writes a small GGUF file into the test's temporary directory and loads
// it through New, failing tb on any error along the way.
func setup(tb testing.TB) ml.Backend {
	tb.Helper()

	tmp, err := os.CreateTemp(tb.TempDir(), "*.bin")
	if err != nil {
		tb.Fatal(err)
	}
	defer tmp.Close()

	// One block holding a single 1-element tensor backed by four zero bytes.
	meta := ggml.KV{
		"general.architecture": "test",
		"test.block_count":     uint32(1),
	}
	tensors := []*ggml.Tensor{
		{Name: "blk.0.weight", Shape: []uint64{1}, WriterTo: bytes.NewBuffer(slices.Repeat([]byte{0}, 4))},
	}
	if err := ggml.WriteGGUF(tmp, meta, tensors); err != nil {
		tb.Fatal(err)
	}

	backend, err := New(tmp.Name(), ml.BackendParams{NumGPULayers: 1})
	if err != nil {
		tb.Fatal(err)
	}

	return backend
}
// initContextOrSkip creates a context on b for the current test and registers
// its Close as a test cleanup.
//
// When gpu is true and b has only a single scheduler backend, no GPU is
// considered available and the calling test is skipped; otherwise the context
// for layer 0 is returned. When gpu is false the call always succeeds and
// returns the input context.
//
// b must be a *Backend from this package; any other implementation makes the
// type assertion panic.
func initContextOrSkip(t *testing.T, b ml.Backend, gpu bool) ml.Context {
	// Attribute the skip to the calling test rather than to this helper,
	// matching what setup does with tb.Helper.
	t.Helper()

	if gpu && len(b.(*Backend).schedBackends) == 1 {
		t.Skip("No GPU detected, skipping GPU test case")
	}

	ctx := b.NewContext()
	t.Cleanup(func() { ctx.Close() })

	if gpu {
		return ctx.Layer(0)
	}
	return ctx.Input()
}

View File

@@ -0,0 +1,795 @@
package ggml
import (
"math"
"math/rand"
"os"
"testing"
"github.com/ollama/ollama/ml"
fsggml "github.com/ollama/ollama/fs/ggml"
)
/*
To get GPUs loading in these tests on windows...
$env:OLLAMA_LIBRARY_PATH="$(pwd)\build\lib\ollama"
$env:PATH="$(pwd)\build\lib\ollama;$env:PATH"
go test .\ml\backend\ggml\... -run TestMXFP4
*/
// MXFP4 reference: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
//
// mxfp4_vals lists the E2M1 element values in order of their 4-bit code, so
// mxfp4_vals[code] is the value that code stands for. The per-entry comments
// spell each code out as sign bit, two exponent bits and one mantissa bit,
// followed by the code in hex. Codes 0x0 and 0x8 both map to zero (they differ
// only in the sign bit). Tests in this file pick entries from this table to
// build inputs that consist solely of these values.
var mxfp4_vals = []float32{
0.0, // 0 00 0 = 0x0
0.5, // 0 00 1 = 0x1
1.0, // 0 01 0 = 0x2
1.5, // 0 01 1 = 0x3
2.0, // 0 10 0 = 0x4
3.0, // 0 10 1 = 0x5
4.0, // 0 11 0 = 0x6
6.0, // 0 11 1 = 0x7
0.0, // 1 00 0 = 0x8
-0.5, // 1 00 1 = 0x9
-1.0, // 1 01 0 = 0xa
-1.5, // 1 01 1 = 0xb
-2.0, // 1 10 0 = 0xc
-3.0, // 1 10 1 = 0xd
-4.0, // 1 11 0 = 0xe
-6.0, // 1 11 1 = 0xf
}
func TestMXFP4Ops(t *testing.T) {
b := setup(t)
for _, useGPU := range []bool{false, true} {
useGPU := useGPU
var label string
if useGPU {
label = "gpu"
} else {
label = "cpu"
}
t.Run(label, func(t *testing.T) {
t.Run("mulmatid", func(t *testing.T) {
// Use exact values that are supported without scaling so we can compare against an fp32 tensor
t.Run("exact", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
const s00 = 64
const s01 = 1
const s02 = 2
const s10 = s00
const s11 = 1
const s12 = 1
// const s00 = 2880
// const s01 = 5760
// const s02 = 32
// const s10 = s00
// const s11 = 1
// const s12 = 64
data := [s00 * s01 * s02]float32{}
for i := range data {
data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
}
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
dtype := ml.DTypeMXFP4
t1 := ctx.(*Context).FromBytes(dtype, mxData, s00, s01, s02)
t1f := ctx.(*Context).FromFloatSlice(data[:], s00, s01, s02)
// for i := range len(data) / 32 { // MXFP4 block size
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
// }
// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
// }
// random 0-1 float
d2 := [s10 * s11 * s12]float32{}
for i := range d2 {
d2[i] = float32(r.Float32())
}
// for i := range len(d2) / s10 {
// vals := [s10]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", d2[i*s10+j])
// }
// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
// }
t2 := ctx.(*Context).FromFloatSlice(d2[:], s10, s11, s12)
d3 := [4 * s12]int32{}
for i := range d3 {
d3[i] = int32(i) % s02
}
t3 := ctx.(*Context).FromIntSlice(d3[:], 4, s12)
// t.Log("calling MulmatID")
t4 := t1.MulmatID(ctx, t2, t3)
t4f := t1f.MulmatID(ctx, t2, t3)
d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(2)) // lower precision for CPU accuracy
d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(2))
if d4 != d4f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
}
// t.Logf("MulmatID results matched:\n%s", d4)
})
t.Run("range", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
const s0 = 64
const s1 = 2
const s2 = 4
const idlen = 4
data := [s0 * s1 * s2]float32{}
inTotal := float32(0)
for i := range data {
data[i] = float32(i)
inTotal += float32(i)
}
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
// Reconvert back to floats to remove the quantization fidelity loss for comparison
dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
dtype := ml.DTypeMXFP4
t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1, s2)
t1f := ctx.(*Context).FromFloatSlice(dataf, s0, s1, s2)
// for i := range len(data) / 32 {
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", dataf[i*32+j])
// }
// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
// }
d2 := [s0]float32{}
for i := range d2 {
// d2[i] = float32(i)
d2[i] = float32(r.Float32())
}
// for i := range len(d2) / s0 {
// vals := [s0]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", d2[i*s0+j])
// }
// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
// }
t2 := ctx.(*Context).FromFloatSlice(d2[:], s0)
// TODO - there might be a CUDA bug here...
d3 := [idlen]int32{1, 1, 2, 3}
// for i := range d3 {
// d3[i] = int32(i) % s2
// t.Logf("%d] %d", i, d3[i])
// }
t3 := ctx.(*Context).FromIntSlice(d3[:], idlen)
// t.Log("calling Mulmat")
t4 := t1.MulmatID(ctx, t2, t3)
t4f := t1f.MulmatID(ctx, t2, t3)
// Metal has some drift so use reduced precision for dump comparisons
d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(2))
d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(2))
r4 := t4.Floats()
r4f := t4f.Floats()
sim := cosineSimilarity(r4, r4f)
if sim < 0.99 {
t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
t.Fatalf("failed similarity test: %f", sim)
}
t.Logf("similarity: %f", sim)
if d4 != d4f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
}
// t.Logf("mxfp4 result\n%s", d4)
})
t.Run("random", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
const s00 = 2880
const s01 = 5760
const s02 = 32
const s10 = s00
const s11 = 1
const s12 = 64
const idlen = 4
data := [s00 * s01 * s02]float32{}
for i := range data {
data[i] = float32(r.Float32() * 10.0)
}
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
// Reconvert back to floats to remove the quantization fidelity loss for comparison
dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
dtype := ml.DTypeMXFP4
t1 := ctx.(*Context).FromBytes(dtype, mxData, s00, s01, s02)
t1f := ctx.(*Context).FromFloatSlice(dataf, s00, s01, s02)
// for i := range len(data) / 32 {
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", dataf[i*32+j])
// }
// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
// }
d2 := [s10 * s11 * s12]float32{}
for i := range d2 {
// d2[i] = float32(i)
d2[i] = float32(r.Float32())
}
// for i := range len(d2) / s0 {
// vals := [s0]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", d2[i*s0+j])
// }
// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
// }
t2 := ctx.(*Context).FromFloatSlice(d2[:], s10, s11, s12)
// arange equiv
d3 := [idlen * s12]int32{}
for i := range d3 {
d3[i] = int32(i) % s02
}
t3 := ctx.(*Context).FromIntSlice(d3[:], idlen, s12)
// t.Log("calling Mulmat")
// t3 := t1.Mulmat(ctx, t2)
// t3f := t1f.Mulmat(ctx, t2)
t4 := t1.MulmatID(ctx, t2, t3)
t4f := t1f.MulmatID(ctx, t2, t3)
// Metal and CPU have some drift so use reduced precision for dump comparisons
d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(1))
d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(1))
// t.Logf("mxfp4 data: \n%s", d4)
r4 := t4.Floats()
r4f := t4f.Floats()
sim := cosineSimilarity(r4, r4f)
if sim < 0.99 {
t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
t.Fatalf("failed similarity test: %f", sim)
}
t.Logf("similarity: %f", sim)
if d4 != d4f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
}
})
// Use data file(s) with real data
t.Run("example_7", func(t *testing.T) {
	ctx := initContextOrSkip(t, b, useGPU)

	// The fixtures are optional local files. Each skip message names the
	// file that was actually requested and carries the underlying error, so
	// a permissions problem is not mistaken for a missing file.
	data0, err := os.ReadFile("mlp-gateup.bin")
	if err != nil {
		t.Skipf("missing mlp-gateup.bin file, skipping test: %v", err)
	}
	data1, err := os.ReadFile("hidden-states-7.bin")
	if err != nil {
		t.Skipf("missing hidden-states-7.bin file, skipping test: %v", err)
	}
	data2, err := os.ReadFile("selected-experts-7.bin")
	if err != nil {
		t.Skipf("missing selected-experts-7.bin file, skipping test: %v", err)
	}

	dtype := ml.DTypeMXFP4
	// f32 copy of the same weights, so both paths see identical values and
	// only the mxfp4 kernels differ.
	data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
	t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
	t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32)
	// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))
	t2 := ctx.(*Context).FromBytes(ml.DTypeF32, data1, 2880, 1, 7)
	// t.Logf("hidden-state: \n%s", ml.Dump(ctx, t2))
	t3 := ctx.(*Context).FromBytes(ml.DTypeI32, data2, 4, 7)
	// t.Logf("experts: \n%s", ml.Dump(ctx, t3))

	// t.Log("calling MulmatID")
	t4 := t1.MulmatID(ctx, t2, t3)
	t4f := t1f.MulmatID(ctx, t2, t3)
	d4 := ml.Dump(ctx, t4)
	d4f := ml.Dump(ctx, t4f)

	// Coarse check first (cosine similarity), then the exact dump comparison.
	r4 := t4.Floats()
	r4f := t4f.Floats()
	sim := cosineSimilarity(r4, r4f)
	if sim < 0.99 {
		t.Fatalf("failed similarity test: %f", sim)
	}
	t.Logf("similarity: %f", sim)

	if d4 != d4f {
		t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
	}
	// t.Logf("MulmatID results matched:\n%s", d4)
})
// Use data file(s) with real data
t.Run("example_384", func(t *testing.T) {
	ctx := initContextOrSkip(t, b, useGPU)

	// The fixtures are optional local files. Each skip message names the
	// file that was actually requested and carries the underlying error, so
	// a permissions problem is not mistaken for a missing file.
	data0, err := os.ReadFile("mlp-gateup.bin")
	if err != nil {
		t.Skipf("missing mlp-gateup.bin file, skipping test: %v", err)
	}
	data1, err := os.ReadFile("hidden-states-384.bin")
	if err != nil {
		t.Skipf("missing hidden-states-384.bin file, skipping test: %v", err)
	}
	data2, err := os.ReadFile("selected-experts-384.bin")
	if err != nil {
		t.Skipf("missing selected-experts-384.bin file, skipping test: %v", err)
	}

	dtype := ml.DTypeMXFP4
	// f32 copy of the same weights, so both paths see identical values and
	// only the mxfp4 kernels differ.
	data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
	t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
	t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32)
	// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))
	t2 := ctx.(*Context).FromBytes(ml.DTypeF32, data1, 2880, 1, 384)
	// t.Logf("hidden-state: \n%s", ml.Dump(ctx, t2))
	t3 := ctx.(*Context).FromBytes(ml.DTypeI32, data2, 4, 384)
	// t.Logf("experts: \n%s", ml.Dump(ctx, t3))

	// t.Log("calling MulmatID")
	t4 := t1.MulmatID(ctx, t2, t3)
	t4f := t1f.MulmatID(ctx, t2, t3)
	d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(3))
	d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(3))

	// Coarse check first (cosine similarity), then the reduced-precision
	// dump comparison.
	r4 := t4.Floats()
	r4f := t4f.Floats()
	sim := cosineSimilarity(r4, r4f)
	if sim < 0.99 {
		t.Fatalf("failed similarity test: %f", sim)
	}
	t.Logf("similarity: %f", sim)

	if d4 != d4f {
		t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
	}
	// t.Logf("MulmatID results matched:\n%s", d4)
})
// Use data file(s) with real data
t.Run("example_1d", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
data0, err := os.ReadFile("mlp-gateup.bin")
if err != nil {
t.Skip("missing mlp-gateup.bin file, skipping test")
}
dtype := ml.DTypeMXFP4
data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32)
// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))
data1 := [2880]float32{}
for i := range data1 {
data1[i] = float32(r.Float32())
}
t2 := ctx.(*Context).FromFloatSlice(data1[:], 2880)
// t.Logf("hidden-state: \n%s", ml.Dump(ctx, t2))
data2 := [4]int32{
12, 30, 17, 7,
// 7, 17, 12, 30,
}
t3 := ctx.(*Context).FromIntSlice(data2[:], 4)
// t.Logf("experts: \n%s", ml.Dump(ctx, t3))
// t.Log("calling MulmatID")
t4 := t1.MulmatID(ctx, t2, t3)
t4f := t1f.MulmatID(ctx, t2, t3)
d4 := ml.Dump(ctx, t4)
d4f := ml.Dump(ctx, t4f)
r4 := t4.Floats()
r4f := t4f.Floats()
sim := cosineSimilarity(r4, r4f)
if sim < 0.99 {
t.Fatalf("failed similarity test: %f", sim)
}
t.Logf("similarity: %f", sim)
if d4 != d4f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
}
// t.Logf("MulmatID results matched:\n%s", d4)
})
})
t.Run("mm", func(t *testing.T) {
t.Run("example", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
data0, err := os.ReadFile("mlp-gateup.bin")
if err != nil {
t.Skip("missing mlp-gateup.bin file, skipping test")
}
data1 := [2880 * 1 * 32]float32{}
for i := range data1 {
data1[i] = float32(r.Float32())
}
dtype := ml.DTypeMXFP4
data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32)
// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))
t2 := ctx.(*Context).FromFloatSlice(data1[:], 2880, 1, 32)
t4 := t1.Mulmat(ctx, t2)
t4f := t1f.Mulmat(ctx, t2)
d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(3))
d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(3))
r4 := t4.Floats()
r4f := t4f.Floats()
sim := cosineSimilarity(r4, r4f)
if sim < 0.99 {
t.Fatalf("failed similarity test: %f", sim)
}
t.Logf("similarity: %f", sim)
if d4 != d4f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
}
// t.Logf("Mulmat results matched:\n%s", d4)
})
t.Run("exact/3x3", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
const s10 = 64
const s11 = 1
const s12 = 2
const s20 = s10
const s21 = 1
const s22 = 2
data := [s10 * s11 * s12]float32{}
for i := range data {
data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
}
// for i := range len(data) / 32 {
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
// }
// t.Logf(" [%s]\n", strings.Join(vals[:], ", "))
// }
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
// for i := range len(mxData) / 17 {
// vals := [17]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2x", mxData[i*17+j])
// }
// t.Logf(" %s\n", strings.Join(vals[:], ", "))
// }
dtype := ml.DTypeMXFP4
t1 := ctx.(*Context).FromBytes(dtype, mxData, s10, s11, s12)
t1f := ctx.(*Context).FromFloatSlice(data[:], s10, s11, s12)
d2 := [s20 * s21 * s22]float32{}
for i := range d2 {
d2[i] = float32(r.Float32())
}
t2 := ctx.(*Context).FromFloatSlice(d2[:], s20, s21, s22)
t3f := t1f.Mulmat(ctx, t2)
t3 := t1.Mulmat(ctx, t2)
d3 := ml.Dump(ctx, t3)
d3f := ml.Dump(ctx, t3f)
if d3 != d3f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
}
})
t.Run("exact/2x2", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
const s0 = 32
const s1 = 64
data := [s0 * s1]float32{}
for i := range data {
data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
}
// for i := range 4 {
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
// }
// t.Logf(" [%s]\n", strings.Join(vals[:], ", "))
// }
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
// for i := range len(mxData) / 17 {
// vals := [17]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2x", mxData[i*17+j])
// }
// t.Logf(" %s\n", strings.Join(vals[:], ", "))
// }
dtype := ml.DTypeMXFP4
t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1)
t1f := ctx.(*Context).FromFloatSlice(data[:], s0, s1)
d2 := [s0 * s1]float32{}
for i := range d2 {
d2[i] = float32(r.Float32())
}
t2 := ctx.(*Context).FromFloatSlice(d2[:], s0, s1)
t3f := t1f.Mulmat(ctx, t2)
t3 := t1.Mulmat(ctx, t2)
d3 := ml.Dump(ctx, t3)
d3f := ml.Dump(ctx, t3f)
if d3 != d3f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
}
})
t.Run("exact/2x1", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
const s0 = 64
const s1 = 4
data := [s0 * s1]float32{}
for i := range data {
data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
}
// for i := range len(data) / 32 {
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
// }
// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
// }
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
// for i := range len(mxData) / 17 {
// vals := [17]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2x", mxData[i*17+j])
// }
// t.Logf(" %s\n", strings.Join(vals[:], ", "))
// }
dtype := ml.DTypeMXFP4
t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1)
t1f := ctx.(*Context).FromFloatSlice(data[:], s0, s1)
d2 := [s0]float32{}
for i := range d2 {
d2[i] = float32(r.Float32())
}
// for i := range len(d2) / 32 {
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", d2[i*32+j])
// }
// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
// }
t2 := ctx.(*Context).FromFloatSlice(d2[:], s0)
t3f := t1f.Mulmat(ctx, t2)
t3 := t1.Mulmat(ctx, t2)
d3 := ml.Dump(ctx, t3, ml.DumpWithPrecision(3))
d3f := ml.Dump(ctx, t3f, ml.DumpWithPrecision(3))
if d3 != d3f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
}
})
t.Run("range/2d", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
const s0 = 32
const s1 = 4
data := [s0 * s1]float32{}
inTotal := float32(0)
for i := range data {
data[i] = float32(i)
inTotal += float32(i)
}
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
// Reconvert back to floats to remove the quantization fidelity loss for comparison
dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
dtype := ml.DTypeMXFP4
t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1)
t1f := ctx.(*Context).FromFloatSlice(dataf, s0, s1)
// for i := range len(data) / 32 {
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", dataf[i*32+j])
// }
// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
// }
d2 := [s0 * s1]float32{}
for i := range d2 {
// d2[i] = float32(i)
d2[i] = float32(r.Float32())
}
// for i := range len(d2) / s0 {
// vals := [s0]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", d2[i*s0+j])
// }
// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
// }
t2 := ctx.(*Context).FromFloatSlice(d2[:], s0, s1)
// t.Log("calling Mulmat")
t3 := t1.Mulmat(ctx, t2)
t3f := t1f.Mulmat(ctx, t2)
d3 := ml.Dump(ctx, t3, ml.DumpWithPrecision(2))
d3f := ml.Dump(ctx, t3f, ml.DumpWithPrecision(2))
r3 := t3.Floats()
r3f := t3f.Floats()
sim := cosineSimilarity(r3, r3f)
if sim < 0.99 {
t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
t.Fatalf("failed similarity test: %f", sim)
}
t.Logf("similarity: %f", sim)
if d3 != d3f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
}
})
t.Run("range/3d", func(t *testing.T) {
	ctx := initContextOrSkip(t, b, useGPU)

	// Weights are the ramp 0, 1, 2, ... so the values span a wide range and
	// cannot all be represented exactly after quantization.
	data := [32 * 4 * 2]float32{}
	for i := range data {
		data[i] = float32(i)
	}
	mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
	dtype := ml.DTypeMXFP4
	// Reconvert back to floats to remove the quantization fidelity loss for comparison
	dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
	t1 := ctx.(*Context).FromBytes(dtype, mxData, 32, 4, 2)
	t1f := ctx.(*Context).FromFloatSlice(dataf, 32, 4, 2)

	// Second operand is a constant 2.0 everywhere.
	d2 := [32 * 4 * 2]float32{}
	for i := range d2 {
		d2[i] = 2.0
	}
	t2 := ctx.(*Context).FromFloatSlice(d2[:], 32, 4, 2)

	// t.Log("calling Mulmat")
	t3 := t1.Mulmat(ctx, t2)
	t3f := t1f.Mulmat(ctx, t2)
	d3 := ml.Dump(ctx, t3)
	d3f := ml.Dump(ctx, t3f)

	// Coarse check first (cosine similarity), then the exact dump comparison.
	r3 := t3.Floats()
	r3f := t3f.Floats()
	sim := cosineSimilarity(r3, r3f)
	if sim < 0.99 {
		t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
		t.Fatalf("failed similarity test: %f", sim)
	}
	t.Logf("similarity: %f", sim)

	if d3 != d3f {
		t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
	}
})
})
})
}
}
// TestMXFP4Simple multiplies a small constant mxfp4 tensor by a fixed f32
// tensor on the CPU path and requires the dumped result to match the same
// multiplication done with the dequantized f32 weights.
func TestMXFP4Simple(t *testing.T) {
	backend := setup(t)

	t.Run("fixed", func(t *testing.T) {
		ctx := initContextOrSkip(t, backend, false)
		gctx := ctx.(*Context)

		weights := [32 * 2]float32{
			2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
			2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
		}
		count := uint64(len(weights))
		packed := Quantize(fsggml.TensorTypeMXFP4, weights[:], []uint64{count})

		// Round-trip through the quantizer so the f32 reference carries the
		// same values the mxfp4 tensor does.
		unpacked := ConvertToF32(packed, uint32(fsggml.TensorTypeMXFP4), count)

		quantT := gctx.FromBytes(ml.DTypeMXFP4, packed, 32, 2)
		floatT := gctx.FromFloatSlice(unpacked, 32, 2)

		input := [32 * 2]float32{
			1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		}
		inputT := gctx.FromFloatSlice(input[:], 32, 2)

		t.Log("calling Mulmat")
		wantT := floatT.Mulmat(ctx, inputT)
		gotT := quantT.Mulmat(ctx, inputT)

		got := ml.Dump(ctx, gotT)
		want := ml.Dump(ctx, wantT)
		if got != want {
			t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", want, got)
		}
		t.Logf("result (mxfp4): \n%s", got)
	})
}
// TestMXFP4Conversion round-trips float data through Quantize and
// ConvertToF32. Inputs made only of table values must come back unchanged;
// an increasing ramp only has to stay close by cosine similarity.
func TestMXFP4Conversion(t *testing.T) {
	t.Run("quantize/exact", func(t *testing.T) {
		rng := rand.New(rand.NewSource(0))

		data := [32 * 4]float32{}
		for i := range data {
			pick := rng.Int() % len(mxfp4_vals)
			data[i] = mxfp4_vals[pick]
		}

		packed := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
		newData := ConvertToF32(packed, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
		if len(newData) != len(data) {
			t.Fatalf("length mismatch. started with %d but got %d", len(data), len(newData))
		}

		for i, want := range data {
			got := newData[i]
			if want == got {
				continue
			}
			t.Logf("started with: %v", data)
			t.Logf("got : %v", newData)
			t.Fatalf("mismatched data starting at offset %d started with %f but got %f", i, want, got)
		}
	})

	t.Run("quantize/arange", func(t *testing.T) {
		data := [32 * 8]float32{}
		for i := range data {
			data[i] = float32(i)
		}

		packed := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
		newData := ConvertToF32(packed, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
		if len(newData) != len(data) {
			t.Fatalf("length mismatch. started with %d but got %d", len(data), len(newData))
		}

		sim := cosineSimilarity(data[:], newData)
		if sim < 0.99 {
			t.Fatalf("failed similarity test: %f", sim)
		}
		t.Logf("similarity: %f", sim)
	})
}
// dotProduct returns the sum of v1[i]*v2[i] over the indices of v1.
// v2 must be at least as long as v1; elements of v2 past len(v1) are ignored.
func dotProduct[V float32 | float64](v1, v2 []V) V {
	var sum V
	for i, a := range v1 {
		sum += a * v2[i]
	}
	return sum
}

// magnitude returns the Euclidean norm of v. The squares are accumulated in V
// and only the square root is taken in float64.
func magnitude[V float32 | float64](v []V) V {
	var sumSq V
	for _, x := range v {
		sumSq += x * x
	}
	return V(math.Sqrt(float64(sumSq)))
}

// cosineSimilarity returns the cosine of the angle between v1 and v2.
//
// When the similarity is undefined (either vector has zero magnitude, or the
// inputs contain NaN) it returns 0 rather than NaN. Callers gate on
// "sim < 0.99", and every comparison with NaN is false, so a NaN result would
// let a degenerate output slip through that check.
func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
	dot := dotProduct(v1, v2)
	denom := magnitude(v1) * magnitude(v2)
	if denom == 0 {
		return 0
	}

	sim := dot / denom
	if sim != sim { // true only for NaN
		return 0
	}
	return sim
}

View File

@@ -44,6 +44,8 @@ func ConvertToF32(data []byte, dtype uint32, nelements uint64) []float32 {
C.dequantize_row_q6_K((*C.block_q6_K)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_BF16:
C.ggml_bf16_to_fp32_row((*C.ggml_bf16_t)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
case C.GGML_TYPE_MXFP4:
C.dequantize_row_mxfp4((*C.block_mxfp4)(unsafe.Pointer(&data[0])), (*C.float)(&f32s[0]), elems)
default:
panic("unsupported quantization format")
}

View File

@@ -15,3 +15,26 @@ func (m *Linear) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
return t
}
// LinearBatch holds a weight tensor and an optional bias tensor, populated
// from the "weight" and "bias" gguf entries named in the field tags. Its
// Forward method applies them through MulmatID and Rows, using a tensor of
// indices to choose what is applied.
type LinearBatch struct {
Weight ml.Tensor `gguf:"weight"`
// Bias may be nil; Forward only adds it when it is set.
Bias ml.Tensor `gguf:"bias"`
}
// Forward runs Weight.MulmatID on t with the given indices and, when a bias is
// present, adds the bias rows picked out by those same indices.
func (m *LinearBatch) Forward(ctx ml.Context, t, indices ml.Tensor) ml.Tensor {
	out := m.Weight.MulmatID(ctx, t, indices)
	if m.Bias == nil {
		return out
	}

	if len(indices.Shape()) <= 1 {
		return out.Add(ctx, m.Bias.Rows(ctx, indices))
	}

	// FIXME: Rows does not support 2D indices for a 2D input tensor so reshape indices to 1D.
	flat := indices.Contiguous(ctx, indices.Dim(0)*indices.Dim(1))
	rows := m.Bias.Rows(ctx, flat)
	// Restore the two index dimensions on the gathered rows before adding.
	bias := rows.
		Duplicate(ctx).
		Reshape(ctx, m.Bias.Dim(0), indices.Dim(0), indices.Dim(1))

	return out.Add(ctx, bias)
}

View File

@@ -4,9 +4,15 @@ import "github.com/ollama/ollama/ml"
// Options contains optional parameters for RoPE function
type Options struct {
OriginalContextLength int
Type int
Factors ml.Tensor
OriginalContextLength int
// YaRN options
ExtrapolationFactor,
AttentionFactor,
BetaFast,
BetaSlow float32
}
// WithOriginalContextLength sets a custom context length
@@ -31,3 +37,15 @@ func WithFactors(factors ml.Tensor) func(*Options) {
}
}
}
// WithExtrapolationFactor returns an option that stores extrapolationFactor in
// Options.ExtrapolationFactor, one of the YaRN settings.
func WithExtrapolationFactor(extrapolationFactor float32) func(*Options) {
	apply := func(o *Options) {
		o.ExtrapolationFactor = extrapolationFactor
	}
	return apply
}
// WithAttentionFactor returns an option that stores attentionFactor in
// Options.AttentionFactor, one of the YaRN settings.
func WithAttentionFactor(attentionFactor float32) func(*Options) {
	apply := func(o *Options) {
		o.AttentionFactor = attentionFactor
	}
	return apply
}