Compare commits
1 Commits
v0.5.3
...
jmorganca/
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
80d41b579b |
@@ -8,6 +8,8 @@ linters:
|
||||
- containedctx
|
||||
- contextcheck
|
||||
- errcheck
|
||||
- exportloopref
|
||||
- gci
|
||||
- gocheckcompilerdirectives
|
||||
- gofmt
|
||||
- gofumpt
|
||||
@@ -28,6 +30,8 @@ linters:
|
||||
- wastedassign
|
||||
- whitespace
|
||||
linters-settings:
|
||||
gci:
|
||||
sections: [standard, default, localmodule]
|
||||
staticcheck:
|
||||
checks:
|
||||
- all
|
||||
|
||||
2
Makefile
2
Makefile
@@ -8,9 +8,11 @@ include make/cuda-v12-defs.make
|
||||
include make/rocm-defs.make
|
||||
|
||||
ifeq ($(CUSTOM_CPU_FLAGS),)
|
||||
ifneq ($(OS),darwin)
|
||||
ifeq ($(ARCH),amd64)
|
||||
RUNNER_TARGETS=cpu
|
||||
endif
|
||||
endif
|
||||
# Without CUSTOM_CPU_FLAGS we default to build both v11 and v12 if present
|
||||
ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
|
||||
ifneq ($(CUDA_11_COMPILER),)
|
||||
|
||||
@@ -407,8 +407,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||
|
||||
### Database
|
||||
|
||||
- [PostgreSQL extension pgai](https://github.com/timescale/pgai) (Create and search embeddings from Ollama models using pgvector)
|
||||
- [Get started guide](https://github.com/timescale/pgai/blob/main/docs/ollama.md)
|
||||
- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
|
||||
- [chromem-go](https://github.com/philippgille/chromem-go/blob/v0.5.0/embed_ollama.go) with [example](https://github.com/philippgille/chromem-go/tree/v0.5.0/examples/rag-wikipedia-ollama)
|
||||
- [Kangaroo](https://github.com/dbkangaroo/kangaroo) (AI-powered SQL client and admin tool for popular databases)
|
||||
|
||||
147
llama/ggml-metal-embed.metal
vendored
147
llama/ggml-metal-embed.metal
vendored
@@ -2081,6 +2081,7 @@ typedef struct {
|
||||
float attn_factor;
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
int32_t sections[4];
|
||||
} ggml_metal_kargs_rope;
|
||||
|
||||
typedef struct {
|
||||
@@ -4785,8 +4786,148 @@ kernel void kernel_rope_neox(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_rope_multi(
|
||||
constant ggml_metal_kargs_rope & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * src2,
|
||||
device char * dst,
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 tptg [[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||
const int i3 = tgpig[2];
|
||||
const int i2 = tgpig[1];
|
||||
const int i1 = tgpig[0];
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
|
||||
|
||||
device const int32_t * pos = (device const int32_t *) src1;
|
||||
|
||||
int sect_dims = args.sections[0] + args.sections[1] + args.sections[2] + args.sections[3];
|
||||
int sec_w = args.sections[1] + args.sections[0];
|
||||
|
||||
const float inv_ndims = -1.f/args.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
|
||||
if (i0 < args.n_dims) {
|
||||
const int ic = i0/2;
|
||||
const int sector = ic % sect_dims;
|
||||
|
||||
float theta_base = (float) pos[i2];
|
||||
if (sector >= args.sections[0] && sector < sec_w) {
|
||||
theta_base = (float) pos[i2 + args.ne2];
|
||||
}
|
||||
else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
|
||||
theta_base = (float) pos[i2 + args.ne2 * 2];
|
||||
}
|
||||
else if (sector >= sec_w + args.sections[2]) {
|
||||
theta_base = (float) pos[i2 + args.ne2 * 3];
|
||||
}
|
||||
|
||||
float theta = theta_base*pow(args.freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[args.n_dims/2];
|
||||
|
||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
} else {
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
dst_data[0] = src[0];
|
||||
dst_data[1] = src[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_rope_vision(
|
||||
constant ggml_metal_kargs_rope & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * src2,
|
||||
device char * dst,
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 tptg [[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||
const int i3 = tgpig[2];
|
||||
const int i2 = tgpig[1];
|
||||
const int i1 = tgpig[0];
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
|
||||
|
||||
device const int32_t * pos = (device const int32_t *) src1;
|
||||
|
||||
int sect_dims = args.sections[0] + args.sections[1];
|
||||
int sec_w = args.sections[1] + args.sections[0];
|
||||
int sec_e = args.sections[2] + sec_w;
|
||||
|
||||
const float inv_ndims = -1.f/args.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
|
||||
const int ic = i0/2;
|
||||
const int sector = ic % sect_dims;
|
||||
|
||||
float theta_base = (float) pos[i2];
|
||||
if (sector >= args.sections[0] && sector < sec_w) {
|
||||
theta_base = (float) pos[i2 + args.ne2];
|
||||
}
|
||||
else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
|
||||
theta_base = (float) pos[i2 + args.ne2 * 2];
|
||||
}
|
||||
else if (sector >= sec_w + args.sections[2]) {
|
||||
theta_base = (float) pos[i2 + args.ne2 * 3];
|
||||
}
|
||||
|
||||
int p = sector;
|
||||
if (sector >= sec_w + args.sections[2]) {
|
||||
p = sector - (sec_w + args.sections[2]);
|
||||
} else if (sector >= sec_w) {
|
||||
p = sector - sec_w;
|
||||
} else if (sector >= args.sections[0]) {
|
||||
p = sector - args.sections[0];
|
||||
}
|
||||
|
||||
const float theta = theta_base*pow(args.freq_base, inv_ndims*2.0f*p);
|
||||
|
||||
const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[args.n_dims];
|
||||
|
||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
|
||||
typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
|
||||
typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t;
|
||||
typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t;
|
||||
|
||||
template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
|
||||
template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
|
||||
@@ -4794,6 +4935,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_
|
||||
template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
|
||||
template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;
|
||||
|
||||
template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>;
|
||||
template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
|
||||
|
||||
template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
|
||||
template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
|
||||
|
||||
typedef void (im2col_t)(
|
||||
device const float * x,
|
||||
device char * dst,
|
||||
|
||||
1
llama/ggml-metal-impl.h
vendored
1
llama/ggml-metal-impl.h
vendored
@@ -169,6 +169,7 @@ typedef struct {
|
||||
float attn_factor;
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
int32_t sections[4];
|
||||
} ggml_metal_kargs_rope;
|
||||
|
||||
typedef struct {
|
||||
|
||||
146
llama/ggml-metal.metal
vendored
146
llama/ggml-metal.metal
vendored
@@ -2594,8 +2594,148 @@ kernel void kernel_rope_neox(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_rope_multi(
|
||||
constant ggml_metal_kargs_rope & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * src2,
|
||||
device char * dst,
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 tptg [[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||
const int i3 = tgpig[2];
|
||||
const int i2 = tgpig[1];
|
||||
const int i1 = tgpig[0];
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
|
||||
|
||||
device const int32_t * pos = (device const int32_t *) src1;
|
||||
|
||||
int sect_dims = args.sections[0] + args.sections[1] + args.sections[2] + args.sections[3];
|
||||
int sec_w = args.sections[1] + args.sections[0];
|
||||
|
||||
const float inv_ndims = -1.f/args.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
|
||||
if (i0 < args.n_dims) {
|
||||
const int ic = i0/2;
|
||||
const int sector = ic % sect_dims;
|
||||
|
||||
float theta_base = (float) pos[i2];
|
||||
if (sector >= args.sections[0] && sector < sec_w) {
|
||||
theta_base = (float) pos[i2 + args.ne2];
|
||||
}
|
||||
else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
|
||||
theta_base = (float) pos[i2 + args.ne2 * 2];
|
||||
}
|
||||
else if (sector >= sec_w + args.sections[2]) {
|
||||
theta_base = (float) pos[i2 + args.ne2 * 3];
|
||||
}
|
||||
|
||||
float theta = theta_base*pow(args.freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[args.n_dims/2];
|
||||
|
||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
} else {
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
dst_data[0] = src[0];
|
||||
dst_data[1] = src[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_rope_vision(
|
||||
constant ggml_metal_kargs_rope & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * src2,
|
||||
device char * dst,
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 tptg [[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||
const int i3 = tgpig[2];
|
||||
const int i2 = tgpig[1];
|
||||
const int i1 = tgpig[0];
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
|
||||
|
||||
device const int32_t * pos = (device const int32_t *) src1;
|
||||
|
||||
int sect_dims = args.sections[0] + args.sections[1];
|
||||
int sec_w = args.sections[1] + args.sections[0];
|
||||
int sec_e = args.sections[2] + sec_w;
|
||||
|
||||
const float inv_ndims = -1.f/args.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
|
||||
const int ic = i0/2;
|
||||
const int sector = ic % sect_dims;
|
||||
|
||||
float theta_base = (float) pos[i2];
|
||||
if (sector >= args.sections[0] && sector < sec_w) {
|
||||
theta_base = (float) pos[i2 + args.ne2];
|
||||
}
|
||||
else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
|
||||
theta_base = (float) pos[i2 + args.ne2 * 2];
|
||||
}
|
||||
else if (sector >= sec_w + args.sections[2]) {
|
||||
theta_base = (float) pos[i2 + args.ne2 * 3];
|
||||
}
|
||||
|
||||
int p = sector;
|
||||
if (sector >= sec_w + args.sections[2]) {
|
||||
p = sector - (sec_w + args.sections[2]);
|
||||
} else if (sector >= sec_w) {
|
||||
p = sector - sec_w;
|
||||
} else if (sector >= args.sections[0]) {
|
||||
p = sector - args.sections[0];
|
||||
}
|
||||
|
||||
const float theta = theta_base*pow(args.freq_base, inv_ndims*2.0f*p);
|
||||
|
||||
const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[args.n_dims];
|
||||
|
||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
|
||||
typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
|
||||
typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t;
|
||||
typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t;
|
||||
|
||||
template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
|
||||
template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
|
||||
@@ -2603,6 +2743,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_
|
||||
template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
|
||||
template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;
|
||||
|
||||
template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>;
|
||||
template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
|
||||
|
||||
template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
|
||||
template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
|
||||
|
||||
typedef void (im2col_t)(
|
||||
device const float * x,
|
||||
device char * dst,
|
||||
|
||||
54
llama/ggml-metal_darwin_arm64.m
vendored
54
llama/ggml-metal_darwin_arm64.m
vendored
@@ -328,6 +328,10 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,
|
||||
GGML_METAL_KERNEL_TYPE_IM2COL_F16,
|
||||
GGML_METAL_KERNEL_TYPE_IM2COL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,
|
||||
@@ -928,6 +932,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, rope_norm_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, rope_neox_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16, rope_neox_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32, rope_multi_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16, rope_multi_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32, rope_vision_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16, rope_vision_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16, im2col_ext_f16, true);
|
||||
@@ -1155,16 +1163,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_OP_NORM:
|
||||
return true;
|
||||
case GGML_OP_ROPE:
|
||||
{
|
||||
const int mode = ((const int32_t *) op->op_params)[2];
|
||||
if (mode & GGML_ROPE_TYPE_MROPE) {
|
||||
return false;
|
||||
}
|
||||
if (mode & GGML_ROPE_TYPE_VISION) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
case GGML_OP_IM2COL:
|
||||
return op->src[0]->type == GGML_TYPE_F16;
|
||||
case GGML_OP_POOL_1D:
|
||||
@@ -3083,6 +3082,7 @@ static void ggml_metal_encode_node(
|
||||
float attn_factor;
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
int32_t sections[4];
|
||||
|
||||
memcpy(&freq_base, (const int32_t *) dst->op_params + 5, sizeof(float));
|
||||
memcpy(&freq_scale, (const int32_t *) dst->op_params + 6, sizeof(float));
|
||||
@@ -3090,21 +3090,44 @@ static void ggml_metal_encode_node(
|
||||
memcpy(&attn_factor, (const int32_t *) dst->op_params + 8, sizeof(float));
|
||||
memcpy(&beta_fast, (const int32_t *) dst->op_params + 9, sizeof(float));
|
||||
memcpy(&beta_slow, (const int32_t *) dst->op_params + 10, sizeof(float));
|
||||
memcpy(§ions, (const int32_t *) dst->op_params + 11, sizeof(int32_t)*4);
|
||||
|
||||
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
|
||||
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
|
||||
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
|
||||
|
||||
if (is_mrope) {
|
||||
GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
|
||||
}
|
||||
|
||||
if (is_vision) {
|
||||
GGML_ASSERT(n_dims == ne00/2);
|
||||
}
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
if (!is_neox) {
|
||||
if (is_neox) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
};
|
||||
} else if (is_mrope && !is_vision) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16].pipeline; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
};
|
||||
} else if (is_vision) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16].pipeline; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
};
|
||||
} else {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
};
|
||||
}
|
||||
@@ -3135,6 +3158,7 @@ static void ggml_metal_encode_node(
|
||||
/*.attn_factor =*/ attn_factor,
|
||||
/*.beta_fast =*/ beta_fast,
|
||||
/*.beta_slow =*/ beta_slow,
|
||||
/*.sections =*/ {sections[0], sections[1], sections[2], sections[3]}
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
|
||||
299
llama/patches/0014-qwen2vl-support.patch
Normal file
299
llama/patches/0014-qwen2vl-support.patch
Normal file
@@ -0,0 +1,299 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: jmorganca <jmorganca@gmail.com>
|
||||
Date: Sun, 15 Dec 2024 23:56:24 -0800
|
||||
Subject: [PATCH] qwen2vl support
|
||||
|
||||
---
|
||||
ggml/src/ggml-metal/ggml-metal-impl.h | 1 +
|
||||
ggml/src/ggml-metal/ggml-metal.m | 54 +++++++---
|
||||
ggml/src/ggml-metal/ggml-metal.metal | 146 ++++++++++++++++++++++++++
|
||||
3 files changed, 186 insertions(+), 15 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
|
||||
index e3dc25f1..766a4999 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
|
||||
@@ -143,6 +143,7 @@ typedef struct {
|
||||
float attn_factor;
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
+ int32_t sections[4];
|
||||
} ggml_metal_kargs_rope;
|
||||
|
||||
typedef struct {
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index 787fc713..806c9fd3 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -302,6 +302,10 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,
|
||||
+ GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,
|
||||
+ GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,
|
||||
+ GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,
|
||||
+ GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,
|
||||
GGML_METAL_KERNEL_TYPE_IM2COL_F16,
|
||||
GGML_METAL_KERNEL_TYPE_IM2COL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,
|
||||
@@ -902,6 +906,10 @@ @implementation GGMLMetalClass
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, rope_norm_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, rope_neox_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16, rope_neox_f16, true);
|
||||
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32, rope_multi_f32, true);
|
||||
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16, rope_multi_f16, true);
|
||||
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32, rope_vision_f32, true);
|
||||
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16, rope_vision_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16, im2col_ext_f16, true);
|
||||
@@ -1129,16 +1137,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_OP_NORM:
|
||||
return true;
|
||||
case GGML_OP_ROPE:
|
||||
- {
|
||||
- const int mode = ((const int32_t *) op->op_params)[2];
|
||||
- if (mode & GGML_ROPE_TYPE_MROPE) {
|
||||
- return false;
|
||||
- }
|
||||
- if (mode & GGML_ROPE_TYPE_VISION) {
|
||||
- return false;
|
||||
- }
|
||||
- return true;
|
||||
- }
|
||||
+ return true;
|
||||
case GGML_OP_IM2COL:
|
||||
return op->src[0]->type == GGML_TYPE_F16;
|
||||
case GGML_OP_POOL_1D:
|
||||
@@ -3057,6 +3056,7 @@ static void ggml_metal_encode_node(
|
||||
float attn_factor;
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
+ int32_t sections[4];
|
||||
|
||||
memcpy(&freq_base, (const int32_t *) dst->op_params + 5, sizeof(float));
|
||||
memcpy(&freq_scale, (const int32_t *) dst->op_params + 6, sizeof(float));
|
||||
@@ -3064,21 +3064,44 @@ static void ggml_metal_encode_node(
|
||||
memcpy(&attn_factor, (const int32_t *) dst->op_params + 8, sizeof(float));
|
||||
memcpy(&beta_fast, (const int32_t *) dst->op_params + 9, sizeof(float));
|
||||
memcpy(&beta_slow, (const int32_t *) dst->op_params + 10, sizeof(float));
|
||||
+ memcpy(§ions, (const int32_t *) dst->op_params + 11, sizeof(int32_t)*4);
|
||||
|
||||
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
|
||||
+ const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
|
||||
+ const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
|
||||
+
|
||||
+ if (is_mrope) {
|
||||
+ GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
|
||||
+ }
|
||||
+
|
||||
+ if (is_vision) {
|
||||
+ GGML_ASSERT(n_dims == ne00/2);
|
||||
+ }
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
- if (!is_neox) {
|
||||
+ if (is_neox) {
|
||||
switch (src0->type) {
|
||||
- case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
|
||||
- case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
|
||||
+ case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
|
||||
+ case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
|
||||
+ default: GGML_ABORT("fatal error");
|
||||
+ };
|
||||
+ } else if (is_mrope && !is_vision) {
|
||||
+ switch (src0->type) {
|
||||
+ case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32].pipeline; break;
|
||||
+ case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16].pipeline; break;
|
||||
+ default: GGML_ABORT("fatal error");
|
||||
+ };
|
||||
+ } else if (is_vision) {
|
||||
+ switch (src0->type) {
|
||||
+ case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32].pipeline; break;
|
||||
+ case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16].pipeline; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
};
|
||||
} else {
|
||||
switch (src0->type) {
|
||||
- case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
|
||||
- case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
|
||||
+ case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
|
||||
+ case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
};
|
||||
}
|
||||
@@ -3109,6 +3132,7 @@ static void ggml_metal_encode_node(
|
||||
/*.attn_factor =*/ attn_factor,
|
||||
/*.beta_fast =*/ beta_fast,
|
||||
/*.beta_slow =*/ beta_slow,
|
||||
+ /*.sections =*/ {sections[0], sections[1], sections[2], sections[3]}
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
index 204c93e6..67b3240f 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
@@ -2568,8 +2568,148 @@ kernel void kernel_rope_neox(
|
||||
}
|
||||
}
|
||||
|
||||
+
|
||||
+template<typename T>
|
||||
+kernel void kernel_rope_multi(
|
||||
+ constant ggml_metal_kargs_rope & args,
|
||||
+ device const char * src0,
|
||||
+ device const char * src1,
|
||||
+ device const char * src2,
|
||||
+ device char * dst,
|
||||
+ ushort tiitg[[thread_index_in_threadgroup]],
|
||||
+ ushort3 tptg [[threads_per_threadgroup]],
|
||||
+ uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||
+ const int i3 = tgpig[2];
|
||||
+ const int i2 = tgpig[1];
|
||||
+ const int i1 = tgpig[0];
|
||||
+
|
||||
+ float corr_dims[2];
|
||||
+ rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
|
||||
+
|
||||
+ device const int32_t * pos = (device const int32_t *) src1;
|
||||
+
|
||||
+ int sect_dims = args.sections[0] + args.sections[1] + args.sections[2] + args.sections[3];
|
||||
+ int sec_w = args.sections[1] + args.sections[0];
|
||||
+
|
||||
+ const float inv_ndims = -1.f/args.n_dims;
|
||||
+
|
||||
+ float cos_theta;
|
||||
+ float sin_theta;
|
||||
+
|
||||
+ for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
|
||||
+ if (i0 < args.n_dims) {
|
||||
+ const int ic = i0/2;
|
||||
+ const int sector = ic % sect_dims;
|
||||
+
|
||||
+ float theta_base = (float) pos[i2];
|
||||
+ if (sector >= args.sections[0] && sector < sec_w) {
|
||||
+ theta_base = (float) pos[i2 + args.ne2];
|
||||
+ }
|
||||
+ else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
|
||||
+ theta_base = (float) pos[i2 + args.ne2 * 2];
|
||||
+ }
|
||||
+ else if (sector >= sec_w + args.sections[2]) {
|
||||
+ theta_base = (float) pos[i2 + args.ne2 * 3];
|
||||
+ }
|
||||
+
|
||||
+ float theta = theta_base*pow(args.freq_base, inv_ndims*i0);
|
||||
+
|
||||
+ const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
|
||||
+
|
||||
+ rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
|
||||
+
|
||||
+ device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
|
||||
+ device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0);
|
||||
+
|
||||
+ const float x0 = src[0];
|
||||
+ const float x1 = src[args.n_dims/2];
|
||||
+
|
||||
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
+ dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
+ } else {
|
||||
+ device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
|
||||
+ device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
+
|
||||
+ dst_data[0] = src[0];
|
||||
+ dst_data[1] = src[1];
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+template<typename T>
|
||||
+kernel void kernel_rope_vision(
|
||||
+ constant ggml_metal_kargs_rope & args,
|
||||
+ device const char * src0,
|
||||
+ device const char * src1,
|
||||
+ device const char * src2,
|
||||
+ device char * dst,
|
||||
+ ushort tiitg[[thread_index_in_threadgroup]],
|
||||
+ ushort3 tptg [[threads_per_threadgroup]],
|
||||
+ uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||
+ const int i3 = tgpig[2];
|
||||
+ const int i2 = tgpig[1];
|
||||
+ const int i1 = tgpig[0];
|
||||
+
|
||||
+ float corr_dims[2];
|
||||
+ rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
|
||||
+
|
||||
+ device const int32_t * pos = (device const int32_t *) src1;
|
||||
+
|
||||
+ int sect_dims = args.sections[0] + args.sections[1];
|
||||
+ int sec_w = args.sections[1] + args.sections[0];
|
||||
+ int sec_e = args.sections[2] + sec_w;
|
||||
+
|
||||
+ const float inv_ndims = -1.f/args.n_dims;
|
||||
+
|
||||
+ float cos_theta;
|
||||
+ float sin_theta;
|
||||
+
|
||||
+ for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
|
||||
+ const int ic = i0/2;
|
||||
+ const int sector = ic % sect_dims;
|
||||
+
|
||||
+ float theta_base = (float) pos[i2];
|
||||
+ if (sector >= args.sections[0] && sector < sec_w) {
|
||||
+ theta_base = (float) pos[i2 + args.ne2];
|
||||
+ }
|
||||
+ else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
|
||||
+ theta_base = (float) pos[i2 + args.ne2 * 2];
|
||||
+ }
|
||||
+ else if (sector >= sec_w + args.sections[2]) {
|
||||
+ theta_base = (float) pos[i2 + args.ne2 * 3];
|
||||
+ }
|
||||
+
|
||||
+ int p = sector;
|
||||
+ if (sector >= sec_w + args.sections[2]) {
|
||||
+ p = sector - (sec_w + args.sections[2]);
|
||||
+ } else if (sector >= sec_w) {
|
||||
+ p = sector - sec_w;
|
||||
+ } else if (sector >= args.sections[0]) {
|
||||
+ p = sector - args.sections[0];
|
||||
+ }
|
||||
+
|
||||
+ const float theta = theta_base*pow(args.freq_base, inv_ndims*2.0f*p);
|
||||
+
|
||||
+ const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
|
||||
+
|
||||
+ rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
|
||||
+
|
||||
+ device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
|
||||
+ device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0);
|
||||
+
|
||||
+ const float x0 = src[0];
|
||||
+ const float x1 = src[args.n_dims];
|
||||
+
|
||||
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
+ dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+
|
||||
typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
|
||||
typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
|
||||
+typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t;
|
||||
+typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t;
|
||||
|
||||
template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
|
||||
template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
|
||||
@@ -2577,6 +2717,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_
|
||||
template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
|
||||
template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;
|
||||
|
||||
+template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>;
|
||||
+template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
|
||||
+
|
||||
+template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
|
||||
+template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
|
||||
+
|
||||
typedef void (im2col_t)(
|
||||
device const float * x,
|
||||
device char * dst,
|
||||
@@ -674,6 +674,21 @@ type CompletionResponse struct {
|
||||
}
|
||||
|
||||
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
|
||||
if err := s.sem.Acquire(ctx, 1); err != nil {
|
||||
if errors.Is(err, context.Canceled) {
|
||||
slog.Info("aborting completion request due to client closing the connection")
|
||||
} else {
|
||||
slog.Error("Failed to acquire semaphore", "error", err)
|
||||
}
|
||||
return err
|
||||
}
|
||||
defer s.sem.Release(1)
|
||||
|
||||
// put an upper limit on num_predict to avoid the model running on forever
|
||||
if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
|
||||
req.Options.NumPredict = 10 * s.options.NumCtx
|
||||
}
|
||||
|
||||
request := map[string]any{
|
||||
"prompt": req.Prompt,
|
||||
"stream": true,
|
||||
@@ -699,10 +714,16 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
|
||||
"cache_prompt": true,
|
||||
}
|
||||
|
||||
// Make sure the server is ready
|
||||
status, err := s.getServerStatusRetry(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
} else if status != ServerStatusReady {
|
||||
return fmt.Errorf("unexpected server status: %s", status.ToString())
|
||||
}
|
||||
|
||||
if len(req.Format) > 0 {
|
||||
switch {
|
||||
case bytes.Equal(req.Format, []byte(`""`)):
|
||||
// fallthrough
|
||||
case bytes.Equal(req.Format, []byte(`"json"`)):
|
||||
request["grammar"] = grammarJSON
|
||||
case bytes.HasPrefix(req.Format, []byte("{")):
|
||||
@@ -713,33 +734,10 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
|
||||
}
|
||||
request["grammar"] = string(g)
|
||||
default:
|
||||
return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema", req.Format)
|
||||
return errors.New(`invalid format: expected "json" or a JSON schema`)
|
||||
}
|
||||
}
|
||||
|
||||
if err := s.sem.Acquire(ctx, 1); err != nil {
|
||||
if errors.Is(err, context.Canceled) {
|
||||
slog.Info("aborting completion request due to client closing the connection")
|
||||
} else {
|
||||
slog.Error("Failed to acquire semaphore", "error", err)
|
||||
}
|
||||
return err
|
||||
}
|
||||
defer s.sem.Release(1)
|
||||
|
||||
// put an upper limit on num_predict to avoid the model running on forever
|
||||
if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
|
||||
req.Options.NumPredict = 10 * s.options.NumCtx
|
||||
}
|
||||
|
||||
// Make sure the server is ready
|
||||
status, err := s.getServerStatusRetry(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
} else if status != ServerStatusReady {
|
||||
return fmt.Errorf("unexpected server status: %s", status.ToString())
|
||||
}
|
||||
|
||||
// Handling JSON marshaling with special characters unescaped.
|
||||
buffer := &bytes.Buffer{}
|
||||
enc := json.NewEncoder(buffer)
|
||||
|
||||
@@ -1,63 +0,0 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"golang.org/x/sync/semaphore"
|
||||
)
|
||||
|
||||
func TestLLMServerCompletionFormat(t *testing.T) {
|
||||
// This test was written to fix an already deployed issue. It is a bit
|
||||
// of a mess, and but it's good enough, until we can refactoring the
|
||||
// Completion method to be more testable.
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
s := &llmServer{
|
||||
sem: semaphore.NewWeighted(1), // required to prevent nil panic
|
||||
}
|
||||
|
||||
checkInvalid := func(format string) {
|
||||
t.Helper()
|
||||
err := s.Completion(ctx, CompletionRequest{
|
||||
Options: new(api.Options),
|
||||
Format: []byte(format),
|
||||
}, nil)
|
||||
|
||||
want := fmt.Sprintf("invalid format: %q; expected \"json\" or a valid JSON Schema", format)
|
||||
if err == nil || !strings.Contains(err.Error(), want) {
|
||||
t.Fatalf("err = %v; want %q", err, want)
|
||||
}
|
||||
}
|
||||
|
||||
checkInvalid("X") // invalid format
|
||||
checkInvalid(`"X"`) // invalid JSON Schema
|
||||
|
||||
cancel() // prevent further processing if request makes it past the format check
|
||||
|
||||
checkCanceled := func(err error) {
|
||||
t.Helper()
|
||||
if !errors.Is(err, context.Canceled) {
|
||||
t.Fatalf("Completion: err = %v; expected context.Canceled", err)
|
||||
}
|
||||
}
|
||||
|
||||
valids := []string{`"json"`, `{"type":"object"}`, ``, `""`}
|
||||
for _, valid := range valids {
|
||||
err := s.Completion(ctx, CompletionRequest{
|
||||
Options: new(api.Options),
|
||||
Format: []byte(valid),
|
||||
}, nil)
|
||||
checkCanceled(err)
|
||||
}
|
||||
|
||||
err := s.Completion(ctx, CompletionRequest{
|
||||
Options: new(api.Options),
|
||||
Format: nil, // missing format
|
||||
}, nil)
|
||||
checkCanceled(err)
|
||||
}
|
||||
@@ -19,7 +19,6 @@ const config: ForgeConfig = {
|
||||
icon: './assets/icon.icns',
|
||||
extraResource: [
|
||||
'../dist/ollama',
|
||||
'../dist/darwin-amd64/lib',
|
||||
path.join(__dirname, './assets/iconTemplate.png'),
|
||||
path.join(__dirname, './assets/iconTemplate@2x.png'),
|
||||
path.join(__dirname, './assets/iconUpdateTemplate.png'),
|
||||
@@ -43,7 +42,7 @@ const config: ForgeConfig = {
|
||||
}
|
||||
: {}),
|
||||
osxUniversal: {
|
||||
x64ArchFiles: '**/ollama*',
|
||||
x64ArchFiles: '**/ollama',
|
||||
},
|
||||
},
|
||||
rebuildConfig: {},
|
||||
|
||||
@@ -72,7 +72,6 @@ func locateRunnersOnce() {
|
||||
paths := []string{
|
||||
filepath.Join(filepath.Dir(exe), "llama", "build", runtime.GOOS+"-"+runtime.GOARCH, "runners"),
|
||||
filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama", "runners"),
|
||||
filepath.Join(filepath.Dir(exe), "lib", "ollama", "runners"),
|
||||
}
|
||||
for _, path := range paths {
|
||||
if _, err := os.Stat(path); err == nil {
|
||||
|
||||
@@ -18,18 +18,10 @@ rm -rf llama/build dist/darwin-*
|
||||
echo "Building darwin arm64"
|
||||
GOOS=darwin ARCH=arm64 GOARCH=arm64 make -j 8 dist
|
||||
echo "Building darwin amd64 with AVX enabled"
|
||||
GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist_exe
|
||||
GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist
|
||||
|
||||
# Generate the universal ollama binary for stand-alone usage: metal + avx
|
||||
lipo -create -output dist/ollama-darwin dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
|
||||
|
||||
echo "Building darwin amd64 with runners"
|
||||
rm dist/darwin-amd64/bin/ollama
|
||||
GOOS=darwin ARCH=amd64 GOARCH=amd64 make -j 8 dist
|
||||
# Generate the universal ollama binary for the app bundle: metal + no-avx
|
||||
lipo -create -output dist/ollama dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
|
||||
|
||||
|
||||
if [ -n "$APPLE_IDENTITY" ]; then
|
||||
codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
|
||||
else
|
||||
@@ -56,4 +48,5 @@ ditto -c -k --keepParent dist/ollama dist/temp.zip
|
||||
if [ -n "$APPLE_IDENTITY" ]; then
|
||||
xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
|
||||
fi
|
||||
mv dist/ollama dist/ollama-darwin
|
||||
rm -f dist/temp.zip
|
||||
|
||||
Reference in New Issue
Block a user