llm: do not error on "null" format (#8139 )

This fixes another regression in the previous commit that fixed other known bugs.
readme: change getting started guide link for pgai (#8119 )
2024-12-17 09:49:37 -08:00 · 2024-12-16 22:13:23 -08:00 · 2024-12-16 21:57:49 -08:00 · 2024-12-16 18:45:46 -08:00 · 2024-12-16 18:45:02 -08:00 · 2024-12-16 17:14:37 +08:00
13 changed files with 135 additions and 669 deletions
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -8,8 +8,6 @@ linters:
    - containedctx
    - contextcheck
    - errcheck
-    - exportloopref
-    - gci
    - gocheckcompilerdirectives
    - gofmt
    - gofumpt
@@ -30,8 +28,6 @@ linters:
    - wastedassign
    - whitespace
 linters-settings:
-  gci:
-    sections: [standard, default, localmodule]
  staticcheck:
    checks:
      - all
--- a/2
+++ b/2
@@ -8,11 +8,9 @@ include make/cuda-v12-defs.make
 include make/rocm-defs.make

 ifeq ($(CUSTOM_CPU_FLAGS),)
-ifneq ($(OS),darwin)
 ifeq ($(ARCH),amd64)
 	RUNNER_TARGETS=cpu
 endif
-endif
 # Without CUSTOM_CPU_FLAGS we default to build both v11 and v12 if present
 ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
 ifneq ($(CUDA_11_COMPILER),)
--- a/README.md
+++ b/README.md
@@ -407,6 +407,8 @@ See the [API documentation](./docs/api.md) for all endpoints.

 ### Database

+- [pgai](https://github.com/timescale/pgai) - PostgreSQL as a vector database (Create and search embeddings from Ollama models using pgvector)
+   - [Get started guide](https://github.com/timescale/pgai/blob/main/docs/vectorizer-quick-start.md)
 - [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
 - [chromem-go](https://github.com/philippgille/chromem-go/blob/v0.5.0/embed_ollama.go) with [example](https://github.com/philippgille/chromem-go/tree/v0.5.0/examples/rag-wikipedia-ollama)
 - [Kangaroo](https://github.com/dbkangaroo/kangaroo) (AI-powered SQL client and admin tool for popular databases)
--- a/llama/ggml-metal-embed.metal
+++ b/llama/ggml-metal-embed.metal
@@ -2081,7 +2081,6 @@ typedef struct {
    float    attn_factor;
    float    beta_fast;
    float    beta_slow;
-    int32_t  sections[4];
 } ggml_metal_kargs_rope;

 typedef struct {
@@ -4786,148 +4785,8 @@ kernel void kernel_rope_neox(
    }
 }

-
-template<typename T>
-kernel void kernel_rope_multi(
-        constant ggml_metal_kargs_rope & args,
-        device const char * src0,
-        device const char * src1,
-        device const char * src2,
-        device       char * dst,
-        ushort  tiitg[[thread_index_in_threadgroup]],
-        ushort3 tptg [[threads_per_threadgroup]],
-        uint3   tgpig[[threadgroup_position_in_grid]]) {
-    const int i3 = tgpig[2];
-    const int i2 = tgpig[1];
-    const int i1 = tgpig[0];
-
-    float corr_dims[2];
-    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
-
-    device const int32_t * pos = (device const int32_t *) src1;
-
-    int sect_dims = args.sections[0] + args.sections[1] + args.sections[2] + args.sections[3];
-    int sec_w = args.sections[1] + args.sections[0];
-
-    const float inv_ndims = -1.f/args.n_dims;
-
-    float cos_theta;
-    float sin_theta;
-
-    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
-        if (i0 < args.n_dims) {
-            const int ic = i0/2;
-            const int sector = ic % sect_dims;
-
-            float theta_base = (float) pos[i2];
-            if (sector >= args.sections[0] && sector < sec_w) {
-                theta_base = (float) pos[i2 + args.ne2];
-            }
-            else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
-                theta_base = (float) pos[i2 + args.ne2 * 2];
-            }
-            else if (sector >= sec_w + args.sections[2]) {
-                theta_base = (float) pos[i2 + args.ne2 * 3];
-            }
-
-            float theta = theta_base*pow(args.freq_base, inv_ndims*i0);
-
-            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
-
-            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
-
-            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
-            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
-
-            const float x0 = src[0];
-            const float x1 = src[args.n_dims/2];
-
-            dst_data[0]             = x0*cos_theta - x1*sin_theta;
-            dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
-        } else {
-            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
-            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
-
-            dst_data[0] = src[0];
-            dst_data[1] = src[1];
-        }
-    }
-}
-
-template<typename T>
-kernel void kernel_rope_vision(
-        constant ggml_metal_kargs_rope & args,
-        device const char * src0,
-        device const char * src1,
-        device const char * src2,
-        device       char * dst,
-        ushort  tiitg[[thread_index_in_threadgroup]],
-        ushort3 tptg [[threads_per_threadgroup]],
-        uint3   tgpig[[threadgroup_position_in_grid]]) {
-    const int i3 = tgpig[2];
-    const int i2 = tgpig[1];
-    const int i1 = tgpig[0];
-
-    float corr_dims[2];
-    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
-
-    device const int32_t * pos = (device const int32_t *) src1;
-
-    int sect_dims = args.sections[0] + args.sections[1];
-    int sec_w = args.sections[1] + args.sections[0];
-    int sec_e = args.sections[2] + sec_w;
-
-    const float inv_ndims = -1.f/args.n_dims;
-
-    float cos_theta;
-    float sin_theta;
-
-    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
-        const int ic = i0/2;
-        const int sector = ic % sect_dims;
-
-        float theta_base = (float) pos[i2];
-        if (sector >= args.sections[0] && sector < sec_w) {
-            theta_base = (float) pos[i2 + args.ne2];
-        }
-        else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
-            theta_base = (float) pos[i2 + args.ne2 * 2];
-        }
-        else if (sector >= sec_w + args.sections[2]) {
-            theta_base = (float) pos[i2 + args.ne2 * 3];
-        }
-
-        int p = sector;
-        if (sector >= sec_w + args.sections[2]) {
-            p = sector - (sec_w + args.sections[2]);
-        } else if (sector >= sec_w) {
-            p = sector - sec_w;
-        } else if (sector >= args.sections[0]) {
-            p = sector - args.sections[0];
-        }
-
-        const float theta = theta_base*pow(args.freq_base, inv_ndims*2.0f*p);
-
-        const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
-
-        rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
-
-        device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
-        device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
-
-        const float x0 = src[0];
-        const float x1 = src[args.n_dims];
-
-        dst_data[0]             = x0*cos_theta - x1*sin_theta;
-        dst_data[args.n_dims]   = x0*sin_theta + x1*cos_theta;
-    }
-}
-
-
 typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
 typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
-typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t;
-typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t;

 template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
 template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
@@ -4935,12 +4794,6 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_
 template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
 template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;

-template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>;
-template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
-
-template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
-template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
-
 typedef void (im2col_t)(
        device const float * x,
        device        char * dst,
--- a/llama/ggml-metal-impl.h
+++ b/llama/ggml-metal-impl.h
@@ -169,7 +169,6 @@ typedef struct {
    float    attn_factor;
    float    beta_fast;
    float    beta_slow;
-    int32_t  sections[4];
 } ggml_metal_kargs_rope;

 typedef struct {
--- a/llama/ggml-metal.metal
+++ b/llama/ggml-metal.metal
@@ -2594,148 +2594,8 @@ kernel void kernel_rope_neox(
    }
 }

-
-template<typename T>
-kernel void kernel_rope_multi(
-        constant ggml_metal_kargs_rope & args,
-        device const char * src0,
-        device const char * src1,
-        device const char * src2,
-        device       char * dst,
-        ushort  tiitg[[thread_index_in_threadgroup]],
-        ushort3 tptg [[threads_per_threadgroup]],
-        uint3   tgpig[[threadgroup_position_in_grid]]) {
-    const int i3 = tgpig[2];
-    const int i2 = tgpig[1];
-    const int i1 = tgpig[0];
-
-    float corr_dims[2];
-    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
-
-    device const int32_t * pos = (device const int32_t *) src1;
-
-    int sect_dims = args.sections[0] + args.sections[1] + args.sections[2] + args.sections[3];
-    int sec_w = args.sections[1] + args.sections[0];
-
-    const float inv_ndims = -1.f/args.n_dims;
-
-    float cos_theta;
-    float sin_theta;
-
-    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
-        if (i0 < args.n_dims) {
-            const int ic = i0/2;
-            const int sector = ic % sect_dims;
-
-            float theta_base = (float) pos[i2];
-            if (sector >= args.sections[0] && sector < sec_w) {
-                theta_base = (float) pos[i2 + args.ne2];
-            }
-            else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
-                theta_base = (float) pos[i2 + args.ne2 * 2];
-            }
-            else if (sector >= sec_w + args.sections[2]) {
-                theta_base = (float) pos[i2 + args.ne2 * 3];
-            }
-
-            float theta = theta_base*pow(args.freq_base, inv_ndims*i0);
-
-            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
-
-            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
-
-            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
-            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
-
-            const float x0 = src[0];
-            const float x1 = src[args.n_dims/2];
-
-            dst_data[0]             = x0*cos_theta - x1*sin_theta;
-            dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
-        } else {
-            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
-            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
-
-            dst_data[0] = src[0];
-            dst_data[1] = src[1];
-        }
-    }
-}
-
-template<typename T>
-kernel void kernel_rope_vision(
-        constant ggml_metal_kargs_rope & args,
-        device const char * src0,
-        device const char * src1,
-        device const char * src2,
-        device       char * dst,
-        ushort  tiitg[[thread_index_in_threadgroup]],
-        ushort3 tptg [[threads_per_threadgroup]],
-        uint3   tgpig[[threadgroup_position_in_grid]]) {
-    const int i3 = tgpig[2];
-    const int i2 = tgpig[1];
-    const int i1 = tgpig[0];
-
-    float corr_dims[2];
-    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
-
-    device const int32_t * pos = (device const int32_t *) src1;
-
-    int sect_dims = args.sections[0] + args.sections[1];
-    int sec_w = args.sections[1] + args.sections[0];
-    int sec_e = args.sections[2] + sec_w;
-
-    const float inv_ndims = -1.f/args.n_dims;
-
-    float cos_theta;
-    float sin_theta;
-
-    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
-        const int ic = i0/2;
-        const int sector = ic % sect_dims;
-
-        float theta_base = (float) pos[i2];
-        if (sector >= args.sections[0] && sector < sec_w) {
-            theta_base = (float) pos[i2 + args.ne2];
-        }
-        else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
-            theta_base = (float) pos[i2 + args.ne2 * 2];
-        }
-        else if (sector >= sec_w + args.sections[2]) {
-            theta_base = (float) pos[i2 + args.ne2 * 3];
-        }
-
-        int p = sector;
-        if (sector >= sec_w + args.sections[2]) {
-            p = sector - (sec_w + args.sections[2]);
-        } else if (sector >= sec_w) {
-            p = sector - sec_w;
-        } else if (sector >= args.sections[0]) {
-            p = sector - args.sections[0];
-        }
-
-        const float theta = theta_base*pow(args.freq_base, inv_ndims*2.0f*p);
-
-        const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
-
-        rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
-
-        device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
-        device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
-
-        const float x0 = src[0];
-        const float x1 = src[args.n_dims];
-
-        dst_data[0]             = x0*cos_theta - x1*sin_theta;
-        dst_data[args.n_dims]   = x0*sin_theta + x1*cos_theta;
-    }
-}
-
-
 typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
 typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
-typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t;
-typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t;

 template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
 template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
@@ -2743,12 +2603,6 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_
 template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
 template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;

-template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>;
-template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
-
-template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
-template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
-
 typedef void (im2col_t)(
        device const float * x,
        device        char * dst,
--- a/llama/ggml-metal_darwin_arm64.m
+++ b/llama/ggml-metal_darwin_arm64.m
@@ -328,10 +328,6 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
    GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,
    GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,
-    GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,
-    GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,
-    GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,
-    GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,
    GGML_METAL_KERNEL_TYPE_IM2COL_F16,
    GGML_METAL_KERNEL_TYPE_IM2COL_F32,
    GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,
@@ -932,10 +928,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,                 rope_norm_f16,                  true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,                 rope_neox_f32,                  true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,                 rope_neox_f16,                  true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,                rope_multi_f32,                 true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,                rope_multi_f16,                 true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,               rope_vision_f32,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,               rope_vision_f16,                true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16,                    im2col_f16,                     true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32,                    im2col_f32,                     true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,                im2col_ext_f16,                 true);
@@ -1163,7 +1155,16 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
        case GGML_OP_NORM:
            return true;
        case GGML_OP_ROPE:
-            return true;
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return true;
+            }
        case GGML_OP_IM2COL:
            return op->src[0]->type == GGML_TYPE_F16;
        case GGML_OP_POOL_1D:
@@ -3082,7 +3083,6 @@ static void ggml_metal_encode_node(
                float attn_factor;
                float beta_fast;
                float beta_slow;
-                int32_t sections[4];

                memcpy(&freq_base,   (const int32_t *) dst->op_params +  5, sizeof(float));
                memcpy(&freq_scale,  (const int32_t *) dst->op_params +  6, sizeof(float));
@@ -3090,44 +3090,21 @@ static void ggml_metal_encode_node(
                memcpy(&attn_factor, (const int32_t *) dst->op_params +  8, sizeof(float));
                memcpy(&beta_fast,   (const int32_t *) dst->op_params +  9, sizeof(float));
                memcpy(&beta_slow,   (const int32_t *) dst->op_params + 10, sizeof(float));
-                memcpy(&sections,    (const int32_t *) dst->op_params + 11, sizeof(int32_t)*4);

                const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
-                const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
-                const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
-
-                if (is_mrope) {
-                    GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
-                }
-
-                if (is_vision) {
-                    GGML_ASSERT(n_dims == ne00/2);
-                }

                id<MTLComputePipelineState> pipeline = nil;

-                if (is_neox) {
+                if (!is_neox) {
                    switch (src0->type) {
-                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
-                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
-                        default: GGML_ABORT("fatal error");
-                    };
-                } else if (is_mrope && !is_vision) {
-                    switch (src0->type) {
-                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32].pipeline; break;
-                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16].pipeline; break;
-                        default: GGML_ABORT("fatal error");
-                    };
-                } else if (is_vision) {
-                    switch (src0->type) {
-                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32].pipeline; break;
-                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16].pipeline; break;
+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
                        default: GGML_ABORT("fatal error");
                    };
                } else {
                    switch (src0->type) {
-                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
-                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
                        default: GGML_ABORT("fatal error");
                    };
                }
@@ -3158,7 +3135,6 @@ static void ggml_metal_encode_node(
                    /*.attn_factor =*/ attn_factor,
                    /*.beta_fast   =*/ beta_fast,
                    /*.beta_slow   =*/ beta_slow,
-                    /*.sections    =*/ {sections[0], sections[1], sections[2], sections[3]}
                };

                [encoder setComputePipelineState:pipeline];
--- a/llama/patches/0014-qwen2vl-support.patch
+++ b/llama/patches/0014-qwen2vl-support.patch
@@ -1,299 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: jmorganca <jmorganca@gmail.com>
-Date: Sun, 15 Dec 2024 23:56:24 -0800
-Subject: [PATCH] qwen2vl support
-
---
- ggml/src/ggml-metal/ggml-metal-impl.h |   1 +
- ggml/src/ggml-metal/ggml-metal.m      |  54 +++++++---
- ggml/src/ggml-metal/ggml-metal.metal  | 146 ++++++++++++++++++++++++++
- 3 files changed, 186 insertions(+), 15 deletions(-)
-
-diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
-index e3dc25f1..766a4999 100644
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
-+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
-@@ -143,6 +143,7 @@ typedef struct {
-     float    attn_factor;
-     float    beta_fast;
-     float    beta_slow;
-+    int32_t  sections[4];
- } ggml_metal_kargs_rope;
- 
- typedef struct {
-diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 787fc713..806c9fd3 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
-+++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -302,6 +302,10 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
-     GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
-     GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,
-     GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,
-+    GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,
-+    GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,
-+    GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,
-+    GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,
-     GGML_METAL_KERNEL_TYPE_IM2COL_F16,
-     GGML_METAL_KERNEL_TYPE_IM2COL_F32,
-     GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,
-@@ -902,6 +906,10 @@ @implementation GGMLMetalClass
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,                 rope_norm_f16,                  true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,                 rope_neox_f32,                  true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,                 rope_neox_f16,                  true);
-+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,                rope_multi_f32,                 true);
-+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,                rope_multi_f16,                 true);
-+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,               rope_vision_f32,                true);
-+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,               rope_vision_f16,                true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16,                    im2col_f16,                     true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32,                    im2col_f32,                     true);
-         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,                im2col_ext_f16,                 true);
-@@ -1129,16 +1137,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
-         case GGML_OP_NORM:
-             return true;
-         case GGML_OP_ROPE:
-            {
-                const int mode = ((const int32_t *) op->op_params)[2];
-                if (mode & GGML_ROPE_TYPE_MROPE) {
-                    return false;
-                }
-                if (mode & GGML_ROPE_TYPE_VISION) {
-                    return false;
-                }
-                return true;
-            }
-+            return true;
-         case GGML_OP_IM2COL:
-             return op->src[0]->type == GGML_TYPE_F16;
-         case GGML_OP_POOL_1D:
-@@ -3057,6 +3056,7 @@ static void ggml_metal_encode_node(
-                 float attn_factor;
-                 float beta_fast;
-                 float beta_slow;
-+                int32_t sections[4];
- 
-                 memcpy(&freq_base,   (const int32_t *) dst->op_params +  5, sizeof(float));
-                 memcpy(&freq_scale,  (const int32_t *) dst->op_params +  6, sizeof(float));
-@@ -3064,21 +3064,44 @@ static void ggml_metal_encode_node(
-                 memcpy(&attn_factor, (const int32_t *) dst->op_params +  8, sizeof(float));
-                 memcpy(&beta_fast,   (const int32_t *) dst->op_params +  9, sizeof(float));
-                 memcpy(&beta_slow,   (const int32_t *) dst->op_params + 10, sizeof(float));
-+                memcpy(&sections,    (const int32_t *) dst->op_params + 11, sizeof(int32_t)*4);
- 
-                 const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
-+                const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
-+                const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
-+
-+                if (is_mrope) {
-+                    GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
-+                }
-+
-+                if (is_vision) {
-+                    GGML_ASSERT(n_dims == ne00/2);
-+                }
- 
-                 id<MTLComputePipelineState> pipeline = nil;
- 
-                if (!is_neox) {
-+                if (is_neox) {
-                     switch (src0->type) {
-                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
-                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
-+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
-+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
-+                        default: GGML_ABORT("fatal error");
-+                    };
-+                } else if (is_mrope && !is_vision) {
-+                    switch (src0->type) {
-+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32].pipeline; break;
-+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16].pipeline; break;
-+                        default: GGML_ABORT("fatal error");
-+                    };
-+                } else if (is_vision) {
-+                    switch (src0->type) {
-+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32].pipeline; break;
-+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16].pipeline; break;
-                         default: GGML_ABORT("fatal error");
-                     };
-                 } else {
-                     switch (src0->type) {
-                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
-                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
-+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
-+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
-                         default: GGML_ABORT("fatal error");
-                     };
-                 }
-@@ -3109,6 +3132,7 @@ static void ggml_metal_encode_node(
-                     /*.attn_factor =*/ attn_factor,
-                     /*.beta_fast   =*/ beta_fast,
-                     /*.beta_slow   =*/ beta_slow,
-+                    /*.sections    =*/ {sections[0], sections[1], sections[2], sections[3]}
-                 };
- 
-                 [encoder setComputePipelineState:pipeline];
-diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
-index 204c93e6..67b3240f 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
-+++ b/ggml/src/ggml-metal/ggml-metal.metal
-@@ -2568,8 +2568,148 @@ kernel void kernel_rope_neox(
-     }
- }
- 
-+
-+template<typename T>
-+kernel void kernel_rope_multi(
-+        constant ggml_metal_kargs_rope & args,
-+        device const char * src0,
-+        device const char * src1,
-+        device const char * src2,
-+        device       char * dst,
-+        ushort  tiitg[[thread_index_in_threadgroup]],
-+        ushort3 tptg [[threads_per_threadgroup]],
-+        uint3   tgpig[[threadgroup_position_in_grid]]) {
-+    const int i3 = tgpig[2];
-+    const int i2 = tgpig[1];
-+    const int i1 = tgpig[0];
-+
-+    float corr_dims[2];
-+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
-+
-+    device const int32_t * pos = (device const int32_t *) src1;
-+
-+    int sect_dims = args.sections[0] + args.sections[1] + args.sections[2] + args.sections[3];
-+    int sec_w = args.sections[1] + args.sections[0];
-+
-+    const float inv_ndims = -1.f/args.n_dims;
-+
-+    float cos_theta;
-+    float sin_theta;
-+
-+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
-+        if (i0 < args.n_dims) {
-+            const int ic = i0/2;
-+            const int sector = ic % sect_dims;
-+
-+            float theta_base = (float) pos[i2];
-+            if (sector >= args.sections[0] && sector < sec_w) {
-+                theta_base = (float) pos[i2 + args.ne2];
-+            }
-+            else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
-+                theta_base = (float) pos[i2 + args.ne2 * 2];
-+            }
-+            else if (sector >= sec_w + args.sections[2]) {
-+                theta_base = (float) pos[i2 + args.ne2 * 3];
-+            }
-+
-+            float theta = theta_base*pow(args.freq_base, inv_ndims*i0);
-+
-+            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
-+
-+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
-+
-+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
-+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
-+
-+            const float x0 = src[0];
-+            const float x1 = src[args.n_dims/2];
-+
-+            dst_data[0]             = x0*cos_theta - x1*sin_theta;
-+            dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
-+        } else {
-+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
-+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
-+
-+            dst_data[0] = src[0];
-+            dst_data[1] = src[1];
-+        }
-+    }
-+}
-+
-+template<typename T>
-+kernel void kernel_rope_vision(
-+        constant ggml_metal_kargs_rope & args,
-+        device const char * src0,
-+        device const char * src1,
-+        device const char * src2,
-+        device       char * dst,
-+        ushort  tiitg[[thread_index_in_threadgroup]],
-+        ushort3 tptg [[threads_per_threadgroup]],
-+        uint3   tgpig[[threadgroup_position_in_grid]]) {
-+    const int i3 = tgpig[2];
-+    const int i2 = tgpig[1];
-+    const int i1 = tgpig[0];
-+
-+    float corr_dims[2];
-+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
-+
-+    device const int32_t * pos = (device const int32_t *) src1;
-+
-+    int sect_dims = args.sections[0] + args.sections[1];
-+    int sec_w = args.sections[1] + args.sections[0];
-+    int sec_e = args.sections[2] + sec_w;
-+
-+    const float inv_ndims = -1.f/args.n_dims;
-+
-+    float cos_theta;
-+    float sin_theta;
-+
-+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
-+        const int ic = i0/2;
-+        const int sector = ic % sect_dims;
-+
-+        float theta_base = (float) pos[i2];
-+        if (sector >= args.sections[0] && sector < sec_w) {
-+            theta_base = (float) pos[i2 + args.ne2];
-+        }
-+        else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
-+            theta_base = (float) pos[i2 + args.ne2 * 2];
-+        }
-+        else if (sector >= sec_w + args.sections[2]) {
-+            theta_base = (float) pos[i2 + args.ne2 * 3];
-+        }
-+
-+        int p = sector;
-+        if (sector >= sec_w + args.sections[2]) {
-+            p = sector - (sec_w + args.sections[2]);
-+        } else if (sector >= sec_w) {
-+            p = sector - sec_w;
-+        } else if (sector >= args.sections[0]) {
-+            p = sector - args.sections[0];
-+        }
-+
-+        const float theta = theta_base*pow(args.freq_base, inv_ndims*2.0f*p);
-+
-+        const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
-+
-+        rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
-+
-+        device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
-+        device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
-+
-+        const float x0 = src[0];
-+        const float x1 = src[args.n_dims];
-+
-+        dst_data[0]             = x0*cos_theta - x1*sin_theta;
-+        dst_data[args.n_dims]   = x0*sin_theta + x1*cos_theta;
-+    }
-+}
-+
-+
- typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
- typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
-+typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t;
-+typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t;
- 
- template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
- template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
-@@ -2577,6 +2717,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_
- template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
- template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;
- 
-+template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>;
-+template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
-+
-+template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
-+template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
-+
- typedef void (im2col_t)(
-         device const float * x,
-         device        char * dst,
--- a/llm/server.go
+++ b/llm/server.go
@@ -674,21 +674,6 @@ type CompletionResponse struct {
 }

 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
-	if err := s.sem.Acquire(ctx, 1); err != nil {
-		if errors.Is(err, context.Canceled) {
-			slog.Info("aborting completion request due to client closing the connection")
-		} else {
-			slog.Error("Failed to acquire semaphore", "error", err)
-		}
-		return err
-	}
-	defer s.sem.Release(1)
-
-	// put an upper limit on num_predict to avoid the model running on forever
-	if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
-		req.Options.NumPredict = 10 * s.options.NumCtx
-	}
-
 	request := map[string]any{
 		"prompt":            req.Prompt,
 		"stream":            true,
@@ -714,30 +699,51 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 		"cache_prompt":      true,
 	}

-	// Make sure the server is ready
-	status, err := s.getServerStatusRetry(ctx)
-	if err != nil {
-		return err
-	} else if status != ServerStatusReady {
-		return fmt.Errorf("unexpected server status: %s", status.ToString())
-	}
-
 	if len(req.Format) > 0 {
-		switch {
-		case bytes.Equal(req.Format, []byte(`"json"`)):
+		switch string(req.Format) {
+		case `null`, `""`:
+			// Field was set, but "missing" a value. We accept
+			// these as "not set".
+			break
+		case `"json"`:
 			request["grammar"] = grammarJSON
-		case bytes.HasPrefix(req.Format, []byte("{")):
+		default:
+			if req.Format[0] != '{' {
+				return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format)
+			}
+
 			// User provided a JSON schema
 			g := llama.SchemaToGrammar(req.Format)
 			if g == nil {
 				return fmt.Errorf("invalid JSON schema in format")
 			}
 			request["grammar"] = string(g)
-		default:
-			return errors.New(`invalid format: expected "json" or a JSON schema`)
 		}
 	}

+	if err := s.sem.Acquire(ctx, 1); err != nil {
+		if errors.Is(err, context.Canceled) {
+			slog.Info("aborting completion request due to client closing the connection")
+		} else {
+			slog.Error("Failed to acquire semaphore", "error", err)
+		}
+		return err
+	}
+	defer s.sem.Release(1)
+
+	// put an upper limit on num_predict to avoid the model running on forever
+	if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
+		req.Options.NumPredict = 10 * s.options.NumCtx
+	}
+
+	// Make sure the server is ready
+	status, err := s.getServerStatusRetry(ctx)
+	if err != nil {
+		return err
+	} else if status != ServerStatusReady {
+		return fmt.Errorf("unexpected server status: %s", status.ToString())
+	}
+
 	// Handling JSON marshaling with special characters unescaped.
 	buffer := &bytes.Buffer{}
 	enc := json.NewEncoder(buffer)
--- a/llm/server_test.go
+++ b/llm/server_test.go
@@ -0,0 +1,72 @@
+package llm
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"strings"
+	"testing"
+
+	"github.com/ollama/ollama/api"
+	"golang.org/x/sync/semaphore"
+)
+
+func TestLLMServerCompletionFormat(t *testing.T) {
+	// This test was written to fix an already deployed issue. It is a bit
+	// of a mess, and but it's good enough, until we can refactoring the
+	// Completion method to be more testable.
+
+	ctx, cancel := context.WithCancel(context.Background())
+	s := &llmServer{
+		sem: semaphore.NewWeighted(1), // required to prevent nil panic
+	}
+
+	checkInvalid := func(format string) {
+		t.Helper()
+		err := s.Completion(ctx, CompletionRequest{
+			Options: new(api.Options),
+			Format:  []byte(format),
+		}, nil)
+
+		want := fmt.Sprintf("invalid format: %q; expected \"json\" or a valid JSON Schema", format)
+		if err == nil || !strings.Contains(err.Error(), want) {
+			t.Fatalf("err = %v; want %q", err, want)
+		}
+	}
+
+	checkInvalid("X")   // invalid format
+	checkInvalid(`"X"`) // invalid JSON Schema
+
+	cancel() // prevent further processing if request makes it past the format check
+
+	checkValid := func(err error) {
+		t.Helper()
+		if !errors.Is(err, context.Canceled) {
+			t.Fatalf("Completion: err = %v; expected context.Canceled", err)
+		}
+	}
+
+	valids := []string{
+		// "missing"
+		``,
+		`""`,
+		`null`,
+
+		// JSON
+		`"json"`,
+		`{"type":"object"}`,
+	}
+	for _, valid := range valids {
+		err := s.Completion(ctx, CompletionRequest{
+			Options: new(api.Options),
+			Format:  []byte(valid),
+		}, nil)
+		checkValid(err)
+	}
+
+	err := s.Completion(ctx, CompletionRequest{
+		Options: new(api.Options),
+		Format:  nil, // missing format
+	}, nil)
+	checkValid(err)
+}
--- a/macapp/forge.config.ts
+++ b/macapp/forge.config.ts
@@ -19,6 +19,7 @@ const config: ForgeConfig = {
    icon: './assets/icon.icns',
    extraResource: [
      '../dist/ollama',
+      '../dist/darwin-amd64/lib',
      path.join(__dirname, './assets/iconTemplate.png'),
      path.join(__dirname, './assets/iconTemplate@2x.png'),
      path.join(__dirname, './assets/iconUpdateTemplate.png'),
@@ -42,7 +43,7 @@ const config: ForgeConfig = {
        }
      : {}),
    osxUniversal: {
-      x64ArchFiles: '**/ollama',
+      x64ArchFiles: '**/ollama*',
    },
  },
  rebuildConfig: {},
--- a/runners/common.go
+++ b/runners/common.go
@@ -72,6 +72,7 @@ func locateRunnersOnce() {
 	paths := []string{
 		filepath.Join(filepath.Dir(exe), "llama", "build", runtime.GOOS+"-"+runtime.GOARCH, "runners"),
 		filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama", "runners"),
+		filepath.Join(filepath.Dir(exe), "lib", "ollama", "runners"),
 	}
 	for _, path := range paths {
 		if _, err := os.Stat(path); err == nil {
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -18,10 +18,18 @@ rm -rf llama/build dist/darwin-*
 echo "Building darwin arm64"
 GOOS=darwin ARCH=arm64 GOARCH=arm64 make -j 8 dist
 echo "Building darwin amd64 with AVX enabled"
-GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist
+GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist_exe

+# Generate the universal ollama binary for stand-alone usage: metal + avx
+lipo -create -output dist/ollama-darwin dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama

+echo "Building darwin amd64 with runners"
+rm dist/darwin-amd64/bin/ollama
+GOOS=darwin ARCH=amd64 GOARCH=amd64 make -j 8 dist
+# Generate the universal ollama binary for the app bundle: metal + no-avx
 lipo -create -output dist/ollama dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
+
+
 if [ -n "$APPLE_IDENTITY" ]; then
    codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
 else
@@ -48,5 +56,4 @@ ditto -c -k --keepParent dist/ollama dist/temp.zip
 if [ -n "$APPLE_IDENTITY" ]; then
    xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
 fi
-mv dist/ollama dist/ollama-darwin
 rm -f dist/temp.zip
Author	SHA1	Message	Date
Blake Mizerany	2ddc32d5c5	llm: do not error on "null" format (#8139 ) This fixes another regression in the previous commit that fixed other known bugs.	2024-12-17 09:49:37 -08:00
Jascha Beste	2cde4b8817	readme: change getting started guide link for pgai (#8119 )	2024-12-16 22:13:23 -08:00
Blake Mizerany	87f0a49fe6	llm: do not silently fail for supplied, but invalid formats (#8130 ) Changes in #8002 introduced fixes for bugs with mangling JSON Schemas. It also fixed a bug where the server would silently fail when clients requested invalid formats. It also, unfortunately, introduced a bug where the server would reject requests with an empty format, which should be allowed. The change in #8127 updated the code to allow the empty format, but also reintroduced the regression where the server would silently fail when the format was set, but invalid. This commit fixes both regressions. The server does not reject the empty format, but it does reject invalid formats. It also adds tests to help us catch regressions in the future. Also, the updated code provides a more detailed error message when a client sends a non-empty, but invalid format, echoing the invalid format in the response. This commits also takes the opportunity to remove superfluous linter checks.	2024-12-16 21:57:49 -08:00
Jeffrey Morgan	0f06a6daa7	llm: loosen format check to default to no format (#8127 )	2024-12-16 18:45:46 -08:00
Daniel Hiltgen	8f805dd74b	darwin: restore multiple runners for x86 (#8125 ) In 0.5.2 we simplified packaging to have avx only for macos x86. It looks like there may still be some non-AVX systems out there, so this puts back the prior logic of building no-AVX for the primary binary, and now 2 runners for avx and avx2. These will be packaged in the App bundle only, so the stand-alone binary will now be without AVX support on macos. On arm, we'll also see these runners reported as available in the log, but they're dormant and will never be used at runtime.	2024-12-16 18:45:02 -08:00
Michael	89d5e2f2fd	readme: example/get started guide for pgai with Ollama (#8115 ) readme: example/get started guide for pgai with Ollama	2024-12-16 17:14:37 +08:00
Jascha Beste	297ada6c87	readme: add pgai to readme for semantic search (#8028 ) * docs: switch around database integrations order and link to quickstart * docs: link to blog post in example readme * chore: link to main readme * readme: removing example to link externally readme: removing example to link externally so we don't have to keep this example up-to-date ---------	2024-12-16 17:02:28 +08:00