Sync vendored ggml to add Vulkan support
This commit is contained in:
parent
2d443b3dd6
commit
449e5c07ae
|
|
@ -32,7 +32,7 @@ PATCHES=$(wildcard llama/patches/*.patch)
|
|||
apply-patches: $(addsuffix ed, $(PATCHES))
|
||||
|
||||
%.patched: %.patch
|
||||
@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
|
||||
@if git -c commit.gpgSign=false -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
|
||||
|
||||
.PHONY: checkout
|
||||
checkout: $(WORKDIR)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,92 @@
|
|||
find_package(Vulkan COMPONENTS glslc REQUIRED)
|
||||
|
||||
if (Vulkan_FOUND)
|
||||
message(STATUS "Vulkan found")
|
||||
|
||||
ggml_add_backend_library(ggml-vulkan
|
||||
ggml-vulkan.cpp
|
||||
../../include/ggml-vulkan.h
|
||||
)
|
||||
|
||||
# Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported.
|
||||
# If it's not, there will be an error to stderr.
|
||||
# If it's supported, set a define to indicate that we should compile those shaders
|
||||
execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp"
|
||||
OUTPUT_VARIABLE glslc_output
|
||||
ERROR_VARIABLE glslc_error)
|
||||
|
||||
if (${glslc_error} MATCHES ".*extension not supported: GL_NV_cooperative_matrix2.*")
|
||||
message(STATUS "GL_NV_cooperative_matrix2 not supported by glslc")
|
||||
else()
|
||||
message(STATUS "GL_NV_cooperative_matrix2 supported by glslc")
|
||||
add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
endif()
|
||||
|
||||
target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
|
||||
target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
|
||||
# Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
|
||||
if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
||||
add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN_CHECK_RESULTS)
|
||||
add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN_DEBUG)
|
||||
add_compile_definitions(GGML_VULKAN_DEBUG)
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN_MEMORY_DEBUG)
|
||||
add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN_SHADER_DEBUG_INFO)
|
||||
add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN_PERF)
|
||||
add_compile_definitions(GGML_VULKAN_PERF)
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN_VALIDATE)
|
||||
add_compile_definitions(GGML_VULKAN_VALIDATE)
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN_RUN_TESTS)
|
||||
add_compile_definitions(GGML_VULKAN_RUN_TESTS)
|
||||
endif()
|
||||
|
||||
add_subdirectory(vulkan-shaders)
|
||||
|
||||
set (_ggml_vk_genshaders_cmd vulkan-shaders-gen)
|
||||
set (_ggml_vk_header ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
|
||||
set (_ggml_vk_source ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp)
|
||||
set (_ggml_vk_input_dir ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders)
|
||||
set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv)
|
||||
|
||||
file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${_ggml_vk_header}
|
||||
${_ggml_vk_source}
|
||||
|
||||
COMMAND "$<TARGET_FILE_DIR:vulkan-shaders-gen>/${_ggml_vk_genshaders_cmd}"
|
||||
--glslc ${Vulkan_GLSLC_EXECUTABLE}
|
||||
--input-dir ${_ggml_vk_input_dir}
|
||||
--output-dir ${_ggml_vk_output_dir}
|
||||
--target-hpp ${_ggml_vk_header}
|
||||
--target-cpp ${_ggml_vk_source}
|
||||
--no-clean
|
||||
|
||||
DEPENDS ${_ggml_vk_shader_deps} ${_ggml_vk_genshaders_cmd}
|
||||
COMMENT "Generate vulkan shaders"
|
||||
)
|
||||
|
||||
target_sources(ggml-vulkan PRIVATE ${_ggml_vk_source} ${_ggml_vk_header})
|
||||
|
||||
else()
|
||||
message(WARNING "Vulkan not found")
|
||||
endif()
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,9 @@
|
|||
find_package (Threads REQUIRED)
|
||||
find_package(Vulkan COMPONENTS glslc REQUIRED)
|
||||
|
||||
set(TARGET vulkan-shaders-gen)
|
||||
add_executable(${TARGET} vulkan-shaders-gen.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
|
||||
target_link_libraries(vulkan-shaders-gen PRIVATE Vulkan::Vulkan)
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_binary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = gl_GlobalInvocationID.x;
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint offset = p.param3;
|
||||
const uint src1_i = idx - offset;
|
||||
const uint oz = src1_i / p.nb02;
|
||||
const uint oy = (src1_i - (oz * p.nb02)) / p.nb01;
|
||||
const uint ox = src1_i % p.nb01;
|
||||
|
||||
uint i00, i01, i02, i03;
|
||||
get_indices(idx, i00, i01, i02, i03);
|
||||
|
||||
if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
|
||||
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
|
||||
} else {
|
||||
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_binary_head.comp"
|
||||
|
||||
const uint num_threads = 256;
|
||||
|
||||
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
uint idx = get_idx();
|
||||
|
||||
// num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
|
||||
const uint num_iter = 2;
|
||||
|
||||
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||
if (idx >= p.ne) {
|
||||
continue;
|
||||
}
|
||||
uint i00, i01, i02, i03;
|
||||
get_indices(idx, i00, i01, i02, i03);
|
||||
|
||||
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
|
||||
|
||||
idx += num_threads;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
#define BLOCK_SIZE 1024
|
||||
#define ASC 0
|
||||
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) buffer D {int data_d[];};
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint ncols;
|
||||
uint ncols_pad;
|
||||
uint order;
|
||||
} p;
|
||||
|
||||
shared int dst_row[BLOCK_SIZE];
|
||||
|
||||
void swap(uint idx0, uint idx1) {
|
||||
int tmp = dst_row[idx0];
|
||||
dst_row[idx0] = dst_row[idx1];
|
||||
dst_row[idx1] = tmp;
|
||||
}
|
||||
|
||||
void main() {
|
||||
// bitonic sort
|
||||
const int col = int(gl_LocalInvocationID.x);
|
||||
const uint row = gl_WorkGroupID.y;
|
||||
|
||||
const uint row_offset = row * p.ncols;
|
||||
|
||||
// initialize indices
|
||||
if (col < p.ncols_pad) {
|
||||
dst_row[col] = col;
|
||||
}
|
||||
barrier();
|
||||
|
||||
for (uint k = 2; k <= p.ncols_pad; k *= 2) {
|
||||
for (uint j = k / 2; j > 0; j /= 2) {
|
||||
const uint ixj = col ^ j;
|
||||
if (col < p.ncols_pad && ixj > col) {
|
||||
if ((col & k) == 0) {
|
||||
if (dst_row[col] >= p.ncols ||
|
||||
(dst_row[ixj] < p.ncols && (p.order == ASC ?
|
||||
data_a[row_offset + dst_row[col]] > data_a[row_offset + dst_row[ixj]] :
|
||||
data_a[row_offset + dst_row[col]] < data_a[row_offset + dst_row[ixj]]))
|
||||
) {
|
||||
swap(col, ixj);
|
||||
}
|
||||
} else {
|
||||
if (dst_row[ixj] >= p.ncols ||
|
||||
(dst_row[col] < p.ncols && (p.order == ASC ?
|
||||
data_a[row_offset + dst_row[col]] < data_a[row_offset + dst_row[ixj]] :
|
||||
data_a[row_offset + dst_row[col]] > data_a[row_offset + dst_row[ixj]]))
|
||||
) {
|
||||
swap(col, ixj);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
}
|
||||
|
||||
if (col < p.ncols) {
|
||||
data_d[row_offset + col] = dst_row[col];
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
|
||||
}
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_binary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
const int dim = p.param3;
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i3 = idx / (p.ne22*p.ne21*p.ne20);
|
||||
const uint i3_offset = i3 * p.ne22*p.ne21*p.ne20;
|
||||
const uint i2 = (idx - i3_offset) / (p.ne21*p.ne20);
|
||||
const uint i2_offset = i2*p.ne21*p.ne20;
|
||||
const uint i1 = (idx - i3_offset - i2_offset) / p.ne20;
|
||||
const uint i0 = idx - i3_offset - i2_offset - i1*p.ne20;
|
||||
|
||||
uint o[4] = {0, 0, 0, 0};
|
||||
o[dim] = dim == 0 ? p.ne00 : (dim == 1 ? p.ne01 : (dim == 2 ? p.ne02 : p.ne03));
|
||||
|
||||
const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
|
||||
const uint src1_idx = (i3 - o[3])*p.nb13 + (i2 - o[2])*p.nb12 + (i1 - o[1])*p.nb11 + (i0 - o[0])*p.nb10;
|
||||
const uint dst_idx = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20;
|
||||
|
||||
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
|
||||
|
||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||
data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : data_b[get_boffset() + src1_idx]);
|
||||
#else
|
||||
if (is_src0) {
|
||||
data_d[get_doffset() + dst_idx] = data_a[get_aoffset() + src0_idx];
|
||||
} else {
|
||||
data_d[get_doffset() + dst_idx] = data_b[get_boffset() + src1_idx];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
|
||||
const uint num_threads = 128;
|
||||
|
||||
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
uint idx = get_idx();
|
||||
|
||||
// num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
|
||||
const uint num_iter = 4;
|
||||
|
||||
// fast path for when all four iterations are in-bounds
|
||||
if (idx + (num_iter-1)*num_threads < p.ne) {
|
||||
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||
data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
|
||||
#else
|
||||
data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
|
||||
#endif
|
||||
idx += num_threads;
|
||||
}
|
||||
} else {
|
||||
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||
if (idx >= p.ne) {
|
||||
continue;
|
||||
}
|
||||
|
||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||
data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
|
||||
#else
|
||||
data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
|
||||
#endif
|
||||
idx += num_threads;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
#else
|
||||
data_d[get_doffset() + dst_idx(idx)] = data_a[get_aoffset() + src0_idx(idx)];
|
||||
#endif
|
||||
}
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(cos(val));
|
||||
}
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
#version 450
|
||||
|
||||
#include "dequant_head.comp"
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {float data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.x * 16;
|
||||
|
||||
if (i >= p.nel) {
|
||||
return;
|
||||
}
|
||||
|
||||
[[unroll]] for (uint l = 0; l < 16; l++) {
|
||||
data_b[i + l] = D_TYPE(data_a[i + l]);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,118 @@
|
|||
#if !defined(DATA_A_F32) && !defined(DATA_A_F16)
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
|
||||
#endif
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
#if defined(A_TYPE_PACKED16)
|
||||
layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];};
|
||||
#endif
|
||||
#if defined(A_TYPE_PACKED32)
|
||||
layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_F32)
|
||||
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||
return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_F16)
|
||||
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||
return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q4_0)
|
||||
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||
return (vec2(vui & 0xF, vui >> 4) - 8.0f);
|
||||
}
|
||||
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
|
||||
return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12) - 8.0f);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q4_1)
|
||||
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||
return vec2(vui & 0xF, vui >> 4);
|
||||
}
|
||||
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
|
||||
return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q5_0)
|
||||
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||
const uint uint_qh = uint(data_a[a_offset + ib].qh[1]) << 16 | data_a[a_offset + ib].qh[0];
|
||||
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
|
||||
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||
return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f);
|
||||
}
|
||||
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||
const uint uint_qh = uint(data_a_packed16[a_offset + ib].qh[1]) << 16 | data_a_packed16[a_offset + ib].qh[0];
|
||||
const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
|
||||
const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
|
||||
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
|
||||
return (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q5_1)
|
||||
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||
const uint uint_qh = data_a[a_offset + ib].qh;
|
||||
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
|
||||
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||
return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y);
|
||||
}
|
||||
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||
const uint uint_qh = data_a_packed16[a_offset + ib].qh;
|
||||
const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
|
||||
const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
|
||||
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
|
||||
return vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q8_0)
|
||||
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||
return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1]));
|
||||
}
|
||||
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||
uint32_t v0 = data_a_packed16[a_offset + ib].qs[iqs/2];
|
||||
uint32_t v1 = data_a_packed16[a_offset + ib].qs[iqs/2 + 1];
|
||||
return vec4(int8_t(v0 & 0xFF), int8_t(v0 >> 8), int8_t(v1 & 0xFF), int8_t(v1 >> 8));
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_IQ4_NL)
|
||||
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||
return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]);
|
||||
}
|
||||
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
|
||||
return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[vui >> 12]);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_F32) || defined(DATA_A_F16)
|
||||
vec2 get_dm(uint ib, uint a_offset) {
|
||||
return vec2(0, 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ4_NL)
|
||||
vec2 get_dm(uint ib, uint a_offset) {
|
||||
return vec2(float(data_a[a_offset + ib].d), 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q4_1) || defined(DATA_A_Q5_1)
|
||||
vec2 get_dm(uint ib, uint a_offset) {
|
||||
return vec2(float(data_a[a_offset + ib].d), float(data_a[a_offset + ib].m));
|
||||
}
|
||||
#endif
|
||||
|
|
@ -0,0 +1,325 @@
|
|||
|
||||
#include "types.comp"
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 {
|
||||
block_q4_0_packed16 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint shift = (idx & 0x10) >> 2;
|
||||
uint32_t qs = uint32_t(bl.block.qs[(idx & 0xE) >> 1]);
|
||||
qs >>= shift;
|
||||
qs &= 0x0F0F;
|
||||
qs = unpack8(qs)[idx & 1];
|
||||
float16_t ret = (float16_t(qs) - float16_t(8)) * d;
|
||||
return ret;
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1 {
|
||||
block_q4_1 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
const float16_t m = bl.block.m;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint iqs = idx & 0xF;
|
||||
const uint shift = (idx & 0x10) >> 2;
|
||||
uint32_t qs = bl.block.qs[iqs];
|
||||
qs >>= shift;
|
||||
qs &= 0xF;
|
||||
float16_t ret = float16_t(qs) * d + m;
|
||||
return ret;
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0 {
|
||||
block_q5_0 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint iqs = idx & 0xF;
|
||||
|
||||
const uint uint_qh = uint(bl.block.qh[1]) << 16 | bl.block.qh[0];
|
||||
const uint qh = ((uint_qh >> idx) << 4) & 0x10;
|
||||
|
||||
const uint shift = (idx & 0x10) >> 2;
|
||||
uint32_t qs = bl.block.qs[iqs];
|
||||
qs >>= shift;
|
||||
qs &= 0xF;
|
||||
|
||||
float16_t ret = (float16_t(qs | qh) - float16_t(16)) * d;
|
||||
return ret;
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1 {
|
||||
block_q5_1 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
const float16_t m = bl.block.m;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint iqs = idx & 0xF;
|
||||
|
||||
const uint uint_qh = bl.block.qh;
|
||||
const uint qh = ((uint_qh >> idx) << 4) & 0x10;
|
||||
|
||||
const uint shift = (idx & 0x10) >> 2;
|
||||
uint32_t qs = bl.block.qs[iqs];
|
||||
qs >>= shift;
|
||||
qs &= 0xF;
|
||||
|
||||
float16_t ret = float16_t(qs | qh) * d + m;
|
||||
return ret;
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ8_0 {
|
||||
block_q8_0_packed16 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ8_0(const in decodeBufQ8_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint iqs = idx;
|
||||
|
||||
// Load 16b and select the byte for this element
|
||||
int32_t qs = unpack8(int32_t(bl.block.qs[(iqs & 0x1E) >> 1]))[iqs & 1];
|
||||
float16_t ret = float16_t(qs) * d;
|
||||
return ret;
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K {
|
||||
block_q2_K block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const f16vec2 d = bl.block.d;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint iqs = idx;
|
||||
|
||||
const uint qsi = (iqs / 128) * 32 + (iqs % 32); // 0..31
|
||||
const uint scalesi = iqs / 16; // 0..15
|
||||
const uint qsshift = ((iqs % 128) / 32) * 2; // 0,2,4,6
|
||||
|
||||
uint32_t qs = bl.block.qs[qsi];
|
||||
const uint scales = bl.block.scales[scalesi];
|
||||
float16_t ret = d.x * float16_t(scales & 0xF) * float16_t((qs >> qsshift) & 3) - d.y * float16_t(scales >> 4);
|
||||
return ret;
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K {
|
||||
block_q3_K block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint iqs = idx;
|
||||
|
||||
const uint n = iqs / 128; // 0,1
|
||||
const uint qsi = n * 32 + (iqs % 32); // 0..63
|
||||
const uint hmi = (iqs % 32); // 0..31
|
||||
const uint j = (iqs % 128) / 8; // 0..15
|
||||
const uint is = iqs / 16; // 0..15
|
||||
const uint halfsplit = ((iqs % 128) / 32); // 0,1,2,3
|
||||
const uint qsshift = halfsplit * 2; // 0,2,4,6
|
||||
const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128
|
||||
|
||||
uint32_t scaleidx0 = (is < 8) ? is : (is-8);
|
||||
uint32_t scaleidx0shift = (is < 8) ? 0 : 4;
|
||||
uint32_t scaleidx1 = is + 8 - (is/4)*4;
|
||||
uint32_t scaleidx1shift = (is/4)*2;
|
||||
|
||||
const int8_t us = int8_t(((bl.block.scales[scaleidx0] >> scaleidx0shift) & 0xF) | (((bl.block.scales[scaleidx1] >> scaleidx1shift) & 3) << 4));
|
||||
|
||||
const float16_t dl = bl.block.d * float16_t(us - 32);
|
||||
|
||||
float16_t ret = dl * float16_t(int8_t((bl.block.qs[qsi ] >> qsshift) & 3) - (((bl.block.hmask[hmi ] & m) != 0) ? 0 : 4));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K {
|
||||
block_q4_K block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed16 {
|
||||
block_q4_K_packed16 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ4_K_packed16 bl16 = decodeBufQ4_K_packed16(bl);
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
const uint b = (idx & 0x20) >> 5; // 0,1
|
||||
const uint is = (idx & 0xE0) >> 5; // 0..7
|
||||
|
||||
const f16vec2 loadd = bl.block.d;
|
||||
|
||||
uint32_t sc;
|
||||
uint32_t mbyte;
|
||||
|
||||
uint32_t scidx0 = (is < 4) ? is : (is + 4);
|
||||
uint32_t scidx1 = (is < 4) ? is : (is - 4);
|
||||
uint32_t scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
uint32_t scidxshift1 = (is < 4) ? 0 : 2;
|
||||
uint32_t mbidx0 = is + 4;
|
||||
uint32_t mbidx1 = (is < 4) ? is + 4 : is;
|
||||
uint32_t mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||
uint32_t mbidxshift0 = (is < 4) ? 0 : 4;
|
||||
uint32_t mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
uint32_t mbidxshift1 = (is < 4) ? 0 : 2;
|
||||
|
||||
sc = uint8_t((bl.block.scales[scidx0] & 0xF) | ((bl.block.scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||
mbyte = uint8_t(((bl.block.scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((bl.block.scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||
|
||||
const float16_t d = loadd.x * float16_t(sc);
|
||||
const float16_t m = loadd.y * float16_t(mbyte);
|
||||
|
||||
uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
|
||||
qs = (qs >> (b * 4)) & 0x0F0F;
|
||||
qs = unpack8(qs)[idx & 1];
|
||||
|
||||
float16_t ret = d * float16_t(qs) - m;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K {
|
||||
block_q5_K block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed16 {
|
||||
block_q5_K_packed16 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl);
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
const uint b = (idx & 0x20) >> 5; // 0,1
|
||||
const uint is = (idx & 0xE0) >> 5; // 0..7
|
||||
|
||||
const uint32_t hm = 0x0101 << is;
|
||||
|
||||
const f16vec2 loadd = bl.block.d;
|
||||
|
||||
uint32_t sc;
|
||||
uint32_t mbyte;
|
||||
|
||||
uint32_t scidx0 = (is < 4) ? is : (is + 4);
|
||||
uint32_t scidx1 = (is < 4) ? is : (is - 4);
|
||||
uint32_t scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
uint32_t scidxshift1 = (is < 4) ? 0 : 2;
|
||||
uint32_t mbidx0 = is + 4;
|
||||
uint32_t mbidx1 = (is < 4) ? is + 4 : is;
|
||||
uint32_t mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||
uint32_t mbidxshift0 = (is < 4) ? 0 : 4;
|
||||
uint32_t mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
uint32_t mbidxshift1 = (is < 4) ? 0 : 2;
|
||||
|
||||
sc = uint8_t((bl.block.scales[scidx0] & 0xF) | ((bl.block.scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||
mbyte = uint8_t(((bl.block.scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((bl.block.scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||
|
||||
const float16_t d = loadd.x * float16_t(sc);
|
||||
const float16_t m = loadd.y * float16_t(mbyte);
|
||||
|
||||
uint qh = uint32_t(bl16.block.qh[(idx & 0x1E) >> 1]);
|
||||
qh = qh & hm;
|
||||
qh = unpack8(qh)[idx & 1];
|
||||
|
||||
uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
|
||||
qs = (qs >> (b * 4)) & 0x0F0F;
|
||||
qs = unpack8(qs)[idx & 1];
|
||||
|
||||
float16_t ret = d * (float16_t(qs) + (qh != 0 ? float16_t(16) : float16_t(0))) - m;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_K {
|
||||
block_q6_K block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ6_K_packed16 {
|
||||
block_q6_K_packed16 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ6_K_packed16 bl16 = decodeBufQ6_K_packed16(bl);
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
const uint b = (idx & 0x40) >> 6; // 0,1
|
||||
const uint qhshift = (idx & 0x60) >> 4; // 0,2,4,6
|
||||
const uint is = (idx & 0xF0) >> 4; // 0..15
|
||||
|
||||
const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]);
|
||||
|
||||
uint ql = uint32_t(bl16.block.ql[((idx & 0x80) >> 2) + ((idx & 0x3E) >> 1)]);
|
||||
ql = (ql >> (b * 4)) & 0x0F0F;
|
||||
|
||||
uint qh = uint32_t(bl16.block.qh[((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1)]);
|
||||
qh = ((qh >> qhshift) & 0x0303) << 4;
|
||||
|
||||
int q = unpack8(ql | qh)[idx & 1];
|
||||
|
||||
float16_t ret = dscale * float16_t(q - 32);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if defined(DATA_A_IQ4_NL)
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL {
|
||||
block_iq4_nl block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint iqs = idx & 0xF;
|
||||
const uint shift = (idx & 0x10) >> 2;
|
||||
uint32_t qs = bl.block.qs[iqs];
|
||||
qs >>= shift;
|
||||
qs &= 0xF;
|
||||
float16_t ret = float16_t(kvalues_iq4nl[qs]) * d;
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q4_0)
|
||||
#define dequantFuncA dequantFuncQ4_0
|
||||
#elif defined(DATA_A_Q4_1)
|
||||
#define dequantFuncA dequantFuncQ4_1
|
||||
#elif defined(DATA_A_Q5_0)
|
||||
#define dequantFuncA dequantFuncQ5_0
|
||||
#elif defined(DATA_A_Q5_1)
|
||||
#define dequantFuncA dequantFuncQ5_1
|
||||
#elif defined(DATA_A_Q8_0)
|
||||
#define dequantFuncA dequantFuncQ8_0
|
||||
#elif defined(DATA_A_Q2_K)
|
||||
#define dequantFuncA dequantFuncQ2_K
|
||||
#elif defined(DATA_A_Q3_K)
|
||||
#define dequantFuncA dequantFuncQ3_K
|
||||
#elif defined(DATA_A_Q4_K)
|
||||
#define dequantFuncA dequantFuncQ4_K
|
||||
#elif defined(DATA_A_Q5_K)
|
||||
#define dequantFuncA dequantFuncQ5_K
|
||||
#elif defined(DATA_A_Q6_K)
|
||||
#define dequantFuncA dequantFuncQ6_K
|
||||
#elif defined(DATA_A_IQ4_NL)
|
||||
#define dequantFuncA dequantFuncIQ4_NL
|
||||
#endif
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
#extension GL_EXT_control_flow_attributes : require
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint M;
|
||||
uint K;
|
||||
uint stride_a;
|
||||
uint stride_b;
|
||||
uint nel;
|
||||
} p;
|
||||
|
||||
#include "types.comp"
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
#version 450
|
||||
|
||||
#include "dequant_head.comp"
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
|
||||
|
||||
init_iq4nl_shmem();
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x % 64;
|
||||
const uint il = tid/32;
|
||||
const uint ir = tid%32;
|
||||
const uint ib = 32*i + ir;
|
||||
if (ib >= p.nel / 32) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint q_idx = 8*il;
|
||||
const uint b_idx = 1024*i + 32*ir + q_idx;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
|
||||
[[unroll]] for (uint l = 0; l < 8; ++l) {
|
||||
data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
|
||||
data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
#version 450
|
||||
|
||||
#include "dequant_head.comp"
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
void main() {
|
||||
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
|
||||
const uint i = gl_WorkGroupID.x * 256 + wgy;
|
||||
if (i >= p.M * p.K / QUANT_K) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint ip = tid / 32;
|
||||
const uint il = tid - 32 * ip;
|
||||
const uint is = 8 * ip + il / 16;
|
||||
|
||||
const uint y_idx = i * QUANT_K + 128 * ip + il;
|
||||
|
||||
const uint ql_idx = 32 * ip + il;
|
||||
const uint8_t qs = data_a[i].qs[32 * ip + il];
|
||||
|
||||
FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
|
||||
FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
|
||||
data_b[y_idx + 0] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+0] & 0xF) * ((qs >> 0) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+0] >> 4));
|
||||
data_b[y_idx + 32] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+2] & 0xF) * ((qs >> 2) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+2] >> 4));
|
||||
data_b[y_idx + 64] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+4] & 0xF) * ((qs >> 4) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+4] >> 4));
|
||||
data_b[y_idx + 96] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+6] & 0xF) * ((qs >> 6) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+6] >> 4));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
#version 450
|
||||
|
||||
#include "dequant_head.comp"
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
void main() {
|
||||
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
|
||||
const uint i = uint(gl_WorkGroupID.x * 256 + wgy);
|
||||
if (i >= p.M * p.K / QUANT_K) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint r = gl_LocalInvocationID.x / 4;
|
||||
const uint tid = r / 2;
|
||||
const uint is0 = r % 2;
|
||||
const uint l0 = 16 * is0 + 4 * (gl_LocalInvocationID.x % 4);
|
||||
const uint n = tid / 4;
|
||||
const uint j = tid - 4*n;
|
||||
|
||||
const uint8_t m = uint8_t(1 << (4*n + j));
|
||||
const uint is = 8*n + 2*j + is0;
|
||||
const uint shift = 2*j;
|
||||
|
||||
const int8_t us = int8_t(is < 4 ? (data_a[i].scales[is-0] & 0xF) | (((data_a[i].scales[is+8] >> 0) & 3) << 4) :
|
||||
is < 8 ? (data_a[i].scales[is-0] & 0xF) | (((data_a[i].scales[is+4] >> 2) & 3) << 4) :
|
||||
is < 12 ? (data_a[i].scales[is-8] >> 4) | (((data_a[i].scales[is+0] >> 4) & 3) << 4) :
|
||||
(data_a[i].scales[is-8] >> 4) | (((data_a[i].scales[is-4] >> 6) & 3) << 4));
|
||||
const FLOAT_TYPE d_all = FLOAT_TYPE(data_a[i].d);
|
||||
const FLOAT_TYPE dl = d_all * FLOAT_TYPE(us - 32);
|
||||
|
||||
const uint y_idx = i * QUANT_K + 128 * n + 32 * j;
|
||||
const uint qs_idx = 32*n;
|
||||
|
||||
for (uint l = l0; l < l0 + 4; ++l) {
|
||||
data_b[y_idx + l] = D_TYPE(dl * FLOAT_TYPE(int8_t((data_a[i].qs[qs_idx + l] >> shift) & 3) - (((data_a[i].hmask[l] & m) != 0) ? 0 : 4)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
#version 450
|
||||
|
||||
#include "dequant_head.comp"
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {block_q4_0 data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x % 64;
|
||||
const uint il = tid/32;
|
||||
const uint ir = tid%32;
|
||||
const uint ib = 32*i + ir;
|
||||
if (ib >= p.nel / 32) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint q_idx = 8*il;
|
||||
const uint b_idx = 1024*i + 32*ir + q_idx;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
|
||||
[[unroll]] for (uint l = 0; l < 8; ++l) {
|
||||
data_b[b_idx + l + 0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f));
|
||||
data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >> 4) - 8.0f));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
#version 450
|
||||
|
||||
#include "dequant_head.comp"
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {block_q4_1 data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x % 64;
|
||||
const uint il = tid/32;
|
||||
const uint ir = tid%32;
|
||||
const uint ib = 32*i + ir;
|
||||
if (ib >= p.nel / 32) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint b_idx = 1024*i + 32*ir + 8*il;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const float m = float(data_a[ib].m);
|
||||
|
||||
const uint q_idx = 8*il;
|
||||
|
||||
[[unroll]] for (uint l = 0; l < 8; ++l) {
|
||||
data_b[b_idx + l + 0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + m);
|
||||
data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >> 4) + m);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
#version 450
|
||||
|
||||
#include "dequant_head.comp"
|
||||
|
||||
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
void main() {
|
||||
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
|
||||
const uint ib = gl_WorkGroupID.x * 256 + wgy;
|
||||
if (ib >= p.M * p.K / QUANT_K) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint il = tid / 8;
|
||||
const uint ir = tid % 8;
|
||||
const uint is = 2 * il;
|
||||
const uint n = 4;
|
||||
|
||||
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].d.x);
|
||||
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].d.y);
|
||||
|
||||
const uint y_idx = ib * QUANT_K + 64 * il + n * ir;
|
||||
const uint qs_idx = 32*il + n * ir;
|
||||
|
||||
uint scidx0 = (is < 4) ? is : (is + 4);
|
||||
uint scidx1 = (is < 4) ? is : (is - 4);
|
||||
uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
uint scidxshift1 = (is < 4) ? 0 : 2;
|
||||
uint mbidx0 = is + 4;
|
||||
uint mbidx1 = (is < 4) ? is + 4 : is;
|
||||
uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||
uint mbidxshift0 = (is < 4) ? 0 : 4;
|
||||
uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
uint mbidxshift1 = (is < 4) ? 0 : 2;
|
||||
|
||||
uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||
uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||
|
||||
const FLOAT_TYPE d1 = dall * sc;
|
||||
const FLOAT_TYPE m1 = dmin * mbyte;
|
||||
|
||||
scidx0 = (is < 4) ? is + 1 : (is + 5);
|
||||
scidx1 = (is < 4) ? is + 1 : (is - 3);
|
||||
scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
scidxshift1 = (is < 4) ? 0 : 2;
|
||||
mbidx0 = is + 5;
|
||||
mbidx1 = (is < 4) ? is + 5 : is + 1;
|
||||
mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||
mbidxshift0 = (is < 4) ? 0 : 4;
|
||||
mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
mbidxshift1 = (is < 4) ? 0 : 2;
|
||||
|
||||
sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||
mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||
|
||||
const FLOAT_TYPE d2 = dall * sc;
|
||||
const FLOAT_TYPE m2 = dmin * mbyte;
|
||||
|
||||
[[unroll]] for (uint l = 0; l < n; ++l) {
|
||||
data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[ib].qs[qs_idx + l] & 0xF) - m1);
|
||||
data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[ib].qs[qs_idx + l] >> 4) - m2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
#version 450
|
||||
|
||||
#include "dequant_head.comp"
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {block_q5_0 data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x % 64;
|
||||
const uint il = tid/32;
|
||||
const uint ir = tid%32;
|
||||
const uint ib = 32*i + ir;
|
||||
if (ib >= p.nel / 32) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint b_idx = 1024*i + 32*ir + 8*il;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0];
|
||||
|
||||
const uint q_idx = 8*il;
|
||||
|
||||
[[unroll]] for (uint l = 0; l < 8; ++l) {
|
||||
const uint iqs = q_idx + l;
|
||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
||||
data_b[b_idx + l + 0] = D_TYPE(d * (((vui & 0xF) | (((qh >> iqs) << 4) & 0x10)) - 16.0f));
|
||||
data_b[b_idx + l + 16] = D_TYPE(d * (((vui >> 4) | ((qh >> (iqs + 12)) & 0x10)) - 16.0f));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,35 @@
|
|||
#version 450
|
||||
|
||||
#include "dequant_head.comp"
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {block_q5_1 data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x % 64;
|
||||
const uint il = tid/32;
|
||||
const uint ir = tid%32;
|
||||
const uint ib = 32*i + ir;
|
||||
if (ib >= p.nel / 32) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint b_idx = 1024*i + 32*ir + 8*il;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const float m = float(data_a[ib].m);
|
||||
const uint qh = data_a[ib].qh;
|
||||
|
||||
const uint q_idx = 8*il;
|
||||
|
||||
[[unroll]] for (uint l = 0; l < 8; ++l) {
|
||||
const uint iqs = q_idx + l;
|
||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
||||
data_b[b_idx + l + 0] = D_TYPE(d * (((vui & 0xF) | (((qh >> iqs) << 4) & 0x10))) + m);
|
||||
data_b[b_idx + l + 16] = D_TYPE(d * (((vui >> 4) | ((qh >> (iqs + 12)) & 0x10))) + m);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
#version 450
|
||||
|
||||
#include "dequant_head.comp"
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
void main() {
|
||||
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
|
||||
const uint ib = gl_WorkGroupID.x * 256 + wgy;
|
||||
if (ib >= p.M * p.K / QUANT_K) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint il = tid / 16;
|
||||
const uint ir = tid % 16;
|
||||
const uint is = 2 * il;
|
||||
|
||||
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].d.x);
|
||||
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].d.y);
|
||||
|
||||
const uint y_idx = ib * QUANT_K + 64 * il + 2 * ir;
|
||||
const uint qs_idx = 32*il + 2 * ir;
|
||||
const uint qh_idx = 2 * ir;
|
||||
|
||||
uint scidx0 = (is < 4) ? is : (is + 4);
|
||||
uint scidx1 = (is < 4) ? is : (is - 4);
|
||||
uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
uint scidxshift1 = (is < 4) ? 0 : 2;
|
||||
uint mbidx0 = is + 4;
|
||||
uint mbidx1 = (is < 4) ? is + 4 : is;
|
||||
uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||
uint mbidxshift0 = (is < 4) ? 0 : 4;
|
||||
uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
uint mbidxshift1 = (is < 4) ? 0 : 2;
|
||||
|
||||
uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||
uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||
|
||||
const FLOAT_TYPE d1 = dall * sc;
|
||||
const FLOAT_TYPE m1 = dmin * mbyte;
|
||||
|
||||
scidx0 = (is < 4) ? is + 1 : (is + 5);
|
||||
scidx1 = (is < 4) ? is + 1 : (is - 3);
|
||||
scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
scidxshift1 = (is < 4) ? 0 : 2;
|
||||
mbidx0 = is + 5;
|
||||
mbidx1 = (is < 4) ? is + 5 : is + 1;
|
||||
mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||
mbidxshift0 = (is < 4) ? 0 : 4;
|
||||
mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
mbidxshift1 = (is < 4) ? 0 : 2;
|
||||
|
||||
sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||
mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||
|
||||
const FLOAT_TYPE d2 = dall * sc;
|
||||
const FLOAT_TYPE m2 = dmin * mbyte;
|
||||
|
||||
const uint8_t hm1 = uint8_t(1 << (2 * il ));
|
||||
const uint8_t hm2 = uint8_t(1 << (2 * il + 1));
|
||||
data_b[y_idx ] = D_TYPE(d1 * FLOAT_TYPE((data_a[ib].qs[qs_idx ] & 0xF) + (((data_a[ib].qh[qh_idx ] & hm1) != 0) ? 16 : 0)) - m1);
|
||||
data_b[y_idx + 1] = D_TYPE(d1 * FLOAT_TYPE((data_a[ib].qs[qs_idx + 1] & 0xF) + (((data_a[ib].qh[qh_idx + 1] & hm1) != 0) ? 16 : 0)) - m1);
|
||||
data_b[y_idx + 32] = D_TYPE(d2 * FLOAT_TYPE((data_a[ib].qs[qs_idx ] >> 4) + (((data_a[ib].qh[qh_idx ] & hm2) != 0) ? 16 : 0)) - m2);
|
||||
data_b[y_idx + 33] = D_TYPE(d2 * FLOAT_TYPE((data_a[ib].qs[qs_idx + 1] >> 4) + (((data_a[ib].qh[qh_idx + 1] & hm2) != 0) ? 16 : 0)) - m2);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
#version 450
|
||||
|
||||
#include "dequant_head.comp"
|
||||
|
||||
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
void main() {
|
||||
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
|
||||
const uint i = gl_WorkGroupID.x * 256 + wgy;
|
||||
if (i >= p.M * p.K / QUANT_K) {
|
||||
return;
|
||||
}
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint ip = tid / 32;
|
||||
const uint il = tid - 32 * ip;
|
||||
const uint is = 8 * ip + il / 16;
|
||||
|
||||
const uint y_idx = i * QUANT_K + 128 * ip + il;
|
||||
|
||||
const uint ql_idx = 64 * ip + il;
|
||||
const uint8_t qh = data_a[i].qh[32 * ip + il];
|
||||
|
||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a[i].d);
|
||||
|
||||
data_b[y_idx + 0] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 0] * (int8_t((data_a[i].ql[ql_idx + 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32)));
|
||||
data_b[y_idx + 32] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 2] * (int8_t((data_a[i].ql[ql_idx + 32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)));
|
||||
data_b[y_idx + 64] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 4] * (int8_t((data_a[i].ql[ql_idx + 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32)));
|
||||
data_b[y_idx + 96] = D_TYPE(d * FLOAT_TYPE(data_a[i].scales[is + 6] * (int8_t((data_a[i].ql[ql_idx + 32] >> 4) | (((qh >> 6) & 3) << 4)) - 32)));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
#version 450
|
||||
|
||||
#include "dequant_head.comp"
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {block_q8_0 data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x % 64;
|
||||
const uint il = tid/32;
|
||||
const uint ir = tid%32;
|
||||
const uint ib = 32*i + ir;
|
||||
if (ib >= p.nel / 32) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint b_idx = 1024*i + 32*ir + 16*il;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
|
||||
const uint q_idx = 16*il;
|
||||
|
||||
[[unroll]] for (uint l = 0; l < 16; l += 2) {
|
||||
data_b[b_idx + l ] = D_TYPE(d * data_a[ib].qs[q_idx + l ]);
|
||||
data_b[b_idx + l + 1] = D_TYPE(d * data_a[ib].qs[q_idx + l + 1]);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint ncols;
|
||||
uint rows_per_channel;
|
||||
uint n_past;
|
||||
} p;
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint col = gl_GlobalInvocationID.y;
|
||||
const uint row = gl_GlobalInvocationID.x;
|
||||
|
||||
if (col >= p.ncols) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i = row*p.ncols + col;
|
||||
if (col > p.n_past + row % p.rows_per_channel) {
|
||||
data_d[i] = D_TYPE(uintBitsToFloat(0xFF800000));
|
||||
} else {
|
||||
data_d[i] = D_TYPE(data_a[i]);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_binary_head.comp"
|
||||
|
||||
const uint num_threads = 256;
|
||||
|
||||
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
uint idx = get_idx();
|
||||
|
||||
// num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
|
||||
const uint num_iter = 2;
|
||||
|
||||
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||
if (idx >= p.ne) {
|
||||
continue;
|
||||
}
|
||||
uint i00, i01, i02, i03;
|
||||
get_indices(idx, i00, i01, i02, i03);
|
||||
|
||||
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
|
||||
|
||||
idx += num_threads;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,289 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
|
||||
|
||||
#extension GL_KHR_memory_scope_semantics : enable
|
||||
#extension GL_KHR_cooperative_matrix : enable
|
||||
#extension GL_NV_cooperative_matrix2 : enable
|
||||
#extension GL_EXT_buffer_reference : enable
|
||||
#extension GL_KHR_shader_subgroup_ballot : enable
|
||||
#extension GL_KHR_shader_subgroup_vote : enable
|
||||
#extension GL_EXT_null_initializer : enable
|
||||
|
||||
#include "types.comp"
|
||||
#include "dequant_funcs_cm2.comp"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (constant_id = 1) const uint32_t Br = 32;
|
||||
layout (constant_id = 2) const uint32_t Bc = 32;
|
||||
layout (constant_id = 3) const uint32_t D = 32;
|
||||
layout (constant_id = 4) const uint32_t Clamp = gl_CooperativeMatrixClampModeConstantNV;
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint32_t N;
|
||||
uint32_t KV;
|
||||
|
||||
uint32_t ne1;
|
||||
uint32_t ne2;
|
||||
uint32_t ne3;
|
||||
|
||||
uint32_t neq2;
|
||||
uint32_t neq3;
|
||||
uint32_t nek2;
|
||||
uint32_t nek3;
|
||||
uint32_t nev2;
|
||||
uint32_t nev3;
|
||||
uint32_t nem1;
|
||||
|
||||
uint32_t nb02;
|
||||
uint32_t nb03;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
uint32_t nb22;
|
||||
uint32_t nb23;
|
||||
uint32_t nb31;
|
||||
|
||||
float scale;
|
||||
float max_bias;
|
||||
float logit_softcap;
|
||||
|
||||
uint32_t mask;
|
||||
uint32_t n_head_log2;
|
||||
float m0;
|
||||
float m1;
|
||||
} p;
|
||||
|
||||
layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
|
||||
layout (binding = 1) readonly buffer K {uint8_t data_k[];};
|
||||
layout (binding = 2) readonly buffer V {uint8_t data_v[];};
|
||||
layout (binding = 3) readonly buffer M {uint8_t data_m[];};
|
||||
layout (binding = 4) writeonly buffer O {D_TYPE data_o[];};
|
||||
|
||||
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
|
||||
|
||||
ACC_TYPE maxReduce(const in ACC_TYPE x, const in ACC_TYPE y) {
|
||||
return max(x, y);
|
||||
}
|
||||
|
||||
ACC_TYPE smearReduce(const in ACC_TYPE x, const in ACC_TYPE y) {
|
||||
return x;
|
||||
}
|
||||
|
||||
// Replace matrix elements >= numRows or numCols with 'replace'
|
||||
ACC_TYPE replacePadding(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem, const in ACC_TYPE replace, const in uint32_t numRows, const in uint32_t numCols) {
|
||||
if (row >= numRows || col >= numCols) {
|
||||
return replace;
|
||||
}
|
||||
return elem;
|
||||
}
|
||||
|
||||
ACC_TYPE Exp(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem)
|
||||
{
|
||||
return exp(elem);
|
||||
}
|
||||
|
||||
ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem0, const in ACC_TYPE elem1)
|
||||
{
|
||||
return max(elem0, elem1);
|
||||
}
|
||||
|
||||
#if defined(BLOCK_SIZE)
|
||||
#define DECODEFUNC , DEQUANTFUNC
|
||||
#else
|
||||
#define DECODEFUNC
|
||||
#endif
|
||||
|
||||
void main() {
|
||||
#if defined(DATA_A_IQ4_NL)
|
||||
init_iq4nl_shmem();
|
||||
#endif
|
||||
|
||||
const uint32_t N = p.N;
|
||||
const uint32_t KV = p.KV;
|
||||
|
||||
const uint32_t Tr = CEIL_DIV(N, Br);
|
||||
const uint32_t Tc = CEIL_DIV(KV, Bc);
|
||||
|
||||
const uint32_t i = gl_WorkGroupID.x;
|
||||
|
||||
const uint32_t iq2 = gl_WorkGroupID.y;
|
||||
const uint32_t iq3 = gl_WorkGroupID.z;
|
||||
|
||||
// broadcast factors
|
||||
const uint32_t rk2 = p.neq2/p.nek2;
|
||||
const uint32_t rk3 = p.neq3/p.nek3;
|
||||
|
||||
const uint32_t rv2 = p.neq2/p.nev2;
|
||||
const uint32_t rv3 = p.neq3/p.nev3;
|
||||
|
||||
// k indices
|
||||
const uint32_t ik3 = iq3 / rk3;
|
||||
const uint32_t ik2 = iq2 / rk2;
|
||||
|
||||
// v indices
|
||||
const uint32_t iv3 = iq3 / rv3;
|
||||
const uint32_t iv2 = iq2 / rv2;
|
||||
|
||||
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutQ = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
|
||||
tensorLayoutNV<2, Clamp> tensorLayoutK = createTensorLayoutNV(2, Clamp);
|
||||
tensorLayoutNV<2, Clamp> tensorLayoutV = createTensorLayoutNV(2, Clamp);
|
||||
|
||||
tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0);
|
||||
|
||||
#if defined(BLOCK_SIZE)
|
||||
tensorLayoutK = setTensorLayoutBlockSizeNV(tensorLayoutK, 1, BLOCK_SIZE);
|
||||
tensorLayoutV = setTensorLayoutBlockSizeNV(tensorLayoutV, 1, BLOCK_SIZE);
|
||||
#endif
|
||||
|
||||
tensorLayoutQ = setTensorLayoutDimensionNV(tensorLayoutQ, N, D);
|
||||
tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D);
|
||||
tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D);
|
||||
|
||||
coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA> Q;
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA> Qf16;
|
||||
|
||||
uint32_t q_offset = iq2*p.nb02+iq3*p.nb03;
|
||||
coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, D));
|
||||
|
||||
Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA>(Q);
|
||||
Qf16 *= float16_t(p.scale);
|
||||
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> O = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(0);
|
||||
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> L, M;
|
||||
|
||||
L = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
|
||||
M = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(-1.0/0.0);
|
||||
|
||||
ACC_TYPE slope = ACC_TYPE(1.0);
|
||||
|
||||
// ALiBi
|
||||
if (p.max_bias > 0.0f) {
|
||||
const uint32_t h = iq2;
|
||||
|
||||
const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1);
|
||||
const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1);
|
||||
|
||||
slope = pow(base, ACC_TYPE(exph));
|
||||
}
|
||||
|
||||
[[dont_unroll]]
|
||||
for (uint32_t j = 0; j < Tc; ++j) {
|
||||
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
|
||||
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, D, Bc, gl_MatrixUseB> K_T;
|
||||
|
||||
uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13;
|
||||
coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, D), tensorViewTranspose DECODEFUNC);
|
||||
S = coopMatMulAdd(Qf16, K_T, S);
|
||||
|
||||
if (p.logit_softcap != 0.0f) {
|
||||
[[unroll]]
|
||||
for (int k = 0; k < S.length(); ++k) {
|
||||
S[k] = ACC_TYPE(p.logit_softcap)*tanh(S[k]);
|
||||
}
|
||||
}
|
||||
|
||||
if (p.mask != 0) {
|
||||
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
|
||||
tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
|
||||
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
|
||||
|
||||
coopMatLoadTensorNV(mv, data_m, 0, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
|
||||
|
||||
S += slope*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
|
||||
}
|
||||
|
||||
// Clear padding elements to -inf, so they don't contribute to rowmax
|
||||
if (Clamp != 0 &&
|
||||
((j + 1) * Bc > KV ||
|
||||
(i + 1) * Br > N)) {
|
||||
|
||||
uint R = ((i + 1) * Br > N) ? (N % Br) : Br;
|
||||
uint C = ((j + 1) * Bc > KV) ? (KV % Bc) : Bc;
|
||||
|
||||
coopMatPerElementNV(S, S, replacePadding, ACC_TYPE(-1.0/0.0), R, C);
|
||||
}
|
||||
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> rowmax, P, rowsum, eM;
|
||||
|
||||
coopMatReduceNV(rowmax, S, gl_CooperativeMatrixReduceRowNV, maxReduce);
|
||||
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> Mold = M;
|
||||
|
||||
// M = max(rowmax, Mold)
|
||||
// P = e^(S - M)
|
||||
// eM = e^(Mold - M)
|
||||
coopMatPerElementNV(M, rowmax, Max, Mold);
|
||||
coopMatPerElementNV(P, S - M, Exp);
|
||||
coopMatPerElementNV(eM, Mold - M, Exp);
|
||||
|
||||
// Clear padding elements to 0, so they don't contribute to rowsum
|
||||
if (Clamp != 0 &&
|
||||
((j + 1) * Bc > KV ||
|
||||
(i + 1) * Br > N)) {
|
||||
|
||||
uint R = ((i + 1) * Br > N) ? (N % Br) : Br;
|
||||
uint C = ((j + 1) * Bc > KV) ? (KV % Bc) : Bc;
|
||||
|
||||
coopMatPerElementNV(P, P, replacePadding, ACC_TYPE(0.0), R, C);
|
||||
}
|
||||
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA> P_A = coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseA>(P);
|
||||
|
||||
// compute rowsum by multiplying by matrix of all ones.
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB> One = coopmat<float16_t, gl_ScopeWorkgroup, Bc, Bc, gl_MatrixUseB>(1.0);
|
||||
|
||||
rowsum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0.0);
|
||||
rowsum = coopMatMulAdd(P_A, One, rowsum);
|
||||
|
||||
coopmat<float16_t, gl_ScopeWorkgroup, Bc, D, gl_MatrixUseB> V;
|
||||
uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23;
|
||||
coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, D) DECODEFUNC);
|
||||
|
||||
L = eM*L + rowsum;
|
||||
|
||||
// This is the "diagonal" matrix in the paper, but since we do componentwise
|
||||
// multiply rather than matrix multiply it has the diagonal element smeared
|
||||
// across the row
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> eMdiag;
|
||||
|
||||
// resize eM by using smear/reduce
|
||||
coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce);
|
||||
|
||||
O = eMdiag * O;
|
||||
|
||||
O = coopMatMulAdd(P_A, V, O);
|
||||
}
|
||||
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> Ldiag;
|
||||
|
||||
// resize L by using smear/reduce
|
||||
coopMatReduceNV(Ldiag, L, gl_CooperativeMatrixReduceRowNV, smearReduce);
|
||||
|
||||
[[unroll]]
|
||||
for (int k = 0; k < Ldiag.length(); ++k) {
|
||||
Ldiag[k] = ACC_TYPE(1.0) / Ldiag[k];
|
||||
}
|
||||
|
||||
O = Ldiag*O;
|
||||
|
||||
tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV);
|
||||
tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, D);
|
||||
|
||||
// permute dimensions
|
||||
tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2);
|
||||
uint32_t o_offset = iq3*p.ne2*p.ne1;
|
||||
|
||||
coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(O);
|
||||
coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, 1, 0, D), tensorViewPermute);
|
||||
}
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
#version 450
|
||||
|
||||
#include "generic_head.comp"
|
||||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const float GELU_COEF_A = 0.044715f;
|
||||
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float xi = float(data_a[i]);
|
||||
const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi);
|
||||
data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1)));
|
||||
}
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
#version 450
|
||||
|
||||
#include "generic_head.comp"
|
||||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const float GELU_QUICK_COEF = -1.702f;
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float x = float(data_a[i]);
|
||||
data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))));
|
||||
}
|
||||
|
|
@ -0,0 +1,64 @@
|
|||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint ne;
|
||||
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
|
||||
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
||||
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
|
||||
uint misalign_offsets;
|
||||
float param1; float param2; int param3;
|
||||
} p;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
// true if src0/src1 are the same shape and the indices can be reused without additional modulus
|
||||
layout(constant_id = 0) const bool norepeat = false;
|
||||
|
||||
uint get_idx() {
|
||||
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
}
|
||||
|
||||
uint get_aoffset() { return p.misalign_offsets >> 16; }
|
||||
uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; }
|
||||
uint get_doffset() { return p.misalign_offsets & 0xFF; }
|
||||
|
||||
// mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1
|
||||
uint fastmod(uint a, uint b) {
|
||||
if ((b & (b-1)) == 0) {
|
||||
return a & (b-1);
|
||||
}
|
||||
return a % b;
|
||||
}
|
||||
|
||||
uint fastdiv(uint a, uint b) {
|
||||
return (a < b) ? 0 : (a / b);
|
||||
}
|
||||
|
||||
void get_indices(uint idx, out uint i00, out uint i01, out uint i02, out uint i03) {
|
||||
i03 = fastdiv(idx, (p.ne02*p.ne01*p.ne00));
|
||||
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
|
||||
i02 = fastdiv((idx - i03_offset), (p.ne01*p.ne00));
|
||||
const uint i02_offset = i02*p.ne01*p.ne00;
|
||||
i01 = (idx - i03_offset - i02_offset) / p.ne00;
|
||||
i00 = idx - i03_offset - i02_offset - i01*p.ne00;
|
||||
}
|
||||
|
||||
uint src0_idx(uint i00, uint i01, uint i02, uint i03) {
|
||||
return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
|
||||
}
|
||||
|
||||
uint src1_idx(uint i00, uint i01, uint i02, uint i03) {
|
||||
if (norepeat) {
|
||||
return i03*p.nb13 + i02*p.nb12 + i01*p.nb11 + i00*p.nb10;
|
||||
} else {
|
||||
return fastmod(i03, p.ne13)*p.nb13 + fastmod(i02, p.ne12)*p.nb12 + fastmod(i01, p.ne11)*p.nb11 + fastmod(i00, p.ne10)*p.nb10;
|
||||
}
|
||||
}
|
||||
|
||||
uint dst_idx(uint i00, uint i01, uint i02, uint i03) {
|
||||
return i03*p.nb23 + i02*p.nb22 + i01*p.nb21 + i00*p.nb20;
|
||||
}
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint KX;
|
||||
uint KY;
|
||||
float param1;
|
||||
float param2;
|
||||
} p;
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint ne;
|
||||
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
|
||||
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
||||
uint misalign_offsets;
|
||||
float param1; float param2;
|
||||
|
||||
uint ne0_012mp; uint ne0_012L;
|
||||
uint ne0_01mp; uint ne0_01L;
|
||||
uint ne0_0mp; uint ne0_0L;
|
||||
uint ne1_012mp; uint ne1_012L;
|
||||
uint ne1_01mp; uint ne1_01L;
|
||||
uint ne1_0mp; uint ne1_0L;
|
||||
} p;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
uint get_idx() {
|
||||
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
}
|
||||
|
||||
uint get_aoffset() { return p.misalign_offsets >> 16; }
|
||||
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
|
||||
|
||||
// see init_fastdiv_values in ggml-vulkan.cpp
|
||||
uint fastdiv(uint n, uint mp, uint L) {
|
||||
uint msbs, lsbs;
|
||||
// msbs = mulhi(n, mp)
|
||||
umulExtended(n, mp, msbs, lsbs);
|
||||
return (msbs + n) >> L;
|
||||
}
|
||||
|
||||
uint src0_idx(uint idx) {
|
||||
const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
|
||||
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
|
||||
const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
|
||||
const uint i02_offset = i02*p.ne01*p.ne00;
|
||||
const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
|
||||
const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
|
||||
return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
|
||||
}
|
||||
|
||||
uint dst_idx(uint idx) {
|
||||
const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
|
||||
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
|
||||
const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
|
||||
const uint i12_offset = i12*p.ne11*p.ne10;
|
||||
const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
|
||||
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
|
||||
return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
|
||||
}
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_binary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint i00 = gl_GlobalInvocationID.x;
|
||||
const uint i10 = gl_GlobalInvocationID.y;
|
||||
const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
|
||||
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
|
||||
|
||||
if (i00 >= p.ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i01 = data_b[get_boffset() + i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
|
||||
|
||||
const uint a_offset = get_aoffset() + i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
|
||||
const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
|
||||
|
||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||
data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
|
||||
#else
|
||||
data_d[d_offset + i00] = data_a[a_offset + i00];
|
||||
#endif
|
||||
}
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_binary_head.comp"
|
||||
#include "dequant_funcs.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint i00 = (gl_GlobalInvocationID.x)*2;
|
||||
const uint i10 = gl_GlobalInvocationID.y;
|
||||
const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
|
||||
const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
|
||||
|
||||
#if defined(DATA_A_IQ4_NL)
|
||||
init_iq4nl_shmem();
|
||||
#endif
|
||||
|
||||
if (i00 >= p.ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
|
||||
|
||||
const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
|
||||
const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
|
||||
|
||||
const uint ib = a_offset + i00/QUANT_K; // block index
|
||||
const uint iqs = (i00%QUANT_K)/QUANT_R; // quant index
|
||||
const uint iybs = i00 - i00%QUANT_K; // dst block start index
|
||||
const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
|
||||
|
||||
vec2 v = dequantize(ib, iqs, 0);
|
||||
const vec2 dm = get_dm(ib, 0);
|
||||
v = v * dm.x + dm.y;
|
||||
|
||||
data_d[d_offset + iybs + iqs ] = D_TYPE(v.x);
|
||||
data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y);
|
||||
}
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
#version 450
|
||||
|
||||
#include "generic_head.comp"
|
||||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#define BLOCK_SIZE 512
|
||||
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
shared float tmp[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint group_size = p.KX;
|
||||
const float eps = p.param1;
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint start = gl_WorkGroupID.x * group_size + tid;
|
||||
const uint end = (gl_WorkGroupID.x + 1) * group_size;
|
||||
|
||||
tmp[tid] = 0.0f;
|
||||
|
||||
// Calculate mean
|
||||
[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
|
||||
tmp[tid] += float(data_a[col]);
|
||||
}
|
||||
|
||||
// tmp up partial tmps and write back result
|
||||
barrier();
|
||||
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
const float mean = tmp[0] / group_size;
|
||||
barrier();
|
||||
tmp[tid] = 0.0f;
|
||||
|
||||
// Calculate variance
|
||||
[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
|
||||
const float xi = float(data_a[col]) - mean;
|
||||
data_d[col] = D_TYPE(xi);
|
||||
tmp[tid] += xi * xi;
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier();
|
||||
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
const float variance = tmp[0] / group_size;
|
||||
const float scale = inversesqrt(variance + eps);
|
||||
|
||||
[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
|
||||
data_d[col] *= D_TYPE(scale);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,87 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_spirv_intrinsics: enable
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
|
||||
#if RTE16
|
||||
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
|
||||
#endif
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint batch_offset; uint offset_delta;
|
||||
uint IC;
|
||||
uint IW; uint IH;
|
||||
uint OW; uint OH;
|
||||
uint KW; uint KH;
|
||||
uint pelements;
|
||||
uint CHW;
|
||||
int s0; int s1;
|
||||
int p0; int p1;
|
||||
int d0; int d1;
|
||||
} p;
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
layout(constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
|
||||
const uint NUM_ITER = 512 / BLOCK_SIZE;
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint gidx = gl_GlobalInvocationID.x;
|
||||
|
||||
const uint oh = gl_GlobalInvocationID.y;
|
||||
const uint batch = gl_GlobalInvocationID.z / p.IC;
|
||||
const uint ic = gl_GlobalInvocationID.z % p.IC;
|
||||
|
||||
A_TYPE values[NUM_ITER];
|
||||
uint offset_dst[NUM_ITER];
|
||||
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
|
||||
values[idx] = A_TYPE(0);
|
||||
}
|
||||
|
||||
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
|
||||
|
||||
const uint i = gidx * NUM_ITER + idx;
|
||||
|
||||
const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
|
||||
const uint kx = i / ksize;
|
||||
const uint kd = kx * ksize;
|
||||
const uint ky = (i - kd) / p.OW;
|
||||
const uint ix = i % p.OW;
|
||||
|
||||
const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
|
||||
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
|
||||
|
||||
offset_dst[idx] =
|
||||
((batch * p.OH + oh) * p.OW + ix) * p.CHW +
|
||||
(ic * (p.KW * p.KH) + ky * p.KW + kx);
|
||||
|
||||
if (i >= p.pelements) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (iih < p.IH && iiw < p.IW) {
|
||||
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
|
||||
values[idx] = data_a[offset_src + iih * p.IW + iiw];
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
|
||||
|
||||
const uint i = gidx * NUM_ITER + idx;
|
||||
|
||||
if (i >= p.pelements) {
|
||||
continue;
|
||||
}
|
||||
|
||||
data_d[offset_dst[idx]] = D_TYPE(values[idx]);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
#version 450
|
||||
|
||||
#include "generic_head.comp"
|
||||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float val = float(data_a[i]);
|
||||
data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1);
|
||||
}
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_binary_head.comp"
|
||||
|
||||
const uint num_threads = 256;
|
||||
|
||||
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
uint idx = get_idx();
|
||||
|
||||
// num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
|
||||
const uint num_iter = 2;
|
||||
|
||||
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||
if (idx >= p.ne) {
|
||||
continue;
|
||||
}
|
||||
uint i00, i01, i02, i03;
|
||||
get_indices(idx, i00, i01, i02, i03);
|
||||
|
||||
data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
|
||||
|
||||
idx += num_threads;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {float data_a[];};
|
||||
layout (binding = 0) readonly buffer A4 {vec4 data_a4[];};
|
||||
layout (binding = 1) writeonly buffer D {float data_d[];};
|
||||
layout (binding = 1) writeonly buffer D4 {vec4 data_d4[];};
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint ne;
|
||||
uint k_num;
|
||||
} p;
|
||||
|
||||
void main() {
|
||||
// Each invocation handles four consecutive components
|
||||
const uint idx = gl_GlobalInvocationID.x * 4;
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Check if all four components are in bounds and aligned,
|
||||
// then use vector loads
|
||||
if (idx + 3 < p.ne && (p.ne % 4) == 0) {
|
||||
vec4 result = vec4(0.0f);
|
||||
|
||||
[[unroll]] for (uint i = 0; i < p.k_num; i++) {
|
||||
result += data_a4[(i * p.ne + idx) / 4];
|
||||
}
|
||||
|
||||
data_d4[idx / 4] = result;
|
||||
} else {
|
||||
[[unroll]] for (uint j = 0; j < 4; ++j) {
|
||||
if (idx + j < p.ne) {
|
||||
float result = 0.0f;
|
||||
|
||||
[[unroll]] for (uint i = 0; i < p.k_num; i++) {
|
||||
result += data_a[i * p.ne + idx + j];
|
||||
}
|
||||
|
||||
data_d[idx + j] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,152 @@
|
|||
#version 450
|
||||
|
||||
#ifdef FLOAT16
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
||||
#endif
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
#include "mul_mat_vec_base.comp"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
#if !defined(DATA_A_F32) && !defined(DATA_A_F16)
|
||||
#define K_PER_ITER 8
|
||||
#else
|
||||
#define K_PER_ITER 2
|
||||
#endif
|
||||
|
||||
|
||||
uint a_offset, b_offset, d_offset, y_offset;
|
||||
|
||||
void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
|
||||
{
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
const uint col = i*BLOCK_SIZE + K_PER_ITER*tid;
|
||||
const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
|
||||
const uint iybs = col - col%QUANT_K; // y block start index
|
||||
|
||||
#if K_PER_ITER == 8
|
||||
#if QUANT_R == 2
|
||||
const B_TYPE_VEC4 bv02 = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4];
|
||||
const B_TYPE_VEC4 bv13 = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs + y_offset) / 4];
|
||||
const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y);
|
||||
const vec4 bv1 = vec4(bv02.z, bv13.z, bv02.w, bv13.w);
|
||||
#else
|
||||
const vec4 bv0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4]);
|
||||
const vec4 bv1 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4 + 1]);
|
||||
#endif
|
||||
#else
|
||||
// Check if the second of the pair of elements is OOB, and don't fetch B or
|
||||
// accumulate it. We still fetch a pair of elements for A, which is fine for
|
||||
// quantized formats since they'll be within the same block. We should
|
||||
// probably skip fetching the second element for F16/F32, but as of now we
|
||||
// still do.
|
||||
const bool OOB = lastiter && (iybs + iqs + y_offset >= p.ncols);
|
||||
|
||||
FLOAT_TYPE b0 = 0, b1 = 0;
|
||||
b0 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]);
|
||||
if (!OOB) {
|
||||
b1 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]);
|
||||
}
|
||||
#endif
|
||||
uint ibi = first_row*p.ncols;
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib = (ibi + col)/QUANT_K; // block index
|
||||
ibi += p.ncols;
|
||||
|
||||
#if K_PER_ITER == 8
|
||||
vec4 v = dequantize4(ib, iqs, a_offset);
|
||||
vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset);
|
||||
|
||||
const vec2 dm = get_dm(ib, a_offset);
|
||||
if (dm.y != 0) { // quant has min component
|
||||
v = v * dm.x + dm.y;
|
||||
v2 = v2 * dm.x + dm.y;
|
||||
}
|
||||
|
||||
// matrix multiplication
|
||||
FLOAT_TYPE rowtmp = dot(bv0, v);
|
||||
rowtmp += dot(bv1, v2);
|
||||
|
||||
if (dm.y == 0)
|
||||
rowtmp *= dm.x;
|
||||
|
||||
temp[j][n] += rowtmp;
|
||||
#else
|
||||
const vec2 v = dequantize(ib, iqs, a_offset);
|
||||
|
||||
// matrix multiplication
|
||||
temp[j][n] = fma(FLOAT_TYPE(v.x), b0, temp[j][n]);
|
||||
if (!OOB) {
|
||||
temp[j][n] = fma(FLOAT_TYPE(v.y), b1, temp[j][n]);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
a_offset /= QUANT_K;
|
||||
|
||||
y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE);
|
||||
if (num_iters * K_PER_ITER * BLOCK_SIZE + K_PER_ITER*tid < p.ncols) {
|
||||
num_iters++;
|
||||
}
|
||||
int unroll_count = 4;
|
||||
uint unrolled_iters = num_iters & ~(unroll_count - 1);
|
||||
|
||||
uint i = 0;
|
||||
while (i < unrolled_iters) {
|
||||
// Manually partially unroll the loop
|
||||
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
||||
iter(temp, first_row, num_rows, tid, i*K_PER_ITER, false);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
unroll_count = 2;
|
||||
unrolled_iters = num_iters & ~(unroll_count - 1);
|
||||
while (i < unrolled_iters) {
|
||||
// Manually partially unroll the loop
|
||||
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
||||
iter(temp, first_row, num_rows, tid, i*K_PER_ITER, false);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
while (i < num_iters) {
|
||||
iter(temp, first_row, num_rows, tid, i*K_PER_ITER, true);
|
||||
i++;
|
||||
}
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
#if defined(DATA_A_IQ4_NL)
|
||||
init_iq4nl_shmem();
|
||||
#endif
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,118 @@
|
|||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_shader_8bit_storage : require
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
#define EXPERT_COUNT 8
|
||||
#endif
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
||||
layout (binding = 1) readonly buffer BV2 {B_TYPE_VEC2 data_b_v2[];};
|
||||
layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
|
||||
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
#ifdef MUL_MAT_ID
|
||||
layout (binding = 3) readonly buffer IDS {int data_ids[];};
|
||||
#endif
|
||||
|
||||
#include "dequant_funcs.comp"
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint ncols;
|
||||
uint stride_a;
|
||||
uint stride_b;
|
||||
uint stride_d;
|
||||
|
||||
uint batch_stride_a;
|
||||
uint batch_stride_b;
|
||||
uint batch_stride_d;
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
uint nei0;
|
||||
uint ne11;
|
||||
#else
|
||||
uint ne02;
|
||||
uint ne12;
|
||||
uint broadcast2;
|
||||
uint broadcast3;
|
||||
#endif
|
||||
} p;
|
||||
|
||||
void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) {
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint expert_idx = gl_GlobalInvocationID.y;
|
||||
#else
|
||||
const uint batch_idx = gl_GlobalInvocationID.y;
|
||||
#endif
|
||||
|
||||
#ifndef MUL_MAT_ID
|
||||
uint batch_idx_a = 0;
|
||||
if (batch_idx != 0) {
|
||||
const uint i13 = batch_idx / p.ne12;
|
||||
const uint i12 = batch_idx % p.ne12;
|
||||
|
||||
const uint i03 = i13 / p.broadcast3;
|
||||
const uint i02 = i12 / p.broadcast2;
|
||||
|
||||
batch_idx_a = i03 * p.ne02 + i02;
|
||||
}
|
||||
#else
|
||||
const uint expert_id = data_ids[expert_idx];
|
||||
#endif
|
||||
|
||||
a_offset =
|
||||
#ifdef MUL_MAT_ID
|
||||
expert_id * p.batch_stride_a;
|
||||
#else
|
||||
batch_idx_a * p.batch_stride_a;
|
||||
#endif
|
||||
b_offset =
|
||||
#ifdef MUL_MAT_ID
|
||||
(expert_idx % p.ne11) * p.stride_b;
|
||||
#else
|
||||
batch_idx * p.batch_stride_b;
|
||||
#endif
|
||||
d_offset =
|
||||
#ifdef MUL_MAT_ID
|
||||
expert_idx * p.stride_d;
|
||||
#else
|
||||
batch_idx * p.batch_stride_d;
|
||||
#endif
|
||||
}
|
||||
|
||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
layout (constant_id = 1) const uint NUM_ROWS = 1;
|
||||
layout (constant_id = 2) const uint NUM_COLS = 1;
|
||||
|
||||
shared FLOAT_TYPE tmpsh[NUM_COLS][NUM_ROWS][BLOCK_SIZE];
|
||||
|
||||
void reduce_result(const in FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) {
|
||||
// sum up partial sums and write back result
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
tmpsh[j][n][tid] = temp[j][n];
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
tmpsh[j][n][tid] += tmpsh[j][n][tid + s];
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
if (tid == 0) {
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(tmpsh[j][n][0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,71 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
#define BLOCK_SIZE 32
|
||||
#define FLOAT_TYPE float
|
||||
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint ncols_x;
|
||||
uint nrows_x;
|
||||
uint row_stride_x;
|
||||
uint channel_stride_x;
|
||||
uint channel_x_divisor;
|
||||
uint b_offset;
|
||||
uint d_offset;
|
||||
} p;
|
||||
|
||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint row_x = gl_GlobalInvocationID.y;
|
||||
const uint channel = gl_GlobalInvocationID.z;
|
||||
const uint channel_x = channel / p.channel_x_divisor;
|
||||
|
||||
const uint nrows_y = p.ncols_x;
|
||||
const uint nrows_dst = p.nrows_x;
|
||||
const uint row_dst = row_x;
|
||||
|
||||
const uint idst = channel*nrows_dst + row_dst;
|
||||
|
||||
tmp[tid] = 0.0f;
|
||||
|
||||
for (uint col_x0 = 0; col_x0 < p.ncols_x; col_x0 += BLOCK_SIZE) {
|
||||
const uint col_x = col_x0 + tid;
|
||||
|
||||
if (col_x >= p.ncols_x) {
|
||||
break;
|
||||
}
|
||||
|
||||
const uint row_y = col_x;
|
||||
|
||||
const uint ix = channel_x*p.channel_stride_x + row_x*p.row_stride_x + col_x;
|
||||
const uint iy = channel*nrows_y + row_y;
|
||||
|
||||
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
|
||||
|
||||
tmp[tid] = fma(xi, FLOAT_TYPE(data_b[iy]), tmp[tid]);
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier();
|
||||
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
dst[idst] = tmp[0];
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
#define BLOCK_SIZE 32
|
||||
#define FLOAT_TYPE float
|
||||
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint ncols_x;
|
||||
uint nrows_x;
|
||||
uint nchannels_x;
|
||||
uint nchannels_y;
|
||||
uint b_offset;
|
||||
uint d_offset;
|
||||
} p;
|
||||
|
||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint row_x = gl_GlobalInvocationID.y;
|
||||
const uint channel = gl_GlobalInvocationID.z;
|
||||
const uint channel_x = channel / (p.nchannels_y / p.nchannels_x);
|
||||
|
||||
const uint nrows_y = p.ncols_x;
|
||||
const uint nrows_dst = p.nrows_x;
|
||||
const uint row_dst = row_x;
|
||||
|
||||
tmp[tid] = FLOAT_TYPE(0.0f);
|
||||
|
||||
for (uint col_x0 = 0; col_x0 < p.ncols_x; col_x0 += BLOCK_SIZE) {
|
||||
const uint col_x = col_x0 + tid;
|
||||
|
||||
if (col_x >= p.ncols_x) {
|
||||
break;
|
||||
}
|
||||
|
||||
// x is transposed and permuted
|
||||
const uint ix = row_x*p.nchannels_x*p.ncols_x + channel_x*p.ncols_x + col_x;
|
||||
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
|
||||
|
||||
const uint row_y = col_x;
|
||||
|
||||
// y is not transposed but permuted
|
||||
const uint iy = channel*nrows_y + row_y;
|
||||
|
||||
tmp[tid] = fma(xi, FLOAT_TYPE(data_b[iy]), tmp[tid]);
|
||||
}
|
||||
|
||||
// dst is not transposed and not permuted
|
||||
const uint idst = channel*nrows_dst + row_dst;
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier();
|
||||
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
dst[idst] = tmp[0];
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,115 @@
|
|||
#version 450
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
#include "mul_mat_vec_base.comp"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint it_size = gl_WorkGroupSize.x/16;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint itid = tid%16; // 0...16
|
||||
const uint ix = tid/16;
|
||||
|
||||
const uint step = 8;
|
||||
|
||||
const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const uint v_in = itid - step*v_im; // 0...15 or 0...7
|
||||
|
||||
const uint l0 = 2*v_in; // 0...15
|
||||
const uint q_offset = 32*v_im + l0;
|
||||
const uint s_offset = 8*v_im;
|
||||
const uint y_offset = 128*v_im + l0;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
f16vec2 d = data_a[ib0 + i].d;
|
||||
const FLOAT_TYPE dall = d.x;
|
||||
const FLOAT_TYPE dmin = d.y;
|
||||
|
||||
uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0];
|
||||
uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1];
|
||||
|
||||
uint32_t s0_lo4_u32 = s0_u32 & 0x0F0F0F0F;
|
||||
uint32_t s0_hi4_u32 = (s0_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t s4_lo4_u32 = s4_u32 & 0x0F0F0F0F;
|
||||
uint32_t s4_hi4_u32 = (s4_u32 >> 4) & 0x0F0F0F0F;
|
||||
|
||||
uvec4 s0_lo4 = uvec4(unpack8(s0_lo4_u32));
|
||||
uvec4 s4_lo4 = uvec4(unpack8(s4_lo4_u32));
|
||||
uvec4 s0_hi4 = uvec4(unpack8(s0_hi4_u32));
|
||||
uvec4 s4_hi4 = uvec4(unpack8(s4_hi4_u32));
|
||||
|
||||
uint16_t qs0_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 0];
|
||||
uint16_t qs16_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 8];
|
||||
uvec2 qs0 = uvec2(unpack8(qs0_u16));
|
||||
uvec2 qs16 = uvec2(unpack8(qs16_u16));
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
B_TYPE_VEC2 b0 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0];
|
||||
B_TYPE_VEC2 b16 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8];
|
||||
B_TYPE_VEC2 b32 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16];
|
||||
B_TYPE_VEC2 b48 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24];
|
||||
B_TYPE_VEC2 b64 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32];
|
||||
B_TYPE_VEC2 b80 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40];
|
||||
B_TYPE_VEC2 b96 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48];
|
||||
B_TYPE_VEC2 b112 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56];
|
||||
|
||||
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
|
||||
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int l = 0; l < 2; ++l) {
|
||||
sum1 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l] >> 0) & 3),
|
||||
fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3),
|
||||
fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l] >> 2) & 3),
|
||||
fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_lo4[3]) * FLOAT_TYPE((qs16[l] >> 2) & 3),
|
||||
fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_lo4[0]) * FLOAT_TYPE((qs0[l] >> 4) & 3),
|
||||
fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_lo4[1]) * FLOAT_TYPE((qs16[l] >> 4) & 3),
|
||||
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_lo4[2]) * FLOAT_TYPE((qs0[l] >> 6) & 3),
|
||||
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_lo4[3]) * FLOAT_TYPE((qs16[l] >> 6) & 3), sum1))))))));
|
||||
sum2 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_hi4[0]),
|
||||
fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_hi4[1]),
|
||||
fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_hi4[2]),
|
||||
fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_hi4[3]),
|
||||
fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_hi4[0]),
|
||||
fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_hi4[1]),
|
||||
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_hi4[2]),
|
||||
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2))))))));
|
||||
}
|
||||
temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,103 @@
|
|||
#version 450
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
#include "mul_mat_vec_base.comp"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint it_size = gl_WorkGroupSize.x/16;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint itid = tid%16; // 0...16
|
||||
const uint ix = tid/16;
|
||||
|
||||
const uint step = 8;
|
||||
|
||||
const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const uint v_in = itid - step*v_im; // 0...15 or 0...7
|
||||
|
||||
const uint8_t m = uint8_t(1 << (4 * v_im));
|
||||
|
||||
const uint l0 = 2*v_in; // 0...15
|
||||
const uint q_offset = 32*v_im + l0;
|
||||
const uint y_offset = 128*v_im + l0;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
const uint s_shift = 4 * v_im;
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
||||
|
||||
uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
|
||||
uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
|
||||
uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
|
||||
uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3];
|
||||
uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4];
|
||||
uint16_t s10_16 = data_a_packed16[ib0 + i].scales[5];
|
||||
u8vec2 s0 = unpack8(s0_16);
|
||||
u8vec2 s2 = unpack8(s2_16);
|
||||
u8vec2 s4 = unpack8(s4_16);
|
||||
u8vec2 s6 = unpack8(s6_16);
|
||||
u8vec2 s8 = unpack8(s8_16);
|
||||
u8vec2 s10 = unpack8(s10_16);
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
|
||||
B_TYPE_VEC2 b0 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0];
|
||||
B_TYPE_VEC2 b16 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8];
|
||||
B_TYPE_VEC2 b32 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16];
|
||||
B_TYPE_VEC2 b48 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24];
|
||||
B_TYPE_VEC2 b64 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32];
|
||||
B_TYPE_VEC2 b80 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40];
|
||||
B_TYPE_VEC2 b96 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48];
|
||||
B_TYPE_VEC2 b112 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56];
|
||||
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int l = 0; l < 2; ++l) {
|
||||
sum = fma(FLOAT_TYPE(b0[l]) * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b32[l]) * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b64[l]) * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b96[l]) * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b16[l]) * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b48[l]) * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
|
||||
fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
|
||||
}
|
||||
temp[j][n] = fma(d, sum, temp[j][n]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,133 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
#include "mul_mat_vec_base.comp"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint it_size = gl_WorkGroupSize.x/16;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint itid = tid%16; // 0...16
|
||||
const uint ix = tid/16;
|
||||
|
||||
const uint step = 4;
|
||||
|
||||
const uint il = itid/step; // 0...3
|
||||
const uint ir = itid - step*il; // 0...7 or 0...3
|
||||
const uint n = 4;
|
||||
|
||||
const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
||||
const uint v_in = il % 2;
|
||||
|
||||
const uint l0 = n * (2 * ir + v_in); // 0...15
|
||||
const uint q_offset = 32*v_im + l0;
|
||||
const uint y_offset = 64*v_im + l0;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||
const uint y1_idx = i * QUANT_K + y_offset;
|
||||
const uint y2_idx = y1_idx + 128;
|
||||
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
f16vec2 d = data_a[ib0 + i].d;
|
||||
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
|
||||
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
|
||||
|
||||
uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
|
||||
uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
|
||||
uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
|
||||
uvec4 scale0 = uvec4(unpack8(scale0_u32));
|
||||
uvec4 scale4 = uvec4(unpack8(scale4_u32));
|
||||
uvec4 scale8 = uvec4(unpack8(scale8_u32));
|
||||
|
||||
const uint32_t sc0 = ( scale0.x & 0x3f);
|
||||
const uint32_t sc1 = ( scale0.y & 0x3f);
|
||||
const uint32_t sc2 = ( scale4.x & 0x3f);
|
||||
const uint32_t sc3 = ( scale4.y & 0x3f);
|
||||
const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2));
|
||||
const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2));
|
||||
const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
|
||||
const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
|
||||
|
||||
uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4];
|
||||
uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16];
|
||||
|
||||
uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
|
||||
uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
|
||||
uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
|
||||
|
||||
uvec4 qs0_lo4 = uvec4(unpack8(qs0_u32_lo4));
|
||||
uvec4 qs64_lo4 = uvec4(unpack8(qs64_u32_lo4));
|
||||
uvec4 qs0_hi4 = uvec4(unpack8(qs0_u32_hi4));
|
||||
uvec4 qs64_hi4 = uvec4(unpack8(qs64_u32_hi4));
|
||||
|
||||
const uint32_t q4_0 = qs0_lo4.x;
|
||||
const uint32_t q4_1 = qs0_lo4.y;
|
||||
const uint32_t q4_2 = qs0_lo4.z;
|
||||
const uint32_t q4_3 = qs0_lo4.w;
|
||||
const uint32_t q4_4 = qs0_hi4.x;
|
||||
const uint32_t q4_5 = qs0_hi4.y;
|
||||
const uint32_t q4_6 = qs0_hi4.z;
|
||||
const uint32_t q4_7 = qs0_hi4.w;
|
||||
const uint32_t q4_8 = qs64_lo4.x;
|
||||
const uint32_t q4_9 = qs64_lo4.y;
|
||||
const uint32_t q4_10 = qs64_lo4.z;
|
||||
const uint32_t q4_11 = qs64_lo4.w;
|
||||
const uint32_t q4_12 = qs64_hi4.x;
|
||||
const uint32_t q4_13 = qs64_hi4.y;
|
||||
const uint32_t q4_14 = qs64_hi4.z;
|
||||
const uint32_t q4_15 = qs64_hi4.w;
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
B_TYPE_VEC4 by10 = data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4];
|
||||
B_TYPE_VEC4 by132 = data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 + 8];
|
||||
B_TYPE_VEC4 by20 = data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4];
|
||||
B_TYPE_VEC4 by232 = data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 + 8];
|
||||
|
||||
const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3)));
|
||||
const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7)));
|
||||
const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x), q4_8, fma(FLOAT_TYPE(by20.y), q4_9, fma(FLOAT_TYPE(by20.z), q4_10, FLOAT_TYPE(by20.w) * q4_11)));
|
||||
const FLOAT_TYPE sw = fma(FLOAT_TYPE(by232.x), q4_12, fma(FLOAT_TYPE(by232.y), q4_13, fma(FLOAT_TYPE(by232.z), q4_14, FLOAT_TYPE(by232.w) * q4_15)));
|
||||
const FLOAT_TYPE smin =
|
||||
fma(FLOAT_TYPE(by10.x), sc2, fma(FLOAT_TYPE(by132.x), sc3, fma(FLOAT_TYPE(by20.x), sc6, fma(FLOAT_TYPE(by232.x), sc7,
|
||||
fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7,
|
||||
fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7,
|
||||
fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7)))))))))))))));
|
||||
temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,162 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
#include "mul_mat_vec_base.comp"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint it_size = gl_WorkGroupSize.x/16;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint itid = tid%16; // 0...16
|
||||
const uint ix = tid/16;
|
||||
|
||||
const uint il = itid/4; // 0...3
|
||||
const uint ir = itid - 4*il; // 0...7 or 0...3
|
||||
|
||||
const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
||||
const uint v_in = il % 2;
|
||||
|
||||
const uint l0 = 4*ir + 2*v_in; // 0...15
|
||||
const uint q_offset = 32*v_im + l0;
|
||||
const uint y_offset = 64*v_im + l0;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||
const uint y1_idx = i * QUANT_K + y_offset;
|
||||
const uint y2_idx = y1_idx + 128;
|
||||
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
f16vec2 d = data_a[ib0 + i].d;
|
||||
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
|
||||
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
|
||||
|
||||
uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
|
||||
uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
|
||||
uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
|
||||
uvec4 scale0 = uvec4(unpack8(scale0_u32));
|
||||
uvec4 scale4 = uvec4(unpack8(scale4_u32));
|
||||
uvec4 scale8 = uvec4(unpack8(scale8_u32));
|
||||
|
||||
const uint32_t sc0 = ( scale0.x & 0x3f);
|
||||
const uint32_t sc1 = ( scale0.y & 0x3f);
|
||||
const uint32_t sc2 = ( scale4.x & 0x3f);
|
||||
const uint32_t sc3 = ( scale4.y & 0x3f);
|
||||
const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2));
|
||||
const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2));
|
||||
const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
|
||||
const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
|
||||
|
||||
uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
|
||||
uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16);
|
||||
|
||||
uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F;
|
||||
uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
|
||||
uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;
|
||||
|
||||
uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));
|
||||
|
||||
uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
|
||||
uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
|
||||
uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010) << 0;
|
||||
uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;
|
||||
|
||||
qs0_16_u32_lo4 += qs0_16_lo4_offset16;
|
||||
qs0_16_u32_hi4 += qs0_16_hi4_offset16;
|
||||
qs64_80_u32_lo4 += qs64_80_lo4_offset16;
|
||||
qs64_80_u32_hi4 += qs64_80_hi4_offset16;
|
||||
|
||||
uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4));
|
||||
uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4));
|
||||
uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4));
|
||||
uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4));
|
||||
|
||||
const uint32_t q4_0 = qs0_16_lo4.x;
|
||||
const uint32_t q4_1 = qs0_16_lo4.y;
|
||||
const uint32_t q4_2 = qs0_16_lo4.z;
|
||||
const uint32_t q4_3 = qs0_16_lo4.w;
|
||||
const uint32_t q4_4 = qs0_16_hi4.x;
|
||||
const uint32_t q4_5 = qs0_16_hi4.y;
|
||||
const uint32_t q4_6 = qs0_16_hi4.z;
|
||||
const uint32_t q4_7 = qs0_16_hi4.w;
|
||||
const uint32_t q4_8 = qs64_80_lo4.x;
|
||||
const uint32_t q4_9 = qs64_80_lo4.y;
|
||||
const uint32_t q4_10 = qs64_80_lo4.z;
|
||||
const uint32_t q4_11 = qs64_80_lo4.w;
|
||||
const uint32_t q4_12 = qs64_80_hi4.x;
|
||||
const uint32_t q4_13 = qs64_80_hi4.y;
|
||||
const uint32_t q4_14 = qs64_80_hi4.z;
|
||||
const uint32_t q4_15 = qs64_80_hi4.w;
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
B_TYPE_VEC2 by10 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2];
|
||||
B_TYPE_VEC2 by116 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 8];
|
||||
B_TYPE_VEC2 by132 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 16];
|
||||
B_TYPE_VEC2 by148 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 24];
|
||||
B_TYPE_VEC2 by20 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2];
|
||||
B_TYPE_VEC2 by216 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 8];
|
||||
B_TYPE_VEC2 by232 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 16];
|
||||
B_TYPE_VEC2 by248 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 24];
|
||||
|
||||
const FLOAT_TYPE sx =
|
||||
fma(FLOAT_TYPE(by10.x), q4_0,
|
||||
fma(FLOAT_TYPE(by10.y), q4_1,
|
||||
fma(FLOAT_TYPE(by116.x), q4_2,
|
||||
FLOAT_TYPE(by116.y) * q4_3)));
|
||||
const FLOAT_TYPE sy =
|
||||
fma(FLOAT_TYPE(by132.x), q4_4,
|
||||
fma(FLOAT_TYPE(by132.y), q4_5,
|
||||
fma(FLOAT_TYPE(by148.x), q4_6,
|
||||
FLOAT_TYPE(by148.y) * q4_7)));
|
||||
const FLOAT_TYPE sz =
|
||||
fma(FLOAT_TYPE(by20.x), q4_8,
|
||||
fma(FLOAT_TYPE(by20.y), q4_9,
|
||||
fma(FLOAT_TYPE(by216.x), q4_10,
|
||||
FLOAT_TYPE(by216.y) * q4_11)));
|
||||
const FLOAT_TYPE sw =
|
||||
fma(FLOAT_TYPE(by232.x), q4_12,
|
||||
fma(FLOAT_TYPE(by232.y), q4_13,
|
||||
fma(FLOAT_TYPE(by248.x), q4_14,
|
||||
FLOAT_TYPE(by248.y) * q4_15)));
|
||||
const FLOAT_TYPE smin =
|
||||
fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2,
|
||||
fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
|
||||
fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
|
||||
(FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
|
||||
temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,112 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
#include "mul_mat_vec_base.comp"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint it_size = gl_WorkGroupSize.x/16;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint itid = tid%16; // 0...16
|
||||
const uint ix = tid/16;
|
||||
|
||||
const uint step = 8;
|
||||
|
||||
const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const uint v_in = itid - step*v_im; // 0...15 or 0...7
|
||||
|
||||
const uint l0 = 4 * v_in; // 0, 4, 8, ..., 28
|
||||
const uint is = v_in / 4;
|
||||
|
||||
const uint ql_offset = 64*v_im + l0;
|
||||
const uint qh_offset = 32*v_im + l0;
|
||||
const uint s_offset = 8*v_im + is;
|
||||
const uint y_offset = 128*v_im + l0;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||
const uint y_idx = i * QUANT_K + y_offset;
|
||||
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
||||
|
||||
FLOAT_TYPE scales[4];
|
||||
scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]);
|
||||
scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]);
|
||||
scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]);
|
||||
scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]);
|
||||
|
||||
uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
|
||||
uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
|
||||
|
||||
uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
|
||||
uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
|
||||
uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;
|
||||
|
||||
uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
|
||||
uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
|
||||
uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
|
||||
uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0;
|
||||
uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2;
|
||||
|
||||
uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32;
|
||||
uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32;
|
||||
uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32;
|
||||
uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
|
||||
|
||||
uvec4 q0 = uvec4(unpack8(q0_u32));
|
||||
uvec4 q1 = uvec4(unpack8(q1_u32));
|
||||
uvec4 q2 = uvec4(unpack8(q2_u32));
|
||||
uvec4 q3 = uvec4(unpack8(q3_u32));
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
B_TYPE_VEC4 by0 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4];
|
||||
B_TYPE_VEC4 by32 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 8];
|
||||
B_TYPE_VEC4 by64 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 16];
|
||||
B_TYPE_VEC4 by96 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 24];
|
||||
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int l = 0; l < 4; ++l) {
|
||||
sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32),
|
||||
fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32),
|
||||
fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32),
|
||||
fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum))));
|
||||
}
|
||||
temp[j][n] += sum * d;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
|
||||
|
||||
// do NUM_ROWS at a time, unless there aren't enough remaining rows
|
||||
if (first_row + NUM_ROWS <= p.stride_d) {
|
||||
compute_outputs(first_row, NUM_ROWS);
|
||||
} else {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,631 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
#ifdef FLOAT16
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
||||
#endif
|
||||
|
||||
#ifdef COOPMAT
|
||||
#extension GL_KHR_cooperative_matrix : enable
|
||||
#extension GL_KHR_memory_scope_semantics : enable
|
||||
#extension GL_KHR_shader_subgroup_basic : enable
|
||||
#endif
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
|
||||
#endif
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
#ifndef LOAD_VEC_A
|
||||
#define LOAD_VEC_A 1
|
||||
#endif
|
||||
#ifndef LOAD_VEC_B
|
||||
#define LOAD_VEC_B 1
|
||||
#endif
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
layout (binding = 3) readonly buffer IDS {int data_ids[];};
|
||||
#endif
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint M;
|
||||
uint N;
|
||||
uint K;
|
||||
uint stride_a;
|
||||
uint stride_b;
|
||||
uint stride_d;
|
||||
|
||||
uint batch_stride_a;
|
||||
uint batch_stride_b;
|
||||
uint batch_stride_d;
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
uint nei0;
|
||||
uint nei1;
|
||||
uint nbi1;
|
||||
uint ne11;
|
||||
#else
|
||||
uint k_split;
|
||||
uint ne02;
|
||||
uint ne12;
|
||||
uint broadcast2;
|
||||
uint broadcast3;
|
||||
#endif
|
||||
} p;
|
||||
|
||||
layout (constant_id = 0) const uint BLOCK_SIZE = 64;
|
||||
layout (constant_id = 1) const uint BM = 64;
|
||||
layout (constant_id = 2) const uint BN = 64;
|
||||
layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant
|
||||
layout (constant_id = 4) const uint WM = 32;
|
||||
layout (constant_id = 5) const uint WN = 32;
|
||||
layout (constant_id = 6) const uint WMITER = 2;
|
||||
layout (constant_id = 7) const uint TM = 4;
|
||||
layout (constant_id = 8) const uint TN = 2;
|
||||
layout (constant_id = 9) const uint TK = 1; // Only needed for coopmat
|
||||
layout (constant_id = 10) const uint WARP = 32;
|
||||
|
||||
#ifdef COOPMAT
|
||||
#define SHMEM_STRIDE (BK + 8)
|
||||
#else
|
||||
#define SHMEM_STRIDE (BK + 1)
|
||||
#endif
|
||||
|
||||
shared FLOAT_TYPE buf_a[BM * SHMEM_STRIDE];
|
||||
shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE];
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
shared u16vec2 row_ids[3072];
|
||||
#endif // MUL_MAT_ID
|
||||
|
||||
#define NUM_WARPS (BLOCK_SIZE / WARP)
|
||||
|
||||
#ifdef COOPMAT
|
||||
shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
|
||||
#endif
|
||||
|
||||
void main() {
|
||||
#if defined(DATA_A_IQ4_NL)
|
||||
init_iq4nl_shmem();
|
||||
#endif
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint expert_idx = gl_GlobalInvocationID.z;
|
||||
#else
|
||||
const uint batch_idx = gl_GlobalInvocationID.z;
|
||||
|
||||
const uint i13 = batch_idx / p.ne12;
|
||||
const uint i12 = batch_idx % p.ne12;
|
||||
|
||||
const uint i03 = i13 / p.broadcast3;
|
||||
const uint i02 = i12 / p.broadcast2;
|
||||
|
||||
const uint batch_idx_a = i03 * p.ne02 + i02;
|
||||
#endif
|
||||
|
||||
const uint blocks_m = (p.M + BM - 1) / BM;
|
||||
const uint ir = gl_WorkGroupID.x % blocks_m;
|
||||
const uint ik = gl_WorkGroupID.x / blocks_m;
|
||||
const uint ic = gl_WorkGroupID.y;
|
||||
|
||||
const uint WNITER = (WM * WN) / (WARP * TM * TN * WMITER);
|
||||
const uint WSUBM = WM / WMITER;
|
||||
const uint WSUBN = WN / WNITER;
|
||||
|
||||
#ifdef COOPMAT
|
||||
const uint warp_i = gl_SubgroupID;
|
||||
|
||||
const uint tiw = gl_SubgroupInvocationID;
|
||||
|
||||
const uint cms_per_row = WM / TM;
|
||||
const uint cms_per_col = WN / TN;
|
||||
|
||||
const uint storestride = WARP / TM;
|
||||
const uint store_r = tiw % TM;
|
||||
const uint store_c = tiw / TM;
|
||||
#else
|
||||
const uint warp_i = gl_LocalInvocationID.x / WARP;
|
||||
|
||||
const uint tiw = gl_LocalInvocationID.x % WARP;
|
||||
|
||||
const uint tiwr = tiw % (WSUBM / TM);
|
||||
const uint tiwc = tiw / (WSUBM / TM);
|
||||
#endif
|
||||
|
||||
const uint warp_r = warp_i % (BM / WM);
|
||||
const uint warp_c = warp_i / (BM / WM);
|
||||
|
||||
const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A);
|
||||
const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A);
|
||||
const uint loadr_b = gl_LocalInvocationID.x % (BK / LOAD_VEC_B);
|
||||
const uint loadc_b = gl_LocalInvocationID.x / (BK / LOAD_VEC_B);
|
||||
|
||||
const uint loadstride_a = gl_WorkGroupSize.x * LOAD_VEC_A / BK;
|
||||
const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B / BK;
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
uint _ne1 = 0;
|
||||
for (uint ii1 = 0; ii1 < p.nei1; ii1++) {
|
||||
for (uint ii0 = 0; ii0 < p.nei0; ii0++) {
|
||||
if (data_ids[ii1*p.nbi1 + ii0] == expert_idx) {
|
||||
row_ids[_ne1] = u16vec2(ii0, ii1);
|
||||
_ne1++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier();
|
||||
|
||||
// Workgroup has no work
|
||||
if (ic * BN >= _ne1) return;
|
||||
#endif
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint start_k = 0;
|
||||
const uint end_k = p.K;
|
||||
#else
|
||||
const uint start_k = ik * p.k_split;
|
||||
const uint end_k = min(p.K, (ik + 1) * p.k_split);
|
||||
#endif
|
||||
|
||||
uint pos_a = (
|
||||
#ifdef MUL_MAT_ID
|
||||
expert_idx * p.batch_stride_a +
|
||||
#else
|
||||
batch_idx_a * p.batch_stride_a +
|
||||
#endif
|
||||
ir * BM * p.stride_a + start_k) / LOAD_VEC_A;
|
||||
#ifdef MUL_MAT_ID
|
||||
uint pos_b = 0;
|
||||
#else
|
||||
uint pos_b = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) / LOAD_VEC_B;
|
||||
#endif
|
||||
|
||||
#ifdef COOPMAT
|
||||
coopmat<float16_t, gl_ScopeSubgroup, TM, TK, gl_MatrixUseA> cache_a;
|
||||
coopmat<float16_t, gl_ScopeSubgroup, TK, TN, gl_MatrixUseB> cache_b;
|
||||
coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> sums[cms_per_row * cms_per_col];
|
||||
|
||||
[[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) {
|
||||
sums[i] = coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(0.0f);
|
||||
}
|
||||
#else
|
||||
ACC_TYPE sums[WMITER * TM * WNITER * TN];
|
||||
FLOAT_TYPE cache_a[WMITER * TM];
|
||||
FLOAT_TYPE cache_b[WNITER * TN];
|
||||
|
||||
[[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN; i++) {
|
||||
sums[i] = ACC_TYPE(0.0f);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (uint block = start_k; block < end_k; block += BK) {
|
||||
[[unroll]] for (uint l = 0; l < BM; l += loadstride_a) {
|
||||
|
||||
#if defined(DATA_A_F32) || defined(DATA_A_F16)
|
||||
#if LOAD_VEC_A == 8
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(data_a[idx][0].x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(data_a[idx][0].y);
|
||||
buf_a[buf_idx + 2] = FLOAT_TYPE(data_a[idx][0].z);
|
||||
buf_a[buf_idx + 3] = FLOAT_TYPE(data_a[idx][0].w);
|
||||
buf_a[buf_idx + 4] = FLOAT_TYPE(data_a[idx][1].x);
|
||||
buf_a[buf_idx + 5] = FLOAT_TYPE(data_a[idx][1].y);
|
||||
buf_a[buf_idx + 6] = FLOAT_TYPE(data_a[idx][1].z);
|
||||
buf_a[buf_idx + 7] = FLOAT_TYPE(data_a[idx][1].w);
|
||||
#elif LOAD_VEC_A == 4
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(data_a[idx].x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(data_a[idx].y);
|
||||
buf_a[buf_idx + 2] = FLOAT_TYPE(data_a[idx].z);
|
||||
buf_a[buf_idx + 3] = FLOAT_TYPE(data_a[idx].w);
|
||||
#else
|
||||
if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) {
|
||||
buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]);
|
||||
} else {
|
||||
buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(0.0f);
|
||||
}
|
||||
#endif
|
||||
#elif defined(DATA_A_Q4_0)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
|
||||
|
||||
const uint ib = idx / 16;
|
||||
const uint iqs = idx & 0xF;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
||||
const vec2 v = (vec2(vui & 0xF, vui >> 4) - 8.0f) * d;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
||||
#elif defined(DATA_A_Q4_1)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
|
||||
|
||||
const uint ib = idx / 16;
|
||||
const uint iqs = idx & 0xF;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const float m = float(data_a[ib].m);
|
||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
||||
const vec2 v = vec2(vui & 0xF, vui >> 4) * d + m;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
||||
#elif defined(DATA_A_Q5_0)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
|
||||
|
||||
const uint ib = idx / 16;
|
||||
const uint iqs = idx & 0xF;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0];
|
||||
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
|
||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
||||
const vec2 v = (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
||||
#elif defined(DATA_A_Q5_1)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
|
||||
|
||||
const uint ib = idx / 16;
|
||||
const uint iqs = idx & 0xF;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const float m = float(data_a[ib].m);
|
||||
const uint uint_qh = data_a[ib].qh;
|
||||
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
|
||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
||||
const vec2 v = vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
||||
#elif defined(DATA_A_Q8_0)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 16;
|
||||
const uint iqs = (idx & 0xF) * 2;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const vec2 v = vec2(int(data_a[ib].qs[iqs]), int(data_a[ib].qs[iqs + 1])) * d;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
||||
#elif defined(DATA_A_Q2_K)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30
|
||||
const uint scalesi = iqs / 8; // 0..15
|
||||
const uint qsshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
|
||||
|
||||
const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]);
|
||||
const uint scales = data_a[ib].scales[scalesi];
|
||||
const vec2 d = vec2(data_a[ib].d);
|
||||
|
||||
const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
||||
#elif defined(DATA_A_Q3_K)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 64; // 0,1
|
||||
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..62
|
||||
const uint hmi = (iqs % 16) * 2; // 0,2,4..30
|
||||
const uint j = (iqs % 64) / 4; // 0..3
|
||||
const uint is = iqs / 8; // 0..15
|
||||
const uint halfsplit = ((iqs % 64) / 16); // 0,1,2,3
|
||||
const uint qsshift = halfsplit * 2; // 0,2,4,6
|
||||
const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128
|
||||
|
||||
const int8_t us = int8_t(is < 4 ? (data_a[ib].scales[is-0] & 0xF) | (((data_a[ib].scales[is+8] >> 0) & 3) << 4) :
|
||||
is < 8 ? (data_a[ib].scales[is-0] & 0xF) | (((data_a[ib].scales[is+4] >> 2) & 3) << 4) :
|
||||
is < 12 ? (data_a[ib].scales[is-8] >> 4) | (((data_a[ib].scales[is+0] >> 4) & 3) << 4) :
|
||||
(data_a[ib].scales[is-8] >> 4) | (((data_a[ib].scales[is-4] >> 6) & 3) << 4));
|
||||
const float dl = float(data_a[ib].d) * float(us - 32);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi ] & m) != 0) ? 0 : 4)));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4)));
|
||||
#elif defined(DATA_A_Q4_K)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 32; // 0,1,2,3
|
||||
const uint b = (iqs % 32) / 16; // 0,1
|
||||
const uint is = 2 * n + b; // 0..7
|
||||
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126
|
||||
|
||||
const vec2 loadd = vec2(data_a[ib].d);
|
||||
|
||||
const uint scidx0 = (is < 4) ? is : (is + 4);
|
||||
const uint scidx1 = (is < 4) ? is : (is - 4);
|
||||
const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
const uint scidxshift1 = (is < 4) ? 0 : 2;
|
||||
const uint mbidx0 = is + 4;
|
||||
const uint mbidx1 = (is < 4) ? is + 4 : is;
|
||||
const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||
const uint mbidxshift0 = (is < 4) ? 0 : 4;
|
||||
const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
const uint mbidxshift1 = (is < 4) ? 0 : 2;
|
||||
|
||||
const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||
const uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||
|
||||
const float d = loadd.x * sc;
|
||||
const float m = -loadd.y * mbyte;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF), m));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m));
|
||||
#elif defined(DATA_A_Q5_K)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 32; // 0,1,2,3
|
||||
const uint b = (iqs % 32) / 16; // 0,1
|
||||
const uint is = 2 * n + b; // 0..7
|
||||
const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126
|
||||
const uint qhi = (iqs % 16) * 2; // 0,2,4..30
|
||||
|
||||
const uint8_t hm = uint8_t(1 << (iqs / 16));
|
||||
|
||||
const vec2 loadd = vec2(data_a[ib].d);
|
||||
|
||||
const uint scidx0 = (is < 4) ? is : (is + 4);
|
||||
const uint scidx1 = (is < 4) ? is : (is - 4);
|
||||
const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
const uint scidxshift1 = (is < 4) ? 0 : 2;
|
||||
const uint mbidx0 = is + 4;
|
||||
const uint mbidx1 = (is < 4) ? is + 4 : is;
|
||||
const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
|
||||
const uint mbidxshift0 = (is < 4) ? 0 : 4;
|
||||
const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
|
||||
const uint mbidxshift1 = (is < 4) ? 0 : 2;
|
||||
|
||||
const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
|
||||
const uint8_t mbyte = uint8_t(((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
|
||||
|
||||
const float d = loadd.x * sc;
|
||||
const float m = -loadd.y * mbyte;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0), m));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0), m));
|
||||
#elif defined(DATA_A_Q6_K)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 64; // 0,1
|
||||
const uint b = (iqs % 64) / 32; // 0,1
|
||||
const uint is_b = (iqs % 16) / 8; // 0,1
|
||||
const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
|
||||
const uint is = 8 * n + qhshift + is_b; // 0..15
|
||||
const uint qsi = n * 64 + (iqs % 32) * 2; // 0,2,4..126
|
||||
const uint qhi = n * 32 + (iqs % 16) * 2; // 0,2,4..62
|
||||
|
||||
const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
|
||||
#elif defined(DATA_A_IQ4_NL)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
|
||||
|
||||
const uint ib = idx / 16;
|
||||
const uint iqs = idx & 0xF;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
||||
const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
||||
#endif
|
||||
}
|
||||
[[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
|
||||
#if LOAD_VEC_B == 8
|
||||
#ifdef MUL_MAT_ID
|
||||
const u16vec2 row_idx = row_ids[ic * BN + loadc_b + l];
|
||||
const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b;
|
||||
#else
|
||||
const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b;
|
||||
#endif
|
||||
const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B;
|
||||
buf_b[buf_idx + 0] = FLOAT_TYPE(data_b[idx][0].x);
|
||||
buf_b[buf_idx + 1] = FLOAT_TYPE(data_b[idx][0].y);
|
||||
buf_b[buf_idx + 2] = FLOAT_TYPE(data_b[idx][0].z);
|
||||
buf_b[buf_idx + 3] = FLOAT_TYPE(data_b[idx][0].w);
|
||||
buf_b[buf_idx + 4] = FLOAT_TYPE(data_b[idx][1].x);
|
||||
buf_b[buf_idx + 5] = FLOAT_TYPE(data_b[idx][1].y);
|
||||
buf_b[buf_idx + 6] = FLOAT_TYPE(data_b[idx][1].z);
|
||||
buf_b[buf_idx + 7] = FLOAT_TYPE(data_b[idx][1].w);
|
||||
#elif LOAD_VEC_B == 4
|
||||
#ifdef MUL_MAT_ID
|
||||
const u16vec2 row_idx = row_ids[ic * BN + loadc_b + l];
|
||||
const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b;
|
||||
#else
|
||||
const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b;
|
||||
#endif
|
||||
const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B;
|
||||
buf_b[buf_idx + 0] = FLOAT_TYPE(data_b[idx].x);
|
||||
buf_b[buf_idx + 1] = FLOAT_TYPE(data_b[idx].y);
|
||||
buf_b[buf_idx + 2] = FLOAT_TYPE(data_b[idx].z);
|
||||
buf_b[buf_idx + 3] = FLOAT_TYPE(data_b[idx].w);
|
||||
#elif !MUL_MAT_ID
|
||||
if (ic * BN + loadc_b + l < p.N && block + loadr_b < end_k) {
|
||||
buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]);
|
||||
} else {
|
||||
buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f);
|
||||
}
|
||||
#else
|
||||
const uint row_i = ic * BN + loadc_b + l;
|
||||
if (row_i < _ne1) {
|
||||
const u16vec2 row_idx = row_ids[row_i];
|
||||
buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + loadr_b]);
|
||||
} else {
|
||||
buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
barrier();
|
||||
|
||||
pos_a += BK / LOAD_VEC_A;
|
||||
pos_b += BK / LOAD_VEC_B;
|
||||
|
||||
#ifdef COOPMAT
|
||||
[[unroll]] for (uint i = 0; i < BK; i += TK) {
|
||||
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
|
||||
// Load from shared into cache
|
||||
coopMatLoad(cache_a, buf_a, (warp_r * WM + cm_row * TM) * SHMEM_STRIDE + i, SHMEM_STRIDE, gl_CooperativeMatrixLayoutRowMajor);
|
||||
|
||||
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
|
||||
coopMatLoad(cache_b, buf_b, (warp_c * WN + cm_col * TN) * SHMEM_STRIDE + i, SHMEM_STRIDE, gl_CooperativeMatrixLayoutColumnMajor);
|
||||
|
||||
sums[cm_col * cms_per_row + cm_row] = coopMatMulAdd(cache_a, cache_b, sums[cm_col * cms_per_row + cm_row]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
[[unroll]] for (uint i = 0; i < BK; i++) {
|
||||
// Load from shared into cache
|
||||
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
|
||||
[[unroll]] for (uint j = 0; j < TM; j++) {
|
||||
cache_a[wsir * TM + j] = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + i];
|
||||
}
|
||||
}
|
||||
[[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
|
||||
[[unroll]] for (uint j = 0; j < TN; j++) {
|
||||
cache_b[wsic * TN + j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * SHMEM_STRIDE + i];
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
|
||||
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
|
||||
[[unroll]] for (uint cc = 0; cc < TN; cc++) {
|
||||
[[unroll]] for (uint cr = 0; cr < TM; cr++) {
|
||||
const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;
|
||||
sums[sums_idx] = fma(ACC_TYPE(cache_a[wsir * TM + cr]), ACC_TYPE(cache_b[wsic * TN + cc]), sums[sums_idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
barrier();
|
||||
}
|
||||
|
||||
const uint dr = ir * BM + warp_r * WM;
|
||||
const uint dc = ic * BN + warp_c * WN;
|
||||
|
||||
#ifndef MUL_MAT_ID
|
||||
const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
|
||||
#endif
|
||||
|
||||
#ifdef COOPMAT
|
||||
#ifdef MUL_MAT_ID
|
||||
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
|
||||
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
|
||||
coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
|
||||
|
||||
[[unroll]] for (uint col = 0; col < BN; col += storestride) {
|
||||
const uint row_i = dc + cm_col * TN + col + store_c;
|
||||
if (row_i >= _ne1) break;
|
||||
|
||||
const u16vec2 row_idx = row_ids[row_i];
|
||||
|
||||
data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
const bool is_aligned = p.stride_d % 4 == 0; // Assumption: D_TYPE == float
|
||||
|
||||
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
|
||||
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
|
||||
const bool is_in_bounds = dr + (cm_row + 1) * TM <= p.M && dc + (cm_col + 1) * TN <= p.N;
|
||||
|
||||
if (is_aligned && is_in_bounds) {
|
||||
// Full coopMat is within bounds and stride_d is aligned with 16B
|
||||
coopmat<D_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> cm_dtype = coopmat<D_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(sums[cm_col * cms_per_row + cm_row]);
|
||||
coopMatStore(cm_dtype, data_d, offsets + (dc + cm_col * TN) * p.stride_d + dr + cm_row * TM, p.stride_d, gl_CooperativeMatrixLayoutColumnMajor);
|
||||
} else if (is_in_bounds) {
|
||||
// Full coopMat is within bounds, but stride_d is not aligned
|
||||
coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
|
||||
|
||||
[[unroll]] for (uint col = 0; col < TN; col += storestride) {
|
||||
data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
|
||||
}
|
||||
} else if (dr + cm_row * TM < p.M && dc + cm_col * TN < p.N) {
|
||||
// Partial coopMat is within bounds
|
||||
coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
|
||||
|
||||
[[unroll]] for (uint col = 0; col < TN; col += storestride) {
|
||||
if (dr + cm_row * TM + store_r < p.M && dc + cm_col * TN + col + store_c < p.N) {
|
||||
data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // MUL_MAT_ID
|
||||
#else
|
||||
[[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
|
||||
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
|
||||
|
||||
const uint dr_warp = dr + wsir * WSUBM + tiwr * TM;
|
||||
const uint dc_warp = dc + wsic * WSUBN + tiwc * TN;
|
||||
[[unroll]] for (uint cc = 0; cc < TN; cc++) {
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint row_i = dc_warp + cc;
|
||||
if (row_i >= _ne1) break;
|
||||
|
||||
const u16vec2 row_idx = row_ids[row_i];
|
||||
#endif // MUL_MAT_ID
|
||||
[[unroll]] for (uint cr = 0; cr < TM; cr++) {
|
||||
#ifdef MUL_MAT_ID
|
||||
data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]);
|
||||
#else
|
||||
if (dr_warp + cr < p.M && dc_warp + cc < p.N) {
|
||||
data_d[offsets + (dc_warp + cc) * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]);
|
||||
}
|
||||
#endif // MUL_MAT_ID
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // COOPMAT
|
||||
}
|
||||
|
|
@ -0,0 +1,328 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
|
||||
|
||||
#extension GL_KHR_memory_scope_semantics : enable
|
||||
#extension GL_KHR_cooperative_matrix : enable
|
||||
#extension GL_NV_cooperative_matrix2 : enable
|
||||
#extension GL_EXT_buffer_reference : enable
|
||||
#extension GL_KHR_shader_subgroup_ballot : enable
|
||||
#extension GL_KHR_shader_subgroup_vote : enable
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (constant_id = 1) const uint BM = 64;
|
||||
layout (constant_id = 2) const uint BN = 64;
|
||||
layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint M;
|
||||
uint N;
|
||||
uint K;
|
||||
uint stride_a;
|
||||
uint stride_b;
|
||||
uint stride_d;
|
||||
|
||||
uint batch_stride_a;
|
||||
uint batch_stride_b;
|
||||
uint batch_stride_d;
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
uint nei0;
|
||||
uint nei1;
|
||||
uint nbi1;
|
||||
uint ne11;
|
||||
#else
|
||||
uint k_split;
|
||||
uint ne02;
|
||||
uint ne12;
|
||||
uint broadcast2;
|
||||
uint broadcast3;
|
||||
#endif
|
||||
} p;
|
||||
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
#if QUANT_K > 1
|
||||
#define DECODEFUNCA , dequantFuncA
|
||||
#define MAT_A_TYPE float16_t
|
||||
|
||||
#include "dequant_funcs_cm2.comp"
|
||||
|
||||
#else
|
||||
#define DECODEFUNCA
|
||||
#define MAT_A_TYPE A_TYPE
|
||||
#endif
|
||||
|
||||
#define MAT_B_TYPE B_TYPE
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
layout (binding = 3) readonly buffer IDS {int data_ids[];};
|
||||
|
||||
shared u16vec4 row_ids[3072];
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB {
|
||||
B_TYPE b[];
|
||||
};
|
||||
|
||||
uint _ne1;
|
||||
shared uint _ne1_sh;
|
||||
|
||||
B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const uint row_i = blockCoords[0];
|
||||
|
||||
if (row_i >= _ne1) {
|
||||
return B_TYPE(0.0);
|
||||
}
|
||||
|
||||
const u16vec4 row_idx = row_ids[row_i];
|
||||
B_TYPE ret = data_b[row_idx.y * p.batch_stride_b + row_idx.x * p.stride_b + blockCoords[1]];
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t ir, const in uint32_t ic)
|
||||
{
|
||||
uint dr = ir * BM + r;
|
||||
uint dc = ic * BN + c;
|
||||
|
||||
if (dr < p.M && dc < _ne1) {
|
||||
uint row_i = dc;
|
||||
const u16vec4 row_idx = row_ids[row_i];
|
||||
data_d[row_idx.y * p.batch_stride_d + row_idx.z * p.stride_d + dr] = elem;
|
||||
}
|
||||
return elem;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void main() {
|
||||
#if defined(DATA_A_IQ4_NL)
|
||||
init_iq4nl_shmem();
|
||||
#endif
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint expert_idx = gl_GlobalInvocationID.z;
|
||||
#else
|
||||
const uint batch_idx = gl_GlobalInvocationID.z;
|
||||
|
||||
const uint i13 = batch_idx / p.ne12;
|
||||
const uint i12 = batch_idx % p.ne12;
|
||||
|
||||
const uint i03 = i13 / p.broadcast3;
|
||||
const uint i02 = i12 / p.broadcast2;
|
||||
|
||||
const uint batch_idx_a = i03 * p.ne02 + i02;
|
||||
#endif
|
||||
|
||||
const uint blocks_m = (p.M + BM - 1) / BM;
|
||||
const uint ir = gl_WorkGroupID.x % blocks_m;
|
||||
const uint ik = gl_WorkGroupID.x / blocks_m;
|
||||
const uint ic = gl_WorkGroupID.y;
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
// Spread the search across all elements in the first subgroup
|
||||
if (gl_SubgroupID == 0) {
|
||||
_ne1 = 0;
|
||||
uint num_elements = p.nei1 * p.nei0;
|
||||
|
||||
for (uint i = gl_SubgroupInvocationID; subgroupAny(i < num_elements); i += gl_SubgroupSize) {
|
||||
bool in_range = i < num_elements;
|
||||
uint ii0 = i % p.nei0;
|
||||
uint ii1 = i / p.nei0;
|
||||
uint id = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
|
||||
uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
|
||||
uint idx = subgroupBallotExclusiveBitCount(ballot);
|
||||
if (in_range && id == expert_idx) {
|
||||
row_ids[_ne1 + idx] = u16vec4(ii0 % p.ne11, ii1, ii0, 0);
|
||||
}
|
||||
_ne1 += subgroupBallotBitCount(ballot);
|
||||
}
|
||||
_ne1_sh = _ne1;
|
||||
}
|
||||
|
||||
barrier();
|
||||
|
||||
_ne1 = _ne1_sh;
|
||||
|
||||
// Workgroup has no work
|
||||
if (ic * BN >= _ne1) return;
|
||||
#endif
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
uint start_k = 0;
|
||||
const uint end_k = p.K;
|
||||
#else
|
||||
uint start_k = ik * p.k_split;
|
||||
const uint end_k = min(p.K, (ik + 1) * p.k_split);
|
||||
#endif
|
||||
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> sum;
|
||||
sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(0.0);
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
uint pos_a = (expert_idx * p.batch_stride_a) / QUANT_K;
|
||||
uint pos_b = 0;
|
||||
#else
|
||||
uint pos_a = (batch_idx_a * p.batch_stride_a) / QUANT_K;
|
||||
uint pos_b = batch_idx * p.batch_stride_b;
|
||||
#endif
|
||||
|
||||
uint stride_a = p.stride_a / QUANT_K;
|
||||
uint stride_b = p.stride_b;
|
||||
|
||||
// Hint to the compiler that values are aligned (want 16B alignment).
|
||||
// Quants are always block-aligned, no alignment needed.
|
||||
#if ALIGNED
|
||||
#if QUANT_K == 1
|
||||
stride_a &= ~7;
|
||||
#endif
|
||||
stride_b &= ~7;
|
||||
#endif
|
||||
|
||||
// Create layouts for both clamped and unclamped accesses
|
||||
tensorLayoutNV<2> tensorLayoutA = createTensorLayoutNV(2);
|
||||
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutAClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
|
||||
tensorLayoutNV<2> tensorLayoutB = createTensorLayoutNV(2);
|
||||
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutBClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
|
||||
tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
|
||||
|
||||
#if QUANT_K > 1
|
||||
tensorLayoutA = setTensorLayoutBlockSizeNV(tensorLayoutA, 1, QUANT_K);
|
||||
tensorLayoutAClamp = setTensorLayoutBlockSizeNV(tensorLayoutAClamp, 1, QUANT_K);
|
||||
#endif
|
||||
|
||||
// Use end_k rather than p.K as the dimension because that's what
|
||||
// we need to bound check against when using split_k
|
||||
tensorLayoutA = setTensorLayoutDimensionNV(tensorLayoutA, p.M, end_k);
|
||||
tensorLayoutB = setTensorLayoutDimensionNV(tensorLayoutB, p.N, end_k);
|
||||
tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.N, p.M);
|
||||
tensorLayoutAClamp = setTensorLayoutDimensionNV(tensorLayoutAClamp, p.M, end_k);
|
||||
tensorLayoutBClamp = setTensorLayoutDimensionNV(tensorLayoutBClamp, p.N, end_k);
|
||||
|
||||
tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0);
|
||||
|
||||
#if !defined(MUL_MAT_ID)
|
||||
// Detect a fast path where all loads are entirely in bounds and no clamping is required
|
||||
if ((ir + 1) * BM <= p.M && (ic + 1) * BN <= p.N && (start_k % BK) == 0 && (end_k % BK) == 0 &&
|
||||
#if QUANT_K == 1
|
||||
(stride_a % 8) == 0 &&
|
||||
#endif
|
||||
(stride_b % 8) == 0 && (start_k % 8) == 0) {
|
||||
// Hint to the compiler that values are aligned (want 16B alignment)
|
||||
start_k &= ~7;
|
||||
stride_b &= ~7;
|
||||
#if QUANT_K == 1
|
||||
stride_a &= ~7;
|
||||
#endif
|
||||
|
||||
tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, stride_a, 1);
|
||||
tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1);
|
||||
|
||||
uint k_iters = (end_k - start_k + BK - 1) / BK;
|
||||
|
||||
for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
|
||||
|
||||
coopmat<MAT_A_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
|
||||
coopmat<MAT_B_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
|
||||
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
|
||||
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose);
|
||||
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
|
||||
|
||||
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
|
||||
}
|
||||
} else
|
||||
#endif // !defined(MUL_MAT_ID)
|
||||
{
|
||||
tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, stride_a, 1);
|
||||
|
||||
tensorLayoutAClamp = setTensorLayoutStrideNV(tensorLayoutAClamp, stride_a, 1);
|
||||
|
||||
tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1);
|
||||
|
||||
tensorLayoutBClamp = setTensorLayoutStrideNV(tensorLayoutBClamp, stride_b, 1);
|
||||
|
||||
[[dont_unroll]]
|
||||
for (uint block_k = start_k; block_k < end_k; block_k += BK) {
|
||||
|
||||
coopmat<MAT_A_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
|
||||
coopmat<MAT_B_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
|
||||
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a_ft;
|
||||
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b_ft;
|
||||
|
||||
// Clamping is expensive, so detect different code paths for each combination
|
||||
// of A and B needing clamping.
|
||||
bool unclampedA = (ir + 1) * BM <= p.M && block_k + BK <= end_k && (block_k % 8) == 0;
|
||||
#ifdef MUL_MAT_ID
|
||||
bool unclampedB = true;
|
||||
#else
|
||||
bool unclampedB = (ic + 1) * BN <= p.N && block_k + BK <= end_k && (block_k % 8) == 0;
|
||||
#endif
|
||||
if (unclampedA && unclampedB) {
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA);
|
||||
#ifdef MUL_MAT_ID
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
#else
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose);
|
||||
#endif
|
||||
mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
|
||||
mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
|
||||
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
|
||||
} else if (unclampedA && !unclampedB) {
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
|
||||
|
||||
mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
|
||||
mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
|
||||
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
|
||||
} else if (!unclampedA && unclampedB) {
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
#ifdef MUL_MAT_ID
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
|
||||
#else
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose);
|
||||
#endif
|
||||
mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
|
||||
mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
|
||||
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
|
||||
} else if (!unclampedA && !unclampedB) {
|
||||
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
|
||||
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
|
||||
|
||||
mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
|
||||
mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
|
||||
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Convert from ACC_TYPE to D_TYPE
|
||||
coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> mat_d;
|
||||
mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(sum);
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
// Call callback to store each element, remapping row through shared memory
|
||||
coopMatPerElementNV(mat_d, mat_d, perElemOpD, ir, ic);
|
||||
#else
|
||||
tensorLayoutD = setTensorLayoutStrideNV(tensorLayoutD, p.stride_d, 1);
|
||||
|
||||
uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
|
||||
coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BN, ir * BM, BM), tensorViewTranspose);
|
||||
#endif
|
||||
}
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
#version 450
|
||||
|
||||
#include "generic_head.comp"
|
||||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#define BLOCK_SIZE 512
|
||||
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
shared vec2 sum[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
|
||||
sum[tid] = vec2(0.0f, 0.0f);
|
||||
|
||||
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
||||
const float xi = float(data_a[row*p.KX + col]);
|
||||
sum[tid].x += xi;
|
||||
sum[tid].y += xi * xi;
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier();
|
||||
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
sum[tid] += sum[tid + s];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
const float mean = sum[0].x / p.KX;
|
||||
const float var = sum[0].y / p.KX - mean * mean;
|
||||
const float inv_std = inversesqrt(var + p.param1);
|
||||
|
||||
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
||||
data_d[row*p.KX + col] = D_TYPE((float(data_a[row*p.KX + col]) - mean) * inv_std);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i3 = idx / (p.ne12*p.ne11*p.ne10);
|
||||
const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10;
|
||||
const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10);
|
||||
const uint i2_offset = i2*p.ne11*p.ne10;
|
||||
const uint i1 = (idx - i3_offset - i2_offset) / p.ne10;
|
||||
const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;
|
||||
|
||||
const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
|
||||
const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;
|
||||
|
||||
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
|
||||
|
||||
data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
|
||||
}
|
||||
|
|
@ -0,0 +1,74 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
layout(push_constant) uniform parameter {
|
||||
uint IW; uint IH;
|
||||
uint OW; uint OH;
|
||||
uint OC;
|
||||
uint pelements;
|
||||
uint op;
|
||||
int k0; int k1;
|
||||
int s0; int s1;
|
||||
int p0; int p1;
|
||||
} p;
|
||||
|
||||
#define BLOCK_SIZE 512
|
||||
#define FLT_MAX 3.402823466e+38F
|
||||
#define OP_POOL_MAX 0u
|
||||
#define OP_POOL_AVG 1u
|
||||
|
||||
layout (local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout(binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint idx = gl_GlobalInvocationID.x;
|
||||
if (idx >= p.pelements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint O_HW = p.OW * p.OH;
|
||||
|
||||
const uint nc = idx / O_HW;
|
||||
const uint cur_oh = (idx % O_HW) / p.OW;
|
||||
const uint cur_ow = (idx % O_HW) % p.OW;
|
||||
|
||||
const int start_h = int(cur_oh) * p.s0 - p.p0;
|
||||
const uint bh = max(start_h, 0);
|
||||
const uint eh = min(start_h + p.k0, p.IH);
|
||||
|
||||
const int start_w = int(cur_ow) * p.s1 - p.p1;
|
||||
const uint bw = max(start_w, 0);
|
||||
const uint ew = min(start_w + p.k1, p.IW);
|
||||
|
||||
const float scale = 1.0 / float(p.k0 * p.k1);
|
||||
float res;
|
||||
|
||||
if (p.op == OP_POOL_AVG) {
|
||||
res = 0.0;
|
||||
} else if (p.op == OP_POOL_MAX) {
|
||||
res = -FLT_MAX;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (uint i = bh; i < eh; i++) {
|
||||
#pragma unroll
|
||||
for (uint j = bw; j < ew; j++) {
|
||||
const float cur = D_TYPE(data_a[nc * p.IH * p.IW + i * p.IW + j]);
|
||||
|
||||
if (p.op == OP_POOL_AVG) {
|
||||
res += cur * scale;
|
||||
} else if (p.op == OP_POOL_MAX) {
|
||||
res = max(res, cur);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
data_d[nc * O_HW + cur_oh * p.OW + cur_ow] = res;
|
||||
}
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
#version 450
|
||||
|
||||
#include "generic_head.comp"
|
||||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
data_d[i] = max(float(data_a[i]), 0);
|
||||
}
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
uint src0_idx_mod(uint idx) {
|
||||
const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
|
||||
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
|
||||
const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10);
|
||||
const uint i12_offset = i12*p.ne11*p.ne10;
|
||||
const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
|
||||
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
|
||||
return (i13 % p.ne03)*p.nb03 + (i12 % p.ne02)*p.nb02 + (i11 % p.ne01)*p.nb01 + (i10 % p.ne00)*p.nb00;
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx_mod(idx)]);
|
||||
}
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
#version 450
|
||||
|
||||
#include "generic_head.comp"
|
||||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#define BLOCK_SIZE 512
|
||||
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
shared FLOAT_TYPE sum[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
|
||||
sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
|
||||
|
||||
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
||||
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[row*p.KX + col]);
|
||||
sum[tid] += xi * xi;
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier();
|
||||
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
sum[tid] += sum[tid + s];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(p.KX);
|
||||
const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1));
|
||||
|
||||
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
|
||||
data_d[row*p.KX + col] = D_TYPE(scale * FLOAT_TYPE(data_a[row*p.KX + col]));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#extension GL_EXT_spirv_intrinsics: enable
|
||||
|
||||
#if RTE16
|
||||
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
|
||||
#endif
|
||||
|
||||
layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer Y {int data_pos[];};
|
||||
layout (binding = 2) readonly buffer Z {float data_ff[];};
|
||||
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint ncols;
|
||||
uint n_dims;
|
||||
float freq_scale;
|
||||
uint p_delta_rows;
|
||||
float freq_base;
|
||||
float ext_factor;
|
||||
float attn_factor;
|
||||
float corr_dims[2];
|
||||
float theta_scale;
|
||||
uint has_ff;
|
||||
} p;
|
||||
|
||||
float rope_yarn_ramp(const float low, const float high, const uint i0) {
|
||||
const float y = (i0 / 2 - low) / max(0.001f, high - low);
|
||||
return 1.0f - min(1.0f, max(0.0f, y));
|
||||
}
|
||||
|
||||
void rope_yarn(const float theta_extrap, const uint i0, out float cos_theta, out float sin_theta) {
|
||||
float mscale = p.attn_factor;
|
||||
// Get n-d rotational scaling corrected for extrapolation
|
||||
float theta_interp = p.freq_scale * theta_extrap;
|
||||
float theta = theta_interp;
|
||||
if (p.ext_factor != 0.0f) {
|
||||
float ramp_mix = rope_yarn_ramp(p.corr_dims[0], p.corr_dims[1], i0) * p.ext_factor;
|
||||
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
|
||||
// Get n-d magnitude scaling corrected for interpolation
|
||||
mscale *= 1.0f + 0.1f * log(1.0f / p.freq_scale);
|
||||
}
|
||||
cos_theta = cos(theta) * mscale;
|
||||
sin_theta = sin(theta) * mscale;
|
||||
}
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
#version 450
|
||||
|
||||
#include "rope_head.comp"
|
||||
|
||||
void main() {
|
||||
const uint col = gl_GlobalInvocationID.y * 2;
|
||||
const uint row = gl_GlobalInvocationID.x;
|
||||
|
||||
if (col >= p.ncols) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (col >= p.n_dims) {
|
||||
const uint i = row*p.ncols + col;
|
||||
|
||||
data_d[i + 0] = data_a[i + 0];
|
||||
data_d[i + 1] = data_a[i + 1];
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i = row*p.ncols + col/2;
|
||||
const uint i2 = row/p.p_delta_rows;
|
||||
|
||||
const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f);
|
||||
|
||||
const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f;
|
||||
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta);
|
||||
|
||||
const float x0 = float(data_a[i + 0]);
|
||||
const float x1 = float(data_a[i + p.n_dims/2]);
|
||||
|
||||
data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
|
||||
data_d[i + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta);
|
||||
}
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
#version 450
|
||||
|
||||
#include "rope_head.comp"
|
||||
|
||||
void main() {
|
||||
const uint col = gl_GlobalInvocationID.y * 2;
|
||||
const uint row = gl_GlobalInvocationID.x;
|
||||
|
||||
if (col >= p.ncols) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (col >= p.n_dims) {
|
||||
const uint i = row*p.ncols + col;
|
||||
|
||||
data_d[i + 0] = data_a[i + 0];
|
||||
data_d[i + 1] = data_a[i + 1];
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i = row*p.ncols + col;
|
||||
const uint i2 = row/p.p_delta_rows;
|
||||
|
||||
const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f);
|
||||
|
||||
const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f;
|
||||
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta);
|
||||
|
||||
const float x0 = float(data_a[i + 0]);
|
||||
const float x1 = float(data_a[i + 1]);
|
||||
|
||||
data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
|
||||
data_d[i + 1] = D_TYPE(x0*sin_theta + x1*cos_theta);
|
||||
}
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
const uint num_threads = 128;
|
||||
|
||||
layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
uint idx = get_idx();
|
||||
|
||||
// num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
|
||||
const uint num_iter = 4;
|
||||
|
||||
[[unroll]] for (uint i = 0; i < num_iter; ++i) {
|
||||
if (idx >= p.ne) {
|
||||
continue;
|
||||
}
|
||||
|
||||
data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) * FLOAT_TYPE(p.param1));
|
||||
idx += num_threads;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
#version 450
|
||||
|
||||
#include "generic_head.comp"
|
||||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float xi = float(data_a[i]);
|
||||
data_d[i] = D_TYPE(xi / (1.0f + exp(-xi)));
|
||||
}
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sin(val));
|
||||
}
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint KX;
|
||||
uint KY;
|
||||
float scale;
|
||||
float max_bias;
|
||||
float m0;
|
||||
float m1;
|
||||
uint n_head_log2;
|
||||
uint nrows_x;
|
||||
} p;
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
layout(constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer Y {B_TYPE data_b[];};
|
||||
layout (binding = 2) buffer D {D_TYPE data_d[];};
|
||||
|
||||
shared FLOAT_TYPE vals[BLOCK_SIZE];
|
||||
|
||||
// num_iters is the number of BLOCK_SIZE loop iterations we need to iterate
|
||||
// over all the columns. The main function tries to pass a constant here,
|
||||
// as if it were a template function, to allow unrolling.
|
||||
void soft_max(uint num_iters) {
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
||||
const uint rowy = (p.KY > 0) ? (rowx % p.KY) : 0;
|
||||
|
||||
if (rowx >= p.nrows_x) {
|
||||
return;
|
||||
}
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
// ALiBi
|
||||
if (p.max_bias > 0.0f) {
|
||||
const uint h = rowx/p.KY; // head index
|
||||
|
||||
const float base = h < p.n_head_log2 ? p.m0 : p.m1;
|
||||
const uint exp = h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1;
|
||||
|
||||
slope = pow(base, exp);
|
||||
}
|
||||
|
||||
// Find max
|
||||
FLOAT_TYPE max_val = uintBitsToFloat(0xFF800000);
|
||||
|
||||
// Cache values while we compute the max, so we don't need to read them
|
||||
// again when we're ready to compute exp(x-max).
|
||||
const uint DATA_CACHE_SIZE = 16;
|
||||
FLOAT_TYPE data_cache[DATA_CACHE_SIZE];
|
||||
|
||||
[[unroll]] for (uint col0 = 0, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
|
||||
const uint col = col0 + tid;
|
||||
|
||||
FLOAT_TYPE a = FLOAT_TYPE(0);
|
||||
if (col < p.KX) {
|
||||
a = data_a[rowx * p.KX + col];
|
||||
}
|
||||
|
||||
FLOAT_TYPE b = FLOAT_TYPE(0);
|
||||
if (p.KY > 0 && col < p.KX) {
|
||||
b = data_b[rowy * p.KX + col];
|
||||
}
|
||||
|
||||
FLOAT_TYPE v = a * p.scale + slope * b;
|
||||
|
||||
if (col < p.KX) {
|
||||
max_val = max(max_val, v);
|
||||
}
|
||||
|
||||
if (idx < DATA_CACHE_SIZE) {
|
||||
data_cache[idx] = v;
|
||||
}
|
||||
}
|
||||
|
||||
// reduce across the workgroup
|
||||
vals[tid] = max_val;
|
||||
barrier();
|
||||
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
vals[tid] = max(vals[tid], vals[tid + s]);
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
max_val = vals[0];
|
||||
barrier();
|
||||
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0f);
|
||||
|
||||
// Compute sum{exp(x - max)}
|
||||
[[unroll]] for (uint col0 = 0, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
|
||||
const uint col = col0 + tid;
|
||||
|
||||
if (col >= p.KX) {
|
||||
break;
|
||||
}
|
||||
|
||||
// compute exp(a*scale+b*slope), add it to sum, and cache the new value
|
||||
// in data_cache if possible.
|
||||
const uint i = rowx * p.KX + col;
|
||||
FLOAT_TYPE val;
|
||||
if (idx < DATA_CACHE_SIZE) {
|
||||
val = exp(data_cache[idx] - max_val);
|
||||
} else {
|
||||
val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)) - max_val);
|
||||
}
|
||||
sum += val;
|
||||
if (idx < DATA_CACHE_SIZE) {
|
||||
data_cache[idx] = val;
|
||||
} else {
|
||||
data_d[i] = D_TYPE(val);
|
||||
}
|
||||
}
|
||||
|
||||
// reduce across the workgroup
|
||||
vals[tid] = sum;
|
||||
barrier();
|
||||
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
vals[tid] += vals[tid + s];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
sum = vals[0];
|
||||
|
||||
FLOAT_TYPE rcpdivisor = 1.0/sum;
|
||||
|
||||
[[unroll]] for (uint col0 = 0, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
|
||||
const uint col = col0 + tid;
|
||||
|
||||
if (col >= p.KX) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (idx < DATA_CACHE_SIZE) {
|
||||
data_d[rowx*p.KX + col] = D_TYPE(data_cache[idx] * rcpdivisor);
|
||||
} else {
|
||||
data_d[rowx*p.KX + col] *= D_TYPE(rcpdivisor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void main() {
|
||||
// instantiate the soft_max function for several different
|
||||
// dimensions, to allow loop unrolling
|
||||
uint num_blocks = (p.KX + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
if (num_blocks > 32) {
|
||||
soft_max(num_blocks);
|
||||
} else if (num_blocks > 16) {
|
||||
soft_max(32);
|
||||
} else if (num_blocks > 8) {
|
||||
soft_max(16);
|
||||
} else if (num_blocks > 4) {
|
||||
soft_max(8);
|
||||
} else if (num_blocks == 4) {
|
||||
soft_max(4);
|
||||
} else if (num_blocks == 3) {
|
||||
soft_max(3);
|
||||
} else if (num_blocks == 2) {
|
||||
soft_max(2);
|
||||
} else if (num_blocks == 1) {
|
||||
soft_max(1);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_unary_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val * val);
|
||||
}
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
#version 450
|
||||
|
||||
#include "generic_head.comp"
|
||||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
|
||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
||||
const uint col = gl_LocalInvocationID.x;
|
||||
|
||||
tmp[col] = FLOAT_TYPE(0.0f);
|
||||
|
||||
for (uint i = col; i < p.KX; i += BLOCK_SIZE) {
|
||||
tmp[col] += FLOAT_TYPE(data_a[row*p.KX + i]);
|
||||
}
|
||||
|
||||
barrier();
|
||||
[[unroll]] for (int s = int(BLOCK_SIZE) / 2; s > 0; s >>= 1) {
|
||||
if (col < s) {
|
||||
tmp[col] += tmp[col + s];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
if (col == 0) {
|
||||
data_d[row] = D_TYPE(tmp[0]);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
#version 450
|
||||
|
||||
#include "generic_head.comp"
|
||||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
data_d[i] = D_TYPE(1. - 2. / (exp(2.*data_a[i]) + 1.));
|
||||
}
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
#version 460
|
||||
|
||||
#extension GL_NV_cooperative_matrix2 : require
|
||||
|
||||
void main()
|
||||
{
|
||||
}
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint nb1;
|
||||
uint dim;
|
||||
uint max_period;
|
||||
} p;
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : enable
|
||||
#define BLOCK_SIZE 256
|
||||
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.y;
|
||||
const uint j = gl_GlobalInvocationID.x;
|
||||
const uint d_offset = i * p.nb1;
|
||||
|
||||
if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) {
|
||||
data_d[d_offset + p.dim] = 0.f;
|
||||
}
|
||||
|
||||
const uint half_dim = p.dim / 2;
|
||||
if (j >= half_dim) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float timestep = float(data_a[i]);
|
||||
const float freq = float(exp(-log(p.max_period) * j / half_dim));
|
||||
const float arg = timestep * freq;
|
||||
data_d[d_offset + j] = D_TYPE(cos(arg));
|
||||
data_d[d_offset + j + half_dim] = D_TYPE(sin(arg));
|
||||
}
|
||||
|
|
@ -0,0 +1,323 @@
|
|||
|
||||
#if !defined(GGML_TYPES_COMP)
|
||||
#define GGML_TYPES_COMP
|
||||
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types : require
|
||||
|
||||
#if defined(DATA_A_F32)
|
||||
#define QUANT_K 1
|
||||
#define QUANT_R 1
|
||||
|
||||
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
|
||||
#define A_TYPE float
|
||||
#elif LOAD_VEC_A == 4
|
||||
#define A_TYPE vec4
|
||||
#elif LOAD_VEC_A == 8
|
||||
#define A_TYPE mat2x4
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_F16)
|
||||
#define QUANT_K 1
|
||||
#define QUANT_R 1
|
||||
|
||||
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
|
||||
#define A_TYPE float16_t
|
||||
#elif LOAD_VEC_A == 4
|
||||
#define A_TYPE f16vec4
|
||||
#elif LOAD_VEC_A == 8
|
||||
#define A_TYPE f16mat2x4
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define QUANT_K_Q4_0 32
|
||||
#define QUANT_R_Q4_0 2
|
||||
|
||||
struct block_q4_0
|
||||
{
|
||||
float16_t d;
|
||||
uint8_t qs[16];
|
||||
};
|
||||
struct block_q4_0_packed16
|
||||
{
|
||||
float16_t d;
|
||||
uint16_t qs[16/2];
|
||||
};
|
||||
|
||||
#if defined(DATA_A_Q4_0)
|
||||
#define QUANT_K QUANT_K_Q4_0
|
||||
#define QUANT_R QUANT_R_Q4_0
|
||||
#define A_TYPE block_q4_0
|
||||
#define A_TYPE_PACKED16 block_q4_0_packed16
|
||||
#endif
|
||||
|
||||
#define QUANT_K_Q4_1 32
|
||||
#define QUANT_R_Q4_1 2
|
||||
|
||||
struct block_q4_1
|
||||
{
|
||||
float16_t d;
|
||||
float16_t m;
|
||||
uint8_t qs[16];
|
||||
};
|
||||
|
||||
struct block_q4_1_packed16
|
||||
{
|
||||
float16_t d;
|
||||
float16_t m;
|
||||
uint16_t qs[16/2];
|
||||
};
|
||||
|
||||
#if defined(DATA_A_Q4_1)
|
||||
#define QUANT_K QUANT_K_Q4_1
|
||||
#define QUANT_R QUANT_R_Q4_1
|
||||
#define A_TYPE block_q4_1
|
||||
#define A_TYPE_PACKED16 block_q4_1_packed16
|
||||
#endif
|
||||
|
||||
#define QUANT_K_Q5_0 32
|
||||
#define QUANT_R_Q5_0 2
|
||||
|
||||
struct block_q5_0
|
||||
{
|
||||
float16_t d;
|
||||
uint16_t qh[2];
|
||||
uint8_t qs[16];
|
||||
};
|
||||
|
||||
struct block_q5_0_packed16
|
||||
{
|
||||
float16_t d;
|
||||
uint16_t qh[2];
|
||||
uint16_t qs[16/2];
|
||||
};
|
||||
|
||||
#if defined(DATA_A_Q5_0)
|
||||
#define QUANT_K QUANT_K_Q5_0
|
||||
#define QUANT_R QUANT_R_Q5_0
|
||||
#define A_TYPE block_q5_0
|
||||
#define A_TYPE_PACKED16 block_q5_0_packed16
|
||||
#endif
|
||||
|
||||
#define QUANT_K_Q5_1 32
|
||||
#define QUANT_R_Q5_1 2
|
||||
|
||||
struct block_q5_1
|
||||
{
|
||||
float16_t d;
|
||||
float16_t m;
|
||||
uint qh;
|
||||
uint8_t qs[16];
|
||||
};
|
||||
|
||||
struct block_q5_1_packed16
|
||||
{
|
||||
float16_t d;
|
||||
float16_t m;
|
||||
uint qh;
|
||||
uint16_t qs[16/2];
|
||||
};
|
||||
|
||||
#if defined(DATA_A_Q5_1)
|
||||
#define QUANT_K QUANT_K_Q5_1
|
||||
#define QUANT_R QUANT_R_Q5_1
|
||||
#define A_TYPE block_q5_1
|
||||
#define A_TYPE_PACKED16 block_q5_1_packed16
|
||||
#endif
|
||||
|
||||
#define QUANT_K_Q8_0 32
|
||||
#define QUANT_R_Q8_0 1
|
||||
|
||||
struct block_q8_0
|
||||
{
|
||||
float16_t d;
|
||||
int8_t qs[32];
|
||||
};
|
||||
struct block_q8_0_packed16
|
||||
{
|
||||
float16_t d;
|
||||
uint16_t qs[32/2];
|
||||
};
|
||||
|
||||
#if defined(DATA_A_Q8_0)
|
||||
#define QUANT_K QUANT_K_Q8_0
|
||||
#define QUANT_R QUANT_R_Q8_0
|
||||
#define A_TYPE block_q8_0
|
||||
#define A_TYPE_PACKED16 block_q8_0_packed16
|
||||
#endif
|
||||
|
||||
// K-quants
|
||||
#define QUANT_K_Q2_K 256
|
||||
|
||||
struct block_q2_K
|
||||
{
|
||||
uint8_t scales[QUANT_K_Q2_K/16];
|
||||
uint8_t qs[QUANT_K_Q2_K/4];
|
||||
f16vec2 d;
|
||||
};
|
||||
|
||||
struct block_q2_K_packed16
|
||||
{
|
||||
uint16_t scales[QUANT_K_Q2_K/16/2];
|
||||
uint16_t qs[QUANT_K_Q2_K/4/2];
|
||||
f16vec2 d;
|
||||
};
|
||||
|
||||
struct block_q2_K_packed32
|
||||
{
|
||||
uint32_t scales[QUANT_K_Q2_K/16/4];
|
||||
uint32_t qs[QUANT_K_Q2_K/4/4];
|
||||
f16vec2 d;
|
||||
};
|
||||
|
||||
#if defined(DATA_A_Q2_K)
|
||||
#define QUANT_K QUANT_K_Q2_K
|
||||
#define A_TYPE block_q2_K
|
||||
#define A_TYPE_PACKED16 block_q2_K_packed16
|
||||
#define A_TYPE_PACKED32 block_q2_K_packed32
|
||||
#endif
|
||||
|
||||
#define QUANT_K_Q3_K 256
|
||||
|
||||
struct block_q3_K
|
||||
{
|
||||
uint8_t hmask[QUANT_K_Q3_K/8];
|
||||
uint8_t qs[QUANT_K_Q3_K/4];
|
||||
uint8_t scales[12];
|
||||
float16_t d;
|
||||
};
|
||||
|
||||
struct block_q3_K_packed16
|
||||
{
|
||||
uint16_t hmask[QUANT_K_Q3_K/8/2];
|
||||
uint16_t qs[QUANT_K_Q3_K/4/2];
|
||||
uint16_t scales[12/2];
|
||||
float16_t d;
|
||||
};
|
||||
|
||||
#if defined(DATA_A_Q3_K)
|
||||
#define QUANT_K QUANT_K_Q3_K
|
||||
#define A_TYPE block_q3_K
|
||||
#define A_TYPE_PACKED16 block_q3_K_packed16
|
||||
#endif
|
||||
|
||||
#define QUANT_K_Q4_K 256
|
||||
|
||||
struct block_q4_K
|
||||
{
|
||||
f16vec2 d;
|
||||
uint8_t scales[3*QUANT_K_Q4_K/64];
|
||||
uint8_t qs[QUANT_K_Q4_K/2];
|
||||
};
|
||||
|
||||
struct block_q4_K_packed16
|
||||
{
|
||||
f16vec2 d;
|
||||
uint16_t scales[3*QUANT_K_Q4_K/64/2];
|
||||
uint16_t qs[QUANT_K_Q4_K/2/2];
|
||||
};
|
||||
|
||||
struct block_q4_K_packed32
|
||||
{
|
||||
f16vec2 d;
|
||||
uint32_t scales[3*QUANT_K_Q4_K/64/4];
|
||||
uint32_t qs[QUANT_K_Q4_K/2/4];
|
||||
};
|
||||
|
||||
#if defined(DATA_A_Q4_K)
|
||||
#define QUANT_K QUANT_K_Q4_K
|
||||
#define A_TYPE block_q4_K
|
||||
#define A_TYPE_PACKED16 block_q4_K_packed16
|
||||
#define A_TYPE_PACKED32 block_q4_K_packed32
|
||||
#endif
|
||||
|
||||
#define QUANT_K_Q5_K 256
|
||||
|
||||
struct block_q5_K
|
||||
{
|
||||
f16vec2 d;
|
||||
uint8_t scales[12];
|
||||
uint8_t qh[QUANT_K_Q5_K/8];
|
||||
uint8_t qs[QUANT_K_Q5_K/2];
|
||||
};
|
||||
|
||||
struct block_q5_K_packed16
|
||||
{
|
||||
f16vec2 d;
|
||||
uint16_t scales[12/2];
|
||||
uint16_t qh[QUANT_K_Q5_K/8/2];
|
||||
uint16_t qs[QUANT_K_Q5_K/2/2];
|
||||
};
|
||||
|
||||
#if defined(DATA_A_Q5_K)
|
||||
#define QUANT_K QUANT_K_Q5_K
|
||||
#define A_TYPE block_q5_K
|
||||
#define A_TYPE_PACKED16 block_q5_K_packed16
|
||||
#endif
|
||||
|
||||
#define QUANT_K_Q6_K 256
|
||||
|
||||
struct block_q6_K
|
||||
{
|
||||
uint8_t ql[QUANT_K_Q6_K/2];
|
||||
uint8_t qh[QUANT_K_Q6_K/4];
|
||||
int8_t scales[QUANT_K_Q6_K/16];
|
||||
float16_t d;
|
||||
};
|
||||
|
||||
struct block_q6_K_packed16
|
||||
{
|
||||
uint16_t ql[QUANT_K_Q6_K/2/2];
|
||||
uint16_t qh[QUANT_K_Q6_K/4/2];
|
||||
int8_t scales[QUANT_K_Q6_K/16];
|
||||
float16_t d;
|
||||
};
|
||||
|
||||
#if defined(DATA_A_Q6_K)
|
||||
#define QUANT_K QUANT_K_Q6_K
|
||||
#define A_TYPE block_q6_K
|
||||
#define A_TYPE_PACKED16 block_q6_K_packed16
|
||||
#endif
|
||||
|
||||
// IQuants
|
||||
|
||||
#define QUANT_K_IQ4_NL 32
|
||||
#define QUANT_R_IQ4_NL 2
|
||||
|
||||
struct block_iq4_nl
|
||||
{
|
||||
float16_t d;
|
||||
uint8_t qs[QUANT_K_IQ4_NL/2];
|
||||
};
|
||||
|
||||
struct block_iq4_nl_packed16
|
||||
{
|
||||
float16_t d;
|
||||
uint16_t qs[QUANT_K_IQ4_NL/2/2];
|
||||
};
|
||||
|
||||
#if defined(DATA_A_IQ4_NL)
|
||||
|
||||
const int8_t kvalues_iq4nl_const[16] = {
|
||||
int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
|
||||
int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
|
||||
};
|
||||
|
||||
shared FLOAT_TYPE kvalues_iq4nl[16];
|
||||
|
||||
void init_iq4nl_shmem()
|
||||
{
|
||||
// copy the table into shared memory and sync
|
||||
if (gl_LocalInvocationIndex.x < 16) {
|
||||
kvalues_iq4nl[gl_LocalInvocationIndex.x] = FLOAT_TYPE(kvalues_iq4nl_const[gl_LocalInvocationIndex.x]);
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
#define QUANT_K QUANT_K_IQ4_NL
|
||||
#define QUANT_R QUANT_R_IQ4_NL
|
||||
#define A_TYPE block_iq4_nl
|
||||
#define A_TYPE_PACKED16 block_iq4_nl_packed16
|
||||
#endif
|
||||
|
||||
#endif // !defined(GGML_TYPES_COMP)
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
#version 450
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint ne; uint a_offset; uint d_offset;
|
||||
uint nb00; uint nb01; uint nb02; uint nb03;
|
||||
uint ne10; uint ne11; uint ne12; uint ne13;
|
||||
float sf0; float sf1; float sf2; float sf3;
|
||||
} p;
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
void main() {
|
||||
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i10 = idx % p.ne10;
|
||||
const uint i11 = (idx / p.ne10) % p.ne11;
|
||||
const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12;
|
||||
const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13;
|
||||
|
||||
const uint i00 = uint(i10 / p.sf0);
|
||||
const uint i01 = uint(i11 / p.sf1);
|
||||
const uint i02 = uint(i12 / p.sf2);
|
||||
const uint i03 = uint(i13 / p.sf3);
|
||||
|
||||
data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
|
||||
}
|
||||
|
|
@ -0,0 +1,594 @@
|
|||
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <stdexcept>
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <future>
|
||||
#include <queue>
|
||||
#include <condition_variable>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <cassert>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#include <direct.h> // For _mkdir on Windows
|
||||
#include <algorithm> // For std::replace on w64devkit
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#include <sys/wait.h>
|
||||
#include <fcntl.h>
|
||||
#endif
|
||||
|
||||
#include <vulkan/vulkan_core.h>
|
||||
|
||||
#define ASYNCIO_CONCURRENCY 64
|
||||
|
||||
std::mutex lock;
|
||||
std::vector<std::pair<std::string, std::string>> shader_fnames;
|
||||
|
||||
std::string GLSLC = "glslc";
|
||||
std::string input_dir = "vulkan-shaders";
|
||||
std::string output_dir = "/tmp";
|
||||
std::string target_hpp = "ggml-vulkan-shaders.hpp";
|
||||
std::string target_cpp = "ggml-vulkan-shaders.cpp";
|
||||
bool no_clean = false;
|
||||
|
||||
const std::vector<std::string> type_names = {
|
||||
"f32",
|
||||
"f16",
|
||||
"q4_0",
|
||||
"q4_1",
|
||||
"q5_0",
|
||||
"q5_1",
|
||||
"q8_0",
|
||||
"q2_k",
|
||||
"q3_k",
|
||||
"q4_k",
|
||||
"q5_k",
|
||||
"q6_k",
|
||||
"iq4_nl"
|
||||
};
|
||||
|
||||
namespace {
|
||||
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) {
|
||||
#ifdef _WIN32
|
||||
HANDLE stdout_read, stdout_write;
|
||||
HANDLE stderr_read, stderr_write;
|
||||
SECURITY_ATTRIBUTES sa = { sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };
|
||||
|
||||
if (!CreatePipe(&stdout_read, &stdout_write, &sa, 0) ||
|
||||
!SetHandleInformation(stdout_read, HANDLE_FLAG_INHERIT, 0)) {
|
||||
throw std::runtime_error("Failed to create stdout pipe");
|
||||
}
|
||||
|
||||
if (!CreatePipe(&stderr_read, &stderr_write, &sa, 0) ||
|
||||
!SetHandleInformation(stderr_read, HANDLE_FLAG_INHERIT, 0)) {
|
||||
throw std::runtime_error("Failed to create stderr pipe");
|
||||
}
|
||||
|
||||
PROCESS_INFORMATION pi;
|
||||
STARTUPINFOA si = {};
|
||||
si.cb = sizeof(STARTUPINFOA);
|
||||
si.dwFlags = STARTF_USESTDHANDLES;
|
||||
si.hStdOutput = stdout_write;
|
||||
si.hStdError = stderr_write;
|
||||
|
||||
std::vector<char> cmd(command.begin(), command.end());
|
||||
cmd.push_back('\0');
|
||||
|
||||
if (!CreateProcessA(NULL, cmd.data(), NULL, NULL, TRUE, 0, NULL, NULL, &si, &pi)) {
|
||||
throw std::runtime_error("Failed to create process");
|
||||
}
|
||||
|
||||
CloseHandle(stdout_write);
|
||||
CloseHandle(stderr_write);
|
||||
|
||||
std::array<char, 128> buffer;
|
||||
DWORD bytes_read;
|
||||
|
||||
while (ReadFile(stdout_read, buffer.data(), (DWORD)buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
|
||||
stdout_str.append(buffer.data(), bytes_read);
|
||||
}
|
||||
|
||||
while (ReadFile(stderr_read, buffer.data(), (DWORD)buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
|
||||
stderr_str.append(buffer.data(), bytes_read);
|
||||
}
|
||||
|
||||
CloseHandle(stdout_read);
|
||||
CloseHandle(stderr_read);
|
||||
WaitForSingleObject(pi.hProcess, INFINITE);
|
||||
CloseHandle(pi.hProcess);
|
||||
CloseHandle(pi.hThread);
|
||||
#else
|
||||
int stdout_pipe[2];
|
||||
int stderr_pipe[2];
|
||||
|
||||
if (pipe(stdout_pipe) != 0 || pipe(stderr_pipe) != 0) {
|
||||
throw std::runtime_error("Failed to create pipes");
|
||||
}
|
||||
|
||||
pid_t pid = fork();
|
||||
if (pid < 0) {
|
||||
throw std::runtime_error("Failed to fork process");
|
||||
}
|
||||
|
||||
if (pid == 0) {
|
||||
close(stdout_pipe[0]);
|
||||
close(stderr_pipe[0]);
|
||||
dup2(stdout_pipe[1], STDOUT_FILENO);
|
||||
dup2(stderr_pipe[1], STDERR_FILENO);
|
||||
close(stdout_pipe[1]);
|
||||
close(stderr_pipe[1]);
|
||||
execl("/bin/sh", "sh", "-c", command.c_str(), (char*) nullptr);
|
||||
_exit(EXIT_FAILURE);
|
||||
} else {
|
||||
close(stdout_pipe[1]);
|
||||
close(stderr_pipe[1]);
|
||||
|
||||
std::array<char, 128> buffer;
|
||||
ssize_t bytes_read;
|
||||
|
||||
while ((bytes_read = read(stdout_pipe[0], buffer.data(), buffer.size())) > 0) {
|
||||
stdout_str.append(buffer.data(), bytes_read);
|
||||
}
|
||||
|
||||
while ((bytes_read = read(stderr_pipe[0], buffer.data(), buffer.size())) > 0) {
|
||||
stderr_str.append(buffer.data(), bytes_read);
|
||||
}
|
||||
|
||||
close(stdout_pipe[0]);
|
||||
close(stderr_pipe[0]);
|
||||
waitpid(pid, nullptr, 0);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
bool directory_exists(const std::string& path) {
|
||||
struct stat info;
|
||||
if (stat(path.c_str(), &info) != 0) {
|
||||
return false; // Path doesn't exist or can't be accessed
|
||||
}
|
||||
return (info.st_mode & S_IFDIR) != 0; // Check if it is a directory
|
||||
}
|
||||
|
||||
bool create_directory(const std::string& path) {
|
||||
#ifdef _WIN32
|
||||
return _mkdir(path.c_str()) == 0 || errno == EEXIST; // EEXIST means the directory already exists
|
||||
#else
|
||||
return mkdir(path.c_str(), 0755) == 0 || errno == EEXIST; // 0755 is the directory permissions
|
||||
#endif
|
||||
}
|
||||
|
||||
std::string to_uppercase(const std::string& input) {
|
||||
std::string result = input;
|
||||
for (char& c : result) {
|
||||
c = std::toupper(c);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
bool string_ends_with(const std::string& str, const std::string& suffix) {
|
||||
if (suffix.size() > str.size()) {
|
||||
return false;
|
||||
}
|
||||
return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
|
||||
}
|
||||
|
||||
static const char path_separator = '/';
|
||||
|
||||
std::string join_paths(const std::string& path1, const std::string& path2) {
|
||||
return path1 + path_separator + path2;
|
||||
}
|
||||
|
||||
std::string basename(const std::string &path) {
|
||||
return path.substr(path.find_last_of("/\\") + 1);
|
||||
}
|
||||
|
||||
// variables to track number of compiles in progress
|
||||
static uint32_t compile_count = 0;
|
||||
static std::mutex compile_count_mutex;
|
||||
static std::condition_variable compile_count_cond;
|
||||
|
||||
void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true, bool coopmat = false, bool coopmat2 = false, bool f16acc = false) {
|
||||
std::string name = _name + (f16acc ? "_f16acc" : "") + (coopmat ? "_coopmat" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32"));
|
||||
std::string out_fname = join_paths(output_dir, name + ".spv");
|
||||
std::string in_path = join_paths(input_dir, in_fname);
|
||||
|
||||
std::string target_env = (name.find("_cm2") != std::string::npos) ? "--target-env=vulkan1.3" : "--target-env=vulkan1.2";
|
||||
|
||||
// disable spirv-opt for coopmat shaders for https://github.com/ggerganov/llama.cpp/issues/10734
|
||||
std::string opt_level = coopmat ? "" : "-O";
|
||||
|
||||
#ifdef _WIN32
|
||||
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", target_env, opt_level, "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""};
|
||||
#else
|
||||
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", target_env, opt_level, in_path, "-o", out_fname};
|
||||
#endif
|
||||
|
||||
#ifdef GGML_VULKAN_SHADER_DEBUG_INFO
|
||||
cmd.push_back("-g");
|
||||
#endif
|
||||
|
||||
for (const auto& define : defines) {
|
||||
cmd.push_back("-D" + define.first + "=" + define.second);
|
||||
}
|
||||
|
||||
std::string command;
|
||||
for (const auto& part : cmd) {
|
||||
command += part + " ";
|
||||
}
|
||||
|
||||
std::string stdout_str, stderr_str;
|
||||
try {
|
||||
// std::cout << "Executing command: ";
|
||||
// for (const auto& part : cmd) {
|
||||
// std::cout << part << " ";
|
||||
// }
|
||||
// std::cout << std::endl;
|
||||
|
||||
execute_command(command, stdout_str, stderr_str);
|
||||
if (!stderr_str.empty()) {
|
||||
std::cerr << "cannot compile " << name << "\n\n" << command << "\n\n" << stderr_str << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> guard(lock);
|
||||
shader_fnames.push_back(std::make_pair(name, out_fname));
|
||||
} catch (const std::exception& e) {
|
||||
std::cerr << "Error executing command for " << name << ": " << e.what() << std::endl;
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(compile_count_mutex);
|
||||
assert(compile_count > 0);
|
||||
compile_count--;
|
||||
}
|
||||
compile_count_cond.notify_all();
|
||||
}
|
||||
|
||||
std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b) {
|
||||
std::map<std::string, std::string> result = a;
|
||||
result.insert(b.begin(), b.end());
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::vector<std::future<void>> compiles;
|
||||
void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true, bool coopmat = false, bool coopmat2 = false, bool f16acc = false) {
|
||||
{
|
||||
// wait until fewer than N compiles are in progress.
|
||||
// 16 is an arbitrary limit, the goal is to avoid "failed to create pipe" errors.
|
||||
uint32_t N = 16;
|
||||
std::unique_lock<std::mutex> guard(compile_count_mutex);
|
||||
while (compile_count >= N) {
|
||||
compile_count_cond.wait(guard);
|
||||
}
|
||||
compile_count++;
|
||||
}
|
||||
compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16, coopmat, coopmat2, f16acc));
|
||||
}
|
||||
|
||||
void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool f16acc) {
|
||||
std::string load_vec = coopmat2 ? "1" : fp16 ? "8" : "4";
|
||||
std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4";
|
||||
std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4";
|
||||
|
||||
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", (coopmat2 || fp16) ? "float16_t" : "float"}};
|
||||
std::string shader_name = "matmul";
|
||||
|
||||
if (matmul_id) {
|
||||
base_dict["MUL_MAT_ID"] = "1";
|
||||
shader_name = "matmul_id";
|
||||
}
|
||||
|
||||
if (fp16) {
|
||||
base_dict["FLOAT16"] = "1";
|
||||
}
|
||||
|
||||
base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float";
|
||||
|
||||
if (coopmat) {
|
||||
base_dict["COOPMAT"] = "1";
|
||||
}
|
||||
|
||||
base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float";
|
||||
|
||||
std::string source_name = coopmat2 ? "mul_mm_cm2.comp" : "mul_mm.comp";
|
||||
|
||||
// Shaders with f16 B_TYPE
|
||||
string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
|
||||
string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
|
||||
for (const auto& tname : type_names) {
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
// For unaligned, load one at a time for f32/f16, or two at a time for quants
|
||||
std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : "2";
|
||||
// For aligned matmul loads
|
||||
std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : "2";
|
||||
|
||||
string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
|
||||
if (tname != "f16" && tname != "f32") {
|
||||
string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void process_shaders() {
|
||||
std::cout << "ggml_vulkan: Generating and compiling shaders to SPIR-V" << std::endl;
|
||||
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}};
|
||||
|
||||
// matmul
|
||||
for (const auto& matmul_id : {false, true}) {
|
||||
// No coopmats
|
||||
// fp32
|
||||
matmul_shaders(false, matmul_id, false, false, false);
|
||||
|
||||
// fp16, fp32acc and fp16acc
|
||||
matmul_shaders(true, matmul_id, false, false, false);
|
||||
matmul_shaders(true, matmul_id, false, false, true);
|
||||
|
||||
// Coopmat, fp32acc and fp16acc
|
||||
matmul_shaders(true, matmul_id, true, false, false);
|
||||
matmul_shaders(true, matmul_id, true, false, true);
|
||||
|
||||
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
// Coopmat2, fp32acc and fp16acc
|
||||
matmul_shaders(true, matmul_id, false, true, false);
|
||||
matmul_shaders(true, matmul_id, false, true, true);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
// flash attention
|
||||
for (const auto& f16acc : {false, true}) {
|
||||
std::string acctype = f16acc ? "float16_t" : "float";
|
||||
|
||||
for (const auto& tname : type_names) {
|
||||
if (tname == "f32") {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (tname == "f16") {
|
||||
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp",
|
||||
merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, false, true, f16acc);
|
||||
} else {
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp",
|
||||
merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, false, true, f16acc);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (const auto& tname : type_names) {
|
||||
// mul mat vec
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
|
||||
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
|
||||
|
||||
// Dequant shaders
|
||||
if (tname != "f16") {
|
||||
string_to_spv("dequant_" + tname, "dequant_" + tname + ".comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float16_t"}}));
|
||||
}
|
||||
|
||||
if (!string_ends_with(tname, "_k")) {
|
||||
shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp";
|
||||
|
||||
if (tname == "f16") {
|
||||
string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
|
||||
} else {
|
||||
string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
|
||||
}
|
||||
string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
|
||||
}
|
||||
}
|
||||
|
||||
string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
// Norms
|
||||
string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
|
||||
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
|
||||
|
||||
string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
||||
string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
|
||||
|
||||
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
|
||||
|
||||
string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("rope_neox_f16_rte", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
|
||||
|
||||
string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
|
||||
|
||||
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
|
||||
string_to_spv("im2col_f32_f16_rte", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}));
|
||||
|
||||
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
||||
|
||||
for (auto &c : compiles) {
|
||||
c.wait();
|
||||
}
|
||||
}
|
||||
|
||||
void write_output_files() {
|
||||
FILE* hdr = fopen(target_hpp.c_str(), "w");
|
||||
FILE* src = fopen(target_cpp.c_str(), "w");
|
||||
|
||||
fprintf(hdr, "#include <cstdint>\n\n");
|
||||
fprintf(src, "#include \"%s\"\n\n", basename(target_hpp).c_str());
|
||||
|
||||
for (const auto& pair : shader_fnames) {
|
||||
const std::string& name = pair.first;
|
||||
#ifdef _WIN32
|
||||
std::string path = pair.second;
|
||||
std::replace(path.begin(), path.end(), '/', '\\' );
|
||||
#else
|
||||
const std::string& path = pair.second;
|
||||
#endif
|
||||
|
||||
FILE* spv = fopen(path.c_str(), "rb");
|
||||
if (!spv) {
|
||||
std::cerr << "Error opening SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
|
||||
continue;
|
||||
}
|
||||
|
||||
fseek(spv, 0, SEEK_END);
|
||||
size_t size = ftell(spv);
|
||||
fseek(spv, 0, SEEK_SET);
|
||||
|
||||
std::vector<unsigned char> data(size);
|
||||
size_t read_size = fread(data.data(), 1, size, spv);
|
||||
fclose(spv);
|
||||
if (read_size != size) {
|
||||
std::cerr << "Error reading SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
|
||||
continue;
|
||||
}
|
||||
|
||||
fprintf(hdr, "extern unsigned char %s_data[%zu];\n", name.c_str(), size);
|
||||
fprintf(hdr, "const uint64_t %s_len = %zu;\n\n", name.c_str(), size);
|
||||
|
||||
fprintf(src, "unsigned char %s_data[%zu] = {\n", name.c_str(), size);
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
fprintf(src, "0x%02x,", data[i]);
|
||||
if ((i + 1) % 12 == 0) fprintf(src, "\n");
|
||||
}
|
||||
fprintf(src, "\n};\n\n");
|
||||
|
||||
if (!no_clean) {
|
||||
std::remove(path.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
fclose(hdr);
|
||||
fclose(src);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
std::map<std::string, std::string> args;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
std::string arg = argv[i];
|
||||
if (arg.rfind("--", 0) == 0) {
|
||||
if (i + 1 < argc && argv[i + 1][0] != '-') {
|
||||
args[arg] = argv[i + 1];
|
||||
++i;
|
||||
} else {
|
||||
args[arg] = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (args.find("--glslc") != args.end()) {
|
||||
GLSLC = args["--glslc"]; // Path to glslc
|
||||
}
|
||||
if (args.find("--input-dir") != args.end()) {
|
||||
input_dir = args["--input-dir"]; // Directory containing shader sources
|
||||
}
|
||||
if (args.find("--output-dir") != args.end()) {
|
||||
output_dir = args["--output-dir"]; // Directory for containing SPIR-V output
|
||||
}
|
||||
if (args.find("--target-hpp") != args.end()) {
|
||||
target_hpp = args["--target-hpp"]; // Path to generated header file
|
||||
}
|
||||
if (args.find("--target-cpp") != args.end()) {
|
||||
target_cpp = args["--target-cpp"]; // Path to generated cpp file
|
||||
}
|
||||
if (args.find("--no-clean") != args.end()) {
|
||||
no_clean = true; // Keep temporary SPIR-V files in output-dir after build
|
||||
}
|
||||
|
||||
if (!directory_exists(input_dir)) {
|
||||
std::cerr << "\"" << input_dir << "\" must be a valid directory containing shader sources" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
if (!directory_exists(output_dir)) {
|
||||
if (!create_directory(output_dir)) {
|
||||
std::cerr << "Error creating output directory: " << output_dir << "\n";
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
process_shaders();
|
||||
|
||||
write_output_files();
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
|
@ -0,0 +1,87 @@
|
|||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
|
||||
#define BLOCK_SIZE 64
|
||||
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(push_constant) uniform Parameters {
|
||||
uint B;
|
||||
uint T;
|
||||
uint C;
|
||||
uint H;
|
||||
};
|
||||
|
||||
layout(binding = 0) readonly buffer KBuf { A_TYPE k[]; };
|
||||
layout(binding = 1) readonly buffer VBuf { A_TYPE v[]; };
|
||||
layout(binding = 2) readonly buffer RBuf { A_TYPE r[]; };
|
||||
layout(binding = 3) readonly buffer TimeFBuf { A_TYPE tf[]; };
|
||||
layout(binding = 4) readonly buffer TimeDBuf { A_TYPE td[]; };
|
||||
layout(binding = 5) readonly buffer StateBuf { A_TYPE state_in[]; };
|
||||
layout(binding = 6) buffer DstBuf { A_TYPE dst[]; };
|
||||
|
||||
shared A_TYPE _k[BLOCK_SIZE], _r[BLOCK_SIZE], _tf[BLOCK_SIZE], _td[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint head_size = BLOCK_SIZE;
|
||||
const uint batch_id = gl_WorkGroupID.x / H;
|
||||
const uint head_id = gl_WorkGroupID.x % H;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
|
||||
const uint state_size = C * head_size;
|
||||
const uint n_seq_tokens = T / B;
|
||||
|
||||
if (batch_id >= B || head_id >= H) {
|
||||
return;
|
||||
}
|
||||
|
||||
A_TYPE state[BLOCK_SIZE];
|
||||
[[unroll]] for (uint i = 0; i < head_size; i++) {
|
||||
state[i] = state_in[batch_id * state_size + head_id * head_size * head_size
|
||||
+ i * head_size + tid];
|
||||
}
|
||||
|
||||
barrier();
|
||||
_tf[tid] = tf[head_id * head_size + tid];
|
||||
barrier();
|
||||
|
||||
const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid;
|
||||
const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid;
|
||||
|
||||
for (uint t = start_t; t < end_t; t += C) {
|
||||
barrier();
|
||||
_k[tid] = k[t];
|
||||
_r[tid] = r[t];
|
||||
_td[tid] = td[t];
|
||||
barrier();
|
||||
|
||||
const A_TYPE v_val = v[t];
|
||||
A_TYPE y = 0.0;
|
||||
|
||||
[[unroll]] for (uint j = 0; j < head_size; j += 4) {
|
||||
vec4 k_vec = vec4(_k[j], _k[j+1], _k[j+2], _k[j+3]);
|
||||
vec4 r_vec = vec4(_r[j], _r[j+1], _r[j+2], _r[j+3]);
|
||||
vec4 tf_vec = vec4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]);
|
||||
vec4 td_vec = vec4(_td[j], _td[j+1], _td[j+2], _td[j+3]);
|
||||
vec4 s_vec = vec4(state[j], state[j+1], state[j+2], state[j+3]);
|
||||
|
||||
vec4 kv = k_vec * v_val;
|
||||
|
||||
vec4 temp = tf_vec * kv + s_vec;
|
||||
y += dot(r_vec, temp);
|
||||
|
||||
s_vec = s_vec * td_vec + kv;
|
||||
state[j] = s_vec.x;
|
||||
state[j+1] = s_vec.y;
|
||||
state[j+2] = s_vec.z;
|
||||
state[j+3] = s_vec.w;
|
||||
}
|
||||
|
||||
dst[t] = y;
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = 0; i < head_size; i++) {
|
||||
dst[T * C + batch_id * state_size + head_id * head_size * head_size
|
||||
+ i * head_size + tid] = state[i];
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue