diff --git a/llama/llama.cpp/common/arg.cpp b/llama/llama.cpp/common/arg.cpp new file mode 100644 index 000000000..c4ad85c47 --- /dev/null +++ b/llama/llama.cpp/common/arg.cpp @@ -0,0 +1,3410 @@ +#include "arg.h" + +#include "chat.h" +#include "common.h" +#include "gguf.h" // for reading GGUF splits +#include "json-schema-to-grammar.h" +#include "log.h" +#include "sampling.h" + +// fix problem with std::min and std::max +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include +#endif + +#define JSON_ASSERT GGML_ASSERT +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//#define LLAMA_USE_CURL + +#if defined(LLAMA_USE_CURL) +#include +#include +#include +#endif + +using json = nlohmann::ordered_json; + +std::initializer_list mmproj_examples = { + LLAMA_EXAMPLE_MTMD, + LLAMA_EXAMPLE_SERVER, +}; + +static std::string read_file(const std::string & fname) { + std::ifstream file(fname); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str())); + } + std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + file.close(); + return content; +} + +static void write_file(const std::string & fname, const std::string & content) { + std::ofstream file(fname); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str())); + } + file << content; + file.close(); +} + +common_arg & common_arg::set_examples(std::initializer_list examples) { + this->examples = std::move(examples); + return *this; +} + +common_arg & common_arg::set_excludes(std::initializer_list excludes) { + this->excludes = std::move(excludes); + return *this; +} + +common_arg & common_arg::set_env(const char * env) { + help = help + "\n(env: " + env + ")"; + this->env = env; + return *this; +} + +common_arg & common_arg::set_sparam() { + is_sparam = true; + return *this; +} + +bool common_arg::in_example(enum llama_example ex) { + return examples.find(ex) != examples.end(); +} + +bool common_arg::is_exclude(enum llama_example ex) { + return excludes.find(ex) != excludes.end(); +} + +bool common_arg::get_value_from_env(std::string & output) { + if (env == nullptr) return false; + char * value = std::getenv(env); + if (value) { + output = value; + return true; + } + return false; +} + +bool common_arg::has_value_from_env() { + return env != nullptr && std::getenv(env); +} + +static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { + std::vector result; + std::istringstream iss(input); + std::string line; + auto add_line = [&](const std::string& l) { + if (l.length() <= max_char_per_line) { + result.push_back(l); + } else { + std::istringstream line_stream(l); + std::string word, current_line; + while (line_stream >> word) { + if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) { + if (!current_line.empty()) result.push_back(current_line); + current_line = word; + } else { + current_line += (!current_line.empty() ? " " : "") + word; + } + } + if (!current_line.empty()) result.push_back(current_line); + } + }; + while (std::getline(iss, line)) { + add_line(line); + } + return result; +} + +std::string common_arg::to_string() { + // params for printing to console + const static int n_leading_spaces = 40; + const static int n_char_per_line_help = 70; // TODO: detect this based on current console + std::string leading_spaces(n_leading_spaces, ' '); + + std::ostringstream ss; + for (const auto arg : args) { + if (arg == args.front()) { + if (args.size() == 1) { + ss << arg; + } else { + // first arg is usually abbreviation, we need padding to make it more beautiful + auto tmp = std::string(arg) + ", "; + auto spaces = std::string(std::max(0, 7 - (int)tmp.size()), ' '); + ss << tmp << spaces; + } + } else { + ss << arg << (arg != args.back() ? ", " : ""); + } + } + if (value_hint) ss << " " << value_hint; + if (value_hint_2) ss << " " << value_hint_2; + if (ss.tellp() > n_leading_spaces - 3) { + // current line is too long, add new line + ss << "\n" << leading_spaces; + } else { + // padding between arg and help, same line + ss << std::string(leading_spaces.size() - ss.tellp(), ' '); + } + const auto help_lines = break_str_into_lines(help, n_char_per_line_help); + for (const auto & line : help_lines) { + ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; + } + return ss.str(); +} + +// +// downloader +// + +struct common_hf_file_res { + std::string repo; // repo name with ":tag" removed + std::string ggufFile; + std::string mmprojFile; +}; + +#ifdef LLAMA_USE_CURL + +bool common_has_curl() { + return true; +} + +#ifdef __linux__ +#include +#elif defined(_WIN32) +# if !defined(PATH_MAX) +# define PATH_MAX MAX_PATH +# endif +#elif defined(_AIX) +#include +#else +#include +#endif +#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 + +// +// CURL utils +// + +using curl_ptr = std::unique_ptr; + +// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one +struct curl_slist_ptr { + struct curl_slist * ptr = nullptr; + ~curl_slist_ptr() { + if (ptr) { + curl_slist_free_all(ptr); + } + } +}; + +#define CURL_MAX_RETRY 3 +#define CURL_RETRY_DELAY_SECONDS 2 + +static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds, const char * method_name) { + int remaining_attempts = max_attempts; + + while (remaining_attempts > 0) { + LOG_INF("%s: %s %s (attempt %d of %d)...\n", __func__ , method_name, url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); + + CURLcode res = curl_easy_perform(curl); + if (res == CURLE_OK) { + return true; + } + + int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000; + LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); + + remaining_attempts--; + if (remaining_attempts == 0) break; + std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); + } + + LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); + + return false; +} + +// download one single file from remote URL to local path +static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) { + // Check if the file already exists locally + auto file_exists = std::filesystem::exists(path); + + // If the file exists, check its JSON metadata companion file. + std::string metadata_path = path + ".json"; + nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead + std::string etag; + std::string last_modified; + + if (file_exists) { + if (offline) { + LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str()); + return true; // skip verification/downloading + } + // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block). + std::ifstream metadata_in(metadata_path); + if (metadata_in.good()) { + try { + metadata_in >> metadata; + LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); + if (metadata.contains("etag") && metadata.at("etag").is_string()) { + etag = metadata.at("etag"); + } + if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) { + last_modified = metadata.at("lastModified"); + } + } catch (const nlohmann::json::exception & e) { + LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); + } + } + // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again) + } else { + if (offline) { + LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str()); + return false; + } + LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); + } + + // Send a HEAD request to retrieve the etag and last-modified headers + struct common_load_model_from_url_headers { + std::string etag; + std::string last_modified; + }; + + common_load_model_from_url_headers headers; + bool head_request_ok = false; + bool should_download = !file_exists; // by default, we should download if the file does not exist + + // Initialize libcurl + curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_slist_ptr http_headers; + if (!curl) { + LOG_ERR("%s: error initializing libcurl\n", __func__); + return false; + } + + // Set the URL, allow to follow http redirection + curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); + + http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); + // Check if hf-token or bearer-token was specified + if (!bearer_token.empty()) { + std::string auth_header = "Authorization: Bearer " + bearer_token; + http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); + } + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); + +#if defined(_WIN32) + // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of + // operating system. Currently implemented under MS-Windows. + curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif + + typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); + auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { + common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; + + static std::regex header_regex("([^:]+): (.*)\r\n"); + static std::regex etag_regex("ETag", std::regex_constants::icase); + static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase); + + std::string header(buffer, n_items); + std::smatch match; + if (std::regex_match(header, match, header_regex)) { + const std::string & key = match[1]; + const std::string & value = match[2]; + if (std::regex_match(key, match, etag_regex)) { + headers->etag = value; + } else if (std::regex_match(key, match, last_modified_regex)) { + headers->last_modified = value; + } + } + return n_items; + }; + + curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress + curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast(header_callback)); + curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); + + // we only allow retrying once for HEAD requests + // this is for the use case of using running offline (no internet), retrying can be annoying + bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD"); + if (!was_perform_successful) { + head_request_ok = false; + } + + long http_code = 0; + curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); + if (http_code == 200) { + head_request_ok = true; + } else { + LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); + head_request_ok = false; + } + + // if head_request_ok is false, we don't have the etag or last-modified headers + // we leave should_download as-is, which is true if the file does not exist + if (head_request_ok) { + // check if ETag or Last-Modified headers are different + // if it is, we need to download the file again + if (!etag.empty() && etag != headers.etag) { + LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); + should_download = true; + } else if (!last_modified.empty() && last_modified != headers.last_modified) { + LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); + should_download = true; + } + } + + if (should_download) { + std::string path_temporary = path + ".downloadInProgress"; + if (file_exists) { + LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); + if (remove(path.c_str()) != 0) { + LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); + return false; + } + } + + // Set the output file + + struct FILE_deleter { + void operator()(FILE * f) const { + fclose(f); + } + }; + + std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb")); + if (!outfile) { + LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str()); + return false; + } + + typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd); + auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t { + return fwrite(data, size, nmemb, (FILE *)fd); + }; + curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L); + curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get()); + + // display download progress + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L); + + // helper function to hide password in URL + auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { + std::size_t protocol_pos = url.find("://"); + if (protocol_pos == std::string::npos) { + return url; // Malformed URL + } + + std::size_t at_pos = url.find('@', protocol_pos + 3); + if (at_pos == std::string::npos) { + return url; // No password in URL + } + + return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos); + }; + + // start the download + LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); + bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS, "GET"); + if (!was_perform_successful) { + return false; + } + + long http_code = 0; + curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code); + if (http_code < 200 || http_code >= 400) { + LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code); + return false; + } + + // Causes file to be closed explicitly here before we rename it. + outfile.reset(); + + // Write the updated JSON metadata file. + metadata.update({ + {"url", url}, + {"etag", headers.etag}, + {"lastModified", headers.last_modified} + }); + write_file(metadata_path, metadata.dump(4)); + LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); + + if (rename(path_temporary.c_str(), path.c_str()) != 0) { + LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); + return false; + } + } else { + LOG_INF("%s: using cached file: %s\n", __func__, path.c_str()); + } + + return true; +} + +// download multiple files from remote URLs to local paths +// the input is a vector of pairs +static bool common_download_file_multiple(const std::vector> & urls, const std::string & bearer_token, bool offline) { + // Prepare download in parallel + std::vector> futures_download; + for (auto const & item : urls) { + futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair & it) -> bool { + return common_download_file_single(it.first, it.second, bearer_token, offline); + }, item)); + } + + // Wait for all downloads to complete + for (auto & f : futures_download) { + if (!f.get()) { + return false; + } + } + + return true; +} + +static bool common_download_model( + const common_params_model & model, + const std::string & bearer_token, + bool offline) { + // Basic validation of the model.url + if (model.url.empty()) { + LOG_ERR("%s: invalid model url\n", __func__); + return false; + } + + if (!common_download_file_single(model.url, model.path, bearer_token, offline)) { + return false; + } + + // check for additional GGUFs split to download + int n_split = 0; + { + struct gguf_init_params gguf_params = { + /*.no_alloc = */ true, + /*.ctx = */ NULL, + }; + auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params); + if (!ctx_gguf) { + LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str()); + return false; + } + + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); + if (key_n_split >= 0) { + n_split = gguf_get_val_u16(ctx_gguf, key_n_split); + } + + gguf_free(ctx_gguf); + } + + if (n_split > 1) { + char split_prefix[PATH_MAX] = {0}; + char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + + // Verify the first split file format + // and extract split URL and PATH prefixes + { + if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) { + LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split); + return false; + } + + if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) { + LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split); + return false; + } + } + + std::vector> urls; + for (int idx = 1; idx < n_split; idx++) { + char split_path[PATH_MAX] = {0}; + llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split); + + char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split); + + if (std::string(split_path) == model.path) { + continue; // skip the already downloaded file + } + + urls.push_back({split_url, split_path}); + } + + // Download in parallel + common_download_file_multiple(urls, bearer_token, offline); + } + + return true; +} + +std::pair> common_remote_get_content(const std::string & url, const common_remote_params & params) { + curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_slist_ptr http_headers; + std::vector res_buffer; + + curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); + curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); + typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); + auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { + auto data_vec = static_cast *>(data); + data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb); + return size * nmemb; + }; + curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer); +#if defined(_WIN32) + curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif + if (params.timeout > 0) { + curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout); + } + if (params.max_size > 0) { + curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size); + } + http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); + for (const auto & header : params.headers) { + http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str()); + } + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); + + CURLcode res = curl_easy_perform(curl.get()); + + if (res != CURLE_OK) { + std::string error_msg = curl_easy_strerror(res); + throw std::runtime_error("error: cannot make GET request: " + error_msg); + } + + long res_code; + curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); + + return { res_code, std::move(res_buffer) }; +} + +/** + * Allow getting the HF file from the HF repo with tag (like ollama), for example: + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 + * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s + * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) + * + * Return pair of (with "repo" already having tag removed) + * + * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. + */ +static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) { + auto parts = string_split(hf_repo_with_tag, ':'); + std::string tag = parts.size() > 1 ? parts.back() : "latest"; + std::string hf_repo = parts[0]; + if (string_split(hf_repo, '/').size() != 2) { + throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); + } + + std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag; + + // headers + std::vector headers; + headers.push_back("Accept: application/json"); + if (!bearer_token.empty()) { + headers.push_back("Authorization: Bearer " + bearer_token); + } + // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response + // User-Agent header is already set in common_remote_get_content, no need to set it here + + // we use "=" to avoid clashing with other component, while still being allowed on windows + std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json"; + string_replace_all(cached_response_fname, "/", "_"); + std::string cached_response_path = fs_get_cache_file(cached_response_fname); + + // make the request + common_remote_params params; + params.headers = headers; + long res_code = 0; + std::string res_str; + bool use_cache = false; + if (!offline) { + try { + auto res = common_remote_get_content(url, params); + res_code = res.first; + res_str = std::string(res.second.data(), res.second.size()); + } catch (const std::exception & e) { + LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what()); + } + } + if (res_code == 0) { + if (std::filesystem::exists(cached_response_path)) { + LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str()); + res_str = read_file(cached_response_path); + res_code = 200; + use_cache = true; + } else { + throw std::runtime_error( + offline ? "error: failed to get manifest (offline mode)" + : "error: failed to get manifest (check your internet connection)"); + } + } + std::string ggufFile; + std::string mmprojFile; + + if (res_code == 200 || res_code == 304) { + // extract ggufFile.rfilename in json, using regex + { + std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\""); + std::smatch match; + if (std::regex_search(res_str, match, pattern)) { + ggufFile = match[1].str(); + } + } + // extract mmprojFile.rfilename in json, using regex + { + std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\""); + std::smatch match; + if (std::regex_search(res_str, match, pattern)) { + mmprojFile = match[1].str(); + } + } + if (!use_cache) { + // if not using cached response, update the cache file + write_file(cached_response_path, res_str); + } + } else if (res_code == 401) { + throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); + } else { + throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str())); + } + + // check response + if (ggufFile.empty()) { + throw std::runtime_error("error: model does not have ggufFile"); + } + + return { hf_repo, ggufFile, mmprojFile }; +} + +#else + +bool common_has_curl() { + return false; +} + +static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) { + LOG_ERR("error: built without CURL, cannot download model from internet\n"); + return false; +} + +static bool common_download_file_multiple(const std::vector> &, const std::string &, bool) { + LOG_ERR("error: built without CURL, cannot download model from the internet\n"); + return false; +} + +static bool common_download_model( + const common_params_model &, + const std::string &, + bool) { + LOG_ERR("error: built without CURL, cannot download model from the internet\n"); + return false; +} + +static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) { + LOG_ERR("error: built without CURL, cannot download model from the internet\n"); + return {}; +} + +std::pair> common_remote_get_content(const std::string & url, const common_remote_params &) { + if (!url.empty()) { + throw std::runtime_error("error: built without CURL, cannot download model from the internet"); + } + + return {}; +} + +#endif // LLAMA_USE_CURL + +// +// utils +// + +struct handle_model_result { + bool found_mmproj = false; + common_params_model mmproj; +}; + +static handle_model_result common_params_handle_model( + struct common_params_model & model, + const std::string & bearer_token, + const std::string & model_path_default, + bool offline) { + handle_model_result result; + // handle pre-fill default model path and url based on hf_repo and hf_file + { + if (!model.hf_repo.empty()) { + // short-hand to avoid specifying --hf-file -> default it to --model + if (model.hf_file.empty()) { + if (model.path.empty()) { + auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline); + if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) { + exit(1); // built without CURL, error message already printed + } + model.hf_repo = auto_detected.repo; + model.hf_file = auto_detected.ggufFile; + if (!auto_detected.mmprojFile.empty()) { + result.found_mmproj = true; + result.mmproj.hf_repo = model.hf_repo; + result.mmproj.hf_file = auto_detected.mmprojFile; + } + } else { + model.hf_file = model.path; + } + } + + std::string model_endpoint = get_model_endpoint(); + model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file; + // make sure model path is present (for caching purposes) + if (model.path.empty()) { + // this is to avoid different repo having same file name, or same file name in different subdirs + std::string filename = model.hf_repo + "_" + model.hf_file; + // to make sure we don't have any slashes in the filename + string_replace_all(filename, "/", "_"); + model.path = fs_get_cache_file(filename); + } + + } else if (!model.url.empty()) { + if (model.path.empty()) { + auto f = string_split(model.url, '#').front(); + f = string_split(f, '?').front(); + model.path = fs_get_cache_file(string_split(f, '/').back()); + } + + } else if (model.path.empty()) { + model.path = model_path_default; + } + } + + // then, download it if needed + if (!model.url.empty()) { + bool ok = common_download_model(model, bearer_token, offline); + if (!ok) { + LOG_ERR("error: failed to download model from %s\n", model.url.c_str()); + exit(1); + } + } + + return result; +} + +const std::vector kv_cache_types = { + GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, + GGML_TYPE_Q8_0, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_IQ4_NL, + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, +}; + +static ggml_type kv_cache_type_from_str(const std::string & s) { + for (const auto & type : kv_cache_types) { + if (ggml_type_name(type) == s) { + return type; + } + } + throw std::runtime_error("Unsupported cache type: " + s); +} + +static std::string get_all_kv_cache_types() { + std::ostringstream msg; + for (const auto & type : kv_cache_types) { + msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", "); + } + return msg.str(); +} + +// +// CLI argument parsing functions +// + +static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { + std::string arg; + const std::string arg_prefix = "--"; + common_params & params = ctx_arg.params; + + std::unordered_map arg_to_options; + for (auto & opt : ctx_arg.options) { + for (const auto & arg : opt.args) { + arg_to_options[arg] = &opt; + } + } + + // handle environment variables + for (auto & opt : ctx_arg.options) { + std::string value; + if (opt.get_value_from_env(value)) { + try { + if (opt.handler_void && (value == "1" || value == "true")) { + opt.handler_void(params); + } + if (opt.handler_int) { + opt.handler_int(params, std::stoi(value)); + } + if (opt.handler_string) { + opt.handler_string(params, value); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(string_format( + "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); + } + } + } + + // handle command line arguments + auto check_arg = [&](int i) { + if (i+1 >= argc) { + throw std::invalid_argument("expected value for argument"); + } + }; + + for (int i = 1; i < argc; i++) { + const std::string arg_prefix = "--"; + + std::string arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + if (arg_to_options.find(arg) == arg_to_options.end()) { + throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); + } + auto opt = *arg_to_options[arg]; + if (opt.has_value_from_env()) { + fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); + } + try { + if (opt.handler_void) { + opt.handler_void(params); + continue; + } + + // arg with single value + check_arg(i); + std::string val = argv[++i]; + if (opt.handler_int) { + opt.handler_int(params, std::stoi(val)); + continue; + } + if (opt.handler_string) { + opt.handler_string(params, val); + continue; + } + + // arg with 2 values + check_arg(i); + std::string val2 = argv[++i]; + if (opt.handler_str_str) { + opt.handler_str_str(params, val, val2); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(string_format( + "error while handling argument \"%s\": %s\n\n" + "usage:\n%s\n\nto show complete usage, run with -h", + arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); + } + } + + postprocess_cpu_params(params.cpuparams, nullptr); + postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); + + postprocess_cpu_params(params.speculative.cpuparams, ¶ms.cpuparams); + postprocess_cpu_params(params.speculative.cpuparams_batch, ¶ms.cpuparams_batch); + + if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { + throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); + } + + // handle model and download + { + auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline); + if (params.no_mmproj) { + params.mmproj = {}; + } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { + // optionally, handle mmproj model when -hf is specified + params.mmproj = res.mmproj; + } + // only download mmproj if the current example is using it + for (auto & ex : mmproj_examples) { + if (ctx_arg.ex == ex) { + common_params_handle_model(params.mmproj, params.hf_token, "", params.offline); + break; + } + } + common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline); + common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline); + } + + if (params.escape) { + string_process_escapes(params.prompt); + string_process_escapes(params.input_prefix); + string_process_escapes(params.input_suffix); + for (auto & antiprompt : params.antiprompt) { + string_process_escapes(antiprompt); + } + for (auto & seq_breaker : params.sampling.dry_sequence_breakers) { + string_process_escapes(seq_breaker); + } + } + + if (!params.kv_overrides.empty()) { + params.kv_overrides.emplace_back(); + params.kv_overrides.back().key[0] = 0; + } + + if (!params.tensor_buft_overrides.empty()) { + params.tensor_buft_overrides.push_back({nullptr, nullptr}); + } + + if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) { + throw std::runtime_error(string_format( + "error: the supplied chat template is not supported: %s%s\n", + params.chat_template.c_str(), + params.use_jinja ? "" : "\nnote: llama.cpp was started without --jinja, we only support commonly used templates" + )); + } + + return true; +} + +static void common_params_print_usage(common_params_context & ctx_arg) { + auto print_options = [](std::vector & options) { + for (common_arg * opt : options) { + printf("%s", opt->to_string().c_str()); + } + }; + + std::vector common_options; + std::vector sparam_options; + std::vector specific_options; + for (auto & opt : ctx_arg.options) { + // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example + if (opt.is_sparam) { + sparam_options.push_back(&opt); + } else if (opt.in_example(ctx_arg.ex)) { + specific_options.push_back(&opt); + } else { + common_options.push_back(&opt); + } + } + printf("----- common params -----\n\n"); + print_options(common_options); + printf("\n\n----- sampling params -----\n\n"); + print_options(sparam_options); + // TODO: maybe convert enum llama_example to string + printf("\n\n----- example-specific params -----\n\n"); + print_options(specific_options); +} + +static void common_params_print_completion(common_params_context & ctx_arg) { + std::vector common_options; + std::vector sparam_options; + std::vector specific_options; + + for (auto & opt : ctx_arg.options) { + if (opt.is_sparam) { + sparam_options.push_back(&opt); + } else if (opt.in_example(ctx_arg.ex)) { + specific_options.push_back(&opt); + } else { + common_options.push_back(&opt); + } + } + + printf("_llama_completions() {\n"); + printf(" local cur prev opts\n"); + printf(" COMPREPLY=()\n"); + printf(" cur=\"${COMP_WORDS[COMP_CWORD]}\"\n"); + printf(" prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n"); + + printf(" opts=\""); + auto print_options = [](const std::vector & options) { + for (const common_arg * opt : options) { + for (const char * arg : opt->args) { + printf("%s ", arg); + } + } + }; + + print_options(common_options); + print_options(sparam_options); + print_options(specific_options); + printf("\"\n\n"); + + printf(" case \"$prev\" in\n"); + printf(" --model)\n"); + printf(" COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n"); + printf(" return 0\n"); + printf(" ;;\n"); + printf(" --grammar-file)\n"); + printf(" COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n"); + printf(" return 0\n"); + printf(" ;;\n"); + printf(" --chat-template-file)\n"); + printf(" COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n"); + printf(" return 0\n"); + printf(" ;;\n"); + printf(" *)\n"); + printf(" COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n"); + printf(" return 0\n"); + printf(" ;;\n"); + printf(" esac\n"); + printf("}\n\n"); + + std::set executables = { + "llama-batched", + "llama-batched-bench", + "llama-bench", + "llama-cli", + "llama-convert-llama2c-to-ggml", + "llama-cvector-generator", + "llama-embedding", + "llama-eval-callback", + "llama-export-lora", + "llama-gen-docs", + "llama-gguf", + "llama-gguf-hash", + "llama-gguf-split", + "llama-gritlm", + "llama-imatrix", + "llama-infill", + "llama-mtmd-cli", + "llama-llava-clip-quantize-cli", + "llama-lookahead", + "llama-lookup", + "llama-lookup-create", + "llama-lookup-merge", + "llama-lookup-stats", + "llama-parallel", + "llama-passkey", + "llama-perplexity", + "llama-q8dot", + "llama-quantize", + "llama-qwen2vl-cli", + "llama-retrieval", + "llama-run", + "llama-save-load-state", + "llama-server", + "llama-simple", + "llama-simple-chat", + "llama-speculative", + "llama-speculative-simple", + "llama-tokenize", + "llama-tts", + "llama-vdot" + }; + + for (const auto& exe : executables) { + printf("complete -F _llama_completions %s\n", exe.c_str()); + } +} + +static std::vector parse_device_list(const std::string & value) { + std::vector devices; + auto dev_names = string_split(value, ','); + if (dev_names.empty()) { + throw std::invalid_argument("no devices specified"); + } + if (dev_names.size() == 1 && dev_names[0] == "none") { + devices.push_back(nullptr); + } else { + for (const auto & device : dev_names) { + auto * dev = ggml_backend_dev_by_name(device.c_str()); + if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) { + throw std::invalid_argument(string_format("invalid device: %s", device.c_str())); + } + devices.push_back(dev); + } + devices.push_back(nullptr); + } + return devices; +} + +static void add_rpc_devices(std::string servers) { + auto rpc_servers = string_split(servers, ','); + if (rpc_servers.empty()) { + throw std::invalid_argument("no RPC servers specified"); + } + ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC"); + if (!rpc_reg) { + throw std::invalid_argument("failed to find RPC backend"); + } + typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint); + ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device"); + if (!ggml_backend_rpc_add_device_fn) { + throw std::invalid_argument("failed to find RPC device add function"); + } + for (const auto & server : rpc_servers) { + ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str()); + if (dev) { + ggml_backend_device_register(dev); + } else { + throw std::invalid_argument("failed to register RPC device"); + } + } +} + +bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) { + auto ctx_arg = common_params_parser_init(params, ex, print_usage); + const common_params params_org = ctx_arg.params; // the example can modify the default params + + try { + if (!common_params_parse_ex(argc, argv, ctx_arg)) { + ctx_arg.params = params_org; + return false; + } + if (ctx_arg.params.usage) { + common_params_print_usage(ctx_arg); + if (ctx_arg.print_usage) { + ctx_arg.print_usage(argc, argv); + } + exit(0); + } + if (ctx_arg.params.completion) { + common_params_print_completion(ctx_arg); + exit(0); + } + } catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + ctx_arg.params = params_org; + return false; + } catch (std::exception & ex) { + fprintf(stderr, "%s\n", ex.what()); + exit(1); // for other exceptions, we exit with status code 1 + } + + return true; +} + +static std::string list_builtin_chat_templates() { + std::vector supported_tmpl; + int32_t res = llama_chat_builtin_templates(nullptr, 0); + supported_tmpl.resize(res); + res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size()); + std::ostringstream msg; + for (auto & tmpl : supported_tmpl) { + msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", "); + } + return msg.str(); +} + +common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) { + // load dynamic backends + ggml_backend_load_all(); + + common_params_context ctx_arg(params); + ctx_arg.print_usage = print_usage; + ctx_arg.ex = ex; + + std::string sampler_type_chars; + std::string sampler_type_names; + for (const auto & sampler : params.sampling.samplers) { + sampler_type_chars += common_sampler_type_to_chr(sampler); + sampler_type_names += common_sampler_type_to_str(sampler) + ";"; + } + sampler_type_names.pop_back(); + + + /** + * filter options by example + * rules: + * - all examples inherit options from LLAMA_EXAMPLE_COMMON + * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example + * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example + */ + auto add_opt = [&](common_arg arg) { + if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) { + ctx_arg.options.push_back(std::move(arg)); + } + }; + + + add_opt(common_arg( + {"-h", "--help", "--usage"}, + "print usage and exit", + [](common_params & params) { + params.usage = true; + } + )); + add_opt(common_arg( + {"--version"}, + "show version and build info", + [](common_params &) { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + )); + add_opt(common_arg( + {"--completion-bash"}, + "print source-able bash completion script for llama.cpp", + [](common_params & params) { + params.completion = true; + } + )); + add_opt(common_arg( + {"--verbose-prompt"}, + string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), + [](common_params & params) { + params.verbose_prompt = true; + } + )); + add_opt(common_arg( + {"--no-display-prompt"}, + string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), + [](common_params & params) { + params.display_prompt = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"-co", "--color"}, + string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), + [](common_params & params) { + params.use_color = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); + add_opt(common_arg( + {"-t", "--threads"}, "N", + string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), + [](common_params & params, int value) { + params.cpuparams.n_threads = value; + if (params.cpuparams.n_threads <= 0) { + params.cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_env("LLAMA_ARG_THREADS")); + add_opt(common_arg( + {"-tb", "--threads-batch"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads)", + [](common_params & params, int value) { + params.cpuparams_batch.n_threads = value; + if (params.cpuparams_batch.n_threads <= 0) { + params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + )); + add_opt(common_arg( + {"-C", "--cpu-mask"}, "M", + "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", + [](common_params & params, const std::string & mask) { + params.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + )); + add_opt(common_arg( + {"-Cr", "--cpu-range"}, "lo-hi", + "range of CPUs for affinity. Complements --cpu-mask", + [](common_params & params, const std::string & range) { + params.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + )); + add_opt(common_arg( + {"--cpu-strict"}, "<0|1>", + string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), + [](common_params & params, const std::string & value) { + params.cpuparams.strict_cpu = std::stoul(value); + } + )); + add_opt(common_arg( + {"--prio"}, "N", + string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority), + [](common_params & params, int prio) { + if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) { + throw std::invalid_argument("invalid value"); + } + params.cpuparams.priority = (enum ggml_sched_priority) prio; + } + )); + add_opt(common_arg( + {"--poll"}, "<0...100>", + string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), + [](common_params & params, const std::string & value) { + params.cpuparams.poll = std::stoul(value); + } + )); + add_opt(common_arg( + {"-Cb", "--cpu-mask-batch"}, "M", + "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", + [](common_params & params, const std::string & mask) { + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + )); + add_opt(common_arg( + {"-Crb", "--cpu-range-batch"}, "lo-hi", + "ranges of CPUs for affinity. Complements --cpu-mask-batch", + [](common_params & params, const std::string & range) { + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + )); + add_opt(common_arg( + {"--cpu-strict-batch"}, "<0|1>", + "use strict CPU placement (default: same as --cpu-strict)", + [](common_params & params, int value) { + params.cpuparams_batch.strict_cpu = value; + } + )); + add_opt(common_arg( + {"--prio-batch"}, "N", + string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), + [](common_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + } + )); + add_opt(common_arg( + {"--poll-batch"}, "<0|1>", + "use polling to wait for work (default: same as --poll)", + [](common_params & params, int value) { + params.cpuparams_batch.poll = value; + } + )); + add_opt(common_arg( + {"-lcs", "--lookup-cache-static"}, "FNAME", + "path to static lookup cache to use for lookup decoding (not updated by generation)", + [](common_params & params, const std::string & value) { + params.lookup_cache_static = value; + } + ).set_examples({LLAMA_EXAMPLE_LOOKUP})); + add_opt(common_arg( + {"-lcd", "--lookup-cache-dynamic"}, "FNAME", + "path to dynamic lookup cache to use for lookup decoding (updated by generation)", + [](common_params & params, const std::string & value) { + params.lookup_cache_dynamic = value; + } + ).set_examples({LLAMA_EXAMPLE_LOOKUP})); + add_opt(common_arg( + {"-c", "--ctx-size"}, "N", + string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), + [](common_params & params, int value) { + params.n_ctx = value; + } + ).set_env("LLAMA_ARG_CTX_SIZE")); + add_opt(common_arg( + {"-n", "--predict", "--n-predict"}, "N", + string_format( + ex == LLAMA_EXAMPLE_MAIN + ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)" + : "number of tokens to predict (default: %d, -1 = infinity)", + params.n_predict), + [](common_params & params, int value) { + params.n_predict = value; + } + ).set_env("LLAMA_ARG_N_PREDICT")); + add_opt(common_arg( + {"-b", "--batch-size"}, "N", + string_format("logical maximum batch size (default: %d)", params.n_batch), + [](common_params & params, int value) { + params.n_batch = value; + } + ).set_env("LLAMA_ARG_BATCH")); + add_opt(common_arg( + {"-ub", "--ubatch-size"}, "N", + string_format("physical maximum batch size (default: %d)", params.n_ubatch), + [](common_params & params, int value) { + params.n_ubatch = value; + } + ).set_env("LLAMA_ARG_UBATCH")); + add_opt(common_arg( + {"--keep"}, "N", + string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), + [](common_params & params, int value) { + params.n_keep = value; + } + )); + add_opt(common_arg( + {"--swa-full"}, + string_format("use full-size SWA cache (default: %s)\n" + "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"), + [](common_params & params) { + params.swa_full = true; + } + ).set_env("LLAMA_ARG_SWA_FULL")); + add_opt(common_arg( + {"--no-context-shift"}, + string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), + [](common_params & params) { + params.ctx_shift = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); + add_opt(common_arg( + {"--chunks"}, "N", + string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), + [](common_params & params, int value) { + params.n_chunks = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(common_arg( + {"-fa", "--flash-attn"}, + string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), + [](common_params & params) { + params.flash_attn = true; + } + ).set_env("LLAMA_ARG_FLASH_ATTN")); + add_opt(common_arg( + {"-p", "--prompt"}, "PROMPT", + "prompt to start generation with; for system message, use -sys", + [](common_params & params, const std::string & value) { + params.prompt = value; + } + ).set_excludes({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-sys", "--system-prompt"}, "PROMPT", + "system prompt to use with model (if applicable, depending on chat template)", + [](common_params & params, const std::string & value) { + params.system_prompt = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"--no-perf"}, + string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), + [](common_params & params) { + params.no_perf = true; + params.sampling.no_perf = true; + } + ).set_env("LLAMA_ARG_NO_PERF")); + add_opt(common_arg( + {"-f", "--file"}, "FNAME", + "a file containing the prompt (default: none)", + [](common_params & params, const std::string & value) { + params.prompt = read_file(value); + // store the external file name in params + params.prompt_file = value; + if (!params.prompt.empty() && params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } + ).set_excludes({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-sysf", "--system-prompt-file"}, "FNAME", + "a file containing the system prompt (default: none)", + [](common_params & params, const std::string & value) { + params.system_prompt = read_file(value); + if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') { + params.system_prompt.pop_back(); + } + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"--in-file"}, "FNAME", + "an input file (repeat to specify multiple files)", + [](common_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); + } + params.in_files.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(common_arg( + {"-bf", "--binary-file"}, "FNAME", + "binary file containing the prompt (default: none)", + [](common_params & params, const std::string & value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); + } + ).set_excludes({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-e", "--escape"}, + string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), + [](common_params & params) { + params.escape = true; + } + )); + add_opt(common_arg( + {"--no-escape"}, + "do not process escape sequences", + [](common_params & params) { + params.escape = false; + } + )); + add_opt(common_arg( + {"-ptc", "--print-token-count"}, "N", + string_format("print token count every N tokens (default: %d)", params.n_print), + [](common_params & params, int value) { + params.n_print = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"--prompt-cache"}, "FNAME", + "file to cache prompt state for faster startup (default: none)", + [](common_params & params, const std::string & value) { + params.path_prompt_cache = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"--prompt-cache-all"}, + "if specified, saves user input and generations to cache as well\n", + [](common_params & params) { + params.prompt_cache_all = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"--prompt-cache-ro"}, + "if specified, uses the prompt cache but does not update it", + [](common_params & params) { + params.prompt_cache_ro = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"-r", "--reverse-prompt"}, "PROMPT", + "halt generation at PROMPT, return control in interactive mode\n", + [](common_params & params, const std::string & value) { + params.antiprompt.emplace_back(value); + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"-sp", "--special"}, + string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), + [](common_params & params) { + params.special = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-cnv", "--conversation"}, + "run in conversation mode:\n" + "- does not print special tokens and suffix/prefix\n" + "- interactive mode is also enabled\n" + "(default: auto enabled if chat template is available)", + [](common_params & params) { + params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"-no-cnv", "--no-conversation"}, + "force disable conversation mode (default: false)", + [](common_params & params) { + params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"-st", "--single-turn"}, + "run conversation for a single turn only, then exit when done\n" + "will not be interactive if first turn is predefined with --prompt\n" + "(default: false)", + [](common_params & params) { + params.single_turn = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"-i", "--interactive"}, + string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), + [](common_params & params) { + params.interactive = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"-if", "--interactive-first"}, + string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), + [](common_params & params) { + params.interactive_first = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"-mli", "--multiline-input"}, + "allows you to write or paste multiple lines without ending each in '\\'", + [](common_params & params) { + params.multiline_input = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"--in-prefix-bos"}, + "prefix BOS to user inputs, preceding the `--in-prefix` string", + [](common_params & params) { + params.input_prefix_bos = true; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"--in-prefix"}, "STRING", + "string to prefix user inputs with (default: empty)", + [](common_params & params, const std::string & value) { + params.input_prefix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"--in-suffix"}, "STRING", + "string to suffix after user inputs with (default: empty)", + [](common_params & params, const std::string & value) { + params.input_suffix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"--no-warmup"}, + "skip warming up the model with an empty run", + [](common_params & params) { + params.warmup = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(common_arg( + {"--spm-infill"}, + string_format( + "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", + params.spm_infill ? "enabled" : "disabled" + ), + [](common_params & params) { + params.spm_infill = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--samplers"}, "SAMPLERS", + string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), + [](common_params & params, const std::string & value) { + const auto sampler_names = string_split(value, ';'); + params.sampling.samplers = common_sampler_types_from_names(sampler_names, true); + } + ).set_sparam()); + add_opt(common_arg( + {"-s", "--seed"}, "SEED", + string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED), + [](common_params & params, const std::string & value) { + params.sampling.seed = std::stoul(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--sampling-seq", "--sampler-seq"}, "SEQUENCE", + string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), + [](common_params & params, const std::string & value) { + params.sampling.samplers = common_sampler_types_from_chars(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--ignore-eos"}, + "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", + [](common_params & params) { + params.sampling.ignore_eos = true; + } + ).set_sparam()); + add_opt(common_arg( + {"--temp"}, "N", + string_format("temperature (default: %.1f)", (double)params.sampling.temp), + [](common_params & params, const std::string & value) { + params.sampling.temp = std::stof(value); + params.sampling.temp = std::max(params.sampling.temp, 0.0f); + } + ).set_sparam()); + add_opt(common_arg( + {"--top-k"}, "N", + string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k), + [](common_params & params, int value) { + params.sampling.top_k = value; + } + ).set_sparam()); + add_opt(common_arg( + {"--top-p"}, "N", + string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p), + [](common_params & params, const std::string & value) { + params.sampling.top_p = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--min-p"}, "N", + string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p), + [](common_params & params, const std::string & value) { + params.sampling.min_p = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--top-nsigma"}, "N", + string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma), + [](common_params & params, const std::string & value) { + params.sampling.top_n_sigma = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam()); + add_opt(common_arg( + {"--xtc-probability"}, "N", + string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability), + [](common_params & params, const std::string & value) { + params.sampling.xtc_probability = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--xtc-threshold"}, "N", + string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold), + [](common_params & params, const std::string & value) { + params.sampling.xtc_threshold = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--typical"}, "N", + string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p), + [](common_params & params, const std::string & value) { + params.sampling.typ_p = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--repeat-last-n"}, "N", + string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n), + [](common_params & params, int value) { + if (value < -1) { + throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value)); + } + params.sampling.penalty_last_n = value; + params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); + } + ).set_sparam()); + add_opt(common_arg( + {"--repeat-penalty"}, "N", + string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat), + [](common_params & params, const std::string & value) { + params.sampling.penalty_repeat = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--presence-penalty"}, "N", + string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present), + [](common_params & params, const std::string & value) { + params.sampling.penalty_present = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--frequency-penalty"}, "N", + string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq), + [](common_params & params, const std::string & value) { + params.sampling.penalty_freq = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--dry-multiplier"}, "N", + string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier), + [](common_params & params, const std::string & value) { + params.sampling.dry_multiplier = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--dry-base"}, "N", + string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base), + [](common_params & params, const std::string & value) { + float potential_base = std::stof(value); + if (potential_base >= 1.0f) + { + params.sampling.dry_base = potential_base; + } + } + ).set_sparam()); + add_opt(common_arg( + {"--dry-allowed-length"}, "N", + string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length), + [](common_params & params, int value) { + params.sampling.dry_allowed_length = value; + } + ).set_sparam()); + add_opt(common_arg( + {"--dry-penalty-last-n"}, "N", + string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n), + [](common_params & params, int value) { + if (value < -1) { + throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value)); + } + params.sampling.dry_penalty_last_n = value; + } + ).set_sparam()); + add_opt(common_arg( + {"--dry-sequence-breaker"}, "STRING", + string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n", + params.sampling.dry_sequence_breakers.empty() ? "none" : + std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()), + params.sampling.dry_sequence_breakers.end(), + std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'", + [](const std::string& a, const std::string& b) { + std::string formatted_b = (b == "\n") ? "\\n" : b; + return a + ", '" + formatted_b + "'"; + }).c_str()), + [](common_params & params, const std::string & value) { + static bool defaults_cleared = false; + + if (!defaults_cleared) { + params.sampling.dry_sequence_breakers.clear(); + defaults_cleared = true; + } + + if (value == "none") { + params.sampling.dry_sequence_breakers.clear(); + } else { + params.sampling.dry_sequence_breakers.emplace_back(value); + } + } + ).set_sparam()); + add_opt(common_arg( + {"--dynatemp-range"}, "N", + string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range), + [](common_params & params, const std::string & value) { + params.sampling.dynatemp_range = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--dynatemp-exp"}, "N", + string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent), + [](common_params & params, const std::string & value) { + params.sampling.dynatemp_exponent = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--mirostat"}, "N", + string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n" + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat), + [](common_params & params, int value) { + params.sampling.mirostat = value; + } + ).set_sparam()); + add_opt(common_arg( + {"--mirostat-lr"}, "N", + string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta), + [](common_params & params, const std::string & value) { + params.sampling.mirostat_eta = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--mirostat-ent"}, "N", + string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau), + [](common_params & params, const std::string & value) { + params.sampling.mirostat_tau = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS", + "modifies the likelihood of token appearing in the completion,\n" + "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" + "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", + [](common_params & params, const std::string & value) { + std::stringstream ss(value); + llama_token key; + char sign; + std::string value_str; + try { + if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + params.sampling.logit_bias.push_back({key, bias}); + } else { + throw std::invalid_argument("invalid input format"); + } + } catch (const std::exception&) { + throw std::invalid_argument("invalid input format"); + } + } + ).set_sparam()); + add_opt(common_arg( + {"--grammar"}, "GRAMMAR", + string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()), + [](common_params & params, const std::string & value) { + params.sampling.grammar = value; + } + ).set_sparam()); + add_opt(common_arg( + {"--grammar-file"}, "FNAME", + "file to read grammar from", + [](common_params & params, const std::string & value) { + params.sampling.grammar = read_file(value); + } + ).set_sparam()); + add_opt(common_arg( + {"-j", "--json-schema"}, "SCHEMA", + "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", + [](common_params & params, const std::string & value) { + params.sampling.grammar = json_schema_to_grammar(json::parse(value)); + } + ).set_sparam()); + add_opt(common_arg( + {"-jf", "--json-schema-file"}, "FILE", + "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", + [](common_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); + } + std::string schema; + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(schema) + ); + params.sampling.grammar = json_schema_to_grammar(json::parse(schema)); + } + ).set_sparam()); + add_opt(common_arg( + {"--pooling"}, "{none,mean,cls,last,rank}", + "pooling type for embeddings, use model default if unspecified", + [](common_params & params, const std::string & value) { + /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } + else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING")); + add_opt(common_arg( + {"--attention"}, "{causal,non-causal}", + "attention type for embeddings, use model default if unspecified", + [](common_params & params, const std::string & value) { + /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } + else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(common_arg( + {"--rope-scaling"}, "{none,linear,yarn}", + "RoPE frequency scaling method, defaults to linear unless specified by the model", + [](common_params & params, const std::string & value) { + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE")); + add_opt(common_arg( + {"--rope-scale"}, "N", + "RoPE context scaling factor, expands context by a factor of N", + [](common_params & params, const std::string & value) { + params.rope_freq_scale = 1.0f / std::stof(value); + } + ).set_env("LLAMA_ARG_ROPE_SCALE")); + add_opt(common_arg( + {"--rope-freq-base"}, "N", + "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", + [](common_params & params, const std::string & value) { + params.rope_freq_base = std::stof(value); + } + ).set_env("LLAMA_ARG_ROPE_FREQ_BASE")); + add_opt(common_arg( + {"--rope-freq-scale"}, "N", + "RoPE frequency scaling factor, expands context by a factor of 1/N", + [](common_params & params, const std::string & value) { + params.rope_freq_scale = std::stof(value); + } + ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE")); + add_opt(common_arg( + {"--yarn-orig-ctx"}, "N", + string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), + [](common_params & params, int value) { + params.yarn_orig_ctx = value; + } + ).set_env("LLAMA_ARG_YARN_ORIG_CTX")); + add_opt(common_arg( + {"--yarn-ext-factor"}, "N", + string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), + [](common_params & params, const std::string & value) { + params.yarn_ext_factor = std::stof(value); + } + ).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); + add_opt(common_arg( + {"--yarn-attn-factor"}, "N", + string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), + [](common_params & params, const std::string & value) { + params.yarn_attn_factor = std::stof(value); + } + ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); + add_opt(common_arg( + {"--yarn-beta-slow"}, "N", + string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), + [](common_params & params, const std::string & value) { + params.yarn_beta_slow = std::stof(value); + } + ).set_env("LLAMA_ARG_YARN_BETA_SLOW")); + add_opt(common_arg( + {"--yarn-beta-fast"}, "N", + string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), + [](common_params & params, const std::string & value) { + params.yarn_beta_fast = std::stof(value); + } + ).set_env("LLAMA_ARG_YARN_BETA_FAST")); + add_opt(common_arg( + {"-gan", "--grp-attn-n"}, "N", + string_format("group-attention factor (default: %d)", params.grp_attn_n), + [](common_params & params, int value) { + params.grp_attn_n = value; + } + ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY})); + add_opt(common_arg( + {"-gaw", "--grp-attn-w"}, "N", + string_format("group-attention width (default: %d)", params.grp_attn_w), + [](common_params & params, int value) { + params.grp_attn_w = value; + } + ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"-nkvo", "--no-kv-offload"}, + "disable KV offload", + [](common_params & params) { + params.no_kv_offload = true; + } + ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); + add_opt(common_arg( + {"-ctk", "--cache-type-k"}, "TYPE", + string_format( + "KV cache data type for K\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.cache_type_k) + ), + [](common_params & params, const std::string & value) { + params.cache_type_k = kv_cache_type_from_str(value); + } + ).set_env("LLAMA_ARG_CACHE_TYPE_K")); + add_opt(common_arg( + {"-ctv", "--cache-type-v"}, "TYPE", + string_format( + "KV cache data type for V\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.cache_type_v) + ), + [](common_params & params, const std::string & value) { + params.cache_type_v = kv_cache_type_from_str(value); + } + ).set_env("LLAMA_ARG_CACHE_TYPE_V")); + add_opt(common_arg( + {"--hellaswag"}, + "compute HellaSwag score over random tasks from datafile supplied with -f", + [](common_params & params) { + params.hellaswag = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(common_arg( + {"--hellaswag-tasks"}, "N", + string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), + [](common_params & params, int value) { + params.hellaswag_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(common_arg( + {"--winogrande"}, + "compute Winogrande score over random tasks from datafile supplied with -f", + [](common_params & params) { + params.winogrande = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(common_arg( + {"--winogrande-tasks"}, "N", + string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), + [](common_params & params, int value) { + params.winogrande_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(common_arg( + {"--multiple-choice"}, + "compute multiple choice score over random tasks from datafile supplied with -f", + [](common_params & params) { + params.multiple_choice = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(common_arg( + {"--multiple-choice-tasks"}, "N", + string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), + [](common_params & params, int value) { + params.multiple_choice_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(common_arg( + {"--kl-divergence"}, + "computes KL-divergence to logits provided via --kl-divergence-base", + [](common_params & params) { + params.kl_divergence = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(common_arg( + {"--save-all-logits", "--kl-divergence-base"}, "FNAME", + "set logits file", + [](common_params & params, const std::string & value) { + params.logits_file = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(common_arg( + {"--ppl-stride"}, "N", + string_format("stride for perplexity calculation (default: %d)", params.ppl_stride), + [](common_params & params, int value) { + params.ppl_stride = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(common_arg( + {"--ppl-output-type"}, "<0|1>", + string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type), + [](common_params & params, int value) { + params.ppl_output_type = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(common_arg( + {"-dt", "--defrag-thold"}, "N", + string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), + [](common_params & params, const std::string & value) { + params.defrag_thold = std::stof(value); + } + ).set_env("LLAMA_ARG_DEFRAG_THOLD")); + add_opt(common_arg( + {"-np", "--parallel"}, "N", + string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), + [](common_params & params, int value) { + params.n_parallel = value; + } + ).set_env("LLAMA_ARG_N_PARALLEL")); + add_opt(common_arg( + {"-ns", "--sequences"}, "N", + string_format("number of sequences to decode (default: %d)", params.n_sequences), + [](common_params & params, int value) { + params.n_sequences = value; + } + ).set_examples({LLAMA_EXAMPLE_PARALLEL})); + add_opt(common_arg( + {"-cb", "--cont-batching"}, + string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), + [](common_params & params) { + params.cont_batching = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); + add_opt(common_arg( + {"-nocb", "--no-cont-batching"}, + "disable continuous batching", + [](common_params & params) { + params.cont_batching = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); + add_opt(common_arg( + {"--mmproj"}, "FILE", + "path to a multimodal projector file. see tools/mtmd/README.md\n" + "note: if -hf is used, this argument can be omitted", + [](common_params & params, const std::string & value) { + params.mmproj.path = value; + } + ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ")); + add_opt(common_arg( + {"--mmproj-url"}, "URL", + "URL to a multimodal projector file. see tools/mtmd/README.md", + [](common_params & params, const std::string & value) { + params.mmproj.url = value; + } + ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL")); + add_opt(common_arg( + {"--no-mmproj"}, + "explicitly disable multimodal projector, useful when using -hf", + [](common_params & params) { + params.no_mmproj = true; + } + ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ")); + add_opt(common_arg( + {"--no-mmproj-offload"}, + "do not offload multimodal projector to GPU", + [](common_params & params) { + params.mmproj_use_gpu = false; + } + ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD")); + add_opt(common_arg( + {"--image", "--audio"}, "FILE", + "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n", + [](common_params & params, const std::string & value) { + params.image.emplace_back(value); + } + ).set_examples({LLAMA_EXAMPLE_MTMD})); + if (llama_supports_rpc()) { + add_opt(common_arg( + {"--rpc"}, "SERVERS", + "comma separated list of RPC servers", + [](common_params & params, const std::string & value) { + add_rpc_devices(value); + GGML_UNUSED(params); + } + ).set_env("LLAMA_ARG_RPC")); + } + add_opt(common_arg( + {"--mlock"}, + "force system to keep model in RAM rather than swapping or compressing", + [](common_params & params) { + params.use_mlock = true; + } + ).set_env("LLAMA_ARG_MLOCK")); + add_opt(common_arg( + {"--no-mmap"}, + "do not memory-map model (slower load but may reduce pageouts if not using mlock)", + [](common_params & params) { + params.use_mmap = false; + } + ).set_env("LLAMA_ARG_NO_MMAP")); + add_opt(common_arg( + {"--numa"}, "TYPE", + "attempt optimizations that help on some NUMA systems\n" + "- distribute: spread execution evenly over all nodes\n" + "- isolate: only spawn threads on CPUs on the node that execution started on\n" + "- numactl: use the CPU map provided by numactl\n" + "if run without this previously, it is recommended to drop the system page cache before using this\n" + "see https://github.com/ggml-org/llama.cpp/issues/1437", + [](common_params & params, const std::string & value) { + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_env("LLAMA_ARG_NUMA")); + add_opt(common_arg( + {"-dev", "--device"}, "", + "comma-separated list of devices to use for offloading (none = don't offload)\n" + "use --list-devices to see a list of available devices", + [](common_params & params, const std::string & value) { + params.devices = parse_device_list(value); + } + ).set_env("LLAMA_ARG_DEVICE")); + add_opt(common_arg( + {"--list-devices"}, + "print list of available devices and exit", + [](common_params &) { + std::vector rpc_devices; + std::vector all_devices; + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + if (ggml_backend_reg_name(reg) == std::string("RPC")) { + rpc_devices.push_back(dev); + } else { + all_devices.push_back(dev); + } + } + } + // insert RPC devices in front + all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end()); + printf("Available devices:\n"); + for (size_t i = 0; i < all_devices.size(); ++i) { + auto * dev = all_devices[i]; + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024); + } + exit(0); + } + )); + add_opt(common_arg( + {"--override-tensor", "-ot"}, "=,...", + "override tensor buffer type", [](common_params & params, const std::string & value) { + /* static */ std::map buft_list; + if (buft_list.empty()) { + // enumerate all the devices and add their buffer types to the list + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + auto * buft = ggml_backend_dev_buffer_type(dev); + if (buft) { + buft_list[ggml_backend_buft_name(buft)] = buft; + } + } + } + + for (const auto & override : string_split(value, ',')) { + std::string::size_type pos = override.find('='); + if (pos == std::string::npos) { + throw std::invalid_argument("invalid value"); + } + std::string tensor_name = override.substr(0, pos); + std::string buffer_type = override.substr(pos + 1); + + if (buft_list.find(buffer_type) == buft_list.end()) { + printf("Available buffer types:\n"); + for (const auto & it : buft_list) { + printf(" %s\n", ggml_backend_buft_name(it.second)); + } + throw std::invalid_argument("unknown buffer type"); + } + // FIXME: this leaks memory + params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)}); + } + } + )); + add_opt(common_arg( + {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", + "number of layers to store in VRAM", + [](common_params & params, int value) { + params.n_gpu_layers = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n"); + fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); + } + } + ).set_env("LLAMA_ARG_N_GPU_LAYERS")); + add_opt(common_arg( + {"-sm", "--split-mode"}, "{none,layer,row}", + "how to split the model across multiple GPUs, one of:\n" + "- none: use one GPU only\n" + "- layer (default): split layers and KV across GPUs\n" + "- row: split rows across GPUs", + [](common_params & params, const std::string & value) { + std::string arg_next = value; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_MODE_NONE; + } else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_MODE_LAYER; + } else if (arg_next == "row") { + params.split_mode = LLAMA_SPLIT_MODE_ROW; + } else { + throw std::invalid_argument("invalid value"); + } + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n"); + } + } + ).set_env("LLAMA_ARG_SPLIT_MODE")); + add_opt(common_arg( + {"-ts", "--tensor-split"}, "N0,N1,N2,...", + "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", + [](common_params & params, const std::string & value) { + std::string arg_next = value; + + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + if (split_arg.size() >= llama_max_devices()) { + throw std::invalid_argument( + string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) + ); + } + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + params.tensor_split[i] = std::stof(split_arg[i]); + } else { + params.tensor_split[i] = 0.0f; + } + } + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n"); + } + } + ).set_env("LLAMA_ARG_TENSOR_SPLIT")); + add_opt(common_arg( + {"-mg", "--main-gpu"}, "INDEX", + string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), + [](common_params & params, int value) { + params.main_gpu = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n"); + } + } + ).set_env("LLAMA_ARG_MAIN_GPU")); + add_opt(common_arg( + {"--check-tensors"}, + string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), + [](common_params & params) { + params.check_tensors = true; + } + )); + add_opt(common_arg( + {"--override-kv"}, "KEY=TYPE:VALUE", + "advanced option to override model metadata by key. may be specified multiple times.\n" + "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false", + [](common_params & params, const std::string & value) { + if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { + throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str())); + } + } + )); + add_opt(common_arg( + {"--no-op-offload"}, + string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"), + [](common_params & params) { + params.no_op_offload = true; + } + )); + add_opt(common_arg( + {"--lora"}, "FNAME", + "path to LoRA adapter (can be repeated to use multiple adapters)", + [](common_params & params, const std::string & value) { + params.lora_adapters.push_back({ std::string(value), 1.0, nullptr }); + } + // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(common_arg( + {"--lora-scaled"}, "FNAME", "SCALE", + "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", + [](common_params & params, const std::string & fname, const std::string & scale) { + params.lora_adapters.push_back({ fname, std::stof(scale), nullptr }); + } + // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(common_arg( + {"--control-vector"}, "FNAME", + "add a control vector\nnote: this argument can be repeated to add multiple control vectors", + [](common_params & params, const std::string & value) { + params.control_vectors.push_back({ 1.0f, value, }); + } + )); + add_opt(common_arg( + {"--control-vector-scaled"}, "FNAME", "SCALE", + "add a control vector with user defined scaling SCALE\n" + "note: this argument can be repeated to add multiple scaled control vectors", + [](common_params & params, const std::string & fname, const std::string & scale) { + params.control_vectors.push_back({ std::stof(scale), fname }); + } + )); + add_opt(common_arg( + {"--control-vector-layer-range"}, "START", "END", + "layer range to apply the control vector(s) to, start and end inclusive", + [](common_params & params, const std::string & start, const std::string & end) { + params.control_vector_layer_start = std::stoi(start); + params.control_vector_layer_end = std::stoi(end); + } + )); + add_opt(common_arg( + {"-a", "--alias"}, "STRING", + "set alias for model name (to be used by REST API)", + [](common_params & params, const std::string & value) { + params.model_alias = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); + add_opt(common_arg( + {"-m", "--model"}, "FNAME", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? std::string("model path from which to load base model") + : string_format( + "model path (default: `models/$filename` with filename from `--hf-file` " + "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH + ), + [](common_params & params, const std::string & value) { + params.model.path = value; + } + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); + add_opt(common_arg( + {"-mu", "--model-url"}, "MODEL_URL", + "model download url (default: unused)", + [](common_params & params, const std::string & value) { + params.model.url = value; + } + ).set_env("LLAMA_ARG_MODEL_URL")); + add_opt(common_arg( + {"-hf", "-hfr", "--hf-repo"}, "/[:quant]", + "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" + "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n" + "example: unsloth/phi-4-GGUF:q4_k_m\n" + "(default: unused)", + [](common_params & params, const std::string & value) { + params.model.hf_repo = value; + } + ).set_env("LLAMA_ARG_HF_REPO")); + add_opt(common_arg( + {"-hfd", "-hfrd", "--hf-repo-draft"}, "/[:quant]", + "Same as --hf-repo, but for the draft model (default: unused)", + [](common_params & params, const std::string & value) { + params.speculative.model.hf_repo = value; + } + ).set_env("LLAMA_ARG_HFD_REPO")); + add_opt(common_arg( + {"-hff", "--hf-file"}, "FILE", + "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)", + [](common_params & params, const std::string & value) { + params.model.hf_file = value; + } + ).set_env("LLAMA_ARG_HF_FILE")); + add_opt(common_arg( + {"-hfv", "-hfrv", "--hf-repo-v"}, "/[:quant]", + "Hugging Face model repository for the vocoder model (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.model.hf_repo = value; + } + ).set_env("LLAMA_ARG_HF_REPO_V")); + add_opt(common_arg( + {"-hffv", "--hf-file-v"}, "FILE", + "Hugging Face model file for the vocoder model (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.model.hf_file = value; + } + ).set_env("LLAMA_ARG_HF_FILE_V")); + add_opt(common_arg( + {"-hft", "--hf-token"}, "TOKEN", + "Hugging Face access token (default: value from HF_TOKEN environment variable)", + [](common_params & params, const std::string & value) { + params.hf_token = value; + } + ).set_env("HF_TOKEN")); + add_opt(common_arg( + {"--context-file"}, "FNAME", + "file to load context from (repeat to specify multiple files)", + [](common_params & params, const std::string & value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); + } + params.context_files.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(common_arg( + {"--chunk-size"}, "N", + string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size), + [](common_params & params, int value) { + params.chunk_size = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(common_arg( + {"--chunk-separator"}, "STRING", + string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), + [](common_params & params, const std::string & value) { + params.chunk_separator = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(common_arg( + {"--junk"}, "N", + string_format("number of times to repeat the junk text (default: %d)", params.n_junk), + [](common_params & params, int value) { + params.n_junk = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL})); + add_opt(common_arg( + {"--pos"}, "N", + string_format("position of the passkey in the junk text (default: %d)", params.i_pos), + [](common_params & params, int value) { + params.i_pos = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(common_arg( + {"-o", "--output", "--output-file"}, "FNAME", + string_format("output file (default: '%s')", params.out_file.c_str()), + [](common_params & params, const std::string & value) { + params.out_file = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS})); + add_opt(common_arg( + {"-ofreq", "--output-frequency"}, "N", + string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), + [](common_params & params, int value) { + params.n_out_freq = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(common_arg( + {"--save-frequency"}, "N", + string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), + [](common_params & params, int value) { + params.n_save_freq = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(common_arg( + {"--process-output"}, + string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), + [](common_params & params) { + params.process_output = true; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(common_arg( + {"--no-ppl"}, + string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), + [](common_params & params) { + params.compute_ppl = false; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(common_arg( + {"--chunk", "--from-chunk"}, "N", + string_format("start processing the input from chunk N (default: %d)", params.i_chunk), + [](common_params & params, int value) { + params.i_chunk = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(common_arg( + {"--parse-special"}, + string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"), + [](common_params & params) { + params.parse_special = true; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(common_arg( + {"-pps"}, + string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), + [](common_params & params) { + params.is_pp_shared = true; + } + ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL})); + add_opt(common_arg( + {"-npp"}, "n0,n1,...", + "number of prompt tokens", + [](common_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(common_arg( + {"-ntg"}, "n0,n1,...", + "number of text generation tokens", + [](common_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(common_arg( + {"-npl"}, "n0,n1,...", + "number of parallel prompts", + [](common_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(common_arg( + {"--embd-normalize"}, "N", + string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), + [](common_params & params, int value) { + params.embd_normalize = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(common_arg( + {"--embd-output-format"}, "FORMAT", + "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", + [](common_params & params, const std::string & value) { + params.embd_out = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(common_arg( + {"--embd-separator"}, "STRING", + "separator of embeddings (default \\n) for example \"<#sep#>\"", + [](common_params & params, const std::string & value) { + params.embd_sep = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(common_arg( + {"--cls-separator"}, "STRING", + "separator of classification sequences (default \\t) for example \"<#seq#>\"", + [](common_params & params, const std::string & value) { + params.cls_sep = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(common_arg( + {"--host"}, "HOST", + string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()), + [](common_params & params, const std::string & value) { + params.hostname = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); + add_opt(common_arg( + {"--port"}, "PORT", + string_format("port to listen (default: %d)", params.port), + [](common_params & params, int value) { + params.port = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); + add_opt(common_arg( + {"--path"}, "PATH", + string_format("path to serve static files from (default: %s)", params.public_path.c_str()), + [](common_params & params, const std::string & value) { + params.public_path = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); + add_opt(common_arg( + {"--no-webui"}, + string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"), + [](common_params & params) { + params.webui = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI")); + add_opt(common_arg( + {"--embedding", "--embeddings"}, + string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), + [](common_params & params) { + params.embedding = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); + add_opt(common_arg( + {"--reranking", "--rerank"}, + string_format("enable reranking endpoint on server (default: %s)", "disabled"), + [](common_params & params) { + params.embedding = true; + params.pooling_type = LLAMA_POOLING_TYPE_RANK; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING")); + add_opt(common_arg( + {"--api-key"}, "KEY", + "API key to use for authentication (default: none)", + [](common_params & params, const std::string & value) { + params.api_keys.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); + add_opt(common_arg( + {"--api-key-file"}, "FNAME", + "path to file containing API keys (default: none)", + [](common_params & params, const std::string & value) { + std::ifstream key_file(value); + if (!key_file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); + } + std::string key; + while (std::getline(key_file, key)) { + if (!key.empty()) { + params.api_keys.push_back(key); + } + } + key_file.close(); + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--ssl-key-file"}, "FNAME", + "path to file a PEM-encoded SSL private key", + [](common_params & params, const std::string & value) { + params.ssl_file_key = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE")); + add_opt(common_arg( + {"--ssl-cert-file"}, "FNAME", + "path to file a PEM-encoded SSL certificate", + [](common_params & params, const std::string & value) { + params.ssl_file_cert = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE")); + add_opt(common_arg( + {"-to", "--timeout"}, "N", + string_format("server read/write timeout in seconds (default: %d)", params.timeout_read), + [](common_params & params, int value) { + params.timeout_read = value; + params.timeout_write = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT")); + add_opt(common_arg( + {"--threads-http"}, "N", + string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), + [](common_params & params, int value) { + params.n_threads_http = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); + add_opt(common_arg( + {"--cache-reuse"}, "N", + string_format( + "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n" + "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse + ), + [](common_params & params, int value) { + params.n_cache_reuse = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE")); + add_opt(common_arg( + {"--metrics"}, + string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), + [](common_params & params) { + params.endpoint_metrics = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); + add_opt(common_arg( + {"--slots"}, + string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + [](common_params & params) { + params.endpoint_slots = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); + add_opt(common_arg( + {"--props"}, + string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), + [](common_params & params) { + params.endpoint_props = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); + add_opt(common_arg( + {"--no-slots"}, + "disables slots monitoring endpoint", + [](common_params & params) { + params.endpoint_slots = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); + add_opt(common_arg( + {"--slot-save-path"}, "PATH", + "path to save slot kv cache (default: disabled)", + [](common_params & params, const std::string & value) { + params.slot_save_path = value; + // if doesn't end with DIRECTORY_SEPARATOR, add it + if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + params.slot_save_path += DIRECTORY_SEPARATOR; + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--jinja"}, + "use jinja template for chat (default: disabled)", + [](common_params & params) { + params.use_jinja = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA")); + add_opt(common_arg( + {"--reasoning-format"}, "FORMAT", + "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" + "- none: leaves thoughts unparsed in `message.content`\n" + "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n" + "(default: deepseek)", + [](common_params & params, const std::string & value) { + /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } + else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; } + else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK")); + add_opt(common_arg( + {"--reasoning-budget"}, "N", + "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)", + [](common_params & params, int value) { + if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); } + params.reasoning_budget = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET")); + add_opt(common_arg( + {"--chat-template"}, "JINJA_TEMPLATE", + string_format( + "set custom jinja chat template (default: template taken from model's metadata)\n" + "if suffix/prefix are specified, template will be disabled\n" + "only commonly used templates are accepted (unless --jinja is set before this flag):\n" + "list of built-in templates:\n%s", list_builtin_chat_templates().c_str() + ), + [](common_params & params, const std::string & value) { + params.chat_template = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + add_opt(common_arg( + {"--chat-template-file"}, "JINJA_TEMPLATE_FILE", + string_format( + "set custom jinja chat template file (default: template taken from model's metadata)\n" + "if suffix/prefix are specified, template will be disabled\n" + "only commonly used templates are accepted (unless --jinja is set before this flag):\n" + "list of built-in templates:\n%s", list_builtin_chat_templates().c_str() + ), + [](common_params & params, const std::string & value) { + params.chat_template = read_file(value); + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE")); + add_opt(common_arg( + {"--no-prefill-assistant"}, + string_format( + "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n" + "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n" + ), + [](common_params & params) { + params.prefill_assistant = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT")); + add_opt(common_arg( + {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", + string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), + [](common_params & params, const std::string & value) { + params.slot_prompt_similarity = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--lora-init-without-apply"}, + string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), + [](common_params & params) { + params.lora_init_without_apply = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--simple-io"}, + "use basic IO for better compatibility in subprocesses and limited consoles", + [](common_params & params) { + params.simple_io = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"--positive-file"}, "FNAME", + string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), + [](common_params & params, const std::string & value) { + params.cvector_positive_file = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(common_arg( + {"--negative-file"}, "FNAME", + string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), + [](common_params & params, const std::string & value) { + params.cvector_negative_file = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(common_arg( + {"--pca-batch"}, "N", + string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), + [](common_params & params, int value) { + params.n_pca_batch = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(common_arg( + {"--pca-iter"}, "N", + string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), + [](common_params & params, int value) { + params.n_pca_iterations = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(common_arg( + {"--method"}, "{pca, mean}", + "dimensionality reduction method to be used (default: pca)", + [](common_params & params, const std::string & value) { + /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(common_arg( + {"--output-format"}, "{md,jsonl}", + "output format for batched-bench results (default: md)", + [](common_params & params, const std::string & value) { + /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } + else if (value == "md") { params.batched_bench_output_jsonl = false; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(common_arg( + {"--log-disable"}, + "Log disable", + [](common_params &) { + common_log_pause(common_log_main()); + } + )); + add_opt(common_arg( + {"--log-file"}, "FNAME", + "Log to file", + [](common_params &, const std::string & value) { + common_log_set_file(common_log_main(), value.c_str()); + } + )); + add_opt(common_arg( + {"--log-colors"}, + "Enable colored logging", + [](common_params &) { + common_log_set_colors(common_log_main(), true); + } + ).set_env("LLAMA_LOG_COLORS")); + add_opt(common_arg( + {"-v", "--verbose", "--log-verbose"}, + "Set verbosity level to infinity (i.e. log all messages, useful for debugging)", + [](common_params & params) { + params.verbosity = INT_MAX; + common_log_set_verbosity_thold(INT_MAX); + } + )); + add_opt(common_arg( + {"--offline"}, + "Offline mode: forces use of cache, prevents network access", + [](common_params & params) { + params.offline = true; + } + ).set_env("LLAMA_OFFLINE")); + add_opt(common_arg( + {"-lv", "--verbosity", "--log-verbosity"}, "N", + "Set the verbosity threshold. Messages with a higher verbosity will be ignored.", + [](common_params & params, int value) { + params.verbosity = value; + common_log_set_verbosity_thold(value); + } + ).set_env("LLAMA_LOG_VERBOSITY")); + add_opt(common_arg( + {"--log-prefix"}, + "Enable prefix in log messages", + [](common_params &) { + common_log_set_prefix(common_log_main(), true); + } + ).set_env("LLAMA_LOG_PREFIX")); + add_opt(common_arg( + {"--log-timestamps"}, + "Enable timestamps in log messages", + [](common_params &) { + common_log_set_timestamps(common_log_main(), true); + } + ).set_env("LLAMA_LOG_TIMESTAMPS")); + + // speculative parameters + add_opt(common_arg( + {"-td", "--threads-draft"}, "N", + "number of threads to use during generation (default: same as --threads)", + [](common_params & params, int value) { + params.speculative.cpuparams.n_threads = value; + if (params.speculative.cpuparams.n_threads <= 0) { + params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-tbd", "--threads-batch-draft"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)", + [](common_params & params, int value) { + params.speculative.cpuparams_batch.n_threads = value; + if (params.speculative.cpuparams_batch.n_threads <= 0) { + params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Cd", "--cpu-mask-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](common_params & params, const std::string & mask) { + params.speculative.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Crd", "--cpu-range-draft"}, "lo-hi", + "Ranges of CPUs for affinity. Complements --cpu-mask-draft", + [](common_params & params, const std::string & range) { + params.speculative.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--cpu-strict-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: same as --cpu-strict)", + [](common_params & params, int value) { + params.speculative.cpuparams.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--prio-draft"}, "N", + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority), + [](common_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--poll-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: same as --poll])", + [](common_params & params, int value) { + params.speculative.cpuparams.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Cbd", "--cpu-mask-batch-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](common_params & params, const std::string & mask) { + params.speculative.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", + "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)", + [](common_params & params, const std::string & range) { + params.speculative.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--cpu-strict-batch-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: --cpu-strict-draft)", + [](common_params & params, int value) { + params.speculative.cpuparams_batch.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--prio-batch-draft"}, "N", + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority), + [](common_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--poll-batch-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: --poll-draft)", + [](common_params & params, int value) { + params.speculative.cpuparams_batch.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(common_arg( + {"--draft-max", "--draft", "--draft-n"}, "N", + string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max), + [](common_params & params, int value) { + params.speculative.n_max = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX")); + add_opt(common_arg( + {"--draft-min", "--draft-n-min"}, "N", + string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), + [](common_params & params, int value) { + params.speculative.n_min = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN")); + add_opt(common_arg( + {"--draft-p-split"}, "P", + string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), + [](common_params & params, const std::string & value) { + params.speculative.p_split = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT")); + add_opt(common_arg( + {"--draft-p-min"}, "P", + string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), + [](common_params & params, const std::string & value) { + params.speculative.p_min = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN")); + add_opt(common_arg( + {"-cd", "--ctx-size-draft"}, "N", + string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), + [](common_params & params, int value) { + params.speculative.n_ctx = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT")); + add_opt(common_arg( + {"-devd", "--device-draft"}, "", + "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" + "use --list-devices to see a list of available devices", + [](common_params & params, const std::string & value) { + params.speculative.devices = parse_device_list(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", + "number of layers to store in VRAM for the draft model", + [](common_params & params, int value) { + params.speculative.n_gpu_layers = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n"); + fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT")); + add_opt(common_arg( + {"-md", "--model-draft"}, "FNAME", + "draft model for speculative decoding (default: unused)", + [](common_params & params, const std::string & value) { + params.speculative.model.path = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); + add_opt(common_arg( + {"-ctkd", "--cache-type-k-draft"}, "TYPE", + string_format( + "KV cache data type for K for the draft model\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.speculative.cache_type_k) + ), + [](common_params & params, const std::string & value) { + params.speculative.cache_type_k = kv_cache_type_from_str(value); + } + ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT")); + add_opt(common_arg( + {"-ctvd", "--cache-type-v-draft"}, "TYPE", + string_format( + "KV cache data type for V for the draft model\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.speculative.cache_type_v) + ), + [](common_params & params, const std::string & value) { + params.speculative.cache_type_v = kv_cache_type_from_str(value); + } + ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT")); + + add_opt(common_arg( + {"-mv", "--model-vocoder"}, "FNAME", + "vocoder model for audio generation (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.model.path = value; + } + ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--tts-use-guide-tokens"}, + "Use guide tokens to improve TTS word recall", + [](common_params & params) { + params.vocoder.use_guide_tokens = true; + } + ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--tts-speaker-file"}, "FNAME", + "speaker file path for audio generation", + [](common_params & params, const std::string & value) { + params.vocoder.speaker_file = value; + } + ).set_examples({LLAMA_EXAMPLE_TTS})); + + // model-specific + add_opt(common_arg( + {"--tts-oute-default"}, + string_format("use default OuteTTS models (note: can download weights from the internet)"), + [](common_params & params) { + params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF"; + params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; + params.vocoder.model.hf_repo = "ggml-org/WavTokenizer"; + params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf"; + } + ).set_examples({LLAMA_EXAMPLE_TTS})); + + add_opt(common_arg( + {"--embd-bge-small-en-default"}, + string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"), + [](common_params & params) { + params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF"; + params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf"; + params.pooling_type = LLAMA_POOLING_TYPE_NONE; + params.embd_normalize = 2; + params.n_ctx = 512; + params.verbose_prompt = true; + params.embedding = true; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--embd-e5-small-en-default"}, + string_format("use default e5-small-v2 model (note: can download weights from the internet)"), + [](common_params & params) { + params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF"; + params.model.hf_file = "e5-small-v2-q8_0.gguf"; + params.pooling_type = LLAMA_POOLING_TYPE_NONE; + params.embd_normalize = 2; + params.n_ctx = 512; + params.verbose_prompt = true; + params.embedding = true; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--embd-gte-small-default"}, + string_format("use default gte-small model (note: can download weights from the internet)"), + [](common_params & params) { + params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF"; + params.model.hf_file = "gte-small-q8_0.gguf"; + params.pooling_type = LLAMA_POOLING_TYPE_NONE; + params.embd_normalize = 2; + params.n_ctx = 512; + params.verbose_prompt = true; + params.embedding = true; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--fim-qwen-1.5b-default"}, + string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"), + [](common_params & params) { + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf"; + params.port = 8012; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 1024; + params.n_batch = 1024; + params.n_ctx = 0; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--fim-qwen-3b-default"}, + string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"), + [](common_params & params) { + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf"; + params.port = 8012; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 1024; + params.n_batch = 1024; + params.n_ctx = 0; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--fim-qwen-7b-default"}, + string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"), + [](common_params & params) { + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; + params.port = 8012; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 1024; + params.n_batch = 1024; + params.n_ctx = 0; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--fim-qwen-7b-spec"}, + string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"), + [](common_params & params) { + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; + params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; + params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; + params.speculative.n_gpu_layers = 99; + params.port = 8012; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 1024; + params.n_batch = 1024; + params.n_ctx = 0; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--fim-qwen-14b-spec"}, + string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"), + [](common_params & params) { + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf"; + params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; + params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; + params.speculative.n_gpu_layers = 99; + params.port = 8012; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 1024; + params.n_batch = 1024; + params.n_ctx = 0; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + + return ctx_arg; +} diff --git a/llama/llama.cpp/common/arg.h b/llama/llama.cpp/common/arg.h new file mode 100644 index 000000000..70bea100f --- /dev/null +++ b/llama/llama.cpp/common/arg.h @@ -0,0 +1,89 @@ +#pragma once + +#include "common.h" + +#include +#include +#include + +// +// CLI argument parsing +// + +struct common_arg { + std::set examples = {LLAMA_EXAMPLE_COMMON}; + std::set excludes = {}; + std::vector args; + const char * value_hint = nullptr; // help text or example for arg value + const char * value_hint_2 = nullptr; // for second arg value + const char * env = nullptr; + std::string help; + bool is_sparam = false; // is current arg a sampling param? + void (*handler_void) (common_params & params) = nullptr; + void (*handler_string) (common_params & params, const std::string &) = nullptr; + void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr; + void (*handler_int) (common_params & params, int) = nullptr; + + common_arg( + const std::initializer_list & args, + const char * value_hint, + const std::string & help, + void (*handler)(common_params & params, const std::string &) + ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} + + common_arg( + const std::initializer_list & args, + const char * value_hint, + const std::string & help, + void (*handler)(common_params & params, int) + ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} + + common_arg( + const std::initializer_list & args, + const std::string & help, + void (*handler)(common_params & params) + ) : args(args), help(help), handler_void(handler) {} + + // support 2 values for arg + common_arg( + const std::initializer_list & args, + const char * value_hint, + const char * value_hint_2, + const std::string & help, + void (*handler)(common_params & params, const std::string &, const std::string &) + ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} + + common_arg & set_examples(std::initializer_list examples); + common_arg & set_excludes(std::initializer_list excludes); + common_arg & set_env(const char * env); + common_arg & set_sparam(); + bool in_example(enum llama_example ex); + bool is_exclude(enum llama_example ex); + bool get_value_from_env(std::string & output); + bool has_value_from_env(); + std::string to_string(); +}; + +struct common_params_context { + enum llama_example ex = LLAMA_EXAMPLE_COMMON; + common_params & params; + std::vector options; + void(*print_usage)(int, char **) = nullptr; + common_params_context(common_params & params) : params(params) {} +}; + +// parse input arguments from CLI +// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) +bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); + +// function to be used by test-arg-parser +common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); +bool common_has_curl(); + +struct common_remote_params { + std::vector headers; + long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout + long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB +}; +// get remote file content, returns +std::pair> common_remote_get_content(const std::string & url, const common_remote_params & params); diff --git a/llama/llama.cpp/common/chat-parser.cpp b/llama/llama.cpp/common/chat-parser.cpp new file mode 100644 index 000000000..18a30e49a --- /dev/null +++ b/llama/llama.cpp/common/chat-parser.cpp @@ -0,0 +1,385 @@ +#include "chat-parser.h" +#include "common.h" +#include "log.h" +#include "regex-partial.h" + +#include +#include +#include +#include + +using json = nlohmann::ordered_json; + +common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax) + : input_(input), is_partial_(is_partial), syntax_(syntax) +{ + result_.role = "assistant"; + + while (true) { + std::string id = std::to_string(std::rand()); + if (input.find(id) == std::string::npos) { + healing_marker_ = id; + break; + } + } +} + +std::string common_chat_msg_parser::str(const common_string_range & rng) const { + GGML_ASSERT(rng.begin <= rng.end); + return input_.substr(rng.begin, rng.end - rng.begin); +} + +void common_chat_msg_parser::add_content(const std::string &content) { + result_.content += content; +} + +void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) { + result_.reasoning_content += reasoning_content; +} + +bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) { + if (name.empty()) { + return false; + } + + common_chat_tool_call tool_call; + tool_call.name = name; + tool_call.arguments = arguments; + tool_call.id = id; + + // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str()); + result_.tool_calls.emplace_back(tool_call); + + return true; +} +bool common_chat_msg_parser::add_tool_call(const json & tool_call) { + std::string name = tool_call.contains("name") ? tool_call.at("name") : ""; + std::string id = tool_call.contains("id") ? tool_call.at("id") : ""; + std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : ""; + return add_tool_call(name, id, arguments); +} + +bool common_chat_msg_parser::add_tool_calls(const json & arr) { + for (const auto & item : arr) { + if (!add_tool_call(item)) { + return false; + } + } + return true; +} +void common_chat_msg_parser::finish() { + if (!is_partial_ && pos_ != input_.size()) { + throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_)); + } +} + +bool common_chat_msg_parser::consume_spaces() { + const auto length = input_.size(); + auto consumed = false; + while (pos_ < length && std::isspace(input_[pos_])) { + ++pos_; + consumed = true; + } + return consumed; +} + +bool common_chat_msg_parser::try_consume_literal(const std::string & literal) { + auto pos = pos_; + for (auto i = 0u; i < literal.size(); ++i) { + if (pos >= input_.size()) { + return false; + } + if (input_[pos] != literal[i]) { + return false; + } + ++pos; + } + pos_ = pos; + return true; +} + +std::optional common_chat_msg_parser::try_find_literal(const std::string & literal) { + auto idx = input_.find(literal, pos_); + if (idx != std::string::npos) { + find_regex_result res; + res.prelude = input_.substr(pos_, idx - pos_); + auto end = idx + literal.size(); + res.groups.emplace_back(common_string_range{idx, end}); + move_to(end); + return res; + } + if (is_partial_) { + idx = string_find_partial_stop(input_, literal); + if (idx != std::string::npos && idx >= pos_) { + find_regex_result res; + res.prelude = input_.substr(pos_, idx - pos_); + auto end = input_.size(); + res.groups.emplace_back(common_string_range{idx, end}); + move_to(end); + return res; + } + } + return std::nullopt; +} + +void common_chat_msg_parser::consume_literal(const std::string & literal) { + if (!try_consume_literal(literal)) { + throw common_chat_msg_partial_exception(literal); + } +} + +bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) { + auto handle_reasoning = [&](const std::string & reasoning, bool closed) { + auto stripped_reasoning = string_strip(reasoning); + if (stripped_reasoning.empty()) { + return; + } + if (syntax_.reasoning_in_content) { + add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "" : start_think); + add_content(stripped_reasoning); + if (closed) { + add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "" : end_think); + } + } else { + add_reasoning_content(stripped_reasoning); + } + }; + if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) { + if (syntax_.thinking_forced_open || try_consume_literal(start_think)) { + if (auto res = try_find_literal(end_think)) { + handle_reasoning(res->prelude, /* closed */ true); + consume_spaces(); + return true; + } + auto rest = consume_rest(); + if (!rest.empty()) { + handle_reasoning(rest, /* closed */ !is_partial()); + } + // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877) + // if (!syntax_.thinking_forced_open) { + // throw common_chat_msg_partial_exception(end_think); + // } + return true; + } + } + return false; +} + +std::string common_chat_msg_parser::consume_rest() { + auto rest = input_.substr(pos_); + pos_ = input_.size(); + return rest; +} + +// Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback. +std::optional common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) { + auto m = regex.search(input_, from == std::string::npos ? pos_ : from); + if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) { + return std::nullopt; + } + auto prelude = input_.substr(pos_, m.groups[0].begin - pos_); + pos_ = m.groups[0].end; + + if (add_prelude_to_content) { + add_content(prelude); + } + if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) { + if (is_partial()) { + throw common_chat_msg_partial_exception(regex.str()); + } + return std::nullopt; + } + return find_regex_result{prelude, m.groups}; +} + +common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) { + if (auto result = try_consume_regex(regex)) { + return *result; + } + throw common_chat_msg_partial_exception(regex.str()); +} + +std::optional common_chat_msg_parser::try_consume_regex(const common_regex & regex) { + auto m = regex.search(input_, pos_); + if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) { + return std::nullopt; + } + if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) { + if (is_partial()) { + throw common_chat_msg_partial_exception(regex.str()); + } + return std::nullopt; + } + if (m.groups[0].begin != pos_) { + // Didn't match at the current position. + return std::nullopt; + } + pos_ = m.groups[0].end; + + return find_regex_result { + /* .prelude = */ "", + m.groups, + }; +} + +std::optional common_chat_msg_parser::try_consume_json() { + auto it = input_.cbegin() + pos_; + const auto end = input_.cend(); + common_json result; + if (!common_json_parse(it, end, healing_marker_, result)) { + return std::nullopt; + } + pos_ = std::distance(input_.cbegin(), it); + if (result.healing_marker.marker.empty()) { + // No healing marker, just return the parsed json + return result; + } + if (!is_partial()) { + throw common_chat_msg_partial_exception("JSON"); + } + return result; +} + +common_json common_chat_msg_parser::consume_json() { + if (auto result = try_consume_json()) { + return *result; + } + throw common_chat_msg_partial_exception("JSON"); +} + +common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args( + const std::vector> & args_paths, + const std::vector> & content_paths +) { + if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) { + return *result; + } + throw common_chat_msg_partial_exception("JSON"); +} + +std::optional common_chat_msg_parser::try_consume_json_with_dumped_args( + const std::vector> & args_paths, + const std::vector> & content_paths +) { + auto partial = try_consume_json(); + if (!partial) { + return std::nullopt; + } + auto is_arguments_path = [&](const std::vector & path) { + return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end(); + }; + auto is_content_path = [&](const std::vector & path) { + return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end(); + }; + + if (partial->healing_marker.marker.empty()) { + if (args_paths.empty()) { + // No arguments to dump, and JSON was parsed fully. + return consume_json_result { + partial->json, + /* .is_partial = */ false, + }; + } + if (is_arguments_path({})) { + // Entire JSON is the arguments and was parsed fully. + return consume_json_result { + partial->json.dump(), + /* .is_partial = */ false, + }; + } + } + + LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str()); + + auto found_healing_marker = false; + std::vector path; + std::function remove_unsupported_healings_and_dump_args = [&](const json & j) -> json { + if (is_arguments_path(path)) { + auto arguments = j.dump(); + if (is_partial() && !partial->healing_marker.marker.empty()) { + auto idx = arguments.find(partial->healing_marker.json_dump_marker); + if (idx != std::string::npos) { + arguments.resize(idx); + found_healing_marker = true; + } + if (arguments == "\"") { + // This happens because of completing `:"$magic` after `"arguments"` + arguments = ""; + } + } + return arguments; + } + if (is_content_path(path)) { + if (!j.is_string()) { + throw std::runtime_error("Content path must be a string"); + } + std::string str = j; + auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string + if (idx != std::string::npos) { + str.resize(idx); + found_healing_marker = true; + } + return str; + } + if (j.is_object()) { + auto obj = json::object(); + for (const auto & p : j.items()) { + const auto & key = p.key(); + const auto & value = p.value(); + const std::string key_str = key; // NOLINT + auto idx = key_str.find(healing_marker_); + if (idx != std::string::npos) { + found_healing_marker = true; + break; + } + path.push_back(key_str); + if (value.is_string()) { + const std::string value_str = value; + if (value_str.find(healing_marker_) != std::string::npos) { + found_healing_marker = true; + if (is_content_path(path)) { + if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) { + // The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair. + obj[key] = remove_unsupported_healings_and_dump_args(value); + } + } + break; + } + obj[key] = value; + } else { + obj[key] = remove_unsupported_healings_and_dump_args(value); + } + path.pop_back(); + } + return obj; + } + if (j.is_array()) { + auto arr = json::array(); + for (const auto & value : j) { + if (value.is_string()) { + std::string str = value; + auto idx = str.find(healing_marker_); + if (idx != std::string::npos) { + // Don't heal array values that aren't in the arguments. + found_healing_marker = true; + break; + } + } + arr.push_back(remove_unsupported_healings_and_dump_args(value)); + } + return arr; + } + return j; + }; + + auto cleaned = remove_unsupported_healings_and_dump_args(partial->json); + LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str()); + return consume_json_result { + cleaned, + /* .is_partial = */ found_healing_marker, + }; +} + +void common_chat_msg_parser::clear_tools() { + result_.tool_calls.clear(); +} diff --git a/llama/llama.cpp/common/chat-parser.h b/llama/llama.cpp/common/chat-parser.h new file mode 100644 index 000000000..0e64c341a --- /dev/null +++ b/llama/llama.cpp/common/chat-parser.h @@ -0,0 +1,120 @@ +#pragma once + +#include "chat.h" +#include "json-partial.h" +#include "regex-partial.h" + +#include + +#include +#include +#include + +class common_chat_msg_partial_exception : public std::runtime_error { + public: + common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {} +}; + +class common_chat_msg_parser { + std::string input_; + bool is_partial_; + common_chat_syntax syntax_; + std::string healing_marker_; + + size_t pos_ = 0; + common_chat_msg result_; + + public: + common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax); + const std::string & input() const { return input_; } + size_t pos() const { return pos_; } + const std::string & healing_marker() const { return healing_marker_; } + const bool & is_partial() const { return is_partial_; } + const common_chat_msg & result() const { return result_; } + const common_chat_syntax & syntax() const { return syntax_; } + + void move_to(size_t pos) { + if (pos > input_.size()) { + throw std::runtime_error("Invalid position!"); + } + pos_ = pos; + } + void move_back(size_t n) { + if (pos_ < n) { + throw std::runtime_error("Can't move back that far!"); + } + pos_ -= n; + } + + // Get the substring of the input at the given range + std::string str(const common_string_range & rng) const; + + // Appends to the result.content field + void add_content(const std::string & content); + + // Appends to the result.reasoning_content field + void add_reasoning_content(const std::string & reasoning_content); + + // Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything. + bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments); + + // Adds a tool call using the "name", "id" and "arguments" fields of the json object + bool add_tool_call(const nlohmann::ordered_json & tool_call); + + // Adds an array of tool calls using their "name", "id" and "arguments" fields. + bool add_tool_calls(const nlohmann::ordered_json & arr); + + void finish(); + + bool consume_spaces(); + + void consume_literal(const std::string & literal); + + bool try_parse_reasoning(const std::string & start_think, const std::string & end_think); + + std::string consume_rest(); + + struct find_regex_result { + std::string prelude; + std::vector groups; + }; + + std::optional try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true); + + bool try_consume_literal(const std::string & literal); + + std::optional try_find_literal(const std::string & literal); + + find_regex_result consume_regex(const common_regex & regex); + + std::optional try_consume_regex(const common_regex & regex); + + std::optional try_consume_json(); + common_json consume_json(); + + struct consume_json_result { + nlohmann::ordered_json value; + bool is_partial; + }; + + /* + Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings. + + By default, object keys can't be truncated, nor can string values (their corresponding key is removed, + e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}` + + But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings + - with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}` + - with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}` + */ + consume_json_result consume_json_with_dumped_args( + const std::vector> & args_paths = {}, + const std::vector> & content_paths = {} + ); + std::optional try_consume_json_with_dumped_args( + const std::vector> & args_paths = {}, + const std::vector> & content_paths = {} + ); + + void clear_tools(); +}; diff --git a/llama/llama.cpp/common/chat.cpp b/llama/llama.cpp/common/chat.cpp new file mode 100644 index 000000000..7d9aaeb12 --- /dev/null +++ b/llama/llama.cpp/common/chat.cpp @@ -0,0 +1,1932 @@ +#include "chat.h" +#include "chat-parser.h" +#include "common.h" +#include "json-partial.h" +#include "json-schema-to-grammar.h" +#include "log.h" +#include "regex-partial.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) { + auto time = std::chrono::system_clock::to_time_t(now); + auto local_time = *std::localtime(&time); + std::ostringstream ss; + ss << std::put_time(&local_time, format.c_str()); + auto res = ss.str(); + return res; +} + +static std::string string_diff(const std::string & last, const std::string & current) { + if (last.empty()) { + return current; + } + if (!string_starts_with(current, last)) { + if (string_starts_with(last, current)) { + // This happens if the last generation ended on a partial stop word (not erased), + // and the current ended on a stop word (erased). + return ""; + } + throw std::runtime_error("Invalid diff: '" + last + "' not found at start of '" + current + "'"); + } + return current.substr(last.size()); +} + +static bool has_content_or_tool_calls(const common_chat_msg & msg) { + return !msg.content.empty() || !msg.tool_calls.empty(); +} + +template <> +json common_chat_msg::to_json_oaicompat() const +{ + json message { + {"role", "assistant"}, + }; + if (!reasoning_content.empty()) { + message["reasoning_content"] = reasoning_content; + } + if (content.empty() && !tool_calls.empty()) { + message["content"] = json(); + } else { + message["content"] = content; + } + if (!tool_calls.empty()) { + auto arr = json::array(); + for (const auto & tc : tool_calls) { + arr.push_back({ + {"type", "function"}, + {"function", { + {"name", tc.name}, + {"arguments", tc.arguments}, + }}, + {"id", tc.id}, + // // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo). + // // We only generate a random id for the ones that don't generate one by themselves + // // (they also won't get to see it as their template likely doesn't use it, so it's all for the client) + // {"id", tc.id.empty() ? gen_tool_call_id() : tc.id}, + }); + } + message["tool_calls"] = arr; + } + return message; +} + +std::vector common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) { + std::vector diffs; + if (previous_msg.reasoning_content != new_msg.reasoning_content) { + auto & diff = diffs.emplace_back(); + diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content); + } + if (previous_msg.content != new_msg.content) { + auto & diff = diffs.emplace_back(); + diff.content_delta = string_diff(previous_msg.content, new_msg.content); + } + + if (new_msg.tool_calls.size() < previous_msg.tool_calls.size()) { + throw std::runtime_error("Invalid diff: now finding less tool calls!"); + } + + if (!previous_msg.tool_calls.empty()) { + auto idx = previous_msg.tool_calls.size() - 1; + const auto & pref = previous_msg.tool_calls[idx]; + const auto & newf = new_msg.tool_calls[idx]; + if (pref.name != newf.name) { + throw std::runtime_error("Invalid diff: tool call mismatch!"); + } + auto args_diff = string_diff(pref.arguments, newf.arguments); + if (!args_diff.empty() || pref.id != newf.id) { + auto & diff = diffs.emplace_back(); + diff.tool_call_index = idx; + if (pref.id != newf.id) { + diff.tool_call_delta.id = newf.id; + diff.tool_call_delta.name = newf.name; + } + diff.tool_call_delta.arguments = args_diff; + } + } + for (size_t idx = previous_msg.tool_calls.size(); idx < new_msg.tool_calls.size(); ++idx) { + auto & diff = diffs.emplace_back(); + diff.tool_call_index = idx; + diff.tool_call_delta = new_msg.tool_calls[idx]; + } + return diffs; +} + +typedef minja::chat_template common_chat_template; + +struct common_chat_templates { + bool has_explicit_template; // Model had builtin template or template overridde was specified. + std::unique_ptr template_default; // always set (defaults to chatml) + std::unique_ptr template_tool_use; +}; + +struct templates_params { + json messages; + json tools; + common_chat_tool_choice tool_choice; + json json_schema; + bool parallel_tool_calls; + bool stream; + std::string grammar; + bool add_generation_prompt = true; + bool enable_thinking = true; + std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); +}; + +common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) { + if (tool_choice == "auto") { + return COMMON_CHAT_TOOL_CHOICE_AUTO; + } + if (tool_choice == "none") { + return COMMON_CHAT_TOOL_CHOICE_NONE; + } + if (tool_choice == "required") { + return COMMON_CHAT_TOOL_CHOICE_REQUIRED; + } + throw std::runtime_error("Invalid tool_choice: " + tool_choice); +} + +template <> +std::vector common_chat_msgs_parse_oaicompat(const json & messages) { + std::vector msgs; + + try { + + if (!messages.is_array()) { + throw std::runtime_error("Expected 'messages' to be an array, got " + messages.dump()); + } + + for (const auto & message : messages) { + if (!message.is_object()) { + throw std::runtime_error("Expected 'message' to be an object, got " + message.dump()); + } + + common_chat_msg msg; + if (!message.contains("role")) { + throw std::runtime_error("Missing 'role' in message: " + message.dump()); + } + msg.role = message.at("role"); + + auto has_content = message.contains("content"); + auto has_tool_calls = message.contains("tool_calls"); + if (has_content) { + const auto & content = message.at("content"); + if (content.is_string()) { + msg.content = content; + } else if (content.is_array()) { + for (const auto & part : content) { + if (!part.contains("type")) { + throw std::runtime_error("Missing content part type: " + part.dump()); + } + const auto & type = part.at("type"); + if (type != "text") { + throw std::runtime_error("Unsupported content part type: " + type.dump()); + } + common_chat_msg_content_part msg_part; + msg_part.type = type; + msg_part.text = part.at("text"); + msg.content_parts.push_back(msg_part); + } + } else if (!content.is_null()) { + throw std::runtime_error("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)"); + } + } + if (has_tool_calls) { + for (const auto & tool_call : message.at("tool_calls")) { + common_chat_tool_call tc; + if (!tool_call.contains("type")) { + throw std::runtime_error("Missing tool call type: " + tool_call.dump()); + } + const auto & type = tool_call.at("type"); + if (type != "function") { + throw std::runtime_error("Unsupported tool call type: " + tool_call.dump()); + } + if (!tool_call.contains("function")) { + throw std::runtime_error("Missing tool call function: " + tool_call.dump()); + } + const auto & fc = tool_call.at("function"); + if (!fc.contains("name")) { + throw std::runtime_error("Missing tool call name: " + tool_call.dump()); + } + tc.name = fc.at("name"); + tc.arguments = fc.at("arguments"); + if (tool_call.contains("id")) { + tc.id = tool_call.at("id"); + } + msg.tool_calls.push_back(tc); + } + } + if (!has_content && !has_tool_calls) { + throw std::runtime_error("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)"); + } + if (message.contains("reasoning_content")) { + msg.reasoning_content = message.at("reasoning_content"); + } + if (message.contains("name")) { + msg.tool_name = message.at("name"); + } + if (message.contains("tool_call_id")) { + msg.tool_call_id = message.at("tool_call_id"); + } + + msgs.push_back(msg); + } + } catch (const std::exception & e) { + // @ngxson : disable otherwise it's bloating the API response + // printf("%s\n", std::string("; messages = ") + messages.dump(2)); + throw std::runtime_error("Failed to parse messages: " + std::string(e.what())); + } + + return msgs; +} + +template <> +json common_chat_msgs_to_json_oaicompat(const std::vector & msgs, bool concat_typed_text) { + json messages = json::array(); + for (const auto & msg : msgs) { + if (!msg.content.empty() && !msg.content_parts.empty()) { + throw std::runtime_error("Cannot specify both content and content_parts"); + } + json jmsg { + {"role", msg.role}, + }; + if (!msg.content.empty()) { + jmsg["content"] = msg.content; + } else if (!msg.content_parts.empty()) { + if (concat_typed_text) { + std::string text; + for (const auto & part : msg.content_parts) { + if (part.type != "text") { + LOG_WRN("Ignoring content part type: %s\n", part.type.c_str()); + continue; + } + if (!text.empty()) { + text += '\n'; + } + text += part.text; + } + jmsg["content"] = text; + } else { + auto & parts = jmsg["content"] = json::array(); + for (const auto & part : msg.content_parts) { + parts.push_back({ + {"type", part.type}, + {"text", part.text}, + }); + } + } + } else { + jmsg["content"] = json(); // null + } + if (!msg.reasoning_content.empty()) { + jmsg["reasoning_content"] = msg.reasoning_content; + } + if (!msg.tool_name.empty()) { + jmsg["name"] = msg.tool_name; + } + if (!msg.tool_call_id.empty()) { + jmsg["tool_call_id"] = msg.tool_call_id; + } + if (!msg.tool_calls.empty()) { + auto & tool_calls = jmsg["tool_calls"] = json::array(); + for (const auto & tool_call : msg.tool_calls) { + json tc { + {"type", "function"}, + {"function", { + {"name", tool_call.name}, + {"arguments", tool_call.arguments}, + }}, + }; + if (!tool_call.id.empty()) { + tc["id"] = tool_call.id; + } + tool_calls.push_back(tc); + } + } + messages.push_back(jmsg); + } + return messages; +} + +template <> +std::vector common_chat_msgs_parse_oaicompat(const std::string & messages) { + return common_chat_msgs_parse_oaicompat(json::parse(messages)); +} + +template <> +std::vector common_chat_tools_parse_oaicompat(const json & tools) { + std::vector result; + + try { + if (!tools.is_null()) { + if (!tools.is_array()) { + throw std::runtime_error("Expected 'tools' to be an array, got " + tools.dump()); + } + for (const auto & tool : tools) { + if (!tool.contains("type")) { + throw std::runtime_error("Missing tool type: " + tool.dump()); + } + const auto & type = tool.at("type"); + if (!type.is_string() || type != "function") { + throw std::runtime_error("Unsupported tool type: " + tool.dump()); + } + if (!tool.contains("function")) { + throw std::runtime_error("Missing tool function: " + tool.dump()); + } + + const auto & function = tool.at("function"); + result.push_back({ + /* .name = */ function.at("name"), + /* .description = */ function.at("description"), + /* .parameters = */ function.at("parameters").dump(), + }); + } + } + } catch (const std::exception & e) { + throw std::runtime_error("Failed to parse tools: " + std::string(e.what()) + "; tools = " + tools.dump(2)); + } + + return result; +} + +template <> +std::vector common_chat_tools_parse_oaicompat(const std::string & tools) { + return common_chat_tools_parse_oaicompat(json::parse(tools)); +} + +template <> +json common_chat_tools_to_json_oaicompat(const std::vector & tools) { + if (tools.empty()) { + return json(); + } + + auto result = json::array(); + for (const auto & tool : tools) { + result.push_back({ + {"type", "function"}, + {"function", { + {"name", tool.name}, + {"description", tool.description}, + {"parameters", json::parse(tool.parameters)}, + }}, + }); + } + return result; +} + +template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) { + json delta = json::object(); + if (!diff.reasoning_content_delta.empty()) { + delta["reasoning_content"] = diff.reasoning_content_delta; + } + if (!diff.content_delta.empty()) { + delta["content"] = diff.content_delta; + } + if (diff.tool_call_index != std::string::npos) { + json tool_call; + tool_call["index"] = diff.tool_call_index; + if (!diff.tool_call_delta.id.empty()) { + tool_call["id"] = diff.tool_call_delta.id; + tool_call["type"] = "function"; + } + json function = json::object(); + if (!diff.tool_call_delta.name.empty()) { + function["name"] = diff.tool_call_delta.name; + } + function["arguments"] = diff.tool_call_delta.arguments; + tool_call["function"] = function; + delta["tool_calls"] = json::array({tool_call}); + } + return delta; +} + +bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) { + if (use_jinja) { + try { + common_chat_msg msg; + msg.role = "user"; + msg.content = "test"; + + auto tmpls = common_chat_templates_init(/* model= */ nullptr, tmpl); + + common_chat_templates_inputs inputs; + inputs.messages = {msg}; + + common_chat_templates_apply(tmpls.get(), inputs); + return true; + } catch (const std::exception & e) { + LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what()); + return false; + } + } + llama_chat_message chat[] = {{"user", "test"}}; + const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0); + return res >= 0; +} + +std::string common_chat_format_single( + const struct common_chat_templates * tmpls, + const std::vector & past_msg, + const common_chat_msg & new_msg, + bool add_ass, + bool use_jinja) { + + common_chat_templates_inputs inputs; + inputs.use_jinja = use_jinja; + + std::string fmt_past_msg; + if (!past_msg.empty()) { + inputs.messages = past_msg; + inputs.add_generation_prompt = false; + fmt_past_msg = common_chat_templates_apply(tmpls, inputs).prompt; + } + std::ostringstream ss; + // if the past_msg ends with a newline, we must preserve it in the formatted version + if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { + ss << "\n"; + }; + // format chat with new_msg + inputs.messages.push_back(new_msg); + inputs.add_generation_prompt = add_ass; + auto fmt_new_msg = common_chat_templates_apply(tmpls, inputs).prompt; + // get the diff part + ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); + return ss.str(); +} + +std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) { + common_chat_templates_inputs inputs; + inputs.use_jinja = use_jinja; + auto add_simple_msg = [&](auto role, auto content) { + common_chat_msg msg; + msg.role = role; + msg.content = content; + inputs.messages.push_back(msg); + }; + add_simple_msg("system", "You are a helpful assistant"); + add_simple_msg("user", "Hello"); + add_simple_msg("assistant", "Hi there"); + add_simple_msg("user", "How are you?"); + return common_chat_templates_apply(tmpls, inputs).prompt; +} + +#define CHATML_TEMPLATE_SRC \ + "{%- for message in messages -%}\n" \ + " {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \ + "{%- endfor -%}\n" \ + "{%- if add_generation_prompt -%}\n" \ + " {{- '<|im_start|>assistant\n' -}}\n" \ + "{%- endif -%}" + +void common_chat_templates_free(struct common_chat_templates * tmpls) { + delete tmpls; +} + +bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls) { + return tmpls->has_explicit_template; +} + +const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant) { + if (variant != nullptr) { + if (strcmp(variant, "tool_use") == 0) { + if (tmpls->template_tool_use) { + return tmpls->template_tool_use->source().c_str(); + } + return nullptr; + } else { + LOG_DBG("%s: unknown template variant: %s\n", __func__, variant); + } + } + return tmpls->template_default->source().c_str(); +} + +common_chat_templates_ptr common_chat_templates_init( + const struct llama_model * model, + const std::string & chat_template_override, + const std::string & bos_token_override, + const std::string & eos_token_override) +{ + std::string default_template_src; + std::string template_tool_use_src; + + bool has_explicit_template = !chat_template_override.empty(); + if (chat_template_override.empty()) { + GGML_ASSERT(model != nullptr); + const auto * str = llama_model_chat_template(model, /* name */ nullptr); + if (str) { + default_template_src = str; + has_explicit_template = true; + } + str = llama_model_chat_template(model, /* name */ "tool_use"); + if (str) { + template_tool_use_src = str; + has_explicit_template = true; + } + } else { + default_template_src = chat_template_override; + } + if (default_template_src.empty() || default_template_src == "chatml") { + if (!template_tool_use_src.empty()) { + default_template_src = template_tool_use_src; + } else { + default_template_src = CHATML_TEMPLATE_SRC; + } + } + std::string token_bos = bos_token_override; + std::string token_eos = eos_token_override; + if (model) { + const auto * vocab = llama_model_get_vocab(model); + const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) { + if (token == LLAMA_TOKEN_NULL) { + if (default_template_src.find(jinja_variable_name) != std::string::npos + || template_tool_use_src.find(jinja_variable_name) != std::string::npos) { + LOG_WRN("common_chat_templates_init: warning: vocab does not have a %s token, jinja template won't work as intended.\n", name); + } + return std::string(); + } + return common_token_to_piece(vocab, token, true); + }; + token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token"); + token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token"); + } + common_chat_templates_ptr tmpls(new common_chat_templates()); + tmpls->has_explicit_template = has_explicit_template; + try { + tmpls->template_default = std::make_unique(default_template_src, token_bos, token_eos); + } catch (const std::exception & e) { + LOG_ERR("%s: failed to parse chat template (defaulting to chatml): %s \n", __func__, e.what()); + tmpls->template_default = std::make_unique(CHATML_TEMPLATE_SRC, token_bos, token_eos); + } + if (!template_tool_use_src.empty()) { + try { + tmpls->template_tool_use = std::make_unique(template_tool_use_src, token_bos, token_eos); + } catch (const std::exception & e) { + LOG_ERR("%s: failed to parse tool use chat template (ignoring it): %s\n", __func__, e.what()); + } + } + return tmpls; +} + +const char * common_chat_format_name(common_chat_format format) { + switch (format) { + case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only"; + case COMMON_CHAT_FORMAT_GENERIC: return "Generic"; + case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo"; + case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x"; + case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools"; + case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1"; + case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2"; + case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2"; + case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1"; + case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro"; + case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B"; + default: + throw std::runtime_error("Unknown chat format"); + } +} + +const char * common_reasoning_format_name(common_reasoning_format format) { + switch (format) { + case COMMON_REASONING_FORMAT_NONE: return "none"; + case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek"; + case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy"; + default: + throw std::runtime_error("Unknown reasoning format"); + } +} + +static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) { + std::string arguments; + if (builder.is_partial()) { + arguments = (json {{"code", code + builder.healing_marker()}}).dump(); + auto idx = arguments.find(builder.healing_marker()); + if (idx != std::string::npos) { + arguments.resize(idx); + } + } else { + arguments = (json {{"code", code}}).dump(); + } + return arguments; +} + +/** + * Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between. + * Aggregates the prefix, suffix and in-between text into the content. + */ +static void parse_json_tool_calls( + common_chat_msg_parser & builder, + const std::optional & block_open, + const std::optional & function_regex_start_only, + const std::optional & function_regex, + const common_regex & close_regex, + const std::optional & block_close, + bool allow_raw_python = false, + const std::function & get_function_name = nullptr) { + + auto parse_tool_calls = [&]() { + size_t from = std::string::npos; + auto first = true; + while (true) { + auto res = function_regex_start_only && first + ? builder.try_consume_regex(*function_regex_start_only) + : function_regex + ? builder.try_find_regex(*function_regex, from) + : std::nullopt; + if (res) { + std::string name; + if (get_function_name) { + name = get_function_name(*res); + } else { + GGML_ASSERT(res->groups.size() == 2); + name = builder.str(res->groups[1]); + } + first = false; + if (name.empty()) { + // get_function_name signalled us that we should skip this match and treat it as content. + from = res->groups[0].begin + 1; + continue; + } + from = std::string::npos; + + auto maybe_raw_python = name == "python" && allow_raw_python; + if (builder.input()[builder.pos()] == '{' || !maybe_raw_python) { + if (auto arguments = builder.try_consume_json_with_dumped_args({{}})) { + if (!builder.add_tool_call(name, "", arguments->value) || arguments->is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + builder.consume_regex(close_regex); + } + continue; + } + if (maybe_raw_python) { + auto arguments = wrap_code_as_arguments(builder, builder.consume_rest()); + if (!builder.add_tool_call(name, "", arguments)) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + return; + } + throw common_chat_msg_partial_exception("incomplete tool call"); + } + break; + } + if (block_close) { + builder.consume_regex(*block_close); + } + builder.consume_spaces(); + builder.add_content(builder.consume_rest()); + }; + if (block_open) { + if (auto res = builder.try_find_regex(*block_open)) { + parse_tool_calls(); + } else { + builder.add_content(builder.consume_rest()); + } + } else { + parse_tool_calls(); + } +} + +static void parse_prefixed_json_tool_call_array(common_chat_msg_parser & builder, const common_regex & prefix, size_t rstrip_prefix = 0) { + static const std::vector> args_paths = {{"arguments"}}; + if (auto res = builder.try_find_regex(prefix)) { + builder.move_back(rstrip_prefix); + auto tool_calls = builder.consume_json_with_dumped_args(args_paths); + if (!builder.add_tool_calls(tool_calls.value) || tool_calls.is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call array"); + } + } else { + builder.add_content(builder.consume_rest()); + } +} + +static void foreach_function(const json & tools, const std::function & fn) { + for (const auto & tool : tools) { + if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) { + LOG_INF("Skipping tool without function: %s", tool.dump(2).c_str()); + continue; + } + fn(tool); + } +} + +static std::string apply( + const common_chat_template & tmpl, + const nlohmann::ordered_json & messages, + const nlohmann::ordered_json & tools, + bool add_generation_prompt, + const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) +{ + minja::chat_template_inputs tmpl_inputs; + tmpl_inputs.messages = messages; + tmpl_inputs.tools = tools; + tmpl_inputs.add_generation_prompt = add_generation_prompt; + tmpl_inputs.extra_context = extra_context; + // TODO: add flag to control date/time, if only for testing purposes. + // tmpl_inputs.now = std::chrono::system_clock::now(); + + minja::chat_template_options tmpl_opts; + // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens + // instead of using `chat_template_options.use_bos_token = false`, since these tokens + // may be needed inside the template / between messages too. + auto result = tmpl.apply(tmpl_inputs, tmpl_opts); + if (string_starts_with(result, tmpl.bos_token())) { + result = result.substr(tmpl.bos_token().size()); + } + if (string_ends_with(result, tmpl.eos_token())) { + result = result.substr(0, result.size() - tmpl.eos_token().size()); + } + return result; +} + +static common_chat_params common_chat_params_init_generic(const common_chat_template & tmpl, const struct templates_params & inputs) { + common_chat_params data; + + auto tool_call_schemas = json::array(); + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + auto tool_schema = json { + {"type", "object"}, + {"properties", { + {"name", { + {"type", "string"}, + {"const", function.at("name")}, + }}, + {"arguments", function.at("parameters")}, + }}, + {"required", json::array({"name", "arguments"})}, + }; + if (function.contains("description")) { + tool_schema["description"] = function.at("description"); + } + if (inputs.parallel_tool_calls) { + tool_schema.at("properties")["id"] = { + {"type", "string"}, + {"minLength", 4}, + }; + tool_schema.at("required").push_back("id"); + } + tool_call_schemas.emplace_back(tool_schema); + }); + const auto tool_call = + inputs.parallel_tool_calls + ? json { + {"type", "object"}, + {"properties", { + {"tool_calls", { + {"type", "array"}, + {"items", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json { + {"anyOf", tool_call_schemas}, + }}, + {"minItems", 1}, + }}, + }}, + {"required", json::array({"tool_calls"})}, + } + : json { + {"type", "object"}, + {"properties", { + {"tool_call", tool_call_schemas.size() == 1 ? tool_call_schemas[0] : json { + {"anyOf", tool_call_schemas}, + }}, + }}, + {"required", json::array({"tool_call"})}, + }; + const auto schema = + inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED + ? json { + {"anyOf", json::array({ + tool_call, + { + {"type", "object"}, + {"properties", { + {"response", inputs.json_schema.is_null() + ? json {{"type", "string"}} + : inputs.json_schema + }, + }}, + {"required", json::array({"response"})}, + }, + })} + } + : tool_call; + + data.grammar_lazy = false; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + builder.add_schema("root", schema); + }); + + auto tweaked_messages = common_chat_template::add_system( + inputs.messages, + "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request"); + + data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + data.format = COMMON_CHAT_FORMAT_GENERIC; + return data; +} +static void common_chat_parse_generic(common_chat_msg_parser & builder) { + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + return; + } + static const std::vector> content_paths = { + {"response"}, + }; + static const std::vector> args_paths = { + {"tool_call", "arguments"}, + {"tool_calls", "arguments"}, + }; + auto data = builder.consume_json_with_dumped_args(args_paths, content_paths); + if (data.value.contains("tool_calls")) { + if (!builder.add_tool_calls(data.value.at("tool_calls")) || data.is_partial) { + throw common_chat_msg_partial_exception("incomplete tool calls"); + } + } else if (data.value.contains("tool_call")) { + if (!builder.add_tool_call(data.value.at("tool_call")) || data.is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + } else if (data.value.contains("response")) { + const auto & response = data.value.at("response"); + builder.add_content(response.is_string() ? response.template get() : response.dump(2)); + if (data.is_partial) { + throw common_chat_msg_partial_exception("incomplete response"); + } + } else { + throw common_chat_msg_partial_exception("Expected 'tool_call', 'tool_calls' or 'response' in JSON"); + } +} + +static common_chat_params common_chat_params_init_mistral_nemo(const common_chat_template & tmpl, const struct templates_params & inputs) { + common_chat_params data; + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + auto schemas = json::array(); + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + schemas.push_back({ + {"type", "object"}, + {"properties", { + // Important note: the model is probably trained to take a JSON stringified arguments value. + // It's hard to constrain that for now (while reusing the JSON schema conversion), so we're just expecting a plain object. + {"name", { + {"type", "string"}, + {"const", function.at("name")}, + }}, + {"arguments", function.at("parameters")}, + {"id", { + {"type", "string"}, + // Nemo's template expects a 9-character alphanumeric ID. + {"pattern", "^[a-zA-Z0-9]{9}$"}, + }}, + }}, + {"required", json::array({"name", "arguments", "id"})}, + }); + }); + auto schema = json { + {"type", "array"}, + {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}}, + {"minItems", 1}, + }; + if (!inputs.parallel_tool_calls) { + schema["maxItems"] = 1; + } + builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema)); + }); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"}); + data.preserved_tokens = { + "[TOOL_CALLS]", + }; + data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO; + return data; +} +static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) { + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + return; + } + + static const common_regex prefix(regex_escape("[TOOL_CALLS]")); + parse_prefixed_json_tool_call_array(builder, prefix); +} + +static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) { + common_chat_params data; + + auto adjusted_messages = json::array(); + for (const auto & msg : inputs.messages) { + auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string(); + auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array(); + if (has_reasoning_content && has_tool_calls) { + auto adjusted_message = msg; + adjusted_message["tool_plan"] = msg.at("reasoning_content"); + adjusted_message.erase("reasoning_content"); + adjusted_messages.push_back(adjusted_message); + } else { + adjusted_messages.push_back(msg); + } + } + data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {}); + data.format = COMMON_CHAT_FORMAT_COMMAND_R7B; + if (string_ends_with(data.prompt, "<|START_THINKING|>")) { + if (!inputs.enable_thinking) { + data.prompt += "<|END_THINKING|>"; + } else { + data.thinking_forced_open = true; + } + } else if (!inputs.enable_thinking && string_ends_with(data.prompt, "<|CHATBOT_TOKEN|>")) { + data.prompt += "<|START_THINKING|><|END_THINKING|>"; + } + + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + auto schemas = json::array(); + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + schemas.push_back({ + {"type", "object"}, + {"properties", { + {"tool_call_id", { + {"type", "string"}, + // Command-R's template expects an integer string. + {"pattern", "^[0-9]{1,10}$"}, + }}, + {"tool_name", { + {"type", "string"}, + {"const", function.at("name")}, + }}, + {"parameters", function.at("parameters")}, + }}, + {"required", json::array({"tool_call_id", "tool_name", "parameters"})}, + }); + }); + auto schema = json { + {"type", "array"}, + {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}}, + {"minItems", 1}, + }; + if (!inputs.parallel_tool_calls) { + schema["maxItems"] = 1; + } + builder.add_rule("root", + std::string(data.thinking_forced_open ? "( \"<|END_THINKING|>\" space )? " : "") + + "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\""); + }); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, + // If thinking_forced_open, then we capture the tag in the grammar, + // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar) + std::string(data.thinking_forced_open ? "[\\s\\S]*?(<\\|END_THINKING\\|>\\s*)" : "(?:<\\|START_THINKING\\|>[\\s\\S]*?<\\|END_THINKING\\|>\\s*)?") + + "(<\\|START_ACTION\\|>)[\\s\\S]*" + }); + data.preserved_tokens = { + "<|START_ACTION|>", + "<|END_ACTION|>", + "<|START_RESPONSE|>", + "<|END_RESPONSE|>", + "<|START_THINKING|>", + "<|END_THINKING|>", + }; + return data; +} + +static void common_chat_parse_command_r7b(common_chat_msg_parser & builder) { + builder.try_parse_reasoning("<|START_THINKING|>", "<|END_THINKING|>"); + + static const common_regex start_action_regex("<\\|START_ACTION\\|>"); + static const common_regex end_action_regex("<\\|END_ACTION\\|>"); + static const common_regex start_response_regex("<\\|START_RESPONSE\\|>"); + static const common_regex end_response_regex("<\\|END_RESPONSE\\|>"); + + if (auto res = builder.try_find_regex(start_action_regex)) { + // If we didn't extract thoughts, prelude includes them. + auto tool_calls = builder.consume_json_with_dumped_args({{"parameters"}}); + for (const auto & tool_call : tool_calls.value) { + std::string name = tool_call.contains("tool_name") ? tool_call.at("tool_name") : ""; + std::string id = tool_call.contains("tool_call_id") ? tool_call.at("tool_call_id") : ""; + std::string arguments = tool_call.contains("parameters") ? tool_call.at("parameters") : ""; + if (!builder.add_tool_call(name, id, arguments) || tool_calls.is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + } + if (tool_calls.is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + builder.consume_regex(end_action_regex); + } else if (auto res = builder.try_find_regex(start_response_regex)) { + if (!builder.try_find_regex(end_response_regex)) { + builder.add_content(builder.consume_rest()); + throw common_chat_msg_partial_exception(end_response_regex.str()); + } + } else { + builder.add_content(builder.consume_rest()); + } +} + +static void expect_tool_parameters(const std::string & name, const json & parameters, const std::vector & expected_properties) { + if (!parameters.is_object() || !parameters.contains("type") || parameters.at("type") != "object" || !parameters.contains("properties") || !parameters.contains("required")) { + throw std::runtime_error("Parameters of tool " + name + " must be an object w/ required properties"); + } + const auto & parameters_properties = parameters.at("properties"); + const auto & parameters_required = parameters.at("required"); + for (const auto & prop : expected_properties) { + if (!parameters_properties.contains(prop)) { + throw std::runtime_error("Parameters of tool " + name + " is missing property: " + prop); // NOLINT + } + if (std::find(parameters_required.begin(), parameters_required.end(), json(prop)) == parameters_required.end()) { + throw std::runtime_error("Parameters of tool " + name + " must have property marked as required: " + prop); // NOLINT + } + } + if (parameters_properties.size() != expected_properties.size()) { + throw std::runtime_error("Parameters of tool " + name + " must only have these properties:" + string_join(expected_properties, ", ")); + } +} + +static common_chat_params common_chat_params_init_llama_3_x(const common_chat_template & tmpl, const struct templates_params & inputs, bool allow_python_tag_builtin_tools) { + auto builtin_tools = json::array(); + common_chat_params data; + if (!inputs.tools.is_null()) { + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + std::vector tool_rules; + + auto handle_builtin_tool = [&](const std::string & name, const json & parameters) { + if (name == "wolfram_alpha" || name == "web_search" || name == "brave_search") { + // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py + // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py + expect_tool_parameters(name, parameters, {"query"}); + } else if (name == "python" || name == "code_interpreter") { + // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py + expect_tool_parameters(name, parameters, {"code"}); + } else { + return false; + } + + std::vector kvs; + for (const auto & [key, value] : parameters.at("properties").items()) { + kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value)); // NOLINT + } + + tool_rules.push_back( + builder.add_rule( + name + "-call", + "\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\"")); + builtin_tools.push_back(name); + + return true; + }; + + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); + + // https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime + if (allow_python_tag_builtin_tools) { + handle_builtin_tool(name, parameters); + } + tool_rules.push_back( + builder.add_rule( + name + "-call", + "\"{\" space " + "( \"\\\"type\\\"\" space \":\" space \"\\\"function\\\"\" space \",\" space )? " + " \"\\\"name\\\"\" space \":\" space \"\\\"" + name + "\\\"\" space \",\" space " + " \"\\\"parameters\\\"\" space \":\" space " + builder.add_schema(name + "-args", parameters) + " " + "\"}\" space")); + }); + // Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name. + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, + "(\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\")[\\s\\S]*", // + name + "\"[\\s\\S]*", + }); + if (!builtin_tools.empty()) { + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"}); + data.preserved_tokens.push_back("<|python_tag|>"); + } + // Allow a few empty lines on top of the usual constrained json schema space rule. + builder.add_rule("root", string_join(tool_rules, " | ")); + data.additional_stops.push_back("<|eom_id|>"); + }); + data.format = allow_python_tag_builtin_tools && !builtin_tools.empty() + ? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS + : COMMON_CHAT_FORMAT_LLAMA_3_X; + } else { + data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + } + data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, { + {"date_string", format_time(inputs.now, "%d %b %Y")}, + {"tools_in_user_message", false}, + {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools}, + }); + return data; +} +static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) { + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + return; + } + + static const common_regex function_regex( + "\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: "); + static const common_regex close_regex("\\}\\s*"); + + static const common_regex function_name_regex("\\s*(\\w+)\\s*\\.\\s*call\\("); + static const common_regex arg_name_regex("\\s*(\\w+)\\s*=\\s*"); + + if (with_builtin_tools) { + static const common_regex builtin_call_regex("<\\|python_tag\\|>"); + if (auto res = builder.try_find_regex(builtin_call_regex)) { + auto fun_res = builder.consume_regex(function_name_regex); + auto function_name = builder.str(fun_res.groups[1]); + + common_healing_marker healing_marker; + json args = json::object(); + while (true) { + if (auto arg_res = builder.try_consume_regex(arg_name_regex)) { + auto arg_name = builder.str(arg_res->groups[1]); + auto partial = builder.consume_json(); + args[arg_name] = partial.json; + healing_marker.marker = partial.healing_marker.marker; + healing_marker.json_dump_marker = partial.healing_marker.json_dump_marker; + builder.consume_spaces(); + if (!builder.try_consume_literal(",")) { + break; + } + } else { + break; + } + } + builder.consume_literal(")"); + builder.consume_spaces(); + + auto arguments = args.dump(); + if (!builder.add_tool_call(function_name, "", arguments)) { + throw common_chat_msg_partial_exception("Incomplete tool call"); + } + return; + } + } + parse_json_tool_calls( + builder, + /* block_open= */ std::nullopt, + /* function_regex_start_only= */ function_regex, + /* function_regex= */ std::nullopt, + close_regex, + std::nullopt); + +} + +static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) { + common_chat_params data; + auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + + // Hacks to fix the official (broken) prompt. + // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead, + // until the official template is fixed. + if (tmpl.source().find("{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}") != std::string::npos) { + // Don't leave the chat dangling after tool results + if (string_ends_with(prompt, "<|tool▁outputs▁end|>")) { + prompt += "<|end▁of▁sentence|>"; + if (inputs.add_generation_prompt) { + prompt += "<|Assistant|>"; + } + } + // Fix up tool call delta example added by Minja + prompt = std::regex_replace( + prompt, + std::regex("(<|tool▁call▁end|>)[\\s\\r\\n]*(<|tool▁outputs▁begin|>|<|User|>)"), + "$1<|tool▁calls▁end|><|end▁of▁sentence|>$2"); + } + data.prompt = prompt; + data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1; + if (string_ends_with(data.prompt, "\n")) { + if (!inputs.enable_thinking) { + data.prompt += ""; + } else { + data.thinking_forced_open = true; + } + } + + if (inputs.tools.is_array() && !inputs.tools.empty()) { + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null(); + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + std::vector tool_rules; + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); + tool_rules.push_back(builder.add_rule(name + "-call", + "( \"<|tool▁call▁begin|>\" )? \"function<|tool▁sep|>" + name + "\\n" + "```json\\n\" " + builder.add_schema(name + "-args", parameters) + " " + "\"```<|tool▁call▁end|>\"")); + }); + // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag, + // so we accept common variants (then it's all constrained) + builder.add_rule("root", + std::string(data.thinking_forced_open ? "( \"\" space )? " : "") + + "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) " + "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " " + "\"<|tool▁calls▁end|>\"" + " space"); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, + // If thinking_forced_open, then we capture the tag in the grammar, + // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar) + std::string(data.thinking_forced_open ? "[\\s\\S]*?(\\s*)" : "(?:[\\s\\S]*?\\s*)?") + + "(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*" + }); + data.preserved_tokens = { + "", + "", + "<|tool▁calls▁begin|>", + "<|tool▁call▁begin|>", + "<|tool▁sep|>", + "<|tool▁call▁end|>", + "<|tool▁calls▁end|", + }; + }); + } + return data; +} +static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) { + builder.try_parse_reasoning("", ""); + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + return; + } + + static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)"); + static const common_regex tool_calls_end("<|tool▁calls▁end|>"); + static const common_regex function_regex("(?:<|tool▁call▁begin|>)?function<|tool▁sep|>([^\n]+)\n```json\n"); + static const common_regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>"); + + parse_json_tool_calls( + builder, + /* block_open= */ tool_calls_begin, + /* function_regex_start_only= */ std::nullopt, + function_regex, + close_regex, + tool_calls_end); +} + +static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) { + LOG_DBG("%s\n", __func__); + common_chat_params data; + data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, { + {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")}, + {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))}, + }); + if (inputs.tools.is_array() && !inputs.tools.empty()) { + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + auto schemas = json::array(); + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + schemas.push_back({ + {"type", "object"}, + {"properties", { + {"name", { + {"type", "string"}, + {"const", function.at("name")}, + }}, + {"arguments", function.at("parameters")}, + }}, + {"required", json::array({"name", "arguments", "id"})}, + }); + }); + auto schema = json { + {"type", "array"}, + {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}}, + {"minItems", 1}, + }; + if (!inputs.parallel_tool_calls) { + schema["maxItems"] = 1; + } + builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema)); + }); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, " functools["}); + data.preserved_tokens = { + " functools[", + }; + data.format = COMMON_CHAT_FORMAT_FIREFUNCTION_V2; + } else { + data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + } + return data; +} +static void common_chat_parse_firefunction_v2(common_chat_msg_parser & builder) { + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + return; + } + static const common_regex prefix(regex_escape(" functools[")); + parse_prefixed_json_tool_call_array(builder, prefix, /* rstrip_prefix= */ 1); +} + +static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl, const struct templates_params & inputs) { + // >>>all\nlet's call functions>>>fn1\n{"arg1": 1...}\n>>>fn2\n{"arg1": 1...}... + // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar + // If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code. + common_chat_params data; + data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2; + if (inputs.tools.is_array() && !inputs.tools.empty()) { + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + std::vector first_tool_rules; + std::vector subsequent_tool_rules; + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); + std::string args_pattern = "[\\s\\S]*"; + auto args_rule = builder.add_schema(name + "-args", parameters); + if (name == "python") { + args_rule = builder.add_rule(name + "-maybe-raw-args", args_rule + " | [^{] .*"); + } else { + args_pattern = "\\{" + args_pattern; + } + auto call_rule = builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule); + first_tool_rules.push_back(call_rule); + if (inputs.parallel_tool_calls) { + subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>\" " + call_rule)); + } + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, + "((?:[\\s\\S]+?>>>)?" + regex_escape(name) + "\n)" + args_pattern, + }); + }); + data.preserved_tokens = { + "<|end_header_id|>", + }; + auto first_rule = first_tool_rules.empty() ? "" : builder.add_rule("first_tool_call", string_join(first_tool_rules, " | ")) + " space"; + if (inputs.parallel_tool_calls) { + auto subsequent_rule = builder.add_rule("subsequent_tool_call", string_join(subsequent_tool_rules, " | ")) + " space"; + builder.add_rule("root", first_rule + " (" + subsequent_rule + ")*"); + } else { + builder.add_rule("root", first_rule); + } + + }); + } + return data; +} +static void common_chat_parse_functionary_v3_2(common_chat_msg_parser & builder) { + static const common_regex function_regex_start_only(R"((\w+\n\{|python\n|all\n))"); + static const common_regex function_regex(R"(>>>(\w+\n\{|python\n|all\n))"); + static const common_regex close_regex(R"(\s*)"); + + parse_json_tool_calls( + builder, + std::nullopt, + function_regex_start_only, + function_regex, + close_regex, + std::nullopt, + /* allow_raw_python= */ true, + /* get_function_name= */ [&](const auto & res) -> std::string { + auto at_start = res.groups[0].begin == 0; + auto name = builder.str(res.groups[1]); + if (!name.empty() && name.back() == '{') { + // Unconsume the opening brace '{' to ensure the JSON parsing goes well. + builder.move_back(1); + } + auto idx = name.find_last_not_of("\n{"); + name = name.substr(0, idx + 1); + if (at_start && name == "all") { + return ""; + } + return name; + }); +} + +static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct templates_params & inputs) { + // https://github.com/MeetKai/functionary/blob/main/tests/prompt_test_v3-llama3.1.txt + common_chat_params data; + + if (!inputs.tools.is_null()) { + std::string python_code_argument_name; + auto has_raw_python = false; + + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + std::vector tool_rules; + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + const auto & parameters = function.at("parameters"); + std::string name = function.at("name"); + if (name == "python" || name == "ipython") { + if (!parameters.contains("type")) { + throw std::runtime_error("Missing type in python tool"); + } + has_raw_python = true; + const auto & type = parameters.at("type"); + if (type == "object") { + auto properties = parameters.at("properties"); + for (auto it = properties.begin(); it != properties.end(); ++it) { + if (it.value().at("type") == "string") { + if (!python_code_argument_name.empty()) { + throw std::runtime_error("Multiple string arguments found in python tool"); + } + python_code_argument_name = it.key(); + } + } + if (python_code_argument_name.empty()) { + throw std::runtime_error("No string argument found in python tool"); + } + } else if (type != "string") { + throw std::runtime_error("Invalid type in python tool: " + type.dump()); + } + } + tool_rules.push_back(builder.add_rule(name + "-call", "\"\" " + builder.add_schema(name + "-args", parameters) + " \"\" space")); + }); + if (has_raw_python) { + tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*")); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"}); + data.preserved_tokens.push_back("<|python_tag|>"); + } + auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space"; + builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "")); + + static const common_regex function_regex(R"()"); + static const common_regex close_regex(R"()"); + + parse_json_tool_calls( + builder, + /* block_open= */ std::nullopt, + /* function_regex_start_only= */ std::nullopt, + function_regex, + close_regex, + std::nullopt); + + if (auto res = builder.try_find_regex(python_tag_regex)) { + auto arguments = wrap_code_as_arguments(builder, builder.consume_rest()); + builder.add_tool_call("python", "", arguments); + return; + } +} + +static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) { + common_chat_params data; + + json additional_context = { + {"enable_thinking", inputs.enable_thinking}, + }; + + data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context); + data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO; + if (string_ends_with(data.prompt, "\n")) { + if (!inputs.enable_thinking) { + data.prompt += ""; + } else { + data.thinking_forced_open = true; + } + } + + if (!inputs.tools.is_null()) { + // (content)?({"name": "foo", "arguments": {"a": 1}})* + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + std::vector tool_rules; + std::vector tool_call_alts; + std::vector escaped_names; + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); + tool_rules.push_back(builder.add_schema(name + "-call", { + {"type", "object"}, + {"properties", json { + {"name", json {{"const", name}}}, + {"arguments", parameters}, + }}, + {"required", json::array({"name", "arguments"})}, + })); + tool_call_alts.push_back(builder.add_rule( + name + "-function-tag", + "\"\" space " + + builder.add_schema(name + "-args", parameters) + " " + "\"\" space")); + + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + "", + }); + auto escaped_name = regex_escape(name); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, + " alt_tags { + any_tool_call, + "\"\" space " + any_tool_call + " \"\"", + // The rest is just to accommodate common "good bad" outputs. + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + }; + auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space"); + tool_call_alts.push_back(wrappable_tool_call); + tool_call_alts.push_back( + "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space "); + auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | ")); + builder.add_rule("root", + std::string(data.thinking_forced_open ? "( \"\" space )? " : "") + + (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call)); + // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives) + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, + // If thinking_forced_open, then we capture the tag in the grammar, + // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar) + std::string(data.thinking_forced_open ? "[\\s\\S]*?(\\s*)" : "(?:[\\s\\S]*?\\s*)?") + ( + "(\\s*" + "(?:" + "||||)?" + "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\"" + ")" + ")[\\s\\S]*" + ), + }); + data.preserved_tokens = { + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "```", + "```json", + "```xml", + }; + }); + } + + return data; +} +static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) { + builder.try_parse_reasoning("", ""); + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + return; + } + + static const common_regex open_regex( + "(?:" + "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start) + "(" // match 2 (open_tag) + "" + "|" + "|" + "|" + "|" + "|" + "|" + "|" + ")?" + "(\\s*\\{\\s*\"name\")" // match 3 (named tool call) + ")" + "|]+)>" // match 4 (function name) + "|" // match 5 (function name again) + ); + + if (auto res = builder.try_find_regex(open_regex)) { + const auto & block_start = res->groups[1]; + std::string block_end = block_start.empty() ? "" : "```"; + + const auto & open_tag = res->groups[2]; + std::string close_tag; + + if (!res->groups[3].empty()) { + builder.move_to(res->groups[3].begin); + close_tag = open_tag.empty() ? "" : "value) || tool_call->is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + builder.consume_spaces(); + builder.consume_literal(close_tag); + builder.consume_spaces(); + if (!block_end.empty()) { + builder.consume_literal(block_end); + builder.consume_spaces(); + } + builder.add_content(builder.consume_rest()); + } else { + throw common_chat_msg_partial_exception("failed to parse tool call"); + } + } else { + auto function_name = builder.str(res->groups[4]); + if (function_name.empty()) { + function_name = builder.str(res->groups[5]); + } + GGML_ASSERT(!function_name.empty()); + + close_tag = ""; + + if (auto arguments = builder.try_consume_json_with_dumped_args({{}})) { + if (!builder.add_tool_call(function_name, "", arguments->value) || arguments->is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + builder.consume_spaces(); + builder.consume_literal(close_tag); + builder.consume_spaces(); + if (!block_end.empty()) { + builder.consume_literal(block_end); + builder.consume_spaces(); + } + } + builder.add_content(builder.consume_rest()); + } + } else { + builder.add_content(builder.consume_rest()); + } +} + +static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) { + common_chat_params data; + data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); + data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + data.grammar_lazy = false; + if (!inputs.json_schema.is_null()) { + if (!inputs.grammar.empty()) { + throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both"); + } + data.grammar = json_schema_to_grammar(inputs.json_schema); + } else { + data.grammar = inputs.grammar; + } + return data; +} + +static common_chat_params common_chat_templates_apply_jinja( + const struct common_chat_templates * tmpls, + const struct common_chat_templates_inputs & inputs) +{ + templates_params params; + params.tools = common_chat_tools_to_json_oaicompat(inputs.tools); + const auto & tmpl = params.tools.is_array() && tmpls->template_tool_use + ? *tmpls->template_tool_use + : *tmpls->template_default; + const auto & src = tmpl.source(); + const auto & caps = tmpl.original_caps(); + params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content); + params.add_generation_prompt = inputs.add_generation_prompt; + params.tool_choice = inputs.tool_choice; + params.enable_thinking = inputs.enable_thinking; + params.grammar = inputs.grammar; + params.now = inputs.now; + if (!inputs.json_schema.empty()) { + params.json_schema = json::parse(inputs.json_schema); + } + + if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) { + LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n"); + params.parallel_tool_calls = false; + } else { + params.parallel_tool_calls = inputs.parallel_tool_calls; + } + + if (params.tools.is_array()) { + if (params.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && !params.grammar.empty()) { + throw std::runtime_error("Cannot specify grammar with tools"); + } + if (caps.supports_tool_calls && !caps.supports_tools) { + LOG_WRN("Template supports tool calls but does not natively describe tools. The fallback behaviour used may produce bad results, inspect prompt w/ --verbose & consider overriding the template.\n"); + } + } + + // DeepSeek R1: use handler in all cases except json schema (thinking / tools). + if (src.find("<|tool▁calls▁begin|>") != std::string::npos && params.json_schema.is_null()) { + return common_chat_params_init_deepseek_r1(tmpl, params); + } + + // Command R7B: : use handler in all cases except json schema (thinking / tools). + if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos && params.json_schema.is_null()) { + return common_chat_params_init_command_r7b(tmpl, params); + } + + // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools) + if (src.find("") != std::string::npos && params.json_schema.is_null()) { + return common_chat_params_init_hermes_2_pro(tmpl, params); + } + + // Use generic handler when mixing tools + JSON schema. + // TODO: support that mix in handlers below. + if ((params.tools.is_array() && params.json_schema.is_object())) { + return common_chat_params_init_generic(tmpl, params); + } + + // Functionary prepends "all\n" to plain content outputs, so we use its handler in all cases. + if (src.find(">>>all") != std::string::npos) { + return common_chat_params_init_functionary_v3_2(tmpl, params); + } + + // Firefunction v2 requires datetime and functions in the context even w/o tools, so we also use its handler in all cases. + if (src.find(" functools[") != std::string::npos) { + return common_chat_params_init_firefunction_v2(tmpl, params); + } + + // Functionary v3.1 (w/ tools) + if (src.find("<|start_header_id|>") != std::string::npos + && src.find("ipython<|end_header_id|>") != std::string::npos) { + auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos; + return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools); + } + + // Plain handler (no tools) + if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) { + return common_chat_params_init_without_tools(tmpl, params); + } + + // Mistral Nemo (w/ tools) + if (src.find("[TOOL_CALLS]") != std::string::npos) { + return common_chat_params_init_mistral_nemo(tmpl, params); + } + + // Generic fallback + return common_chat_params_init_generic(tmpl, params); +} + +// Legacy template route (adhoc C++ implementation of known templates), forward to llama_chat_apply_template. +static common_chat_params common_chat_templates_apply_legacy( + const struct common_chat_templates * tmpls, + const struct common_chat_templates_inputs & inputs) +{ + int alloc_size = 0; + std::vector chat; + std::vector contents; + for (const auto & msg : inputs.messages) { + auto content = msg.content; + for (const auto & part : msg.content_parts) { + if (part.type != "text") { + LOG_WRN("Ignoring non-text content part: %s\n", part.type.c_str()); + continue; + } + if (!content.empty()) { + content += "\n";; + } + content += part.text; + } + contents.emplace_back(std::move(content)); + } + for (size_t i = 0; i < contents.size(); ++i) { + const auto & msg = inputs.messages[i]; + const auto & content = contents[i]; + chat.push_back({msg.role.c_str(), content.c_str()}); + alloc_size += (msg.role.size() + content.size()) * 1.25; + } + + std::vector buf(alloc_size); + + // run the first time to get the total output length + const auto & src = tmpls->template_default->source(); + int32_t res = llama_chat_apply_template(src.c_str(), chat.data(), chat.size(), inputs.add_generation_prompt, buf.data(), buf.size()); + + // error: chat template is not supported + if (res < 0) { + // if the custom "tmpl" is not supported, we throw an error + // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template() + throw std::runtime_error("this custom template is not supported, try using --jinja"); + } + + // if it turns out that our buffer is too small, we resize it + if ((size_t) res > buf.size()) { + buf.resize(res); + res = llama_chat_apply_template(src.c_str(), chat.data(), chat.size(), inputs.add_generation_prompt, buf.data(), buf.size()); + } + + common_chat_params params; + params.prompt = std::string(buf.data(), res); + if (!inputs.json_schema.empty()) { + params.grammar = json_schema_to_grammar(json::parse(inputs.json_schema)); + } else { + params.grammar = inputs.grammar; + } + return params; +} + +common_chat_params common_chat_templates_apply( + const struct common_chat_templates * tmpls, + const struct common_chat_templates_inputs & inputs) +{ + GGML_ASSERT(tmpls != nullptr); + return inputs.use_jinja + ? common_chat_templates_apply_jinja(tmpls, inputs) + : common_chat_templates_apply_legacy(tmpls, inputs); +} + +static void common_chat_parse_content_only(common_chat_msg_parser & builder) { + builder.add_content(builder.consume_rest()); +} + +static void common_chat_parse(common_chat_msg_parser & builder) { + LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(builder.syntax().format), builder.input().c_str()); + + switch (builder.syntax().format) { + case COMMON_CHAT_FORMAT_CONTENT_ONLY: + common_chat_parse_content_only(builder); + break; + case COMMON_CHAT_FORMAT_GENERIC: + common_chat_parse_generic(builder); + break; + case COMMON_CHAT_FORMAT_MISTRAL_NEMO: + common_chat_parse_mistral_nemo(builder); + break; + case COMMON_CHAT_FORMAT_LLAMA_3_X: + common_chat_parse_llama_3_1(builder); + break; + case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: + common_chat_parse_llama_3_1(builder, /* with_builtin_tools= */ true); + break; + case COMMON_CHAT_FORMAT_DEEPSEEK_R1: + common_chat_parse_deepseek_r1(builder); + break; + case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: + common_chat_parse_functionary_v3_2(builder); + break; + case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: + common_chat_parse_functionary_v3_1_llama_3_1(builder); + break; + case COMMON_CHAT_FORMAT_HERMES_2_PRO: + common_chat_parse_hermes_2_pro(builder); + break; + case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: + common_chat_parse_firefunction_v2(builder); + break; + case COMMON_CHAT_FORMAT_COMMAND_R7B: + common_chat_parse_command_r7b(builder); + break; + default: + throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format)); + } + builder.finish(); +} + +common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) { + common_chat_msg_parser builder(input, is_partial, syntax); + try { + common_chat_parse(builder); + } catch (const common_chat_msg_partial_exception & ex) { + LOG_DBG("Partial parse: %s\n", ex.what()); + if (!is_partial) { + builder.clear_tools(); + builder.move_to(0); + common_chat_parse_content_only(builder); + } + } + auto msg = builder.result(); + LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str()); + return msg; +} diff --git a/llama/llama.cpp/common/chat.h b/llama/llama.cpp/common/chat.h new file mode 100644 index 000000000..9f59e6b08 --- /dev/null +++ b/llama/llama.cpp/common/chat.h @@ -0,0 +1,202 @@ +// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers. + +#pragma once + +#include "common.h" +#include +#include +#include +#include + +struct common_chat_templates; + +struct common_chat_tool_call { + std::string name; + std::string arguments; + std::string id; + + bool operator==(const common_chat_tool_call & other) const { + return name == other.name && arguments == other.arguments && id == other.id; + } +}; + +struct common_chat_msg_content_part { + std::string type; + std::string text; + + bool operator==(const common_chat_msg_content_part & other) const { + return type == other.type && text == other.text; + } +}; + +struct common_chat_msg { + std::string role; + std::string content; + std::vector content_parts = {}; + std::vector tool_calls = {}; + std::string reasoning_content; + std::string tool_name; + std::string tool_call_id; + + template T to_json_oaicompat() const; + + bool empty() const { + return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty(); + } + void ensure_tool_call_ids_set(std::vector & ids_cache, const std::function & gen_tool_call_id) { + for (auto i = 0u; i < tool_calls.size(); i++) { + if (ids_cache.size() <= i) { + auto id = tool_calls[i].id; + if (id.empty()) { + id = gen_tool_call_id(); + } + ids_cache.push_back(id); + } + tool_calls[i].id = ids_cache[i]; + } + } + bool operator==(const common_chat_msg & other) const { + return role == other.role + && content == other.content + && content_parts == other.content_parts + && tool_calls == other.tool_calls + && reasoning_content == other.reasoning_content + && tool_name == other.tool_name + && tool_call_id == other.tool_call_id; + } + bool operator!=(const common_chat_msg & other) const { + return !(*this == other); + } +}; + +struct common_chat_msg_diff { + std::string reasoning_content_delta; + std::string content_delta; + size_t tool_call_index = std::string::npos; + common_chat_tool_call tool_call_delta; + + static std::vector compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg); + + bool operator==(const common_chat_msg_diff & other) const { + return content_delta == other.content_delta + && tool_call_index == other.tool_call_index + && tool_call_delta == other.tool_call_delta; + } +}; + +struct common_chat_tool { + std::string name; + std::string description; + std::string parameters; +}; + +enum common_chat_tool_choice { + COMMON_CHAT_TOOL_CHOICE_AUTO, + COMMON_CHAT_TOOL_CHOICE_REQUIRED, + COMMON_CHAT_TOOL_CHOICE_NONE, +}; + +enum common_chat_format { + COMMON_CHAT_FORMAT_CONTENT_ONLY, + COMMON_CHAT_FORMAT_GENERIC, + COMMON_CHAT_FORMAT_MISTRAL_NEMO, + COMMON_CHAT_FORMAT_LLAMA_3_X, + COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS, + COMMON_CHAT_FORMAT_DEEPSEEK_R1, + COMMON_CHAT_FORMAT_FIREFUNCTION_V2, + COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, + COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1, + COMMON_CHAT_FORMAT_HERMES_2_PRO, + COMMON_CHAT_FORMAT_COMMAND_R7B, + + COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats +}; + +struct common_chat_templates_inputs { + std::vector messages; + std::string grammar; + std::string json_schema; + bool add_generation_prompt = true; + bool use_jinja = true; + // Parameters below only supported when use_jinja is true + std::vector tools; + common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO; + bool parallel_tool_calls = false; + common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; + bool enable_thinking = true; + std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); +}; + +struct common_chat_params { + common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + std::string prompt; + std::string grammar; + bool grammar_lazy = false; + bool thinking_forced_open = false; + std::vector grammar_triggers; + std::vector preserved_tokens; + std::vector additional_stops; +}; + +struct common_chat_syntax { + common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; + // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode) + bool reasoning_in_content = false; + bool thinking_forced_open = false; + bool parse_tool_calls = true; +}; + +// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid +bool common_chat_verify_template(const std::string & tmpl, bool use_jinja); + +void common_chat_templates_free(struct common_chat_templates * tmpls); + +struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } }; + +typedef std::unique_ptr common_chat_templates_ptr; + +common_chat_templates_ptr common_chat_templates_init( + const struct llama_model * model, + const std::string & chat_template_override, + const std::string & bos_token_override = "", + const std::string & eos_token_override = ""); + +bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls); +const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr); + + +struct common_chat_params common_chat_templates_apply( + const struct common_chat_templates * tmpls, + const struct common_chat_templates_inputs & inputs); + +// Format single message, while taking into account the position of that message in chat history +std::string common_chat_format_single( + const struct common_chat_templates * tmpls, + const std::vector & past_msg, + const common_chat_msg & new_msg, + bool add_ass, + bool use_jinja); + +// Returns an example of formatted chat +std::string common_chat_format_example( + const struct common_chat_templates * tmpls, + bool use_jinja); + +const char* common_chat_format_name(common_chat_format format); +const char* common_reasoning_format_name(common_reasoning_format format); +common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax); + +common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice); + +// Parses a JSON array of messages in OpenAI's chat completion API format. +// T can be std::string containing JSON or nlohmann::ordered_json +template std::vector common_chat_msgs_parse_oaicompat(const T & messages); +template T common_chat_msgs_to_json_oaicompat(const std::vector & msgs, bool concat_typed_text = false); + +// Parses a JSON array of tools in OpenAI's chat completion tool call API format. +// T can be std::string containing JSON or nlohmann::ordered_json +template std::vector common_chat_tools_parse_oaicompat(const T & tools); +template T common_chat_tools_to_json_oaicompat(const std::vector & tools); + +template T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff); diff --git a/llama/llama.cpp/common/console.cpp b/llama/llama.cpp/common/console.cpp new file mode 100644 index 000000000..078a8d678 --- /dev/null +++ b/llama/llama.cpp/common/console.cpp @@ -0,0 +1,504 @@ +#include "console.h" +#include +#include + +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#include +#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING +#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004 +#endif +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_YELLOW "\x1b[33m" +#define ANSI_COLOR_BLUE "\x1b[34m" +#define ANSI_COLOR_MAGENTA "\x1b[35m" +#define ANSI_COLOR_CYAN "\x1b[36m" +#define ANSI_COLOR_RESET "\x1b[0m" +#define ANSI_BOLD "\x1b[1m" + +namespace console { + + // + // Console state + // + + static bool advanced_display = false; + static bool simple_io = true; + static display_t current_display = reset; + + static FILE* out = stdout; + +#if defined (_WIN32) + static void* hConsole; +#else + static FILE* tty = nullptr; + static termios initial_state; +#endif + + // + // Init and cleanup + // + + void init(bool use_simple_io, bool use_advanced_display) { + advanced_display = use_advanced_display; + simple_io = use_simple_io; +#if defined(_WIN32) + // Windows-specific console initialization + DWORD dwMode = 0; + hConsole = GetStdHandle(STD_OUTPUT_HANDLE); + if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) { + hConsole = GetStdHandle(STD_ERROR_HANDLE); + if (hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(hConsole, &dwMode))) { + hConsole = nullptr; + simple_io = true; + } + } + if (hConsole) { + // Check conditions combined to reduce nesting + if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) && + !SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) { + advanced_display = false; + } + // Set console output codepage to UTF8 + SetConsoleOutputCP(CP_UTF8); + } + HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE); + if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) { + // Set console input codepage to UTF16 + _setmode(_fileno(stdin), _O_WTEXT); + + // Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT) + if (simple_io) { + dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT; + } else { + dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT); + } + if (!SetConsoleMode(hConIn, dwMode)) { + simple_io = true; + } + } + if (simple_io) { + _setmode(_fileno(stdin), _O_U8TEXT); + } +#else + // POSIX-specific console initialization + if (!simple_io) { + struct termios new_termios; + tcgetattr(STDIN_FILENO, &initial_state); + new_termios = initial_state; + new_termios.c_lflag &= ~(ICANON | ECHO); + new_termios.c_cc[VMIN] = 1; + new_termios.c_cc[VTIME] = 0; + tcsetattr(STDIN_FILENO, TCSANOW, &new_termios); + + tty = fopen("/dev/tty", "w+"); + if (tty != nullptr) { + out = tty; + } + } + + setlocale(LC_ALL, ""); +#endif + } + + void cleanup() { + // Reset console display + set_display(reset); + +#if !defined(_WIN32) + // Restore settings on POSIX systems + if (!simple_io) { + if (tty != nullptr) { + out = stdout; + fclose(tty); + tty = nullptr; + } + tcsetattr(STDIN_FILENO, TCSANOW, &initial_state); + } +#endif + } + + // + // Display and IO + // + + // Keep track of current display and only emit ANSI code if it changes + void set_display(display_t display) { + if (advanced_display && current_display != display) { + fflush(stdout); + switch(display) { + case reset: + fprintf(out, ANSI_COLOR_RESET); + break; + case prompt: + fprintf(out, ANSI_COLOR_YELLOW); + break; + case user_input: + fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN); + break; + case error: + fprintf(out, ANSI_BOLD ANSI_COLOR_RED); + } + current_display = display; + fflush(out); + } + } + + static char32_t getchar32() { +#if defined(_WIN32) + HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE); + wchar_t high_surrogate = 0; + + while (true) { + INPUT_RECORD record; + DWORD count; + if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) { + return WEOF; + } + + if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) { + wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar; + if (wc == 0) { + continue; + } + + if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate + high_surrogate = wc; + continue; + } + if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate + if (high_surrogate != 0) { // Check if we have a high surrogate + return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000; + } + } + + high_surrogate = 0; // Reset the high surrogate + return static_cast(wc); + } + } +#else + wchar_t wc = getwchar(); + if (static_cast(wc) == WEOF) { + return WEOF; + } + +#if WCHAR_MAX == 0xFFFF + if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate + wchar_t low_surrogate = getwchar(); + if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate + return (static_cast(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000; + } + } + if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair + return 0xFFFD; // Return the replacement character U+FFFD + } +#endif + + return static_cast(wc); +#endif + } + + static void pop_cursor() { +#if defined(_WIN32) + if (hConsole != NULL) { + CONSOLE_SCREEN_BUFFER_INFO bufferInfo; + GetConsoleScreenBufferInfo(hConsole, &bufferInfo); + + COORD newCursorPosition = bufferInfo.dwCursorPosition; + if (newCursorPosition.X == 0) { + newCursorPosition.X = bufferInfo.dwSize.X - 1; + newCursorPosition.Y -= 1; + } else { + newCursorPosition.X -= 1; + } + + SetConsoleCursorPosition(hConsole, newCursorPosition); + return; + } +#endif + putc('\b', out); + } + + static int estimateWidth(char32_t codepoint) { +#if defined(_WIN32) + (void)codepoint; + return 1; +#else + return wcwidth(codepoint); +#endif + } + + static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) { +#if defined(_WIN32) + CONSOLE_SCREEN_BUFFER_INFO bufferInfo; + if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) { + // go with the default + return expectedWidth; + } + COORD initialPosition = bufferInfo.dwCursorPosition; + DWORD nNumberOfChars = length; + WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL); + + CONSOLE_SCREEN_BUFFER_INFO newBufferInfo; + GetConsoleScreenBufferInfo(hConsole, &newBufferInfo); + + // Figure out our real position if we're in the last column + if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) { + DWORD nNumberOfChars; + WriteConsole(hConsole, &" \b", 2, &nNumberOfChars, NULL); + GetConsoleScreenBufferInfo(hConsole, &newBufferInfo); + } + + int width = newBufferInfo.dwCursorPosition.X - initialPosition.X; + if (width < 0) { + width += newBufferInfo.dwSize.X; + } + return width; +#else + // We can trust expectedWidth if we've got one + if (expectedWidth >= 0 || tty == nullptr) { + fwrite(utf8_codepoint, length, 1, out); + return expectedWidth; + } + + fputs("\033[6n", tty); // Query cursor position + int x1; + int y1; + int x2; + int y2; + int results = 0; + results = fscanf(tty, "\033[%d;%dR", &y1, &x1); + + fwrite(utf8_codepoint, length, 1, tty); + + fputs("\033[6n", tty); // Query cursor position + results += fscanf(tty, "\033[%d;%dR", &y2, &x2); + + if (results != 4) { + return expectedWidth; + } + + int width = x2 - x1; + if (width < 0) { + // Calculate the width considering text wrapping + struct winsize w; + ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); + width += w.ws_col; + } + return width; +#endif + } + + static void replace_last(char ch) { +#if defined(_WIN32) + pop_cursor(); + put_codepoint(&ch, 1, 1); +#else + fprintf(out, "\b%c", ch); +#endif + } + + static void append_utf8(char32_t ch, std::string & out) { + if (ch <= 0x7F) { + out.push_back(static_cast(ch)); + } else if (ch <= 0x7FF) { + out.push_back(static_cast(0xC0 | ((ch >> 6) & 0x1F))); + out.push_back(static_cast(0x80 | (ch & 0x3F))); + } else if (ch <= 0xFFFF) { + out.push_back(static_cast(0xE0 | ((ch >> 12) & 0x0F))); + out.push_back(static_cast(0x80 | ((ch >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (ch & 0x3F))); + } else if (ch <= 0x10FFFF) { + out.push_back(static_cast(0xF0 | ((ch >> 18) & 0x07))); + out.push_back(static_cast(0x80 | ((ch >> 12) & 0x3F))); + out.push_back(static_cast(0x80 | ((ch >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (ch & 0x3F))); + } else { + // Invalid Unicode code point + } + } + + // Helper function to remove the last UTF-8 character from a string + static void pop_back_utf8_char(std::string & line) { + if (line.empty()) { + return; + } + + size_t pos = line.length() - 1; + + // Find the start of the last UTF-8 character (checking up to 4 bytes back) + for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) { + if ((line[pos] & 0xC0) != 0x80) { + break; // Found the start of the character + } + } + line.erase(pos); + } + + static bool readline_advanced(std::string & line, bool multiline_input) { + if (out != stdout) { + fflush(stdout); + } + + line.clear(); + std::vector widths; + bool is_special_char = false; + bool end_of_stream = false; + + char32_t input_char; + while (true) { + fflush(out); // Ensure all output is displayed before waiting for input + input_char = getchar32(); + + if (input_char == '\r' || input_char == '\n') { + break; + } + + if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) { + end_of_stream = true; + break; + } + + if (is_special_char) { + set_display(user_input); + replace_last(line.back()); + is_special_char = false; + } + + if (input_char == '\033') { // Escape sequence + char32_t code = getchar32(); + if (code == '[' || code == 0x1B) { + // Discard the rest of the escape sequence + while ((code = getchar32()) != (char32_t) WEOF) { + if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') { + break; + } + } + } + } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace + if (!widths.empty()) { + int count; + do { + count = widths.back(); + widths.pop_back(); + // Move cursor back, print space, and move cursor back again + for (int i = 0; i < count; i++) { + replace_last(' '); + pop_cursor(); + } + pop_back_utf8_char(line); + } while (count == 0 && !widths.empty()); + } + } else { + int offset = line.length(); + append_utf8(input_char, line); + int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char)); + if (width < 0) { + width = 0; + } + widths.push_back(width); + } + + if (!line.empty() && (line.back() == '\\' || line.back() == '/')) { + set_display(prompt); + replace_last(line.back()); + is_special_char = true; + } + } + + bool has_more = multiline_input; + if (is_special_char) { + replace_last(' '); + pop_cursor(); + + char last = line.back(); + line.pop_back(); + if (last == '\\') { + line += '\n'; + fputc('\n', out); + has_more = !has_more; + } else { + // llama will just eat the single space, it won't act as a space + if (line.length() == 1 && line.back() == ' ') { + line.clear(); + pop_cursor(); + } + has_more = false; + } + } else { + if (end_of_stream) { + has_more = false; + } else { + line += '\n'; + fputc('\n', out); + } + } + + fflush(out); + return has_more; + } + + static bool readline_simple(std::string & line, bool multiline_input) { +#if defined(_WIN32) + std::wstring wline; + if (!std::getline(std::wcin, wline)) { + // Input stream is bad or EOF received + line.clear(); + GenerateConsoleCtrlEvent(CTRL_C_EVENT, 0); + return false; + } + + int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), NULL, 0, NULL, NULL); + line.resize(size_needed); + WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), &line[0], size_needed, NULL, NULL); +#else + if (!std::getline(std::cin, line)) { + // Input stream is bad or EOF received + line.clear(); + return false; + } +#endif + if (!line.empty()) { + char last = line.back(); + if (last == '/') { // Always return control on '/' symbol + line.pop_back(); + return false; + } + if (last == '\\') { // '\\' changes the default action + line.pop_back(); + multiline_input = !multiline_input; + } + } + line += '\n'; + + // By default, continue input if multiline_input is set + return multiline_input; + } + + bool readline(std::string & line, bool multiline_input) { + set_display(user_input); + + if (simple_io) { + return readline_simple(line, multiline_input); + } + return readline_advanced(line, multiline_input); + } + +} diff --git a/llama/llama.cpp/common/console.h b/llama/llama.cpp/common/console.h new file mode 100644 index 000000000..ec175269b --- /dev/null +++ b/llama/llama.cpp/common/console.h @@ -0,0 +1,19 @@ +// Console functions + +#pragma once + +#include + +namespace console { + enum display_t { + reset = 0, + prompt, + user_input, + error + }; + + void init(bool use_simple_io, bool use_advanced_display); + void cleanup(); + void set_display(display_t display); + bool readline(std::string & line, bool multiline_input); +} diff --git a/llama/llama.cpp/common/json-partial.cpp b/llama/llama.cpp/common/json-partial.cpp new file mode 100644 index 000000000..d9d916998 --- /dev/null +++ b/llama/llama.cpp/common/json-partial.cpp @@ -0,0 +1,256 @@ +#include "json-partial.h" + +#include "log.h" + +#include + +#include + +using json = nlohmann::ordered_json; + +enum common_json_stack_element_type { + COMMON_JSON_STACK_ELEMENT_OBJECT, + COMMON_JSON_STACK_ELEMENT_KEY, + COMMON_JSON_STACK_ELEMENT_ARRAY, +}; + +struct common_json_stack_element { + common_json_stack_element_type type; + std::string key; +}; + +bool common_json_parse( + const std::string & input, + const std::string & healing_marker, + common_json & out) +{ + std::string::const_iterator it = input.begin(); + const auto end = input.end(); + return common_json_parse(it, end, healing_marker, out); +} + +bool common_json_parse( + std::string::const_iterator & it, + const std::string::const_iterator & end, + const std::string & healing_marker, + common_json & out) +{ + // // https://json.nlohmann.me/features/parsing/sax_interface/ + struct json_error_locator : public nlohmann::json_sax { + std::size_t position; + bool found_error; + std::string last_token; + std::string exception_message; + std::vector stack; + + json_error_locator() : position(0), found_error(false) {} + + bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT + this->position = position - 1; + this->found_error = true; + this->last_token = last_token; + this->exception_message = ex.what(); + return false; + } + void close_value() { + if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) { + stack.pop_back(); + } + } + bool null() override { // NOLINT + close_value(); + return true; + } + bool boolean(bool) override { // NOLINT + close_value(); + return true; + } + bool number_integer(number_integer_t) override { // NOLINT + close_value(); + return true; + } + bool number_unsigned(number_unsigned_t) override { // NOLINT + close_value(); + return true; + } + bool number_float(number_float_t, const string_t &) override { // NOLINT + close_value(); + return true; + } + bool string(string_t &) override { // NOLINT + close_value(); + return true; + } + bool binary(binary_t &) override { // NOLINT + close_value(); + return true; + } + bool start_object(std::size_t) override { // NOLINT + stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""}); + return true; + } + bool end_object() override { + GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT); + stack.pop_back(); + close_value(); + return true; + } + bool key(string_t & key) override { // NOLINT + stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key}); + return true; + } + bool start_array(std::size_t) override { // NOLINT + stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""}); + return true; + } + bool end_array() override { + GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY); + stack.pop_back(); + close_value(); + return true; + } + }; + json_error_locator err_loc; + auto start = it; + json::sax_parse(it, end, &err_loc); + + if (err_loc.found_error) { + it = start; + auto temptative_end = it + err_loc.position; + // LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str()); + + auto input = std::string(it, temptative_end); + try { + out.json = json::parse(input); + // out.json = json::parse(it, temptative_end); + it = temptative_end; + return true; + } catch (const std::exception & ex) { + // No, needs healing. + LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str()); + } + auto can_parse = [](const std::string & str) { + try { + auto _ = json::parse(str); // NOLINT + return true; + } catch (const std::exception &) { + return false; + } + }; + if (!healing_marker.empty() && !err_loc.stack.empty()) { + std::string str(it, temptative_end); + auto last_non_sp_pos = str.find_last_not_of(" \n\r\t"); + if (last_non_sp_pos == std::string::npos) { + throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location"); + } + auto last_non_sp_char = str[last_non_sp_pos]; + // Used to detect stops on a number, which may not be complete. + auto was_maybe_number = [&]() { + if (!str.empty() && std::isspace(str.back())) { + return false; + } + return std::isdigit(last_non_sp_char) || + last_non_sp_char == '.' || + last_non_sp_char == 'e' || + last_non_sp_char == 'E' || + last_non_sp_char == '-'; + }; + + std::string closing; + for (size_t i = err_loc.stack.size(); i > 0; i--) { + auto & el = err_loc.stack[i - 1]; + if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) { + closing += "}"; + } else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) { + closing += "]"; + } else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) { + throw std::runtime_error("Unexpected stack element type"); + } + } + + const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$"; + + if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) { + // We're inside an object value + if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) { + // Was about to create an object value + str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; + } else if (can_parse(str + ": 1" + closing)) { + str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing; + } else if (last_non_sp_char == '{' && can_parse(str + closing)) { + // Was about to create an object + str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing; + } else if (can_parse(str + "\"" + closing)) { + // Was inside an object value string + str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing; + } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) { + // Was inside an object value string after an escape + str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing; + } else { + // find last : + auto last_pos = str.find_last_of(':'); + if (last_pos == std::string::npos) { + throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location"); + } + // Cutting back to opening : for object value + str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; + } + } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) { + if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) { + // Was about to create an array value + str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; + } else if (can_parse(str + "\"" + closing)) { + // Was inside an array value string + str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing; + } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) { + // Was inside an array value string after an escape + str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing; + } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) { + // Had just finished a value + str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing; + } else { + auto last_pos = str.find_last_of("[,"); + if (last_pos == std::string::npos) { + throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location"); + } + // Cutting back to last [ or , for array value + str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; + } + } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) { + if ((last_non_sp_char == '{' && can_parse(str + closing)) || + (last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) { + // Was about to create an object key+value + str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing; + } else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) { + // Was about to create an object key+value + str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing; + } else if (can_parse(str + "\": 1" + closing)) { + // Was inside an object key string + str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing; + } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) { + // Was inside an object key string after an escape + str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing; + } else { + auto last_pos = str.find_last_of(':'); + if (last_pos == std::string::npos) { + throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location"); + } + // fprintf(stderr, "Cutting back to last : for object key+value\n"); + str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; + } + } else { + throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location"); + } + // fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str()); + out.json = json::parse(str); + it = temptative_end; + return true; + } + // TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...) + // fprintf(stderr, "Closing: TODO\n"); + return false; + } + out.json = json::parse(it, end); + it = end; + return true; +} diff --git a/llama/llama.cpp/common/json-partial.h b/llama/llama.cpp/common/json-partial.h new file mode 100644 index 000000000..f63356dc4 --- /dev/null +++ b/llama/llama.cpp/common/json-partial.h @@ -0,0 +1,38 @@ +#pragma once + +#include + +// Healing marker (empty if the JSON was fully parsed / wasn't healed). +struct common_healing_marker { + // Raw marker. + std::string marker; + + // Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format). + std::string json_dump_marker; +}; + +// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string) +struct common_json { + nlohmann::ordered_json json; + + common_healing_marker healing_marker; +}; + +// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty. +// +// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON. +// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker. +// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format). +// +// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again). +bool common_json_parse( + const std::string & input, + const std::string & healing_marker, + common_json & out); + +// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds. +bool common_json_parse( + std::string::const_iterator & it, + const std::string::const_iterator & end, + const std::string & healing_marker, + common_json & out); diff --git a/llama/llama.cpp/common/regex-partial.cpp b/llama/llama.cpp/common/regex-partial.cpp new file mode 100644 index 000000000..4bff6b663 --- /dev/null +++ b/llama/llama.cpp/common/regex-partial.cpp @@ -0,0 +1,204 @@ +#include "regex-partial.h" +#include "common.h" +#include +#include + +common_regex::common_regex(const std::string & pattern) : + pattern(pattern), + rx(pattern), + rx_reversed_partial(regex_to_reversed_partial_regex(pattern)) {} + +common_regex_match common_regex::search(const std::string & input, size_t pos, bool as_match) const { + std::smatch match; + if (pos > input.size()) { + throw std::runtime_error("Position out of bounds"); + } + auto start = input.begin() + pos; + auto found = as_match + ? std::regex_match(start, input.end(), match, rx) + : std::regex_search(start, input.end(), match, rx); + if (found) { + common_regex_match res; + res.type = COMMON_REGEX_MATCH_TYPE_FULL; + for (size_t i = 0; i < match.size(); ++i) { + auto begin = pos + match.position(i); + res.groups.emplace_back(begin, begin + match.length(i)); + } + return res; + } + std::match_results srmatch; + if (std::regex_match(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial)) { + auto group = srmatch[1].str(); + if (group.length() != 0) { + auto it = srmatch[1].second.base(); + // auto position = static_cast(std::distance(input.begin(), it)); + if ((!as_match) || it == input.begin()) { + common_regex_match res; + res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL; + const size_t begin = std::distance(input.begin(), it); + const size_t end = input.size(); + if (begin == std::string::npos || end == std::string::npos || begin > end) { + throw std::runtime_error("Invalid range"); + } + res.groups.push_back({begin, end}); + return res; + } + } + } + return {}; +} + +/* + Transforms a regex pattern to a partial match pattern that operates on a reversed input string to find partial final matches of the original pattern. + + Ideally we'd like to use boost::match_partial (https://beta.boost.org/doc/libs/1_59_0/libs/regex/doc/html/boost_regex/partial_matches.html) + to see if a string ends with a partial regex match, but but it's not in std::regex yet. + Instead, we'll the regex into a partial match regex operating as a full match on the reverse iterators of the input. + + - /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:(?:d)?c)?b)?a).* + - /a|b/ -> (a|b).* + - /a*?/ -> error, could match "" + - /a*b/ -> ((?:b)?a*+).* (final repetitions become eager) + - /.*?ab/ -> ((?:b)?a).* (merge .*) + - /a.*?b/ -> ((?:b)?.*?a).* (keep reluctant matches) + - /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a).* + - /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a).* + - /ab{2,4}c/ -> abbb?b?c -> ((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a).* + + The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern + (i.e. just where the final .* starts in the inverted pattern; all other groups are turned into non-capturing groups, and reluctant quantifiers are ignored) +*/ +std::string regex_to_reversed_partial_regex(const std::string & pattern) { + auto it = pattern.begin(); + const auto end = pattern.end(); + + std::function process = [&]() { + std::vector> alternatives(1); + std::vector * sequence = &alternatives.back(); + + while (it != end) { + if (*it == '[') { + auto start = it; + ++it; + while (it != end) { + if ((*it == '\\') && (++it != end)) { + ++it; + } else if ((it != end) && (*it == ']')) { + break; + } else { + ++it; + } + } + if (it == end) { + throw std::runtime_error("Unmatched '[' in pattern"); + } + ++it; + sequence->push_back(std::string(start, it)); + } else if (*it == '*' || *it == '?' || *it == '+') { + if (sequence->empty()) { + throw std::runtime_error("Quantifier without preceding element"); + } + sequence->back() += *it; + auto is_star = *it == '*'; + ++it; + if (is_star) { + if (*it == '?') { + ++it; + } + } + } else if (*it == '{') { + if (sequence->empty()) { + throw std::runtime_error("Repetition without preceding element"); + } + ++it; + auto start = it; + while (it != end && *it != '}') { + ++it; + } + if (it == end) { + throw std::runtime_error("Unmatched '{' in pattern"); + } + auto parts = string_split(std::string(start, it), ","); + ++it; + if (parts.size() > 2) { + throw std::runtime_error("Invalid repetition range in pattern"); + } + + auto parseOptInt = [&](const std::string & s, const std::optional & def = std::nullopt) -> std::optional { + if (s.empty()) { + return def; + } + return std::stoi(s); + }; + auto min = parseOptInt(parts[0], 0); + auto max = parts.size() == 1 ? min : parseOptInt(parts[1]); + if (min && max && *max < *min) { + throw std::runtime_error("Invalid repetition range in pattern"); + } + // Brutal but... let's repeat at least min times, then ? for the delta between min & max (or * for unbounded) + auto part = sequence->back(); + sequence->pop_back(); + for (int i = 0; i < *min; i++) { + sequence->push_back(part); + } + if (max) { + for (int i = *min; i < *max; i++) { + sequence->push_back(part + "?"); + } + } else { + sequence->push_back(part + "*"); + } + } else if (*it == '(') { + ++it; + if (it != end && *it == '?' && (it + 1 != end) && *(it + 1) == ':') { + it += 2; + } + auto sub = process(); + if (*it != ')') { + throw std::runtime_error("Unmatched '(' in pattern"); + } + ++it; + auto & part = sequence->emplace_back("(?:"); + part += sub; + part += ")"; + } else if (*it == ')') { + break; + } else if (*it == '|') { + ++it; + alternatives.emplace_back(); + sequence = &alternatives.back(); + } else if (*it == '\\' && (++it != end)) { + auto str = std::string("\\") + *it; + sequence->push_back(str); + ++it; + } else if (it != end) { + sequence->push_back(std::string(1, *it)); + ++it; + } + } + + // /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).* + // if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group + // We'll do the outermost capturing group and final .* in the enclosing function. + std::vector res_alts; + for (const auto & parts : alternatives) { + auto & res = res_alts.emplace_back(); + for (size_t i = 0; i < parts.size() - 1; i++) { + res += "(?:"; + } + for (auto it = parts.rbegin(); it != parts.rend(); ++it) { + res += *it; + if (it != parts.rend() - 1) { + res += ")?"; + } + } + } + return string_join(res_alts, "|"); + }; + auto res = process(); + if (it != end) { + throw std::runtime_error("Unmatched '(' in pattern"); + } + + return "(" + res + ")[\\s\\S]*"; +} diff --git a/llama/llama.cpp/common/regex-partial.h b/llama/llama.cpp/common/regex-partial.h new file mode 100644 index 000000000..634cb4022 --- /dev/null +++ b/llama/llama.cpp/common/regex-partial.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include + +enum common_regex_match_type { + COMMON_REGEX_MATCH_TYPE_NONE, + COMMON_REGEX_MATCH_TYPE_PARTIAL, + COMMON_REGEX_MATCH_TYPE_FULL, +}; + +struct common_string_range { + size_t begin; + size_t end; + common_string_range(size_t begin, size_t end) : begin(begin), end(end) { + if (begin > end) { + throw std::runtime_error("Invalid range"); + } + } + // prevent default ctor + common_string_range() = delete; + bool empty() const { + return begin == end; + } + bool operator==(const common_string_range & other) const { + return begin == other.begin && end == other.end; + } +}; + +struct common_regex_match { + common_regex_match_type type = COMMON_REGEX_MATCH_TYPE_NONE; + std::vector groups; + + bool operator==(const common_regex_match & other) const { + return type == other.type && groups == other.groups; + } + bool operator!=(const common_regex_match & other) const { + return !(*this == other); + } +}; + +class common_regex { + std::string pattern; + std::regex rx; + std::regex rx_reversed_partial; + + public: + explicit common_regex(const std::string & pattern); + + common_regex_match search(const std::string & input, size_t pos, bool as_match = false) const; + + const std::string & str() const { return pattern; } +}; + +// For testing only (pretty print of failures). +std::string regex_to_reversed_partial_regex(const std::string & pattern); diff --git a/llama/llama.cpp/tools/mtmd/deprecation-warning.cpp b/llama/llama.cpp/tools/mtmd/deprecation-warning.cpp new file mode 100644 index 000000000..dded0a56a --- /dev/null +++ b/llama/llama.cpp/tools/mtmd/deprecation-warning.cpp @@ -0,0 +1,22 @@ +#include +#include + +int main(int argc, char** argv) { + std::string filename = "main"; + if (argc >= 1) { + filename = argv[0]; + } + + // Get only the program name from the full path + size_t pos = filename.find_last_of("/\\"); + if (pos != std::string::npos) { + filename = filename.substr(pos+1); + } + + fprintf(stdout, "\n"); + fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str()); + fprintf(stdout, "Please use 'llama-mtmd-cli' instead.\n"); + fprintf(stdout, "\n"); + + return EXIT_FAILURE; +} diff --git a/llama/llama.cpp/tools/mtmd/mtmd-audio.cpp b/llama/llama.cpp/tools/mtmd/mtmd-audio.cpp new file mode 100644 index 000000000..4d053895c --- /dev/null +++ b/llama/llama.cpp/tools/mtmd/mtmd-audio.cpp @@ -0,0 +1,769 @@ +#include "mtmd-audio.h" + +#define _USE_MATH_DEFINES // for M_PI +#include +#include +#include +#include +#include +#include +#include + +// most of the code here is copied from whisper.cpp + +// align x to upper multiple of n +#define _ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) + +namespace whisper_preprocessor { + +#define SIN_COS_N_COUNT WHISPER_N_FFT +namespace { +struct whisper_global_cache { + // In FFT, we frequently use sine and cosine operations with the same values. + // We can use precalculated values to speed up the process. + float sin_vals[SIN_COS_N_COUNT]; + float cos_vals[SIN_COS_N_COUNT]; + + // Hann window (Use cosf to eliminate difference) + // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html + // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 + float hann_window[WHISPER_N_FFT]; + + whisper_global_cache() { + fill_sin_cos_table(); + fill_hann_window(sizeof(hann_window)/sizeof(hann_window[0]), true, hann_window); + } + + void fill_sin_cos_table() { + for (int i = 0; i < SIN_COS_N_COUNT; i++) { + double theta = (2 * M_PI * i) / SIN_COS_N_COUNT; + sin_vals[i] = sinf(theta); + cos_vals[i] = cosf(theta); + } + } + + void fill_hann_window(int length, bool periodic, float * output) { + int offset = -1; + if (periodic) { + offset = 0; + } + for (int i = 0; i < length; i++) { + output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); + } + } +} global_cache; +} + +// naive Discrete Fourier Transform +// input is real-valued +// output is complex-valued +static void dft(const float* in, int N, float* out) { + const int sin_cos_step = SIN_COS_N_COUNT / N; + + for (int k = 0; k < N; k++) { + float re = 0; + float im = 0; + + for (int n = 0; n < N; n++) { + int idx = (k * n * sin_cos_step) % (SIN_COS_N_COUNT); // t = 2*M_PI*k*n/N + re += in[n]*global_cache.cos_vals[idx]; // cos(t) + im -= in[n]*global_cache.sin_vals[idx]; // sin(t) + } + + out[k*2 + 0] = re; + out[k*2 + 1] = im; + } +} + +// Cooley-Tukey FFT +// poor man's implementation - use something better +// input is real-valued +// output is complex-valued +static void fft(float* in, int N, float* out) { + if (N == 1) { + out[0] = in[0]; + out[1] = 0; + return; + } + + const int half_N = N / 2; + if (N - half_N*2 == 1) { + dft(in, N, out); + return; + } + + float* even = in + N; + for (int i = 0; i < half_N; ++i) { + even[i]= in[2*i]; + } + float* even_fft = out + 2 * N; + fft(even, half_N, even_fft); + + float* odd = even; + for (int i = 0; i < half_N; ++i) { + odd[i] = in[2*i + 1]; + } + float* odd_fft = even_fft + N; + fft(odd, half_N, odd_fft); + + const int sin_cos_step = SIN_COS_N_COUNT / N; + for (int k = 0; k < half_N; k++) { + int idx = k * sin_cos_step; // t = 2*M_PI*k/N + float re = global_cache.cos_vals[idx]; // cos(t) + float im = -global_cache.sin_vals[idx]; // sin(t) + + float re_odd = odd_fft[2*k + 0]; + float im_odd = odd_fft[2*k + 1]; + + out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd; + out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd; + + out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd; + out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd; + } +} + +static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector & samples, + int n_samples, int frame_size, int frame_step, int n_threads, + const whisper_filters & filters, whisper_mel & mel) { + std::vector fft_in(frame_size * 2, 0.0); + std::vector fft_out(frame_size * 2 * 2 * 2); + + int n_fft = filters.n_fft; + int i = ith; + + // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist + WHISPER_ASSERT(n_fft == 1 + (frame_size / 2)); + + // calculate FFT only when fft_in are not all zero + for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) { + const int offset = i * frame_step; + + // apply Hann window (~10% faster) + for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) { + fft_in[j] = hann[j] * samples[offset + j]; + } + + // fill the rest with zeros + if (n_samples - offset < frame_size) { + std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); + } + + // FFT + fft(fft_in.data(), frame_size, fft_out.data()); + + // Calculate modulus^2 of complex numbers + // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. + for (int j = 0; j < n_fft; j++) { + fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); + } + + // mel spectrogram + for (int j = 0; j < mel.n_mel; j++) { + double sum = 0.0; + // unroll loop (suggested by GH user @lunixbochs) + int k = 0; + for (k = 0; k < n_fft - 3; k += 4) { + sum += + fft_out[k + 0] * filters.data[j * n_fft + k + 0] + + fft_out[k + 1] * filters.data[j * n_fft + k + 1] + + fft_out[k + 2] * filters.data[j * n_fft + k + 2] + + fft_out[k + 3] * filters.data[j * n_fft + k + 3]; + } + // handle n_fft remainder + for (; k < n_fft; k++) { + sum += fft_out[k] * filters.data[j * n_fft + k]; + } + sum = log10(std::max(sum, 1e-10)); + mel.data[j * mel.n_len + i] = sum; + } + } + + // Otherwise fft_out are all zero + double sum = log10(1e-10); + for (; i < mel.n_len; i += n_threads) { + for (int j = 0; j < mel.n_mel; j++) { + mel.data[j * mel.n_len + i] = sum; + } + } +} + +// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157 +static bool log_mel_spectrogram( + const float * samples, + const int n_samples, + const int /*sample_rate*/, + const int frame_size, + const int frame_step, + const int n_mel, + const int n_threads, + const whisper_filters & filters, + const bool debug, + whisper_mel & mel) { + //const int64_t t_start_us = ggml_time_us(); + + // Hann window + WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size"); + const float * hann = global_cache.hann_window; + + // Calculate the length of padding + int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30; + int64_t stage_2_pad = frame_size / 2; + + // Initialize a vector and copy data from C array to it. + std::vector samples_padded; + samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2); + std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad); + + // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio + std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0); + + // reflective pad 200 samples at the beginning of audio + std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin()); + + mel.n_mel = n_mel; + // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936 + // Calculate number of frames + remove the last frame + mel.n_len = (samples_padded.size() - frame_size) / frame_step; + // Calculate semi-padded sample length to ensure compatibility + mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step; + mel.data.resize(mel.n_mel * mel.n_len); + + { + std::vector workers(n_threads - 1); + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw] = std::thread( + log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), + n_samples + stage_2_pad, frame_size, frame_step, n_threads, + std::cref(filters), std::ref(mel)); + } + + // main thread + log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel); + + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw].join(); + } + } + + // clamping and normalization + double mmax = -1e20; + for (int i = 0; i < mel.n_mel*mel.n_len; i++) { + if (mel.data[i] > mmax) { + mmax = mel.data[i]; + } + } + + mmax -= 8.0; + + for (int i = 0; i < mel.n_mel*mel.n_len; i++) { + if (mel.data[i] < mmax) { + mel.data[i] = mmax; + } + + mel.data[i] = (mel.data[i] + 4.0)/4.0; + } + + // Dump log_mel_spectrogram + if (debug) { + std::ofstream outFile("log_mel_spectrogram.json"); + outFile << "["; + for (uint64_t i = 0; i < mel.data.size() - 1; i++) { + outFile << mel.data[i] << ", "; + } + outFile << mel.data[mel.data.size() - 1] << "]"; + outFile.close(); + } + + return true; +} + +bool preprocess_audio( + const float * samples, + size_t n_samples, + const whisper_filters & filters, + std::vector & output) { + + if (n_samples == 0) { + // empty audio + return false; + } + + whisper_mel out_full; + bool ok = log_mel_spectrogram( + samples, + n_samples, + COMMON_SAMPLE_RATE, + WHISPER_N_FFT, + WHISPER_HOP_LENGTH, + filters.n_mel, + 4, // n_threads + filters, + false, // debug + out_full); + if (!ok) { + return false; + } + + // because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel + // we always expect the mel to have 3000 silent frames at the end + // printf("n_len %d\n", out_full.n_len); + const size_t frames_per_chunk = 3000; + GGML_ASSERT((size_t)out_full.n_len > frames_per_chunk); + for (size_t off = 0; off < (size_t)out_full.n_len; off += frames_per_chunk) { + int n_len = std::min(frames_per_chunk, (size_t)out_full.n_len - off); + if ((size_t)n_len < frames_per_chunk) { + break; // last uncomplete chunk will always be a padded chunk, safe to ignore + } + + whisper_mel out_chunk; + out_chunk.n_len = n_len; + out_chunk.n_mel = out_full.n_mel; + out_chunk.n_len_org = out_full.n_mel; // unused + out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len); + + for (int i = 0; i < out_full.n_mel; i++) { + auto src = out_full.data.begin() + i*out_full.n_len + off; + out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk); + } + + output.push_back(std::move(out_chunk)); + } + + return true; +} + +} // namespace whisper_preprocessor + + +// precalculated mel filter banks +// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function +// +// generated from python code: +// +// from numpy import load +// data = load('mel_filters.npz') +// lst = data.files +// for item in lst: +// print(item) +// print(data[item].shape) +// n_mel = data[item].shape[0] +// n_fft = data[item].shape[1] +// for i, row in enumerate(data[item]): +// for j, val in enumerate(row): +// val = val * 1000.0 +// if val != 0: +// print(f"data[{i*n_fft + j}] = {val:.6f};") + +namespace whisper_precalc_filters { + +whisper_preprocessor::whisper_filters get_128_bins() { + whisper_preprocessor::whisper_filters filters; + filters.n_mel = 128; + filters.n_fft = 201; + std::vector data(filters.n_mel * filters.n_fft, 0.0f); + + data[1] = 12.37398665; + data[202] = 30.39256483; + data[404] = 24.74797331; + data[605] = 18.01857911; + data[807] = 37.12195903; + data[1008] = 5.64459199; + data[1009] = 6.72939420; + data[1210] = 36.03715822; + data[1412] = 19.10337992; + data[1613] = 23.66316877; + data[1815] = 31.47736564; + data[2016] = 11.28918398; + data[2017] = 1.08480197; + data[2218] = 41.68175161; + data[2420] = 13.45878839; + data[2621] = 29.30776216; + data[2823] = 25.83277412; + data[3024] = 16.93377644; + data[3226] = 38.20675984; + data[3427] = 4.55979025; + data[3428] = 7.81419594; + data[3629] = 34.95235741; + data[3831] = 20.18818259; + data[4032] = 22.57836796; + data[4234] = 32.56217018; + data[4435] = 10.20438317; + data[4436] = 2.16960395; + data[4637] = 40.59694707; + data[4839] = 14.54358920; + data[5040] = 28.22295949; + data[5242] = 26.91757679; + data[5443] = 15.84897563; + data[5645] = 39.29156065; + data[5846] = 3.47498828; + data[5847] = 8.89899861; + data[6048] = 33.86755288; + data[6250] = 21.27298526; + data[6451] = 21.49356715; + data[6653] = 33.64697099; + data[6854] = 9.11958050; + data[6855] = 3.25440569; + data[7056] = 39.51214626; + data[7258] = 15.62839188; + data[7459] = 27.13815868; + data[7661] = 28.00237760; + data[7862] = 14.76417296; + data[8064] = 40.37636518; + data[8265] = 2.38068704; + data[8266] = 10.20263787; + data[8467] = 31.61146119; + data[8669] = 24.54700135; + data[8870] = 15.32919332; + data[8871] = 1.66583748; + data[9072] = 36.72905266; + data[9274] = 20.09709924; + data[9475] = 16.93102531; + data[9476] = 2.90265540; + data[9677] = 32.84499049; + data[9879] = 23.52004871; + data[10080] = 11.03894413; + data[10081] = 10.72582975; + data[10282] = 22.71829173; + data[10484] = 32.27872774; + data[10685] = 0.11626833; + data[10686] = 22.85348251; + data[10887] = 8.56344029; + data[10888] = 14.97978810; + data[11089] = 15.51398356; + data[11090] = 8.51490628; + data[11291] = 21.10680379; + data[11292] = 3.32652032; + data[11493] = 25.47064796; + data[11695] = 27.35907957; + data[11896] = 0.65853616; + data[11897] = 23.83812517; + data[12098] = 3.44359246; + data[12099] = 21.22455277; + data[12300] = 5.35842171; + data[12301] = 19.42555793; + data[12502] = 6.49324711; + data[12503] = 18.35542172; + data[12704] = 6.93138083; + data[12705] = 17.93504693; + data[12906] = 6.74968259; + data[12907] = 18.09151843; + data[13108] = 6.01899112; + data[13109] = 18.75767298; + data[13310] = 4.80452832; + data[13311] = 19.87172849; + data[13512] = 3.16627859; + data[13513] = 21.37690969; + data[13514] = 1.25317345; + data[13714] = 1.15934468; + data[13715] = 20.80361731; + data[13716] = 4.04486805; + data[13917] = 17.55363122; + data[13918] = 7.08320038; + data[14119] = 14.07538634; + data[14120] = 10.32655034; + data[14321] = 10.40921453; + data[14322] = 13.73696327; + data[14523] = 6.59187697; + data[14524] = 17.27988198; + data[14525] = 1.46804214; + data[14725] = 2.65681883; + data[14726] = 18.09193194; + data[14727] = 5.85655728; + data[14928] = 13.34277913; + data[14929] = 10.28267574; + data[15130] = 8.56800377; + data[15131] = 14.72230814; + data[15132] = 1.04039861; + data[15332] = 3.79085587; + data[15333] = 17.14678481; + data[15334] = 6.11609267; + data[15535] = 11.75929047; + data[15536] = 11.13393717; + data[15737] = 6.43857848; + data[15738] = 16.07806236; + data[15739] = 4.23917221; + data[15939] = 1.19989377; + data[15940] = 12.75671553; + data[15941] = 9.65298992; + data[16142] = 7.06935255; + data[16143] = 14.94054683; + data[16144] = 4.19024844; + data[16344] = 1.51483389; + data[16345] = 12.00899947; + data[16346] = 9.84823331; + data[16547] = 6.10224018; + data[16548] = 15.33857174; + data[16549] = 5.57676842; + data[16749] = 0.36827257; + data[16750] = 9.89749376; + data[16751] = 11.35340426; + data[16752] = 2.05122307; + data[16952] = 3.89297144; + data[16953] = 12.97352277; + data[16954] = 8.06631614; + data[17155] = 6.74493238; + data[17156] = 13.85874674; + data[17157] = 5.41190524; + data[17357] = 0.74220158; + data[17358] = 8.98779090; + data[17359] = 11.37871388; + data[17360] = 3.32958088; + data[17560] = 2.82313535; + data[17561] = 10.68049297; + data[17562] = 9.43340641; + data[17563] = 1.76325557; + data[17763] = 4.39018616; + data[17764] = 11.87758986; + data[17765] = 7.97005836; + data[17766] = 0.66104700; + data[17966] = 5.49466675; + data[17967] = 12.62953598; + data[17968] = 6.93987962; + data[18169] = 6.18401915; + data[18170] = 12.93473132; + data[18171] = 6.29778765; + data[18371] = 0.02325210; + data[18372] = 6.50206627; + data[18373] = 12.32661773; + data[18374] = 6.00216538; + data[18574] = 0.31548753; + data[18575] = 6.48925547; + data[18576] = 12.04130240; + data[18577] = 6.01462880; + data[18777] = 0.29979556; + data[18778] = 6.18288014; + data[18779] = 12.04272825; + data[18780] = 6.29981188; + data[18781] = 0.55689598; + data[18980] = 0.01120471; + data[18981] = 5.61729167; + data[18982] = 11.22337859; + data[18983] = 6.82516303; + data[18984] = 1.35264499; + data[19184] = 4.82410006; + data[19185] = 10.16623247; + data[19186] = 7.56075513; + data[19187] = 2.34590308; + data[19387] = 3.83235747; + data[19388] = 8.92296247; + data[19389] = 8.47910438; + data[19390] = 3.50978645; + data[19590] = 2.66873185; + data[19591] = 7.51965167; + data[19592] = 9.55500547; + data[19593] = 4.81966138; + data[19594] = 0.08431751; + data[19793] = 1.35767367; + data[19794] = 5.98019501; + data[19795] = 10.60271543; + data[19796] = 6.25298498; + data[19797] = 1.74059917; + data[19997] = 4.32644226; + data[19998] = 8.73131864; + data[19999] = 7.78916525; + data[20000] = 3.48923868; + data[20200] = 2.57835095; + data[20201] = 6.77582854; + data[20202] = 9.40941647; + data[20203] = 5.31194592; + data[20204] = 1.21447595; + data[20403] = 0.75411191; + data[20404] = 4.75395704; + data[20405] = 8.75380263; + data[20406] = 7.19209015; + data[20407] = 3.28754401; + data[20607] = 2.68179690; + data[20608] = 6.49331464; + data[20609] = 9.11457930; + data[20610] = 5.39387390; + data[20611] = 1.67316827; + data[20810] = 0.57394296; + data[20811] = 4.20600036; + data[20812] = 7.83805829; + data[20813] = 7.52023002; + data[20814] = 3.97470826; + data[20815] = 0.42918732; + data[21014] = 1.90464477; + data[21015] = 5.36569161; + data[21016] = 8.82673822; + data[21017] = 6.27609482; + data[21018] = 2.89750961; + data[21218] = 2.89885257; + data[21219] = 6.19694078; + data[21220] = 8.56699049; + data[21221] = 5.34748193; + data[21222] = 2.12797290; + data[21421] = 0.44750227; + data[21422] = 3.59030394; + data[21423] = 6.73310598; + data[21424] = 7.77023612; + data[21425] = 4.70231380; + data[21426] = 1.63439126; + data[21625] = 1.01536023; + data[21626] = 4.01018746; + data[21627] = 7.00501446; + data[21628] = 7.23442994; + data[21629] = 4.31095669; + data[21630] = 1.38748321; + data[21829] = 1.33348850; + data[21830] = 4.18730825; + data[21831] = 7.04112789; + data[21832] = 6.93188375; + data[21833] = 4.14605811; + data[21834] = 1.36023236; + data[22033] = 1.42879714; + data[22034] = 4.14824858; + data[22035] = 6.86769979; + data[22036] = 6.83705276; + data[22037] = 4.18239459; + data[22038] = 1.52773573; + data[22237] = 1.32610439; + data[22238] = 3.91751388; + data[22239] = 6.50892360; + data[22240] = 6.92639686; + data[22241] = 4.39672917; + data[22242] = 1.86706171; + data[22441] = 1.04827771; + data[22442] = 3.51767405; + data[22443] = 5.98707050; + data[22444] = 7.17824046; + data[22445] = 4.76767914; + data[22446] = 2.35711760; + data[22645] = 0.61636406; + data[22646] = 2.96949223; + data[22647] = 5.32262027; + data[22648] = 7.57265091; + data[22649] = 5.27558755; + data[22650] = 2.97852419; + data[22651] = 0.68146095; + data[22849] = 0.04971400; + data[22850] = 2.29204819; + data[22851] = 4.53438237; + data[22852] = 6.77671656; + data[22853] = 5.90240723; + data[22854] = 3.71349836; + data[22855] = 1.52458926; + data[23054] = 1.50285335; + data[23055] = 3.63961048; + data[23056] = 5.77636715; + data[23057] = 6.63159089; + data[23058] = 4.54574358; + data[23059] = 2.45989650; + data[23060] = 0.37404924; + data[23258] = 0.61795861; + data[23259] = 2.65410915; + data[23260] = 4.69025923; + data[23261] = 6.72641024; + data[23262] = 5.46034705; + data[23263] = 3.47270933; + data[23264] = 1.48507138; + data[23463] = 1.59233576; + data[23464] = 3.53261665; + data[23465] = 5.47289755; + data[23466] = 6.44368259; + data[23467] = 4.54962999; + data[23468] = 2.65557761; + data[23469] = 0.76152512; + data[23667] = 0.46749352; + data[23668] = 2.31641904; + data[23669] = 4.16534441; + data[23670] = 6.01426978; + data[23671] = 5.67844696; + data[23672] = 3.87357362; + data[23673] = 2.06870004; + data[23674] = 0.26382666; + data[23872] = 1.05349103; + data[23873] = 2.81536230; + data[23874] = 4.57723346; + data[23875] = 6.33910485; + data[23876] = 5.12815686; + data[23877] = 3.40826320; + data[23878] = 1.68837002; + data[24077] = 1.43350090; + data[24078] = 3.11241671; + data[24079] = 4.79133241; + data[24080] = 6.40943693; + data[24081] = 4.77052201; + data[24082] = 3.13160778; + data[24083] = 1.49269309; + data[24281] = 0.02932359; + data[24282] = 1.62918994; + data[24283] = 3.22905602; + data[24284] = 4.82892245; + data[24285] = 6.14671456; + data[24286] = 4.58496623; + data[24287] = 3.02321767; + data[24288] = 1.46146910; + data[24486] = 0.13601698; + data[24487] = 1.66055572; + data[24488] = 3.18509457; + data[24489] = 4.70963307; + data[24490] = 6.04072399; + data[24491] = 4.55250870; + data[24492] = 3.06429295; + data[24493] = 1.57607743; + data[24494] = 0.08786193; + data[24691] = 0.09328097; + data[24692] = 1.54603878; + data[24693] = 2.99879676; + data[24694] = 4.45155473; + data[24695] = 5.90431225; + data[24696] = 4.65566106; + data[24697] = 3.23751615; + data[24698] = 1.81937125; + data[24699] = 0.40122634; + data[24897] = 1.30262633; + data[24898] = 2.68698297; + data[24899] = 4.07133950; + data[24900] = 5.45569602; + data[24901] = 4.87832492; + data[24902] = 3.52695142; + data[24903] = 2.17557792; + data[24904] = 0.82420459; + data[25102] = 0.94595028; + data[25103] = 2.26512621; + data[25104] = 3.58430226; + data[25105] = 4.90347855; + data[25106] = 5.20569785; + data[25107] = 3.91795207; + data[25108] = 2.63020652; + data[25109] = 1.34246063; + data[25110] = 0.05471494; + data[25307] = 0.49037894; + data[25308] = 1.74744334; + data[25309] = 3.00450763; + data[25310] = 4.26157191; + data[25311] = 5.51863620; + data[25312] = 4.39707236; + data[25313] = 3.16995848; + data[25314] = 1.94284460; + data[25315] = 0.71573065; + data[25513] = 1.14698056; + data[25514] = 2.34485767; + data[25515] = 3.54273478; + data[25516] = 4.74061165; + data[25517] = 4.95198462; + data[25518] = 3.78264743; + data[25519] = 2.61331047; + data[25520] = 1.44397374; + data[25521] = 0.27463681; + data[25718] = 0.47569509; + data[25719] = 1.61717169; + data[25720] = 2.75864848; + data[25721] = 3.90012516; + data[25722] = 5.04160160; + data[25723] = 4.45712078; + data[25724] = 3.34284059; + data[25725] = 2.22856039; + data[25726] = 1.11428020; + + for (auto & val : data) { + val /= 1000.0f; + } + + filters.data = std::move(data); + return filters; +} + +} // namespace whisper_precalc_filters diff --git a/llama/llama.cpp/tools/mtmd/mtmd-audio.h b/llama/llama.cpp/tools/mtmd/mtmd-audio.h new file mode 100644 index 000000000..b7b940aff --- /dev/null +++ b/llama/llama.cpp/tools/mtmd/mtmd-audio.h @@ -0,0 +1,47 @@ +#pragma once + +#include "ggml.h" + +#include +#include +#include + +#define WHISPER_ASSERT GGML_ASSERT + +#define WHISPER_SAMPLE_RATE 16000 +#define WHISPER_N_FFT 400 +#define WHISPER_HOP_LENGTH 160 +#define WHISPER_CHUNK_SIZE 30 + +#define COMMON_SAMPLE_RATE 16000 + +namespace whisper_preprocessor { + +struct whisper_mel { + int n_len; + int n_len_org; + int n_mel; + + std::vector data; +}; + +struct whisper_filters { + int32_t n_mel; + int32_t n_fft; + + std::vector data; +}; + +bool preprocess_audio( + const float * samples, + size_t n_samples, + const whisper_filters & filters, + std::vector & output); + +} // namespace whisper_preprocessor + +namespace whisper_precalc_filters { + +whisper_preprocessor::whisper_filters get_128_bins(); + +} // namespace whisper_precalc_filters diff --git a/llama/llama.cpp/tools/mtmd/mtmd-cli.cpp b/llama/llama.cpp/tools/mtmd/mtmd-cli.cpp new file mode 100644 index 000000000..599e682e0 --- /dev/null +++ b/llama/llama.cpp/tools/mtmd/mtmd-cli.cpp @@ -0,0 +1,386 @@ +#include "arg.h" +#include "log.h" +#include "common.h" +#include "sampling.h" +#include "llama.h" +#include "ggml.h" +#include "console.h" +#include "chat.h" +#include "mtmd.h" +#include "mtmd-helper.h" + +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#endif + +// volatile, because of signal being an interrupt +static volatile bool g_is_generating = false; +static volatile bool g_is_interrupted = false; + +/** + * Please note that this is NOT a production-ready stuff. + * It is a playground for trying multimodal support in llama.cpp. + * For contributors: please keep this code simple and easy to understand. + */ + +static void show_additional_info(int /*argc*/, char ** argv) { + LOG( + "Experimental CLI for multimodal\n\n" + "Usage: %s [options] -m --mmproj --image --audio