From a40f2b656fab364ce0aff98dbefe9bd9c3721cc9 Mon Sep 17 00:00:00 2001
From: Alon <alonfaraj@gmail.com>
Date: Wed, 20 Sep 2023 15:06:36 +0300
Subject: [PATCH 01/10] CI: FreeBSD fix (#3258)

* - freebsd ci: use qemu
---
 .github/workflows/build.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 4b6071f5a1311..aecebaf936b23 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -468,6 +468,7 @@ jobs:
       with:
         operating_system: freebsd
         version: '13.2'
+        hypervisor: 'qemu'
         run: |
             sudo pkg update
             sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas

From 80834daecf4b9021770361a6d5e1b9c7a60e6854 Mon Sep 17 00:00:00 2001
From: kang <tpdns9032100@gmail.com>
Date: Wed, 20 Sep 2023 22:48:22 +0900
Subject: [PATCH 02/10] flake : Restore default package's buildInputs (#3262)

---
 flake.nix | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/flake.nix b/flake.nix
index b0fb8642cb618..7723357afe419 100644
--- a/flake.nix
+++ b/flake.nix
@@ -52,7 +52,8 @@
       in
       {
         packages.default = pkgs.stdenv.mkDerivation {
-          inherit name src meta postPatch nativeBuildInputs buildInputs postInstall;
+          inherit name src meta postPatch nativeBuildInputs postInstall;
+          buildInputs = osSpecific;
           cmakeFlags = cmakeFlags
             ++ (if isAarch64 && isDarwin then [
             "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"

From 65c2c1c5ab7c5089dbc6d10bc49b9c58f0164317 Mon Sep 17 00:00:00 2001
From: Cebtenzzre <cebtenzzre@gmail.com>
Date: Wed, 20 Sep 2023 12:06:08 -0400
Subject: [PATCH 03/10] benchmark-matmult : do not use integer abs() on a float
 (#3277)

---
 examples/benchmark/benchmark-matmult.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index c8f7d486976d7..f1c382aa9b955 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -21,7 +21,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 
     if (plan.work_size > 0) {
@@ -32,7 +32,7 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
     ggml_graph_compute(graph, &plan);
 }
 
-float tensor_sum_elements(const ggml_tensor * tensor) {
+static float tensor_sum_elements(const ggml_tensor * tensor) {
     double sum = 0;
     if (tensor->type == GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
@@ -44,7 +44,7 @@ float tensor_sum_elements(const ggml_tensor * tensor) {
     return sum;
 }
 
-void tensor_dump(const ggml_tensor * tensor, const char * name) {
+static void tensor_dump(const ggml_tensor * tensor, const char * name) {
     printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
         tensor->type, ggml_type_name(tensor->type),
         tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
@@ -59,7 +59,7 @@ struct benchmark_params_struct {
     int32_t n_iterations  = 10;
 };
 
-void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
+static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
@@ -253,7 +253,7 @@ int main(int argc, char ** argv)  {
         // Check that the matrix multiplication result is in the right ballpark
         // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
         float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
-        float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
+        float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
         float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6
 
         if (delta > allowed_delta)  {

From a5661d7e71d15b8dfc81bc0510ba912ebe85dfa3 Mon Sep 17 00:00:00 2001
From: Cebtenzzre <cebtenzzre@gmail.com>
Date: Wed, 20 Sep 2023 12:12:47 -0400
Subject: [PATCH 04/10] llama : allow gguf RoPE keys to be overridden with
 defaults (#3240)

---
 common/common.cpp          |  6 ++--
 examples/server/server.cpp |  4 +--
 llama.cpp                  | 60 +++++++++++++++-----------------------
 3 files changed, 29 insertions(+), 41 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 6d655fd5548c5..2597ba06aee16 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -647,9 +647,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --cfg-negative-prompt-file FNAME\n");
     printf("                        negative prompt file to use for guidance. (default: empty)\n");
     printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
-    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
-    printf("  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
+    printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
+    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
+    printf("  --rope-freq-scale N   RoPE frequency linear scaling factor (default: loaded from model)\n");
     printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     printf("  --no-penalize-nl      do not penalize newline token\n");
     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 1bb8e92c0f95e..ebd7f2fc579e9 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -701,8 +701,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    printf("  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    printf("  --rope-freq-base N    RoPE base frequency (default: loaded from model)\n");
+    printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: loaded from model)\n");
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
diff --git a/llama.cpp b/llama.cpp
index 79b48897d8bbe..358bf5ec8a7ad 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -929,23 +929,22 @@ static const size_t kB = 1024;
 static const size_t MB = kB*kB;
 static const size_t GB = kB*kB*kB;
 
-// default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab     = 32000;
-    uint32_t n_ctx_train = 2048;  // the context size used during training
-    uint32_t n_ctx       = 512;   // the context size used during inference
-    uint32_t n_embd      = 4096;
-    uint32_t n_head      = 32;
-    uint32_t n_head_kv   = 32;
-    uint32_t n_layer     = 32;
-    uint32_t n_rot       = 64;
-    uint32_t n_ff        = 11008;
-
-    float f_norm_eps     = 1e-5;
-    float f_norm_rms_eps = 1e-5;
-
-    float rope_freq_base  = 10000.0f;
-    float rope_freq_scale = 1.0f;
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_ctx;       // context size used during inference
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+
+    float rope_freq_base;
+    float rope_freq_scale;
 
     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -1076,7 +1075,7 @@ struct llama_model {
 
     std::string name = "n/a";
 
-    llama_hparams hparams;
+    llama_hparams hparams = {};
     llama_vocab   vocab;
 
     struct ggml_tensor * tok_embeddings;
@@ -1674,28 +1673,17 @@ static void llm_load_hparams(
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
 
-    // TODO: manually setting rope freq base and scale should override this
-    // FIXME: partial fix when the param specified is not the default value, but
-    //        will not work for overriding the model value to the params default
-
-    llama_context_params defaults = llama_context_default_params();
-
-    // rope_freq_base
-    {
-        float ropebase = 10000.0f;
-        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
-            rope_freq_base = ropebase;
-        }
+    // rope_freq_base (optional)
+    if (rope_freq_base == 0.0f) {
+        rope_freq_base = 10000.0f;
+        GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
     }
 
     // rope_freq_scale (inverse of the kv) is optional
-    {
+    if (rope_freq_scale == 0.0f) {
         float ropescale = 1.0f;
         GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-        if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
-            rope_freq_scale = 1.0f/ropescale;
-        }
+        rope_freq_scale = 1.0f/ropescale;
     }
 
     // sanity check for n_rot (optional)
@@ -6188,8 +6176,8 @@ struct llama_context_params llama_context_default_params() {
         /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
-        /*.rope_freq_base              =*/ 10000.0f,
-        /*.rope_freq_scale             =*/ 1.0f,
+        /*.rope_freq_base              =*/ 0.0f,
+        /*.rope_freq_scale             =*/ 0.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,

From 7eb41179edc56083ef4eb2df7967ac9ff38b34fb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 20 Sep 2023 20:48:22 +0300
Subject: [PATCH 05/10] readme : update hot topics

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index d8fd8bc4478e3..670e2e6734f64 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
+- Parallel decoding + continuous batching support incoming: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
+  **Devs should become familiar with the new API**
 - Local Falcon 180B inference on Mac Studio
 
   https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e

From 8185710a80531e9ee0c0cb99d3a9c9af1019ab67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Thu, 21 Sep 2023 10:43:53 +0200
Subject: [PATCH 06/10] CUDA: use only 1 thread if fully offloaded (#2915)

---
 llama.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 358bf5ec8a7ad..346636501ce15 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3765,6 +3765,15 @@ static bool llama_eval_internal(
         n_threads = std::min(4, n_threads);
     }
 
+    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
+    const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
+        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_FALCON;
+    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
+    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+        n_threads = 1;
+    }
+
     struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
 

From f56c418ab0a635c020bcb5bf44b8f00cb3c9e514 Mon Sep 17 00:00:00 2001
From: yuiseki <yuiseki@gmail.com>
Date: Thu, 21 Sep 2023 17:57:40 +0900
Subject: [PATCH 07/10] embedding : update README.md (#3224)

---
 examples/embedding/README.md | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/examples/embedding/README.md b/examples/embedding/README.md
index fe8f5dcc62ed9..6929454c5e549 100644
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@@ -1,3 +1,21 @@
-# embedding
+# llama.cpp/examples/embedding
 
-TODO
+This example demonstrates how to generate a high-dimensional embedding vector for a given text with llama.cpp.
+
+## Quick Start
+
+To get started right away, run the following command, making sure to use the correct path for the model you have:
+
+### Unix-based systems (Linux, macOS, etc.):
+
+```bash
+./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
+```
+
+### Windows:
+
+```powershell
+embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
+```
+
+The above command will output space-separated float values.

From 324f3403d54ae4499a1d68623161015f7419fb76 Mon Sep 17 00:00:00 2001
From: Edward Taylor <edeetee@gmail.com>
Date: Thu, 21 Sep 2023 21:08:20 +1200
Subject: [PATCH 08/10] zig : fix for updated c lib (#3259)

---
 build.zig | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/build.zig b/build.zig
index f2769ba8c2e7a..3a8978bc37fe2 100644
--- a/build.zig
+++ b/build.zig
@@ -36,17 +36,20 @@ const Maker = struct {
     }
 
     fn init(builder: *std.build.Builder) !Maker {
-        const commit_hash = @embedFile(".git/refs/heads/master");
+        // const commit_hash = @embedFile(".git/refs/heads/master");
+        const target = builder.standardTargetOptions(.{});
         const config_header = builder.addConfigHeader(
             .{ .style = .blank, .include_path = "build-info.h" },
             .{
                 .BUILD_NUMBER = 0,
-                .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
+                .BUILD_COMMIT = "12345", // placeholder: the commit hash is no longer embedded from .git
+                .BUILD_COMPILER = "Zig 0.11.0",
+                .BUILD_TARGET = try target.allocDescription(builder.allocator),
             },
         );
         var m = Maker{
             .builder = builder,
-            .target = builder.standardTargetOptions(.{}),
+            .target = target,
             .optimize = builder.standardOptimizeOption(.{}),
             .config_header = config_header,
             .enable_lto = false,
@@ -58,7 +61,7 @@ const Maker = struct {
         try m.addCFlag("-std=c11");
         try m.addCxxFlag("-std=c++11");
         try m.addProjectInclude(&.{});
-        try m.addProjectInclude(&.{"examples"});
+        try m.addProjectInclude(&.{"common"});
         return m;
     }
 
@@ -71,6 +74,7 @@ const Maker = struct {
             o.addCSourceFiles(&.{src}, m.cxxflags.items);
             o.linkLibCpp();
         }
+        o.addConfigHeader(m.config_header);
         for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
         o.want_lto = m.enable_lto;
         return o;
@@ -104,15 +108,15 @@ pub fn build(b: *std.build.Builder) !void {
     const ggml = make.obj("ggml", "ggml.c");
     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
     const llama = make.obj("llama", "llama.cpp");
-    const common = make.obj("common", "examples/common.cpp");
-    const console = make.obj("common", "examples/console.cpp");
-    const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
+    const common = make.obj("common", "common/common.cpp");
+    const console = make.obj("common", "common/console.cpp");
+    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
 
     _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common });
     _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
     _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common });
 
     const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
     if (server.target.isWindows()) {

From 36b904e20003017f50108ae68359ef87a192dae2 Mon Sep 17 00:00:00 2001
From: shibe2 <shibe@tuta.io>
Date: Thu, 21 Sep 2023 22:10:26 +0400
Subject: [PATCH 09/10] ggml-opencl.cpp: Make private functions static (#3300)

---
 ggml-opencl.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index 777048d011157..c7d9150fec2f0 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -847,7 +847,7 @@ std::array<std::string, 2> mul_str_values = {
     "mul_f32", "float"
 };
 
-std::string& replace(std::string& s, const std::string& from, const std::string& to) {
+static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
     size_t pos = 0;
     while ((pos = s.find(from, pos)) != std::string::npos) {
          s.replace(pos, from.length(), to);
@@ -856,7 +856,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
     return s;
 }
 
-std::string generate_kernels() {
+static std::string generate_kernels() {
     std::stringstream src;
     src << program_source << '\n';
     src << k_quants_source << '\n';
@@ -1788,7 +1788,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     return false;
 }
 
-bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
+static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
     // If device doesn't support FP16
     if (!fp16_support) {
         return false;

From bc9d3e3971e5607a10ff4c24e39568ce1ac87271 Mon Sep 17 00:00:00 2001
From: Lee Drake <b.lee.drake@gmail.com>
Date: Thu, 21 Sep 2023 13:00:24 -0600
Subject: [PATCH 10/10] Update README.md (#3289)

* Update README.md

* Update README.md

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 670e2e6734f64..42686aacc970b 100644
--- a/README.md
+++ b/README.md
@@ -557,6 +557,10 @@ python3 convert.py models/7B/
 # quantize the model to 4-bits (using q4_0 method)
 ./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
 
+# update the gguf filetype to the current version if the older version is unsupported by another application
+./quantize ./models/7B/ggml-model-q4_0.gguf ./models/7B/ggml-model-q4_0-v2.gguf COPY
+
+
 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```