Speed up prediction on CPUs with many cores
This change adds an if statement to the GGML synchronization code that
causes significantly fewer memory barriers to be used. A syncthreads
function has also been introduced so that GGML_OP_MUL_MAT can add its
initialization barrier on its own. That's important, because if
tinyBLAS doesn't need matrix B quantized, then the barrier can be
skipped entirely.
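
The ggml.c diff is not rendered below, so here is only a minimal sketch
of what a syncthreads-style barrier could look like. The struct
ggml_barrier name comes from the ggml.h hunk further down; the field
names, the ggml_syncthreads() signature, and the sense-reversing design
are assumptions made for illustration, not the code in this commit.

    // Hedged sketch of a reusable sense-reversing barrier in C11.
    // Assumes n_threads stays fixed for one graph evaluation. Names are
    // illustrative; the actual ggml.c implementation may differ.
    #include <stdatomic.h>

    struct ggml_barrier {
        atomic_int remaining;  // threads that have not yet arrived
        atomic_int phase;      // flips each time the barrier releases
        int n_threads;         // total number of participating threads
    };

    static void ggml_syncthreads(struct ggml_barrier *b) {
        int phase = atomic_load_explicit(&b->phase, memory_order_relaxed);
        if (atomic_fetch_sub_explicit(&b->remaining, 1, memory_order_acq_rel) == 1) {
            // last thread to arrive: reset the count and release the others
            atomic_store_explicit(&b->remaining, b->n_threads, memory_order_relaxed);
            atomic_fetch_add_explicit(&b->phase, 1, memory_order_release);
        } else {
            // wait for the phase to flip; a real implementation might yield
            // or futex-wait here rather than spin
            while (atomic_load_explicit(&b->phase, memory_order_acquire) == phase) {
            }
        }
    }

Because each op decides for itself whether to call such a barrier, a
full rendezvous only happens when an initialization pass actually wrote
data that other threads are about to read.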

This change also clamps the thread count to a maximum of 20 once the
prefill is completed. Charting thread count against speed for numerous
models on a Threadripper shows that twenty threads is consistently
optimal for prediction.

Compared to the blog post https://justine.lol/matmul/#professional the
token generation speed for TinyLLaMA 1.1B has increased from 52 to 98
tokens per second. Prompt processing is up to 2000 tokens per second.
With Mistral 7B the gains are more modest, going from 17 to 21 tokens
per second.
jart committed May 3, 2024
1 parent e2b3cb2 commit 89c189e
Showing 8 changed files with 295 additions and 259 deletions.
510 changes: 267 additions & 243 deletions llama.cpp/ggml.c

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions llama.cpp/ggml.h
@@ -681,6 +681,8 @@ extern "C" {
         GGML_TASK_TYPE_FINALIZE,
     };
 
+    struct ggml_barrier;
+
     struct ggml_compute_params {
         enum ggml_task_type type;
 
@@ -690,6 +692,8 @@ extern "C" {
         // work buffer for all threads
         size_t wsize;
         void * wdata;
+
+        struct ggml_barrier *barrier;
     };
 
     // numa strategies
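
With a barrier handle now carried in ggml_compute_params, an op such as
GGML_OP_MUL_MAT can synchronize only when its setup phase actually
shared data between threads. The sketch below illustrates that shape;
needs_quantized_B(), quantize_B_slice(), and the trimmed params struct
are hypothetical placeholders, not code from this commit.

    // Hedged sketch: skip the rendezvous entirely when tinyBLAS can use
    // matrix B in its original format. All helper names are placeholders.
    #include <stdbool.h>
    #include <stddef.h>

    struct ggml_barrier;                           // opaque, lives in ggml.c
    void ggml_syncthreads(struct ggml_barrier *);  // assumed entry point

    struct params_sketch {                         // trimmed stand-in for
        struct ggml_barrier *barrier;              // ggml_compute_params
        size_t wsize;
        void *wdata;
    };

    static bool needs_quantized_B(void) { return false; }               // placeholder
    static void quantize_B_slice(struct params_sketch *p) { (void)p; }  // placeholder

    static void mul_mat_sketch(struct params_sketch *params) {
        if (needs_quantized_B()) {
            // each thread quantizes its slice of B into the shared wdata
            // buffer, so all threads must rendezvous before reading it back
            quantize_B_slice(params);
            ggml_syncthreads(params->barrier);
        }
        // when tinyBLAS consumes B as-is, the quantization pass and its
        // barrier are skipped, which is the case this change optimizes
    }
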
8 changes: 8 additions & 0 deletions llama.cpp/llama.cpp
@@ -11306,6 +11306,14 @@ static int llama_decode_internal(
             n_threads = std::min(4, n_threads);
         }
 
+        // [jart] On CPUs with many cores (e.g. EPYC, Threadripper)
+        // using more than twenty threads for token prediction
+        // never helps. This number appears to be optimal for all
+        // models ranging from TinyLLaMA 1.1B to mighty Mixtral 8x22B.
+        if (n_tokens <= 2) {
+            n_threads = std::min(20, n_threads);
+        }
+
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
         llama_set_inputs(lctx, u_batch);
2 changes: 1 addition & 1 deletion llamafile/llamafile.h
@@ -59,7 +59,7 @@ bool llamafile_sgemm(long, long, long, const void *, long, const void *, long, v
 
 struct ggml_tensor;
 struct ggml_compute_params;
-bool llamafile_mixmul(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul(const struct ggml_compute_params *, const struct ggml_tensor *,
                       const struct ggml_tensor *, const struct ggml_tensor *, struct ggml_tensor *);
 size_t llamafile_mixmul_needs(const struct ggml_tensor *, const struct ggml_tensor *,
                               const struct ggml_tensor *);
2 changes: 1 addition & 1 deletion llamafile/sgemm.cpp
@@ -124,7 +124,7 @@ bool llamafile_sgemm(long m, long n, long k, const void *A, long lda, const void
 /**
  * Performs "mixture of experts" tensor multiplication on CPU.
  */
-bool llamafile_mixmul(ggml_compute_params *params, const ggml_tensor *weights,
+bool llamafile_mixmul(const ggml_compute_params *params, const ggml_tensor *weights,
                       const ggml_tensor *thought, const ggml_tensor *plan, ggml_tensor *result) {
     return funcs.mixmul(params, weights, thought, plan, result);
 }
18 changes: 9 additions & 9 deletions llamafile/sgemm.h
@@ -26,31 +26,31 @@ bool llamafile_sgemm_arm80(long, long, long, const void *, long, const void *, l
 bool llamafile_sgemm_arm82(long, long, long, const void *, long, const void *, long, void *, long,
                            int, int, int, int, int, int, int);
 
-bool llamafile_mixmul_unsupported(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_unsupported(const struct ggml_compute_params *, const struct ggml_tensor *,
                                   const struct ggml_tensor *, const struct ggml_tensor *,
                                   struct ggml_tensor *);
-bool llamafile_mixmul_amd_avx(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_amd_avx(const struct ggml_compute_params *, const struct ggml_tensor *,
                               const struct ggml_tensor *, const struct ggml_tensor *,
                               struct ggml_tensor *);
-bool llamafile_mixmul_amd_fma(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_amd_fma(const struct ggml_compute_params *, const struct ggml_tensor *,
                               const struct ggml_tensor *, const struct ggml_tensor *,
                               struct ggml_tensor *);
-bool llamafile_mixmul_amd_avx2(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_amd_avx2(const struct ggml_compute_params *, const struct ggml_tensor *,
                                const struct ggml_tensor *, const struct ggml_tensor *,
                                struct ggml_tensor *);
-bool llamafile_mixmul_amd_avxvnni(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_amd_avxvnni(const struct ggml_compute_params *, const struct ggml_tensor *,
                                   const struct ggml_tensor *, const struct ggml_tensor *,
                                   struct ggml_tensor *);
-bool llamafile_mixmul_amd_avx512f(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_amd_avx512f(const struct ggml_compute_params *, const struct ggml_tensor *,
                                   const struct ggml_tensor *, const struct ggml_tensor *,
                                   struct ggml_tensor *);
-bool llamafile_mixmul_amd_zen4(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_amd_zen4(const struct ggml_compute_params *, const struct ggml_tensor *,
                                const struct ggml_tensor *, const struct ggml_tensor *,
                                struct ggml_tensor *);
-bool llamafile_mixmul_arm80(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_arm80(const struct ggml_compute_params *, const struct ggml_tensor *,
                             const struct ggml_tensor *, const struct ggml_tensor *,
                             struct ggml_tensor *);
-bool llamafile_mixmul_arm82(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_arm82(const struct ggml_compute_params *, const struct ggml_tensor *,
                             const struct ggml_tensor *, const struct ggml_tensor *,
                             struct ggml_tensor *);
 
8 changes: 4 additions & 4 deletions llamafile/tinyblas_cpu_mixmul.inc
@@ -74,8 +74,8 @@ namespace {
 
 class MixMul {
   public:
-    MixMul(ggml_compute_params *params, const ggml_tensor *weights, const ggml_tensor *thought,
-           const ggml_tensor *plan, ggml_tensor *result)
+    MixMul(const ggml_compute_params *params, const ggml_tensor *weights,
+           const ggml_tensor *thought, const ggml_tensor *plan, ggml_tensor *result)
         : params(params),
           weights(weights),
           thought(thought),
@@ -371,7 +371,7 @@ class MixMul {
         return res;
     }
 
-    ggml_compute_params *const params;
+    const ggml_compute_params *const params;
     const ggml_tensor *const weights;
     const ggml_tensor *const thought;
     const ggml_tensor *const plan;
@@ -400,7 +400,7 @@ class MixMul {
 /**
  * Performs "mixture of experts" tensor multiplication on CPU.
  */
-bool llamafile_mixmul(ggml_compute_params *params, const ggml_tensor *weights,
+bool llamafile_mixmul(const ggml_compute_params *params, const ggml_tensor *weights,
                       const ggml_tensor *thought, const ggml_tensor *plan, ggml_tensor *result) {
     MixMul mm{params, weights, thought, plan, result};
     return mm.allocate_shared_memory() && mm.mixmul();
2 changes: 1 addition & 1 deletion llamafile/tinyblas_cpu_unsupported.cpp
@@ -23,7 +23,7 @@ bool llamafile_sgemm_unsupported(int m, int n, int k, const void *A, int lda, co
     return false;
 }
 
-bool llamafile_mixmul_unsupported(struct ggml_compute_params *params,
+bool llamafile_mixmul_unsupported(const struct ggml_compute_params *params,
                                   const struct ggml_tensor *weights,
                                   const struct ggml_tensor *thought, const struct ggml_tensor *plan,
                                   struct ggml_tensor *result) {
