Speed up prediction on CPUs with many cores
This change adds an if statement to the GGML synchronization code that
causes significantly fewer memory barriers to be used. A syncthreads
function has also been introduced so that GGML_OP_MUL_MAT can add its
initialization barrier on its own. That's important, because if
tinyBLAS doesn't need matrix B quantized, then the barrier can be
skipped entirely.
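
The ggml.c diff is not rendered below, so here is only a minimal sketch
of what a syncthreads-style barrier could look like. The struct
ggml_barrier name comes from the ggml.h hunk further down; the field
names, the ggml_syncthreads() signature, and the sense-reversing design
are assumptions made for illustration, not the code in this commit.

    // Hedged sketch of a reusable sense-reversing barrier in C11.
    // Assumes n_threads stays fixed for one graph evaluation. Names are
    // illustrative; the actual ggml.c implementation may differ.
    #include <stdatomic.h>

    struct ggml_barrier {
        atomic_int remaining;  // threads that have not yet arrived
        atomic_int phase;      // flips each time the barrier releases
        int n_threads;         // total number of participating threads
    };

    static void ggml_syncthreads(struct ggml_barrier *b) {
        int phase = atomic_load_explicit(&b->phase, memory_order_relaxed);
        if (atomic_fetch_sub_explicit(&b->remaining, 1, memory_order_acq_rel) == 1) {
            // last thread to arrive: reset the count and release the others
            atomic_store_explicit(&b->remaining, b->n_threads, memory_order_relaxed);
            atomic_fetch_add_explicit(&b->phase, 1, memory_order_release);
        } else {
            // wait for the phase to flip; a real implementation might yield
            // or futex-wait here rather than spin
            while (atomic_load_explicit(&b->phase, memory_order_acquire) == phase) {
            }
        }
    }

Because each op decides for itself whether to call such a barrier, a
full rendezvous only happens when an initialization pass actually wrote
data that other threads are about to read.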

This change also clamps the thread count to a maximum of 20 once the
prefill is completed. Charting thread count against speed for numerous
models on a Threadripper shows that twenty threads is consistently
optimal for prediction.

Compared to the blog post https://justine.lol/matmul/#professional the
token generation speed for TinyLLaMA 1.1B has increased from 52 to 98
tokens per second. Prompt processing is up to 2000 tokens per second.
With Mistral 7B the gains are more modest, going from 17 to 21 tokens
per second.
jart committed May 3, 2024
1 parent e2b3cb2 commit 89c189e
Showing 8 changed files with 295 additions and 259 deletions.
510 changes: 267 additions & 243 deletions llama.cpp/ggml.c

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions llama.cpp/ggml.h
@@ -681,6 +681,8 @@ extern "C" {
         GGML_TASK_TYPE_FINALIZE,
     };
 
+    struct ggml_barrier;
+
     struct ggml_compute_params {
         enum ggml_task_type type;
 
@@ -690,6 +692,8 @@ extern "C" {
         // work buffer for all threads
         size_t wsize;
         void * wdata;
+
+        struct ggml_barrier *barrier;
     };
 
     // numa strategies
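
With a barrier handle now carried in ggml_compute_params, an op such as
GGML_OP_MUL_MAT can synchronize only when its setup phase actually
shared data between threads. The sketch below illustrates that shape;
needs_quantized_B(), quantize_B_slice(), and the trimmed params struct
are hypothetical placeholders, not code from this commit.

    // Hedged sketch: skip the rendezvous entirely when tinyBLAS can use
    // matrix B in its original format. All helper names are placeholders.
    #include <stdbool.h>
    #include <stddef.h>

    struct ggml_barrier;                           // opaque, lives in ggml.c
    void ggml_syncthreads(struct ggml_barrier *);  // assumed entry point

    struct params_sketch {                         // trimmed stand-in for
        struct ggml_barrier *barrier;              // ggml_compute_params
        size_t wsize;
        void *wdata;
    };

    static bool needs_quantized_B(void) { return false; }               // placeholder
    static void quantize_B_slice(struct params_sketch *p) { (void)p; }  // placeholder

    static void mul_mat_sketch(struct params_sketch *params) {
        if (needs_quantized_B()) {
            // each thread quantizes its slice of B into the shared wdata
            // buffer, so all threads must rendezvous before reading it back
            quantize_B_slice(params);
            ggml_syncthreads(params->barrier);
        }
        // when tinyBLAS consumes B as-is, the quantization pass and its
        // barrier are skipped, which is the case this change optimizes
    }
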
8 changes: 8 additions & 0 deletions llama.cpp/llama.cpp
@@ -11306,6 +11306,14 @@ static int llama_decode_internal(
             n_threads = std::min(4, n_threads);
         }
 
+        // [jart] On CPUs with many cores (e.g. EPYC, Threadripper)
+        // using more than twenty threads for token prediction
+        // never helps. This number appears to be optimal for all
+        // models ranging from TinyLLaMA 1.1B to mighty Mixtral 8x22B.
+        if (n_tokens <= 2) {
+            n_threads = std::min(20, n_threads);
+        }
+
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
         llama_set_inputs(lctx, u_batch);
2 changes: 1 addition & 1 deletion llamafile/llamafile.h
@@ -59,7 +59,7 @@ bool llamafile_sgemm(long, long, long, const void *, long, const void *, long, v
 
 struct ggml_tensor;
 struct ggml_compute_params;
-bool llamafile_mixmul(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul(const struct ggml_compute_params *, const struct ggml_tensor *,
                       const struct ggml_tensor *, const struct ggml_tensor *, struct ggml_tensor *);
 size_t llamafile_mixmul_needs(const struct ggml_tensor *, const struct ggml_tensor *,
                               const struct ggml_tensor *);
2 changes: 1 addition & 1 deletion llamafile/sgemm.cpp
@@ -124,7 +124,7 @@ bool llamafile_sgemm(long m, long n, long k, const void *A, long lda, const void
 /**
  * Performs "mixture of experts" tensor multiplication on CPU.
  */
-bool llamafile_mixmul(ggml_compute_params *params, const ggml_tensor *weights,
+bool llamafile_mixmul(const ggml_compute_params *params, const ggml_tensor *weights,
                       const ggml_tensor *thought, const ggml_tensor *plan, ggml_tensor *result) {
     return funcs.mixmul(params, weights, thought, plan, result);
 }
18 changes: 9 additions & 9 deletions llamafile/sgemm.h
@@ -26,31 +26,31 @@ bool llamafile_sgemm_arm80(long, long, long, const void *, long, const void *, l
 bool llamafile_sgemm_arm82(long, long, long, const void *, long, const void *, long, void *, long,
                            int, int, int, int, int, int, int);
 
-bool llamafile_mixmul_unsupported(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_unsupported(const struct ggml_compute_params *, const struct ggml_tensor *,
                                   const struct ggml_tensor *, const struct ggml_tensor *,
                                   struct ggml_tensor *);
-bool llamafile_mixmul_amd_avx(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_amd_avx(const struct ggml_compute_params *, const struct ggml_tensor *,
                               const struct ggml_tensor *, const struct ggml_tensor *,
                               struct ggml_tensor *);
-bool llamafile_mixmul_amd_fma(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_amd_fma(const struct ggml_compute_params *, const struct ggml_tensor *,
                               const struct ggml_tensor *, const struct ggml_tensor *,
                               struct ggml_tensor *);
-bool llamafile_mixmul_amd_avx2(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_amd_avx2(const struct ggml_compute_params *, const struct ggml_tensor *,
                                const struct ggml_tensor *, const struct ggml_tensor *,
                                struct ggml_tensor *);
-bool llamafile_mixmul_amd_avxvnni(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_amd_avxvnni(const struct ggml_compute_params *, const struct ggml_tensor *,
                                   const struct ggml_tensor *, const struct ggml_tensor *,
                                   struct ggml_tensor *);
-bool llamafile_mixmul_amd_avx512f(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_amd_avx512f(const struct ggml_compute_params *, const struct ggml_tensor *,
                                   const struct ggml_tensor *, const struct ggml_tensor *,
                                   struct ggml_tensor *);
-bool llamafile_mixmul_amd_zen4(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_amd_zen4(const struct ggml_compute_params *, const struct ggml_tensor *,
                                const struct ggml_tensor *, const struct ggml_tensor *,
                                struct ggml_tensor *);
-bool llamafile_mixmul_arm80(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_arm80(const struct ggml_compute_params *, const struct ggml_tensor *,
                             const struct ggml_tensor *, const struct ggml_tensor *,
                             struct ggml_tensor *);
-bool llamafile_mixmul_arm82(struct ggml_compute_params *, const struct ggml_tensor *,
+bool llamafile_mixmul_arm82(const struct ggml_compute_params *, const struct ggml_tensor *,
                             const struct ggml_tensor *, const struct ggml_tensor *,
                             struct ggml_tensor *);
 
8 changes: 4 additions & 4 deletions llamafile/tinyblas_cpu_mixmul.inc
@@ -74,8 +74,8 @@ namespace {
 
 class MixMul {
   public:
-    MixMul(ggml_compute_params *params, const ggml_tensor *weights, const ggml_tensor *thought,
-           const ggml_tensor *plan, ggml_tensor *result)
+    MixMul(const ggml_compute_params *params, const ggml_tensor *weights,
+           const ggml_tensor *thought, const ggml_tensor *plan, ggml_tensor *result)
         : params(params),
           weights(weights),
           thought(thought),
@@ -371,7 +371,7 @@ class MixMul {
         return res;
     }
 
-    ggml_compute_params *const params;
+    const ggml_compute_params *const params;
     const ggml_tensor *const weights;
     const ggml_tensor *const thought;
     const ggml_tensor *const plan;
@@ -400,7 +400,7 @@ class MixMul {
 /**
  * Performs "mixture of experts" tensor multiplication on CPU.
  */
-bool llamafile_mixmul(ggml_compute_params *params, const ggml_tensor *weights,
+bool llamafile_mixmul(const ggml_compute_params *params, const ggml_tensor *weights,
                       const ggml_tensor *thought, const ggml_tensor *plan, ggml_tensor *result) {
     MixMul mm{params, weights, thought, plan, result};
     return mm.allocate_shared_memory() && mm.mixmul();
2 changes: 1 addition & 1 deletion llamafile/tinyblas_cpu_unsupported.cpp
@@ -23,7 +23,7 @@ bool llamafile_sgemm_unsupported(int m, int n, int k, const void *A, int lda, co
     return false;
 }
 
-bool llamafile_mixmul_unsupported(struct ggml_compute_params *params,
+bool llamafile_mixmul_unsupported(const struct ggml_compute_params *params,
                                   const struct ggml_tensor *weights,
                                   const struct ggml_tensor *thought, const struct ggml_tensor *plan,
                                   struct ggml_tensor *result) {
