Add --precise and --fast flags
jart committed May 3, 2024
1 parent 89c189e commit bbae0f6
Showing 14 changed files with 1,613 additions and 942 deletions.
10 changes: 10 additions & 0 deletions llama.cpp/common.cpp
@@ -279,6 +279,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     if (arg == "--cli") {
         return true;
     }
+    if (arg == "--fast") {
+        FLAG_precise = false;
+        FLAG_precision_specified = true;
+        return true;
+    }
+    if (arg == "--precise") {
+        FLAG_precise = true;
+        FLAG_precision_specified = true;
+        return true;
+    }
     if (arg == "--trap") {
         FLAG_trap = true;
         FLAG_unsecure = true; // for better backtraces
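FLAG_precise and FLAG_precision_specified are globals defined elsewhere in the tree; the second records whether the user made an explicit choice, since the man page below leaves the default unspecified. A hypothetical sketch of how a caller might resolve that default (the helper name and fallback policy are illustrative assumptions, not code from this commit):

// Hypothetical helper, not part of this commit: decide which math mode
// to use when neither --fast nor --precise was passed.
static bool llamafile_wants_precise_math(void) {
    if (FLAG_precision_specified)
        return FLAG_precise;   // honor an explicit --precise / --fast
    return FLAG_trap;          // assumed fallback: trapping favors the careful path
}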
18 changes: 10 additions & 8 deletions llama.cpp/ggml-vector.inc
@@ -1189,15 +1189,15 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
     int i = 0;

 #if defined(__AVX512F__)
-    if (!FLAG_trap) {
+    if (!FLAG_trap && !FLAG_precise) {
         for (; i + 15 < n; i += 16) {
             _mm512_storeu_ps(y + i, llamafile_silu_avx512(_mm512_loadu_ps(x + i)));
         }
     }
 #endif

 #if defined(__AVX2__) && defined(__FMA__)
-    if (!FLAG_trap) {
+    if (!FLAG_trap && !FLAG_precise) {
         for (; i + 7 < n; i += 8) {
             _mm256_storeu_ps(y + i, llamafile_silu_avx2fma(_mm256_loadu_ps(x + i)));
         }
@@ -1288,7 +1288,7 @@ float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max)
     ggml_float sum = 0;

 #if defined(__AVX512F__)
-    if (!FLAG_trap) {
+    if (!FLAG_trap && !FLAG_precise) {
         for (; i + 15 < n; i += 16) {
             __m512 val = llamafile_expf_avx512(_mm512_sub_ps(_mm512_loadu_ps(x + i),
                                                              _mm512_set1_ps(max)));
@@ -1299,7 +1299,7 @@ float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max)
 #endif

 #if defined(__AVX2__) && defined(__FMA__)
-    if (!FLAG_trap) {
+    if (!FLAG_trap && !FLAG_precise) {
         for (; i + 7 < n; i += 8) {
             __m256 val = llamafile_expf_avx2fma(_mm256_sub_ps(_mm256_loadu_ps(x + i),
                                                               _mm256_set1_ps(max)));
@@ -1314,10 +1314,12 @@ float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max)
 #endif

 #ifdef __ARM_NEON
-    for (; i + 3 < n; i += 4) {
-        float32x4_t val = llamafile_expf_neon(vsubq_f32(vld1q_f32(x + i), vdupq_n_f32(max)));
-        vst1q_f32(y + i, val);
-        sum += vaddvq_f32(val);
+    if (!FLAG_trap) {
+        for (; i + 3 < n; i += 4) {
+            float32x4_t val = llamafile_expf_neon(vsubq_f32(vld1q_f32(x + i), vdupq_n_f32(max)));
+            vst1q_f32(y + i, val);
+            sum += vaddvq_f32(val);
+        }
     }
 #endif
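The hunks above only skip the vectorized kernels when --trap or --precise is in effect; the scalar fallbacks compute the same quantities. As a rough reference for what those kernels evaluate (a simplified sketch, not the exact fallback code in ggml-vector.inc):

#include <math.h>

// silu(x) = x * sigmoid(x) = x / (1 + exp(-x))
static void vec_silu_f32_scalar(int n, float *y, const float *x) {
    for (int i = 0; i < n; i++)
        y[i] = x[i] / (1.0f + expf(-x[i]));
}

// Softmax numerator pass: y[i] = exp(x[i] - max), returning the running sum.
// Subtracting the max keeps expf() away from overflow; the real code
// accumulates in the wider ggml_float type.
static float vec_soft_max_f32_scalar(int n, float *y, const float *x, float max) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) {
        y[i] = expf(x[i] - max);
        sum += y[i];
    }
    return (float) sum;
}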
32 changes: 32 additions & 0 deletions llama.cpp/main/main.1
@@ -600,10 +600,36 @@ useful for shell scripts when the
 flag is also supplied.
 .It Fl Fl grammar-file Ar FNAME
 File to read grammar from.
+.It Fl Fl fast
+Put llamafile into fast math mode. This disables algorithms that reduce
+floating point rounding, e.g. Kahan summation, and certain functions
+like expf() will be vectorized but handle underflows less gracefully.
+It's unspecified whether llamafile runs in fast or precise math mode
+when neither flag is specified.
+.It Fl Fl precise
+Put llamafile into precise math mode. This enables algorithms that
+reduce floating point rounding, e.g. Kahan summation, and certain
+functions like expf() will always handle subnormals correctly. It's
+unspecified whether llamafile runs in fast or precise math mode when
+neither flag is specified.
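The Kahan summation mentioned above is compensated summation: it carries the rounding error of each addition forward so long reductions lose less precision. A minimal illustrative sketch in C (not the ggml implementation):

// Compensated (Kahan) summation: c accumulates the low-order bits that
// a plain `sum += x[i]` would round away.
float kahan_sum(const float *x, int n) {
    float sum = 0.0f, c = 0.0f;
    for (int i = 0; i < n; i++) {
        float y = x[i] - c;   // subtract the error carried from the last step
        float t = sum + y;    // big + small: low bits of y may be lost here...
        c = (t - sum) - y;    // ...recover exactly what was lost
        sum = t;
    }
    return sum;
}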
+.It Fl Fl trap
+Put llamafile into math trapping mode. When floating point exceptions
+occur, such as NaNs, overflow, and divide by zero, llamafile will print
+a warning to the console. This warning will include a C++ backtrace the
+first time an exception is trapped. The op graph will also be dumped to
+a file, and llamafile will report the specific op where the exception
+occurred. This is useful for troubleshooting when reporting issues.
+Using this feature will disable sandboxing. Math trapping is only
+possible if your CPU supports it. That is generally the case on AMD64,
+however it's less common on ARM64.
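For background, on AMD64 with glibc this kind of trapping is typically armed by unmasking FPU exceptions so the faulting instruction raises SIGFPE. The sketch below illustrates that general mechanism only; it is not llamafile's actual trap handler:

#define _GNU_SOURCE
#include <fenv.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void on_fpe(int sig) {
    // A real handler would capture a backtrace and dump the op graph here.
    fprintf(stderr, "caught SIGFPE: invalid/overflow/divide-by-zero\n");
    _Exit(1);
}

int main(void) {
    signal(SIGFPE, on_fpe);
    feenableexcept(FE_INVALID | FE_OVERFLOW | FE_DIVBYZERO);  // glibc extension
    volatile float zero = 0.0f;
    float x = 1.0f / zero;  // traps instead of silently producing +inf
    printf("%f\n", x);
}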
 .It Fl Fl prompt-cache Ar FNAME
 File to cache prompt state for faster startup.
 .Pp
 Default: none
+.It Fl fa Ar FNAME , Fl Fl flash-attn
+Enable Flash Attention. This is a mathematical shortcut that can speed
+up inference for certain models. This feature is still under active
+development.
 .It Fl Fl prompt-cache-all
 If specified, saves user input and generations to cache as well. Not supported with
 .Fl Fl interactive
@@ -653,8 +679,14 @@ Enable colorized output to visually distinguish between
 prompts, user input, and generated text.
 .It Fl Fl no-display-prompt , Fl Fl silent-prompt
 Don't echo the prompt itself to standard output.
+.It Fl Fl keep Ar N
+Specifies number of tokens to keep from the initial prompt. The default
+is -1 which means all tokens.
 .It Fl Fl multiline-input
 Allows you to write or paste multiple lines without ending each in '\[rs]'.
+.It Fl Fl cont-batching
+Enables continuous batching, a.k.a. dynamic batching.
 .El
 .Sh SERVER OPTIONS
 The following options may be specified when
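A brief usage sketch of the new switches (the model file and prompt are placeholders):

llamafile -m model.gguf --precise -p 'hello world'   # force the careful math paths
llamafile -m model.gguf --fast -p 'hello world'      # allow the vectorized shortcuts
llamafile -m model.gguf --trap -p 'hello world'      # warn + backtrace on FP exceptions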