Add --precise and --fast flags
jart committed May 3, 2024
1 parent 89c189e commit bbae0f6
Showing 14 changed files with 1,613 additions and 942 deletions.
10 changes: 10 additions & 0 deletions llama.cpp/common.cpp
@@ -279,6 +279,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     if (arg == "--cli") {
         return true;
     }
+    if (arg == "--fast") {
+        FLAG_precise = false;
+        FLAG_precision_specified = true;
+        return true;
+    }
+    if (arg == "--precise") {
+        FLAG_precise = true;
+        FLAG_precision_specified = true;
+        return true;
+    }
     if (arg == "--trap") {
         FLAG_trap = true;
         FLAG_unsecure = true; // for better backtraces
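FLAG_precise and FLAG_precision_specified are globals defined elsewhere in the tree; the second records whether the user made an explicit choice, since the man page below leaves the default unspecified. A hypothetical sketch of how a caller might resolve that default (the helper name and fallback policy are illustrative assumptions, not code from this commit):

// Hypothetical helper, not part of this commit: decide which math mode
// to use when neither --fast nor --precise was passed.
static bool llamafile_wants_precise_math(void) {
    if (FLAG_precision_specified)
        return FLAG_precise;   // honor an explicit --precise / --fast
    return FLAG_trap;          // assumed fallback: trapping favors the careful path
}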
18 changes: 10 additions & 8 deletions llama.cpp/ggml-vector.inc
@@ -1189,15 +1189,15 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
     int i = 0;

 #if defined(__AVX512F__)
-    if (!FLAG_trap) {
+    if (!FLAG_trap && !FLAG_precise) {
         for (; i + 15 < n; i += 16) {
             _mm512_storeu_ps(y + i, llamafile_silu_avx512(_mm512_loadu_ps(x + i)));
         }
     }
 #endif

 #if defined(__AVX2__) && defined(__FMA__)
-    if (!FLAG_trap) {
+    if (!FLAG_trap && !FLAG_precise) {
         for (; i + 7 < n; i += 8) {
             _mm256_storeu_ps(y + i, llamafile_silu_avx2fma(_mm256_loadu_ps(x + i)));
         }
@@ -1288,7 +1288,7 @@ float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max)
     ggml_float sum = 0;

 #if defined(__AVX512F__)
-    if (!FLAG_trap) {
+    if (!FLAG_trap && !FLAG_precise) {
         for (; i + 15 < n; i += 16) {
             __m512 val = llamafile_expf_avx512(_mm512_sub_ps(_mm512_loadu_ps(x + i),
                                                              _mm512_set1_ps(max)));
@@ -1299,7 +1299,7 @@ float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max)
 #endif

 #if defined(__AVX2__) && defined(__FMA__)
-    if (!FLAG_trap) {
+    if (!FLAG_trap && !FLAG_precise) {
         for (; i + 7 < n; i += 8) {
             __m256 val = llamafile_expf_avx2fma(_mm256_sub_ps(_mm256_loadu_ps(x + i),
                                                               _mm256_set1_ps(max)));
@@ -1314,10 +1314,12 @@ float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max)
 #endif

 #ifdef __ARM_NEON
-    for (; i + 3 < n; i += 4) {
-        float32x4_t val = llamafile_expf_neon(vsubq_f32(vld1q_f32(x + i), vdupq_n_f32(max)));
-        vst1q_f32(y + i, val);
-        sum += vaddvq_f32(val);
+    if (!FLAG_trap) {
+        for (; i + 3 < n; i += 4) {
+            float32x4_t val = llamafile_expf_neon(vsubq_f32(vld1q_f32(x + i), vdupq_n_f32(max)));
+            vst1q_f32(y + i, val);
+            sum += vaddvq_f32(val);
+        }
     }
 #endif
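The hunks above only skip the vectorized kernels when --trap or --precise is in effect; the scalar fallbacks compute the same quantities. As a rough reference for what those kernels evaluate (a simplified sketch, not the exact fallback code in ggml-vector.inc):

#include <math.h>

// silu(x) = x * sigmoid(x) = x / (1 + exp(-x))
static void vec_silu_f32_scalar(int n, float *y, const float *x) {
    for (int i = 0; i < n; i++)
        y[i] = x[i] / (1.0f + expf(-x[i]));
}

// Softmax numerator pass: y[i] = exp(x[i] - max), returning the running sum.
// Subtracting the max keeps expf() away from overflow; the real code
// accumulates in the wider ggml_float type.
static float vec_soft_max_f32_scalar(int n, float *y, const float *x, float max) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) {
        y[i] = expf(x[i] - max);
        sum += y[i];
    }
    return (float) sum;
}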
32 changes: 32 additions & 0 deletions llama.cpp/main/main.1
@@ -600,10 +600,36 @@ useful for shell scripts when the
 flag is also supplied.
 .It Fl Fl grammar-file Ar FNAME
 File to read grammar from.
+.It Fl Fl fast
+Put llamafile into fast math mode. This disables algorithms that reduce
+floating point rounding, e.g. Kahan summation, and certain functions
+like expf() will be vectorized but handle underflows less gracefully.
+It's unspecified whether llamafile runs in fast or precise math mode
+when neither flag is specified.
+.It Fl Fl precise
+Put llamafile into precise math mode. This enables algorithms that
+reduce floating point rounding, e.g. Kahan summation, and certain
+functions like expf() will always handle subnormals correctly. It's
+unspecified whether llamafile runs in fast or precise math mode when
+neither flag is specified.
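The Kahan summation mentioned above is compensated summation: it carries the rounding error of each addition forward so long reductions lose less precision. A minimal illustrative sketch in C (not the ggml implementation):

// Compensated (Kahan) summation: c accumulates the low-order bits that
// a plain `sum += x[i]` would round away.
float kahan_sum(const float *x, int n) {
    float sum = 0.0f, c = 0.0f;
    for (int i = 0; i < n; i++) {
        float y = x[i] - c;   // subtract the error carried from the last step
        float t = sum + y;    // big + small: low bits of y may be lost here...
        c = (t - sum) - y;    // ...recover exactly what was lost
        sum = t;
    }
    return sum;
}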
+.It Fl Fl trap
+Put llamafile into math trapping mode. When floating point exceptions
+occur, such as NaNs, overflow, and divide by zero, llamafile will print
+a warning to the console. This warning will include a C++ backtrace the
+first time an exception is trapped. The op graph will also be dumped to
+a file, and llamafile will report the specific op where the exception
+occurred. This is useful for troubleshooting when reporting issues.
+Using this feature will disable sandboxing. Math trapping is only
+possible if your CPU supports it. That is generally the case on AMD64,
+however it's less common on ARM64.
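For background, on AMD64 with glibc this kind of trapping is typically armed by unmasking FPU exceptions so the faulting instruction raises SIGFPE. The sketch below illustrates that general mechanism only; it is not llamafile's actual trap handler:

#define _GNU_SOURCE
#include <fenv.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void on_fpe(int sig) {
    // A real handler would capture a backtrace and dump the op graph here.
    fprintf(stderr, "caught SIGFPE: invalid/overflow/divide-by-zero\n");
    _Exit(1);
}

int main(void) {
    signal(SIGFPE, on_fpe);
    feenableexcept(FE_INVALID | FE_OVERFLOW | FE_DIVBYZERO);  // glibc extension
    volatile float zero = 0.0f;
    float x = 1.0f / zero;  // traps instead of silently producing +inf
    printf("%f\n", x);
}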
 .It Fl Fl prompt-cache Ar FNAME
 File to cache prompt state for faster startup.
 .Pp
 Default: none
+.It Fl fa Ar FNAME , Fl Fl flash-attn
+Enable Flash Attention. This is a mathematical shortcut that can speed
+up inference for certain models. This feature is still under active
+development.
 .It Fl Fl prompt-cache-all
 If specified, saves user input and generations to cache as well. Not supported with
 .Fl Fl interactive
@@ -653,8 +679,14 @@ Enable colorized output to visually distinguish between
 prompts, user input, and generated text.
 .It Fl Fl no-display-prompt , Fl Fl silent-prompt
 Don't echo the prompt itself to standard output.
+.It Fl Fl keep Ar N
+Specifies number of tokens to keep from the initial prompt. The default
+is -1 which means all tokens.
 .It Fl Fl multiline-input
 Allows you to write or paste multiple lines without ending each in '\[rs]'.
+.It Fl Fl cont-batching
+Enables continuous batching, a.k.a. dynamic batching.
 .El
 .Sh SERVER OPTIONS
 The following options may be specified when
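A brief usage sketch of the new switches (the model file and prompt are placeholders):

llamafile -m model.gguf --precise -p 'hello world'   # force the careful math paths
llamafile -m model.gguf --fast -p 'hello world'      # allow the vectorized shortcuts
llamafile -m model.gguf --trap -p 'hello world'      # warn + backtrace on FP exceptions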