@@ -94,7 +94,6 @@ struct slot_params
     bool stream       = true;
     bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt

-    uint32_t seed = -1; // RNG seed
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_discard =
         0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
@@ -1100,7 +1099,7 @@ struct server_context
                 sampler_names.emplace_back(sampler_name);
             }
         }
-        slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
+        slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
     }
     else
     {
@@ -1120,7 +1119,6 @@ struct server_context
                 send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
                 return false;
             }
-            llama_set_rng_seed(ctx, slot.params.seed);
         }

         slot.command = SLOT_COMMAND_LOAD_PROMPT;
@@ -1374,13 +1372,13 @@ struct server_context
         samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
         for (const auto &sampler_type : slot.sparams.samplers_sequence)
         {
-            samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
+            samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
         }

         return json{{"n_ctx", slot.n_ctx},
                     {"n_predict", slot.n_predict},
                     {"model", params.model_alias},
-                    {"seed", slot.params.seed},
+                    {"seed", slot.sparams.seed},
                     {"temperature", slot.sparams.temp},
                     {"dynatemp_range", slot.sparams.dynatemp_range},
                     {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
@@ -2143,7 +2141,7 @@ struct server_context
                     slot.command = SLOT_COMMAND_NONE;
                     slot.release();
                     slot.print_timings();
-                    send_final_response(slot);
+                    send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
                     continue;
                 }
             }
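Taken together, these hunks drop the server's own per-slot RNG seed: the seed field leaves slot_params, the llama_set_rng_seed() call on the shared llama_context goes away, and the seed is instead read from the sampling parameters (slot.sparams.seed), alongside the rename of the sampler-sequence helpers to the llama_sampling_* prefix. A minimal sketch of the resulting flow, assuming the llama_sampling_params / llama_sampling_init API from llama.cpp's common/sampling.h at the time of this change (treat the exact field and function names as assumptions, since they have since been reworked):

// Sketch only: llama_sampling_params, llama_sampling_init and
// llama_sampling_free are the common/sampling.h helpers assumed here.
llama_sampling_params sparams;
sparams.seed = 42; // the per-request seed now travels with the sampling params

// The sampler's RNG is seeded when the per-slot sampling context is
// created, so no separate llama_set_rng_seed(ctx, seed) call on the
// shared llama_context is needed anymore.
llama_sampling_context *ctx_sampling = llama_sampling_init(sparams);
// ... sample tokens for this slot via ctx_sampling ...
llama_sampling_free(ctx_sampling);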