
Commit ad209ce

TensorRT-LLM backend v0.16 release (#668)
1 parent 8534b1c commit ad209ce

45 files changed (+2028 −308 lines)

README.md

Lines changed: 6 additions & 6 deletions

```diff
@@ -73,7 +73,7 @@ repo. If you don't find your answer there you can ask questions on the
   - [Scheduling](#scheduling)
   - [Key-Value Cache](#key-value-cache)
   - [Decoding](#decoding)
-    - [Decoding Modes - Top-k, Top-p, Top-k Top-p, Beam Search and Medusa](#decoding-modes---top-k-top-p-top-k-top-p-beam-search-and-medusa)
+    - [Decoding Modes - Top-k, Top-p, Top-k Top-p, Beam Search, Medusa, ReDrafter, Lookahead and Eagle](#decoding-modes---top-k-top-p-top-k-top-p-beam-search-medusa-redrafter-lookahead-and-eagle)
   - [Speculative Decoding](#speculative-decoding)
   - [Chunked Context](#chunked-context)
   - [Quantization](#quantization)
@@ -606,15 +606,15 @@ TRT-LLM engine. Parameters for KV cache can be found in the

 ### Decoding

-#### Decoding Modes - Top-k, Top-p, Top-k Top-p, Beam Search and Medusa
+#### Decoding Modes - Top-k, Top-p, Top-k Top-p, Beam Search, Medusa, ReDrafter, Lookahead and Eagle

 TensorRT-LLM supports various decoding modes, including top-k, top-p,
-top-k top-p, beam search and Medusa. See the
+top-k top-p, beam search Medusa, ReDrafter, Lookahead and Eagle. See the
 [Sampling Parameters](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/gpt-runtime.md#sampling-parameters)
 section to learn more about top-k, top-p, top-k top-p and beam search decoding.
-For more details on Medusa, please refer to the
-[Medusa Decoding](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/medusa)
-documentation.
+Please refer to the
+[speculative decoding documentation](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/speculative-decoding.md)
+for more details on Medusa, ReDrafter, Lookahead and Eagle.

 Parameters for decoding modes can be found in the
 [model config](./docs/model_config.md#tensorrt_llm_model) of tensorrt_llm model.
```
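
For the per-request modes (top-k, top-p, beam search), decoding is driven by sampling tensors on the Triton request. Below is a minimal, hedged sketch of a top-k / top-p request sent through the `ensemble` model with `tritonclient`; the tensor names (`text_input`, `max_tokens`, `top_k`, `top_p`, `temperature`, `text_output`) and the gRPC address are assumptions based on the ensemble config shipped with the backend and should be checked against your deployment.

```python
# Hedged sketch: per-request top-k / top-p sampling through the Triton ensemble.
# Tensor names and the gRPC address are assumptions; verify them against the
# ensemble config.pbtxt of your deployment.
import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype


def prepare_tensor(name, arr):
    """Wrap a numpy array as a Triton InferInput."""
    t = grpcclient.InferInput(name, arr.shape, np_to_triton_dtype(arr.dtype))
    t.set_data_from_numpy(arr)
    return t


client = grpcclient.InferenceServerClient("localhost:8001")
inputs = [
    prepare_tensor("text_input", np.array([["What is machine learning?"]], dtype=object)),
    prepare_tensor("max_tokens", np.array([[64]], dtype=np.int32)),
    prepare_tensor("top_k", np.array([[40]], dtype=np.int32)),
    prepare_tensor("top_p", np.array([[0.9]], dtype=np.float32)),
    prepare_tensor("temperature", np.array([[0.7]], dtype=np.float32)),
]
result = client.infer("ensemble", inputs)
print(result.as_numpy("text_output"))
```

The speculative modes (Medusa, ReDrafter, Lookahead, Eagle) are tied to how the engine is built and are configured through the tensorrt_llm model config rather than per request, as the README's model-config reference notes.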

all_models/disaggregated_serving/disaggregated_serving_bls/config.pbtxt

Lines changed: 22 additions & 0 deletions

```diff
@@ -229,6 +229,13 @@ input [
     dims: [ 1 ]
     optional: true
   },
+  {
+    name: "return_kv_cache_reuse_stats"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "exclude_input_in_output"
     data_type: TYPE_BOOL
@@ -349,6 +356,21 @@ output [
     name: "sequence_index"
     data_type: TYPE_INT32
     dims: [ 1 ]
+  },
+  {
+    name: "kv_cache_alloc_new_blocks"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "kv_cache_reused_blocks"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "kv_cache_alloc_total_blocks"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
   }
 ]
 instance_group [
```
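
These additions let a client opt in to per-request KV-cache block statistics. The hedged sketch below shows what such a request might look like against the BLS model with `tritonclient`; only `return_kv_cache_reuse_stats` and the three `kv_cache_*` outputs come from this commit, while the model name `disaggregated_serving_bls`, the prompt/length tensors, the `text_output` output, and a non-streaming call are assumptions that must match the deployed config.

```python
# Hedged sketch: requesting the new KV-cache reuse statistics.
# Only return_kv_cache_reuse_stats and the kv_cache_* outputs are from this
# commit; all other names are assumptions about the deployed BLS model.
import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype


def prepare_tensor(name, arr):
    """Wrap a numpy array as a Triton InferInput."""
    t = grpcclient.InferInput(name, arr.shape, np_to_triton_dtype(arr.dtype))
    t.set_data_from_numpy(arr)
    return t


client = grpcclient.InferenceServerClient("localhost:8001")
inputs = [
    prepare_tensor("text_input", np.array([["What is the capital of France?"]], dtype=object)),
    prepare_tensor("max_tokens", np.array([[64]], dtype=np.int32)),
    # Opt in to the new per-request KV-cache statistics.
    prepare_tensor("return_kv_cache_reuse_stats", np.array([[True]], dtype=bool)),
]
outputs = [
    grpcclient.InferRequestedOutput("text_output"),
    grpcclient.InferRequestedOutput("kv_cache_alloc_new_blocks"),
    grpcclient.InferRequestedOutput("kv_cache_reused_blocks"),
    grpcclient.InferRequestedOutput("kv_cache_alloc_total_blocks"),
]
result = client.infer("disaggregated_serving_bls", inputs, outputs=outputs)
print("new blocks:   ", result.as_numpy("kv_cache_alloc_new_blocks"))
print("reused blocks:", result.as_numpy("kv_cache_reused_blocks"))
print("total blocks: ", result.as_numpy("kv_cache_alloc_total_blocks"))
```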

all_models/inflight_batcher_llm/ensemble/config.pbtxt

Lines changed: 47 additions & 0 deletions

```diff
@@ -62,6 +62,12 @@ input [
     dims: [ -1 ]
     optional: true
   },
+  {
+    name: "exclude_input_in_output"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
   {
     name: "end_id"
     data_type: TYPE_INT32
@@ -146,6 +152,12 @@ input [
     dims: [ 1 ]
     optional: true
   },
+  {
+    name: "return_kv_cache_reuse_stats"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
   {
     name: "beam_width"
     data_type: TYPE_INT32
@@ -265,6 +277,21 @@ output [
     name: "sequence_index"
     data_type: TYPE_INT32
     dims: [ 1 ]
+  },
+  {
+    name: "kv_cache_alloc_new_blocks"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "kv_cache_reused_blocks"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "kv_cache_alloc_total_blocks"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
   }
 ]
 ensemble_scheduling {
@@ -376,6 +403,10 @@ ensemble_scheduling {
        key: "decoder_input_lengths"
        value: "_REQUEST_DECODER_INPUT_LEN"
      }
+     input_map {
+       key: "exclude_input_in_output"
+       value: "exclude_input_in_output"
+     }
      input_map {
        key: "request_output_len"
        value: "_REQUEST_OUTPUT_LEN"
@@ -440,6 +471,10 @@ ensemble_scheduling {
        key: "return_generation_logits"
        value: "return_generation_logits"
      }
+     input_map {
+       key: "return_kv_cache_reuse_stats"
+       value: "return_kv_cache_reuse_stats"
+     }
      input_map {
        key: "num_return_sequences"
        value: "num_return_sequences"
@@ -515,6 +550,18 @@ ensemble_scheduling {
      output_map {
        key: "sequence_index"
        value: "sequence_index"
+     },
+     output_map {
+       key: "kv_cache_alloc_new_blocks"
+       value: "kv_cache_alloc_new_blocks"
+     },
+     output_map {
+       key: "kv_cache_reused_blocks"
+       value: "kv_cache_reused_blocks"
+     },
+     output_map {
+       key: "kv_cache_alloc_total_blocks"
+       value: "kv_cache_alloc_total_blocks"
      }
    },
    {
```
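
The ensemble gains the same KV-cache statistics plumbing plus an `exclude_input_in_output` passthrough to the tensorrt_llm model, so both switches can now be set directly on an `ensemble` request instead of going through the BLS model. A minimal, hedged sketch of the two new optional BOOL tensors follows; the `[1, 1]` client-side shape assumes the ensemble's usual leading batch dimension.

```python
# Hedged sketch: the two request-level switches this commit wires through the
# ensemble. Both are optional TYPE_BOOL tensors with dims [ 1 ]; the [1, 1]
# client-side shape assumes the ensemble's usual leading batch dimension.
import numpy as np
import tritonclient.grpc as grpcclient


def bool_input(name, value):
    """Build a [1, 1] BOOL InferInput from a Python bool."""
    t = grpcclient.InferInput(name, [1, 1], "BOOL")
    t.set_data_from_numpy(np.array([[value]], dtype=bool))
    return t


# Return only generated tokens, without echoing the prompt (forwarded to the
# tensorrt_llm model via the new input_map entry).
exclude_input = bool_input("exclude_input_in_output", True)

# Ask for kv_cache_alloc_new_blocks / kv_cache_reused_blocks /
# kv_cache_alloc_total_blocks in the response (new output_map entries).
kv_stats = bool_input("return_kv_cache_reuse_stats", True)

# Append both tensors to the input list of an ensemble request built as in the
# earlier sketches.
```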
