
Commit ad209ce

TensorRT-LLM backend v0.16 release (#668)
1 parent 8534b1c commit ad209ce

45 files changed (+2028 −308 lines)

README.md

Lines changed: 6 additions & 6 deletions

```diff
@@ -73,7 +73,7 @@ repo. If you don't find your answer there you can ask questions on the
   - [Scheduling](#scheduling)
   - [Key-Value Cache](#key-value-cache)
   - [Decoding](#decoding)
-    - [Decoding Modes - Top-k, Top-p, Top-k Top-p, Beam Search and Medusa](#decoding-modes---top-k-top-p-top-k-top-p-beam-search-and-medusa)
+    - [Decoding Modes - Top-k, Top-p, Top-k Top-p, Beam Search, Medusa, ReDrafter, Lookahead and Eagle](#decoding-modes---top-k-top-p-top-k-top-p-beam-search-medusa-redrafter-lookahead-and-eagle)
   - [Speculative Decoding](#speculative-decoding)
   - [Chunked Context](#chunked-context)
   - [Quantization](#quantization)
@@ -606,15 +606,15 @@ TRT-LLM engine. Parameters for KV cache can be found in the

 ### Decoding

-#### Decoding Modes - Top-k, Top-p, Top-k Top-p, Beam Search and Medusa
+#### Decoding Modes - Top-k, Top-p, Top-k Top-p, Beam Search, Medusa, ReDrafter, Lookahead and Eagle

 TensorRT-LLM supports various decoding modes, including top-k, top-p,
-top-k top-p, beam search and Medusa. See the
+top-k top-p, beam search Medusa, ReDrafter, Lookahead and Eagle. See the
 [Sampling Parameters](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/gpt-runtime.md#sampling-parameters)
 section to learn more about top-k, top-p, top-k top-p and beam search decoding.
-For more details on Medusa, please refer to the
-[Medusa Decoding](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/medusa)
-documentation.
+Please refer to the
+[speculative decoding documentation](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/speculative-decoding.md)
+for more details on Medusa, ReDrafter, Lookahead and Eagle.

 Parameters for decoding modes can be found in the
 [model config](./docs/model_config.md#tensorrt_llm_model) of tensorrt_llm model.
```
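
For the per-request modes (top-k, top-p, beam search), decoding is driven by sampling tensors on the Triton request. Below is a minimal, hedged sketch of a top-k / top-p request sent through the `ensemble` model with `tritonclient`; the tensor names (`text_input`, `max_tokens`, `top_k`, `top_p`, `temperature`, `text_output`) and the gRPC address are assumptions based on the ensemble config shipped with the backend and should be checked against your deployment.

```python
# Hedged sketch: per-request top-k / top-p sampling through the Triton ensemble.
# Tensor names and the gRPC address are assumptions; verify them against the
# ensemble config.pbtxt of your deployment.
import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype


def prepare_tensor(name, arr):
    """Wrap a numpy array as a Triton InferInput."""
    t = grpcclient.InferInput(name, arr.shape, np_to_triton_dtype(arr.dtype))
    t.set_data_from_numpy(arr)
    return t


client = grpcclient.InferenceServerClient("localhost:8001")
inputs = [
    prepare_tensor("text_input", np.array([["What is machine learning?"]], dtype=object)),
    prepare_tensor("max_tokens", np.array([[64]], dtype=np.int32)),
    prepare_tensor("top_k", np.array([[40]], dtype=np.int32)),
    prepare_tensor("top_p", np.array([[0.9]], dtype=np.float32)),
    prepare_tensor("temperature", np.array([[0.7]], dtype=np.float32)),
]
result = client.infer("ensemble", inputs)
print(result.as_numpy("text_output"))
```

The speculative modes (Medusa, ReDrafter, Lookahead, Eagle) are tied to how the engine is built and are configured through the tensorrt_llm model config rather than per request, as the README's model-config reference notes.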

all_models/disaggregated_serving/disaggregated_serving_bls/config.pbtxt

Lines changed: 22 additions & 0 deletions

```diff
@@ -229,6 +229,13 @@ input [
     dims: [ 1 ]
     optional: true
   },
+  {
+    name: "return_kv_cache_reuse_stats"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "exclude_input_in_output"
     data_type: TYPE_BOOL
@@ -349,6 +356,21 @@ output [
     name: "sequence_index"
     data_type: TYPE_INT32
     dims: [ 1 ]
+  },
+  {
+    name: "kv_cache_alloc_new_blocks"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "kv_cache_reused_blocks"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "kv_cache_alloc_total_blocks"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
   }
 ]
 instance_group [
```
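
These additions let a client opt in to per-request KV-cache block statistics. The hedged sketch below shows what such a request might look like against the BLS model with `tritonclient`; only `return_kv_cache_reuse_stats` and the three `kv_cache_*` outputs come from this commit, while the model name `disaggregated_serving_bls`, the prompt/length tensors, the `text_output` output, and a non-streaming call are assumptions that must match the deployed config.

```python
# Hedged sketch: requesting the new KV-cache reuse statistics.
# Only return_kv_cache_reuse_stats and the kv_cache_* outputs are from this
# commit; all other names are assumptions about the deployed BLS model.
import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype


def prepare_tensor(name, arr):
    """Wrap a numpy array as a Triton InferInput."""
    t = grpcclient.InferInput(name, arr.shape, np_to_triton_dtype(arr.dtype))
    t.set_data_from_numpy(arr)
    return t


client = grpcclient.InferenceServerClient("localhost:8001")
inputs = [
    prepare_tensor("text_input", np.array([["What is the capital of France?"]], dtype=object)),
    prepare_tensor("max_tokens", np.array([[64]], dtype=np.int32)),
    # Opt in to the new per-request KV-cache statistics.
    prepare_tensor("return_kv_cache_reuse_stats", np.array([[True]], dtype=bool)),
]
outputs = [
    grpcclient.InferRequestedOutput("text_output"),
    grpcclient.InferRequestedOutput("kv_cache_alloc_new_blocks"),
    grpcclient.InferRequestedOutput("kv_cache_reused_blocks"),
    grpcclient.InferRequestedOutput("kv_cache_alloc_total_blocks"),
]
result = client.infer("disaggregated_serving_bls", inputs, outputs=outputs)
print("new blocks:   ", result.as_numpy("kv_cache_alloc_new_blocks"))
print("reused blocks:", result.as_numpy("kv_cache_reused_blocks"))
print("total blocks: ", result.as_numpy("kv_cache_alloc_total_blocks"))
```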

all_models/inflight_batcher_llm/ensemble/config.pbtxt

Lines changed: 47 additions & 0 deletions

```diff
@@ -62,6 +62,12 @@ input [
     dims: [ -1 ]
     optional: true
   },
+  {
+    name: "exclude_input_in_output"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
   {
     name: "end_id"
     data_type: TYPE_INT32
@@ -146,6 +152,12 @@ input [
     dims: [ 1 ]
     optional: true
   },
+  {
+    name: "return_kv_cache_reuse_stats"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
   {
     name: "beam_width"
     data_type: TYPE_INT32
@@ -265,6 +277,21 @@ output [
     name: "sequence_index"
     data_type: TYPE_INT32
     dims: [ 1 ]
+  },
+  {
+    name: "kv_cache_alloc_new_blocks"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "kv_cache_reused_blocks"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "kv_cache_alloc_total_blocks"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
   }
 ]
 ensemble_scheduling {
@@ -376,6 +403,10 @@ ensemble_scheduling {
        key: "decoder_input_lengths"
        value: "_REQUEST_DECODER_INPUT_LEN"
      }
+     input_map {
+       key: "exclude_input_in_output"
+       value: "exclude_input_in_output"
+     }
      input_map {
        key: "request_output_len"
        value: "_REQUEST_OUTPUT_LEN"
@@ -440,6 +471,10 @@ ensemble_scheduling {
        key: "return_generation_logits"
        value: "return_generation_logits"
      }
+     input_map {
+       key: "return_kv_cache_reuse_stats"
+       value: "return_kv_cache_reuse_stats"
+     }
      input_map {
        key: "num_return_sequences"
        value: "num_return_sequences"
@@ -515,6 +550,18 @@ ensemble_scheduling {
      output_map {
        key: "sequence_index"
        value: "sequence_index"
+     },
+     output_map {
+       key: "kv_cache_alloc_new_blocks"
+       value: "kv_cache_alloc_new_blocks"
+     },
+     output_map {
+       key: "kv_cache_reused_blocks"
+       value: "kv_cache_reused_blocks"
+     },
+     output_map {
+       key: "kv_cache_alloc_total_blocks"
+       value: "kv_cache_alloc_total_blocks"
      }
    },
    {
```
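
The ensemble gains the same KV-cache statistics plumbing plus an `exclude_input_in_output` passthrough to the tensorrt_llm model, so both switches can now be set directly on an `ensemble` request instead of going through the BLS model. A minimal, hedged sketch of the two new optional BOOL tensors follows; the `[1, 1]` client-side shape assumes the ensemble's usual leading batch dimension.

```python
# Hedged sketch: the two request-level switches this commit wires through the
# ensemble. Both are optional TYPE_BOOL tensors with dims [ 1 ]; the [1, 1]
# client-side shape assumes the ensemble's usual leading batch dimension.
import numpy as np
import tritonclient.grpc as grpcclient


def bool_input(name, value):
    """Build a [1, 1] BOOL InferInput from a Python bool."""
    t = grpcclient.InferInput(name, [1, 1], "BOOL")
    t.set_data_from_numpy(np.array([[value]], dtype=bool))
    return t


# Return only generated tokens, without echoing the prompt (forwarded to the
# tensorrt_llm model via the new input_map entry).
exclude_input = bool_input("exclude_input_in_output", True)

# Ask for kv_cache_alloc_new_blocks / kv_cache_reused_blocks /
# kv_cache_alloc_total_blocks in the response (new output_map entries).
kv_stats = bool_input("return_kv_cache_reuse_stats", True)

# Append both tensors to the input list of an ensemble request built as in the
# earlier sketches.
```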
