
Commit 546f1ca

Remove unused params in attn
Signed-off-by: yizhang-nv <[email protected]>
1 parent: 04b1126

File tree

15 files changed: +110 / -141 lines


cpp/tensorrt_llm/common/attentionOp.h

Lines changed: 0 additions & 2 deletions
@@ -127,7 +127,6 @@ class AttentionOp
 public:
     // Attention packed mask input (used by context FMHA).
     uint32_t const* attention_packed_mask = nullptr;
-    kernels::KVBlockArray::DataType* host_block_offsets = nullptr;
     int32_t batch_size = 0;
     float2 const* mrope_rotary_cos_sin = nullptr;

@@ -182,7 +181,6 @@ class AttentionOp
     ss << "context_buf_sf: " << this->context_buf_sf << std::endl;
     ss << "key_value_cache: " << (half*) this->key_value_cache << std::endl;
     ss << "block_offsets: " << this->block_offsets << std::endl;
-    ss << "host_block_offsets: " << this->host_block_offsets << std::endl;
     ss << "host_primary_pool_pointer: " << this->host_primary_pool_pointer << std::endl;
     ss << "host_secondary_pool_pointer: " << this->host_secondary_pool_pointer << std::endl;
     ss << "batch_size: " << this->batch_size << std::endl;

cpp/tensorrt_llm/nanobind/thop/bindings.cpp

Lines changed: 13 additions & 13 deletions
@@ -42,19 +42,19 @@ void initBindings(nb::module_& m)
        nb::arg("output_sf") = std::nullopt, nb::arg("workspace_") = std::nullopt, nb::arg("sequence_length"),
        nb::arg("host_past_key_value_lengths"), nb::arg("host_total_kv_lens"), nb::arg("context_lengths"),
        nb::arg("host_context_lengths"), nb::arg("host_request_types"),
-       nb::arg("kv_cache_block_offsets") = std::nullopt, nb::arg("host_kv_cache_block_offsets") = std::nullopt,
-       nb::arg("host_kv_cache_pool_pointers") = std::nullopt, nb::arg("host_kv_cache_pool_mapping") = std::nullopt,
-       nb::arg("cache_indirection") = std::nullopt, nb::arg("kv_scale_orig_quant") = std::nullopt,
-       nb::arg("kv_scale_quant_orig") = std::nullopt, nb::arg("out_scale") = std::nullopt,
-       nb::arg("rotary_inv_freq") = std::nullopt, nb::arg("rotary_cos_sin") = std::nullopt,
-       nb::arg("latent_cache") = std::nullopt, nb::arg("q_pe") = std::nullopt,
-       nb::arg("block_ids_per_seq") = std::nullopt, nb::arg("attention_sinks") = std::nullopt, nb::arg("is_fused_qkv"),
-       nb::arg("update_kv_cache"), nb::arg("predicted_tokens_per_seq"), nb::arg("layer_idx"), nb::arg("num_heads"),
-       nb::arg("num_kv_heads"), nb::arg("head_size"), nb::arg("tokens_per_block") = std::nullopt,
-       nb::arg("max_num_requests"), nb::arg("max_context_length"), nb::arg("attention_window_size"),
-       nb::arg("sink_token_length"), nb::arg("beam_width"), nb::arg("mask_type"), nb::arg("quant_mode"),
-       nb::arg("q_scaling"), nb::arg("position_embedding_type"), nb::arg("rotary_embedding_dim"),
-       nb::arg("rotary_embedding_base"), nb::arg("rotary_embedding_scale_type"), nb::arg("rotary_embedding_scales"),
+       nb::arg("kv_cache_block_offsets") = std::nullopt, nb::arg("host_kv_cache_pool_pointers") = std::nullopt,
+       nb::arg("host_kv_cache_pool_mapping") = std::nullopt, nb::arg("cache_indirection") = std::nullopt,
+       nb::arg("kv_scale_orig_quant") = std::nullopt, nb::arg("kv_scale_quant_orig") = std::nullopt,
+       nb::arg("out_scale") = std::nullopt, nb::arg("rotary_inv_freq") = std::nullopt,
+       nb::arg("rotary_cos_sin") = std::nullopt, nb::arg("latent_cache") = std::nullopt,
+       nb::arg("q_pe") = std::nullopt, nb::arg("block_ids_per_seq") = std::nullopt,
+       nb::arg("attention_sinks") = std::nullopt, nb::arg("is_fused_qkv"), nb::arg("update_kv_cache"),
+       nb::arg("predicted_tokens_per_seq"), nb::arg("layer_idx"), nb::arg("num_heads"), nb::arg("num_kv_heads"),
+       nb::arg("head_size"), nb::arg("tokens_per_block") = std::nullopt, nb::arg("max_num_requests"),
+       nb::arg("max_context_length"), nb::arg("attention_window_size"), nb::arg("sink_token_length"),
+       nb::arg("beam_width"), nb::arg("mask_type"), nb::arg("quant_mode"), nb::arg("q_scaling"),
+       nb::arg("position_embedding_type"), nb::arg("rotary_embedding_dim"), nb::arg("rotary_embedding_base"),
+       nb::arg("rotary_embedding_scale_type"), nb::arg("rotary_embedding_scales"),
        nb::arg("rotary_embedding_max_position_info"), nb::arg("use_paged_context_fmha"),
        nb::arg("attention_input_type") = std::nullopt, nb::arg("is_mla_enable"),
        nb::arg("chunked_prefill_buffer_batch_size") = std::nullopt, nb::arg("q_lora_rank") = std::nullopt,

cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp

Lines changed: 0 additions & 6 deletions
@@ -858,7 +858,6 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32

    int max_blocks_per_sequence = 0;
    kernels::KVBlockArray::DataType* block_offsets = nullptr;
-   kernels::KVBlockArray::DataType* host_block_offsets = nullptr;
    void* host_primary_pool_pointer = nullptr;
    void* host_secondary_pool_pointer = nullptr;
    if (useKVCache() && mPagedKVCache)

@@ -882,10 +881,6 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32
            = reinterpret_cast<kernels::KVBlockArray::DataType*>(inputs[getIdx(IdxEntry::KV_CACHE_BLOCK_OFFSETS)])
            + poolOffset + seqOffset;

-       host_block_offsets
-           = reinterpret_cast<kernels::KVBlockArray::DataType*>(inputs[getIdx(IdxEntry::HOST_KV_CACHE_BLOCK_OFFSETS)])
-           + poolOffset + seqOffset;
-
        auto const* const typed_host_pool_pointers
            = static_cast<char* const*>(inputs[getIdx(IdxEntry::HOST_KV_CACHE_POOL_POINTERS)]);

@@ -1046,7 +1041,6 @@ int GPTAttentionPlugin::enqueueSome(int32_t seqIdxBeg, int32_t localNbSeq, int32
    common_enqueue_params.max_past_kv_length = max_context_kv_len;
    EnqueueContextParams<T> enqueue_params{common_enqueue_params};
    enqueue_params.attention_packed_mask = attention_packed_mask;
-   enqueue_params.host_block_offsets = host_block_offsets;
    enqueue_params.batch_size = batch_size;
    enqueue_params.mrope_rotary_cos_sin = mrope_rotary_cos_sin;
    enqueue_params.total_kv_len = enqueue_params.num_tokens;
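
The deleted block in the middle hunk computed a host-side copy of the block-offsets pointer using the same offset arithmetic as the device-side pointer kept just above it; with no consumer left, only the device-side computation remains. A minimal sketch of that arithmetic (simplified types and a hypothetical helper, not the plugin's actual code):

```cpp
// Minimal sketch (simplified types, hypothetical helper): a per-pool, per-sequence slice
// of a block-offsets tensor is reached by casting the type-erased plugin input to its
// element type and advancing by the pool and sequence offsets, mirroring the arithmetic
// in the hunk above.
#include <cstddef>
#include <cstdint>

using BlockOffsetType = int32_t; // stand-in for kernels::KVBlockArray::DataType

BlockOffsetType const* blockOffsetsForSequence(
    void const* const* inputs, int inputIdx, std::ptrdiff_t poolOffset, std::ptrdiff_t seqOffset)
{
    // inputs[] holds type-erased tensor pointers; pick the block-offsets tensor and
    // advance to the slice belonging to this pool and request.
    auto const* base = reinterpret_cast<BlockOffsetType const*>(inputs[inputIdx]);
    return base + poolOffset + seqOffset;
}
```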

cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h

Lines changed: 1 addition & 2 deletions
@@ -55,8 +55,7 @@ namespace tensorrt_llm::plugins
 // all elements must be identical.
 // 8. past_key_value_pool [batch_size, 2, local_num_kv_heads, max_seq_len, head_size] or
 //    block_offsets [batch_size, 2, max_blocks_per_seq] if paged kv cache (optional)
-// 8.1 host_block_offsets [batch_size, 2, max_blocks_per_seq] if paged kv cache (optional)
-// 8.2 host_pool_pointers [2] if paged kv cache (optional)
+// 8.1 host_pool_pointers [2] if paged kv cache (optional)
 // 9. kv_cache_quantization_scale [1] (optional)
 // 10. kv_cache_dequantization_scale [1] (optional)
 // 11. attention_output_quantization_scale [1] (on device, optional)

cpp/tensorrt_llm/pybind/thop/bindings.cpp

Lines changed: 13 additions & 13 deletions
@@ -42,19 +42,19 @@ void initBindings(pybind11::module_& m)
        py::arg("output_sf") = std::nullopt, py::arg("workspace_") = std::nullopt, py::arg("sequence_length"),
        py::arg("host_past_key_value_lengths"), py::arg("host_total_kv_lens"), py::arg("context_lengths"),
        py::arg("host_context_lengths"), py::arg("host_request_types"),
-       py::arg("kv_cache_block_offsets") = std::nullopt, py::arg("host_kv_cache_block_offsets") = std::nullopt,
-       py::arg("host_kv_cache_pool_pointers") = std::nullopt, py::arg("host_kv_cache_pool_mapping") = std::nullopt,
-       py::arg("cache_indirection") = std::nullopt, py::arg("kv_scale_orig_quant") = std::nullopt,
-       py::arg("kv_scale_quant_orig") = std::nullopt, py::arg("out_scale") = std::nullopt,
-       py::arg("rotary_inv_freq") = std::nullopt, py::arg("rotary_cos_sin") = std::nullopt,
-       py::arg("latent_cache") = std::nullopt, py::arg("q_pe") = std::nullopt,
-       py::arg("block_ids_per_seq") = std::nullopt, py::arg("attention_sinks") = std::nullopt, py::arg("is_fused_qkv"),
-       py::arg("update_kv_cache"), py::arg("predicted_tokens_per_seq"), py::arg("layer_idx"), py::arg("num_heads"),
-       py::arg("num_kv_heads"), py::arg("head_size"), py::arg("tokens_per_block") = std::nullopt,
-       py::arg("max_num_requests"), py::arg("max_context_length"), py::arg("attention_window_size"),
-       py::arg("sink_token_length"), py::arg("beam_width"), py::arg("mask_type"), py::arg("quant_mode"),
-       py::arg("q_scaling"), py::arg("position_embedding_type"), py::arg("rotary_embedding_dim"),
-       py::arg("rotary_embedding_base"), py::arg("rotary_embedding_scale_type"), py::arg("rotary_embedding_scales"),
+       py::arg("kv_cache_block_offsets") = std::nullopt, py::arg("host_kv_cache_pool_pointers") = std::nullopt,
+       py::arg("host_kv_cache_pool_mapping") = std::nullopt, py::arg("cache_indirection") = std::nullopt,
+       py::arg("kv_scale_orig_quant") = std::nullopt, py::arg("kv_scale_quant_orig") = std::nullopt,
+       py::arg("out_scale") = std::nullopt, py::arg("rotary_inv_freq") = std::nullopt,
+       py::arg("rotary_cos_sin") = std::nullopt, py::arg("latent_cache") = std::nullopt,
+       py::arg("q_pe") = std::nullopt, py::arg("block_ids_per_seq") = std::nullopt,
+       py::arg("attention_sinks") = std::nullopt, py::arg("is_fused_qkv"), py::arg("update_kv_cache"),
+       py::arg("predicted_tokens_per_seq"), py::arg("layer_idx"), py::arg("num_heads"), py::arg("num_kv_heads"),
+       py::arg("head_size"), py::arg("tokens_per_block") = std::nullopt, py::arg("max_num_requests"),
+       py::arg("max_context_length"), py::arg("attention_window_size"), py::arg("sink_token_length"),
+       py::arg("beam_width"), py::arg("mask_type"), py::arg("quant_mode"), py::arg("q_scaling"),
+       py::arg("position_embedding_type"), py::arg("rotary_embedding_dim"), py::arg("rotary_embedding_base"),
+       py::arg("rotary_embedding_scale_type"), py::arg("rotary_embedding_scales"),
        py::arg("rotary_embedding_max_position_info"), py::arg("use_paged_context_fmha"),
        py::arg("attention_input_type") = std::nullopt, py::arg("is_mla_enable"),
        py::arg("chunked_prefill_buffer_batch_size") = std::nullopt, py::arg("q_lora_rank") = std::nullopt,
