27 changes: 27 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -6785,6 +6785,33 @@ struct OrtApi {
_Out_writes_(num_outputs) const OrtEpDevice** outputs_ep_devices,
_In_ size_t num_outputs);

/** \brief Copy OrtValue instances containing Tensors between devices with offset and size control.
*
* Extended version of CopyTensors that supports copying with source/destination offsets and custom sizes.
* All offsets and sizes are in bytes.
*
* \param[in] env The OrtEnv instance to use.
* \param[in] src_tensors Array of OrtValue instances containing the source tensors to copy.
* \param[in] dst_tensors Array of OrtValue instances to copy the source tensors to.
* \param[in] source_offsets Optional array of source offsets in bytes. May be nullptr for all zeros.
* \param[in] destination_offsets Optional array of destination offsets in bytes. May be nullptr for all zeros.
* \param[in] sizes Optional array of sizes in bytes to copy. May be nullptr to copy entire tensors.
* \param[in] stream Optional OrtSyncStream that can be used to perform the copy asynchronously. May be nullptr.
* \param[in] num_tensors The number of tensors to copy.
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*
* \since Version 1.24
*/
Comment on lines +6788 to +6805
Copilot AI Jan 14, 2026

The documentation for CopyTensorsEx should clarify the expected behavior when offsets and sizes would cause out-of-bounds access. It should specify whether the implementation is expected to validate bounds and return an error, or if the caller is responsible for ensuring valid parameters. This is important for API consumers to understand their responsibilities and avoid undefined behavior.

ORT_API2_STATUS(CopyTensorsEx, _In_ const OrtEnv* env,
_In_reads_(num_tensors) const OrtValue* const* src_tensors,
_In_reads_(num_tensors) OrtValue* const* dst_tensors,
_In_reads_opt_(num_tensors) const size_t* source_offsets,
_In_reads_opt_(num_tensors) const size_t* destination_offsets,
_In_reads_opt_(num_tensors) const size_t* sizes,
_In_opt_ OrtSyncStream* stream,
_In_ size_t num_tensors);

/// @}
};

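For orientation, a minimal caller-side sketch of the proposed API (C++ calling the C API). The OrtEnv and the two tensors are assumed to already exist, and the offsets and sizes are illustrative only.

// Illustrative sketch: copy 256 bytes starting at byte 256 of src into the start
// of dst, synchronously (no stream). Error handling is abbreviated.
#include <onnxruntime_c_api.h>

void CopyTensorSubRange(const OrtApi* ort, OrtEnv* env,
                        const OrtValue* src_value, OrtValue* dst_value) {
  const OrtValue* srcs[] = {src_value};
  OrtValue* dsts[] = {dst_value};
  size_t src_offsets[] = {256};  // byte offset into the source tensor
  size_t dst_offsets[] = {0};    // byte offset into the destination tensor
  size_t sizes[] = {256};        // bytes to copy; 0 would mean "whole tensor"

  OrtStatus* status = ort->CopyTensorsEx(env, srcs, dsts, src_offsets, dst_offsets,
                                         sizes, /*stream*/ nullptr, /*num_tensors*/ 1);
  if (status != nullptr) {
    // inspect ort->GetErrorMessage(status) as needed, then release it
    ort->ReleaseStatus(status);
  }
}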
6 changes: 6 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_ep_c_api.h
@@ -129,6 +129,9 @@ struct OrtDataTransferImpl {
* \param[in] this_ptr Pointer to the OrtDataTransferImpl instance.
* \param[in] src_tensors Array of source OrtValue pointers to copy from.
* \param[in] dst_tensors Array of destination OrtValue pointers to copy to.
* \param[in] source_offsets Optional array of source offsets in bytes. May be nullptr for all zeros.
* \param[in] destination_offsets Optional array of destination offsets in bytes. May be nullptr for all zeros.
* \param[in] sizes Optional array of sizes in bytes to copy. May be nullptr to copy entire tensors.
* \param[in] streams Array of OrtSyncStream pointers for the copy operations, if the execution provider is stream
* aware. nullptr if it is not.
* \param[in] num_tensors Number of tensors to copy.
@@ -140,6 +143,9 @@
ORT_API2_STATUS(CopyTensors, _In_ OrtDataTransferImpl* this_ptr,
_In_reads_(num_tensors) const OrtValue** src_tensors,
_In_reads_(num_tensors) OrtValue** dst_tensors,
_In_reads_opt_(num_tensors) const size_t* source_offsets,
_In_reads_opt_(num_tensors) const size_t* destination_offsets,
_In_reads_opt_(num_tensors) const size_t* sizes,
_In_reads_(num_tensors) OrtSyncStream** streams,
_In_ size_t num_tensors);
};
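On the plugin EP side, a sketch of how an implementation might resolve the optional arrays per tensor (the same pattern the WebGPU changes later in this PR use). DoDeviceCopy is a hypothetical placeholder for the provider's actual transfer logic.

#include <onnxruntime_ep_c_api.h>
#include <cstddef>

// Hypothetical device copy; a real provider dispatches to its own transfer code.
static void DoDeviceCopy(const OrtValue* /*src*/, OrtValue* /*dst*/, size_t /*src_offset*/,
                         size_t /*dst_offset*/, size_t /*size*/, OrtSyncStream* /*stream*/) {}

static OrtStatus* ORT_API_CALL CopyTensorsSketch(OrtDataTransferImpl* /*this_ptr*/,
                                                 const OrtValue** src_tensors,
                                                 OrtValue** dst_tensors,
                                                 const size_t* source_offsets,
                                                 const size_t* destination_offsets,
                                                 const size_t* sizes,
                                                 OrtSyncStream** streams,
                                                 size_t num_tensors) noexcept {
  for (size_t i = 0; i < num_tensors; ++i) {
    const size_t src_offset = source_offsets ? source_offsets[i] : 0;        // nullptr => all zeros
    const size_t dst_offset = destination_offsets ? destination_offsets[i] : 0;
    const size_t copy_size = sizes ? sizes[i] : 0;                           // 0 => whole tensor
    DoDeviceCopy(src_tensors[i], dst_tensors[i], src_offset, dst_offset, copy_size,
                 streams ? streams[i] : nullptr);
  }
  return nullptr;  // a null OrtStatus* signals success
}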
47 changes: 42 additions & 5 deletions onnxruntime/core/framework/data_transfer.cc
@@ -21,14 +21,25 @@ common::Status IDataTransfer::CopyTensor(const Tensor& /*src*/, Tensor& /*dst*/)
ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
}

common::Status IDataTransfer::CopyTensor(const Tensor& /*src*/, Tensor& /*dst*/, size_t /*src_offset*/, size_t /*dst_offset*/, size_t /*size*/) const {
ORT_NOT_IMPLEMENTED(__FUNCTION__, " with offsets/size is not implemented");
}

common::Status IDataTransfer::CopyTensors(const std::vector<IDataTransfer::SrcDstPair>& src_dst_pairs) const {
for (const auto& pair : src_dst_pairs) {
if (pair.src_stream)
ORT_RETURN_IF_ERROR(CopyTensorAsync(pair.src, pair.dst, *pair.src_stream));
else
ORT_RETURN_IF_ERROR(CopyTensor(pair.src, pair.dst));
// Use offset-aware methods when offsets or size are provided
if (pair.source_offset != 0 || pair.destination_offset != 0 || pair.size != 0) {
if (pair.src_stream)
ORT_RETURN_IF_ERROR(CopyTensorAsync(pair.src, pair.dst, pair.source_offset, pair.destination_offset, pair.size, *pair.src_stream));
else
ORT_RETURN_IF_ERROR(CopyTensor(pair.src, pair.dst, pair.source_offset, pair.destination_offset, pair.size));
} else {
if (pair.src_stream)
ORT_RETURN_IF_ERROR(CopyTensorAsync(pair.src, pair.dst, *pair.src_stream));
else
ORT_RETURN_IF_ERROR(CopyTensor(pair.src, pair.dst));
}
}

return Status::OK();
}

@@ -81,4 +92,30 @@ common::Status CPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const
#endif
}

common::Status CPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, size_t src_offset, size_t dst_offset, size_t size) const {
const void* src_data = src.DataRaw();
void* dst_data = dst.MutableDataRaw();

// Determine actual copy size
size_t copy_size = (size == 0) ? src.SizeInBytes() : size;

// Validate offsets and size
ORT_ENFORCE(src_offset + copy_size <= src.SizeInBytes(),
"Source offset + size exceeds source tensor size. src_offset=", src_offset,
", size=", copy_size, ", src.SizeInBytes()=", src.SizeInBytes());
ORT_ENFORCE(dst_offset + copy_size <= dst.SizeInBytes(),
"Destination offset + size exceeds destination tensor size. dst_offset=", dst_offset,
", size=", copy_size, ", dst.SizeInBytes()=", dst.SizeInBytes());

if (!src.IsDataTypeString()) {
const char* src_bytes = static_cast<const char*>(src_data) + src_offset;
char* dst_bytes = static_cast<char*>(dst_data) + dst_offset;
memcpy(dst_bytes, src_bytes, copy_size);
} else {
ORT_NOT_IMPLEMENTED("CopyTensor with offsets is not supported for string tensors");
}

return Status::OK();
}

}; // namespace onnxruntime
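The offset/size semantics used above reduce to a few lines of arithmetic. A standalone sketch of the same resolution and bounds check (illustrative helper, not part of the change):

#include <cstddef>
#include <stdexcept>

// Resolve the effective copy size the way the CPU implementation above does:
// size == 0 means "the whole source tensor", and both ranges must stay in bounds.
std::size_t ResolveCopySize(std::size_t src_bytes, std::size_t dst_bytes,
                            std::size_t src_offset, std::size_t dst_offset, std::size_t size) {
  const std::size_t copy_size = (size == 0) ? src_bytes : size;
  if (src_offset + copy_size > src_bytes)
    throw std::out_of_range("source offset + size exceeds source tensor size");
  if (dst_offset + copy_size > dst_bytes)
    throw std::out_of_range("destination offset + size exceeds destination tensor size");
  return copy_size;
}

// Example: a 1024-byte source, a 512-byte destination, copying 256 bytes from
// source offset 768 into destination offset 0:
// ResolveCopySize(1024, 512, 768, 0, 256) == 256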
12 changes: 11 additions & 1 deletion onnxruntime/core/framework/data_transfer.h
@@ -31,14 +31,23 @@ class IDataTransfer {

virtual common::Status CopyTensor(const Tensor& src, Tensor& dst) const;

virtual common::Status CopyTensor(const Tensor& src, Tensor& dst, size_t src_offset, size_t dst_offset, size_t size) const;

virtual common::Status CopyTensorAsync(const Tensor& /*src*/, Tensor& /*dst*/, Stream& /*stream*/) const {
ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
}

virtual common::Status CopyTensorAsync(const Tensor& /*src*/, Tensor& /*dst*/, size_t /*src_offset*/, size_t /*dst_offset*/, size_t /*size*/, Stream& /*stream*/) const {
ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
}

struct SrcDstPair {
std::reference_wrapper<const Tensor> src;
std::reference_wrapper<Tensor> dst;
Stream* src_stream; // producer stream of src
Stream* src_stream; // producer stream of src
size_t source_offset = 0; // offset in source tensor (in bytes)
size_t destination_offset = 0; // offset in destination tensor (in bytes)
size_t size = 0; // number of bytes to copy (0 means copy entire tensor)
};

// batched copy. default implementation copies each entry sequentially, and returns on first failure.
@@ -62,5 +71,6 @@ class CPUDataTransfer : public IDataTransfer {
using IDataTransfer::CopyTensor;
bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;
common::Status CopyTensor(const Tensor& src, Tensor& dst) const override;
common::Status CopyTensor(const Tensor& src, Tensor& dst, size_t src_offset, size_t dst_offset, size_t size) const override;
};
} // namespace onnxruntime
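For illustration, a sketch of how code inside ORT might request a sub-range copy through the batched interface using the extended SrcDstPair fields (the tensors are assumed to exist; the offsets and size are illustrative):

#include <functional>
#include <vector>
#include "core/framework/data_transfer.h"

namespace onnxruntime {

// Copy bytes [128, 384) of src into the start of dst via the batched API.
inline common::Status CopySubRangeBatched(const IDataTransfer& data_transfer,
                                          const Tensor& src, Tensor& dst) {
  std::vector<IDataTransfer::SrcDstPair> pairs;
  pairs.push_back({std::cref(src), std::ref(dst), /*src_stream*/ nullptr,
                   /*source_offset*/ 128, /*destination_offset*/ 0, /*size*/ 256});
  return data_transfer.CopyTensors(pairs);
}

}  // namespace onnxruntime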
4 changes: 2 additions & 2 deletions onnxruntime/core/framework/plugin_data_transfer.cc
@@ -44,7 +44,7 @@ Status DataTransfer::CopyTensors(const std::vector<SrcDstPair>& src_dst_pairs) c
streams.push_back(reinterpret_cast<OrtSyncStream*>(src_dst_pairs[i].src_stream));
}

auto* status = impl_.CopyTensors(&impl_, src_values.data(), dst_values.data(), streams.data(),
auto* status = impl_.CopyTensors(&impl_, src_values.data(), dst_values.data(), nullptr, nullptr, nullptr, streams.data(),
src_dst_pairs.size());

return ToStatusAndRelease(status);
@@ -59,7 +59,7 @@ Status DataTransfer::CopyTensorImpl(const Tensor& src_tensor, Tensor& dst_tensor
const OrtValue* src_ptr = &src;
OrtValue* dst_ptr = &dst;
OrtSyncStream* stream_ptr = nullptr; // static_cast<OrtSyncStream*>(stream);
auto* status = impl_.CopyTensors(&impl_, &src_ptr, &dst_ptr, &stream_ptr, 1);
auto* status = impl_.CopyTensors(&impl_, &src_ptr, &dst_ptr, nullptr, nullptr, nullptr, &stream_ptr, 1);

return ToStatusAndRelease(status);
}
26 changes: 13 additions & 13 deletions onnxruntime/core/providers/webgpu/buffer_manager.cc
@@ -450,11 +450,11 @@ BufferManager::BufferManager(WebGpuContext& context, BufferCacheMode storage_buf
default_cache_{CreateBufferCacheManager(BufferCacheMode::Disabled)} {
}

void BufferManager::Upload(void* src, WGPUBuffer dst, size_t size) const {
void BufferManager::Upload(void* src, WGPUBuffer dst, size_t size, size_t src_offset, size_t dst_offset) const {
// If the buffer is mapped, we can directly write to it.
void* mapped_data = wgpuBufferGetMappedRange(dst, 0, WGPU_WHOLE_MAP_SIZE); // ensure the buffer is mapped
void* mapped_data = wgpuBufferGetMappedRange(dst, dst_offset, size);
if (mapped_data) {
memcpy(mapped_data, src, size);
memcpy(mapped_data, static_cast<char*>(src) + src_offset, size);
wgpuBufferUnmap(dst);
return;
}
@@ -468,31 +468,31 @@ void BufferManager::Upload(void* src, WGPUBuffer dst, size_t size) const {
desc.mappedAtCreation = true;

auto staging_buffer = context_.Device().CreateBuffer(&desc);
mapped_data = staging_buffer.GetMappedRange();
memcpy(mapped_data, src, size);
void* staging_mapped_data = staging_buffer.GetMappedRange();
memcpy(staging_mapped_data, static_cast<char*>(src) + src_offset, size);
staging_buffer.Unmap();

auto& command_encoder = context_.GetCommandEncoder();
context_.EndComputePass();
command_encoder.CopyBufferToBuffer(staging_buffer, 0, dst, 0, buffer_size);
command_encoder.CopyBufferToBuffer(staging_buffer, 0, dst, dst_offset, buffer_size);
context_.Flush(*this);
}

void BufferManager::MemCpy(WGPUBuffer src, WGPUBuffer dst, size_t size) const {
void BufferManager::MemCpy(WGPUBuffer src, WGPUBuffer dst, size_t size, size_t src_offset, size_t dst_offset) const {
ORT_ENFORCE(src != dst, "Source and destination buffers must be different.");
EnforceBufferUnmapped(context_, src);
EnforceBufferUnmapped(context_, dst);

auto buffer_size = NormalizeBufferSize(size);
auto src_size = static_cast<size_t>(wgpuBufferGetSize(src));
auto dst_size = static_cast<size_t>(wgpuBufferGetSize(dst));
ORT_ENFORCE(buffer_size <= src_size && buffer_size <= dst_size,
ORT_ENFORCE(src_offset + buffer_size <= src_size && dst_offset + buffer_size <= dst_size,
"Source and destination buffers must have enough space for the copy operation. src_size=",
src_size, ", dst_size=", dst_size, ", copy_size=", buffer_size, ".");
src_size, ", dst_size=", dst_size, ", src_offset=", src_offset, ", dst_offset=", dst_offset, ", copy_size=", buffer_size, ".");

auto& command_encoder = context_.GetCommandEncoder();
context_.EndComputePass();
command_encoder.CopyBufferToBuffer(src, 0, dst, 0, buffer_size);
command_encoder.CopyBufferToBuffer(src, src_offset, dst, dst_offset, buffer_size);
}
Comment on lines +481 to 496

Copilot AI Jan 14, 2026


Potential issue with buffer size normalization when using offsets. The NormalizeBufferSize function rounds up 'size' to be aligned to 16 bytes, but when dst_offset is applied, the total required buffer space is actually dst_offset + buffer_size. The current validation at line 489 checks dst_offset + buffer_size against dst_size, which is correct. However, if the destination buffer was created with a size that was normalized independently, there could be cases where the aligned buffer_size causes the operation to exceed the actual buffer bounds when combined with the offset.

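To make the concern above concrete, a self-contained numerical sketch. The 16-byte round-up stands in for NormalizeBufferSize as described in the comment; the real helper may differ.

#include <cstddef>
#include <iostream>

// Stand-in for NormalizeBufferSize: round size up to a 16-byte multiple (assumption).
std::size_t RoundUp16(std::size_t size) { return (size + 15) & ~std::size_t{15}; }

int main() {
  const std::size_t dst_size = 64;    // destination buffer size in bytes
  const std::size_t copy_size = 4;    // caller-requested copy size
  const std::size_t dst_offset = 60;  // dst_offset + copy_size == dst_size, so the request fits
  const std::size_t normalized = RoundUp16(copy_size);  // 16
  // dst_offset + normalized == 76 > 64: the aligned copy would overrun the buffer,
  // which is exactly the case the validation above rejects.
  std::cout << (dst_offset + normalized > dst_size) << "\n";  // prints 1
}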

WGPUBuffer BufferManager::Create(size_t size, wgpu::BufferUsage usage) const {
@@ -533,7 +533,7 @@ void BufferManager::Release(WGPUBuffer buffer) const {
GetCacheManager(buffer).ReleaseBuffer(buffer);
}

void BufferManager::Download(WGPUBuffer src, void* dst, size_t size) const {
void BufferManager::Download(WGPUBuffer src, void* dst, size_t size, size_t src_offset, size_t dst_offset) const {
EnforceBufferUnmapped(context_, src);
auto buffer_size = NormalizeBufferSize(size);

@@ -544,7 +544,7 @@ void BufferManager::Download(WGPUBuffer src, void* dst, size_t size) const {
auto staging_buffer = context_.Device().CreateBuffer(&desc);
auto& command_encoder = context_.GetCommandEncoder();
context_.EndComputePass();
command_encoder.CopyBufferToBuffer(src, 0, staging_buffer, 0, buffer_size);
command_encoder.CopyBufferToBuffer(src, src_offset, staging_buffer, 0, buffer_size);
context_.Flush(*this);

// TODO: revise wait in whole project
@@ -554,7 +554,7 @@ void BufferManager::Download(WGPUBuffer src, void* dst, size_t size) const {
})) == Status::OK());

auto mapped_data = staging_buffer.GetConstMappedRange();
memcpy(dst, mapped_data, size);
memcpy(static_cast<char*>(dst) + dst_offset, mapped_data, size);
staging_buffer.Unmap();
}

6 changes: 3 additions & 3 deletions onnxruntime/core/providers/webgpu/buffer_manager.h
@@ -68,12 +68,12 @@ class IBufferCacheManager {
class BufferManager {
public:
BufferManager(WebGpuContext& context, BufferCacheMode storage_buffer_cache_mode, BufferCacheMode uniform_buffer_cache_mode, BufferCacheMode query_resolve_buffer_cache_mode);
void Upload(void* src, WGPUBuffer dst, size_t size) const;
void MemCpy(WGPUBuffer src, WGPUBuffer dst, size_t size) const;
void Upload(void* src, WGPUBuffer dst, size_t size, size_t src_offset = 0, size_t dst_offset = 0) const;
void MemCpy(WGPUBuffer src, WGPUBuffer dst, size_t size, size_t src_offset = 0, size_t dst_offset = 0) const;
void Download(WGPUBuffer src, void* dst, size_t size, size_t src_offset = 0, size_t dst_offset = 0) const;
WGPUBuffer Create(size_t size, wgpu::BufferUsage usage) const;
bool SupportsUMA() const; // Check if CreateUMA is supported (i.e., the device has BufferMapExtendedUsages feature)
void Release(WGPUBuffer buffer) const;
void Download(WGPUBuffer src, void* dst, size_t size) const;
void RefreshPendingBuffers(GraphCaptureState graph_capture_state) const;

private:
14 changes: 10 additions & 4 deletions onnxruntime/core/providers/webgpu/data_transfer.cc
@@ -14,7 +14,11 @@ bool DataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_dev
}

common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const {
size_t bytes = src.SizeInBytes();
return CopyTensor(src, dst, 0, 0, 0);
}

common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst, size_t src_offset, size_t dst_offset, size_t size) const {
size_t bytes = size > 0 ? size : src.SizeInBytes();
if (bytes > 0) {
void const* src_data = src.DataRaw();
void* dst_data = dst.MutableDataRaw();
@@ -26,14 +30,16 @@ common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const {
if (src_device.Type() == OrtDevice::GPU) {
// copy from GPU to GPU
buffer_manager_.MemCpy(static_cast<WGPUBuffer>(const_cast<void*>(src_data)),
static_cast<WGPUBuffer>(dst_data), bytes);
static_cast<WGPUBuffer>(dst_data), bytes, src_offset, dst_offset);
} else {
// copy from CPU to GPU
buffer_manager_.Upload(const_cast<void*>(src_data), static_cast<WGPUBuffer>(dst_data), bytes);
buffer_manager_.Upload(const_cast<void*>(src_data),
static_cast<WGPUBuffer>(dst_data), bytes, src_offset, dst_offset);
}
} else /* if (src_device.Type() == OrtDevice::GPU) */ {
// copy from GPU to CPU
buffer_manager_.Download(static_cast<WGPUBuffer>(const_cast<void*>(src_data)), dst_data, bytes);
buffer_manager_.Download(static_cast<WGPUBuffer>(const_cast<void*>(src_data)),
dst_data, bytes, src_offset, dst_offset);
}
}

3 changes: 3 additions & 0 deletions onnxruntime/core/providers/webgpu/data_transfer.h
@@ -20,6 +20,9 @@ class DataTransfer : public IDataTransfer {

common::Status CopyTensor(const Tensor& src, Tensor& dst) const override;

// Copy tensor with offset and size support
common::Status CopyTensor(const Tensor& src, Tensor& dst, size_t src_offset, size_t dst_offset, size_t size) const override;

private:
const BufferManager& buffer_manager_;
};
11 changes: 10 additions & 1 deletion onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
@@ -317,6 +317,9 @@ struct WebGpuDataTransferImpl : OrtDataTransferImpl {
static OrtStatus* CopyTensorsImpl(OrtDataTransferImpl* this_ptr,
const OrtValue** src_tensors,
OrtValue** dst_tensors,
const size_t* source_offsets,
const size_t* destination_offsets,
const size_t* sizes,
OrtSyncStream** /*streams*/,
size_t num_tensors) noexcept {
auto& impl = *static_cast<WebGpuDataTransferImpl*>(this_ptr);
@@ -348,7 +351,13 @@ struct WebGpuDataTransferImpl : OrtDataTransferImpl {
for (size_t idx = 0; idx < num_tensors; ++idx) {
const OrtValue* src_tensor = src_tensors[idx];
OrtValue* dst_tensor = dst_tensors[idx];
auto status = impl.data_transfer_->CopyTensor(src_tensor->Get<Tensor>(), *dst_tensor->GetMutable<Tensor>());
size_t src_offset = source_offsets ? source_offsets[idx] : 0;
size_t dst_offset = destination_offsets ? destination_offsets[idx] : 0;
size_t copy_size = sizes ? sizes[idx] : 0;

common::Status status = impl.data_transfer_->CopyTensor(src_tensor->Get<Tensor>(), *dst_tensor->GetMutable<Tensor>(),
src_offset, dst_offset, copy_size);

if (!status.IsOK()) {
return OrtApis::CreateStatus(ORT_RUNTIME_EXCEPTION, status.ErrorMessage().c_str());
}