27 changes: 27 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -6785,6 +6785,33 @@ struct OrtApi {
_Out_writes_(num_outputs) const OrtEpDevice** outputs_ep_devices,
_In_ size_t num_outputs);

/** \brief Copy OrtValue instances containing Tensors between devices with offset and size control.
*
* Extended version of CopyTensors that supports copying with source/destination offsets and custom sizes.
* All offsets and sizes are in bytes.
*
* \param[in] env The OrtEnv instance to use.
* \param[in] src_tensors Array of OrtValue instances containing the source tensors to copy.
* \param[in] dst_tensors Array of OrtValue instances to copy the source tensors to.
* \param[in] source_offsets Optional array of source offsets in bytes. May be nullptr for all zeros.
* \param[in] destination_offsets Optional array of destination offsets in bytes. May be nullptr for all zeros.
* \param[in] sizes Optional array of sizes in bytes to copy. May be nullptr to copy entire tensors.
* \param[in] stream Optional OrtSyncStream that can be used to perform the copy asynchronously. May be nullptr.
* \param[in] num_tensors The number of tensors to copy.
*
* \snippet{doc} snippets.dox OrtStatus Return Value
*
* \since Version 1.24
*/
Comment on lines +6788 to +6805
Copilot AI Jan 14, 2026

The documentation for CopyTensorsEx should clarify the expected behavior when offsets and sizes would cause out-of-bounds access. It should specify whether the implementation is expected to validate bounds and return an error, or if the caller is responsible for ensuring valid parameters. This is important for API consumers to understand their responsibilities and avoid undefined behavior.

ORT_API2_STATUS(CopyTensorsEx, _In_ const OrtEnv* env,
_In_reads_(num_tensors) const OrtValue* const* src_tensors,
_In_reads_(num_tensors) OrtValue* const* dst_tensors,
_In_reads_opt_(num_tensors) const size_t* source_offsets,
_In_reads_opt_(num_tensors) const size_t* destination_offsets,
_In_reads_opt_(num_tensors) const size_t* sizes,
_In_opt_ OrtSyncStream* stream,
_In_ size_t num_tensors);

/// @}
};

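For orientation, a minimal caller-side sketch of the proposed API (C++ calling the C API). The OrtEnv and the two tensors are assumed to already exist, and the offsets and sizes are illustrative only.

// Illustrative sketch: copy 256 bytes starting at byte 256 of src into the start
// of dst, synchronously (no stream). Error handling is abbreviated.
#include <onnxruntime_c_api.h>

void CopyTensorSubRange(const OrtApi* ort, OrtEnv* env,
                        const OrtValue* src_value, OrtValue* dst_value) {
  const OrtValue* srcs[] = {src_value};
  OrtValue* dsts[] = {dst_value};
  size_t src_offsets[] = {256};  // byte offset into the source tensor
  size_t dst_offsets[] = {0};    // byte offset into the destination tensor
  size_t sizes[] = {256};        // bytes to copy; 0 would mean "whole tensor"

  OrtStatus* status = ort->CopyTensorsEx(env, srcs, dsts, src_offsets, dst_offsets,
                                         sizes, /*stream*/ nullptr, /*num_tensors*/ 1);
  if (status != nullptr) {
    // inspect ort->GetErrorMessage(status) as needed, then release it
    ort->ReleaseStatus(status);
  }
}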
6 changes: 6 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_ep_c_api.h
@@ -129,6 +129,9 @@ struct OrtDataTransferImpl {
* \param[in] this_ptr Pointer to the OrtDataTransferImpl instance.
* \param[in] src_tensors Array of source OrtValue pointers to copy from.
* \param[in] dst_tensors Array of destination OrtValue pointers to copy to.
* \param[in] source_offsets Optional array of source offsets in bytes. May be nullptr for all zeros.
* \param[in] destination_offsets Optional array of destination offsets in bytes. May be nullptr for all zeros.
* \param[in] sizes Optional array of sizes in bytes to copy. May be nullptr to copy entire tensors.
* \param[in] streams Array of OrtSyncStream pointers for the copy operations, if the execution provider is stream
* aware. nullptr if it is not.
* \param[in] num_tensors Number of tensors to copy.
@@ -140,6 +143,9 @@
ORT_API2_STATUS(CopyTensors, _In_ OrtDataTransferImpl* this_ptr,
_In_reads_(num_tensors) const OrtValue** src_tensors,
_In_reads_(num_tensors) OrtValue** dst_tensors,
_In_reads_opt_(num_tensors) const size_t* source_offsets,
_In_reads_opt_(num_tensors) const size_t* destination_offsets,
_In_reads_opt_(num_tensors) const size_t* sizes,
_In_reads_(num_tensors) OrtSyncStream** streams,
_In_ size_t num_tensors);
};
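On the plugin EP side, a sketch of how an implementation might resolve the optional arrays per tensor (the same pattern the WebGPU changes later in this PR use). DoDeviceCopy is a hypothetical placeholder for the provider's actual transfer logic.

#include <onnxruntime_ep_c_api.h>
#include <cstddef>

// Hypothetical device copy; a real provider dispatches to its own transfer code.
static void DoDeviceCopy(const OrtValue* /*src*/, OrtValue* /*dst*/, size_t /*src_offset*/,
                         size_t /*dst_offset*/, size_t /*size*/, OrtSyncStream* /*stream*/) {}

static OrtStatus* ORT_API_CALL CopyTensorsSketch(OrtDataTransferImpl* /*this_ptr*/,
                                                 const OrtValue** src_tensors,
                                                 OrtValue** dst_tensors,
                                                 const size_t* source_offsets,
                                                 const size_t* destination_offsets,
                                                 const size_t* sizes,
                                                 OrtSyncStream** streams,
                                                 size_t num_tensors) noexcept {
  for (size_t i = 0; i < num_tensors; ++i) {
    const size_t src_offset = source_offsets ? source_offsets[i] : 0;        // nullptr => all zeros
    const size_t dst_offset = destination_offsets ? destination_offsets[i] : 0;
    const size_t copy_size = sizes ? sizes[i] : 0;                           // 0 => whole tensor
    DoDeviceCopy(src_tensors[i], dst_tensors[i], src_offset, dst_offset, copy_size,
                 streams ? streams[i] : nullptr);
  }
  return nullptr;  // a null OrtStatus* signals success
}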
47 changes: 42 additions & 5 deletions onnxruntime/core/framework/data_transfer.cc
@@ -21,14 +21,25 @@ common::Status IDataTransfer::CopyTensor(const Tensor& /*src*/, Tensor& /*dst*/)
ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
}

common::Status IDataTransfer::CopyTensor(const Tensor& /*src*/, Tensor& /*dst*/, size_t /*src_offset*/, size_t /*dst_offset*/, size_t /*size*/) const {
ORT_NOT_IMPLEMENTED(__FUNCTION__, " with offsets/size is not implemented");
}

common::Status IDataTransfer::CopyTensors(const std::vector<IDataTransfer::SrcDstPair>& src_dst_pairs) const {
for (const auto& pair : src_dst_pairs) {
if (pair.src_stream)
ORT_RETURN_IF_ERROR(CopyTensorAsync(pair.src, pair.dst, *pair.src_stream));
else
ORT_RETURN_IF_ERROR(CopyTensor(pair.src, pair.dst));
// Use offset-aware methods when offsets or size are provided
if (pair.source_offset != 0 || pair.destination_offset != 0 || pair.size != 0) {
if (pair.src_stream)
ORT_RETURN_IF_ERROR(CopyTensorAsync(pair.src, pair.dst, pair.source_offset, pair.destination_offset, pair.size, *pair.src_stream));
else
ORT_RETURN_IF_ERROR(CopyTensor(pair.src, pair.dst, pair.source_offset, pair.destination_offset, pair.size));
} else {
if (pair.src_stream)
ORT_RETURN_IF_ERROR(CopyTensorAsync(pair.src, pair.dst, *pair.src_stream));
else
ORT_RETURN_IF_ERROR(CopyTensor(pair.src, pair.dst));
}
}

return Status::OK();
}

@@ -81,4 +92,30 @@ common::Status CPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const
#endif
}

common::Status CPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, size_t src_offset, size_t dst_offset, size_t size) const {
const void* src_data = src.DataRaw();
void* dst_data = dst.MutableDataRaw();

// Determine actual copy size
size_t copy_size = (size == 0) ? src.SizeInBytes() : size;

// Validate offsets and size
ORT_ENFORCE(src_offset + copy_size <= src.SizeInBytes(),
"Source offset + size exceeds source tensor size. src_offset=", src_offset,
", size=", copy_size, ", src.SizeInBytes()=", src.SizeInBytes());
ORT_ENFORCE(dst_offset + copy_size <= dst.SizeInBytes(),
"Destination offset + size exceeds destination tensor size. dst_offset=", dst_offset,
", size=", copy_size, ", dst.SizeInBytes()=", dst.SizeInBytes());

if (!src.IsDataTypeString()) {
const char* src_bytes = static_cast<const char*>(src_data) + src_offset;
char* dst_bytes = static_cast<char*>(dst_data) + dst_offset;
memcpy(dst_bytes, src_bytes, copy_size);
} else {
ORT_NOT_IMPLEMENTED("CopyTensor with offsets is not supported for string tensors");
}

return Status::OK();
}

}; // namespace onnxruntime
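The offset/size semantics used above reduce to a few lines of arithmetic. A standalone sketch of the same resolution and bounds check (illustrative helper, not part of the change):

#include <cstddef>
#include <stdexcept>

// Resolve the effective copy size the way the CPU implementation above does:
// size == 0 means "the whole source tensor", and both ranges must stay in bounds.
std::size_t ResolveCopySize(std::size_t src_bytes, std::size_t dst_bytes,
                            std::size_t src_offset, std::size_t dst_offset, std::size_t size) {
  const std::size_t copy_size = (size == 0) ? src_bytes : size;
  if (src_offset + copy_size > src_bytes)
    throw std::out_of_range("source offset + size exceeds source tensor size");
  if (dst_offset + copy_size > dst_bytes)
    throw std::out_of_range("destination offset + size exceeds destination tensor size");
  return copy_size;
}

// Example: a 1024-byte source, a 512-byte destination, copying 256 bytes from
// source offset 768 into destination offset 0:
// ResolveCopySize(1024, 512, 768, 0, 256) == 256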
12 changes: 11 additions & 1 deletion onnxruntime/core/framework/data_transfer.h
@@ -31,14 +31,23 @@ class IDataTransfer {

virtual common::Status CopyTensor(const Tensor& src, Tensor& dst) const;

virtual common::Status CopyTensor(const Tensor& src, Tensor& dst, size_t src_offset, size_t dst_offset, size_t size) const;

virtual common::Status CopyTensorAsync(const Tensor& /*src*/, Tensor& /*dst*/, Stream& /*stream*/) const {
ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
}

virtual common::Status CopyTensorAsync(const Tensor& /*src*/, Tensor& /*dst*/, size_t /*src_offset*/, size_t /*dst_offset*/, size_t /*size*/, Stream& /*stream*/) const {
ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
}

struct SrcDstPair {
std::reference_wrapper<const Tensor> src;
std::reference_wrapper<Tensor> dst;
Stream* src_stream; // producer stream of src
Stream* src_stream; // producer stream of src
size_t source_offset = 0; // offset in source tensor (in bytes)
size_t destination_offset = 0; // offset in destination tensor (in bytes)
size_t size = 0; // number of bytes to copy (0 means copy entire tensor)
};

// batched copy. default implementation copies each entry sequentially, and returns on first failure.
@@ -62,5 +71,6 @@ class CPUDataTransfer : public IDataTransfer {
using IDataTransfer::CopyTensor;
bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override;
common::Status CopyTensor(const Tensor& src, Tensor& dst) const override;
common::Status CopyTensor(const Tensor& src, Tensor& dst, size_t src_offset, size_t dst_offset, size_t size) const override;
};
} // namespace onnxruntime
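For illustration, a sketch of how code inside ORT might request a sub-range copy through the batched interface using the extended SrcDstPair fields (the tensors are assumed to exist; the offsets and size are illustrative):

#include <functional>
#include <vector>
#include "core/framework/data_transfer.h"

namespace onnxruntime {

// Copy bytes [128, 384) of src into the start of dst via the batched API.
inline common::Status CopySubRangeBatched(const IDataTransfer& data_transfer,
                                          const Tensor& src, Tensor& dst) {
  std::vector<IDataTransfer::SrcDstPair> pairs;
  pairs.push_back({std::cref(src), std::ref(dst), /*src_stream*/ nullptr,
                   /*source_offset*/ 128, /*destination_offset*/ 0, /*size*/ 256});
  return data_transfer.CopyTensors(pairs);
}

}  // namespace onnxruntime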
4 changes: 2 additions & 2 deletions onnxruntime/core/framework/plugin_data_transfer.cc
@@ -44,7 +44,7 @@ Status DataTransfer::CopyTensors(const std::vector<SrcDstPair>& src_dst_pairs) c
streams.push_back(reinterpret_cast<OrtSyncStream*>(src_dst_pairs[i].src_stream));
}

auto* status = impl_.CopyTensors(&impl_, src_values.data(), dst_values.data(), streams.data(),
auto* status = impl_.CopyTensors(&impl_, src_values.data(), dst_values.data(), nullptr, nullptr, nullptr, streams.data(),
src_dst_pairs.size());

return ToStatusAndRelease(status);
@@ -59,7 +59,7 @@ Status DataTransfer::CopyTensorImpl(const Tensor& src_tensor, Tensor& dst_tensor
const OrtValue* src_ptr = &src;
OrtValue* dst_ptr = &dst;
OrtSyncStream* stream_ptr = nullptr; // static_cast<OrtSyncStream*>(stream);
auto* status = impl_.CopyTensors(&impl_, &src_ptr, &dst_ptr, &stream_ptr, 1);
auto* status = impl_.CopyTensors(&impl_, &src_ptr, &dst_ptr, nullptr, nullptr, nullptr, &stream_ptr, 1);

return ToStatusAndRelease(status);
}
26 changes: 13 additions & 13 deletions onnxruntime/core/providers/webgpu/buffer_manager.cc
@@ -450,11 +450,11 @@ BufferManager::BufferManager(WebGpuContext& context, BufferCacheMode storage_buf
default_cache_{CreateBufferCacheManager(BufferCacheMode::Disabled)} {
}

void BufferManager::Upload(void* src, WGPUBuffer dst, size_t size) const {
void BufferManager::Upload(void* src, WGPUBuffer dst, size_t size, size_t src_offset, size_t dst_offset) const {
// If the buffer is mapped, we can directly write to it.
void* mapped_data = wgpuBufferGetMappedRange(dst, 0, WGPU_WHOLE_MAP_SIZE); // ensure the buffer is mapped
void* mapped_data = wgpuBufferGetMappedRange(dst, dst_offset, size);
if (mapped_data) {
memcpy(mapped_data, src, size);
memcpy(mapped_data, static_cast<char*>(src) + src_offset, size);
wgpuBufferUnmap(dst);
return;
}
@@ -468,31 +468,31 @@ void BufferManager::Upload(void* src, WGPUBuffer dst, size_t size) const {
desc.mappedAtCreation = true;

auto staging_buffer = context_.Device().CreateBuffer(&desc);
mapped_data = staging_buffer.GetMappedRange();
memcpy(mapped_data, src, size);
void* staging_mapped_data = staging_buffer.GetMappedRange();
memcpy(staging_mapped_data, static_cast<char*>(src) + src_offset, size);
staging_buffer.Unmap();

auto& command_encoder = context_.GetCommandEncoder();
context_.EndComputePass();
command_encoder.CopyBufferToBuffer(staging_buffer, 0, dst, 0, buffer_size);
command_encoder.CopyBufferToBuffer(staging_buffer, 0, dst, dst_offset, buffer_size);
context_.Flush(*this);
}

void BufferManager::MemCpy(WGPUBuffer src, WGPUBuffer dst, size_t size) const {
void BufferManager::MemCpy(WGPUBuffer src, WGPUBuffer dst, size_t size, size_t src_offset, size_t dst_offset) const {
ORT_ENFORCE(src != dst, "Source and destination buffers must be different.");
EnforceBufferUnmapped(context_, src);
EnforceBufferUnmapped(context_, dst);

auto buffer_size = NormalizeBufferSize(size);
auto src_size = static_cast<size_t>(wgpuBufferGetSize(src));
auto dst_size = static_cast<size_t>(wgpuBufferGetSize(dst));
ORT_ENFORCE(buffer_size <= src_size && buffer_size <= dst_size,
ORT_ENFORCE(src_offset + buffer_size <= src_size && dst_offset + buffer_size <= dst_size,
"Source and destination buffers must have enough space for the copy operation. src_size=",
src_size, ", dst_size=", dst_size, ", copy_size=", buffer_size, ".");
src_size, ", dst_size=", dst_size, ", src_offset=", src_offset, ", dst_offset=", dst_offset, ", copy_size=", buffer_size, ".");

auto& command_encoder = context_.GetCommandEncoder();
context_.EndComputePass();
command_encoder.CopyBufferToBuffer(src, 0, dst, 0, buffer_size);
command_encoder.CopyBufferToBuffer(src, src_offset, dst, dst_offset, buffer_size);
}
Comment on lines +481 to 496

Copilot AI Jan 14, 2026


Potential issue with buffer size normalization when using offsets. The NormalizeBufferSize function rounds up 'size' to be aligned to 16 bytes, but when dst_offset is applied, the total required buffer space is actually dst_offset + buffer_size. The current validation at line 489 checks dst_offset + buffer_size against dst_size, which is correct. However, if the destination buffer was created with a size that was normalized independently, there could be cases where the aligned buffer_size causes the operation to exceed the actual buffer bounds when combined with the offset.

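To make the concern above concrete, a self-contained numerical sketch. The 16-byte round-up stands in for NormalizeBufferSize as described in the comment; the real helper may differ.

#include <cstddef>
#include <iostream>

// Stand-in for NormalizeBufferSize: round size up to a 16-byte multiple (assumption).
std::size_t RoundUp16(std::size_t size) { return (size + 15) & ~std::size_t{15}; }

int main() {
  const std::size_t dst_size = 64;    // destination buffer size in bytes
  const std::size_t copy_size = 4;    // caller-requested copy size
  const std::size_t dst_offset = 60;  // dst_offset + copy_size == dst_size, so the request fits
  const std::size_t normalized = RoundUp16(copy_size);  // 16
  // dst_offset + normalized == 76 > 64: the aligned copy would overrun the buffer,
  // which is exactly the case the validation above rejects.
  std::cout << (dst_offset + normalized > dst_size) << "\n";  // prints 1
}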

WGPUBuffer BufferManager::Create(size_t size, wgpu::BufferUsage usage) const {
@@ -533,7 +533,7 @@ void BufferManager::Release(WGPUBuffer buffer) const {
GetCacheManager(buffer).ReleaseBuffer(buffer);
}

void BufferManager::Download(WGPUBuffer src, void* dst, size_t size) const {
void BufferManager::Download(WGPUBuffer src, void* dst, size_t size, size_t src_offset, size_t dst_offset) const {
EnforceBufferUnmapped(context_, src);
auto buffer_size = NormalizeBufferSize(size);

@@ -544,7 +544,7 @@ void BufferManager::Download(WGPUBuffer src, void* dst, size_t size) const {
auto staging_buffer = context_.Device().CreateBuffer(&desc);
auto& command_encoder = context_.GetCommandEncoder();
context_.EndComputePass();
command_encoder.CopyBufferToBuffer(src, 0, staging_buffer, 0, buffer_size);
command_encoder.CopyBufferToBuffer(src, src_offset, staging_buffer, 0, buffer_size);
context_.Flush(*this);

// TODO: revise wait in whole project
@@ -554,7 +554,7 @@ void BufferManager::Download(WGPUBuffer src, void* dst, size_t size) const {
})) == Status::OK());

auto mapped_data = staging_buffer.GetConstMappedRange();
memcpy(dst, mapped_data, size);
memcpy(static_cast<char*>(dst) + dst_offset, mapped_data, size);
staging_buffer.Unmap();
}

6 changes: 3 additions & 3 deletions onnxruntime/core/providers/webgpu/buffer_manager.h
@@ -68,12 +68,12 @@ class IBufferCacheManager {
class BufferManager {
public:
BufferManager(WebGpuContext& context, BufferCacheMode storage_buffer_cache_mode, BufferCacheMode uniform_buffer_cache_mode, BufferCacheMode query_resolve_buffer_cache_mode);
void Upload(void* src, WGPUBuffer dst, size_t size) const;
void MemCpy(WGPUBuffer src, WGPUBuffer dst, size_t size) const;
void Upload(void* src, WGPUBuffer dst, size_t size, size_t src_offset = 0, size_t dst_offset = 0) const;
void MemCpy(WGPUBuffer src, WGPUBuffer dst, size_t size, size_t src_offset = 0, size_t dst_offset = 0) const;
void Download(WGPUBuffer src, void* dst, size_t size, size_t src_offset = 0, size_t dst_offset = 0) const;
WGPUBuffer Create(size_t size, wgpu::BufferUsage usage) const;
bool SupportsUMA() const; // Check if CreateUMA is supported (i.e., the device has BufferMapExtendedUsages feature)
void Release(WGPUBuffer buffer) const;
void Download(WGPUBuffer src, void* dst, size_t size) const;
void RefreshPendingBuffers(GraphCaptureState graph_capture_state) const;

private:
14 changes: 10 additions & 4 deletions onnxruntime/core/providers/webgpu/data_transfer.cc
@@ -14,7 +14,11 @@ bool DataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_dev
}

common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const {
size_t bytes = src.SizeInBytes();
return CopyTensor(src, dst, 0, 0, 0);
}

common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst, size_t src_offset, size_t dst_offset, size_t size) const {
size_t bytes = size > 0 ? size : src.SizeInBytes();
if (bytes > 0) {
void const* src_data = src.DataRaw();
void* dst_data = dst.MutableDataRaw();
@@ -26,14 +30,16 @@ common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const {
if (src_device.Type() == OrtDevice::GPU) {
// copy from GPU to GPU
buffer_manager_.MemCpy(static_cast<WGPUBuffer>(const_cast<void*>(src_data)),
static_cast<WGPUBuffer>(dst_data), bytes);
static_cast<WGPUBuffer>(dst_data), bytes, src_offset, dst_offset);
} else {
// copy from CPU to GPU
buffer_manager_.Upload(const_cast<void*>(src_data), static_cast<WGPUBuffer>(dst_data), bytes);
buffer_manager_.Upload(const_cast<void*>(src_data),
static_cast<WGPUBuffer>(dst_data), bytes, src_offset, dst_offset);
}
} else /* if (src_device.Type() == OrtDevice::GPU) */ {
// copy from GPU to CPU
buffer_manager_.Download(static_cast<WGPUBuffer>(const_cast<void*>(src_data)), dst_data, bytes);
buffer_manager_.Download(static_cast<WGPUBuffer>(const_cast<void*>(src_data)),
dst_data, bytes, src_offset, dst_offset);
}
}

3 changes: 3 additions & 0 deletions onnxruntime/core/providers/webgpu/data_transfer.h
@@ -20,6 +20,9 @@ class DataTransfer : public IDataTransfer {

common::Status CopyTensor(const Tensor& src, Tensor& dst) const override;

// Copy tensor with offset and size support
common::Status CopyTensor(const Tensor& src, Tensor& dst, size_t src_offset, size_t dst_offset, size_t size) const override;

private:
const BufferManager& buffer_manager_;
};
11 changes: 10 additions & 1 deletion onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
@@ -317,6 +317,9 @@ struct WebGpuDataTransferImpl : OrtDataTransferImpl {
static OrtStatus* CopyTensorsImpl(OrtDataTransferImpl* this_ptr,
const OrtValue** src_tensors,
OrtValue** dst_tensors,
const size_t* source_offsets,
const size_t* destination_offsets,
const size_t* sizes,
OrtSyncStream** /*streams*/,
size_t num_tensors) noexcept {
auto& impl = *static_cast<WebGpuDataTransferImpl*>(this_ptr);
@@ -348,7 +351,13 @@ struct WebGpuDataTransferImpl : OrtDataTransferImpl {
for (size_t idx = 0; idx < num_tensors; ++idx) {
const OrtValue* src_tensor = src_tensors[idx];
OrtValue* dst_tensor = dst_tensors[idx];
auto status = impl.data_transfer_->CopyTensor(src_tensor->Get<Tensor>(), *dst_tensor->GetMutable<Tensor>());
size_t src_offset = source_offsets ? source_offsets[idx] : 0;
size_t dst_offset = destination_offsets ? destination_offsets[idx] : 0;
size_t copy_size = sizes ? sizes[idx] : 0;

common::Status status = impl.data_transfer_->CopyTensor(src_tensor->Get<Tensor>(), *dst_tensor->GetMutable<Tensor>(),
src_offset, dst_offset, copy_size);

if (!status.IsOK()) {
return OrtApis::CreateStatus(ORT_RUNTIME_EXCEPTION, status.ErrorMessage().c_str());
}