Skip to content

Commit

Permalink
[Amdgpu] Add amdgpu backend profiler (taichi-dev#7330)
Browse files Browse the repository at this point in the history
Issue: #taichi-dev#6434

### Brief Summary
1. Currently, only the default (event-based) profiler is available on AMDGPU.
2. Example profiler output:
<img width="1198" alt="image"
src="https://user-images.githubusercontent.com/47965866/217734581-4c7f7fa7-4d17-4243-b4bd-0a70d1c88f4a.png">

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
2 people authored and quadpixels committed May 13, 2023
1 parent f2a7d38 commit 2ba6d5f
Show file tree
Hide file tree
Showing 9 changed files with 385 additions and 4 deletions.
8 changes: 8 additions & 0 deletions taichi/program/kernel_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include "taichi/rhi/cuda/cuda_profiler.h"
#include "taichi/system/timeline.h"

#include "taichi/rhi/amdgpu/amdgpu_profiler.h"

namespace taichi::lang {

void KernelProfileStatisticalResult::insert_record(double t) {
Expand Down Expand Up @@ -143,6 +145,12 @@ std::unique_ptr<KernelProfilerBase> make_profiler(Arch arch, bool enable) {
return std::make_unique<KernelProfilerCUDA>(enable);
#else
TI_NOT_IMPLEMENTED;
#endif
} else if (arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
return std::make_unique<KernelProfilerAMDGPU>();
#else
TI_NOT_IMPLEMENTED
#endif
} else {
return std::make_unique<DefaultProfiler>();
Expand Down
1 change: 1 addition & 0 deletions taichi/rhi/amdgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ target_sources(${AMDGPU_RHI}
amdgpu_caching_allocator.cpp
amdgpu_context.cpp
amdgpu_driver.cpp
amdgpu_profiler.cpp
)

target_include_directories(${AMDGPU_RHI}
Expand Down
17 changes: 17 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
#include "taichi/program/program.h"
#include "taichi/system/threading.h"
#include "taichi/rhi/amdgpu/amdgpu_driver.h"
#include "taichi/rhi/amdgpu/amdgpu_profiler.h"
#include "taichi/analysis/offline_cache_util.h"
#include "taichi/util/offline_cache.h"

namespace taichi {
namespace lang {
Expand Down Expand Up @@ -120,6 +122,17 @@ void AMDGPUContext::launch(void *func,
unsigned grid_dim,
unsigned block_dim,
std::size_t dynamic_shared_mem_bytes) {
KernelProfilerBase::TaskHandle task_handle;
// Kernel launch
if (profiler_) {
KernelProfilerAMDGPU *profiler_amdgpu =
dynamic_cast<KernelProfilerAMDGPU *>(profiler_);
std::string primal_task_name, key;
bool valid =
offline_cache::try_demangle_name(task_name, primal_task_name, key);
profiler_amdgpu->trace(task_handle, valid ? primal_task_name : task_name,
func, grid_dim, block_dim, 0);
}
auto pack_size = get_args_byte(arg_sizes);
char *packed_arg = (char *)std::malloc(pack_size);
pack_args(arg_pointers, arg_sizes, packed_arg);
Expand All @@ -132,6 +145,10 @@ void AMDGPUContext::launch(void *func,
reinterpret_cast<void **>(&config));
}
std::free(packed_arg);

if (profiler_)
profiler_->stop(task_handle);

if (debug_) {
driver_.stream_synchronize(nullptr);
}
Expand Down
5 changes: 5 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class AMDGPUContext {
int compute_capability_;
std::string mcpu_;
std::mutex lock_;
KernelProfilerBase *profiler_;
AMDGPUDriver &driver_;
bool debug_;
std::vector<void *> kernel_arg_pointer_;
Expand Down Expand Up @@ -53,6 +54,10 @@ class AMDGPUContext {

int get_args_byte(std::vector<int> arg_sizes);

// Stores `profiler` in `profiler_`. `launch()` checks this pointer and, when
// it is non-null, traces and times each kernel launch through it.
void set_profiler(KernelProfilerBase *profiler) {
  this->profiler_ = profiler;
}

void launch(void *func,
const std::string &task_name,
const std::vector<void *> &arg_pointers,
Expand Down
252 changes: 252 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_profiler.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
#include "taichi/rhi/amdgpu/amdgpu_profiler.h"
#include "taichi/rhi/amdgpu/amdgpu_driver.h"
#include "taichi/rhi/amdgpu/amdgpu_context.h"
#include "taichi/rhi/amdgpu/amdgpu_types.h"

namespace taichi::lang {
#if defined(TI_WITH_AMDGPU)

// Returns the device name reported by the AMDGPUContext obtained from
// AMDGPUContext::get_instance().
std::string KernelProfilerAMDGPU::get_device_name() {
  auto &context = AMDGPUContext::get_instance();
  return context.get_device_name();
}

// Metric-based re-initialisation is not provided by the AMDGPU profiler:
// the body consists solely of TI_NOT_IMPLEMENTED, so `metrics` is unused.
bool KernelProfilerAMDGPU::reinit_with_metrics(
    const std::vector<std::string> metrics) {
  TI_NOT_IMPLEMENTED;
}

// Selects the profiling toolkit by name.
//
// Only "default" is accepted. Any other name logs a warning and is rejected.
// Returns true iff `toolkit_name` is exactly "default".
bool KernelProfilerAMDGPU::set_profiler_toolkit(std::string toolkit_name) {
  const bool is_default_toolkit = (toolkit_name == "default");
  if (!is_default_toolkit) {
    TI_WARN("Only default(event) profiler is allowed on AMDGPU");
  }
  return is_default_toolkit;
}

// Not implemented on the profiler itself: the body is only TI_NOT_IMPLEMENTED.
// Timing is started from trace(), which calls
// event_toolkit_->start_with_handle() instead of this method.
KernelProfilerBase::TaskHandle KernelProfilerAMDGPU::start_with_handle(
const std::string &kernel_name) {
TI_NOT_IMPLEMENTED;
}

// Starts timing a kernel launch and appends its static launch information to
// traced_records_.
//
// task_handle        [out] receives the handle returned by
//                          event_toolkit_->start_with_handle(); pass it to
//                          stop() once the kernel has been enqueued.
// kernel_name        name stored in the traced record and the event record.
// kernel             function handle queried for its attributes.
// grid_size          stored verbatim in the record.
// block_size         stored verbatim in the record.
// dynamic_smem_size  added to the kernel's static shared-memory size.
void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle,
                                 const std::string &kernel_name,
                                 void *kernel,
                                 uint32_t grid_size,
                                 uint32_t block_size,
                                 uint32_t dynamic_smem_size) {
  // The start event is recorded before the attribute queries below.
  task_handle = event_toolkit_->start_with_handle(kernel_name);

  auto &driver = AMDGPUDriver::get_instance();

  int regs_per_thread = 0;
  driver.kernel_get_attribute(
      &regs_per_thread, HIPfunction_attribute::HIP_FUNC_ATTRIBUTE_NUM_REGS,
      kernel);

  int static_smem_bytes = 0;
  driver.kernel_get_attribute(
      &static_smem_bytes,
      HIPfunction_attribute::HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);

  // Occupancy (active_blocks_per_multiprocessor) is intentionally left at its
  // default: kernel_get_occupancy doesn't work well on this backend.

  KernelProfileTracedRecord traced;
  traced.name = kernel_name;
  traced.grid_size = grid_size;
  traced.block_size = block_size;
  traced.register_per_thread = regs_per_thread;
  traced.shared_mem_per_block = static_smem_bytes + dynamic_smem_size;

  traced_records_.push_back(traced);
}

// Finishes timing the launch started by trace().
//
// `handle` is the stop event handed out by trace(). It is recorded on
// stream 0, the stream is synchronised, and two durations are written into
// the event toolkit's current record:
//   - kernel_elapsed_time_in_ms: start event -> stop event
//   - time_since_base:           base event  -> start event
// Both events of the record are destroyed afterwards.
void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) {
  auto &driver = AMDGPUDriver::get_instance();

  driver.event_record(handle, 0);
  driver.stream_synchronize(nullptr);

  auto current = event_toolkit_->get_current_event_record();
  driver.event_elapsed_time(&current->kernel_elapsed_time_in_ms,
                            current->start_event, handle);
  driver.event_elapsed_time(&current->time_since_base,
                            event_toolkit_->get_base_event(),
                            current->start_event);

  // The timings are stored in the record; the events are no longer needed.
  driver.event_destroy(current->start_event);
  driver.event_destroy(current->stop_event);
}

bool KernelProfilerAMDGPU::statistics_on_traced_records() {
for (auto &record : traced_records_) {
auto it =
std::find_if(statistical_results_.begin(), statistical_results_.end(),
[&](KernelProfileStatisticalResult &result) {
return result.name == record.name;
});
if (it == statistical_results_.end()) {
statistical_results_.emplace_back(record.name);
it = std::prev(statistical_results_.end());
}
it->insert_record(record.kernel_elapsed_time_in_ms);
total_time_ms_ += record.kernel_elapsed_time_in_ms;
}

return true;
}

// Synchronises the stream identified by nullptr through the AMDGPU driver.
void KernelProfilerAMDGPU::sync() {
  auto &driver = AMDGPUDriver::get_instance();
  driver.stream_synchronize(nullptr);
}

// Moves the timings collected by the event toolkit into the profiler's own
// records. The steps run in this order:
//   1. update_record() copies each event record's elapsed time and
//      time-since-base into traced_records_, starting at index
//      records_size_after_sync_.
//   2. update_timeline() emits timeline events for the traced records.
//   3. statistics_on_traced_records() aggregates per-kernel statistics.
//   4. The event toolkit is cleared and records_size_after_sync_ is advanced
//      to the current size of traced_records_.
// NOTE(review): steps 2 and 3 iterate over all of traced_records_, not only
// the entries added since the previous update() — confirm that repeated
// update() calls without an intervening clear() do not double-count.
void KernelProfilerAMDGPU::update() {
event_toolkit_->update_record(records_size_after_sync_, traced_records_);
event_toolkit_->update_timeline(traced_records_);
statistics_on_traced_records();
event_toolkit_->clear();
records_size_after_sync_ = traced_records_.size();
}

// Resets the profiler's accumulated state.
// update() is called first, which also clears the event toolkit. After that
// the total time, the sync watermark, the traced records and the statistical
// results are all reset.
void KernelProfilerAMDGPU::clear() {
update();
total_time_ms_ = 0;
records_size_after_sync_ = 0;
traced_records_.clear();
statistical_results_.clear();
}

#else
// Fallback compiled when TI_WITH_AMDGPU is not defined: no device is
// available, so the body is only TI_NOT_IMPLEMENTED.
std::string KernelProfilerAMDGPU::get_device_name() {
  TI_NOT_IMPLEMENTED;
}

// Fallback compiled when TI_WITH_AMDGPU is not defined: the body is only
// TI_NOT_IMPLEMENTED and `metrics` is unused.
bool KernelProfilerAMDGPU::reinit_with_metrics(
    const std::vector<std::string> metrics) {
  TI_NOT_IMPLEMENTED;
}

// Fallback compiled when TI_WITH_AMDGPU is not defined: the body is only
// TI_NOT_IMPLEMENTED and `kernel_name` is unused.
KernelProfilerBase::TaskHandle
KernelProfilerAMDGPU::start_with_handle(const std::string &kernel_name) {
TI_NOT_IMPLEMENTED;
}

// Fallback compiled when TI_WITH_AMDGPU is not defined: the body is only
// TI_NOT_IMPLEMENTED and all parameters are unused (task_handle is not
// written).
void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle,
const std::string &kernel_name,
void *kernel,
uint32_t grid_size,
uint32_t block_size,
uint32_t dynamic_smem_size) {
TI_NOT_IMPLEMENTED;
}

// Fallback compiled when TI_WITH_AMDGPU is not defined: the body is only
// TI_NOT_IMPLEMENTED and `handle` is unused.
void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) {
  TI_NOT_IMPLEMENTED;
}

// Fallback compiled when TI_WITH_AMDGPU is not defined: the body is only
// TI_NOT_IMPLEMENTED.
bool KernelProfilerAMDGPU::statistics_on_traced_records() {
  TI_NOT_IMPLEMENTED;
}

// Fallback compiled when TI_WITH_AMDGPU is not defined: the body is only
// TI_NOT_IMPLEMENTED.
void KernelProfilerAMDGPU::sync() {
  TI_NOT_IMPLEMENTED;
}
// Fallback compiled when TI_WITH_AMDGPU is not defined: the body is only
// TI_NOT_IMPLEMENTED.
void KernelProfilerAMDGPU::update() {
  TI_NOT_IMPLEMENTED;
}

// Fallback compiled when TI_WITH_AMDGPU is not defined: the body is only
// TI_NOT_IMPLEMENTED.
void KernelProfilerAMDGPU::clear() {
  TI_NOT_IMPLEMENTED;
}

#endif

#if defined(TI_WITH_AMDGPU)

// Creates a start/stop event pair for `kernel_name`, records the start event
// on stream 0, and appends the record to event_records_.
//
// If base_event_ is still unset, it is established here (after the start
// event above has been recorded): 100 throw-away events are created,
// recorded and synchronised as a warm-up; the last one is kept as base_event_
// and the host time sampled right after its synchronisation becomes
// base_time_.
//
// Returns the stop event of the new record. The caller records it once the
// kernel has been enqueued.
KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle(
    const std::string &kernel_name) {
  auto &driver = AMDGPUDriver::get_instance();

  EventRecord pending;
  pending.name = kernel_name;
  driver.event_create(&pending.start_event, HIP_EVENT_DEFAULT);
  driver.event_create(&pending.stop_event, HIP_EVENT_DEFAULT);
  driver.event_record(pending.start_event, 0);
  event_records_.push_back(pending);

  if (base_event_) {
    return pending.stop_event;
  }

  // Warm up
  constexpr int kWarmupIters = 100;
  for (int iter = 0; iter < kWarmupIters; ++iter) {
    void *probe = nullptr;
    driver.event_create(&probe, HIP_EVENT_DEFAULT);
    driver.event_record(probe, 0);
    driver.event_synchronize(probe);
    const auto synced_at = Time::get_time();

    const bool is_last = (iter == kWarmupIters - 1);
    if (!is_last) {
      driver.event_destroy(probe);
      continue;
    }
    base_event_ = probe;
    // ignore the overhead of sync, event_create and systematic time offset.
    base_time_ = synced_at;
  }
  return pending.stop_event;
}

// Copies the timings held in event_records_ into the tail of
// `traced_records`.
//
// records_size_after_sync  index in `traced_records` of the first entry that
//                          has not received timings yet.
// traced_records           destination; entry (records_size_after_sync + i)
//                          receives the timings of the i-th event record.
//
// Raises an error when records_size_after_sync + event_records_.size() does
// not equal traced_records.size(). event_records_ is not modified here.
void EventToolkitAMDGPU::update_record(
    uint32_t records_size_after_sync,
    std::vector<KernelProfileTracedRecord> &traced_records) {
  const uint32_t events_num = event_records_.size();
  const uint32_t records_num = traced_records.size();
  TI_ERROR_IF(
      records_size_after_sync + events_num != records_num,
      "KernelProfilerAMDGPU::EventToolkitAMDGPU: event_records_.size({}) != "
      "traced_records_.size({})",
      records_size_after_sync + events_num, records_num);

  uint32_t dst_index = records_size_after_sync;
  for (const auto &event : event_records_) {
    auto &target = traced_records[dst_index];
    target.kernel_elapsed_time_in_ms = event.kernel_elapsed_time_in_ms;
    target.time_since_base = event.time_since_base;
    ++dst_index;
  }
}

// Emits a begin/end timeline event pair, tagged "amdgpu", for every entry of
// `traced_records`. Does nothing when timelines are disabled.
//
// Event times are base_time_ plus the record's offsets scaled by 1e-3: the
// begin event uses time_since_base, the end event uses
// time_since_base + kernel_elapsed_time_in_ms.
void EventToolkitAMDGPU::update_timeline(
    std::vector<KernelProfileTracedRecord> &traced_records) {
  if (!Timelines::get_instance().get_enabled()) {
    return;
  }

  auto &timeline = Timeline::get_this_thread_instance();
  for (auto &traced : traced_records) {
    const auto begin_time = base_time_ + traced.time_since_base * 1e-3;
    const auto end_time =
        base_time_ +
        (traced.time_since_base + traced.kernel_elapsed_time_in_ms) * 1e-3;

    timeline.insert_event(
        {traced.name, /*param_name=begin*/ true, begin_time, "amdgpu"});
    timeline.insert_event(
        {traced.name, /*param_name=begin*/ false, end_time, "amdgpu"});
  }
}

#else

// Fallback compiled when TI_WITH_AMDGPU is not defined: the body is only
// TI_NOT_IMPLEMENTED and `kernel_name` is unused.
KernelProfilerBase::TaskHandle
EventToolkitAMDGPU::start_with_handle(const std::string &kernel_name) {
TI_NOT_IMPLEMENTED;
}
// Fallback compiled when TI_WITH_AMDGPU is not defined: the body is only
// TI_NOT_IMPLEMENTED and `traced_records` is left untouched.
void EventToolkitAMDGPU::update_record(
uint32_t records_size_after_sync,
std::vector<KernelProfileTracedRecord> &traced_records) {
TI_NOT_IMPLEMENTED;
}
// Fallback compiled when TI_WITH_AMDGPU is not defined: the body is only
// TI_NOT_IMPLEMENTED and no timeline events are emitted.
void EventToolkitAMDGPU::update_timeline(
std::vector<KernelProfileTracedRecord> &traced_records) {
TI_NOT_IMPLEMENTED;
}

#endif

} // namespace taichi::lang
Loading

0 comments on commit 2ba6d5f

Please sign in to comment.