#pragma once
#include <string>
#include <vector>
#include <torch/csrc/profiler/api.h>
#include <torch/csrc/profiler/kineto_shim.h>
#include <torch/csrc/profiler/util.h>
namespace torch {
namespace autograd {
namespace profiler {
struct TORCH_API KinetoEvent {
  uint64_t startThreadId() const {
    return start_thread_id_;
  }

  KinetoEvent& startThreadId(uint64_t start_thread_id) {
    start_thread_id_ = start_thread_id;
    return *this;
  }

  uint64_t endThreadId() const {
    return end_thread_id_;
  }

  KinetoEvent& endThreadId(uint64_t end_thread_id) {
    end_thread_id_ = end_thread_id;
    return *this;
  }

  uint8_t activityType() const {
    return activity_type_;
  }

  KinetoEvent& activityType(uint8_t activity_type) {
    activity_type_ = activity_type;
    return *this;
  }

  uint64_t fwdThreadId() const {
    return fwd_thread_id_;
  }

  KinetoEvent& fwdThreadId(uint64_t fwd_thread_id) {
    fwd_thread_id_ = fwd_thread_id;
    return *this;
  }
  bool hasShapes() const {
    return shapes_ != c10::nullopt;
  }

  const std::vector<std::vector<int64_t>>& shapes() const {
    return *shapes_;
  }

  KinetoEvent& shapes(const std::vector<std::vector<int64_t>>& shapes) {
    shapes_ = shapes;
    return *this;
  }

  bool hasTypes() const {
    return dtypes_ != c10::nullopt;
  }

  const std::vector<std::string>& dtypes() const {
    return *dtypes_;
  }

  KinetoEvent& dtypes(const std::vector<std::string>& dtypes) {
    dtypes_ = dtypes;
    return *this;
  }

  uint64_t flops() const {
    return flops_;
  }

  KinetoEvent& flops(uint64_t flops) {
    flops_ = flops;
    return *this;
  }

  int64_t sequenceNr() const {
    return sequence_nr_;
  }

  KinetoEvent& sequenceNr(int64_t sequence_nr) {
    sequence_nr_ = sequence_nr;
    return *this;
  }

  bool hasStack() const {
    return stack_ != c10::nullopt;
  }

  const std::vector<std::string>& stack() const {
    return *stack_;
  }

  KinetoEvent& stack(const std::vector<std::string>& st) {
    stack_ = st;
    return *this;
  }
  uint8_t scope() const {
    return scope_;
  }

  KinetoEvent& scope(uint8_t scope) {
    scope_ = scope;
    return *this;
  }

  bool hasModuleHierarchy() const {
    return module_hierarchy_ != c10::nullopt;
  }

  const std::vector<std::string>& moduleHierarchy() const {
    return *module_hierarchy_;
  }

  KinetoEvent& moduleHierarchy(
      const std::vector<std::string>& module_hierarchy) {
    module_hierarchy_ = module_hierarchy;
    return *this;
  }

  KinetoEvent& debugHandle(int64_t debug_handle) {
    debug_handle_ = debug_handle;
    return *this;
  }

  int64_t debugHandle() const {
    return debug_handle_;
  }

  std::string name() const {
    return name_;
  }

  KinetoEvent& name(const std::string& evt_name) {
    name_ = evt_name;
    return *this;
  }

  KinetoEvent& setAsync(bool is_async) {
    is_async_ = is_async;
    return *this;
  }

  c10::DeviceType deviceType() const {
    return (c10::DeviceType)device_type_;
  }

  KinetoEvent& deviceType(c10::DeviceType device_type) {
    device_type_ = (int8_t)device_type;
    return *this;
  }

  uint8_t deviceIndex() const {
    return device_index_;
  }

  KinetoEvent& deviceIndex(uint8_t device_index) {
    device_index_ = device_index;
    return *this;
  }

  int64_t nBytes() const {
    return nbytes_;
  }

  KinetoEvent& nBytes(int64_t nbytes) {
    nbytes_ = nbytes;
    return *this;
  }
  uint64_t startUs() const {
    return start_us_;
  }

  KinetoEvent& startUs(uint64_t start_us) {
    start_us_ = start_us;
    return *this;
  }

  uint64_t durationUs() const {
    return duration_us_;
  }

  KinetoEvent& durationUs(uint64_t duration_us) {
    duration_us_ = duration_us;
    return *this;
  }

  bool isAsync() const {
    return is_async_;
  }

  uint64_t correlationId() const {
    return correlation_id_;
  }

  KinetoEvent& correlationId(uint64_t correlation_id) {
    correlation_id_ = correlation_id;
    return *this;
  }

  uint64_t linkedCorrelationId() const {
    return linked_correlation_id_;
  }

  KinetoEvent& linkedCorrelationId(uint64_t linked_correlation_id) {
    linked_correlation_id_ = linked_correlation_id;
    return *this;
  }

  int64_t deviceResourceId() const {
    return device_resource_id_;
  }

  KinetoEvent& deviceResourceId(int64_t device_resource_id) {
    device_resource_id_ = device_resource_id;
    return *this;
  }

  std::string backend() const {
    return backend_;
  }

  KinetoEvent& backend(const std::string& backend) {
    backend_ = backend;
    return *this;
  }

  int64_t cudaElapsedUs() const;
  uint64_t start_thread_id_ = 0;
  uint64_t end_thread_id_ = 0;
  uint64_t fwd_thread_id_ = 0;
  int64_t sequence_nr_ = -1;
  uint8_t scope_ = 0;
  uint8_t activity_type_ = 0;
  c10::optional<std::vector<std::vector<int64_t>>> shapes_;
  c10::optional<std::vector<std::string>> stack_;
  c10::optional<std::vector<std::string>> module_hierarchy_;
  c10::optional<std::vector<std::string>> dtypes_;
  uint64_t flops_ = 0;

  std::string name_;
  uint8_t device_index_ = 0;
  int8_t device_type_ = 0;

  uint64_t start_us_ = 0;
  uint64_t duration_us_ = 0;
  uint64_t correlation_id_ = 0;
  uint64_t linked_correlation_id_ = 0;
  int64_t device_resource_id_ = 0;

  int64_t nbytes_ = 0;
  bool is_async_{false};
  int64_t debug_handle_{-1};
  std::string backend_;

  torch::profiler::impl::CUDAEventStub cuda_event_start_ = nullptr;
  torch::profiler::impl::CUDAEventStub cuda_event_end_ = nullptr;
};
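
// Usage note (illustrative sketch, not part of the original header): the
// getters and setters above follow a fluent builder pattern, so an event can
// be filled in with a single chained expression. The values below are made up.
//
//   KinetoEvent e;
//   e.name("aten::add")
//       .startUs(1000)
//       .durationUs(25)
//       .deviceType(c10::DeviceType::CPU)
//       .sequenceNr(7);
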
// Consolidating events returned directly from Kineto
// with events manually created by us (e.g. start/stop marks,
// memory allocation events)
struct TORCH_API ProfilerResult {
  ProfilerResult();
  ProfilerResult(
      uint64_t start_time,
      std::vector<KinetoEvent> events,
      torch::profiler::impl::kineto::ActivityTraceWrapper trace);
  ~ProfilerResult();

  uint64_t trace_start_us() const {
    return trace_start_us_;
  }

  const std::vector<KinetoEvent>& events() const {
    return events_;
  }

  void save(const std::string& path);

 private:
  uint64_t trace_start_us_ = 0;
  std::vector<KinetoEvent> events_;
  torch::profiler::impl::kineto::ActivityTraceWrapper trace_;
};
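
// Usage note (illustrative sketch): `result` is assumed to be a
// std::unique_ptr<ProfilerResult> returned by disableProfiler() (declared
// below); the output file name is arbitrary.
//
//   for (const KinetoEvent& e : result->events()) {
//     // e.name(), e.startUs(), e.durationUs(), e.deviceType(), ...
//   }
//   result->save("trace.json");
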
/*
 * This API is used by backends to record the latency of events that
 * happened in the backend but were not visible to the PyTorch runtime.
 * For example, if part of the model is lowered to a DSP backend, then
 * the execution of that part of the model is delegated to the backend.
 * When the backend finishes execution, it has the option to provide profiling
 * information (latency only at the moment) for the different operators that
 * were executed in the backend.
 * When such events are recorded by the backend using this API, the event
 * records are collected by the active Kineto profiler. If no Kineto profiler
 * is active then the event is ignored.
 * This provides a way to generate all the profiling information for a model
 * regardless of where the model (or part of it) executed.
 * @param start_time_us: start time in us of the event
 * @param end_time_us: end time in us of the event
 * @param debug_handle: debug handle to correlate this event/op with
 *   model-level module/source information
 * @param scope: scope of the event, e.g. LITE_INTERPRETER, RECORD_FN etc.
 * @param event_name: name of the event, e.g. op name
 * @param backend_name: name of the backend where the event took place.
 */
TORCH_API void reportBackendEventToActiveKinetoProfiler(
    const int64_t start_time_us,
    const int64_t end_time_us,
    const int64_t debug_handle,
    const at::RecordScope scope,
    const std::string& event_name,
    const std::string& backend_name);
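
// Example (illustrative sketch): a backend that has just finished running a
// delegated op could report it as follows; the timestamps, debug handle, and
// names are hypothetical.
//
//   reportBackendEventToActiveKinetoProfiler(
//       /*start_time_us=*/op_start_us,
//       /*end_time_us=*/op_end_us,
//       /*debug_handle=*/42,
//       at::RecordScope::LITE_INTERPRETER,
//       /*event_name=*/"dsp::conv2d",
//       /*backend_name=*/"example_dsp_backend");
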
TORCH_API void enableProfiler(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities,
    const std::unordered_set<at::RecordScope>& scopes = {});
/*
 * Same as enableProfiler but with a callback for post-processing of
 * KinetoEvents.
 * enableProfilerWithEventPostProcess enables the profiler to capture the
 * specified activities, with the specified RecordFunction scopes, if any.
 * Additionally, it takes a functor that does in-place post-processing of
 * events, e.g. lazily populating stack trace or module hierarchy information
 * using debug_handle.
 * An example use is the lite interpreter, which has a recording scope of
 * LITE_INTERPRETER. In this case the lite interpreter runtime records debug
 * handles in RecordFunction, along with other information. Debug handles are
 * eventually passed down to KinetoEvent and recorded as part of the event.
 * KinetoEdgeCPUProfiler, in torch/csrc/jit/mobile/profiler_edge.cpp, enables
 * the profiler using a post-processing callback, via
 * enableProfilerWithEventPostProcess, that takes these debug handles and
 * generates stack trace and module hierarchy information once profiling is
 * done.
 */
TORCH_API void enableProfilerWithEventPostProcess(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities,
    std::function<void(std::vector<KinetoEvent>&)>&& cb,
    const std::unordered_set<at::RecordScope>& scopes = {});
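
// Example (illustrative sketch): a post-processing callback that lazily fills
// in stack information from debug handles. `config` and `activities` are
// assumed to have been built by the caller, and `lookupStack` is a
// hypothetical helper mapping a debug handle to a stack trace.
//
//   enableProfilerWithEventPostProcess(
//       config,
//       activities,
//       [](std::vector<KinetoEvent>& events) {
//         for (auto& e : events) {
//           if (e.debugHandle() >= 0) {
//             e.stack(lookupStack(e.debugHandle()));
//           }
//         }
//       });
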
TORCH_API std::unique_ptr<ProfilerResult> disableProfiler();
TORCH_API void prepareProfiler(
    const torch::profiler::impl::ProfilerConfig& config,
    const std::set<torch::profiler::impl::ActivityType>& activities);
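
// Example (illustrative sketch of the overall lifecycle): `config` and
// `activities` are assumed to have been constructed by the caller.
//
//   prepareProfiler(config, activities);
//   enableProfiler(config, activities);
//   // ... run the workload to be profiled ...
//   std::unique_ptr<ProfilerResult> result = disableProfiler();
//   result->save("trace.json");
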
namespace python_tracer {
/*
Libtorch does not depend on Python (e.g. it cannot #include <Python.h>);
however, when we call the profiler from libtorch_python we need the profiler
to be able to ingest the data that we collect from the Python tracer
(`PyEval_SetProfile`). In order to solve this dependency issue we define a set
of methods which do not contain any Python symbols, but which can carry the
information that Kineto needs, such as times and names. The Python tracer then
implements these functions and wraps their registration in an init function
which is called from `torch/csrc/autograd/init.cpp`. This pattern of
registration for faux Python dependencies in libtorch is common in the PyTorch
codebase.
*/
enum CallType { kPyCall = 0, kPyModuleCall, kCCall };
struct TORCH_API PyTraceEvent {
  int64_t startTime_;
  int64_t endTime_;
  std::string name_;

  uint64_t thread_id_;
  PyTraceEvent* parent_;
  CallType call_type_;
  size_t module_id_; // Only set when call_type_ == kPyModuleCall

  // Index in the list of raw call and return events. This allows one to
  // convert a vector of PyTraceEvents back into the constituent call and
  // return events, even when events share the same timestamp.
  size_t call_idx_;
  size_t return_idx_;
};
enum Command { kStartOne = 0, kStartAll, kStop, kClear };
using CallFn = void (*)(Command);
using TraceEventsFn = std::vector<std::unique_ptr<PyTraceEvent>> (*)();
TORCH_API void registerFunctions(CallFn call, TraceEventsFn get_events);
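
// Example (illustrative sketch): the Python tracer side might register its
// implementations during module initialization; `myCall` and `myGetEvents`
// are hypothetical free functions matching the typedefs above.
//
//   void initMyTracer() {
//     registerFunctions(&myCall, &myGetEvents);
//   }
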
// Because we are interleaving events, the Python tracer should use the same
// timer as the profiler.
TORCH_API int64_t now();
} // namespace python_tracer
} // namespace profiler
} // namespace autograd
} // namespace torch