Skip to content

Commit

Permalink
tc comprehension integration Ref. SINGA-482
Browse files Browse the repository at this point in the history
  • Loading branch information
Your Name committed Aug 15, 2019
1 parent 806dbe7 commit 827658a
Show file tree
Hide file tree
Showing 7 changed files with 539 additions and 3 deletions.
44 changes: 44 additions & 0 deletions cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,47 @@ IF(USE_MKLDNN)
INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR})
LIST(APPEND SINGA_LINKER_LIBS ${MKLDNN_LIBRARIES})
ENDIF()


### Tensor comprehensions (begin)
# Locations of the Tensor Comprehensions (TC) checkout and of the conda
# environment it was built in. The defaults reproduce the paths that used to be
# hard-coded here; override them on the command line when TC lives elsewhere:
#   cmake -DTC_ROOT=/path/to/TensorComprehensions -DTC_CONDA_PREFIX=/path/to/env ..
# The include paths must stay consistent with the include paths used in src.
SET(TC_ROOT "/root/TensorComprehensions" CACHE PATH
    "Root of the TensorComprehensions source tree (with its build/ directory)")
SET(TC_CONDA_PREFIX "/root/conda/envs/tc_build" CACHE PATH
    "Prefix of the conda environment TensorComprehensions was built in")
SET(TC_TORCH_LIB_DIR "${TC_CONDA_PREFIX}/lib/python3.6/site-packages/torch/lib")

INCLUDE_DIRECTORIES("${TC_ROOT}")
INCLUDE_DIRECTORIES("${TC_ROOT}/tc/version")
INCLUDE_DIRECTORIES("${TC_ROOT}/build")

# polyhedral model required
INCLUDE_DIRECTORIES("${TC_ROOT}/isl_interface/include")

# dlpack
INCLUDE_DIRECTORIES("${TC_ROOT}/third-party/dlpack/include")
# Halide
INCLUDE_DIRECTORIES("${TC_CONDA_PREFIX}/include/Halide")

# llvm
INCLUDE_DIRECTORIES("${TC_CONDA_PREFIX}/include")

# torch ATen header TO DELETE
INCLUDE_DIRECTORIES("${TC_TORCH_LIB_DIR}/include")

# find Halide lib
# NOTE: the REQUIRED keyword of find_library only exists since CMake 3.18, so
# the "library must exist" requirement is enforced explicitly instead.
SET(HALIDE_PREFIX "${TC_CONDA_PREFIX}")
FIND_LIBRARY(HALIDE_LIBRARIES NAMES Halide PATHS ${HALIDE_PREFIX} PATH_SUFFIXES lib lib64 NO_DEFAULT_PATH)
IF(NOT HALIDE_LIBRARIES)
    MESSAGE(FATAL_ERROR "Halide library not found under ${HALIDE_PREFIX} (set TC_CONDA_PREFIX)")
ENDIF()
MESSAGE(STATUS "Found Halide.so file: ${HALIDE_LIBRARIES}")

# find tc lib
LINK_DIRECTORIES("${TC_ROOT}/build/tc/aten")
LINK_DIRECTORIES("${TC_ROOT}/build/tc/lang")
LINK_DIRECTORIES("${TC_ROOT}/build/tc/core")
LINK_DIRECTORIES("${TC_ROOT}/build/tc/autotuner")
LINK_DIRECTORIES("${TC_ROOT}/build/tc/proto")

# torch(aten) lib to delete
LINK_DIRECTORIES("${TC_TORCH_LIB_DIR}")

LIST(APPEND SINGA_LINKER_LIBS ${HALIDE_LIBRARIES} tc_aten tc_lang tc_core_cpu tc_cuda tc_core_cuda_no_sdk tc_core tc_autotuner tc_proto ATen)

### Tensor comprehensions (end)
101 changes: 101 additions & 0 deletions include/singa/core/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@
#include <tuple>
#include <memory>

#include <dlpack/dlpack.h>
#include <tc/core/tensor.h>
#include <tc/utils/compiler_options.h>
#include <tc/core/compiler.h>
#include <tc/core/utils/time.h>
#include <tc/core/cuda/cuda_backend.h>
#include "tc/core/cuda/cuda_tc_executor.h"

#include "singa/core/common.h"
#include "singa/core/device.h"
#include "singa/proto/core.pb.h"
Expand Down Expand Up @@ -147,6 +155,7 @@ class Tensor {

/// Return average L2 norm
float L2() const;

// --------------------------------------------------------------------------
// ---Following methods changes the internal data
// --------------------------------------------------------------------------
Expand Down Expand Up @@ -603,6 +612,98 @@ Tensor ConcatRows(const vector<Tensor> &in);
Tensor ConcatenateColumns(const vector<Tensor> &in);
/// Alias name for function ConcatenateColumns
Tensor ConcatColumns(const vector<Tensor> &in);




/// tc integration start

/// Wraps `src` in a newly allocated DLPack managed tensor.
/// The DLPack data pointer is taken from `src.block()->mutable_data()`, so no
/// tensor data is copied. The caller releases the result by invoking its
/// `deleter` member on the returned pointer.
/// Throws std::logic_error when `src` is not of type kFloat32.
DLManagedTensor* toDLPack(const Tensor& src);
//Tensor fromDLPack(const DLManagedTensor* src);

/// Converts every tensor in `tensors` into a mutable TC DLPack tensor
/// (toDLPack followed by tc::makeDLTensor), preserving the order.
inline std::vector<tc::DLTensorUPtr> makeDLTensors(
const std::vector<Tensor>& tensors);

/// Compiles the TC function `entryPoint` found in the TC source `tc` for the
/// given `inputs` by forwarding to tc::compile<Backend>.
/// \param options          backend specific mapping options
/// \param compilerOptions  generic TC compiler options (default constructed
///                         when omitted)
/// \return the executor produced by tc::compile<Backend>
template <typename Backend>
std::unique_ptr<typename Backend::ExecutorType> compileTC(
const std::string& tc,
const std::string& entryPoint,
const std::vector<Tensor>& inputs,
const typename Backend::MappingOptionsType& options,
const tc::CompilerOptions& compilerOptions = tc::CompilerOptions());


/// Parses `tc` and infers the output tensor metadata of `entryPoint` for the
/// given `inputs`. Throws a TC lang ErrorReport when `entryPoint` is not
/// defined in `tc`.
std::vector<tc::DLTensorUPtr> inferOutputTensorInfo(
const std::string& tc,
const std::string& entryPoint,
const std::vector<Tensor>& inputs);

/// Creates one output Tensor per output of `entryPoint`, with the shapes
/// reported by inferOutputTensorInfo and the device and data type of
/// `inputs[0]`. Returns an empty vector when the TC has no outputs.
std::vector<Tensor> prepareOutputs(
const std::string& tc,
const std::string& entryPoint,
const std::vector<Tensor>& inputs);

/// Runs a compiled TC `executor` on `inputs`; the tensors in `outputs` are
/// handed to the executor as the output buffers (see prepareOutputs).
template <typename Executor>
void runTC(
const Executor& executor,
const std::vector<Tensor>& inputs,
std::vector<Tensor>& outputs);


// tensor comprehension operations
// Each operation compiles its TC definition with the naive CUDA mapping
// options on every call, runs it and returns the first output tensor.

/// Softmax over the second dimension of a 2-D float tensor.
Tensor SoftMaxTC(const Tensor &in);
/// Element-wise max(x, 0) of a 2-D float tensor.
Tensor ReluTC(const Tensor &in);
/// Matrix product of a (M, N) and a (N, K) float tensor.
Tensor MatMulTC(const Tensor &in1,const Tensor &in2);


// makeDLConstTensors implementation
inline std::vector<tc::DLConstTensorUPtr> makeDLConstTensors(const std::vector<Tensor>& tensors) {
std::vector<tc::DLConstTensorUPtr> dlTensors;
for (auto tensor : tensors) {
auto dlMTensor = toDLPack(tensor);
dlTensors.push_back(tc::makeDLConstTensor(&(dlMTensor->dl_tensor)));
dlMTensor->deleter(dlMTensor);
}
return dlTensors;
}

// makeDLTensors implementation
inline std::vector<tc::DLTensorUPtr> makeDLTensors( const std::vector<Tensor>& tensors) {
std::vector<tc::DLTensorUPtr> dlTensors;
for (auto tensor : tensors) {
auto dlMTensor = toDLPack(tensor);
dlTensors.push_back(tc::makeDLTensor(&(dlMTensor->dl_tensor)));
dlMTensor->deleter(dlMTensor);
}
return dlTensors;
}


// compileTC implementation
/// Builds read-only DLPack views of `inputs` and hands them, together with the
/// TC source, the entry point name and both option sets, to
/// tc::compile<Backend>. The executor returned by TC is passed through.
template <typename Backend>
std::unique_ptr<typename Backend::ExecutorType> compileTC(
    const std::string& tc,
    const std::string& entryPoint,
    const std::vector<Tensor>& inputs,
    const typename Backend::MappingOptionsType& options,
    const tc::CompilerOptions& compilerOptions) {
  // The DLPack views must stay alive while the raw pointers are in use.
  auto dlInputs = makeDLConstTensors(inputs);
  auto rawInputs = extractRawPtrs(dlInputs);
  return tc::compile<Backend>(
      tc, entryPoint, rawInputs, options, compilerOptions);
}

// runTC implementation
/// Executes a compiled TC: `inputs` are exposed to the executor as read-only
/// DLPack tensors and `outputs` as mutable DLPack tensors.
template <typename Executor>
void runTC(
    const Executor& executor,
    const std::vector<Tensor>& inputs,
    std::vector<Tensor>& outputs) {
  // Keep the DLPack views alive for the duration of the run() call.
  auto dlInputs = makeDLConstTensors(inputs);
  auto dlOutputs = makeDLTensors(outputs);
  executor.run(extractRawPtrs(dlInputs), extractRawPtrs(dlOutputs));
}

/// tc integration end

} // namespace singa

#endif // SINGA_CORE_TENSOR_H_
22 changes: 22 additions & 0 deletions src/api/core_tensor.i
Original file line number Diff line number Diff line change
Expand Up @@ -345,4 +345,26 @@ namespace singa{

Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t);
Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t);

/* ============ Tensor Comprehensions ============ */
/* /root/incubator-singa/build/src/api/singa_wrap.cxx:14938:166: error: use of deleted function */
/* Because of the compile error quoted above, this approach was abandoned; the declarations are kept for reference only:
std::vector<Tensor> prepareOutputs(
const std::string& tc,
const std::string& entryPoint,
const std::vector<Tensor>& inputs);
template <typename Executor>
void runTC( const Executor& executor, const std::vector<Tensor>& inputs, std::vector<Tensor>& outputs);
%template(runTCCuda) runTC<tc::CudaTcExecutor>;
template <typename Backend>
std::unique_ptr<typename Backend::ExecutorType> compileTC(
const std::string& tc,
const std::string& entryPoint,
const std::vector<Tensor>& inputs,
const typename Backend::MappingOptionsType& options,
const tc::CompilerOptions& compilerOptions = tc::CompilerOptions());
%template(compileTCCuda) compileTC<tc::CudaBackend>;
*/
}
165 changes: 165 additions & 0 deletions src/core/tensor/tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,23 @@
#include "./tensor_math_cpp.h"
#include "./tensor_math_cuda.h"
#include "./tensor_math_opencl.h"

#include <utility>
#include <algorithm>

//#include <tc/lang/error_report.h>
//#include <tc/core/compiler.h>
#include "tc/core/check.h"
#include "tc/core/compiler.h"
#include "tc/core/tc_executor.h"
#include "tc/core/tensor.h"

#define Noaxis 9999

// A namespace named `lang` already exists in singa;
// alias TC's global `lang` namespace to avoid the name clash.
namespace tclang = lang;

namespace singa {

Tensor::~Tensor() {
Expand Down Expand Up @@ -1334,4 +1345,158 @@ Tensor Reshape(const Tensor &in, const Shape &s) {
return out.Reshape(s);
}


/// tc integration start
/// Bundles a DLManagedTensor with everything its DLTensor points into, so that
/// deleting this struct releases all memory owned by the DLPack wrapper.
struct SingaDLManagedTensor {
  /// Copy of the source Tensor, held for as long as the wrapper exists.
  Tensor handle;
  /// The DLPack object handed out to callers; its manager_ctx points back to
  /// the enclosing SingaDLManagedTensor.
  DLManagedTensor tensor;
  /// Backing storage for tensor.dl_tensor.shape.
  std::vector<int64_t> shape;
  /// Backing storage for tensor.dl_tensor.strides.
  std::vector<int64_t> strides;
};

/// DLPack deleter installed by toDLPack(): destroys the SingaDLManagedTensor
/// stored in `manager_ctx`, which also frees the shape and stride arrays.
void deleter(DLManagedTensor* arg) {
  if (arg == nullptr) {
    return;
  }
  delete static_cast<SingaDLManagedTensor*>(arg->manager_ctx);
}

/// Maps the data type of `t` to its DLPack description.
/// Only kFloat32 is supported; any other type throws std::logic_error.
static DLDataType getDLDataType(const Tensor& t) {
  DLDataType dtype;
  dtype.lanes = 1;
  // TODO: derive the bit width from the data type once more types are
  // supported; kFloat32 is 4 bytes wide.
  dtype.bits = 4 * 8;
  switch (t.data_type()) {
    case kFloat32:
      dtype.code = DLDataTypeCode::kDLFloat;
      break;
    default:
      throw std::logic_error("only kFloat32 is supported for dlpack conversion");
  }
  return dtype;
}

/// Builds the DLPack context for `tensor`.
/// TODO: the device type is always reported as GPU regardless of where
/// `tensor` actually lives; derive it from the tensor's device.
static DLContext getDLContext(const Tensor& tensor, const int64_t& device_id) {
  (void)tensor;  // not inspected yet, see TODO above
  DLContext ctx;
  ctx.device_id = static_cast<decltype(ctx.device_id)>(device_id);
  ctx.device_type = DLDeviceType::kDLGPU;
  return ctx;
}

/// Wraps the SINGA tensor `src` in a heap allocated DLManagedTensor.
///
/// The data pointer of the result is `src.block()->mutable_data()`; no tensor
/// data is copied. Shape and strides are converted to int64_t arrays that are
/// owned by the wrapper. The caller must release the result by calling
/// `result->deleter(result)`.
///
/// \throws std::logic_error if `src` is not of type kFloat32.
DLManagedTensor* toDLPack(const Tensor& src) {
  // Owned by a unique_ptr until fully initialised, so that an exception (for
  // example from getDLDataType) does not leak the wrapper.
  std::unique_ptr<SingaDLManagedTensor> managed(new SingaDLManagedTensor);
  managed->handle = src;

  // The shape/stride storage lives inside the wrapper and is therefore freed
  // by deleter(); previously these arrays were allocated separately and never
  // released.
  const auto& srcShape = src.shape();
  managed->shape.assign(srcShape.begin(), srcShape.end());
  const auto& srcStride = src.stride();
  managed->strides.assign(srcStride.begin(), srcStride.end());

  // TODO: always device 0 for now; use the id of the tensor's device.
  int64_t device_id = 0;

  DLTensor& dl = managed->tensor.dl_tensor;
  dl.data = src.block()->mutable_data();
  dl.ctx = getDLContext(src, device_id);
  dl.ndim = src.nDim();
  dl.dtype = getDLDataType(src);
  dl.shape = managed->shape.data();
  dl.strides = managed->strides.data();
  dl.byte_offset = 0;

  managed->tensor.manager_ctx = managed.get();
  managed->tensor.deleter = &deleter;
  return &(managed.release()->tensor);
}

// Output inference
/// Parses the TC source `tc`, looks up `entryPoint` and asks TC to infer the
/// metadata of its outputs for the given `inputs`.
/// Throws a TC lang ErrorReport when `entryPoint` is not defined in `tc`.
std::vector<tc::DLTensorUPtr> inferOutputTensorInfo(
    const std::string& tc,
    const std::string& entryPoint,
    const std::vector<Tensor>& inputs) {
  auto parsedTcs = tc::detail::parse(tc);

  const bool entryPointDefined = (parsedTcs.count(entryPoint) == 1u);
  if (!entryPointDefined) {
    // Report the error against the first parsed TC definition.
    TC_CHECK_GE(parsedTcs.size(), 1u)
        << "No TC was parsed, should have thrown earlier";
    throw tclang::ErrorReport(parsedTcs.begin()->second)
        << "\nattempting to access undefined entryPoint: " << entryPoint;
  }

  // The DLPack views must outlive the raw pointers extracted from them.
  auto dlInputs = makeDLConstTensors(inputs);
  auto rawInputs = extractRawPtrs(dlInputs);
  return makeDLTensorVector(
      tc::detail::inferOutputTensorInfo(parsedTcs.at(entryPoint), rawInputs));
}

std::vector<Tensor> prepareOutputs(
const std::string& tc,
const std::string& entryPoint,
const std::vector<Tensor>& inputs) {
std::vector<Tensor> outputs;
auto outTensorInfo = inferOutputTensorInfo(tc, entryPoint, inputs);
if (outTensorInfo.size() == 0) {
return outputs;
}
TC_CHECK_GE(inputs.size(), 1u)
<< "NYI: Need >= 1 input tensors to determine "
<< "backend and prepare ATen outputs. Add an overload with just an ATen "
<< "backend";

auto dev = inputs[0].device();
auto dtype = inputs[0].data_type();
for (size_t i = 0; i < outTensorInfo.size(); ++i) {
tc::TensorInfo info(outTensorInfo[i]);
Shape shape(info.shape.begin(), info.shape.end());

Tensor tmp(shape, dev, dtype);
outputs.push_back(tmp);
}
return outputs;
}


// examples of TC operations
Tensor SoftMaxTC(const Tensor &in) {
std::string tc= R"TC(
def softmax(float(N, D) I) -> (O, expsum) {
expsum(n) +=! exp(I(n, d))
O(n, d) = exp(I(n, d)) / expsum(n)
}
)TC";
auto naiveOptions = tc::CudaBackend::MappingOptionsType::makeNaiveMappingOptions();
auto pExecutor = singa::compileTC<tc::CudaBackend>(tc, "softmax", {in}, {naiveOptions});
auto outputs = singa::prepareOutputs(tc, "softmax", {in});
singa::runTC(*pExecutor, {in}, outputs);
return outputs[0];
}

Tensor ReluTC(const Tensor &in) {
std::string tc = R"TC(
def relu(float(B,M) I) -> (O1){
O1(b, m) = fmax(I(b, m), 0)
}
)TC";
auto naiveOptions = tc::CudaBackend::MappingOptionsType::makeNaiveMappingOptions();
auto pExecutor = singa::compileTC<tc::CudaBackend>(tc, "relu", {in}, {naiveOptions});
auto outputs = singa::prepareOutputs(tc, "relu", {in});
singa::runTC(*pExecutor, {in}, outputs);
return outputs[0];
}

Tensor MatMulTC(const Tensor &in1,const Tensor &in2) {
std::string tc = R"TC(
def matmul(float(M,N) A, float(N,K) B) -> (output) {
output(i, j) +=! A(i, kk) * B(kk, j)
}
)TC";
auto naiveOptions = tc::CudaBackend::MappingOptionsType::makeNaiveMappingOptions();
auto pExecutor = singa::compileTC<tc::CudaBackend>(tc, "matmul", {in1, in2}, {naiveOptions});
auto outputs = singa::prepareOutputs(tc, "matmul", {in1, in2});
singa::runTC(*pExecutor, {in1, in2}, outputs);
return outputs[0];
}
/// tc integration end


} // namespace singa
4 changes: 3 additions & 1 deletion test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/include)



IF(ENABLE_DIST)
ADD_EXECUTABLE(test_ep "singa/test_ep.cc")
ADD_DEPENDENCIES(test_ep singa)
Expand All @@ -33,7 +35,7 @@ LIST(REMOVE_ITEM singa_test_source "singa/test_ep.cc")
ADD_EXECUTABLE(test_singa "gtest/gtest_main.cc" ${singa_test_source})
ADD_DEPENDENCIES(test_singa singa)
#MESSAGE(STATUS "link libs" ${singa_linker_libs})
TARGET_LINK_LIBRARIES(test_singa gtest singa )
TARGET_LINK_LIBRARIES(test_singa gtest singa ${SINGA_LINKER_LIBS})
IF(UNIX AND (NOT APPLE))
LIST(APPEND LINK_FLAGS "-pthread")
ENDIF()
Expand Down
Loading

0 comments on commit 827658a

Please sign in to comment.