From e5cc9b74c383c70dbab8811d65e85bf9453c8090 Mon Sep 17 00:00:00 2001 From: kbobrovs Date: Mon, 11 Oct 2021 08:19:55 -0700 Subject: [PATCH] [ESIMD] Refactor esimd intrinsic mapping to BE intrinsics. (#4720) * [ESIMD] Refactor esimd intrinsic mapping to BE intrinsics. This patch - makes names and parameter lists of __esimd* intrinsics match their @llvm.genx counterparts. The benefits are: * this removes the extra logical translation layer between __esimd* and @llvm.genx thus simplifying overall user-level esimd intrinsic translation * allows to reuse lots of functionality between SLM and surface memory accesses - moves some of the translations and argument setting (like accessor field to surface index, setting scale) from LowerESIMD.cpp to the ESIMD headers, which simplifies code base. - for all memory intrinsics moves host and device implementations to the same intrinsic function prototype separating them via __SYCL_DEVICE_ONLY__ macro thus avoiding duplication of the prototypes - removes certain redundant __esimd* intrinsics, such as SLM memory accesses (which are normal surface accesses with special surface index 254), and __esimd_reduced_fmax,... which have the same functionality as usual fmax,... 
This is also a preparatory step for fixing SLM memory accesses (revising vector lengths, element types restrictions) Signed-off-by: Konstantin S Bobrovsky --- llvm/lib/SYCLLowerIR/LowerESIMD.cpp | 259 ++-- .../SYCLLowerIR/esimd_lower_debug_info.ll | 20 +- llvm/test/SYCLLowerIR/esimd_lower_intrins.ll | 120 +- .../ext/intel/experimental/esimd/common.hpp | 5 + .../experimental/esimd/detail/intrin.hpp | 51 +- .../experimental/esimd/detail/math_intrin.hpp | 375 +++--- .../esimd/detail/memory_intrin.hpp | 1150 +++++++---------- .../esimd/detail/simd_obj_impl.hpp | 19 +- .../intel/experimental/esimd/detail/util.hpp | 6 + .../ext/intel/experimental/esimd/math.hpp | 54 +- .../ext/intel/experimental/esimd/memory.hpp | 257 ++-- sycl/test/esimd/intrins_trans.cpp | 160 ++- 12 files changed, 1178 insertions(+), 1298 deletions(-) diff --git a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp index 6c979d3ada026..8fb811ac5ba64 100644 --- a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp @@ -92,7 +92,6 @@ struct ESIMDIntrinDesc { SRC_CALL_ARG, // is a call argument SRC_CALL_ALL, // this and subsequent args are just copied from the src call SRC_TMPL_ARG, // is an integer template argument - NUM_BYTES, // is a number of bytes (gather.scaled and scatter.scaled) UNDEF, // is an undef value CONST_INT8, // is an i8 constant CONST_INT16, // is an i16 constant @@ -102,7 +101,6 @@ struct ESIMDIntrinDesc { enum class GenXArgConversion : int16_t { NONE, // no conversion - TO_SI, // convert to 32-bit integer surface index TO_I1, // convert vector of N-bit integer to 1-bit TO_I8, // convert vector of N-bit integer to 18-bit TO_I16, // convert vector of N-bit integer to 16-bit @@ -174,7 +172,6 @@ class ESIMDIntrinDescTable { } DEF_ARG_RULE(l, SRC_CALL_ALL) DEF_ARG_RULE(u, UNDEF) - DEF_ARG_RULE(nbs, NUM_BYTES) static constexpr ESIMDIntrinDesc::ArgRule t(int16_t N) { return ESIMDIntrinDesc::ArgRule{ @@ -218,10 +215,11 @@ class
ESIMDIntrinDescTable { {{N, ESIMDIntrinDesc::GenXArgConversion::TO_I1}}}; } + // Just an alias for a(int16_t N) to mark surface index arguments. static constexpr ESIMDIntrinDesc::ArgRule aSI(int16_t N) { return ESIMDIntrinDesc::ArgRule{ ESIMDIntrinDesc::SRC_CALL_ARG, - {{N, ESIMDIntrinDesc::GenXArgConversion::TO_SI}}}; + {{N, ESIMDIntrinDesc::GenXArgConversion::NONE}}}; } static constexpr ESIMDIntrinDesc::ArgRule c8(int16_t N) { @@ -249,6 +247,16 @@ class ESIMDIntrinDescTable { } public: + // The table which describes rules how to generate @llvm.genx.* intrinsics + // from templated __esimd* intrinsics. The general rule is that the order and + // the semantics of intrinsic arguments is the same in both intrinsic forms. + // But for some arguments, where @llvm.genx.* mandates that the argument must + // be 'constant' (see Intrinsic_definitions.py from the vcintrinsics repo), + // it is passed as template argument to the corresponding __esimd* intrinsic, + // hence leading to some "gaps" in __esimd* form's arguments compared to the + // @llvm.genx.* form. + // TODO - fix all __esimd* intrinsics and table entries according to the rule + // above.
ESIMDIntrinDescTable() { Table = { // An element of the table is std::pair of ; key is the @@ -284,90 +292,150 @@ class ESIMDIntrinDescTable { {"vload", {"vload", {l(0)}}}, {"vstore", {"vstore", {a(1), a(0)}}}, - {"flat_block_read_unaligned", {"svm.block.ld.unaligned", {l(0)}}}, - {"flat_block_write", {"svm.block.st", {l(1)}}}, - {"flat_read", {"svm.gather", {ai1(2), a(1), a(0), u(-1)}}}, - {"flat_read4", + {"svm_block_ld_unaligned", {"svm.block.ld.unaligned", {l(0)}}}, + {"svm_block_st", {"svm.block.st", {l(1)}}}, + {"svm_gather", {"svm.gather", {ai1(2), a(1), a(0), u(-1)}}}, + {"svm_gather4_scaled", {"svm.gather4.scaled", {ai1(1), t(2), c16(0), c64(0), a(0), u(-1)}}}, - {"flat_write", {"svm.scatter", {ai1(3), a(2), a(0), a(1)}}}, - {"flat_write4", + {"svm_scatter", {"svm.scatter", {ai1(3), a(2), a(0), a(1)}}}, + {"svm_scatter4_scaled", {"svm.scatter4.scaled", {ai1(2), t(2), c16(0), c64(0), a(0), a(1)}}}, - // surface index-based gather/scatter: - // num blocks, scale, surface index, global offset, elem offsets - {"surf_read", {"gather.scaled2", {t(3), c16(0), aSI(1), a(2), a(3)}}}, - // pred, num blocks, scale, surface index, global offset, elem offsets, - // data to write - {"surf_write", - {"scatter.scaled", {ai1(0), t(3), c16(0), aSI(2), a(3), a(4), a(5)}}}, - // intrinsics to query thread's coordinates: {"group_id_x", {"group.id.x", {}}}, {"group_id_y", {"group.id.y", {}}}, {"group_id_z", {"group.id.z", {}}}, {"local_id", {"local.id", {}}}, {"local_size", {"local.size", {}}}, - {"flat_atomic0", {"svm.atomic", {ai1(1), a(0), u(-1)}, bo(0)}}, - {"flat_atomic1", {"svm.atomic", {ai1(2), a(0), a(1), u(-1)}, bo(0)}}, - {"flat_atomic2", + {"svm_atomic0", {"svm.atomic", {ai1(1), a(0), u(-1)}, bo(0)}}, + {"svm_atomic1", {"svm.atomic", {ai1(2), a(0), a(1), u(-1)}, bo(0)}}, + {"svm_atomic2", {"svm.atomic", {ai1(3), a(0), a(1), a(2), u(-1)}, bo(0)}}, - {"reduced_fmax", {"fmax", {a(0), a(1)}}}, - {"reduced_umax", {"umax", {a(0), a(1)}}}, - {"reduced_smax", {"smax", {a(0), 
a(1)}}}, - {"reduced_fmin", {"fmin", {a(0), a(1)}}}, - {"reduced_umin", {"umin", {a(0), a(1)}}}, - {"reduced_smin", {"smin", {a(0), a(1)}}}, {"dp4", {"dp4", {a(0), a(1)}}}, - // 2nd argumnent of media.* is a surface index - - // it is produced by casting and truncating the OpenCL opaque image - // pointer - // source media_block* intrinsic argument; this is according the the - // OpenCL runtime - JIT compiler handshake protocol for OpenCL images. - {"media_block_load", - {"media.ld", {a(0), aSI(1), a(2), a(3), a(4), a(5)}}}, - {"media_block_store", - {"media.st", {a(0), aSI(1), a(2), a(3), a(4), a(5), a(6)}}}, - {"slm_fence", {"fence", {a(0)}}}, + + {"fence", {"fence", {a(0)}}}, {"barrier", {"barrier", {}}}, {"sbarrier", {"sbarrier", {a(0)}}}, - {"block_read", {"oword.ld.unaligned", {c32(0), aSI(0), a(1)}}}, - {"block_write", {"oword.st", {aSI(0), a(1), a(2)}}}, - {"slm_block_read", {"oword.ld", {c32(0), c32(SLM_BTI), a(0)}}}, - {"slm_block_write", {"oword.st", {c32(SLM_BTI), a(0), a(1)}}}, - {"slm_read", - {"gather.scaled", - {ai1(1), nbs(-1), c16(0), c32(SLM_BTI), c32(0), a(0), u(-1)}}}, - {"slm_read4", - {"gather4.scaled", - {ai1(1), t(2), c16(0), c32(SLM_BTI), c32(0), a(0), u(-1)}}}, - {"slm_write", - {"scatter.scaled", - {ai1(2), nbs(1), c16(0), c32(SLM_BTI), c32(0), a(0), a(1)}}}, - {"slm_write4", - {"scatter4.scaled", - {ai1(2), t(2), c16(0), c32(SLM_BTI), c32(0), a(0), a(1)}}}, - {"slm_atomic0", - {"dword.atomic", {ai1(1), c32(SLM_BTI), a(0), u(-1)}, bo(0)}}, - {"slm_atomic1", - {"dword.atomic", {ai1(2), c32(SLM_BTI), a(0), a(1), u(-1)}, bo(0)}}, - {"slm_atomic2", - {"dword.atomic", - {ai1(3), c32(SLM_BTI), a(0), a(1), a(2), u(-1)}, - bo(0)}}, - {"raw_sends_load", + + // arg0: i32 modifiers, constant + // arg1: i32 surface index + // arg2: i32 plane, constant + // arg3: i32 block width in bytes, constant + // (block height inferred from return type size and block width) + // arg4: i32 x byte offset + // arg5: i32 y byte offset + {"media_ld", {"media.ld", 
{t(3), aSI(0), t(5), t(6), a(1), a(2)}}}, + + // arg0: i32 modifiers, constant + // arg1: i32 surface index + // arg2: i32 plane, constant + // arg3: i32 block width in bytes, constant + // (block height inferred from data type size and block width) + // arg4: i32 x byte offset + // arg5: i32 y byte offset + // arg6: data to write (overloaded) + {"media_st", + {"media.st", {t(3), aSI(0), t(5), t(6), a(1), a(2), a(3)}}}, + + // arg0 : i32 is_modified, CONSTANT + // arg1 : i32 surface index + // arg2 : i32 offset(in owords for.ld / in bytes for.ld.unaligned) + {"oword_ld_unaligned", {"oword.ld.unaligned", {t(3), aSI(0), a(1)}}}, + {"oword_ld", {"oword.ld", {t(3), aSI(0), a(1)}}}, + + // arg0: i32 surface index + // arg1: i32 offset (in owords) + // arg2: data to write (overloaded) + {"oword_st", {"oword.st", {aSI(0), a(1), a(2)}}}, + + // surface index-based gather/scatter: + // arg0: i32 log2 num blocks, CONSTANT (0/1/2 for num blocks 1/2/4) + // arg1: i16 scale, CONSTANT + // arg2: i32 surface index + // arg3: i32 global offset in bytes + // arg4: vXi32 element offset in bytes (overloaded) + {"gather_scaled2", + {"gather.scaled2", {t(3), t(4), aSI(0), a(1), a(2)}}}, + + // arg0: vXi1 predicate (overloaded) + // arg1: i32 log2 num blocks, CONSTANT (0/1/2 for num blocks 1/2/4) + // arg2: i16 scale, CONSTANT + // arg3: i32 surface index + // arg4: i32 global offset in bytes + // arg5: vXi32 element offset in bytes (overloaded) + // arg6: old value of the data read + {"gather_scaled", + {"gather.scaled", {ai1(0), t(3), t(4), aSI(1), a(2), a(3), u(-1)}}}, + + // arg0: vXi1 predicate (overloaded) + // arg1: i32 log2 num blocks, CONSTANT (0/1/2 for num blocks 1/2/4) + // arg2: i16 scale, CONSTANT + // arg3: i32 surface index + // arg4: i32 global offset in bytes + // arg5: vXi32 element offset (overloaded) + // arg6: data to write (overloaded) + {"scatter_scaled", + {"scatter.scaled", {ai1(0), t(3), t(4), aSI(1), a(2), a(3), a(4)}}}, + + // arg0: vXi1 predicate 
(overloaded) (overloaded) + // arg1: i32 channel mask, CONSTANT + // arg2: i16 scale, CONSTANT + // arg3: i32 surface index + // arg4: i32 global offset in bytes + // arg5: vXi32 element offset in bytes (overloaded) + // arg6: old value of the data read + {"gather4_scaled", + {"gather4.scaled", {ai1(0), t(3), t(4), aSI(1), a(2), a(3), u(-1)}}}, + + // arg0: vXi1 predicate (overloaded) + // arg1: i32 channel mask, constant + // arg2: i16 scale, constant + // arg3: i32 surface index + // arg4: i32 global offset in bytes + // arg5: vXi32 element offset in bytes (overloaded) + // arg6: data to write (overloaded) + {"scatter4_scaled", + {"scatter4.scaled", {ai1(0), t(3), t(4), aSI(1), a(2), a(3), a(4)}}}, + + // arg0: vXi1 predicate (overloaded) + // arg1: i32 surface index + // arg2: vXi32 element offset in bytes + // arg3: vXi32 original value of the register that the data is read into + {"dword_atomic0", + {"dword.atomic", {ai1(0), aSI(1), a(2), u(-1)}, bo(0)}}, + + // arg0: vXi1 predicate (overloaded) + // arg1: i32 surface index + // arg2: vXi32 element offset in bytes (overloaded) + // arg3: vXi32/vXfloat src + // arg4: vXi32/vXfloat original value of the register that the data is + // read into + {"dword_atomic1", + {"dword.atomic", {ai1(0), aSI(1), a(2), a(3), u(-1)}, bo(0)}}, + + // arg0: vXi1 predicate (overloaded) + // arg1: i32 surface index + // arg2: vXi32 element offset in bytes + // arg3: vXi32 src0 + // arg4: vXi32 src1 + // arg5: vXi32 original value of the register that the data is read into + {"dword_atomic2", + {"dword.atomic", {ai1(0), aSI(1), a(2), a(3), a(4), u(-1)}, bo(0)}}, + + {"raw_sends2", {"raw.sends2", {a(0), a(1), ai1(2), a(3), a(4), a(5), a(6), a(7), a(8), a(9), a(10), a(11)}}}, - {"raw_send_load", + {"raw_send2", {"raw.send2", {a(0), a(1), ai1(2), a(3), a(4), a(5), a(6), a(7), a(8), a(9)}}}, - {"raw_sends_store", + {"raw_sends2_noresult", {"raw.sends2.noresult", {a(0), a(1), ai1(2), a(3), a(4), a(5), a(6), a(7), a(8), a(9)}}}, - 
{"raw_send_store", + {"raw_send2_noresult", {"raw.send2.noresult", {a(0), a(1), ai1(2), a(3), a(4), a(5), a(6), a(7)}}}, - {"satf", {"sat", {a(0)}}}, + {"sat", {"sat", {a(0)}}}, {"fptoui_sat", {"fptoui.sat", {a(0)}}}, {"fptosi_sat", {"fptosi.sat", {a(0)}}}, {"uutrunc_sat", {"uutrunc.sat", {a(0)}}}, @@ -401,8 +469,8 @@ class ESIMDIntrinDescTable { {"smin", {"smin", {a(0), a(1)}}}, {"bfrev", {"bfrev", {a(0)}}}, {"cbit", {"cbit", {a(0)}}}, - {"bfins", {"bfi", {a(0), a(1), a(2), a(3)}}}, - {"bfext", {"sbfe", {a(0), a(1), a(2)}}}, + {"bfi", {"bfi", {a(0), a(1), a(2), a(3)}}}, + {"sbfe", {"sbfe", {a(0), a(1), a(2)}}}, {"fbl", {"fbl", {a(0)}}}, {"sfbh", {"sfbh", {a(0)}}}, {"ufbh", {"ufbh", {a(0)}}}, @@ -410,12 +478,12 @@ class ESIMDIntrinDescTable { {"log", {"log", {a(0)}}}, {"exp", {"exp", {a(0)}}}, {"sqrt", {"sqrt", {a(0)}}}, - {"sqrt_ieee", {"ieee.sqrt", {a(0)}}}, + {"ieee_sqrt", {"ieee.sqrt", {a(0)}}}, {"rsqrt", {"rsqrt", {a(0)}}}, {"sin", {"sin", {a(0)}}}, {"cos", {"cos", {a(0)}}}, {"pow", {"pow", {a(0), a(1)}}}, - {"div_ieee", {"ieee.div", {a(0), a(1)}}}, + {"ieee_div", {"ieee.div", {a(0), a(1)}}}, {"uudp4a", {"uudp4a", {a(0), a(1), a(2)}}}, {"usdp4a", {"usdp4a", {a(0), a(1), a(2)}}}, {"sudp4a", {"sudp4a", {a(0), a(1), a(2)}}}, @@ -542,7 +610,6 @@ static APInt parseTemplateArg(id::FunctionEncoding *FE, unsigned int N, Ty = IntegerType::getInt16Ty(Ctx); break; case ESIMDIntrinDesc::GenXArgConversion::TO_I32: - case ESIMDIntrinDesc::GenXArgConversion::TO_SI: Ty = IntegerType::getInt32Ty(Ctx); break; } @@ -826,7 +893,7 @@ static bool translateVStore(CallInst &CI, SmallPtrSet &GVTS) { return true; } -static void translateGetValue(CallInst &CI) { +static void translateGetSurfaceIndex(CallInst &CI) { auto opnd = CI.getArgOperand(0); assert(opnd->getType()->isPointerTy()); IRBuilder<> Builder(&CI); @@ -1000,15 +1067,6 @@ static void createESIMDIntrinsicArgs(const ESIMDIntrinDesc &Desc, GenXArgs.push_back(Cmp); break; } - case ESIMDIntrinDesc::GenXArgConversion::TO_SI: { - 
// convert a pointer to 32-bit integer surface index - assert(Arg->getType()->isPointerTy()); - IRBuilder<> Bld(&CI); - Value *Res = - Bld.CreatePtrToInt(Arg, IntegerType::getInt32Ty(CI.getContext())); - GenXArgs.push_back(Res); - break; - } default: llvm_unreachable("Unknown ESIMD arg conversion"); } @@ -1029,19 +1087,6 @@ static void createESIMDIntrinsicArgs(const ESIMDIntrinDesc &Desc, GenXArgs.push_back(ArgVal); break; } - case ESIMDIntrinDesc::GenXArgRuleKind::NUM_BYTES: { - Type *Ty = Rule.I.Arg.CallArgNo == -1 - ? CI.getType() - : CI.getArgOperand(Rule.I.Arg.CallArgNo)->getType(); - assert(Ty->isVectorTy()); - int NBits = - cast(Ty)->getElementType()->getPrimitiveSizeInBits(); - assert(NBits == 8 || NBits == 16 || NBits == 32); - int NWords = NBits / 16; - GenXArgs.push_back( - ConstantInt::get(IntegerType::getInt32Ty(CI.getContext()), NWords)); - break; - } case ESIMDIntrinDesc::GenXArgRuleKind::UNDEF: { Type *Ty = Rule.I.Arg.CallArgNo == -1 ? CI.getType() @@ -1385,7 +1430,7 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F, F.addFnAttr(Attribute::AlwaysInline); SmallVector ESIMDIntrCalls; - SmallVector ESIMDToErases; + SmallVector ToErase; for (Instruction &I : instructions(F)) { if (auto CastOp = dyn_cast(&I)) { @@ -1406,7 +1451,7 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F, llvm::Instruction::CastOps TruncOp = llvm::Instruction::Trunc; llvm::Value *NewDst = Builder.CreateCast(TruncOp, Src, DstTy); CastOp->replaceAllUsesWith(NewDst); - ESIMDToErases.push_back(CastOp); + ToErase.push_back(CastOp); } } @@ -1415,7 +1460,7 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F, if (CI && (Callee = CI->getCalledFunction())) { // TODO workaround for ESIMD BE until it starts supporting @llvm.assume if (match(&I, PatternMatch::m_Intrinsic())) { - ESIMDToErases.push_back(CI); + ToErase.push_back(CI); continue; } StringRef Name = Callee->getName(); @@ -1433,17 +1478,17 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F, if 
(Name.startswith("N2cl4sycl3ext5intel12experimental5esimd8slm_init")) { // tag the kernel with meta-data SLMSize, and remove this builtin translateSLMInit(*CI); - ESIMDToErases.push_back(CI); + ToErase.push_back(CI); continue; } if (Name.startswith("__esimd_pack_mask")) { translatePackMask(*CI); - ESIMDToErases.push_back(CI); + ToErase.push_back(CI); continue; } if (Name.startswith("__esimd_unpack_mask")) { translateUnPackMask(*CI); - ESIMDToErases.push_back(CI); + ToErase.push_back(CI); continue; } // If vload/vstore is not about the vector-types used by @@ -1452,20 +1497,20 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F, // those insts can be optimized by llvm ASAP. if (Name.startswith("__esimd_vload")) { if (translateVLoad(*CI, GVTS)) { - ESIMDToErases.push_back(CI); + ToErase.push_back(CI); continue; } } if (Name.startswith("__esimd_vstore")) { if (translateVStore(*CI, GVTS)) { - ESIMDToErases.push_back(CI); + ToErase.push_back(CI); continue; } } - if (Name.startswith("__esimd_get_value")) { - translateGetValue(*CI); - ESIMDToErases.push_back(CI); + if (Name.startswith("__esimd_get_surface_index")) { + translateGetSurfaceIndex(*CI); + ToErase.push_back(CI); continue; } @@ -1499,14 +1544,14 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F, // Replaces the original global load and it is uses and stores the old // instructions to ESIMDToErases. 
translateSpirvGlobalUses(LI, SpirvGlobal->getName().drop_front(PrefLen), - ESIMDToErases); + ToErase); } } // Now demangle and translate found ESIMD intrinsic calls for (auto *CI : ESIMDIntrCalls) { translateESIMDIntrinsicCall(*CI); } - for (auto *CI : ESIMDToErases) { + for (auto *CI : ToErase) { CI->eraseFromParent(); } diff --git a/llvm/test/SYCLLowerIR/esimd_lower_debug_info.ll b/llvm/test/SYCLLowerIR/esimd_lower_debug_info.ll index 8d9476575534e..438900c72d041 100644 --- a/llvm/test/SYCLLowerIR/esimd_lower_debug_info.ll +++ b/llvm/test/SYCLLowerIR/esimd_lower_debug_info.ll @@ -6,16 +6,20 @@ @__spirv_BuiltInGlobalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32 -declare spir_func <16 x float> @_Z18__esimd_block_readIfLi16EPU3AS1fEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeET1_j(float addrspace(1)*, i32) +declare spir_func <16 x float> @_Z26__esimd_oword_ld_unalignedIfLi16EjLi0EEN2cl4sycl3ext5intel12experimental5esimd6detail11vector_typeIT_XT0_EE4typeET1_j(i32, i32) +declare spir_func i32 @_Z25__esimd_get_surface_indexIPU3AS1fEjT_(float addrspace(1)*) -define spir_func void @func1(float addrspace(1)* %arg1, i32 %arg2 ){ + +define spir_func void @func1(float addrspace(1)* %arg1, i32 %arg2){ ; CHECK-LABEL: @func1( -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint float addrspace(1)* [[ARG1:%.*]] to i32, !dbg [[DBG11:![0-9]+]] -; CHECK-NEXT: [[CALL1_I_I_ESIMD:%.*]] = call <16 x float> @llvm.genx.oword.ld.unaligned.v16f32(i32 0, i32 [[TMP1]], i32 [[ARG2:%.*]]), !dbg [[DBG11]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata <16 x float> [[CALL1_I_I_ESIMD]], metadata [[META9:![0-9]+]], metadata !DIExpression()), !dbg [[DBG11]] -; CHECK-NEXT: ret void, !dbg [[DBG12:![0-9]+]] -; - %call1.i.i = tail call spir_func <16 x float> @_Z18__esimd_block_readIfLi16EPU3AS1fEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeET1_j(float addrspace(1)* %arg1, i32 %arg2) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint float addrspace(1)* [[ARG1:%.*]] to i32, 
!dbg [[DBG1:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[TMP1]], metadata !{{[0-9]+}}, metadata !DIExpression()), !dbg [[DBG1]] +; CHECK-NEXT: [[CALL1_I_I_ESIMD:%.*]] = call <16 x float> @llvm.genx.oword.ld.unaligned.v16f32(i32 0, i32 [[TMP1]], i32 [[ARG2:%.*]]), !dbg [[DBG2:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata <16 x float> [[CALL1_I_I_ESIMD]], metadata !{{[0-9]+}}, metadata !DIExpression()), !dbg [[DBG2]] +; CHECK-NEXT: ret void, !dbg !{{[0-9]+}} + + %sfi = call spir_func i32 @_Z25__esimd_get_surface_indexIPU3AS1fEjT_(float addrspace(1)* %arg1) + %res = call spir_func <16 x float> @_Z26__esimd_oword_ld_unalignedIfLi16EjLi0EEN2cl4sycl3ext5intel12experimental5esimd6detail11vector_typeIT_XT0_EE4typeET1_j(i32 %sfi, i32 %arg2) ret void } diff --git a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll index f90d1e449997a..736babde2bed7 100644 --- a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll +++ b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll @@ -2,6 +2,13 @@ ; consumable by the CM back-end. ; ; RUN: opt < %s -LowerESIMD -S | FileCheck %s +; +; TODO refactor all the test cases - make them C++ and move to +; sycl\test\esimd\intrins_trans.cpp for much easier maintenance w/o losing +; testing strength. Formally, each LLVM pass should have .ll tests, but this is +; not practical in this case. +; +; All new test cases should be added to intrins_trans.cpp target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64-unknown-unknown" @@ -16,77 +23,6 @@ target triple = "spir64-unknown-unknown" ; LowerESIMD pass should process every function, ; !sycl_explicit_simd metadata is not necessary. 
-define dso_local spir_func <32 x i32> @FUNC_1() { - %a_1 = alloca <32 x i64> - %1 = load <32 x i64>, <32 x i64>* %a_1 - %a_2 = alloca <32 x i16> - %2 = load <32 x i16>, <32 x i16>* %a_2 - %ret_val = call spir_func <32 x i32> @_Z20__esimd_flat_atomic0ILN2cm3gen14CmAtomicOpTypeE2EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeENS4_ItXT1_EE4typeE(<32 x i64> %1, <32 x i16> %2) -; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.inc.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef) - ret <32 x i32> %ret_val -} - -define dso_local spir_func <32 x i32> @FUNC_2() { - %a_1 = alloca <32 x i64> - %1 = load <32 x i64>, <32 x i64>* %a_1 - %a_2 = alloca <32 x i32> - %2 = load <32 x i32>, <32 x i32>* %a_2 - %a_3 = alloca <32 x i16> - %3 = load <32 x i16>, <32 x i16>* %a_3 - %ret_val = call spir_func <32 x i32> @_Z20__esimd_flat_atomic1ILN2cm3gen14CmAtomicOpTypeE0EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_NS4_ItXT1_EE4typeE(<32 x i64> %1, <32 x i32> %2, <32 x i16> %3) -; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.add.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef) - ret <32 x i32> %ret_val -} - -define dso_local spir_func <32 x i32> @FUNC_3() { - %a_1 = alloca <32 x i64> - %1 = load <32 x i64>, <32 x i64>* %a_1 - %a_2 = alloca <32 x i32> - %2 = load <32 x i32>, <32 x i32>* %a_2 - %a_3 = alloca <32 x i32> - %3 = load <32 x i32>, <32 x i32>* %a_3 - %a_4 = alloca <32 x i16> - %4 = load <32 x i16>, <32 x i16>* %a_4 - %ret_val = call spir_func <32 x i32> @_Z20__esimd_flat_atomic2ILN2cm3gen14CmAtomicOpTypeE7EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_S7_NS4_ItXT1_EE4typeE(<32 x i64> %1, <32 x i32> %2, <32 x i32> %3, <32 x i16> %4) -; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> 
@llvm.genx.svm.atomic.cmpxchg.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef) - ret <32 x i32> %ret_val -} - -define dso_local spir_func <32 x i32> @FUNC_4() { - %ret_val = call spir_func <32 x i32> @_Z33__esimd_flat_block_read_unalignedIjLi32ELN2cm3gen9CacheHintE0ELS2_0EENS1_13__vector_typeIT_XT0_EE4typeEy(i64 0) -; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.block.ld.unaligned.v32i32.i64(i64 0) - ret <32 x i32> %ret_val -} - -define dso_local spir_func void @FUNC_5() { - %a_1 = alloca <32 x i32> - %1 = load <32 x i32>, <32 x i32>* %a_1 - call spir_func void @_Z24__esimd_flat_block_writeIjLi32ELN2cm3gen9CacheHintE0ELS2_0EEvyNS1_13__vector_typeIT_XT0_EE4typeE(i64 0, <32 x i32> %1) -; CHECK: call void @llvm.genx.svm.block.st.i64.v32i32(i64 0, <32 x i32> %{{[0-9a-zA-Z_.]+}}) - ret void -} - -define dso_local spir_func <32 x i32> @FUNC_6() { - %a_1 = alloca <32 x i64> - %1 = load <32 x i64>, <32 x i64>* %a_1 - %a_2 = alloca <32 x i16> - %2 = load <32 x i16>, <32 x i16>* %a_2 - %ret_val = call spir_func <32 x i32> @_Z17__esimd_flat_readIjLi32ELi0ELN2cm3gen9CacheHintE0ELS2_0EENS1_13__vector_typeIT_XmlT0_clL_ZNS1_20ElemsPerAddrDecodingEjET1_EEE4typeENS3_IyXT0_EE4typeEiNS3_ItXT0_EE4typeE(<32 x i64> %1, i32 0, <32 x i16> %2) -; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.gather.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef) - ret <32 x i32> %ret_val -} - -define dso_local spir_func void @FUNC_7() { - %a_1 = alloca <32 x i64> - %1 = load <32 x i64>, <32 x i64>* %a_1 - %a_2 = alloca <32 x i32> - %2 = load <32 x i32>, <32 x i32>* %a_2 - %a_3 = alloca <32 x i16> - %3 = load <32 x i16>, <32 x i16>* %a_3 - call spir_func void 
@_Z18__esimd_flat_writeIjLi32ELi0ELN2cm3gen9CacheHintE0ELS2_0EEvNS1_13__vector_typeIyXT0_EE4typeENS3_IT_XmlT0_clL_ZNS1_20ElemsPerAddrDecodingEjET1_EEE4typeEiNS3_ItXT0_EE4typeE(<32 x i64> %1, <32 x i32> %2, i32 0, <32 x i16> %3) -; CHECK: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32i32(<32 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}) - ret void -} define dso_local spir_func <16 x i16> @FUNC_8() { %a_1 = alloca <16 x i16> @@ -98,16 +34,6 @@ define dso_local spir_func <16 x i16> @FUNC_8() { ret <16 x i16> %ret_val } -define dso_local spir_func <1 x float> @FUNC_9() { - %a_1 = alloca <1 x float> - %1 = load <1 x float>, <1 x float>* %a_1 - %a_2 = alloca <1 x float> - %2 = load <1 x float>, <1 x float>* %a_2 - %ret_val = call spir_func <1 x float> @_Z16__esimd_div_ieeeILi1EEN2cm3gen13__vector_typeIfXT_EE4typeES4_S4_(<1 x float> %1, <1 x float> %2) -; CHECK: %{{[0-9a-zA-Z_.]+}} = call <1 x float> @llvm.genx.ieee.div.v1f32(<1 x float> %{{[0-9a-zA-Z_.]+}}, <1 x float> %{{[0-9a-zA-Z_.]+}}) - ret <1 x float> %ret_val -} - define dso_local spir_func <8 x float> @FUNC_10() { %a_1 = alloca <16 x float> %1 = load <16 x float>, <16 x float>* %a_1 @@ -126,20 +52,6 @@ define dso_local spir_func <16 x float> @FUNC_11() { ret <16 x float> %ret_val } -define dso_local spir_func <32 x i32> @FUNC_21(%opencl.image2d_ro_t addrspace(1)* %0, i32 %1, i32 %2) { - %ret_val = call spir_func <32 x i32> @_Z24__esimd_media_block_loadIiLi4ELi8E14ocl_image2d_roEN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeEjT2_jjjj(i32 0, %opencl.image2d_ro_t addrspace(1)* %0, i32 0, i32 32, i32 %1, i32 %2) -; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.media.ld.v32i32(i32 0, i32 %{{[0-9a-zA-Z_.]+}}, i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}) - ret <32 x i32> %ret_val -} - -define dso_local spir_func void @FUNC_22(%opencl.image2d_wo_t addrspace(1)* %0, i32 %1, i32 %2) { - %a_3 = alloca <32 x i32> - %4 = load <32 x i32>, <32 x 
i32>* %a_3 - call spir_func void @_Z25__esimd_media_block_storeIiLi4ELi8E14ocl_image2d_woEvjT2_jjjjN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeE(i32 0, %opencl.image2d_wo_t addrspace(1)* %0, i32 0, i32 32, i32 %1, i32 %2, <32 x i32> %4) -; CHECK: call void @llvm.genx.media.st.v32i32(i32 0, i32 %{{[0-9a-zA-Z_.]+}}, i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}) - ret void -} - define dso_local spir_func <16 x i32> @FUNC_23() { %ret_val = call spir_func <16 x i32> @_Z13__esimd_vloadIiLi16EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"cm::gen::simd", %"cm::gen::simd"* @vg, i32 0, i32 0) to <16 x i32> addrspace(4)*)) ; CHECK: %ret_val1 = load <16 x i32>, <16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"cm::gen::simd", %"cm::gen::simd"* @vg, i32 0, i32 0) to <16 x i32> addrspace(4)*), align 64 @@ -268,12 +180,6 @@ define dso_local spir_func <16 x i32> @FUNC_39() { ret <16 x i32> %ret_val } -define dso_local spir_func <8 x i32> @FUNC_40() { - %ret_val = call spir_func <8 x i32> @_Z22__esimd_slm_block_readIiLi8EEN2cl4sycl3ext5intel3gpu11vector_typeIT_XT0_EE4typeEj(i32 0) -; CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.oword.ld.v8i32(i32 0, i32 254, i32 0) - ret <8 x i32> %ret_val -} - define dso_local spir_func void @FUNC_41() { call spir_func void @_Z16__esimd_sbarrierN2cl4sycl3ext5intel3gpu17EsimdSbarrierTypeE(i8 zeroext 1) ; CHECK: call void @llvm.genx.sbarrier(i8 1) @@ -378,16 +284,7 @@ define dso_local spir_func <32 x half> @FUNC_52() { } declare dso_local i32 @_Z15__esimd_lane_idv() - -declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic0ILN2cm3gen14CmAtomicOpTypeE2EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeENS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i16> %1) -declare dso_local spir_func <32 x i32> 
@_Z20__esimd_flat_atomic1ILN2cm3gen14CmAtomicOpTypeE0EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i16> %2) -declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic2ILN2cm3gen14CmAtomicOpTypeE7EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_S7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i32> %2, <32 x i16> %3) -declare dso_local spir_func <32 x i32> @_Z33__esimd_flat_block_read_unalignedIjLi32ELN2cm3gen9CacheHintE0ELS2_0EENS1_13__vector_typeIT_XT0_EE4typeEy(i64 %0) -declare dso_local spir_func void @_Z24__esimd_flat_block_writeIjLi32ELN2cm3gen9CacheHintE0ELS2_0EEvyNS1_13__vector_typeIT_XT0_EE4typeE(i64 %0, <32 x i32> %1) -declare dso_local spir_func <32 x i32> @_Z17__esimd_flat_readIjLi32ELi0ELN2cm3gen9CacheHintE0ELS2_0EENS1_13__vector_typeIT_XmlT0_clL_ZNS1_20ElemsPerAddrDecodingEjET1_EEE4typeENS3_IyXT0_EE4typeEiNS3_ItXT0_EE4typeE(<32 x i64> %0, i32 %1, <32 x i16> %2) -declare dso_local spir_func void @_Z18__esimd_flat_writeIjLi32ELi0ELN2cm3gen9CacheHintE0ELS2_0EEvNS1_13__vector_typeIyXT0_EE4typeENS3_IT_XmlT0_clL_ZNS1_20ElemsPerAddrDecodingEjET1_EEE4typeEiNS3_ItXT0_EE4typeE(<32 x i64> %0, <32 x i32> %1, i32 %2, <32 x i16> %3) declare dso_local spir_func <16 x i16> @_Z12__esimd_sminIsLi16EEN2cm3gen13__vector_typeIT_XT0_EE4typeES5_S5_(<16 x i16> %0, <16 x i16> %1) -declare dso_local spir_func <1 x float> @_Z16__esimd_div_ieeeILi1EEN2cm3gen13__vector_typeIfXT_EE4typeES4_S4_(<1 x float> %0, <1 x float> %1) declare dso_local spir_func <8 x float> @_Z16__esimd_rdregionIfLi16ELi8ELi0ELi8ELi1ELi0EEN2cm3gen13__vector_typeIT_XT1_EE4typeENS2_IS3_XT0_EE4typeEt(<16 x float> %0, i16 zeroext %1) declare dso_local spir_func <16 x float> @_Z16__esimd_wrregionIfLi16ELi8ELi0ELi8ELi1ELi0EEN2cm3gen13__vector_typeIT_XT0_EE4typeES5_NS2_IS3_XT1_EE4typeEtNS2_ItXT1_EE4typeE(<16 x float> %0, <8 x float> %1, i16 zeroext %2, <8 x i16> %3) 
declare dso_local spir_func <16 x i32> @_Z13__esimd_vloadIiLi16EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<16 x i32> addrspace(4)* %0) @@ -404,8 +301,6 @@ declare dso_local spir_func void @_Z14__esimd_vstoreIfLi1EEvPN2cm3gen13__vector_ declare dso_local spir_func <16 x float> @_Z13__esimd_vloadIfLi16EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<16 x float> addrspace(4)* %0) declare dso_local spir_func void @_Z14__esimd_vstoreIfLi8EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<8 x float> addrspace(4)* %0, <8 x float> %1) declare dso_local spir_func <8 x float> @_Z13__esimd_vloadIfLi8EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<8 x float> addrspace(4)* %0) -declare dso_local spir_func <32 x i32> @_Z24__esimd_media_block_loadIiLi4ELi8E14ocl_image2d_roEN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeEjT2_jjjj(i32 %0, %opencl.image2d_ro_t addrspace(1)* %1, i32 %2, i32 %3, i32 %4, i32 %5) -declare dso_local spir_func void @_Z25__esimd_media_block_storeIiLi4ELi8E14ocl_image2d_woEvjT2_jjjjN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeE(i32 %0, %opencl.image2d_wo_t addrspace(1)* %1, i32 %2, i32 %3, i32 %4, i32 %5, <32 x i32> %6) declare dso_local spir_func <32 x i32> @_Z13__esimd_vloadIiLi32EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<32 x i32> addrspace(4)* %0) declare dso_local spir_func void @_Z14__esimd_vstoreIfLi16EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<16 x float> addrspace(4)* %0, <16 x float> %1) declare dso_local spir_func void @_ZN2cl4sycl3ext5intel12experimental5esimd8slm_initEj(i32) @@ -417,7 +312,6 @@ declare dso_local spir_func <16 x i32> @_Z18__esimd_uudp4a_satIjjjjLi16EEN2cl4sy declare dso_local spir_func <16 x i32> @_Z18__esimd_usdp4a_satIjiiiLi16EEN2cl4sycl3ext5intel3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) declare dso_local spir_func <16 x i32> 
@_Z18__esimd_sudp4a_satIijjjLi16EEN2cl4sycl3ext5intel3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) declare dso_local spir_func <16 x i32> @_Z18__esimd_ssdp4a_satIiiiiLi16EEN2cl4sycl3ext5intel3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) -declare dso_local spir_func <8 x i32> @_Z22__esimd_slm_block_readIiLi8EEN2cl4sycl3ext5intel3gpu11vector_typeIT_XT0_EE4typeEj(i32 %0) declare dso_local spir_func void @_Z16__esimd_sbarrierN2cl4sycl3ext5intel3gpu17EsimdSbarrierTypeE(i8 %0) declare dso_local spir_func <8 x i32> @_Z18__esimd_rdindirectIiLi16ELi8ELi0EEN2cl4sycl3ext5intel3gpu11vector_typeIT_XT1_EE4typeENS4_IS5_XT0_EE4typeENS4_ItXT1_EE4typeE(<16 x i32>, <8 x i16>) declare dso_local spir_func <16 x i32> @_Z18__esimd_wrindirectIiLi16ELi8ELi0EEN2cl4sycl3ext5intel3gpu11vector_typeIT_XT0_EE4typeES7_NS4_IS5_XT1_EE4typeENS4_ItXT1_EE4typeESB_(<16 x i32>, <8 x i32>, <8 x i16>, <8 x i16>) diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp index 9ac08f711aada..fa48478973c2c 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp @@ -244,6 +244,11 @@ using EsimdSbarrierType = split_barrier_action; #define ESIMD_SBARRIER_WAIT EsimdSbarrierType::WAIT #define ESIMD_SBARRIER_SIGNAL EsimdSbarrierType::SIGNAL +/// Surface index type. Surface is an internal representation of a memory block +/// addressable by GPU in "stateful" memory model, and each surface is +/// identified by its "binding table index" - surface index. 
+using SurfaceIndex = unsigned int; + } // namespace esimd } // namespace experimental } // namespace intel diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp index 5d8661bb85cd5..bb8e6d5843ab4 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/intrin.hpp @@ -62,11 +62,11 @@ // template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rdregion(__SEIEED::vector_type_t Input, uint16_t Offset); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rdindirect(__SEIEED::vector_type_t Input, __SEIEED::vector_type_t Offset); @@ -119,13 +119,13 @@ __esimd_rdindirect(__SEIEED::vector_type_t Input, // template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_wrregion(__SEIEED::vector_type_t OldVal, __SEIEED::vector_type_t NewVal, uint16_t Offset, __SEIEED::simd_mask_storage_t Mask = 1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_wrindirect(__SEIEED::vector_type_t OldVal, __SEIEED::vector_type_t NewVal, __SEIEED::vector_type_t Offset, @@ -221,7 +221,7 @@ readRegion(const __SEIEED::vector_type_t &Base, // optimization on simd object // template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_vload(const __SEIEED::vector_type_t *ptr); // vstore @@ -229,24 +229,43 @@ __esimd_vload(const __SEIEED::vector_type_t *ptr); // map to the backend vstore intrinsic, used by compiler to control // optimization on simd object template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void -__esimd_vstore(__SEIEED::vector_type_t *ptr, - __SEIEED::vector_type_t vals); +__ESIMD_INTRIN void 
__esimd_vstore(__SEIEED::vector_type_t *ptr, + __SEIEED::vector_type_t vals); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION uint16_t -__esimd_any(__SEIEED::vector_type_t src); +__ESIMD_INTRIN uint16_t __esimd_any(__SEIEED::vector_type_t src) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + for (unsigned int i = 0; i != N; i++) { + if (src[i] != 0) + return 1; + } + return 0; +} +#endif // __SYCL_DEVICE_ONLY__ template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION uint16_t -__esimd_all(__SEIEED::vector_type_t src); +__ESIMD_INTRIN uint16_t __esimd_all(__SEIEED::vector_type_t src) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + for (unsigned int i = 0; i != N; i++) { + if (src[i] == 0) + return 0; + } + return 1; +} +#endif // __SYCL_DEVICE_ONLY__ #ifndef __SYCL_DEVICE_ONLY__ // Implementations of ESIMD intrinsics for the SYCL host device template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rdregion(__SEIEED::vector_type_t Input, uint16_t Offset) { uint16_t EltOffset = Offset / sizeof(T); assert(Offset % sizeof(T) == 0); @@ -265,7 +284,7 @@ __esimd_rdregion(__SEIEED::vector_type_t Input, uint16_t Offset) { } template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rdindirect(__SEIEED::vector_type_t Input, __SEIEED::vector_type_t Offset) { __SEIEED::vector_type_t Result; @@ -280,7 +299,7 @@ __esimd_rdindirect(__SEIEED::vector_type_t Input, template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_wrregion(__SEIEED::vector_type_t OldVal, __SEIEED::vector_type_t NewVal, uint16_t Offset, __SEIEED::simd_mask_storage_t Mask) { @@ -303,7 +322,7 @@ __esimd_wrregion(__SEIEED::vector_type_t OldVal, } template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_wrindirect(__SEIEED::vector_type_t OldVal, __SEIEED::vector_type_t NewVal, __SEIEED::vector_type_t Offset, 
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index 7f2f5fc5bdd10..5b7a080cdbe90 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -21,307 +21,292 @@ // saturation intrinsics template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_satf(__SEIEED::vector_type_t src); +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_sat(__SEIEED::vector_type_t src); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_fptoui_sat(__SEIEED::vector_type_t src); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_fptosi_sat(__SEIEED::vector_type_t src); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_uutrunc_sat(__SEIEED::vector_type_t src); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_ustrunc_sat(__SEIEED::vector_type_t src); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_sutrunc_sat(__SEIEED::vector_type_t src); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_sstrunc_sat(__SEIEED::vector_type_t src); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_abs(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_ssshl(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t 
__esimd_sushl(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_usshl(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_uushl(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_ssshl_sat(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_sushl_sat(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_usshl_sat(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_uushl_sat(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rol(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_ror(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_umulh(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_smulh(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t 
+__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_frc(__SEIEED::vector_type_t src0); /// 3 kinds of max template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_fmax(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_umax(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_smax(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_lzd(__SEIEED::vector_type_t src0); /// 3 kinds of min template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_fmin(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_umin(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_smin(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_bfrev(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_cbit(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t __esimd_bfins( +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_bfi( __SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, __SEIEED::vector_type_t src2, __SEIEED::vector_type_t src3); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t 
-__esimd_bfext(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_sbfe(__SEIEED::vector_type_t src0, + __SEIEED::vector_type_t src1, + __SEIEED::vector_type_t src2); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_fbl(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_sfbh(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_ufbh(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_inv(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_log(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_exp(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_sqrt(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_sqrt_ieee(__SEIEED::vector_type_t src0); +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_ieee_sqrt(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rsqrt(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_sin(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_cos(__SEIEED::vector_type_t src0); template 
-SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_pow(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_div_ieee(__SEIEED::vector_type_t src0, +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_ieee_div(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rndd(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rndu(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rnde(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rndz(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_sqrt_ieee(__SEIEED::vector_type_t src0); +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_ieee_sqrt(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_div_ieee(__SEIEED::vector_type_t src0, +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_ieee_div(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION uint32_t +__ESIMD_INTRIN uint32_t __esimd_pack_mask(__SEIEED::vector_type_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_unpack_mask(uint32_t src0); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_uudp4a(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, __SEIEED::vector_type_t src2); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION 
__SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_usdp4a(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, __SEIEED::vector_type_t src2); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_sudp4a(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, __SEIEED::vector_type_t src2); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_ssdp4a(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, __SEIEED::vector_type_t src2); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_uudp4a_sat(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, __SEIEED::vector_type_t src2); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_usdp4a_sat(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, __SEIEED::vector_type_t src2); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_sudp4a_sat(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, __SEIEED::vector_type_t src2); template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_ssdp4a_sat(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, __SEIEED::vector_type_t src2); -// Reduction functions template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_reduced_fmax(__SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); - -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_reduced_umax(__SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); - -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_reduced_smax(__SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); - -template -SYCL_EXTERNAL 
SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_reduced_fmin(__SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); - -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_reduced_umin(__SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); - -template -__SEIEED::vector_type_t SYCL_EXTERNAL SYCL_ESIMD_FUNCTION -__esimd_reduced_smin(__SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); - -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_dp4(__SEIEED::vector_type_t v1, - __SEIEED::vector_type_t v2); + __SEIEED::vector_type_t v2) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + __SEIEED::vector_type_t retv; + for (auto i = 0; i != N; i += 4) { + Ty dp = (v1[i] * v2[i]) + (v1[i + 1] * v2[i + 1]) + + (v1[i + 2] * v2[i + 2]) + (v1[i + 3] * v2[i + 3]); + retv[i] = dp; + retv[i + 1] = dp; + retv[i + 2] = dp; + retv[i + 3] = dp; + } + return retv; +} +#endif // __SYCL_DEVICE_ONLY__ #ifdef __SYCL_DEVICE_ONLY__ // lane-id for reusing scalar math functions. // Depending upon the SIMT mode(8/16/32), the return value is // in the range of 0-7, 0-15, or 0-31. -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION int __esimd_lane_id(); +__ESIMD_INTRIN int __esimd_lane_id(); // Wrapper for designating a scalar region of code that will be // vectorized by the backend compiler. 
@@ -335,8 +320,8 @@ SYCL_EXTERNAL SYCL_ESIMD_FUNCTION int __esimd_lane_id(); #define ESIMD_MATH_INTRINSIC_IMPL(type, func) \ template \ - SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t \ - ocl_##func(__SEIEED::vector_type_t src0) { \ + __ESIMD_INTRIN __SEIEED::vector_type_t ocl_##func( \ + __SEIEED::vector_type_t src0) { \ __SEIEED::vector_type_t retv; \ __ESIMD_SIMT_BEGIN(SZ, lane) \ retv[lane] = sycl::func(src0[lane]); \ @@ -385,8 +370,8 @@ inline T extract(const uint32_t &width, const uint32_t &offset, uint32_t src, } template -inline __SEIEED::vector_type_t -__esimd_satf(__SEIEED::vector_type_t src) { +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_sat(__SEIEED::vector_type_t src) { __SEIEED::vector_type_t retv; for (int i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -396,7 +381,7 @@ __esimd_satf(__SEIEED::vector_type_t src) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_fptoui_sat(__SEIEED::vector_type_t src) { __SEIEED::vector_type_t retv; for (int i = 0; i < SZ; i++) { @@ -407,7 +392,7 @@ __esimd_fptoui_sat(__SEIEED::vector_type_t src) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_fptosi_sat(__SEIEED::vector_type_t src) { __SEIEED::vector_type_t retv; for (int i = 0; i < SZ; i++) { @@ -418,7 +403,7 @@ __esimd_fptosi_sat(__SEIEED::vector_type_t src) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_uutrunc_sat(__SEIEED::vector_type_t src) { __SEIEED::vector_type_t retv; for (int i = 0; i < SZ; i++) { @@ -429,7 +414,7 @@ __esimd_uutrunc_sat(__SEIEED::vector_type_t src) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_ustrunc_sat(__SEIEED::vector_type_t src) { __SEIEED::vector_type_t retv; for (int i = 0; i < SZ; i++) { @@ -440,7 +425,7 @@ __esimd_ustrunc_sat(__SEIEED::vector_type_t src) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN 
__SEIEED::vector_type_t __esimd_sutrunc_sat(__SEIEED::vector_type_t src) { __SEIEED::vector_type_t retv; for (int i = 0; i < SZ; i++) { @@ -451,7 +436,7 @@ __esimd_sutrunc_sat(__SEIEED::vector_type_t src) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_sstrunc_sat(__SEIEED::vector_type_t src) { __SEIEED::vector_type_t retv; for (int i = 0; i < SZ; i++) { @@ -462,7 +447,7 @@ __esimd_sstrunc_sat(__SEIEED::vector_type_t src) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_abs(__SEIEED::vector_type_t src0) { int i; typename __SEIEEED::abstype::type ret; @@ -481,7 +466,7 @@ __esimd_abs(__SEIEED::vector_type_t src0) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_ssshl(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -496,7 +481,7 @@ __esimd_ssshl(__SEIEED::vector_type_t src0, return retv; }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_sushl(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -511,7 +496,7 @@ __esimd_sushl(__SEIEED::vector_type_t src0, return retv; }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_usshl(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -526,7 +511,7 @@ __esimd_usshl(__SEIEED::vector_type_t src0, return retv; }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_uushl(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -541,7 +526,7 @@ __esimd_uushl(__SEIEED::vector_type_t src0, return retv; }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_ssshl_sat(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -556,7 +541,7 @@ __esimd_ssshl_sat(__SEIEED::vector_type_t src0, return retv; }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN 
__SEIEED::vector_type_t __esimd_sushl_sat(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -571,7 +556,7 @@ __esimd_sushl_sat(__SEIEED::vector_type_t src0, return retv; }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_usshl_sat(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -586,7 +571,7 @@ __esimd_usshl_sat(__SEIEED::vector_type_t src0, return retv; }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_uushl_sat(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -602,17 +587,17 @@ __esimd_uushl_sat(__SEIEED::vector_type_t src0, }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rol(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1){}; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_ror(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1){}; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_umulh(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -628,7 +613,7 @@ __esimd_umulh(__SEIEED::vector_type_t src0, } template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_smulh(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -644,7 +629,7 @@ __esimd_smulh(__SEIEED::vector_type_t src0, }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_frc(__SEIEED::vector_type_t src0) { __SEIEED::vector_type_t retv; for (int i = 0; i < SZ; i++) { @@ -656,7 +641,7 @@ __esimd_frc(__SEIEED::vector_type_t src0) { /// 3 kinds of max template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_fmax(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -674,7 +659,7 @@ __esimd_fmax(__SEIEED::vector_type_t src0, return retv; }; template -inline 
__SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_umax(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -692,7 +677,7 @@ __esimd_umax(__SEIEED::vector_type_t src0, return retv; }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_smax(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -711,7 +696,7 @@ __esimd_smax(__SEIEED::vector_type_t src0, }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_lzd(__SEIEED::vector_type_t src0) { int i; T ret; @@ -733,7 +718,7 @@ __esimd_lzd(__SEIEED::vector_type_t src0) { /// 3 kinds of min template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_fmin(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -752,7 +737,7 @@ __esimd_fmin(__SEIEED::vector_type_t src0, }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_umin(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -771,7 +756,7 @@ __esimd_umin(__SEIEED::vector_type_t src0, }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_smin(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { int i; @@ -790,7 +775,7 @@ __esimd_smin(__SEIEED::vector_type_t src0, }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_bfrev(__SEIEED::vector_type_t src0) { int i, j; __SEIEED::vector_type_t retv; @@ -815,7 +800,7 @@ __esimd_bfrev(__SEIEED::vector_type_t src0) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_cbit(__SEIEED::vector_type_t src0) { int i; uint32_t ret; @@ -838,11 +823,11 @@ __esimd_cbit(__SEIEED::vector_type_t src0) { }; template -inline __SEIEED::vector_type_t -__esimd_bfins(__SEIEED::vector_type_t width, - __SEIEED::vector_type_t offset, - __SEIEED::vector_type_t val, - __SEIEED::vector_type_t src) 
{ +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_bfi(__SEIEED::vector_type_t width, + __SEIEED::vector_type_t offset, + __SEIEED::vector_type_t val, + __SEIEED::vector_type_t src) { int i; typename __SEIEEED::maxtype::type ret; __SEIEED::vector_type_t retv; @@ -864,10 +849,10 @@ __esimd_bfins(__SEIEED::vector_type_t width, }; template -inline __SEIEED::vector_type_t -__esimd_bfext(__SEIEED::vector_type_t width, - __SEIEED::vector_type_t offset, - __SEIEED::vector_type_t src) { +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_sbfe(__SEIEED::vector_type_t width, + __SEIEED::vector_type_t offset, + __SEIEED::vector_type_t src) { int i; typename __SEIEEED::maxtype::type ret; __SEIEED::vector_type_t retv; @@ -883,7 +868,7 @@ __esimd_bfext(__SEIEED::vector_type_t width, }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_fbl(__SEIEED::vector_type_t src0) { int i; T0 ret; @@ -908,7 +893,7 @@ __esimd_fbl(__SEIEED::vector_type_t src0) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_sfbh(__SEIEED::vector_type_t src0) { int i, cval; @@ -940,7 +925,7 @@ __esimd_sfbh(__SEIEED::vector_type_t src0) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_ufbh(__SEIEED::vector_type_t src0) { uint32_t ret; __SEIEED::vector_type_t retv; @@ -964,7 +949,7 @@ __esimd_ufbh(__SEIEED::vector_type_t src0) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_inv(__SEIEED::vector_type_t src0) { __SEIEED::vector_type_t retv; @@ -976,7 +961,7 @@ __esimd_inv(__SEIEED::vector_type_t src0) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_log(__SEIEED::vector_type_t src0) { __SEIEED::vector_type_t retv; @@ -987,7 +972,7 @@ __esimd_log(__SEIEED::vector_type_t src0) { return retv; }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t 
__esimd_exp(__SEIEED::vector_type_t src0) { __SEIEED::vector_type_t retv; @@ -998,7 +983,7 @@ __esimd_exp(__SEIEED::vector_type_t src0) { return retv; }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_sqrt(__SEIEED::vector_type_t src0) { __SEIEED::vector_type_t retv; @@ -1009,8 +994,8 @@ __esimd_sqrt(__SEIEED::vector_type_t src0) { return retv; }; template -inline __SEIEED::vector_type_t -__esimd_sqrt_ieee(__SEIEED::vector_type_t src0) { +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_ieee_sqrt(__SEIEED::vector_type_t src0) { __SEIEED::vector_type_t retv; for (int i = 0; i < SZ; i++) { @@ -1020,7 +1005,7 @@ __esimd_sqrt_ieee(__SEIEED::vector_type_t src0) { return retv; }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rsqrt(__SEIEED::vector_type_t src0) { __SEIEED::vector_type_t retv; @@ -1031,7 +1016,7 @@ __esimd_rsqrt(__SEIEED::vector_type_t src0) { return retv; }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_sin(__SEIEED::vector_type_t src) { __SEIEED::vector_type_t retv; for (int i = 0; i < SZ; i++) { @@ -1041,7 +1026,7 @@ __esimd_sin(__SEIEED::vector_type_t src) { return retv; }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_cos(__SEIEED::vector_type_t src) { __SEIEED::vector_type_t retv; for (int i = 0; i < SZ; i++) { @@ -1051,7 +1036,7 @@ __esimd_cos(__SEIEED::vector_type_t src) { return retv; }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_pow(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { __SEIEED::vector_type_t retv; @@ -1064,8 +1049,8 @@ __esimd_pow(__SEIEED::vector_type_t src0, }; template -inline __SEIEED::vector_type_t -__esimd_div_ieee(__SEIEED::vector_type_t src0, +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_ieee_div(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { __SEIEED::vector_type_t divinv; 
__SEIEED::vector_type_t retv; @@ -1084,7 +1069,7 @@ __esimd_div_ieee(__SEIEED::vector_type_t src0, }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rndd(__SEIEED::vector_type_t src0) { __SEIEED::vector_type_t retv; @@ -1096,7 +1081,7 @@ __esimd_rndd(__SEIEED::vector_type_t src0) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rndu(__SEIEED::vector_type_t src0) { __SEIEED::vector_type_t retv; int increment; @@ -1116,7 +1101,7 @@ __esimd_rndu(__SEIEED::vector_type_t src0) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rnde(__SEIEED::vector_type_t src0) { __SEIEED::vector_type_t retv; int increment; @@ -1138,7 +1123,7 @@ __esimd_rnde(__SEIEED::vector_type_t src0) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rndz(__SEIEED::vector_type_t src0) { __SEIEED::vector_type_t retv; int increment; @@ -1157,8 +1142,8 @@ __esimd_rndz(__SEIEED::vector_type_t src0) { }; template -inline __SEIEED::vector_type_t -__esimd_sqrt_ieee(__SEIEED::vector_type_t src0) { +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_ieee_sqrt(__SEIEED::vector_type_t src0) { __SEIEED::vector_type_t retv; for (int i = 0; i < SZ; i++) { @@ -1169,8 +1154,8 @@ __esimd_sqrt_ieee(__SEIEED::vector_type_t src0) { }; template -inline __SEIEED::vector_type_t -__esimd_div_ieee(__SEIEED::vector_type_t src0, +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_ieee_div(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) { __SEIEED::vector_type_t divinv; __SEIEED::vector_type_t retv; @@ -1189,7 +1174,8 @@ __esimd_div_ieee(__SEIEED::vector_type_t src0, }; template -inline uint32_t __esimd_pack_mask(__SEIEED::vector_type_t src0) { +__ESIMD_INTRIN uint32_t +__esimd_pack_mask(__SEIEED::vector_type_t src0) { // We don't check the arguments here as this function is only invoked by // wrapper code (which does the checks already) uint32_t retv 
= 0; @@ -1203,7 +1189,8 @@ inline uint32_t __esimd_pack_mask(__SEIEED::vector_type_t src0) { }; template -inline __SEIEED::vector_type_t __esimd_unpack_mask(uint32_t src0) { +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_unpack_mask(uint32_t src0) { __SEIEED::vector_type_t retv; for (int i = 0; i < N; i++) { if ((src0 >> i) & 0x1) { @@ -1216,7 +1203,7 @@ inline __SEIEED::vector_type_t __esimd_unpack_mask(uint32_t src0) { }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_dp4a(__SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, __SEIEED::vector_type_t src2) { @@ -1253,7 +1240,7 @@ __esimd_dp4a(__SEIEED::vector_type_t src0, }; template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_reduced_max(__SEIEED::vector_type_t src1, __SEIEED::vector_type_t src2) { __SEIEED::vector_type_t retv; @@ -1268,28 +1255,7 @@ __esimd_reduced_max(__SEIEED::vector_type_t src1, } template -inline __SEIEED::vector_type_t -__esimd_reduced_fmax(__SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2) { - return __esimd_reduced_max(src1, src2); -} - -template -inline __SEIEED::vector_type_t -__esimd_reduced_umax(__SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2) { - return __esimd_reduced_max(src1, src2); -} - -template -inline __SEIEED::vector_type_t -__esimd_reduced_smax(__SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2) { - return __esimd_reduced_max(src1, src2); -} - -template -inline __SEIEED::vector_type_t +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_reduced_min(__SEIEED::vector_type_t src1, __SEIEED::vector_type_t src2) { __SEIEED::vector_type_t retv; @@ -1303,25 +1269,4 @@ __esimd_reduced_min(__SEIEED::vector_type_t src1, return retv; } -template -inline __SEIEED::vector_type_t -__esimd_reduced_fmin(__SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2) { - return __esimd_reduced_min(src1, src2); -} - -template -inline __SEIEED::vector_type_t 
-__esimd_reduced_umin(__SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2) { - return __esimd_reduced_min(src1, src2); -} - -template -inline __SEIEED::vector_type_t -__esimd_reduced_smin(__SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2) { - return __esimd_reduced_min(src1, src2); -} - #endif // #ifdef __SYCL_DEVICE_ONLY__ diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index 1751c32d37084..823d9a4334b50 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -26,6 +26,7 @@ namespace intel { namespace experimental { namespace esimd { namespace detail { + // Provides access to sycl accessor class' private members. class AccessorPrivateProxy { public: @@ -78,62 +79,237 @@ constexpr unsigned int ElemsPerAddrDecoding(unsigned int ElemsPerAddrEncoded) { template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION +__ESIMD_INTRIN __SEIEED::vector_type_t - __esimd_flat_read(__SEIEED::vector_type_t addrs, - int ElemsPerAddr = NumBlk, - __SEIEED::simd_mask_storage_t pred = 1); + __esimd_svm_gather(__SEIEED::vector_type_t addrs, + int ElemsPerAddr = NumBlk, + __SEIEED::simd_mask_storage_t pred = 1) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + auto NumBlkDecoded = __SEIEED::ElemsPerAddrDecoding(NumBlk); + __SEIEED::vector_type_t V; + ElemsPerAddr = __SEIEED::ElemsPerAddrDecoding(ElemsPerAddr); + + for (int I = 0; I < N; I++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I]); + if (sizeof(Ty) == 2) + ElemsPerAddr = ElemsPerAddr / 2; + if (sizeof(Ty) <= 2) { + for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++) + V[I * NumBlkDecoded + J] = *(Addr + J); + } else { + for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++) + V[J * N + I] = *(Addr + J); + } + } + } + return V; +} +#endif // __SYCL_DEVICE_ONLY__ // flat_write does 
flat-address scatter template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void __esimd_flat_write( +__ESIMD_INTRIN void __esimd_svm_scatter( __SEIEED::vector_type_t addrs, __SEIEED::vector_type_t vals, - int ElemsPerAddr = NumBlk, __SEIEED::simd_mask_storage_t pred = 1); + int ElemsPerAddr = NumBlk, __SEIEED::simd_mask_storage_t pred = 1) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + auto NumBlkDecoded = __SEIEED::ElemsPerAddrDecoding(NumBlk); + ElemsPerAddr = __SEIEED::ElemsPerAddrDecoding(ElemsPerAddr); + + for (int I = 0; I < N; I++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I]); + if (sizeof(Ty) == 2) + ElemsPerAddr = ElemsPerAddr / 2; + if (sizeof(Ty) <= 2) { + for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++) + *(Addr + J) = vals[I * NumBlkDecoded + J]; + } else { + for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++) + *(Addr + J) = vals[J * N + I]; + } + } + } +} +#endif // __SYCL_DEVICE_ONLY__ // flat_block_read reads a block of data from one flat address template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_flat_block_read_unaligned(uint64_t addr); +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_svm_block_ld_unaligned(uint64_t addr) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + __SEIEED::vector_type_t V; + + for (int I = 0; I < N; I++) { + Ty *Addr = reinterpret_cast(addr + I * sizeof(Ty)); + V[I] = *Addr; + } + return V; +} +#endif // __SYCL_DEVICE_ONLY__ // flat_block_write writes a block of data using one flat address template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void -__esimd_flat_block_write(uint64_t addr, __SEIEED::vector_type_t vals); +__ESIMD_INTRIN void __esimd_svm_block_st(uint64_t addr, + __SEIEED::vector_type_t vals) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + for (int I = 0; I < N; I++) { + Ty *Addr = reinterpret_cast(addr + I * sizeof(Ty)); + *Addr = vals[I]; + } +} +#endif // __SYCL_DEVICE_ONLY__ // Reads a block of data from given surface at given offset. 
-template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_block_read(SurfIndAliasTy surf_ind, uint32_t offset); +template +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_oword_ld_unaligned(SurfIndAliasTy surf_ind, uint32_t offset) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw cl::sycl::feature_not_supported(); +} +#endif // __SYCL_DEVICE_ONLY__ // Writes given block of data to a surface with given index at given offset. template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void -__esimd_block_write(SurfIndAliasTy surf_ind, uint32_t offset, - __SEIEED::vector_type_t vals); +__ESIMD_INTRIN void __esimd_oword_st(SurfIndAliasTy surf_ind, uint32_t offset, + __SEIEED::vector_type_t vals) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw cl::sycl::feature_not_supported(); +} +#endif // __SYCL_DEVICE_ONLY__ // flat_read4 does flat-address gather4 template -__SEIEED::vector_type_t - SYCL_EXTERNAL SYCL_ESIMD_FUNCTION - __esimd_flat_read4(__SEIEED::vector_type_t addrs, - __SEIEED::simd_mask_storage_t pred = 1); +__SEIEED::vector_type_t __ESIMD_INTRIN +__esimd_svm_gather4_scaled(__SEIEED::vector_type_t addrs, + __SEIEED::simd_mask_storage_t pred = 1) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + __SEIEED::vector_type_t V; + unsigned int Next = 0; + + if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::R)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I]); + V[Next] = *Addr; + } + } + } + + if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::G)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty)); + V[Next] = *Addr; + } + } + } + + if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::B)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty)); + V[Next] = *Addr; + } + } + } + + if constexpr (__SEIEE::is_channel_enabled(Mask, 
__SEIEE::rgba_channel::A)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty) + + sizeof(Ty)); + V[Next] = *Addr; + } + } + } + + return V; +} +#endif // __SYCL_DEVICE_ONLY__ // flat_write does flat-address scatter template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void __esimd_flat_write4( +__ESIMD_INTRIN void __esimd_svm_scatter4_scaled( __SEIEED::vector_type_t addrs, __SEIEED::vector_type_t vals, - __SEIEED::simd_mask_storage_t pred = 1); + __SEIEED::simd_mask_storage_t pred = 1) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + __SEIEED::vector_type_t V; + unsigned int Next = 0; + + if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::R)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I]); + *Addr = vals[Next]; + } + } + } + + if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::G)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty)); + *Addr = vals[Next]; + } + } + } + + if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::B)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty)); + *Addr = vals[Next]; + } + } + } + + if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::A)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty) + + sizeof(Ty)); + *Addr = vals[Next]; + } + } + } +} +#endif // __SYCL_DEVICE_ONLY__ // Low-level surface-based gather. Collects elements located at given offsets in // a surface and returns them as a single \ref simd object. 
Element can be @@ -146,29 +322,28 @@ SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void __esimd_flat_write4( // accessor used to denote the surface // @tparam TySizeLog2 - Log2 of the number of bytes read per element: // 0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes +// @tparam Scale - offset scaling factor; must be zero currently // @tparam L1H - L1 cache hint // @tparam L3H - L3 cache hint // // Formal parameters: -// @param scale - the scale; must be 0 // @param surf_ind - the surface index, taken from the SYCL memory object // @param global_offset - offset added to each individual element's offset to // compute actual memory access offset for that element // @param elem_offsets - per-element offsets // template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_surf_read(int16_t scale, SurfIndAliasTy surf_ind, - uint32_t global_offset, - __SEIEED::vector_type_t elem_offsets) +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_gather_scaled2(SurfIndAliasTy surf_ind, uint32_t global_offset, + __SEIEED::vector_type_t elem_offsets) #ifdef __SYCL_DEVICE_ONLY__ ; #else { static_assert(N == 1 || N == 8 || N == 16); - static_assert(TySizeLog2 <= 2); + static_assert(TySizeLog2 <= 2 && Scale == 0); static_assert(std::is_integral::value || TySizeLog2 == 2); throw cl::sycl::feature_not_supported(); } @@ -185,13 +360,13 @@ __esimd_surf_read(int16_t scale, SurfIndAliasTy surf_ind, // accessor used to denote the surface // @tparam TySizeLog2 - Log2 of the number of bytes written per element: // 0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes +// @tparam Scale - offset scale; only 0 is supported for now // @tparam L1H - L1 cache hint // @tparam L3H - L3 cache hint // // Formal parameters: // @param pred - per-element predicates; elements with zero corresponding // predicates are not written -// @param scale - the scale; must be 0 // @param surf_ind - the surface index, taken from the SYCL memory object // @param global_offset - offset added to each individual element's offset to // compute 
actual memory access offset for that element @@ -199,18 +374,18 @@ __esimd_surf_read(int16_t scale, SurfIndAliasTy surf_ind, // @param vals - values to write // template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void -__esimd_surf_write(__SEIEED::simd_mask_storage_t pred, int16_t scale, - SurfIndAliasTy surf_ind, uint32_t global_offset, - __SEIEED::vector_type_t elem_offsets, - __SEIEED::vector_type_t vals) +__ESIMD_INTRIN void +__esimd_scatter_scaled(__SEIEED::simd_mask_storage_t pred, + SurfIndAliasTy surf_ind, uint32_t global_offset, + __SEIEED::vector_type_t elem_offsets, + __SEIEED::vector_type_t vals) #ifdef __SYCL_DEVICE_ONLY__ ; #else { - static_assert(N == 1 || N == 8 || N == 16); + static_assert(N == 1 || N == 8 || N == 16 || N == 32); static_assert(TySizeLog2 <= 2); static_assert(std::is_integral::value || TySizeLog2 == 2); throw cl::sycl::feature_not_supported(); @@ -224,469 +399,200 @@ __esimd_surf_write(__SEIEED::simd_mask_storage_t pred, int16_t scale, template <__SEIEE::atomic_op Op, typename Ty, int N, __SEIEE::CacheHint L1H = __SEIEE::CacheHint::None, __SEIEE::CacheHint L3H = __SEIEE::CacheHint::None> -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_flat_atomic0(__SEIEED::vector_type_t addrs, - __SEIEED::simd_mask_storage_t pred); - -template <__SEIEE::atomic_op Op, typename Ty, int N, - __SEIEE::CacheHint L1H = __SEIEE::CacheHint::None, - __SEIEE::CacheHint L3H = __SEIEE::CacheHint::None> -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_flat_atomic1(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t src0, - __SEIEED::simd_mask_storage_t pred); +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_svm_atomic0(__SEIEED::vector_type_t addrs, + __SEIEED::simd_mask_storage_t pred) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw cl::sycl::feature_not_supported(); +} +#endif // __SYCL_DEVICE_ONLY__ template <__SEIEE::atomic_op Op, typename Ty, int N, __SEIEE::CacheHint L1H = __SEIEE::CacheHint::None, __SEIEE::CacheHint 
L3H = __SEIEE::CacheHint::None> -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_flat_atomic2(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::simd_mask_storage_t pred); - -// esimd_barrier, generic group barrier -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void __esimd_barrier(); - -// generic work-group split barrier -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void -__esimd_sbarrier(__SEIEE::split_barrier_action flag); - -// slm_fence sets the SLM read/write order -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void __esimd_slm_fence(uint8_t cntl); - -// slm_read does SLM gather -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_slm_read(__SEIEED::vector_type_t addrs, - __SEIEED::simd_mask_storage_t pred = 1); - -// slm_write does SLM scatter -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void -__esimd_slm_write(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t vals, - __SEIEED::simd_mask_storage_t pred = 1); - -// slm_block_read reads a block of data from SLM -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_slm_block_read(uint32_t addr); - -// slm_block_write writes a block of data to SLM -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void -__esimd_slm_block_write(uint32_t addr, __SEIEED::vector_type_t vals); - -// slm_read4 does SLM gather4 -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION - __SEIEED::vector_type_t - __esimd_slm_read4(__SEIEED::vector_type_t addrs, - __SEIEED::simd_mask_storage_t pred = 1); - -// slm_write4 does SLM scatter4 -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void __esimd_slm_write4( - __SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t vals, - __SEIEED::simd_mask_storage_t pred = 1); - -// slm_atomic: SLM atomic -template <__SEIEE::atomic_op Op, typename Ty, int N> -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_slm_atomic0(__SEIEED::vector_type_t addrs, - __SEIEED::simd_mask_storage_t pred); - -template 
<__SEIEE::atomic_op Op, typename Ty, int N> -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_slm_atomic1(__SEIEED::vector_type_t addrs, +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_svm_atomic1(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t src0, - __SEIEED::simd_mask_storage_t pred); - -template <__SEIEE::atomic_op Op, typename Ty, int N> -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_slm_atomic2(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::simd_mask_storage_t pred); - -// Media block load -// -// @param Ty the element data type. -// -// @param M the hight of the 2D block. -// -// @param N the width of the 2D block. -// -// @param TACC type of the surface handle. -// -// @param modifier top/bottom field surface access control. -// -// @param handle the surface handle. -// -// @param plane planar surface index. -// -// @param width the width of the return block. -// -// @param x X-coordinate of the left upper rectangle corner in BYTES. -// -// @param y Y-coordinate of the left upper rectangle corner in ROWS. -// -// @return the linearized 2D block data read from surface. -// -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_media_block_load(unsigned modififer, TACC handle, unsigned plane, - unsigned width, unsigned x, unsigned y); - -// Media block store -// -// @param Ty the element data type. -// -// @param M the hight of the 2D block. -// -// @param N the width of the 2D block. -// -// @param TACC type of the surface handle. -// -// @param modifier top/bottom field surface access control. -// -// @param handle the surface handle. -// -// @param plane planar surface index. -// -// @param width the width of the return block. -// -// @param x X-coordinate of the left upper rectangle corner in BYTES. -// -// @param y Y-coordinate of the left upper rectangle corner in ROWS. 
-// -// @param vals the linearized 2D block data to be written to surface. -// -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void -__esimd_media_block_store(unsigned modififer, TACC handle, unsigned plane, - unsigned width, unsigned x, unsigned y, - __SEIEED::vector_type_t vals); - -/// \brief esimd_get_value -/// -/// @param sid the SYCL accessor. -/// -/// Returns the binding table index value. -/// -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION uint32_t -__esimd_get_value(SurfIndAliasTy sid); - -/// \brief Raw sends load. -/// -/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). -/// -/// @param execSize the execution size, which must be a compile time constant. -/// -/// @param pred the predicate to specify enabled channels. -/// -/// @param numSrc0 the number of GRFs for source-0, which must be a compile time -/// constant. -/// -/// @param numSrc1 the number of GRFs for source-1, which must be a compile time -/// constant. -/// -/// @param numDst the number of GRFs for destination, which must be a compile -/// time constant. -/// -/// @param sfid the shared function ID, which must be a compile time constant. -/// -/// @param exDesc the extended message descriptor. -/// -/// @param msgDesc the message descriptor. -/// -/// @param msgSrc0 the first source operand of send message. -/// -/// @param msgSrc1 the second source operand of send message. -/// -/// @param msgDst the destination operand of send message. -/// -/// Returns a simd vector of type Ty1 and size N1. -/// -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_raw_sends_load(uint8_t modifier, uint8_t execSize, - __SEIEED::simd_mask_storage_t pred, uint8_t numSrc0, - uint8_t numSrc1, uint8_t numDst, uint8_t sfid, - uint32_t exDesc, uint32_t msgDesc, - __SEIEED::vector_type_t msgSrc0, - __SEIEED::vector_type_t msgSrc1, - __SEIEED::vector_type_t msgDst); - -/// \brief Raw send load. 
-/// -/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). -/// -/// @param execSize the execution size, which must be a compile time constant. -/// -/// @param pred the predicate to specify enabled channels. -/// -/// @param numSrc0 the number of GRFs for source-0, which must be a compile time -/// constant. -/// -/// @param numDst the number of GRFs for destination, which must be a compile -/// time constant. -/// -/// @param sfid the shared function ID, which must be a compile time constant. -/// -/// @param exDesc the extended message descriptor. -/// -/// @param msgDesc the message descriptor. -/// -/// @param msgSrc0 the first source operand of send message. -/// -/// @param msgDst the destination operand of send message. -/// -/// Returns a simd vector of type Ty1 and size N1. -/// -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION __SEIEED::vector_type_t -__esimd_raw_send_load(uint8_t modifier, uint8_t execSize, - __SEIEED::simd_mask_storage_t pred, uint8_t numSrc0, - uint8_t numDst, uint8_t sfid, uint32_t exDesc, - uint32_t msgDesc, - __SEIEED::vector_type_t msgSrc0, - __SEIEED::vector_type_t msgDst); - -/// \brief Raw sends store. -/// -/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). -/// -/// @param execSize the execution size, which must be a compile time constant. -/// -/// @param pred the predicate to specify enabled channels. -/// -/// @param numSrc0 the number of GRFs for source-0, which must be a compile time -/// constant. -/// -/// @param numSrc1 the number of GRFs for source-1, which must be a compile time -/// constant. -/// -/// @param sfid the shared function ID, which must be a compile time constant. -/// -/// @param exDesc the extended message descriptor. -/// -/// @param msgDesc the message descriptor. -/// -/// @param msgSrc0 the first source operand of send message. -/// -/// @param msgSrc1 the second source operand of send message. 
-/// -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void __esimd_raw_sends_store( - uint8_t modifier, uint8_t execSize, __SEIEED::simd_mask_storage_t pred, - uint8_t numSrc0, uint8_t numSrc1, uint8_t sfid, uint32_t exDesc, - uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0, - __SEIEED::vector_type_t msgSrc1); - -/// \brief Raw send store. -/// -/// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). -/// -/// @param execSize the execution size, which must be a compile time constant. -/// -/// @param pred the predicate to specify enabled channels. -/// -/// @param numSrc0 the number of GRFs for source-0, which must be a compile time -/// constant. -/// -/// @param sfid the shared function ID, which must be a compile time constant. -/// -/// @param exDesc the extended message descriptor. -/// -/// @param msgDesc the message descriptor. -/// -/// @param msgSrc0 the first source operand of send message. -/// -template -SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void -__esimd_raw_send_store(uint8_t modifier, uint8_t execSize, - __SEIEED::simd_mask_storage_t pred, uint8_t numSrc0, - uint8_t sfid, uint32_t exDesc, uint32_t msgDesc, - __SEIEED::vector_type_t msgSrc0); -#ifndef __SYCL_DEVICE_ONLY__ - -template -inline __SEIEED::vector_type_t -__esimd_flat_read(__SEIEED::vector_type_t addrs, int ElemsPerAddr, - __SEIEED::simd_mask_storage_t pred) { - auto NumBlkDecoded = __SEIEED::ElemsPerAddrDecoding(NumBlk); - __SEIEED::vector_type_t V; - ElemsPerAddr = __SEIEED::ElemsPerAddrDecoding(ElemsPerAddr); - - for (int I = 0; I < N; I++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I]); - if (sizeof(Ty) == 2) - ElemsPerAddr = ElemsPerAddr / 2; - if (sizeof(Ty) <= 2) { - for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++) - V[I * NumBlkDecoded + J] = *(Addr + J); - } else { - for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++) - V[J * N + I] = *(Addr + J); - } - } - } - return V; -} - -template -inline __SEIEED::vector_type_t 
-__esimd_flat_read4(__SEIEED::vector_type_t addrs, - __SEIEED::simd_mask_storage_t pred) { - __SEIEED::vector_type_t V; - unsigned int Next = 0; - - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::R)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I]); - V[Next] = *Addr; - } - } - } - - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::G)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty)); - V[Next] = *Addr; - } - } - } - - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::B)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty)); - V[Next] = *Addr; - } - } - } - - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::A)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty) + - sizeof(Ty)); - V[Next] = *Addr; - } - } - } + __SEIEED::simd_mask_storage_t pred) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw cl::sycl::feature_not_supported(); +} +#endif // __SYCL_DEVICE_ONLY__ - return V; +template <__SEIEE::atomic_op Op, typename Ty, int N, + __SEIEE::CacheHint L1H = __SEIEE::CacheHint::None, + __SEIEE::CacheHint L3H = __SEIEE::CacheHint::None> +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_svm_atomic2(__SEIEED::vector_type_t addrs, + __SEIEED::vector_type_t src0, + __SEIEED::vector_type_t src1, + __SEIEED::simd_mask_storage_t pred) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw cl::sycl::feature_not_supported(); } +#endif // __SYCL_DEVICE_ONLY__ -template -inline void __esimd_flat_write( - __SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t - vals, - int ElemsPerAddr, __SEIEED::simd_mask_storage_t pred) { - auto NumBlkDecoded = __SEIEED::ElemsPerAddrDecoding(NumBlk); - ElemsPerAddr = 
__SEIEED::ElemsPerAddrDecoding(ElemsPerAddr); +// esimd_barrier, generic group barrier +__ESIMD_INTRIN void __esimd_barrier() +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw cl::sycl::feature_not_supported(); +} +#endif // __SYCL_DEVICE_ONLY__ - for (int I = 0; I < N; I++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I]); - if (sizeof(Ty) == 2) - ElemsPerAddr = ElemsPerAddr / 2; - if (sizeof(Ty) <= 2) { - for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++) - *(Addr + J) = vals[I * NumBlkDecoded + J]; - } else { - for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++) - *(Addr + J) = vals[J * N + I]; - } - } - } +// generic work-group split barrier +__ESIMD_INTRIN void __esimd_sbarrier(__SEIEE::split_barrier_action flag) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw cl::sycl::feature_not_supported(); } +#endif // __SYCL_DEVICE_ONLY__ -template -inline void __esimd_flat_write4( - __SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t vals, - __SEIEED::simd_mask_storage_t pred) { - __SEIEED::vector_type_t V; - unsigned int Next = 0; +// slm_fence sets the SLM read/write order +__ESIMD_INTRIN void __esimd_fence(uint8_t cntl) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw cl::sycl::feature_not_supported(); +} +#endif // __SYCL_DEVICE_ONLY__ - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::R)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I]); - *Addr = vals[Next]; - } - } - } +// Scaled gather from a surface. 
+template +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_gather_scaled(__SEIEED::simd_mask_storage_t pred, + SurfIndAliasTy surf_ind, uint32_t global_offset, + __SEIEED::vector_type_t addrs) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw cl::sycl::feature_not_supported(); +} +#endif // __SYCL_DEVICE_ONLY__ - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::G)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty)); - *Addr = vals[Next]; - } - } - } +// Reads a block of data from given surface at given offset, offset must be +// 16-byte-aligned. +template +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_oword_ld(SurfIndAliasTy surf_ind, uint32_t addr) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw cl::sycl::feature_not_supported(); +} +#endif // __SYCL_DEVICE_ONLY__ - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::B)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty)); - *Addr = vals[Next]; - } - } - } +// gather4 scaled from a surface/SLM +template +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_gather4_scaled(__SEIEED::simd_mask_storage_t pred, + SurfIndAliasTy surf_ind, int global_offset, + __SEIEED::vector_type_t offsets) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw cl::sycl::feature_not_supported(); +} +#endif // __SYCL_DEVICE_ONLY__ - if constexpr (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::A)) { - for (int I = 0; I < N; I++, Next++) { - if (pred[I]) { - Ty *Addr = reinterpret_cast(addrs[I] + sizeof(Ty) + sizeof(Ty) + - sizeof(Ty)); - *Addr = vals[Next]; - } - } - } +// scatter4 scaled to a surface/SLM +template +__ESIMD_INTRIN void __esimd_scatter4_scaled( + __SEIEED::simd_mask_storage_t pred, SurfIndAliasTy surf_ind, + int global_offset, __SEIEED::vector_type_t offsets, + __SEIEED::vector_type_t vals) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw 
cl::sycl::feature_not_supported(); } +#endif // __SYCL_DEVICE_ONLY__ -template -inline __SEIEED::vector_type_t -__esimd_flat_block_read_unaligned(uint64_t addr) { - __SEIEED::vector_type_t V; +// Surface-based atomic operations +template <__SEIEE::atomic_op Op, typename Ty, int N, typename SurfIndAliasTy> +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_dword_atomic0(__SEIEED::simd_mask_storage_t pred, + SurfIndAliasTy surf_ind, + __SEIEED::vector_type_t addrs) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw cl::sycl::feature_not_supported(); +} +#endif // __SYCL_DEVICE_ONLY__ - for (int I = 0; I < N; I++) { - Ty *Addr = reinterpret_cast(addr + I * sizeof(Ty)); - V[I] = *Addr; - } - return V; +template <__SEIEE::atomic_op Op, typename Ty, int N, typename SurfIndAliasTy> +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_dword_atomic1(__SEIEED::simd_mask_storage_t pred, + SurfIndAliasTy surf_ind, + __SEIEED::vector_type_t addrs, + __SEIEED::vector_type_t src0) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw cl::sycl::feature_not_supported(); } +#endif // __SYCL_DEVICE_ONLY__ -template -inline void __esimd_flat_block_write(uint64_t addr, - __SEIEED::vector_type_t vals) { - for (int I = 0; I < N; I++) { - Ty *Addr = reinterpret_cast(addr + I * sizeof(Ty)); - *Addr = vals[I]; - } +template <__SEIEE::atomic_op Op, typename Ty, int N, typename SurfIndAliasTy> +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_dword_atomic2( + __SEIEED::simd_mask_storage_t pred, SurfIndAliasTy surf_ind, + __SEIEED::vector_type_t addrs, + __SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ + throw cl::sycl::feature_not_supported(); } +#endif // __SYCL_DEVICE_ONLY__ -template -inline __SEIEED::vector_type_t -__esimd_media_block_load(unsigned modififer, TACC handle, unsigned plane, - unsigned width, unsigned x, unsigned y) { +// Media block load. +// +// @tparam Ty the element data type. +// @tparam M the hight of the 2D block. 
+// @tparam N the width of the 2D block. +// @tparam Modifier top/bottom field surface access control. +// @tparam TACC type of the surface handle. +// @tparam Plane planar surface index. +// @tparam BlockWidth the width of the return block. +// @param handle the surface handle. +// @param x X-coordinate of the left upper rectangle corner in BYTES. +// @param y Y-coordinate of the left upper rectangle corner in ROWS. +// +// @return the linearized 2D block data read from surface. +// +template +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_media_ld(TACC handle, unsigned x, unsigned y) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ // On host the input surface is modeled as sycl image 2d object, // and the read/write access is done through accessor, // which is passed in as the handle argument. @@ -733,12 +639,30 @@ __esimd_media_block_load(unsigned modififer, TACC handle, unsigned plane, return vals; } +#endif // __SYCL_DEVICE_ONLY__ -template -inline void __esimd_media_block_store(unsigned modififer, TACC handle, - unsigned plane, unsigned width, - unsigned x, unsigned y, - __SEIEED::vector_type_t vals) { +// Media block store +// +// @tparam Ty the element data type. +// @tparam M the hight of the 2D block. +// @tparam N the width of the 2D block. +// @tparam Modifier top/bottom field surface access control. +// @tparam TACC type of the surface handle. +// @tparam Plane planar surface index. +// @tparam BlockWidth the width of the return block. +// @param handle the surface handle. +// @param x X-coordinate of the left upper rectangle corner in BYTES. +// @param y Y-coordinate of the left upper rectangle corner in ROWS. +// @param vals the linearized 2D block data to be written to surface. 
+// +template +__ESIMD_INTRIN void __esimd_media_st(TACC handle, unsigned x, unsigned y, + __SEIEED::vector_type_t vals) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ unsigned bpp = __SEIEED::AccessorPrivateProxy::getElemSize(handle); unsigned vpp = bpp / sizeof(Ty); auto range = __SEIEED::AccessorPrivateProxy::getImageRange(handle); @@ -779,173 +703,37 @@ inline void __esimd_media_block_store(unsigned modififer, TACC handle, j++; } } +#endif // __SYCL_DEVICE_ONLY__ -template -inline uint16_t __esimd_any(__SEIEED::vector_type_t src) { - for (unsigned int i = 0; i != N; i++) { - if (src[i] != 0) - return 1; - } - return 0; -} - -template -inline uint16_t __esimd_all(__SEIEED::vector_type_t src) { - for (unsigned int i = 0; i != N; i++) { - if (src[i] == 0) - return 0; - } - return 1; -} - -template -inline __SEIEED::vector_type_t -__esimd_dp4(__SEIEED::vector_type_t v1, - __SEIEED::vector_type_t v2) { - __SEIEED::vector_type_t retv; - for (auto i = 0; i != N; i += 4) { - Ty dp = (v1[i] * v2[i]) + (v1[i + 1] * v2[i + 1]) + - (v1[i + 2] * v2[i + 2]) + (v1[i + 3] * v2[i + 3]); - retv[i] = dp; - retv[i + 1] = dp; - retv[i + 2] = dp; - retv[i + 3] = dp; - } - return retv; -} - -/// TODO -inline void __esimd_barrier() {} - -inline void __esimd_sbarrier(__SEIEE::split_barrier_action flag) {} - -inline void __esimd_slm_fence(uint8_t cntl) {} - -template -inline __SEIEED::vector_type_t -__esimd_slm_read(__SEIEED::vector_type_t addrs, - __SEIEED::simd_mask_storage_t pred) { - __SEIEED::vector_type_t retv; - return retv; -} - -// slm_write does SLM scatter -template -inline void __esimd_slm_write(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t vals, - __SEIEED::simd_mask_storage_t pred) {} - -// slm_block_read reads a block of data from SLM -template -inline __SEIEED::vector_type_t __esimd_slm_block_read(uint32_t addr) { - __SEIEED::vector_type_t retv; - return retv; -} - -// slm_block_write writes a block of data to SLM -template -inline void 
__esimd_slm_block_write(uint32_t addr, - __SEIEED::vector_type_t vals) {} - -// slm_read4 does SLM gather4 -template -inline __SEIEED::vector_type_t -__esimd_slm_read4(__SEIEED::vector_type_t addrs, - __SEIEED::simd_mask_storage_t pred) { - __SEIEED::vector_type_t retv; - return retv; -} - -// slm_write4 does SLM scatter4 -template -inline void __esimd_slm_write4( - __SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t vals, - __SEIEED::simd_mask_storage_t pred) {} - -// slm_atomic: SLM atomic -template <__SEIEE::atomic_op Op, typename Ty, int N> -inline __SEIEED::vector_type_t -__esimd_slm_atomic0(__SEIEED::vector_type_t addrs, - __SEIEED::simd_mask_storage_t pred) { - __SEIEED::vector_type_t retv; - return retv; -} - -template <__SEIEE::atomic_op Op, typename Ty, int N> -inline __SEIEED::vector_type_t -__esimd_slm_atomic1(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t src0, - __SEIEED::simd_mask_storage_t pred) { - __SEIEED::vector_type_t retv; - return retv; -} - -template <__SEIEE::atomic_op Op, typename Ty, int N> -inline __SEIEED::vector_type_t -__esimd_slm_atomic2(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::simd_mask_storage_t pred) { - __SEIEED::vector_type_t retv; - return retv; -} - -template <__SEIEE::atomic_op Op, typename Ty, int N, __SEIEE::CacheHint L1H, - __SEIEE::CacheHint L3H> -inline __SEIEED::vector_type_t -__esimd_flat_atomic0(__SEIEED::vector_type_t addrs, - __SEIEED::simd_mask_storage_t pred) { - __SEIEED::vector_type_t retv; - return retv; -} - -template <__SEIEE::atomic_op Op, typename Ty, int N, __SEIEE::CacheHint L1H, - __SEIEE::CacheHint L3H> -inline __SEIEED::vector_type_t -__esimd_flat_atomic1(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t src0, - __SEIEED::simd_mask_storage_t pred) { - __SEIEED::vector_type_t retv; - return retv; -} - -template <__SEIEE::atomic_op Op, typename Ty, int N, __SEIEE::CacheHint L1H, - __SEIEE::CacheHint L3H> -inline 
__SEIEED::vector_type_t -__esimd_flat_atomic2(__SEIEED::vector_type_t addrs, - __SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::simd_mask_storage_t pred) { - __SEIEED::vector_type_t retv; - return retv; -} - -template -inline __SEIEED::vector_type_t -__esimd_block_read(SurfIndAliasTy surf_ind, uint32_t offset) { - throw cl::sycl::feature_not_supported(); - return __SEIEED::vector_type_t(); -} - -template -inline void __esimd_block_write(SurfIndAliasTy surf_ind, uint32_t offset, - __SEIEED::vector_type_t vals) { - - throw cl::sycl::feature_not_supported(); -} - -/// \brief esimd_get_value -/// -/// @param acc the SYCL accessor. -/// -/// Returns the binding table index value. +#ifdef __SYCL_DEVICE_ONLY__ +/// \brief Converts given value to a surface index. +/// The input must always be a result of +/// detail::AccessorPrivateProxy::getNativeImageObj(acc) +/// where acc is a buffer or image accessor. If the result is, say, 'obj', then +/// 'obj' is really a value of the surface index kept in a differently typed +/// accessor field. Front-end compilation time type of 'obj' is either +/// ConcreteASPtrType (detail::DecoratedType::type *), for a buffer +/// or +/// image{1,2,3}d_t OpenCL type for an image +/// But when doing code generation, FE replaces e.g. '__read_only image2d_t' FE +/// type with '%opencl.image2d_ro_t addrspace(1) *' LLVM type. +/// image2d_t can neither be reinterpret_cast'ed from pointer to intptr_t +/// (because it is not a pointer at FE translation time), nor it can be +/// bit_cast'ed to intptr_t (because it is not trivially copyable). This +/// intrinsic takes advantage of the fact that in LLVM IR 'obj' is always a +/// pointer, where we can do ptr to uint32_t conversion. +/// This intrinsic can be called only from the device code, as +/// accessor => memory handle translation for host is different. +/// +/// @param SYCL accessor's native memory object extracted from it via +/// getNativeImageObj. 
+/// +/// Returns the surface index (binding table index) value 'sid' corresponds to. /// -template -inline uint32_t __esimd_get_value(AccessorTy acc) { - throw cl::sycl::feature_not_supported(); - return 0; -} +template +__ESIMD_INTRIN __SEIEE::SurfaceIndex +__esimd_get_surface_index(SurfIndAliasTy sid); +#endif // __SYCL_DEVICE_ONLY__ /// \brief Raw sends load. /// @@ -979,16 +767,20 @@ inline uint32_t __esimd_get_value(AccessorTy acc) { /// Returns a simd vector of type Ty1 and size N1. /// template -inline __SEIEED::vector_type_t __esimd_raw_sends_load( + int N = 16> +__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_raw_sends2( uint8_t modifier, uint8_t execSize, __SEIEED::simd_mask_storage_t pred, uint8_t numSrc0, uint8_t numSrc1, uint8_t numDst, uint8_t sfid, uint32_t exDesc, uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0, __SEIEED::vector_type_t msgSrc1, - __SEIEED::vector_type_t msgDst) { + __SEIEED::vector_type_t msgDst) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ throw cl::sycl::feature_not_supported(); - return 0; } +#endif // __SYCL_DEVICE_ONLY__ /// \brief Raw send load. /// @@ -1016,15 +808,20 @@ inline __SEIEED::vector_type_t __esimd_raw_sends_load( /// /// Returns a simd vector of type Ty1 and size N1. /// -template -inline __SEIEED::vector_type_t __esimd_raw_send_load( - uint8_t modifier, uint8_t execSize, __SEIEED::simd_mask_storage_t pred, - uint8_t numSrc0, uint8_t numDst, uint8_t sfid, uint32_t exDesc, - uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0, - __SEIEED::vector_type_t msgDst) { +template +__ESIMD_INTRIN __SEIEED::vector_type_t +__esimd_raw_send2(uint8_t modifier, uint8_t execSize, + __SEIEED::simd_mask_storage_t pred, uint8_t numSrc0, + uint8_t numDst, uint8_t sfid, uint32_t exDesc, + uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0, + __SEIEED::vector_type_t msgDst) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ throw cl::sycl::feature_not_supported(); - return 0; } +#endif // __SYCL_DEVICE_ONLY__ /// \brief Raw sends store. 
/// @@ -1050,16 +847,19 @@ inline __SEIEED::vector_type_t __esimd_raw_send_load( /// /// @param msgSrc1 the second source operand of send message. /// -template -inline void __esimd_raw_sends_store(uint8_t modifier, uint8_t execSize, - __SEIEED::simd_mask_storage_t pred, - uint8_t numSrc0, uint8_t numSrc1, - uint8_t sfid, uint32_t exDesc, - uint32_t msgDesc, - __SEIEED::vector_type_t msgSrc0, - __SEIEED::vector_type_t msgSrc1) { +template +__ESIMD_INTRIN void __esimd_raw_sends2_noresult( + uint8_t modifier, uint8_t execSize, __SEIEED::simd_mask_storage_t pred, + uint8_t numSrc0, uint8_t numSrc1, uint8_t sfid, uint32_t exDesc, + uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0, + __SEIEED::vector_type_t msgSrc1) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ throw cl::sycl::feature_not_supported(); } +#endif // __SYCL_DEVICE_ONLY__ /// \brief Raw send store. /// @@ -1080,13 +880,15 @@ inline void __esimd_raw_sends_store(uint8_t modifier, uint8_t execSize, /// /// @param msgSrc0 the first source operand of send message. 
/// -template -inline void __esimd_raw_send_store(uint8_t modifier, uint8_t execSize, - __SEIEED::simd_mask_storage_t pred, - uint8_t numSrc0, uint8_t sfid, - uint32_t exDesc, uint32_t msgDesc, - __SEIEED::vector_type_t msgSrc0) { +template +__ESIMD_INTRIN void __esimd_raw_send2_noresult( + uint8_t modifier, uint8_t execSize, __SEIEED::simd_mask_storage_t pred, + uint8_t numSrc0, uint8_t sfid, uint32_t exDesc, uint32_t msgDesc, + __SEIEED::vector_type_t msgSrc0) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else +{ throw cl::sycl::feature_not_supported(); } - #endif // __SYCL_DEVICE_ONLY__ diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp index b765b7f83e761..caeb4138bea37 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp @@ -647,7 +647,7 @@ void simd_obj_impl::copy_from(const T *const Addr) { uintptr_t AddrVal = reinterpret_cast(Addr); *this = - __esimd_flat_block_read_unaligned( + __esimd_svm_block_ld_unaligned( AddrVal); } @@ -666,10 +666,11 @@ simd_obj_impl::copy_from(AccessorT acc, uint32_t offset) { static_assert(Sz <= 8 * OperandSize::OWORD, "block size must be at most 8 owords"); #if defined(__SYCL_DEVICE_ONLY__) - auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc); - *this = __esimd_block_read(surf_ind, offset); + auto surf_ind = + __esimd_get_surface_index(AccessorPrivateProxy::getNativeImageObj(acc)); + *this = __esimd_oword_ld_unaligned(surf_ind, offset); #else - *this = __esimd_block_read(acc, offset); + *this = __esimd_oword_ld_unaligned(acc, offset); #endif // __SYCL_DEVICE_ONLY__ } @@ -686,8 +687,7 @@ void simd_obj_impl::copy_to(T *addr) { "block size must be at most 8 owords"); uintptr_t AddrVal = reinterpret_cast(addr); - __esimd_flat_block_write(AddrVal, - data()); + __esimd_svm_block_st(AddrVal, data()); } template @@ -706,10 
+706,11 @@ simd_obj_impl::copy_to(AccessorT acc, uint32_t offset) { "block size must be at most 8 owords"); #if defined(__SYCL_DEVICE_ONLY__) - auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc); - __esimd_block_write(surf_ind, offset >> 4, data()); + auto surf_ind = + __esimd_get_surface_index(AccessorPrivateProxy::getNativeImageObj(acc)); + __esimd_oword_st(surf_ind, offset >> 4, data()); #else - __esimd_block_write(acc, offset >> 4, data()); + __esimd_oword_st(acc, offset >> 4, data()); #endif // __SYCL_DEVICE_ONLY__ } } // namespace detail diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp index 021452042f097..bf47b8868128e 100755 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/util.hpp @@ -19,6 +19,12 @@ #define __SEIEE sycl::ext::intel::experimental::esimd #define __SEIEEED sycl::ext::intel::experimental::esimd::emu::detail +#ifdef __SYCL_DEVICE_ONLY__ +#define __ESIMD_INTRIN SYCL_EXTERNAL SYCL_ESIMD_FUNCTION +#else +#define __ESIMD_INTRIN inline +#endif // __SYCL_DEVICE_ONLY__ + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace ext { diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp index 5134d42bd31e9..403edec49f4d0 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp @@ -38,7 +38,7 @@ namespace esimd { template ESIMD_NODEBUG ESIMD_INLINE simd esimd_sat(simd src) { if constexpr (std::is_floating_point::value) - return __esimd_satf(src.data()); + return __esimd_sat(src.data()); else if constexpr (std::is_floating_point::value) { if constexpr (std::is_unsigned::value) return __esimd_fptoui_sat(src.data()); @@ -692,7 +692,7 @@ ESIMD_NODEBUG ESIMD_INLINE simd esimd_max(simd src0, simd src1, int flag = saturation_off) { if 
constexpr (std::is_floating_point::value) { auto Result = __esimd_fmax(src0.data(), src1.data()); - Result = (flag == saturation_off) ? Result : __esimd_satf(Result); + Result = (flag == saturation_off) ? Result : __esimd_sat(Result); return simd(Result); } else if constexpr (std::is_unsigned::value) { auto Result = __esimd_umax(src0.data(), src1.data()); @@ -779,7 +779,7 @@ ESIMD_NODEBUG ESIMD_INLINE simd esimd_min(simd src0, simd src1, int flag = saturation_off) { if constexpr (std::is_floating_point::value) { auto Result = __esimd_fmin(src0.data(), src1.data()); - Result = (flag == saturation_off) ? Result : __esimd_satf(Result); + Result = (flag == saturation_off) ? Result : __esimd_sat(Result); return simd(Result); } else if constexpr (std::is_unsigned::value) { auto Result = __esimd_umin(src0.data(), src1.data()); @@ -1350,7 +1350,7 @@ ESIMD_NODEBUG ESIMD_INLINE simd Src2 = src2; simd Src3 = src3; - return __esimd_bfins(Src0.data(), Src1.data(), Src2.data(), Src3.data()); + return __esimd_bfi(Src0.data(), Src1.data(), Src2.data(), Src3.data()); } template @@ -1376,7 +1376,7 @@ ESIMD_NODEBUG ESIMD_INLINE simd Src1 = src1; simd Src2 = src2; - return __esimd_bfext(Src0.data(), Src1.data(), Src2.data()); + return __esimd_sbfe(Src0.data(), Src1.data(), Src2.data()); } template @@ -1423,11 +1423,11 @@ esimd_bf_extract(T1 src0, T2 src1, T3 src2) { // a "typename T". Since the type can only be float, we hack it // by defining T=void without instantiating it to be float. 
-#define ESIMD_INTRINSIC_DEF(type, name) \ +#define ESIMD_INTRINSIC_DEF(type, name, iname) \ template \ ESIMD_NODEBUG ESIMD_INLINE simd esimd_##name( \ simd src0, int flag = saturation_off) { \ - simd Result = __esimd_##name(src0.data()); \ + simd Result = __esimd_##iname(src0.data()); \ if (flag != saturation_on) \ return Result; \ return esimd_sat(Result); \ @@ -1440,25 +1440,25 @@ esimd_bf_extract(T1 src0, T2 src1, T3 src2) { return Result[0]; \ } -ESIMD_INTRINSIC_DEF(float, inv) -ESIMD_INTRINSIC_DEF(float, log) -ESIMD_INTRINSIC_DEF(float, exp) -ESIMD_INTRINSIC_DEF(float, sqrt) -ESIMD_INTRINSIC_DEF(float, sqrt_ieee) -ESIMD_INTRINSIC_DEF(float, rsqrt) -ESIMD_INTRINSIC_DEF(float, sin) -ESIMD_INTRINSIC_DEF(float, cos) +ESIMD_INTRINSIC_DEF(float, inv, inv) +ESIMD_INTRINSIC_DEF(float, log, log) +ESIMD_INTRINSIC_DEF(float, exp, exp) +ESIMD_INTRINSIC_DEF(float, sqrt, sqrt) +ESIMD_INTRINSIC_DEF(float, ieee_sqrt, sqrt_ieee) +ESIMD_INTRINSIC_DEF(float, rsqrt, rsqrt) +ESIMD_INTRINSIC_DEF(float, sin, sin) +ESIMD_INTRINSIC_DEF(float, cos, cos) -ESIMD_INTRINSIC_DEF(double, sqrt_ieee) +ESIMD_INTRINSIC_DEF(double, ieee_sqrt, sqrt_ieee) #undef ESIMD_INTRINSIC_DEF -#define ESIMD_INTRINSIC_DEF(ftype, name) \ +#define ESIMD_INTRINSIC_DEF(ftype, name, iname) \ template \ ESIMD_NODEBUG ESIMD_INLINE simd esimd_##name( \ simd src0, U src1, int flag = saturation_off) { \ simd Src1 = src1; \ - simd Result = __esimd_##name(src0.data(), Src1.data()); \ + simd Result = __esimd_##iname(src0.data(), Src1.data()); \ if (flag != saturation_on) \ return Result; \ \ @@ -1481,10 +1481,10 @@ ESIMD_INTRINSIC_DEF(double, sqrt_ieee) return Result[0]; \ } -ESIMD_INTRINSIC_DEF(float, pow) +ESIMD_INTRINSIC_DEF(float, pow, pow) -ESIMD_INTRINSIC_DEF(float, div_ieee) -ESIMD_INTRINSIC_DEF(double, div_ieee) +ESIMD_INTRINSIC_DEF(float, div_ieee, ieee_div) +ESIMD_INTRINSIC_DEF(double, div_ieee, ieee_div) #undef ESIMD_INTRINSIC_DEF @@ -2394,11 +2394,11 @@ template struct esimd_apply_reduced_max { template simd 
operator()(simd v1, simd v2) { if constexpr (std::is_floating_point::value) { - return __esimd_reduced_fmax(v1.data(), v2.data()); + return __esimd_fmax(v1.data(), v2.data()); } else if constexpr (std::is_unsigned::value) { - return __esimd_reduced_umax(v1.data(), v2.data()); + return __esimd_umax(v1.data(), v2.data()); } else { - return __esimd_reduced_smax(v1.data(), v2.data()); + return __esimd_smax(v1.data(), v2.data()); } } }; @@ -2407,11 +2407,11 @@ template struct esimd_apply_reduced_min { template simd operator()(simd v1, simd v2) { if constexpr (std::is_floating_point::value) { - return __esimd_reduced_fmin(v1.data(), v2.data()); + return __esimd_fmin(v1.data(), v2.data()); } else if constexpr (std::is_unsigned::value) { - return __esimd_reduced_umin(v1.data(), v2.data()); + return __esimd_umin(v1.data(), v2.data()); } else { - return __esimd_reduced_smin(v1.data(), v2.data()); + return __esimd_smin(v1.data(), v2.data()); } } }; diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 8f3cb71d1429e..7645a7db7ccef 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -26,6 +26,45 @@ namespace intel { namespace experimental { namespace esimd { +namespace detail { +// Type used in internal functions to designate SLM access by +// providing dummy accessor of this type. Used to make it possible to delegate +// implemenations of SLM memory accesses to general surface-based memory +// accesses and thus reuse validity checks etc. +struct LocalAccessorMarker {}; + +// Shared Local Memory Binding Table Index (aka surface index). +static inline constexpr SurfaceIndex SLM_BTI = 254; +static inline constexpr SurfaceIndex INVALID_BTI = + static_cast(-1); +} // namespace detail + +/// Get surface index corresponding to a SYCL accessor. +/// +/// \param acc a SYCL buffer or image accessor. 
+/// \return the index of the corresponding surface (aka "binding table index"). +/// +/// \ingroup sycl_esimd +template +ESIMD_INLINE ESIMD_NODEBUG SurfaceIndex get_surface_index(AccessorTy acc) { +#ifdef __SYCL_DEVICE_ONLY__ + if constexpr (std::is_same_v) { + return detail::SLM_BTI; + } else { + const auto mem_obj = detail::AccessorPrivateProxy::getNativeImageObj(acc); + return __esimd_get_surface_index(mem_obj); + } +#else + throw cl::sycl::feature_not_supported(); +#endif +} + +#ifdef __SYCL_DEVICE_ONLY__ +#define __ESIMD_GET_SURF_HANDLE(acc) get_surface_index(acc) +#else +#define __ESIMD_GET_SURF_HANDLE(acc) acc +#endif // __SYCL_DEVICE_ONLY__ + // TODO @Pennycook // {quote} // ...I'd like us to think more about what we can do to make these interfaces @@ -84,29 +123,29 @@ gather(T *p, simd offsets, simd_mask pred = 1) { if constexpr (sizeof(T) == 1 && ElemsPerAddr == 2) { auto Ret = - __esimd_flat_read(), L1H, L3H>( + __esimd_svm_gather(), L1H, L3H>( addrs.data(), detail::ElemsPerAddrEncoding(), pred.data()); return __esimd_rdregion(Ret, 0); } else if constexpr (sizeof(T) == 1 && ElemsPerAddr == 1) { auto Ret = - __esimd_flat_read(), L1H, L3H>( + __esimd_svm_gather(), L1H, L3H>( addrs.data(), detail::ElemsPerAddrEncoding(), pred.data()); return __esimd_rdregion(Ret, 0); } else if constexpr (sizeof(T) == 2 && ElemsPerAddr == 1) { auto Ret = - __esimd_flat_read(), L1H, L3H>( + __esimd_svm_gather(), L1H, L3H>( addrs.data(), detail::ElemsPerAddrEncoding<2>(), pred.data()); return __esimd_rdregion(Ret, 0); } else if constexpr (sizeof(T) == 2) - return __esimd_flat_read(), - L1H, L3H>( + return __esimd_svm_gather< + T, n, detail::ElemsPerAddrEncoding(), L1H, L3H>( addrs.data(), detail::ElemsPerAddrEncoding<2 * ElemsPerAddr>(), pred.data()); else - return __esimd_flat_read(), - L1H, L3H>( + return __esimd_svm_gather< + T, n, detail::ElemsPerAddrEncoding(), L1H, L3H>( addrs.data(), detail::ElemsPerAddrEncoding(), pred.data()); } @@ -134,31 +173,31 @@ scatter(T *p, 
simd vals, simd offsets, simd D; D = __esimd_wrregion( D.data(), vals.data(), 0); - __esimd_flat_write(), L1H, L3H>( + __esimd_svm_scatter(), L1H, L3H>( addrs.data(), D.data(), detail::ElemsPerAddrEncoding(), pred.data()); } else if constexpr (sizeof(T) == 1 && ElemsPerAddr == 1) { simd D; D = __esimd_wrregion( D.data(), vals.data(), 0); - __esimd_flat_write(), L1H, L3H>( + __esimd_svm_scatter(), L1H, L3H>( addrs.data(), D.data(), detail::ElemsPerAddrEncoding(), pred.data()); } else if constexpr (sizeof(T) == 2 && ElemsPerAddr == 1) { simd D; D = __esimd_wrregion(D.data(), vals.data(), 0); - __esimd_flat_write(), L1H, L3H>( + __esimd_svm_scatter(), L1H, L3H>( addrs.data(), D.data(), detail::ElemsPerAddrEncoding<2>(), pred.data()); } else if constexpr (sizeof(T) == 2) - __esimd_flat_write(), L1H, - L3H>(addrs.data(), vals.data(), - detail::ElemsPerAddrEncoding<2 * ElemsPerAddr>(), - pred.data()); + __esimd_svm_scatter(), L1H, + L3H>(addrs.data(), vals.data(), + detail::ElemsPerAddrEncoding<2 * ElemsPerAddr>(), + pred.data()); else - __esimd_flat_write(), L1H, - L3H>(addrs.data(), vals.data(), - detail::ElemsPerAddrEncoding(), - pred.data()); + __esimd_svm_scatter(), L1H, + L3H>(addrs.data(), vals.data(), + detail::ElemsPerAddrEncoding(), + pred.data()); } /// Flat-address block-load. @@ -182,7 +221,7 @@ ESIMD_INLINE ESIMD_NODEBUG simd block_load(const T *const addr) { "block size must be at most 8 owords"); uintptr_t Addr = reinterpret_cast(addr); - return __esimd_flat_block_read_unaligned(Addr); + return __esimd_svm_block_ld_unaligned(Addr); } /// Accessor-based block-load. @@ -214,7 +253,7 @@ ESIMD_INLINE ESIMD_NODEBUG void block_store(T *p, simd vals) { "block size must be at most 8 owords"); uintptr_t Addr = reinterpret_cast(p); - __esimd_flat_block_write(Addr, vals.data()); + __esimd_svm_block_st(Addr, vals.data()); } /// Accessor-based block-store. 
@@ -260,6 +299,7 @@ ESIMD_INLINE ESIMD_NODEBUG glob_offset *= t_scale; offsets *= t_scale; } + const auto si = get_surface_index(acc); if constexpr (sizeof(T) < 4) { static_assert(std::is_integral::value, @@ -267,26 +307,13 @@ ESIMD_INLINE ESIMD_NODEBUG using PromoT = typename sycl::detail::conditional_t::value, int32_t, uint32_t>; -#if defined(__SYCL_DEVICE_ONLY__) - const auto surf_ind = detail::AccessorPrivateProxy::getNativeImageObj(acc); - const simd promo_vals = - __esimd_surf_read(scale, surf_ind, glob_offset, offsets.data()); -#else const simd promo_vals = - __esimd_surf_read( - scale, acc, glob_offset, offsets.data()); -#endif + __esimd_gather_scaled2(si, glob_offset, offsets.data()); return convert(promo_vals); } else { -#if defined(__SYCL_DEVICE_ONLY__) - const auto surf_ind = detail::AccessorPrivateProxy::getNativeImageObj(acc); - return __esimd_surf_read( - scale, surf_ind, glob_offset, offsets.data()); -#else - return __esimd_surf_read( - scale, acc, glob_offset, offsets.data()); -#endif + return __esimd_gather_scaled2(si, glob_offset, offsets.data()); } } @@ -321,12 +348,13 @@ ESIMD_INLINE ESIMD_NODEBUG constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding(); // TODO (performance) use hardware-supported scale once BE supports it - constexpr uint32_t scale = 0; + constexpr int16_t scale = 0; constexpr uint32_t t_scale = sizeof(T); if constexpr (t_scale > 1) { glob_offset *= t_scale; offsets *= t_scale; } + const auto si = __ESIMD_GET_SURF_HANDLE(acc); if constexpr (sizeof(T) < 4) { static_assert(std::is_integral::value, @@ -335,25 +363,12 @@ ESIMD_INLINE ESIMD_NODEBUG typename sycl::detail::conditional_t::value, int32_t, uint32_t>; const simd promo_vals = convert(vals); -#if defined(__SYCL_DEVICE_ONLY__) - const auto surf_ind = detail::AccessorPrivateProxy::getNativeImageObj(acc); - __esimd_surf_write( - pred.data(), scale, surf_ind, glob_offset, offsets.data(), - promo_vals.data()); -#else - __esimd_surf_write( - pred.data(), scale, acc, 
glob_offset, offsets.data(), - promo_vals.data()); -#endif + __esimd_scatter_scaled(pred.data(), si, glob_offset, offsets.data(), + promo_vals.data()); } else { -#if defined(__SYCL_DEVICE_ONLY__) - const auto surf_ind = detail::AccessorPrivateProxy::getNativeImageObj(acc); - __esimd_surf_write( - pred.data(), scale, surf_ind, glob_offset, offsets.data(), vals.data()); -#else - __esimd_surf_write( - pred.data(), scale, acc, glob_offset, offsets.data(), vals.data()); -#endif + __esimd_scatter_scaled( + pred.data(), si, glob_offset, offsets.data(), vals.data()); } } @@ -395,7 +410,8 @@ gather_rgba(T *p, simd offsets, simd_mask pred = 1) { simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); addrs = addrs + offsets_i; - return __esimd_flat_read4(addrs.data(), pred.data()); + return __esimd_svm_gather4_scaled(addrs.data(), + pred.data()); } /// Flat-address gather4. @@ -435,8 +451,8 @@ ESIMD_INLINE ESIMD_NODEBUG simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); addrs = addrs + offsets_i; - __esimd_flat_write4(addrs.data(), vals.data(), - pred.data()); + __esimd_svm_scatter4_scaled(addrs.data(), vals.data(), + pred.data()); } /// Flat-address scatter4. @@ -561,7 +577,7 @@ ESIMD_NODEBUG ESIMD_INLINE simd vAddr(reinterpret_cast(p)); simd offset_i1 = convert(offset); vAddr += offset_i1; - return __esimd_flat_atomic0(vAddr.data(), pred.data()); + return __esimd_svm_atomic0(vAddr.data(), pred.data()); } /// Flat-address atomic, one source operand, add/sub/min/max etc. @@ -576,8 +592,8 @@ ESIMD_NODEBUG ESIMD_INLINE simd vAddr(reinterpret_cast(p)); simd offset_i1 = convert(offset); vAddr += offset_i1; - return __esimd_flat_atomic1(vAddr.data(), src0.data(), - pred.data()); + return __esimd_svm_atomic1(vAddr.data(), src0.data(), + pred.data()); } /// Flat-address atomic, two source operands. 
@@ -592,8 +608,8 @@ ESIMD_NODEBUG ESIMD_INLINE simd vAddr(reinterpret_cast(p)); simd offset_i1 = convert(offset); vAddr += offset_i1; - return __esimd_flat_atomic2(vAddr.data(), src0.data(), - src1.data(), pred.data()); + return __esimd_svm_atomic2(vAddr.data(), src0.data(), + src1.data(), pred.data()); } /// Bits used to form the bitmask that controls the behavior of esimd_fence @@ -623,7 +639,7 @@ enum EsimdFenceMask { /// \tparam cntl is the bitmask composed from enum EsimdFenceMask /// \ingroup sycl_esimd ESIMD_INLINE ESIMD_NODEBUG void esimd_fence(uint8_t cntl) { - __esimd_slm_fence(cntl); + __esimd_fence(cntl); } /// Generic work-group barrier. @@ -635,7 +651,7 @@ ESIMD_INLINE ESIMD_NODEBUG void esimd_fence(uint8_t cntl) { /// control flow. /// \ingroup sycl_esimd inline ESIMD_NODEBUG void esimd_barrier() { - __esimd_slm_fence(ESIMD_GLOBAL_COHERENT_FENCE | ESIMD_LOCAL_BARRIER); + __esimd_fence(ESIMD_GLOBAL_COHERENT_FENCE | ESIMD_LOCAL_BARRIER); __esimd_barrier(); } @@ -657,7 +673,11 @@ SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void slm_init(uint32_t size); template ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<(n == 16 || n == 32), simd> slm_gather(simd offsets, simd_mask pred = 1) { - return __esimd_slm_read(offsets.data(), pred.data()); + // TODO reimplement using __esimd_gather_scaled2 + constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding(); + const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); + return __esimd_gather_scaled( + pred.data(), si, 0 /*glob_offset*/, offsets.data()); } /// SLM gather (deprecated version). @@ -670,10 +690,17 @@ ESIMD_INLINE } /// SLM scatter. 
+// TODO support 1-,2-byte elements template -ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<(n == 16 || n == 32)> -slm_scatter(simd vals, simd offsets, simd_mask pred = 1) { - __esimd_slm_write(offsets.data(), vals.data(), pred.data()); +ESIMD_INLINE + ESIMD_NODEBUG std::enable_if_t<(n == 16 || n == 32) && (sizeof(T) == 4)> + slm_scatter(simd vals, simd offsets, + simd_mask pred = 1) { + const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); + constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding(); + constexpr int16_t scale = 0; + __esimd_scatter_scaled( + pred.data(), si, 0 /*glob_offset*/, offsets.data(), vals.data()); } /// SLM scatter (deprecated version). @@ -698,7 +725,10 @@ ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4), simd> slm_gather_rgba(simd offsets, simd pred = 1) { - return __esimd_slm_read4(offsets.data(), pred.data()); + + const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); + return __esimd_gather4_scaled( + pred.data(), si, 0 /*global_offset*/, offsets.data()); } /// SLM gather4. @@ -730,7 +760,11 @@ ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4)> slm_scatter_rgba(simd vals, simd offsets, simd_mask pred = 1) { - __esimd_slm_write4(offsets.data(), vals.data(), pred.data()); + const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); + constexpr int16_t Scale = 0; + constexpr int global_offset = 0; + __esimd_scatter4_scaled( + pred.data(), si, global_offset, offsets.data(), vals.data()); } /// SLM scatter4. @@ -756,7 +790,8 @@ ESIMD_INLINE ESIMD_NODEBUG simd slm_block_load(uint32_t offset) { static_assert(Sz <= 16 * detail::OperandSize::OWORD, "block size must be at most 16 owords"); - return __esimd_slm_block_read(offset >> 4); + const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); + return __esimd_oword_ld(si, offset >> 4); } /// SLM block-store. 
@@ -773,8 +808,9 @@ ESIMD_INLINE ESIMD_NODEBUG void slm_block_store(uint32_t offset, static_assert(Sz <= 8 * detail::OperandSize::OWORD, "block size must be at most 8 owords"); + const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); // offset in genx.oword.st is in owords - __esimd_slm_block_write(offset >> 4, vals.data()); + __esimd_oword_st(si, offset >> 4, vals.data()); } /// SLM atomic, zero source operand: inc and dec. @@ -783,7 +819,8 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t(), simd> slm_atomic(simd offsets, simd_mask pred) { - return __esimd_slm_atomic0(offsets.data(), pred.data()); + const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); + return __esimd_dword_atomic0(pred.data(), si, offsets.data()); } /// SLM atomic, one source operand, add/sub/min/max etc. @@ -792,8 +829,9 @@ ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t(), simd> slm_atomic(simd offsets, simd src0, simd_mask pred) { - return __esimd_slm_atomic1(offsets.data(), src0.data(), - pred.data()); + const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); + return __esimd_dword_atomic1(pred.data(), si, offsets.data(), + src0.data()); } /// SLM atomic, two source operands. 
@@ -803,8 +841,9 @@ ESIMD_NODEBUG ESIMD_INLINE simd> slm_atomic(simd offsets, simd src0, simd src1, simd_mask pred) { - return __esimd_slm_atomic2(offsets.data(), src0.data(), src1.data(), - pred.data()); + const auto si = __ESIMD_GET_SURF_HANDLE(detail::LocalAccessorMarker()); + return __esimd_dword_atomic2(pred.data(), si, offsets.data(), + src0.data(), src1.data()); } /// @} @@ -830,24 +869,24 @@ media_block_load(AccessorTy acc, unsigned x, unsigned y) { static_assert(Width <= 64u, "valid block width is in range [1, 64]"); static_assert(m <= 64u, "valid block height is in range [1, 64]"); static_assert(plane <= 3u, "valid plane index is in range [0, 3]"); -#if defined(__SYCL_DEVICE_ONLY__) + + const auto si = __ESIMD_GET_SURF_HANDLE(acc); + using SurfIndTy = decltype(si); constexpr unsigned int RoundedWidth = Width < 4 ? 4 : detail::getNextPowerOf2(); + constexpr int BlockWidth = sizeof(T) * n; + constexpr int Mod = 0; if constexpr (Width < RoundedWidth) { constexpr unsigned int n1 = RoundedWidth / sizeof(T); - simd temp = __esimd_media_block_load( - 0, detail::AccessorPrivateProxy::getNativeImageObj(acc), plane, - sizeof(T) * n, x, y); + simd temp = + __esimd_media_ld( + si, x, y); return temp.template select(0, 0); } else { - return __esimd_media_block_load( - 0, detail::AccessorPrivateProxy::getNativeImageObj(acc), plane, - sizeof(T) * n, x, y); + return __esimd_media_ld( + si, x, y); } -#else - return __esimd_media_block_load(0, acc, plane, sizeof(T) * n, x, y); -#endif // __SYCL_DEVICE_ONLY__ } /// Media block store. 
@@ -872,28 +911,25 @@ media_block_store(AccessorTy acc, unsigned x, unsigned y, simd vals) { static_assert(Width <= 64u, "valid block width is in range [1, 64]"); static_assert(m <= 64u, "valid block height is in range [1, 64]"); static_assert(plane <= 3u, "valid plane index is in range [0, 3]"); -#if defined(__SYCL_DEVICE_ONLY__) + const auto si = __ESIMD_GET_SURF_HANDLE(acc); + using SurfIndTy = decltype(si); constexpr unsigned int RoundedWidth = Width < 4 ? 4 : detail::getNextPowerOf2(); constexpr unsigned int n1 = RoundedWidth / sizeof(T); + constexpr int BlockWidth = sizeof(T) * n; + constexpr int Mod = 0; if constexpr (Width < RoundedWidth) { simd temp; auto temp_ref = temp.template bit_cast_view(); auto vals_ref = vals.template bit_cast_view(); temp_ref.template select() = vals_ref; - __esimd_media_block_store( - 0, detail::AccessorPrivateProxy::getNativeImageObj(acc), plane, - sizeof(T) * n, x, y, temp.data()); + __esimd_media_st(si, x, y, + temp.data()); } else { - __esimd_media_block_store( - 0, detail::AccessorPrivateProxy::getNativeImageObj(acc), plane, - sizeof(T) * n, x, y, vals.data()); + __esimd_media_st(si, x, y, + vals.data()); } -#else - __esimd_media_block_store(0, acc, plane, sizeof(T) * n, x, y, - vals.data()); -#endif // __SYCL_DEVICE_ONLY__ } #ifndef __SYCL_DEVICE_ONLY__ @@ -909,13 +945,9 @@ inline void slm_init(uint32_t size) {} /// /// \ingroup sycl_esimd template +__SYCL_DEPRECATED("use get_surface_index") ESIMD_INLINE ESIMD_NODEBUG uint32_t esimd_get_value(AccessorTy acc) { -#if defined(__SYCL_DEVICE_ONLY__) - return __esimd_get_value( - detail::AccessorPrivateProxy::getNativeImageObj(acc)); -#else - return __esimd_get_value(acc); -#endif // __SYCL_DEVICE_ONLY__ + return static_cast(get_surface_index(acc)); } /// \defgroup sycl_esimd_raw_send_api Raw send APIs @@ -966,7 +998,7 @@ esimd_raw_sends_load(simd msgDst, simd msgSrc0, static_assert(_Width3 % 32 == 0, "Invalid size for raw send msgSrc1"); uint8_t modifier = ((isEOT & 0x1) << 1) | 
(isSendc & 0x1); - return __esimd_raw_sends_load( + return __esimd_raw_sends2( modifier, execSize, mask.data(), numSrc0, numSrc1, numDst, sfid, exDesc, msgDesc, msgSrc0.data(), msgSrc1.data(), msgDst.data()); } @@ -1004,7 +1036,7 @@ esimd_raw_send_load(simd msgDst, simd msgSrc0, uint32_t exDesc, static_assert(_Width2 % 32 == 0, "Invalid size for raw send msgSrc0"); uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1); - return __esimd_raw_send_load( + return __esimd_raw_send2( modifier, execSize, mask.data(), numSrc0, numDst, sfid, exDesc, msgDesc, msgSrc0.data(), msgDst.data()); } @@ -1042,7 +1074,7 @@ esimd_raw_sends_store(simd msgSrc0, simd msgSrc1, static_assert(_Width2 % 32 == 0, "Invalid size for raw send msgSrc1"); uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1); - __esimd_raw_sends_store( + __esimd_raw_sends2_noresult( modifier, execSize, mask.data(), numSrc0, numSrc1, sfid, exDesc, msgDesc, msgSrc0.data(), msgSrc1.data()); } @@ -1074,11 +1106,14 @@ esimd_raw_send_store(simd msgSrc0, uint32_t exDesc, uint32_t msgDesc, static_assert(_Width1 % 32 == 0, "Invalid size for raw send msgSrc0"); uint8_t modifier = ((isEOT & 0x1) << 1) | (isSendc & 0x1); - __esimd_raw_send_store(modifier, execSize, mask.data(), numSrc0, - sfid, exDesc, msgDesc, msgSrc0.data()); + __esimd_raw_send2_noresult(modifier, execSize, mask.data(), + numSrc0, sfid, exDesc, msgDesc, + msgSrc0.data()); } /// @} +#undef __ESIMD_GET_SURF_HANDLE + } // namespace esimd } // namespace experimental } // namespace intel diff --git a/sycl/test/esimd/intrins_trans.cpp b/sycl/test/esimd/intrins_trans.cpp index d848a9d7bd15e..bfe8c386ff01c 100644 --- a/sycl/test/esimd/intrins_trans.cpp +++ b/sycl/test/esimd/intrins_trans.cpp @@ -45,29 +45,27 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd foo() { simd_mask pred; v_addr += offsets; - __esimd_flat_atomic0( - v_addr.data(), pred.data()); + __esimd_svm_atomic0(v_addr.data(), pred.data()); // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> 
@llvm.genx.svm.atomic.inc.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef) - __esimd_flat_atomic1( - v_addr.data(), v1.data(), pred.data()); + __esimd_svm_atomic1(v_addr.data(), v1.data(), + pred.data()); // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.add.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef) - __esimd_flat_atomic2( + __esimd_svm_atomic2( v_addr.data(), v1.data(), v1.data(), pred.data()); // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.atomic.cmpxchg.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef) uintptr_t addr = reinterpret_cast(ptr); - simd v00 = - __esimd_flat_block_read_unaligned(addr); + simd v00 = __esimd_svm_block_ld_unaligned(addr); // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.block.ld.unaligned.v32i32.i64(i64 %{{[0-9a-zA-Z_.]+}}) - __esimd_flat_block_write(addr, v00.data()); + __esimd_svm_block_st(addr, v00.data()); // CHECK: call void @llvm.genx.svm.block.st.i64.v32i32(i64 %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}) simd v01 = - __esimd_flat_read(v_addr.data(), 0, pred.data()); + __esimd_svm_gather(v_addr.data(), 0, pred.data()); // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.gather.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef) - __esimd_flat_write(v_addr.data(), v01.data(), 0, pred.data()); + __esimd_svm_scatter(v_addr.data(), v01.data(), 0, pred.data()); // CHECK: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32i32(<32 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}) simd mina(0, 1); @@ -77,7 +75,7 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd foo() { simd diva(2.f); simd divb(1.f); - diva = 
__esimd_div_ieee<1>(diva.data(), divb.data()); + diva = __esimd_ieee_div<1>(diva.data(), divb.data()); // CHECK: %{{[0-9a-zA-Z_.]+}} = call <1 x float> @llvm.genx.ieee.div.v1f32(<1 x float> %{{[0-9a-zA-Z_.]+}}, <1 x float> %{{[0-9a-zA-Z_.]+}}) simd a(0.1f); @@ -102,12 +100,16 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd foo() { simd va; va = media_block_load(pA, x, y); - // CHECK: %[[SI0:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image2d_ro_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32 + // CHECK: %[[SI0_VAL:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image2d_ro_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32 + // CHECK: store i32 %[[SI0_VAL]], i32 addrspace(4)* %[[SI0_ADDR:[0-9a-zA-Z_.]+]] + // CHECK: %[[SI0:[0-9a-zA-Z_.]+]] = load i32, i32 addrspace(4)* %[[SI0_ADDR]] // CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.media.ld.v32i32(i32 0, i32 %[[SI0]], i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}) simd vb = va + 1; media_block_store(pB, x, y, vb); - // CHECK: %[[SI2:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image2d_wo_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32 + // CHECK: %[[SI2_VAL:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image2d_wo_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32 + // CHECK: store i32 %[[SI2_VAL]], i32 addrspace(4)* %[[SI2_ADDR:[0-9a-zA-Z_.]+]] + // CHECK: %[[SI2:[0-9a-zA-Z_.]+]] = load i32, i32 addrspace(4)* %[[SI2_ADDR]] // CHECK: call void @llvm.genx.media.st.v32i32(i32 0, i32 %[[SI2]], i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}}) auto ee = __esimd_vload((detail::vector_type_t *)(&vg)); @@ -120,27 +122,149 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd foo() { sycl::access::target::device> acc; simd offsets = 1; - simd pred{1, 0, 1, 0, 1, 0, 1, 0}; + simd_mask<8> pred{1, 0, 1, 0, 1, 0, 1, 0}; // 4-byte element gather simd v = gather(acc, offsets, 100); - // CHECK: %[[SI3:[0-9a-zA-Z_.]+]] = ptrtoint i32 addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32 + // CHECK: %[[SI3_VAL:[0-9a-zA-Z_.]+]] = ptrtoint i32 addrspace(1)* 
%{{[0-9a-zA-Z_.]+}} to i32 + // CHECK: store i32 %[[SI3_VAL]], i32 addrspace(4)* %[[SI3_ADDR:[0-9a-zA-Z_.]+]] + // CHECK: %[[SI3:[0-9a-zA-Z_.]+]] = load i32, i32 addrspace(4)* %[[SI3_ADDR]] // CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.gather.scaled2.v8i32.v8i32(i32 2, i16 0, i32 %[[SI3]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}) // 4-byte element scatter scatter(acc, v, offsets, 100, pred); - // CHECK: %[[SI4:[0-9a-zA-Z_.]+]] = ptrtoint i32 addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32 + // CHECK: %[[SI4_VAL:[0-9a-zA-Z_.]+]] = ptrtoint i32 addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32 + // CHECK: store i32 %[[SI4_VAL]], i32 addrspace(4)* %[[SI4_ADDR:[0-9a-zA-Z_.]+]] + // CHECK: %[[SI4:[0-9a-zA-Z_.]+]] = load i32, i32 addrspace(4)* %[[SI4_ADDR]] // CHECK: call void @llvm.genx.scatter.scaled.v8i1.v8i32.v8i32(<8 x i1> %{{[0-9a-zA-Z_.]+}}, i32 2, i16 0, i32 %[[SI4]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}) // 1-byte element gather simd v1 = gather(acc, offsets, 100); - // CHECK: %[[SI5:[0-9a-zA-Z_.]+]] = ptrtoint i32 addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32 + // CHECK: %[[SI5_VAL:[0-9a-zA-Z_.]+]] = ptrtoint i32 addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32 + // CHECK: store i32 %[[SI5_VAL]], i32 addrspace(4)* %[[SI5_ADDR:[0-9a-zA-Z_.]+]] + // CHECK: %[[SI5:[0-9a-zA-Z_.]+]] = load i32, i32 addrspace(4)* %[[SI5_ADDR]] // CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.gather.scaled2.v8i32.v8i32(i32 0, i16 0, i32 %[[SI5]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}) // 1-byte element scatter scatter(acc, v1, offsets, 100, pred); - // CHECK: %[[SI6:[0-9a-zA-Z_.]+]] = ptrtoint i32 addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32 + // CHECK: %[[SI6_VAL:[0-9a-zA-Z_.]+]] = ptrtoint i32 addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32 + // CHECK: store i32 %[[SI6_VAL]], i32 addrspace(4)* %[[SI6_ADDR:[0-9a-zA-Z_.]+]] + // CHECK: %[[SI6:[0-9a-zA-Z_.]+]] = load i32, i32 addrspace(4)* %[[SI6_ADDR]] // CHECK: call void 
@llvm.genx.scatter.scaled.v8i1.v8i32.v8i32(<8 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, i16 0, i32 %[[SI6]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}) } return d; } + +// TODO +// 1. __esimd* intrinsic translation tests from +// llvm\test\SYCLLowerIR\esimd_lower_intrins.ll should be refactored and +// moved here, as the form below is much easier to maintain with the same +// level of testing strength +// 2. Test cases above should be refactored not to use user-level APIs like +// gather and use __esimd* calls instead. +template using vec = typename simd::vector_type; + +template using mask = typename simd_mask::vector_type; + +SYCL_EXTERNAL void use(const vec &x) SYCL_ESIMD_FUNCTION; +SYCL_EXTERNAL void use(const vec &x) SYCL_ESIMD_FUNCTION; +SYCL_EXTERNAL void use(const vec &x) SYCL_ESIMD_FUNCTION; + +SYCL_EXTERNAL vec get8f() SYCL_ESIMD_FUNCTION; +SYCL_EXTERNAL vec get8i() SYCL_ESIMD_FUNCTION; +SYCL_EXTERNAL vec get8ui64() SYCL_ESIMD_FUNCTION; +SYCL_EXTERNAL vec get8ui16() SYCL_ESIMD_FUNCTION; +SYCL_EXTERNAL vec get8ui8() SYCL_ESIMD_FUNCTION; + +SYCL_EXTERNAL void +test_mem_intrins(uint64_t addr, const vec &xf, + const vec &xi) SYCL_ESIMD_FUNCTION { + { + constexpr SurfaceIndex si = 0; + vec x = __esimd_oword_ld_unaligned(si, 0); + // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x float> @llvm.genx.oword.ld.unaligned.v8f32(i32 0, i32 0, i32 0) + use(x); + } + { + constexpr SurfaceIndex si = 0; + vec x = __esimd_oword_ld(si, 0); + // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x float> @llvm.genx.oword.ld.v8f32(i32 0, i32 0, i32 0) + use(x); + } + { + constexpr SurfaceIndex si = 0; + __esimd_oword_st(si, 0, get8f()); + // CHECK-LABEL: call void @llvm.genx.oword.st.v8f32(i32 0, i32 0, <8 x float> %{{[a-zA-Z0-9.]+}}) + } + { + // TODO + // vec x = __esimd_svm_block_ld(addr); + } { + vec x = __esimd_svm_block_ld_unaligned(addr); + // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x i32> @llvm.genx.svm.block.ld.unaligned.v8i32.i64(i64 
%{{[a-zA-Z0-9.]+}}) + use(x); + } + { + __esimd_svm_block_st(addr, get8i()); + // CHECK-LABEL: call void @llvm.genx.svm.block.st.i64.v8i32(i64 %{{[a-zA-Z0-9.]+}}, <8 x i32> %{{[a-zA-Z0-9.]+}}) + } + { + auto x = __esimd_svm_gather(get8ui64(), 0, get8ui16()); + // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x i8> @llvm.genx.svm.gather.v8i8.v8i1.v8i64(<8 x i1> %{{[a-zA-Z0-9.]+}}, i32 0, <8 x i64> %{{[a-zA-Z0-9.]+}}, <8 x i8> undef) + use(x); + } + { + __esimd_svm_scatter(get8ui64(), get8ui8(), 0, get8ui16()); + // CHECK-LABEL: call void @llvm.genx.svm.scatter.v8i1.v8i64.v8i8(<8 x i1> %{{[a-zA-Z0-9.]+}}, i32 0, <8 x i64> %{{[a-zA-Z0-9.]+}}, <8 x i8> %{{[a-zA-Z0-9.]+}}) + } + { + auto x = + __esimd_svm_atomic0(get8ui64(), get8ui16()); + // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x i32> @llvm.genx.svm.atomic.inc.v8i32.v8i1.v8i64(<8 x i1> %{{[a-zA-Z0-9.]+}}, <8 x i64> %{{[a-zA-Z0-9.]+}}, <8 x i32> undef) + use(x); + } + { + vec src0 = get8f(); + auto x = __esimd_svm_atomic1(get8ui64(), src0, + get8ui16()); + // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x float> @llvm.genx.svm.atomic.fmin.v8f32.v8i1.v8i64(<8 x i1> %{{[a-zA-Z0-9.]+}}, <8 x i64> %{{[a-zA-Z0-9.]+}}, <8 x float> %{{[a-zA-Z0-9.]+}}, <8 x float> undef) + use(x); + } + { + vec src0 = get8f(); + vec src1 = get8f(); + auto x = __esimd_svm_atomic2(get8ui64(), src0, + src1, get8ui16()); + // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x float> @llvm.genx.svm.atomic.fcmpwr.v8f32.v8i1.v8i64(<8 x i1> %{{[a-zA-Z0-9.]+}}, <8 x i64> %{{[a-zA-Z0-9.]+}}, <8 x float> %{{[a-zA-Z0-9.]+}}, <8 x float> %{{[a-zA-Z0-9.]+}}, <8 x float> undef) + use(x); + } + { + constexpr SurfaceIndex si = 0; + vec x = + __esimd_media_ld(si, 0, 0); + // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x float> @llvm.genx.media.ld.v8f32(i32 0, i32 0, i32 0, i32 4, i32 0, i32 0) + use(x); + } + { + constexpr SurfaceIndex si = 0; + vec x = get8f(); + __esimd_media_st(si, 0, 0, x); + // CHECK-LABEL: call void @llvm.genx.media.st.v8f32(i32 0, i32 0, i32 0, i32 
4, i32 0, i32 0, <8 x float> %{{[a-zA-Z0-9.]+}}) + } +} + +SYCL_EXTERNAL void test_math_intrins() SYCL_ESIMD_FUNCTION { + { + vec x0 = get8f(); + vec x1 = get8f(); + auto y = __esimd_ieee_div<8>(x0, x1); + // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x float> @llvm.genx.ieee.div.v8f32(<8 x float> %{{[a-zA-Z0-9.]+}}, <8 x float> %{{[a-zA-Z0-9.]+}}) + use(y); + } + { + vec x = get8f(); + auto y = __esimd_ieee_sqrt<8>(x); + // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x float> @llvm.genx.ieee.sqrt.v8f32(<8 x float> %{{[a-zA-Z0-9.]+}}) + use(y); + } +}