Skip to content

Commit e143dcc

Browse files
v-klochkovvmustya
andauthored
[ESIMD] Add non-experimental version of fence() for DG2 and PVC (intel#11923)
The new esimd::fence() accepts 3 template enum parameters and is usable only on DG2 and PVC. It implements the subset of functionality supported by experimental::esimd::lsc_fence(). The functionality that was not included into the public version is a) not supported by hardware and/or GPU drivers b) too specific and is too risky to use now. --------- Signed-off-by: Klochkov, Vyacheslav N <[email protected]> Co-authored-by: Victor Mustya <[email protected]>
1 parent 207d5e7 commit e143dcc

File tree

7 files changed

+316
-36
lines changed

7 files changed

+316
-36
lines changed

sycl/include/sycl/ext/intel/esimd/common.hpp

+56
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,62 @@ enum class cache_hint : uint8_t {
384384
const_cached = 7
385385
};
386386

387+
/// The scope that fence() operation should apply to.
388+
/// Supported platforms: DG2, PVC
389+
enum class fence_scope : uint8_t {
390+
/// Wait until all previous memory transactions from this thread are observed
391+
/// within the local thread-group.
392+
group = 0,
393+
394+
/// Wait until all previous memory transactions from this thread are observed
395+
/// within the local sub-slice.
396+
local = 1,
397+
398+
/// Wait until all previous memory transactions from this thread are observed
399+
/// in the local tile.
400+
tile = 2,
401+
402+
/// Wait until all previous memory transactions from this thread are observed
403+
/// in the local GPU.
404+
gpu = 3,
405+
406+
/// Wait until all previous memory transactions from this thread are observed
407+
/// across all GPUs in the system.
408+
gpus = 4,
409+
410+
/// Global memory data-port only: wait until all previous memory transactions
411+
/// from this thread are observed at the "system" level.
412+
system = 5,
413+
414+
/// Global memory data-port only: for GPUs that do not follow
415+
/// PCIe Write ordering for downstream writes targeting device memory,
416+
/// this op will commit to device memory all downstream and peer writes that
417+
/// have reached the device.
418+
system_acquire = 6
419+
};
420+
421+
/// The cache flush operation to apply to caches after fence() is complete.
422+
/// Supported platforms: DG2, PVC
423+
enum class fence_flush_op : uint8_t {
424+
none = 0, /// no operation;
425+
evict = 1, /// R/W: evict dirty lines; R/W and RO: invalidate clean lines
426+
invalidate = 2, /// R/W and RO: invalidate all clean lines;
427+
428+
// enum with the value 3 is reserved;
429+
430+
clean = 4 /// R/W: dirty lines are written to memory, but retained in
431+
/// cache in clean state; RO: no effect.
432+
};
433+
434+
/// The target memory kind for fence() operation.
435+
/// Supported platforms: DG2, PVC
436+
enum class memory_kind : uint8_t {
437+
global = 0, /// untyped global memory
438+
// enum with the value 1 is reserved;
439+
image = 2, /// image (also known as typed global memory)
440+
local = 3, /// shared local memory
441+
};
442+
387443
/// L1, L2 or L3 cache hint levels. L3 is reserved for future use.
388444
enum class cache_level : uint8_t { L1 = 1, L2 = 2, L3 = 3 };
389445

sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp

+18
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,24 @@ __ESIMD_INTRIN void __esimd_fence(uint8_t cntl)
641641
}
642642
#endif // __SYCL_DEVICE_ONLY__
643643

644+
/// Memory fence.
645+
/// Supported platforms: DG2, PVC
646+
///
647+
/// @tparam Kind is the Sfid shaded function.
648+
/// @tparam FenceOp is the fence operation.
649+
/// @tparam Scope is the operation scope.
650+
/// @tparam N is the SIMD size of operation (the number of addresses to access)
651+
/// @param pred is predicates.
652+
template <uint8_t Kind, uint8_t FenceOp, uint8_t Scope, int N>
653+
__ESIMD_INTRIN void __esimd_lsc_fence(__ESIMD_DNS::simd_mask_storage_t<N> pred)
654+
#ifdef __SYCL_DEVICE_ONLY__
655+
;
656+
#else // __SYCL_DEVICE_ONLY__
657+
{
658+
__ESIMD_UNSUPPORTED_ON_HOST;
659+
}
660+
#endif // __SYCL_DEVICE_ONLY__
661+
644662
// Predicated (masked) scaled gather from a surface.
645663
//
646664
// Template (compile-time constant) parameters:

sycl/include/sycl/ext/intel/esimd/memory.hpp

+20
Original file line numberDiff line numberDiff line change
@@ -5627,6 +5627,26 @@ template <uint8_t cntl> __ESIMD_API void fence() { __esimd_fence(cntl); }
56275627
__SYCL_DEPRECATED("use fence<fence_mask>()")
56285628
__ESIMD_API void fence(fence_mask cntl) { __esimd_fence(cntl); }
56295629

5630+
/// Memory fence.
5631+
/// Supported platforms: DG2, PVC
5632+
///
5633+
/// @tparam Kind is the memory kind.
5634+
/// @tparam FenceOp is the fence cache flush operation to apply after fence.
5635+
/// @tparam Scope is the fence operation scope.
5636+
template <memory_kind Kind = memory_kind::global,
5637+
fence_flush_op FenceOp = fence_flush_op::none,
5638+
fence_scope Scope = fence_scope::group>
5639+
__ESIMD_API void fence() {
5640+
static_assert(
5641+
Kind != memory_kind::local ||
5642+
(FenceOp == fence_flush_op::none && Scope == fence_scope::group),
5643+
"SLM fence must have 'none' lsc_fence_op and 'group' scope");
5644+
constexpr int N = 16;
5645+
simd_mask<N> Mask = 1;
5646+
__esimd_lsc_fence<static_cast<uint8_t>(Kind), static_cast<uint8_t>(FenceOp),
5647+
static_cast<uint8_t>(Scope), N>(Mask.data());
5648+
}
5649+
56305650
/// Generic work-group barrier.
56315651
/// Performs barrier synchronization for all threads within the same thread
56325652
/// group. The barrier instruction causes the executing thread to wait until

sycl/include/sycl/ext/intel/experimental/esimd/common.hpp

+19-16
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ namespace ext::intel::experimental::esimd {
2727

2828
/// The scope that lsc_fence operation should apply to
2929
/// Supported platforms: DG2, PVC
30-
enum class lsc_scope : uint8_t {
30+
enum class __SYCL_DEPRECATED(
31+
"use sycl::ext::intel::esimd::fence_scope") lsc_scope : uint8_t {
3132
group = 0, /// flush out to the threadgroup's scope
3233
local = 1, /// flush out to the local scope
3334
tile = 2, /// tile, flush out to several DSSs
@@ -39,24 +40,26 @@ enum class lsc_scope : uint8_t {
3940

4041
/// The lsc_fence operation to apply to caches
4142
/// Supported platforms: DG2, PVC
42-
enum class lsc_fence_op : uint8_t {
43-
none = 0, /// no operation
44-
evict = 1, /// dirty lines evicted and invalidated from L1
45-
invalidate = 2, /// invalidate all clean lines
46-
discard = 3, /// direct and clean lines are discarded w/o eviction
47-
clean = 4, /// dirty lines are written to memory, but retained in cache
48-
/// in clean state
49-
flushl3 = 5, /// flush only L3
50-
};
43+
enum class __SYCL_DEPRECATED("use sycl::ext::intel::esimd::fence_flush_op")
44+
lsc_fence_op : uint8_t {
45+
none = 0, /// no operation
46+
evict = 1, /// dirty lines evicted and invalidated from L1
47+
invalidate = 2, /// invalidate all clean lines
48+
discard = 3, /// direct and clean lines are discarded w/o eviction
49+
clean = 4, /// dirty lines are written to memory, but retained in cache
50+
/// in clean state
51+
flushl3 = 5, /// flush only L3
52+
};
5153

5254
/// The specific LSC shared function to fence with lsc_fence
5355
/// Supported platforms: DG2, PVC
54-
enum class lsc_memory_kind : uint8_t {
55-
untyped_global = 0, /// untyped global memory
56-
untyped_global_low_pri = 1, /// low-priority untyped global memory
57-
typed_global = 2, /// typed global memory
58-
shared_local = 3, /// shared local memory
59-
};
56+
enum class __SYCL_DEPRECATED("use sycl::ext::intel::esimd::memory_kind")
57+
lsc_memory_kind : uint8_t {
58+
untyped_global = 0, /// untyped global memory
59+
untyped_global_low_pri = 1, /// low-priority untyped global memory
60+
typed_global = 2, /// typed global memory
61+
shared_local = 3, /// shared local memory
62+
};
6063

6164
using lsc_data_size = __ESIMD_DNS::lsc_data_size;
6265

sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp

-19
Original file line numberDiff line numberDiff line change
@@ -416,25 +416,6 @@ __esimd_lsc_xatomic_slm_2(
416416
}
417417
#endif // __SYCL_DEVICE_ONLY__
418418

419-
/// Memory fence.
420-
/// Supported platforms: DG2, PVC
421-
///
422-
/// @tparam Kind is the Sfid shaded function.
423-
/// @tparam FenceOp is the fence operation.
424-
/// @tparam Scope is the operation scope.
425-
/// @tparam N is the SIMD size of operation (the number of addresses to access)
426-
/// @param pred is predicates.
427-
template <__ESIMD_ENS::lsc_memory_kind Kind, __ESIMD_ENS::lsc_fence_op FenceOp,
428-
__ESIMD_ENS::lsc_scope Scope, int N>
429-
__ESIMD_INTRIN void __esimd_lsc_fence(__ESIMD_DNS::simd_mask_storage_t<N> pred)
430-
#ifdef __SYCL_DEVICE_ONLY__
431-
;
432-
#else // __SYCL_DEVICE_ONLY__
433-
{
434-
__ESIMD_UNSUPPORTED_ON_HOST;
435-
}
436-
#endif // __SYCL_DEVICE_ONLY__
437-
438419
__ESIMD_INTRIN uint32_t __esimd_slm_alloc(uint32_t size)
439420
#ifdef __SYCL_DEVICE_ONLY__
440421
;

sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp

+6-1
Original file line numberDiff line numberDiff line change
@@ -3091,12 +3091,17 @@ lsc_atomic_update(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
30913091
template <lsc_memory_kind Kind = lsc_memory_kind::untyped_global,
30923092
lsc_fence_op FenceOp = lsc_fence_op::none,
30933093
lsc_scope Scope = lsc_scope::group, int N = 16>
3094+
__SYCL_DEPRECATED("use sycl::ext::intel::esimd::fence<Kind, FenceOp, Scope>()")
30943095
__ESIMD_API void lsc_fence(__ESIMD_NS::simd_mask<N> pred = 1) {
30953096
static_assert(
30963097
Kind != lsc_memory_kind::shared_local ||
30973098
(FenceOp == lsc_fence_op::none && Scope == lsc_scope::group),
30983099
"SLM fence must have 'none' lsc_fence_op and 'group' scope");
3099-
__esimd_lsc_fence<Kind, FenceOp, Scope, N>(pred.data());
3100+
static_assert(Kind != lsc_memory_kind::untyped_global_low_pri,
3101+
"lsc_memory_kind::untyped_global_low_pri is not supported in HW"
3102+
" and/or GPU drivers");
3103+
__esimd_lsc_fence<static_cast<uint8_t>(Kind), static_cast<uint8_t>(FenceOp),
3104+
static_cast<uint8_t>(Scope), N>(pred.data());
31003105
}
31013106

31023107
/// @} sycl_esimd_memory_lsc

0 commit comments

Comments
 (0)