Skip to content

Commit 86c018d

Browse files
Explicitly initialize contents of scratch buffer to avoid performance anomalies.
1 parent f5713f1 commit 86c018d

File tree

27 files changed

+173
-15
lines changed

27 files changed

+173
-15
lines changed

build_all.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,9 @@ if [[ $USE_LEGION -eq 1 ]]; then
7676
make -C legion clean
7777
fi
7878
if [[ $USE_REGENT -eq 1 ]]; then
79-
make -C regent clean
79+
SHARD_SIZE=30 make -C regent clean
80+
SHARD_SIZE=15 make -C regent clean
81+
SHARD_SIZE=14 make -C regent clean
8082
fi
8183
if [[ $USE_REALM -eq 1 ]]; then
8284
make -C realm clean

chapel/task_benchmark.chpl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ proc execute_task_graph2(graph, task_result, task_ready, task_used) {
131131

132132
var scratch_bytes = graph.scratch_bytes_per_task;
133133
var scratch_ptr = c_malloc(int(8), scratch_bytes);
134+
task_graph_prepare_scratch(scratch_ptr, scratch_bytes);
134135

135136
// Initialize input_ptr and input_bytes... these don't need to
136137
// change because we can just set n_inputs dynamically.

charm++/subchare.C

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ void Subchare::initGraph(MulticastMsg* msg) {
9696
output.resize(graph.output_bytes_per_task);
9797
scratch.resize(graph.scratch_bytes_per_task);
9898

99+
TaskGraph::prepare_scratch(scratch.data(), scratch.size());
100+
99101
CProxySection_Subchare::contribute(sid, CkCallback(CkReductionTarget(Main, workerReady), mainProxy));
100102
}
101103

core/core.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,8 @@ size_t TaskGraph::num_dependencies(long dset, long point) const
543543
return SIZE_MAX;
544544
}
545545

546+
// Sentinel written into every 64-bit word of a scratch buffer by
// TaskGraph::prepare_scratch and checked (first word only) by
// TaskGraph::execute_point before the kernel runs.
#define MAGIC_VALUE UINT64_C(0x5C4A7C8B) // can you read it? it says "SCRATCHB" (kinda)
547+
546548
void TaskGraph::execute_point(long timestep, long point,
547549
char *output_ptr, size_t output_bytes,
548550
const char **input_ptr, const size_t *input_bytes,
@@ -607,12 +609,25 @@ void TaskGraph::execute_point(long timestep, long point,
607609

608610
// Validate scratch
609611
assert(scratch_bytes == scratch_bytes_per_task);
612+
if (scratch_bytes > 0) {
613+
uint64_t *scratch = reinterpret_cast<uint64_t *>(scratch_ptr);
614+
assert(*scratch == MAGIC_VALUE);
615+
}
610616

611617
// Execute kernel
612618
Kernel k(kernel);
613619
k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes);
614620
}
615621

622+
void TaskGraph::prepare_scratch(char *scratch_ptr, size_t scratch_bytes)
623+
{
624+
assert(scratch_bytes % sizeof(uint64_t) == 0);
625+
uint64_t *base_ptr = reinterpret_cast<uint64_t *>(scratch_ptr);
626+
for (long i = 0; i < scratch_bytes/sizeof(uint64_t); ++i) {
627+
base_ptr[i] = MAGIC_VALUE;
628+
}
629+
}
630+
616631
static TaskGraph default_graph(long graph_index)
617632
{
618633
TaskGraph graph;

core/core.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ struct TaskGraph : public task_graph_t {
6969
const char **input_ptr, const size_t *input_bytes,
7070
size_t n_inputs,
7171
char *scratch_ptr, size_t scratch_bytes) const;
72+
static void prepare_scratch(char *scratch_ptr, size_t scratch_bytes);
7273
};
7374

7475
struct App {

core/core_c.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,11 @@ void task_graph_execute_point_scratch_nonconst(task_graph_t graph, long timestep
148148
scratch_ptr, scratch_bytes);
149149
}
150150

151+
// C-callable wrapper (declared in core_c.h) that forwards unchanged to
// TaskGraph::prepare_scratch, which fills the buffer with the magic value
// that execute_point checks. The callee asserts that scratch_bytes is a
// multiple of sizeof(uint64_t).
void task_graph_prepare_scratch(char *scratch_ptr, size_t scratch_bytes)
152+
{
153+
TaskGraph::prepare_scratch(scratch_ptr, scratch_bytes);
154+
}
155+
151156
void interval_list_destroy(interval_list_t intervals)
152157
{
153158
std::vector<std::pair<long, long> > *i = unwrap(intervals);

core/core_c.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ void task_graph_execute_point_scratch_nonconst(task_graph_t graph, long timestep
116116
int64_t **input_ptr, const size_t *input_bytes,
117117
size_t n_inputs,
118118
char *scratch_ptr, size_t scratch_bytes);
119+
/* Fills the scratch buffer [scratch_ptr, scratch_ptr + scratch_bytes) with the
 * marker value that task execution validates. Call it on a freshly allocated
 * scratch buffer before passing that buffer to an execute_point function.
 * scratch_bytes must be a multiple of 8 (asserted by the implementation). */
void task_graph_prepare_scratch(char *scratch_ptr, size_t scratch_bytes);
119120

120121
typedef struct task_graph_list_t {
121122
void *impl;

dask/task_bench_core.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,10 @@ def execute_point_no_scratch(graph_array, timestep, point, *inputs):
151151

152152

153153
def init_scratch_direct(scratch_bytes):
154-
return np.empty(scratch_bytes, dtype=np.ubyte)
154+
scratch = np.empty(scratch_bytes, dtype=np.ubyte)
155+
scratch_ptr = ffi.cast("char *", scratch.ctypes.data)
156+
c.task_graph_prepare_scratch(scratch_ptr, scratch_bytes)
157+
return scratch
155158

156159

157160
@dask.delayed

kernel_bench/main.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ KernelBenchApp::KernelBenchApp(int argc, char **argv)
144144
scratch_buff.reserve(nb_workers);
145145
for (i = 0; i < nb_workers; i++) {
146146
scratch_buff.emplace_back(graph.scratch_bytes_per_task, 0);
147+
TaskGraph::prepare_scratch(scratch_buff[i].data(), graph.scratch_bytes_per_task);
147148
}
148149

149150
// init timer array

legion/main.cc

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ using namespace Legion::Mapping;
2828

2929
enum TaskIDs {
3030
TID_TOP,
31+
TID_INIT,
3132
TID_LEAF,
3233
TID_DUMMY,
3334
};
@@ -97,6 +98,20 @@ void get_base_and_size(Runtime *runtime,
9798
bytes = rect.volume();
9899
}
99100

101+
void init(const Task *task,
102+
const std::vector<PhysicalRegion> &regions,
103+
Context ctx, Runtime *runtime)
104+
{
105+
log_taskbench.info("Init at point %lld", task->index_point[0]);
106+
107+
char *scratch_ptr = NULL;
108+
size_t scratch_bytes = 0;
109+
Rect<1> scratch_rect = runtime->get_index_space_domain(
110+
regions[0].get_logical_region().get_index_space());
111+
get_base_and_size(runtime, regions[0], task->regions[0], scratch_rect, scratch_ptr, scratch_bytes);
112+
TaskGraph::prepare_scratch(scratch_ptr, scratch_bytes);
113+
}
114+
100115
void leaf(const Task *task,
101116
const std::vector<PhysicalRegion> &regions,
102117
Context ctx, Runtime *runtime)
@@ -160,6 +175,8 @@ struct LegionApp : public App {
160175

161176
void execute_timestep(size_t i, long t);
162177

178+
void init(size_t i);
179+
163180
void issue_execution_fence_and_block();
164181

165182
private:
@@ -269,6 +286,10 @@ void LegionApp::run()
269286
display();
270287
}
271288

289+
for (size_t idx = 0; idx < graphs.size(); ++idx) {
290+
init(idx);
291+
}
292+
272293
execute_main_loop(); // warm-up
273294

274295
issue_execution_fence_and_block();
@@ -329,6 +350,29 @@ void LegionApp::execute_main_loop()
329350
}
330351
}
331352

353+
void LegionApp::init(size_t idx)
354+
{
355+
const TaskGraph &g = graphs[idx];
356+
357+
Rect<1> bounds(0, g.max_width-1);
358+
359+
if (g.scratch_bytes_per_task != 0) {
360+
for (long i = 0; i < num_fields; ++i) {
361+
FieldID fout(FID_FIRST + i);
362+
IndexLauncher launcher(TID_INIT, bounds, TaskArgument(), ArgumentMap());
363+
MappingTagID tag = exact_instance ? Legion::Mapping::DefaultMapper::EXACT_REGION : 0;
364+
const LogicalRegionT<1> &sratch_region = scratch_regions[idx];
365+
const LogicalPartitionT<1> &scratch = scratch_partitions[idx];
366+
launcher.add_region_requirement(
367+
RegionRequirement(scratch, 0 /* default projection */,
368+
READ_WRITE, EXCLUSIVE, sratch_region, tag)
369+
.add_field(fout));
370+
371+
runtime->execute_index_space(ctx, launcher);
372+
}
373+
}
374+
}
375+
332376
void LegionApp::execute_timestep(size_t idx, long t)
333377
{
334378
const TaskGraph &g = graphs[idx];
@@ -372,7 +416,7 @@ void LegionApp::execute_timestep(size_t idx, long t)
372416
const LogicalPartitionT<1> &scratch = scratch_partitions[idx];
373417
launcher.add_region_requirement(
374418
RegionRequirement(scratch, 0 /* default projection */,
375-
WRITE_DISCARD, EXCLUSIVE, sratch_region, tag)
419+
READ_WRITE, EXCLUSIVE, sratch_region, tag)
376420
.add_field(fout));
377421
}
378422

@@ -419,6 +463,13 @@ int main(int argc, char **argv)
419463
Runtime::preregister_task_variant<top>(registrar, "top");
420464
}
421465

466+
{
467+
TaskVariantRegistrar registrar(TID_INIT, "init");
468+
registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
469+
registrar.set_leaf();
470+
Runtime::preregister_task_variant<init>(registrar, "init");
471+
}
472+
422473
{
423474
TaskVariantRegistrar registrar(TID_LEAF, "leaf");
424475
registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));

0 commit comments

Comments
 (0)