Skip to content

Commit 82c7252

Browse files
authored
Merge pull request StanfordLegion#32 from QingleiCao/dtd_trimming
parsec benchmark optimization
2 parents db7a848 + 3844b6d commit 82c7252

20 files changed

+5853
-306
lines changed

.travis.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,11 @@ before_install:
5454
if [[ $USE_OMPSS2 -eq 1 ]]; then
5555
sudo apt-get install -qq libnuma-dev gperf libboost1.65-dev
5656
fi
57+
if [[ $USE_PARSEC -eq 1 ]]; then
58+
sudo snap install cmake --classic
59+
export PATH=/snap/bin:$PATH
60+
cmake --version
61+
fi
5762
fi
5863
install:
5964
- export CCACHE_BASEDIR=$PWD

build_all.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,9 +158,9 @@ if [[ $USE_PARSEC -eq 1 ]]; then
158158
mkdir -p "$PARSEC_DIR"
159159
pushd "$PARSEC_DIR"
160160
if [[ $TASKBENCH_USE_HWLOC -eq 1 ]]; then
161-
../contrib/platforms/config.linux -DPARSEC_GPU_WITH_CUDA=OFF -DCMAKE_INSTALL_PREFIX=$PWD -DHWLOC_DIR=$HWLOC_DIR
161+
../configure --prefix=$PWD --with-mpi --with-hwloc=$HWLOC_DIR --disable-debug
162162
else
163-
../contrib/platforms/config.linux -DPARSEC_GPU_WITH_CUDA=OFF -DCMAKE_INSTALL_PREFIX=$PWD
163+
../configure --prefix=$PWD --with-mpi --disable-debug --with-cuda=no
164164
fi
165165
make -j$THREADS
166166
make install

get_deps.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,11 @@ if [[ $USE_PARSEC -eq 1 ]]; then
169169
export PARSEC_DIR=$PARSEC_DL_DIR/build
170170
EOF
171171
mkdir -p "$PARSEC_DL_DIR"
172-
git clone https://wwu12@bitbucket.org/wwu12/parsec.git "$PARSEC_DL_DIR"
172+
git clone https://bitbucket.org/icldistcomp/parsec.git "$PARSEC_DL_DIR"
173+
pushd "$PARSEC_DL_DIR"
174+
git apply ../../parsec/patch.diff
175+
git checkout -b 242498dd7ce3974c01db888d7e4d759a69e5bcdb
176+
popd
173177
fi
174178

175179
if [[ $USE_CHARM -eq 1 ]]; then

parsec/Makefile

Lines changed: 65 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,32 @@ ifndef PARSEC_DIR
22
$(error PARSEC_DIR variable is not defined, aborting build)
33
endif
44

5+
PARSEC_SRC = $(PARSEC_DIR)/../..
6+
7+
#PARSEC_DIR = /home/qcao3/task-bench/software/parsec/build/install
8+
#PARSEC_SRC = /home/qcao3/task-bench/software/parsec
9+
10+
#PARSEC_DIR = /home/qcao3/task-bench/software/parsec-dtd-interface/build/install
11+
#PARSEC_SRC = /home/qcao3/task-bench/software/parsec-dtd-interface
12+
513
DEBUG ?= 0
614

715
CC = mpic++
16+
cc = mpicc
17+
PP = ${PARSEC_DIR}/bin/parsec-ptgpp
818

919
CFLAGS = -std=c++11 -D_GNU_SOURCE
1020
LDFLAGS = -m64 -std=c++11 -Wall -D_GNU_SOURCE
1121

22+
CFLAGS_JDF = -D_GNU_SOURCE
23+
1224
ifeq ($(strip $(DEBUG)),0)
1325
CFLAGS += -O3
26+
CFLAGS_JDF += -O3
1427
LDFLAGS += -O3
1528
else
1629
CFLAGS += -g -O0
30+
CFLAGS_JDF += -g -O0
1731
LDFLAGS += -g -O0
1832
endif
1933

@@ -22,11 +36,15 @@ endif
2236
#PLASMA_DIR = /sw/plasma/2.8.0g
2337

2438
# Include directories
25-
INC = -I$(PARSEC_DIR)/include -I$(PARSEC_DIR)/include/parsec -I../core
39+
INC = -I$(PARSEC_DIR)/include -I$(PARSEC_DIR)/include/parsec -I$(PARSEC_DIR)/../ -I../core -I$(PARSEC_SRC) -I$(PARSEC_SRC)/parsec
2640
INC_EXT = -I$(HWLOC_DIR)/include
2741

2842
# Location of the libraries.
29-
LIB = -Wl,-rpath,$(PARSEC_DIR)/lib $(PARSEC_DIR)/lib/libparsec.so -L../core -lcore_s
43+
ifneq ("$(wildcard $(PARSEC_DIR)/lib64/libparsec.so)","")
44+
LIB = -Wl,-rpath,$(PARSEC_DIR)/lib64 $(PARSEC_DIR)/lib64/libparsec.so -L../core -lcore_s
45+
else
46+
LIB = -Wl,-rpath,$(PARSEC_DIR)/lib $(PARSEC_DIR)/lib/libparsec.so -L../core -lcore_s
47+
endif
3048
LIB_EXT = -lpthread -lm -latomic
3149

3250
INC := $(INC) $(INC_EXT)
@@ -36,28 +54,68 @@ CFLAGS += $(INC)
3654

3755
include ../core/make_blas.mk
3856

39-
TARGET = main main_buffer_core
57+
TARGET = main_dtd main_shard main_buffer main_ptg
4058
all: $(TARGET)
4159

4260
.PRECIOUS: %.cc %.o
4361

62+
stencil_1d.c stencil_1d.h: stencil_1d.jdf
63+
$(PP) -E -i $< -o $(basename $<)
64+
65+
nearest_radix_5.c nearest_radix_5.h: nearest_radix_5.jdf
66+
$(PP) -E -i $< -o $(basename $<)
67+
68+
stencil_1d.o: stencil_1d.c stencil_1d.h benchmark_internal.h
69+
$(cc) -c $(CFLAGS_JDF) $(INC) $<
70+
71+
nearest_radix_5.o: nearest_radix_5.c nearest_radix_5.h benchmark_internal.h
72+
$(cc) -c $(CFLAGS_JDF) $(INC) $<
73+
74+
benchmark.c benchmark.h: benchmark.jdf
75+
$(PP) -E -i $< -o $(basename $<)
76+
77+
benchmark.o: benchmark.c benchmark.h benchmark_internal.h
78+
$(cc) -c $(CFLAGS_JDF) $(INC) $<
79+
80+
spread_radix5_period3.c spread_radix5_period3.h: spread_radix5_period3.jdf
81+
$(PP) -E -i $< -o $(basename $<)
82+
83+
spread_radix5_period3.o: spread_radix5_period3.c spread_radix5_period3.h benchmark_internal.h
84+
$(cc) -c $(CFLAGS_JDF) $(INC) $<
85+
86+
benchmark_internal.o: benchmark_internal.cc
87+
$(CC) -c $(CFLAGS) $<
88+
4489
common.o: common.cc common.h
4590
$(CC) -c $(CFLAGS) $<
4691

4792
main.o: main.cc ../core/timer.h
4893
$(CC) -c $(CFLAGS) $<
94+
95+
main_shard.o: main_shard.cc ../core/timer.h
96+
$(CC) -c $(CFLAGS) $<
4997

50-
main: main.o common.o
98+
main_jdf.o: main_jdf.cc ../core/timer.h ../core/core_c.h
99+
$(CC) -c $(CFLAGS) $<
100+
101+
main_buffer.o: main_buffer.cc ../core/timer.h
102+
$(CC) -c $(CFLAGS) $<
103+
104+
main_dtd: main.o common.o
105+
$(CC) $^ $(LIB) $(LDFLAGS) -o $@
106+
107+
main_shard: main_shard.o common.o
51108
$(CC) $^ $(LIB) $(LDFLAGS) -o $@
52109

53-
main_buffer_core.o: main_buffer_core.cc ../core/timer.h
54-
$(CC) -c $(CFLAGS) $<
110+
main_ptg: main_jdf.o common.o stencil_1d.o nearest_radix_5.o benchmark_internal.o benchmark.o spread_radix5_period3.o
111+
$(CC) $^ $(LIB) $(LDFLAGS) -o $@
55112

56-
main_buffer_core: main_buffer_core.o common.o
113+
main_buffer: main_buffer.o common.o
57114
$(CC) $^ $(LIB) $(LDFLAGS) -o $@
58115

59116
clean:
60117
rm -f *.o
61118
rm -f $(TARGET)
119+
rm -f benchmark.h benchmark.c stencil_1d.c stencil_1d.h nearest_radix_5.c nearest_radix_5.h spread_radix5_period3.c spread_radix5_period3.h
62120

63121
.PHONY: all clean

parsec/benchmark.jdf

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
extern "C" %{
2+
/*
3+
* Copyright (c) 2017-2019 The University of Tennessee and The University
4+
* of Tennessee Research Foundation. All rights
5+
* reserved.
6+
*/
7+
#include <parsec/data_dist/matrix/matrix.h>
8+
#include "benchmark_internal.h"
9+
#include "core_c.h"
10+
11+
%}
12+
13+
/*
 * Taskpool globals; they are supplied, in this order, to parsec_benchmark_new()
 * in the epilogue of this file.
 *
 *   descA              - tiled matrix descriptor; tasks are placed on, read from
 *                        and written to descA(m, x)
 *   graph              - task graph handed to the task_graph_* / get_* helpers
 *                        and to CORE_kernel
 *   nb_fields          - modulus used to derive the row index m = t % nb_fields
 *   time_steps         - upper bound (exclusive) of the timestep parameter t
 *   graph_idx          - forwarded unchanged to CORE_kernel
 *   extra_local_memory - forwarded unchanged to CORE_kernel
 */
descA [ type = "parsec_tiled_matrix_dc_t*" ]
14+
graph [ type = "task_graph_t" ]
15+
nb_fields [ type = "int" ]
16+
time_steps [ type = "int" ]
17+
graph_idx [ type = "int" ]
18+
extra_local_memory [ type = "char**" ]
19+
20+
/*
 * update(t, x, k): forwarding task that carries one input of benchmark(t, x).
 *
 *   t - timestep, 1 .. time_steps-1 (timestep 0 has no update tasks)
 *   x - point in the row of timestep t, offset .. offset+width-1, where offset
 *       and width come from the task graph for that timestep
 *   k - in_first .. in_last as returned by get_in_first / get_in_last for (t, x);
 *       it is used as the x coordinate of the producing benchmark(t-1, k) task
 *
 * The task is placed on descA(m, x) with m = t % nb_fields.
 *
 * Flow I receives flow A of benchmark(t-1, k) when num_args >= 2 (NULL
 * otherwise) and is sent to exactly one of A1..A5 of benchmark(t, x), chosen by
 * the distance of k from in_first. The BODY does no work; it only holds a
 * commented-out trace statement.
 */
update(t, x, k)
21+
22+
t = 1 .. time_steps-1
23+
24+
offset = %{ return task_graph_offset_at_timestep(graph, t); %}
25+
width = %{ return task_graph_width_at_timestep(graph, t); %}
26+
27+
x = offset .. offset+width-1
28+
m = t % nb_fields
29+
30+
in_first = %{ return get_in_first(graph, t, x); %}
31+
in_last = %{ return get_in_last(graph, t, x); %}
32+
num_args = %{ return get_num_args(graph, t, x, in_first, in_last); %}
33+
34+
k = in_first .. in_last
35+
36+
: descA(m, x)
37+
38+
RW I <- (num_args >= 2)? A benchmark(t-1, k): NULL
39+
-> (num_args >= 2 && k == in_first)? A1 benchmark(t, x)
40+
-> (num_args >= 3 && k == in_first + 1)? A2 benchmark(t, x)
41+
-> (num_args >= 4 && k == in_first + 2)? A3 benchmark(t, x)
42+
-> (num_args >= 5 && k == in_first + 3)? A4 benchmark(t, x)
43+
-> (num_args >= 6 && k == in_first + 4)? A5 benchmark(t, x)
44+
45+
BODY
46+
{
47+
//printf("update (%d, %d, %d): in (%d, %d)\n", t, x, k, in_first, in_last);
48+
}
49+
END
50+
51+
/*
 * benchmark(t, x): task that runs the kernel for point x at timestep t.
 *
 *   t - timestep, 0 .. time_steps-1
 *   x - point in the row of timestep t, offset .. offset+width-1, where offset
 *       and width come from the task graph for that timestep
 *
 * The task is placed on descA(m, x) with m = t % nb_fields.
 *
 * Inputs A1..A5 are read from flow I of update(t, x, in_first + i), i = 0..4.
 * Input Ai is wired only when t > 0 and num_args >= i + 1 (i counted from 1);
 * otherwise it is NULL.
 * NOTE(review): only five input flows are declared, so a point whose num_args
 * exceeds 6 has no flow for the remaining inputs - confirm that callers only
 * use graphs that stay within this limit.
 *
 * Flow A is read from descA(m, x), sent to flow I of
 * update(t+1, out_first .. out_last, x) when this is not the last timestep and
 * num_args_out >= 2, and written back to descA(m, x).
 *
 * The BODY passes the flows, num_args, the coordinates, graph_idx, the rank
 * stored in descA->super.myrank and extra_local_memory to CORE_kernel.
 */
benchmark(t, x)
52+
53+
t = 0 .. time_steps-1
54+
55+
offset = %{ return task_graph_offset_at_timestep(graph, t); %}
56+
width = %{ return task_graph_width_at_timestep(graph, t); %}
57+
58+
x = offset .. offset+width-1
59+
m = t % nb_fields
60+
61+
in_first = %{ return get_in_first(graph, t, x); %}
62+
in_last = %{ return get_in_last(graph, t, x); %}
63+
num_args = %{ return get_num_args(graph, t, x, in_first, in_last); %}
64+
65+
out_first = %{ return get_out_first(graph, t, x); %}
66+
out_last = %{ return get_out_last(graph, t, x); %}
67+
num_args_out = %{ return get_num_args_out(graph, t, x, out_first, out_last); %}
68+
69+
: descA(m, x)
70+
71+
READ A1 <- (t > 0 && num_args >= 2)? I update(t, x, in_first): NULL
72+
READ A2 <- (t > 0 && num_args >= 3)? I update(t, x, in_first+1): NULL
73+
READ A3 <- (t > 0 && num_args >= 4)? I update(t, x, in_first+2): NULL
74+
READ A4 <- (t > 0 && num_args >= 5)? I update(t, x, in_first+3): NULL
75+
READ A5 <- (t > 0 && num_args >= 6)? I update(t, x, in_first+4): NULL
76+
77+
RW A <- descA(m, x)
78+
-> (t < time_steps-1 && num_args_out >= 2)? I update(t+1, out_first .. out_last, x)
79+
-> descA(m, x)
80+
81+
BODY
82+
{
83+
//printf("benchmark (%d, %d): in (%d, %d); out (%d, %d)\n", t, x, in_first, in_last, out_first, out_last);
84+
CORE_kernel(es, graph, A, A1, A2, A3, A4, A5, num_args, x, t, graph_idx, descA->super.myrank, extra_local_memory);
85+
}
86+
END
87+
88+
extern "C" %{
89+
90+
/**
 * Create the benchmark taskpool and register its default arena.
 *
 * Calls the generated constructor parsec_benchmark_new() with the arguments in
 * the order given here, then registers the PARSEC_benchmark_DEFAULT_ARENA entry
 * through parsec_matrix_add2arena() using the float datatype and the mb/nb
 * fields of @p A.
 *
 * @param A                  tiled matrix descriptor; its mb and nb fields size the arena
 * @param graph              task graph forwarded to the generated constructor
 * @param nb_fields          forwarded to the generated constructor
 * @param time_steps         forwarded to the generated constructor
 * @param graph_idx          forwarded to the generated constructor
 * @param extra_local_memory forwarded to the generated constructor
 * @return the new taskpool, cast to the generic parsec_taskpool_t pointer type.
 *         The arena registered here is removed again by parsec_benchmark_Destruct().
 *
 * NOTE(review): taskpool is dereferenced right after construction without a
 * NULL check, although parsec_benchmark() tests this function's result against
 * NULL. If parsec_benchmark_new() can return NULL, the failure happens here
 * first - confirm and guard if needed.
 */
parsec_taskpool_t*
91+
parsec_benchmark_New(parsec_tiled_matrix_dc_t *A, task_graph_t graph, int nb_fields,
92+
int time_steps, int graph_idx, char **extra_local_memory)
93+
{
94+
parsec_taskpool_t* benchmark_taskpool;
95+
parsec_benchmark_taskpool_t* taskpool = NULL;
96+
97+
taskpool = parsec_benchmark_new(A, graph, nb_fields, time_steps, graph_idx, extra_local_memory);
98+
/* Keep a generic-typed alias; this is the value handed back to the caller. */
benchmark_taskpool = (parsec_taskpool_t*)taskpool;
99+
100+
/* Arena element type is float; dimensions come from the descriptor's mb/nb. */
parsec_matrix_add2arena(&(taskpool->arenas_datatypes[PARSEC_benchmark_DEFAULT_ARENA]),
101+
parsec_datatype_float_t, matrix_UpperLower,
102+
1, A->mb, A->nb, A->mb,
103+
PARSEC_ARENA_ALIGNMENT_SSE, -1 );
104+
105+
return benchmark_taskpool;
106+
}
107+
108+
/**
 * Release a taskpool created by parsec_benchmark_New().
 *
 * Removes the PARSEC_benchmark_DEFAULT_ARENA entry with
 * parsec_matrix_del2arena() and then frees the taskpool with
 * parsec_taskpool_free().
 *
 * @param taskpool taskpool to destroy; it is cast to
 *                 parsec_benchmark_taskpool_t and dereferenced, so it must not
 *                 be NULL.
 */
void parsec_benchmark_Destruct(parsec_taskpool_t *taskpool)
109+
{
110+
parsec_benchmark_taskpool_t *benchmark_taskpool = (parsec_benchmark_taskpool_t *)taskpool;
111+
/* Drop the arena first: it is reached through the taskpool freed on the next line. */
parsec_matrix_del2arena(&(benchmark_taskpool->arenas_datatypes[PARSEC_benchmark_DEFAULT_ARENA]));
112+
parsec_taskpool_free(taskpool);
113+
}
114+
115+
/**
 * Build the benchmark taskpool, run it on @p parsec and destroy it.
 *
 * Creates the taskpool with parsec_benchmark_New(); when that result is not
 * NULL it is enqueued on the context, the context is started and waited on,
 * and the taskpool is destructed. A NULL result skips all of these steps.
 *
 * @param parsec             context the taskpool is enqueued on
 * @param A                  forwarded to parsec_benchmark_New()
 * @param graph              forwarded to parsec_benchmark_New()
 * @param nb_fields          forwarded to parsec_benchmark_New()
 * @param time_steps         forwarded to parsec_benchmark_New()
 * @param graph_idx          forwarded to parsec_benchmark_New()
 * @param extra_local_memory forwarded to parsec_benchmark_New()
 * @return always 0.
 *
 * NOTE(review): the results of parsec_enqueue(), parsec_context_start() and
 * parsec_context_wait() are discarded, and a NULL taskpool also yields 0, so
 * callers cannot detect a failed run - confirm whether that is intended.
 */
int parsec_benchmark(parsec_context_t *parsec,
116+
parsec_tiled_matrix_dc_t *A, task_graph_t graph, int nb_fields,
117+
int time_steps, int graph_idx, char **extra_local_memory)
118+
{
119+
parsec_taskpool_t *parsec_benchmark = NULL;
120+
121+
parsec_benchmark = parsec_benchmark_New(A, graph, nb_fields, time_steps, graph_idx, extra_local_memory);
122+
123+
if( parsec_benchmark != NULL ){
124+
parsec_enqueue(parsec, parsec_benchmark);
125+
parsec_context_start(parsec);
126+
/* Wait before destructing: the taskpool is freed on the following line. */
parsec_context_wait(parsec);
127+
parsec_benchmark_Destruct(parsec_benchmark);
128+
}
129+
130+
return 0;
131+
}
132+
133+
%}

0 commit comments

Comments
 (0)