Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sum of Absolute Differences Benchmark #22

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 161 additions & 0 deletions examples/sum_abs_diff/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# Copyright (c) 2019, University of Washington All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice, this list
# of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice, this
# list of conditions and the following disclaimer in the documentation and/or
# other materials provided with the distribution.
#
# Neither the name of the copyright holder nor the names of its contributors may
# be used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

################################################################################
# Paths / Environment Configuration
################################################################################
# Top-level directory of the git checkout. `?=` keeps any value that was
# already supplied by the caller or the environment.
_REPO_ROOT ?= $(shell git rev-parse --show-toplevel)
# Directory of the last makefile in MAKEFILE_LIST at this point in parsing
# (this Makefile, as long as nothing has been included before this line).
CURRENT_PATH := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

# The leading `-` makes a missing environment.mk a silent no-op.
# NOTE(review): FRAGMENTS_PATH (used by later includes) is presumably defined
# there as well -- confirm. If this include is skipped, such variables expand
# to empty strings.
-include $(_REPO_ROOT)/environment.mk

################################################################################
# Define BSG_MACHINE_PATH, the location of the Makefile.machine.include file
# that defines the machine to compile and simulate on. Using BSG_F1_DIR (which
# is set in environment.mk) uses the same machine as in bsg_replicant.
################################################################################

BSG_MACHINE_PATH=$(BSG_F1_DIR)

################################################################################
# Define the range of versions
################################################################################
# Kernel versions. See kernel/README.md for more information. Version names do
# not need to use v* and can be any string. Each name is used as a
# subdirectory of kernel/ by the analysis/statistics targets below.
VERSIONS = v0 v1 v2 v3 v4 v5 v6 v7 v8 v9 v10

################################################################################
# Define any sources that should be compiled during kernel compilation,
# including the source file with the kernel itself. kernel.riscv will
# be the name of the compiled RISC-V Binary for the Manycore
#
# Use KERNEL_*LIBRARIES to list sources that should be compiled and linked with
# all kernel.cpp versions. However, if you have version-specific sources you
# must come up with your own solution.
#
# Use KERNEL_INCLUDES to specify the path to directories that contain headers.
################################################################################

# C Libraries
KERNEL_CLIBRARIES +=
# C++ Libraries
KERNEL_CXXLIBRARIES +=

KERNEL_INCLUDES += -I$(CURRENT_PATH)/kernel/include

# Define the default kernel.cpp file. If KERNEL_DEFAULT is not defined it will
# be set to kernel.cpp in the same directory as this Makefile.
# Here the default is the kernel.cpp of DEFAULT_VERSION.
DEFAULT_VERSION := v0
KERNEL_DEFAULT := kernel/$(DEFAULT_VERSION)/kernel.cpp

################################################################################
# Include the kernel build rules (This must be included after KERNEL_*LIBRARIES,
# KERNEL_DEFAULT, KERNEL_INCLUDES, etc)
################################################################################

# The leading `-` means a missing fragment is ignored silently; the kernel
# rules would then simply be undefined.
-include $(FRAGMENTS_PATH)/kernel/cudalite.mk

################################################################################
# END OF KERNEL-SPECIFIC RULES / START OF HOST-SPECIFIC RULES
################################################################################


################################################################################
# Define the $(HOST_TARGET), the name of the host executable to generate. The
# cosimulation host executable will be called
# $(HOST_TARGET).cosim. HOST_*SOURCES list the host files that should be
# compiled and linked into the executable.
################################################################################

HOST_TARGET := sum_abs_diff
# No C host sources; the host program is the single C++ file named after the
# target (sum_abs_diff.cpp).
HOST_CSOURCES :=
HOST_CXXSOURCES := $(HOST_TARGET).cpp
# Host headers are looked up in this Makefile's directory.
HOST_INCLUDES := -I$(CURRENT_PATH)

################################################################################
# Include the Cosimulation host build rules (This must be included after
# HOST_*SOURCES, HOST_TARGET, HOST_INCLUDES, etc)
################################################################################

# The leading `-` means a missing fragment is ignored silently.
-include $(FRAGMENTS_PATH)/host/cosim.mk

################################################################################
# Define the clean rules. clean calls the makefile-specific cleans, whereas
# users can add commands and dependencies to custom.clean.
################################################################################
# Remove per-version build and analysis products. The file list is expanded by
# make itself rather than with shell brace expansion ({a,b}), which is a bash
# extension and is left unexpanded by a POSIX /bin/sh (make's default SHELL),
# causing the rm to silently remove nothing.
version.clean:
	rm -rf $(foreach e,csv log rvo riscv vpd key png dis,kernel/*/*.$(e))
	rm -rf kernel/*/stats kernel/*/pc_stats

custom.clean: version.clean

clean: cosim.clean analysis.clean cudalite.clean custom.clean

# These targets never produce a file of the same name; always run them.
.PHONY: version.clean custom.clean clean

################################################################################
# Define overall-goals. The all rule runs all kernel versions, and the default
# kernel.
################################################################################

_HELP_STRING := "Makefile Rules\n"

# Note: make variables must be referenced as $(NAME); a bare $KERNEL_DEFAULT
# would be parsed as $(K) followed by the literal text "ERNEL_DEFAULT".
_HELP_STRING += " default: \n"
_HELP_STRING += " - Run the default kernel ($(KERNEL_DEFAULT)) and generate all of the\n"
_HELP_STRING += " analysis products\n"
default: stats graphs pc_stats

_HELP_STRING += " analysis: \n"
_HELP_STRING += " - Launch independent cosimulation executions of each kernel version.\n"
_HELP_STRING += " When execution finishes, it generates all the analysis products \n"
_HELP_STRING += " for each kernel in each respective kernel/<version_name>/ \n"
_HELP_STRING += " directory\n"
analysis: $(foreach v,$(VERSIONS),kernel/$v/stats kernel/$v/graphs kernel/$v/pc_stats)

_HELP_STRING += " statistics: \n"
_HELP_STRING += " - Launch independent cosimulation executions of each kernel version.\n"
_HELP_STRING += " When execution finishes, it generates ONLY the parsed operation \n"
_HELP_STRING += " stats for each kernel in each respective kernel/<version_name>/ \n"
_HELP_STRING += " directory\n"
statistics: $(foreach v,$(VERSIONS),kernel/$v/stats)

_HELP_STRING += " all: \n"
_HELP_STRING += " - Launch both the default and analysis target\n"
all: analysis default

.DEFAULT_GOAL = help
_HELP_STRING += " help: \n"
_HELP_STRING += " - Output a friendly help message.\n"
help:
	@echo -e $(HELP_STRING)

# Always re-run, if asked. Every aggregate goal defined above is listed so a
# stray file named e.g. "all" or "statistics" cannot mask the rule.
.PHONY: default analysis statistics all help

# These last three lines ensure that _HELP_STRING is prepended to whatever
# help text the included fragments already accumulated in HELP_STRING.
_HELP_STRING += "\n"
_HELP_STRING += $(HELP_STRING)
HELP_STRING := $(_HELP_STRING)
106 changes: 106 additions & 0 deletions examples/sum_abs_diff/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Sum of Absolute Differences

This example runs sum of absolute differences on a reference matrix
and a frame matrix. Assuming parameters for the height and width of the
reference and frame matrices, the result is a matrix with dimensions
(reference width - frame width + 1) x (reference height - frame height + 1).

Element (x,y) in the result matrix is the sum of absolute differences between
the frame and the sub-matrix of the reference starting from position (x,y) with
the same height and width as the frame. The purpose of this kernel is to find the
portion of the reference matrix that most closely resembles the frame matrix.

The kernel code is located in the subdirectories of [kernel](kernel). The actual
sum of absolute differences code is in the header file
[kernel/include/sum_abs_diff.hpp](kernel/include/sum_abs_diff.hpp).

# Makefile Targets

For a list of all Makefile targets, run `make help`.

## Versions

There are several different versions of this kernel. Each is a subdirectory in
the [kernel](kernel) directory.

### Version 0
In this version, each tile only performs one unit of work, i.e. calculating one
element of the result matrix. Based on the size of the result matrix, and the
tile group dimensions, enough tile groups will be launched to populate the
entire result matrix.


### Version 1

In this version, each tile performs multiple units of work. The workload share
of each tile group is defined by block_size_x/y and passed to the kernel. Each
tile then performs the kernel in a loop, with each iteration belonging to a
unit of work (or virtual thread).


### Version 2

In this version, frame dimensions are fixed at 4 by 4.


### Version 3

In this version, the frame dimensions are templatized in the kernel, instead
of being passed in as an input argument to the kernel. This gives the compiler
the opportunity to optimize based on frame dimensions known at compile time.


### Version 4

In this version, the reference dimensions are templatized in the kernel, instead
of being passed in as an input argument to the kernel. This gives the compiler
the opportunity to optimize based on reference dimensions known at compile time.


### Version 5

Version 5 - Reference and frame dimensions templatized.

In this version, both the reference and frame dimensions are templatized
in the kernel, instead of being passed in as input arguments to the kernel.
This gives the compiler the opportunity to optimize based on reference
and frame dimensions known at compile time.


### Version 6

This version loads the frame into tile group shared memory using tile group
shared memory macros, and uses the shared memory for redundant accesses to
the frame.


### Version 7

This version uses tile group shared memory macros for storing the reference and
frame. Due to redundant accesses to DRAM, performance can be improved by loading
the reference and frame matrices into shared memory and using that for computation.

This version is currently not working.


### Version 8

This version loads frame into tile group shared memory similar to version 7,
however it uses the new replacement for the deprecated tile group shared memory
macros, the striped array.


### Version 9

Similar to version 8, but uses the new templatized barrier library instead of
the deprecated barrier macros.


### Version 10

This version combines all optimizations together:
* Templatized reference and frame dimensions
* Frame is loaded into tile group shared striped memory
* Templatized barrier library is used for synchronization



10 changes: 10 additions & 0 deletions examples/sum_abs_diff/kernel/include/sum_abs_diff.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#ifndef __SUM_ABS_DIFF_HPP
#define __SUM_ABS_DIFF_HPP
#include <cstdint>

// Absolute value of x. The whole expansion is parenthesized so the macro can
// be embedded in a larger expression (e.g. `1 + ABS(a - b)`) without the
// surrounding operators binding to the condition of the ternary.
// NOTE: x is evaluated more than once; do not pass expressions with side
// effects.
#define ABS(x) (((x) >= 0) ? (x) : (-(x)))
// Smaller of x and y (x is returned when they compare equal). Arguments are
// evaluated more than once; do not pass expressions with side effects.
#define MIN(x,y) (((x) <= (y)) ? (x) : (y))



#endif //__SUM_ABS_DIFF_HPP
81 changes: 81 additions & 0 deletions examples/sum_abs_diff/kernel/v0/kernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* This kernel performs sum of absolute differences (SAD) between a
* Reference matrix and a frame matrix.
* The kernel searches over the reference matrix, and calculates
* sum of absolute differences for every sub-matrix that matches
* the sizes of the frame matrix, and stores the result for that
* location into result matrix.
*/

// BSG_TILE_GROUP_X_DIM and BSG_TILE_GROUP_Y_DIM must be defined
// before bsg_manycore.h and bsg_tile_group_barrier.h are
// included. bsg_tiles_X and bsg_tiles_Y must also be defined for
// legacy reasons, but they are deprecated.
#define BSG_TILE_GROUP_X_DIM 4
#define BSG_TILE_GROUP_Y_DIM 4
#define bsg_tiles_X BSG_TILE_GROUP_X_DIM
#define bsg_tiles_Y BSG_TILE_GROUP_Y_DIM
#include <bsg_manycore.h>
#include <bsg_tile_group_barrier.h>
#include <cstdint>

#include <sum_abs_diff.hpp>

// Barrier objects handed to bsg_tile_group_barrier() in kernel_sum_abs_diff.
// NOTE(review): this macro presumably defines r_barrier / c_barrier, and the
// numeric arguments look like the inclusive X range (0 .. X_DIM-1) and
// Y range (0 .. Y_DIM-1) of tiles taking part in the barrier -- confirm
// against bsg_tile_group_barrier.h.
INIT_TILE_GROUP_BARRIER(r_barrier, c_barrier,
0, BSG_TILE_GROUP_X_DIM-1,
0, BSG_TILE_GROUP_Y_DIM-1);


/*
 * Version 0 - Single unit of work per tile
 * In this version, each tile only performs one unit of work, i.e. calculating one
 * element of the result matrix. Based on the size of the result matrix, and the
 * tile group dimensions, enough tile groups will be launched to populate the
 * entire result matrix.
 *
 * The tile's result coordinate (start_x, start_y) is derived from its tile
 * group id and its position inside the group. The tile sums
 * |REF[y][x] - FRAME[y - start_y][x - start_x]| over the frame-sized window of
 * the reference whose top-left corner is (start_x, start_y) and stores the sum
 * in RES[start_y][start_x].
 *
 * REF    : reference matrix, indexed row-major as ref_height x ref_width
 * FRAME  : frame matrix, indexed row-major as frame_height x frame_width
 * RES    : result matrix, indexed row-major as res_height x res_width
 *
 * Returns 0 in all cases.
 */
int __attribute__ ((noinline)) sum_abs_diff_single_work_per_tile (int *REF, int *FRAME, int *RES,
                                                                  uint32_t ref_height, uint32_t ref_width,
                                                                  uint32_t frame_height, uint32_t frame_width,
                                                                  uint32_t res_height, uint32_t res_width) {

        // Coordinate of the result element owned by this tile.
        int start_y = __bsg_tile_group_id_y * bsg_tiles_Y + bsg_y;
        int start_x = __bsg_tile_group_id_x * bsg_tiles_X + bsg_x;

        // When the result dimensions are not a multiple of the tile group
        // dimensions, the launch grid may contain tiles that map outside the
        // result matrix. Such tiles must not store anything: a column overrun
        // would alias into the next result row, a row overrun would write past
        // the end of RES.
        if ((uint32_t) start_y >= res_height || (uint32_t) start_x >= res_width) {
                return 0;
        }

        // The window covers frame_height x frame_width elements of the
        // reference. It is clamped to the reference bounds (not the result
        // bounds, which are smaller and would truncate the window) so REF is
        // never read out of range even if the dimensions are inconsistent.
        int end_y = MIN (start_y + frame_height, ref_height);
        int end_x = MIN (start_x + frame_width, ref_width);

        int sad = 0;
        for (int y = start_y; y < end_y; y ++) {
                for (int x = start_x; x < end_x; x ++) {
                        int diff = REF[y * ref_width + x] - FRAME[(y - start_y) * frame_width + (x - start_x)];
                        sad += ABS (diff);
                }
        }

        RES[start_y * res_width + start_x] = sad;
        return 0;
}
Comment on lines +36 to +56
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I said in the meeting - if you use templates for the input datatype you won't have binary bloat. I think it's worth your time to do this.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you're worried about namespace collision you can do:

namespace v0{
// Kernel source

}

And then in the kernel.cpp file:

using namespace v0;




extern "C" {
        /*
         * Kernel entry point with C linkage. Forwards the matrices and their
         * dimensions to sum_abs_diff_single_work_per_tile, bracketed by the
         * bsg_cuda_print_stat_* calls, then waits on the tile group barrier
         * before emitting the kernel-end stat.
         *
         * block_size_y and block_size_x are accepted but not used by this
         * version.
         *
         * Returns the value returned by sum_abs_diff_single_work_per_tile.
         */
        int __attribute__ ((noinline)) kernel_sum_abs_diff(
                      int *REF, int *FRAME, int *RES,
                      uint32_t REF_HEIGHT, uint32_t REF_WIDTH,
                      uint32_t FRAME_HEIGHT, uint32_t FRAME_WIDTH,
                      uint32_t RES_HEIGHT, uint32_t RES_WIDTH,
                      uint32_t block_size_y, uint32_t block_size_x) {
                bsg_cuda_print_stat_kernel_start();

                // Stat tag 0 brackets only the SAD computation itself.
                bsg_cuda_print_stat_start(0);
                const int status = sum_abs_diff_single_work_per_tile (REF, FRAME, RES,
                                                                      REF_HEIGHT, REF_WIDTH,
                                                                      FRAME_HEIGHT, FRAME_WIDTH,
                                                                      RES_HEIGHT, RES_WIDTH);
                bsg_cuda_print_stat_end(0);

                // The barrier sits between the tag-0 end and the kernel-end stat.
                bsg_tile_group_barrier(&r_barrier, &c_barrier);

                bsg_cuda_print_stat_kernel_end();
                return status;
        }
}
Loading