Skip to content

Commit 17ad5c5

Browse files
committed
Add sequential GPU solver.
1 parent e28dbc9 commit 17ad5c5

14 files changed

+461
-214
lines changed

+examples/ex6.m

+4-4
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@
2424

2525
function domain = setDomain()
2626

27-
nx = 64; xin = -3; xout = 1.5;
28-
ny = 64; yin = -2.5; yout = 2.5;
29-
mgl = 5;
27+
nx = 500; xin = -3; xout = 1.5;
28+
ny = 500; yin = -2.5; yout = 2.5;
29+
mgl = 2;
3030

3131
domain = tribosolver.Domain(xin,xout,nx,yin,yout,ny,mgl);
3232

@@ -75,4 +75,4 @@
7575
numCycles,gamma ...
7676
);
7777

78-
end
78+
end

+examples/ex7.m

+6-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
function results = ex7()
22

3-
% Example #7: Uses the gpu parallel line solver
3+
% Example #7: Uses the gpu sequential line solver
44
%
55
% To run, go to the project root directory and type: examples.ex7
66
%
@@ -24,8 +24,8 @@
2424

2525
function domain = setDomain()
2626

27-
nx = 976; xin = -3; xout = 1.5;
28-
ny = 976; yin = -2.5; yout = 2.5;
27+
nx = 128; xin = -3; xout = 1.5;
28+
ny = 64; yin = -2.5; yout = 2.5;
2929
mgl = 1;
3030

3131
domain = tribosolver.Domain(xin,xout,nx,yin,yout,ny,mgl);
@@ -34,7 +34,7 @@
3434

3535
function moes = setMoes()
3636

37-
M = 15; L = 5;
37+
M = 15; L = 0;
3838
H0 = -0.53;
3939

4040
moes = tribosolver.Moes(M,L,H0);
@@ -43,8 +43,8 @@
4343

4444
function exec = setExecution()
4545

46-
% We solve using double precision using the CPU
47-
prec = "single"; dev = "gpu";
46+
% We solve using single precision using the GPU sequential solver.
47+
prec = "single"; dev = "gpu_seq";
4848

4949
% A verbosity level of 2 indicates the display of both text (verbosity > 0)
5050
% and graphical (verbosity > 1) plots during the solution scheme. Note that

+tribosolver/+internal/Domain.m

+2-2
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,8 @@
7575
y = domain.yin + (0:ny-1)*obj.dy;
7676
end
7777

78-
obj.x = repmat(reshape(x,[],1),1,ny);
79-
obj.y = repmat(reshape(y,1,[]),nx,1);
78+
obj.x = cast(repmat(reshape(x,[],1),1,ny),"like",p);
79+
obj.y = cast(repmat(reshape(y,1,[]),nx,1),"like",p);
8080
% obj.dx = gradient(obj.x);
8181
% obj.dy = gradient(obj.y);
8282

+tribosolver/+internal/Level.m

+5-4
Original file line numberDiff line numberDiff line change
@@ -48,15 +48,16 @@
4848

4949
[nx,ny] = size(obj.Domain.x);
5050
obj.k = initK(nx,ny,obj.Domain.dx,obj.Domain.dy);
51+
obj.k = cast(obj.k,"like",obj.h);
5152

52-
obj.fb = cast(-2*pi/3,"like",obj.h);
53+
obj.fb = cast(-2*pi/3,underlyingType(obj.k));
5354
obj.p_rhs = zeros(nx,ny,"like",obj.h);
5455
obj.p_old = zeros(nx,ny,"like",obj.h);
5556

5657
% TODO: Create some algorithm that finds the best FFT padding
5758
% for the fastest convolution.
58-
padX = obj.Domain.nx*3;
59-
padY = obj.Domain.ny*3;
59+
padX = 2^nextpow2(obj.Domain.nx*2);
60+
padY = 2^nextpow2(obj.Domain.ny*2);
6061
obj.k_fft = fft2(obj.k,padX,padY);
6162

6263
obj.calcDeformation();
@@ -73,7 +74,7 @@ function calcDeformation(obj)
7374
nx = obj.Domain.nx;
7475
ny = obj.Domain.ny;
7576
[padX,padY] = size(obj.k_fft);
76-
w=ifft2(fft2(obj.Results.p,padX,padY) .* obj.k_fft);
77+
w=ifft2(fft2(obj.Results.p,padX,padY) .* obj.k_fft,"symmetric");
7778
obj.Results.w=w(nx:(2*nx-1),ny:(2*ny-1));
7879

7980
end

+tribosolver/Execution.m

+2-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
} = "double";
2222

2323
Device(1,1) string { ...
24-
ismember(Device,["gpu","cpu_seq","cpu_par","gpu"]) ...
24+
ismember(Device,["gpu_seq","cpu_seq","cpu_par","gpu_par"]) ...
2525
} = "cpu_seq";
2626

2727
Verbosity(1,1) uint64 {mustBeNonempty} = false;
@@ -52,7 +52,7 @@
5252

5353
function proto = getProto(obj)
5454
proto = cast([],obj.BasePrecision);
55-
if strcmpi(obj.Device,"gpu")
55+
if strcmpi(obj.Device,"gpu_seq") || strcmpi(obj.Device,"gpu_par")
5656
proto = gpuArray(proto);
5757
end
5858
end

+utils/bandedSolveGPU.m

+16-18
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,24 @@
1-
function X = bandedSolveGPU(Y,A)
1+
function X = bandedSolveGPU(A,Y)
22

33
% Aggregate the bands
4-
ds = squeeze(A(2,:,:));
5-
dl = squeeze(A(3,:,:));
6-
d = squeeze(A(4,:,:));
7-
du = squeeze(A(5,:,:));
8-
dw = squeeze(A(6,:,:));
9-
Y = squeeze(Y);
4+
ds = A(:,2);
5+
dl = A(:,3);
6+
d = A(:,4);
7+
du = A(:,5);
8+
dw = A(:,6);
9+
Y = Y(1:end-1,:);
1010

11-
if ~isfile("+utils/bandedSolveGPUmex." + mexext)
11+
if ~isfile("+utils/pentasolver." + mexext)
1212
disp("Building cuda-mex file: Banded Solver")
13-
mexcuda("+utils/bandedSolveGPUmex.cu","-outdir","+utils","-lcublas")
13+
if ispc
14+
!cd +utils && nmake
15+
end
1416
end
1517

16-
ds = gpuArray(ds);
17-
dl = gpuArray(dl);
18-
d = gpuArray(d);
19-
du = gpuArray(du);
20-
dw = gpuArray(dw);
21-
Y = gpuArray(Y);
18+
X = [ ...
19+
zeros("like",Y); ...
20+
utils.pentasolver(Y,ds,dl,d,du,dw); ...
21+
zeros("like",Y); ...
22+
];
2223

23-
X = utils.bandedSolveGPUmex(ds,dl,d,du,dw,Y);
24-
25-
X = gather(X);
2624
end

+utils/bandedSolveGPUmex.cu

-157
This file was deleted.

+utils/make-mex.bat

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
@echo off
2+
3+
@REM Output Name
4+
SET OUT=pentasolver.mexw64
5+
6+
@REM Include Directories
7+
SET INC=-I%MATLABROOT%\extern\include -I%MATLABROOT%\toolbox\parallel\gpu\extern\include
8+
9+
@REM Link Line
10+
SET MATLIBDIR=%MATLABROOT%\extern\lib\win64\microsoft
11+
SET LDFLAGS=-shared -L%MATLIBDIR% -llibmx -llibmex -llibmat -lgpu -lcusparse -lcublas -Xlinker -EXPORT:mexFunction -Xlinker -noimplib
12+
13+
@REM Defines
14+
SET DEFINES=-DMATLAB_MEXCMD_RELEASE=R2018a -DMX_COMPAT_64 -DMATLAB_MEX_FILE
15+
16+
@REM Compiler Flags
17+
SET CXXFLAGS=-x cu -std=c++17
18+
SET CXXDEBUGFLAGS=-g -G %CXXFLAGS%
19+
SET CXXRELEASEFLAGS=-O2 %CXXFLAGS% -Xcompiler -O2
20+
21+
IF "%~1"=="debug" (
22+
nvcc %CXXDEBUGFLAGS% %DEFINES% %INC% pentasolver.cpp -o %OUT% %LDFLAGS%
23+
) ELSE (
24+
nvcc %CXXRELEASEFLAGS% %DEFINES% %INC% pentasolver.cpp -o %OUT% %LDFLAGS%
25+
)

0 commit comments

Comments
 (0)