theComputeKid
diff --git a/‎+examples/ex6.m
+4-4 b/‎+examples/ex6.m
+4-4
diff --git a/‎+examples/ex7.m
+6-6 b/‎+examples/ex7.m
+6-6
diff --git a/‎+tribosolver/+internal/Domain.m
+2-2 b/‎+tribosolver/+internal/Domain.m
+2-2
diff --git a/‎+tribosolver/+internal/Level.m
+5-4 b/‎+tribosolver/+internal/Level.m
+5-4
diff --git a/‎+tribosolver/Execution.m
+2-2 b/‎+tribosolver/Execution.m
+2-2
diff --git a/‎+utils/bandedSolveGPU.m
+16-18 b/‎+utils/bandedSolveGPU.m
+16-18
diff --git a/‎+utils/bandedSolveGPUmex.cu
-157 b/‎+utils/bandedSolveGPUmex.cu
-157
diff --git a/‎+utils/make-mex.bat
+25 b/‎+utils/make-mex.bat
+25
@@ -24,9 +24,9 @@
 
 function domain = setDomain()
 
-nx = 64; xin = -3; xout = 1.5;
-ny = 64; yin = -2.5; yout = 2.5;
-mgl = 5;
+nx = 500; xin = -3; xout = 1.5;
+ny = 500; yin = -2.5; yout = 2.5;
+mgl = 2;
 
 domain = tribosolver.Domain(xin,xout,nx,yin,yout,ny,mgl);
 
@@ -75,4 +75,4 @@
     numCycles,gamma ...
     );
 
-end
+end
@@ -1,6 +1,6 @@
 function results = ex7()
 
-% Example #7: Uses the gpu parallel line solver
+% Example #7: Uses the gpu sequential line solver
 %
 % To run, go to the project root directory and type: examples.ex7
 %
@@ -24,8 +24,8 @@
 
 function domain = setDomain()
 
-nx = 976; xin = -3; xout = 1.5;
-ny = 976; yin = -2.5; yout = 2.5;
+nx = 128; xin = -3; xout = 1.5;
+ny = 64; yin = -2.5; yout = 2.5;
 mgl = 1;
 
 domain = tribosolver.Domain(xin,xout,nx,yin,yout,ny,mgl);
@@ -34,7 +34,7 @@
 
 function moes = setMoes()
 
-M = 15; L = 5;
+M = 15; L = 0;
 H0 = -0.53;
 
 moes = tribosolver.Moes(M,L,H0);
@@ -43,8 +43,8 @@
 
 function exec = setExecution()
 
-% We solve using double precision using the CPU
-prec = "single"; dev = "gpu";
+% We solve in single precision using the GPU sequential solver.
+prec = "single"; dev = "gpu_seq";
 
 % A verbosity level of 2 indicates the display of both text (verbosity > 0)
 % and graphical (verbosity > 1) plots during the solution scheme. Note that
 
@@ -75,8 +75,8 @@
                 y = domain.yin + (0:ny-1)*obj.dy;
             end
 
-            obj.x = repmat(reshape(x,[],1),1,ny);
-            obj.y = repmat(reshape(y,1,[]),nx,1);
+            obj.x = cast(repmat(reshape(x,[],1),1,ny),"like",p);
+            obj.y = cast(repmat(reshape(y,1,[]),nx,1),"like",p);
             %             obj.dx = gradient(obj.x);
             %             obj.dy = gradient(obj.y);
 
 
@@ -48,15 +48,16 @@
 
             [nx,ny] = size(obj.Domain.x);
             obj.k = initK(nx,ny,obj.Domain.dx,obj.Domain.dy);
+            obj.k = cast(obj.k,"like",obj.h);
 
-            obj.fb = cast(-2*pi/3,"like",obj.h);
+            obj.fb = cast(-2*pi/3,underlyingType(obj.k));
             obj.p_rhs = zeros(nx,ny,"like",obj.h);
             obj.p_old = zeros(nx,ny,"like",obj.h);
 
             % TODO: Create some algorithm that finds the best FFT padding
             % for the fastest convolution.
-            padX = obj.Domain.nx*3;
-            padY = obj.Domain.ny*3;
+            padX = 2^nextpow2(obj.Domain.nx*2);
+            padY = 2^nextpow2(obj.Domain.ny*2);
             obj.k_fft = fft2(obj.k,padX,padY);
 
             obj.calcDeformation();
@@ -73,7 +74,7 @@ function calcDeformation(obj)
                 nx = obj.Domain.nx;
                 ny = obj.Domain.ny;
                 [padX,padY] = size(obj.k_fft);
-                w=ifft2(fft2(obj.Results.p,padX,padY) .* obj.k_fft);
+                w=ifft2(fft2(obj.Results.p,padX,padY) .* obj.k_fft,"symmetric");
                 obj.Results.w=w(nx:(2*nx-1),ny:(2*ny-1));
 
             end
 
@@ -21,7 +21,7 @@
             } = "double";
 
         Device(1,1) string { ...
-            ismember(Device,["gpu","cpu_seq","cpu_par","gpu"]) ...
+            ismember(Device,["gpu_seq","cpu_seq","cpu_par","gpu_par"]) ...
             } = "cpu_seq";
 
         Verbosity(1,1) uint64 {mustBeNonempty} = false;
@@ -52,7 +52,7 @@
 
         function proto = getProto(obj)
+            % Returns an empty array whose class is obj.BasePrecision.
+            % When obj.Device is "gpu_seq" or "gpu_par" (compared
+            % case-insensitively) the empty array is converted to a
+            % gpuArray before being returned.
+            % NOTE(review): the name suggests the result is meant as a
+            % prototype for "like"-style allocation - confirm at callers.
             proto = cast([],obj.BasePrecision);
-            if strcmpi(obj.Device,"gpu")
+            if strcmpi(obj.Device,"gpu_seq") || strcmpi(obj.Device,"gpu_par")
                 proto = gpuArray(proto);
             end
         end
 
@@ -1,26 +1,24 @@
-function X = bandedSolveGPU(Y,A)
+function X = bandedSolveGPU(A,Y)
+% BANDEDSOLVEGPU  Solve a banded system through the utils.pentasolver MEX.
+%
+%   A - band storage; columns 2..6 are forwarded as ds, dl, d, du, dw.
+%   Y - presumably the right-hand side; its last row is dropped before
+%       the solve (TODO confirm why the last row is excluded).
+%   X - pentasolver output with one zero prepended and one zero appended.
+%
+% Note that the argument order changed from (Y,A) to (A,Y).
+% No gpuArray/gather conversion happens here any more: the inputs are
+% handed to the MEX as received and its output is returned as-is.
+% NOTE(review): the band names suggest a pentadiagonal system with d as
+% the main diagonal - confirm against the pentasolver source.
 
 % Aggregate the bands
-ds = squeeze(A(2,:,:));
-dl = squeeze(A(3,:,:));
-d = squeeze(A(4,:,:));
-du = squeeze(A(5,:,:));
-dw = squeeze(A(6,:,:));
-Y = squeeze(Y);
+ds = A(:,2);
+dl = A(:,3);
+d =  A(:,4);
+du = A(:,5);
+dw = A(:,6);
+Y = Y(1:end-1,:);
 
+% Build the MEX on first use. The path is relative, so this check
+% presumably assumes MATLAB is running from the project root.
-if ~isfile("+utils/bandedSolveGPUmex." + mexext)
+if ~isfile("+utils/pentasolver." + mexext)
     disp("Building cuda-mex file: Banded Solver")
-    mexcuda("+utils/bandedSolveGPUmex.cu","-outdir","+utils","-lcublas")
+    % NOTE(review): only Windows triggers a build (nmake run inside
+    % +utils); on other platforms the message is shown but nothing is
+    % built here. Confirm that a makefile for nmake exists in +utils.
+    if ispc
+        !cd +utils && nmake
+    end
 end
 
-ds = gpuArray(ds);
-dl = gpuArray(dl);
-d = gpuArray(d);
-du = gpuArray(du);
-dw = gpuArray(dw);
-Y = gpuArray(Y);
+% zeros("like",Y) yields a scalar zero of Y's type, so the solver output
+% is padded with one zero above and one zero below.
+% NOTE(review): vertical concatenation with scalars presumably requires
+% pentasolver to return a single column - confirm.
+X = [ ...
+    zeros("like",Y); ...
+    utils.pentasolver(Y,ds,dl,d,du,dw); ...
+    zeros("like",Y); ...
+    ];
 
-X = utils.bandedSolveGPUmex(ds,dl,d,du,dw,Y);
-
-X = gather(X);
 end
@@ -0,0 +1,25 @@
+@echo off
+@REM Builds pentasolver.cpp into a MATLAB MEX file with nvcc. Usage: make-mex [debug]
+@REM Output name (.mexw64 is the MEX extension on 64-bit Windows)
+SET OUT=pentasolver.mexw64
+
+@REM Include directories, quoted because MATLABROOT usually contains spaces (e.g. C:\Program Files\MATLAB)
+SET INC=-I"%MATLABROOT%\extern\include" -I"%MATLABROOT%\toolbox\parallel\gpu\extern\include"
+
+@REM Link line; the library directory is quoted for the same reason
+SET MATLIBDIR=%MATLABROOT%\extern\lib\win64\microsoft
+SET LDFLAGS=-shared -L"%MATLIBDIR%" -llibmx -llibmex -llibmat -lgpu -lcusparse -lcublas -Xlinker -EXPORT:mexFunction -Xlinker -noimplib
+
+@REM Defines
+SET DEFINES=-DMATLAB_MEXCMD_RELEASE=R2018a -DMX_COMPAT_64 -DMATLAB_MEX_FILE
+
+@REM Compiler flags: -x cu makes nvcc treat the .cpp input as CUDA source
+SET CXXFLAGS=-x cu -std=c++17
+SET CXXDEBUGFLAGS=-g -G %CXXFLAGS%
+SET CXXRELEASEFLAGS=-O2 %CXXFLAGS% -Xcompiler -O2
+
+IF "%~1"=="debug" (
+	nvcc %CXXDEBUGFLAGS% %DEFINES% %INC% pentasolver.cpp -o %OUT% %LDFLAGS%
+) ELSE (
+	nvcc %CXXRELEASEFLAGS% %DEFINES% %INC% pentasolver.cpp -o %OUT% %LDFLAGS%
+)