Perf regression on NVPL BLAS wrt OpenBLAS | sgemm #1

Open
Rohanjames1997 opened this issue Dec 13, 2024 · 0 comments
Rohanjames1997 commented Dec 13, 2024

Hi community,

I benchmarked the sgemm (cblas_sgemm) implementation of NVPL BLAS against that of OpenBLAS and noticed perf regressions for matrices of certain sizes (data and code below).

Setup configuration:

  • Arm Neoverse V2 machine with 16 cores, all of which were utilized during the benchmark.
  • NVPL 24.7 (link)
  • NumPy 2.2.0 (to benchmark OpenBLAS)
  • Followed the build instructions from here with default settings.

I'd like to know whether these perf numbers are expected, or whether there are any configs or flags that can be enabled to tune the performance of NVPL BLAS.

Note that for other matrix-size configurations, NVPL performed better, and even had a lower median latency overall.
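For context, the "median latency overall" comparison above can be computed from per-shape timings with Python's statistics module. A minimal sketch; the numbers below are hypothetical placeholders, not the measured data:

```python
import statistics

# Hypothetical per-shape latencies in ms (placeholders, not the measured data)
nvpl_ms = [0.72, 0.68, 1.87, 3.62, 0.85]
openblas_ms = [0.56, 0.57, 1.12, 1.31, 0.67]

# One median per library, taken across all benchmarked (M, N, K) shapes
print("NVPL median:", statistics.median(nvpl_ms))          # → 0.85
print("OpenBLAS median:", statistics.median(openblas_ms))  # → 0.67
```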

Perf comparison between OpenBLAS and NVPL BLAS:

| M | N | K | NVPL BLAS (ms, 1000 iters) | OpenBLAS (ms, 1000 iters) | Speedup with NVPL (higher is better for NVPL) |
|---:|---:|---:|---:|---:|---:|
| 2 | 2 | 64 | 0.72172 | 0.5641 | 0.78161 |
| 2 | 4 | 64 | 0.67573 | 0.56672 | 0.83867 |
| 2 | 16 | 64 | 0.79675 | 0.68736 | 0.86271 |
| 2 | 32 | 32 | 0.72562 | 0.71693 | 0.98801 |
| 2 | 32 | 64 | 1.10518 | 0.81158 | 0.73434 |
| 2 | 64 | 16 | 0.74114 | 0.70787 | 0.9551 |
| 2 | 64 | 32 | 1.08362 | 0.7968 | 0.73531 |
| 2 | 64 | 64 | 1.87112 | 1.11628 | 0.59658 |
| 4 | 2 | 64 | 0.75437 | 0.58103 | 0.77021 |
| 4 | 4 | 64 | 0.72119 | 0.5765 | 0.79937 |
| 4 | 16 | 64 | 0.83788 | 0.75054 | 0.89576 |
| 4 | 32 | 32 | 0.77964 | 0.76127 | 0.97644 |
| 4 | 32 | 64 | 1.17008 | 0.90241 | 0.77124 |
| 4 | 64 | 16 | 0.81116 | 0.79441 | 0.97935 |
| 4 | 64 | 32 | 1.17261 | 0.93508 | 0.79743 |
| 4 | 64 | 64 | 3.62554 | 1.30558 | 0.36011 |
| 8 | 2 | 64 | 0.85158 | 0.66638 | 0.78252 |
| 8 | 4 | 64 | 0.82522 | 0.65446 | 0.79308 |
| 8 | 16 | 64 | 0.96524 | 0.87977 | 0.91145 |
| 8 | 32 | 32 | 0.88541 | 0.86832 | 0.98069 |
| 8 | 32 | 64 | 3.61929 | 1.17064 | 0.32344 |
| 8 | 64 | 16 | 0.97116 | 0.93412 | 0.96186 |
| 8 | 64 | 32 | 3.59675 | 1.18733 | 0.33011 |
| 8 | 64 | 64 | 3.66001 | 1.80268 | 0.49253 |
| 16 | 2 | 64 | 1.13117 | 0.82064 | 0.72548 |
| 16 | 4 | 64 | 1.09963 | 0.80013 | 0.72764 |
| 16 | 8 | 64 | 0.96488 | 0.91648 | 0.94984 |
| 16 | 16 | 64 | 4.70802 | 1.20854 | 0.2567 |
| 16 | 32 | 32 | 4.57972 | 1.20139 | 0.26233 |
| 16 | 32 | 64 | 3.80076 | 1.84226 | 0.48471 |
| 16 | 64 | 16 | 4.02203 | 1.23763 | 0.30771 |
| 16 | 64 | 32 | 3.64223 | 1.81389 | 0.49802 |
| 16 | 64 | 64 | 4.00294 | 3.01051 | 0.75208 |
| 32 | 2 | 32 | 0.89665 | 0.83947 | 0.93623 |
| 32 | 2 | 64 | 1.56609 | 0.99373 | 0.63453 |
| 32 | 4 | 32 | 0.90476 | 0.76437 | 0.84484 |
| 32 | 4 | 64 | 1.55195 | 0.98085 | 0.63201 |
| 32 | 8 | 64 | 3.63049 | 1.30868 | 0.36047 |
| 32 | 16 | 32 | 3.86673 | 1.29938 | 0.33604 |
| 32 | 16 | 64 | 4.32205 | 1.90592 | 0.44098 |
| 32 | 32 | 16 | 4.29408 | 1.28317 | 0.29882 |
| 32 | 32 | 32 | 3.65153 | 1.88017 | 0.5149 |
| 32 | 32 | 64 | 4.97947 | 3.07775 | 0.61809 |
| 32 | 64 | 8 | 3.88485 | 1.38545 | 0.35663 |
| 32 | 64 | 16 | 3.76845 | 1.91069 | 0.50702 |
| 32 | 64 | 32 | 4.28047 | 3.05152 | 0.71289 |
| 64 | 2 | 16 | 0.83448 | 0.79155 | 0.94855 |
| 64 | 2 | 32 | 1.40934 | 1.02067 | 0.72422 |
| 64 | 2 | 64 | 2.63082 | 1.47629 | 0.56115 |
| 64 | 4 | 16 | 0.88816 | 0.80085 | 0.90169 |
| 64 | 4 | 32 | 1.45726 | 1.03807 | 0.71235 |
| 64 | 4 | 64 | 4.29375 | 1.48583 | 0.34604 |
| 64 | 8 | 32 | 4.56628 | 1.37019 | 0.30007 |
| 64 | 8 | 64 | 3.80024 | 2.11644 | 0.55692 |
| 64 | 16 | 16 | 4.88562 | 1.36042 | 0.27845 |
| 64 | 16 | 32 | 4.03931 | 1.97411 | 0.48872 |
| 64 | 16 | 64 | 4.28662 | 3.28422 | 0.76616 |
| 64 | 32 | 8 | 4.02653 | 1.40667 | 0.34935 |
| 64 | 32 | 16 | 3.94059 | 1.96242 | 0.498 |
| 64 | 32 | 32 | 4.22758 | 3.14021 | 0.74279 |
| 64 | 64 | 4 | 3.63487 | 1.8723 | 0.51509 |
| 64 | 64 | 8 | 3.90227 | 2.14672 | 0.55012 |
| 64 | 64 | 16 | 3.81728 | 3.23963 | 0.84868 |
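The speedup column is simply the ratio of the two timings, OpenBLAS time divided by NVPL time; for example, the first row of the table:

```python
nvpl_ms = 0.72172     # NVPL BLAS, 1000 iterations, M=2, N=2, K=64
openblas_ms = 0.5641  # OpenBLAS, same shape and iteration count

# Values below 1.0 mean NVPL BLAS was slower than OpenBLAS for that shape
speedup = openblas_ms / nvpl_ms
print(round(speedup, 5))  # → 0.78161
```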
Code for sgemm on NVPL BLAS

A modified version of the sgemm.c example shipped with NVPL BLAS:

```c
/******************************************************************************
 * Content:
 *     This example demonstrates use of API as below:
 *     cblas_sgemm
 *
 ******************************************************************************/
#include "example_helper.h"
#include <inttypes.h> /* PRId64 used in the timing printf */
#include <time.h>

int main() {
    nvpl_int_t lda, ldb, ldc;
    float alpha = 1.0f;
    float beta = 1.0f;
    enum CBLAS_ORDER order = CblasRowMajor;
    enum CBLAS_TRANSPOSE transA = CblasNoTrans;
    enum CBLAS_TRANSPOSE transB = CblasNoTrans;
    float *A = NULL;
    float *B = NULL;
    float *C = NULL;

    printf("\nExample: cblas_sgemm for matrix-matrix multiplication\n\n");

    int num_iterations = 1000;

    for (nvpl_int_t M = 2; M <= 64; M *= 2) {
        for (nvpl_int_t N = 2; N <= 64; N *= 2) {
            for (nvpl_int_t K = 2; K <= 64; K *= 2) {
                lda = (transA == CblasNoTrans) ? K : M;
                ldb = (transB == CblasNoTrans) ? N : K;
                ldc = N;

                nvpl_int_t rowsA = (transA == CblasNoTrans) ? M : K;
                nvpl_int_t colsA = (transA == CblasNoTrans) ? K : M;
                nvpl_int_t rowsB = (transB == CblasNoTrans) ? K : N;
                nvpl_int_t colsB = (transB == CblasNoTrans) ? N : K;
                nvpl_int_t rowsC = M;
                nvpl_int_t colsC = N;

                // allocate memory
                A = (float *)malloc(lda * rowsA * sizeof(float));
                B = (float *)malloc(ldb * rowsB * sizeof(float));
                C = (float *)malloc(ldc * rowsC * sizeof(float));

                // fill data
                fill_smatrix(A, rowsA, colsA, lda, order, Full, CblasNonUnit);
                fill_smatrix(B, rowsB, colsB, ldb, order, Full, CblasNonUnit);
                fill_smatrix(C, rowsC, colsC, ldc, order, Full, CblasNonUnit);

                // Measure time for the loop
                struct timespec start_time, end_time;
                clock_gettime(CLOCK_MONOTONIC, &start_time);

                for (int i = 0; i < num_iterations; i++) {
                    cblas_sgemm(order, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
                }

                clock_gettime(CLOCK_MONOTONIC, &end_time);

                double elapsed_time = (end_time.tv_sec - start_time.tv_sec) * 1e3 +
                                      (end_time.tv_nsec - start_time.tv_nsec) / 1e6;
                printf("\nM=%" PRId64 ", N=%" PRId64 ", K=%" PRId64 ": Time for %d iterations: %.6f ms\n",
                       (int64_t)M, (int64_t)N, (int64_t)K, num_iterations, elapsed_time);

                // release memory
                free(A);
                free(B);
                free(C);
            }
        }
    }

    return 0;
}
```
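Since the benchmark reports totals over 1000 iterations, per-call latency and effective throughput follow directly (sgemm performs 2·M·N·K flops per call). A quick check against the M=8, N=64, K=64 NVPL measurement from the table:

```python
M, N, K = 8, 64, 64
iterations = 1000
total_ms = 3.66001  # NVPL BLAS total for 1000 iterations, from the table

per_call_us = total_ms * 1e3 / iterations  # microseconds per cblas_sgemm call
flops_per_call = 2 * M * N * K             # multiply-adds in C = alpha*A*B + beta*C
gflops = flops_per_call * iterations / (total_ms * 1e-3) / 1e9

print(f"{per_call_us:.3f} us/call, {gflops:.1f} GFLOP/s")  # → 3.660 us/call, 17.9 GFLOP/s
```

At these tiny sizes the kernel is far from peak, so call overhead and threading decisions dominate the measurement.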

Code for sgemm on OpenBLAS

A modified version of the sgemm.py benchmark script from the OpenBLAS repository:

```python
#!/usr/bin/python

import os
import sys
import time
import numpy
from numpy.random import randn

def run_sgemm(loops):
    M = 2
    while M <= 64:
        N = 2
        while N <= 64:
            K = 2
            while K <= 64:
                A = randn(M, K).astype('float32')
                B = randn(K, N).astype('float32')
                start = time.perf_counter()  # monotonic, higher resolution than time.time()
                for _ in range(loops):
                    ref = numpy.dot(A, B)
                end = time.perf_counter()

                timediff = end - start
                # sgemm performs 2*M*N*K flops per call; the original script's
                # 2*N*N*N count is only correct for square matrices
                mflops = (2 * M * N * K) * loops / timediff
                mflops *= 1e-6
                size = "%dx%d, %dx%d" % (M, K, K, N)
                print("%14s :\t%20f MFlops\t%20f ms" % (size, mflops, timediff * 1e3))
                K *= 2
            N *= 2
        M *= 2

if __name__ == "__main__":
    # The N/NMAX/NINC arguments of the original OpenBLAS script are unused here:
    # the modified sweep is fixed to powers of two from 2 to 64.
    LOOPS = 1000
    if len(sys.argv) > 4:
        LOOPS = int(sys.argv[4])
    if os.environ.get('OPENBLAS_LOOPS'):
        LOOPS = int(os.environ['OPENBLAS_LOOPS'])

    print("Loops=%d" % LOOPS)
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    run_sgemm(LOOPS)
```
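Since NumPy is the harness on the OpenBLAS side, it is worth confirming which BLAS the installed build actually links (PyPI wheels bundle OpenBLAS, but a source build may pick up a different library). NumPy's own `show_config()` reports this:

```python
import numpy

# Prints NumPy's build configuration, including the BLAS/LAPACK
# libraries it was linked against (e.g. an "openblas" entry).
numpy.show_config()
```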
