Skip to content

Commit 4ddbc53

Browse files
author
CSWater
committed
add missed files for lec09
1 parent b3c40d2 commit 4ddbc53

File tree

5 files changed

+279
-0
lines changed

5 files changed

+279
-0
lines changed
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
#include <cstdint>
4+
#include <omp.h>
5+
#include <cstdlib>
6+
#include <ctime>
7+
#include <iostream>
8+
#include <iomanip>
9+
using namespace std;
10+
11+
#define phi(A, i,j,k) A[(i) * jmax * kmax + (j) * kmax + (k)]
12+
13+
14+
/**
 * Serial Gauss-Seidel relaxation on a 3-D grid stored in one flat array.
 *
 * Element (i, j, k) lives at A[i * jmax * kmax + j * kmax + k]. Every interior
 * point is overwritten in place with the sum of its six axis neighbours
 * multiplied by osth; boundary points are never written.
 *
 * @param A     grid data, imax * jmax * kmax doubles
 * @param osth  factor applied to the six-neighbour sum (callers pass 1/6.0)
 * @param iter  number of full sweeps over the interior
 * @param imax  extent of the slowest-varying dimension
 * @param jmax  extent of the middle dimension
 * @param kmax  extent of the fastest-varying dimension
 */
void GaussSeidel(double *A, double osth , uint64_t iter, uint64_t imax, uint64_t jmax, uint64_t kmax) {
    // A dimension below 3 has no interior point. Returning early also keeps
    // the unsigned "max - 1" loop bounds from wrapping around when a
    // dimension is 0, which would otherwise walk far outside the array.
    if (imax < 3 || jmax < 3 || kmax < 3) {
        return;
    }

    const uint64_t iStride = jmax * kmax;  // offset between (i, j, k) and (i+1, j, k)
    const uint64_t jStride = kmax;         // offset between (i, j, k) and (i, j+1, k)

    for (uint64_t it = 0; it < iter; it++) {
        for (uint64_t k = 1; k < kmax - 1; k++) {
            for (uint64_t j = 1; j < jmax - 1; j++) {
                for (uint64_t i = 1; i < imax - 1; i++) {
                    double *centre = A + i * iStride + j * jStride + k;
                    // In-place update: neighbours already visited in this
                    // sweep contribute their new values, the others their old.
                    *centre = ( *(centre - iStride) + *(centre + iStride)
                              + *(centre - jStride) + *(centre + jStride)
                              + *(centre - 1)       + *(centre + 1) ) * osth;
                }
            }
        }
    }
}
27+
28+
29+
/**
 * Pipeline-parallel (wavefront) Gauss-Seidel relaxation on a 3-D grid.
 *
 * Same data layout and update formula as the serial sweep: element (i, j, k)
 * is A[i * jmax * kmax + j * kmax + k] and each interior point becomes the
 * sum of its six axis neighbours times osth.
 *
 * The interior j range [1, jmax-2] is split into one contiguous chunk per
 * thread. In wavefront step s, thread t updates plane k = s - t of its chunk,
 * so thread t always trails thread t-1 by exactly one k plane. A barrier after
 * every step guarantees that the (j-1) neighbours owned by thread t-1 are
 * already updated for that plane, while the (j+1) neighbours owned by thread
 * t+1 are still untouched - the same dependencies as the serial sweep.
 *
 * When compiled without OpenMP the pragmas are ignored and the function runs
 * as a plain serial sweep with a single "thread".
 *
 * Prints "numthreads : <n>" once per sweep, as the original did.
 *
 * @param A     grid data, imax * jmax * kmax doubles
 * @param osth  factor applied to the six-neighbour sum
 * @param iter  number of full sweeps over the interior
 * @param imax  extent of the slowest-varying dimension
 * @param jmax  extent of the middle dimension (the one split across threads)
 * @param kmax  extent of the fastest-varying dimension (the pipelined one)
 */
void GaussSeidelParallel(double *A, double osth , uint64_t iter, uint64_t imax, uint64_t jmax, uint64_t kmax) {
    // No interior points: nothing to update, and the unsigned arithmetic
    // below would wrap.
    if (imax < 3 || jmax < 3 || kmax < 3) {
        return;
    }

    const uint64_t iStride = jmax * kmax;   // offset to the (i+1) neighbour
    const uint64_t jStride = kmax;          // offset to the (j+1) neighbour
    const uint64_t jInterior = jmax - 2;    // number of interior j values
    const uint64_t kLast = kmax - 2;        // last interior k plane

    for (uint64_t it = 0; it < iter; it++) {
        #pragma omp parallel
        {
#ifdef _OPENMP
            const int tidRaw = omp_get_thread_num();
            const int numthreads = omp_get_num_threads();
#else
            const int tidRaw = 0;
            const int numthreads = 1;
#endif
            #pragma omp single
            {
                std::cout << "numthreads : " << numthreads << std::endl;
            } // implicit barrier at the end of single

            const uint64_t tid = static_cast<uint64_t>(tidRaw);
            const uint64_t teamSize = static_cast<uint64_t>(numthreads);

            // Balanced, non-overlapping split of [1, jmax-2]: the first
            // `extra` threads get one additional row. Chunks may be empty
            // when there are more threads than interior rows.
            const uint64_t base = jInterior / teamSize;
            const uint64_t extra = jInterior % teamSize;
            const uint64_t jStart = 1 + tid * base + (tid < extra ? tid : extra);
            const uint64_t jEnd = jStart + base + (tid < extra ? 1 : 0);  // exclusive

            // The last thread starts teamSize-1 steps late, so the pipeline
            // needs that many extra steps to drain.
            const uint64_t lastStep = kLast + teamSize - 1;
            for (uint64_t step = 1; step <= lastStep; step++) {
                // Thread is active only while its plane index is interior;
                // test before subtracting so the unsigned value cannot wrap.
                if (step > tid && step - tid <= kLast) {
                    const uint64_t k = step - tid;
                    for (uint64_t j = jStart; j < jEnd; j++) {
                        for (uint64_t i = 1; i < imax - 1; i++) {
                            double *centre = A + i * iStride + j * jStride + k;
                            *centre = ( *(centre - iStride) + *(centre + iStride)
                                      + *(centre - jStride) + *(centre + jStride)
                                      + *(centre - 1)       + *(centre + 1) ) * osth;
                        }
                    }
                }
                // Every thread reaches this barrier in every step, active or
                // not; it orders plane k of thread t-1 before plane k of thread t.
                #pragma omp barrier
            }
        }
    }
}
59+
60+
/**
 * Elapsed time between two timestamps, in seconds.
 *
 * Neither argument is modified, so both are taken as pointers to const;
 * callers passing non-const pointers keep working unchanged.
 *
 * @param start  earlier timestamp
 * @param end    later timestamp
 * @return (end - start) as seconds, combining the whole-second and
 *         nanosecond fields
 */
double get_time(const struct timespec *start,
                const struct timespec *end)
{
    // The nanosecond difference may be negative; adding it to the
    // whole-second difference still yields the correct interval.
    const double wholeSeconds = static_cast<double>(end->tv_sec - start->tv_sec);
    const double fraction = static_cast<double>(end->tv_nsec - start->tv_nsec) * 1e-9;
    return wholeSeconds + fraction;
}
66+
67+
#define N (imax * jmax * kmax)
68+
int main(int argc, char* argv[]){
69+
uint64_t imax = 1024, jmax = 1681, kmax = 1024;
70+
//uint64_t imax = 16, jmax = 17, kmax = 16;
71+
72+
double *A = nullptr;
73+
A = (double *)malloc(N * sizeof(double));
74+
75+
double time_used, lups, perf;
76+
struct timespec start, end;
77+
78+
#pragma omp for
79+
for(int i = 0; i < N; i++) {
80+
A[i] = random() % 100;
81+
}
82+
83+
cout << "imax:" << imax << ", jmax:" << jmax << ", kmax:" << kmax << endl;
84+
cout << setw(12) << "threadsnum" << "\t" << setw(10) <<"lup" << endl;
85+
86+
//serial GaussSeidel
87+
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
88+
GaussSeidel(A, 1/6.0, 1, imax, jmax, kmax);
89+
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
90+
time_used = get_time(&start, &end);
91+
lups = (imax - 2) * (jmax - 2) * (kmax - 2);
92+
perf = 1.0 * lups / time_used * 1e-6; // unit MLUP/s
93+
cout << setw(12) << "1" << "\t" << setprecision(4) << perf << endl;
94+
95+
//parallel GaussSeidel
96+
for(int threadnum = 2; threadnum <= 16; threadnum +=2 ) {
97+
omp_set_num_threads(threadnum);
98+
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
99+
GaussSeidelParallel(A, 1/6.0, 1, imax, jmax, kmax);
100+
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
101+
time_used = get_time(&start, &end);
102+
lups = (imax - 2) * (jmax - 2) * (kmax - 2);
103+
perf = 1.0 * lups / time_used * 1e-6; // unit MLUP/s
104+
cout << setw(12) << threadnum << "\t" << setprecision(4) << perf << endl;
105+
}
106+
return 0;
107+
}
108+
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
#include <omp.h>
4+
5+
#define N 100
6+
int main(int argc, char* argv[]){
7+
int IND, ID, NT;
8+
int S[16][8];
9+
int A[N];
10+
for(int i = 0; i < N; i++) {
11+
A[i] = rand() % 8;
12+
}
13+
omp_set_num_threads(16);
14+
#pragma omp parallel private(ID, IND)
15+
{
16+
ID = omp_get_thread_num();
17+
#pragma omp for nowait
18+
for(int i = 0; i < N; i++) {
19+
IND = A[i];
20+
S[ID][IND] = S[ID][IND] + 1;
21+
}
22+
#pragma critical
23+
{
24+
for(int i = 0; i < 8; i++) {
25+
S[0][i] = S[0][i] + S[ID][i];
26+
}
27+
}
28+
}
29+
30+
return 0;
31+
}
32+
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#include <iostream>
2+
#include <stdio.h>
3+
#include <stdlib.h>
4+
#include <omp.h>
5+
#include <unistd.h>
6+
#include <sys/syscall.h>
7+
#include <string.h>
8+
#include <sys/ioctl.h>
9+
#include <linux/perf_event.h>
10+
#include <linux/hw_breakpoint.h>
11+
#include <asm/unistd.h>
12+
#include <errno.h>
13+
#include <stdint.h>
14+
#include <inttypes.h>
15+
#include <time.h>
16+
17+
using namespace std;
18+
19+
#define N 512
20+
#define LOOP 100000000
21+
22+
//get time
23+
/**
 * Elapsed time between two timestamps, in seconds.
 *
 * Both timestamps are only read, hence the pointers to const; existing
 * callers that pass non-const pointers are unaffected.
 *
 * @param start  earlier timestamp
 * @param end    later timestamp
 * @return (end - start) in seconds, built from the second and nanosecond
 *         fields
 */
double get_time(const struct timespec *start,
                const struct timespec *end)
{
    // A negative nanosecond difference is fine: it is simply subtracted
    // from the whole-second part.
    const double secondsPart = static_cast<double>(end->tv_sec - start->tv_sec);
    const double nanosPart = static_cast<double>(end->tv_nsec - start->tv_nsec) * 1e-9;
    return secondsPart + nanosPart;
}
29+
30+
int main(int argc, char* argv[]){
31+
float A[N], B[N], C[N], D[N];
32+
for(int i = 0; i < N; i++) {
33+
A[i] = B[i] = C[i] = D[i];
34+
}
35+
36+
struct timespec start, end;
37+
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
38+
for(int iter = 1; iter < LOOP; iter++) {
39+
#pragma omp parallel
40+
{
41+
#pragma omp parallel for
42+
for(int i = 0; i < N; i++) {
43+
A[i] = B[i] + C[i] * D[i];
44+
}
45+
}
46+
}
47+
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
48+
double used_time = get_time(&start, &end);
49+
double flops = 1.0 * N * 2 * LOOP;
50+
cout << "achieved flops:" << flops / used_time << endl;
51+
return 0;
52+
}
53+
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
#include <omp.h>
4+
5+
#define N 10000000
6+
/*
 * Fills A with pseudo-random values in 0..99 and B with their squares.
 *
 * The two arrays hold N doubles each (160 MB in total for N = 10000000),
 * which is far more than a typical thread stack, so they are allocated on
 * the heap instead of as local arrays.
 */
int main(int argc, char* argv[]){
    /* calloc zero-fills, matching the original "= {0}" initialisers. */
    double *A = (double *)calloc(N, sizeof(double));
    double *B = (double *)calloc(N, sizeof(double));
    if (A == NULL || B == NULL) {
        fprintf(stderr, "failed to allocate two arrays of %d doubles\n", N);
        free(A);
        free(B);
        return EXIT_FAILURE;
    }

    /*
     * Serial loop. The original had an "omp for" pragma here, but outside a
     * parallel region it has no effect, so it was removed.
     */
    for (int i = 0; i < N; i++) {
        A[i] = random() % 100;
        B[i] = A[i] * A[i];
    }

    free(A);
    free(B);
    return 0;
}
15+
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#include <stdio.h>
2+
#include <omp.h>
3+
#include <cstdint>
4+
#include <cstdlib>
5+
#include <ctime>
6+
#include <iostream>
7+
#include <iomanip>
8+
using namespace std;
9+
10+
const int64_t N = 30000;
11+
const int64_t M = 30000;
12+
13+
/**
 * Elapsed time between two timestamps, in seconds.
 *
 * The function only reads its arguments, so they are pointers to const;
 * passing non-const pointers continues to work.
 *
 * @param start  earlier timestamp
 * @param end    later timestamp
 * @return (end - start) in seconds, from the second and nanosecond fields
 */
double get_time(const struct timespec *start,
                const struct timespec *end)
{
    // The nanosecond term may be negative; the sum is still the interval.
    const double fullSeconds = static_cast<double>(end->tv_sec - start->tv_sec);
    const double subSecond = static_cast<double>(end->tv_nsec - start->tv_nsec) * 1e-9;
    return fullSeconds + subSecond;
}
19+
20+
/**
 * Dense matrix-vector multiply-accumulate.
 *
 * For every row r in [0, n): lhs[r] += sum over c in [0, m) of
 * mat[r * m + c] * rhs[c]. The matrix is stored row by row; lhs is
 * accumulated into, not overwritten.
 *
 * Rows are distributed over the OpenMP threads with a static schedule.
 *
 * @param n    number of rows (length of lhs)
 * @param m    number of columns (length of rhs)
 * @param lhs  result vector, updated in place
 * @param rhs  input vector
 * @param mat  n * m matrix entries
 */
void dmvm(uint64_t n, uint64_t m, double *lhs, double *rhs, double *mat) {
    uint64_t row;
    #pragma omp parallel for schedule(static)
    for (row = 0; row < n; ++row) {
        // Declared inside the loop body, so each thread has its own copy.
        double *rowStart = mat + m * row;
        for (uint64_t col = 0; col < m; ++col) {
            lhs[row] += rowStart[col] * rhs[col];
        }
    }
}
30+
31+
32+
int main(int argc, char* argv[]){
33+
double *mat = nullptr, *x = nullptr, *y = nullptr;
34+
35+
//malloc data
36+
mat = (double *)malloc(M * N * sizeof(double));
37+
x = (double *)malloc(N * sizeof(double));
38+
y = (double *)malloc(N * sizeof(double));
39+
40+
//serial init, data should on one numa memory
41+
for(uint64_t i = 0; i < M * N; i++) {
42+
mat[i] = i % 100;
43+
}
44+
for(uint64_t i = 0; i < N; i++) {
45+
x[i] = y[i] = i % 10;
46+
}
47+
48+
double time_used, flops;
49+
struct timespec start, end;
50+
int iters[] = {200, 500, 1000, 2000};
51+
cout << setw(10) << "iter" << "\t" << setw(10) << "threads" << "\t" << setw(10) << "perf" << endl;
52+
for(int nthreads = 2; nthreads <=32; nthreads = nthreads * 2) {
53+
for(int i = 0; i < 4; i++) {
54+
for(int j = 0; j < iters[i]; j++) {
55+
omp_set_num_threads(nthreads);
56+
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
57+
dmvm(N, M, y, x, mat);
58+
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
59+
time_used = get_time(&start, &end);
60+
flops = M * N * 2 / time_used;
61+
cout << setw(10) << iters[i] << "\t" << setw(10) << nthreads << "\t" << setw(10) << setprecision(4) << flops << endl;
62+
}
63+
}
64+
}
65+
66+
free(mat);
67+
free(x);
68+
free(y);
69+
return 0;
70+
}
71+

0 commit comments

Comments
 (0)