-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.cu
142 lines (112 loc) · 3.95 KB
/
test.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
#include "poly_eval.cu"
// Qualifier bundle for functions compiled for both host and device and force-inlined.
#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
// Qualifier bundle for host-only functions that are force-inlined.
#define HOST_INLINE __host__ __forceinline__
// Default modulus for Dummy_Scalar; also printed by main() as the "scalar modulus".
#define P_MOD 10
/**
 * Minimal scalar type wrapping an unsigned value with arithmetic modulo P.
 *
 * The class is an aggregate (single public member `x`, no constructors), so it
 * can be brace-initialised as `{value}`. Arithmetic operators always return a
 * value reduced into [0, P).
 *
 * @tparam P modulus used by all arithmetic; must be non-zero.
 */
template <unsigned P = P_MOD>
class Dummy_Scalar
{
  // A zero modulus would make every `%` below undefined behaviour.
  static_assert(P != 0, "Dummy_Scalar modulus must be non-zero");

public:
  static constexpr unsigned MODULUS = P;

  // Raw stored value.
  unsigned x;

  /** Additive identity. */
  static HOST_DEVICE_INLINE Dummy_Scalar zero() { return {0}; }

  /** Multiplicative identity, reduced so that it stays canonical even when P == 1. */
  static HOST_DEVICE_INLINE Dummy_Scalar one() { return {1 % MODULUS}; }

  /** Streams the raw stored value. */
  friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar)
  {
    os << scalar.x;
    return os;
  }

  /** Modular addition. The sum is formed in 64 bits so it cannot wrap before the reduction. */
  friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2)
  {
    const unsigned long long sum = static_cast<unsigned long long>(p1.x) + p2.x;
    return {static_cast<unsigned>(sum % MODULUS)};
  }

  /** Modular multiplication. The product is formed in 64 bits so it cannot wrap before the reduction. */
  friend HOST_DEVICE_INLINE Dummy_Scalar operator*(Dummy_Scalar p1, const Dummy_Scalar& p2)
  {
    const unsigned long long product = static_cast<unsigned long long>(p1.x) * p2.x;
    return {static_cast<unsigned>(product % MODULUS)};
  }

  /** Equality on the raw stored values (no reduction is applied before comparing). */
  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2)
  {
    return (p1.x == p2.x);
  }

  /** Inequality on the raw stored values. */
  friend HOST_DEVICE_INLINE bool operator!=(const Dummy_Scalar& p1, const Dummy_Scalar& p2)
  {
    return (p1.x != p2.x);
  }

  /** Compares the raw stored value against a plain unsigned integer. */
  friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const unsigned p2)
  {
    return (p1.x == p2);
  }

  /**
   * Additive inverse modulo P.
   *
   * The input is reduced first so an out-of-range `x` cannot underflow the
   * subtraction, and the result is reduced again so that neg(0) is 0 rather
   * than P (which would compare unequal to zero()).
   */
  static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar& scalar)
  {
    return {(MODULUS - scalar.x % MODULUS) % MODULUS};
  }

  /** Host-only: returns a scalar built from the C library rand(), reduced modulo P. */
  static HOST_INLINE Dummy_Scalar rand_host()
  {
    return {static_cast<unsigned>(rand() % MODULUS)};
  }
};

// Scalar type used by the test driver: Dummy_Scalar with the default modulus P_MOD.
typedef Dummy_Scalar<> test_scalar;
// Terminates the test with a diagnostic if a CUDA runtime call does not return cudaSuccess.
#define TEST_CUDA_CHECK(call)                                                                      \
  do {                                                                                             \
    cudaError_t test_cuda_err_ = (call);                                                           \
    if (test_cuda_err_ != cudaSuccess) {                                                           \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,                             \
              cudaGetErrorString(test_cuda_err_));                                                 \
      exit(EXIT_FAILURE);                                                                          \
    }                                                                                              \
  } while (0)

/**
 * Test driver: fills random coefficients and domain points, runs poly_eval on
 * the device (one warm-up run, one timed run), runs poly_eval_ref on the host,
 * and compares the two result buffers element by element.
 *
 * Optional positional arguments:
 *   argv[1]  log2 of the number of coefficients per polynomial (default 20, allowed 0..30)
 *   argv[2]  domain size (default 7, must be >= 1)
 *   argv[3]  batch size (default 10, must be >= 1)
 *
 * @return EXIT_SUCCESS when every device result equals the reference result,
 *         EXIT_FAILURE on invalid arguments, on a CUDA error, or on a mismatch.
 */
int main(int argc, char** argv)
{
  // test parameters
  int coeffs_log_size = (argc > 1) ? atoi(argv[1]) : 20;
  int domain_size = (argc > 2) ? atoi(argv[2]) : 7;
  int batch_size = (argc > 3) ? atoi(argv[3]) : 10;

  // Reject values that would make the shift below undefined or yield empty/negative sizes.
  if (coeffs_log_size < 0 || coeffs_log_size > 30 || domain_size < 1 || batch_size < 1) {
    fprintf(stderr, "usage: %s [coeffs_log_size in 0..30] [domain_size >= 1] [batch_size >= 1]\n",
            (argc > 0) ? argv[0] : "test");
    return EXIT_FAILURE;
  }

  int coeffs_size = 1 << coeffs_log_size;

  // Element and byte counts are computed in size_t so large batches cannot overflow int.
  const std::size_t total_coeffs_size = static_cast<std::size_t>(batch_size) * coeffs_size;
  const std::size_t total_results_size = static_cast<std::size_t>(batch_size) * domain_size;
  const std::size_t coeffs_bytes = sizeof(test_scalar) * total_coeffs_size;
  const std::size_t domain_bytes = sizeof(test_scalar) * static_cast<std::size_t>(domain_size);
  const std::size_t results_bytes = sizeof(test_scalar) * total_results_size;

  printf("running poly eval of degree, 2^%d, domain_size=%d, batch_size=%d, scalar modulus=%d\n", coeffs_log_size, domain_size, batch_size, P_MOD);

  // init inputs (coefficients first, then domain, so the rand() sequence is consumed in a fixed order)
  std::vector<test_scalar> coeffs(total_coeffs_size);
  std::vector<test_scalar> domain(static_cast<std::size_t>(domain_size));
  for (auto& c : coeffs) {
    c = test_scalar::rand_host();
  }
  for (auto& d : domain) {
    d = test_scalar::rand_host();
  }
  std::cout << "finished generating inputs" << std::endl;

  std::vector<test_scalar> results(total_results_size);
  std::vector<test_scalar> ref_results(total_results_size);

  // allocate memory and copy to device
  test_scalar* d_coeffs = nullptr;
  test_scalar* d_domain = nullptr;
  test_scalar* d_results = nullptr;
  TEST_CUDA_CHECK(cudaMalloc(&d_coeffs, coeffs_bytes));
  TEST_CUDA_CHECK(cudaMalloc(&d_domain, domain_bytes));
  TEST_CUDA_CHECK(cudaMalloc(&d_results, results_bytes));
  TEST_CUDA_CHECK(cudaMemcpy(d_coeffs, coeffs.data(), coeffs_bytes, cudaMemcpyHostToDevice));
  TEST_CUDA_CHECK(cudaMemcpy(d_domain, domain.data(), domain_bytes, cudaMemcpyHostToDevice));
  std::cout << "finished copying to device" << std::endl;

  cudaEvent_t start, stop;
  TEST_CUDA_CHECK(cudaEventCreate(&start));
  TEST_CUDA_CHECK(cudaEventCreate(&stop));

  // warm up
  poly_eval(d_coeffs, d_domain, coeffs_size, domain_size, batch_size, d_results);
  TEST_CUDA_CHECK(cudaGetLastError());
  TEST_CUDA_CHECK(cudaDeviceSynchronize());

  // timed run
  TEST_CUDA_CHECK(cudaEventRecord(start, 0));
  poly_eval(d_coeffs, d_domain, coeffs_size, domain_size, batch_size, d_results);
  TEST_CUDA_CHECK(cudaEventRecord(stop, 0));
  TEST_CUDA_CHECK(cudaGetLastError());
  // Wait for the device so that the stop event has completed before its time is read.
  TEST_CUDA_CHECK(cudaDeviceSynchronize());
  float elapsed_ms = 0.0f;
  TEST_CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
  printf("runtime : %.3f ms.\n", elapsed_ms);

  // run reference and check correctness
  poly_eval_ref(coeffs.data(), domain.data(), coeffs_size, domain_size, batch_size, ref_results.data());
  TEST_CUDA_CHECK(cudaMemcpy(results.data(), d_results, results_bytes, cudaMemcpyDeviceToHost));

  std::size_t mismatches = 0;
  for (std::size_t i = 0; i < total_results_size; i++) {
    if (results[i] != ref_results[i]) {
      if (mismatches == 0) {
        // Report only the first mismatch to keep the output short.
        std::cerr << "first mismatch at index " << i << ": got " << results[i] << ", expected "
                  << ref_results[i] << std::endl;
      }
      mismatches++;
    }
  }
  const bool success = (mismatches == 0);

  // release device resources
  TEST_CUDA_CHECK(cudaEventDestroy(start));
  TEST_CUDA_CHECK(cudaEventDestroy(stop));
  TEST_CUDA_CHECK(cudaFree(d_coeffs));
  TEST_CUDA_CHECK(cudaFree(d_domain));
  TEST_CUDA_CHECK(cudaFree(d_results));

  if (success) {
    std::cout << "success!" << std::endl;
  } else {
    std::cout << "fail!" << std::endl;
  }

  // Propagate the verdict through the exit status so scripts can detect a failing run.
  return success ? EXIT_SUCCESS : EXIT_FAILURE;
}