1 module dopt.core.cuda.math;
2 
3 import std.algorithm;
4 import std.conv;
5 import std.functional;
6 import std.math;
7 import std.range;
8 
9 import dopt.core.cuda;
10 import dopt.core.cuda.nvrtc;
11 import dopt.core.ops;
12 import dopt.core.types;
13 
14 import derelict.cuda;
15 
package
{
    // Minimal hand-rolled bindings for the few cuBLAS v2 entry points this
    // module needs. Symbols are resolved at runtime with dlopen/dlsym in
    // initialize() below (see the TODO there about a proper Derelict loader).
    extern(C)
    {
        // cublasStatus_t return codes (subset copied from cublas_api.h).
        enum
        {
            CUBLAS_STATUS_SUCCESS         =0,
            CUBLAS_STATUS_NOT_INITIALIZED =1,
            CUBLAS_STATUS_ALLOC_FAILED    =3,
            CUBLAS_STATUS_INVALID_VALUE   =7,
            CUBLAS_STATUS_ARCH_MISMATCH   =8,
            CUBLAS_STATUS_MAPPING_ERROR   =11,
            CUBLAS_STATUS_EXECUTION_FAILED=13,
            CUBLAS_STATUS_INTERNAL_ERROR  =14,
            CUBLAS_STATUS_NOT_SUPPORTED   =15,
            CUBLAS_STATUS_LICENSE_ERROR   =16
        }

        // cuBLAS status values are carried as plain ints, matching the C ABI.
        alias cublasStatus_t = int;

        // cublasOperation_t: operand used as-is, transposed, or
        // conjugate-transposed.
        enum
        {
            CUBLAS_OP_N=0,  
            CUBLAS_OP_T=1,  
            CUBLAS_OP_C=2  
        }

        alias cublasOperation_t = int;

        // cublasPointerMode_t: whether scalar parameters (alpha/beta, dot
        // results) are read from host or device memory.
        enum
        { 
            CUBLAS_POINTER_MODE_HOST   = 0,  
            CUBLAS_POINTER_MODE_DEVICE = 1        
        }

        alias cublasPointerMode_t = int;

        // Opaque cuBLAS context; handles are pointers to it.
        struct cublasContext;
        alias cublasHandle_t = cublasContext *;

        // Function pointers populated by initialize() via dlsym.
        cublasStatus_t function(cublasHandle_t *handle) cublasCreate_v2;
        cublasStatus_t function(cublasHandle_t handle, cublasPointerMode_t mode) cublasSetPointerMode_v2;

        cublasStatus_t function(cublasHandle_t handle, int n, const float *x, int incx, const float *y, int incy, float *result) cublasSdot_v2;
        cublasStatus_t function(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc) cublasSgemm_v2;
        cublasStatus_t function(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const float *alpha, const float *A, int lda, const float *beta, const float *B, int ldb, float *C, int ldc) cublasSgeam;
    }

    /// Loads libcublas.so, resolves the symbols declared above, creates the
    /// module-wide cuBLAS handle, and registers every CUDA kernel constructor
    /// this module provides: the pointwise ops (via the generated mixin) plus
    /// matmul, sum, maxElement and argmin.
    ///
    /// NOTE(review): neither the dlopen/dlsym results nor the cublasCreate_v2
    /// status are checked — if libcublas.so is missing, the function pointers
    /// stay null and the first call will crash. Consider validating here.
    void initialize()
    {
        //TODO: make a DerelictCuBLAS library
        import core.sys.posix.dlfcn;
        import std.string;
        fh = dlopen("libcublas.so".toStringz, RTLD_LAZY);
        cublasCreate_v2 = cast(typeof(cublasCreate_v2))dlsym(fh, "cublasCreate_v2");
        cublasSetPointerMode_v2 = cast(typeof(cublasSetPointerMode_v2))dlsym(fh, "cublasSetPointerMode_v2");
        cublasSdot_v2 = cast(typeof(cublasSdot_v2))dlsym(fh, "cublasSdot_v2");
        cublasSgemm_v2 = cast(typeof(cublasSgemm_v2))dlsym(fh, "cublasSgemm_v2");
        cublasSgeam = cast(typeof(cublasSgeam))dlsym(fh, "cublasSgeam");

        cublasCreate_v2(&mCuBLASHandle);

        // Registers add/sub/mul/div, comparisons, max/min/pow and the unary
        // functions — one registerCUDAKernel call per name in the tables below.
        mixin(generateRegistrations());
        registerCUDAKernel("matmul", toDelegate(&matmulKernelCtr));
        registerCUDAKernel("sum", toDelegate(&sumKernelCtr));
        registerCUDAKernel("maxElement", toDelegate(&maxElementKernelCtr));
        registerCUDAKernel("argmin", toDelegate(&argminKernelCtr));
    }

    /// Process-wide cuBLAS handle shared by all kernels in this module.
    cublasHandle_t mCuBLASHandle;
}
87 
88 private
89 {
90     void *fh;
91 
92     immutable string[] arith = ["add", "sub", "mul", "div"];
93     immutable string[] comp = ["lt", "lte", "gt", "gte", "eq", "neq"];
94     immutable string[] binfunc = ["max", "min", "pow"];
95     immutable string[] unfunc = ["neg", "abs", "sgn", "exp", "log", "sqrt"];
96     
97     string generateRegistrations()
98     {
99         return chain(arith, comp, binfunc, unfunc)
100               .map!(x => "registerCUDAKernel(\"" ~ x ~ "\", toDelegate(&" ~ x ~ "KernelCtr));")
101               .joiner("\n")
102               .to!string;
103     }
104 
    //Arithmetic operations: infix binary kernels, output[i] = dep1[i] OP dep2[i].
    alias addKernelCtr = cudaKernelCtr!("+");
    alias subKernelCtr = cudaKernelCtr!("-");
    alias mulKernelCtr = cudaKernelCtr!("*");
    alias divKernelCtr = cudaKernelCtr!("/");
    
    //Comparison operations: infix binary kernels; the C comparison result
    //(0 or 1) is stored converted to the output element type T.
    alias ltKernelCtr = cudaKernelCtr!("<");
    alias lteKernelCtr = cudaKernelCtr!("<=");
    alias gtKernelCtr = cudaKernelCtr!(">");
    alias gteKernelCtr = cudaKernelCtr!(">=");
    alias eqKernelCtr = cudaKernelCtr!("==");
    alias neqKernelCtr = cudaKernelCtr!("!=");

    //Binary functions: generated as calls, output[i] = f(dep1[i], dep2[i]).
    alias maxKernelCtr = cudaKernelCtr!("max", 2, false);
    alias minKernelCtr = cudaKernelCtr!("min", 2, false);
    alias powKernelCtr = cudaKernelCtr!("pow", 2, false);

    //Unary functions: generated as calls, output[i] = f(dep1[i]). "-" becomes
    //unary negation; sgn is defined inside the generated CUDA source itself
    //(see the deps == 1 branch of cudaKernelCtr).
    alias negKernelCtr = cudaKernelCtr!("-", 1, false);
    alias absKernelCtr = cudaKernelCtr!("abs", 1, false);
    alias sgnKernelCtr = cudaKernelCtr!("sgn", 1, false);
    alias expKernelCtr = cudaKernelCtr!("exp", 1, false);
    alias logKernelCtr = cudaKernelCtr!("log", 1, false);
    alias sqrtKernelCtr = cudaKernelCtr!("sqrt", 1, false);
131 
132     CUDAKernel cudaKernelCtr(string opName, int deps = 2, bool infix = true)(Operation op)
133     {
134         static if(deps == 2)
135         {
136             enum code = `
137                 //TODO: change this to iterate over elements with a stride to fully exploit SIMD units
138                 extern "C" __global__ void pointwiseKernel(size_t n, const T *dep1, const T *dep2, T *output)
139                 {
140                     size_t i = blockDim.x * blockIdx.x + threadIdx.x;
141 
142                     if(i < n)
143                     {
144                         output[i] = ` ~ (infix ? "dep1[i] " ~ opName ~ " dep2[i]" : opName ~ "(dep1[i], dep2[i])") ~ `;
145                     }
146                 }
147             `;
148         }
149         else static if(deps == 1)
150         {
151             enum code = `
152                 T __device__ sgn(T a)
153                 {
154                     return (T)((T(0) < a) - (a < T(0)));
155                 }
156 
157                 //TODO: change this to iterate over elements with a stride to fully exploit SIMD units
158                 extern "C" __global__ void pointwiseKernel(size_t n, const T *dep1, T *output)
159                 {
160                     size_t i = blockDim.x * blockIdx.x + threadIdx.x;
161 
162                     if(i < n)
163                     {
164                         output[i] = ` ~ opName ~ `(dep1[i]);
165                     }
166                 }
167             `;
168         }
169         else
170         {
171             assert(0, "Not implemented.");
172         }
173 
174         return new PointwiseCUDAKernel("typedef " ~ op.outputType.elementType.cudaType() ~ " T;\n" ~ code, op);
175     }
176 
177     class PointwiseCUDAKernel : CUDAKernel
178     {
179         this(string code, Operation op)
180         {
181             mKernel = mKernelCache.get(code, new NVRTCKernel("pointwiseKernel", code));
182             mKernelCache[code] = mKernel;
183             mOp = op;
184         }
185 
186         override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
187         {
188             //Args = [vol, <inputs...>, output]
189             void*[] args = new void*[inputs.length + 2];
190             CUdeviceptr[] ptrs = new CUdeviceptr[inputs.length + 1];
191 
192             size_t n = mOp.outputType.volume;
193             args[0] = &n;
194 
195             for(size_t i = 0; i < inputs.length; i++)
196             {
197                 ptrs[i] = inputs[i].ptr;
198                 args[i + 1] = &ptrs[i];
199             }
200 
201             ptrs[$ - 1] = output.ptr;
202             args[$ - 1] = &ptrs[$ - 1];
203 
204             uint numThreads = 512;
205 			uint numBlocks = (cast(uint)n + numThreads) / numThreads;
206             mKernel.execute(numBlocks, numThreads, args);
207         }
208 
209         static NVRTCKernel[string] mKernelCache;
210         NVRTCKernel mKernel;
211         Operation mOp;
212     }
213 
    /// Kernel constructor for the "matmul" operation.
    CUDAKernel matmulKernelCtr(Operation op)
    {
        return new MatmulKernel(op);
    }

    /// Matrix multiplication via cuBLAS SGEMM. Only float32 is supported.
    class MatmulKernel : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;
            // Cache operand/result shapes as ints, the type cuBLAS expects.
            ashape = mOp.deps[0].outputType.shape.map!(x => cast(int)x).array();
            bshape = mOp.deps[1].outputType.shape.map!(x => cast(int)x).array();
            cshape = mOp.outputType.shape.map!(x => cast(int)x).array();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            if(mOp.outputType.elementType == DataType.float32)
            {
                auto a = cast(float *)inputs[0].ptr;
                auto b = cast(float *)inputs[1].ptr;
                auto c = cast(float *)output.ptr;
                // Plain product: C = 1 * A * B + 0 * C.
                float alpha = 1;
                float beta = 0;

                // cuBLAS is column-major while these buffers are row-major.
                // Passing the operands swapped (b first, no transposes, with
                // m = cols(B), n = rows(A), k = cols(A)) computes
                // C^T = B^T * A^T in column-major terms — which is exactly
                // the row-major C = A * B.
                cublasSgemm_v2(mCuBLASHandle, CUBLAS_OP_N, CUBLAS_OP_N, bshape[1], ashape[0], ashape[1], &alpha, b,
                    bshape[1], a, ashape[1], &beta, c, cshape[1]);
            }
            else
            {
                throw new Exception("Element type not supported.");
            }
        }

        Operation mOp;
        int[] ashape;    // shape of deps[0] (A), [rows, cols]
        int[] bshape;    // shape of deps[1] (B), [rows, cols]
        int[] cshape;    // shape of the output (C), [rows, cols]
    }
253 
254     CUDAKernel sumKernelCtr(Operation op)
255     {
256         return new SumKernel(op);
257     }
258 
259     class SumKernel : CUDAKernel
260     {
261         this(Operation op)
262         {
263             mInput = variable(TensorType(op.deps[0].elementType, op.deps[0].shape));
264             mOp = sum(mInput, op.attributes["axes"].get!(size_t[]));
265         }
266 
267         override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
268         {
269             import dopt.core.cpu : evaluateCPU;
270 
271             auto inbuf = new byte[inputs[0].numBytes];
272             inputs[0].get(inbuf);
273 
274             import dopt.core.cpu : evaluate;
275             auto outbuf = evaluateCPU([mOp], [mInput: Buffer(inbuf)])[0];
276 
277             output.set(outbuf.as!byte);
278         }
279 
280         Operation mInput;
281         Operation mOp;
282     }
283 
284     CUDAKernel maxElementKernelCtr(Operation op)
285     {
286         return new MaxElementKernel(op);
287     }
288 
289     class MaxElementKernel : CUDAKernel
290     {
291         this(Operation op)
292         {
293             mInput = variable(TensorType(op.deps[0].elementType, op.deps[0].shape));
294             mOp = maxElement(mInput, op.attributes["axes"].get!(size_t[]));
295         }
296 
297         override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
298         {
299             import dopt.core.cpu : evaluateCPU;
300 
301             auto inbuf = new byte[inputs[0].numBytes];
302             inputs[0].get(inbuf);
303 
304             import dopt.core.cpu : evaluate;
305             auto outbuf = evaluateCPU([mOp], [mInput: Buffer(inbuf)])[0];
306 
307             output.set(outbuf.as!byte);
308         }
309 
310         Operation mInput;
311         Operation mOp;
312     }
313 
314     CUDAKernel argminKernelCtr(Operation op)
315     {
316         return new ArgminKernel(op);
317     }
318 
319     class ArgminKernel : CUDAKernel
320     {
321         this(Operation op)
322         {
323             mInput = variable(TensorType(op.deps[0].elementType, op.deps[0].shape));
324             mOp = argmin(mInput, op.attributes["axis"].get!size_t);
325         }
326 
327         override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
328         {
329             import dopt.core.cpu : evaluateCPU;
330 
331             auto inbuf = new byte[inputs[0].numBytes];
332             inputs[0].get(inbuf);
333 
334             import dopt.core.cpu : evaluate;
335             auto outbuf = evaluateCPU([mOp], [mInput: Buffer(inbuf)])[0];
336 
337             output.set(outbuf.as!byte);
338         }
339 
340         Operation mInput;
341         Operation mOp;
342     }
343 }