module dopt.cuda.math;

import std.algorithm;
import std.conv;
import std.functional;
import std.math;
import std.range;

import dopt.cuda;
import dopt.cuda.nvrtc;
import dopt.core.ops;
import dopt.core.types;

import derelict.cuda;

package
{
    extern(C)
    {
        enum
        {
            CUBLAS_STATUS_SUCCESS          = 0,
            CUBLAS_STATUS_NOT_INITIALIZED  = 1,
            CUBLAS_STATUS_ALLOC_FAILED     = 3,
            CUBLAS_STATUS_INVALID_VALUE    = 7,
            CUBLAS_STATUS_ARCH_MISMATCH    = 8,
            CUBLAS_STATUS_MAPPING_ERROR    = 11,
            CUBLAS_STATUS_EXECUTION_FAILED = 13,
            CUBLAS_STATUS_INTERNAL_ERROR   = 14,
            CUBLAS_STATUS_NOT_SUPPORTED    = 15,
            CUBLAS_STATUS_LICENSE_ERROR    = 16
        }

        alias cublasStatus_t = int;

        enum
        {
            CUBLAS_OP_N = 0,
            CUBLAS_OP_T = 1,
            CUBLAS_OP_C = 2
        }

        alias cublasOperation_t = int;

        enum
        {
            CUBLAS_POINTER_MODE_HOST   = 0,
            CUBLAS_POINTER_MODE_DEVICE = 1
        }

        alias cublasPointerMode_t = int;

        struct cublasContext;
        alias cublasHandle_t = cublasContext *;

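        //These symbols are function pointers rather than ordinary declarations:
        //initialize() below resolves them at runtime from libcublas.so via dlsym.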
        cublasStatus_t function(cublasHandle_t *handle) cublasCreate_v2;
        cublasStatus_t function(cublasHandle_t handle, cublasPointerMode_t mode) cublasSetPointerMode_v2;

        cublasStatus_t function(cublasHandle_t handle, int n, const float *x, int incx, const float *y, int incy, float *result) cublasSdot_v2;
        cublasStatus_t function(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc) cublasSgemm_v2;
        cublasStatus_t function(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const float *alpha, const float *A, int lda, const float *beta, const float *B, int ldb, float *C, int ldc) cublasSgeam;
    }

    void initialize()
    {
        //TODO: make a DerelictCuBLAS library
        import core.sys.posix.dlfcn;
        import std.string;

        fh = dlopen("libcublas.so".toStringz, RTLD_LAZY);
        cublasCreate_v2 = cast(typeof(cublasCreate_v2))dlsym(fh, "cublasCreate_v2");
        cublasSetPointerMode_v2 = cast(typeof(cublasSetPointerMode_v2))dlsym(fh, "cublasSetPointerMode_v2");
        cublasSdot_v2 = cast(typeof(cublasSdot_v2))dlsym(fh, "cublasSdot_v2");
        cublasSgemm_v2 = cast(typeof(cublasSgemm_v2))dlsym(fh, "cublasSgemm_v2");
        cublasSgeam = cast(typeof(cublasSgeam))dlsym(fh, "cublasSgeam");

        cublasCreate_v2(&mCuBLASHandle);

        mixin(generateRegistrations());
        registerCUDAKernel("matmul", toDelegate(&matmulKernelCtr));
    }

    cublasHandle_t mCuBLASHandle;
}

private
{
    void *fh;

    immutable string[] arith = ["add", "sub", "mul", "div"];
    immutable string[] comp = ["lt", "lte", "gt", "gte", "eq", "neq"];
    immutable string[] binfunc = ["max", "min", "pow"];
    immutable string[] unfunc = ["neg", "abs", "sgn", "exp", "log", "sqrt"];

    string generateRegistrations()
    {
        return chain(arith, comp, binfunc, unfunc)
              .map!(x => "registerCUDAKernel(\"" ~ x ~ "\", toDelegate(&" ~ x ~ "KernelCtr));")
              .joiner("\n")
              .to!string;
    }
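
    /*
        For reference, the mixin(generateRegistrations()) call in initialize() expands, at
        compile time, into one registration per element of the name lists above:

            registerCUDAKernel("add", toDelegate(&addKernelCtr));
            registerCUDAKernel("sub", toDelegate(&subKernelCtr));
            ...
            registerCUDAKernel("sqrt", toDelegate(&sqrtKernelCtr));
    */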

    //Arithmetic operations
    alias addKernelCtr = cudaKernelCtr!("+");
    alias subKernelCtr = cudaKernelCtr!("-");
    alias mulKernelCtr = cudaKernelCtr!("*");
    alias divKernelCtr = cudaKernelCtr!("/");

    //Comparison operations
    alias ltKernelCtr = cudaKernelCtr!("<");
    alias lteKernelCtr = cudaKernelCtr!("<=");
    alias gtKernelCtr = cudaKernelCtr!(">");
    alias gteKernelCtr = cudaKernelCtr!(">=");
    alias eqKernelCtr = cudaKernelCtr!("==");
    alias neqKernelCtr = cudaKernelCtr!("!=");

    //Binary functions
    alias maxKernelCtr = cudaKernelCtr!("max", 2, false);
    alias minKernelCtr = cudaKernelCtr!("min", 2, false);
    alias powKernelCtr = cudaKernelCtr!("pow", 2, false);

    //Unary functions
    alias negKernelCtr = cudaKernelCtr!("-", 1, false);
    alias absKernelCtr = cudaKernelCtr!("abs", 1, false);
    alias sgnKernelCtr = cudaKernelCtr!("sgn", 1, false);
    alias expKernelCtr = cudaKernelCtr!("exp", 1, false);
    alias logKernelCtr = cudaKernelCtr!("log", 1, false);
    alias sqrtKernelCtr = cudaKernelCtr!("sqrt", 1, false);

    CUDAKernel cudaKernelCtr(string opName, int deps = 2, bool infix = true)(Operation op)
    {
        static if(deps == 2)
        {
            enum code = `
                extern "C" __global__ void pointwiseKernel(size_t n, const T *dep1, const T *dep2, T *output)
                {
                    size_t i0 = blockDim.x * blockIdx.x + threadIdx.x;

                    for(size_t i = i0; i < n; i += blockDim.x * gridDim.x)
                    {
                        output[i] = ` ~ (infix ? "dep1[i] " ~ opName ~ " dep2[i]" : opName ~ "(dep1[i], dep2[i])") ~ `;
                    }
                }
            `;
        }
        else static if(deps == 1)
        {
            enum code = `
                T __device__ sgn(T a)
                {
                    return (T)((T(0) < a) - (a < T(0)));
                }

                extern "C" __global__ void pointwiseKernel(size_t n, const T *dep1, T *output)
                {
                    size_t i0 = blockDim.x * blockIdx.x + threadIdx.x;

                    for(size_t i = i0; i < n; i += blockDim.x * gridDim.x)
                    {
                        output[i] = ` ~ opName ~ `(dep1[i]);
                    }
                }
            `;
        }
        else
        {
            //Fail at instantiation time with a clear message, rather than falling through
            //to the return statement below with `code` undefined.
            static assert(0, "Not implemented.");
        }

        return new PointwiseCUDAKernel("typedef " ~ op.outputType.elementType.cudaType() ~ " T;\n" ~ code, op);
    }
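
    /*
        As a concrete example, cudaKernelCtr!("+") applied to a float32 operation (for which
        cudaType() yields "float") produces the following CUDA source, which
        PointwiseCUDAKernel then hands to NVRTC for runtime compilation:

            typedef float T;

            extern "C" __global__ void pointwiseKernel(size_t n, const T *dep1, const T *dep2, T *output)
            {
                size_t i0 = blockDim.x * blockIdx.x + threadIdx.x;

                for(size_t i = i0; i < n; i += blockDim.x * gridDim.x)
                {
                    output[i] = dep1[i] + dep2[i];
                }
            }
    */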

    class PointwiseCUDAKernel : CUDAKernel
    {
        this(string code, Operation op)
        {
            mKernel = mKernelCache.get(code, new NVRTCKernel("pointwiseKernel", code));
            mKernelCache[code] = mKernel;
            mOp = op;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            //Args = [vol, <inputs...>, output]
            void*[] args = new void*[inputs.length + 2];
            CUdeviceptr[] ptrs = new CUdeviceptr[inputs.length + 1];

            size_t n = mOp.outputType.volume;
            args[0] = &n;

            for(size_t i = 0; i < inputs.length; i++)
            {
                ptrs[i] = inputs[i].ptr;
                args[i + 1] = &ptrs[i];
            }

            ptrs[$ - 1] = output.ptr;
            args[$ - 1] = &ptrs[$ - 1];
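
            //The generated kernels use a grid-stride loop, so any surplus threads in the
            //final block simply perform no iterations.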
            uint numThreads = 512;
            uint numBlocks = (cast(uint)n + numThreads - 1) / numThreads;
            mKernel.execute(numBlocks, numThreads, args);
        }

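        //Compiled kernels are cached by source string, so operations that generate
        //identical CUDA code share a single NVRTC compilation.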
        static NVRTCKernel[string] mKernelCache;
        NVRTCKernel mKernel;
        Operation mOp;
    }

    CUDAKernel matmulKernelCtr(Operation op)
    {
        return new MatmulKernel(op);
    }

    class MatmulKernel : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;
            ashape = mOp.deps[0].outputType.shape.map!(x => cast(int)x).array();
            bshape = mOp.deps[1].outputType.shape.map!(x => cast(int)x).array();
            cshape = mOp.outputType.shape.map!(x => cast(int)x).array();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            if(mOp.outputType.elementType == DataType.float32)
            {
                auto a = cast(float *)inputs[0].ptr;
                auto b = cast(float *)inputs[1].ptr;
                auto c = cast(float *)output.ptr;
                float alpha = 1;
                float beta = 0;

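                //cuBLAS assumes column-major storage, while dopt stores matrices row-major.
                //Reinterpreting a row-major matrix as column-major implicitly transposes it,
                //and C = AB is equivalent to C^T = B^T A^T, so swapping the operands and
                //their dimensions lets a single SGEMM write C back in row-major order.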
                cublasSgemm_v2(mCuBLASHandle, CUBLAS_OP_N, CUBLAS_OP_N, bshape[1], ashape[0], ashape[1], &alpha, b,
                    bshape[1], a, ashape[1], &beta, c, cshape[1]);
            }
            else
            {
                throw new Exception("Element type not supported.");
            }
        }

        Operation mOp;
        int[] ashape;
        int[] bshape;
        int[] cshape;
    }
}