1 /**
2     This is the main interface for the dopt CUDA backend.
3 
4     The APIs in this module allow users to evaluate operation graphs on GPUs through the use of CUDA. There is also
5     functionality to register CUDA implementations of custom operations.
6 
    In the future, this module will also have an interface allowing the user to register their own optimisation passes
    to be called when constructing a plan.
9 
10     Authors: Henry Gouk
11 */
12 module dopt.core.cuda;
13 
14 import std.exception;
15 
16 import dopt.core.cuda.basic;
17 import dopt.core.cuda.nvrtc;
18 import dopt.core.cuda.math;
19 import dopt.core.cuda.nnet;
20 import dopt.core.cuda.random;
21 import dopt.core.ops;
22 import dopt.core.types;
23 
24 import derelict.cuda;
25 
/**
    The type of a factory function that constructs a $(D CUDAKernel) for a specific $(D Operation).

    Factories of this type are registered with $(D registerCUDAKernel).
*/
alias CUDAKernelCtr = CUDAKernel delegate(Operation op);

//Module-level handles created by initialize(). __gshared: a single process-wide device/context pair
//is shared by all threads, rather than one per thread.
private __gshared
{
    CUdevice mDevice;   //Handle to CUDA device 0, acquired in initialize()
    CUcontext mContext; //The context created on mDevice by initialize()
}
33 
/**
    Initialises the CUDA backend.

    Loads the CUDA driver library, creates a context on device 0, and then initialises each of the
    CUDA backend submodules. This must be called before any other functionality in this module is used.

    Throws:
        Exception if the CUDA driver cannot be initialised, or a device handle or context cannot be acquired.
*/
void initialize()
{
    //NOTE(review): presumably DerelictCUDADriver.load() throws when the driver library is absent,
    //which covers the "CUDA isn't available" case — confirm against derelict-cuda's documentation
    DerelictCUDADriver.load();

    //Initialise CUDA and create a context. Fail loudly here rather than silently continuing with an
    //unusable device, which would only surface later as confusing allocation/launch failures.
    enforce(cuInit(0) == CUDA_SUCCESS, "Failed to initialise the CUDA driver API");
    enforce(cuDeviceGet(&mDevice, 0) == CUDA_SUCCESS, "Failed to acquire a handle to CUDA device 0");
    enforce(cuCtxCreate(&mContext, 0, mDevice) == CUDA_SUCCESS, "Failed to create a CUDA context");

    //Initialize submodules
    dopt.core.cuda.basic.initialize();
    dopt.core.cuda.nvrtc.initialize();
    dopt.core.cuda.math.initialize();
    dopt.core.cuda.nnet.initialize();
    dopt.core.cuda.random.initialize();
}
51 
/**
    Provides a common interface for CUDA kernels.

    Implementations are produced by the $(D CUDAKernelCtr) factories registered via
    $(D registerCUDAKernel), and are invoked by $(D CUDAPlan.execute).
*/
interface CUDAKernel
{
    /**
        Runs the kernel with the given inputs and outputs.

        The output buffer is zero-filled by the caller before this method is invoked
        (see $(D CUDAPlan.execute)).

        Params:
            inputs = An array of CUDABuffer objects, each corresponding to one of the dependencies of the operation
            used to construct this kernel.
            output = The destination buffer.
    */
    void execute(const(CUDABuffer)[] inputs, CUDABuffer output);
}
67 
/**
    A class that encapsulates the CUDA memory allocation/deallocation process.

    Instances should be obtained through $(D CUDABuffer.create) and released with $(D CUDABuffer.destroy).
*/
class CUDABuffer
{
    public
    {
        /**
            Constructs a CUDABuffer object and allocates memory on the CUDA device.

            The newly allocated device memory is zero-initialised.

            Params:
                numBytes = The number of bytes to be allocated on the CUDA device.

            Returns:
                The newly constructed buffer.

            Throws:
                Exception if the device allocation or the initial zero-fill fails.
        */
        static CUDABuffer create(size_t numBytes)
        {
            CUDABuffer ret = new CUDABuffer();
            ret.mNumBytes = numBytes;
            enforce(cuMemAlloc(&(ret.mPtr), ret.mNumBytes) == CUDA_SUCCESS, "CUDA memory allocation failed");
            enforce(cuMemsetD8(ret.mPtr, 0, ret.mNumBytes) == CUDA_SUCCESS, "CUDA default buffer initialisation failed");

            return ret;
        }

        /**
            Releases the device memory held by the given buffer.

            After this call the buffer must no longer be used for transfers. Its pointer and length
            are reset so that accidental reuse fails fast instead of touching freed device memory.

            Params:
                buf = The buffer whose device memory should be freed.

            Throws:
                Exception if the device memory could not be freed.
        */
        static void destroy(CUDABuffer buf)
        {
            enforce(cuMemFree(buf.mPtr) == CUDA_SUCCESS, "Failed to free CUDA device memory.");

            //Guard against use-after-free: subsequent set/get calls will now fail their checks
            //rather than operating on a stale device pointer
            buf.mPtr = 0;
            buf.mNumBytes = 0;
        }

        /**
            Copies data from the host to the device.

            Params:
                buf = An array of data to be copied to the device. Its length must equal numBytes().
        */
        void set(const void[] buf)
        {
            enforce(buf.length == mNumBytes, "input buffer is the wrong length.");
            enforce(cuMemcpyHtoD(mPtr, buf.ptr, buf.length) == CUDA_SUCCESS, "Failed to set contents of CUDA buffer");
        }

        /**
            Copies data from the device to the host.

            Params:
                buf = The buffer that the data from the CUDA device will be written to. Its length
                must equal numBytes().
        */
        void get(void[] buf) const
        {
            enforce(buf.length == mNumBytes, "output buffer is the wrong length.");
            enforce(cuMemcpyDtoH(buf.ptr, mPtr, buf.length) == CUDA_SUCCESS, "Failed to get contents of CUDA buffer");
        }

        /**
            Provides the size of the buffer allocated on the CUDA device.

            Returns:
                The number of bytes allocated on the CUDA device.
        */
        size_t numBytes() const
        {
            return mNumBytes;
        }

        /**
            Provides the device pointer.

            Returns:
                A CUDA device pointer.
        */
        inout(CUdeviceptr) ptr() inout
        {
            return mPtr;
        }
    }

    private
    {
        size_t mNumBytes;   //Size of the device allocation, in bytes (0 after destroy)
        CUdeviceptr mPtr;   //Device pointer returned by cuMemAlloc (0 after destroy)

        //Direct construction is disallowed — use create() so that allocation always takes place
        this()
        {
            //
        }

        //Fills the entire device buffer with zero bytes; used by CUDAPlan to clear an output
        //buffer before its kernel runs
        void zero()
        {
            enforce(cuMemsetD8(mPtr, 0, mNumBytes) == CUDA_SUCCESS, "CUDA zero buffer failed");
        }
    }
}
159 
/**
    A Plan stores all the resources (preallocated buffers, custom CUDA kernels) required to evaluate nodes from the
    Operation graph.

    An instance of Plan can be constructed using the $(D compileCUDA) function. The primary use case for a CUDAPlan is when the
    same set of operations are likely to be evaluated more than once. This prevents the dopt CUDA runtime from
    reallocating and optimising the CUDA kernels every time the same set of operations is to be executed.

    NOTE(review): $(D compileCUDA) is not visible in this module — confirm the reference is still accurate,
    since the constructor is also usable directly.
*/
class CUDAPlan
{
    public
    {
        //Total time, in microseconds, spent executing each opType, accumulated across all calls to execute()
        long[string] profiler;

        /**
            Constructs a plan capable of evaluating the given output operations.

            A CUDA kernel is built for every operation in the graph, and a device buffer is
            preallocated for each operation's result. Operations of type "variable", "reshape", and
            "constant" are handled directly by the plan, so no kernels are built for them.

            Params:
                outputs = The operations this plan should evaluate.

            Throws:
                Exception if some required operation type has no registered kernel constructor.
        */
        this(Operation[] outputs)
        {
            import std.algorithm : canFind, filter;
            import std.array : array;

            //NOTE(review): topologicalSort presumably orders the graph so each operation appears
            //after its dependencies — confirm against dopt.core.ops
            auto sortedOps = topologicalSort(outputs);

            //Build a kernel for every operation that needs one
            foreach(o; sortedOps)
            {
                //These op types are special-cased in execute() and require no kernel
                if(o.opType == "variable" || o.opType == "reshape" || o.opType == "constant")
                {
                    continue;
                }
                
                auto k = mKernelCtrs.get(o.opType, null);

                enforce(k !is null, "Could not construct a CUDA kernel for operation of type '" ~ o.opType ~ "'");

                mKernels[o] = k(o);
            }

            mOps = sortedOps.array;
            mOutputs = outputs.array;

            //Preallocate a device buffer to hold each operation's result
            foreach(o; mOps)
            {
                //For reshape operations, we will just reuse the buffer of o.deps[0]
                if(o.opType == "reshape")
                {
                    results[o] = results[o.deps[0]];
                }
                else
                {
                    results[o] = CUDABuffer.create(o.volume * o.elementType.sizeOf);

                    //Constants never change, so their buffers are filled once here at plan time
                    if(o.opType == "constant")
                    {
                        results[o].set(o.value.as!ubyte);
                    }
                }
            }

            //The table is fully populated now; rehash to speed up the lookups done in execute()
            results.rehash();
        }

        /**
            Executes the plan and returns the results in newly allocated host buffers.

            Params:
                args = A set of variable assignments.

            Returns:
                One Buffer per output operation, in the same order as the outputs given to the constructor.
        */
        Buffer[] execute(Buffer[Operation] args = null)
        {
            auto rets = new Buffer[mOutputs.length];

            foreach(i, o; mOutputs)
            {
                rets[i] = Buffer(new ubyte[o.outputType.volume * o.outputType.elementType.sizeOf()]);
            }

            execute(args, rets);

            return rets;
        }
        
        /**
            Executes the plan.

            Params:
                args = A set of variable assignments.
                rets = Preallocated host buffers the output values are written into — one per output
                operation, each large enough to hold the corresponding result.
        */
        void execute(Buffer[Operation] args, Buffer[] rets)
        {
            //NOTE(review): std.datetime's StopWatch is deprecated in newer D releases in favour of
            //std.datetime.stopwatch.StopWatch — consider migrating
            import std.datetime : StopWatch;
            StopWatch sw;

            //Make sure all the args are variable assignments
            foreach(o; args.keys)
            {
                enforce(o.opType == "variable",
                    "All assignments in args must be for Operations with an opType of 'variable'");
            }

            //Load the args into their buffers
            foreach(k, v; args)
            {
                results[k].set(v.as!ubyte);
            }

            //Iterate through each operation and execute it
            foreach(o; mOps)
            {
                if(o.opType == "variable")
                {
                    //A variable not supplied in args falls back to its default value, which is
                    //re-uploaded to the device on every execution
                    if(!(o in args))
                    {
                        sw.reset();
                        sw.start();

                        auto buf = cast(Buffer)o.value;
                        results[o].set(buf.as!ubyte);

                        sw.stop();

                        profiler["variable"] = profiler.get("variable", 0) + sw.peek.usecs;
                    }
                    
                    continue;
                }
                else if(o.opType == "reshape" || o.opType == "constant")
                {
                    //Reshapes alias their dependency's buffer; constants were uploaded at plan time
                    continue;
                }

                //Get the input buffers
                CUDABuffer[] inputs;

                foreach(d; o.deps)
                {
                    inputs ~= results[d];
                }

                //Execute the operation
                sw.reset();
                sw.start();
                results[o].zero();
                mKernels[o].execute(inputs, results[o]);
                sw.stop();

                profiler[o.opType] = profiler.get(o.opType, 0) + sw.peek.usecs;
            }

            //Copy the output values back to the host
            foreach(i, o; mOutputs)
            {
                results[o].get(rets[i].as!ubyte);
            }
        }

        ~this()
        {
            //NOTE(review): cleanup() uses enforce and can therefore throw; throwing from a GC
            //finalizer is problematic in D — confirm whether this is intentional
            cleanup();
        }

        /**
            Releases the device buffers held by this plan.

            Safe to call more than once — only the first call has any effect. Also invoked by the
            destructor.
        */
        void cleanup()
        {
            if(clean)
            {
                return;
            }

            foreach(o; mOps)
            {
                //Reshape results alias another operation's buffer, so freeing them here would be a double free
                if(o.opType != "reshape")
                {
                    CUDABuffer.destroy(results[o]);
                }
            }

            clean = true;
        }
    }

    private
    {
        Operation[] mOutputs;           //The operations whose values execute() returns
        Operation[] mOps;               //Every operation in the graph, topologically sorted
        CUDAKernel[Operation] mKernels; //The kernel constructed for each non-special operation
        CUDABuffer[Operation] results;  //The device buffer holding each operation's result
        bool clean = false;             //Set once cleanup() has released the device buffers
    }
}
339 
/**
    Used for performing a one-off evaluation of a set of operations.

    If you are planning to operate the same set of operations multiple times, but with different variables assignments,
    then you should construct a $(D CUDAPlan).

    Params:
        ops = The operations to be evaluated.
        args = A set of optional variable assignments.

    Returns:
        The result of evaluating $(D ops).
*/
Buffer[] evaluateCUDA(Operation[] ops, Buffer[Operation] args = null)
{
    auto p = new CUDAPlan(ops);

    //Release the plan's device buffers even if execute throws — previously an exception during
    //execution leaked the device memory held by the plan
    scope(exit)
    {
        p.cleanup();
    }

    return p.execute(args);
}
363 
/**
    A convenience overload that evaluates a single operation and returns a single $(D Buffer).

    Params:
        op = The operation to be evaluated.
        args = A set of optional variable assignments.

    Returns:
        The result of evaluating $(D op)
*/
Buffer evaluateCUDA(Operation op, Buffer[Operation] args = null)
{
    //Delegate to the array overload and unwrap the single result
    auto results = evaluateCUDA([op], args);

    return results[0];
}
378 
/**
    Registers a CUDA kernel constructor for a given operation type.

    Params:
        opName = The type of operation this kernel constructor caters to.
        kernelCtr = The constructor that should be associated with operations with the type $(D opType).

    Throws:
        Exception if a kernel constructor has already been registered for $(D opName).
*/
void registerCUDAKernel(string opName, CUDAKernelCtr kernelCtr)
{
    //Refuse to silently overwrite an existing registration
    enforce(opName !in mKernelCtrs,
        "A CUDAKernelCtr is already registered for the operation '" ~ opName ~ "'");

    mKernelCtrs[opName] = kernelCtr;
}
393 
/**
    Deregisters a kernel constructor associated with the given operation type.

    Has no effect when no constructor is registered for $(D opType).

    Params:
        opType = The operation type that should have its kernel deregistered.
*/
void deregisterCUDAKernel(string opType)
{
    if((opType in mKernelCtrs) !is null)
    {
        mKernelCtrs.remove(opType);
    }
}
404 
/**
    Provides a list of all operation types supported by the CUDA backend.

    Returns:
        A string array of the operation types that have kernels registered, plus the op types that
        the backend handles without kernels.
*/
string[] listCUDAOperations()
{
    //"variable", "reshape", and "constant" are special-cased by CUDAPlan rather than executed via
    //registered kernels; "constant" was previously missing from this list even though CUDAPlan
    //supports it the same way
    return mKernelCtrs.keys ~ ["variable", "reshape", "constant"];
}
415 
package
{
    /**
        Maps a dopt element type to the name of the corresponding CUDA C type.

        Params:
            t = The element type to translate.

        Returns:
            The CUDA C type name as a string.
    */
    string cudaType(DataType t)
    {
        import std.conv : to;

        if(t == DataType.float32)
        {
            return "float";
        }
        else if(t == DataType.int32)
        {
            return "int";
        }
        else
        {
            assert(0, "DataType '" ~ t.to!string ~ "' is not currently supported by the CUDA backend");
        }
    }
}
434 
private
{
    //Maps operation type names (Operation.opType) to the kernel constructors registered via
    //registerCUDAKernel(); consulted by CUDAPlan when compiling a graph
    CUDAKernelCtr[string] mKernelCtrs;
}
439 
//Smoke test for the one-off evaluation path: computes 3 * 4 + (-1) = 11 on the device and checks
//the scalar result. NOTE(review): presumably requires a CUDA-capable device and a prior call to
//initialize() — confirm how the test harness sets this up.
unittest
{
    auto a = float32([], [3.0f]);
    auto b = float32([], [4.0f]);
    auto c = float32([], [-1.0f]);

    auto y = a * b + c;

    assert(evaluateCUDA(y).as!float[0] == 11.0f);
}