/**
    This is the main interface for the dopt CUDA backend.

    The APIs in this module allow users to evaluate operation graphs on GPUs through the use of CUDA. There is also
    functionality to register CUDA implementations of custom operations.

    In future, this module will also have an interface allowing the user to register their own optimisation passes to
    be called when constructing a plan.

    Authors: Henry Gouk
*/
module dopt.cuda;

import std.exception;

import dopt.cuda.basic;
import dopt.cuda.nvrtc;
import dopt.cuda.math;
import dopt.cuda.nnet;
import dopt.cuda.random;
import dopt.core;

import derelict.cuda;

alias CUDAKernelCtr = CUDAKernel delegate(Operation op);

private __gshared
{
    CUdevice mDevice;
    CUcontext mContext;
}

/**
    Initialises the CUDA driver, creates a device context, registers all the kernels for the CUDA backend, and
    installs CUDA as the default evaluator and compiler.
*/
shared static this()
{
    try
    {
        DerelictCUDADriver.load();

        //Initialise CUDA and create a context, checking the driver API return codes
        enforce(cuInit(0) == CUDA_SUCCESS, "Failed to initialise the CUDA driver");
        enforce(cuDeviceGet(&mDevice, 0) == CUDA_SUCCESS, "Failed to get a handle to a CUDA device");
        enforce(cuCtxCreate(&mContext, 0, mDevice) == CUDA_SUCCESS, "Failed to create a CUDA context");

        //Initialise submodules
        dopt.cuda.basic.initialize();
        dopt.cuda.nvrtc.initialize();
        dopt.cuda.math.initialize();
        dopt.cuda.nnet.initialize();
        dopt.cuda.random.initialize();

        import std.functional : toDelegate;
        defaultEvaluator = toDelegate(&evaluateCUDA);
        defaultCompiler = (Operation[] ops) { return new CUDAPlan(ops); };
    }
    catch(Exception e)
    {
        //TODO: probably log something here
    }
}

/**
    Provides a common interface for CUDA kernels.
*/
interface CUDAKernel
{
    /**
        Runs the kernel with the given inputs and outputs.

        Params:
            inputs = An array of CUDABuffer objects, each corresponding to one of the dependencies of the operation
                     used to construct this kernel.
            output = The destination buffer.
    */
    void execute(const(CUDABuffer)[] inputs, CUDABuffer output);
}
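
/*
    For illustration, here is a minimal sketch of a custom kernel: a hypothetical "identity" operation whose kernel
    simply copies its first input into the output buffer with a device-to-device copy. Neither the operation nor the
    class below is part of dopt itself; it just shows the shape of a $(D CUDAKernel) implementation.

    ---
    class IdentityKernel : CUDAKernel
    {
        this(Operation op)
        {
            //A plain copy needs no per-operation state
        }

        void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            //Copy the first dependency's device buffer straight into the output buffer
            enforce(cuMemcpyDtoD(output.ptr, inputs[0].ptr, output.numBytes) == CUDA_SUCCESS,
                "Device-to-device copy failed");
        }
    }
    ---
*/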

/**
    A fallback kernel that evaluates an operation on the CPU, used when no CUDA kernel has been registered for the
    operation's type.
*/
private class CUDACPUKernel : CUDAKernel
{
    this(Operation op)
    {
        import std.algorithm : map;
        import std.array : array;

        //Create a copy of the operation that reads its dependencies from fresh variables
        mDeps = op
               .deps
               .map!(x => variable(x.outputType))
               .array();

        mOp = createOperation(op.opType, mDeps, op.attributes);
    }

    void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
    {
        import std.range : zip;
        import dopt.cpu : evaluateCPU;

        //Copy the device buffers into the host-side variables
        foreach(cudaInput, cpuInput; zip(inputs, mDeps))
        {
            cudaInput.get(cpuInput.value.as!ubyte);
        }

        //Evaluate the operation on the CPU and copy the result back to the device
        Buffer ret = evaluateCPU([mOp])[0];

        output.set(ret.as!ubyte);
    }

    Operation[] mDeps;
    Operation mOp;
}

private CUDAKernel cudaCPUCtr(Operation op)
{
    return new CUDACPUKernel(op);
}

/**
    A class that encapsulates the CUDA memory allocation/deallocation process.
*/
class CUDABuffer
{
    public
    {
        /**
            Constructs a CUDABuffer object and allocates memory on the CUDA device.

            Params:
                numBytes = The number of bytes to be allocated on the CUDA device.
        */
        static CUDABuffer create(size_t numBytes)
        {
            CUDABuffer ret = new CUDABuffer();
            ret.mNumBytes = numBytes;
            enforce(cuMemAlloc(&(ret.mPtr), ret.mNumBytes) == CUDA_SUCCESS, "CUDA memory allocation failed");
            enforce(cuMemsetD8(ret.mPtr, 0, ret.mNumBytes) == CUDA_SUCCESS,
                "CUDA default buffer initialisation failed");

            return ret;
        }

        /**
            Releases the CUDA resources used internally by a buffer.

            Params:
                buf = The buffer whose device memory should be freed.
        */
        static void destroy(CUDABuffer buf)
        {
            enforce(cuMemFree(buf.mPtr) == CUDA_SUCCESS, "Failed to free CUDA device memory.");
        }

        /**
            Copies data from the host to the device.

            Params:
                buf = An array of data to be copied to the device.
        */
        void set(const void[] buf)
        {
            enforce(buf.length == mNumBytes, "Input buffer is the wrong length.");
            enforce(cuMemcpyHtoD(mPtr, buf.ptr, buf.length) == CUDA_SUCCESS, "Failed to set contents of CUDA buffer");
        }

        /**
            Copies data from the device to the host.

            Params:
                buf = The buffer that the data from the CUDA device will be written to.
        */
        void get(void[] buf) const
        {
            enforce(buf.length == mNumBytes, "Output buffer is the wrong length.");
            enforce(cuMemcpyDtoH(buf.ptr, mPtr, buf.length) == CUDA_SUCCESS, "Failed to get contents of CUDA buffer");
        }

        /**
            Provides the size of the buffer allocated on the CUDA device.

            Returns:
                The number of bytes allocated on the CUDA device.
        */
        size_t numBytes() const
        {
            return mNumBytes;
        }

        /**
            Provides the device pointer.

            Returns:
                A CUDA device pointer.
        */
        inout(CUdeviceptr) ptr() inout
        {
            return mPtr;
        }
    }

    private
    {
        size_t mNumBytes;
        CUdeviceptr mPtr;

        this()
        {
            //
        }

        void zero()
        {
            enforce(cuMemsetD8(mPtr, 0, mNumBytes) == CUDA_SUCCESS, "CUDA zero buffer failed");
        }
    }
}
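
/*
    A brief usage sketch for $(D CUDABuffer): allocate device memory, upload host data, read it back, and free the
    allocation. D's implicit array-to-$(D void[]) conversion applies, so the byte length of each array must match the
    allocation size.

    ---
    float[] data = [1.0f, 2.0f, 3.0f, 4.0f];

    auto buf = CUDABuffer.create(data.length * float.sizeof);
    buf.set(data);

    auto readback = new float[data.length];
    buf.get(readback);

    CUDABuffer.destroy(buf);
    ---
*/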

/**
    A Plan stores all the resources (preallocated buffers, custom CUDA kernels) required to evaluate nodes from the
    Operation graph.

    An instance of CUDAPlan can be constructed directly from the array of operations it should evaluate. The primary
    use case for a CUDAPlan is when the same set of operations is likely to be evaluated more than once. This prevents
    the dopt CUDA runtime from reallocating buffers and reconstructing kernels every time the same set of operations
    is executed.
*/
class CUDAPlan : Plan
{
    public
    {
        long[string] profiler;

        this(Operation[] outputs)
        {
            import std.array : array;
            import std.functional : toDelegate;

            super(outputs);

            auto sortedOps = topologicalSort(outputs);

            //Construct a kernel for each operation, falling back to the CPU implementation where no CUDA kernel
            //constructor has been registered
            foreach(o; sortedOps)
            {
                if(o.opType == "variable" || o.opType == "reshape" || o.opType == "constant")
                {
                    continue;
                }

                auto k = mKernelCtrs.get(o.opType, toDelegate(&cudaCPUCtr));

                enforce(k !is null, "Could not construct a CUDA kernel for operation of type '" ~ o.opType ~ "'");

                mKernels[o] = k(o);
            }

            mOps = sortedOps.array;

            //Allocate a device buffer to hold the result of each operation
            foreach(o; mOps)
            {
                //For reshape operations, we will just reuse the buffer of o.deps[0]
                if(o.opType == "reshape")
                {
                    results[o] = results[o.deps[0]];
                }
                else
                {
                    results[o] = CUDABuffer.create(o.volume * o.elementType.sizeOf);

                    if(o.opType == "constant")
                    {
                        results[o].set(o.value.as!ubyte);
                    }
                }
            }

            results.rehash();
        }

        ~this()
        {
            cleanup();
        }

        /**
            Releases CUDA resources associated with this plan.
        */
        void cleanup()
        {
            if(clean)
            {
                return;
            }

            foreach(o; mOps)
            {
                if(o.opType != "reshape")
                {
                    CUDABuffer.destroy(results[o]);
                }
            }

            clean = true;
        }
    }

    protected
    {
        override void executeImpl(Buffer[Operation] args, Buffer[] rets)
        {
            import std.datetime.stopwatch : StopWatch;
            StopWatch sw;

            //Make sure all the args are variable assignments
            foreach(o; args.keys)
            {
                enforce(o.opType == "variable",
                    "All assignments in args must be for Operations with an opType of 'variable'");
            }

            //Load the args into their buffers
            foreach(k, v; args)
            {
                results[k].set(v.as!ubyte);
            }

            //Iterate through each operation and execute it
            foreach(o; mOps)
            {
                if(o.opType == "variable")
                {
                    //Variables that were not overridden by args are loaded from their current values
                    if(!(o in args))
                    {
                        sw.reset();
                        sw.start();

                        auto buf = cast(Buffer)o.value;
                        results[o].set(buf.as!ubyte);

                        sw.stop();

                        profiler["variable"] = profiler.get("variable", 0) + sw.peek.total!"usecs";
                    }

                    continue;
                }
                else if(o.opType == "reshape" || o.opType == "constant")
                {
                    continue;
                }

                //Get the input buffers
                CUDABuffer[] inputs;

                foreach(d; o.deps)
                {
                    inputs ~= results[d];
                }

                //Execute the operation
                sw.reset();
                sw.start();
                results[o].zero();
                mKernels[o].execute(inputs, results[o]);
                sw.stop();

                profiler[o.opType] = profiler.get(o.opType, 0) + sw.peek.total!"usecs";
            }

            //Copy the results of the output operations back to the host
            foreach(i, o; mOutputs)
            {
                results[o].get(rets[i].as!ubyte);
            }
        }
    }

    private
    {
        Operation[] mOps;
        CUDAKernel[Operation] mKernels;
        CUDABuffer[Operation] results;
        bool clean = false;
    }
}
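
/*
    A short sketch of the intended workflow for $(D CUDAPlan): construct the plan once, then execute it repeatedly so
    that kernels and device buffers are only set up a single time. $(D execute) is inherited from $(D Plan), and is
    called here with no variable assignments, as in $(D evaluateCUDA) below.

    ---
    auto a = float32([], [3.0f]);
    auto b = float32([], [4.0f]);
    auto y = a * b;

    auto plan = new CUDAPlan([y]);

    //Each call reuses the kernels and buffers constructed above
    auto result = plan.execute(null);

    plan.cleanup();
    ---
*/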

/**
    Used for performing a one-off evaluation of a set of operations.

    If you are planning to evaluate the same set of operations multiple times, but with different variable
    assignments, then you should construct a $(D CUDAPlan).

    Params:
        ops = The operations to be evaluated.
        args = A set of optional variable assignments.

    Returns:
        The result of evaluating $(D ops).
*/
Buffer[] evaluateCUDA(Operation[] ops, Buffer[Operation] args = null)
{
    auto p = new CUDAPlan(ops);

    auto ret = p.execute(args);

    p.cleanup();

    return ret;
}

/**
    A convenience overload that evaluates a single operation and returns a single $(D Buffer).

    Params:
        op = The operation to be evaluated.
        args = A set of optional variable assignments.

    Returns:
        The result of evaluating $(D op).
*/
Buffer evaluateCUDA(Operation op, Buffer[Operation] args = null)
{
    return evaluateCUDA([op], args)[0];
}

/**
    Registers a CUDA kernel constructor for a given operation type.

    Params:
        opName = The type of operation this kernel constructor caters to.
        kernelCtr = The constructor that should be associated with operations of the type $(D opName).
*/
void registerCUDAKernel(string opName, CUDAKernelCtr kernelCtr)
{
    enforce((opName in mKernelCtrs) is null,
        "A CUDAKernelCtr is already registered for the operation '" ~ opName ~ "'");

    mKernelCtrs[opName] = kernelCtr;
}
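
/*
    Continuing the hypothetical $(D IdentityKernel) sketch from earlier, registration ties a kernel constructor to an
    operation type; a delegate literal serves as the constructor:

    ---
    registerCUDAKernel("identity", (Operation op) => new IdentityKernel(op));
    ---
*/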

/**
    Deregisters a kernel constructor associated with the given operation type.

    Params:
        opType = The operation type that should have its kernel deregistered.
*/
void deregisterCUDAKernel(string opType)
{
    mKernelCtrs.remove(opType);
}

/**
    Provides a list of all operation types supported by the CUDA backend.

    Returns:
        A string array of the supported operation types.
*/
string[] listCUDAOperations()
{
    //"variable", "reshape", and "constant" are handled directly by CUDAPlan rather than by registered kernels
    return mKernelCtrs.keys ~ ["variable", "reshape", "constant"];
}

package
{
    string cudaType(DataType t)
    {
        switch(t)
        {
            case DataType.float32:
                return "float";

            case DataType.int32:
                return "int";

            default:
                import std.conv : to;
                assert(0, "DataType '" ~ t.to!string ~ "' is not currently supported by the CUDA backend");
        }
    }
}

private
{
    CUDAKernelCtr[string] mKernelCtrs;
}

unittest
{
    auto a = float32([], [3.0f]);
    auto b = float32([], [4.0f]);
    auto c = float32([], [-1.0f]);

    auto y = a * b + c;

    assert(evaluateCUDA(y).as!float[0] == 11.0f);
}