/**
    This is the main interface for the dopt CUDA backend.

    The APIs in this module allow users to evaluate operation graphs on GPUs through the use of CUDA. There is also
    functionality to register CUDA implementations of custom operations.

    In future, this module will also have an interface allowing the user to register their own optimisation passes to
    be called when constructing a plan.

    Authors: Henry Gouk
*/
module dopt.cuda;

import std.exception;

import dopt.cuda.basic;
import dopt.cuda.nvrtc;
import dopt.cuda.math;
import dopt.cuda.nnet;
import dopt.cuda.random;
import dopt.core;

import derelict.cuda;

alias CUDAKernelCtr = CUDAKernel delegate(Operation op);

private __gshared
{
    CUdevice mDevice;
    CUcontext mContext;
}

/**
    Registers all the kernels for the CUDA backend
*/
shared static this()
{
    try
    {
        DerelictCUDADriver.load();

        //Initialise CUDA and create a context
        cuInit(0);
        cuDeviceGet(&mDevice, 0);
        cuCtxCreate(&mContext, 0, mDevice);

        //Initialize submodules
        dopt.cuda.basic.initialize();
        dopt.cuda.nvrtc.initialize();
        dopt.cuda.math.initialize();
        dopt.cuda.nnet.initialize();
        dopt.cuda.random.initialize();

        import std.functional : toDelegate;
        defaultEvaluator = toDelegate(&evaluateCUDA);
        defaultCompiler = (Operation[] ops) { return new CUDAPlan(ops); };
        defaultVarAllocator = (size_t numBytes) { return CUDABuffer.create(numBytes); };
    }
    catch(Exception e)
    {
        //TODO: probably log something here
    }
}

/**
    Provides a common interface for CUDA kernels.
*/
interface CUDAKernel
{
    /**
        Runs the kernel with the given inputs and outputs.

        Params:
            inputs = An array of CUDABuffer objects, each corresponding to one of the dependencies of the operation
                     used to construct this kernel.
            output = The destination buffer.
    */
    void execute(const(CUDABuffer)[] inputs, CUDABuffer output);
}

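/*
    Fallback kernel used when no CUDA implementation is registered for an operation type: it copies the inputs to
    CPU-side variables, evaluates the operation with the dopt.cpu backend, and copies the result back into the
    output buffer.
*/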
private class CUDACPUKernel : CUDAKernel
{
    this(Operation op)
    {
        import std.algorithm : map;
        import std.array : array;

        mDeps = op
               .deps
               .map!(x => variable(x.outputType))
               .array();

        mOp = createOperation(op.opType, mDeps, op.attributes);
    }

    void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
    {
        import std.range : zip;
        import dopt.cpu : evaluateCPU;

        foreach(cudaInput, cpuInput; zip(inputs, mDeps))
        {
            cpuInput.value.set(cudaInput);
        }

        DeviceBuffer ret = evaluateCPU([mOp])[0];

        output.set(ret);
    }

    DeviceBuffer[] mInputs;
    Operation[] mDeps;
    Operation mOp;
}

private CUDAKernel cudaCPUCtr(Operation op)
{
    return new CUDACPUKernel(op);
}

/**
    A class that encapsulates the CUDA memory allocation/deallocation process.
*/
class CUDABuffer : DeviceBuffer
{
    public
    {
        /**
            Constructs a CUDABuffer object and allocates memory on the CUDA device.

            Params:
                numBytes = The number of bytes to be allocated on the CUDA device.
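
            Examples:
            A minimal sketch of the allocate/copy/free cycle, using only the methods defined on this class:
            ---
            //Allocate 16 bytes of zero-initialised device memory
            auto buf = CUDABuffer.create(16);

            //Copy four floats from the host to the device...
            float[] host = [1.0f, 2.0f, 3.0f, 4.0f];
            buf.set(host);

            //...and read them back again
            auto readback = new float[4];
            buf.get(readback);

            //Release the device memory once it is no longer needed
            CUDABuffer.destroy(buf);
            ---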
        */
        static CUDABuffer create(size_t numBytes)
        {
            import core.memory : GC;
            import std.conv : to;

            //Rely on the GC to run some finalisers to free CUDA memory. I know this is bad please help.
            GC.collect();

            CUDABuffer ret = new CUDABuffer();

            if(numBytes == 0)
            {
                return ret;
            }

            ret.mNumBytes = numBytes;
            enforce(cuMemAlloc(&(ret.mPtr), ret.mNumBytes) == CUDA_SUCCESS,
                "CUDA memory allocation failed: unable to allocate " ~ numBytes.to!string ~ " bytes");
            enforce(cuMemsetD8(ret.mPtr, 0, ret.mNumBytes) == CUDA_SUCCESS,
                "CUDA default buffer initialisation failed");

            return ret;
        }

        /**
            Releases the CUDA resources used by buf internally.
        */
        static void destroy(CUDABuffer buf)
        {
            enforce(cuMemFree(buf.mPtr) == CUDA_SUCCESS, "Failed to free CUDA device memory.");
        }

        /**
            Copies data from the host to the device.

            Params:
                buf = An array of data to be copied to the device.
        */
        override void set(const void[] buf)
        {
            enforce(buf.length == mNumBytes, "input buffer is the wrong length.");
            enforce(cuMemcpyHtoD(mPtr, buf.ptr, buf.length) == CUDA_SUCCESS, "Failed to set contents of CUDA buffer");
        }

        override void set(const DeviceBuffer buf)
        {
            import dopt.cpu : CPUBuffer;

            enforce(numBytes == buf.numBytes, "Mismatch in buffer size");

            auto cubuf = cast(CUDABuffer)buf;
            auto cpubuf = cast(CPUBuffer)buf;

            if(cubuf !is null)
            {
                cuMemcpyDtoD(mPtr, cubuf.ptr, numBytes);
            }
            else if(cpubuf !is null)
            {
                cuMemcpyHtoD(mPtr, cpubuf.raw.ptr, numBytes);
            }
            else
            {
                super.set(buf);
            }
        }

        /**
            Copies data from the device to the host.

            Params:
                buf = The buffer that the data from the CUDA device will be written to.
        */
        override void get(void[] buf) const
        {
            enforce(buf.length == mNumBytes, "output buffer is the wrong length.");
            enforce(cuMemcpyDtoH(buf.ptr, mPtr, buf.length) == CUDA_SUCCESS, "Failed to get contents of CUDA buffer");
        }

        /**
            Provides the size of the buffer allocated on the CUDA device.

            Returns:
                The number of bytes allocated on the CUDA device.
        */
        override size_t numBytes() const
        {
            return mNumBytes;
        }

        /**
            Provides the device pointer.

            Returns:
                A CUDA device pointer.
        */
        inout(CUdeviceptr) ptr() inout
        {
            return mPtr;
        }
    }

    private
    {
        size_t mNumBytes;
        CUdeviceptr mPtr;

        this()
        {
            //
        }

        void zero()
        {
            enforce(cuMemsetD8(mPtr, 0, mNumBytes) == CUDA_SUCCESS, "CUDA zero buffer failed");
        }
    }
}

/**
    A Plan stores all the resources (preallocated buffers, custom CUDA kernels) required to evaluate nodes from the
    Operation graph.

    An instance of Plan can be constructed using the $(D compileCUDA) function. The primary use case for a CUDAPlan is
    when the same set of operations is likely to be evaluated more than once. Reusing a plan saves the dopt CUDA
    runtime from reallocating buffers and reconstructing the CUDA kernels every time the same set of operations is
    executed.
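
    Examples:
    A minimal sketch of reusing a plan, based on the graph from this module's unittest. It assumes the $(D float32)
    helper from $(D dopt.core), and passes $(D null) for the variable assignments in the same way $(D evaluateCUDA)
    does.
    ---
    auto a = float32([], [3.0f]);
    auto b = float32([], [4.0f]);
    auto y = a * b;

    //Compile the graph once...
    auto plan = new CUDAPlan([y]);

    //...then execute it as many times as required without reconstructing the kernels
    auto first = plan.execute(null);
    auto second = plan.execute(null);
    ---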
*/
class CUDAPlan : Plan
{
    public
    {
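        /**
            Wall-clock time, in microseconds, spent executing kernels, accumulated per operation type over all
            executions of this plan.
        */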
        long[string] profiler;

        this(Operation[] outputs)
        {
            import std.algorithm : canFind, filter;
            import std.array : array;
            import std.functional : toDelegate;

            super(outputs);

            auto sortedOps = topologicalSort(outputs);

            foreach(o; sortedOps)
            {
                if(o.opType == "variable" || o.opType == "reshape" || o.opType == "constant")
                {
                    continue;
                }

                auto k = mKernelCtrs.get(o.opType, toDelegate(&cudaCPUCtr));

                enforce(k !is null, "Could not construct a CUDA kernel for operation of type '" ~ o.opType ~ "'");

                mKernels[o] = k(o);
            }

            mOps = sortedOps.array;

            foreach(o; mOps)
            {
                if(o.opType == "reshape")
                {
                    //This will be overwritten in executeImpl, but we want a slot in the hashmap for it now.
                    mResults[o] = mResults[o.deps[0]];
                }
                else
                {
                    mResults[o] = CUDABuffer.create(o.volume * o.elementType.sizeOf);

                    if(o.opType == "constant")
                    {
                        mResults[o].set(o.value);
                    }
                }
            }

            mResults.rehash();
        }

        ~this()
        {
            cleanup();
        }

        /**
            Releases CUDA resources associated with this plan.
        */
        void cleanup()
        {
            if(mClean)
            {
                return;
            }

            foreach(o; mOps)
            {
                if(o.opType != "reshape")
                {
                    CUDABuffer.destroy(mResults[o]);
                }
            }

            mClean = true;
        }
    }

    protected
    {
        override void executeImpl(DeviceBuffer[Operation] args, DeviceBuffer[] rets)
        {
            import std.datetime.stopwatch : StopWatch;
            StopWatch sw;

            //Make sure all the args are variable assignments. Is this arbitrary?
            foreach(o; args.keys)
            {
                enforce(o.opType == "variable",
                    "All assignments in args must be for Operations with an opType of 'variable'");
            }

            //Iterate through each operation and execute it
            foreach(o; mOps)
            {
                if(o.opType == "variable" || o.opType == "constant")
                {
                    continue;
                }

                //Get the input buffers
                CUDABuffer[] inputs;
                CUDABuffer output = mResults[o];

                foreach(d; o.deps)
                {
                    if(d.opType == "variable")
                    {
                        CUDABuffer cubuf;

                        if(d in args)
                        {
                            cubuf = cast(CUDABuffer)args[d];

                            if(cubuf is null)
                            {
                                cubuf = mResults[d];
                                cubuf.set(args[d]);
                            }
                        }
                        else
                        {
                            cubuf = cast(CUDABuffer)d.value;

                            if(cubuf is null)
                            {
                                cubuf = mResults[d];
                                cubuf.set(d.value);
                            }
                        }

                        inputs ~= cubuf;
                    }
                    else
                    {
                        inputs ~= mResults[d];
                    }
                }

                if(o.opType == "reshape")
                {
                    mResults[o] = inputs[0];
                }
                else
                {
                    //Execute the operation
                    sw.reset();
                    sw.start();
                    mKernels[o].execute(inputs, output);
                    sw.stop();

                    profiler[o.opType] = profiler.get(o.opType, 0) + sw.peek.total!"usecs";
                }
            }

            foreach(i, o; mOutputs)
            {
                rets[i].set(mResults[o]);
            }
        }
    }

    private
    {
        Operation[] mOps;
        CUDAKernel[Operation] mKernels;
        CUDABuffer[Operation] mResults;
        bool mClean = false;
    }
}

/**
    Used for performing a one-off evaluation of a set of operations.

    If you are planning to evaluate the same set of operations multiple times, but with different variable assignments,
    then you should construct a $(D CUDAPlan).

    Params:
        ops = The operations to be evaluated.
        args = A set of optional variable assignments.

    Returns:
        The result of evaluating $(D ops).
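
    Examples:
    A short end-to-end sketch, mirroring this module's unittest:
    ---
    auto a = float32([], [3.0f]);
    auto b = float32([], [4.0f]);
    auto c = float32([], [-1.0f]);

    auto y = a * b + c;

    //Evaluates to 3 * 4 + (-1) = 11
    auto result = evaluateCUDA([y])[0];
    ---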
*/
DeviceBuffer[] evaluateCUDA(Operation[] ops, DeviceBuffer[Operation] args = null)
{
    auto p = new CUDAPlan(ops);

    auto ret = p.execute(args);

    return ret;
}

/**
    A convenience overload that evaluates a single operation and returns a single $(D DeviceBuffer).

    Params:
        op = The operation to be evaluated.
        args = A set of optional variable assignments.

    Returns:
        The result of evaluating $(D op)
*/
DeviceBuffer evaluateCUDA(Operation op, DeviceBuffer[Operation] args = null)
{
    return evaluateCUDA([op], args)[0];
}

/**
    Registers a CUDA kernel constructor for a given operation type.

    Params:
        opName = The type of operation this kernel constructor caters to.
        kernelCtr = The kernel constructor that should be associated with operations of type $(D opName).
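
    Examples:
    A minimal sketch of supplying a kernel for a custom operation type. The operation type $(D "myOp") is hypothetical
    and would need to be registered with $(D dopt.core) separately; the kernel below simply copies its first input to
    the output buffer.
    ---
    class CopyKernel : CUDAKernel
    {
        this(Operation op)
        {
            //Inspect op.attributes here if the kernel needs them
        }

        void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            output.set(inputs[0]);
        }
    }

    registerCUDAKernel("myOp", delegate CUDAKernel(Operation op) { return new CopyKernel(op); });
    ---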
*/
void registerCUDAKernel(string opName, CUDAKernelCtr kernelCtr)
{
    enforce((opName in mKernelCtrs) is null,
        "A CUDAKernelCtr is already registered for the operation '" ~ opName ~ "'");

    mKernelCtrs[opName] = kernelCtr;
}

/**
    Deregisters a kernel constructor associated with the given operation type.

    Params:
        opType = The operation type that should have its kernel deregistered.
*/
void deregisterCUDAKernel(string opType)
{
    mKernelCtrs.remove(opType);
}

/**
    Provides a list of all operation types supported by the CUDA backend.

    Returns:
        A string array of the operation types that have kernels registered.
*/
string[] listCUDAOperations()
{
    return mKernelCtrs.keys ~ ["variable", "reshape"];
}

package
{
    string cudaType(DataType t)
    {
        switch(t)
        {
            case DataType.float32:
                return "float";

            case DataType.int32:
                return "int";

            default:
                import std.conv : to;
                assert(0, "DataType '" ~ t.to!string ~ "' is not currently supported by the CUDA backend");
        }
    }
}

private
{
    CUDAKernelCtr[string] mKernelCtrs;
}

unittest
{
    auto a = float32([], [3.0f]);
    auto b = float32([], [4.0f]);
    auto c = float32([], [-1.0f]);

    auto y = a * b + c;

    assert(evaluateCUDA(y).get!float[0] == 11.0f);
}