module dopt.core.cuda.nnet.cudnn7;

import std.algorithm;
import std.array;
import std.functional;

import dopt.core.cuda;
import dopt.core.ops;

import derelict.cuda;
import derelict.cudnn7;

package
{
    void initializeCuDNN7()
    {
        DerelictCuDNN7.load();

        registerCUDAKernel("convolution", toDelegate(&cudaKernelCtr!ConvolutionForward));
        registerCUDAKernel("convolutionFeaturesGrad", toDelegate(&cudaKernelCtr!ConvolutionFeaturesGrad));
        registerCUDAKernel("convolutionFiltersGrad", toDelegate(&cudaKernelCtr!ConvolutionFiltersGrad));
        registerCUDAKernel("maxpool", toDelegate(&cudaKernelCtr!MaxpoolForward));
        registerCUDAKernel("maxpoolGrad", toDelegate(&cudaKernelCtr!MaxpoolGrad));
        registerCUDAKernel("softmax", toDelegate(&cudaKernelCtr!Softmax));
        registerCUDAKernel("softmaxGrad", toDelegate(&cudaKernelCtr!SoftmaxGrad));
        registerCUDAKernel("addBias", toDelegate(&cudaKernelCtr!AddBias));
        registerCUDAKernel("addBiasGrad", toDelegate(&cudaKernelCtr!AddBiasGrad));
        registerCUDAKernel("batchNormTrain", toDelegate(&cudaKernelCtr!BatchNormTrain));
        registerCUDAKernel("batchNormGrad", toDelegate(&cudaKernelCtr!BatchNormGrad));

        cudnnCreate(&handle).cudnnCheck();
    }
}

private
{
    cudnnHandle_t handle;

    /**
        Checks the return code of a cuDNN API call, throwing an exception if it
        did not return CUDNN_STATUS_SUCCESS.
    */
    void cudnnCheck(cudnnStatus_t status, string mod = __MODULE__, size_t line = __LINE__)
    {
        import std.conv : to;
        import std.exception : enforce;
        enforce(status == CUDNN_STATUS_SUCCESS, mod ~ "(" ~ line.to!string ~ "): Failed to execute cuDNN function." ~
            " Error code: " ~ status.to!string);
    }
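
    /*
        Every cuDNN call in this module is checked by chaining it through UFCS,
        e.g.

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();

        so a non-success status surfaces as an exception carrying the call
        site's module and line number.
    */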

    CUDAKernel cudaKernelCtr(K)(Operation op)
    {
        return new K(op);
    }
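
    /*
        The instantiations of cudaKernelCtr registered in initializeCuDNN7 act
        as factories: registerCUDAKernel is handed a delegate that takes an
        Operation and returns a CUDAKernel, and toDelegate(&cudaKernelCtr!K)
        produces exactly that for each kernel class K defined below.
    */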

    class ConvolutionBase : CUDAKernel
    {
        this(Operation op, int[] inShape, int[] filterShape, int[] outShape)
        {
            mOp = op;

            auto padding = op.attributes["padding"].get!(size_t[]);
            int padH = cast(int)padding[0];
            int padW = cast(int)padding[1];

            auto stride = op.attributes["stride"].get!(size_t[]);
            int strideY = cast(int)stride[0];
            int strideX = cast(int)stride[1];

            // Dilated convolutions are not exposed yet, so the dilation is fixed at one.
            int dilY = 1;
            int dilX = 1;

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateFilterDescriptor(&wDesc).cudnnCheck();
            cudnnCreateConvolutionDescriptor(&convDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&yDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, inShape[0], inShape[1], inShape[2],
                inShape[3]).cudnnCheck();
            cudnnSetFilter4dDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, filterShape[0], filterShape[1],
                filterShape[2], filterShape[3]).cudnnCheck();
            cudnnSetConvolution2dDescriptor(convDesc, padH, padW, strideY, strideX, dilY, dilX, CUDNN_CONVOLUTION,
                CUDNN_DATA_FLOAT).cudnnCheck();
            cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, outShape[0], outShape[1],
                outShape[2], outShape[3]).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyFilterDescriptor(wDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(yDesc).cudnnCheck();
            cudnnDestroyConvolutionDescriptor(convDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
        }

        abstract void execute(const(CUDABuffer)[] inputs, CUDABuffer output);

        Operation mOp;
        cudnnTensorDescriptor_t xDesc;
        cudnnFilterDescriptor_t wDesc;
        cudnnConvolutionDescriptor_t convDesc;
        cudnnTensorDescriptor_t yDesc;
    }
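
    /*
        For reference: with the descriptors above, cuDNN computes the output
        spatial dimensions as

            outH = (inH + 2 * padH - dilY * (filterH - 1) - 1) / strideY + 1

        (and analogously for the width), so the outShape supplied by each
        subclass must already satisfy this relation.
    */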

    // A single workspace shared by all of the convolution kernels. It only ever
    // grows, so it is always large enough for the biggest request seen so far.
    CUDABuffer mWorkspace;
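
    /*
        The convolution kernels below share this grow-only workspace; this
        helper reallocates it whenever a larger request arrives.
    */
    void ensureWorkspace(size_t numBytes)
    {
        // Discard the current workspace if it is too small for this request.
        if(mWorkspace !is null && numBytes > mWorkspace.numBytes)
        {
            CUDABuffer.destroy(mWorkspace);
            mWorkspace = null;
        }

        if(mWorkspace is null)
        {
            mWorkspace = CUDABuffer.create(numBytes);
        }
    }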

    class ConvolutionForward : ConvolutionBase
    {
        private cudnnConvolutionFwdAlgo_t mAlgo;

        this(Operation op)
        {
            auto inShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();
            auto filterShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);

            // cuDNN returns the benchmarked algorithms sorted by execution time,
            // so algoPerfs[0] is the fastest one available.
            cudnnConvolutionFwdAlgoPerf_t[9] algoPerfs;
            int numAlgos;
            cudnnFindConvolutionForwardAlgorithm(handle, xDesc, wDesc, convDesc, yDesc, cast(int)algoPerfs.length,
                &numAlgos, algoPerfs.ptr).cudnnCheck();

            ensureWorkspace(algoPerfs[0].memory);

            mAlgo = algoPerfs[0].algo;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[0].ptr;
            auto w = cast(void *)inputs[1].ptr;
            auto y = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnConvolutionForward(handle, &alpha, xDesc, x, wDesc, w, convDesc, mAlgo, cast(void *)mWorkspace.ptr,
                mWorkspace.numBytes, &beta, yDesc, y).cudnnCheck();

            cuCtxSynchronize();
        }
    }

    class ConvolutionFeaturesGrad : ConvolutionBase
    {
        private cudnnConvolutionBwdDataAlgo_t mAlgo;

        this(Operation op)
        {
            // The descriptors describe the forward pass: xDesc is the features
            // gradient produced here, and yDesc is the incoming output gradient.
            auto inShape = op.shape.map!(x => cast(int)x).array();
            auto filterShape = op.deps[1].shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[0].shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);

            cudnnConvolutionBwdDataAlgoPerf_t[9] algoPerfs;
            int numAlgos;
            cudnnFindConvolutionBackwardDataAlgorithm(handle, wDesc, yDesc, convDesc, xDesc,
                cast(int)algoPerfs.length, &numAlgos, algoPerfs.ptr).cudnnCheck();

            ensureWorkspace(algoPerfs[0].memory);

            mAlgo = algoPerfs[0].algo;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto w = cast(void *)inputs[1].ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto dx = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnConvolutionBackwardData(handle, &alpha, wDesc, w, yDesc, dy, convDesc, mAlgo,
                cast(void *)mWorkspace.ptr, mWorkspace.numBytes, &beta, xDesc, dx).cudnnCheck();

            cuCtxSynchronize();
        }
    }

    class ConvolutionFiltersGrad : ConvolutionBase
    {
        private cudnnConvolutionBwdFilterAlgo_t mAlgo;

        this(Operation op)
        {
            auto inShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();
            auto filterShape = op.outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);

            cudnnConvolutionBwdFilterAlgoPerf_t[9] algoPerfs;
            int numAlgos;
            cudnnFindConvolutionBackwardFilterAlgorithm(handle, xDesc, yDesc, convDesc, wDesc,
                cast(int)algoPerfs.length, &numAlgos, algoPerfs.ptr).cudnnCheck();

            ensureWorkspace(algoPerfs[0].memory);

            mAlgo = algoPerfs[0].algo;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[1].ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto dw = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnConvolutionBackwardFilter(handle, &alpha, xDesc, x, yDesc, dy, convDesc, mAlgo,
                cast(void *)mWorkspace.ptr, mWorkspace.numBytes, &beta, wDesc, dw).cudnnCheck();

            cuCtxSynchronize();
        }
    }

    class MaxpoolBase : CUDAKernel
    {
        this(Operation op, int[] inShape, int[] outShape)
        {
            auto dims = op.attributes["dims"].get!(size_t[]);
            auto poolShape = dims.map!(x => cast(int)x).array();

            // The stride equals the window size, so the pooling regions do not overlap.
            auto poolStride = poolShape.dup;

            cudnnCreatePoolingDescriptor(&poolingDesc).cudnnCheck();
            cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, 1 /* CUDNN_PROPAGATE_NAN */, poolShape[0],
                poolShape[1], 0, 0, poolStride[0], poolStride[1]).cudnnCheck();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&yDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, inShape[0], inShape[1], inShape[2],
                inShape[3]).cudnnCheck();
            cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, outShape[0], outShape[1],
                outShape[2], outShape[3]).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyPoolingDescriptor(poolingDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(yDesc).cudnnCheck();
        }

        abstract void execute(const(CUDABuffer)[] inputs, CUDABuffer output);

        cudnnPoolingDescriptor_t poolingDesc;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t yDesc;
    }
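
    /*
        With zero padding and the stride equal to the window, the pooled output
        size reduces to

            outH = (inH - windowH) / windowH + 1

        per spatial dimension; the shapes supplied by the subclasses below are
        expected to encode exactly this.
    */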

    class MaxpoolForward : MaxpoolBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[0].ptr;
            auto y = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnPoolingForward(handle, poolingDesc, &alpha, xDesc, x, &beta, yDesc, y).cudnnCheck();

            cuCtxSynchronize();
        }
    }

    class MaxpoolGrad : MaxpoolBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[2].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            // The inputs are ordered [dy, y, x], matching the operation's dependencies.
            auto dx = cast(void *)output.ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto y = cast(void *)inputs[1].ptr;
            auto x = cast(void *)inputs[2].ptr;
            float alpha = 1;
            float beta = 0;

            cudnnPoolingBackward(handle, poolingDesc, &alpha, yDesc, y, yDesc, dy, xDesc, x, &beta, xDesc, dx)
                .cudnnCheck();

            cuCtxSynchronize();
        }
    }

    class Softmax : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();
            auto vol = 1;

            for(size_t i = 2; i < shape.length; i++)
            {
                vol *= shape[i];
            }

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], vol, 1)
                .cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto x = cast(void *)inputs[0].ptr;
            auto y = cast(void *)output.ptr;

            cudnnSoftmaxForward(handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, desc, x, &beta,
                desc, y).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
    }
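
    /*
        Both softmax kernels describe the tensor as (N, C, prod(trailing dims), 1)
        and use CUDNN_SOFTMAX_MODE_CHANNEL, so the normalisation always runs
        over dimension 1 regardless of the operand's actual rank.
    */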

    class SoftmaxGrad : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto dy = cast(void *)inputs[0].ptr;
            auto y = cast(void *)inputs[1].ptr;
            auto dx = cast(void *)output.ptr;

            cudnnSoftmaxBackward(handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, desc, y, desc, dy,
                &beta, desc, dx).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
    }

    class AddBias : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&cDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(cDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();

            cudnnCreateTensorDescriptor(&aDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(aDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, shape[1], 1, 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(cDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(aDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            // Copy the input activations into the output buffer, then add the
            // bias to it in place.
            cuMemcpy(output.ptr, inputs[0].ptr, output.numBytes);

            float alpha = 1;
            float beta = 1;

            cudnnAddTensor(handle, &alpha, aDesc, cast(void *)inputs[1].ptr, &beta, cDesc, cast(void *)output.ptr)
                .cudnnCheck();
        }

        cudnnTensorDescriptor_t cDesc;
        cudnnTensorDescriptor_t aDesc;
    }
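
    /*
        aDesc describes the bias as a (1, C, 1, 1) tensor while cDesc describes
        the full activation, so cudnnAddTensor broadcasts the bias across the
        batch and spatial dimensions.
    */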

    class AddBiasGrad : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.deps[0].shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&dyDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(dyDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();

            cudnnCreateTensorDescriptor(&dbDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(dbDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, shape[1], 1, 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(dyDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(dbDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 1.0f;

            // Reduces dy over the batch and spatial dimensions. Note that beta
            // is one, so the result is accumulated into the output buffer.
            cudnnConvolutionBackwardBias(handle, &alpha, dyDesc, cast(void *)inputs[0].ptr, &beta, dbDesc,
                cast(void *)output.ptr).cudnnCheck();
        }

        cudnnTensorDescriptor_t dyDesc;
        cudnnTensorDescriptor_t dbDesc;
    }

    class BatchNormTrain : CUDAKernel
    {
        this(Operation op)
        {
            if(op.rank == 2)
            {
                mode = CUDNN_BATCHNORM_PER_ACTIVATION;
            }
            else
            {
                mode = CUDNN_BATCHNORM_SPATIAL;
            }

            import std.range;

            // Pad the shape with trailing ones to give it rank four.
            auto shape = op.deps[0].shape
                        .chain(repeat(1))
                        .map!(x => cast(int)x)
                        .take(4)
                        .array();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&bnDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], shape[2],
                shape[3]).cudnnCheck();
            cudnnDeriveBNTensorDescriptor(bnDesc, xDesc, mode).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(bnDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            // The inputs are [x, scale, bias]. A zero exponential average factor
            // and null pointers mean no running statistics are maintained.
            cudnnBatchNormalizationForwardTraining(handle, mode, &alpha, &beta, xDesc,
                cast(void *)inputs[0].ptr, xDesc, cast(void *)output.ptr, bnDesc, cast(void *)inputs[1].ptr,
                cast(void *)inputs[2].ptr, 0, null, null, 1e-5f, null, null).cudnnCheck();
        }

        cudnnBatchNormMode_t mode;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t bnDesc;
    }
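
    /*
        CUDNN_BATCHNORM_PER_ACTIVATION keeps one mean/variance pair per
        activation and is used for rank-2 (fully connected) operands, whereas
        CUDNN_BATCHNORM_SPATIAL keeps one pair per channel and is used for
        convolutional operands.
    */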

    class BatchNormGrad : CUDAKernel
    {
        this(Operation op)
        {
            if(op.deps[1].rank == 2)
            {
                mode = CUDNN_BATCHNORM_PER_ACTIVATION;
            }
            else
            {
                mode = CUDNN_BATCHNORM_SPATIAL;
            }

            import std.range;

            // Pad the shape with trailing ones to give it rank four.
            auto shape = op.deps[1].shape
                        .chain(repeat(1))
                        .map!(x => cast(int)x)
                        .take(4)
                        .array();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&bnDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], shape[2],
                shape[3]).cudnnCheck();
            cudnnDeriveBNTensorDescriptor(bnDesc, xDesc, mode).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(bnDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            // The inputs are [dy, x, scale], and the output buffer packs the
            // three gradients contiguously as [dx, dscale, dbias].
            void *dx = cast(void *)(output.ptr);
            void *dscale = cast(void *)(output.ptr + inputs[1].numBytes);
            void *dbias = cast(void *)(output.ptr + inputs[1].numBytes + inputs[2].numBytes);

            cudnnBatchNormalizationBackward(handle, mode, &alpha, &beta, &alpha, &beta, xDesc,
                cast(void *)inputs[1].ptr, xDesc, cast(void *)inputs[0].ptr, xDesc, dx, bnDesc,
                cast(void *)inputs[2].ptr, dscale, dbias, 1e-5f, null, null).cudnnCheck();
        }

        cudnnBatchNormMode_t mode;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t bnDesc;
    }
}