module dopt.core.cuda.nnet.cudnn5;

import std.algorithm;
import std.array;
import std.functional;

import dopt.core.cuda;
import dopt.core.ops;

import derelict.cuda;
import derelict.cudnn5;

package
{
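    /// Loads the cuDNN 5 shared library, registers the cuDNN-backed kernel
    /// constructors with the CUDA backend, and creates the cuDNN handle shared
    /// by every kernel in this module.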
    void initializeCuDNN5()
    {
        DerelictCuDNN5.load();

        registerCUDAKernel("convolution", toDelegate(&cudaKernelCtr!ConvolutionForward));
        registerCUDAKernel("convolutionFeaturesGrad", toDelegate(&cudaKernelCtr!ConvolutionFeaturesGrad));
        registerCUDAKernel("convolutionFiltersGrad", toDelegate(&cudaKernelCtr!ConvolutionFiltersGrad));
        registerCUDAKernel("maxpool", toDelegate(&cudaKernelCtr!MaxpoolForward));
        registerCUDAKernel("maxpoolGrad", toDelegate(&cudaKernelCtr!MaxpoolGrad));
        registerCUDAKernel("softmax", toDelegate(&cudaKernelCtr!Softmax));
        registerCUDAKernel("softmaxGrad", toDelegate(&cudaKernelCtr!SoftmaxGrad));
        registerCUDAKernel("addBias", toDelegate(&cudaKernelCtr!AddBias));
        registerCUDAKernel("addBiasGrad", toDelegate(&cudaKernelCtr!AddBiasGrad));
        registerCUDAKernel("batchNormTrain", toDelegate(&cudaKernelCtr!BatchNormTrain));
        registerCUDAKernel("batchNormGrad", toDelegate(&cudaKernelCtr!BatchNormGrad));

        cudnnCreate(&handle).cudnnCheck();
    }
}

private
{
    cudnnHandle_t handle;

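    /// Checks the return status of a cuDNN call and throws an Exception,
    /// annotated with the call site, if the call did not succeed.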
    void cudnnCheck(cudnnStatus_t status, string mod = __MODULE__, size_t line = __LINE__)
    {
        import std.conv : to;
        import std.exception : enforce;
        enforce(status == CUDNN_STATUS_SUCCESS, mod ~ "(" ~ line.to!string ~ "): Failed to execute cuDNN function." ~
            " Error code: " ~ status.to!string);
    }

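    /// Constructs a kernel of type K for the given operation. Instantiations of
    /// this template are registered as the kernel constructors for the cuDNN ops.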
    CUDAKernel cudaKernelCtr(K)(Operation op)
    {
        return new K(op);
    }

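    /// Common functionality for the convolution kernels: creates and configures
    /// the input, filter, convolution, and output descriptors from the operation's
    /// "padding" and "stride" attributes, and destroys them when the kernel dies.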
    class ConvolutionBase : CUDAKernel
    {
        this(Operation op, int[] inShape, int[] filterShape, int[] outShape)
        {
            mOp = op;

            auto padding = op.attributes["padding"].get!(size_t[]);
            int padH = cast(int)padding[0];
            int padW = cast(int)padding[1];

            auto stride = op.attributes["stride"].get!(size_t[]);
            int strideY = cast(int)stride[0];
            int strideX = cast(int)stride[1];

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateFilterDescriptor(&wDesc).cudnnCheck();
            cudnnCreateConvolutionDescriptor(&convDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&yDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, inShape[0], inShape[1], inShape[2],
                inShape[3]).cudnnCheck();
            cudnnSetFilter4dDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, filterShape[0], filterShape[1],
                filterShape[2], filterShape[3]).cudnnCheck();
            cudnnSetConvolution2dDescriptor_v5(convDesc, padH, padW, strideY, strideX, 1, 1, CUDNN_CONVOLUTION,
                CUDNN_DATA_FLOAT).cudnnCheck();
            cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, outShape[0], outShape[1],
                outShape[2], outShape[3]).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyFilterDescriptor(wDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(yDesc).cudnnCheck();
            cudnnDestroyConvolutionDescriptor(convDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
        }

        abstract void execute(const(CUDABuffer)[] inputs, CUDABuffer output);

        Operation mOp;
        cudnnTensorDescriptor_t xDesc;
        cudnnFilterDescriptor_t wDesc;
        cudnnTensorDescriptor_t bDesc;
        cudnnConvolutionDescriptor_t convDesc;
        cudnnTensorDescriptor_t yDesc;
    }

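    /// Forward convolution. Expects the input features in inputs[0] and the
    /// filters in inputs[1], and computes the output via cudnnConvolutionForward.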
    class ConvolutionForward : ConvolutionBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();
            auto filterShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[0].ptr;
            auto w = cast(void *)inputs[1].ptr;
            auto y = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnConvolutionForward(handle, &alpha, xDesc, x, wDesc, w, convDesc, 0, null, 0, &beta, yDesc, y)
            .cudnnCheck();

            cuCtxSynchronize();
        }
    }

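    /// Gradient of the convolution with respect to the input features. Expects the
    /// output gradient in inputs[0] and the filters in inputs[1], and computes the
    /// input gradient via cudnnConvolutionBackwardData.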
    class ConvolutionFeaturesGrad : ConvolutionBase
    {
        this(Operation op)
        {
            auto inShape = op.shape.map!(x => cast(int)x).array();
            auto filterShape = op.deps[1].shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[0].shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto w = cast(void *)inputs[1].ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto dx = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnConvolutionBackwardData(handle, &alpha, wDesc, w, yDesc, dy, convDesc, 0, null, 0, &beta, xDesc, dx)
            .cudnnCheck();

            cuCtxSynchronize();
        }
    }

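    /// Gradient of the convolution with respect to the filters. Expects the output
    /// gradient in inputs[0] and the input features in inputs[1], and computes the
    /// filter gradient via cudnnConvolutionBackwardFilter.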
    class ConvolutionFiltersGrad : ConvolutionBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();
            auto filterShape = op.outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[1].ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto dw = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnConvolutionBackwardFilter(handle, &alpha, xDesc, x, yDesc, dy, convDesc, 0, null, 0, &beta, wDesc,
                dw).cudnnCheck();

            cuCtxSynchronize();
        }
    }

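    /// Common functionality for the max pooling kernels: creates the pooling
    /// descriptor from the operation's "dims" attribute (the window size is also
    /// used as the stride) along with the input and output tensor descriptors.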
    class MaxpoolBase : CUDAKernel
    {
        this(Operation op, int[] inShape, int[] outShape)
        {
            auto dims = op.attributes["dims"].get!(size_t[]);
            auto poolShape = dims.map!(x => cast(int)x).array();
            auto poolStride = poolShape.dup;

            cudnnCreatePoolingDescriptor(&poolingDesc).cudnnCheck();
            cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, 1, cast(int)poolShape[0],
                cast(int)poolShape[1], 0, 0, cast(int)poolStride[0], cast(int)poolStride[1]).cudnnCheck();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&yDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, inShape[0], inShape[1], inShape[2],
                inShape[3]).cudnnCheck();
            cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, outShape[0], outShape[1],
                outShape[2], outShape[3]).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyPoolingDescriptor(poolingDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(yDesc).cudnnCheck();
        }

        abstract void execute(const(CUDABuffer)[] inputs, CUDABuffer output);

        cudnnPoolingDescriptor_t poolingDesc;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t yDesc;
    }

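    /// Forward max pooling. Expects the input features in inputs[0] and writes the
    /// pooled output via cudnnPoolingForward.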
    class MaxpoolForward : MaxpoolBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[0].ptr;
            auto y = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnPoolingForward(handle, poolingDesc, &alpha, xDesc, x, &beta, yDesc, y).cudnnCheck();

            cuCtxSynchronize();
        }
    }

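    /// Gradient of max pooling. Expects the output gradient in inputs[0], the
    /// pooled output in inputs[1], and the original input in inputs[2], and
    /// computes the input gradient via cudnnPoolingBackward.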
    class MaxpoolGrad : MaxpoolBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[2].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto dx = cast(void *)output.ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto y = cast(void *)inputs[1].ptr;
            auto x = cast(void *)inputs[2].ptr;
            float alpha = 1;
            float beta = 0;

            cudnnPoolingBackward(handle, poolingDesc, &alpha, yDesc, y, yDesc, dy, xDesc, x, &beta, xDesc, dx)
            .cudnnCheck();

            cuCtxSynchronize();
        }
    }

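    /// Channel-wise softmax. The input is described as an N x C x vol x 1 tensor,
    /// where vol is the product of the trailing dimensions, and normalised over
    /// the channel dimension with cudnnSoftmaxForward.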
    class Softmax : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();
            auto vol = 1;

            for(size_t i = 2; i < shape.length; i++)
            {
                vol *= shape[i];
            }

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], vol, 1)
            .cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto x = cast(void *)inputs[0].ptr;
            auto y = cast(void *)output.ptr;

            cudnnSoftmaxForward(handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, desc, x, &beta,
                desc, y).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
    }

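    /// Gradient of the channel-wise softmax. Expects the output gradient in
    /// inputs[0] and the softmax output in inputs[1], and computes the input
    /// gradient via cudnnSoftmaxBackward.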
    class SoftmaxGrad : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto dy = cast(void *)inputs[0].ptr;
            auto y = cast(void *)inputs[1].ptr;
            auto dx = cast(void *)output.ptr;

            cudnnSoftmaxBackward(handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, desc, y, desc, dy,
                &beta, desc, dx).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
    }

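    /// Adds a per-channel bias to the input. The input in inputs[0] is copied into
    /// the output buffer, then the bias in inputs[1] is broadcast and accumulated
    /// with cudnnAddTensor.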
    class AddBias : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&cDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(cDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();

            cudnnCreateTensorDescriptor(&aDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(aDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, shape[1], 1, 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(cDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(aDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            cuMemcpy(output.ptr, inputs[0].ptr, output.numBytes);

            float alpha = 1;
            float beta = 1;

            cudnnAddTensor(handle, &alpha, aDesc, cast(void *)inputs[1].ptr, &beta, cDesc, cast(void *)output.ptr)
            .cudnnCheck();
        }

        cudnnTensorDescriptor_t cDesc;
        cudnnTensorDescriptor_t aDesc;
    }

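    /// Gradient of the bias addition. Expects the output gradient in inputs[0] and
    /// reduces it over all non-channel dimensions with cudnnConvolutionBackwardBias
    /// to produce the bias gradient.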
    class AddBiasGrad : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.deps[0].shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&dyDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(dyDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();

            cudnnCreateTensorDescriptor(&dbDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(dbDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, shape[1], 1, 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(dyDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(dbDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 1.0f;

            cudnnConvolutionBackwardBias(handle, &alpha, dyDesc, cast(void *)inputs[0].ptr, &beta, dbDesc,
                cast(void *)output.ptr).cudnnCheck();
        }

        cudnnTensorDescriptor_t dyDesc;
        cudnnTensorDescriptor_t dbDesc;
    }

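    /// Batch normalisation in training mode. Expects the input in inputs[0], the
    /// scale in inputs[1], and the bias in inputs[2]. Rank-2 operations use
    /// per-activation normalisation; higher ranks use spatial normalisation.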
    class BatchNormTrain : CUDAKernel
    {
        this(Operation op)
        {
            if(op.rank == 2)
            {
                mode = CUDNN_BATCHNORM_PER_ACTIVATION;
            }
            else
            {
                mode = CUDNN_BATCHNORM_SPATIAL;
            }

            import std.range;

            auto shape = op.deps[0].shape
                        .chain(repeat(1))
                        .map!(x => cast(int)x)
                        .take(4)
                        .array();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&bnDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], shape[2],
                shape[3]).cudnnCheck();
            cudnnDeriveBNTensorDescriptor(bnDesc, xDesc, mode).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(bnDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            cudnnBatchNormalizationForwardTraining(handle, mode, &alpha, &beta, xDesc,
                cast(void *)inputs[0].ptr, xDesc, cast(void *)output.ptr, bnDesc, cast(void *)inputs[1].ptr,
                cast(void *)inputs[2].ptr, 0, null, null, 1e-5f, null, null).cudnnCheck();
        }

        cudnnBatchNormMode_t mode;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t bnDesc;
    }

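    /// Gradient of batch normalisation. Expects the output gradient in inputs[0],
    /// the original input in inputs[1], and the scale in inputs[2]. The input,
    /// scale, and bias gradients are packed into the output buffer in that order.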
    class BatchNormGrad : CUDAKernel
    {
        this(Operation op)
        {
            if(op.deps[1].rank == 2)
            {
                mode = CUDNN_BATCHNORM_PER_ACTIVATION;
            }
            else
            {
                mode = CUDNN_BATCHNORM_SPATIAL;
            }

            import std.range;

            auto shape = op.deps[1].shape
                        .chain(repeat(1))
                        .map!(x => cast(int)x)
                        .take(4)
                        .array();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&bnDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], shape[2],
                shape[3]).cudnnCheck();
            cudnnDeriveBNTensorDescriptor(bnDesc, xDesc, mode).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(bnDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            // The output buffer holds the input gradient, followed by the scale and bias gradients.
            void *dx = cast(void *)(output.ptr);
            void *dscale = cast(void *)(output.ptr + inputs[1].numBytes);
            void *dbias = cast(void *)(output.ptr + inputs[1].numBytes + inputs[2].numBytes);

            cudnnBatchNormalizationBackward(handle, mode, &alpha, &beta, &alpha, &beta, xDesc,
                cast(void *)inputs[1].ptr, xDesc, cast(void *)inputs[0].ptr, xDesc, dx, bnDesc,
                cast(void *)inputs[2].ptr, dscale, dbias, 1e-5f, null, null).cudnnCheck();
        }

        cudnnBatchNormMode_t mode;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t bnDesc;
    }
}