module dopt.cuda.nnet.cudnn5;

import std.algorithm;
import std.array;
import std.functional;

import dopt.cuda;
import dopt.core.ops;

import derelict.cuda;
import derelict.cudnn5;

package
{
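    /**
        Loads the cuDNN 5 shared library, registers cuDNN-backed implementations of the neural
        network kernels with the CUDA backend, and creates the cuDNN handle shared by all of the
        kernels in this module.
    */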
    void initializeCuDNN5()
    {
        DerelictCuDNN5.load();

        registerCUDAKernel("convolution", toDelegate(&cudaKernelCtr!ConvolutionForward));
        registerCUDAKernel("convolutionFeaturesGrad", toDelegate(&cudaKernelCtr!ConvolutionFeaturesGrad));
        registerCUDAKernel("convolutionFiltersGrad", toDelegate(&cudaKernelCtr!ConvolutionFiltersGrad));
        registerCUDAKernel("maxpool", toDelegate(&cudaKernelCtr!MaxpoolForward));
        registerCUDAKernel("maxpoolGrad", toDelegate(&cudaKernelCtr!MaxpoolGrad));
        registerCUDAKernel("softmax", toDelegate(&cudaKernelCtr!Softmax));
        registerCUDAKernel("softmaxGrad", toDelegate(&cudaKernelCtr!SoftmaxGrad));
        registerCUDAKernel("relu", toDelegate(&cudaKernelCtr!ReLU));
        registerCUDAKernel("reluGrad", toDelegate(&cudaKernelCtr!ReLUGrad));
        registerCUDAKernel("addBias", toDelegate(&cudaKernelCtr!AddBias));
        registerCUDAKernel("addBiasGrad", toDelegate(&cudaKernelCtr!AddBiasGrad));
        registerCUDAKernel("batchNormTrain", toDelegate(&cudaKernelCtr!BatchNormTrain));
        registerCUDAKernel("batchNormGrad", toDelegate(&cudaKernelCtr!BatchNormGrad));
        registerCUDAKernel("batchNormInference", toDelegate(&cudaKernelCtr!BatchNormInference));

        cudnnCreate(&handle).cudnnCheck();
    }
}

private
{
    cudnnHandle_t handle;

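    /**
        Checks the status code returned by a cuDNN API call, and throws an exception annotated
        with the call site if the call did not return CUDNN_STATUS_SUCCESS.
    */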
    void cudnnCheck(cudnnStatus_t status, string mod = __MODULE__, size_t line = __LINE__)
    {
        import std.conv : to;
        import std.exception : enforce;
        enforce(status == CUDNN_STATUS_SUCCESS, mod ~ "(" ~ line.to!string ~ "): Failed to execute cuDNN function." ~
            " Error code: " ~ status.to!string);
    }

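    /// Instantiates a kernel of type K for the given operation. Used to construct the factory delegates registered in initializeCuDNN5.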
    CUDAKernel cudaKernelCtr(K)(Operation op)
    {
        return new K(op);
    }

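    /**
        Common setup for the convolution kernels: creates and configures the input, filter,
        convolution, and output descriptors using the operation's "padding" and "stride" attributes.
    */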
    class ConvolutionBase : CUDAKernel
    {
        this(Operation op, int[] inShape, int[] filterShape, int[] outShape)
        {
            mOp = op;

            auto padding = op.attributes["padding"].get!(size_t[]);
            int padH = cast(int)padding[0];
            int padW = cast(int)padding[1];

            auto stride = op.attributes["stride"].get!(size_t[]);
            int strideY = cast(int)stride[0];
            int strideX = cast(int)stride[1];

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateFilterDescriptor(&wDesc).cudnnCheck();
            cudnnCreateConvolutionDescriptor(&convDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&yDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, inShape[0], inShape[1], inShape[2],
                inShape[3]).cudnnCheck();
            cudnnSetFilter4dDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, filterShape[0], filterShape[1],
                filterShape[2], filterShape[3]).cudnnCheck();
            cudnnSetConvolution2dDescriptor_v5(convDesc, padH, padW, strideY, strideX, 1, 1, CUDNN_CONVOLUTION,
                CUDNN_DATA_FLOAT).cudnnCheck();
            cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, outShape[0], outShape[1],
                outShape[2], outShape[3]).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyFilterDescriptor(wDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(yDesc).cudnnCheck();
            cudnnDestroyConvolutionDescriptor(convDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
        }

        abstract void execute(const(CUDABuffer)[] inputs, CUDABuffer output);

        Operation mOp;
        cudnnTensorDescriptor_t xDesc;
        cudnnFilterDescriptor_t wDesc;
        cudnnConvolutionDescriptor_t convDesc;
        cudnnTensorDescriptor_t yDesc;
    }

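    /// Computes the forward pass of a 2D convolution using cudnnConvolutionForward.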
    class ConvolutionForward : ConvolutionBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();
            auto filterShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[0].ptr;
            auto w = cast(void *)inputs[1].ptr;
            auto y = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            //The implicit GEMM algorithm is used because it does not require a workspace
            cudnnConvolutionForward(handle, &alpha, xDesc, x, wDesc, w, convDesc,
                CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, null, 0, &beta, yDesc, y).cudnnCheck();

            cuCtxSynchronize();
        }
    }

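    /// Computes the gradient of a convolution with respect to its input features using cudnnConvolutionBackwardData.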
    class ConvolutionFeaturesGrad : ConvolutionBase
    {
        this(Operation op)
        {
            auto inShape = op.shape.map!(x => cast(int)x).array();
            auto filterShape = op.deps[1].shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[0].shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto dy = cast(void *)inputs[0].ptr;
            auto w = cast(void *)inputs[1].ptr;
            auto dx = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            //Algorithm 0 is used because it does not require a workspace
            cudnnConvolutionBackwardData(handle, &alpha, wDesc, w, yDesc, dy, convDesc,
                CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, null, 0, &beta, xDesc, dx).cudnnCheck();

            cuCtxSynchronize();
        }
    }

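    /// Computes the gradient of a convolution with respect to its filters using cudnnConvolutionBackwardFilter.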
    class ConvolutionFiltersGrad : ConvolutionBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();
            auto filterShape = op.outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto dy = cast(void *)inputs[0].ptr;
            auto x = cast(void *)inputs[1].ptr;
            auto dw = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            //Algorithm 0 is used because it does not require a workspace
            cudnnConvolutionBackwardFilter(handle, &alpha, xDesc, x, yDesc, dy, convDesc,
                CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, null, 0, &beta, wDesc, dw).cudnnCheck();

            cuCtxSynchronize();
        }
    }

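    /**
        Common setup for the max pooling kernels: configures a max pooling descriptor from the
        operation's "dims" attribute, with the stride set equal to the window size, along with
        the input and output tensor descriptors.
    */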
    class MaxpoolBase : CUDAKernel
    {
        this(Operation op, int[] inShape, int[] outShape)
        {
            auto dims = op.attributes["dims"].get!(size_t[]);
            auto poolShape = dims.map!(x => cast(int)x).array();
            auto poolStride = poolShape.dup;

            cudnnCreatePoolingDescriptor(&poolingDesc).cudnnCheck();
            cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, CUDNN_PROPAGATE_NAN, poolShape[0],
                poolShape[1], 0, 0, poolStride[0], poolStride[1]).cudnnCheck();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&yDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, inShape[0], inShape[1], inShape[2],
                inShape[3]).cudnnCheck();
            cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, outShape[0], outShape[1],
                outShape[2], outShape[3]).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyPoolingDescriptor(poolingDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(yDesc).cudnnCheck();
        }

        abstract void execute(const(CUDABuffer)[] inputs, CUDABuffer output);

        cudnnPoolingDescriptor_t poolingDesc;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t yDesc;
    }

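    /// Computes the forward pass of max pooling using cudnnPoolingForward.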
    class MaxpoolForward : MaxpoolBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[0].ptr;
            auto y = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnPoolingForward(handle, poolingDesc, &alpha, xDesc, x, &beta, yDesc, y).cudnnCheck();

            cuCtxSynchronize();
        }
    }

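    /// Computes the gradient of max pooling using cudnnPoolingBackward.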
    class MaxpoolGrad : MaxpoolBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[2].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto dx = cast(void *)output.ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto y = cast(void *)inputs[1].ptr;
            auto x = cast(void *)inputs[2].ptr;
            float alpha = 1;
            float beta = 0;

            cudnnPoolingBackward(handle, poolingDesc, &alpha, yDesc, y, yDesc, dy, xDesc, x, &beta, xDesc, dx)
            .cudnnCheck();

            cuCtxSynchronize();
        }
    }

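    /**
        Computes a channel-wise softmax using cudnnSoftmaxForward. Any dimensions beyond the
        second are folded into a single spatial dimension.
    */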
    class Softmax : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();
            auto vol = 1;

            for(size_t i = 2; i < shape.length; i++)
            {
                vol *= shape[i];
            }

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], vol, 1)
            .cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto x = cast(void *)inputs[0].ptr;
            auto y = cast(void *)output.ptr;

            cudnnSoftmaxForward(handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, desc, x, &beta,
                desc, y).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
    }

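    /// Computes the gradient of the softmax operation using cudnnSoftmaxBackward.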
    class SoftmaxGrad : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto dy = cast(void *)inputs[0].ptr;
            auto y = cast(void *)inputs[1].ptr;
            auto dx = cast(void *)output.ptr;

            cudnnSoftmaxBackward(handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, desc, y, desc, dy,
                &beta, desc, dx).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
    }

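    /// Computes the ReLU activation using cudnnActivationForward.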
    class ReLU : CUDAKernel
    {
        this(Operation op)
        {
            import std.range : drop;

            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0],
                shape.length > 1 ? shape[1] : 1, reduce!"a * b"(1, shape.drop(2).array()), 1).cudnnCheck();

            cudnnCreateActivationDescriptor(&actDesc).cudnnCheck();
            cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
            cudnnDestroyActivationDescriptor(actDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto x = cast(void *)inputs[0].ptr;
            auto y = cast(void *)output.ptr;

            cudnnActivationForward(handle, actDesc, &alpha, desc, x, &beta, desc, y).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
        cudnnActivationDescriptor_t actDesc;
    }

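    /// Computes the gradient of the ReLU activation using cudnnActivationBackward.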
    class ReLUGrad : CUDAKernel
    {
        this(Operation op)
        {
            import std.range : drop;

            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0],
                shape.length > 1 ? shape[1] : 1, reduce!"a * b"(1, shape.drop(2).array()), 1).cudnnCheck();

            cudnnCreateActivationDescriptor(&actDesc).cudnnCheck();
            cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
            cudnnDestroyActivationDescriptor(actDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto dy = cast(void *)inputs[0].ptr;
            auto y = cast(void *)inputs[1].ptr;
            auto x = cast(void *)inputs[2].ptr;
            auto dx = cast(void *)output.ptr;

            cudnnActivationBackward(handle, actDesc, &alpha, desc, y, desc, dy, desc, x, &beta, desc, dx).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
        cudnnActivationDescriptor_t actDesc;
    }

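    /**
        Broadcasts a bias vector over the channel dimension of the input: the input is copied
        into the output buffer, and the bias is then accumulated on top with cudnnAddTensor.
    */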
    class AddBias : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&cDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(cDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();

            cudnnCreateTensorDescriptor(&aDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(aDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, shape[1], 1, 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(cDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(aDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            //Copy the input activations into the output buffer, then accumulate the bias on top
            cuMemcpy(output.ptr, inputs[0].ptr, output.numBytes);

            float alpha = 1;
            float beta = 1;

            cudnnAddTensor(handle, &alpha, aDesc, cast(void *)inputs[1].ptr, &beta, cDesc, cast(void *)output.ptr)
            .cudnnCheck();
        }

        cudnnTensorDescriptor_t cDesc;
        cudnnTensorDescriptor_t aDesc;
    }

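    /// Computes the gradient with respect to the bias by summing the incoming gradient over the non-channel dimensions with cudnnConvolutionBackwardBias.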
    class AddBiasGrad : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.deps[0].shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&dyDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(dyDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();

            cudnnCreateTensorDescriptor(&dbDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(dbDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, shape[1], 1, 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(dyDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(dbDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;

            //beta = 1 means the computed gradient is accumulated into the output buffer
            float beta = 1.0f;

            cudnnConvolutionBackwardBias(handle, &alpha, dyDesc, cast(void *)inputs[0].ptr, &beta, dbDesc,
                cast(void *)output.ptr).cudnnCheck();
        }

        cudnnTensorDescriptor_t dyDesc;
        cudnnTensorDescriptor_t dbDesc;
    }

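    /**
        Common setup for the batch normalisation kernels: selects per-activation mode for rank-2
        inputs and spatial mode otherwise, then derives the batch norm parameter descriptor from
        the input descriptor.
    */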
    abstract class BatchNormBase : CUDAKernel
    {
        this(Operation op)
        {
            if(op.deps[0].rank == 2)
            {
                mode = CUDNN_BATCHNORM_PER_ACTIVATION;
            }
            else
            {
                mode = CUDNN_BATCHNORM_SPATIAL;
            }

            import std.range;

            //Pad the shape out to rank 4 with singleton dimensions
            auto shape = op.deps[0].shape
                        .chain(repeat(1))
                        .map!(x => cast(int)x)
                        .take(4)
                        .array();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&bnDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], shape[2],
                shape[3]).cudnnCheck();
            cudnnDeriveBNTensorDescriptor(bnDesc, xDesc, mode).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(bnDesc).cudnnCheck();
        }

        cudnnBatchNormMode_t mode;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t bnDesc;
    }

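    /**
        Computes the forward pass of batch normalisation in training mode. The updated running
        mean and variance are packed into the output buffer after the normalised activations, so
        that the higher level API can slice them out into separate nodes.
    */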
    class BatchNormTrain : BatchNormBase
    {
        this(Operation op)
        {
            super(op);
            mMomentum = 1.0 - op.attributes["momentum"].get!double;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            //We're going to pack the running mean/variance after the BN forward prop. Let the higher level
            //API slice them out into different nodes.
            auto mean = output.ptr + inputs[0].numBytes;
            auto var = mean + (output.numBytes - inputs[0].numBytes) / 2;

            cuMemcpy(mean, inputs[3].ptr, inputs[3].numBytes);
            cuMemcpy(var, inputs[4].ptr, inputs[4].numBytes);

            cudnnBatchNormalizationForwardTraining(handle, mode, &alpha, &beta, xDesc,
                cast(void *)inputs[0].ptr, xDesc, cast(void *)output.ptr, bnDesc, cast(void *)inputs[1].ptr,
                cast(void *)inputs[2].ptr, mMomentum, cast(void *)mean, cast(void *)var, 1e-5f, null, null).cudnnCheck();
        }

        double mMomentum;
    }

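    /// Computes the gradients of batch normalisation with respect to the input, scale, and bias using cudnnBatchNormalizationBackward.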
    class BatchNormGrad : BatchNormBase
    {
        this(Operation op)
        {
            super(op);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            //The input, scale, and bias gradients are packed into the output buffer in that order
            void *dx = cast(void *)(output.ptr);
            void *dscale = cast(void *)(output.ptr + inputs[1].numBytes);
            void *dbias = cast(void *)(output.ptr + inputs[1].numBytes + inputs[2].numBytes);

            cudnnBatchNormalizationBackward(handle, mode, &alpha, &beta, &alpha, &beta, xDesc,
                cast(void *)inputs[1].ptr, xDesc, cast(void *)inputs[0].ptr, xDesc, dx, bnDesc,
                cast(void *)inputs[2].ptr, dscale, dbias, 1e-5f, null, null).cudnnCheck();
        }
    }

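    /// Computes the forward pass of batch normalisation in inference mode, using the supplied running mean and variance.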
    class BatchNormInference : BatchNormBase
    {
        this(Operation op)
        {
            super(op);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            cudnnBatchNormalizationForwardInference(handle, mode, &alpha, &beta, xDesc, cast(void *)inputs[0].ptr,
                xDesc, cast(void *)output.ptr, bnDesc, cast(void *)inputs[1].ptr, cast(void *)inputs[2].ptr,
                cast(void *)inputs[3].ptr, cast(void *)inputs[4].ptr, 1e-5).cudnnCheck();
        }
    }
}