module dopt.cuda.nnet.cudnn5;

import std.algorithm;
import std.array;
import std.functional;

import dopt.cuda;
import dopt.core.ops;

import derelict.cuda;
import derelict.cudnn5;

package
{
    //Loads the cuDNN 5 library, registers the cuDNN-backed kernels provided by this module,
    //and creates the shared cuDNN handle.
    void initializeCuDNN5()
    {
        DerelictCuDNN5.load();

        registerCUDAKernel("convolution", toDelegate(&cudaKernelCtr!ConvolutionForward));
        registerCUDAKernel("convolutionFeaturesGrad", toDelegate(&cudaKernelCtr!ConvolutionFeaturesGrad));
        registerCUDAKernel("convolutionFiltersGrad", toDelegate(&cudaKernelCtr!ConvolutionFiltersGrad));
        registerCUDAKernel("maxpool", toDelegate(&cudaKernelCtr!MaxpoolForward));
        registerCUDAKernel("maxpoolGrad", toDelegate(&cudaKernelCtr!MaxpoolGrad));
        registerCUDAKernel("softmax", toDelegate(&cudaKernelCtr!Softmax));
        registerCUDAKernel("softmaxGrad", toDelegate(&cudaKernelCtr!SoftmaxGrad));
        registerCUDAKernel("relu", toDelegate(&cudaKernelCtr!ReLU));
        registerCUDAKernel("reluGrad", toDelegate(&cudaKernelCtr!ReLUGrad));
        registerCUDAKernel("addBias", toDelegate(&cudaKernelCtr!AddBias));
        registerCUDAKernel("addBiasGrad", toDelegate(&cudaKernelCtr!AddBiasGrad));
        registerCUDAKernel("batchNormTrain", toDelegate(&cudaKernelCtr!BatchNormTrain));
        registerCUDAKernel("batchNormGrad", toDelegate(&cudaKernelCtr!BatchNormGrad));
        registerCUDAKernel("batchNormInference", toDelegate(&cudaKernelCtr!BatchNormInference));

        cudnnCreate(&handle);
    }
}

private
{
    cudnnHandle_t handle;

    //Throws an exception if a cuDNN call did not return CUDNN_STATUS_SUCCESS.
    void cudnnCheck(cudnnStatus_t status, string mod = __MODULE__, size_t line = __LINE__)
    {
        import std.conv : to;
        import std.exception : enforce;
        enforce(status == CUDNN_STATUS_SUCCESS, mod ~ "(" ~ line.to!string ~ "): Failed to execute cuDNN function." ~
            " Error code: " ~ status.to!string);
    }

    //Constructs a CUDA kernel of type K for the given operation.
    CUDAKernel cudaKernelCtr(K)(Operation op)
    {
        return new K(op);
    }

    //Descriptor setup shared by the convolution forward and gradient kernels.
    class ConvolutionBase : CUDAKernel
    {
        this(Operation op, int[] inShape, int[] filterShape, int[] outShape)
        {
            mOp = op;

            int padH = 0;
            int padW = 0;
            int strideY = 1;
            int strideX = 1;

            auto padding = op.attributes["padding"].get!(size_t[]);
            padH = cast(int)padding[0];
            padW = cast(int)padding[1];

            auto stride = op.attributes["stride"].get!(size_t[]);
            strideY = cast(int)stride[0];
            strideX = cast(int)stride[1];

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateFilterDescriptor(&wDesc).cudnnCheck();
            cudnnCreateConvolutionDescriptor(&convDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&yDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, inShape[0], inShape[1], inShape[2],
                inShape[3]).cudnnCheck();
            cudnnSetFilter4dDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, filterShape[0], filterShape[1],
                filterShape[2], filterShape[3]).cudnnCheck();
            cudnnSetConvolution2dDescriptor_v5(convDesc, padH, padW, strideY, strideX, 1, 1, CUDNN_CONVOLUTION,
                CUDNN_DATA_FLOAT).cudnnCheck();
            cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, outShape[0], outShape[1],
                outShape[2], outShape[3]).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyFilterDescriptor(wDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(yDesc).cudnnCheck();
            cudnnDestroyConvolutionDescriptor(convDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
        }

        abstract void execute(const(CUDABuffer)[] inputs, CUDABuffer output);

        Operation mOp;
        cudnnTensorDescriptor_t xDesc;
        cudnnFilterDescriptor_t wDesc;
        cudnnTensorDescriptor_t bDesc;
        cudnnConvolutionDescriptor_t convDesc;
        cudnnTensorDescriptor_t yDesc;
    }

    class ConvolutionForward : ConvolutionBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();
            auto filterShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[0].ptr;
            auto w = cast(void *)inputs[1].ptr;
            auto y = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnConvolutionForward(handle, &alpha, xDesc, x, wDesc, w, convDesc, 0, null, 0, &beta, yDesc, y)
                .cudnnCheck();

            cuCtxSynchronize();
        }
    }

    class ConvolutionFeaturesGrad : ConvolutionBase
    {
        this(Operation op)
        {
            auto inShape = op.shape.map!(x => cast(int)x).array();
            auto filterShape = op.deps[1].shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[0].shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto w = cast(void *)inputs[1].ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto dx = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnConvolutionBackwardData(handle, &alpha, wDesc, w, yDesc, dy, convDesc, 0, null, 0, &beta, xDesc, dx)
                .cudnnCheck();

            cuCtxSynchronize();
        }
    }

    class ConvolutionFiltersGrad : ConvolutionBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();
            auto filterShape = op.outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[1].ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto dw = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnConvolutionBackwardFilter(handle, &alpha, xDesc, x, yDesc, dy, convDesc, 0, null, 0, &beta, wDesc,
                dw).cudnnCheck();

            cuCtxSynchronize();
        }
    }

    class MaxpoolBase : CUDAKernel
    {
        this(Operation op, int[] inShape, int[] outShape)
        {
            auto dims = op.attributes["dims"].get!(size_t[]);
            auto poolShape = dims.map!(x => cast(int)x).array();
            auto poolStride = poolShape.dup;

            cudnnCreatePoolingDescriptor(&poolingDesc).cudnnCheck();
            cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, 1, cast(int)poolShape[0],
                cast(int)poolShape[1], 0, 0, cast(int)poolStride[0], cast(int)poolStride[1]).cudnnCheck();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&yDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, inShape[0], inShape[1], inShape[2],
                inShape[3]).cudnnCheck();
            cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, outShape[0], outShape[1],
                outShape[2], outShape[3]).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyPoolingDescriptor(poolingDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(yDesc).cudnnCheck();
        }

        abstract void execute(const(CUDABuffer)[] inputs, CUDABuffer output);

        cudnnPoolingDescriptor_t poolingDesc;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t yDesc;
    }

    class MaxpoolForward : MaxpoolBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[0].ptr;
            auto y = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnPoolingForward(handle, poolingDesc, &alpha, xDesc, x, &beta, yDesc, y).cudnnCheck();

            cuCtxSynchronize();
        }
    }

    class MaxpoolGrad : MaxpoolBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[2].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto dx = cast(void *)output.ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto y = cast(void *)inputs[1].ptr;
            auto x = cast(void *)inputs[2].ptr;
            float alpha = 1;
            float beta = 0;

            cudnnPoolingBackward(handle, poolingDesc, &alpha, yDesc, y, yDesc, dy, xDesc, x, &beta, xDesc, dx)
                .cudnnCheck();

            cuCtxSynchronize();
        }
    }

    class Softmax : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();
            auto vol = 1;

            //Collapse all trailing dimensions into a single volume so the tensor can be described as
            //(N, C, vol, 1); CUDNN_SOFTMAX_MODE_CHANNEL then normalises over the channel dimension at each position.
            for(size_t i = 2; i < shape.length; i++)
            {
                vol *= shape[i];
            }

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], vol, 1)
                .cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto x = cast(void *)inputs[0].ptr;
            auto y = cast(void *)output.ptr;

            cudnnSoftmaxForward(handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, desc, x, &beta,
                desc, y).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
    }

    class SoftmaxGrad : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto dy = cast(void *)inputs[0].ptr;
            auto y = cast(void *)inputs[1].ptr;
            auto dx = cast(void *)output.ptr;

            cudnnSoftmaxBackward(handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, desc, y, desc, dy,
                &beta, desc, dx).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
    }

    class ReLU : CUDAKernel
    {
        this(Operation op)
        {
            import std.range : drop;

            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape.length > 1 ? shape[1] : 1,
                reduce!"a * b"(1, shape.drop(2).array()), 1).cudnnCheck();

            cudnnCreateActivationDescriptor(&actDesc).cudnnCheck();
            cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
            cudnnDestroyActivationDescriptor(actDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto x = cast(void *)inputs[0].ptr;
            auto y = cast(void *)output.ptr;

            cudnnActivationForward(handle, actDesc, &alpha, desc, x, &beta, desc, y).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
        cudnnActivationDescriptor_t actDesc;
    }

    class ReLUGrad : CUDAKernel
    {
        this(Operation op)
        {
            import std.range : drop;

            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape.length > 1 ? shape[1] : 1,
                reduce!"a * b"(1, shape.drop(2).array()), 1).cudnnCheck();

            cudnnCreateActivationDescriptor(&actDesc).cudnnCheck();
            cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
            cudnnDestroyActivationDescriptor(actDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto dy = cast(void *)inputs[0].ptr;
            auto y = cast(void *)inputs[1].ptr;
            auto x = cast(void *)inputs[2].ptr;
            auto dx = cast(void *)output.ptr;

            cudnnActivationBackward(handle, actDesc, &alpha, desc, y, desc, dy, desc, x, &beta, desc, dx).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
        cudnnActivationDescriptor_t actDesc;
    }

    class AddBias : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&cDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(cDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();

            cudnnCreateTensorDescriptor(&aDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(aDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, shape[1], 1, 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(cDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(aDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            cuMemcpy(output.ptr, inputs[0].ptr, output.numBytes);

            float alpha = 1;
            float beta = 1;

            cudnnAddTensor(handle, &alpha, aDesc, cast(void *)inputs[1].ptr, &beta, cDesc, cast(void *)output.ptr);
        }

        cudnnTensorDescriptor_t cDesc;
        cudnnTensorDescriptor_t aDesc;
    }

    class AddBiasGrad : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.deps[0].shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&dyDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(dyDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();

            cudnnCreateTensorDescriptor(&dbDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(dbDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, shape[1], 1, 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(dyDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(dbDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 1.0f;

            cudnnConvolutionBackwardBias(handle, &alpha, dyDesc, cast(void *)inputs[0].ptr, &beta, dbDesc,
                cast(void *)output.ptr);
        }

        cudnnTensorDescriptor_t dyDesc;
        cudnnTensorDescriptor_t dbDesc;
    }

    abstract class BatchNormBase : CUDAKernel
    {
        this(Operation op)
        {
            //Rank 2 inputs (batch x features) are normalised per activation; higher-rank inputs are
            //normalised spatially.
            if(op.deps[0].rank == 2)
            {
                mode = CUDNN_BATCHNORM_PER_ACTIVATION;
            }
            else
            {
                mode = CUDNN_BATCHNORM_SPATIAL;
            }

            import std.range;

            //Pad the shape with trailing ones so it can always be described by a 4D descriptor.
            auto shape = op.deps[0].shape
                .chain(repeat(1))
                .map!(x => cast(int)x)
                .take(4)
                .array();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&bnDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], shape[2],
                shape[3]).cudnnCheck();
            cudnnDeriveBNTensorDescriptor(bnDesc, xDesc, mode).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(bnDesc).cudnnCheck();
        }

        cudnnBatchNormMode_t mode;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t bnDesc;
    }

    class BatchNormTrain : BatchNormBase
    {
        this(Operation op)
        {
            super(op);

            //cuDNN's exponentialAverageFactor weights the new batch statistics, hence 1 - momentum.
            mMomentum = 1.0 - op.attributes["momentum"].get!double;
        }

        void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            //We're going to pack the running mean/variance after the BN forward prop. Let the higher level
            //API slice them out into different nodes.
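            //Layout of the output buffer implied by the pointer arithmetic below:
            //[normalised output (same size as inputs[0]) | running mean | running variance],
            //with the mean and variance each taking half of the space that remains after the output.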
            auto mean = output.ptr + inputs[0].numBytes;
            auto var = mean + (output.numBytes - inputs[0].numBytes) / 2;

            cuMemcpy(mean, inputs[3].ptr, inputs[3].numBytes);
            cuMemcpy(var, inputs[4].ptr, inputs[4].numBytes);

            cudnnBatchNormalizationForwardTraining(handle, mode, &alpha, &beta, xDesc,
                cast(void *)inputs[0].ptr, xDesc, cast(void *)output.ptr, bnDesc, cast(void *)inputs[1].ptr,
                cast(void *)inputs[2].ptr, mMomentum, cast(void *)mean, cast(void *)var, 1e-5f, null, null).cudnnCheck();
        }

        double mMomentum;
    }

    class BatchNormGrad : BatchNormBase
    {
        this(Operation op)
        {
            super(op);
        }

        void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            void *dx = cast(void *)(output.ptr);
            void *dscale = cast(void *)(output.ptr + inputs[1].numBytes);
            void *dbias = cast(void *)(output.ptr + inputs[1].numBytes + inputs[2].numBytes);

            cudnnBatchNormalizationBackward(handle, mode, &alpha, &beta, &alpha, &beta, xDesc,
                cast(void *)inputs[1].ptr, xDesc, cast(void *)inputs[0].ptr, xDesc, dx, bnDesc,
                cast(void *)inputs[2].ptr, dscale, dbias, 1e-5f, null, null);
        }
    }

    class BatchNormInference : BatchNormBase
    {
        this(Operation op)
        {
            super(op);
        }

        void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            cudnnBatchNormalizationForwardInference(handle, mode, &alpha, &beta, xDesc, cast(void *)inputs[0].ptr,
                xDesc, cast(void *)output.ptr, bnDesc, cast(void *)inputs[1].ptr, cast(void *)inputs[2].ptr,
                cast(void *)inputs[3].ptr, cast(void *)inputs[4].ptr, 1e-5).cudnnCheck();
        }
    }
}