module dopt.core.cuda.nnet.cudnn7;

import std.algorithm;
import std.array;
import std.functional;

import dopt.core.cuda;
import dopt.core.ops;

import derelict.cuda;
import derelict.cudnn7;

package
{
    /// Loads the cuDNN 7 bindings and registers the cuDNN-backed kernels with the CUDA backend.
    void initializeCuDNN7()
    {
        DerelictCuDNN7.load();

        registerCUDAKernel("convolution", toDelegate(&cudaKernelCtr!ConvolutionForward));
        registerCUDAKernel("convolutionFeaturesGrad", toDelegate(&cudaKernelCtr!ConvolutionFeaturesGrad));
        registerCUDAKernel("convolutionFiltersGrad", toDelegate(&cudaKernelCtr!ConvolutionFiltersGrad));
        registerCUDAKernel("maxpool", toDelegate(&cudaKernelCtr!MaxpoolForward));
        registerCUDAKernel("maxpoolGrad", toDelegate(&cudaKernelCtr!MaxpoolGrad));
        registerCUDAKernel("softmax", toDelegate(&cudaKernelCtr!Softmax));
        registerCUDAKernel("softmaxGrad", toDelegate(&cudaKernelCtr!SoftmaxGrad));
        registerCUDAKernel("addBias", toDelegate(&cudaKernelCtr!AddBias));
        registerCUDAKernel("addBiasGrad", toDelegate(&cudaKernelCtr!AddBiasGrad));
        registerCUDAKernel("batchNormTrain", toDelegate(&cudaKernelCtr!BatchNormTrain));
        registerCUDAKernel("batchNormGrad", toDelegate(&cudaKernelCtr!BatchNormGrad));

        cudnnCreate(&handle).cudnnCheck();
    }
}

private
{
    cudnnHandle_t handle;

    /// Throws if a cuDNN call did not return CUDNN_STATUS_SUCCESS.
    void cudnnCheck(cudnnStatus_t status, string mod = __MODULE__, size_t line = __LINE__)
    {
        import std.conv : to;
        import std.exception : enforce;
        enforce(status == CUDNN_STATUS_SUCCESS, mod ~ "(" ~ line.to!string ~ "): Failed to execute cuDNN function." ~
            " Error code: " ~ status.to!string);
    }

    /// Constructs a kernel of type K for the given operation.
    CUDAKernel cudaKernelCtr(K)(Operation op)
    {
        return new K(op);
    }

    /// Shared descriptor setup/teardown for the convolution kernels.
    class ConvolutionBase : CUDAKernel
    {
        this(Operation op, int[] inShape, int[] filterShape, int[] outShape)
        {
            mOp = op;

            int padH = 0;
            int padW = 0;
            int strideY = 1;
            int strideX = 1;

            auto padding = op.attributes["padding"].get!(size_t[]);
            padH = cast(int)padding[0];
            padW = cast(int)padding[1];

            auto stride = op.attributes["stride"].get!(size_t[]);
            strideY = cast(int)stride[0];
            strideX = cast(int)stride[1];

            //Dilation is fixed at 1x1
            auto dilation = [1LU, 1LU];
            int dilY = cast(int)dilation[0];
            int dilX = cast(int)dilation[1];

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateFilterDescriptor(&wDesc).cudnnCheck();
            cudnnCreateConvolutionDescriptor(&convDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&yDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, inShape[0], inShape[1], inShape[2],
                inShape[3]).cudnnCheck();
            cudnnSetFilter4dDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, filterShape[0], filterShape[1],
                filterShape[2], filterShape[3]).cudnnCheck();
            cudnnSetConvolution2dDescriptor(convDesc, padH, padW, strideY, strideX, dilY, dilX, CUDNN_CONVOLUTION,
                CUDNN_DATA_FLOAT).cudnnCheck();
            cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, outShape[0], outShape[1],
                outShape[2], outShape[3]).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyFilterDescriptor(wDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(yDesc).cudnnCheck();
            cudnnDestroyConvolutionDescriptor(convDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
        }

        abstract void execute(const(CUDABuffer)[] inputs, CUDABuffer output);

        Operation mOp;
        cudnnTensorDescriptor_t xDesc;
        cudnnFilterDescriptor_t wDesc;
        cudnnTensorDescriptor_t bDesc;
        cudnnConvolutionDescriptor_t convDesc;
        cudnnTensorDescriptor_t yDesc;
    }
    /// Workspace shared by the convolution kernels; reallocated whenever a kernel needs more scratch memory.
    private static CUDABuffer mWorkspace;

    /// Forward 2D convolution, evaluated with cudnnConvolutionForward.
    class ConvolutionForward : ConvolutionBase
    {
        private cudnnConvolutionFwdAlgo_t mAlgo;

        this(Operation op)
        {
            auto inShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();
            auto filterShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);

            //Benchmark the available algorithms and pick the fastest one
            cudnnConvolutionFwdAlgoPerf_t[9] algoPerfs;
            int numAlgos;
            cudnnFindConvolutionForwardAlgorithm(handle, xDesc, wDesc, convDesc, yDesc, cast(int)algoPerfs.length,
                &numAlgos, algoPerfs.ptr).cudnnCheck();

            //Grow the shared workspace if the chosen algorithm needs more scratch memory
            if(mWorkspace !is null && algoPerfs[0].memory > mWorkspace.numBytes)
            {
                CUDABuffer.destroy(mWorkspace);
                mWorkspace = null;
            }

            if(mWorkspace is null)
            {
                mWorkspace = CUDABuffer.create(algoPerfs[0].memory);
            }

            mAlgo = algoPerfs[0].algo;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[0].ptr;
            auto w = cast(void *)inputs[1].ptr;
            auto y = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            auto ws = cast(void *)(mWorkspace.ptr);
            auto wss = mWorkspace.numBytes;
            cudnnConvolutionForward(handle, &alpha, xDesc, x, wDesc, w, convDesc, mAlgo, ws, wss, &beta, yDesc, y)
                .cudnnCheck();

            cuCtxSynchronize();
        }
    }

    /// Gradient of the convolution with respect to the input features (cudnnConvolutionBackwardData).
    class ConvolutionFeaturesGrad : ConvolutionBase
    {
        private cudnnConvolutionBwdDataAlgo_t mAlgo;

        this(Operation op)
        {
            auto inShape = op.shape.map!(x => cast(int)x).array();
            auto filterShape = op.deps[1].shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[0].shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);

            cudnnConvolutionBwdDataAlgoPerf_t[9] algoPerfs;
            int numAlgos;
            cudnnFindConvolutionBackwardDataAlgorithm(handle, wDesc, yDesc, convDesc, xDesc, cast(int)algoPerfs.length,
                &numAlgos, algoPerfs.ptr).cudnnCheck();

            if(mWorkspace !is null && algoPerfs[0].memory > mWorkspace.numBytes)
            {
                CUDABuffer.destroy(mWorkspace);
                mWorkspace = null;
            }

            if(mWorkspace is null)
            {
                mWorkspace = CUDABuffer.create(algoPerfs[0].memory);
            }

            mAlgo = algoPerfs[0].algo;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto w = cast(void *)inputs[1].ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto dx = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnConvolutionBackwardData(handle, &alpha, wDesc, w, yDesc, dy, convDesc, mAlgo,
                cast(void *)mWorkspace.ptr, mWorkspace.numBytes, &beta, xDesc, dx).cudnnCheck();

            cuCtxSynchronize();
        }
    }

    /// Gradient of the convolution with respect to the filters (cudnnConvolutionBackwardFilter).
    class ConvolutionFiltersGrad : ConvolutionBase
    {
        private cudnnConvolutionBwdFilterAlgo_t mAlgo;

        this(Operation op)
        {
            auto inShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();
            auto filterShape = op.outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);

            cudnnConvolutionBwdFilterAlgoPerf_t[9] algoPerfs;
            int numAlgos;
            cudnnFindConvolutionBackwardFilterAlgorithm(handle, xDesc, yDesc, convDesc, wDesc,
                cast(int)algoPerfs.length, &numAlgos, algoPerfs.ptr).cudnnCheck();
            if(mWorkspace !is null && algoPerfs[0].memory > mWorkspace.numBytes)
            {
                CUDABuffer.destroy(mWorkspace);
                mWorkspace = null;
            }

            if(mWorkspace is null)
            {
                mWorkspace = CUDABuffer.create(algoPerfs[0].memory);
            }

            mAlgo = algoPerfs[0].algo;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[1].ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto dw = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnConvolutionBackwardFilter(handle, &alpha, xDesc, x, yDesc, dy, convDesc, mAlgo,
                cast(void *)mWorkspace.ptr, mWorkspace.numBytes, &beta, wDesc, dw).cudnnCheck();

            cuCtxSynchronize();
        }
    }

    /// Shared descriptor setup/teardown for the max pooling kernels.
    class MaxpoolBase : CUDAKernel
    {
        this(Operation op, int[] inShape, int[] outShape)
        {
            auto dims = op.attributes["dims"].get!(size_t[]);
            auto poolShape = dims.map!(x => cast(int)x).array();
            auto poolStride = poolShape.dup;

            cudnnCreatePoolingDescriptor(&poolingDesc).cudnnCheck();
            cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, 1, cast(int)poolShape[0],
                cast(int)poolShape[1], 0, 0, cast(int)poolStride[0], cast(int)poolStride[1]).cudnnCheck();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&yDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, inShape[0], inShape[1], inShape[2],
                inShape[3]).cudnnCheck();
            cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, outShape[0], outShape[1],
                outShape[2], outShape[3]).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyPoolingDescriptor(poolingDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(yDesc).cudnnCheck();
        }

        abstract void execute(const(CUDABuffer)[] inputs, CUDABuffer output);

        cudnnPoolingDescriptor_t poolingDesc;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t yDesc;
    }

    /// Forward max pooling, evaluated with cudnnPoolingForward.
    class MaxpoolForward : MaxpoolBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[0].ptr;
            auto y = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnPoolingForward(handle, poolingDesc, &alpha, xDesc, x, &beta, yDesc, y).cudnnCheck();

            cuCtxSynchronize();
        }
    }

    /// Gradient of max pooling, evaluated with cudnnPoolingBackward.
    class MaxpoolGrad : MaxpoolBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[2].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto dx = cast(void *)output.ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto y = cast(void *)inputs[1].ptr;
            auto x = cast(void *)inputs[2].ptr;
            float alpha = 1;
            float beta = 0;

            cudnnPoolingBackward(handle, poolingDesc, &alpha, yDesc, y, yDesc, dy, xDesc, x, &beta, xDesc, dx)
                .cudnnCheck();

            cuCtxSynchronize();
        }
    }

    /// Channel-wise softmax, evaluated with cudnnSoftmaxForward.
    class Softmax : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();

            //Fold all trailing dimensions into a single spatial dimension
            auto vol = 1;

            for(size_t i = 2; i < shape.length; i++)
            {
                vol *= shape[i];
            }

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], vol, 1)
                .cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto x = cast(void *)inputs[0].ptr;
            auto y = cast(void *)output.ptr;

            cudnnSoftmaxForward(handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, desc, x, &beta,
                desc, y).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
    }
    /// Gradient of the channel-wise softmax, evaluated with cudnnSoftmaxBackward.
    class SoftmaxGrad : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto dy = cast(void *)inputs[0].ptr;
            auto y = cast(void *)inputs[1].ptr;
            auto dx = cast(void *)output.ptr;

            cudnnSoftmaxBackward(handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, desc, y, desc, dy,
                &beta, desc, dx).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
    }

    /// Broadcasts a per-channel bias over the input, using cudnnAddTensor.
    class AddBias : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&cDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(cDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();

            cudnnCreateTensorDescriptor(&aDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(aDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, shape[1], 1, 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(cDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(aDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            //Copy the input into the output buffer, then add the bias in place
            cuMemcpy(output.ptr, inputs[0].ptr, output.numBytes);

            float alpha = 1;
            float beta = 1;

            cudnnAddTensor(handle, &alpha, aDesc, cast(void *)inputs[1].ptr, &beta, cDesc, cast(void *)output.ptr)
                .cudnnCheck();
        }

        cudnnTensorDescriptor_t cDesc;
        cudnnTensorDescriptor_t aDesc;
    }

    /// Gradient of AddBias with respect to the bias, using cudnnConvolutionBackwardBias.
    class AddBiasGrad : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.deps[0].shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&dyDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(dyDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();

            cudnnCreateTensorDescriptor(&dbDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(dbDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, shape[1], 1, 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(dyDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(dbDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 1.0f;

            cudnnConvolutionBackwardBias(handle, &alpha, dyDesc, cast(void *)inputs[0].ptr, &beta, dbDesc,
                cast(void *)output.ptr).cudnnCheck();
        }

        cudnnTensorDescriptor_t dyDesc;
        cudnnTensorDescriptor_t dbDesc;
    }
    /// Forward pass of batch normalisation in training mode (cudnnBatchNormalizationForwardTraining).
    class BatchNormTrain : CUDAKernel
    {
        this(Operation op)
        {
            //Per-activation mode for rank-2 (fully connected) inputs, spatial mode otherwise
            if(op.rank == 2)
            {
                mode = CUDNN_BATCHNORM_PER_ACTIVATION;
            }
            else
            {
                mode = CUDNN_BATCHNORM_SPATIAL;
            }

            import std.range;

            //Pad the shape with trailing 1s so it can be described as a 4D tensor
            auto shape = op.deps[0].shape
                        .chain(repeat(1))
                        .map!(x => cast(int)x)
                        .take(4)
                        .array();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&bnDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], shape[2],
                shape[3]).cudnnCheck();
            cudnnDeriveBNTensorDescriptor(bnDesc, xDesc, mode).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(bnDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            cudnnBatchNormalizationForwardTraining(handle, mode, &alpha, &beta, xDesc,
                cast(void *)inputs[0].ptr, xDesc, cast(void *)output.ptr, bnDesc, cast(void *)inputs[1].ptr,
                cast(void *)inputs[2].ptr, 0, null, null, 1e-5f, null, null).cudnnCheck();
        }

        cudnnBatchNormMode_t mode;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t bnDesc;
    }

    /// Gradients of batch normalisation with respect to the input, scale, and bias (cudnnBatchNormalizationBackward).
    class BatchNormGrad : CUDAKernel
    {
        this(Operation op)
        {
            if(op.deps[1].rank == 2)
            {
                mode = CUDNN_BATCHNORM_PER_ACTIVATION;
            }
            else
            {
                mode = CUDNN_BATCHNORM_SPATIAL;
            }

            import std.range;

            auto shape = op.deps[1].shape
                        .chain(repeat(1))
                        .map!(x => cast(int)x)
                        .take(4)
                        .array();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&bnDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], shape[2],
                shape[3]).cudnnCheck();
            cudnnDeriveBNTensorDescriptor(bnDesc, xDesc, mode).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(bnDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            //The output buffer packs dx, dscale, and dbias contiguously
            void *dx = cast(void *)(output.ptr);
            void *dscale = cast(void *)(output.ptr + inputs[1].numBytes);
            void *dbias = cast(void *)(output.ptr + inputs[1].numBytes + inputs[2].numBytes);

            cudnnBatchNormalizationBackward(handle, mode, &alpha, &beta, &alpha, &beta, xDesc,
                cast(void *)inputs[1].ptr, xDesc, cast(void *)inputs[0].ptr, xDesc, dx, bnDesc,
                cast(void *)inputs[2].ptr, dscale, dbias, 1e-5f, null, null).cudnnCheck();
        }

        cudnnBatchNormMode_t mode;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t bnDesc;
    }
}