module dopt.core.cuda.nnet.cudnn5;

import std.algorithm;
import std.array;
import std.functional;

import dopt.core.cuda;
import dopt.core.ops;

import derelict.cuda;
import derelict.cudnn5;

package
{
    void initializeCuDNN5()
    {
        DerelictCuDNN5.load();

        registerCUDAKernel("convolution", toDelegate(&cudaKernelCtr!ConvolutionForward));
        registerCUDAKernel("convolutionFeaturesGrad", toDelegate(&cudaKernelCtr!ConvolutionFeaturesGrad));
        registerCUDAKernel("convolutionFiltersGrad", toDelegate(&cudaKernelCtr!ConvolutionFiltersGrad));
        registerCUDAKernel("maxpool", toDelegate(&cudaKernelCtr!MaxpoolForward));
        registerCUDAKernel("maxpoolGrad", toDelegate(&cudaKernelCtr!MaxpoolGrad));
        registerCUDAKernel("softmax", toDelegate(&cudaKernelCtr!Softmax));
        registerCUDAKernel("softmaxGrad", toDelegate(&cudaKernelCtr!SoftmaxGrad));
        registerCUDAKernel("addBias", toDelegate(&cudaKernelCtr!AddBias));
        registerCUDAKernel("addBiasGrad", toDelegate(&cudaKernelCtr!AddBiasGrad));
        registerCUDAKernel("batchNormTrain", toDelegate(&cudaKernelCtr!BatchNormTrain));
        registerCUDAKernel("batchNormGrad", toDelegate(&cudaKernelCtr!BatchNormGrad));

        cudnnCreate(&handle);
    }
}

private
{
    cudnnHandle_t handle;

    void cudnnCheck(cudnnStatus_t status, string mod = __MODULE__, size_t line = __LINE__)
    {
        import std.conv : to;
        import std.exception : enforce;
        enforce(status == CUDNN_STATUS_SUCCESS, mod ~ "(" ~ line.to!string ~ "): Failed to execute cuDNN function." ~
            " Error code: " ~ status.to!string);
    }

    CUDAKernel cudaKernelCtr(K)(Operation op)
    {
        return new K(op);
    }

    class ConvolutionBase : CUDAKernel
    {
        this(Operation op, int[] inShape, int[] filterShape, int[] outShape)
        {
            mOp = op;

            int padH = 0;
            int padW = 0;
            int strideY = 1;
            int strideX = 1;

            auto padding = op.attributes["padding"].get!(size_t[]);
            padH = cast(int)padding[0];
            padW = cast(int)padding[1];

            auto stride = op.attributes["stride"].get!(size_t[]);
            strideY = cast(int)stride[0];
            strideX = cast(int)stride[1];

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateFilterDescriptor(&wDesc).cudnnCheck();
            cudnnCreateConvolutionDescriptor(&convDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&yDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, inShape[0], inShape[1], inShape[2],
                inShape[3]).cudnnCheck();
            cudnnSetFilter4dDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, filterShape[0], filterShape[1],
                filterShape[2], filterShape[3]).cudnnCheck();
            cudnnSetConvolution2dDescriptor_v5(convDesc, padH, padW, strideY, strideX, 1, 1, CUDNN_CONVOLUTION,
                CUDNN_DATA_FLOAT).cudnnCheck();
            cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, outShape[0], outShape[1],
                outShape[2], outShape[3]).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyFilterDescriptor(wDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(yDesc).cudnnCheck();
            cudnnDestroyConvolutionDescriptor(convDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
        }

        abstract void execute(const(CUDABuffer)[] inputs, CUDABuffer output);

        Operation mOp;
        cudnnTensorDescriptor_t xDesc;
        cudnnFilterDescriptor_t wDesc;
        cudnnTensorDescriptor_t bDesc;
        cudnnConvolutionDescriptor_t convDesc;
        cudnnTensorDescriptor_t yDesc;
    }

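    /**
        Forward pass of the 2D convolution. inputs[0] holds the feature maps and inputs[1] the filters;
        cudnnConvolutionForward writes the result into the output buffer, using algorithm 0 with no workspace.
    */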
    class ConvolutionForward : ConvolutionBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();
            auto filterShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[0].ptr;
            auto w = cast(void *)inputs[1].ptr;
            auto y = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnConvolutionForward(handle, &alpha, xDesc, x, wDesc, w, convDesc, 0, null, 0, &beta, yDesc, y)
                .cudnnCheck();

            cuCtxSynchronize();
        }
    }

    class ConvolutionFeaturesGrad : ConvolutionBase
    {
        this(Operation op)
        {
            auto inShape = op.shape.map!(x => cast(int)x).array();
            auto filterShape = op.deps[1].shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[0].shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto w = cast(void *)inputs[1].ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto dx = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnConvolutionBackwardData(handle, &alpha, wDesc, w, yDesc, dy, convDesc, 0, null, 0, &beta, xDesc, dx)
                .cudnnCheck();

            cuCtxSynchronize();
        }
    }

    class ConvolutionFiltersGrad : ConvolutionBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();
            auto filterShape = op.outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, filterShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[1].ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto dw = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnConvolutionBackwardFilter(handle, &alpha, xDesc, x, yDesc, dy, convDesc, 0, null, 0, &beta, wDesc,
                dw).cudnnCheck();

            cuCtxSynchronize();
        }
    }

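    /**
        Shared descriptor setup for the max pooling kernels. The window size comes from the operation's
        "dims" attribute, the stride is set equal to the window size (so windows do not overlap), and no
        padding is applied.
    */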
    class MaxpoolBase : CUDAKernel
    {
        this(Operation op, int[] inShape, int[] outShape)
        {
            auto dims = op.attributes["dims"].get!(size_t[]);
            auto poolShape = dims.map!(x => cast(int)x).array();
            auto poolStride = poolShape.dup;

            cudnnCreatePoolingDescriptor(&poolingDesc).cudnnCheck();
            cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, 1, cast(int)poolShape[0],
                cast(int)poolShape[1], 0, 0, cast(int)poolStride[0], cast(int)poolStride[1]).cudnnCheck();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&yDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, inShape[0], inShape[1], inShape[2],
                inShape[3]).cudnnCheck();
            cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, outShape[0], outShape[1],
                outShape[2], outShape[3]).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyPoolingDescriptor(poolingDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(yDesc).cudnnCheck();
        }

        abstract void execute(const(CUDABuffer)[] inputs, CUDABuffer output);

        cudnnPoolingDescriptor_t poolingDesc;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t yDesc;
    }

    class MaxpoolForward : MaxpoolBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[0].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto x = cast(void *)inputs[0].ptr;
            auto y = cast(void *)output.ptr;
            float alpha = 1;
            float beta = 0;

            cudnnPoolingForward(handle, poolingDesc, &alpha, xDesc, x, &beta, yDesc, y).cudnnCheck();

            cuCtxSynchronize();
        }
    }

    class MaxpoolGrad : MaxpoolBase
    {
        this(Operation op)
        {
            auto inShape = op.deps[2].outputType.shape.map!(x => cast(int)x).array();
            auto outShape = op.deps[1].outputType.shape.map!(x => cast(int)x).array();

            super(op, inShape, outShape);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            auto dx = cast(void *)output.ptr;
            auto dy = cast(void *)inputs[0].ptr;
            auto y = cast(void *)inputs[1].ptr;
            auto x = cast(void *)inputs[2].ptr;
            float alpha = 1;
            float beta = 0;

            cudnnPoolingBackward(handle, poolingDesc, &alpha, yDesc, y, yDesc, dy, xDesc, x, &beta, xDesc, dx)
                .cudnnCheck();

            cuCtxSynchronize();
        }
    }

    class Softmax : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();
            auto vol = 1;

            for(size_t i = 2; i < shape.length; i++)
            {
                vol *= shape[i];
            }

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], vol, 1)
                .cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto x = cast(void *)inputs[0].ptr;
            auto y = cast(void *)output.ptr;

            cudnnSoftmaxForward(handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, desc, x, &beta,
                desc, y).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
    }

    class SoftmaxGrad : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0;
            float beta = 0.0;
            auto dy = cast(void *)inputs[0].ptr;
            auto y = cast(void *)inputs[1].ptr;
            auto dx = cast(void *)output.ptr;

            cudnnSoftmaxBackward(handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, desc, y, desc, dy,
                &beta, desc, dx).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnTensorDescriptor_t desc;
    }

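    /**
        Adds a per-channel bias to the result of the preceding operation. inputs[0] is first copied into
        the output buffer, then cudnnAddTensor adds inputs[1] (viewed as a 1 x C x 1 x 1 tensor) in place.
    */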
    class AddBias : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&cDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(cDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();

            cudnnCreateTensorDescriptor(&aDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(aDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, shape[1], 1, 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(cDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(aDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            cuMemcpy(output.ptr, inputs[0].ptr, output.numBytes);

            float alpha = 1;
            float beta = 1;

            cudnnAddTensor(handle, &alpha, aDesc, cast(void *)inputs[1].ptr, &beta, cDesc, cast(void *)output.ptr);
        }

        cudnnTensorDescriptor_t cDesc;
        cudnnTensorDescriptor_t aDesc;
    }

    class AddBiasGrad : CUDAKernel
    {
        this(Operation op)
        {
            auto shape = op.deps[0].shape.map!(x => cast(int)x).array();

            cudnnCreateTensorDescriptor(&dyDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(dyDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1],
                reduce!"a * b"(1, shape[2 .. $]), 1).cudnnCheck();

            cudnnCreateTensorDescriptor(&dbDesc).cudnnCheck();
            cudnnSetTensor4dDescriptor(dbDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, shape[1], 1, 1).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(dyDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(dbDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 1.0f;

            cudnnConvolutionBackwardBias(handle, &alpha, dyDesc, cast(void *)inputs[0].ptr, &beta, dbDesc,
                cast(void *)output.ptr);
        }

        cudnnTensorDescriptor_t dyDesc;
        cudnnTensorDescriptor_t dbDesc;
    }

    class BatchNormTrain : CUDAKernel
    {
        this(Operation op)
        {
            if(op.rank == 2)
            {
                mode = CUDNN_BATCHNORM_PER_ACTIVATION;
            }
            else
            {
                mode = CUDNN_BATCHNORM_SPATIAL;
            }

            import std.range;

            auto shape = op.deps[0].shape
                        .chain(repeat(1))
                        .map!(x => cast(int)x)
                        .take(4)
                        .array();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&bnDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], shape[2],
                shape[3]).cudnnCheck();
            cudnnDeriveBNTensorDescriptor(bnDesc, xDesc, mode).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(bnDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            cudnnBatchNormalizationForwardTraining(handle, mode, &alpha, &beta, xDesc,
                cast(void *)inputs[0].ptr, xDesc, cast(void *)output.ptr, bnDesc, cast(void *)inputs[1].ptr,
                cast(void *)inputs[2].ptr, 0, null, null, 1e-5f, null, null).cudnnCheck();
        }

        cudnnBatchNormMode_t mode;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t bnDesc;
    }

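    /**
        Gradient of batch normalisation. inputs[0] is the gradient flowing back from the next operation,
        inputs[1] is the original input tensor, and inputs[2] is the scale parameter. The output buffer is
        packed as [dx | dscale | dbias], where dx occupies the same number of bytes as inputs[1].
    */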
    class BatchNormGrad : CUDAKernel
    {
        this(Operation op)
        {
            if(op.deps[1].rank == 2)
            {
                mode = CUDNN_BATCHNORM_PER_ACTIVATION;
            }
            else
            {
                mode = CUDNN_BATCHNORM_SPATIAL;
            }

            import std.range;

            auto shape = op.deps[1].shape
                        .chain(repeat(1))
                        .map!(x => cast(int)x)
                        .take(4)
                        .array();

            cudnnCreateTensorDescriptor(&xDesc).cudnnCheck();
            cudnnCreateTensorDescriptor(&bnDesc).cudnnCheck();

            cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], shape[2],
                shape[3]).cudnnCheck();
            cudnnDeriveBNTensorDescriptor(bnDesc, xDesc, mode).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyTensorDescriptor(xDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(bnDesc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            void *dx = cast(void *)(output.ptr);
            void *dscale = cast(void *)(output.ptr + inputs[1].numBytes);
            void *dbias = cast(void *)(output.ptr + inputs[1].numBytes + inputs[2].numBytes);

            cudnnBatchNormalizationBackward(handle, mode, &alpha, &beta, &alpha, &beta, xDesc,
                cast(void *)inputs[1].ptr, xDesc, cast(void *)inputs[0].ptr, xDesc, dx, bnDesc,
                cast(void *)inputs[2].ptr, dscale, dbias, 1e-5f, null, null);
        }

        cudnnBatchNormMode_t mode;
        cudnnTensorDescriptor_t xDesc;
        cudnnTensorDescriptor_t bnDesc;
    }
}
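
/*
    Illustrative sketch only, not part of the original module: how another cuDNN-backed kernel could be
    added by following the pattern above. The op name "tanh" and the binding symbols
    cudnnCreateActivationDescriptor, cudnnSetActivationDescriptor, cudnnActivationForward and
    CUDNN_ACTIVATION_TANH are assumptions about the op registry and the derelict-cudnn5 binding; verify
    them before enabling this block.
*/
version(none)
{
    class Tanh : CUDAKernel
    {
        this(Operation op)
        {
            import std.range : chain, repeat, take;

            //Collapse the operation's shape into a 4D NCHW descriptor, padding with 1s if necessary
            auto shape = op.shape
                        .chain(repeat(1))
                        .map!(x => cast(int)x)
                        .take(4)
                        .array();

            cudnnCreateTensorDescriptor(&desc).cudnnCheck();
            cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, shape[0], shape[1], shape[2],
                shape[3]).cudnnCheck();

            cudnnCreateActivationDescriptor(&actDesc).cudnnCheck();
            cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_TANH, CUDNN_PROPAGATE_NAN, 0.0).cudnnCheck();
        }

        ~this()
        {
            cudnnDestroyActivationDescriptor(actDesc).cudnnCheck();
            cudnnDestroyTensorDescriptor(desc).cudnnCheck();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            float alpha = 1.0f;
            float beta = 0.0f;

            //Apply the activation elementwise: output = tanh(inputs[0])
            cudnnActivationForward(handle, actDesc, &alpha, desc, cast(void *)inputs[0].ptr, &beta, desc,
                cast(void *)output.ptr).cudnnCheck();

            cuCtxSynchronize();
        }

        cudnnActivationDescriptor_t actDesc;
        cudnnTensorDescriptor_t desc;
    }

    //The kernel would then be registered inside initializeCuDNN5(), e.g.:
    //registerCUDAKernel("tanh", toDelegate(&cudaKernelCtr!Tanh));
}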