/**
This is the main interface for the dopt CUDA backend.

The APIs in this module allow users to evaluate operation graphs on GPUs through the use of CUDA. There is also
functionality to register CUDA implementations of custom operations.

In future, this module will also have an interface allowing the user to register their own optimisation passes to
be called when constructing a plan.

Authors: Henry Gouk
*/
module dopt.cuda;

import std.exception;

import dopt.cuda.basic;
import dopt.cuda.nvrtc;
import dopt.cuda.math;
import dopt.cuda.nnet;
import dopt.cuda.random;
import dopt.core;

import derelict.cuda;

/**
Signature for a factory that constructs a $(D CUDAKernel) capable of executing the given operation.
*/
alias CUDAKernelCtr = CUDAKernel delegate(Operation op);

private __gshared
{
    CUdevice mDevice;
    CUcontext mContext;
}

/**
Loads the CUDA driver, creates a context on device 0, initialises all kernel submodules, and installs the
CUDA implementations as the default evaluator/compiler/allocator for dopt.

Failures are non-fatal: if CUDA is unavailable, a message is written to stderr and the CPU defaults are
left in place.
*/
shared static this()
{
    try
    {
        DerelictCUDADriver.load();

        //Initialise CUDA and create a context on the first device
        cuInit(0);
        cuDeviceGet(&mDevice, 0);
        cuCtxCreate(&mContext, 0, mDevice);

        //Initialize submodules so they can register their kernel constructors
        dopt.cuda.basic.initialize();
        dopt.cuda.nvrtc.initialize();
        dopt.cuda.math.initialize();
        dopt.cuda.nnet.initialize();
        dopt.cuda.random.initialize();

        import std.functional : toDelegate;
        defaultEvaluator = toDelegate(&evaluateCUDA);
        defaultCompiler = (Operation[] ops) { return new CUDAPlan(ops); };
        defaultVarAllocator = (size_t numBytes) { return CUDABuffer.create(numBytes); };
    }
    catch(Exception e)
    {
        //Deliberately best-effort: the CUDA backend is optional, so report the failure rather than aborting.
        import std.stdio : stderr;

        try
        {
            stderr.writeln("dopt.cuda: failed to initialise the CUDA backend: ", e.msg);
        }
        catch(Exception)
        {
            //Even logging can fail during module construction; give up silently in that case.
        }
    }
}

/**
Provides a common interface for CUDA kernels.
*/
interface CUDAKernel
{
    /**
    Runs the kernel with the given inputs and outputs.

    Params:
        inputs = An array of CUDABuffer objects, each corresponding to one of the dependencies of the operation
                 used to construct this kernel.
        output = The destination buffer.
    */
    void execute(const(CUDABuffer)[] inputs, CUDABuffer output);
}

/**
Fallback kernel that evaluates an operation on the CPU when no CUDA kernel constructor is registered for its
opType. Inputs are copied device-to-host, the op is evaluated with $(D evaluateCPU), and the result is copied
back to the device. Correct, but slow — intended only as a safety net.
*/
private class CUDACPUKernel : CUDAKernel
{
    this(Operation op)
    {
        import std.algorithm : map;
        import std.array : array;

        //Mirror each dependency with a fresh variable so the op can be re-evaluated on the CPU
        mDeps = op
               .deps
               .map!(x => variable(x.outputType))
               .array();

        mOp = createOperation(op.opType, mDeps, op.attributes);
    }

    void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
    {
        import std.range : zip;
        import dopt.cpu : evaluateCPU;

        //Copy each device input into the corresponding CPU-side variable
        foreach(cudaInput, cpuInput; zip(inputs, mDeps))
        {
            cpuInput.value.set(cudaInput);
        }

        DeviceBuffer ret = evaluateCPU([mOp])[0];

        output.set(ret);
    }

    Operation[] mDeps;
    Operation mOp;
}

private CUDAKernel cudaCPUCtr(Operation op)
{
    return new CUDACPUKernel(op);
}

/**
A class that encapsulates the CUDA memory allocation/deallocation process.
*/
class CUDABuffer : DeviceBuffer
{
    public
    {
        /**
        Constructs a CUDABuffer object and allocates memory on the CUDA device.

        The allocated memory is zero-initialised. A request for zero bytes performs no device allocation
        and returns an empty buffer.

        Params:
            numBytes = The number of bytes to be allocated on the CUDA device.

        Returns:
            A new $(D CUDABuffer) backed by $(D numBytes) of device memory.
        */
        static CUDABuffer create(size_t numBytes)
        {
            import core.memory : GC;
            import std.conv : to;

            //Rely on the GC to run some finalisers to free CUDA memory. I know this is bad please help.
            GC.collect();

            CUDABuffer ret = new CUDABuffer();

            if(numBytes == 0)
            {
                return ret;
            }

            ret.mNumBytes = numBytes;
            enforce(cuMemAlloc(&(ret.mPtr), ret.mNumBytes) == CUDA_SUCCESS,
                "CUDA memory allocation failed: unable to allocate " ~ numBytes.to!string ~ " bytes");
            enforce(cuMemsetD8(ret.mPtr, 0, ret.mNumBytes) == CUDA_SUCCESS,
                "CUDA default buffer initialisation failed");

            return ret;
        }

        /**
        Releases the CUDA resources used by buf internally.

        Safe to call on a null reference, on a zero-byte buffer (which never allocated device memory),
        and more than once on the same buffer — subsequent calls are no-ops.

        Params:
            buf = The buffer whose device memory should be released.
        */
        static void destroy(CUDABuffer buf)
        {
            //Zero-byte buffers hold no device allocation, and freeing twice would be an error
            if(buf is null || buf.mPtr == 0)
            {
                return;
            }

            enforce(cuMemFree(buf.mPtr) == CUDA_SUCCESS, "Failed to free CUDA device memory.");

            //Reset so a second destroy (or a use-after-free) cannot free the same pointer again
            buf.mPtr = 0;
            buf.mNumBytes = 0;
        }

        /**
        Copies data from the host to the device.

        Params:
            buf = An array of data to be copied to the device. Must be exactly $(D numBytes) long.
        */
        override void set(const void[] buf)
        {
            enforce(buf.length == mNumBytes, "input buffer is the wrong length.");
            enforce(cuMemcpyHtoD(mPtr, buf.ptr, buf.length) == CUDA_SUCCESS, "Failed to set contents of CUDA buffer");
        }

        /**
        Copies the contents of another device buffer into this one.

        Uses a device-to-device copy when $(D buf) is a $(D CUDABuffer), a host-to-device copy when it is a
        CPU buffer, and otherwise falls back to the generic $(D DeviceBuffer) implementation.

        Params:
            buf = The source buffer. Must be the same size as this buffer.
        */
        override void set(const DeviceBuffer buf)
        {
            import dopt.cpu : CPUBuffer;

            enforce(numBytes == buf.numBytes, "Mismatch in buffer size");

            auto cubuf = cast(CUDABuffer)buf;
            auto cpubuf = cast(CPUBuffer)buf;

            if(cubuf !is null)
            {
                //Check the copy result, consistent with the other CUDA calls in this class
                enforce(cuMemcpyDtoD(mPtr, cubuf.ptr, numBytes) == CUDA_SUCCESS,
                    "Failed to copy between CUDA buffers");
            }
            else if(cpubuf !is null)
            {
                enforce(cuMemcpyHtoD(mPtr, cpubuf.raw.ptr, numBytes) == CUDA_SUCCESS,
                    "Failed to set contents of CUDA buffer");
            }
            else
            {
                super.set(buf);
            }
        }

        /**
        Copies data from the device to the host.

        Params:
            buf = The buffer that the data from the CUDA device will be written to. Must be exactly
                  $(D numBytes) long.
        */
        override void get(void[] buf) const
        {
            enforce(buf.length == mNumBytes, "output buffer is the wrong length.");
            enforce(cuMemcpyDtoH(buf.ptr, mPtr, buf.length) == CUDA_SUCCESS, "Failed to get contents of CUDA buffer");
        }

        /**
        Provides the size of the buffer allocated on the CUDA device.

        Returns:
            The number of bytes allocated on the CUDA device.
        */
        override size_t numBytes() const
        {
            return mNumBytes;
        }

        /**
        Provides the device pointer.

        Returns:
            A CUDA device pointer.
        */
        inout(CUdeviceptr) ptr() inout
        {
            return mPtr;
        }
    }

    private
    {
        size_t mNumBytes;  //Size of the device allocation; 0 for an empty buffer
        CUdeviceptr mPtr;  //Device pointer; 0 when no memory is allocated

        this()
        {
            //Construction goes through create() so allocation failures can be reported
        }

        void zero()
        {
            enforce(cuMemsetD8(mPtr, 0, mNumBytes) == CUDA_SUCCESS, "CUDA zero buffer failed");
        }
    }
}

/**
A Plan stores all the resources (preallocated buffers, custom CUDA kernels) required to evaluate nodes from the
Operation graph.

An instance of Plan can be constructed using the $(D compileCUDA) function. The primary use case for a CUDAPlan is when the
same set of operations are likely to be evaluated more than once. This prevents the dopt CUDA runtime from
reallocating and optimising the CUDA kernels every time the same set of operations is to be executed.
*/
class CUDAPlan : Plan
{
    public
    {
        ///Accumulated kernel execution time in microseconds, keyed by operation type
        long[string] profiler;

        this(Operation[] outputs)
        {
            import std.algorithm : canFind, filter;
            import std.array : array;
            import std.functional : toDelegate;

            super(outputs);

            auto sortedOps = topologicalSort(outputs);

            //Construct a kernel for every op that is not handled natively by the plan itself.
            //Ops with no registered constructor fall back to the (slow) CPU kernel.
            foreach(o; sortedOps)
            {
                if(o.opType == "variable" || o.opType == "reshape" || o.opType == "constant")
                {
                    continue;
                }

                auto k = mKernelCtrs.get(o.opType, toDelegate(&cudaCPUCtr));

                enforce(k !is null, "Could not construct a CUDA kernel for operation of type '" ~ o.opType ~ "'");

                mKernels[o] = k(o);
            }

            mOps = sortedOps.array;

            //Preallocate a result buffer for each op; reshape ops alias their dependency's buffer
            foreach(o; mOps)
            {
                if(o.opType == "reshape")
                {
                    //This will be overwritten in executeImpl, but we want a slot in the hashmap for it now.
                    mResults[o] = mResults[o.deps[0]];
                }
                else
                {
                    mResults[o] = CUDABuffer.create(o.volume * o.elementType.sizeOf);

                    if(o.opType == "constant")
                    {
                        mResults[o].set(o.value);
                    }
                }
            }

            mResults.rehash();
        }

        ~this()
        {
            //NOTE(review): cleanup() calls enforce, which may allocate — risky in a GC finaliser. Kept as-is.
            cleanup();
        }

        /**
        Releases CUDA resources associated with this plan. Idempotent.
        */
        void cleanup()
        {
            if(mClean)
            {
                return;
            }

            foreach(o; mOps)
            {
                //Reshape results alias another op's buffer, so freeing them would double free
                if(o.opType != "reshape")
                {
                    CUDABuffer.destroy(mResults[o]);
                }
            }

            mClean = true;
        }
    }

    protected
    {
        override void executeImpl(DeviceBuffer[Operation] args, DeviceBuffer[] rets)
        {
            import std.datetime.stopwatch : StopWatch;
            StopWatch sw;

            //Make sure all the args are variable assignments. Is this arbitrary?
            foreach(o; args.keys)
            {
                enforce(o.opType == "variable",
                    "All assignments in args must be for Operations with an opType of 'variable'");
            }

            //Iterate through each operation and execute it
            foreach(o; mOps)
            {
                if(o.opType == "variable" || o.opType == "constant")
                {
                    continue;
                }

                //Get the input buffers
                CUDABuffer[] inputs;
                CUDABuffer output = mResults[o];

                foreach(d; o.deps)
                {
                    if(d.opType == "variable")
                    {
                        CUDABuffer cubuf;

                        if(d in args)
                        {
                            //Use the caller's buffer directly when it already lives on the device;
                            //otherwise copy it into the preallocated slot
                            cubuf = cast(CUDABuffer)args[d];

                            if(cubuf is null)
                            {
                                cubuf = mResults[d];
                                cubuf.set(args[d]);
                            }
                        }
                        else
                        {
                            cubuf = cast(CUDABuffer)d.value;

                            if(cubuf is null)
                            {
                                cubuf = mResults[d];
                                cubuf.set(d.value);
                            }
                        }

                        inputs ~= cubuf;
                    }
                    else
                    {
                        inputs ~= mResults[d];
                    }
                }

                if(o.opType == "reshape")
                {
                    //Reshape is a no-op on the raw buffer: just alias the input
                    mResults[o] = inputs[0];
                }
                else
                {
                    //Execute the operation, timing it for the profiler
                    sw.reset();
                    sw.start();
                    mKernels[o].execute(inputs, output);
                    sw.stop();

                    profiler[o.opType] = profiler.get(o.opType, 0) + sw.peek.split.usecs;
                }
            }

            foreach(i, o; mOutputs)
            {
                rets[i].set(mResults[o]);
            }
        }
    }

    private
    {
        Operation[] mOps;                 //Topologically sorted operations
        CUDAKernel[Operation] mKernels;   //Kernel for each op that needs one
        CUDABuffer[Operation] mResults;   //Preallocated result buffer per op
        bool mClean = false;              //True once cleanup() has released device memory
    }
}

/**
Used for performing a one-off evaluation of a set of operations.

If you are planning to operate the same set of operations multiple times, but with different variables assignments,
then you should construct a $(D CUDAPlan).

Params:
    ops = The operations to be evaluated.
    args = A set of optional variable assignments.

Returns:
    The result of evaluating $(D ops).
*/
DeviceBuffer[] evaluateCUDA(Operation[] ops, DeviceBuffer[Operation] args = null)
{
    auto p = new CUDAPlan(ops);

    auto ret = p.execute(args);

    return ret;
}

/**
A convenience overload that evaluates a single operation and returns a single $(D DeviceBuffer).

Params:
    op = The operation to be evaluated.
    args = A set of optional variable assignments.

Returns:
    The result of evaluating $(D op)
*/
DeviceBuffer evaluateCUDA(Operation op, DeviceBuffer[Operation] args = null)
{
    return evaluateCUDA([op], args)[0];
}

/**
Registers a CUDA kernel constructor for a given operation type.

Params:
    opName = The type of operation this kernel constructor caters to.
    kernelCtr = The constructor that should be associated with operations with the type $(D opType).

Throws:
    Exception if a constructor is already registered for $(D opName).
*/
void registerCUDAKernel(string opName, CUDAKernelCtr kernelCtr)
{
    enforce((opName in mKernelCtrs) is null,
        "A CUDAKernelCtr is already registered for the operation '" ~ opName ~ "'");

    mKernelCtrs[opName] = kernelCtr;
}

/**
Deregisters a kernel constructor associated with the given operation type.

Params:
    opType = The operation type that should have its kernel deregistered.
*/
void deregisterCUDAKernel(string opType)
{
    mKernelCtrs.remove(opType);
}

/**
Provides a list of all operation types supported by the CUDA backend.

Returns:
    A string array of the operation types that have kernels registered, plus the op types the plan
    handles natively without a kernel.
*/
string[] listCUDAOperations()
{
    //"constant" is also handled natively by CUDAPlan, so advertise it alongside "variable" and "reshape"
    return mKernelCtrs.keys ~ ["variable", "reshape", "constant"];
}

package
{
    /**
    Maps a dopt $(D DataType) to the equivalent CUDA C type name used in generated kernel source.
    */
    string cudaType(DataType t)
    {
        switch(t)
        {
            case DataType.float32:
                return "float";

            case DataType.int32:
                return "int";

            default:
                import std.conv : to;
                assert(0, "DataType '" ~ t.to!string ~ "' is not currently supported by the CUDA backend");
        }
    }
}

private
{
    CUDAKernelCtr[string] mKernelCtrs;  //Registered kernel constructors, keyed by operation type
}

unittest
{
    auto a = float32([], [3.0f]);
    auto b = float32([], [4.0f]);
    auto c = float32([], [-1.0f]);

    auto y = a * b + c;

    assert(evaluateCUDA(y).get!float[0] == 11.0f);
}