/**
    This is the main interface for the dopt CUDA backend.

    The APIs in this module allow users to evaluate operation graphs on GPUs through the use of CUDA. There is also
    functionality to register CUDA implementations of custom operations.

    In future, this module will also have an interface allowing the user to register their own optimisation passes to
    be called when constructing a plan.

    Authors: Henry Gouk
*/
module dopt.cuda;

import std.exception;

import dopt.cuda.basic;
import dopt.cuda.nvrtc;
import dopt.cuda.math;
import dopt.cuda.nnet;
import dopt.cuda.random;
import dopt.core;

import derelict.cuda;

alias CUDAKernelCtr = CUDAKernel delegate(Operation op);

private __gshared
{
    CUdevice mDevice;
    CUcontext mContext;
}

/**
    Initialises the CUDA driver, creates a device context, registers all the built-in kernels for the CUDA backend,
    and makes CUDA the default backend.
*/
shared static this()
{
    try
    {
        DerelictCUDADriver.load();

        //Initialise CUDA and create a context
        cuInit(0);
        cuDeviceGet(&mDevice, 0);
        cuCtxCreate(&mContext, 0, mDevice);

        //Initialise the submodules that register the built-in kernels
        dopt.cuda.basic.initialize();
        dopt.cuda.nvrtc.initialize();
        dopt.cuda.math.initialize();
        dopt.cuda.nnet.initialize();
        dopt.cuda.random.initialize();

        import std.functional : toDelegate;
        defaultEvaluator = toDelegate(&evaluateCUDA);
        defaultCompiler = (Operation[] ops) { return new CUDAPlan(ops); };
    }
    catch(Exception e)
    {
        //TODO: probably log something here
    }
}

/**
    Provides a common interface for CUDA kernels.
*/
interface CUDAKernel
{
    /**
        Runs the kernel with the given inputs and outputs.

        Params:
            inputs = An array of CUDABuffer objects, each corresponding to one of the dependencies of the operation
                     used to construct this kernel.
            output = The destination buffer.
    */
    void execute(const(CUDABuffer)[] inputs, CUDABuffer output);
}
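/*
    A minimal sketch of a user-defined kernel, shown here for illustration. It implements
    CUDAKernel for a hypothetical "identity" operation by copying its single input straight
    to the output buffer with a device-to-device memcpy; a real kernel would typically launch
    device code, e.g. compiled through dopt.cuda.nvrtc. The "identity" operation type is an
    assumption for this example, not part of dopt's built-in operation set.
*/
private class IdentityKernel : CUDAKernel
{
    this(Operation op)
    {
        mNumBytes = op.volume * op.elementType.sizeOf;
    }

    void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
    {
        //Copy the first (and only) input buffer directly into the output buffer
        enforce(cuMemcpyDtoD(output.ptr, inputs[0].ptr, mNumBytes) == CUDA_SUCCESS,
            "Device-to-device copy failed");
    }

    size_t mNumBytes;
}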
//Falls back to the CPU backend for operations that have no registered CUDA kernel
private class CUDACPUKernel : CUDAKernel
{
    this(Operation op)
    {
        import std.algorithm : map;
        import std.array : array;

        mDeps = op
               .deps
               .map!(x => variable(x.outputType))
               .array();

        mOp = createOperation(op.opType, mDeps, op.attributes);
    }

    void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
    {
        import std.range : zip;
        import dopt.cpu : evaluateCPU;

        //Copy the device buffers into the corresponding CPU variables
        foreach(cudaInput, cpuInput; zip(inputs, mDeps))
        {
            cudaInput.get(cpuInput.value.as!ubyte);
        }

        //Evaluate on the CPU, then copy the result back to the device
        Buffer ret = evaluateCPU([mOp])[0];

        output.set(ret.as!ubyte);
    }

    Operation[] mDeps;
    Operation mOp;
}

private CUDAKernel cudaCPUCtr(Operation op)
{
    return new CUDACPUKernel(op);
}

/**
    A class that encapsulates the CUDA memory allocation/deallocation process.
*/
class CUDABuffer
{
    public
    {
        /**
            Constructs a CUDABuffer object and allocates memory on the CUDA device.

            Params:
                numBytes = The number of bytes to be allocated on the CUDA device.
        */
        static CUDABuffer create(size_t numBytes)
        {
            CUDABuffer ret = new CUDABuffer();
            ret.mNumBytes = numBytes;
            enforce(cuMemAlloc(&(ret.mPtr), ret.mNumBytes) == CUDA_SUCCESS, "CUDA memory allocation failed");
            enforce(cuMemsetD8(ret.mPtr, 0, ret.mNumBytes) == CUDA_SUCCESS,
                "CUDA default buffer initialisation failed");

            return ret;
        }

        /**
            Releases the CUDA resources used by buf internally.

            Params:
                buf = The buffer to be destroyed.
        */
        static void destroy(CUDABuffer buf)
        {
            enforce(cuMemFree(buf.mPtr) == CUDA_SUCCESS, "Failed to free CUDA device memory.");
        }

        /**
            Copies data from the host to the device.

            Params:
                buf = An array of data to be copied to the device.
        */
        void set(const void[] buf)
        {
            enforce(buf.length == mNumBytes, "Input buffer is the wrong length.");
            enforce(cuMemcpyHtoD(mPtr, buf.ptr, buf.length) == CUDA_SUCCESS, "Failed to set contents of CUDA buffer");
        }

        /**
            Copies data from the device to the host.

            Params:
                buf = The buffer that the data from the CUDA device will be written to.
        */
        void get(void[] buf) const
        {
            enforce(buf.length == mNumBytes, "Output buffer is the wrong length.");
            enforce(cuMemcpyDtoH(buf.ptr, mPtr, buf.length) == CUDA_SUCCESS, "Failed to get contents of CUDA buffer");
        }

        /**
            Provides the size of the buffer allocated on the CUDA device.

            Returns:
                The number of bytes allocated on the CUDA device.
        */
        size_t numBytes() const
        {
            return mNumBytes;
        }

        /**
            Provides the device pointer.

            Returns:
                A CUDA device pointer.
        */
        inout(CUdeviceptr) ptr() inout
        {
            return mPtr;
        }
    }

    private
    {
        size_t mNumBytes;
        CUdeviceptr mPtr;

        this()
        {
            //
        }

        void zero()
        {
            enforce(cuMemsetD8(mPtr, 0, mNumBytes) == CUDA_SUCCESS, "CUDA zero buffer failed");
        }
    }
}
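/*
    A minimal usage sketch for CUDABuffer: round-trip four floats through device memory. Like
    the unittest at the end of this module, this assumes a working CUDA device, since the
    shared module constructor must have created a context for cuMemAlloc to succeed.
*/
unittest
{
    float[] host = [1.0f, 2.0f, 3.0f, 4.0f];

    auto buf = CUDABuffer.create(host.length * float.sizeof);
    scope(exit) CUDABuffer.destroy(buf);

    //set() copies host to device; get() copies device back to host
    buf.set(host);

    auto readback = new float[host.length];
    buf.get(readback);

    assert(readback == host);
}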
/**
    A CUDAPlan stores all the resources (preallocated buffers, custom CUDA kernels) required to evaluate nodes from
    the Operation graph.

    A CUDAPlan is constructed from the array of operations it should evaluate. The primary use case for a CUDAPlan is
    when the same set of operations is likely to be evaluated more than once: reusing a plan saves the dopt CUDA
    runtime from reallocating buffers and reconstructing kernels every time the operations are executed.
*/
class CUDAPlan : Plan
{
    public
    {
        long[string] profiler;

        this(Operation[] outputs)
        {
            import std.array : array;
            import std.functional : toDelegate;

            super(outputs);

            auto sortedOps = topologicalSort(outputs);

            foreach(o; sortedOps)
            {
                if(o.opType == "variable" || o.opType == "reshape" || o.opType == "constant")
                {
                    continue;
                }

                //Fall back to the CPU implementation when no CUDA kernel is registered for this opType
                auto k = mKernelCtrs.get(o.opType, toDelegate(&cudaCPUCtr));

                enforce(k !is null, "Could not construct a CUDA kernel for operation of type '" ~ o.opType ~ "'");

                mKernels[o] = k(o);
            }

            mOps = sortedOps.array;

            foreach(o; mOps)
            {
                //For reshape operations, we will just reuse the buffer of o.deps[0]
                if(o.opType == "reshape")
                {
                    results[o] = results[o.deps[0]];
                }
                else
                {
                    results[o] = CUDABuffer.create(o.volume * o.elementType.sizeOf);

                    if(o.opType == "constant")
                    {
                        results[o].set(o.value.as!ubyte);
                    }
                }
            }

            results.rehash();
        }

        ~this()
        {
            cleanup();
        }

        /**
            Releases the CUDA resources associated with this plan.
        */
        void cleanup()
        {
            if(clean)
            {
                return;
            }

            foreach(o; mOps)
            {
                if(o.opType != "reshape")
                {
                    CUDABuffer.destroy(results[o]);
                }
            }

            clean = true;
        }
    }

    protected
    {
        override void executeImpl(Buffer[Operation] args, Buffer[] rets)
        {
            import std.datetime.stopwatch : StopWatch;
            StopWatch sw;

            //Make sure all the args are variable assignments
            foreach(o; args.keys)
            {
                enforce(o.opType == "variable",
                    "All assignments in args must be for Operations with an opType of 'variable'");
            }

            //Load the args into their buffers
            foreach(k, v; args)
            {
                results[k].set(v.as!ubyte);
            }

            //Iterate through each operation and execute it
            foreach(o; mOps)
            {
                if(o.opType == "variable")
                {
                    //Variables not overridden by args keep their stored values
                    if(!(o in args))
                    {
                        sw.reset();
                        sw.start();

                        auto buf = cast(Buffer)o.value;
                        results[o].set(buf.as!ubyte);

                        sw.stop();

                        profiler["variable"] = profiler.get("variable", 0) + sw.peek.total!"usecs";
                    }

                    continue;
                }
                else if(o.opType == "reshape" || o.opType == "constant")
                {
                    continue;
                }

                //Get the input buffers
                CUDABuffer[] inputs;

                foreach(d; o.deps)
                {
                    inputs ~= results[d];
                }

                //Execute the operation
                sw.reset();
                sw.start();
                results[o].zero();
                mKernels[o].execute(inputs, results[o]);
                sw.stop();

                profiler[o.opType] = profiler.get(o.opType, 0) + sw.peek.total!"usecs";
            }

            foreach(i, o; mOutputs)
            {
                results[o].get(rets[i].as!ubyte);
            }
        }
    }

    private
    {
        Operation[] mOps;
        CUDAKernel[Operation] mKernels;
        CUDABuffer[Operation] results;
        bool clean = false;
    }
}

/**
    Used for performing a one-off evaluation of a set of operations.

    If you are planning to evaluate the same set of operations multiple times, but with different variable
    assignments, then you should construct a $(D CUDAPlan) instead.

    Params:
        ops = The operations to be evaluated.
        args = A set of optional variable assignments.

    Returns:
        The result of evaluating $(D ops).
*/
Buffer[] evaluateCUDA(Operation[] ops, Buffer[Operation] args = null)
{
    auto p = new CUDAPlan(ops);

    auto ret = p.execute(args);

    p.cleanup();

    return ret;
}

/**
    A convenience overload that evaluates a single operation and returns a single $(D Buffer).

    Params:
        op = The operation to be evaluated.
        args = A set of optional variable assignments.

    Returns:
        The result of evaluating $(D op).
*/
Buffer evaluateCUDA(Operation op, Buffer[Operation] args = null)
{
    return evaluateCUDA([op], args)[0];
}

/**
    Registers a CUDA kernel constructor for a given operation type.

    Params:
        opName = The type of operation this kernel constructor caters to.
        kernelCtr = The constructor that should be associated with operations of the type $(D opName).
*/
void registerCUDAKernel(string opName, CUDAKernelCtr kernelCtr)
{
    enforce((opName in mKernelCtrs) is null,
        "A CUDAKernelCtr is already registered for the operation '" ~ opName ~ "'");

    mKernelCtrs[opName] = kernelCtr;
}
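/*
    A usage sketch for the registration API, pairing registerCUDAKernel with
    deregisterCUDAKernel (declared below) so the global constructor table is left unchanged
    afterwards. The "identity" operation type and IdentityKernel are the illustrative
    assumptions introduced earlier in this module, not part of dopt's built-in operation set.
*/
unittest
{
    import std.algorithm : canFind;

    registerCUDAKernel("identity", (Operation op) => cast(CUDAKernel)new IdentityKernel(op));
    scope(exit) deregisterCUDAKernel("identity");

    //The registered type should now be reported as supported
    assert(listCUDAOperations().canFind("identity"));
}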
/**
    Deregisters the kernel constructor associated with the given operation type.

    Params:
        opType = The operation type that should have its kernel constructor deregistered.
*/
void deregisterCUDAKernel(string opType)
{
    mKernelCtrs.remove(opType);
}

/**
    Provides a list of all operation types supported by the CUDA backend.

    Returns:
        A string array of the operation types that have kernels registered, plus the types the backend handles
        without a kernel.
*/
string[] listCUDAOperations()
{
    return mKernelCtrs.keys ~ ["variable", "reshape", "constant"];
}

package
{
    string cudaType(DataType t)
    {
        switch(t)
        {
            case DataType.float32:
                return "float";

            case DataType.int32:
                return "int";

            default:
                import std.conv : to;
                assert(0, "DataType '" ~ t.to!string ~ "' is not currently supported by the CUDA backend");
        }
    }
}

private
{
    CUDAKernelCtr[string] mKernelCtrs;
}

unittest
{
    auto a = float32([], [3.0f]);
    auto b = float32([], [4.0f]);
    auto c = float32([], [-1.0f]);

    auto y = a * b + c;

    //3 * 4 + (-1) == 11
    assert(evaluateCUDA(y).as!float[0] == 11.0f);
}
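/*
    A sketch of the plan-reuse workflow described in the CUDAPlan documentation: compile once,
    execute repeatedly. This assumes $(D execute) from dopt.core's Plan base class accepts an
    optional set of variable assignments, as in the call made by evaluateCUDA above.
*/
unittest
{
    auto a = float32([], [2.0f]);
    auto b = float32([], [5.0f]);

    auto y = a + b;

    auto plan = new CUDAPlan([y]);
    scope(exit) plan.cleanup();

    //The second call reuses the buffers and kernels allocated when the plan was constructed
    assert(plan.execute(null)[0].as!float[0] == 7.0f);
    assert(plan.execute(null)[0].as!float[0] == 7.0f);
}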