/**
    This is the main interface for the dopt CUDA backend.

    The APIs in this module allow users to evaluate operation graphs on GPUs through the use of CUDA. There is also
    functionality to register CUDA implementations of custom operations.

    In the future, this module will also have an interface allowing the user to register their own optimisation
    passes to be called when constructing a plan.

    Authors: Henry Gouk
*/
module dopt.core.cuda;

import std.exception;

import dopt.core.cuda.basic;
import dopt.core.cuda.nvrtc;
import dopt.core.cuda.math;
import dopt.core.cuda.nnet;
import dopt.core.cuda.random;
import dopt.core.ops;
import dopt.core.types;

import derelict.cuda;

/**
    A delegate type that, given an $(D Operation), constructs a $(D CUDAKernel) capable of executing it.
*/
alias CUDAKernelCtr = CUDAKernel delegate(Operation op);

private __gshared
{
    CUdevice mDevice;
    CUcontext mContext;
}

void initialize()
{
    //TODO: handle case where CUDA isn't available
    DerelictCUDADriver.load();

    //Initialise CUDA and create a context
    cuInit(0);
    cuDeviceGet(&mDevice, 0);
    cuCtxCreate(&mContext, 0, mDevice);

    //Initialize submodules
    dopt.core.cuda.basic.initialize();
    dopt.core.cuda.nvrtc.initialize();
    dopt.core.cuda.math.initialize();
    dopt.core.cuda.nnet.initialize();
    dopt.core.cuda.random.initialize();
}

/**
    Provides a common interface for CUDA kernels.
*/
interface CUDAKernel
{
    /**
        Runs the kernel with the given inputs and outputs.

        Params:
            inputs = An array of CUDABuffer objects, each corresponding to one of the dependencies of the operation
                     used to construct this kernel.
            output = The destination buffer.
    */
    void execute(const(CUDABuffer)[] inputs, CUDABuffer output);
}
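/*
    A minimal sketch of what a custom kernel might look like. This is illustrative only, and assumes a
    hypothetical "copy" operation type that has exactly one dependency with the same volume and element type as
    its output:

    ---
    class CopyKernel : CUDAKernel
    {
        void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            enforce(inputs.length == 1 && inputs[0].numBytes == output.numBytes,
                "CopyKernel expects a single input with the same size as the output");

            //Perform a plain device-to-device copy from the input buffer to the output buffer
            cuMemcpyDtoD(output.ptr, inputs[0].ptr, output.numBytes);
        }
    }
    ---

    A constructor for this kernel could then be associated with the hypothetical "copy" operation type using
    $(D registerCUDAKernel("copy", op => new CopyKernel())).
*/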
/**
    A class that encapsulates the CUDA memory allocation/deallocation process.
*/
class CUDABuffer
{
    public
    {
        /**
            Constructs a CUDABuffer object and allocates memory on the CUDA device.

            Params:
                numBytes = The number of bytes to be allocated on the CUDA device.
        */
        static CUDABuffer create(size_t numBytes)
        {
            CUDABuffer ret = new CUDABuffer();
            ret.mNumBytes = numBytes;
            enforce(cuMemAlloc(&(ret.mPtr), ret.mNumBytes) == CUDA_SUCCESS, "CUDA memory allocation failed");
            enforce(cuMemsetD8(ret.mPtr, 0, ret.mNumBytes) == CUDA_SUCCESS,
                "CUDA default buffer initialisation failed");

            return ret;
        }

        /**
            Releases the device memory held by the given buffer.

            Params:
                buf = The buffer to be deallocated.
        */
        static void destroy(CUDABuffer buf)
        {
            enforce(cuMemFree(buf.mPtr) == CUDA_SUCCESS, "Failed to free CUDA device memory.");
        }

        /**
            Copies data from the host to the device.

            Params:
                buf = An array of data to be copied to the device.
        */
        void set(const void[] buf)
        {
            enforce(buf.length == mNumBytes, "input buffer is the wrong length.");
            enforce(cuMemcpyHtoD(mPtr, buf.ptr, buf.length) == CUDA_SUCCESS, "Failed to set contents of CUDA buffer");
        }

        /**
            Copies data from the device to the host.

            Params:
                buf = The buffer that the data from the CUDA device will be written to.
        */
        void get(void[] buf) const
        {
            enforce(buf.length == mNumBytes, "output buffer is the wrong length.");
            enforce(cuMemcpyDtoH(buf.ptr, mPtr, buf.length) == CUDA_SUCCESS, "Failed to get contents of CUDA buffer");
        }

        /**
            Provides the size of the buffer allocated on the CUDA device.

            Returns:
                The number of bytes allocated on the CUDA device.
        */
        size_t numBytes() const
        {
            return mNumBytes;
        }

        /**
            Provides the device pointer.

            Returns:
                A CUDA device pointer.
        */
        inout(CUdeviceptr) ptr() inout
        {
            return mPtr;
        }
    }

    private
    {
        size_t mNumBytes;
        CUdeviceptr mPtr;

        this()
        {
            //Instances should be created via CUDABuffer.create
        }

        void zero()
        {
            enforce(cuMemsetD8(mPtr, 0, mNumBytes) == CUDA_SUCCESS, "CUDA zero buffer failed");
        }
    }
}
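//A round-trip sketch of the CUDABuffer API: host data is copied to the device and then read back again. Like
//the other tests in this module, it assumes initialize() has been called and a CUDA-capable device is present.
unittest
{
    float[] hostData = [1.0f, 2.0f, 3.0f, 4.0f];

    auto buf = CUDABuffer.create(hostData.length * float.sizeof);
    scope(exit) CUDABuffer.destroy(buf);

    //Copy the host array to the device, then read it back into a second host array
    buf.set(hostData);

    auto roundTripped = new float[hostData.length];
    buf.get(roundTripped);

    assert(roundTripped == hostData);
}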
/**
    A CUDAPlan stores all the resources (preallocated buffers, custom CUDA kernels) required to evaluate nodes
    from the Operation graph.

    An instance of CUDAPlan can be constructed directly from the set of operations it should evaluate. The primary
    use case for a CUDAPlan is when the same set of operations is likely to be evaluated more than once: reusing
    the plan prevents the dopt CUDA runtime from reallocating buffers and reconstructing CUDA kernels every time
    the operations are executed.
*/
class CUDAPlan
{
    public
    {
        long[string] profiler;

        this(Operation[] outputs)
        {
            import std.array : array;

            auto sortedOps = topologicalSort(outputs);

            foreach(o; sortedOps)
            {
                if(o.opType == "variable" || o.opType == "reshape" || o.opType == "constant")
                {
                    continue;
                }

                auto k = mKernelCtrs.get(o.opType, null);

                enforce(k !is null, "Could not construct a CUDA kernel for operation of type '" ~ o.opType ~ "'");

                mKernels[o] = k(o);
            }

            mOps = sortedOps.array;
            mOutputs = outputs.array;

            foreach(o; mOps)
            {
                //For reshape operations, we will just reuse the buffer of o.deps[0]
                if(o.opType == "reshape")
                {
                    results[o] = results[o.deps[0]];
                }
                else
                {
                    results[o] = CUDABuffer.create(o.volume * o.elementType.sizeOf);

                    if(o.opType == "constant")
                    {
                        results[o].set(o.value.as!ubyte);
                    }
                }
            }

            results.rehash();
        }

        /**
            Executes the plan and returns the results in a set of newly allocated buffers.

            Params:
                args = A set of variable assignments.

            Returns:
                An array of buffers containing the values of the output operations.
        */
        Buffer[] execute(Buffer[Operation] args = null)
        {
            auto rets = new Buffer[mOutputs.length];

            foreach(i, o; mOutputs)
            {
                rets[i] = Buffer(new ubyte[o.outputType.volume * o.outputType.elementType.sizeOf()]);
            }

            execute(args, rets);

            return rets;
        }

        /**
            Executes the plan, writing the results into the given buffers.

            Params:
                args = A set of variable assignments.
                rets = An array of buffers that the values of the output operations will be written to.
        */
        void execute(Buffer[Operation] args, Buffer[] rets)
        {
            import std.datetime : StopWatch;
            StopWatch sw;

            //Make sure all the args are variable assignments
            foreach(o; args.keys)
            {
                enforce(o.opType == "variable",
                    "All assignments in args must be for Operations with an opType of 'variable'");
            }

            //Load the args into their buffers
            foreach(k, v; args)
            {
                results[k].set(v.as!ubyte);
            }

            //Iterate through each operation and execute it
            foreach(o; mOps)
            {
                if(o.opType == "variable")
                {
                    //Variables that were not overridden in args are loaded from their default values
                    if(!(o in args))
                    {
                        sw.reset();
                        sw.start();

                        auto buf = cast(Buffer)o.value;
                        results[o].set(buf.as!ubyte);

                        sw.stop();

                        profiler["variable"] = profiler.get("variable", 0) + sw.peek.usecs;
                    }

                    continue;
                }
                else if(o.opType == "reshape" || o.opType == "constant")
                {
                    continue;
                }

                //Get the input buffers
                CUDABuffer[] inputs;

                foreach(d; o.deps)
                {
                    inputs ~= results[d];
                }

                //Execute the operation
                sw.reset();
                sw.start();
                results[o].zero();
                mKernels[o].execute(inputs, results[o]);
                sw.stop();

                profiler[o.opType] = profiler.get(o.opType, 0) + sw.peek.usecs;
            }

            foreach(i, o; mOutputs)
            {
                results[o].get(rets[i].as!ubyte);
            }
        }

        ~this()
        {
            cleanup();
        }

        /**
            Releases the device buffers allocated by this plan. Calling this more than once has no effect.
        */
        void cleanup()
        {
            if(clean)
            {
                return;
            }

            foreach(o; mOps)
            {
                if(o.opType != "reshape")
                {
                    CUDABuffer.destroy(results[o]);
                }
            }

            clean = true;
        }
    }

    private
    {
        Operation[] mOutputs;
        Operation[] mOps;
        CUDAKernel[Operation] mKernels;
        CUDABuffer[Operation] results;
        bool clean = false;
    }
}

/**
    Used for performing a one-off evaluation of a set of operations.

    If you are planning to evaluate the same set of operations multiple times, but with different variable
    assignments, then you should construct a $(D CUDAPlan).

    Params:
        ops = The operations to be evaluated.
        args = A set of optional variable assignments.

    Returns:
        The result of evaluating $(D ops).
*/
Buffer[] evaluateCUDA(Operation[] ops, Buffer[Operation] args = null)
{
    auto p = new CUDAPlan(ops);

    auto ret = p.execute(args);

    p.cleanup();

    return ret;
}

/**
    A convenience overload that evaluates a single operation and returns a single $(D Buffer).

    Params:
        op = The operation to be evaluated.
        args = A set of optional variable assignments.

    Returns:
        The result of evaluating $(D op).
*/
Buffer evaluateCUDA(Operation op, Buffer[Operation] args = null)
{
    return evaluateCUDA([op], args)[0];
}
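//A sketch of reusing a CUDAPlan: the plan is compiled once, then executed twice with different assignments for
//the variable a. This assumes a CUDA device is available, and that float32 produces a "variable" operation
//whose default value can be overridden through args.
unittest
{
    auto a = float32([], [2.0f]);
    auto b = float32([], [5.0f]);

    auto plan = new CUDAPlan([a + b]);
    scope(exit) plan.cleanup();

    //With no arguments, the default value stored in a (2.0f) is used
    assert(plan.execute()[0].as!float[0] == 7.0f);

    //Override the value of a for this execution only
    assert(plan.execute([a: Buffer(cast(ubyte[])[10.0f])])[0].as!float[0] == 15.0f);
}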
/**
    Registers a CUDA kernel constructor for a given operation type.

    Params:
        opName = The type of operation this kernel constructor caters to.
        kernelCtr = The constructor that should be associated with operations with the type $(D opName).
*/
void registerCUDAKernel(string opName, CUDAKernelCtr kernelCtr)
{
    enforce((opName in mKernelCtrs) is null,
        "A CUDAKernelCtr is already registered for the operation '" ~ opName ~ "'");

    mKernelCtrs[opName] = kernelCtr;
}

/**
    Deregisters the kernel constructor associated with the given operation type.

    Params:
        opType = The operation type that should have its kernel constructor deregistered.
*/
void deregisterCUDAKernel(string opType)
{
    mKernelCtrs.remove(opType);
}

/**
    Provides a list of all operation types supported by the CUDA backend.

    Returns:
        A string array of the operation types that have kernels registered, plus the "variable", "reshape", and
        "constant" operations that the backend handles without a kernel.
*/
string[] listCUDAOperations()
{
    return mKernelCtrs.keys ~ ["variable", "reshape", "constant"];
}

package
{
    string cudaType(DataType t)
    {
        switch(t)
        {
            case DataType.float32:
                return "float";

            case DataType.int32:
                return "int";

            default:
                import std.conv : to;
                assert(0, "DataType '" ~ t.to!string ~ "' is not currently supported by the CUDA backend");
        }
    }
}

private
{
    CUDAKernelCtr[string] mKernelCtrs;
}

unittest
{
    auto a = float32([], [3.0f]);
    auto b = float32([], [4.0f]);
    auto c = float32([], [-1.0f]);

    auto y = a * b + c;

    assert(evaluateCUDA(y).as!float[0] == 11.0f);
}
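//A sketch of a one-off evaluation with an explicit variable assignment: the default value of a is overridden
//for the duration of the call. As with the test above, this assumes a CUDA device is available.
unittest
{
    auto a = float32([], [1.0f]);
    auto b = float32([], [2.0f]);

    auto y = a + b;

    assert(evaluateCUDA(y, [a: Buffer(cast(ubyte[])[4.0f])]).as!float[0] == 6.0f);
}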