module dopt.core.cuda.basic;

import std.functional;

import dopt.core.cuda;
import dopt.core.cuda.math;
import dopt.core.cuda.nvrtc;
import dopt.core.ops;
import dopt.core.types;

import derelict.cuda;

package
{
    void initialize()
    {
        registerCUDAKernel("slice", toDelegate(&cudaKernelCtr!Slice));
        registerCUDAKernel("pad", toDelegate(&cudaKernelCtr!Pad));
        registerCUDAKernel("repeat", toDelegate(&cudaKernelCtr!Repeat));
        registerCUDAKernel("transpose", toDelegate(&cudaKernelCtr!Transpose));
    }
}

private
{
    CUDAKernel cudaKernelCtr(K)(Operation op)
    {
        return new K(op);
    }

    class Slice : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            //Width of a single element, in bytes
            size_t size = sizeOf(mOp.outputType.elementType);

            //Recursively copy the hyperslab [offset, offset + outShape) out of the input buffer.
            //inVol/outVol hold the volume of the subtensors below the current axis.
            void sliceImpl(const(CUdeviceptr) inputptr, in size_t[] inShape, size_t inVol,
                CUdeviceptr outputptr, in size_t[] outShape, size_t outVol, in size_t[] offset)
            {
                if(inShape.length == 0)
                {
                    cuMemcpy(outputptr, inputptr, size);
                }
                else if(inShape.length == 1)
                {
                    //Innermost axis: a single contiguous copy suffices
                    cuMemcpy(outputptr, inputptr + offset[0] * size, outShape[0] * size);
                }
                else
                {
                    for(size_t i = 0; i < outShape[0]; i++)
                    {
                        sliceImpl(inputptr + (i + offset[0]) * inVol * size,
                            inShape[1 .. $],
                            inVol / inShape[1],
                            outputptr + i * outVol * size,
                            outShape[1 .. $],
                            outVol / outShape[1],
                            offset[1 .. $]);
                    }
                }
            }

            auto inShape = mOp.deps[0].outputType.shape;
            auto outShape = mOp.outputType.shape;
            size_t inVol = mOp.deps[0].outputType.volume;
            size_t outVol = mOp.outputType.volume;
            auto offset = mOp.attributes["start"].get!(size_t[]);

            if(inShape.length > 0)
            {
                inVol /= inShape[0];
                outVol /= outShape[0];
            }

            sliceImpl(inputs[0].ptr, inShape, inVol, output.ptr, outShape, outVol, offset);
        }

        Operation mOp;
    }

    class Pad : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            //Width of a single element, in bytes
            size_t size = sizeOf(mOp.outputType.elementType);

            //The inverse of sliceImpl: copy the input buffer into the hyperslab
            //[offset, offset + inShape) of the zero-filled output buffer.
            void padImpl(CUdeviceptr inputptr, size_t[] inShape, size_t inVol,
                CUdeviceptr outputptr, size_t[] outShape, size_t outVol, size_t[] offset)
            {
                if(inShape.length == 0)
                {
                    cuMemcpy(outputptr, inputptr, size);
                }
                else if(inShape.length == 1)
                {
                    cuMemcpy(outputptr + offset[0] * size, inputptr, inShape[0] * size);
                }
                else
                {
                    for(size_t i = 0; i < inShape[0]; i++)
                    {
                        padImpl(inputptr + i * inVol * size,
                            inShape[1 .. $],
                            inVol / inShape[1],
                            outputptr + (i + offset[0]) * outVol * size,
                            outShape[1 .. $],
                            outVol / outShape[1],
                            offset[1 .. $]);
                    }
                }
            }

            auto inShape = mOp.deps[0].outputType.shape;
            auto outShape = mOp.outputType.shape;
            size_t inVol = mOp.deps[0].outputType.volume;
            size_t outVol = mOp.outputType.volume;
            auto offset = mOp.attributes["before"].get!(size_t[]);

            if(inShape.length > 0)
            {
                inVol /= inShape[0];
                outVol /= outShape[0];
            }

            //Zero-fill the output so the padding regions are well defined
            cuMemsetD8(output.ptr, 0, output.numBytes);

            padImpl(inputs[0].ptr, inShape, inVol, output.ptr, outShape, outVol, offset);
        }

        Operation mOp;
    }

    class Repeat : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;

            //The kernel is compiled once and cached in a static member
            if(mRepeatKernel is null)
            {
                mRepeatKernel = new NVRTCKernel("repeatBlocks", `
                    extern "C" __global__ void repeatBlocks(const char *inbuf, size_t elemSize, size_t len, char *outbuf, size_t reps)
                    {
                        size_t i = blockDim.x * blockIdx.x + threadIdx.x;

                        if(i < elemSize * reps * len)
                        {
                            //Each of the len blocks of elemSize bytes is written reps times
                            size_t e = i % elemSize;
                            size_t l = i / (elemSize * reps);
                            outbuf[i] = inbuf[l * elemSize + e];
                        }
                    }
                `);
            }
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            //Tile the len blocks of elemSize bytes in inbuf into outbuf, reps times each
            void process(CUDABuffer inbuf, CUDABuffer outbuf, size_t elemSize, size_t len, size_t reps)
            {
                uint n = cast(uint)(elemSize * len * reps);
                uint numThreads = 512;

                //Round up so every output byte is covered by a thread
                uint numBlocks = (n + numThreads - 1) / numThreads;
                mRepeatKernel.execute(numBlocks, numThreads, inbuf.ptr, elemSize, len, outbuf.ptr, reps);
            }

            //Iterate over each axis, from smallest stride to largest stride
            size_t elemSize = sizeOf(mOp.elementType);
            auto inbuf = cast(CUDABuffer)inputs[0];
            CUDABuffer outbuf;

            foreach_reverse(i, a; mOp.attributes["repetitions"].get!(size_t[]))
            {
                elemSize *= mOp.deps[0].shape[i];

                if(a == 1)
                {
                    continue;
                }

                outbuf = CUDABuffer.create(inbuf.numBytes * a);

                process(inbuf, outbuf, elemSize, inbuf.numBytes / elemSize, a);

                elemSize *= a;

                //Free the intermediate buffer from the previous axis, but never the kernel's input
                if(inbuf.ptr != inputs[0].ptr)
                {
                    CUDABuffer.destroy(inbuf);
                }

                inbuf = outbuf;
            }

            if(outbuf is null)
            {
                //Every repetition factor was one, so no temporary buffer was ever
                //allocated and the input can be copied straight through
                cuMemcpy(output.ptr, inputs[0].ptr, output.numBytes);
            }
            else
            {
                cuMemcpy(output.ptr, outbuf.ptr, output.numBytes);
                CUDABuffer.destroy(outbuf);
            }
        }

        Operation mOp;
        static NVRTCKernel mRepeatKernel;
    }

    class Transpose : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            if(mOp.outputType.elementType == DataType.float32)
            {
                auto a = cast(float *)inputs[0].ptr;
                auto c = cast(float *)output.ptr;
                float alpha = 1;
                float beta = 0;

                auto mShape = mOp.outputType.shape;

                //cuBLAS is column-major, so compute C = alpha * op(A) + beta * op(B)
                //with op = transpose; beta is zero, so only op(A) contributes
                cublasSgeam(mCuBLASHandle, CUBLAS_OP_T, CUBLAS_OP_T, cast(int)mShape[1], cast(int)mShape[0], &alpha, a,
                    cast(int)mShape[0], &beta, a, cast(int)mShape[0], c, cast(int)mShape[1]);
            }
            else
            {
                throw new Exception("Element type not supported.");
            }
        }

        Operation mOp;
    }
}