module dopt.cuda.basic;

import std.functional;

import dopt.cuda;
import dopt.cuda.math;
import dopt.cuda.nvrtc;
import dopt.core.ops;
import dopt.core.types;

import derelict.cuda;

package
{
    void initialize()
    {
        registerCUDAKernel("slice", toDelegate(&cudaKernelCtr!Slice));
        registerCUDAKernel("pad", toDelegate(&cudaKernelCtr!Pad));
        registerCUDAKernel("repeat", toDelegate(&cudaKernelCtr!Repeat));
        registerCUDAKernel("transpose", toDelegate(&cudaKernelCtr!Transpose));
    }
}

private
{
    CUDAKernel cudaKernelCtr(K)(Operation op)
    {
        return new K(op);
    }

    /**
        Copies a rectangular region of the input tensor into the output, where the
        "start" attribute gives the offset of the region along each dimension.
    */
    class Slice : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            //Each element is assumed to occupy four bytes (e.g. float32 or int32)
            size_t size = 4;

            //Recurse over the outer dimensions, issuing a contiguous copy once the
            //innermost dimension is reached
            void sliceImpl(const(CUdeviceptr) inputptr, in size_t[] inShape, size_t inVol,
                CUdeviceptr outputptr, in size_t[] outShape, size_t outVol, in size_t[] offset)
            {
                if(inShape.length == 0)
                {
                    cuMemcpy(outputptr, inputptr, size);
                }
                else if(inShape.length == 1)
                {
                    cuMemcpy(outputptr, inputptr + offset[0] * size, outShape[0] * size);
                }
                else
                {
                    for(size_t i = 0; i < outShape[0]; i++)
                    {
                        sliceImpl(inputptr + (i + offset[0]) * inVol * size,
                            inShape[1 .. $],
                            inVol / inShape[1],
                            outputptr + i * outVol * size,
                            outShape[1 .. $],
                            outVol / outShape[1],
                            offset[1 .. $]);
                    }
                }
            }

            auto inShape = mOp.deps[0].outputType.shape;
            auto outShape = mOp.outputType.shape;
            size_t inVol = mOp.deps[0].outputType.volume;
            size_t outVol = mOp.outputType.volume;
            auto offset = mOp.attributes["start"].get!(size_t[]);

            //Reduce the volumes to the strides of the outermost dimension
            if(inShape.length > 0)
            {
                inVol /= inShape[0];
                outVol /= outShape[0];
            }

            sliceImpl(inputs[0].ptr, inShape, inVol, output.ptr, outShape, outVol, offset);
        }

        Operation mOp;
    }

    /**
        Writes the input tensor into a zero-initialised output, where the "before"
        attribute gives the amount of padding at the start of each dimension.
    */
    class Pad : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            //Each element is assumed to occupy four bytes (e.g. float32 or int32)
            size_t size = 4;

            //The mirror image of sliceImpl: recurse over the outer dimensions and
            //copy contiguous runs into the offset region of the output
            void padImpl(CUdeviceptr inputptr, size_t[] inShape, size_t inVol,
                CUdeviceptr outputptr, size_t[] outShape, size_t outVol, size_t[] offset)
            {
                if(inShape.length == 0)
                {
                    cuMemcpy(outputptr, inputptr, size);
                }
                else if(inShape.length == 1)
                {
                    cuMemcpy(outputptr + offset[0] * size, inputptr, inShape[0] * size);
                }
                else
                {
                    for(size_t i = 0; i < inShape[0]; i++)
                    {
                        padImpl(inputptr + i * inVol * size,
                            inShape[1 .. $],
                            inVol / inShape[1],
                            outputptr + (i + offset[0]) * outVol * size,
                            outShape[1 .. $],
                            outVol / outShape[1],
                            offset[1 .. $]);
                    }
                }
            }

            auto inShape = mOp.deps[0].outputType.shape;
            auto outShape = mOp.outputType.shape;
            size_t inVol = mOp.deps[0].outputType.volume;
            size_t outVol = mOp.outputType.volume;
            auto offset = mOp.attributes["before"].get!(size_t[]);

            if(inShape.length > 0)
            {
                inVol /= inShape[0];
                outVol /= outShape[0];
            }

            //Zero the whole output first, then copy the input into the padded region
            cuMemsetD8(output.ptr, 0, output.numBytes);

            padImpl(inputs[0].ptr, inShape, inVol, output.ptr, outShape, outVol, offset);
        }

        Operation mOp;
    }

    /**
        Tiles the input along each axis according to the "repetitions" attribute,
        using an NVRTC-compiled kernel that repeats contiguous blocks of bytes.
    */
    class Repeat : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;

            if(mRepeatKernel is null)
            {
                mRepeatKernel = new NVRTCKernel("repeatBlocks", `
                    extern "C" __global__ void repeatBlocks(const char *inbuf, size_t elemSize, size_t len, char *outbuf, size_t reps)
                    {
                        size_t i = blockDim.x * blockIdx.x + threadIdx.x;

                        if(i < elemSize * reps * len)
                        {
                            size_t e = i % elemSize;
                            size_t l = i / (elemSize * reps);
                            outbuf[i] = inbuf[l * elemSize + e];
                        }
                    }
                `);
            }
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            //If there is nothing to repeat, a straight copy suffices
            if(inputs[0].numBytes == output.numBytes)
            {
                cuMemcpy(output.ptr, inputs[0].ptr, output.numBytes);
                return;
            }

            //Treats inbuf as len blocks of elemSize bytes, and writes each block
            //into outbuf reps times consecutively
            void process(CUDABuffer inbuf, CUDABuffer outbuf, size_t elemSize, size_t len, size_t reps)
            {
                uint n = cast(uint)(elemSize * len * reps);
                uint numThreads = 512;
                uint numBlocks = (n + numThreads - 1) / numThreads;
                mRepeatKernel.execute(numBlocks, numThreads, inbuf.ptr, elemSize, len, outbuf.ptr, reps);
            }

            //Iterate over each axis, from smallest stride to largest stride
            size_t elemSize = sizeOf(mOp.elementType);
            auto inbuf = cast(CUDABuffer)inputs[0];
            CUDABuffer outbuf;

            foreach_reverse(i, a; mOp.attributes["repetitions"].get!(size_t[]))
            {
                elemSize *= mOp.deps[0].shape[i];

                if(a == 1)
                {
                    continue;
                }

                outbuf = CUDABuffer.create(inbuf.numBytes * a);

                process(inbuf, outbuf, elemSize, inbuf.numBytes / elemSize, a);

                //The repeated blocks are now part of the contiguous unit tiled by
                //the next (outer) axis
                elemSize *= a;

                //Free intermediate buffers, but never the kernel's input
                if(inbuf.ptr != inputs[0].ptr)
                {
                    CUDABuffer.destroy(inbuf);
                }

                inbuf = outbuf;
            }

            cuMemcpy(output.ptr, outbuf.ptr, output.numBytes);
            CUDABuffer.destroy(outbuf);
        }

        Operation mOp;
        static NVRTCKernel mRepeatKernel;
    }

    /**
        Transposes a matrix using the cuBLAS geam extension.
    */
    class Transpose : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            if(mOp.outputType.elementType == DataType.float32)
            {
                auto a = cast(float *)inputs[0].ptr;
                auto c = cast(float *)output.ptr;
                float alpha = 1;
                float beta = 0;

                auto mShape = mOp.outputType.shape;

                //Computes c = alpha * a^T; beta is zero, so the second operand
                //contributes nothing
                cublasSgeam(mCuBLASHandle, CUBLAS_OP_T, CUBLAS_OP_T, cast(int)mShape[1], cast(int)mShape[0], &alpha, a,
                    cast(int)mShape[0], &beta, a, cast(int)mShape[0], c, cast(int)mShape[1]);
            }
            else
            {
                throw new Exception("Element type not supported.");
            }
        }

        Operation mOp;
    }
}