module dopt.core.cuda.math;

import std.algorithm;
import std.conv;
import std.functional;
import std.math;
import std.range;

import dopt.core.cuda;
import dopt.core.cuda.nvrtc;
import dopt.core.ops;
import dopt.core.types;

import derelict.cuda;

package
{
    extern(C)
    {
        enum
        {
            CUBLAS_STATUS_SUCCESS          = 0,
            CUBLAS_STATUS_NOT_INITIALIZED  = 1,
            CUBLAS_STATUS_ALLOC_FAILED     = 3,
            CUBLAS_STATUS_INVALID_VALUE    = 7,
            CUBLAS_STATUS_ARCH_MISMATCH    = 8,
            CUBLAS_STATUS_MAPPING_ERROR    = 11,
            CUBLAS_STATUS_EXECUTION_FAILED = 13,
            CUBLAS_STATUS_INTERNAL_ERROR   = 14,
            CUBLAS_STATUS_NOT_SUPPORTED    = 15,
            CUBLAS_STATUS_LICENSE_ERROR    = 16
        }

        alias cublasStatus_t = int;

        enum
        {
            CUBLAS_OP_N = 0,
            CUBLAS_OP_T = 1,
            CUBLAS_OP_C = 2
        }

        alias cublasOperation_t = int;

        enum
        {
            CUBLAS_POINTER_MODE_HOST = 0,
            CUBLAS_POINTER_MODE_DEVICE = 1
        }

        alias cublasPointerMode_t = int;

        struct cublasContext;
        alias cublasHandle_t = cublasContext*;

        cublasStatus_t function(cublasHandle_t *handle) cublasCreate_v2;
        cublasStatus_t function(cublasHandle_t handle, cublasPointerMode_t mode) cublasSetPointerMode_v2;

        cublasStatus_t function(cublasHandle_t handle, int n, const float *x, int incx, const float *y, int incy, float *result) cublasSdot_v2;
        cublasStatus_t function(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc) cublasSgemm_v2;
        cublasStatus_t function(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const float *alpha, const float *A, int lda, const float *beta, const float *B, int ldb, float *C, int ldc) cublasSgeam;
    }

    void initialize()
    {
        //TODO: make a DerelictCuBLAS library
        import core.sys.posix.dlfcn;
        import std.string;

        fh = dlopen("libcublas.so".toStringz, RTLD_LAZY);
        cublasCreate_v2 = cast(typeof(cublasCreate_v2))dlsym(fh, "cublasCreate_v2");
        cublasSetPointerMode_v2 = cast(typeof(cublasSetPointerMode_v2))dlsym(fh, "cublasSetPointerMode_v2");
        cublasSdot_v2 = cast(typeof(cublasSdot_v2))dlsym(fh, "cublasSdot_v2");
        cublasSgemm_v2 = cast(typeof(cublasSgemm_v2))dlsym(fh, "cublasSgemm_v2");
        cublasSgeam = cast(typeof(cublasSgeam))dlsym(fh, "cublasSgeam");

        cublasCreate_v2(&mCuBLASHandle);

        mixin(generateRegistrations());
        registerCUDAKernel("matmul", toDelegate(&matmulKernelCtr));
        registerCUDAKernel("sum", toDelegate(&sumKernelCtr));
        registerCUDAKernel("maxElement", toDelegate(&maxElementKernelCtr));
        registerCUDAKernel("argmin", toDelegate(&argminKernelCtr));
    }

    cublasHandle_t mCuBLASHandle;
}

private
{
    void *fh;

    immutable string[] arith = ["add", "sub", "mul", "div"];
    immutable string[] comp = ["lt", "lte", "gt", "gte", "eq", "neq"];
    immutable string[] binfunc = ["max", "min", "pow"];
    immutable string[] unfunc = ["neg", "abs", "sgn", "exp", "log", "sqrt"];

    string generateRegistrations()
    {
        return chain(arith, comp, binfunc, unfunc)
              .map!(x => "registerCUDAKernel(\"" ~ x ~ "\", toDelegate(&" ~ x ~ "KernelCtr));")
              .joiner("\n")
              .to!string;
    }

    //Arithmetic operations
    alias addKernelCtr = cudaKernelCtr!("+");
    alias subKernelCtr = cudaKernelCtr!("-");
    alias mulKernelCtr = cudaKernelCtr!("*");
    alias divKernelCtr = cudaKernelCtr!("/");

    //Comparison operations
    alias ltKernelCtr = cudaKernelCtr!("<");
    alias lteKernelCtr = cudaKernelCtr!("<=");
    alias gtKernelCtr = cudaKernelCtr!(">");
    alias gteKernelCtr = cudaKernelCtr!(">=");
    alias eqKernelCtr = cudaKernelCtr!("==");
    alias neqKernelCtr = cudaKernelCtr!("!=");

    //Binary functions
    alias maxKernelCtr = cudaKernelCtr!("max", 2, false);
    alias minKernelCtr = cudaKernelCtr!("min", 2, false);
    alias powKernelCtr = cudaKernelCtr!("pow", 2, false);

    //Unary functions
    alias negKernelCtr = cudaKernelCtr!("-", 1, false);
    alias absKernelCtr = cudaKernelCtr!("abs", 1, false);
    alias sgnKernelCtr = cudaKernelCtr!("sgn", 1, false);
    alias expKernelCtr = cudaKernelCtr!("exp", 1, false);
    alias logKernelCtr = cudaKernelCtr!("log", 1, false);
    alias sqrtKernelCtr = cudaKernelCtr!("sqrt", 1, false);

    CUDAKernel cudaKernelCtr(string opName, int deps = 2, bool infix = true)(Operation op)
    {
        static if(deps == 2)
        {
            enum code = `
                //TODO: change this to iterate over elements with a stride to fully exploit SIMD units
                extern "C" __global__ void pointwiseKernel(size_t n, const T *dep1, const T *dep2, T *output)
                {
                    size_t i = blockDim.x * blockIdx.x + threadIdx.x;

                    if(i < n)
                    {
                        output[i] = ` ~ (infix ? "dep1[i] " ~ opName ~ " dep2[i]" : opName ~ "(dep1[i], dep2[i])") ~ `;
                    }
                }
            `;
        }
        else static if(deps == 1)
        {
            enum code = `
                T __device__ sgn(T a)
                {
                    return (T)((T(0) < a) - (a < T(0)));
                }

                //TODO: change this to iterate over elements with a stride to fully exploit SIMD units
                extern "C" __global__ void pointwiseKernel(size_t n, const T *dep1, T *output)
                {
                    size_t i = blockDim.x * blockIdx.x + threadIdx.x;

                    if(i < n)
                    {
                        output[i] = ` ~ opName ~ `(dep1[i]);
                    }
                }
            `;
        }
        else
        {
            assert(0, "Not implemented.");
        }

        return new PointwiseCUDAKernel("typedef " ~ op.outputType.elementType.cudaType() ~ " T;\n" ~ code, op);
    }

    class PointwiseCUDAKernel : CUDAKernel
    {
        this(string code, Operation op)
        {
            //Cache compiled kernels by their source string, so identical pointwise operations share a kernel
            mKernel = mKernelCache.get(code, new NVRTCKernel("pointwiseKernel", code));
            mKernelCache[code] = mKernel;
            mOp = op;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            //Args = [vol, <inputs...>, output]
            void*[] args = new void*[inputs.length + 2];
            CUdeviceptr[] ptrs = new CUdeviceptr[inputs.length + 1];

            size_t n = mOp.outputType.volume;
            args[0] = &n;

            for(size_t i = 0; i < inputs.length; i++)
            {
                ptrs[i] = inputs[i].ptr;
                args[i + 1] = &ptrs[i];
            }

            ptrs[$ - 1] = output.ptr;
            args[$ - 1] = &ptrs[$ - 1];

            //Round the block count up so that every element is covered by a thread
            uint numThreads = 512;
            uint numBlocks = (cast(uint)n + numThreads - 1) / numThreads;
            mKernel.execute(numBlocks, numThreads, args);
        }

        static NVRTCKernel[string] mKernelCache;
        NVRTCKernel mKernel;
        Operation mOp;
    }

    CUDAKernel matmulKernelCtr(Operation op)
    {
        return new MatmulKernel(op);
    }

    class MatmulKernel : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;
            ashape = mOp.deps[0].outputType.shape.map!(x => cast(int)x).array();
            bshape = mOp.deps[1].outputType.shape.map!(x => cast(int)x).array();
            cshape = mOp.outputType.shape.map!(x => cast(int)x).array();
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            if(mOp.outputType.elementType == DataType.float32)
            {
                auto a = cast(float *)inputs[0].ptr;
                auto b = cast(float *)inputs[1].ptr;
                auto c = cast(float *)output.ptr;
                float alpha = 1;
                float beta = 0;

                //cuBLAS expects column-major matrices, so compute C^T = B^T * A^T, which is
                //equivalent to C = A * B for these row-major buffers
                cublasSgemm_v2(mCuBLASHandle, CUBLAS_OP_N, CUBLAS_OP_N, bshape[1], ashape[0], ashape[1], &alpha, b,
                    bshape[1], a, ashape[1], &beta, c, cshape[1]);
            }
            else
            {
                throw new Exception("Element type not supported.");
            }
        }

        Operation mOp;
        int[] ashape;
        int[] bshape;
        int[] cshape;
    }

    CUDAKernel sumKernelCtr(Operation op)
    {
        return new SumKernel(op);
    }

    //There is no GPU implementation of sum yet: copy the input to the host, evaluate with the
    //CPU backend, and copy the result back to the device
    class SumKernel : CUDAKernel
    {
        this(Operation op)
        {
            mInput = variable(TensorType(op.deps[0].elementType, op.deps[0].shape));
            mOp = sum(mInput, op.attributes["axes"].get!(size_t[]));
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            import dopt.core.cpu : evaluateCPU;

            auto inbuf = new byte[inputs[0].numBytes];
            inputs[0].get(inbuf);

            auto outbuf = evaluateCPU([mOp], [mInput: Buffer(inbuf)])[0];

            output.set(outbuf.as!byte);
        }

        Operation mInput;
        Operation mOp;
    }

    CUDAKernel maxElementKernelCtr(Operation op)
    {
        return new MaxElementKernel(op);
    }

    //Like SumKernel, maxElement falls back to the CPU backend
    class MaxElementKernel : CUDAKernel
    {
        this(Operation op)
        {
            mInput = variable(TensorType(op.deps[0].elementType, op.deps[0].shape));
            mOp = maxElement(mInput, op.attributes["axes"].get!(size_t[]));
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            import dopt.core.cpu : evaluateCPU;

            auto inbuf = new byte[inputs[0].numBytes];
            inputs[0].get(inbuf);

            auto outbuf = evaluateCPU([mOp], [mInput: Buffer(inbuf)])[0];

            output.set(outbuf.as!byte);
        }

        Operation mInput;
        Operation mOp;
    }

    CUDAKernel argminKernelCtr(Operation op)
    {
        return new ArgminKernel(op);
    }

    //Like SumKernel, argmin falls back to the CPU backend
    class ArgminKernel : CUDAKernel
    {
        this(Operation op)
        {
            mInput = variable(TensorType(op.deps[0].elementType, op.deps[0].shape));
            mOp = argmin(mInput, op.attributes["axis"].get!size_t);
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            import dopt.core.cpu : evaluateCPU;

            auto inbuf = new byte[inputs[0].numBytes];
            inputs[0].get(inbuf);

            auto outbuf = evaluateCPU([mOp], [mInput: Buffer(inbuf)])[0];

            output.set(outbuf.as!byte);
        }

        Operation mInput;
        Operation mOp;
    }
}