1 module dopt.cuda.math; 2 3 import std.algorithm; 4 import std.conv; 5 import std.functional; 6 import std.math; 7 import std.range; 8 9 import dopt.cuda; 10 import dopt.cuda.nvrtc; 11 import dopt.core.ops; 12 import dopt.core.types; 13 14 import derelict.cuda; 15 16 package 17 { 18 extern(C) 19 { 20 enum 21 { 22 CUBLAS_STATUS_SUCCESS =0, 23 CUBLAS_STATUS_NOT_INITIALIZED =1, 24 CUBLAS_STATUS_ALLOC_FAILED =3, 25 CUBLAS_STATUS_INVALID_VALUE =7, 26 CUBLAS_STATUS_ARCH_MISMATCH =8, 27 CUBLAS_STATUS_MAPPING_ERROR =11, 28 CUBLAS_STATUS_EXECUTION_FAILED=13, 29 CUBLAS_STATUS_INTERNAL_ERROR =14, 30 CUBLAS_STATUS_NOT_SUPPORTED =15, 31 CUBLAS_STATUS_LICENSE_ERROR =16 32 } 33 34 alias cublasStatus_t = int; 35 36 enum 37 { 38 CUBLAS_OP_N=0, 39 CUBLAS_OP_T=1, 40 CUBLAS_OP_C=2 41 } 42 43 alias cublasOperation_t = int; 44 45 enum 46 { 47 CUBLAS_POINTER_MODE_HOST = 0, 48 CUBLAS_POINTER_MODE_DEVICE = 1 49 } 50 51 alias cublasPointerMode_t = int; 52 53 struct cublasContext; 54 alias cublasHandle_t = cublasContext *; 55 56 cublasStatus_t function(cublasHandle_t *handle) cublasCreate_v2; 57 cublasStatus_t function(cublasHandle_t handle, cublasPointerMode_t mode) cublasSetPointerMode_v2; 58 59 cublasStatus_t function(cublasHandle_t handle, int n, const float *x, int incx, const float *y, int incy, float *result) cublasSdot_v2; 60 cublasStatus_t function(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc) cublasSgemm_v2; 61 cublasStatus_t function(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const float *alpha, const float *A, int lda, const float *beta, const float *B, int ldb, float *C, int ldc) cublasSgeam; 62 } 63 64 void initialize() 65 { 66 //TODO: make a DerelictCuBLAS library 67 import core.sys.posix.dlfcn; 68 import std..string; 69 fh = dlopen("libcublas.so".toStringz, RTLD_LAZY); 70 cublasCreate_v2 = cast(typeof(cublasCreate_v2))dlsym(fh, "cublasCreate_v2"); 71 cublasSetPointerMode_v2 = cast(typeof(cublasSetPointerMode_v2))dlsym(fh, "cublasSetPointerMode_v2"); 72 cublasSdot_v2 = cast(typeof(cublasSdot_v2))dlsym(fh, "cublasSdot_v2"); 73 cublasSgemm_v2 = cast(typeof(cublasSgemm_v2))dlsym(fh, "cublasSgemm_v2"); 74 cublasSgeam = cast(typeof(cublasSgeam))dlsym(fh, "cublasSgeam"); 75 76 cublasCreate_v2(&mCuBLASHandle); 77 78 mixin(generateRegistrations()); 79 registerCUDAKernel("matmul", toDelegate(&matmulKernelCtr)); 80 } 81 82 cublasHandle_t mCuBLASHandle; 83 } 84 85 private 86 { 87 void *fh; 88 89 immutable string[] arith = ["add", "sub", "mul", "div"]; 90 immutable string[] comp = ["lt", "lte", "gt", "gte", "eq", "neq"]; 91 immutable string[] binfunc = ["max", "min", "pow"]; 92 immutable string[] unfunc = ["neg", "abs", "sgn", "exp", "log", "sqrt"]; 93 94 string generateRegistrations() 95 { 96 return chain(arith, comp, binfunc, unfunc) 97 .map!(x => "registerCUDAKernel(\"" ~ x ~ "\", toDelegate(&" ~ x ~ "KernelCtr));") 98 .joiner("\n") 99 .to!string; 100 } 101 102 //Arithmetic operations 103 alias addKernelCtr = cudaKernelCtr!("+"); 104 alias subKernelCtr = cudaKernelCtr!("-"); 105 alias mulKernelCtr = cudaKernelCtr!("*"); 106 alias divKernelCtr = cudaKernelCtr!("/"); 107 108 //Comparison operations 109 alias ltKernelCtr = cudaKernelCtr!("<"); 110 alias lteKernelCtr = cudaKernelCtr!("<="); 111 alias gtKernelCtr = cudaKernelCtr!(">"); 112 alias gteKernelCtr = cudaKernelCtr!(">="); 113 alias eqKernelCtr = cudaKernelCtr!("=="); 114 alias neqKernelCtr = cudaKernelCtr!("!="); 115 116 //Binary functions 117 alias maxKernelCtr = cudaKernelCtr!("max", 2, false); 118 alias minKernelCtr = cudaKernelCtr!("min", 2, false); 119 alias powKernelCtr = cudaKernelCtr!("pow", 2, false); 120 121 //Unary functions 122 alias negKernelCtr = cudaKernelCtr!("-", 1, false); 123 alias absKernelCtr = cudaKernelCtr!("abs", 1, false); 124 alias sgnKernelCtr = cudaKernelCtr!("sgn", 1, false); 125 alias expKernelCtr = cudaKernelCtr!("exp", 1, false); 126 alias logKernelCtr = cudaKernelCtr!("log", 1, false); 127 alias sqrtKernelCtr = cudaKernelCtr!("sqrt", 1, false); 128 129 CUDAKernel cudaKernelCtr(string opName, int deps = 2, bool infix = true)(Operation op) 130 { 131 static if(deps == 2) 132 { 133 enum code = ` 134 extern "C" __global__ void pointwiseKernel(size_t n, const T *dep1, const T *dep2, T *output) 135 { 136 size_t i0 = blockDim.x * blockIdx.x + threadIdx.x; 137 138 for(size_t i = i0; i < n; i += blockDim.x * gridDim.x) 139 { 140 output[i] = ` ~ (infix ? "dep1[i] " ~ opName ~ " dep2[i]" : opName ~ "(dep1[i], dep2[i])") ~ `; 141 } 142 } 143 `; 144 } 145 else static if(deps == 1) 146 { 147 enum code = ` 148 T __device__ sgn(T a) 149 { 150 return (T)((T(0) < a) - (a < T(0))); 151 } 152 153 extern "C" __global__ void pointwiseKernel(size_t n, const T *dep1, T *output) 154 { 155 size_t i0 = blockDim.x * blockIdx.x + threadIdx.x; 156 157 for(size_t i = i0; i < n; i += blockDim.x * gridDim.x) 158 { 159 output[i] = ` ~ opName ~ `(dep1[i]); 160 } 161 } 162 `; 163 } 164 else 165 { 166 assert(0, "Not implemented."); 167 } 168 169 return new PointwiseCUDAKernel("typedef " ~ op.outputType.elementType.cudaType() ~ " T;\n" ~ code, op); 170 } 171 172 class PointwiseCUDAKernel : CUDAKernel 173 { 174 this(string code, Operation op) 175 { 176 mKernel = mKernelCache.get(code, new NVRTCKernel("pointwiseKernel", code)); 177 mKernelCache[code] = mKernel; 178 mOp = op; 179 } 180 181 override void execute(const(CUDABuffer)[] inputs, CUDABuffer output) 182 { 183 //Args = [vol, <inputs...>, output] 184 void*[] args = new void*[inputs.length + 2]; 185 CUdeviceptr[] ptrs = new CUdeviceptr[inputs.length + 1]; 186 187 size_t n = mOp.outputType.volume; 188 args[0] = &n; 189 190 for(size_t i = 0; i < inputs.length; i++) 191 { 192 ptrs[i] = inputs[i].ptr; 193 args[i + 1] = &ptrs[i]; 194 } 195 196 ptrs[$ - 1] = output.ptr; 197 args[$ - 1] = &ptrs[$ - 1]; 198 199 uint numThreads = 512; 200 uint numBlocks = (cast(uint)n + numThreads) / numThreads; 201 mKernel.execute(numBlocks, numThreads, args); 202 } 203 204 static NVRTCKernel[string] mKernelCache; 205 NVRTCKernel mKernel; 206 Operation mOp; 207 } 208 209 CUDAKernel matmulKernelCtr(Operation op) 210 { 211 return new MatmulKernel(op); 212 } 213 214 class MatmulKernel : CUDAKernel 215 { 216 this(Operation op) 217 { 218 mOp = op; 219 ashape = mOp.deps[0].outputType.shape.map!(x => cast(int)x).array(); 220 bshape = mOp.deps[1].outputType.shape.map!(x => cast(int)x).array(); 221 cshape = mOp.outputType.shape.map!(x => cast(int)x).array(); 222 } 223 224 override void execute(const(CUDABuffer)[] inputs, CUDABuffer output) 225 { 226 if(mOp.outputType.elementType == DataType.float32) 227 { 228 auto a = cast(float *)inputs[0].ptr; 229 auto b = cast(float *)inputs[1].ptr; 230 auto c = cast(float *)output.ptr; 231 float alpha = 1; 232 float beta = 0; 233 234 cublasSgemm_v2(mCuBLASHandle, CUBLAS_OP_N, CUBLAS_OP_N, bshape[1], ashape[0], ashape[1], &alpha, b, 235 bshape[1], a, ashape[1], &beta, c, cshape[1]); 236 } 237 else 238 { 239 throw new Exception("Element type not supported."); 240 } 241 } 242 243 Operation mOp; 244 int[] ashape; 245 int[] bshape; 246 int[] cshape; 247 } 248 }