1 module dopt.core.cuda.basic;
2 
3 import std.functional;
4 
5 import dopt.core.cuda;
6 import dopt.core.cuda.math;
7 import dopt.core.cuda.nvrtc;
8 import dopt.core.ops;
9 import dopt.core.types;
10 
11 import derelict.cuda;
12 
package
{
    ///Registers the CUDA kernel constructors for the basic tensor operations.
    void initialize()
    {
        alias register = registerCUDAKernel;

        register("slice", toDelegate(&cudaKernelCtr!Slice));
        register("pad", toDelegate(&cudaKernelCtr!Pad));
        register("repeat", toDelegate(&cudaKernelCtr!Repeat));
        register("transpose", toDelegate(&cudaKernelCtr!Transpose));
    }
}
23 
24 private
25 {
26     CUDAKernel cudaKernelCtr(K)(Operation op)
27     {
28         return new K(op);
29     }
30 
31     class Slice : CUDAKernel
32     {
33         this(Operation op)
34         {
35             mOp = op;
36         }
37 
38         override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
39         {
40             size_t size = 4;
41 
42             void sliceImpl(const(CUdeviceptr) inputptr, in size_t[] inShape, size_t inVol,
43                         CUdeviceptr outputptr, in size_t[] outShape, size_t outVol, in size_t[] offset)
44             {
45                 if(inShape.length == 0)
46                 {
47                     cuMemcpy(outputptr, inputptr, size);
48                 }
49                 else if(inShape.length == 1)
50                 {
51                     cuMemcpy(outputptr, inputptr + offset[0] * size, outShape[0] * size);
52                 }
53                 else
54                 {
55                     for(size_t i = 0; i < outShape[0]; i++)
56                     {
57                         sliceImpl(inputptr + (i + offset[0]) * inVol * size,
58                                     inShape[1 .. $],
59                                     inVol / inShape[1],
60                                     outputptr + i * outVol * size,
61                                     outShape[1 .. $],
62                                     outVol / outShape[1],
63                                     offset[1 .. $]);
64                     }
65                 }
66             }
67 
68             auto inShape = mOp.deps[0].outputType.shape;
69             auto outShape = mOp.outputType.shape;
70             size_t inVol = mOp.deps[0].outputType.volume;
71             size_t outVol = mOp.outputType.volume;
72             auto offset = mOp.attributes["start"].get!(size_t[]);
73 
74             if(inShape.length > 0)
75             {
76                 inVol /= inShape[0];
77                 outVol /= outShape[0];
78             }
79 
80             sliceImpl(inputs[0].ptr, inShape, inVol, output.ptr, outShape, outVol, offset);
81         }
82 
83         Operation mOp;
84     }
85 
86     class Pad : CUDAKernel
87     {
88         this(Operation op)
89         {
90             mOp = op;
91         }
92 
93         void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
94         {
95             size_t size = 4;
96 
97             void padImpl(CUdeviceptr inputptr, size_t[] inShape, size_t inVol,
98                         CUdeviceptr outputptr, size_t[] outShape, size_t outVol, size_t[] offset)
99             {
100                 if(inShape.length == 0)
101                 {
102                     cuMemcpy(outputptr, inputptr, size);
103                 }
104                 else if(inShape.length == 1)
105                 {
106                     cuMemcpy(outputptr + offset[0] * size, inputptr, inShape[0] * size);
107                 }
108                 else
109                 {
110                     for(size_t i = 0; i < inShape[0]; i++)
111                     {
112                         padImpl(inputptr + i * inVol * size,
113                                     inShape[1 .. $],
114                                     inVol / inShape[1],
115                                     outputptr + (i + offset[0]) * outVol * size,
116                                     outShape[1 .. $],
117                                     outVol / outShape[1],
118                                     offset[1 .. $]);
119                     }
120                 }
121             }
122 
123             auto inShape = mOp.deps[0].outputType.shape;
124             auto outShape = mOp.outputType.shape;
125             size_t inVol = mOp.deps[0].outputType.volume;
126             size_t outVol = mOp.outputType.volume;
127             auto offset = mOp.attributes["before"].get!(size_t[]);
128 
129             if(inShape.length > 0)
130             {
131                 inVol /= inShape[0];
132                 outVol /= outShape[0];
133             }
134 
135             cuMemsetD8(output.ptr, 0, output.numBytes);
136 
137             padImpl(inputs[0].ptr, inShape, inVol, output.ptr, outShape, outVol, offset);
138         }
139 
140         Operation mOp;
141     }
142 
143     class Repeat : CUDAKernel
144     {
145         this(Operation op)
146         {
147             mOp = op;
148 
149             if(mRepeatKernel is null)
150             {
151                 mRepeatKernel = new NVRTCKernel("repeatBlocks", `
152 extern "C" __global__ void repeatBlocks(const char *inbuf, size_t elemSize, size_t len, char *outbuf, size_t reps)
153 {
154     size_t i = blockDim.x * blockIdx.x + threadIdx.x;
155 
156     if(i < elemSize * reps * len)
157     {
158         size_t e = i % elemSize;
159         size_t l = i / (elemSize * reps);
160         outbuf[i] = inbuf[l * elemSize + e];
161     }
162 }
163                 `);
164             }
165         }
166 
167         override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
168         {
169             void process(CUDABuffer inbuf, CUDABuffer outbuf, size_t elemSize, size_t len, size_t reps)
170             {
171                 uint n = cast(uint)(elemSize * len * reps);
172                 uint numThreads = 512;
173 			    uint numBlocks = (cast(uint)n + numThreads) / numThreads;
174                 mRepeatKernel.execute(numBlocks, numThreads, inbuf.ptr, elemSize, len, outbuf.ptr, reps);
175             }
176 
177             //Iterate over each axis, from smallest stride to largest stride
178             size_t elemSize = sizeOf(mOp.elementType);
179             auto inbuf = cast(CUDABuffer)inputs[0];
180             CUDABuffer outbuf;
181 
182             foreach_reverse(i, a; mOp.attributes["repetitions"].get!(size_t[]))
183             {
184                 elemSize *= mOp.deps[0].shape[i];
185 
186                 if(a == 1)
187                 {
188                     continue;
189                 }
190                 
191                 outbuf = CUDABuffer.create(inbuf.numBytes * a);
192                 
193                 process(inbuf, outbuf, elemSize, inbuf.numBytes / elemSize, a);
194 
195                 elemSize *= a;
196 
197                 if(inbuf.ptr != inputs[0].ptr)
198                 {
199                     CUDABuffer.destroy(inbuf);
200                 }
201 
202                 inbuf = outbuf;
203             }
204 
205             cuMemcpy(output.ptr, outbuf.ptr, output.numBytes);
206             CUDABuffer.destroy(outbuf);
207         }
208 
209         Operation mOp;
210         static NVRTCKernel mRepeatKernel;
211     }
212 
213     class Transpose : CUDAKernel
214     {
215         this(Operation op)
216         {
217             mOp = op;
218         }
219 
220         void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
221         {
222             if(mOp.outputType.elementType == DataType.float32)
223             {
224                 auto a = cast(float *)inputs[0].ptr;
225 				auto c = cast(float *)output.ptr;
226 				float alpha = 1;
227 				float beta = 0;
228 
229                 auto mShape = mOp.outputType.shape;
230 
231 				cublasSgeam(mCuBLASHandle, CUBLAS_OP_T, CUBLAS_OP_T, cast(int)mShape[1], cast(int)mShape[0], &alpha, a,
232                     cast(int)mShape[0], &beta, a, cast(int)mShape[0], c, cast(int)mShape[1]);
233 			}
234             else
235             {
236                 throw new Exception("Element type not supported.");
237             }
238         }
239 
240         Operation mOp;
241     }
242 }