module dopt.cuda.basic;

import std.functional;

import dopt.cuda;
import dopt.cuda.math;
import dopt.cuda.nvrtc;
import dopt.core.ops;
import dopt.core.types;

import derelict.cuda;

package
{
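    ///Registers the CUDA kernels implemented in this module: slice, pad, repeat, and transpose.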
    void initialize()
    {
        registerCUDAKernel("slice", toDelegate(&cudaKernelCtr!Slice));
        registerCUDAKernel("pad", toDelegate(&cudaKernelCtr!Pad));
        registerCUDAKernel("repeat", toDelegate(&cudaKernelCtr!Repeat));
        registerCUDAKernel("transpose", toDelegate(&cudaKernelCtr!Transpose));
    }
}

private
{
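    ///Constructs a CUDA kernel of type K for the given operation.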
    CUDAKernel cudaKernelCtr(K)(Operation op)
    {
        return new K(op);
    }

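    ///Copies the sub-region selected by the operation's "start" attribute from the input buffer into the output buffer.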
    class Slice : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            //Each element is assumed to be four bytes wide
            size_t size = 4;

            //Recursively walks the leading dimensions and issues one cuMemcpy per contiguous innermost run
            void sliceImpl(const(CUdeviceptr) inputptr, in size_t[] inShape, size_t inVol,
                        CUdeviceptr outputptr, in size_t[] outShape, size_t outVol, in size_t[] offset)
            {
                if(inShape.length == 0)
                {
                    cuMemcpy(outputptr, inputptr, size);
                }
                else if(inShape.length == 1)
                {
                    cuMemcpy(outputptr, inputptr + offset[0] * size, outShape[0] * size);
                }
                else
                {
                    for(size_t i = 0; i < outShape[0]; i++)
                    {
                        sliceImpl(inputptr + (i + offset[0]) * inVol * size,
                                    inShape[1 .. $],
                                    inVol / inShape[1],
                                    outputptr + i * outVol * size,
                                    outShape[1 .. $],
                                    outVol / outShape[1],
                                    offset[1 .. $]);
                    }
                }
            }

            auto inShape = mOp.deps[0].outputType.shape;
            auto outShape = mOp.outputType.shape;
            size_t inVol = mOp.deps[0].outputType.volume;
            size_t outVol = mOp.outputType.volume;
            auto offset = mOp.attributes["start"].get!(size_t[]);

            //Convert the volumes into the strides of the leading dimension
            if(inShape.length > 0)
            {
                inVol /= inShape[0];
                outVol /= outShape[0];
            }

            sliceImpl(inputs[0].ptr, inShape, inVol, output.ptr, outShape, outVol, offset);
        }

        Operation mOp;
    }

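    ///Zero-fills the output buffer, then copies the input into the region starting at the offsets given by the operation's "before" attribute.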
    class Pad : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            //Each element is assumed to be four bytes wide
            size_t size = 4;

            //Recursively walks the leading dimensions and copies each contiguous innermost run of the
            //input into its offset position in the output
            void padImpl(CUdeviceptr inputptr, size_t[] inShape, size_t inVol,
                        CUdeviceptr outputptr, size_t[] outShape, size_t outVol, size_t[] offset)
            {
                if(inShape.length == 0)
                {
                    cuMemcpy(outputptr, inputptr, size);
                }
                else if(inShape.length == 1)
                {
                    cuMemcpy(outputptr + offset[0] * size, inputptr, inShape[0] * size);
                }
                else
                {
                    for(size_t i = 0; i < inShape[0]; i++)
                    {
                        padImpl(inputptr + i * inVol * size,
                                    inShape[1 .. $],
                                    inVol / inShape[1],
                                    outputptr + (i + offset[0]) * outVol * size,
                                    outShape[1 .. $],
                                    outVol / outShape[1],
                                    offset[1 .. $]);
                    }
                }
            }

            auto inShape = mOp.deps[0].outputType.shape;
            auto outShape = mOp.outputType.shape;
            size_t inVol = mOp.deps[0].outputType.volume;
            size_t outVol = mOp.outputType.volume;
            auto offset = mOp.attributes["before"].get!(size_t[]);

            if(inShape.length > 0)
            {
                inVol /= inShape[0];
                outVol /= outShape[0];
            }

            //Zero the whole output first; the padding regions are never written by padImpl
            cuMemsetD8(output.ptr, 0, output.numBytes);

            padImpl(inputs[0].ptr, inShape, inVol, output.ptr, outShape, outVol, offset);
        }

        Operation mOp;
    }

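    ///Tiles the input along each axis according to the operation's "repetitions" attribute, using an NVRTC-compiled kernel to repeat contiguous blocks.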
    class Repeat : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;

            if(mRepeatKernel is null)
            {
                mRepeatKernel = new NVRTCKernel("repeatBlocks", `
extern "C" __global__ void repeatBlocks(const char *inbuf, size_t elemSize, size_t len, char *outbuf, size_t reps)
{
    size_t i = blockDim.x * blockIdx.x + threadIdx.x;

    if(i < elemSize * reps * len)
    {
        size_t e = i % elemSize;
        size_t l = i / (elemSize * reps);
        outbuf[i] = inbuf[l * elemSize + e];
    }
}
                `);
            }
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            //Nothing to repeat, so just copy the input straight to the output
            if(inputs[0].numBytes == output.numBytes)
            {
                cuMemcpy(output.ptr, inputs[0].ptr, output.numBytes);
                return;
            }

            void process(CUDABuffer inbuf, CUDABuffer outbuf, size_t elemSize, size_t len, size_t reps)
            {
                uint n = cast(uint)(elemSize * len * reps);
                uint numThreads = 512;

                //Round the block count up so every output byte is covered
                uint numBlocks = (n + numThreads - 1) / numThreads;
                mRepeatKernel.execute(numBlocks, numThreads, inbuf.ptr, elemSize, len, outbuf.ptr, reps);
            }

            //Iterate over each axis, from smallest stride to largest stride
            size_t elemSize = sizeOf(mOp.elementType);
            auto inbuf = cast(CUDABuffer)inputs[0];
            CUDABuffer outbuf;

            foreach_reverse(i, a; mOp.attributes["repetitions"].get!(size_t[]))
            {
                elemSize *= mOp.deps[0].shape[i];

                if(a == 1)
                {
                    continue;
                }

                outbuf = CUDABuffer.create(inbuf.numBytes * a);

                process(inbuf, outbuf, elemSize, inbuf.numBytes / elemSize, a);

                elemSize *= a;

                //Free the intermediate buffer from the previous axis, but never the kernel's own input
                if(inbuf.ptr != inputs[0].ptr)
                {
                    CUDABuffer.destroy(inbuf);
                }

                inbuf = outbuf;
            }

            cuMemcpy(output.ptr, outbuf.ptr, output.numBytes);
            CUDABuffer.destroy(outbuf);
        }

        Operation mOp;
        static NVRTCKernel mRepeatKernel;
    }

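    ///Transposes a rank-2 float32 tensor using cuBLAS (cublasSgeam).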
    class Transpose : CUDAKernel
    {
        this(Operation op)
        {
            mOp = op;
        }

        override void execute(const(CUDABuffer)[] inputs, CUDABuffer output)
        {
            if(mOp.outputType.elementType == DataType.float32)
            {
                auto a = cast(float *)inputs[0].ptr;
                auto c = cast(float *)output.ptr;
                float alpha = 1;
                float beta = 0;

                auto mShape = mOp.outputType.shape;

                //With beta == 0, geam computes C = alpha * op(A), i.e. the transpose of the input matrix
                cublasSgeam(mCuBLASHandle, CUBLAS_OP_T, CUBLAS_OP_T, cast(int)mShape[1], cast(int)mShape[0], &alpha, a,
                    cast(int)mShape[0], &beta, a, cast(int)mShape[0], c, cast(int)mShape[1]);
            }
            else
            {
                throw new Exception("Element type not supported.");
            }
        }

        Operation mOp;
    }
}