1 module dopt.core.cpu.basic;
2 
3 import dopt.core;
4 
5 package
6 {
7     void initialize()
8     {
9         import std.functional : toDelegate;
10 
11         registerCPUKernel("slice", new CPUKernelDelegate(toDelegate(&slice)));
12         registerCPUKernel("pad", new CPUKernelDelegate(toDelegate(&pad)));
13         registerCPUKernel("transpose", new CPUKernelDelegate(toDelegate(&transpose)));
14         registerCPUKernel("repeat", new CPUKernelDelegate(toDelegate(&repeat)));
15     }
16 }
17 
18 private
19 {
20     void slice(Operation op, const(Buffer)[] inputs, Buffer output)
21     {
22         size_t size = 4;
23 
24         void sliceImpl(const(ubyte)[] input, in size_t[] inShape, size_t inVol,
25                        ubyte[] output, in size_t[] outShape, size_t outVol, in size_t[] offset)
26         {
27             if(inShape.length == 0)
28             {
29                 output[] = input[];
30             }
31             else if(inShape.length == 1)
32             {
33                 output[] = input[offset[0] * size .. (offset[0] + outShape[0]) * size];
34             }
35             else
36             {
37                 for(size_t i = 0; i < outShape[0]; i++)
38                 {
39                     sliceImpl(input[(i + offset[0]) * inVol * size.. (i + offset[0] + 1) * inVol * size],
40                                 inShape[1 .. $],
41                                 inVol / inShape[1],
42                                 output[i * outVol * size .. (i + 1) * outVol * size],
43                                 outShape[1 .. $],
44                                 outVol / outShape[1],
45                                 offset[1 .. $]);
46                 }
47             }
48         }
49 
50         auto inShape = op.deps[0].outputType.shape;
51         auto outShape = op.outputType.shape;
52         size_t inVol = op.deps[0].outputType.volume;
53         size_t outVol = op.outputType.volume;
54         auto offset = op.attributes["start"].get!(size_t[]);
55 
56         if(inShape.length > 0)
57         {
58             inVol /= inShape[0];
59             outVol /= outShape[0];
60         }
61 
62         sliceImpl(inputs[0].as!ubyte, inShape, inVol, output.as!ubyte, outShape, outVol, offset);
63     }
64 
65     void pad(Operation op, const(Buffer)[] inputs, Buffer output)
66     {
67         size_t size = 4;
68 
69         void padImpl(const(ubyte[]) input, size_t[] inShape, size_t inVol,
70                      ubyte[] output, size_t[] outShape, size_t outVol, size_t[] offset)
71         {
72             if(inShape.length == 0)
73             {
74                 output[] = input[];
75             }
76             else if(inShape.length == 1)
77             {
78                 output[0 .. offset[0] * size] = 0;
79                 output[offset[0] * size .. (offset[0] + inShape[0]) * size] = input[];
80                 output[(offset[0] + inShape[0]) * size .. $] = 0;
81             }
82             else
83             {
84                 output[0 .. offset[0] * outVol * size] = 0;
85 
86                 for(size_t i = 0; i < inShape[0]; i++)
87                 {
88                     padImpl(input[i * inVol * size.. (i + 1) * inVol * size],
89                                 inShape[1 .. $],
90                                 inVol / inShape[1],
91                                 output[(i + offset[0]) * outVol * size .. (i + offset[0] + 1) * outVol * size],
92                                 outShape[1 .. $],
93                                 outVol / outShape[1],
94                                 offset[1 .. $]);
95                 }
96 
97                 output[(offset[0] + inShape[0]) * outVol * size .. $] = 0;
98             }
99         }
100 
101         auto inShape = op.deps[0].outputType.shape;
102         auto outShape = op.outputType.shape;
103         size_t inVol = op.deps[0].outputType.volume;
104         size_t outVol = op.outputType.volume;
105         auto offset = op.attributes["before"].get!(size_t[]);
106 
107         if(inShape.length > 0)
108         {
109             inVol /= inShape[0];
110             outVol /= outShape[0];
111         }
112 
113         padImpl(inputs[0].as!ubyte, inShape, inVol, output.as!ubyte, outShape, outVol, offset);
114     }
115 
116     void transpose(Operation op, const(Buffer)[] inputs, Buffer output)
117     {
118         import std.exception : enforce;
119         enforce(op.outputType.rank <= 2, "transpose is only implemented for rank <= 2");
120 
121         //Check whether we actually need to reorder them..
122         auto order = op
123                     .attributes["order"]
124                     .get!(size_t[]);
125 
126         if(order == [0, 1])
127         {
128             return;
129         }
130 
131         if(op.outputType.rank < 2)
132         {
133             output.as!ubyte[] = inputs[0].as!ubyte[];
134         }
135         else
136         {
137             auto inBuf = inputs[0].as!ubyte;
138             auto outBuf = output.as!ubyte;
139             size_t size = outBuf.length / op.outputType.volume;
140             size_t rows = op.outputType.shape[0];
141             size_t cols = op.outputType.shape[1];
142 
143             for(size_t r = 0; r < rows; r++)
144             {
145                 for(size_t c = 0; c < cols; c++)
146                 {
147                     outBuf[size * (r * cols + c) .. size * (r * cols + c + 1)] =
148                         inBuf[size * (c * rows + r) .. size * (c * rows + r + 1)];
149                 }
150             }
151         }
152     }
153 
154     void repeat(Operation op, const(Buffer)[] inputs, Buffer output)
155     {
156         void run(T)()
157         {
158             void process(const(T)[] inbuf, T[] outbuf, size_t reps, size_t vol)
159             {
160                 import std.array : array;
161                 import std.range : iota;
162                 import std.parallelism : parallel;
163 
164                 //for(size_t i = 0; i < inbuf.length; i += vol)
165                 foreach(i; iota(0, inbuf.length, vol).array().parallel)
166                 {
167                     for(size_t o = i * reps; o < (i + vol) * reps; o += vol)
168                     {
169                         outbuf[o .. o + vol] = inbuf[i .. i + vol];
170                     }
171                 }
172             }
173 
174             //Iterate over each axis, from smallest stride to largest stride
175             size_t vol = 1;
176             auto inbuf = inputs[0].as!T;
177             T[] outbuf;
178 
179             foreach_reverse(i, a; op.attributes["repetitions"].get!(size_t[]))
180             {
181                 vol *= op.deps[0].shape[i];
182                 outbuf = new T[inbuf.length * a];
183                 process(inbuf, outbuf, a, vol);
184                 vol *= a;
185                 inbuf = outbuf;
186             }
187 
188             output.as!T[] = outbuf[];
189         }
190 
191         switch(op.outputType.elementType)
192         {
193             case DataType.float32:
194                 run!float();
195                 break;
196 
197             case DataType.int32:
198                 run!int();
199                 break;
200 
201             default:
202                 throw new Exception("Not implemented.");
203         }
204     }
205 }