1 module dopt.cpu.basic;
2 
3 import dopt.core;
4 import dopt.cpu;
5 
package
{
    /**
        Registers the CPU kernels implemented in this module under the names "slice", "pad", "transpose" and
        "repeat", in that order.
    */
    void initialize()
    {
        import std.functional : toDelegate;

        //Signature shared by every kernel function defined in this module
        alias KernelFn = void function(Operation, const(void[])[], void[]);

        //Wraps a plain function in a delegate-based kernel object and registers it under the given name
        void add(string name, KernelFn kernel)
        {
            registerCPUKernel(name, new CPUKernelDelegate(toDelegate(kernel)));
        }

        add("slice", &slice);
        add("pad", &pad);
        add("transpose", &transpose);
        add("repeat", &repeat);
    }
}
18 
private
{
    /**
        CPU kernel for the "slice" operation: copies a rectangular sub-region of the input into the output.

        The region begins at the per-axis offsets stored in the "start" attribute and its extent is the shape of
        op.outputType. The element width is derived from the output buffer (bytes / volume) rather than being
        hard-coded, which assumes the buffer holds exactly op.outputType.volume elements.

        Params:
            op = The slice operation being evaluated.
            inputs = Raw buffers of the dependencies; only inputs[0] is read.
            output = Raw buffer that receives the sliced tensor.
    */
    void slice(Operation op, const(void[])[] inputs, void[] output)
    {
        auto inBytes = cast(const(ubyte)[])inputs[0];
        auto outBytes = cast(ubyte[])output;
        const size_t totalOutVol = op.outputType.volume;

        //An empty result has nothing to copy. Returning here also keeps the per-axis volume divisions below
        //away from zero-length dimensions, which would otherwise divide by zero.
        if(outBytes.length == 0 || totalOutVol == 0)
        {
            return;
        }

        //Width of a single element in bytes
        const size_t size = outBytes.length / totalOutVol;

        //Recursively copies one axis at a time. srcVol/dstVol are the number of elements in one entry along the
        //leading axis of srcShape/dstShape respectively.
        void sliceImpl(const(ubyte)[] src, in size_t[] srcShape, size_t srcVol,
                       ubyte[] dst, in size_t[] dstShape, size_t dstVol, in size_t[] offset)
        {
            if(srcShape.length == 0)
            {
                //Scalars are copied verbatim
                dst[] = src[];
            }
            else if(srcShape.length == 1)
            {
                //The innermost axis is contiguous, so a single block copy suffices
                dst[] = src[offset[0] * size .. (offset[0] + dstShape[0]) * size];
            }
            else
            {
                for(size_t i = 0; i < dstShape[0]; i++)
                {
                    sliceImpl(src[(i + offset[0]) * srcVol * size .. (i + offset[0] + 1) * srcVol * size],
                              srcShape[1 .. $],
                              srcVol / srcShape[1],
                              dst[i * dstVol * size .. (i + 1) * dstVol * size],
                              dstShape[1 .. $],
                              dstVol / dstShape[1],
                              offset[1 .. $]);
                }
            }
        }

        auto inShape = op.deps[0].outputType.shape;
        auto outShape = op.outputType.shape;
        size_t inVol = op.deps[0].outputType.volume;
        size_t outVol = totalOutVol;
        auto offset = op.attributes["start"].get!(size_t[]);

        if(inShape.length > 0)
        {
            //Convert the total volumes into the volume of one entry along the first axis
            inVol /= inShape[0];
            outVol /= outShape[0];
        }

        sliceImpl(inBytes, inShape, inVol, outBytes, outShape, outVol, offset);
    }

    /**
        CPU kernel for the "pad" operation: places the input inside a larger, zero-filled output.

        The number of zero elements inserted in front of the data along each axis is read from the "before"
        attribute; everything after the copied data is zero-filled up to the shape of op.outputType. The element
        width is derived from the output buffer (bytes / volume), which assumes the buffer holds exactly
        op.outputType.volume elements.

        Params:
            op = The pad operation being evaluated.
            inputs = Raw buffers of the dependencies; only inputs[0] is read.
            output = Raw buffer that receives the padded tensor.
    */
    void pad(Operation op, const(void[])[] inputs, void[] output)
    {
        auto inBytes = cast(const(ubyte)[])inputs[0];
        auto outBytes = cast(ubyte[])output;
        const size_t totalOutVol = op.outputType.volume;

        if(outBytes.length == 0 || totalOutVol == 0)
        {
            return;
        }

        //Padding an empty tensor yields nothing but padding. Handling it here avoids dividing by a zero-length
        //dimension in the recursion below.
        if(inBytes.length == 0)
        {
            outBytes[] = 0;
            return;
        }

        //Width of a single element in bytes
        const size_t size = outBytes.length / totalOutVol;

        //Recursively pads one axis at a time. srcVol/dstVol are the number of elements in one entry along the
        //leading axis of srcShape/dstShape respectively.
        void padImpl(const(ubyte)[] src, in size_t[] srcShape, size_t srcVol,
                     ubyte[] dst, in size_t[] dstShape, size_t dstVol, in size_t[] offset)
        {
            if(srcShape.length == 0)
            {
                //Scalars are copied verbatim
                dst[] = src[];
            }
            else if(srcShape.length == 1)
            {
                //Leading zeros, the data itself, then trailing zeros
                dst[0 .. offset[0] * size] = 0;
                dst[offset[0] * size .. (offset[0] + srcShape[0]) * size] = src[];
                dst[(offset[0] + srcShape[0]) * size .. $] = 0;
            }
            else
            {
                //Entries in front of the data along this axis are entirely zero
                dst[0 .. offset[0] * dstVol * size] = 0;

                for(size_t i = 0; i < srcShape[0]; i++)
                {
                    padImpl(src[i * srcVol * size .. (i + 1) * srcVol * size],
                            srcShape[1 .. $],
                            srcVol / srcShape[1],
                            dst[(i + offset[0]) * dstVol * size .. (i + offset[0] + 1) * dstVol * size],
                            dstShape[1 .. $],
                            dstVol / dstShape[1],
                            offset[1 .. $]);
                }

                //Entries behind the data along this axis are entirely zero
                dst[(offset[0] + srcShape[0]) * dstVol * size .. $] = 0;
            }
        }

        auto inShape = op.deps[0].outputType.shape;
        auto outShape = op.outputType.shape;
        size_t inVol = op.deps[0].outputType.volume;
        size_t outVol = totalOutVol;
        auto offset = op.attributes["before"].get!(size_t[]);

        if(inShape.length > 0)
        {
            //Convert the total volumes into the volume of one entry along the first axis
            inVol /= inShape[0];
            outVol /= outShape[0];
        }

        padImpl(inBytes, inShape, inVol, outBytes, outShape, outVol, offset);
    }

    /**
        CPU kernel for the "transpose" operation, implemented for tensors of rank two or lower.

        Tensors of rank below two, and rank-two tensors whose "order" attribute is the identity permutation
        [0, 1], are copied unchanged. Any other rank-two order is treated as a swap of the two axes.

        Params:
            op = The transpose operation being evaluated.
            inputs = Raw buffers of the dependencies; only inputs[0] is read.
            output = Raw buffer that receives the transposed tensor.

        Throws:
            Exception if the rank of the output exceeds two.
    */
    void transpose(Operation op, const(void[])[] inputs, void[] output)
    {
        import std.exception : enforce;
        enforce(op.outputType.rank <= 2, "transpose is only implemented for rank <= 2");

        //Check whether we actually need to reorder them..
        auto order = op
                    .attributes["order"]
                    .get!(size_t[]);

        //No reordering is required, but the result buffer still has to be populated
        if(op.outputType.rank < 2 || order == [0, 1])
        {
            output[] = inputs[0][];
            return;
        }

        auto inBuf = cast(const(ubyte)[])inputs[0];
        auto outBuf = cast(ubyte[])output;

        //An empty matrix has nothing to move, and its zero volume must not be used as a divisor
        if(outBuf.length == 0)
        {
            return;
        }

        //Width of a single element in bytes
        size_t size = outBuf.length / op.outputType.volume;
        size_t rows = op.outputType.shape[0];
        size_t cols = op.outputType.shape[1];

        for(size_t r = 0; r < rows; r++)
        {
            for(size_t c = 0; c < cols; c++)
            {
                //Element (r, c) of the output is element (c, r) of the input, which has `rows` columns
                outBuf[size * (r * cols + c) .. size * (r * cols + c + 1)] =
                    inBuf[size * (c * rows + r) .. size * (c * rows + r + 1)];
            }
        }
    }

    /**
        CPU kernel for the "repeat" operation: tiles the input along each axis.

        The number of copies along each axis is read from the "repetitions" attribute. Axes are processed from
        the last to the first, with every pass writing into a freshly allocated intermediate buffer.

        Params:
            op = The repeat operation being evaluated.
            inputs = Raw buffers of the dependencies; only inputs[0] is read.
            output = Raw buffer that receives the tiled tensor.

        Throws:
            Exception if the element type is neither float32 nor int32.
    */
    void repeat(Operation op, const(void[])[] inputs, void[] output)
    {
        void run(T)()
        {
            //Splits inbuf into chunks of vol elements and writes each chunk reps times, back to back, to outbuf
            void process(const(T)[] inbuf, T[] outbuf, size_t reps, size_t vol)
            {
                import std.array : array;
                import std.range : iota;
                import std.parallelism : parallel;

                //Every chunk writes to its own region of outbuf, so chunks can be processed independently
                foreach(i; iota(0, inbuf.length, vol).array().parallel)
                {
                    for(size_t o = i * reps; o < (i + vol) * reps; o += vol)
                    {
                        outbuf[o .. o + vol] = inbuf[i .. i + vol];
                    }
                }
            }

            //Iterate over each axis, from smallest stride to largest stride
            size_t vol = 1;
            const(T)[] current = cast(const(T)[])inputs[0];

            foreach_reverse(i, a; op.attributes["repetitions"].get!(size_t[]))
            {
                vol *= op.deps[0].shape[i];
                auto next = new T[current.length * a];
                process(current, next, a, vol);
                vol *= a;
                current = next;
            }

            //With no axes to repeat (a scalar), `current` is still the input and is copied through unchanged
            auto outBytes = cast(ubyte[])output;
            outBytes[] = cast(const(ubyte)[])current;
        }

        switch(op.outputType.elementType)
        {
            case DataType.float32:
                run!float();
                break;

            case DataType.int32:
                run!int();
                break;

            default:
                throw new Exception("Not implemented.");
        }
    }
}