/**
    CPU kernels for the basic shape-manipulation operations: slice, pad, transpose and repeat.

    Every kernel receives the operation being executed, the raw input buffers and the raw output
    buffer it has to fill.
*/
module dopt.cpu.basic;

import dopt.core;
import dopt.cpu;

package
{
    /// Registers the kernels defined in this module under the names "slice", "pad", "transpose" and "repeat".
    void initialize()
    {
        import std.functional : toDelegate;

        registerCPUKernel("slice", new CPUKernelDelegate(toDelegate(&slice)));
        registerCPUKernel("pad", new CPUKernelDelegate(toDelegate(&pad)));
        registerCPUKernel("transpose", new CPUKernelDelegate(toDelegate(&transpose)));
        registerCPUKernel("repeat", new CPUKernelDelegate(toDelegate(&repeat)));
    }
}

private
{
    /**
        Copies a rectangular sub-region of the input tensor into the output buffer.

        The region starts at the per-axis offsets stored in the "start" attribute and has the shape of
        the operation's output type.

        Params:
            op = the slice operation; `op.deps[0]` is the tensor being sliced
            inputs = raw input buffers; only `inputs[0]` is read
            output = raw buffer that receives the sliced elements
    */
    void slice(Operation op, const(void[])[] inputs, void[] output)
    {
        auto inShape = op.deps[0].outputType.shape;
        auto outShape = op.outputType.shape;
        size_t inVol = op.deps[0].outputType.volume;
        size_t outVol = op.outputType.volume;
        auto offset = op.attributes["start"].get!(size_t[]);

        //An empty result has nothing to copy, and would otherwise lead to divisions by zero below
        if(outVol == 0)
        {
            return;
        }

        //Size of one element in bytes, derived from the buffer rather than assumed to be 4
        immutable size_t size = output.length / outVol;

        //inVol/outVol are the number of elements spanned by a single index of the leading axis
        void sliceImpl(const(ubyte)[] input, in size_t[] inShape, size_t inVol,
            ubyte[] output, in size_t[] outShape, size_t outVol, in size_t[] offset)
        {
            if(inShape.length == 0)
            {
                //Scalar: the slice is the whole value
                output[] = input[];
            }
            else if(inShape.length == 1)
            {
                //Innermost axis: the selected elements are contiguous
                output[] = input[offset[0] * size .. (offset[0] + outShape[0]) * size];
            }
            else
            {
                //Recurse into every selected index of the leading axis
                for(size_t i = 0; i < outShape[0]; i++)
                {
                    sliceImpl(input[(i + offset[0]) * inVol * size .. (i + offset[0] + 1) * inVol * size],
                              inShape[1 .. $],
                              inVol / inShape[1],
                              output[i * outVol * size .. (i + 1) * outVol * size],
                              outShape[1 .. $],
                              outVol / outShape[1],
                              offset[1 .. $]);
                }
            }
        }

        if(inShape.length > 0)
        {
            inVol /= inShape[0];
            outVol /= outShape[0];
        }

        sliceImpl(cast(const(ubyte[]))inputs[0], inShape, inVol, cast(ubyte[])output, outShape, outVol, offset);
    }

    /**
        Copies the input tensor into the output buffer and fills the rest of the buffer with zero bytes.

        The input is placed at the per-axis offsets stored in the "before" attribute; the padding after
        the data is whatever remains of the output shape.

        Params:
            op = the pad operation; `op.deps[0]` is the tensor being padded
            inputs = raw input buffers; only `inputs[0]` is read
            output = raw buffer that receives the padded tensor
    */
    void pad(Operation op, const(void[])[] inputs, void[] output)
    {
        auto inShape = op.deps[0].outputType.shape;
        auto outShape = op.outputType.shape;
        size_t inVol = op.deps[0].outputType.volume;
        size_t outVol = op.outputType.volume;
        auto offset = op.attributes["before"].get!(size_t[]);

        auto outBytes = cast(ubyte[])output;

        //Nothing to write
        if(outVol == 0)
        {
            return;
        }

        //No input data at all: the whole output is padding
        if(inVol == 0)
        {
            outBytes[] = 0;
            return;
        }

        //Size of one element in bytes, derived from the buffer rather than assumed to be 4
        immutable size_t size = outBytes.length / outVol;

        //inVol/outVol are the number of elements spanned by a single index of the leading axis
        void padImpl(const(ubyte[]) input, in size_t[] inShape, size_t inVol,
            ubyte[] output, in size_t[] outShape, size_t outVol, in size_t[] offset)
        {
            if(inShape.length == 0)
            {
                //Scalar: there is no axis to pad along
                output[] = input[];
            }
            else if(inShape.length == 1)
            {
                //Innermost axis: leading padding, data, trailing padding
                output[0 .. offset[0] * size] = 0;
                output[offset[0] * size .. (offset[0] + inShape[0]) * size] = input[];
                output[(offset[0] + inShape[0]) * size .. $] = 0;
            }
            else
            {
                //Leading padding along this axis
                output[0 .. offset[0] * outVol * size] = 0;

                //Each input index of the leading axis is padded recursively along the remaining axes
                for(size_t i = 0; i < inShape[0]; i++)
                {
                    padImpl(input[i * inVol * size .. (i + 1) * inVol * size],
                            inShape[1 .. $],
                            inVol / inShape[1],
                            output[(i + offset[0]) * outVol * size .. (i + offset[0] + 1) * outVol * size],
                            outShape[1 .. $],
                            outVol / outShape[1],
                            offset[1 .. $]);
                }

                //Trailing padding along this axis
                output[(offset[0] + inShape[0]) * outVol * size .. $] = 0;
            }
        }

        if(inShape.length > 0)
        {
            inVol /= inShape[0];
            outVol /= outShape[0];
        }

        padImpl(cast(const(ubyte[]))inputs[0], inShape, inVol, outBytes, outShape, outVol, offset);
    }

    /**
        Transposes a tensor of rank two or less according to the "order" attribute.

        Params:
            op = the transpose operation
            inputs = raw input buffers; only `inputs[0]` is read
            output = raw buffer that receives the transposed tensor

        Throws:
            Exception if the rank of the output is greater than two.
    */
    void transpose(Operation op, const(void[])[] inputs, void[] output)
    {
        import std.exception : enforce;
        enforce(op.outputType.rank <= 2, "transpose is only implemented for rank <= 2");

        auto order = op
                    .attributes["order"]
                    .get!(size_t[]);

        //No reordering is required for scalars, vectors or the identity permutation, but the
        //output buffer must still be populated with the input data.
        if(op.outputType.rank < 2 || order == [0, 1])
        {
            output[] = inputs[0][];
            return;
        }

        immutable size_t vol = op.outputType.volume;

        //An empty matrix has nothing to move; this also protects the division below
        if(vol == 0)
        {
            return;
        }

        auto inBuf = cast(const(ubyte[]))inputs[0];
        auto outBuf = cast(ubyte[])output;

        //Size of a single element in bytes
        immutable size_t size = outBuf.length / vol;
        immutable size_t rows = op.outputType.shape[0];
        immutable size_t cols = op.outputType.shape[1];

        //Output element (r, c) is input element (c, r); the input has `rows` columns
        for(size_t r = 0; r < rows; r++)
        {
            for(size_t c = 0; c < cols; c++)
            {
                outBuf[size * (r * cols + c) .. size * (r * cols + c + 1)] =
                    inBuf[size * (c * rows + r) .. size * (c * rows + r + 1)];
            }
        }
    }

    /**
        Tiles the input tensor along each axis by the counts stored in the "repetitions" attribute.

        Params:
            op = the repeat operation; `op.deps[0]` is the tensor being repeated
            inputs = raw input buffers; only `inputs[0]` is read
            output = raw buffer that receives the repeated tensor

        Throws:
            Exception if the element type is neither float32 nor int32.
    */
    void repeat(Operation op, const(void[])[] inputs, void[] output)
    {
        void run(T)()
        {
            //Writes `reps` consecutive copies of every `vol`-element chunk of inbuf into outbuf
            void process(const(T)[] inbuf, T[] outbuf, size_t reps, size_t vol)
            {
                import std.array : array;
                import std.range : iota;
                import std.parallelism : parallel;

                //Nothing to copy; also keeps iota from being given a zero step
                if(inbuf.length == 0 || vol == 0)
                {
                    return;
                }

                //Each chunk is written to a disjoint region of outbuf, so chunks are processed in parallel
                foreach(i; iota(0, inbuf.length, vol).array().parallel)
                {
                    for(size_t o = i * reps; o < (i + vol) * reps; o += vol)
                    {
                        outbuf[o .. o + vol] = inbuf[i .. i + vol];
                    }
                }
            }

            //Iterate over each axis, from smallest stride to largest stride
            size_t vol = 1;
            const(T)[] inbuf = cast(const(T)[])inputs[0];

            foreach_reverse(i, a; op.attributes["repetitions"].get!(size_t[]))
            {
                vol *= op.deps[0].shape[i];

                //A single repetition leaves the data unchanged, so skip the allocation and copy
                if(a != 1)
                {
                    auto outbuf = new T[inbuf.length * a];
                    process(inbuf, outbuf, a, vol);
                    inbuf = outbuf;
                }

                vol *= a;
            }

            //inbuf is the fully repeated tensor, or the untouched input if no axis needed repeating
            output[] = cast(const(void)[])inbuf;
        }

        switch(op.outputType.elementType)
        {
            case DataType.float32:
                run!float();
                break;

            case DataType.int32:
                run!int();
                break;

            default:
                throw new Exception("Not implemented.");
        }
    }
}