module dopt.core.cpu.basic;

import dopt.core;

package
{
    /**
        Registers the CPU kernels implemented in this module under the names
        "slice", "pad", "transpose" and "repeat".
    */
    void initialize()
    {
        import std.functional : toDelegate;

        registerCPUKernel("slice", new CPUKernelDelegate(toDelegate(&slice)));
        registerCPUKernel("pad", new CPUKernelDelegate(toDelegate(&pad)));
        registerCPUKernel("transpose", new CPUKernelDelegate(toDelegate(&transpose)));
        registerCPUKernel("repeat", new CPUKernelDelegate(toDelegate(&repeat)));
    }
}

private
{
    /**
        CPU kernel for "slice": copies a rectangular sub-region of the operand into the output.

        The region starts at the per-axis offsets stored in the "start" attribute and has the extent of the
        output shape.

        Params:
            op = The slice operation; its first dependency is the tensor being sliced.
            inputs = Input buffers; only inputs[0] is read.
            output = Buffer that receives the sliced elements.
    */
    void slice(Operation op, const(Buffer)[] inputs, Buffer output)
    {
        //Element size in bytes.
        //NOTE(review): hard-coded to 4, so this presumably only works for 4 byte element types -- confirm.
        size_t size = 4;

        //Recursively copies one axis at a time. inVol/outVol are the number of elements spanned by a single
        //index along the leading axis of the input/output respectively.
        void sliceImpl(const(ubyte)[] input, in size_t[] inShape, size_t inVol,
                    ubyte[] output, in size_t[] outShape, size_t outVol, in size_t[] offset)
        {
            if(inShape.length == 0)
            {
                output[] = input[];
            }
            else if(inShape.length == 1)
            {
                //Innermost axis: the selected elements are contiguous, so copy them in one go
                output[] = input[offset[0] * size .. (offset[0] + outShape[0]) * size];
            }
            else
            {
                for(size_t i = 0; i < outShape[0]; i++)
                {
                    sliceImpl(input[(i + offset[0]) * inVol * size.. (i + offset[0] + 1) * inVol * size],
                                inShape[1 .. $],
                                inVol / inShape[1],
                                output[i * outVol * size .. (i + 1) * outVol * size],
                                outShape[1 .. $],
                                outVol / outShape[1],
                                offset[1 .. $]);
                }
            }
        }

        auto inShape = op.deps[0].outputType.shape;
        auto outShape = op.outputType.shape;
        size_t inVol = op.deps[0].outputType.volume;
        size_t outVol = op.outputType.volume;
        auto offset = op.attributes["start"].get!(size_t[]);

        if(inShape.length > 0)
        {
            //An empty result has nothing to copy. Bail out before dividing by a zero-sized dimension.
            if(outVol == 0)
            {
                return;
            }

            inVol /= inShape[0];
            outVol /= outShape[0];
        }

        sliceImpl(inputs[0].as!ubyte, inShape, inVol, output.as!ubyte, outShape, outVol, offset);
    }

    /**
        CPU kernel for "pad": copies the operand into the output and fills the rest of the output with zero bytes.

        The operand is placed at the per-axis offsets stored in the "before" attribute.

        Params:
            op = The pad operation; its first dependency is the tensor being padded.
            inputs = Input buffers; only inputs[0] is read.
            output = Buffer that receives the padded tensor.
    */
    void pad(Operation op, const(Buffer)[] inputs, Buffer output)
    {
        //Element size in bytes.
        //NOTE(review): hard-coded to 4, so this presumably only works for 4 byte element types -- confirm.
        size_t size = 4;

        //Recursively pads one axis at a time. inVol/outVol are the number of elements spanned by a single
        //index along the leading axis of the input/output respectively.
        void padImpl(const(ubyte[]) input, size_t[] inShape, size_t inVol,
                    ubyte[] output, size_t[] outShape, size_t outVol, size_t[] offset)
        {
            if(inShape.length == 0)
            {
                output[] = input[];
            }
            else if(inShape.length == 1)
            {
                //Innermost axis: zeros, then the input, then zeros up to the end of the output
                output[0 .. offset[0] * size] = 0;
                output[offset[0] * size .. (offset[0] + inShape[0]) * size] = input[];
                output[(offset[0] + inShape[0]) * size .. $] = 0;
            }
            else
            {
                //Zero everything in front of the first input sub-tensor
                output[0 .. offset[0] * outVol * size] = 0;

                for(size_t i = 0; i < inShape[0]; i++)
                {
                    padImpl(input[i * inVol * size.. (i + 1) * inVol * size],
                                inShape[1 .. $],
                                inVol / inShape[1],
                                output[(i + offset[0]) * outVol * size .. (i + offset[0] + 1) * outVol * size],
                                outShape[1 .. $],
                                outVol / outShape[1],
                                offset[1 .. $]);
                }

                //Zero everything after the last input sub-tensor
                output[(offset[0] + inShape[0]) * outVol * size .. $] = 0;
            }
        }

        auto inShape = op.deps[0].outputType.shape;
        auto outShape = op.outputType.shape;
        size_t inVol = op.deps[0].outputType.volume;
        size_t outVol = op.outputType.volume;
        auto offset = op.attributes["before"].get!(size_t[]);

        if(inShape.length > 0)
        {
            //Nothing to write at all. Bail out before dividing by a zero-sized dimension.
            if(outVol == 0)
            {
                return;
            }

            //An empty operand contributes no elements, so the result consists solely of padding. Handling this
            //here also avoids dividing by a zero-sized input dimension below.
            if(inVol == 0)
            {
                auto outBytes = output.as!ubyte;
                outBytes[] = 0;
                return;
            }

            inVol /= inShape[0];
            outVol /= outShape[0];
        }

        padImpl(inputs[0].as!ubyte, inShape, inVol, output.as!ubyte, outShape, outVol, offset);
    }

    /**
        CPU kernel for "transpose": reorders the axes of a tensor of rank two or less.

        Tensors of rank less than two, and rank two tensors with the identity order [0, 1], are copied through
        unchanged. Otherwise the two axes are swapped.

        Params:
            op = The transpose operation; the "order" attribute holds the axis permutation.
            inputs = Input buffers; only inputs[0] is read.
            output = Buffer that receives the transposed tensor.

        Throws:
            Exception if the rank of the output is greater than two.
    */
    void transpose(Operation op, const(Buffer)[] inputs, Buffer output)
    {
        import std.exception : enforce;
        enforce(op.outputType.rank <= 2, "transpose is only implemented for rank <= 2");

        //Check whether we actually need to reorder them..
        auto order = op
                    .attributes["order"]
                    .get!(size_t[]);

        //No reordering required, but the output buffer still has to be populated with the operand's data
        if(op.outputType.rank < 2 || order == [0, 1])
        {
            output.as!ubyte[] = inputs[0].as!ubyte[];
            return;
        }

        size_t vol = op.outputType.volume;

        //Empty tensor: nothing to move, and the element size below could not be computed anyway
        if(vol == 0)
        {
            return;
        }

        auto inBuf = inputs[0].as!ubyte;
        auto outBuf = output.as!ubyte;

        //Element size in bytes, derived from the size of the output buffer
        size_t size = outBuf.length / vol;
        size_t rows = op.outputType.shape[0];
        size_t cols = op.outputType.shape[1];

        for(size_t r = 0; r < rows; r++)
        {
            for(size_t c = 0; c < cols; c++)
            {
                //Element (r, c) of the output is element (c, r) of the input
                outBuf[size * (r * cols + c) .. size * (r * cols + c + 1)] =
                    inBuf[size * (c * rows + r) .. size * (c * rows + r + 1)];
            }
        }
    }

    /**
        CPU kernel for "repeat": tiles the operand along each axis.

        The number of repetitions for each axis is given by the "repetitions" attribute. If that attribute is empty
        the operand is copied to the output unchanged.

        Params:
            op = The repeat operation; its first dependency is the tensor being repeated.
            inputs = Input buffers; only inputs[0] is read.
            output = Buffer that receives the tiled tensor.

        Throws:
            Exception if the element type is neither float32 nor int32.
    */
    void repeat(Operation op, const(Buffer)[] inputs, Buffer output)
    {
        void run(T)()
        {
            //Writes each consecutive run of `vol` elements from inbuf into outbuf `reps` times in a row
            void process(const(T)[] inbuf, T[] outbuf, size_t reps, size_t vol)
            {
                import std.array : array;
                import std.range : iota;
                import std.parallelism : parallel;

                //Nothing to copy. This also keeps a zero step (vol == 0) away from iota.
                if(inbuf.length == 0)
                {
                    return;
                }

                //for(size_t i = 0; i < inbuf.length; i += vol)
                foreach(i; iota(0, inbuf.length, vol).array().parallel)
                {
                    for(size_t o = i * reps; o < (i + vol) * reps; o += vol)
                    {
                        outbuf[o .. o + vol] = inbuf[i .. i + vol];
                    }
                }
            }

            //Iterate over each axis, from smallest stride to largest stride
            size_t vol = 1;

            //Holds the result of the axes processed so far. Starting from the input means that an empty list of
            //repetitions results in a plain copy rather than a length mismatch on the final assignment.
            const(T)[] current = inputs[0].as!T;

            foreach_reverse(i, a; op.attributes["repetitions"].get!(size_t[]))
            {
                vol *= op.deps[0].shape[i];
                auto expanded = new T[current.length * a];
                process(current, expanded, a, vol);
                vol *= a;
                current = expanded;
            }

            output.as!T[] = current[];
        }

        switch(op.outputType.elementType)
        {
            case DataType.float32:
                run!float();
                break;

            case DataType.int32:
                run!int();
                break;

            default:
                throw new Exception("Not implemented.");
        }
    }
}