/**
This module contains methods for initialising the parameters of neural networks.

Several of the methods implemented in this module rely on $(D fan_in) and $(D fan_out) values. These are calculated
differently depending on the rank of the parameter.

For rank-2 tensors,

$(D fan_in = shape[1]), $(D fan_out = shape[0])

For rank-4 tensors,

$(D fan_in = shape[1] * shape[2] * shape[3]), $(D fan_out = shape[0] * shape[2] * shape[3])
*/
module dopt.nnet.parameters;

import std.math;

import dopt.core;
import dopt.online;

/**
Used to initialise a parameter in the neural network.

The $(D param) parameter will contain an $(D Operation) representing a variable. The ParamInitializer will set the
default value of this variable according to some parameter initialisation scheme.
*/
alias ParamInitializer = void delegate(Operation param);

private
{
    void fillUniform(float[] vals, float minval, float maxval)
    {
        import std.random : uniform;

        foreach(ref v; vals)
        {
            v = uniform(minval, maxval);
        }
    }

    void fillGaussian(float[] vals, float mean, float stddev)
    {
        import std.mathspecial : normalDistributionInverse;
        import std.random : uniform;

        foreach(ref v; vals)
        {
            //Sample from the open interval (0, 1) so normalDistributionInverse never returns an infinity
            v = normalDistributionInverse(uniform!"()"(0.0f, 1.0f)) * stddev + mean;
        }
    }

    size_t fanIn(size_t[] shape)
    {
        if(shape.length == 2)
        {
            return shape[1];
        }
        else if(shape.length == 4)
        {
            return shape[1] * shape[2] * shape[3];
        }
        else
        {
            import std.conv : to;
            throw new Exception("Cannot compute fan-in for a parameter tensor of rank " ~ shape.length.to!string);
        }
    }

    size_t fanOut(size_t[] shape)
    {
        if(shape.length == 2)
        {
            return shape[0];
        }
        else if(shape.length == 4)
        {
            return shape[0] * shape[2] * shape[3];
        }
        else
        {
            import std.conv : to;
            throw new Exception("Cannot compute fan-out for a parameter tensor of rank " ~ shape.length.to!string);
        }
    }
}
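
//A small sanity check illustrating how fan-in and fan-out are derived from a parameter's shape, following the
//formulas in the module documentation. Rank-2 shapes are assumed to be laid out as [outputs, inputs], and rank-4
//shapes as [output channels, input channels, height, width]; the concrete sizes below are just example values.
unittest
{
    //Rank-2 (dense) parameter: fan_in comes from shape[1], fan_out from shape[0]
    assert(fanIn([10, 20]) == 20);
    assert(fanOut([10, 20]) == 10);

    //Rank-4 (convolutional) parameter: the spatial dimensions contribute to both values
    assert(fanIn([64, 3, 5, 5]) == 3 * 5 * 5);
    assert(fanOut([64, 3, 5, 5]) == 64 * 5 * 5);
}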

/**
Encapsulates information about network parameters.

This can be used to keep track of per-parameter loss functions (e.g., weight decay), and also projection functions
that can be applied using constrained optimisation methods.
*/
struct Parameter
{
    ///A "variable" operation.
    Operation symbol;

    ///Used for applying loss terms to this parameter (e.g., weight decay)
    Operation loss;

    ///A projection operation that can enforce some constraint
    Projection projection;
}

/**
Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a constant
value.

Params:
    val = The constant value to be used for initialisation.

Returns:
    The constructed $(D ParamInitializer).
*/
ParamInitializer constantInit(float val)
{
    void init(Operation param)
    {
        import std.array : array;
        import std.range : repeat;

        param.value.set(repeat(val, param.volume).array());
    }

    return &init;
}

/**
Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a different
sample from a uniform distribution.

Params:
    minval = The lower bound of the uniform distribution.
    maxval = The upper bound of the uniform distribution.

Returns:
    The constructed $(D ParamInitializer).
*/
ParamInitializer uniformInit(float minval, float maxval)
{
    void init(Operation param)
    {
        auto buf = param.value.get!float;
        fillUniform(buf, minval, maxval);
        param.value.set(buf);
    }

    return &init;
}

/**
Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a different
sample from a Gaussian distribution.

Params:
    mean = The mean of the Gaussian distribution.
    stddev = The standard deviation of the Gaussian distribution.

Returns:
    The constructed $(D ParamInitializer).
*/
ParamInitializer gaussianInit(float mean, float stddev)
{
    void init(Operation param)
    {
        auto buf = param.value.get!float;
        fillGaussian(buf, mean, stddev);
        param.value.set(buf);
    }

    return &init;
}

/**
Creates a parameter initialiser that uses the method of Glorot and Bengio (2010).

This technique initialises a parameter with samples from the following uniform distribution:

U(-sqrt(6 / (fan_in + fan_out)), sqrt(6 / (fan_in + fan_out)))

For more details, see Glorot and Bengio (2010): http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf

Returns:
    The constructed $(D ParamInitializer).
*/
ParamInitializer glorotUniformInit()
{
    void init(Operation param)
    {
        auto bound = sqrt(6.0f / (param.shape.fanIn + param.shape.fanOut));
        auto buf = param.value.get!float;
        fillUniform(buf, -bound, bound);
        param.value.set(buf);
    }

    return &init;
}

/**
Creates a parameter initialiser that uses the method of Glorot and Bengio (2010).

This technique initialises a parameter with samples from the following Gaussian distribution:

μ = 0
σ = sqrt(2 / (fan_in + fan_out))

For more details, see Glorot and Bengio (2010): http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf

Returns:
    The constructed $(D ParamInitializer).
*/
ParamInitializer glorotGaussianInit()
{
    void init(Operation param)
    {
        auto buf = param.value.get!float;
        fillGaussian(buf, 0, sqrt(2.0f / (param.shape.fanIn + param.shape.fanOut)));
        param.value.set(buf);
    }

    return &init;
}

/**
Creates a parameter initialiser that uses the method of He et al. (2015).

This technique initialises a parameter with samples from the following uniform distribution:

U(-sqrt(6 / fan_in), sqrt(6 / fan_in))

For more details, see He et al. (2015): http://arxiv.org/abs/1502.01852

Returns:
    The constructed $(D ParamInitializer).
*/
ParamInitializer heUniformInit()
{
    void init(Operation param)
    {
        //Draw samples from a symmetric interval, as described in He et al. (2015)
        auto bound = sqrt(6.0f / param.shape.fanIn);
        auto buf = param.value.get!float;
        fillUniform(buf, -bound, bound);
        param.value.set(buf);
    }

    return &init;
}

/**
Creates a parameter initialiser that uses the method of He et al. (2015).

This technique initialises a parameter with samples from the following Gaussian distribution:

μ = 0
σ = sqrt(2 / fan_in)

For more details, see He et al. (2015): http://arxiv.org/abs/1502.01852

Returns:
    The constructed $(D ParamInitializer).
*/
ParamInitializer heGaussianInit()
{
    void init(Operation param)
    {
        auto buf = param.value.get!float;
        fillGaussian(buf, 0, sqrt(2.0f / param.shape.fanIn));
        param.value.set(buf);
    }

    return &init;
}
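
//A minimal illustrative check of the Glorot- and He-style bound computations, mirroring what glorotUniformInit and
//heUniformInit do internally. It only uses the private helpers in this module rather than constructing a dopt
//Operation, and the [64, 3, 3, 3] kernel shape is a hypothetical example.
unittest
{
    import std.algorithm : all;

    //A hypothetical rank-4 convolution kernel shape: [output channels, input channels, height, width]
    size_t[] shape = [64, 3, 3, 3];

    //Glorot uniform bound: sqrt(6 / (fan_in + fan_out))
    auto glorotBound = sqrt(6.0f / (shape.fanIn + shape.fanOut));

    //He uniform bound: sqrt(6 / fan_in)
    auto heBound = sqrt(6.0f / shape.fanIn);

    auto buf = new float[shape[0] * shape[1] * shape[2] * shape[3]];

    fillUniform(buf, -glorotBound, glorotBound);
    assert(buf.all!(x => x >= -glorotBound && x < glorotBound));

    fillUniform(buf, -heBound, heBound);
    assert(buf.all!(x => x >= -heBound && x < heBound));
}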