/**
    This module contains methods for initialising the parameters of neural networks.

    Several of the methods implemented in this module rely on $(D fan_in) and $(D fan_out) values. These are calculated
    differently depending on the rank of the parameter.

    For rank-2 tensors,

        $(D fan_in = shape[1]), $(D fan_out = shape[0])

    For rank-4 tensors,

        $(D fan_in = shape[1] * shape[2] * shape[3]), $(D fan_out = shape[0] * shape[2] * shape[3])

    Parameters of any other rank are rejected with an $(D Exception) by the fan-based initialisers.
*/
module dopt.nnet.parameters;

import std.math;

import dopt.core;
import dopt.online;

/**
    Used to initialize a parameter in the neural network.

    The $(D param) parameter will contain an $(D Operation) representing a variable. The ParamInitializer will set the
    default value of this variable according to some parameter initialisation scheme.
*/
alias ParamInitializer = void delegate(Operation param);

private
{
    /// Overwrites every element of `vals` with an independent sample from `uniform(minval, maxval)`.
    void fillUniform(float[] vals, float minval, float maxval)
    {
        import std.random : uniform;

        foreach(ref v; vals)
        {
            v = uniform(minval, maxval);
        }
    }

    /// Overwrites every element of `vals` with an independent sample from a Gaussian with the given mean and
    /// standard deviation, using inverse transform sampling.
    void fillGaussian(float[] vals, float mean, float stddev)
    {
        import std.mathspecial : normalDistributionInverse;
        import std.random : uniform;

        foreach(ref v; vals)
        {
            // Sample from the OPEN interval (0, 1): normalDistributionInverse maps 0 to -infinity, which would
            // otherwise occasionally poison a parameter with a non-finite value.
            const p = uniform!"()"(0.0f, 1.0f);
            v = normalDistributionInverse(p) * stddev + mean;
        }
    }

    /// Computes the fan-in of a parameter with the given shape.
    /// Throws: $(D Exception) if the rank is neither 2 nor 4.
    size_t fanIn(size_t[] shape)
    {
        switch(shape.length)
        {
            case 2:
                return shape[1];

            case 4:
                return shape[1] * shape[2] * shape[3];

            default:
                import std.conv : to;
                throw new Exception("Cannot compute fan-in for a parameter tensor of rank " ~ shape.length.to!string);
        }
    }

    /// Computes the fan-out of a parameter with the given shape.
    /// Throws: $(D Exception) if the rank is neither 2 nor 4.
    size_t fanOut(size_t[] shape)
    {
        switch(shape.length)
        {
            case 2:
                return shape[0];

            case 4:
                return shape[0] * shape[2] * shape[3];

            default:
                import std.conv : to;
                throw new Exception("Cannot compute fan-out for a parameter tensor of rank " ~ shape.length.to!string);
        }
    }
}

/**
    Encapsulates information about network parameters.

    This can be used to keep track of per-parameter loss functions (e.g., weight decay), and also projection functions
    that can be applied using constrained optimisation methods.
*/
struct Parameter
{
    ///A "variable" operation.
    Operation symbol;

    ///Used for applying loss terms to this parameter (e.g., weight decay)
    Operation loss;

    ///A projection operation that can enforce some constraint
    Projection projection;
}

/**
    Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a constant
    value.

    Params:
        val = The constant value to be used for initialisation.

    Returns:
        The constructed $(D ParamInitializer).
*/
ParamInitializer constantInit(float val)
{
    void init(Operation param)
    {
        // Slice assignment: broadcasts val to every element of the parameter's buffer.
        param.value.as!float[] = val;
    }

    return &init;
}

/**
    Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a different
    sample from a uniform distribution.

    Params:
        minval = The lower bound of the uniform distribution.
        maxval = The upper bound of the uniform distribution.

    Returns:
        The constructed $(D ParamInitializer).
*/
ParamInitializer uniformInit(float minval, float maxval)
{
    void init(Operation param)
    {
        fillUniform(param.value.as!float, minval, maxval);
    }

    return &init;
}

/**
    Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a different
    sample from a Gaussian distribution.

    Params:
        mean = The mean of the Gaussian distribution.
        stddev = The standard deviation of the Gaussian distribution.

    Returns:
        The constructed $(D ParamInitializer).
*/
ParamInitializer gaussianInit(float mean, float stddev)
{
    void init(Operation param)
    {
        fillGaussian(param.value.as!float, mean, stddev);
    }

    return &init;
}

/**
    Creates a parameter initialiser that uses the method of Glorot and Bengio (2010).

    This technique initialises a parameter with samples from the following uniform distribution:

    U(-sqrt(6 / (fan_in + fan_out)), sqrt(6 / (fan_in + fan_out)))

    For more details, see Glorot and Bengio (2010): http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf

    Returns:
        The constructed $(D ParamInitializer).
*/
ParamInitializer glorotUniformInit()
{
    void init(Operation param)
    {
        auto bound = sqrt(6.0f / (param.shape.fanIn + param.shape.fanOut));
        fillUniform(param.value.as!float, -bound, bound);
    }

    return &init;
}

/**
    Creates a parameter initialiser that uses the method of Glorot and Bengio (2010).

    This technique initialises a parameter with samples from the following Gaussian distribution:

    μ = 0
    σ = sqrt(2 / (fan_in + fan_out))

    For more details, see Glorot and Bengio (2010): http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf

    Returns:
        The constructed $(D ParamInitializer).
*/
ParamInitializer glorotGaussianInit()
{
    void init(Operation param)
    {
        auto stddev = sqrt(2.0f / (param.shape.fanIn + param.shape.fanOut));
        fillGaussian(param.value.as!float, 0, stddev);
    }

    return &init;
}

/**
    Creates a parameter initialiser that uses the method of He et al. (2015).

    This technique initialises a parameter with samples from the following uniform distribution:

    U(-sqrt(6 / fan_in), sqrt(6 / fan_in))

    For more details, see He et al. (2015): http://arxiv.org/abs/1502.01852

    Returns:
        The constructed $(D ParamInitializer).
*/
ParamInitializer heUniformInit()
{
    void init(Operation param)
    {
        // The interval must be symmetric about zero; sampling from [0, bound) would make every weight
        // non-negative and would not give the zero-mean distribution this scheme relies on.
        auto bound = sqrt(6.0f / (param.shape.fanIn));
        fillUniform(param.value.as!float, -bound, bound);
    }

    return &init;
}

/**
    Creates a parameter initialiser that uses the method of He et al. (2015).

    This technique initialises a parameter with samples from the following Gaussian distribution:

    μ = 0
    σ = sqrt(2 / fan_in)

    For more details, see He et al. (2015): http://arxiv.org/abs/1502.01852

    Returns:
        The constructed $(D ParamInitializer).
*/
ParamInitializer heGaussianInit()
{
    void init(Operation param)
    {
        auto stddev = sqrt(2.0f / (param.shape.fanIn));
        fillGaussian(param.value.as!float, 0, stddev);
    }

    return &init;
}