/**
    This module contains methods for initialising the parameters of neural networks.

    Several of the methods implemented in this module rely on $(D fan_in) and $(D fan_out) values. These are calculated
    differently depending on the rank of the parameter.

    For rank-2 tensors,

        $(D fan_in = shape[1]), $(D fan_out = shape[0])

    For rank-4 tensors,

        $(D fan_in = shape[1] * shape[2] * shape[3]), $(D fan_out = shape[0] * shape[2] * shape[3])

    Parameters of any other rank are rejected with an $(D Exception) by the initialisers that need these values.
*/
module dopt.nnet.parameters;

import std.math;

import dopt;

/**
    Used to initialize a parameter in the neural network.

    The $(D param) parameter will contain an $(D Operation) representing a variable. The ParamInitializer will set the
    default value of this variable according to some parameter initialisation scheme.
*/
alias ParamInitializer = void delegate(Operation param);

private
{
    /*
        Overwrites every element of vals with an independent sample drawn by std.random.uniform(minval, maxval),
        using that function's default boundaries.
    */
    void fillUniform(float[] vals, float minval, float maxval)
    {
        import std.random : uniform;

        foreach(ref v; vals)
        {
            v = uniform(minval, maxval);
        }
    }

    /*
        Overwrites every element of vals with an independent sample from a Gaussian distribution with the given mean
        and standard deviation, obtained by pushing a uniform sample through the inverse normal CDF.
    */
    void fillGaussian(float[] vals, float mean, float stddev)
    {
        import std.mathspecial : normalDistributionInverse;
        import std.random : uniform;

        foreach(ref v; vals)
        {
            //The open interval "()" is required: the default "[)" boundaries can produce exactly 0, and
            //normalDistributionInverse(0) is -infinity, which would poison the parameter tensor.
            v = normalDistributionInverse(uniform!"()"(0.0f, 1.0f)) * stddev + mean;
        }
    }

    /*
        Computes the fan-in of a parameter tensor with the given shape.

        Throws: Exception if the rank is neither 2 nor 4.
    */
    size_t fanIn(size_t[] shape)
    {
        switch(shape.length)
        {
            case 2:
                return shape[1];

            case 4:
                return shape[1] * shape[2] * shape[3];

            default:
                import std.conv : to;
                throw new Exception("Cannot compute fan-in for a parameter tensor of rank " ~ shape.length.to!string);
        }
    }

    /*
        Computes the fan-out of a parameter tensor with the given shape.

        Throws: Exception if the rank is neither 2 nor 4.
    */
    size_t fanOut(size_t[] shape)
    {
        switch(shape.length)
        {
            case 2:
                return shape[0];

            case 4:
                return shape[0] * shape[2] * shape[3];

            default:
                import std.conv : to;
                throw new Exception("Cannot compute fan-out for a parameter tensor of rank " ~ shape.length.to!string);
        }
    }
}

/**
    Encapsulates information about network parameters.

    This can be used to keep track of per-parameter loss functions (e.g., weight decay), and also projection functions
    that can be applied using constrained optimisation methods.
*/
struct Parameter
{
    ///A "variable" operation.
    Operation symbol;

    ///Used for applying loss terms to this parameter (e.g., weight decay)
    Operation loss;

    ///A projection operation that can enforce some constraint
    Projection projection;
}

/**
    Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a constant
    value.

    Params:
        val = The constant value to be used for initialisation.

    Returns:
        The constructed $(D ParamInitializer).
*/
ParamInitializer constantInit(float val)
{
    void init(Operation param)
    {
        //Slice assignment: writes val into every element of the parameter's float buffer
        param.value.as!float[] = val;
    }

    return &init;
}

/**
    Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a different
    sample from a uniform distribution.

    Params:
        minval = The lower bound of the uniform distribution.
        maxval = The upper bound of the uniform distribution.

    Returns:
        The constructed $(D ParamInitializer).
*/
ParamInitializer uniformInit(float minval, float maxval)
{
    void init(Operation param)
    {
        fillUniform(param.value.as!float, minval, maxval);
    }

    return &init;
}

/**
    Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a different
    sample from a Gaussian distribution.

    Params:
        mean = The mean of the Gaussian distribution.
        stddev = The standard deviation of the Gaussian distribution.

    Returns:
        The constructed $(D ParamInitializer).
*/
ParamInitializer gaussianInit(float mean, float stddev)
{
    void init(Operation param)
    {
        fillGaussian(param.value.as!float, mean, stddev);
    }

    return &init;
}

/**
    Creates a parameter initialiser that uses the method of Glorot and Bengio (2010).

    This technique initialises a parameter with samples from the following uniform distribution:

    U(-sqrt(6 / (fan_in + fan_out)), sqrt(6 / (fan_in + fan_out)))

    For more details, see Glorot and Bengio (2010): http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf

    Returns:
        The constructed $(D ParamInitializer).
*/
ParamInitializer glorotUniformInit()
{
    void init(Operation param)
    {
        auto bound = sqrt(6.0f / (param.shape.fanIn + param.shape.fanOut));
        fillUniform(param.value.as!float, -bound, bound);
    }

    return &init;
}

/**
    Creates a parameter initialiser that uses the method of Glorot and Bengio (2010).

    This technique initialises a parameter with samples from the following Gaussian distribution:

    μ = 0
    σ = sqrt(2 / (fan_in + fan_out))

    For more details, see Glorot and Bengio (2010): http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf

    Returns:
        The constructed $(D ParamInitializer).
*/
ParamInitializer glorotGaussianInit()
{
    void init(Operation param)
    {
        auto stddev = sqrt(2.0f / (param.shape.fanIn + param.shape.fanOut));
        fillGaussian(param.value.as!float, 0, stddev);
    }

    return &init;
}

/**
    Creates a parameter initialiser that uses the method of He et al. (2015).

    This technique initialises a parameter with samples from the following uniform distribution:

    U(-sqrt(6 / fan_in), sqrt(6 / fan_in))

    For more details, see He et al. (2015): http://arxiv.org/abs/1502.01852

    Returns:
        The constructed $(D ParamInitializer).
*/
ParamInitializer heUniformInit()
{
    void init(Operation param)
    {
        //The distribution must be symmetric about zero; a lower bound of 0 would make every weight non-negative
        auto bound = sqrt(6.0f / param.shape.fanIn);
        fillUniform(param.value.as!float, -bound, bound);
    }

    return &init;
}

/**
    Creates a parameter initialiser that uses the method of He et al. (2015).

    This technique initialises a parameter with samples from the following Gaussian distribution:

    μ = 0
    σ = sqrt(2 / fan_in)

    For more details, see He et al. (2015): http://arxiv.org/abs/1502.01852

    Returns:
        The constructed $(D ParamInitializer).
*/
ParamInitializer heGaussianInit()
{
    void init(Operation param)
    {
        auto stddev = sqrt(2.0f / param.shape.fanIn);
        fillGaussian(param.value.as!float, 0, stddev);
    }

    return &init;
}