1 /**
2     This module contains methods for initialising the parameters of neural networks.
3 
4     Several of the methods implemented in this module rely on $(D fan_in) and $(D fan_out) values. These are calculated
5     differently depending on the rank of the parameter.
6 
7     For rank-2 tensors,
8         
        $(D fan_in = shape[1]), $(D fan_out = shape[0])
10     
11     For rank-4 tensors,
12         
13         $(D fan_in = shape[1] * shape[2] * shape[3]), $(D fan_out = shape[0] * shape[2] * shape[3])
14 */
15 module dopt.nnet.parameters;
16 
17 import std.math;
18 
19 import dopt.core;
20 import dopt.online;
21 
22 /**
23     Used to initialize a parameter in the neural network.
24 
25     The $(D param) parameter will contain an $(D Operation) representing a variable. The ParamInitializer will set the
26     default value of this variable according to some parameter initialisation scheme.
27 */
28 alias ParamInitializer = void delegate(Operation param);
29 
30 private
31 {
32     void fillUniform(float[] vals, float minval, float maxval)
33     {
34         import std.random : uniform;
35 
36         for(size_t i = 0; i < vals.length; i++)
37         {
38             vals[i] = uniform(minval, maxval);
39         }
40     }
41 
42     void fillGaussian(float[] vals, float mean, float stddev)
43     {
44         import std.mathspecial : normalDistributionInverse;
45         import std.random : uniform;
46 
47         for(size_t i = 0; i < vals.length; i++)
48         {
49             vals[i] = normalDistributionInverse(uniform(0.0f, 1.0f)) * stddev + mean;
50         }
51     }
52 
53     size_t fanIn(size_t[] shape)
54     {
55         if(shape.length == 2)
56         {
57             return shape[1];
58         }
59         else if(shape.length == 4)
60         {
61             return shape[1] * shape[2] * shape[3];
62         }
63         else
64         {
65             import std.conv : to;
66             throw new Exception("Cannot compute fan-in for a parameter tensor of rank " ~ shape.length.to!string);
67         }
68     }
69 
70     size_t fanOut(size_t[] shape)
71     {
72         if(shape.length == 2)
73         {
74             return shape[0];
75         }
76         else if(shape.length == 4)
77         {
78             return shape[0] * shape[2] * shape[3];
79         }
80         else
81         {
82             import std.conv : to;
83             throw new Exception("Cannot compute fan-out for a parameter tensor of rank " ~ shape.length.to!string);
84         }
85     }
86 }
87 
88 /**
89     Encapsulates information about network parameters.
90 
91     This can be used to keep track of per-parameter loss functions (e.g., weight decay), and also projection functions
92     that can be applied using constrained optimisation methods.
93 */
struct Parameter
{
    ///The $(D Operation) representing this parameter's variable in the graph.
    Operation symbol;

    ///Used for applying loss terms to this parameter (e.g., weight decay). May be null if no extra loss applies.
    Operation loss;

    ///A projection operation that can enforce some constraint via constrained optimisation methods.
    Projection projection;
}
105 
106 /**
107     Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a constant
108     value.
109 
110     Params:
111         val = The constant value to be used for initialisation.
112 
113     Returns:
114         The constructed $(D ParamInitializer).
115 */
ParamInitializer constantInit(float val)
{
    //Fill the parameter's buffer with copies of val and write it back.
    return delegate(Operation param)
    {
        auto vals = new float[param.volume];
        vals[] = val;
        param.value.set(vals);
    };
}
128 
129 /**
130     Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a different
131     sample from a uniform distribution.
132 
133     Params:
134         minval = The lower bound of the uniform distribution.
135         maxval = The upper bound of the uniform distribution.
136     
137     Returns:
138         The constructed $(D ParamInitializer).
139 */
ParamInitializer uniformInit(float minval, float maxval)
{
    //Each element is drawn independently from U(minval, maxval).
    return delegate(Operation param)
    {
        auto vals = param.value.get!float;
        fillUniform(vals, minval, maxval);
        param.value.set(vals);
    };
}
151 
152 /**
153     Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a different
154     sample from a Gaussian distribution.
155 
156     Params:
157         mean = The mean of the Gaussian distribution.
158         stddev = The standard deviation of the Gaussian distribution.
159     
160     Returns:
161         The constructed $(D ParamInitializer).
162 */
ParamInitializer gaussianInit(float mean, float stddev)
{
    //Each element is drawn independently from N(mean, stddev^2).
    return delegate(Operation param)
    {
        auto vals = param.value.get!float;
        fillGaussian(vals, mean, stddev);
        param.value.set(vals);
    };
}
174 
175 /**
176     Creates a parameter initialiser that uses the method of Glorot and Bengio (2010).
177 
178     This technique initialises a parameter with samples from the following uniform distribution:
179 
    U(-sqrt(6 / (fan_in + fan_out)), sqrt(6 / (fan_in + fan_out)))
181 
182     For more details, see Glorot and Bengio (2010): http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
183 
184     Returns:
        The constructed $(D ParamInitializer).
186 */
ParamInitializer glorotUniformInit()
{
    //Samples from U(-limit, limit), where limit = sqrt(6 / (fan_in + fan_out)).
    return delegate(Operation param)
    {
        auto limit = sqrt(6.0f / (fanIn(param.shape) + fanOut(param.shape)));
        auto vals = param.value.get!float;
        fillUniform(vals, -limit, limit);
        param.value.set(vals);
    };
}
199 
200 /**
201     Creates a parameter initialiser that uses the method of Glorot and Bengio (2010).
202 
203     This technique initialises a parameter with samples from the following Gaussian distribution:
204 
205     μ = 0
206     σ = sqrt(2 / (fan_in + fan_out))
207 
208     For more details, see Glorot and Bengio (2010): http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
209 
210     Returns:
        The constructed $(D ParamInitializer).
212 */
ParamInitializer glorotGaussianInit()
{
    //Samples from a zero-mean Gaussian with stddev = sqrt(2 / (fan_in + fan_out)).
    return delegate(Operation param)
    {
        auto stddev = sqrt(2.0f / (fanIn(param.shape) + fanOut(param.shape)));
        auto vals = param.value.get!float;
        fillGaussian(vals, 0, stddev);
        param.value.set(vals);
    };
}
224 
225 /**
226     Creates a parameter initialiser that uses the method of He et al. (2015).
227 
228     This technique initialises a parameter with samples from the following uniform distribution:
229 
    U(-sqrt(6 / fan_in), sqrt(6 / fan_in))
231 
232     For more details, see He et al. (2015): http://arxiv.org/abs/1502.01852
233 
234     Returns:
        The constructed $(D ParamInitializer).
236 */  
ParamInitializer heUniformInit()
{
    void init(Operation param)
    {
        //He et al. initialisation is zero-mean: sample from the symmetric interval
        //U(-limit, limit) with limit = sqrt(6 / fan_in). The previous code sampled
        //from U(0, limit), which produces only non-negative weights.
        auto limit = sqrt(6.0f / (param.shape.fanIn));
        auto buf = param.value.get!float;
        fillUniform(buf, -limit, limit);
        param.value.set(buf);
    }

    return &init;
}
248 
249 /**
250     Creates a parameter initialiser that uses the method of He et al. (2015).
251 
252     This technique initialises a parameter with samples from the following Gaussian distribution:
253 
254     μ = 0
255     σ = sqrt(2 / fan_in)
256 
257     For more details, see He et al. (2015): http://arxiv.org/abs/1502.01852
258 
259     Returns:
        The constructed $(D ParamInitializer).
261 */  
ParamInitializer heGaussianInit()
{
    //Samples from a zero-mean Gaussian with stddev = sqrt(2 / fan_in).
    return delegate(Operation param)
    {
        auto stddev = sqrt(2.0f / (fanIn(param.shape)));
        auto vals = param.value.get!float;
        fillGaussian(vals, 0, stddev);
        param.value.set(vals);
    };
}