1 /**
2     This module contains methods for initialising the parameters of neural networks.
3 
4     Several of the methods implemented in this module rely on $(D fan_in) and $(D fan_out) values. These are calculated
5     differently depending on the rank of the parameter.
6 
7     For rank-2 tensors,
8         
        $(D fan_in = shape[1]), $(D fan_out = shape[0])
10     
11     For rank-4 tensors,
12         
13         $(D fan_in = shape[1] * shape[2] * shape[3]), $(D fan_out = shape[0] * shape[2] * shape[3])
14 */
15 module dopt.nnet.parameters;
16 
17 import std.math;
18 
19 import dopt.core;
20 import dopt.online;
21 
/**
    Used to initialise a parameter in the neural network.

    The $(D param) argument contains an $(D Operation) representing a variable. A $(D ParamInitializer) overwrites the
    default value of this variable according to some parameter initialisation scheme.
*/
alias ParamInitializer = void delegate(Operation param);
29 
30 private
31 {
32     void fillUniform(float[] vals, float minval, float maxval)
33     {
34         import std.random : uniform;
35 
36         for(size_t i = 0; i < vals.length; i++)
37         {
38             vals[i] = uniform(minval, maxval);
39         }
40     }
41 
42     void fillGaussian(float[] vals, float mean, float stddev)
43     {
44         import std.mathspecial : normalDistributionInverse;
45         import std.random : uniform;
46 
47         for(size_t i = 0; i < vals.length; i++)
48         {
49             vals[i] = normalDistributionInverse(uniform(0.0f, 1.0f)) * stddev + mean;
50         }
51     }
52 
53     size_t fanIn(size_t[] shape)
54     {
55         if(shape.length == 2)
56         {
57             return shape[1];
58         }
59         else if(shape.length == 4)
60         {
61             return shape[1] * shape[2] * shape[3];
62         }
63         else
64         {
65             import std.conv : to;
66             throw new Exception("Cannot compute fan-in for a parameter tensor of rank " ~ shape.length.to!string);
67         }
68     }
69 
70     size_t fanOut(size_t[] shape)
71     {
72         if(shape.length == 2)
73         {
74             return shape[0];
75         }
76         else if(shape.length == 4)
77         {
78             return shape[0] * shape[2] * shape[3];
79         }
80         else
81         {
82             import std.conv : to;
83             throw new Exception("Cannot compute fan-out for a parameter tensor of rank " ~ shape.length.to!string);
84         }
85     }
86 }
87 
88 /**
89     Encapsulates information about network parameters.
90 
91     This can be used to keep track of per-parameter loss functions (e.g., weight decay), and also projection functions
92     that can be applied using constrained optimisation methods.
93 */
struct Parameter
{
    ///A "variable" operation holding the parameter's value.
    Operation symbol;

    ///Used for applying loss terms to this parameter (e.g., weight decay).
    Operation loss;

    ///A projection operation that can enforce some constraint during optimisation.
    Projection projection;
}
105 
106 /**
107     Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a constant
108     value.
109 
110     Params:
111         val = The constant value to be used for initialisation.
112 
113     Returns:
114         The constructed $(D ParamInitializer).
115 */
ParamInitializer constantInit(float val)
{
    //Assigns the constant to every element of the parameter's value buffer.
    void init(Operation param)
    {
        auto buf = param.value.as!float;
        buf[] = val;
    }

    return &init;
}
125 
126 /**
127     Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a different
128     sample from a uniform distribution.
129 
130     Params:
131         minval = The lower bound of the uniform distribution.
132         maxval = The upper bound of the uniform distribution.
133     
134     Returns:
135         The constructed $(D ParamInitializer).
136 */
ParamInitializer uniformInit(float minval, float maxval)
{
    //Delegates to fillUniform with the user-supplied bounds.
    void init(Operation param)
    {
        auto buf = param.value.as!float;
        fillUniform(buf, minval, maxval);
    }

    return &init;
}
146 
147 /**
148     Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a different
149     sample from a Gaussian distribution.
150 
151     Params:
152         mean = The mean of the Gaussian distribution.
153         stddev = The standard deviation of the Gaussian distribution.
154     
155     Returns:
156         The constructed $(D ParamInitializer).
157 */
ParamInitializer gaussianInit(float mean, float stddev)
{
    //Delegates to fillGaussian with the user-supplied moments.
    void init(Operation param)
    {
        auto buf = param.value.as!float;
        fillGaussian(buf, mean, stddev);
    }

    return &init;
}
167 
168 /**
169     Creates a parameter initialiser that uses the method of Glorot and Bengio (2010).
170 
171     This technique initialises a parameter with samples from the following uniform distribution:
172 
    U(-sqrt(6 / (fan_in + fan_out)), sqrt(6 / (fan_in + fan_out)))
174 
175     For more details, see Glorot and Bengio (2010): http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
176 
177     Returns:
        The constructed $(D ParamInitializer).
179 */
ParamInitializer glorotUniformInit()
{
    //Samples from U(-limit, limit), where limit = sqrt(6 / (fan_in + fan_out)).
    void init(Operation param)
    {
        auto fin = param.shape.fanIn;
        auto fout = param.shape.fanOut;
        auto limit = sqrt(6.0f / (fin + fout));

        fillUniform(param.value.as!float, -limit, limit);
    }

    return &init;
}
190 
191 /**
192     Creates a parameter initialiser that uses the method of Glorot and Bengio (2010).
193 
194     This technique initialises a parameter with samples from the following Gaussian distribution:
195 
196     μ = 0
197     σ = sqrt(2 / (fan_in + fan_out))
198 
199     For more details, see Glorot and Bengio (2010): http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
200 
201     Returns:
        The constructed $(D ParamInitializer).
203 */
ParamInitializer glorotGaussianInit()
{
    //Samples from a zero-mean Gaussian with stddev = sqrt(2 / (fan_in + fan_out)).
    void init(Operation param)
    {
        auto stddev = sqrt(2.0f / (param.shape.fanIn + param.shape.fanOut));
        fillGaussian(param.value.as!float, 0, stddev);
    }

    return &init;
}
213 
214 /**
215     Creates a parameter initialiser that uses the method of He et al. (2015).
216 
217     This technique initialises a parameter with samples from the following uniform distribution:
218 
    U(-sqrt(6 / fan_in), sqrt(6 / fan_in))
220 
221     For more details, see He et al. (2015): http://arxiv.org/abs/1502.01852
222 
223     Returns:
        The constructed $(D ParamInitializer).
225 */  
ParamInitializer heUniformInit()
{
    //Samples from U(-bound, bound), where bound = sqrt(6 / fan_in), per He et al. (2015).
    void init(Operation param)
    {
        auto bound = sqrt(6.0f / (param.shape.fanIn));

        //The lower bound must be -bound: the previous implementation used 0, which biased all
        //weights towards positive values instead of sampling symmetrically about zero
        //(cf. glorotUniformInit, which correctly uses a symmetric interval).
        fillUniform(param.value.as!float, -bound, bound);
    }

    return &init;
}
235 
236 /**
237     Creates a parameter initialiser that uses the method of He et al. (2015).
238 
239     This technique initialises a parameter with samples from the following Gaussian distribution:
240 
241     μ = 0
242     σ = sqrt(2 / fan_in)
243 
244     For more details, see He et al. (2015): http://arxiv.org/abs/1502.01852
245 
246     Returns:
        The constructed $(D ParamInitializer).
248 */  
ParamInitializer heGaussianInit()
{
    //Samples from a zero-mean Gaussian with stddev = sqrt(2 / fan_in), per He et al. (2015).
    void init(Operation param)
    {
        auto stddev = sqrt(2.0f / param.shape.fanIn);
        fillGaussian(param.value.as!float, 0, stddev);
    }

    return &init;
}