1 /**
2     This module contains methods for initialising the parameters of neural networks.
3 
4     Several of the methods implemented in this module rely on $(D fan_in) and $(D fan_out) values. These are calculated
5     differently depending on the rank of the parameter.
6 
7     For rank-2 tensors,
8         
        $(D fan_in = shape[1]), $(D fan_out = shape[0])
10     
11     For rank-4 tensors,
12         
13         $(D fan_in = shape[1] * shape[2] * shape[3]), $(D fan_out = shape[0] * shape[2] * shape[3])
14 */
15 module dopt.nnet.parameters;
16 
17 import std.math;
18 
19 import dopt;
20 
21 /**
22     Used to initialize a parameter in the neural network.
23 
24     The $(D param) parameter will contain an $(D Operation) representing a variable. The ParamInitializer will set the
25     default value of this variable according to some parameter initialisation scheme.
26 */
alias ParamInitializer = void delegate(Operation param); //invoked once per parameter; writes the initial value into param's buffer in place
28 
29 private
30 {
31     void fillUniform(float[] vals, float minval, float maxval)
32     {
33         import std.random : uniform;
34 
35         for(size_t i = 0; i < vals.length; i++)
36         {
37             vals[i] = uniform(minval, maxval);
38         }
39     }
40 
41     void fillGaussian(float[] vals, float mean, float stddev)
42     {
43         import std.mathspecial : normalDistributionInverse;
44         import std.random : uniform;
45 
46         for(size_t i = 0; i < vals.length; i++)
47         {
48             vals[i] = normalDistributionInverse(uniform(0.0f, 1.0f)) * stddev + mean;
49         }
50     }
51 
52     size_t fanIn(size_t[] shape)
53     {
54         if(shape.length == 2)
55         {
56             return shape[1];
57         }
58         else if(shape.length == 4)
59         {
60             return shape[1] * shape[2] * shape[3];
61         }
62         else
63         {
64             import std.conv : to;
65             throw new Exception("Cannot compute fan-in for a parameter tensor of rank " ~ shape.length.to!string);
66         }
67     }
68 
69     size_t fanOut(size_t[] shape)
70     {
71         if(shape.length == 2)
72         {
73             return shape[0];
74         }
75         else if(shape.length == 4)
76         {
77             return shape[0] * shape[2] * shape[3];
78         }
79         else
80         {
81             import std.conv : to;
82             throw new Exception("Cannot compute fan-out for a parameter tensor of rank " ~ shape.length.to!string);
83         }
84     }
85 }
86 
87 /**
88     Encapsulates information about network parameters.
89 
90     This can be used to keep track of per-parameter loss functions (e.g., weight decay), and also projection functions
91     that can be applied using constrained optimisation methods.
92 */
struct Parameter
{
    ///A "variable" operation holding the parameter's value.
    Operation symbol;

    ///Used for applying loss terms to this parameter (e.g., weight decay); may be null when no term applies.
    Operation loss;

    ///A projection operation that can enforce some constraint (e.g., a norm bound); may be null.
    Projection projection;
}
104 
105 /**
106     Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a constant
107     value.
108 
109     Params:
110         val = The constant value to be used for initialisation.
111 
112     Returns:
113         The constructed $(D ParamInitializer).
114 */
ParamInitializer constantInit(float val)
{
    //Capture the constant and slice-assign it over the parameter's buffer.
    return delegate(Operation param)
    {
        auto buf = param.value.as!float;
        buf[] = val;
    };
}
124 
125 /**
126     Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a different
127     sample from a uniform distribution.
128 
129     Params:
130         minval = The lower bound of the uniform distribution.
131         maxval = The upper bound of the uniform distribution.
132     
133     Returns:
134         The constructed $(D ParamInitializer).
135 */
ParamInitializer uniformInit(float minval, float maxval)
{
    //Capture the bounds and fill the parameter's buffer with uniform samples.
    return delegate(Operation param)
    {
        fillUniform(param.value.as!float, minval, maxval);
    };
}
145 
146 /**
147     Creates a parameter initialiser that sets the initial value of each element in a parameter tensor to a different
148     sample from a Gaussian distribution.
149 
150     Params:
151         mean = The mean of the Gaussian distribution.
152         stddev = The standard deviation of the Gaussian distribution.
153     
154     Returns:
155         The constructed $(D ParamInitializer).
156 */
ParamInitializer gaussianInit(float mean, float stddev)
{
    //Capture the distribution parameters and fill the buffer with Gaussian samples.
    return delegate(Operation param)
    {
        fillGaussian(param.value.as!float, mean, stddev);
    };
}
166 
167 /**
168     Creates a parameter initialiser that uses the method of Glorot and Bengio (2010).
169 
170     This technique initialises a parameter with samples from the following uniform distribution:
171 
    U(-sqrt(6 / (fan_in + fan_out)), sqrt(6 / (fan_in + fan_out)))
173 
174     For more details, see Glorot and Bengio (2010): http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
175 
176     Returns:
        The constructed $(D ParamInitializer).
178 */
ParamInitializer glorotUniformInit()
{
    void init(Operation param)
    {
        auto fin = param.shape.fanIn;
        auto fout = param.shape.fanOut;

        //Glorot/Xavier limit: sqrt(6 / (fan_in + fan_out)); sample U(-limit, limit).
        auto limit = sqrt(6.0f / (fin + fout));

        fillUniform(param.value.as!float, -limit, limit);
    }

    return &init;
}
189 
190 /**
191     Creates a parameter initialiser that uses the method of Glorot and Bengio (2010).
192 
193     This technique initialises a parameter with samples from the following Gaussian distribution:
194 
195     μ = 0
196     σ = sqrt(2 / (fan_in + fan_out))
197 
198     For more details, see Glorot and Bengio (2010): http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
199 
200     Returns:
        The constructed $(D ParamInitializer).
202 */
ParamInitializer glorotGaussianInit()
{
    void init(Operation param)
    {
        //Glorot/Xavier standard deviation: sqrt(2 / (fan_in + fan_out)), zero mean.
        auto stddev = sqrt(2.0f / (param.shape.fanIn + param.shape.fanOut));

        fillGaussian(param.value.as!float, 0.0f, stddev);
    }

    return &init;
}
212 
213 /**
214     Creates a parameter initialiser that uses the method of He et al. (2015).
215 
216     This technique initialises a parameter with samples from the following uniform distribution:
217 
    U(-sqrt(6 / fan_in), sqrt(6 / fan_in))
219 
220     For more details, see He et al. (2015): http://arxiv.org/abs/1502.01852
221 
222     Returns:
        The constructed $(D ParamInitializer).
224 */  
ParamInitializer heUniformInit()
{
    void init(Operation param)
    {
        //He et al. (2015) sample from the symmetric interval
        //U(-sqrt(6 / fan_in), sqrt(6 / fan_in)). The previous lower bound of 0
        //produced a non-zero-mean initialisation, inconsistent with both the
        //documented scheme and glorotUniformInit above.
        auto bound = sqrt(6.0f / param.shape.fanIn);

        fillUniform(param.value.as!float, -bound, bound);
    }

    return &init;
}
234 
235 /**
236     Creates a parameter initialiser that uses the method of He et al. (2015).
237 
238     This technique initialises a parameter with samples from the following Gaussian distribution:
239 
240     μ = 0
241     σ = sqrt(2 / fan_in)
242 
243     For more details, see He et al. (2015): http://arxiv.org/abs/1502.01852
244 
245     Returns:
        The constructed $(D ParamInitializer).
247 */  
ParamInitializer heGaussianInit()
{
    void init(Operation param)
    {
        //He et al. (2015) standard deviation: sqrt(2 / fan_in), zero mean.
        auto stddev = sqrt(2.0f / param.shape.fanIn);

        fillGaussian(param.value.as!float, 0.0f, stddev);
    }

    return &init;
}