/**
    Contains an implementation of the regularisation techniques presented in Gouk et al. (2018).

    Gouk, H., Frank, E., Pfahringer, B., & Cree, M. (2018). Regularisation of Neural Networks by Enforcing Lipschitz
    Continuity. arXiv preprint arXiv:1804.04368.

    Authors: Henry Gouk
*/
module dopt.nnet.lipschitz;

import std.exception;

import dopt.core;
import dopt.online;

/**
    Returns a projection function that can be used to constrain the operator norm of a weight matrix.

    The operator norm induced by the vector p-norm is used.

    Params:
        maxnorm = A scalar value indicating the maximum allowable operator norm.
        p = The vector p-norm that will induce the operator norm.

    Returns:
        A projection function that can be used with the online optimisation algorithms.
*/
Projection projMatrix(Operation maxnorm, float p = 2)
{
    Operation proj(Operation param)
    {
        auto norm = matrixNorm(param, p);

        return maxNorm(param, norm, maxnorm);
    }

    return &proj;
}

/**
    Computes the operator norm of a matrix induced by the vector p-norm, for p in {1, 2, infinity}.
*/
Operation matrixNorm(Operation param, float p)
{
    enforce(param.rank == 2, "This function only operates on matrices");

    if(p == 1.0f)
    {
        /*
            The matrix norm induced by the L1 vector norm is max_j sum_i abs(a_ij) -- the maximum absolute column
            sum. We are doing a row sum here because the weight matrices in dense and convolutional layers are
            transposed before being multiplied with the input features.
        */
        return param.abs().sum([1]).maxElement();
    }
    else if(p == 2.0f)
    {
        /*
            The matrix norm induced by the L2 vector norm is the spectral norm: the largest singular value of the
            matrix. This is estimated with a small, fixed number of power iteration steps on (W^T)W. The iterate
            has param.shape[1] elements so that the shapes in the matrix products line up.
        */
        auto x = uniformSample([param.shape[1], 1]) * 2.0f - 1.0f;

        foreach(i; 0 .. 2)
        {
            x = matmul(param.transpose([1, 0]), matmul(param, x));
        }

        //Normalise the approximate leading right singular vector, then measure how much the matrix stretches it
        auto v = x / sqrt(sum(x * x));
        auto y = matmul(param, v);

        return sqrt(sum(y * y));
    }
    else if(p == float.infinity)
    {
        /*
            The matrix norm induced by the L_infinity vector norm is max_i sum_j abs(a_ij) -- the maximum absolute
            row sum. We are doing a column sum here because the weight matrices in dense and convolutional layers
            are transposed before being multiplied with the input features.
        */
        return param.abs().sum([0]).maxElement();
    }
    else
    {
        import std.conv : to;

        throw new Exception("Cannot compute matrix norm for p=" ~ p.to!string);
    }
}
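/*
    Illustrative sketch, not part of the original module: a unittest checking matrixNorm against a matrix whose
    induced norms are known in closed form. Only the exact p = 1 and p = infinity cases are asserted, since the
    p = 2 estimate relies on randomly initialised power iteration. The tolerance is an arbitrary choice.
*/
unittest
{
    import std.math : abs;

    //For diag(3, 4), the maximum absolute row sum and column sum are both 4
    auto w = float32([2, 2], [3.0f, 0.0f, 0.0f, 4.0f]);

    assert(abs(matrixNorm(w, 1.0f).evaluate().get!float[0] - 4.0f) < 1.0e-4f);
    assert(abs(matrixNorm(w, float.infinity).evaluate().get!float[0] - 4.0f) < 1.0e-4f);
}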
/**
    Returns a projection function that can be used to constrain the operator norm of a convolution operation.

    Params:
        maxnorm = A scalar value indicating the maximum allowable operator norm.
        inShape = The spatial dimensions of the input feature maps.
        stride = The convolution stride.
        padding = The convolution padding.
        p = The vector p-norm that will induce the operator norm.

    Returns:
        A projection function that can be used with the online optimisation algorithms.
*/
Projection projConvParams(Operation maxnorm, size_t[] inShape, size_t[] stride, size_t[] padding, float p = 2.0f)
{
    Operation proj(Operation param)
    {
        auto norm = convParamsNorm(param, inShape, stride, padding, p);

        return maxNorm(param, norm, maxnorm);
    }

    return &proj;
}

/**
    Computes the operator norm of the linear map defined by convolving an input with the given kernel, induced by
    the vector p-norm, for p in {1, 2, infinity}.

    Params:
        param = The convolution kernel.
        inShape = The spatial dimensions of the input feature maps.
        stride = The convolution stride.
        padding = The convolution padding.
        p = The vector p-norm that will induce the operator norm.
        n = The number of power iteration steps used when p is 2.
*/
Operation convParamsNorm(Operation param, size_t[] inShape, size_t[] stride, size_t[] padding, float p = 2.0f,
    size_t n = 2)
{
    if(p == 2.0f)
    {
        //Estimate the spectral norm of the convolution operator with n steps of power iteration
        auto x = uniformSample([cast(size_t)1, param.shape[1]] ~ inShape) * 2.0f - 1.0f;

        foreach(i; 0 .. n)
        {
            x = x
                .convolution(param, padding, stride)
                .convolutionTranspose(param, padding, stride);
        }

        auto v = x / sqrt(sum(x * x));
        auto y = convolution(v, param, padding, stride);

        return sqrt(sum(y * y));
    }
    else if(p == 1.0f || p == float.infinity)
    {
        //Flattening the kernel into a matrix and taking the matrix norm is equivalent, but only for p in {1, infinity}
        if(param.rank != 2)
        {
            param = param.reshape([param.shape[0], param.volume / param.shape[0]]);
        }

        return matrixNorm(param, p);
    }
    else
    {
        import std.conv : to;

        throw new Exception("Cannot compute convolution params norm for p=" ~ p.to!string);
    }
}

unittest
{
    //Smoke test: estimate the spectral norm of a 3x3 kernel applied to 200x200 feature maps
    auto k = float32([1, 1, 3, 3], [
        1, 2, 3, 4, 5, 6, 7, 8, 9
    ]);

    auto norm = convParamsNorm(k, [200, 200], [1, 1], [1, 1], 2.0f);

    import std.stdio : writeln;
    writeln(norm.evaluate().get!float[0]);
}

/**
    Performs a projection of param such that the norm of the new value will be less than or equal to maxval.
*/
Operation maxNorm(Operation param, Operation norm, Operation maxval)
{
    return param * (1.0f / max(float32([], [1.0f]), norm / maxval));
}
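/*
    Illustrative sketch, not part of the original module: a unittest demonstrating the maxNorm projection. With the
    exact L1-induced norm the behaviour is deterministic: a matrix with norm 4 projected onto a budget of 2 should
    be scaled by 0.5. The tolerance is an arbitrary choice.
*/
unittest
{
    import std.math : abs;

    auto w = float32([2, 2], [3.0f, 0.0f, 0.0f, 4.0f]);
    auto budget = float32([], [2.0f]);

    //matrixNorm(w, 1) is 4, which exceeds the budget, so the projection rescales w by 2/4
    auto projected = maxNorm(w, matrixNorm(w, 1.0f), budget);
    auto newNorm = matrixNorm(projected, 1.0f).evaluate().get!float[0];

    assert(abs(newNorm - 2.0f) < 1.0e-4f);
}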