#!/usr/bin/env dub
/+ dub.sdl:
dependency "dopt" path=".."
dependency "progress-d" version="~>1.0.0"
+/
module cifar10;

import dopt.core;
import dopt.nnet;
import dopt.online;
import progress;

/*
	This example trains a VGG19-style network on the CIFAR-10 dataset of tiny images.

	VGG networks are fairly easy to understand, compared to some of the more recently presented models like GoogLeNet.
	See ``Very Deep Convolutional Networks for Large-Scale Image Recognition'' by Simonyan and Zisserman for more
	details. This example uses the dopt.nnet.models package to make defining a VGG model very easy.

	The CIFAR-10 dataset contains 60,000 32x32 pixel colour images. Each of these images belongs to one of 10 classes.
	In the standard setting, 50,000 of these images are used for training a model, and the other 10,000 are used for
	evaluating how well the model works.
*/

void main(string[] args)
{
	import std.format : format;
	import std.stdio : stderr, writeln;

	if(args.length != 2)
	{
		stderr.writeln("Usage: cifar10.d <data directory>");
		return;
	}

	/*
		Loads the CIFAR-10 dataset. Download it in the binary format from https://www.cs.toronto.edu/~kriz/cifar.html

		This also wraps the Dataset in an ImageTransformer, which will procedurally generate random crops and
		horizontal flips of the training images---a popular form of data augmentation for image datasets.
	*/
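	/*
		For reference, the binary archive unpacks into data_batch_1.bin, ..., data_batch_5.bin and test_batch.bin,
		where each record is one label byte followed by 3,072 pixel bytes (a 32x32 red plane, then green, then
		blue). loadCIFAR10 presumably looks for these files in the directory passed on the command line.
	*/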
	writeln("Loading data...");
	auto data = loadCIFAR10(args[1]);
	data.train = new ImageTransformer(data.train, 4, 4, true, false);
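	//Here the two 4s are taken to be the maximum horizontal and vertical crop offsets, and the two flags to
	//enable random horizontal but not vertical flips---an interpretation of the arguments rather than a
	//documented fact; vertical flips rarely help for natural images.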

	/*
	Now we create two variable nodes. ``features'' is used to represent a minibatch of input images, and ``labels''
	will be used to represent the label corresponding to each of those images.
	*/
	writeln("Constructing network graph...");
	size_t batchSize = 100;
	auto features = float32([batchSize, 3, 32, 32]);
	auto labels = float32([batchSize, 10]);
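	//A quick sanity check: these shapes mean each minibatch holds batchSize RGB images of 32x32 pixels and
	//batchSize one-hot label vectors of length 10.
	assert(features.volume == batchSize * 3 * 32 * 32);
	assert(labels.volume == batchSize * 10);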

	/*
	There are a few predefined models in dopt.nnet.models, such as vgg19. We provide it with the variable we want to
	use as the input to this model, tell it what sizes the fully connected layers should be, and then put a softmax
	activation function on the end. The softmax function is the standard activation function when one is performing
	a classification task.
	*/
	auto preds = vgg19(features, [512, 512])
				.dense(10)
				.softmax();
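	/*
	For reference, softmax turns the final 10 activations x into class probabilities:

		softmax(x)_i = exp(x_i) / sum_j exp(x_j)

	so each image ends up with 10 non-negative scores that sum to one.
	*/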

	//The DAGNetwork class takes the inputs and outputs of a network, and collects the trainable parameters,
	//regularisation terms, and parameter projections of every layer in one place.
	auto network = new DAGNetwork([features], [preds]);

	/*
	Layer objects have both ``output'' and ``trainOutput'' fields, because operations like dropout perform different
	computations at train and test time. Therefore, we construct two different loss symbols: one for optimising, and
	one for evaluating.
	*/
	auto lossSym = crossEntropy(preds.trainOutput, labels) + network.paramLoss;
	auto testLossSym = crossEntropy(preds.output, labels) + network.paramLoss;
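	/*
	crossEntropy is the usual categorical cross-entropy between the one-hot labels y and the predicted
	probabilities p for each image:

		L = -sum_k y_k * log(p_k)

	and network.paramLoss adds whatever regularisation penalty the network's parameters contribute.
	*/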

	/*
	Now we set up an optimiser. Adam-style methods (the AMSGrad variant is used below) converge quickly, which
	makes them good for proofs of concept, but the final model often performs slightly worse than one trained
	with SGD+momentum.
	*/
	writeln("Creating optimiser...");
	auto learningRate = float32([], [0.0001f]);
	auto updater = amsgrad([lossSym, preds.trainOutput], network.params, network.paramProj, learningRate);
	auto testPlan = compile([testLossSym, preds.output]);
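	/*
	For reference, the AMSGrad update for a parameter w with gradient g is, roughly,

		m = b1 * m + (1 - b1) * g
		v = b2 * v + (1 - b2) * g^2
		vhat = max(vhat, v)
		w = w - learningRate * m / (sqrt(vhat) + eps)

	The test-time graph is compiled once here so that evaluation can reuse the same plan every epoch.
	*/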

	writeln("Training...");

	float[] fs = new float[features.volume];
	float[] ls = new float[labels.volume];
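	//These host-side buffers are sized for exactly one minibatch and reused on every iteration: getBatch fills
	//them in place, and buffer() is used below to feed them into the graph.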

	//Iterate for 140 epochs of training!
	foreach(e; 0 .. 140)
	{
		float trainLoss = 0;
		float testLoss = 0;
		float trainAcc = 0;
		float testAcc = 0;
		float trainNum = 0;
		float testNum = 0;

		//Decreasing the learning rate after a while often results in better performance.
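		//The resulting step schedule: 1e-4 for the first 100 epochs, 1e-5 for epochs 101-120, and 1e-6 after that.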
		if(e == 100)
		{
			learningRate.value.set([0.00001f]);
		}
		else if(e == 120)
		{
			learningRate.value.set([0.000001f]);
		}

		data.train.restart();
		data.test.restart();

		auto trainProgress = new Progress(data.train.length / batchSize);

		while(!data.train.finished())
		{
			data.train.getBatch([fs, ls]);

			//Make an update to the model parameters using the minibatch of training data
			auto res = updater([
				features: buffer(fs),
				labels: buffer(ls)
			]);
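			//res[0] holds the minibatch loss and res[1] the training-mode predictions; the loss is scaled by
			//batchSize so that the running averages computed below are per example.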
			trainLoss += res[0].get!float[0] * batchSize;
			trainAcc += computeAccuracy(ls, res[1].get!float);
			trainNum += batchSize;

			float loss = trainLoss / trainNum;
			float acc = trainAcc / trainNum;

			trainProgress.title = format("Epoch: %03d  Loss: %02.4f  Acc: %.4f", e + 1, loss, acc);
			trainProgress.next();
		}

		writeln();

		auto testProgress = new Progress(data.test.length / batchSize);

		while(!data.test.finished())
		{
			data.test.getBatch([fs, ls]);

			//Make some predictions
			auto res = testPlan.execute([
				features: buffer(fs),
				labels: buffer(ls)
			]);

			testLoss += res[0].get!float[0] * batchSize;
			testAcc += computeAccuracy(ls, res[1].get!float);
			testNum += batchSize;

			float loss = testLoss / testNum;
			float acc = testAcc / testNum;

			testProgress.title = format("            Loss: %02.4f  Acc: %.4f", loss, acc);
			testProgress.next();
		}

		writeln();
		writeln();
	}
}

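/*
	Counts how many predictions in a minibatch match the one-hot labels. Both arrays are laid out as
	consecutive groups of 10 class scores, and the predicted class is the index of the largest score in each
	group. Note that this returns the number of correct predictions rather than a fraction---the caller
	divides by the number of examples it has processed.
*/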
float computeAccuracy(float[] ls, float[] preds)
{
	import std.algorithm : maxIndex;
	import std.range : chunks, zip;

	float correct = 0;

	foreach(p, t; zip(preds.chunks(10), ls.chunks(10)))
	{
		if(p.maxIndex == t.maxIndex)
		{
			correct++;
		}
	}

	return correct;
}
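
//A small illustrative unittest (only run when the module is compiled with -unittest): two one-hot targets
//that match the predictions exactly should count as two correct classifications.
unittest
{
	float[] targets = new float[20];
	targets[] = 0;
	targets[3] = 1;
	targets[17] = 1;

	assert(computeAccuracy(targets, targets.dup) == 2);
}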