net.cpp
/*
 * net.cpp - Vanilla Neural Network
 * Implements a basic multilayer perceptron with backpropagation powered by Milligrad.
 * Nodes use He initialisation for their weights and the tanh activation on non-output layers.
 * Training performs mini-batch SGD on the mean squared error loss.
 * Benson Zhang
 */
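
// Forward rule per layer: a = tanh(W x + b) on hidden layers; the output layer omits the activation.
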
#include <vector>
#include <cmath>
#include <algorithm>
#include <random>
#include <iostream>
#include <memory>
#include "milligrad.hpp"
#include "net.hpp"

using VarPtr = std::shared_ptr<Var>;

Net::Node::Node(int inputs, std::mt19937& rng) {
    // He initialisation: weights (and bias) drawn from a normal with mean 0 and variance 2 / fan_in
    std::normal_distribution<double> dist(0.0, std::sqrt(2.0 / inputs));
    for (int i = 0; i < inputs; ++i) {
        w.emplace_back(std::make_shared<Var>(dist(rng)));
    }
    b = std::make_shared<Var>(dist(rng));
}

VarPtr Net::Node::operator()(const std::vector<VarPtr>& x, bool activation) {
    // R(w^T x + b), where R is tanh when activation is requested and the identity otherwise
    VarPtr dp = std::make_shared<Var>(0.0);
    for (int i = 0; i < w.size(); i++) {
        dp = dp + w[i] * x[i];
    }
    dp = dp + b;
    if (activation) return tanh(dp);
    return dp;
}

std::vector<VarPtr> Net::Node::params() {
    std::vector<VarPtr> p = w;
    p.push_back(b);
    return p;
}

Net::Layer::Layer(int inputs, int outputs, std::mt19937& rng) {
    for (int i = 0; i < outputs; ++i) {
        nodes.emplace_back(Node(inputs, rng));
    }
}

std::vector<VarPtr> Net::Layer::operator()(const std::vector<VarPtr>& x, bool activation) {
    std::vector<VarPtr> layer;
    for (auto& n : nodes) {
        layer.emplace_back(n(x, activation));
    }
    return layer;
}

std::vector<VarPtr> Net::Layer::params() {
    std::vector<VarPtr> p;
    for (auto& n : nodes) {
        std::vector<VarPtr> node_params = n.params();
        p.reserve(p.size() + node_params.size());
        p.insert(p.end(), node_params.begin(), node_params.end());
    }
    return p;
}

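// Example: Net(3, {16, 16, 1}) builds a network with 3 inputs, two hidden tanh layers
// of 16 nodes each, and a single linear output node.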
Net::Net(int inputs, std::vector<int> outputs) {
    std::random_device rd;
    rng.seed(rd());
    // outputs holds the size of each layer in order and must contain at least one entry
    layers.emplace_back(Layer(inputs, outputs[0], rng));
    for (int i = 0; i < outputs.size() - 1; i++) {
        layers.emplace_back(Layer(outputs[i], outputs[i+1], rng));
    }
}

std::vector<VarPtr> Net::operator()(std::vector<VarPtr> x) {
    // forward pass
    for (int i = 0; i < layers.size() - 1; ++i) {
        x = layers[i](x, true);
    }
    x = layers[layers.size() - 1](x, false); // don't activate the output layer
    return x;
}

std::vector<VarPtr> Net::params() {
    std::vector<VarPtr> p;
    for (auto& layer : layers) {
        std::vector<VarPtr> layer_params = layer.params();
        p.reserve(p.size() + layer_params.size());
        p.insert(p.end(), layer_params.begin(), layer_params.end());
    }
    return p;
}

VarPtr mse_loss(const std::vector<VarPtr>& ytrue, const std::vector<VarPtr>& ypred, int batch_size, std::mt19937& rng) {
    // Create a random permutation of the indices of y to simulate drawing a mini-batch
    std::vector<int> perm(ytrue.size());
    for (int i = 0; i < ytrue.size(); ++i) perm[i] = i;
    std::shuffle(std::begin(perm), std::end(perm), rng);
    auto loss = std::make_shared<Var>(0.0);
    for (int i = 0; i < batch_size; ++i) {
        int idx = perm[i];
        loss = loss + pow(ytrue[idx] - ypred[idx], 2);
    }
    return loss / batch_size;
}

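/*
 * One training epoch: forward the entire dataset, compute the MSE over a random
 * mini-batch, zero the gradients, backpropagate, then take a plain SGD step
 * p <- p - lr * dL/dp on every parameter. batch_size == 0 means use the full dataset.
 */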
void train(Net& model, const std::vector<std::vector<VarPtr>>& x, const std::vector<VarPtr>& y, int epochs, double lr, int batch_size) {
    std::mt19937 rng{std::random_device{}()};
    int bs = (batch_size == 0) ? y.size() : batch_size;
    for (int e = 1; e <= epochs; ++e) {
        // forward pass over the full dataset
        std::vector<VarPtr> y_pred;
        for (int i = 0; i < x.size(); ++i) {
            // currently assumes one output node per input
            y_pred.emplace_back(model(x[i])[0]);
        }
        // compute the (stochastic) loss, zero the gradients, backpropagate
        auto loss = mse_loss(y, y_pred, bs, rng);
        model.zero_grad();
        loss->backward();
        // gradient descent updates
        std::vector<VarPtr> parameters = model.params();
        for (auto& p : parameters) {
            p->val = p->val - lr * p->grad;
        }
        if (e % 5 == 0) {
            std::cout << "iteration: " << e << ", loss: " << loss->val << std::endl;
            // print out the current predictions
            std::cout << "Predictions: ";
            for (int i = 0; i < y_pred.size(); i++) {
                std::cout << y_pred[i]->val << " ";
            }
            std::cout << std::endl;
        }
    }
}
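
/*
 * Example driver (a minimal sketch, not part of the original build): shows how the
 * pieces above might be wired together. The NET_EXAMPLE_MAIN guard is an illustrative
 * name so this file still compiles as a plain translation unit; the toy dataset
 * (y = x^3 - x on a few points in [-1, 1]) is made up for demonstration only.
 */
#ifdef NET_EXAMPLE_MAIN
int main() {
    // one input feature, two hidden layers of 8 tanh nodes, one linear output node
    Net model(1, {8, 8, 1});

    // build a small regression dataset
    std::vector<std::vector<VarPtr>> xs;
    std::vector<VarPtr> ys;
    for (int i = 0; i <= 8; ++i) {
        double x = -1.0 + 0.25 * i;
        xs.push_back({std::make_shared<Var>(x)});
        ys.push_back(std::make_shared<Var>(x * x * x - x));
    }

    // 200 epochs of full-batch gradient descent (batch_size == 0 falls back to the full set)
    train(model, xs, ys, 200, 0.05, 0);
    return 0;
}
#endif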