[WIP] Option for using GPU #3

Open · wants to merge 16 commits into master
4 changes: 2 additions & 2 deletions .gitignore
@@ -4,7 +4,7 @@ __pycache__/
 *$py.class
 
 # C extensions
-*.so
+# *.so
 
 # Distribution / packaging
 .Python
@@ -15,7 +15,7 @@ dist/
 downloads/
 eggs/
 .eggs/
-lib/
+# lib/
 lib64/
 parts/
 sdist/
50 changes: 23 additions & 27 deletions data.py
@@ -5,35 +5,31 @@

 np.random.seed(42)
 
-class DataLoader():
-
-    @staticmethod
-    def load_dataset(flatten=False):
-        (X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()
-
-        # normalize x
-        X_train = X_train.astype(float) / 255.
-        X_test = X_test.astype(float) / 255.
-
-        # we reserve the last 10000 training examples for validation
-        X_train, X_val = X_train[:-10000], X_train[-10000:]
-        y_train, y_val = y_train[:-10000], y_train[-10000:]
-
-        if flatten:
-            X_train = X_train.reshape([X_train.shape[0], -1])
-            X_val = X_val.reshape([X_val.shape[0], -1])
-            X_test = X_test.reshape([X_test.shape[0], -1])
-
-        return X_train, y_train, X_val, y_val, X_test, y_test
-
-    @staticmethod
-    def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
-        assert len(inputs) == len(targets)
-        if shuffle:
-            indices = np.random.permutation(len(inputs))
-        for start_idx in trange(0, len(inputs) - batchsize + 1, batchsize):
-            if shuffle:
-                excerpt = indices[start_idx:start_idx + batchsize]
-            else:
-                excerpt = slice(start_idx, start_idx + batchsize)
-            yield inputs[excerpt], targets[excerpt]
+def load_dataset(flatten=False):
+    (X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()
+
+    # normalize x
+    X_train = X_train / 255.
+    X_test = X_test / 255.
+
+    # we reserve the last 10000 training examples for validation
+    X_train, X_val = X_train[:-10000], X_train[-10000:]
+    y_train, y_val = y_train[:-10000], y_train[-10000:]
+
+    if flatten:
+        X_train = X_train.reshape([X_train.shape[0], -1])
+        X_val = X_val.reshape([X_val.shape[0], -1])
+        X_test = X_test.reshape([X_test.shape[0], -1])
+
+    return X_train, y_train, X_val, y_val, X_test, y_test
+
+def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
+    assert len(inputs) == len(targets)
+    if shuffle:
+        indices = np.random.permutation(len(inputs))
+    for start_idx in trange(0, len(inputs) - batchsize + 1, batchsize):
+        if shuffle:
+            excerpt = indices[start_idx:start_idx + batchsize]
+        else:
+            excerpt = slice(start_idx, start_idx + batchsize)
+        yield inputs[excerpt], targets[excerpt]
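For reference, a minimal sketch of how the reworked module-level helpers in data.py are typically consumed; the import path and batch size are illustrative assumptions, not part of this diff.

import numpy as np
from data import load_dataset, iterate_minibatches

# flatten=True reshapes each 28x28 MNIST image into a 784-dim vector
X_train, y_train, X_val, y_val, X_test, y_test = load_dataset(flatten=True)

for x_batch, y_batch in iterate_minibatches(X_train, y_train, batchsize=32, shuffle=True):
    # every yielded batch is full-sized: (32, 784) pixels with a (32,) label vector
    assert x_batch.shape == (32, 784) and y_batch.shape == (32,)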
91 changes: 91 additions & 0 deletions functions.py
@@ -0,0 +1,91 @@
""" FUNCTIONAL API: functions.py
* Purpose: Core functional API for performing computation on CPU/GPU.
* @author Prabhsimran Singh
* @version 2.0 17/10/18
"""
import numpy as np

from ops.cpu_ops import (
cpu_matmul,
cpu_matsum,
cpu_matprod,
cpu_sum,
cpu_prod,
cpu_maximum
)
from ops.numba_ops import (
numba_matmul,
numba_matsum,
numba_matprod,
numba_sum,
numba_prod,
numba_maximum
)

NUM_THREADS = 32

def get_cuda_execution_config(m, n):
gridBlock = (NUM_THREADS, NUM_THREADS)
gridDim = ((n // gridBlock[0]) + 1, (m // gridBlock[1]) + 1)
return gridDim, gridBlock

def matmul(a, b, method='cpu'):
# fall back to cpu if dim inconsistency (numpy handle)
if method == 'cpu' or len(a.shape) != len(b.shape) or len(a.shape) == 1 or len(b.shape) == 1:
return cpu_matmul(a, b)
elif method == 'gpu':
m, n, k = a.shape[0], a.shape[1], b.shape[1]
c = np.zeros(shape=(m, k))
gridDim, gridBlock = get_cuda_execution_config(m, k)
numba_matmul[gridDim, gridBlock](a, b, c, m, n, k)
return c

def matsum(a, b, method='cpu'):
if method == 'cpu' or len(a.shape) != len(b.shape) or len(a.shape) == 1 or len(b.shape) == 1:
return cpu_matsum(a, b)
if method == 'gpu':
m, n = a.shape[0], a.shape[1]
c = np.zeros(shape=(m, n))
gridDim, gridBlock = get_cuda_execution_config(m, n)
numba_matsum[gridDim, gridBlock](a, b, c, m, n)
return c.reshape((m, n))

def matprod(a, b, method='cpu'):
if method == 'cpu' or len(a.shape) != len(b.shape) or len(a.shape) == 1 or len(b.shape) == 1:
return cpu_matprod(a, b)
if method == 'gpu':
m, n = a.shape[0], a.shape[1]
c = np.zeros(shape=(m, n))
gridDim, gridBlock = get_cuda_execution_config(m, n)
numba_matprod[gridDim, gridBlock](a, b, c, m, n)
return c.reshape((m, n))

def add(a, value, method='cpu'):
if method == 'cpu' or len(a.shape) == 1:
return cpu_sum(a, value)
if method == 'gpu':
m, n = a.shape[0], a.shape[1]
c = np.zeros(shape=(m, n))
gridDim, gridBlock = get_cuda_execution_config(m, n)
numba_sum[gridDim, gridBlock](a, value, c, m, n)
return c.reshape((m, n))

def prod(a, value, method='cpu'):
if method == 'cpu' or len(a.shape) == 1:
return cpu_prod(a, value)
if method == 'gpu':
m, n = a.shape[0], a.shape[1]
c = np.zeros(shape=(m, n))
gridDim, gridBlock = get_cuda_execution_config(m, n)
numba_prod[gridDim, gridBlock](a, value, c, m, n)
return c.reshape((m, n))

def maximum(a, value, method='cpu'):
if method == 'cpu' or len(a.shape) == 1:
return cpu_maximum(a, value)
if method == 'gpu':
m, n = a.shape[0], a.shape[1]
c = np.zeros(shape=(m, n))
gridDim, gridBlock = get_cuda_execution_config(m, n)
numba_maximum[gridDim, gridBlock](a, value, c, m, n)
return c
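A usage sketch for the functional API above. The 'gpu' path assumes a CUDA-capable device and the numba kernels from ops/numba_ops.py, which are not part of this excerpt; the shapes and tolerance are illustrative only.

import numpy as np
import functions as F

a = np.random.randn(128, 256)
b = np.random.randn(256, 64)

c_cpu = F.matmul(a, b, method='cpu')   # NumPy fallback path
c_gpu = F.matmul(a, b, method='gpu')   # launches numba_matmul over a 2D grid
print(np.allclose(c_cpu, c_gpu, atol=1e-6))

# execution config for the (128, 64) output: one extra block covers the remainder
gridDim, gridBlock = F.get_cuda_execution_config(128, 64)
print(gridDim, gridBlock)              # (3, 5) (32, 32) with NUM_THREADS = 32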
76 changes: 45 additions & 31 deletions layers.py
@@ -1,6 +1,8 @@
from __future__ import print_function
import numpy as np

import functions as F

np.random.seed(42)

class Layer:
@@ -34,23 +36,26 @@ def backward(self, inputs, gradients, **kwargs):
"""
pass

class Dense(Layer):
class Dense():
""" Dense layer.
A dense layer is a layer which performs a learned affine transformation:
f(x) = <W*x> + b
input shape: [batch, input_units]
output shape: [batch, output units]
"""
def __init__(self, input_units, output_units):

def __init__(self, input_units, output_units, method='cpu'):
self.type = 'dense'
self.method = method

# initialize weights with glorot/xavier uniform initialization
self.weights = np.random.randn(input_units, output_units) * np.sqrt(6. / (input_units + output_units))
# initialize weights with small random numbers. We use xavier initialization
self.weights = F.prod(np.random.randn(input_units, output_units), np.sqrt(2. / (input_units + output_units)), method=self.method)
self.biases = np.zeros(output_units)

def _init_g2(self):
self.g2_weights = np.zeros_like(self.weights)
self.g2_biases = np.zeros_like(self.biases)

def forward(self, inputs):
""" Forward pass of the dense layer.
Perform an affine transformation:
@@ -59,59 +64,68 @@ def forward(self, inputs):
         input shape: [batch, input_units]
         output shape: [batch, output units]
         """
-        return np.dot(inputs, self.weights) + self.biases
+
+        Wx = F.matmul(inputs, self.weights, method=self.method)
+        Z = F.matsum(Wx, self.biases, method=self.method)
+        return Z
 
     def backward(self, inputs, gradients, **kwargs):
         """ Backward pass of the layer.
         Performs a backpropagation step through the layer, with respect to the given input.
         To compute loss gradients w.r.t input, you need to apply chain rule (backprop):
         dL / dx = (dL / dZ) * (dZ / dx)
         """
         lr = kwargs.get('lr', 0.001)
         gamma = kwargs.get('gamma', 0.9)
         epsilon = kwargs.get('epsilon', 1e-7)
         optim = kwargs.get('optim', 'rmsprop')
 
         # dL / dx = dL / dZ * dZ / dx = gradients * W
-        grad_input = np.dot(gradients, self.weights.T)
+        grad_input = F.matmul(gradients, self.weights.T, method=self.method)
         # m -> batch size
         m = inputs.shape[0]
 
         # compute gradient w.r.t. weights and biases
         # dL / dW = dL / dZ * dZ / dW = gradients * inputs
-        grad_weights = np.dot(inputs.T, gradients) / m
+        grad_weights = F.prod(F.matmul(inputs.T, gradients, method=self.method), 1. / m, method=self.method)
         # dL / db = dL / dZ * dZ / db = gradients * 1
-        grad_biases = gradients.sum(axis=0) / m
-
+        grad_biases = F.prod(gradients.sum(axis=0), 1. / m, method=self.method)
         assert grad_weights.shape == self.weights.shape and grad_biases.shape == self.biases.shape
 
-        update_weights = lr * grad_weights
-        update_biases = lr * grad_biases
+        update_weights = F.prod(grad_weights, lr, method=self.method)
+        update_biases = F.prod(grad_biases, lr, method=self.method)
 
         if optim == 'rmsprop':
             if not hasattr(self, 'g2_weights'):
                 self._init_g2()
-            self.g2_weights = (self.g2_weights * gamma) + np.square(grad_weights) * (1 - gamma)
-            self.g2_biases = (self.g2_biases * gamma) + np.square(grad_biases) * (1 - gamma)
-
-            self.weights -= update_weights / (np.sqrt(self.g2_weights) + epsilon)
-            self.biases -= update_biases / (np.sqrt(self.g2_biases) + epsilon)
+            self.g2_weights = F.matsum(F.prod(self.g2_weights, gamma, method=self.method), F.prod(np.square(grad_weights), (1 - gamma), method=self.method), method=self.method)
+            self.g2_biases = F.matsum(F.prod(self.g2_biases, gamma, method=self.method), F.prod(np.square(grad_biases), (1 - gamma), method=self.method), method=self.method)
+            self.weights = F.matsum(self.weights, -F.matprod(update_weights, 1. / np.sqrt(F.add(self.g2_weights, epsilon, method=self.method)), method=self.method), method=self.method)
+            self.biases = F.matsum(self.biases, -F.matprod(update_biases, 1. / np.sqrt(F.add(self.g2_biases, epsilon, method=self.method)), method=self.method), method=self.method)
         elif optim == 'gd':
-            self.weights -= update_weights
-            self.biases -= update_biases
+            self.weights = F.matsum(self.weights, -update_weights, method=self.method)
+            self.biases = F.matsum(self.biases, -update_biases, method=self.method)
 
         # propagate back the gradients of Loss wrt to layer inputs
         # dL / dx
         return grad_input
 
-class ReLU(Layer):
-    """ReLU layer.
-    Simply applies elementwise rectified linear unit to all inputs.
+class ReLU():
+    """ ReLU layer.
+
+    Applies elementwise rectified linear unit to all inputs:
+        f(x) = max(0, x)
 
     input shape: [batch, input_units]
     output shape: [batch, input_units]
     """
 
-    def __init__(self):
+    def __init__(self, method='cpu'):
         self.type = 'relu'
+        self.method = method
 
     def forward(self, inputs):
         """Apply elementwise ReLU to [batch, input_units] matrix"""
-        return np.maximum(0, inputs)
+        return F.maximum(inputs, 0., method=self.method)
 
     def backward(self, inputs, gradients, **kwargs):
         """Compute gradient of loss w.r.t. ReLU input"""
-        grad_relu = inputs > 0
-        return gradients * grad_relu
+        grad_relu = inputs > 0.
+        return F.matprod(gradients, grad_relu, method=self.method)
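To round out the picture, a minimal end-to-end sketch of the Dense and ReLU layers in this diff; the layer sizes, batch, and stand-in loss gradient are assumptions for illustration, and the real training loop lives elsewhere in the repository.

import numpy as np
from layers import Dense, ReLU

method = 'cpu'   # switch to 'gpu' when a CUDA device is available
network = [Dense(784, 128, method=method), ReLU(method=method), Dense(128, 10, method=method)]

# forward pass, keeping each layer's input for use in backward
x = np.random.randn(32, 784)
activations = [x]
for layer in network:
    activations.append(layer.forward(activations[-1]))

# backward pass with a stand-in gradient of the loss w.r.t. the final output
grad = np.random.randn(*activations[-1].shape)
for i in range(len(network) - 1, -1, -1):
    grad = network[i].backward(activations[i], grad, lr=0.001, optim='rmsprop')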