Outline the overall design
* This closes #2 and closes #18
* I kept running into issues with the libloading crate (referencing #15)
* I'm currently playing around with NDArray (referencing #20), though the way the Index trait works with ArrayBase is rather complicated (a brief sketch follows this list)
* See the changelog for more information
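For context only — not part of this commit — here is a brief sketch of the ArrayBase indexing mentioned above. It assumes the `ndarray` version pinned in the Cargo manifest below and shows just the concrete `[[row, column]]` form; writing code that is generic over the storage, dimension, and index types is where the bounds get complicated.

```rust
// Sketch: concrete indexing into an ndarray matrix. Indexing resolves through
// ndarray's `Index`/`IndexMut` impls on `ArrayBase`, with a fixed-size array of
// indices as the usual concrete index type.
extern crate ndarray;

use ndarray::Array2;

fn main() {
    // A 2 x 3 matrix of zeros.
    let mut matrix = Array2::<f32>::zeros((2, 3));

    // `[[row, column]]` indexing, both for writing and reading.
    matrix[[0, 1]] = 3.5;
    assert_eq!(matrix[[0, 1]], 3.5);
}
```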
jonysy committed Mar 16, 2017
1 parent 5466f37 commit 31a2547
Showing 48 changed files with 1,662 additions and 885 deletions.
6 changes: 3 additions & 3 deletions Cargo.toml
@@ -13,8 +13,8 @@ license = "MIT/Apache-2.0"
enum_primitive = "0.1.1"
lazy_static = "0.2.4"
libloading = "0.3.2"
log = "0.3.6"
log = "0.3.7"
ndarray = "0.8.0"

[dev-dependencies]
compiletest_rs = "0.2.5"
# [dev-dependencies]
# compiletest_rs = "0.2.5"
31 changes: 31 additions & 0 deletions README.md
@@ -24,6 +24,37 @@ a few necessary additions/modifications.
> available in the Parenchyma project, as the different approaches that are currently being
> considered may prove to be better than the original approach.
## Example

Parenchyma comes without any extension packages. The following example therefore assumes that
you have added both `parenchyma` and the Parenchyma extension package `parenchyma-nn` to your
Cargo manifest.

```rust
extern crate parenchyma as pa;
extern crate parenchyma_nn as pann;

use pa::{Backend, Native, OpenCL, SharedTensor};

fn main() {
    let ref native: Backend = Backend::new::<Native>().unwrap();
    // Initialize an OpenCL or CUDA backend packaged with the NN extension.
    let ref backend = pann::Backend::new::<OpenCL>().unwrap();

    // Initialize two `SharedTensor`s.
    let shape = 1;
    let ref x = SharedTensor::<f32>::with(backend, shape, vec![3.5]).unwrap();
    let ref mut result = SharedTensor::<f32>::new(shape);

    // Run the sigmoid operation, provided by the NN extension, on
    // your OpenCL/CUDA enabled GPU (or CPU, which is possible through OpenCL)
    backend.sigmoid(x, result).unwrap();

    // Print the result: `[0.97068775] shape=[1], strides=[1]`
    println!("{:?}", result.read(native).unwrap().as_native().unwrap());
}
```

## License

Dual licensed under
139 changes: 64 additions & 75 deletions benches/shared_tensor.rs
@@ -1,94 +1,83 @@
#![feature(test)]

extern crate parenchyma;
extern crate parenchyma_opencl;
extern crate test;

use parenchyma::{Backend, Device, Framework, Native, SharedTensor};
use parenchyma::DeviceKind::{Cpu, Gpu};
use parenchyma_opencl::{OpenCL, OpenCLDevice};
use parenchyma::{Backend, Native, OpenCL, SharedTensor};
use test::Bencher;

fn native_backend() -> Backend<Native> {
Backend::default().unwrap()
fn native_backend() -> Backend {
Backend::new::<Native>().unwrap()
}

fn opencl_backend() -> Backend<OpenCL> {
Backend::default().unwrap()
fn opencl_backend() -> Backend {
Backend::new::<OpenCL>().unwrap()
}

fn sync_back_and_forth<A, B>(b: &mut Bencher, backend1: Backend<A>, backend2: Backend<B>, s: usize)
where A: Framework, B: Framework,
{
let ref dev1 = backend1.devices()[0];
let ref dev2 = backend2.devices()[0];
fn sync_back_and_forth(b: &mut Bencher, backend1: Backend, backend2: Backend, s: usize) {

let mem = &mut SharedTensor::<u8>::from(vec![s]);
let mem = &mut SharedTensor::<u8>::new(s);

// initialize and warm-up
let _ = mem.write_only(dev2).unwrap();
let _ = mem.read_write(dev1).unwrap();
let _ = mem.read_write(dev2).unwrap();
let _ = mem.write(&backend2).unwrap();
let _ = mem.read_write(&backend1).unwrap();
let _ = mem.read_write(&backend2).unwrap();

b.bytes = s as u64 * 2; // we do two transfers per iteration

b.iter(|| {
let _ = mem.read_write(dev1).unwrap();
let _ = mem.read_write(dev2).unwrap();
let _ = mem.read_write(&backend1).unwrap();
let _ = mem.read_write(&backend2).unwrap();
});
}

fn unidirectional_sync<A, B>(b: &mut Bencher, src: Backend<A>, dst: Backend<B>, size: usize)
where A: Framework, B: Framework,
{
let ref src_dev = src.devices()[0];
let ref dst_dev = dst.devices()[0];
fn unidirectional_sync(b: &mut Bencher, src: Backend, dst: Backend, size: usize) {

let mem = &mut SharedTensor::<u8>::from(vec![size]);
let mem = &mut SharedTensor::<u8>::new(size);

// initialize and warm-up
let _ = mem.write_only(src_dev).unwrap();
let _ = mem.read(dst_dev).unwrap();
let _ = mem.write(&src).unwrap();
let _ = mem.read(&dst).unwrap();

b.bytes = size as u64;

b.iter(|| {
let _ = mem.write_only(src_dev).unwrap();
let _ = mem.read(dst_dev).unwrap();
let _ = mem.write(&src).unwrap();
let _ = mem.read(&dst).unwrap();
});
}

#[inline(never)]
fn bench_256_alloc_1mb_opencl_profile(b: &mut Bencher, device: &OpenCLDevice, size: usize) {
b.iter(||
for _ in 0..256 {
let _ = device.allocate_memory(size).unwrap(); });
}
// #[inline(never)]
// fn bench_256_alloc_1mb_opencl_profile(b: &mut Bencher, device: &OpenCLDevice, size: usize) {
// b.iter(||
// for _ in 0..256 {
// let _ = device.allocate_memory(size).unwrap(); });
// }

// #[bench]
// fn bench_256_alloc_1mb_opencl_cpu(b: &mut Bencher) {
// let opencl_backend = opencl_backend();
// let cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();
// // #[bench]
// // fn bench_256_alloc_1mb_opencl_cpu(b: &mut Bencher) {
// // let opencl_backend = opencl_backend();
// // let cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();

// bench_256_alloc_1mb_opencl_profile(b, cpu, 1_048_576);
// }
// // bench_256_alloc_1mb_opencl_profile(b, cpu, 1_048_576);
// // }

// // #[bench]
// // fn bench_256_alloc_1mb_opencl_gpu(b: &mut Bencher) {
// // let opencl_backend = opencl_backend();
// // let gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();

// // bench_256_alloc_1mb_opencl_profile(b, gpu, 1_048_576);
// // }

// #[bench]
// fn bench_256_alloc_1mb_opencl_gpu(b: &mut Bencher) {
// fn bench_256_alloc_1mb_opencl(b: &mut Bencher) {
// let opencl_backend = opencl_backend();
// let gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();
// let ref d = opencl_backend.devices()[0];

// bench_256_alloc_1mb_opencl_profile(b, gpu, 1_048_576);
// bench_256_alloc_1mb_opencl_profile(b, d, 1_048_576);
// }

#[bench]
fn bench_256_alloc_1mb_opencl(b: &mut Bencher) {
let opencl_backend = opencl_backend();
let ref d = opencl_backend.devices()[0];

bench_256_alloc_1mb_opencl_profile(b, d, 1_048_576);
}

#[bench]
fn bench_sync_1kb_native_opencl_back_and_forth(b: &mut Bencher) {
sync_back_and_forth(b, opencl_backend(), native_backend(), 1024);
@@ -134,33 +123,33 @@ fn bench_sync_128mb_opencl_to_native(b: &mut Bencher) {
unidirectional_sync(b, opencl_backend(), native_backend(), 128 * 1_048_576);
}

// fn bench_shared_tensor_access_time_first_(b: &mut Bencher, device: &OpenCLDevice) {
// // fn bench_shared_tensor_access_time_first_(b: &mut Bencher, device: &OpenCLDevice) {

// let native_backend = native_backend();
// let ref native_cpu = native_backend.devices()[0];
// // let native_backend = native_backend();
// // let ref native_cpu = native_backend.devices()[0];

// let mut x = SharedTensor::<f32>::from(vec![128]);
// x.write_only(native_cpu).unwrap();
// x.write_only(device).unwrap();
// x.read(native_cpu).unwrap();
// // let mut x = SharedTensor::<f32>::from(vec![128]);
// // x.write_only(native_cpu).unwrap();
// // x.write_only(device).unwrap();
// // x.read(native_cpu).unwrap();

// b.iter(|| {
// let _ = x.read(native_cpu).unwrap();
// })
// }
// // b.iter(|| {
// // let _ = x.read(native_cpu).unwrap();
// // })
// // }

// #[bench]
// fn bench_shared_tensor_access_time_first_cpu(b: &mut Bencher) {
// let opencl_backend = opencl_backend();
// let opencl_cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();
// // #[bench]
// // fn bench_shared_tensor_access_time_first_cpu(b: &mut Bencher) {
// // let opencl_backend = opencl_backend();
// // let opencl_cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();

// bench_shared_tensor_access_time_first_(b, opencl_cpu);
// }
// // bench_shared_tensor_access_time_first_(b, opencl_cpu);
// // }

// #[bench]
// fn bench_shared_tensor_access_time_first_gpu(b: &mut Bencher) {
// let opencl_backend = opencl_backend();
// let opencl_gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();
// // #[bench]
// // fn bench_shared_tensor_access_time_first_gpu(b: &mut Bencher) {
// // let opencl_backend = opencl_backend();
// // let opencl_gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();

// bench_shared_tensor_access_time_first_(b, opencl_gpu);
// }
// // bench_shared_tensor_access_time_first_(b, opencl_gpu);
// // }
106 changes: 51 additions & 55 deletions src/backend.rs
@@ -1,5 +1,7 @@
use super::{Context, ComputeDevice, Framework};
use super::error::Result;
use std::ops;
use super::{BoxContext, Context, Device, Error, ExtensionPackage, Framework, Hardware, Unextended};
use super::Result;
use utility::{self, TryDefault};

/// The heart of Parenchyma - provides an interface for running parallel computations on one or
/// more devices.
@@ -15,76 +17,70 @@ use super::error::Result;
/// the framework to the [`Backend::new`](#method.new) associated function, or by simply
/// calling [`Backend::default`](#method.default). The framework determines which devices are
/// available and how parallel kernel functions can be executed.
///
/// ## Examples
///
/// ```rust
/// use parenchyma::{Backend, Framework, Native};
///
///
/// // Construct a new framework.
/// let framework = Native::new().expect("failed to initialize framework");
///
/// // Available devices can be obtained through the framework.
/// let selection = framework.available_devices.clone();
///
/// // Create a ready to go backend from the framework.
/// let backend = Backend::new(framework, selection).expect("failed to construct backend");
///
/// // ..
/// ```
///
/// Construct a default backend:
///
/// ```rust
/// use parenchyma::{Backend, Native};
///
/// // A default native backend.
/// let backend: Backend<Native> = Backend::default().expect("something went wrong!");
///
/// // ..
/// ```
#[derive(Debug)]
pub struct Backend {
pub struct Backend<X = Unextended> {
/// The initialized framework.
pub framework: Box<Framework>, /* &'static str,*/
///
/// The Framework implementation such as OpenCL, CUDA, etc. defines, which should be used and
/// determines which hardwares will be available and how parallel kernel functions can be
/// executed.
framework: Box<Framework>,
/// The context associated with the `framework`.
///
/// Contexts are the heart of both OpenCL and CUDA applications. See the [`Context`] trait for
/// more information.
/// Contexts are the heart of both OpenCL and CUDA applications. Contexts are created from one
/// or more devices that are capable of executing methods and synchronizing memory. See
/// the [`Context`] trait for more information.
///
/// [`Context`]: (./trait.Context.html)
pub context: Box<Context>,
/// The chosen device
///
/// The default active device is the first device found (index = `0`).
active: usize,
context: Box<Context<Package = X>>,
}

impl Backend {
impl<X> Backend<X> where X: ExtensionPackage {

/// Constructs a backend using the most potent framework given the underlying hardware.
pub fn new() -> Backend {
/// Initialize a new backend.
pub fn new<F>() -> Result<Self> where F: BoxContext<X> + Framework + TryDefault<Err = Error> {

unimplemented!()
let framework = Box::new(F::try_default()?);
let selection = framework.available_hardware();
let context = framework.enclose(selection)?;

Ok(Backend { framework: framework, context })
}

/// Attempts to construct a backend from the specified `framework`.
pub fn with<F>(framework: F) -> Result<Backend> where F: Framework {
/// Constructs a backend from the specified `framework` and `selection`.
pub fn with<F>(fwrk: F, selection: Vec<Hardware>) -> Result<Self>
where F: BoxContext<X> + Framework {

let framework = Box::new(fwrk);
let context = framework.enclose(selection)?;

unimplemented!()
Ok(Backend { framework, context })
}

// /// Try all provided `frameworks` in the specified order, choosing the first framework that
// // initializes without failure.
// pub fn try(frameworks: Vec<Box<Framework>>) -> Result<Backend>;
/// Set the device at the specified `index` as the active device.
///
/// Only one device can be the _active_ device - the device in which operations are executed.
pub fn set_active(&mut self, index: usize) -> Result {

self.context.set_active(index)
}
}

impl Backend {
impl<X> ops::Deref for Backend<X> where X: ExtensionPackage {

type Target = X::Extension;

/// Returns the current device.
pub fn compute_device<T>(&self) -> &ComputeDevice<T> {
fn deref<'a>(&'a self) -> &'a X::Extension {

unimplemented!()
self.context.extension()
}
}
}

impl<X> utility::Has<Device> for Backend<X> where X: ExtensionPackage {

fn get_ref(&self) -> &Device {
self.context.active_device()
}
}

// pub trait AsBackend { }
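
To tie the pieces above together, here is a minimal usage sketch of the reworked `Backend`. It combines the constructor and `set_active` from this diff with the `SharedTensor` calls shown in the README example earlier in the commit; treat it as an illustration of the intended API rather than a tested program.

```rust
// Sketch: constructing an unextended (`X = Unextended`) native backend via the
// new `Backend::new::<F>()` path, selecting the active device, and using the
// backend itself as the synchronization target for a tensor.
extern crate parenchyma as pa;

use pa::{Backend, Native, SharedTensor};

fn main() {
    // Builds a context from the framework's default hardware selection.
    let mut backend: Backend = Backend::new::<Native>().unwrap();

    // Index 0 is already the default active device; shown here for completeness.
    backend.set_active(0).unwrap();

    // Allocate a one-element tensor against the backend and read it back.
    let tensor = SharedTensor::<f32>::with(&backend, 1, vec![1.0]).unwrap();
    println!("{:?}", tensor.read(&backend).unwrap().as_native().unwrap());
}
```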
5 changes: 4 additions & 1 deletion src/changelog.rs
@@ -6,7 +6,10 @@
/// * Partially implemented a CUDA API wrapper
/// * Partially implemented native support
/// * Worked on a fallback mechanism (see issue#15)
/// * Chose a tensor lib (ndarray)
/// * Use a tensor lib (ndarray) as the underlying native memory representation
/// * No longer requires framework related feature flags (from the original Collenchyma project)
/// * Implemented auto-sync
/// * Add `Bundle` logic
/// * Removed `IBinary`/`HashMap` technique. Use structs instead
/// * No longer requires backends parameterized by a framework
pub mod r0_0_3 {}
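
The "auto-sync" entry above is the behaviour exercised by the updated benchmarks: a tensor written through one backend can later be read through another without an explicit synchronization call. A minimal sketch, assuming an OpenCL device is available:

```rust
// Sketch: the `write`/`read` calls mirror the updated benchmarks; any required
// host<->device transfer happens inside `read` rather than via an explicit
// sync step.
extern crate parenchyma as pa;

use pa::{Backend, Native, OpenCL, SharedTensor};

fn main() {
    let native: Backend = Backend::new::<Native>().unwrap();
    let opencl: Backend = Backend::new::<OpenCL>().unwrap();

    let mut tensor = SharedTensor::<u8>::new(1024);

    // Write on the OpenCL backend...
    let _ = tensor.write(&opencl).unwrap();
    // ...then read on the native backend; the data follows automatically.
    let _ = tensor.read(&native).unwrap();
}
```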