From 31a2547e05bc38a5426d47217934f35ed049aa13 Mon Sep 17 00:00:00 2001
From: Jony <jonysy@users.noreply.github.com>
Date: Thu, 16 Mar 2017 18:16:10 -0400
Subject: [PATCH] Outline the overall design

* This closes #2 and closes #18
* I kept running into issues w/ the libloading crate (referencing #15)
* I'm currently playing around with NDArray (referencing #20), though
the way the Index trait works with ArrayBase is rather complicated
* See the changelog for more information
---
 Cargo.toml                                    |   6 +-
 README.md                                     |  31 ++
 benches/shared_tensor.rs                      | 139 ++++---
 src/backend.rs                                | 106 +++---
 src/changelog.rs                              |   5 +-
 src/context.rs                                |  27 ++
 src/error.rs                                  |   4 +-
 src/extension.rs                              |  80 +++++
 src/framework.rs                              |  32 ++
 src/frameworks/cuda/{sh.rs => foreign.rs}     |   0
 src/frameworks/cuda/{hl => high}/error.rs     |   0
 .../cuda/{hl/high.rs => high/mod.rs}          |   3 +
 src/frameworks/cuda/{hl => high}/utility.rs   |   0
 src/frameworks/cuda/hl/mod.rs                 |   7 -
 src/frameworks/cuda/interface.rs              |   6 +
 src/frameworks/cuda/mod.rs                    |  11 -
 src/frameworks/{loader.rs => macros.rs}       |   2 +-
 src/frameworks/mod.rs                         |   3 +-
 src/frameworks/native/context.rs              |  23 ++
 src/frameworks/native/device.rs               |  88 +++++
 src/frameworks/native/framework.rs            |  57 +++
 src/frameworks/native/memory.rs               |   6 +
 src/frameworks/native/mod.rs                  |  27 +-
 src/frameworks/opencl/{sh.rs => foreign.rs}   |  17 +-
 src/frameworks/opencl/{hl => high}/error.rs   |  16 +-
 src/frameworks/opencl/high/functions.rs       |  25 ++
 .../opencl/{hl/high.rs => high/mod.rs}        | 268 +++++++-------
 src/frameworks/opencl/{hl => high}/utility.rs |   4 +-
 src/frameworks/opencl/hl/mod.rs               |   7 -
 src/frameworks/opencl/interface/context.rs    |  90 +++++
 src/frameworks/opencl/interface/device.rs     | 130 +++++++
 src/frameworks/opencl/interface/event.rs      |   8 +
 src/frameworks/opencl/interface/framework.rs  | 150 ++++++++
 src/frameworks/opencl/interface/image.rs      |   3 +
 src/frameworks/opencl/interface/memory.rs     |  28 ++
 src/frameworks/opencl/interface/mod.rs        |  13 +
 src/frameworks/opencl/mod.rs                  |  76 +---
 src/hardware.rs                               | 116 ++++++
 src/interface.rs                              | 111 ------
 src/lib.rs                                    |  74 +++-
 src/memory.rs                                 |  48 ++-
 src/tensor.rs                                 | 339 ++++++++++++------
 src/utility.rs                                |  30 +-
 tests/backend_specs.rs                        |  14 +
 tests/cuda.rs                                 |  52 ---
 tests/lib.rs                                  | 104 ------
 tests/opencl.rs                               |  82 -----
 tests/shared_memory_specs.rs                  |  79 ++++
 48 files changed, 1662 insertions(+), 885 deletions(-)
 create mode 100644 src/context.rs
 create mode 100644 src/extension.rs
 create mode 100644 src/framework.rs
 rename src/frameworks/cuda/{sh.rs => foreign.rs} (100%)
 rename src/frameworks/cuda/{hl => high}/error.rs (100%)
 rename src/frameworks/cuda/{hl/high.rs => high/mod.rs} (99%)
 rename src/frameworks/cuda/{hl => high}/utility.rs (100%)
 delete mode 100644 src/frameworks/cuda/hl/mod.rs
 create mode 100644 src/frameworks/cuda/interface.rs
 rename src/frameworks/{loader.rs => macros.rs} (95%)
 create mode 100644 src/frameworks/native/context.rs
 create mode 100644 src/frameworks/native/device.rs
 create mode 100644 src/frameworks/native/framework.rs
 create mode 100644 src/frameworks/native/memory.rs
 rename src/frameworks/opencl/{sh.rs => foreign.rs} (99%)
 rename src/frameworks/opencl/{hl => high}/error.rs (95%)
 create mode 100644 src/frameworks/opencl/high/functions.rs
 rename src/frameworks/opencl/{hl/high.rs => high/mod.rs} (86%)
 rename src/frameworks/opencl/{hl => high}/utility.rs (81%)
 delete mode 100644 src/frameworks/opencl/hl/mod.rs
 create mode 100644 src/frameworks/opencl/interface/context.rs
 create mode 100644 src/frameworks/opencl/interface/device.rs
 create mode 100644 src/frameworks/opencl/interface/event.rs
 create mode 100644 src/frameworks/opencl/interface/framework.rs
 create mode 100644 src/frameworks/opencl/interface/image.rs
 create mode 100644 src/frameworks/opencl/interface/memory.rs
 create mode 100644 src/frameworks/opencl/interface/mod.rs
 create mode 100644 src/hardware.rs
 delete mode 100644 src/interface.rs
 create mode 100644 tests/backend_specs.rs
 delete mode 100644 tests/cuda.rs
 delete mode 100644 tests/lib.rs
 delete mode 100644 tests/opencl.rs
 create mode 100644 tests/shared_memory_specs.rs

diff --git a/Cargo.toml b/Cargo.toml
index ce53d61..27b77e4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,8 +13,8 @@ license = "MIT/Apache-2.0"
 enum_primitive = "0.1.1"
 lazy_static = "0.2.4"
 libloading = "0.3.2"
-log = "0.3.6"
+log = "0.3.7"
 ndarray = "0.8.0"
 
-[dev-dependencies]
-compiletest_rs = "0.2.5"
\ No newline at end of file
+# [dev-dependencies]
+# compiletest_rs = "0.2.5"
\ No newline at end of file
diff --git a/README.md b/README.md
index d0213b4..056a630 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,37 @@ a few necessary additions/modifications.
 > available in the Parenchyma project, as the different approaches that are currently being 
 > considered may prove to be better than the original approach.
 
+## Example
+
+Parenchyma comes without any extension packages. The following example therefore assumes that
+you have add both `parenchyma` and the Parenchyma ExtensionPackage `parenchyma-nn` to your
+Cargo manifest.
+
+```rust
+extern crate parenchyma as pa;
+extern crate parenchyma_nn as pann;
+
+use pa::{Backend, Native, OpenCL, SharedTensor};
+
+fn main() {
+    let ref native: Backend = Backend::new::<Native>().unwrap();
+    // Initialize an OpenCL or CUDA backend packaged with the NN extension.
+    let ref backend = pann::Backend::new::<OpenCL>().unwrap();
+
+    // Initialize two `SharedTensor`s.
+    let shape = 1;
+    let ref x = SharedTensor::<f32>::with(backend, shape, vec![3.5]).unwrap();
+    let ref mut result = SharedTensor::<f32>::new(shape);
+
+    // Run the sigmoid operation, provided by the NN extension, on 
+    // your OpenCL/CUDA enabled GPU (or CPU, which is possible through OpenCL)
+    backend.sigmoid(x, result).unwrap();
+
+    // Print the result: `[0.97068775] shape=[1], strides=[1]`
+    println!("{:?}", result.read(native).unwrap().as_native().unwrap());
+}
+```
+
 ## License
 
 Dual licensed under
diff --git a/benches/shared_tensor.rs b/benches/shared_tensor.rs
index ada945c..564fb1d 100644
--- a/benches/shared_tensor.rs
+++ b/benches/shared_tensor.rs
@@ -1,94 +1,83 @@
 #![feature(test)]
 
 extern crate parenchyma;
-extern crate parenchyma_opencl;
 extern crate test;
 
-use parenchyma::{Backend, Device, Framework, Native, SharedTensor};
-use parenchyma::DeviceKind::{Cpu, Gpu};
-use parenchyma_opencl::{OpenCL, OpenCLDevice};
+use parenchyma::{Backend, Native, OpenCL, SharedTensor};
 use test::Bencher;
 
-fn native_backend() -> Backend<Native> {
-    Backend::default().unwrap()
+fn native_backend() -> Backend {
+    Backend::new::<Native>().unwrap()
 }
 
-fn opencl_backend() -> Backend<OpenCL> {
-    Backend::default().unwrap()
+fn opencl_backend() -> Backend {
+    Backend::new::<OpenCL>().unwrap()
 }
 
-fn sync_back_and_forth<A, B>(b: &mut Bencher, backend1: Backend<A>, backend2: Backend<B>, s: usize) 
-    where A: Framework, B: Framework,
-{
-    let ref dev1 = backend1.devices()[0];
-    let ref dev2 = backend2.devices()[0];
+fn sync_back_and_forth(b: &mut Bencher, backend1: Backend, backend2: Backend, s: usize) {
 
-    let mem = &mut SharedTensor::<u8>::from(vec![s]);
+    let mem = &mut SharedTensor::<u8>::new(s);
 
     // initialize and warm-up
-    let _ = mem.write_only(dev2).unwrap();
-    let _ = mem.read_write(dev1).unwrap();
-    let _ = mem.read_write(dev2).unwrap();
+    let _ = mem.write(&backend2).unwrap();
+    let _ = mem.read_write(&backend1).unwrap();
+    let _ = mem.read_write(&backend2).unwrap();
 
     b.bytes = s as u64 * 2; // we do two transfers per iteration
 
     b.iter(|| {
-        let _ = mem.read_write(dev1).unwrap();
-        let _ = mem.read_write(dev2).unwrap();
+        let _ = mem.read_write(&backend1).unwrap();
+        let _ = mem.read_write(&backend2).unwrap();
     });
 }
 
-fn unidirectional_sync<A, B>(b: &mut Bencher, src: Backend<A>, dst: Backend<B>, size: usize)
-    where A: Framework, B: Framework,
-{
-    let ref src_dev = src.devices()[0];
-    let ref dst_dev = dst.devices()[0];
+fn unidirectional_sync(b: &mut Bencher, src: Backend, dst: Backend, size: usize) {
 
-    let mem = &mut SharedTensor::<u8>::from(vec![size]);
+    let mem = &mut SharedTensor::<u8>::new(size);
 
     // initialize and warm-up
-    let _ = mem.write_only(src_dev).unwrap();
-    let _ = mem.read(dst_dev).unwrap();
+    let _ = mem.write(&src).unwrap();
+    let _ = mem.read(&dst).unwrap();
 
     b.bytes = size as u64;
 
     b.iter(|| {
-        let _ = mem.write_only(src_dev).unwrap();
-        let _ = mem.read(dst_dev).unwrap();
+        let _ = mem.write(&src).unwrap();
+        let _ = mem.read(&dst).unwrap();
     });
 }
 
-#[inline(never)]
-fn bench_256_alloc_1mb_opencl_profile(b: &mut Bencher, device: &OpenCLDevice, size: usize) {
-    b.iter(|| 
-        for _ in 0..256 {
-            let _ = device.allocate_memory(size).unwrap(); });
-}
+// #[inline(never)]
+// fn bench_256_alloc_1mb_opencl_profile(b: &mut Bencher, device: &OpenCLDevice, size: usize) {
+//     b.iter(|| 
+//         for _ in 0..256 {
+//             let _ = device.allocate_memory(size).unwrap(); });
+// }
 
-// #[bench]
-// fn bench_256_alloc_1mb_opencl_cpu(b: &mut Bencher) {
-//     let opencl_backend = opencl_backend();
-//     let cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();
+// // #[bench]
+// // fn bench_256_alloc_1mb_opencl_cpu(b: &mut Bencher) {
+// //     let opencl_backend = opencl_backend();
+// //     let cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();
 
-//     bench_256_alloc_1mb_opencl_profile(b, cpu, 1_048_576);
-// }
+// //     bench_256_alloc_1mb_opencl_profile(b, cpu, 1_048_576);
+// // }
+
+// // #[bench]
+// // fn bench_256_alloc_1mb_opencl_gpu(b: &mut Bencher) {
+// //     let opencl_backend = opencl_backend();
+// //     let gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();
+
+// //     bench_256_alloc_1mb_opencl_profile(b, gpu, 1_048_576);
+// // }
 
 // #[bench]
-// fn bench_256_alloc_1mb_opencl_gpu(b: &mut Bencher) {
+// fn bench_256_alloc_1mb_opencl(b: &mut Bencher) {
 //     let opencl_backend = opencl_backend();
-//     let gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();
+//     let ref d = opencl_backend.devices()[0];
 
-//     bench_256_alloc_1mb_opencl_profile(b, gpu, 1_048_576);
+//     bench_256_alloc_1mb_opencl_profile(b, d, 1_048_576);
 // }
 
-#[bench]
-fn bench_256_alloc_1mb_opencl(b: &mut Bencher) {
-    let opencl_backend = opencl_backend();
-    let ref d = opencl_backend.devices()[0];
-
-    bench_256_alloc_1mb_opencl_profile(b, d, 1_048_576);
-}
-
 #[bench]
 fn bench_sync_1kb_native_opencl_back_and_forth(b: &mut Bencher) {
     sync_back_and_forth(b, opencl_backend(), native_backend(), 1024);
@@ -134,33 +123,33 @@ fn bench_sync_128mb_opencl_to_native(b: &mut Bencher) {
     unidirectional_sync(b, opencl_backend(), native_backend(), 128 * 1_048_576);
 }
 
-// fn bench_shared_tensor_access_time_first_(b: &mut Bencher, device: &OpenCLDevice) {
+// // fn bench_shared_tensor_access_time_first_(b: &mut Bencher, device: &OpenCLDevice) {
 
-//     let native_backend = native_backend();
-//     let ref native_cpu = native_backend.devices()[0];
+// //     let native_backend = native_backend();
+// //     let ref native_cpu = native_backend.devices()[0];
 
-//     let mut x = SharedTensor::<f32>::from(vec![128]);
-//     x.write_only(native_cpu).unwrap();
-//     x.write_only(device).unwrap();
-//     x.read(native_cpu).unwrap();
+// //     let mut x = SharedTensor::<f32>::from(vec![128]);
+// //     x.write_only(native_cpu).unwrap();
+// //     x.write_only(device).unwrap();
+// //     x.read(native_cpu).unwrap();
 
-//     b.iter(|| {
-//         let _ = x.read(native_cpu).unwrap();
-//     })
-// }
+// //     b.iter(|| {
+// //         let _ = x.read(native_cpu).unwrap();
+// //     })
+// // }
 
-// #[bench]
-// fn bench_shared_tensor_access_time_first_cpu(b: &mut Bencher) {
-//     let opencl_backend = opencl_backend();
-//     let opencl_cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();
+// // #[bench]
+// // fn bench_shared_tensor_access_time_first_cpu(b: &mut Bencher) {
+// //     let opencl_backend = opencl_backend();
+// //     let opencl_cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();
 
-//     bench_shared_tensor_access_time_first_(b, opencl_cpu);
-// }
+// //     bench_shared_tensor_access_time_first_(b, opencl_cpu);
+// // }
 
-// #[bench]
-// fn bench_shared_tensor_access_time_first_gpu(b: &mut Bencher) {
-//     let opencl_backend = opencl_backend();
-//     let opencl_gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();
+// // #[bench]
+// // fn bench_shared_tensor_access_time_first_gpu(b: &mut Bencher) {
+// //     let opencl_backend = opencl_backend();
+// //     let opencl_gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();
 
-//     bench_shared_tensor_access_time_first_(b, opencl_gpu);
-// }
\ No newline at end of file
+// //     bench_shared_tensor_access_time_first_(b, opencl_gpu);
+// // }
\ No newline at end of file
diff --git a/src/backend.rs b/src/backend.rs
index 1b44a57..9f0fb09 100644
--- a/src/backend.rs
+++ b/src/backend.rs
@@ -1,5 +1,7 @@
-use super::{Context, ComputeDevice, Framework};
-use super::error::Result;
+use std::ops;
+use super::{BoxContext, Context, Device, Error, ExtensionPackage, Framework, Hardware, Unextended};
+use super::Result;
+use utility::{self, TryDefault};
 
 /// The heart of Parenchyma - provides an interface for running parallel computations on one or 
 /// more devices.
@@ -15,76 +17,70 @@ use super::error::Result;
 /// the framework to the [`Backend::new`](#method.new) associated function, or by simply 
 /// calling [`Backend::default`](#method.default). The framework determines which devices are 
 /// available and how parallel kernel functions can be executed.
-///
-/// ## Examples
-///
-/// ```rust
-/// use parenchyma::{Backend, Framework, Native};
-///
-/// 
-/// // Construct a new framework.
-/// let framework = Native::new().expect("failed to initialize framework");
-///
-/// // Available devices can be obtained through the framework.
-/// let selection = framework.available_devices.clone();
-///
-/// // Create a ready to go backend from the framework.
-/// let backend = Backend::new(framework, selection).expect("failed to construct backend");
-///
-/// // ..
-/// ```
-///
-/// Construct a default backend:
-///
-/// ```rust
-/// use parenchyma::{Backend, Native};
-///
-/// // A default native backend.
-/// let backend: Backend<Native> = Backend::default().expect("something went wrong!");
-///
-/// // ..
-/// ```
 #[derive(Debug)]
-pub struct Backend {
+pub struct Backend<X = Unextended> {
     /// The initialized framework.
-    pub framework: Box<Framework>, /* &'static str,*/
+    ///
+    /// The Framework implementation such as OpenCL, CUDA, etc. defines, which should be used and
+    /// determines which hardwares will be available and how parallel kernel functions can be
+    /// executed.
+    framework: Box<Framework>,
     /// The context associated with the `framework`.
     ///
-    /// Contexts are the heart of both OpenCL and CUDA applications. See the [`Context`] trait for
-    /// more information.
+    /// Contexts are the heart of both OpenCL and CUDA applications. Contexts are created from one 
+    /// or more devices that are capable of executing methods and synchronizing memory. See 
+    /// the [`Context`] trait for more information.
     ///
     /// [`Context`]: (./trait.Context.html)
-    pub context: Box<Context>,
-    /// The chosen device
-    ///
-    /// The default active device is the first device found (index = `0`).
-    active: usize,
+    context: Box<Context<Package = X>>,
 }
 
-impl Backend {
+impl<X> Backend<X> where X: ExtensionPackage {
 
-    /// Constructs a backend using the most potent framework given the underlying hardware.
-    pub fn new() -> Backend {
+    /// Initialize a new backend.
+    pub fn new<F>() -> Result<Self> where F: BoxContext<X> + Framework + TryDefault<Err = Error> {
 
-        unimplemented!()
+        let framework = Box::new(F::try_default()?);
+        let selection = framework.available_hardware();
+        let context = framework.enclose(selection)?;
+
+        Ok(Backend { framework: framework, context })
     }
 
-    /// Attempts to construct a backend from the specified `framework`.
-    pub fn with<F>(framework: F) -> Result<Backend> where F: Framework {
+    /// Constructs a backend from the specified `framework` and `selection`.
+    pub fn with<F>(fwrk: F, selection: Vec<Hardware>) -> Result<Self> 
+        where F: BoxContext<X> + Framework {
+
+        let framework = Box::new(fwrk);
+        let context = framework.enclose(selection)?;
 
-        unimplemented!()
+        Ok(Backend { framework, context })
     }
 
-    // /// Try all provided `frameworks` in the specified order, choosing the first framework that 
-    // // initializes without failure.
-    // pub fn try(frameworks: Vec<Box<Framework>>) -> Result<Backend>;
+    /// Set the device at the specified `index` as the active device.
+    ///
+    /// Only one device can be the _active_ device - the device in which operations are executed.
+    pub fn set_active(&mut self, index: usize) -> Result {
+
+        self.context.set_active(index)
+    }
 }
 
-impl Backend {
+impl<X> ops::Deref for Backend<X> where X: ExtensionPackage {
+
+    type Target = X::Extension;
 
-    /// Returns the current device.
-    pub fn compute_device<T>(&self) -> &ComputeDevice<T> {
+    fn deref<'a>(&'a self) -> &'a X::Extension {
 
-        unimplemented!()
+        self.context.extension()
     }
-}
\ No newline at end of file
+}
+
+impl<X> utility::Has<Device> for Backend<X> where X: ExtensionPackage {
+
+    fn get_ref(&self) -> &Device {
+        self.context.active_device()
+    }
+}
+
+// pub trait AsBackend { }
\ No newline at end of file
diff --git a/src/changelog.rs b/src/changelog.rs
index f1a5f95..3211a9a 100644
--- a/src/changelog.rs
+++ b/src/changelog.rs
@@ -6,7 +6,10 @@
 /// * Partially implemented a CUDA API wrapper
 /// * Partially implemented native support
 /// * Worked on a fallback mechanism (see issue#15)
-/// * Chose a tensor lib (ndarray)
+/// * Use a tensor lib (ndarray) as the underlying native memory representation
 /// * No longer requires framework related feature flags (from the original Collenchyma project)
 /// * Implemented auto-sync
+/// * Add `Bundle` logic
+/// * Removed `IBinary`/`HashMap` technique. Use structs instead
+/// * No longer requires backends parameterized by a framework
 pub mod r0_0_3 {}
\ No newline at end of file
diff --git a/src/context.rs b/src/context.rs
new file mode 100644
index 0000000..8661015
--- /dev/null
+++ b/src/context.rs
@@ -0,0 +1,27 @@
+use std::fmt::Debug;
+use super::{Device, ExtensionPackage, Result};
+use utility::Has;
+
+/// Contexts are the heart of both OpenCL and CUDA applications. Contexts provide a container for
+/// objects such as memory, command-queues, programs/modules and kernels.
+pub trait Context: Debug {
+
+    /// The extension package built for `Self`.
+    type Package: ExtensionPackage;
+
+    /// Returns the _active_ device.
+    fn active_device(&self) -> &Device;
+
+    /// Set the device at the specified `index` as the active device.
+    fn set_active(&mut self, index: usize) -> Result;
+
+    #[doc(hidden)]
+    fn extension(&self) -> &<Self::Package as ExtensionPackage>::Extension;
+}
+
+impl<I> Has<Device> for I where I: Context {
+
+    fn get_ref(&self) -> &Device {
+        self.active_device()
+    }
+}
\ No newline at end of file
diff --git a/src/error.rs b/src/error.rs
index d271c70..21ec07d 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -3,7 +3,7 @@
 use std::{error, fmt, result};
 
 /// A specialized `Result` type.
-pub type Result<T = ()> = result::Result<T, Error>;
+pub type Result<T = (), E = Error> = result::Result<T, E>;
 
 /// The core error type used in Parenchyma.
 #[derive(Debug)]
@@ -28,7 +28,7 @@ pub enum ErrorKind {
     InvalidReshapedTensorSize,
     /// An error returned when attempting to access uninitialized memory.
     UninitializedMemory,
-    /// Memory allocation was not found for a provided `Device`.
+    /// Unable to drop the provided device because a memory allocation was not found for it.
     AllocatedMemoryNotFoundForDevice,
     /// An error occurred while attempting to synchronize memory.
     MemorySynchronizationFailed,
diff --git a/src/extension.rs b/src/extension.rs
new file mode 100644
index 0000000..dd06078
--- /dev/null
+++ b/src/extension.rs
@@ -0,0 +1,80 @@
+use std::any::Any;
+use std::fmt::Debug;
+use super::Result;
+
+/// Provides the generic functionality for a backend-specific implementation of a library.
+///
+/// A package can be a binary, a source file, c code, a single kernel, etc., or a collective which
+/// share related functionalities. A package is provided by a specific library such as BLAS. Notice
+/// that packages are analogous to those of Rust (i.e., crates):
+///
+/// compiled crate <-> package
+/// library (one or more modules) <-> bundle
+///
+/// A package needs to be _built_, which is handled by the specific implementation of a binary
+/// representation, and returns initialized operations based on the library. Interacting directly
+/// with the package itself is possible, but it should be used to construct the backend-agnostic
+/// operations, which can then be executed and parallelized via a unified interface.
+pub trait ExtensionPackage: 'static + Debug + Sized {
+    /// The name of the package.
+    ///
+    /// This associated constant is primarily used for logging/debugging purposes. The naming 
+    /// convention is as follows: "[organization]/[package-name]" (e.g., "parenchyma/nn").
+    const PACKAGE_NAME: &'static str;
+
+    /// A library can be a binary, a source file, c code, a single kernel, etc., or a collective.
+    ///
+    /// A backend is a Rust struct like any other, therefore you probably would like to implement
+    /// certain methods for the Backend. As the whole purpose of a Backend is to provide an
+    /// abstraction over various computation devices and computation languages, these implemented
+    /// methods will than be able to execute on different devices and use the full power of 
+    /// the machine's underlying hardware.
+    ///
+    /// Extending the backend with operations is easy. In Parenchyma we call crates, which provide
+    /// operations for the backend, _extensions_. Extensions are usually a group of related 
+    /// operations of a common field. Two examples for Parenchyma extensions 
+    /// are [BLAS][parenchyma-blas] and [NN][parenchyma-nn].
+    ///
+    /// An extension provides generic traits and the explicit implementation of these traits for 
+    /// one or (even better) all available Parenchyma frameworks - common host CPU, OpenCL, CUDA.
+    ///
+    /// The structure of an extension is pretty simple with as little overhead as possible. Macros 
+    /// and build-scripts make implementations even easier. If you would like to use specific 
+    /// extension for you backend, all you need to do is set them as dependencies in your Cargo 
+    /// file in addition to the Parenchyma crate. The extension then automatically extends the 
+    /// backend provided by Parenchyma.
+    ///
+    /// Extending the backend with your own extension is a straightforward process. For now we 
+    /// recommend that you take a look at the general code structure 
+    /// of [Parenchyma-BLAS][parenchyma-blas] or its documentation. Let us now about your extension 
+    /// on the Gitter chat, we are happy to feature your Parenchyma Extension on the README.
+    type Extension: ?Sized;
+}
+
+/// Builds a package and provides the functionality for turning a library into backend-specific, 
+/// executable operations, and tailored for the target framework.
+pub trait Build<Target>: Sized {
+
+    /// Compiles the library into package after initializing and configuring the library.
+    fn build(&mut Target) -> Result<Self>;
+}
+
+// =========================
+
+/// A marker type for _unextended_ backends/contexts.
+#[derive(Debug)]
+pub struct Unextended;
+
+impl ExtensionPackage for Unextended {
+
+    const PACKAGE_NAME: &'static str = "";
+
+    type Extension = Any;
+}
+
+impl<A> Build<A> for Unextended {
+
+    fn build(_: &mut A) -> Result<Unextended> {
+        Ok(Unextended)
+    }
+}
\ No newline at end of file
diff --git a/src/framework.rs b/src/framework.rs
new file mode 100644
index 0000000..cd544a0
--- /dev/null
+++ b/src/framework.rs
@@ -0,0 +1,32 @@
+use std::fmt::Debug;
+
+use super::{Context, Hardware, Result};
+
+/// A trait implemented for all frameworks. `Framework`s contain a list of all available devices as 
+/// well as other objects specific to the implementor.
+///
+/// The default framework is simply the host CPU for common computation. To make use of other
+/// devices such as GPUs, you may choose a GPGPU framework (such as OpenCL or CUDA) to access the 
+/// processing capabilities of the device(s).
+pub trait Framework: 'static + Debug {
+    /// The name of the framework.
+    ///
+    /// This associated constant is mainly used for the purposes of debugging and reporting errors.
+    ///
+    /// [issue#29924](https://github.com/rust-lang/rust/issues/29924): remove `Framework::name`
+    const FRAMEWORK_NAME: &'static str;
+
+    /// Returns the cached and available hardware.
+    fn available_hardware(&self) -> Vec<Hardware>;
+}
+
+/// Initialize a context, box it, and then return it.
+pub trait BoxContext<ExtensionPackage>: Framework {
+
+    // /// The `Context` representation for this framework.
+    // type Context: TryFrom<ContextConfig<Self>, Err = Error>;
+
+    /// Create a context from a selection of hardware devices and then wrap it in a box.
+    fn enclose(&self, selection: Vec<Hardware>) 
+        -> Result<Box<Context<Package = ExtensionPackage>>>;
+}
\ No newline at end of file
diff --git a/src/frameworks/cuda/sh.rs b/src/frameworks/cuda/foreign.rs
similarity index 100%
rename from src/frameworks/cuda/sh.rs
rename to src/frameworks/cuda/foreign.rs
diff --git a/src/frameworks/cuda/hl/error.rs b/src/frameworks/cuda/high/error.rs
similarity index 100%
rename from src/frameworks/cuda/hl/error.rs
rename to src/frameworks/cuda/high/error.rs
diff --git a/src/frameworks/cuda/hl/high.rs b/src/frameworks/cuda/high/mod.rs
similarity index 99%
rename from src/frameworks/cuda/hl/high.rs
rename to src/frameworks/cuda/high/mod.rs
index fe308db..db54b2c 100644
--- a/src/frameworks/cuda/hl/high.rs
+++ b/src/frameworks/cuda/high/mod.rs
@@ -1,3 +1,6 @@
+mod error;
+mod utility;
+
 use std::{cmp, ffi, ops};
 
 use super::Result;
diff --git a/src/frameworks/cuda/hl/utility.rs b/src/frameworks/cuda/high/utility.rs
similarity index 100%
rename from src/frameworks/cuda/hl/utility.rs
rename to src/frameworks/cuda/high/utility.rs
diff --git a/src/frameworks/cuda/hl/mod.rs b/src/frameworks/cuda/hl/mod.rs
deleted file mode 100644
index 24ecc47..0000000
--- a/src/frameworks/cuda/hl/mod.rs
+++ /dev/null
@@ -1,7 +0,0 @@
-pub use self::error::{Error, ErrorKind, Result};
-pub use self::high::{init, ndevices};
-pub use self::high::{Context, Device, Memory};
-
-mod error;
-mod high;
-mod utility;
\ No newline at end of file
diff --git a/src/frameworks/cuda/interface.rs b/src/frameworks/cuda/interface.rs
new file mode 100644
index 0000000..c26055c
--- /dev/null
+++ b/src/frameworks/cuda/interface.rs
@@ -0,0 +1,6 @@
+/// Provides the CUDA framework.
+#[derive(Debug)]
+pub struct Cuda {
+    // /// A list of available devices.
+    // pub available_devices: Vec<CudaDevice>,
+}
\ No newline at end of file
diff --git a/src/frameworks/cuda/mod.rs b/src/frameworks/cuda/mod.rs
index ebadbf7..e69de29 100644
--- a/src/frameworks/cuda/mod.rs
+++ b/src/frameworks/cuda/mod.rs
@@ -1,11 +0,0 @@
-//! CUDA backend support.
-
-pub mod hl;
-pub mod sh;
-
-/// Provides the CUDA framework.
-#[derive(Debug)]
-pub struct Cuda {
-    // /// A list of available devices.
-    // pub available_devices: Vec<CudaDevice>,
-}
\ No newline at end of file
diff --git a/src/frameworks/loader.rs b/src/frameworks/macros.rs
similarity index 95%
rename from src/frameworks/loader.rs
rename to src/frameworks/macros.rs
index 0a5d5fd..c501138 100644
--- a/src/frameworks/loader.rs
+++ b/src/frameworks/macros.rs
@@ -21,7 +21,7 @@ macro_rules! dynamic_extern {
                 type FnPtr = unsafe extern $linkage fn($($argument_type),*) $(-> $ret_ty)*;
 
                 lazy_static! {
-                    static ref FN_PTR: ::frameworks::loader::Sym<FnPtr> = {
+                    static ref FN_PTR: ::frameworks::macros::Sym<FnPtr> = {
                         unsafe {
                             use lib::Library;
                             use std::path::Path;
diff --git a/src/frameworks/mod.rs b/src/frameworks/mod.rs
index 7384823..35cb01f 100644
--- a/src/frameworks/mod.rs
+++ b/src/frameworks/mod.rs
@@ -1,5 +1,4 @@
-#[macro_use] mod loader;
+#[macro_use] mod macros;
 
-pub mod cuda;
 pub mod native;
 pub mod opencl;
\ No newline at end of file
diff --git a/src/frameworks/native/context.rs b/src/frameworks/native/context.rs
new file mode 100644
index 0000000..6d04a99
--- /dev/null
+++ b/src/frameworks/native/context.rs
@@ -0,0 +1,23 @@
+use {Context, Device, ExtensionPackage, Result};
+use std::marker::{PhantomData, Unsize};
+use super::NativeDevice;
+
+/// The native context.
+#[derive(Clone, Debug)]
+pub struct NativeContext<X>(pub PhantomData<X>);
+
+impl<X> Context for NativeContext<X> where X: ExtensionPackage, Self: Unsize<X::Extension> {
+
+    type Package = X;
+
+    fn active_device(&self) -> &Device {
+        static NATIVE_DEVICE: NativeDevice = NativeDevice;
+        &NATIVE_DEVICE
+    }
+
+    fn set_active(&mut self, _: usize) -> Result { Ok(()) }
+
+    fn extension(&self) -> &X::Extension {
+        self
+    }
+}
\ No newline at end of file
diff --git a/src/frameworks/native/device.rs b/src/frameworks/native/device.rs
new file mode 100644
index 0000000..5a10b2f
--- /dev/null
+++ b/src/frameworks/native/device.rs
@@ -0,0 +1,88 @@
+use {Alloc, ComputeDevice, Memory, Result, Shape, Synch, Viewable};
+use super::NativeMemory;
+
+/// The native device.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct NativeDevice;
+
+impl Viewable for NativeDevice {
+
+    fn view(&self) -> ComputeDevice {
+        ComputeDevice::Native(NativeDevice)
+    }
+}
+
+impl<T> Alloc<T> for NativeDevice {
+
+
+    fn alloc(&self, shape: &Shape) -> Result<Memory<T>> {
+        // TODO
+
+        let mut buffer = Vec::with_capacity(shape.capacity);
+
+        unsafe {
+            buffer.set_len(shape.capacity);
+        }
+
+        Ok(Memory::Native(
+            NativeMemory::from_shape_vec(
+                &shape.dims[..], 
+                buffer).unwrap()))
+    }
+
+    fn allocwrite(&self, shape: &Shape, data: Vec<T>) -> Result<Memory<T>> {
+        // TODO
+
+        Ok(Memory::Native(
+            NativeMemory::from_shape_vec(
+                &shape.dims[..], 
+                data).unwrap()))
+    }
+}
+
+impl<T> Synch<T> for NativeDevice where T: Clone {
+
+    fn write(
+        &self, 
+        memory: &mut Memory<T>, 
+        src_device: &ComputeDevice, 
+        source: &Memory<T>) 
+    -> Result {
+
+        match *src_device {
+            ComputeDevice::Native(_) => {
+                let memory = unsafe { memory.as_mut_native_unchecked() };
+                let source = unsafe { source.as_native_unchecked() };
+                // > Array implements .clone_from() to reuse an array's existing allocation. 
+                // > Semantically equivalent to *self = other.clone(), but potentially more efficient.
+                Ok(memory.clone_from(source))
+            },
+
+            ComputeDevice::OpenCL(ref cl_device) => {
+                cl_device.read(source, &mut ComputeDevice::Native(NativeDevice), memory)
+            }
+        }
+    }
+
+    fn read(
+        &self, 
+        memory: &Memory<T>, 
+        dest_device: &mut ComputeDevice, 
+        destination: &mut Memory<T>) 
+    -> Result {
+
+        match *dest_device {
+            ComputeDevice::Native(_) => {
+                let source = unsafe { memory.as_native_unchecked() };
+                let destination = unsafe { destination.as_mut_native_unchecked() };
+                // > Array implements .clone_from() to reuse an array's existing allocation. 
+                // > Semantically equivalent to *self = other.clone(), but potentially more efficient.
+                Ok(destination.clone_from(source))
+            },
+
+            ComputeDevice::OpenCL(ref mut cl_device) => {
+                cl_device.write(destination, &ComputeDevice::Native(NativeDevice), memory)
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/frameworks/native/framework.rs b/src/frameworks/native/framework.rs
new file mode 100644
index 0000000..0530b80
--- /dev/null
+++ b/src/frameworks/native/framework.rs
@@ -0,0 +1,57 @@
+use {BoxContext, Context, Error, ExtensionPackage, Framework, Hardware, HardwareKind, Result};
+use std::marker::{PhantomData, Unsize};
+use super::NativeContext;
+use utility::TryDefault;
+
+const NATIVE: &'static str = "Native";
+
+/// The native framework
+///
+/// # Example
+///
+/// ```rust
+/// use parenchyma::{Backend, Native, SharedTensor};
+///
+/// let ref host: Backend = Backend::new::<Native>().unwrap();
+///
+/// let sh: SharedTensor = SharedTensor::with(host, [2, 2], vec![1., 2., 3., 4.]).unwrap();
+///
+/// let tensor = sh.read(host).unwrap();
+///
+/// println!("{:#?}", tensor);
+/// ```
+#[derive(Debug)]
+pub struct Native;
+
+impl Framework for Native {
+
+    const FRAMEWORK_NAME: &'static str = NATIVE;
+
+    fn available_hardware(&self) -> Vec<Hardware> {
+        vec![Hardware {
+            id: 0,
+            framework: NATIVE,
+            kind: HardwareKind::Central,
+            name: String::from("HOST CPU"),
+            compute_units: 1,
+        }]
+    }
+}
+
+impl<X> BoxContext<X> for Native 
+    where X: ExtensionPackage, 
+          NativeContext<X>: Unsize<X::Extension> 
+          {
+
+    fn enclose(&self, _: Vec<Hardware>) -> Result<Box<Context<Package = X>>> {
+        Ok(Box::new(NativeContext(PhantomData)))
+    }
+}
+
+impl TryDefault for Native {
+    type Err = Error;
+
+    fn try_default() -> Result<Native> {
+        Ok(Native)
+    }
+}
\ No newline at end of file
diff --git a/src/frameworks/native/memory.rs b/src/frameworks/native/memory.rs
new file mode 100644
index 0000000..2f22115
--- /dev/null
+++ b/src/frameworks/native/memory.rs
@@ -0,0 +1,6 @@
+use ndarray::{Array, IxDyn};
+
+/// Represents a native array.
+///
+/// note: named `Memory` for consistency across frameworks.
+pub type NativeMemory<T> = Array<T, IxDyn>;
\ No newline at end of file
diff --git a/src/frameworks/native/mod.rs b/src/frameworks/native/mod.rs
index 0d816c4..d38c8ef 100644
--- a/src/frameworks/native/mod.rs
+++ b/src/frameworks/native/mod.rs
@@ -1,20 +1,11 @@
 //! Native backend support.
 
-use ndarray::{Array, IxDyn};
-
-/// Provides the native framework.
-#[derive(Debug)]
-pub struct Native;
-
-/// Represents a native array.
-///
-/// note: named `Memory` for consistency across frameworks.
-pub type Memory<T> = Array<T, IxDyn>;
-
-/// The native context.
-#[derive(Clone, Debug)]
-pub struct NativeContext;
-
-/// The native device.
-#[derive(Clone, Debug)]
-pub struct NativeDevice;
\ No newline at end of file
+pub use self::context::NativeContext;
+pub use self::device::NativeDevice;
+pub use self::framework::Native;
+pub use self::memory::NativeMemory;
+
+mod context;
+mod device;
+mod framework;
+mod memory;
\ No newline at end of file
diff --git a/src/frameworks/opencl/sh.rs b/src/frameworks/opencl/foreign.rs
similarity index 99%
rename from src/frameworks/opencl/sh.rs
rename to src/frameworks/opencl/foreign.rs
index 395556d..3d7dd04 100644
--- a/src/frameworks/opencl/sh.rs
+++ b/src/frameworks/opencl/foreign.rs
@@ -66,12 +66,14 @@ pub type cl_event_info                  = cl_uint;
 pub type cl_command_type                = cl_uint;
 pub type cl_profiling_info              = cl_uint;
 
+#[derive(Debug)]
 #[repr(C)]
 pub struct cl_image_format {
     image_channel_order:        cl_channel_order,
     image_channel_data_type:    cl_channel_type
 }
 
+#[derive(Debug)]
 pub struct cl_buffer_region {
     origin:     libc::size_t,
     size:       libc::size_t
@@ -445,8 +447,17 @@ pub static CL_PROFILING_COMMAND_SUBMIT:                  cl_uint = 0x1281;
 pub static CL_PROFILING_COMMAND_START:                   cl_uint = 0x1282;
 pub static CL_PROFILING_COMMAND_END:                     cl_uint = 0x1283;
 
-dynamic_extern! {
-    #[link="OpenCL"]
+// dynamic_extern! {
+//     #[link="OpenCL"]
+
+#[link(name = "OpenCL", kind = "framework")]
+#[cfg(target_os = "macos")]
+extern { }
+
+#[link(name = "OpenCL")]
+#[cfg(target_os = "linux")]
+extern { }
+
     extern "C" {
         
       /* Platform APIs */
@@ -844,4 +855,4 @@ dynamic_extern! {
        */
       pub fn clGetExtensionFunctionAddress(func_name: *const libc::c_char) -> *mut raw::c_void;
     }
-}
\ No newline at end of file
+//}
\ No newline at end of file
diff --git a/src/frameworks/opencl/hl/error.rs b/src/frameworks/opencl/high/error.rs
similarity index 95%
rename from src/frameworks/opencl/hl/error.rs
rename to src/frameworks/opencl/high/error.rs
index 2fdd76b..8e47f41 100644
--- a/src/frameworks/opencl/hl/error.rs
+++ b/src/frameworks/opencl/high/error.rs
@@ -1,14 +1,17 @@
-use frameworks::opencl::sh::CLStatus;
 use std::{error, fmt, result};
+use super::super::foreign::CLStatus;
 
+/// A specialized `Result` type.
 pub type Result<T = ()> = result::Result<T, Error>;
 
+/// The error type used in the OpenCL module.
 #[derive(Debug)]
 pub struct Error {
     kind: ErrorKind,
     inner: Option<Box<error::Error + Send + Sync>>,
 }
 
+/// A list of general categories.
 #[derive(Debug, Copy, Clone)]
 pub enum ErrorKind {
     /// No OpenCL devices that matched device_type were found.
@@ -69,7 +72,7 @@ pub enum ErrorKind {
 impl From<CLStatus> for ErrorKind {
 
     fn from(cl_status: CLStatus) -> ErrorKind {
-        use frameworks::opencl::sh::CLStatus::*;
+        use super::super::foreign::CLStatus::*;
         use self::ErrorKind::*;
 
         match cl_status {
@@ -163,15 +166,6 @@ impl Error {
         }
     }
 
-    pub fn get_ref(&self) -> Option<&(error::Error + Send + Sync + 'static)> {
-        use std::ops::Deref;
-
-        match self.inner {
-            Some(ref error) => Some(error.deref()),
-            _ => None
-        }
-    }
-
     /// Returns the corresponding `ErrorKind` for this error.
     pub fn kind(&self) -> ErrorKind {
         self.kind
diff --git a/src/frameworks/opencl/high/functions.rs b/src/frameworks/opencl/high/functions.rs
new file mode 100644
index 0000000..7d17d27
--- /dev/null
+++ b/src/frameworks/opencl/high/functions.rs
@@ -0,0 +1,25 @@
+use std::ptr;
+
+use super::{Platform, Result};
+use super::utility;
+use super::super::foreign;
+
+/// Number of platforms
+pub fn nplatforms() -> Result<u32> {
+    unsafe {
+        let mut nplatforms = 0;
+        let ret_value = foreign::clGetPlatformIDs(0, ptr::null_mut(), &mut nplatforms);
+        return utility::check(ret_value, || nplatforms);
+    }
+}
+
+/// Obtain the list of platforms available.
+pub fn platforms() -> Result<Vec<Platform>> {
+    unsafe {
+        let nplatforms = nplatforms()?;
+        let mut vec_id = vec![ptr::null_mut(); nplatforms as usize];
+        let pointer = vec_id.as_mut_ptr();
+        let ret_value = foreign::clGetPlatformIDs(nplatforms, pointer, ptr::null_mut());
+        return utility::check(ret_value, || vec_id.iter().map(|&id| Platform(id)).collect());
+    }
+}
\ No newline at end of file
diff --git a/src/frameworks/opencl/hl/high.rs b/src/frameworks/opencl/high/mod.rs
similarity index 86%
rename from src/frameworks/opencl/hl/high.rs
rename to src/frameworks/opencl/high/mod.rs
index 7828f79..1bfd257 100644
--- a/src/frameworks/opencl/hl/high.rs
+++ b/src/frameworks/opencl/high/mod.rs
@@ -1,37 +1,25 @@
-use std::os::raw;
-use std::{clone, cmp, ffi, mem, ops, ptr};
+//! Wrapper module for OpenCL
+#![allow(missing_docs, unused_qualifications)]
 
-use super::Result;
-use super::utility;
-use super::super::sh;
+pub use self::error::{Error, ErrorKind, Result};
+pub use self::functions::{nplatforms, platforms};
 
-/// Number of platforms
-pub fn nplatforms() -> Result<u32> {
-    unsafe {
-        let mut nplatforms = 0;
-        let ret_value = sh::clGetPlatformIDs(0, ptr::null_mut(), &mut nplatforms);
-        return utility::check(ret_value, || nplatforms);
-    }
-}
+mod error;
+mod functions;
+mod utility;
 
-/// Obtain the list of platforms available.
-pub fn platforms() -> Result<Vec<Platform>> {
-    unsafe {
-        let nplatforms = nplatforms()?;
-        let mut vec_id = vec![0 as sh::cl_platform_id; nplatforms as usize];
-        let ret_value = sh::clGetPlatformIDs(nplatforms, vec_id.as_mut_ptr(), ptr::null_mut());
-        return utility::check(ret_value, || vec_id.iter().map(|&id| Platform(id)).collect());
-    }
-}
+use std::{clone, cmp, ffi, mem, ops, ptr};
+use std::os::raw::c_void;
+use super::foreign;
 
 #[derive(Debug)]
-pub struct Buffer(sh::cl_mem);
+pub struct Buffer(foreign::cl_mem);
 
 impl Buffer {
     /// Increments the memory object reference count.
     pub fn retain(&self) -> Result {
         unsafe {
-            let ret_value = sh::clRetainMemObject(self.0);
+            let ret_value = foreign::clRetainMemObject(self.0);
 
             return utility::check(ret_value, || {});
         }
@@ -40,7 +28,7 @@ impl Buffer {
     /// Decrements the memory object reference count.
     pub fn release(&self) -> Result {
         unsafe {
-            let ret_value = sh::clReleaseMemObject(self.0);
+            let ret_value = foreign::clReleaseMemObject(self.0);
             
             return utility::check(ret_value, || {});
         }
@@ -61,9 +49,13 @@ impl ops::Drop for Buffer {
 }
 
 #[derive(Debug)]
-pub struct Context(sh::cl_context);
+pub struct Context(foreign::cl_context);
 
 impl Context {
+    pub fn ptr(&self) -> &foreign::cl_context {
+        &self.0
+    }
+
     /// Creates an OpenCL context.
     ///
     /// An OpenCL context is created with one or more devices. Contexts are used by the OpenCL 
@@ -92,7 +84,7 @@ impl Context {
             // The number of devices specified in the devices argument.
             let number_of_devices = devices.len() as u32;
             // A pointer to a list of unique devices returned by clGetDeviceIDs for a platform.
-            let raw_devices: Vec<*mut raw::c_void> = devices.iter().map(|d| d.0).collect();
+            let raw_devices: Vec<*mut c_void> = devices.iter().map(|d| d.0).collect();
             let raw_devices_ptr = raw_devices.as_ptr();
 
             // A callback function that can be registered by the application. This callback function 
@@ -111,15 +103,15 @@ impl Context {
             // `user_data` is a pointer to user supplied data.
             //
             // TODO
-            let pfn_notify: extern fn(*const i8, *const raw::c_void, usize, *mut raw::c_void) 
+            let pfn_notify: extern fn(*const i8, *const c_void, usize, *mut c_void) 
                 = mem::transmute(ptr::null::<fn()>());
 
             // Passed as the `user_data` argument when pfn_notify is called. user_data can be NULL.
             //
             // TODO
-            let user_data: *mut raw::c_void = ptr::null_mut();
+            let user_data: *mut c_void = ptr::null_mut();
 
-            let cl_context = sh::clCreateContext(
+            let cl_context = foreign::clCreateContext(
                 properties, 
                 number_of_devices, 
                 raw_devices_ptr, 
@@ -128,7 +120,7 @@ impl Context {
                 &mut errcode_ret
             );
 
-            let ret_value = sh::CLStatus::new(errcode_ret)
+            let ret_value = foreign::CLStatus::new(errcode_ret)
                 .expect("failed to convert `i32` to `CLStatus`");
 
             return utility::check(ret_value, || Context(cl_context));
@@ -149,17 +141,17 @@ impl Context {
     /// application. The size of the buffer that host_ptr points to must be greater than or equal 
     /// to the size bytes.
     pub fn create_buffer<F, H>(&self, f: F, size: usize, h: H) -> Result<Buffer> 
-        where F: Into<Option<sh::cl_bitfield>>,
-              H: Into<Option<*mut raw::c_void>>,
+        where F: Into<Option<foreign::cl_bitfield>>,
+              H: Into<Option<*mut c_void>>,
     {
         unsafe {
             let mut errcode_ret: i32 = 0;
-            let flags = f.into().unwrap_or(sh::CL_MEM_READ_WRITE);
+            let flags = f.into().unwrap_or(foreign::CL_MEM_READ_WRITE);
             let host_pointer = h.into().unwrap_or(ptr::null_mut());
 
-            let mem = sh::clCreateBuffer(self.0, flags, size, host_pointer, &mut errcode_ret);
+            let mem = foreign::clCreateBuffer(self.0, flags, size, host_pointer, &mut errcode_ret);
 
-            let ret_value = sh::CLStatus::new(errcode_ret)
+            let ret_value = foreign::CLStatus::new(errcode_ret)
                 .expect("failed to convert `i32` to `CLStatus`");
 
             return utility::check(ret_value, || Buffer(mem));
@@ -185,9 +177,9 @@ impl Context {
             let ptrs: Vec<*const i8> = cstrings.iter().map(|s| s.as_ptr()).collect();
             let ptr = ptrs.as_ptr();
 
-            let cl_program = sh::clCreateProgramWithSource(self.0, n, ptr, lens_ptr, &mut errcode);
+            let cl_program = foreign::clCreateProgramWithSource(self.0, n, ptr, lens_ptr, &mut errcode);
 
-            let ret_value = sh::CLStatus::new(errcode).expect("failed to convert i32 to CLStatus");
+            let ret_value = foreign::CLStatus::new(errcode).expect("failed to convert i32 to CLStatus");
 
             return utility::check(ret_value, || Program(cl_program));
         }
@@ -196,7 +188,7 @@ impl Context {
     /// Increment the context reference count.
     fn retain(&self) -> Result {
         unsafe {
-            let ret_value = sh::clRetainContext(self.0);
+            let ret_value = foreign::clRetainContext(self.0);
             return utility::check(ret_value, || {});
         }
     }
@@ -204,7 +196,7 @@ impl Context {
     /// Decrement the context reference count.
     fn release(&self) -> Result {
         unsafe {
-            let ret_value = sh::clReleaseContext(self.0);
+            let ret_value = foreign::clReleaseContext(self.0);
             return utility::check(ret_value, || {});
         }
     }
@@ -218,6 +210,8 @@ impl clone::Clone for Context {
     }
 }
 
+impl cmp::Eq for Context { }
+
 impl cmp::PartialEq<Context> for Context {
 
     fn eq(&self, other: &Context) -> bool {
@@ -235,20 +229,24 @@ impl ops::Drop for Context {
 
 /// Newtype with an internal type of `cl_device_id`.
 #[derive(Clone, Debug)]
-pub struct Device(sh::cl_device_id);
+pub struct Device(foreign::cl_device_id);
 
 impl Device {
+    pub fn ptr(&self) -> &foreign::cl_device_id {
+        &self.0
+    }
+
     /// The default compute device address space size specified as an unsigned integer value 
     /// in bits. Currently supported values are 32 or 64 bits.
     pub fn address_bits(&self) -> Result<u32> {
-        let parameter = sh::CL_DEVICE_ADDRESS_BITS;
+        let parameter = foreign::CL_DEVICE_ADDRESS_BITS;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res)
     }
 
     /// Is CL_TRUE if the device is available and CL_FALSE if the device is not available.
     pub fn available(&self) -> Result<bool> {
-        let parameter = sh::CL_DEVICE_AVAILABLE;
+        let parameter = foreign::CL_DEVICE_AVAILABLE;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res != 0)
     }
@@ -257,7 +255,7 @@ impl Device {
     /// program source. Is CL_TRUE if the compiler is available. This can be CL_FALSE for the 
     /// embedded platform profile only.
     pub fn compiler_available(&self) -> Result<bool> {
-        let parameter = sh::CL_DEVICE_COMPILER_AVAILABLE;
+        let parameter = foreign::CL_DEVICE_COMPILER_AVAILABLE;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res != 0)
     }
@@ -282,7 +280,7 @@ impl Device {
 
     /// Is CL_TRUE if the OpenCL device is a little endian device and CL_FALSE otherwise.
     pub fn endian_little(&self) -> Result<bool> {
-        let parameter = sh::CL_DEVICE_ENDIAN_LITTLE;
+        let parameter = foreign::CL_DEVICE_ENDIAN_LITTLE;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res != 0)
     }
@@ -291,7 +289,7 @@ impl Device {
     /// etc. in the device. Is CL_FALSE if the device does not implement error correction. This can 
     /// be a requirement for certain clients of OpenCL.
     pub fn error_correction_support(&self) -> Result<bool> {
-        let parameter = sh::CL_DEVICE_ERROR_CORRECTION_SUPPORT;
+        let parameter = foreign::CL_DEVICE_ERROR_CORRECTION_SUPPORT;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res != 0)
     }
@@ -325,7 +323,7 @@ impl Device {
     /// cl_khr_byte_addressable_store
     /// cl_khr_fp16
     pub fn extensions(&self) -> Result<Vec<String>> {
-        let parameter = sh::CL_DEVICE_EXTENSIONS;
+        let parameter = foreign::CL_DEVICE_EXTENSIONS;
         let res = self.info(parameter, |size| vec![0u8; size], |b| b.as_mut_ptr() as _);
         res.map(|b| String::from_utf8(b).expect("UTF8 string")).map(|st| {
             st.split_whitespace().map(|s| s.into()).collect()
@@ -334,7 +332,7 @@ impl Device {
 
     /// Size of global memory cache in bytes.
     pub fn global_mem_cache_size(&self) -> Result<u64> {
-        let parameter = sh::CL_DEVICE_GLOBAL_MEM_CACHE_SIZE;
+        let parameter = foreign::CL_DEVICE_GLOBAL_MEM_CACHE_SIZE;
         let res = self.info(parameter, |_| 0u64, |b| b as *mut u64 as _)?;
         Ok(res)
     }
@@ -348,14 +346,14 @@ impl Device {
 
     /// Size of global memory cache line in bytes.
     pub fn global_mem_cacheline_size(&self) -> Result<u32> {
-        let parameter = sh::CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE;
+        let parameter = foreign::CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res)
     }
 
     /// Size of global memory cache line in bytes.
     pub fn global_mem_size(&self) -> Result<u64> {
-        let parameter = sh::CL_DEVICE_GLOBAL_MEM_SIZE;
+        let parameter = foreign::CL_DEVICE_GLOBAL_MEM_SIZE;
         let res = self.info(parameter, |_| 0u64, |b| b as *mut u64 as _)?;
         Ok(res)
     }
@@ -378,7 +376,7 @@ impl Device {
 
     /// Is CL_TRUE if images are supported by the OpenCL device and CL_FALSE otherwise.
     pub fn image_support(&self) -> Result<bool> {
-        let parameter = sh::CL_DEVICE_IMAGE_SUPPORT;
+        let parameter = foreign::CL_DEVICE_IMAGE_SUPPORT;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res != 0)
     }
@@ -386,7 +384,7 @@ impl Device {
     /// Max height of 2D image in pixels. The minimum value is 8192 if CL_DEVICE_IMAGE_SUPPORT 
     /// is CL_TRUE.
     pub fn image2d_max_height(&self) -> Result<usize> {
-        let parameter = sh::CL_DEVICE_IMAGE2D_MAX_HEIGHT;
+        let parameter = foreign::CL_DEVICE_IMAGE2D_MAX_HEIGHT;
         let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?;
         Ok(res)
     }
@@ -394,7 +392,7 @@ impl Device {
     /// Max width of 2D image in pixels. The minimum value is 8192 if CL_DEVICE_IMAGE_SUPPORT 
     /// is CL_TRUE.
     pub fn image2d_max_width(&self) -> Result<usize> {
-        let parameter = sh::CL_DEVICE_IMAGE2D_MAX_WIDTH;
+        let parameter = foreign::CL_DEVICE_IMAGE2D_MAX_WIDTH;
         let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?;
         Ok(res)
     }
@@ -402,7 +400,7 @@ impl Device {
     /// Max depth of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT 
     /// is CL_TRUE.
     pub fn image3d_max_depth(&self) -> Result<usize> {
-        let parameter = sh::CL_DEVICE_IMAGE3D_MAX_DEPTH;
+        let parameter = foreign::CL_DEVICE_IMAGE3D_MAX_DEPTH;
         let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?;
         Ok(res)
     }
@@ -410,7 +408,7 @@ impl Device {
     /// Max height of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT 
     /// is CL_TRUE.
     pub fn image3d_max_height(&self) -> Result<usize> {
-        let parameter = sh::CL_DEVICE_IMAGE3D_MAX_HEIGHT;
+        let parameter = foreign::CL_DEVICE_IMAGE3D_MAX_HEIGHT;
         let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?;
         Ok(res)
     }
@@ -418,14 +416,14 @@ impl Device {
     /// Max width of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT 
     /// is CL_TRUE.
     pub fn image3d_max_width(&self) -> Result<usize> {
-        let parameter = sh::CL_DEVICE_IMAGE3D_MAX_WIDTH;
+        let parameter = foreign::CL_DEVICE_IMAGE3D_MAX_WIDTH;
         let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?;
         Ok(res)
     }
 
     /// Size of local memory arena in bytes. The minimum value is 16 KB.
     pub fn local_mem_size(&self) -> Result<u64> {
-        let parameter = sh::CL_DEVICE_LOCAL_MEM_SIZE;
+        let parameter = foreign::CL_DEVICE_LOCAL_MEM_SIZE;
         let res = self.info(parameter, |_| 0u64, |b| b as *mut u64 as _)?;
         Ok(res)
     }
@@ -439,14 +437,14 @@ impl Device {
 
     /// Size of local memory arena in bytes. The minimum value is 16 KB.
     pub fn max_clock_frequency(&self) -> Result<u32> {
-        let parameter = sh::CL_DEVICE_MAX_CLOCK_FREQUENCY;
+        let parameter = foreign::CL_DEVICE_MAX_CLOCK_FREQUENCY;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res)
     }
 
     /// The number of parallel compute cores on the OpenCL device. The minimum value is 1.
     pub fn max_compute_units(&self) -> Result<u32> {
-        let parameter = sh::CL_DEVICE_MAX_COMPUTE_UNITS;
+        let parameter = foreign::CL_DEVICE_MAX_COMPUTE_UNITS;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res)
     }
@@ -454,14 +452,14 @@ impl Device {
     /// Max number of arguments declared with the __constant qualifier in a kernel. The minimum 
     /// value is 8.
     pub fn max_constant_args(&self) -> Result<u32> {
-        let parameter = sh::CL_DEVICE_MAX_CONSTANT_ARGS;
+        let parameter = foreign::CL_DEVICE_MAX_CONSTANT_ARGS;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res)
     }
 
     /// Max size in bytes of a constant buffer allocation. The minimum value is 64 KB.
     pub fn max_constant_buffer_size(&self) -> Result<u64> {
-        let parameter = sh::CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE;
+        let parameter = foreign::CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE;
         let res = self.info(parameter, |_| 0u64, |b| b as *mut u64 as _)?;
         Ok(res)
     }
@@ -469,14 +467,14 @@ impl Device {
     /// Max size of memory object allocation in bytes. The minimum value is 
     /// max (1/4th of CL_DEVICE_GLOBAL_MEM_SIZE, 128*1024*1024)
     pub fn max_mem_alloc_size(&self) -> Result<u64> {
-        let parameter = sh::CL_DEVICE_MAX_MEM_ALLOC_SIZE;
+        let parameter = foreign::CL_DEVICE_MAX_MEM_ALLOC_SIZE;
         let res = self.info(parameter, |_| 0u64, |b| b as *mut u64 as _)?;
         Ok(res)
     }
 
     /// Max size in bytes of the arguments that can be passed to a kernel. The minimum value is 256.
     pub fn max_parameter_size(&self) -> Result<usize> {
-        let parameter = sh::CL_DEVICE_MAX_PARAMETER_SIZE;
+        let parameter = foreign::CL_DEVICE_MAX_PARAMETER_SIZE;
         let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?;
         Ok(res)
     }
@@ -484,7 +482,7 @@ impl Device {
     /// Max number of simultaneous image objects that can be read by a kernel. The minimum value 
     /// is 128 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
     pub fn max_read_image_args(&self) -> Result<u32> {
-        let parameter = sh::CL_DEVICE_MAX_READ_IMAGE_ARGS;
+        let parameter = foreign::CL_DEVICE_MAX_READ_IMAGE_ARGS;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res)
     }
@@ -492,7 +490,7 @@ impl Device {
     /// Maximum number of samplers that can be used in a kernel. The minimum value is 16 
     /// if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
     pub fn max_samplers(&self) -> Result<u32> {
-        let parameter = sh::CL_DEVICE_MAX_SAMPLERS;
+        let parameter = foreign::CL_DEVICE_MAX_SAMPLERS;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res)
     }
@@ -500,7 +498,7 @@ impl Device {
     /// Maximum number of work-items in a work-group executing a kernel using the data parallel 
     /// execution model. (Refer to clEnqueueNDRangeKernel). The minimum value is 1.
     pub fn max_work_group_size(&self) -> Result<usize> {
-        let parameter = sh::CL_DEVICE_MAX_WORK_GROUP_SIZE;
+        let parameter = foreign::CL_DEVICE_MAX_WORK_GROUP_SIZE;
         let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?;
         Ok(res)
     }
@@ -508,7 +506,7 @@ impl Device {
     /// Maximum dimensions that specify the global and local work-item IDs used by the data 
     /// parallel execution model. (Refer to clEnqueueNDRangeKernel). The minimum value is 3.
     pub fn max_work_item_dimensions(&self) -> Result<u32> {
-        let parameter = sh::CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS;
+        let parameter = foreign::CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res)
     }
@@ -519,7 +517,7 @@ impl Device {
     /// Returns n size_t entries, where n is the value returned by the query 
     /// for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS. The minimum value is (1, 1, 1).
     pub fn max_work_item_sizes(&self) -> Result<Vec<usize>> {
-        let parameter = sh::CL_DEVICE_MAX_WORK_ITEM_SIZES;
+        let parameter = foreign::CL_DEVICE_MAX_WORK_ITEM_SIZES;
         let ve = |size| vec![1usize; size / mem::size_of::<usize>()];
         let res = self.info(parameter, ve, |b| b.as_mut_ptr() as _)?;
 
@@ -529,28 +527,28 @@ impl Device {
     /// Max number of simultaneous image objects that can be written to by a kernel. The minimum 
     /// value is 8 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE.
     pub fn max_write_image_args(&self) -> Result<u32> {
-        let parameter = sh::CL_DEVICE_MAX_WRITE_IMAGE_ARGS;
+        let parameter = foreign::CL_DEVICE_MAX_WRITE_IMAGE_ARGS;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res)
     }
 
     /// Describes the alignment in bits of the base address of any allocated memory object.
     pub fn mem_base_addr_align(&self) -> Result<u32> {
-        let parameter = sh::CL_DEVICE_MEM_BASE_ADDR_ALIGN;
+        let parameter = foreign::CL_DEVICE_MEM_BASE_ADDR_ALIGN;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res)
     }
 
     /// The smallest alignment in bytes which can be used for any data type.
     pub fn min_data_type_align_size(&self) -> Result<u32> {
-        let parameter = sh::CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE;
+        let parameter = foreign::CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res)
     }
 
     /// Device name string.
     pub fn name(&self) -> Result<String> {
-        let parameter = sh::CL_DEVICE_NAME;
+        let parameter = foreign::CL_DEVICE_NAME;
         let res = self.info(parameter, |size| vec![0u8; size], |b| b.as_mut_ptr() as _);
         res.map(|b| String::from_utf8(b).unwrap())
     }
@@ -558,7 +556,7 @@ impl Device {
 //    /// The platform associated with this device.
 //    pub fn platform(&self) -> Result {
 //
-//        let _ = sh::CL_DEVICE_PLATFORM;
+//        let _ = foreign::CL_DEVICE_PLATFORM;
 //        unimplemented!()
 //    }
 
@@ -570,7 +568,7 @@ impl Device {
 //    /// must return 0.
 //    pub fn preferred_vector_width_char(&self) -> Result<u32> {
 //
-//        let _ = sh::CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR;
+//        let _ = foreign::CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR;
 //        
 //        unimplemented!()
 //    }
@@ -583,7 +581,7 @@ impl Device {
 //    /// must return 0.
 //    pub fn preferred_vector_width_short(&self) -> Result<u32> {
 //
-//        let _ = sh::CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT;
+//        let _ = foreign::CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT;
 //        
 //        unimplemented!()
 //    }
@@ -596,7 +594,7 @@ impl Device {
 //    /// must return 0.
 //    pub fn preferred_vector_width_int(&self) -> Result<u32> {
 //
-//        let _ = sh::CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT;
+//        let _ = foreign::CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT;
 //        
 //        unimplemented!()
 //    }
@@ -609,7 +607,7 @@ impl Device {
 //    /// must return 0.
 //    pub fn preferred_vector_width_long(&self) -> Result<u32> {
 //
-//        let _ = sh::CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG;
+//        let _ = foreign::CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG;
 //        
 //        unimplemented!()
 //    }
@@ -622,7 +620,7 @@ impl Device {
 //    /// must return 0.
 //    pub fn preferred_vector_width_float(&self) -> Result<u32> {
 //
-//        let _ = sh::CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT;
+//        let _ = foreign::CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT;
 //        
 //        unimplemented!()
 //    }
@@ -635,7 +633,7 @@ impl Device {
 //    /// must return 0.
 //    pub fn preferred_vector_width_double(&self) -> Result<u32> {
 //
-//        let _ = sh::CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE;
+//        let _ = foreign::CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE;
 //        
 //        unimplemented!()
 //    }
@@ -648,14 +646,14 @@ impl Device {
     ///
     /// EMBEDDED_PROFILE - if the device supports the OpenCL embedded profile.
     pub fn profile(&self) -> Result<String> {
-        let parameter = sh::CL_DEVICE_PROFILE;
+        let parameter = foreign::CL_DEVICE_PROFILE;
         let res = self.info(parameter, |size| vec![0u8; size], |b| b.as_mut_ptr() as _);
         res.map(|b| String::from_utf8(b).unwrap())
     }
 
     /// The smallest alignment in bytes which can be used for any data type.
     pub fn profiling_timer_resolution(&self) -> Result<usize> {
-        let parameter = sh::CL_DEVICE_PROFILING_TIMER_RESOLUTION;
+        let parameter = foreign::CL_DEVICE_PROFILING_TIMER_RESOLUTION;
         let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?;
         Ok(res)
     }
@@ -699,14 +697,14 @@ impl Device {
     /// of: CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_ACCELERATOR, 
     /// or CL_DEVICE_TYPE_DEFAULT.
     pub fn type_(&self) -> Result<u64> {
-        let parameter = sh::CL_DEVICE_TYPE;
+        let parameter = foreign::CL_DEVICE_TYPE;
         let res = self.info(parameter, |_| 0u64, |b| b as *mut u64 as _)?;
         Ok(res)
     }
 
     /// Vendor name string.
     pub fn vendor(&self) -> Result<String> {
-        let parameter = sh::CL_DEVICE_VENDOR;
+        let parameter = foreign::CL_DEVICE_VENDOR;
         let res = self.info(parameter, |size| vec![0u8; size], |b| b.as_mut_ptr() as _);
         res.map(|b| String::from_utf8(b).unwrap())
     }
@@ -714,7 +712,7 @@ impl Device {
     /// A unique device vendor identifier. An example of a unique device identifier could be 
     /// the PCIe ID.
     pub fn vendor_id(&self) -> Result<u32> {
-        let parameter = sh::CL_DEVICE_VENDOR_ID;
+        let parameter = foreign::CL_DEVICE_VENDOR_ID;
         let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?;
         Ok(res)
     }
@@ -726,14 +724,14 @@ impl Device {
     ///
     /// The major_version.minor_version value returned will be 1.0.
     pub fn version(&self) -> Result<String> {
-        let parameter = sh::CL_DEVICE_VERSION;
+        let parameter = foreign::CL_DEVICE_VERSION;
         let res = self.info(parameter, |size| vec![0u8; size], |b| b.as_mut_ptr() as _);
         res.map(|b| String::from_utf8(b).unwrap())
     }
 
     /// OpenCL software driver version string in the form major_number.minor_number.
     pub fn driver_version(&self) -> Result<String> {
-        let parameter = sh::CL_DRIVER_VERSION;
+        let parameter = foreign::CL_DRIVER_VERSION;
         let res = self.info(parameter, |size| vec![0u8; size], |b| b.as_mut_ptr() as _);
         res.map(|b| String::from_utf8(b).unwrap())
     }
@@ -742,7 +740,7 @@ impl Device {
     fn info_size(&self, parameter: u32) -> Result<usize> {
         unsafe {
             let mut size = 0;
-            let ret_value = sh::clGetDeviceInfo(self.0, parameter, 0, ptr::null_mut(), &mut size);
+            let ret_value = foreign::clGetDeviceInfo(self.0, parameter, 0, ptr::null_mut(), &mut size);
 
             return utility::check(ret_value, || size);
         }
@@ -750,19 +748,21 @@ impl Device {
 
     fn info<F1, F2, T>(&self, p: u32, f1: F1, f2: F2) -> Result<T>
         where F1: Fn(usize) -> T, 
-              F2: Fn(&mut T) -> *mut raw::c_void {
+              F2: Fn(&mut T) -> *mut c_void {
         unsafe {
 
             let size = self.info_size(p)?;
             let mut ret = f1(size);
             
-            let ret_value = sh::clGetDeviceInfo(self.0, p, size, f2(&mut ret), ptr::null_mut());
+            let ret_value = foreign::clGetDeviceInfo(self.0, p, size, f2(&mut ret), ptr::null_mut());
 
             return utility::check(ret_value, || ret);
         }
     }
 }
 
+impl cmp::Eq for Device { }
+
 impl cmp::PartialEq<Device> for Device {
 
     fn eq(&self, other: &Device) -> bool {
@@ -772,19 +772,19 @@ impl cmp::PartialEq<Device> for Device {
 }
 
 // TODO use newtype: https://github.com/rust-lang/rust/issues/32146
-pub type Event = sh::cl_event;
+pub type Event = foreign::cl_event;
 
 #[derive(Debug)]
-pub struct Kernel(sh::cl_kernel);
+pub struct Kernel(foreign::cl_kernel);
 
 pub trait KernelArg {
     fn size() -> usize;
-    fn pointer(&self) -> *mut raw::c_void;
+    fn pointer(&self) -> *mut c_void;
 }
 
 impl KernelArg for Buffer {
-    fn size() -> usize { mem::size_of::<sh::cl_mem>() }
-    fn pointer(&self) -> sh::cl_mem { self.0 }
+    fn size() -> usize { mem::size_of::<foreign::cl_mem>() }
+    fn pointer(&self) -> foreign::cl_mem { unsafe { mem::transmute(self) } }
 }
 
 impl Kernel {
@@ -824,7 +824,7 @@ impl Kernel {
         unsafe {
             let size = A::size();
             let ptr = buf.pointer();
-            let ret_value = sh::clSetKernelArg(self.0, position, size, ptr);
+            let ret_value = foreign::clSetKernelArg(self.0, position, size, ptr);
             return utility::check(ret_value, || {});
         }
     }
@@ -832,7 +832,7 @@ impl Kernel {
     /// Increment the kernel reference count.
     fn retain(&self) -> Result {
         unsafe {
-            let ret_value = sh::clRetainKernel(self.0);
+            let ret_value = foreign::clRetainKernel(self.0);
             return utility::check(ret_value, || {});
         }
     }
@@ -840,7 +840,7 @@ impl Kernel {
     /// Decrement the kernel reference count.
     fn release(&self) -> Result {
         unsafe {
-            let ret_value = sh::clReleaseKernel(self.0);
+            let ret_value = foreign::clReleaseKernel(self.0);
             return utility::check(ret_value, || {});
         }
     }
@@ -863,7 +863,7 @@ impl ops::Drop for Kernel {
 
 /// Newtype with an internal type of `cl_platform_id`.
 #[derive(Clone, Debug)]
-pub struct Platform(sh::cl_platform_id);
+pub struct Platform(foreign::cl_platform_id);
 
 impl Platform {
 
@@ -876,17 +876,17 @@ impl Platform {
     /// * `EMBEDDED_PROFILE` - if the implementation supports the OpenCL embedded profile. The 
     /// embedded profile is defined to be a subset for each version of OpenCL.
     pub fn profile(&self) -> Result<String> {
-        self.info(sh::CL_PLATFORM_PROFILE)
+        self.info(foreign::CL_PLATFORM_PROFILE)
     }
 
     /// Returns the platform name.
     pub fn name(&self) -> Result<String> {
-        self.info(sh::CL_PLATFORM_NAME)
+        self.info(foreign::CL_PLATFORM_NAME)
     }
 
     /// Returns the platform vendor.
     pub fn vendor(&self) -> Result<String> {
-        self.info(sh::CL_PLATFORM_VENDOR)
+        self.info(foreign::CL_PLATFORM_VENDOR)
     }
 
     /// Returns a space-separated list of extension names (the extension names themselves do 
@@ -898,13 +898,13 @@ impl Platform {
             st.split_whitespace().map(|s| s.into()).collect()
         };
 
-        self.info(sh::CL_PLATFORM_EXTENSIONS).map(closure)
+        self.info(foreign::CL_PLATFORM_EXTENSIONS).map(closure)
     }
 
     pub fn ndevices_by_type(&self, t: u64) -> Result<u32> {
         unsafe {
             let mut ndevices = 0;
-            let ret_value = sh::clGetDeviceIDs(self.0, t, 0, ptr::null_mut(), &mut ndevices);
+            let ret_value = foreign::clGetDeviceIDs(self.0, t, 0, ptr::null_mut(), &mut ndevices);
             return utility::check(ret_value, || ndevices);
         }
     }
@@ -912,23 +912,23 @@ impl Platform {
     pub fn devices_by_type(&self, t: u64) -> Result<Vec<Device>> {
         unsafe {
             let ndevices = self.ndevices_by_type(t)?;
-            let mut vec_id = vec![0 as sh::cl_device_id; ndevices as usize];
+            let mut vec_id = vec![0 as foreign::cl_device_id; ndevices as usize];
             let n = ptr::null_mut();
 
-            let ret_value = sh::clGetDeviceIDs(self.0, t, ndevices, vec_id.as_mut_ptr(), n);
+            let ret_value = foreign::clGetDeviceIDs(self.0, t, ndevices, vec_id.as_mut_ptr(), n);
             utility::check(ret_value, || vec_id.iter().map(|&id| Device(id)).collect())
         }
     }
 
     pub fn devices(&self) -> Result<Vec<Device>> {
-        self.devices_by_type(sh::CL_DEVICE_TYPE_ALL)
+        self.devices_by_type(foreign::CL_DEVICE_TYPE_ALL)
     }
 
     /// Returns the size of `parameter`.
     fn info_size(&self, parameter: u32) -> Result<usize> {
         unsafe {
             let mut size = 0;
-            let ret_value = sh::clGetPlatformInfo(self.0, parameter, 0, ptr::null_mut(), &mut size);
+            let ret_value = foreign::clGetPlatformInfo(self.0, parameter, 0, ptr::null_mut(), &mut size);
 
             return utility::check(ret_value, || size);
         }
@@ -938,11 +938,11 @@ impl Platform {
         unsafe {
             let size = self.info_size(parameter)?;
             let mut bytes = vec![0u8; size];
-            let ret_value = sh::clGetPlatformInfo(
+            let ret_value = foreign::clGetPlatformInfo(
                 self.0, 
                 parameter, 
                 size, 
-                bytes.as_mut_ptr() as *mut raw::c_void, 
+                bytes.as_mut_ptr() as *mut c_void, 
                 ptr::null_mut()
             );
 
@@ -952,7 +952,7 @@ impl Platform {
 }
 
 #[derive(Debug)]
-pub struct Program(sh::cl_program);
+pub struct Program(foreign::cl_program);
 
 impl Program {
 
@@ -968,7 +968,7 @@ impl Program {
     pub fn build<T>(&self, devices: &[Device], opt: T) -> Result where T: Into<Option<String>> {
         unsafe {
             let num_devices = devices.len() as u32;
-            let raw_devices: Vec<sh::cl_device_id> = devices.iter().map(|d| d.0).collect();
+            let raw_devices: Vec<foreign::cl_device_id> = devices.iter().map(|d| d.0).collect();
             let raw_devices_ptr = raw_devices.as_ptr();
 
             let options = match opt.into() {
@@ -980,7 +980,7 @@ impl Program {
             let pfn_notify = mem::transmute(ptr::null::<fn()>());
             let user_data = ptr::null_mut();
 
-            let ret_value = sh::clBuildProgram(
+            let ret_value = foreign::clBuildProgram(
                 self.0, 
                 num_devices, 
                 raw_devices_ptr,
@@ -999,8 +999,8 @@ impl Program {
             let mut errcode = 0i32;
             let cstring = ffi::CString::new(name.as_ref()).unwrap();
             let ptr = cstring.as_ptr();
-            let cl_kernel = sh::clCreateKernel(self.0, ptr, &mut errcode);
-            let ret_value = sh::CLStatus::new(errcode).expect("failed to convert i32 to CLStatus");
+            let cl_kernel = foreign::clCreateKernel(self.0, ptr, &mut errcode);
+            let ret_value = foreign::CLStatus::new(errcode).expect("failed to convert i32 to CLStatus");
 
             return utility::check(ret_value, || Kernel(cl_kernel));
         }
@@ -1009,7 +1009,7 @@ impl Program {
     /// Increment the context reference count.
     fn retain(&self) -> Result {
         unsafe {
-            let ret_value = sh::clRetainProgram(self.0);
+            let ret_value = foreign::clRetainProgram(self.0);
             return utility::check(ret_value, || {});
         }
     }
@@ -1017,7 +1017,7 @@ impl Program {
     /// Decrement the context reference count.
     fn release(&self) -> Result {
         unsafe {
-            let ret_value = sh::clReleaseProgram(self.0);
+            let ret_value = foreign::clReleaseProgram(self.0);
             return utility::check(ret_value, || {});
         }
     }
@@ -1039,7 +1039,7 @@ impl ops::Drop for Program {
 }
 
 #[derive(Debug)]
-pub struct Queue(sh::cl_command_queue);
+pub struct Queue(foreign::cl_command_queue);
 
 impl Queue {
 
@@ -1047,9 +1047,9 @@ impl Queue {
     pub fn new(context: &Context, device: &Device, properties: u64) -> Result<Self> {
         unsafe {
             let mut errcode_ret: i32 = 0;
-            let cl_command_queue = sh::clCreateCommandQueue(context.0, device.0, 
+            let cl_command_queue = foreign::clCreateCommandQueue(context.0, device.0, 
                 properties, &mut errcode_ret);
-            let ret_value = sh::CLStatus::new(errcode_ret).expect("failed to convert i32 to CLStatus");
+            let ret_value = foreign::CLStatus::new(errcode_ret).expect("failed to convert i32 to CLStatus");
 
             return utility::check(ret_value, || Queue(cl_command_queue));
         }
@@ -1099,7 +1099,7 @@ impl Queue {
         blocking_write:  bool, 
         offset:          usize,
         cb:              usize, 
-        ptr:             *const raw::c_void,
+        ptr:             *const c_void,
         event_wait_list: &[Event]) 
         -> Result<Event> {
 
@@ -1109,11 +1109,11 @@ impl Queue {
             let events = 
                 if num_events_in_wait_list > 0 { event_wait_list.as_ptr() } else { ptr::null() };
 
-            let mut new_event = 0 as sh::cl_event;
+            let mut new_event = 0 as foreign::cl_event;
 
             let blocking_write_u32 = if blocking_write { 1 } else { 0 };
 
-            let ret_value = sh::clEnqueueWriteBuffer(
+            let ret_value = foreign::clEnqueueWriteBuffer(
                 self.0, 
                 buffer.0, 
                 blocking_write_u32,
@@ -1160,7 +1160,7 @@ impl Queue {
         blocking_read:   bool,
         offset:          usize,
         cb:              usize,
-        ptr:             *mut raw::c_void,
+        ptr:             *mut c_void,
         event_wait_list: &[Event])
         -> Result<Event> {
 
@@ -1169,11 +1169,11 @@ impl Queue {
             let events = 
                 if num_events_in_wait_list > 0 { event_wait_list.as_ptr() } else { ptr::null() };
 
-            let mut new_event = 0 as sh::cl_event;
+            let mut new_event = 0 as foreign::cl_event;
 
             let blocking_read_u32 = if blocking_read { 1 } else { 0 };
 
-            let ret_value = sh::clEnqueueReadBuffer(
+            let ret_value = foreign::clEnqueueReadBuffer(
                 self.0, 
                 buffer.0, 
                 blocking_read_u32,
@@ -1256,7 +1256,7 @@ impl Queue {
             let events = 
                 if num_events_in_wait_list > 0 { event_wait_list.as_ptr() } else { ptr::null() };
 
-            let mut new_event = 0 as sh::cl_event;
+            let mut new_event = 0 as foreign::cl_event;
 
             // == ptrs
             let global_work_size_ptr = 
@@ -1265,7 +1265,7 @@ impl Queue {
             let local_work_size_ptr =
                 if local_work_size.len() > 0 { local_work_size.as_ptr() } else { ptr::null() };
 
-            let ret_value = sh::clEnqueueNDRangeKernel(
+            let ret_value = foreign::clEnqueueNDRangeKernel(
                 self.0,
                 kernel.0,
                 work_dim,
@@ -1284,7 +1284,7 @@ impl Queue {
     /// Increments the command_queue reference count.
     fn retain(&self) -> Result {
         unsafe {
-            let ret_value = sh::clRetainCommandQueue(self.0);
+            let ret_value = foreign::clRetainCommandQueue(self.0);
             return utility::check(ret_value, || {});
         }
     }
@@ -1292,7 +1292,7 @@ impl Queue {
     /// Decrements the command_queue reference count.
     fn release(&self) -> Result {
         unsafe {
-            let ret_value = sh::clReleaseCommandQueue(self.0);
+            let ret_value = foreign::clReleaseCommandQueue(self.0);
             return utility::check(ret_value, || {});
         }
     }
@@ -1308,6 +1308,16 @@ impl clone::Clone for Queue {
     }
 }
 
+impl cmp::Eq for Queue { }
+
+impl cmp::PartialEq<Queue> for Queue {
+
+    fn eq(&self, other: &Queue) -> bool {
+
+        self.0 == other.0
+    }
+}
+
 impl ops::Drop for Queue {
 
     fn drop(&mut self) {
diff --git a/src/frameworks/opencl/hl/utility.rs b/src/frameworks/opencl/high/utility.rs
similarity index 81%
rename from src/frameworks/opencl/hl/utility.rs
rename to src/frameworks/opencl/high/utility.rs
index c9a36a1..944f05d 100644
--- a/src/frameworks/opencl/hl/utility.rs
+++ b/src/frameworks/opencl/high/utility.rs
@@ -1,5 +1,5 @@
-use super::error::{ErrorKind, Result};
-use super::super::sh::CLStatus;
+use super::{ErrorKind, Result};
+use super::super::foreign::CLStatus;
 
 pub fn check<F, T>(cl_status: CLStatus, ok_value: F) -> Result<T> where F: FnOnce() -> T {
     
diff --git a/src/frameworks/opencl/hl/mod.rs b/src/frameworks/opencl/hl/mod.rs
deleted file mode 100644
index d0862ca..0000000
--- a/src/frameworks/opencl/hl/mod.rs
+++ /dev/null
@@ -1,7 +0,0 @@
-pub use self::error::{Error, ErrorKind, Result};
-pub use self::high::{nplatforms, platforms};
-pub use self::high::{Buffer, Context, Device, Event, Kernel, KernelArg, Platform, Program, Queue};
-
-mod error;
-mod high;
-mod utility;
\ No newline at end of file
diff --git a/src/frameworks/opencl/interface/context.rs b/src/frameworks/opencl/interface/context.rs
new file mode 100644
index 0000000..023313f
--- /dev/null
+++ b/src/frameworks/opencl/interface/context.rs
@@ -0,0 +1,90 @@
+use {Context, Device, Error, ErrorKind, ExtensionPackage, Result};
+use std::marker::Unsize;
+use super::OpenCLDevice;
+use super::super::high;
+use utility::Uninitialized;
+
+/// Represents an OpenCL context.
+///
+/// A context is responsible for managing OpenCL objects and resources (command-queues, program 
+/// objects, kernel objects, executing kernels, etc.). The usual configuration is a single context 
+/// encapsulating multiple devices. The resources, such as [buffers][buffer] and [events][event], 
+/// can be shared across multiple devices in a single context. Other possible setups include:
+///
+/// * a single context for multiple devices
+/// * a single context for a single device
+/// * a context for each device
+///
+/// note: multi-platform contexts are not supported in OpenCL.
+///
+/// ## Programs
+///
+/// An OpenCL context can have multiple programs associated with it. Programs can be compiled
+/// individually to avoid possible name clashes due to using packages from multiple package 
+/// authors.
+///
+/// [buffer]: ./frameworks/opencl/struct.Memory.html
+/// [event]: ./frameworks/opencl/struct.Event.html
+#[derive(Debug)]
+pub struct OpenCLContext<X> {
+    /// The high-level context.
+    pub(super) context: high::Context,
+    /// Holds the extension package implementation.
+    pub(super) package: X,
+    /// A list of devices associated with the context.
+    pub(super) selection: Vec<OpenCLDevice>,
+    /// The index of the _active_ device.
+    pub(super) active: usize,
+}
+
+impl<X> OpenCLContext<X> {
+
+    /// Returns the `package`.
+    ///
+    /// [package author]
+    pub fn package(&self) -> &X {
+        &self.package
+    }
+
+    /// Returns the _active_ OpenCL device.
+    ///
+    /// [package author]
+    pub fn device(&self) -> &OpenCLDevice {
+        &self.selection[self.active]
+    }
+}
+
+impl OpenCLContext<Uninitialized> {
+    /// Creates and returns a program.
+    pub fn create_program<I>(&mut self, src: &[I]) -> Result<high::Program> where I: AsRef<str> {
+        let program = self.context.create_program_with_source(src)?;
+        let raw_devices: Vec<_> = self.selection.iter().map(|d| d.device.clone()).collect();
+        program.build(&raw_devices, None /* TODO */)?;
+        Ok(program)
+    }
+}
+
+impl<X> Context for OpenCLContext<X> where X: ExtensionPackage, Self: Unsize<X::Extension> {
+
+    type Package = X;
+
+    fn active_device(&self) -> &Device {
+
+        &self.selection[self.active]
+    }
+
+    fn set_active(&mut self, idx: usize) -> Result {
+        if idx >= self.selection.len() {
+            return Err(Error::new(ErrorKind::Other, "device index out of range"));
+        }
+
+        self.active = idx;
+
+        Ok(())
+    }
+
+    fn extension(&self) -> &X::Extension {
+
+        self
+    }
+}
\ No newline at end of file
diff --git a/src/frameworks/opencl/interface/device.rs b/src/frameworks/opencl/interface/device.rs
new file mode 100644
index 0000000..52ecd79
--- /dev/null
+++ b/src/frameworks/opencl/interface/device.rs
@@ -0,0 +1,130 @@
+use {Alloc, ComputeDevice, ErrorKind, Memory, Result, Shape, Synch, Viewable};
+use std::os::raw::c_void;
+use super::OpenCLMemory;
+use super::super::{foreign, high};
+use utility;
+
+/// Represents an OpenCL device.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct OpenCLDevice {
+    pub(super) device: high::Device,
+    pub(super) context: high::Context,
+    /// A command queue
+    ///
+    /// A command queue is the mechanism for interaction with the device. The queue is used for 
+    /// operations such as kernel launches and memory copies. At least one command queue per device
+    /// is required. Queues are used by the host application to submit work to devices and 
+    /// associated with devices within a context.
+    ///
+    /// __commands__:
+    ///
+    /// - memory copy or mapping
+    /// - device code execution
+    /// - synchronization point
+    ///
+    /// __modes__:
+    ///
+    /// - in-order
+    /// - out-of-order
+    ///
+    /// ## TODO
+    ///
+    /// * Use events to synchronize
+    pub(super) queue: high::Queue,
+}
+
+impl OpenCLDevice {
+
+    /// Returns the OpenCL command queue.
+    ///
+    /// [package author]
+    pub fn queue(&self) -> &high::Queue {
+        &self.queue
+    }
+}
+
+impl Viewable for OpenCLDevice {
+
+    fn view(&self) -> ComputeDevice {
+        ComputeDevice::OpenCL(self.clone())
+    }
+}
+
+impl<T> Alloc<T> for OpenCLDevice {
+
+
+    fn alloc(&self, shape: &Shape) -> Result<Memory<T>> {
+        // TODO
+
+        let flag = foreign::CL_MEM_READ_WRITE;
+        let length = shape.capacity();
+        let size = utility::allocated::<T>(length);
+        let buffer = self.context.create_buffer(flag, size, None)?;
+        let cl_buffer = OpenCLMemory { buf: buffer };
+        Ok(Memory::OpenCL(cl_buffer))
+    }
+
+    fn allocwrite(&self, shape: &Shape, mut data: Vec<T>) -> Result<Memory<T>> {
+        // TODO
+
+        let flag = foreign::CL_MEM_READ_ONLY | foreign::CL_MEM_COPY_HOST_PTR;
+        let length = shape.capacity();
+        let size = utility::allocated::<T>(length);
+        let pointer = data.as_mut_ptr();
+
+        // create buffer and fill it immediately
+        let buffer = self.context.create_buffer(flag, size, pointer as *mut c_void)?;
+        let cl_buffer = OpenCLMemory { buf: buffer };
+        Ok(Memory::OpenCL(cl_buffer))
+    }
+}
+
+impl<T> Synch<T> for OpenCLDevice {
+
+    fn write(&self, memory: &mut Memory<T>, _: &ComputeDevice, source: &Memory<T>) -> Result {
+        match *source {
+            Memory::Native(ref native_memory) => {
+                let cl_memory = unsafe { memory.as_opencl_unchecked() };
+
+                let length = native_memory.len();
+                let size = utility::allocated::<T>(length);
+                let slice = native_memory.as_slice_memory_order().unwrap();
+                let slice_pointer = slice.as_ptr();
+
+                let ref buf = cl_memory.buf;
+                let block = true; // TODO async
+                let offset = 0;
+                let _ = self.queue
+                    .enqueue_write_buffer(buf, block, offset, size, slice_pointer as *const c_void, &[])?;
+
+                Ok(())
+            },
+
+            _ => Err(ErrorKind::NoAvailableSynchronizationRouteFound.into()),
+        }
+    }
+
+    /// Synchronizes `memory` to `destination`.
+    fn read(&self, memory: &Memory<T>, _: &mut ComputeDevice, destination: &mut Memory<T>) -> Result {
+        match *destination {
+            Memory::Native(ref mut native_memory) => {
+                let cl_memory = unsafe { memory.as_opencl_unchecked() };
+
+                let length = native_memory.len();
+                let size = utility::allocated::<T>(length);
+                let slice = native_memory.as_slice_memory_order_mut().unwrap();
+                let slice_pointer = slice.as_mut_ptr();
+
+                let ref buf = cl_memory.buf;
+                let block = true; // TODO async
+                let offset = 0;
+                let _ = self.queue
+                    .enqueue_read_buffer(buf, block, offset, size, slice_pointer as *mut c_void, &[])?;
+
+                Ok(())
+            },
+
+            _ => Err(ErrorKind::NoAvailableSynchronizationRouteFound.into()),
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/frameworks/opencl/interface/event.rs b/src/frameworks/opencl/interface/event.rs
new file mode 100644
index 0000000..cb981ce
--- /dev/null
+++ b/src/frameworks/opencl/interface/event.rs
@@ -0,0 +1,8 @@
+/// Represents an OpenCL event.
+///
+/// Most OpenCL operations happen asynchronously on an OpenCL device. Multiple OpenCL operations 
+/// can be ordered and synchronized by way of the event objects yielded by the operations. An event
+/// object can be used as an input to other operations which will wait until the event has 
+/// finished executing to run.
+#[derive(Debug)]
+pub struct OpenCLEvent(() /* TODO */);
\ No newline at end of file
diff --git a/src/frameworks/opencl/interface/framework.rs b/src/frameworks/opencl/interface/framework.rs
new file mode 100644
index 0000000..a75bdac
--- /dev/null
+++ b/src/frameworks/opencl/interface/framework.rs
@@ -0,0 +1,150 @@
+use {BoxContext, Build, Context, Error, ErrorKind, ExtensionPackage, Framework};
+use {Hardware, HardwareKind, Result};
+use std::marker::Unsize;
+use super::{OpenCLContext, OpenCLDevice};
+use super::super::{foreign, high};
+use utility::{TryDefault, Uninitialized};
+
+const OPEN_CL: &'static str = "OpenCL";
+
+/// Provides the OpenCL framework.
+///
+/// # Flow
+///
+/// Since multiple platforms can exist, the first available platform is selected during 
+/// the initialization. A list of available devices are then provided for your choosing. Then,
+/// the provided selection of devices are used to create a context, with a command queue for each
+/// device. At this stage, a program(s) is compiled. A (host) program is essentially a collection 
+/// of kernels. A kernel is the smallest unit of execution.
+///
+/// In OpenCL, the host code can read in a kernel binary (i.e., compiled off-line) or a kernel 
+/// source file (i.e., compile on-line). More information on on-line/off-line compilation can be
+/// found [here][1]. Kernels are expensive to start, so they're typically used to do a large amount
+/// of work. Memory allocated on an OpenCL device can be used when executing kernels, and then 
+/// transfered back.
+///
+/// Work-groups, a collection of work-items, are assigned to execute on compute-units. A work-item
+/// is an instance of a kernel as runtime. That kernel instance is at a point in an index, which 
+/// can be thought of as a grid and the work-groups which contain the work-items can be thought of 
+/// as sub-grids within the grid. The work-groups can be defined explicitly or implicitly by 
+/// simply specifying the number of work-items, both dealing with data parallelism. In terms of task
+/// parallelism, kernels are executed independent of an index space.
+/// It should also be noted that there are [built-in scalar data types][2] along with
+/// [built-in functions][3].
+///
+/// [1]: https://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook/online-offline-compilation/
+/// [2]: https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/scalarDataTypes.html
+/// [3]: https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/mathFunctions.html
+#[derive(Debug)]
+pub struct OpenCL {
+    /// raw device pointers
+    cache: Vec<high::Device>,
+    /// A list of available devices for the first platform found.
+    ///
+    /// Platforms are defined by the implementation. Platforms enables the host to interact with 
+    /// OpenCL-capable devices.
+    pub available_hardware: Vec<Hardware>,
+}
+
+impl Framework for OpenCL {
+
+    const FRAMEWORK_NAME: &'static str = OPEN_CL;
+
+    fn available_hardware(&self) -> Vec<Hardware> {
+        self.available_hardware.clone()
+    }
+}
+
+impl<X> BoxContext<X> for OpenCL 
+    where X: ExtensionPackage + Build<OpenCLContext<Uninitialized>>,
+          OpenCLContext<X>: Unsize<X::Extension> 
+          {
+
+    fn enclose(&self, hw_selection: Vec<Hardware>) -> Result<Box<Context<Package = X>>> {
+        let indices: Vec<usize> = hw_selection.iter().map(|hardware| hardware.id).collect();
+
+        let devices: Vec<high::Device> = self.cache
+            .iter()
+            .enumerate()
+            .filter(|&(idx, _)| indices.contains(&idx))
+            .map(|(_, device)| device.clone())
+            .collect();
+
+        // create a context for the devices
+        let hl_context = high::Context::new(&devices)?;
+
+        let mut selection = vec![];
+
+        for raw_device in devices.into_iter() {
+
+            // create a command queue (with profiling enabled, needed for timing kernels)
+            let queue = high::Queue::new(&hl_context, &raw_device, foreign::CL_QUEUE_PROFILING_ENABLE)?;
+
+            selection.push(OpenCLDevice {
+                device: raw_device,
+                context: hl_context.clone(),
+                queue,
+            });
+        }
+
+        let mut context: OpenCLContext<Uninitialized> = OpenCLContext {
+            context: hl_context,
+            package: (),
+            selection: selection,
+            active: 0,
+        };
+
+        let package = X::build(&mut context)?;
+
+        Ok(Box::new(OpenCLContext {
+            context: context.context,
+            package: package,
+            selection: context.selection,
+            active: context.active,
+        }))
+    }
+}
+
+impl TryDefault for OpenCL {
+
+    type Err = Error;
+
+    fn try_default() -> Result<OpenCL> {
+
+        let mut cache = vec![];
+
+        let available_hardware = {
+            high::platforms()?.remove(0).devices()?.into_iter()
+                //.skip(|dev_handle| dev_handle.available())
+                .enumerate()
+                .map(|(index, dev_handle)| {
+                    let hardware = Hardware {
+                        id: index,
+                        framework: OPEN_CL,
+                        kind: match dev_handle.type_().unwrap() {
+                            foreign::CL_DEVICE_TYPE_CPU => HardwareKind::Central,
+                            foreign::CL_DEVICE_TYPE_GPU => HardwareKind::Graphics,
+                            foreign::CL_DEVICE_TYPE_ACCELERATOR => HardwareKind::Accelerator,
+                            _ => HardwareKind::Other,
+                        },
+                        name: dev_handle.name().unwrap(),
+                        compute_units: dev_handle.max_compute_units().unwrap() as usize,
+                    };
+
+                    cache.push(dev_handle);
+
+                    hardware
+                })
+                .collect()
+        };
+
+        Ok(OpenCL { cache, available_hardware })
+    }
+}
+
+impl From<high::Error> for Error {
+
+    fn from(e: high::Error) -> Error {
+        Error::new(ErrorKind::Framework { name: OPEN_CL }, e)
+    }
+}
\ No newline at end of file
diff --git a/src/frameworks/opencl/interface/image.rs b/src/frameworks/opencl/interface/image.rs
new file mode 100644
index 0000000..323048a
--- /dev/null
+++ b/src/frameworks/opencl/interface/image.rs
@@ -0,0 +1,3 @@
+/// Represents an OpenCL image, opaque 2D/3D objects.
+#[derive(Debug)]
+pub struct OpenCLImage(() /* TODO */);
\ No newline at end of file
diff --git a/src/frameworks/opencl/interface/memory.rs b/src/frameworks/opencl/interface/memory.rs
new file mode 100644
index 0000000..cbd4aa9
--- /dev/null
+++ b/src/frameworks/opencl/interface/memory.rs
@@ -0,0 +1,28 @@
+use Memory;
+use super::super::high;
+
+/// A `Memory` wraps around an OpenCL buffer id that manages its deallocation, named 
+/// as such for consistency's sake.
+///
+/// Memory objects can be copied to host memory, from host memory, or to other memory objects.
+/// Copying from the host to a device is considered _writing_. Copying from a device to the host is
+/// considered _reading_.
+///
+/// Unlike CUDA, OpenCL [buffers][1] are only context specific, not device specific. Also note: 
+/// currently, lazy allocation is used on the NVIDIA driver. That is, the buffer object, in a sense,
+/// is located _nowhere_ when allocated. It only exists when needed.
+///
+/// [1]: https://goo.gl/S9B3TL
+#[derive(Clone, Debug)]
+pub struct OpenCLMemory {
+    pub(super) buf: high::Buffer,
+}
+
+impl<T> ::opencl::high::KernelArg for Memory<T> {
+    fn size() -> usize { ::std::mem::size_of::<::opencl::foreign::cl_mem>() }
+
+    fn pointer(&self) -> ::opencl::foreign::cl_mem {
+
+        unsafe { self.as_opencl_unchecked().buf.pointer() }
+    }
+}
\ No newline at end of file
diff --git a/src/frameworks/opencl/interface/mod.rs b/src/frameworks/opencl/interface/mod.rs
new file mode 100644
index 0000000..c076a2a
--- /dev/null
+++ b/src/frameworks/opencl/interface/mod.rs
@@ -0,0 +1,13 @@
+pub use self::context::OpenCLContext;
+pub use self::device::OpenCLDevice;
+pub use self::event::OpenCLEvent;
+pub use self::framework::OpenCL;
+pub use self::image::OpenCLImage;
+pub use self::memory::OpenCLMemory;
+
+mod context;
+mod device;
+mod event;
+mod framework;
+mod image;
+mod memory;
\ No newline at end of file
diff --git a/src/frameworks/opencl/mod.rs b/src/frameworks/opencl/mod.rs
index c795c7b..0f66049 100644
--- a/src/frameworks/opencl/mod.rs
+++ b/src/frameworks/opencl/mod.rs
@@ -1,74 +1,8 @@
-//! OpenCL backend support.
-//!
-//! ## Scalar Data Types
-//!
-//! TODO
-//!
-//! Built-in scalar data types:
-//! https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/scalarDataTypes.html
-//!
-//! ## Terminology
-//!
-//! Work-group: A collection of work items + has a unique work-group ID. work-groups are assigned
-//! to execute on compute-units
-//!
-//! Work-item: An instance of a kernel at run time + has a unique ID within the work-group
-//!
-//! TODO
-//!
-//! ## Flow
-//!
-//! - Initialize the framework
-//! - Select the desired platform
-//! - Select the desired devices from the platform
-//! - Create a context 
-//! - create a command queue per device
-//! - Compile programs
-//!     - A program is essentially a collection of kernels.
-//! - Create a kernel from the successfully compiled program
-//!     - A kernel is the smallest unit of execution. Kernels are expensive to start so they're
-//!     typically used to do a large amount of work.
-//! - Specify arguments to the kernel
-//! - Allocate memory on devices
-//! - Transfer data to devices
-//! - Execute
-//! - Transfer results back
-//! - Free memory on devices
-//!
-//! ## Events (TODO)
-//!
-//! Most OpenCL operations happen asynchronously on the OpenCL Device. To provide the possibility 
-//! to order and synchronize multiple operations, the execution of an operation yields a event 
-//! object. This event can be used as an input to other operations which will wait until this event 
-//! has finished executing to run.
+//! OpenCL backend support - heterogeneous computing.
 
-pub mod hl;
-pub mod sh;
+pub mod foreign;
+pub mod high;
 
-// use super::super::Framework;
+pub use self::interface::{OpenCL, OpenCLContext, OpenCLDevice, OpenCLEvent, OpenCLImage, OpenCLMemory};
 
-/// Provides the OpenCL framework.
-#[derive(Debug)]
-pub struct OpenCL {
-    // /// A list of available devices for the first platform found.
-    // pub available_devices: Vec<OpenClDevice>,
-}
-
-/// A `Memory` wraps around an OpenCL buffer id that manages its deallocation, named 
-/// as such for consistency's sake.
-///
-/// OpenCL buffers are only context specific, not device specific.
-#[derive(Clone, Debug)]
-pub struct Memory {
-    hl: hl::Buffer,
-    capacity: usize,
-}
-
-// impl OpenCL {
-
-//     /// Attempts to initialize the framework.
-//     pub fn new() -> Result<OpenCL> {
-
-//         unimplemented!()
-//     }
-// }
\ No newline at end of file
+mod interface;
\ No newline at end of file
diff --git a/src/hardware.rs b/src/hardware.rs
new file mode 100644
index 0000000..880e52c
--- /dev/null
+++ b/src/hardware.rs
@@ -0,0 +1,116 @@
+use frameworks::native::NativeDevice;
+use frameworks::opencl::OpenCLDevice;
+use super::{Memory, Result, Shape};
+
+/// An device capable of processing data.
+///
+/// A compute device is a processor, such as a CPU or a GPU. `Device` is simply 
+/// an [alias][issue#8634] for the various trait bounds associated with a compute device.
+///
+/// [issue#8634]: https://github.com/rust-lang/rust/issues/8634
+pub trait Device: 
+    'static 
+    + Alloc<u8> + Alloc<f32> + Alloc<f64> 
+    + Synch<u8> + Synch<f32> + Synch<f64> 
+    + Viewable {
+
+    }
+
+impl<D> Device for D where D: 
+    'static 
+    + Alloc<u8> + Alloc<f32> + Alloc<f64> 
+    + Synch<u8> + Synch<f32> + Synch<f64> 
+    + Viewable {
+        
+    }
+
+/// A _viewable_ device.
+pub trait Viewable {
+
+    /// Returns a device _view_.
+    fn view(&self) -> ComputeDevice;
+}
+
+/// A wrapper around the various compute devices.
+///
+/// `ComputeDevice` and `Viewable` use the [_family_ pattern][pattern].
+///
+/// [pattern]: https://www.reddit.com/r/rust/comments/2rdoxx/enum_variants_as_types/cnezl0p/
+#[derive(Debug, Eq, PartialEq)]
+pub enum ComputeDevice {
+    /// A native host device
+    Native(NativeDevice),
+    /// An OpenCL device
+    OpenCL(OpenCLDevice),
+}
+
+impl ComputeDevice {
+
+    /// Returns the device
+    pub fn device(&self) -> &Device {
+        match *self {
+            ComputeDevice::Native(ref d) => d,
+            ComputeDevice::OpenCL(ref d) => d,
+        }
+    }
+}
+
+/// Allocator
+pub trait Alloc<T> {
+
+    /// Allocates memory on the device.
+    fn alloc(&self, shape: &Shape) -> Result<Memory<T>>;
+
+    /// Allocates and transfers memory `data` to the device.
+    fn allocwrite(&self, shape: &Shape, data: Vec<T>) -> Result<Memory<T>>;
+}
+
+/// Synchronizer
+///
+/// note: host <-> GPU for now.. GPU <-> GPU later..
+pub trait Synch<T> {
+
+    // TODO refactor
+
+    /// Synchronizes `memory` from `source`.
+    fn write(&self, memory: &mut Memory<T>, s_location: &ComputeDevice, s: &Memory<T>) -> Result;
+
+    /// Synchronizes `memory` to `destination`.
+    fn read(&self, memory: &Memory<T>, d_location: &mut ComputeDevice, d: &mut Memory<T>) -> Result;
+}
+
+/// Hardware can be GPUs, multi-core CPUs or DSPs, Cell/B.E. processor or whatever else
+/// is supported by the provided framework. The struct holds all important information about 
+/// the hardware. To execute code on hardware, turn hardware into a [`Device`].
+///
+/// [`Device`]: [device]: ./struct.Device.html
+#[derive(Clone, Debug)]
+pub struct Hardware {
+    /// The unique ID of the hardware.
+    pub id: usize,
+    /// Framework marker
+    pub framework: &'static str,
+    /// The type of compute device, such as a CPU or a GPU.
+    pub kind: HardwareKind,
+    /// The name.
+    pub name: String,
+    /// The number of compute units.
+    ///
+    /// A compute device usually has multiple compute units.
+    pub compute_units: usize,
+}
+
+/// General categories for devices, used to identify the type of a device.
+#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
+pub enum HardwareKind {
+    /// Used for accelerators. Accelerators can communicate with host processor using a peripheral
+    /// interconnect such as PCIe.
+    Accelerator,
+    /// Used for devices that are host processors. The host processor runs the implementations
+    /// and is a single or multi-core CPU.
+    Central,
+    /// Used for GPU devices.
+    Graphics,
+    /// Used for anything else.
+    Other,
+}
\ No newline at end of file
diff --git a/src/interface.rs b/src/interface.rs
deleted file mode 100644
index 9104744..0000000
--- a/src/interface.rs
+++ /dev/null
@@ -1,111 +0,0 @@
-use std::fmt::Debug;
-
-use super::{Memory, Shape};
-use super::error::Result;
-
-/// Represents the location of a buffer or memory, which the associated device can 
-/// use to access it.
-#[derive(Debug, Eq, PartialEq)]
-pub struct Address {
-    /// A string literal containing the name of the framework.
-    pub framework: &'static str,
-    /// The context identifier
-    pub context: isize,
-    /// The device identifier.
-    pub device: isize,
-}
-
-/// A device capable of processing data.
-///
-/// The `T` type associated with the [`SharedTensor`](./struct.SharedTensor.html).
-pub trait ComputeDevice<T> {
-
-    /// Allocates memory on the device.
-    fn allocate(&self, shape: &Shape) -> Result<Memory<T>>;
-
-    /// Allocates memory on the device.
-    fn allocate_with(&self, shape: &Shape, slice: &mut [T]) -> Result<Memory<T>>;
-
-    // /// Synchronizes `memory` from `source`.
-    // fn sync_in(&self, memory: &mut Memory<T>, source: &Memory<T>) -> Result;
-
-    /// Synchronizes `memory` to `destination`.
-    fn sync_out(&self, memory: &Memory<T>, destination: &mut Memory<T>) -> Result;
-
-    /// Returns the location of the device.
-    ///
-    /// The `addr` method is used by `SharedTensor`s for memory storage purposes. The _address_
-    /// is simply the name of the framework associated with the device, the device's unique
-    /// identifier, and an integer associated with the context the device is contained in.
-    fn addr(&self) -> Address;
-}
-
-/// Contexts are the heart of both OpenCL and CUDA applications. Contexts provide a container for
-/// objects such as memory, command-queues, programs/modules and kernels.
-pub trait Context: Debug {
-
-}
-
-/// A trait implemented for all frameworks. `Framework`s contain a list of all available devices as 
-/// well as other objects specific to the implementor.
-///
-/// The default framework is simply the host CPU for common computation. To make use of other
-/// devices such as GPUs, you may choose a GPGPU framework (such as OpenCL or CUDA) to access the 
-/// processing capabilities of the device(s).
-pub trait Framework: Debug {
-    /// The name of the framework.
-    ///
-    /// This associated constant is mainly used for the purposes of debugging and reporting errors.
-    ///
-    /// note: *uses the "SCREAMING_SNAKE_CASE" naming convention (e.g., `"OPEN_CL"`).
-    const FRAMEWORK_NAME: &'static str;
-
-    // type Context: Context;
-
-    // fn try_init(&self) -> Result;
-
-    // fn try_context(&self, selection: Vec<Hardware>) -> Result<Box<Context>>;
-
-    // TODO:
-    // https://github.com/rust-lang/rust/issues/29924
-    #[doc(hidden)]
-    fn name(&self) -> &'static str {
-        Self::FRAMEWORK_NAME
-    }
-}
-
-// /// The object-safe version of `Framework`.
-// trait FrameworkObject: Debug { }
-
-/// The generic hardware representation for a `ComputeDevice`.
-///
-/// A compute device is a processor, such as a CPU or a GPU.
-pub struct Hardware {
-    /// The unique ID of the hardware.
-    id: usize,
-    /// The type of compute device, such as a CPU or a GPU.
-    kind: HardwareKind,
-    /// The name.
-    name: String,
-    /// The number of compute units.
-    ///
-    /// A compute device usually has multiple compute units.
-    compute_units: usize,
-    // /// Framework marker
-    // framework: PhantomData<F>,
-}
-
-/// General categories for devices, used to identify the type of a device.
-#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
-pub enum HardwareKind {
-    /// Used for accelerators. Accelerators can communicate with host processor using a peripheral
-    /// interconnect such as PCIe.
-    Accelerator,
-    /// Used for devices that are host processors. The host processor runs the implementations
-    /// and is a single or multi-core CPU.
-    Central,
-    /// Used for GPU devices.
-    Graphics,
-    /// Used for anything else.
-    Other,
-}
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index 2bb5add..08d55a7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,7 +1,7 @@
 //! Provides a simple, unified API for running highly parallel computations on different
 //! devices across different GPGPU frameworks, allowing you to swap your backend at runtime.
 //!
-//! Parenchyma is a hard fork of [Collenchyma], a now-defunct project started at [Autumn].
+//! Parenchyma began as a hard fork of [Collenchyma], a now-defunct project started at [Autumn].
 //!
 //! ## Abstract
 //!
@@ -30,10 +30,10 @@
 //!
 //! ## Architecture
 //!
-//! The single entry point of Parenchyma is the [Backend](./struct.Backend.html) type. A
+//! The single entry point of Parenchyma is a [Backend](./struct.Backend.html). A
 //! backend is agnostic over the device it runs operations on. In order to be agnostic over the 
-//! device, such as native host CPU, GPUs, accelerators or any other devices, the backend needs to be
-//! agnostic over the framework as well. The framework is important, as it provides the interface 
+//! device, such as native host CPU, GPUs, accelerators or any other devices, the backend needs to 
+//! be agnostic over the framework as well. The framework is important, as it provides the interface 
 //! to execute operations on devices, among other things. Since different vendors of hardware use 
 //! different frameworks, it becomes important that the backend is agnostic over the framework.
 //! This allows us to run computations on any machine without having to worry about hardware 
@@ -56,13 +56,13 @@
 //! a BLAS extension. Whether or not the dot operation is executed on one GPU, multiple GPUS or on 
 //! a CPU device depends solely on how you configured the backend.
 //!
-//! ### Bundles
+//! ### Packages
 //!
-//! The concept of Parenchyma extensions has one more component - the [Bundle](./trait.Bundle.html)
+//! The concept of Parenchyma extensions has one more component - the [Package](./trait.ExtensionPackage.html)
 //! trait. As opposed to executing code on the native CPU, other devices need to compile and build 
-//! the extension manually at runtime, which makes up a significant part of a framework. We need an
+//! the extension manually at runtime which makes up a significant part of a framework. We need an
 //! instance that's able to be initialized at runtime for holding the sate and compiled 
-//! operations - which is the bundle's main purpose.
+//! operations - which is the package's main purpose.
 //!
 //! ### Memory
 //!
@@ -75,6 +75,33 @@
 //! the device. Memory can also be passed around to different backends. Operations take tensors
 //! as arguments while handling the synchronization and allocation for you.
 //!
+//! ## Example
+//!
+//! ```ignore
+//! extern crate parenchyma as pa;
+//! extern crate parenchyma_nn as pann;
+//! 
+//! use pa::{Backend, Native, OpenCL, SharedTensor};
+//! 
+//! fn main() {
+//!     let ref native: Backend = Backend::new::<Native>()?;
+//!     // Initialize an OpenCL or CUDA backend packaged with the NN extension.
+//!     let ref backend = pann::Backend::new::<OpenCL>()?;
+//! 
+//!     // Initialize two `SharedTensor`s.
+//!     let shape = 1;
+//!     let ref x = SharedTensor::<f32>::with(backend, shape, vec![3.5])?;
+//!     let ref mut result = SharedTensor::<f32>::new(shape);
+//! 
+//!     // Run the sigmoid operation, provided by the NN extension, on 
+//!     // your OpenCL/CUDA enabled GPU (or CPU, which is possible through OpenCL)
+//!     backend.sigmoid(x, result)?;
+//! 
+//!     // Print the result: `[0.97068775] shape=[1], strides=[1]`
+//!     println!("{:?}", result.read(native)?.as_native()?);
+//! }
+//! ```
+//!
 //! ## Development
 //!
 //! At the moment, Parenchyma itself will provide Rust APIs for the important 
@@ -90,9 +117,9 @@
 //! [Collenchyma]: https://github.com/autumnai/collenchyma
 //! [Autumn]: https://github.com/autumnai
 
-#![allow(warnings)]
-// #![deny(missing_docs, unused_import_braces, unused_qualifications)]
-#![feature(associated_consts, field_init_shorthand, libc, type_ascription, untagged_unions)]
+#![deny(missing_docs, missing_debug_implementations, unused_import_braces, unused_qualifications)]
+#![feature(associated_consts, pub_restricted)]
+#![feature(libc, unsize, untagged_unions)]
 
 #[macro_use] extern crate enum_primitive;
 #[macro_use] extern crate lazy_static;
@@ -103,16 +130,27 @@ extern crate libloading as lib;
 extern crate ndarray;
 
 pub mod changelog;
-pub mod error;
-pub mod frameworks;
+pub mod utility;
+
+pub use self::frameworks::{native, opencl};
 
 pub use self::backend::Backend;
-pub use self::interface::{Address, ComputeDevice, Context, Framework, Hardware, HardwareKind};
+pub use self::context::Context;
+pub use self::error::{Error, ErrorKind, Result};
+pub use self::extension::{Build, ExtensionPackage, Unextended};
+pub use self::framework::{BoxContext, Framework};
+pub use self::frameworks::native::Native;
+pub use self::frameworks::opencl::OpenCL;
+pub use self::hardware::{Alloc, ComputeDevice, Device, Hardware, HardwareKind, Synch, Viewable};
 pub use self::memory::Memory;
-pub use self::tensor::{Shape, SharedTensor, Tensor, TensorMut, u64Map};
+pub use self::tensor::{Shape, SharedTensor};
 
 mod backend;
-mod interface;
+mod context;
+mod error;
+mod extension;
+mod framework;
+mod frameworks;
+mod hardware;
 mod memory;
-mod tensor;
-mod utility;
\ No newline at end of file
+mod tensor;
\ No newline at end of file
diff --git a/src/memory.rs b/src/memory.rs
index 24830ad..2de5c1d 100644
--- a/src/memory.rs
+++ b/src/memory.rs
@@ -1,9 +1,14 @@
-use frameworks::native::Memory as NativeMemory;
-use frameworks::opencl::Memory as OpenCLMemory;
+use native::NativeMemory;
+use opencl::OpenCLMemory;
 
-/// Represents a memory object.
+/// Provides a representation for memory across different frameworks.
 ///
-/// note: downcast methods are provided.
+/// Memory is allocated by a device in a way that it is accessible for its computations.
+///
+/// Downcast methods are provided, but normally you will want to use a [`SharedTensor`] which 
+/// handles synchronization of the latest memory copy to the required device.
+///
+/// [`SharedTensor`]: ./struct.SharedTensor.html
 #[derive(Debug)]
 pub enum Memory<T> {
     // /// A CUDA memory object.
@@ -11,7 +16,6 @@ pub enum Memory<T> {
 
     /// The native memory representation.
     Native(NativeMemory<T>),
-
     /// An OpenCL Memory.
     OpenCL(OpenCLMemory),
 }
@@ -33,6 +37,22 @@ impl<T> Memory<T> {
         }
     }
 
+    /// Returns a reference to the native memory representation.
+    pub unsafe fn as_native_unchecked(&self) -> &NativeMemory<T> {
+        match *self {
+            Memory::Native(ref native) => native,
+            _ => unreachable!()
+        }
+    }
+
+    /// Returns a reference to the native memory representation.
+    pub unsafe fn as_mut_native_unchecked(&mut self) -> &mut NativeMemory<T> {
+        match *self {
+            Memory::Native(ref mut native) => native,
+            _ => unreachable!()
+        }
+    }
+
     /// Returns the native memory representation, consuming the convertee.
     pub fn into_native(self) -> Option<NativeMemory<T>> {
         match self {
@@ -57,11 +77,19 @@ impl<T> Memory<T> {
         }
     }
 
-    /// Returns the OpenCL memory, consuming the convertee.
-    pub fn into_opencl(self) -> Option<OpenCLMemory> {
-        match self {
-            Memory::OpenCL(opencl) => Some(opencl),
-            _ => None
+    /// Returns a reference to the opencl memory representation.
+    pub unsafe fn as_opencl_unchecked(&self) -> &OpenCLMemory {
+        match *self {
+            Memory::OpenCL(ref opencl) => opencl,
+            _ => unreachable!()
         }
     }
+
+    // /// Returns the OpenCL memory, consuming the convertee.
+    // pub fn into_opencl(self) -> Option<OpenCLMemory> {
+    //     match self {
+    //         Memory::OpenCL(opencl) => Some(opencl),
+    //         _ => None
+    //     }
+    // }
 }
\ No newline at end of file
diff --git a/src/tensor.rs b/src/tensor.rs
index 0a23518..01f459c 100644
--- a/src/tensor.rs
+++ b/src/tensor.rs
@@ -1,10 +1,9 @@
-use std::{convert, mem};
+use std::mem;
 use std::cell::{Cell, RefCell};
 use std::marker::PhantomData;
 
-use super::{Address, Backend, Memory};
-use super::utility;
-use super::error::{ErrorKind, Result};
+use {Alloc, ComputeDevice, Device, ErrorKind, Memory, Result, Synch};
+use utility::Has;
 
 /// A shared tensor for framework-agnostic, memory-aware, n-dimensional storage. 
 ///
@@ -12,6 +11,11 @@ use super::error::{ErrorKind, Result};
 /// for one similar piece of data. `SharedTensor` handles synchronization of memory of type `T`, by 
 /// which it is parameterized, and provides the functionality for memory management across devices.
 ///
+/// `SharedTensor` holds copies and their version numbers. A user can request any number of
+/// immutable `Tensor`s or a single mutable `Tensor` (enforced by borrowck). It's possible to 
+/// validate at runtime that tensor data is initialized when a user requests a tensor for reading
+/// and skip the initialization check if a tensor is requested only for writing.
+///
 /// ## Terminology
 ///
 /// In Parenchyma, multidimensional Rust arrays represent tensors. A vector, a tensor with a 
@@ -56,11 +60,13 @@ use super::error::{ErrorKind, Result};
 /// TODO
 /// ```
 #[derive(Debug)]
-pub struct SharedTensor<T> {
+pub struct SharedTensor<T = f32> {
     /// The shape of the shared tensor.
     pub shape: Shape,
+
     /// A vector of buffers.
-    copies: RefCell<Vec<(Address, Memory<T>)>>,
+    copies: RefCell<Vec<(ComputeDevice, Memory<T>)>>,
+
     /// Indicates whether or not memory is synchronized (synchronization state).
     ///
     /// There are only two possible states:
@@ -86,90 +92,168 @@ pub struct SharedTensor<T> {
     /// corresponding memory is _ticked_ or increased. The value `0` means that the memory object 
     /// at that specific location is uninitialized or outdated.
     versions: u64Map,
+
     /// A marker for `T`.
     phantom: PhantomData<T>,
 }
 
-impl<T> SharedTensor<T> /* TODO where T: Scalar | Float */ {
+impl<T> SharedTensor<T> where Device: Alloc<T> + Synch<T> {
 
     /// Constructs a new `SharedTensor` with a shape of `sh`.
-    pub fn new<I>(sh: I) -> Result<Self> where I: Into<Shape> {
+    pub fn new<A>(sh: A) -> Self where A: Into<Shape> {
 
         let shape = sh.into();
         let copies = RefCell::new(vec![]);
         let versions = u64Map::new();
 
-        Ok(SharedTensor { shape, copies, versions, phantom: PhantomData })
+        SharedTensor { shape, copies, versions, phantom: PhantomData }
     }
 
-    /// Constructs a new `SharedTensor` from the supplied `chunk` of data with a shape of `sh`.
-    ///
-    /// # Examples
-    ///
-    /// ```
-    /// # #![allow(dead_code)]
-    ///
-    /// let framework = Native::new();
-    /// let ref backend = Backend::with(framework)?;
-    ///
-    /// let shared: SharedTensor<f32> = SharedTensor::with(backend, [2, 2], [1., 2., 3., 4.])?;
-    /// ```
-    pub fn with<I, A>(backend: &Backend, sh: I, mut chunk: A) -> Result<Self> 
-        where I: Into<Shape>, 
-              A: AsMut<[T]> {
+    /// Constructs a new `SharedTensor` containing a `chunk` of data with a shape of `sh`.
+    pub fn with<H, I>(con: &H, sh: I, chunk: Vec<T>) -> Result<Self>
+        where H: Has<Device>,
+              I: Into<Shape>,
+              {
 
         let shape = sh.into();
-        let mut slice = chunk.as_mut();
-        let buffer = backend.compute_device::<T>().allocate_with(&shape, &mut slice)?;
-        let vec = vec![(backend.compute_device::<T>().addr(), buffer)];
-        let copies = RefCell::new(vec);
+        let device = con.get_ref();
+        let buffer = device.allocwrite(&shape, chunk)?;
+        let copies = RefCell::new(vec![(device.view(), buffer)]);
         let versions = u64Map::with(1);
 
         Ok(SharedTensor { shape, copies, versions, phantom: PhantomData })
     }
 
-    /// Pre-allocate memory on the active device and track it.
-    pub fn allocate(&mut self, backend: &Backend) -> Result {
+    /// Allocates memory on the active device and tracks it.
+    pub fn alloc<H, I>(con: &H, sh: I) -> Result<Self> 
+        where H: Has<Device>, 
+              I: Into<Shape> 
+              {
 
-        let buffer = backend.compute_device::<T>().allocate(&self.shape)?;
+        let shape = sh.into();
+        let device = con.get_ref();
+        let buffer = device.alloc(&shape)?;
+        let copies = RefCell::new(vec![(device.view(), buffer)]);
+        let versions = u64Map::with(1); // ? TODO
 
-        unimplemented!()
+        Ok(SharedTensor { shape, copies, versions, phantom: PhantomData })
+    }
+
+    /// Drops memory allocation on the specified device. Returns error if no memory has been 
+    /// allocated on this device.
+    ///
+    // TODO FIXME: synchronize memory elsewhere if possible..?
+    // TODO silence the error..?
+    pub fn dealloc<H>(&mut self, con: &H) -> Result<Memory<T>> where H: Has<Device> {
+
+        let device = con.get_ref();
+        let location = device.view();
+
+        match self.get_location_index(&location) {
+            Some(i) => {
+                let (_, memory) = self.copies.borrow_mut().remove(i);
+
+                let version = self.versions.get();
+                let mask = (1 << i) - 1;
+                let lower = version & mask;
+                let upper = (version >> 1) & (!mask);
+                self.versions.set(lower | upper);
+
+                Ok(memory)
+            },
+
+            _ => Err(ErrorKind::AllocatedMemoryNotFoundForDevice.into())
+        }
+    }
+
+    // /// Changes the capacity and shape of the tensor.
+    // ///
+    // /// **Caution**: Drops all copies which are not on the current device.
+    // ///
+    // /// `SharedTensor::reshape` is preferred over this method if the size of the old and new shape
+    // /// are identical because it will not reallocate memory.
+    // pub fn realloc<H, I>(&mut self, dev: &H, sh: I) -> Result 
+    //     where H: Has<Device>, 
+    //           I: Into<Shape> 
+    //           {
+
+    //     unimplemented!()
+    // }
+
+    /// Change the shape of the Tensor.
+    ///
+    /// # Returns
+    ///
+    /// Returns an error if the size of the new shape is not equal to the size of the old shape.
+    /// If you want to change the shape to one of a different size, use `SharedTensor::realloc`.
+    pub fn reshape<I>(&mut self, sh: I) -> Result where I: Into<Shape> {
+        let shape = sh.into();
+
+        if shape.capacity() != self.shape.capacity() {
+            return Err(ErrorKind::InvalidReshapedTensorSize.into());
+        }
+
+        self.shape = shape;
+
+        Ok(())
+    }
+
+    /// Returns the number of elements the tensor can hold without reallocating.
+    pub fn capacity(&self) -> usize {
+        self.shape.capacity()
     }
 }
 
-/// An `impl` block containing the read/write/auto-sync logic.
-impl<T> SharedTensor<T> {
+/// This block contains the read/write/auto-sync logic.
+impl<T> SharedTensor<T> where Device: Alloc<T> + Synch<T> {
 
     /// View an underlying tensor for reading on the active device.
     ///
     /// This method can fail if memory allocation fails or if no memory is initialized.
     /// The borrowck guarantees that the shared tensor outlives all of its tensors.
-    pub fn view<'shared>(&'shared self, backend: &Backend) -> Result<Tensor<'shared, T>> {
-        if self.versions.empty() {
-            return Err(ErrorKind::UninitializedMemory.into());
-        }
+    ///
+    /// Summary:
+    ///
+    /// 1) Check if there is initialized data anywhere
+    /// 2) Lookup memory and its version for `device`, allocate it if it doesn't exist
+    /// 3) Check version, if it's old, synchronize
+    pub fn read<'shared, H>(&'shared self, dev: &H) -> Result<&'shared Memory<T>> 
+        where H: Has<Device> {
 
-        let i = self.get_or_create_location_index(backend)?;
-        self.sync_if_necessary(backend, i)?;
-        self.versions.insert(i);
+        let i = self.autosync(dev, false)?;
 
         let borrowed_copies = self.copies.borrow();
 
-        let (ref address, ref buffer) = borrowed_copies[i];
+        let (_, ref buffer) = borrowed_copies[i];
 
-        let address = unsafe { utility::extend_lifetime::<'shared>(address) };
-        let memory = unsafe { utility::extend_lifetime::<'shared>(buffer) };
+        let memory = unsafe { extend_lifetime::<'shared>(buffer) };
 
-        Ok(Tensor { address, memory })
+        Ok(memory)
     }
 
     /// View an underlying tensor for reading and writing on the active device. The memory 
     /// location is set as the latest.
     ///
     /// This method can fail is memory allocation fails or if no memory is initialized.
-    pub fn view_mut<'buf>(&'buf mut self, backend: &Backend) -> Result<TensorMut<'buf, T>> {
+    ///
+    /// Summary:
+    ///
+    /// 1) Check if there is initialized data anywhere
+    /// 2) Lookup memory and its version for `device`, allocate it if it doesn't exist
+    /// 3) Check version, if it's old, synchronize
+    /// 4) Increase memory version and latest_version
+    pub fn read_write<'shared, H>(&'shared mut self, dev: &H) -> Result<&'shared mut Memory<T>> 
+        where H: Has<Device> {
+
+        let i = self.autosync(dev, true)?;
+
+        let mut borrowed_copies = self.copies.borrow_mut();
+
+        let (_, ref mut buffer) = borrowed_copies[i];
+
+        let memory = unsafe { extend_lifetime_mut::<'shared>(buffer) };
 
-        unimplemented!()
+        Ok(memory)
     }
 
     /// View an underlying tensor for writing only.
@@ -178,21 +262,41 @@ impl<T> SharedTensor<T> {
     /// be overwritten anyway. The caller must initialize all elements contained in the tensor. This
     /// convention isn't enforced, but failure to do so may result in undefined data later.
     ///
+    /// Summary:
+    ///
+    /// 1) *Skip initialization check
+    /// 2) Lookup memory and its version for `device`, allocate it if it doesn't exist
+    /// 3) *Skip synchronization
+    /// 4) Increase memory version and latest_version
+    ///
     /// TODO
     ///
     /// * Add an `invalidate` method:
     ///
     ///     If the caller fails to overwrite memory, it must call `invalidate` to return the vector
     ///     to an uninitialized state.
-    pub fn write<'buf>(&'buf mut self, backend: &Backend) -> Result<TensorMut<'buf, T>> {
+    pub fn write<'shared, H>(&'shared mut self, con: &H) -> Result<&'shared mut Memory<T>>
+        where H: Has<Device> {
+
+        let i = self.get_or_create_location_index(con)?;
+        self.versions.set(1 << i);
+
+        let mut borrowed_copies = self.copies.borrow_mut();
+
+        let (_, ref mut buffer) = borrowed_copies[i];
 
-        unimplemented!()
+        let memory = unsafe { extend_lifetime_mut::<'shared>(buffer) };
+
+        Ok(memory)
     }
+}
+
+impl<T> SharedTensor<T> where Device: Alloc<T> + Synch<T> {
 
-    fn get_location_index(&self, address: &Address) -> Option<usize> {
+    fn get_location_index(&self, location: &ComputeDevice) -> Option<usize> {
 
         for (i, l) in self.copies.borrow().iter().map(|&(ref l, _)| l).enumerate() {
-            if l.eq(address) {
+            if l.eq(location) {
                 return Some(i);
             }
         }
@@ -200,11 +304,13 @@ impl<T> SharedTensor<T> {
         None
     }
 
-    fn get_or_create_location_index(&self, backend: &Backend) -> Result<usize> {
+    fn get_or_create_location_index<H>(&self, con: &H) -> Result<usize> where H: Has<Device> {
 
-        let address = backend.compute_device::<T>().addr();
+        let device = con.get_ref();
 
-        if let Some(i) = self.get_location_index(&address) {
+        let location = device.view();
+
+        if let Some(i) = self.get_location_index(&location) {
             return Ok(i);
         }
 
@@ -212,21 +318,40 @@ impl<T> SharedTensor<T> {
             return Err(ErrorKind::BitMapCapacityExceeded.into());
         }
 
-        let memory = backend.compute_device::<T>().allocate(&self.shape)?;
-        self.copies.borrow_mut().push((address, memory));
+        let memory = device.alloc(&self.shape)?;
+        self.copies.borrow_mut().push((location, memory));
 
         Ok(self.copies.borrow().len() - 1)
     }
 
-    // TODO: 
-    //
-    // * Choose the best source to copy data from.
-    //      That would require some additional traits that return costs for transferring data 
-    //      between different backends.
-    //
-    // Actually I think that there would be only transfers between `Native` <-> `Cuda` 
-    // and `Native` <-> `OpenCL` in foreseeable future, so it's best to not over-engineer here.
-    fn sync_if_necessary(&self, backend: &Backend, destination_index: usize) -> Result {
+    /// Sync if necessary
+    ///
+    /// TODO: 
+    ///
+    /// * Choose the best source to copy data from.
+    ///      That would require some additional traits that return costs for transferring data 
+    ///      between different backends.
+    ///
+    /// note: Typically, there would be transfers between `Native` <-> `GPU` in foreseeable 
+    /// future, so it's best to not over-engineer here.
+    pub fn autosync<H>(&self, dev: &H, tick: bool) -> Result<usize> where H: Has<Device> {
+        if self.versions.empty() {
+            return Err(ErrorKind::UninitializedMemory.into());
+        }
+
+        let i = self.get_or_create_location_index(dev)?;
+        self.autosync_(i)?;
+
+        if tick {
+            self.versions.set(1 << i);
+        } else {
+            self.versions.insert(i);
+        }
+
+        Ok(i)
+    }
+
+    fn autosync_(&self, destination_index: usize) -> Result {
 
         if self.versions.contains(destination_index) {
 
@@ -246,25 +371,24 @@ impl<T> SharedTensor<T> {
         let (source, mut destination) = {
             if source_index < destination_index {
                 let (left, right) = borrowed_copies.split_at_mut(destination_index);
-                (&mut left[source_index], &mut right[0])
+                (&left[source_index], &mut right[0])
             } else {
                 let (left, right) = borrowed_copies.split_at_mut(source_index);
-                (&mut right[0], &mut left[destination_index])
+                (&right[0], &mut left[destination_index])
             }
         };
 
-        backend.compute_device().sync_out(&source.1, &mut destination.1)
+        // TODO refactor
 
-        // TODO:
-        //
         // Backends may define transfers asymmetrically. E.g. CUDA may know how to transfer to and 
         // from Native backend, while Native may know nothing about CUDA at all. So if first 
         // attempt fails we change order and try again.
+        match source.0.device().read(&source.1, &mut destination.0, &mut destination.1) {
+            Err(ref e) if e.kind() == ErrorKind::NoAvailableSynchronizationRouteFound => { },
+            ret @ _ => return ret,
+        }
 
-
-        // dst_loc.mem_transfer.sync_in(
-        //      dst_loc.mem.as_mut(), src_loc.compute_device.deref(),
-        //      src_loc.mem.deref()).map_err(|e| e.into())
+        destination.0.device().write(&mut destination.1, &source.0, &source.1)
 
         // TODO: try transfer indirectly via Native backend
     }
@@ -277,27 +401,43 @@ pub struct Shape {
     ///
     /// # Example
     ///
-    /// ```ignore
+    /// ```{.text}
     /// // The following tensor has 9 components
     ///
     /// [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
     /// ```
-    capacity: usize,
+    pub capacity: usize,
     /// The total number of indices.
     ///
     /// # Example
     ///
     /// The following tensor has a rank of 2:
     ///
-    /// ```ignore
+    /// ```{.text}
     /// [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
     /// ```
     rank: usize,
     /// The dimensions of the tensor.
-    dims: Vec<usize>,
+    pub dims: Vec<usize>,
+}
+
+impl Shape {
+
+    /// Returns the capacity
+    pub fn capacity(&self) -> usize {
+
+        self.capacity
+    }
 }
 
-impl convert::From<[usize; 1]> for Shape {
+impl From<usize> for Shape {
+
+    fn from(n: usize) -> Shape {
+        [n].into()
+    }
+}
+
+impl From<[usize; 1]> for Shape {
 
     fn from(array: [usize; 1]) -> Shape {
         let capacity = array[0];
@@ -308,7 +448,7 @@ impl convert::From<[usize; 1]> for Shape {
     }
 }
 
-impl convert::From<[usize; 2]> for Shape {
+impl From<[usize; 2]> for Shape {
 
     fn from(array: [usize; 2]) -> Shape {
         let capacity = array.iter().fold(1, |acc, &dims| acc * dims);
@@ -319,20 +459,15 @@ impl convert::From<[usize; 2]> for Shape {
     }
 }
 
-/// An immutable view.
-///
-/// TODO:
-///
-/// Parameterization over mutability would help here..
-pub struct Tensor<'a, T: 'a> {
-    address: &'a Address,
-    memory: &'a Memory<T>,
-}
+impl From<[usize; 3]> for Shape {
+
+    fn from(array: [usize; 3]) -> Shape {
+        let capacity = array.iter().fold(1, |acc, &dims| acc * dims);
+        let rank = 3;
+        let dims = array.to_vec();
 
-/// A mutable view.
-pub struct TensorMut<'a, T: 'a> {
-    address: &'a Address,
-    memory: &'a mut Memory<T>,
+        Shape { capacity, rank, dims }
+    }
 }
 
 /// A "newtype" with an internal type of `Cell<u64>`. `u64Map` uses [bit manipulation][1] to manage 
@@ -361,6 +496,10 @@ impl u64Map {
         self.0.get()
     }
 
+    fn set(&self, v: u64) {
+        self.0.set(v)
+    }
+
     fn empty(&self) -> bool {
         self.0.get() == 0
     }
@@ -378,14 +517,10 @@ impl u64Map {
     }
 }
 
-mod tests {
-    use super::u64Map;
+unsafe fn extend_lifetime<'a, 'b, T>(t: &'a T) -> &'b T {
+    mem::transmute::<&'a T, &'b T>(t)
+}
 
-    #[test]
-    fn u64_map_contains() {
-        let key = 10;
-        let u64_map = u64Map::new();
-        u64_map.insert(key);
-        assert!(u64_map.contains(key))
-    }
+unsafe fn extend_lifetime_mut<'a, 'b, T>(t: &'a mut T) -> &'b mut T {
+    mem::transmute::<&'a mut T, &'b mut T>(t)
 }
\ No newline at end of file
diff --git a/src/utility.rs b/src/utility.rs
index acccd60..9d4726d 100644
--- a/src/utility.rs
+++ b/src/utility.rs
@@ -1,14 +1,28 @@
-use std::mem;
+//! Helper functions and traits
 
-/// Returns the size of the allocated memory in bytes.
-pub fn allocated<T>(length: usize) -> usize {
-    length * mem::size_of::<T>()
+/// Marker trait for uninitialized objects.
+pub type Uninitialized = ();
+
+/// A trait for simple _has_ relationships.
+pub trait Has<T: ?Sized> {
+
+    /// Get a reference to `T`.
+    fn get_ref(&self) -> &T;
 }
 
-pub unsafe fn extend_lifetime<'a, 'b, T>(t: &'a T) -> &'b T {
-    mem::transmute::<&'a T, &'b T>(t)
+/// Attempt to construct a default value of a type.
+// TODO move to a crate?
+pub trait TryDefault: Sized {
+    /// The type returned in the event of an error.
+    type Err;
+
+    /// Returns the default value for a type.
+    fn try_default() -> Result<Self, Self::Err>;
 }
 
-unsafe fn extend_lifetime_mut<'a, 'b, T>(t: &'a mut T) -> &'b mut T {
-    mem::transmute::<&'a mut T, &'b mut T>(t)
+/// Returns the size of the allocated memory in bytes.
+pub fn allocated<T>(length: usize) -> usize {
+    use std::mem;
+
+    length * mem::size_of::<T>()
 }
\ No newline at end of file
diff --git a/tests/backend_specs.rs b/tests/backend_specs.rs
new file mode 100644
index 0000000..06565b8
--- /dev/null
+++ b/tests/backend_specs.rs
@@ -0,0 +1,14 @@
+extern crate parenchyma as pa;
+
+#[cfg(test)]
+mod backend_spec {
+    mod native {
+        use pa::{Backend, Native};
+
+        #[test]
+        fn it_can_create_default_backend() {
+            let backend: Result<Backend, _> = Backend::new::<Native>();
+            assert!(backend.is_ok());
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/cuda.rs b/tests/cuda.rs
deleted file mode 100644
index d84b9bc..0000000
--- a/tests/cuda.rs
+++ /dev/null
@@ -1,52 +0,0 @@
-// #![cfg(test)]
-
-// extern crate parenchyma;
-// extern crate parenchyma_cuda as cuda;
-
-// mod shared_memory_spec {
-//     use cuda::Cuda;
-//     use parenchyma::{Backend, Native, NativeMemory, SharedTensor};
-
-//     fn write_to_memory<T: Copy>(mem: &mut NativeMemory, data: &[T]) {
-
-//         let buffer = mem.as_mut_slice::<T>();
-
-//         for (i, datum) in data.iter().enumerate() {
-
-//             buffer[i] = *datum;
-//         }
-//     }
-
-//     #[test]
-//     fn it_creates_new_shared_memory_for_cuda() {
-
-//         let backend: Backend<Cuda> = Backend::default().unwrap();
-//         let shape = vec![10];
-//         let mut shared_data = SharedTensor::<f32>::from(shape);
-//         assert!(shared_data.write_only(backend.context()).is_ok())
-//     }
-
-//     #[test]
-//     fn it_syncs_from_native_to_cuda_and_back() {
-//         let shape = vec![3];
-
-//         let cuda: Backend<Cuda> = Backend::default().unwrap();
-
-//         let native: Backend<Native> = Backend::default().unwrap();
-
-//         let mut mem: SharedTensor<f64> = SharedTensor::from(shape);
-
-//         write_to_memory(
-//             mem.write_only(native.context()).unwrap(), 
-//             &[1.0, 2.0, 123.456]
-//         );
-
-//         assert!(mem.read(cuda.context()).is_ok());
-
-//         // It has not successfully synced to the device.
-//         // Not the other way around.
-//         assert!(mem.drop_context(native.context()).is_ok());
-
-//         assert_eq!(mem.read(native.context()).unwrap().as_slice::<f64>(), [1.0, 2.0, 123.456]);
-//     }
-// }
\ No newline at end of file
diff --git a/tests/lib.rs b/tests/lib.rs
deleted file mode 100644
index 2a59f0d..0000000
--- a/tests/lib.rs
+++ /dev/null
@@ -1,104 +0,0 @@
-// extern crate parenchyma;
-
-// #[cfg(test)]
-// mod backend_spec {
-//     use parenchyma::{Backend, Native};
-
-//     #[test]
-//     fn it_can_create_default_backend() {
-
-//         assert!(Backend::<Native>::default().is_ok());
-//     }
-// }
-
-// #[cfg(test)]
-// mod framework_spec {
-//     use parenchyma::{Framework, Native};
-
-//     #[test]
-//     fn it_can_init_native_framework() {
-//         let framework = Native::new().unwrap();
-//         assert_eq!(framework.available_devices.len(), 1);
-//     }
-// }
-
-// #[cfg(test)]
-// mod shared_memory_spec {
-//     use parenchyma::{Context, Framework, Native, NativeContext, SharedTensor};
-//     use parenchyma::error::ErrorKind;
-
-//     #[test]
-//     fn it_creates_new_shared_memory_for_native() {
-//         let native = Native::new().unwrap();
-//         let context = NativeContext::new(native.available_devices.clone()).unwrap();
-//         let ref device = context.devices()[0];
-//         let mut tensor = SharedTensor::<f32>::from(vec![10]);
-//         let data = tensor.write_only(device).unwrap().as_slice::<f32>();
-//         assert_eq!(10, data.len());
-//     }
-
-//     // #[test]
-//     // fn it_fails_on_initialized_memory_read() {
-//     //     let native = Native::new().unwrap();
-//     //     let context = NativeContext::new(native.available_devices.clone()).unwrap();
-//     //     let mut tensor = SharedTensor::<f32>::from(vec![10]);
-
-//     //     assert_eq!(tensor.read(&context).unwrap_err().kind(), ErrorKind::UninitializedMemory);
-
-//     //     assert_eq!(tensor.read_write(&context).unwrap_err().kind(), ErrorKind::UninitializedMemory);
-
-//     //     tensor.write_only(&context).unwrap();
-//     //     tensor.drop_context(&context).unwrap();
-
-//     //     assert_eq!(tensor.read(&context).unwrap_err().kind(), ErrorKind::UninitializedMemory);
-//     // }
-// }
-
-// #[cfg(test)]
-// mod tensor_spec {
-//     use parenchyma::SharedTensor;
-
-//     // #[test]
-//     // fn it_returns_correct_tensor_desc_stride() {
-
-//     //     let tensor_desc_r0 = Tensor::from(vec![]);
-//     //     let tensor_desc_r1 = Tensor::from(vec![5]);
-//     //     let tensor_desc_r2 = Tensor::from(vec![2, 4]);
-//     //     let tensor_desc_r3 = Tensor::from(vec![2, 2, 4]);
-//     //     let tensor_desc_r4 = Tensor::from(vec![2, 2, 4, 4]);
-
-//     //     assert!(vec![0; 0] == tensor_desc_r0.default_stride());
-//     //     assert_eq!(vec![1], tensor_desc_r1.default_stride());
-//     //     assert_eq!(vec![4, 1], tensor_desc_r2.default_stride());
-//     //     assert_eq!(vec![8, 4, 1], tensor_desc_r3.default_stride());
-//     //     assert_eq!(vec![32, 16, 4, 1], tensor_desc_r4.default_stride());
-//     // }
-
-//     #[test]
-//     fn it_returns_correct_size_for_rank_0() {
-//         // In order for memory to be correctly allocated, the size should never return less than 1.
-//         let tensor_desc_r0 = SharedTensor::<f32>::from(vec![]);
-//         assert_eq!(1, tensor_desc_r0.capacity());
-//     }
-
-//     // #[test]
-//     // fn it_resizes_tensor() {
-//     //     let mut shared_tensor = Tensor::<f32>::from(vec![10, 20, 30]);
-//     //     assert_eq!(shared_tensor.shape(), &[10, 20, 30]);
-
-//     //     shared_tensor.replace(vec![2, 3, 4, 5]);
-//     //     assert_eq!(shared_tensor.shape(), &[2, 3, 4, 5]);
-//     // }
-
-//     // #[test]
-//     // fn it_reshapes_correctly() {
-//     //     let mut shared_data = Tensor::<f32>::from(vec![10]);
-//     //     assert!(shared_data.reshape(vec![5, 2]).is_ok());
-//     // }
-
-//     // #[test]
-//     // fn it_returns_err_for_invalid_size_reshape() {
-//     //     let mut shared_data = Tensor::<f32>::new(vec![10]);
-//     //     assert!(shared_data.reshape(vec![10, 2]).is_err());
-//     // }
-// }
diff --git a/tests/opencl.rs b/tests/opencl.rs
deleted file mode 100644
index fd22f88..0000000
--- a/tests/opencl.rs
+++ /dev/null
@@ -1,82 +0,0 @@
-// #![cfg(test)]
-
-// extern crate parenchyma;
-// extern crate parenchyma_opencl as opencl;
-
-// mod shared_memory_spec {
-//     use opencl::OpenCL;
-//     use parenchyma::{Backend, DeviceKind, Framework, Native, NativeMemory, SharedTensor};
-
-//     fn write_to_memory<T: Copy>(mem: &mut NativeMemory, data: &[T]) {
-
-//         let buffer = mem.as_mut_slice::<T>();
-
-//         for (i, datum) in data.iter().enumerate() {
-
-//             buffer[i] = *datum;
-//         }
-//     }
-
-//     #[test]
-//     fn it_creates_new_shared_memory_for_opencl() {
-
-//         let backend: Backend<OpenCL> = Backend::default().unwrap();
-//         let shape = vec![10];
-//         let mut shared_data = SharedTensor::<f32>::from(shape);
-//         assert!(shared_data.write_only(&backend.devices()[0]).is_ok())
-//     }
-
-//     #[test]
-//     fn it_syncs_from_native_to_opencl_and_back_cpu() {
-//         let shape = vec![3];
-
-//         let framework = OpenCL::new().unwrap();
-//         let selection = framework.available_platforms[0].available_devices.clone();
-//         let cl: Backend<OpenCL> = Backend::new(framework, selection).unwrap();
-//         let cl_cpu = cl.devices().iter().filter(|d| *d.kind() == DeviceKind::Cpu).nth(0).unwrap();
-
-//         let native: Backend<Native> = Backend::default().unwrap();
-
-//         let mut mem: SharedTensor<f64> = SharedTensor::from(shape);
-
-//         write_to_memory(
-//             mem.write_only(&native.devices()[0]).unwrap(), 
-//             &[1.0, 2.0, 123.456]
-//         );
-
-//         assert!(mem.read(cl_cpu).is_ok());
-
-//         // It has not successfully synced to the device.
-//         // Not the other way around.
-//         assert!(mem.drop_device(&native.devices()[0]).is_ok());
-
-//         assert_eq!(mem.read(&native.devices()[0]).unwrap().as_slice::<f64>(), [1.0, 2.0, 123.456]);
-//     }
-
-//     #[test]
-//     fn it_syncs_from_native_to_opencl_and_back_gpu() {
-//         let shape = vec![3];
-
-//         let framework = OpenCL::new().unwrap();
-//         let selection = framework.available_platforms[0].available_devices.clone();
-//         let cl: Backend<OpenCL> = Backend::new(framework, selection).unwrap();
-//         let cl_gpu = cl.devices().iter().filter(|d| *d.kind() == DeviceKind::Gpu).nth(0).unwrap();
-
-//         let native: Backend<Native> = Backend::default().unwrap();
-
-//         let mut mem: SharedTensor<f64> = SharedTensor::from(shape);
-
-//         write_to_memory(
-//             mem.write_only(&native.devices()[0]).unwrap(), 
-//             &[1.0, 2.0, 123.456]
-//         );
-
-//         assert!(mem.read(cl_gpu).is_ok());
-
-//         // It has not successfully synced to the device.
-//         // Not the other way around.
-//         assert!(mem.drop_device(&native.devices()[0]).is_ok());
-
-//         assert_eq!(mem.read(&native.devices()[0]).unwrap().as_slice::<f64>(), [1.0, 2.0, 123.456]);
-//     }
-// }
\ No newline at end of file
diff --git a/tests/shared_memory_specs.rs b/tests/shared_memory_specs.rs
new file mode 100644
index 0000000..f051827
--- /dev/null
+++ b/tests/shared_memory_specs.rs
@@ -0,0 +1,79 @@
+extern crate parenchyma as pa;
+
+#[cfg(test)]
+mod shared_memory_spec {
+    use pa::{Backend, ErrorKind, Memory, Native, OpenCL, SharedTensor};
+
+    pub fn write(memory: &mut Memory<f32>, data: &[f32]) {
+        let ndarray = unsafe { memory.as_mut_native_unchecked() };
+        let buf = ndarray.as_slice_memory_order_mut().unwrap();
+
+        for (index, datum) in data.iter().enumerate() {
+            buf[index] = *datum;
+        }
+    }
+
+    #[test]
+    fn it_creates_new_shared_memory_for_native() {
+        let ref host: Backend = Backend::new::<Native>().unwrap();
+        let mut shared_data = SharedTensor::<f32>::new(10);
+        let tensor = shared_data.write(host).unwrap();
+        assert_eq!(tensor.as_native().unwrap().len(), 10);
+    }
+
+    #[test]
+    //#[cfg(feature = "opencl")]
+    fn it_creates_new_shared_memory_for_opencl() {
+        let ref backend: Backend = Backend::new::<OpenCL>().unwrap();
+        let mut shared_data: SharedTensor = SharedTensor::new(10);
+        assert!(shared_data.write(backend).is_ok());
+    }
+
+    #[test]
+    fn it_fails_on_initialized_memory_read() {
+        let ref host: Backend = Backend::new::<Native>().unwrap();
+        let mut shared_data = SharedTensor::<f32>::new(10);
+        assert_eq!(shared_data.read(host).unwrap_err().kind(), ErrorKind::UninitializedMemory);
+        assert_eq!(shared_data.read_write(host).unwrap_err().kind(), ErrorKind::UninitializedMemory);
+
+        // initialize memory
+        let _ = shared_data.write(host).unwrap();
+        let _ = shared_data.dealloc(host).unwrap();
+
+        assert_eq!(shared_data.read(host).unwrap_err().kind(), ErrorKind::UninitializedMemory);
+    }
+
+    #[test]
+    //#[cfg(feature = "opencl")]
+    fn it_syncs_from_native_to_opencl_and_back() {
+        let ref host: Backend = Backend::new::<Native>().unwrap();
+        let ref backend: Backend = Backend::new::<OpenCL>().unwrap();
+
+        let mut sh = SharedTensor::<f32>::new(3);
+        write(sh.write(host).unwrap(), &[1.0f32, 2.0, 123.456]);
+        let _ = sh.read(backend).unwrap();
+
+        // It has not successfully synced to the device.
+        // Not the other way around.
+
+        //let _ = sh.dealloc(host).unwrap();// TODO ?
+        let _ = sh.dealloc(backend).unwrap();
+
+        assert_eq!(
+            sh.read(host).unwrap().as_native().unwrap().as_slice_memory_order().unwrap(), 
+            [1.0, 2.0, 123.456]
+        );
+    }
+
+    #[test]
+    fn it_reshapes_correctly() {
+        let mut shared_data = SharedTensor::<f32>::new(10);
+        assert!(shared_data.reshape([5, 2]).is_ok());
+    }
+
+    #[test]
+    fn it_returns_err_for_invalid_size_reshape() {
+        let mut shared_data = SharedTensor::<f32>::new(10);
+        assert!(shared_data.reshape([10, 2]).is_err());
+    }
+}
\ No newline at end of file