Outline the overall design
* This closes #2 and closes #18
* I kept running into issues with the libloading crate (referencing #15)
* I'm currently playing around with NDArray (referencing #20), though the way the Index trait works with ArrayBase is rather complicated (a brief sketch follows this list)
* See the changelog for more information
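For context only — not part of this commit — here is a brief sketch of the ArrayBase indexing mentioned above. It assumes the `ndarray` version pinned in the Cargo manifest below and shows just the concrete `[[row, column]]` form; writing code that is generic over the storage, dimension, and index types is where the bounds get complicated.

```rust
// Sketch: concrete indexing into an ndarray matrix. Indexing resolves through
// ndarray's `Index`/`IndexMut` impls on `ArrayBase`, with a fixed-size array of
// indices as the usual concrete index type.
extern crate ndarray;

use ndarray::Array2;

fn main() {
    // A 2 x 3 matrix of zeros.
    let mut matrix = Array2::<f32>::zeros((2, 3));

    // `[[row, column]]` indexing, both for writing and reading.
    matrix[[0, 1]] = 3.5;
    assert_eq!(matrix[[0, 1]], 3.5);
}
```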
jonysy committed Mar 16, 2017
1 parent 5466f37 commit 31a2547
Showing 48 changed files with 1,662 additions and 885 deletions.
6 changes: 3 additions & 3 deletions Cargo.toml
@@ -13,8 +13,8 @@ license = "MIT/Apache-2.0"
enum_primitive = "0.1.1"
lazy_static = "0.2.4"
libloading = "0.3.2"
log = "0.3.6"
log = "0.3.7"
ndarray = "0.8.0"

[dev-dependencies]
compiletest_rs = "0.2.5"
# [dev-dependencies]
# compiletest_rs = "0.2.5"
31 changes: 31 additions & 0 deletions README.md
@@ -24,6 +24,37 @@ a few necessary additions/modifications.
> available in the Parenchyma project, as the different approaches that are currently being
> considered may prove to be better than the original approach.
## Example

Parenchyma comes without any extension packages. The following example therefore assumes that
you have added both `parenchyma` and the Parenchyma extension package `parenchyma-nn` to your
Cargo manifest.

```rust
extern crate parenchyma as pa;
extern crate parenchyma_nn as pann;

use pa::{Backend, Native, OpenCL, SharedTensor};

fn main() {
    let ref native: Backend = Backend::new::<Native>().unwrap();
    // Initialize an OpenCL or CUDA backend packaged with the NN extension.
    let ref backend = pann::Backend::new::<OpenCL>().unwrap();

    // Initialize two `SharedTensor`s.
    let shape = 1;
    let ref x = SharedTensor::<f32>::with(backend, shape, vec![3.5]).unwrap();
    let ref mut result = SharedTensor::<f32>::new(shape);

    // Run the sigmoid operation, provided by the NN extension, on
    // your OpenCL/CUDA enabled GPU (or CPU, which is possible through OpenCL)
    backend.sigmoid(x, result).unwrap();

    // Print the result: `[0.97068775] shape=[1], strides=[1]`
    println!("{:?}", result.read(native).unwrap().as_native().unwrap());
}
```

## License

Dual licensed under
139 changes: 64 additions & 75 deletions benches/shared_tensor.rs
@@ -1,94 +1,83 @@
#![feature(test)]

extern crate parenchyma;
extern crate parenchyma_opencl;
extern crate test;

use parenchyma::{Backend, Device, Framework, Native, SharedTensor};
use parenchyma::DeviceKind::{Cpu, Gpu};
use parenchyma_opencl::{OpenCL, OpenCLDevice};
use parenchyma::{Backend, Native, OpenCL, SharedTensor};
use test::Bencher;

fn native_backend() -> Backend<Native> {
Backend::default().unwrap()
fn native_backend() -> Backend {
Backend::new::<Native>().unwrap()
}

fn opencl_backend() -> Backend<OpenCL> {
Backend::default().unwrap()
fn opencl_backend() -> Backend {
Backend::new::<OpenCL>().unwrap()
}

fn sync_back_and_forth<A, B>(b: &mut Bencher, backend1: Backend<A>, backend2: Backend<B>, s: usize)
where A: Framework, B: Framework,
{
let ref dev1 = backend1.devices()[0];
let ref dev2 = backend2.devices()[0];
fn sync_back_and_forth(b: &mut Bencher, backend1: Backend, backend2: Backend, s: usize) {

let mem = &mut SharedTensor::<u8>::from(vec![s]);
let mem = &mut SharedTensor::<u8>::new(s);

// initialize and warm-up
let _ = mem.write_only(dev2).unwrap();
let _ = mem.read_write(dev1).unwrap();
let _ = mem.read_write(dev2).unwrap();
let _ = mem.write(&backend2).unwrap();
let _ = mem.read_write(&backend1).unwrap();
let _ = mem.read_write(&backend2).unwrap();

b.bytes = s as u64 * 2; // we do two transfers per iteration

b.iter(|| {
let _ = mem.read_write(dev1).unwrap();
let _ = mem.read_write(dev2).unwrap();
let _ = mem.read_write(&backend1).unwrap();
let _ = mem.read_write(&backend2).unwrap();
});
}

fn unidirectional_sync<A, B>(b: &mut Bencher, src: Backend<A>, dst: Backend<B>, size: usize)
where A: Framework, B: Framework,
{
let ref src_dev = src.devices()[0];
let ref dst_dev = dst.devices()[0];
fn unidirectional_sync(b: &mut Bencher, src: Backend, dst: Backend, size: usize) {

let mem = &mut SharedTensor::<u8>::from(vec![size]);
let mem = &mut SharedTensor::<u8>::new(size);

// initialize and warm-up
let _ = mem.write_only(src_dev).unwrap();
let _ = mem.read(dst_dev).unwrap();
let _ = mem.write(&src).unwrap();
let _ = mem.read(&dst).unwrap();

b.bytes = size as u64;

b.iter(|| {
let _ = mem.write_only(src_dev).unwrap();
let _ = mem.read(dst_dev).unwrap();
let _ = mem.write(&src).unwrap();
let _ = mem.read(&dst).unwrap();
});
}

#[inline(never)]
fn bench_256_alloc_1mb_opencl_profile(b: &mut Bencher, device: &OpenCLDevice, size: usize) {
b.iter(||
for _ in 0..256 {
let _ = device.allocate_memory(size).unwrap(); });
}
// #[inline(never)]
// fn bench_256_alloc_1mb_opencl_profile(b: &mut Bencher, device: &OpenCLDevice, size: usize) {
// b.iter(||
// for _ in 0..256 {
// let _ = device.allocate_memory(size).unwrap(); });
// }

// #[bench]
// fn bench_256_alloc_1mb_opencl_cpu(b: &mut Bencher) {
// let opencl_backend = opencl_backend();
// let cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();
// // #[bench]
// // fn bench_256_alloc_1mb_opencl_cpu(b: &mut Bencher) {
// // let opencl_backend = opencl_backend();
// // let cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();

// bench_256_alloc_1mb_opencl_profile(b, cpu, 1_048_576);
// }
// // bench_256_alloc_1mb_opencl_profile(b, cpu, 1_048_576);
// // }

// // #[bench]
// // fn bench_256_alloc_1mb_opencl_gpu(b: &mut Bencher) {
// // let opencl_backend = opencl_backend();
// // let gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();

// // bench_256_alloc_1mb_opencl_profile(b, gpu, 1_048_576);
// // }

// #[bench]
// fn bench_256_alloc_1mb_opencl_gpu(b: &mut Bencher) {
// fn bench_256_alloc_1mb_opencl(b: &mut Bencher) {
// let opencl_backend = opencl_backend();
// let gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();
// let ref d = opencl_backend.devices()[0];

// bench_256_alloc_1mb_opencl_profile(b, gpu, 1_048_576);
// bench_256_alloc_1mb_opencl_profile(b, d, 1_048_576);
// }

#[bench]
fn bench_256_alloc_1mb_opencl(b: &mut Bencher) {
let opencl_backend = opencl_backend();
let ref d = opencl_backend.devices()[0];

bench_256_alloc_1mb_opencl_profile(b, d, 1_048_576);
}

#[bench]
fn bench_sync_1kb_native_opencl_back_and_forth(b: &mut Bencher) {
sync_back_and_forth(b, opencl_backend(), native_backend(), 1024);
@@ -134,33 +123,33 @@ fn bench_sync_128mb_opencl_to_native(b: &mut Bencher) {
unidirectional_sync(b, opencl_backend(), native_backend(), 128 * 1_048_576);
}

// fn bench_shared_tensor_access_time_first_(b: &mut Bencher, device: &OpenCLDevice) {
// // fn bench_shared_tensor_access_time_first_(b: &mut Bencher, device: &OpenCLDevice) {

// let native_backend = native_backend();
// let ref native_cpu = native_backend.devices()[0];
// // let native_backend = native_backend();
// // let ref native_cpu = native_backend.devices()[0];

// let mut x = SharedTensor::<f32>::from(vec![128]);
// x.write_only(native_cpu).unwrap();
// x.write_only(device).unwrap();
// x.read(native_cpu).unwrap();
// // let mut x = SharedTensor::<f32>::from(vec![128]);
// // x.write_only(native_cpu).unwrap();
// // x.write_only(device).unwrap();
// // x.read(native_cpu).unwrap();

// b.iter(|| {
// let _ = x.read(native_cpu).unwrap();
// })
// }
// // b.iter(|| {
// // let _ = x.read(native_cpu).unwrap();
// // })
// // }

// #[bench]
// fn bench_shared_tensor_access_time_first_cpu(b: &mut Bencher) {
// let opencl_backend = opencl_backend();
// let opencl_cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();
// // #[bench]
// // fn bench_shared_tensor_access_time_first_cpu(b: &mut Bencher) {
// // let opencl_backend = opencl_backend();
// // let opencl_cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();

// bench_shared_tensor_access_time_first_(b, opencl_cpu);
// }
// // bench_shared_tensor_access_time_first_(b, opencl_cpu);
// // }

// #[bench]
// fn bench_shared_tensor_access_time_first_gpu(b: &mut Bencher) {
// let opencl_backend = opencl_backend();
// let opencl_gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();
// // #[bench]
// // fn bench_shared_tensor_access_time_first_gpu(b: &mut Bencher) {
// // let opencl_backend = opencl_backend();
// // let opencl_gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();

// bench_shared_tensor_access_time_first_(b, opencl_gpu);
// }
// // bench_shared_tensor_access_time_first_(b, opencl_gpu);
// // }
106 changes: 51 additions & 55 deletions src/backend.rs
@@ -1,5 +1,7 @@
use super::{Context, ComputeDevice, Framework};
use super::error::Result;
use std::ops;
use super::{BoxContext, Context, Device, Error, ExtensionPackage, Framework, Hardware, Unextended};
use super::Result;
use utility::{self, TryDefault};

/// The heart of Parenchyma - provides an interface for running parallel computations on one or
/// more devices.
@@ -15,76 +17,70 @@ use super::error::Result;
/// the framework to the [`Backend::new`](#method.new) associated function, or by simply
/// calling [`Backend::default`](#method.default). The framework determines which devices are
/// available and how parallel kernel functions can be executed.
///
/// ## Examples
///
/// ```rust
/// use parenchyma::{Backend, Framework, Native};
///
///
/// // Construct a new framework.
/// let framework = Native::new().expect("failed to initialize framework");
///
/// // Available devices can be obtained through the framework.
/// let selection = framework.available_devices.clone();
///
/// // Create a ready to go backend from the framework.
/// let backend = Backend::new(framework, selection).expect("failed to construct backend");
///
/// // ..
/// ```
///
/// Construct a default backend:
///
/// ```rust
/// use parenchyma::{Backend, Native};
///
/// // A default native backend.
/// let backend: Backend<Native> = Backend::default().expect("something went wrong!");
///
/// // ..
/// ```
#[derive(Debug)]
pub struct Backend {
pub struct Backend<X = Unextended> {
/// The initialized framework.
pub framework: Box<Framework>, /* &'static str,*/
///
/// The Framework implementation such as OpenCL, CUDA, etc. defines, which should be used and
/// determines which hardwares will be available and how parallel kernel functions can be
/// executed.
framework: Box<Framework>,
/// The context associated with the `framework`.
///
/// Contexts are the heart of both OpenCL and CUDA applications. See the [`Context`] trait for
/// more information.
/// Contexts are the heart of both OpenCL and CUDA applications. Contexts are created from one
/// or more devices that are capable of executing methods and synchronizing memory. See
/// the [`Context`] trait for more information.
///
/// [`Context`]: (./trait.Context.html)
pub context: Box<Context>,
/// The chosen device
///
/// The default active device is the first device found (index = `0`).
active: usize,
context: Box<Context<Package = X>>,
}

impl Backend {
impl<X> Backend<X> where X: ExtensionPackage {

/// Constructs a backend using the most potent framework given the underlying hardware.
pub fn new() -> Backend {
/// Initialize a new backend.
pub fn new<F>() -> Result<Self> where F: BoxContext<X> + Framework + TryDefault<Err = Error> {

unimplemented!()
let framework = Box::new(F::try_default()?);
let selection = framework.available_hardware();
let context = framework.enclose(selection)?;

Ok(Backend { framework: framework, context })
}

/// Attempts to construct a backend from the specified `framework`.
pub fn with<F>(framework: F) -> Result<Backend> where F: Framework {
/// Constructs a backend from the specified `framework` and `selection`.
pub fn with<F>(fwrk: F, selection: Vec<Hardware>) -> Result<Self>
where F: BoxContext<X> + Framework {

let framework = Box::new(fwrk);
let context = framework.enclose(selection)?;

unimplemented!()
Ok(Backend { framework, context })
}

// /// Try all provided `frameworks` in the specified order, choosing the first framework that
// // initializes without failure.
// pub fn try(frameworks: Vec<Box<Framework>>) -> Result<Backend>;
/// Set the device at the specified `index` as the active device.
///
/// Only one device can be the _active_ device - the device in which operations are executed.
pub fn set_active(&mut self, index: usize) -> Result {

self.context.set_active(index)
}
}

impl Backend {
impl<X> ops::Deref for Backend<X> where X: ExtensionPackage {

type Target = X::Extension;

/// Returns the current device.
pub fn compute_device<T>(&self) -> &ComputeDevice<T> {
fn deref<'a>(&'a self) -> &'a X::Extension {

unimplemented!()
self.context.extension()
}
}
}

impl<X> utility::Has<Device> for Backend<X> where X: ExtensionPackage {

fn get_ref(&self) -> &Device {
self.context.active_device()
}
}

// pub trait AsBackend { }
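
To tie the pieces above together, here is a minimal usage sketch of the reworked `Backend`. It combines the constructor and `set_active` from this diff with the `SharedTensor` calls shown in the README example earlier in the commit; treat it as an illustration of the intended API rather than a tested program.

```rust
// Sketch: constructing an unextended (`X = Unextended`) native backend via the
// new `Backend::new::<F>()` path, selecting the active device, and using the
// backend itself as the synchronization target for a tensor.
extern crate parenchyma as pa;

use pa::{Backend, Native, SharedTensor};

fn main() {
    // Builds a context from the framework's default hardware selection.
    let mut backend: Backend = Backend::new::<Native>().unwrap();

    // Index 0 is already the default active device; shown here for completeness.
    backend.set_active(0).unwrap();

    // Allocate a one-element tensor against the backend and read it back.
    let tensor = SharedTensor::<f32>::with(&backend, 1, vec![1.0]).unwrap();
    println!("{:?}", tensor.read(&backend).unwrap().as_native().unwrap());
}
```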
5 changes: 4 additions & 1 deletion src/changelog.rs
@@ -6,7 +6,10 @@
/// * Partially implemented a CUDA API wrapper
/// * Partially implemented native support
/// * Worked on a fallback mechanism (see issue#15)
/// * Chose a tensor lib (ndarray)
/// * Use a tensor lib (ndarray) as the underlying native memory representation
/// * No longer requires framework related feature flags (from the original Collenchyma project)
/// * Implemented auto-sync
/// * Add `Bundle` logic
/// * Removed `IBinary`/`HashMap` technique. Use structs instead
/// * No longer requires backends parameterized by a framework
pub mod r0_0_3 {}
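
The "auto-sync" entry above is the behaviour exercised by the updated benchmarks: a tensor written through one backend can later be read through another without an explicit synchronization call. A minimal sketch, assuming an OpenCL device is available:

```rust
// Sketch: the `write`/`read` calls mirror the updated benchmarks; any required
// host<->device transfer happens inside `read` rather than via an explicit
// sync step.
extern crate parenchyma as pa;

use pa::{Backend, Native, OpenCL, SharedTensor};

fn main() {
    let native: Backend = Backend::new::<Native>().unwrap();
    let opencl: Backend = Backend::new::<OpenCL>().unwrap();

    let mut tensor = SharedTensor::<u8>::new(1024);

    // Write on the OpenCL backend...
    let _ = tensor.write(&opencl).unwrap();
    // ...then read on the native backend; the data follows automatically.
    let _ = tensor.read(&native).unwrap();
}
```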