From 31a2547e05bc38a5426d47217934f35ed049aa13 Mon Sep 17 00:00:00 2001 From: Jony Date: Thu, 16 Mar 2017 18:16:10 -0400 Subject: [PATCH] Outline the overall design * This closes #2 and closes #18 * I kept running into issues w/ the libloading crate (referencing #15) * I'm currently playing around with NDArray (referencing #20), though the way the Index trait works with ArrayBase is rather complicated * See the changelog for more information --- Cargo.toml | 6 +- README.md | 31 ++ benches/shared_tensor.rs | 139 ++++--- src/backend.rs | 106 +++--- src/changelog.rs | 5 +- src/context.rs | 27 ++ src/error.rs | 4 +- src/extension.rs | 80 +++++ src/framework.rs | 32 ++ src/frameworks/cuda/{sh.rs => foreign.rs} | 0 src/frameworks/cuda/{hl => high}/error.rs | 0 .../cuda/{hl/high.rs => high/mod.rs} | 3 + src/frameworks/cuda/{hl => high}/utility.rs | 0 src/frameworks/cuda/hl/mod.rs | 7 - src/frameworks/cuda/interface.rs | 6 + src/frameworks/cuda/mod.rs | 11 - src/frameworks/{loader.rs => macros.rs} | 2 +- src/frameworks/mod.rs | 3 +- src/frameworks/native/context.rs | 23 ++ src/frameworks/native/device.rs | 88 +++++ src/frameworks/native/framework.rs | 57 +++ src/frameworks/native/memory.rs | 6 + src/frameworks/native/mod.rs | 27 +- src/frameworks/opencl/{sh.rs => foreign.rs} | 17 +- src/frameworks/opencl/{hl => high}/error.rs | 16 +- src/frameworks/opencl/high/functions.rs | 25 ++ .../opencl/{hl/high.rs => high/mod.rs} | 268 +++++++------- src/frameworks/opencl/{hl => high}/utility.rs | 4 +- src/frameworks/opencl/hl/mod.rs | 7 - src/frameworks/opencl/interface/context.rs | 90 +++++ src/frameworks/opencl/interface/device.rs | 130 +++++++ src/frameworks/opencl/interface/event.rs | 8 + src/frameworks/opencl/interface/framework.rs | 150 ++++++++ src/frameworks/opencl/interface/image.rs | 3 + src/frameworks/opencl/interface/memory.rs | 28 ++ src/frameworks/opencl/interface/mod.rs | 13 + src/frameworks/opencl/mod.rs | 76 +--- src/hardware.rs | 116 ++++++ src/interface.rs | 111 ------ src/lib.rs | 74 +++- src/memory.rs | 48 ++- src/tensor.rs | 339 ++++++++++++------ src/utility.rs | 30 +- tests/backend_specs.rs | 14 + tests/cuda.rs | 52 --- tests/lib.rs | 104 ------ tests/opencl.rs | 82 ----- tests/shared_memory_specs.rs | 79 ++++ 48 files changed, 1662 insertions(+), 885 deletions(-) create mode 100644 src/context.rs create mode 100644 src/extension.rs create mode 100644 src/framework.rs rename src/frameworks/cuda/{sh.rs => foreign.rs} (100%) rename src/frameworks/cuda/{hl => high}/error.rs (100%) rename src/frameworks/cuda/{hl/high.rs => high/mod.rs} (99%) rename src/frameworks/cuda/{hl => high}/utility.rs (100%) delete mode 100644 src/frameworks/cuda/hl/mod.rs create mode 100644 src/frameworks/cuda/interface.rs rename src/frameworks/{loader.rs => macros.rs} (95%) create mode 100644 src/frameworks/native/context.rs create mode 100644 src/frameworks/native/device.rs create mode 100644 src/frameworks/native/framework.rs create mode 100644 src/frameworks/native/memory.rs rename src/frameworks/opencl/{sh.rs => foreign.rs} (99%) rename src/frameworks/opencl/{hl => high}/error.rs (95%) create mode 100644 src/frameworks/opencl/high/functions.rs rename src/frameworks/opencl/{hl/high.rs => high/mod.rs} (86%) rename src/frameworks/opencl/{hl => high}/utility.rs (81%) delete mode 100644 src/frameworks/opencl/hl/mod.rs create mode 100644 src/frameworks/opencl/interface/context.rs create mode 100644 src/frameworks/opencl/interface/device.rs create mode 100644 src/frameworks/opencl/interface/event.rs create mode 100644 src/frameworks/opencl/interface/framework.rs create mode 100644 src/frameworks/opencl/interface/image.rs create mode 100644 src/frameworks/opencl/interface/memory.rs create mode 100644 src/frameworks/opencl/interface/mod.rs create mode 100644 src/hardware.rs delete mode 100644 src/interface.rs create mode 100644 tests/backend_specs.rs delete mode 100644 tests/cuda.rs delete mode 100644 tests/lib.rs delete mode 100644 tests/opencl.rs create mode 100644 tests/shared_memory_specs.rs diff --git a/Cargo.toml b/Cargo.toml index ce53d61..27b77e4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,8 +13,8 @@ license = "MIT/Apache-2.0" enum_primitive = "0.1.1" lazy_static = "0.2.4" libloading = "0.3.2" -log = "0.3.6" +log = "0.3.7" ndarray = "0.8.0" -[dev-dependencies] -compiletest_rs = "0.2.5" \ No newline at end of file +# [dev-dependencies] +# compiletest_rs = "0.2.5" \ No newline at end of file diff --git a/README.md b/README.md index d0213b4..056a630 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,37 @@ a few necessary additions/modifications. > available in the Parenchyma project, as the different approaches that are currently being > considered may prove to be better than the original approach. +## Example + +Parenchyma comes without any extension packages. The following example therefore assumes that +you have add both `parenchyma` and the Parenchyma ExtensionPackage `parenchyma-nn` to your +Cargo manifest. + +```rust +extern crate parenchyma as pa; +extern crate parenchyma_nn as pann; + +use pa::{Backend, Native, OpenCL, SharedTensor}; + +fn main() { + let ref native: Backend = Backend::new::().unwrap(); + // Initialize an OpenCL or CUDA backend packaged with the NN extension. + let ref backend = pann::Backend::new::().unwrap(); + + // Initialize two `SharedTensor`s. + let shape = 1; + let ref x = SharedTensor::::with(backend, shape, vec![3.5]).unwrap(); + let ref mut result = SharedTensor::::new(shape); + + // Run the sigmoid operation, provided by the NN extension, on + // your OpenCL/CUDA enabled GPU (or CPU, which is possible through OpenCL) + backend.sigmoid(x, result).unwrap(); + + // Print the result: `[0.97068775] shape=[1], strides=[1]` + println!("{:?}", result.read(native).unwrap().as_native().unwrap()); +} +``` + ## License Dual licensed under diff --git a/benches/shared_tensor.rs b/benches/shared_tensor.rs index ada945c..564fb1d 100644 --- a/benches/shared_tensor.rs +++ b/benches/shared_tensor.rs @@ -1,94 +1,83 @@ #![feature(test)] extern crate parenchyma; -extern crate parenchyma_opencl; extern crate test; -use parenchyma::{Backend, Device, Framework, Native, SharedTensor}; -use parenchyma::DeviceKind::{Cpu, Gpu}; -use parenchyma_opencl::{OpenCL, OpenCLDevice}; +use parenchyma::{Backend, Native, OpenCL, SharedTensor}; use test::Bencher; -fn native_backend() -> Backend { - Backend::default().unwrap() +fn native_backend() -> Backend { + Backend::new::().unwrap() } -fn opencl_backend() -> Backend { - Backend::default().unwrap() +fn opencl_backend() -> Backend { + Backend::new::().unwrap() } -fn sync_back_and_forth(b: &mut Bencher, backend1: Backend, backend2: Backend, s: usize) - where A: Framework, B: Framework, -{ - let ref dev1 = backend1.devices()[0]; - let ref dev2 = backend2.devices()[0]; +fn sync_back_and_forth(b: &mut Bencher, backend1: Backend, backend2: Backend, s: usize) { - let mem = &mut SharedTensor::::from(vec![s]); + let mem = &mut SharedTensor::::new(s); // initialize and warm-up - let _ = mem.write_only(dev2).unwrap(); - let _ = mem.read_write(dev1).unwrap(); - let _ = mem.read_write(dev2).unwrap(); + let _ = mem.write(&backend2).unwrap(); + let _ = mem.read_write(&backend1).unwrap(); + let _ = mem.read_write(&backend2).unwrap(); b.bytes = s as u64 * 2; // we do two transfers per iteration b.iter(|| { - let _ = mem.read_write(dev1).unwrap(); - let _ = mem.read_write(dev2).unwrap(); + let _ = mem.read_write(&backend1).unwrap(); + let _ = mem.read_write(&backend2).unwrap(); }); } -fn unidirectional_sync(b: &mut Bencher, src: Backend, dst: Backend, size: usize) - where A: Framework, B: Framework, -{ - let ref src_dev = src.devices()[0]; - let ref dst_dev = dst.devices()[0]; +fn unidirectional_sync(b: &mut Bencher, src: Backend, dst: Backend, size: usize) { - let mem = &mut SharedTensor::::from(vec![size]); + let mem = &mut SharedTensor::::new(size); // initialize and warm-up - let _ = mem.write_only(src_dev).unwrap(); - let _ = mem.read(dst_dev).unwrap(); + let _ = mem.write(&src).unwrap(); + let _ = mem.read(&dst).unwrap(); b.bytes = size as u64; b.iter(|| { - let _ = mem.write_only(src_dev).unwrap(); - let _ = mem.read(dst_dev).unwrap(); + let _ = mem.write(&src).unwrap(); + let _ = mem.read(&dst).unwrap(); }); } -#[inline(never)] -fn bench_256_alloc_1mb_opencl_profile(b: &mut Bencher, device: &OpenCLDevice, size: usize) { - b.iter(|| - for _ in 0..256 { - let _ = device.allocate_memory(size).unwrap(); }); -} +// #[inline(never)] +// fn bench_256_alloc_1mb_opencl_profile(b: &mut Bencher, device: &OpenCLDevice, size: usize) { +// b.iter(|| +// for _ in 0..256 { +// let _ = device.allocate_memory(size).unwrap(); }); +// } -// #[bench] -// fn bench_256_alloc_1mb_opencl_cpu(b: &mut Bencher) { -// let opencl_backend = opencl_backend(); -// let cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap(); +// // #[bench] +// // fn bench_256_alloc_1mb_opencl_cpu(b: &mut Bencher) { +// // let opencl_backend = opencl_backend(); +// // let cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap(); -// bench_256_alloc_1mb_opencl_profile(b, cpu, 1_048_576); -// } +// // bench_256_alloc_1mb_opencl_profile(b, cpu, 1_048_576); +// // } + +// // #[bench] +// // fn bench_256_alloc_1mb_opencl_gpu(b: &mut Bencher) { +// // let opencl_backend = opencl_backend(); +// // let gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap(); + +// // bench_256_alloc_1mb_opencl_profile(b, gpu, 1_048_576); +// // } // #[bench] -// fn bench_256_alloc_1mb_opencl_gpu(b: &mut Bencher) { +// fn bench_256_alloc_1mb_opencl(b: &mut Bencher) { // let opencl_backend = opencl_backend(); -// let gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap(); +// let ref d = opencl_backend.devices()[0]; -// bench_256_alloc_1mb_opencl_profile(b, gpu, 1_048_576); +// bench_256_alloc_1mb_opencl_profile(b, d, 1_048_576); // } -#[bench] -fn bench_256_alloc_1mb_opencl(b: &mut Bencher) { - let opencl_backend = opencl_backend(); - let ref d = opencl_backend.devices()[0]; - - bench_256_alloc_1mb_opencl_profile(b, d, 1_048_576); -} - #[bench] fn bench_sync_1kb_native_opencl_back_and_forth(b: &mut Bencher) { sync_back_and_forth(b, opencl_backend(), native_backend(), 1024); @@ -134,33 +123,33 @@ fn bench_sync_128mb_opencl_to_native(b: &mut Bencher) { unidirectional_sync(b, opencl_backend(), native_backend(), 128 * 1_048_576); } -// fn bench_shared_tensor_access_time_first_(b: &mut Bencher, device: &OpenCLDevice) { +// // fn bench_shared_tensor_access_time_first_(b: &mut Bencher, device: &OpenCLDevice) { -// let native_backend = native_backend(); -// let ref native_cpu = native_backend.devices()[0]; +// // let native_backend = native_backend(); +// // let ref native_cpu = native_backend.devices()[0]; -// let mut x = SharedTensor::::from(vec![128]); -// x.write_only(native_cpu).unwrap(); -// x.write_only(device).unwrap(); -// x.read(native_cpu).unwrap(); +// // let mut x = SharedTensor::::from(vec![128]); +// // x.write_only(native_cpu).unwrap(); +// // x.write_only(device).unwrap(); +// // x.read(native_cpu).unwrap(); -// b.iter(|| { -// let _ = x.read(native_cpu).unwrap(); -// }) -// } +// // b.iter(|| { +// // let _ = x.read(native_cpu).unwrap(); +// // }) +// // } -// #[bench] -// fn bench_shared_tensor_access_time_first_cpu(b: &mut Bencher) { -// let opencl_backend = opencl_backend(); -// let opencl_cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap(); +// // #[bench] +// // fn bench_shared_tensor_access_time_first_cpu(b: &mut Bencher) { +// // let opencl_backend = opencl_backend(); +// // let opencl_cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap(); -// bench_shared_tensor_access_time_first_(b, opencl_cpu); -// } +// // bench_shared_tensor_access_time_first_(b, opencl_cpu); +// // } -// #[bench] -// fn bench_shared_tensor_access_time_first_gpu(b: &mut Bencher) { -// let opencl_backend = opencl_backend(); -// let opencl_gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap(); +// // #[bench] +// // fn bench_shared_tensor_access_time_first_gpu(b: &mut Bencher) { +// // let opencl_backend = opencl_backend(); +// // let opencl_gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap(); -// bench_shared_tensor_access_time_first_(b, opencl_gpu); -// } \ No newline at end of file +// // bench_shared_tensor_access_time_first_(b, opencl_gpu); +// // } \ No newline at end of file diff --git a/src/backend.rs b/src/backend.rs index 1b44a57..9f0fb09 100644 --- a/src/backend.rs +++ b/src/backend.rs @@ -1,5 +1,7 @@ -use super::{Context, ComputeDevice, Framework}; -use super::error::Result; +use std::ops; +use super::{BoxContext, Context, Device, Error, ExtensionPackage, Framework, Hardware, Unextended}; +use super::Result; +use utility::{self, TryDefault}; /// The heart of Parenchyma - provides an interface for running parallel computations on one or /// more devices. @@ -15,76 +17,70 @@ use super::error::Result; /// the framework to the [`Backend::new`](#method.new) associated function, or by simply /// calling [`Backend::default`](#method.default). The framework determines which devices are /// available and how parallel kernel functions can be executed. -/// -/// ## Examples -/// -/// ```rust -/// use parenchyma::{Backend, Framework, Native}; -/// -/// -/// // Construct a new framework. -/// let framework = Native::new().expect("failed to initialize framework"); -/// -/// // Available devices can be obtained through the framework. -/// let selection = framework.available_devices.clone(); -/// -/// // Create a ready to go backend from the framework. -/// let backend = Backend::new(framework, selection).expect("failed to construct backend"); -/// -/// // .. -/// ``` -/// -/// Construct a default backend: -/// -/// ```rust -/// use parenchyma::{Backend, Native}; -/// -/// // A default native backend. -/// let backend: Backend = Backend::default().expect("something went wrong!"); -/// -/// // .. -/// ``` #[derive(Debug)] -pub struct Backend { +pub struct Backend { /// The initialized framework. - pub framework: Box, /* &'static str,*/ + /// + /// The Framework implementation such as OpenCL, CUDA, etc. defines, which should be used and + /// determines which hardwares will be available and how parallel kernel functions can be + /// executed. + framework: Box, /// The context associated with the `framework`. /// - /// Contexts are the heart of both OpenCL and CUDA applications. See the [`Context`] trait for - /// more information. + /// Contexts are the heart of both OpenCL and CUDA applications. Contexts are created from one + /// or more devices that are capable of executing methods and synchronizing memory. See + /// the [`Context`] trait for more information. /// /// [`Context`]: (./trait.Context.html) - pub context: Box, - /// The chosen device - /// - /// The default active device is the first device found (index = `0`). - active: usize, + context: Box>, } -impl Backend { +impl Backend where X: ExtensionPackage { - /// Constructs a backend using the most potent framework given the underlying hardware. - pub fn new() -> Backend { + /// Initialize a new backend. + pub fn new() -> Result where F: BoxContext + Framework + TryDefault { - unimplemented!() + let framework = Box::new(F::try_default()?); + let selection = framework.available_hardware(); + let context = framework.enclose(selection)?; + + Ok(Backend { framework: framework, context }) } - /// Attempts to construct a backend from the specified `framework`. - pub fn with(framework: F) -> Result where F: Framework { + /// Constructs a backend from the specified `framework` and `selection`. + pub fn with(fwrk: F, selection: Vec) -> Result + where F: BoxContext + Framework { + + let framework = Box::new(fwrk); + let context = framework.enclose(selection)?; - unimplemented!() + Ok(Backend { framework, context }) } - // /// Try all provided `frameworks` in the specified order, choosing the first framework that - // // initializes without failure. - // pub fn try(frameworks: Vec>) -> Result; + /// Set the device at the specified `index` as the active device. + /// + /// Only one device can be the _active_ device - the device in which operations are executed. + pub fn set_active(&mut self, index: usize) -> Result { + + self.context.set_active(index) + } } -impl Backend { +impl ops::Deref for Backend where X: ExtensionPackage { + + type Target = X::Extension; - /// Returns the current device. - pub fn compute_device(&self) -> &ComputeDevice { + fn deref<'a>(&'a self) -> &'a X::Extension { - unimplemented!() + self.context.extension() } -} \ No newline at end of file +} + +impl utility::Has for Backend where X: ExtensionPackage { + + fn get_ref(&self) -> &Device { + self.context.active_device() + } +} + +// pub trait AsBackend { } \ No newline at end of file diff --git a/src/changelog.rs b/src/changelog.rs index f1a5f95..3211a9a 100644 --- a/src/changelog.rs +++ b/src/changelog.rs @@ -6,7 +6,10 @@ /// * Partially implemented a CUDA API wrapper /// * Partially implemented native support /// * Worked on a fallback mechanism (see issue#15) -/// * Chose a tensor lib (ndarray) +/// * Use a tensor lib (ndarray) as the underlying native memory representation /// * No longer requires framework related feature flags (from the original Collenchyma project) /// * Implemented auto-sync +/// * Add `Bundle` logic +/// * Removed `IBinary`/`HashMap` technique. Use structs instead +/// * No longer requires backends parameterized by a framework pub mod r0_0_3 {} \ No newline at end of file diff --git a/src/context.rs b/src/context.rs new file mode 100644 index 0000000..8661015 --- /dev/null +++ b/src/context.rs @@ -0,0 +1,27 @@ +use std::fmt::Debug; +use super::{Device, ExtensionPackage, Result}; +use utility::Has; + +/// Contexts are the heart of both OpenCL and CUDA applications. Contexts provide a container for +/// objects such as memory, command-queues, programs/modules and kernels. +pub trait Context: Debug { + + /// The extension package built for `Self`. + type Package: ExtensionPackage; + + /// Returns the _active_ device. + fn active_device(&self) -> &Device; + + /// Set the device at the specified `index` as the active device. + fn set_active(&mut self, index: usize) -> Result; + + #[doc(hidden)] + fn extension(&self) -> &::Extension; +} + +impl Has for I where I: Context { + + fn get_ref(&self) -> &Device { + self.active_device() + } +} \ No newline at end of file diff --git a/src/error.rs b/src/error.rs index d271c70..21ec07d 100644 --- a/src/error.rs +++ b/src/error.rs @@ -3,7 +3,7 @@ use std::{error, fmt, result}; /// A specialized `Result` type. -pub type Result = result::Result; +pub type Result = result::Result; /// The core error type used in Parenchyma. #[derive(Debug)] @@ -28,7 +28,7 @@ pub enum ErrorKind { InvalidReshapedTensorSize, /// An error returned when attempting to access uninitialized memory. UninitializedMemory, - /// Memory allocation was not found for a provided `Device`. + /// Unable to drop the provided device because a memory allocation was not found for it. AllocatedMemoryNotFoundForDevice, /// An error occurred while attempting to synchronize memory. MemorySynchronizationFailed, diff --git a/src/extension.rs b/src/extension.rs new file mode 100644 index 0000000..dd06078 --- /dev/null +++ b/src/extension.rs @@ -0,0 +1,80 @@ +use std::any::Any; +use std::fmt::Debug; +use super::Result; + +/// Provides the generic functionality for a backend-specific implementation of a library. +/// +/// A package can be a binary, a source file, c code, a single kernel, etc., or a collective which +/// share related functionalities. A package is provided by a specific library such as BLAS. Notice +/// that packages are analogous to those of Rust (i.e., crates): +/// +/// compiled crate <-> package +/// library (one or more modules) <-> bundle +/// +/// A package needs to be _built_, which is handled by the specific implementation of a binary +/// representation, and returns initialized operations based on the library. Interacting directly +/// with the package itself is possible, but it should be used to construct the backend-agnostic +/// operations, which can then be executed and parallelized via a unified interface. +pub trait ExtensionPackage: 'static + Debug + Sized { + /// The name of the package. + /// + /// This associated constant is primarily used for logging/debugging purposes. The naming + /// convention is as follows: "[organization]/[package-name]" (e.g., "parenchyma/nn"). + const PACKAGE_NAME: &'static str; + + /// A library can be a binary, a source file, c code, a single kernel, etc., or a collective. + /// + /// A backend is a Rust struct like any other, therefore you probably would like to implement + /// certain methods for the Backend. As the whole purpose of a Backend is to provide an + /// abstraction over various computation devices and computation languages, these implemented + /// methods will than be able to execute on different devices and use the full power of + /// the machine's underlying hardware. + /// + /// Extending the backend with operations is easy. In Parenchyma we call crates, which provide + /// operations for the backend, _extensions_. Extensions are usually a group of related + /// operations of a common field. Two examples for Parenchyma extensions + /// are [BLAS][parenchyma-blas] and [NN][parenchyma-nn]. + /// + /// An extension provides generic traits and the explicit implementation of these traits for + /// one or (even better) all available Parenchyma frameworks - common host CPU, OpenCL, CUDA. + /// + /// The structure of an extension is pretty simple with as little overhead as possible. Macros + /// and build-scripts make implementations even easier. If you would like to use specific + /// extension for you backend, all you need to do is set them as dependencies in your Cargo + /// file in addition to the Parenchyma crate. The extension then automatically extends the + /// backend provided by Parenchyma. + /// + /// Extending the backend with your own extension is a straightforward process. For now we + /// recommend that you take a look at the general code structure + /// of [Parenchyma-BLAS][parenchyma-blas] or its documentation. Let us now about your extension + /// on the Gitter chat, we are happy to feature your Parenchyma Extension on the README. + type Extension: ?Sized; +} + +/// Builds a package and provides the functionality for turning a library into backend-specific, +/// executable operations, and tailored for the target framework. +pub trait Build: Sized { + + /// Compiles the library into package after initializing and configuring the library. + fn build(&mut Target) -> Result; +} + +// ========================= + +/// A marker type for _unextended_ backends/contexts. +#[derive(Debug)] +pub struct Unextended; + +impl ExtensionPackage for Unextended { + + const PACKAGE_NAME: &'static str = ""; + + type Extension = Any; +} + +impl Build for Unextended { + + fn build(_: &mut A) -> Result { + Ok(Unextended) + } +} \ No newline at end of file diff --git a/src/framework.rs b/src/framework.rs new file mode 100644 index 0000000..cd544a0 --- /dev/null +++ b/src/framework.rs @@ -0,0 +1,32 @@ +use std::fmt::Debug; + +use super::{Context, Hardware, Result}; + +/// A trait implemented for all frameworks. `Framework`s contain a list of all available devices as +/// well as other objects specific to the implementor. +/// +/// The default framework is simply the host CPU for common computation. To make use of other +/// devices such as GPUs, you may choose a GPGPU framework (such as OpenCL or CUDA) to access the +/// processing capabilities of the device(s). +pub trait Framework: 'static + Debug { + /// The name of the framework. + /// + /// This associated constant is mainly used for the purposes of debugging and reporting errors. + /// + /// [issue#29924](https://github.com/rust-lang/rust/issues/29924): remove `Framework::name` + const FRAMEWORK_NAME: &'static str; + + /// Returns the cached and available hardware. + fn available_hardware(&self) -> Vec; +} + +/// Initialize a context, box it, and then return it. +pub trait BoxContext: Framework { + + // /// The `Context` representation for this framework. + // type Context: TryFrom, Err = Error>; + + /// Create a context from a selection of hardware devices and then wrap it in a box. + fn enclose(&self, selection: Vec) + -> Result>>; +} \ No newline at end of file diff --git a/src/frameworks/cuda/sh.rs b/src/frameworks/cuda/foreign.rs similarity index 100% rename from src/frameworks/cuda/sh.rs rename to src/frameworks/cuda/foreign.rs diff --git a/src/frameworks/cuda/hl/error.rs b/src/frameworks/cuda/high/error.rs similarity index 100% rename from src/frameworks/cuda/hl/error.rs rename to src/frameworks/cuda/high/error.rs diff --git a/src/frameworks/cuda/hl/high.rs b/src/frameworks/cuda/high/mod.rs similarity index 99% rename from src/frameworks/cuda/hl/high.rs rename to src/frameworks/cuda/high/mod.rs index fe308db..db54b2c 100644 --- a/src/frameworks/cuda/hl/high.rs +++ b/src/frameworks/cuda/high/mod.rs @@ -1,3 +1,6 @@ +mod error; +mod utility; + use std::{cmp, ffi, ops}; use super::Result; diff --git a/src/frameworks/cuda/hl/utility.rs b/src/frameworks/cuda/high/utility.rs similarity index 100% rename from src/frameworks/cuda/hl/utility.rs rename to src/frameworks/cuda/high/utility.rs diff --git a/src/frameworks/cuda/hl/mod.rs b/src/frameworks/cuda/hl/mod.rs deleted file mode 100644 index 24ecc47..0000000 --- a/src/frameworks/cuda/hl/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -pub use self::error::{Error, ErrorKind, Result}; -pub use self::high::{init, ndevices}; -pub use self::high::{Context, Device, Memory}; - -mod error; -mod high; -mod utility; \ No newline at end of file diff --git a/src/frameworks/cuda/interface.rs b/src/frameworks/cuda/interface.rs new file mode 100644 index 0000000..c26055c --- /dev/null +++ b/src/frameworks/cuda/interface.rs @@ -0,0 +1,6 @@ +/// Provides the CUDA framework. +#[derive(Debug)] +pub struct Cuda { + // /// A list of available devices. + // pub available_devices: Vec, +} \ No newline at end of file diff --git a/src/frameworks/cuda/mod.rs b/src/frameworks/cuda/mod.rs index ebadbf7..e69de29 100644 --- a/src/frameworks/cuda/mod.rs +++ b/src/frameworks/cuda/mod.rs @@ -1,11 +0,0 @@ -//! CUDA backend support. - -pub mod hl; -pub mod sh; - -/// Provides the CUDA framework. -#[derive(Debug)] -pub struct Cuda { - // /// A list of available devices. - // pub available_devices: Vec, -} \ No newline at end of file diff --git a/src/frameworks/loader.rs b/src/frameworks/macros.rs similarity index 95% rename from src/frameworks/loader.rs rename to src/frameworks/macros.rs index 0a5d5fd..c501138 100644 --- a/src/frameworks/loader.rs +++ b/src/frameworks/macros.rs @@ -21,7 +21,7 @@ macro_rules! dynamic_extern { type FnPtr = unsafe extern $linkage fn($($argument_type),*) $(-> $ret_ty)*; lazy_static! { - static ref FN_PTR: ::frameworks::loader::Sym = { + static ref FN_PTR: ::frameworks::macros::Sym = { unsafe { use lib::Library; use std::path::Path; diff --git a/src/frameworks/mod.rs b/src/frameworks/mod.rs index 7384823..35cb01f 100644 --- a/src/frameworks/mod.rs +++ b/src/frameworks/mod.rs @@ -1,5 +1,4 @@ -#[macro_use] mod loader; +#[macro_use] mod macros; -pub mod cuda; pub mod native; pub mod opencl; \ No newline at end of file diff --git a/src/frameworks/native/context.rs b/src/frameworks/native/context.rs new file mode 100644 index 0000000..6d04a99 --- /dev/null +++ b/src/frameworks/native/context.rs @@ -0,0 +1,23 @@ +use {Context, Device, ExtensionPackage, Result}; +use std::marker::{PhantomData, Unsize}; +use super::NativeDevice; + +/// The native context. +#[derive(Clone, Debug)] +pub struct NativeContext(pub PhantomData); + +impl Context for NativeContext where X: ExtensionPackage, Self: Unsize { + + type Package = X; + + fn active_device(&self) -> &Device { + static NATIVE_DEVICE: NativeDevice = NativeDevice; + &NATIVE_DEVICE + } + + fn set_active(&mut self, _: usize) -> Result { Ok(()) } + + fn extension(&self) -> &X::Extension { + self + } +} \ No newline at end of file diff --git a/src/frameworks/native/device.rs b/src/frameworks/native/device.rs new file mode 100644 index 0000000..5a10b2f --- /dev/null +++ b/src/frameworks/native/device.rs @@ -0,0 +1,88 @@ +use {Alloc, ComputeDevice, Memory, Result, Shape, Synch, Viewable}; +use super::NativeMemory; + +/// The native device. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct NativeDevice; + +impl Viewable for NativeDevice { + + fn view(&self) -> ComputeDevice { + ComputeDevice::Native(NativeDevice) + } +} + +impl Alloc for NativeDevice { + + + fn alloc(&self, shape: &Shape) -> Result> { + // TODO + + let mut buffer = Vec::with_capacity(shape.capacity); + + unsafe { + buffer.set_len(shape.capacity); + } + + Ok(Memory::Native( + NativeMemory::from_shape_vec( + &shape.dims[..], + buffer).unwrap())) + } + + fn allocwrite(&self, shape: &Shape, data: Vec) -> Result> { + // TODO + + Ok(Memory::Native( + NativeMemory::from_shape_vec( + &shape.dims[..], + data).unwrap())) + } +} + +impl Synch for NativeDevice where T: Clone { + + fn write( + &self, + memory: &mut Memory, + src_device: &ComputeDevice, + source: &Memory) + -> Result { + + match *src_device { + ComputeDevice::Native(_) => { + let memory = unsafe { memory.as_mut_native_unchecked() }; + let source = unsafe { source.as_native_unchecked() }; + // > Array implements .clone_from() to reuse an array's existing allocation. + // > Semantically equivalent to *self = other.clone(), but potentially more efficient. + Ok(memory.clone_from(source)) + }, + + ComputeDevice::OpenCL(ref cl_device) => { + cl_device.read(source, &mut ComputeDevice::Native(NativeDevice), memory) + } + } + } + + fn read( + &self, + memory: &Memory, + dest_device: &mut ComputeDevice, + destination: &mut Memory) + -> Result { + + match *dest_device { + ComputeDevice::Native(_) => { + let source = unsafe { memory.as_native_unchecked() }; + let destination = unsafe { destination.as_mut_native_unchecked() }; + // > Array implements .clone_from() to reuse an array's existing allocation. + // > Semantically equivalent to *self = other.clone(), but potentially more efficient. + Ok(destination.clone_from(source)) + }, + + ComputeDevice::OpenCL(ref mut cl_device) => { + cl_device.write(destination, &ComputeDevice::Native(NativeDevice), memory) + } + } + } +} \ No newline at end of file diff --git a/src/frameworks/native/framework.rs b/src/frameworks/native/framework.rs new file mode 100644 index 0000000..0530b80 --- /dev/null +++ b/src/frameworks/native/framework.rs @@ -0,0 +1,57 @@ +use {BoxContext, Context, Error, ExtensionPackage, Framework, Hardware, HardwareKind, Result}; +use std::marker::{PhantomData, Unsize}; +use super::NativeContext; +use utility::TryDefault; + +const NATIVE: &'static str = "Native"; + +/// The native framework +/// +/// # Example +/// +/// ```rust +/// use parenchyma::{Backend, Native, SharedTensor}; +/// +/// let ref host: Backend = Backend::new::().unwrap(); +/// +/// let sh: SharedTensor = SharedTensor::with(host, [2, 2], vec![1., 2., 3., 4.]).unwrap(); +/// +/// let tensor = sh.read(host).unwrap(); +/// +/// println!("{:#?}", tensor); +/// ``` +#[derive(Debug)] +pub struct Native; + +impl Framework for Native { + + const FRAMEWORK_NAME: &'static str = NATIVE; + + fn available_hardware(&self) -> Vec { + vec![Hardware { + id: 0, + framework: NATIVE, + kind: HardwareKind::Central, + name: String::from("HOST CPU"), + compute_units: 1, + }] + } +} + +impl BoxContext for Native + where X: ExtensionPackage, + NativeContext: Unsize + { + + fn enclose(&self, _: Vec) -> Result>> { + Ok(Box::new(NativeContext(PhantomData))) + } +} + +impl TryDefault for Native { + type Err = Error; + + fn try_default() -> Result { + Ok(Native) + } +} \ No newline at end of file diff --git a/src/frameworks/native/memory.rs b/src/frameworks/native/memory.rs new file mode 100644 index 0000000..2f22115 --- /dev/null +++ b/src/frameworks/native/memory.rs @@ -0,0 +1,6 @@ +use ndarray::{Array, IxDyn}; + +/// Represents a native array. +/// +/// note: named `Memory` for consistency across frameworks. +pub type NativeMemory = Array; \ No newline at end of file diff --git a/src/frameworks/native/mod.rs b/src/frameworks/native/mod.rs index 0d816c4..d38c8ef 100644 --- a/src/frameworks/native/mod.rs +++ b/src/frameworks/native/mod.rs @@ -1,20 +1,11 @@ //! Native backend support. -use ndarray::{Array, IxDyn}; - -/// Provides the native framework. -#[derive(Debug)] -pub struct Native; - -/// Represents a native array. -/// -/// note: named `Memory` for consistency across frameworks. -pub type Memory = Array; - -/// The native context. -#[derive(Clone, Debug)] -pub struct NativeContext; - -/// The native device. -#[derive(Clone, Debug)] -pub struct NativeDevice; \ No newline at end of file +pub use self::context::NativeContext; +pub use self::device::NativeDevice; +pub use self::framework::Native; +pub use self::memory::NativeMemory; + +mod context; +mod device; +mod framework; +mod memory; \ No newline at end of file diff --git a/src/frameworks/opencl/sh.rs b/src/frameworks/opencl/foreign.rs similarity index 99% rename from src/frameworks/opencl/sh.rs rename to src/frameworks/opencl/foreign.rs index 395556d..3d7dd04 100644 --- a/src/frameworks/opencl/sh.rs +++ b/src/frameworks/opencl/foreign.rs @@ -66,12 +66,14 @@ pub type cl_event_info = cl_uint; pub type cl_command_type = cl_uint; pub type cl_profiling_info = cl_uint; +#[derive(Debug)] #[repr(C)] pub struct cl_image_format { image_channel_order: cl_channel_order, image_channel_data_type: cl_channel_type } +#[derive(Debug)] pub struct cl_buffer_region { origin: libc::size_t, size: libc::size_t @@ -445,8 +447,17 @@ pub static CL_PROFILING_COMMAND_SUBMIT: cl_uint = 0x1281; pub static CL_PROFILING_COMMAND_START: cl_uint = 0x1282; pub static CL_PROFILING_COMMAND_END: cl_uint = 0x1283; -dynamic_extern! { - #[link="OpenCL"] +// dynamic_extern! { +// #[link="OpenCL"] + +#[link(name = "OpenCL", kind = "framework")] +#[cfg(target_os = "macos")] +extern { } + +#[link(name = "OpenCL")] +#[cfg(target_os = "linux")] +extern { } + extern "C" { /* Platform APIs */ @@ -844,4 +855,4 @@ dynamic_extern! { */ pub fn clGetExtensionFunctionAddress(func_name: *const libc::c_char) -> *mut raw::c_void; } -} \ No newline at end of file +//} \ No newline at end of file diff --git a/src/frameworks/opencl/hl/error.rs b/src/frameworks/opencl/high/error.rs similarity index 95% rename from src/frameworks/opencl/hl/error.rs rename to src/frameworks/opencl/high/error.rs index 2fdd76b..8e47f41 100644 --- a/src/frameworks/opencl/hl/error.rs +++ b/src/frameworks/opencl/high/error.rs @@ -1,14 +1,17 @@ -use frameworks::opencl::sh::CLStatus; use std::{error, fmt, result}; +use super::super::foreign::CLStatus; +/// A specialized `Result` type. pub type Result = result::Result; +/// The error type used in the OpenCL module. #[derive(Debug)] pub struct Error { kind: ErrorKind, inner: Option>, } +/// A list of general categories. #[derive(Debug, Copy, Clone)] pub enum ErrorKind { /// No OpenCL devices that matched device_type were found. @@ -69,7 +72,7 @@ pub enum ErrorKind { impl From for ErrorKind { fn from(cl_status: CLStatus) -> ErrorKind { - use frameworks::opencl::sh::CLStatus::*; + use super::super::foreign::CLStatus::*; use self::ErrorKind::*; match cl_status { @@ -163,15 +166,6 @@ impl Error { } } - pub fn get_ref(&self) -> Option<&(error::Error + Send + Sync + 'static)> { - use std::ops::Deref; - - match self.inner { - Some(ref error) => Some(error.deref()), - _ => None - } - } - /// Returns the corresponding `ErrorKind` for this error. pub fn kind(&self) -> ErrorKind { self.kind diff --git a/src/frameworks/opencl/high/functions.rs b/src/frameworks/opencl/high/functions.rs new file mode 100644 index 0000000..7d17d27 --- /dev/null +++ b/src/frameworks/opencl/high/functions.rs @@ -0,0 +1,25 @@ +use std::ptr; + +use super::{Platform, Result}; +use super::utility; +use super::super::foreign; + +/// Number of platforms +pub fn nplatforms() -> Result { + unsafe { + let mut nplatforms = 0; + let ret_value = foreign::clGetPlatformIDs(0, ptr::null_mut(), &mut nplatforms); + return utility::check(ret_value, || nplatforms); + } +} + +/// Obtain the list of platforms available. +pub fn platforms() -> Result> { + unsafe { + let nplatforms = nplatforms()?; + let mut vec_id = vec![ptr::null_mut(); nplatforms as usize]; + let pointer = vec_id.as_mut_ptr(); + let ret_value = foreign::clGetPlatformIDs(nplatforms, pointer, ptr::null_mut()); + return utility::check(ret_value, || vec_id.iter().map(|&id| Platform(id)).collect()); + } +} \ No newline at end of file diff --git a/src/frameworks/opencl/hl/high.rs b/src/frameworks/opencl/high/mod.rs similarity index 86% rename from src/frameworks/opencl/hl/high.rs rename to src/frameworks/opencl/high/mod.rs index 7828f79..1bfd257 100644 --- a/src/frameworks/opencl/hl/high.rs +++ b/src/frameworks/opencl/high/mod.rs @@ -1,37 +1,25 @@ -use std::os::raw; -use std::{clone, cmp, ffi, mem, ops, ptr}; +//! Wrapper module for OpenCL +#![allow(missing_docs, unused_qualifications)] -use super::Result; -use super::utility; -use super::super::sh; +pub use self::error::{Error, ErrorKind, Result}; +pub use self::functions::{nplatforms, platforms}; -/// Number of platforms -pub fn nplatforms() -> Result { - unsafe { - let mut nplatforms = 0; - let ret_value = sh::clGetPlatformIDs(0, ptr::null_mut(), &mut nplatforms); - return utility::check(ret_value, || nplatforms); - } -} +mod error; +mod functions; +mod utility; -/// Obtain the list of platforms available. -pub fn platforms() -> Result> { - unsafe { - let nplatforms = nplatforms()?; - let mut vec_id = vec![0 as sh::cl_platform_id; nplatforms as usize]; - let ret_value = sh::clGetPlatformIDs(nplatforms, vec_id.as_mut_ptr(), ptr::null_mut()); - return utility::check(ret_value, || vec_id.iter().map(|&id| Platform(id)).collect()); - } -} +use std::{clone, cmp, ffi, mem, ops, ptr}; +use std::os::raw::c_void; +use super::foreign; #[derive(Debug)] -pub struct Buffer(sh::cl_mem); +pub struct Buffer(foreign::cl_mem); impl Buffer { /// Increments the memory object reference count. pub fn retain(&self) -> Result { unsafe { - let ret_value = sh::clRetainMemObject(self.0); + let ret_value = foreign::clRetainMemObject(self.0); return utility::check(ret_value, || {}); } @@ -40,7 +28,7 @@ impl Buffer { /// Decrements the memory object reference count. pub fn release(&self) -> Result { unsafe { - let ret_value = sh::clReleaseMemObject(self.0); + let ret_value = foreign::clReleaseMemObject(self.0); return utility::check(ret_value, || {}); } @@ -61,9 +49,13 @@ impl ops::Drop for Buffer { } #[derive(Debug)] -pub struct Context(sh::cl_context); +pub struct Context(foreign::cl_context); impl Context { + pub fn ptr(&self) -> &foreign::cl_context { + &self.0 + } + /// Creates an OpenCL context. /// /// An OpenCL context is created with one or more devices. Contexts are used by the OpenCL @@ -92,7 +84,7 @@ impl Context { // The number of devices specified in the devices argument. let number_of_devices = devices.len() as u32; // A pointer to a list of unique devices returned by clGetDeviceIDs for a platform. - let raw_devices: Vec<*mut raw::c_void> = devices.iter().map(|d| d.0).collect(); + let raw_devices: Vec<*mut c_void> = devices.iter().map(|d| d.0).collect(); let raw_devices_ptr = raw_devices.as_ptr(); // A callback function that can be registered by the application. This callback function @@ -111,15 +103,15 @@ impl Context { // `user_data` is a pointer to user supplied data. // // TODO - let pfn_notify: extern fn(*const i8, *const raw::c_void, usize, *mut raw::c_void) + let pfn_notify: extern fn(*const i8, *const c_void, usize, *mut c_void) = mem::transmute(ptr::null::()); // Passed as the `user_data` argument when pfn_notify is called. user_data can be NULL. // // TODO - let user_data: *mut raw::c_void = ptr::null_mut(); + let user_data: *mut c_void = ptr::null_mut(); - let cl_context = sh::clCreateContext( + let cl_context = foreign::clCreateContext( properties, number_of_devices, raw_devices_ptr, @@ -128,7 +120,7 @@ impl Context { &mut errcode_ret ); - let ret_value = sh::CLStatus::new(errcode_ret) + let ret_value = foreign::CLStatus::new(errcode_ret) .expect("failed to convert `i32` to `CLStatus`"); return utility::check(ret_value, || Context(cl_context)); @@ -149,17 +141,17 @@ impl Context { /// application. The size of the buffer that host_ptr points to must be greater than or equal /// to the size bytes. pub fn create_buffer(&self, f: F, size: usize, h: H) -> Result - where F: Into>, - H: Into>, + where F: Into>, + H: Into>, { unsafe { let mut errcode_ret: i32 = 0; - let flags = f.into().unwrap_or(sh::CL_MEM_READ_WRITE); + let flags = f.into().unwrap_or(foreign::CL_MEM_READ_WRITE); let host_pointer = h.into().unwrap_or(ptr::null_mut()); - let mem = sh::clCreateBuffer(self.0, flags, size, host_pointer, &mut errcode_ret); + let mem = foreign::clCreateBuffer(self.0, flags, size, host_pointer, &mut errcode_ret); - let ret_value = sh::CLStatus::new(errcode_ret) + let ret_value = foreign::CLStatus::new(errcode_ret) .expect("failed to convert `i32` to `CLStatus`"); return utility::check(ret_value, || Buffer(mem)); @@ -185,9 +177,9 @@ impl Context { let ptrs: Vec<*const i8> = cstrings.iter().map(|s| s.as_ptr()).collect(); let ptr = ptrs.as_ptr(); - let cl_program = sh::clCreateProgramWithSource(self.0, n, ptr, lens_ptr, &mut errcode); + let cl_program = foreign::clCreateProgramWithSource(self.0, n, ptr, lens_ptr, &mut errcode); - let ret_value = sh::CLStatus::new(errcode).expect("failed to convert i32 to CLStatus"); + let ret_value = foreign::CLStatus::new(errcode).expect("failed to convert i32 to CLStatus"); return utility::check(ret_value, || Program(cl_program)); } @@ -196,7 +188,7 @@ impl Context { /// Increment the context reference count. fn retain(&self) -> Result { unsafe { - let ret_value = sh::clRetainContext(self.0); + let ret_value = foreign::clRetainContext(self.0); return utility::check(ret_value, || {}); } } @@ -204,7 +196,7 @@ impl Context { /// Decrement the context reference count. fn release(&self) -> Result { unsafe { - let ret_value = sh::clReleaseContext(self.0); + let ret_value = foreign::clReleaseContext(self.0); return utility::check(ret_value, || {}); } } @@ -218,6 +210,8 @@ impl clone::Clone for Context { } } +impl cmp::Eq for Context { } + impl cmp::PartialEq for Context { fn eq(&self, other: &Context) -> bool { @@ -235,20 +229,24 @@ impl ops::Drop for Context { /// Newtype with an internal type of `cl_device_id`. #[derive(Clone, Debug)] -pub struct Device(sh::cl_device_id); +pub struct Device(foreign::cl_device_id); impl Device { + pub fn ptr(&self) -> &foreign::cl_device_id { + &self.0 + } + /// The default compute device address space size specified as an unsigned integer value /// in bits. Currently supported values are 32 or 64 bits. pub fn address_bits(&self) -> Result { - let parameter = sh::CL_DEVICE_ADDRESS_BITS; + let parameter = foreign::CL_DEVICE_ADDRESS_BITS; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res) } /// Is CL_TRUE if the device is available and CL_FALSE if the device is not available. pub fn available(&self) -> Result { - let parameter = sh::CL_DEVICE_AVAILABLE; + let parameter = foreign::CL_DEVICE_AVAILABLE; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res != 0) } @@ -257,7 +255,7 @@ impl Device { /// program source. Is CL_TRUE if the compiler is available. This can be CL_FALSE for the /// embedded platform profile only. pub fn compiler_available(&self) -> Result { - let parameter = sh::CL_DEVICE_COMPILER_AVAILABLE; + let parameter = foreign::CL_DEVICE_COMPILER_AVAILABLE; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res != 0) } @@ -282,7 +280,7 @@ impl Device { /// Is CL_TRUE if the OpenCL device is a little endian device and CL_FALSE otherwise. pub fn endian_little(&self) -> Result { - let parameter = sh::CL_DEVICE_ENDIAN_LITTLE; + let parameter = foreign::CL_DEVICE_ENDIAN_LITTLE; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res != 0) } @@ -291,7 +289,7 @@ impl Device { /// etc. in the device. Is CL_FALSE if the device does not implement error correction. This can /// be a requirement for certain clients of OpenCL. pub fn error_correction_support(&self) -> Result { - let parameter = sh::CL_DEVICE_ERROR_CORRECTION_SUPPORT; + let parameter = foreign::CL_DEVICE_ERROR_CORRECTION_SUPPORT; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res != 0) } @@ -325,7 +323,7 @@ impl Device { /// cl_khr_byte_addressable_store /// cl_khr_fp16 pub fn extensions(&self) -> Result> { - let parameter = sh::CL_DEVICE_EXTENSIONS; + let parameter = foreign::CL_DEVICE_EXTENSIONS; let res = self.info(parameter, |size| vec![0u8; size], |b| b.as_mut_ptr() as _); res.map(|b| String::from_utf8(b).expect("UTF8 string")).map(|st| { st.split_whitespace().map(|s| s.into()).collect() @@ -334,7 +332,7 @@ impl Device { /// Size of global memory cache in bytes. pub fn global_mem_cache_size(&self) -> Result { - let parameter = sh::CL_DEVICE_GLOBAL_MEM_CACHE_SIZE; + let parameter = foreign::CL_DEVICE_GLOBAL_MEM_CACHE_SIZE; let res = self.info(parameter, |_| 0u64, |b| b as *mut u64 as _)?; Ok(res) } @@ -348,14 +346,14 @@ impl Device { /// Size of global memory cache line in bytes. pub fn global_mem_cacheline_size(&self) -> Result { - let parameter = sh::CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE; + let parameter = foreign::CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res) } /// Size of global memory cache line in bytes. pub fn global_mem_size(&self) -> Result { - let parameter = sh::CL_DEVICE_GLOBAL_MEM_SIZE; + let parameter = foreign::CL_DEVICE_GLOBAL_MEM_SIZE; let res = self.info(parameter, |_| 0u64, |b| b as *mut u64 as _)?; Ok(res) } @@ -378,7 +376,7 @@ impl Device { /// Is CL_TRUE if images are supported by the OpenCL device and CL_FALSE otherwise. pub fn image_support(&self) -> Result { - let parameter = sh::CL_DEVICE_IMAGE_SUPPORT; + let parameter = foreign::CL_DEVICE_IMAGE_SUPPORT; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res != 0) } @@ -386,7 +384,7 @@ impl Device { /// Max height of 2D image in pixels. The minimum value is 8192 if CL_DEVICE_IMAGE_SUPPORT /// is CL_TRUE. pub fn image2d_max_height(&self) -> Result { - let parameter = sh::CL_DEVICE_IMAGE2D_MAX_HEIGHT; + let parameter = foreign::CL_DEVICE_IMAGE2D_MAX_HEIGHT; let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?; Ok(res) } @@ -394,7 +392,7 @@ impl Device { /// Max width of 2D image in pixels. The minimum value is 8192 if CL_DEVICE_IMAGE_SUPPORT /// is CL_TRUE. pub fn image2d_max_width(&self) -> Result { - let parameter = sh::CL_DEVICE_IMAGE2D_MAX_WIDTH; + let parameter = foreign::CL_DEVICE_IMAGE2D_MAX_WIDTH; let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?; Ok(res) } @@ -402,7 +400,7 @@ impl Device { /// Max depth of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT /// is CL_TRUE. pub fn image3d_max_depth(&self) -> Result { - let parameter = sh::CL_DEVICE_IMAGE3D_MAX_DEPTH; + let parameter = foreign::CL_DEVICE_IMAGE3D_MAX_DEPTH; let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?; Ok(res) } @@ -410,7 +408,7 @@ impl Device { /// Max height of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT /// is CL_TRUE. pub fn image3d_max_height(&self) -> Result { - let parameter = sh::CL_DEVICE_IMAGE3D_MAX_HEIGHT; + let parameter = foreign::CL_DEVICE_IMAGE3D_MAX_HEIGHT; let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?; Ok(res) } @@ -418,14 +416,14 @@ impl Device { /// Max width of 3D image in pixels. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT /// is CL_TRUE. pub fn image3d_max_width(&self) -> Result { - let parameter = sh::CL_DEVICE_IMAGE3D_MAX_WIDTH; + let parameter = foreign::CL_DEVICE_IMAGE3D_MAX_WIDTH; let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?; Ok(res) } /// Size of local memory arena in bytes. The minimum value is 16 KB. pub fn local_mem_size(&self) -> Result { - let parameter = sh::CL_DEVICE_LOCAL_MEM_SIZE; + let parameter = foreign::CL_DEVICE_LOCAL_MEM_SIZE; let res = self.info(parameter, |_| 0u64, |b| b as *mut u64 as _)?; Ok(res) } @@ -439,14 +437,14 @@ impl Device { /// Size of local memory arena in bytes. The minimum value is 16 KB. pub fn max_clock_frequency(&self) -> Result { - let parameter = sh::CL_DEVICE_MAX_CLOCK_FREQUENCY; + let parameter = foreign::CL_DEVICE_MAX_CLOCK_FREQUENCY; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res) } /// The number of parallel compute cores on the OpenCL device. The minimum value is 1. pub fn max_compute_units(&self) -> Result { - let parameter = sh::CL_DEVICE_MAX_COMPUTE_UNITS; + let parameter = foreign::CL_DEVICE_MAX_COMPUTE_UNITS; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res) } @@ -454,14 +452,14 @@ impl Device { /// Max number of arguments declared with the __constant qualifier in a kernel. The minimum /// value is 8. pub fn max_constant_args(&self) -> Result { - let parameter = sh::CL_DEVICE_MAX_CONSTANT_ARGS; + let parameter = foreign::CL_DEVICE_MAX_CONSTANT_ARGS; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res) } /// Max size in bytes of a constant buffer allocation. The minimum value is 64 KB. pub fn max_constant_buffer_size(&self) -> Result { - let parameter = sh::CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE; + let parameter = foreign::CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE; let res = self.info(parameter, |_| 0u64, |b| b as *mut u64 as _)?; Ok(res) } @@ -469,14 +467,14 @@ impl Device { /// Max size of memory object allocation in bytes. The minimum value is /// max (1/4th of CL_DEVICE_GLOBAL_MEM_SIZE, 128*1024*1024) pub fn max_mem_alloc_size(&self) -> Result { - let parameter = sh::CL_DEVICE_MAX_MEM_ALLOC_SIZE; + let parameter = foreign::CL_DEVICE_MAX_MEM_ALLOC_SIZE; let res = self.info(parameter, |_| 0u64, |b| b as *mut u64 as _)?; Ok(res) } /// Max size in bytes of the arguments that can be passed to a kernel. The minimum value is 256. pub fn max_parameter_size(&self) -> Result { - let parameter = sh::CL_DEVICE_MAX_PARAMETER_SIZE; + let parameter = foreign::CL_DEVICE_MAX_PARAMETER_SIZE; let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?; Ok(res) } @@ -484,7 +482,7 @@ impl Device { /// Max number of simultaneous image objects that can be read by a kernel. The minimum value /// is 128 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. pub fn max_read_image_args(&self) -> Result { - let parameter = sh::CL_DEVICE_MAX_READ_IMAGE_ARGS; + let parameter = foreign::CL_DEVICE_MAX_READ_IMAGE_ARGS; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res) } @@ -492,7 +490,7 @@ impl Device { /// Maximum number of samplers that can be used in a kernel. The minimum value is 16 /// if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. pub fn max_samplers(&self) -> Result { - let parameter = sh::CL_DEVICE_MAX_SAMPLERS; + let parameter = foreign::CL_DEVICE_MAX_SAMPLERS; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res) } @@ -500,7 +498,7 @@ impl Device { /// Maximum number of work-items in a work-group executing a kernel using the data parallel /// execution model. (Refer to clEnqueueNDRangeKernel). The minimum value is 1. pub fn max_work_group_size(&self) -> Result { - let parameter = sh::CL_DEVICE_MAX_WORK_GROUP_SIZE; + let parameter = foreign::CL_DEVICE_MAX_WORK_GROUP_SIZE; let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?; Ok(res) } @@ -508,7 +506,7 @@ impl Device { /// Maximum dimensions that specify the global and local work-item IDs used by the data /// parallel execution model. (Refer to clEnqueueNDRangeKernel). The minimum value is 3. pub fn max_work_item_dimensions(&self) -> Result { - let parameter = sh::CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS; + let parameter = foreign::CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res) } @@ -519,7 +517,7 @@ impl Device { /// Returns n size_t entries, where n is the value returned by the query /// for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS. The minimum value is (1, 1, 1). pub fn max_work_item_sizes(&self) -> Result> { - let parameter = sh::CL_DEVICE_MAX_WORK_ITEM_SIZES; + let parameter = foreign::CL_DEVICE_MAX_WORK_ITEM_SIZES; let ve = |size| vec![1usize; size / mem::size_of::()]; let res = self.info(parameter, ve, |b| b.as_mut_ptr() as _)?; @@ -529,28 +527,28 @@ impl Device { /// Max number of simultaneous image objects that can be written to by a kernel. The minimum /// value is 8 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE. pub fn max_write_image_args(&self) -> Result { - let parameter = sh::CL_DEVICE_MAX_WRITE_IMAGE_ARGS; + let parameter = foreign::CL_DEVICE_MAX_WRITE_IMAGE_ARGS; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res) } /// Describes the alignment in bits of the base address of any allocated memory object. pub fn mem_base_addr_align(&self) -> Result { - let parameter = sh::CL_DEVICE_MEM_BASE_ADDR_ALIGN; + let parameter = foreign::CL_DEVICE_MEM_BASE_ADDR_ALIGN; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res) } /// The smallest alignment in bytes which can be used for any data type. pub fn min_data_type_align_size(&self) -> Result { - let parameter = sh::CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE; + let parameter = foreign::CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res) } /// Device name string. pub fn name(&self) -> Result { - let parameter = sh::CL_DEVICE_NAME; + let parameter = foreign::CL_DEVICE_NAME; let res = self.info(parameter, |size| vec![0u8; size], |b| b.as_mut_ptr() as _); res.map(|b| String::from_utf8(b).unwrap()) } @@ -558,7 +556,7 @@ impl Device { // /// The platform associated with this device. // pub fn platform(&self) -> Result { // -// let _ = sh::CL_DEVICE_PLATFORM; +// let _ = foreign::CL_DEVICE_PLATFORM; // unimplemented!() // } @@ -570,7 +568,7 @@ impl Device { // /// must return 0. // pub fn preferred_vector_width_char(&self) -> Result { // -// let _ = sh::CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR; +// let _ = foreign::CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR; // // unimplemented!() // } @@ -583,7 +581,7 @@ impl Device { // /// must return 0. // pub fn preferred_vector_width_short(&self) -> Result { // -// let _ = sh::CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT; +// let _ = foreign::CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT; // // unimplemented!() // } @@ -596,7 +594,7 @@ impl Device { // /// must return 0. // pub fn preferred_vector_width_int(&self) -> Result { // -// let _ = sh::CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT; +// let _ = foreign::CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT; // // unimplemented!() // } @@ -609,7 +607,7 @@ impl Device { // /// must return 0. // pub fn preferred_vector_width_long(&self) -> Result { // -// let _ = sh::CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG; +// let _ = foreign::CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG; // // unimplemented!() // } @@ -622,7 +620,7 @@ impl Device { // /// must return 0. // pub fn preferred_vector_width_float(&self) -> Result { // -// let _ = sh::CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT; +// let _ = foreign::CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT; // // unimplemented!() // } @@ -635,7 +633,7 @@ impl Device { // /// must return 0. // pub fn preferred_vector_width_double(&self) -> Result { // -// let _ = sh::CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE; +// let _ = foreign::CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE; // // unimplemented!() // } @@ -648,14 +646,14 @@ impl Device { /// /// EMBEDDED_PROFILE - if the device supports the OpenCL embedded profile. pub fn profile(&self) -> Result { - let parameter = sh::CL_DEVICE_PROFILE; + let parameter = foreign::CL_DEVICE_PROFILE; let res = self.info(parameter, |size| vec![0u8; size], |b| b.as_mut_ptr() as _); res.map(|b| String::from_utf8(b).unwrap()) } /// The smallest alignment in bytes which can be used for any data type. pub fn profiling_timer_resolution(&self) -> Result { - let parameter = sh::CL_DEVICE_PROFILING_TIMER_RESOLUTION; + let parameter = foreign::CL_DEVICE_PROFILING_TIMER_RESOLUTION; let res = self.info(parameter, |_| 0usize, |b| b as *mut usize as _)?; Ok(res) } @@ -699,14 +697,14 @@ impl Device { /// of: CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_ACCELERATOR, /// or CL_DEVICE_TYPE_DEFAULT. pub fn type_(&self) -> Result { - let parameter = sh::CL_DEVICE_TYPE; + let parameter = foreign::CL_DEVICE_TYPE; let res = self.info(parameter, |_| 0u64, |b| b as *mut u64 as _)?; Ok(res) } /// Vendor name string. pub fn vendor(&self) -> Result { - let parameter = sh::CL_DEVICE_VENDOR; + let parameter = foreign::CL_DEVICE_VENDOR; let res = self.info(parameter, |size| vec![0u8; size], |b| b.as_mut_ptr() as _); res.map(|b| String::from_utf8(b).unwrap()) } @@ -714,7 +712,7 @@ impl Device { /// A unique device vendor identifier. An example of a unique device identifier could be /// the PCIe ID. pub fn vendor_id(&self) -> Result { - let parameter = sh::CL_DEVICE_VENDOR_ID; + let parameter = foreign::CL_DEVICE_VENDOR_ID; let res = self.info(parameter, |_| 0u32, |b| b as *mut u32 as _)?; Ok(res) } @@ -726,14 +724,14 @@ impl Device { /// /// The major_version.minor_version value returned will be 1.0. pub fn version(&self) -> Result { - let parameter = sh::CL_DEVICE_VERSION; + let parameter = foreign::CL_DEVICE_VERSION; let res = self.info(parameter, |size| vec![0u8; size], |b| b.as_mut_ptr() as _); res.map(|b| String::from_utf8(b).unwrap()) } /// OpenCL software driver version string in the form major_number.minor_number. pub fn driver_version(&self) -> Result { - let parameter = sh::CL_DRIVER_VERSION; + let parameter = foreign::CL_DRIVER_VERSION; let res = self.info(parameter, |size| vec![0u8; size], |b| b.as_mut_ptr() as _); res.map(|b| String::from_utf8(b).unwrap()) } @@ -742,7 +740,7 @@ impl Device { fn info_size(&self, parameter: u32) -> Result { unsafe { let mut size = 0; - let ret_value = sh::clGetDeviceInfo(self.0, parameter, 0, ptr::null_mut(), &mut size); + let ret_value = foreign::clGetDeviceInfo(self.0, parameter, 0, ptr::null_mut(), &mut size); return utility::check(ret_value, || size); } @@ -750,19 +748,21 @@ impl Device { fn info(&self, p: u32, f1: F1, f2: F2) -> Result where F1: Fn(usize) -> T, - F2: Fn(&mut T) -> *mut raw::c_void { + F2: Fn(&mut T) -> *mut c_void { unsafe { let size = self.info_size(p)?; let mut ret = f1(size); - let ret_value = sh::clGetDeviceInfo(self.0, p, size, f2(&mut ret), ptr::null_mut()); + let ret_value = foreign::clGetDeviceInfo(self.0, p, size, f2(&mut ret), ptr::null_mut()); return utility::check(ret_value, || ret); } } } +impl cmp::Eq for Device { } + impl cmp::PartialEq for Device { fn eq(&self, other: &Device) -> bool { @@ -772,19 +772,19 @@ impl cmp::PartialEq for Device { } // TODO use newtype: https://github.com/rust-lang/rust/issues/32146 -pub type Event = sh::cl_event; +pub type Event = foreign::cl_event; #[derive(Debug)] -pub struct Kernel(sh::cl_kernel); +pub struct Kernel(foreign::cl_kernel); pub trait KernelArg { fn size() -> usize; - fn pointer(&self) -> *mut raw::c_void; + fn pointer(&self) -> *mut c_void; } impl KernelArg for Buffer { - fn size() -> usize { mem::size_of::() } - fn pointer(&self) -> sh::cl_mem { self.0 } + fn size() -> usize { mem::size_of::() } + fn pointer(&self) -> foreign::cl_mem { unsafe { mem::transmute(self) } } } impl Kernel { @@ -824,7 +824,7 @@ impl Kernel { unsafe { let size = A::size(); let ptr = buf.pointer(); - let ret_value = sh::clSetKernelArg(self.0, position, size, ptr); + let ret_value = foreign::clSetKernelArg(self.0, position, size, ptr); return utility::check(ret_value, || {}); } } @@ -832,7 +832,7 @@ impl Kernel { /// Increment the kernel reference count. fn retain(&self) -> Result { unsafe { - let ret_value = sh::clRetainKernel(self.0); + let ret_value = foreign::clRetainKernel(self.0); return utility::check(ret_value, || {}); } } @@ -840,7 +840,7 @@ impl Kernel { /// Decrement the kernel reference count. fn release(&self) -> Result { unsafe { - let ret_value = sh::clReleaseKernel(self.0); + let ret_value = foreign::clReleaseKernel(self.0); return utility::check(ret_value, || {}); } } @@ -863,7 +863,7 @@ impl ops::Drop for Kernel { /// Newtype with an internal type of `cl_platform_id`. #[derive(Clone, Debug)] -pub struct Platform(sh::cl_platform_id); +pub struct Platform(foreign::cl_platform_id); impl Platform { @@ -876,17 +876,17 @@ impl Platform { /// * `EMBEDDED_PROFILE` - if the implementation supports the OpenCL embedded profile. The /// embedded profile is defined to be a subset for each version of OpenCL. pub fn profile(&self) -> Result { - self.info(sh::CL_PLATFORM_PROFILE) + self.info(foreign::CL_PLATFORM_PROFILE) } /// Returns the platform name. pub fn name(&self) -> Result { - self.info(sh::CL_PLATFORM_NAME) + self.info(foreign::CL_PLATFORM_NAME) } /// Returns the platform vendor. pub fn vendor(&self) -> Result { - self.info(sh::CL_PLATFORM_VENDOR) + self.info(foreign::CL_PLATFORM_VENDOR) } /// Returns a space-separated list of extension names (the extension names themselves do @@ -898,13 +898,13 @@ impl Platform { st.split_whitespace().map(|s| s.into()).collect() }; - self.info(sh::CL_PLATFORM_EXTENSIONS).map(closure) + self.info(foreign::CL_PLATFORM_EXTENSIONS).map(closure) } pub fn ndevices_by_type(&self, t: u64) -> Result { unsafe { let mut ndevices = 0; - let ret_value = sh::clGetDeviceIDs(self.0, t, 0, ptr::null_mut(), &mut ndevices); + let ret_value = foreign::clGetDeviceIDs(self.0, t, 0, ptr::null_mut(), &mut ndevices); return utility::check(ret_value, || ndevices); } } @@ -912,23 +912,23 @@ impl Platform { pub fn devices_by_type(&self, t: u64) -> Result> { unsafe { let ndevices = self.ndevices_by_type(t)?; - let mut vec_id = vec![0 as sh::cl_device_id; ndevices as usize]; + let mut vec_id = vec![0 as foreign::cl_device_id; ndevices as usize]; let n = ptr::null_mut(); - let ret_value = sh::clGetDeviceIDs(self.0, t, ndevices, vec_id.as_mut_ptr(), n); + let ret_value = foreign::clGetDeviceIDs(self.0, t, ndevices, vec_id.as_mut_ptr(), n); utility::check(ret_value, || vec_id.iter().map(|&id| Device(id)).collect()) } } pub fn devices(&self) -> Result> { - self.devices_by_type(sh::CL_DEVICE_TYPE_ALL) + self.devices_by_type(foreign::CL_DEVICE_TYPE_ALL) } /// Returns the size of `parameter`. fn info_size(&self, parameter: u32) -> Result { unsafe { let mut size = 0; - let ret_value = sh::clGetPlatformInfo(self.0, parameter, 0, ptr::null_mut(), &mut size); + let ret_value = foreign::clGetPlatformInfo(self.0, parameter, 0, ptr::null_mut(), &mut size); return utility::check(ret_value, || size); } @@ -938,11 +938,11 @@ impl Platform { unsafe { let size = self.info_size(parameter)?; let mut bytes = vec![0u8; size]; - let ret_value = sh::clGetPlatformInfo( + let ret_value = foreign::clGetPlatformInfo( self.0, parameter, size, - bytes.as_mut_ptr() as *mut raw::c_void, + bytes.as_mut_ptr() as *mut c_void, ptr::null_mut() ); @@ -952,7 +952,7 @@ impl Platform { } #[derive(Debug)] -pub struct Program(sh::cl_program); +pub struct Program(foreign::cl_program); impl Program { @@ -968,7 +968,7 @@ impl Program { pub fn build(&self, devices: &[Device], opt: T) -> Result where T: Into> { unsafe { let num_devices = devices.len() as u32; - let raw_devices: Vec = devices.iter().map(|d| d.0).collect(); + let raw_devices: Vec = devices.iter().map(|d| d.0).collect(); let raw_devices_ptr = raw_devices.as_ptr(); let options = match opt.into() { @@ -980,7 +980,7 @@ impl Program { let pfn_notify = mem::transmute(ptr::null::()); let user_data = ptr::null_mut(); - let ret_value = sh::clBuildProgram( + let ret_value = foreign::clBuildProgram( self.0, num_devices, raw_devices_ptr, @@ -999,8 +999,8 @@ impl Program { let mut errcode = 0i32; let cstring = ffi::CString::new(name.as_ref()).unwrap(); let ptr = cstring.as_ptr(); - let cl_kernel = sh::clCreateKernel(self.0, ptr, &mut errcode); - let ret_value = sh::CLStatus::new(errcode).expect("failed to convert i32 to CLStatus"); + let cl_kernel = foreign::clCreateKernel(self.0, ptr, &mut errcode); + let ret_value = foreign::CLStatus::new(errcode).expect("failed to convert i32 to CLStatus"); return utility::check(ret_value, || Kernel(cl_kernel)); } @@ -1009,7 +1009,7 @@ impl Program { /// Increment the context reference count. fn retain(&self) -> Result { unsafe { - let ret_value = sh::clRetainProgram(self.0); + let ret_value = foreign::clRetainProgram(self.0); return utility::check(ret_value, || {}); } } @@ -1017,7 +1017,7 @@ impl Program { /// Decrement the context reference count. fn release(&self) -> Result { unsafe { - let ret_value = sh::clReleaseProgram(self.0); + let ret_value = foreign::clReleaseProgram(self.0); return utility::check(ret_value, || {}); } } @@ -1039,7 +1039,7 @@ impl ops::Drop for Program { } #[derive(Debug)] -pub struct Queue(sh::cl_command_queue); +pub struct Queue(foreign::cl_command_queue); impl Queue { @@ -1047,9 +1047,9 @@ impl Queue { pub fn new(context: &Context, device: &Device, properties: u64) -> Result { unsafe { let mut errcode_ret: i32 = 0; - let cl_command_queue = sh::clCreateCommandQueue(context.0, device.0, + let cl_command_queue = foreign::clCreateCommandQueue(context.0, device.0, properties, &mut errcode_ret); - let ret_value = sh::CLStatus::new(errcode_ret).expect("failed to convert i32 to CLStatus"); + let ret_value = foreign::CLStatus::new(errcode_ret).expect("failed to convert i32 to CLStatus"); return utility::check(ret_value, || Queue(cl_command_queue)); } @@ -1099,7 +1099,7 @@ impl Queue { blocking_write: bool, offset: usize, cb: usize, - ptr: *const raw::c_void, + ptr: *const c_void, event_wait_list: &[Event]) -> Result { @@ -1109,11 +1109,11 @@ impl Queue { let events = if num_events_in_wait_list > 0 { event_wait_list.as_ptr() } else { ptr::null() }; - let mut new_event = 0 as sh::cl_event; + let mut new_event = 0 as foreign::cl_event; let blocking_write_u32 = if blocking_write { 1 } else { 0 }; - let ret_value = sh::clEnqueueWriteBuffer( + let ret_value = foreign::clEnqueueWriteBuffer( self.0, buffer.0, blocking_write_u32, @@ -1160,7 +1160,7 @@ impl Queue { blocking_read: bool, offset: usize, cb: usize, - ptr: *mut raw::c_void, + ptr: *mut c_void, event_wait_list: &[Event]) -> Result { @@ -1169,11 +1169,11 @@ impl Queue { let events = if num_events_in_wait_list > 0 { event_wait_list.as_ptr() } else { ptr::null() }; - let mut new_event = 0 as sh::cl_event; + let mut new_event = 0 as foreign::cl_event; let blocking_read_u32 = if blocking_read { 1 } else { 0 }; - let ret_value = sh::clEnqueueReadBuffer( + let ret_value = foreign::clEnqueueReadBuffer( self.0, buffer.0, blocking_read_u32, @@ -1256,7 +1256,7 @@ impl Queue { let events = if num_events_in_wait_list > 0 { event_wait_list.as_ptr() } else { ptr::null() }; - let mut new_event = 0 as sh::cl_event; + let mut new_event = 0 as foreign::cl_event; // == ptrs let global_work_size_ptr = @@ -1265,7 +1265,7 @@ impl Queue { let local_work_size_ptr = if local_work_size.len() > 0 { local_work_size.as_ptr() } else { ptr::null() }; - let ret_value = sh::clEnqueueNDRangeKernel( + let ret_value = foreign::clEnqueueNDRangeKernel( self.0, kernel.0, work_dim, @@ -1284,7 +1284,7 @@ impl Queue { /// Increments the command_queue reference count. fn retain(&self) -> Result { unsafe { - let ret_value = sh::clRetainCommandQueue(self.0); + let ret_value = foreign::clRetainCommandQueue(self.0); return utility::check(ret_value, || {}); } } @@ -1292,7 +1292,7 @@ impl Queue { /// Decrements the command_queue reference count. fn release(&self) -> Result { unsafe { - let ret_value = sh::clReleaseCommandQueue(self.0); + let ret_value = foreign::clReleaseCommandQueue(self.0); return utility::check(ret_value, || {}); } } @@ -1308,6 +1308,16 @@ impl clone::Clone for Queue { } } +impl cmp::Eq for Queue { } + +impl cmp::PartialEq for Queue { + + fn eq(&self, other: &Queue) -> bool { + + self.0 == other.0 + } +} + impl ops::Drop for Queue { fn drop(&mut self) { diff --git a/src/frameworks/opencl/hl/utility.rs b/src/frameworks/opencl/high/utility.rs similarity index 81% rename from src/frameworks/opencl/hl/utility.rs rename to src/frameworks/opencl/high/utility.rs index c9a36a1..944f05d 100644 --- a/src/frameworks/opencl/hl/utility.rs +++ b/src/frameworks/opencl/high/utility.rs @@ -1,5 +1,5 @@ -use super::error::{ErrorKind, Result}; -use super::super::sh::CLStatus; +use super::{ErrorKind, Result}; +use super::super::foreign::CLStatus; pub fn check(cl_status: CLStatus, ok_value: F) -> Result where F: FnOnce() -> T { diff --git a/src/frameworks/opencl/hl/mod.rs b/src/frameworks/opencl/hl/mod.rs deleted file mode 100644 index d0862ca..0000000 --- a/src/frameworks/opencl/hl/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -pub use self::error::{Error, ErrorKind, Result}; -pub use self::high::{nplatforms, platforms}; -pub use self::high::{Buffer, Context, Device, Event, Kernel, KernelArg, Platform, Program, Queue}; - -mod error; -mod high; -mod utility; \ No newline at end of file diff --git a/src/frameworks/opencl/interface/context.rs b/src/frameworks/opencl/interface/context.rs new file mode 100644 index 0000000..023313f --- /dev/null +++ b/src/frameworks/opencl/interface/context.rs @@ -0,0 +1,90 @@ +use {Context, Device, Error, ErrorKind, ExtensionPackage, Result}; +use std::marker::Unsize; +use super::OpenCLDevice; +use super::super::high; +use utility::Uninitialized; + +/// Represents an OpenCL context. +/// +/// A context is responsible for managing OpenCL objects and resources (command-queues, program +/// objects, kernel objects, executing kernels, etc.). The usual configuration is a single context +/// encapsulating multiple devices. The resources, such as [buffers][buffer] and [events][event], +/// can be shared across multiple devices in a single context. Other possible setups include: +/// +/// * a single context for multiple devices +/// * a single context for a single device +/// * a context for each device +/// +/// note: multi-platform contexts are not supported in OpenCL. +/// +/// ## Programs +/// +/// An OpenCL context can have multiple programs associated with it. Programs can be compiled +/// individually to avoid possible name clashes due to using packages from multiple package +/// authors. +/// +/// [buffer]: ./frameworks/opencl/struct.Memory.html +/// [event]: ./frameworks/opencl/struct.Event.html +#[derive(Debug)] +pub struct OpenCLContext { + /// The high-level context. + pub(super) context: high::Context, + /// Holds the extension package implementation. + pub(super) package: X, + /// A list of devices associated with the context. + pub(super) selection: Vec, + /// The index of the _active_ device. + pub(super) active: usize, +} + +impl OpenCLContext { + + /// Returns the `package`. + /// + /// [package author] + pub fn package(&self) -> &X { + &self.package + } + + /// Returns the _active_ OpenCL device. + /// + /// [package author] + pub fn device(&self) -> &OpenCLDevice { + &self.selection[self.active] + } +} + +impl OpenCLContext { + /// Creates and returns a program. + pub fn create_program(&mut self, src: &[I]) -> Result where I: AsRef { + let program = self.context.create_program_with_source(src)?; + let raw_devices: Vec<_> = self.selection.iter().map(|d| d.device.clone()).collect(); + program.build(&raw_devices, None /* TODO */)?; + Ok(program) + } +} + +impl Context for OpenCLContext where X: ExtensionPackage, Self: Unsize { + + type Package = X; + + fn active_device(&self) -> &Device { + + &self.selection[self.active] + } + + fn set_active(&mut self, idx: usize) -> Result { + if idx >= self.selection.len() { + return Err(Error::new(ErrorKind::Other, "device index out of range")); + } + + self.active = idx; + + Ok(()) + } + + fn extension(&self) -> &X::Extension { + + self + } +} \ No newline at end of file diff --git a/src/frameworks/opencl/interface/device.rs b/src/frameworks/opencl/interface/device.rs new file mode 100644 index 0000000..52ecd79 --- /dev/null +++ b/src/frameworks/opencl/interface/device.rs @@ -0,0 +1,130 @@ +use {Alloc, ComputeDevice, ErrorKind, Memory, Result, Shape, Synch, Viewable}; +use std::os::raw::c_void; +use super::OpenCLMemory; +use super::super::{foreign, high}; +use utility; + +/// Represents an OpenCL device. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct OpenCLDevice { + pub(super) device: high::Device, + pub(super) context: high::Context, + /// A command queue + /// + /// A command queue is the mechanism for interaction with the device. The queue is used for + /// operations such as kernel launches and memory copies. At least one command queue per device + /// is required. Queues are used by the host application to submit work to devices and + /// associated with devices within a context. + /// + /// __commands__: + /// + /// - memory copy or mapping + /// - device code execution + /// - synchronization point + /// + /// __modes__: + /// + /// - in-order + /// - out-of-order + /// + /// ## TODO + /// + /// * Use events to synchronize + pub(super) queue: high::Queue, +} + +impl OpenCLDevice { + + /// Returns the OpenCL command queue. + /// + /// [package author] + pub fn queue(&self) -> &high::Queue { + &self.queue + } +} + +impl Viewable for OpenCLDevice { + + fn view(&self) -> ComputeDevice { + ComputeDevice::OpenCL(self.clone()) + } +} + +impl Alloc for OpenCLDevice { + + + fn alloc(&self, shape: &Shape) -> Result> { + // TODO + + let flag = foreign::CL_MEM_READ_WRITE; + let length = shape.capacity(); + let size = utility::allocated::(length); + let buffer = self.context.create_buffer(flag, size, None)?; + let cl_buffer = OpenCLMemory { buf: buffer }; + Ok(Memory::OpenCL(cl_buffer)) + } + + fn allocwrite(&self, shape: &Shape, mut data: Vec) -> Result> { + // TODO + + let flag = foreign::CL_MEM_READ_ONLY | foreign::CL_MEM_COPY_HOST_PTR; + let length = shape.capacity(); + let size = utility::allocated::(length); + let pointer = data.as_mut_ptr(); + + // create buffer and fill it immediately + let buffer = self.context.create_buffer(flag, size, pointer as *mut c_void)?; + let cl_buffer = OpenCLMemory { buf: buffer }; + Ok(Memory::OpenCL(cl_buffer)) + } +} + +impl Synch for OpenCLDevice { + + fn write(&self, memory: &mut Memory, _: &ComputeDevice, source: &Memory) -> Result { + match *source { + Memory::Native(ref native_memory) => { + let cl_memory = unsafe { memory.as_opencl_unchecked() }; + + let length = native_memory.len(); + let size = utility::allocated::(length); + let slice = native_memory.as_slice_memory_order().unwrap(); + let slice_pointer = slice.as_ptr(); + + let ref buf = cl_memory.buf; + let block = true; // TODO async + let offset = 0; + let _ = self.queue + .enqueue_write_buffer(buf, block, offset, size, slice_pointer as *const c_void, &[])?; + + Ok(()) + }, + + _ => Err(ErrorKind::NoAvailableSynchronizationRouteFound.into()), + } + } + + /// Synchronizes `memory` to `destination`. + fn read(&self, memory: &Memory, _: &mut ComputeDevice, destination: &mut Memory) -> Result { + match *destination { + Memory::Native(ref mut native_memory) => { + let cl_memory = unsafe { memory.as_opencl_unchecked() }; + + let length = native_memory.len(); + let size = utility::allocated::(length); + let slice = native_memory.as_slice_memory_order_mut().unwrap(); + let slice_pointer = slice.as_mut_ptr(); + + let ref buf = cl_memory.buf; + let block = true; // TODO async + let offset = 0; + let _ = self.queue + .enqueue_read_buffer(buf, block, offset, size, slice_pointer as *mut c_void, &[])?; + + Ok(()) + }, + + _ => Err(ErrorKind::NoAvailableSynchronizationRouteFound.into()), + } + } +} \ No newline at end of file diff --git a/src/frameworks/opencl/interface/event.rs b/src/frameworks/opencl/interface/event.rs new file mode 100644 index 0000000..cb981ce --- /dev/null +++ b/src/frameworks/opencl/interface/event.rs @@ -0,0 +1,8 @@ +/// Represents an OpenCL event. +/// +/// Most OpenCL operations happen asynchronously on an OpenCL device. Multiple OpenCL operations +/// can be ordered and synchronized by way of the event objects yielded by the operations. An event +/// object can be used as an input to other operations which will wait until the event has +/// finished executing to run. +#[derive(Debug)] +pub struct OpenCLEvent(() /* TODO */); \ No newline at end of file diff --git a/src/frameworks/opencl/interface/framework.rs b/src/frameworks/opencl/interface/framework.rs new file mode 100644 index 0000000..a75bdac --- /dev/null +++ b/src/frameworks/opencl/interface/framework.rs @@ -0,0 +1,150 @@ +use {BoxContext, Build, Context, Error, ErrorKind, ExtensionPackage, Framework}; +use {Hardware, HardwareKind, Result}; +use std::marker::Unsize; +use super::{OpenCLContext, OpenCLDevice}; +use super::super::{foreign, high}; +use utility::{TryDefault, Uninitialized}; + +const OPEN_CL: &'static str = "OpenCL"; + +/// Provides the OpenCL framework. +/// +/// # Flow +/// +/// Since multiple platforms can exist, the first available platform is selected during +/// the initialization. A list of available devices are then provided for your choosing. Then, +/// the provided selection of devices are used to create a context, with a command queue for each +/// device. At this stage, a program(s) is compiled. A (host) program is essentially a collection +/// of kernels. A kernel is the smallest unit of execution. +/// +/// In OpenCL, the host code can read in a kernel binary (i.e., compiled off-line) or a kernel +/// source file (i.e., compile on-line). More information on on-line/off-line compilation can be +/// found [here][1]. Kernels are expensive to start, so they're typically used to do a large amount +/// of work. Memory allocated on an OpenCL device can be used when executing kernels, and then +/// transfered back. +/// +/// Work-groups, a collection of work-items, are assigned to execute on compute-units. A work-item +/// is an instance of a kernel as runtime. That kernel instance is at a point in an index, which +/// can be thought of as a grid and the work-groups which contain the work-items can be thought of +/// as sub-grids within the grid. The work-groups can be defined explicitly or implicitly by +/// simply specifying the number of work-items, both dealing with data parallelism. In terms of task +/// parallelism, kernels are executed independent of an index space. +/// It should also be noted that there are [built-in scalar data types][2] along with +/// [built-in functions][3]. +/// +/// [1]: https://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook/online-offline-compilation/ +/// [2]: https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/scalarDataTypes.html +/// [3]: https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/mathFunctions.html +#[derive(Debug)] +pub struct OpenCL { + /// raw device pointers + cache: Vec, + /// A list of available devices for the first platform found. + /// + /// Platforms are defined by the implementation. Platforms enables the host to interact with + /// OpenCL-capable devices. + pub available_hardware: Vec, +} + +impl Framework for OpenCL { + + const FRAMEWORK_NAME: &'static str = OPEN_CL; + + fn available_hardware(&self) -> Vec { + self.available_hardware.clone() + } +} + +impl BoxContext for OpenCL + where X: ExtensionPackage + Build>, + OpenCLContext: Unsize + { + + fn enclose(&self, hw_selection: Vec) -> Result>> { + let indices: Vec = hw_selection.iter().map(|hardware| hardware.id).collect(); + + let devices: Vec = self.cache + .iter() + .enumerate() + .filter(|&(idx, _)| indices.contains(&idx)) + .map(|(_, device)| device.clone()) + .collect(); + + // create a context for the devices + let hl_context = high::Context::new(&devices)?; + + let mut selection = vec![]; + + for raw_device in devices.into_iter() { + + // create a command queue (with profiling enabled, needed for timing kernels) + let queue = high::Queue::new(&hl_context, &raw_device, foreign::CL_QUEUE_PROFILING_ENABLE)?; + + selection.push(OpenCLDevice { + device: raw_device, + context: hl_context.clone(), + queue, + }); + } + + let mut context: OpenCLContext = OpenCLContext { + context: hl_context, + package: (), + selection: selection, + active: 0, + }; + + let package = X::build(&mut context)?; + + Ok(Box::new(OpenCLContext { + context: context.context, + package: package, + selection: context.selection, + active: context.active, + })) + } +} + +impl TryDefault for OpenCL { + + type Err = Error; + + fn try_default() -> Result { + + let mut cache = vec![]; + + let available_hardware = { + high::platforms()?.remove(0).devices()?.into_iter() + //.skip(|dev_handle| dev_handle.available()) + .enumerate() + .map(|(index, dev_handle)| { + let hardware = Hardware { + id: index, + framework: OPEN_CL, + kind: match dev_handle.type_().unwrap() { + foreign::CL_DEVICE_TYPE_CPU => HardwareKind::Central, + foreign::CL_DEVICE_TYPE_GPU => HardwareKind::Graphics, + foreign::CL_DEVICE_TYPE_ACCELERATOR => HardwareKind::Accelerator, + _ => HardwareKind::Other, + }, + name: dev_handle.name().unwrap(), + compute_units: dev_handle.max_compute_units().unwrap() as usize, + }; + + cache.push(dev_handle); + + hardware + }) + .collect() + }; + + Ok(OpenCL { cache, available_hardware }) + } +} + +impl From for Error { + + fn from(e: high::Error) -> Error { + Error::new(ErrorKind::Framework { name: OPEN_CL }, e) + } +} \ No newline at end of file diff --git a/src/frameworks/opencl/interface/image.rs b/src/frameworks/opencl/interface/image.rs new file mode 100644 index 0000000..323048a --- /dev/null +++ b/src/frameworks/opencl/interface/image.rs @@ -0,0 +1,3 @@ +/// Represents an OpenCL image, opaque 2D/3D objects. +#[derive(Debug)] +pub struct OpenCLImage(() /* TODO */); \ No newline at end of file diff --git a/src/frameworks/opencl/interface/memory.rs b/src/frameworks/opencl/interface/memory.rs new file mode 100644 index 0000000..cbd4aa9 --- /dev/null +++ b/src/frameworks/opencl/interface/memory.rs @@ -0,0 +1,28 @@ +use Memory; +use super::super::high; + +/// A `Memory` wraps around an OpenCL buffer id that manages its deallocation, named +/// as such for consistency's sake. +/// +/// Memory objects can be copied to host memory, from host memory, or to other memory objects. +/// Copying from the host to a device is considered _writing_. Copying from a device to the host is +/// considered _reading_. +/// +/// Unlike CUDA, OpenCL [buffers][1] are only context specific, not device specific. Also note: +/// currently, lazy allocation is used on the NVIDIA driver. That is, the buffer object, in a sense, +/// is located _nowhere_ when allocated. It only exists when needed. +/// +/// [1]: https://goo.gl/S9B3TL +#[derive(Clone, Debug)] +pub struct OpenCLMemory { + pub(super) buf: high::Buffer, +} + +impl ::opencl::high::KernelArg for Memory { + fn size() -> usize { ::std::mem::size_of::<::opencl::foreign::cl_mem>() } + + fn pointer(&self) -> ::opencl::foreign::cl_mem { + + unsafe { self.as_opencl_unchecked().buf.pointer() } + } +} \ No newline at end of file diff --git a/src/frameworks/opencl/interface/mod.rs b/src/frameworks/opencl/interface/mod.rs new file mode 100644 index 0000000..c076a2a --- /dev/null +++ b/src/frameworks/opencl/interface/mod.rs @@ -0,0 +1,13 @@ +pub use self::context::OpenCLContext; +pub use self::device::OpenCLDevice; +pub use self::event::OpenCLEvent; +pub use self::framework::OpenCL; +pub use self::image::OpenCLImage; +pub use self::memory::OpenCLMemory; + +mod context; +mod device; +mod event; +mod framework; +mod image; +mod memory; \ No newline at end of file diff --git a/src/frameworks/opencl/mod.rs b/src/frameworks/opencl/mod.rs index c795c7b..0f66049 100644 --- a/src/frameworks/opencl/mod.rs +++ b/src/frameworks/opencl/mod.rs @@ -1,74 +1,8 @@ -//! OpenCL backend support. -//! -//! ## Scalar Data Types -//! -//! TODO -//! -//! Built-in scalar data types: -//! https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/scalarDataTypes.html -//! -//! ## Terminology -//! -//! Work-group: A collection of work items + has a unique work-group ID. work-groups are assigned -//! to execute on compute-units -//! -//! Work-item: An instance of a kernel at run time + has a unique ID within the work-group -//! -//! TODO -//! -//! ## Flow -//! -//! - Initialize the framework -//! - Select the desired platform -//! - Select the desired devices from the platform -//! - Create a context -//! - create a command queue per device -//! - Compile programs -//! - A program is essentially a collection of kernels. -//! - Create a kernel from the successfully compiled program -//! - A kernel is the smallest unit of execution. Kernels are expensive to start so they're -//! typically used to do a large amount of work. -//! - Specify arguments to the kernel -//! - Allocate memory on devices -//! - Transfer data to devices -//! - Execute -//! - Transfer results back -//! - Free memory on devices -//! -//! ## Events (TODO) -//! -//! Most OpenCL operations happen asynchronously on the OpenCL Device. To provide the possibility -//! to order and synchronize multiple operations, the execution of an operation yields a event -//! object. This event can be used as an input to other operations which will wait until this event -//! has finished executing to run. +//! OpenCL backend support - heterogeneous computing. -pub mod hl; -pub mod sh; +pub mod foreign; +pub mod high; -// use super::super::Framework; +pub use self::interface::{OpenCL, OpenCLContext, OpenCLDevice, OpenCLEvent, OpenCLImage, OpenCLMemory}; -/// Provides the OpenCL framework. -#[derive(Debug)] -pub struct OpenCL { - // /// A list of available devices for the first platform found. - // pub available_devices: Vec, -} - -/// A `Memory` wraps around an OpenCL buffer id that manages its deallocation, named -/// as such for consistency's sake. -/// -/// OpenCL buffers are only context specific, not device specific. -#[derive(Clone, Debug)] -pub struct Memory { - hl: hl::Buffer, - capacity: usize, -} - -// impl OpenCL { - -// /// Attempts to initialize the framework. -// pub fn new() -> Result { - -// unimplemented!() -// } -// } \ No newline at end of file +mod interface; \ No newline at end of file diff --git a/src/hardware.rs b/src/hardware.rs new file mode 100644 index 0000000..880e52c --- /dev/null +++ b/src/hardware.rs @@ -0,0 +1,116 @@ +use frameworks::native::NativeDevice; +use frameworks::opencl::OpenCLDevice; +use super::{Memory, Result, Shape}; + +/// An device capable of processing data. +/// +/// A compute device is a processor, such as a CPU or a GPU. `Device` is simply +/// an [alias][issue#8634] for the various trait bounds associated with a compute device. +/// +/// [issue#8634]: https://github.com/rust-lang/rust/issues/8634 +pub trait Device: + 'static + + Alloc + Alloc + Alloc + + Synch + Synch + Synch + + Viewable { + + } + +impl Device for D where D: + 'static + + Alloc + Alloc + Alloc + + Synch + Synch + Synch + + Viewable { + + } + +/// A _viewable_ device. +pub trait Viewable { + + /// Returns a device _view_. + fn view(&self) -> ComputeDevice; +} + +/// A wrapper around the various compute devices. +/// +/// `ComputeDevice` and `Viewable` use the [_family_ pattern][pattern]. +/// +/// [pattern]: https://www.reddit.com/r/rust/comments/2rdoxx/enum_variants_as_types/cnezl0p/ +#[derive(Debug, Eq, PartialEq)] +pub enum ComputeDevice { + /// A native host device + Native(NativeDevice), + /// An OpenCL device + OpenCL(OpenCLDevice), +} + +impl ComputeDevice { + + /// Returns the device + pub fn device(&self) -> &Device { + match *self { + ComputeDevice::Native(ref d) => d, + ComputeDevice::OpenCL(ref d) => d, + } + } +} + +/// Allocator +pub trait Alloc { + + /// Allocates memory on the device. + fn alloc(&self, shape: &Shape) -> Result>; + + /// Allocates and transfers memory `data` to the device. + fn allocwrite(&self, shape: &Shape, data: Vec) -> Result>; +} + +/// Synchronizer +/// +/// note: host <-> GPU for now.. GPU <-> GPU later.. +pub trait Synch { + + // TODO refactor + + /// Synchronizes `memory` from `source`. + fn write(&self, memory: &mut Memory, s_location: &ComputeDevice, s: &Memory) -> Result; + + /// Synchronizes `memory` to `destination`. + fn read(&self, memory: &Memory, d_location: &mut ComputeDevice, d: &mut Memory) -> Result; +} + +/// Hardware can be GPUs, multi-core CPUs or DSPs, Cell/B.E. processor or whatever else +/// is supported by the provided framework. The struct holds all important information about +/// the hardware. To execute code on hardware, turn hardware into a [`Device`]. +/// +/// [`Device`]: [device]: ./struct.Device.html +#[derive(Clone, Debug)] +pub struct Hardware { + /// The unique ID of the hardware. + pub id: usize, + /// Framework marker + pub framework: &'static str, + /// The type of compute device, such as a CPU or a GPU. + pub kind: HardwareKind, + /// The name. + pub name: String, + /// The number of compute units. + /// + /// A compute device usually has multiple compute units. + pub compute_units: usize, +} + +/// General categories for devices, used to identify the type of a device. +#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] +pub enum HardwareKind { + /// Used for accelerators. Accelerators can communicate with host processor using a peripheral + /// interconnect such as PCIe. + Accelerator, + /// Used for devices that are host processors. The host processor runs the implementations + /// and is a single or multi-core CPU. + Central, + /// Used for GPU devices. + Graphics, + /// Used for anything else. + Other, +} \ No newline at end of file diff --git a/src/interface.rs b/src/interface.rs deleted file mode 100644 index 9104744..0000000 --- a/src/interface.rs +++ /dev/null @@ -1,111 +0,0 @@ -use std::fmt::Debug; - -use super::{Memory, Shape}; -use super::error::Result; - -/// Represents the location of a buffer or memory, which the associated device can -/// use to access it. -#[derive(Debug, Eq, PartialEq)] -pub struct Address { - /// A string literal containing the name of the framework. - pub framework: &'static str, - /// The context identifier - pub context: isize, - /// The device identifier. - pub device: isize, -} - -/// A device capable of processing data. -/// -/// The `T` type associated with the [`SharedTensor`](./struct.SharedTensor.html). -pub trait ComputeDevice { - - /// Allocates memory on the device. - fn allocate(&self, shape: &Shape) -> Result>; - - /// Allocates memory on the device. - fn allocate_with(&self, shape: &Shape, slice: &mut [T]) -> Result>; - - // /// Synchronizes `memory` from `source`. - // fn sync_in(&self, memory: &mut Memory, source: &Memory) -> Result; - - /// Synchronizes `memory` to `destination`. - fn sync_out(&self, memory: &Memory, destination: &mut Memory) -> Result; - - /// Returns the location of the device. - /// - /// The `addr` method is used by `SharedTensor`s for memory storage purposes. The _address_ - /// is simply the name of the framework associated with the device, the device's unique - /// identifier, and an integer associated with the context the device is contained in. - fn addr(&self) -> Address; -} - -/// Contexts are the heart of both OpenCL and CUDA applications. Contexts provide a container for -/// objects such as memory, command-queues, programs/modules and kernels. -pub trait Context: Debug { - -} - -/// A trait implemented for all frameworks. `Framework`s contain a list of all available devices as -/// well as other objects specific to the implementor. -/// -/// The default framework is simply the host CPU for common computation. To make use of other -/// devices such as GPUs, you may choose a GPGPU framework (such as OpenCL or CUDA) to access the -/// processing capabilities of the device(s). -pub trait Framework: Debug { - /// The name of the framework. - /// - /// This associated constant is mainly used for the purposes of debugging and reporting errors. - /// - /// note: *uses the "SCREAMING_SNAKE_CASE" naming convention (e.g., `"OPEN_CL"`). - const FRAMEWORK_NAME: &'static str; - - // type Context: Context; - - // fn try_init(&self) -> Result; - - // fn try_context(&self, selection: Vec) -> Result>; - - // TODO: - // https://github.com/rust-lang/rust/issues/29924 - #[doc(hidden)] - fn name(&self) -> &'static str { - Self::FRAMEWORK_NAME - } -} - -// /// The object-safe version of `Framework`. -// trait FrameworkObject: Debug { } - -/// The generic hardware representation for a `ComputeDevice`. -/// -/// A compute device is a processor, such as a CPU or a GPU. -pub struct Hardware { - /// The unique ID of the hardware. - id: usize, - /// The type of compute device, such as a CPU or a GPU. - kind: HardwareKind, - /// The name. - name: String, - /// The number of compute units. - /// - /// A compute device usually has multiple compute units. - compute_units: usize, - // /// Framework marker - // framework: PhantomData, -} - -/// General categories for devices, used to identify the type of a device. -#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] -pub enum HardwareKind { - /// Used for accelerators. Accelerators can communicate with host processor using a peripheral - /// interconnect such as PCIe. - Accelerator, - /// Used for devices that are host processors. The host processor runs the implementations - /// and is a single or multi-core CPU. - Central, - /// Used for GPU devices. - Graphics, - /// Used for anything else. - Other, -} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 2bb5add..08d55a7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,7 @@ //! Provides a simple, unified API for running highly parallel computations on different //! devices across different GPGPU frameworks, allowing you to swap your backend at runtime. //! -//! Parenchyma is a hard fork of [Collenchyma], a now-defunct project started at [Autumn]. +//! Parenchyma began as a hard fork of [Collenchyma], a now-defunct project started at [Autumn]. //! //! ## Abstract //! @@ -30,10 +30,10 @@ //! //! ## Architecture //! -//! The single entry point of Parenchyma is the [Backend](./struct.Backend.html) type. A +//! The single entry point of Parenchyma is a [Backend](./struct.Backend.html). A //! backend is agnostic over the device it runs operations on. In order to be agnostic over the -//! device, such as native host CPU, GPUs, accelerators or any other devices, the backend needs to be -//! agnostic over the framework as well. The framework is important, as it provides the interface +//! device, such as native host CPU, GPUs, accelerators or any other devices, the backend needs to +//! be agnostic over the framework as well. The framework is important, as it provides the interface //! to execute operations on devices, among other things. Since different vendors of hardware use //! different frameworks, it becomes important that the backend is agnostic over the framework. //! This allows us to run computations on any machine without having to worry about hardware @@ -56,13 +56,13 @@ //! a BLAS extension. Whether or not the dot operation is executed on one GPU, multiple GPUS or on //! a CPU device depends solely on how you configured the backend. //! -//! ### Bundles +//! ### Packages //! -//! The concept of Parenchyma extensions has one more component - the [Bundle](./trait.Bundle.html) +//! The concept of Parenchyma extensions has one more component - the [Package](./trait.ExtensionPackage.html) //! trait. As opposed to executing code on the native CPU, other devices need to compile and build -//! the extension manually at runtime, which makes up a significant part of a framework. We need an +//! the extension manually at runtime which makes up a significant part of a framework. We need an //! instance that's able to be initialized at runtime for holding the sate and compiled -//! operations - which is the bundle's main purpose. +//! operations - which is the package's main purpose. //! //! ### Memory //! @@ -75,6 +75,33 @@ //! the device. Memory can also be passed around to different backends. Operations take tensors //! as arguments while handling the synchronization and allocation for you. //! +//! ## Example +//! +//! ```ignore +//! extern crate parenchyma as pa; +//! extern crate parenchyma_nn as pann; +//! +//! use pa::{Backend, Native, OpenCL, SharedTensor}; +//! +//! fn main() { +//! let ref native: Backend = Backend::new::()?; +//! // Initialize an OpenCL or CUDA backend packaged with the NN extension. +//! let ref backend = pann::Backend::new::()?; +//! +//! // Initialize two `SharedTensor`s. +//! let shape = 1; +//! let ref x = SharedTensor::::with(backend, shape, vec![3.5])?; +//! let ref mut result = SharedTensor::::new(shape); +//! +//! // Run the sigmoid operation, provided by the NN extension, on +//! // your OpenCL/CUDA enabled GPU (or CPU, which is possible through OpenCL) +//! backend.sigmoid(x, result)?; +//! +//! // Print the result: `[0.97068775] shape=[1], strides=[1]` +//! println!("{:?}", result.read(native)?.as_native()?); +//! } +//! ``` +//! //! ## Development //! //! At the moment, Parenchyma itself will provide Rust APIs for the important @@ -90,9 +117,9 @@ //! [Collenchyma]: https://github.com/autumnai/collenchyma //! [Autumn]: https://github.com/autumnai -#![allow(warnings)] -// #![deny(missing_docs, unused_import_braces, unused_qualifications)] -#![feature(associated_consts, field_init_shorthand, libc, type_ascription, untagged_unions)] +#![deny(missing_docs, missing_debug_implementations, unused_import_braces, unused_qualifications)] +#![feature(associated_consts, pub_restricted)] +#![feature(libc, unsize, untagged_unions)] #[macro_use] extern crate enum_primitive; #[macro_use] extern crate lazy_static; @@ -103,16 +130,27 @@ extern crate libloading as lib; extern crate ndarray; pub mod changelog; -pub mod error; -pub mod frameworks; +pub mod utility; + +pub use self::frameworks::{native, opencl}; pub use self::backend::Backend; -pub use self::interface::{Address, ComputeDevice, Context, Framework, Hardware, HardwareKind}; +pub use self::context::Context; +pub use self::error::{Error, ErrorKind, Result}; +pub use self::extension::{Build, ExtensionPackage, Unextended}; +pub use self::framework::{BoxContext, Framework}; +pub use self::frameworks::native::Native; +pub use self::frameworks::opencl::OpenCL; +pub use self::hardware::{Alloc, ComputeDevice, Device, Hardware, HardwareKind, Synch, Viewable}; pub use self::memory::Memory; -pub use self::tensor::{Shape, SharedTensor, Tensor, TensorMut, u64Map}; +pub use self::tensor::{Shape, SharedTensor}; mod backend; -mod interface; +mod context; +mod error; +mod extension; +mod framework; +mod frameworks; +mod hardware; mod memory; -mod tensor; -mod utility; \ No newline at end of file +mod tensor; \ No newline at end of file diff --git a/src/memory.rs b/src/memory.rs index 24830ad..2de5c1d 100644 --- a/src/memory.rs +++ b/src/memory.rs @@ -1,9 +1,14 @@ -use frameworks::native::Memory as NativeMemory; -use frameworks::opencl::Memory as OpenCLMemory; +use native::NativeMemory; +use opencl::OpenCLMemory; -/// Represents a memory object. +/// Provides a representation for memory across different frameworks. /// -/// note: downcast methods are provided. +/// Memory is allocated by a device in a way that it is accessible for its computations. +/// +/// Downcast methods are provided, but normally you will want to use a [`SharedTensor`] which +/// handles synchronization of the latest memory copy to the required device. +/// +/// [`SharedTensor`]: ./struct.SharedTensor.html #[derive(Debug)] pub enum Memory { // /// A CUDA memory object. @@ -11,7 +16,6 @@ pub enum Memory { /// The native memory representation. Native(NativeMemory), - /// An OpenCL Memory. OpenCL(OpenCLMemory), } @@ -33,6 +37,22 @@ impl Memory { } } + /// Returns a reference to the native memory representation. + pub unsafe fn as_native_unchecked(&self) -> &NativeMemory { + match *self { + Memory::Native(ref native) => native, + _ => unreachable!() + } + } + + /// Returns a reference to the native memory representation. + pub unsafe fn as_mut_native_unchecked(&mut self) -> &mut NativeMemory { + match *self { + Memory::Native(ref mut native) => native, + _ => unreachable!() + } + } + /// Returns the native memory representation, consuming the convertee. pub fn into_native(self) -> Option> { match self { @@ -57,11 +77,19 @@ impl Memory { } } - /// Returns the OpenCL memory, consuming the convertee. - pub fn into_opencl(self) -> Option { - match self { - Memory::OpenCL(opencl) => Some(opencl), - _ => None + /// Returns a reference to the opencl memory representation. + pub unsafe fn as_opencl_unchecked(&self) -> &OpenCLMemory { + match *self { + Memory::OpenCL(ref opencl) => opencl, + _ => unreachable!() } } + + // /// Returns the OpenCL memory, consuming the convertee. + // pub fn into_opencl(self) -> Option { + // match self { + // Memory::OpenCL(opencl) => Some(opencl), + // _ => None + // } + // } } \ No newline at end of file diff --git a/src/tensor.rs b/src/tensor.rs index 0a23518..01f459c 100644 --- a/src/tensor.rs +++ b/src/tensor.rs @@ -1,10 +1,9 @@ -use std::{convert, mem}; +use std::mem; use std::cell::{Cell, RefCell}; use std::marker::PhantomData; -use super::{Address, Backend, Memory}; -use super::utility; -use super::error::{ErrorKind, Result}; +use {Alloc, ComputeDevice, Device, ErrorKind, Memory, Result, Synch}; +use utility::Has; /// A shared tensor for framework-agnostic, memory-aware, n-dimensional storage. /// @@ -12,6 +11,11 @@ use super::error::{ErrorKind, Result}; /// for one similar piece of data. `SharedTensor` handles synchronization of memory of type `T`, by /// which it is parameterized, and provides the functionality for memory management across devices. /// +/// `SharedTensor` holds copies and their version numbers. A user can request any number of +/// immutable `Tensor`s or a single mutable `Tensor` (enforced by borrowck). It's possible to +/// validate at runtime that tensor data is initialized when a user requests a tensor for reading +/// and skip the initialization check if a tensor is requested only for writing. +/// /// ## Terminology /// /// In Parenchyma, multidimensional Rust arrays represent tensors. A vector, a tensor with a @@ -56,11 +60,13 @@ use super::error::{ErrorKind, Result}; /// TODO /// ``` #[derive(Debug)] -pub struct SharedTensor { +pub struct SharedTensor { /// The shape of the shared tensor. pub shape: Shape, + /// A vector of buffers. - copies: RefCell)>>, + copies: RefCell)>>, + /// Indicates whether or not memory is synchronized (synchronization state). /// /// There are only two possible states: @@ -86,90 +92,168 @@ pub struct SharedTensor { /// corresponding memory is _ticked_ or increased. The value `0` means that the memory object /// at that specific location is uninitialized or outdated. versions: u64Map, + /// A marker for `T`. phantom: PhantomData, } -impl SharedTensor /* TODO where T: Scalar | Float */ { +impl SharedTensor where Device: Alloc + Synch { /// Constructs a new `SharedTensor` with a shape of `sh`. - pub fn new(sh: I) -> Result where I: Into { + pub fn new(sh: A) -> Self where A: Into { let shape = sh.into(); let copies = RefCell::new(vec![]); let versions = u64Map::new(); - Ok(SharedTensor { shape, copies, versions, phantom: PhantomData }) + SharedTensor { shape, copies, versions, phantom: PhantomData } } - /// Constructs a new `SharedTensor` from the supplied `chunk` of data with a shape of `sh`. - /// - /// # Examples - /// - /// ``` - /// # #![allow(dead_code)] - /// - /// let framework = Native::new(); - /// let ref backend = Backend::with(framework)?; - /// - /// let shared: SharedTensor = SharedTensor::with(backend, [2, 2], [1., 2., 3., 4.])?; - /// ``` - pub fn with(backend: &Backend, sh: I, mut chunk: A) -> Result - where I: Into, - A: AsMut<[T]> { + /// Constructs a new `SharedTensor` containing a `chunk` of data with a shape of `sh`. + pub fn with(con: &H, sh: I, chunk: Vec) -> Result + where H: Has, + I: Into, + { let shape = sh.into(); - let mut slice = chunk.as_mut(); - let buffer = backend.compute_device::().allocate_with(&shape, &mut slice)?; - let vec = vec![(backend.compute_device::().addr(), buffer)]; - let copies = RefCell::new(vec); + let device = con.get_ref(); + let buffer = device.allocwrite(&shape, chunk)?; + let copies = RefCell::new(vec![(device.view(), buffer)]); let versions = u64Map::with(1); Ok(SharedTensor { shape, copies, versions, phantom: PhantomData }) } - /// Pre-allocate memory on the active device and track it. - pub fn allocate(&mut self, backend: &Backend) -> Result { + /// Allocates memory on the active device and tracks it. + pub fn alloc(con: &H, sh: I) -> Result + where H: Has, + I: Into + { - let buffer = backend.compute_device::().allocate(&self.shape)?; + let shape = sh.into(); + let device = con.get_ref(); + let buffer = device.alloc(&shape)?; + let copies = RefCell::new(vec![(device.view(), buffer)]); + let versions = u64Map::with(1); // ? TODO - unimplemented!() + Ok(SharedTensor { shape, copies, versions, phantom: PhantomData }) + } + + /// Drops memory allocation on the specified device. Returns error if no memory has been + /// allocated on this device. + /// + // TODO FIXME: synchronize memory elsewhere if possible..? + // TODO silence the error..? + pub fn dealloc(&mut self, con: &H) -> Result> where H: Has { + + let device = con.get_ref(); + let location = device.view(); + + match self.get_location_index(&location) { + Some(i) => { + let (_, memory) = self.copies.borrow_mut().remove(i); + + let version = self.versions.get(); + let mask = (1 << i) - 1; + let lower = version & mask; + let upper = (version >> 1) & (!mask); + self.versions.set(lower | upper); + + Ok(memory) + }, + + _ => Err(ErrorKind::AllocatedMemoryNotFoundForDevice.into()) + } + } + + // /// Changes the capacity and shape of the tensor. + // /// + // /// **Caution**: Drops all copies which are not on the current device. + // /// + // /// `SharedTensor::reshape` is preferred over this method if the size of the old and new shape + // /// are identical because it will not reallocate memory. + // pub fn realloc(&mut self, dev: &H, sh: I) -> Result + // where H: Has, + // I: Into + // { + + // unimplemented!() + // } + + /// Change the shape of the Tensor. + /// + /// # Returns + /// + /// Returns an error if the size of the new shape is not equal to the size of the old shape. + /// If you want to change the shape to one of a different size, use `SharedTensor::realloc`. + pub fn reshape(&mut self, sh: I) -> Result where I: Into { + let shape = sh.into(); + + if shape.capacity() != self.shape.capacity() { + return Err(ErrorKind::InvalidReshapedTensorSize.into()); + } + + self.shape = shape; + + Ok(()) + } + + /// Returns the number of elements the tensor can hold without reallocating. + pub fn capacity(&self) -> usize { + self.shape.capacity() } } -/// An `impl` block containing the read/write/auto-sync logic. -impl SharedTensor { +/// This block contains the read/write/auto-sync logic. +impl SharedTensor where Device: Alloc + Synch { /// View an underlying tensor for reading on the active device. /// /// This method can fail if memory allocation fails or if no memory is initialized. /// The borrowck guarantees that the shared tensor outlives all of its tensors. - pub fn view<'shared>(&'shared self, backend: &Backend) -> Result> { - if self.versions.empty() { - return Err(ErrorKind::UninitializedMemory.into()); - } + /// + /// Summary: + /// + /// 1) Check if there is initialized data anywhere + /// 2) Lookup memory and its version for `device`, allocate it if it doesn't exist + /// 3) Check version, if it's old, synchronize + pub fn read<'shared, H>(&'shared self, dev: &H) -> Result<&'shared Memory> + where H: Has { - let i = self.get_or_create_location_index(backend)?; - self.sync_if_necessary(backend, i)?; - self.versions.insert(i); + let i = self.autosync(dev, false)?; let borrowed_copies = self.copies.borrow(); - let (ref address, ref buffer) = borrowed_copies[i]; + let (_, ref buffer) = borrowed_copies[i]; - let address = unsafe { utility::extend_lifetime::<'shared>(address) }; - let memory = unsafe { utility::extend_lifetime::<'shared>(buffer) }; + let memory = unsafe { extend_lifetime::<'shared>(buffer) }; - Ok(Tensor { address, memory }) + Ok(memory) } /// View an underlying tensor for reading and writing on the active device. The memory /// location is set as the latest. /// /// This method can fail is memory allocation fails or if no memory is initialized. - pub fn view_mut<'buf>(&'buf mut self, backend: &Backend) -> Result> { + /// + /// Summary: + /// + /// 1) Check if there is initialized data anywhere + /// 2) Lookup memory and its version for `device`, allocate it if it doesn't exist + /// 3) Check version, if it's old, synchronize + /// 4) Increase memory version and latest_version + pub fn read_write<'shared, H>(&'shared mut self, dev: &H) -> Result<&'shared mut Memory> + where H: Has { + + let i = self.autosync(dev, true)?; + + let mut borrowed_copies = self.copies.borrow_mut(); + + let (_, ref mut buffer) = borrowed_copies[i]; + + let memory = unsafe { extend_lifetime_mut::<'shared>(buffer) }; - unimplemented!() + Ok(memory) } /// View an underlying tensor for writing only. @@ -178,21 +262,41 @@ impl SharedTensor { /// be overwritten anyway. The caller must initialize all elements contained in the tensor. This /// convention isn't enforced, but failure to do so may result in undefined data later. /// + /// Summary: + /// + /// 1) *Skip initialization check + /// 2) Lookup memory and its version for `device`, allocate it if it doesn't exist + /// 3) *Skip synchronization + /// 4) Increase memory version and latest_version + /// /// TODO /// /// * Add an `invalidate` method: /// /// If the caller fails to overwrite memory, it must call `invalidate` to return the vector /// to an uninitialized state. - pub fn write<'buf>(&'buf mut self, backend: &Backend) -> Result> { + pub fn write<'shared, H>(&'shared mut self, con: &H) -> Result<&'shared mut Memory> + where H: Has { + + let i = self.get_or_create_location_index(con)?; + self.versions.set(1 << i); + + let mut borrowed_copies = self.copies.borrow_mut(); + + let (_, ref mut buffer) = borrowed_copies[i]; - unimplemented!() + let memory = unsafe { extend_lifetime_mut::<'shared>(buffer) }; + + Ok(memory) } +} + +impl SharedTensor where Device: Alloc + Synch { - fn get_location_index(&self, address: &Address) -> Option { + fn get_location_index(&self, location: &ComputeDevice) -> Option { for (i, l) in self.copies.borrow().iter().map(|&(ref l, _)| l).enumerate() { - if l.eq(address) { + if l.eq(location) { return Some(i); } } @@ -200,11 +304,13 @@ impl SharedTensor { None } - fn get_or_create_location_index(&self, backend: &Backend) -> Result { + fn get_or_create_location_index(&self, con: &H) -> Result where H: Has { - let address = backend.compute_device::().addr(); + let device = con.get_ref(); - if let Some(i) = self.get_location_index(&address) { + let location = device.view(); + + if let Some(i) = self.get_location_index(&location) { return Ok(i); } @@ -212,21 +318,40 @@ impl SharedTensor { return Err(ErrorKind::BitMapCapacityExceeded.into()); } - let memory = backend.compute_device::().allocate(&self.shape)?; - self.copies.borrow_mut().push((address, memory)); + let memory = device.alloc(&self.shape)?; + self.copies.borrow_mut().push((location, memory)); Ok(self.copies.borrow().len() - 1) } - // TODO: - // - // * Choose the best source to copy data from. - // That would require some additional traits that return costs for transferring data - // between different backends. - // - // Actually I think that there would be only transfers between `Native` <-> `Cuda` - // and `Native` <-> `OpenCL` in foreseeable future, so it's best to not over-engineer here. - fn sync_if_necessary(&self, backend: &Backend, destination_index: usize) -> Result { + /// Sync if necessary + /// + /// TODO: + /// + /// * Choose the best source to copy data from. + /// That would require some additional traits that return costs for transferring data + /// between different backends. + /// + /// note: Typically, there would be transfers between `Native` <-> `GPU` in foreseeable + /// future, so it's best to not over-engineer here. + pub fn autosync(&self, dev: &H, tick: bool) -> Result where H: Has { + if self.versions.empty() { + return Err(ErrorKind::UninitializedMemory.into()); + } + + let i = self.get_or_create_location_index(dev)?; + self.autosync_(i)?; + + if tick { + self.versions.set(1 << i); + } else { + self.versions.insert(i); + } + + Ok(i) + } + + fn autosync_(&self, destination_index: usize) -> Result { if self.versions.contains(destination_index) { @@ -246,25 +371,24 @@ impl SharedTensor { let (source, mut destination) = { if source_index < destination_index { let (left, right) = borrowed_copies.split_at_mut(destination_index); - (&mut left[source_index], &mut right[0]) + (&left[source_index], &mut right[0]) } else { let (left, right) = borrowed_copies.split_at_mut(source_index); - (&mut right[0], &mut left[destination_index]) + (&right[0], &mut left[destination_index]) } }; - backend.compute_device().sync_out(&source.1, &mut destination.1) + // TODO refactor - // TODO: - // // Backends may define transfers asymmetrically. E.g. CUDA may know how to transfer to and // from Native backend, while Native may know nothing about CUDA at all. So if first // attempt fails we change order and try again. + match source.0.device().read(&source.1, &mut destination.0, &mut destination.1) { + Err(ref e) if e.kind() == ErrorKind::NoAvailableSynchronizationRouteFound => { }, + ret @ _ => return ret, + } - - // dst_loc.mem_transfer.sync_in( - // dst_loc.mem.as_mut(), src_loc.compute_device.deref(), - // src_loc.mem.deref()).map_err(|e| e.into()) + destination.0.device().write(&mut destination.1, &source.0, &source.1) // TODO: try transfer indirectly via Native backend } @@ -277,27 +401,43 @@ pub struct Shape { /// /// # Example /// - /// ```ignore + /// ```{.text} /// // The following tensor has 9 components /// /// [[1, 2, 3], [4, 5, 6], [7, 8, 9]] /// ``` - capacity: usize, + pub capacity: usize, /// The total number of indices. /// /// # Example /// /// The following tensor has a rank of 2: /// - /// ```ignore + /// ```{.text} /// [[1, 2, 3], [4, 5, 6], [7, 8, 9]] /// ``` rank: usize, /// The dimensions of the tensor. - dims: Vec, + pub dims: Vec, +} + +impl Shape { + + /// Returns the capacity + pub fn capacity(&self) -> usize { + + self.capacity + } } -impl convert::From<[usize; 1]> for Shape { +impl From for Shape { + + fn from(n: usize) -> Shape { + [n].into() + } +} + +impl From<[usize; 1]> for Shape { fn from(array: [usize; 1]) -> Shape { let capacity = array[0]; @@ -308,7 +448,7 @@ impl convert::From<[usize; 1]> for Shape { } } -impl convert::From<[usize; 2]> for Shape { +impl From<[usize; 2]> for Shape { fn from(array: [usize; 2]) -> Shape { let capacity = array.iter().fold(1, |acc, &dims| acc * dims); @@ -319,20 +459,15 @@ impl convert::From<[usize; 2]> for Shape { } } -/// An immutable view. -/// -/// TODO: -/// -/// Parameterization over mutability would help here.. -pub struct Tensor<'a, T: 'a> { - address: &'a Address, - memory: &'a Memory, -} +impl From<[usize; 3]> for Shape { + + fn from(array: [usize; 3]) -> Shape { + let capacity = array.iter().fold(1, |acc, &dims| acc * dims); + let rank = 3; + let dims = array.to_vec(); -/// A mutable view. -pub struct TensorMut<'a, T: 'a> { - address: &'a Address, - memory: &'a mut Memory, + Shape { capacity, rank, dims } + } } /// A "newtype" with an internal type of `Cell`. `u64Map` uses [bit manipulation][1] to manage @@ -361,6 +496,10 @@ impl u64Map { self.0.get() } + fn set(&self, v: u64) { + self.0.set(v) + } + fn empty(&self) -> bool { self.0.get() == 0 } @@ -378,14 +517,10 @@ impl u64Map { } } -mod tests { - use super::u64Map; +unsafe fn extend_lifetime<'a, 'b, T>(t: &'a T) -> &'b T { + mem::transmute::<&'a T, &'b T>(t) +} - #[test] - fn u64_map_contains() { - let key = 10; - let u64_map = u64Map::new(); - u64_map.insert(key); - assert!(u64_map.contains(key)) - } +unsafe fn extend_lifetime_mut<'a, 'b, T>(t: &'a mut T) -> &'b mut T { + mem::transmute::<&'a mut T, &'b mut T>(t) } \ No newline at end of file diff --git a/src/utility.rs b/src/utility.rs index acccd60..9d4726d 100644 --- a/src/utility.rs +++ b/src/utility.rs @@ -1,14 +1,28 @@ -use std::mem; +//! Helper functions and traits -/// Returns the size of the allocated memory in bytes. -pub fn allocated(length: usize) -> usize { - length * mem::size_of::() +/// Marker trait for uninitialized objects. +pub type Uninitialized = (); + +/// A trait for simple _has_ relationships. +pub trait Has { + + /// Get a reference to `T`. + fn get_ref(&self) -> &T; } -pub unsafe fn extend_lifetime<'a, 'b, T>(t: &'a T) -> &'b T { - mem::transmute::<&'a T, &'b T>(t) +/// Attempt to construct a default value of a type. +// TODO move to a crate? +pub trait TryDefault: Sized { + /// The type returned in the event of an error. + type Err; + + /// Returns the default value for a type. + fn try_default() -> Result; } -unsafe fn extend_lifetime_mut<'a, 'b, T>(t: &'a mut T) -> &'b mut T { - mem::transmute::<&'a mut T, &'b mut T>(t) +/// Returns the size of the allocated memory in bytes. +pub fn allocated(length: usize) -> usize { + use std::mem; + + length * mem::size_of::() } \ No newline at end of file diff --git a/tests/backend_specs.rs b/tests/backend_specs.rs new file mode 100644 index 0000000..06565b8 --- /dev/null +++ b/tests/backend_specs.rs @@ -0,0 +1,14 @@ +extern crate parenchyma as pa; + +#[cfg(test)] +mod backend_spec { + mod native { + use pa::{Backend, Native}; + + #[test] + fn it_can_create_default_backend() { + let backend: Result = Backend::new::(); + assert!(backend.is_ok()); + } + } +} \ No newline at end of file diff --git a/tests/cuda.rs b/tests/cuda.rs deleted file mode 100644 index d84b9bc..0000000 --- a/tests/cuda.rs +++ /dev/null @@ -1,52 +0,0 @@ -// #![cfg(test)] - -// extern crate parenchyma; -// extern crate parenchyma_cuda as cuda; - -// mod shared_memory_spec { -// use cuda::Cuda; -// use parenchyma::{Backend, Native, NativeMemory, SharedTensor}; - -// fn write_to_memory(mem: &mut NativeMemory, data: &[T]) { - -// let buffer = mem.as_mut_slice::(); - -// for (i, datum) in data.iter().enumerate() { - -// buffer[i] = *datum; -// } -// } - -// #[test] -// fn it_creates_new_shared_memory_for_cuda() { - -// let backend: Backend = Backend::default().unwrap(); -// let shape = vec![10]; -// let mut shared_data = SharedTensor::::from(shape); -// assert!(shared_data.write_only(backend.context()).is_ok()) -// } - -// #[test] -// fn it_syncs_from_native_to_cuda_and_back() { -// let shape = vec![3]; - -// let cuda: Backend = Backend::default().unwrap(); - -// let native: Backend = Backend::default().unwrap(); - -// let mut mem: SharedTensor = SharedTensor::from(shape); - -// write_to_memory( -// mem.write_only(native.context()).unwrap(), -// &[1.0, 2.0, 123.456] -// ); - -// assert!(mem.read(cuda.context()).is_ok()); - -// // It has not successfully synced to the device. -// // Not the other way around. -// assert!(mem.drop_context(native.context()).is_ok()); - -// assert_eq!(mem.read(native.context()).unwrap().as_slice::(), [1.0, 2.0, 123.456]); -// } -// } \ No newline at end of file diff --git a/tests/lib.rs b/tests/lib.rs deleted file mode 100644 index 2a59f0d..0000000 --- a/tests/lib.rs +++ /dev/null @@ -1,104 +0,0 @@ -// extern crate parenchyma; - -// #[cfg(test)] -// mod backend_spec { -// use parenchyma::{Backend, Native}; - -// #[test] -// fn it_can_create_default_backend() { - -// assert!(Backend::::default().is_ok()); -// } -// } - -// #[cfg(test)] -// mod framework_spec { -// use parenchyma::{Framework, Native}; - -// #[test] -// fn it_can_init_native_framework() { -// let framework = Native::new().unwrap(); -// assert_eq!(framework.available_devices.len(), 1); -// } -// } - -// #[cfg(test)] -// mod shared_memory_spec { -// use parenchyma::{Context, Framework, Native, NativeContext, SharedTensor}; -// use parenchyma::error::ErrorKind; - -// #[test] -// fn it_creates_new_shared_memory_for_native() { -// let native = Native::new().unwrap(); -// let context = NativeContext::new(native.available_devices.clone()).unwrap(); -// let ref device = context.devices()[0]; -// let mut tensor = SharedTensor::::from(vec![10]); -// let data = tensor.write_only(device).unwrap().as_slice::(); -// assert_eq!(10, data.len()); -// } - -// // #[test] -// // fn it_fails_on_initialized_memory_read() { -// // let native = Native::new().unwrap(); -// // let context = NativeContext::new(native.available_devices.clone()).unwrap(); -// // let mut tensor = SharedTensor::::from(vec![10]); - -// // assert_eq!(tensor.read(&context).unwrap_err().kind(), ErrorKind::UninitializedMemory); - -// // assert_eq!(tensor.read_write(&context).unwrap_err().kind(), ErrorKind::UninitializedMemory); - -// // tensor.write_only(&context).unwrap(); -// // tensor.drop_context(&context).unwrap(); - -// // assert_eq!(tensor.read(&context).unwrap_err().kind(), ErrorKind::UninitializedMemory); -// // } -// } - -// #[cfg(test)] -// mod tensor_spec { -// use parenchyma::SharedTensor; - -// // #[test] -// // fn it_returns_correct_tensor_desc_stride() { - -// // let tensor_desc_r0 = Tensor::from(vec![]); -// // let tensor_desc_r1 = Tensor::from(vec![5]); -// // let tensor_desc_r2 = Tensor::from(vec![2, 4]); -// // let tensor_desc_r3 = Tensor::from(vec![2, 2, 4]); -// // let tensor_desc_r4 = Tensor::from(vec![2, 2, 4, 4]); - -// // assert!(vec![0; 0] == tensor_desc_r0.default_stride()); -// // assert_eq!(vec![1], tensor_desc_r1.default_stride()); -// // assert_eq!(vec![4, 1], tensor_desc_r2.default_stride()); -// // assert_eq!(vec![8, 4, 1], tensor_desc_r3.default_stride()); -// // assert_eq!(vec![32, 16, 4, 1], tensor_desc_r4.default_stride()); -// // } - -// #[test] -// fn it_returns_correct_size_for_rank_0() { -// // In order for memory to be correctly allocated, the size should never return less than 1. -// let tensor_desc_r0 = SharedTensor::::from(vec![]); -// assert_eq!(1, tensor_desc_r0.capacity()); -// } - -// // #[test] -// // fn it_resizes_tensor() { -// // let mut shared_tensor = Tensor::::from(vec![10, 20, 30]); -// // assert_eq!(shared_tensor.shape(), &[10, 20, 30]); - -// // shared_tensor.replace(vec![2, 3, 4, 5]); -// // assert_eq!(shared_tensor.shape(), &[2, 3, 4, 5]); -// // } - -// // #[test] -// // fn it_reshapes_correctly() { -// // let mut shared_data = Tensor::::from(vec![10]); -// // assert!(shared_data.reshape(vec![5, 2]).is_ok()); -// // } - -// // #[test] -// // fn it_returns_err_for_invalid_size_reshape() { -// // let mut shared_data = Tensor::::new(vec![10]); -// // assert!(shared_data.reshape(vec![10, 2]).is_err()); -// // } -// } diff --git a/tests/opencl.rs b/tests/opencl.rs deleted file mode 100644 index fd22f88..0000000 --- a/tests/opencl.rs +++ /dev/null @@ -1,82 +0,0 @@ -// #![cfg(test)] - -// extern crate parenchyma; -// extern crate parenchyma_opencl as opencl; - -// mod shared_memory_spec { -// use opencl::OpenCL; -// use parenchyma::{Backend, DeviceKind, Framework, Native, NativeMemory, SharedTensor}; - -// fn write_to_memory(mem: &mut NativeMemory, data: &[T]) { - -// let buffer = mem.as_mut_slice::(); - -// for (i, datum) in data.iter().enumerate() { - -// buffer[i] = *datum; -// } -// } - -// #[test] -// fn it_creates_new_shared_memory_for_opencl() { - -// let backend: Backend = Backend::default().unwrap(); -// let shape = vec![10]; -// let mut shared_data = SharedTensor::::from(shape); -// assert!(shared_data.write_only(&backend.devices()[0]).is_ok()) -// } - -// #[test] -// fn it_syncs_from_native_to_opencl_and_back_cpu() { -// let shape = vec![3]; - -// let framework = OpenCL::new().unwrap(); -// let selection = framework.available_platforms[0].available_devices.clone(); -// let cl: Backend = Backend::new(framework, selection).unwrap(); -// let cl_cpu = cl.devices().iter().filter(|d| *d.kind() == DeviceKind::Cpu).nth(0).unwrap(); - -// let native: Backend = Backend::default().unwrap(); - -// let mut mem: SharedTensor = SharedTensor::from(shape); - -// write_to_memory( -// mem.write_only(&native.devices()[0]).unwrap(), -// &[1.0, 2.0, 123.456] -// ); - -// assert!(mem.read(cl_cpu).is_ok()); - -// // It has not successfully synced to the device. -// // Not the other way around. -// assert!(mem.drop_device(&native.devices()[0]).is_ok()); - -// assert_eq!(mem.read(&native.devices()[0]).unwrap().as_slice::(), [1.0, 2.0, 123.456]); -// } - -// #[test] -// fn it_syncs_from_native_to_opencl_and_back_gpu() { -// let shape = vec![3]; - -// let framework = OpenCL::new().unwrap(); -// let selection = framework.available_platforms[0].available_devices.clone(); -// let cl: Backend = Backend::new(framework, selection).unwrap(); -// let cl_gpu = cl.devices().iter().filter(|d| *d.kind() == DeviceKind::Gpu).nth(0).unwrap(); - -// let native: Backend = Backend::default().unwrap(); - -// let mut mem: SharedTensor = SharedTensor::from(shape); - -// write_to_memory( -// mem.write_only(&native.devices()[0]).unwrap(), -// &[1.0, 2.0, 123.456] -// ); - -// assert!(mem.read(cl_gpu).is_ok()); - -// // It has not successfully synced to the device. -// // Not the other way around. -// assert!(mem.drop_device(&native.devices()[0]).is_ok()); - -// assert_eq!(mem.read(&native.devices()[0]).unwrap().as_slice::(), [1.0, 2.0, 123.456]); -// } -// } \ No newline at end of file diff --git a/tests/shared_memory_specs.rs b/tests/shared_memory_specs.rs new file mode 100644 index 0000000..f051827 --- /dev/null +++ b/tests/shared_memory_specs.rs @@ -0,0 +1,79 @@ +extern crate parenchyma as pa; + +#[cfg(test)] +mod shared_memory_spec { + use pa::{Backend, ErrorKind, Memory, Native, OpenCL, SharedTensor}; + + pub fn write(memory: &mut Memory, data: &[f32]) { + let ndarray = unsafe { memory.as_mut_native_unchecked() }; + let buf = ndarray.as_slice_memory_order_mut().unwrap(); + + for (index, datum) in data.iter().enumerate() { + buf[index] = *datum; + } + } + + #[test] + fn it_creates_new_shared_memory_for_native() { + let ref host: Backend = Backend::new::().unwrap(); + let mut shared_data = SharedTensor::::new(10); + let tensor = shared_data.write(host).unwrap(); + assert_eq!(tensor.as_native().unwrap().len(), 10); + } + + #[test] + //#[cfg(feature = "opencl")] + fn it_creates_new_shared_memory_for_opencl() { + let ref backend: Backend = Backend::new::().unwrap(); + let mut shared_data: SharedTensor = SharedTensor::new(10); + assert!(shared_data.write(backend).is_ok()); + } + + #[test] + fn it_fails_on_initialized_memory_read() { + let ref host: Backend = Backend::new::().unwrap(); + let mut shared_data = SharedTensor::::new(10); + assert_eq!(shared_data.read(host).unwrap_err().kind(), ErrorKind::UninitializedMemory); + assert_eq!(shared_data.read_write(host).unwrap_err().kind(), ErrorKind::UninitializedMemory); + + // initialize memory + let _ = shared_data.write(host).unwrap(); + let _ = shared_data.dealloc(host).unwrap(); + + assert_eq!(shared_data.read(host).unwrap_err().kind(), ErrorKind::UninitializedMemory); + } + + #[test] + //#[cfg(feature = "opencl")] + fn it_syncs_from_native_to_opencl_and_back() { + let ref host: Backend = Backend::new::().unwrap(); + let ref backend: Backend = Backend::new::().unwrap(); + + let mut sh = SharedTensor::::new(3); + write(sh.write(host).unwrap(), &[1.0f32, 2.0, 123.456]); + let _ = sh.read(backend).unwrap(); + + // It has not successfully synced to the device. + // Not the other way around. + + //let _ = sh.dealloc(host).unwrap();// TODO ? + let _ = sh.dealloc(backend).unwrap(); + + assert_eq!( + sh.read(host).unwrap().as_native().unwrap().as_slice_memory_order().unwrap(), + [1.0, 2.0, 123.456] + ); + } + + #[test] + fn it_reshapes_correctly() { + let mut shared_data = SharedTensor::::new(10); + assert!(shared_data.reshape([5, 2]).is_ok()); + } + + #[test] + fn it_returns_err_for_invalid_size_reshape() { + let mut shared_data = SharedTensor::::new(10); + assert!(shared_data.reshape([10, 2]).is_err()); + } +} \ No newline at end of file