diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1bb16b1..f0927c4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,8 @@ OPTION(BUILD_EXAMPLES "Build examples." OFF)
 OPTION(HAVE_CUDA "Have cuda_runtime.h." OFF)
 
 IF(HAVE_CUDA)
-    # noop
+    INCLUDE_DIRECTORIES(${CUDA_HOME}/include)
+    LINK_DIRECTORIES(${CUDA_HOME}/lib64)
 ELSE()
     ADD_DEFINITIONS(-DUSE_FAKE_CUDA_RUNTIME)
 ENDIF()
diff --git a/INSTALL b/INSTALL
new file mode 100755
index 0000000..6915429
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,10 @@
+#!/bin/sh
+set -e
+
+if [ -z "$PREFIX" ]; then
+    PREFIX=$HOME/local
+fi
+
+./configure --prefix="$PREFIX"
+
+make install
diff --git a/configure b/configure
index 125d8f5..1dd3fd9 100755
--- a/configure
+++ b/configure
@@ -2,6 +2,7 @@
 set -e
 
 PREFIX=$(pwd)/local
+CUDA_HOME=/usr/local/cuda
 USE_OPENCV=0
 BUILD_TESTS=0
 BUILD_BENCHMARKS=0
@@ -11,10 +12,6 @@ BUILD_GBENCH=0
 HAVE_CUDA=0
 VERBOSE=0
 
-if [ $(find /usr/include/cuda_runtime.h | wc -l) -gt 0 ]; then
-    HAVE_CUDA=1
-fi
-
 parse_args() {
     for i in "$@"; do
         case $i in
@@ -48,6 +45,10 @@ parse_args() {
         --build-gbench)
             BUILD_GBENCH=1
             ;;
+        --with-cuda=*)
+            CUDA_HOME="${i#*=}"
+            echo "configure --with-cuda=$CUDA_HOME"
+            ;;
         --verbose)
             VERBOSE=1
             ;;
@@ -57,6 +58,10 @@
             ;;
     esac
     done
+
+    if [ -f "$CUDA_HOME/include/cuda_runtime.h" ]; then
+        HAVE_CUDA=1
+    fi
 }
 
 CMAKE_FLAGS=
@@ -96,7 +101,11 @@ add_cmake_flags() {
     add_cmake_flag BUILD_TESTS ${BUILD_TESTS}
     add_cmake_flag BUILD_BENCHMARKS ${BUILD_BENCHMARKS}
     add_cmake_flag BUILD_EXAMPLES ${BUILD_EXAMPLES}
-    add_cmake_flag HAVE_CUDA ${HAVE_CUDA}
+
+    if [ ${HAVE_CUDA} -eq 1 ]; then
+        add_cmake_flag HAVE_CUDA ${HAVE_CUDA}
+        add_cmake_flag CUDA_HOME "$CUDA_HOME"
+    fi
 
     if [ ${BUILD_EXAMPLES} -eq 1 ]; then
         add_cmake_flag USE_OPENCV ${USE_OPENCV}
diff --git a/include/ttl/bits/std_copy.hpp b/include/ttl/bits/std_copy.hpp
new file mode 100644
index 0000000..a754694
--- /dev/null
+++ b/include/ttl/bits/std_copy.hpp
@@ -0,0 +1,28 @@
+#pragma once
+#include <ttl/bits/std_cuda_allocator.hpp>
+#include <ttl/bits/std_tensor.hpp>
+
+namespace ttl
+{
+namespace internal
+{
+namespace experimental
+{
+template <typename R, typename S>
+void copy(const basic_tensor<R, S, cuda_memory, readwrite> &dst,
+          const basic_tensor<R, S, host_memory, readonly> &src)
+{
+    using copier = internal::cuda_copier;
+    copier::copy<copier::h2d>(dst.data(), src.data(), src.data_size());
+}
+
+template <typename R, typename S>
+void copy(const basic_tensor<R, S, host_memory, readwrite> &dst,
+          const basic_tensor<R, S, cuda_memory, readonly> &src)
+{
+    using copier = internal::cuda_copier;
+    copier::copy<copier::d2h>(dst.data(), src.data(), src.data_size());
+}
+} // namespace experimental
+} // namespace internal
+} // namespace ttl
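The two overloads above give a direction-checked replacement for from_host/to_host: the memory tags in the tensor types select the h2d or d2h copier at compile time, so a copy in an impossible direction simply does not overload-resolve. A minimal round-trip sketch (the function name round_trip is illustrative; ttl::tensor, ttl::cuda_tensor, ttl::ref and ttl::view are the public names exercised by the tests further down):

    #include <ttl/cuda_tensor>
    #include <ttl/experimental/copy>
    #include <ttl/tensor>

    void round_trip(int n)
    {
        ttl::tensor<float, 1> host(n);      // host-side buffer
        ttl::cuda_tensor<float, 1> dev(n);  // device-side buffer

        // ref(...) marks the writable destination, view(...) the read-only source.
        ttl::copy(ttl::ref(dev), ttl::view(host));  // host -> device
        ttl::copy(ttl::ref(host), ttl::view(dev));  // device -> host
    }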
diff --git a/include/ttl/bits/std_range.hpp b/include/ttl/bits/std_range.hpp
index b4f3e70..1363838 100644
--- a/include/ttl/bits/std_range.hpp
+++ b/include/ttl/bits/std_range.hpp
@@ -45,6 +45,5 @@ basic_integer_range<N> range(N m, N n)
 {
     return basic_integer_range<N>(m, n);
 }
-
 } // namespace internal
 } // namespace ttl
diff --git a/include/ttl/bits/std_tensor_mixin.hpp b/include/ttl/bits/std_tensor_mixin.hpp
index 36cce84..eca1d9e 100644
--- a/include/ttl/bits/std_tensor_mixin.hpp
+++ b/include/ttl/bits/std_tensor_mixin.hpp
@@ -15,6 +15,8 @@ class basic_scalar_mixin
     using data_ref = typename trait::ref_type;
     using data_t = typename trait::Data;
 
+    using Dim = typename S::dimension_type;
+
     data_t data_;
 
   protected:
@@ -33,6 +35,10 @@ class basic_scalar_mixin
     basic_scalar_mixin(data_ptr data, const S &) : data_(data) {}
 
+    constexpr Dim size() const { return 1; }
+
+    constexpr auto dims() const { return S().dims(); }
+
     constexpr size_t data_size() const { return sizeof(R); }
 
     data_ptr data() const { return data_.get(); }
 
@@ -40,16 +46,6 @@ class basic_scalar_mixin
     data_ptr data_end() const { return data_.get() + 1; }
 
     S shape() const { return S(); }
-
-    void from_host(const void *data) const
-    {
-        basic_copier<D, host_memory>()(data_.get(), data, data_size());
-    }
-
-    void to_host(void *data) const
-    {
-        basic_copier<host_memory, D>()(data, data_.get(), data_size());
-    }
 };
 
 template <typename R, typename S, typename D, typename A>
@@ -121,6 +117,10 @@ class basic_tensor_mixin
     static constexpr auto rank = S::rank;
 
+    Dim size() const { return shape_.size(); }
+
+    const auto &dims() const { return shape_.dims(); }
+
     size_t data_size() const { return shape_.size() * sizeof(R); }
 
     const S &shape() const { return shape_; }
@@ -158,16 +158,6 @@ class basic_tensor_mixin
         return slice_type(data_.get() + i * sub_shape.size(),
                           batch(j - i, sub_shape));
     }
-
-    void from_host(const void *data) const
-    {
-        basic_copier<D, host_memory>()(data_.get(), data, data_size());
-    }
-
-    void to_host(void *data) const
-    {
-        basic_copier<host_memory, D>()(data, data_.get(), data_size());
-    }
 };
 } // namespace internal
 } // namespace ttl
diff --git a/include/ttl/experimental/copy b/include/ttl/experimental/copy
new file mode 100644
index 0000000..4c86da6
--- /dev/null
+++ b/include/ttl/experimental/copy
@@ -0,0 +1,8 @@
+// # -*- mode: c++ -*-
+#pragma once
+#include <ttl/bits/std_copy.hpp>
+
+namespace ttl
+{
+using internal::experimental::copy;
+} // namespace ttl
diff --git a/include/ttl/range b/include/ttl/range
index 14dedab..10de5df 100644
--- a/include/ttl/range
+++ b/include/ttl/range
@@ -4,6 +4,7 @@
 
 #include <cstdint>
 #include <ttl/bits/std_range.hpp>
+#include <ttl/bits/std_tensor.hpp>
 
 namespace ttl
 {
@@ -11,10 +12,9 @@ using internal::range;
 
 using rank_t = uint8_t;
 
-// FIXME: make T less generic
-template <rank_t r, typename T> auto range(const T &t)
+template <rank_t r, typename R, typename S, typename D, typename A>
+auto range(const internal::basic_tensor<R, S, D, A> &t)
 {
     return range(std::get<r>(t.shape().dims()));
 }
-
 } // namespace ttl
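This resolves the old FIXME: rather than matching any T with a shape() member, the rank-indexed range<r> overload now participates only for tensor types. Call sites are unchanged; a small sketch of the intended use (fill_by_index is an illustrative name, the pattern mirrors test_range.cpp below):

    #include <ttl/range>
    #include <ttl/tensor>

    void fill_by_index()
    {
        ttl::tensor<int, 2> x(3, 4);
        // range<r>(x) iterates over [0, std::get<r>(x.shape().dims())).
        for (auto i : ttl::range<0>(x)) {
            for (auto j : ttl::range<1>(x)) { x.at(i, j) = 0; }
        }
    }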
diff --git a/tests/bench_cuda_tensor.cpp b/tests/bench_cuda_tensor.cpp
index 7bb5ddd..c21debd 100644
--- a/tests/bench_cuda_tensor.cpp
+++ b/tests/bench_cuda_tensor.cpp
@@ -1,16 +1,18 @@
 #include "benchmark.hpp"
 
 #include <ttl/cuda_tensor>
+#include <ttl/experimental/copy>
 
-template <typename R, size_t n> struct bench_cuda_tensor {
+template <typename R, size_t n>
+struct bench_cuda_tensor {
     static void run(benchmark::State &state)
     {
         ttl::cuda_tensor<R, 1> m1(n);
         ttl::tensor<R, 1> m2(n);
 
         for (auto _ : state) {
-            m1.from_host(m2.data());
-            m1.to_host(m2.data());
+            ttl::copy(ttl::ref(m1), ttl::view(m2));
+            ttl::copy(ttl::ref(m2), ttl::view(m1));
         }
     }
 };
diff --git a/tests/test_copy.cpp b/tests/test_copy.cpp
new file mode 100644
index 0000000..d80af3f
--- /dev/null
+++ b/tests/test_copy.cpp
@@ -0,0 +1,35 @@
+#include "testing.hpp"
+
+#include <ttl/algorithm>
+#include <ttl/cuda_tensor>
+#include <ttl/device>
+#include <ttl/experimental/copy>
+#include <ttl/range>
+#include <ttl/tensor>
+
+void test_copy(int n)
+{
+    ttl::tensor<int, 1> x_host(n);
+    ttl::cuda_tensor<int, 1> x_cuda(n);
+
+    ttl::fill(ttl::ref(x_host), 1);
+    ttl::copy(ttl::ref(x_cuda), ttl::view(x_host));
+
+    ttl::fill(ttl::ref(x_host), 2);
+    for (auto i : ttl::range<0>(x_host)) { ASSERT_EQ(x_host.data()[i], 2); }
+
+    ttl::copy(ttl::ref(x_host), ttl::view(x_cuda));
+    for (auto i : ttl::range<0>(x_host)) { ASSERT_EQ(x_host.data()[i], 1); }
+}
+
+TEST(copy_test, test_copy)
+{
+    test_copy(1);
+    test_copy(2);
+    test_copy(10);
+    test_copy(100);
+    test_copy(1000);
+    test_copy(1 << 20);
+    test_copy(1 << 20);
+    test_copy(1 << 20);
+}
diff --git a/tests/test_cuda_tensor.cpp b/tests/test_cuda_tensor.cpp
index 3c431ab..bba1942 100644
--- a/tests/test_cuda_tensor.cpp
+++ b/tests/test_cuda_tensor.cpp
@@ -1,6 +1,7 @@
 #include "testing.hpp"
 
 #include <ttl/cuda_tensor>
+#include <ttl/experimental/copy>
 #include <ttl/range>
 #include <ttl/tensor>
@@ -23,11 +24,10 @@ TEST(cuda_tensor_test, test0)
 {
     using R = float;
     cuda_tensor<R, 0> m0;
-
     tensor<R, 0> x;
 
-    m0.from_host(x.data());
-    m0.to_host(x.data());
+    ttl::copy(ttl::ref(m0), ttl::view(x));
+    ttl::copy(ttl::ref(x), ttl::view(m0));
 }
 
 TEST(cuda_tensor_test, test1)
@@ -42,8 +42,8 @@ TEST(cuda_tensor_test, test2)
     cuda_tensor<R, 2> m1(10, 100);
     tensor<R, 2> m2(10, 100);
 
-    m1.from_host(m2.data());
-    m1.to_host(m2.data());
+    ttl::copy(ttl::ref(m1), ttl::view(m2));
+    ttl::copy(ttl::ref(m2), ttl::view(m1));
 
     m1.slice(1, 2);
     auto r = ref(m1);
@@ -58,14 +58,16 @@ TEST(cuda_tensor_test, test_3)
     cuda_tensor<R, 2> m1(ttl::make_shape(10, 100));
 }
 
-template <typename R> void test_auto_ref()
+template <typename R>
+void test_auto_ref()
 {
     static_assert(
         std::is_convertible<cuda_tensor<R, 1>, cuda_tensor_ref<R, 1>>::value,
         "can't convert to ref");
 }
 
-template <typename R> void test_auto_view()
+template <typename R>
+void test_auto_view()
 {
     static_assert(
         std::is_convertible<cuda_tensor<R, 1>, cuda_tensor_view<R, 1>>::value,
@@ -87,15 +89,17 @@ TEST(cuda_tensor_test, test_convert)
     test_auto_view<float>();
 }
 
-template <typename R, ttl::rank_t r> void test_copy(const ttl::shape<r> &shape)
+template <typename R, ttl::rank_t r>
+void test_copy(const ttl::shape<r> &shape)
 {
     tensor<R, r> x(shape);
     cuda_tensor<R, r> y(shape);
     tensor<R, r> z(shape);
 
     std::iota(x.data(), x.data_end(), 1);
-    y.from_host(x.data());
-    y.to_host(z.data());
+
+    ttl::copy(ttl::ref(y), ttl::view(x));
+    ttl::copy(ttl::ref(z), ttl::view(y));
 
     for (auto i : ttl::range(shape.size())) {
         ASSERT_EQ(x.data()[i], z.data()[i]);
@@ -103,12 +107,12 @@ template <typename R, ttl::rank_t r> void test_copy(const ttl::shape<r> &shape)
     {
         cuda_tensor_ref<R, r> ry = ref(y);
-        ry.from_host(x.data());
-        ry.to_host(x.data());
+        ttl::copy(ry, ttl::view(x));
+        ttl::copy(ttl::ref(z), ttl::view(ry));
     }
     {
         cuda_tensor_view<R, r> vy = view(y);
-        vy.to_host(x.data());
+        ttl::copy(ttl::ref(x), vy);
     }
 }
diff --git a/tests/test_public_types.cpp b/tests/test_public_types.cpp
index 08f1712..42097a3 100644
--- a/tests/test_public_types.cpp
+++ b/tests/test_public_types.cpp
@@ -45,6 +45,16 @@ ttl::shape<r> unit_shape()
     return ttl::shape<r>(dims);
 }
 
+template <typename T>
+void test_public_apis(const T &t)
+{
+    const auto size = t.size();
+    ASSERT_EQ(size, static_cast<decltype(size)>(1));
+
+    const auto dims = t.dims();
+    static_assert(dims.size() == T::rank, "");
+}
+
 template <typename R, ttl::rank_t r>
 struct test_ranked_type {
     template <typename Tensor, typename TensorRef, typename TensorView>
@@ -65,6 +75,10 @@ struct test_ranked_type {
         Tensor t(unit_shape<r>());
         TensorRef tr(t);
         TensorView tv(t);
+
+        test_public_apis(t);
+        test_public_apis(tr);
+        test_public_apis(tv);
     }
 };
diff --git a/tests/test_range.cpp b/tests/test_range.cpp
index 41eb5f2..23565e5 100644
--- a/tests/test_range.cpp
+++ b/tests/test_range.cpp
@@ -4,14 +4,12 @@
 #include <ttl/range>
 #include <ttl/tensor>
 
-using ttl::range;
-
 int tri(int n) { return n * (n + 1) / 2; }
 
 void test_range_n(int n)
 {
     int s = 0;
-    for (auto i : range(n)) { s += i; }
+    for (auto i : ttl::range(n)) { s += i; }
     ASSERT_EQ(s, tri(n - 1));
 }
@@ -29,9 +27,9 @@ TEST(range_test, test_2)
 {
     ttl::tensor<int, 3> x(4, 5, 6);
     int idx = 0;
-    for (auto i : range<0>(x)) {
-        for (auto j : range<1>(x)) {
-            for (auto k : range<2>(x)) { x.at(i, j, k) = ++idx; }
+    for (auto i : ttl::range<0>(x)) {
+        for (auto j : ttl::range<1>(x)) {
+            for (auto k : ttl::range<2>(x)) { x.at(i, j, k) = ++idx; }
         }
     }
     ASSERT_EQ(ttl::sum(view(x)), tri(4 * 5 * 6));
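A closing note on the accessors that test_public_types.cpp checks: size() and dims() are now part of the uniform public surface of tensors, refs and views, including the rank-0 case. A rough sketch of the semantics (inspect is an illustrative name; the rank-0 behavior is inferred from the basic_scalar_mixin additions, where size() is 1 and dims() comes from the default-constructed shape):

    #include <ttl/tensor>

    void inspect()
    {
        ttl::tensor<float, 3> t(2, 3, 4);
        auto n = t.size();         // 2 * 3 * 4 = 24 elements
        const auto &d = t.dims();  // {2, 3, 4}; d.size() == 3 == rank

        ttl::tensor<float, 0> s;   // rank-0 scalar
        auto m = s.size();         // always 1
        (void)n; (void)d; (void)m;
    }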