diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1bb16b1..f0927c4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,8 @@ OPTION(BUILD_EXAMPLES "Build examples." OFF)
 OPTION(HAVE_CUDA "Have cuda_runtime.h." OFF)
 
 IF(HAVE_CUDA)
-    # noop
+    INCLUDE_DIRECTORIES(${CUDA_HOME}/include)
+    LINK_DIRECTORIES(${CUDA_HOME}/lib64)
 ELSE()
     ADD_DEFINITIONS(-DUSE_FAKE_CUDA_RUNTIME)
 ENDIF()
diff --git a/INSTALL b/INSTALL
new file mode 100755
index 0000000..6915429
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,10 @@
+#!/bin/sh
+set -e
+
+if [ -z "$PREFIX" ]; then
+    PREFIX=$HOME/local
+fi
+
+./configure --prefix="$PREFIX"
+
+make install
diff --git a/configure b/configure
index 125d8f5..1dd3fd9 100755
--- a/configure
+++ b/configure
@@ -2,6 +2,7 @@
 set -e
 
 PREFIX=$(pwd)/local
+CUDA_HOME=/usr/local/cuda
 USE_OPENCV=0
 BUILD_TESTS=0
 BUILD_BENCHMARKS=0
@@ -11,10 +12,6 @@ BUILD_GBENCH=0
 HAVE_CUDA=0
 VERBOSE=0
 
-if [ $(find /usr/include/cuda_runtime.h | wc -l) -gt 0 ]; then
-    HAVE_CUDA=1
-fi
-
 parse_args() {
     for i in "$@"; do
         case $i in
@@ -48,6 +45,10 @@ parse_args() {
         --build-gbench)
             BUILD_GBENCH=1
             ;;
+        --with-cuda=*)
+            CUDA_HOME="${i#*=}"
+            echo "configure --with-cuda=$CUDA_HOME"
+            ;;
         --verbose)
             VERBOSE=1
             ;;
@@ -57,6 +58,10 @@
             ;;
     esac
     done
+
+    if [ -f "$CUDA_HOME/include/cuda_runtime.h" ]; then
+        HAVE_CUDA=1
+    fi
 }
 
 CMAKE_FLAGS=
@@ -96,7 +101,11 @@ add_cmake_flags() {
     add_cmake_flag BUILD_TESTS ${BUILD_TESTS}
     add_cmake_flag BUILD_BENCHMARKS ${BUILD_BENCHMARKS}
     add_cmake_flag BUILD_EXAMPLES ${BUILD_EXAMPLES}
-    add_cmake_flag HAVE_CUDA ${HAVE_CUDA}
+
+    if [ ${HAVE_CUDA} -eq 1 ]; then
+        add_cmake_flag HAVE_CUDA ${HAVE_CUDA}
+        add_cmake_flag CUDA_HOME "$CUDA_HOME"
+    fi
 
     if [ ${BUILD_EXAMPLES} -eq 1 ]; then
         add_cmake_flag USE_OPENCV ${USE_OPENCV}
diff --git a/include/ttl/bits/std_copy.hpp b/include/ttl/bits/std_copy.hpp
new file mode 100644
index 0000000..a754694
--- /dev/null
+++ b/include/ttl/bits/std_copy.hpp
@@ -0,0 +1,28 @@
+#pragma once
+#include <ttl/bits/std_cuda_allocator.hpp>
+#include <ttl/bits/std_tensor.hpp>
+
+namespace ttl
+{
+namespace internal
+{
+namespace experimental
+{
+template <typename R, typename S>
+void copy(const basic_tensor<R, S, cuda_memory, readwrite> &dst,
+          const basic_tensor<R, S, host_memory, readonly> &src)
+{
+    using copier = internal::cuda_copier;
+    copier::copy<copier::h2d>(dst.data(), src.data(), src.data_size());
+}
+
+template <typename R, typename S>
+void copy(const basic_tensor<R, S, host_memory, readwrite> &dst,
+          const basic_tensor<R, S, cuda_memory, readonly> &src)
+{
+    using copier = internal::cuda_copier;
+    copier::copy<copier::d2h>(dst.data(), src.data(), src.data_size());
+}
+} // namespace experimental
+} // namespace internal
+} // namespace ttl
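The two overloads above give a direction-checked replacement for from_host/to_host: the memory tags in the tensor types select the h2d or d2h copier at compile time, so a copy in an impossible direction simply does not overload-resolve. A minimal round-trip sketch (the function name round_trip is illustrative; ttl::tensor, ttl::cuda_tensor, ttl::ref and ttl::view are the public names exercised by the tests further down):

    #include <ttl/cuda_tensor>
    #include <ttl/experimental/copy>
    #include <ttl/tensor>

    void round_trip(int n)
    {
        ttl::tensor<float, 1> host(n);      // host-side buffer
        ttl::cuda_tensor<float, 1> dev(n);  // device-side buffer

        // ref(...) marks the writable destination, view(...) the read-only source.
        ttl::copy(ttl::ref(dev), ttl::view(host));  // host -> device
        ttl::copy(ttl::ref(host), ttl::view(dev));  // device -> host
    }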
diff --git a/include/ttl/bits/std_range.hpp b/include/ttl/bits/std_range.hpp
index b4f3e70..1363838 100644
--- a/include/ttl/bits/std_range.hpp
+++ b/include/ttl/bits/std_range.hpp
@@ -45,6 +45,5 @@ basic_integer_range<N> range(N m, N n)
 {
     return basic_integer_range<N>(m, n);
 }
-
 } // namespace internal
 } // namespace ttl
diff --git a/include/ttl/bits/std_tensor_mixin.hpp b/include/ttl/bits/std_tensor_mixin.hpp
index 36cce84..eca1d9e 100644
--- a/include/ttl/bits/std_tensor_mixin.hpp
+++ b/include/ttl/bits/std_tensor_mixin.hpp
@@ -15,6 +15,8 @@ class basic_scalar_mixin
     using data_ref = typename trait::ref_type;
     using data_t = typename trait::Data;
 
+    using Dim = typename S::dimension_type;
+
     data_t data_;
 
   protected:
@@ -33,6 +35,10 @@ class basic_scalar_mixin
     basic_scalar_mixin(data_ptr data, const S &) : data_(data) {}
 
+    constexpr Dim size() const { return 1; }
+
+    constexpr auto dims() const { return S().dims(); }
+
     constexpr size_t data_size() const { return sizeof(R); }
 
     data_ptr data() const { return data_.get(); }
 
@@ -40,16 +46,6 @@ class basic_scalar_mixin
     data_ptr data_end() const { return data_.get() + 1; }
 
     S shape() const { return S(); }
-
-    void from_host(const void *data) const
-    {
-        basic_copier<D, host_memory>()(data_.get(), data, data_size());
-    }
-
-    void to_host(void *data) const
-    {
-        basic_copier<host_memory, D>()(data, data_.get(), data_size());
-    }
 };
 
 template <typename R, typename S, typename D, typename A>
@@ -121,6 +117,10 @@ class basic_tensor_mixin
     static constexpr auto rank = S::rank;
 
+    Dim size() const { return shape_.size(); }
+
+    const auto &dims() const { return shape_.dims(); }
+
     size_t data_size() const { return shape_.size() * sizeof(R); }
 
     const S &shape() const { return shape_; }
@@ -158,16 +158,6 @@ class basic_tensor_mixin
         return slice_type(data_.get() + i * sub_shape.size(),
                           batch(j - i, sub_shape));
     }
-
-    void from_host(const void *data) const
-    {
-        basic_copier<D, host_memory>()(data_.get(), data, data_size());
-    }
-
-    void to_host(void *data) const
-    {
-        basic_copier<host_memory, D>()(data, data_.get(), data_size());
-    }
 };
 } // namespace internal
 } // namespace ttl
diff --git a/include/ttl/experimental/copy b/include/ttl/experimental/copy
new file mode 100644
index 0000000..4c86da6
--- /dev/null
+++ b/include/ttl/experimental/copy
@@ -0,0 +1,8 @@
+// # -*- mode: c++ -*-
+#pragma once
+#include <ttl/bits/std_copy.hpp>
+
+namespace ttl
+{
+using internal::experimental::copy;
+} // namespace ttl
diff --git a/include/ttl/range b/include/ttl/range
index 14dedab..10de5df 100644
--- a/include/ttl/range
+++ b/include/ttl/range
@@ -4,6 +4,7 @@
 
 #include <cstdint>
 #include <ttl/bits/std_range.hpp>
+#include <ttl/bits/std_tensor.hpp>
 
 namespace ttl
 {
@@ -11,10 +12,9 @@ using internal::range;
 
 using rank_t = uint8_t;
 
-// FIXME: make T less generic
-template <rank_t r, typename T> auto range(const T &t)
+template <rank_t r, typename R, typename S, typename D, typename A>
+auto range(const internal::basic_tensor<R, S, D, A> &t)
 {
     return range(std::get<r>(t.shape().dims()));
 }
-
 } // namespace ttl
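This resolves the old FIXME: rather than matching any T with a shape() member, the rank-indexed range<r> overload now participates only for tensor types. Call sites are unchanged; a small sketch of the intended use (fill_by_index is an illustrative name, the pattern mirrors test_range.cpp below):

    #include <ttl/range>
    #include <ttl/tensor>

    void fill_by_index()
    {
        ttl::tensor<int, 2> x(3, 4);
        // range<r>(x) iterates over [0, std::get<r>(x.shape().dims())).
        for (auto i : ttl::range<0>(x)) {
            for (auto j : ttl::range<1>(x)) { x.at(i, j) = 0; }
        }
    }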
diff --git a/tests/bench_cuda_tensor.cpp b/tests/bench_cuda_tensor.cpp
index 7bb5ddd..c21debd 100644
--- a/tests/bench_cuda_tensor.cpp
+++ b/tests/bench_cuda_tensor.cpp
@@ -1,16 +1,18 @@
 #include "benchmark.hpp"
 
 #include <ttl/cuda_tensor>
+#include <ttl/experimental/copy>
 
-template <typename R, size_t n> struct bench_cuda_tensor {
+template <typename R, size_t n>
+struct bench_cuda_tensor {
     static void run(benchmark::State &state)
     {
         ttl::cuda_tensor<R, 1> m1(n);
         ttl::tensor<R, 1> m2(n);
 
         for (auto _ : state) {
-            m1.from_host(m2.data());
-            m1.to_host(m2.data());
+            ttl::copy(ttl::ref(m1), ttl::view(m2));
+            ttl::copy(ttl::ref(m2), ttl::view(m1));
         }
     }
 };
diff --git a/tests/test_copy.cpp b/tests/test_copy.cpp
new file mode 100644
index 0000000..d80af3f
--- /dev/null
+++ b/tests/test_copy.cpp
@@ -0,0 +1,35 @@
+#include "testing.hpp"
+
+#include <ttl/algorithm>
+#include <ttl/cuda_tensor>
+#include <ttl/device>
+#include <ttl/experimental/copy>
+#include <ttl/range>
+#include <ttl/tensor>
+
+void test_copy(int n)
+{
+    ttl::tensor<int, 1> x_host(n);
+    ttl::cuda_tensor<int, 1> x_cuda(n);
+
+    ttl::fill(ttl::ref(x_host), 1);
+    ttl::copy(ttl::ref(x_cuda), ttl::view(x_host));
+
+    ttl::fill(ttl::ref(x_host), 2);
+    for (auto i : ttl::range<0>(x_host)) { ASSERT_EQ(x_host.data()[i], 2); }
+
+    ttl::copy(ttl::ref(x_host), ttl::view(x_cuda));
+    for (auto i : ttl::range<0>(x_host)) { ASSERT_EQ(x_host.data()[i], 1); }
+}
+
+TEST(copy_test, test_copy)
+{
+    test_copy(1);
+    test_copy(2);
+    test_copy(10);
+    test_copy(100);
+    test_copy(1000);
+    test_copy(1 << 20);
+    test_copy(1 << 20);
+    test_copy(1 << 20);
+}
diff --git a/tests/test_cuda_tensor.cpp b/tests/test_cuda_tensor.cpp
index 3c431ab..bba1942 100644
--- a/tests/test_cuda_tensor.cpp
+++ b/tests/test_cuda_tensor.cpp
@@ -1,6 +1,7 @@
 #include "testing.hpp"
 
 #include <ttl/cuda_tensor>
+#include <ttl/experimental/copy>
 #include <ttl/range>
 #include <ttl/tensor>
@@ -23,11 +24,10 @@ TEST(cuda_tensor_test, test0)
 {
     using R = float;
     cuda_tensor<R, 0> m0;
-
     tensor<R, 0> x;
 
-    m0.from_host(x.data());
-    m0.to_host(x.data());
+    ttl::copy(ttl::ref(m0), ttl::view(x));
+    ttl::copy(ttl::ref(x), ttl::view(m0));
 }
 
 TEST(cuda_tensor_test, test1)
@@ -42,8 +42,8 @@ TEST(cuda_tensor_test, test2)
     cuda_tensor<R, 2> m1(10, 100);
     tensor<R, 2> m2(10, 100);
 
-    m1.from_host(m2.data());
-    m1.to_host(m2.data());
+    ttl::copy(ttl::ref(m1), ttl::view(m2));
+    ttl::copy(ttl::ref(m2), ttl::view(m1));
 
     m1.slice(1, 2);
     auto r = ref(m1);
@@ -58,14 +58,16 @@ TEST(cuda_tensor_test, test_3)
     cuda_tensor<R, 2> m1(ttl::make_shape(10, 100));
 }
 
-template <typename R> void test_auto_ref()
+template <typename R>
+void test_auto_ref()
 {
     static_assert(
         std::is_convertible<cuda_tensor<R, 1>, cuda_tensor_ref<R, 1>>::value,
         "can't convert to ref");
 }
 
-template <typename R> void test_auto_view()
+template <typename R>
+void test_auto_view()
 {
     static_assert(
         std::is_convertible<cuda_tensor<R, 1>, cuda_tensor_view<R, 1>>::value,
@@ -87,15 +89,17 @@ TEST(cuda_tensor_test, test_convert)
     test_auto_view<float>();
 }
 
-template <typename R, ttl::rank_t r> void test_copy(const ttl::shape<r> &shape)
+template <typename R, ttl::rank_t r>
+void test_copy(const ttl::shape<r> &shape)
 {
     tensor<R, r> x(shape);
     cuda_tensor<R, r> y(shape);
     tensor<R, r> z(shape);
 
     std::iota(x.data(), x.data_end(), 1);
-    y.from_host(x.data());
-    y.to_host(z.data());
+
+    ttl::copy(ttl::ref(y), ttl::view(x));
+    ttl::copy(ttl::ref(z), ttl::view(y));
 
     for (auto i : ttl::range(shape.size())) {
         ASSERT_EQ(x.data()[i], z.data()[i]);
@@ -103,12 +107,12 @@ template <typename R, ttl::rank_t r> void test_copy(const ttl::shape<r> &shape)
     {
         cuda_tensor_ref<R, r> ry = ref(y);
-        ry.from_host(x.data());
-        ry.to_host(x.data());
+        ttl::copy(ry, ttl::view(x));
+        ttl::copy(ttl::ref(z), ttl::view(ry));
     }
     {
         cuda_tensor_view<R, r> vy = view(y);
-        vy.to_host(x.data());
+        ttl::copy(ttl::ref(x), vy);
     }
 }
diff --git a/tests/test_public_types.cpp b/tests/test_public_types.cpp
index 08f1712..42097a3 100644
--- a/tests/test_public_types.cpp
+++ b/tests/test_public_types.cpp
@@ -45,6 +45,16 @@ ttl::shape<r> unit_shape()
     return ttl::shape<r>(dims);
 }
 
+template <typename T>
+void test_public_apis(const T &t)
+{
+    const auto size = t.size();
+    ASSERT_EQ(size, static_cast<decltype(size)>(1));
+
+    const auto dims = t.dims();
+    static_assert(dims.size() == T::rank, "");
+}
+
 template <typename R, ttl::rank_t r>
 struct test_ranked_type {
     template <typename Tensor, typename TensorRef, typename TensorView>
@@ -65,6 +75,10 @@ struct test_ranked_type {
         Tensor t(unit_shape<r>());
         TensorRef tr(t);
         TensorView tv(t);
+
+        test_public_apis(t);
+        test_public_apis(tr);
+        test_public_apis(tv);
     }
 };
diff --git a/tests/test_range.cpp b/tests/test_range.cpp
index 41eb5f2..23565e5 100644
--- a/tests/test_range.cpp
+++ b/tests/test_range.cpp
@@ -4,14 +4,12 @@
 #include <ttl/range>
 #include <ttl/tensor>
 
-using ttl::range;
-
 int tri(int n) { return n * (n + 1) / 2; }
 
 void test_range_n(int n)
 {
     int s = 0;
-    for (auto i : range(n)) { s += i; }
+    for (auto i : ttl::range(n)) { s += i; }
     ASSERT_EQ(s, tri(n - 1));
 }
@@ -29,9 +27,9 @@ TEST(range_test, test_2)
 {
     ttl::tensor<int, 3> x(4, 5, 6);
     int idx = 0;
-    for (auto i : range<0>(x)) {
-        for (auto j : range<1>(x)) {
-            for (auto k : range<2>(x)) { x.at(i, j, k) = ++idx; }
+    for (auto i : ttl::range<0>(x)) {
+        for (auto j : ttl::range<1>(x)) {
+            for (auto k : ttl::range<2>(x)) { x.at(i, j, k) = ++idx; }
         }
     }
     ASSERT_EQ(ttl::sum(view(x)), tri(4 * 5 * 6));
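A closing note on the accessors that test_public_types.cpp checks: size() and dims() are now part of the uniform public surface of tensors, refs and views, including the rank-0 case. A rough sketch of the semantics (inspect is an illustrative name; the rank-0 behavior is inferred from the basic_scalar_mixin additions, where size() is 1 and dims() comes from the default-constructed shape):

    #include <ttl/tensor>

    void inspect()
    {
        ttl::tensor<float, 3> t(2, 3, 4);
        auto n = t.size();         // 2 * 3 * 4 = 24 elements
        const auto &d = t.dims();  // {2, 3, 4}; d.size() == 3 == rank

        ttl::tensor<float, 0> s;   // rank-0 scalar
        auto m = s.size();         // always 1
        (void)n; (void)d; (void)m;
    }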