From 5c96f23d1d15c9bdde1cd65f389ff79d29534d8f Mon Sep 17 00:00:00 2001 From: "Aart J.C. Bik" Date: Wed, 22 Jan 2025 11:26:26 -0800 Subject: [PATCH] Implement the () operator on sparse tensors Note that this is a fully functional "locator" for the () operator that works for *all* versatile sparse tensors. Currently, the operator is defined at sparse_tensor level, but it should be moved into tensor_impl after this. Also, clients should always be aware that the () operator for compressed levels is *not* random-access, but involves a search to find if the element is stored. --- examples/sparse_tensor.cu | 36 ++++++++++- include/matx/core/make_sparse_tensor.h | 16 ++++- include/matx/core/sparse_tensor.h | 76 ++++++++++++++++++++++++ include/matx/core/sparse_tensor_format.h | 58 ++++++++++-------- include/matx/core/tensor_impl.h | 3 +- 5 files changed, 159 insertions(+), 30 deletions(-) diff --git a/examples/sparse_tensor.cu b/examples/sparse_tensor.cu index 40cca436..2ac5cb14 100644 --- a/examples/sparse_tensor.cu +++ b/examples/sparse_tensor.cu @@ -41,6 +41,22 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStream_t stream = 0; cudaExecutor exec{stream}; + // + // Print some formats that are used for the versatile sparse tensor + // type. Note that common formats like COO and CSR have good library + // support in e.g. cuSPARSE, but MatX provides a much more general + // way to define the sparse tensor storage through a DSL (see doc). 
+ // + experimental::Scalar::print(); // scalars + experimental::SpVec::print(); // sparse vectors + experimental::COO::print(); // various sparse matrix formats + experimental::CSR::print(); + experimental::CSC::print(); + experimental::DCSR::print(); + experimental::BSR<2,2>::print(); // 2x2 blocks + experimental::COO4::print(); // 4-dim tensor in COO + experimental::CSF5::print(); // 5-dim tensor in CSF + // // Creates a COO matrix for the following 4x8 dense matrix with 5 nonzero // elements, using the factory method that uses MatX tensors for the 1-dim @@ -71,7 +87,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // // This shows: // - // tensor_impl_2_f32: Tensor{float} Rank: 2, Sizes:[4, 8], Levels:[4, 8] + // tensor_impl_2_f32: SparseTensor{float} Rank: 2, Sizes:[4, 8], Levels:[4, 8] // nse = 5 // format = ( d0, d1 ) -> ( d0 : compressed(non-unique), d1 : singleton ) // crd[0] = ( 0 0 3 3 3 ) @@ -81,6 +97,24 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // print(Acoo); + // + // A very naive way to convert the sparse matrix back to a dense + // matrix. Note that one should **never** use the ()-operator in + // performance critical code, since sparse data structures do + // not provide O(1) random access to their elements (compressed + // levels will use some form of search to determine if an element + // is present). Instead, conversions (and other operations) should + // use sparse operations that are tailored for the sparse data + // structure (such as scanning by row for CSR). 
+ // + tensor_t Dense{{m, n}}; + for (index_t i = 0; i < m; i++) { + for (index_t j = 0; j < n; j++) { + Dense(i, j) = Acoo(i, j); + } + } + print(Dense); + // TODO: operations on Acoo MATX_EXIT_HANDLER(); diff --git a/include/matx/core/make_sparse_tensor.h b/include/matx/core/make_sparse_tensor.h index 3cffa46c..6d81565b 100644 --- a/include/matx/core/make_sparse_tensor.h +++ b/include/matx/core/make_sparse_tensor.h @@ -55,11 +55,23 @@ auto make_tensor_coo(ValTensor &val, CrdTensor &row, CrdTensor &col, static_assert(ValTensor::Rank() == 1 && CrdTensor::Rank() == 1); using VAL = typename ValTensor::value_type; using CRD = typename CrdTensor::value_type; - using POS = int; // no positions, although some forms use [0,nse] + using POS = index_t; + // Note that the COO API typically does not involve positions. + // However, under the formal DSL specifications, the top level + // compression should set up pos[0] = {0, nse}. This is done + // here, using the same memory space as the other data. 
+ POS *ptr; + matxMemorySpace_t space = GetPointerKind(val.GetStorage().data()); + matxAlloc((void **)&ptr, 2 * sizeof(POS), space, 0); + ptr[0] = 0; + ptr[1] = val.Size(0); + raw_pointer_buffer> topp{ptr, 2 * sizeof(POS), + owning}; + basic_storage tp{std::move(topp)}; raw_pointer_buffer> emptyp{nullptr, 0, owning}; basic_storage ep{std::move(emptyp)}; return sparse_tensor_t( - shape, val.GetStorage(), {row.GetStorage(), col.GetStorage()}, {ep, ep}); + shape, val.GetStorage(), {row.GetStorage(), col.GetStorage()}, {tp, ep}); } // Constructs a sparse matrix in CSR format directly from the values, the row diff --git a/include/matx/core/sparse_tensor.h b/include/matx/core/sparse_tensor.h index 473325b0..e0a1e03d 100644 --- a/include/matx/core/sparse_tensor.h +++ b/include/matx/core/sparse_tensor.h @@ -109,6 +109,82 @@ class sparse_tensor_t index_t crdSize(int l) const { return coordinates_[l].size() / sizeof(CRD); } index_t posSize(int l) const { return positions_[l].size() / sizeof(POS); } + // Locates position of an element at given indices, or returns -1 when not + // found. + template + __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t + GetPos(index_t *lvlsz, index_t *lvl, index_t pos) const { + if constexpr (L < LVL) { + using ftype = std::tuple_element_t; + if constexpr (ftype::lvltype == LvlType::Dense) { + // Dense level: pos * size + i. + // TODO: see below, use a constexpr GetLvlSize(L) instead? + const index_t dpos = pos * lvlsz[L] + lvl[L]; + if constexpr (L + 1 < LVL) { + return GetPos(lvlsz, lvl, dpos); + } else { + return dpos; + } + } else if constexpr (ftype::lvltype == LvlType::Singleton) { + // Singleton level: pos if crd[pos] == i and next levels match. 
+ if (this->CRDData(L)[pos] == lvl[L]) { + if constexpr (L + 1 < LVL) { + return GetPos(lvlsz, lvl, pos); + } else { + return pos; + } + } + } else if constexpr (ftype::lvltype == LvlType::Compressed || + ftype::lvltype == LvlType::CompressedNonUnique) { + // Compressed level: scan for match on i and test next levels. + const CRD *c = this->CRDData(L); + const POS *p = this->POSData(L); + for (index_t pp = p[pos], hi = p[pos + 1]; pp < hi; pp++) { + if (c[pp] == lvl[L]) { + if constexpr (L + 1 < LVL) { + const index_t cpos = GetPos(lvlsz, lvl, pp); + if constexpr (ftype::lvltype == LvlType::Compressed) { + return cpos; // always end scan (unique) + } else if (cpos != -1) { + return cpos; // only end scan on success (non-unique) + } + } else { + return pp; + } + } + } + } + } + return -1; // not found + } + + // Element getter (viz. "lhs = Acoo(0,0);"). Note that due to the compact + // nature of sparse data structures, these storage formats do not provide + // cheap random access to their elements. Instead, the implementation will + // search for a stored element at the given position (which involves a scan + // at each compressed level). The implicit value zero is returned when the + // element cannot be found. So, although functional for testing, clients + // should avoid using getters inside performance critical regions, since + // the implementation is far worse than O(1). + template + __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ VAL + operator()(Is... indices) const noexcept { + static_assert( + sizeof...(Is) == DIM, + "Number of indices of operator() must match rank of sparse tensor"); + cuda::std::array dim{indices...}; + cuda::std::array lvl; + cuda::std::array lvlsz; + TF::dim2lvl(dim.data(), lvl.data(), /*asSize=*/false); + // TODO: only compute once and provide a constexpr LvlSize(l) instead? 
+ TF::dim2lvl(this->Shape().data(), lvlsz.data(), /*asSize=*/true); + const index_t pos = GetPos(lvlsz.data(), lvl.data(), 0); + if (pos != -1) { + return this->Data()[pos]; + } + return static_cast(0); // implicit zero + } + private: // Primary storage of sparse tensor (explicitly stored element values). StorageV values_; diff --git a/include/matx/core/sparse_tensor_format.h b/include/matx/core/sparse_tensor_format.h index f5b733ef..cdba702d 100644 --- a/include/matx/core/sparse_tensor_format.h +++ b/include/matx/core/sparse_tensor_format.h @@ -255,42 +255,48 @@ template class SparseTensorFormat { template __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ static void dim2lvl(const CRD *dims, CRD *lvls, bool asSize) { - using ftype = std::tuple_element_t; - if constexpr (ftype::expr::op == LvlOp::Id) { - lvls[L] = dims[ftype::expr::di]; - } else if constexpr (ftype::expr::op == LvlOp::Div) { - lvls[L] = dims[ftype::expr::di] / ftype::expr::cj; - } else if constexpr (ftype::expr::op == LvlOp::Mod) { - lvls[L] = - asSize ? ftype::expr::cj : (dims[ftype::expr::di] % ftype::expr::cj); - } - if constexpr (L + 1 < LVL) { - dim2lvl(dims, lvls, asSize); + if constexpr (L < LVL) { + using ftype = std::tuple_element_t; + if constexpr (ftype::expr::op == LvlOp::Id) { + lvls[L] = dims[ftype::expr::di]; + } else if constexpr (ftype::expr::op == LvlOp::Div) { + lvls[L] = dims[ftype::expr::di] / ftype::expr::cj; + } else if constexpr (ftype::expr::op == LvlOp::Mod) { + lvls[L] = asSize ? 
ftype::expr::cj + : (dims[ftype::expr::di] % ftype::expr::cj); + } + if constexpr (L + 1 < LVL) { + dim2lvl(dims, lvls, asSize); + } } } template __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ static void lvl2dim(const CRD *lvls, CRD *dims) { - using ftype = std::tuple_element_t; - if constexpr (ftype::expr::op == LvlOp::Id) { - dims[ftype::expr::di] = lvls[L]; - } else if constexpr (ftype::expr::op == LvlOp::Div) { - dims[ftype::expr::di] = lvls[L] * ftype::expr::cj; - } else if constexpr (ftype::expr::op == LvlOp::Mod) { - dims[ftype::expr::di] += lvls[L]; // update (seen second) - } - if constexpr (L + 1 < LVL) { - lvl2dim(lvls, dims); + if constexpr (L < LVL) { + using ftype = std::tuple_element_t; + if constexpr (ftype::expr::op == LvlOp::Id) { + dims[ftype::expr::di] = lvls[L]; + } else if constexpr (ftype::expr::op == LvlOp::Div) { + dims[ftype::expr::di] = lvls[L] * ftype::expr::cj; + } else if constexpr (ftype::expr::op == LvlOp::Mod) { + dims[ftype::expr::di] += lvls[L]; // update (seen second) + } + if constexpr (L + 1 < LVL) { + lvl2dim(lvls, dims); + } } } template static void printLevel() { - using ftype = std::tuple_element_t; - std::cout << " " << ftype::toString(); - if constexpr (L + 1 < LVL) { - std::cout << ","; - printLevel(); + if constexpr (L < LVL) { + using ftype = std::tuple_element_t; + std::cout << " " << ftype::toString(); + if constexpr (L + 1 < LVL) { + std::cout << ","; + printLevel(); + } } } diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index 4bfe1586..12fff7ce 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -637,7 +637,8 @@ MATX_IGNORE_WARNING_POP_GCC * @return * A shape of the data with the appropriate dimensions set */ - __MATX_INLINE__ auto Shape() const noexcept { return this->desc_.Shape(); } + __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ + auto Shape() const noexcept { return this->desc_.Shape(); } /** * Get the strides the tensor from the underlying data