Ctint gpu submatrix fix #336

Open
wants to merge 5 commits into base: master
164 changes: 98 additions & 66 deletions include/dca/linalg/matrix.hpp

Large diffs are not rendered by default.

16 changes: 9 additions & 7 deletions include/dca/linalg/vector.hpp
@@ -34,7 +34,7 @@ namespace dca {
namespace linalg {
// dca::linalg::

template <typename ScalarType, DeviceType device_name = DeviceType::CPU,
template <typename ScalarType, DeviceType device_name = DeviceType::CPU,
class Allocator = util::DefaultAllocator<ScalarType, device_name>>
class Vector : public Allocator {
public:
@@ -52,9 +52,8 @@ class Vector : public Allocator {

/** copy constructor except for name.
* this is strange but for historical reasons is kept.
* has needed to be explicit because with the `const ThisType&` somehow lead to an implicit conversion
* from an int to a Vector& argument that landed here.
* This occurred in Debug with
* has needed to be explicit because with the `const ThisType&` somehow lead to an implicit
* conversion from an int to a Vector& argument that landed here. This occurred in Debug with
*/
explicit Vector(const ThisType& rhs, const std::string& name = default_name_);

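Editor's note: a minimal, generic sketch (not DCA code) of what marking a copy-like constructor `explicit` changes: it can no longer be selected by implicit conversion or copy-initialization, only by direct initialization. The exact int-to-Vector& call chain the comment alludes to is not reconstructed here.

```cpp
#include <string>

struct Vec {
  Vec() = default;
  // Copy-like constructor with a defaulted name argument, marked explicit as in
  // the Vector declaration above.
  explicit Vec(const Vec& /*rhs*/, const std::string& /*name*/ = "unnamed") {}
};

void takeByValue(Vec) {}

int main() {
  Vec a;
  Vec b(a);          // OK: direct initialization may use the explicit constructor.
  // Vec c = a;      // ill-formed: copy-initialization cannot select an explicit constructor.
  // takeByValue(a); // ill-formed for the same reason: pass-by-value copy-initializes the parameter.
  return 0;
}
```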
@@ -323,7 +322,8 @@ void Vector<ScalarType, device_name, Allocator>::setAsync(const Container& rhs,
}

template <typename ScalarType, DeviceType device_name, class Allocator>
void Vector<ScalarType, device_name, Allocator>::setToZeroAsync(const util::GpuStream& stream [[maybe_unused]]) {
void Vector<ScalarType, device_name, Allocator>::setToZeroAsync(const util::GpuStream& stream
[[maybe_unused]]) {
// TODO: implement in copy.hpp.
#ifdef DCA_HAVE_GPU
checkRC(cudaMemsetAsync(data_, 0, size_ * sizeof(ScalarType), stream));
@@ -333,12 +333,14 @@ void Vector<ScalarType, device_name, Allocator>::setToZeroAsync(const util::GpuS
}
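Editor's note: a standalone sketch (not part of this PR) of the pattern setToZeroAsync wraps when DCA_HAVE_GPU is defined: cudaMemsetAsync only queues the zeroing on the given stream and returns immediately, so stream-ordered work or an explicit synchronization is needed before the data is read.

```cpp
#include <cuda_runtime.h>
#include <cassert>
#include <vector>

int main() {
  const size_t n = 1024;
  float* d_data = nullptr;
  cudaMalloc(&d_data, n * sizeof(float));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Queued on `stream`; the host continues immediately.
  cudaMemsetAsync(d_data, 0, n * sizeof(float), stream);

  // Copy back on the same stream, then wait for the stream to drain.
  std::vector<float> host(n, 1.f);
  cudaMemcpyAsync(host.data(), d_data, n * sizeof(float), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
  assert(host[0] == 0.f);

  cudaStreamDestroy(stream);
  cudaFree(d_data);
  return 0;
}
```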

template <typename ScalarType, DeviceType device_name, class Allocator>
void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream [[maybe_unused]]) {
void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream
[[maybe_unused]]) {
dca::linalg::util::Memory<device_name>::setToZero(data_, size_, stream);
}

// template <typename ScalarType, DeviceType device_name, class Allocator>
// void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream [[maybe_unused]]) {
// void Vector<ScalarType, device_name, Allocator>::setToZero(const util::GpuStream& stream
// [[maybe_unused]]) {
// // TODO: implement in copy.hpp.
// dca::linalg::util::memory<device_name>::setToZero(data_, size_, stream);
// }
@@ -33,6 +33,8 @@ namespace solver {
namespace ctint {
// dca::phys::solver::ctint::


// I think the class hierarchy isn't helpful here.
class SolverConfiguration : public MatrixConfiguration {
public:
using BaseClass = MatrixConfiguration;
@@ -41,6 +41,8 @@
#include "dca/phys/dca_step/cluster_solver/ctint/walker/tools/d_matrix_builder_gpu.hpp"
#endif

//#define DEBUG_SUBMATRIX

namespace dca {
namespace phys {
namespace solver {
@@ -238,7 +240,7 @@ CtintWalkerBase<Parameters, DIST>::CtintWalkerBase(const Parameters& parameters_
thread_id_(id),

streams_{&linalg::util::getStreamContainer()(thread_id_, 0),
&linalg::util::getStreamContainer()(thread_id_, 1)},
&linalg::util::getStreamContainer()(thread_id_, 0)},

rng_(rng_ref),

@@ -55,7 +55,8 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase<Parameters, DIST

using Resource = DMatrixBuilder<linalg::CPU, Scalar>;

CtintWalkerSubmatrixCpu(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref, DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder, int id = 0);
CtintWalkerSubmatrixCpu(const Parameters& pars_ref, const Data& /*data*/, Rng& rng_ref,
DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder, int id = 0);

virtual ~CtintWalkerSubmatrixCpu() = default;

@@ -64,11 +65,29 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase<Parameters, DIST
using BaseClass::order;

void setMFromConfig() override;

protected:
void doSteps();
void generateDelayedMoves(int nbr_of_movesto_delay);

/** This does a bunch of things, this is the majority of a step
void markThermalized() override;

void updateM() override;

DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder_;

BaseClass::MatrixPair getM();

/** The following methods are really only here to get decent unit testing
they shouldn't really be called outside of the base implementations
*/
void computeMInit() override;
void computeGInit() override;
BaseClass::MatrixPair getRawM();
BaseClass::MatrixPair getRawG();

private:
/** This does a bunch of things, this is the majority of a step
* For each delayed_move it:
* * computes the acceptance prob
* * compares std::abs(acceptance prob) and random value
@@ -82,17 +101,6 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase<Parameters, DIST
* weight_term = prob_const_[field_type][b] * the vertex interaction strength;
* BaseClass::mc_log_weight_ += std::log(std::abs(mc_weight_ratio));
*/
void mainSubmatrixProcess();
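Editor's note: an illustrative, self-contained sketch of the control flow the comment above describes. The types and member names below are toy stand-ins, not the actual CtintWalkerSubmatrixBase implementation.

```cpp
#include <cmath>
#include <cstdlib>
#include <vector>

// Toy stand-ins; the real walker stores moves, fields, and the Gamma/G matrices.
struct DelayedMove {
  double acceptance_prob;  // computed from the submatrix quantities
  double mc_weight_ratio;  // includes the prob_const_ / interaction-strength factor
};

struct WalkerSketch {
  std::vector<DelayedMove> delayed_moves_;
  double mc_log_weight_ = 0.;

  static double rng() { return std::rand() / (RAND_MAX + 1.0); }  // uniform in [0, 1)

  // For each delayed move: compute the acceptance probability, compare its
  // magnitude with a random number, and on acceptance accumulate the log of
  // the MC weight ratio (configuration and G updates are omitted here).
  void mainSubmatrixProcessSketch() {
    for (const auto& move : delayed_moves_) {
      if (rng() < std::abs(move.acceptance_prob))
        mc_log_weight_ += std::log(std::abs(move.mc_weight_ratio));
    }
  }
};

int main() {
  WalkerSketch w;
  w.delayed_moves_ = {{0.9, 0.9}, {-0.2, 0.5}};
  w.mainSubmatrixProcessSketch();
  return 0;
}
```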

void markThermalized() override;

void updateM() override;

void transformM();

DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder_;
private:

void doSubmatrixUpdate();

/** returns [acceptance_probability , mc_weight_ratio ]
Expand All @@ -103,10 +111,7 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase<Parameters, DIST

void removeRowAndColOfGammaInv();

void computeMInit() override;

// void computeG0Init();
void computeGInit() override;

Move generateMoveType();

@@ -174,11 +179,11 @@ class CtintWalkerSubmatrixCpu : public CtintWalkerSubmatrixBase<Parameters, DIST
using SubmatrixBase::nbr_of_indices_;
using SubmatrixBase::q_;
using SubmatrixBase::det_ratio_;

protected:
using BaseClass::acceptance_prob_;

protected:

using BaseClass::flop_;
};

@@ -187,8 +192,7 @@ CtintWalkerSubmatrixCpu<Parameters, DIST>::CtintWalkerSubmatrixCpu(
const Parameters& parameters_ref, const Data& data, Rng& rng_ref,
DMatrixBuilder<linalg::CPU, Scalar>& d_matrix_builder, int id)
: SubmatrixBase(parameters_ref, data, rng_ref, id), d_matrix_builder_(d_matrix_builder) {

for (int b = 0; b < n_bands_; ++b) {
for (int b = 0; b < n_bands_; ++b) {
for (int i = 1; i <= 3; ++i) {
f_[i][b] = d_matrix_builder_.computeF(i, b);
f_[-i][b] = d_matrix_builder_.computeF(-i, b);
@@ -202,17 +206,23 @@ CtintWalkerSubmatrixCpu<Parameters, DIST>::CtintWalkerSubmatrixCpu(
}
f_[0][b] = 1;
}

}

template <class Parameters, DistType DIST>
void CtintWalkerSubmatrixCpu<Parameters, DIST>::setMFromConfig() {
BaseClass::setMFromConfigImpl(d_matrix_builder_);
transformM();
SubmatrixBase::transformM();
#ifdef DEBUG_SUBMATRIX
std::cout << "cpu M post setMFromConfigImpl: \n";
M_[0].set_name("cpu_M_0");
M_[1].set_name("cpu_M_1");
M_[0].print();
M_[1].print();
#endif
}

template <class Parameters, DistType DIST>
void CtintWalkerSubmatrixCpu<Parameters,DIST>::markThermalized() {
void CtintWalkerSubmatrixCpu<Parameters, DIST>::markThermalized() {
thermalized_ = true;

nb_steps_per_sweep_ = std::max(1., std::ceil(sweeps_per_meas_ * partial_order_avg_.mean()));
@@ -225,7 +235,7 @@ void CtintWalkerSubmatrixCpu<Parameters,DIST>::markThermalized() {
// Recompute the Monte Carlo weight.
setMFromConfig();
#ifndef NDEBUG
//writeAlphas();
// writeAlphas();
#endif
}

@@ -242,26 +252,55 @@ void CtintWalkerSubmatrixCpu<Parameters, DIST>::computeMInit() {
Scalar f_j;
D_.resize(std::make_pair(delta, n_init_[s]));

if (delta == 0 || n_init_[s] == 0)
throw std::runtime_error(
"expansion factor dropped to 0 or below use a higher beta or larger interaction!");

d_matrix_builder_.computeG0(D_, configuration_.getSector(s), n_init_[s], n_max_[s], 0);

#ifdef DEBUG_SUBMATRIX
D_.print();
#endif

std::array<linalg::Vector<Real, linalg::CPU>, 2> f_values;
f_values[s].resize(n_init_[s]);
for (int j = 0; j < n_init_[s]; ++j) {
const auto field_type = configuration_.getSector(s).getAuxFieldType(j);
const auto b = configuration_.getSector(s).getRightB(j);
f_j = f_[field_type][b] - 1;

f_values[s][j] = f_[field_type][b];
for (int i = 0; i < delta; ++i) {
D_(i, j) *= f_j;
}
}

#ifdef DEBUG_SUBMATRIX
f_values[0].set_name("cpu_f_values_0");
f_values[1].set_name("cpu_f_values_1");
f_values[0].print();
f_values[1].print();
std::cout << "cpu D post f factor mult\n";
D_.print();
using namespace dca::addt_str_oper;
std::cout << "M_[" << s << "] size: " << M_[s].size() << '\n';
#endif

M_[s].resize(n_max_[s]);

MatrixView M(M_[s], 0, 0, n_init_[s], n_init_[s]);
MatrixView D_M(M_[s], n_init_[s], 0, delta, n_init_[s]);

#ifdef DEBUG_SUBMATRIX
std::cout << "cpu M pre gemm\n";
M_[s].print();
#endif

linalg::matrixop::gemm(D_, M, D_M);
flop_ += 2 * D_.nrRows() * D_.nrCols() * M.nrCols();

#ifdef DEBUG_SUBMATRIX
D_M.print();
#endif
for (int i = 0; i < n_max_[s]; ++i) {
for (int j = n_init_[s]; j < n_max_[s]; ++j) {
M_[s](i, j) = 0;
Expand All @@ -270,6 +309,9 @@ void CtintWalkerSubmatrixCpu<Parameters, DIST>::computeMInit() {

for (int i = n_init_[s]; i < n_max_[s]; ++i)
M_[s](i, i) = 1;
#ifdef DEBUG_SUBMATRIX
M_[s].print();
#endif
}
}
}
@@ -386,6 +428,8 @@ void CtintWalkerSubmatrixCpu<Parameters, DIST>::updateM() {
linalg::matrixop::copyCols(M_[s], source_list_[s], M_[s], removal_list_[s]);
M_[s].resize(configuration_.getSector(s).size());
}


}

template <class Parameters, DistType DIST>
@@ -493,20 +537,6 @@ void CtintWalkerSubmatrixCpu<Parameters, DIST>::recomputeGammaInv() {
}
}

template <class Parameters, DistType DIST>
void CtintWalkerSubmatrixCpu<Parameters, DIST>::transformM() {
for (int s = 0; s < 2; ++s) {
for (int j = 0; j < M_[s].size().second; ++j) {
for (int i = 0; i < M_[s].size().first; ++i) {
const auto field_type = configuration_.getSector(s).getAuxFieldType(i);
const auto b = configuration_.getSector(s).getLeftB(i);
const Scalar f_i = -(f_[field_type][b] - 1);
M_[s](i, j) /= f_i;
}
}
}
}

template <class Parameters, DistType DIST>
void CtintWalkerSubmatrixCpu<Parameters, DIST>::computeM(typename BaseClass::MatrixPair& m_accum) {
for (int s = 0; s < 2; ++s) {
@@ -591,6 +621,34 @@ void CtintWalkerSubmatrixCpu<Parameters, DIST>::computeMixedInsertionAndRemoval(
computeRemovalMatrix(s);
}

template <class Parameters, DistType DIST>
CtintWalkerSubmatrixCpu<Parameters, DIST>::BaseClass::MatrixPair CtintWalkerSubmatrixCpu<
Parameters, DIST>::getRawM() {
typename BaseClass::MatrixPair M;
M = M_;
M[0].set_name("subMatrixCPU::M[0]");
M[1].set_name("subMatrixCPU::M[1]");
return M;
}

template <class Parameters, DistType DIST>
CtintWalkerSubmatrixCpu<Parameters, DIST>::BaseClass::MatrixPair CtintWalkerSubmatrixCpu<
Parameters, DIST>::getRawG() {
typename BaseClass::MatrixPair G;
G = G_;
G[0].set_name("subMatrixCPU::G[0]");
G[1].set_name("subMatrixCPU::G[1]");
return G;
}

template <class Parameters, DistType DIST>
CtintWalkerSubmatrixCpu<Parameters, DIST>::BaseClass::MatrixPair CtintWalkerSubmatrixCpu<Parameters,
DIST>::getM() {
typename BaseClass::MatrixPair M;
computeM(M);
return M;
}
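Editor's note: an illustrative sketch (gtest-style, matching the comment above that these accessors exist for unit testing) of how getRawM()/getRawG() output could be checked. Walker construction and any GPU-to-host copies are assumed to happen elsewhere; the helper only compares two host-resident matrix pairs.

```cpp
#include <gtest/gtest.h>

// Compare two MatrixPair-like objects element-wise; `a` and `b` are assumed to
// be host-resident matrices (e.g. the results of getRawM()/getRawG(), with any
// GPU data copied to the host first).
template <class MatrixPair>
void expectMatrixPairsNear(const MatrixPair& a, const MatrixPair& b, double tol = 1e-12) {
  for (int s = 0; s < 2; ++s) {
    ASSERT_EQ(a[s].size(), b[s].size());
    for (int j = 0; j < a[s].size().second; ++j)
      for (int i = 0; i < a[s].size().first; ++i)
        EXPECT_NEAR(a[s](i, j), b[s](i, j), tol);
  }
}

// Usage sketch inside a test body (walker names are hypothetical):
//   expectMatrixPairsNear(cpu_walker.getRawM(), m_copied_from_gpu_walker);
```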

} // namespace ctint
} // namespace solver
} // namespace phys