QMCPACK · ye-luo · Aug 28, 2023 · Aug 22, 2023 · Aug 23, 2023 · Aug 24, 2023
diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.cpp
@@ -17,6 +17,7 @@
 #include "SplineR2R.h"
 #include "spline2/MultiBsplineEval.hpp"
 #include "QMCWaveFunctions/BsplineFactory/contraction_helper.hpp"
+#include "Platforms/CPU/BLAS.hpp"
 
 namespace qmcplusplus
 {
@@ -56,7 +57,7 @@ void SplineR2R<ST>::storeParamsBeforeRotation()
 {
   const auto spline_ptr     = SplineInst->getSplinePtr();
   const auto coefs_tot_size = spline_ptr->coefs_size;
-  coef_copy_                = std::make_shared<std::vector<RealType>>(coefs_tot_size);
+  coef_copy_                = std::make_shared<std::vector<ST>>(coefs_tot_size);
 
   std::copy_n(spline_ptr->coefs, coefs_tot_size, coef_copy_->begin());
 }
@@ -120,21 +121,12 @@ void SplineR2R<ST>::applyRotation(const ValueMatrix& rot_mat, bool use_stored_co
     std::copy_n(spl_coefs, coefs_tot_size, coef_copy_->begin());
   }
 
-  // Apply rotation the dumb way b/c I can't get BLAS::gemm to work...
-  for (auto i = 0; i < BasisSetSize; i++)
-  {
+  std::vector<ST> rot_mat_padded(Nsplines * Nsplines, 0); // zero-filled; only the OrbitalSetSize x OrbitalSetSize corner is set below
+  for (auto i = 0; i < OrbitalSetSize; i++)
     for (auto j = 0; j < OrbitalSetSize; j++)
-    {
-      const auto cur_elem = Nsplines * i + j;
-      auto newval{0.};
-      for (auto k = 0; k < OrbitalSetSize; k++)
-      {
-        const auto index = i * Nsplines + k;
-        newval += (*coef_copy_)[index] * rot_mat[k][j];
-      }
-      spl_coefs[cur_elem] = newval;
-    }
-  }
+       rot_mat_padded[i * Nsplines + j] = rot_mat.data()[i * OrbitalSetSize + j]; // re-stride rows from OrbitalSetSize to Nsplines
+  BLAS::gemm('N', 'N', Nsplines, BasisSetSize, Nsplines, ST(1.0), rot_mat_padded.data(), Nsplines, (*coef_copy_).data(), Nsplines, ST(0.0), spl_coefs, Nsplines);
+  // Replaces the explicit loop: spl_coefs[i * Nsplines + j] = sum_k coef_copy_[i * Nsplines + k] * rot_mat[k][j]. NOTE(review): operand order presumes BLAS::gemm is column-major -- confirm.
 }
 
 

diff --git a/src/QMCWaveFunctions/BsplineFactory/SplineR2R.h b/src/QMCWaveFunctions/BsplineFactory/SplineR2R.h
@@ -59,7 +59,7 @@ class SplineR2R : public BsplineSet
   std::shared_ptr<MultiBspline<ST>> SplineInst;
 
   ///Copy of original splines for orbital rotation
-  std::shared_ptr<std::vector<RealType>> coef_copy_;
+  std::shared_ptr<std::vector<ST>> coef_copy_;
 
   ///thread private ratios for reduction when using nested threading, numVP x numThread
   Matrix<TT> ratios_private;