MPS with cuQuantum #2168

Open
wants to merge 34 commits into main from mps-cutensor
Changes from 24 commits
Commits (34)
563ae6e
initial layout
MozammilQ Jun 5, 2024
280f868
refactor code
MozammilQ Jun 5, 2024
ae44c69
refactor code
MozammilQ Jun 6, 2024
5b48265
refactor code
MozammilQ Jun 6, 2024
517a554
refactor code
MozammilQ Jun 6, 2024
7e40588
refactor code
MozammilQ Jun 6, 2024
ebf9ca0
refactor code
MozammilQ Jun 6, 2024
a422690
refactor code
MozammilQ Jun 6, 2024
80b59d5
refactor code
MozammilQ Jun 6, 2024
52f1ed4
refactor code
MozammilQ Jun 7, 2024
ed43e71
refactor code
MozammilQ Jun 9, 2024
649a5d7
refactor code
MozammilQ Jun 9, 2024
83e4b5e
Merge branch 'main' into mps-cutensor
doichanj Jun 10, 2024
c33571f
refactor code
MozammilQ Jun 11, 2024
abc5552
Merge branch 'main' into mps-cutensor
doichanj Jun 14, 2024
f0205e3
refactor code
MozammilQ Jun 14, 2024
629f65f
refactor code
MozammilQ Jun 15, 2024
644a822
added release note
MozammilQ Jun 16, 2024
e6f2288
refactor code
MozammilQ Jun 17, 2024
42f983e
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Jun 17, 2024
c24b9e2
refactor code
MozammilQ Jun 18, 2024
34e9502
refactor code
MozammilQ Jun 18, 2024
00f88e9
refactor code; included test
MozammilQ Jun 18, 2024
454f8c0
lint
MozammilQ Jun 18, 2024
985c7f2
added suggestion
MozammilQ Jun 18, 2024
7ffab7d
Merge branch 'main' into mps-cutensor
doichanj Jul 4, 2024
6b0b41d
Merge branch 'main' into mps-cutensor
MozammilQ Aug 30, 2024
34a5e75
fixed a typo
MozammilQ Aug 31, 2024
a1ae308
refactor code
MozammilQ Sep 10, 2024
859e946
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Oct 4, 2024
2ae116e
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Nov 6, 2024
ed9a907
refactor code
MozammilQ Nov 11, 2024
a4dbd12
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Nov 12, 2024
e1e80d1
added cublas for contract
MozammilQ Nov 28, 2024
40 changes: 40 additions & 0 deletions releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml
@@ -0,0 +1,40 @@
---
features:
- |
This release adds the ability to run matrix product state (MPS) simulation on
Nvidia GPUs. More precisely, the singular value decomposition (SVD) required by
the MPS method is offloaded to Nvidia GPUs with the help of cuQuantum
(cuTensorNet).

The MPS simulator is configured as usual; the only difference is that the
``device`` option can now be set to ``GPU``.

Example

.. code-block:: python

    from qiskit_aer import AerSimulator
    from qiskit.circuit import QuantumCircuit
    from qiskit.compiler import transpile

    num_qubits = 10
    shots = 5

    qc = QuantumCircuit(num_qubits)
    qc.h(0)

    for control, target in zip(range(num_qubits - 1), range(1, num_qubits)):
        qc.cx(control, target)

    qc.measure_all()

    sim = AerSimulator(method="matrix_product_state", device="GPU")
    qc_t = transpile(qc, backend=sim)
    job = sim.run(qc_t, shots=shots)

    counts = job.result().get_counts()
    print(counts)
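
As a quick sanity check (a minimal sketch, assuming the example above has
already been run; it is not required by the feature itself), the same
transpiled circuit can also be run with the default ``CPU`` device of the
``matrix_product_state`` method and the two count distributions compared:

.. code-block:: python

    # Assumes qc_t, shots, and counts from the example above.
    sim_cpu = AerSimulator(method="matrix_product_state", device="CPU")
    counts_cpu = sim_cpu.run(qc_t, shots=shots).result().get_counts()

    # Both devices sample the same GHZ state, so only the all-zeros and
    # all-ones bitstrings should appear in either dictionary.
    print(counts)
    print(counts_cpu)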





8 changes: 8 additions & 0 deletions src/simulators/matrix_product_state/matrix_product_state.hpp
@@ -360,6 +360,14 @@ void State::set_config(const Config &config) {

// Set LAPACK SVD
MPS::set_mps_lapack_svd(config.mps_lapack);

// Set device for SVD
MPS::set_mps_svd_device(config.device);

// Get CUDA device, if GPU offloading enabled
if (config.device.compare("GPU") == 0) {
MPS::set_cuda_device();
}
}

void State::add_metadata(ExperimentResult &result) const {
Original file line number Diff line number Diff line change
@@ -19,6 +19,7 @@
#include "stdlib.h"
#include "string.h"
#include <iostream>
#include <string>
#include <utility>

#include "framework/linalg/almost_equal.hpp"
@@ -45,6 +46,7 @@ double MPS::json_chop_threshold_ = 1E-8;
std::stringstream MPS::logging_str_;
bool MPS::mps_log_data_ = 0;
bool MPS::mps_lapack_ = false;
std::string MPS::mps_svd_device_;

//------------------------------------------------------------------------
// local function declarations
@@ -663,8 +665,9 @@ void MPS::common_apply_2_qubit_gate(

MPS_Tensor left_gamma, right_gamma;
rvector_t lambda;
double discarded_value = MPS_Tensor::Decompose(temp, left_gamma, lambda,
right_gamma, MPS::mps_lapack_);
double discarded_value =
MPS_Tensor::Decompose(temp, left_gamma, lambda, right_gamma,
MPS::mps_lapack_, MPS::mps_svd_device_);

if (discarded_value > json_chop_threshold_)
MPS::print_to_log("discarded_value=", discarded_value, ", ");
@@ -1803,7 +1806,15 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) {
// step 2 - SVD
S.clear();
S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns()));
csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_);

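// Use the cuQuantum (cuTensorNet) SVD when the configured device is GPU and
// the build includes CUDA support; otherwise fall back to the CPU wrappers.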
if (MPS::mps_svd_device_.compare("GPU") == 0) {
#ifdef AER_THRUST_CUDA
cutensor_csvd_wrapper(reshaped_matrix, U, S, V);
#endif // AER_THRUST_CUDA
} else {
csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_);
}

reduce_zeros(U, S, V, MPS_Tensor::get_max_bond_dimension(),
MPS_Tensor::get_truncation_threshold(), MPS::mps_lapack_);

Original file line number Diff line number Diff line change
@@ -15,12 +15,12 @@
#ifndef _aer_matrix_product_state_hpp_
#define _aer_matrix_product_state_hpp_

#include <cstdarg>

#include "framework/json.hpp"
#include "framework/operations.hpp"
#include "framework/utils.hpp"
#include "matrix_product_state_tensor.hpp"
#include <cstdarg>
#include <string>

namespace AER {
namespace MatrixProductState {
@@ -321,6 +321,20 @@ class MPS {
}

static void set_mps_lapack_svd(bool mps_lapack) { mps_lapack_ = mps_lapack; }
static void set_mps_svd_device(std::string mps_svd_device) {
mps_svd_device_ = mps_svd_device;
}

static void set_cuda_device() {
// the prop could be used to log the properties of the device.
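// Querying the device below also serves as an early check that a usable CUDA
// device is available; HANDLE_CUDA_ERROR throws if either call fails.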

#ifdef AER_THRUST_CUDA
cudaDeviceProp prop;
int deviceId{-1};
HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId));
HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId));
#endif // AER_THRUST_CUDA
}

static uint_t get_omp_threads() { return omp_threads_; }
static uint_t get_omp_threshold() { return omp_threshold_; }
@@ -570,6 +584,7 @@ class MPS {
static bool mps_log_data_;
static MPS_swap_direction mps_swap_direction_;
static bool mps_lapack_;
static std::string mps_svd_device_;
};

inline std::ostream &operator<<(std::ostream &out, const rvector_t &vec) {
Original file line number Diff line number Diff line change
@@ -159,7 +159,7 @@ class MPS_Tensor {
const MPS_Tensor &right_gamma, bool mul_by_lambda);
static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma,
rvector_t &lambda, MPS_Tensor &right_gamma,
bool mps_lapack);
bool mps_lapack, std::string mps_svd_device);
static void reshape_for_3_qubits_before_SVD(const std::vector<cmatrix_t> data,
MPS_Tensor &reshaped_tensor);
static void contract_2_dimensions(const MPS_Tensor &left_gamma,
@@ -592,13 +592,19 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma,
//---------------------------------------------------------------
double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma,
rvector_t &lambda, MPS_Tensor &right_gamma,
bool mps_lapack) {
bool mps_lapack, std::string mps_svd_device) {
cmatrix_t C;
C = reshape_before_SVD(temp.data_);
cmatrix_t U, V;
rvector_t S(std::min(C.GetRows(), C.GetColumns()));

csvd_wrapper(C, U, S, V, mps_lapack);
if (mps_svd_device.compare("GPU") == 0) {
#ifdef AER_THRUST_CUDA
cutensor_csvd_wrapper(C, U, S, V);
#endif // AER_THRUST_CUDA
} else {
csvd_wrapper(C, U, S, V, mps_lapack);
}
double discarded_value = 0.0;
discarded_value = reduce_zeros(U, S, V, max_bond_dimension_,
truncation_threshold_, mps_lapack);
164 changes: 164 additions & 0 deletions src/simulators/matrix_product_state/svd.cpp
@@ -28,6 +28,7 @@
#include <stdlib.h>

namespace AER {

// default values
constexpr auto mul_factor = 1e2;
constexpr long double tiny_factor = 1e30;
@@ -667,4 +668,167 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S,
}
}

#ifdef AER_THRUST_CUDA
void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S,
cmatrix_t &V) {

bool transposed = false;

const int64_t rows = A.GetRows(), cols = A.GetColumns();

if (rows < cols) {
transposed = true;
A = AER::Utils::dagger(A);
}
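// Keep a host copy of (the possibly daggered) A; it is only used below to
// validate the SVD result.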
cmatrix_t A_cpy = A;

const int64_t min_dim = std::min(rows, cols);
const int64_t lda = std::max(rows, cols);

U.resize(lda, min_dim);
V.resize(min_dim, min_dim);
S.resize(min_dim);

size_t sizeA = A.size() * sizeof(complex_t);
size_t sizeU = U.size() * sizeof(complex_t);
size_t sizeS = S.size() * sizeof(double);
size_t sizeV = V.size() * sizeof(complex_t);

complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(),
*cutensor_V = V.move_to_buffer();

cudaDataType_t typeData = CUDA_C_64F;

std::vector<int32_t> modesA{'m', 'n'};
std::vector<int32_t> modesU{'m', 'x'};
std::vector<int32_t> modesV{'x', 'n'};

cudaStream_t stream;
HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));

cutensornetHandle_t handle;
HANDLE_ERROR(cutensornetCreate(&handle));

double *cutensor_S = (double *)malloc(sizeS);

void *D_A;
void *D_U;
void *D_S;
void *D_V;

HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_A, sizeA));
HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_U, sizeU));
HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_S, sizeS));
HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_V, sizeV));

HANDLE_CUDA_ERROR(cudaMemcpy(D_A, cutensor_A, sizeA, cudaMemcpyHostToDevice));

cutensornetTensorDescriptor_t descTensorA;
cutensornetTensorDescriptor_t descTensorU;
cutensornetTensorDescriptor_t descTensorV;

const int32_t numModesA = modesA.size();
const int32_t numModesU = modesU.size();
const int32_t numModesV = modesV.size();

std::vector<int64_t> extentA{lda, min_dim}; // shape of A
std::vector<int64_t> extentU{lda, min_dim}; // shape of U
std::vector<int64_t> extentV{min_dim, min_dim}; // shape of V

const int64_t *strides = NULL; // the matrices store their entries in column-major order

HANDLE_ERROR(cutensornetCreateTensorDescriptor(
handle, numModesA, extentA.data(), strides, modesA.data(), typeData,
&descTensorA));
HANDLE_ERROR(cutensornetCreateTensorDescriptor(
handle, numModesU, extentU.data(), strides, modesU.data(), typeData,
&descTensorU));
HANDLE_ERROR(cutensornetCreateTensorDescriptor(
handle, numModesV, extentV.data(), strides, modesV.data(), typeData,
&descTensorV));

cutensornetWorkspaceDescriptor_t workDesc;
HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc));
HANDLE_ERROR(cutensornetWorkspaceComputeSVDSizes(
handle, descTensorA, descTensorU, descTensorV, NULL, workDesc));
int64_t hostWorkspaceSize, deviceWorkspaceSize;
// for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick
HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(
handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_SCRATCH,
&deviceWorkspaceSize));
HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(
handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
CUTENSORNET_MEMSPACE_HOST, CUTENSORNET_WORKSPACE_SCRATCH,
&hostWorkspaceSize));

void *devWork = nullptr, *hostWork = nullptr;
if (deviceWorkspaceSize > 0) {
HANDLE_CUDA_ERROR(cudaMalloc(&devWork, deviceWorkspaceSize));
}
if (hostWorkspaceSize > 0) {
hostWork = malloc(hostWorkspaceSize);
}
HANDLE_ERROR(cutensornetWorkspaceSetMemory(
handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
CUTENSORNET_WORKSPACE_SCRATCH, devWork, deviceWorkspaceSize));
HANDLE_ERROR(cutensornetWorkspaceSetMemory(
handle, workDesc, CUTENSORNET_MEMSPACE_HOST,
CUTENSORNET_WORKSPACE_SCRATCH, hostWork, hostWorkspaceSize));

// Request an exact SVD (no truncation settings are passed).
HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_A, descTensorU, D_U,
D_S, descTensorV, D_V, NULL, NULL, workDesc,
stream));

HANDLE_CUDA_ERROR(
cudaMemcpyAsync(cutensor_U, D_U, sizeU, cudaMemcpyDeviceToHost));
HANDLE_CUDA_ERROR(
cudaMemcpyAsync(cutensor_S, D_S, sizeS, cudaMemcpyDeviceToHost));
HANDLE_CUDA_ERROR(
cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost));

S.clear();
for (int64_t i = 0; i < min_dim; i++)
S.push_back(cutensor_S[i]);

A = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_A);
U = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_U);
V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V);

V = AER::Utils::dagger(V);
validate_SVD_result(A_cpy, U, S, V);
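// If A was daggered at the top (rows < cols), swap U and V so the returned
// factors correspond to the original, untransposed matrix.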
if (transposed) {
std::swap(U, V);
}

/***************
* Free resources
****************/

HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorA));
HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU));
HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV));
HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));
HANDLE_ERROR(cutensornetDestroy(handle));

if (cutensor_S)
free(cutensor_S);
if (D_A)
cudaFree(D_A);
if (D_U)
cudaFree(D_U);
if (D_S)
cudaFree(D_S);
if (D_V)
cudaFree(D_V);
if (devWork)
cudaFree(devWork);
if (hostWork)
free(hostWork);
}
#endif // AER_THRUST_CUDA

} // namespace AER
34 changes: 34 additions & 0 deletions src/simulators/matrix_product_state/svd.hpp
@@ -55,6 +55,40 @@ void validate_SVD_result(const cmatrix_t &A, const cmatrix_t &U,
void validate_SVdD_result(const cmatrix_t &A, const cmatrix_t &U,
const rvector_t &S, const cmatrix_t &V);

#ifdef AER_THRUST_CUDA

#include <cuda.h>
#include <cuda_runtime.h>
#include <cutensornet.h>
#include <vector>

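// Error-handling helpers: turn cuTensorNet and CUDA status codes into C++
// exceptions (std::runtime_error) so failures surface to the caller.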
#define HANDLE_ERROR(x) \
{ \
const auto err = x; \
if (err != CUTENSORNET_STATUS_SUCCESS) { \
std::stringstream str; \
str << "ERROR TensorNet::contractor : " \
<< cutensornetGetErrorString(err); \
throw std::runtime_error(str.str()); \
} \
};

#define HANDLE_CUDA_ERROR(x) \
{ \
const auto err = x; \
if (err != cudaSuccess) { \
std::stringstream str; \
str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err); \
throw std::runtime_error(str.str()); \
} \
};

// cutensor call
void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S,
cmatrix_t &V);

#endif // AER_THRUST_CUDA

//-------------------------------------------------------------------------
} // end namespace AER
//-------------------------------------------------------------------------