MPS with cuQuantum #2168

Open
wants to merge 34 commits into main from mps-cutensor
Changes from 24 commits
Commits (34)
563ae6e
initial layout
MozammilQ Jun 5, 2024
280f868
refactor code
MozammilQ Jun 5, 2024
ae44c69
refactor code
MozammilQ Jun 6, 2024
5b48265
refactor code
MozammilQ Jun 6, 2024
517a554
refactor code
MozammilQ Jun 6, 2024
7e40588
refactor code
MozammilQ Jun 6, 2024
ebf9ca0
refactor code
MozammilQ Jun 6, 2024
a422690
refactor code
MozammilQ Jun 6, 2024
80b59d5
refactor code
MozammilQ Jun 6, 2024
52f1ed4
refactor code
MozammilQ Jun 7, 2024
ed43e71
refactor code
MozammilQ Jun 9, 2024
649a5d7
refactor code
MozammilQ Jun 9, 2024
83e4b5e
Merge branch 'main' into mps-cutensor
doichanj Jun 10, 2024
c33571f
refactor code
MozammilQ Jun 11, 2024
abc5552
Merge branch 'main' into mps-cutensor
doichanj Jun 14, 2024
f0205e3
refactor code
MozammilQ Jun 14, 2024
629f65f
refactor code
MozammilQ Jun 15, 2024
644a822
added release note
MozammilQ Jun 16, 2024
e6f2288
refactor code
MozammilQ Jun 17, 2024
42f983e
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Jun 17, 2024
c24b9e2
refactor code
MozammilQ Jun 18, 2024
34e9502
refactor code
MozammilQ Jun 18, 2024
00f88e9
refactor code; included test
MozammilQ Jun 18, 2024
454f8c0
lint
MozammilQ Jun 18, 2024
985c7f2
added suggestion
MozammilQ Jun 18, 2024
7ffab7d
Merge branch 'main' into mps-cutensor
doichanj Jul 4, 2024
6b0b41d
Merge branch 'main' into mps-cutensor
MozammilQ Aug 30, 2024
34a5e75
fixed a typo
MozammilQ Aug 31, 2024
a1ae308
refactor code
MozammilQ Sep 10, 2024
859e946
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Oct 4, 2024
2ae116e
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Nov 6, 2024
ed9a907
refactor code
MozammilQ Nov 11, 2024
a4dbd12
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Nov 12, 2024
e1e80d1
added cublas for contract
MozammilQ Nov 28, 2024
40 changes: 40 additions & 0 deletions releasenotes/notes/mps-svd-with-cuquantum-c0392854d1f373e0.yaml
@@ -0,0 +1,40 @@
---
features:
- |
This release adds the ability to run matrix product state (MPS) simulation on
Nvidia GPUs. More precisely, the singular value decomposition (SVD) required by
the MPS method is offloaded to Nvidia GPUs with the help of cuQuantum
(cuTensorNet).

The MPS simulator is configured as usual; the only difference is that the
``device`` option can now be set to ``GPU``.

Example

.. code-block:: python

    from qiskit_aer import AerSimulator
    from qiskit.circuit import QuantumCircuit
    from qiskit.compiler import transpile

    num_qubits = 10
    shots = 5

    qc = QuantumCircuit(num_qubits)
    qc.h(0)

    for control, target in zip(range(num_qubits - 1), range(1, num_qubits)):
        qc.cx(control, target)

    qc.measure_all()

    sim = AerSimulator(method="matrix_product_state", device="GPU")
    qc_t = transpile(qc, backend=sim)
    job = sim.run(qc_t, shots=shots)

    counts = job.result().get_counts()
    print(counts)
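
As a quick sanity check (a minimal sketch, assuming the example above has
already been run; it is not required by the feature itself), the same
transpiled circuit can also be run with the default ``CPU`` device of the
``matrix_product_state`` method and the two count distributions compared:

.. code-block:: python

    # Assumes qc_t, shots, and counts from the example above.
    sim_cpu = AerSimulator(method="matrix_product_state", device="CPU")
    counts_cpu = sim_cpu.run(qc_t, shots=shots).result().get_counts()

    # Both devices sample the same GHZ state, so only the all-zeros and
    # all-ones bitstrings should appear in either dictionary.
    print(counts)
    print(counts_cpu)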





8 changes: 8 additions & 0 deletions src/simulators/matrix_product_state/matrix_product_state.hpp
@@ -360,6 +360,14 @@ void State::set_config(const Config &config) {

// Set LAPACK SVD
MPS::set_mps_lapack_svd(config.mps_lapack);

// Set device for SVD
MPS::set_mps_svd_device(config.device);

// Get CUDA device, if GPU offloading enabled
if (config.device.compare("GPU") == 0) {
MPS::set_cuda_device();
}
}

void State::add_metadata(ExperimentResult &result) const {
Original file line number Diff line number Diff line change
@@ -19,6 +19,7 @@
#include "stdlib.h"
#include "string.h"
#include <iostream>
#include <string>
#include <utility>

#include "framework/linalg/almost_equal.hpp"
@@ -45,6 +46,7 @@ double MPS::json_chop_threshold_ = 1E-8;
std::stringstream MPS::logging_str_;
bool MPS::mps_log_data_ = 0;
bool MPS::mps_lapack_ = false;
std::string MPS::mps_svd_device_;

//------------------------------------------------------------------------
// local function declarations
@@ -663,8 +665,9 @@ void MPS::common_apply_2_qubit_gate(

MPS_Tensor left_gamma, right_gamma;
rvector_t lambda;
double discarded_value = MPS_Tensor::Decompose(temp, left_gamma, lambda,
right_gamma, MPS::mps_lapack_);
double discarded_value =
MPS_Tensor::Decompose(temp, left_gamma, lambda, right_gamma,
MPS::mps_lapack_, MPS::mps_svd_device_);

if (discarded_value > json_chop_threshold_)
MPS::print_to_log("discarded_value=", discarded_value, ", ");
@@ -1803,7 +1806,15 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) {
// step 2 - SVD
S.clear();
S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns()));
csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_);

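// Use the cuQuantum (cuTensorNet) SVD when the configured device is GPU and
// the build includes CUDA support; otherwise fall back to the CPU wrappers.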
if (MPS::mps_svd_device_.compare("GPU") == 0) {
#ifdef AER_THRUST_CUDA
cutensor_csvd_wrapper(reshaped_matrix, U, S, V);
#endif // AER_THRUST_CUDA
} else {
csvd_wrapper(reshaped_matrix, U, S, V, MPS::mps_lapack_);
}

reduce_zeros(U, S, V, MPS_Tensor::get_max_bond_dimension(),
MPS_Tensor::get_truncation_threshold(), MPS::mps_lapack_);

Original file line number Diff line number Diff line change
@@ -15,12 +15,12 @@
#ifndef _aer_matrix_product_state_hpp_
#define _aer_matrix_product_state_hpp_

#include <cstdarg>

#include "framework/json.hpp"
#include "framework/operations.hpp"
#include "framework/utils.hpp"
#include "matrix_product_state_tensor.hpp"
#include <cstdarg>
#include <string>

namespace AER {
namespace MatrixProductState {
@@ -321,6 +321,20 @@ class MPS {
}

static void set_mps_lapack_svd(bool mps_lapack) { mps_lapack_ = mps_lapack; }
static void set_mps_svd_device(std::string mps_svd_device) {
mps_svd_device_ = mps_svd_device;
}

static void set_cuda_device() {
// the prop could be used to log the properties of the device.
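// Querying the device below also serves as an early check that a usable CUDA
// device is available; HANDLE_CUDA_ERROR throws if either call fails.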

#ifdef AER_THRUST_CUDA
cudaDeviceProp prop;
int deviceId{-1};
HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId));
HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId));
#endif // AER_THRUST_CUDA
}

static uint_t get_omp_threads() { return omp_threads_; }
static uint_t get_omp_threshold() { return omp_threshold_; }
@@ -570,6 +584,7 @@ class MPS {
static bool mps_log_data_;
static MPS_swap_direction mps_swap_direction_;
static bool mps_lapack_;
static std::string mps_svd_device_;
};

inline std::ostream &operator<<(std::ostream &out, const rvector_t &vec) {
Original file line number Diff line number Diff line change
@@ -159,7 +159,7 @@ class MPS_Tensor {
const MPS_Tensor &right_gamma, bool mul_by_lambda);
static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma,
rvector_t &lambda, MPS_Tensor &right_gamma,
bool mps_lapack);
bool mps_lapack, std::string mps_svd_device);
static void reshape_for_3_qubits_before_SVD(const std::vector<cmatrix_t> data,
MPS_Tensor &reshaped_tensor);
static void contract_2_dimensions(const MPS_Tensor &left_gamma,
@@ -592,13 +592,19 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma,
//---------------------------------------------------------------
double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma,
rvector_t &lambda, MPS_Tensor &right_gamma,
bool mps_lapack) {
bool mps_lapack, std::string mps_svd_device) {
cmatrix_t C;
C = reshape_before_SVD(temp.data_);
cmatrix_t U, V;
rvector_t S(std::min(C.GetRows(), C.GetColumns()));

csvd_wrapper(C, U, S, V, mps_lapack);
if (mps_svd_device.compare("GPU") == 0) {
#ifdef AER_THRUST_CUDA
cutensor_csvd_wrapper(C, U, S, V);
#endif // AER_THRUST_CUDA
} else {
csvd_wrapper(C, U, S, V, mps_lapack);
}
double discarded_value = 0.0;
discarded_value = reduce_zeros(U, S, V, max_bond_dimension_,
truncation_threshold_, mps_lapack);
164 changes: 164 additions & 0 deletions src/simulators/matrix_product_state/svd.cpp
@@ -28,6 +28,7 @@
#include <stdlib.h>

namespace AER {

// default values
constexpr auto mul_factor = 1e2;
constexpr long double tiny_factor = 1e30;
@@ -667,4 +668,167 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S,
}
}

#ifdef AER_THRUST_CUDA
void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S,
cmatrix_t &V) {

bool transposed = false;

const int64_t rows = A.GetRows(), cols = A.GetColumns();

if (rows < cols) {
transposed = true;
A = AER::Utils::dagger(A);
}
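// Keep a host copy of (the possibly daggered) A; it is only used below to
// validate the SVD result.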
cmatrix_t A_cpy = A;

const int64_t min_dim = std::min(rows, cols);
const int64_t lda = std::max(rows, cols);

U.resize(lda, min_dim);
V.resize(min_dim, min_dim);
S.resize(min_dim);

size_t sizeA = A.size() * sizeof(complex_t);
size_t sizeU = U.size() * sizeof(complex_t);
size_t sizeS = S.size() * sizeof(double);
size_t sizeV = V.size() * sizeof(complex_t);

complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(),
*cutensor_V = V.move_to_buffer();

cudaDataType_t typeData = CUDA_C_64F;

std::vector<int32_t> modesA{'m', 'n'};
std::vector<int32_t> modesU{'m', 'x'};
std::vector<int32_t> modesV{'x', 'n'};

cudaStream_t stream;
HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));

cutensornetHandle_t handle;
HANDLE_ERROR(cutensornetCreate(&handle));

double *cutensor_S = (double *)malloc(sizeS);

void *D_A;
void *D_U;
void *D_S;
void *D_V;

HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_A, sizeA));
HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_U, sizeU));
HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_S, sizeS));
HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_V, sizeV));

HANDLE_CUDA_ERROR(cudaMemcpy(D_A, cutensor_A, sizeA, cudaMemcpyHostToDevice));

cutensornetTensorDescriptor_t descTensorA;
cutensornetTensorDescriptor_t descTensorU;
cutensornetTensorDescriptor_t descTensorV;

const int32_t numModesA = modesA.size();
const int32_t numModesU = modesU.size();
const int32_t numModesV = modesV.size();

std::vector<int64_t> extentA{lda, min_dim}; // shape of A
std::vector<int64_t> extentU{lda, min_dim}; // shape of U
std::vector<int64_t> extentV{min_dim, min_dim}; // shape of V

const int64_t *strides = NULL; // the matrices store their entries in column-major order

HANDLE_ERROR(cutensornetCreateTensorDescriptor(
handle, numModesA, extentA.data(), strides, modesA.data(), typeData,
&descTensorA));
HANDLE_ERROR(cutensornetCreateTensorDescriptor(
handle, numModesU, extentU.data(), strides, modesU.data(), typeData,
&descTensorU));
HANDLE_ERROR(cutensornetCreateTensorDescriptor(
handle, numModesV, extentV.data(), strides, modesV.data(), typeData,
&descTensorV));

cutensornetWorkspaceDescriptor_t workDesc;
HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc));
HANDLE_ERROR(cutensornetWorkspaceComputeSVDSizes(
handle, descTensorA, descTensorU, descTensorV, NULL, workDesc));
int64_t hostWorkspaceSize, deviceWorkspaceSize;
// for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick
HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(
handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_SCRATCH,
&deviceWorkspaceSize));
HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(
handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
CUTENSORNET_MEMSPACE_HOST, CUTENSORNET_WORKSPACE_SCRATCH,
&hostWorkspaceSize));

void *devWork = nullptr, *hostWork = nullptr;
if (deviceWorkspaceSize > 0) {
HANDLE_CUDA_ERROR(cudaMalloc(&devWork, deviceWorkspaceSize));
}
if (hostWorkspaceSize > 0) {
hostWork = malloc(hostWorkspaceSize);
}
HANDLE_ERROR(cutensornetWorkspaceSetMemory(
handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
CUTENSORNET_WORKSPACE_SCRATCH, devWork, deviceWorkspaceSize));
HANDLE_ERROR(cutensornetWorkspaceSetMemory(
handle, workDesc, CUTENSORNET_MEMSPACE_HOST,
CUTENSORNET_WORKSPACE_SCRATCH, hostWork, hostWorkspaceSize));

// Request an exact SVD (no truncation settings are passed).
HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_A, descTensorU, D_U,
D_S, descTensorV, D_V, NULL, NULL, workDesc,
stream));

HANDLE_CUDA_ERROR(
cudaMemcpyAsync(cutensor_U, D_U, sizeU, cudaMemcpyDeviceToHost));
HANDLE_CUDA_ERROR(
cudaMemcpyAsync(cutensor_S, D_S, sizeS, cudaMemcpyDeviceToHost));
HANDLE_CUDA_ERROR(
cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost));

S.clear();
for (int64_t i = 0; i < min_dim; i++)
S.push_back(cutensor_S[i]);

A = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_A);
U = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_U);
V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V);

V = AER::Utils::dagger(V);
validate_SVD_result(A_cpy, U, S, V);
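// If A was daggered at the top (rows < cols), swap U and V so the returned
// factors correspond to the original, untransposed matrix.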
if (transposed) {
std::swap(U, V);
}

/***************
* Free resources
****************/

HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorA));
HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU));
HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV));
HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));
HANDLE_ERROR(cutensornetDestroy(handle));

if (cutensor_S)
free(cutensor_S);
if (D_A)
cudaFree(D_A);
if (D_U)
cudaFree(D_U);
if (D_S)
cudaFree(D_S);
if (D_V)
cudaFree(D_V);
if (devWork)
cudaFree(devWork);
if (hostWork)
free(hostWork);
}
#endif // AER_THRUST_CUDA

} // namespace AER
34 changes: 34 additions & 0 deletions src/simulators/matrix_product_state/svd.hpp
@@ -55,6 +55,40 @@ void validate_SVD_result(const cmatrix_t &A, const cmatrix_t &U,
void validate_SVdD_result(const cmatrix_t &A, const cmatrix_t &U,
const rvector_t &S, const cmatrix_t &V);

#ifdef AER_THRUST_CUDA

#include <cuda.h>
#include <cuda_runtime.h>
#include <cutensornet.h>
#include <vector>

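// Error-handling helpers: turn cuTensorNet and CUDA status codes into C++
// exceptions (std::runtime_error) so failures surface to the caller.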
#define HANDLE_ERROR(x) \
{ \
const auto err = x; \
if (err != CUTENSORNET_STATUS_SUCCESS) { \
std::stringstream str; \
str << "ERROR TensorNet::contractor : " \
<< cutensornetGetErrorString(err); \
throw std::runtime_error(str.str()); \
} \
};

#define HANDLE_CUDA_ERROR(x) \
{ \
const auto err = x; \
if (err != cudaSuccess) { \
std::stringstream str; \
str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err); \
throw std::runtime_error(str.str()); \
} \
};

// cutensor call
void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S,
cmatrix_t &V);

#endif // AER_THRUST_CUDA

//-------------------------------------------------------------------------
} // end namespace AER
//-------------------------------------------------------------------------