-
-
Save mrkwjc/ebb22e8b592122cc8be6 to your computer and use it in GitHub Desktop.
| # ### Interface cuSOLVER PyCUDA | |
| from __future__ import print_function | |
| import pycuda.gpuarray as gpuarray | |
| import pycuda.driver as cuda | |
| import pycuda.autoinit | |
| import numpy as np | |
| import scipy.sparse as sp | |
| import ctypes | |
| ## Wrap the cuSOLVER cusolverSpDcsrlsvqr() using ctypes | |
| ## http://docs.nvidia.com/cuda/cusolver/#cusolver-lt-t-gt-csrlsvqr | |
| # cuSparse | |
| _libcusparse = ctypes.cdll.LoadLibrary('libcusparse.so') | |
| _libcusparse.cusparseCreate.restype = int | |
| _libcusparse.cusparseCreate.argtypes = [ctypes.c_void_p] | |
| _libcusparse.cusparseDestroy.restype = int | |
| _libcusparse.cusparseDestroy.argtypes = [ctypes.c_void_p] | |
| _libcusparse.cusparseCreateMatDescr.restype = int | |
| _libcusparse.cusparseCreateMatDescr.argtypes = [ctypes.c_void_p] | |
| # cuSOLVER | |
| _libcusolver = ctypes.cdll.LoadLibrary('libcusolver.so') | |
| _libcusolver.cusolverSpCreate.restype = int | |
| _libcusolver.cusolverSpCreate.argtypes = [ctypes.c_void_p] | |
| _libcusolver.cusolverSpDestroy.restype = int | |
| _libcusolver.cusolverSpDestroy.argtypes = [ctypes.c_void_p] | |
| _libcusolver.cusolverSpDcsrlsvqr.restype = int | |
| _libcusolver.cusolverSpDcsrlsvqr.argtypes= [ctypes.c_void_p, | |
| ctypes.c_int, | |
| ctypes.c_int, | |
| ctypes.c_void_p, | |
| ctypes.c_void_p, | |
| ctypes.c_void_p, | |
| ctypes.c_void_p, | |
| ctypes.c_void_p, | |
| ctypes.c_double, | |
| ctypes.c_int, | |
| ctypes.c_void_p, | |
| ctypes.c_void_p] | |
| def cuspsolve(A, b): | |
| Acsr = sp.csr_matrix(A, dtype=float) | |
| b = np.asarray(b, dtype=float) | |
| x = np.empty_like(b) | |
| # Copy arrays to GPU | |
| dcsrVal = gpuarray.to_gpu(Acsr.data) | |
| dcsrColInd = gpuarray.to_gpu(Acsr.indices) | |
| dcsrIndPtr = gpuarray.to_gpu(Acsr.indptr) | |
| dx = gpuarray.to_gpu(x) | |
| db = gpuarray.to_gpu(b) | |
| # Create solver parameters | |
| m = ctypes.c_int(Acsr.shape[0]) # Need check if A is square | |
| nnz = ctypes.c_int(Acsr.nnz) | |
| descrA = ctypes.c_void_p() | |
| reorder = ctypes.c_int(0) | |
| tol = ctypes.c_double(1e-10) | |
| singularity = ctypes.c_int(0) # -1 if A not singular | |
| # create cusparse handle | |
| _cusp_handle = ctypes.c_void_p() | |
| status = _libcusparse.cusparseCreate(ctypes.byref(_cusp_handle)) | |
| assert(status == 0) | |
| cusp_handle = _cusp_handle.value | |
| # create MatDescriptor | |
| status = _libcusparse.cusparseCreateMatDescr(ctypes.byref(descrA)) | |
| assert(status == 0) | |
| #create cusolver handle | |
| _cuso_handle = ctypes.c_void_p() | |
| status = _libcusolver.cusolverSpCreate(ctypes.byref(_cuso_handle)) | |
| assert(status == 0) | |
| cuso_handle = _cuso_handle.value | |
| # Solve | |
| res=_libcusolver.cusolverSpDcsrlsvqr(cuso_handle, | |
| m, | |
| nnz, | |
| descrA, | |
| int(dcsrVal.gpudata), | |
| int(dcsrIndPtr.gpudata), | |
| int(dcsrColInd.gpudata), | |
| int(db.gpudata), | |
| tol, | |
| reorder, | |
| int(dx.gpudata), | |
| ctypes.byref(singularity)) | |
| assert(res == 0) | |
| if singularity.value != -1: | |
| raise ValueError('Singular matrix!') | |
| x = dx.get() # Get result as numpy array | |
| # Destroy handles | |
| status = _libcusolver.cusolverSpDestroy(cuso_handle) | |
| assert(status == 0) | |
| status = _libcusparse.cusparseDestroy(cusp_handle) | |
| assert(status == 0) | |
| # Return result | |
| return x | |
| # Test | |
| if __name__ == '__main__': | |
| A = np.diag(np.arange(1, 5)) | |
| b = np.ones(4) | |
| x = cuspsolve(A, b) | |
| np.testing.assert_almost_equal(x, np.array([1. , 0.5, 0.33333333, 0.25])) | |
| # Timing comparison | |
| from scipy.sparse import rand | |
| from scipy.sparse.linalg import spsolve | |
| from scipy.sparse import coo_matrix | |
| import time | |
| n = 10000 | |
| i = j = np.arange(n) | |
| diag = np.ones(n) | |
| A = rand(n, n, density=0.001) | |
| A = A.tocsr() | |
| A[i, j] = diag | |
| b = np.ones(n) | |
| t0 = time.time() | |
| x = spsolve(A, b) | |
| dt1 = time.time() - t0 | |
| print("scipy.sparse.linalg.spsolve time: %s" %dt1) | |
| t0 = time.time() | |
| x = cuspsolve(A, b) | |
| dt2 = time.time() - t0 | |
| print("cuspsolve time: %s" %dt2) | |
| ratio = dt1/dt2 | |
| if ratio > 1: | |
| print("CUDA is %s times faster than CPU." %ratio) | |
| else: | |
| print("CUDA is %s times slower than CPU." %(1./ratio)) |
Well, i tested things and this definitely NOT the data copying issue. CUDA is simply slower!
To see this in the even more spectacular way i higly reccomend to install scikit-umfpack (using pip). The default superlu solver used in spsolve from scipy works using one core only, whereas umfpack boosts solution using all your CPUs.
Well, i tested things and this definitely NOT the data copying issue. CUDA is simply slower!
To see this in the even more spectacular way i higly reccomend to install
scikit-umfpack(using pip). The default superlu solver used inspsolvefrom scipy works using one core only, whereas umfpack boosts solution using all your CPUs.
ರ_ರ 心塞,you do need a test to a matrix with rank more than 4000.I really can't understand it is necessary to use cuda to solve the matrix with so small rank, e...m,just 4.😅
This is probably because of transferring data to and from GPU. Maybe only _libcusolver.cusolverSpDcsrlsvqr call should be wrapped with time measures. This would make sense i we are keeping data on GPU...