Singular values calculation only with CUDA

I'm trying to use the new cusolverDnSgesvd routine of CUDA 7.0 for the calculation of the singular values. The full code is reported below:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include<iostream>
#include<stdlib.h>
#include<stdio.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>

/***********************/
/* CUDA ERROR CHECKING */
/***********************/
// --- Reports a failed CUDA runtime call on stderr and optionally terminates the program.
// ---   code  : status returned by a CUDA runtime API call
// ---   file  : source file name printed in the diagnostic
// ---   line  : source line number printed in the diagnostic
// ---   abort : when true (default), exit using `code` as the process exit status
// --- Does nothing when code == cudaSuccess.
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}
// --- Convenience wrapper: forwards the status of a CUDA runtime call to gpuAssert.
// --- NOTE: this is a function, not a macro, so __FILE__ and __LINE__ expand here and the
// --- diagnostic reports the location of this wrapper rather than of the failing call.
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }

/********/
/* MAIN */
/********/
// --- Builds a small M x N test matrix, asks cusolverDnSgesvd for its singular values
// --- (jobu = jobvt = 'N') and prints the cuSolver status, devInfo and, on success, the
// --- singular values. Returns 0 on normal completion, EXIT_FAILURE if setup fails.
int main()
{
    // --- A is M x N, stored column-major with leading dimension M
    const int M = 10;
    const int N = 10;

    // --- Setting the host matrix
    float *h_A = (float *)malloc(M * N * sizeof(float));
    float *h_S = (float *)malloc(N * sizeof(float));
    if (h_A == NULL || h_S == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A);
        free(h_S);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            h_A[j*M + i] = (float)((i + j) * (i + j));
        }
    }

    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
    float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolverStatus_t stat = cusolverDnCreate(&solver_handle);
    if (stat != CUSOLVER_STATUS_SUCCESS)
    {
        std::cout << "Creation of the cuSolver handle failed.\n";
        return EXIT_FAILURE;
    }

    stat = cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size);
    if (stat != CUSOLVER_STATUS_SUCCESS)
    {
        // --- Without a valid workspace size the SVD call below cannot succeed
        std::cout << "Initialization of cuSolver failed.\n";
        cusolverDnDestroy(solver_handle);
        return EXIT_FAILURE;
    }

    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
    //float *rwork; gpuErrchk(cudaMalloc(&rwork, work_size * sizeof(float)));

    // --- CUDA SVD execution
    //stat = cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    stat = cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    gpuErrchk(cudaDeviceSynchronize());

    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    std::cout << "devInfo = " << devInfo_h << "\n";

    switch (stat)
    {
        case CUSOLVER_STATUS_SUCCESS:         std::cout << "SVD computation success\n"; break;
        case CUSOLVER_STATUS_NOT_INITIALIZED: std::cout << "Library cuSolver not initialized correctly\n"; break;
        case CUSOLVER_STATUS_INVALID_VALUE:   std::cout << "Invalid parameters passed\n"; break;
        case CUSOLVER_STATUS_INTERNAL_ERROR:  std::cout << "Internal operation failed\n"; break;
        default:                              std::cout << "cuSolver returned status " << (int)stat << "\n"; break;
    }

    if (devInfo_h == 0 && stat == CUSOLVER_STATUS_SUCCESS)
    {
        std::cout << "SVD successful\n\n";

        // --- Moving the results from device to host (d_S is only meaningful on success)
        gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));

        for (int i = 0; i < N; i++) std::cout << "d_S["<<i<<"] = " << h_S[i] << std::endl;
    }

    // --- Releasing library, device and host resources
    cusolverDnDestroy(solver_handle);
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_A);

    return 0;
}
If I ask for the computation of the full SVD (commented line with jobu = 'A' and jobvt = 'A') everything works fine. If I ask for the computation of the singular values only (line with jobu = 'N' and jobvt = 'N'), cusolverDnSgesvd returns

CUSOLVER_STATUS_INVALID_VALUE

Please note that, in this case devInfo = 0, so I cannot spot the invalid parameter.

Please also note that the documentation PDF lacks information about the rwork parameter so that I have dealt with it as a dummy parameter.

edited Jan 20 '17 at 11:46

JackOLantern

14.7k355111

asked Jan 23 '15 at 10:13

WestWizard

317

add a comment |

I'm trying to use the new cusolverDnSgesvd routine of CUDA 7.0 for the calculation of the singular values. The full code is reported below:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include<iostream>
#include<stdlib.h>
#include<stdio.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>

/***********************/
/* CUDA ERROR CHECKING */
/***********************/
// --- Reports a failed CUDA runtime call on stderr and optionally terminates the program.
// ---   code  : status returned by a CUDA runtime API call
// ---   file  : source file name printed in the diagnostic
// ---   line  : source line number printed in the diagnostic
// ---   abort : when true (default), exit using `code` as the process exit status
// --- Does nothing when code == cudaSuccess.
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}
// --- Convenience wrapper: forwards the status of a CUDA runtime call to gpuAssert.
// --- NOTE: this is a function, not a macro, so __FILE__ and __LINE__ expand here and the
// --- diagnostic reports the location of this wrapper rather than of the failing call.
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }

/********/
/* MAIN */
/********/
// --- Builds a small M x N test matrix, asks cusolverDnSgesvd for its singular values
// --- (jobu = jobvt = 'N') and prints the cuSolver status, devInfo and, on success, the
// --- singular values. Returns 0 on normal completion, EXIT_FAILURE if setup fails.
int main()
{
    // --- A is M x N, stored column-major with leading dimension M
    const int M = 10;
    const int N = 10;

    // --- Setting the host matrix
    float *h_A = (float *)malloc(M * N * sizeof(float));
    float *h_S = (float *)malloc(N * sizeof(float));
    if (h_A == NULL || h_S == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A);
        free(h_S);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            h_A[j*M + i] = (float)((i + j) * (i + j));
        }
    }

    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
    float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolverStatus_t stat = cusolverDnCreate(&solver_handle);
    if (stat != CUSOLVER_STATUS_SUCCESS)
    {
        std::cout << "Creation of the cuSolver handle failed.\n";
        return EXIT_FAILURE;
    }

    stat = cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size);
    if (stat != CUSOLVER_STATUS_SUCCESS)
    {
        // --- Without a valid workspace size the SVD call below cannot succeed
        std::cout << "Initialization of cuSolver failed.\n";
        cusolverDnDestroy(solver_handle);
        return EXIT_FAILURE;
    }

    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
    //float *rwork; gpuErrchk(cudaMalloc(&rwork, work_size * sizeof(float)));

    // --- CUDA SVD execution
    //stat = cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    stat = cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    gpuErrchk(cudaDeviceSynchronize());

    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    std::cout << "devInfo = " << devInfo_h << "\n";

    switch (stat)
    {
        case CUSOLVER_STATUS_SUCCESS:         std::cout << "SVD computation success\n"; break;
        case CUSOLVER_STATUS_NOT_INITIALIZED: std::cout << "Library cuSolver not initialized correctly\n"; break;
        case CUSOLVER_STATUS_INVALID_VALUE:   std::cout << "Invalid parameters passed\n"; break;
        case CUSOLVER_STATUS_INTERNAL_ERROR:  std::cout << "Internal operation failed\n"; break;
        default:                              std::cout << "cuSolver returned status " << (int)stat << "\n"; break;
    }

    if (devInfo_h == 0 && stat == CUSOLVER_STATUS_SUCCESS)
    {
        std::cout << "SVD successful\n\n";

        // --- Moving the results from device to host (d_S is only meaningful on success)
        gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));

        for (int i = 0; i < N; i++) std::cout << "d_S["<<i<<"] = " << h_S[i] << std::endl;
    }

    // --- Releasing library, device and host resources
    cusolverDnDestroy(solver_handle);
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_A);

    return 0;
}
CUSOLVER_STATUS_INVALID_VALUE

Please note that, in this case devInfo = 0, so I cannot spot the invalid parameter.

Please also note that the documentation PDF lacks information about the rwork parameter so that I have dealt with it as a dummy parameter.

edited Jan 20 '17 at 11:46

JackOLantern

14.7k355111

asked Jan 23 '15 at 10:13

WestWizard

317

add a comment |

I'm trying to use the new cusolverDnSgesvd routine of CUDA 7.0 for the calculation of the singular values. The full code is reported below:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include<iostream>
#include<stdlib.h>
#include<stdio.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>

/***********************/
/* CUDA ERROR CHECKING */
/***********************/
// --- Reports a failed CUDA runtime call on stderr and optionally terminates the program.
// ---   code  : status returned by a CUDA runtime API call
// ---   file  : source file name printed in the diagnostic
// ---   line  : source line number printed in the diagnostic
// ---   abort : when true (default), exit using `code` as the process exit status
// --- Does nothing when code == cudaSuccess.
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}
// --- Convenience wrapper: forwards the status of a CUDA runtime call to gpuAssert.
// --- NOTE: this is a function, not a macro, so __FILE__ and __LINE__ expand here and the
// --- diagnostic reports the location of this wrapper rather than of the failing call.
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }

/********/
/* MAIN */
/********/
// --- Builds a small M x N test matrix, asks cusolverDnSgesvd for its singular values
// --- (jobu = jobvt = 'N') and prints the cuSolver status, devInfo and, on success, the
// --- singular values. Returns 0 on normal completion, EXIT_FAILURE if setup fails.
int main()
{
    // --- A is M x N, stored column-major with leading dimension M
    const int M = 10;
    const int N = 10;

    // --- Setting the host matrix
    float *h_A = (float *)malloc(M * N * sizeof(float));
    float *h_S = (float *)malloc(N * sizeof(float));
    if (h_A == NULL || h_S == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A);
        free(h_S);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            h_A[j*M + i] = (float)((i + j) * (i + j));
        }
    }

    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
    float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolverStatus_t stat = cusolverDnCreate(&solver_handle);
    if (stat != CUSOLVER_STATUS_SUCCESS)
    {
        std::cout << "Creation of the cuSolver handle failed.\n";
        return EXIT_FAILURE;
    }

    stat = cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size);
    if (stat != CUSOLVER_STATUS_SUCCESS)
    {
        // --- Without a valid workspace size the SVD call below cannot succeed
        std::cout << "Initialization of cuSolver failed.\n";
        cusolverDnDestroy(solver_handle);
        return EXIT_FAILURE;
    }

    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
    //float *rwork; gpuErrchk(cudaMalloc(&rwork, work_size * sizeof(float)));

    // --- CUDA SVD execution
    //stat = cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    stat = cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    gpuErrchk(cudaDeviceSynchronize());

    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    std::cout << "devInfo = " << devInfo_h << "\n";

    switch (stat)
    {
        case CUSOLVER_STATUS_SUCCESS:         std::cout << "SVD computation success\n"; break;
        case CUSOLVER_STATUS_NOT_INITIALIZED: std::cout << "Library cuSolver not initialized correctly\n"; break;
        case CUSOLVER_STATUS_INVALID_VALUE:   std::cout << "Invalid parameters passed\n"; break;
        case CUSOLVER_STATUS_INTERNAL_ERROR:  std::cout << "Internal operation failed\n"; break;
        default:                              std::cout << "cuSolver returned status " << (int)stat << "\n"; break;
    }

    if (devInfo_h == 0 && stat == CUSOLVER_STATUS_SUCCESS)
    {
        std::cout << "SVD successful\n\n";

        // --- Moving the results from device to host (d_S is only meaningful on success)
        gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));

        for (int i = 0; i < N; i++) std::cout << "d_S["<<i<<"] = " << h_S[i] << std::endl;
    }

    // --- Releasing library, device and host resources
    cusolverDnDestroy(solver_handle);
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_A);

    return 0;
}
CUSOLVER_STATUS_INVALID_VALUE

Please note that, in this case devInfo = 0, so I cannot spot the invalid parameter.

Please also note that the documentation PDF lacks information about the rwork parameter so that I have dealt with it as a dummy parameter.

edited Jan 20 '17 at 11:46

JackOLantern

14.7k355111

asked Jan 23 '15 at 10:13

WestWizard

317

I'm trying to use the new cusolverDnSgesvd routine of CUDA 7.0 for the calculation of the singular values. The full code is reported below:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include<iostream>
#include<stdlib.h>
#include<stdio.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>

/***********************/
/* CUDA ERROR CHECKING */
/***********************/
// --- Reports a failed CUDA runtime call on stderr and optionally terminates the program.
// ---   code  : status returned by a CUDA runtime API call
// ---   file  : source file name printed in the diagnostic
// ---   line  : source line number printed in the diagnostic
// ---   abort : when true (default), exit using `code` as the process exit status
// --- Does nothing when code == cudaSuccess.
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}
// --- Convenience wrapper: forwards the status of a CUDA runtime call to gpuAssert.
// --- NOTE: this is a function, not a macro, so __FILE__ and __LINE__ expand here and the
// --- diagnostic reports the location of this wrapper rather than of the failing call.
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }

/********/
/* MAIN */
/********/
// --- Builds a small M x N test matrix, asks cusolverDnSgesvd for its singular values
// --- (jobu = jobvt = 'N') and prints the cuSolver status, devInfo and, on success, the
// --- singular values. Returns 0 on normal completion, EXIT_FAILURE if setup fails.
int main()
{
    // --- A is M x N, stored column-major with leading dimension M
    const int M = 10;
    const int N = 10;

    // --- Setting the host matrix
    float *h_A = (float *)malloc(M * N * sizeof(float));
    float *h_S = (float *)malloc(N * sizeof(float));
    if (h_A == NULL || h_S == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A);
        free(h_S);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            h_A[j*M + i] = (float)((i + j) * (i + j));
        }
    }

    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
    float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolverStatus_t stat = cusolverDnCreate(&solver_handle);
    if (stat != CUSOLVER_STATUS_SUCCESS)
    {
        std::cout << "Creation of the cuSolver handle failed.\n";
        return EXIT_FAILURE;
    }

    stat = cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size);
    if (stat != CUSOLVER_STATUS_SUCCESS)
    {
        // --- Without a valid workspace size the SVD call below cannot succeed
        std::cout << "Initialization of cuSolver failed.\n";
        cusolverDnDestroy(solver_handle);
        return EXIT_FAILURE;
    }

    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
    //float *rwork; gpuErrchk(cudaMalloc(&rwork, work_size * sizeof(float)));

    // --- CUDA SVD execution
    //stat = cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    stat = cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    gpuErrchk(cudaDeviceSynchronize());

    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    std::cout << "devInfo = " << devInfo_h << "\n";

    switch (stat)
    {
        case CUSOLVER_STATUS_SUCCESS:         std::cout << "SVD computation success\n"; break;
        case CUSOLVER_STATUS_NOT_INITIALIZED: std::cout << "Library cuSolver not initialized correctly\n"; break;
        case CUSOLVER_STATUS_INVALID_VALUE:   std::cout << "Invalid parameters passed\n"; break;
        case CUSOLVER_STATUS_INTERNAL_ERROR:  std::cout << "Internal operation failed\n"; break;
        default:                              std::cout << "cuSolver returned status " << (int)stat << "\n"; break;
    }

    if (devInfo_h == 0 && stat == CUSOLVER_STATUS_SUCCESS)
    {
        std::cout << "SVD successful\n\n";

        // --- Moving the results from device to host (d_S is only meaningful on success)
        gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));

        for (int i = 0; i < N; i++) std::cout << "d_S["<<i<<"] = " << h_S[i] << std::endl;
    }

    // --- Releasing library, device and host resources
    cusolverDnDestroy(solver_handle);
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_A);

    return 0;
}
CUSOLVER_STATUS_INVALID_VALUE

Please note that, in this case devInfo = 0, so I cannot spot the invalid parameter.

Please also note that the documentation PDF lacks information about the rwork parameter so that I have dealt with it as a dummy parameter.

cuda svd cusolver

edited Jan 20 '17 at 11:46

JackOLantern

14.7k355111

asked Jan 23 '15 at 10:13

WestWizard

317

edited Jan 20 '17 at 11:46

JackOLantern

14.7k355111

asked Jan 23 '15 at 10:13

WestWizard

317

edited Jan 20 '17 at 11:46

JackOLantern

14.7k355111

edited Jan 20 '17 at 11:46

JackOLantern

14.7k355111

edited Jan 20 '17 at 11:46

JackOLantern

14.7k355111

asked Jan 23 '15 at 10:13

WestWizard

317

asked Jan 23 '15 at 10:13

WestWizard

317

asked Jan 23 '15 at 10:13

WestWizard

317

add a comment |

2 Answers
2

active

oldest

votes

At this time the cuSolver gesvd function only supports jobu = 'A' and jobvt = 'A'

So the error when you specify other combinations is expected. From the documentation:

Remark 2: gesvd only supports jobu='A' and jobvt='A' and returns matrix U and VH

answered Mar 22 '15 at 15:26

Robert Crovella

96.9k5110152

1

As of CUDA 8.0, gesvd supports jobu/jobvt = A, S, O, or N.

– lebedov
Nov 13 '16 at 22:47

add a comment |

USE OF cusolver<T>nSgesvd

As remarked by lebedov, as of CUDA 8.0, it is now possible to calculate the singular values only by cusolverDnSgesvd. I report below a slightly modified version of your code with two calls to cusolverDnSgesvd, one performing the singular values calculation only

cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo)

and one performing the full SVD calculation

cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo)

As you already remarked, the two 'A' fields for the full SVD case are changed to 'N' in the singular-values-only case. Please note that, in the singular-values-only case, there is no need to allocate space for the singular vector matrices U and V; a NULL pointer is passed instead.

Computing only the singular values is faster than computing the full SVD. On a GTX 960, for a 1000x1000 matrix, the timings were the following:

Singular values only: 559 ms
Full SVD: 2239 ms

Here is the full code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include<iostream>
#include<stdlib.h>
#include<stdio.h>

#include <cusolverDn.h>
#include <cuda_runtime_api.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

/********/
/* MAIN */
/********/
// --- Times cusolverDnSgesvd on an M x N test matrix twice: first computing the singular
// --- values only (jobu = jobvt = 'N'), then the full SVD (jobu = jobvt = 'A'), and prints
// --- devInfo diagnostics and the elapsed time reported by TimingGPU for each run.
int main()
{
    // --- A is M x N, stored column-major with leading dimension M
    const int M = 1000;
    const int N = 1000;

    TimingGPU timerGPU;
    float elapsedTime;

    // --- Setting the host matrix
    float *h_A = (float *)malloc(M * N * sizeof(float));
    float *h_S = (float *)malloc(N * sizeof(float));
    if (h_A == NULL || h_S == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A);
        free(h_S);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            h_A[j*M + i] = (float)((i + j) * (i + j));
        }
    }

    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
    float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size));

    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));

    // --- CUDA SVD execution - Singular values only (U and V are not referenced, so NULL is passed)
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();

    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h == 0)
        printf("SVD successfull for the singular values calculation only\n\n");
    else if (devInfo_h < 0)
        printf("SVD unsuccessfull for the singular values calculation only. Parameter %i is wrong\n", -devInfo_h);
    else
        printf("SVD unsuccessfull for the singular values calculation only. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);

    printf("Calculation of the singular values only: %f ms\n\n", elapsedTime);

    // --- Moving the results from device to host
    //gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
    //for (int i = 0; i < N; i++) std::cout << "d_S[" << i << "] = " << h_S[i] << std::endl;

    // --- gesvd destroys the contents of A on exit, so the input matrix is uploaded again;
    // --- otherwise the full SVD below would run on the leftovers of the first call
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- CUDA SVD execution - Full SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();

    devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h == 0)
        printf("SVD successfull for the full SVD calculation\n\n");
    else if (devInfo_h < 0)
        printf("SVD unsuccessfull for the full SVD calculation. Parameter %i is wrong\n", -devInfo_h);
    else
        printf("SVD unsuccessfull for the full SVD calculation. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);

    printf("Calculation of the full SVD calculation: %f ms\n\n", elapsedTime);

    // --- Releasing library, device and host resources
    cusolveSafeCall(cusolverDnDestroy(solver_handle));
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_A);

    return 0;
}
EDIT - PERFORMANCE ACROSS DIFFERENT VERSIONS OF CUDA

I have compared the performance of the singular-values-only calculation and the full SVD computation for CUDA 8.0, CUDA 9.1 and CUDA 10.0, for a 5000x5000 matrix. Here are the results on a GTX 960.

Computation type CUDA 8.0 CUDA 9.1 CUDA 10.0 
__________________________________________________________________

Singular values only 17s 15s 15s
Full SVD 161s 159s 457s
__________________________________________________________________

edited Nov 13 '18 at 10:12

answered Jan 20 '17 at 11:45

JackOLantern

14.7k355111

add a comment |

Your Answer

// Page boilerplate: when the "editor" module is in use, load the external editor
// and the snippets module, then initialise snippets.
StackExchange.ifUsing("editor", function () {
    StackExchange.using("externalEditor", function () {
        StackExchange.using("snippets", function () {
            StackExchange.snippets.init();
        });
    });
}, "code-snippets");

// Page boilerplate: once the page is ready, set up the tag renderer and create the
// answer editor (after the snippets module has loaded, when snippets are enabled).
StackExchange.ready(function() {
    var channelOptions = {
        tags: "".split(" "),
        id: "1"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
        // Have to fire editor after snippets, if snippets enabled
        if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
                createEditor();
            });
        }
        else {
            createEditor();
        }
    });

    // Configures the answer editor with the options below.
    function createEditor() {
        StackExchange.prepareEditor({
            heartbeatType: 'answer',
            autoActivateHeartbeat: false,
            convertImagesToLinks: true,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: 10,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
                brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
        });
    }
});

draft saved

draft discarded

// Page boilerplate: once the page is ready, initialise the post-login flow for new answers.
StackExchange.ready(
    function () {
        StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f28107525%2fsingular-values-calculation-only-with-cuda%23new-answer', 'question_page');
    }
);

Post as a guest

Name

Required, but never shown

2 Answers
2

active

oldest

votes

2 Answers
2

active

oldest

votes

At this time the cuSolver gesvd function only supports jobu = 'A' and jobvt = 'A'

So the error when you specify other combinations is expected. From the documentation:

Remark 2: gesvd only supports jobu='A' and jobvt='A' and returns matrix U and VH

answered Mar 22 '15 at 15:26

Robert Crovella

96.9k5110152

1

As of CUDA 8.0, gesvd supports jobu/jobvt = A, S, O, or N.

– lebedov
Nov 13 '16 at 22:47

add a comment |

At this time the cuSolver gesvd function only supports jobu = 'A' and jobvt = 'A'

So the error when you specify other combinations is expected. From the documentation:

Remark 2: gesvd only supports jobu='A' and jobvt='A' and returns matrix U and VH

answered Mar 22 '15 at 15:26

Robert Crovella

96.9k5110152

1

As of CUDA 8.0, gesvd supports jobu/jobvt = A, S, O, or N.

– lebedov
Nov 13 '16 at 22:47

add a comment |

At this time the cuSolver gesvd function only supports jobu = 'A' and jobvt = 'A'

So the error when you specify other combinations is expected. From the documentation:

Remark 2: gesvd only supports jobu='A' and jobvt='A' and returns matrix U and VH

answered Mar 22 '15 at 15:26

Robert Crovella

96.9k5110152

At this time the cuSolver gesvd function only supports jobu = 'A' and jobvt = 'A'

So the error when you specify other combinations is expected. From the documentation:

Remark 2: gesvd only supports jobu='A' and jobvt='A' and returns matrix U and VH

answered Mar 22 '15 at 15:26

Robert Crovella

96.9k5110152

answered Mar 22 '15 at 15:26

Robert Crovella

96.9k5110152

answered Mar 22 '15 at 15:26

Robert Crovella

96.9k5110152

answered Mar 22 '15 at 15:26

Robert Crovella

96.9k5110152

1

As of CUDA 8.0, gesvd supports jobu/jobvt = A, S, O, or N.

– lebedov
Nov 13 '16 at 22:47

add a comment |

1

As of CUDA 8.0, gesvd supports jobu/jobvt = A, S, O, or N.

– lebedov
Nov 13 '16 at 22:47

As of CUDA 8.0, gesvd supports jobu/jobvt = A, S, O, or N.

– lebedov
Nov 13 '16 at 22:47

add a comment |

USE OF cusolver<T>nSgesvd

cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo)

and one performing the full SVD calculation

cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo)

Computing only the singular values is faster than computing the full SVD. On a GTX 960, for a 1000x1000 matrix, the timings were the following:

Singular values only: 559 ms
Full SVD: 2239 ms

Here is the full code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include<iostream>
#include<stdlib.h>
#include<stdio.h>

#include <cusolverDn.h>
#include <cuda_runtime_api.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

/********/
/* MAIN */
/********/
// --- Times cusolverDnSgesvd on an M x N test matrix twice: first computing the singular
// --- values only (jobu = jobvt = 'N'), then the full SVD (jobu = jobvt = 'A'), and prints
// --- devInfo diagnostics and the elapsed time reported by TimingGPU for each run.
int main()
{
    // --- A is M x N, stored column-major with leading dimension M
    const int M = 1000;
    const int N = 1000;

    TimingGPU timerGPU;
    float elapsedTime;

    // --- Setting the host matrix
    float *h_A = (float *)malloc(M * N * sizeof(float));
    float *h_S = (float *)malloc(N * sizeof(float));
    if (h_A == NULL || h_S == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A);
        free(h_S);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            h_A[j*M + i] = (float)((i + j) * (i + j));
        }
    }

    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
    float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size));

    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));

    // --- CUDA SVD execution - Singular values only (U and V are not referenced, so NULL is passed)
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();

    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h == 0)
        printf("SVD successfull for the singular values calculation only\n\n");
    else if (devInfo_h < 0)
        printf("SVD unsuccessfull for the singular values calculation only. Parameter %i is wrong\n", -devInfo_h);
    else
        printf("SVD unsuccessfull for the singular values calculation only. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);

    printf("Calculation of the singular values only: %f ms\n\n", elapsedTime);

    // --- Moving the results from device to host
    //gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
    //for (int i = 0; i < N; i++) std::cout << "d_S[" << i << "] = " << h_S[i] << std::endl;

    // --- gesvd destroys the contents of A on exit, so the input matrix is uploaded again;
    // --- otherwise the full SVD below would run on the leftovers of the first call
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- CUDA SVD execution - Full SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();

    devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h == 0)
        printf("SVD successfull for the full SVD calculation\n\n");
    else if (devInfo_h < 0)
        printf("SVD unsuccessfull for the full SVD calculation. Parameter %i is wrong\n", -devInfo_h);
    else
        printf("SVD unsuccessfull for the full SVD calculation. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);

    printf("Calculation of the full SVD calculation: %f ms\n\n", elapsedTime);

    // --- Releasing library, device and host resources
    cusolveSafeCall(cusolverDnDestroy(solver_handle));
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_A);

    return 0;
}
EDIT - PERFORMANCE ACROSS DIFFERENT VERSIONS OF CUDA

Computation type CUDA 8.0 CUDA 9.1 CUDA 10.0 
__________________________________________________________________

Singular values only 17s 15s 15s
Full SVD 161s 159s 457s
__________________________________________________________________

edited Nov 13 '18 at 10:12

answered Jan 20 '17 at 11:45

JackOLantern

14.7k355111

add a comment |

USE OF cusolver<T>nSgesvd

cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo)

and one performing the full SVD calculation

cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo)

Computing only the singular values is faster than computing the full SVD. On a GTX 960, for a 1000x1000 matrix, the timings were the following:

Singular values only: 559 ms
Full SVD: 2239 ms

Here is the full code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include<iostream>
#include<stdlib.h>
#include<stdio.h>

#include <cusolverDn.h>
#include <cuda_runtime_api.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

/********/
/* MAIN */
/********/
/**
 * Copies the device-side devInfo flag written by cusolverDnSgesvd to the host
 * and prints whether the run labelled `what` succeeded.
 *
 * devInfo: device pointer to the int status written by the solver.
 * what:    label inserted in the message, e.g. "full SVD calculation".
 */
static void reportSvdStatus(const int *devInfo, const char *what)
{
    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h == 0)
        printf("SVD successfull for the %s\n\n", what);
    else if (devInfo_h < 0)
        printf("SVD unsuccessfull for the %s. Parameter %i is wrong\n", what, -devInfo_h);
    else
        printf("SVD unsuccessfull for the %s. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", what, devInfo_h);
}

/**
 * Times cusolverDnSgesvd on an M x N single-precision matrix in two modes:
 * singular values only (jobu = jobvt = 'N') and full SVD (jobu = jobvt = 'A').
 * For each run the devInfo outcome and the time reported by TimingGPU are printed.
 *
 * Returns 0 on completion and EXIT_FAILURE if a host allocation fails. CUDA and
 * cuSOLVER return codes are passed to gpuErrchk / cusolveSafeCall.
 */
int main()
{
    // NOTE: cusolverDn<t>gesvd is documented to support only M >= N.
    const int M = 1000;
    const int N = 1000;
    const size_t bytesA = (size_t)M * N * sizeof(float);

    TimingGPU timerGPU;
    float elapsedTime;

    // --- Setting the host matrix (stored with leading dimension M: element (i, j) at j*M + i)
    float *h_A = (float *)malloc(bytesA);
    // --- host side buffer for the singular values (only used by the optional dump below)
    float *h_S = (float *)malloc(N * sizeof(float));
    if (h_A == NULL || h_S == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        free(h_A);
        free(h_S);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < M; i++)
        for (int j = 0; j < N; j++)
            h_A[j*M + i] = (float)((i + j) * (i + j));

    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A; gpuErrchk(cudaMalloc(&d_A, bytesA));
    gpuErrchk(cudaMemcpy(d_A, h_A, bytesA, cudaMemcpyHostToDevice));

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U; gpuErrchk(cudaMalloc(&d_U, (size_t)M * M * sizeof(float)));
    float *d_V; gpuErrchk(cudaMalloc(&d_V, (size_t)N * N * sizeof(float)));
    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size));

    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));

    // --- CUDA SVD execution - Singular values only
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();

    reportSvdStatus(devInfo, "singular values calculation only");

    printf("Calculation of the singular values only: %f ms\n\n", elapsedTime);

    // --- Moving the results from device to host
    //gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
    //for (int i = 0; i < N; i++) std::cout << "d_S[" << i << "] = " << h_S[i] << std::endl;

    // --- gesvd destroys the contents of A on exit, so the input matrix must be
    // --- uploaded again; otherwise the full SVD would run on leftover data.
    gpuErrchk(cudaMemcpy(d_A, h_A, bytesA, cudaMemcpyHostToDevice));

    // --- CUDA SVD execution - Full SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();

    reportSvdStatus(devInfo, "full SVD calculation");

    printf("Calculation of the full SVD calculation: %f ms\n\n", elapsedTime);

    // --- Releasing the solver handle and every device / host buffer
    cusolveSafeCall(cusolverDnDestroy(solver_handle));

    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_A);

    return 0;
}

EDIT - PERFORMANCE ACROSS DIFFERENT VERSIONS OF CUDA

Computation type CUDA 8.0 CUDA 9.1 CUDA 10.0 
__________________________________________________________________

Singular values only 17s 15s 15s
Full SVD 161s 159s 457s
__________________________________________________________________

edited Nov 13 '18 at 10:12

answered Jan 20 '17 at 11:45

JackOLantern

14.7k355111

add a comment |

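USE OF cusolverDn<t>gesvd (cusolverDnSgesvd)

Two calls to cusolverDnSgesvd are compared: one computing the singular values only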

cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo)

and one performing the full SVD calculation

cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo)

Computing only the singular values is faster than computing the full SVD. On a GTX 960, for a 1000x1000 matrix, the timings were as follows:

Singular values only: 559 ms
Full SVD: 2239 ms

Here is the full code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include<iostream>
#include<stdlib.h>
#include<stdio.h>

#include <cusolverDn.h>
#include <cuda_runtime_api.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

/********/
/* MAIN */
/********/
/**
 * Copies the device-side devInfo flag written by cusolverDnSgesvd to the host
 * and prints whether the run labelled `what` succeeded.
 *
 * devInfo: device pointer to the int status written by the solver.
 * what:    label inserted in the message, e.g. "full SVD calculation".
 */
static void reportSvdStatus(const int *devInfo, const char *what)
{
    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h == 0)
        printf("SVD successfull for the %s\n\n", what);
    else if (devInfo_h < 0)
        printf("SVD unsuccessfull for the %s. Parameter %i is wrong\n", what, -devInfo_h);
    else
        printf("SVD unsuccessfull for the %s. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", what, devInfo_h);
}

/**
 * Times cusolverDnSgesvd on an M x N single-precision matrix in two modes:
 * singular values only (jobu = jobvt = 'N') and full SVD (jobu = jobvt = 'A').
 * For each run the devInfo outcome and the time reported by TimingGPU are printed.
 *
 * Returns 0 on completion and EXIT_FAILURE if a host allocation fails. CUDA and
 * cuSOLVER return codes are passed to gpuErrchk / cusolveSafeCall.
 */
int main()
{
    // NOTE: cusolverDn<t>gesvd is documented to support only M >= N.
    const int M = 1000;
    const int N = 1000;
    const size_t bytesA = (size_t)M * N * sizeof(float);

    TimingGPU timerGPU;
    float elapsedTime;

    // --- Setting the host matrix (stored with leading dimension M: element (i, j) at j*M + i)
    float *h_A = (float *)malloc(bytesA);
    // --- host side buffer for the singular values (only used by the optional dump below)
    float *h_S = (float *)malloc(N * sizeof(float));
    if (h_A == NULL || h_S == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        free(h_A);
        free(h_S);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < M; i++)
        for (int j = 0; j < N; j++)
            h_A[j*M + i] = (float)((i + j) * (i + j));

    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A; gpuErrchk(cudaMalloc(&d_A, bytesA));
    gpuErrchk(cudaMemcpy(d_A, h_A, bytesA, cudaMemcpyHostToDevice));

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U; gpuErrchk(cudaMalloc(&d_U, (size_t)M * M * sizeof(float)));
    float *d_V; gpuErrchk(cudaMalloc(&d_V, (size_t)N * N * sizeof(float)));
    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size));

    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));

    // --- CUDA SVD execution - Singular values only
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();

    reportSvdStatus(devInfo, "singular values calculation only");

    printf("Calculation of the singular values only: %f ms\n\n", elapsedTime);

    // --- Moving the results from device to host
    //gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
    //for (int i = 0; i < N; i++) std::cout << "d_S[" << i << "] = " << h_S[i] << std::endl;

    // --- gesvd destroys the contents of A on exit, so the input matrix must be
    // --- uploaded again; otherwise the full SVD would run on leftover data.
    gpuErrchk(cudaMemcpy(d_A, h_A, bytesA, cudaMemcpyHostToDevice));

    // --- CUDA SVD execution - Full SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();

    reportSvdStatus(devInfo, "full SVD calculation");

    printf("Calculation of the full SVD calculation: %f ms\n\n", elapsedTime);

    // --- Releasing the solver handle and every device / host buffer
    cusolveSafeCall(cusolverDnDestroy(solver_handle));

    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_A);

    return 0;
}

EDIT - PERFORMANCE ACROSS DIFFERENT VERSIONS OF CUDA

Computation type CUDA 8.0 CUDA 9.1 CUDA 10.0 
__________________________________________________________________

Singular values only 17s 15s 15s
Full SVD 161s 159s 457s
__________________________________________________________________

edited Nov 13 '18 at 10:12

answered Jan 20 '17 at 11:45

JackOLantern

14.7k355111

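USE OF cusolverDn<t>gesvd (cusolverDnSgesvd)

Two calls to cusolverDnSgesvd are compared: one computing the singular values only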

cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo)

and one performing the full SVD calculation

cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo)

Computing only the singular values is faster than computing the full SVD. On a GTX 960, for a 1000x1000 matrix, the timings were as follows:

Singular values only: 559 ms
Full SVD: 2239 ms

Here is the full code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include<iostream>
#include<stdlib.h>
#include<stdio.h>

#include <cusolverDn.h>
#include <cuda_runtime_api.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

/********/
/* MAIN */
/********/
/**
 * Copies the device-side devInfo flag written by cusolverDnSgesvd to the host
 * and prints whether the run labelled `what` succeeded.
 *
 * devInfo: device pointer to the int status written by the solver.
 * what:    label inserted in the message, e.g. "full SVD calculation".
 */
static void reportSvdStatus(const int *devInfo, const char *what)
{
    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h == 0)
        printf("SVD successfull for the %s\n\n", what);
    else if (devInfo_h < 0)
        printf("SVD unsuccessfull for the %s. Parameter %i is wrong\n", what, -devInfo_h);
    else
        printf("SVD unsuccessfull for the %s. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", what, devInfo_h);
}

/**
 * Times cusolverDnSgesvd on an M x N single-precision matrix in two modes:
 * singular values only (jobu = jobvt = 'N') and full SVD (jobu = jobvt = 'A').
 * For each run the devInfo outcome and the time reported by TimingGPU are printed.
 *
 * Returns 0 on completion and EXIT_FAILURE if a host allocation fails. CUDA and
 * cuSOLVER return codes are passed to gpuErrchk / cusolveSafeCall.
 */
int main()
{
    // NOTE: cusolverDn<t>gesvd is documented to support only M >= N.
    const int M = 1000;
    const int N = 1000;
    const size_t bytesA = (size_t)M * N * sizeof(float);

    TimingGPU timerGPU;
    float elapsedTime;

    // --- Setting the host matrix (stored with leading dimension M: element (i, j) at j*M + i)
    float *h_A = (float *)malloc(bytesA);
    // --- host side buffer for the singular values (only used by the optional dump below)
    float *h_S = (float *)malloc(N * sizeof(float));
    if (h_A == NULL || h_S == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        free(h_A);
        free(h_S);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < M; i++)
        for (int j = 0; j < N; j++)
            h_A[j*M + i] = (float)((i + j) * (i + j));

    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A; gpuErrchk(cudaMalloc(&d_A, bytesA));
    gpuErrchk(cudaMemcpy(d_A, h_A, bytesA, cudaMemcpyHostToDevice));

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U; gpuErrchk(cudaMalloc(&d_U, (size_t)M * M * sizeof(float)));
    float *d_V; gpuErrchk(cudaMalloc(&d_V, (size_t)N * N * sizeof(float)));
    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size));

    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));

    // --- CUDA SVD execution - Singular values only
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();

    reportSvdStatus(devInfo, "singular values calculation only");

    printf("Calculation of the singular values only: %f ms\n\n", elapsedTime);

    // --- Moving the results from device to host
    //gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
    //for (int i = 0; i < N; i++) std::cout << "d_S[" << i << "] = " << h_S[i] << std::endl;

    // --- gesvd destroys the contents of A on exit, so the input matrix must be
    // --- uploaded again; otherwise the full SVD would run on leftover data.
    gpuErrchk(cudaMemcpy(d_A, h_A, bytesA, cudaMemcpyHostToDevice));

    // --- CUDA SVD execution - Full SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();

    reportSvdStatus(devInfo, "full SVD calculation");

    printf("Calculation of the full SVD calculation: %f ms\n\n", elapsedTime);

    // --- Releasing the solver handle and every device / host buffer
    cusolveSafeCall(cusolverDnDestroy(solver_handle));

    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_A);

    return 0;
}

EDIT - PERFORMANCE ACROSS DIFFERENT VERSIONS OF CUDA

Computation type CUDA 8.0 CUDA 9.1 CUDA 10.0 
__________________________________________________________________

Singular values only 17s 15s 15s
Full SVD 161s 159s 457s
__________________________________________________________________

edited Nov 13 '18 at 10:12

answered Jan 20 '17 at 11:45

JackOLantern

14.7k355111

edited Nov 13 '18 at 10:12

answered Jan 20 '17 at 11:45

JackOLantern

14.7k355111

answered Jan 20 '17 at 11:45

JackOLantern

14.7k355111

answered Jan 20 '17 at 11:45

JackOLantern

14.7k355111

add a comment |

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Dfyjkt