Singular values calculation only with CUDA










3















I'm trying to use the new cusolverDnSgesvd routine of CUDA 7.0 for the calculation of the singular values. The full code is reported below:



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include<iostream>
#include<stdlib.h>
#include<stdio.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>

/***********************/
/* CUDA ERROR CHECKING */
/***********************/
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)

if (code != cudaSuccess)

fprintf(stderr,"GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
if (abort) exit(code);


// Forwards the status of a CUDA runtime call to gpuAssert. Written as a macro
// so that __FILE__ and __LINE__ expand at the call site; inside a function
// they would always report this helper's own location.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/********/
/* MAIN */
/********/
int main()

int M = 10;
int N = 10;

// --- Setting the host matrix
float *h_A = (float *)malloc(M * N * sizeof(float));
for(unsigned int i = 0; i < M; i++)
for(unsigned int j = 0; j < N; j++)
h_A[j*M + i] = (i + j) * (i + j);



// --- Setting the device matrix and moving the host matrix to the device
float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

// --- host side SVD results space
float *h_U = (float *)malloc(M * M * sizeof(float));
float *h_V = (float *)malloc(N * N * sizeof(float));
float *h_S = (float *)malloc(N * sizeof(float));

// --- device side SVD workspace and matrices
int work_size = 0;

int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

cusolverStatus_t stat;

// --- CUDA solver initialization
cusolverDnHandle_t solver_handle;
cusolverDnCreate(&solver_handle);

stat = cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size);
if(stat != CUSOLVER_STATUS_SUCCESS ) std::cout << "Initialization of cuSolver failed. N";

float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
//float *rwork; gpuErrchk(cudaMalloc(&rwork, work_size * sizeof(float)));

// --- CUDA SVD execution
//stat = cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
stat = cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
cudaDeviceSynchronize();

int devInfo_h = 0;
gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
std::cout << "devInfo = " << devInfo_h << "n";

switch(stat)
case CUSOLVER_STATUS_SUCCESS: std::cout << "SVD computation successn"; break;
case CUSOLVER_STATUS_NOT_INITIALIZED: std::cout << "Library cuSolver not initialized correctlyn"; break;
case CUSOLVER_STATUS_INVALID_VALUE: std::cout << "Invalid parameters passedn"; break;
case CUSOLVER_STATUS_INTERNAL_ERROR: std::cout << "Internal operation failedn"; break;


if (devInfo_h == 0 && stat == CUSOLVER_STATUS_SUCCESS) std::cout << "SVD successfulnn";

// --- Moving the results from device to host
gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));

for(int i = 0; i < N; i++) std::cout << "d_S["<<i<<"] = " << h_S[i] << std::endl;

cusolverDnDestroy(solver_handle);

return 0;




If I ask for the computation of the full SVD (commented line with jobu = 'A' and jobvt = 'A') everything works fine. If I ask for the computation of the singular values only (line with jobu = 'N' and jobvt = 'N'), cusolverDnSgesvd returns



CUSOLVER_STATUS_INVALID_VALUE


Please note that, in this case devInfo = 0, so I cannot spot the invalid parameter.



Please also note that the documentation PDF lacks information about the rwork parameter so that I have dealt with it as a dummy parameter.










share|improve this question




























    3















    I'm trying to use the new cusolverDnSgesvd routine of CUDA 7.0 for the calculation of the singular values. The full code is reported below:



    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"

    #include <stdio.h>

    #include<iostream>
    #include<stdlib.h>
    #include<stdio.h>
    #include <cusolverDn.h>
    #include <cuda_runtime_api.h>

    /***********************/
    /* CUDA ERROR CHECKING */
    /***********************/
    void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)

    if (code != cudaSuccess)

    fprintf(stderr,"GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
    if (abort) exit(code);


    // Forwards the status of a CUDA runtime call to gpuAssert. Written as a macro
    // so that __FILE__ and __LINE__ expand at the call site; inside a function
    // they would always report this helper's own location.
    #define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

    /********/
    /* MAIN */
    /********/
    int main()

    int M = 10;
    int N = 10;

    // --- Setting the host matrix
    float *h_A = (float *)malloc(M * N * sizeof(float));
    for(unsigned int i = 0; i < M; i++)
    for(unsigned int j = 0; j < N; j++)
    h_A[j*M + i] = (i + j) * (i + j);



    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- host side SVD results space
    float *h_U = (float *)malloc(M * M * sizeof(float));
    float *h_V = (float *)malloc(N * N * sizeof(float));
    float *h_S = (float *)malloc(N * sizeof(float));

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
    float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

    cusolverStatus_t stat;

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolverDnCreate(&solver_handle);

    stat = cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size);
    if(stat != CUSOLVER_STATUS_SUCCESS ) std::cout << "Initialization of cuSolver failed. N";

    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
    //float *rwork; gpuErrchk(cudaMalloc(&rwork, work_size * sizeof(float)));

    // --- CUDA SVD execution
    //stat = cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    stat = cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    cudaDeviceSynchronize();

    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    std::cout << "devInfo = " << devInfo_h << "n";

    switch(stat)
    case CUSOLVER_STATUS_SUCCESS: std::cout << "SVD computation successn"; break;
    case CUSOLVER_STATUS_NOT_INITIALIZED: std::cout << "Library cuSolver not initialized correctlyn"; break;
    case CUSOLVER_STATUS_INVALID_VALUE: std::cout << "Invalid parameters passedn"; break;
    case CUSOLVER_STATUS_INTERNAL_ERROR: std::cout << "Internal operation failedn"; break;


    if (devInfo_h == 0 && stat == CUSOLVER_STATUS_SUCCESS) std::cout << "SVD successfulnn";

    // --- Moving the results from device to host
    gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));

    for(int i = 0; i < N; i++) std::cout << "d_S["<<i<<"] = " << h_S[i] << std::endl;

    cusolverDnDestroy(solver_handle);

    return 0;




    If I ask for the computation of the full SVD (commented line with jobu = 'A' and jobvt = 'A') everything works fine. If I ask for the computation of the singular values only (line with jobu = 'N' and jobvt = 'N'), cusolverDnSgesvd returns



    CUSOLVER_STATUS_INVALID_VALUE


    Please note that, in this case devInfo = 0, so I cannot spot the invalid parameter.



    Please also note that the documentation PDF lacks information about the rwork parameter so that I have dealt with it as a dummy parameter.










    share|improve this question


























      3












      3








      3








      I'm trying to use the new cusolverDnSgesvd routine of CUDA 7.0 for the calculation of the singular values. The full code is reported below:



      #include "cuda_runtime.h"
      #include "device_launch_parameters.h"

      #include <stdio.h>

      #include<iostream>
      #include<stdlib.h>
      #include<stdio.h>
      #include <cusolverDn.h>
      #include <cuda_runtime_api.h>

      /***********************/
      /* CUDA ERROR CHECKING */
      /***********************/
      void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)

      if (code != cudaSuccess)

      fprintf(stderr,"GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
      if (abort) exit(code);


      // Forwards the status of a CUDA runtime call to gpuAssert. Written as a macro
      // so that __FILE__ and __LINE__ expand at the call site; inside a function
      // they would always report this helper's own location.
      #define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

      /********/
      /* MAIN */
      /********/
      int main()

      int M = 10;
      int N = 10;

      // --- Setting the host matrix
      float *h_A = (float *)malloc(M * N * sizeof(float));
      for(unsigned int i = 0; i < M; i++)
      for(unsigned int j = 0; j < N; j++)
      h_A[j*M + i] = (i + j) * (i + j);



      // --- Setting the device matrix and moving the host matrix to the device
      float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
      gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

      // --- host side SVD results space
      float *h_U = (float *)malloc(M * M * sizeof(float));
      float *h_V = (float *)malloc(N * N * sizeof(float));
      float *h_S = (float *)malloc(N * sizeof(float));

      // --- device side SVD workspace and matrices
      int work_size = 0;

      int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
      float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
      float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
      float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

      cusolverStatus_t stat;

      // --- CUDA solver initialization
      cusolverDnHandle_t solver_handle;
      cusolverDnCreate(&solver_handle);

      stat = cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size);
      if(stat != CUSOLVER_STATUS_SUCCESS ) std::cout << "Initialization of cuSolver failed. N";

      float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
      //float *rwork; gpuErrchk(cudaMalloc(&rwork, work_size * sizeof(float)));

      // --- CUDA SVD execution
      //stat = cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
      stat = cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
      cudaDeviceSynchronize();

      int devInfo_h = 0;
      gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
      std::cout << "devInfo = " << devInfo_h << "n";

      switch(stat)
      case CUSOLVER_STATUS_SUCCESS: std::cout << "SVD computation successn"; break;
      case CUSOLVER_STATUS_NOT_INITIALIZED: std::cout << "Library cuSolver not initialized correctlyn"; break;
      case CUSOLVER_STATUS_INVALID_VALUE: std::cout << "Invalid parameters passedn"; break;
      case CUSOLVER_STATUS_INTERNAL_ERROR: std::cout << "Internal operation failedn"; break;


      if (devInfo_h == 0 && stat == CUSOLVER_STATUS_SUCCESS) std::cout << "SVD successfulnn";

      // --- Moving the results from device to host
      gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));

      for(int i = 0; i < N; i++) std::cout << "d_S["<<i<<"] = " << h_S[i] << std::endl;

      cusolverDnDestroy(solver_handle);

      return 0;




      If I ask for the computation of the full SVD (commented line with jobu = 'A' and jobvt = 'A') everything works fine. If I ask for the computation of the singular values only (line with jobu = 'N' and jobvt = 'N'), cusolverDnSgesvd returns



      CUSOLVER_STATUS_INVALID_VALUE


      Please note that, in this case devInfo = 0, so I cannot spot the invalid parameter.



      Please also note that the documentation PDF lacks information about the rwork parameter so that I have dealt with it as a dummy parameter.










      share|improve this question
















      I'm trying to use the new cusolverDnSgesvd routine of CUDA 7.0 for the calculation of the singular values. The full code is reported below:



      #include "cuda_runtime.h"
      #include "device_launch_parameters.h"

      #include <stdio.h>

      #include<iostream>
      #include<stdlib.h>
      #include<stdio.h>
      #include <cusolverDn.h>
      #include <cuda_runtime_api.h>

      /***********************/
      /* CUDA ERROR CHECKING */
      /***********************/
      void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)

      if (code != cudaSuccess)

      fprintf(stderr,"GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
      if (abort) exit(code);


      // Forwards the status of a CUDA runtime call to gpuAssert. Written as a macro
      // so that __FILE__ and __LINE__ expand at the call site; inside a function
      // they would always report this helper's own location.
      #define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

      /********/
      /* MAIN */
      /********/
      int main()

      int M = 10;
      int N = 10;

      // --- Setting the host matrix
      float *h_A = (float *)malloc(M * N * sizeof(float));
      for(unsigned int i = 0; i < M; i++)
      for(unsigned int j = 0; j < N; j++)
      h_A[j*M + i] = (i + j) * (i + j);



      // --- Setting the device matrix and moving the host matrix to the device
      float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
      gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

      // --- host side SVD results space
      float *h_U = (float *)malloc(M * M * sizeof(float));
      float *h_V = (float *)malloc(N * N * sizeof(float));
      float *h_S = (float *)malloc(N * sizeof(float));

      // --- device side SVD workspace and matrices
      int work_size = 0;

      int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
      float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
      float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
      float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

      cusolverStatus_t stat;

      // --- CUDA solver initialization
      cusolverDnHandle_t solver_handle;
      cusolverDnCreate(&solver_handle);

      stat = cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size);
      if(stat != CUSOLVER_STATUS_SUCCESS ) std::cout << "Initialization of cuSolver failed. N";

      float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
      //float *rwork; gpuErrchk(cudaMalloc(&rwork, work_size * sizeof(float)));

      // --- CUDA SVD execution
      //stat = cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
      stat = cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
      cudaDeviceSynchronize();

      int devInfo_h = 0;
      gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
      std::cout << "devInfo = " << devInfo_h << "n";

      switch(stat)
      case CUSOLVER_STATUS_SUCCESS: std::cout << "SVD computation successn"; break;
      case CUSOLVER_STATUS_NOT_INITIALIZED: std::cout << "Library cuSolver not initialized correctlyn"; break;
      case CUSOLVER_STATUS_INVALID_VALUE: std::cout << "Invalid parameters passedn"; break;
      case CUSOLVER_STATUS_INTERNAL_ERROR: std::cout << "Internal operation failedn"; break;


      if (devInfo_h == 0 && stat == CUSOLVER_STATUS_SUCCESS) std::cout << "SVD successfulnn";

      // --- Moving the results from device to host
      gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));

      for(int i = 0; i < N; i++) std::cout << "d_S["<<i<<"] = " << h_S[i] << std::endl;

      cusolverDnDestroy(solver_handle);

      return 0;




      If I ask for the computation of the full SVD (commented line with jobu = 'A' and jobvt = 'A') everything works fine. If I ask for the computation of the singular values only (line with jobu = 'N' and jobvt = 'N'), cusolverDnSgesvd returns



      CUSOLVER_STATUS_INVALID_VALUE


      Please note that, in this case devInfo = 0, so I cannot spot the invalid parameter.



      Please also note that the documentation PDF lacks information about the rwork parameter so that I have dealt with it as a dummy parameter.







      cuda svd cusolver






      share|improve this question















      share|improve this question













      share|improve this question




      share|improve this question








      edited Jan 20 '17 at 11:46









      JackOLantern

      14.7k355111




      14.7k355111










      asked Jan 23 '15 at 10:13









      WestWizardWestWizard

      317




      317






















          2 Answers
          2






          active

          oldest

          votes


















          1














          At this time the cuSolver gesvd function only supports jobu = 'A' and jobvt = 'A'



          So the error when you specify other combinations is expected. From the documentation:




          Remark 2: gesvd only supports jobu='A' and jobvt='A' and returns matrix U and VH







          share|improve this answer


















          • 1





            As of CUDA 8.0, gesvd supports jobu/jobvt = A, S, O, or N.

            – lebedov
            Nov 13 '16 at 22:47


















          0














          USE OF cusolverDn<t>gesvd



          As remarked by lebedov, as of CUDA 8.0, it is now possible to calculate the singular values only by cusolverDnSgesvd. I report below a slightly modified version of your code with two calls to cusolverDnSgesvd, one performing the singular values calculation only



          cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo)


          and one performing the full SVD calculation



          cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo)


          As you already remarked, the two 'A' fields for the full SVD case are changed to 'N' in the singular values only case. Please note that, in the singular values only case, there is no need to allocate space for the singular vector matrices U and V: a NULL pointer is passed for each of them.



          The singular values calculation only is faster than the full SVD calculation. On a GTX 960, for a 1000x1000 matrix, the timing has been the following:



          Singular values only: 559 ms
          Full SVD: 2239 ms


          Here is the full code:



          #include "cuda_runtime.h"
          #include "device_launch_parameters.h"

          #include <stdio.h>

          #include<iostream>
          #include<stdlib.h>
          #include<stdio.h>

          #include <cusolverDn.h>
          #include <cuda_runtime_api.h>

          #include "Utilities.cuh"
          #include "TimingGPU.cuh"

          /********/
          /* MAIN */
          /********/
          int main()

          int M = 1000;
          int N = 1000;

          TimingGPU timerGPU;
          float elapsedTime;

          // --- Setting the host matrix
          float *h_A = (float *)malloc(M * N * sizeof(float));
          for (unsigned int i = 0; i < M; i++)
          for (unsigned int j = 0; j < N; j++)
          h_A[j*M + i] = (i + j) * (i + j);



          // --- Setting the device matrix and moving the host matrix to the device
          float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
          gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

          // --- host side SVD results space
          float *h_U = (float *)malloc(M * M * sizeof(float));
          float *h_V = (float *)malloc(N * N * sizeof(float));
          float *h_S = (float *)malloc(N * sizeof(float));

          // --- device side SVD workspace and matrices
          int work_size = 0;

          int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
          float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
          float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
          float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

          cusolverStatus_t stat;

          // --- CUDA solver initialization
          cusolverDnHandle_t solver_handle;
          cusolveSafeCall(cusolverDnCreate(&solver_handle));

          cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size));

          float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));

          // --- CUDA SVD execution - Singular values only
          timerGPU.StartCounter();
          cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo));
          elapsedTime = timerGPU.GetCounter();

          int devInfo_h = 0;
          gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
          if (devInfo_h == 0)
          printf("SVD successfull for the singular values calculation onlynn");
          else if (devInfo_h < 0)
          printf("SVD unsuccessfull for the singular values calculation only. Parameter %i is wrongn", -devInfo_h);
          else
          printf("SVD unsuccessfull for the singular values calculation only. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zeron", devInfo_h);

          printf("Calculation of the singular values only: %f msnn", elapsedTime);

          // --- Moving the results from device to host
          //gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
          //for (int i = 0; i < N; i++) std::cout << "d_S[" << i << "] = " << h_S[i] << std::endl;

          // --- CUDA SVD execution - Full SVD
          timerGPU.StartCounter();
          cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo));
          elapsedTime = timerGPU.GetCounter();

          devInfo_h = 0;
          gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
          if (devInfo_h == 0)
          printf("SVD successfull for the full SVD calculationnn");
          else if (devInfo_h < 0)
          printf("SVD unsuccessfull for the full SVD calculation. Parameter %i is wrongn", -devInfo_h);
          else
          printf("SVD unsuccessfull for the full SVD calculation. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zeron", devInfo_h);

          printf("Calculation of the full SVD calculation: %f msnn", elapsedTime);

          cusolveSafeCall(cusolverDnDestroy(solver_handle));

          return 0;




          EDIT - PERFORMANCE ACROSS DIFFERENT VERSIONS OF CUDA



          I have compared the performance of the singular-values-only calculation and the full SVD computation for CUDA 8.0, CUDA 9.1 and CUDA 10.0, for a 5000x5000 matrix. Here are the results on a GTX 960.



          Computation type          CUDA 8.0     CUDA 9.1     CUDA 10.0
          __________________________________________________________________

          Singular values only      17s          15s          15s
          Full SVD                  161s         159s         457s
          __________________________________________________________________





          share|improve this answer
























            Your Answer






            StackExchange.ifUsing("editor", function ()
            StackExchange.using("externalEditor", function ()
            StackExchange.using("snippets", function ()
            StackExchange.snippets.init();
            );
            );
            , "code-snippets");

            StackExchange.ready(function()
            var channelOptions =
            tags: "".split(" "),
            id: "1"
            ;
            initTagRenderer("".split(" "), "".split(" "), channelOptions);

            StackExchange.using("externalEditor", function()
            // Have to fire editor after snippets, if snippets enabled
            if (StackExchange.settings.snippets.snippetsEnabled)
            StackExchange.using("snippets", function()
            createEditor();
            );

            else
            createEditor();

            );

            function createEditor()
            StackExchange.prepareEditor(
            heartbeatType: 'answer',
            autoActivateHeartbeat: false,
            convertImagesToLinks: true,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: 10,
            bindNavPrevention: true,
            postfix: "",
            imageUploader:
            brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
            contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
            allowUrls: true
            ,
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
            );



            );













            draft saved

            draft discarded


















            StackExchange.ready(
            function ()
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f28107525%2fsingular-values-calculation-only-with-cuda%23new-answer', 'question_page');

            );

            Post as a guest















            Required, but never shown

























            2 Answers
            2






            active

            oldest

            votes








            2 Answers
            2






            active

            oldest

            votes









            active

            oldest

            votes






            active

            oldest

            votes









            1














            At this time the cuSolver gesvd function only supports jobu = 'A' and jobvt = 'A'



            So the error when you specify other combinations is expected. From the documentation:




            Remark 2: gesvd only supports jobu='A' and jobvt='A' and returns matrix U and VH







            share|improve this answer


















            • 1





              As of CUDA 8.0, gesvd supports jobu/jobvt = A, S, O, or N.

              – lebedov
              Nov 13 '16 at 22:47















            1














            At this time the cuSolver gesvd function only supports jobu = 'A' and jobvt = 'A'



            So the error when you specify other combinations is expected. From the documentation:




            Remark 2: gesvd only supports jobu='A' and jobvt='A' and returns matrix U and VH







            share|improve this answer


















            • 1





              As of CUDA 8.0, gesvd supports jobu/jobvt = A, S, O, or N.

              – lebedov
              Nov 13 '16 at 22:47













            1












            1








            1







            At this time the cuSolver gesvd function only supports jobu = 'A' and jobvt = 'A'



            So the error when you specify other combinations is expected. From the documentation:




            Remark 2: gesvd only supports jobu='A' and jobvt='A' and returns matrix U and VH







            share|improve this answer













            At this time the cuSolver gesvd function only supports jobu = 'A' and jobvt = 'A'



            So the error when you specify other combinations is expected. From the documentation:




            Remark 2: gesvd only supports jobu='A' and jobvt='A' and returns matrix U and VH








            share|improve this answer












            share|improve this answer



            share|improve this answer










            answered Mar 22 '15 at 15:26









            Robert CrovellaRobert Crovella

            96.9k5110152




            96.9k5110152







            • 1





              As of CUDA 8.0, gesvd supports jobu/jobvt = A, S, O, or N.

              – lebedov
              Nov 13 '16 at 22:47












            • 1





              As of CUDA 8.0, gesvd supports jobu/jobvt = A, S, O, or N.

              – lebedov
              Nov 13 '16 at 22:47







            1




            1





            As of CUDA 8.0, gesvd supports jobu/jobvt = A, S, O, or N.

            – lebedov
            Nov 13 '16 at 22:47





            As of CUDA 8.0, gesvd supports jobu/jobvt = A, S, O, or N.

            – lebedov
            Nov 13 '16 at 22:47













            0














            USE OF cusolverDn<t>gesvd



            As remarked by lebedov, as of CUDA 8.0, it is now possible to calculate the singular values only by cusolverDnSgesvd. I report below a slightly modified version of your code with two calls to cusolverDnSgesvd, one performing the singular values calculation only



            cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo)


            and one performing the full SVD calculation



            cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo)


            As you already remarked, the two 'A' fields for the full SVD case are changed to 'N' in the singular values only case. Please note that, in the singular values only case, there is no need to allocate space for the singular vector matrices U and V: a NULL pointer is passed for each of them.



            The singular values calculation only is faster than the full SVD calculation. On a GTX 960, for a 1000x1000 matrix, the timing has been the following:



            Singular values only: 559 ms
            Full SVD: 2239 ms


            Here is the full code:



            #include "cuda_runtime.h"
            #include "device_launch_parameters.h"

            #include <stdio.h>

            #include<iostream>
            #include<stdlib.h>
            #include<stdio.h>

            #include <cusolverDn.h>
            #include <cuda_runtime_api.h>

            #include "Utilities.cuh"
            #include "TimingGPU.cuh"

            /********/
            /* MAIN */
            /********/
            /**
             * Times cusolverDnSgesvd on an M x N single-precision matrix twice: once
             * computing the singular values only (jobu = jobvt = 'N') and once computing
             * the full SVD (jobu = jobvt = 'A'). For each run it prints the status read
             * back from devInfo and the elapsed time reported by TimingGPU.
             *
             * Returns 0 on success, EXIT_FAILURE if a host buffer cannot be allocated.
             */
            int main()
            {
                const int M = 1000;
                const int N = 1000;

                TimingGPU timerGPU;
                float elapsedTime;

                // --- Setting the host matrix (element (i, j) is stored at j*M + i, leading dimension M)
                float *h_A = (float *)malloc(M * N * sizeof(float));
                // --- Host buffer for the singular values (used by the optional print-out below)
                float *h_S = (float *)malloc(N * sizeof(float));
                if (h_A == NULL || h_S == NULL) {
                    fprintf(stderr, "Host memory allocation failed\n");
                    free(h_A);
                    free(h_S);
                    return EXIT_FAILURE;
                }
                for (int i = 0; i < M; i++)
                    for (int j = 0; j < N; j++)
                        h_A[j*M + i] = (float)((i + j) * (i + j));

                // --- Setting the device matrix and moving the host matrix to the device
                float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
                gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

                // --- device side SVD workspace and matrices
                int work_size = 0;

                int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
                float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
                float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
                float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

                // --- CUDA solver initialization
                cusolverDnHandle_t solver_handle;
                cusolveSafeCall(cusolverDnCreate(&solver_handle));

                cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size));

                float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));

                // --- CUDA SVD execution - Singular values only (U and V are not referenced, so NULL is passed)
                timerGPU.StartCounter();
                cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo));
                elapsedTime = timerGPU.GetCounter();

                int devInfo_h = 0;
                gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
                if (devInfo_h == 0)
                    printf("SVD successfull for the singular values calculation only\n\n");
                else if (devInfo_h < 0)
                    printf("SVD unsuccessfull for the singular values calculation only. Parameter %i is wrong\n", -devInfo_h);
                else
                    printf("SVD unsuccessfull for the singular values calculation only. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);

                printf("Calculation of the singular values only: %f ms\n\n", elapsedTime);

                // --- Moving the results from device to host
                //gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
                //for (int i = 0; i < N; i++) std::cout << "d_S[" << i << "] = " << h_S[i] << std::endl;

                // --- gesvd destroys the contents of d_A on exit, so the input matrix is
                //     uploaded again; otherwise the full SVD would decompose leftover data.
                gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

                // --- CUDA SVD execution - Full SVD
                timerGPU.StartCounter();
                cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo));
                elapsedTime = timerGPU.GetCounter();

                devInfo_h = 0;
                gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
                if (devInfo_h == 0)
                    printf("SVD successfull for the full SVD calculation\n\n");
                else if (devInfo_h < 0)
                    printf("SVD unsuccessfull for the full SVD calculation. Parameter %i is wrong\n", -devInfo_h);
                else
                    printf("SVD unsuccessfull for the full SVD calculation. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);

                printf("Calculation of the full SVD calculation: %f ms\n\n", elapsedTime);

                // --- Releasing the solver handle, the device buffers and the host buffers
                cusolveSafeCall(cusolverDnDestroy(solver_handle));

                gpuErrchk(cudaFree(work));
                gpuErrchk(cudaFree(d_S));
                gpuErrchk(cudaFree(d_V));
                gpuErrchk(cudaFree(d_U));
                gpuErrchk(cudaFree(devInfo));
                gpuErrchk(cudaFree(d_A));

                free(h_S);
                free(h_A);

                return 0;
            }




            EDIT - PERFORMANCE ACROSS DIFFERENT VERSIONS OF CUDA



            I have compared the performance of the singular-values-only calculation and the full SVD computation for CUDA 8.0, CUDA 9.1 and CUDA 10.0, for a 5000x5000 matrix. Here are the results on a GTX 960.



            Computation type CUDA 8.0 CUDA 9.1 CUDA 10.0 
            __________________________________________________________________

            Singular values only 17s 15s 15s
            Full SVD 161s 159s 457s
            __________________________________________________________________





            share|improve this answer





























              0














              USE OF cusolverDn<T>gesvd



              As remarked by lebedov, as of CUDA 8.0, it is now possible to calculate the singular values only by cusolverDnSgesvd. I report below a slightly modified version of your code with two calls to cusolverDnSgesvd, one performing the singular values calculation only



              cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo)


              and one performing the full SVD calculation



              cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo)


              As you already remarked, the two 'A' fields of the full SVD case are changed to 'N' in the singular-values-only case. Please note that, in the singular-values-only case, there is no need to allocate space for the singular vector matrices U and V: a NULL pointer is passed for each of them.



              Computing only the singular values is faster than computing the full SVD. On a GTX 960, for a 1000x1000 matrix, the timings were as follows:



              Singular values only: 559 ms
              Full SVD: 2239 ms


              Here is the full code:



              #include "cuda_runtime.h"
              #include "device_launch_parameters.h"

              #include <stdio.h>

              #include<iostream>
              #include<stdlib.h>
              #include<stdio.h>

              #include <cusolverDn.h>
              #include <cuda_runtime_api.h>

              #include "Utilities.cuh"
              #include "TimingGPU.cuh"

              /********/
              /* MAIN */
              /********/
              /**
               * Times cusolverDnSgesvd on an M x N single-precision matrix twice: once
               * computing the singular values only (jobu = jobvt = 'N') and once computing
               * the full SVD (jobu = jobvt = 'A'). For each run it prints the status read
               * back from devInfo and the elapsed time reported by TimingGPU.
               *
               * Returns 0 on success, EXIT_FAILURE if a host buffer cannot be allocated.
               */
              int main()
              {
                  const int M = 1000;
                  const int N = 1000;

                  TimingGPU timerGPU;
                  float elapsedTime;

                  // --- Setting the host matrix (element (i, j) is stored at j*M + i, leading dimension M)
                  float *h_A = (float *)malloc(M * N * sizeof(float));
                  // --- Host buffer for the singular values (used by the optional print-out below)
                  float *h_S = (float *)malloc(N * sizeof(float));
                  if (h_A == NULL || h_S == NULL) {
                      fprintf(stderr, "Host memory allocation failed\n");
                      free(h_A);
                      free(h_S);
                      return EXIT_FAILURE;
                  }
                  for (int i = 0; i < M; i++)
                      for (int j = 0; j < N; j++)
                          h_A[j*M + i] = (float)((i + j) * (i + j));

                  // --- Setting the device matrix and moving the host matrix to the device
                  float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
                  gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

                  // --- device side SVD workspace and matrices
                  int work_size = 0;

                  int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
                  float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
                  float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
                  float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

                  // --- CUDA solver initialization
                  cusolverDnHandle_t solver_handle;
                  cusolveSafeCall(cusolverDnCreate(&solver_handle));

                  cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size));

                  float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));

                  // --- CUDA SVD execution - Singular values only (U and V are not referenced, so NULL is passed)
                  timerGPU.StartCounter();
                  cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo));
                  elapsedTime = timerGPU.GetCounter();

                  int devInfo_h = 0;
                  gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
                  if (devInfo_h == 0)
                      printf("SVD successfull for the singular values calculation only\n\n");
                  else if (devInfo_h < 0)
                      printf("SVD unsuccessfull for the singular values calculation only. Parameter %i is wrong\n", -devInfo_h);
                  else
                      printf("SVD unsuccessfull for the singular values calculation only. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);

                  printf("Calculation of the singular values only: %f ms\n\n", elapsedTime);

                  // --- Moving the results from device to host
                  //gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
                  //for (int i = 0; i < N; i++) std::cout << "d_S[" << i << "] = " << h_S[i] << std::endl;

                  // --- gesvd destroys the contents of d_A on exit, so the input matrix is
                  //     uploaded again; otherwise the full SVD would decompose leftover data.
                  gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

                  // --- CUDA SVD execution - Full SVD
                  timerGPU.StartCounter();
                  cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo));
                  elapsedTime = timerGPU.GetCounter();

                  devInfo_h = 0;
                  gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
                  if (devInfo_h == 0)
                      printf("SVD successfull for the full SVD calculation\n\n");
                  else if (devInfo_h < 0)
                      printf("SVD unsuccessfull for the full SVD calculation. Parameter %i is wrong\n", -devInfo_h);
                  else
                      printf("SVD unsuccessfull for the full SVD calculation. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);

                  printf("Calculation of the full SVD calculation: %f ms\n\n", elapsedTime);

                  // --- Releasing the solver handle, the device buffers and the host buffers
                  cusolveSafeCall(cusolverDnDestroy(solver_handle));

                  gpuErrchk(cudaFree(work));
                  gpuErrchk(cudaFree(d_S));
                  gpuErrchk(cudaFree(d_V));
                  gpuErrchk(cudaFree(d_U));
                  gpuErrchk(cudaFree(devInfo));
                  gpuErrchk(cudaFree(d_A));

                  free(h_S);
                  free(h_A);

                  return 0;
              }




              EDIT - PERFORMANCE ACROSS DIFFERENT VERSIONS OF CUDA



              I have compared the performance of the singular-values-only calculation and the full SVD computation for CUDA 8.0, CUDA 9.1 and CUDA 10.0, for a 5000x5000 matrix. Here are the results on a GTX 960.



              Computation type CUDA 8.0 CUDA 9.1 CUDA 10.0 
              __________________________________________________________________

              Singular values only 17s 15s 15s
              Full SVD 161s 159s 457s
              __________________________________________________________________





              share|improve this answer



























                0












                0








                0







                USE OF cusolverDn<T>gesvd



                As remarked by lebedov, as of CUDA 8.0, it is now possible to calculate the singular values only by cusolverDnSgesvd. I report below a slightly modified version of your code with two calls to cusolverDnSgesvd, one performing the singular values calculation only



                cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo)


                and one performing the full SVD calculation



                cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo)


                As you already remarked, the two 'A' fields of the full SVD case are changed to 'N' in the singular-values-only case. Please note that, in the singular-values-only case, there is no need to allocate space for the singular vector matrices U and V: a NULL pointer is passed for each of them.



                Computing only the singular values is faster than computing the full SVD. On a GTX 960, for a 1000x1000 matrix, the timings were as follows:



                Singular values only: 559 ms
                Full SVD: 2239 ms


                Here is the full code:



                #include "cuda_runtime.h"
                #include "device_launch_parameters.h"

                #include <stdio.h>

                #include<iostream>
                #include<stdlib.h>
                #include<stdio.h>

                #include <cusolverDn.h>
                #include <cuda_runtime_api.h>

                #include "Utilities.cuh"
                #include "TimingGPU.cuh"

                /********/
                /* MAIN */
                /********/
                /**
                 * Times cusolverDnSgesvd on an M x N single-precision matrix twice: once
                 * computing the singular values only (jobu = jobvt = 'N') and once computing
                 * the full SVD (jobu = jobvt = 'A'). For each run it prints the status read
                 * back from devInfo and the elapsed time reported by TimingGPU.
                 *
                 * Returns 0 on success, EXIT_FAILURE if a host buffer cannot be allocated.
                 */
                int main()
                {
                    const int M = 1000;
                    const int N = 1000;

                    TimingGPU timerGPU;
                    float elapsedTime;

                    // --- Setting the host matrix (element (i, j) is stored at j*M + i, leading dimension M)
                    float *h_A = (float *)malloc(M * N * sizeof(float));
                    // --- Host buffer for the singular values (used by the optional print-out below)
                    float *h_S = (float *)malloc(N * sizeof(float));
                    if (h_A == NULL || h_S == NULL) {
                        fprintf(stderr, "Host memory allocation failed\n");
                        free(h_A);
                        free(h_S);
                        return EXIT_FAILURE;
                    }
                    for (int i = 0; i < M; i++)
                        for (int j = 0; j < N; j++)
                            h_A[j*M + i] = (float)((i + j) * (i + j));

                    // --- Setting the device matrix and moving the host matrix to the device
                    float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
                    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

                    // --- device side SVD workspace and matrices
                    int work_size = 0;

                    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
                    float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
                    float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
                    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

                    // --- CUDA solver initialization
                    cusolverDnHandle_t solver_handle;
                    cusolveSafeCall(cusolverDnCreate(&solver_handle));

                    cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size));

                    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));

                    // --- CUDA SVD execution - Singular values only (U and V are not referenced, so NULL is passed)
                    timerGPU.StartCounter();
                    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo));
                    elapsedTime = timerGPU.GetCounter();

                    int devInfo_h = 0;
                    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
                    if (devInfo_h == 0)
                        printf("SVD successfull for the singular values calculation only\n\n");
                    else if (devInfo_h < 0)
                        printf("SVD unsuccessfull for the singular values calculation only. Parameter %i is wrong\n", -devInfo_h);
                    else
                        printf("SVD unsuccessfull for the singular values calculation only. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);

                    printf("Calculation of the singular values only: %f ms\n\n", elapsedTime);

                    // --- Moving the results from device to host
                    //gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
                    //for (int i = 0; i < N; i++) std::cout << "d_S[" << i << "] = " << h_S[i] << std::endl;

                    // --- gesvd destroys the contents of d_A on exit, so the input matrix is
                    //     uploaded again; otherwise the full SVD would decompose leftover data.
                    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

                    // --- CUDA SVD execution - Full SVD
                    timerGPU.StartCounter();
                    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo));
                    elapsedTime = timerGPU.GetCounter();

                    devInfo_h = 0;
                    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
                    if (devInfo_h == 0)
                        printf("SVD successfull for the full SVD calculation\n\n");
                    else if (devInfo_h < 0)
                        printf("SVD unsuccessfull for the full SVD calculation. Parameter %i is wrong\n", -devInfo_h);
                    else
                        printf("SVD unsuccessfull for the full SVD calculation. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);

                    printf("Calculation of the full SVD calculation: %f ms\n\n", elapsedTime);

                    // --- Releasing the solver handle, the device buffers and the host buffers
                    cusolveSafeCall(cusolverDnDestroy(solver_handle));

                    gpuErrchk(cudaFree(work));
                    gpuErrchk(cudaFree(d_S));
                    gpuErrchk(cudaFree(d_V));
                    gpuErrchk(cudaFree(d_U));
                    gpuErrchk(cudaFree(devInfo));
                    gpuErrchk(cudaFree(d_A));

                    free(h_S);
                    free(h_A);

                    return 0;
                }




                EDIT - PERFORMANCE ACROSS DIFFERENT VERSIONS OF CUDA



                I have compared the performance of the singular-values-only calculation and the full SVD computation for CUDA 8.0, CUDA 9.1 and CUDA 10.0, for a 5000x5000 matrix. Here are the results on a GTX 960.



                Computation type CUDA 8.0 CUDA 9.1 CUDA 10.0 
                __________________________________________________________________

                Singular values only 17s 15s 15s
                Full SVD 161s 159s 457s
                __________________________________________________________________





                share|improve this answer















                USE OF cusolverDn<T>gesvd



                As remarked by lebedov, as of CUDA 8.0, it is now possible to calculate the singular values only by cusolverDnSgesvd. I report below a slightly modified version of your code with two calls to cusolverDnSgesvd, one performing the singular values calculation only



                cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo)


                and one performing the full SVD calculation



                cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo)


                As you already remarked, the two 'A' fields of the full SVD case are changed to 'N' in the singular-values-only case. Please note that, in the singular-values-only case, there is no need to allocate space for the singular vector matrices U and V: a NULL pointer is passed for each of them.



                Computing only the singular values is faster than computing the full SVD. On a GTX 960, for a 1000x1000 matrix, the timings were as follows:



                Singular values only: 559 ms
                Full SVD: 2239 ms


                Here is the full code:



                #include "cuda_runtime.h"
                #include "device_launch_parameters.h"

                #include <stdio.h>

                #include<iostream>
                #include<stdlib.h>
                #include<stdio.h>

                #include <cusolverDn.h>
                #include <cuda_runtime_api.h>

                #include "Utilities.cuh"
                #include "TimingGPU.cuh"

                /********/
                /* MAIN */
                /********/
                /**
                 * Times cusolverDnSgesvd on an M x N single-precision matrix twice: once
                 * computing the singular values only (jobu = jobvt = 'N') and once computing
                 * the full SVD (jobu = jobvt = 'A'). For each run it prints the status read
                 * back from devInfo and the elapsed time reported by TimingGPU.
                 *
                 * Returns 0 on success, EXIT_FAILURE if a host buffer cannot be allocated.
                 */
                int main()
                {
                    const int M = 1000;
                    const int N = 1000;

                    TimingGPU timerGPU;
                    float elapsedTime;

                    // --- Setting the host matrix (element (i, j) is stored at j*M + i, leading dimension M)
                    float *h_A = (float *)malloc(M * N * sizeof(float));
                    // --- Host buffer for the singular values (used by the optional print-out below)
                    float *h_S = (float *)malloc(N * sizeof(float));
                    if (h_A == NULL || h_S == NULL) {
                        fprintf(stderr, "Host memory allocation failed\n");
                        free(h_A);
                        free(h_S);
                        return EXIT_FAILURE;
                    }
                    for (int i = 0; i < M; i++)
                        for (int j = 0; j < N; j++)
                            h_A[j*M + i] = (float)((i + j) * (i + j));

                    // --- Setting the device matrix and moving the host matrix to the device
                    float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
                    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

                    // --- device side SVD workspace and matrices
                    int work_size = 0;

                    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
                    float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
                    float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
                    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

                    // --- CUDA solver initialization
                    cusolverDnHandle_t solver_handle;
                    cusolveSafeCall(cusolverDnCreate(&solver_handle));

                    cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size));

                    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));

                    // --- CUDA SVD execution - Singular values only (U and V are not referenced, so NULL is passed)
                    timerGPU.StartCounter();
                    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo));
                    elapsedTime = timerGPU.GetCounter();

                    int devInfo_h = 0;
                    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
                    if (devInfo_h == 0)
                        printf("SVD successfull for the singular values calculation only\n\n");
                    else if (devInfo_h < 0)
                        printf("SVD unsuccessfull for the singular values calculation only. Parameter %i is wrong\n", -devInfo_h);
                    else
                        printf("SVD unsuccessfull for the singular values calculation only. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);

                    printf("Calculation of the singular values only: %f ms\n\n", elapsedTime);

                    // --- Moving the results from device to host
                    //gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
                    //for (int i = 0; i < N; i++) std::cout << "d_S[" << i << "] = " << h_S[i] << std::endl;

                    // --- gesvd destroys the contents of d_A on exit, so the input matrix is
                    //     uploaded again; otherwise the full SVD would decompose leftover data.
                    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

                    // --- CUDA SVD execution - Full SVD
                    timerGPU.StartCounter();
                    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo));
                    elapsedTime = timerGPU.GetCounter();

                    devInfo_h = 0;
                    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
                    if (devInfo_h == 0)
                        printf("SVD successfull for the full SVD calculation\n\n");
                    else if (devInfo_h < 0)
                        printf("SVD unsuccessfull for the full SVD calculation. Parameter %i is wrong\n", -devInfo_h);
                    else
                        printf("SVD unsuccessfull for the full SVD calculation. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);

                    printf("Calculation of the full SVD calculation: %f ms\n\n", elapsedTime);

                    // --- Releasing the solver handle, the device buffers and the host buffers
                    cusolveSafeCall(cusolverDnDestroy(solver_handle));

                    gpuErrchk(cudaFree(work));
                    gpuErrchk(cudaFree(d_S));
                    gpuErrchk(cudaFree(d_V));
                    gpuErrchk(cudaFree(d_U));
                    gpuErrchk(cudaFree(devInfo));
                    gpuErrchk(cudaFree(d_A));

                    free(h_S);
                    free(h_A);

                    return 0;
                }




                EDIT - PERFORMANCE ACROSS DIFFERENT VERSIONS OF CUDA



                I have compared the performance of the singular-values-only calculation and the full SVD computation for CUDA 8.0, CUDA 9.1 and CUDA 10.0, for a 5000x5000 matrix. Here are the results on a GTX 960.



                Computation type CUDA 8.0 CUDA 9.1 CUDA 10.0 
                __________________________________________________________________

                Singular values only 17s 15s 15s
                Full SVD 161s 159s 457s
                __________________________________________________________________






                share|improve this answer














                share|improve this answer



                share|improve this answer








                edited Nov 13 '18 at 10:12

























                answered Jan 20 '17 at 11:45









                JackOLanternJackOLantern

                14.7k355111




                14.7k355111



























                    draft saved

                    draft discarded
















































                    Thanks for contributing an answer to Stack Overflow!


                    • Please be sure to answer the question. Provide details and share your research!

                    But avoid


                    • Asking for help, clarification, or responding to other answers.

                    • Making statements based on opinion; back them up with references or personal experience.

                    To learn more, see our tips on writing great answers.




                    draft saved


                    draft discarded














                    // Once the page is ready, wire up the post-login handler for the new-answer form.
                    StackExchange.ready(
                        function () {
                            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f28107525%2fsingular-values-calculation-only-with-cuda%23new-answer', 'question_page');
                        }
                    );

                    Post as a guest















                    Required, but never shown





















































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown

































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown







                    Popular posts from this blog

                    𛂒𛀶,𛀽𛀑𛂀𛃧𛂓𛀙𛃆𛃑𛃷𛂟𛁡𛀢𛀟𛁤𛂽𛁕𛁪𛂟𛂯,𛁞𛂧𛀴𛁄𛁠𛁼𛂿𛀤 𛂘,𛁺𛂾𛃭𛃭𛃵𛀺,𛂣𛃍𛂖𛃶 𛀸𛃀𛂖𛁶𛁏𛁚 𛂢𛂞 𛁰𛂆𛀔,𛁸𛀽𛁓𛃋𛂇𛃧𛀧𛃣𛂐𛃇,𛂂𛃻𛃲𛁬𛃞𛀧𛃃𛀅 𛂭𛁠𛁡𛃇𛀷𛃓𛁥,𛁙𛁘𛁞𛃸𛁸𛃣𛁜,𛂛,𛃿,𛁯𛂘𛂌𛃛𛁱𛃌𛂈𛂇 𛁊𛃲,𛀕𛃴𛀜 𛀶𛂆𛀶𛃟𛂉𛀣,𛂐𛁞𛁾 𛁷𛂑𛁳𛂯𛀬𛃅,𛃶𛁼

                    How do I collapse sections of code in Visual Studio Code for Windows?

                    ャフサォクコ ケウ,コ,ワ メ,ロスョノ゙,クネ,フムカヤヲニ,エコ゚ツ ウイオン゙ケワサネォキモュキォウイノンコチ゚メヌナイゥフュ,カヒウネェ ネ,ホノケ,ムュキ ッボーミュハ,チ ツス ィ メウイマヤ,゙ウチ ヅ ロ,ォジヌェ ャヌット ェ,マャ,チナエヒネソキツテ トホヲヲミーァ