cuda - cuSPARSE multiplication function outputting incorrect value -


I am trying to implement sparse matrix multiplication using the cuSPARSE library. I have used most of the code from the documentation, here. Though I am getting the correct row pointers and column pointers of the output, I am getting incorrect output values.

I have followed the steps below: 1. Generate a matrix in COO format. 2. Convert it to CSR format. 3. Find the non-zero elements vector and the number of non-zero elements in the output matrix. 4. Perform the matrix multiplication. 5. Print the results.

Other than the output-matrix values, I am getting correct results (the row pointers and column pointers are correct) - the same can be printed by executing the code below. I know I am missing something simple, but I am not able to figure it out. Please let me know where I am going wrong and how it can be rectified.

    #include <stdio.h> #include <stdlib.h> #include <cuda_runtime.h> #include "cusparse_v2.h" #include <cuda_runtime_api.h> #include<iostream> #include<iomanip> #include<assert.h> #include <time.h> #include <sys/time.h>  #define cusparse_check(x) {cusparsestatus_t _c=x; if (_c != cusparse_status_success) {printf("cusparse fail: %d, line: %d\n", (int)_c, __line__); if(_c == cusparse_status_matrix_type_not_supported) {printf("cusparse_status_matrix_type_not_supported\n");} if(_c == cusparse_status_internal_error) {printf("cusparse_status_internal_error\n");} exit(-1);}}  #define cleanup(s)                                   \ {                                                 \     printf ("%s\n", s);                              \     if (yhostptr)           free(yhostptr);          \     if (zhostptr)           free(zhostptr);          \     if (xindhostptr)        free(xindhostptr);       \     if (xvalhostptr)        free(xvalhostptr);       \     if (coorowindexhostptr) free(coorowindexhostptr);\     if (coocolindexhostptr) free(coocolindexhostptr);\     if (coovalhostptr)      free(coovalhostptr);     \     if (y)                  cudafree(y);             \     if (z)                  cudafree(z);             \     if (xind)               cudafree(xind);          \     if (xval)               cudafree(xval);          \     if (csrrowptr)          cudafree(csrrowptr);     \     if (coorowindex)        cudafree(coorowindex);   \     if (coocolindex)        cudafree(coocolindex);   \     if (cooval)             cudafree(cooval);        \     if (descr)              cusparsedestroymatdescr(descr);\     if (handle)             cusparsedestroy(handle); \     cudadevicereset();          \     fflush (stdout);                                 \ } while (0)  double timerval()     {         struct timeval st;         gettimeofday(&st, null);         return (st.tv_sec+st.tv_usec*1e-6);     }  int main(){      cudaerror_t cudastat1 = cudasuccess,cudastat2 = 
cudasuccess,cudastat3 = cudasuccess,cudastat4 = cudasuccess,cudastat5 = cudasuccess,cudastat6 = cudasuccess; cusparsestatus_t status; cusparsehandle_t handle=0; cusparsematdescr_t descr=0; int *    coorowindexhostptr=0; int *    coocolindexhostptr=0;     double * coovalhostptr=0; int *    coorowindex=0; int *    coocolindex=0;     float * cooval=0; int *    xindhostptr=0; double * xvalhostptr=0; double * yhostptr=0; int *    xind=0; double * xval=0; double * y=0;   int *    csrrowptr=0; int *    csrcolptr = 0;  double * zhostptr=0;  double * z=0;  int      n, nnz, nnz_vector; double dzero =0.0; double dtwo  =2.0; double dthree=3.0; double dfive =5.0; cusparsestatus_t stat; double avg_time = 0, s_time, e_time; cusparsematdescr_t descra, descrb, descrc;   stat = cusparsecreatematdescr(&descra); cusparse_check(stat);  stat = cusparsecreatematdescr(&descrb); cusparse_check(stat);  stat = cusparsecreatematdescr(&descrc); cusparse_check(stat);  stat = cusparsesetmattype(descra, cusparse_matrix_type_general); cusparse_check(stat);  stat = cusparsesetmattype(descrb, cusparse_matrix_type_general); cusparse_check(stat);  stat = cusparsesetmattype(descrc, cusparse_matrix_type_general); cusparse_check(stat);  stat = cusparsesetmatindexbase(descra, cusparse_index_base_zero); cusparse_check(stat);  stat = cusparsesetmatindexbase(descrb, cusparse_index_base_zero); cusparse_check(stat);  stat = cusparsesetmatindexbase(descrc, cusparse_index_base_zero); cusparse_check(stat);   printf("testing example\n"); /* create following sparse test matrix in coo format */ /* |1.0     2.0 3.0|    |    4.0        |    |5.0     6.0 7.0|    |    8.0     9.0| */ n=4; nnz=9;  coorowindexhostptr = (int *)   malloc(nnz*sizeof(coorowindexhostptr[0]));  coocolindexhostptr = (int *)   malloc(nnz*sizeof(coocolindexhostptr[0]));  coovalhostptr      = (double *)malloc(nnz*sizeof(coovalhostptr[0]));  if ((!coorowindexhostptr) || (!coocolindexhostptr) || (!coovalhostptr)){     cleanup("host malloc failed 
(matrix)");     return 1;  } coorowindexhostptr[0]=0; coocolindexhostptr[0]=0; coovalhostptr[0]=1.0;   coorowindexhostptr[1]=0; coocolindexhostptr[1]=2; coovalhostptr[1]=2.0;   coorowindexhostptr[2]=0; coocolindexhostptr[2]=3; coovalhostptr[2]=3.0;   coorowindexhostptr[3]=1; coocolindexhostptr[3]=1; coovalhostptr[3]=4.0;   coorowindexhostptr[4]=2; coocolindexhostptr[4]=0; coovalhostptr[4]=5.0;   coorowindexhostptr[5]=2; coocolindexhostptr[5]=2; coovalhostptr[5]=6.0; coorowindexhostptr[6]=2; coocolindexhostptr[6]=3; coovalhostptr[6]=7.0;   coorowindexhostptr[7]=3; coocolindexhostptr[7]=1; coovalhostptr[7]=8.0;   coorowindexhostptr[8]=3; coocolindexhostptr[8]=3; coovalhostptr[8]=9.0;    //print matrix printf("input data:\n"); (int i=0; i<nnz; i++){             printf("coorowindexhostptr[%d]=%d  ",i,coorowindexhostptr[i]);     printf("coocolindexhostptr[%d]=%d  ",i,coocolindexhostptr[i]);     printf("coovalhostptr[%d]=%f     \n",i,coovalhostptr[i]); }  /* allocate gpu memory , copy matrix , vectors */ cudastat1 = cudamalloc((void**)&coorowindex,nnz*sizeof(coorowindex[0]));  cudastat2 = cudamalloc((void**)&coocolindex,nnz*sizeof(coocolindex[0])); cudastat3 = cudamalloc((void**)&cooval,     nnz*sizeof(cooval[0]));   if ((cudastat1 != cudasuccess) ||     (cudastat2 != cudasuccess) ||     (cudastat3 != cudasuccess) ||     (cudastat4 != cudasuccess) ||     (cudastat5 != cudasuccess) ||     (cudastat6 != cudasuccess)) {     cleanup("device malloc failed");     return 1;  }     cudastat1 = cudamemcpy(coorowindex, coorowindexhostptr,                         (size_t)(nnz*sizeof(coorowindex[0])),                         cudamemcpyhosttodevice); cudastat2 = cudamemcpy(coocolindex, coocolindexhostptr,                         (size_t)(nnz*sizeof(coocolindex[0])),                         cudamemcpyhosttodevice); cudastat3 = cudamemcpy(cooval,      coovalhostptr,                              (size_t)(nnz*sizeof(cooval[0])),                              cudamemcpyhosttodevice);  if 
((cudastat1 != cudasuccess) ||     (cudastat2 != cudasuccess) ||     (cudastat3 != cudasuccess) ||     (cudastat4 != cudasuccess) ||     (cudastat5 != cudasuccess) ||     (cudastat6 != cudasuccess)) {     cleanup("memcpy host device failed");     return 1; }  /* initialize cusparse library */ status= cusparsecreate(&handle); if (status != cusparse_status_success) {     cleanup("cusparse library initialization failed");     return 1; }  /* create , setup matrix descriptor */  status= cusparsecreatematdescr(&descr);  if (status != cusparse_status_success) {     cleanup("matrix descriptor initialization failed");     return 1; }     cusparsesetmattype(descr,cusparse_matrix_type_general); cusparsesetmatindexbase(descr,cusparse_index_base_zero);    /* exercise conversion routines (convert matrix coo 2 csr format) */ cudastat1 = cudamalloc(&csrrowptr,(n+1)*sizeof(csrrowptr[0])); if (cudastat1 != cudasuccess) {     cleanup("device malloc failed (csrrowptr)");     return 1; }  status= cusparsexcoo2csr(handle,coorowindex,nnz,n,                          csrrowptr,cusparse_index_base_zero);  if (status != cusparse_status_success) {     cleanup("conversion coo csr format failed");     return 1; }   /* int *csr_values; csr_values = (int *)malloc((n+1)*sizeof(int));  cudastat3 = cudamemcpy(csr_values, csrrowptr, (n+1)*sizeof(int), cudamemcpydevicetohost);     if (cudastat3 != cudasuccess) {     cleanup("device memcopy failed: csr values");     return 1;     } printf("csr values \n");     for(int y2 =0; y2< n+1; y2++)     printf("%d \t", csr_values[y2]);      */ /*  int y1;     printf("\n");     printf("col orig  is\n");     for(y1 =0; y1 < nnz; y1++)     {         printf("%d\t", coocolindex[y1]);     }         printf("\n");     printf("nnz orig is\n");     for(y1 =0; y1 < nnz; y1++)     {         printf("%f\t", h_csrvalc[y1]);     }  */   //csrrowptr data present //csrrowptr, coocolindex, cooval (all 3 matrix data) shall used here operation  int nnza = nnz, nnzb = nnz, nnzc; 
cusparseoperation_t transa = cusparse_operation_non_transpose; cusparseoperation_t transb = cusparse_operation_non_transpose;  // figure out size of c int basec; int *csrrowptrc, *csrcolindc; float *csrvalc;  // nnztotaldevhostptr points host memory int *nnztotaldevhostptr = &nnzc;     stat = cusparsesetpointermode(handle, cusparse_pointer_mode_host);     cusparse_check(stat);  cudastat1 = cudamalloc((void**)&csrrowptrc, sizeof(int)*(n+1));     if (cudastat1 != cudasuccess) {     cleanup("device malloc failed (csrrowptr)");     return 1; }  s_time=timerval(); //from here add code multiply  /*  */    stat = cusparsexcsrgemmnnz(handle, transa, transb, n, n, n,                                 descra, nnza, csrrowptr, coocolindex /*csrcolind*/,                                 descrb, nnzb, csrrowptr, coocolindex /*csrcolind*/,                                 descrc, csrrowptrc, nnztotaldevhostptr );     cusparse_check(stat);      if (null != nnztotaldevhostptr)     {         nnzc = *nnztotaldevhostptr;     }     else{     cudastat1 = cudamemcpy(&nnzc, csrrowptrc+n, sizeof(int), cudamemcpydevicetohost);     cudastat2 = cudamemcpy(&basec, csrrowptrc, sizeof(int), cudamemcpydevicetohost);     if (cudastat1 || cudastat2 != cudasuccess) {     cleanup("device malloc failed (csrrowptr)");     return 1;     }         nnzc -= basec;}      cudastat1 = cudamalloc((void**)&csrcolindc, sizeof(int)*nnzc);     if (cudastat1 != cudasuccess) {     cleanup("device malloc failed (csrcolindc)");     return 1;     }      cudastat1 = cudamalloc((void**)&csrvalc, sizeof(float)*nnzc);     if (cudastat1 != cudasuccess) {     cleanup("device malloc failed (csrvalc)");     return 1;     }      stat = cusparsescsrgemm(handle, transa, transb, n, n, n,     descra, nnza,     cooval/*csrvala*/, csrrowptr, coocolindex,     descrb, nnzb,     cooval/*csrvala*/, csrrowptr, coocolindex,     descrc,     csrvalc/*csrvala*/, csrrowptrc, csrcolindc);      cusparse_check(stat);      cudadevicesynchronize();    
  int *h_csrrowptrc = null, *h_csrcolindc = null;      float *h_csrvalc = null;     h_csrvalc =  (float *)malloc(nnzc*sizeof(float));     h_csrrowptrc = (int *)malloc(n+1*sizeof(int));     h_csrcolindc = (int *)malloc(nnzc*sizeof(int));       cudastat1 = cudamemcpy(h_csrrowptrc, csrrowptrc, (n+1)*sizeof(int), cudamemcpydevicetohost);     if (cudastat1 != cudasuccess) {     cleanup("device memcopy failed csrrowptrc");     return 1;     }      cudastat2 = cudamemcpy(h_csrcolindc, csrcolindc,  nnzc*sizeof(int), cudamemcpydevicetohost);     if (cudastat2 != cudasuccess) {     cleanup("device memcopy failed: coocolindex");     return 1;     }      printf("nnz value %d, nnzc %d\n", nnz, nnzc);       cudastat3 = cudamemcpy(h_csrvalc, csrvalc, nnzc*sizeof(float), cudamemcpydevicetohost);     if (cudastat3 != cudasuccess) {     cleanup("device memcopy failed: csrvalc");     return 1;     }      int y1;     printf("row is\n");     for(y1 =0; y1 < n+1; y1++)     {         printf("%d\t", h_csrrowptrc[y1]);     }     printf("\n");     printf("col is\n");     for(y1 =0; y1 < nnzc; y1++)     {         printf("%d\t", h_csrcolindc[y1]);     }         printf("\n");     printf("nnz is\n");     for(y1 =0; y1 < nnzc; y1++)     {         printf("%f\t", h_csrvalc[y1]);     }      /* destroy matrix descriptor */      status = cusparsedestroymatdescr(descr);      descr = 0;     if (status != cusparse_status_success) {         cleanup("matrix descriptor destruction failed");         return 1;     }          /* destroy handle */     status = cusparsedestroy(handle);     handle = 0;     if (status != cusparse_status_success) {         cleanup("cusparse library release of resources failed");         return 1;     }         cudafree(csrrowptr);     cudafree(coocolindex);     cudafree(coorowindex);     cudafree(cooval);     cudafree(csrrowptrc);     cudafree(csrcolindc);     cudafree(csrvalc);      return 0; }       

You are mixing float and double. For example:

double * coovalhostptr=0; 

and

float * cooval=0; 

When you copy double host values to a float array on the device, you're not going to get what you expect:

cudastat3 = cudamemcpy(cooval,      coovalhostptr,                          (size_t)(nnz*sizeof(cooval[0])),                          cudamemcpyhosttodevice); 

Since you are using cusparseScsrgemm, I assume your intent is to use float. Based on that, the following code has that issue fixed (I just changed a bunch of double declarations to float), and it seems to produce sensible results:

#include <stdio.h> #include <stdlib.h> #include <cuda_runtime.h> #include "cusparse_v2.h" #include <cuda_runtime_api.h> #include<iostream> #include<iomanip> #include<assert.h> #include <time.h> #include <sys/time.h>  #define cusparse_check(x) {cusparsestatus_t _c=x; if (_c != cusparse_status_success) {printf("cusparse fail: %d, line: %d\n", (int)_c, __line__); if(_c == cusparse_status_matrix_type_not_supported) {printf("cusparse_status_matrix_type_not_supported\n");} if(_c == cusparse_status_internal_error) {printf("cusparse_status_internal_error\n");} exit(-1);}}  #define cleanup(s)                                   \ {                                                 \     printf ("%s\n", s);                              \     if (yhostptr)           free(yhostptr);          \     if (zhostptr)           free(zhostptr);          \     if (xindhostptr)        free(xindhostptr);       \     if (xvalhostptr)        free(xvalhostptr);       \     if (coorowindexhostptr) free(coorowindexhostptr);\     if (coocolindexhostptr) free(coocolindexhostptr);\     if (coovalhostptr)      free(coovalhostptr);     \     if (y)                  cudafree(y);             \     if (z)                  cudafree(z);             \     if (xind)               cudafree(xind);          \     if (xval)               cudafree(xval);          \     if (csrrowptr)          cudafree(csrrowptr);     \     if (coorowindex)        cudafree(coorowindex);   \     if (coocolindex)        cudafree(coocolindex);   \     if (cooval)             cudafree(cooval);        \     if (descr)              cusparsedestroymatdescr(descr);\     if (handle)             cusparsedestroy(handle); \     cudadevicereset();          \     fflush (stdout);                                 \ } while (0)  double timerval()     {         struct timeval st;         gettimeofday(&st, null);         return (st.tv_sec+st.tv_usec*1e-6);     }  int main(){      cudaerror_t cudastat1 = cudasuccess,cudastat2 = cudasuccess,cudastat3 
= cudasuccess,cudastat4 = cudasuccess,cudastat5 = cudasuccess,cudastat6 = cudasuccess; cusparsestatus_t status; cusparsehandle_t handle=0; cusparsematdescr_t descr=0; int *    coorowindexhostptr=0; int *    coocolindexhostptr=0;     float * coovalhostptr=0; int *    coorowindex=0; int *    coocolindex=0;     float * cooval=0; int *    xindhostptr=0; float * xvalhostptr=0; float * yhostptr=0; int *    xind=0; float * xval=0; float * y=0;   int *    csrrowptr=0; int *    csrcolptr = 0;  float * zhostptr=0;  float * z=0;  int      n, nnz, nnz_vector; double dzero =0.0; double dtwo  =2.0; double dthree=3.0; double dfive =5.0; cusparsestatus_t stat; double avg_time = 0, s_time, e_time; cusparsematdescr_t descra, descrb, descrc;   stat = cusparsecreatematdescr(&descra); cusparse_check(stat);  stat = cusparsecreatematdescr(&descrb); cusparse_check(stat);  stat = cusparsecreatematdescr(&descrc); cusparse_check(stat);  stat = cusparsesetmattype(descra, cusparse_matrix_type_general); cusparse_check(stat);  stat = cusparsesetmattype(descrb, cusparse_matrix_type_general); cusparse_check(stat);  stat = cusparsesetmattype(descrc, cusparse_matrix_type_general); cusparse_check(stat);  stat = cusparsesetmatindexbase(descra, cusparse_index_base_zero); cusparse_check(stat);  stat = cusparsesetmatindexbase(descrb, cusparse_index_base_zero); cusparse_check(stat);  stat = cusparsesetmatindexbase(descrc, cusparse_index_base_zero); cusparse_check(stat);   printf("testing example\n"); /* create following sparse test matrix in coo format */ /* |1.0     2.0 3.0|    |    4.0        |    |5.0     6.0 7.0|    |    8.0     9.0| */ n=4; nnz=9;  coorowindexhostptr = (int *)   malloc(nnz*sizeof(coorowindexhostptr[0]));  coocolindexhostptr = (int *)   malloc(nnz*sizeof(coocolindexhostptr[0]));  coovalhostptr      = (float *)malloc(nnz*sizeof(coovalhostptr[0]));  if ((!coorowindexhostptr) || (!coocolindexhostptr) || (!coovalhostptr)){     cleanup("host malloc failed (matrix)");     return 1;  } 
coorowindexhostptr[0]=0; coocolindexhostptr[0]=0; coovalhostptr[0]=1.0;   coorowindexhostptr[1]=0; coocolindexhostptr[1]=2; coovalhostptr[1]=2.0;   coorowindexhostptr[2]=0; coocolindexhostptr[2]=3; coovalhostptr[2]=3.0;   coorowindexhostptr[3]=1; coocolindexhostptr[3]=1; coovalhostptr[3]=4.0;   coorowindexhostptr[4]=2; coocolindexhostptr[4]=0; coovalhostptr[4]=5.0;   coorowindexhostptr[5]=2; coocolindexhostptr[5]=2; coovalhostptr[5]=6.0; coorowindexhostptr[6]=2; coocolindexhostptr[6]=3; coovalhostptr[6]=7.0;   coorowindexhostptr[7]=3; coocolindexhostptr[7]=1; coovalhostptr[7]=8.0;   coorowindexhostptr[8]=3; coocolindexhostptr[8]=3; coovalhostptr[8]=9.0;    //print matrix printf("input data:\n"); (int i=0; i<nnz; i++){             printf("coorowindexhostptr[%d]=%d  ",i,coorowindexhostptr[i]);     printf("coocolindexhostptr[%d]=%d  ",i,coocolindexhostptr[i]);     printf("coovalhostptr[%d]=%f     \n",i,coovalhostptr[i]); }  /* allocate gpu memory , copy matrix , vectors */ cudastat1 = cudamalloc((void**)&coorowindex,nnz*sizeof(coorowindex[0]));  cudastat2 = cudamalloc((void**)&coocolindex,nnz*sizeof(coocolindex[0])); cudastat3 = cudamalloc((void**)&cooval,     nnz*sizeof(cooval[0]));   if ((cudastat1 != cudasuccess) ||     (cudastat2 != cudasuccess) ||     (cudastat3 != cudasuccess) ||     (cudastat4 != cudasuccess) ||     (cudastat5 != cudasuccess) ||     (cudastat6 != cudasuccess)) {     cleanup("device malloc failed");     return 1;  }     cudastat1 = cudamemcpy(coorowindex, coorowindexhostptr,                         (size_t)(nnz*sizeof(coorowindex[0])),                         cudamemcpyhosttodevice); cudastat2 = cudamemcpy(coocolindex, coocolindexhostptr,                         (size_t)(nnz*sizeof(coocolindex[0])),                         cudamemcpyhosttodevice); cudastat3 = cudamemcpy(cooval,      coovalhostptr,                              (size_t)(nnz*sizeof(cooval[0])),                              cudamemcpyhosttodevice);  if ((cudastat1 != cudasuccess) || 
    (cudastat2 != cudasuccess) ||     (cudastat3 != cudasuccess) ||     (cudastat4 != cudasuccess) ||     (cudastat5 != cudasuccess) ||     (cudastat6 != cudasuccess)) {     cleanup("memcpy host device failed");     return 1; }  /* initialize cusparse library */ status= cusparsecreate(&handle); if (status != cusparse_status_success) {     cleanup("cusparse library initialization failed");     return 1; }  /* create , setup matrix descriptor */  status= cusparsecreatematdescr(&descr);  if (status != cusparse_status_success) {     cleanup("matrix descriptor initialization failed");     return 1; }     cusparsesetmattype(descr,cusparse_matrix_type_general); cusparsesetmatindexbase(descr,cusparse_index_base_zero);    /* exercise conversion routines (convert matrix coo 2 csr format) */ cudastat1 = cudamalloc(&csrrowptr,(n+1)*sizeof(csrrowptr[0])); if (cudastat1 != cudasuccess) {     cleanup("device malloc failed (csrrowptr)");     return 1; }  status= cusparsexcoo2csr(handle,coorowindex,nnz,n,                          csrrowptr,cusparse_index_base_zero);  if (status != cusparse_status_success) {     cleanup("conversion coo csr format failed");     return 1; }   /* int *csr_values; csr_values = (int *)malloc((n+1)*sizeof(int));  cudastat3 = cudamemcpy(csr_values, csrrowptr, (n+1)*sizeof(int), cudamemcpydevicetohost);     if (cudastat3 != cudasuccess) {     cleanup("device memcopy failed: csr values");     return 1;     } printf("csr values \n");     for(int y2 =0; y2< n+1; y2++)     printf("%d \t", csr_values[y2]);      */ /*  int y1;     printf("\n");     printf("col orig  is\n");     for(y1 =0; y1 < nnz; y1++)     {         printf("%d\t", coocolindex[y1]);     }         printf("\n");     printf("nnz orig is\n");     for(y1 =0; y1 < nnz; y1++)     {         printf("%f\t", h_csrvalc[y1]);     }  */   //csrrowptr data present //csrrowptr, coocolindex, cooval (all 3 matrix data) shall used here operation  int nnza = nnz, nnzb = nnz, nnzc; cusparseoperation_t transa = 
cusparse_operation_non_transpose; cusparseoperation_t transb = cusparse_operation_non_transpose;  // figure out size of c int basec; int *csrrowptrc, *csrcolindc; float *csrvalc;  // nnztotaldevhostptr points host memory int *nnztotaldevhostptr = &nnzc;     stat = cusparsesetpointermode(handle, cusparse_pointer_mode_host);     cusparse_check(stat);  cudastat1 = cudamalloc((void**)&csrrowptrc, sizeof(int)*(n+1));     if (cudastat1 != cudasuccess) {     cleanup("device malloc failed (csrrowptr)");     return 1; }  s_time=timerval(); //from here add code multiply  /*  */    stat = cusparsexcsrgemmnnz(handle, transa, transb, n, n, n,                                 descra, nnza, csrrowptr, coocolindex /*csrcolind*/,                                 descrb, nnzb, csrrowptr, coocolindex /*csrcolind*/,                                 descrc, csrrowptrc, nnztotaldevhostptr );     cusparse_check(stat);      if (null != nnztotaldevhostptr)     {         nnzc = *nnztotaldevhostptr;     }     else{     cudastat1 = cudamemcpy(&nnzc, csrrowptrc+n, sizeof(int), cudamemcpydevicetohost);     cudastat2 = cudamemcpy(&basec, csrrowptrc, sizeof(int), cudamemcpydevicetohost);     if (cudastat1 || cudastat2 != cudasuccess) {     cleanup("device malloc failed (csrrowptr)");     return 1;     }         nnzc -= basec;}      cudastat1 = cudamalloc((void**)&csrcolindc, sizeof(int)*nnzc);     if (cudastat1 != cudasuccess) {     cleanup("device malloc failed (csrcolindc)");     return 1;     }      cudastat1 = cudamalloc((void**)&csrvalc, sizeof(float)*nnzc);     if (cudastat1 != cudasuccess) {     cleanup("device malloc failed (csrvalc)");     return 1;     }      stat = cusparsescsrgemm(handle, transa, transb, n, n, n,     descra, nnza,     cooval/*csrvala*/, csrrowptr, coocolindex,     descrb, nnzb,     cooval/*csrvala*/, csrrowptr, coocolindex,     descrc,     csrvalc/*csrvala*/, csrrowptrc, csrcolindc);      cusparse_check(stat);      cudadevicesynchronize();      int *h_csrrowptrc = null, 
*h_csrcolindc = null;      float *h_csrvalc = null;     h_csrvalc =  (float *)malloc(nnzc*sizeof(float));     h_csrrowptrc = (int *)malloc(n+1*sizeof(int));     h_csrcolindc = (int *)malloc(nnzc*sizeof(int));       cudastat1 = cudamemcpy(h_csrrowptrc, csrrowptrc, (n+1)*sizeof(int), cudamemcpydevicetohost);     if (cudastat1 != cudasuccess) {     cleanup("device memcopy failed csrrowptrc");     return 1;     }      cudastat2 = cudamemcpy(h_csrcolindc, csrcolindc,  nnzc*sizeof(int), cudamemcpydevicetohost);     if (cudastat2 != cudasuccess) {     cleanup("device memcopy failed: coocolindex");     return 1;     }      printf("nnz value %d, nnzc %d\n", nnz, nnzc);       cudastat3 = cudamemcpy(h_csrvalc, csrvalc, nnzc*sizeof(float), cudamemcpydevicetohost);     if (cudastat3 != cudasuccess) {     cleanup("device memcopy failed: csrvalc");     return 1;     }      int y1;     printf("row is\n");     for(y1 =0; y1 < n+1; y1++)     {         printf("%d\t", h_csrrowptrc[y1]);     }     printf("\n");     printf("col is\n");     for(y1 =0; y1 < nnzc; y1++)     {         printf("%d\t", h_csrcolindc[y1]);     }         printf("\n");     printf("nnz is\n");     for(y1 =0; y1 < nnzc; y1++)     {         printf("%f\t", h_csrvalc[y1]);     }     printf("\n");     /* destroy matrix descriptor */      status = cusparsedestroymatdescr(descr);      descr = 0;     if (status != cusparse_status_success) {         cleanup("matrix descriptor destruction failed");         return 1;     }          /* destroy handle */     status = cusparsedestroy(handle);     handle = 0;     if (status != cusparse_status_success) {         cleanup("cusparse library release of resources failed");         return 1;     }         cudafree(csrrowptr);     cudafree(coocolindex);     cudafree(coorowindex);     cudafree(cooval);     cudafree(csrrowptrc);     cudafree(csrcolindc);     cudafree(csrvalc);      return 0; }  

Comments

Popular posts from this blog

asp.net mvc - SSO between MVCForum and Umbraco7 -

Python Tkinter keyboard using bind -

ubuntu - Selenium Node Not Connecting to Hub, Not Opening Port -