CUDA - cuSPARSE multiplication function outputting incorrect values
I am trying to implement sparse matrix multiplication using the cuSPARSE library. I have used most of the code from the documentation, here. Though I am getting the correct row pointers and column pointers for the output, I am getting incorrect output values.
I have followed the steps below: 1. Generate a COO-format matrix. 2. Convert it to CSR format. 3. Find the non-zero-elements vector and the number of non-zero elements in the output matrix. 4. Perform the matrix multiplication. 5. Print the results.
Other than the output-matrix values, I am getting correct results (the row pointers and column pointers are correct) - the same can be printed by executing the code below. I know I am missing something simple, but I am not able to figure it out. Please let me know where I am going wrong and how it can be rectified.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "cusparse_v2.h"
#include <cuda_runtime_api.h>
#include <iostream>
#include <iomanip>
#include <assert.h>
#include <time.h>
#include <sys/time.h>

/* Abort the program with a diagnostic when a cuSPARSE call does not succeed. */
#define CUSPARSE_CHECK(x)                                                  \
    do {                                                                   \
        cusparseStatus_t _c = (x);                                         \
        if (_c != CUSPARSE_STATUS_SUCCESS) {                               \
            printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__);    \
            if (_c == CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED) {         \
                printf("cusparse_status_matrix_type_not_supported\n");     \
            }                                                              \
            if (_c == CUSPARSE_STATUS_INTERNAL_ERROR) {                    \
                printf("cusparse_status_internal_error\n");                \
            }                                                              \
            exit(-1);                                                      \
        }                                                                  \
    } while (0)

/*
 * Print message `s`, then release every host buffer, device buffer,
 * descriptor and handle that main() owns. All pointers/handles are
 * initialised to 0 in main(), so this is safe at any point of the run.
 */
#define CLEANUP(s)                                                         \
    do {                                                                   \
        printf("%s\n", s);                                                 \
        free(coorowindexhostptr);                                          \
        free(coocolindexhostptr);                                          \
        free(coovalhostptr);                                               \
        free(h_csrrowptrc);                                                \
        free(h_csrcolindc);                                                \
        free(h_csrvalc);                                                   \
        if (csrrowptr)   cudaFree(csrrowptr);                              \
        if (coorowindex) cudaFree(coorowindex);                            \
        if (coocolindex) cudaFree(coocolindex);                            \
        if (cooval)      cudaFree(cooval);                                 \
        if (csrrowptrc)  cudaFree(csrrowptrc);                             \
        if (csrcolindc)  cudaFree(csrcolindc);                             \
        if (csrvalc)     cudaFree(csrvalc);                                \
        if (descra) cusparseDestroyMatDescr(descra);                       \
        if (descrb) cusparseDestroyMatDescr(descrb);                       \
        if (descrc) cusparseDestroyMatDescr(descrc);                       \
        if (handle) cusparseDestroy(handle);                               \
        cudaDeviceReset();                                                 \
        fflush(stdout);                                                    \
    } while (0)

/* Wall-clock time in seconds, as reported by gettimeofday(). */
double timerval()
{
    struct timeval st;
    gettimeofday(&st, NULL);
    return (st.tv_sec + st.tv_usec * 1e-6);
}

/*
 * Builds a 4x4 sparse matrix A in COO form, converts its row indices to CSR
 * row pointers on the device, computes C = A * A with the legacy cuSPARSE
 * csrgemm routines and prints C's row pointers, column indices and values.
 *
 * Returns 0 on success and 1 on any CUDA / allocation failure; a failing
 * call wrapped in CUSPARSE_CHECK exits with status -1.
 *
 * NOTE(review): cusparseXcsrgemmNnz / cusparseScsrgemm belong to the legacy
 * cuSPARSE API; newer toolkits replace them (cusparseSpGEMM) - confirm
 * against the toolkit version in use.
 */
int main()
{
    cudaError_t cudastat1 = cudaSuccess, cudastat2 = cudaSuccess, cudastat3 = cudaSuccess;
    cusparseStatus_t stat;
    cusparseHandle_t handle = 0;
    cusparseMatDescr_t descra = 0, descrb = 0, descrc = 0;

    /* Host-side COO input. */
    int *coorowindexhostptr = 0;
    int *coocolindexhostptr = 0;
    /*
     * NOTE: the host values are double while the device buffer `cooval`
     * below is float. This precision mismatch is the defect diagnosed in
     * the answer that follows this listing; it is kept here as posted.
     */
    double *coovalhostptr = 0;

    /* Device-side input (COO arrays plus the CSR row pointers). */
    int *coorowindex = 0;
    int *coocolindex = 0;
    float *cooval = 0;
    int *csrrowptr = 0;

    /* Device-side result C in CSR form. */
    int *csrrowptrc = 0;
    int *csrcolindc = 0;
    float *csrvalc = 0;

    /* Host copies of the result. */
    int *h_csrrowptrc = NULL;
    int *h_csrcolindc = NULL;
    float *h_csrvalc = NULL;

    const int n = 4;    /* matrix is n x n */
    const int nnz = 9;  /* non-zeros in the input */
    int nnzc = 0;       /* non-zeros in the product */

    /* General, zero-based descriptors for A, B and C. */
    cusparseMatDescr_t *descrs[3] = {&descra, &descrb, &descrc};
    for (int d = 0; d < 3; d++) {
        CUSPARSE_CHECK(cusparseCreateMatDescr(descrs[d]));
        CUSPARSE_CHECK(cusparseSetMatType(*descrs[d], CUSPARSE_MATRIX_TYPE_GENERAL));
        CUSPARSE_CHECK(cusparseSetMatIndexBase(*descrs[d], CUSPARSE_INDEX_BASE_ZERO));
    }

    printf("testing example\n");

    /* Create the following sparse test matrix in COO format:
     *   |1.0     2.0 3.0|
     *   |    4.0        |
     *   |5.0     6.0 7.0|
     *   |    8.0     9.0|
     */
    static const int rowidx[9] = {0, 0, 0, 1, 2, 2, 2, 3, 3};
    static const int colidx[9] = {0, 2, 3, 1, 0, 2, 3, 1, 3};
    static const double values[9] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0};

    coorowindexhostptr = (int *)malloc(nnz * sizeof(coorowindexhostptr[0]));
    coocolindexhostptr = (int *)malloc(nnz * sizeof(coocolindexhostptr[0]));
    coovalhostptr = (double *)malloc(nnz * sizeof(coovalhostptr[0]));
    if ((!coorowindexhostptr) || (!coocolindexhostptr) || (!coovalhostptr)) {
        CLEANUP("host malloc failed (matrix)");
        return 1;
    }
    for (int i = 0; i < nnz; i++) {
        coorowindexhostptr[i] = rowidx[i];
        coocolindexhostptr[i] = colidx[i];
        coovalhostptr[i] = values[i];
    }

    /* Print the input matrix. */
    printf("input data:\n");
    for (int i = 0; i < nnz; i++) {
        printf("coorowindexhostptr[%d]=%d ", i, coorowindexhostptr[i]);
        printf("coocolindexhostptr[%d]=%d ", i, coocolindexhostptr[i]);
        printf("coovalhostptr[%d]=%f \n", i, coovalhostptr[i]);
    }

    /* Allocate GPU memory and copy the matrix. */
    cudastat1 = cudaMalloc((void **)&coorowindex, nnz * sizeof(coorowindex[0]));
    cudastat2 = cudaMalloc((void **)&coocolindex, nnz * sizeof(coocolindex[0]));
    cudastat3 = cudaMalloc((void **)&cooval, nnz * sizeof(cooval[0]));
    if ((cudastat1 != cudaSuccess) || (cudastat2 != cudaSuccess) || (cudastat3 != cudaSuccess)) {
        CLEANUP("device malloc failed");
        return 1;
    }
    cudastat1 = cudaMemcpy(coorowindex, coorowindexhostptr,
                           (size_t)(nnz * sizeof(coorowindex[0])), cudaMemcpyHostToDevice);
    cudastat2 = cudaMemcpy(coocolindex, coocolindexhostptr,
                           (size_t)(nnz * sizeof(coocolindex[0])), cudaMemcpyHostToDevice);
    /* Copies nnz*sizeof(float) bytes out of a double array - see NOTE above. */
    cudastat3 = cudaMemcpy(cooval, coovalhostptr,
                           (size_t)(nnz * sizeof(cooval[0])), cudaMemcpyHostToDevice);
    if ((cudastat1 != cudaSuccess) || (cudastat2 != cudaSuccess) || (cudastat3 != cudaSuccess)) {
        CLEANUP("memcpy host device failed");
        return 1;
    }

    /* Initialize the cuSPARSE library. */
    stat = cusparseCreate(&handle);
    if (stat != CUSPARSE_STATUS_SUCCESS) {
        CLEANUP("cusparse library initialization failed");
        return 1;
    }

    /* Convert the COO row indices into CSR row pointers. */
    cudastat1 = cudaMalloc((void **)&csrrowptr, (n + 1) * sizeof(csrrowptr[0]));
    if (cudastat1 != cudaSuccess) {
        CLEANUP("device malloc failed (csrrowptr)");
        return 1;
    }
    stat = cusparseXcoo2csr(handle, coorowindex, nnz, n, csrrowptr, CUSPARSE_INDEX_BASE_ZERO);
    if (stat != CUSPARSE_STATUS_SUCCESS) {
        CLEANUP("conversion coo csr format failed");
        return 1;
    }

    /* From here on A = B = (csrrowptr, coocolindex, cooval). */
    const int nnza = nnz, nnzb = nnz;
    const cusparseOperation_t transa = CUSPARSE_OPERATION_NON_TRANSPOSE;
    const cusparseOperation_t transb = CUSPARSE_OPERATION_NON_TRANSPOSE;

    /* Host pointer mode: the non-zero count of C is written to host memory. */
    CUSPARSE_CHECK(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));

    cudastat1 = cudaMalloc((void **)&csrrowptrc, sizeof(int) * (n + 1));
    if (cudastat1 != cudaSuccess) {
        CLEANUP("device malloc failed (csrrowptrc)");
        return 1;
    }

    /* Step 1: row pointers of C and its total number of non-zeros. */
    CUSPARSE_CHECK(cusparseXcsrgemmNnz(handle, transa, transb, n, n, n,
                                       descra, nnza, csrrowptr, coocolindex,
                                       descrb, nnzb, csrrowptr, coocolindex,
                                       descrc, csrrowptrc, &nnzc));

    cudastat1 = cudaMalloc((void **)&csrcolindc, sizeof(int) * nnzc);
    if (cudastat1 != cudaSuccess) {
        CLEANUP("device malloc failed (csrcolindc)");
        return 1;
    }
    cudastat1 = cudaMalloc((void **)&csrvalc, sizeof(float) * nnzc);
    if (cudastat1 != cudaSuccess) {
        CLEANUP("device malloc failed (csrvalc)");
        return 1;
    }

    /* Step 2: column indices and values of C (single precision). */
    CUSPARSE_CHECK(cusparseScsrgemm(handle, transa, transb, n, n, n,
                                    descra, nnza, cooval, csrrowptr, coocolindex,
                                    descrb, nnzb, cooval, csrrowptr, coocolindex,
                                    descrc, csrvalc, csrrowptrc, csrcolindc));
    if (cudaDeviceSynchronize() != cudaSuccess) {
        CLEANUP("device synchronize failed");
        return 1;
    }

    /* Bring the result back to the host. */
    h_csrvalc = (float *)malloc(nnzc * sizeof(float));
    /* (n+1) must be parenthesised: n+1*sizeof(int) would under-allocate. */
    h_csrrowptrc = (int *)malloc((n + 1) * sizeof(int));
    h_csrcolindc = (int *)malloc(nnzc * sizeof(int));
    if (!h_csrrowptrc || (nnzc > 0 && (!h_csrvalc || !h_csrcolindc))) {
        CLEANUP("host malloc failed (result)");
        return 1;
    }

    cudastat1 = cudaMemcpy(h_csrrowptrc, csrrowptrc, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudastat1 != cudaSuccess) {
        CLEANUP("device memcopy failed csrrowptrc");
        return 1;
    }
    cudastat2 = cudaMemcpy(h_csrcolindc, csrcolindc, nnzc * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudastat2 != cudaSuccess) {
        CLEANUP("device memcopy failed: csrcolindc");
        return 1;
    }
    printf("nnz value %d, nnzc %d\n", nnz, nnzc);
    cudastat3 = cudaMemcpy(h_csrvalc, csrvalc, nnzc * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudastat3 != cudaSuccess) {
        CLEANUP("device memcopy failed: csrvalc");
        return 1;
    }

    int y1;
    printf("row is\n");
    for (y1 = 0; y1 < n + 1; y1++) {
        printf("%d\t", h_csrrowptrc[y1]);
    }
    printf("\n");
    printf("col is\n");
    for (y1 = 0; y1 < nnzc; y1++) {
        printf("%d\t", h_csrcolindc[y1]);
    }
    printf("\n");
    printf("nnz is\n");
    for (y1 = 0; y1 < nnzc; y1++) {
        printf("%f\t", h_csrvalc[y1]);
    }

    /* Destroy the matrix descriptors (zeroed first so CLEANUP skips them). */
    for (int d = 0; d < 3; d++) {
        cusparseMatDescr_t doomed = *descrs[d];
        *descrs[d] = 0;
        if (cusparseDestroyMatDescr(doomed) != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("matrix descriptor destruction failed");
            return 1;
        }
    }

    /* Destroy the library handle. */
    stat = cusparseDestroy(handle);
    handle = 0;
    if (stat != CUSPARSE_STATUS_SUCCESS) {
        CLEANUP("cusparse library release of resources failed");
        return 1;
    }

    cudaFree(csrrowptr);
    cudaFree(coocolindex);
    cudaFree(coorowindex);
    cudaFree(cooval);
    cudaFree(csrrowptrc);
    cudaFree(csrcolindc);
    cudaFree(csrvalc);
    free(coorowindexhostptr);
    free(coocolindexhostptr);
    free(coovalhostptr);
    free(h_csrrowptrc);
    free(h_csrcolindc);
    free(h_csrvalc);
    return 0;
}
You are mixing float and double. For example:
double * coovalhostptr=0;
and
float * cooval=0;
When you copy double host values to a float array on the device, you're not going to get what you expect:
cudastat3 = cudaMemcpy(cooval, coovalhostptr, (size_t)(nnz*sizeof(cooval[0])), cudaMemcpyHostToDevice);
Since you are using cusparseScsrgemm, I assume your intent is to use float. Based on that, the following code has that issue fixed (I just changed a bunch of double declarations to float), and it seems to produce sensible results:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "cusparse_v2.h"
#include <cuda_runtime_api.h>
#include <iostream>
#include <iomanip>
#include <assert.h>
#include <time.h>
#include <sys/time.h>

/* Abort the program with a diagnostic when a cuSPARSE call does not succeed. */
#define CUSPARSE_CHECK(x)                                                  \
    do {                                                                   \
        cusparseStatus_t _c = (x);                                         \
        if (_c != CUSPARSE_STATUS_SUCCESS) {                               \
            printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__);    \
            if (_c == CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED) {         \
                printf("cusparse_status_matrix_type_not_supported\n");     \
            }                                                              \
            if (_c == CUSPARSE_STATUS_INTERNAL_ERROR) {                    \
                printf("cusparse_status_internal_error\n");                \
            }                                                              \
            exit(-1);                                                      \
        }                                                                  \
    } while (0)

/*
 * Print message `s`, then release every host buffer, device buffer,
 * descriptor and handle that main() owns. All pointers/handles are
 * initialised to 0 in main(), so this is safe at any point of the run.
 */
#define CLEANUP(s)                                                         \
    do {                                                                   \
        printf("%s\n", s);                                                 \
        free(coorowindexhostptr);                                          \
        free(coocolindexhostptr);                                          \
        free(coovalhostptr);                                               \
        free(h_csrrowptrc);                                                \
        free(h_csrcolindc);                                                \
        free(h_csrvalc);                                                   \
        if (csrrowptr)   cudaFree(csrrowptr);                              \
        if (coorowindex) cudaFree(coorowindex);                            \
        if (coocolindex) cudaFree(coocolindex);                            \
        if (cooval)      cudaFree(cooval);                                 \
        if (csrrowptrc)  cudaFree(csrrowptrc);                             \
        if (csrcolindc)  cudaFree(csrcolindc);                             \
        if (csrvalc)     cudaFree(csrvalc);                                \
        if (descra) cusparseDestroyMatDescr(descra);                       \
        if (descrb) cusparseDestroyMatDescr(descrb);                       \
        if (descrc) cusparseDestroyMatDescr(descrc);                       \
        if (handle) cusparseDestroy(handle);                               \
        cudaDeviceReset();                                                 \
        fflush(stdout);                                                    \
    } while (0)

/* Wall-clock time in seconds, as reported by gettimeofday(). */
double timerval()
{
    struct timeval st;
    gettimeofday(&st, NULL);
    return (st.tv_sec + st.tv_usec * 1e-6);
}

/*
 * Builds a 4x4 sparse matrix A in COO form, converts its row indices to CSR
 * row pointers on the device, computes C = A * A with the legacy cuSPARSE
 * csrgemm routines and prints C's row pointers, column indices and values.
 *
 * Host and device value buffers are both float, matching the
 * single-precision cusparseScsrgemm call.
 *
 * Returns 0 on success and 1 on any CUDA / allocation failure; a failing
 * call wrapped in CUSPARSE_CHECK exits with status -1.
 *
 * NOTE(review): cusparseXcsrgemmNnz / cusparseScsrgemm belong to the legacy
 * cuSPARSE API; newer toolkits replace them (cusparseSpGEMM) - confirm
 * against the toolkit version in use.
 */
int main()
{
    cudaError_t cudastat1 = cudaSuccess, cudastat2 = cudaSuccess, cudastat3 = cudaSuccess;
    cusparseStatus_t stat;
    cusparseHandle_t handle = 0;
    cusparseMatDescr_t descra = 0, descrb = 0, descrc = 0;

    /* Host-side COO input. */
    int *coorowindexhostptr = 0;
    int *coocolindexhostptr = 0;
    float *coovalhostptr = 0;

    /* Device-side input (COO arrays plus the CSR row pointers). */
    int *coorowindex = 0;
    int *coocolindex = 0;
    float *cooval = 0;
    int *csrrowptr = 0;

    /* Device-side result C in CSR form. */
    int *csrrowptrc = 0;
    int *csrcolindc = 0;
    float *csrvalc = 0;

    /* Host copies of the result. */
    int *h_csrrowptrc = NULL;
    int *h_csrcolindc = NULL;
    float *h_csrvalc = NULL;

    const int n = 4;    /* matrix is n x n */
    const int nnz = 9;  /* non-zeros in the input */
    int nnzc = 0;       /* non-zeros in the product */

    /* General, zero-based descriptors for A, B and C. */
    cusparseMatDescr_t *descrs[3] = {&descra, &descrb, &descrc};
    for (int d = 0; d < 3; d++) {
        CUSPARSE_CHECK(cusparseCreateMatDescr(descrs[d]));
        CUSPARSE_CHECK(cusparseSetMatType(*descrs[d], CUSPARSE_MATRIX_TYPE_GENERAL));
        CUSPARSE_CHECK(cusparseSetMatIndexBase(*descrs[d], CUSPARSE_INDEX_BASE_ZERO));
    }

    printf("testing example\n");

    /* Create the following sparse test matrix in COO format:
     *   |1.0     2.0 3.0|
     *   |    4.0        |
     *   |5.0     6.0 7.0|
     *   |    8.0     9.0|
     */
    static const int rowidx[9] = {0, 0, 0, 1, 2, 2, 2, 3, 3};
    static const int colidx[9] = {0, 2, 3, 1, 0, 2, 3, 1, 3};
    static const float values[9] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};

    coorowindexhostptr = (int *)malloc(nnz * sizeof(coorowindexhostptr[0]));
    coocolindexhostptr = (int *)malloc(nnz * sizeof(coocolindexhostptr[0]));
    coovalhostptr = (float *)malloc(nnz * sizeof(coovalhostptr[0]));
    if ((!coorowindexhostptr) || (!coocolindexhostptr) || (!coovalhostptr)) {
        CLEANUP("host malloc failed (matrix)");
        return 1;
    }
    for (int i = 0; i < nnz; i++) {
        coorowindexhostptr[i] = rowidx[i];
        coocolindexhostptr[i] = colidx[i];
        coovalhostptr[i] = values[i];
    }

    /* Print the input matrix. */
    printf("input data:\n");
    for (int i = 0; i < nnz; i++) {
        printf("coorowindexhostptr[%d]=%d ", i, coorowindexhostptr[i]);
        printf("coocolindexhostptr[%d]=%d ", i, coocolindexhostptr[i]);
        printf("coovalhostptr[%d]=%f \n", i, coovalhostptr[i]);
    }

    /* Allocate GPU memory and copy the matrix. */
    cudastat1 = cudaMalloc((void **)&coorowindex, nnz * sizeof(coorowindex[0]));
    cudastat2 = cudaMalloc((void **)&coocolindex, nnz * sizeof(coocolindex[0]));
    cudastat3 = cudaMalloc((void **)&cooval, nnz * sizeof(cooval[0]));
    if ((cudastat1 != cudaSuccess) || (cudastat2 != cudaSuccess) || (cudastat3 != cudaSuccess)) {
        CLEANUP("device malloc failed");
        return 1;
    }
    cudastat1 = cudaMemcpy(coorowindex, coorowindexhostptr,
                           (size_t)(nnz * sizeof(coorowindex[0])), cudaMemcpyHostToDevice);
    cudastat2 = cudaMemcpy(coocolindex, coocolindexhostptr,
                           (size_t)(nnz * sizeof(coocolindex[0])), cudaMemcpyHostToDevice);
    cudastat3 = cudaMemcpy(cooval, coovalhostptr,
                           (size_t)(nnz * sizeof(cooval[0])), cudaMemcpyHostToDevice);
    if ((cudastat1 != cudaSuccess) || (cudastat2 != cudaSuccess) || (cudastat3 != cudaSuccess)) {
        CLEANUP("memcpy host device failed");
        return 1;
    }

    /* Initialize the cuSPARSE library. */
    stat = cusparseCreate(&handle);
    if (stat != CUSPARSE_STATUS_SUCCESS) {
        CLEANUP("cusparse library initialization failed");
        return 1;
    }

    /* Convert the COO row indices into CSR row pointers. */
    cudastat1 = cudaMalloc((void **)&csrrowptr, (n + 1) * sizeof(csrrowptr[0]));
    if (cudastat1 != cudaSuccess) {
        CLEANUP("device malloc failed (csrrowptr)");
        return 1;
    }
    stat = cusparseXcoo2csr(handle, coorowindex, nnz, n, csrrowptr, CUSPARSE_INDEX_BASE_ZERO);
    if (stat != CUSPARSE_STATUS_SUCCESS) {
        CLEANUP("conversion coo csr format failed");
        return 1;
    }

    /* From here on A = B = (csrrowptr, coocolindex, cooval). */
    const int nnza = nnz, nnzb = nnz;
    const cusparseOperation_t transa = CUSPARSE_OPERATION_NON_TRANSPOSE;
    const cusparseOperation_t transb = CUSPARSE_OPERATION_NON_TRANSPOSE;

    /* Host pointer mode: the non-zero count of C is written to host memory. */
    CUSPARSE_CHECK(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));

    cudastat1 = cudaMalloc((void **)&csrrowptrc, sizeof(int) * (n + 1));
    if (cudastat1 != cudaSuccess) {
        CLEANUP("device malloc failed (csrrowptrc)");
        return 1;
    }

    /* Step 1: row pointers of C and its total number of non-zeros. */
    CUSPARSE_CHECK(cusparseXcsrgemmNnz(handle, transa, transb, n, n, n,
                                       descra, nnza, csrrowptr, coocolindex,
                                       descrb, nnzb, csrrowptr, coocolindex,
                                       descrc, csrrowptrc, &nnzc));

    cudastat1 = cudaMalloc((void **)&csrcolindc, sizeof(int) * nnzc);
    if (cudastat1 != cudaSuccess) {
        CLEANUP("device malloc failed (csrcolindc)");
        return 1;
    }
    cudastat1 = cudaMalloc((void **)&csrvalc, sizeof(float) * nnzc);
    if (cudastat1 != cudaSuccess) {
        CLEANUP("device malloc failed (csrvalc)");
        return 1;
    }

    /* Step 2: column indices and values of C (single precision). */
    CUSPARSE_CHECK(cusparseScsrgemm(handle, transa, transb, n, n, n,
                                    descra, nnza, cooval, csrrowptr, coocolindex,
                                    descrb, nnzb, cooval, csrrowptr, coocolindex,
                                    descrc, csrvalc, csrrowptrc, csrcolindc));
    if (cudaDeviceSynchronize() != cudaSuccess) {
        CLEANUP("device synchronize failed");
        return 1;
    }

    /* Bring the result back to the host. */
    h_csrvalc = (float *)malloc(nnzc * sizeof(float));
    /* (n+1) must be parenthesised: n+1*sizeof(int) would under-allocate. */
    h_csrrowptrc = (int *)malloc((n + 1) * sizeof(int));
    h_csrcolindc = (int *)malloc(nnzc * sizeof(int));
    if (!h_csrrowptrc || (nnzc > 0 && (!h_csrvalc || !h_csrcolindc))) {
        CLEANUP("host malloc failed (result)");
        return 1;
    }

    cudastat1 = cudaMemcpy(h_csrrowptrc, csrrowptrc, (n + 1) * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudastat1 != cudaSuccess) {
        CLEANUP("device memcopy failed csrrowptrc");
        return 1;
    }
    cudastat2 = cudaMemcpy(h_csrcolindc, csrcolindc, nnzc * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudastat2 != cudaSuccess) {
        CLEANUP("device memcopy failed: csrcolindc");
        return 1;
    }
    printf("nnz value %d, nnzc %d\n", nnz, nnzc);
    cudastat3 = cudaMemcpy(h_csrvalc, csrvalc, nnzc * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudastat3 != cudaSuccess) {
        CLEANUP("device memcopy failed: csrvalc");
        return 1;
    }

    int y1;
    printf("row is\n");
    for (y1 = 0; y1 < n + 1; y1++) {
        printf("%d\t", h_csrrowptrc[y1]);
    }
    printf("\n");
    printf("col is\n");
    for (y1 = 0; y1 < nnzc; y1++) {
        printf("%d\t", h_csrcolindc[y1]);
    }
    printf("\n");
    printf("nnz is\n");
    for (y1 = 0; y1 < nnzc; y1++) {
        printf("%f\t", h_csrvalc[y1]);
    }
    printf("\n");

    /* Destroy the matrix descriptors (zeroed first so CLEANUP skips them). */
    for (int d = 0; d < 3; d++) {
        cusparseMatDescr_t doomed = *descrs[d];
        *descrs[d] = 0;
        if (cusparseDestroyMatDescr(doomed) != CUSPARSE_STATUS_SUCCESS) {
            CLEANUP("matrix descriptor destruction failed");
            return 1;
        }
    }

    /* Destroy the library handle. */
    stat = cusparseDestroy(handle);
    handle = 0;
    if (stat != CUSPARSE_STATUS_SUCCESS) {
        CLEANUP("cusparse library release of resources failed");
        return 1;
    }

    cudaFree(csrrowptr);
    cudaFree(coocolindex);
    cudaFree(coorowindex);
    cudaFree(cooval);
    cudaFree(csrrowptrc);
    cudaFree(csrcolindc);
    cudaFree(csrvalc);
    free(coorowindexhostptr);
    free(coocolindexhostptr);
    free(coovalhostptr);
    free(h_csrrowptrc);
    free(h_csrcolindc);
    free(h_csrvalc);
    return 0;
}
Comments
Post a Comment