Привет.
Недавно начал учить cudaC/C++, решил поупражняться и застрял на какойто не понятной для меня ошибке так как я новичок. Может ли ктото кто имеет опыт cudaC/C++ немного помочь с данным кодом.
Проблема в цикле while который выполняет непосредственно сами итерации алгоритма, насколько я понимаю не изменяется правельно значение переменной dev_r1, в чем я не особо уверен. Цикл в нормальной cudaSample-овской версии итерируется 8 раз а тут он с ума сходит. Никак не могу найти где именно ошибка.
//includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/* Using updated (v2) interfaces to cublas and cusparse */
#include <cuda_runtime.h>
#include <cusparse.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cuda.h>
// Utilities and system includes
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
#include <helper_cuda.h> // helper function CUDA error checking and intialization
const char *sSDKname = "conjugateGradientManaged";
/* genTridiag: generate a random tridiagonal symmetric matrix */
void genTridiag(int *I, int *J, float *val, int N, int nz)
{
I[0] = 0, J[0] = 0, J[1] = 1;
val[0] = (float)rand()/RAND_MAX + 10.0f;
val[1] = (float)rand()/RAND_MAX;
int start;
for (int i = 1; i < N; i++)
{
if (i > 1)
{
I[i] = I[i-1]+3;
}
else
{
I[1] = 2;
}
start = (i-1)*3 + 2;
J[start] = i - 1;
J[start+1] = i;
if (i < N-1)
{
J[start+2] = i + 1;
}
val[start] = val[start-1];
val[start+1] = (float)rand()/RAND_MAX + 10.0f;
if (i < N-1)
{
val[start+2] = (float)rand()/RAND_MAX;
}
}
I[N] = nz;
}
__global__ void set_b(float *b, float *r1, float *r0)
{
*b = *r1 / *r0;
//printf("%f\n", b);
}
__global__ void set_a(float *a, float *r1, float *dot)
{
//printf("%f\n", a);
*a = *r1 / *dot;
//printf("%f\n", a);
}
__global__ void set_na(float *na, float *a)
{
*na = -*a;
//printf("%f\n", na);
}
__global__ void set_r0(float *r0, float *r1)
{
*r0 = *r1;
//printf("%f\n", r0);
}
__global__ void set_kernel_var(float *alpha, float *beta, float *alpham1)
{
*alpha = 1.0;
*alpham1 = -1.0;
*beta = 0.0;
printf("%f\n", alpha);
}
//loat dev_a, dev_b, dev_na, dev_r0, dev_r1;
int main(int argc, char **argv)
{
int cpu_nz = 0, *cpu_I = NULL, *cpu_J = NULL;
int N = 0, *dev_N, *dev_nz, *dev_I = NULL, *dev_J = NULL; // = 0,
float *cpu_val = NULL;
float *dev_val = NULL;
const float cpu_tol = 1e-5f;
//const float dev_tol = 1e-5f;
float cpu_r1;
float *dev_a, *dev_b, *dev_na, *dev_r0, *dev_r1;
const int cpu_max_iter = 10000;
float *cpu_x;
float *dev_x;
float *cpu_rhs;
float *dev_rhs;
//./float dev_r1;
float *dev_dot;
float *cpu_r, *cpu_p, *cpu_Ax;
float *dev_r, *dev_p, *dev_Ax;
int cpu_k;
float *dev_alpha, *dev_beta, *dev_alpham1;
printf("Starting [%s]...\n", sSDKname);
// This will pick the best possible CUDA capable device
cudaDeviceProp deviceProp;
int devID = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
// Statistics about the GPU device
printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);
/* Generate a random tridiagonal symmetric matrix in CSR format */
N = 1048576;
cpu_nz = (N-2)*3 + 4;
cpu_I = (int *)malloc(sizeof(int)*(N+1)); // csr row pointers for matrix A
cpu_J = (int *)malloc(sizeof(int)*cpu_nz); // csr column indices for matrix A
cpu_val = (float *)malloc(sizeof(float)*cpu_nz); // csr values for matrix A
genTridiag(cpu_I, cpu_J, cpu_val, N, cpu_nz);
cpu_x = (float *)malloc(sizeof(float)*N);
cpu_rhs = (float *)malloc(sizeof(float)*N);
for (int i = 0; i < N; i++)
{
cpu_rhs[i] = 1.0;
cpu_x[i] = 0.0;
}
/* Get handle to the CUBLAS context */
cublasHandle_t cublasHandle = 0;
cublasStatus_t cublasStatus;
cublasStatus = cublasCreate(&cublasHandle);
checkCudaErrors(cublasStatus);
/* Get handle to the CUSPARSE context */
cusparseHandle_t cusparseHandle = 0;
cusparseStatus_t cusparseStatus;
cusparseStatus = cusparseCreate(&cusparseHandle);
checkCudaErrors(cusparseStatus);
cusparseMatDescr_t descr = 0;
cusparseStatus = cusparseCreateMatDescr(&descr);
checkCudaErrors(cusparseStatus);
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
// temp memory for CG
cpu_r = (float*)malloc(N*sizeof(float)); //
cpu_p = (float*)malloc(N*sizeof(float)); //
cpu_Ax = (float*)malloc(N*sizeof(float)); //
for (int i=0; i < N; i++)
{
cpu_r[i] = cpu_rhs[i];
}
cudaMalloc((void **)&dev_r, N*sizeof(float));
cudaMalloc((void **)&dev_p, N*sizeof(float));
cudaMalloc((void **)&dev_Ax, N*sizeof(float));
cudaMalloc((void **)&dev_I, sizeof(int)*(N+1));
cudaMalloc((void **)&dev_J, sizeof(int)*cpu_nz);
cudaMalloc((void **)&dev_val, sizeof(float)*cpu_nz);
cudaMalloc((void **)&dev_x, sizeof(float)*N);
cudaMalloc((void **)&dev_rhs, sizeof(float)*N);
cudaMalloc((void **)&dev_a, sizeof(float));
cudaMalloc((void **)&dev_b, sizeof(float));
cudaMalloc((void **)&dev_r1, sizeof(float));
cudaMalloc((void **)&dev_r0, sizeof(float));
cudaMalloc((void **)&dev_na, sizeof(float));
cudaMalloc((void **)&dev_nz, sizeof(int));
cudaMalloc((void **)&dev_N, sizeof(int));
cudaMemcpy(dev_I, cpu_I, sizeof(int)*(N+1), cudaMemcpyHostToDevice);
cudaMemcpy(dev_J, cpu_J, sizeof(int)*cpu_nz, cudaMemcpyHostToDevice);
cudaMemcpy(dev_val, cpu_val, sizeof(float)*cpu_nz, cudaMemcpyHostToDevice);
cudaMemcpy(dev_x, cpu_x, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_rhs, cpu_rhs, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_r, cpu_r, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_Ax, cpu_Ax, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_p, cpu_p, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMalloc((void**)&dev_alpha, sizeof(float));
cudaMalloc((void**)&dev_alpham1, sizeof(float));
cudaMalloc((void**)&dev_beta, sizeof(float));
set_kernel_var<<<1, 1>>>(dev_alpha, dev_beta, dev_alpham1);
cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_DEVICE);
cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, cpu_nz,
dev_alpha, descr, dev_val, dev_I, dev_J, dev_x, dev_beta, dev_Ax);
cublasSaxpy(cublasHandle, N, dev_alpham1, dev_Ax, 1, dev_r, 1);
cublasStatus = cublasSdot(cublasHandle, N, dev_r, 1, dev_r, 1, dev_r1);
cudaDeviceSynchronize();
cudaMemcpy(&cpu_r1, dev_r1, sizeof(float), cudaMemcpyDeviceToHost);
cpu_k = 1;
printf("%i\n", cpu_r1);
while (cpu_r1 > cpu_tol*cpu_tol && cpu_k <= cpu_max_iter)
{
if (cpu_k > 1)
{
set_b<<<1, 3>>>(dev_b, dev_r1, dev_r0);
cublasStatus = cublasSscal(cublasHandle, N, dev_b, dev_p, 1);
cublasStatus = cublasSaxpy(cublasHandle, N, dev_alpha, dev_r, 1, dev_p, 1);
}
else
{
cublasStatus = cublasScopy(cublasHandle, N, dev_r, 1, dev_p, 1);
}
cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N,
cpu_nz, dev_alpha, descr, dev_val, dev_I, dev_J, dev_p, dev_beta, dev_Ax);
cublasStatus = cublasSdot(cublasHandle, N, dev_p, 1, dev_Ax, 1, dev_dot);
set_a<<<1, 1>>>(dev_a, dev_r1, dev_dot);
cublasStatus = cublasSaxpy(cublasHandle, N, dev_a, dev_p, 1, dev_x, 1);
set_na<<<1, 1>>>(dev_na, dev_a);
cublasStatus = cublasSaxpy(cublasHandle, N, dev_na, dev_Ax, 1, dev_r, 1);
set_r0<<<1,1>>>(dev_r0, dev_r1);
cublasStatus = cublasSdot(cublasHandle, N, dev_r, 1, dev_r, 1, dev_r1);
cudaMemcpy(&cpu_r1, dev_r1, sizeof(float), cudaMemcpyDeviceToHost);
printf("iteration = %3d, residual = %e\n", cpu_k, sqrt(cpu_r1));
cpu_k++;//
}
printf("Final residual: %e\n",sqrt(cpu_r1));
fprintf(stdout,"&&&& uvm_cg test %s\n", (sqrt(cpu_r1) < cpu_tol) ? "PASSED" : "FAILED");
cudaMemcpy(cpu_I, dev_I, sizeof(int)*(N+1), cudaMemcpyDeviceToHost);
cudaMemcpy(cpu_J, dev_J, sizeof(int)*cpu_nz, cudaMemcpyDeviceToHost);
cudaMemcpy(cpu_val, dev_val, sizeof(float)*cpu_nz, cudaMemcpyDeviceToHost);
float rsum, diff, err = 0.0;
for (int i = 0; i < N; i++)
{
rsum = 0.0;
for (int j = cpu_I[i]; j < cpu_I[i+1]; j++)
{
rsum += cpu_val[j]*cpu_x[cpu_J[j]];
}
diff = fabs(rsum - cpu_rhs[i]);
if (diff > err)
{
err = diff;
}
}
cusparseDestroy(cusparseHandle);
cublasDestroy(cublasHandle);
cudaFree(dev_I);
cudaFree(dev_J);
cudaFree(dev_val);
cudaFree(dev_x);
cudaFree(dev_r);
cudaFree(dev_p);
cudaFree(dev_Ax);
cudaFree(dev_rhs);
delete [] cpu_I;
delete [] cpu_J;
delete [] cpu_val;
delete [] cpu_x;
delete [] cpu_r;
delete [] cpu_p;
delete [] cpu_Ax;
delete [] cpu_rhs;
cudaDeviceReset();
printf("Test Summary: Error amount = %f, result = %s\n", err,
(cpu_k <= cpu_max_iter) ? "SUCCESS" : "FAILURE");
exit((cpu_k <= cpu_max_iter) ? EXIT_SUCCESS : EXIT_FAILURE);
}