Max Schneider
2015-02-20 10:51:39 UTC
Hey guys,
I'm doing some experiments with reduction kernels and comparing the resulting
runtimes with the runtime of cudppReduce.
However, I've encountered an interesting problem which I couldn't solve by
myself. Maybe one of you has a solution for me.
The problem is, that in case of cudppReduce running on a tesla C2075 i get
independent of elements data type and number
of elements (> 1000000) same runtime: approx.: 0.0001 seconds. When i run
the same binary on a geforce GTX 480 which
is installed on the same system and in the same time used for display
management, the resulting runtimes differ depending
on data type and number of elements to be processed, as expected. I've
executed the same experiments also on a laptop
equipped with a geforce gtx 460M and got same behavior as in case of the
tesla C2075 card. My own kernels behave as expected,
by increasing the number of elements the resulting runtime increases as
well. I have looked on the code of cudppReduce function
and it looks same as my own kernel, so the difference in runtime behavior
comes from somewhere else. Maybe some optimizations
by the compiler or deeper in cudpp library i don't know. I hope you can
help me with this problem.
Here is the code of my cudpp implementation:
// Return the current wall-clock time in seconds (microsecond resolution),
// suitable for simple interval timing on the host.
double tstamp()
{
    timeval tv;
    gettimeofday(&tv, NULL);
    return static_cast<double>(tv.tv_sec) + static_cast<double>(tv.tv_usec) * 1e-6;
}
void ReduceDataUsingCUDPP(double* pSource_p,double* pTarget_p,double
InitValue_p,int ElemCount_p)
{
CUDPPConfiguration config;
CUDPPHandle cudppLibHandle = 0;
CUDPPHandle reducePlan = 0;
cudppCreate(&cudppLibHandle);
config.algorithm = CUDPP_REDUCE;
config.datatype = CUDPP_DOUBLE;
config.op = CUDPP_ADD;
config.options = CUDPP_OPTION_FORWARD;
if(cudppPlan(cudppLibHandle,&reducePlan,config,ElemCount_p,1,0) !=
CUDPP_SUCCESS)
{
std::cout << "ERROR IN BUILDING A REDUCE PLAN";
exit(1);
}
if(cudppReduce(reducePlan,pTarget_p,pSource_p,ElemCount_p) !=
CUDPP_SUCCESS)
{
std::cout << "CUDPP REDUCTION FAILED";
exit(1);
}
cudppDestroy(reducePlan);
cudppDestroy(cudppLibHandle);
}
template < typename DATA_T >
void ComputeReductionUsingCUDPP(int ElemCount_p)
{
std::vector< DATA_T > hostSource(ElemCount_p,0);
double t0,t1;
float locCPUTime,globCPUTime;
float locGPUTime,globGPUTime;
DATA_T* pDevSource;
DATA_T* pDevTarget;
DATA_T hostResult = DATA_T(0);
DATA_T devResult;
int correctResults = 0;
globCPUTime = 0.0;
globGPUTime = 0.0;
cudaMalloc((void**)&pDevSource,sizeof(DATA_T) * ElemCount_p);
cudaMalloc((void**)&pDevTarget,sizeof(DATA_T));
for(int i = 0; i < TEST_RUN_COUNT; ++i)
{
InitVector(hostSource,false);
cudaMemcpy(pDevSource,&hostSource[0],sizeof(DATA_T) *
ElemCount_p,cudaMemcpyHostToDevice);
cudaMemset(pDevTarget,0,sizeof(DATA_T));
t0 = tstamp();
ReduceDataUsingCUDPP(pDevSource,pDevTarget,DATA_T(0),ElemCount_p);
t1 = tstamp();
locGPUTime = t1 - t0;
t0 = tstamp();
hostResult = CPUComputeReduction< DATA_T,ADD >(hostSource);
t1 = tstamp();
locCPUTime = (t1 - t0);
cudaMemcpy(&devResult,pDevTarget,sizeof(DATA_T),cudaMemcpyDeviceToHost);
if(hostResult == devResult)
++correctResults;
globCPUTime += locCPUTime;
globGPUTime += locGPUTime;
}
std::cout << "-------------------------- CUDPP RESULTS
----------------------------- " << std::endl;
std::cout << correctResults << " OF " << TEST_RUN_COUNT << std::endl;
if(correctResults == TEST_RUN_COUNT) std::cout << "ALL RESULTS ARE
CORRECT" << std::endl;
std::cout << "HOST TIME: " << ((globCPUTime) / TEST_RUN_COUNT) << "
SEC" << std::endl;
std::cout << "DEVICE TIME: " << ((globGPUTime) / TEST_RUN_COUNT) << "
SEC" << std::endl;
std::cout << "-------------------------- CUDPP RESULTS
----------------------------- " << std::endl;
std::cout << std::endl << std::endl;
cudaFree(pDevSource);
}
// Entry point: reads a data-type selector and an element count from argv
// (or interactively) and runs the CUDPP reduction benchmark.
int main(int argc,char* argv[])
{
int type,elemCount;
cudaSetDevice(0);
if(argc > 2)
{
type = atoi(argv[1]);
elemCount = atoi(argv[2]);
}
else
{
std::cout << "TYPE: ";
std::cin >> type;
std::cout << "ELEMENT COUNT: ";
std::cin >> elemCount;
}
// BUG FIX: reject non-positive counts before allocating device buffers
// (atoi returns 0 on bad input, which would make cudaMalloc size 0).
if(elemCount <= 0)
{
std::cout << "ELEMENT COUNT MUST BE POSITIVE" << std::endl;
return 1;
}
// NOTE: 'type' is read but currently ignored -- the benchmark is hard-wired
// to double (ReduceDataUsingCUDPP configures CUDPP_DOUBLE).
(void)type;
ComputeReductionUsingCUDPP< double >(elemCount);
return 0;
}
I'm doing some experiments with reduction kernels and comparing the resulting
runtimes with the runtime of cudppReduce.
However, I've encountered an interesting problem which I couldn't solve by
myself. Maybe one of you has a solution for me.
The problem is, that in case of cudppReduce running on a tesla C2075 i get
independent of elements data type and number
of elements (> 1000000) same runtime: approx.: 0.0001 seconds. When i run
the same binary on a geforce GTX 480 which
is installed on the same system and in the same time used for display
management, the resulting runtimes differ depending
on data type and number of elements to be processed, as expected. I've
executed the same experiments also on a laptop
equipped with a geforce gtx 460M and got same behavior as in case of the
tesla C2075 card. My own kernels behave as expected,
by increasing the number of elements the resulting runtime increases as
well. I have looked on the code of cudppReduce function
and it looks same as my own kernel, so the difference in runtime behavior
comes from somewhere else. Maybe some optimizations
by the compiler or deeper in cudpp library i don't know. I hope you can
help me with this problem.
Here is the code of my cudpp implementation:
// Return the current wall-clock time in seconds (microsecond resolution),
// suitable for simple interval timing on the host.
double tstamp()
{
    timeval tv;
    gettimeofday(&tv, NULL);
    return static_cast<double>(tv.tv_sec) + static_cast<double>(tv.tv_usec) * 1e-6;
}
void ReduceDataUsingCUDPP(double* pSource_p,double* pTarget_p,double
InitValue_p,int ElemCount_p)
{
CUDPPConfiguration config;
CUDPPHandle cudppLibHandle = 0;
CUDPPHandle reducePlan = 0;
cudppCreate(&cudppLibHandle);
config.algorithm = CUDPP_REDUCE;
config.datatype = CUDPP_DOUBLE;
config.op = CUDPP_ADD;
config.options = CUDPP_OPTION_FORWARD;
if(cudppPlan(cudppLibHandle,&reducePlan,config,ElemCount_p,1,0) !=
CUDPP_SUCCESS)
{
std::cout << "ERROR IN BUILDING A REDUCE PLAN";
exit(1);
}
if(cudppReduce(reducePlan,pTarget_p,pSource_p,ElemCount_p) !=
CUDPP_SUCCESS)
{
std::cout << "CUDPP REDUCTION FAILED";
exit(1);
}
cudppDestroy(reducePlan);
cudppDestroy(cudppLibHandle);
}
template < typename DATA_T >
void ComputeReductionUsingCUDPP(int ElemCount_p)
{
std::vector< DATA_T > hostSource(ElemCount_p,0);
double t0,t1;
float locCPUTime,globCPUTime;
float locGPUTime,globGPUTime;
DATA_T* pDevSource;
DATA_T* pDevTarget;
DATA_T hostResult = DATA_T(0);
DATA_T devResult;
int correctResults = 0;
globCPUTime = 0.0;
globGPUTime = 0.0;
cudaMalloc((void**)&pDevSource,sizeof(DATA_T) * ElemCount_p);
cudaMalloc((void**)&pDevTarget,sizeof(DATA_T));
for(int i = 0; i < TEST_RUN_COUNT; ++i)
{
InitVector(hostSource,false);
cudaMemcpy(pDevSource,&hostSource[0],sizeof(DATA_T) *
ElemCount_p,cudaMemcpyHostToDevice);
cudaMemset(pDevTarget,0,sizeof(DATA_T));
t0 = tstamp();
ReduceDataUsingCUDPP(pDevSource,pDevTarget,DATA_T(0),ElemCount_p);
t1 = tstamp();
locGPUTime = t1 - t0;
t0 = tstamp();
hostResult = CPUComputeReduction< DATA_T,ADD >(hostSource);
t1 = tstamp();
locCPUTime = (t1 - t0);
cudaMemcpy(&devResult,pDevTarget,sizeof(DATA_T),cudaMemcpyDeviceToHost);
if(hostResult == devResult)
++correctResults;
globCPUTime += locCPUTime;
globGPUTime += locGPUTime;
}
std::cout << "-------------------------- CUDPP RESULTS
----------------------------- " << std::endl;
std::cout << correctResults << " OF " << TEST_RUN_COUNT << std::endl;
if(correctResults == TEST_RUN_COUNT) std::cout << "ALL RESULTS ARE
CORRECT" << std::endl;
std::cout << "HOST TIME: " << ((globCPUTime) / TEST_RUN_COUNT) << "
SEC" << std::endl;
std::cout << "DEVICE TIME: " << ((globGPUTime) / TEST_RUN_COUNT) << "
SEC" << std::endl;
std::cout << "-------------------------- CUDPP RESULTS
----------------------------- " << std::endl;
std::cout << std::endl << std::endl;
cudaFree(pDevSource);
}
// Entry point: reads a data-type selector and an element count from argv
// (or interactively) and runs the CUDPP reduction benchmark.
int main(int argc,char* argv[])
{
int type,elemCount;
cudaSetDevice(0);
if(argc > 2)
{
type = atoi(argv[1]);
elemCount = atoi(argv[2]);
}
else
{
std::cout << "TYPE: ";
std::cin >> type;
std::cout << "ELEMENT COUNT: ";
std::cin >> elemCount;
}
// BUG FIX: reject non-positive counts before allocating device buffers
// (atoi returns 0 on bad input, which would make cudaMalloc size 0).
if(elemCount <= 0)
{
std::cout << "ELEMENT COUNT MUST BE POSITIVE" << std::endl;
return 1;
}
// NOTE: 'type' is read but currently ignored -- the benchmark is hard-wired
// to double (ReduceDataUsingCUDPP configures CUDPP_DOUBLE).
(void)type;
ComputeReductionUsingCUDPP< double >(elemCount);
return 0;
}
--
You received this message because you are subscribed to the Google Groups "CUDPP" group.
To unsubscribe from this group and stop receiving emails from it, send an email to cudpp+***@googlegroups.com.
To post to this group, send email to ***@googlegroups.com.
Visit this group at http://groups.google.com/group/cudpp.
For more options, visit https://groups.google.com/d/optout.
You received this message because you are subscribed to the Google Groups "CUDPP" group.
To unsubscribe from this group and stop receiving emails from it, send an email to cudpp+***@googlegroups.com.
To post to this group, send email to ***@googlegroups.com.
Visit this group at http://groups.google.com/group/cudpp.
For more options, visit https://groups.google.com/d/optout.