Discussion:
[CUDPP][1113] strange runtime behavior of cudppReduce
Max Schneider
2015-02-20 10:51:39 UTC
Hey guys,

I'm doing some experiments with reduction kernels and comparing their runtimes against cudppReduce. However, I've run into a problem I couldn't solve on my own; maybe one of you has an explanation.

When cudppReduce runs on a Tesla C2075, I get the same runtime of roughly 0.0001 seconds regardless of the element data type and the number of elements (> 1,000,000). When I run the same binary on a GeForce GTX 480 that is installed in the same system (and is also used for display management), the runtimes vary with the data type and the number of elements, as expected. I've repeated the experiments on a laptop with a GeForce GTX 460M and got the same behavior as on the Tesla C2075. My own kernels behave as expected: as the number of elements grows, the runtime grows as well. I've looked at the code of the cudppReduce function and it looks essentially the same as my own kernel, so the difference in runtime behavior must come from somewhere else, perhaps from compiler optimizations or something deeper inside the CUDPP library. I hope you can help me with this problem.

Here is the code of my CUDPP-based test:



#include <iostream>
#include <vector>
#include <cstdlib>
#include <sys/time.h>

#include <cuda_runtime.h>
#include "cudpp.h"

// Wall-clock timestamp in seconds (CPU-side).
double tstamp()
{
    timeval time;
    gettimeofday(&time, NULL);
    return double(time.tv_sec) + double(time.tv_usec) * 1e-6;
}

// Sum-reduce ElemCount_p doubles from pSource_p into pTarget_p using CUDPP.
void ReduceDataUsingCUDPP(double* pSource_p, double* pTarget_p,
                          double InitValue_p, int ElemCount_p)
{
    CUDPPConfiguration config;

    CUDPPHandle cudppLibHandle = 0;
    CUDPPHandle reducePlan = 0;

    cudppCreate(&cudppLibHandle);

    config.algorithm = CUDPP_REDUCE;
    config.datatype = CUDPP_DOUBLE;
    config.op = CUDPP_ADD;
    config.options = CUDPP_OPTION_FORWARD;

    if (cudppPlan(cudppLibHandle, &reducePlan, config, ElemCount_p, 1, 0) != CUDPP_SUCCESS)
    {
        std::cout << "ERROR IN BUILDING A REDUCE PLAN";
        exit(1);
    }

    if (cudppReduce(reducePlan, pTarget_p, pSource_p, ElemCount_p) != CUDPP_SUCCESS)
    {
        std::cout << "CUDPP REDUCTION FAILED";
        exit(1);
    }

    cudppDestroyPlan(reducePlan);
    cudppDestroy(cudppLibHandle);
}

// TEST_RUN_COUNT, InitVector() and CPUComputeReduction() are defined
// elsewhere in my test code.
template <typename DATA_T>
void ComputeReductionUsingCUDPP(int ElemCount_p)
{
    std::vector<DATA_T> hostSource(ElemCount_p, 0);

    double t0, t1;

    float locCPUTime, globCPUTime;
    float locGPUTime, globGPUTime;

    DATA_T* pDevSource;
    DATA_T* pDevTarget;

    DATA_T hostResult = DATA_T(0);
    DATA_T devResult;

    int correctResults = 0;

    globCPUTime = 0.0;
    globGPUTime = 0.0;

    cudaMalloc((void**)&pDevSource, sizeof(DATA_T) * ElemCount_p);
    cudaMalloc((void**)&pDevTarget, sizeof(DATA_T));

    for (int i = 0; i < TEST_RUN_COUNT; ++i)
    {
        InitVector(hostSource, false);

        cudaMemcpy(pDevSource, &hostSource[0], sizeof(DATA_T) * ElemCount_p,
                   cudaMemcpyHostToDevice);
        cudaMemset(pDevTarget, 0, sizeof(DATA_T));

        // Time the CUDPP reduction (CPU-side timestamps).
        t0 = tstamp();
        ReduceDataUsingCUDPP(pDevSource, pDevTarget, DATA_T(0), ElemCount_p);
        t1 = tstamp();
        locGPUTime = t1 - t0;

        // Time the CPU reference reduction.
        t0 = tstamp();
        hostResult = CPUComputeReduction<DATA_T, ADD>(hostSource);
        t1 = tstamp();
        locCPUTime = t1 - t0;

        cudaMemcpy(&devResult, pDevTarget, sizeof(DATA_T), cudaMemcpyDeviceToHost);

        if (hostResult == devResult)
            ++correctResults;

        globCPUTime += locCPUTime;
        globGPUTime += locGPUTime;
    }

    std::cout << "-------------------------- CUDPP RESULTS ----------------------------- " << std::endl;

    std::cout << correctResults << " OF " << TEST_RUN_COUNT << std::endl;

    if (correctResults == TEST_RUN_COUNT)
        std::cout << "ALL RESULTS ARE CORRECT" << std::endl;

    std::cout << "HOST TIME:   " << (globCPUTime / TEST_RUN_COUNT) << " SEC" << std::endl;
    std::cout << "DEVICE TIME: " << (globGPUTime / TEST_RUN_COUNT) << " SEC" << std::endl;

    std::cout << "-------------------------- CUDPP RESULTS ----------------------------- " << std::endl;
    std::cout << std::endl << std::endl;

    cudaFree(pDevSource);
    cudaFree(pDevTarget);
}

int main(int argc, char* argv[])
{
    int type, elemCount;

    cudaSetDevice(0);

    if (argc > 2)
    {
        type = atoi(argv[1]);
        elemCount = atoi(argv[2]);
    }
    else
    {
        std::cout << "TYPE: ";
        std::cin >> type;

        std::cout << "ELEMENT COUNT: ";
        std::cin >> elemCount;
    }

    ComputeReductionUsingCUDPP<double>(elemCount);

    return 0;
}
--
You received this message because you are subscribed to the Google Groups "CUDPP" group.
To unsubscribe from this group and stop receiving emails from it, send an email to cudpp+***@googlegroups.com.
To post to this group, send email to ***@googlegroups.com.
Visit this group at http://groups.google.com/group/cudpp.
For more options, visit https://groups.google.com/d/optout.
John Owens
2015-02-20 17:02:16 UTC
Hi Max,

gettimeofday() is probably not the call you want to be making to get
reliable CUDA timings; it's pretty CPU-centric. In CUDPP we have a
"stopwatch" class that uses GPU performance counters; you might try
that? I believe CUDA events are another way to get more reliable
timings.
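
For example, an untested sketch of event-based timing around the reduce call, reusing the variable names from your test loop, might look like this:

// Untested sketch: time the CUDPP reduce with CUDA events instead of gettimeofday().
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
ReduceDataUsingCUDPP(pDevSource, pDevTarget, DATA_T(0), ElemCount_p);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);          // wait until the reduce has actually finished

float elapsedMs = 0.0f;
cudaEventElapsedTime(&elapsedMs, start, stop);   // milliseconds
locGPUTime = elapsedMs * 1e-3f;                  // seconds, to match the existing bookkeeping

cudaEventDestroy(start);
cudaEventDestroy(stop);

The cudaEventSynchronize() call makes sure the GPU work has actually finished before the elapsed time is read; a CPU-side timestamp taken right after an asynchronous launch does not give that guarantee.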

JDO
Max Schneider
2015-02-25 09:27:39 UTC
Hi JDO,

I looked into the code of the stopwatch class, and as far as I can see it does not use GPU performance counters on Unix-based systems either; it falls back to gettimeofday(). On Windows it uses QueryPerformanceCounter, but that is also a CPU-side counter. Maybe I looked at the wrong class, but in the CUDPP 2.2 repo this is the only one I found.
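
If I stick with a CPU-side timer like gettimeofday(), I assume I would at least have to synchronize the device around the timed call; a rough, untested sketch using my tstamp() helper would be:

cudaDeviceSynchronize();   // make sure no earlier GPU work is still pending
t0 = tstamp();

ReduceDataUsingCUDPP(pDevSource, pDevTarget, DATA_T(0), ElemCount_p);

cudaDeviceSynchronize();   // wait for the reduce to finish before stopping the clock
t1 = tstamp();

locGPUTime = t1 - t0;

Without the second synchronize, the timestamp can be taken while the GPU is still working, since kernel launches return asynchronously.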

max