GPU CPU向量加法时间测试

2022-12-11 23:03:12 来源：博客园

GPU CPU向量加法时间测试

实验设备

系统：WSL Ubuntu18.04

(资料图)

实验思路

分别在GPU，CPU上测试两个一维向量的加法，CPU是一个个的串行计算相加，GPU可以通过并行的方式将对应位置的元素相加。

实验结果

但是实验结果分为两种，主要是由于不同的计时器导致的一些差异：

计时方式1：

计时方式2：

解释一些关键词：我们要测试的时间分为四部分——内存数据拷贝到GPU的时间、GPU计算时间、结果从显存拷贝回内存的时间，以及CPU计算时间。

但是使用不同的计时器，会产生一些差异：一个是traditional timer，还有一个是tutorial timer。

这个教程里的意思是，给并行程序计时不能用传统的CPU计时方式traditional timer：

clock_t start, finish;
start = clock();
// 要测试的部分
finish = clock();
duration = (double)(finish - start) / CLOCKS_PER_SEC;

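而是改用如下的代码tutorial timer（即 freshman.h 中基于 gettimeofday 的 cpuSecond()）：

double iStart, iElaps;
iStart = cpuSecond();
// 要测试的部分
iElaps = cpuSecond() - iStart;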

所以GPU计算部分，使用作者提供的代码；而其它需要计时的部分，分别使用traditional timer和tutorial timer：

所以接下来的结果分为两种情况：

计时方式1：sum_tutorial_timer.cu

向量a，b的维度	262144\((512 \times 512)\)	1048576\((1024 \times 1024)\)	4194304\((2048 \times 2048)\)
实验结果截图
内存数据拷贝到GPU时间消耗	0.000754 sec	0.001460 sec	0.004343 sec
GPU计算时间	0.000021 sec	0.000026 sec	0.000028 sec
结果从显存拷贝到内存时间消耗	0.000263 sec	0.000954 sec	0.002438 sec
显存计算总时间（上述相加）	0.001038 sec	0.002440 sec	0.006809 sec
CPU 计算时间	0.000372 sec	0.001884 sec	0.006497 sec

分析：

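对比GPU和CPU的计算时间，随着数据维度增大，GPU计算时间没有明显的增大，CPU计算时间逐渐增大。需要注意：核函数的启动是异步的，如果在停止计时之前没有调用 cudaDeviceSynchronize() 等待核函数执行完毕，测到的只是启动核函数的开销，而不是真正的GPU计算时间；上表中GPU计算时间几乎不随数据规模变化，很可能就属于这种情况。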
随着数据维度增大，数据在内存到显存双向拷贝时间逐渐增大。
显存计算总时间大于CPU计算时间，并没有使得向量加法的运行效率提高，这还有待研究。

计时方式2：sum_traditional_timer.cu

向量a，b的维度	262144\((512 \times 512)\)	1048576\((1024 \times 1024)\)	4194304\((2048 \times 2048)\)
实验结果截图
内存数据拷贝到GPU时间消耗	0.000392 sec	0.000826 sec	0.003920 sec
GPU计算时间	0.000030 sec	0.000022 sec	0.000031 sec
结果从显存拷贝到内存时间消耗	0.000414 sec	0.000590 sec	0.002008 sec
显存计算总时间（上述相加）	0.000836 sec	0.001438 sec	0.005959 sec
CPU 计算时间	0.000380 sec	0.001432 sec	0.005958 sec

使用traditional timer之后，可以发现数据拷贝的时间明显变短了。一个可能的原因是：clock() 统计的是进程占用的CPU时间，而不是墙上时钟时间，CPU在等待数据传输时处于空闲的那部分时间不一定会被计入。

实验代码

`sum_tutorial_timer.cu`

// sum_tutorial_timer.cu
//
// Times a 1-D float vector addition on the GPU and on the CPU. Every timed
// section uses cpuSecond() from freshman.h (the "tutorial timer").
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "freshman.h"

// CPU addition: res[i] = a[i] + b[i] for every i in [0, size), computed
// sequentially on the host.
void sumArrays(float *a, float *b, float *res, const int size)
{
  for (int i = 0; i < size; i += 1)
  {
    res[i] = a[i] + b[i];
  }
}

// GPU addition: each thread handles the single element whose index is its
// flat 1-D global thread index.
// Launch layout: 1-D grid of 1-D blocks covering at least N threads; threads
// whose index is >= N do nothing, so N need not be a multiple of the block
// size. No shared memory is used.
__global__ void sumArraysGPU(float *a, float *b, float *res, int N)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < N)
    res[i] = a[i] + b[i];
}

// Runs the experiment: allocates and initialises the host vectors, copies
// them to the device, adds them on the GPU and on the CPU, prints the elapsed
// time of each phase and compares the two results.
// Returns 0 on success and 1 if a host allocation fails; CUDA errors
// terminate the process through CHECK.
int main(int argc, char **argv)
{
  // set up device
  initDevice(0);

  int nElem = 512 * 512;
  // int nElem = 1024*1024;
  // int nElem = 2048*2048;
  printf("Vector size:%d\n", nElem);

  // Allocate host memory.
  size_t nByte = sizeof(float) * (size_t)nElem;
  float *a_h = (float *)malloc(nByte);
  float *b_h = (float *)malloc(nByte);
  float *res_h = (float *)malloc(nByte);
  float *res_from_gpu_h = (float *)malloc(nByte);
  if (a_h == NULL || b_h == NULL || res_h == NULL || res_from_gpu_h == NULL)
  {
    fprintf(stderr, "host malloc of %zu bytes failed\n", nByte);
    free(a_h);
    free(b_h);
    free(res_h);
    free(res_from_gpu_h);
    return 1;
  }
  memset(res_h, 0, nByte);
  memset(res_from_gpu_h, 0, nByte);

  // Fill the host inputs with random data.
  initialData(a_h, nElem);
  initialData(b_h, nElem);

  // Allocate device memory.
  float *a_d, *b_d, *res_d;
  CHECK(cudaMalloc((float **)&a_d, nByte));
  CHECK(cudaMalloc((float **)&b_d, nByte));
  CHECK(cudaMalloc((float **)&res_d, nByte));

  // Host -> device copy.
  double iStart, iElaps;
  iStart = cpuSecond();
  CHECK(cudaMemcpy(a_d, a_h, nByte, cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(b_d, b_h, nByte, cudaMemcpyHostToDevice));
  iElaps = cpuSecond() - iStart;
  printf("内存数据拷贝到GPU时间消耗\t%f sec\n", iElaps);

  dim3 block(512);
  dim3 grid((nElem - 1) / block.x + 1); // ceil(nElem / block.x)

  // GPU addition.
  iStart = cpuSecond();
  sumArraysGPU<<<grid, block>>>(a_d, b_d, res_d, nElem);
  // A launch reports configuration errors only through cudaGetLastError().
  CHECK(cudaGetLastError());
  // The launch returns before the kernel has run. Wait for it here so the
  // timer covers the kernel's execution instead of only the launch overhead,
  // and so that time is not attributed to the device -> host copy below.
  CHECK(cudaDeviceSynchronize());
  iElaps = cpuSecond() - iStart;
  printf("GPU计算时间 \t\t\t\t %f sec\n", iElaps);

  // Device -> host copy.
  iStart = cpuSecond();
  CHECK(cudaMemcpy(res_from_gpu_h, res_d, nByte, cudaMemcpyDeviceToHost));
  iElaps = cpuSecond() - iStart;
  printf("结果从显存拷贝到内存时间消耗   %f sec\n", iElaps);

  // CPU addition.
  iStart = cpuSecond();
  sumArrays(a_h, b_h, res_h, nElem);
  iElaps = cpuSecond() - iStart;
  printf("CPU 计算时间\t\t\t\t %f sec\n", iElaps);

  checkResult(res_h, res_from_gpu_h, nElem);

  CHECK(cudaFree(a_d));
  CHECK(cudaFree(b_d));
  CHECK(cudaFree(res_d));

  free(a_h);
  free(b_h);
  free(res_h);
  free(res_from_gpu_h);

  return 0;
}

`sum_traditional_timer.cu`

// sum_traditional_timer.cu
//
// Same experiment as sum_tutorial_timer.cu, but the two memory copies and the
// CPU addition are timed with clock() (the "traditional timer"). The GPU
// kernel is still timed with cpuSecond() from freshman.h.
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "freshman.h"

// CPU addition: res[i] = a[i] + b[i] for every i in [0, size), computed
// sequentially on the host.
void sumArrays(float *a, float *b, float *res, const int size)
{
  for (int i = 0; i < size; i += 1)
  {
    res[i] = a[i] + b[i];
  }
}

// GPU addition: each thread handles the single element whose index is its
// flat 1-D global thread index.
// Launch layout: 1-D grid of 1-D blocks covering at least N threads; threads
// whose index is >= N do nothing, so N need not be a multiple of the block
// size. No shared memory is used.
__global__ void sumArraysGPU(float *a, float *b, float *res, int N)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < N)
    res[i] = a[i] + b[i];
}

// Runs the experiment: allocates and initialises the host vectors, copies
// them to the device, adds them on the GPU and on the CPU, prints the elapsed
// time of each phase and compares the two results.
// Returns 0 on success and 1 if a host allocation fails; CUDA errors
// terminate the process through CHECK.
int main(int argc, char **argv)
{
  // set up device
  initDevice(0);

  int nElem = 512 * 512;
  // int nElem = 1024*1024;
  // int nElem = 2048*2048;
  printf("Vector size:%d\n", nElem);

  // Allocate host memory.
  size_t nByte = sizeof(float) * (size_t)nElem;
  float *a_h = (float *)malloc(nByte);
  float *b_h = (float *)malloc(nByte);
  float *res_h = (float *)malloc(nByte);
  float *res_from_gpu_h = (float *)malloc(nByte);
  if (a_h == NULL || b_h == NULL || res_h == NULL || res_from_gpu_h == NULL)
  {
    fprintf(stderr, "host malloc of %zu bytes failed\n", nByte);
    free(a_h);
    free(b_h);
    free(res_h);
    free(res_from_gpu_h);
    return 1;
  }
  memset(res_h, 0, nByte);
  memset(res_from_gpu_h, 0, nByte);

  // Fill the host inputs with random data.
  initialData(a_h, nElem);
  initialData(b_h, nElem);

  // Allocate device memory.
  float *a_d, *b_d, *res_d;
  CHECK(cudaMalloc((float **)&a_d, nByte));
  CHECK(cudaMalloc((float **)&b_d, nByte));
  CHECK(cudaMalloc((float **)&res_d, nByte));

  // Host -> device copy, timed with clock(). clock() reports processor time
  // used by the program, which is not the same quantity as the wall-clock
  // interval returned by cpuSecond().
  clock_t start, end;
  start = clock();
  CHECK(cudaMemcpy(a_d, a_h, nByte, cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(b_d, b_h, nByte, cudaMemcpyHostToDevice));
  end = clock();
  printf("内存数据拷贝到GPU时间消耗\t %f sec\n", (double)(end - start) / CLOCKS_PER_SEC);

  dim3 block(512);
  dim3 grid((nElem - 1) / block.x + 1); // ceil(nElem / block.x)

  // GPU addition, timed with the wall-clock tutorial timer.
  double iStart, iElaps;
  iStart = cpuSecond();
  sumArraysGPU<<<grid, block>>>(a_d, b_d, res_d, nElem);
  // A launch reports configuration errors only through cudaGetLastError().
  CHECK(cudaGetLastError());
  // The launch returns before the kernel has run. Wait for it here so the
  // timer covers the kernel's execution instead of only the launch overhead,
  // and so that time is not attributed to the device -> host copy below.
  CHECK(cudaDeviceSynchronize());
  iElaps = cpuSecond() - iStart;
  printf("GPU计算时间 \t\t\t\t %f sec\n", iElaps);

  // Device -> host copy.
  start = clock();
  CHECK(cudaMemcpy(res_from_gpu_h, res_d, nByte, cudaMemcpyDeviceToHost));
  end = clock();
  printf("结果从显存拷贝到内存时间消耗\t %f sec\n", (double)(end - start) / CLOCKS_PER_SEC);

  // CPU addition.
  start = clock();
  sumArrays(a_h, b_h, res_h, nElem);
  end = clock();
  printf("CPU 计算时间\t\t\t\t %f sec\n", (double)(end - start) / CLOCKS_PER_SEC);

  checkResult(res_h, res_from_gpu_h, nElem);

  CHECK(cudaFree(a_d));
  CHECK(cudaFree(b_d));
  CHECK(cudaFree(res_d));

  free(a_h);
  free(b_h);
  free(res_h);
  free(res_from_gpu_h);

  return 0;
}

`freshman.h`

// freshman.h
//
// Small host-side helpers shared by the timing programs: a CUDA error-check
// macro, a wall-clock timer, random input generation, device selection and a
// host/GPU result comparison.
//
// NOTE(review): the published listing of this header lost every span of text
// between a '<' and the following '>' (header names, loop bounds, and the
// code between initialData's loop and the tail of checkResult). The parts
// marked "reconstructed" below were rebuilt from how the callers use them and
// must be confirmed against the original tutorial header. Other helpers that
// left no trace in the listing may also have been lost.
#ifndef FRESHMAN_H
#define FRESHMAN_H

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/time.h>
#endif

// Evaluates a CUDA runtime call once; if it did not return cudaSuccess,
// prints the file, line, error code and error string, then exits with
// status 1. Wrapped in do { } while (0) so that `CHECK(x);` is a single
// statement even inside an unbraced if/else.
#define CHECK(call)\
do\
{\
  const cudaError_t error=(call);\
  if(error!=cudaSuccess)\
  {\
      printf("ERROR: %s:%d,",__FILE__,__LINE__);\
      printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\
      exit(1);\
  }\
} while(0)

#ifdef _WIN32
// Windows replacement for gettimeofday(): fills tp from the local system
// time. tv_usec is derived from the millisecond field, so the resolution is
// one millisecond. tzp is ignored. Always returns 0.
static inline int gettimeofday(struct timeval *tp, void *tzp)
{
  time_t clock;
  struct tm tm;
  SYSTEMTIME wtm;
  GetLocalTime(&wtm);
  tm.tm_year   = wtm.wYear - 1900;
  tm.tm_mon    = wtm.wMonth - 1;
  tm.tm_mday   = wtm.wDay;
  tm.tm_hour   = wtm.wHour;
  tm.tm_min    = wtm.wMinute;
  tm.tm_sec    = wtm.wSecond;
  tm.tm_isdst  = -1;
  clock = mktime(&tm);
  tp->tv_sec = clock;
  tp->tv_usec = wtm.wMilliseconds * 1000;
  return (0);
}
#endif

// Returns the current time of day in seconds as a double (seconds plus
// microseconds * 1e-6), as reported by gettimeofday(). Subtract two readings
// to get an elapsed wall-clock interval.
static inline double cpuSecond()
{
  struct timeval tp;
  gettimeofday(&tp,NULL);
  return((double)tp.tv_sec+(double)tp.tv_usec*1e-6);
}

// Fills ip[0..size) with pseudo-random floats, reseeding rand() from the
// current time on every call.
// NOTE(review): the loop bound and body are reconstructed; the value formula
// below is an assumption - confirm against the original header.
static inline void initialData(float* ip,int size)
{
  time_t t;
  srand((unsigned )time(&t));
  for(int i=0;i<size;i++)
  {
    ip[i]=(float)(rand()&0xffff)/1000.0f;
  }
}

// Prints an ny-by-nx matrix stored row by row in C, one row per output line.
// NOTE(review): only the header printf's tail (`:",ny,nx);`) survived in the
// listing; the signature, the "Matrix<%d,%d>" prefix and the loops are
// reconstructed - confirm.
static inline void printMatrix(float * C,const int nx,const int ny)
{
  float *ic=C;
  printf("Matrix<%d,%d>:",ny,nx);
  for(int i=0;i<ny;i++)
  {
    for(int j=0;j<nx;j++)
    {
      printf("%6f ",ic[j]);
    }
    ic+=nx;
    printf("\n");
  }
}

// Selects CUDA device devNum for subsequent runtime calls and prints its
// name; exits through CHECK on failure.
// NOTE(review): entirely reconstructed from the call initDevice(0) in the
// .cu files - confirm.
static inline void initDevice(int devNum)
{
  int dev = devNum;
  cudaDeviceProp deviceProp;
  CHECK(cudaGetDeviceProperties(&deviceProp,dev));
  printf("Using device %d: %s\n",dev,deviceProp.name);
  CHECK(cudaSetDevice(dev));
}

// Compares hostRef[i] with gpuRef[i] for i in [0, N). On the first pair whose
// absolute difference exceeds epsilon, prints the mismatch and returns;
// otherwise prints a success message.
// NOTE(review): the signature, the epsilon value and the comparison are
// reconstructed (only `epsilon)` and the printf calls survived) - confirm.
static inline void checkResult(float * hostRef,float * gpuRef,const int N)
{
  double epsilon=1.0E-8;
  for(int i=0;i<N;i++)
  {
    // fabs, not abs: the integer abs() would truncate the float difference.
    if(fabs(hostRef[i]-gpuRef[i])>epsilon)
    {
      printf("Results don't match!\n");
      printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i);
      return;
    }
  }
  printf("Check result success!\n");
}

#endif//FRESHMAN_H

实验总结

目前来看，数据在显存与内存之间来回复制的时间较长，是一个值得探究的问题。
不同的计时方式涉及一些底层原理，也需要去查阅资料。
之所以没有使用二维的图像数据，是因为二维数据还没来得及细想，所以假设把它拉平成一维再去做加法。

GPU CPU向量加法时间测试
- 实验设备
- 实验思路
- 实验结果
- 实验代码
  - sum_tutorial_timer.cu
  - sum_traditional_timer.cu
  - freshman.h
- 实验总结

关键词：计算时间、实验结果、向量加法