OpenCL和CUDA虽然不是同一个平级的东西,但是也可以横向比较!
对OpenCL和CUDA的异同做比较:
•         指针遍历
OpenCL不支持CUDA那样的指针遍历方式, 你只能用下标方式间接实现指针遍历. 例子代码如下:
// CUDAstruct Node { Node* next; }
 n = n->next;// OpenCL
struct Node { unsigned int next; }
n = bufBase + n;
• Kernel 程序异同
CUDA的代码最终编译成显卡上的二进制格式,最后由cudart.dll(个人猜测)装载到GPU并且执行。OpenCL中运行时库中包含编译器,
使用伪代码,程序运行时即时编译和装载。这个类似JAVA, .net 程序,道理也一样,为了支持跨平台的兼容。kernel程序的语法也
有略微不同,如下:
 
 
__global__ void vectorAdd(const float * a, const float * b, float * c)
{ // CUDA
    int nIndex = blockIdx.x * blockDim.x + threadIdx.x;
    c[nIndex] = a[nIndex] + b[nIndex];
}
 
 
__kernel void vectorAdd(__global const float * a, __global const float * b, __global float * c)
{ // OpenCL
    int nIndex = get_global_id(0);
    c[nIndex] = a[nIndex] + b[nIndex];
}  
可以看出大部分都相同。只是细节有差异:
1)CUDA 的kernel函数使用“__global__”申明而OpenCL的kernel函数使用“__kernel”作为申明。
2)OpenCL的所有参数都有“__global”修饰符,代表这个参数所指地址是在全局内存。
3)众所周知,CUDA采用threadIdx.{x|y|z}, blockIdx.{x|y|z}来获得当前线程的索引号,而OpenCL
     通过一个特定的get_global_id()函数来获得在kernel中的全局索引号。OpenCL中如果要获得在当前工作
     组(对等于CUDA中的block)中的局部索引号,可以使用get_local_id()
• Host代码的异同
把上面的kernel代码编译成“vectorAdd.cubin”,CUDA调用方法如下:
 
 
const unsigned int cnBlockSize = 512;
const unsigned int cnBlocks = 3;
const unsigned int cnDimension = cnBlocks * cnBlockSize;
CUdevice hDevice;
CUcontext hContext;
CUmodule hModule;
CUfunction hFunction;
// create CUDA device & context
cuInit(0);
cuDeviceGet(&hContext, 0); // pick first device
cuCtxCreate(&hContext, 0, hDevice));
cuModuleLoad(&hModule, “vectorAdd.cubin”);
cuModuleGetFunction(&hFunction, hModule, "vectorAdd");
// allocate host vectors
float * pA = new float[cnDimension];
float * pB = new float[cnDimension];
float * pC = new float[cnDimension];
// initialize host memory
randomInit(pA, cnDimension);
randomInit(pB, cnDimension);
// allocate memory on the device
CUdeviceptr pDeviceMemA, pDeviceMemB, pDeviceMemC;
cuMemAlloc(&pDeviceMemA, cnDimension * sizeof(float));
cuMemAlloc(&pDeviceMemB, cnDimension * sizeof(float));
cuMemAlloc(&pDeviceMemC, cnDimension * sizeof(float));
// copy host vectors to device
cuMemcpyHtoD(pDeviceMemA, pA, cnDimension * sizeof(float));
cuMemcpyHtoD(pDeviceMemB, pB, cnDimension * sizeof(float));
// setup parameter values
cuFuncSetBlockShape(cuFunction, cnBlockSize, 1, 1);
cuParamSeti(cuFunction, 0, pDeviceMemA);
cuParamSeti(cuFunction, 4, pDeviceMemB);
cuParamSeti(cuFunction, 8, pDeviceMemC);
cuParamSetSize(cuFunction, 12);
// execute kernel
cuLaunchGrid(cuFunction, cnBlocks, 1);
// copy the result from device back to host
cuMemcpyDtoH((void *) pC, pDeviceMemC, cnDimension * sizeof(float));
delete[] pA;
delete[] pB;
delete[] pC;
cuMemFree(pDeviceMemA);
cuMemFree(pDeviceMemB);
cuMemFree(pDeviceMemC);
OpenCL的代码以文本方式存放在“sProgramSource”。 调用方式如下:
 
 
const unsigned int cnBlockSize = 512;
const unsigned int cnBlocks = 3;
const unsigned int cnDimension = cnBlocks * cnBlockSize;
// create OpenCL device & context
cl_context hContext;
hContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, 0, 0, 0);
// query all devices available to the context
size_t nContextDescriptorSize;
clGetContextInfo(hContext, CL_CONTEXT_DEVICES, 0, 0, &nContextDescriptorSize);
cl_device_id * aDevices = malloc(nContextDescriptorSize);
clGetContextInfo(hContext, CL_CONTEXT_DEVICES, nContextDescriptorSize, aDevices, 0);
// create a command queue for first device the context reported
cl_command_queue hCmdQueue;
hCmdQueue = clCreateCommandQueue(hContext, aDevices[0], 0, 0);
// create & compile program
cl_program hProgram;
hProgram = clCreateProgramWithSource(hContext, 1, sProgramSource, 0, 0);
clBuildProgram(hProgram, 0, 0, 0, 0, 0);// create kernel
cl_kernel hKernel;
hKernel = clCreateKernel(hProgram, “vectorAdd”, 0);
// allocate host vectors
float * pA = new float[cnDimension];
float * pB = new float[cnDimension];
float * pC = new float[cnDimension];
// initialize host memory
randomInit(pA, cnDimension);
randomInit(pB, cnDimension);
// allocate device memory
cl_mem hDeviceMemA, hDeviceMemB, hDeviceMemC;
hDeviceMemA = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0);
hDeviceMemB = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0);
hDeviceMemC = clCreateBuffer(hContext,
CL_MEM_WRITE_ONLY,
cnDimension * sizeof(cl_float), 0, 0);
// setup parameter values
clSetKernelArg(hKernel, 0, sizeof(cl_mem), (void *)&hDeviceMemA);
clSetKernelArg(hKernel, 1, sizeof(cl_mem), (void *)&hDeviceMemB);
clSetKernelArg(hKernel, 2, sizeof(cl_mem), (void *)&hDeviceMemC);
// execute kernel
clEnqueueNDRangeKernel(hCmdQueue, hKernel, 1, 0, &cnDimension, 0, 0, 0, 0);
// copy results from device back to host
clEnqueueReadBuffer(hContext, hDeviceMemC, CL_TRUE, 0, cnDimension * sizeof(cl_float),
pC, 0, 0, 0);
delete[] pA;
delete[] pB;
delete[] pC;
clReleaseMemObj(hDeviceMemA);
clReleaseMemObj(hDeviceMemB);
clReleaseMemObj(hDeviceMemC); 
 
 
• 初始化部分的异同 
    
 
CUDA 在使用任何API之前必须调用cuInit(0),然后是获得当前系统的可用设备并获得Context。
cuInit(0);
 cuDeviceGet(&hContext, 0);
 cuCtxCreate(&hContext, 0, hDevice));
 OpenCL不用全局的初始化,直接指定设备获得句柄就可以了
cl_context hContext;
 hContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, 0, 0, 0);
 设备创建完毕后,可以通过下面的方法获得设备信息和上下文:
 size_t nContextDescriptorSize;
 clGetContextInfo(hContext, CL_CONTEXT_DEVICES, 0, 0, &nContextDescriptorSize);
 cl_device_id * aDevices = malloc(nContextDescriptorSize);clGetContextInfo(hContext, CL_CONTEXT_DEVICES, nContextDescriptorSize, aDevices, 0);
OpenCL introduces an additional concept: Command Queues. Commands launching kernels and
 reading or writing memory are always issued for a specific command queue. A command queue is
 created on a specific device in a context. The following code creates a command queue for the
 device and context created so far:
 cl_command_queue hCmdQueue;
 hCmdQueue = clCreateCommandQueue(hContext, aDevices[0], 0, 0);
 With this the program has progressed to the point where data can be uploaded to the device’s
 memory and processed by launching compute kernels on the device.• Kernel Creation
CUDA kernel 以二进制格式存放与CUBIN文件中间,其调用格式和DLL的用法比较类似,先装载二进制库,然后通过函数名查找
函数地址,最后用将函数装载到GPU运行。示例代码如下:
CUmodule hModule;
 cuModuleLoad(&hModule, “vectorAdd.cubin”);
 cuModuleGetFunction(&hFunction, hModule, "vectorAdd");
 OpenCL 为了支持多平台,所以不使用编译后的代码,采用类似JAVA的方式,装载文本格式的代码文件,然后即时编译并运行。
 需要注意的是,OpenCL也提供API访问kernel的二进制程序,前提是这个kernel已经被编译并且放在某个特定的缓存中了。// 装载代码,即时编译
cl_program hProgram;
 hProgram = clCreateProgramWithSource(hContext, 1, “vectorAdd.c", 0, 0);
 clBuildProgram(hProgram, 0, 0, 0, 0, 0);
// 获得kernel函数句柄
 cl_kernel hKernel;
 hKernel = clCreateKernel(hProgram, “vectorAdd”, 0); 
• 设备内存分配
内存分配没有什么大区别,OpenCL提供两组特殊的标志,CL_MEM_READ_ONLY  和 CL_MEM_WRITE_ONLY 用来控制内存
的读写权限。另外一个标志比较有用:CL_MEM_COPY_HOST_PTR 表示这个内存在主机分配,但是GPU可以使用,运行时会自动
将主机内存内容拷贝到GPU,主机内存分配,设备内存分配,主机拷贝数据到设备,3个步骤一气呵成。
// CUDACUdeviceptr pDeviceMemA, pDeviceMemB, pDeviceMemC;
 cuMemAlloc(&pDeviceMemA, cnDimension * sizeof(float));
 cuMemAlloc(&pDeviceMemB, cnDimension * sizeof(float));
 cuMemAlloc(&pDeviceMemC, cnDimension * sizeof(float));
 cuMemcpyHtoD(pDeviceMemA, pA, cnDimension * sizeof(float));
 cuMemcpyHtoD(pDeviceMemB, pB, cnDimension * sizeof(float));
// OpenCL
hDeviceMemA = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0);
 hDeviceMemB = clCreateBuffer(hContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0);
 hDeviceMemC = clCreateBuffer(hContext, CL_MEM_WRITE_ONLY, cnDimension * sizeof(cl_float), 0, 0);• Kernel Parameter Specification
The next step in preparing the kernels for launch is to establish a mapping between the kernels’
 parameters, essentially pointers to the three vectors A, B and C, to the three device memory regions,
 which were allocated in the previous section.
 Parameter setting in both APIs is a pretty low-level affair. It requires knowledge of the total number
 , order, and types of a given kernel’s parameters. The order and types of the parameters are used to
 determine a specific parameters offset inside the data block made up of all parameters. The offset in
 bytes for the n-th parameter is essentially the sum of the sizes of all (n-1) preceding parameters.
 Using the CUDA Driver API:
 In CUDA device pointers are represented as unsigned int and the CUDA Driver API has a
 dedicated method for setting that type. Here’s the code for setting the three parameters. Note how
 the offset is incrementally computed as the sum of the previous parameters’ sizes.
 cuParamSeti(cuFunction, 0, pDeviceMemA);
 cuParamSeti(cuFunction, 4, pDeviceMemB);
 cuParamSeti(cuFunction, 8, pDeviceMemC);
 cuParamSetSize(cuFunction, 12);
 Using OpenCL:
 In OpenCL parameter setting is done via a single function that takes a pointer to the location of the
 parameter to be set.
 clSetKernelArg(hKernel, 0, sizeof(cl_mem), (void *)&hDeviceMemA);
 clSetKernelArg(hKernel, 1, sizeof(cl_mem), (void *)&hDeviceMemB);
 clSetKernelArg(hKernel, 2, sizeof(cl_mem), (void *)&hDeviceMemC);