本篇主要介绍基于CUDA实现的Kmeans算法,以及通过性能分析工具Nsight Compute对CUDA程序性能调优,加速算法。不对Kmeans算法原理性的内容展开。
CUDA实现思路
主要分成两个核函数实现。第一部分基于当前的聚类中心点坐标,计算训练数据所属类别;第二部分基于新的类别信息,计算新的聚类中心坐标。本篇主要介绍第一部分的CUDA实现与调优过程。
参数说明:为了便于阅读,固定原始数据维度为 1,数据大小为 1024 * 1024,聚类中心个数为 16。
// Number of input data points (1M points).
const int kSrcCount = 1024 * 1024;
// Number of cluster centers.
const int kClusterCount = 16;
// Dimensionality of each data point.
const int kCoords = 1;
V1 LocalMemory版本
Base版本,主要实现与性能优化方法如下:
内存分布:共享内存保存聚类中心坐标,寄存器中保存数据点。
内存使用:保证加载设备内存满足合并对齐访问,同时加载共享内存满足广播机制。
计算逻辑:一次计算数据点到各聚类中心欧式距离,并将该数据点类别标记为距离最短的聚类中心类别。
/* V1 (baseline): assign each data point to its nearest cluster center.
   Layout: d_pSrc is [Coords][SrcCount] (SoA, coalesced loads); cluster
   centers are staged in dynamic shared memory (launch with
   ClusterCount * Coords * sizeof(float) bytes); one thread per point.
   *d_pChanged accumulates how many points changed membership. */
__global__ void kKmeansClassifyMembership(int SrcCount,
	int Coords/*[1]*/,
	int ClusterCount/*[16]*/,
	float* d_pSrc, /*[Coords][SrcCount]*/
	int* d_pMembership,/*[SrcCount]*/
	float* d_pCluster, /*[ClusterCount][Coords]*/
	int* d_pChanged/*[1] output*/
)
{
	extern __shared__ float sm_cluster[];/*[ClusterCount][Coords]*/
	__shared__ int sm_blockchanged; // per-block count of points whose label changed
	if (threadIdx.x == 0)
	{
		sm_blockchanged = 0;
	}
	// Sized for up to 4 dimensions. Declared as registers, but the dynamic
	// indexing below makes the compiler spill it to local memory (fixed in V3).
	float regData[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	// Cooperatively stage the cluster centers in shared memory; all threads
	// later read the same element, which is served by the broadcast mechanism.
	for (int i = threadIdx.x; i < Coords * ClusterCount; i += blockDim.x)
	{
		sm_cluster[i] = d_pCluster[i];
	}
	__syncthreads();
	float minDistance = 10000000.0f;
	int clusterIndex = 0;
	if (tid < SrcCount)
	{
		for (int i = 0; i < Coords; i++)
		{
			regData[i] = d_pSrc[tid + i * SrcCount];
		}
		for (int itcluster = 0; itcluster < ClusterCount; itcluster++)
		{
			float curDistance = 0.0f;
			for (int idim = 0; idim < Coords; idim++)
			{
				// Squared Euclidean distance; use a float multiply instead of
				// pow(x, 2), which promotes to a double-precision library call.
				float diff = regData[idim] - sm_cluster[itcluster * Coords + idim];
				curDistance += diff * diff;
			}
			if (curDistance < minDistance)
			{
				minDistance = curDistance;
				clusterIndex = itcluster;
			}
		}
		// BUGFIX: the membership read/write must stay inside the bounds check,
		// otherwise threads with tid >= SrcCount access d_pMembership out of range.
		bool bchanged = (clusterIndex != d_pMembership[tid]);
		d_pMembership[tid] = clusterIndex;
		if (bchanged)
		{
			atomicAdd(&sm_blockchanged, 1);
		}
	}
	__syncthreads();
	// One atomic per block instead of one per changed point.
	if (threadIdx.x == 0)
	{
		atomicAdd(d_pChanged, sm_blockchanged);
	}
}
执行情况如下
核心和内存使用率都比较低,执行时间为243us。
调整block grid数量后耗时170us
将grid增大8倍后,核心和内存使用率得到提升。
V1版本主要性能问题由加载内存延迟导致。瓶颈代码位置是比较数据点所属类别是否发生变化,如下:
bool bchanged = (clusterIndex == d_pMembership[tid]) ? false : true;
此外,V1版本在设计时regData是想保存在寄存器中,而编译器把它放在了局部内存,不符合预期。
V2 提前加载数据点类别
针对V1版本中的性能问题,将数据点的类别信息提前加载到寄存器中
/*V2,提前加载分类信息,修复stall_long_sb指标高问题*/
/* V2: identical to V1 except the previous membership is prefetched early so
   its global-load latency overlaps the distance computation (fixes the high
   stall_long_sb metric observed in V1). */
__global__ void kKmeansClassifyMembershipPrefetch(int SrcCount,
	int Coords/*[1]*/,
	int ClusterCount/*[16]*/,
	float* d_pSrc, /*[Coords][SrcCount]*/
	int* d_pMembership,/*[SrcCount]*/
	float* d_pCluster, /*[ClusterCount][Coords]*/
	int* d_pChanged/*[1] output*/
)
{
	extern __shared__ float sm_cluster[];/*[ClusterCount][Coords]*/
	__shared__ int sm_blockchanged; // per-block count of points whose label changed
	if (threadIdx.x == 0)
	{
		sm_blockchanged = 0;
	}
	// Declared as registers, but dynamic indexing makes the compiler spill
	// this array to local memory (fixed in V3 via full unrolling).
	float regData[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	// Cooperatively stage the cluster centers in shared memory.
	for (int i = threadIdx.x; i < Coords * ClusterCount; i += blockDim.x)
	{
		sm_cluster[i] = d_pCluster[i];
	}
	// Prefetch: issue the membership load before the compute loop so the
	// latency is hidden. BUGFIX: bounds-checked to avoid reading past SrcCount.
	int preMembership = (tid < SrcCount) ? d_pMembership[tid] : 0;
	__syncthreads();
	float minDistance = 10000000.0f;
	int clusterIndex = 0;
	if (tid < SrcCount)
	{
		for (int i = 0; i < Coords; i++)
		{
			regData[i] = d_pSrc[tid + i * SrcCount];
		}
		for (int itcluster = 0; itcluster < ClusterCount; itcluster++)
		{
			float curDistance = 0.0f;
			for (int idim = 0; idim < Coords; idim++)
			{
				// Float multiply instead of pow(x, 2), which promotes to a
				// double-precision library call.
				float diff = regData[idim] - sm_cluster[itcluster * Coords + idim];
				curDistance += diff * diff;
			}
			if (curDistance < minDistance)
			{
				minDistance = curDistance;
				clusterIndex = itcluster;
			}
		}
		// BUGFIX: only in-range threads compare and write their membership.
		bool bchanged = (clusterIndex != preMembership);
		d_pMembership[tid] = clusterIndex;
		if (bchanged)
		{
			atomicAdd(&sm_blockchanged, 1);
		}
	}
	__syncthreads();
	if (threadIdx.x == 0)
	{
		atomicAdd(d_pChanged, sm_blockchanged);
	}
}
执行情况如下,耗时118us
优化到这步,核心的使用率升高到峰值的70% +。
V2版本的主要性能问题是stall_wait,代码位置如下,下一个版本尝试通过编译器优化去掉循环
for (int idim = 0; idim < Coords; idim++)
{
curDistance += pow(regData[idim] - sm_cluster[itcluster * Coords + idim], 2);
}
V3 模板与编译器展开
前文已经做过说明,数据的维度固定为1,因此V2版本性能瓶颈的位置没必要做循环处理。尝试通过编译器展开技术优化,模板参数COORDS代表数据维度,代码如下
template<int COORDS>
__global__ void kKmeansClassifyMembershipTmpl(int SrcCount,
int Coords/*[1]*/,
int ClusterCount/*[16]*/,
float* d_pSrc, /*[Coords][SrcCount]*/
int* d_pMembership,/*[SrcCount]*/
float* d_pCluster, /*[ClusterCount][Coords]*/
int* d_pChanged/*[1] output*/
)
{
extern __shared__ float sm_cluster[];/*[ClusterCount][Coords]*/
__shared__ int sm_blockchanged;
if (threadIdx.x == 0)
{
sm_blockchanged = 0;
}
float regData[4] = { 0.0f,0.0f, 0.0f, 0.0f };//这里数据保存在寄存器
int tid = threadIdx.x + blockIdx.x * blockDim.x;
//TODO:load cluster center point to sm
for (int i = threadIdx.x; i < Coords * ClusterCount; i += blockDim.x)
{
sm_cluster[i] = d_pCluster[i];
}
int preMembership = d_pMembership[tid];
__syncthreads();
//TODO:find nearest cluster center point
float minDistance = 10000000.0f;
int clusterIndex = 0;
if (tid < SrcCount)
{
#pragma unroll
for (size_t i = 0; i < COORDS; i++)
{
regData[i] = d_pSrc[tid + i * SrcCount];
}
for (int itcluster = 0; itcluster < ClusterCount; itcluster++)
{
float curDistance = 0.0f;
#pragma unroll
for (int idim = 0; idim < COORDS; idim++)
{
curDistance += pow(regData[idim] - sm_cluster[itcluster * Coords + idim], 2);
}
if (curDistance < minDistance)
{
minDistance = curDistance;
clusterIndex = itcluster;
}
}
}
//TODO:find if membership changed
bool bchanged = (clusterIndex == preMembership) ? false : true;
d_pMembership[tid] = clusterIndex;
if (bchanged)
{
atomicAdd(&sm_blockchanged, 1);
}
__syncthreads();
if (threadIdx.x == 0)
{
atomicAdd(d_pChanged, sm_blockchanged);
}
}
执行情况如下,耗时45us
V3版本的核心和内存的负载非常均衡,且都达到了峰值的70%。
优化到V3版本,Nsight Compute对于warp的执行状态已经不再提示问题。同时对比V1版本,通过模板与展开技术,编译器将regData放在了寄存器中,这也是执行速度对比V2提升近一倍的原因之一。
V4 2 ^ n倍展开
优化到V3版本已经无明确的性能问题提示,这里尝试通过2 ^ n 倍展开,观察是否能进一步加速算法。经验证,4倍展开性能最佳,即模板参数UNROLLSCALE=4。
/*V4,按照聚类数量展开,收益不大;在最外层逻辑独立展开4倍,快20%*/
template<int COORDS,int UNROLLSCALE>
/* V4: each thread now classifies UNROLLSCALE points (strided by the total
   thread count, so loads stay coalesced). Launch with SrcCount /
   UNROLLSCALE threads in total. Unrolling over the cluster count gave
   little benefit; unrolling the outer per-point loop is ~20% faster
   (UNROLLSCALE=4 measured best). Requires Coords == COORDS. */
template<int COORDS, int UNROLLSCALE>
__global__ void kKmeansClassifyMembershipTAdv(int SrcCount,
	int Coords/*[1]*/,
	int ClusterCount/*[16]*/,
	float* d_pSrc, /*[Coords][SrcCount]*/
	int* d_pMembership,/*[SrcCount]*/
	float* d_pCluster, /*[ClusterCount][Coords]*/
	int* d_pChanged/*[1] output*/
)
{
	extern __shared__ float sm_cluster[];/*[ClusterCount][Coords]*/
	__shared__ int sm_blockchanged; // per-block count of points whose label changed
	if (threadIdx.x == 0)
	{
		sm_blockchanged = 0;
	}
	// BUGFIX: each thread holds UNROLLSCALE points of COORDS dims each, so the
	// buffer must be UNROLLSCALE * COORDS floats (the original [UNROLLSCALE]
	// overflows whenever COORDS > 1).
	float regData[UNROLLSCALE * COORDS];
	int unrollsize = blockDim.x * gridDim.x; // stride between the points one thread owns
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	// Cooperatively stage the cluster centers in shared memory.
	for (int i = threadIdx.x; i < COORDS * ClusterCount; i += blockDim.x)
	{
		sm_cluster[i] = d_pCluster[i];
	}
	// Prefetch previous memberships. BUGFIX: every strided index is
	// bounds-checked, not just tid.
	int preMembership[UNROLLSCALE];
	#pragma unroll
	for (int unroll = 0; unroll < UNROLLSCALE; unroll++)
	{
		int idx = tid + unroll * unrollsize;
		preMembership[unroll] = (idx < SrcCount) ? d_pMembership[idx] : 0;
	}
	__syncthreads();
	float minDistance[UNROLLSCALE];
	int clusterIndex[UNROLLSCALE];
	#pragma unroll
	for (int i = 0; i < UNROLLSCALE; i++)
	{
		minDistance[i] = 10000000.0f;
		clusterIndex[i] = 0; // BUGFIX: initialize so out-of-range slots are defined
	}
	#pragma unroll
	for (int unroll = 0; unroll < UNROLLSCALE; unroll++)
	{
		int idx = tid + unroll * unrollsize;
		if (idx < SrcCount)
		{
			#pragma unroll
			for (int idim = 0; idim < COORDS; idim++)
			{
				regData[idim + unroll * COORDS] = d_pSrc[idx + idim * SrcCount];
			}
			for (int itcluster = 0; itcluster < ClusterCount; itcluster++)
			{
				float curDistance = 0.0f;
				#pragma unroll
				for (int idim = 0; idim < COORDS; idim++)
				{
					// Float multiply instead of pow(x, 2) (double library call).
					float diff = regData[idim + unroll * COORDS] - sm_cluster[itcluster * COORDS + idim];
					curDistance += diff * diff;
				}
				if (curDistance < minDistance[unroll])
				{
					minDistance[unroll] = curDistance;
					clusterIndex[unroll] = itcluster;
				}
			}
		}
	}
	bool bchanged = false;
	#pragma unroll
	for (int unroll = 0; unroll < UNROLLSCALE; unroll++)
	{
		int idx = tid + unroll * unrollsize;
		if (idx < SrcCount) // BUGFIX: guard the membership write
		{
			bchanged |= (clusterIndex[unroll] != preMembership[unroll]);
			d_pMembership[idx] = clusterIndex[unroll];
		}
	}
	if (bchanged)
	{
		atomicAdd(&sm_blockchanged, 1);
	}
	__syncthreads();
	if (threadIdx.x == 0)
	{
		atomicAdd(d_pChanged, sm_blockchanged);
	}
}
4倍展开时,执行情况如下
此时运行时间为36us
由于做了4倍展开,block数量变为2048
V5进一步展开
尝试在聚类个数这个维度对应的for循环上进行展开,经验证2倍展开性能最佳,即模板参数CLUSTERUNROLL=2.
/*V5,3个维度全部展开*/
template<int COORDS, int UNROLLSCALE,int CLUSTERS,int CLUSTERUNROLL>
__global__ void kKmeansClassifyMembershipTUnrollAll(int SrcCount,
int Coords/*[1]*/,
int ClusterCount/*[16]*/,
float* d_pSrc, /*[Coords][SrcCount]*/
int* d_pMembership,/*[SrcCount]*/
float* d_pCluster, /*[ClusterCount][Coords]*/
int* d_pChanged/*[1] output*/
)
{
extern __shared__ float sm_cluster[];/*[ClusterCount][Coords]*/
__shared__ int sm_blockchanged;
if (threadIdx.x == 0)
{
sm_blockchanged = 0;
}
float regData[UNROLLSCALE];
int unrollsize = blockDim.x * gridDim.x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
//TODO:load cluster center point to sm#pragma unroll
#pragma unroll
for (int i = threadIdx.x; i < COORDS * CLUSTERS; i += blockDim.x)
{
sm_cluster[i] = d_pCluster[i];
}
int preMembership[UNROLLSCALE];
#pragma unroll
for (size_t unroll = 0; unroll < UNROLLSCALE; unroll++)
{
preMembership[unroll] = d_pMembership[tid + unroll * unrollsize];
}
__syncthreads();
//TODO:find nearest cluster center point
float minDistance[UNROLLSCALE];
#pragma unroll
for (size_t i = 0; i < UNROLLSCALE; i++)
{
minDistance[i] = 10000000.0f;
}
int clusterIndex[UNROLLSCALE];
if (tid < SrcCount)
{
#pragma unroll
for (size_t unroll = 0; unroll < UNROLLSCALE; unroll++)
{
#pragma unroll
for (size_t idim = 0; idim < COORDS; idim++)
{
regData[idim + unroll * COORDS] = d_pSrc[tid + unroll * unrollsize + idim * SrcCount];
}
}
#pragma unroll
for (size_t unroll = 0; unroll < UNROLLSCALE; unroll++)
{
for (int itclusterOut = 0; itclusterOut < CLUSTERS/CLUSTERUNROLL; itclusterOut++)
{
#pragma unroll
for (size_t itclusterIn = 0; itclusterIn < CLUSTERUNROLL; itclusterIn++)
{
float curDistance = 0.0f;
#pragma unroll
for (int idim = 0; idim < COORDS; idim++)
{
curDistance += pow(regData[idim + unroll * COORDS] - sm_cluster[(itclusterOut* CLUSTERUNROLL + itclusterIn) * COORDS + idim], 2);
}
if (curDistance < minDistance[unroll])
{
minDistance[unroll] = curDistance;
clusterIndex[unroll] = itclusterOut * CLUSTERUNROLL + itclusterIn;
}
}
}
}
}
//TODO:find if membership changed
bool bchanged = false;
#pragma unroll
for (size_t unroll = 0; unroll < UNROLLSCALE; unroll++)
{
bchanged |= (clusterIndex[unroll] == preMembership[unroll]) ? false : true;
d_pMembership[tid + unroll * unrollsize] = clusterIndex[unroll];
}
if (bchanged)
{
atomicAdd(&sm_blockchanged, 1);//直接加1而不使用atomic原则上对最终的效果没影响
}
__syncthreads();
if (threadIdx.x == 0)
{
atomicAdd(d_pChanged, sm_blockchanged);
}
}
执行情况
执行速度得到进一步提升,耗时30us。但是过度展开引发了计算和内存加载二者之间不平衡的问题。如果只从均衡性的角度来看,V5版本进一步进行展开不如V4版本。
V6 消除原子操作Atomic
之前的所有版本都是通过原子操作写入设备内存中,以标记本次迭代计算中数据点所属类别是否发生改变。实际上该块设备内存仅需包含所属分类是否变化的信息,无需统计出当前迭代有多少数据点发生了变化,因此没必要通过原子操作统计出确切的数值,只要核函数之前初始化为0,数据点类别变化所对应线程修改该块内存即可。
/*V6,去除atomic*/
template<int COORDS, int UNROLLSCALE, int CLUSTERS, int CLUSTERUNROLL>
__global__ void kKmeansClassifyMembershipRMAtomic(int SrcCount,
int Coords/*[1]*/,
int ClusterCount/*[16]*/,
float* d_pSrc, /*[Coords][SrcCount]*/
int* d_pMembership,/*[SrcCount]*/
float* d_pCluster, /*[ClusterCount][Coords]*/
int* d_pChanged/*[1] output*/
)
{
extern __shared__ float sm_cluster[];/*[ClusterCount][Coords]*/
__shared__ int sm_blockchanged;
if (threadIdx.x == 0)
{
sm_blockchanged = 0;
}
float regData[UNROLLSCALE];
int unrollsize = blockDim.x * gridDim.x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
//TODO:load cluster center point to sm#pragma unroll
#pragma unroll
for (int i = threadIdx.x; i < COORDS * CLUSTERS; i += blockDim.x)
{
sm_cluster[i] = d_pCluster[i];
}
int preMembership[UNROLLSCALE];
#pragma unroll
for (size_t unroll = 0; unroll < UNROLLSCALE; unroll++)
{
preMembership[unroll] = d_pMembership[tid + unroll * unrollsize];
}
__syncthreads();
//TODO:find nearest cluster center point
float minDistance[UNROLLSCALE];
#pragma unroll
for (size_t i = 0; i < UNROLLSCALE; i++)
{
minDistance[i] = 10000000.0f;
}
int clusterIndex[UNROLLSCALE];
if (tid < SrcCount)
{
#pragma unroll
for (size_t unroll = 0; unroll < UNROLLSCALE; unroll++)
{
#pragma unroll
for (size_t idim = 0; idim < COORDS; idim++)
{
regData[idim + unroll * COORDS] = d_pSrc[tid + unroll * unrollsize + idim * SrcCount];
}
}
#pragma unroll
for (size_t unroll = 0; unroll < UNROLLSCALE; unroll++)
{
#pragma unroll
for (int itclusterOut = 0; itclusterOut < CLUSTERS / CLUSTERUNROLL; itclusterOut++)
{
for (size_t itclusterIn = 0; itclusterIn < CLUSTERUNROLL; itclusterIn++)
{
float curDistance = 0.0f;
#pragma unroll
for (int idim = 0; idim < COORDS; idim++)
{
curDistance += pow(regData[idim + unroll * COORDS] - sm_cluster[(itclusterOut * CLUSTERUNROLL + itclusterIn) * COORDS + idim], 2);
}
if (curDistance < minDistance[unroll])
{
minDistance[unroll] = curDistance;
clusterIndex[unroll] = itclusterOut * CLUSTERUNROLL + itclusterIn;
}
}
}
}
}
//TODO:find if membership changed
bool bchanged = false;
#pragma unroll
for (size_t unroll = 0; unroll < UNROLLSCALE; unroll++)
{
bchanged |= (clusterIndex[unroll] == preMembership[unroll]) ? false : true;
d_pMembership[tid + unroll * unrollsize] = clusterIndex[unroll];
}
if (bchanged)
{
sm_blockchanged = 1;
}
__syncthreads();
if (threadIdx.x == 0 && sm_blockchanged == 1)
{
*d_pChanged = 1;
}
}
执行情况如下
V6版本的执行速度相对V5并没有进一步提升,本人猜测可能是数据点分布初始化设置的不够合理,导致这一步优化没有使算法进一步加速。
总结
本篇基于CUDA实现了Kmeans算法,并进行性能调优。通过6个版本的迭代,使执行速度从243us下降至30us,得到8倍加速比。