简介

GPU Driven顾名思义,所有信息由GPU处理计算或渲染

很多文章也有介绍,这里记录下自己的一个思路

先看下效果图

gpu测试渲染_世界坐标

用的Unity商城免费的资源

Terrain转高度图

这里我用了4个原生Unity地形,大小1024x1024

拼成了一个2x2,大小为2048x2048的高度图

// Bakes the heights of every terrain tile into one normalized, single-channel (RFloat) height map.
// NOTE(review): heightSize is taken from the terrain's world-space width but is then used as the
// GetHeight sample count, so this presumably assumes one height sample per world unit — confirm.
int terrainNum = terrains.Length;
            // Number of terrain tiles laid out per row of the combined map.
            int splitNumX = 2;
            int heightSize = (int)terrains[0].terrainData.size.x;
            // Only power-of-two tile sizes are accepted; anything else aborts the bake.
            if (!Mathf.IsPowerOfTwo(heightSize))
            {
                Debug.LogError("heightSize is not PowerOfTwo");
                return;
            }

            // heightSize already passed the power-of-two check above, so this keeps the same value.
            var realHeightSize = Mathf.NextPowerOfTwo(heightSize);
            // Edge length of the combined square map: tile size * (tile count / tiles per row).
            int totalHeightSize = realHeightSize * terrainNum / splitNumX;
            float[] pixelData = new float[totalHeightSize * totalHeightSize];
            heightMap = new Texture2D(totalHeightSize, totalHeightSize, TextureFormat.RFloat, false);
            // The first terrain's vertical scale is used to normalize the heights of all tiles.
            maxHeight = terrains[0].terrainData.heightmapScale.y;

            for (int k = 0; k < terrainNum; k++)
            {
                var terrain = terrains[k];

                // Tile k sits at column (k % splitNumX), row (k / splitNumX) of the combined map.
                var offsetX = (k % splitNumX) * heightSize;
                var offsetY = ((k / splitNumX)) * heightSize;
                for (int i = 0; i < heightSize; i++)
                {
                    for (int j = 0; j < heightSize; j++)
                    {
                        // Row-major write: x = offsetX + i, y = offsetY + j; height divided by maxHeight.
                        pixelData[offsetX + i + (j + offsetY) * totalHeightSize] =
                            terrain.terrainData.GetHeight(i, j) / maxHeight;
                    }
                }
            }

            // Copy the CPU-side array into mip 0 of the texture and upload it.
            heightMap.SetPixelData(pixelData, 0);
            heightMap.Apply();

创建Mesh

准备一个平铺的网格,这里我们用一个8x8,单位1米

这里用顶点索引,可以用最少的顶点数量去生成

/// <summary>
/// Builds a flat, indexed square grid mesh lying in the XZ plane (y = 0), starting at the origin.
/// Also stores the number of grid cells per side in <c>_gValue._PerPacthGridNum</c>.
/// </summary>
/// <param name="size">Edge length of the whole mesh; must not be negative.</param>
/// <param name="gridSize">Edge length of one grid cell; must be greater than zero.</param>
/// <returns>A mesh with (n + 1) * (n + 1) shared vertices and n * n * 2 triangles, where n = size / gridSize rounded to an integer.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="size"/> is negative, or <paramref name="gridSize"/> is zero, negative or NaN.
/// </exception>
private Mesh CreateQuardMesh(int size, float gridSize = 1)
    {
        if (size < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(size), size, "size must not be negative.");
        }

        // Written as !(x > 0) so that NaN is rejected as well.
        if (!(gridSize > 0))
        {
            throw new ArgumentOutOfRangeException(nameof(gridSize), gridSize, "gridSize must be greater than zero.");
        }

        // The grid is square, so a single cell count is enough for both axes.
        int cellsPerSide = Convert.ToInt32(size / gridSize);
        _gValue._PerPacthGridNum = cellsPerSide;

        // Vertices are shared between neighbouring cells and stored row by row (z outer, x inner).
        int stride = cellsPerSide + 1;
        Vector3[] vertices = new Vector3[stride * stride];
        for (int z = 0; z < stride; z++)
        {
            for (int x = 0; x < stride; x++)
            {
                vertices[z * stride + x] = new Vector3(x * gridSize, 0, z * gridSize);
            }
        }

        // Two triangles per cell: (bottom-left, top-left, top-right) and (top-right, bottom-right, bottom-left).
        int[] triangles = new int[cellsPerSide * cellsPerSide * 6];
        for (int z = 0; z < cellsPerSide; z++)
        {
            for (int x = 0; x < cellsPerSide; x++)
            {
                int bottomLeft = z * stride + x;
                int topLeft = bottomLeft + stride;
                int first = (z * cellsPerSide + x) * 6;

                triangles[first] = bottomLeft;
                triangles[first + 1] = topLeft;
                triangles[first + 2] = topLeft + 1;

                triangles[first + 3] = topLeft + 1;
                triangles[first + 4] = bottomLeft + 1;
                triangles[first + 5] = bottomLeft;
            }
        }

        Mesh mesh = new Mesh();
        // The default 16-bit index buffer can only address 65535 vertices; switch to 32-bit
        // indices for larger grids. This has to happen before the triangles are assigned.
        if (vertices.Length > 65535)
        {
            mesh.indexFormat = UnityEngine.Rendering.IndexFormat.UInt32;
        }

        mesh.vertices = vertices;
        mesh.triangles = triangles;
        mesh.RecalculateNormals();
        mesh.RecalculateTangents();
        return mesh;
    }

创建地形

准备数据

首先我们会根据_SplitNum=4参数,切割整个地形

_SplitNum表示边长,_SplitNum*_SplitNum才是会切割地形的块数

// Per-LOD quadtree parameters; the shaders read one entry per LOD level from _NodeStructs.
// NOTE(review): the field order presumably has to match the NodeInfoStruct declared on the shader side — confirm.
public struct NodeInfoStruct
{
    // LOD level this entry describes.
    public int CurLOD;
    // Number of nodes per row at this LOD (used as the row stride when flattening a node's XY).
    public int NodeNum;
    // Index of this LOD's first node in the flattened per-node list.
    public int Offset;
    // Edge length of one node; multiplied by the node XY to get the node's world position.
    public int NodeSize;
    // Edge length of one patch; multiplied by the patch XY to get the patch offset inside its node.
    public int PacthSize;
    // Scale applied to the XZ of the patch mesh vertices at this LOD.
    public int VertexScale;
    // Mip level of the min/max height map that is sampled for patches of this LOD.
    public int HeightMipLevel;
}

每一块就是一个Node,然后Node也会切成固定数量的Pacth,一个Pacth可以表示成一个Mesh

比如这块地形是2048x2048,我们_MAXLOD=3,一个Node可以分成8x8个Pacth

LOD3 Node数量4x4 NodeSize=512x512 PacthSize=64x64

LOD2 Node数量8x8 NodeSize=256x256 PacthSize=32x32

LOD1 Node数量16x16 NodeSize=128x128 PacthSize=16x16

LOD0 Node数量32x32 NodeSize=64x64 PacthSize=8x8

本来一个Mesh8x8大小,如果LOD大于0,我们就需要把Mesh放大

我们会用到GPU Instance技术, DrawMeshInstancedIndirect把所有的Pacth渲染出来

TraverseQuadTree

这里我们用Compute Shader

根据_MAXLOD大小,判断当前的Node是否可以再细分

EvaluateNode函数很简单:先算出节点到相机的距离dst,再计算 dst / nodeSize / (一个你自定义的系数),根据这个值判断节点是否需要继续细分

// Processes one node of the current LOD: either splits it into its four children
// (queued for the next pass) or emits it as a final node to be rendered.
[numthreads(1, 1, 1)]
void TraverseQuadTree(uint3 id : SV_DispatchThreadID)
{
    const uint2 node = _ConsumeList.Consume();
    const NodeInfoStruct lodInfo = _NodeStructs[_CurLOD];
    // Flattened index of this node across all LODs.
    const int flatIndex = lodInfo.Offset + node.y * lodInfo.NodeNum + node.x;

    // LOD 0 nodes can never be subdivided further.
    const bool subdivide = _CurLOD > 0 && EvaluateNode(node, lodInfo.NodeSize);
    if (!subdivide)
    {
        // Leaf: record it for rendering and mark it as "not split" (2).
        _AppendFinalNodeList.Append(uint3(node, _CurLOD));
        _NodeBrunchList[flatIndex] = 2;
        return;
    }

    // Split: the four children live at twice the coordinates in the next finer LOD.
    const uint2 firstChild = node * 2;
    _AppendList.Append(firstChild);
    _AppendList.Append(firstChild + uint2(0, 1));
    _AppendList.Append(firstChild + uint2(1, 0));
    _AppendList.Append(firstChild + uint2(1, 1));
    // Mark the node as "split" (1).
    _NodeBrunchList[flatIndex] = 1;
}

gpu测试渲染_游戏_02

数字表示LOD

CreateNodeSectorMap

记录整个Node节点的LOD信息,纹理大小是LOD0的节点数,也就是32x32

用于填补Mesh在不同LOD后产生的缝隙

// Writes, for every sector (one texel per LOD 0 node), the LOD of the unsplit node that covers it.
[numthreads(8,8,1)]
void CreateNodeSectorMap(uint3 id : SV_DispatchThreadID)
{
    const uint2 sector = id.xy;
    // Falls back to 0 when no covering node is flagged as unsplit.
    int coveringLod = 0;
    for (int level = _MAXLOD; level >= 0; level--)
    {
        const NodeInfoStruct info = _NodeStructs[level];
        // XY of the node at this LOD that contains the sector.
        const uint2 owner = sector >> level;
        const int flatIndex = info.Offset + owner.y * info.NodeNum + owner.x;
        // 2 marks a node that was not split, i.e. the one actually rendered.
        if (_NodeBrunchList[flatIndex] == 2)
        {
            coveringLod = level;
            break;
        }
    }
    _NodeSectorMap[sector] = coveringLod;
}

CreatePatch

先看下完整代码

// Expands one final node into its patches: each thread builds one patch, computes its
// bounds, applies the enabled culling passes and appends the survivors to _CulledPatchList.
[numthreads(8, 8, 1)]
void CreatePatch(uint3 id : SV_DispatchThreadID, uint3 groupId:SV_GroupID, uint3 groupThreadId:SV_GroupThreadID)
{
    // The group index selects the node, the thread index inside the group selects the patch.
    const uint3 nodeEntry = _FinalNodeList[groupId.x];
    const uint2 nodeXY = nodeEntry.xy;
    const uint2 patchXY = groupThreadId.xy;
    const NodeInfoStruct nodeInfo = _NodeStructs[nodeEntry.z];

    RenderPatch renderPatch = (RenderPatch)0;
    renderPatch._wpos.z = nodeEntry.z;
    // Patch world XZ = node world position + patch offset inside the node.
    renderPatch._wpos.xy = nodeInfo.NodeSize * nodeXY + nodeInfo.PacthSize * patchXY;

    // nodeXY * _PerNodePacthNum + patchXY is the texel of this patch in the min/max height mip.
    const float2 heightRange = _HeightMapRT.mips[nodeInfo.HeightMipLevel][nodeXY * _PerNodePacthNum + patchXY].xy;
    const float3 patchOrigin = float3(renderPatch._wpos.x, 0, renderPatch._wpos.y);

    // Axis-aligned box spanning the patch footprint and its min/max height.
    Bounds bounds;
    bounds.minPosition = patchOrigin + float3(0, heightRange.x * _max_height, 0);
    bounds.maxPosition = patchOrigin + float3(nodeInfo.PacthSize, heightRange.y * _max_height, nodeInfo.PacthSize);

    #if _VIEW_FRUSTUM_CULLING
    if (IsFrustumCulling(bounds, _ViewFrustumPlane))
        return;
    #endif

    #if _DEBUG_MIP
    renderPatch._wpos.w = GetBoundsMip(bounds);
    #endif
    #if _HIZ_CULLING
        if (IsHizCulling(bounds))
            return;
    #endif
    // Record the neighbouring LODs so the vertex stage can close seams.
    SetLODTrans(renderPatch, nodeXY, patchXY, nodeInfo);
    _CulledPatchList.Append(renderPatch);
}

计算出Pacth的世界坐标,判断是否需要视锥剔除,Hiz剔除,

// Per-patch record produced by CreatePatch and read back by the terrain vertex shader.
struct RenderPatch
{
    // xy: patch world position (XZ), z: LOD of the patch, w: mip value (only written when _DEBUG_MIP is on).
    uint4 _wpos;
    // GetSectorLOD results for the -x, -y, +x and +y neighbours (left at 0 for edges that are not on the node border).
    uint4 _lodTrans;
};

_wpos.xy世界坐标

_wpos.z PatchLOD

_wpos.w 当前的Mip信息

HeightMinMaxMap

LOD(N)的一个像素代表LOD(N-1)的4个像素,这4个像素高度可能都不一样,我们需要算出最高和最低的,保证精度不丢失

由高度图生成的MinMax高度图,其mip 0的大小**等于**LOD0时Pacth的数量

也就是一个Pacth等于高度图里面的一个像素

Node里面有一个HeightMipLevel,这个值就是与当前LOD下Pacth大小相对应的高度图mip层级

nodeXY * _PerNodePacthNum + patchXY 获取高度图的像素位置

这样就可以获得每个Pacth的最大最小高度,方便进行计算

// Builds the next mip of the min/max height map: every output texel stores the lowest
// minimum (x) and the highest maximum (y) of the 2x2 input texels it covers.
[numthreads(8, 8, 1)]
void BuildMinMaxHeightMapByMinMaxHeightMap(uint3 id : SV_DispatchThreadID)
{
    // Top-left texel of the 2x2 source block.
    const uint2 src = id.xy * 2;
    const float2 a = inputMinMaxHeightMap[src];
    const float2 b = inputMinMaxHeightMap[src + uint2(1, 0)];
    const float2 c = inputMinMaxHeightMap[src + uint2(0, 1)];
    const float2 d = inputMinMaxHeightMap[src + uint2(1, 1)];

    // Reduce each row of the block first, then combine the two rows.
    const float2 row0 = float2(min(a.x, b.x), max(a.y, b.y));
    const float2 row1 = float2(min(c.x, d.x), max(c.y, d.y));
    outputMinMaxHeightMap[id.xy] = float2(min(row0.x, row1.x), max(row0.y, row1.y));
}

FrustumCulling

Unity默认不是有视锥剔除吗?没错是的,这里我们还需要做一次,因为GPU Instance这种模式,Unity不会帮你做剔除

使用Unity API GeometryUtility.CalculateFrustumPlanes获取视锥的6个面,返回顺序是左、右、下、上、近、远

每个平面由法线和平面到世界原点的距离(沿法线方向)表示,并且法线朝向视锥体内部

根据Pacth的最大和最小的高度,组成一个包围盒

// Axis-aligned bounding box described by its two extreme corners.
struct Bounds
{
    float3 minPosition; // corner with the smallest x, y and z
    float3 maxPosition; // corner with the largest x, y and z
};

对视锥的每个面,从包围盒的minPosition和maxPosition中逐轴选出沿该面法线方向最远的角点,判断它是否在该面的内侧;只要有一个面判定它在外侧,整个包围盒就在视锥之外,可以剔除

判断方法就用平面公式
gpu测试渲染_gpu测试渲染_03

gpu测试渲染_开发语言_04

gpu测试渲染_世界坐标_05

// Returns true when the box lies completely outside at least one of the six frustum
// planes, i.e. when it can be culled. Plane normals are expected to point into the frustum.
bool IsFrustumCulling(Bounds bounds, float4 planes[6])
{
    const float3 lo = bounds.minPosition;
    const float3 hi = bounds.maxPosition;
    [unroll]
    for (int planeIndex = 0; planeIndex < 6; planeIndex++)
    {
        const float4 plane = planes[planeIndex];
        // Per axis, take the extreme that lies furthest along the plane normal. If even
        // this corner is behind the plane, the whole box is behind it.
        const float3 corner = float3(
            plane.x > 0 ? hi.x : lo.x,
            plane.y > 0 ? hi.y : lo.y,
            plane.z > 0 ? hi.z : lo.z);
        if (IsOutSidePlane(plane, corner))
        {
            return true;
        }
    }
    return false;
}

// Returns true when the position is on the negative side of the plane
// (plane.xyz = normal, plane.w = offset term of the plane equation).
bool IsOutSidePlane(float4 plane, float3 position)
{
    const float signedDistance = dot(plane.xyz, position) + plane.w;
    return signedDistance < 0;
}

来看下效果,4800+Pacth降低到了2800+

gpu测试渲染_java_06

gpu测试渲染_世界坐标_07

// Chooses the camera handed to the terrain draw call depending on the build target.
#if UNITY_EDITOR
                // Editor: draw with Camera.main, then invoke renderCallBack with the same command buffer.
                _terrainGPUDriven.Draw(cmd, Camera.main);
                renderCallBack(cmd);
#else
                // Outside the editor: draw with the camera supplied by the current rendering data.
                _terrainGPUDriven.Draw(cmd, renderingData.cameraData.camera);
#endif

顺便发现renderingData.cameraData.camera在Game和Scene视图下,会自动切换到对应的相机

Camera.main 只会获取到Game下的坐标

HizCulling

先看代码

// Hi-Z occlusion test. Returns true when the bounds are hidden behind the depth stored in
// the Hi-Z map at all four corners of their screen-space rectangle, i.e. the patch can be culled.
bool IsHizCulling(Bounds bounds)
{
    // Screen-space rectangle (xy) and depth range (z) of the bounds.
    Bounds boundsUVD = CalBoundUVD(bounds);
    // Footprint of the rectangle measured in mip 0 texels of the Hi-Z map.
    const float2 size = (boundsUVD.maxPosition.xy - boundsUVD.minPosition.xy) * _HizMapSize.xy;
    // Pick the mip in which the longer side of the footprint fits into about one texel.
    // The clamp is done in float before the conversion to uint: a footprint below one texel
    // (or a degenerate one) gives a negative or infinite log2, and converting such a value
    // to uint is not portable across graphics APIs.
    const float footprint = max(max(size.x, size.y), 1.0);
    const uint mip = (uint)clamp(ceil(log2(footprint)), 0.0, (float)_HizMapSize.z);
    const uint2 mipHizMapSize = (uint2)_HizMapSize.xy >> mip;
    // Sample the four corners of the rectangle in that mip.
    float d1 = SampleHizMap(boundsUVD.minPosition.xy, mip, mipHizMapSize);
    float d2 = SampleHizMap(boundsUVD.maxPosition.xy, mip, mipHizMapSize);
    float d3 = SampleHizMap(float2(boundsUVD.minPosition.x, boundsUVD.maxPosition.y), mip, mipHizMapSize);
    float d4 = SampleHizMap(float2(boundsUVD.maxPosition.x, boundsUVD.minPosition.y), mip, mipHizMapSize);

    #if _REVERSE_Z
    // Reversed Z: larger depth is closer, so the largest z of the bounds is compared.
    float depth = boundsUVD.maxPosition.z;
    return d1 > depth && d2 > depth && d3 > depth && d4 > depth;
    #else
    // Conventional Z: smaller depth is closer, so the smallest z of the bounds is compared.
    float depth = boundsUVD.minPosition.z;
    return d1 < depth && d2 < depth && d3 < depth && d4 < depth;
    #endif
}

CalBoundUVD就是求出NDC空间下包围盒坐标信息

怎么求NDC系坐标?

// Projects a position with _VPMatrix and returns (u, v, depth):
// xy are remapped from [-1, 1] to [0, 1], z is left as it is after the perspective divide.
inline float3 CalPointUVD(float3 pos)
{
    const float4 clip = mul(_VPMatrix, float4(pos, 1.0));
    // Perspective divide.
    const float3 ndc = clip.xyz / clip.w;
    return float3((ndc.xy + 1.0) * 0.5, ndc.z);
}

乘以VP矩阵,换算到投影坐标,除以w就是NDC坐标了范围[-1,1],再转换到[0,1]之间

乘以我们的Mip0_HizMapSize

log2一下就可以求出所需要采样的mip

为什么要log2?比如算出物体在Mip0大小为128,我们需要取得它在哪个mip下可以表示为1个像素

答案很显然,128是2的7次方,也就是log2(128)=7,所以取mip 7;我们最后在这个mip上采样一个2x2大小的深度算一下就行

为啥要2x2?因为我们算出来的是包围盒,一个小正方形

看下效果,2800+降低到2300+

gpu测试渲染_gpu测试渲染_08

再来个俯视图

gpu测试渲染_游戏_09

生成HizMap,我们放在RenderPassEvent.BeforeRenderingSkybox

这样就会记录Terrain和Opaque的深度信息

渲染Terrain我们在RenderPassEvent.AfterRenderingOpaques

但是当我们帧率比较低,还是会遇到剔除错误问题

gpu测试渲染_世界坐标_10

我的想法是Hiz剔除完后,把那些剔除了的保存到一个列表,生成HizMap后,再用当前HizMap检查下是否需要剔除,再渲染一次没被剔除的就行!不过我偷懒了,没实现!!!

FixSeam

缝隙是因为不同LOD的地形相邻产生的

我们上次生成的NodeSectorMap就派上用场了

每个像素代表LOD0下的一个Node

然后判断Patch相邻的边是否处于不同的LOD就行了

// Stores in patch._lodTrans the GetSectorLOD result of the neighbouring sector for every
// patch edge that lies on the border of its node (x: -x, y: -y, z: +x, w: +y).
// Edges inside the node keep 0.
inline void SetLODTrans(inout RenderPatch patch, uint2 nodeXY, uint2 patchXY, in NodeInfoStruct blockInfo)
{
    const uint lod = blockInfo.CurLOD;
    // A node of this LOD covers nodeScale x nodeScale sectors (LOD 0 nodes).
    const uint nodeScale = 1 << lod;
    const uint2 sectorMin = nodeXY * nodeScale;
    const uint2 sectorMax = sectorMin + nodeScale - 1;
    uint4 lodTrans = 0;
    // NOTE(review): for nodes on the terrain border the neighbour coordinate leaves the
    // sector map; this relies on GetSectorLOD handling out-of-range coordinates — confirm.
    if (patchXY.x == 0)
        lodTrans.x = GetSectorLOD(sectorMin + int2(-1, 0), lod);
    if (patchXY.y == 0)
        lodTrans.y = GetSectorLOD(sectorMin + int2(0, -1), lod);
    // patchXY is the index of the patch inside its node, so the last column/row is
    // "patches per node - 1". The grid-cell count of the patch mesh (_PerPacthGridNum)
    // only matches that by coincidence when both are 8.
    if (patchXY.x == _PerNodePacthNum - 1)
        lodTrans.z = GetSectorLOD(sectorMax + int2(1, 0), lod);
    if (patchXY.y == _PerNodePacthNum - 1)
        lodTrans.w = GetSectorLOD(sectorMax + int2(0, 1), lod);
    patch._lodTrans = lodTrans;
}

gpu测试渲染_gpu测试渲染_11

光照

用的Unity默认的TerrainLit,稍微改了下,去掉了不必要的计算,也没有用融合

// Trimmed-down terrain fragment shader: a fixed splat control of (1, 0, 0, 0) replaces the
// control-map blend, the mixed diffuse colour is lit with the PBR lighting function and
// the result is written fully opaque.
void SplatmapFragment2(
    Varyings IN
    , out half4 outColor : SV_Target0
)
{
    UNITY_SETUP_STEREO_EYE_INDEX_POST_VERTEX(IN);

    // Inputs and outputs of SplatmapMix.
    half4 control = half4(1, 0, 0, 0);
    half3 tangentNormal = 0;
    half blendWeight;
    half4 diffuse;
    half4 smoothnessDefaults;
    SplatmapMix(IN.uvMainAndLM, IN.uvSplat01, IN.uvSplat23, control, blendWeight, diffuse,
                smoothnessDefaults, tangentNormal);

    InputData lighting;
    InitializeInputData(IN, tangentNormal, lighting);

    // Only the albedo comes from the splat mix; the remaining surface inputs are constants.
    half4 lit = UniversalFragmentPBR(lighting, diffuse.rgb, 0, 0, .5, 1, 0, 1);
    SplatmapFinalColor(lit, lighting.fogCoord);

    outColor = half4(lit.rgb, 1.0h);
}

可以看下和unity默认Terrain的对比,Batches=24是我们的地形

CPU端肯定没提升,我们计算都在GPU的,提升0.1-0.2毫秒左右

为啥看起来不一样?我用的m1MBP,无法使用Geometry Shaders,没办法重新算一次法线

Demo用的Terrain法线是切线空间法线

大家运行的时候把// #pragma geometry geom2打开就行,效果应该一致

gpu测试渲染_java_12

下面是我们自定义的Terrain Shader

// Terrain vertex shader: places the shared patch mesh for one instance, displaces it with
// the height map and fills the interpolators used by the lighting in the fragment stage.
v2f vert(appdata v, uint instanceID : SV_InstanceID)
{
    v2f output;
    RenderPatch patch = _BlockPatchList[instanceID];
    // patch._wpos.z carries the LOD of the patch.
    const NodeInfoStruct lodInfo = _NodeStructs[patch._wpos.z];
    #if _FIX_LOD_SEAM
    FixLODConnectSeam(v.positionOS, patch);
    #endif

    // Scale the mesh in XZ for this LOD and move it to the patch position.
    const float3 meshScale = float3(lodInfo.VertexScale, 1, lodInfo.VertexScale);
    const float3 patchOrigin = float3(patch._wpos.x, 0, patch._wpos.y);
    #if _DEBUG_PATCH
    // Debug view: shrink every patch a little so the patch borders become visible.
    float3 worldPos = v.positionOS * 0.95 * meshScale + patchOrigin;
    #else
    float3 worldPos = v.positionOS * meshScale + patchOrigin;
    #endif
    // Height is read from the height map at the world XZ texel and scaled to world units.
    worldPos.y = _HeightMapRT.Load(float3(worldPos.xz, 0)).y * _max_height;

    // Texture coordinates repeat every 1024 world units.
    output.uv = worldPos.xz % 1024 / 1024.0;
    output.positionWS = worldPos;
    output.clipPos = TransformWorldToHClip(worldPos);
    output.fogFactor = ComputeFogFactor(output.clipPos.z);

    // The view direction is packed into the w components of normal/tangent/bitangent.
    half3 viewDir = GetWorldSpaceNormalizeViewDir(worldPos);
    float4 tangentOS = float4(cross(float3(0, 0, 1), v.normalOS), 1.0);
    VertexNormalInputs basis = GetVertexNormalInputs(v.normalOS, tangentOS);
    output.normal = half4(basis.normalWS, viewDir.x);
    output.tangent = half4(basis.tangentWS, viewDir.y);
    output.bitangent = half4(basis.bitangentWS, viewDir.z);

    #if _DEBUG_LOD
    output.color = _debugColor[patch._wpos.z];
    #endif
    #if _DEBUG_MIP
    output.color = GetMipColor(patch._wpos.w);
    #endif
    return output;
}

// Terrain fragment shader: samples albedo and a tangent-space normal map, lights the
// surface with the PBR lighting function, applies fog and returns an opaque colour.
half4 frag2(v2f i) : SV_Target
{
    real3 albedo = SAMPLE_TEXTURE2D(_MainTex, samplerLinearClamp, i.uv);
    half3 unpackedNormal = UnpackNormal(SAMPLE_TEXTURE2D(_NormapTex, samplerNormalLinearClamp, i.uv));
    half3 normalTS = normalize(unpackedNormal.xyz);

    InputData inputData = (InputData)0;
    inputData.positionWS = i.positionWS;
    inputData.positionCS = i.clipPos;
    // The view direction was packed into the w components by the vertex shader.
    inputData.viewDirectionWS = half3(i.normal.w, i.tangent.w, i.bitangent.w);
    inputData.fogCoord = InitializeInputDataFog(float4(i.positionWS, 1.0), i.fogFactor);
    inputData.normalizedScreenSpaceUV = GetNormalizedScreenSpaceUV(i.clipPos);

    // Bring the tangent-space normal into world space (the tangent is negated here).
    inputData.tangentToWorld = half3x3(-i.tangent.xyz, i.bitangent.xyz, i.normal.xyz);
    inputData.normalWS = NormalizeNormalPerPixel(TransformTangentToWorld(normalTS, inputData.tangentToWorld));

    half4 lit = UniversalFragmentPBR(inputData, albedo, 0, 0, .5, 1, 0, 1);
    lit.rgb *= lit.a;
    lit.rgb = MixFog(lit.rgb, inputData.fogCoord);
    #if _DEBUG_COLOR
    // Debug view: show the per-vertex debug colour instead of the lit result.
    lit.rgb = i.color;
    #endif
    return half4(lit.rgb, 1.0);
}

源码地址

下篇RVT

https://zhuanlan.zhihu.com/p/648843014

https://zhuanlan.zhihu.com/p/388844386