057367bf4f
Introduces support for FSR2 as a new upscaler option available from the project settings. Also introduces a specific render list for surfaces that require motion, and the ability to derive motion vectors from the depth buffer and camera motion.
// This file is part of the FidelityFX SDK.
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifdef FFX_CPU
FFX_STATIC void SpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
                         FfxUInt32x2 workGroupOffset, // GPU side: pass in as constant
                         FfxUInt32x2 numWorkGroupsAndMips, // GPU side: pass in as constant
                         FfxUInt32x4 rectInfo, // left, top, width, height
                         FfxInt32 mips) // optional: if -1, calculate based on rect width and height
{
    workGroupOffset[0] = rectInfo[0] / 64; // rectInfo[0] = left
    workGroupOffset[1] = rectInfo[1] / 64; // rectInfo[1] = top

    FfxUInt32 endIndexX = (rectInfo[0] + rectInfo[2] - 1) / 64; // rectInfo[0] = left, rectInfo[2] = width
    FfxUInt32 endIndexY = (rectInfo[1] + rectInfo[3] - 1) / 64; // rectInfo[1] = top, rectInfo[3] = height

    dispatchThreadGroupCountXY[0] = endIndexX + 1 - workGroupOffset[0];
    dispatchThreadGroupCountXY[1] = endIndexY + 1 - workGroupOffset[1];

    numWorkGroupsAndMips[0] = (dispatchThreadGroupCountXY[0]) * (dispatchThreadGroupCountXY[1]);

    if (mips >= 0)
    {
        numWorkGroupsAndMips[1] = FfxUInt32(mips);
    }
    else
    {
        // calculate based on rect width and height
        FfxUInt32 resolution = ffxMax(rectInfo[2], rectInfo[3]);
        numWorkGroupsAndMips[1] = FfxUInt32((ffxMin(floor(log2(FfxFloat32(resolution))), FfxFloat32(12))));
    }
}

FFX_STATIC void SpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy
                         FfxUInt32x2 workGroupOffset, // GPU side: pass in as constant
                         FfxUInt32x2 numWorkGroupsAndMips, // GPU side: pass in as constant
                         FfxUInt32x4 rectInfo) // left, top, width, height
{
    SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, -1);
}
#endif // #ifdef FFX_CPU
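
// A minimal CPU-side usage sketch, assuming the CPU-side FfxUInt32x2/FfxUInt32x4 types are
// array typedefs (so the first three parameters act as outputs); `textureWidth`,
// `textureHeight` and the constant upload are hypothetical, not part of this header:
/*
    FfxUInt32x2 dispatchThreadGroupCountXY;
    FfxUInt32x2 workGroupOffset;       // upload as shader constant
    FfxUInt32x2 numWorkGroupsAndMips;  // upload as shader constants (work group count, mip count)
    FfxUInt32x4 rectInfo = { 0, 0, textureWidth, textureHeight }; // left, top, width, height
    SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo);

    // ... then dispatch dispatchThreadGroupCountXY[0] x dispatchThreadGroupCountXY[1] x sliceCount
    // thread groups of 256 threads each.
*/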

//==============================================================================================================================
// NON-PACKED VERSION
//==============================================================================================================================
#ifdef FFX_GPU
#ifdef SPD_PACKED_ONLY
// Avoid compiler error
FfxFloat32x4 SpdLoadSourceImage(FfxInt32x2 p, FfxUInt32 slice)
{
    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}

FfxFloat32x4 SpdLoad(FfxInt32x2 p, FfxUInt32 slice)
{
    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}

void SpdStore(FfxInt32x2 p, FfxFloat32x4 value, FfxUInt32 mip, FfxUInt32 slice)
{
}

FfxFloat32x4 SpdLoadIntermediate(FfxUInt32 x, FfxUInt32 y)
{
    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}

void SpdStoreIntermediate(FfxUInt32 x, FfxUInt32 y, FfxFloat32x4 value)
{
}

FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3)
{
    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
}
#endif // #ifdef SPD_PACKED_ONLY
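
// Outside of SPD_PACKED_ONLY builds, the callbacks stubbed above are user-provided.
// A minimal GLSL-style sketch, assuming hypothetical bindings imgSrc (source image),
// imgDst[] (per-mip storage images), a separate coherent imgDst5 for mip 5, and a
// 16x16 groupshared scratch array; none of these names are part of this header:
/*
    shared FfxFloat32x4 spdIntermediate[16][16];

    FfxFloat32x4 SpdLoadSourceImage(FfxInt32x2 p, FfxUInt32 slice) { return imageLoad(imgSrc, FfxInt32x3(p, FfxInt32(slice))); }
    // Mips 7+ read back the mip-5 output, so that image must be declared coherent:
    FfxFloat32x4 SpdLoad(FfxInt32x2 p, FfxUInt32 slice) { return imageLoad(imgDst5, FfxInt32x3(p, FfxInt32(slice))); }
    void SpdStore(FfxInt32x2 p, FfxFloat32x4 value, FfxUInt32 mip, FfxUInt32 slice)
    {
        if (mip == 5)
            imageStore(imgDst5, FfxInt32x3(p, FfxInt32(slice)), value);
        else
            imageStore(imgDst[mip], FfxInt32x3(p, FfxInt32(slice)), value);
    }
    FfxFloat32x4 SpdLoadIntermediate(FfxUInt32 x, FfxUInt32 y) { return spdIntermediate[x][y]; }
    void SpdStoreIntermediate(FfxUInt32 x, FfxUInt32 y, FfxFloat32x4 value) { spdIntermediate[x][y] = value; }
*/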

//_____________________________________________________________/\_______________________________________________________________
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
#extension GL_KHR_shader_subgroup_quad : require
#endif

void SpdWorkgroupShuffleBarrier()
{
#ifdef FFX_GLSL
    barrier();
#endif
#ifdef FFX_HLSL
    GroupMemoryBarrierWithGroupSync();
#endif
}

// Only the last active workgroup should proceed
bool SpdExitWorkgroup(FfxUInt32 numWorkGroups, FfxUInt32 localInvocationIndex, FfxUInt32 slice)
{
    // global atomic counter
    if (localInvocationIndex == 0)
    {
        SpdIncreaseAtomicCounter(slice);
    }

    SpdWorkgroupShuffleBarrier();
    return (SpdGetAtomicCounter() != (numWorkGroups - 1));
}
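
// The atomic counter callbacks (SpdIncreaseAtomicCounter, SpdGetAtomicCounter,
// SpdResetAtomicCounter) are user-provided. A minimal GLSL-style sketch, assuming a
// hypothetical coherent storage buffer with one counter per slice; atomicAdd returns the
// pre-increment value, so the last workgroup to arrive sees numWorkGroups - 1:
/*
    layout(std430, binding = 2) coherent buffer SpdGlobalAtomicBuffer { FfxUInt32 counter[]; } spdGlobalAtomic;
    shared FfxUInt32 spdCounter;

    void SpdIncreaseAtomicCounter(FfxUInt32 slice) { spdCounter = atomicAdd(spdGlobalAtomic.counter[slice], 1); }
    FfxUInt32 SpdGetAtomicCounter() { return spdCounter; }
    void SpdResetAtomicCounter(FfxUInt32 slice) { spdGlobalAtomic.counter[slice] = 0; }
*/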

// User defined: FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3);
FfxFloat32x4 SpdReduceQuad(FfxFloat32x4 v)
{
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)

    FfxFloat32x4 v0 = v;
    FfxFloat32x4 v1 = subgroupQuadSwapHorizontal(v);
    FfxFloat32x4 v2 = subgroupQuadSwapVertical(v);
    FfxFloat32x4 v3 = subgroupQuadSwapDiagonal(v);
    return SpdReduce4(v0, v1, v2, v3);

#elif defined(FFX_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)

    // requires SM6.0
    FfxUInt32 quad = WaveGetLaneIndex() & (~0x3);
    FfxFloat32x4 v0 = v;
    FfxFloat32x4 v1 = WaveReadLaneAt(v, quad | 1);
    FfxFloat32x4 v2 = WaveReadLaneAt(v, quad | 2);
    FfxFloat32x4 v3 = WaveReadLaneAt(v, quad | 3);
    return SpdReduce4(v0, v1, v2, v3);
    /*
    // if SM6.0 is not available, you can use the AMD shader intrinsics;
    // the AMD shader intrinsics are available in the AMD GPU Services (AGS) library:
    // https://gpuopen.com/amd-gpu-services-ags-library/
    // works for DX11
    FfxFloat32x4 v0 = v;
    FfxFloat32x4 v1;
    v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    FfxFloat32x4 v2;
    v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    FfxFloat32x4 v3;
    v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    return SpdReduce4(v0, v1, v2, v3);
    */
#endif
    return v;
}

FfxFloat32x4 SpdReduceIntermediate(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3)
{
    FfxFloat32x4 v0 = SpdLoadIntermediate(i0.x, i0.y);
    FfxFloat32x4 v1 = SpdLoadIntermediate(i1.x, i1.y);
    FfxFloat32x4 v2 = SpdLoadIntermediate(i2.x, i2.y);
    FfxFloat32x4 v3 = SpdLoadIntermediate(i3.x, i3.y);
    return SpdReduce4(v0, v1, v2, v3);
}

FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
    FfxFloat32x4 v0 = SpdLoad(FfxInt32x2(i0), slice);
    FfxFloat32x4 v1 = SpdLoad(FfxInt32x2(i1), slice);
    FfxFloat32x4 v2 = SpdLoad(FfxInt32x2(i2), slice);
    FfxFloat32x4 v3 = SpdLoad(FfxInt32x2(i3), slice);
    return SpdReduce4(v0, v1, v2, v3);
}

FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 base, FfxUInt32 slice)
{
    return SpdReduceLoad4(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
}

FfxFloat32x4 SpdReduceLoadSourceImage4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
    FfxFloat32x4 v0 = SpdLoadSourceImage(FfxInt32x2(i0), slice);
    FfxFloat32x4 v1 = SpdLoadSourceImage(FfxInt32x2(i1), slice);
    FfxFloat32x4 v2 = SpdLoadSourceImage(FfxInt32x2(i2), slice);
    FfxFloat32x4 v3 = SpdLoadSourceImage(FfxInt32x2(i3), slice);
    return SpdReduce4(v0, v1, v2, v3);
}

FfxFloat32x4 SpdReduceLoadSourceImage(FfxUInt32x2 base, FfxUInt32 slice)
{
#ifdef SPD_LINEAR_SAMPLER
    return SpdLoadSourceImage(FfxInt32x2(base), slice);
#else
    return SpdReduceLoadSourceImage4(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
#endif
}
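
// The reduction itself is user-defined. A minimal sketch of a 2x2 average reduction, plus a
// source load for the SPD_LINEAR_SAMPLER path that does the first 2x2 reduction in hardware
// by sampling at the corner shared by the four texels (srcSampler and invInputSize are
// hypothetical user bindings/constants):
/*
    FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3)
    {
        return (v0 + v1 + v2 + v3) * 0.25;
    }

    FfxFloat32x4 SpdLoadSourceImage(FfxInt32x2 p, FfxUInt32 slice)
    {
        FfxFloat32x2 uv = (FfxFloat32x2(p) + 1.0) * invInputSize; // midpoint of the 2x2 quad
        return textureLod(srcSampler, FfxFloat32x3(uv, FfxFloat32(slice)), 0.0);
    }
*/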

void SpdDownsampleMips_0_1_Intrinsics(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
    FfxFloat32x4 v[4];

    FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
    FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
    v[0] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[0], 0, slice);

    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
    v[1] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[1], 0, slice);

    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
    v[2] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[2], 0, slice);

    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[3], 0, slice);

    if (mips <= 1)
        return;

    v[0] = SpdReduceQuad(v[0]);
    v[1] = SpdReduceQuad(v[1]);
    v[2] = SpdReduceQuad(v[2]);
    v[3] = SpdReduceQuad(v[3]);

    if ((localInvocationIndex % 4) == 0)
    {
        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice);
        SpdStoreIntermediate(x / 2, y / 2, v[0]);

        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice);
        SpdStoreIntermediate(x / 2 + 8, y / 2, v[1]);

        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice);
        SpdStoreIntermediate(x / 2, y / 2 + 8, v[2]);

        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
        SpdStoreIntermediate(x / 2 + 8, y / 2 + 8, v[3]);
    }
}

void SpdDownsampleMips_0_1_LDS(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
    FfxFloat32x4 v[4];

    FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
    FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
    v[0] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[0], 0, slice);

    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
    v[1] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[1], 0, slice);

    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
    v[2] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[2], 0, slice);

    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImage(tex, slice);
    SpdStore(pix, v[3], 0, slice);

    if (mips <= 1)
        return;

    for (FfxUInt32 i = 0; i < 4; i++)
    {
        SpdStoreIntermediate(x, y, v[i]);
        SpdWorkgroupShuffleBarrier();
        if (localInvocationIndex < 64)
        {
            v[i] = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
            SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
        }
        SpdWorkgroupShuffleBarrier();
    }

    if (localInvocationIndex < 64)
    {
        SpdStoreIntermediate(x + 0, y + 0, v[0]);
        SpdStoreIntermediate(x + 8, y + 0, v[1]);
        SpdStoreIntermediate(x + 0, y + 8, v[2]);
        SpdStoreIntermediate(x + 8, y + 8, v[3]);
    }
}

void SpdDownsampleMips_0_1(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mips, slice);
#else
    SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mips, slice);
#endif
}

void SpdDownsampleMip_2(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 64)
    {
        FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
        SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS, try to reduce bank conflicts
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // ...
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        SpdStoreIntermediate(x * 2 + y % 2, y * 2, v);
    }
#else
    FfxFloat32x4 v = SpdLoadIntermediate(x, y);
    v = SpdReduceQuad(v);
    // quad index 0 stores result
    if (localInvocationIndex % 4 == 0)
    {
        SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
        SpdStoreIntermediate(x + (y / 2) % 2, y, v);
    }
#endif
}

void SpdDownsampleMip_3(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 16)
    {
        // x 0 x 0
        // 0 0 0 0
        // 0 x 0 x
        // 0 0 0 0
        FfxFloat32x4 v =
            SpdReduceIntermediate(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2));
        SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS
        // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
        // ...
        // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
        // ...
        // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
        // ...
        SpdStoreIntermediate(x * 4 + y, y * 4, v);
    }
#else
    if (localInvocationIndex < 64)
    {
        FfxFloat32x4 v = SpdLoadIntermediate(x * 2 + y % 2, y * 2);
        v = SpdReduceQuad(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
            SpdStoreIntermediate(x * 2 + y / 2, y * 2, v);
        }
    }
#endif
}

void SpdDownsampleMip_4(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 4)
    {
        // x 0 0 0 x 0 0 0
        // ...
        // 0 x 0 0 0 x 0 0
        FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
                                               FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
                                               FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
                                               FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
        SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS
        // x x x x 0 ...
        // 0 ...
        SpdStoreIntermediate(x + y * 2, 0, v);
    }
#else
    if (localInvocationIndex < 16)
    {
        FfxFloat32x4 v = SpdLoadIntermediate(x * 4 + y, y * 4);
        v = SpdReduceQuad(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
            SpdStoreIntermediate(x / 2 + y, 0, v);
        }
    }
#endif
}

void SpdDownsampleMip_5(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 1)
    {
        // x x x x 0 ...
        // 0 ...
        FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0));
        SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice);
    }
#else
    if (localInvocationIndex < 4)
    {
        FfxFloat32x4 v = SpdLoadIntermediate(localInvocationIndex, 0);
        v = SpdReduceQuad(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice);
        }
    }
#endif
}

void SpdDownsampleMips_6_7(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice)
{
    FfxInt32x2 tex = FfxInt32x2(x * 4 + 0, y * 4 + 0);
    FfxInt32x2 pix = FfxInt32x2(x * 2 + 0, y * 2 + 0);
    FfxFloat32x4 v0 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v0, 6, slice);

    tex = FfxInt32x2(x * 4 + 2, y * 4 + 0);
    pix = FfxInt32x2(x * 2 + 1, y * 2 + 0);
    FfxFloat32x4 v1 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v1, 6, slice);

    tex = FfxInt32x2(x * 4 + 0, y * 4 + 2);
    pix = FfxInt32x2(x * 2 + 0, y * 2 + 1);
    FfxFloat32x4 v2 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v2, 6, slice);

    tex = FfxInt32x2(x * 4 + 2, y * 4 + 2);
    pix = FfxInt32x2(x * 2 + 1, y * 2 + 1);
    FfxFloat32x4 v3 = SpdReduceLoad4(tex, slice);
    SpdStore(pix, v3, 6, slice);

    if (mips <= 7)
        return;
    // no barrier needed, working on values only from the same thread

    FfxFloat32x4 v = SpdReduce4(v0, v1, v2, v3);
    SpdStore(FfxInt32x2(x, y), v, 7, slice);
    SpdStoreIntermediate(x, y, v);
}

void SpdDownsampleNextFour(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice)
{
    if (mips <= baseMip)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip, slice);

    if (mips <= baseMip + 1)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);

    if (mips <= baseMip + 2)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);

    if (mips <= baseMip + 3)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_5(workGroupID, localInvocationIndex, baseMip + 3, slice);
}

void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice)
{
    FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64);
    FfxUInt32 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
    FfxUInt32 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));
    SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips, slice);

    SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2, mips, slice);

    if (mips <= 6)
        return;

    if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
        return;

    SpdResetAtomicCounter(slice);

    // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
    SpdDownsampleMips_6_7(x, y, mips, slice);

    SpdDownsampleNextFour(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice);
}

void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset)
{
    SpdDownsample(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
}
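
// A minimal sketch of a compute entry point driving the downsampler (HLSL-style; the
// constants mips, numWorkGroups and workGroupOffset come from the SpdSetup values and are
// assumptions, not part of this header). One thread group covers a 64x64 source tile and
// SV_GroupID.z selects the array slice:
/*
    [numthreads(256, 1, 1)]
    void main(uint3 WorkGroupId : SV_GroupID, uint LocalThreadIndex : SV_GroupIndex)
    {
        SpdDownsample(FfxUInt32x2(WorkGroupId.xy), FfxUInt32(LocalThreadIndex),
                      FfxUInt32(mips), FfxUInt32(numWorkGroups), FfxUInt32(WorkGroupId.z),
                      FfxUInt32x2(workGroupOffset));
    }
*/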

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

//==============================================================================================================================
// PACKED VERSION
//==============================================================================================================================

#if FFX_HALF

#ifdef FFX_GLSL
#extension GL_EXT_shader_subgroup_extended_types_float16 : require
#endif

FfxFloat16x4 SpdReduceQuadH(FfxFloat16x4 v)
{
#if defined(FFX_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
    FfxFloat16x4 v0 = v;
    FfxFloat16x4 v1 = subgroupQuadSwapHorizontal(v);
    FfxFloat16x4 v2 = subgroupQuadSwapVertical(v);
    FfxFloat16x4 v3 = subgroupQuadSwapDiagonal(v);
    return SpdReduce4H(v0, v1, v2, v3);
#elif defined(FFX_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS)
    // requires SM6.0
    FfxUInt32 quad = WaveGetLaneIndex() & (~0x3);
    FfxFloat16x4 v0 = v;
    FfxFloat16x4 v1 = WaveReadLaneAt(v, quad | 1);
    FfxFloat16x4 v2 = WaveReadLaneAt(v, quad | 2);
    FfxFloat16x4 v3 = WaveReadLaneAt(v, quad | 3);
    return SpdReduce4H(v0, v1, v2, v3);
    /*
    // if SM6.0 is not available, you can use the AMD shader intrinsics;
    // the AMD shader intrinsics are available in the AMD GPU Services (AGS) library:
    // https://gpuopen.com/amd-gpu-services-ags-library/
    // works for DX11
    FfxFloat16x4 v0 = v;
    FfxFloat16x4 v1;
    v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
    FfxFloat16x4 v2;
    v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
    FfxFloat16x4 v3;
    v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
    return SpdReduce4H(v0, v1, v2, v3);
    */
#endif
    return FfxFloat16x4(0.0, 0.0, 0.0, 0.0);
}

FfxFloat16x4 SpdReduceIntermediateH(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3)
{
    FfxFloat16x4 v0 = SpdLoadIntermediateH(i0.x, i0.y);
    FfxFloat16x4 v1 = SpdLoadIntermediateH(i1.x, i1.y);
    FfxFloat16x4 v2 = SpdLoadIntermediateH(i2.x, i2.y);
    FfxFloat16x4 v3 = SpdLoadIntermediateH(i3.x, i3.y);
    return SpdReduce4H(v0, v1, v2, v3);
}

FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
    FfxFloat16x4 v0 = SpdLoadH(FfxInt32x2(i0), slice);
    FfxFloat16x4 v1 = SpdLoadH(FfxInt32x2(i1), slice);
    FfxFloat16x4 v2 = SpdLoadH(FfxInt32x2(i2), slice);
    FfxFloat16x4 v3 = SpdLoadH(FfxInt32x2(i3), slice);
    return SpdReduce4H(v0, v1, v2, v3);
}

FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 base, FfxUInt32 slice)
{
    return SpdReduceLoad4H(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
}

FfxFloat16x4 SpdReduceLoadSourceImage4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
{
    FfxFloat16x4 v0 = SpdLoadSourceImageH(FfxInt32x2(i0), slice);
    FfxFloat16x4 v1 = SpdLoadSourceImageH(FfxInt32x2(i1), slice);
    FfxFloat16x4 v2 = SpdLoadSourceImageH(FfxInt32x2(i2), slice);
    FfxFloat16x4 v3 = SpdLoadSourceImageH(FfxInt32x2(i3), slice);
    return SpdReduce4H(v0, v1, v2, v3);
}

FfxFloat16x4 SpdReduceLoadSourceImageH(FfxUInt32x2 base, FfxUInt32 slice)
{
#ifdef SPD_LINEAR_SAMPLER
    return SpdLoadSourceImageH(FfxInt32x2(base), slice);
#else
    return SpdReduceLoadSourceImage4H(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
#endif
}

void SpdDownsampleMips_0_1_IntrinsicsH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
    FfxFloat16x4 v[4];

    FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
    FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
    v[0] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[0], 0, slice);

    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
    v[1] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[1], 0, slice);

    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
    v[2] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[2], 0, slice);

    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[3], 0, slice);

    if (mips <= 1)
        return;

    v[0] = SpdReduceQuadH(v[0]);
    v[1] = SpdReduceQuadH(v[1]);
    v[2] = SpdReduceQuadH(v[2]);
    v[3] = SpdReduceQuadH(v[3]);

    if ((localInvocationIndex % 4) == 0)
    {
        SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice);
        SpdStoreIntermediateH(x / 2, y / 2, v[0]);

        SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice);
        SpdStoreIntermediateH(x / 2 + 8, y / 2, v[1]);

        SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice);
        SpdStoreIntermediateH(x / 2, y / 2 + 8, v[2]);

        SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
        SpdStoreIntermediateH(x / 2 + 8, y / 2 + 8, v[3]);
    }
}

void SpdDownsampleMips_0_1_LDSH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
    FfxFloat16x4 v[4];

    FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
    FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
    v[0] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[0], 0, slice);

    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
    v[1] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[1], 0, slice);

    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
    v[2] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[2], 0, slice);

    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
    v[3] = SpdReduceLoadSourceImageH(tex, slice);
    SpdStoreH(pix, v[3], 0, slice);

    if (mips <= 1)
        return;

    for (FfxInt32 i = 0; i < 4; i++)
    {
        SpdStoreIntermediateH(x, y, v[i]);
        SpdWorkgroupShuffleBarrier();
        if (localInvocationIndex < 64)
        {
            v[i] = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
            SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
        }
        SpdWorkgroupShuffleBarrier();
    }

    if (localInvocationIndex < 64)
    {
        SpdStoreIntermediateH(x + 0, y + 0, v[0]);
        SpdStoreIntermediateH(x + 8, y + 0, v[1]);
        SpdStoreIntermediateH(x + 0, y + 8, v[2]);
        SpdStoreIntermediateH(x + 8, y + 8, v[3]);
    }
}

void SpdDownsampleMips_0_1H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips, slice);
#else
    SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips, slice);
#endif
}

void SpdDownsampleMip_2H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 64)
    {
        FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
        SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS, try to reduce bank conflicts
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        // ...
        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
        SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v);
    }
#else
    FfxFloat16x4 v = SpdLoadIntermediateH(x, y);
    v = SpdReduceQuadH(v);
    // quad index 0 stores result
    if (localInvocationIndex % 4 == 0)
    {
        SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
        SpdStoreIntermediateH(x + (y / 2) % 2, y, v);
    }
#endif
}

void SpdDownsampleMip_3H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 16)
    {
        // x 0 x 0
        // 0 0 0 0
        // 0 x 0 x
        // 0 0 0 0
        FfxFloat16x4 v =
            SpdReduceIntermediateH(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2));
        SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS
        // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
        // ...
        // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
        // ...
        // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
        // ...
        SpdStoreIntermediateH(x * 4 + y, y * 4, v);
    }
#else
    if (localInvocationIndex < 64)
    {
        FfxFloat16x4 v = SpdLoadIntermediateH(x * 2 + y % 2, y * 2);
        v = SpdReduceQuadH(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
            SpdStoreIntermediateH(x * 2 + y / 2, y * 2, v);
        }
    }
#endif
}

void SpdDownsampleMip_4H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 4)
    {
        // x 0 0 0 x 0 0 0
        // ...
        // 0 x 0 0 0 x 0 0
        FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
                                                FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
                                                FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
                                                FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
        SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice);
        // store to LDS
        // x x x x 0 ...
        // 0 ...
        SpdStoreIntermediateH(x + y * 2, 0, v);
    }
#else
    if (localInvocationIndex < 16)
    {
        FfxFloat16x4 v = SpdLoadIntermediateH(x * 4 + y, y * 4);
        v = SpdReduceQuadH(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
            SpdStoreIntermediateH(x / 2 + y, 0, v);
        }
    }
#endif
}

void SpdDownsampleMip_5H(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
{
#ifdef SPD_NO_WAVE_OPERATIONS
    if (localInvocationIndex < 1)
    {
        // x x x x 0 ...
        // 0 ...
        FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0));
        SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice);
    }
#else
    if (localInvocationIndex < 4)
    {
        FfxFloat16x4 v = SpdLoadIntermediateH(localInvocationIndex, 0);
        v = SpdReduceQuadH(v);
        // quad index 0 stores result
        if (localInvocationIndex % 4 == 0)
        {
            SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice);
        }
    }
#endif
}

void SpdDownsampleMips_6_7H(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice)
{
    FfxInt32x2 tex = FfxInt32x2(x * 4 + 0, y * 4 + 0);
    FfxInt32x2 pix = FfxInt32x2(x * 2 + 0, y * 2 + 0);
    FfxFloat16x4 v0 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v0, 6, slice);

    tex = FfxInt32x2(x * 4 + 2, y * 4 + 0);
    pix = FfxInt32x2(x * 2 + 1, y * 2 + 0);
    FfxFloat16x4 v1 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v1, 6, slice);

    tex = FfxInt32x2(x * 4 + 0, y * 4 + 2);
    pix = FfxInt32x2(x * 2 + 0, y * 2 + 1);
    FfxFloat16x4 v2 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v2, 6, slice);

    tex = FfxInt32x2(x * 4 + 2, y * 4 + 2);
    pix = FfxInt32x2(x * 2 + 1, y * 2 + 1);
    FfxFloat16x4 v3 = SpdReduceLoad4H(tex, slice);
    SpdStoreH(pix, v3, 6, slice);

    if (mips < 8)
        return;
    // no barrier needed, working on values only from the same thread

    FfxFloat16x4 v = SpdReduce4H(v0, v1, v2, v3);
    SpdStoreH(FfxInt32x2(x, y), v, 7, slice);
    SpdStoreIntermediateH(x, y, v);
}

void SpdDownsampleNextFourH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice)
{
    if (mips <= baseMip)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip, slice);

    if (mips <= baseMip + 1)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);

    if (mips <= baseMip + 2)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);

    if (mips <= baseMip + 3)
        return;
    SpdWorkgroupShuffleBarrier();
    SpdDownsampleMip_5H(workGroupID, localInvocationIndex, baseMip + 3, slice);
}

void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice)
{
    FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64);
    FfxUInt32 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
    FfxUInt32 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));

    SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips, slice);

    SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips, slice);

    if (mips < 7)
        return;

    if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
        return;

    SpdResetAtomicCounter(slice);

    // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
    SpdDownsampleMips_6_7H(x, y, mips, slice);

    SpdDownsampleNextFourH(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice);
}

void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset)
{
    SpdDownsampleH(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
}

#endif // #if FFX_HALF
#endif // #ifdef FFX_GPU