diff --git a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl index cc22595444..ab7a87c7dd 100644 --- a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl @@ -69,7 +69,7 @@ NBL_CONCEPT_END( #include template -NBL_BOOL_CONCEPT GenericDataAccessor = GenericWriteAccessor && GenericWriteAccessor; +NBL_BOOL_CONCEPT GenericDataAccessor = GenericReadAccessor && GenericWriteAccessor; } } diff --git a/include/nbl/builtin/hlsl/sampling/hierarchical_image.hlsl b/include/nbl/builtin/hlsl/sampling/hierarchical_image.hlsl new file mode 100644 index 0000000000..8fc1d1452a --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/hierarchical_image.hlsl @@ -0,0 +1,237 @@ +// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_SAMPLING_HIERARCHICAL_IMAGE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SAMPLING_HIERARCHICAL_IMAGE_INCLUDED_ + +#include +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace sampling +{ + +// TODO: Implement corner sampling or centered sampling based on the type of LuminanceAccessor +template + ) +struct HierarchicalLuminanceSampler +{ + using this_type = HierarchicalLuminanceSampler; + using scalar_type = typename LuminanceAccessorT::value_type; + using vector2_type = vector; + using vector4_type = vector; + using domain_type = vector2_type; + using codomain_type = vector2_type; + using weight_type = scalar_type; + using density_type = scalar_type; + struct cache_type + { + scalar_type rcpPmf; + }; + + LuminanceAccessorT _map; + uint16_t2 _lastTexel; + uint16_t _lastMipLevel : 15; + uint16_t _aspect2x1 : 1; + + static this_type create(NBL_CONST_REF_ARG(LuminanceAccessorT) lumaMap) + { + this_type 
result; + result._map = lumaMap; + const uint16_t2 mapSize = lumaMap.resolution(); + result._lastTexel = mapSize - uint16_t2(1, 1); + // Note: We use mapSize.y here because currently the map aspect ratio can only be 1x1 or 2x1 + result._lastMipLevel = _static_cast(findMSB(_static_cast(mapSize.y))); + result._aspect2x1 = mapSize.x != mapSize.y; + return result; + } + + static bool __choseSecond(scalar_type first, scalar_type second, NBL_REF_ARG(scalar_type) xi, NBL_REF_ARG(scalar_type) rcpPmf) + { + // numerical resilience against IEEE754 + scalar_type rcpChoiceProb = scalar_type(0); + PartitionRandVariable partition; + partition.leftProb = scalar_type(1) / (scalar_type(1) + (second / first)); + bool choseSecond = partition(xi, rcpChoiceProb); + rcpPmf *= rcpChoiceProb; + return choseSecond; + } + + // Cannot use textureGather since we need to pass the mipLevel + vector4_type __texelGather(uint16_t2 coord, uint16_t level) NBL_CONST_MEMBER_FUNC + { + assert(coord.x < _lastTexel.x && coord.y < _lastTexel.y); + scalar_type p0, p1, p2, p3; + _map.get(p0, coord + uint16_t2(0, 1), level); + _map.get(p1, coord + uint16_t2(1, 1), level); + _map.get(p2, coord + uint16_t2(1, 0), level); + _map.get(p3, coord + uint16_t2(0, 0), level); + return vector4_type(p0, p1, p2, p3); + } + + codomain_type generate(const domain_type v, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + uint16_t2 p = uint16_t2(0, 0); + + domain_type xi = v; + scalar_type rcpPmf = 1; + if (_aspect2x1) { + scalar_type p0, p1; + // do one split in the X axis first cause penultimate full mip would have been 2x1 + _map.get(p0, uint16_t2(0, 0), _lastMipLevel); + _map.get(p1, uint16_t2(1, 0), _lastMipLevel); + p.x = __choseSecond(p0, p1, xi.x, rcpPmf) ? 
1 : 0; + } + + for (int i = _lastMipLevel - 1; i >= 0; i--) + { + p <<= 1; + const vector4_type values = __texelGather(p, i); + scalar_type wx_0, wx_1; + { + const scalar_type wy_0 = values[3] + values[2]; + const scalar_type wy_1 = values[1] + values[0]; + if (__choseSecond(wy_0, wy_1, xi.y, rcpPmf)) + { + p.y |= 1; + wx_0 = values[0]; + wx_1 = values[1]; + } + else + { + wx_0 = values[3]; + wx_1 = values[2]; + } + } + if (__choseSecond(wx_0, wx_1, xi.x, rcpPmf)) + p.x |= 1; + } + + + // If we don't add xi, the sample will clump to the lowest corner of environment map texel. Each time we call PartitionRandVariable(), the output xi is the new xi that determines how left and right(or top and bottom for y axis) to choose the child partition. It means that if for some input xi, the output xi = 0, then the input xi is the edge of choosing this partition and the previous partition, and vice versa, if output xi = 1, then the input xi is the edge of choosing this partition and the next partition. Hence, by adding xi to the lower corner of the texel, we create a gradual transition from one pixel to another. Without adding output xi, the calculation of jacobian using the difference of sample value would not work. + // Since we want to do corner sampling, we have to handle edge texels as corner cases. Remember, in corner sampling we map uv [0,1] to [center of first texel, center of last texel]. So when p is an edge texel, we have to remap xi: to [0.5, 1] when p == 0, and to [0, 0.5] when p == length - 1. 
+ if (p.x == 0) + xi.x = xi.x * scalar_type(0.5) + scalar_type(0.5); + if (p.y == 0) + xi.y = xi.y * scalar_type(0.5) + scalar_type(0.5); + if (p.x == _lastTexel.x) + xi.x = xi.x * scalar_type(0.5); + if (p.y == _lastTexel.y) + xi.y = xi.y * scalar_type(0.5); + + // We reduce by 0.5 and divide with _lastTexel instead of map size to normalize the cornered sampling coordinate + const vector2_type directionUV = (vector2_type(p.x, p.y) + xi - domain_type(0.5, 0.5)) / _lastTexel; + + cache.rcpPmf = rcpPmf; + + return directionUV; + } + + density_type forwardPdf(const domain_type xi, const cache_type cache) NBL_CONST_MEMBER_FUNC + { + return (_lastTexel.x * _lastTexel.y) / cache.rcpPmf; + } + + weight_type forwardWeight(const domain_type xi, const cache_type cache) NBL_CONST_MEMBER_FUNC + { + return forwardPdf(xi, cache); + } + + // Doesn't comply with sampler concept. This class is extracted so can be used on warpmap generation without passing in unnecessary information like avgLuma. So, need to pass in avgLuma when calculating backwardPdf. 
+ density_type backwardPdf(codomain_type codomainVal) NBL_CONST_MEMBER_FUNC + { + return _map.load(codomainVal) * _map.getAvgLuma(); + } + + weight_type backwardWeight(const codomain_type codomainVal) NBL_CONST_MEMBER_FUNC + { + return backwardPdf(codomainVal); + } + +}; + +// TODO(kevinyu): Add constraint for PostWarpT +template + ) +struct ComposedHierarchicalSampler +{ + using this_type = ComposedHierarchicalSampler; + using warp_generator_type = HierarchicalLuminanceSampler; + using scalar_type = typename LuminanceAccessorT::value_type; + using density_type = scalar_type; + using weight_type = scalar_type; + using vector2_type = vector; + using vector3_type = vector; + using vector4_type = vector; + using domain_type = typename warp_generator_type::domain_type; + using codomain_type = typename PostWarpT::codomain_type; + + static_assert(is_same_v && is_same_v && is_same_v); + + struct cache_type + { + typename warp_generator_type::cache_type warpGeneratorCache; + typename PostWarpT::density_type postWarpPdf; + }; + + warp_generator_type _warpGenerator; + PostWarpT _postWarp; + + static this_type create(NBL_CONST_REF_ARG(LuminanceAccessorT) lumaMap) + { + this_type result; + result._warpGenerator = warp_generator_type::create(lumaMap); + return result; + } + + codomain_type generate(const domain_type xi, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + const typename warp_generator_type::codomain_type warpSample = _warpGenerator.generate(xi, cache.warpGeneratorCache); + typename PostWarpT::cache_type postWarpCache; + const codomain_type postWarpSample = _postWarp.generate(warpSample, postWarpCache); + + // I have to store the postWarpDensity here, so I don't have to call generate on warpGenerator again just to feed it to PostWarpT, even though for spherical it is unused. 
+ cache.postWarpPdf = _postWarp.forwardPdf(warpSample, postWarpCache); + + return postWarpSample; + } + + density_type forwardPdf(const domain_type xi, const cache_type cache) NBL_CONST_MEMBER_FUNC + { + return _warpGenerator.forwardPdf(xi, cache.warpGeneratorCache) * cache.postWarpPdf; + } + + weight_type forwardWeight(const domain_type xi, const cache_type cache) NBL_CONST_MEMBER_FUNC + { + return forwardPdf(xi, cache); + } + + density_type backwardPdf(const codomain_type codomainVal) NBL_CONST_MEMBER_FUNC + { + typename PostWarpT::domain_type postWarpDomain = _postWarp.generateInverse(codomainVal); + return _postWarp.backwardPdf(codomainVal) * _warpGenerator.backwardPdf(postWarpDomain, _warpGenerator._map.getAvgLuma()); + } + + weight_type backwardWeight(const codomain_type codomainVal) NBL_CONST_MEMBER_FUNC + { + return backwardPdf(codomainVal); + } +}; + + + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/sampling/hierarchical_image/accessors.hlsl b/include/nbl/builtin/hlsl/sampling/hierarchical_image/accessors.hlsl new file mode 100644 index 0000000000..93ef6ea82a --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/hierarchical_image/accessors.hlsl @@ -0,0 +1,96 @@ +#ifndef _NBL_BUILTIN_HLSL_SAMPLING_HIERARCHICAL_IMAGE_ACCESSORS_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SAMPLING_HIERARCHICAL_IMAGE_ACCESSORS_INCLUDED_ + +#include "nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace sampling +{ +namespace hierarchical_image +{ + +// declare concept +#define NBL_CONCEPT_NAME MipmappedLuminanceReadAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (AccessorT) +// not the greatest syntax but works +#define NBL_CONCEPT_PARAM_0 (accessor,AccessorT) +#define NBL_CONCEPT_PARAM_1 (pixelCoord,uint16_t2) +#define NBL_CONCEPT_PARAM_2 (level,uint16_t) +#define NBL_CONCEPT_PARAM_3 (outVal,typename AccessorT::value_type) +// start concept +NBL_CONCEPT_BEGIN(4) +// need 
to be defined AFTER the concept begins +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define pixelCoord NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define level NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +#define outVal NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_3 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_TYPE)(AccessorT::value_type)) + // Note(kevin): I don't use MipmappedLoadableImage here, since that concept require layer as parameter. So the sampler have to store the layerIndex. The logic is similar across all layer. So the accessor should be the one that store the layerIndex + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(outVal,pixelCoord,level)) , ::nbl::hlsl::is_same_v, void)) + // Ask(kevin): Should getAvgLuma follow get, where the outVal is the first parameter instead of the return value? + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.getAvgLuma()), ::nbl::hlsl::is_same_v, typename AccessorT::value_type)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.resolution()), ::nbl::hlsl::is_same_v, uint16_t2)) +); +#undef accessor +#undef pixelCoord +#undef level +#undef outVal +#include + +// declare concept +#define NBL_CONCEPT_NAME LuminanceReadAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (AccessorT) +// not the greatest syntax but works +#define NBL_CONCEPT_PARAM_0 (accessor, AccessorT) +// start concept +NBL_CONCEPT_BEGIN(1) +// need to be defined AFTER the concept begins +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_TYPE)(AccessorT::value_type)) + ((NBL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT)(concepts::accessors::GenericReadAccessor, AccessorT, typename AccessorT::value_type, float32_t2)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.getAvgLuma()), ::nbl::hlsl::is_same_v, typename AccessorT::value_type)) +); +#undef accessor +#include + +// gatherUvs return 4 UVs in a square for manual bilinear interpolation with differentiability +// declare concept +#define 
NBL_CONCEPT_NAME WarpAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (WarpAccessorT) +// not the greatest syntax but works +#define NBL_CONCEPT_PARAM_0 (accessor,WarpAccessorT) +#define NBL_CONCEPT_PARAM_1 (coord,vector) +#define NBL_CONCEPT_PARAM_2 (val, matrix) +#define NBL_CONCEPT_PARAM_3 (interpolant, vector) +// start concept +NBL_CONCEPT_BEGIN(4) +// need to be defined AFTER the concept begins +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define coord NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +#define interpolant NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_3 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_TYPE)(WarpAccessorT::scalar_type)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.gatherUv(coord, val)), ::nbl::hlsl::is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.resolution()), ::nbl::hlsl::is_same_v, uint16_t2)) +); +#undef accessor +#undef coord +#undef val +#undef interpolant +#include + +} +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/sampling/hierarchical_image/common.hlsl b/include/nbl/builtin/hlsl/sampling/hierarchical_image/common.hlsl new file mode 100644 index 0000000000..99a0060214 --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/hierarchical_image/common.hlsl @@ -0,0 +1,42 @@ +#ifndef _NBL_HLSL_SAMPLING_HIERARCHICAL_IMAGE_COMMON_INCLUDED_ +#define _NBL_HLSL_SAMPLING_HIERARCHICAL_IMAGE_COMMON_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace sampling +{ +namespace hierarchical_image +{ + +NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR float32_t3 LumaRgbCoefficients = { 0.2126729f, 0.7151522f, 0.0721750f }; + +struct SLumaGenPushConstants +{ + uint32_t lumaMapWidth : 16; + uint32_t lumaMapHeight : 16; + uint16_t lumaMapLayer; +}; + +struct SWarpGenPushConstants +{ + uint32_t lumaMapWidth : 16; + uint32_t lumaMapHeight : 16; + uint32_t warpMapWidth : 16; + uint32_t warpMapHeight : 
16; + // Both warpMap and lumaMap should have the same layer count + uint16_t lumaMapLayer; +}; + +NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t GenWarpWorkgroupDim = 16; +NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t GenLumaWorkgroupDim = 16; + +} +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/sampling/hierarchical_image/gen_luma.comp.hlsl b/include/nbl/builtin/hlsl/sampling/hierarchical_image/gen_luma.comp.hlsl new file mode 100644 index 0000000000..5edf1f733f --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/hierarchical_image/gen_luma.comp.hlsl @@ -0,0 +1,33 @@ +#include "common.hlsl" +#include "nbl/builtin/hlsl/limits.hlsl" + +using namespace nbl; +using namespace nbl::hlsl; +using namespace nbl::hlsl::sampling::hierarchical_image; + +[[vk::push_constant]] SLumaGenPushConstants pc; + +[[vk::binding(0, 0)]] Texture2DArray envMap; +[[vk::binding(1, 0)]] RWTexture2DArray outImage; + +[numthreads(GenLumaWorkgroupDim, GenLumaWorkgroupDim, 1)] +[shader("compute")] +void main(uint32_t3 threadID : SV_DispatchThreadID) +{ + if (all(threadID.xyz < uint32_t3(pc.lumaMapWidth, pc.lumaMapHeight, pc.lumaMapLayer))) + { + const float uv_y = (float(threadID.y) + float(0.5f)) / pc.lumaMapHeight; + const float32_t3 envMapSample = envMap.Load(int4(threadID.xyz, 0)); + float32_t luma = hlsl::dot(envMapSample, LumaRgbCoefficients) * sin(numbers::pi * uv_y); + + // We reduce the luma of the corner texel since we want to do "corner sampling" when generating warp map. 
+ if (threadID.x == 0 || threadID.x == (pc.lumaMapWidth - 1)) + luma *= 0.5f; + if (threadID.y == 0 || threadID.y == (pc.lumaMapHeight - 1)) + luma *= 0.5f; + + luma = max(luma, nbl::hlsl::numeric_limits::min); + + outImage[threadID.xyz] = luma; + } +} diff --git a/include/nbl/builtin/hlsl/sampling/hierarchical_image/gen_warp.comp.hlsl b/include/nbl/builtin/hlsl/sampling/hierarchical_image/gen_warp.comp.hlsl new file mode 100644 index 0000000000..660b151e87 --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/hierarchical_image/gen_warp.comp.hlsl @@ -0,0 +1,75 @@ +#include "nbl/builtin/hlsl/sampling/hierarchical_image.hlsl" +#include "nbl/builtin/hlsl/sampling/hierarchical_image/common.hlsl" + +using namespace nbl; +using namespace nbl::hlsl; +using namespace nbl::hlsl::sampling; +using namespace nbl::hlsl::sampling::hierarchical_image; + +[[vk::push_constant]] SWarpGenPushConstants pc; + +[[vk::binding(2, 0)]] Texture2DArray lumaMap; +[[vk::binding(3, 0)]] RWTexture2DArray outImage; + + +struct LuminanceAccessor +{ + using value_type = float32_t; + + uint16_t _layerIndex; + + static LuminanceAccessor create(uint16_t layerIndex) + { + LuminanceAccessor result; + result._layerIndex = layerIndex; + return result; + } + + void get(NBL_REF_ARG(value_type) outVal, uint16_t2 pixelCoord, uint16_t level) NBL_CONST_MEMBER_FUNC + { + assert(pixelCoord.x < pc.lumaMapWidth && pixelCoord.y < pc.lumaMapHeight); /* pixelCoord addresses lumaMap, so bound it by the luma extent (a loose bound for level > 0), not by the warp map size */ + outVal = lumaMap.Load(int4(pixelCoord, _layerIndex, level)); + } + + uint16_t2 resolution() NBL_CONST_MEMBER_FUNC + { + return uint16_t2(pc.lumaMapWidth, pc.lumaMapHeight); + } + + value_type getAvgLuma() NBL_CONST_MEMBER_FUNC + { + const uint16_t lastMipLevel = _static_cast(findMSB(_static_cast(pc.lumaMapHeight))); /* last mip of lumaMap itself, consistent with resolution(); the push constants carry a separate warp map size */ + if (pc.lumaMapHeight == pc.lumaMapWidth) + { + return lumaMap.Load(int4(0, 0, _layerIndex, lastMipLevel)); + } else + { + return value_type(0.5) * (lumaMap.Load(int4(0, 0, _layerIndex, lastMipLevel)) + lumaMap.Load(int4(1, 0, _layerIndex, lastMipLevel))); 
+ } + } + +}; + +[numthreads(GenWarpWorkgroupDim, GenWarpWorkgroupDim, 1)] +[shader("compute")] +void main(uint32_t3 threadID : SV_DispatchThreadID) +{ + if (all(threadID.xyz < uint32_t3(pc.warpMapWidth, pc.warpMapHeight, pc.lumaMapLayer))) /* x is bounded by the warp map width and y by its height, matching the xi normalization below */ + { + using WarpGenerator = HierarchicalLuminanceSampler; + + const uint16_t layerIndex = threadID.z; + const LuminanceAccessor luminanceAccessor = LuminanceAccessor::create(layerIndex); + + const WarpGenerator warpGenerator = WarpGenerator::create(luminanceAccessor); + + const uint32_t2 pixelCoord = threadID.xy; + + const float32_t2 xi = float32_t2(pixelCoord) / float32_t2(pc.warpMapWidth - 1, pc.warpMapHeight - 1); + + typename WarpGenerator::cache_type dummyCache; + outImage[threadID.xyz] = warpGenerator.generate(xi, dummyCache); + } + + +} diff --git a/include/nbl/builtin/hlsl/sampling/spherical_mapping.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_mapping.hlsl new file mode 100644 index 0000000000..de473a1bed --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/spherical_mapping.hlsl @@ -0,0 +1,92 @@ +#ifndef _NBL_BUILTIN_HLSL_SAMPLING_SPHERICAL_MAPPING_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SAMPLING_SPHERICAL_MAPPING_INCLUDED_ + +#include +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace sampling +{ + +template +struct SphericalMapping +{ + using scalar_type = T; + using vector2_type = vector; + using vector3_type = vector; + using vector4_type = vector; + + using density_type = scalar_type; + using weight_type = scalar_type; + using domain_type = vector2_type; + using codomain_type = vector3_type; + + struct cache_type + { + scalar_type sinTheta; + }; + + static codomain_type generate(const domain_type uv, NBL_REF_ARG(cache_type) cache) + { + codomain_type dir; + dir.x = cos(uv.x * scalar_type(2) * numbers::pi); + dir.z = sqrt(scalar_type(1) - (dir.x * dir.x)); + if (uv.x > scalar_type(0.5)) + dir.z = -dir.z; + const scalar_type theta = uv.y * numbers::pi; + scalar_type sinTheta, 
cosTheta; + nbl::hlsl::math::sincos(theta, sinTheta, cosTheta); + dir.xz *= sinTheta; + dir.y = cosTheta; + + cache.sinTheta = sinTheta; + + return dir; + } + + static domain_type generateInverse(const codomain_type v) + { + const density_type phi = atan2(v.z, v.x); + const density_type theta = acos(v.y); + density_type uv_x = phi * density_type(0.5) * numbers::inv_pi; + if (uv_x < density_type(0)) + uv_x += density_type(1); + density_type uv_y = theta * numbers::inv_pi; + return domain_type(uv_x, uv_y); + } + + density_type forwardPdf(const domain_type v, const cache_type cache) + { + return scalar_type(1) / (scalar_type(2) * cache.sinTheta * numbers::pi *numbers::pi); + } + + weight_type forwardWeight(const domain_type v, const cache_type cache) + { + return scalar_type(1) / (scalar_type(2) * cache.sinTheta * numbers::pi *numbers::pi); + } + + density_type backwardPdf(const codomain_type v) + { + const density_type cosTheta = v.y; + const density_type rcpSinTheta = hlsl::rsqrt(density_type(1) - (cosTheta * cosTheta)); + return rcpSinTheta / (density_type(2) * numbers::pi * numbers::pi); + } + + weight_type backwardWeight(const codomain_type v) + { + const density_type cosTheta = v.y; + const density_type rcpSinTheta = hlsl::rsqrt(density_type(1) - (cosTheta * cosTheta)); + return rcpSinTheta / (density_type(2) * numbers::pi * numbers::pi); + } +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/sampling/warpmap.hlsl b/include/nbl/builtin/hlsl/sampling/warpmap.hlsl new file mode 100644 index 0000000000..28b0544cac --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/warpmap.hlsl @@ -0,0 +1,125 @@ +// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_SAMPLING_WARPMAP_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SAMPLING_WARPMAP_INCLUDED_ + +#include +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace sampling +{ + +// TODO: Add some constraint into PostWarpT +// Ask(kevin): Should we add constraint so the WarpAccessor::scalar_type is the same as LuminanceAccessorT::value_type. One is a uv and the other is luminance. Technically, they can have different type. +template && + hierarchical_image::WarpAccessor) +struct WarpmapSampler +{ + using scalar_type = typename LuminanceAccessorT::value_type; + using vector2_type = vector; + using vector3_type = vector; + using vector4_type = vector; + using domain_type = vector2_type; + using codomain_type = vector3_type; + using weight_type = scalar_type; + using density_type = scalar_type; + using this_type = WarpmapSampler; + struct cache_type + { + vector2_type xDiffs[2]; + vector2_type yDiff; + vector2_type warpedUv; + scalar_type interpolantY; + typename PostWarpT::cache_type postWarpCache; + }; + + LuminanceAccessorT _lumaMap; + HierarchicalSamplerT _warpMap; + uint16_t2 _lastTexel; + PostWarpT _postWarp; + + static WarpmapSampler create(NBL_CONST_REF_ARG(LuminanceAccessorT) lumaMap, NBL_CONST_REF_ARG(HierarchicalSamplerT) warpMap) + { + this_type result; + result._lumaMap = lumaMap; + result._warpMap = warpMap; + result._lastTexel = warpMap.resolution() - uint16_t2(1, 1); + return result; + } + + + codomain_type generate(const domain_type xi, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + float32_t2 texelCoord = xi * float32_t2(_lastTexel.x, _lastTexel.y); + vector2_type interpolant = hlsl::fract(texelCoord); + uint32_t2 warpmapUv = texelCoord / float32_t2(_warpMap.resolution()); + + matrix uvs; + _warpMap.gatherUv(warpmapUv, uvs); + + const vector2_type xDiffs[] = { + uvs[2] - uvs[3], + uvs[1] - uvs[0] + }; + const vector2_type 
yVals[] = { + xDiffs[0] * interpolant.x + uvs[3], + xDiffs[1] * interpolant.x + uvs[0] + }; + const vector2_type yDiff = yVals[1] - yVals[0]; + vector2_type uv = yDiff * interpolant.y + yVals[0]; + + cache.xDiffs[0] = xDiffs[0]; + cache.xDiffs[1] = xDiffs[1]; + cache.yDiff = yDiff; + cache.warpedUv = uv; + cache.interpolantY = interpolant.y; + + const codomain_type result = _postWarp.generate(uv, cache.postWarpCache); + + return result; + } + + density_type forwardPdf(const domain_type xi, const cache_type cache) NBL_CONST_MEMBER_FUNC + { + const scalar_type detInterpolJacobian = determinant(matrix( + lerp(cache.xDiffs[0], cache.xDiffs[1], cache.interpolantY), // first column dFdx + cache.yDiff // second column dFdy + )) * scalar_type(_lastTexel.x) * scalar_type(_lastTexel.y); + const scalar_type pdf = abs(_postWarp.forwardPdf(cache.warpedUv, cache.postWarpCache) / detInterpolJacobian); + return pdf; + } + + weight_type forwardWeight(const domain_type xi, const cache_type cache) + { + scalar_type luma; + _lumaMap.get(cache.warpedUv, luma); + return (luma * _postWarp.forwardWeight(cache.warpedUv, cache.postWarpCache)) / _lumaMap.getAvgLuma(); + } + + weight_type backwardWeight(codomain_type direction) NBL_CONST_MEMBER_FUNC + { + vector2_type envmapUv = _postWarp.generateInverse(direction); + scalar_type luma; + _lumaMap.get(envmapUv, luma); + return (luma * _postWarp.backwardWeight(direction)) / _lumaMap.getAvgLuma(); + } +}; + +} +} +} + +#endif + + + diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl index 22c93ce193..aa395ad524 100644 --- a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -205,7 +205,7 @@ struct SArithmeticConfiguration #undef DEFINE_ASSIGN } - std::string getConfigTemplateStructString() + std::string getConfigTemplateStructString() NBL_CONST_MEMBER_FUNC { std::ostringstream os; os << 
"nbl::hlsl::workgroup2::ArithmeticConfiguration<" << WorkgroupSizeLog2 << "," << SubgroupSizeLog2 << "," << ItemsPerInvocation_0 << ">;"; diff --git a/include/nbl/ext/EnvmapImportanceSampling/EnvmapImportanceSampling.h b/include/nbl/ext/EnvmapImportanceSampling/EnvmapImportanceSampling.h deleted file mode 100644 index 678adf59a9..0000000000 --- a/include/nbl/ext/EnvmapImportanceSampling/EnvmapImportanceSampling.h +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#ifndef _NBL_EXT_ENVMAP_IMPORTANCE_SAMPLING_INCLUDED_ -#define _NBL_EXT_ENVMAP_IMPORTANCE_SAMPLING_INCLUDED_ - -#include "nabla.h" -#include "nbl/video/IGPUShader.h" -#include "nbl/asset/ICPUShader.h" - -namespace nbl::ext::EnvmapImportanceSampling -{ - -class EnvmapImportanceSampling -{ - public: - EnvmapImportanceSampling(video::IVideoDriver* _driver) : m_driver(_driver) - {} - ~EnvmapImportanceSampling() = default; - - // Shader and Resources for Generating Luminance MipMaps from EnvMap - static constexpr uint32_t MaxMipCountLuminance = 13u; - static constexpr uint32_t DefaultLumaMipMapGenWorkgroupDimension = 16u; - static constexpr uint32_t DefaultWarpMapGenWorkgroupDimension = 16u; - - void initResources( - core::smart_refctd_ptr envmap, - uint32_t lumaGenWorkgroupDimension = DefaultLumaMipMapGenWorkgroupDimension, - uint32_t warpMapGenWorkgroupDimension = DefaultWarpMapGenWorkgroupDimension); - void deinitResources(); - - // returns if RIS should be enabled based on variance calculations - inline bool computeWarpMap(const float envMapRegularizationFactor, float& pdfNormalizationFactor) - { - [[maybe_unused]] float dummy; - return computeWarpMap(envMapRegularizationFactor,pdfNormalizationFactor,dummy); - } - bool computeWarpMap(const float envMapRegularizationFactor, float& pdfNormalizationFactor, float& maxEmittanceLuma); - - 
core::smart_refctd_ptr getLuminanceImageView() { return m_luminance; } - core::smart_refctd_ptr getWarpMapImageView() { return m_warpMap; } - - private: - #define uint uint32_t - struct uvec2 - { - uint x,y; - }; - struct vec2 - { - float x,y; - }; - struct vec3 - { - float x,y,z; - }; - #define vec4 core::vectorSIMDf - #define mat4 core::matrix4SIMD - #define mat4x3 core::matrix3x4SIMD - #include "nbl/builtin/glsl/ext/EnvmapImportanceSampling/structs.glsl" - #undef uint - #undef vec4 - #undef mat4 - #undef mat4x3 - inline uint32_t calcMeasurementBufferSize() const - { - return sizeof(nbl_glsl_ext_EnvmapSampling_LumaMeasurement_t)*m_lumaWorkgroups[0]*m_lumaWorkgroups[1]; - } - #undef NBL_GLSL_EXT_ENVMAP_SAMPLING_LUMA_MEASUREMENTS - - uint32_t m_lumaWorkgroups[2]; - uint32_t m_warpWorkgroups[2]; - - core::smart_refctd_ptr m_luminance; - core::smart_refctd_ptr m_warpMap; // Warps Sample based on EnvMap Luminance - - core::smart_refctd_ptr m_lumaDS; - core::smart_refctd_ptr m_lumaMeasurePipeline; - core::smart_refctd_ptr m_lumaGenPipeline; - - // Shader and Resources for EnvironmentalMap Sample Warping - core::smart_refctd_ptr m_warpDS; - core::smart_refctd_ptr m_warpGPUShader; - core::smart_refctd_ptr m_warpPipeline; - - video::IVideoDriver* m_driver; -}; - -} - -#endif diff --git a/include/nbl/video/sampling/CEnvmapWarpGenerator.h b/include/nbl/video/sampling/CEnvmapWarpGenerator.h new file mode 100644 index 0000000000..a72df270f3 --- /dev/null +++ b/include/nbl/video/sampling/CEnvmapWarpGenerator.h @@ -0,0 +1,142 @@ +#ifndef _NBL_VIDEO_ENVMAP_WARP_GENERATOR_INCLUDED_ +#define _NBL_VIDEO_ENVMAP_WARP_GENERATOR_INCLUDED_ + +#include "nbl/video/declarations.h" + +namespace nbl::video +{ + +class NBL_API2 CEnvmapWarpGenerator final : public core::IReferenceCounted +{ + public: + + static constexpr uint32_t MaxMipCountLuminance = 13u; + + struct SCachedCreationParameters + { + core::smart_refctd_ptr utilities; + }; + + struct SCreationParameters : public 
SCachedCreationParameters + { + core::smart_refctd_ptr assetManager = nullptr; + + inline bool validate() const + { + const auto validation = std::to_array + ({ + std::make_pair(bool(assetManager), "Invalid `creationParams.assetManager` is nullptr!"), + std::make_pair(bool(utilities), "Invalid `creationParams.utilities` is nullptr!"), + }); + + system::logger_opt_ptr logger = utilities->getLogger(); + for (const auto& [ok, error] : validation) + if (!ok) + { + logger.log(error, system::ILogger::ELL_ERROR); + return false; + } + + assert(bool(assetManager->getSystem())); + + return true; + } + + }; + + + static core::smart_refctd_ptr create(SCreationParameters&& params); + + static core::smart_refctd_ptr createDescriptorSetLayout(video::ILogicalDevice* device); + static core::smart_refctd_ptr createPipelineLayout(video::ILogicalDevice* device); + + static core::smart_refctd_ptr createPipeline(const SCreationParameters& params, const video::IGPUPipelineLayout* pipelineLayout, std::string_view shaderPath); + + + static core::smart_refctd_ptr createLumaMap(video::ILogicalDevice* device, asset::VkExtent3D extent, uint32_t mipCount, uint32_t layerCount, std::string_view debugName = ""); + + static core::smart_refctd_ptr createWarpMap(video::ILogicalDevice* device, asset::VkExtent3D extent, uint32_t layerCount, std::string_view debugName = ""); + + inline video::IGPUComputePipeline* getGenLumaPipeline() const + { + return m_genLumaPipeline.get(); + } + + inline video::IGPUComputePipeline* getGenWarpPipeline() const + { + return m_genWarpPipeline.get(); + } + + class NBL_API2 SSession : public core::IReferenceCounted + { + public: + + // ASK(kevin): Should this and constructor be private and we use friend class? 
+ struct SCachedCreationParams + { + core::smart_refctd_ptr envMap; + core::smart_refctd_ptr lumaMap; + core::smart_refctd_ptr warpMap; + core::smart_refctd_ptr descriptorSet; + core::smart_refctd_ptr generator; + hlsl::uint32_t2 genLumaWorkgroupCount; + hlsl::uint32_t2 genWarpWorkgroupCount; + uint16_t layerCount; + }; + + explicit SSession(SCachedCreationParams&& params) : m_params(std::move(params)) {} + + void computeWarpMap(video::IGPUCommandBuffer* cmdBuf); + + inline core::smart_refctd_ptr getLumaMapView() const + { + return m_params.lumaMap; + } + + inline core::smart_refctd_ptr getWarpMapView() const + { + return m_params.warpMap; + } + + using image_barrier_t = IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t; + // barrier against previous uses of the envmap. Don't access luma map and warp map before calling computeWarpMap + image_barrier_t getEnvMapPrevBarrier(core::bitflag srcStageMask, core::bitflag srcAccessMask, IGPUImage::LAYOUT oldLayout); + + image_barrier_t getEnvMapNextBarrier(core::bitflag dstStageMask, core::bitflag dstAccessMask, IGPUImage::LAYOUT newLayout); + + // barrier against future uses for luma map and warp map. 
+ std::array getOutputMapNextBarrier(core::bitflag dstStageMask, core::bitflag dstAccessMask, IGPUImage::LAYOUT newLayout); + + private: + SCachedCreationParams m_params; + }; + + core::smart_refctd_ptr createSession(core::smart_refctd_ptr&& envMap, uint16_t upscaleLog2 = 0); + + protected: + struct ConstructorParams + { + SCachedCreationParameters creationParams; + core::smart_refctd_ptr genLumaPipeline; + core::smart_refctd_ptr genWarpPipeline; + }; + + explicit CEnvmapWarpGenerator(ConstructorParams&& params) : + m_params(std::move(params.creationParams)), + m_genLumaPipeline(std::move(params.genLumaPipeline)), + m_genWarpPipeline(std::move(params.genWarpPipeline)) + {} + + ~CEnvmapWarpGenerator() override {} + + private: + + SCachedCreationParameters m_params; + + core::smart_refctd_ptr m_genLumaPipeline; + core::smart_refctd_ptr m_genWarpPipeline; + +}; + +} +#endif diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 9c994bfa41..99fd5bfc47 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -128,6 +128,7 @@ set(NBL_CORE_SOURCES core/alloc/refctd_memory_resource.cpp core/hash/blake.cpp ) + set(NBL_SYSTEM_SOURCES system/DefaultFuncPtrLoader.cpp system/IFileBase.cpp @@ -294,6 +295,9 @@ set(NBL_VIDEO_SOURCES # CUDA video/CCUDAHandler.cpp video/CCUDADevice.cpp + +# Sampling + video/sampling/CEnvmapWarpGenerator.cpp ) set(NBL_SCENE_SOURCES diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index f27514c2c7..bc29dcc162 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -281,6 +281,12 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/spherical_rectangle. 
LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/cos_weighted_spheres.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/quotient_and_pdf.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/uniform_spheres.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/hierarchical_image.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/warpmap.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/hierarchical_image/accessors.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/hierarchical_image/common.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/hierarchical_image/gen_luma.comp.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/hierarchical_image/gen_warp.comp.hlsl") # LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ndarray_addressing.hlsl") # @@ -356,7 +362,6 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/shared_scan.hlsl") #Extensions LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/default.vert.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/text_rendering/msdf.hlsl") #memory LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/memory.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/memory_accessor.hlsl") @@ -384,7 +389,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath/output_structs.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/prefix_sum_blur/blur.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/prefix_sum_blur/box_sampler.hlsl") #rwmc -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/rwmc/Resolve.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/rwmc/resolve.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/rwmc/CascadeAccumulator.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED 
"hlsl/rwmc/SplattingParameters.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/rwmc/ResolveParameters.hlsl") diff --git a/src/nbl/ext/EnvmapImportanceSampling/EnvmapImportanceSampling.cpp b/src/nbl/ext/EnvmapImportanceSampling/EnvmapImportanceSampling.cpp deleted file mode 100644 index f11df5ce15..0000000000 --- a/src/nbl/ext/EnvmapImportanceSampling/EnvmapImportanceSampling.cpp +++ /dev/null @@ -1,426 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h - -#include "nbl/ext/EnvmapImportanceSampling/EnvmapImportanceSampling.h" - -#include - -using namespace nbl; -using namespace nbl::asset; -using namespace nbl::video; -using namespace ext::EnvmapImportanceSampling; - - -static core::smart_refctd_ptr createTexture(nbl::video::IVideoDriver* _driver, const VkExtent3D extent, E_FORMAT format, uint32_t mipLevels=1u, uint32_t layers=0u) -{ - const auto real_layers = layers ? layers:1u; - - IGPUImage::SCreationParams imgparams; - imgparams.extent = extent; - imgparams.arrayLayers = real_layers; - imgparams.flags = static_cast(0); - imgparams.format = format; - imgparams.mipLevels = mipLevels; - imgparams.samples = IImage::ESCF_1_BIT; - imgparams.type = IImage::ET_2D; - - IGPUImageView::SCreationParams viewparams; - viewparams.flags = static_cast(0); - viewparams.format = format; - viewparams.image = _driver->createDeviceLocalGPUImageOnDedMem(std::move(imgparams)); - viewparams.viewType = layers ? 
IGPUImageView::ET_2D_ARRAY:IGPUImageView::ET_2D; - viewparams.subresourceRange.aspectMask = static_cast(0); - viewparams.subresourceRange.baseArrayLayer = 0u; - viewparams.subresourceRange.layerCount = real_layers; - viewparams.subresourceRange.baseMipLevel = 0u; - viewparams.subresourceRange.levelCount = mipLevels; - - return _driver->createGPUImageView(std::move(viewparams)); -} - -void EnvmapImportanceSampling::initResources(core::smart_refctd_ptr envmap, uint32_t lumaGenWorkgroupDimension, uint32_t warpMapGenWorkgroupDimension) -{ - const auto EnvmapExtent = envmap->getCreationParameters().image->getCreationParameters().extent; - // we don't need the 1x1 mip for anything - const uint32_t MipCountLuminance = IImage::calculateFullMipPyramidLevelCount(EnvmapExtent,IImage::ET_2D)-1; - const auto EnvMapPoTExtent = [MipCountLuminance]() -> VkExtent3D - { - const uint32_t width = 0x1u<>1u,1u }; - }(); - auto calcWorkgroups = [](uint32_t* workGroups, const VkExtent3D extent, const uint32_t workgroupDimension) - { - for (auto i=0; i<2; i++) - workGroups[i] = ((&extent.width)[i]-1u)/workgroupDimension+1u; - }; - - // TODO: Can we get away with R16_SFLOAT for the probabilities? 
- m_luminance = createTexture(m_driver,EnvMapPoTExtent,EF_R32_SFLOAT,MipCountLuminance); - calcWorkgroups(m_lumaWorkgroups,EnvMapPoTExtent,lumaGenWorkgroupDimension); - - // default make the warp-map same resolution as input envmap - // Format needs to be 32bit full precision float, because the Jacobian needs to accurately match PDF - const uint32_t upscale = 0; - const VkExtent3D WarpMapExtent = {EnvMapPoTExtent.width<&& pipelineLayout) -> core::smart_refctd_ptr - { - const char* sourceFmt = - R"===(#version 430 core - -#define LUMA_MIP_MAP_GEN_WORKGROUP_DIM %u -#define WARP_MAP_GEN_WORKGROUP_DIM %u - -#include "%s" - -)==="; - - const size_t extraSize = 2u * 8u + 128u; - auto shader = core::make_smart_refctd_ptr(strlen(sourceFmt) + extraSize + 1u); - snprintf( - reinterpret_cast(shader->getPointer()), shader->getSize(), sourceFmt, - lumaGenWorkgroupDimension, - warpMapGenWorkgroupDimension, - shaderPath - ); - auto gpuShader = m_driver->createGPUShader(core::make_smart_refctd_ptr(std::move(shader), ICPUShader::buffer_contains_glsl)); - if (!gpuShader) - return nullptr; - - auto specializedShader = m_driver->createGPUSpecializedShader(gpuShader.get(), ISpecializedShader::SInfo{ nullptr,nullptr,"main",asset::ISpecializedShader::ESS_COMPUTE }); - if (!specializedShader) - return nullptr; - - return m_driver->createGPUComputePipeline(nullptr,std::move(pipelineLayout),std::move(specializedShader)); - }; - - // Create Everything - { - ISampler::SParams samplerParams; - samplerParams.TextureWrapU = samplerParams.TextureWrapV = samplerParams.TextureWrapW = ISampler::ETC_CLAMP_TO_EDGE; - samplerParams.MinFilter = ISampler::ETF_NEAREST; - samplerParams.MaxFilter = ISampler::ETF_LINEAR; - samplerParams.MipmapMode = ISampler::ESMM_NEAREST; - samplerParams.AnisotropicFilter = 0u; - samplerParams.CompareEnable = false; - - IGPUDescriptorSet::SDescriptorInfo lumaDescriptorInfo = {}; - lumaDescriptorInfo.desc = m_luminance; - lumaDescriptorInfo.image.sampler = nullptr; - - { - 
auto upscaleSampler = m_driver->createGPUSampler(samplerParams); - - constexpr auto lumaDescriptorCount = 3u; - IGPUDescriptorSetLayout::SBinding bindings[lumaDescriptorCount]; - bindings[0].binding = 0u; - bindings[0].type = asset::EDT_COMBINED_IMAGE_SAMPLER; - bindings[0].stageFlags = ISpecializedShader::ESS_COMPUTE; - bindings[0].count = 1u; - bindings[0].samplers = &upscaleSampler; - - bindings[1].binding = 1u; - bindings[1].type = asset::EDT_STORAGE_BUFFER_DYNAMIC; - bindings[1].stageFlags = ISpecializedShader::ESS_COMPUTE; - bindings[1].count = 1u; - - bindings[2].binding = 2u; - bindings[2].type = asset::EDT_STORAGE_IMAGE; - bindings[2].stageFlags = ISpecializedShader::ESS_COMPUTE; - bindings[2].count = 1u; - - auto lumaDSLayout = m_driver->createGPUDescriptorSetLayout(bindings,bindings+lumaDescriptorCount); - { - SPushConstantRange range{ ISpecializedShader::ESS_COMPUTE,0u,sizeof(nbl_glsl_ext_EnvmapSampling_LumaGenShaderData_t) }; - auto lumaPipelineLayout = m_driver->createGPUPipelineLayout(&range,&range+1u,core::smart_refctd_ptr(lumaDSLayout)); - m_lumaMeasurePipeline = genPipeline("nbl/builtin/glsl/ext/EnvmapImportanceSampling/measure_luma.comp",core::smart_refctd_ptr(lumaPipelineLayout)); - m_lumaGenPipeline = genPipeline("nbl/builtin/glsl/ext/EnvmapImportanceSampling/gen_luma.comp",std::move(lumaPipelineLayout)); - } - m_lumaDS = m_driver->createGPUDescriptorSet(std::move(lumaDSLayout)); - - { - IGPUDescriptorSet::SDescriptorInfo envMapDescriptorInfo = {}; - envMapDescriptorInfo.desc = envmap; - envMapDescriptorInfo.image.sampler = nullptr; - envMapDescriptorInfo.image.imageLayout = asset::EIL_SHADER_READ_ONLY_OPTIMAL; - - IGPUDescriptorSet::SDescriptorInfo lumaMeasurementInfo = {}; - lumaMeasurementInfo.desc = core::smart_refctd_ptr(m_driver->getDefaultDownStreamingBuffer()->getBuffer()); - lumaMeasurementInfo.buffer = {0,calcMeasurementBufferSize()}; - - IGPUDescriptorSet::SWriteDescriptorSet writes[lumaDescriptorCount]; - for (auto i=0u; 
iupdateDescriptorSets(lumaDescriptorCount,writes,0u,nullptr); - } - } - - { - samplerParams.TextureWrapU = samplerParams.TextureWrapV = samplerParams.TextureWrapW = ISampler::ETC_CLAMP_TO_BORDER; - samplerParams.BorderColor = ISampler::ETBC_FLOAT_OPAQUE_BLACK; - samplerParams.MaxFilter = ISampler::ETF_NEAREST; - auto lumaSampler = m_driver->createGPUSampler(samplerParams); - - constexpr auto warpDescriptorCount = 2u; - IGPUDescriptorSetLayout::SBinding bindings[warpDescriptorCount]; - bindings[0].binding = 0u; - bindings[0].type = asset::EDT_COMBINED_IMAGE_SAMPLER; - bindings[0].stageFlags = ISpecializedShader::ESS_COMPUTE; - bindings[0].count = 1; - bindings[0].samplers = &lumaSampler; - - bindings[1].binding = 1u; - bindings[1].type = asset::EDT_STORAGE_IMAGE; - bindings[1].stageFlags = ISpecializedShader::ESS_COMPUTE; - bindings[1].count = 1u; - - auto warpDSLayout = m_driver->createGPUDescriptorSetLayout(bindings,bindings+warpDescriptorCount); - - m_warpPipeline = genPipeline( - "nbl/builtin/glsl/ext/EnvmapImportanceSampling/gen_warpmap.comp", - m_driver->createGPUPipelineLayout(nullptr,nullptr,core::smart_refctd_ptr(warpDSLayout)) - ); - - m_warpDS = m_driver->createGPUDescriptorSet(std::move(warpDSLayout)); - { - IGPUDescriptorSet::SDescriptorInfo warpMapDescriptorInfo = {}; - warpMapDescriptorInfo.desc = m_warpMap; - warpMapDescriptorInfo.image.sampler = nullptr; - warpMapDescriptorInfo.image.imageLayout = asset::EIL_GENERAL; - - IGPUDescriptorSet::SWriteDescriptorSet writes[warpDescriptorCount]; - for (auto i=0u; iupdateDescriptorSets(warpDescriptorCount,writes,0u,nullptr); - } - } - } -} - -void EnvmapImportanceSampling::deinitResources() -{ - m_lumaMeasurePipeline = nullptr; - m_lumaGenPipeline = nullptr; - m_lumaDS = nullptr; - - m_warpPipeline = nullptr; - m_warpDS = nullptr; - - m_warpMap = nullptr; - m_luminance = nullptr; -} - -bool EnvmapImportanceSampling::computeWarpMap(const float envMapRegularizationFactor, float& pdfNormalizationFactor, float& 
maxEmittanceLuma) -{ - bool enableRIS = false; - // - nbl_glsl_ext_EnvmapSampling_LumaGenShaderData_t pcData = {}; - pcData.luminanceScales.set(0.2126729f, 0.7151522f, 0.0721750f, 0.0f); - { - const auto imageExtent = m_luminance->getCreationParameters().image->getCreationParameters().extent; - pcData.lumaMapResolution = {imageExtent.width,imageExtent.height}; - } - - auto dynamicOffsets = core::make_refctd_dynamic_array>(1u); - auto lumaDispatch = [&](core::smart_refctd_ptr& pipeline,core::smart_refctd_dynamic_array* dynamicOffsets) - { - m_driver->bindComputePipeline(pipeline.get()); - m_driver->bindDescriptorSets(EPBP_COMPUTE,pipeline->getLayout(),0u,1u,&m_lumaDS.get(),dynamicOffsets); - m_driver->pushConstants(pipeline->getLayout(),ICPUSpecializedShader::ESS_COMPUTE,0u,sizeof(pcData),&pcData); - m_driver->dispatch(m_lumaWorkgroups[0],m_lumaWorkgroups[1],1); - }; - - // 3 seconds is a long time - constexpr uint64_t timeoutInNanoSeconds = 300000000000u; - - // Calculate directionality metric (0 uniform, 1 totally unidirectional) and new Regularization Factor. - // Ideally would want a better metric of how "concentrated" the energy is in one direction rather than variance, so it - // turns out that the first order spherical harmonic band and weighted (by luma) average of directions are the same thing. - float directionalityMetric = [&]() - { - maxEmittanceLuma = 0.f; - - const uint32_t size = calcMeasurementBufferSize(); - // remember that without initializing the address to be allocated to invalid_address you won't get an allocation! 
- auto downloadStagingArea = m_driver->getDefaultDownStreamingBuffer(); - const auto& address = dynamicOffsets->operator[](0) = std::remove_pointer::type::invalid_address; - // allocate - { - // common page size - const uint32_t alignment = 4096u; - const auto waitPoint = std::chrono::high_resolution_clock::now()+std::chrono::nanoseconds(timeoutInNanoSeconds); - auto unallocatedSize = downloadStagingArea->multi_alloc(waitPoint,1u,dynamicOffsets->data(),&size,&alignment); - if (unallocatedSize) - { - os::Printer::log("Could not download the buffer from the GPU!", ELL_ERROR); - return 0.f; - } - } - auto* data = reinterpret_cast(reinterpret_cast(downloadStagingArea->getBufferPointer())+address); - - // measure into buffer - lumaDispatch(m_lumaMeasurePipeline,&dynamicOffsets); - COpenGLExtensionHandler::pGlMemoryBarrier(GL_ALL_BARRIER_BITS); // TODO: rethink when reimplementing in Vulkan - { - // place and wait for download fence - auto downloadFence = m_driver->placeFence(true); - auto result = downloadFence->waitCPU(timeoutInNanoSeconds,true); - // - if (result==E_DRIVER_FENCE_RETVAL::EDFR_TIMEOUT_EXPIRED || result==E_DRIVER_FENCE_RETVAL::EDFR_FAIL) - { - os::Printer::log("Could not download the buffer from the GPU, fence not signalled!", ELL_ERROR); - downloadStagingArea->multi_free(1u,&address,&size,nullptr); - return 0.f; - } - // then invalidate the CPU cache of the mapping - if (downloadStagingArea->needsManualFlushOrInvalidate()) - m_driver->invalidateMappedMemoryRanges({ {downloadStagingArea->getBuffer()->getBoundMemory(),address,size} }); - } - - // reduce - core::vectorSIMDf avgDir; - { - const auto reduction = std::reduce( - data,data+size/sizeof(nbl_glsl_ext_EnvmapSampling_LumaMeasurement_t), - nbl_glsl_ext_EnvmapSampling_LumaMeasurement_t{0.f,0.f,0.f,0.f,0.f}, - [](nbl_glsl_ext_EnvmapSampling_LumaMeasurement_t lhs, const nbl_glsl_ext_EnvmapSampling_LumaMeasurement_t& rhs){ - lhs.xDirSum += rhs.xDirSum; - lhs.yDirSum += rhs.yDirSum; - lhs.zDirSum += 
rhs.zDirSum; - lhs.weightSum += rhs.weightSum; - if (lhs.maxLumamulti_free(1u,&address,&size,nullptr); - - avgDir /= avgDir.wwww(); - avgDir.w = 0.f; - // should it be length or length squared? - const float directionality = core::length(avgDir)[0]; - std::cout << "Final Luminance Directionality = " << directionality << std::endl; - // the only reason why we'd get a NaN would be because there's literally 0 luminance in the image - return core::isnan(directionality) ? 0.f:directionality; - }(); - - const float regularizationFactor = core::min(envMapRegularizationFactor*directionalityMetric,envMapRegularizationFactor); - std::cout << "New Regularization Factor based on Directionality = " << regularizationFactor << std::endl; - - constexpr float regularizationThreshold = 0.00001f; - enableRIS = regularizationFactor>=regularizationThreshold; - - // Calc Luma again with new Regularization Factor - { - pcData.luminanceScales *= regularizationFactor; - pcData.luminanceScales.w = 1.f-regularizationFactor; - lumaDispatch(m_lumaGenPipeline,&dynamicOffsets); - COpenGLExtensionHandler::pGlMemoryBarrier(GL_ALL_BARRIER_BITS); // TODO: rethink when reimplementing in Vulkan - } - - // Calc Mipmaps - m_luminance->regenerateMipMapLevels(); - - // Download last mip level and get avg from it - { - const auto lumaImage = m_luminance->getCreationParameters().image; - - // - IImage::SBufferCopy copyRegion = {}; - { - copyRegion.bufferRowLength = 0u; - copyRegion.bufferImageHeight = 0u; - //copyRegion.imageSubresource.aspectMask = wait for Vulkan; - copyRegion.imageSubresource.mipLevel = lumaImage->getCreationParameters().mipLevels-1u; - copyRegion.imageSubresource.baseArrayLayer = 0u; - copyRegion.imageSubresource.layerCount = lumaImage->getCreationParameters().arrayLayers; - copyRegion.imageOffset = { 0u,0u,0u }; - const auto extent = lumaImage->getMipSize(copyRegion.imageSubresource.mipLevel); - copyRegion.imageExtent = { extent.x,extent.y,extent.z }; - } - const uint32_t 
lastMipTexelCount = copyRegion.imageSubresource.layerCount*copyRegion.imageExtent.depth*copyRegion.imageExtent.height*copyRegion.imageExtent.width; - const uint32_t size = lastMipTexelCount*asset::getTexelOrBlockBytesize(lumaImage->getCreationParameters().format); - - // remember that without initializing the address to be allocated to invalid_address you won't get an allocation! - auto downloadStagingArea = m_driver->getDefaultDownStreamingBuffer(); - uint32_t address = std::remove_pointer::type::invalid_address; - // allocate - { - // common page size - const uint32_t alignment = 4096u; - const auto waitPoint = std::chrono::high_resolution_clock::now()+std::chrono::nanoseconds(timeoutInNanoSeconds); - auto unallocatedSize = downloadStagingArea->multi_alloc(waitPoint,1u,&address,&size,&alignment); - if (unallocatedSize) - { - os::Printer::log("Could not download the last luma mip map level from the GPU!", ELL_ERROR); - return core::nan(); - } - } - - // - copyRegion.bufferOffset = address; - m_driver->copyImageToBuffer(lumaImage.get(),downloadStagingArea->getBuffer(),1,©Region); - - // place and wait for download fence - { - auto downloadFence = m_driver->placeFence(true); - auto result = downloadFence->waitCPU(timeoutInNanoSeconds,true); - // - if (result==E_DRIVER_FENCE_RETVAL::EDFR_TIMEOUT_EXPIRED || result==E_DRIVER_FENCE_RETVAL::EDFR_FAIL) - { - os::Printer::log("Could not download the last luma mip map level from the GPU! 
Fence not Signalled!", ELL_ERROR); - downloadStagingArea->multi_free(1u,&address,&size,nullptr); - return core::nan(); - } - // then invalidate the CPU cache of the mapping - if (downloadStagingArea->needsManualFlushOrInvalidate()) - m_driver->invalidateMappedMemoryRanges({ {downloadStagingArea->getBuffer()->getBoundMemory(),address,size} }); - } - - // - { - const float* r32fData = reinterpret_cast(reinterpret_cast(downloadStagingArea->getBufferPointer())+address); - const auto avgVal = std::reduce(r32fData,r32fData+lastMipTexelCount)/float(lastMipTexelCount); - pdfNormalizationFactor = 1.0/(2.0*core::PI()*core::PI()*avgVal); - } - downloadStagingArea->multi_free(1u,&address,&size,nullptr); - } - - // Generate WarpMap - { - m_driver->bindComputePipeline(m_warpPipeline.get()); - m_driver->bindDescriptorSets(EPBP_COMPUTE,m_warpPipeline->getLayout(),0u,1u,&m_warpDS.get(),nullptr); - m_driver->dispatch(m_warpWorkgroups[0],m_warpWorkgroups[1],1); - COpenGLExtensionHandler::pGlMemoryBarrier(GL_ALL_BARRIER_BITS); // TODO: rethink when reimplementing in Vulkan - } - - return enableRIS; -} - - diff --git a/src/nbl/video/sampling/CEnvmapWarpGenerator.cpp b/src/nbl/video/sampling/CEnvmapWarpGenerator.cpp new file mode 100644 index 0000000000..3830f349d6 --- /dev/null +++ b/src/nbl/video/sampling/CEnvmapWarpGenerator.cpp @@ -0,0 +1,594 @@ +#include "nbl/video/sampling/CEnvmapWarpGenerator.h" +#include "nbl/builtin/hlsl/sampling/hierarchical_image/common.hlsl" +#include "nlohmann/detail/input/parser.hpp" + +using namespace nbl; +using namespace core; +using namespace video; +using namespace system; +using namespace asset; +using namespace hlsl; +using namespace nbl::hlsl::sampling::hierarchical_image; + +namespace nbl::video +{ + +class CEnvmapWarpGenerator; + +namespace +{ + constexpr std::string_view NBL_WORKING_DIRECTORY = "nbl/builtin/hlsl/sampling/hierarchical_image/"; + + core::smart_refctd_ptr createTexture(video::ILogicalDevice* device, const asset::VkExtent3D extent, 
E_FORMAT format, uint32_t mipLevels = 1u, uint32_t layers = 0u) + { + const auto realLayers = layers ? layers:1u; + + IGPUImage::SCreationParams imgParams; + imgParams.extent = extent; + imgParams.arrayLayers = realLayers; + imgParams.flags = static_cast(0); + imgParams.format = format; + imgParams.mipLevels = mipLevels; + imgParams.samples = IImage::ESCF_1_BIT; + imgParams.type = IImage::ET_2D; + imgParams.usage = IImage::EUF_STORAGE_BIT | IImage::EUF_TRANSFER_SRC_BIT | IImage::EUF_TRANSFER_DST_BIT | IImage::EUF_SAMPLED_BIT; + const auto image = device->createImage(std::move(imgParams)); + auto imageMemReqs = image->getMemoryReqs(); + imageMemReqs.memoryTypeBits &= device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); + device->allocate(imageMemReqs, image.get()); + + IGPUImageView::SCreationParams viewparams; + viewparams.subUsages = IImage::EUF_STORAGE_BIT | IImage::EUF_SAMPLED_BIT; + viewparams.flags = static_cast(0); + viewparams.format = format; + viewparams.image = std::move(image); + viewparams.viewType = layers ? 
IGPUImageView::ET_2D_ARRAY:IGPUImageView::ET_2D; + viewparams.subresourceRange.aspectMask = IImage::EAF_COLOR_BIT; + viewparams.subresourceRange.baseArrayLayer = 0u; + viewparams.subresourceRange.layerCount = realLayers; + viewparams.subresourceRange.baseMipLevel = 0u; + viewparams.subresourceRange.levelCount = mipLevels; + + return device->createImageView(std::move(viewparams)); + } + + core::smart_refctd_ptr getShaderSource( asset::IAssetManager* assetManager, std::string_view filePath, system::ILogger* logger) + { + IAssetLoader::SAssetLoadParams lparams = {}; + lparams.logger = logger; + lparams.workingDirectory = NBL_WORKING_DIRECTORY; + const auto filePathStr = std::string(filePath); + auto bundle = assetManager->getAsset(filePathStr, lparams); + if (bundle.getContents().empty() || bundle.getAssetType()!=IAsset::ET_SHADER) + { + const auto assetType = bundle.getAssetType(); + logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePathStr); + exit(-1); + } + auto firstAssetInBundle = bundle.getContents()[0]; + return smart_refctd_ptr_static_cast(firstAssetInBundle); + } +} + +core::smart_refctd_ptr CEnvmapWarpGenerator::create(SCreationParameters&& params) +{ + auto* const logger = params.utilities->getLogger(); + + if (!params.validate()) + { + logger->log("Failed creation parameters validation!", ILogger::ELL_ERROR); + return nullptr; + } + + const auto device = params.utilities->getLogicalDevice(); + + ConstructorParams constructorParams; + + const auto pipelineLayout = createPipelineLayout(device); + + constructorParams.genLumaPipeline = createPipeline(params, pipelineLayout.get(), "gen_luma.comp.hlsl"); + constructorParams.genWarpPipeline = createPipeline(params, pipelineLayout.get(), "gen_warp.comp.hlsl"); + + const auto descriptorPool = device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, pipelineLayout->getDescriptorSetLayouts()); + const auto descriptorSet = 
descriptorPool->createDescriptorSet(core::smart_refctd_ptr(pipelineLayout->getDescriptorSetLayouts()[0])); + + constructorParams.creationParams = std::move(params); + + return core::smart_refctd_ptr(new CEnvmapWarpGenerator(std::move(constructorParams))); +} + +core::smart_refctd_ptr CEnvmapWarpGenerator::createLumaMap(video::ILogicalDevice* device, asset::VkExtent3D extent, uint32_t mipCount, uint32_t layerCount, const std::string_view debugName) +{ + return createTexture(device, extent, EF_R32_SFLOAT, mipCount, layerCount); +} + +core::smart_refctd_ptr CEnvmapWarpGenerator::createWarpMap(video::ILogicalDevice* device, asset::VkExtent3D extent, uint32_t layerCount, const std::string_view debugName) +{ + return createTexture(device, extent, EF_R32G32_SFLOAT, 1u, layerCount); +} + +core::smart_refctd_ptr CEnvmapWarpGenerator::createDescriptorSetLayout(video::ILogicalDevice* device) +{ + const IGPUDescriptorSetLayout::SBinding bindings[] = { + // Gen luma input + { + .binding = 0u, + .type = nbl::asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1u + }, + // Gen luma output + { + .binding = 1u, + .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1u + }, + // Gen warp input + { + .binding = 2u, + .type = nbl::asset::IDescriptor::E_TYPE::ET_SAMPLED_IMAGE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1u, + }, + // Gen warp output + { + .binding = 3u, + .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1u + } + }; + return 
device->createDescriptorSetLayout(bindings); +} + +core::smart_refctd_ptr CEnvmapWarpGenerator::createPipelineLayout(video::ILogicalDevice* device) +{ + const auto dsLayout = createDescriptorSetLayout(device); + asset::SPushConstantRange pcRange = { + .stageFlags = hlsl::ESS_COMPUTE, + .offset = 0, + .size = std::max(sizeof(SLumaGenPushConstants), sizeof(SWarpGenPushConstants)) + }; + return device->createPipelineLayout({ &pcRange, 1 }, dsLayout); +} + +core::smart_refctd_ptr CEnvmapWarpGenerator::createPipeline(const SCreationParameters& params, const video::IGPUPipelineLayout* layout, std::string_view shaderPath) +{ + system::logger_opt_ptr logger = params.utilities->getLogger(); + auto system = smart_refctd_ptr(params.assetManager->getSystem()); + auto* device = params.utilities->getLogicalDevice(); + + const auto shaderSource = getShaderSource(params.assetManager.get(), shaderPath, logger.get()); + auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(system)); + CHLSLCompiler::SOptions options = {}; + options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; + options.preprocessorOptions.targetSpirvVersion = device->getPhysicalDevice()->getLimits().spirvVersion; + options.spirvOptimizer = nullptr; + +#ifndef _NBL_DEBUG + ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; + auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); + options.spirvOptimizer = opt.get(); +#else + options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT; +#endif + options.preprocessorOptions.sourceIdentifier = shaderSource->getFilepathHint(); + options.preprocessorOptions.logger = logger.get(); + options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder(); + + const auto overridenUnspecialized = compiler->compileToSPIRV(static_cast(shaderSource->getContent()->getPointer()), options); + const auto shader = device->compileShader({ overridenUnspecialized.get() }); + if (!shader) + { + logger.log("Could not 
compile shaders!", ILogger::ELL_ERROR); + return nullptr; + } + + video::IGPUComputePipeline::SCreationParams pipelineParams[1] = {}; + pipelineParams[0].layout = layout; + pipelineParams[0].shader = { .shader = shader.get(), .entryPoint = "main" }; + + smart_refctd_ptr pipeline; + params.utilities->getLogicalDevice()->createComputePipelines(nullptr, pipelineParams, &pipeline); + if (!pipeline) + { + logger.log("Could not create pipeline!", ILogger::ELL_ERROR); + return nullptr; + } + + return pipeline; +} + +core::smart_refctd_ptr CEnvmapWarpGenerator::createSession(core::smart_refctd_ptr&& envMap, uint16_t upscaleLog2) +{ + + const auto device = m_params.utilities->getLogicalDevice(); + + SSession::SCachedCreationParams sessionParams; + sessionParams.generator = core::smart_refctd_ptr(this); + sessionParams.envMap = std::move(envMap); + + const auto& envmapParams = sessionParams.envMap->getCreationParameters().image->getCreationParameters(); + const auto envmapExtent = envmapParams.extent; + const auto envmapLayers = envmapParams.arrayLayers; + + // we don't need the 1x1 mip for anything + const uint32_t mipCountLuminance = IImage::calculateFullMipPyramidLevelCount(envmapExtent,IImage::ET_2D) - 1; + const auto envmapPotExtent = [mipCountLuminance, envmapLayers]() -> asset::VkExtent3D + { + const uint32_t width = 0x1u << mipCountLuminance; + return { width, width >> 1u, envmapLayers }; + }(); + auto calcWorkgroupSize = [envmapLayers](const asset::VkExtent3D extent, const uint32_t workgroupDimension) -> uint32_t2 + { + return uint32_t2(extent.width - 1, extent.height - 1) / workgroupDimension + uint32_t2(envmapLayers); + }; + + sessionParams.genLumaWorkgroupCount = calcWorkgroupSize(envmapPotExtent, GenLumaWorkgroupDim); + sessionParams.lumaMap = createLumaMap(device, envmapPotExtent, mipCountLuminance, envmapLayers); + + const asset::VkExtent3D warpMapExtent = {envmapPotExtent.width << upscaleLog2, envmapPotExtent.height << upscaleLog2, envmapPotExtent.depth }; + 
sessionParams.genWarpWorkgroupCount = calcWorkgroupSize(warpMapExtent, GenWarpWorkgroupDim); + sessionParams.warpMap = createWarpMap(device, warpMapExtent, envmapLayers); + + const auto dsLayouts = m_genLumaPipeline->getLayout()->getDescriptorSetLayouts(); + const auto descriptorPool = device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, dsLayouts); + sessionParams.descriptorSet = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(dsLayouts[0])); + + IGPUDescriptorSet::SDescriptorInfo envMapDescriptorInfo; + envMapDescriptorInfo.desc = sessionParams.envMap; + envMapDescriptorInfo.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + + IGPUDescriptorSet::SDescriptorInfo lumaMapGeneralDescriptorInfo; + lumaMapGeneralDescriptorInfo.desc = sessionParams.lumaMap; + lumaMapGeneralDescriptorInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL; + + IGPUDescriptorSet::SDescriptorInfo lumaMapReadDescriptorInfo; + lumaMapReadDescriptorInfo.desc = sessionParams.lumaMap; + lumaMapReadDescriptorInfo.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + + IGPUDescriptorSet::SDescriptorInfo warpMapDescriptorInfo; + warpMapDescriptorInfo.desc = sessionParams.warpMap; + warpMapDescriptorInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL; + + auto* dsPtr = sessionParams.descriptorSet.get(); + const IGPUDescriptorSet::SWriteDescriptorSet writes[] = { + { + .dstSet = dsPtr, .binding = 0, .count = 1, .info = &envMapDescriptorInfo + }, + { + .dstSet = dsPtr, .binding = 1, .count = 1, .info = &lumaMapGeneralDescriptorInfo + }, + { + .dstSet = dsPtr, .binding = 2, .count = 1, .info = &lumaMapReadDescriptorInfo + }, + { + .dstSet = dsPtr, .binding = 3, .count = 1, .info = &warpMapDescriptorInfo + }, + }; + device->updateDescriptorSets(writes, {}); + + sessionParams.layerCount = envmapLayers; + + return make_smart_refctd_ptr(std::move(sessionParams)); +} + +void 
CEnvmapWarpGenerator::SSession::computeWarpMap(video::IGPUCommandBuffer* cmdBuf)
+{
+  // Records the whole warp-map generation into `cmdBuf`, in three stages:
+  //   1. "Gen Luma Map"   - compute dispatch writing mip 0 of the luma map,
+  //   2. luminance mips   - a blit chain filling mips 1..N-1 of the luma map,
+  //   3. "Gen Warp Map"   - compute dispatch reading the luma map and writing the warp map.
+  // Each stage is preceded by the image barriers that move the touched subresources into the
+  // layout that stage uses. On exit the luma map is in READ_ONLY_OPTIMAL and the warp map in GENERAL.
+  const auto lumaMapImage = m_params.lumaMap->getCreationParameters().image.get();
+  const auto lumaMapMipLevels = lumaMapImage->getCreationParameters().mipLevels;
+  const auto lumaMapExtent = lumaMapImage->getCreationParameters().extent;
+
+  const auto warpMapImage = m_params.warpMap->getCreationParameters().image.get();
+  const auto warpMapExtent = warpMapImage->getCreationParameters().extent;
+
+  const auto* genLumaPipeline = m_params.generator->getGenLumaPipeline();
+  const auto* genWarpPipeline = m_params.generator->getGenWarpPipeline();
+
+  // The single descriptor set is bound once, through the luma pipeline's layout, for both dispatches.
+  cmdBuf->bindDescriptorSets(EPBP_COMPUTE, genLumaPipeline->getLayout(),
+    0, 1, &m_params.descriptorSet.get());
+
+  // Gen Luma Map
+  {
+    // Previous contents of mip 0 are discarded (UNDEFINED) and it becomes writable by compute.
+    IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t barriers[] = {
+      {
+        .barrier = {
+          .dep = {
+            .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
+            .srcAccessMask = ACCESS_FLAGS::NONE,
+            .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+            .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
+          }
+        },
+        .image = lumaMapImage,
+        .subresourceRange = {
+          .aspectMask = IImage::EAF_COLOR_BIT,
+          .baseMipLevel = 0u,
+          .levelCount = 1,
+          .baseArrayLayer = 0u,
+          .layerCount = IGPUImageView::remaining_array_layers
+        },
+        .oldLayout = IImage::LAYOUT::UNDEFINED,
+        .newLayout = IImage::LAYOUT::GENERAL,
+      },
+    };
+    // This barrier must actually be recorded: without it the dispatch below writes an image that
+    // is still in UNDEFINED layout, and the next stage's GENERAL -> TRANSFER_SRC_OPTIMAL
+    // transition would name a layout the image never entered.
+    cmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = barriers });
+
+    SLumaGenPushConstants pcData = {};
+    pcData.lumaMapWidth = lumaMapExtent.width;
+    pcData.lumaMapHeight = lumaMapExtent.height;
+
+    cmdBuf->bindComputePipeline(genLumaPipeline);
+    cmdBuf->pushConstants(genLumaPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE,
+      0, sizeof(SLumaGenPushConstants), &pcData);
+    cmdBuf->dispatch(m_params.genLumaWorkgroupCount.x, m_params.genLumaWorkgroupCount.y, m_params.layerCount);
+  }
+
+  // Generate luminance mip map
+  {
+    IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t barriers[] = {
+      // mip 0: compute write -> blit source
+      {
+        .barrier = {
+          .dep = {
+            .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+            .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+            .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
+            .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT
+          }
+        },
+        .image = lumaMapImage,
+        .subresourceRange = {
+          .aspectMask = IImage::EAF_COLOR_BIT,
+          .baseMipLevel = 0u,
+          .levelCount = 1,
+          .baseArrayLayer = 0u,
+          .layerCount = IGPUImageView::remaining_array_layers,
+        },
+        .oldLayout = IImage::LAYOUT::GENERAL,
+        .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
+      },
+      // mips 1..N-1: contents discarded -> blit destination
+      {
+        .barrier = {
+          .dep = {
+            .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
+            .srcAccessMask = ACCESS_FLAGS::NONE,
+            .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
+            .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT
+          }
+        },
+        .image = lumaMapImage,
+        .subresourceRange = {
+          .aspectMask = IImage::EAF_COLOR_BIT,
+          .baseMipLevel = 1u,
+          .levelCount = lumaMapMipLevels - 1,
+          .baseArrayLayer = 0u,
+          .layerCount = IGPUImageView::remaining_array_layers
+        },
+        .oldLayout = IImage::LAYOUT::UNDEFINED,
+        .newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+      }
+    };
+    cmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = barriers });
+    const auto mipLevels = lumaMapMipLevels;
+    const auto extent = lumaMapExtent;
+    auto* image = lumaMapImage;
+    // Blit mip i into mip i+1, then flip mip i+1 from destination to source for the next iteration.
+    for (uint32_t srcMip_i = 0; srcMip_i < mipLevels-1; srcMip_i++)
+    {
+      const IGPUCommandBuffer::SImageBlit blit = {
+        .srcMinCoord = {0, 0, 0},
+        .srcMaxCoord = {extent.width >> (srcMip_i), extent.height >> (srcMip_i), 1},
+        .dstMinCoord = {0, 0, 0},
+        .dstMaxCoord = {extent.width >> (srcMip_i + 1), extent.height >> (srcMip_i + 1), 1},
+        .layerCount = IGPUImageView::remaining_array_layers,
+        .srcBaseLayer = 0,
+        .dstBaseLayer = 0,
+        .srcMipLevel = srcMip_i,
+        .dstMipLevel = srcMip_i + 1,
+        .aspectMask = IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT,
+      };
+      cmdBuf->blitImage(image, IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, image, IImage::LAYOUT::TRANSFER_DST_OPTIMAL, { &blit, 1 }, IGPUSampler::E_TEXTURE_FILTER::ETF_LINEAR);
+
+      // last mip no need to transition
+      // (it stays in TRANSFER_DST_OPTIMAL; the warp stage below transitions it from that layout)
+      if (srcMip_i + 1 == mipLevels - 1) break;
+
+      IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t barrier = {
+        .barrier = {
+          .dep = {
+            .srcStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
+            .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+            .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
+            .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT
+          }
+        },
+        .image = image,
+        .subresourceRange = {
+          .aspectMask = IImage::EAF_COLOR_BIT,
+          .baseMipLevel = srcMip_i + 1,
+          .levelCount = 1,
+          .baseArrayLayer = 0u,
+          .layerCount = IGPUImageView::remaining_array_layers
+        },
+        .oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+        .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
+      };
+      cmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} });
+
+    }
+  }
+
+  // Gen Warp Map
+  {
+    IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t barriers[] = {
+      // luma mips 0..N-2 were last used as blit sources
+      {
+        .barrier = {
+          .dep = {
+            .srcStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
+            .srcAccessMask = ACCESS_FLAGS::NONE,
+            .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+            .dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS
+          }
+        },
+        .image = lumaMapImage,
+        .subresourceRange = {
+          .aspectMask = IImage::EAF_COLOR_BIT,
+          .baseMipLevel = 0u,
+          .levelCount = lumaMapMipLevels - 1,
+          .baseArrayLayer = 0u,
+          .layerCount = IGPUImageView::remaining_array_layers,
+        },
+        .oldLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
+        .newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL,
+      },
+      // the last luma mip was only ever a blit destination
+      {
+        .barrier = {
+          .dep = {
+            .srcStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT,
+            .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT,
+            .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+            .dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS
+          }
+        },
+        .image = lumaMapImage,
+        .subresourceRange = {
+          .aspectMask = IImage::EAF_COLOR_BIT,
+          .baseMipLevel = lumaMapMipLevels - 1,
+          .levelCount = 1,
+          .baseArrayLayer = 0u,
+          .layerCount = IGPUImageView::remaining_array_layers
+        },
+        .oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+        .newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL,
+      },
+      // warp map: contents discarded, becomes writable by compute
+      {
+        .barrier = {
+          .dep = {
+            .srcStageMask = PIPELINE_STAGE_FLAGS::NONE,
+            .srcAccessMask = ACCESS_FLAGS::NONE,
+            .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+            .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS
+          }
+        },
+        .image = warpMapImage,
+        .subresourceRange = {
+          .aspectMask = IImage::EAF_COLOR_BIT,
+          .baseMipLevel = 0,
+          .levelCount = 1,
+          .baseArrayLayer = 0u,
+          .layerCount = IGPUImageView::remaining_array_layers
+        },
+        .oldLayout = IImage::LAYOUT::UNDEFINED,
+        .newLayout = IImage::LAYOUT::GENERAL,
+      }
+    };
+    cmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = barriers });
+
+    const SWarpGenPushConstants pcData = {
+      .lumaMapWidth = lumaMapExtent.width,
+      .lumaMapHeight = lumaMapExtent.height,
+      .warpMapWidth = warpMapExtent.width,
+      .warpMapHeight = warpMapExtent.height
+    };
+    cmdBuf->bindComputePipeline(genWarpPipeline);
+    cmdBuf->pushConstants(genWarpPipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE,
+      0, sizeof(SWarpGenPushConstants), &pcData);
+    cmdBuf->dispatch(m_params.genWarpWorkgroupCount.x, m_params.genWarpWorkgroupCount.y, m_params.layerCount);
+  }
+}
+
+// Builds the barrier that brings the env map from the caller-described previous usage
+// (`srcStageMask`, `srcAccessMask`, `oldLayout`) to compute-shader reads in READ_ONLY_OPTIMAL.
+// NOTE(review): the range covers array layer 0 only (layerCount = 1u) - confirm this is intended
+// for layered env maps.
+CEnvmapWarpGenerator::SSession::image_barrier_t CEnvmapWarpGenerator::SSession::getEnvMapPrevBarrier(core::bitflag srcStageMask, core::bitflag srcAccessMask, IGPUImage::LAYOUT oldLayout)
+{
+  return image_barrier_t{
+    .barrier = {
+      .dep = {
+        .srcStageMask = srcStageMask,
+        .srcAccessMask = srcAccessMask,
+        .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+        .dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS
+      }
+    },
+    .image = m_params.envMap->getCreationParameters().image.get(),
+    .subresourceRange = {
+      .aspectMask = IImage::EAF_COLOR_BIT,
+      .baseMipLevel = 0u,
+      .levelCount = IImageViewBase::remaining_mip_levels,
+      .baseArrayLayer = 0u,
+      .layerCount = 1u
+    },
+    .oldLayout = oldLayout,
+    .newLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL,
+
};
+}
+
+// Builds the barrier that releases the env map after the session's compute reads: from
+// COMPUTE_SHADER / READ_ONLY_OPTIMAL to the caller-chosen `dstStageMask`, `dstAccessMask`
+// and `newLayout`. No source access mask is needed because the compute stage only reads it.
+// NOTE(review): the range covers array layer 0 only (layerCount = 1u) while computeWarpMap
+// dispatches m_params.layerCount layers - confirm this is intended for layered env maps.
+CEnvmapWarpGenerator::SSession::image_barrier_t CEnvmapWarpGenerator::SSession::getEnvMapNextBarrier(core::bitflag dstStageMask, core::bitflag dstAccessMask, IGPUImage::LAYOUT newLayout)
+{
+  return image_barrier_t{
+    .barrier = {
+      .dep = {
+        .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+        .srcAccessMask = ACCESS_FLAGS::NONE,
+        .dstStageMask = dstStageMask,
+        .dstAccessMask = dstAccessMask
+      }
+    },
+    .image = m_params.envMap->getCreationParameters().image.get(),
+    .subresourceRange = {
+      .aspectMask = IImage::EAF_COLOR_BIT,
+      .baseMipLevel = 0u,
+      .levelCount = IImageViewBase::remaining_mip_levels,
+      .baseArrayLayer = 0u,
+      .layerCount = 1u
+    },
+    .oldLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL,
+    .newLayout = newLayout,
+  };
+}
+
+// Builds the two barriers that hand the session outputs over to the next consumer
+// (`dstStageMask`, `dstAccessMask`, `newLayout`), covering all mips and all array layers:
+//   [0] luma map - left in READ_ONLY_OPTIMAL by computeWarpMap and only read by its last stage,
+//   [1] warp map - written by the last compute dispatch of computeWarpMap.
+std::array CEnvmapWarpGenerator::SSession::getOutputMapNextBarrier(core::bitflag dstStageMask, core::bitflag dstAccessMask, IGPUImage::LAYOUT newLayout)
+{
+  return {
+    image_barrier_t {
+      .barrier = {
+        .dep = {
+          .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+          .srcAccessMask = ACCESS_FLAGS::NONE,
+          .dstStageMask = dstStageMask,
+          .dstAccessMask = dstAccessMask
+        }
+      },
+      .image = m_params.lumaMap->getCreationParameters().image.get(),
+      .subresourceRange = {
+        .aspectMask = IImage::EAF_COLOR_BIT,
+        .baseMipLevel = 0u,
+        .levelCount = IImageViewBase::remaining_mip_levels,
+        .baseArrayLayer = 0u,
+        .layerCount = IImageViewBase::remaining_array_layers,
+      },
+      .oldLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL,
+      .newLayout = newLayout,
+    },
+    image_barrier_t {
+      .barrier = {
+        .dep = {
+          .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+          .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+          .dstStageMask = dstStageMask,
+          .dstAccessMask = dstAccessMask
+        }
+      },
+      .image = m_params.warpMap->getCreationParameters().image.get(),
+      .subresourceRange = {
+        .aspectMask = IImage::EAF_COLOR_BIT,
+        .baseMipLevel = 0u,
+        .levelCount = IImageViewBase::remaining_mip_levels,
+        .baseArrayLayer = 0u,
+        .layerCount = IImageViewBase::remaining_array_layers,
+      },
+      // computeWarpMap only ever transitions the warp map UNDEFINED -> GENERAL, so GENERAL is
+      // the layout it is actually in here; the old layout of a barrier must match it.
+      .oldLayout = IImage::LAYOUT::GENERAL,
+      .newLayout = newLayout,
+    }
+  };
+}
+
+}