diff --git a/examples_tests b/examples_tests
index 5481e4e50a..dd53be0d4b 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 5481e4e50afa85ee384a35afbf89a294d5b55d60
+Subproject commit dd53be0d4b4ab2f41ebd2c958d1645d897fa9fc1
diff --git a/include/nbl/builtin/hlsl/algorithm.hlsl b/include/nbl/builtin/hlsl/algorithm.hlsl
index 66442a11a1..d5c1d9cba6 100644
--- a/include/nbl/builtin/hlsl/algorithm.hlsl
+++ b/include/nbl/builtin/hlsl/algorithm.hlsl
@@ -92,12 +92,12 @@ NBL_CONSTEXPR_FUNC void swap(NBL_REF_ARG(T) lhs, NBL_REF_ARG(T) rhs)
 namespace impl
 {
 
-template<class Accessor, class Comparator>
+template<class Accessor, class Comparator, bool IsUpper = false>
 struct bound_t
 {
-    static bound_t<Accessor,Comparator> setup(uint32_t begin, const uint32_t end, const typename Accessor::value_type _value, const Comparator _comp)
+    static bound_t<Accessor,Comparator,IsUpper> setup(uint32_t begin, const uint32_t end, const typename Accessor::value_type _value, const Comparator _comp)
     {
-        bound_t<Accessor,Comparator> retval;
+        bound_t<Accessor,Comparator,IsUpper> retval;
         retval.comp = _comp;
         retval.value = _value;
         retval.it = begin;
@@ -110,7 +110,7 @@ struct bound_t
     {
         if (isNPoT(len))
         {
-            const uint32_t newLen = 0x1u<<nbl::hlsl::findMSB(len);
+            const uint32_t newLen = 0x1u<<uint32_t(nbl::hlsl::findMSB(len));
             const uint32_t testPoint = it+(len-newLen);
             len = newLen;
             comp_step(accessor,testPoint);
@@ -132,9 +132,17 @@ struct bound_t
         comp_step(accessor,mid);
     }
 
+    bool compare(const typename Accessor::value_type lhs, const typename Accessor::value_type rhs)
+    {
+        if (IsUpper)
+            return !comp(rhs,lhs);
+        else
+            return comp(lhs,rhs);
+    }
+
     void comp_step(NBL_REF_ARG(Accessor) accessor, const uint32_t testPoint, const uint32_t rightBegin)
     {
-        if (comp(accessor[testPoint],value))
+        if (compare(accessor[testPoint],value))
             it = rightBegin;
     }
     void comp_step(NBL_REF_ARG(Accessor) accessor, const uint32_t testPoint)
@@ -148,36 +156,25 @@ struct bound_t
     uint32_t len;
 };
 
-template<class Accessor, class Comparator>
-struct lower_to_upper_comparator_transform_t
-{
-    bool operator()(const typename Accessor::value_type lhs, const typename Accessor::value_type rhs)
-    {
-        return !comp(rhs,lhs);
-    }
-
-    Comparator comp;
-};
-
 }
 
 
 template<class Accessor, class Comparator>
-uint32_t lower_bound(NBL_REF_ARG(Accessor) accessor, const uint32_t begin, const uint32_t end, const typename Accessor::value_type value, const Comparator comp)
+uint32_t lower_bound(NBL_REF_ARG(Accessor) accessor, const uint32_t begin, const uint32_t end, const typename Accessor::value_type value, NBL_REF_ARG(Comparator) comp)
 {
-    impl::bound_t<Accessor,Comparator> implementation = impl::bound_t<Accessor,Comparator>::setup(begin,end,value,comp);
-    return implementation(accessor);
+    impl::bound_t<Accessor,Comparator,false> implementation = impl::bound_t<Accessor,Comparator,false>::setup(begin,end,value,comp);
+    const uint32_t retval = implementation(accessor);
+    comp = implementation.comp;
+    return retval;
 }
 
 template<class Accessor, class Comparator>
-uint32_t upper_bound(NBL_REF_ARG(Accessor) accessor, const uint32_t begin, const uint32_t end, const typename Accessor::value_type value, const Comparator comp)
+uint32_t upper_bound(NBL_REF_ARG(Accessor) accessor, const uint32_t begin, const uint32_t end, const typename Accessor::value_type value, NBL_REF_ARG(Comparator) comp)
 {
-    //using TransformedComparator = impl::lower_to_upper_comparator_transform_t<Accessor,Comparator>;
-    //TransformedComparator transformedComparator;
-    
-    impl::lower_to_upper_comparator_transform_t<Accessor,Comparator> transformedComparator;
-    transformedComparator.comp = comp;
-    return lower_bound<Accessor,impl::lower_to_upper_comparator_transform_t<Accessor,Comparator> >(accessor,begin,end,value,transformedComparator);
+    impl::bound_t<Accessor,Comparator,true> implementation = impl::bound_t<Accessor,Comparator,true>::setup(begin,end,value,comp);
+    const uint32_t retval = implementation(accessor);
+    comp = implementation.comp;
+    return retval;
 }
 
 
diff --git a/include/nbl/builtin/hlsl/bxdf/base/lambertian.hlsl b/include/nbl/builtin/hlsl/bxdf/base/lambertian.hlsl
index a107921026..24f7b49dd2 100644
--- a/include/nbl/builtin/hlsl/bxdf/base/lambertian.hlsl
+++ b/include/nbl/builtin/hlsl/bxdf/base/lambertian.hlsl
@@ -37,16 +37,18 @@ struct SLambertianBase
     template<typename C=bool_constant<!IsBSDF> >
     enable_if_t<C::value && !IsBSDF, sample_type> generate(NBL_CONST_REF_ARG(anisotropic_interaction_type) interaction, const vector2_type u) NBL_CONST_MEMBER_FUNC
     {
+        typename sampling::ProjectedHemisphere<scalar_type>::cache_type cache;
         ray_dir_info_type L;
-        L.setDirection(sampling::ProjectedHemisphere<scalar_type>::generate(u));
+        L.setDirection(sampling::ProjectedHemisphere<scalar_type>::generate(u, cache));
         return sample_type::createFromTangentSpace(L, interaction.getFromTangentSpace());
     }
     template<typename C=bool_constant<IsBSDF> >
     enable_if_t<C::value && IsBSDF, sample_type> generate(NBL_CONST_REF_ARG(anisotropic_interaction_type) interaction, const vector3_type u) NBL_CONST_MEMBER_FUNC
     {
+        typename sampling::ProjectedSphere<scalar_type>::cache_type cache;
         vector3_type _u = u;
         ray_dir_info_type L;
-        L.setDirection(sampling::ProjectedSphere<scalar_type>::generate(_u));
+        L.setDirection(sampling::ProjectedSphere<scalar_type>::generate(_u, cache));
         return sample_type::createFromTangentSpace(L, interaction.getFromTangentSpace());
     }
     template<typename C=bool_constant<!IsBSDF> >
@@ -63,9 +65,9 @@ struct SLambertianBase
     scalar_type pdf(NBL_CONST_REF_ARG(sample_type) _sample, NBL_CONST_REF_ARG(isotropic_interaction_type) interaction) NBL_CONST_MEMBER_FUNC
     {
         NBL_IF_CONSTEXPR (IsBSDF)
-            return sampling::ProjectedSphere<scalar_type>::pdf(_sample.getNdotL(_clamp));
+            return sampling::ProjectedSphere<scalar_type>::backwardPdf(vector<scalar_type, 3>(0, 0, _sample.getNdotL(_clamp)));
         else
-            return sampling::ProjectedHemisphere<scalar_type>::pdf(_sample.getNdotL(_clamp));
+            return sampling::ProjectedHemisphere<scalar_type>::backwardPdf(vector<scalar_type, 3>(0, 0, _sample.getNdotL(_clamp)));
     }
     scalar_type pdf(NBL_CONST_REF_ARG(sample_type) _sample, NBL_CONST_REF_ARG(anisotropic_interaction_type) interaction) NBL_CONST_MEMBER_FUNC
     {
@@ -74,12 +76,15 @@ struct SLambertianBase
 
     quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(sample_type) _sample, NBL_CONST_REF_ARG(isotropic_interaction_type) interaction) NBL_CONST_MEMBER_FUNC
     {
-        sampling::quotient_and_pdf<monochrome_type, scalar_type> qp;
+        // only z component matters: ProjectedHemisphere::forwardPdf reads v.z
+        const vector3_type L = vector3_type(0, 0, _sample.getNdotL(_clamp));
+        typename sampling::ProjectedHemisphere<scalar_type>::cache_type cache;
+        scalar_type p;
         NBL_IF_CONSTEXPR (IsBSDF)
-            qp = sampling::ProjectedSphere<scalar_type>::template quotient_and_pdf(_sample.getNdotL(_clamp));
+            p = sampling::ProjectedSphere<scalar_type>::forwardPdf(L, cache);
         else
-            qp = sampling::ProjectedHemisphere<scalar_type>::template quotient_and_pdf(_sample.getNdotL(_clamp));
-        return quotient_pdf_type::create(qp.quotient[0], qp.pdf);
+            p = sampling::ProjectedHemisphere<scalar_type>::forwardPdf(L, cache);
+        return quotient_pdf_type::create(scalar_type(1.0), p);
     }
     quotient_pdf_type quotient_and_pdf(NBL_CONST_REF_ARG(sample_type) _sample, NBL_CONST_REF_ARG(anisotropic_interaction_type) interaction) NBL_CONST_MEMBER_FUNC
     {
diff --git a/include/nbl/builtin/hlsl/bxdf/base/oren_nayar.hlsl b/include/nbl/builtin/hlsl/bxdf/base/oren_nayar.hlsl
index d104842608..6412dc0fca 100644
--- a/include/nbl/builtin/hlsl/bxdf/base/oren_nayar.hlsl
+++ b/include/nbl/builtin/hlsl/bxdf/base/oren_nayar.hlsl
@@ -72,16 +72,18 @@ struct SOrenNayarBase
     template<typename C=bool_constant<!IsBSDF> >
     enable_if_t<C::value && !IsBSDF, sample_type> generate(NBL_CONST_REF_ARG(anisotropic_interaction_type) interaction, const vector2_type u) NBL_CONST_MEMBER_FUNC
     {
+        typename sampling::ProjectedHemisphere<scalar_type>::cache_type cache;
         ray_dir_info_type L;
-        L.setDirection(sampling::ProjectedHemisphere<scalar_type>::generate(u));
+        L.setDirection(sampling::ProjectedHemisphere<scalar_type>::generate(u, cache));
         return sample_type::createFromTangentSpace(L, interaction.getFromTangentSpace());
     }
     template<typename C=bool_constant<IsBSDF> >
     enable_if_t<C::value && IsBSDF, sample_type> generate(NBL_CONST_REF_ARG(anisotropic_interaction_type) interaction, const vector3_type u) NBL_CONST_MEMBER_FUNC
     {
+        typename sampling::ProjectedSphere<scalar_type>::cache_type cache;
         vector3_type _u = u;
         ray_dir_info_type L;
-        L.setDirection(sampling::ProjectedSphere<scalar_type>::generate(_u));
+        L.setDirection(sampling::ProjectedSphere<scalar_type>::generate(_u, cache));
         return sample_type::createFromTangentSpace(L, interaction.getFromTangentSpace());
     }
     template<typename C=bool_constant<!IsBSDF> >
@@ -98,9 +100,9 @@ struct SOrenNayarBase
     scalar_type pdf(NBL_CONST_REF_ARG(sample_type) _sample, NBL_CONST_REF_ARG(isotropic_interaction_type) interaction) NBL_CONST_MEMBER_FUNC
     {
         if (IsBSDF)
-            return sampling::ProjectedSphere<scalar_type>::pdf(_sample.getNdotL(_clamp));
+            return sampling::ProjectedSphere<scalar_type>::backwardPdf(vector<scalar_type, 3>(0, 0, _sample.getNdotL(_clamp)));
         else
-            return sampling::ProjectedHemisphere<scalar_type>::pdf(_sample.getNdotL(_clamp));
+            return sampling::ProjectedHemisphere<scalar_type>::backwardPdf(vector<scalar_type, 3>(0, 0, _sample.getNdotL(_clamp)));
     }
     scalar_type pdf(NBL_CONST_REF_ARG(sample_type) _sample, NBL_CONST_REF_ARG(anisotropic_interaction_type) interaction) NBL_CONST_MEMBER_FUNC
     {
diff --git a/include/nbl/builtin/hlsl/ieee754.hlsl b/include/nbl/builtin/hlsl/ieee754.hlsl
index 0663d89c0b..27857ae319 100644
--- a/include/nbl/builtin/hlsl/ieee754.hlsl
+++ b/include/nbl/builtin/hlsl/ieee754.hlsl
@@ -291,6 +291,41 @@ NBL_CONSTEXPR_FUNC bool isZero(T val)
 	return !(ieee754::impl::bitCastToUintType(val) & exponentAndMantissaMask);
 }
 
+// Returns the largest representable value less than `val`.
+// For positive values this decrements the bit representation; for negative values it increments.
+// Caller must guarantee val is finite and non-zero.
+template <bool CanBeNeg = true, typename T NBL_FUNC_REQUIRES(hlsl::is_floating_point_v<T>)
+NBL_CONSTEXPR_FUNC T nextDown(T val)
+{
+	using AsUint = typename unsigned_integer_of_size<sizeof(T)>::type;
+	using traits_t = traits<T>;
+
+	const AsUint bits = ieee754::impl::bitCastToUintType(val);
+
+	// positive: decrement; negative: increment
+	AsUint result;
+	if (CanBeNeg)
+	{
+		const bool isNegative = (bits & traits_t::signMask) != AsUint(0);
+		result = isNegative ? (bits + AsUint(1)) : (bits - AsUint(1));
+	}
+	else
+		result = bits - AsUint(1);
+	return impl::castBackToFloatType<T>(result);
+}
+
+// Returns the representable value nearest to `val` in the direction of zero.
+// For positive values this decrements the bit representation; for negative values it decrements (moving toward zero).
+// Caller must guarantee val is finite and non-zero.
+template <typename T NBL_FUNC_REQUIRES(hlsl::is_floating_point_v<T>)
+NBL_CONSTEXPR_FUNC T nextTowardZero(T val)
+{
+	using AsUint = typename unsigned_integer_of_size<sizeof(T)>::type;
+
+	const AsUint bits = ieee754::impl::bitCastToUintType(val);
+	return impl::castBackToFloatType<T>(bits - AsUint(1));
+}
+
 }
 }
 }
diff --git a/include/nbl/builtin/hlsl/math/angle_adding.hlsl b/include/nbl/builtin/hlsl/math/angle_adding.hlsl
index 5ab661facb..997d8be252 100644
--- a/include/nbl/builtin/hlsl/math/angle_adding.hlsl
+++ b/include/nbl/builtin/hlsl/math/angle_adding.hlsl
@@ -16,10 +16,10 @@ namespace hlsl
 namespace math
 {
 
-template <typename T>
+template <typename T, template<typename NBL_STRUCT_CONSTRAINABLE> class AcosHelper = tgmath_impl::acos_helper>
 struct sincos_accumulator
 {
-    using this_t = sincos_accumulator<T>;
+    using this_t = sincos_accumulator<T, AcosHelper>;
 
     static this_t create(T cosA)
     {
@@ -66,10 +66,29 @@ struct sincos_accumulator
         addAngle(cosA, sqrt<T>(T(1.0) - cosA * cosA));
     }
 
-    // TODO: rename to `getSumOfArccos`
-    T getSumofArccos()
+    T getSumOfArccos()
     {
-        return acos<T>(runningSum.real()) + wraparound;
+        return AcosHelper<T>::__call(runningSum.real()) + wraparound;
+    }
+
+    // Returns (sum of angles) - pi using the identity acos(x) - pi = -acos(-x).
+    // Avoids catastrophic cancellation when wraparound == 0 and sum ~ pi,
+    // where the naive `acos(~-1) - pi` subtracts two values both near pi.
+    // Note: when wraparound == pi (typical for spherical triangles with small
+    // excess e), the naive path `acos(cos(e))` is actually slightly more precise
+    // since it returns a small value directly without a near-pi cancellation.
+    T getSumOfArccosMinusPi()
+    {
+        return -AcosHelper<T>::__call(-runningSum.real()) + wraparound;
+    }
+
+    // Same as getSumOfArccosMinusPi but clamps the accumulated cosine to [-1,1]
+    // before calling acos. The angle addition in addAngle() uses sum-of-products
+    // which can push the accumulated cosine slightly outside the unit circle due
+    // to FP rounding, causing acos to return NaN on GPU where max(NaN,0)=0.
+    T getClampedSumOfArccosMinusPi()
+    {
+        return -AcosHelper<T>::__call(clamp<T>(-runningSum.real(), T(-1), T(1))) + wraparound;
     }
 
     complex_t<T> runningSum;
diff --git a/include/nbl/builtin/hlsl/math/fast_acos.hlsl b/include/nbl/builtin/hlsl/math/fast_acos.hlsl
new file mode 100644
index 0000000000..fea2fd6ca7
--- /dev/null
+++ b/include/nbl/builtin/hlsl/math/fast_acos.hlsl
@@ -0,0 +1,87 @@
+// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_MATH_FAST_ACOS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_MATH_FAST_ACOS_INCLUDED_
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/numbers.hlsl"
+#include "nbl/builtin/hlsl/tgmath.hlsl"
+#include "nbl/builtin/hlsl/concepts.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace math
+{
+// https://www.desmos.com/calculator/a59pbwgwof
+// a comparison between fast acos methods
+
+// Attempt 1: Polynomial approximation of acos(x) for x in [-1,1]
+// Based on an odd cubic fit: acos(x) ~ (a * x^2 + b) * x + pi/2
+// Max absolute error: ~9.8e-2 rad
+// Very fast: no sqrt, no branches
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct fast_acos_cubic
+{
+	static T __call(const T val)
+	{
+		return (T(-0.621565443625098) * val * val - T(0.837561827808302)) * val + numbers::pi<T> / T(2.0);
+	}
+};
+
+// Attempt 2: Abramowitz & Stegun formula 4.4.45 (adapted for full [-1,1] range)
+// acos(x) ~ sqrt(1-x) * (a0 + a1*x + a2*x^2 + a3*x^3) for x in [0,1]
+// For x in [-1,0]: acos(x) = pi - acos(-x)
+// Max absolute error: ~6.3e-5 rad
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct fast_acos_stegun_poly3
+{
+	static T __call(const T val)
+	{
+		const T ax = abs<T>(val);
+		const T s = sqrt<T>(T(1.0) - ax);
+		const T poly = ((T(-0.019771840941) * ax + T(0.075701735421)) * ax - T(0.212644584569)) * ax + T(1.570771931669); // not pi/2, free constant for better fit
+		const T result = s * poly;
+		return hlsl::select(val < T(0.0), numbers::pi<T> - result, result);
+	}
+};
+
+// Attempt 3: Degree-4 polynomial, good accuracy/cost tradeoff
+// Fitted as: acos(x) ~ sqrt(1-x) * (a0 + a1*x + a2*x^2 + a3*x^3 + a4*x^4) for x in [0,1]
+// Max absolute error: ~8.6e-6 rad
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct fast_acos_stegun_poly4
+{
+	static T __call(const T val)
+	{
+		const T ax = abs<T>(val);
+		const T s = sqrt<T>(T(1.0) - ax);
+		const T poly = (((T(0.0102831457) * ax - T(0.0387865708)) * ax + T(0.0864014439)) * ax - T(0.2144371342)) * ax + numbers::pi<T> / T(2.0);
+		const T result = s * poly;
+		return hlsl::select(val < T(0.0), numbers::pi<T> - result, result);
+	}
+};
+
+// Attempt 4: Degree-5 polynomial for best accuracy
+// Fitted as: acos(x) ~ sqrt(1-x) * (a0 + a1*x + a2*x^2 + a3*x^3 + a4*x^4 + a5*x^5) for x in [0,1]
+// Max absolute error: ~1.1e-6 rad
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct fast_acos_stegun_poly5
+{
+	static T __call(const T val)
+	{
+		const T ax = abs<T>(val);
+		const T s = sqrt<T>(T(1.0) - ax);
+		const T poly = ((((T(-0.0051108043) * ax + T(0.0211860706)) * ax - T(0.0464796661)) * ax + T(0.0883884911)) * ax - T(0.2145725267)) * ax + numbers::pi<T> / T(2.0);
+		const T result = s * poly;
+		return hlsl::select(val < T(0.0), numbers::pi<T> - result, result);
+	}
+};
+
+}
+}
+}
+
+#endif
diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl
index f7db44b9fb..3da7da0e71 100644
--- a/include/nbl/builtin/hlsl/math/functions.hlsl
+++ b/include/nbl/builtin/hlsl/math/functions.hlsl
@@ -96,9 +96,10 @@ scalar_type_t<T> lpNorm(NBL_CONST_REF_ARG(T) v)
 template <typename T NBL_FUNC_REQUIRES(concepts::FloatingPointLikeScalar<T>)
 void sincos(T theta, NBL_REF_ARG(T) s, NBL_REF_ARG(T) c)
 {
+    s = sin<T>(theta);
     c = cos<T>(theta);
-    s = sqrt<T>(T(NBL_FP64_LITERAL(1.0))-c*c);
-    s = ieee754::flipSign(s, theta < T(NBL_FP64_LITERAL(0.0)));
+    // s = sqrt<T>(T(NBL_FP64_LITERAL(1.0))-c*c);
+    // s = ieee754::flipSign(s, theta < T(NBL_FP64_LITERAL(0.0)));
 }
 
 template <typename T NBL_FUNC_REQUIRES(vector_traits<T>::Dimension == 3)
@@ -136,7 +137,7 @@ struct conditionalAbsOrMax_helper<T NBL_PARTIAL_REQ_BOT(concepts::FloatingPointL
         using UintOfTSize = unsigned_integer_of_size_t<sizeof(T)>;
         const T condAbs = bit_cast<T>(bit_cast<UintOfTSize>(x) & (cond ? (numeric_limits<UintOfTSize>::max >> 1) : numeric_limits<UintOfTSize>::max));
 
-        return max<T>(condAbs, limit);
+        return nbl::hlsl::max<T>(condAbs, limit);
     }
 };
 
@@ -156,7 +157,7 @@ struct conditionalAbsOrMax_helper<T NBL_PARTIAL_REQ_BOT(concepts::FloatingPointL
         const Uint32VectorWithDimensionOfT condAbsAsUint = xAsUintVec & mask;
         T condAbs = bit_cast<T, Uint32VectorWithDimensionOfT>(condAbsAsUint);
 
-        return max<T>(condAbs, limit);
+        return nbl::hlsl::max<T>(condAbs, limit);
     }
 };
 }
diff --git a/include/nbl/builtin/hlsl/path_tracing/gaussian_filter.hlsl b/include/nbl/builtin/hlsl/path_tracing/gaussian_filter.hlsl
index 6e27749405..8500474552 100644
--- a/include/nbl/builtin/hlsl/path_tracing/gaussian_filter.hlsl
+++ b/include/nbl/builtin/hlsl/path_tracing/gaussian_filter.hlsl
@@ -22,7 +22,7 @@ struct GaussianFilter
     {
         this_t retval;
         retval.truncation = hlsl::exp(-0.5 * gaussianFilterCutoff * gaussianFilterCutoff);
-        retval.boxMuller.stddev = stddev;
+        retval.boxMuller = sampling::BoxMullerTransform<scalar_type>::create(stddev);
         return retval;
     }
 
@@ -31,7 +31,8 @@ struct GaussianFilter
         vector2_type remappedRand = randVec;
         remappedRand.x *= 1.0 - truncation;
         remappedRand.x += truncation;
-        return boxMuller(remappedRand);
+        typename nbl::hlsl::sampling::BoxMullerTransform<scalar_type>::cache_type cache;
+        return boxMuller.generate(remappedRand, cache);
     }
 
     scalar_type truncation;
diff --git a/include/nbl/builtin/hlsl/path_tracing/unidirectional.hlsl b/include/nbl/builtin/hlsl/path_tracing/unidirectional.hlsl
index 0c47c48085..3c05b8e52d 100644
--- a/include/nbl/builtin/hlsl/path_tracing/unidirectional.hlsl
+++ b/include/nbl/builtin/hlsl/path_tracing/unidirectional.hlsl
@@ -104,15 +104,15 @@ struct Unidirectional
             // So we need to weigh the Delta lobes as if the MIS weight is always 1, but other areas regularly.
             // Meaning that eval's pdf should equal quotient's pdf , this way even the diffuse contributions coming from within a specular lobe get a MIS weight near 0 for NEE.
             // This stops a discrepancy in MIS weights and NEE mistakenly trying to add non-delta lobe contributions with a MIS weight > 0 and creating energy from thin air.
-            if (neeContrib.pdf > scalar_type(0.0))
+            if (neeContrib.pdf() > scalar_type(0.0))
             {
                 // TODO: we'll need an `eval_and_mis_weight` and `quotient_and_mis_weight`
                 const scalar_type bsdf_pdf = materialSystem.pdf(matID, nee_sample, interaction);
-                neeContrib.quotient *= materialSystem.eval(matID, nee_sample, interaction) * rcpChoiceProb;
-                if (neeContrib.pdf < bit_cast<scalar_type>(numeric_limits<scalar_type>::infinity))
+                neeContrib._quotient *= materialSystem.eval(matID, nee_sample, interaction) * rcpChoiceProb;
+                if (neeContrib.pdf() < bit_cast<scalar_type>(numeric_limits<scalar_type>::infinity))
                 {
-                    const scalar_type otherGenOverLightAndChoice = bsdf_pdf * rcpChoiceProb / neeContrib.pdf;
-                    neeContrib.quotient /= 1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice;   // balance heuristic
+                    const scalar_type otherGenOverLightAndChoice = bsdf_pdf * rcpChoiceProb / neeContrib.pdf();
+                    neeContrib._quotient /= 1.f + otherGenOverLightAndChoice * otherGenOverLightAndChoice;   // balance heuristic
                 }
 
                 const vector3_type origin = intersectP;
@@ -122,8 +122,8 @@ struct Unidirectional
                 nee_ray.template setInteraction<anisotropic_interaction_type>(interaction);
                 nee_ray.setT(t);
                 tolerance_method_type::template adjust<ray_type>(nee_ray, intersectData.getGeometricNormal(), depth);
-                if (getLuma(neeContrib.quotient) > lumaContributionThreshold)
-                    ray.addPayloadContribution(neeContrib.quotient * intersector_type::traceShadowRay(scene, nee_ray, ret.getLightObjectID()));
+                if (getLuma(neeContrib.quotient()) > lumaContributionThreshold)
+                    ray.addPayloadContribution(neeContrib.quotient() * intersector_type::traceShadowRay(scene, nee_ray, ret.getLightObjectID()));
             }
         }
 
@@ -140,8 +140,8 @@ struct Unidirectional
 
             // the value of the bsdf divided by the probability of the sample being generated
             quotient_pdf_type bsdf_quotient_pdf = materialSystem.quotient_and_pdf(matID, bsdf_sample, interaction, _cache);
-            throughput *= bsdf_quotient_pdf.quotient;
-            bxdfPdf = bsdf_quotient_pdf.pdf;
+            throughput *= bsdf_quotient_pdf.quotient();
+            bxdfPdf = bsdf_quotient_pdf.pdf();
             bxdfSample = bsdf_sample.getL().getDirection();
         }
 
diff --git a/include/nbl/builtin/hlsl/sampling/alias_table.hlsl b/include/nbl/builtin/hlsl/sampling/alias_table.hlsl
new file mode 100644
index 0000000000..5446511d18
--- /dev/null
+++ b/include/nbl/builtin/hlsl/sampling/alias_table.hlsl
@@ -0,0 +1,133 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+#include <nbl/builtin/hlsl/bit.hlsl>
+#include <nbl/builtin/hlsl/concepts/core.hlsl>
+#include <nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace sampling
+{
+
+// Alias Method (Vose/Walker) discrete sampler.
+//
+// Samples a discrete index in [0, N) with probability proportional to
+// precomputed weights in O(1) time per sample, using a prebuilt alias table.
+//
+// Accessor template parameters must satisfy GenericReadAccessor:
+//   accessor.template get<V, I>(index, outVal)  // void, writes to outVal
+//
+// - ProbabilityAccessor: reads scalar_type threshold in [0, 1] for bin i
+// - AliasIndexAccessor:  reads uint32_t redirect index for bin i
+// - PdfAccessor:         reads scalar_type weight[i] / totalWeight
+//
+// Satisfies TractableSampler (not BackwardTractableSampler: the mapping is discrete).
+// The cache stores the PDF value looked up during generate, avoiding redundant
+// storage of the codomain (sampled index) which is already the return value.
+template<typename T, typename Domain, typename Codomain, typename ProbabilityAccessor, typename AliasIndexAccessor, typename PdfAccessor
+	NBL_PRIMARY_REQUIRES(
+		concepts::UnsignedIntegralScalar<Codomain> &&
+		concepts::accessors::GenericReadAccessor<ProbabilityAccessor, T, Codomain> &&
+		concepts::accessors::GenericReadAccessor<AliasIndexAccessor, Codomain, Codomain> &&
+		concepts::accessors::GenericReadAccessor<PdfAccessor, T, Codomain>)
+struct AliasTable
+{
+	using scalar_type = T;
+
+	using domain_type = Domain;
+	using codomain_type = Codomain;
+	using density_type = scalar_type;
+	using weight_type = density_type;
+
+	struct cache_type
+	{
+		density_type pdf;
+	};
+
+	static AliasTable create(NBL_CONST_REF_ARG(ProbabilityAccessor) _probAccessor, NBL_CONST_REF_ARG(AliasIndexAccessor) _aliasAccessor, NBL_CONST_REF_ARG(PdfAccessor) _pdfAccessor, codomain_type _size)
+	{
+		AliasTable retval;
+		retval.probAccessor = _probAccessor;
+		retval.aliasAccessor = _aliasAccessor;
+		retval.pdfAccessor = _pdfAccessor;
+		// Precompute tableSize as float minus 1 ULP so that u=1.0 maps to bin N-1
+		const scalar_type exact = scalar_type(_size);
+		retval.tableSizeMinusUlp = nbl::hlsl::bit_cast<scalar_type>(nbl::hlsl::bit_cast<uint32_t>(exact) - 1u);
+		return retval;
+	}
+
+	// BasicSampler interface
+	codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC
+	{
+		const scalar_type scaled = u * tableSizeMinusUlp;
+		const codomain_type bin = codomain_type(scaled);
+		const scalar_type remainder = scaled - scalar_type(bin);
+
+		scalar_type prob;
+		probAccessor.template get<scalar_type, codomain_type>(bin, prob);
+
+		// Use if-statement to avoid select: aliasIndex is a dependent read
+		codomain_type result;
+		if (remainder < prob)
+		{
+			result = bin;
+		}
+		else
+		{
+			codomain_type alias;
+			aliasAccessor.template get<codomain_type, codomain_type>(bin, alias);
+			result = alias;
+		}
+
+		return result;
+	}
+
+	// TractableSampler interface
+	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		const codomain_type result = generate(u);
+		pdfAccessor.template get<scalar_type, codomain_type>(result, cache.pdf);
+		return result;
+	}
+
+	density_type forwardPdf(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		return cache.pdf;
+	}
+
+	weight_type forwardWeight(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		return cache.pdf;
+	}
+
+	density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC
+	{
+		scalar_type pdf;
+		pdfAccessor.template get<scalar_type, codomain_type>(v, pdf);
+		return pdf;
+	}
+
+	weight_type backwardWeight(const codomain_type v) NBL_CONST_MEMBER_FUNC
+	{
+		return backwardPdf(v);
+	}
+
+	ProbabilityAccessor probAccessor;
+	AliasIndexAccessor aliasAccessor;
+	PdfAccessor pdfAccessor;
+	scalar_type tableSizeMinusUlp;
+};
+
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
+
+#endif
diff --git a/include/nbl/builtin/hlsl/sampling/alias_table_builder.h b/include/nbl/builtin/hlsl/sampling/alias_table_builder.h
new file mode 100644
index 0000000000..d02d21488c
--- /dev/null
+++ b/include/nbl/builtin/hlsl/sampling/alias_table_builder.h
@@ -0,0 +1,92 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_BUILDER_H_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_BUILDER_H_INCLUDED_
+
+#include <cstdint>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace sampling
+{
+
+// Builds the alias table from an array of non-negative weights.
+// All output arrays must be pre-allocated to N entries.
+//
+// Parameters:
+//   weights         - input weights (non-negative, at least one must be > 0)
+//   N               - number of entries
+//   outProbability  - [out] alias table probability threshold per bin, in [0, 1]
+//   outAlias        - [out] alias redirect index per bin
+//   outPdf          - [out] normalized PDF per entry: weight[i] / sum(weights)
+//   workspace       - scratch buffer of N uint32_t entries
+template<typename T>
+struct AliasTableBuilder
+{
+	static void build(std::span<const T> weights, T* outProbability, uint32_t* outAlias, T* outPdf, uint32_t* workspace)
+	{
+		T totalWeight = T(0);
+      for (uint32_t i = 0; i < weights.size(); i++)
+			totalWeight += weights[i];
+
+		const T rcpTotalWeight = T(1) / totalWeight;
+
+		// Compute PDFs, scaled probabilities, and partition into small/large in one pass
+		uint32_t smallEnd = 0;
+      uint32_t largeBegin = weights.size();
+      for (uint32_t i = 0; i < weights.size(); i++)
+		{
+			outPdf[i] = weights[i] * rcpTotalWeight;
+         outProbability[i] = outPdf[i] * T(weights.size());
+
+			if (outProbability[i] < T(1))
+				workspace[smallEnd++] = i;
+			else
+				workspace[--largeBegin] = i;
+		}
+
+		// Pair small and large entries
+      while (smallEnd > 0 && largeBegin < weights.size())
+		{
+			const uint32_t s = workspace[--smallEnd];
+			const uint32_t l = workspace[largeBegin];
+
+			outAlias[s] = l;
+			// outProbability[s] already holds the correct probability for bin s
+
+			outProbability[l] -= (T(1) - outProbability[s]);
+
+			if (outProbability[l] < T(1))
+			{
+				// l became small: pop from large, push to small
+				largeBegin++;
+				workspace[smallEnd++] = l;
+			}
+			// else l stays in large (don't pop, reuse next iteration)
+		}
+
+		// Remaining entries (floating point rounding artifacts)
+		while (smallEnd > 0)
+		{
+			const uint32_t s = workspace[--smallEnd];
+			outProbability[s] = T(1);
+			outAlias[s] = s;
+		}
+      while (largeBegin < weights.size())
+		{
+			const uint32_t l = workspace[largeBegin++];
+			outProbability[l] = T(1);
+			outAlias[l] = l;
+		}
+	}
+};
+
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
+
+#endif
diff --git a/include/nbl/builtin/hlsl/sampling/basic.hlsl b/include/nbl/builtin/hlsl/sampling/basic.hlsl
index 9c575a22ce..c405275e55 100644
--- a/include/nbl/builtin/hlsl/sampling/basic.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/basic.hlsl
@@ -18,29 +18,29 @@ namespace sampling
 template<typename T NBL_PRIMARY_REQUIRES(concepts::FloatingPointLikeScalar<T>)
 struct PartitionRandVariable
 {
-    using floating_point_type = T;
-    using uint_type = unsigned_integer_of_size_t<sizeof(floating_point_type)>;
+	using floating_point_type = T;
+	using uint_type = unsigned_integer_of_size_t<sizeof(floating_point_type)>;
 
-    bool operator()(NBL_REF_ARG(floating_point_type) xi, NBL_REF_ARG(floating_point_type) rcpChoiceProb)
-    {
-        const floating_point_type NextULPAfterUnity = bit_cast<floating_point_type>(bit_cast<uint_type>(floating_point_type(1.0)) + uint_type(1u));
-        const bool pickRight = xi >= leftProb * NextULPAfterUnity;
+	bool operator()(NBL_REF_ARG(floating_point_type) xi, NBL_REF_ARG(floating_point_type) rcpChoiceProb)
+	{
+		const floating_point_type NextULPAfterUnity = bit_cast<floating_point_type>(bit_cast<uint_type>(floating_point_type(1.0)) + uint_type(1u));
+		const bool pickRight = xi >= leftProb * NextULPAfterUnity;
 
-        // This is all 100% correct taking into account the above NextULPAfterUnity
-        xi -= pickRight ? leftProb : floating_point_type(0.0);
+		// This is all 100% correct taking into account the above NextULPAfterUnity
+		xi -= pickRight ? leftProb : floating_point_type(0.0);
 
-        rcpChoiceProb = floating_point_type(1.0) / (pickRight ? (floating_point_type(1.0) - leftProb) : leftProb);
-        xi *= rcpChoiceProb;
+		rcpChoiceProb = floating_point_type(1.0) / (pickRight ? (floating_point_type(1.0) - leftProb) : leftProb);
+		xi *= rcpChoiceProb;
 
-        return pickRight;
-    }
+		return pickRight;
+	}
 
-    floating_point_type leftProb;
+	floating_point_type leftProb;
 };
 
 
-}
-}
-}
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
 
 #endif
diff --git a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
index e1ed03c5e9..35f1391930 100644
--- a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
@@ -7,6 +7,7 @@
 
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 #include <nbl/builtin/hlsl/limits.hlsl>
+#include <nbl/builtin/hlsl/tgmath.hlsl>
 #include <nbl/builtin/hlsl/sampling/linear.hlsl>
 
 namespace nbl
@@ -24,45 +25,83 @@ struct Bilinear
     using vector3_type = vector<T, 3>;
     using vector4_type = vector<T, 4>;
 
+    // BijectiveSampler concept types
+    using domain_type = vector2_type;
+    using codomain_type = vector2_type;
+    using density_type = scalar_type;
+    using weight_type = density_type;
+
+    struct cache_type
+    {
+        scalar_type normalizedStart;
+        typename Linear<T>::cache_type linearXCache;
+    };
+
     static Bilinear<T> create(const vector4_type bilinearCoeffs)
     {
         Bilinear<T> retval;
-        retval.bilinearCoeffs = bilinearCoeffs;
-        retval.bilinearCoeffDiffs = vector2_type(bilinearCoeffs[2]-bilinearCoeffs[0], bilinearCoeffs[3]-bilinearCoeffs[1]);
+        retval.yStarts = bilinearCoeffs.xy;
+        retval.yDiffs = vector2_type(bilinearCoeffs[2]-bilinearCoeffs[0], bilinearCoeffs[3]- bilinearCoeffs[1]);
         vector2_type twiceAreasUnderXCurve = vector2_type(bilinearCoeffs[0] + bilinearCoeffs[1], bilinearCoeffs[2] + bilinearCoeffs[3]);
-        retval.fourOverTwiceAreasUnderXCurveSum = scalar_type(4.0) / (twiceAreasUnderXCurve[0] + twiceAreasUnderXCurve[1]);
+        // Linear::create adds FLT_MIN internally, replicate here so both divisions share
+        // the same denominator (sum + 2*min), enabling CSE to merge them into one division
+        const scalar_type safeSum = twiceAreasUnderXCurve[0] + twiceAreasUnderXCurve[1] + scalar_type(2.0) * hlsl::numeric_limits<scalar_type>::min;
+        const scalar_type yNormFactor = scalar_type(2.0) / safeSum;
         retval.lineary = Linear<scalar_type>::create(twiceAreasUnderXCurve);
+        retval.normFactor = yNormFactor * scalar_type(2.0);
         return retval;
     }
 
-    vector2_type generate(const vector2_type _u)
+    codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
     {
-        vector2_type u;
-        u.y = lineary.generate(_u.y);
+        typename Linear<scalar_type>::cache_type linearYCache;
+
+        vector2_type p;
+        p.y = lineary.generate(u.y, linearYCache);
 
-        const vector2_type ySliceEndPoints = vector2_type(bilinearCoeffs[0] + u.y * bilinearCoeffDiffs[0], bilinearCoeffs[1] + u.y * bilinearCoeffDiffs[1]);
+        const vector2_type ySliceEndPoints = vector2_type(yStarts[0] + p.y * yDiffs[0], yStarts[1] + p.y * yDiffs[1]);
         Linear<scalar_type> linearx = Linear<scalar_type>::create(ySliceEndPoints);
-        u.x = linearx.generate(_u.x);
+        p.x = linearx.generate(u.x, cache.linearXCache);
 
-        return u;
+        // bilinear PDF = marginal_y_pdf * conditional_x_pdf; reuse both linear caches
+        const scalar_type yPdf = lineary.forwardPdf(u.y, linearYCache);
+        cache.normalizedStart = yPdf * linearx.linearCoeffStart;
+        cache.linearXCache.diffTimesX *= yPdf;
+        return p;
+    }
+
+    density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+    {
+        return cache.normalizedStart + cache.linearXCache.diffTimesX;
+    }
+
+    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+    {
+        return forwardPdf(u, cache);
+    }
+
+    density_type backwardPdf(const codomain_type p) NBL_CONST_MEMBER_FUNC
+    {
+        const vector2_type ySliceEndPoints = vector2_type(yStarts[0] + p.y * yDiffs[0], yStarts[1] + p.y * yDiffs[1]);
+        return nbl::hlsl::mix(ySliceEndPoints[0], ySliceEndPoints[1], p.x) * normFactor;
     }
 
-    scalar_type backwardPdf(const vector2_type u)
+    weight_type backwardWeight(const codomain_type p) NBL_CONST_MEMBER_FUNC
     {
-        const vector2_type ySliceEndPoints = vector2_type(bilinearCoeffs[0] + u.y * bilinearCoeffDiffs[0], bilinearCoeffs[1] + u.y * bilinearCoeffDiffs[1]);
-        return nbl::hlsl::mix(ySliceEndPoints[0], ySliceEndPoints[1], u.x) * fourOverTwiceAreasUnderXCurveSum;
+        return backwardPdf(p);
     }
 
-    // unit square: x0y0    x1y0
-    //              x0y1    x1y1
-    vector4_type bilinearCoeffs;    // (x0y0, x0y1, x1y0, x1y1)
-    vector2_type bilinearCoeffDiffs;
-    scalar_type fourOverTwiceAreasUnderXCurveSum;
+    // TODO: if backwardPdf is queried 3+ times, it pays to normalize yStarts/yDiffs by normFactor
+    // upfront in create (4 MULs), saving 1 MUL in lineary construction and 1 MUL per backwardPdf call.
+    // lineary could then be constructed directly (bypassing create) with the pre-normalized values.
+    vector2_type yStarts;
+    vector2_type yDiffs;
+    scalar_type normFactor;
     Linear<scalar_type> lineary;
 };
 
-}
-}
-}
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
 
 #endif
diff --git a/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl b/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl
index cdd87ee4dc..b9efd1ec35 100644
--- a/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl
@@ -7,6 +7,7 @@
 
 #include "nbl/builtin/hlsl/math/functions.hlsl"
 #include "nbl/builtin/hlsl/numbers.hlsl"
+#include "nbl/builtin/hlsl/sampling/value_and_pdf.hlsl"
 
 namespace nbl
 {
@@ -19,26 +20,82 @@ template<typename T NBL_PRIMARY_REQUIRES(concepts::FloatingPointLikeScalar<T>)
 struct BoxMullerTransform
 {
     using scalar_type = T;
-    using vector2_type = vector<T,2>;
+    using vector2_type = vector<T, 2>;
 
-    vector2_type operator()(const vector2_type xi)
+    // BackwardTractableSampler concept types
+    using domain_type = vector2_type;
+    using codomain_type = vector2_type;
+    using density_type = scalar_type;
+    using weight_type = density_type;
+
+    struct cache_type
+    {
+        vector2_type direction; // (cosPhi, sinPhi)
+    };
+
+    static BoxMullerTransform<T> create(const scalar_type _stddev)
+    {
+        BoxMullerTransform<T> retval;
+        retval.stddev = _stddev;
+        retval.halfRcpStddev2 = scalar_type(0.5) / (_stddev * _stddev);
+        return retval;
+    }
+
+    codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
     {
         scalar_type sinPhi, cosPhi;
-        math::sincos<scalar_type>(2.0 * numbers::pi<scalar_type> * xi.y - numbers::pi<scalar_type>, sinPhi, cosPhi);
-        return vector2_type(cosPhi, sinPhi) * nbl::hlsl::sqrt(-2.0 * nbl::hlsl::log(xi.x)) * stddev;
+        math::sincos<scalar_type>(scalar_type(2.0) * numbers::pi<scalar_type> * u.y - numbers::pi<scalar_type>, sinPhi, cosPhi);
+        cache.direction = vector2_type(cosPhi, sinPhi);
+        return cache.direction * nbl::hlsl::sqrt(scalar_type(-2.0) * nbl::hlsl::log(u.x)) * stddev;
     }
 
-    vector2_type backwardPdf(const vector2_type outPos)
+    density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
+        return halfRcpStddev2 * numbers::inv_pi<scalar_type> * u.x;
+    }
+
+    vector2_type separateForwardPdf(const cache_type cache) NBL_CONST_MEMBER_FUNC
+    {
+        const scalar_type normalization = nbl::hlsl::sqrt(halfRcpStddev2 * numbers::inv_pi<scalar_type>);
+        const vector2_type dir2 = cache.direction * cache.direction;
+        return normalization * vector2_type(
+            nbl::hlsl::pow(cache.u_x, dir2.x),
+            nbl::hlsl::pow(cache.u_x, dir2.y)
+        );
+    }
+
+    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+    {
+        return forwardPdf(u, cache);
+    }
+
+    density_type backwardPdf(const codomain_type outPos) NBL_CONST_MEMBER_FUNC
+    {
+        const scalar_type normalization = halfRcpStddev2 * numbers::inv_pi<scalar_type>;
+        return normalization * nbl::hlsl::exp(-halfRcpStddev2 * nbl::hlsl::dot(outPos, outPos));
+    }
+
+    vector2_type separateBackwardPdf(const codomain_type outPos) NBL_CONST_MEMBER_FUNC
+    {
+        const scalar_type normalization = nbl::hlsl::sqrt(halfRcpStddev2 * numbers::inv_pi<scalar_type>);
         const vector2_type outPos2 = outPos * outPos;
-        return vector2_type(nbl::hlsl::exp(scalar_type(-0.5) * (outPos2.x + outPos2.y)), numbers::pi<scalar_type> * scalar_type(0.5) * hlsl::atan2(outPos.y, outPos.x));
+        return vector2_type(
+            normalization * nbl::hlsl::exp(-halfRcpStddev2 * outPos2.x),
+            normalization * nbl::hlsl::exp(-halfRcpStddev2 * outPos2.y)
+        );
+    }
+
+    weight_type backwardWeight(const codomain_type outPos) NBL_CONST_MEMBER_FUNC
+    {
+        return backwardPdf(outPos);
     }
 
     T stddev;
+    T halfRcpStddev2;
 };
 
-}
-}
-}
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
 
 #endif
diff --git a/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl b/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl
index 841fc9ff2d..c9e5cac5d6 100644
--- a/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl
@@ -16,35 +16,102 @@ namespace hlsl
 namespace sampling
 {
 
+// Based on: Shirley & Chiu, "A Low Distortion Map Between Disk and Square", 1997
+// See also: Clarberg, "Fast Equal-Area Mapping of the (Hemi)Sphere using SIMD", 2008
+// http://fileadmin.cs.lth.se/graphics/research/papers/2008/simdmapping/clarberg_simdmapping08_preprint.pdf
 template<typename T>
-vector<T,2> concentricMapping(const vector<T,2> _u)
+struct ConcentricMapping
 {
-    //map [0;1]^2 to [-1;1]^2
-    vector<T,2> u = 2.0f * _u - hlsl::promote<vector<T,2> >(1.0);
-
-    vector<T,2> p;
-    if (hlsl::all<vector<bool,2> >(glsl::equal(u, hlsl::promote<vector<T,2> >(0.0))))
-        p = hlsl::promote<vector<T,2> >(0.0);
-    else
-    {
-        T r;
-        T theta;
-        if (abs<T>(u.x) > abs<T>(u.y)) {
-            r = u.x;
-            theta = 0.25 * numbers::pi<T> * (u.y / u.x);
-        } else {
-            r = u.y;
-            theta = 0.5 * numbers::pi<T> - 0.25 * numbers::pi<T> * (u.x / u.y);
-        }
-
-        p = r * vector<T,2>(cos<T>(theta), sin<T>(theta));
-    }
-
-    return p;
-}
-
-}
-}
-}
+   using scalar_type = T;
+   using vector2_type = vector<T, 2>;
+   using vector3_type = vector<T, 3>;
+   using vector4_type = vector<T, 4>;
+
+   // BijectiveSampler concept types
+   using domain_type = vector2_type;
+   using codomain_type = vector2_type;
+   using density_type = scalar_type;
+   using weight_type = density_type;
+
+   struct cache_type
+   {
+      scalar_type r2;
+   };
+
+   static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache)
+   {
+      // map [0,1]^2 to [-1,1]^2
+      const vector2_type centered = scalar_type(2) * u - hlsl::promote<vector2_type>(scalar_type(1));
+
+      const scalar_type a = centered.x;
+      const scalar_type b = centered.y;
+
+      // dominant axis selection
+      const bool cond = a * a > b * b;
+      const scalar_type dominant = hlsl::select(cond, a, b);
+      const scalar_type minor = hlsl::select(cond, b, a);
+
+      const scalar_type safe_dominant = dominant != scalar_type(0) ? dominant : scalar_type(0);
+      const scalar_type ratio = minor / safe_dominant;
+
+      const scalar_type angle = scalar_type(0.25) * numbers::pi<scalar_type> * ratio;
+      const scalar_type c = hlsl::cos<scalar_type>(angle);
+      const scalar_type s = hlsl::sin<scalar_type>(angle);
+
+      // final selection without branching
+      const scalar_type x = hlsl::select(cond, c, s);
+      const scalar_type y = hlsl::select(cond, s, c);
+
+      cache.r2 = dominant * dominant;
+      return dominant * vector2_type(x, y);
+   }
+
+   // Overload for BasicSampler
+   static codomain_type generate(domain_type u)
+   {
+      cache_type dummy;
+      return generate(u, dummy);
+   }
+
+   static domain_type generateInverse(const codomain_type p)
+   {
+      const scalar_type r = hlsl::sqrt(p.x * p.x + p.y * p.y);
+
+      const scalar_type ax = hlsl::abs<scalar_type>(p.x);
+      const scalar_type ay = hlsl::abs<scalar_type>(p.y);
+
+      // swapped = ay > ax
+      const bool swapped = ay > ax;
+
+      // branchless selection
+      const scalar_type num = hlsl::select(swapped, ax, ay);
+      const scalar_type denom = hlsl::select(swapped, ay, ax);
+
+      // angle in [0, pi/4]
+      const scalar_type phi = hlsl::atan2(num, denom);
+
+      const scalar_type minor_val = r * phi / (scalar_type(0.25) * numbers::pi<scalar_type>);
+
+      // reconstruct a,b using select instead of branching
+      const scalar_type a_base = hlsl::select(swapped, minor_val, r);
+      const scalar_type b_base = hlsl::select(swapped, r, minor_val);
+
+      const scalar_type a = ieee754::copySign(a_base, p.x);
+      const scalar_type b = ieee754::copySign(b_base, p.y);
+
+      return (vector2_type(a, b) + hlsl::promote<vector2_type>(scalar_type(1))) * scalar_type(0.5);
+   }
+
+   // The PDF of Shirley mapping is constant (1/PI on the unit disk)
+   static density_type forwardPdf(const domain_type u, cache_type cache) { return numbers::inv_pi<scalar_type>; }
+   static density_type backwardPdf(codomain_type v) { return numbers::inv_pi<scalar_type>; }
+
+   static weight_type forwardWeight(const domain_type u, cache_type cache) { return forwardPdf(u, cache); }
+   static weight_type backwardWeight(codomain_type v) { return backwardPdf(v); }
+};
+
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
 
 #endif
diff --git a/include/nbl/builtin/hlsl/sampling/concepts.hlsl b/include/nbl/builtin/hlsl/sampling/concepts.hlsl
new file mode 100644
index 0000000000..fa3fa778dc
--- /dev/null
+++ b/include/nbl/builtin/hlsl/sampling/concepts.hlsl
@@ -0,0 +1,279 @@
+#ifndef _NBL_BUILTIN_HLSL_SAMPLING_CONCEPTS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SAMPLING_CONCEPTS_INCLUDED_
+
+#include <nbl/builtin/hlsl/concepts.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace sampling
+{
+namespace concepts
+{
+
+// ============================================================================
+// SampleWithPDF
+//
+// Checks that a sample type bundles a value with its PDF.
+// ============================================================================
+
+// clang-format off
+#define NBL_CONCEPT_NAME SampleWithPDF
+#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)
+#define NBL_CONCEPT_TPLT_PRM_NAMES (T)
+#define NBL_CONCEPT_PARAM_0 (s, T)
+NBL_CONCEPT_BEGIN(1)
+#define s NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
+NBL_CONCEPT_END(
+	((NBL_CONCEPT_REQ_EXPR)(s.pdf()))
+    ((NBL_CONCEPT_REQ_EXPR)(s.value())));
+#undef s
+#include <nbl/builtin/hlsl/concepts/__end.hlsl>
+// clang-format on
+
+// ============================================================================
+// SampleWithRcpPDF
+//
+// Checks that a sample type bundles a value with its reciprocal PDF.
+// ============================================================================
+
+// clang-format off
+#define NBL_CONCEPT_NAME SampleWithRcpPDF
+#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)
+#define NBL_CONCEPT_TPLT_PRM_NAMES (T)
+#define NBL_CONCEPT_PARAM_0 (s, T)
+NBL_CONCEPT_BEGIN(1)
+#define s NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
+NBL_CONCEPT_END(
+	((NBL_CONCEPT_REQ_EXPR)(s.rcpPdf()))
+    ((NBL_CONCEPT_REQ_EXPR)(s.value())));
+#undef s
+#include <nbl/builtin/hlsl/concepts/__end.hlsl>
+// clang-format on
+
+// ============================================================================
+// SampleWithDensity
+//
+// A sample type that bundles a value with either its PDF or reciprocal PDF.
+// This is the disjunction of SampleWithPDF and SampleWithRcpPDF.
+// ============================================================================
+template<typename T>
+NBL_BOOL_CONCEPT SampleWithDensity = SampleWithPDF<T> || SampleWithRcpPDF<T>;
+
+// ============================================================================
+// BasicSampler
+//
+// The simplest sampler: maps domain -> codomain.
+//
+// Required types:
+//   domain_type     - the input space (e.g. float for 1D, float2 for 2D)
+//   codomain_type   - the output space (e.g. float3 for directions)
+//
+// Required methods:
+//   codomain_type generate(domain_type u)
+// ============================================================================
+
+// clang-format off
+#define NBL_CONCEPT_NAME BasicSampler
+#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)
+#define NBL_CONCEPT_TPLT_PRM_NAMES (T)
+#define NBL_CONCEPT_PARAM_0 (_sampler, T)
+#define NBL_CONCEPT_PARAM_1 (u, typename T::domain_type)
+NBL_CONCEPT_BEGIN(2)
+#define _sampler NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
+#define u NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
+NBL_CONCEPT_END(
+	((NBL_CONCEPT_REQ_TYPE)(T::domain_type))
+    ((NBL_CONCEPT_REQ_TYPE)(T::codomain_type))
+    ((NBL_CONCEPT_REQ_EXPR_RET_TYPE) ((_sampler.generate(u)), ::nbl::hlsl::is_same_v, typename T::codomain_type)));
+#undef u
+#undef _sampler
+#include <nbl/builtin/hlsl/concepts/__end.hlsl>
+// clang-format on
+
+// ============================================================================
+// TractableSampler
+//
+// A sampler whose density can be computed analytically in the forward
+// (sampling) direction. generate returns a codomain_type value and writes
+// intermediates to a cache_type out-param for later pdf evaluation.
+//
+// The cache_type out-param stores values derived during generate for
+// reuse by forwardPdf without redundant recomputation.
+//
+// For constant-pdf samplers, forwardPdf ignores both arguments.
+// For complex samplers, cache carries intermediate values computed
+// during generate and forwardPdf evaluates the pdf from those.
+//
+// Required types:
+//   domain_type, codomain_type, density_type, cache_type
+//
+// Required methods:
+//   codomain_type generate(domain_type u, out cache_type cache)
+//   density_type  forwardPdf(domain_type u, cache_type cache)
+// ============================================================================
+
+// clang-format off
+#define NBL_CONCEPT_NAME TractableSampler
+#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)
+#define NBL_CONCEPT_TPLT_PRM_NAMES (T)
+#define NBL_CONCEPT_PARAM_0 (_sampler, T)
+#define NBL_CONCEPT_PARAM_1 (u, typename T::domain_type)
+#define NBL_CONCEPT_PARAM_2 (v, typename T::codomain_type)
+#define NBL_CONCEPT_PARAM_3 (cache, typename T::cache_type)
+NBL_CONCEPT_BEGIN(4)
+#define _sampler NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
+#define u NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
+#define v NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2
+#define cache NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_3
+NBL_CONCEPT_END(
+	((NBL_CONCEPT_REQ_TYPE)(T::domain_type))
+    ((NBL_CONCEPT_REQ_TYPE)(T::codomain_type))
+    ((NBL_CONCEPT_REQ_TYPE)(T::density_type))
+    ((NBL_CONCEPT_REQ_TYPE)(T::cache_type))
+    ((NBL_CONCEPT_REQ_EXPR_RET_TYPE) ((_sampler.generate(u, cache)), ::nbl::hlsl::is_same_v, typename T::codomain_type))
+    ((NBL_CONCEPT_REQ_EXPR_RET_TYPE) ((_sampler.forwardPdf(u, cache)), ::nbl::hlsl::is_same_v, typename T::density_type)));
+#undef cache
+#undef v
+#undef u
+#undef _sampler
+#include <nbl/builtin/hlsl/concepts/__end.hlsl>
+// clang-format on
+
+// ============================================================================
+// ResamplableSampler
+//
+// A sampler with forward and backward importance weights, enabling use in
+// Multiple Importance Sampling (MIS) and Resampled Importance Sampling (RIS).
+//
+// Note: resampling does not require tractability - the weights need not be
+// normalized probability densities, so this concept is satisfied by
+// intractable samplers as well.
+//
+// Required types:
+//   domain_type  - the input space
+//   codomain_type - the output space
+//   cache_type   - stores intermediates from generate for forward weight reuse
+//   weight_type  - the type of the importance weight
+//
+// Required methods:
+//   codomain_type generate(domain_type u, out cache_type cache)
+//   weight_type   forwardWeight(domain_type u, cache_type cache)  - forward weight for MIS
+//   weight_type   backwardWeight(codomain_type v)  - backward weight for RIS
+// ============================================================================
+
+// clang-format off
+#define NBL_CONCEPT_NAME ResamplableSampler
+#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)
+#define NBL_CONCEPT_TPLT_PRM_NAMES (T)
+#define NBL_CONCEPT_PARAM_0 (_sampler, T)
+#define NBL_CONCEPT_PARAM_1 (u, typename T::domain_type)
+#define NBL_CONCEPT_PARAM_2 (v, typename T::codomain_type)
+#define NBL_CONCEPT_PARAM_3 (cache, typename T::cache_type)
+NBL_CONCEPT_BEGIN(4)
+#define _sampler NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
+#define u NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
+#define v NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2
+#define cache NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_3
+NBL_CONCEPT_END(
+	((NBL_CONCEPT_REQ_TYPE)(T::domain_type))
+    ((NBL_CONCEPT_REQ_TYPE)(T::codomain_type))
+    ((NBL_CONCEPT_REQ_TYPE)(T::cache_type))
+    ((NBL_CONCEPT_REQ_TYPE)(T::weight_type))
+    ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sampler.generate(u, cache)), ::nbl::hlsl::is_same_v, typename T::codomain_type))
+    ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sampler.forwardWeight(u, cache)), ::nbl::hlsl::is_same_v, typename T::weight_type))
+    ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sampler.backwardWeight(v)), ::nbl::hlsl::is_same_v, typename T::weight_type)));
+#undef cache
+#undef v
+#undef u
+#undef _sampler
+#include <nbl/builtin/hlsl/concepts/__end.hlsl>
+// clang-format on
+
+// ============================================================================
+// BackwardTractableSampler
+//
+// Extends TractableSampler with the ability to evaluate the PDF given
+// a codomain value (i.e. without knowing the original domain input).
+// The mapping need not be injective — multiple domain elements may map
+// to the same codomain output, and backwardPdf accounts for this
+// correctly (e.g. by summing contributions).
+//
+// Also exposes forward and backward importance weights for use in MIS
+// and RIS.
+//
+// Required types (in addition to TractableSampler):
+//   weight_type  - the type of the importance weight
+//
+// Required methods (in addition to TractableSampler):
+//   density_type backwardPdf(codomain_type v)   - evaluate pdf at codomain value v
+//   weight_type  forwardWeight(domain_type u, cache_type cache) - weight for MIS, reuses generate cache
+//   weight_type  backwardWeight(codomain_type v) - weight for RIS, evaluated at v
+// ============================================================================
+
+// clang-format off
+#define NBL_CONCEPT_NAME BackwardTractableSampler
+#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)
+#define NBL_CONCEPT_TPLT_PRM_NAMES (T)
+#define NBL_CONCEPT_PARAM_0 (_sampler, T)
+#define NBL_CONCEPT_PARAM_1 (u, typename T::domain_type)
+#define NBL_CONCEPT_PARAM_2 (v, typename T::codomain_type)
+#define NBL_CONCEPT_PARAM_3 (cache, typename T::cache_type)
+NBL_CONCEPT_BEGIN(4)
+#define _sampler NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
+#define u NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
+#define v NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2
+#define cache NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_3
+NBL_CONCEPT_END(
+	((NBL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT)(TractableSampler, T))
+    ((NBL_CONCEPT_REQ_TYPE)(T::weight_type))
+    ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sampler.backwardPdf(v)), ::nbl::hlsl::is_same_v, typename T::density_type))
+    ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sampler.forwardWeight(u, cache)), ::nbl::hlsl::is_same_v, typename T::weight_type))
+    ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sampler.backwardWeight(v)), ::nbl::hlsl::is_same_v, typename T::weight_type)));
+#undef cache
+#undef v
+#undef u
+#undef _sampler
+#include <nbl/builtin/hlsl/concepts/__end.hlsl>
+// clang-format on
+
+// ============================================================================
+// BijectiveSampler
+//
+// The mapping domain <-> codomain is bijective (1:1), so it can be
+// inverted. Extends BackwardTractableSampler with generateInverse.
+//
+// Because the mapping is bijective, the absolute value of the determinant
+// of the Jacobian matrix of the inverse equals the reciprocal of the
+// absolute value of the determinant of the Jacobian matrix of the forward
+// mapping (the Jacobian is an NxM matrix, not a scalar):
+//   backwardPdf(v) == 1.0 / forwardPdf(u, cache)  (where v == generate(u, cache))
+//
+// Required methods (in addition to BackwardTractableSampler):
+//   domain_type generateInverse(codomain_type v)
+// ============================================================================
+
+// clang-format off
+#define NBL_CONCEPT_NAME BijectiveSampler
+#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)
+#define NBL_CONCEPT_TPLT_PRM_NAMES (T)
+#define NBL_CONCEPT_PARAM_0 (_sampler, T)
+#define NBL_CONCEPT_PARAM_1 (v, typename T::codomain_type)
+NBL_CONCEPT_BEGIN(2)
+#define _sampler NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0
+#define v NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1
+NBL_CONCEPT_END(
+	((NBL_CONCEPT_REQ_TYPE_ALIAS_CONCEPT)(BackwardTractableSampler, T))
+    ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((_sampler.generateInverse(v)), ::nbl::hlsl::is_same_v, typename T::domain_type)));
+#undef v
+#undef _sampler
+#include <nbl/builtin/hlsl/concepts/__end.hlsl>
+// clang-format on
+
+} // namespace concepts
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
+
+#endif // _NBL_BUILTIN_HLSL_SAMPLING_CONCEPTS_INCLUDED_
diff --git a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
index ddbb961300..3a191f142f 100644
--- a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
@@ -7,7 +7,6 @@
 
 #include "nbl/builtin/hlsl/concepts.hlsl"
 #include "nbl/builtin/hlsl/sampling/concentric_mapping.hlsl"
-#include "nbl/builtin/hlsl/sampling/quotient_and_pdf.hlsl"
 
 namespace nbl
 {
@@ -19,72 +18,119 @@ namespace sampling
 template<typename T NBL_PRIMARY_REQUIRES(is_scalar_v<T>)
 struct ProjectedHemisphere
 {
-    using vector_t2 = vector<T, 2>;
-    using vector_t3 = vector<T, 3>;
-    
-    static vector_t3 generate(const vector_t2 _sample)
-    {
-        vector_t2 p = concentricMapping<T>(_sample * T(0.99999) + T(0.000005));
-        T z = hlsl::sqrt<T>(hlsl::max<T>(T(0.0), T(1.0) - p.x * p.x - p.y * p.y));
-        return vector_t3(p.x, p.y, z);
-    }
-
-    static T pdf(const T L_z)
-    {
-        return L_z * numbers::inv_pi<float>;
-    }
-
-    template<typename U=vector<T,1> >
-    static sampling::quotient_and_pdf<U, T> quotient_and_pdf(const T L)
-    {
-        return sampling::quotient_and_pdf<U, T>::create(hlsl::promote<U>(1.0), pdf(L));
-    }
-
-    template<typename U=vector<T,1> >
-    static sampling::quotient_and_pdf<U, T> quotient_and_pdf(const vector_t3 L)
-    {
-        return sampling::quotient_and_pdf<U, T>::create(hlsl::promote<U>(1.0), pdf(L.z));
-    }
+	using vector_t2 = vector<T, 2>;
+	using vector_t3 = vector<T, 3>;
+
+	// BijectiveSampler concept types
+	using scalar_type = T;
+	using domain_type = vector_t2;
+	using codomain_type = vector_t3;
+	using density_type = T;
+	using weight_type = density_type;
+
+	struct cache_type
+	{
+		scalar_type z;
+	};
+
+	static codomain_type __generate(const domain_type u)
+	{
+		vector_t2 p = ConcentricMapping<T>::generate(u * T(0.99999) + T(0.000005));
+		T z = hlsl::sqrt<T>(hlsl::max<T>(T(0.0), T(1.0) - p.x * p.x - p.y * p.y));
+		return vector_t3(p.x, p.y, z);
+	}
+
+	static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache)
+	{
+		const codomain_type retval = __generate(u);
+		cache.z = retval.z;
+		return retval;
+	}
+
+	static domain_type generateInverse(const codomain_type L)
+	{
+		return ConcentricMapping<T>::generateInverse(L.xy);
+	}
+
+	static density_type forwardPdf(const domain_type u, const cache_type cache)
+	{
+		return cache.z * numbers::inv_pi<T>;
+	}
+
+	static weight_type forwardWeight(const domain_type u, const cache_type cache)
+	{
+		return forwardPdf(u, cache);
+	}
+
+	static density_type backwardPdf(const codomain_type L)
+	{
+		assert(L.z > 0);
+		return L.z * numbers::inv_pi<T>;
+	}
+
+	static weight_type backwardWeight(const codomain_type L)
+	{
+		return backwardPdf(L);
+	}
 };
 
 template<typename T NBL_PRIMARY_REQUIRES(is_scalar_v<T>)
 struct ProjectedSphere
 {
-    using vector_t2 = vector<T, 2>;
-    using vector_t3 = vector<T, 3>;
-    using hemisphere_t = ProjectedHemisphere<T>;
-
-    static vector_t3 generate(NBL_REF_ARG(vector_t3) _sample)
-    {
-        vector_t3 retval = hemisphere_t::generate(_sample.xy);
-        const bool chooseLower = _sample.z > T(0.5);
-        retval.z = chooseLower ? (-retval.z) : retval.z;
-        if (chooseLower)
-            _sample.z -= T(0.5);
-        _sample.z *= T(2.0);
-        return retval;
-    }
-
-    static T pdf(T L_z)
-    {
-        return T(0.5) * hemisphere_t::pdf(L_z);
-    }
-
-    template<typename U=vector<T,1> >
-    static sampling::quotient_and_pdf<U, T> quotient_and_pdf(T L)
-    {
-        return sampling::quotient_and_pdf<U, T>::create(hlsl::promote<U>(1.0), pdf(L));
-    }
-
-    template<typename U=vector<T,1> >
-    static sampling::quotient_and_pdf<U, T> quotient_and_pdf(const vector_t3 L)
-    {
-        return sampling::quotient_and_pdf<U, T>::create(hlsl::promote<U>(1.0), pdf(L.z));
-    }
+	using vector_t2 = vector<T, 2>;
+	using vector_t3 = vector<T, 3>;
+	using hemisphere_t = ProjectedHemisphere<T>;
+
+	// BackwardTractableSampler concept types (not bijective: z input is consumed for hemisphere selection)
+	using scalar_type = T;
+	using domain_type = vector_t3;
+	using codomain_type = vector_t3;
+	using density_type = T;
+	using weight_type = density_type;
+
+	using cache_type = typename hemisphere_t::cache_type;
+
+	static codomain_type __generate(NBL_REF_ARG(domain_type) u)
+	{
+		vector_t3 retval = hemisphere_t::__generate(u.xy);
+		const bool chooseLower = u.z > T(0.5);
+		retval.z = chooseLower ? (-retval.z) : retval.z;
+		if (chooseLower)
+			u.z -= T(0.5);
+		u.z *= T(2.0);
+		return retval;
+	}
+
+	static codomain_type generate(NBL_REF_ARG(domain_type) u, NBL_REF_ARG(cache_type) cache)
+	{
+		const codomain_type retval = __generate(u);
+		cache.z = hlsl::abs(retval.z);
+		return retval;
+	}
+
+	static density_type forwardPdf(const domain_type u, const cache_type cache)
+	{
+		return T(0.5) * cache.z * numbers::inv_pi<T>;
+	}
+
+	static weight_type forwardWeight(const domain_type u, const cache_type cache)
+	{
+		return forwardPdf(u, cache);
+	}
+
+	static density_type backwardPdf(const codomain_type L)
+	{
+		return T(0.5) * hlsl::abs(L.z) * numbers::inv_pi<T>;
+	}
+
+	static weight_type backwardWeight(const codomain_type L)
+	{
+		return backwardPdf(L);
+	}
 };
 
-}
-}
-}
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
 
 #endif
diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
new file mode 100644
index 0000000000..1f176f9713
--- /dev/null
+++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
@@ -0,0 +1,139 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_SAMPLING_CUMULATIVE_PROBABILITY_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SAMPLING_CUMULATIVE_PROBABILITY_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+#include <nbl/builtin/hlsl/algorithm.hlsl>
+#include <nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace sampling
+{
+
+// Discrete sampler using cumulative probability lookup via upper_bound.
+//
+// Samples a discrete index in [0, N) with probability proportional to
+// precomputed weights in O(log N) time per sample.
+//
+// The cumulative probability array stores N-1 entries (the last bucket
+// is always 1.0 and need not be stored). Entry i holds the sum of
+// probabilities for indices [0, i].
+//
+// Satisfies TractableSampler and ResamplableSampler (not BackwardTractableSampler:
+// the mapping is discrete).
+template<typename T, typename Domain, typename Codomain, typename CumProbAccessor
+	NBL_PRIMARY_REQUIRES(concepts::accessors::GenericReadAccessor<CumProbAccessor, T, Codomain>)
+struct CumulativeProbabilitySampler
+{
+	using scalar_type = T;
+
+	using domain_type = Domain;
+	using codomain_type = Codomain;
+	using density_type = scalar_type;
+	using weight_type = density_type;
+
+	struct cache_type
+	{
+		density_type oneBefore;
+		density_type upperBound;
+	};
+
+	static CumulativeProbabilitySampler create(NBL_CONST_REF_ARG(CumProbAccessor) _cumProbAccessor, uint32_t _size)
+	{
+		CumulativeProbabilitySampler retval;
+		retval.cumProbAccessor = _cumProbAccessor;
+		retval.storedCount = _size - 1u;
+		return retval;
+	}
+
+	// BasicSampler interface
+	codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC
+	{
+		// upper_bound returns first index where cumProb > u
+		return hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
+	}
+
+	// TractableSampler interface
+	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		// #define NBL_CUMPROB_YOLO_READS
+#ifdef NBL_CUMPROB_YOLO_READS
+		// YOLO approach: re-read the array after binary search.
+		// The accessed elements are adjacent to the found index so the cache is warm.
+		const codomain_type result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
+		cache.oneBefore = density_type(0.0);
+		if (result)
+			cumProbAccessor.template get<density_type, codomain_type>(result - 1u, cache.oneBefore);
+		cache.upperBound = density_type(1.0);
+		if (result < storedCount)
+			cumProbAccessor.template get<density_type, codomain_type>(result, cache.upperBound);
+#else
+		// Tracking reads approach: stateful comparator captures CDF values during binary search.
+		struct CdfComparator
+		{
+			bool operator()(const density_type value, const density_type rhs)
+			{
+				const bool retval = value < rhs;
+				if (retval)
+					upperBound = rhs;
+				else
+					oneBefore = rhs;
+				return retval;
+			}
+
+			density_type oneBefore;
+			density_type upperBound;
+		} comp;
+		comp.oneBefore = density_type(0.0);
+		comp.upperBound = density_type(1.0);
+		const codomain_type result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u, comp);
+		cache.oneBefore = comp.oneBefore;
+		cache.upperBound = comp.upperBound;
+#endif
+		return result;
+	}
+
+	density_type forwardPdf(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		return cache.upperBound - cache.oneBefore;
+	}
+
+	weight_type forwardWeight(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		return forwardPdf(u, cache);
+	}
+
+	density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC
+	{
+		density_type retval = density_type(1.0);
+		if (v < storedCount)
+			cumProbAccessor.template get<density_type, codomain_type>(v, retval);
+		if (v)
+		{
+			density_type prev;
+			cumProbAccessor.template get<density_type, codomain_type>(v - 1u, prev);
+			retval -= prev;
+		}
+		return retval;
+	}
+
+	weight_type backwardWeight(const codomain_type v) NBL_CONST_MEMBER_FUNC
+	{
+		return backwardPdf(v);
+	}
+
+	CumProbAccessor cumProbAccessor;
+	uint32_t storedCount;
+};
+
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
+
+#endif
diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h b/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h
new file mode 100644
index 0000000000..a511fc2d8c
--- /dev/null
+++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h
@@ -0,0 +1,37 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_SAMPLING_CUMULATIVE_PROBABILITY_BUILDER_H_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SAMPLING_CUMULATIVE_PROBABILITY_BUILDER_H_INCLUDED_
+
+#include <numeric>
+#include <algorithm>
+#include <span>
+#include <cstdint>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace sampling
+{
+
+// Builds a normalized cumulative histogram from an array of non-negative weights.
+// Output has N-1 entries (last bucket implicitly 1.0).
+template<typename T>
+void computeNormalizedCumulativeHistogram(std::span<const T> weights, T* outCumProb)
+{
+	const auto N = weights.size();
+	if (N < 2)
+		return;
+	std::inclusive_scan(weights.begin(), weights.end() - 1, outCumProb);
+	const T normalizationFactor = T(1) / (outCumProb[N - 2] + weights[N - 1]);
+	std::for_each(outCumProb, outCumProb + N - 1, [normalizationFactor](T& v) { v *= normalizationFactor; });
+}
+
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
+
+#endif
diff --git a/include/nbl/builtin/hlsl/sampling/linear.hlsl b/include/nbl/builtin/hlsl/sampling/linear.hlsl
index 289ea75485..12602b4a79 100644
--- a/include/nbl/builtin/hlsl/sampling/linear.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/linear.hlsl
@@ -7,6 +7,7 @@
 
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 #include <nbl/builtin/hlsl/limits.hlsl>
+#include <nbl/builtin/hlsl/tgmath.hlsl>
 
 namespace nbl
 {
@@ -21,30 +22,83 @@ struct Linear
     using scalar_type = T;
     using vector2_type = vector<T, 2>;
 
+    // BackwardTractableSampler concept types
+    using domain_type = scalar_type;
+    using codomain_type = scalar_type;
+    using density_type = scalar_type;
+    using weight_type = density_type;
+
+    struct cache_type
+    {
+        scalar_type diffTimesX;
+    };
+
     static Linear<T> create(const vector2_type linearCoeffs)   // start and end importance values (start, end), assumed to be at x=0 and x=1
     {
         Linear<T> retval;
-        retval.linearCoeffStart = linearCoeffs[0];
-        retval.rcpDiff = 1.0 / (linearCoeffs[0] - linearCoeffs[1]);
-        vector2_type squaredCoeffs = linearCoeffs * linearCoeffs;
-        retval.squaredCoeffStart = squaredCoeffs[0];
-        retval.squaredCoeffDiff = squaredCoeffs[1] - squaredCoeffs[0];
+        // add min to both coefficients so (0,0) input produces a valid uniform sampler
+        // instead of inf normalization (2/0) leading to NaN; negligible for normal inputs
+        const vector2_type safeCoeffs = linearCoeffs + vector2_type(hlsl::numeric_limits<scalar_type>::min, hlsl::numeric_limits<scalar_type>::min);
+        // normalize coefficients so that the PDF is simply linearCoeffStart + linearCoeffDiff * x
+        const scalar_type normFactor = scalar_type(2.0) / (safeCoeffs[0] + safeCoeffs[1]);
+        const vector2_type normalized = safeCoeffs * normFactor;
+        retval.linearCoeffStart = normalized[0];
+        retval.linearCoeffEnd = normalized[1];
+        // precompute for the stable quadratic in generate()
+        retval.squaredCoeffStart = normalized[0] * normalized[0];
+        retval.twoTimesDiff = scalar_type(2.0) * (normalized[1] - normalized[0]);
         return retval;
     }
 
-    scalar_type generate(const scalar_type u)
+    codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+    {
+        // Inverse CDF via stable quadratic solver.
+        // CDF(x) = start*x + 0.5*diff*x^2 = u, with normalization start + 0.5*diff = 1.
+        // Quadratic (1-start)*x^2 + start*x - u = 0; since start >= 0 the stable root is
+        // x = 2u / (start + sqrt(start^2 + 2*diff*u)), which never cancels.
+        const scalar_type sqrtTerm = sqrt(squaredCoeffStart + twoTimesDiff * u);
+        const scalar_type denom = linearCoeffStart + sqrtTerm;
+        // NOTE: floating point can make x slightly > 1 when u~1 and diff < 0; callers needing
+        // non-negative PDF at the boundary should clamp with min(x, 1).
+        const codomain_type x = (u + u) / denom;
+        // diff*x == sqrtTerm - start algebraically (conjugate identity), saves 1 mul
+        cache.diffTimesX = sqrtTerm - linearCoeffStart;
+        return x;
+    }
+
+    density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+    {
+        return linearCoeffStart + cache.diffTimesX;
+    }
+
+    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+    {
+        return forwardPdf(u, cache);
+    }
+
+    // Alternative forms (since start + 0.5*diff == 1 after normalization):
+    //   start + (0.5 - start) * x
+    //   1 + diff * (x - 0.5)
+    // Not used because we already store start for generate().
+    density_type backwardPdf(const codomain_type x) NBL_CONST_MEMBER_FUNC
+    {
+        assert(x >= scalar_type(0.0) && x <= scalar_type(1.0));
+        return hlsl::mix(linearCoeffStart, linearCoeffEnd, x);
+    }
+
+    weight_type backwardWeight(const codomain_type x) NBL_CONST_MEMBER_FUNC
     {
-        return hlsl::mix(u, (linearCoeffStart - hlsl::sqrt(squaredCoeffStart + u * squaredCoeffDiff)) * rcpDiff, hlsl::abs(rcpDiff) < numeric_limits<scalar_type>::max);
+        return backwardPdf(x);
     }
 
     scalar_type linearCoeffStart;
-    scalar_type rcpDiff;
+    scalar_type linearCoeffEnd;
     scalar_type squaredCoeffStart;
-    scalar_type squaredCoeffDiff;
+    scalar_type twoTimesDiff;
 };
 
-}
-}
-}
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
 
 #endif
diff --git a/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl b/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl
new file mode 100644
index 0000000000..719eaf504f
--- /dev/null
+++ b/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl
@@ -0,0 +1,64 @@
+// Copyright (C) 2018-2026 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_SAMPLING_POLAR_MAPPING_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SAMPLING_POLAR_MAPPING_INCLUDED_
+
+#include "nbl/builtin/hlsl/tgmath.hlsl"
+#include "nbl/builtin/hlsl/numbers.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace sampling
+{
+
+template<typename T>
+struct PolarMapping
+{
+	using scalar_type = T;
+	using vector2_type = vector<T, 2>;
+
+	// BijectiveSampler concept types
+	using domain_type = vector2_type;
+	using codomain_type = vector2_type;
+	using density_type = scalar_type;
+	using weight_type = density_type;
+
+	struct cache_type {};
+
+	static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache)
+	{
+		const scalar_type r = hlsl::sqrt<scalar_type>(u.x);
+		const scalar_type phi = scalar_type(2) * numbers::pi<scalar_type> * u.y;
+		return vector2_type(r * hlsl::cos<scalar_type>(phi), r * hlsl::sin<scalar_type>(phi));
+	}
+
+	static codomain_type generate(const domain_type u)
+	{
+		cache_type dummy;
+		return generate(u, dummy);
+	}
+
+	static domain_type generateInverse(const codomain_type p)
+	{
+		const scalar_type r2 = p.x * p.x + p.y * p.y;
+		scalar_type phi = hlsl::atan2(p.y, p.x);
+		phi += hlsl::mix(scalar_type(0), scalar_type(2) * numbers::pi<scalar_type>, phi < scalar_type(0));
+		return vector2_type(r2, phi * (scalar_type(0.5) * numbers::inv_pi<scalar_type>));
+	}
+
+	static density_type forwardPdf(const domain_type u, cache_type cache) { return numbers::inv_pi<scalar_type>; }
+	static density_type backwardPdf(codomain_type v) { return numbers::inv_pi<scalar_type>; }
+
+	static weight_type forwardWeight(const domain_type u, cache_type cache) { return forwardPdf(u, cache); }
+	static weight_type backwardWeight(codomain_type v) { return backwardPdf(v); }
+};
+
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
+
+#endif
diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
new file mode 100644
index 0000000000..ac8fe6a54e
--- /dev/null
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
@@ -0,0 +1,163 @@
+// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_SAMPLING_PROJECTED_SPHERICAL_RECTANGLE_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SAMPLING_PROJECTED_SPHERICAL_RECTANGLE_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+#include <nbl/builtin/hlsl/limits.hlsl>
+#include <nbl/builtin/hlsl/math/functions.hlsl>
+#include <nbl/builtin/hlsl/sampling/bilinear.hlsl>
+#include <nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace sampling
+{
+
+// "Practical Warps" projected solid angle sampler for spherical rectangles.
+//
+// How it works:
+//   1. Build a bilinear patch from NdotL at each of the 4 corner directions
+//   2. Warp uniform [0,1]^2 through the bilinear to importance-sample NdotL
+//   3. Feed the warped UV into the solid angle sampler to get a rect offset
+//   4. PDF = (1/SolidAngle) * bilinearPdf
+//
+// Template parameter `UsePdfAsWeight`: when true (default), forwardWeight/backwardWeight
+// return the PDF instead of the projected-solid-angle MIS weight.
+// TODO: the projected-solid-angle MIS weight (UsePdfAsWeight=false) has been shown to be
+// poor in practice. Once confirmed by testing, remove the false path and stop storing
+// receiverNormal, receiverWasBSDF, and rcpProjSolidAngle as members.
+template<typename T, bool UsePdfAsWeight = true>
+struct ProjectedSphericalRectangle
+{
+	using scalar_type = T;
+	using vector2_type = vector<T, 2>;
+	using vector3_type = vector<T, 3>;
+	using vector4_type = vector<T, 4>;
+
+	// BackwardTractableSampler concept types
+	using domain_type = vector2_type;
+	using codomain_type = vector2_type;
+	using density_type = scalar_type;
+	using weight_type = density_type;
+
+	struct cache_type
+	{
+		scalar_type abs_cos_theta;
+		vector2_type warped;
+		typename Bilinear<scalar_type>::cache_type bilinearCache;
+	};
+
+	// NOTE: produces a degenerate (all-zero) bilinear patch when the receiver normal faces away
+	// from all four rectangle vertices, resulting in NaN PDFs (0 * inf). Callers must ensure
+	// at least one vertex has positive projection onto the receiver normal.
+	static ProjectedSphericalRectangle<T,UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
+	{
+		ProjectedSphericalRectangle<T,UsePdfAsWeight> retval;
+		const vector3_type n = hlsl::mul(shape.basis, _receiverNormal);
+		retval.localReceiverNormal = n;
+		retval.receiverWasBSDF = _receiverWasBSDF;
+
+		// Compute solid angle and get r0 in local frame (before z-flip)
+		const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
+		const vector3_type r0 = sa.r0;
+
+		// All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey
+		const scalar_type r1x = r0.x + shape.extents.x;
+		const scalar_type r1y = r0.y + shape.extents.y;
+
+		// Unnormalized dots: dot(corner_i, n)
+		const scalar_type base_dot = hlsl::dot(r0, n);
+		const scalar_type dx = shape.extents.x * n.x;
+		const scalar_type dy = shape.extents.y * n.y;
+		const vector4_type dots = vector4_type(base_dot, base_dot + dx, base_dot + dy, base_dot + dx + dy);
+
+		// Squared lengths of each corner
+		const scalar_type r0zSq = r0.z * r0.z;
+		const vector4_type lenSq = vector4_type(
+			r0.x * r0.x + r0.y * r0.y,
+			r1x * r1x + r0.y * r0.y,
+			r0.x * r0.x + r1y * r1y,
+			r1x * r1x + r1y * r1y
+		) + hlsl::promote<vector4_type>(r0zSq);
+
+		// dot(normalize(corner), n) = dot(corner, n) * rsqrt(lenSq)
+		// Bilinear corners: [0]=v00 [1]=v10 [2]=v01 [3]=v11
+		const scalar_type minimumProjSolidAngle = 0.0;
+		const vector4_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF,
+			dots * hlsl::rsqrt<vector4_type>(lenSq),
+			hlsl::promote<vector4_type>(minimumProjSolidAngle));
+		retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex);
+
+		// Reuse the already-computed solid_angle_type to avoid recomputing mul(basis, origin - observer)
+		retval.sphrect = SphericalRectangle<T>::create(sa, shape.extents);
+		retval.rcpSolidAngle = retval.sphrect.solidAngle > scalar_type(0.0) ? scalar_type(1.0) / retval.sphrect.solidAngle : scalar_type(0.0);
+
+		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+		{
+			const scalar_type projSA = shape.projectedSolidAngleFromLocal(r0, n);
+			retval.rcpProjSolidAngle = projSA > scalar_type(0.0) ? scalar_type(1.0) / projSA : scalar_type(0.0);
+		}
+		return retval;
+	}
+
+	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		Bilinear<scalar_type> bilinear = bilinearPatch;
+		cache.warped = bilinear.generate(u, cache.bilinearCache);
+		typename SphericalRectangle<scalar_type>::cache_type sphrectCache;
+		const vector2_type sampleOffset = sphrect.generate(cache.warped, sphrectCache);
+		cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache);
+		return sampleOffset;
+	}
+
+	density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+	{
+		return rcpSolidAngle * bilinearPatch.forwardPdf(u, cache.bilinearCache);
+	}
+
+	weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+	{
+		if (UsePdfAsWeight)
+			return forwardPdf(u, cache);
+		return cache.abs_cos_theta * rcpProjSolidAngle;
+	}
+
+	// `p` is the normalized [0,1]^2 position on the rectangle
+	density_type backwardPdf(const vector2_type p) NBL_CONST_MEMBER_FUNC
+	{
+		return rcpSolidAngle * bilinearPatch.backwardPdf(p);
+	}
+
+	weight_type backwardWeight(const vector2_type p) NBL_CONST_MEMBER_FUNC
+	{
+		NBL_IF_CONSTEXPR(UsePdfAsWeight)
+			return backwardPdf(p);
+		const scalar_type minimumProjSolidAngle = 0.0;
+		// Reconstruct local direction from normalized rect position
+		const vector3_type localDir = hlsl::normalize(sphrect.r0 + vector3_type(
+			p.x * (sphrect.r1.x - sphrect.r0.x),
+			p.y * (sphrect.r1.y - sphrect.r0.y),
+			scalar_type(0)
+		));
+		const scalar_type abs_cos_theta = math::conditionalAbsOrMax(receiverWasBSDF, hlsl::dot(localReceiverNormal, localDir), minimumProjSolidAngle);
+		return abs_cos_theta * rcpProjSolidAngle;
+	}
+
+	sampling::SphericalRectangle<T> sphrect;
+	Bilinear<scalar_type> bilinearPatch;
+	scalar_type rcpSolidAngle;
+	scalar_type rcpProjSolidAngle;
+	vector3_type localReceiverNormal;
+	bool receiverWasBSDF;
+};
+
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
+
+#endif
diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
index 116dbd3cd9..c4bc5fcea8 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
@@ -18,7 +18,20 @@ namespace hlsl
 namespace sampling
 {
 
-template<typename T>
+// "Practical Warps" projected solid angle sampler for spherical triangles.
+//
+// How it works:
+//   1. Build a bilinear patch from NdotL at each vertex direction
+//   2. Warp uniform [0,1]^2 through the bilinear to importance-sample NdotL
+//   3. Feed the warped UV into the solid angle sampler to get a direction
+//   4. PDF = (1/SolidAngle) * bilinearPdf
+//
+// Template parameter `UsePdfAsWeight`: when true (default), forwardWeight/backwardWeight
+// return the PDF instead of the projected-solid-angle MIS weight.
+// TODO: the projected-solid-angle MIS weight (UsePdfAsWeight=false) has been shown to be
+// poor in practice. Once confirmed by testing, remove the false path and stop storing
+// receiverNormal, receiverWasBSDF, and rcpProjSolidAngle as members.
+template<typename T, bool UsePdfAsWeight = true>
 struct ProjectedSphericalTriangle
 {
     using scalar_type = T;
@@ -26,61 +39,85 @@ struct ProjectedSphericalTriangle
     using vector3_type = vector<T, 3>;
     using vector4_type = vector<T, 4>;
 
-    static ProjectedSphericalTriangle<T> create(NBL_CONST_REF_ARG(shapes::SphericalTriangle<T>) tri)
-    {
-        ProjectedSphericalTriangle<T> retval;
-        retval.sphtri = sampling::SphericalTriangle<T>::create(tri);
-        return retval;
-    }
+    // BackwardTractableSampler concept types
+    using domain_type = vector2_type;
+    using codomain_type = vector3_type;
+    using density_type = scalar_type;
+    using weight_type = density_type;
 
-    vector4_type computeBilinearPatch(const vector3_type receiverNormal, bool isBSDF)
+    struct cache_type
     {
-        const scalar_type minimumProjSolidAngle = 0.0;
+        scalar_type abs_cos_theta;
+        vector2_type warped;
+        typename Bilinear<scalar_type>::cache_type bilinearCache;
+    };
+
+    // NOTE: produces a degenerate (all-zero) bilinear patch when the receiver normal faces away
+    // from all three triangle vertices, resulting in NaN PDFs (0 * inf). Callers must ensure
+    // at least one vertex has positive projection onto the receiver normal.
+    static ProjectedSphericalTriangle<T,UsePdfAsWeight> create(NBL_REF_ARG(shapes::SphericalTriangle<T>) shape, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
+    {
+        ProjectedSphericalTriangle<T,UsePdfAsWeight> retval;
+       retval.sphtri = SphericalTriangle<T, UsePdfAsWeight>::create(shape);
+        retval.receiverNormal = _receiverNormal;
+        retval.receiverWasBSDF = _receiverWasBSDF;
 
-        matrix<T, 3, 3> m = matrix<T, 3, 3>(sphtri.tri.vertices[0], sphtri.tri.vertices[1], sphtri.tri.vertices[2]);
-        const vector3_type bxdfPdfAtVertex = math::conditionalAbsOrMax(isBSDF, hlsl::mul(m, receiverNormal), hlsl::promote<vector3_type>(minimumProjSolidAngle));
+        const scalar_type minimumProjSolidAngle = 0.0;
+        matrix<T, 3, 3> m = matrix<T, 3, 3>(shape.vertices[0], shape.vertices[1], shape.vertices[2]);
+        const vector3_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF, hlsl::mul(m, _receiverNormal), hlsl::promote<vector3_type>(minimumProjSolidAngle));
+        retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex.yyxz);
 
-        return bxdfPdfAtVertex.yyxz;
+        const scalar_type projSA = shape.projectedSolidAngle(_receiverNormal);
+        retval.rcpProjSolidAngle = projSA > scalar_type(0.0) ? scalar_type(1.0) / projSA : scalar_type(0.0);
+        return retval;
     }
 
-    vector3_type generate(NBL_REF_ARG(scalar_type) rcpPdf, scalar_type solidAngle, scalar_type cos_c, scalar_type csc_b, const vector3_type receiverNormal, bool isBSDF, const vector2_type _u)
+    codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
     {
-        vector2_type u;
-        // pre-warp according to proj solid angle approximation
-        vector4_type patch = computeBilinearPatch(receiverNormal, isBSDF);
-        Bilinear<scalar_type> bilinear = Bilinear<scalar_type>::create(patch);
-        u = bilinear.generate(_u);
-
-        // now warp the points onto a spherical triangle
-        const vector3_type L = sphtri.generate(cos_c, csc_b, u);
-        rcpPdf = solidAngle / bilinear.backwardPdf(u);
-
+        Bilinear<scalar_type> bilinear = bilinearPatch;
+        cache.warped = bilinear.generate(u, cache.bilinearCache);
+        typename SphericalTriangle<scalar_type>::cache_type sphtriCache;
+        const vector3_type L = sphtri.generate(cache.warped, sphtriCache);
+        cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache);
         return L;
     }
 
-    vector3_type generate(NBL_REF_ARG(scalar_type) rcpPdf, const vector3_type receiverNormal, bool isBSDF, const vector2_type u)
+    density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        const scalar_type cos_c = sphtri.tri.cos_sides[2];
-        const scalar_type csc_b = sphtri.tri.csc_sides[1];
-        const scalar_type solidAngle = sphtri.tri.solidAngle();
-        return generate(rcpPdf, solidAngle, cos_c, csc_b, receiverNormal, isBSDF, u);
+        return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u, cache.bilinearCache);
     }
 
-    scalar_type backwardPdf(const vector3_type receiverNormal, bool receiverWasBSDF, const vector3_type L)
+    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        scalar_type pdf;
-        const vector2_type u = sphtri.generateInverse(pdf, L);
+        if (UsePdfAsWeight)
+            return forwardPdf(u, cache);
+        return cache.abs_cos_theta * rcpProjSolidAngle;
+    }
 
-        vector4_type patch = computeBilinearPatch(receiverNormal, receiverWasBSDF);
-        Bilinear<scalar_type> bilinear = Bilinear<scalar_type>::create(patch);
-        return pdf * bilinear.backwardPdf(u);
+    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
+    {
+        const vector2_type u = sphtri.generateInverse(L);
+        return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u);
+    }
+
+    weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
+    {
+        NBL_IF_CONSTEXPR(UsePdfAsWeight)
+            return backwardPdf(L);
+        const scalar_type minimumProjSolidAngle = 0.0;
+        const scalar_type abs_cos_theta = math::conditionalAbsOrMax(receiverWasBSDF, hlsl::dot(receiverNormal, L), minimumProjSolidAngle);
+        return abs_cos_theta * rcpProjSolidAngle;
     }
 
-    sampling::SphericalTriangle<T> sphtri;
+    sampling::SphericalTriangle<T, UsePdfAsWeight> sphtri;
+    Bilinear<scalar_type> bilinearPatch;
+    scalar_type rcpProjSolidAngle;
+    vector3_type receiverNormal;
+    bool receiverWasBSDF;
 };
 
-}
-}
-}
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
 
 #endif
diff --git a/include/nbl/builtin/hlsl/sampling/quotient_and_pdf.hlsl b/include/nbl/builtin/hlsl/sampling/quotient_and_pdf.hlsl
index 26a62ea617..7521048b37 100644
--- a/include/nbl/builtin/hlsl/sampling/quotient_and_pdf.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/quotient_and_pdf.hlsl
@@ -25,26 +25,29 @@ struct quotient_and_pdf
     static this_t create(const Q _quotient, const P _pdf)
     {
         this_t retval;
-        retval.quotient = _quotient;
-        retval.pdf = _pdf;
+        retval._quotient = _quotient;
+        retval._pdf = _pdf;
         return retval;
     }
 
     static this_t create(const scalar_q _quotient, const P _pdf)
     {
         this_t retval;
-        retval.quotient = hlsl::promote<Q>(_quotient);
-        retval.pdf = _pdf;
+        retval._quotient = hlsl::promote<Q>(_quotient);
+        retval._pdf = _pdf;
         return retval;
     }
 
-    Q value()
+    Q quotient() NBL_CONST_MEMBER_FUNC { return _quotient; }
+    P pdf() NBL_CONST_MEMBER_FUNC { return _pdf; }
+
+    Q value() NBL_CONST_MEMBER_FUNC
     {
-        return quotient*pdf;
+        return _quotient * _pdf;
     }
 
-    Q quotient;
-    P pdf;
+    Q _quotient;
+    P _pdf;
 };
 
 }
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
index 4c3f02e5f2..ed453a260d 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
@@ -8,7 +8,9 @@
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 #include <nbl/builtin/hlsl/limits.hlsl>
 #include <nbl/builtin/hlsl/math/functions.hlsl>
-#include <nbl/builtin/hlsl/shapes/spherical_triangle.hlsl>
+#include <nbl/builtin/hlsl/math/fast_acos.hlsl>
+#include <nbl/builtin/hlsl/ieee754.hlsl>
+#include <nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl>
 
 namespace nbl
 {
@@ -25,67 +27,100 @@ struct SphericalRectangle
     using vector3_type = vector<T, 3>;
     using vector4_type = vector<T, 4>;
 
-    static SphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) rect)
+    // BackwardTractableSampler concept types
+    using domain_type = vector2_type;
+    using codomain_type = vector2_type;
+    using density_type = scalar_type;
+    using weight_type = density_type;
+
+    struct cache_type {};
+
+    NBL_CONSTEXPR_STATIC_INLINE scalar_type ClampEps = 1e-5;
+
+    static SphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) rect, const vector3_type observer)
     {
-        SphericalRectangle<T> retval;
-        retval.rect = rect;
-        return retval;
+        return create(rect.solidAngle(observer), rect.extents);
     }
 
-    vector2_type generate(const vector3_type observer, const vector2_type uv, NBL_REF_ARG(scalar_type) S)
+    static SphericalRectangle<T> create(NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa, const vector2_type extents)
     {
-        vector3_type r0 = hlsl::mul(rect.basis, rect.origin - observer);
-        const vector4_type denorm_n_z = vector4_type(-r0.y, r0.x + rect.extents.x, r0.y + rect.extents.y, -r0.x);
-        const vector4_type n_z = denorm_n_z / hlsl::sqrt<vector4_type>(hlsl::promote<vector4_type>(r0.z * r0.z) + denorm_n_z * denorm_n_z);
-        const vector4_type cosGamma = vector4_type(
-            -n_z[0] * n_z[1],
-            -n_z[1] * n_z[2],
-            -n_z[2] * n_z[3],
-            -n_z[3] * n_z[0]
-        );
-
-        math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(cosGamma[0]);
-        angle_adder.addCosine(cosGamma[1]);
-        scalar_type p = angle_adder.getSumofArccos();
-        angle_adder = math::sincos_accumulator<scalar_type>::create(cosGamma[2]);
-        angle_adder.addCosine(cosGamma[3]);
-        scalar_type q = angle_adder.getSumofArccos();
-
-        const scalar_type k = scalar_type(2.0) * numbers::pi<scalar_type> - q;
-        const scalar_type b0 = n_z[0];
-        const scalar_type b1 = n_z[2];
-        S = p + q - scalar_type(2.0) * numbers::pi<scalar_type>;
-
-        const scalar_type CLAMP_EPS = 1e-5;
+        SphericalRectangle<T> retval;
+        retval.r0 = sa.r0;
+
+        retval.solidAngle = sa.value;
+        retval.b0 = sa.n_z[0];
+        retval.b0sq = retval.b0 * retval.b0;
+        retval.b1 = sa.n_z[2];
+
+        math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[2]);
+        angle_adder.addCosine(sa.cosGamma[3]);
+        retval.k = scalar_type(2.0) * numbers::pi<scalar_type> - angle_adder.getSumOfArccos();
 
         // flip z axis if r0.z > 0
-        r0.z = -hlsl::abs(r0.z);
-        vector3_type r1 = r0 + vector3_type(rect.extents.x, rect.extents.y, 0);
+        retval.r0.z = -hlsl::abs(retval.r0.z);
+        retval.r0zSq = retval.r0.z * retval.r0.z;
+        retval.r1 = vector2_type(retval.r0.x + extents.x, retval.r0.y + extents.y);
+        retval.rySq = vector2_type(retval.r0.y * retval.r0.y, retval.r1.y * retval.r1.y);
 
-        const scalar_type au = uv.x * S + k;
-        const scalar_type fu = (hlsl::cos<scalar_type>(au) * b0 - b1) / hlsl::sin<scalar_type>(au);
-        const scalar_type cu_2 = hlsl::max<scalar_type>(fu * fu + b0 * b0, 1.f); // forces `cu` to be in [-1,1]
-        const scalar_type cu = ieee754::flipSignIfRHSNegative<scalar_type>(scalar_type(1.0) / hlsl::sqrt<scalar_type>(cu_2), fu);
+        return retval;
+    }
 
-        scalar_type xu = -(cu * r0.z) / hlsl::sqrt<scalar_type>(scalar_type(1.0) - cu * cu);
+    codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+    {
+        const scalar_type au = u.x * solidAngle + k;
+        const scalar_type cos_au = hlsl::cos<scalar_type>(au);
+        const scalar_type numerator = b1 - cos_au * b0;
+        const scalar_type absNegFu = hlsl::abs(numerator) * hlsl::rsqrt<scalar_type>(scalar_type(1.0) - cos_au * cos_au);
+        const scalar_type rcpCu_2 = hlsl::max<scalar_type>(absNegFu * absNegFu + b0sq, scalar_type(1.0));
+        // sign(negFu) = sign(numerator) * sign(sin(au)); sin(au) < 0 iff au > PI
+        const scalar_type negFuSign = hlsl::select((au > numbers::pi<scalar_type>) != (numerator < scalar_type(0.0)), scalar_type(-1.0), scalar_type(1.0));
+        scalar_type xu = r0.z * negFuSign * hlsl::rsqrt<scalar_type>(rcpCu_2 - scalar_type(1.0));
         xu = hlsl::clamp<scalar_type>(xu, r0.x, r1.x); // avoid Infs
-        const scalar_type d_2 = xu * xu + r0.z * r0.z;
+        const scalar_type d_2 = xu * xu + r0zSq;
         const scalar_type d = hlsl::sqrt<scalar_type>(d_2);
 
-        const scalar_type h0 = r0.y / hlsl::sqrt<scalar_type>(d_2 + r0.y * r0.y);
-        const scalar_type h1 = r1.y / hlsl::sqrt<scalar_type>(d_2 + r1.y * r1.y);
-        const scalar_type hv = h0 + uv.y * (h1 - h0);
+        const scalar_type h0 = r0.y * hlsl::rsqrt<scalar_type>(d_2 + rySq[0]);
+        const scalar_type h1 = r1.y * hlsl::rsqrt<scalar_type>(d_2 + rySq[1]);
+        const scalar_type hv = h0 + u.y * (h1 - h0);
         const scalar_type hv2 = hv * hv;
-        const scalar_type yv = hlsl::mix(r1.y, (hv * d) / hlsl::sqrt<scalar_type>(scalar_type(1.0) - hv2), hv2 < scalar_type(1.0) - CLAMP_EPS);
+        const scalar_type yv = hlsl::mix(r1.y, (hv * d) / hlsl::sqrt<scalar_type>(scalar_type(1.0) - hv2), hv2 < scalar_type(1.0) - ClampEps);
 
-        return vector2_type((xu - r0.x) / rect.extents.x, (yv - r0.y) / rect.extents.y);
+        return vector2_type((xu - r0.x), (yv - r0.y));
+    }
+
+    density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+    {
+        return scalar_type(1.0) / solidAngle;
+    }
+
+    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+    {
+        return forwardPdf(u, cache);
+    }
+
+    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
+    {
+        return scalar_type(1.0) / solidAngle;
+    }
+
+    weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
+    {
+        return backwardPdf(L);
     }
 
-    shapes::SphericalRectangle<T> rect;
+    scalar_type solidAngle;
+    scalar_type k;
+    scalar_type b0;
+    scalar_type b0sq;
+    scalar_type b1;
+    scalar_type r0zSq;
+    vector3_type r0;
+    vector2_type r1;
+    vector2_type rySq;
 };
 
-}
-}
-}
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
 
 #endif
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
index 2b2dc0ccb6..6f29582e04 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
@@ -9,6 +9,7 @@
 #include <nbl/builtin/hlsl/limits.hlsl>
 #include <nbl/builtin/hlsl/math/functions.hlsl>
 #include <nbl/builtin/hlsl/math/quaternions.hlsl>
+#include <nbl/builtin/hlsl/math/fast_acos.hlsl>
 #include <nbl/builtin/hlsl/shapes/spherical_triangle.hlsl>
 
 namespace nbl
@@ -18,109 +19,311 @@ namespace hlsl
 namespace sampling
 {
 
-template<typename T>
-struct SphericalTriangle
+enum SphericalTriangleAlgorithm : uint16_t
 {
-    using scalar_type = T;
-    using vector2_type = vector<T, 2>;
-    using vector3_type = vector<T, 3>;
-
-    static SphericalTriangle<T> create(NBL_CONST_REF_ARG(shapes::SphericalTriangle<T>) tri)
-    {
-        SphericalTriangle<T> retval;
-        retval.tri = tri;
-        vector3_type cos_vertices, sin_vertices;
-        retval.solidAngle = tri.solidAngle(cos_vertices, sin_vertices);
-        retval.cosA = cos_vertices[0];
-        retval.sinA = sin_vertices[0];
-        return retval;
-    }
-
-    vector3_type generate(scalar_type cos_c, scalar_type csc_b, const vector2_type u)
-    {
-        scalar_type negSinSubSolidAngle,negCosSubSolidAngle;
-        math::sincos(solidAngle * u.x - numbers::pi<scalar_type>, negSinSubSolidAngle, negCosSubSolidAngle);
-
-        const scalar_type p = negCosSubSolidAngle * sinA - negSinSubSolidAngle * cosA;
-        const scalar_type q = -negSinSubSolidAngle * sinA - negCosSubSolidAngle * cosA;
-        
-        // TODO: we could optimize everything up and including to the first slerp, because precision here is just godawful
-        scalar_type u_ = q - cosA;
-        scalar_type v_ = p + sinA * cos_c;
-
-        // the slerps could probably be optimized by sidestepping `normalize` calls and accumulating scaling factors
-        vector3_type C_s = tri.vertices[0];
-        if (csc_b < numeric_limits<scalar_type>::max)
-        {
-            const scalar_type cosAngleAlongAC = ((v_ * q - u_ * p) * cosA - v_) / ((v_ * p + u_ * q) * sinA);
-            if (nbl::hlsl::abs(cosAngleAlongAC) < 1.f)
-                C_s += math::quaternion<scalar_type>::slerp_delta(tri.vertices[0], tri.vertices[2] * csc_b, cosAngleAlongAC);
-        }
-
-        vector3_type retval = tri.vertices[1];
-        const scalar_type cosBC_s = nbl::hlsl::dot(C_s, tri.vertices[1]);
-        const scalar_type csc_b_s = 1.0 / nbl::hlsl::sqrt(1.0 - cosBC_s * cosBC_s);
-        if (csc_b_s < numeric_limits<scalar_type>::max)
-        {
-            const scalar_type cosAngleAlongBC_s = nbl::hlsl::clamp(1.0 + cosBC_s * u.y - u.y, -1.f, 1.f);
-            if (nbl::hlsl::abs(cosAngleAlongBC_s) < 1.f)
-                retval += math::quaternion<scalar_type>::slerp_delta(tri.vertices[1], C_s * csc_b_s, cosAngleAlongBC_s);
-        }
-        return retval;
-    }
-
-    vector3_type generate(NBL_REF_ARG(scalar_type) rcpPdf, const vector2_type u)
-    {
-        const scalar_type cos_c = tri.cos_sides[2];
-        const scalar_type csc_b = tri.csc_sides[1];
-
-        rcpPdf = solidAngle;
-
-        return generate(cos_c, csc_b, u);
-    }
-
-    vector2_type generateInverse(NBL_REF_ARG(scalar_type) pdf, scalar_type cos_c, scalar_type csc_c, const vector3_type L)
-    {
-        pdf = 1.0 / solidAngle;
-
-        const scalar_type cosAngleAlongBC_s = nbl::hlsl::dot(L, tri.vertices[1]);
-        const scalar_type csc_a_ = nbl::hlsl::rsqrt(1.0 - cosAngleAlongBC_s * cosAngleAlongBC_s);
-        const scalar_type cos_b_ = nbl::hlsl::dot(L, tri.vertices[0]);
-
-        const scalar_type cosB_ = (cos_b_ - cosAngleAlongBC_s * cos_c) * csc_a_ * csc_c;
-        const scalar_type sinB_ = nbl::hlsl::sqrt(1.0 - cosB_ * cosB_);
-
-        const scalar_type cosC_ = sinA * sinB_* cos_c - cosA * cosB_;
-        const scalar_type sinC_ = nbl::hlsl::sqrt(1.0 - cosC_ * cosC_);
-
-        math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(cosA, sinA);
-        angle_adder.addAngle(cosB_, sinB_);
-        angle_adder.addAngle(cosC_, sinC_);
-        const scalar_type subTriSolidAngleRatio = (angle_adder.getSumofArccos() - numbers::pi<scalar_type>) * pdf;
-        const scalar_type u = subTriSolidAngleRatio > numeric_limits<scalar_type>::min ? subTriSolidAngleRatio : 0.0;
-
-        const scalar_type cosBC_s = (cosA + cosB_ * cosC_) / (sinB_ * sinC_);
-        const scalar_type v = (1.0 - cosAngleAlongBC_s) / (1.0 - (cosBC_s < bit_cast<float>(0x3f7fffff) ? cosBC_s : cos_c));
-
-        return vector2_type(u,v);
-    }
-
-    vector2_type generateInverse(NBL_REF_ARG(scalar_type) pdf, const vector3_type L)
-    {
-        const scalar_type cos_c = tri.cos_sides[2];
-        const scalar_type csc_c = tri.csc_sides[2];
-
-        return generateInverse(pdf, cos_c, csc_c, L);
-    }
-
-    shapes::SphericalTriangle<T> tri;
-    scalar_type solidAngle;
-    scalar_type cosA;
-    scalar_type sinA;
+   STA_ARVO = 0,
+   STA_PBRT = 1
 };
 
+template<typename T, bool Bijective = false, SphericalTriangleAlgorithm Algorithm = STA_ARVO>
+struct SphericalTriangle;
+
+namespace impl
+{
+
+// DifferenceOfProducts: a*b - c*d with Kahan FMA compensation
+template<typename T>
+T differenceOfProducts(T a, T b, T c, T d)
+{
+   const T cd = c * d;
+   const T dop = nbl::hlsl::fma(a, b, -cd);
+   const T err = nbl::hlsl::fma(-c, d, cd);
+   return dop + err;
 }
+
+// SumOfProducts: a*b + c*d with Kahan FMA compensation
+template<typename T>
+T sumOfProducts(T a, T b, T c, T d)
+{
+   const T cd = c * d;
+   const T sop = nbl::hlsl::fma(a, b, cd);
+   const T err = nbl::hlsl::fma(c, d, -cd);
+   return sop + err;
 }
-}
+
+} // namespace impl
+
+// Non-bijective: Resamplable & Tractable (generate + pdf/weight, no inverse)
+template<typename T, SphericalTriangleAlgorithm Algorithm>
+struct SphericalTriangle<T, false, Algorithm>
+{
+   using scalar_type = T;
+   using vector2_type = vector<T, 2>;
+   using vector3_type = vector<T, 3>;
+
+   using domain_type = vector2_type;
+   using codomain_type = vector3_type;
+   using density_type = scalar_type;
+   using weight_type = density_type;
+
+   struct cache_type
+   {
+   };
+
+   static SphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle<T>) tri)
+   {
+      SphericalTriangle retval;
+      retval.rcpSolidAngle = scalar_type(1.0) / tri.solid_angle;
+      retval.tri_vertices[0] = tri.vertices[0];
+      retval.tri_vertices[1] = tri.vertices[1];
+      retval.triCosC = tri.cos_sides[2];
+      // precompute great circle normal of arc AC: cross(A,C) has magnitude sin(b),
+      // so multiplying by csc(b) normalizes it; zero when side AC is degenerate
+      const scalar_type cscB = tri.csc_sides[1];
+      const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscB < numeric_limits<scalar_type>::max, cscB, scalar_type(0));
+      retval.e_C = hlsl::cross(arcACPlaneNormal, tri.vertices[0]);
+      retval.cosA = tri.cos_vertices[0];
+      retval.sinA = tri.sin_vertices[0];
+      if (Algorithm == STA_ARVO)
+      {
+         retval.sinA_triCosC = retval.sinA * retval.triCosC;
+         retval.eCdotB = hlsl::dot(retval.e_C, tri.vertices[1]);
+      }
+      return retval;
+   }
+
+   codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      // Step 1: compute sin/cos of A_hat and the angle difference (A_hat - alpha)
+      const scalar_type A_hat = u.x / rcpSolidAngle;
+      scalar_type sinA_hat, cosA_hat;
+      math::sincos(A_hat, sinA_hat, cosA_hat);
+      const scalar_type s = sinA_hat * cosA - cosA_hat * sinA; // sin(A_hat - alpha)
+      const scalar_type t = cosA_hat * cosA + sinA_hat * sinA; // cos(A_hat - alpha)
+
+      // Step 2: compute cos(b') and sin(b') -- arc from A to the new vertex C'
+      scalar_type cosBp, sinBp;
+      if (Algorithm == STA_ARVO) // faster than PBRT
+      {
+         const scalar_type u_ = t - cosA;
+         const scalar_type v_ = s + sinA_triCosC;
+         const scalar_type num = (v_ * t - u_ * s) * cosA - v_;
+         const scalar_type denum = (v_ * s + u_ * t) * sinA;
+
+#if ACCURATE
+         // sqrt(1 - cosBp^2) loses precision when cosBp ~ 1 (small u.x).
+         // Use stable factorization: sinBp = sqrt((denum-num)(denum+num)) / |denum|
+         // where denum-num = sinA*(1+triCosC)*(1-cosA_hat).
+
+         // For large triangles with high u.x, cosA_hat can approach -1,
+         // making (1 + cosA_hat) near zero and the division unstable.
+         // Use the algebraic identity only when cosA_hat > 0 (safe denominator).
+         const scalar_type rcpDenum = scalar_type(1) / denum;
+         const scalar_type oneMinusCosAhat = hlsl::select(cosA_hat > scalar_type(0), sinA_hat * sinA_hat / (scalar_type(1) + cosA_hat), scalar_type(1) - cosA_hat);
+         const scalar_type DminusN = sinA * (scalar_type(1) + triCosC) * oneMinusCosAhat;
+         sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), DminusN * (denum + num))) * nbl::hlsl::abs(rcpDenum);
+         cosBp = scalar_type(1) - DminusN * rcpDenum;
+#else // 17% faster, less accurate
+         cosBp = num / denum;
+         sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - cosBp * cosBp));
+#endif
+      }
+      else // STA_PBRT, accurate, slowest
+      {
+         // PBRT uses cosPhi = -t, sinPhi = -s (pi offset from Arvo's A_hat)
+         const scalar_type k1 = -t + cosA;
+         const scalar_type k2 = -s - sinA * triCosC;
+         cosBp = (k2 + impl::differenceOfProducts(k2, -t, k1, -s) * cosA) / (impl::sumOfProducts(k2, -s, k1, -t) * sinA);
+         cosBp = nbl::hlsl::clamp(cosBp, scalar_type(-1), scalar_type(1));
+         sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - cosBp * cosBp));
+      }
+
+      // Step 3: construct C' on the great circle through A toward C
+      const vector3_type cp = cosBp * tri_vertices[0] + sinBp * e_C;
+
+      // Step 4: uniformly sample the great circle arc from B to C'
+      scalar_type cosCpB;
+      if (Algorithm == STA_ARVO)
+         cosCpB = cosBp * triCosC + sinBp * eCdotB;
+      else
+         cosCpB = nbl::hlsl::dot(cp, tri_vertices[1]);
+      const scalar_type z = scalar_type(1) - u.y * (scalar_type(1) - cosCpB);
+      const scalar_type sinZ = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - z * z));
+      return z * tri_vertices[1] + sinZ * hlsl::normalize(cp - cosCpB * tri_vertices[1]);
+   }
+
+   density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return rcpSolidAngle;
+   }
+
+   weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return forwardPdf(u, cache);
+   }
+
+   density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      return rcpSolidAngle;
+   }
+
+   weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      return backwardPdf(L);
+   }
+
+   scalar_type rcpSolidAngle;
+   scalar_type cosA;
+   scalar_type sinA;
+   scalar_type sinA_triCosC; // precomputed sinA * triCosC
+   scalar_type eCdotB; // precomputed dot(e_C, tri_vertices[1]), Arvo only
+
+   vector3_type tri_vertices[2]; // A and B only
+   scalar_type triCosC;
+   vector3_type e_C; // precomputed cross(arcACPlaneNormal, A), unit vector perp to A in A-C plane
+};
+
+// Bijective: adds generateInverse, stores extra members for the inverse mapping
+template<typename T, SphericalTriangleAlgorithm Algorithm>
+struct SphericalTriangle<T, true, Algorithm>
+{
+   using scalar_type = T;
+   using vector2_type = vector<T, 2>;
+   using vector3_type = vector<T, 3>;
+
+   using base_type = SphericalTriangle<T, false, Algorithm>;
+   using domain_type = vector2_type;
+   using codomain_type = vector3_type;
+   using density_type = scalar_type;
+   using weight_type = density_type;
+
+   using cache_type = typename base_type::cache_type;
+
+   static SphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle<T>) tri)
+   {
+      SphericalTriangle retval;
+      retval.base = base_type::create(tri);
+      retval.rcpSolidAngle = retval.base.rcpSolidAngle;
+      retval.vertexC = tri.vertices[2];
+      // precompute great circle normal of arc AC (needed for generateInverse)
+      const scalar_type cscB = tri.csc_sides[1];
+      retval.arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscB < numeric_limits<scalar_type>::max, cscB, scalar_type(0));
+      retval.triCscC = tri.csc_sides[2];
+      return retval;
+   }
+
+   codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      return base.generate(u, cache);
+   }
+
+
+   // generate() works in two steps:
+   //   u.x -> pick C' on arc AC (choosing a sub-area fraction)
+   //   u.y -> pick L on arc B->C' (linear interpolation)
+   //
+   // So the inverse is:
+   //   1) Find C': intersect great circle (B,L) with great circle (A,C)
+   //   2) u.x = solidAngle(A,B,C') / solidAngle(A,B,C)
+   //   3) u.y = |L-B|^2 / |C'-B|^2
+   domain_type generateInverse(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      // Step 1: find C' = intersection of great circles (B,L) and (A,C)
+      const vector3_type BxL = nbl::hlsl::cross(base.tri_vertices[1], L);
+      const scalar_type sinBL_sq = nbl::hlsl::dot(BxL, BxL);
+      if (sinBL_sq < numeric_limits<scalar_type>::epsilon)
+      {
+         // L ~ B: u.y ~ 0, u.x is indeterminate (all u.x map to B when u.y=0).
+         // Recover u.y from |L-B|^2 / |A-B|^2 (using C'=A; the (1-cosCpB) ratio
+         // cancels so any C' gives the same result).
+         const vector3_type LminusB = L - base.tri_vertices[1];
+         const vector3_type AminusB = base.tri_vertices[0] - base.tri_vertices[1];
+         const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB);
+         const scalar_type v_denom = nbl::hlsl::dot(AminusB, AminusB);
+         const scalar_type v = hlsl::select(v_denom > numeric_limits<scalar_type>::epsilon,
+            nbl::hlsl::clamp(v_num / v_denom, scalar_type(0.0), scalar_type(1.0)),
+            scalar_type(0.0));
+         return vector2_type(scalar_type(0.0), v);
+      }
+
+      // C' lies on arc AC, so C' = A*cos(t) + e_C*sin(t).
+      // C' also lies on the B-L plane, so dot(BxL, C') = 0.
+      // Solving: (cos(t), sin(t)) = (tripleE, -tripleA) / R
+      const scalar_type tripleA = nbl::hlsl::dot(BxL, base.tri_vertices[0]);
+      const scalar_type tripleE = nbl::hlsl::dot(BxL, base.e_C);
+      const scalar_type R_sq = tripleA * tripleA + tripleE * tripleE;
+      if (R_sq < numeric_limits<scalar_type>::epsilon)
+         return vector2_type(scalar_type(0.0), scalar_type(0.0));
+
+      const scalar_type rcpR = scalar_type(1.0) / nbl::hlsl::sqrt(R_sq);
+      vector3_type cp = base.tri_vertices[0] * (tripleE * rcpR) + base.e_C * (-tripleA * rcpR);
+      // two intersections exist; pick the one on the minor arc A->C
+      if (nbl::hlsl::dot(cp, base.tri_vertices[0] + vertexC) < scalar_type(0.0))
+         cp = -cp;
+
+      // Step 2: u.x = solidAngle(A,B,C') / solidAngle(A,B,C)
+      // Van Oosterom-Strackee: tan(Omega/2) = |A.(BxC')| / (1 + A.B + B.C' + C'.A)
+      //
+      // Numerator stability: the naive triple product dot(A, cross(B, C')) suffers
+      // catastrophic cancellation when C' is near A (small u.x), because
+      // cross(B, C') ~ cross(B, A) and dot(A, cross(B, A)) = 0 exactly.
+      // Expanding C' = cosBp*A + sinBp*e_C into the triple product:
+      //   A.(BxC') = cosBp * A.(BxA) + sinBp * A.(Bxe_C) = sinBp * A.(Bxe_C)
+      // since A.(BxA) = 0 identically. This avoids the cancellation.
+      const scalar_type cosBp_inv = nbl::hlsl::dot(cp, base.tri_vertices[0]);
+      const scalar_type sinBp_inv = nbl::hlsl::dot(cp, base.e_C);
+      const scalar_type AxBdotE = nbl::hlsl::dot(base.tri_vertices[0], nbl::hlsl::cross(base.tri_vertices[1], base.e_C));
+      const scalar_type num = sinBp_inv * AxBdotE;
+      const scalar_type cosCpB = nbl::hlsl::dot(base.tri_vertices[1], cp);
+      const scalar_type den = scalar_type(1.0) + base.triCosC + cosCpB + cosBp_inv;
+      const scalar_type subSolidAngle = scalar_type(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den);
+      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, scalar_type(0.0), scalar_type(1.0));
+
+      // Step 3: u.y = |L-B|^2 / |C'-B|^2
+      // Squared Euclidean distance avoids catastrophic cancellation vs (1-dot)/(1-dot)
+      const vector3_type LminusB = L - base.tri_vertices[1];
+      const vector3_type cpMinusB = cp - base.tri_vertices[1];
+      const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB);
+      const scalar_type v_denom = nbl::hlsl::dot(cpMinusB, cpMinusB);
+      const scalar_type v = hlsl::select(v_denom > numeric_limits<scalar_type>::epsilon,
+         nbl::hlsl::clamp(v_num / nbl::hlsl::max(v_denom, numeric_limits<scalar_type>::min),
+            scalar_type(0.0), scalar_type(1.0)),
+         scalar_type(0.0));
+
+      return vector2_type(u, v);
+   }
+
+   density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return rcpSolidAngle;
+   }
+
+   weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return forwardPdf(u, cache);
+   }
+
+   density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      return rcpSolidAngle;
+   }
+
+   weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      return backwardPdf(L);
+   }
+
+   // mirrored from base for uniform access across both specializations
+   scalar_type rcpSolidAngle;
+
+   base_type base;
+   vector3_type vertexC;
+   vector3_type arcACPlaneNormal; // precomputed normalize(cross(A, C)), great circle normal of arc AC
+   scalar_type triCscC;
+};
+
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
 
 #endif
diff --git a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
index 5fc3bc7a0b..21901a9628 100644
--- a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
@@ -9,6 +9,7 @@
 #include "nbl/builtin/hlsl/numbers.hlsl"
 #include "nbl/builtin/hlsl/tgmath.hlsl"
 #include "nbl/builtin/hlsl/sampling/quotient_and_pdf.hlsl"
+#include "nbl/builtin/hlsl/sampling/concentric_mapping.hlsl"
 
 namespace nbl
 {
@@ -20,57 +21,122 @@ namespace sampling
 template<typename T NBL_PRIMARY_REQUIRES(is_scalar_v<T>)
 struct UniformHemisphere
 {
-    using vector_t2 = vector<T, 2>;
-    using vector_t3 = vector<T, 3>;
-
-    static vector_t3 generate(const vector_t2 _sample)
-    {
-        T z = _sample.x;
-        T r = hlsl::sqrt<T>(hlsl::max<T>(T(0.0), T(1.0) - z * z));
-        T phi = T(2.0) * numbers::pi<T> * _sample.y;
-        return vector_t3(r * hlsl::cos<T>(phi), r * hlsl::sin<T>(phi), z);
-    }
-
-    static T pdf()
-    {
-        return T(1.0) / (T(2.0) * numbers::pi<T>);
-    }
-
-    template<typename U=vector<T,1> >
-    static quotient_and_pdf<U, T> quotient_and_pdf()
-    {
-        return quotient_and_pdf<U, T>::create(hlsl::promote<U>(1.0), pdf());
-    }
+	using vector_t2 = vector<T, 2>;
+	using vector_t3 = vector<T, 3>;
+
+	// BijectiveSampler concept types
+	using scalar_type = T;
+	using domain_type = vector_t2;
+	using codomain_type = vector_t3;
+	using density_type = T;
+	using weight_type = density_type;
+
+	struct cache_type {};
+
+	static codomain_type generate(const domain_type u)
+	{
+		typename ConcentricMapping<T>::cache_type cmCache;
+		const vector_t2 p = ConcentricMapping<T>::generate(u, cmCache);
+		const T z = T(1.0) - cmCache.r2;
+		const T xyScale = hlsl::sqrt<T>(hlsl::max<T>(T(0.0), T(2.0) - cmCache.r2));
+		return vector_t3(p.x * xyScale, p.y * xyScale, z);
+	}
+
+	static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache)
+	{
+		return generate(u);
+	}
+
+	static domain_type generateInverse(const codomain_type v)
+	{
+		// r_disk / r_xy = sqrt(1-z) / sqrt(1-z^2) = 1/sqrt(1+z)
+		const T scale = T(1.0) / hlsl::sqrt<T>(T(1.0) + v.z);
+		return ConcentricMapping<T>::generateInverse(vector_t2(v.x * scale, v.y * scale));
+	}
+
+	static density_type forwardPdf(const domain_type u, const cache_type cache)
+	{
+		return T(0.5) * numbers::inv_pi<T>;
+	}
+
+	static weight_type forwardWeight(const domain_type u, const cache_type cache)
+	{
+		return T(0.5) * numbers::inv_pi<T>;
+	}
+
+	static density_type backwardPdf(const codomain_type v)
+	{
+		assert(v.z > 0);
+		return T(0.5) * numbers::inv_pi<T>;
+	}
+
+	static weight_type backwardWeight(const codomain_type v)
+	{
+		assert(v.z > 0);
+		return T(0.5) * numbers::inv_pi<T>;
+	}
+
 };
 
 template<typename T NBL_PRIMARY_REQUIRES(is_scalar_v<T>)
 struct UniformSphere
 {
-    using vector_t2 = vector<T, 2>;
-    using vector_t3 = vector<T, 3>;
-
-    static vector_t3 generate(const vector_t2 _sample)
-    {
-        T z = T(1.0) - T(2.0) * _sample.x;
-        T r = hlsl::sqrt<T>(hlsl::max<T>(T(0.0), T(1.0) - z * z));
-        T phi = T(2.0) * numbers::pi<T> * _sample.y;
-        return vector_t3(r * hlsl::cos<T>(phi), r * hlsl::sin<T>(phi), z);
-    }
-
-    static T pdf()
-    {
-        return T(1.0) / (T(4.0) * numbers::pi<T>);
-    }
-
-    template<typename U=vector<T,1> >
-    static quotient_and_pdf<U, T> quotient_and_pdf()
-    {
-        return quotient_and_pdf<U, T>::create(hlsl::promote<U>(1.0), pdf());
-    }
+	using vector_t2 = vector<T, 2>;
+	using vector_t3 = vector<T, 3>;
+	using hemisphere_t = UniformHemisphere<T>;
+
+	// BijectiveSampler concept types
+	using scalar_type = T;
+	using domain_type = vector_t2;
+	using codomain_type = vector_t3;
+	using density_type = T;
+	using weight_type = density_type;
+
+	using cache_type = typename hemisphere_t::cache_type;
+
+	static codomain_type generate(const domain_type u)
+	{
+		const T tmp = u.x * T(2.0) - T(1.0);
+		const codomain_type L = hemisphere_t::generate(vector_t2(hlsl::abs<T>(tmp), u.y));
+		return vector_t3(L.x, L.y, L.z * hlsl::sign(tmp));
+	}
+
+	static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache)
+	{
+		return generate(u);
+	}
+
+	static domain_type generateInverse(const codomain_type v)
+	{
+		const T dir = hlsl::sign(v.z) * T(0.5);
+		const domain_type hemiU = hemisphere_t::generateInverse(vector_t3(v.x, v.y, hlsl::abs<T>(v.z)));
+		return vector_t2(hemiU.x * dir + T(0.5), hemiU.y);
+	}
+
+	static density_type forwardPdf(const domain_type u, const cache_type cache)
+	{
+		return T(0.5) * hemisphere_t::forwardPdf(u, cache);
+	}
+
+	static weight_type forwardWeight(const domain_type u, const cache_type cache)
+	{
+		return T(0.5) * hemisphere_t::forwardWeight(u, cache);
+	}
+
+	static density_type backwardPdf(const codomain_type v)
+	{
+		return T(0.25) * numbers::inv_pi<T>;
+	}
+
+	static weight_type backwardWeight(const codomain_type v)
+	{
+		return T(0.25) * numbers::inv_pi<T>;
+	}
+
 };
-}
+} // namespace sampling
 
-}
-}
+} // namespace hlsl
+} // namespace nbl
 
 #endif
diff --git a/include/nbl/builtin/hlsl/sampling/value_and_pdf.hlsl b/include/nbl/builtin/hlsl/sampling/value_and_pdf.hlsl
new file mode 100644
index 0000000000..8c54d34c5e
--- /dev/null
+++ b/include/nbl/builtin/hlsl/sampling/value_and_pdf.hlsl
@@ -0,0 +1,75 @@
+// Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_SAMPLING_VALUE_AND_PDF_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SAMPLING_VALUE_AND_PDF_INCLUDED_
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace sampling
+{
+
+template<typename V, typename P>
+struct value_and_rcpPdf
+{
+	using this_t = value_and_rcpPdf<V, P>;
+
+	static this_t create(const V _value, const P _rcpPdf)
+	{
+		this_t retval;
+		retval._value = _value;
+		retval._rcpPdf = _rcpPdf;
+		return retval;
+	}
+
+	V value() NBL_CONST_MEMBER_FUNC { return _value; }
+	P rcpPdf() NBL_CONST_MEMBER_FUNC { return _rcpPdf; }
+
+	V _value;
+	P _rcpPdf;
+};
+
+template<typename V, typename P>
+struct value_and_pdf
+{
+	using this_t = value_and_pdf<V, P>;
+
+	static this_t create(const V _value, const P _pdf)
+	{
+		this_t retval;
+		retval._value = _value;
+		retval._pdf = _pdf;
+		return retval;
+	}
+
+	V value() NBL_CONST_MEMBER_FUNC { return _value; }
+	P pdf() NBL_CONST_MEMBER_FUNC { return _pdf; }
+
+	V _value;
+	P _pdf;
+};
+
+// Returned by TractableSampler::generate, codomain sample bundled with its rcpPdf
+template<typename V, typename P>
+using codomain_and_rcpPdf = value_and_rcpPdf<V, P>;
+
+// Returned by TractableSampler::generate, codomain sample bundled with its pdf
+template<typename V, typename P>
+using codomain_and_pdf = value_and_pdf<V, P>;
+
+// Returned by BijectiveSampler::invertGenerate, domain value bundled with its rcpPdf
+template<typename V, typename P>
+using domain_and_rcpPdf = value_and_rcpPdf<V, P>;
+
+// Returned by BijectiveSampler::invertGenerate, domain value bundled with its pdf
+template<typename V, typename P>
+using domain_and_pdf = value_and_pdf<V, P>;
+
+} // namespace sampling
+} // namespace hlsl
+} // namespace nbl
+
+#endif
diff --git a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
index 3890d1a2db..75a27ad0f2 100644
--- a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
@@ -9,6 +9,7 @@
 #include <nbl/builtin/hlsl/numbers.hlsl>
 #include <nbl/builtin/hlsl/math/functions.hlsl>
 #include <nbl/builtin/hlsl/math/angle_adding.hlsl>
+#include <nbl/builtin/hlsl/math/fast_acos.hlsl>
 
 namespace nbl
 {
@@ -82,42 +83,110 @@ struct SphericalRectangle
     using scalar_type = Scalar;
     using vector2_type = vector<Scalar, 2>;
     using vector3_type = vector<Scalar, 3>;
+    using vector4_type = vector<Scalar, 4>;
     using matrix3x3_type = matrix<Scalar, 3, 3>;
 
-    static SphericalRectangle<Scalar> create(const vector3_type rectangleOrigin, const vector3_type right, const vector3_type up)
+    struct solid_angle_type
+    {
+        scalar_type value;
+        vector3_type r0;
+        vector4_type n_z;
+        vector4_type cosGamma;
+    };
+
+    static SphericalRectangle<Scalar> create(NBL_CONST_REF_ARG(CompressedSphericalRectangle<Scalar>) compressed)
     {
         SphericalRectangle<scalar_type> retval;
-        retval.origin = rectangleOrigin;
-        retval.extents = vector2_type(hlsl::length(right), hlsl::length(up));
-        retval.basis[0] = right / retval.extents[0];
-        retval.basis[1] = up / retval.extents[1];
+        retval.origin = compressed.origin;
+        retval.extents = vector2_type(hlsl::length(compressed.right), hlsl::length(compressed.up));
+        retval.basis[0] = compressed.right / retval.extents[0];
+        retval.basis[1] = compressed.up / retval.extents[1];
+        assert(hlsl::abs(hlsl::dot(retval.basis[0], retval.basis[1])) < scalar_type(1e-5));
         retval.basis[2] = hlsl::normalize(hlsl::cross(retval.basis[0], retval.basis[1]));
         return retval;
     }
 
-    static SphericalRectangle<Scalar> create(NBL_CONST_REF_ARG(CompressedSphericalRectangle<Scalar>) compressed)
+    solid_angle_type solidAngle(const vector3_type observer) NBL_CONST_MEMBER_FUNC
     {
-        return create(compressed.origin, compressed.right, compressed.up);
+        solid_angle_type result;
+        result.r0 = hlsl::mul(basis, origin - observer);
+
+        const vector4_type denorm_n_z = vector4_type(-result.r0.y, result.r0.x + extents.x, result.r0.y + extents.y, -result.r0.x);
+        result.n_z = denorm_n_z / nbl::hlsl::sqrt(hlsl::promote<vector4_type>(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z);
+        result.cosGamma = vector4_type(
+            -result.n_z[0] * result.n_z[1],
+            -result.n_z[1] * result.n_z[2],
+            -result.n_z[2] * result.n_z[3],
+            -result.n_z[3] * result.n_z[0]
+        );
+        math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(result.cosGamma[0]);
+        angle_adder.addCosine(result.cosGamma[1]);
+        angle_adder.addCosine(result.cosGamma[2]);
+        angle_adder.addCosine(result.cosGamma[3]);
+        result.value = angle_adder.getSumOfArccos() - scalar_type(2.0) * numbers::pi<scalar_type>;
+        return result;
     }
 
-    scalar_type solidAngle(const vector3_type observer)
+    // Kelvin-Stokes theorem: signed projected solid angle = integral_{rect} (n . omega) d_omega
+    scalar_type projectedSolidAngle(const vector3_type observer, const vector3_type receiverNormal) NBL_CONST_MEMBER_FUNC
     {
-        const vector3_type r0 = hlsl::mul(basis, origin - observer);
-
-        using vector4_type = vector<Scalar, 4>;
-        const vector4_type denorm_n_z = vector4_type(-r0.y, r0.x + extents.x, r0.y + extents.y, -r0.x);
-        const vector4_type n_z = denorm_n_z / nbl::hlsl::sqrt((vector4_type)(r0.z * r0.z) + denorm_n_z * denorm_n_z);
-        const vector4_type cosGamma = vector4_type(
-            -n_z[0] * n_z[1],
-            -n_z[1] * n_z[2],
-            -n_z[2] * n_z[3],
-            -n_z[3] * n_z[0]
+        return projectedSolidAngleFromLocal(hlsl::mul(basis, origin - observer), hlsl::mul(basis, receiverNormal));
+    }
+
+    // Overload for when r0 and localNormal are already computed (avoids redundant mul(basis, ...)).
+    // Exploits rectangle structure: all 4 corners share the same z, so cross products
+    // have only 2 nonzero components each, and externalProducts can be computed without
+    // normalizing the corner directions.
+    scalar_type projectedSolidAngleFromLocal(const vector3_type r0, const vector3_type n) NBL_CONST_MEMBER_FUNC
+    {
+        const scalar_type x0 = r0.x, y0 = r0.y, z = r0.z;
+        const scalar_type x1 = x0 + extents.x;
+        const scalar_type y1 = y0 + extents.y;
+        const scalar_type ex = extents.x, ey = extents.y;
+        const scalar_type zSq = z * z;
+
+        // Unnormalized cross products of adjacent corners (each has one zero component):
+        //   cross(r0,r1) = ex*(0, z, -y0),  cross(r1,r2) = ey*(-z, 0, x1)
+        //   cross(r2,r3) = ex*(0, -z, y1),  cross(r3,r0) = ey*(z, 0, -x0)
+        // |cross|^2: the ex/ey factors cancel in externalProducts (dot/|cross|)
+        const vector4_type crossLenSq = vector4_type(
+            zSq + y0 * y0,
+            zSq + x1 * x1,
+            zSq + y1 * y1,
+            zSq + x0 * x0
+        );
+
+        // dot(cross(ri,rj), n) / |cross(ri,rj)| the ex/ey scale factors cancel
+        const vector4_type crossDotN = vector4_type(
+            z * n.y - y0 * n.z,
+            -z * n.x + x1 * n.z,
+            -z * n.y + y1 * n.z,
+            z * n.x - x0 * n.z
         );
-        math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(cosGamma[0]);
-        angle_adder.addCosine(cosGamma[1]);
-        angle_adder.addCosine(cosGamma[2]);
-        angle_adder.addCosine(cosGamma[3]);
-        return angle_adder.getSumofArccos() - scalar_type(2.0) * numbers::pi<float>;
+        // The ABS makes the computation correct for abs(cos(theta)) (BSDF projected solid angle).
+        const vector4_type externalProducts = hlsl::abs(crossDotN) * hlsl::rsqrt<vector4_type>(crossLenSq);
+
+        // cos(arc length) between adjacent corners: dot(ri,rj) / (|ri|*|rj|)
+        const vector4_type lenSq = vector4_type(
+            x0 * x0 + y0 * y0,
+            x1 * x1 + y0 * y0,
+            x1 * x1 + y1 * y1,
+            x0 * x0 + y1 * y1
+        ) + hlsl::promote<vector4_type>(zSq);
+        const vector4_type rcpLen = hlsl::rsqrt<vector4_type>(lenSq);
+
+        const vector4_type unnormDots = vector4_type(
+            x0 * x1 + y0 * y0 + zSq,
+            x1 * x1 + y0 * y1 + zSq,
+            x0 * x1 + y1 * y1 + zSq,
+            x0 * x0 + y0 * y1 + zSq
+        );
+        // rcpLen[i]*rcpLen[j] for adjacent pairs: (0,1), (1,2), (2,3), (3,0)
+        const vector4_type cos_sides = unnormDots * rcpLen * rcpLen.yzwx;
+
+        const vector4_type pyramidAngles = hlsl::acos<vector4_type>(cos_sides);
+
+        return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5);
     }
 
     vector3_type origin;
diff --git a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
index 25f1e33554..1a5681b39e 100644
--- a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
@@ -9,6 +9,7 @@
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 #include <nbl/builtin/hlsl/limits.hlsl>
 #include <nbl/builtin/hlsl/math/functions.hlsl>
+#include <nbl/builtin/hlsl/math/fast_acos.hlsl>
 #include <nbl/builtin/hlsl/math/angle_adding.hlsl>
 #include <nbl/builtin/hlsl/numbers.hlsl>
 
@@ -27,58 +28,71 @@ struct SphericalTriangle
 
     static SphericalTriangle<T> create(const vector3_type vertices[3], const vector3_type origin)
     {
-        SphericalTriangle<T> retval;
-        retval.vertices[0] = nbl::hlsl::normalize(vertices[0] - origin);
-        retval.vertices[1] = nbl::hlsl::normalize(vertices[1] - origin);
-        retval.vertices[2] = nbl::hlsl::normalize(vertices[2] - origin);
-        retval.cos_sides = vector3_type(hlsl::dot(retval.vertices[1], retval.vertices[2]), hlsl::dot(retval.vertices[2], retval.vertices[0]), hlsl::dot(retval.vertices[0], retval.vertices[1]));
-        const vector3_type sin_sides2 = hlsl::promote<vector3_type>(1.0) - retval.cos_sides * retval.cos_sides;
-        retval.csc_sides = hlsl::rsqrt<vector3_type>(sin_sides2);
-        return retval;
+        const vector3_type normalizedVerts[3] = {
+            nbl::hlsl::normalize(vertices[0] - origin),
+            nbl::hlsl::normalize(vertices[1] - origin),
+            nbl::hlsl::normalize(vertices[2] - origin)
+        };
+        return createFromUnitSphereVertices(normalizedVerts);
     }
 
-    // checks if any angles are small enough to disregard
-    bool pyramidAngles()
+    static SphericalTriangle<T> createFromUnitSphereVertices(const vector3_type normalizedVertices[3])
     {
-        return hlsl::any<vector<bool, 3> >(csc_sides >= hlsl::promote<vector3_type>(numeric_limits<scalar_type>::max));
-    }
+        SphericalTriangle<T> retval;
+        retval.vertices[0] = normalizedVertices[0];
+        retval.vertices[1] = normalizedVertices[1];
+        retval.vertices[2] = normalizedVertices[2];
 
-    scalar_type solidAngle(NBL_REF_ARG(vector3_type) cos_vertices, NBL_REF_ARG(vector3_type) sin_vertices)
-    {
-        if (pyramidAngles())
-            return 0.f;
+        retval.cos_sides = hlsl::clamp(
+            vector3_type(hlsl::dot(retval.vertices[1], retval.vertices[2]), hlsl::dot(retval.vertices[2], retval.vertices[0]), hlsl::dot(retval.vertices[0], retval.vertices[1])),
+            hlsl::promote<vector3_type>(-1.0), hlsl::promote<vector3_type>(1.0));
+        const vector3_type sin_sides2 = hlsl::promote<vector3_type>(1.0) - retval.cos_sides * retval.cos_sides;
+        retval.csc_sides = hlsl::rsqrt<vector3_type>(sin_sides2); // only need side B for generate, we still have it for projectedSolidAngle()
 
         // Both vertices and angles at the vertices are denoted by the same upper case letters A, B, and C. The angles A, B, C of the triangle are equal to the angles between the planes that intersect the surface of the sphere or,
         // equivalently, the angles between the tangent vectors of the great circle arcs where they meet at the vertices. Angles are in radians. The angles of proper spherical triangles are (by convention) less than PI
-        cos_vertices = hlsl::clamp((cos_sides - cos_sides.yzx * cos_sides.zxy) * csc_sides.yzx * csc_sides.zxy, hlsl::promote<vector3_type>(-1.0), hlsl::promote<vector3_type>(1.0)); // using Spherical Law of Cosines (TODO: do we need to clamp anymore? since the pyramid angles method introduction?) 
-        sin_vertices = hlsl::sqrt(hlsl::promote<vector3_type>(1.0) - cos_vertices * cos_vertices);
-
-        math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(cos_vertices[0], sin_vertices[0]);
-        angle_adder.addAngle(cos_vertices[1], sin_vertices[1]);
-        angle_adder.addAngle(cos_vertices[2], sin_vertices[2]);
-        return angle_adder.getSumofArccos() - numbers::pi<scalar_type>;
-    }
+        // degenerate triangle: any side has near-zero sin, so csc blows up
+        if (hlsl::any<vector<bool, 3> >(retval.csc_sides >= hlsl::promote<vector3_type>(numeric_limits<scalar_type>::max)))
+        {
+            retval.cos_vertices = hlsl::promote<vector3_type>(0.0);
+            retval.sin_vertices = hlsl::promote<vector3_type>(0.0);
+            retval.solid_angle = 0;
+            return retval;
+        }
+
+        //  cos_a - cos_b * cos_c - (1/sin_b  *  1/sin_c)
+        retval.cos_vertices = hlsl::clamp((retval.cos_sides - retval.cos_sides.yzx * retval.cos_sides.zxy) * retval.csc_sides.yzx * retval.csc_sides.zxy, hlsl::promote<vector3_type>(-1.0), hlsl::promote<vector3_type>(1.0)); // using Spherical Law of Cosines
+        retval.sin_vertices = hlsl::sqrt(hlsl::promote<vector3_type>(1.0) - retval.cos_vertices * retval.cos_vertices);
+
+        // Fast acos overshoot makes the solid angle slightly too large, which causes
+        // generate() to place samples outside the triangle. poly3 (~6.9e-5 error) fails
+        // the 1e-6 generatedInside tolerance; poly4 (~8.6e-6) and poly5 (~1.1e-6) are tighter.
+        // Standard acos avoids this entirely at the cost of one transcendental call.
+        // Benchmarks show fast acos is no faster here -- likely because the surrounding
+        // code already saturates FMA throughput, so the SFU acos runs in parallel for free.
+        math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(retval.cos_vertices[0], retval.sin_vertices[0]);
+        angle_adder.addAngle(retval.cos_vertices[1], retval.sin_vertices[1]);
+        angle_adder.addAngle(retval.cos_vertices[2], retval.sin_vertices[2]);
+        // Use the clamped variant because addAngle() sum-of-products can push
+        // the accumulated cosine slightly outside [-1,1] on GPU, making acos
+        // return NaN. GPU max(NaN,0)=0 then silently zeroes the solid angle.
+        retval.solid_angle = hlsl::max(angle_adder.getClampedSumOfArccosMinusPi(), scalar_type(0.0));
 
-    scalar_type solidAngle()
-    {
-        vector3_type dummy0,dummy1;
-        return solidAngle(dummy0,dummy1);
+        return retval;
     }
 
-    scalar_type projectedSolidAngle(const vector3_type receiverNormal, NBL_REF_ARG(vector3_type) cos_vertices)
+    scalar_type projectedSolidAngle(const vector3_type receiverNormal) NBL_CONST_MEMBER_FUNC
     {
-        if (pyramidAngles())
-            return 0.f;
-
-        cos_vertices = hlsl::clamp((cos_sides - cos_sides.yzx * cos_sides.zxy) * csc_sides.yzx * csc_sides.zxy, hlsl::promote<vector3_type>(-1.0), hlsl::promote<vector3_type>(1.0));
+        if (solid_angle <= numeric_limits<scalar_type>::epsilon)
+            return 0;
 
         matrix<scalar_type, 3, 3> awayFromEdgePlane;
         awayFromEdgePlane[0] = hlsl::cross(vertices[1], vertices[2]) * csc_sides[0];
         awayFromEdgePlane[1] = hlsl::cross(vertices[2], vertices[0]) * csc_sides[1];
         awayFromEdgePlane[2] = hlsl::cross(vertices[0], vertices[1]) * csc_sides[2];
-        // The ABS makes it so that the computation is correct for an `abs(cos(theta))` factor which is the projected solid angle used for a BSDF
-        // Proof: Kelvin-Stokes theorem, if you split the set into two along the horizon with constant CCW winding, the `cross` along the shared edge goes in different directions and cancels out,
-        // while `acos` of the clipped great arcs corresponding to polygon edges add up to the original sides again
+        // The ABS makes it so that the computation is correct for an `abs(cos(theta))` factor which is the projected solid angle used for a BSDF.
+        // Proof: Kelvin-Stokes theorem, if you split the set into two along the horizon with constant CCW winding, the `cross` along the shared edge
+        // goes in different directions and cancels out, while `acos` of the clipped great arcs corresponding to polygon edges add up to the original sides again.
         const vector3_type externalProducts = hlsl::abs(hlsl::mul(/* transposed already */awayFromEdgePlane, receiverNormal));
 
         // Far TODO: `cross(A,B)*acos(dot(A,B))/sin(1-dot^2)` can be done with `cross*acos_csc_approx(dot(A,B))`
@@ -90,13 +104,18 @@ struct SphericalTriangle
         // See https://www.desmos.com/calculator/sdptomhbju
         // Furthermore we could clip the polynomial calc to `Cu+D or `(Bu+C)u+D` for small arguments
         const vector3_type pyramidAngles = hlsl::acos<vector3_type>(cos_sides);
-        // So that riangle covering almost whole hemisphere sums to PI
+        // So that triangle covering almost whole hemisphere sums to PI
         return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5);
     }
 
     vector3_type vertices[3];
+    // angles of vertices with origin, so the sides are INSIDE the sphere
     vector3_type cos_sides;
     vector3_type csc_sides;
+    // angles between arcs on the sphere, so angles in the TANGENT plane at each vertex
+    vector3_type cos_vertices;
+    vector3_type sin_vertices;
+    scalar_type solid_angle;
 };
 
 }
diff --git a/include/nbl/builtin/hlsl/testing/approx_compare.hlsl b/include/nbl/builtin/hlsl/testing/approx_compare.hlsl
new file mode 100644
index 0000000000..945e9a48cb
--- /dev/null
+++ b/include/nbl/builtin/hlsl/testing/approx_compare.hlsl
@@ -0,0 +1,82 @@
+#ifndef _NBL_BUILTIN_HLSL_TESTING_APPROX_COMPARE_INCLUDED_
+#define _NBL_BUILTIN_HLSL_TESTING_APPROX_COMPARE_INCLUDED_
+
+#include <nbl/builtin/hlsl/testing/relative_approx_compare.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace testing
+{
+namespace impl
+{
+
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct AbsoluteAndRelativeApproxCompareHelper;
+
+template<typename FloatingPoint>
+NBL_PARTIAL_REQ_TOP(concepts::FloatingPointLikeScalar<FloatingPoint>)
+struct AbsoluteAndRelativeApproxCompareHelper<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointLikeScalar<FloatingPoint>) >
+{
+    static bool __call(NBL_CONST_REF_ARG(FloatingPoint) lhs, NBL_CONST_REF_ARG(FloatingPoint) rhs, const float64_t maxAbsoluteDifference, const float64_t maxRelativeDifference)
+    {
+        // Absolute check first: catches small-magnitude values where relative comparison breaks down
+        if (hlsl::abs(float64_t(lhs) - float64_t(rhs)) <= maxAbsoluteDifference)
+            return true;
+
+        // Fall back to relative comparison for larger values
+        return RelativeApproxCompareHelper<FloatingPoint>::__call(lhs, rhs, maxRelativeDifference);
+    }
+};
+
+template<typename FloatingPointVector>
+NBL_PARTIAL_REQ_TOP(concepts::FloatingPointLikeVectorial<FloatingPointVector>)
+struct AbsoluteAndRelativeApproxCompareHelper<FloatingPointVector NBL_PARTIAL_REQ_BOT(concepts::FloatingPointLikeVectorial<FloatingPointVector>) >
+{
+    static bool __call(NBL_CONST_REF_ARG(FloatingPointVector) lhs, NBL_CONST_REF_ARG(FloatingPointVector) rhs, const float64_t maxAbsoluteDifference, const float64_t maxRelativeDifference)
+    {
+        using traits = nbl::hlsl::vector_traits<FloatingPointVector>;
+        for (uint32_t i = 0; i < traits::Dimension; ++i)
+        {
+            if (!AbsoluteAndRelativeApproxCompareHelper<typename traits::scalar_type>::__call(lhs[i], rhs[i], maxAbsoluteDifference, maxRelativeDifference))
+                return false;
+        }
+
+        return true;
+    }
+};
+
+template<typename FloatingPointMatrix>
+NBL_PARTIAL_REQ_TOP(concepts::Matricial<FloatingPointMatrix> && concepts::FloatingPointLikeScalar<typename nbl::hlsl::matrix_traits<FloatingPointMatrix>::scalar_type>)
+struct AbsoluteAndRelativeApproxCompareHelper<FloatingPointMatrix NBL_PARTIAL_REQ_BOT(concepts::Matricial<FloatingPointMatrix> && concepts::FloatingPointLikeScalar<typename nbl::hlsl::matrix_traits<FloatingPointMatrix>::scalar_type>) >
+{
+    static bool __call(NBL_CONST_REF_ARG(FloatingPointMatrix) lhs, NBL_CONST_REF_ARG(FloatingPointMatrix) rhs, const float64_t maxAbsoluteDifference, const float64_t maxRelativeDifference)
+    {
+        using traits = nbl::hlsl::matrix_traits<FloatingPointMatrix>;
+        for (uint32_t i = 0; i < traits::RowCount; ++i)
+        {
+            if (!AbsoluteAndRelativeApproxCompareHelper<typename traits::row_type>::__call(lhs[i], rhs[i], maxAbsoluteDifference, maxRelativeDifference))
+                return false;
+        }
+
+        return true;
+    }
+};
+
+}
+
+// Composite comparator that builds on top of relativeApproxCompare.
+// Checks absolute difference first (handles small-magnitude values where
+// relative comparison breaks down), then falls back to relative comparison.
+template<typename T>
+bool approxCompare(NBL_CONST_REF_ARG(T) lhs, NBL_CONST_REF_ARG(T) rhs, const float64_t maxAbsoluteDifference, const float64_t maxRelativeDifference)
+{
+	return impl::AbsoluteAndRelativeApproxCompareHelper<T>::__call(lhs, rhs, maxAbsoluteDifference, maxRelativeDifference);
+}
+
+}
+}
+}
+
+#endif
diff --git a/include/nbl/builtin/hlsl/testing/max_error.hlsl b/include/nbl/builtin/hlsl/testing/max_error.hlsl
new file mode 100644
index 0000000000..d89d058ef7
--- /dev/null
+++ b/include/nbl/builtin/hlsl/testing/max_error.hlsl
@@ -0,0 +1,102 @@
+#ifndef _NBL_BUILTIN_HLSL_TESTING_MAX_ERROR_INCLUDED_
+#define _NBL_BUILTIN_HLSL_TESTING_MAX_ERROR_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat/basic.h>
+#include <nbl/builtin/hlsl/concepts/core.hlsl>
+#include <nbl/builtin/hlsl/vector_utils/vector_traits.hlsl>
+#include <nbl/builtin/hlsl/matrix_utils/matrix_traits.hlsl>
+#include <nbl/builtin/hlsl/math/functions.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace testing
+{
+
+struct SMaxError
+{
+	static constexpr uint32_t MaxComponents = 16;
+	float64_t abs[MaxComponents] = {};
+	float64_t rel[MaxComponents] = {};
+	uint32_t rows = 1;
+	uint32_t cols = 1;
+
+	void updateScalar(float64_t expected, float64_t tested, uint32_t idx)
+	{
+		abs[idx] = hlsl::max(hlsl::abs(expected - tested), abs[idx]);
+		if (expected != 0.0 && tested != 0.0)
+			rel[idx] = hlsl::max(hlsl::max(expected / tested, tested / expected) - 1.0, rel[idx]);
+	}
+};
+
+namespace impl
+{
+
+template<typename T NBL_STRUCT_CONSTRAINABLE>
+struct MaxErrorUpdater;
+
+template<typename FloatingPoint>
+NBL_PARTIAL_REQ_TOP(concepts::FloatingPointLikeScalar<FloatingPoint>)
+struct MaxErrorUpdater<FloatingPoint NBL_PARTIAL_REQ_BOT(concepts::FloatingPointLikeScalar<FloatingPoint>) >
+{
+	static void __call(NBL_REF_ARG(SMaxError) record, NBL_CONST_REF_ARG(FloatingPoint) expected, NBL_CONST_REF_ARG(FloatingPoint) tested, uint32_t offset)
+	{
+		record.updateScalar(float64_t(expected), float64_t(tested), offset);
+	}
+};
+
+template<typename FloatingPointVector>
+NBL_PARTIAL_REQ_TOP(concepts::FloatingPointLikeVectorial<FloatingPointVector>)
+struct MaxErrorUpdater<FloatingPointVector NBL_PARTIAL_REQ_BOT(concepts::FloatingPointLikeVectorial<FloatingPointVector>) >
+{
+	static void __call(NBL_REF_ARG(SMaxError) record, NBL_CONST_REF_ARG(FloatingPointVector) expected, NBL_CONST_REF_ARG(FloatingPointVector) tested, uint32_t offset)
+	{
+		using traits = nbl::hlsl::vector_traits<FloatingPointVector>;
+		for (uint32_t i = 0; i < traits::Dimension; ++i)
+			MaxErrorUpdater<typename traits::scalar_type>::__call(record, expected[i], tested[i], offset + i);
+	}
+};
+
+template<typename FloatingPointMatrix>
+NBL_PARTIAL_REQ_TOP(concepts::Matricial<FloatingPointMatrix> && concepts::FloatingPointLikeScalar<typename nbl::hlsl::matrix_traits<FloatingPointMatrix>::scalar_type>)
+struct MaxErrorUpdater<FloatingPointMatrix NBL_PARTIAL_REQ_BOT(concepts::Matricial<FloatingPointMatrix> && concepts::FloatingPointLikeScalar<typename nbl::hlsl::matrix_traits<FloatingPointMatrix>::scalar_type>) >
+{
+	static void __call(NBL_REF_ARG(SMaxError) record, NBL_CONST_REF_ARG(FloatingPointMatrix) expected, NBL_CONST_REF_ARG(FloatingPointMatrix) tested, uint32_t offset)
+	{
+		using traits = nbl::hlsl::matrix_traits<FloatingPointMatrix>;
+		for (uint32_t i = 0; i < traits::RowCount; ++i)
+			MaxErrorUpdater<typename traits::row_type>::__call(record, expected[i], tested[i], offset + i * traits::ColumnCount);
+	}
+};
+
+}
+
+template<typename T>
+void updateMaxError(NBL_REF_ARG(SMaxError) record, NBL_CONST_REF_ARG(T) expected, NBL_CONST_REF_ARG(T) tested)
+{
+	if constexpr (concepts::FloatingPointLikeScalar<T>)
+	{
+		record.rows = 1;
+		record.cols = 1;
+	}
+	else if constexpr (concepts::FloatingPointLikeVectorial<T>)
+	{
+		record.rows = 1;
+		record.cols = vector_traits<T>::Dimension;
+	}
+	else if constexpr (concepts::Matricial<T>)
+	{
+		record.rows = matrix_traits<T>::RowCount;
+		record.cols = matrix_traits<T>::ColumnCount;
+	}
+   else
+      static_assert(false, "Unsupported type for max error updater");
+	impl::MaxErrorUpdater<T>::__call(record, expected, tested, 0);
+}
+
+}
+}
+}
+
+#endif
diff --git a/include/nbl/system/to_string.h b/include/nbl/system/to_string.h
index 2a06ace5e5..1f8988566e 100644
--- a/include/nbl/system/to_string.h
+++ b/include/nbl/system/to_string.h
@@ -1,6 +1,7 @@
 #ifndef _NBL_SYSTEM_TO_STRING_INCLUDED_
 #define _NBL_SYSTEM_TO_STRING_INCLUDED_
 
+#include <format>
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 #include <nbl/builtin/hlsl/emulated/int64_t.hlsl>
 #include <nbl/builtin/hlsl/morton.hlsl>
@@ -21,6 +22,15 @@ struct to_string_helper
     }
 };
 
+template<std::floating_point T>
+struct to_string_helper<T>
+{
+    static std::string __call(const T& value)
+    {
+        return std::format("{}", value);
+    }
+};
+
 template<>
 struct to_string_helper<hlsl::emulated_uint64_t>
 {
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index f27514c2c7..9e6dcaa2f9 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -230,6 +230,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/geometry.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/intutil.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/polar.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/angle_adding.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/fast_acos.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/octahedral.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quaternions.hlsl")
 # ies
@@ -274,13 +275,19 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/linear.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/bilinear.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/quantized_sequence.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/concentric_mapping.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/polar_mapping.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/box_muller_transform.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/spherical_triangle.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/projected_spherical_triangle.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/spherical_rectangle.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/projected_spherical_rectangle.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/cos_weighted_spheres.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/quotient_and_pdf.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/value_and_pdf.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/concepts.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/uniform_spheres.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/alias_table.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/cumulative_probability.hlsl")
 #
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ndarray_addressing.hlsl")
 #
@@ -392,7 +399,9 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/rwmc/ResolveParameters.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/morton.hlsl")
 #testing
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/testing/relative_approx_compare.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/testing/approx_compare.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/testing/orientation_compare.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/testing/vector_length_compare.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/testing/max_error.hlsl")
 
 ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL")