diff --git a/include/gauxc/reduction_driver.hpp b/include/gauxc/reduction_driver.hpp index f3bef1886..e3a788b9c 100644 --- a/include/gauxc/reduction_driver.hpp +++ b/include/gauxc/reduction_driver.hpp @@ -13,8 +13,11 @@ #include #include #include +#include #include #include +#include +#include namespace GauXC { @@ -52,11 +55,23 @@ class ReductionDriver { allreduce_inplace_typeerased( data, size, op, std::type_index(typeid(T)), optional_args ); } + template + inline std::vector allgather_v( const T* src, size_t size, std::any optional_args = std::any() ) const { + std::vector bytes; + allgather_v_typeerased( src, size, bytes, std::type_index(typeid(T)), optional_args ); + std::vector dest( bytes.size() / sizeof(T) ); + std::memcpy( dest.data(), bytes.data(), bytes.size() ); + return dest; + } + void allreduce_typeerased( const void*, void*, size_t, ReductionOp, std::type_index, std::any ); void allreduce_inplace_typeerased( void*, size_t, ReductionOp, std::type_index, std::any ); + void allgather_v_typeerased( const void*, size_t, std::vector&, std::type_index, std::any ) const; bool takes_host_memory() const; bool takes_device_memory() const; + int comm_rank() const; + int comm_size() const; }; diff --git a/include/gauxc/xc_integrator.hpp b/include/gauxc/xc_integrator.hpp index 03feaf934..055834610 100644 --- a/include/gauxc/xc_integrator.hpp +++ b/include/gauxc/xc_integrator.hpp @@ -80,6 +80,18 @@ class XCIntegrator { exx_type eval_exx ( const MatrixType&, const IntegratorSettingsEXX& = IntegratorSettingsEXX{} ); + value_type eval_nlc( const MatrixType&, const IntegratorSettingsNLC& = IntegratorSettingsNLC{} ); + value_type eval_nlc( const MatrixType&, const MatrixType&, const IntegratorSettingsNLC& = IntegratorSettingsNLC{} ); + + exc_vxc_type_rks eval_nlc_vnlc( const MatrixType&, const IntegratorSettingsNLC& = IntegratorSettingsNLC{} ); + exc_vxc_type_uks eval_nlc_vnlc( const MatrixType&, const MatrixType&, const IntegratorSettingsNLC& = IntegratorSettingsNLC{} ); + + exc_grad_type eval_nlc_grad( const MatrixType&, const IntegratorSettingsNLC& = IntegratorSettingsNLC{} ); + exc_grad_type eval_nlc_grad( const MatrixType&, const MatrixType&, const IntegratorSettingsNLC& = IntegratorSettingsNLC{} ); + + fxc_contraction_type_rks eval_nlc_fnlc_contraction( const MatrixType&, const MatrixType&, const IntegratorSettingsNLC& = IntegratorSettingsNLC{} ); + fxc_contraction_type_uks eval_nlc_fnlc_contraction( const MatrixType&, const MatrixType&, const MatrixType&, const MatrixType&, const IntegratorSettingsNLC& = IntegratorSettingsNLC{} ); + fxc_contraction_type_rks eval_fxc_contraction ( const MatrixType&, const MatrixType&, const IntegratorSettingsXC& = IntegratorSettingsXC{} ); fxc_contraction_type_uks eval_fxc_contraction ( const MatrixType&, const MatrixType&, const MatrixType&, const MatrixType&, diff --git a/include/gauxc/xc_integrator/impl.hpp b/include/gauxc/xc_integrator/impl.hpp index 400afb7c7..155591bb7 100644 --- a/include/gauxc/xc_integrator/impl.hpp +++ b/include/gauxc/xc_integrator/impl.hpp @@ -100,6 +100,63 @@ typename XCIntegrator::exx_type return pimpl_->eval_exx(P,settings); }; +template +typename XCIntegrator::value_type + XCIntegrator::eval_nlc( const MatrixType& P, const IntegratorSettingsNLC& settings ) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_nlc(P, settings); +}; + +template +typename XCIntegrator::value_type + XCIntegrator::eval_nlc( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsNLC& settings ) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_nlc(Ps, Pz, settings); +}; + +template +typename XCIntegrator::exc_vxc_type_rks + XCIntegrator::eval_nlc_vnlc( const MatrixType& P, const IntegratorSettingsNLC& settings ) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_nlc_vnlc(P, settings); +}; + +template +typename XCIntegrator::exc_vxc_type_uks + XCIntegrator::eval_nlc_vnlc( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsNLC& settings ) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_nlc_vnlc(Ps, Pz, settings); +}; + +template +typename XCIntegrator::exc_grad_type + XCIntegrator::eval_nlc_grad( const MatrixType& P, const IntegratorSettingsNLC& settings ) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_nlc_grad(P, settings); +}; + +template +typename XCIntegrator::exc_grad_type + XCIntegrator::eval_nlc_grad( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsNLC& settings ) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_nlc_grad(Ps, Pz, settings); +}; + +template +typename XCIntegrator::fxc_contraction_type_rks + XCIntegrator::eval_nlc_fnlc_contraction( const MatrixType& P, const MatrixType& tP, const IntegratorSettingsNLC& settings ) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_nlc_fnlc_contraction(P, tP, settings); +}; + +template +typename XCIntegrator::fxc_contraction_type_uks + XCIntegrator::eval_nlc_fnlc_contraction( const MatrixType& Ps, const MatrixType& Pz, + const MatrixType& tPs, const MatrixType& tPz, const IntegratorSettingsNLC& settings ) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_nlc_fnlc_contraction(Ps, Pz, tPs, tPz, settings); +}; + template typename XCIntegrator::fxc_contraction_type_rks XCIntegrator::eval_fxc_contraction( const MatrixType& P, const MatrixType& tP, diff --git a/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_impl.hpp b/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_impl.hpp index 457315122..5a1e6c4cc 100644 --- a/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_impl.hpp +++ b/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_impl.hpp @@ -176,7 +176,6 @@ class ReplicatedXCIntegratorImpl { value_type* FXCs, int64_t ldfxcs, value_type* FXCz, int64_t ldfxcz, const IntegratorSettingsXC& ks_settings ); - void eval_dd_psi( int64_t m, int64_t n, const value_type* P, int64_t ldp, unsigned max_Ylm, value_type* ddPsi, int64_t ldPsi ); diff --git a/include/gauxc/xc_integrator/xc_integrator_impl.hpp b/include/gauxc/xc_integrator/xc_integrator_impl.hpp index ba7bebebb..a0ce34563 100644 --- a/include/gauxc/xc_integrator/xc_integrator_impl.hpp +++ b/include/gauxc/xc_integrator/xc_integrator_impl.hpp @@ -151,6 +151,53 @@ class XCIntegratorImpl { return eval_exx_(P,settings); } + value_type eval_nlc( const MatrixType& P, const IntegratorSettingsNLC& settings ) { + detail::IntegratorSettingsNLCInternal nlc_settings(settings); + return eval_exc_(P, nlc_settings); + } + + value_type eval_nlc( const MatrixType& Ps, const MatrixType& Pz, + const IntegratorSettingsNLC& settings ) { + detail::IntegratorSettingsNLCInternal nlc_settings(settings); + return eval_exc_(Ps, Pz, nlc_settings); + } + + exc_vxc_type_rks eval_nlc_vnlc( const MatrixType& P, + const IntegratorSettingsNLC& settings ) { + detail::IntegratorSettingsNLCInternal nlc_settings(settings); + return eval_exc_vxc_(P, nlc_settings); + } + + exc_vxc_type_uks eval_nlc_vnlc( const MatrixType& Ps, const MatrixType& Pz, + const IntegratorSettingsNLC& settings ) { + detail::IntegratorSettingsNLCInternal nlc_settings(settings); + return eval_exc_vxc_(Ps, Pz, nlc_settings); + } + + exc_grad_type eval_nlc_grad( const MatrixType& P, + const IntegratorSettingsNLC& settings ) { + detail::IntegratorSettingsNLCInternal nlc_settings(settings); + return eval_exc_grad_(P, nlc_settings); + } + + exc_grad_type eval_nlc_grad( const MatrixType& Ps, const MatrixType& Pz, + const IntegratorSettingsNLC& settings ) { + detail::IntegratorSettingsNLCInternal nlc_settings(settings); + return eval_exc_grad_(Ps, Pz, nlc_settings); + } + + fxc_contraction_type_rks eval_nlc_fnlc_contraction( const MatrixType& P, + const MatrixType& tP, const IntegratorSettingsNLC& settings ) { + detail::IntegratorSettingsNLCInternal nlc_settings(settings); + return eval_fxc_contraction_(P, tP, nlc_settings); + } + + fxc_contraction_type_uks eval_nlc_fnlc_contraction( const MatrixType& Ps, const MatrixType& Pz, + const MatrixType& tPs, const MatrixType& tPz, const IntegratorSettingsNLC& settings ) { + detail::IntegratorSettingsNLCInternal nlc_settings(settings); + return eval_fxc_contraction_(Ps, Pz, tPs, tPz, nlc_settings); + } + /** Integrate FXC contraction for RKS * diff --git a/include/gauxc/xc_integrator_settings.hpp b/include/gauxc/xc_integrator_settings.hpp index 1ec26d0e6..c5fba4a31 100644 --- a/include/gauxc/xc_integrator_settings.hpp +++ b/include/gauxc/xc_integrator_settings.hpp @@ -29,4 +29,25 @@ struct IntegratorSettingsEXC_GRAD : public IntegratorSettingsKS { bool include_weight_derivatives= true; // whether to include grid weight contribution and employ translational invariance, or just use Hellmann-Feynman gradient }; +enum class NLCMathMode { + NativeFP64, + FloatPair +}; + +struct IntegratorSettingsNLC : public IntegratorSettingsEXC_GRAD { + double vv10_b = 6.3; + double vv10_c = 0.0093; + double vv10_tol = 1e-8; + NLCMathMode math_mode = NLCMathMode::NativeFP64; +}; + +namespace detail { + +struct IntegratorSettingsNLCInternal : public IntegratorSettingsNLC { + IntegratorSettingsNLCInternal() = default; + explicit IntegratorSettingsNLCInternal( const IntegratorSettingsNLC& settings ) : + IntegratorSettingsNLC(settings) { } +}; + +} } diff --git a/src/reduction_driver/host/basic_mpi_reduction_driver.cxx b/src/reduction_driver/host/basic_mpi_reduction_driver.cxx index 904f7caf0..beaa20948 100644 --- a/src/reduction_driver/host/basic_mpi_reduction_driver.cxx +++ b/src/reduction_driver/host/basic_mpi_reduction_driver.cxx @@ -10,11 +10,14 @@ * See LICENSE.txt for details */ #include "basic_mpi_reduction_driver.hpp" +#include #include #include #include #include #include +#include +#include namespace GauXC { @@ -23,7 +26,8 @@ MPI_Datatype get_mpi_datatype( std::type_index idx ) { static std::map map { {std::type_index(typeid(double)), MPI_DOUBLE}, - {std::type_index(typeid(float)), MPI_FLOAT} + {std::type_index(typeid(float)), MPI_FLOAT}, + {std::type_index(typeid(unsigned long long)), MPI_UNSIGNED_LONG_LONG} }; return map.at(idx); @@ -45,7 +49,8 @@ size_t get_dtype_size( std::type_index idx ) { static std::map map { {std::type_index(typeid(double)), sizeof(double)}, - {std::type_index(typeid(float)), sizeof(float)} + {std::type_index(typeid(float)), sizeof(float)}, + {std::type_index(typeid(unsigned long long)), sizeof(unsigned long long)} }; return map.at(idx); @@ -108,6 +113,42 @@ void BasicMPIReductionDriver::allreduce_inplace_typeerased( void* data, size_t s } } +void BasicMPIReductionDriver::allgather_v_typeerased( const void* src, size_t size, + std::vector& dest, std::type_index idx, std::any optional_args ) { + + if( optional_args.has_value() ) + std::cout << "** Warning: Optional Args Are Not Used in BasiMPIReductionDriver::allgather_v" << std::endl; + + const auto dtype_size = get_dtype_size(idx); + const int world_size = runtime_.comm_size(); + if( world_size == 1 ) { + dest.resize( size * dtype_size ); + std::memcpy( dest.data(), src, dest.size() ); + return; + } + +#ifdef GAUXC_HAS_MPI + std::vector sizes(world_size); + const auto local_size = static_cast(size); + MPI_Allgather( &local_size, 1, MPI_UNSIGNED_LONG_LONG, sizes.data(), 1, + MPI_UNSIGNED_LONG_LONG, runtime_.comm() ); + + std::vector counts(world_size), displs(world_size); + size_t total_size = 0; + for( int i = 0; i < world_size; ++i ) { + if( sizes[i] > static_cast(std::numeric_limits::max()) ) + GAUXC_GENERIC_EXCEPTION("allgather_v message count exceeds MPI int range"); + counts[i] = static_cast( sizes[i] ); + displs[i] = static_cast( total_size ); + total_size += static_cast( sizes[i] ); + } + + dest.resize( total_size * dtype_size ); + MPI_Allgatherv( src, static_cast(size), get_mpi_datatype(idx), dest.data(), + counts.data(), displs.data(), get_mpi_datatype(idx), runtime_.comm() ); +#endif +} + std::unique_ptr BasicMPIReductionDriver::clone() { return std::make_unique(*this); } diff --git a/src/reduction_driver/host/basic_mpi_reduction_driver.hpp b/src/reduction_driver/host/basic_mpi_reduction_driver.hpp index 8172edc8b..a1aeb13c1 100644 --- a/src/reduction_driver/host/basic_mpi_reduction_driver.hpp +++ b/src/reduction_driver/host/basic_mpi_reduction_driver.hpp @@ -22,6 +22,7 @@ struct BasicMPIReductionDriver : public HostReductionDriver { void allreduce_typeerased( const void*, void*, size_t, ReductionOp, std::type_index, std::any ) override; void allreduce_inplace_typeerased( void*, size_t, ReductionOp, std::type_index, std::any ) override; + void allgather_v_typeerased( const void*, size_t, std::vector&, std::type_index, std::any ) override; std::unique_ptr clone() override; diff --git a/src/reduction_driver/reduction_driver.cxx b/src/reduction_driver/reduction_driver.cxx index 26a57415f..60f2a0c51 100644 --- a/src/reduction_driver/reduction_driver.cxx +++ b/src/reduction_driver/reduction_driver.cxx @@ -38,6 +38,11 @@ void ReductionDriver::allreduce_inplace_typeerased( void* data, size_t size, Red pimpl_->allreduce_inplace_typeerased(data, size, op, idx, optional_args); } +void ReductionDriver::allgather_v_typeerased( const void* src, size_t size, std::vector& dest, std::type_index idx, std::any optional_args ) const { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + pimpl_->allgather_v_typeerased(src, size, dest, idx, optional_args); +} + bool ReductionDriver::takes_host_memory() const { if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); return pimpl_->takes_host_memory(); @@ -47,6 +52,16 @@ bool ReductionDriver::takes_device_memory() const { return pimpl_->takes_device_memory(); } +int ReductionDriver::comm_rank() const { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->comm_rank(); +} + +int ReductionDriver::comm_size() const { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->comm_size(); +} + diff --git a/src/reduction_driver/reduction_driver_factory.cxx b/src/reduction_driver/reduction_driver_factory.cxx index 8b3d5f348..fb7cd0bf5 100644 --- a/src/reduction_driver/reduction_driver_factory.cxx +++ b/src/reduction_driver/reduction_driver_factory.cxx @@ -23,6 +23,13 @@ namespace GauXC { +ReductionDriver ReductionDriverFactory::get_instance( + const RuntimeEnvironment& rt, std::string kernel_name ) { + + auto driver = get_shared_instance(rt, std::move(kernel_name)); + return *driver; +} + std::shared_ptr ReductionDriverFactory::get_shared_instance( const RuntimeEnvironment& rt, std::string kernel_name ) { diff --git a/src/reduction_driver/reduction_driver_impl.cxx b/src/reduction_driver/reduction_driver_impl.cxx index 96351265a..3bee6b396 100644 --- a/src/reduction_driver/reduction_driver_impl.cxx +++ b/src/reduction_driver/reduction_driver_impl.cxx @@ -10,6 +10,7 @@ * See LICENSE.txt for details */ #include "reduction_driver_impl.hpp" +#include namespace GauXC::detail { @@ -20,4 +21,12 @@ ReductionDriverImpl::ReductionDriverImpl( const RuntimeEnvironment& rt ) ReductionDriverImpl::~ReductionDriverImpl() noexcept = default; ReductionDriverImpl::ReductionDriverImpl(const ReductionDriverImpl& ) = default; +void ReductionDriverImpl::allgather_v_typeerased( const void*, size_t, + std::vector&, std::type_index, std::any ) { + GAUXC_GENERIC_EXCEPTION("Variable-size allgather is not supported by this ReductionDriver"); +} + +int ReductionDriverImpl::comm_rank() const { return runtime_.comm_rank(); } +int ReductionDriverImpl::comm_size() const { return runtime_.comm_size(); } + } diff --git a/src/reduction_driver/reduction_driver_impl.hpp b/src/reduction_driver/reduction_driver_impl.hpp index e4980a9a1..6aefc66ac 100644 --- a/src/reduction_driver/reduction_driver_impl.hpp +++ b/src/reduction_driver/reduction_driver_impl.hpp @@ -11,6 +11,8 @@ */ #pragma once #include +#include +#include namespace GauXC { namespace detail { @@ -31,9 +33,12 @@ class ReductionDriverImpl { virtual void allreduce_typeerased( const void*, void*, size_t, ReductionOp, std::type_index, std::any ) = 0; virtual void allreduce_inplace_typeerased( void*, size_t, ReductionOp, std::type_index, std::any ) = 0; + virtual void allgather_v_typeerased( const void*, size_t, std::vector&, std::type_index, std::any ); virtual bool takes_host_memory() const = 0; virtual bool takes_device_memory() const = 0; + int comm_rank() const; + int comm_size() const; virtual std::unique_ptr clone() = 0; }; diff --git a/src/xc_integrator/integrator_util/CMakeLists.txt b/src/xc_integrator/integrator_util/CMakeLists.txt index 0a0edbe86..fc50fdc48 100644 --- a/src/xc_integrator/integrator_util/CMakeLists.txt +++ b/src/xc_integrator/integrator_util/CMakeLists.txt @@ -9,4 +9,4 @@ # # See LICENSE.txt for details # -target_sources( gauxc PRIVATE integrator_common.cxx integral_bounds.cxx exx_screening.cxx spherical_harmonics.cxx ) +target_sources( gauxc PRIVATE integrator_common.cxx integral_bounds.cxx exx_screening.cxx spherical_harmonics.cxx vv10.cxx ) diff --git a/src/xc_integrator/integrator_util/vv10.cxx b/src/xc_integrator/integrator_util/vv10.cxx new file mode 100644 index 000000000..8a9dde521 --- /dev/null +++ b/src/xc_integrator/integrator_util/vv10.cxx @@ -0,0 +1,403 @@ +#include "integrator_util/vv10.hpp" + +#include +#include +#include + +namespace GauXC::detail::vv10 { + +namespace { + +constexpr double pi = 3.141592653589793238462643383279502884; +constexpr double four_pi_over_three = 4.0 * pi / 3.0; + +struct Vec3 { + double x = 0.0; + double y = 0.0; + double z = 0.0; +}; + +Vec3 load_point(const double* coords, std::int64_t i) { + return {coords[3*i + 0], coords[3*i + 1], coords[3*i + 2]}; +} + +double distance2(Vec3 a, Vec3 b) { + const auto dx = a.x - b.x; + const auto dy = a.y - b.y; + const auto dz = a.z - b.z; + return dx*dx + dy*dy + dz*dz; +} + +} + +double kappa_prefactor(const Parameters& params) { + return params.b * 1.5 * pi * std::pow(9.0 * pi, -1.0/6.0); +} + +double beta(const Parameters& params) { + return std::pow(3.0 / (params.b * params.b), 0.75) / 32.0; +} + +void eval_omega_kappa(std::int64_t npts, const double* rho, const double* gamma, + const Parameters& params, double* omega, double* kappa, + double* domega_drho, double* domega_dgamma, double* d2omega_drho2, + double* d2omega_dgamma2, double* d2omega_drho_dgamma, + double* dkappa_drho, double* d2kappa_drho2) { + + const auto kappa_pref = kappa_prefactor(params); + + #pragma omp parallel for schedule(static) + for(std::int64_t i = 0; i < npts; ++i) { + const auto rho_i = rho[i]; + const auto gamma_i = gamma[i]; + + if(rho_i < params.rho_threshold) { + omega[i] = 0.0; + kappa[i] = 0.0; + if(domega_drho) domega_drho[i] = 0.0; + if(domega_dgamma) domega_dgamma[i] = 0.0; + if(d2omega_drho2) d2omega_drho2[i] = 0.0; + if(d2omega_dgamma2) d2omega_dgamma2[i] = 0.0; + if(d2omega_drho_dgamma) d2omega_drho_dgamma[i] = 0.0; + if(dkappa_drho) dkappa_drho[i] = 0.0; + if(d2kappa_drho2) d2kappa_drho2[i] = 0.0; + continue; + } + + const auto rho_1 = 1.0 / rho_i; + const auto rho_2 = rho_1 * rho_1; + const auto rho_3 = rho_1 * rho_2; + const auto rho_4 = rho_2 * rho_2; + const auto rho_5 = rho_1 * rho_4; + const auto gamma2 = gamma_i * gamma_i; + const auto omega2 = params.c * gamma2 * rho_4 + four_pi_over_three * rho_i; + const auto omega_i = std::sqrt(omega2); + const auto omega_1 = 1.0 / omega_i; + + omega[i] = omega_i; + kappa[i] = kappa_pref * std::pow(rho_i, 1.0/6.0); + + if(domega_drho) { + domega_drho[i] = 0.5 * (four_pi_over_three - 4.0 * params.c * gamma2 * rho_5) * omega_1; + } + if(domega_dgamma) { + domega_dgamma[i] = params.c * gamma_i * rho_4 * omega_1; + } + if(d2omega_drho2 or d2omega_dgamma2 or d2omega_drho_dgamma) { + const auto omega_3 = omega_1 / omega2; + if(d2omega_drho2) { + d2omega_drho2[i] = (-0.25 * four_pi_over_three * four_pi_over_three + + 12.0 * four_pi_over_three * params.c * gamma2 * rho_5 + + 6.0 * params.c * params.c * gamma2 * gamma2 * rho_5 * rho_5) * omega_3; + } + if(d2omega_dgamma2) { + d2omega_dgamma2[i] = four_pi_over_three * params.c * rho_3 * omega_3; + } + if(d2omega_drho_dgamma) { + d2omega_drho_dgamma[i] = -params.c * gamma_i * (4.5 * four_pi_over_three * rho_4 + + 2.0 * params.c * gamma2 * rho_4 * rho_5) * omega_3; + } + } + if(dkappa_drho) { + dkappa_drho[i] = kappa_pref * (1.0/6.0) * std::pow(rho_i, -5.0/6.0); + } + if(d2kappa_drho2) { + d2kappa_drho2[i] = kappa_pref * (-5.0/36.0) * std::pow(rho_i, -11.0/6.0); + } + } +} + +void eval_fock(std::int64_t npts, const double* coords, const double* omega, + const double* kappa, std::int64_t vv_npts, const double* vv_coords, + const double* vv_rho_weight, const double* vv_omega, + const double* vv_kappa, double* F, double* U, double* W) { + + #pragma omp parallel for schedule(static) + for(std::int64_t i = 0; i < npts; ++i) { + const auto r_i = load_point(coords, i); + const auto omega_i = omega[i]; + const auto kappa_i = kappa[i]; + double F_i = 0.0; + double U_i = 0.0; + double W_i = 0.0; + + for(std::int64_t j = 0; j < vv_npts; ++j) { + const auto r_j = load_point(vv_coords, j); + const auto r_ij2 = distance2(r_j, r_i); + const auto g_ij = r_ij2 * omega_i + kappa_i; + const auto g_ji = r_ij2 * vv_omega[j] + vv_kappa[j]; + const auto g_sum = g_ij + g_ji; + auto pair = vv_rho_weight[j] / (g_ij * g_ji * g_sum); + F_i += pair; + pair *= 1.0/g_ij + 1.0/g_sum; + U_i += pair; + W_i += pair * r_ij2; + } + + F[i] = -1.5 * F_i; + U[i] = U_i; + W[i] = W_i; + } +} + +void eval_exc_vxc(std::int64_t npts, const double* coords, + const double* weights, const double* rho, const double* gamma, + const Parameters& params, double* eps, double* vrho, double* vgamma) { + + std::vector omega(npts), kappa(npts), domega_drho(npts), domega_dgamma(npts); + std::vector F(npts), U(npts), W(npts), rho_weight(npts); + + eval_omega_kappa(npts, rho, gamma, params, omega.data(), kappa.data(), + domega_drho.data(), domega_dgamma.data()); + + for(std::int64_t i = 0; i < npts; ++i) { + rho_weight[i] = rho[i] >= params.rho_threshold ? rho[i] * weights[i] : 0.0; + } + + eval_fock(npts, coords, omega.data(), kappa.data(), npts, coords, + rho_weight.data(), omega.data(), kappa.data(), F.data(), U.data(), W.data()); + + const auto beta_value = beta(params); + const auto kappa_pref = kappa_prefactor(params); + + #pragma omp parallel for schedule(static) + for(std::int64_t i = 0; i < npts; ++i) { + if(rho[i] < params.rho_threshold) { + eps[i] = 0.0; + vrho[i] = 0.0; + vgamma[i] = 0.0; + continue; + } + + const auto scaled_dkappa_drho = (1.0/6.0) * kappa_pref * std::pow(rho[i], 1.0/6.0); + const auto scaled_domega_drho = rho[i] * domega_drho[i]; + const auto scaled_domega_dgamma = rho[i] * domega_dgamma[i]; + eps[i] = beta_value + 0.5 * F[i]; + vrho[i] = beta_value + F[i] + 1.5 * (U[i] * scaled_dkappa_drho + W[i] * scaled_domega_drho); + vgamma[i] = 1.5 * W[i] * scaled_domega_dgamma; + } +} + +void eval_grid_gradient(std::int64_t npts, const double* coords, + const double* omega, const double* kappa, std::int64_t vv_npts, + const double* vv_coords, const double* vv_rho_weight, + const double* vv_omega, const double* vv_kappa, double* gradient) { + + #pragma omp parallel for schedule(static) + for(std::int64_t i = 0; i < npts; ++i) { + const auto r_i = load_point(coords, i); + double gx = 0.0; + double gy = 0.0; + double gz = 0.0; + + for(std::int64_t j = 0; j < vv_npts; ++j) { + const auto r_j = load_point(vv_coords, j); + const auto dx = r_j.x - r_i.x; + const auto dy = r_j.y - r_i.y; + const auto dz = r_j.z - r_i.z; + const auto r_ij2 = dx*dx + dy*dy + dz*dz; + const auto g_ij = r_ij2 * omega[i] + kappa[i]; + const auto g_ji = r_ij2 * vv_omega[j] + vv_kappa[j]; + const auto g_sum = g_ij + g_ji; + const auto pair = vv_rho_weight[j] / (g_ij * g_ji * g_sum); + const auto prefactor = pair * (omega[i]/g_ij + vv_omega[j]/g_ji + (omega[i] + vv_omega[j])/g_sum); + gx += prefactor * dx; + gy += prefactor * dy; + gz += prefactor * dz; + } + + gradient[3*i + 0] = -3.0 * gx; + gradient[3*i + 1] = -3.0 * gy; + gradient[3*i + 2] = -3.0 * gz; + } +} + +void eval_hessian_intermediates(std::int64_t npts, const double* coords, + const double* weights, const double* rho, const double* omega, + const double* kappa, double* U, double* W, double* A, double* B, + double* C, double* E) { + + #pragma omp parallel for schedule(static) + for(std::int64_t i = 0; i < npts; ++i) { + const auto r_i = load_point(coords, i); + double U_i = 0.0; + double W_i = 0.0; + double A_i = 0.0; + double B_i = 0.0; + double C_i = 0.0; + double E_i = 0.0; + + for(std::int64_t j = 0; j < npts; ++j) { + const auto r_j = load_point(coords, j); + const auto r_ij2 = distance2(r_i, r_j); + const auto g_ij = omega[i] * r_ij2 + kappa[i]; + const auto g_ji = omega[j] * r_ij2 + kappa[j]; + const auto g_ij_1 = 1.0 / g_ij; + const auto g_ji_1 = 1.0 / g_ji; + const auto g_sum_1 = 1.0 / (g_ij + g_ji); + const auto phi = -1.5 * g_ij_1 * g_ji_1 * g_sum_1; + const auto E_ij = weights[j] * rho[j] * phi; + const auto U_ij = E_ij * (g_sum_1 + g_ij_1); + const auto W_ij = U_ij * r_ij2; + const auto A_ij = E_ij * (g_sum_1*g_sum_1 + g_sum_1*g_ij_1 + g_ij_1*g_ij_1); + const auto B_ij = A_ij * r_ij2; + const auto C_ij = B_ij * r_ij2; + U_i += U_ij; + W_i += W_ij; + A_i += A_ij; + B_i += B_ij; + C_i += C_ij; + E_i += E_ij; + } + + U[i] = -U_i; + W[i] = -W_i; + A[i] = 2.0 * A_i; + B[i] = 2.0 * B_i; + C[i] = 2.0 * C_i; + E[i] = E_i; + } +} + +void eval_hessian_contraction(std::int64_t npts, std::int64_t ntrial, + const double* coords, const double* weights, const double* rho, + const double* omega, const double* kappa, const double* U, + const double* W, const double* A, const double* B, const double* C, + const double* domega_drho, const double* domega_dgamma, + const double* dkappa_drho, const double* d2omega_drho2, + const double* d2omega_dgamma2, const double* d2omega_drho_dgamma, + const double* d2kappa_drho2, const double* rho_t, + const double* gamma_t, double* f_rho_t, double* f_gamma_t) { + + #pragma omp parallel for collapse(2) schedule(static) + for(std::int64_t i = 0; i < npts; ++i) { + for(std::int64_t trial = 0; trial < ntrial; ++trial) { + const auto r_i = load_point(coords, i); + double f_rho_i = 0.0; + double f_gamma_i = 0.0; + + for(std::int64_t j = 0; j < npts; ++j) { + const auto r_j = load_point(coords, j); + const auto r_ij2 = distance2(r_i, r_j); + const auto g_ij = omega[i] * r_ij2 + kappa[i]; + const auto g_ji = omega[j] * r_ij2 + kappa[j]; + const auto g_ij_1 = 1.0 / g_ij; + const auto g_ji_1 = 1.0 / g_ji; + const auto g_sum_1 = 1.0 / (g_ij + g_ji); + const auto phi = -1.5 * g_ij_1 * g_ji_1 * g_sum_1; + const auto rho_dgdrho_i = rho[i] * (r_ij2 * domega_drho[i] + dkappa_drho[i]); + const auto rho_dgdrho_j = rho[j] * (r_ij2 * domega_drho[j] + dkappa_drho[j]); + const auto d2phi_dgij_dgji_over_phi = 2.0 * (g_sum_1*g_sum_1 + g_ij_1*g_ji_1); + + const auto f_rho_rho_ij = phi * (rho_dgdrho_i * rho_dgdrho_j * d2phi_dgij_dgji_over_phi + - rho_dgdrho_i * (g_sum_1 + g_ij_1) - rho_dgdrho_j * (g_sum_1 + g_ji_1) + 1.0); + const auto f_gamma_rho_ij = rho[i] * domega_dgamma[i] * r_ij2 * phi + * (rho_dgdrho_j * d2phi_dgij_dgji_over_phi - (g_sum_1 + g_ij_1)); + const auto f_rho_gamma_ij = rho[j] * domega_dgamma[j] * r_ij2 * phi + * (rho_dgdrho_i * d2phi_dgij_dgji_over_phi - (g_sum_1 + g_ji_1)); + const auto f_gamma_gamma_ij = rho[i] * rho[j] * domega_dgamma[i] * domega_dgamma[j] + * r_ij2 * r_ij2 * phi * d2phi_dgij_dgji_over_phi; + + const auto offset = trial * npts + j; + f_rho_i += weights[j] * (f_rho_rho_ij * rho_t[offset] + f_rho_gamma_ij * gamma_t[offset]); + f_gamma_i += weights[j] * (f_gamma_rho_ij * rho_t[offset] + f_gamma_gamma_ij * gamma_t[offset]); + } + + const auto f_rho_rho_ii = 2.0 * domega_drho[i] * W[i] + 2.0 * dkappa_drho[i] * U[i] + + rho[i] * (d2omega_drho2[i] * W[i] + d2kappa_drho2[i] * U[i] + + dkappa_drho[i] * dkappa_drho[i] * A[i] + domega_drho[i] * domega_drho[i] * C[i] + + 2.0 * domega_drho[i] * dkappa_drho[i] * B[i]); + const auto f_gamma_rho_ii = domega_dgamma[i] * W[i] + rho[i] * (d2omega_drho_dgamma[i] * W[i] + + domega_dgamma[i] * (dkappa_drho[i] * B[i] + domega_drho[i] * C[i])); + const auto f_gamma_gamma_ii = rho[i] * (d2omega_dgamma2[i] * W[i] + + domega_dgamma[i] * domega_dgamma[i] * C[i]); + const auto ioffset = trial * npts + i; + + f_rho_i += f_rho_rho_ii * rho_t[ioffset] + f_gamma_rho_ii * gamma_t[ioffset]; + f_gamma_i += f_gamma_rho_ii * rho_t[ioffset] + f_gamma_gamma_ii * gamma_t[ioffset]; + + f_rho_t[ioffset] = f_rho_i; + f_gamma_t[ioffset] = f_gamma_i; + } + } +} + +void eval_grid_response(std::int64_t npts, std::int64_t natoms, + const double* coords, const double* weights, const double* rho, + const double* omega, const double* kappa, + const std::int32_t* grid_associated_atom, double* Egr, double* Ugr, + double* Wgr) { + + #pragma omp parallel for collapse(2) schedule(static) + for(std::int64_t i = 0; i < npts; ++i) { + for(std::int64_t atom = 0; atom < natoms; ++atom) { + const auto i_atom = grid_associated_atom[i]; + const auto base = atom * 3 * npts + i; + if(i_atom < 0) { + Egr[base + 0*npts] = 0.0; + Egr[base + 1*npts] = 0.0; + Egr[base + 2*npts] = 0.0; + Ugr[base + 0*npts] = 0.0; + Ugr[base + 1*npts] = 0.0; + Ugr[base + 2*npts] = 0.0; + Wgr[base + 0*npts] = 0.0; + Wgr[base + 1*npts] = 0.0; + Wgr[base + 2*npts] = 0.0; + continue; + } + + const auto r_i = load_point(coords, i); + const auto i_in_atom = i_atom == atom; + Vec3 Eg{}; + Vec3 Ug{}; + Vec3 Wg{}; + + for(std::int64_t j = 0; j < npts; ++j) { + const auto j_atom = grid_associated_atom[j]; + if(j_atom < 0) continue; + const auto j_in_atom = j_atom == atom; + if((not i_in_atom and not j_in_atom) or (i_in_atom and j_in_atom)) continue; + + const auto r_j = load_point(coords, j); + const Vec3 r_ji{r_j.x - r_i.x, r_j.y - r_i.y, r_j.z - r_i.z}; + const auto r_ij2 = r_ji.x*r_ji.x + r_ji.y*r_ji.y + r_ji.z*r_ji.z; + const auto g_ij = omega[i] * r_ij2 + kappa[i]; + const auto g_ji = omega[j] * r_ij2 + kappa[j]; + const auto g_ij_1 = 1.0 / g_ij; + const auto g_ji_1 = 1.0 / g_ji; + const auto g_sum_1 = 1.0 / (g_ij + g_ji); + const auto phi = -1.5 * g_ij_1 * g_ji_1 * g_sum_1; + const auto E_ij = weights[j] * rho[j] * phi; + const auto dphi_drj_over_phi = omega[i]*g_ij_1 + omega[j]*g_ji_1 + (omega[i] + omega[j])*g_sum_1; + const auto d2phi_dgij_drj_over_phi = omega[i]*g_ij_1*g_ij_1 + (omega[i] + omega[j])*g_sum_1*g_sum_1; + const auto dphi_dgij_over_phi = g_sum_1 + g_ij_1; + const auto Egr_ij = E_ij * dphi_drj_over_phi; + const auto Ugr_ij = E_ij * (dphi_drj_over_phi*dphi_dgij_over_phi + d2phi_dgij_drj_over_phi); + const auto Wgr_ij = E_ij * (r_ij2 * (dphi_drj_over_phi*dphi_dgij_over_phi + d2phi_dgij_drj_over_phi) - dphi_dgij_over_phi); + + Eg.x += Egr_ij * r_ji.x; + Eg.y += Egr_ij * r_ji.y; + Eg.z += Egr_ij * r_ji.z; + Ug.x += Ugr_ij * r_ji.x; + Ug.y += Ugr_ij * r_ji.y; + Ug.z += Ugr_ij * r_ji.z; + Wg.x += Wgr_ij * r_ji.x; + Wg.y += Wgr_ij * r_ji.y; + Wg.z += Wgr_ij * r_ji.z; + } + + const auto atom_sign = i_in_atom ? -1.0 : 1.0; + Egr[base + 0*npts] = -2.0 * atom_sign * Eg.x; + Egr[base + 1*npts] = -2.0 * atom_sign * Eg.y; + Egr[base + 2*npts] = -2.0 * atom_sign * Eg.z; + Ugr[base + 0*npts] = 2.0 * atom_sign * Ug.x; + Ugr[base + 1*npts] = 2.0 * atom_sign * Ug.y; + Ugr[base + 2*npts] = 2.0 * atom_sign * Ug.z; + Wgr[base + 0*npts] = 2.0 * atom_sign * Wg.x; + Wgr[base + 1*npts] = 2.0 * atom_sign * Wg.y; + Wgr[base + 2*npts] = 2.0 * atom_sign * Wg.z; + } + } +} + +} \ No newline at end of file diff --git a/src/xc_integrator/integrator_util/vv10.hpp b/src/xc_integrator/integrator_util/vv10.hpp new file mode 100644 index 000000000..7ac08ecfc --- /dev/null +++ b/src/xc_integrator/integrator_util/vv10.hpp @@ -0,0 +1,59 @@ +#pragma once + +#include + +namespace GauXC::detail::vv10 { + +struct Parameters { + double b = 6.3; + double c = 0.0093; + double rho_threshold = 1e-8; + std::int32_t pair_math_mode = 0; +}; + +double kappa_prefactor(const Parameters& params); +double beta(const Parameters& params); + +void eval_omega_kappa(std::int64_t npts, const double* rho, const double* gamma, + const Parameters& params, double* omega, double* kappa, + double* domega_drho = nullptr, double* domega_dgamma = nullptr, + double* d2omega_drho2 = nullptr, double* d2omega_dgamma2 = nullptr, + double* d2omega_drho_dgamma = nullptr, double* dkappa_drho = nullptr, + double* d2kappa_drho2 = nullptr); + +void eval_fock(std::int64_t npts, const double* coords, const double* omega, + const double* kappa, std::int64_t vv_npts, const double* vv_coords, + const double* vv_rho_weight, const double* vv_omega, + const double* vv_kappa, double* F, double* U, double* W); + +void eval_exc_vxc(std::int64_t npts, const double* coords, + const double* weights, const double* rho, const double* gamma, + const Parameters& params, double* eps, double* vrho, double* vgamma); + +void eval_grid_gradient(std::int64_t npts, const double* coords, + const double* omega, const double* kappa, std::int64_t vv_npts, + const double* vv_coords, const double* vv_rho_weight, + const double* vv_omega, const double* vv_kappa, double* gradient); + +void eval_hessian_intermediates(std::int64_t npts, const double* coords, + const double* weights, const double* rho, const double* omega, + const double* kappa, double* U, double* W, double* A, double* B, + double* C, double* E); + +void eval_hessian_contraction(std::int64_t npts, std::int64_t ntrial, + const double* coords, const double* weights, const double* rho, + const double* omega, const double* kappa, const double* U, + const double* W, const double* A, const double* B, const double* C, + const double* domega_drho, const double* domega_dgamma, + const double* dkappa_drho, const double* d2omega_drho2, + const double* d2omega_dgamma2, const double* d2omega_drho_dgamma, + const double* d2kappa_drho2, const double* rho_t, + const double* gamma_t, double* f_rho_t, double* f_gamma_t); + +void eval_grid_response(std::int64_t npts, std::int64_t natoms, + const double* coords, const double* weights, const double* rho, + const double* omega, const double* kappa, + const std::int32_t* grid_associated_atom, double* Egr, double* Ugr, + double* Wgr); + +} \ No newline at end of file diff --git a/src/xc_integrator/local_work_driver/device/cuda/CMakeLists.txt b/src/xc_integrator/local_work_driver/device/cuda/CMakeLists.txt index 8c0608f77..76234f965 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/device/cuda/CMakeLists.txt @@ -25,6 +25,7 @@ target_sources(gauxc PRIVATE kernels/uvvars.cu kernels/zmat_vxc.cu kernels/zmat_fxc.cu + kernels/vv10.cu kernels/cuda_inc_potential.cu kernels/symmetrize_mat.cu kernels/increment_exc_grad.cu diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/vv10.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/vv10.cu new file mode 100644 index 000000000..29703bc83 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/vv10.cu @@ -0,0 +1,687 @@ +#include "vv10.hpp" + +#ifdef GAUXC_HAS_CUDA + +#include "exceptions/cuda_exception.hpp" +#include +#include + +namespace GauXC::detail::vv10 { + +namespace { + +constexpr int threads_per_block = 128; +constexpr double pi = 3.141592653589793238462643383279502884; +constexpr double four_pi_over_three = 4.0 * pi / 3.0; + +__device__ double3 load_point(const double* coords, int i); +__device__ double distance2(double3 a, double3 b); + +__device__ double cuda_kappa_prefactor(Parameters params) { + return params.b * 1.5 * pi * pow(9.0 * pi, -1.0/6.0); +} + +__device__ double cuda_beta(Parameters params) { + return pow(3.0 / (params.b * params.b), 0.75) / 32.0; +} + +__device__ double3 load_point(const double* coords, int i) { + return {coords[3*i + 0], coords[3*i + 1], coords[3*i + 2]}; +} + +__device__ double distance2(double3 a, double3 b) { + const auto dx = a.x - b.x; + const auto dy = a.y - b.y; + const auto dz = a.z - b.z; + return dx*dx + dy*dy + dz*dz; +} + +__global__ void omega_kappa_kernel(int npts, const double* rho, + const double* gamma, Parameters params, double* omega, double* kappa, + double* domega_drho, double* domega_dgamma, double* d2omega_drho2, + double* d2omega_dgamma2, double* d2omega_drho_dgamma, + double* dkappa_drho, double* d2kappa_drho2) { + + const auto i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= npts) return; + + const auto rho_i = rho[i]; + const auto gamma_i = gamma[i]; + if(rho_i < params.rho_threshold) { + omega[i] = 0.0; + kappa[i] = 0.0; + if(domega_drho) domega_drho[i] = 0.0; + if(domega_dgamma) domega_dgamma[i] = 0.0; + if(d2omega_drho2) d2omega_drho2[i] = 0.0; + if(d2omega_dgamma2) d2omega_dgamma2[i] = 0.0; + if(d2omega_drho_dgamma) d2omega_drho_dgamma[i] = 0.0; + if(dkappa_drho) dkappa_drho[i] = 0.0; + if(d2kappa_drho2) d2kappa_drho2[i] = 0.0; + return; + } + + const auto kappa_pref = cuda_kappa_prefactor(params); + const auto rho_1 = 1.0 / rho_i; + const auto rho_2 = rho_1 * rho_1; + const auto rho_3 = rho_1 * rho_2; + const auto rho_4 = rho_2 * rho_2; + const auto rho_5 = rho_1 * rho_4; + const auto gamma2 = gamma_i * gamma_i; + const auto omega2 = params.c * gamma2 * rho_4 + four_pi_over_three * rho_i; + const auto omega_i = sqrt(omega2); + const auto omega_1 = 1.0 / omega_i; + + omega[i] = omega_i; + kappa[i] = kappa_pref * pow(rho_i, 1.0/6.0); + if(domega_drho) domega_drho[i] = 0.5 * (four_pi_over_three - 4.0 * params.c * gamma2 * rho_5) * omega_1; + if(domega_dgamma) domega_dgamma[i] = params.c * gamma_i * rho_4 * omega_1; + if(d2omega_drho2 || d2omega_dgamma2 || d2omega_drho_dgamma) { + const auto omega_3 = omega_1 / omega2; + if(d2omega_drho2) { + d2omega_drho2[i] = (-0.25 * four_pi_over_three * four_pi_over_three + + 12.0 * four_pi_over_three * params.c * gamma2 * rho_5 + + 6.0 * params.c * params.c * gamma2 * gamma2 * rho_5 * rho_5) * omega_3; + } + if(d2omega_dgamma2) d2omega_dgamma2[i] = four_pi_over_three * params.c * rho_3 * omega_3; + if(d2omega_drho_dgamma) { + d2omega_drho_dgamma[i] = -params.c * gamma_i * (4.5 * four_pi_over_three * rho_4 + + 2.0 * params.c * gamma2 * rho_4 * rho_5) * omega_3; + } + } + if(dkappa_drho) dkappa_drho[i] = kappa_pref * (1.0/6.0) * pow(rho_i, -5.0/6.0); + if(d2kappa_drho2) d2kappa_drho2[i] = kappa_pref * (-5.0/36.0) * pow(rho_i, -11.0/6.0); +} + +__global__ void fock_kernel(int npts, const double* coords, + const double* omega, const double* kappa, int vv_npts, + const double* vv_coords, const double* vv_rho_weight, + const double* vv_omega, const double* vv_kappa, double* F, double* U, + double* W) { + + __shared__ double3 coords_tile[threads_per_block]; + __shared__ double rho_weight_tile[threads_per_block]; + __shared__ double omega_tile[threads_per_block]; + __shared__ double kappa_tile[threads_per_block]; + + const auto i = blockIdx.x * blockDim.x + threadIdx.x; + const auto active_i = i < npts; + + const auto r_i = active_i ? load_point(coords, i) : double3{0.0, 0.0, 0.0}; + const auto omega_i = active_i ? omega[i] : 0.0; + const auto kappa_i = active_i ? kappa[i] : 0.0; + double F_i = 0.0; + double U_i = 0.0; + double W_i = 0.0; + + for(int tile = 0; tile < vv_npts; tile += blockDim.x) { + const auto j_load = tile + threadIdx.x; + if(j_load < vv_npts) { + coords_tile[threadIdx.x] = load_point(vv_coords, j_load); + rho_weight_tile[threadIdx.x] = vv_rho_weight[j_load]; + omega_tile[threadIdx.x] = vv_omega[j_load]; + kappa_tile[threadIdx.x] = vv_kappa[j_load]; + } + __syncthreads(); + + if(active_i) { + const auto tile_npts = min(blockDim.x, vv_npts - tile); + for(int j = 0; j < tile_npts; ++j) { + const auto r_ij2 = distance2(coords_tile[j], r_i); + const auto g_ij = r_ij2 * omega_i + kappa_i; + const auto g_ji = r_ij2 * omega_tile[j] + kappa_tile[j]; + const auto g_sum = g_ij + g_ji; + const auto g_ij_inv = 1.0 / g_ij; + const auto g_ji_inv = 1.0 / g_ji; + const auto g_sum_inv = 1.0 / g_sum; + auto pair = rho_weight_tile[j] * g_ij_inv * g_ji_inv * g_sum_inv; + F_i += pair; + pair *= g_ij_inv + g_sum_inv; + U_i += pair; + W_i += pair * r_ij2; + } + } + __syncthreads(); + } + + if(active_i) { + F[i] = -1.5 * F_i; + U[i] = U_i; + W[i] = W_i; + } +} + +__global__ void exc_vxc_from_fock_kernel(int npts, const double* rho, + const double* gamma, const double* F, const double* U, const double* W, + Parameters params, double* eps, double* vrho, double* vgamma) { + + const auto i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= npts) return; + if(rho[i] < params.rho_threshold) { + eps[i] = 0.0; + vrho[i] = 0.0; + vgamma[i] = 0.0; + return; + } + + const auto rho_i = rho[i]; + const auto gamma_i = gamma[i]; + const auto rho_1 = 1.0 / rho_i; + const auto rho_2 = rho_1 * rho_1; + const auto rho_4 = rho_2 * rho_2; + const auto rho_5 = rho_1 * rho_4; + const auto gamma2 = gamma_i * gamma_i; + const auto omega2 = params.c * gamma2 * rho_4 + four_pi_over_three * rho_i; + const auto omega_i = sqrt(omega2); + const auto domega_drho = 0.5 * (four_pi_over_three - 4.0 * params.c * gamma2 * rho_5) / omega_i; + const auto domega_dgamma = params.c * gamma_i * rho_4 / omega_i; + const auto kappa_pref = cuda_kappa_prefactor(params); + const auto scaled_dkappa_drho = (1.0/6.0) * kappa_pref * pow(rho_i, 1.0/6.0); + + eps[i] = cuda_beta(params) + 0.5 * F[i]; + vrho[i] = cuda_beta(params) + F[i] + 1.5 * (U[i] * scaled_dkappa_drho + W[i] * rho_i * domega_drho); + vgamma[i] = 1.5 * W[i] * rho_i * domega_dgamma; +} + +template +__global__ void exc_vxc_fock_kernel(int npts, const double* coords, + const double* rho, const double* gamma, const double* omega, + const double* kappa, const double* rho_weight, Parameters params, + double* eps, double* vrho, double* vgamma) { + + __shared__ double3 coords_tile[threads_per_block]; + __shared__ double rho_weight_tile[threads_per_block]; + __shared__ double omega_tile[threads_per_block]; + __shared__ double kappa_tile[threads_per_block]; + __shared__ float rho_weight_tile_f[threads_per_block]; + __shared__ float omega_tile_f[threads_per_block]; + __shared__ float kappa_tile_f[threads_per_block]; + + const auto i = blockIdx.x * blockDim.x + threadIdx.x; + const auto active_i = i < npts; + + const auto rho_i = active_i ? rho[i] : 0.0; + const auto active_rho_i = active_i && rho_i >= params.rho_threshold; + const auto r_i = active_rho_i ? load_point(coords, i) : double3{0.0, 0.0, 0.0}; + const auto omega_i = active_rho_i ? omega[i] : 0.0; + const auto kappa_i = active_rho_i ? kappa[i] : 0.0; + + double F_i = 0.0; + double U_i = 0.0; + double W_i = 0.0; + + const float omega_i_f = static_cast(omega_i); + const float kappa_i_f = static_cast(kappa_i); + + for(int tile = 0; tile < npts; tile += blockDim.x) { + const auto j_load = tile + threadIdx.x; + if(j_load < npts) { + coords_tile[threadIdx.x] = load_point(coords, j_load); + rho_weight_tile[threadIdx.x] = rho_weight[j_load]; + omega_tile[threadIdx.x] = omega[j_load]; + kappa_tile[threadIdx.x] = kappa[j_load]; + if constexpr(UseFloatPair) { + rho_weight_tile_f[threadIdx.x] = static_cast(rho_weight[j_load]); + omega_tile_f[threadIdx.x] = static_cast(omega[j_load]); + kappa_tile_f[threadIdx.x] = static_cast(kappa[j_load]); + } + } + __syncthreads(); + + if(active_rho_i) { + const auto tile_npts = min(blockDim.x, npts - tile); + if constexpr(UseFloatPair) { + for(int j = 0; j < tile_npts; ++j) { + const auto r_ij2 = distance2(coords_tile[j], r_i); + const auto r_ij2_f = static_cast(r_ij2); + const auto g_ij = fmaf(r_ij2_f, omega_i_f, kappa_i_f); + const auto g_ji = fmaf(r_ij2_f, omega_tile_f[j], kappa_tile_f[j]); + const auto g_sum = g_ij + g_ji; + const auto g_ij_inv = __fdividef(1.0f, g_ij); + const auto g_ji_inv = __fdividef(1.0f, g_ji); + const auto g_sum_inv = __fdividef(1.0f, g_sum); + auto pair = rho_weight_tile_f[j] * g_ij_inv * g_ji_inv * g_sum_inv; + F_i += static_cast(pair); + pair *= g_ij_inv + g_sum_inv; + U_i += static_cast(pair); + W_i += static_cast(pair * r_ij2_f); + } + } else { + for(int j = 0; j < tile_npts; ++j) { + const auto r_ij2 = distance2(coords_tile[j], r_i); + const auto g_ij = r_ij2 * omega_i + kappa_i; + const auto g_ji = r_ij2 * omega_tile[j] + kappa_tile[j]; + const auto g_sum = g_ij + g_ji; + const auto g_ij_inv = 1.0 / g_ij; + const auto g_ji_inv = 1.0 / g_ji; + const auto g_sum_inv = 1.0 / g_sum; + auto pair = rho_weight_tile[j] * g_ij_inv * g_ji_inv * g_sum_inv; + F_i += pair; + pair *= g_ij_inv + g_sum_inv; + U_i += pair; + W_i += pair * r_ij2; + } + } + } + __syncthreads(); + } + + if(active_i) { + if(!active_rho_i) { + eps[i] = 0.0; + vrho[i] = 0.0; + vgamma[i] = 0.0; + return; + } + + const auto gamma_i = gamma[i]; + const auto rho_1 = 1.0 / rho_i; + const auto rho_2 = rho_1 * rho_1; + const auto rho_4 = rho_2 * rho_2; + const auto rho_5 = rho_1 * rho_4; + const auto gamma2 = gamma_i * gamma_i; + const auto omega2 = params.c * gamma2 * rho_4 + four_pi_over_three * rho_i; + const auto omega_eval = sqrt(omega2); + const auto domega_drho = 0.5 * (four_pi_over_three - 4.0 * params.c * gamma2 * rho_5) / omega_eval; + const auto domega_dgamma = params.c * gamma_i * rho_4 / omega_eval; + const auto scaled_dkappa_drho = kappa_i / 6.0; + const auto F = -1.5 * F_i; + + eps[i] = cuda_beta(params) + 0.5 * F; + vrho[i] = cuda_beta(params) + F + 1.5 * (U_i * scaled_dkappa_drho + W_i * rho_i * domega_drho); + vgamma[i] = 1.5 * W_i * rho_i * domega_dgamma; + } +} + +__global__ void grid_gradient_kernel(int npts, const double* coords, + const double* omega, const double* kappa, int vv_npts, + const double* vv_coords, const double* vv_rho_weight, + const double* vv_omega, const double* vv_kappa, double* gradient) { + + const auto i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= npts) return; + + const auto r_i = load_point(coords, i); + double gx = 0.0; + double gy = 0.0; + double gz = 0.0; + + for(int j = 0; j < vv_npts; ++j) { + const auto r_j = load_point(vv_coords, j); + const auto dx = r_j.x - r_i.x; + const auto dy = r_j.y - r_i.y; + const auto dz = r_j.z - r_i.z; + const auto r_ij2 = dx*dx + dy*dy + dz*dz; + const auto g_ij = r_ij2 * omega[i] + kappa[i]; + const auto g_ji = r_ij2 * vv_omega[j] + vv_kappa[j]; + const auto g_sum = g_ij + g_ji; + const auto pair = vv_rho_weight[j] / (g_ij * g_ji * g_sum); + const auto prefactor = pair * (omega[i]/g_ij + vv_omega[j]/g_ji + (omega[i] + vv_omega[j])/g_sum); + gx += prefactor * dx; + gy += prefactor * dy; + gz += prefactor * dz; + } + + gradient[3*i + 0] = -3.0 * gx; + gradient[3*i + 1] = -3.0 * gy; + gradient[3*i + 2] = -3.0 * gz; +} + +__global__ void grid_gradient_excluding_same_parent_kernel(int npts, + const double* coords, const double* omega, const double* kappa, + const double* rho_weight, const std::int32_t* parent, double* gradient) { + + __shared__ double3 coords_tile[threads_per_block]; + __shared__ double rho_weight_tile[threads_per_block]; + __shared__ double omega_tile[threads_per_block]; + __shared__ double kappa_tile[threads_per_block]; + __shared__ std::int32_t parent_tile[threads_per_block]; + + const auto i = blockIdx.x * blockDim.x + threadIdx.x; + const auto active_i = i < npts; + const auto r_i = active_i ? load_point(coords, i) : double3{0.0, 0.0, 0.0}; + const auto omega_i = active_i ? omega[i] : 0.0; + const auto kappa_i = active_i ? kappa[i] : 0.0; + const auto parent_i = active_i ? parent[i] : std::int32_t{-1}; + + double gx = 0.0; + double gy = 0.0; + double gz = 0.0; + + for(int tile = 0; tile < npts; tile += blockDim.x) { + const auto j_load = tile + threadIdx.x; + if(j_load < npts) { + coords_tile[threadIdx.x] = load_point(coords, j_load); + rho_weight_tile[threadIdx.x] = rho_weight[j_load]; + omega_tile[threadIdx.x] = omega[j_load]; + kappa_tile[threadIdx.x] = kappa[j_load]; + parent_tile[threadIdx.x] = parent[j_load]; + } + __syncthreads(); + + if(active_i) { + const auto tile_npts = min(blockDim.x, npts - tile); + for(int j = 0; j < tile_npts; ++j) { + if(parent_i == parent_tile[j]) continue; + const auto r_j = coords_tile[j]; + const auto dx = r_j.x - r_i.x; + const auto dy = r_j.y - r_i.y; + const auto dz = r_j.z - r_i.z; + const auto r_ij2 = dx*dx + dy*dy + dz*dz; + const auto g_ij = r_ij2 * omega_i + kappa_i; + const auto g_ji = r_ij2 * omega_tile[j] + kappa_tile[j]; + const auto g_sum = g_ij + g_ji; + const auto g_ij_inv = 1.0 / g_ij; + const auto g_ji_inv = 1.0 / g_ji; + const auto g_sum_inv = 1.0 / g_sum; + const auto pair = rho_weight_tile[j] * g_ij_inv * g_ji_inv * g_sum_inv; + const auto prefactor = pair * (omega_i * g_ij_inv + omega_tile[j] * g_ji_inv + + (omega_i + omega_tile[j]) * g_sum_inv); + gx += prefactor * dx; + gy += prefactor * dy; + gz += prefactor * dz; + } + } + __syncthreads(); + } + + if(active_i) { + gradient[3*i + 0] = -3.0 * gx; + gradient[3*i + 1] = -3.0 * gy; + gradient[3*i + 2] = -3.0 * gz; + } +} + +__global__ void hessian_intermediates_kernel(int npts, const double* coords, + const double* weights, const double* rho, const double* omega, + const double* kappa, double* U, double* W, double* A, double* B, + double* C, double* E) { + + const auto i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= npts) return; + + const auto r_i = load_point(coords, i); + double U_i = 0.0; + double W_i = 0.0; + double A_i = 0.0; + double B_i = 0.0; + double C_i = 0.0; + double E_i = 0.0; + for(int j = 0; j < npts; ++j) { + const auto r_j = load_point(coords, j); + const auto r_ij2 = distance2(r_i, r_j); + const auto g_ij = omega[i] * r_ij2 + kappa[i]; + const auto g_ji = omega[j] * r_ij2 + kappa[j]; + const auto g_ij_1 = 1.0 / g_ij; + const auto g_ji_1 = 1.0 / g_ji; + const auto g_sum_1 = 1.0 / (g_ij + g_ji); + const auto phi = -1.5 * g_ij_1 * g_ji_1 * g_sum_1; + const auto E_ij = weights[j] * rho[j] * phi; + const auto U_ij = E_ij * (g_sum_1 + g_ij_1); + const auto W_ij = U_ij * r_ij2; + const auto A_ij = E_ij * (g_sum_1*g_sum_1 + g_sum_1*g_ij_1 + g_ij_1*g_ij_1); + const auto B_ij = A_ij * r_ij2; + const auto C_ij = B_ij * r_ij2; + U_i += U_ij; + W_i += W_ij; + A_i += A_ij; + B_i += B_ij; + C_i += C_ij; + E_i += E_ij; + } + U[i] = -U_i; + W[i] = -W_i; + A[i] = 2.0 * A_i; + B[i] = 2.0 * B_i; + C[i] = 2.0 * C_i; + E[i] = E_i; +} + +__global__ void hessian_contraction_kernel(int npts, int ntrial, + const double* coords, const double* weights, const double* rho, + const double* omega, const double* kappa, const double* U, + const double* W, const double* A, const double* B, const double* C, + const double* domega_drho, const double* domega_dgamma, + const double* dkappa_drho, const double* d2omega_drho2, + const double* d2omega_dgamma2, const double* d2omega_drho_dgamma, + const double* d2kappa_drho2, const double* rho_t, + const double* gamma_t, double* f_rho_t, double* f_gamma_t) { + + const auto i = blockIdx.x * blockDim.x + threadIdx.x; + const auto trial = blockIdx.y; + if(i >= npts || trial >= ntrial) return; + + const auto r_i = load_point(coords, i); + double f_rho_i = 0.0; + double f_gamma_i = 0.0; + for(int j = 0; j < npts; ++j) { + const auto r_j = load_point(coords, j); + const auto r_ij2 = distance2(r_i, r_j); + const auto g_ij = omega[i] * r_ij2 + kappa[i]; + const auto g_ji = omega[j] * r_ij2 + kappa[j]; + const auto g_ij_1 = 1.0 / g_ij; + const auto g_ji_1 = 1.0 / g_ji; + const auto g_sum_1 = 1.0 / (g_ij + g_ji); + const auto phi = -1.5 * g_ij_1 * g_ji_1 * g_sum_1; + const auto rho_dgdrho_i = rho[i] * (r_ij2 * domega_drho[i] + dkappa_drho[i]); + const auto rho_dgdrho_j = rho[j] * (r_ij2 * domega_drho[j] + dkappa_drho[j]); + const auto d2phi_dgij_dgji_over_phi = 2.0 * (g_sum_1*g_sum_1 + g_ij_1*g_ji_1); + const auto f_rho_rho_ij = phi * (rho_dgdrho_i * rho_dgdrho_j * d2phi_dgij_dgji_over_phi + - rho_dgdrho_i * (g_sum_1 + g_ij_1) - rho_dgdrho_j * (g_sum_1 + g_ji_1) + 1.0); + const auto f_gamma_rho_ij = rho[i] * domega_dgamma[i] * r_ij2 * phi + * (rho_dgdrho_j * d2phi_dgij_dgji_over_phi - (g_sum_1 + g_ij_1)); + const auto f_rho_gamma_ij = rho[j] * domega_dgamma[j] * r_ij2 * phi + * (rho_dgdrho_i * d2phi_dgij_dgji_over_phi - (g_sum_1 + g_ji_1)); + const auto f_gamma_gamma_ij = rho[i] * rho[j] * domega_dgamma[i] * domega_dgamma[j] + * r_ij2 * r_ij2 * phi * d2phi_dgij_dgji_over_phi; + const auto offset_j = trial * npts + j; + f_rho_i += weights[j] * (f_rho_rho_ij * rho_t[offset_j] + f_rho_gamma_ij * gamma_t[offset_j]); + f_gamma_i += weights[j] * (f_gamma_rho_ij * rho_t[offset_j] + f_gamma_gamma_ij * gamma_t[offset_j]); + } + + const auto f_rho_rho_ii = 2.0 * domega_drho[i] * W[i] + 2.0 * dkappa_drho[i] * U[i] + + rho[i] * (d2omega_drho2[i] * W[i] + d2kappa_drho2[i] * U[i] + + dkappa_drho[i] * dkappa_drho[i] * A[i] + domega_drho[i] * domega_drho[i] * C[i] + + 2.0 * domega_drho[i] * dkappa_drho[i] * B[i]); + const auto f_gamma_rho_ii = domega_dgamma[i] * W[i] + rho[i] * (d2omega_drho_dgamma[i] * W[i] + + domega_dgamma[i] * (dkappa_drho[i] * B[i] + domega_drho[i] * C[i])); + const auto f_gamma_gamma_ii = rho[i] * (d2omega_dgamma2[i] * W[i] + + domega_dgamma[i] * domega_dgamma[i] * C[i]); + const auto offset_i = trial * npts + i; + f_rho_i += f_rho_rho_ii * rho_t[offset_i] + f_gamma_rho_ii * gamma_t[offset_i]; + f_gamma_i += f_gamma_rho_ii * rho_t[offset_i] + f_gamma_gamma_ii * gamma_t[offset_i]; + f_rho_t[offset_i] = f_rho_i; + f_gamma_t[offset_i] = f_gamma_i; +} + +__global__ void grid_response_kernel(int npts, int natoms, + const double* coords, const double* weights, const double* rho, + const double* omega, const double* kappa, + const int* grid_associated_atom, double* Egr, double* Ugr, double* Wgr) { + + const auto i = blockIdx.x * blockDim.x + threadIdx.x; + const auto atom = blockIdx.y; + if(i >= npts || atom >= natoms) return; + const auto i_atom = grid_associated_atom[i]; + const auto base = atom * 3 * npts + i; + if(i_atom < 0) { + Egr[base + 0*npts] = 0.0; + Egr[base + 1*npts] = 0.0; + Egr[base + 2*npts] = 0.0; + Ugr[base + 0*npts] = 0.0; + Ugr[base + 1*npts] = 0.0; + Ugr[base + 2*npts] = 0.0; + Wgr[base + 0*npts] = 0.0; + Wgr[base + 1*npts] = 0.0; + Wgr[base + 2*npts] = 0.0; + return; + } + + const auto r_i = load_point(coords, i); + const auto i_in_atom = i_atom == atom; + double3 Eg{0.0, 0.0, 0.0}; + double3 Ug{0.0, 0.0, 0.0}; + double3 Wg{0.0, 0.0, 0.0}; + for(int j = 0; j < npts; ++j) { + const auto j_atom = grid_associated_atom[j]; + if(j_atom < 0) continue; + const auto j_in_atom = j_atom == atom; + if((!i_in_atom && !j_in_atom) || (i_in_atom && j_in_atom)) continue; + const auto r_j = load_point(coords, j); + const double3 r_ji{r_j.x - r_i.x, r_j.y - r_i.y, r_j.z - r_i.z}; + const auto r_ij2 = r_ji.x*r_ji.x + r_ji.y*r_ji.y + r_ji.z*r_ji.z; + const auto g_ij = omega[i] * r_ij2 + kappa[i]; + const auto g_ji = omega[j] * r_ij2 + kappa[j]; + const auto g_ij_1 = 1.0 / g_ij; + const auto g_ji_1 = 1.0 / g_ji; + const auto g_sum_1 = 1.0 / (g_ij + g_ji); + const auto phi = -1.5 * g_ij_1 * g_ji_1 * g_sum_1; + const auto E_ij = weights[j] * rho[j] * phi; + const auto dphi_drj_over_phi = omega[i]*g_ij_1 + omega[j]*g_ji_1 + (omega[i] + omega[j])*g_sum_1; + const auto d2phi_dgij_drj_over_phi = omega[i]*g_ij_1*g_ij_1 + (omega[i] + omega[j])*g_sum_1*g_sum_1; + const auto dphi_dgij_over_phi = g_sum_1 + g_ij_1; + const auto Egr_ij = E_ij * dphi_drj_over_phi; + const auto Ugr_ij = E_ij * (dphi_drj_over_phi*dphi_dgij_over_phi + d2phi_dgij_drj_over_phi); + const auto Wgr_ij = E_ij * (r_ij2 * (dphi_drj_over_phi*dphi_dgij_over_phi + d2phi_dgij_drj_over_phi) - dphi_dgij_over_phi); + Eg.x += Egr_ij * r_ji.x; + Eg.y += Egr_ij * r_ji.y; + Eg.z += Egr_ij * r_ji.z; + Ug.x += Ugr_ij * r_ji.x; + Ug.y += Ugr_ij * r_ji.y; + Ug.z += Ugr_ij * r_ji.z; + Wg.x += Wgr_ij * r_ji.x; + Wg.y += Wgr_ij * r_ji.y; + Wg.z += Wgr_ij * r_ji.z; + } + const auto atom_sign = i_in_atom ? -1.0 : 1.0; + Egr[base + 0*npts] = -2.0 * atom_sign * Eg.x; + Egr[base + 1*npts] = -2.0 * atom_sign * Eg.y; + Egr[base + 2*npts] = -2.0 * atom_sign * Eg.z; + Ugr[base + 0*npts] = 2.0 * atom_sign * Ug.x; + Ugr[base + 1*npts] = 2.0 * atom_sign * Ug.y; + Ugr[base + 2*npts] = 2.0 * atom_sign * Ug.z; + Wgr[base + 0*npts] = 2.0 * atom_sign * Wg.x; + Wgr[base + 1*npts] = 2.0 * atom_sign * Wg.y; + Wgr[base + 2*npts] = 2.0 * atom_sign * Wg.z; +} + +void check_last_error(const char* msg) { + const auto err = cudaGetLastError(); + GAUXC_CUDA_ERROR(msg, err); +} + +} + +void eval_omega_kappa_cuda(cudaStream_t stream, std::int32_t npts, + const double* rho, const double* gamma, Parameters params, double* omega, + double* kappa, double* domega_drho, double* domega_dgamma, + double* d2omega_drho2, double* d2omega_dgamma2, + double* d2omega_drho_dgamma, double* dkappa_drho, + double* d2kappa_drho2) { + const auto blocks = util::div_ceil(npts, threads_per_block); + omega_kappa_kernel<<>>(npts, rho, gamma, + params, omega, kappa, domega_drho, domega_dgamma, d2omega_drho2, + d2omega_dgamma2, d2omega_drho_dgamma, dkappa_drho, d2kappa_drho2); + check_last_error("VV10 omega/kappa kernel failed"); +} + +void eval_fock_cuda(cudaStream_t stream, std::int32_t npts, + const double* coords, const double* omega, const double* kappa, + std::int32_t vv_npts, const double* vv_coords, + const double* vv_rho_weight, const double* vv_omega, + const double* vv_kappa, double* F, double* U, double* W) { + const auto blocks = util::div_ceil(npts, threads_per_block); + fock_kernel<<>>(npts, coords, omega, + kappa, vv_npts, vv_coords, vv_rho_weight, vv_omega, vv_kappa, F, U, W); + check_last_error("VV10 fock kernel failed"); +} + +void eval_exc_vxc_from_fock_cuda(cudaStream_t stream, std::int32_t npts, + const double* rho, const double* gamma, const double* F, const double* U, + const double* W, Parameters params, double* eps, double* vrho, + double* vgamma) { + const auto blocks = util::div_ceil(npts, threads_per_block); + exc_vxc_from_fock_kernel<<>>(npts, rho, + gamma, F, U, W, params, eps, vrho, vgamma); + check_last_error("VV10 EXC/VXC finalize kernel failed"); +} + +void eval_exc_vxc_fock_cuda(cudaStream_t stream, std::int32_t npts, + const double* coords, const double* rho, const double* gamma, + const double* omega, const double* kappa, const double* rho_weight, + Parameters params, double* eps, double* vrho, double* vgamma) { + const auto blocks = util::div_ceil(npts, threads_per_block); + if(params.pair_math_mode == 1) { + exc_vxc_fock_kernel<<>>(npts, coords, + rho, gamma, omega, kappa, rho_weight, params, eps, vrho, vgamma); + } else { + exc_vxc_fock_kernel<<>>(npts, coords, + rho, gamma, omega, kappa, rho_weight, params, eps, vrho, vgamma); + } + check_last_error("VV10 fused EXC/VXC fock kernel failed"); +} + +void eval_grid_gradient_cuda(cudaStream_t stream, std::int32_t npts, + const double* coords, const double* omega, const double* kappa, + std::int32_t vv_npts, const double* vv_coords, + const double* vv_rho_weight, const double* vv_omega, + const double* vv_kappa, double* gradient) { + const auto blocks = util::div_ceil(npts, threads_per_block); + grid_gradient_kernel<<>>(npts, coords, + omega, kappa, vv_npts, vv_coords, vv_rho_weight, vv_omega, vv_kappa, + gradient); + check_last_error("VV10 grid gradient kernel failed"); +} + +void eval_grid_gradient_excluding_same_parent_cuda(cudaStream_t stream, + std::int32_t npts, const double* coords, const double* omega, + const double* kappa, const double* rho_weight, + const std::int32_t* parent, double* gradient) { + const auto blocks = util::div_ceil(npts, threads_per_block); + grid_gradient_excluding_same_parent_kernel<<>>( + npts, coords, omega, kappa, rho_weight, parent, gradient); + check_last_error("VV10 parent-excluding grid gradient kernel failed"); +} + +void eval_hessian_intermediates_cuda(cudaStream_t stream, std::int32_t npts, + const double* coords, const double* weights, const double* rho, + const double* omega, const double* kappa, double* U, double* W, + double* A, double* B, double* C, double* E) { + const auto blocks = util::div_ceil(npts, threads_per_block); + hessian_intermediates_kernel<<>>(npts, + coords, weights, rho, omega, kappa, U, W, A, B, C, E); + check_last_error("VV10 hessian intermediates kernel failed"); +} + +void eval_hessian_contraction_cuda(cudaStream_t stream, std::int32_t npts, + std::int32_t ntrial, const double* coords, const double* weights, + const double* rho, const double* omega, const double* kappa, + const double* U, const double* W, const double* A, const double* B, + const double* C, const double* domega_drho, + const double* domega_dgamma, const double* dkappa_drho, + const double* d2omega_drho2, const double* d2omega_dgamma2, + const double* d2omega_drho_dgamma, const double* d2kappa_drho2, + const double* rho_t, const double* gamma_t, double* f_rho_t, + double* f_gamma_t) { + const dim3 blocks(util::div_ceil(npts, threads_per_block), ntrial); + hessian_contraction_kernel<<>>(npts, + ntrial, coords, weights, rho, omega, kappa, U, W, A, B, C, + domega_drho, domega_dgamma, dkappa_drho, d2omega_drho2, + d2omega_dgamma2, d2omega_drho_dgamma, d2kappa_drho2, rho_t, + gamma_t, f_rho_t, f_gamma_t); + check_last_error("VV10 hessian contraction kernel failed"); +} + +void eval_grid_response_cuda(cudaStream_t stream, std::int32_t npts, + std::int32_t natoms, const double* coords, const double* weights, + const double* rho, const double* omega, const double* kappa, + const std::int32_t* grid_associated_atom, double* Egr, double* Ugr, + double* Wgr) { + const dim3 blocks(util::div_ceil(npts, threads_per_block), natoms); + grid_response_kernel<<>>(npts, natoms, + coords, weights, rho, omega, kappa, grid_associated_atom, Egr, Ugr, Wgr); + check_last_error("VV10 grid response kernel failed"); +} + +} + +#endif \ No newline at end of file diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/vv10.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/vv10.hpp new file mode 100644 index 000000000..3e60ef551 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/vv10.hpp @@ -0,0 +1,71 @@ +#pragma once + +#include + +#ifdef GAUXC_HAS_CUDA + +#include "integrator_util/vv10.hpp" +#include +#include + +namespace GauXC::detail::vv10 { + +void eval_omega_kappa_cuda(cudaStream_t stream, std::int32_t npts, + const double* rho, const double* gamma, Parameters params, double* omega, + double* kappa, double* domega_drho, double* domega_dgamma, + double* d2omega_drho2, double* d2omega_dgamma2, + double* d2omega_drho_dgamma, double* dkappa_drho, + double* d2kappa_drho2); + +void eval_fock_cuda(cudaStream_t stream, std::int32_t npts, + const double* coords, const double* omega, const double* kappa, + std::int32_t vv_npts, const double* vv_coords, + const double* vv_rho_weight, const double* vv_omega, + const double* vv_kappa, double* F, double* U, double* W); + +void eval_exc_vxc_from_fock_cuda(cudaStream_t stream, std::int32_t npts, + const double* rho, const double* gamma, const double* F, const double* U, + const double* W, Parameters params, double* eps, double* vrho, + double* vgamma); + +void eval_exc_vxc_fock_cuda(cudaStream_t stream, std::int32_t npts, + const double* coords, const double* rho, const double* gamma, + const double* omega, const double* kappa, const double* rho_weight, + Parameters params, double* eps, double* vrho, double* vgamma); + +void eval_grid_gradient_cuda(cudaStream_t stream, std::int32_t npts, + const double* coords, const double* omega, const double* kappa, + std::int32_t vv_npts, const double* vv_coords, + const double* vv_rho_weight, const double* vv_omega, + const double* vv_kappa, double* gradient); + +void eval_grid_gradient_excluding_same_parent_cuda(cudaStream_t stream, + std::int32_t npts, const double* coords, const double* omega, + const double* kappa, const double* rho_weight, + const std::int32_t* parent, double* gradient); + +void eval_hessian_intermediates_cuda(cudaStream_t stream, std::int32_t npts, + const double* coords, const double* weights, const double* rho, + const double* omega, const double* kappa, double* U, double* W, + double* A, double* B, double* C, double* E); + +void eval_hessian_contraction_cuda(cudaStream_t stream, std::int32_t npts, + std::int32_t ntrial, const double* coords, const double* weights, + const double* rho, const double* omega, const double* kappa, + const double* U, const double* W, const double* A, const double* B, + const double* C, const double* domega_drho, + const double* domega_dgamma, const double* dkappa_drho, + const double* d2omega_drho2, const double* d2omega_dgamma2, + const double* d2omega_drho_dgamma, const double* d2kappa_drho2, + const double* rho_t, const double* gamma_t, double* f_rho_t, + double* f_gamma_t); + +void eval_grid_response_cuda(cudaStream_t stream, std::int32_t npts, + std::int32_t natoms, const double* coords, const double* weights, + const double* rho, const double* omega, const double* kappa, + const std::int32_t* grid_associated_atom, double* Egr, double* Ugr, + double* Wgr); + +} + +#endif \ No newline at end of file diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp index 30ff47ce1..346aa9abd 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp @@ -129,6 +129,21 @@ class IncoreReplicatedXCDeviceIntegrator : host_task_iterator task_begin, host_task_iterator task_end, XCDeviceData& device_data ); + void nlc_exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + double xmat_fac, + value_type* VXC, int64_t ldvxc, value_type* EXC, value_type *N_EL, + const IntegratorSettingsNLC& settings, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data ); + + void nlc_fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* tPs, int64_t ldtps, + value_type *N_EL, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsNLC& settings, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data ); + void fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, const value_type* Pz, int64_t ldpz, const value_type* tPs, int64_t ldtps, @@ -151,6 +166,12 @@ class IncoreReplicatedXCDeviceIntegrator : host_task_iterator task_begin, host_task_iterator task_end, XCDeviceData& device_data, const IntegratorSettingsXC& settings ); + void nlc_exc_grad_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + value_type* EXC_GRAD, + const IntegratorSettingsNLC& settings, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data ); + void eval_exc_grad_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, host_task_iterator task_begin, host_task_iterator task_end, diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp index 9a2a7cf4f..8320465e6 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp @@ -12,6 +12,8 @@ #include "incore_replicated_xc_device_integrator.hpp" #include "device/local_device_work_driver.hpp" #include "device/xc_device_aos_data.hpp" +#include "device/xc_device_stack_data.hpp" +#include "vv10_nlc_device.hpp" #include #include #include @@ -39,6 +41,42 @@ void IncoreReplicatedXCDeviceIntegrator:: if( ldps < nbf ) GAUXC_GENERIC_EXCEPTION("Invalid LDP"); + if( auto* nlc_settings = dynamic_cast(&settings) ) { + const bool is_gks = (Pz != nullptr) and (Py != nullptr) and (Px != nullptr); + const bool is_uks = (Pz != nullptr) and (Py == nullptr) and (Px == nullptr); + const bool is_rks = (Ps != nullptr) and (not is_uks and not is_gks); + if( is_gks ) { + GAUXC_GENERIC_EXCEPTION("NLC device EXC is not implemented for GKS"); + } + if( this->reduction_driver_->takes_device_memory() ) { + GAUXC_GENERIC_EXCEPTION("NLC device EXC currently requires host reductions"); + } + + auto& tasks = this->load_balancer_->get_tasks(); + auto* lwd = dynamic_cast(this->local_work_driver_.get() ); + auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); + auto device_data_ptr = lwd->create_device_data(rt); + value_type N_EL; + + this->timer_.time_op("XCIntegrator.LocalWork_NLC_EXC", [&](){ + nlc_exc_vxc_local_work_( basis, Ps, ldps, is_rks ? 2.0 : 1.0, + nullptr, 0, EXC, &N_EL, + *nlc_settings, tasks.begin(), tasks.end(), *device_data_ptr ); + }); + + GAUXC_MPI_CODE( + this->timer_.time_op("XCIntegrator.ImbalanceWait_NLC_EXC",[&](){ + MPI_Barrier(this->load_balancer_->runtime().comm()); + }); + ) + + this->timer_.time_op("XCIntegrator.Allreduce_NLC_EXC", [&](){ + this->reduction_driver_->allreduce_inplace( EXC, 1, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( &N_EL, 1, ReductionOp::Sum ); + }); + return; + } + // Get Tasks auto& tasks = this->load_balancer_->get_tasks(); diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp index 6c030bc29..50fe4c0c8 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp @@ -10,6 +10,7 @@ * See LICENSE.txt for details */ #include "incore_replicated_xc_device_integrator.hpp" +#include "vv10_nlc_device.hpp" #include "device/local_device_work_driver.hpp" #include #include "device/xc_device_aos_data.hpp" @@ -33,6 +34,7 @@ void IncoreReplicatedXCDeviceIntegrator:: GAUXC_GENERIC_EXCEPTION("P Must Have Same Dimension as Basis"); if( ldp < nbf ) GAUXC_GENERIC_EXCEPTION("Invalid LDP"); + auto* nlc_settings = dynamic_cast(&settings); // Get Tasks auto& tasks = this->load_balancer_->get_tasks(); @@ -46,6 +48,28 @@ void IncoreReplicatedXCDeviceIntegrator:: const auto& mol = this->load_balancer_->molecule(); const auto natoms = mol.size(); + if( nlc_settings ) { + if( this->reduction_driver_->takes_device_memory() ) { + GAUXC_GENERIC_EXCEPTION("NLC device EXC_GRAD requires host reductions"); + } + + this->timer_.time_op("XCIntegrator.LocalWork", [&](){ + nlc_exc_grad_local_work_( basis, P, ldp, EXC_GRAD, *nlc_settings, + tasks.begin(), tasks.end(), *device_data_ptr ); + }); + + GAUXC_MPI_CODE( + this->timer_.time_op("XCIntegrator.ImbalanceWait",[&](){ + MPI_Barrier(this->load_balancer_->runtime().comm()); + }); + ) + + this->timer_.time_op("XCIntegrator.Allreduce", [&](){ + this->reduction_driver_->allreduce_inplace( EXC_GRAD, 3*natoms, ReductionOp::Sum ); + }); + return; + } + if( this->reduction_driver_->takes_device_memory() ) { GAUXC_GENERIC_EXCEPTION("Device Reduction + EXC Grad NYI"); } else { @@ -90,6 +114,9 @@ void IncoreReplicatedXCDeviceIntegrator:: GAUXC_GENERIC_EXCEPTION("Invalid LDPS"); if( ldpz < nbf ) GAUXC_GENERIC_EXCEPTION("Invalid LDPZ"); + if( dynamic_cast(&settings) ) { + GAUXC_GENERIC_EXCEPTION("NLC device EXC_GRAD currently supports only RKS"); + } // Get Tasks auto& tasks = this->load_balancer_->get_tasks(); @@ -129,6 +156,192 @@ void IncoreReplicatedXCDeviceIntegrator:: } +template +void IncoreReplicatedXCDeviceIntegrator:: + nlc_exc_grad_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + value_type* EXC_GRAD, + const IntegratorSettingsNLC& settings, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data ) { + +#ifdef GAUXC_HAS_CUDA + auto* lwd = dynamic_cast(this->local_work_driver_.get() ); + auto* stack_data = dynamic_cast(&device_data); + auto* aos_data = dynamic_cast(&device_data); + if( not stack_data or not aos_data ) GAUXC_BAD_LWD_DATA_CAST(); + + const auto& func = *this->func_; + if( not func.is_gga() ) { + GAUXC_GENERIC_EXCEPTION("NLC device EXC_GRAD currently requires a GGA functional"); + } + + IntegratorSettingsEXC_GRAD exc_grad_settings; + exc_grad_settings.include_weight_derivatives = settings.include_weight_derivatives; + + const auto& mol = this->load_balancer_->molecule(); + const auto& meta = this->load_balancer_->molmeta(); + const auto natoms = mol.size(); + BasisSetMap basis_map(basis,mol); + device_data.populate_submat_maps( basis.nbf(), task_begin, task_end, basis_map ); + + auto task_comparator = []( const XCTask& a, const XCTask& b ) { + return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); + }; + std::sort( task_begin, task_end, task_comparator ); + + auto& lb_state = this->load_balancer_->state(); + if( not lb_state.modified_weights_are_stored ) { + GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); + } + XCWeightAlg& weight_alg = lb_state.weight_alg; + + integrator_term_tracker enabled_terms; + enabled_terms.exc_grad = true; + enabled_terms.weights = true; + enabled_terms.ks_scheme = RKS; + enabled_terms.xc_approx = integrator_xc_approx::GGA; + + const auto nbf = basis.nbf(); + const auto nshells = basis.nshells(); + device_data.reset_allocations(); + device_data.allocate_static_data_exc_grad( nbf, nshells, natoms, enabled_terms ); + device_data.send_static_data_density_basis( Ps, ldps, nullptr, 0, nullptr, 0, nullptr, 0, basis ); + device_data.allocate_static_data_weights( natoms ); + device_data.send_static_data_weights( mol, meta ); + + auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); + auto* backend = rt.device_backend(); + + std::vector local_packed; + std::vector local_parent; + auto task_it = task_begin; + while( task_it != task_end ) { + task_it = device_data.generate_buffers( enabled_terms, basis_map, task_it, task_end ); + + lwd->eval_collocation_gradient( &device_data ); + lwd->eval_xmat( 2.0, &device_data, false, DEN_S ); + lwd->eval_vvars_gga( &device_data, DEN_S ); + lwd->eval_uvars_gga( &device_data, RKS ); + + const size_t batch_npts = stack_data->total_npts_task_batch; + std::vector points_x(batch_npts), points_y(batch_npts), points_z(batch_npts); + std::vector weights(batch_npts), rho(batch_npts), gamma(batch_npts); + auto base_stack = stack_data->base_stack; + backend->copy_async( batch_npts, base_stack.points_x_device, points_x.data(), "NLC grad points_x D2H" ); + backend->copy_async( batch_npts, base_stack.points_y_device, points_y.data(), "NLC grad points_y D2H" ); + backend->copy_async( batch_npts, base_stack.points_z_device, points_z.data(), "NLC grad points_z D2H" ); + backend->copy_async( batch_npts, base_stack.weights_device, weights.data(), "NLC grad weights D2H" ); + backend->copy_async( batch_npts, base_stack.den_s_eval_device, rho.data(), "NLC grad rho D2H" ); + backend->copy_async( batch_npts, base_stack.gamma_eval_device, gamma.data(), "NLC grad gamma D2H" ); + backend->master_queue_synchronize(); + + local_packed.reserve(local_packed.size() + 6 * batch_npts); + for( size_t i = 0; i < batch_npts; ++i ) { + local_packed.push_back(points_x[i]); + local_packed.push_back(points_y[i]); + local_packed.push_back(points_z[i]); + local_packed.push_back(weights[i]); + local_packed.push_back(rho[i]); + local_packed.push_back(gamma[i]); + } + for( const auto& task : aos_data->host_device_tasks ) { + for( size_t i = 0; i < task.npts; ++i ) { + local_parent.push_back( static_cast( task.iParent ) ); + } + } + } + + size_t local_point_offset = 0; + const auto global_packed = vv10::allgather_packed_grid( + *this->reduction_driver_, local_packed, 6, local_point_offset ); + if( global_packed.size() % 6 != 0 ) { + GAUXC_GENERIC_EXCEPTION("Invalid VV10 device EXC_GRAD packed grid size after allgather"); + } + auto parent = this->reduction_driver_->allgather_v( local_parent.data(), local_parent.size() ); + + const size_t global_npts = global_packed.size() / 6; + if( parent.size() != global_npts ) { + GAUXC_GENERIC_EXCEPTION("Invalid VV10 device EXC_GRAD parent grid size after allgather"); + } + std::vector coords(3 * global_npts), weights(global_npts), rho(global_npts), gamma(global_npts); + std::vector eps(global_npts, 0.0), vrho(global_npts, 0.0), vgamma(global_npts, 0.0); + std::vector grid_grad_x(global_npts, 0.0), grid_grad_y(global_npts, 0.0), grid_grad_z(global_npts, 0.0); + for( size_t i = 0; i < global_npts; ++i ) { + coords[3*i+0] = global_packed[6*i+0]; + coords[3*i+1] = global_packed[6*i+1]; + coords[3*i+2] = global_packed[6*i+2]; + weights[i] = global_packed[6*i+3]; + rho[i] = global_packed[6*i+4]; + gamma[i] = global_packed[6*i+5]; + } + + vv10::eval_exc_vxc_cuda( 0, settings, + vv10::GridView{ global_npts, coords.data(), weights.data(), rho.data(), gamma.data() }, + vv10::CorrectionsView{ eps.data(), vrho.data(), vgamma.data() } ); + if( exc_grad_settings.include_weight_derivatives ) { + vv10::eval_grid_gradient_excluding_same_parent_cuda( 0, settings, + vv10::GridView{ global_npts, coords.data(), weights.data(), rho.data(), gamma.data() }, + parent, + vv10::GridGradientView{ grid_grad_x.data(), grid_grad_y.data(), grid_grad_z.data() } ); + } + + device_data.zero_exc_grad_integrands(); + std::vector grid_gradient_contribution(3 * natoms, 0.0); + task_it = task_begin; + size_t local_batch_offset = 0; + while( task_it != task_end ) { + task_it = device_data.generate_buffers( enabled_terms, basis_map, task_it, task_end ); + + lwd->eval_collocation_hessian( &device_data ); + lwd->eval_xmat( 2.0, &device_data, true, DEN_S ); + lwd->eval_vvars_gga( &device_data, DEN_S ); + lwd->eval_uvars_gga( &device_data, RKS ); + + const size_t batch_npts = stack_data->total_npts_task_batch; + const size_t global_offset = local_point_offset + local_batch_offset; + std::vector batch_eps(batch_npts), batch_vrho(batch_npts), batch_vgamma(batch_npts); + for( size_t i = 0; i < batch_npts; ++i ) { + const auto global_i = global_offset + i; + batch_eps[i] = (2.0 * eps[global_i] - vv10::beta(settings)) * weights[global_i]; + batch_vrho[i] = vrho[global_i] * weights[global_i]; + batch_vgamma[i] = vgamma[global_i] * weights[global_i]; + } + + auto base_stack = stack_data->base_stack; + backend->copy_async( batch_npts, batch_eps.data(), base_stack.eps_eval_device, "NLC grad eps H2D" ); + backend->copy_async( batch_npts, batch_vrho.data(), base_stack.vrho_eval_device, "NLC grad vrho H2D" ); + backend->copy_async( batch_npts, batch_vgamma.data(), base_stack.vgamma_eval_device, "NLC grad vgamma H2D" ); + backend->master_queue_synchronize(); + + lwd->inc_nel( &device_data ); + lwd->inc_exc_grad_gga( &device_data, RKS, exc_grad_settings.include_weight_derivatives ); + if( exc_grad_settings.include_weight_derivatives ) { + lwd->eval_weight_1st_deriv_contracted( &device_data, weight_alg ); + size_t batch_point_offset = 0; + for( const auto& task : aos_data->host_device_tasks ) { + for( size_t i = 0; i < task.npts; ++i ) { + const auto global_i = global_offset + batch_point_offset + i; + const auto pref = rho[global_i] * weights[global_i]; + grid_gradient_contribution[3*task.iParent + 0] += pref * grid_grad_x[global_i]; + grid_gradient_contribution[3*task.iParent + 1] += pref * grid_grad_y[global_i]; + grid_gradient_contribution[3*task.iParent + 2] += pref * grid_grad_z[global_i]; + } + batch_point_offset += task.npts; + } + } + local_batch_offset += batch_npts; + } + + double N_EL; + device_data.retrieve_exc_grad_integrands( EXC_GRAD, &N_EL ); + for( size_t i = 0; i < 3 * natoms; ++i ) { + EXC_GRAD[i] += grid_gradient_contribution[i]; + } +#else + GAUXC_GENERIC_EXCEPTION("NLC device EXC_GRAD requires CUDA"); +#endif +} + template void IncoreReplicatedXCDeviceIntegrator:: eval_exc_grad_local_work_( const basis_type& basis, diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp index 416a49c5a..9dbca97bb 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp @@ -12,9 +12,12 @@ #include "incore_replicated_xc_device_integrator.hpp" #include "device/local_device_work_driver.hpp" #include "device/xc_device_aos_data.hpp" +#include "device/xc_device_stack_data.hpp" +#include "vv10_nlc_device.hpp" #include #include #include +#include namespace GauXC { namespace detail { @@ -60,6 +63,7 @@ void IncoreReplicatedXCDeviceIntegrator:: const bool is_gks = (Pz != nullptr) and (Py != nullptr) and (Px != nullptr); const bool is_uks = (Pz != nullptr) and (Py == nullptr) and (Px == nullptr); const bool is_rks = (Ps != nullptr) and (not is_uks and not is_gks); + const auto* nlc_settings = dynamic_cast(&settings); const auto& basis = this->load_balancer_->basis(); @@ -91,6 +95,42 @@ void IncoreReplicatedXCDeviceIntegrator:: } } + if( nlc_settings ) { + if( is_gks ) { + GAUXC_GENERIC_EXCEPTION("NLC device EXC/VXC is not implemented for GKS"); + } + if( this->reduction_driver_->takes_device_memory() ) { + GAUXC_GENERIC_EXCEPTION("NLC device EXC/VXC currently requires host reductions"); + } + + auto& tasks = this->load_balancer_->get_tasks(); + auto* lwd = dynamic_cast(this->local_work_driver_.get() ); + auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); + auto device_data_ptr = lwd->create_device_data(rt); + value_type N_EL; + if( VXCz ) std::fill_n( VXCz, nbf*nbf, 0.0 ); + + this->timer_.time_op("XCIntegrator.LocalWork_NLC_EXC_VXC", [&](){ + nlc_exc_vxc_local_work_( basis, Ps, ldps, is_rks ? 2.0 : 1.0, + VXCs, ldvxcs, EXC, &N_EL, + *nlc_settings, tasks.begin(), tasks.end(), *device_data_ptr ); + }); + + GAUXC_MPI_CODE( + this->timer_.time_op("XCIntegrator.ImbalanceWait_NLC_EXC_VXC",[&](){ + MPI_Barrier(this->load_balancer_->runtime().comm()); + }); + ) + + this->timer_.time_op("XCIntegrator.Allreduce_NLC_EXC_VXC", [&](){ + if( VXCs ) this->reduction_driver_->allreduce_inplace( VXCs, nbf*nbf, ReductionOp::Sum ); + if( VXCz ) this->reduction_driver_->allreduce_inplace( VXCz, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( EXC, 1, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( &N_EL, 1, ReductionOp::Sum ); + }); + return; + } + // Get Tasks auto& tasks = this->load_balancer_->get_tasks(); @@ -217,6 +257,161 @@ void IncoreReplicatedXCDeviceIntegrator:: } } +template +void IncoreReplicatedXCDeviceIntegrator:: + nlc_exc_vxc_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + double xmat_fac, + value_type* VXC, int64_t ldvxc, value_type* EXC, value_type *N_EL, + const IntegratorSettingsNLC& settings, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data ) { + + auto* lwd = dynamic_cast(this->local_work_driver_.get() ); + auto* stack_data = dynamic_cast(&device_data); + if( not stack_data ) GAUXC_BAD_LWD_DATA_CAST(); + + const auto& func = *this->func_; + const auto& mol = this->load_balancer_->molecule(); + if( not func.is_gga() ) { + GAUXC_GENERIC_EXCEPTION("NLC device EXC/VXC currently requires a GGA functional"); + } + + BasisSetMap basis_map(basis,mol); + device_data.populate_submat_maps( basis.nbf(), task_begin, task_end, basis_map ); + auto task_comparator = []( const XCTask& a, const XCTask& b ) { + return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); + }; + std::sort( task_begin, task_end, task_comparator ); + + auto& lb_state = this->load_balancer_->state(); + if( not lb_state.modified_weights_are_stored ) { + GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); + } + + integrator_term_tracker enabled_terms; + enabled_terms.exc_vxc = true; + enabled_terms.ks_scheme = RKS; + enabled_terms.xc_approx = integrator_xc_approx::GGA; + + const auto nbf = basis.nbf(); + const auto nshells = basis.nshells(); + device_data.reset_allocations(); + device_data.allocate_static_data_exc_vxc( nbf, nshells, enabled_terms, true ); + device_data.send_static_data_density_basis( Ps, ldps, nullptr, 0, nullptr, 0, nullptr, 0, basis ); + + size_t local_npts = 0; + const size_t ntasks = std::distance(task_begin, task_end); + for( size_t iT = 0; iT < ntasks; ++iT ) { + local_npts += (task_begin + iT)->points.size(); + } + + std::vector local_packed; + local_packed.reserve(6 * local_npts); + + auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); + auto* backend = rt.device_backend(); + + auto task_it = task_begin; + while( task_it != task_end ) { + const auto batch_begin = task_it; + task_it = device_data.generate_buffers( enabled_terms, basis_map, task_it, task_end ); + + lwd->eval_collocation_gradient( &device_data ); + lwd->eval_xmat( xmat_fac, &device_data, false, DEN_S ); + lwd->eval_vvars_gga( &device_data, DEN_S ); + lwd->eval_uvars_gga( &device_data, RKS ); + + const size_t batch_npts = stack_data->total_npts_task_batch; + std::vector rho(batch_npts), gamma(batch_npts); + auto base_stack = stack_data->base_stack; + backend->copy_async( batch_npts, base_stack.den_s_eval_device, rho.data(), "NLC rho D2H" ); + backend->copy_async( batch_npts, base_stack.gamma_eval_device, gamma.data(), "NLC gamma D2H" ); + backend->master_queue_synchronize(); + + size_t batch_offset = 0; + for( auto batch_task = batch_begin; batch_task != task_it; ++batch_task ) { + const auto npts = batch_task->points.size(); + const auto* points = batch_task->points.data()->data(); + const auto* weights = batch_task->weights.data(); + for( size_t i = 0; i < npts; ++i, ++batch_offset ) { + local_packed.push_back(points[3*i + 0]); + local_packed.push_back(points[3*i + 1]); + local_packed.push_back(points[3*i + 2]); + local_packed.push_back(weights[i]); + local_packed.push_back(rho[batch_offset]); + local_packed.push_back(gamma[batch_offset]); + } + } + if( batch_offset != batch_npts ) { + GAUXC_GENERIC_EXCEPTION("Invalid NLC device EXC/VXC host/device batch point count"); + } + } + + size_t local_point_offset = 0; + const auto global_packed = vv10::allgather_packed_grid( + *this->reduction_driver_, local_packed, 6, local_point_offset ); + if( global_packed.size() % 6 != 0 ) { + GAUXC_GENERIC_EXCEPTION("Invalid VV10 device EXC/VXC packed grid size after allgather"); + } + + const size_t global_npts = global_packed.size() / 6; + std::vector coords(3 * global_npts), weights(global_npts), rho(global_npts), gamma(global_npts); + std::vector eps(global_npts, 0.0), vrho(global_npts, 0.0), vgamma(global_npts, 0.0); + for( size_t i = 0; i < global_npts; ++i ) { + coords[3*i+0] = global_packed[6*i+0]; + coords[3*i+1] = global_packed[6*i+1]; + coords[3*i+2] = global_packed[6*i+2]; + weights[i] = global_packed[6*i+3]; + rho[i] = global_packed[6*i+4]; + gamma[i] = global_packed[6*i+5]; + } + + vv10::eval_exc_vxc_cuda( 0, settings, + vv10::GridView{ global_npts, coords.data(), weights.data(), rho.data(), gamma.data() }, + vv10::CorrectionsView{ eps.data(), vrho.data(), vgamma.data() } ); + + device_data.zero_exc_vxc_integrands(enabled_terms); + + task_it = task_begin; + size_t local_batch_offset = 0; + while( task_it != task_end ) { + task_it = device_data.generate_buffers( enabled_terms, basis_map, task_it, task_end ); + + lwd->eval_collocation_gradient( &device_data ); + lwd->eval_xmat( xmat_fac, &device_data, false, DEN_S ); + lwd->eval_vvars_gga( &device_data, DEN_S ); + lwd->eval_uvars_gga( &device_data, RKS ); + + const size_t batch_npts = stack_data->total_npts_task_batch; + const size_t global_offset = local_point_offset + local_batch_offset; + + auto base_stack = stack_data->base_stack; + std::vector batch_eps(batch_npts), batch_vrho(batch_npts), batch_vgamma(batch_npts); + for( size_t i = 0; i < batch_npts; ++i ) { + const auto global_i = global_offset + i; + batch_eps[i] = eps[global_i] * weights[global_i]; + batch_vrho[i] = vrho[global_i] * weights[global_i]; + batch_vgamma[i] = vgamma[global_i] * weights[global_i]; + } + backend->copy_async( batch_npts, batch_eps.data(), base_stack.eps_eval_device, "NLC eps H2D" ); + backend->copy_async( batch_npts, batch_vrho.data(), base_stack.vrho_eval_device, "NLC vrho H2D" ); + backend->copy_async( batch_npts, batch_vgamma.data(), base_stack.vgamma_eval_device, "NLC vgamma H2D" ); + backend->master_queue_synchronize(); + + lwd->inc_exc( &device_data ); + lwd->inc_nel( &device_data ); + if( VXC ) { + lwd->eval_zmat_gga_vxc( &device_data, RKS, DEN_S ); + lwd->inc_vxc( &device_data, DEN_S, false ); + } + local_batch_offset += batch_npts; + } + + if( VXC ) lwd->symmetrize_vxc( &device_data, DEN_S ); + backend->master_queue_synchronize(); + device_data.retrieve_exc_vxc_integrands( EXC, N_EL, VXC, ldvxc, nullptr, 0, nullptr, 0, nullptr, 0 ); +} + template void IncoreReplicatedXCDeviceIntegrator:: diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_fxc_contraction.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_fxc_contraction.hpp index ffc0ca417..4194e92c1 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_fxc_contraction.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_fxc_contraction.hpp @@ -11,6 +11,8 @@ */ #pragma once #include "incore_replicated_xc_device_integrator.hpp" +#include "vv10_nlc_device.hpp" +#include "device/xc_device_aos_data.hpp" #include #include @@ -69,6 +71,39 @@ namespace GauXC::detail { // Get Tasks auto& tasks = this->load_balancer_->get_tasks(); + if( auto* nlc_settings = dynamic_cast(&ks_settings) ) { + if( not is_rks ) { + GAUXC_GENERIC_EXCEPTION("NLC device FXC contraction currently supports only RKS"); + } + if( this->reduction_driver_->takes_device_memory() ) { + GAUXC_GENERIC_EXCEPTION("NLC device FXC contraction requires host reductions"); + } + + auto* lwd = dynamic_cast(this->local_work_driver_.get() ); + auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); + auto device_data_ptr = lwd->create_device_data(rt); + + GAUXC_MPI_CODE( MPI_Barrier(rt.comm());) + + value_type N_EL; + this->timer_.time_op("XCIntegrator.LocalWork_FXC", [&](){ + nlc_fxc_contraction_local_work_( basis, Ps, ldps, tPs, ldtps, &N_EL, + FXCs, ldfxcs, *nlc_settings, tasks.begin(), tasks.end(), *device_data_ptr ); + }); + + GAUXC_MPI_CODE( + this->timer_.time_op("XCIntegrator.ImbalanceWait_FXC",[&](){ + MPI_Barrier(this->load_balancer_->runtime().comm()); + }); + ) + + this->timer_.time_op("XCIntegrator.Allreduce_FXC", [&](){ + this->reduction_driver_->allreduce_inplace( FXCs, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( &N_EL, 1, ReductionOp::Sum ); + }); + return; + } + // Allocate Device memory auto* lwd = dynamic_cast(this->local_work_driver_.get() ); auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); @@ -153,6 +188,186 @@ namespace GauXC::detail { } } + template + void IncoreReplicatedXCDeviceIntegrator:: + nlc_fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* tPs, int64_t ldtps, + value_type *N_EL, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsNLC& settings, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data ) { + +#ifdef GAUXC_HAS_CUDA + auto* lwd = dynamic_cast(this->local_work_driver_.get() ); + auto* stack_data = dynamic_cast(&device_data); + auto* aos_data = dynamic_cast(&device_data); + if( not stack_data or not aos_data ) GAUXC_BAD_LWD_DATA_CAST(); + + const auto& func = *this->func_; + if( not func.is_gga() ) { + GAUXC_GENERIC_EXCEPTION("NLC device FXC contraction currently requires a GGA functional"); + } + + const auto& mol = this->load_balancer_->molecule(); + BasisSetMap basis_map(basis,mol); + device_data.populate_submat_maps( basis.nbf(), task_begin, task_end, basis_map ); + + auto task_comparator = []( const XCTask& a, const XCTask& b ) { + return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); + }; + std::sort( task_begin, task_end, task_comparator ); + + auto& lb_state = this->load_balancer_->state(); + if( not lb_state.modified_weights_are_stored ) { + GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); + } + + integrator_term_tracker enabled_terms; + enabled_terms.fxc_contraction = true; + enabled_terms.ks_scheme = RKS; + enabled_terms.xc_approx = integrator_xc_approx::GGA; + + const auto nbf = basis.nbf(); + const auto nshells = basis.nshells(); + device_data.reset_allocations(); + device_data.allocate_static_data_fxc_contraction( nbf, nshells, enabled_terms ); + device_data.send_static_data_density_basis( Ps, ldps, nullptr, 0, nullptr, 0, nullptr, 0, basis ); + device_data.send_static_data_trial_density( tPs, ldtps, nullptr, 0, nullptr, 0, nullptr, 0 ); + + auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); + auto* backend = rt.device_backend(); + + std::vector local_packed; + auto task_it = task_begin; + while( task_it != task_end ) { + task_it = device_data.generate_buffers( enabled_terms, basis_map, task_it, task_end ); + + lwd->eval_collocation_gradient( &device_data ); + lwd->eval_xmat( 2.0, &device_data, false, DEN_S ); + lwd->eval_vvars_gga( &device_data, DEN_S ); + lwd->eval_uvars_gga( &device_data, RKS ); + lwd->eval_xmat_trial( 2.0, &device_data, false, DEN_S ); + lwd->eval_vvars_gga_trial( &device_data, DEN_S ); + + const size_t batch_npts = stack_data->total_npts_task_batch; + std::vector points_x(batch_npts), points_y(batch_npts), points_z(batch_npts); + std::vector weights(batch_npts), rho(batch_npts), gamma(batch_npts); + std::vector grad_x(batch_npts), grad_y(batch_npts), grad_z(batch_npts); + std::vector trho(batch_npts), tgrad_x(batch_npts), tgrad_y(batch_npts), tgrad_z(batch_npts); + + auto base_stack = stack_data->base_stack; + backend->copy_async( batch_npts, base_stack.points_x_device, points_x.data(), "NLC FXC points_x D2H" ); + backend->copy_async( batch_npts, base_stack.points_y_device, points_y.data(), "NLC FXC points_y D2H" ); + backend->copy_async( batch_npts, base_stack.points_z_device, points_z.data(), "NLC FXC points_z D2H" ); + backend->copy_async( batch_npts, base_stack.weights_device, weights.data(), "NLC FXC weights D2H" ); + backend->copy_async( batch_npts, base_stack.den_s_eval_device, rho.data(), "NLC FXC rho D2H" ); + backend->copy_async( batch_npts, base_stack.gamma_eval_device, gamma.data(), "NLC FXC gamma D2H" ); + backend->copy_async( batch_npts, base_stack.dden_sx_eval_device, grad_x.data(), "NLC FXC grad_x D2H" ); + backend->copy_async( batch_npts, base_stack.dden_sy_eval_device, grad_y.data(), "NLC FXC grad_y D2H" ); + backend->copy_async( batch_npts, base_stack.dden_sz_eval_device, grad_z.data(), "NLC FXC grad_z D2H" ); + backend->copy_async( batch_npts, base_stack.tden_s_eval_device, trho.data(), "NLC FXC trho D2H" ); + backend->copy_async( batch_npts, base_stack.tdden_sx_eval_device, tgrad_x.data(), "NLC FXC tgrad_x D2H" ); + backend->copy_async( batch_npts, base_stack.tdden_sy_eval_device, tgrad_y.data(), "NLC FXC tgrad_y D2H" ); + backend->copy_async( batch_npts, base_stack.tdden_sz_eval_device, tgrad_z.data(), "NLC FXC tgrad_z D2H" ); + backend->master_queue_synchronize(); + + local_packed.reserve(local_packed.size() + 13 * batch_npts); + for( size_t i = 0; i < batch_npts; ++i ) { + local_packed.push_back(points_x[i]); + local_packed.push_back(points_y[i]); + local_packed.push_back(points_z[i]); + local_packed.push_back(weights[i]); + local_packed.push_back(rho[i]); + local_packed.push_back(gamma[i]); + local_packed.push_back(grad_x[i]); + local_packed.push_back(grad_y[i]); + local_packed.push_back(grad_z[i]); + local_packed.push_back(trho[i]); + local_packed.push_back(tgrad_x[i]); + local_packed.push_back(tgrad_y[i]); + local_packed.push_back(tgrad_z[i]); + } + } + + size_t local_point_offset = 0; + const auto global_packed = vv10::allgather_packed_grid( + *this->reduction_driver_, local_packed, 13, local_point_offset ); + if( global_packed.size() % 13 != 0 ) { + GAUXC_GENERIC_EXCEPTION("Invalid VV10 device FXC packed grid size after allgather"); + } + + const size_t global_npts = global_packed.size() / 13; + std::vector coords(3 * global_npts), weights(global_npts), rho(global_npts), gamma(global_npts); + std::vector grad_x(global_npts), grad_y(global_npts), grad_z(global_npts); + std::vector trho(global_npts), tgrad_x(global_npts), tgrad_y(global_npts), tgrad_z(global_npts); + std::vector fxc_A(global_npts, 0.0), fxc_B(3 * global_npts, 0.0); + for( size_t i = 0; i < global_npts; ++i ) { + coords[3*i+0] = global_packed[13*i+0]; + coords[3*i+1] = global_packed[13*i+1]; + coords[3*i+2] = global_packed[13*i+2]; + weights[i] = global_packed[13*i+3]; + rho[i] = global_packed[13*i+4]; + gamma[i] = global_packed[13*i+5]; + grad_x[i] = global_packed[13*i+6]; + grad_y[i] = global_packed[13*i+7]; + grad_z[i] = global_packed[13*i+8]; + trho[i] = global_packed[13*i+9]; + tgrad_x[i] = global_packed[13*i+10]; + tgrad_y[i] = global_packed[13*i+11]; + tgrad_z[i] = global_packed[13*i+12]; + } + + vv10::eval_fxc_rks_cuda( 0, settings, + vv10::ResponseGridView{ global_npts, coords.data(), weights.data(), rho.data(), gamma.data(), + grad_x.data(), grad_y.data(), grad_z.data() }, + vv10::TrialView{ trho.data(), tgrad_x.data(), tgrad_y.data(), tgrad_z.data() }, + vv10::ResponseCorrectionsView{ fxc_A.data(), fxc_B.data() } ); + + device_data.zero_fxc_contraction_integrands(); + task_it = task_begin; + size_t local_batch_offset = 0; + while( task_it != task_end ) { + task_it = device_data.generate_buffers( enabled_terms, basis_map, task_it, task_end ); + + lwd->eval_collocation_gradient( &device_data ); + lwd->eval_xmat( 2.0, &device_data, false, DEN_S ); + lwd->eval_vvars_gga( &device_data, DEN_S ); + lwd->eval_uvars_gga( &device_data, RKS ); + lwd->inc_nel( &device_data ); + + const size_t batch_npts = stack_data->total_npts_task_batch; + const size_t global_offset = local_point_offset + local_batch_offset; + auto base_stack = stack_data->base_stack; + std::vector batch_A(batch_npts), batch_Bx(batch_npts), batch_By(batch_npts), batch_Bz(batch_npts); + + for( size_t i = 0; i < batch_npts; ++i ) { + const auto global_i = global_offset + i; + batch_A[i] = weights[global_i] * fxc_A[global_i]; + batch_Bx[i] = weights[global_i] * fxc_B[3*global_i+0]; + batch_By[i] = weights[global_i] * fxc_B[3*global_i+1]; + batch_Bz[i] = weights[global_i] * fxc_B[3*global_i+2]; + } + + backend->copy_async( batch_npts, batch_A.data(), base_stack.FXC_A_s_eval_device, "NLC FXC A H2D" ); + backend->copy_async( batch_npts, batch_Bx.data(), base_stack.FXC_Bx_s_eval_device, "NLC FXC Bx H2D" ); + backend->copy_async( batch_npts, batch_By.data(), base_stack.FXC_By_s_eval_device, "NLC FXC By H2D" ); + backend->copy_async( batch_npts, batch_Bz.data(), base_stack.FXC_Bz_s_eval_device, "NLC FXC Bz H2D" ); + backend->master_queue_synchronize(); + + lwd->eval_zmat_gga_fxc( &device_data, DEN_S ); + lwd->inc_fxc( &device_data, DEN_S, false ); + local_batch_offset += batch_npts; + } + + lwd->symmetrize_fxc( &device_data, DEN_S ); + backend->master_queue_synchronize(); + device_data.retrieve_fxc_contraction_integrands( N_EL, FXC, ldfxc, nullptr, 0, nullptr, 0, nullptr, 0 ); +#else + GAUXC_GENERIC_EXCEPTION("NLC device FXC contraction requires CUDA"); +#endif + } + template void IncoreReplicatedXCDeviceIntegrator:: fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, diff --git a/src/xc_integrator/replicated/device/vv10_nlc_device.hpp b/src/xc_integrator/replicated/device/vv10_nlc_device.hpp new file mode 100644 index 000000000..ab40eae65 --- /dev/null +++ b/src/xc_integrator/replicated/device/vv10_nlc_device.hpp @@ -0,0 +1,350 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once + +#include + +#ifdef GAUXC_HAS_CUDA + +#include "../host/vv10_nlc.hpp" +#include "device/cuda/kernels/vv10.hpp" +#include "device_specific/cuda_util.hpp" + +#include +#include +#include +#include + +namespace GauXC::detail::vv10 { + +inline Parameters make_cuda_parameters( const IntegratorSettingsNLC& settings ) { + return Parameters{ settings.vv10_b, settings.vv10_c, settings.vv10_tol, + settings.math_mode == NLCMathMode::FloatPair ? 1 : 0 }; +} + +inline void eval_exc_vxc_cuda( cudaStream_t stream, const IntegratorSettingsNLC& settings, + GridView grid, CorrectionsView corr ) { + if( grid.npts == 0 ) return; + + std::vector active_indices; + active_indices.reserve(grid.npts); + for( std::size_t i = 0; i < grid.npts; ++i ) { + if( grid.rho[i] >= settings.vv10_tol ) active_indices.emplace_back(i); + } + if( active_indices.empty() ) return; + + const auto active_npts = active_indices.size(); + const auto npts = static_cast( active_npts ); + const bool all_active = active_npts == grid.npts; + std::vector coords, rho, gamma, rho_weight(active_npts); + const double* coords_host = grid.coords; + const double* rho_host = grid.rho; + const double* gamma_host = grid.gamma; + if( all_active ) { + for( std::size_t i = 0; i < active_npts; ++i ) { + rho_weight[i] = grid.rho[i] * grid.weights[i]; + } + } else { + coords.resize(3 * active_npts); + rho.resize(active_npts); + gamma.resize(active_npts); + for( std::size_t active_i = 0; active_i < active_npts; ++active_i ) { + const auto i = active_indices[active_i]; + coords[3*active_i + 0] = grid.coords[3*i + 0]; + coords[3*active_i + 1] = grid.coords[3*i + 1]; + coords[3*active_i + 2] = grid.coords[3*i + 2]; + rho[active_i] = grid.rho[i]; + gamma[active_i] = grid.gamma[i]; + rho_weight[active_i] = grid.rho[i] * grid.weights[i]; + } + coords_host = coords.data(); + rho_host = rho.data(); + gamma_host = gamma.data(); + } + + auto* coords_device = util::cuda_malloc( 3 * active_npts ); + auto* rho_device = util::cuda_malloc( active_npts ); + auto* gamma_device = util::cuda_malloc( active_npts ); + auto* rho_weight_device = util::cuda_malloc( active_npts ); + auto* omega_device = util::cuda_malloc( active_npts ); + auto* kappa_device = util::cuda_malloc( active_npts ); + auto* eps_device = util::cuda_malloc( active_npts ); + auto* vrho_device = util::cuda_malloc( active_npts ); + auto* vgamma_device = util::cuda_malloc( active_npts ); + + util::cuda_copy( 3 * active_npts, coords_device, coords_host ); + util::cuda_copy( active_npts, rho_device, rho_host ); + util::cuda_copy( active_npts, gamma_device, gamma_host ); + util::cuda_copy( active_npts, rho_weight_device, rho_weight.data() ); + + const auto params = make_cuda_parameters(settings); + eval_omega_kappa_cuda( stream, npts, rho_device, gamma_device, params, + omega_device, kappa_device, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr ); + eval_exc_vxc_fock_cuda( stream, npts, coords_device, rho_device, gamma_device, + omega_device, kappa_device, rho_weight_device, params, eps_device, + vrho_device, vgamma_device ); + util::cuda_device_sync(); + + std::vector eps(active_npts), vrho(active_npts), vgamma(active_npts); + util::cuda_copy( active_npts, eps.data(), eps_device ); + util::cuda_copy( active_npts, vrho.data(), vrho_device ); + util::cuda_copy( active_npts, vgamma.data(), vgamma_device ); + + if( all_active ) { + for( std::size_t i = 0; i < active_npts; ++i ) { + corr.eps[i] += eps[i]; + corr.vrho[i] += vrho[i]; + corr.vgamma[i] += vgamma[i]; + } + } else { + for( std::size_t active_i = 0; active_i < active_npts; ++active_i ) { + const auto i = active_indices[active_i]; + corr.eps[i] += eps[active_i]; + corr.vrho[i] += vrho[active_i]; + corr.vgamma[i] += vgamma[active_i]; + } + } + + util::cuda_free( coords_device, rho_device, gamma_device, rho_weight_device, + omega_device, kappa_device, eps_device, vrho_device, vgamma_device ); +} + +inline void eval_fxc_rks_cuda( cudaStream_t stream, const IntegratorSettingsNLC& settings, + ResponseGridView grid, TrialView trial, + ResponseCorrectionsView corr ) { + if( grid.npts == 0 ) return; + + std::vector active_indices; + active_indices.reserve(grid.npts); + for( std::size_t i = 0; i < grid.npts; ++i ) { + if( grid.rho[i] >= settings.vv10_tol ) active_indices.emplace_back(i); + } + if( active_indices.empty() ) return; + + const auto active_npts = active_indices.size(); + const auto active_npts_i32 = static_cast(active_npts); + std::vector coords(3 * active_npts), weights(active_npts), rho(active_npts), gamma(active_npts); + std::vector rho_t(active_npts), gamma_t(active_npts); + + for( std::size_t active_i = 0; active_i < active_npts; ++active_i ) { + const auto i = active_indices[active_i]; + coords[3*active_i + 0] = grid.coords[3*i + 0]; + coords[3*active_i + 1] = grid.coords[3*i + 1]; + coords[3*active_i + 2] = grid.coords[3*i + 2]; + weights[active_i] = grid.weights[i]; + rho[active_i] = grid.rho[i]; + gamma[active_i] = grid.gamma[i]; + rho_t[active_i] = trial.rho[i]; + gamma_t[active_i] = 2.0 * (grid.grad_x[i] * trial.grad_x[i] + + grid.grad_y[i] * trial.grad_y[i] + grid.grad_z[i] * trial.grad_z[i]); + } + + auto* coords_device = util::cuda_malloc(3 * active_npts); + auto* weights_device = util::cuda_malloc(active_npts); + auto* rho_device = util::cuda_malloc(active_npts); + auto* gamma_device = util::cuda_malloc(active_npts); + auto* omega_device = util::cuda_malloc(active_npts); + auto* kappa_device = util::cuda_malloc(active_npts); + auto* domega_drho_device = util::cuda_malloc(active_npts); + auto* domega_dgamma_device = util::cuda_malloc(active_npts); + auto* d2omega_drho2_device = util::cuda_malloc(active_npts); + auto* d2omega_dgamma2_device = util::cuda_malloc(active_npts); + auto* d2omega_drho_dgamma_device = util::cuda_malloc(active_npts); + auto* dkappa_drho_device = util::cuda_malloc(active_npts); + auto* d2kappa_drho2_device = util::cuda_malloc(active_npts); + auto* u_device = util::cuda_malloc(active_npts); + auto* w_device = util::cuda_malloc(active_npts); + auto* a_device = util::cuda_malloc(active_npts); + auto* b_device = util::cuda_malloc(active_npts); + auto* c_device = util::cuda_malloc(active_npts); + auto* e_device = util::cuda_malloc(active_npts); + auto* rho_t_device = util::cuda_malloc(active_npts); + auto* gamma_t_device = util::cuda_malloc(active_npts); + auto* f_rho_t_device = util::cuda_malloc(active_npts); + auto* f_gamma_t_device = util::cuda_malloc(active_npts); + + util::cuda_copy(3 * active_npts, coords_device, coords.data()); + util::cuda_copy(active_npts, weights_device, weights.data()); + util::cuda_copy(active_npts, rho_device, rho.data()); + util::cuda_copy(active_npts, gamma_device, gamma.data()); + util::cuda_copy(active_npts, rho_t_device, rho_t.data()); + util::cuda_copy(active_npts, gamma_t_device, gamma_t.data()); + + const auto params = make_cuda_parameters(settings); + eval_omega_kappa_cuda(stream, active_npts_i32, rho_device, gamma_device, params, + omega_device, kappa_device, domega_drho_device, domega_dgamma_device, + d2omega_drho2_device, d2omega_dgamma2_device, d2omega_drho_dgamma_device, + dkappa_drho_device, d2kappa_drho2_device); + eval_hessian_intermediates_cuda(stream, active_npts_i32, coords_device, + weights_device, rho_device, omega_device, kappa_device, u_device, w_device, + a_device, b_device, c_device, e_device); + eval_hessian_contraction_cuda(stream, active_npts_i32, 1, coords_device, + weights_device, rho_device, omega_device, kappa_device, u_device, w_device, + a_device, b_device, c_device, domega_drho_device, domega_dgamma_device, + dkappa_drho_device, d2omega_drho2_device, d2omega_dgamma2_device, + d2omega_drho_dgamma_device, d2kappa_drho2_device, rho_t_device, + gamma_t_device, f_rho_t_device, f_gamma_t_device); + util::cuda_device_sync(); + + std::vector w(active_npts), domega_dgamma(active_npts); + std::vector f_rho_t(active_npts), f_gamma_t(active_npts); + util::cuda_copy(active_npts, w.data(), w_device); + util::cuda_copy(active_npts, domega_dgamma.data(), domega_dgamma_device); + util::cuda_copy(active_npts, f_rho_t.data(), f_rho_t_device); + util::cuda_copy(active_npts, f_gamma_t.data(), f_gamma_t_device); + + for( std::size_t active_i = 0; active_i < active_npts; ++active_i ) { + const auto i = active_indices[active_i]; + const double f_gamma = grid.rho[i] * domega_dgamma[active_i] * w[active_i]; + corr.A[i] += f_rho_t[active_i]; + corr.B[3*i] += 2.0 * (grid.grad_x[i] * f_gamma_t[active_i] + trial.grad_x[i] * f_gamma); + corr.B[3*i+1] += 2.0 * (grid.grad_y[i] * f_gamma_t[active_i] + trial.grad_y[i] * f_gamma); + corr.B[3*i+2] += 2.0 * (grid.grad_z[i] * f_gamma_t[active_i] + trial.grad_z[i] * f_gamma); + } + + util::cuda_free(coords_device, weights_device, rho_device, gamma_device, + omega_device, kappa_device, domega_drho_device, domega_dgamma_device, + d2omega_drho2_device, d2omega_dgamma2_device, d2omega_drho_dgamma_device, + dkappa_drho_device, d2kappa_drho2_device, u_device, w_device, a_device, + b_device, c_device, e_device, rho_t_device, gamma_t_device, + f_rho_t_device, f_gamma_t_device); +} + +inline void eval_grid_gradient_cuda( cudaStream_t stream, const IntegratorSettingsNLC& settings, + GridView grid, GridGradientView grad ) { + if( grid.npts == 0 ) return; + + std::vector active_indices; + active_indices.reserve(grid.npts); + for( std::size_t i = 0; i < grid.npts; ++i ) { + if( grid.rho[i] >= settings.vv10_tol ) active_indices.emplace_back(i); + } + if( active_indices.empty() ) return; + + const auto active_npts = active_indices.size(); + const auto active_npts_i32 = static_cast(active_npts); + std::vector coords(3 * active_npts), rho(active_npts), gamma(active_npts), rho_weight(active_npts); + for( std::size_t active_i = 0; active_i < active_npts; ++active_i ) { + const auto i = active_indices[active_i]; + coords[3*active_i + 0] = grid.coords[3*i + 0]; + coords[3*active_i + 1] = grid.coords[3*i + 1]; + coords[3*active_i + 2] = grid.coords[3*i + 2]; + rho[active_i] = grid.rho[i]; + gamma[active_i] = grid.gamma[i]; + rho_weight[active_i] = grid.rho[i] * grid.weights[i]; + } + + auto* coords_device = util::cuda_malloc(3 * active_npts); + auto* rho_device = util::cuda_malloc(active_npts); + auto* gamma_device = util::cuda_malloc(active_npts); + auto* rho_weight_device = util::cuda_malloc(active_npts); + auto* omega_device = util::cuda_malloc(active_npts); + auto* kappa_device = util::cuda_malloc(active_npts); + auto* gradient_device = util::cuda_malloc(3 * active_npts); + + util::cuda_copy(3 * active_npts, coords_device, coords.data()); + util::cuda_copy(active_npts, rho_device, rho.data()); + util::cuda_copy(active_npts, gamma_device, gamma.data()); + util::cuda_copy(active_npts, rho_weight_device, rho_weight.data()); + + const auto params = make_cuda_parameters(settings); + eval_omega_kappa_cuda(stream, active_npts_i32, rho_device, gamma_device, params, + omega_device, kappa_device, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr); + eval_grid_gradient_cuda(stream, active_npts_i32, coords_device, omega_device, + kappa_device, active_npts_i32, coords_device, rho_weight_device, + omega_device, kappa_device, gradient_device); + util::cuda_device_sync(); + + std::vector gradient(3 * active_npts); + util::cuda_copy(3 * active_npts, gradient.data(), gradient_device); + for( std::size_t active_i = 0; active_i < active_npts; ++active_i ) { + const auto i = active_indices[active_i]; + grad.x[i] += gradient[3*active_i + 0]; + grad.y[i] += gradient[3*active_i + 1]; + grad.z[i] += gradient[3*active_i + 2]; + } + + util::cuda_free(coords_device, rho_device, gamma_device, rho_weight_device, + omega_device, kappa_device, gradient_device); +} + +inline void eval_grid_gradient_excluding_same_parent_cuda( cudaStream_t stream, + const IntegratorSettingsNLC& settings, + GridView grid, + const std::vector& parent, + GridGradientView grad ) { + if( grid.npts == 0 ) return; + if( parent.size() != grid.npts ) { + GAUXC_GENERIC_EXCEPTION("Invalid VV10 device parent grid size"); + } + + std::vector active_indices; + active_indices.reserve(grid.npts); + for( std::size_t i = 0; i < grid.npts; ++i ) { + if( grid.rho[i] >= settings.vv10_tol ) active_indices.emplace_back(i); + } + if( active_indices.empty() ) return; + + const auto active_npts = active_indices.size(); + const auto active_npts_i32 = static_cast(active_npts); + std::vector coords(3 * active_npts), rho(active_npts), gamma(active_npts), rho_weight(active_npts); + std::vector active_parent(active_npts); + for( std::size_t active_i = 0; active_i < active_npts; ++active_i ) { + const auto i = active_indices[active_i]; + coords[3*active_i + 0] = grid.coords[3*i + 0]; + coords[3*active_i + 1] = grid.coords[3*i + 1]; + coords[3*active_i + 2] = grid.coords[3*i + 2]; + rho[active_i] = grid.rho[i]; + gamma[active_i] = grid.gamma[i]; + rho_weight[active_i] = grid.rho[i] * grid.weights[i]; + active_parent[active_i] = static_cast(parent[i]); + } + + auto* coords_device = util::cuda_malloc(3 * active_npts); + auto* rho_device = util::cuda_malloc(active_npts); + auto* gamma_device = util::cuda_malloc(active_npts); + auto* rho_weight_device = util::cuda_malloc(active_npts); + auto* omega_device = util::cuda_malloc(active_npts); + auto* kappa_device = util::cuda_malloc(active_npts); + auto* parent_device = util::cuda_malloc(active_npts); + auto* gradient_device = util::cuda_malloc(3 * active_npts); + + util::cuda_copy(3 * active_npts, coords_device, coords.data()); + util::cuda_copy(active_npts, rho_device, rho.data()); + util::cuda_copy(active_npts, gamma_device, gamma.data()); + util::cuda_copy(active_npts, rho_weight_device, rho_weight.data()); + util::cuda_copy(active_npts, parent_device, active_parent.data()); + + const auto params = make_cuda_parameters(settings); + eval_omega_kappa_cuda(stream, active_npts_i32, rho_device, gamma_device, params, + omega_device, kappa_device, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr); + eval_grid_gradient_excluding_same_parent_cuda(stream, active_npts_i32, + coords_device, omega_device, kappa_device, rho_weight_device, + parent_device, gradient_device); + util::cuda_device_sync(); + + std::vector gradient(3 * active_npts); + util::cuda_copy(3 * active_npts, gradient.data(), gradient_device); + for( std::size_t active_i = 0; active_i < active_npts; ++active_i ) { + const auto i = active_indices[active_i]; + grad.x[i] += gradient[3*active_i + 0]; + grad.y[i] += gradient[3*active_i + 1]; + grad.z[i] += gradient[3*active_i + 2]; + } + + util::cuda_free(coords_device, rho_device, gamma_device, rho_weight_device, + omega_device, kappa_device, parent_device, gradient_device); +} + +} // namespace GauXC::detail::vv10 + +#endif \ No newline at end of file diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp index f04ae24b7..0b7c52f75 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp @@ -15,6 +15,7 @@ #include "integrator_util/integrator_common.hpp" #include "host/local_host_work_driver.hpp" #include "host/blas.hpp" +#include "vv10_nlc.hpp" #include namespace GauXC::detail { @@ -35,8 +36,7 @@ void ReferenceReplicatedXCHostIntegrator:: GAUXC_GENERIC_EXCEPTION("P Must Have Same Dimension as Basis"); if( ldp < nbf ) GAUXC_GENERIC_EXCEPTION("Invalid LDP"); - - + // Get Tasks this->load_balancer_->get_tasks(); @@ -77,8 +77,7 @@ void ReferenceReplicatedXCHostIntegrator:: GAUXC_GENERIC_EXCEPTION("Invalid LDPS"); if( ldpz < nbf ) GAUXC_GENERIC_EXCEPTION("Invalid LDPZ"); - - + // Get Tasks this->load_balancer_->get_tasks(); @@ -142,6 +141,28 @@ void ReferenceReplicatedXCHostIntegrator:: auto& tasks = this->load_balancer_->get_tasks(); std::sort( tasks.begin(), tasks.end(), task_comparator ); + const auto vv10_settings = vv10::get_settings(settings); + const bool use_vv10 = vv10::enabled(settings); + if( use_vv10 ) { + if( not (func.is_gga() or func.is_mgga()) ) { + GAUXC_GENERIC_EXCEPTION("VV10 EXC_GRAD requires a GGA/MGGA density path"); + } + } + + std::vector vv10_task_offsets; + std::vector vv10_coords; + std::vector vv10_weights; + std::vector vv10_rho; + std::vector vv10_gamma; + std::vector vv10_eps; + std::vector vv10_vrho; + std::vector vv10_vgamma; + std::vector vv10_grid_grad_x; + std::vector vv10_grid_grad_y; + std::vector vv10_grid_grad_z; + std::vector vv10_parent; + size_t vv10_local_point_offset = 0; + // Check that Partition Weights have been calculated auto& lb_state = this->load_balancer_->state(); @@ -150,6 +171,131 @@ void ReferenceReplicatedXCHostIntegrator:: } XCWeightAlg& weight_alg = lb_state.weight_alg; + if( use_vv10 ) { + const size_t ntasks = tasks.size(); + size_t vv10_npts = 0; + vv10_task_offsets.resize(ntasks); + for( size_t iT = 0; iT < ntasks; ++iT ) { + vv10_task_offsets[iT] = vv10_npts; + vv10_npts += tasks[iT].points.size(); + } + + vv10_coords .resize( 3 * vv10_npts ); + vv10_weights.resize( vv10_npts ); + vv10_rho .resize( vv10_npts ); + vv10_gamma .resize( vv10_npts ); + vv10_eps .resize( vv10_npts, 0.0 ); + vv10_vrho .resize( vv10_npts, 0.0 ); + vv10_vgamma .resize( vv10_npts, 0.0 ); + vv10_parent .resize( vv10_npts ); + + XCHostData vv10_host_data; + for( size_t iT = 0; iT < ntasks; ++iT ) { + const auto& task = tasks[iT]; + const int32_t npts = task.points.size(); + const int32_t nbe = task.bfn_screening.nbe; + const int32_t nshells = task.bfn_screening.shell_list.size(); + const auto* points = task.points.data()->data(); + const auto* weights = task.weights.data(); + const int32_t* shell_list = task.bfn_screening.shell_list.data(); + + vv10_host_data.basis_eval.resize( 4 * npts * nbe ); + vv10_host_data.den_scr.resize( 4 * npts ); + vv10_host_data.gamma.resize( npts ); + vv10_host_data.zmat.resize( npts * nbe ); + vv10_host_data.nbe_scr.resize( nbe * nbe ); + + auto* basis_eval = vv10_host_data.basis_eval.data(); + auto* dbasis_x_eval = basis_eval + npts * nbe; + auto* dbasis_y_eval = dbasis_x_eval + npts * nbe; + auto* dbasis_z_eval = dbasis_y_eval + npts * nbe; + auto* den_eval = vv10_host_data.den_scr.data(); + auto* dden_x_eval = den_eval + npts; + auto* dden_y_eval = dden_x_eval + npts; + auto* dden_z_eval = dden_y_eval + npts; + auto* gamma_eval = vv10_host_data.gamma.data(); + auto* zmat = vv10_host_data.zmat.data(); + auto* nbe_scr = vv10_host_data.nbe_scr.data(); + + std::vector< std::array > submat_map; + std::tie(submat_map, std::ignore) = + gen_compressed_submat_map(basis_map, task.bfn_screening.shell_list, nbf, nbf); + + lwd->eval_collocation_gradient( npts, nshells, nbe, points, basis, shell_list, + basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval ); + lwd->eval_xmat( npts, nbf, nbe, submat_map, is_rks ? 2.0 : 1.0, Ps, ldps, basis_eval, nbe, + zmat, nbe, nbe_scr ); + lwd->eval_uvvar_gga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, zmat, nbe, den_eval, dden_x_eval, dden_y_eval, dden_z_eval, + gamma_eval ); + + const size_t offset = vv10_task_offsets[iT]; + for( int32_t i = 0; i < npts; ++i ) { + vv10_coords[3*(offset + i) + 0] = points[3*i + 0]; + vv10_coords[3*(offset + i) + 1] = points[3*i + 1]; + vv10_coords[3*(offset + i) + 2] = points[3*i + 2]; + vv10_weights[offset + i] = weights[i]; + vv10_rho[offset + i] = den_eval[i]; + vv10_gamma[offset + i] = gamma_eval[i]; + vv10_parent[offset + i] = static_cast( task.iParent ); + } + } + + std::vector vv10_local_packed( 6 * vv10_npts ); + for( size_t i = 0; i < vv10_npts; ++i ) { + vv10_local_packed[6*i+0] = vv10_coords[3*i+0]; + vv10_local_packed[6*i+1] = vv10_coords[3*i+1]; + vv10_local_packed[6*i+2] = vv10_coords[3*i+2]; + vv10_local_packed[6*i+3] = vv10_weights[i]; + vv10_local_packed[6*i+4] = vv10_rho[i]; + vv10_local_packed[6*i+5] = vv10_gamma[i]; + } + + const auto vv10_global_packed = vv10::allgather_packed_grid( + *this->reduction_driver_, vv10_local_packed, 6, vv10_local_point_offset ); + if( vv10_global_packed.size() % 6 != 0 ) { + GAUXC_GENERIC_EXCEPTION("Invalid VV10 EXC_GRAD packed grid size after allgather"); + } + vv10_parent = this->reduction_driver_->allgather_v( vv10_parent.data(), vv10_parent.size() ); + const size_t vv10_global_npts = vv10_global_packed.size() / 6; + if( vv10_parent.size() != vv10_global_npts ) { + GAUXC_GENERIC_EXCEPTION("Invalid VV10 EXC_GRAD parent grid size after allgather"); + } + if( vv10_local_point_offset + vv10_npts > vv10_global_npts ) { + GAUXC_GENERIC_EXCEPTION("Invalid VV10 EXC_GRAD local grid offset after allgather"); + } + + vv10_coords .resize( 3 * vv10_global_npts ); + vv10_weights.resize( vv10_global_npts ); + vv10_rho .resize( vv10_global_npts ); + vv10_gamma .resize( vv10_global_npts ); + vv10_eps .assign( vv10_global_npts, 0.0 ); + vv10_vrho .assign( vv10_global_npts, 0.0 ); + vv10_vgamma .assign( vv10_global_npts, 0.0 ); + vv10_grid_grad_x.assign( vv10_global_npts, 0.0 ); + vv10_grid_grad_y.assign( vv10_global_npts, 0.0 ); + vv10_grid_grad_z.assign( vv10_global_npts, 0.0 ); + + for( size_t i = 0; i < vv10_global_npts; ++i ) { + vv10_coords[3*i+0] = vv10_global_packed[6*i+0]; + vv10_coords[3*i+1] = vv10_global_packed[6*i+1]; + vv10_coords[3*i+2] = vv10_global_packed[6*i+2]; + vv10_weights[i] = vv10_global_packed[6*i+3]; + vv10_rho[i] = vv10_global_packed[6*i+4]; + vv10_gamma[i] = vv10_global_packed[6*i+5]; + } + + vv10::eval_exc_vxc( vv10_settings, + vv10::GridView{ vv10_global_npts, vv10_coords.data(), vv10_weights.data(), vv10_rho.data(), vv10_gamma.data() }, + vv10::CorrectionsView{ vv10_eps.data(), vv10_vrho.data(), vv10_vgamma.data() } ); + if( exc_grad_settings.include_weight_derivatives ) { + vv10::eval_grid_gradient_excluding_same_parent( vv10_settings, + vv10::GridView{ vv10_global_npts, vv10_coords.data(), vv10_weights.data(), vv10_rho.data(), vv10_gamma.data() }, + vv10_parent, + vv10::GridGradientView{ vv10_grid_grad_x.data(), vv10_grid_grad_y.data(), vv10_grid_grad_z.data() } ); + } + } + // Zero out integrands for( auto i = 0; i < 3*natoms; ++i ) { EXC_GRAD[i] = 0.; @@ -384,21 +530,62 @@ void ReferenceReplicatedXCHostIntegrator:: // Evaluate XC functional - if( func.is_mgga() ) + if( use_vv10 ) { + std::fill_n( eps, npts, 0.0 ); + std::fill_n( vrho, npts * spin_dim_scal, 0.0 ); + if( func.is_gga() or func.is_mgga() ) std::fill_n( vgamma, npts * gga_dim_scal, 0.0 ); + if( func.is_mgga() ) { + std::fill_n( vtau, npts * spin_dim_scal, 0.0 ); + if( needs_laplacian ) std::fill_n( vlapl, npts * spin_dim_scal, 0.0 ); + } + } else if( func.is_mgga() ) func.eval_exc_vxc( npts, den_eval, gamma, lapl, tau, eps, vrho, vgamma, vlapl, vtau ); else if(func.is_gga() ) func.eval_exc_vxc( npts, den_eval, gamma, eps, vrho, vgamma ); else func.eval_exc_vxc( npts, den_eval, eps, vrho ); + if( use_vv10 ) { + const size_t vv10_offset = vv10_task_offsets[iT]; + for( int32_t i = 0; i < npts; ++i ) { + const size_t vv10_global_i = vv10_local_point_offset + vv10_offset + i; + if( is_rks ) { + vv10::add_rks_correction( i, vv10_eps[vv10_global_i], vv10_vrho[vv10_global_i], + vv10_vgamma[vv10_global_i], eps, vrho, vgamma ); + } else { + vv10::add_uks_correction( i, vv10_eps[vv10_global_i], vv10_vrho[vv10_global_i], + vv10_vgamma[vv10_global_i], eps, vrho, vgamma ); + } + } + } + if(exc_grad_settings.include_weight_derivatives){ // grid weight contribution to exc grad for( int ipt = 0; ipt < npts; ++ipt ) { + if( use_vv10 ) { + const size_t vv10_global_i = vv10_local_point_offset + vv10_task_offsets[iT] + ipt; + eps[ipt] += vv10_eps[vv10_global_i] - vv10::beta(vv10_settings); + } const auto den = is_rks ? den_eval[ipt] : (den_eval[2*ipt] + den_eval[2*ipt+1]); eps[ipt] *= den * weights[ipt]; } lwd->eval_weight_1st_deriv_contracted( weight_alg, mol, molmeta, task, eps, EXC_GRAD); + + if( use_vv10 ) { + const size_t vv10_offset = vv10_task_offsets[iT]; + for( int ipt = 0; ipt < npts; ++ipt ) { + const size_t vv10_global_i = vv10_local_point_offset + vv10_offset + ipt; + const auto den = is_rks ? den_eval[ipt] : (den_eval[2*ipt] + den_eval[2*ipt+1]); + const auto pref = den * weights[ipt]; + #pragma omp atomic + EXC_GRAD[3*task.iParent + 0] += pref * vv10_grid_grad_x[vv10_global_i]; + #pragma omp atomic + EXC_GRAD[3*task.iParent + 1] += pref * vv10_grid_grad_y[vv10_global_i]; + #pragma omp atomic + EXC_GRAD[3*task.iParent + 2] += pref * vv10_grid_grad_z[vv10_global_i]; + } + } } diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp index 141085c9f..9b88f3737 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp @@ -15,6 +15,7 @@ #include "integrator_util/integrator_common.hpp" #include "host/local_host_work_driver.hpp" #include "host/blas.hpp" +#include "vv10_nlc.hpp" #include namespace GauXC::detail { @@ -162,6 +163,27 @@ void ReferenceReplicatedXCHostIntegrator:: auto& tasks = this->load_balancer_->get_tasks(); std::sort( task_begin, task_end, task_comparator ); + const auto vv10_settings = vv10::get_settings(settings); + const bool use_vv10 = vv10::enabled(settings); + if( use_vv10 ) { + if( is_gks ) { + GAUXC_GENERIC_EXCEPTION("VV10 non-local correlation is not yet implemented for GKS"); + } + if( not (func.is_gga() or func.is_mgga()) ) { + GAUXC_GENERIC_EXCEPTION("VV10 non-local correlation requires a GGA/MGGA density path"); + } + } + + std::vector vv10_task_offsets; + std::vector vv10_coords; + std::vector vv10_weights; + std::vector vv10_rho; + std::vector vv10_gamma; + std::vector vv10_eps; + std::vector vv10_vrho; + std::vector vv10_vgamma; + size_t vv10_local_point_offset = 0; + // Check that Partition Weights have been calculated auto& lb_state = this->load_balancer_->state(); @@ -169,6 +191,116 @@ void ReferenceReplicatedXCHostIntegrator:: GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); } + if( use_vv10 ) { + const size_t ntasks = std::distance(task_begin, task_end); + size_t vv10_npts = 0; + vv10_task_offsets.resize(ntasks); + for( size_t iT = 0; iT < ntasks; ++iT ) { + vv10_task_offsets[iT] = vv10_npts; + vv10_npts += (task_begin + iT)->points.size(); + } + + vv10_coords .resize( 3 * vv10_npts ); + vv10_weights.resize( vv10_npts ); + vv10_rho .resize( vv10_npts ); + vv10_gamma .resize( vv10_npts ); + vv10_eps .resize( vv10_npts, 0.0 ); + vv10_vrho .resize( vv10_npts, 0.0 ); + vv10_vgamma .resize( vv10_npts, 0.0 ); + + XCHostData vv10_host_data; + for( size_t iT = 0; iT < ntasks; ++iT ) { + const auto& task = *(task_begin + iT); + const int32_t npts = task.points.size(); + const int32_t nbe = task.bfn_screening.nbe; + const int32_t nshells = task.bfn_screening.shell_list.size(); + const auto* points = task.points.data()->data(); + const auto* weights = task.weights.data(); + const int32_t* shell_list = task.bfn_screening.shell_list.data(); + + vv10_host_data.basis_eval.resize( 4 * npts * nbe ); + vv10_host_data.den_scr.resize( 4 * npts ); + vv10_host_data.gamma.resize( npts ); + vv10_host_data.zmat.resize( npts * nbe ); + vv10_host_data.nbe_scr.resize( nbe * nbe ); + + auto* basis_eval = vv10_host_data.basis_eval.data(); + auto* dbasis_x_eval = basis_eval + npts * nbe; + auto* dbasis_y_eval = dbasis_x_eval + npts * nbe; + auto* dbasis_z_eval = dbasis_y_eval + npts * nbe; + auto* den_eval = vv10_host_data.den_scr.data(); + auto* dden_x_eval = den_eval + npts; + auto* dden_y_eval = dden_x_eval + npts; + auto* dden_z_eval = dden_y_eval + npts; + auto* gamma_eval = vv10_host_data.gamma.data(); + auto* zmat = vv10_host_data.zmat.data(); + auto* nbe_scr = vv10_host_data.nbe_scr.data(); + + std::vector< std::array > submat_map; + std::tie(submat_map, std::ignore) = + gen_compressed_submat_map(basis_map, task.bfn_screening.shell_list, nbf, nbf); + + lwd->eval_collocation_gradient( npts, nshells, nbe, points, basis, shell_list, + basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval ); + lwd->eval_xmat( npts, nbf, nbe, submat_map, is_rks ? 2.0 : 1.0, Ps, ldps, basis_eval, nbe, + zmat, nbe, nbe_scr ); + lwd->eval_uvvar_gga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, zmat, nbe, den_eval, dden_x_eval, dden_y_eval, dden_z_eval, + gamma_eval ); + + const size_t offset = vv10_task_offsets[iT]; + for( int32_t i = 0; i < npts; ++i ) { + vv10_coords[3*(offset + i) + 0] = points[3*i + 0]; + vv10_coords[3*(offset + i) + 1] = points[3*i + 1]; + vv10_coords[3*(offset + i) + 2] = points[3*i + 2]; + vv10_weights[offset + i] = weights[i]; + vv10_rho[offset + i] = den_eval[i]; + vv10_gamma[offset + i] = gamma_eval[i]; + } + } + + std::vector vv10_local_packed( 6 * vv10_npts ); + for( size_t i = 0; i < vv10_npts; ++i ) { + vv10_local_packed[6*i+0] = vv10_coords[3*i+0]; + vv10_local_packed[6*i+1] = vv10_coords[3*i+1]; + vv10_local_packed[6*i+2] = vv10_coords[3*i+2]; + vv10_local_packed[6*i+3] = vv10_weights[i]; + vv10_local_packed[6*i+4] = vv10_rho[i]; + vv10_local_packed[6*i+5] = vv10_gamma[i]; + } + + const auto vv10_global_packed = vv10::allgather_packed_grid( + *this->reduction_driver_, vv10_local_packed, 6, vv10_local_point_offset ); + if( vv10_global_packed.size() % 6 != 0 ) { + GAUXC_GENERIC_EXCEPTION("Invalid VV10 EXC/VXC packed grid size after allgather"); + } + const size_t vv10_global_npts = vv10_global_packed.size() / 6; + if( vv10_local_point_offset + vv10_npts > vv10_global_npts ) { + GAUXC_GENERIC_EXCEPTION("Invalid VV10 EXC/VXC local grid offset after allgather"); + } + + vv10_coords .resize( 3 * vv10_global_npts ); + vv10_weights.resize( vv10_global_npts ); + vv10_rho .resize( vv10_global_npts ); + vv10_gamma .resize( vv10_global_npts ); + vv10_eps .assign( vv10_global_npts, 0.0 ); + vv10_vrho .assign( vv10_global_npts, 0.0 ); + vv10_vgamma .assign( vv10_global_npts, 0.0 ); + + for( size_t i = 0; i < vv10_global_npts; ++i ) { + vv10_coords[3*i+0] = vv10_global_packed[6*i+0]; + vv10_coords[3*i+1] = vv10_global_packed[6*i+1]; + vv10_coords[3*i+2] = vv10_global_packed[6*i+2]; + vv10_weights[i] = vv10_global_packed[6*i+3]; + vv10_rho[i] = vv10_global_packed[6*i+4]; + vv10_gamma[i] = vv10_global_packed[6*i+5]; + } + + vv10::eval_exc_vxc( vv10_settings, + vv10::GridView{ vv10_global_npts, vv10_coords.data(), vv10_weights.data(), vv10_rho.data(), vv10_gamma.data() }, + vv10::CorrectionsView{ vv10_eps.data(), vv10_vrho.data(), vv10_vgamma.data() } ); + } + // Zero out integrands if(VXCs) @@ -443,13 +575,35 @@ void ReferenceReplicatedXCHostIntegrator:: } // Evaluate XC functional - if( func.is_mgga() ) + if( use_vv10 ) { + std::fill_n( eps, npts, 0.0 ); + std::fill_n( vrho, npts * spin_dim_scal, 0.0 ); + if( func.is_gga() or func.is_mgga() ) std::fill_n( vgamma, npts * gga_dim_scal, 0.0 ); + if( func.is_mgga() ) { + std::fill_n( vtau, npts * spin_dim_scal, 0.0 ); + if( needs_laplacian ) std::fill_n( vlapl, npts * spin_dim_scal, 0.0 ); + } + } else if( func.is_mgga() ) func.eval_exc_vxc( npts, den_eval, gamma, lapl, tau, eps, vrho, vgamma, vlapl, vtau); else if( func.is_gga() ) func.eval_exc_vxc( npts, den_eval, gamma, eps, vrho, vgamma ); else func.eval_exc_vxc( npts, den_eval, eps, vrho ); + if( use_vv10 ) { + const size_t vv10_offset = vv10_task_offsets[iT]; + for( int32_t i = 0; i < npts; ++i ) { + const size_t vv10_global_i = vv10_local_point_offset + vv10_offset + i; + if( is_rks ) { + vv10::add_rks_correction( i, vv10_eps[vv10_global_i], vv10_vrho[vv10_global_i], + vv10_vgamma[vv10_global_i], eps, vrho, vgamma ); + } else { + vv10::add_uks_correction( i, vv10_eps[vv10_global_i], vv10_vrho[vv10_global_i], + vv10_vgamma[vv10_global_i], eps, vrho, vgamma ); + } + } + } + // Factor weights into XC results for( int32_t i = 0; i < npts; ++i ) { eps[i] *= weights[i]; diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_fxc_contraction.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_fxc_contraction.hpp index 192fe0f89..e56b61670 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_fxc_contraction.hpp +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_fxc_contraction.hpp @@ -15,6 +15,7 @@ #include "integrator_util/integrator_common.hpp" #include "host/local_host_work_driver.hpp" #include "host/blas.hpp" +#include "vv10_nlc.hpp" #include namespace GauXC::detail { @@ -135,12 +136,194 @@ void ReferenceReplicatedXCHostIntegrator:: auto& tasks = this->load_balancer_->get_tasks(); std::sort( task_begin, task_end, task_comparator ); + const auto vv10_settings = vv10::get_settings(settings); + const bool use_vv10 = vv10::enabled(settings); + if( use_vv10 ) { + if( not (func.is_gga() or func.is_mgga()) ) { + GAUXC_GENERIC_EXCEPTION("VV10 FXC contraction requires a GGA/MGGA density path"); + } + } + + std::vector vv10_task_offsets; + std::vector vv10_coords; + std::vector vv10_weights; + std::vector vv10_rho; + std::vector vv10_gamma; + std::vector vv10_grad_x; + std::vector vv10_grad_y; + std::vector vv10_grad_z; + std::vector vv10_trho; + std::vector vv10_tgrad_x; + std::vector vv10_tgrad_y; + std::vector vv10_tgrad_z; + std::vector vv10_A; + std::vector vv10_B; + size_t vv10_local_point_offset = 0; + // Check that Partition Weights have been calculated auto& lb_state = this->load_balancer_->state(); if( not lb_state.modified_weights_are_stored ) { GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); } + if( use_vv10 ) { + const size_t ntasks = std::distance(task_begin, task_end); + size_t vv10_npts = 0; + vv10_task_offsets.resize(ntasks); + for( size_t iT = 0; iT < ntasks; ++iT ) { + vv10_task_offsets[iT] = vv10_npts; + vv10_npts += (task_begin + iT)->points.size(); + } + + vv10_coords .resize( 3 * vv10_npts ); + vv10_weights.resize( vv10_npts ); + vv10_rho .resize( vv10_npts ); + vv10_gamma .resize( vv10_npts ); + vv10_grad_x .resize( vv10_npts ); + vv10_grad_y .resize( vv10_npts ); + vv10_grad_z .resize( vv10_npts ); + vv10_trho .resize( vv10_npts ); + vv10_tgrad_x.resize( vv10_npts ); + vv10_tgrad_y.resize( vv10_npts ); + vv10_tgrad_z.resize( vv10_npts ); + vv10_A .resize( vv10_npts, 0.0 ); + vv10_B .resize( 3 * vv10_npts, 0.0 ); + + XCHostData vv10_host_data; + for( size_t iT = 0; iT < ntasks; ++iT ) { + const auto& task = *(task_begin + iT); + const int32_t npts = task.points.size(); + const int32_t nbe = task.bfn_screening.nbe; + const int32_t nshells = task.bfn_screening.shell_list.size(); + const auto* points = task.points.data()->data(); + const auto* weights = task.weights.data(); + const int32_t* shell_list = task.bfn_screening.shell_list.data(); + + vv10_host_data.basis_eval.resize( 4 * npts * nbe ); + vv10_host_data.den_scr.resize( 4 * npts ); + vv10_host_data.tden_scr.resize( 4 * npts ); + vv10_host_data.gamma.resize( npts ); + vv10_host_data.zmat.resize( npts * nbe ); + vv10_host_data.nbe_scr.resize( nbe * nbe ); + + auto* basis_eval = vv10_host_data.basis_eval.data(); + auto* dbasis_x_eval = basis_eval + npts * nbe; + auto* dbasis_y_eval = dbasis_x_eval + npts * nbe; + auto* dbasis_z_eval = dbasis_y_eval + npts * nbe; + auto* den_eval = vv10_host_data.den_scr.data(); + auto* dden_x_eval = den_eval + npts; + auto* dden_y_eval = dden_x_eval + npts; + auto* dden_z_eval = dden_y_eval + npts; + auto* tden_eval = vv10_host_data.tden_scr.data(); + auto* tdden_x_eval = tden_eval + npts; + auto* tdden_y_eval = tdden_x_eval + npts; + auto* tdden_z_eval = tdden_y_eval + npts; + auto* gamma_eval = vv10_host_data.gamma.data(); + auto* zmat = vv10_host_data.zmat.data(); + auto* nbe_scr = vv10_host_data.nbe_scr.data(); + + std::vector< std::array > submat_map; + std::tie(submat_map, std::ignore) = + gen_compressed_submat_map(basis_map, task.bfn_screening.shell_list, nbf, nbf); + + lwd->eval_collocation_gradient( npts, nshells, nbe, points, basis, shell_list, + basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval ); + lwd->eval_xmat( npts, nbf, nbe, submat_map, is_rks ? 2.0 : 1.0, Ps, ldps, basis_eval, nbe, + zmat, nbe, nbe_scr ); + lwd->eval_uvvar_gga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, zmat, nbe, den_eval, dden_x_eval, dden_y_eval, dden_z_eval, + gamma_eval ); + + const size_t offset = vv10_task_offsets[iT]; + for( int32_t i = 0; i < npts; ++i ) { + vv10_coords[3*(offset + i) + 0] = points[3*i + 0]; + vv10_coords[3*(offset + i) + 1] = points[3*i + 1]; + vv10_coords[3*(offset + i) + 2] = points[3*i + 2]; + vv10_weights[offset + i] = weights[i]; + vv10_rho[offset + i] = den_eval[i]; + vv10_gamma[offset + i] = gamma_eval[i]; + vv10_grad_x[offset + i] = dden_x_eval[i]; + vv10_grad_y[offset + i] = dden_y_eval[i]; + vv10_grad_z[offset + i] = dden_z_eval[i]; + } + + lwd->eval_xmat( npts, nbf, nbe, submat_map, is_rks ? 2.0 : 1.0, tPs, ldtps, basis_eval, nbe, + zmat, nbe, nbe_scr ); + lwd->eval_uvvar_gga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, zmat, nbe, tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, + gamma_eval ); + for( int32_t i = 0; i < npts; ++i ) { + vv10_trho[offset + i] = tden_eval[i]; + vv10_tgrad_x[offset + i] = tdden_x_eval[i]; + vv10_tgrad_y[offset + i] = tdden_y_eval[i]; + vv10_tgrad_z[offset + i] = tdden_z_eval[i]; + } + } + + std::vector vv10_local_packed( 13 * vv10_npts ); + for( size_t i = 0; i < vv10_npts; ++i ) { + vv10_local_packed[13*i+0] = vv10_coords[3*i+0]; + vv10_local_packed[13*i+1] = vv10_coords[3*i+1]; + vv10_local_packed[13*i+2] = vv10_coords[3*i+2]; + vv10_local_packed[13*i+3] = vv10_weights[i]; + vv10_local_packed[13*i+4] = vv10_rho[i]; + vv10_local_packed[13*i+5] = vv10_gamma[i]; + vv10_local_packed[13*i+6] = vv10_grad_x[i]; + vv10_local_packed[13*i+7] = vv10_grad_y[i]; + vv10_local_packed[13*i+8] = vv10_grad_z[i]; + vv10_local_packed[13*i+9] = vv10_trho[i]; + vv10_local_packed[13*i+10] = vv10_tgrad_x[i]; + vv10_local_packed[13*i+11] = vv10_tgrad_y[i]; + vv10_local_packed[13*i+12] = vv10_tgrad_z[i]; + } + + const auto vv10_global_packed = vv10::allgather_packed_grid( + *this->reduction_driver_, vv10_local_packed, 13, vv10_local_point_offset ); + if( vv10_global_packed.size() % 13 != 0 ) { + GAUXC_GENERIC_EXCEPTION("Invalid VV10 FXC packed grid size after allgather"); + } + const size_t vv10_global_npts = vv10_global_packed.size() / 13; + if( vv10_local_point_offset + vv10_npts > vv10_global_npts ) { + GAUXC_GENERIC_EXCEPTION("Invalid VV10 FXC local grid offset after allgather"); + } + + vv10_coords .resize( 3 * vv10_global_npts ); + vv10_weights.resize( vv10_global_npts ); + vv10_rho .resize( vv10_global_npts ); + vv10_gamma .resize( vv10_global_npts ); + vv10_grad_x .resize( vv10_global_npts ); + vv10_grad_y .resize( vv10_global_npts ); + vv10_grad_z .resize( vv10_global_npts ); + vv10_trho .resize( vv10_global_npts ); + vv10_tgrad_x.resize( vv10_global_npts ); + vv10_tgrad_y.resize( vv10_global_npts ); + vv10_tgrad_z.resize( vv10_global_npts ); + vv10_A .assign( vv10_global_npts, 0.0 ); + vv10_B .assign( 3 * vv10_global_npts, 0.0 ); + + for( size_t i = 0; i < vv10_global_npts; ++i ) { + vv10_coords[3*i+0] = vv10_global_packed[13*i+0]; + vv10_coords[3*i+1] = vv10_global_packed[13*i+1]; + vv10_coords[3*i+2] = vv10_global_packed[13*i+2]; + vv10_weights[i] = vv10_global_packed[13*i+3]; + vv10_rho[i] = vv10_global_packed[13*i+4]; + vv10_gamma[i] = vv10_global_packed[13*i+5]; + vv10_grad_x[i] = vv10_global_packed[13*i+6]; + vv10_grad_y[i] = vv10_global_packed[13*i+7]; + vv10_grad_z[i] = vv10_global_packed[13*i+8]; + vv10_trho[i] = vv10_global_packed[13*i+9]; + vv10_tgrad_x[i] = vv10_global_packed[13*i+10]; + vv10_tgrad_y[i] = vv10_global_packed[13*i+11]; + vv10_tgrad_z[i] = vv10_global_packed[13*i+12]; + } + + vv10::eval_fxc_rks( vv10_settings, + vv10::ResponseGridView{ vv10_global_npts, vv10_coords.data(), vv10_weights.data(), vv10_rho.data(), + vv10_gamma.data(), vv10_grad_x.data(), vv10_grad_y.data(), vv10_grad_z.data() }, + vv10::TrialView{ vv10_trho.data(), vv10_tgrad_x.data(), vv10_tgrad_y.data(), vv10_tgrad_z.data() }, + vv10::ResponseCorrectionsView{ vv10_A.data(), vv10_B.data() } ); + } + // Zero out integrands for( auto j = 0; j < nbf; ++j ) @@ -428,69 +611,95 @@ void ReferenceReplicatedXCHostIntegrator:: } } - // Evaluate XC functional - if( func.is_mgga() ) - func.eval_vxc_fxc( npts, den_eval, gamma, lapl, tau, vrho, vgamma, vlapl, vtau, - v2rho2, v2rhogamma, v2rholapl, v2rhotau, v2gamma2, - v2gammalapl, v2gammatau, v2lapl2, v2lapltau, v2tau2); - else if( func.is_gga() ) - func.eval_vxc_fxc( npts, den_eval, gamma, vrho, vgamma, v2rho2, v2rhogamma, v2gamma2 ); - else - func.eval_vxc_fxc( npts, den_eval, vrho, v2rho2 ); - - //calculate the trial density variables - // Evaluate X matrix (fac * tP * B) -> store in Z - lwd->eval_xmat( mgga_dim_scal * npts, nbf, nbe, submat_map, xmat_fac, tPs, ldps, basis_eval, nbe, - zmat, nbe, nbe_scr ); - // X matrix for tPz - if(not is_rks) { - lwd->eval_xmat( mgga_dim_scal * npts, nbf, nbe, submat_map, 1.0, tPz, ldpz, basis_eval, nbe, - zmat_z, nbe, nbe_scr); - } - // Evaluate U and V trial variables - if( func.is_mgga() ) { - if (is_rks) { - lwd->eval_uvvar_mgga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, - dbasis_z_eval, lbasis_eval, zmat, nbe, mmat_x, mmat_y, mmat_z, - nbe, tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, gamma, ttau, tlapl); - lwd->eval_tmat_mgga_vxc_rks( npts, vgamma, v2rho2, v2rhogamma, v2rholapl, v2rhotau, v2gamma2, - v2gammalapl, v2gammatau, v2lapl2, v2lapltau, v2tau2, tden_eval, tdden_x_eval, - tdden_y_eval, tdden_z_eval, ttau, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B, FXC_C ); - } else if (is_uks) { - // tgamma is not needed since it has different definitions than gamma - // gamma = nabla rho * nabla rho, but tgamma = nabla trho * nabla rho, not both trho - lwd->eval_uvvar_mgga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, - dbasis_z_eval, lbasis_eval, zmat, nbe, zmat_z, nbe, - mmat_x, mmat_y, mmat_z, nbe, mmat_x_z, mmat_y_z, mmat_z_z, nbe, - tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, gamma, ttau, tlapl); - lwd->eval_tmat_mgga_vxc_uks( npts, vgamma, v2rho2, v2rhogamma, v2rholapl, v2rhotau, v2gamma2, - v2gammalapl, v2gammatau, v2lapl2, v2lapltau, v2tau2, tden_eval, tdden_x_eval, - tdden_y_eval, tdden_z_eval, ttau, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B, FXC_C ); + if( not use_vv10 ) { + // Evaluate XC functional + if( func.is_mgga() ) + func.eval_vxc_fxc( npts, den_eval, gamma, lapl, tau, vrho, vgamma, vlapl, vtau, + v2rho2, v2rhogamma, v2rholapl, v2rhotau, v2gamma2, + v2gammalapl, v2gammatau, v2lapl2, v2lapltau, v2tau2); + else if( func.is_gga() ) + func.eval_vxc_fxc( npts, den_eval, gamma, vrho, vgamma, v2rho2, v2rhogamma, v2gamma2 ); + else + func.eval_vxc_fxc( npts, den_eval, vrho, v2rho2 ); + + //calculate the trial density variables + // Evaluate X matrix (fac * tP * B) -> store in Z + lwd->eval_xmat( mgga_dim_scal * npts, nbf, nbe, submat_map, xmat_fac, tPs, ldps, basis_eval, nbe, + zmat, nbe, nbe_scr ); + // X matrix for tPz + if(not is_rks) { + lwd->eval_xmat( mgga_dim_scal * npts, nbf, nbe, submat_map, 1.0, tPz, ldpz, basis_eval, nbe, + zmat_z, nbe, nbe_scr); } - } else if ( func.is_gga() ) { - if(is_rks) { - lwd->eval_uvvar_gga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, - dbasis_z_eval, zmat, nbe, tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, - gamma ); - lwd->eval_tmat_gga_vxc_rks( npts, vgamma, v2rho2, v2rhogamma, v2gamma2, tden_eval, tdden_x_eval, + // Evaluate U and V trial variables + if( func.is_mgga() ) { + if (is_rks) { + lwd->eval_uvvar_mgga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, lbasis_eval, zmat, nbe, mmat_x, mmat_y, mmat_z, + nbe, tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, gamma, ttau, tlapl); + lwd->eval_tmat_mgga_vxc_rks( npts, vgamma, v2rho2, v2rhogamma, v2rholapl, v2rhotau, v2gamma2, + v2gammalapl, v2gammatau, v2lapl2, v2lapltau, v2tau2, tden_eval, tdden_x_eval, + tdden_y_eval, tdden_z_eval, ttau, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B, FXC_C ); + } else if (is_uks) { + // tgamma is not needed since it has different definitions than gamma + // gamma = nabla rho * nabla rho, but tgamma = nabla trho * nabla rho, not both trho + lwd->eval_uvvar_mgga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, lbasis_eval, zmat, nbe, zmat_z, nbe, + mmat_x, mmat_y, mmat_z, nbe, mmat_x_z, mmat_y_z, mmat_z_z, nbe, + tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, gamma, ttau, tlapl); + lwd->eval_tmat_mgga_vxc_uks( npts, vgamma, v2rho2, v2rhogamma, v2rholapl, v2rhotau, v2gamma2, + v2gammalapl, v2gammatau, v2lapl2, v2lapltau, v2tau2, tden_eval, tdden_x_eval, + tdden_y_eval, tdden_z_eval, ttau, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B, FXC_C ); + } + } else if ( func.is_gga() ) { + if(is_rks) { + lwd->eval_uvvar_gga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, zmat, nbe, tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, + gamma ); + lwd->eval_tmat_gga_vxc_rks( npts, vgamma, v2rho2, v2rhogamma, v2gamma2, tden_eval, tdden_x_eval, + tdden_y_eval, tdden_z_eval, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B ); + } else if(is_uks) { + // tgamma is not needed since it has quite different definitions than gamma + lwd->eval_uvvar_gga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, zmat, nbe, zmat_z, nbe, tden_eval, tdden_x_eval, + tdden_y_eval, tdden_z_eval, gamma ); + lwd->eval_tmat_gga_vxc_uks( npts, vgamma, v2rho2, v2rhogamma, v2gamma2, tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B ); - } else if(is_uks) { - // tgamma is not needed since it has quite different definitions than gamma - lwd->eval_uvvar_gga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, - dbasis_z_eval, zmat, nbe, zmat_z, nbe, tden_eval, tdden_x_eval, - tdden_y_eval, tdden_z_eval, gamma ); - lwd->eval_tmat_gga_vxc_uks( npts, vgamma, v2rho2, v2rhogamma, v2gamma2, tden_eval, tdden_x_eval, - tdden_y_eval, tdden_z_eval, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B ); + } + } else { + // LDA + if(is_rks) { + lwd->eval_uvvar_lda_rks( npts, nbe, basis_eval, zmat, nbe, tden_eval ); + lwd->eval_tmat_lda_vxc_rks( npts, v2rho2, tden_eval, FXC_A); + } else if(is_uks) { + lwd->eval_uvvar_lda_uks( npts, nbe, basis_eval, zmat, nbe, zmat_z, nbe, + tden_eval ); + lwd->eval_tmat_lda_vxc_uks( npts, v2rho2, tden_eval, FXC_A); + } } - } else { - // LDA - if(is_rks) { - lwd->eval_uvvar_lda_rks( npts, nbe, basis_eval, zmat, nbe, tden_eval ); - lwd->eval_tmat_lda_vxc_rks( npts, v2rho2, tden_eval, FXC_A); - } else if(is_uks) { - lwd->eval_uvvar_lda_uks( npts, nbe, basis_eval, zmat, nbe, zmat_z, nbe, - tden_eval ); - lwd->eval_tmat_lda_vxc_uks( npts, v2rho2, tden_eval, FXC_A); + } + + if( use_vv10 ) { + if( func.is_mgga() ) std::fill_n( FXC_C, npts * spin_dim_scal, 0.0 ); + + const size_t vv10_offset = vv10_task_offsets[iT]; + for( int32_t i = 0; i < npts; ++i ) { + const size_t vv10_global_i = vv10_local_point_offset + vv10_offset + i; + if( is_rks ) { + FXC_A[i] = vv10_A[vv10_global_i]; + FXC_B[3*i] = vv10_B[3*vv10_global_i]; + FXC_B[3*i+1] = vv10_B[3*vv10_global_i+1]; + FXC_B[3*i+2] = vv10_B[3*vv10_global_i+2]; + } else { + FXC_A[2*i] = vv10_A[vv10_global_i]; + FXC_A[2*i+1] = vv10_A[vv10_global_i]; + FXC_B[6*i] = vv10_B[3*vv10_global_i]; + FXC_B[6*i+1] = vv10_B[3*vv10_global_i+1]; + FXC_B[6*i+2] = vv10_B[3*vv10_global_i+2]; + FXC_B[6*i+3] = vv10_B[3*vv10_global_i]; + FXC_B[6*i+4] = vv10_B[3*vv10_global_i+1]; + FXC_B[6*i+5] = vv10_B[3*vv10_global_i+2]; + } } } @@ -535,14 +744,16 @@ void ReferenceReplicatedXCHostIntegrator:: // Because we do not support Laplacian, so mgga will do the same operation as GGA lwd->eval_zmat_gga_vxc_rks_ts( npts, nbe, FXC_A, FXC_B, basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval, zmat, nbe); - lwd->eval_mmat_mgga_vxc_rks( npts, nbe, FXC_C, vlapl, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval, - mmat_x, mmat_y, mmat_z, nbe); + if( not use_vv10 ) + lwd->eval_mmat_mgga_vxc_rks( npts, nbe, FXC_C, vlapl, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval, + mmat_x, mmat_y, mmat_z, nbe); } else if (is_uks) { // Because we do not support Laplacian, so mgga will do the same operation as GGA lwd->eval_zmat_gga_vxc_uks_ts( npts, nbe, FXC_A, FXC_B, basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval, zmat, nbe, zmat_z, nbe); - lwd->eval_mmat_mgga_vxc_uks_ts( npts, nbe, FXC_C, vlapl, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval, - mmat_x, mmat_y, mmat_z, nbe, mmat_x_z, mmat_y_z, mmat_z_z, nbe); + if( not use_vv10 ) + lwd->eval_mmat_mgga_vxc_uks_ts( npts, nbe, FXC_C, vlapl, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval, + mmat_x, mmat_y, mmat_z, nbe, mmat_x_z, mmat_y_z, mmat_z_z, nbe); } } else if( func.is_gga() ) { @@ -566,9 +777,10 @@ void ReferenceReplicatedXCHostIntegrator:: { // Increment VXC - lwd->inc_vxc( mgga_dim_scal * npts, nbf, nbe, basis_eval, submat_map, zmat, nbe, FXCa, ldfxca, nbe_scr ); + const auto npts_to_increment = use_vv10 ? npts : mgga_dim_scal * npts; + lwd->inc_vxc( npts_to_increment, nbf, nbe, basis_eval, submat_map, zmat, nbe, FXCa, ldfxca, nbe_scr ); if( not is_rks ) - lwd->inc_vxc( mgga_dim_scal * npts, nbf, nbe, basis_eval, submat_map, zmat_z, nbe, FXCb, ldfxcb, nbe_scr); + lwd->inc_vxc( npts_to_increment, nbf, nbe, basis_eval, submat_map, zmat_z, nbe, FXCb, ldfxcb, nbe_scr); } } // Loop over tasks diff --git a/src/xc_integrator/replicated/host/vv10_nlc.hpp b/src/xc_integrator/replicated/host/vv10_nlc.hpp new file mode 100644 index 000000000..55b0f5ba5 --- /dev/null +++ b/src/xc_integrator/replicated/host/vv10_nlc.hpp @@ -0,0 +1,478 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace GauXC::detail::vv10 { + +struct GridView { + std::size_t npts = 0; + const double* coords = nullptr; + const double* weights = nullptr; + const double* rho = nullptr; + const double* gamma = nullptr; +}; + +struct CorrectionsView { + double* eps = nullptr; + double* vrho = nullptr; + double* vgamma = nullptr; +}; + +struct ResponseGridView { + std::size_t npts = 0; + const double* coords = nullptr; + const double* weights = nullptr; + const double* rho = nullptr; + const double* gamma = nullptr; + const double* grad_x = nullptr; + const double* grad_y = nullptr; + const double* grad_z = nullptr; +}; + +struct TrialView { + const double* rho = nullptr; + const double* grad_x = nullptr; + const double* grad_y = nullptr; + const double* grad_z = nullptr; +}; + +struct ResponseCorrectionsView { + double* A = nullptr; + double* B = nullptr; +}; + +struct GridGradientView { + double* x = nullptr; + double* y = nullptr; + double* z = nullptr; +}; + +inline void add_rks_correction( std::size_t i, double eps, double vrho, double vgamma, + double* eps_out, double* vrho_out, double* vgamma_out ) { + eps_out[i] += eps; + vrho_out[i] += vrho; + vgamma_out[i] += vgamma; +} + +inline void add_uks_correction( std::size_t i, double eps, double vrho, double vgamma, + double* eps_out, double* vrho_out, double* vgamma_out ) { + eps_out[i] += eps; + vrho_out[2*i] += vrho; + vrho_out[2*i+1] += vrho; + vgamma_out[3*i] += vgamma; + vgamma_out[3*i+1] += 2.0 * vgamma; + vgamma_out[3*i+2] += vgamma; +} + +inline IntegratorSettingsNLC get_settings( const IntegratorSettingsXC& settings ) { + IntegratorSettingsNLC vv10_settings; + if( auto* tmp = dynamic_cast(&settings) ) { + vv10_settings = *tmp; + } + return vv10_settings; +} + +inline bool enabled( const IntegratorSettingsXC& settings ) { + return dynamic_cast(&settings) != nullptr; +} + +inline double beta( const IntegratorSettingsNLC& settings ) { + return std::pow( 3.0 / (settings.vv10_b * settings.vv10_b), 0.75 ) / 32.0; +} + +inline double kappa_prefactor( const IntegratorSettingsNLC& settings ) { + const double pi = 3.141592653589793238462643383279502884; + return settings.vv10_b * 1.5 * pi * std::pow( 9.0 * pi, -1.0 / 6.0 ); +} + +inline std::vector allgather_packed_grid( const ReductionDriver& reduction_driver, + const std::vector& local_packed, + std::size_t stride, + std::size_t& local_point_offset ) { + const auto local_npts = static_cast( local_packed.size() / stride ); + const auto all_npts = reduction_driver.allgather_v( &local_npts, 1 ); + + local_point_offset = 0; + const int rank = reduction_driver.comm_rank(); + for( int i = 0; i < rank; ++i ) { + local_point_offset += static_cast( all_npts[i] ); + } + + return reduction_driver.allgather_v( local_packed.data(), local_packed.size() ); +} + +inline void eval_exc_vxc( const IntegratorSettingsNLC& settings, + GridView grid, CorrectionsView corr ) { + if( grid.npts == 0 ) return; + + const auto npts = grid.npts; + const double pi = 3.141592653589793238462643383279502884; + const double pi43 = 4.0 * pi / 3.0; + const double k_factor = settings.vv10_b * 1.5 * pi * std::pow( 9.0 * pi, -1.0 / 6.0 ); + const double beta_value = beta(settings); + + std::vector omega(npts, 0.0), kappa(npts, 0.0), rp_weight(npts, 0.0); + std::vector domega_drho(npts, 0.0), domega_dgamma(npts, 0.0), dkappa_drho(npts, 0.0); + std::vector active(npts, 0); + + for( std::size_t i = 0; i < npts; ++i ) { + const double rho = grid.rho[i]; + if( rho < settings.vv10_tol ) continue; + + const double gamma = std::max( grid.gamma[i], 0.0 ); + const double rho2 = rho * rho; + const double omega_tmp = settings.vv10_c * (gamma / rho2) * (gamma / rho2); + const double omega_i = std::sqrt( omega_tmp + pi43 * rho ); + + active[i] = 1; + omega[i] = omega_i; + kappa[i] = k_factor * std::pow( rho, 1.0 / 6.0 ); + rp_weight[i] = rho * grid.weights[i]; + domega_drho[i] = (0.5 * pi43 * rho - 2.0 * omega_tmp) / omega_i; + domega_dgamma[i] = gamma > 0.0 ? omega_tmp * rho / (gamma * omega_i) : 0.0; + dkappa_drho[i] = kappa[i] / 6.0; + } + + for( std::size_t i = 0; i < npts; ++i ) { + if( not active[i] ) continue; + + double f = 0.0; + double u = 0.0; + double w = 0.0; + const double* coord_i = grid.coords + 3*i; + + for( std::size_t j = 0; j < npts; ++j ) { + if( not active[j] ) continue; + + const double* coord_j = grid.coords + 3*j; + const double dx = coord_j[0] - coord_i[0]; + const double dy = coord_j[1] - coord_i[1]; + const double dz = coord_j[2] - coord_i[2]; + const double r2 = dx*dx + dy*dy + dz*dz; + const double gp = r2 * omega[j] + kappa[j]; + const double g = r2 * omega[i] + kappa[i]; + const double gt = g + gp; + double t = rp_weight[j] / (g * gp * gt); + + f += t; + t *= 1.0/g + 1.0/gt; + u += t; + w += t * r2; + } + + f *= -1.5; + corr.eps[i] += beta_value + 0.5 * f; + corr.vrho[i] += beta_value + f + 1.5 * (u * dkappa_drho[i] + w * domega_drho[i]); + corr.vgamma[i] += 1.5 * w * domega_dgamma[i]; + } +} + +inline void eval_fxc_rks( const IntegratorSettingsNLC& settings, + ResponseGridView grid, TrialView trial, + ResponseCorrectionsView corr ) { + if( grid.npts == 0 ) return; + + const auto npts = grid.npts; + const double pi = 3.141592653589793238462643383279502884; + const double pi43 = 4.0 * pi / 3.0; + const double k_factor = settings.vv10_b * 1.5 * pi * std::pow( 9.0 * pi, -1.0 / 6.0 ); + + std::vector omega(npts, 0.0), kappa(npts, 0.0); + std::vector uvec(npts, 0.0), wvec(npts, 0.0), avec(npts, 0.0), bvec(npts, 0.0), cvec(npts, 0.0); + std::vector domega_drho(npts, 0.0), domega_dgamma(npts, 0.0), dkappa_drho(npts, 0.0); + std::vector d2omega_drho2(npts, 0.0), d2omega_dgamma2(npts, 0.0), d2omega_drho_dgamma(npts, 0.0); + std::vector d2kappa_drho2(npts, 0.0), gamma_t(npts, 0.0), f_rho_t(npts, 0.0), f_gamma_t(npts, 0.0); + std::vector active(npts, 0); + + for( std::size_t i = 0; i < npts; ++i ) { + const double rho = grid.rho[i]; + if( rho < settings.vv10_tol ) continue; + + const double gamma = std::max( grid.gamma[i], 0.0 ); + const double rho_inv = 1.0 / rho; + const double rho2_inv = rho_inv * rho_inv; + const double rho3_inv = rho2_inv * rho_inv; + const double rho4_inv = rho2_inv * rho2_inv; + const double rho5_inv = rho4_inv * rho_inv; + const double gamma2 = gamma * gamma; + const double omega2 = settings.vv10_c * gamma2 * rho4_inv + pi43 * rho; + const double omega_i = std::sqrt( omega2 ); + const double omega_inv = 1.0 / omega_i; + const double omega3_inv = omega_inv / omega2; + + active[i] = 1; + omega[i] = omega_i; + kappa[i] = k_factor * std::pow( rho, 1.0 / 6.0 ); + domega_drho[i] = 0.5 * (pi43 - 4.0 * settings.vv10_c * gamma2 * rho5_inv) * omega_inv; + domega_dgamma[i] = settings.vv10_c * gamma * rho4_inv * omega_inv; + dkappa_drho[i] = k_factor * (1.0 / 6.0) * std::pow( rho, -5.0 / 6.0 ); + d2omega_drho2[i] = (-0.25 * pi43 * pi43 + + 12.0 * pi43 * settings.vv10_c * gamma2 * rho5_inv + + 6.0 * settings.vv10_c * settings.vv10_c * gamma2 * gamma2 * rho5_inv * rho5_inv) * omega3_inv; + d2omega_dgamma2[i] = pi43 * settings.vv10_c * rho3_inv * omega3_inv; + d2omega_drho_dgamma[i] = -settings.vv10_c * gamma * + (4.5 * pi43 * rho4_inv + 2.0 * settings.vv10_c * gamma2 * rho4_inv * rho5_inv) * omega3_inv; + d2kappa_drho2[i] = k_factor * (-5.0 / 36.0) * std::pow( rho, -11.0 / 6.0 ); + gamma_t[i] = 2.0 * (grid.grad_x[i] * trial.grad_x[i] + + grid.grad_y[i] * trial.grad_y[i] + grid.grad_z[i] * trial.grad_z[i]); + } + + for( std::size_t i = 0; i < npts; ++i ) { + if( not active[i] ) continue; + + const double omega_i = omega[i]; + const double kappa_i = kappa[i]; + const double* coord_i = grid.coords + 3*i; + + double u_i = 0.0; + double w_i = 0.0; + double a_i = 0.0; + double b_i = 0.0; + double c_i = 0.0; + + for( std::size_t j = 0; j < npts; ++j ) { + if( not active[j] ) continue; + + const double* coord_j = grid.coords + 3*j; + const double dx = coord_i[0] - coord_j[0]; + const double dy = coord_i[1] - coord_j[1]; + const double dz = coord_i[2] - coord_j[2]; + const double r2 = dx*dx + dy*dy + dz*dz; + const double g_ij = omega_i * r2 + kappa_i; + const double g_ji = omega[j] * r2 + kappa[j]; + const double g_ij_inv = 1.0 / g_ij; + const double g_sum_inv = 1.0 / (g_ij + g_ji); + const double phi_ij = -1.5 / g_ji * g_ij_inv * g_sum_inv; + const double e_ij = grid.weights[j] * grid.rho[j] * phi_ij; + const double u_ij = e_ij * (g_sum_inv + g_ij_inv); + const double w_ij = u_ij * r2; + const double a_ij = e_ij * (g_sum_inv * g_sum_inv + g_sum_inv * g_ij_inv + g_ij_inv * g_ij_inv); + const double b_ij = a_ij * r2; + const double c_ij = b_ij * r2; + + u_i += u_ij; + w_i += w_ij; + a_i += a_ij; + b_i += b_ij; + c_i += c_ij; + } + + uvec[i] = -u_i; + wvec[i] = -w_i; + avec[i] = 2.0 * a_i; + bvec[i] = 2.0 * b_i; + cvec[i] = 2.0 * c_i; + } + + for( std::size_t i = 0; i < npts; ++i ) { + if( not active[i] ) continue; + + const double omega_i = omega[i]; + const double kappa_i = kappa[i]; + const double rho_i = grid.rho[i]; + const double domega_drho_i = domega_drho[i]; + const double domega_dgamma_i = domega_dgamma[i]; + const double dkappa_drho_i = dkappa_drho[i]; + const double* coord_i = grid.coords + 3*i; + + double f_rho_t_i = 0.0; + double f_gamma_t_i = 0.0; + + for( std::size_t j = 0; j < npts; ++j ) { + if( not active[j] ) continue; + + const double* coord_j = grid.coords + 3*j; + const double dx = coord_i[0] - coord_j[0]; + const double dy = coord_i[1] - coord_j[1]; + const double dz = coord_i[2] - coord_j[2]; + const double r2 = dx*dx + dy*dy + dz*dz; + const double g_ij = omega_i * r2 + kappa_i; + const double g_ji = omega[j] * r2 + kappa[j]; + const double g_ij_inv = 1.0 / g_ij; + const double g_ji_inv = 1.0 / g_ji; + const double g_sum_inv = 1.0 / (g_ij + g_ji); + const double phi_ij = -1.5 * g_ij_inv * g_ji_inv * g_sum_inv; + const double rho_dgdrho_i = rho_i * (r2 * domega_drho_i + dkappa_drho_i); + const double rho_dgdrho_j = grid.rho[j] * (r2 * domega_drho[j] + dkappa_drho[j]); + const double d2phi_dgij_dgji_over_phi = 2.0 * (g_sum_inv * g_sum_inv + g_ij_inv * g_ji_inv); + + const double f_rho_rho_ij = phi_ij * (rho_dgdrho_i * rho_dgdrho_j * d2phi_dgij_dgji_over_phi + - rho_dgdrho_i * (g_sum_inv + g_ij_inv) + - rho_dgdrho_j * (g_sum_inv + g_ji_inv) + 1.0); + const double f_gamma_rho_ij = rho_i * domega_dgamma_i * r2 * phi_ij * + (rho_dgdrho_j * d2phi_dgij_dgji_over_phi - (g_sum_inv + g_ij_inv)); + const double f_rho_gamma_ij = grid.rho[j] * domega_dgamma[j] * r2 * phi_ij * + (rho_dgdrho_i * d2phi_dgij_dgji_over_phi - (g_sum_inv + g_ji_inv)); + const double f_gamma_gamma_ij = rho_i * grid.rho[j] * domega_dgamma_i * domega_dgamma[j] + * r2 * r2 * phi_ij * d2phi_dgij_dgji_over_phi; + + f_rho_t_i += grid.weights[j] * (f_rho_rho_ij * trial.rho[j] + f_rho_gamma_ij * gamma_t[j]); + f_gamma_t_i += grid.weights[j] * (f_gamma_rho_ij * trial.rho[j] + f_gamma_gamma_ij * gamma_t[j]); + } + + const double f_rho_rho_ii = 2.0 * domega_drho_i * wvec[i] + 2.0 * dkappa_drho_i * uvec[i] + + rho_i * (d2omega_drho2[i] * wvec[i] + d2kappa_drho2[i] * uvec[i] + + dkappa_drho_i * dkappa_drho_i * avec[i] + domega_drho_i * domega_drho_i * cvec[i] + + 2.0 * domega_drho_i * dkappa_drho_i * bvec[i]); + const double f_gamma_rho_ii = domega_dgamma_i * wvec[i] + rho_i * + (d2omega_drho_dgamma[i] * wvec[i] + domega_dgamma_i * + (dkappa_drho_i * bvec[i] + domega_drho_i * cvec[i])); + const double f_gamma_gamma_ii = rho_i * + (d2omega_dgamma2[i] * wvec[i] + domega_dgamma_i * domega_dgamma_i * cvec[i]); + + f_rho_t_i += f_rho_rho_ii * trial.rho[i] + f_gamma_rho_ii * gamma_t[i]; + f_gamma_t_i += f_gamma_rho_ii * trial.rho[i] + f_gamma_gamma_ii * gamma_t[i]; + + f_rho_t[i] = f_rho_t_i; + f_gamma_t[i] = f_gamma_t_i; + } + + for( std::size_t i = 0; i < npts; ++i ) { + if( not active[i] ) continue; + + const double f_gamma = grid.rho[i] * domega_dgamma[i] * wvec[i]; + corr.A[i] += f_rho_t[i]; + corr.B[3*i] += 2.0 * (grid.grad_x[i] * f_gamma_t[i] + trial.grad_x[i] * f_gamma); + corr.B[3*i+1] += 2.0 * (grid.grad_y[i] * f_gamma_t[i] + trial.grad_y[i] * f_gamma); + corr.B[3*i+2] += 2.0 * (grid.grad_z[i] * f_gamma_t[i] + trial.grad_z[i] * f_gamma); + } +} + +inline void eval_grid_gradient( const IntegratorSettingsNLC& settings, + GridView grid, GridGradientView grad ) { + if( grid.npts == 0 ) return; + + const auto npts = grid.npts; + const double pi = 3.141592653589793238462643383279502884; + const double pi43 = 4.0 * pi / 3.0; + const double k_factor = settings.vv10_b * 1.5 * pi * std::pow( 9.0 * pi, -1.0 / 6.0 ); + + std::vector omega(npts, 0.0), kappa(npts, 0.0), rho_weight(npts, 0.0); + std::vector active(npts, 0); + + for( std::size_t i = 0; i < npts; ++i ) { + const double rho = grid.rho[i]; + if( rho < settings.vv10_tol ) continue; + + const double gamma = std::max( grid.gamma[i], 0.0 ); + const double rho2 = rho * rho; + const double omega_tmp = settings.vv10_c * (gamma / rho2) * (gamma / rho2); + active[i] = 1; + omega[i] = std::sqrt( omega_tmp + pi43 * rho ); + kappa[i] = k_factor * std::pow( rho, 1.0 / 6.0 ); + rho_weight[i] = rho * grid.weights[i]; + } + + for( std::size_t i = 0; i < npts; ++i ) { + if( not active[i] ) continue; + + double fx = 0.0; + double fy = 0.0; + double fz = 0.0; + const double* coord_i = grid.coords + 3*i; + + for( std::size_t j = 0; j < npts; ++j ) { + if( not active[j] ) continue; + + const double* coord_j = grid.coords + 3*j; + const double dx = coord_j[0] - coord_i[0]; + const double dy = coord_j[1] - coord_i[1]; + const double dz = coord_j[2] - coord_i[2]; + const double r2 = dx*dx + dy*dy + dz*dz; + const double gp = r2 * omega[j] + kappa[j]; + const double g = r2 * omega[i] + kappa[i]; + const double gt = g + gp; + const double t = rho_weight[j] / (g * gp * gt); + const double q = t * (omega[i]/g + omega[j]/gp + (omega[i]+omega[j])/gt); + + fx += q * dx; + fy += q * dy; + fz += q * dz; + } + + grad.x[i] += -3.0 * fx; + grad.y[i] += -3.0 * fy; + grad.z[i] += -3.0 * fz; + } +} + +inline void eval_grid_gradient_excluding_same_parent( const IntegratorSettingsNLC& settings, + GridView grid, + const std::vector& parent, + GridGradientView grad ) { + if( grid.npts == 0 ) return; + + const auto npts = grid.npts; + const double pi = 3.141592653589793238462643383279502884; + const double pi43 = 4.0 * pi / 3.0; + const double k_factor = settings.vv10_b * 1.5 * pi * std::pow( 9.0 * pi, -1.0 / 6.0 ); + + std::vector omega(npts, 0.0), kappa(npts, 0.0), rho_weight(npts, 0.0); + std::vector active(npts, 0); + + for( std::size_t i = 0; i < npts; ++i ) { + const double rho = grid.rho[i]; + if( rho < settings.vv10_tol ) continue; + + const double gamma = std::max( grid.gamma[i], 0.0 ); + const double rho2 = rho * rho; + const double omega_tmp = settings.vv10_c * (gamma / rho2) * (gamma / rho2); + active[i] = 1; + omega[i] = std::sqrt( omega_tmp + pi43 * rho ); + kappa[i] = k_factor * std::pow( rho, 1.0 / 6.0 ); + rho_weight[i] = rho * grid.weights[i]; + } + + for( std::size_t i = 0; i < npts; ++i ) { + if( not active[i] ) continue; + + double fx = 0.0; + double fy = 0.0; + double fz = 0.0; + const double* coord_i = grid.coords + 3*i; + + for( std::size_t j = 0; j < npts; ++j ) { + if( not active[j] or parent[i] == parent[j] ) continue; + + const double* coord_j = grid.coords + 3*j; + const double dx = coord_j[0] - coord_i[0]; + const double dy = coord_j[1] - coord_i[1]; + const double dz = coord_j[2] - coord_i[2]; + const double r2 = dx*dx + dy*dy + dz*dz; + const double gp = r2 * omega[j] + kappa[j]; + const double g = r2 * omega[i] + kappa[i]; + const double gt = g + gp; + const double t = rho_weight[j] / (g * gp * gt); + const double q = t * (omega[i]/g + omega[j]/gp + (omega[i]+omega[j])/gt); + + fx += q * dx; + fy += q * dy; + fz += q * dz; + } + + grad.x[i] += -3.0 * fx; + grad.y[i] += -3.0 * fy; + grad.z[i] += -3.0 * fz; + } +} + +} // namespace GauXC::detail::vv10 \ No newline at end of file diff --git a/src/xc_integrator/replicated/replicated_xc_integrator_impl.cxx b/src/xc_integrator/replicated/replicated_xc_integrator_impl.cxx index 071afe312..752cbd700 100644 --- a/src/xc_integrator/replicated/replicated_xc_integrator_impl.cxx +++ b/src/xc_integrator/replicated/replicated_xc_integrator_impl.cxx @@ -10,6 +10,7 @@ * See LICENSE.txt for details */ #include +#include namespace GauXC { namespace detail { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 844466280..d99d6e199 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -66,6 +66,8 @@ add_executable( gauxc_test weight_derivative_test.cxx standards.cxx runtime.cxx + vv10_nlc_test.cxx + vv10_test.cxx basis/parse_basis.cxx dd_psi_potential_test.cxx 2nd_derivative_test.cxx @@ -89,6 +91,11 @@ target_link_libraries( standalone_driver PUBLIC gauxc gauxc_catch2 Eigen3::Eigen target_include_directories( standalone_driver PRIVATE ${PROJECT_BINARY_DIR}/tests ) target_include_directories( standalone_driver PRIVATE ${PROJECT_SOURCE_DIR}/tests ) +add_executable( vv10_benchmark vv10_benchmark.cxx ) +target_link_libraries( vv10_benchmark PUBLIC gauxc Eigen3::Eigen ) +target_include_directories( vv10_benchmark PRIVATE ${PROJECT_BINARY_DIR}/tests ) +target_include_directories( vv10_benchmark PRIVATE ${PROJECT_SOURCE_DIR}/tests ) + #add_executable( grid_opt grid_opt.cxx standards.cxx basis/parse_basis.cxx ini_input.cxx ) #target_link_libraries( grid_opt PUBLIC gauxc gauxc_catch2 Eigen3::Eigen ) #target_include_directories( grid_opt PRIVATE ${PROJECT_BINARY_DIR}/tests ) diff --git a/tests/ref_data/vv10_pyscf_reference.hdf5 b/tests/ref_data/vv10_pyscf_reference.hdf5 new file mode 100644 index 000000000..fcb3d2712 Binary files /dev/null and b/tests/ref_data/vv10_pyscf_reference.hdf5 differ diff --git a/tests/ut_main.cxx b/tests/ut_main.cxx index 754205156..bdd1fe23b 100644 --- a/tests/ut_main.cxx +++ b/tests/ut_main.cxx @@ -26,7 +26,10 @@ int main( int argc, char* argv[] ) { int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); #ifdef GAUXC_HAS_CUDA - cudaSetDevice(rank); + int device_count = 0; + if(cudaGetDeviceCount(&device_count) == cudaSuccess and device_count > 0) { + cudaSetDevice(rank % device_count); + } #endif int result = Catch::Session().run( argc, argv ); MPI_Finalize(); diff --git a/tests/vv10_benchmark.cxx b/tests/vv10_benchmark.cxx new file mode 100644 index 000000000..4bb736fd4 --- /dev/null +++ b/tests/vv10_benchmark.cxx @@ -0,0 +1,369 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#define EIGEN_DONT_VECTORIZE +#define EIGEN_NO_CUDA +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +using matrix_type = Eigen::MatrixXd; + +struct Options { + std::vector files; + int repeats = 5; + int warmup = 1; + std::size_t batch_size = 512; + double basis_tol = 1e-10; + std::string grid = "GM3"; + std::string pruning = "UNPRUNED"; + std::string rad_quad = "MURAKNOWLES"; + std::string exec_space = "Host"; + std::string nlc_math_mode = "FP64"; + std::string quantity = "VNLC"; + std::string gradient_mode = "FULL"; + bool force_rks = false; + std::string integrator_kernel = "Default"; + std::string lwd_kernel = "Default"; + std::string reduction_kernel = "Default"; +}; + +void uppercase(std::string& value) { + std::transform(value.begin(), value.end(), value.begin(), ::toupper); +} + +void print_usage(const char* exe) { + std::cout << "Usage: " << exe << " [options] file1.h5 [file2.h5 ...]\n" + << "Options:\n" + << " --repeats N Timed repetitions (default: 5)\n" + << " --warmup N Untimed warmup repetitions (default: 1)\n" + << " --batch-size N Grid batch size (default: 512)\n" + << " --basis-tol X Shell screening tolerance (default: 1e-10)\n" + << " --grid NAME FINE, ULTRAFINE, SUPERFINE, GM3, GM5 (default: GM3)\n" + << " --pruning NAME UNPRUNED, ROBUST, TREUTLER (default: UNPRUNED)\n" + << " --rad-quad NAME MK/MURAKNOWLES, BECKE, TA, MHL (default: MURAKNOWLES)\n" + << " --exec-space NAME Host or Device (default: Host)\n" + << " --nlc-math-mode NAME FP64 or FLOATPAIR for exact device VV10 (default: FP64)\n" + << " --quantity NAME VNLC, GRAD, or FNLC (default: VNLC)\n" + << " --gradient-mode NAME FULL or HF for --quantity GRAD (default: FULL)\n" + << " --force-rks Use scalar density as RKS even if spin datasets are present\n" + << " --integrator-kernel NAME Integrator kernel (default: Default)\n" + << " --lwd-kernel NAME Local work driver kernel (default: Default)\n" + << " --reduction-kernel NAME Reduction driver kernel (default: Default)\n"; +} + +Options parse_options(int argc, char** argv) { + Options options; + for(int i = 1; i < argc; ++i) { + const std::string arg = argv[i]; + auto require_value = [&](const char* name) -> std::string { + if(i + 1 >= argc) throw std::runtime_error(std::string("Missing value for ") + name); + return argv[++i]; + }; + + if(arg == "--help" or arg == "-h") { + print_usage(argv[0]); + std::exit(0); + } else if(arg == "--repeats") { + options.repeats = std::stoi(require_value("--repeats")); + } else if(arg == "--warmup") { + options.warmup = std::stoi(require_value("--warmup")); + } else if(arg == "--batch-size") { + options.batch_size = static_cast(std::stoull(require_value("--batch-size"))); + } else if(arg == "--basis-tol") { + options.basis_tol = std::stod(require_value("--basis-tol")); + } else if(arg == "--grid") { + options.grid = require_value("--grid"); + } else if(arg == "--pruning") { + options.pruning = require_value("--pruning"); + } else if(arg == "--rad-quad") { + options.rad_quad = require_value("--rad-quad"); + } else if(arg == "--exec-space") { + options.exec_space = require_value("--exec-space"); + } else if(arg == "--nlc-math-mode") { + options.nlc_math_mode = require_value("--nlc-math-mode"); + } else if(arg == "--quantity") { + options.quantity = require_value("--quantity"); + } else if(arg == "--gradient-mode") { + options.gradient_mode = require_value("--gradient-mode"); + } else if(arg == "--force-rks") { + options.force_rks = true; + } else if(arg == "--integrator-kernel") { + options.integrator_kernel = require_value("--integrator-kernel"); + } else if(arg == "--lwd-kernel") { + options.lwd_kernel = require_value("--lwd-kernel"); + } else if(arg == "--reduction-kernel") { + options.reduction_kernel = require_value("--reduction-kernel"); + } else { + options.files.push_back(arg); + } + } + + uppercase(options.grid); + uppercase(options.pruning); + uppercase(options.rad_quad); + uppercase(options.nlc_math_mode); + uppercase(options.quantity); + uppercase(options.gradient_mode); + if(options.files.empty()) throw std::runtime_error("At least one HDF5 benchmark file is required"); + if(options.repeats <= 0) throw std::runtime_error("--repeats must be positive"); + if(options.warmup < 0) throw std::runtime_error("--warmup must be non-negative"); + if(options.quantity != "VNLC" and options.quantity != "GRAD" and options.quantity != "FNLC") { + throw std::runtime_error("Unknown --quantity: " + options.quantity); + } + if(options.nlc_math_mode != "FP64" and options.nlc_math_mode != "FLOATPAIR") { + throw std::runtime_error("Unknown --nlc-math-mode: " + options.nlc_math_mode); + } + if(options.gradient_mode != "FULL" and options.gradient_mode != "HF") { + throw std::runtime_error("Unknown --gradient-mode: " + options.gradient_mode); + } + return options; +} + +matrix_type read_matrix(HighFive::File& file, const std::string& name) { + auto dset = file.getDataSet(name); + auto dims = dset.getDimensions(); + if(dims.size() != 2 or dims[0] != dims[1]) { + throw std::runtime_error("Expected square matrix dataset: " + name); + } + matrix_type matrix(dims[0], dims[1]); + dset.read(matrix.data()); + return matrix; +} + +double mpi_max_seconds(double local_seconds) { +#ifdef GAUXC_HAS_MPI + double global_seconds = 0.0; + MPI_Reduce(&local_seconds, &global_seconds, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + return global_seconds; +#else + return local_seconds; +#endif +} + +template +double time_call(F&& func) { +#ifdef GAUXC_HAS_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + const auto start = std::chrono::high_resolution_clock::now(); + func(); +#ifdef GAUXC_HAS_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + const auto stop = std::chrono::high_resolution_clock::now(); + return std::chrono::duration(stop - start).count(); +} + +void print_result(int rank, const std::string& file, const std::string& mode, + const std::string& quantity, + int natoms, int nbf, const std::vector& seconds, + double value, double norm) { + if(rank != 0) return; + auto sorted = seconds; + std::sort(sorted.begin(), sorted.end()); + const auto sum = std::accumulate(sorted.begin(), sorted.end(), 0.0); + const auto min = sorted.front(); + const auto median = sorted[sorted.size() / 2]; + const auto avg = sum / static_cast(sorted.size()); + std::cout << std::left << std::setw(28) << file + << std::right << std::setw(8) << mode + << std::setw(10) << quantity + << std::setw(8) << natoms + << std::setw(8) << nbf + << std::setw(22) << min + << std::setw(22) << median + << std::setw(22) << avg + << std::setw(24) << value + << std::setw(24) << norm + << '\n'; +} + +template +void run_benchmarks(const Options& options, RuntimeType& runtime) { + using namespace GauXC; + using namespace ExchCXX; + + const int rank = runtime.comm_rank(); + + std::map grid_map = { + {"FINE", AtomicGridSizeDefault::FineGrid}, + {"ULTRAFINE", AtomicGridSizeDefault::UltraFineGrid}, + {"SUPERFINE", AtomicGridSizeDefault::SuperFineGrid}, + {"GM3", AtomicGridSizeDefault::GM3}, + {"GM5", AtomicGridSizeDefault::GM5} + }; + std::map pruning_map = { + {"UNPRUNED", PruningScheme::Unpruned}, + {"ROBUST", PruningScheme::Robust}, + {"TREUTLER", PruningScheme::Treutler} + }; + std::map rad_quad_map = { + {"BECKE", RadialQuad::Becke}, + {"MURAKNOWLES", RadialQuad::MuraKnowles}, + {"MK", RadialQuad::MuraKnowles}, + {"TREUTLERAHLRICHS", RadialQuad::TreutlerAhlrichs}, + {"TA", RadialQuad::TreutlerAhlrichs}, + {"MURRAYHANDYLAMING", RadialQuad::MurrayHandyLaming}, + {"MHL", RadialQuad::MurrayHandyLaming} + }; + + if(rank == 0) { + std::cout << std::scientific << std::setprecision(12); + std::cout << std::left << std::setw(28) << "file" + << std::right << std::setw(8) << "mode" + << std::setw(10) << "quantity" + << std::setw(8) << "natoms" + << std::setw(8) << "nbf" + << std::setw(22) << "min_s" + << std::setw(22) << "median_s" + << std::setw(22) << "avg_s" + << std::setw(24) << "value" + << std::setw(24) << "out_norm" + << '\n'; + } + + for(const auto& file_name : options.files) { + Molecule mol; + BasisSet basis; + read_hdf5_record(mol, file_name, "/MOLECULE"); + read_hdf5_record(basis, file_name, "/BASIS"); + for(auto& shell : basis) shell.set_shell_tolerance(options.basis_tol); + + HighFive::File file(file_name, HighFive::File::ReadOnly); + const bool has_spin_density = file.exist("/DENSITY_Z"); + const bool uks = has_spin_density and not options.force_rks; + const auto density_name = has_spin_density ? std::string("/DENSITY_SCALAR") : std::string("/DENSITY"); + auto P = read_matrix(file, density_name); + matrix_type Pz; + if(uks) Pz = read_matrix(file, "/DENSITY_Z"); + + const auto molgrid = MolGridFactory::create_default_molgrid( + mol, pruning_map.at(options.pruning), BatchSize(options.batch_size), + rad_quad_map.at(options.rad_quad), grid_map.at(options.grid)); + + LoadBalancerFactory lb_factory(ExecutionSpace::Host, "Default"); + auto load_balancer = lb_factory.get_shared_instance(runtime, mol, molgrid, basis); + + const auto exec_space = options.exec_space == "Device" ? ExecutionSpace::Device : ExecutionSpace::Host; + MolecularWeightsFactory mw_factory(exec_space, "Default", MolecularWeightsSettings{}); + auto molecular_weights = mw_factory.get_instance(); + molecular_weights.modify_weights(*load_balancer); + + auto functional = functional_type(Backend::builtin, Functional::BLYP, + uks ? Spin::Polarized : Spin::Unpolarized); + XCIntegratorFactory integrator_factory(exec_space, "Replicated", + options.integrator_kernel, options.lwd_kernel, options.reduction_kernel); + auto integrator = integrator_factory.get_instance(functional, load_balancer); + + IntegratorSettingsNLC settings; + settings.math_mode = options.nlc_math_mode == "FLOATPAIR" + ? NLCMathMode::FloatPair : NLCMathMode::NativeFP64; + settings.include_weight_derivatives = options.gradient_mode == "FULL"; + double value = 0.0; + double norm = 0.0; + + auto evaluate = [&]() { + if(options.quantity == "VNLC") { + if(uks) { + auto result = integrator.eval_nlc_vnlc(P, Pz, settings); + value = std::get<0>(result); + norm = std::get<1>(result).norm(); + } else { + auto result = integrator.eval_nlc_vnlc(P, settings); + value = std::get<0>(result); + norm = std::get<1>(result).norm(); + } + } else if(options.quantity == "GRAD") { + std::vector gradient; + if(uks) gradient = integrator.eval_nlc_grad(P, Pz, settings); + else gradient = integrator.eval_nlc_grad(P, settings); + value = 0.0; + norm = std::sqrt(std::inner_product(gradient.begin(), gradient.end(), gradient.begin(), 0.0)); + } else { + if(uks) { + auto result = integrator.eval_nlc_fnlc_contraction(P, Pz, P, Pz, settings); + value = 0.0; + norm = std::get<0>(result).norm() + std::get<1>(result).norm(); + } else { + auto result = integrator.eval_nlc_fnlc_contraction(P, P, settings); + value = 0.0; + norm = result.norm(); + } + } + }; + + for(int i = 0; i < options.warmup; ++i) { + evaluate(); + } + + std::vector seconds; + seconds.reserve(options.repeats); + for(int i = 0; i < options.repeats; ++i) { + const auto local_seconds = time_call(evaluate); + seconds.push_back(mpi_max_seconds(local_seconds)); + } + + print_result(rank, file_name, uks ? "UKS" : "RKS", options.quantity, + mol.natoms(), basis.nbf(), seconds, value, norm); + } +} + +} // namespace + +int main(int argc, char** argv) { +#ifdef GAUXC_HAS_MPI + MPI_Init(&argc, &argv); +#endif + + int status = 0; + try { + const auto options = parse_options(argc, argv); +#ifdef GAUXC_HAS_DEVICE + if(options.exec_space == "Device") { + auto runtime = GauXC::DeviceRuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD,) 0.9); + run_benchmarks(options, runtime); + } else +#endif + { + auto runtime = GauXC::RuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD)); + run_benchmarks(options, runtime); + } + } catch(const std::exception& ex) { + std::cerr << "vv10_benchmark error: " << ex.what() << std::endl; + status = 1; + } + +#ifdef GAUXC_HAS_MPI + MPI_Finalize(); +#endif + return status; +} \ No newline at end of file diff --git a/tests/vv10_nlc_test.cxx b/tests/vv10_nlc_test.cxx new file mode 100644 index 000000000..31e794a6c --- /dev/null +++ b/tests/vv10_nlc_test.cxx @@ -0,0 +1,703 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#include "ut_common.hpp" +#include "../src/xc_integrator/replicated/host/vv10_nlc.hpp" +#include "standards.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace { + +std::vector read_double_dataset(HighFive::File& file, const std::string& name) { + auto dataset = file.getDataSet(name); + auto dims = dataset.getDimensions(); + size_t size = 1; + for( auto dim : dims ) size *= dim; + std::vector values(size); + if( dims.size() == 1 ) { + dataset.read(values); + } else if( dims.size() == 2 ) { + std::vector> matrix; + dataset.read(matrix); + size_t index = 0; + for( const auto& row : matrix ) { + for( const auto value : row ) values[index++] = value; + } + } else { + FAIL("Unsupported VV10 reference dataset rank"); + } + return values; +} + +std::vector read_uint64_dataset(HighFive::File& file, const std::string& name) { + auto dataset = file.getDataSet(name); + auto dims = dataset.getDimensions(); + size_t size = 1; + for( auto dim : dims ) size *= dim; + std::vector values(size); + dataset.read(values); + return values; +} + +} + +TEST_CASE( "VV10 NLC helper matches PySCF reference", "[vv10][nlc]" ) { + GauXC::IntegratorSettingsNLC settings; + settings.vv10_b = 6.3; + settings.vv10_c = 0.0093; + + constexpr std::size_t npts = 4; + const std::array coords = { + 0.00, 0.00, 0.00, + 0.37, -0.22, 0.51, + -0.44, 0.18, -0.31, + 0.11, 0.42, -0.26 + }; + const std::array weights = { 0.31, 0.27, 0.23, 0.19 }; + const std::array rho = { 0.42, 0.37, 0.28, 0.51 }; + const std::array gamma = { + 0.030*0.030 + (-0.018)*(-0.018) + 0.022*0.022, + (-0.020)*(-0.020) + 0.011*0.011 + 0.017*0.017, + 0.015*0.015 + 0.019*0.019 + (-0.012)*(-0.012), + 0.025*0.025 + (-0.014)*(-0.014) + 0.020*0.020 + }; + + std::array eps = { 0.0, 0.0, 0.0, 0.0 }; + std::array vrho = { 0.0, 0.0, 0.0, 0.0 }; + std::array vgamma = { 0.0, 0.0, 0.0, 0.0 }; + + GauXC::detail::vv10::eval_exc_vxc( + settings, + GauXC::detail::vv10::GridView{ npts, coords.data(), weights.data(), rho.data(), gamma.data() }, + GauXC::detail::vv10::CorrectionsView{ eps.data(), vrho.data(), vgamma.data() } + ); + + const std::array ref_eps = { + 0.0044608986634366, + 0.00446368137730377, + 0.00445955256959738, + 0.00446493156065464 + }; + const std::array ref_vrho = { + 0.00443982739412034, + 0.00444498672971984, + 0.00443833501920801, + 0.00444652164147545 + }; + const std::array ref_vgamma = { + 3.1648948742458134e-10, + 5.5585365145093665e-10, + 1.3162234665769539e-09, + 1.6830142122261283e-10 + }; + + for( std::size_t i = 0; i < npts; ++i ) { + CHECK( eps[i] == Approx(ref_eps[i]).margin(1e-14) ); + CHECK( vrho[i] == Approx(ref_vrho[i]).margin(1e-14) ); + CHECK( vgamma[i] == Approx(ref_vgamma[i]).margin(1e-18) ); + } +} + +TEST_CASE( "VV10 NLC helper matches PySCF HDF5 reference fixture", "[vv10][nlc]" ) { + HighFive::File file( GAUXC_REF_DATA_PATH "/vv10_pyscf_reference.hdf5", HighFive::File::ReadOnly ); + + const auto params = read_double_dataset(file, "params"); + const auto coords = read_double_dataset(file, "coords"); + const auto weights = read_double_dataset(file, "weights"); + const auto rho = read_double_dataset(file, "rho"); + const auto grad = read_double_dataset(file, "grad"); + const auto gamma = read_double_dataset(file, "gamma"); + const auto rho_t = read_double_dataset(file, "rho_t"); + const auto grad_t = read_double_dataset(file, "grad_t"); + const auto parents = read_uint64_dataset(file, "parents"); + const auto ref_eps = read_double_dataset(file, "eps"); + const auto ref_vrho = read_double_dataset(file, "vrho"); + const auto ref_vgamma = read_double_dataset(file, "vgamma"); + const auto ref_response_A = read_double_dataset(file, "response_A"); + const auto ref_response_B = read_double_dataset(file, "response_B"); + const auto ref_grid_gradient = read_double_dataset(file, "grid_gradient"); + const auto ref_grid_gradient_excluding_same_parent = read_double_dataset(file, "grid_gradient_excluding_same_parent"); + + const auto npts = rho.size(); + REQUIRE( params.size() == 2 ); + REQUIRE( coords.size() == 3 * npts ); + REQUIRE( weights.size() == npts ); + REQUIRE( grad.size() == 3 * npts ); + REQUIRE( grad_t.size() == 3 * npts ); + + GauXC::IntegratorSettingsNLC settings; + settings.vv10_b = params[0]; + settings.vv10_c = params[1]; + + std::vector eps(npts, 0.0), vrho(npts, 0.0), vgamma(npts, 0.0); + GauXC::detail::vv10::eval_exc_vxc( + settings, + GauXC::detail::vv10::GridView{ npts, coords.data(), weights.data(), rho.data(), gamma.data() }, + GauXC::detail::vv10::CorrectionsView{ eps.data(), vrho.data(), vgamma.data() } + ); + + std::vector grad_x(npts), grad_y(npts), grad_z(npts); + std::vector grad_t_x(npts), grad_t_y(npts), grad_t_z(npts); + for( size_t i = 0; i < npts; ++i ) { + grad_x[i] = grad[3*i]; + grad_y[i] = grad[3*i+1]; + grad_z[i] = grad[3*i+2]; + grad_t_x[i] = grad_t[3*i]; + grad_t_y[i] = grad_t[3*i+1]; + grad_t_z[i] = grad_t[3*i+2]; + } + + std::vector response_A(npts, 0.0), response_B(3*npts, 0.0); + GauXC::detail::vv10::eval_fxc_rks( + settings, + GauXC::detail::vv10::ResponseGridView{ + npts, coords.data(), weights.data(), rho.data(), gamma.data(), grad_x.data(), grad_y.data(), grad_z.data() + }, + GauXC::detail::vv10::TrialView{ rho_t.data(), grad_t_x.data(), grad_t_y.data(), grad_t_z.data() }, + GauXC::detail::vv10::ResponseCorrectionsView{ response_A.data(), response_B.data() } + ); + + std::vector grid_grad_x(npts, 0.0), grid_grad_y(npts, 0.0), grid_grad_z(npts, 0.0); + GauXC::detail::vv10::eval_grid_gradient( + settings, + GauXC::detail::vv10::GridView{ npts, coords.data(), weights.data(), rho.data(), gamma.data() }, + GauXC::detail::vv10::GridGradientView{ grid_grad_x.data(), grid_grad_y.data(), grid_grad_z.data() } + ); + + std::vector excluded_grad_x(npts, 0.0), excluded_grad_y(npts, 0.0), excluded_grad_z(npts, 0.0); + GauXC::detail::vv10::eval_grid_gradient_excluding_same_parent( + settings, + GauXC::detail::vv10::GridView{ npts, coords.data(), weights.data(), rho.data(), gamma.data() }, + parents, + GauXC::detail::vv10::GridGradientView{ excluded_grad_x.data(), excluded_grad_y.data(), excluded_grad_z.data() } + ); + + for( size_t i = 0; i < npts; ++i ) { + CHECK( eps[i] == Approx(ref_eps[i]).margin(1e-14) ); + CHECK( vrho[i] == Approx(ref_vrho[i]).margin(1e-14) ); + CHECK( vgamma[i] == Approx(ref_vgamma[i]).margin(1e-18) ); + CHECK( response_A[i] == Approx(ref_response_A[i]).margin(1e-18) ); + CHECK( grid_grad_x[i] == Approx(ref_grid_gradient[3*i]).margin(1e-18) ); + CHECK( grid_grad_y[i] == Approx(ref_grid_gradient[3*i+1]).margin(1e-18) ); + CHECK( grid_grad_z[i] == Approx(ref_grid_gradient[3*i+2]).margin(1e-18) ); + CHECK( excluded_grad_x[i] == Approx(ref_grid_gradient_excluding_same_parent[3*i]).margin(1e-18) ); + CHECK( excluded_grad_y[i] == Approx(ref_grid_gradient_excluding_same_parent[3*i+1]).margin(1e-18) ); + CHECK( excluded_grad_z[i] == Approx(ref_grid_gradient_excluding_same_parent[3*i+2]).margin(1e-18) ); + } + for( size_t i = 0; i < 3*npts; ++i ) { + CHECK( response_B[i] == Approx(ref_response_B[i]).margin(1e-21) ); + } +} + +TEST_CASE( "VV10 NLC UKS channel mapping", "[vv10][nlc]" ) { + std::array eps = { 1.0, 2.0 }; + std::array vrho = { 0.0, 0.0, 10.0, 20.0 }; + std::array vgamma = { 0.0, 0.0, 0.0, 30.0, 40.0, 50.0 }; + + GauXC::detail::vv10::add_uks_correction( 1, 0.25, 0.5, 0.75, + eps.data(), vrho.data(), vgamma.data() ); + + CHECK( eps[0] == Approx(1.0) ); + CHECK( eps[1] == Approx(2.25) ); + CHECK( vrho[0] == Approx(0.0) ); + CHECK( vrho[1] == Approx(0.0) ); + CHECK( vrho[2] == Approx(10.5) ); + CHECK( vrho[3] == Approx(20.5) ); + CHECK( vgamma[0] == Approx(0.0) ); + CHECK( vgamma[1] == Approx(0.0) ); + CHECK( vgamma[2] == Approx(0.0) ); + CHECK( vgamma[3] == Approx(30.75) ); + CHECK( vgamma[4] == Approx(41.5) ); + CHECK( vgamma[5] == Approx(50.75) ); +} + +TEST_CASE( "VV10 NLC RKS response matches PySCF libdft reference", "[vv10][nlc]" ) { + GauXC::IntegratorSettingsNLC settings; + settings.vv10_b = 6.3; + settings.vv10_c = 0.0093; + + constexpr std::size_t npts = 4; + const std::array coords = { + 0.00, 0.00, 0.00, + 0.37, -0.22, 0.51, + -0.44, 0.18, -0.31, + 0.11, 0.42, -0.26 + }; + const std::array weights = { 0.31, 0.27, 0.23, 0.19 }; + const std::array rho = { 0.42, 0.37, 0.28, 0.51 }; + const std::array grad_x = { 0.030, -0.020, 0.015, 0.025 }; + const std::array grad_y = { -0.018, 0.011, 0.019, -0.014 }; + const std::array grad_z = { 0.022, 0.017, -0.012, 0.020 }; + const std::array gamma = { + 0.030*0.030 + (-0.018)*(-0.018) + 0.022*0.022, + (-0.020)*(-0.020) + 0.011*0.011 + 0.017*0.017, + 0.015*0.015 + 0.019*0.019 + (-0.012)*(-0.012), + 0.025*0.025 + (-0.014)*(-0.014) + 0.020*0.020 + }; + const std::array rho_t = { 0.013, -0.021, 0.017, 0.009 }; + const std::array grad_t_x = { 0.004, -0.003, 0.002, 0.006 }; + const std::array grad_t_y = { -0.002, 0.005, -0.004, 0.001 }; + const std::array grad_t_z = { 0.003, 0.002, 0.005, -0.003 }; + + std::array A = { 0.0, 0.0, 0.0, 0.0 }; + std::array B = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; + + GauXC::detail::vv10::eval_fxc_rks( + settings, + GauXC::detail::vv10::ResponseGridView{ + npts, coords.data(), weights.data(), rho.data(), gamma.data(), grad_x.data(), grad_y.data(), grad_z.data() + }, + GauXC::detail::vv10::TrialView{ rho_t.data(), grad_t_x.data(), grad_t_y.data(), grad_t_z.data() }, + GauXC::detail::vv10::ResponseCorrectionsView{ A.data(), B.data() } + ); + + const std::array ref_A = { + -3.5703422236383133e-08, + -1.1793774790700641e-06, + 2.9325704002469295e-07, + -3.5681576663273375e-07 + }; + const std::array ref_B = { + 5.0726028793207675e-12, + -2.7903701376527950e-12, + 3.7621073764918405e-12, + -1.7056375465935961e-11, + 1.3105225970986051e-11, + 1.3886480129449537e-11, + -1.6237863203856369e-11, + -3.7766613354823595e-11, + 3.0364440321900880e-11, + 2.3991229362623262e-12, + 1.2407954875428006e-13, + -7.0620382206289707e-13 + }; + + for( std::size_t i = 0; i < npts; ++i ) { + CHECK( A[i] == Approx(ref_A[i]).margin(1e-18) ); + } + for( std::size_t i = 0; i < 3*npts; ++i ) { + CHECK( B[i] == Approx(ref_B[i]).margin(1e-21) ); + } +} + +TEST_CASE( "VV10 NLC grid gradient matches PySCF reference", "[vv10][nlc]" ) { + GauXC::IntegratorSettingsNLC settings; + settings.vv10_b = 6.3; + settings.vv10_c = 0.0093; + + constexpr std::size_t npts = 4; + const std::array coords = { + 0.00, 0.00, 0.00, + 0.37, -0.22, 0.51, + -0.44, 0.18, -0.31, + 0.11, 0.42, -0.26 + }; + const std::array weights = { 0.31, 0.27, 0.23, 0.19 }; + const std::array rho = { 0.42, 0.37, 0.28, 0.51 }; + const std::array gamma = { + 0.030*0.030 + (-0.018)*(-0.018) + 0.022*0.022, + (-0.020)*(-0.020) + 0.011*0.011 + 0.017*0.017, + 0.015*0.015 + 0.019*0.019 + (-0.012)*(-0.012), + 0.025*0.025 + (-0.014)*(-0.014) + 0.020*0.020 + }; + std::array grad_x = { 0.0, 0.0, 0.0, 0.0 }; + std::array grad_y = { 0.0, 0.0, 0.0, 0.0 }; + std::array grad_z = { 0.0, 0.0, 0.0, 0.0 }; + + GauXC::detail::vv10::eval_grid_gradient( + settings, + GauXC::detail::vv10::GridView{ npts, coords.data(), weights.data(), rho.data(), gamma.data() }, + GauXC::detail::vv10::GridGradientView{ grad_x.data(), grad_y.data(), grad_z.data() } + ); + + const std::array ref_grad = { + -1.9307886424617078e-06, + -3.5876026873695561e-06, + -3.7865972833005390e-07, + 1.1869243697781221e-05, + -1.0719376926550508e-05, + 1.8240846117598208e-05, + -1.9673087097543844e-05, + 3.4494080789274471e-06, + -1.2190661056077449e-05, + 3.4323843645200651e-06, + 1.3579254329979196e-05, + -1.0194844773045410e-05 + }; + + for( std::size_t i = 0; i < npts; ++i ) { + CHECK( grad_x[i] == Approx(ref_grad[3*i]).margin(1e-18) ); + CHECK( grad_y[i] == Approx(ref_grad[3*i+1]).margin(1e-18) ); + CHECK( grad_z[i] == Approx(ref_grad[3*i+2]).margin(1e-18) ); + } +} + +TEST_CASE( "VV10 NLC grid gradient can exclude same-parent grid points", "[vv10][nlc]" ) { + GauXC::IntegratorSettingsNLC settings; + settings.vv10_b = 6.3; + settings.vv10_c = 0.0093; + + constexpr std::size_t npts = 4; + const std::array coords = { + 0.00, 0.00, 0.00, + 0.37, -0.22, 0.51, + -0.44, 0.18, -0.31, + 0.11, 0.42, -0.26 + }; + const std::array weights = { 0.31, 0.27, 0.23, 0.19 }; + const std::array rho = { 0.42, 0.37, 0.28, 0.51 }; + const std::array gamma = { + 0.030*0.030 + (-0.018)*(-0.018) + 0.022*0.022, + (-0.020)*(-0.020) + 0.011*0.011 + 0.017*0.017, + 0.015*0.015 + 0.019*0.019 + (-0.012)*(-0.012), + 0.025*0.025 + (-0.014)*(-0.014) + 0.020*0.020 + }; + std::array grad_x = { 0.0, 0.0, 0.0, 0.0 }; + std::array grad_y = { 0.0, 0.0, 0.0, 0.0 }; + std::array grad_z = { 0.0, 0.0, 0.0, 0.0 }; + const std::vector parent = { 0, 1, 1, 0 }; + + GauXC::detail::vv10::eval_grid_gradient_excluding_same_parent( + settings, + GauXC::detail::vv10::GridView{ npts, coords.data(), weights.data(), rho.data(), gamma.data() }, + parent, + GauXC::detail::vv10::GridGradientView{ grad_x.data(), grad_y.data(), grad_z.data() } + ); + + const std::array ref_grad = { + -7.1657342952056772e-07, + 1.0484917620420706e-06, + -3.2486229589182034e-06, + 7.5039909408315638e-06, + -8.5636965527482088e-06, + 1.3821701351303496e-05, + -1.2901522650039642e-05, + 1.0542563571549509e-07, + -5.3354970474929469e-06, + 1.8009001465124650e-06, + 7.3499509521319973e-06, + -6.3386093486638098e-06 + }; + + for( std::size_t i = 0; i < npts; ++i ) { + CHECK( grad_x[i] == Approx(ref_grad[3*i]).margin(1e-18) ); + CHECK( grad_y[i] == Approx(ref_grad[3*i+1]).margin(1e-18) ); + CHECK( grad_z[i] == Approx(ref_grad[3*i+2]).margin(1e-18) ); + } +} + +TEST_CASE( "VV10 packed-grid allgather handles variable local sizes", "[vv10][nlc]" ) { + auto rt = GauXC::RuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD)); + auto reduction_driver = GauXC::ReductionDriverFactory::get_shared_instance(rt, "Default"); + + const auto rank = reduction_driver->comm_rank(); + const auto size = reduction_driver->comm_size(); + std::vector local( 2 * static_cast(rank + 1) ); + for( std::size_t i = 0; i < local.size(); ++i ) { + local[i] = 100.0 * rank + static_cast(i); + } + + std::size_t local_point_offset = 0; + const auto gathered = GauXC::detail::vv10::allgather_packed_grid( + *reduction_driver, local, 2, local_point_offset ); + + std::size_t expected_points = 0; + for( int i = 0; i < size; ++i ) expected_points += static_cast(i + 1); + CHECK( gathered.size() == 2 * expected_points ); + + std::size_t expected_offset = 0; + for( int i = 0; i < rank; ++i ) expected_offset += static_cast(i + 1); + CHECK( local_point_offset == expected_offset ); + + std::size_t point_offset = 0; + for( int irank = 0; irank < size; ++irank ) { + for( int ipt = 0; ipt < irank + 1; ++ipt ) { + CHECK( gathered[2*(point_offset + ipt)] == Approx(100.0 * irank + 2.0 * ipt) ); + CHECK( gathered[2*(point_offset + ipt)+1] == Approx(100.0 * irank + 2.0 * ipt + 1.0) ); + } + point_offset += static_cast(irank + 1); + } +} + +TEST_CASE( "VV10 integrated host paths are finite and repeatable", "[vv10][integrated]" ) { + using matrix_type = Eigen::MatrixXd; + + auto rt = GauXC::RuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD)); + auto mol = GauXC::make_water(); + auto basis = GauXC::make_631Gd( mol, GauXC::SphericalType(true) ); + for( auto& shell : basis ) { + shell.set_shell_tolerance( std::numeric_limits::epsilon() ); + } + + auto molgrid = GauXC::MolGridFactory::create_default_molgrid( + mol, GauXC::PruningScheme::Unpruned, GauXC::BatchSize(128), + GauXC::RadialQuad::MuraKnowles, GauXC::AtomicGridSizeDefault::GM3 ); + + GauXC::LoadBalancerFactory lb_factory( GauXC::ExecutionSpace::Host, "Default" ); + auto load_balancer = lb_factory.get_instance( rt, mol, molgrid, basis ); + + GauXC::MolecularWeightsFactory mw_factory( + GauXC::ExecutionSpace::Host, "Default", GauXC::MolecularWeightsSettings{} ); + auto molecular_weights = mw_factory.get_instance(); + molecular_weights.modify_weights( load_balancer ); + + auto functional = GauXC::functional_type( + ExchCXX::Backend::builtin, ExchCXX::Functional::BLYP, ExchCXX::Spin::Unpolarized ); + GauXC::XCIntegratorFactory integrator_factory( + GauXC::ExecutionSpace::Host, "Replicated", "Reference", "Default", "Default" ); + auto integrator = integrator_factory.get_instance( functional, load_balancer ); + + const auto nbf = basis.nbf(); + matrix_type P = matrix_type::Zero( nbf, nbf ); + matrix_type tP = matrix_type::Zero( nbf, nbf ); + for( int i = 0; i < nbf; ++i ) { + P(i,i) = 0.05 + 0.002 * i; + tP(i,i) = 0.01 + 0.001 * i; + } + + GauXC::IntegratorSettingsNLC settings; + settings.include_weight_derivatives = true; + + auto [base_exc, base_vxc] = integrator.eval_exc_vxc( P ); + auto base_fxc = integrator.eval_fxc_contraction( P, tP ); + + auto [exc, vxc] = integrator.eval_exc_vxc( P, settings ); + auto [exc_repeat, vxc_repeat] = integrator.eval_exc_vxc( P, settings ); + CHECK( std::isfinite(exc) ); + CHECK( exc == Approx(base_exc).margin(1e-12) ); + CHECK( (vxc - base_vxc).norm() < 1e-10 ); + CHECK( exc_repeat == Approx(base_exc).margin(1e-12) ); + CHECK( (vxc - vxc_repeat).norm() < 1e-10 ); + CHECK( integrator.eval_exc( P, settings ) == Approx(base_exc).margin(1e-10) ); + + auto [enlc, vnlc] = integrator.eval_nlc_vnlc( P, settings ); + CHECK( std::isfinite(enlc) ); + CHECK( std::abs(enlc) > 1e-10 ); + CHECK( vnlc.allFinite() ); + CHECK( vnlc.norm() > 1e-10 ); + CHECK( integrator.eval_nlc( P, settings ) == Approx(enlc).margin(1e-10) ); + + auto exc_grad_full = integrator.eval_exc_grad( P, settings ); + auto nlc_exc_grad_full = integrator.eval_nlc_grad( P, settings ); + settings.include_weight_derivatives = false; + auto exc_grad_hf = integrator.eval_exc_grad( P, settings ); + auto nlc_exc_grad_hf = integrator.eval_nlc_grad( P, settings ); + REQUIRE( exc_grad_full.size() == 3 * mol.natoms() ); + REQUIRE( exc_grad_hf.size() == 3 * mol.natoms() ); + REQUIRE( nlc_exc_grad_full.size() == 3 * mol.natoms() ); + REQUIRE( nlc_exc_grad_hf.size() == 3 * mol.natoms() ); + for( auto value : exc_grad_full ) CHECK( std::isfinite(value) ); + for( auto value : exc_grad_hf ) CHECK( std::isfinite(value) ); + for( auto value : nlc_exc_grad_full ) CHECK( std::isfinite(value) ); + for( auto value : nlc_exc_grad_hf ) CHECK( std::isfinite(value) ); + CHECK( std::sqrt(std::inner_product(nlc_exc_grad_full.begin(), nlc_exc_grad_full.end(), nlc_exc_grad_full.begin(), 0.0)) > 1e-12 ); + CHECK( std::sqrt(std::inner_product(nlc_exc_grad_hf.begin(), nlc_exc_grad_hf.end(), nlc_exc_grad_hf.begin(), 0.0)) > 1e-12 ); + + double full_sum[3] = { 0.0, 0.0, 0.0 }; + for( size_t iatom = 0; iatom < mol.natoms(); ++iatom ) { + full_sum[0] += exc_grad_full[3*iatom+0]; + full_sum[1] += exc_grad_full[3*iatom+1]; + full_sum[2] += exc_grad_full[3*iatom+2]; + } + CHECK( std::abs(full_sum[0]) < 1e-8 ); + CHECK( std::abs(full_sum[1]) < 1e-8 ); + CHECK( std::abs(full_sum[2]) < 1e-8 ); + + double nlc_full_sum[3] = { 0.0, 0.0, 0.0 }; + for( size_t iatom = 0; iatom < mol.natoms(); ++iatom ) { + nlc_full_sum[0] += nlc_exc_grad_full[3*iatom+0]; + nlc_full_sum[1] += nlc_exc_grad_full[3*iatom+1]; + nlc_full_sum[2] += nlc_exc_grad_full[3*iatom+2]; + } + CHECK( std::abs(nlc_full_sum[0]) < 1e-8 ); + CHECK( std::abs(nlc_full_sum[1]) < 1e-8 ); + CHECK( std::abs(nlc_full_sum[2]) < 1e-8 ); + + auto fxc = integrator.eval_fxc_contraction( P, tP, settings ); + CHECK( fxc.allFinite() ); + CHECK( (fxc - fxc.transpose()).norm() / nbf < 1e-10 ); + CHECK( (fxc - base_fxc).norm() < 1e-10 ); + auto nlc_fxc = integrator.eval_nlc_fnlc_contraction( P, tP, settings ); + CHECK( nlc_fxc.allFinite() ); + CHECK( (nlc_fxc - nlc_fxc.transpose()).norm() / nbf < 1e-10 ); + CHECK( nlc_fxc.norm() > 1e-12 ); + + auto pbe0_functional = GauXC::functional_type( + ExchCXX::Backend::builtin, ExchCXX::Functional::PBE0, ExchCXX::Spin::Unpolarized ); + auto pbe0_integrator = integrator_factory.get_instance( pbe0_functional, load_balancer ); + auto pbe0_base_fxc = pbe0_integrator.eval_fxc_contraction( P, tP ); + auto pbe0_nlc_fxc = pbe0_integrator.eval_nlc_fnlc_contraction( P, tP, settings ); + CHECK( (base_fxc - pbe0_base_fxc).norm() > 1e-8 ); + CHECK( (nlc_fxc - pbe0_nlc_fxc).norm() / nbf < 1e-10 ); + + matrix_type Pz = matrix_type::Zero( nbf, nbf ); + matrix_type tPz = matrix_type::Zero( nbf, nbf ); + auto polarized_functional = GauXC::functional_type( + ExchCXX::Backend::builtin, ExchCXX::Functional::BLYP, ExchCXX::Spin::Polarized ); + auto uks_integrator = integrator_factory.get_instance( polarized_functional, load_balancer ); + auto [uks_base_exc, uks_base_vxc, uks_base_vxcz] = uks_integrator.eval_exc_vxc( P, Pz ); + auto [uks_exc, uks_vxc, uks_vxcz] = uks_integrator.eval_exc_vxc( P, Pz, settings ); + auto [uks_enlc, uks_vnlc, uks_vnlcz] = uks_integrator.eval_nlc_vnlc( P, Pz, settings ); + CHECK( std::isfinite(uks_exc) ); + CHECK( uks_exc == Approx(uks_base_exc).margin(1e-12) ); + CHECK( (uks_vxc - uks_base_vxc).norm() < 1e-10 ); + CHECK( (uks_vxcz - uks_base_vxcz).norm() < 1e-10 ); + CHECK( std::isfinite(uks_enlc) ); + CHECK( std::abs(uks_enlc) > 1e-10 ); + // A spin-unpolarized UKS density must reproduce the RKS NLC energy exactly, + // since VV10 depends only on the total density and its gradient. RKS scales + // the density matrix by 2 (closed-shell double occupancy) while UKS treats + // the scalar matrix as the full total density, so the equivalent UKS input + // is 2*P with zero spin density. + matrix_type P2 = 2.0 * P; + CHECK( uks_integrator.eval_nlc( P2, Pz, settings ) == Approx(enlc).margin(1e-10) ); + CHECK( uks_vnlc.allFinite() ); + CHECK( uks_vnlc.norm() > 1e-10 ); + CHECK( uks_vnlcz.norm() / nbf < 1e-8 ); + CHECK( uks_integrator.eval_nlc( P, Pz, settings ) == Approx(uks_enlc).margin(1e-10) ); + CHECK( uks_vxc.allFinite() ); + CHECK( uks_vxcz.allFinite() ); + CHECK( uks_vxcz.norm() / nbf < 1e-8 ); + auto uks_nlc_grad = uks_integrator.eval_nlc_grad( P, Pz, settings ); + REQUIRE( uks_nlc_grad.size() == 3 * mol.natoms() ); + for( auto value : uks_nlc_grad ) CHECK( std::isfinite(value) ); + CHECK( std::sqrt(std::inner_product(uks_nlc_grad.begin(), uks_nlc_grad.end(), uks_nlc_grad.begin(), 0.0)) > 1e-12 ); + auto [uks_base_fxc, uks_base_fxcz] = uks_integrator.eval_fxc_contraction( P, Pz, tP, tPz ); + auto [uks_fxc, uks_fxcz] = uks_integrator.eval_fxc_contraction( P, Pz, tP, tPz, settings ); + auto [uks_nlc_fxc, uks_nlc_fxcz] = uks_integrator.eval_nlc_fnlc_contraction( P, Pz, tP, tPz, settings ); + CHECK( uks_fxc.allFinite() ); + CHECK( uks_fxcz.allFinite() ); + CHECK( (uks_fxc - uks_base_fxc).norm() < 1e-10 ); + CHECK( (uks_fxcz - uks_base_fxcz).norm() < 1e-10 ); + CHECK( uks_fxcz.norm() / nbf < 1e-8 ); + CHECK( uks_nlc_fxc.allFinite() ); + CHECK( uks_nlc_fxc.norm() > 1e-12 ); + CHECK( uks_nlc_fxcz.norm() / nbf < 1e-8 ); + + auto pbe0_polarized_functional = GauXC::functional_type( + ExchCXX::Backend::builtin, ExchCXX::Functional::PBE0, ExchCXX::Spin::Polarized ); + auto pbe0_uks_integrator = integrator_factory.get_instance( pbe0_polarized_functional, load_balancer ); + auto [pbe0_uks_base_fxc, pbe0_uks_base_fxcz] = pbe0_uks_integrator.eval_fxc_contraction( P, Pz, tP, tPz ); + auto [pbe0_uks_nlc_fxc, pbe0_uks_nlc_fxcz] = pbe0_uks_integrator.eval_nlc_fnlc_contraction( P, Pz, tP, tPz, settings ); + CHECK( (uks_base_fxc - pbe0_uks_base_fxc).norm() > 1e-8 ); + CHECK( (uks_base_fxcz - pbe0_uks_base_fxcz).norm() < 1e-10 ); + CHECK( (uks_nlc_fxc - pbe0_uks_nlc_fxc).norm() / nbf < 1e-10 ); + CHECK( (uks_nlc_fxcz - pbe0_uks_nlc_fxcz).norm() / nbf < 1e-10 ); +} + +#ifdef GAUXC_HAS_DEVICE +TEST_CASE( "VV10 integrated device RKS EXC/VXC matches host", "[vv10][integrated][cuda]" ) { + using matrix_type = Eigen::MatrixXd; + + auto host_rt = GauXC::RuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD)); + auto device_rt = GauXC::DeviceRuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD,) 0.5); + auto mol = GauXC::make_water(); + auto basis = GauXC::make_631Gd( mol, GauXC::SphericalType(true) ); + for( auto& shell : basis ) { + shell.set_shell_tolerance( std::numeric_limits::epsilon() ); + } + + auto molgrid = GauXC::MolGridFactory::create_default_molgrid( + mol, GauXC::PruningScheme::Unpruned, GauXC::BatchSize(128), + GauXC::RadialQuad::MuraKnowles, GauXC::AtomicGridSizeDefault::GM3 ); + + GauXC::LoadBalancerFactory lb_factory( GauXC::ExecutionSpace::Host, "Default" ); + auto host_lb = lb_factory.get_instance( host_rt, mol, molgrid, basis ); + auto device_lb = lb_factory.get_instance( device_rt, mol, molgrid, basis ); + + GauXC::MolecularWeightsFactory host_mw_factory( + GauXC::ExecutionSpace::Host, "Default", GauXC::MolecularWeightsSettings{} ); + auto host_mw = host_mw_factory.get_instance(); + host_mw.modify_weights( host_lb ); + + GauXC::MolecularWeightsFactory device_mw_factory( + GauXC::ExecutionSpace::Device, "Default", GauXC::MolecularWeightsSettings{} ); + auto device_mw = device_mw_factory.get_instance(); + device_mw.modify_weights( device_lb ); + + auto functional = GauXC::functional_type( + ExchCXX::Backend::builtin, ExchCXX::Functional::BLYP, ExchCXX::Spin::Unpolarized ); + GauXC::XCIntegratorFactory host_integrator_factory( + GauXC::ExecutionSpace::Host, "Replicated", "Reference", "Default", "Default" ); + auto host_integrator = host_integrator_factory.get_instance( functional, host_lb ); + GauXC::XCIntegratorFactory device_integrator_factory( + GauXC::ExecutionSpace::Device, "Replicated", "Default", "Default", "Default" ); + auto device_integrator = device_integrator_factory.get_instance( functional, device_lb ); + + const auto nbf = basis.nbf(); + matrix_type P = matrix_type::Zero( nbf, nbf ); + matrix_type tP = matrix_type::Zero( nbf, nbf ); + for( int i = 0; i < nbf; ++i ) { + P(i,i) = 0.05 + 0.002 * i; + tP(i,i) = 0.01 + 0.001 * i; + } + + GauXC::IntegratorSettingsNLC settings; + auto [host_enlc, host_vnlc] = host_integrator.eval_nlc_vnlc( P, settings ); + auto [device_enlc, device_vnlc] = device_integrator.eval_nlc_vnlc( P, settings ); + + CHECK( device_enlc == Approx(host_enlc).margin(1e-8) ); + CHECK( (device_vnlc - host_vnlc).norm() / nbf < 1e-7 ); + CHECK( (device_vnlc - device_vnlc.transpose()).norm() / nbf < 1e-10 ); + CHECK( device_integrator.eval_nlc( P, settings ) == Approx(host_enlc).margin(1e-8) ); + matrix_type Pz = matrix_type::Zero( nbf, nbf ); + matrix_type tPz = matrix_type::Zero( nbf, nbf ); + auto polarized_functional = GauXC::functional_type( + ExchCXX::Backend::builtin, ExchCXX::Functional::BLYP, ExchCXX::Spin::Polarized ); + auto polarized_host_integrator = host_integrator_factory.get_instance( polarized_functional, host_lb ); + auto polarized_device_integrator = device_integrator_factory.get_instance( polarized_functional, device_lb ); + auto [host_uks_enlc, host_uks_vnlc, host_uks_vnlcz] = polarized_host_integrator.eval_nlc_vnlc( P, Pz, settings ); + auto [device_uks_enlc, device_uks_vnlc, device_uks_vnlcz] = polarized_device_integrator.eval_nlc_vnlc( P, Pz, settings ); + CHECK( device_uks_enlc == Approx(host_uks_enlc).margin(1e-8) ); + CHECK( (device_uks_vnlc - host_uks_vnlc).norm() / nbf < 1e-7 ); + CHECK( device_uks_vnlcz.norm() / nbf < 1e-10 ); + CHECK( polarized_device_integrator.eval_nlc( P, Pz, settings ) == Approx(host_uks_enlc).margin(1e-8) ); + auto host_nlc_fxc = host_integrator.eval_nlc_fnlc_contraction( P, tP, settings ); + auto device_nlc_fxc = device_integrator.eval_nlc_fnlc_contraction( P, tP, settings ); + CHECK( device_nlc_fxc.allFinite() ); + CHECK( (device_nlc_fxc - host_nlc_fxc).norm() / nbf < 1e-7 ); + CHECK( (device_nlc_fxc - device_nlc_fxc.transpose()).norm() / nbf < 1e-10 ); + auto hf_settings = settings; + hf_settings.include_weight_derivatives = false; + auto host_nlc_grad = host_integrator.eval_nlc_grad( P, hf_settings ); + auto device_nlc_grad = device_integrator.eval_nlc_grad( P, hf_settings ); + REQUIRE( device_nlc_grad.size() == host_nlc_grad.size() ); + double grad_diff = 0.0; + for( size_t i = 0; i < host_nlc_grad.size(); ++i ) { + grad_diff += (device_nlc_grad[i] - host_nlc_grad[i]) * (device_nlc_grad[i] - host_nlc_grad[i]); + } + CHECK( std::sqrt(grad_diff) / mol.natoms() < 1e-7 ); + host_nlc_grad = host_integrator.eval_nlc_grad( P, settings ); + device_nlc_grad = device_integrator.eval_nlc_grad( P, settings ); + grad_diff = 0.0; + for( size_t i = 0; i < host_nlc_grad.size(); ++i ) { + grad_diff += (device_nlc_grad[i] - host_nlc_grad[i]) * (device_nlc_grad[i] - host_nlc_grad[i]); + } + CHECK( std::sqrt(grad_diff) / mol.natoms() < 1e-7 ); + CHECK_THROWS_AS( polarized_device_integrator.eval_nlc_fnlc_contraction( P, Pz, tP, tPz, settings ), GauXC::generic_gauxc_exception ); +} +#endif diff --git a/tests/vv10_test.cxx b/tests/vv10_test.cxx new file mode 100644 index 000000000..e11cca9f5 --- /dev/null +++ b/tests/vv10_test.cxx @@ -0,0 +1,338 @@ +#include + +#include +#include +#include +#include "integrator_util/vv10.hpp" + +#include +#include + +#ifdef GAUXC_HAS_CUDA +#include "device/cuda/kernels/vv10.hpp" +#include "device_specific/cuda_util.hpp" +#include "../src/xc_integrator/replicated/device/vv10_nlc_device.hpp" +#endif + +namespace { + +constexpr std::int64_t npts = 5; +constexpr std::int64_t ntrial = 2; +constexpr std::int64_t natoms = 2; + +const std::vector coords = { + 0.0, 0.1, -0.2, + 0.4, -0.3, 0.2, + -0.5, 0.2, 0.3, + 0.7, 0.6, -0.4, + -0.2, -0.7, 0.5 +}; + +const std::vector weights = {0.22, 0.17, 0.31, 0.11, 0.19}; +const std::vector rho = {0.72, 0.44, 0.91, 0.36, 0.58}; +const std::vector density_gradient_norm = {0.0089, 0.0078, 0.0158, 0.0101, 0.0125}; +const std::vector grid_atoms = {0, 0, 1, 1, 0}; +const std::vector rho_t = {0.03, -0.02, 0.04, 0.01, -0.05, -0.01, 0.06, -0.03, 0.02, 0.04}; +const std::vector gamma_t = {0.002, -0.001, 0.003, -0.002, 0.001, -0.004, 0.002, 0.001, -0.003, 0.005}; + +template +void check_vector(const std::vector& actual, const std::vector& ref, double margin = 1e-13) { + REQUIRE(actual.size() == ref.size()); + for(std::size_t i = 0; i < actual.size(); ++i) { + CHECK(actual[i] == Approx(ref[i]).margin(margin)); + } +} + +std::vector rho_weight() { + std::vector out(npts); + for(std::int64_t i = 0; i < npts; ++i) out[i] = rho[i] * weights[i]; + return out; +} + +} + +TEST_CASE("VV10 host EXC/VXC formula matches PySCF reference", "[vv10][nlc]") { + const GauXC::detail::vv10::Parameters params{6.3, 0.0093, 1e-8}; + std::vector eps(npts), vrho(npts), vgamma(npts); + + GauXC::detail::vv10::eval_exc_vxc(npts, coords.data(), weights.data(), + rho.data(), density_gradient_norm.data(), params, eps.data(), vrho.data(), vgamma.data()); + + check_vector(eps, { + 0.00445301813026348, 0.00444920198423763, 0.00445616308451667, + 0.00445820144472496, 0.00445560260974276 + }); + check_vector(vrho, { + 0.00442962367367506, 0.00442417961353026, 0.00443444131349190, + 0.00443937740576438, 0.00443445791849522 + }); + check_vector(vgamma, { + 5.4703231981970949e-10, 4.3982859960426448e-09, 3.7195930885603543e-10, + 2.0026596148858508e-08, 2.6546741400669848e-09 + }, 1e-17); +} + +TEST_CASE("VV10 variable-size host allgather support", "[vv10][nlc][mpi]") { + const auto rt = GauXC::RuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD)); + auto reduction = GauXC::ReductionDriverFactory::get_instance(rt, "BASICMPI"); + + const auto rank = rt.comm_rank(); + const auto world_size = rt.comm_size(); + std::vector local(rank + 1); + for(int i = 0; i <= rank; ++i) local[i] = 100.0 * rank + i; + + auto gathered = reduction.allgather_v(local.data(), local.size()); + std::vector reference; + for(int r = 0; r < world_size; ++r) { + for(int i = 0; i <= r; ++i) reference.push_back(100.0 * r + i); + } + + check_vector(gathered, reference); +} + +TEST_CASE("VV10 host gradient and response intermediates match references", "[vv10][nlc]") { + const GauXC::detail::vv10::Parameters params{6.3, 0.0093, 1e-8}; + std::vector omega(npts), kappa(npts), domega_drho(npts), domega_dgamma(npts); + std::vector d2omega_drho2(npts), d2omega_dgamma2(npts), d2omega_drho_dgamma(npts); + std::vector dkappa_drho(npts), d2kappa_drho2(npts); + std::vector U(npts), W(npts), A(npts), B(npts), C(npts), E(npts); + + GauXC::detail::vv10::eval_omega_kappa(npts, rho.data(), density_gradient_norm.data(), params, + omega.data(), kappa.data(), domega_drho.data(), domega_dgamma.data(), + d2omega_drho2.data(), d2omega_dgamma2.data(), d2omega_drho_dgamma.data(), + dkappa_drho.data(), d2kappa_drho2.data()); + + check_vector(omega, {1.7366438001493870, 1.3576018510928551, 1.9523837921688731, 1.2280150473242860, 1.5586889232762682}); + check_vector(kappa, {16.102934766228300, 14.834004175743555, 16.743895938069180, 14.346083941530297, 15.532962118860626}); + + GauXC::detail::vv10::eval_hessian_intermediates(npts, coords.data(), weights.data(), + rho.data(), omega.data(), kappa.data(), U.data(), W.data(), A.data(), B.data(), C.data(), E.data()); + + check_vector(U, {9.209072244410352e-06, 1.046141581842596e-05, 8.359163476842639e-06, 8.499921019698521e-06, 8.763966041169824e-06}); + check_vector(W, {4.2839882446393177e-06, 7.0119105202783746e-06, 3.7243009447981262e-06, 1.2215577641798923e-05, 6.9448310913784774e-06}); + check_vector(E, {-1.0365967086190920e-04, -1.1129196291360637e-04, -9.7369762355538814e-05, -9.3293041938961706e-05, -9.8490711903360671e-05}); + + std::vector f_rho_t(ntrial*npts), f_gamma_t(ntrial*npts); + GauXC::detail::vv10::eval_hessian_contraction(npts, ntrial, coords.data(), weights.data(), + rho.data(), omega.data(), kappa.data(), U.data(), W.data(), A.data(), B.data(), C.data(), + domega_drho.data(), domega_dgamma.data(), dkappa_drho.data(), d2omega_drho2.data(), + d2omega_dgamma2.data(), d2omega_drho_dgamma.data(), d2kappa_drho2.data(), rho_t.data(), + gamma_t.data(), f_rho_t.data(), f_gamma_t.data()); + + check_vector(f_rho_t, { + 1.6178037606933446e-07, -1.3249866968720368e-06, 1.1116302821324279e-07, + -7.5930158108261938e-08, -1.8068421774630219e-06, -1.0423414119308089e-06, + 1.8870904199228002e-06, -1.0109141134429277e-06, 3.5175453056695153e-07, + 3.5357534022654595e-07 + }); + check_vector(f_gamma_t, { + 2.8164183798520921e-11, 2.8193667944806545e-10, 2.2652562869246210e-13, + -6.1629930379245268e-09, 1.1756835860583153e-09, -2.0337406941186164e-10, + -1.2427741534208510e-09, 8.7638580360613370e-11, -1.0186539929376707e-08, + 3.4109976756300954e-10 + }, 1e-17); + + std::vector Egr(natoms*3*npts), Ugr(natoms*3*npts), Wgr(natoms*3*npts); + GauXC::detail::vv10::eval_grid_response(npts, natoms, coords.data(), weights.data(), + rho.data(), omega.data(), kappa.data(), grid_atoms.data(), Egr.data(), Ugr.data(), Wgr.data()); + + check_vector(Egr, { + 1.0310153822420043e-05, 1.8889529270293274e-05, 1.5112372095644556e-05, + -1.6415873674929635e-05, 5.1457932465049436e-06, -4.4092456971783141e-06, + -1.4045394092983390e-05, -1.2200427500773054e-05, -1.9148451277680992e-05, + -2.2241372703956841e-05, -1.2143915488587469e-05, -2.1225565415921502e-07, + -6.0778553859699155e-06, 1.0967407941688859e-05, 5.9820261926690516e-06, + -1.0310153822420043e-05, -1.8889529270293274e-05, -1.5112372095644556e-05, + 1.6415873674929635e-05, -5.1457932465049436e-06, 4.4092456971783141e-06, + 1.4045394092983390e-05, 1.2200427500773054e-05, 1.9148451277680992e-05, + 2.2241372703956841e-05, 1.2143915488587469e-05, 2.1225565415921502e-07, + 6.0778553859699155e-06, -1.0967407941688859e-05, -5.9820261926690516e-06 + }); +} + +#ifdef GAUXC_HAS_CUDA +TEST_CASE("VV10 CUDA kernels match host formulas", "[vv10][nlc][cuda]") { + const GauXC::detail::vv10::Parameters params{6.3, 0.0093, 1e-8}; + auto rw = rho_weight(); + + auto* coords_d = GauXC::util::cuda_malloc(coords.size()); + auto* weights_d = GauXC::util::cuda_malloc(weights.size()); + auto* rho_d = GauXC::util::cuda_malloc(rho.size()); + auto* gamma_d = GauXC::util::cuda_malloc(density_gradient_norm.size()); + auto* rw_d = GauXC::util::cuda_malloc(rw.size()); + auto* grid_atoms_d = GauXC::util::cuda_malloc(grid_atoms.size()); + auto* rho_t_d = GauXC::util::cuda_malloc(rho_t.size()); + auto* gamma_t_d = GauXC::util::cuda_malloc(gamma_t.size()); + + auto* omega_d = GauXC::util::cuda_malloc(npts); + auto* kappa_d = GauXC::util::cuda_malloc(npts); + auto* domega_drho_d = GauXC::util::cuda_malloc(npts); + auto* domega_dgamma_d = GauXC::util::cuda_malloc(npts); + auto* d2omega_drho2_d = GauXC::util::cuda_malloc(npts); + auto* d2omega_dgamma2_d = GauXC::util::cuda_malloc(npts); + auto* d2omega_drho_dgamma_d = GauXC::util::cuda_malloc(npts); + auto* dkappa_drho_d = GauXC::util::cuda_malloc(npts); + auto* d2kappa_drho2_d = GauXC::util::cuda_malloc(npts); + auto* F_d = GauXC::util::cuda_malloc(npts); + auto* U_d = GauXC::util::cuda_malloc(npts); + auto* W_d = GauXC::util::cuda_malloc(npts); + auto* eps_d = GauXC::util::cuda_malloc(npts); + auto* vrho_d = GauXC::util::cuda_malloc(npts); + auto* vgamma_d = GauXC::util::cuda_malloc(npts); + auto* grid_gradient_d = GauXC::util::cuda_malloc(3*npts); + auto* A_d = GauXC::util::cuda_malloc(npts); + auto* B_d = GauXC::util::cuda_malloc(npts); + auto* C_d = GauXC::util::cuda_malloc(npts); + auto* E_d = GauXC::util::cuda_malloc(npts); + auto* f_rho_t_d = GauXC::util::cuda_malloc(ntrial*npts); + auto* f_gamma_t_d = GauXC::util::cuda_malloc(ntrial*npts); + auto* Egr_d = GauXC::util::cuda_malloc(natoms*3*npts); + auto* Ugr_d = GauXC::util::cuda_malloc(natoms*3*npts); + auto* Wgr_d = GauXC::util::cuda_malloc(natoms*3*npts); + + GauXC::util::cuda_copy(coords.size(), coords_d, coords.data()); + GauXC::util::cuda_copy(weights.size(), weights_d, weights.data()); + GauXC::util::cuda_copy(rho.size(), rho_d, rho.data()); + GauXC::util::cuda_copy(density_gradient_norm.size(), gamma_d, density_gradient_norm.data()); + GauXC::util::cuda_copy(rw.size(), rw_d, rw.data()); + GauXC::util::cuda_copy(grid_atoms.size(), grid_atoms_d, grid_atoms.data()); + GauXC::util::cuda_copy(rho_t.size(), rho_t_d, rho_t.data()); + GauXC::util::cuda_copy(gamma_t.size(), gamma_t_d, gamma_t.data()); + + cudaStream_t stream = 0; + GauXC::detail::vv10::eval_omega_kappa_cuda(stream, npts, rho_d, gamma_d, params, + omega_d, kappa_d, domega_drho_d, domega_dgamma_d, d2omega_drho2_d, + d2omega_dgamma2_d, d2omega_drho_dgamma_d, dkappa_drho_d, d2kappa_drho2_d); + GauXC::detail::vv10::eval_fock_cuda(stream, npts, coords_d, omega_d, kappa_d, + npts, coords_d, rw_d, omega_d, kappa_d, F_d, U_d, W_d); + GauXC::detail::vv10::eval_exc_vxc_from_fock_cuda(stream, npts, rho_d, gamma_d, + F_d, U_d, W_d, params, eps_d, vrho_d, vgamma_d); + GauXC::detail::vv10::eval_grid_gradient_cuda(stream, npts, coords_d, omega_d, + kappa_d, npts, coords_d, rw_d, omega_d, kappa_d, grid_gradient_d); + GauXC::detail::vv10::eval_hessian_intermediates_cuda(stream, npts, coords_d, + weights_d, rho_d, omega_d, kappa_d, U_d, W_d, A_d, B_d, C_d, E_d); + GauXC::detail::vv10::eval_hessian_contraction_cuda(stream, npts, ntrial, + coords_d, weights_d, rho_d, omega_d, kappa_d, U_d, W_d, A_d, B_d, C_d, + domega_drho_d, domega_dgamma_d, dkappa_drho_d, d2omega_drho2_d, + d2omega_dgamma2_d, d2omega_drho_dgamma_d, d2kappa_drho2_d, rho_t_d, + gamma_t_d, f_rho_t_d, f_gamma_t_d); + GauXC::detail::vv10::eval_grid_response_cuda(stream, npts, natoms, coords_d, + weights_d, rho_d, omega_d, kappa_d, grid_atoms_d, Egr_d, Ugr_d, Wgr_d); + GauXC::util::cuda_device_sync(); + + std::vector eps(npts), vrho(npts), vgamma(npts), U(npts), W(npts), E(npts); + std::vector grid_gradient(3*npts), grid_gradient_ref(3*npts); + std::vector f_rho_t(ntrial*npts), f_gamma_t(ntrial*npts), Egr(natoms*3*npts); + GauXC::util::cuda_copy(npts, eps.data(), eps_d); + GauXC::util::cuda_copy(npts, vrho.data(), vrho_d); + GauXC::util::cuda_copy(npts, vgamma.data(), vgamma_d); + GauXC::util::cuda_copy(npts, U.data(), U_d); + GauXC::util::cuda_copy(npts, W.data(), W_d); + GauXC::util::cuda_copy(npts, E.data(), E_d); + GauXC::util::cuda_copy(3*npts, grid_gradient.data(), grid_gradient_d); + GauXC::util::cuda_copy(ntrial*npts, f_rho_t.data(), f_rho_t_d); + GauXC::util::cuda_copy(ntrial*npts, f_gamma_t.data(), f_gamma_t_d); + GauXC::util::cuda_copy(natoms*3*npts, Egr.data(), Egr_d); + + check_vector(eps, {0.00445301813026348, 0.00444920198423763, 0.00445616308451667, 0.00445820144472496, 0.00445560260974276}); + check_vector(vrho, {0.00442962367367506, 0.00442417961353026, 0.00443444131349190, 0.00443937740576438, 0.00443445791849522}); + check_vector(vgamma, {5.4703231981970949e-10, 4.3982859960426448e-09, 3.7195930885603543e-10, 2.0026596148858508e-08, 2.6546741400669848e-09}, 1e-17); + check_vector(E, {-1.0365967086190920e-04, -1.1129196291360637e-04, -9.7369762355538814e-05, -9.3293041938961706e-05, -9.8490711903360671e-05}); + std::vector omega(npts), kappa(npts); + GauXC::detail::vv10::eval_omega_kappa(npts, rho.data(), density_gradient_norm.data(), params, + omega.data(), kappa.data()); + GauXC::detail::vv10::eval_grid_gradient(npts, coords.data(), omega.data(), + kappa.data(), npts, coords.data(), rw.data(), omega.data(), kappa.data(), + grid_gradient_ref.data()); + check_vector(grid_gradient, grid_gradient_ref); + check_vector(f_rho_t, {1.6178037606933446e-07, -1.3249866968720368e-06, 1.1116302821324279e-07, -7.5930158108261938e-08, -1.8068421774630219e-06, -1.0423414119308089e-06, 1.8870904199228002e-06, -1.0109141134429277e-06, 3.5175453056695153e-07, 3.5357534022654595e-07}); + CHECK(Egr[0] == Approx(1.0310153822420043e-05).margin(1e-13)); + CHECK(Egr[29] == Approx(-5.9820261926690516e-06).margin(1e-13)); + + GauXC::util::cuda_free(coords_d, weights_d, rho_d, gamma_d, rw_d, grid_atoms_d, + rho_t_d, gamma_t_d, omega_d, kappa_d, domega_drho_d, domega_dgamma_d, + d2omega_drho2_d, d2omega_dgamma2_d, d2omega_drho_dgamma_d, dkappa_drho_d, + d2kappa_drho2_d, F_d, U_d, W_d, eps_d, vrho_d, vgamma_d, grid_gradient_d, + A_d, B_d, C_d, E_d, f_rho_t_d, f_gamma_t_d, Egr_d, Ugr_d, Wgr_d); +} + +TEST_CASE("VV10 CUDA EXC/VXC view helper matches CPU helper", "[vv10][nlc][cuda]") { + GauXC::IntegratorSettingsNLC settings; + settings.vv10_b = 6.3; + settings.vv10_c = 0.0093; + settings.vv10_tol = 1e-8; + + std::vector eps_cpu(npts, 0.0), vrho_cpu(npts, 0.0), vgamma_cpu(npts, 0.0); + std::vector eps_cuda(npts, 0.0), vrho_cuda(npts, 0.0), vgamma_cuda(npts, 0.0); + + GauXC::detail::vv10::GridView grid{ + static_cast(npts), coords.data(), weights.data(), rho.data(), density_gradient_norm.data() + }; + + GauXC::detail::vv10::eval_exc_vxc( + settings, grid, GauXC::detail::vv10::CorrectionsView{ eps_cpu.data(), vrho_cpu.data(), vgamma_cpu.data() } ); + GauXC::detail::vv10::eval_exc_vxc_cuda( + 0, settings, grid, GauXC::detail::vv10::CorrectionsView{ eps_cuda.data(), vrho_cuda.data(), vgamma_cuda.data() } ); + + check_vector(eps_cuda, eps_cpu); + check_vector(vrho_cuda, vrho_cpu); + check_vector(vgamma_cuda, vgamma_cpu, 1e-17); +} + +TEST_CASE("VV10 CUDA RKS response view helper matches CPU helper", "[vv10][nlc][cuda]") { + GauXC::IntegratorSettingsNLC settings; + settings.vv10_b = 6.3; + settings.vv10_c = 0.0093; + settings.vv10_tol = 1e-8; + + const std::vector grad_x = {0.08, -0.02, 0.11, 0.04, -0.05}; + const std::vector grad_y = {-0.04, 0.07, 0.01, -0.09, 0.06}; + const std::vector grad_z = {0.03, 0.05, -0.06, 0.02, 0.08}; + const std::vector trial_rho = {0.03, -0.02, 0.04, 0.01, -0.05}; + const std::vector trial_grad_x = {0.002, -0.001, 0.003, -0.002, 0.001}; + const std::vector trial_grad_y = {-0.004, 0.002, 0.001, -0.003, 0.005}; + const std::vector trial_grad_z = {0.001, -0.003, 0.002, 0.004, -0.002}; + + std::vector A_cpu(npts, 0.0), B_cpu(3*npts, 0.0); + std::vector A_cuda(npts, 0.0), B_cuda(3*npts, 0.0); + + GauXC::detail::vv10::ResponseGridView grid{ + static_cast(npts), coords.data(), weights.data(), rho.data(), + density_gradient_norm.data(), grad_x.data(), grad_y.data(), grad_z.data() + }; + GauXC::detail::vv10::TrialView trial{ + trial_rho.data(), trial_grad_x.data(), trial_grad_y.data(), trial_grad_z.data() + }; + + GauXC::detail::vv10::eval_fxc_rks( + settings, grid, trial, GauXC::detail::vv10::ResponseCorrectionsView{A_cpu.data(), B_cpu.data()} ); + GauXC::detail::vv10::eval_fxc_rks_cuda( + 0, settings, grid, trial, GauXC::detail::vv10::ResponseCorrectionsView{A_cuda.data(), B_cuda.data()} ); + + check_vector(A_cuda, A_cpu); + check_vector(B_cuda, B_cpu); +} + +TEST_CASE("VV10 CUDA grid-gradient view helper matches CPU helper", "[vv10][nlc][cuda]") { + GauXC::IntegratorSettingsNLC settings; + settings.vv10_b = 6.3; + settings.vv10_c = 0.0093; + settings.vv10_tol = 1e-8; + + std::vector gx_cpu(npts, 0.0), gy_cpu(npts, 0.0), gz_cpu(npts, 0.0); + std::vector gx_cuda(npts, 0.0), gy_cuda(npts, 0.0), gz_cuda(npts, 0.0); + + GauXC::detail::vv10::GridView grid{ + static_cast(npts), coords.data(), weights.data(), rho.data(), density_gradient_norm.data() + }; + + GauXC::detail::vv10::eval_grid_gradient( + settings, grid, GauXC::detail::vv10::GridGradientView{gx_cpu.data(), gy_cpu.data(), gz_cpu.data()} ); + GauXC::detail::vv10::eval_grid_gradient_cuda( + 0, settings, grid, GauXC::detail::vv10::GridGradientView{gx_cuda.data(), gy_cuda.data(), gz_cuda.data()} ); + + check_vector(gx_cuda, gx_cpu); + check_vector(gy_cuda, gy_cpu); + check_vector(gz_cuda, gz_cpu); +} +#endif \ No newline at end of file