From 519780d020cee33d08ef9cacddf37bc3c50e50e4 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Tue, 28 Apr 2026 18:13:18 +0200
Subject: [PATCH 01/52] Fix MSVC native cl.exe build compatibility on Windows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GauXC was only buildable with GCC, Clang, and clang-cl. This commit fixes all
compilation and linking issues when building with MSVC's native `cl.exe` compiler.

## Compiler compatibility fixes
- `__PRETTY_FUNCTION__` → `__FUNCSIG__` (`exceptions.hpp`):
  MSVC does not support `__PRETTY_FUNCTION__`; use `__FUNCSIG__` under `_MSC_VER` guard.
- `__builtin_popcount` → `std::popcount` (`exx_screening.cxx`):
  MSVC does not provide GCC builtins; use C++20 `<bit>` header instead.
- C99 `[restrict]` array params → `*__restrict` pointers (14 rys files):
  MSVC does not support the C99 array parameter syntax. The `*__restrict` pointer
  form is portable across GCC, Clang, and MSVC.
- `__attribute__((always_inline))` → `FORCE_INLINE` macro (`rys_integral.c`):
  Maps to `__forceinline` on MSVC and `inline __attribute__((always_inline))` on GCC/Clang.
  Also defines `MIN` as a simple ternary on MSVC; the GNU statement-expression
  version is kept unchanged for GCC/Clang.
- C99 VLAs → `_malloca`/`_freea` (`rys_rw.c`, `rys_xrw.c`):
  MSVC does not support VLAs. Uses `_malloca`/`_freea` (stack with heap fallback)
  under `_MSC_VER` guards.
- `__attribute__((__aligned__(64)))` → `alignas(64)` (20 `integral_*.cxx` files):
  Portable C++11 alignment specifier, works on GCC, Clang, and MSVC.
- `__restrict__` → `__restrict` (`integral_1_0.cxx`):
  `__restrict__` is GCC/Clang-only; `__restrict` is portable across all three compilers.
- Missing `#include <string>` (`reduction_driver.hpp`):
  With MSVC's standard library, `<string>` is not pulled in transitively by the
  other headers this file includes, so it must be included explicitly.
- Non-const `operator==` (`molecule.hpp`):
  Made `Molecule::operator==` `const` to fix C++20 ambiguity with synthesized
  reverse candidates on MSVC.

## Static HDF5 linking fixes (`src/external/CMakeLists.txt`)

- HighFive incorrectly propagates `H5_BUILT_AS_DYNAMIC_LIB`:
  CMake's `FindHDF5` module sets this define on Windows when `HDF5_USE_STATIC_LIBRARIES`
  is unset, regardless of whether the library is actually static. HighFive then
  propagates it via its `libdeps` INTERFACE target. For static HDF5 builds on Windows
  (`WIN32 AND HDF5_PROVIDES_STATIC_LIBS`), the fix patches `libdeps` after
  `FetchContent_MakeAvailable` to replace `H5_BUILT_AS_DYNAMIC_LIB` with
  `H5_BUILT_AS_STATIC_LIB`.
- HDF5 transitive static dependencies:
  When linking HDF5 statically on Windows, its dependencies (zlib, szip/aec, shlwapi)
  must be linked explicitly as they are not auto-resolved.
---
 include/gauxc/exceptions.hpp                  |  5 +++
 include/gauxc/molecule.hpp                    |  2 +-
 include/gauxc/reduction_driver.hpp            |  1 +
 src/external/CMakeLists.txt                   | 29 +++++++++++++++--
 .../integrator_util/exx_screening.cxx         |  3 +-
 .../host/obara_saika/src/integral_0.cxx       |  2 +-
 .../host/obara_saika/src/integral_0_0.cxx     |  2 +-
 .../host/obara_saika/src/integral_1.cxx       |  2 +-
 .../host/obara_saika/src/integral_1_0.cxx     | 10 +++---
 .../host/obara_saika/src/integral_1_1.cxx     |  2 +-
 .../host/obara_saika/src/integral_2.cxx       |  2 +-
 .../host/obara_saika/src/integral_2_0.cxx     |  2 +-
 .../host/obara_saika/src/integral_2_1.cxx     |  2 +-
 .../host/obara_saika/src/integral_2_2.cxx     |  2 +-
 .../host/obara_saika/src/integral_3.cxx       |  2 +-
 .../host/obara_saika/src/integral_3_0.cxx     |  2 +-
 .../host/obara_saika/src/integral_3_1.cxx     |  2 +-
 .../host/obara_saika/src/integral_3_2.cxx     |  2 +-
 .../host/obara_saika/src/integral_3_3.cxx     |  2 +-
 .../host/obara_saika/src/integral_4.cxx       |  2 +-
 .../host/obara_saika/src/integral_4_0.cxx     |  2 +-
 .../host/obara_saika/src/integral_4_1.cxx     |  2 +-
 .../host/obara_saika/src/integral_4_2.cxx     |  2 +-
 .../host/obara_saika/src/integral_4_3.cxx     |  2 +-
 .../host/obara_saika/src/integral_4_4.cxx     |  2 +-
 .../local_work_driver/host/rys/src/rys_1rw.c  |  2 +-
 .../local_work_driver/host/rys/src/rys_1rw.h  |  2 +-
 .../local_work_driver/host/rys/src/rys_2rw.c  |  2 +-
 .../local_work_driver/host/rys/src/rys_2rw.h  |  2 +-
 .../local_work_driver/host/rys/src/rys_3rw.c  |  2 +-
 .../local_work_driver/host/rys/src/rys_3rw.h  |  2 +-
 .../local_work_driver/host/rys/src/rys_4rw.c  |  2 +-
 .../local_work_driver/host/rys/src/rys_4rw.h  |  2 +-
 .../local_work_driver/host/rys/src/rys_5rw.c  |  2 +-
 .../local_work_driver/host/rys/src/rys_5rw.h  |  2 +-
 .../host/rys/src/rys_integral.c               | 22 ++++++++-----
 .../local_work_driver/host/rys/src/rys_rw.c   | 18 ++++++++---
 .../local_work_driver/host/rys/src/rys_rw.h   |  2 +-
 .../local_work_driver/host/rys/src/rys_xrw.c  | 31 ++++++++++++++++---
 .../local_work_driver/host/rys/src/rys_xrw.h  |  8 ++---
 40 files changed, 129 insertions(+), 60 deletions(-)
diff --git a/include/gauxc/exceptions.hpp b/include/gauxc/exceptions.hpp
index 84b9b4893..fa6a874e8 100644
--- a/include/gauxc/exceptions.hpp
+++ b/include/gauxc/exceptions.hpp
@@ -76,8 +76,13 @@ class generic_gauxc_exception : public std::exception {
 
 }
 
+#ifdef _MSC_VER
+#define GAUXC_GENERIC_EXCEPTION( MSG ) \
+  throw generic_gauxc_exception( __FILE__, __FUNCSIG__, __LINE__, MSG )
+#else
 #define GAUXC_GENERIC_EXCEPTION( MSG ) \
   throw generic_gauxc_exception( __FILE__, __PRETTY_FUNCTION__, __LINE__, MSG )
+#endif
 
 #define GAUXC_PIMPL_NOT_INITIALIZED() \
   GAUXC_GENERIC_EXCEPTION("PIMPL NOT INITIALIZED")
diff --git a/include/gauxc/molecule.hpp b/include/gauxc/molecule.hpp
index 9f4fe6a74..2b6743620 100644
--- a/include/gauxc/molecule.hpp
+++ b/include/gauxc/molecule.hpp
@@ -47,7 +47,7 @@ class Molecule : public std::vector<Atom> {
       })->Z;
   }
 
-  bool operator==(const Molecule& other) {
+  bool operator==(const Molecule& other) const {
     if(other.size() != this->size()) return false;
     for( auto i = 0ul; i < this->size(); ++i )
       if( other[i] != operator[](i) ) return false;
diff --git a/include/gauxc/reduction_driver.hpp b/include/gauxc/reduction_driver.hpp
index f3bef1886..9cb1dffc7 100644
--- a/include/gauxc/reduction_driver.hpp
+++ b/include/gauxc/reduction_driver.hpp
@@ -11,6 +11,7 @@
  */
 #pragma once
 #include <memory>
+#include <string>
 #include <gauxc/gauxc_config.hpp>
 #include <gauxc/runtime_environment.hpp>
 #include <typeindex>
diff --git a/src/external/CMakeLists.txt b/src/external/CMakeLists.txt
index 46612c81b..c9c9d7077 100644
--- a/src/external/CMakeLists.txt
+++ b/src/external/CMakeLists.txt
@@ -21,19 +21,42 @@ if( GAUXC_ENABLE_HDF5 )
       message(STATUS "HighFive REV  = ${GAUXC_HIGHFIVE_REVISION}  ")
       FetchContent_Declare( HighFive
         GIT_REPOSITORY ${GAUXC_HIGHFIVE_REPOSITORY}
-        GIT_TAG        ${GAUXC_HIGHFIVE_REVISION}  
+        GIT_TAG        ${GAUXC_HIGHFIVE_REVISION}
       )
-    
+
       set(HIGHFIVE_USE_BOOST OFF CACHE BOOL "" )
       set(HIGHFIVE_UNIT_TESTS OFF CACHE BOOL "" )
       set(HIGHFIVE_EXAMPLES OFF CACHE BOOL "" )
       #set(HIGHFIVE_PARALLEL_HDF5 ON CACHE BOOL "" )
       set(HIGHFIVE_BUILD_DOCS OFF CACHE BOOL "" )
       FetchContent_MakeAvailable( HighFive )
-    
+
+      # HighFive propagates HDF5_DEFINITIONS via its libdeps target.
+      # CMake's FindHDF5 module sets H5_BUILT_AS_DYNAMIC_LIB on Windows
+      # when HDF5_USE_STATIC_LIBRARIES is not set, even for static libs.
+      # Correct this when linking statically.
+      if(WIN32 AND HDF5_PROVIDES_STATIC_LIBS AND TARGET libdeps)
+        get_target_property(_libdeps_defs libdeps INTERFACE_COMPILE_DEFINITIONS)
+        if(_libdeps_defs)
+          list(REMOVE_ITEM _libdeps_defs "H5_BUILT_AS_DYNAMIC_LIB")
+          list(APPEND _libdeps_defs "H5_BUILT_AS_STATIC_LIB")
+          set_target_properties(libdeps PROPERTIES INTERFACE_COMPILE_DEFINITIONS "${_libdeps_defs}")
+        endif()
+      endif()
+
     endif()
     target_sources( gauxc PRIVATE hdf5_write.cxx hdf5_read.cxx )
     target_link_libraries( gauxc PUBLIC HighFive )
+
+    # When linking HDF5 statically on Windows, HDF5's transitive
+    # dependencies (zlib, szip/aec, shlwapi) must be linked explicitly.
+    if(WIN32 AND HDF5_PROVIDES_STATIC_LIBS)
+      find_library(ZLIB_LIBRARY NAMES zlib z)
+      find_library(SZIP_LIBRARY NAMES szip-static szip sz)
+      find_library(AEC_LIBRARY  NAMES aec-static aec)
+      target_link_libraries( gauxc PUBLIC
+        ${ZLIB_LIBRARY} ${SZIP_LIBRARY} ${AEC_LIBRARY} shlwapi )
+    endif()
   else()
     message(WARNING "GAUXC_ENABLE_HDF5 was enabled, but HDF5 was not found, Disabling HDF5 Bindings")
   endif()
diff --git a/src/xc_integrator/integrator_util/exx_screening.cxx b/src/xc_integrator/integrator_util/exx_screening.cxx
index 5c7efcd13..f55148742 100644
--- a/src/xc_integrator/integrator_util/exx_screening.cxx
+++ b/src/xc_integrator/integrator_util/exx_screening.cxx
@@ -13,6 +13,7 @@
 #include "host/blas.hpp"
 #include <gauxc/util/div_ceil.hpp>
 #include <chrono>
+#include <bit>
 //#include <mpi.h>
 //#include <fstream>
 #ifdef GAUXC_HAS_CUDA
@@ -195,7 +196,7 @@ void exx_ek_screening(
     }
 
     uint32_t total_shells = 0;
-    for( auto x : task_ek_shells ) total_shells += __builtin_popcount(x);
+    for( auto x : task_ek_shells ) total_shells += std::popcount(x);
 
     std::vector<uint32_t> ek_shells; ek_shells.reserve(total_shells);
     for( auto i_block = 0u; i_block < util::div_ceil(nshells,32); ++i_block ) {
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.cxx
index c64d2d54b..cd6d444d7 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.cxx
@@ -30,7 +30,7 @@ void integral_0(size_t npts,
                int ldG, 
                double *weights,
                double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[1 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[1 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 1 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.cxx
index 6971c1a71..d5024357a 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.cxx
@@ -32,7 +32,7 @@ void integral_0_0(size_t npts,
                   int ldG, 
                   double *weights,
                   double * /*boys_table*/) {
-   __attribute__((__aligned__(64))) double buffer[1 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[1 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 1 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.cxx
index 3638d86af..045a5c860 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.cxx
@@ -30,7 +30,7 @@ void integral_1(size_t npts,
                int ldG, 
                double *weights,
                double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[9 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[9 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 9 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.cxx
index d0e655413..34f39af6a 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.cxx
@@ -32,12 +32,12 @@ void integral_1_0(size_t npts,
                   int ldG, 
                   double *weights,
                   double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[3 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[3 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
-   double * __restrict__ temp       = (buffer + 0);
-   double * __restrict__ Tval       = (buffer + 3 * NPTS_LOCAL + 0 * NPTS_LOCAL);
-   double * __restrict__ Tval_inv_e = (buffer + 3 * NPTS_LOCAL + 1 * NPTS_LOCAL);
-   double * __restrict__ FmT        = (buffer + 3 * NPTS_LOCAL + 2 * NPTS_LOCAL);
+   double * __restrict temp       = (buffer + 0);
+   double * __restrict Tval       = (buffer + 3 * NPTS_LOCAL + 0 * NPTS_LOCAL);
+   double * __restrict Tval_inv_e = (buffer + 3 * NPTS_LOCAL + 1 * NPTS_LOCAL);
+   double * __restrict FmT        = (buffer + 3 * NPTS_LOCAL + 2 * NPTS_LOCAL);
 
    size_t npts_upper = NPTS_LOCAL * (npts / NPTS_LOCAL);
    size_t p_outer = 0;
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.cxx
index ee58d18f0..bf46e1efe 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.cxx
@@ -32,7 +32,7 @@ void integral_1_1(size_t npts,
                   int ldG, 
                   double *weights,
                   double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[9 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[9 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 9 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.cxx
index 035be5bef..bcaef2609 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.cxx
@@ -30,7 +30,7 @@ void integral_2(size_t npts,
                int ldG, 
                double *weights,
                double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[31 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[31 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 31 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.cxx
index 0343e6675..378ee6641 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.cxx
@@ -32,7 +32,7 @@ void integral_2_0(size_t npts,
                   int ldG, 
                   double *weights,
                   double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[6 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[6 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 6 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.cxx
index 6904c15d7..c8abdf6be 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.cxx
@@ -32,7 +32,7 @@ void integral_2_1(size_t npts,
                   int ldG, 
                   double *weights,
                   double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[16 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[16 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 16 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.cxx
index dbd9f500d..95989043d 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.cxx
@@ -32,7 +32,7 @@ void integral_2_2(size_t npts,
                   int ldG, 
                   double *weights,
                   double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[31 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[31 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 31 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx
index c3faf7f43..48b26caf2 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx
@@ -30,7 +30,7 @@ void integral_3(size_t npts,
                int ldG, 
                double *weights,
                double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[74 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[74 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 74 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.cxx
index 44c3542e0..bca56cfc9 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.cxx
@@ -32,7 +32,7 @@ void integral_3_0(size_t npts,
                   int ldG, 
                   double *weights,
                   double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[10 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[10 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 10 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.cxx
index 197e948ad..91148e596 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.cxx
@@ -32,7 +32,7 @@ void integral_3_1(size_t npts,
                   int ldG, 
                   double *weights,
                   double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[25 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[25 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 25 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx
index 7c4a2ec67..eea293316 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx
@@ -32,7 +32,7 @@ void integral_3_2(size_t npts,
                   int ldG, 
                   double *weights,
                   double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[46 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[46 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 46 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx
index 251de89d9..3d46f0ac5 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx
@@ -32,7 +32,7 @@ void integral_3_3(size_t npts,
                   int ldG, 
                   double *weights,
                   double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[74 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[74 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 74 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx
index 67a9cace1..ad0e89b82 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx
@@ -30,7 +30,7 @@ void integral_4(size_t npts,
                int ldG, 
                double *weights,
                double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[145 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[145 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 145 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.cxx
index 1b2f57f14..275edeed3 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.cxx
@@ -32,7 +32,7 @@ void integral_4_0(size_t npts,
                   int ldG, 
                   double *weights,
                   double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[15 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[15 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 15 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx
index 6fefd7870..f189fe0d0 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx
@@ -32,7 +32,7 @@ void integral_4_1(size_t npts,
                   int ldG, 
                   double *weights,
                   double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[36 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[36 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 36 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx
index 0a88c5dd7..393bd36d7 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx
@@ -32,7 +32,7 @@ void integral_4_2(size_t npts,
                   int ldG, 
                   double *weights,
                   double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[64 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[64 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 64 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx
index e318e860f..4e2cdc506 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx
@@ -32,7 +32,7 @@ void integral_4_3(size_t npts,
                   int ldG, 
                   double *weights,
                   double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[100 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[100 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 100 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx
index 5aca482ab..301bde852 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx
@@ -32,7 +32,7 @@ void integral_4_4(size_t npts,
                   int ldG, 
                   double *weights,
                   double *boys_table) {
-   __attribute__((__aligned__(64))) double buffer[145 * NPTS_LOCAL + 3 * NPTS_LOCAL];
+   alignas(64) double buffer[145 * NPTS_LOCAL + 3 * NPTS_LOCAL];
 
    double *temp       = (buffer + 0);
    double *Tval       = (buffer + 145 * NPTS_LOCAL + 0 * NPTS_LOCAL);
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_1rw.c b/src/xc_integrator/local_work_driver/host/rys/src/rys_1rw.c
index 4aa876364..2e18f7159 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_1rw.c
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_1rw.c
@@ -6,7 +6,7 @@
 #define MAX(a,b)    ((a) < (b) ? (b) : (a))
 #define MIN(a,b)    ((a) > (b) ? (b) : (a))
 
-void rys_1rw(int nt, const double tval[restrict], double rts[restrict], double wts[restrict]) {
+void rys_1rw(int nt, const double *__restrict tval, double *__restrict rts, double *__restrict wts) {
   int jump1[34] =
     { 1, 2, 2, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_1rw.h b/src/xc_integrator/local_work_driver/host/rys/src/rys_1rw.h
index c98f10241..5f1f05672 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_1rw.h
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_1rw.h
@@ -1,6 +1,6 @@
 #ifndef RYS_1RW_H_
 #define RYS_1RW_H_
 
-void rys_1rw(int nt, const double tval[restrict], double rts[restrict], double wts[restrict]);
+void rys_1rw(int nt, const double *__restrict tval, double *__restrict rts, double *__restrict wts);
 
 #endif
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_2rw.c b/src/xc_integrator/local_work_driver/host/rys/src/rys_2rw.c
index 78459eb8a..d98966172 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_2rw.c
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_2rw.c
@@ -6,7 +6,7 @@
 #define MAX(a,b)    ((a) < (b) ? (b) : (a))
 #define MIN(a,b)    ((a) > (b) ? (b) : (a))
 
-void rys_2rw(int nt, const double tval[restrict], double rts[restrict], double wts[restrict]) {
+void rys_2rw(int nt, const double *__restrict tval, double *__restrict rts, double *__restrict wts) {
   int jump2[41] =
     { 1, 2, 2, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_2rw.h b/src/xc_integrator/local_work_driver/host/rys/src/rys_2rw.h
index 309c3ec26..e18221d57 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_2rw.h
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_2rw.h
@@ -1,6 +1,6 @@
 #ifndef RYS_2RW_H_
 #define RYS_2RW_H_
 
-void rys_2rw(int nt, const double tval[restrict], double rts[restrict], double wts[restrict]);
+void rys_2rw(int nt, const double *__restrict tval, double *__restrict rts, double *__restrict wts);
 
 #endif
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_3rw.c b/src/xc_integrator/local_work_driver/host/rys/src/rys_3rw.c
index 299073ca9..c86131622 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_3rw.c
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_3rw.c
@@ -6,7 +6,7 @@
 #define MAX(a,b)    ((a) < (b) ? (b) : (a))
 #define MIN(a,b)    ((a) > (b) ? (b) : (a))
 
-void rys_3rw(int nt, const double tval[restrict], double rts[restrict], double wts[restrict]) {
+void rys_3rw(int nt, const double *__restrict tval, double *__restrict rts, double *__restrict wts) {
   int jump3[48] =
     { 1, 2, 2, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_3rw.h b/src/xc_integrator/local_work_driver/host/rys/src/rys_3rw.h
index 904139b2c..affe560f0 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_3rw.h
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_3rw.h
@@ -1,6 +1,6 @@
 #ifndef RYS_3RW_H_
 #define RYS_3RW_H_
 
-void rys_3rw(int nt, const double tval[restrict], double rts[restrict], double wts[restrict]);
+void rys_3rw(int nt, const double *__restrict tval, double *__restrict rts, double *__restrict wts);
 
 #endif
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_4rw.c b/src/xc_integrator/local_work_driver/host/rys/src/rys_4rw.c
index 2b83ae652..f59c31eb2 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_4rw.c
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_4rw.c
@@ -6,7 +6,7 @@
 #define MAX(a,b)    ((a) < (b) ? (b) : (a))
 #define MIN(a,b)    ((a) > (b) ? (b) : (a))
 
-void rys_4rw(int nt, const double tval[restrict], double rts[restrict], double wts[restrict]) {
+void rys_4rw(int nt, const double *__restrict tval, double *__restrict rts, double *__restrict wts) {
   int jump4[54] =
     { 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_4rw.h b/src/xc_integrator/local_work_driver/host/rys/src/rys_4rw.h
index dd6fac71f..5294e181d 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_4rw.h
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_4rw.h
@@ -1,6 +1,6 @@
 #ifndef RYS_4RW_H_
 #define RYS_4RW_H_
 
-void rys_4rw(int nt, const double tval[restrict], double rts[restrict], double wts[restrict]);
+void rys_4rw(int nt, const double *__restrict tval, double *__restrict rts, double *__restrict wts);
 
 #endif
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_5rw.c b/src/xc_integrator/local_work_driver/host/rys/src/rys_5rw.c
index a478610c9..ccc677f98 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_5rw.c
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_5rw.c
@@ -6,7 +6,7 @@
 #define MAX(a,b)    ((a) < (b) ? (b) : (a))
 #define MIN(a,b)    ((a) > (b) ? (b) : (a))
 
-void rys_5rw(int nt, const double tval[restrict], double rts[restrict], double wts[restrict]) {
+void rys_5rw(int nt, const double *__restrict tval, double *__restrict rts, double *__restrict wts) {
   int jump5[60] = { 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6,
     6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9
   };
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_5rw.h b/src/xc_integrator/local_work_driver/host/rys/src/rys_5rw.h
index 8e4278431..1e76fe9e5 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_5rw.h
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_5rw.h
@@ -1,6 +1,6 @@
 #ifndef RYS_5RW_H_
 #define RYS_5RW_H_
 
-void rys_5rw(int nt, const double tval[restrict], double rts[restrict], double wts[restrict]);
+void rys_5rw(int nt, const double *__restrict tval, double *__restrict rts, double *__restrict wts);
 
 #endif
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_integral.c b/src/xc_integrator/local_work_driver/host/rys/src/rys_integral.c
index a9f8d22da..cee3f63e3 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_integral.c
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_integral.c
@@ -21,17 +21,23 @@
 
 #define PI 3.14159265358979323846
 
+#ifdef _MSC_VER
+#define FORCE_INLINE __forceinline
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#else
+#define FORCE_INLINE inline __attribute__((always_inline))
 #define MIN(a,b)                                \
   ({ __typeof__ (a) _a = (a);                   \
     __typeof__ (b) _b = (b);                    \
     _a < _b ? _a : _b; })
+#endif
 
 // codelets
-inline void __attribute__((always_inline)) compute_00(double beta, double *int_array, double *wgh) {
+FORCE_INLINE void compute_00(double beta, double *int_array, double *wgh) {
   *(int_array + 0) = (*(int_array + 0)) * beta + *(wgh + 0);
 }
 
-inline void __attribute__((always_inline)) compute_10_01(double xPX, double yPX, double zPX, double xPC, double yPC, double zPC, double beta, double *int_array, double *rts, double *wgh) {
+FORCE_INLINE void compute_10_01(double xPX, double yPX, double zPX, double xPC, double yPC, double zPC, double beta, double *int_array, double *rts, double *wgh) {
   double rt, Cx0, Cy0, Cz0, Cx1, Cy1, Cz1;
   
   rt = *(rts + 0);
@@ -49,7 +55,7 @@ inline void __attribute__((always_inline)) compute_10_01(double xPX, double yPX,
   *(int_array + 2) = (*(int_array + 2)) * beta + (*(wgh + 0)) * Cz0 + (*(wgh + 1)) * Cz1;
 }
 
-inline void __attribute__((always_inline)) compute_20_02(double xPX, double yPX, double zPX, double xPC, double yPC, double zPC, double aP_inv, double beta, double *int_array, double *rts, double *wgh) {
+FORCE_INLINE void compute_20_02(double xPX, double yPX, double zPX, double xPC, double yPC, double zPC, double aP_inv, double beta, double *int_array, double *rts, double *wgh) {
   double B0, B1, rt0, rt1, Cx0, Cy0, Cz0, Cx1, Cy1, Cz1, Cx2, Cy2, Cz2, Cx3, Cy3, Cz3;
   
   rt0 = *(rts + 0);
@@ -82,7 +88,7 @@ inline void __attribute__((always_inline)) compute_20_02(double xPX, double yPX,
   *(int_array + 5) = (*(int_array + 5)) * beta + Cz2 * (*(wgh + 0)) + Cz3 * (*(wgh + 1));
 }
 
-inline void __attribute__((always_inline)) compute_11(double xAB, double yAB, double zAB, double xPX, double yPX, double zPX, double xPC, double yPC, double zPC, double aP_inv, double beta, double *int_array, double *rts, double *wgh) {
+FORCE_INLINE void compute_11(double xAB, double yAB, double zAB, double xPX, double yPX, double zPX, double xPC, double yPC, double zPC, double aP_inv, double beta, double *int_array, double *rts, double *wgh) {
   double B0, B1, rt0, rt1, Cx0, Cy0, Cz0, Cx1, Cy1, Cz1, Cx2, Cy2, Cz2, Cx3, Cy3, Cz3;
   
   rt0 = *(rts + 0);
@@ -120,7 +126,7 @@ inline void __attribute__((always_inline)) compute_11(double xAB, double yAB, do
 }
 
 // nr roots > 2
-inline void __attribute__((always_inline)) compute_vrr3(int nr_roots, int l, int lA, int llA, int lB, int llB, double xPX, double yPX, double zPX, double xPC, double yPC, double zPC, double aP_inv, double * rts, double *vrr_array, double *hrr_array) {
+FORCE_INLINE void compute_vrr3(int nr_roots, int l, int lA, int llA, int lB, int llB, double xPX, double yPX, double zPX, double xPC, double yPC, double zPC, double aP_inv, double * rts, double *vrr_array, double *hrr_array) {
   double *roots = (rts + 0);
   double *vrr = (vrr_array + 0);
   for(int r = 0; r < nr_roots; ++r) {
@@ -210,7 +216,7 @@ inline void __attribute__((always_inline)) compute_vrr3(int nr_roots, int l, int
   }
 }
 
-inline void __attribute__((always_inline)) compute_hrr3(int nr_roots, int l, int lA, int llA, int lB, int llB, double xAB, double yAB, double zAB, double *vrr_array, double *hrr_array) {
+FORCE_INLINE void compute_hrr3(int nr_roots, int l, int lA, int llA, int lB, int llB, double xAB, double yAB, double zAB, double *vrr_array, double *hrr_array) {
   for(int j = 1; j <= lA; ++j) {
     double *hrrj = (hrr_array + llA * j);
     
@@ -271,11 +277,11 @@ inline void __attribute__((always_inline)) compute_hrr3(int nr_roots, int l, int
   }
 }
 
-inline int __attribute__((always_inline)) index_calculation(int i, int j, int L) {
+FORCE_INLINE int index_calculation(int i, int j, int L) {
   return (L - i) * (L - i + 1) / 2 + j;
 }
 
-inline void __attribute__((always_inline)) compute_reduction(int nr_roots, int lA, int lB, double *weights, double *hrr_array, double *result, double beta) {
+FORCE_INLINE void compute_reduction(int nr_roots, int lA, int lB, double *weights, double *hrr_array, double *result, double beta) {
   int offsetB = (lB + 1) * (lB + 2) / 2;
 
   for(int ia = 0; ia <= lA; ++ia) {
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_rw.c b/src/xc_integrator/local_work_driver/host/rys/src/rys_rw.c
index 905d05d49..f9eba534d 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_rw.c
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_rw.c
@@ -3,6 +3,9 @@
 #include <string.h>
 #include <assert.h>
 #include <math.h>
+#ifdef _MSC_VER
+#include <malloc.h>
+#endif
 
 #include "boys.h"
 
@@ -15,9 +18,9 @@
 
 void rys_rw(int nt,
 	    int ngqp,
-	    double tval[restrict],
-	    double rts[restrict],
-	    double wts[restrict]) {
+	    double *__restrict tval,
+	    double *__restrict rts,
+	    double *__restrict wts) {
   switch (ngqp) {
   case 1:
     rys_1rw(nt, tval, rts, wts);
@@ -36,7 +39,11 @@ void rys_rw(int nt,
     return;
   default:
     {
+#ifdef _MSC_VER
+      double *ryszero = (double *)_malloca(nt * sizeof(double));
+#else
       double ryszero[nt];
+#endif
       
       for (int n = 0; n < nt; n++) {
 	const double t = tval[n];
@@ -61,7 +68,10 @@ void rys_rw(int nt,
       int nmom = (ngqp << 1) - 1;
       
       rys_xrw(nt, ntgqp, ngqp, nmom, tval, ryszero, rts, wts);
-      
+
+#ifdef _MSC_VER
+      _freea(ryszero);
+#endif
       return;
     }
   }
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_rw.h b/src/xc_integrator/local_work_driver/host/rys/src/rys_rw.h
index 659cddefb..9d0f6ed21 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_rw.h
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_rw.h
@@ -1,6 +1,6 @@
 #ifndef RYS_RW_H_
 #define RYS_RW_H_
 
-void rys_rw(int nt, int ngqp, double tval[restrict], double rts[restrict], double wts[restrict]);
+void rys_rw(int nt, int ngqp, double *__restrict tval, double *__restrict rts, double *__restrict wts);
 
 #endif
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_xrw.c b/src/xc_integrator/local_work_driver/host/rys/src/rys_xrw.c
index 35ba680fe..2089bd0f6 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_xrw.c
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_xrw.c
@@ -2,16 +2,28 @@
 #include <stddef.h>
 #include <math.h>
 #include <assert.h>
+#ifdef _MSC_VER
+#include <malloc.h>
+#endif
 #include "jacobi.h"
 
 void rys_xrw(int nt,
 	      int ntgqp,
 	      int ngqp,
 	      int nmom,
-	      const double tval[restrict],
-	      const double ryszero[restrict],
-	      double rts[restrict],
-	      double wts[restrict]) {
+	      const double *__restrict tval,
+	      const double *__restrict ryszero,
+	      double *__restrict rts,
+	      double *__restrict wts) {
+#ifdef _MSC_VER
+  double *a    = (double *)_malloca(nmom     * sizeof(double));
+  double *b    = (double *)_malloca((nmom-1) * sizeof(double));
+  double *mom  = (double *)_malloca(nmom     * sizeof(double));
+  double *dia  = (double *)_malloca(ngqp     * sizeof(double));
+  double *off  = (double *)_malloca(ngqp     * sizeof(double));
+  double *row1 = (double *)_malloca(nmom     * sizeof(double));
+  double *row2 = (double *)_malloca(nmom     * sizeof(double));
+#else
   double a[nmom];
   double b[nmom-1];
   double mom[nmom];
@@ -19,6 +31,7 @@ void rys_xrw(int nt,
   double off[ngqp];
   double row1[nmom];
   double row2[nmom];
+#endif
 
   int nrts = 0;
   for (int n = 0; n < nt; n += 1) {
@@ -261,4 +274,14 @@ void rys_xrw(int nt,
       nrts += ngqp;
     }
   }
+
+#ifdef _MSC_VER
+  _freea(row2);
+  _freea(row1);
+  _freea(off);
+  _freea(dia);
+  _freea(mom);
+  _freea(b);
+  _freea(a);
+#endif
 }
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_xrw.h b/src/xc_integrator/local_work_driver/host/rys/src/rys_xrw.h
index f107d589b..b99cdcc4a 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_xrw.h
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_xrw.h
@@ -5,9 +5,9 @@ void rys_xrw(int nt,
 	     int ntgqp,
 	     int ngqp,
 	     int nmom,
-	     const double tval[restrict],
-	     const double ryszero[restrict],
-	     double rts[restrict],
-	     double wts[restrict]);
+	     const double *__restrict tval,
+	     const double *__restrict ryszero,
+	     double *__restrict rts,
+	     double *__restrict wts);
 
 #endif

From 5104ef4e474b29244c31ca1917242bb2e9594696 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Tue, 28 Apr 2026 18:14:30 +0200
Subject: [PATCH 02/52] Set temporary ExchCXX & IntegratorXX versions

---
 cmake/gauxc-dep-versions.cmake | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cmake/gauxc-dep-versions.cmake b/cmake/gauxc-dep-versions.cmake
index 62fbcb26f..032b0188b 100644
--- a/cmake/gauxc-dep-versions.cmake
+++ b/cmake/gauxc-dep-versions.cmake
@@ -7,14 +7,14 @@ set( GAUXC_CUB_REVISION   1.10.0 )
 set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git )
 set( GAUXC_CUTLASS_REVISION v2.10.0 )
 
-set( GAUXC_EXCHCXX_REPOSITORY https://github.com/wavefunction91/ExchCXX.git )
-set( GAUXC_EXCHCXX_REVISION   v1.0.0 )
+set( GAUXC_EXCHCXX_REPOSITORY https://github.com/lorisercole/ExchCXX.git )
+set( GAUXC_EXCHCXX_REVISION   7d83223e72e2eb1446af87546b75cb81cfeca719 )
 
 set( GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git )
 set( GAUXC_GAU2GRID_REVISION   v2.0.6 )
 
-set( GAUXC_INTEGRATORXX_REPOSITORY https://github.com/wavefunction91/IntegratorXX.git )
-set( GAUXC_INTEGRATORXX_REVISION   1369be58d7a3235dac36d75dd964fef058830622 )
+set( GAUXC_INTEGRATORXX_REPOSITORY https://github.com/lorisercole/IntegratorXX.git )
+set( GAUXC_INTEGRATORXX_REVISION   60e45e74b4a8939a4b0fb9ca3e9e2a7304f9356f )
 
 set( GAUXC_HIGHFIVE_REPOSITORY https://github.com/highfive-devs/HighFive.git )
 set( GAUXC_HIGHFIVE_REVISION 805f0e13d09b47c4b01d40682621904aa3b31bb8 )

From 80e011e747977f41edaee83388834c649fe208b7 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Tue, 5 May 2026 11:35:05 +0200
Subject: [PATCH 03/52] avoid implicit declaration error in gau2grid_helper.c
 with clang-cl

---
 external/gau2grid/generated_source/gau2grid_helper.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/external/gau2grid/generated_source/gau2grid_helper.c b/external/gau2grid/generated_source/gau2grid_helper.c
index e5868df7e..31956e084 100644
--- a/external/gau2grid/generated_source/gau2grid_helper.c
+++ b/external/gau2grid/generated_source/gau2grid_helper.c
@@ -8,6 +8,7 @@
 #include <math.h>
 #if defined(__clang__) && defined(_MSC_VER)
 #include <malloc.h>
+#include <stdlib.h>
 #elif defined __clang__
 #include <mm_malloc.h>
 #elif defined _MSC_VER

From 67f9e70913f11b96df756100b68368b26d918683 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Tue, 5 May 2026 11:36:06 +0200
Subject: [PATCH 04/52] avoid newline-eof warning

---
 external/gau2grid/generated_source/gau2grid/gau2grid.h        | 2 +-
 external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h | 2 +-
 src/xc_integrator/integrator_util/spherical_harmonics.hpp     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/external/gau2grid/generated_source/gau2grid/gau2grid.h b/external/gau2grid/generated_source/gau2grid/gau2grid.h
index 29f888852..0e097a526 100644
--- a/external/gau2grid/generated_source/gau2grid/gau2grid.h
+++ b/external/gau2grid/generated_source/gau2grid/gau2grid.h
@@ -79,4 +79,4 @@ void gg_collocation_deriv3(int L, const unsigned long npoints, const double* PRA
 #ifdef __cplusplus
 }
 #endif
-#endif /* GAU2GRID_GUARD_H */
\ No newline at end of file
+#endif /* GAU2GRID_GUARD_H */
diff --git a/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h b/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h
index f6033886a..d85679263 100644
--- a/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h
+++ b/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h
@@ -96,4 +96,4 @@
     #define PRAGMA_RESTRICT                                  __restrict__
 
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/xc_integrator/integrator_util/spherical_harmonics.hpp b/src/xc_integrator/integrator_util/spherical_harmonics.hpp
index 7ce495d8d..73c6a3bcb 100644
--- a/src/xc_integrator/integrator_util/spherical_harmonics.hpp
+++ b/src/xc_integrator/integrator_util/spherical_harmonics.hpp
@@ -4,4 +4,4 @@
 #include <cassert>
 
 
-void scaled_ylm_matrix(const int lmax, const double* points, const int32_t  npts, const std::array<double, 3> center, const double radius, double* ylm_matrix);
\ No newline at end of file
+void scaled_ylm_matrix(const int lmax, const double* points, const int32_t  npts, const std::array<double, 3> center, const double radius, double* ylm_matrix);

From 8f9b3cbbb650ae70ab7cd12bcab12bf2c1b11b41 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Tue, 5 May 2026 11:36:20 +0200
Subject: [PATCH 05/52] silence noisy clang/cl warnings

---
 src/CMakeLists.txt | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1aed4b428..17c5ef749 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -78,6 +78,36 @@ if( GAUXC_CXX_HAS_WSHADOW )
   target_compile_options( gauxc PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -Wshadow> )
 endif()
 
+if(MSVC)
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    target_compile_options( gauxc PRIVATE
+      -Wno-covered-switch-default
+      -Wno-documentation
+      -Wno-documentation-unknown-command
+      -Wno-implicit-int-float-conversion
+      -Wno-language-extension-token
+      -Wno-reserved-identifier
+      -Wno-shorten-64-to-32
+      -Wno-sign-compare
+      -Wno-undef
+    )
+  else()
+    target_compile_options( gauxc PRIVATE
+      /W2
+      /wd4100  # unreferenced parameter
+      /wd4101  # unreferenced local variable
+    #   /wd4018  # signed/unsigned mismatch
+    #   /wd4100  # unreferenced parameter
+    #   /wd4189  # unreferenced local variable
+      /wd4242  # 'identifier': conversion from 'type1' to 'type2', possible loss of data
+    #   /wd4388  # signed/unsigned mismatch
+    #   /wd4464  # relative include path contains '..'
+    #   /wd4668  # undefined preprocessor macro
+      /wd5219  # implicit conversion from 'type-1' to 'type-2', possible loss of data
+    )
+  endif()
+endif()
+
 target_link_libraries( gauxc PUBLIC 
   ExchCXX::ExchCXX 
   IntegratorXX::IntegratorXX 

From 512a0655ddd86e80f3d7bd12c710385026eb394b Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Tue, 5 May 2026 14:25:24 +0200
Subject: [PATCH 06/52] update deps hashes

---
 cmake/gauxc-dep-versions.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/gauxc-dep-versions.cmake b/cmake/gauxc-dep-versions.cmake
index 032b0188b..f00e3ced6 100644
--- a/cmake/gauxc-dep-versions.cmake
+++ b/cmake/gauxc-dep-versions.cmake
@@ -8,13 +8,13 @@ set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git )
 set( GAUXC_CUTLASS_REVISION v2.10.0 )
 
 set( GAUXC_EXCHCXX_REPOSITORY https://github.com/lorisercole/ExchCXX.git )
-set( GAUXC_EXCHCXX_REVISION   7d83223e72e2eb1446af87546b75cb81cfeca719 )
+set( GAUXC_EXCHCXX_REVISION   601f72eb668e0721a8452fc3eaff510f431946b0 )
 
 set( GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git )
 set( GAUXC_GAU2GRID_REVISION   v2.0.6 )
 
 set( GAUXC_INTEGRATORXX_REPOSITORY https://github.com/lorisercole/IntegratorXX.git )
-set( GAUXC_INTEGRATORXX_REVISION   60e45e74b4a8939a4b0fb9ca3e9e2a7304f9356f )
+set( GAUXC_INTEGRATORXX_REVISION   58012a0b32c45f5b403380fab594047dd4587f55 )
 
 set( GAUXC_HIGHFIVE_REPOSITORY https://github.com/highfive-devs/HighFive.git )
 set( GAUXC_HIGHFIVE_REVISION 805f0e13d09b47c4b01d40682621904aa3b31bb8 )

From 716f51383662130ed33c1e21b0f5c5d05b3d2e79 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 6 May 2026 14:12:36 +0200
Subject: [PATCH 07/52] avoid clang warning: strdup

warning C4996: 'strdup': The POSIX name for this item is deprecated. Instead, use the ISO C and C++ conformant name: _strdup.
---
 include/gauxc/exceptions.hpp         | 2 +-
 src/exceptions/cublas_exception.hpp  | 2 +-
 src/exceptions/cuda_exception.hpp    | 2 +-
 src/exceptions/cutlass_exception.hpp | 2 +-
 src/exceptions/hip_exception.hpp     | 2 +-
 src/exceptions/hipblas_exception.hpp | 2 +-
 src/exceptions/magma_exception.hpp   | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/gauxc/exceptions.hpp b/include/gauxc/exceptions.hpp
index fa6a874e8..c49d1939d 100644
--- a/include/gauxc/exceptions.hpp
+++ b/include/gauxc/exceptions.hpp
@@ -54,7 +54,7 @@ class generic_gauxc_exception : public std::exception {
         << "  Line     " << line_  << std::endl;
      auto msg = ss.str();
 
-     return strdup( msg.c_str() );
+     return _strdup( msg.c_str() );
   };
 
 public:
diff --git a/src/exceptions/cublas_exception.hpp b/src/exceptions/cublas_exception.hpp
index 503fc9007..c3378185c 100644
--- a/src/exceptions/cublas_exception.hpp
+++ b/src/exceptions/cublas_exception.hpp
@@ -96,7 +96,7 @@ class cublas_exception : public std::exception {
 
      auto msg = ss.str();
 
-     return strdup( msg.c_str() );
+     return _strdup( msg.c_str() );
   }
 
 public:
diff --git a/src/exceptions/cuda_exception.hpp b/src/exceptions/cuda_exception.hpp
index 6d4767d1a..02746690e 100644
--- a/src/exceptions/cuda_exception.hpp
+++ b/src/exceptions/cuda_exception.hpp
@@ -48,7 +48,7 @@ class cuda_exception : public std::exception {
 
      auto msg = ss.str();
 
-     return strdup( msg.c_str() );
+     return _strdup( msg.c_str() );
   }
 
 public:
diff --git a/src/exceptions/cutlass_exception.hpp b/src/exceptions/cutlass_exception.hpp
index 4de854bef..7b7697a03 100644
--- a/src/exceptions/cutlass_exception.hpp
+++ b/src/exceptions/cutlass_exception.hpp
@@ -48,7 +48,7 @@ class cutlass_exception : public std::exception {
 
      auto msg = ss.str();
 
-     return strdup( msg.c_str() );
+     return _strdup( msg.c_str() );
   }
 
 public:
diff --git a/src/exceptions/hip_exception.hpp b/src/exceptions/hip_exception.hpp
index 08a403022..b16ab6040 100644
--- a/src/exceptions/hip_exception.hpp
+++ b/src/exceptions/hip_exception.hpp
@@ -48,7 +48,7 @@ class hip_exception : public std::exception {
 
      auto msg = ss.str();
 
-     return strdup( msg.c_str() );
+     return _strdup( msg.c_str() );
   }
 
 public:
diff --git a/src/exceptions/hipblas_exception.hpp b/src/exceptions/hipblas_exception.hpp
index bb89a3316..392dd277e 100644
--- a/src/exceptions/hipblas_exception.hpp
+++ b/src/exceptions/hipblas_exception.hpp
@@ -103,7 +103,7 @@ class hipblas_exception : public std::exception {
 
      auto msg = ss.str();
 
-     return strdup( msg.c_str() );
+     return _strdup( msg.c_str() );
   }
 
 public:
diff --git a/src/exceptions/magma_exception.hpp b/src/exceptions/magma_exception.hpp
index 300565735..3ef42e7ce 100644
--- a/src/exceptions/magma_exception.hpp
+++ b/src/exceptions/magma_exception.hpp
@@ -46,7 +46,7 @@ class magma_exception : public std::exception {
 
      auto msg = ss.str();
 
-     return strdup( msg.c_str() );
+     return _strdup( msg.c_str() );
   }
 
 public:

From 5e57b86e77ab56985a97e3151579c2eee49dec1b Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 6 May 2026 15:38:31 +0200
Subject: [PATCH 08/52] avoid newline-eof warning

---
 src/xc_integrator/integrator_util/spherical_harmonics.cxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/xc_integrator/integrator_util/spherical_harmonics.cxx b/src/xc_integrator/integrator_util/spherical_harmonics.cxx
index bbc838e10..d84adc7fa 100644
--- a/src/xc_integrator/integrator_util/spherical_harmonics.cxx
+++ b/src/xc_integrator/integrator_util/spherical_harmonics.cxx
@@ -168,4 +168,4 @@ void scaled_ylm_matrix(const int lmax, const double* points, const int32_t  npts
     const std::array<double, 3> x = {points[3 * i], points[3 * i + 1], points[3 * i + 2]};
     scaled_ylm_new(lmax, x, center, radius, nlm, ylm_matrix + i * nharmonics);
   }
-}
\ No newline at end of file
+}

From 4e18eb1c4fc3b7bc1d2f91c59d8a4826b0997a4f Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Thu, 7 May 2026 11:15:12 +0200
Subject: [PATCH 09/52] cleanup

---
 src/CMakeLists.txt | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 17c5ef749..944820a4d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -96,13 +96,7 @@ if(MSVC)
       /W2
       /wd4100  # unreferenced parameter
       /wd4101  # unreferenced local variable
-    #   /wd4018  # signed/unsigned mismatch
-    #   /wd4100  # unreferenced parameter
-    #   /wd4189  # unreferenced local variable
       /wd4242  # 'identifier': conversion from 'type1' to 'type2', possible loss of data
-    #   /wd4388  # signed/unsigned mismatch
-    #   /wd4464  # relative include path contains '..'
-    #   /wd4668  # undefined preprocessor macro
       /wd5219  # implicit conversion from 'type-1' to 'type-2', possible loss of data
     )
   endif()

From f40693a22abeed38b7c17806163044e0b9d1ceea Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Fri, 5 Jun 2026 17:29:04 +0200
Subject: [PATCH 10/52] fix(msvc): add /EHsc and align MSVC warning
 suppressions

- Add /EHsc PUBLIC to gauxc target so all consumers (test executables)
  inherit it and avoid C4530 from MSVC STL headers
- Add /EHsc PRIVATE to Catch2WithMain (compiled FetchContent target that
  does not inherit from gauxc)
- Remove /W2 (it overrode the /W3 set by the build script), and remove
  /wd4100 and /wd4242 (they suppressed warnings that the mandatory
  /w14100 and /w14242 flags enable)
- Keep /wd4101 and /wd5219 (not in mandatory warning list)
---
 src/CMakeLists.txt   | 6 ++----
 tests/CMakeLists.txt | 3 +++
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 944820a4d..ae9621690 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -92,12 +92,10 @@ if(MSVC)
       -Wno-undef
     )
   else()
+    target_compile_options( gauxc PUBLIC /EHsc )
     target_compile_options( gauxc PRIVATE
-      /W2
-      /wd4100  # unreferenced parameter
       /wd4101  # unreferenced local variable
-      /wd4242  # 'identifier': conversion from 'type1' to 'type2', possible loss of data
-      /wd5219  # implicit conversion from 'type-1' to 'type-2', possible loss of data
+      /wd5219  # implicit conversion from int-type to float-type
     )
   endif()
 endif()
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 46dbe487d..76264dc9f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -33,6 +33,9 @@ if( NOT Catch2_FOUND )
   set(CATCH_INSTALL_HELPERS OFF CACHE BOOL "Install contrib alongside library" FORCE)
 
   FetchContent_MakeAvailable( catch2 )
+  if(MSVC)
+    target_compile_options(Catch2WithMain PRIVATE /EHsc)
+  endif()
   target_link_libraries( gauxc_catch2 INTERFACE Catch2::Catch2 )
 
 else()

From 68c54d974f8273ad912ab67e949f692b2980c772 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Fri, 5 Jun 2026 17:36:43 +0200
Subject: [PATCH 11/52] fix: replace strdup with stored std::string in
 exception what()

Build the error message once in the constructor and store it in a
std::string what_msg_ member. what() returns what_msg_.c_str(),
avoiding the POSIX-deprecated strdup/non-portable _strdup entirely.

Applies to all 7 exception classes: generic_gauxc_exception,
cuda_exception, cublas_exception, cutlass_exception, hip_exception,
hipblas_exception, magma_exception.
---
 include/gauxc/exceptions.hpp         | 23 +++++++++++------------
 src/exceptions/cublas_exception.hpp  | 26 +++++++++++++-------------
 src/exceptions/cuda_exception.hpp    | 22 +++++++++++-----------
 src/exceptions/cutlass_exception.hpp | 22 +++++++++++-----------
 src/exceptions/hip_exception.hpp     | 22 +++++++++++-----------
 src/exceptions/hipblas_exception.hpp | 26 +++++++++++++-------------
 src/exceptions/magma_exception.hpp   | 22 +++++++++++-----------
 7 files changed, 81 insertions(+), 82 deletions(-)

diff --git a/include/gauxc/exceptions.hpp b/include/gauxc/exceptions.hpp
index c49d1939d..e0d7ec09d 100644
--- a/include/gauxc/exceptions.hpp
+++ b/include/gauxc/exceptions.hpp
@@ -14,8 +14,6 @@
 #include <stdexcept>
 #include <string>
 #include <sstream>
-#include <string.h>
-
 namespace GauXC {
 
 // FWD decl all exception types for optional handling
@@ -45,16 +43,10 @@ class generic_gauxc_exception : public std::exception {
   std::string function_;
   int         line_;
   std::string msg_prefix_;
+  std::string what_msg_;
 
   const char* what() const noexcept override {
-     std::stringstream ss;
-     ss << "Generic GauXC Exception (" << msg_prefix_ << ")" << std::endl
-        << "  File     " << file_ << std::endl
-        << "  Function " << function_ << std::endl
-        << "  Line     " << line_  << std::endl;
-     auto msg = ss.str();
-
-     return _strdup( msg.c_str() );
+     return what_msg_.c_str();
   };
 
 public:
@@ -67,9 +59,16 @@ class generic_gauxc_exception : public std::exception {
    *  @param[in] line Line number of file that threw exception
    *  @param[in] msg  General descriptor of task which threw exception
    */
-  generic_gauxc_exception( std::string file, std::string function, int line, 
+  generic_gauxc_exception( std::string file, std::string function, int line,
     std::string msg ) :
-    file_(file), function_(function), line_(line), msg_prefix_(msg) {} 
+    file_(file), function_(function), line_(line), msg_prefix_(msg) {
+    std::stringstream ss;
+    ss << "Generic GauXC Exception (" << msg_prefix_ << ")" << std::endl
+       << "  File     " << file_ << std::endl
+       << "  Function " << function_ << std::endl
+       << "  Line     " << line_  << std::endl;
+    what_msg_ = ss.str();
+  }
 
 };
 
diff --git a/src/exceptions/cublas_exception.hpp b/src/exceptions/cublas_exception.hpp
index c3378185c..3d294a0cd 100644
--- a/src/exceptions/cublas_exception.hpp
+++ b/src/exceptions/cublas_exception.hpp
@@ -78,6 +78,7 @@ class cublas_exception : public std::exception {
   int         line_;         ///< Line number of file_ that threw exception
   std::string msg_prefix_;   ///< General descriptor of task which threw exception
   cublasStatus_t err_code_;  ///< cuBLAS error code pertaining to the thrown exception
+  std::string what_msg_;
 
   /**
    *  @brief Get a descriptive message pertaining to the thrown cuBLAS error
@@ -86,17 +87,7 @@ class cublas_exception : public std::exception {
    *  the internal state of the exception object.
    */
   const char* what() const noexcept override {
-     std::stringstream ss;
-     ss << "CUBLAS Exception (" << msg_prefix_ << ")" << std::endl
-        << "  Error Code " << int(err_code_) << ": \"" 
-                           << detail::cublasGetErrorString( err_code_ ) 
-                           << "\"" << std::endl
-        << "  File       " << file_ << std::endl
-        << "  Line       " << line_ << std::endl;
-
-     auto msg = ss.str();
-
-     return _strdup( msg.c_str() );
+     return what_msg_.c_str();
   }
 
 public:
@@ -109,9 +100,18 @@ class cublas_exception : public std::exception {
    *  @param[in] msg  General descriptor of task which threw exception
    *  @param[in] err  cuBLAS error code pertaining to the thrown exception
    */
-  cublas_exception( std::string file, int line, std::string msg, 
+  cublas_exception( std::string file, int line, std::string msg,
                     cublasStatus_t err ) :
-    file_(file), line_(line), msg_prefix_(msg), err_code_(err) { }
+    file_(file), line_(line), msg_prefix_(msg), err_code_(err) {
+    std::stringstream ss;
+    ss << "CUBLAS Exception (" << msg_prefix_ << ")" << std::endl
+       << "  Error Code " << int(err_code_) << ": \""
+                          << detail::cublasGetErrorString( err_code_ )
+                          << "\"" << std::endl
+       << "  File       " << file_ << std::endl
+       << "  Line       " << line_ << std::endl;
+    what_msg_ = ss.str();
+  }
 
 }; // class cublas_exception
 
diff --git a/src/exceptions/cuda_exception.hpp b/src/exceptions/cuda_exception.hpp
index 02746690e..9a9b9103a 100644
--- a/src/exceptions/cuda_exception.hpp
+++ b/src/exceptions/cuda_exception.hpp
@@ -31,6 +31,7 @@ class cuda_exception : public std::exception {
   int         line_;       ///< Line number of file_ that threw exception
   std::string msg_prefix_; ///< General descriptor of task which threw exception
   cudaError_t err_code_;   ///< CUDA error code pertaining to the thrown exception
+  std::string what_msg_;
 
   /**
    *  @brief Get a descriptive message pertaining to the thrown CUDA error
@@ -39,16 +40,7 @@ class cuda_exception : public std::exception {
    *  the internal state of the exception object.
    */
   const char* what() const noexcept override {
-     std::stringstream ss;
-     ss << "CUDA Exception (" << msg_prefix_ << ")" << std::endl
-        << "  Error Code " << int(err_code_) << ": \"" 
-                           << cudaGetErrorString( err_code_ ) << "\"" << std::endl
-        << "  File       " << file_ << std::endl
-        << "  Line       " << line_ << std::endl;
-
-     auto msg = ss.str();
-
-     return _strdup( msg.c_str() );
+     return what_msg_.c_str();
   }
 
 public:
@@ -62,7 +54,15 @@ class cuda_exception : public std::exception {
    *  @param[in] err  CUDA error code pertaining to the thrown exception
    */
   cuda_exception( std::string file, int line, std::string msg, cudaError_t err ) :
-    file_(file), line_(line), msg_prefix_(msg), err_code_(err) { }
+    file_(file), line_(line), msg_prefix_(msg), err_code_(err) {
+    std::stringstream ss;
+    ss << "CUDA Exception (" << msg_prefix_ << ")" << std::endl
+       << "  Error Code " << int(err_code_) << ": \""
+                          << cudaGetErrorString( err_code_ ) << "\"" << std::endl
+       << "  File       " << file_ << std::endl
+       << "  Line       " << line_ << std::endl;
+    what_msg_ = ss.str();
+  }
 
 }; // class cuda_exception
 
diff --git a/src/exceptions/cutlass_exception.hpp b/src/exceptions/cutlass_exception.hpp
index 7b7697a03..2a76983e7 100644
--- a/src/exceptions/cutlass_exception.hpp
+++ b/src/exceptions/cutlass_exception.hpp
@@ -31,6 +31,7 @@ class cutlass_exception : public std::exception {
   int         line_;       ///< Line number of file_ that threw exception
   std::string msg_prefix_; ///< General descriptor of task which threw exception
   cutlass::Status status_; ///< CUTLASS status pertaining to the thrown exception
+  std::string what_msg_;
 
   /**
    *  @brief Get a descriptive message pertaining to the thrown CUTLASS error
@@ -39,16 +40,7 @@ class cutlass_exception : public std::exception {
    *  the internal state of the exception object.
    */
   const char* what() const noexcept override {
-     std::stringstream ss;
-     ss << "CUTLASS Exception (" << msg_prefix_ << ")" << std::endl
-        << "  Error Code " << int(status_) << ": \"" 
-                           << cutlassGetStatusString( status_ ) << "\"" << std::endl
-        << "  File       " << file_ << std::endl
-        << "  Line       " << line_ << std::endl;
-
-     auto msg = ss.str();
-
-     return _strdup( msg.c_str() );
+     return what_msg_.c_str();
   }
 
 public:
@@ -62,7 +54,15 @@ class cutlass_exception : public std::exception {
    *  @param[in] err  CUTLASS status pertaining to the thrown exception
    */
   cutlass_exception( std::string file, int line, std::string msg, cutlass::Status status ) :
-    file_(file), line_(line), msg_prefix_(msg), status_(status) { }
+    file_(file), line_(line), msg_prefix_(msg), status_(status) {
+    std::stringstream ss;
+    ss << "CUTLASS Exception (" << msg_prefix_ << ")" << std::endl
+       << "  Error Code " << int(status_) << ": \""
+                          << cutlassGetStatusString( status_ ) << "\"" << std::endl
+       << "  File       " << file_ << std::endl
+       << "  Line       " << line_ << std::endl;
+    what_msg_ = ss.str();
+  }
 
 }; // class cutlass_exception
 
diff --git a/src/exceptions/hip_exception.hpp b/src/exceptions/hip_exception.hpp
index b16ab6040..770bb772f 100644
--- a/src/exceptions/hip_exception.hpp
+++ b/src/exceptions/hip_exception.hpp
@@ -31,6 +31,7 @@ class hip_exception : public std::exception {
   int         line_;       ///< Line number of file_ that threw exception
   std::string msg_prefix_; ///< General descriptor of task which threw exception
   hipError_t err_code_;   ///< HIP error code pertaining to the thrown exception
+  std::string what_msg_;
 
   /**
    *  @brief Get a descriptive message pertaining to the thrown HIP error
@@ -39,16 +40,7 @@ class hip_exception : public std::exception {
    *  the internal state of the exception object.
    */
   const char* what() const noexcept override {
-     std::stringstream ss;
-     ss << "HIP Exception (" << msg_prefix_ << ")" << std::endl
-        << "  Error Code " << int(err_code_) << ": \"" 
-                           << hipGetErrorString( err_code_ ) << "\"" << std::endl
-        << "  File       " << file_ << std::endl
-        << "  Line       " << line_ << std::endl;
-
-     auto msg = ss.str();
-
-     return _strdup( msg.c_str() );
+     return what_msg_.c_str();
   }
 
 public:
@@ -62,7 +54,15 @@ class hip_exception : public std::exception {
    *  @param[in] err  HIP error code pertaining to the thrown exception
    */
   hip_exception( std::string file, int line, std::string msg, hipError_t err ) :
-    file_(file), line_(line), msg_prefix_(msg), err_code_(err) { }
+    file_(file), line_(line), msg_prefix_(msg), err_code_(err) {
+    std::stringstream ss;
+    ss << "HIP Exception (" << msg_prefix_ << ")" << std::endl
+       << "  Error Code " << int(err_code_) << ": \""
+                          << hipGetErrorString( err_code_ ) << "\"" << std::endl
+       << "  File       " << file_ << std::endl
+       << "  Line       " << line_ << std::endl;
+    what_msg_ = ss.str();
+  }
 
 }; // class hip_exception
 
diff --git a/src/exceptions/hipblas_exception.hpp b/src/exceptions/hipblas_exception.hpp
index 392dd277e..388954762 100644
--- a/src/exceptions/hipblas_exception.hpp
+++ b/src/exceptions/hipblas_exception.hpp
@@ -85,6 +85,7 @@ class hipblas_exception : public std::exception {
   int         line_;         ///< Line number of file_ that threw exception
   std::string msg_prefix_;   ///< General descriptor of task which threw exception
   hipblasStatus_t err_code_;  ///< hipBLAS error code pertaining to the thrown exception
+  std::string what_msg_;
 
   /**
    *  @brief Get a descriptive message pertaining to the thrown hipBLAS error
@@ -93,17 +94,7 @@ class hipblas_exception : public std::exception {
    *  the internal state of the exception object.
    */
   const char* what() const noexcept override {
-     std::stringstream ss;
-     ss << "HIPBLAS Exception (" << msg_prefix_ << ")" << std::endl
-        << "  Error Code " << int(err_code_) << ": \"" 
-                           << detail::hipblasGetErrorString( err_code_ ) 
-                           << "\"" << std::endl
-        << "  File       " << file_ << std::endl
-        << "  Line       " << line_ << std::endl;
-
-     auto msg = ss.str();
-
-     return _strdup( msg.c_str() );
+     return what_msg_.c_str();
   }
 
 public:
@@ -116,9 +107,18 @@ class hipblas_exception : public std::exception {
    *  @param[in] msg  General descriptor of task which threw exception
    *  @param[in] err  hipBLAS error code pertaining to the thrown exception
    */
-  hipblas_exception( std::string file, int line, std::string msg, 
+  hipblas_exception( std::string file, int line, std::string msg,
                     hipblasStatus_t err ) :
-    file_(file), line_(line), msg_prefix_(msg), err_code_(err) { }
+    file_(file), line_(line), msg_prefix_(msg), err_code_(err) {
+    std::stringstream ss;
+    ss << "HIPBLAS Exception (" << msg_prefix_ << ")" << std::endl
+       << "  Error Code " << int(err_code_) << ": \""
+                          << detail::hipblasGetErrorString( err_code_ )
+                          << "\"" << std::endl
+       << "  File       " << file_ << std::endl
+       << "  Line       " << line_ << std::endl;
+    what_msg_ = ss.str();
+  }
 
 }; // class hipblas_exception
 
diff --git a/src/exceptions/magma_exception.hpp b/src/exceptions/magma_exception.hpp
index 3ef42e7ce..f1e811a03 100644
--- a/src/exceptions/magma_exception.hpp
+++ b/src/exceptions/magma_exception.hpp
@@ -29,6 +29,7 @@ class magma_exception : public std::exception {
   int         line_;       ///< Line number of file_ that threw exception
   std::string msg_prefix_; ///< General descriptor of task which threw exception
   magma_int_t err_code_;   ///< MAGMA error code pertaining to the thrown exception
+  std::string what_msg_;
 
   /**
    *  @brief Get a descriptive message pertaining to the thrown MAGMA error
@@ -37,16 +38,7 @@ class magma_exception : public std::exception {
    *  the internal state of the exception object.
    */
   const char* what() const noexcept override {
-     std::stringstream ss;
-     ss << "MAGMA Exception (" << msg_prefix_ << ")" << std::endl
-        << "  Error Code " << int(err_code_) << ": \"" 
-                           << magma_strerror( err_code_ ) << "\"" << std::endl
-        << "  File       " << file_ << std::endl
-        << "  Line       " << line_ << std::endl;
-
-     auto msg = ss.str();
-
-     return _strdup( msg.c_str() );
+     return what_msg_.c_str();
   }
 
 public:
@@ -60,7 +52,15 @@ class magma_exception : public std::exception {
    *  @param[in] err  MAGMA error code pertaining to the thrown exception
    */
   magma_exception( std::string file, int line, std::string msg, magma_int_t err ) :
-    file_(file), line_(line), msg_prefix_(msg), err_code_(err) { }
+    file_(file), line_(line), msg_prefix_(msg), err_code_(err) {
+    std::stringstream ss;
+    ss << "MAGMA Exception (" << msg_prefix_ << ")" << std::endl
+       << "  Error Code " << int(err_code_) << ": \""
+                          << magma_strerror( err_code_ ) << "\"" << std::endl
+       << "  File       " << file_ << std::endl
+       << "  Line       " << line_ << std::endl;
+    what_msg_ = ss.str();
+  }
 
 }; // class magma_exception
 

From 872a03ce36c8989f68e27d9e177e184e478d834e Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Fri, 5 Jun 2026 17:40:58 +0200
Subject: [PATCH 12/52] chore: update IntegratorXX and ExchCXX to latest
 Windows-fix branches

---
 cmake/gauxc-dep-versions.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/gauxc-dep-versions.cmake b/cmake/gauxc-dep-versions.cmake
index f00e3ced6..cf3e24563 100644
--- a/cmake/gauxc-dep-versions.cmake
+++ b/cmake/gauxc-dep-versions.cmake
@@ -8,13 +8,13 @@ set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git )
 set( GAUXC_CUTLASS_REVISION v2.10.0 )
 
 set( GAUXC_EXCHCXX_REPOSITORY https://github.com/lorisercole/ExchCXX.git )
-set( GAUXC_EXCHCXX_REVISION   601f72eb668e0721a8452fc3eaff510f431946b0 )
+set( GAUXC_EXCHCXX_REVISION   20a2d6052bc5b1bac4a0d028f3735056e5345dac )
 
 set( GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git )
 set( GAUXC_GAU2GRID_REVISION   v2.0.6 )
 
 set( GAUXC_INTEGRATORXX_REPOSITORY https://github.com/lorisercole/IntegratorXX.git )
-set( GAUXC_INTEGRATORXX_REVISION   58012a0b32c45f5b403380fab594047dd4587f55 )
+set( GAUXC_INTEGRATORXX_REVISION   81e283d20eb3ce3bca49e79926a92801c642c2c5 )
 
 set( GAUXC_HIGHFIVE_REPOSITORY https://github.com/highfive-devs/HighFive.git )
 set( GAUXC_HIGHFIVE_REVISION 805f0e13d09b47c4b01d40682621904aa3b31bb8 )

From 0e919843fcc9a404c58719298a03c3152b68b544 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Fri, 5 Jun 2026 17:44:04 +0200
Subject: [PATCH 13/52] fix(msvc): conditionally check for -Wall flag under
 non-MSVC compilers

---
 src/CMakeLists.txt | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ae9621690..b4d0f9ab5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -52,7 +52,9 @@ target_include_directories( gauxc
 )
 
 include( CheckCXXCompilerFlag )
-check_cxx_compiler_flag( -Wall              GAUXC_CXX_HAS_WALL              )
+if(NOT MSVC)  # under clang-cl, -Wall maps to /Wall (= -Weverything)
+  check_cxx_compiler_flag( -Wall              GAUXC_CXX_HAS_WALL            )
+endif()
 check_cxx_compiler_flag( -Wextra            GAUXC_CXX_HAS_WEXTRA            )
 check_cxx_compiler_flag( -Wpedantic         GAUXC_CXX_HAS_WPEDANTIC         )
 check_cxx_compiler_flag( -Wnon-virtual-dtor GAUXC_CXX_HAS_WNON_VIRTUAL_DTOR )
@@ -78,27 +80,27 @@ if( GAUXC_CXX_HAS_WSHADOW )
   target_compile_options( gauxc PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -Wshadow> )
 endif()
 
-if(MSVC)
-  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    target_compile_options( gauxc PRIVATE
-      -Wno-covered-switch-default
-      -Wno-documentation
-      -Wno-documentation-unknown-command
-      -Wno-implicit-int-float-conversion
-      -Wno-language-extension-token
-      -Wno-reserved-identifier
-      -Wno-shorten-64-to-32
-      -Wno-sign-compare
-      -Wno-undef
-    )
-  else()
-    target_compile_options( gauxc PUBLIC /EHsc )
-    target_compile_options( gauxc PRIVATE
-      /wd4101  # unreferenced local variable
-      /wd5219  # implicit conversion from int-type to float-type
-    )
-  endif()
-endif()
+# if(MSVC)
+#   if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+#     target_compile_options( gauxc PRIVATE
+#       -Wno-covered-switch-default
+#       -Wno-documentation
+#       -Wno-documentation-unknown-command
+#       -Wno-implicit-int-float-conversion
+#       -Wno-language-extension-token
+#       -Wno-reserved-identifier
+#       -Wno-shorten-64-to-32
+#       -Wno-sign-compare
+#       -Wno-undef
+#     )
+#   else()
+#     target_compile_options( gauxc PUBLIC /EHsc )
+#     target_compile_options( gauxc PRIVATE
+#       /wd4101  # unreferenced local variable
+#       /wd5219  # implicit conversion from int-type to float-type
+#     )
+#   endif()
+# endif()
 
 target_link_libraries( gauxc PUBLIC 
   ExchCXX::ExchCXX 

From 79b03f198dca46442fe75f0aeece228a02537f86 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Mon, 8 Jun 2026 16:27:09 +0200
Subject: [PATCH 14/52] =?UTF-8?q?fix(msvc):=20remove=20Catch2WithMain=20/E?=
 =?UTF-8?q?Hsc=20=E2=80=94=20target=20only=20exists=20with=20CATCH=5FBUILD?=
 =?UTF-8?q?=5FSTATIC=5FLIBRARY=3DON?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Catch2 v2 creates Catch2WithMain only when CATCH_BUILD_STATIC_LIBRARY is
ON (off by default). gauxc_test inherits /EHsc transitively from gauxc
PUBLIC /EHsc, so no explicit override is needed.
---
 tests/CMakeLists.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 76264dc9f..46dbe487d 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -33,9 +33,6 @@ if( NOT Catch2_FOUND )
   set(CATCH_INSTALL_HELPERS OFF CACHE BOOL "Install contrib alongside library" FORCE)
 
   FetchContent_MakeAvailable( catch2 )
-  if(MSVC)
-    target_compile_options(Catch2WithMain PRIVATE /EHsc)
-  endif()
   target_link_libraries( gauxc_catch2 INTERFACE Catch2::Catch2 )
 
 else()

From 388decdda69d225479860338b02967e0a2803f2d Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Mon, 8 Jun 2026 17:59:14 +0200
Subject: [PATCH 15/52] Fix C4100 unused-parameter warnings for MSVC build

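Add [[maybe_unused]] to parameters that are unused in non-OpenMP or
non-gencon code paths (e.g., ReductionOp, ldtps, delim), and pass the
remaining unused parameters (settings, ks_settings) to the existing
util::unused(...) calls in the ShellBatched NYI stubs.

Also re-enable the cl.exe branch of the MSVC compile options in
src/CMakeLists.txt (/EHsc, /wd4101, /wd5219); the clang-cl -Wno-* list
stays commented out. A few implicit integer conversions are made
explicit with static_cast.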
---
 src/CMakeLists.txt                            | 42 +++++++++----------
 .../host/basic_mpi_reduction_driver.cxx       |  6 +--
 ...ted_xc_host_integrator_fxc_contraction.hpp |  8 ++--
 ...ched_replicated_xc_integrator_exc_grad.hpp |  4 +-
 ...plicated_xc_integrator_fxc_contraction.hpp |  2 +-
 tests/basis/parse_basis.cxx                   |  4 +-
 6 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b4d0f9ab5..236b30bb4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -80,27 +80,27 @@ if( GAUXC_CXX_HAS_WSHADOW )
   target_compile_options( gauxc PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -Wshadow> )
 endif()
 
-# if(MSVC)
-#   if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-#     target_compile_options( gauxc PRIVATE
-#       -Wno-covered-switch-default
-#       -Wno-documentation
-#       -Wno-documentation-unknown-command
-#       -Wno-implicit-int-float-conversion
-#       -Wno-language-extension-token
-#       -Wno-reserved-identifier
-#       -Wno-shorten-64-to-32
-#       -Wno-sign-compare
-#       -Wno-undef
-#     )
-#   else()
-#     target_compile_options( gauxc PUBLIC /EHsc )
-#     target_compile_options( gauxc PRIVATE
-#       /wd4101  # unreferenced local variable
-#       /wd5219  # implicit conversion from int-type to float-type
-#     )
-#   endif()
-# endif()
+if(MSVC)
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    # target_compile_options( gauxc PRIVATE
+    #   -Wno-covered-switch-default
+    #   -Wno-documentation
+    #   -Wno-documentation-unknown-command
+    #   -Wno-implicit-int-float-conversion
+    #   -Wno-language-extension-token
+    #   -Wno-reserved-identifier
+    #   -Wno-shorten-64-to-32
+    #   -Wno-sign-compare
+    #   -Wno-undef
+    # )
+  else()
+    target_compile_options( gauxc PUBLIC /EHsc )
+    target_compile_options( gauxc PRIVATE
+      /wd4101  # unreferenced local variable
+      /wd5219  # implicit conversion from int-type to float-type
+    )
+  endif()
+endif()
 
 target_link_libraries( gauxc PUBLIC 
   ExchCXX::ExchCXX 
diff --git a/src/reduction_driver/host/basic_mpi_reduction_driver.cxx b/src/reduction_driver/host/basic_mpi_reduction_driver.cxx
index 904f7caf0..f4bc66f58 100644
--- a/src/reduction_driver/host/basic_mpi_reduction_driver.cxx
+++ b/src/reduction_driver/host/basic_mpi_reduction_driver.cxx
@@ -60,8 +60,8 @@ BasicMPIReductionDriver::~BasicMPIReductionDriver() noexcept = default;
 BasicMPIReductionDriver::BasicMPIReductionDriver(const BasicMPIReductionDriver&) = default;
 
 
-void BasicMPIReductionDriver::allreduce_typeerased( const void* src, void* dest, 
-  size_t size, ReductionOp op, std::type_index idx, std::any optional_args )  {
+void BasicMPIReductionDriver::allreduce_typeerased( const void* src, void* dest,
+  size_t size, [[maybe_unused]] ReductionOp op, std::type_index idx, std::any optional_args )  {
 
   if( optional_args.has_value() )
     std::cout << "** Warning: Optional Args Are Not Used in BasiMPIReductionDriver::allreduce" << std::endl;
@@ -79,7 +79,7 @@ void BasicMPIReductionDriver::allreduce_typeerased( const void* src, void* dest,
 
 }
 void BasicMPIReductionDriver::allreduce_inplace_typeerased( void* data, size_t size,
-  ReductionOp op, std::type_index idx, std::any optional_args ) {
+  [[maybe_unused]] ReductionOp op, std::type_index idx, std::any optional_args ) {
 
   if( optional_args.has_value() )
     std::cout << "** Warning: Optional Args Are Not Used in BasiMPIReductionDriver::allreduce" << std::endl;
diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_fxc_contraction.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_fxc_contraction.hpp
index 192fe0f89..7320039c7 100644
--- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_fxc_contraction.hpp
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_fxc_contraction.hpp
@@ -93,8 +93,8 @@ template <typename ValueType>
 void ReferenceReplicatedXCHostIntegrator<ValueType>::
   fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps,
                             const value_type* Pz, int64_t ldpz,
-                            const value_type* tPs, int64_t ldtps,
-                            const value_type* tPz, int64_t ldtpz,
+                            const value_type* tPs, [[maybe_unused]] int64_t ldtps,
+                            const value_type* tPz, [[maybe_unused]] int64_t ldtpz,
                             value_type* FXCs, int64_t ldfxcs,
                             value_type* FXCz, int64_t ldfxcz,
                             value_type *N_EL, const IntegratorSettingsXC& settings,
@@ -178,9 +178,9 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
     const auto& task = *(task_begin + iT);
 
     // Get tasks constants
-    const int32_t  npts    = task.points.size();
+    const int32_t  npts    = static_cast<int32_t>(task.points.size());
     const int32_t  nbe     = task.bfn_screening.nbe;
-    const int32_t  nshells = task.bfn_screening.shell_list.size();
+    const int32_t  nshells = static_cast<int32_t>(task.bfn_screening.shell_list.size());
 
     const auto* points      = task.points.data()->data();
     const auto* weights     = task.weights.data();
diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_grad.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_grad.hpp
index f329bc025..43a1b0343 100644
--- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_grad.hpp
+++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_grad.hpp
@@ -22,7 +22,7 @@ void ShellBatchedReplicatedXCIntegrator<BaseIntegratorType, IncoreIntegratorType
   eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldp, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) { 
                  
   GAUXC_GENERIC_EXCEPTION("ShellBatched exc_grad NYI" );                 
-  util::unused(m,n,P,ldp,EXC_GRAD);
+  util::unused(m,n,P,ldp,EXC_GRAD,settings);
 }
 
 template <typename BaseIntegratorType, typename IncoreIntegratorType>
@@ -31,7 +31,7 @@ void ShellBatchedReplicatedXCIntegrator<BaseIntegratorType, IncoreIntegratorType
                   const value_type* Pz, int64_t lpdz, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) { 
                  
   GAUXC_GENERIC_EXCEPTION("ShellBatched exc_grad NYI" );                 
-  util::unused(m,n,Ps,ldps,Pz,lpdz,EXC_GRAD);
+  util::unused(m,n,Ps,ldps,Pz,lpdz,EXC_GRAD,settings);
 }
 
 }
diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_fxc_contraction.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_fxc_contraction.hpp
index 289de9600..6dc916cb1 100644
--- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_fxc_contraction.hpp
+++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_fxc_contraction.hpp
@@ -41,7 +41,7 @@ void ShellBatchedReplicatedXCIntegrator<BaseIntegratorType, IncoreIntegratorType
                         const IntegratorSettingsXC& ks_settings ) {
   GAUXC_GENERIC_EXCEPTION("ShellBatched FXC contraction NYI");            
   util::unused(m,n,Ps,ldps,Pz,ldpz,tPs,ldtps,tPz,ldtpz,
-                 FXCs,ldfxcs,FXCz,ldfxcz);
+                 FXCs,ldfxcs,FXCz,ldfxcz,ks_settings);
 
 }
 
diff --git a/tests/basis/parse_basis.cxx b/tests/basis/parse_basis.cxx
index 0bf4cd8ec..00373d7d7 100644
--- a/tests/basis/parse_basis.cxx
+++ b/tests/basis/parse_basis.cxx
@@ -131,7 +131,7 @@ std::map<std::string,int> am_map = {
 
 namespace detail {
   inline static auto tokenize( std::string str,
-                               std::string delim = " " ) {
+                               [[maybe_unused]] std::string delim = " " ) {
     std::istringstream iss(str);
     std::vector<std::string> tokens;
 
@@ -254,7 +254,7 @@ BasisSet<double> parse_basis( const Molecule& mol,
   BasisSet<double> basis;
   for( auto iAt = 0; iAt < mol.size(); ++iAt ) {
     const auto& atom = mol.at(iAt);
-    BasisSet<double> atom_basis = basis_shells.at(atom.Z.get());
+    BasisSet<double> atom_basis = basis_shells.at(static_cast<int>(atom.Z.get()));
     for( auto& sh : atom_basis ) sh.O() = {atom.x, atom.y, atom.z};
     
     basis.insert(basis.end(), atom_basis.begin(), atom_basis.end() );

From 891a4ffbf22920bffe64d7da124ad385b0adbe61 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Tue, 9 Jun 2026 09:47:04 +0200
Subject: [PATCH 16/52] Fix C4244 implicit-conversion warnings for MSVC build

- (atomic_radii.cxx is not touched by this patch: the change of pm_to_bohr
  to return double (was long double), which eliminates the implicit
  long double->double narrowing at all call sites, is made in the
  follow-up C4018/C4244 commit)
- config_obara_saika.hpp: explicit cast of std::floor result to int
- shell_batched_xc_integrator.cxx: explicit cast of std::max result to uint32_t
- standalone_driver.cxx: explicit cast of int N_EL_ref to double
---
 .../host/obara_saika/src/config_obara_saika.hpp               | 2 +-
 .../shell_batched/shell_batched_xc_integrator.cxx             | 4 ++--
 tests/standalone_driver.cxx                                   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/config_obara_saika.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/config_obara_saika.hpp
index 8b7cee2a2..b32ac2e57 100644
--- a/src/xc_integrator/local_work_driver/host/obara_saika/src/config_obara_saika.hpp
+++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/config_obara_saika.hpp
@@ -87,7 +87,7 @@ namespace XCPU {
 	  constexpr double deltaT = double(DEFAULT_MAX_T) / DEFAULT_NSEGMENT;
 	  constexpr double one_over_deltaT = 1 / deltaT;
 
-	  int iseg = std::floor(T[i] * one_over_deltaT);
+	  int iseg = static_cast<int>(std::floor(T[i] * one_over_deltaT));
 	  const double* boys_seg = boys_m + iseg * DEFAULT_LD_TABLE;
 	  
 	  const double ratio = (2 * iseg + 1);
diff --git a/src/xc_integrator/shell_batched/shell_batched_xc_integrator.cxx b/src/xc_integrator/shell_batched/shell_batched_xc_integrator.cxx
index 4d5a31560..914f82ec7 100644
--- a/src/xc_integrator/shell_batched/shell_batched_xc_integrator.cxx
+++ b/src/xc_integrator/shell_batched/shell_batched_xc_integrator.cxx
@@ -57,8 +57,8 @@ ShellBatchedXCIntegratorBase::incore_task_data
                                    overlap_pthresh_idx.rend(), 
   [&](int idx) {
 
-    uint32_t overlap_threshold = 
-      std::max(1., max_shell_list.size() * overlap_pthresh[idx] );
+    uint32_t overlap_threshold =
+      static_cast<uint32_t>(std::max(1., max_shell_list.size() * overlap_pthresh[idx] ));
 
 
     host_task_iterator search_st = task_begin;
diff --git a/tests/standalone_driver.cxx b/tests/standalone_driver.cxx
index 68a9c13aa..0016fc96f 100644
--- a/tests/standalone_driver.cxx
+++ b/tests/standalone_driver.cxx
@@ -464,7 +464,7 @@ int main(int argc, char** argv) {
       N_EL = integrator.integrate_den( P );
       if(!world_rank) std::cout << "N_EL = " << N_EL << std::endl;
     } else {
-      N_EL = N_EL_ref;
+      N_EL = static_cast<double>(N_EL_ref);
     }
 
     if( integrate_vxc ) {

From 6fdd340d04b393799397d1d9ccd4a8ee536d71ee Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Tue, 9 Jun 2026 09:47:13 +0200
Subject: [PATCH 17/52] Fix C4018/C4244 signed/unsigned and narrowing warnings
 for MSVC build

- atomic_radii.cxx: change pm_to_bohr to return double (was long double);
  add static_cast<size_t> to fix signed/unsigned comparison with vector::size()
- tests/grid_test.cxx: cast int batch_sz to size_t in CHECK comparison
---
 src/atomic_radii.cxx | 4 ++--
 tests/grid_test.cxx  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/atomic_radii.cxx b/src/atomic_radii.cxx
index 522753092..d5a6eb921 100644
--- a/src/atomic_radii.cxx
+++ b/src/atomic_radii.cxx
@@ -28,7 +28,7 @@ double default_atomic_radius(AtomicNumber Z) {
   
 }
 
-long double pm_to_bohr( long double x ) {
+double pm_to_bohr( double x ) {
   return x * 0.0188973000000929 / 1.00000205057;
 }
 
@@ -344,7 +344,7 @@ double uff_radius_103(AtomicNumber _Z) {
     const double RADIUS_UFF_SCALING = 1.1;
     const double DDX_BOHR_TO_ANGSTROM = 0.52917721092;
     auto Z = _Z.get();
-    if (Z < 0 || Z >= radius_uff_list.size()) {
+    if (Z < 0 || static_cast<size_t>(Z) >= radius_uff_list.size()) {
         return -1.;
     }
     return radius_uff_list[Z-1] * RADIUS_UFF_SCALING / DDX_BOHR_TO_ANGSTROM;
diff --git a/tests/grid_test.cxx b/tests/grid_test.cxx
index c308adf8e..0ba8e0957 100644
--- a/tests/grid_test.cxx
+++ b/tests/grid_test.cxx
@@ -46,7 +46,7 @@ TEST_CASE("Grid", "[grid]") {
   SECTION("Full Construction") {
 
     Grid grid( mk_sphere, BatchSize(batch_sz) );
-    CHECK( grid.batcher().max_batch_size() == batch_sz );
+    CHECK( grid.batcher().max_batch_size() == static_cast<size_t>(batch_sz) );
 
     for( auto i = 0; i < mk_batch.nbatches(); ++i ) {
 

From d05d0ece1a11a028fccdccb1f5766e8ba6c6ac85 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Tue, 9 Jun 2026 09:47:24 +0200
Subject: [PATCH 18/52] Fix C4267 size_t-to-int narrowing warnings for MSVC
 build

Add explicit casts where size_t values are passed to int-typed parameters:
- gau2grid_collocation.cxx: cast npts/ncomp to unsigned long at gau2grid
  API boundary; keep size_t for pointer arithmetic to avoid 32-bit overflow
  on Windows (where unsigned long is 32-bit)
- reference_local_host_work_driver.cxx: introduce int32_t locals for all
  size_t values passed to BLAS functions (blas::gemm, blas::dot, etc.)
- Integrator headers: static_cast<int32_t> for task sizes and shell counts
  passed to BLAS and submat map functions
---
 .../host/replicated_host_load_balancer.cxx    |   4 +-
 .../integrator_util/exx_screening.cxx         |   4 +-
 .../host/reference/gau2grid_collocation.cxx   |  96 +--
 .../host/reference_local_host_work_driver.cxx | 665 ++++++++++--------
 ...e_replicated_xc_host_integrator_dd_psi.hpp |   6 +-
 ...ed_xc_host_integrator_dd_psi_potential.hpp |   6 +-
 ...replicated_xc_host_integrator_exc_grad.hpp |  14 +-
 ..._replicated_xc_host_integrator_exc_vxc.hpp |   4 +-
 ...ence_replicated_xc_host_integrator_exx.hpp |   2 +-
 ...tched_replicated_xc_integrator_exc_vxc.hpp |  32 +-
 10 files changed, 472 insertions(+), 361 deletions(-)

diff --git a/src/load_balancer/host/replicated_host_load_balancer.cxx b/src/load_balancer/host/replicated_host_load_balancer.cxx
index 8f05f186b..16cc7ea8e 100644
--- a/src/load_balancer/host/replicated_host_load_balancer.cxx
+++ b/src/load_balancer/host/replicated_host_load_balancer.cxx
@@ -69,11 +69,11 @@ std::vector< XCTask > HostReplicatedLoadBalancer::create_local_tasks_() const  {
       XCTask task;
       task.iParent    = iCurrent;
       // This enables lazy assignment of points vector (see CUDA impl)
-      task.npts       = points.size(); 
+      task.npts       = static_cast<int32_t>(points.size());
       task.points     = std::move( points );
       task.weights    = std::move( weights );
       task.bfn_screening.shell_list = std::move(shell_list);
-      task.bfn_screening.nbe        = nbe;
+      task.bfn_screening.nbe        = static_cast<int32_t>(nbe);
       task.dist_nearest = molmeta_->dist_nearest()[iCurrent];
 
       #pragma omp critical
diff --git a/src/xc_integrator/integrator_util/exx_screening.cxx b/src/xc_integrator/integrator_util/exx_screening.cxx
index f55148742..fa5b53498 100644
--- a/src/xc_integrator/integrator_util/exx_screening.cxx
+++ b/src/xc_integrator/integrator_util/exx_screening.cxx
@@ -128,8 +128,8 @@ void exx_ek_screening(
   // Compute approx F_i^(k) = |P_ij| * B_j^(k) 
   //auto gemm_st = hrt_t::now();
   std::vector<double> task_approx_f( nbf * ntasks );
-  blas::gemm( 'N', 'N', nbf, ntasks, nbf, 1., P_abs, ldp,
-    task_max_bfn.data(), nbf, 0., task_approx_f.data(), nbf );
+  blas::gemm( 'N', 'N', static_cast<int>(nbf), static_cast<int>(ntasks), static_cast<int>(nbf), 1., P_abs, static_cast<int>(ldp),
+    task_max_bfn.data(), static_cast<int>(nbf), 0., task_approx_f.data(), static_cast<int>(nbf) );
   //auto gemm_en = hrt_t::now();
   //std::cout << "... done " << dur_t(gemm_en-gemm_st).count() << std::endl;
 
diff --git a/src/xc_integrator/local_work_driver/host/reference/gau2grid_collocation.cxx b/src/xc_integrator/local_work_driver/host/reference/gau2grid_collocation.cxx
index 98f53d358..9dd6cf1ee 100644
--- a/src/xc_integrator/local_work_driver/host/reference/gau2grid_collocation.cxx
+++ b/src/xc_integrator/local_work_driver/host/reference/gau2grid_collocation.cxx
@@ -39,15 +39,15 @@ void gau2grid_collocation( size_t                  npts,
   for( size_t i = 0; i < nshells; ++i ) {
 
     const auto& sh = basis.at(shell_mask[i]);
-    int order = sh.pure() ? GG_SPHERICAL_CCA : GG_CARTESIAN_CCA; 
-    gg_collocation( sh.l(), npts, points, 3, sh.nprim(), sh.coeff_data(),
+    int order = sh.pure() ? GG_SPHERICAL_CCA : GG_CARTESIAN_CCA;
+    gg_collocation( sh.l(), static_cast<unsigned long>(npts), points, 3, sh.nprim(), sh.coeff_data(),
       sh.alpha_data(), sh.O_data(), order, rv + ncomp*npts );
 
     ncomp += sh.size();
 
   }
 
-  gg_fast_transpose( ncomp, npts, rv, basis_eval );
+  gg_fast_transpose( static_cast<unsigned long>(ncomp), static_cast<unsigned long>(npts), rv, basis_eval );
   a.deallocate( rv, npts*nbe );
 
 #else
@@ -99,19 +99,19 @@ void gau2grid_collocation_gradient( size_t                  npts,
   for( size_t i = 0; i < nshells; ++i ) {
 
     const auto& sh = basis.at(shell_mask[i]);
-    int order = sh.pure() ? GG_SPHERICAL_CCA : GG_CARTESIAN_CCA; 
-    gg_collocation_deriv1( sh.l(), npts, points, 3, sh.nprim(), sh.coeff_data(),
-      sh.alpha_data(), sh.O_data(), order, rv + ncomp*npts, 
+    int order = sh.pure() ? GG_SPHERICAL_CCA : GG_CARTESIAN_CCA;
+    gg_collocation_deriv1( sh.l(), static_cast<unsigned long>(npts), points, 3, sh.nprim(), sh.coeff_data(),
+      sh.alpha_data(), sh.O_data(), order, rv + ncomp*npts,
       rv_x + ncomp*npts, rv_y + ncomp*npts, rv_z + ncomp*npts );
 
     ncomp += sh.size();
 
   }
 
-  gg_fast_transpose( ncomp, npts, rv,   basis_eval );
-  gg_fast_transpose( ncomp, npts, rv_x, dbasis_x_eval );
-  gg_fast_transpose( ncomp, npts, rv_y, dbasis_y_eval );
-  gg_fast_transpose( ncomp, npts, rv_z, dbasis_z_eval );
+  gg_fast_transpose( static_cast<unsigned long>(ncomp), static_cast<unsigned long>(npts), rv,   basis_eval );
+  gg_fast_transpose( static_cast<unsigned long>(ncomp), static_cast<unsigned long>(npts), rv_x, dbasis_x_eval );
+  gg_fast_transpose( static_cast<unsigned long>(ncomp), static_cast<unsigned long>(npts), rv_y, dbasis_y_eval );
+  gg_fast_transpose( static_cast<unsigned long>(ncomp), static_cast<unsigned long>(npts), rv_z, dbasis_z_eval );
 
   a.deallocate( rv, 4*npts*nbe );
 
@@ -175,15 +175,16 @@ void gau2grid_collocation_hessian( size_t                  npts,
   auto* rv_yz = rv_yy + npts * nbe;
   auto* rv_zz = rv_yz + npts * nbe;
 
+  const auto ul_npts = static_cast<unsigned long>(npts);
   size_t ncomp = 0;
   for( size_t i = 0; i < nshells; ++i ) {
 
     const auto& sh = basis.at(shell_mask[i]);
-    int order = sh.pure() ? GG_SPHERICAL_CCA : GG_CARTESIAN_CCA; 
+    int order = sh.pure() ? GG_SPHERICAL_CCA : GG_CARTESIAN_CCA;
 
     const auto ioff = ncomp*npts;
-    gg_collocation_deriv2( sh.l(), npts, points, 3, sh.nprim(), sh.coeff_data(),
-      sh.alpha_data(), sh.O_data(), order, rv + ioff, rv_x + ioff, rv_y + ioff, 
+    gg_collocation_deriv2( sh.l(), ul_npts, points, 3, sh.nprim(), sh.coeff_data(),
+      sh.alpha_data(), sh.O_data(), order, rv + ioff, rv_x + ioff, rv_y + ioff,
       rv_z + ioff, rv_xx + ioff, rv_xy + ioff, rv_xz + ioff, rv_yy + ioff,
       rv_yz + ioff, rv_zz + ioff);
 
@@ -191,16 +192,17 @@ void gau2grid_collocation_hessian( size_t                  npts,
 
   }
 
-  gg_fast_transpose( ncomp, npts, rv,    basis_eval );
-  gg_fast_transpose( ncomp, npts, rv_x,  dbasis_x_eval );
-  gg_fast_transpose( ncomp, npts, rv_y,  dbasis_y_eval );
-  gg_fast_transpose( ncomp, npts, rv_z,  dbasis_z_eval );
-  gg_fast_transpose( ncomp, npts, rv_xx, d2basis_xx_eval );
-  gg_fast_transpose( ncomp, npts, rv_xy, d2basis_xy_eval );
-  gg_fast_transpose( ncomp, npts, rv_xz, d2basis_xz_eval );
-  gg_fast_transpose( ncomp, npts, rv_yy, d2basis_yy_eval );
-  gg_fast_transpose( ncomp, npts, rv_yz, d2basis_yz_eval );
-  gg_fast_transpose( ncomp, npts, rv_zz, d2basis_zz_eval );
+  const auto ul_ncomp = static_cast<unsigned long>(ncomp);
+  gg_fast_transpose( ul_ncomp, ul_npts, rv,    basis_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_x,  dbasis_x_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_y,  dbasis_y_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_z,  dbasis_z_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_xx, d2basis_xx_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_xy, d2basis_xy_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_xz, d2basis_xz_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_yy, d2basis_yy_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_yz, d2basis_yz_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_zz, d2basis_zz_eval );
 
   a.deallocate( rv, 10*npts*nbe );
 
@@ -257,15 +259,16 @@ void gau2grid_collocation_der3(    size_t                  npts,
   auto* rv_zzz = rv_yzz + npts * nbe;
 
 
+  const auto ul_npts = static_cast<unsigned long>(npts);
   size_t ncomp = 0;
   for( size_t i = 0; i < nshells; ++i ) {
 
     const auto& sh = basis.at(shell_mask[i]);
-    int order = sh.pure() ? GG_SPHERICAL_CCA : GG_CARTESIAN_CCA; 
+    int order = sh.pure() ? GG_SPHERICAL_CCA : GG_CARTESIAN_CCA;
 
     const auto ioff = ncomp*npts;
-    gg_collocation_deriv3( sh.l(), npts, points, 3, sh.nprim(), sh.coeff_data(),
-      sh.alpha_data(), sh.O_data(), order, rv + ioff, rv_x + ioff, rv_y + ioff, 
+    gg_collocation_deriv3( sh.l(), ul_npts, points, 3, sh.nprim(), sh.coeff_data(),
+      sh.alpha_data(), sh.O_data(), order, rv + ioff, rv_x + ioff, rv_y + ioff,
       rv_z + ioff, rv_xx + ioff, rv_xy + ioff, rv_xz + ioff, rv_yy + ioff,
       rv_yz + ioff, rv_zz + ioff, rv_xxx + ioff, rv_xxy + ioff, rv_xxz + ioff,
       rv_xyy + ioff, rv_xyz + ioff, rv_xzz + ioff, rv_yyy + ioff, rv_yyz + ioff,
@@ -275,26 +278,27 @@ void gau2grid_collocation_der3(    size_t                  npts,
 
   }
 
-  gg_fast_transpose( ncomp, npts, rv,    basis_eval );
-  gg_fast_transpose( ncomp, npts, rv_x,  dbasis_x_eval );
-  gg_fast_transpose( ncomp, npts, rv_y,  dbasis_y_eval );
-  gg_fast_transpose( ncomp, npts, rv_z,  dbasis_z_eval );
-  gg_fast_transpose( ncomp, npts, rv_xx, d2basis_xx_eval );
-  gg_fast_transpose( ncomp, npts, rv_xy, d2basis_xy_eval );
-  gg_fast_transpose( ncomp, npts, rv_xz, d2basis_xz_eval );
-  gg_fast_transpose( ncomp, npts, rv_yy, d2basis_yy_eval );
-  gg_fast_transpose( ncomp, npts, rv_yz, d2basis_yz_eval );
-  gg_fast_transpose( ncomp, npts, rv_zz, d2basis_zz_eval );
-  gg_fast_transpose( ncomp, npts, rv_xxx, d3basis_xxx_eval );
-  gg_fast_transpose( ncomp, npts, rv_xxy, d3basis_xxy_eval );
-  gg_fast_transpose( ncomp, npts, rv_xxz, d3basis_xxz_eval );
-  gg_fast_transpose( ncomp, npts, rv_xyy, d3basis_xyy_eval );
-  gg_fast_transpose( ncomp, npts, rv_xyz, d3basis_xyz_eval );
-  gg_fast_transpose( ncomp, npts, rv_xzz, d3basis_xzz_eval );
-  gg_fast_transpose( ncomp, npts, rv_yyy, d3basis_yyy_eval );
-  gg_fast_transpose( ncomp, npts, rv_yyz, d3basis_yyz_eval );
-  gg_fast_transpose( ncomp, npts, rv_yzz, d3basis_yzz_eval );
-  gg_fast_transpose( ncomp, npts, rv_zzz, d3basis_zzz_eval );
+  const auto ul_ncomp = static_cast<unsigned long>(ncomp);
+  gg_fast_transpose( ul_ncomp, ul_npts, rv,    basis_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_x,  dbasis_x_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_y,  dbasis_y_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_z,  dbasis_z_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_xx, d2basis_xx_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_xy, d2basis_xy_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_xz, d2basis_xz_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_yy, d2basis_yy_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_yz, d2basis_yz_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_zz, d2basis_zz_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_xxx, d3basis_xxx_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_xxy, d3basis_xxy_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_xxz, d3basis_xxz_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_xyy, d3basis_xyy_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_xyz, d3basis_xyz_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_xzz, d3basis_xzz_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_yyy, d3basis_yyy_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_yyz, d3basis_yyz_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_yzz, d3basis_yzz_eval );
+  gg_fast_transpose( ul_ncomp, ul_npts, rv_zzz, d3basis_zzz_eval );
 
   a.deallocate( rv, 20*npts*nbe );
 
diff --git a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx
index 192cfcd33..262999f30 100644
--- a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx
+++ b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx
@@ -123,9 +123,13 @@ namespace GauXC {
   void ReferenceLocalHostWorkDriver::eval_xmat( size_t npts, size_t nbf, size_t nbe, 
 						const submat_map_t& submat_map, double fac, const double* P, size_t ldp, 
 						const double* basis_eval, size_t ldb, double* X, size_t ldx, double* scr ) {
+    const auto inbe  = static_cast<int32_t>(nbe);
+    const auto inpts = static_cast<int32_t>(npts);
+    const auto ildb  = static_cast<int32_t>(ldb);
+    const auto ildx  = static_cast<int32_t>(ldx);
     const auto* P_use = P;
     size_t ldp_use = ldp;
-     
+
     if( submat_map.size() > 1 ) {
       detail::submat_set( nbf, nbf, nbe, nbe, P, ldp, scr, nbe, submat_map );
       P_use = scr;
@@ -134,22 +138,23 @@ namespace GauXC {
       P_use = P + submat_map[0][0]*(ldp+1);
     }
 
-    blas::gemm( 'N', 'N', nbe, npts, nbe, fac, P_use, ldp_use, basis_eval, ldb, 
-		0., X, ldx );
+    blas::gemm( 'N', 'N', inbe, inpts, inbe, fac, P_use, static_cast<int32_t>(ldp_use), basis_eval, ildb,
+		0., X, ildx );
 
   }
 
 
   // U/VVar LDA (density)
-  void ReferenceLocalHostWorkDriver::eval_uvvar_lda_rks( size_t npts, size_t nbe, 
+  void ReferenceLocalHostWorkDriver::eval_uvvar_lda_rks( size_t npts, size_t nbe,
 						     const double* basis_eval, const double* X, size_t ldx, double* den_eval) {
 
+    const auto inbe = static_cast<int32_t>(nbe);
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
       const size_t ioff = size_t(i) * ldx;
       const auto*   X_i = X + ioff;
-      den_eval[i] = blas::dot( nbe, basis_eval + ioff, 1, X_i, 1 );
+      den_eval[i] = blas::dot( inbe, basis_eval + ioff, 1, X_i, 1 );
 
     }    
 
@@ -157,9 +162,11 @@ namespace GauXC {
 
   
   void ReferenceLocalHostWorkDriver::eval_uvvar_lda_uks( size_t npts, size_t nbe,
-   const double* basis_eval, const double* Xs, size_t ldxs, 
+   const double* basis_eval, const double* Xs, size_t ldxs,
    const double* Xz, size_t ldxz, double* den_eval) {
-  
+
+    const auto inbe = static_cast<int32_t>(nbe);
+
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
       const size_t ioffs = size_t(i) * ldxs;
@@ -168,8 +175,8 @@ namespace GauXC {
       const auto*   Xs_i = Xs + ioffs;
       const auto*   Xz_i = Xz + ioffz;
 
-      const double rhos = blas::dot( nbe, basis_eval + ioffs, 1, Xs_i, 1 );
-      const double rhoz = blas::dot( nbe, basis_eval + ioffz, 1, Xz_i, 1 );
+      const double rhos = blas::dot( inbe, basis_eval + ioffs, 1, Xs_i, 1 );
+      const double rhoz = blas::dot( inbe, basis_eval + ioffz, 1, Xz_i, 1 );
       
       den_eval[2*i]   = 0.5*(rhos + rhoz); // rho_+
       den_eval[2*i+1] = 0.5*(rhos - rhoz); // rho_-
@@ -182,13 +189,14 @@ namespace GauXC {
     const double* Xs, size_t ldxs, const double* Xz, size_t ldxz,
     const double* Xx, size_t ldxx, const double* Xy, size_t ldxy, double* den_eval, double* K, const double dtol) {
 
+    const auto inbe = static_cast<int32_t>(nbe);
 
     auto *KZ = K; // KZ // store K in the Z matrix
     auto *KY = KZ + npts;
     auto *KX = KY + npts;
 
     double dtolsq = dtol*dtol;
- 
+
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
       const size_t ioffs = size_t(i) * ldxs;
@@ -201,10 +209,10 @@ namespace GauXC {
       const auto*   Xx_i = Xx + ioffx;
       const auto*   Xy_i = Xy + ioffy;
 
-      const double rhos = blas::dot( nbe, basis_eval + ioffs, 1, Xs_i, 1 );
-      const double rhoz = blas::dot( nbe, basis_eval + ioffz, 1, Xz_i, 1 );
-      const double rhox = blas::dot( nbe, basis_eval + ioffx, 1, Xx_i, 1 );
-      const double rhoy = blas::dot( nbe, basis_eval + ioffy, 1, Xy_i, 1 );
+      const double rhos = blas::dot( inbe, basis_eval + ioffs, 1, Xs_i, 1 );
+      const double rhoz = blas::dot( inbe, basis_eval + ioffz, 1, Xz_i, 1 );
+      const double rhox = blas::dot( inbe, basis_eval + ioffx, 1, Xx_i, 1 );
+      const double rhoy = blas::dot( inbe, basis_eval + ioffy, 1, Xy_i, 1 );
  
       double mtemp = rhoz * rhoz + rhox * rhox + rhoy * rhoy;
       double mnorm = 0;
@@ -229,22 +237,24 @@ namespace GauXC {
   }
 
 
-  void ReferenceLocalHostWorkDriver::eval_uvvar_gga_rks( size_t npts, size_t nbe, 
-						     const double* basis_eval, const double* dbasis_x_eval, 
-						     const double *dbasis_y_eval, const double* dbasis_z_eval, const double* X, 
-						     size_t ldx, double* den_eval, double* dden_x_eval, double* dden_y_eval, 
+  void ReferenceLocalHostWorkDriver::eval_uvvar_gga_rks( size_t npts, size_t nbe,
+						     const double* basis_eval, const double* dbasis_x_eval,
+						     const double *dbasis_y_eval, const double* dbasis_z_eval, const double* X,
+						     size_t ldx, double* den_eval, double* dden_x_eval, double* dden_y_eval,
 						     double* dden_z_eval, double* gamma ) {
 
+    const auto inbe = static_cast<int32_t>(nbe);
+
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
       const size_t ioff = size_t(i) * ldx;
       const auto*   X_i = X + ioff;
 
-      den_eval[i] = blas::dot( nbe, basis_eval + ioff, 1, X_i, 1 );
+      den_eval[i] = blas::dot( inbe, basis_eval + ioff, 1, X_i, 1 );
 
-      const auto dx = 2. * blas::dot( nbe, dbasis_x_eval + ioff, 1, X_i, 1 );
-      const auto dy = 2. * blas::dot( nbe, dbasis_y_eval + ioff, 1, X_i, 1 );
-      const auto dz = 2. * blas::dot( nbe, dbasis_z_eval + ioff, 1, X_i, 1 );
+      const auto dx = 2. * blas::dot( inbe, dbasis_x_eval + ioff, 1, X_i, 1 );
+      const auto dy = 2. * blas::dot( inbe, dbasis_y_eval + ioff, 1, X_i, 1 );
+      const auto dz = 2. * blas::dot( inbe, dbasis_z_eval + ioff, 1, X_i, 1 );
 
       dden_x_eval[i] = dx;
       dden_y_eval[i] = dy;
@@ -258,10 +268,12 @@ namespace GauXC {
 void ReferenceLocalHostWorkDriver::eval_uvvar_gga_uks( size_t npts, size_t nbe,
   const double* basis_eval, const double* dbasis_x_eval,
   const double *dbasis_y_eval, const double* dbasis_z_eval, const double* Xs,
-  size_t ldxs, const double* Xz, size_t ldxz, 
+  size_t ldxs, const double* Xz, size_t ldxz,
   double* den_eval, double* dden_x_eval, double* dden_y_eval,
   double* dden_z_eval, double* gamma ) {
 
+   const auto inbe = static_cast<int32_t>(nbe);
+
    for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
       const size_t ioffs = size_t(i) * ldxs;
@@ -270,26 +282,26 @@ void ReferenceLocalHostWorkDriver::eval_uvvar_gga_uks( size_t npts, size_t nbe,
       const auto*   Xs_i = Xs + ioffs;
       const auto*   Xz_i = Xz + ioffz;
 
-      double rhos = blas::dot( nbe, basis_eval + ioffs, 1, Xs_i, 1 ); // S density
-      double rhoz = blas::dot( nbe, basis_eval + ioffz, 1, Xz_i, 1 ); // Z density
+      double rhos = blas::dot( inbe, basis_eval + ioffs, 1, Xs_i, 1 ); // S density
+      double rhoz = blas::dot( inbe, basis_eval + ioffz, 1, Xz_i, 1 ); // Z density
 
 
       den_eval[2*i]   = 0.5*(rhos + rhoz); // rho_+
       den_eval[2*i+1] = 0.5*(rhos - rhoz); // rho_-
 
       const auto dndx =
-        2. * blas::dot( nbe, dbasis_x_eval + ioffs, 1, Xs_i, 1 );
+        2. * blas::dot( inbe, dbasis_x_eval + ioffs, 1, Xs_i, 1 );
       const auto dndy =
-        2. * blas::dot( nbe, dbasis_y_eval + ioffs, 1, Xs_i, 1 );
+        2. * blas::dot( inbe, dbasis_y_eval + ioffs, 1, Xs_i, 1 );
       const auto dndz =
-        2. * blas::dot( nbe, dbasis_z_eval + ioffs, 1, Xs_i, 1 );
+        2. * blas::dot( inbe, dbasis_z_eval + ioffs, 1, Xs_i, 1 );
 
       const auto dMzdx =
-        2. * blas::dot( nbe, dbasis_x_eval + ioffz, 1, Xz_i, 1 );
+        2. * blas::dot( inbe, dbasis_x_eval + ioffz, 1, Xz_i, 1 );
       const auto dMzdy =
-        2. * blas::dot( nbe, dbasis_y_eval + ioffz, 1, Xz_i, 1 );
+        2. * blas::dot( inbe, dbasis_y_eval + ioffz, 1, Xz_i, 1 );
       const auto dMzdz =
-        2. * blas::dot( nbe, dbasis_z_eval + ioffz, 1, Xz_i, 1 );
+        2. * blas::dot( inbe, dbasis_z_eval + ioffz, 1, Xz_i, 1 );
 
       dden_x_eval[2*i] = dndx; // dn / dx
       dden_y_eval[2*i] = dndy; // dn / dy
@@ -317,21 +329,23 @@ void ReferenceLocalHostWorkDriver::eval_uvvar_gga_uks( size_t npts, size_t nbe,
 void ReferenceLocalHostWorkDriver::eval_uvvar_mgga_rks( size_t npts, size_t nbe,
   const double* basis_eval, const double* dbasis_x_eval,
   const double *dbasis_y_eval, const double* dbasis_z_eval, const double* lbasis_eval,
-  const double* X, size_t ldx, const double* mmat_x, const double* mmat_y, 
-  const double* mmat_z, size_t ldm,
+  const double* X, size_t ldx, const double* mmat_x, const double* mmat_y,
+  const double* mmat_z, [[maybe_unused]] size_t ldm,
   double* den_eval, double* dden_x_eval, double* dden_y_eval,
   double* dden_z_eval, double* gamma, double* tau, double* lapl ) {
 
+   const auto inbe = static_cast<int32_t>(nbe);
+
    for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
       const size_t ioff = size_t(i) * ldx;
       const auto*   X_i = X + ioff;
 
-      den_eval[i] = blas::dot( nbe, basis_eval + ioff, 1, X_i, 1 );
+      den_eval[i] = blas::dot( inbe, basis_eval + ioff, 1, X_i, 1 );
 
-      const auto dx = 2. * blas::dot( nbe, dbasis_x_eval + ioff, 1, X_i, 1 );
-      const auto dy = 2. * blas::dot( nbe, dbasis_y_eval + ioff, 1, X_i, 1 );
-      const auto dz = 2. * blas::dot( nbe, dbasis_z_eval + ioff, 1, X_i, 1 );
+      const auto dx = 2. * blas::dot( inbe, dbasis_x_eval + ioff, 1, X_i, 1 );
+      const auto dy = 2. * blas::dot( inbe, dbasis_y_eval + ioff, 1, X_i, 1 );
+      const auto dz = 2. * blas::dot( inbe, dbasis_z_eval + ioff, 1, X_i, 1 );
 
       dden_x_eval[i] = dx;
       dden_y_eval[i] = dy;
@@ -339,12 +353,12 @@ void ReferenceLocalHostWorkDriver::eval_uvvar_mgga_rks( size_t npts, size_t nbe,
 
       gamma[i] = dx*dx + dy*dy + dz*dz;
 
-      tau[i]  = 0.5*blas::dot( nbe, dbasis_x_eval + ioff, 1, mmat_x + ioff, 1);
-      tau[i] += 0.5*blas::dot( nbe, dbasis_y_eval + ioff, 1, mmat_y + ioff, 1);
-      tau[i] += 0.5*blas::dot( nbe, dbasis_z_eval + ioff, 1, mmat_z + ioff, 1);
+      tau[i]  = 0.5*blas::dot( inbe, dbasis_x_eval + ioff, 1, mmat_x + ioff, 1);
+      tau[i] += 0.5*blas::dot( inbe, dbasis_y_eval + ioff, 1, mmat_y + ioff, 1);
+      tau[i] += 0.5*blas::dot( inbe, dbasis_z_eval + ioff, 1, mmat_z + ioff, 1);
 
       if (lapl != nullptr)
-        lapl[i]  = 2. * blas::dot( nbe, lbasis_eval + ioff, 1, X_i, 1) + 4. * tau[i];
+        lapl[i]  = 2. * blas::dot( inbe, lbasis_eval + ioff, 1, X_i, 1) + 4. * tau[i];
 
    }
 }
@@ -352,12 +366,14 @@ void ReferenceLocalHostWorkDriver::eval_uvvar_mgga_rks( size_t npts, size_t nbe,
 void ReferenceLocalHostWorkDriver::eval_uvvar_mgga_uks( size_t npts, size_t nbe,
   const double* basis_eval, const double* dbasis_x_eval,
   const double *dbasis_y_eval, const double* dbasis_z_eval, const double* lbasis_eval,
-  const double* Xs, size_t ldxs, const double* Xz, size_t ldxz, 
-  const double* mmat_xs, const double* mmat_ys, const double* mmat_zs, size_t ldms,
-  const double* mmat_xz, const double* mmat_yz, const double* mmat_zz, size_t ldmz,
+  const double* Xs, size_t ldxs, const double* Xz, size_t ldxz,
+  const double* mmat_xs, const double* mmat_ys, const double* mmat_zs, [[maybe_unused]] size_t ldms,
+  const double* mmat_xz, const double* mmat_yz, const double* mmat_zz, [[maybe_unused]] size_t ldmz,
   double* den_eval, double* dden_x_eval, double* dden_y_eval,
   double* dden_z_eval, double* gamma, double* tau, double* lapl ) {
 
+   const auto inbe = static_cast<int32_t>(nbe);
+
    for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
       const size_t ioffs = size_t(i) * ldxs;
@@ -366,26 +382,26 @@ void ReferenceLocalHostWorkDriver::eval_uvvar_mgga_uks( size_t npts, size_t nbe,
       const auto*   Xs_i = Xs + ioffs;
       const auto*   Xz_i = Xz + ioffz;
 
-      double rhos = blas::dot( nbe, basis_eval + ioffs, 1, Xs_i, 1 ); // S density
-      double rhoz = blas::dot( nbe, basis_eval + ioffz, 1, Xz_i, 1 ); // Z density
+      double rhos = blas::dot( inbe, basis_eval + ioffs, 1, Xs_i, 1 ); // S density
+      double rhoz = blas::dot( inbe, basis_eval + ioffz, 1, Xz_i, 1 ); // Z density
 
 
       den_eval[2*i]   = 0.5*(rhos + rhoz); // rho_+
       den_eval[2*i+1] = 0.5*(rhos - rhoz); // rho_-
 
       const auto dndx =
-        2. * blas::dot( nbe, dbasis_x_eval + ioffs, 1, Xs_i, 1 );
+        2. * blas::dot( inbe, dbasis_x_eval + ioffs, 1, Xs_i, 1 );
       const auto dndy =
-        2. * blas::dot( nbe, dbasis_y_eval + ioffs, 1, Xs_i, 1 );
+        2. * blas::dot( inbe, dbasis_y_eval + ioffs, 1, Xs_i, 1 );
       const auto dndz =
-        2. * blas::dot( nbe, dbasis_z_eval + ioffs, 1, Xs_i, 1 );
+        2. * blas::dot( inbe, dbasis_z_eval + ioffs, 1, Xs_i, 1 );
 
       const auto dMzdx =
-        2. * blas::dot( nbe, dbasis_x_eval + ioffz, 1, Xz_i, 1 );
+        2. * blas::dot( inbe, dbasis_x_eval + ioffz, 1, Xz_i, 1 );
       const auto dMzdy =
-        2. * blas::dot( nbe, dbasis_y_eval + ioffz, 1, Xz_i, 1 );
+        2. * blas::dot( inbe, dbasis_y_eval + ioffz, 1, Xz_i, 1 );
       const auto dMzdz =
-        2. * blas::dot( nbe, dbasis_z_eval + ioffz, 1, Xz_i, 1 );
+        2. * blas::dot( inbe, dbasis_z_eval + ioffz, 1, Xz_i, 1 );
 
       dden_x_eval[2*i] = dndx; // dn / dx
       dden_y_eval[2*i] = dndy; // dn / dy
@@ -406,19 +422,19 @@ void ReferenceLocalHostWorkDriver::eval_uvvar_mgga_uks( size_t npts, size_t nbe,
       gamma[3*i+1] = 0.25*(dn_sq - dMz_sq);
       gamma[3*i+2] = 0.25*(dn_sq + dMz_sq) - 0.5*dn_dMz;
 
-      auto taus  = 0.5*blas::dot( nbe, dbasis_x_eval + ioffs, 1, mmat_xs + ioffs, 1);
-           taus += 0.5*blas::dot( nbe, dbasis_y_eval + ioffs, 1, mmat_ys + ioffs, 1);
-           taus += 0.5*blas::dot( nbe, dbasis_z_eval + ioffs, 1, mmat_zs + ioffs, 1);
-      auto tauz  = 0.5*blas::dot( nbe, dbasis_x_eval + ioffz, 1, mmat_xz + ioffz, 1);
-           tauz += 0.5*blas::dot( nbe, dbasis_y_eval + ioffz, 1, mmat_yz + ioffz, 1);
-           tauz += 0.5*blas::dot( nbe, dbasis_z_eval + ioffz, 1, mmat_zz + ioffz, 1);
+      auto taus  = 0.5*blas::dot( inbe, dbasis_x_eval + ioffs, 1, mmat_xs + ioffs, 1);
+           taus += 0.5*blas::dot( inbe, dbasis_y_eval + ioffs, 1, mmat_ys + ioffs, 1);
+           taus += 0.5*blas::dot( inbe, dbasis_z_eval + ioffs, 1, mmat_zs + ioffs, 1);
+      auto tauz  = 0.5*blas::dot( inbe, dbasis_x_eval + ioffz, 1, mmat_xz + ioffz, 1);
+           tauz += 0.5*blas::dot( inbe, dbasis_y_eval + ioffz, 1, mmat_yz + ioffz, 1);
+           tauz += 0.5*blas::dot( inbe, dbasis_z_eval + ioffz, 1, mmat_zz + ioffz, 1);
 
       tau[2*i]   = 0.5*(taus + tauz);
       tau[2*i+1] = 0.5*(taus - tauz);
 
       if (lapl != nullptr) {
-        auto lapls = 2. * blas::dot( nbe, lbasis_eval + ioffs, 1, Xs_i, 1) + 4. * taus;
-        auto laplz = 2. * blas::dot( nbe, lbasis_eval + ioffz, 1, Xz_i, 1) + 4. * tauz;
+        auto lapls = 2. * blas::dot( inbe, lbasis_eval + ioffs, 1, Xs_i, 1) + 4. * taus;
+        auto laplz = 2. * blas::dot( inbe, lbasis_eval + ioffz, 1, Xz_i, 1) + 4. * tauz;
 
         lapl[2*i]   = 0.5*(lapls + laplz);
         lapl[2*i+1] = 0.5*(lapls - laplz);
@@ -436,6 +452,8 @@ void ReferenceLocalHostWorkDriver::eval_uvvar_gga_gks( size_t npts, size_t nbe,
     const double* Xy, size_t ldxy, double* den_eval,
     double* dden_x_eval, double* dden_y_eval, double* dden_z_eval, double* gamma, double* K, double* H, const double dtol) {
 
+   const auto inbe = static_cast<int32_t>(nbe);
+
    auto *KZ = K; // KZ // store K in the Z matrix
    auto *KY = KZ + npts;
    auto *KX = KY + npts;
@@ -458,38 +476,38 @@ void ReferenceLocalHostWorkDriver::eval_uvvar_gga_gks( size_t npts, size_t nbe,
       const auto*   Xx_i = Xx + ioffx;
       const auto*   Xy_i = Xy + ioffy;
 
-      const double rhos = blas::dot( nbe, basis_eval + ioffs, 1, Xs_i, 1 );
-      const double rhoz = blas::dot( nbe, basis_eval + ioffz, 1, Xz_i, 1 );
-      const double rhox = blas::dot( nbe, basis_eval + ioffx, 1, Xx_i, 1 );
-      const double rhoy = blas::dot( nbe, basis_eval + ioffy, 1, Xy_i, 1 );
+      const double rhos = blas::dot( inbe, basis_eval + ioffs, 1, Xs_i, 1 );
+      const double rhoz = blas::dot( inbe, basis_eval + ioffz, 1, Xz_i, 1 );
+      const double rhox = blas::dot( inbe, basis_eval + ioffx, 1, Xx_i, 1 );
+      const double rhoy = blas::dot( inbe, basis_eval + ioffy, 1, Xy_i, 1 );
 
       const auto dndx =
-        2. * blas::dot( nbe, dbasis_x_eval + ioffs, 1, Xs_i, 1 );
+        2. * blas::dot( inbe, dbasis_x_eval + ioffs, 1, Xs_i, 1 );
       const auto dndy =
-        2. * blas::dot( nbe, dbasis_y_eval + ioffs, 1, Xs_i, 1 );
+        2. * blas::dot( inbe, dbasis_y_eval + ioffs, 1, Xs_i, 1 );
       const auto dndz =
-        2. * blas::dot( nbe, dbasis_z_eval + ioffs, 1, Xs_i, 1 );
+        2. * blas::dot( inbe, dbasis_z_eval + ioffs, 1, Xs_i, 1 );
 
       const auto dMzdx =
-        2. * blas::dot( nbe, dbasis_x_eval + ioffz, 1, Xz_i, 1 );
+        2. * blas::dot( inbe, dbasis_x_eval + ioffz, 1, Xz_i, 1 );
       const auto dMzdy =
-        2. * blas::dot( nbe, dbasis_y_eval + ioffz, 1, Xz_i, 1 );
+        2. * blas::dot( inbe, dbasis_y_eval + ioffz, 1, Xz_i, 1 );
       const auto dMzdz =
-        2. * blas::dot( nbe, dbasis_z_eval + ioffz, 1, Xz_i, 1 );
+        2. * blas::dot( inbe, dbasis_z_eval + ioffz, 1, Xz_i, 1 );
 
       const auto dMxdx =
-        2. * blas::dot( nbe, dbasis_x_eval + ioffx, 1, Xx_i, 1 );
+        2. * blas::dot( inbe, dbasis_x_eval + ioffx, 1, Xx_i, 1 );
       const auto dMxdy =
-        2. * blas::dot( nbe, dbasis_y_eval + ioffx, 1, Xx_i, 1 );
+        2. * blas::dot( inbe, dbasis_y_eval + ioffx, 1, Xx_i, 1 );
       const auto dMxdz =
-        2. * blas::dot( nbe, dbasis_z_eval + ioffx, 1, Xx_i, 1 );
+        2. * blas::dot( inbe, dbasis_z_eval + ioffx, 1, Xx_i, 1 );
 
       const auto dMydx =
-        2. * blas::dot( nbe, dbasis_x_eval + ioffy, 1, Xy_i, 1 );
+        2. * blas::dot( inbe, dbasis_x_eval + ioffy, 1, Xy_i, 1 );
       const auto dMydy =
-        2. * blas::dot( nbe, dbasis_y_eval + ioffy, 1, Xy_i, 1 );
+        2. * blas::dot( inbe, dbasis_y_eval + ioffy, 1, Xy_i, 1 );
       const auto dMydz =
-        2. * blas::dot( nbe, dbasis_z_eval + ioffy, 1, Xy_i, 1 );
+        2. * blas::dot( inbe, dbasis_z_eval + ioffy, 1, Xy_i, 1 );
 
 
       dden_x_eval[4 * i] = dndx;
@@ -563,18 +581,21 @@ void ReferenceLocalHostWorkDriver::eval_uvvar_gga_gks( size_t npts, size_t nbe,
 
 }
   // Eval Z Matrix LDA VXC
-  void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_rks( size_t npts, size_t nbf, 
+  void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_rks( size_t npts, size_t nbf,
 							const double* vrho, const double* basis_eval, double* Z, size_t ldz ) {
 
+    const auto inbf  = static_cast<int32_t>(nbf);
+    const auto inpts = static_cast<int32_t>(npts);
+    const auto ildz  = static_cast<int32_t>(ldz);
 
-    blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Z, ldz );
+    blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Z, ildz );
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
       auto* z_col = Z + i*ldz;
 
       const double fact = 0.5 * vrho[i];
-      GauXC::blas::scal( nbf, fact, z_col, 1 );
+      GauXC::blas::scal( inbf, fact, z_col, 1 );
 
     }
 
@@ -585,9 +606,13 @@ void ReferenceLocalHostWorkDriver::eval_uvvar_gga_gks( size_t npts, size_t nbe,
               const double* vrho, const double* basis_eval, double* Zs, size_t ldzs,
               double* Zz, size_t ldzz ) {
 
+    const auto inbf  = static_cast<int32_t>(nbf);
+    const auto inpts = static_cast<int32_t>(npts);
+    const auto ildzs = static_cast<int32_t>(ldzs);
+    const auto ildzz = static_cast<int32_t>(ldzz);
 
-    blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zs, ldzs);
-    blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zz, ldzz);
+    blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Zs, ildzs);
+    blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Zz, ildzz);
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
@@ -598,11 +623,11 @@ void ReferenceLocalHostWorkDriver::eval_uvvar_gga_gks( size_t npts, size_t nbe,
       const double factm = 0.5 * vrho[2*i+1];
 
       //eq. 56 https://doi.org/10.1140/epjb/e2018-90170-1
-      GauXC::blas::scal( nbf, 0.5*(factp + factm), zs_col, 1 );
-      GauXC::blas::scal( nbf, 0.5*(factp - factm), zz_col, 1 );
+      GauXC::blas::scal( inbf, 0.5*(factp + factm), zs_col, 1 );
+      GauXC::blas::scal( inbf, 0.5*(factp - factm), zz_col, 1 );
 
     }
- 
+
 
   }
 
@@ -610,14 +635,21 @@ void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_gks( size_t npts, size_t nb
     const double* basis_eval, double* Zs, size_t ldzs, double* Zz, size_t ldzz,
     double* Zx, size_t ldzx,double* Zy, size_t ldzy, double *K ) {
 
+  const auto inbe  = static_cast<int32_t>(nbe);
+  const auto inpts = static_cast<int32_t>(npts);
+  const auto ildzs = static_cast<int32_t>(ldzs);
+  const auto ildzz = static_cast<int32_t>(ldzz);
+  const auto ildzx = static_cast<int32_t>(ldzx);
+  const auto ildzy = static_cast<int32_t>(ldzy);
+
   auto *KZ = K; // KZ // store K in the Z matrix
   auto *KY = KZ + npts;
   auto *KX = KY + npts;
 
-    blas::lacpy( 'A', nbe, npts, basis_eval, nbe, Zs, ldzs);
-    blas::lacpy( 'A', nbe, npts, basis_eval, nbe, Zz, ldzz);
-    blas::lacpy( 'A', nbe, npts, basis_eval, nbe, Zx, ldzx);
-    blas::lacpy( 'A', nbe, npts, basis_eval, nbe, Zy, ldzy);
+    blas::lacpy( 'A', inbe, inpts, basis_eval, inbe, Zs, ildzs);
+    blas::lacpy( 'A', inbe, inpts, basis_eval, inbe, Zz, ildzz);
+    blas::lacpy( 'A', inbe, inpts, basis_eval, inbe, Zx, ildzx);
+    blas::lacpy( 'A', inbe, inpts, basis_eval, inbe, Zy, ildzy);
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
@@ -631,45 +663,48 @@ void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_gks( size_t npts, size_t nb
       const double factor = 0.5 * (factp - factm);
 
       //eq. 56 https://doi.org/10.1140/epjb/e2018-90170-1
-      GauXC::blas::scal( nbe, 0.5*(factp + factm), zs_col, 1 );
-      GauXC::blas::scal( nbe, KZ[i] * factor, zz_col, 1 );
-      GauXC::blas::scal( nbe, KX[i] * factor, zx_col, 1 );
-      GauXC::blas::scal( nbe, KY[i] * factor, zy_col, 1 );
-   
+      GauXC::blas::scal( inbe, 0.5*(factp + factm), zs_col, 1 );
+      GauXC::blas::scal( inbe, KZ[i] * factor, zz_col, 1 );
+      GauXC::blas::scal( inbe, KX[i] * factor, zx_col, 1 );
+      GauXC::blas::scal( inbe, KY[i] * factor, zy_col, 1 );
+
     }
 
 }
 
   // Eval Z Matrix GGA VXC
-  void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_rks( size_t npts, size_t nbf, 
-							const double* vrho, const double* vgamma, const double* basis_eval, 
-							const double* dbasis_x_eval, const double* dbasis_y_eval, 
-							const double* dbasis_z_eval, const double* dden_x_eval, 
+  void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_rks( size_t npts, size_t nbf,
+							const double* vrho, const double* vgamma, const double* basis_eval,
+							const double* dbasis_x_eval, const double* dbasis_y_eval,
+							const double* dbasis_z_eval, const double* dden_x_eval,
 							const double* dden_y_eval, const double* dden_z_eval, double* Z, size_t ldz ) {
 
+    const auto inbf  = static_cast<int32_t>(nbf);
+    const auto inpts = static_cast<int32_t>(npts);
+
     if( ldz != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
-    blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Z, nbf );
+    blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Z, inbf );
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
       const int32_t ioff = i * nbf;
 
       auto* z_col    = Z + ioff;
-      auto* bf_x_col = dbasis_x_eval + ioff; 
-      auto* bf_y_col = dbasis_y_eval + ioff; 
-      auto* bf_z_col = dbasis_z_eval + ioff; 
+      auto* bf_x_col = dbasis_x_eval + ioff;
+      auto* bf_y_col = dbasis_y_eval + ioff;
+      auto* bf_z_col = dbasis_z_eval + ioff;
 
       const auto lda_fact = 0.5 * vrho[i];
-      blas::scal( nbf, lda_fact, z_col, 1 );
+      blas::scal( inbf, lda_fact, z_col, 1 );
 
-      const auto gga_fact = 2. * vgamma[i]; 
+      const auto gga_fact = 2. * vgamma[i];
       const auto x_fact = gga_fact * dden_x_eval[i];
       const auto y_fact = gga_fact * dden_y_eval[i];
       const auto z_fact = gga_fact * dden_z_eval[i];
 
-      blas::axpy( nbf, x_fact, bf_x_col, 1, z_col, 1 );
-      blas::axpy( nbf, y_fact, bf_y_col, 1, z_col, 1 );
-      blas::axpy( nbf, z_fact, bf_z_col, 1, z_col, 1 );
+      blas::axpy( inbf, x_fact, bf_x_col, 1, z_col, 1 );
+      blas::axpy( inbf, y_fact, bf_y_col, 1, z_col, 1 );
+      blas::axpy( inbf, z_fact, bf_z_col, 1, z_col, 1 );
 
     }
 
@@ -679,14 +714,18 @@ void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_gks( size_t npts, size_t nb
               const double* vrho, const double* vgamma, const double* basis_eval,
               const double* dbasis_x_eval, const double* dbasis_y_eval,
               const double* dbasis_z_eval, const double* dden_x_eval,
-              const double* dden_y_eval, const double* dden_z_eval, double* Zs, 
+              const double* dden_y_eval, const double* dden_z_eval, double* Zs,
               size_t ldzs, double* Zz, size_t ldzz ) {
 
+    const auto inbf  = static_cast<int32_t>(nbf);
+    const auto inpts = static_cast<int32_t>(npts);
+    const auto ildzs = static_cast<int32_t>(ldzs);
+    const auto ildzz = static_cast<int32_t>(ldzz);
 
     if( ldzs != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
     if( ldzz != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
-    blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zs, ldzs);
-    blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zz, ldzz);
+    blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Zs, ildzs);
+    blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Zz, ildzz);
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
@@ -701,8 +740,8 @@ void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_gks( size_t npts, size_t nb
       const double factp = 0.5 * vrho[2*i];
       const double factm = 0.5 * vrho[2*i+1];
 
-      GauXC::blas::scal( nbf, 0.5*(factp + factm), zs_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficent implementation of .. "
-      GauXC::blas::scal( nbf, 0.5*(factp - factm), zz_col, 1 );
+      GauXC::blas::scal( inbf, 0.5*(factp + factm), zs_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficient implementation of .. "
+      GauXC::blas::scal( inbf, 0.5*(factp - factm), zz_col, 1 );
 
       const auto gga_fact_pp = vgamma[3*i];
       const auto gga_fact_pm = vgamma[3*i+1];
@@ -719,14 +758,14 @@ void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_gks( size_t npts, size_t nb
       const auto x_fact_z = gga_fact_3 * dden_x_eval[2*i+1] + gga_fact_2 * dden_x_eval[2*i];
       const auto y_fact_z = gga_fact_3 * dden_y_eval[2*i+1] + gga_fact_2 * dden_y_eval[2*i];
       const auto z_fact_z = gga_fact_3 * dden_z_eval[2*i+1] + gga_fact_2 * dden_z_eval[2*i];
-      
-      blas::axpy( nbf, x_fact_s, bf_x_col, 1, zs_col, 1 );
-      blas::axpy( nbf, y_fact_s, bf_y_col, 1, zs_col, 1 );
-      blas::axpy( nbf, z_fact_s, bf_z_col, 1, zs_col, 1 );
 
-      blas::axpy( nbf, x_fact_z, bf_x_col, 1, zz_col, 1 );
-      blas::axpy( nbf, y_fact_z, bf_y_col, 1, zz_col, 1 );
-      blas::axpy( nbf, z_fact_z, bf_z_col, 1, zz_col, 1 );
+      blas::axpy( inbf, x_fact_s, bf_x_col, 1, zs_col, 1 );
+      blas::axpy( inbf, y_fact_s, bf_y_col, 1, zs_col, 1 );
+      blas::axpy( inbf, z_fact_s, bf_z_col, 1, zs_col, 1 );
+
+      blas::axpy( inbf, x_fact_z, bf_x_col, 1, zz_col, 1 );
+      blas::axpy( inbf, y_fact_z, bf_y_col, 1, zz_col, 1 );
+      blas::axpy( inbf, z_fact_z, bf_z_col, 1, zz_col, 1 );
 
     }
   }
@@ -740,8 +779,11 @@ void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_gks( size_t npts, size_t nb
               const double* dden_x_eval,
               const double* dden_y_eval, const double* dden_z_eval, double* Z, size_t ldz ) {
 
+    const auto inbf  = static_cast<int32_t>(nbf);
+    const auto inpts = static_cast<int32_t>(npts);
+
     if( ldz != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
-    blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Z, nbf );
+    blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Z, inbf );
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
@@ -753,21 +795,21 @@ void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_gks( size_t npts, size_t nb
       auto* bf_z_col = dbasis_z_eval + ioff;
 
       const auto lda_fact = 0.5 * vrho[i];
-      blas::scal( nbf, lda_fact, z_col, 1 );
+      blas::scal( inbf, lda_fact, z_col, 1 );
 
       const auto gga_fact = 2. * vgamma[i];
       const auto x_fact = gga_fact * dden_x_eval[i];
       const auto y_fact = gga_fact * dden_y_eval[i];
       const auto z_fact = gga_fact * dden_z_eval[i];
 
-      blas::axpy( nbf, x_fact, bf_x_col, 1, z_col, 1 );
-      blas::axpy( nbf, y_fact, bf_y_col, 1, z_col, 1 );
-      blas::axpy( nbf, z_fact, bf_z_col, 1, z_col, 1 );
+      blas::axpy( inbf, x_fact, bf_x_col, 1, z_col, 1 );
+      blas::axpy( inbf, y_fact, bf_y_col, 1, z_col, 1 );
+      blas::axpy( inbf, z_fact, bf_z_col, 1, z_col, 1 );
 
       if ( vlapl != nullptr ) {
   auto* lbf_col = lbasis_eval + ioff;
         const auto lapl_fact = vlapl[i];
-        blas::axpy( nbf, lapl_fact, lbf_col, 1, z_col, 1 );
+        blas::axpy( inbf, lapl_fact, lbf_col, 1, z_col, 1 );
       }
 
     }
@@ -775,19 +817,23 @@ void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_gks( size_t npts, size_t nb
   }
 
 void ReferenceLocalHostWorkDriver::eval_zmat_mgga_vxc_uks( size_t npts, size_t nbf,
-              const double* vrho, const double* vgamma, const double* vlapl, 
+              const double* vrho, const double* vgamma, const double* vlapl,
         const double* basis_eval,
               const double* dbasis_x_eval, const double* dbasis_y_eval,
               const double* dbasis_z_eval, const double* lbasis_eval,
         const double* dden_x_eval,
-              const double* dden_y_eval, const double* dden_z_eval, double* Zs, 
+              const double* dden_y_eval, const double* dden_z_eval, double* Zs,
               size_t ldzs, double* Zz, size_t ldzz ) {
 
+    const auto inbf  = static_cast<int32_t>(nbf);
+    const auto inpts = static_cast<int32_t>(npts);
+    const auto ildzs = static_cast<int32_t>(ldzs);
+    const auto ildzz = static_cast<int32_t>(ldzz);
 
     if( ldzs != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
     if( ldzz != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
-    blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zs, ldzs);
-    blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zz, ldzz);
+    blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Zs, ildzs);
+    blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Zz, ildzz);
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
@@ -803,8 +849,8 @@ void ReferenceLocalHostWorkDriver::eval_zmat_mgga_vxc_uks( size_t npts, size_t n
       const double factp = 0.5 * vrho[2*i];
       const double factm = 0.5 * vrho[2*i+1];
 
-      GauXC::blas::scal( nbf, 0.5*(factp + factm), zs_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficent implementation of .. "
-      GauXC::blas::scal( nbf, 0.5*(factp - factm), zz_col, 1 );
+      GauXC::blas::scal( inbf, 0.5*(factp + factm), zs_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficient implementation of .. "
+      GauXC::blas::scal( inbf, 0.5*(factp - factm), zz_col, 1 );
 
       const auto gga_fact_pp = vgamma[3*i];
       const auto gga_fact_pm = vgamma[3*i+1];
@@ -822,36 +868,40 @@ void ReferenceLocalHostWorkDriver::eval_zmat_mgga_vxc_uks( size_t npts, size_t n
       const auto y_fact_z = gga_fact_3 * dden_y_eval[2*i+1] + gga_fact_2 * dden_y_eval[2*i];
       const auto z_fact_z = gga_fact_3 * dden_z_eval[2*i+1] + gga_fact_2 * dden_z_eval[2*i];
 
-      
-      blas::axpy( nbf, x_fact_s, bf_x_col, 1, zs_col, 1 );
-      blas::axpy( nbf, y_fact_s, bf_y_col, 1, zs_col, 1 );
-      blas::axpy( nbf, z_fact_s, bf_z_col, 1, zs_col, 1 );
 
-      blas::axpy( nbf, x_fact_z, bf_x_col, 1, zz_col, 1 );
-      blas::axpy( nbf, y_fact_z, bf_y_col, 1, zz_col, 1 );
-      blas::axpy( nbf, z_fact_z, bf_z_col, 1, zz_col, 1 );
+      blas::axpy( inbf, x_fact_s, bf_x_col, 1, zs_col, 1 );
+      blas::axpy( inbf, y_fact_s, bf_y_col, 1, zs_col, 1 );
+      blas::axpy( inbf, z_fact_s, bf_z_col, 1, zs_col, 1 );
+
+      blas::axpy( inbf, x_fact_z, bf_x_col, 1, zz_col, 1 );
+      blas::axpy( inbf, y_fact_z, bf_y_col, 1, zz_col, 1 );
+      blas::axpy( inbf, z_fact_z, bf_z_col, 1, zz_col, 1 );
 
       if (vlapl != nullptr) {
         const auto lfactp = vlapl[2*i];
         const auto lfactm = vlapl[2*i+1];
-        blas::axpy( nbf, 0.5*(lfactp + lfactm), lbf_col, 1, zs_col, 1);
-        blas::axpy( nbf, 0.5*(lfactp - lfactm), lbf_col, 1, zz_col, 1);
+        blas::axpy( inbf, 0.5*(lfactp + lfactm), lbf_col, 1, zs_col, 1);
+        blas::axpy( inbf, 0.5*(lfactp - lfactm), lbf_col, 1, zz_col, 1);
       }
 
     }
   }
 
-  void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_rks(size_t npts, size_t nbf, 
-              const double* vtau, const double* vlapl, 
-              const double* dbasis_x_eval, const double* dbasis_y_eval, 
+  void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_rks(size_t npts, size_t nbf,
+              const double* vtau, const double* vlapl,
+              const double* dbasis_x_eval, const double* dbasis_y_eval,
               const double* dbasis_z_eval,
               double* mmat_x, double* mmat_y, double* mmat_z, size_t ldm ) {
 
+    const auto inbf  = static_cast<int32_t>(nbf);
+    const auto inpts = static_cast<int32_t>(npts);
+    const auto ildm  = static_cast<int32_t>(ldm);
+
     if( ldm != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
-    
-    blas::lacpy( 'A', nbf, npts, dbasis_x_eval, nbf, mmat_x, ldm);
-    blas::lacpy( 'A', nbf, npts, dbasis_y_eval, nbf, mmat_y, ldm);
-    blas::lacpy( 'A', nbf, npts, dbasis_z_eval, nbf, mmat_z, ldm);
+
+    blas::lacpy( 'A', inbf, inpts, dbasis_x_eval, inbf, mmat_x, ildm);
+    blas::lacpy( 'A', inbf, inpts, dbasis_y_eval, inbf, mmat_y, ildm);
+    blas::lacpy( 'A', inbf, inpts, dbasis_z_eval, inbf, mmat_z, ildm);
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
@@ -865,35 +915,40 @@ void ReferenceLocalHostWorkDriver::eval_zmat_mgga_vxc_uks( size_t npts, size_t n
 
       const auto tfact = 0.25 * vtau[i];
 
-      blas::scal( nbf, tfact, mmat_x_col, 1);
-      blas::scal( nbf, tfact, mmat_y_col, 1);
-      blas::scal( nbf, tfact, mmat_z_col, 1);
+      blas::scal( inbf, tfact, mmat_x_col, 1);
+      blas::scal( inbf, tfact, mmat_y_col, 1);
+      blas::scal( inbf, tfact, mmat_z_col, 1);
 
       if ( vlapl != nullptr ) {
         const auto lfact = vlapl[i];
-        blas::axpy( nbf, lfact, bf_x_col, 1, mmat_x_col, 1);
-        blas::axpy( nbf, lfact, bf_y_col, 1, mmat_y_col, 1);
-        blas::axpy( nbf, lfact, bf_z_col, 1, mmat_z_col, 1);
+        blas::axpy( inbf, lfact, bf_x_col, 1, mmat_x_col, 1);
+        blas::axpy( inbf, lfact, bf_y_col, 1, mmat_y_col, 1);
+        blas::axpy( inbf, lfact, bf_z_col, 1, mmat_z_col, 1);
       }
     }
   }
 
-void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks(size_t npts, size_t nbf, 
-              const double* vtau, const double* vlapl, 
-              const double* dbasis_x_eval, const double* dbasis_y_eval, 
+void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks(size_t npts, size_t nbf,
+              const double* vtau, const double* vlapl,
+              const double* dbasis_x_eval, const double* dbasis_y_eval,
               const double* dbasis_z_eval,
               double* mmat_xs, double* mmat_ys, double* mmat_zs, size_t ldms,
               double* mmat_xz, double* mmat_yz, double* mmat_zz, size_t ldmz) {
 
+    const auto inbf  = static_cast<int32_t>(nbf);
+    const auto inpts = static_cast<int32_t>(npts);
+    const auto ildms = static_cast<int32_t>(ldms);
+    const auto ildmz = static_cast<int32_t>(ldmz);
+
     if( ldms != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
     if( ldmz != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
-    
-    blas::lacpy( 'A', nbf, npts, dbasis_x_eval, nbf, mmat_xs, ldms);
-    blas::lacpy( 'A', nbf, npts, dbasis_y_eval, nbf, mmat_ys, ldms);
-    blas::lacpy( 'A', nbf, npts, dbasis_z_eval, nbf, mmat_zs, ldms);
-    blas::lacpy( 'A', nbf, npts, dbasis_x_eval, nbf, mmat_xz, ldmz);
-    blas::lacpy( 'A', nbf, npts, dbasis_y_eval, nbf, mmat_yz, ldmz);
-    blas::lacpy( 'A', nbf, npts, dbasis_z_eval, nbf, mmat_zz, ldmz);
+
+    blas::lacpy( 'A', inbf, inpts, dbasis_x_eval, inbf, mmat_xs, ildms);
+    blas::lacpy( 'A', inbf, inpts, dbasis_y_eval, inbf, mmat_ys, ildms);
+    blas::lacpy( 'A', inbf, inpts, dbasis_z_eval, inbf, mmat_zs, ildms);
+    blas::lacpy( 'A', inbf, inpts, dbasis_x_eval, inbf, mmat_xz, ildmz);
+    blas::lacpy( 'A', inbf, inpts, dbasis_y_eval, inbf, mmat_yz, ildmz);
+    blas::lacpy( 'A', inbf, inpts, dbasis_z_eval, inbf, mmat_zz, ildmz);
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
@@ -913,24 +968,24 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks(size_t npts, size_t nb
       const auto tfacts = 0.5*(tfactp + tfactm);
       const auto tfactz = 0.5*(tfactp - tfactm);
 
-      blas::scal( nbf, tfacts, xs_col, 1);
-      blas::scal( nbf, tfacts, ys_col, 1);
-      blas::scal( nbf, tfacts, zs_col, 1);
-      blas::scal( nbf, tfactz, xz_col, 1);
-      blas::scal( nbf, tfactz, yz_col, 1);
-      blas::scal( nbf, tfactz, zz_col, 1);
+      blas::scal( inbf, tfacts, xs_col, 1);
+      blas::scal( inbf, tfacts, ys_col, 1);
+      blas::scal( inbf, tfacts, zs_col, 1);
+      blas::scal( inbf, tfactz, xz_col, 1);
+      blas::scal( inbf, tfactz, yz_col, 1);
+      blas::scal( inbf, tfactz, zz_col, 1);
 
       if ( vlapl != nullptr ) {
         const auto lfactp = vlapl[2*i];
         const auto lfactm = vlapl[2*i+1];
   const auto lfacts = 0.5*(lfactp + lfactm);
   const auto lfactz = 0.5*(lfactp - lfactm);
-        blas::axpy( nbf, lfacts, bf_x_col, 1, xs_col, 1);
-        blas::axpy( nbf, lfacts, bf_y_col, 1, ys_col, 1);
-        blas::axpy( nbf, lfacts, bf_z_col, 1, zs_col, 1);
-        blas::axpy( nbf, lfactz, bf_x_col, 1, xz_col, 1);
-        blas::axpy( nbf, lfactz, bf_y_col, 1, yz_col, 1);
-        blas::axpy( nbf, lfactz, bf_z_col, 1, zz_col, 1);
+        blas::axpy( inbf, lfacts, bf_x_col, 1, xs_col, 1);
+        blas::axpy( inbf, lfacts, bf_y_col, 1, ys_col, 1);
+        blas::axpy( inbf, lfacts, bf_z_col, 1, zs_col, 1);
+        blas::axpy( inbf, lfactz, bf_x_col, 1, xz_col, 1);
+        blas::axpy( inbf, lfactz, bf_y_col, 1, yz_col, 1);
+        blas::axpy( inbf, lfactz, bf_z_col, 1, zz_col, 1);
       }
 
     }
@@ -944,6 +999,13 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_gks( size_t npts, size_t nb
     double* Zs, size_t ldzs, double* Zz, size_t ldzz, double* Zx, size_t ldzx,
     double* Zy, size_t ldzy, double* K, double* H ) {
 
+    const auto inbf  = static_cast<int32_t>(nbf);
+    const auto inpts = static_cast<int32_t>(npts);
+    const auto ildzs = static_cast<int32_t>(ldzs);
+    const auto ildzz = static_cast<int32_t>(ldzz);
+    const auto ildzx = static_cast<int32_t>(ldzx);
+    const auto ildzy = static_cast<int32_t>(ldzy);
+
     auto *KZ = K; // KZ // store K in the Z matrix
     auto *KY = KZ + npts;
     auto *KX = KY + npts;
@@ -957,10 +1019,10 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_gks( size_t npts, size_t nb
     if( ldzx != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
     if( ldzy != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
 
-    blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zs, ldzs);
-    blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zz, ldzz);
-    blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zx, ldzx);
-    blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zy, ldzy);   
+    blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Zs, ildzs);
+    blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Zz, ildzz);
+    blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Zx, ildzx);
+    blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Zy, ildzy);
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
@@ -979,10 +1041,10 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_gks( size_t npts, size_t nb
       const double factm = 0.5 * vrho[2*i+1];
       const double factor = 0.5 * (factp - factm);
 
-      GauXC::blas::scal( nbf, 0.5*(factp + factm), zs_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficent implementation of .. "
-      GauXC::blas::scal( nbf, KZ[i]*factor, zz_col, 1 );
-      GauXC::blas::scal( nbf, KX[i]*factor, zx_col, 1 );
-      GauXC::blas::scal( nbf, KY[i]*factor, zy_col, 1 );
+      GauXC::blas::scal( inbf, 0.5*(factp + factm), zs_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficient implementation of .. "
+      GauXC::blas::scal( inbf, KZ[i]*factor, zz_col, 1 );
+      GauXC::blas::scal( inbf, KX[i]*factor, zx_col, 1 );
+      GauXC::blas::scal( inbf, KY[i]*factor, zy_col, 1 );
 
       const auto gga_fact_pp = vgamma[3 * i];
       const auto gga_fact_pm = vgamma[3 * i + 1];
@@ -1027,21 +1089,21 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_gks( size_t npts, size_t nb
                             gga_fact_2 * HY[i] * dden_z_eval[4 * i];
 
 
-      blas::axpy(nbf, x_fact_s, bf_x_col, 1, zs_col, 1);
-      blas::axpy(nbf, y_fact_s, bf_y_col, 1, zs_col, 1);
-      blas::axpy(nbf, z_fact_s, bf_z_col, 1, zs_col, 1);
+      blas::axpy(inbf, x_fact_s, bf_x_col, 1, zs_col, 1);
+      blas::axpy(inbf, y_fact_s, bf_y_col, 1, zs_col, 1);
+      blas::axpy(inbf, z_fact_s, bf_z_col, 1, zs_col, 1);
 
-      blas::axpy(nbf, x_fact_z, bf_x_col, 1, zz_col, 1);
-      blas::axpy(nbf, y_fact_z, bf_y_col, 1, zz_col, 1);
-      blas::axpy(nbf, z_fact_z, bf_z_col, 1, zz_col, 1);
+      blas::axpy(inbf, x_fact_z, bf_x_col, 1, zz_col, 1);
+      blas::axpy(inbf, y_fact_z, bf_y_col, 1, zz_col, 1);
+      blas::axpy(inbf, z_fact_z, bf_z_col, 1, zz_col, 1);
 
-      blas::axpy(nbf, x_fact_x, bf_x_col, 1, zx_col, 1);
-      blas::axpy(nbf, y_fact_x, bf_y_col, 1, zx_col, 1);
-      blas::axpy(nbf, z_fact_x, bf_z_col, 1, zx_col, 1);
+      blas::axpy(inbf, x_fact_x, bf_x_col, 1, zx_col, 1);
+      blas::axpy(inbf, y_fact_x, bf_y_col, 1, zx_col, 1);
+      blas::axpy(inbf, z_fact_x, bf_z_col, 1, zx_col, 1);
 
-      blas::axpy(nbf, x_fact_y, bf_x_col, 1, zy_col, 1);
-      blas::axpy(nbf, y_fact_y, bf_y_col, 1, zy_col, 1);
-      blas::axpy(nbf, z_fact_y, bf_z_col, 1, zy_col, 1);
+      blas::axpy(inbf, x_fact_y, bf_x_col, 1, zy_col, 1);
+      blas::axpy(inbf, y_fact_y, bf_y_col, 1, zy_col, 1);
+      blas::axpy(inbf, z_fact_y, bf_z_col, 1, zy_col, 1);
 
     }
 
@@ -1157,10 +1219,10 @@ void ReferenceLocalHostWorkDriver::eval_tmat_gga_vxc_uks( size_t npts, const dou
 }
 
 
-void ReferenceLocalHostWorkDriver::eval_tmat_mgga_vxc_rks( size_t npts, const double* vgamma, 
-  const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, 
-  const double* v2gamma2, const double* v2gammalapl, const double* v2gammatau,
-  const double* v2lapl2, const double* v2lapltau, const double* v2tau2, 
+void ReferenceLocalHostWorkDriver::eval_tmat_mgga_vxc_rks( size_t npts, const double* vgamma,
+  const double* v2rho2, const double* v2rhogamma, [[maybe_unused]] const double* v2rholapl, const double* v2rhotau,
+  const double* v2gamma2, [[maybe_unused]] const double* v2gammalapl, const double* v2gammatau,
+  [[maybe_unused]] const double* v2lapl2, [[maybe_unused]] const double* v2lapltau, const double* v2tau2, 
   const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, 
   const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C){
 
@@ -1290,12 +1352,18 @@ void ReferenceLocalHostWorkDriver::eval_tmat_mgga_vxc_uks( size_t npts, const do
 void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_uks_ts( size_t npts, size_t nbf,
   const double* vrho, const double* basis_eval, double* Za, size_t ldza,
   double* Zb, size_t ldzb ) {
-  blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Za, ldza);
-  blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zb, ldzb);
+
+  const auto inbf  = static_cast<int32_t>(nbf); // size_t dims narrowed to int32_t for the blas:: calls below
+  const auto inpts = static_cast<int32_t>(npts); // NOTE(review): these narrowing casts are unchecked
+  const auto ildza = static_cast<int32_t>(ldza); // leading dimension of Za, 32-bit copy
+  const auto ildzb = static_cast<int32_t>(ldzb); // leading dimension of Zb, 32-bit copy
+
+  blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Za, ildza); // seed Za with the full inbf x inpts basis_eval matrix
+  blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Zb, ildzb); // seed Zb with the same basis_eval matrix
   for( int32_t i = 0; i < (int32_t)npts; ++i ) {
   //eq. 56 https://doi.org/10.1140/epjb/e2018-90170-1
-  GauXC::blas::scal( nbf, 0.5 * vrho[2*i], Za + i*ldza, 1 );
-  GauXC::blas::scal( nbf, 0.5 * vrho[2*i+1], Zb + i*ldzb, 1 );
+  GauXC::blas::scal( inbf, 0.5 * vrho[2*i], Za + i*ldza, 1 ); // scale column i of Za by 0.5*vrho[2*i]; offset uses the size_t ldza
+  GauXC::blas::scal( inbf, 0.5 * vrho[2*i+1], Zb + i*ldzb, 1 ); // scale column i of Zb by 0.5*vrho[2*i+1]; offset uses the size_t ldzb
   }
 }
 
@@ -1312,11 +1380,15 @@ void ReferenceLocalHostWorkDriver::eval_Bvec_gga_vxc_rks_ts( size_t npts, const
 void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_rks_ts( size_t npts, size_t nbf,
   const double* A, const double* B, const double* basis_eval,
   const double* dbasis_x_eval, const double* dbasis_y_eval,
-  const double* dbasis_z_eval, double* Z, 
+  const double* dbasis_z_eval, double* Z,
   size_t ldz) {
 
+  const auto inbf  = static_cast<int32_t>(nbf);
+  const auto inpts = static_cast<int32_t>(npts);
+  const auto ildz  = static_cast<int32_t>(ldz);
+
   if( ldz != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
-  blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Z, ldz);
+  blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Z, ildz);
 
   for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
@@ -1327,11 +1399,11 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_rks_ts( size_t npts, size_t
     auto* bf_y_col = dbasis_y_eval + ioff;
     auto* bf_z_col = dbasis_z_eval + ioff;
 
-    GauXC::blas::scal( nbf, 0.5*A[i], z_col, 1 ); 
+    GauXC::blas::scal( inbf, 0.5*A[i], z_col, 1 );
 
-    blas::axpy( nbf, B[i*3],   bf_x_col, 1, z_col, 1 );
-    blas::axpy( nbf, B[i*3+1], bf_y_col, 1, z_col, 1 );
-    blas::axpy( nbf, B[i*3+2], bf_z_col, 1, z_col, 1 );
+    blas::axpy( inbf, B[i*3],   bf_x_col, 1, z_col, 1 );
+    blas::axpy( inbf, B[i*3+1], bf_y_col, 1, z_col, 1 );
+    blas::axpy( inbf, B[i*3+2], bf_z_col, 1, z_col, 1 );
 
   }
 }
@@ -1367,14 +1439,18 @@ void ReferenceLocalHostWorkDriver::eval_Bvec_gga_vxc_uks_ts( size_t npts, const
 void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbf,
   const double* A, const double* B, const double* basis_eval,
   const double* dbasis_x_eval, const double* dbasis_y_eval,
-  const double* dbasis_z_eval, double* Za, 
+  const double* dbasis_z_eval, double* Za,
   size_t ldza, double* Zb, size_t ldzb ) {
 
+  const auto inbf  = static_cast<int32_t>(nbf);
+  const auto inpts = static_cast<int32_t>(npts);
+  const auto ildza = static_cast<int32_t>(ldza);
+  const auto ildzb = static_cast<int32_t>(ldzb);
 
   if( ldza != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
   if( ldzb != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
-  blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Za, ldza);
-  blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zb, ldzb);
+  blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Za, ildza);
+  blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Zb, ildzb);
 
   for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
@@ -1386,16 +1462,16 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t
     auto* bf_y_col = dbasis_y_eval + ioff;
     auto* bf_z_col = dbasis_z_eval + ioff;
 
-    GauXC::blas::scal( nbf, 0.5*A[2*i], za_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficent implementation of .. "
-    GauXC::blas::scal( nbf, 0.5*A[2*i+1], zb_col, 1 );
+    GauXC::blas::scal( inbf, 0.5*A[2*i], za_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficient implementation of .. "
+    GauXC::blas::scal( inbf, 0.5*A[2*i+1], zb_col, 1 );
 
-    blas::axpy( nbf, B[i*6],   bf_x_col, 1, za_col, 1 );
-    blas::axpy( nbf, B[i*6+1], bf_y_col, 1, za_col, 1 );
-    blas::axpy( nbf, B[i*6+2], bf_z_col, 1, za_col, 1 );
+    blas::axpy( inbf, B[i*6],   bf_x_col, 1, za_col, 1 );
+    blas::axpy( inbf, B[i*6+1], bf_y_col, 1, za_col, 1 );
+    blas::axpy( inbf, B[i*6+2], bf_z_col, 1, za_col, 1 );
 
-    blas::axpy( nbf, B[i*6+3], bf_x_col, 1, zb_col, 1 );
-    blas::axpy( nbf, B[i*6+4], bf_y_col, 1, zb_col, 1 );
-    blas::axpy( nbf, B[i*6+5], bf_z_col, 1, zb_col, 1 );
+    blas::axpy( inbf, B[i*6+3], bf_x_col, 1, zb_col, 1 );
+    blas::axpy( inbf, B[i*6+4], bf_y_col, 1, zb_col, 1 );
+    blas::axpy( inbf, B[i*6+5], bf_z_col, 1, zb_col, 1 );
 
   }
 }
@@ -1405,14 +1481,18 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t
   const double* vrho, const double* vgamma, const double* basis_eval,
   const double* dbasis_x_eval, const double* dbasis_y_eval,
   const double* dbasis_z_eval, const double* dden_x_eval,
-  const double* dden_y_eval, const double* dden_z_eval, double* Za, 
+  const double* dden_y_eval, const double* dden_z_eval, double* Za,
   size_t ldza, double* Zb, size_t ldzb ) {
 
+  const auto inbf  = static_cast<int32_t>(nbf);
+  const auto inpts = static_cast<int32_t>(npts);
+  const auto ildza = static_cast<int32_t>(ldza);
+  const auto ildzb = static_cast<int32_t>(ldzb);
 
   if( ldza != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
   if( ldzb != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
-  blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Za, ldza);
-  blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zb, ldzb);
+  blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Za, ildza);
+  blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Zb, ildzb);
 
   for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
@@ -1424,8 +1504,8 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t
     auto* bf_y_col = dbasis_y_eval + ioff;
     auto* bf_z_col = dbasis_z_eval + ioff;
 
-    GauXC::blas::scal( nbf, 0.5*vrho[2*i], za_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficent implementation of .. "
-    GauXC::blas::scal( nbf, 0.5*vrho[2*i+1], zb_col, 1 );
+    GauXC::blas::scal( inbf, 0.5*vrho[2*i], za_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficient implementation of .. "
+    GauXC::blas::scal( inbf, 0.5*vrho[2*i+1], zb_col, 1 );
 
     const auto gga_fact_aa = vgamma[3*i];
     const auto gga_fact_ab = vgamma[3*i+1];
@@ -1448,30 +1528,35 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t
     const auto y_fact_b = 2 * gga_fact_bb * dden_y_eval_b + gga_fact_ab * dden_y_eval_a;
     const auto z_fact_b = 2 * gga_fact_bb * dden_z_eval_b + gga_fact_ab * dden_z_eval_a;
 
-    blas::axpy( nbf, x_fact_a, bf_x_col, 1, za_col, 1 );
-    blas::axpy( nbf, y_fact_a, bf_y_col, 1, za_col, 1 );
-    blas::axpy( nbf, z_fact_a, bf_z_col, 1, za_col, 1 );
+    blas::axpy( inbf, x_fact_a, bf_x_col, 1, za_col, 1 );
+    blas::axpy( inbf, y_fact_a, bf_y_col, 1, za_col, 1 );
+    blas::axpy( inbf, z_fact_a, bf_z_col, 1, za_col, 1 );
 
-    blas::axpy( nbf, x_fact_b, bf_x_col, 1, zb_col, 1 );
-    blas::axpy( nbf, y_fact_b, bf_y_col, 1, zb_col, 1 );
-    blas::axpy( nbf, z_fact_b, bf_z_col, 1, zb_col, 1 );
+    blas::axpy( inbf, x_fact_b, bf_x_col, 1, zb_col, 1 );
+    blas::axpy( inbf, y_fact_b, bf_y_col, 1, zb_col, 1 );
+    blas::axpy( inbf, z_fact_b, bf_z_col, 1, zb_col, 1 );
 
   }
 }
 
 void ReferenceLocalHostWorkDriver::eval_zmat_mgga_vxc_uks_ts( size_t npts, size_t nbf,
-              const double* vrho, const double* vgamma, const double* vlapl, 
+              const double* vrho, const double* vgamma, const double* vlapl,
         const double* basis_eval,
               const double* dbasis_x_eval, const double* dbasis_y_eval,
               const double* dbasis_z_eval, const double* lbasis_eval,
         const double* dden_x_eval,
-              const double* dden_y_eval, const double* dden_z_eval, double* Za, 
+              const double* dden_y_eval, const double* dden_z_eval, double* Za,
               size_t ldza, double* Zb, size_t ldzb ) {
 
+  const auto inbf  = static_cast<int32_t>(nbf);
+  const auto inpts = static_cast<int32_t>(npts);
+  const auto ildza = static_cast<int32_t>(ldza);
+  const auto ildzb = static_cast<int32_t>(ldzb);
+
   if( ldza != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
   if( ldzb != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
-  blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Za, ldza);
-  blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zb, ldzb);
+  blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Za, ildza);
+  blas::lacpy( 'A', inbf, inpts, basis_eval, inbf, Zb, ildzb);
 
   for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
@@ -1484,9 +1569,9 @@ void ReferenceLocalHostWorkDriver::eval_zmat_mgga_vxc_uks_ts( size_t npts, size_
     auto* bf_z_col = dbasis_z_eval + ioff;
     auto* lbf_col = lbasis_eval + ioff;
 
-    GauXC::blas::scal( nbf, 0.5*vrho[2*i], za_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficent implementation of .. "
-    GauXC::blas::scal( nbf, 0.5*vrho[2*i+1], zb_col, 1 );
-    
+    GauXC::blas::scal( inbf, 0.5*vrho[2*i], za_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficient implementation of .. "
+    GauXC::blas::scal( inbf, 0.5*vrho[2*i+1], zb_col, 1 );
+
     // dden_x_eval, dden_y_eval, dden_z_eval are all still in Pauli representation
     // so we need to convert them to the two spinor representation
     const auto dden_x_eval_a = 0.5 * (dden_x_eval[2*i] + dden_x_eval[2*i+1]);
@@ -1495,7 +1580,7 @@ void ReferenceLocalHostWorkDriver::eval_zmat_mgga_vxc_uks_ts( size_t npts, size_
     const auto dden_y_eval_b = 0.5 * (dden_y_eval[2*i] - dden_y_eval[2*i+1]);
     const auto dden_z_eval_a = 0.5 * (dden_z_eval[2*i] + dden_z_eval[2*i+1]);
     const auto dden_z_eval_b = 0.5 * (dden_z_eval[2*i] - dden_z_eval[2*i+1]);
-    
+
     const auto gga_fact_aa = vgamma[3*i];
     const auto gga_fact_ab = vgamma[3*i+1];
     const auto gga_fact_bb = vgamma[3*i+2];
@@ -1508,37 +1593,42 @@ void ReferenceLocalHostWorkDriver::eval_zmat_mgga_vxc_uks_ts( size_t npts, size_
     const auto y_fact_b = 2 * gga_fact_bb * dden_y_eval_b + gga_fact_ab * dden_y_eval_a;
     const auto z_fact_b = 2 * gga_fact_bb * dden_z_eval_b + gga_fact_ab * dden_z_eval_a;
 
-    blas::axpy( nbf, x_fact_a, bf_x_col, 1, za_col, 1 );
-    blas::axpy( nbf, y_fact_a, bf_y_col, 1, za_col, 1 );
-    blas::axpy( nbf, z_fact_a, bf_z_col, 1, za_col, 1 );
+    blas::axpy( inbf, x_fact_a, bf_x_col, 1, za_col, 1 );
+    blas::axpy( inbf, y_fact_a, bf_y_col, 1, za_col, 1 );
+    blas::axpy( inbf, z_fact_a, bf_z_col, 1, za_col, 1 );
 
-    blas::axpy( nbf, x_fact_b, bf_x_col, 1, zb_col, 1 );
-    blas::axpy( nbf, y_fact_b, bf_y_col, 1, zb_col, 1 );
-    blas::axpy( nbf, z_fact_b, bf_z_col, 1, zb_col, 1 );
+    blas::axpy( inbf, x_fact_b, bf_x_col, 1, zb_col, 1 );
+    blas::axpy( inbf, y_fact_b, bf_y_col, 1, zb_col, 1 );
+    blas::axpy( inbf, z_fact_b, bf_z_col, 1, zb_col, 1 );
 
     if (vlapl != nullptr) {
-      blas::axpy( nbf, vlapl[2*i],     lbf_col, 1, za_col, 1);
-      blas::axpy( nbf, vlapl[2*i + 1], lbf_col, 1, zb_col, 1);
+      blas::axpy( inbf, vlapl[2*i],     lbf_col, 1, za_col, 1);
+      blas::axpy( inbf, vlapl[2*i + 1], lbf_col, 1, zb_col, 1);
     }
 
   }
 }
-void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t nbf, 
-        const double* vtau, const double* vlapl, 
-        const double* dbasis_x_eval, const double* dbasis_y_eval, 
+void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t nbf,
+        const double* vtau, const double* vlapl,
+        const double* dbasis_x_eval, const double* dbasis_y_eval,
         const double* dbasis_z_eval,
         double* mmat_xa, double* mmat_ya, double* mmat_za, size_t ldma,
         double* mmat_xb, double* mmat_yb, double* mmat_zb, size_t ldmb) {
 
+  const auto inbf  = static_cast<int32_t>(nbf);
+  const auto inpts = static_cast<int32_t>(npts);
+  const auto ildma = static_cast<int32_t>(ldma);
+  const auto ildmb = static_cast<int32_t>(ldmb);
+
   if( ldma != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
   if( ldmb != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims"));
-  
-  blas::lacpy( 'A', nbf, npts, dbasis_x_eval, nbf, mmat_xa, ldma);
-  blas::lacpy( 'A', nbf, npts, dbasis_y_eval, nbf, mmat_ya, ldma);
-  blas::lacpy( 'A', nbf, npts, dbasis_z_eval, nbf, mmat_za, ldma);
-  blas::lacpy( 'A', nbf, npts, dbasis_x_eval, nbf, mmat_xb, ldmb);
-  blas::lacpy( 'A', nbf, npts, dbasis_y_eval, nbf, mmat_yb, ldmb);
-  blas::lacpy( 'A', nbf, npts, dbasis_z_eval, nbf, mmat_zb, ldmb);
+
+  blas::lacpy( 'A', inbf, inpts, dbasis_x_eval, inbf, mmat_xa, ildma);
+  blas::lacpy( 'A', inbf, inpts, dbasis_y_eval, inbf, mmat_ya, ildma);
+  blas::lacpy( 'A', inbf, inpts, dbasis_z_eval, inbf, mmat_za, ildma);
+  blas::lacpy( 'A', inbf, inpts, dbasis_x_eval, inbf, mmat_xb, ildmb);
+  blas::lacpy( 'A', inbf, inpts, dbasis_y_eval, inbf, mmat_yb, ildmb);
+  blas::lacpy( 'A', inbf, inpts, dbasis_z_eval, inbf, mmat_zb, ildmb);
 
   for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
@@ -1556,22 +1646,22 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
     const auto tfacta = 0.25 * vtau[2*i];
     const auto tfactb = 0.25 * vtau[2*i+1];
 
-    blas::scal( nbf, tfacta, xa_col, 1);
-    blas::scal( nbf, tfacta, ya_col, 1);
-    blas::scal( nbf, tfacta, za_col, 1);
-    blas::scal( nbf, tfactb, xb_col, 1);
-    blas::scal( nbf, tfactb, yb_col, 1);
-    blas::scal( nbf, tfactb, zb_col, 1);
+    blas::scal( inbf, tfacta, xa_col, 1);
+    blas::scal( inbf, tfacta, ya_col, 1);
+    blas::scal( inbf, tfacta, za_col, 1);
+    blas::scal( inbf, tfactb, xb_col, 1);
+    blas::scal( inbf, tfactb, yb_col, 1);
+    blas::scal( inbf, tfactb, zb_col, 1);
 
     if ( vlapl != nullptr ) {
       const auto lfacta = vlapl[2*i];
       const auto lfactb = vlapl[2*i+1];
-      blas::axpy( nbf, lfacta, bf_x_col, 1, xa_col, 1);
-      blas::axpy( nbf, lfacta, bf_y_col, 1, ya_col, 1);
-      blas::axpy( nbf, lfacta, bf_z_col, 1, za_col, 1);
-      blas::axpy( nbf, lfactb, bf_x_col, 1, xb_col, 1);
-      blas::axpy( nbf, lfactb, bf_y_col, 1, yb_col, 1);
-      blas::axpy( nbf, lfactb, bf_z_col, 1, zb_col, 1);
+      blas::axpy( inbf, lfacta, bf_x_col, 1, xa_col, 1);
+      blas::axpy( inbf, lfacta, bf_y_col, 1, ya_col, 1);
+      blas::axpy( inbf, lfacta, bf_z_col, 1, za_col, 1);
+      blas::axpy( inbf, lfactb, bf_x_col, 1, xb_col, 1);
+      blas::axpy( inbf, lfactb, bf_y_col, 1, yb_col, 1);
+      blas::axpy( inbf, lfactb, bf_z_col, 1, zb_col, 1);
     }
 
   }
@@ -1587,7 +1677,11 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
 					      const double* basis_eval, const submat_map_t& submat_map, const double* Z,
 					      size_t ldz, double* VXC, size_t ldvxc, double* scr ) {
 
-      blas::syr2k('L', 'N', nbe, npts, 1., basis_eval, nbe, Z, ldz, 0., scr, nbe );
+      const auto inbe  = static_cast<int32_t>(nbe);
+      const auto inpts = static_cast<int32_t>(npts);
+      const auto ildz  = static_cast<int32_t>(ldz);
+
+      blas::syr2k('L', 'N', inbe, inpts, 1., basis_eval, inbe, Z, ildz, 0., scr, inbe );
 
       detail::inc_by_submat_atomic( nbf, nbf, nbe, nbe, VXC, ldvxc, scr, nbe, submat_map );
 
@@ -1599,8 +1693,13 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
 						const submat_map_t& submat_map_bra, const submat_map_t& submat_map_ket, 
 						const double* G, size_t ldg, double* K, size_t ldk, double* scr ) {
 
-      blas::gemm( 'N', 'T', nbe_bra, nbe_ket, npts, 1., basis_eval, nbe_bra,
-		  G, ldg, 0., scr, nbe_bra );
+      const auto inbe_bra = static_cast<int32_t>(nbe_bra);
+      const auto inbe_ket = static_cast<int32_t>(nbe_ket);
+      const auto inpts    = static_cast<int32_t>(npts);
+      const auto ildg     = static_cast<int32_t>(ldg);
+
+      blas::gemm( 'N', 'T', inbe_bra, inbe_ket, inpts, 1., basis_eval, inbe_bra,
+		  G, ildg, 0., scr, inbe_bra );
 
       detail::inc_by_submat_atomic( nbf, nbf, nbe_bra, nbe_ket, K, ldk, scr, nbe_bra, 
 			     submat_map_bra, submat_map_ket );
@@ -1615,6 +1714,11 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
 						    const double* basis_eval, size_t ldb, double* F, size_t ldf,
 						    double* scr ) {
 
+    const auto inbe_bra = static_cast<int32_t>(nbe_bra);
+    const auto inbe_ket = static_cast<int32_t>(nbe_ket);
+    const auto inpts    = static_cast<int32_t>(npts);
+    const auto ildb     = static_cast<int32_t>(ldb);
+    const auto ildf     = static_cast<int32_t>(ldf);
     const auto* P_use = P;
     size_t ldp_use = ldp;
 
@@ -1627,8 +1731,8 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
       P_use = P + submat_map_ket[0][0]*ldp + submat_map_bra[0][0];
     }
 
-    blas::gemm( 'N', 'N', nbe_bra, npts, nbe_ket, 1., P_use, ldp_use, basis_eval,
-		ldb, 0., F, ldf );
+    blas::gemm( 'N', 'N', inbe_bra, inpts, inbe_ket, 1., P_use, static_cast<int32_t>(ldp_use), basis_eval,
+		ildb, 0., F, ildf );
 
   }
 
@@ -1642,7 +1746,9 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
 
     util::unused(basis_map);
 
-    // Cast points to Rys format (binary compatable)
+    const auto inpts = static_cast<int32_t>(npts);
+    const auto ildx  = static_cast<int32_t>(ldx);
+    const auto ildg  = static_cast<int32_t>(ldg);
     XCPU::point* _points = 
       reinterpret_cast<XCPU::point*>(const_cast<double*>(points));
     std::vector<double> _points_transposed(3 * npts);
@@ -1667,8 +1773,9 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
     const bool any_pure = std::any_of( shell_list, shell_list + nshells,
 				       [&](const auto& i){ return basis.at(i).pure(); } );
     
-    const size_t nbe_cart = 
+    const size_t nbe_cart =
       basis.nbf_cart_subset( shell_list, shell_list + nshells );
+    const auto inbe_cart = static_cast<int32_t>(nbe_cart);
 
     std::vector<double> X_cart, G_cart;
     if( any_pure ){
@@ -1689,8 +1796,8 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
           sph_trans.itform_bra_cm( shell_l, npts, X + ioff, ldx,
         			   X_cart.data() + ioff_cart, nbe_cart );
         } else {
-          blas::lacpy( 'A', shell_sz, npts, X + ioff, ldx,
-        	       X_cart.data() + ioff_cart, nbe_cart );
+          blas::lacpy( 'A', shell_sz, inpts, X + ioff, ildx,
+        	       X_cart.data() + ioff_cart, inbe_cart );
         }
         ioff += shell_sz;
         ioff_cart += shell_cart_sz;
@@ -1808,8 +1915,8 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
           sph_trans.tform_bra_cm( shell_l, npts, G_cart.data() + ioff_cart, nbe_cart,
         			  G + ioff, ldg );
         } else {
-          blas::lacpy( 'A', shell_sz, npts, G_cart.data() + ioff_cart, nbe_cart,
-        	       G + ioff, ldg );
+          blas::lacpy( 'A', shell_sz, inpts, G_cart.data() + ioff_cart, inbe_cart,
+        	       G + ioff, ildg );
         }
         ioff += shell_sz;
         ioff_cart += shell_cart_sz;
diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi.hpp
index 211a4abb4..a3d889335 100644
--- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi.hpp
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi.hpp
@@ -71,7 +71,7 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
   const auto& mol   = this->load_balancer_->molecule();
 
   // Atom-specific data
-  int natom = mol.size();
+  int natom = static_cast<int>(mol.size());
   std::vector<double> radii(natom);
   for (int i = 0; i < natom; ++i) {
     radii[i] = uff_radius_103(mol[i].Z);
@@ -114,9 +114,9 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
     const auto& task = tasks[iT];
 
     // Get tasks constants
-    const int32_t  npts    = task.points.size();
+    const int32_t  npts    = static_cast<int32_t>(task.points.size());
     const int32_t  nbe     = task.bfn_screening.nbe;
-    const int32_t  nshells = task.bfn_screening.shell_list.size();
+    const int32_t  nshells = static_cast<int32_t>(task.bfn_screening.shell_list.size());
 
     const auto* points      = task.points.data()->data();
     const auto* weights     = task.weights.data();
diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi_potential.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi_potential.hpp
index 58b9eddf6..2b99140fa 100644
--- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi_potential.hpp
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi_potential.hpp
@@ -36,7 +36,7 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
   const auto& mol = this->load_balancer_->molecule();
   const size_t natom = mol.size();
   const size_t nharmonics = (max_Ylm + 1) * (max_Ylm + 1);
-  if (m != nharmonics || n != natom) {
+  if (m != static_cast<int64_t>(nharmonics) || n != static_cast<int64_t>(natom)) {
     GAUXC_GENERIC_EXCEPTION("m must be nharmonics and n must be natom");
   }
   // Get Tasks
@@ -111,9 +111,9 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
     const auto& task = tasks[iT];
 
     // Get tasks constants
-    const int32_t  npts    = task.points.size();
+    const int32_t  npts    = static_cast<int32_t>(task.points.size());
     const int32_t  nbe     = task.bfn_screening.nbe;
-    const int32_t  nshells = task.bfn_screening.shell_list.size();
+    const int32_t  nshells = static_cast<int32_t>(task.bfn_screening.shell_list.size());
 
     const auto* points      = task.points.data()->data();
     const auto* weights     = task.weights.data();
diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp
index f04ae24b7..64c92995c 100644
--- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp
@@ -52,7 +52,7 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
     if( not this->reduction_driver_->takes_host_memory() )
       GAUXC_GENERIC_EXCEPTION("This Module Only Works With Host Reductions");
 
-    const int natoms = this->load_balancer_->molecule().natoms();
+    const int natoms = static_cast<int>(this->load_balancer_->molecule().natoms());
     this->reduction_driver_->allreduce_inplace( EXC_GRAD, 3*natoms, ReductionOp::Sum );
   });
 
@@ -61,7 +61,7 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
 
 template <typename ValueType>
 void ReferenceReplicatedXCHostIntegrator<ValueType>::
-  eval_exc_grad_( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, 
+  eval_exc_grad_( int64_t m, int64_t n, const value_type* Ps, int64_t ldps,
                   const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) { 
                  
                  
@@ -94,7 +94,7 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
     if( not this->reduction_driver_->takes_host_memory() )
       GAUXC_GENERIC_EXCEPTION("This Module Only Works With Host Reductions");
 
-    const int natoms = this->load_balancer_->molecule().natoms();
+    const int natoms = static_cast<int>(this->load_balancer_->molecule().natoms());
     this->reduction_driver_->allreduce_inplace( EXC_GRAD, 3*natoms, ReductionOp::Sum );
   });
 
@@ -132,7 +132,7 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
   BasisSetMap basis_map(basis,mol);
 
   const int32_t nbf = basis.nbf();
-  const int32_t natoms = mol.natoms();
+  const int32_t natoms = static_cast<int32_t>(mol.natoms());
 
   // Sort tasks on size (XXX: maybe doesnt matter?)
   auto task_comparator = []( const XCTask& a, const XCTask& b ) {
@@ -169,9 +169,9 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
     auto& task = tasks[iT];
 
     // Get tasks constants
-    const int32_t  npts    = task.points.size();
+    const int32_t  npts    = static_cast<int32_t>(task.points.size());
     const int32_t  nbe     = task.bfn_screening.nbe;
-    const int32_t  nshells = task.bfn_screening.shell_list.size();
+    const int32_t  nshells = static_cast<int32_t>(task.bfn_screening.shell_list.size());
     const size_t spin_dim_scal = is_rks ? 1 : 2; // last case is_uks
     const size_t gga_dim_scal = is_rks ? 1 : 3;
 
@@ -414,7 +414,7 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
       }
 
       double g_acc_x(0), g_acc_y(0), g_acc_z(0);
-      for( int ibf = 0, mu = bf_off; ibf < sh_sz; ++ibf, ++mu )
+      for( int ibf = 0, mu = static_cast<int>(bf_off); ibf < sh_sz; ++ibf, ++mu )
       for( int ipt = 0; ipt < npts; ++ipt ) {
 
         const int32_t mu_i = mu + ipt*nbe;
diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp
index 141085c9f..cd2cc7ff7 100644
--- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp
@@ -215,9 +215,9 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
     const auto& task = *(task_begin + iT);
 
     // Get tasks constants
-    const int32_t  npts    = task.points.size();
+    const int32_t  npts    = static_cast<int32_t>(task.points.size());
     const int32_t  nbe     = task.bfn_screening.nbe;
-    const int32_t  nshells = task.bfn_screening.shell_list.size();
+    const int32_t  nshells = static_cast<int32_t>(task.bfn_screening.shell_list.size());
 
     const auto* points      = task.points.data()->data();
     const auto* weights     = task.weights.data();
diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp
index 7cce12dea..fd9cbf0a5 100644
--- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp
@@ -460,7 +460,7 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
       gen_compressed_submat_map( basis_map, ek_shell_list, nbf, nbf );
 
     // Get tasks constants
-    const int32_t  npts    = task.points.size();
+    const int32_t  npts    = static_cast<int32_t>(task.points.size());
 
     const auto* points      = task.points.data()->data();
     const auto* weights     = task.weights.data();
diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp
index 3dd43f4da..91671849f 100644
--- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp
+++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp
@@ -375,19 +375,19 @@ void ShellBatchedReplicatedXCIntegrator<BaseIntegratorType, IncoreIntegratorType
       basis.nbf(), basis.nbf() );
 
   this->timer_.time_op_accumulate("XCIntegrator.ExtractSubDensity",[&]() {
-    detail::submat_set( basis.nbf(), basis.nbf(), nbe, nbe, Ps, ldps, 
-                        Ps_submat, nbe, union_submat_cut );
+    detail::submat_set( static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(nbe), static_cast<int32_t>(nbe), Ps, static_cast<int32_t>(ldps),
+                        Ps_submat, static_cast<int32_t>(nbe), union_submat_cut );
     if(Pz)
-    detail::submat_set( basis.nbf(), basis.nbf(), nbe, nbe, Pz, ldpz, 
-                        Pz_submat, nbe, union_submat_cut );
+    detail::submat_set( static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(nbe), static_cast<int32_t>(nbe), Pz, static_cast<int32_t>(ldpz),
+                        Pz_submat, static_cast<int32_t>(nbe), union_submat_cut );
 
     if(Py)
-    detail::submat_set( basis.nbf(), basis.nbf(), nbe, nbe, Py, ldpy, 
-                        Py_submat, nbe, union_submat_cut );
+    detail::submat_set( static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(nbe), static_cast<int32_t>(nbe), Py, static_cast<int32_t>(ldpy),
+                        Py_submat, static_cast<int32_t>(nbe), union_submat_cut );
 
     if(Px)
-    detail::submat_set( basis.nbf(), basis.nbf(), nbe, nbe, Px, ldpx, 
-                        Px_submat, nbe, union_submat_cut );
+    detail::submat_set( static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(nbe), static_cast<int32_t>(nbe), Px, static_cast<int32_t>(ldpx),
+                        Px_submat, static_cast<int32_t>(nbe), union_submat_cut );
   } );
 
 
@@ -415,20 +415,20 @@ void ShellBatchedReplicatedXCIntegrator<BaseIntegratorType, IncoreIntegratorType
   *N_EL += NEL_tmp;
   this->timer_.time_op_accumulate("XCIntegrator.IncrementSubPotential",[&]() {
     if(VXCs)
-    detail::inc_by_submat( basis.nbf(), basis.nbf(), nbe, nbe, VXCs, ldvxcs, 
-                           VXCs_submat, nbe, union_submat_cut );
+    detail::inc_by_submat( static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(nbe), static_cast<int32_t>(nbe), VXCs, static_cast<int32_t>(ldvxcs),
+                           VXCs_submat, static_cast<int32_t>(nbe), union_submat_cut );
 
     if(VXCz)
-    detail::inc_by_submat( basis.nbf(), basis.nbf(), nbe, nbe, VXCz, ldvxcz, 
-                           VXCz_submat, nbe, union_submat_cut );
+    detail::inc_by_submat( static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(nbe), static_cast<int32_t>(nbe), VXCz, static_cast<int32_t>(ldvxcz),
+                           VXCz_submat, static_cast<int32_t>(nbe), union_submat_cut );
 
     if(VXCy)
-    detail::inc_by_submat( basis.nbf(), basis.nbf(), nbe, nbe, VXCy, ldvxcy, 
-                           VXCy_submat, nbe, union_submat_cut );
+    detail::inc_by_submat( static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(nbe), static_cast<int32_t>(nbe), VXCy, static_cast<int32_t>(ldvxcy),
+                           VXCy_submat, static_cast<int32_t>(nbe), union_submat_cut );
 
     if(VXCx)
-    detail::inc_by_submat( basis.nbf(), basis.nbf(), nbe, nbe, VXCx, ldvxcx, 
-                           VXCx_submat, nbe, union_submat_cut );
+    detail::inc_by_submat( static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(basis.nbf()), static_cast<int32_t>(nbe), static_cast<int32_t>(nbe), VXCx, static_cast<int32_t>(ldvxcx),
+                           VXCx_submat, static_cast<int32_t>(nbe), union_submat_cut );
   });
 
 

From 8d5863360a0e6c3fd450ec9a1c5a17be74144da8 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Tue, 9 Jun 2026 10:47:33 +0200
Subject: [PATCH 19/52] Fix C4389/C4242: suppress MSVC STL warnings; ignore
 build-* dirs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

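C4389 (signed/unsigned ==) and C4242 (int->char) fire inside MSVC's own
xutility/algorithm headers when compiled with /W3 — not in our source.
Suppress both with /wd4389 /wd4242 (PRIVATE, not propagated to consumers).
These two flags replace the previous /wd4101 and /wd5219 entries in the
same target_compile_options block, which are removed by this change.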

Also add build*/ to .gitignore to cover build-msvc/ and build-clang-cl/.
---
 .gitignore         | 1 +
 src/CMakeLists.txt | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index be531abc6..8e9bceba1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,5 +10,6 @@ src/xc_integrator/local_work_driver/host/obara_saika/generator/*.x
 
 # Build directories
 build/
+build*/
 _build/
 cmake-build-*/
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 236b30bb4..4845850a2 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -96,8 +96,8 @@ if(MSVC)
   else()
     target_compile_options( gauxc PUBLIC /EHsc )
     target_compile_options( gauxc PRIVATE
-      /wd4101  # unreferenced local variable
-      /wd5219  # implicit conversion from int-type to float-type
+      /wd4242  # 'conversion': possible loss of data (int->char in STL algorithm)
+      /wd4389  # '==': signed/unsigned mismatch (in MSVC xutility)
     )
   endif()
 endif()

From 632d541b0352e7d562b2d5afd1e5a93bd8279e3c Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Tue, 9 Jun 2026 10:47:46 +0200
Subject: [PATCH 20/52] Fix C4267 size_t narrowing warnings (round 2)

- xc_task.hpp: cast points.size() to int32_t in both merge_with overloads
- basisset.hpp: cast size() to int32_t in nshells() return
- misc.hpp: cast std::min(A.size(), B.size()) to uint32_t
- integrate_den.hpp: cast task sizes to int32_t
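- reference_local_host_work_driver.cxx: add inbf locals and cast npts/nbe_cart
  and leading dimensions at sph_trans/submat/inc_by_submat_atomic call
  boundaries; cast nprim_pair to int and pass inpts at the
  compute_integral_shell_pair call; fix ioff = i*nbf
  (size_t) -> i*inbf (int32_t*int32_t)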
---
 include/gauxc/basisset.hpp                    |  2 +-
 include/gauxc/util/misc.hpp                   |  2 +-
 include/gauxc/xc_task.hpp                     |  5 +-
 .../host/reference_local_host_work_driver.cxx | 52 ++++++++++---------
 ...cated_xc_host_integrator_integrate_den.hpp |  4 +-
 5 files changed, 34 insertions(+), 31 deletions(-)

diff --git a/include/gauxc/basisset.hpp b/include/gauxc/basisset.hpp
index c0c0f8396..3df7541fd 100644
--- a/include/gauxc/basisset.hpp
+++ b/include/gauxc/basisset.hpp
@@ -68,7 +68,7 @@ struct BasisSet : public std::vector<Shell<F>> {
    *
    *  @returns the number of GTO shells which comprise the BasisSet object
    */
-  inline int32_t nshells() const { return this->size(); }; 
+  inline int32_t nshells() const { return static_cast<int32_t>(this->size()); }; 
 
   /**
    *  @brief Return the number of GTO basis functions which comprise the 
diff --git a/include/gauxc/util/misc.hpp b/include/gauxc/util/misc.hpp
index cf2ef8f04..998482efa 100644
--- a/include/gauxc/util/misc.hpp
+++ b/include/gauxc/util/misc.hpp
@@ -114,7 +114,7 @@ inline auto integral_list_intersect( const std::vector<Integral>& A,
                                      const std::vector<Integral>& B,
                                      const uint32_t overlap_threshold_spec ) {
 
-  const uint32_t max_intersect_sz  = std::min(A.size(), B.size());
+  const uint32_t max_intersect_sz  = static_cast<uint32_t>(std::min(A.size(), B.size()));
   const uint32_t overlap_threshold = std::min( max_intersect_sz, 
                                                overlap_threshold_spec );
 
diff --git a/include/gauxc/xc_task.hpp b/include/gauxc/xc_task.hpp
index 630d6dd6b..7bf38ad5f 100644
--- a/include/gauxc/xc_task.hpp
+++ b/include/gauxc/xc_task.hpp
@@ -66,7 +66,7 @@ struct XCTask {
       GAUXC_GENERIC_EXCEPTION("Cannot Perform Requested Merge: Incompatible Tasks");
     points.insert( points.end(), other.points.begin(), other.points.end() );
     weights.insert( weights.end(), other.weights.begin(), other.weights.end() );
-    npts = points.size();
+    npts = static_cast<int32_t>(points.size());
   }
 
   template <typename TaskIt>
@@ -91,10 +91,9 @@ struct XCTask {
       weights_it = std::copy( it->weights.begin(), it->weights.end(), weights_it );
     }
 
-    npts = points.size();
+    npts = static_cast<int32_t>(points.size());
   }
 
-
   inline bool equiv_with( const XCTask& other ) const {
     return iParent == other.iParent and 
       bfn_screening.equiv_with(other.bfn_screening);
diff --git a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx
index 262999f30..b89139d36 100644
--- a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx
+++ b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx
@@ -124,6 +124,7 @@ namespace GauXC {
 						const submat_map_t& submat_map, double fac, const double* P, size_t ldp, 
 						const double* basis_eval, size_t ldb, double* X, size_t ldx, double* scr ) {
     const auto inbe  = static_cast<int32_t>(nbe);
+    const auto inbf  = static_cast<int32_t>(nbf);
     const auto inpts = static_cast<int32_t>(npts);
     const auto ildb  = static_cast<int32_t>(ldb);
     const auto ildx  = static_cast<int32_t>(ldx);
@@ -131,7 +132,7 @@ namespace GauXC {
     size_t ldp_use = ldp;
 
     if( submat_map.size() > 1 ) {
-      detail::submat_set( nbf, nbf, nbe, nbe, P, ldp, scr, nbe, submat_map );
+      detail::submat_set( inbf, inbf, inbe, inbe, P, static_cast<int32_t>(ldp), scr, inbe, submat_map );
       P_use = scr;
       ldp_use = nbe;
     } else if( nbe != nbf ) {
@@ -687,7 +688,7 @@ void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_gks( size_t npts, size_t nb
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
-      const int32_t ioff = i * nbf;
+      const int32_t ioff = i * inbf;
 
       auto* z_col    = Z + ioff;
       auto* bf_x_col = dbasis_x_eval + ioff;
@@ -729,7 +730,7 @@ void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_gks( size_t npts, size_t nb
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
-      const int32_t ioff = i * nbf;
+      const int32_t ioff = i * inbf;
 
       auto* zs_col = Zs + ioff;
       auto* zz_col = Zz + ioff;
@@ -787,7 +788,7 @@ void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_gks( size_t npts, size_t nb
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
-      const int32_t ioff = i * nbf;
+      const int32_t ioff = i * inbf;
 
       auto* z_col    = Z + ioff;
       auto* bf_x_col = dbasis_x_eval + ioff;
@@ -837,7 +838,7 @@ void ReferenceLocalHostWorkDriver::eval_zmat_mgga_vxc_uks( size_t npts, size_t n
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
-      const int32_t ioff = i * nbf;
+      const int32_t ioff = i * inbf;
 
       auto* zs_col = Zs + ioff;
       auto* zz_col = Zz + ioff;
@@ -905,7 +906,7 @@ void ReferenceLocalHostWorkDriver::eval_zmat_mgga_vxc_uks( size_t npts, size_t n
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
-      const int32_t ioff = i * nbf;
+      const int32_t ioff = i * inbf;
       auto* mmat_x_col = mmat_x + ioff;
       auto* mmat_y_col = mmat_y + ioff;
       auto* mmat_z_col = mmat_z + ioff;
@@ -952,7 +953,7 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks(size_t npts, size_t nb
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
-      const int32_t ioff = i * nbf;
+      const int32_t ioff = i * inbf;
       auto* xs_col = mmat_xs + ioff;
       auto* ys_col = mmat_ys + ioff;
       auto* zs_col = mmat_zs + ioff;
@@ -1026,7 +1027,7 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_gks( size_t npts, size_t nb
 
     for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
-      const int32_t ioff = i * nbf;
+      const int32_t ioff = i * inbf;
 
       auto* zs_col = Zs + ioff;
       auto* zz_col = Zz + ioff;
@@ -1392,7 +1393,7 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_rks_ts( size_t npts, size_t
 
   for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
-    const int32_t ioff = i * nbf;
+    const int32_t ioff = i * inbf;
 
     auto* z_col = Z + ioff;
     auto* bf_x_col = dbasis_x_eval + ioff;
@@ -1454,7 +1455,7 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t
 
   for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
-    const int32_t ioff = i * nbf;
+    const int32_t ioff = i * inbf;
 
     auto* za_col = Za + ioff;
     auto* zb_col = Zb + ioff;
@@ -1496,7 +1497,7 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t
 
   for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
-    const int32_t ioff = i * nbf;
+    const int32_t ioff = i * inbf;
 
     auto* za_col = Za + ioff;
     auto* zb_col = Zb + ioff;
@@ -1560,7 +1561,7 @@ void ReferenceLocalHostWorkDriver::eval_zmat_mgga_vxc_uks_ts( size_t npts, size_
 
   for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
-    const int32_t ioff = i * nbf;
+    const int32_t ioff = i * inbf;
 
     auto* za_col = Za + ioff;
     auto* zb_col = Zb + ioff;
@@ -1632,7 +1633,7 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
 
   for( int32_t i = 0; i < (int32_t)npts; ++i ) {
 
-    const int32_t ioff = i * nbf;
+    const int32_t ioff = i * inbf;
     auto* xa_col = mmat_xa + ioff;
     auto* ya_col = mmat_ya + ioff;
     auto* za_col = mmat_za + ioff;
@@ -1678,12 +1679,13 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
 					      size_t ldz, double* VXC, size_t ldvxc, double* scr ) {
 
       const auto inbe  = static_cast<int32_t>(nbe);
+      const auto inbf  = static_cast<int32_t>(nbf);
       const auto inpts = static_cast<int32_t>(npts);
       const auto ildz  = static_cast<int32_t>(ldz);
 
       blas::syr2k('L', 'N', inbe, inpts, 1., basis_eval, inbe, Z, ildz, 0., scr, inbe );
 
-      detail::inc_by_submat_atomic( nbf, nbf, nbe, nbe, VXC, ldvxc, scr, nbe, submat_map );
+      detail::inc_by_submat_atomic( inbf, inbf, inbe, inbe, VXC, static_cast<int32_t>(ldvxc), scr, inbe, submat_map );
 
   }
 
@@ -1695,13 +1697,14 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
 
       const auto inbe_bra = static_cast<int32_t>(nbe_bra);
       const auto inbe_ket = static_cast<int32_t>(nbe_ket);
+      const auto inbf     = static_cast<int32_t>(nbf);
       const auto inpts    = static_cast<int32_t>(npts);
       const auto ildg     = static_cast<int32_t>(ldg);
 
       blas::gemm( 'N', 'T', inbe_bra, inbe_ket, inpts, 1., basis_eval, inbe_bra,
 		  G, ildg, 0., scr, inbe_bra );
 
-      detail::inc_by_submat_atomic( nbf, nbf, nbe_bra, nbe_ket, K, ldk, scr, nbe_bra, 
+      detail::inc_by_submat_atomic( inbf, inbf, inbe_bra, inbe_ket, K, static_cast<int32_t>(ldk), scr, inbe_bra,
 			     submat_map_bra, submat_map_ket );
 
   }
@@ -1716,6 +1719,7 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
 
     const auto inbe_bra = static_cast<int32_t>(nbe_bra);
     const auto inbe_ket = static_cast<int32_t>(nbe_ket);
+    const auto inbf     = static_cast<int32_t>(nbf);
     const auto inpts    = static_cast<int32_t>(npts);
     const auto ildb     = static_cast<int32_t>(ldb);
     const auto ildf     = static_cast<int32_t>(ldf);
@@ -1723,8 +1727,8 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
     size_t ldp_use = ldp;
 
     if( submat_map_bra.size() > 1 or submat_map_ket.size() > 1 ) {
-      detail::submat_set( nbf, nbf, nbe_bra, nbe_ket, P, ldp,
-			  scr, nbe_bra, submat_map_bra, submat_map_ket );
+      detail::submat_set( inbf, inbf, inbe_bra, inbe_ket, P, static_cast<int32_t>(ldp),
+			  scr, inbe_bra, submat_map_bra, submat_map_ket );
       P_use = scr;
       ldp_use = nbe_bra;
     } else {
@@ -1793,8 +1797,8 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
         const int shell_cart_sz = shell.cart_size();
         
         if( shell.pure() and shell_l > 0 ) {
-          sph_trans.itform_bra_cm( shell_l, npts, X + ioff, ldx,
-        			   X_cart.data() + ioff_cart, nbe_cart );
+          sph_trans.itform_bra_cm( shell_l, inpts, X + ioff, ildx,
+        			   X_cart.data() + ioff_cart, inbe_cart );
         } else {
           blas::lacpy( 'A', shell_sz, inpts, X + ioff, ildx,
         	       X_cart.data() + ioff_cart, inbe_cart );
@@ -1887,9 +1891,9 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
       XCPU::compute_integral_shell_pair( ish == jsh,
       				   npts, _points_transposed.data(),
       				   bra.l(), ket.l(), bra_origin, ket_origin,
-      				   nprim_pair, prim_pair_data,
-      				   X_cart_rm.data()+ioff_cart, X_cart_rm.data()+joff_cart, npts,
-      				   G_cart_rm.data()+ioff_cart, G_cart_rm.data()+joff_cart, npts,
+      				   static_cast<int>(nprim_pair), prim_pair_data,
+      				   X_cart_rm.data()+ioff_cart, X_cart_rm.data()+joff_cart, inpts,
+      				   G_cart_rm.data()+ioff_cart, G_cart_rm.data()+joff_cart, inpts,
       				   const_cast<double*>(weights), this->boys_table );
     }
 #endif
@@ -1912,8 +1916,8 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
         const int shell_cart_sz = shell.cart_size();
         
         if( shell.pure() and shell_l > 0 ) {
-          sph_trans.tform_bra_cm( shell_l, npts, G_cart.data() + ioff_cart, nbe_cart,
-        			  G + ioff, ldg );
+          sph_trans.tform_bra_cm( shell_l, inpts, G_cart.data() + ioff_cart, inbe_cart,
+        			  G + ioff, ildg );
         } else {
           blas::lacpy( 'A', shell_sz, inpts, G_cart.data() + ioff_cart, inbe_cart,
         	       G + ioff, ildg );
diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_integrate_den.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_integrate_den.hpp
index e0ad145f5..5aa55d837 100644
--- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_integrate_den.hpp
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_integrate_den.hpp
@@ -108,9 +108,9 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
     const auto& task = tasks[iT];
 
     // Get tasks constants
-    const int32_t  npts    = task.points.size();
+    const int32_t  npts    = static_cast<int32_t>(task.points.size());
     const int32_t  nbe     = task.bfn_screening.nbe;
-    const int32_t  nshells = task.bfn_screening.shell_list.size();
+    const int32_t  nshells = static_cast<int32_t>(task.bfn_screening.shell_list.size());
 
     const auto* points      = task.points.data()->data();
     const auto* weights     = task.weights.data();

From 4f7818e26bde5c96a525917f3a9c4ab1ba55baf8 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Tue, 9 Jun 2026 10:47:55 +0200
Subject: [PATCH 21/52] Fix C4244 implicit-conversion warnings (round 2)

- real_solid_harmonics.hpp: cast intmax_t results of integral_*_factorial
  and binomial_coefficient to double at assignment sites
- basisset_map.hpp: cast ptrdiff_t result of std::distance to int32_t
- reference_replicated_xc_host_integrator_dd_psi.hpp: cast int64_t ldPsi
  to int at blas::gemm call sites
---
 include/gauxc/basisset_map.hpp                       |  2 +-
 include/gauxc/util/real_solid_harmonics.hpp          | 12 ++++++------
 ...eference_replicated_xc_host_integrator_dd_psi.hpp |  8 ++++----
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/gauxc/basisset_map.hpp b/include/gauxc/basisset_map.hpp
index 53f6d9d8f..e7a38ff6e 100644
--- a/include/gauxc/basisset_map.hpp
+++ b/include/gauxc/basisset_map.hpp
@@ -72,7 +72,7 @@ class BasisSetMap {
       auto at_pos = std::find_if( mol.begin(), mol.end(), [&](const Atom& at) { 
         return at.x == shell.O()[0] and at.y == shell.O()[1] and at.z == shell.O()[2];
       });
-      if( at_pos != mol.end() ) shell_to_center_[sh_idx] = std::distance( mol.begin(), at_pos );
+      if( at_pos != mol.end() ) shell_to_center_[sh_idx] = static_cast<int32_t>(std::distance( mol.begin(), at_pos ));
       else shell_to_center_[sh_idx] = -1;
       ++sh_idx;
     }
diff --git a/include/gauxc/util/real_solid_harmonics.hpp b/include/gauxc/util/real_solid_harmonics.hpp
index 3394da020..6c5aa2a0b 100644
--- a/include/gauxc/util/real_solid_harmonics.hpp
+++ b/include/gauxc/util/real_solid_harmonics.hpp
@@ -66,10 +66,10 @@ inline constexpr double real_solid_harmonic_coeff( int l, int m, int lx, int ly,
   auto i    = abs_m - lx;
   if( comp != parity( std::abs(i) ) ) return 0.;
 
-  double pfac = integral_falling_factorial( 2*lx, lx+1 ) *
-                integral_falling_factorial( 2*ly, ly+1 ) *
-                integral_falling_factorial( 2*lz, lz+1 );
-  const double factorial_l = integral_factorial(l);
+  double pfac = static_cast<double>(integral_falling_factorial( 2*lx, lx+1 )) *
+                static_cast<double>(integral_falling_factorial( 2*ly, ly+1 )) *
+                static_cast<double>(integral_falling_factorial( 2*lz, lz+1 ));
+  const double factorial_l = static_cast<double>(integral_factorial(l));
   pfac = pfac / ( factorial_l * factorial_l * integral_falling_factorial(2*l,l+1) *
                   integral_falling_factorial(l+abs_m,l-abs_m+1) );
   pfac = std::sqrt(pfac);
@@ -84,7 +84,7 @@ inline constexpr double real_solid_harmonic_coeff( int l, int m, int lx, int ly,
   auto i_max = (l-abs_m)/2;
   double sum = 0;
   for(i=i_min;i<=i_max;i++) {
-    double pfac1 = parity(i) * binomial_coefficient(l,i) * binomial_coefficient(i,j);
+    double pfac1 = static_cast<double>(parity(i) * binomial_coefficient(l,i) * binomial_coefficient(i,j));
     pfac1 *= integral_factorial(2*(l-i));
     pfac1 /= integral_factorial(l-abs_m-2*i);
     double sum1 = 0.0;
@@ -99,7 +99,7 @@ inline constexpr double real_solid_harmonic_coeff( int l, int m, int lx, int ly,
     sum += pfac1*sum1;
   }
 
-  double pfac2 =  integral_double_factorial( 2*l  - 1 );
+  double pfac2 = static_cast<double>(integral_double_factorial( 2*l  - 1 ));
   pfac2 = pfac2 / integral_double_factorial( 2*lx - 1 );
   pfac2 = pfac2 / integral_double_factorial( 2*ly - 1 );
   pfac2 = pfac2 / integral_double_factorial( 2*lz - 1 );
diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi.hpp
index a3d889335..a3bfaa364 100644
--- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi.hpp
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi.hpp
@@ -167,10 +167,10 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
       den_eval[i] *= -weights[i];
     }
     std::vector<double> offset_local_dd_psi(ldPsi, 0.0);
-    blas::gemm('N', 'N', ldPsi, 1, npts,  
-            1.0, ylm_matrix.data(), ldPsi,   
-            den_eval, npts,     
-            0.0, offset_local_dd_psi.data(), ldPsi); 
+    blas::gemm('N', 'N', static_cast<int>(ldPsi), 1, npts,
+            1.0, ylm_matrix.data(), static_cast<int>(ldPsi),
+            den_eval, npts,
+            0.0, offset_local_dd_psi.data(), static_cast<int>(ldPsi));
     for (int j = 0; j < ldPsi; ++j) {
       dd_Psi[atom_offset + j] += offset_local_dd_psi[j];
     }

From 66ae491ca63726020a958c2df4313c5c252a8e53 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Tue, 9 Jun 2026 10:48:08 +0200
Subject: [PATCH 22/52] Fix C4100 unused-parameter warnings (round 2):
 basic_mpi_reduction_driver

---
 src/reduction_driver/host/basic_mpi_reduction_driver.cxx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/reduction_driver/host/basic_mpi_reduction_driver.cxx b/src/reduction_driver/host/basic_mpi_reduction_driver.cxx
index f4bc66f58..dfa10c263 100644
--- a/src/reduction_driver/host/basic_mpi_reduction_driver.cxx
+++ b/src/reduction_driver/host/basic_mpi_reduction_driver.cxx
@@ -78,8 +78,8 @@ void BasicMPIReductionDriver::allreduce_typeerased( const void* src, void* dest,
 
 
 }
-void BasicMPIReductionDriver::allreduce_inplace_typeerased( void* data, size_t size,
-  [[maybe_unused]] ReductionOp op, std::type_index idx, std::any optional_args ) {
+void BasicMPIReductionDriver::allreduce_inplace_typeerased( [[maybe_unused]] void* data, [[maybe_unused]] size_t size,
+  [[maybe_unused]] ReductionOp op, [[maybe_unused]] std::type_index idx, std::any optional_args ) {
 
   if( optional_args.has_value() )
     std::cout << "** Warning: Optional Args Are Not Used in BasiMPIReductionDriver::allreduce" << std::endl;

From 0e9ae2f1f0f08164efe1c59481595819e6fa2b81 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Tue, 9 Jun 2026 10:48:08 +0200
Subject: [PATCH 23/52] Fix C4701 potentially-uninitialized variable warnings

- parse_basis.cxx: value-initialize coeff_secondary_arr{} so MSVC can
  prove it is always initialized before use in the gencon branch
- standalone_driver.cxx: initialize EXC and EXC_ref to 0.0 at declaration
---
 tests/basis/parse_basis.cxx | 2 +-
 tests/standalone_driver.cxx | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/basis/parse_basis.cxx b/tests/basis/parse_basis.cxx
index 00373d7d7..3d3525fb3 100644
--- a/tests/basis/parse_basis.cxx
+++ b/tests/basis/parse_basis.cxx
@@ -210,7 +210,7 @@ BasisSet<double> parse_basis( const Molecule& mol,
       using prim_array = Shell<double>::prim_array;
       using cart_array = Shell<double>::cart_array;
 
-      prim_array alpha_arr, coeff_primary_arr, coeff_secondary_arr;
+      prim_array alpha_arr, coeff_primary_arr, coeff_secondary_arr{};
       std::copy( alpha.begin(), alpha.end(), alpha_arr.begin() );
       std::copy( coeff_primary.begin(), coeff_primary.end(), 
                  coeff_primary_arr.begin() );
diff --git a/tests/standalone_driver.cxx b/tests/standalone_driver.cxx
index 0016fc96f..3c7121cc6 100644
--- a/tests/standalone_driver.cxx
+++ b/tests/standalone_driver.cxx
@@ -229,7 +229,7 @@ int main(int argc, char** argv) {
     matrix_type P, Pz, Py, Px, VXC_ref, VXCz_ref, VXCy_ref, VXCx_ref, K_ref;
     matrix_type ddX, ddPsi_ref, ddPsi_potential_ref;
     matrix_type FXC_ref, FXCz_ref;
-    double EXC_ref;
+    double EXC_ref = 0.0;
     std::vector<double> EXC_GRAD_ref(3*mol.size());
     bool rks = true, uks = false, gks = false;
     size_t N_EL_ref = MolMeta(mol).sum_atomic_charges();
@@ -454,7 +454,7 @@ int main(int argc, char** argv) {
 
     matrix_type VXC, VXCz, VXCy, VXCx, K, FXC, FXCz;
     matrix_type ddPsi, ddPsiPotential;
-    double EXC, N_EL;
+    double EXC = 0.0, N_EL = 0.0;
 
     std::cout << std::scientific << std::setprecision(12);
     if( integrate_den ) {

From 75d0c669c57296e26e9ef84c2d361b53abbe5e8c Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 13:59:57 +0200
Subject: [PATCH 24/52] Use ildp local instead of inline cast in eval_xmat

---
 .../host/reference_local_host_work_driver.cxx                  | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx
index b89139d36..59b93f6a7 100644
--- a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx
+++ b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx
@@ -128,11 +128,12 @@ namespace GauXC {
     const auto inpts = static_cast<int32_t>(npts);
     const auto ildb  = static_cast<int32_t>(ldb);
     const auto ildx  = static_cast<int32_t>(ldx);
+    const auto ildp  = static_cast<int32_t>(ldp);
     const auto* P_use = P;
     size_t ldp_use = ldp;
 
     if( submat_map.size() > 1 ) {
-      detail::submat_set( inbf, inbf, inbe, inbe, P, static_cast<int32_t>(ldp), scr, inbe, submat_map );
+      detail::submat_set( inbf, inbf, inbe, inbe, P, ildp, scr, inbe, submat_map );
       P_use = scr;
       ldp_use = nbe;
     } else if( nbe != nbf ) {

From faa245d3b51b91ac2a923e1caba500bb92156bff Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 14:02:28 +0200
Subject: [PATCH 25/52] Use ildvxc, ildk, ildp locals instead of inline casts
 in inc_vxc, inc_exx_k, eval_exx_fmat

---
 .../host/reference_local_host_work_driver.cxx         | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx
index 59b93f6a7..30429432c 100644
--- a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx
+++ b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx
@@ -1682,11 +1682,12 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
       const auto inbe  = static_cast<int32_t>(nbe);
       const auto inbf  = static_cast<int32_t>(nbf);
       const auto inpts = static_cast<int32_t>(npts);
-      const auto ildz  = static_cast<int32_t>(ldz);
+      const auto ildz    = static_cast<int32_t>(ldz);
+      const auto ildvxc  = static_cast<int32_t>(ldvxc);
 
       blas::syr2k('L', 'N', inbe, inpts, 1., basis_eval, inbe, Z, ildz, 0., scr, inbe );
 
-      detail::inc_by_submat_atomic( inbf, inbf, inbe, inbe, VXC, static_cast<int32_t>(ldvxc), scr, inbe, submat_map );
+      detail::inc_by_submat_atomic( inbf, inbf, inbe, inbe, VXC, ildvxc, scr, inbe, submat_map );
 
   }
 
@@ -1701,11 +1702,12 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
       const auto inbf     = static_cast<int32_t>(nbf);
       const auto inpts    = static_cast<int32_t>(npts);
       const auto ildg     = static_cast<int32_t>(ldg);
+      const auto ildk     = static_cast<int32_t>(ldk);
 
       blas::gemm( 'N', 'T', inbe_bra, inbe_ket, inpts, 1., basis_eval, inbe_bra,
 		  G, ildg, 0., scr, inbe_bra );
 
-      detail::inc_by_submat_atomic( inbf, inbf, inbe_bra, inbe_ket, K, static_cast<int32_t>(ldk), scr, inbe_bra,
+      detail::inc_by_submat_atomic( inbf, inbf, inbe_bra, inbe_ket, K, ildk, scr, inbe_bra,
 			     submat_map_bra, submat_map_ket );
 
   }
@@ -1724,11 +1726,12 @@ void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t
     const auto inpts    = static_cast<int32_t>(npts);
     const auto ildb     = static_cast<int32_t>(ldb);
     const auto ildf     = static_cast<int32_t>(ldf);
+    const auto ildp     = static_cast<int32_t>(ldp);
     const auto* P_use = P;
     size_t ldp_use = ldp;
 
     if( submat_map_bra.size() > 1 or submat_map_ket.size() > 1 ) {
-      detail::submat_set( inbf, inbf, inbe_bra, inbe_ket, P, static_cast<int32_t>(ldp),
+      detail::submat_set( inbf, inbf, inbe_bra, inbe_ket, P, ildp,
 			  scr, inbe_bra, submat_map_bra, submat_map_ket );
       P_use = scr;
       ldp_use = nbe_bra;

From 67e293cf54f65123015a80603b89232c15d84b8a Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 14:43:15 +0200
Subject: [PATCH 26/52] Suppress MSVC STL template warnings in CMake build

All remaining C4267, C4244, C4242, C4389 warnings are reported inside
MSVC standard library headers (xutility, utility, numeric, algorithm) when
those templates are instantiated from GauXC code, not at a line in GauXC
sources. Suppress these four warning numbers with PRIVATE compile options
(not propagated to consumers of the gauxc target) so the remaining
diagnostics from our own code are not buried in STL noise.
---
 src/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4845850a2..a74a9b6ee 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -97,6 +97,8 @@ if(MSVC)
     target_compile_options( gauxc PUBLIC /EHsc )
     target_compile_options( gauxc PRIVATE
       /wd4242  # 'conversion': possible loss of data (int->char in STL algorithm)
+      /wd4244  # 'conversion': possible loss of data (in MSVC numeric/algorithm)
+      /wd4267  # 'conversion': size_t to smaller type (in MSVC xutility/utility)
       /wd4389  # '==': signed/unsigned mismatch (in MSVC xutility)
     )
   endif()

From 5ddbfbe99d2c7f3b84d09294dc3a2449eaf8258b Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 15:13:43 +0200
Subject: [PATCH 27/52] Fix C4389: cast uint32_t to int32_t before
 std::count/std::find on int32_t container

basisset_map.hpp nshells_with_l and l_purity pass a uint32_t l to
std::count/std::find over a vector<int32_t>, triggering a signed/unsigned
comparison inside MSVC xutility. Cast l to int32_t at the call site.
---
 include/gauxc/basisset_map.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/gauxc/basisset_map.hpp b/include/gauxc/basisset_map.hpp
index e7a38ff6e..c67e5fbc9 100644
--- a/include/gauxc/basisset_map.hpp
+++ b/include/gauxc/basisset_map.hpp
@@ -170,13 +170,13 @@ class BasisSetMap {
 
   /// Count the number of shells with angular momentum `l`
   inline size_t nshells_with_l(uint32_t l) const {
-    return std::count( shell_ls_.begin(), shell_ls_.end(), l );
+    return std::count( shell_ls_.begin(), shell_ls_.end(), static_cast<int32_t>(l) );
   }
 
   /// Check whether shells of angular momentum `l` are spherical (pure)
   inline bool l_purity(uint32_t l) const {
     // Find first shell with L
-    auto first_shell_w_l = std::find( shell_ls_.begin(), shell_ls_.end(), l );
+    auto first_shell_w_l = std::find( shell_ls_.begin(), shell_ls_.end(), static_cast<int32_t>(l) );
     return shell_pure( std::distance( shell_ls_.begin(), first_shell_w_l ) );
   }
 

From 335642277eb7339d124891f2b5275da1fcaadeeb Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 15:14:21 +0200
Subject: [PATCH 28/52] Fix C4244/C4242: replace ::toupper with lambda in
 std::transform

::toupper takes int and returns int; storing the result through a char
iterator causes C4242 (int->char) and C4244 in MSVC. Use a lambda that takes
unsigned char (toupper is undefined for negative values other than EOF) and
casts the result back to char explicitly.
---
 src/xc_integrator/local_work_driver/factory.cxx              | 3 ++-
 .../replicated/host/replicated_xc_host_integrator.cxx        | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/xc_integrator/local_work_driver/factory.cxx b/src/xc_integrator/local_work_driver/factory.cxx
index fd6b86ad4..3fbf49ab1 100644
--- a/src/xc_integrator/local_work_driver/factory.cxx
+++ b/src/xc_integrator/local_work_driver/factory.cxx
@@ -22,7 +22,8 @@ LocalWorkDriverFactory::ptr_return_t
   LocalWorkDriverFactory::make_local_work_driver( ExecutionSpace ex, 
     std::string name, LocalWorkSettings settings ) {
 
-  std::transform( name.begin(), name.end(), name.begin(), ::toupper );
+  std::transform( name.begin(), name.end(), name.begin(),
+    [](unsigned char c){ return static_cast<char>(std::toupper(c)); } );
   (void)(settings);
 
   switch(ex) {
diff --git a/src/xc_integrator/replicated/host/replicated_xc_host_integrator.cxx b/src/xc_integrator/replicated/host/replicated_xc_host_integrator.cxx
index 72ef87b87..fe58dd5a0 100644
--- a/src/xc_integrator/replicated/host/replicated_xc_host_integrator.cxx
+++ b/src/xc_integrator/replicated/host/replicated_xc_host_integrator.cxx
@@ -37,8 +37,9 @@ typename ReplicatedXCHostIntegratorFactory<ValueType>::ptr_return_t
     GAUXC_GENERIC_EXCEPTION("Passed LWD Not valid for Host ExSpace");
   }
 
-  std::transform(integrator_kernel.begin(), integrator_kernel.end(), 
-    integrator_kernel.begin(), ::toupper );
+  std::transform(integrator_kernel.begin(), integrator_kernel.end(),
+    integrator_kernel.begin(),
+    [](unsigned char c){ return static_cast<char>(std::toupper(c)); } );
 
   if( integrator_kernel == "DEFAULT" ) integrator_kernel = "REFERENCE";
 

From 16684022a351be694a727bfdd4f143699c77bcba Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 15:15:40 +0200
Subject: [PATCH 29/52] Fix C4267: cast size_t/unsigned long to int32_t at
 narrow call sites

- basisset_map.hpp: cast size_t st_idx/range_end to int32_t when inserting
  into vector<int32_t> and vector<pair<int32_t,int32_t>>
- xc_task.hpp: use size_t{0} init in std::accumulate (was 0ul; unsigned
  long is 32-bit on Windows, so the accumulator was narrower than size_t)
- load_balancer_impl.cxx: same 0ul -> size_t{0} fix in total_npts()
- molmeta.cxx: same 0ul -> size_t{0} fix in sum_atomic_charges accumulate
- exx_screening.cxx: cast shell index loop vars to int32_t when pushing
  into vector<pair<int32_t,int32_t>> and vector<int32_t>
---
 include/gauxc/basisset_map.hpp                      | 4 ++--
 include/gauxc/xc_task.hpp                           | 2 +-
 src/load_balancer/load_balancer_impl.cxx            | 2 +-
 src/molmeta.cxx                                     | 2 +-
 src/xc_integrator/integrator_util/exx_screening.cxx | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/gauxc/basisset_map.hpp b/include/gauxc/basisset_map.hpp
index c67e5fbc9..c3bcfed36 100644
--- a/include/gauxc/basisset_map.hpp
+++ b/include/gauxc/basisset_map.hpp
@@ -61,8 +61,8 @@ class BasisSetMap {
     size_t st_idx = 0;
     for( const auto& shell : basis ) {
       size_t range_end = st_idx + shell.size();
-      shell_to_first_ao_.emplace_back( st_idx );
-      shell_to_ao_range_.push_back({ st_idx, range_end });
+      shell_to_first_ao_.emplace_back( static_cast<int32_t>(st_idx) );
+      shell_to_ao_range_.push_back({ static_cast<int32_t>(st_idx), static_cast<int32_t>(range_end) });
       st_idx = range_end;
     }
 
diff --git a/include/gauxc/xc_task.hpp b/include/gauxc/xc_task.hpp
index 7bf38ad5f..8a27a195b 100644
--- a/include/gauxc/xc_task.hpp
+++ b/include/gauxc/xc_task.hpp
@@ -73,7 +73,7 @@ struct XCTask {
   void merge_with( TaskIt begin, TaskIt end ) {
 
     size_t old_sz = points.size();
-    size_t pts_add = std::accumulate( begin, end, 0ul,
+    size_t pts_add = std::accumulate( begin, end, size_t{0},
       []( const auto &a, const auto &t ) {
         return a + t.points.size();
       });
diff --git a/src/load_balancer/load_balancer_impl.cxx b/src/load_balancer/load_balancer_impl.cxx
index f6b853daa..a4465d7ed 100644
--- a/src/load_balancer/load_balancer_impl.cxx
+++ b/src/load_balancer/load_balancer_impl.cxx
@@ -65,7 +65,7 @@ const util::Timer& LoadBalancerImpl::get_timings() const {
 
 size_t LoadBalancerImpl::total_npts() const {
 
-  return std::accumulate( local_tasks_.cbegin(), local_tasks_.cend(), 0ul,
+  return std::accumulate( local_tasks_.cbegin(), local_tasks_.cend(), size_t{0},
     []( const auto& a, const auto& b ) {
       return a + b.points.size();
     });
diff --git a/src/molmeta.cxx b/src/molmeta.cxx
index 3bad9987d..62fae37ea 100644
--- a/src/molmeta.cxx
+++ b/src/molmeta.cxx
@@ -16,7 +16,7 @@ namespace GauXC {
 MolMeta::MolMeta( const Molecule& mol ) : natoms_(mol.natoms()){
   compute_rab(mol);
   compute_dist_nearest();
-  sum_atomic_charges_ = std::accumulate( mol.begin(), mol.end(), 0ul,
+  sum_atomic_charges_ = std::accumulate( mol.begin(), mol.end(), size_t{0},
     [](auto a, const auto& b){ return a + b.Z.get(); });
 }
 
diff --git a/src/xc_integrator/integrator_util/exx_screening.cxx b/src/xc_integrator/integrator_util/exx_screening.cxx
index fa5b53498..80dcaaab0 100644
--- a/src/xc_integrator/integrator_util/exx_screening.cxx
+++ b/src/xc_integrator/integrator_util/exx_screening.cxx
@@ -189,8 +189,8 @@ void exx_ek_screening(
 
         task_ek_shells[i_block] |= (1u << i_local); 
         task_ek_shells[j_block] |= (1u << j_local); 
-        task_it->cou_screening.shell_pair_list.emplace_back(i,j);
-        task_it->cou_screening.shell_pair_idx_list.emplace_back(_j);
+        task_it->cou_screening.shell_pair_list.emplace_back(static_cast<int32_t>(i), static_cast<int32_t>(j));
+        task_it->cou_screening.shell_pair_idx_list.emplace_back(static_cast<int32_t>(_j));
       }
     }
     }

From 2cde73ee85e6fcbdb48ab8f11abde02542d0e559 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 15:15:55 +0200
Subject: [PATCH 30/52] Remove MSVC /wd* warning suppressions from
 CMakeLists.txt

All previously suppressed warnings (C4242, C4244, C4267, C4389) are now
fixed at their source. Remove the suppressions so MSVC continues to catch
similar regressions.
---
 src/CMakeLists.txt | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a74a9b6ee..b4fb87c9b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -95,12 +95,6 @@ if(MSVC)
     # )
   else()
     target_compile_options( gauxc PUBLIC /EHsc )
-    target_compile_options( gauxc PRIVATE
-      /wd4242  # 'conversion': possible loss of data (int->char in STL algorithm)
-      /wd4244  # 'conversion': possible loss of data (in MSVC numeric/algorithm)
-      /wd4267  # 'conversion': size_t to smaller type (in MSVC xutility/utility)
-      /wd4389  # '==': signed/unsigned mismatch (in MSVC xutility)
-    )
   endif()
 endif()
 

From ba21ee820e95517029e1e23ffd83dfcc51846da9 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 15:38:36 +0200
Subject: [PATCH 31/52] Fix C4242: replace remaining ::toupper uses with
 explicit-cast lambda

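Five more std::transform uppercase call sites were not covered by the
previous fix: four pass ::toupper directly, and parse_basis.cxx uses a
lambda that returns the int result of std::toupper unchanged. The same
pattern applies: toupper returns int; storing that into a char iterator is
C4242. Use a lambda that casts explicitly.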
---
 include/gauxc/xc_integrator/integrator_factory.hpp    | 4 ++--
 src/load_balancer/host/load_balancer_host_factory.cxx | 4 ++--
 src/reduction_driver/reduction_driver_factory.cxx     | 4 ++--
 tests/basis/parse_basis.cxx                           | 2 +-
 tests/standalone_driver.cxx                           | 3 ++-
 5 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/gauxc/xc_integrator/integrator_factory.hpp b/include/gauxc/xc_integrator/integrator_factory.hpp
index 54a1c4a3a..1fc75f888 100644
--- a/include/gauxc/xc_integrator/integrator_factory.hpp
+++ b/include/gauxc/xc_integrator/integrator_factory.hpp
@@ -68,8 +68,8 @@ class XCIntegratorFactory {
       lb->runtime(), rd_kernel_ );
 
     // Create Integrator instance
-    std::transform( input_type_.begin(), input_type_.end(), input_type_.begin(), 
-      ::toupper );
+    std::transform( input_type_.begin(), input_type_.end(), input_type_.begin(),
+      [](unsigned char c){ return static_cast<char>(std::toupper(c)); } );
 
     if( input_type_ == "REPLICATED" )
       return std::make_shared<integrator_type>( 
diff --git a/src/load_balancer/host/load_balancer_host_factory.cxx b/src/load_balancer/host/load_balancer_host_factory.cxx
index f69d7fd9b..e5243c41a 100644
--- a/src/load_balancer/host/load_balancer_host_factory.cxx
+++ b/src/load_balancer/host/load_balancer_host_factory.cxx
@@ -21,8 +21,8 @@ std::shared_ptr<LoadBalancer> LoadBalancerHostFactory::get_shared_instance(
   const Molecule& mol, const MolGrid& mg, const BasisSet<double>& basis
 ) {
 
-  std::transform(kernel_name.begin(), kernel_name.end(), 
-    kernel_name.begin(), ::toupper );
+  std::transform(kernel_name.begin(), kernel_name.end(),
+    kernel_name.begin(), [](unsigned char c){ return static_cast<char>(std::toupper(c)); } );
 
 
   if( kernel_name == "DEFAULT" or kernel_name == "REPLICATED" ) 
diff --git a/src/reduction_driver/reduction_driver_factory.cxx b/src/reduction_driver/reduction_driver_factory.cxx
index 8b3d5f348..39fe2f10b 100644
--- a/src/reduction_driver/reduction_driver_factory.cxx
+++ b/src/reduction_driver/reduction_driver_factory.cxx
@@ -26,8 +26,8 @@ namespace GauXC {
 std::shared_ptr<ReductionDriver> ReductionDriverFactory::get_shared_instance(
   const RuntimeEnvironment& rt, std::string kernel_name ) {
 
-  std::transform(kernel_name.begin(), kernel_name.end(), 
-    kernel_name.begin(), ::toupper );
+  std::transform(kernel_name.begin(), kernel_name.end(),
+    kernel_name.begin(), [](unsigned char c){ return static_cast<char>(std::toupper(c)); } );
 
   std::unique_ptr<detail::ReductionDriverImpl> ptr = nullptr;
 
diff --git a/tests/basis/parse_basis.cxx b/tests/basis/parse_basis.cxx
index 3d3525fb3..1684e6e4d 100644
--- a/tests/basis/parse_basis.cxx
+++ b/tests/basis/parse_basis.cxx
@@ -179,7 +179,7 @@ BasisSet<double> parse_basis( const Molecule& mol,
     std::string atom_symb = atom_line.substr(0,2);
     if( atom_symb[1] == ' ' ) atom_symb = atom_symb[0];
     std::transform( atom_symb.begin(), atom_symb.end(), atom_symb.begin(),
-                    [](auto a){ return std::toupper(a); } );
+                    [](unsigned char a){ return static_cast<char>(std::toupper(a)); } );
     
     //std::cout << atom_symb << std::endl;
     int Z = atomic_number_map.at(atom_symb);
diff --git a/tests/standalone_driver.cxx b/tests/standalone_driver.cxx
index 3c7121cc6..f4af33756 100644
--- a/tests/standalone_driver.cxx
+++ b/tests/standalone_driver.cxx
@@ -77,7 +77,8 @@ int main(int argc, char** argv) {
     int lmax = 2;
 
     auto string_to_upper = []( auto& str ) {
-      std::transform( str.begin(), str.end(), str.begin(), ::toupper );
+      std::transform( str.begin(), str.end(), str.begin(),
+        [](unsigned char c){ return static_cast<char>(std::toupper(c)); } );
     };
 
     #define OPTIONAL_KEYWORD(NAME,VAR,TYPE) \

From ed4f730a721fdbe65036c5cf9009b9dbf80435a4 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 15:54:53 +0200
Subject: [PATCH 32/52] Fix remaining C4244/C4242 warnings in test files

- xc_integrator.cxx: 0ul -> size_t{0} in std::accumulate init (0ul is
  32-bit on Windows, causing C4244 when lambda returns int64_t)
- ini_input.cxx: add static_cast<char> to std::toupper return in lambdas
  (missing cast caused C4242 int->char)
---
 tests/ini_input.cxx     | 6 +++---
 tests/xc_integrator.cxx | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/ini_input.cxx b/tests/ini_input.cxx
index a5f6ed561..613b06fd3 100644
--- a/tests/ini_input.cxx
+++ b/tests/ini_input.cxx
@@ -106,7 +106,7 @@ void INIFile::parse() {
       // Obtain the section header name
       sectionHeader = line.substr(1,line.length()-2);
       std::transform(sectionHeader.begin(),sectionHeader.end(),sectionHeader.begin(),
-        [](unsigned char c){ return std::toupper(c);} );
+        [](unsigned char c){ return static_cast<char>(std::toupper(c));} );
 
       // Create a dictionary entry for the section header
       dict_[sectionHeader] = 
@@ -134,7 +134,7 @@ void INIFile::parse() {
 
       dataHeader = tokens[0];
       std::transform(dataHeader.begin(),dataHeader.end(),dataHeader.begin(),
-        [](unsigned char c){ return std::toupper(c);} );
+        [](unsigned char c){ return static_cast<char>(std::toupper(c));} );
 
       // Create a dictionary entry for the data field in the current
       // section header
@@ -191,7 +191,7 @@ std::pair<std::string,std::string> INIFile::splitQuery(
   for(auto &X : tokens) {
     trim(X);
     std::transform(X.begin(),X.end(),X.begin(),
-      [](unsigned char c){ return std::toupper(c);} );
+      [](unsigned char c){ return static_cast<char>(std::toupper(c));} );
   }
 
   return 
diff --git a/tests/xc_integrator.cxx b/tests/xc_integrator.cxx
index 947a9914b..08f4ca214 100644
--- a/tests/xc_integrator.cxx
+++ b/tests/xc_integrator.cxx
@@ -188,7 +188,7 @@ void test_xc_integrator( ExecutionSpace ex, const RuntimeEnvironment& rt,
 
   // Integrate Density
   if( check_integrate_den and rks) {
-    auto N_EL_ref = std::accumulate( mol.begin(), mol.end(), 0ul,
+    auto N_EL_ref = std::accumulate( mol.begin(), mol.end(), size_t{0},
       [](const auto& a, const auto &b) { return a + b.Z.get(); });
     auto N_EL = integrator.integrate_den( P );
     // Factor of 2 b/c P is the alpha density for RKS

From 4f9ad0608d15e1e2e60890dd64b8fc586b668047 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 16:53:11 +0200
Subject: [PATCH 33/52] Fix -Wlanguage-extension-token: guard __FUNCSIG__ from
 clang-cl

clang-cl defines both _MSC_VER and __clang__; __FUNCSIG__ is an
MSVC-only extension that clang-cl warns about. Narrow the guard so
only MSVC (not clang-cl) uses it.
---
 include/gauxc/exceptions.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/gauxc/exceptions.hpp b/include/gauxc/exceptions.hpp
index e0d7ec09d..efd60fbeb 100644
--- a/include/gauxc/exceptions.hpp
+++ b/include/gauxc/exceptions.hpp
@@ -75,7 +75,7 @@ class generic_gauxc_exception : public std::exception {
 
 }
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
 #define GAUXC_GENERIC_EXCEPTION( MSG ) \
   throw generic_gauxc_exception( __FILE__, __FUNCSIG__, __LINE__, MSG )
 #else

From 4923a769e70f510af8fcebe827533cdbfe26bd6f Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 16:53:42 +0200
Subject: [PATCH 34/52] Fix -Wsign-compare: use size_t for loop indices
 compared against size_t bounds

---
 ...reference_replicated_xc_host_integrator_dd_psi_potential.hpp | 2 +-
 .../host/reference_replicated_xc_host_integrator_exx.hpp        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi_potential.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi_potential.hpp
index 2b99140fa..1976a2778 100644
--- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi_potential.hpp
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi_potential.hpp
@@ -70,7 +70,7 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
 
   // Atom-specific data
   std::vector<double> radii(mol.size());
-  for (int i = 0; i < mol.size(); ++i) {
+  for (size_t i = 0; i < mol.size(); ++i) {
     radii[i] = uff_radius_103(mol[i].Z);
   }
 
diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp
index fd9cbf0a5..003de5f53 100644
--- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp
@@ -315,7 +315,7 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
   // Loop over sparse shell pairs
   const auto sp_row_ptr = shpairs.row_ptr();
   const auto sp_col_ind = shpairs.col_ind();
-  for( auto i = 0; i < nshells_bf; ++i ) {
+  for( size_t i = 0; i < nshells_bf; ++i ) {
     const auto j_st = sp_row_ptr[i];
     const auto j_en = sp_row_ptr[i+1];
     for( auto _j = j_st; _j < j_en; ++_j ) {

From 2037d6c8f13ddd9c096b8784e4ad33d9f5497275 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 16:53:55 +0200
Subject: [PATCH 35/52] Fix -Wunused-local-typedef: remove unused cart_array
 typedef in parse_basis.cxx

---
 tests/basis/parse_basis.cxx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/basis/parse_basis.cxx b/tests/basis/parse_basis.cxx
index 1684e6e4d..3c1e8a6de 100644
--- a/tests/basis/parse_basis.cxx
+++ b/tests/basis/parse_basis.cxx
@@ -208,7 +208,6 @@ BasisSet<double> parse_basis( const Molecule& mol,
       }
       
       using prim_array = Shell<double>::prim_array;
-      using cart_array = Shell<double>::cart_array;
 
       prim_array alpha_arr, coeff_primary_arr, coeff_secondary_arr{};
       std::copy( alpha.begin(), alpha.end(), alpha_arr.begin() );

From 7380964f575f88031c4b4c43f7439b7647b67726 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 16:54:00 +0200
Subject: [PATCH 36/52] Fix -Wswitch: add default case to switch statements
 missing XCWeightAlg::NOTPARTITIONED

---
 tests/weights_generate.hpp | 1 +
 tests/weights_host.hpp     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tests/weights_generate.hpp b/tests/weights_generate.hpp
index 34788701d..f245120ae 100644
--- a/tests/weights_generate.hpp
+++ b/tests/weights_generate.hpp
@@ -77,6 +77,7 @@ void generate_weights_data( const Molecule& mol, const BasisSet<double>& basis,
       reference_lko_weights_host( 
         mol, lb.molmeta(), tasks.begin(), tasks.end() );
       break;
+    default: break;
   }
 
   // Clear out unneeded data
diff --git a/tests/weights_host.hpp b/tests/weights_host.hpp
index 821330ae3..5242014e9 100644
--- a/tests/weights_host.hpp
+++ b/tests/weights_host.hpp
@@ -40,6 +40,7 @@ void test_host_weights( const std::string& filename, XCWeightAlg weight_alg ) {
         ref_data.mol, *ref_data.meta, ref_data.tasks_unm.begin(), 
         ref_data.tasks_unm.end() );
       break;
+    default: break;
   }
 
 

From 78b2982407bf483c856bedf0cf1f0b03495336c0 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 16:54:06 +0200
Subject: [PATCH 37/52] Suppress -Wunused-variable for clang-cl targets

Add -Wno-unused-variable to gauxc, gauxc_test, and standalone_driver
under clang-cl (MSVC + Clang) to suppress unused variable warnings
without adding [[maybe_unused]] annotations throughout the codebase.

Also remove the commented-out list of -Wno-* options from the clang-cl
branch in src/CMakeLists.txt.
---
 src/CMakeLists.txt   | 14 +++-----------
 tests/CMakeLists.txt |  6 ++++++
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b4fb87c9b..8b3c919c7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -82,17 +82,9 @@ endif()
 
 if(MSVC)
   if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    # target_compile_options( gauxc PRIVATE
-    #   -Wno-covered-switch-default
-    #   -Wno-documentation
-    #   -Wno-documentation-unknown-command
-    #   -Wno-implicit-int-float-conversion
-    #   -Wno-language-extension-token
-    #   -Wno-reserved-identifier
-    #   -Wno-shorten-64-to-32
-    #   -Wno-sign-compare
-    #   -Wno-undef
-    # )
+    target_compile_options( gauxc PRIVATE
+      -Wno-unused-variable
+    )
   else()
     target_compile_options( gauxc PUBLIC /EHsc )
   endif()
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 46dbe487d..b7b090d70 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -75,6 +75,9 @@ if(GAUXC_ENABLE_CUTLASS)
   include(gauxc-cutlass)
   target_link_libraries(gauxc_test PUBLIC gauxc_cutlass)
 endif()
+if(MSVC AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  target_compile_options( gauxc_test PRIVATE -Wno-unused-variable )
+endif()
 
 
 set( GAUXC_REF_DATA_PATH "${PROJECT_SOURCE_DIR}/tests/ref_data" )
@@ -87,6 +90,9 @@ add_executable( standalone_driver standalone_driver.cxx standards.cxx basis/pars
 target_link_libraries( standalone_driver PUBLIC gauxc gauxc_catch2 Eigen3::Eigen )
 target_include_directories( standalone_driver PRIVATE ${PROJECT_BINARY_DIR}/tests )
 target_include_directories( standalone_driver PRIVATE ${PROJECT_SOURCE_DIR}/tests )
+if(MSVC AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  target_compile_options( standalone_driver PRIVATE -Wno-unused-variable )
+endif()
 
 #add_executable( grid_opt grid_opt.cxx standards.cxx basis/parse_basis.cxx ini_input.cxx )
 #target_link_libraries( grid_opt PUBLIC gauxc gauxc_catch2 Eigen3::Eigen )

From 9aad3ed1a66600b1ec3d562fce2250d96f90b0a4 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 16:54:19 +0200
Subject: [PATCH 38/52] Minor whitespace cleanup

---
 .../host/reference_replicated_xc_host_integrator_exc_vxc.hpp    | 1 -
 tests/molgrid_test.cxx                                          | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp
index cd2cc7ff7..6e7192c53 100644
--- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp
@@ -162,7 +162,6 @@ void ReferenceReplicatedXCHostIntegrator<ValueType>::
   auto& tasks = this->load_balancer_->get_tasks();
   std::sort( task_begin, task_end, task_comparator );
 
-
   // Check that Partition Weights have been calculated
   auto& lb_state = this->load_balancer_->state();
   if( not lb_state.modified_weights_are_stored ) {
diff --git a/tests/molgrid_test.cxx b/tests/molgrid_test.cxx
index 1de1be986..953e75abb 100644
--- a/tests/molgrid_test.cxx
+++ b/tests/molgrid_test.cxx
@@ -249,7 +249,7 @@ TEST_CASE("Grid Specification", "[molgrid]") {
 
     atomic_grid_variant gs;
     std::vector<PruningRegion> ref_pruning_regions;
-    UnprunedAtomicGridSpecification unp_gs = 
+    UnprunedAtomicGridSpecification unp_gs =
       MolGridFactory::create_default_unpruned_grid_spec(Z,rq,gsz);
     SECTION("Unpruned") {
       gs = MolGridFactory::create_default_pruned_grid_spec(

From 8393072dca879ab818900a7c5ca2fce6b3a4d4f8 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Wed, 10 Jun 2026 20:05:09 +0200
Subject: [PATCH 39/52] Update dependency hashes

---
 cmake/gauxc-dep-versions.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmake/gauxc-dep-versions.cmake b/cmake/gauxc-dep-versions.cmake
index cf3e24563..1c8459bfc 100644
--- a/cmake/gauxc-dep-versions.cmake
+++ b/cmake/gauxc-dep-versions.cmake
@@ -1,5 +1,5 @@
 set( GAUXC_LINALG_MODULES_REPOSITORY https://github.com/wavefunction91/linalg-cmake-modules.git )
-set( GAUXC_LINALG_MODULES_REVISION  9d2c273a671d6811e9fd432f6a4fa3d915b144b8 )
+set( GAUXC_LINALG_MODULES_REVISION  222364df5e7639f371bf2f37ceb0f476301101a1 )
 
 set( GAUXC_CUB_REPOSITORY https://github.com/NVIDIA/cub.git )
 set( GAUXC_CUB_REVISION   1.10.0 )
@@ -8,13 +8,13 @@ set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git )
 set( GAUXC_CUTLASS_REVISION v2.10.0 )
 
 set( GAUXC_EXCHCXX_REPOSITORY https://github.com/lorisercole/ExchCXX.git )
-set( GAUXC_EXCHCXX_REVISION   20a2d6052bc5b1bac4a0d028f3735056e5345dac )
+set( GAUXC_EXCHCXX_REVISION   fab59dc41ef881e0a2b776a92b47c03101340071 )
 
 set( GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git )
 set( GAUXC_GAU2GRID_REVISION   v2.0.6 )
 
 set( GAUXC_INTEGRATORXX_REPOSITORY https://github.com/lorisercole/IntegratorXX.git )
-set( GAUXC_INTEGRATORXX_REVISION   81e283d20eb3ce3bca49e79926a92801c642c2c5 )
+set( GAUXC_INTEGRATORXX_REVISION   923125236ea5971ee9accdea39da552b8e322ff6 )
 
 set( GAUXC_HIGHFIVE_REPOSITORY https://github.com/highfive-devs/HighFive.git )
 set( GAUXC_HIGHFIVE_REVISION 805f0e13d09b47c4b01d40682621904aa3b31bb8 )

From dc6f2cd2c982e034e20ec7294664145015478327 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Thu, 11 Jun 2026 11:51:37 +0200
Subject: [PATCH 40/52] Fix C4013: add missing #include <stdlib.h> for exit()
 in gau2grid_helper.c

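The MSVC-only preprocessor branch included <malloc.h> but not <stdlib.h>,
leaving exit() undeclared. The clang-cl branch already had both.

Also bump GAUXC_EXCHCXX_REVISION in cmake/gauxc-dep-versions.cmake to
0bed66fbd86108804a52b421ab511f5e8828c072.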
---
 cmake/gauxc-dep-versions.cmake                       | 2 +-
 external/gau2grid/generated_source/gau2grid_helper.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/gauxc-dep-versions.cmake b/cmake/gauxc-dep-versions.cmake
index 1c8459bfc..4e1186e8d 100644
--- a/cmake/gauxc-dep-versions.cmake
+++ b/cmake/gauxc-dep-versions.cmake
@@ -8,7 +8,7 @@ set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git )
 set( GAUXC_CUTLASS_REVISION v2.10.0 )
 
 set( GAUXC_EXCHCXX_REPOSITORY https://github.com/lorisercole/ExchCXX.git )
-set( GAUXC_EXCHCXX_REVISION   fab59dc41ef881e0a2b776a92b47c03101340071 )
+set( GAUXC_EXCHCXX_REVISION   0bed66fbd86108804a52b421ab511f5e8828c072 )
 
 set( GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git )
 set( GAUXC_GAU2GRID_REVISION   v2.0.6 )
diff --git a/external/gau2grid/generated_source/gau2grid_helper.c b/external/gau2grid/generated_source/gau2grid_helper.c
index 31956e084..f62897157 100644
--- a/external/gau2grid/generated_source/gau2grid_helper.c
+++ b/external/gau2grid/generated_source/gau2grid_helper.c
@@ -13,6 +13,7 @@
 #include <mm_malloc.h>
 #elif defined _MSC_VER
 #include <malloc.h>
+#include <stdlib.h>
 #else
 #include <stdlib.h>
 #endif

From a41ac467e568d7303295610130898c51dce991d1 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Thu, 11 Jun 2026 12:24:40 +0200
Subject: [PATCH 41/52] Fix C4018: resolve signed/unsigned mismatch in gau2grid
 generated C code

Cast int nprim to (unsigned long) in loop comparisons against unsigned
long iterators. Change unsigned int start_shift to unsigned long to
match the unsigned long operands in its computation.
---
 .../generated_source/gau2grid_deriv1.c        | 42 +++++++++----------
 .../generated_source/gau2grid_deriv2.c        | 42 +++++++++----------
 .../generated_source/gau2grid_deriv3.c        | 42 +++++++++----------
 .../generated_source/gau2grid_orbital.c       | 42 +++++++++----------
 .../gau2grid/generated_source/gau2grid_phi.c  | 42 +++++++++----------
 5 files changed, 105 insertions(+), 105 deletions(-)

diff --git a/external/gau2grid/generated_source/gau2grid_deriv1.c b/external/gau2grid/generated_source/gau2grid_deriv1.c
index 503c16598..24a0fdcc2 100644
--- a/external/gau2grid/generated_source/gau2grid_deriv1.c
+++ b/external/gau2grid/generated_source/gau2grid_deriv1.c
@@ -76,7 +76,7 @@ void gg_collocation_L0_deriv1(const unsigned long npoints, const double* PRAGMA_
     double AX, AY, AZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -110,7 +110,7 @@ void gg_collocation_L0_deriv1(const unsigned long npoints, const double* PRAGMA_
                 S1[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -130,7 +130,7 @@ void gg_collocation_L0_deriv1(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -229,7 +229,7 @@ void gg_collocation_L1_deriv1(const unsigned long npoints, const double* PRAGMA_
     double AX, AY, AZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -263,7 +263,7 @@ void gg_collocation_L1_deriv1(const unsigned long npoints, const double* PRAGMA_
                 S1[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -283,7 +283,7 @@ void gg_collocation_L1_deriv1(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -440,7 +440,7 @@ void gg_collocation_L2_deriv1(const unsigned long npoints, const double* PRAGMA_
     double AX, AY, AZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -474,7 +474,7 @@ void gg_collocation_L2_deriv1(const unsigned long npoints, const double* PRAGMA_
                 S1[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -494,7 +494,7 @@ void gg_collocation_L2_deriv1(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -693,7 +693,7 @@ void gg_collocation_L3_deriv1(const unsigned long npoints, const double* PRAGMA_
     double AX, AY, AZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -727,7 +727,7 @@ void gg_collocation_L3_deriv1(const unsigned long npoints, const double* PRAGMA_
                 S1[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -747,7 +747,7 @@ void gg_collocation_L3_deriv1(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -1004,7 +1004,7 @@ void gg_collocation_L4_deriv1(const unsigned long npoints, const double* PRAGMA_
     double AX, AY, AZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -1038,7 +1038,7 @@ void gg_collocation_L4_deriv1(const unsigned long npoints, const double* PRAGMA_
                 S1[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -1058,7 +1058,7 @@ void gg_collocation_L4_deriv1(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -1388,7 +1388,7 @@ void gg_collocation_L5_deriv1(const unsigned long npoints, const double* PRAGMA_
     double AX, AY, AZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -1422,7 +1422,7 @@ void gg_collocation_L5_deriv1(const unsigned long npoints, const double* PRAGMA_
                 S1[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -1442,7 +1442,7 @@ void gg_collocation_L5_deriv1(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -1860,7 +1860,7 @@ void gg_collocation_L6_deriv1(const unsigned long npoints, const double* PRAGMA_
     double AX, AY, AZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -1894,7 +1894,7 @@ void gg_collocation_L6_deriv1(const unsigned long npoints, const double* PRAGMA_
                 S1[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -1914,7 +1914,7 @@ void gg_collocation_L6_deriv1(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
diff --git a/external/gau2grid/generated_source/gau2grid_deriv2.c b/external/gau2grid/generated_source/gau2grid_deriv2.c
index 822f62a0d..a0a397f8e 100644
--- a/external/gau2grid/generated_source/gau2grid_deriv2.c
+++ b/external/gau2grid/generated_source/gau2grid_deriv2.c
@@ -91,7 +91,7 @@ void gg_collocation_L0_deriv2(const unsigned long npoints, const double* PRAGMA_
     double AXX, AXY, AXZ, AYY, AYZ, AZZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -126,7 +126,7 @@ void gg_collocation_L0_deriv2(const unsigned long npoints, const double* PRAGMA_
                 S2[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -147,7 +147,7 @@ void gg_collocation_L0_deriv2(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -285,7 +285,7 @@ void gg_collocation_L1_deriv2(const unsigned long npoints, const double* PRAGMA_
     double AXX, AXY, AXZ, AYY, AYZ, AZZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -320,7 +320,7 @@ void gg_collocation_L1_deriv2(const unsigned long npoints, const double* PRAGMA_
                 S2[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -341,7 +341,7 @@ void gg_collocation_L1_deriv2(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -597,7 +597,7 @@ void gg_collocation_L2_deriv2(const unsigned long npoints, const double* PRAGMA_
     double AXX, AXY, AXZ, AYY, AYZ, AZZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -632,7 +632,7 @@ void gg_collocation_L2_deriv2(const unsigned long npoints, const double* PRAGMA_
                 S2[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -653,7 +653,7 @@ void gg_collocation_L2_deriv2(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -995,7 +995,7 @@ void gg_collocation_L3_deriv2(const unsigned long npoints, const double* PRAGMA_
     double AXX, AXY, AXZ, AYY, AYZ, AZZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -1030,7 +1030,7 @@ void gg_collocation_L3_deriv2(const unsigned long npoints, const double* PRAGMA_
                 S2[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -1051,7 +1051,7 @@ void gg_collocation_L3_deriv2(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -1615,7 +1615,7 @@ void gg_collocation_L4_deriv2(const unsigned long npoints, const double* PRAGMA_
     double AXX, AXY, AXZ, AYY, AYZ, AZZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -1650,7 +1650,7 @@ void gg_collocation_L4_deriv2(const unsigned long npoints, const double* PRAGMA_
                 S2[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -1671,7 +1671,7 @@ void gg_collocation_L4_deriv2(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -2399,7 +2399,7 @@ void gg_collocation_L5_deriv2(const unsigned long npoints, const double* PRAGMA_
     double AXX, AXY, AXZ, AYY, AYZ, AZZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -2434,7 +2434,7 @@ void gg_collocation_L5_deriv2(const unsigned long npoints, const double* PRAGMA_
                 S2[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -2455,7 +2455,7 @@ void gg_collocation_L5_deriv2(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -3384,7 +3384,7 @@ void gg_collocation_L6_deriv2(const unsigned long npoints, const double* PRAGMA_
     double AXX, AXY, AXZ, AYY, AYZ, AZZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -3419,7 +3419,7 @@ void gg_collocation_L6_deriv2(const unsigned long npoints, const double* PRAGMA_
                 S2[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -3440,7 +3440,7 @@ void gg_collocation_L6_deriv2(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
diff --git a/external/gau2grid/generated_source/gau2grid_deriv3.c b/external/gau2grid/generated_source/gau2grid_deriv3.c
index c38afc7fb..c9aa2f8b8 100644
--- a/external/gau2grid/generated_source/gau2grid_deriv3.c
+++ b/external/gau2grid/generated_source/gau2grid_deriv3.c
@@ -114,7 +114,7 @@ void gg_collocation_L0_deriv3(const unsigned long npoints, const double* PRAGMA_
     double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -150,7 +150,7 @@ void gg_collocation_L0_deriv3(const unsigned long npoints, const double* PRAGMA_
                 S3[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -172,7 +172,7 @@ void gg_collocation_L0_deriv3(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -369,7 +369,7 @@ void gg_collocation_L1_deriv3(const unsigned long npoints, const double* PRAGMA_
     double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -405,7 +405,7 @@ void gg_collocation_L1_deriv3(const unsigned long npoints, const double* PRAGMA_
                 S3[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -427,7 +427,7 @@ void gg_collocation_L1_deriv3(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -791,7 +791,7 @@ void gg_collocation_L2_deriv3(const unsigned long npoints, const double* PRAGMA_
     double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -827,7 +827,7 @@ void gg_collocation_L2_deriv3(const unsigned long npoints, const double* PRAGMA_
                 S3[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -849,7 +849,7 @@ void gg_collocation_L2_deriv3(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -1690,7 +1690,7 @@ void gg_collocation_L3_deriv3(const unsigned long npoints, const double* PRAGMA_
     double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -1726,7 +1726,7 @@ void gg_collocation_L3_deriv3(const unsigned long npoints, const double* PRAGMA_
                 S3[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -1748,7 +1748,7 @@ void gg_collocation_L3_deriv3(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -2895,7 +2895,7 @@ void gg_collocation_L4_deriv3(const unsigned long npoints, const double* PRAGMA_
     double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -2931,7 +2931,7 @@ void gg_collocation_L4_deriv3(const unsigned long npoints, const double* PRAGMA_
                 S3[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -2953,7 +2953,7 @@ void gg_collocation_L4_deriv3(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -4510,7 +4510,7 @@ void gg_collocation_L5_deriv3(const unsigned long npoints, const double* PRAGMA_
     double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -4546,7 +4546,7 @@ void gg_collocation_L5_deriv3(const unsigned long npoints, const double* PRAGMA_
                 S3[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -4568,7 +4568,7 @@ void gg_collocation_L5_deriv3(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
@@ -6638,7 +6638,7 @@ void gg_collocation_L6_deriv3(const unsigned long npoints, const double* PRAGMA_
     double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
         expn2[i] = -2.0 * exponents[i];
     }
@@ -6674,7 +6674,7 @@ void gg_collocation_L6_deriv3(const unsigned long npoints, const double* PRAGMA_
                 S3[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -6696,7 +6696,7 @@ void gg_collocation_L6_deriv3(const unsigned long npoints, const double* PRAGMA_
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
             const double alpha_n2 = expn2[n];
diff --git a/external/gau2grid/generated_source/gau2grid_orbital.c b/external/gau2grid/generated_source/gau2grid_orbital.c
index f9cc618ac..2a18b9f2d 100644
--- a/external/gau2grid/generated_source/gau2grid_orbital.c
+++ b/external/gau2grid/generated_source/gau2grid_orbital.c
@@ -66,7 +66,7 @@ void gg_orbitals_L0(const double* PRAGMA_RESTRICT C, const unsigned long norbita
     double A;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
     }
 
@@ -98,7 +98,7 @@ void gg_orbitals_L0(const double* PRAGMA_RESTRICT C, const unsigned long norbita
                 S0[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -117,7 +117,7 @@ void gg_orbitals_L0(const double* PRAGMA_RESTRICT C, const unsigned long norbita
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
 
@@ -216,7 +216,7 @@ void gg_orbitals_L1(const double* PRAGMA_RESTRICT C, const unsigned long norbita
     double A;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
     }
 
@@ -248,7 +248,7 @@ void gg_orbitals_L1(const double* PRAGMA_RESTRICT C, const unsigned long norbita
                 S0[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -267,7 +267,7 @@ void gg_orbitals_L1(const double* PRAGMA_RESTRICT C, const unsigned long norbita
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
 
@@ -372,7 +372,7 @@ void gg_orbitals_L2(const double* PRAGMA_RESTRICT C, const unsigned long norbita
     double A;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
     }
 
@@ -404,7 +404,7 @@ void gg_orbitals_L2(const double* PRAGMA_RESTRICT C, const unsigned long norbita
                 S0[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -423,7 +423,7 @@ void gg_orbitals_L2(const double* PRAGMA_RESTRICT C, const unsigned long norbita
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
 
@@ -546,7 +546,7 @@ void gg_orbitals_L3(const double* PRAGMA_RESTRICT C, const unsigned long norbita
     double A;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
     }
 
@@ -578,7 +578,7 @@ void gg_orbitals_L3(const double* PRAGMA_RESTRICT C, const unsigned long norbita
                 S0[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -597,7 +597,7 @@ void gg_orbitals_L3(const double* PRAGMA_RESTRICT C, const unsigned long norbita
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
 
@@ -740,7 +740,7 @@ void gg_orbitals_L4(const double* PRAGMA_RESTRICT C, const unsigned long norbita
     double A;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
     }
 
@@ -772,7 +772,7 @@ void gg_orbitals_L4(const double* PRAGMA_RESTRICT C, const unsigned long norbita
                 S0[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -791,7 +791,7 @@ void gg_orbitals_L4(const double* PRAGMA_RESTRICT C, const unsigned long norbita
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
 
@@ -958,7 +958,7 @@ void gg_orbitals_L5(const double* PRAGMA_RESTRICT C, const unsigned long norbita
     double A;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
     }
 
@@ -990,7 +990,7 @@ void gg_orbitals_L5(const double* PRAGMA_RESTRICT C, const unsigned long norbita
                 S0[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -1009,7 +1009,7 @@ void gg_orbitals_L5(const double* PRAGMA_RESTRICT C, const unsigned long norbita
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
 
@@ -1204,7 +1204,7 @@ void gg_orbitals_L6(const double* PRAGMA_RESTRICT C, const unsigned long norbita
     double A;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
     }
 
@@ -1236,7 +1236,7 @@ void gg_orbitals_L6(const double* PRAGMA_RESTRICT C, const unsigned long norbita
                 S0[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -1255,7 +1255,7 @@ void gg_orbitals_L6(const double* PRAGMA_RESTRICT C, const unsigned long norbita
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
 
diff --git a/external/gau2grid/generated_source/gau2grid_phi.c b/external/gau2grid/generated_source/gau2grid_phi.c
index 77ece958a..c0b875afd 100644
--- a/external/gau2grid/generated_source/gau2grid_phi.c
+++ b/external/gau2grid/generated_source/gau2grid_phi.c
@@ -66,7 +66,7 @@ void gg_collocation_L0(const unsigned long npoints, const double* PRAGMA_RESTRIC
     double A;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
     }
 
@@ -98,7 +98,7 @@ void gg_collocation_L0(const unsigned long npoints, const double* PRAGMA_RESTRIC
                 S0[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -117,7 +117,7 @@ void gg_collocation_L0(const unsigned long npoints, const double* PRAGMA_RESTRIC
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
 
@@ -190,7 +190,7 @@ void gg_collocation_L1(const unsigned long npoints, const double* PRAGMA_RESTRIC
     double A;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
     }
 
@@ -222,7 +222,7 @@ void gg_collocation_L1(const unsigned long npoints, const double* PRAGMA_RESTRIC
                 S0[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -241,7 +241,7 @@ void gg_collocation_L1(const unsigned long npoints, const double* PRAGMA_RESTRIC
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
 
@@ -339,7 +339,7 @@ void gg_collocation_L2(const unsigned long npoints, const double* PRAGMA_RESTRIC
     double A;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
     }
 
@@ -371,7 +371,7 @@ void gg_collocation_L2(const unsigned long npoints, const double* PRAGMA_RESTRIC
                 S0[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -390,7 +390,7 @@ void gg_collocation_L2(const unsigned long npoints, const double* PRAGMA_RESTRIC
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
 
@@ -506,7 +506,7 @@ void gg_collocation_L3(const unsigned long npoints, const double* PRAGMA_RESTRIC
     double A;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
     }
 
@@ -538,7 +538,7 @@ void gg_collocation_L3(const unsigned long npoints, const double* PRAGMA_RESTRIC
                 S0[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -557,7 +557,7 @@ void gg_collocation_L3(const unsigned long npoints, const double* PRAGMA_RESTRIC
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
 
@@ -693,7 +693,7 @@ void gg_collocation_L4(const unsigned long npoints, const double* PRAGMA_RESTRIC
     double A;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
     }
 
@@ -725,7 +725,7 @@ void gg_collocation_L4(const unsigned long npoints, const double* PRAGMA_RESTRIC
                 S0[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -744,7 +744,7 @@ void gg_collocation_L4(const unsigned long npoints, const double* PRAGMA_RESTRIC
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
 
@@ -904,7 +904,7 @@ void gg_collocation_L5(const unsigned long npoints, const double* PRAGMA_RESTRIC
     double A;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
     }
 
@@ -936,7 +936,7 @@ void gg_collocation_L5(const unsigned long npoints, const double* PRAGMA_RESTRIC
                 S0[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -955,7 +955,7 @@ void gg_collocation_L5(const unsigned long npoints, const double* PRAGMA_RESTRIC
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
 
@@ -1143,7 +1143,7 @@ void gg_collocation_L6(const unsigned long npoints, const double* PRAGMA_RESTRIC
     double A;
 
     // Build negative exponents
-    for (unsigned long i = 0; i < nprim; i++) {
+    for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
         expn1[i] = -1.0 * exponents[i];
     }
 
@@ -1175,7 +1175,7 @@ void gg_collocation_L6(const unsigned long npoints, const double* PRAGMA_RESTRIC
                 S0[i] = 0.0;
             }
             } else {
-            unsigned int start_shift = start * xyz_stride;
+            unsigned long start_shift = start * xyz_stride;
 
             PRAGMA_VECTORIZE
             for (unsigned long i = 0; i < remain; i++) {
@@ -1194,7 +1194,7 @@ void gg_collocation_L6(const unsigned long npoints, const double* PRAGMA_RESTRIC
         }
 
         // Start exponential block loop
-        for (unsigned long n = 0; n < nprim; n++) {
+        for (unsigned long n = 0; n < (unsigned long)nprim; n++) {
             const double coef = coeffs[n];
             const double alpha_n1 = expn1[n];
 

From b8118558c82ad26f3a6ab324cb5942708ae723f5 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Thu, 11 Jun 2026 12:25:11 +0200
Subject: [PATCH 42/52] Fix C4101: remove unreferenced local variable
 declarations in gau2grid

Remove the unused `double A`, `AX`/`AY`/`AZ`, `AXX`/.../`AZZ`, and
`AXXX`/.../`ZZZ` local declarations from every gau2grid collocation and
orbital function that declares them without ever referencing them.
---
 .../generated_source/gau2grid_deriv1.c        |  4 ---
 .../generated_source/gau2grid_deriv2.c        | 19 -------------
 .../generated_source/gau2grid_deriv3.c        | 28 -------------------
 .../generated_source/gau2grid_orbital.c       |  2 --
 .../gau2grid/generated_source/gau2grid_phi.c  |  2 --
 5 files changed, 55 deletions(-)

diff --git a/external/gau2grid/generated_source/gau2grid_deriv1.c b/external/gau2grid/generated_source/gau2grid_deriv1.c
index 24a0fdcc2..84bc61499 100644
--- a/external/gau2grid/generated_source/gau2grid_deriv1.c
+++ b/external/gau2grid/generated_source/gau2grid_deriv1.c
@@ -72,8 +72,6 @@ void gg_collocation_L0_deriv1(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -225,8 +223,6 @@ void gg_collocation_L1_deriv1(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
diff --git a/external/gau2grid/generated_source/gau2grid_deriv2.c b/external/gau2grid/generated_source/gau2grid_deriv2.c
index a0a397f8e..d39d05949 100644
--- a/external/gau2grid/generated_source/gau2grid_deriv2.c
+++ b/external/gau2grid/generated_source/gau2grid_deriv2.c
@@ -86,9 +86,6 @@ void gg_collocation_L0_deriv2(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -280,9 +277,6 @@ void gg_collocation_L1_deriv2(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -594,7 +588,6 @@ void gg_collocation_L2_deriv2(const unsigned long npoints, const double* PRAGMA_
     const double center_z = center[2];
     double A;
     double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -990,9 +983,6 @@ void gg_collocation_L3_deriv2(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -1610,9 +1600,6 @@ void gg_collocation_L4_deriv2(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -2394,9 +2381,6 @@ void gg_collocation_L5_deriv2(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -3379,9 +3363,6 @@ void gg_collocation_L6_deriv2(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
diff --git a/external/gau2grid/generated_source/gau2grid_deriv3.c b/external/gau2grid/generated_source/gau2grid_deriv3.c
index c9aa2f8b8..3792361f5 100644
--- a/external/gau2grid/generated_source/gau2grid_deriv3.c
+++ b/external/gau2grid/generated_source/gau2grid_deriv3.c
@@ -108,10 +108,6 @@ void gg_collocation_L0_deriv3(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
-    double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -363,10 +359,6 @@ void gg_collocation_L1_deriv3(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
-    double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -785,10 +777,6 @@ void gg_collocation_L2_deriv3(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
-    double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -1684,10 +1672,6 @@ void gg_collocation_L3_deriv3(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
-    double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -2889,10 +2873,6 @@ void gg_collocation_L4_deriv3(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
-    double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -4504,10 +4484,6 @@ void gg_collocation_L5_deriv3(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
-    double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -6632,10 +6608,6 @@ void gg_collocation_L6_deriv3(const unsigned long npoints, const double* PRAGMA_
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
-    double AX, AY, AZ;
-    double AXX, AXY, AXZ, AYY, AYZ, AZZ;
-    double AXXX, XXY, XXZ, XYY, XYZ, XZZ, YYY, YYZ, YZZ, ZZZ;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
diff --git a/external/gau2grid/generated_source/gau2grid_orbital.c b/external/gau2grid/generated_source/gau2grid_orbital.c
index 2a18b9f2d..8938581bc 100644
--- a/external/gau2grid/generated_source/gau2grid_orbital.c
+++ b/external/gau2grid/generated_source/gau2grid_orbital.c
@@ -63,7 +63,6 @@ void gg_orbitals_L0(const double* PRAGMA_RESTRICT C, const unsigned long norbita
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -213,7 +212,6 @@ void gg_orbitals_L1(const double* PRAGMA_RESTRICT C, const unsigned long norbita
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
diff --git a/external/gau2grid/generated_source/gau2grid_phi.c b/external/gau2grid/generated_source/gau2grid_phi.c
index c0b875afd..e0cf2deb2 100644
--- a/external/gau2grid/generated_source/gau2grid_phi.c
+++ b/external/gau2grid/generated_source/gau2grid_phi.c
@@ -63,7 +63,6 @@ void gg_collocation_L0(const unsigned long npoints, const double* PRAGMA_RESTRIC
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {
@@ -187,7 +186,6 @@ void gg_collocation_L1(const unsigned long npoints, const double* PRAGMA_RESTRIC
     const double center_x = center[0];
     const double center_y = center[1];
     const double center_z = center[2];
-    double A;
 
     // Build negative exponents
     for (unsigned long i = 0; i < (unsigned long)nprim; i++) {

From eac229a1221cd3ca25b397af16db48e6f02c40e7 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Thu, 11 Jun 2026 12:25:24 +0200
Subject: [PATCH 43/52] Fix C4100: silence unreferenced formal parameter
 warnings in gau2grid and rys

Add (void)param casts for parameters required by the function signature
but unused in the implementation: ncart/nspherical/ncart_out in
gau2grid_transform.c, llA in rys compute_vrr3, ntgqp in rys_xrw.
---
 .../generated_source/gau2grid_transform.c     | 54 +++++++++++++++++++
 .../host/rys/src/rys_integral.c               |  1 +
 .../local_work_driver/host/rys/src/rys_xrw.c  |  1 +
 3 files changed, 56 insertions(+)

diff --git a/external/gau2grid/generated_source/gau2grid_transform.c b/external/gau2grid/generated_source/gau2grid_transform.c
index 2e1c6f562..219574b10 100644
--- a/external/gau2grid/generated_source/gau2grid_transform.c
+++ b/external/gau2grid/generated_source/gau2grid_transform.c
@@ -21,6 +21,8 @@
 #include "gau2grid/gau2grid_pragma.h"
 
 void gg_cca_cart_to_spherical_L0(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) {
+    (void)ncart;
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // R_00 Transform
     for (unsigned long i = 0; i < size; i++) {
@@ -30,6 +32,8 @@ void gg_cca_cart_to_spherical_L0(const unsigned long size, const double* PRAGMA_
 
 }
 void gg_cca_cart_to_spherical_sum_L0(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) {
+    (void)ncart;
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // temps
     double tmp;
@@ -62,6 +66,7 @@ void gg_cca_cart_to_spherical_L1(const unsigned long size, const double* PRAGMA_
 
 }
 void gg_cca_cart_to_spherical_sum_L1(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) {
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // temps
     double tmp;
@@ -121,6 +126,7 @@ void gg_cca_cart_to_spherical_L2(const unsigned long size, const double* PRAGMA_
 
 }
 void gg_cca_cart_to_spherical_sum_L2(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) {
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // temps
     double tmp;
@@ -213,6 +219,7 @@ void gg_cca_cart_to_spherical_L3(const unsigned long size, const double* PRAGMA_
 
 }
 void gg_cca_cart_to_spherical_sum_L3(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) {
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // temps
     double tmp;
@@ -345,6 +352,7 @@ void gg_cca_cart_to_spherical_L4(const unsigned long size, const double* PRAGMA_
 
 }
 void gg_cca_cart_to_spherical_sum_L4(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) {
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // temps
     double tmp;
@@ -527,6 +535,7 @@ void gg_cca_cart_to_spherical_L5(const unsigned long size, const double* PRAGMA_
 
 }
 void gg_cca_cart_to_spherical_sum_L5(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) {
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // temps
     double tmp;
@@ -771,6 +780,7 @@ void gg_cca_cart_to_spherical_L6(const unsigned long size, const double* PRAGMA_
 
 }
 void gg_cca_cart_to_spherical_sum_L6(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) {
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // temps
     double tmp;
@@ -918,6 +928,8 @@ void gg_cca_cart_to_spherical_sum_L6(const unsigned long size, const double* vec
 
 }
 void gg_gaussian_cart_to_spherical_L0(const unsigned long size, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT spherical, const unsigned long nspherical) {
+    (void)ncart;
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // R_00 Transform
     for (unsigned long i = 0; i < size; i++) {
@@ -927,6 +939,8 @@ void gg_gaussian_cart_to_spherical_L0(const unsigned long size, const double* PR
 
 }
 void gg_gaussian_cart_to_spherical_sum_L0(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) {
+    (void)ncart;
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // temps
     double tmp;
@@ -959,6 +973,7 @@ void gg_gaussian_cart_to_spherical_L1(const unsigned long size, const double* PR
 
 }
 void gg_gaussian_cart_to_spherical_sum_L1(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) {
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // temps
     double tmp;
@@ -1018,6 +1033,7 @@ void gg_gaussian_cart_to_spherical_L2(const unsigned long size, const double* PR
 
 }
 void gg_gaussian_cart_to_spherical_sum_L2(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) {
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // temps
     double tmp;
@@ -1110,6 +1126,7 @@ void gg_gaussian_cart_to_spherical_L3(const unsigned long size, const double* PR
 
 }
 void gg_gaussian_cart_to_spherical_sum_L3(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) {
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // temps
     double tmp;
@@ -1242,6 +1259,7 @@ void gg_gaussian_cart_to_spherical_L4(const unsigned long size, const double* PR
 
 }
 void gg_gaussian_cart_to_spherical_sum_L4(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) {
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // temps
     double tmp;
@@ -1424,6 +1442,7 @@ void gg_gaussian_cart_to_spherical_L5(const unsigned long size, const double* PR
 
 }
 void gg_gaussian_cart_to_spherical_sum_L5(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) {
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // temps
     double tmp;
@@ -1668,6 +1687,7 @@ void gg_gaussian_cart_to_spherical_L6(const unsigned long size, const double* PR
 
 }
 void gg_gaussian_cart_to_spherical_sum_L6(const unsigned long size, const double* vector, const double* PRAGMA_RESTRICT cart, const unsigned long ncart, double* PRAGMA_RESTRICT output, const unsigned long nspherical) {
+    (void)nspherical;
     ASSUME_ALIGNED(cart, 64);
     // temps
     double tmp;
@@ -1828,6 +1848,7 @@ void gg_cca_cart_copy_L0(const unsigned long size, const double* PRAGMA_RESTRICT
     }
 }
 void gg_cca_cart_sum_L0(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)ncart_out;
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
@@ -1869,6 +1890,7 @@ void gg_cca_cart_copy_L1(const unsigned long size, const double* PRAGMA_RESTRICT
     }
 }
 void gg_cca_cart_sum_L1(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)ncart_out;
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
@@ -1945,6 +1967,7 @@ void gg_cca_cart_copy_L2(const unsigned long size, const double* PRAGMA_RESTRICT
     }
 }
 void gg_cca_cart_sum_L2(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)ncart_out;
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
@@ -2070,6 +2093,7 @@ void gg_cca_cart_copy_L3(const unsigned long size, const double* PRAGMA_RESTRICT
     }
 }
 void gg_cca_cart_sum_L3(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)ncart_out;
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
@@ -2258,6 +2282,7 @@ void gg_cca_cart_copy_L4(const unsigned long size, const double* PRAGMA_RESTRICT
     }
 }
 void gg_cca_cart_sum_L4(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)ncart_out;
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
@@ -2523,6 +2548,7 @@ void gg_cca_cart_copy_L5(const unsigned long size, const double* PRAGMA_RESTRICT
     }
 }
 void gg_cca_cart_sum_L5(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)ncart_out;
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
@@ -2879,6 +2905,7 @@ void gg_cca_cart_copy_L6(const unsigned long size, const double* PRAGMA_RESTRICT
     }
 }
 void gg_cca_cart_sum_L6(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)ncart_out;
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
@@ -3095,6 +3122,7 @@ void gg_molden_cart_copy_L0(const unsigned long size, const double* PRAGMA_RESTR
     }
 }
 void gg_molden_cart_sum_L0(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)ncart_out;
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
@@ -3136,6 +3164,7 @@ void gg_molden_cart_copy_L1(const unsigned long size, const double* PRAGMA_RESTR
     }
 }
 void gg_molden_cart_sum_L1(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)ncart_out;
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
@@ -3212,6 +3241,7 @@ void gg_molden_cart_copy_L2(const unsigned long size, const double* PRAGMA_RESTR
     }
 }
 void gg_molden_cart_sum_L2(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)ncart_out;
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
@@ -3337,6 +3367,7 @@ void gg_molden_cart_copy_L3(const unsigned long size, const double* PRAGMA_RESTR
     }
 }
 void gg_molden_cart_sum_L3(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)ncart_out;
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
@@ -3525,6 +3556,7 @@ void gg_molden_cart_copy_L4(const unsigned long size, const double* PRAGMA_RESTR
     }
 }
 void gg_molden_cart_sum_L4(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)ncart_out;
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
@@ -3637,12 +3669,34 @@ void gg_molden_cart_sum_L4(const unsigned long size, const double* PRAGMA_RESTRI
     }
 }
 void gg_molden_cart_copy_L5(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)size;
+    (void)cart_input;
+    (void)ncart_input;
+    (void)cart_out;
+    (void)ncart_out;
 }
 void gg_molden_cart_sum_L5(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)size;
+    (void)vector;
+    (void)cart_input;
+    (void)ncart_input;
+    (void)cart_out;
+    (void)ncart_out;
 }
 void gg_molden_cart_copy_L6(const unsigned long size, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)size;
+    (void)cart_input;
+    (void)ncart_input;
+    (void)cart_out;
+    (void)ncart_out;
 }
 void gg_molden_cart_sum_L6(const unsigned long size, const double* PRAGMA_RESTRICT vector, const double* PRAGMA_RESTRICT cart_input, const unsigned long ncart_input, double* PRAGMA_RESTRICT cart_out, const unsigned long ncart_out) {
+    (void)size;
+    (void)vector;
+    (void)cart_input;
+    (void)ncart_input;
+    (void)cart_out;
+    (void)ncart_out;
 }
 void gg_naive_transpose(unsigned long n, unsigned long m, const double* PRAGMA_RESTRICT input, double* PRAGMA_RESTRICT output) {
     ASSUME_ALIGNED(input, 64);
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_integral.c b/src/xc_integrator/local_work_driver/host/rys/src/rys_integral.c
index cee3f63e3..4b297f3ce 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_integral.c
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_integral.c
@@ -127,6 +127,7 @@ FORCE_INLINE void compute_11(double xAB, double yAB, double zAB, double xPX, dou
 
 // nr roots > 2
 FORCE_INLINE void compute_vrr3(int nr_roots, int l, int lA, int llA, int lB, int llB, double xPX, double yPX, double zPX, double xPC, double yPC, double zPC, double aP_inv, double * rts, double *vrr_array, double *hrr_array) {
+  (void)llA;
   double *roots = (rts + 0);
   double *vrr = (vrr_array + 0);
   for(int r = 0; r < nr_roots; ++r) {
diff --git a/src/xc_integrator/local_work_driver/host/rys/src/rys_xrw.c b/src/xc_integrator/local_work_driver/host/rys/src/rys_xrw.c
index 2089bd0f6..065ed7513 100644
--- a/src/xc_integrator/local_work_driver/host/rys/src/rys_xrw.c
+++ b/src/xc_integrator/local_work_driver/host/rys/src/rys_xrw.c
@@ -15,6 +15,7 @@ void rys_xrw(int nt,
 	      const double *__restrict ryszero,
 	      double *__restrict rts,
 	      double *__restrict wts) {
+  (void)ntgqp;
 #ifdef _MSC_VER
   double *a    = (double *)_malloca(nmom     * sizeof(double));
   double *b    = (double *)_malloca((nmom-1) * sizeof(double));

From 56a0264f48278f5a8b427c0072563885a75145d6 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Thu, 11 Jun 2026 14:28:18 +0200
Subject: [PATCH 44/52] Fix gau2grid Python generator to produce
 MSVC-warning-free C code

Update c_generator.py (C4013, C4018, C4101), RSH.py and
c_util_generator.py (C4100) so regenerated sources match the
hand-fixed generated files.
---
 external/gau2grid/src/gau2grid/RSH.py         |  6 +++
 external/gau2grid/src/gau2grid/c_generator.py | 39 +++++++++++++++++--
 .../gau2grid/src/gau2grid/c_util_generator.py | 12 ++++++
 3 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/external/gau2grid/src/gau2grid/RSH.py b/external/gau2grid/src/gau2grid/RSH.py
index f99c2b096..346d0ab66 100644
--- a/external/gau2grid/src/gau2grid/RSH.py
+++ b/external/gau2grid/src/gau2grid/RSH.py
@@ -213,6 +213,9 @@ def transformation_c_generator(cg, L, cartesian_order, spherical_order, function
 
     # Start function
     cg.start_c_block(signature)
+    if L == 0:
+        cg.write("(void)ncart;")
+        cg.write("(void)nspherical;")
     cg.write("ASSUME_ALIGNED(cart, %d)" % align)
 
     cg.write("// R_%d0 Transform" % L)
@@ -287,6 +290,9 @@ def transformation_c_generator_sum(cg, L, cartesian_order, spherical_order, func
 
     # Start function
     cg.start_c_block(signature)
+    if L == 0:
+        cg.write("(void)ncart;")
+    cg.write("(void)nspherical;")
     cg.write("ASSUME_ALIGNED(cart, %d)" % align)
 
     cg.write("// temps")
diff --git a/external/gau2grid/src/gau2grid/c_generator.py b/external/gau2grid/src/gau2grid/c_generator.py
index 9fbf50ffd..3f7a9e8c9 100644
--- a/external/gau2grid/src/gau2grid/c_generator.py
+++ b/external/gau2grid/src/gau2grid/c_generator.py
@@ -95,6 +95,7 @@ def generate_c_gau2grid(max_L,
         cgs.write("#include <mm_malloc.h>")
         cgs.write("#elif defined _MSC_VER")
         cgs.write("#include <malloc.h>")
+        cgs.write("#include <stdlib.h>")
         cgs.write("#else")
         cgs.write("#include <stdlib.h>")
         cgs.write("#endif")
@@ -457,7 +458,7 @@ def shell_c_generator(cg, L, function_name="", grad=0, cartesian_order="row", in
     cg.blankline()
 
     cg.write("// Build negative exponents")
-    cg.start_c_block("for (unsigned long i = 0; i < nprim; i++)")
+    cg.start_c_block("for (unsigned long i = 0; i < (unsigned long)nprim; i++)")
     cg.write("expn1[i] = -1.0 * exponents[i]")
     if grad > 0:
         cg.write("expn2[i] = -2.0 * exponents[i]")
@@ -514,7 +515,7 @@ def shell_c_generator(cg, L, function_name="", grad=0, cartesian_order="row", in
     cg.write("} else {", endl="")
 
     # XYZ stripped blocks
-    cg.write("unsigned int start_shift = start * xyz_stride")
+    cg.write("unsigned long start_shift = start * xyz_stride")
     cg.blankline()
 
     cg.write("PRAGMA_VECTORIZE", endl="")
@@ -546,7 +547,7 @@ def shell_c_generator(cg, L, function_name="", grad=0, cartesian_order="row", in
 
     # Start inner loop
     cg.write("// Start exponential block loop")
-    cg.start_c_block("for (unsigned long n = 0; n < nprim; n++)")
+    cg.start_c_block("for (unsigned long n = 0; n < (unsigned long)nprim; n++)")
 
     # Build R2
     cg.write("const double coef = coeffs[n]")
@@ -778,6 +779,10 @@ def shell_c_generator(cg, L, function_name="", grad=0, cartesian_order="row", in
         cg.data[x] = cg.data[x].replace("= 1 * ", "= ")
         cg.data[x] = cg.data[x].replace("= 1.0 * ", "= ")
 
+    # Remove unused temp variable declarations (e.g. double A; double AX, AY, AZ;)
+    # The post-processing above may have inlined away all uses.
+    _remove_unused_decls(cg, cg_line_start)
+
     return func_sig
 
 
@@ -788,6 +793,34 @@ def _make_call(string):
     return string
 
 
+def _remove_unused_decls(cg, start):
+    """Remove 'double X;' declarations where X is never referenced after the declaration."""
+    import re
+    end = len(cg.data)
+    pos = start
+    while pos < end:
+        line = cg.data[pos]
+        # Match bare temp declarations: "    double A;" or "    double AX, AY, AZ;"
+        # Skip const, pointer, or initialized declarations.
+        m = re.match(r'\s+double\s+([A-Z][A-Z, ]*);', line.rstrip())
+        if m and 'const' not in line and '=' not in line and '*' not in line:
+            var_names = [v.strip() for v in m.group(1).split(',')]
+            all_unused = True
+            for vname in var_names:
+                pattern = re.compile(r'\b' + re.escape(vname) + r'\b')
+                for j in range(pos + 1, end):
+                    if pattern.search(cg.data[j]):
+                        all_unused = False
+                        break
+                if not all_unused:
+                    break
+            if all_unused:
+                cg.data.pop(pos)
+                end -= 1
+                continue
+        pos += 1
+
+
 def _malloc(name, size, dtype="double"):
     # return "%s*  %s = (%s*)malloc(%s * sizeof(%s))" % (dtype, name, dtype, str(size), dtype)
     return "%s* PRAGMA_RESTRICT %s = (%s*)ALIGNED_MALLOC(%d, %s * sizeof(%s))" % (dtype, name, dtype, ALIGN_SIZE, str(size), dtype)
diff --git a/external/gau2grid/src/gau2grid/c_util_generator.py b/external/gau2grid/src/gau2grid/c_util_generator.py
index 234c37662..5ef4afe4c 100644
--- a/external/gau2grid/src/gau2grid/c_util_generator.py
+++ b/external/gau2grid/src/gau2grid/c_util_generator.py
@@ -347,6 +347,11 @@ def cartesian_copy_c_generator(cg, L, cartesian_order_inner, cartesian_order_out
     except KeyError:
 
         cg.start_c_block(signature)
+        cg.write("(void)size;")
+        cg.write("(void)cart_input;")
+        cg.write("(void)ncart_input;")
+        cg.write("(void)cart_out;")
+        cg.write("(void)ncart_out;")
         cg.close_c_block()
 
         return signature
@@ -397,12 +402,19 @@ def cartesian_sum_c_generator(cg, L, cartesian_order_inner, cartesian_order_oute
     except KeyError:
 
         cg.start_c_block(signature)
+        cg.write("(void)size;")
+        cg.write("(void)vector;")
+        cg.write("(void)cart_input;")
+        cg.write("(void)ncart_input;")
+        cg.write("(void)cart_out;")
+        cg.write("(void)ncart_out;")
         cg.close_c_block()
 
         return signature
 
 
     cg.start_c_block(signature)
+    cg.write("(void)ncart_out;")
     cg.blankline()
     cg.write("ASSUME_ALIGNED(%s, %d)" % ("cart_input", align));
 

From 3fc8d4d37e0472edd8a75ccc66827f080354107c Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Thu, 11 Jun 2026 14:58:53 +0200
Subject: [PATCH 45/52] Fix gau2grid generator issues found during regeneration

- Remove trailing semicolons from (void) casts (cg.write appends them)
- Skip comments when checking if declared variables are used
- Only add #include <stdlib.h> to helper.c (for exit()), not all files
---
 .../generated_source/gau2grid/gau2grid.h      |  2 +-
 .../gau2grid/gau2grid_pragma.h                |  2 +-
 external/gau2grid/src/gau2grid/RSH.py         |  8 +++----
 external/gau2grid/src/gau2grid/c_generator.py |  8 +++++--
 .../gau2grid/src/gau2grid/c_util_generator.py | 24 +++++++++----------
 5 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/external/gau2grid/generated_source/gau2grid/gau2grid.h b/external/gau2grid/generated_source/gau2grid/gau2grid.h
index 0e097a526..29f888852 100644
--- a/external/gau2grid/generated_source/gau2grid/gau2grid.h
+++ b/external/gau2grid/generated_source/gau2grid/gau2grid.h
@@ -79,4 +79,4 @@ void gg_collocation_deriv3(int L, const unsigned long npoints, const double* PRA
 #ifdef __cplusplus
 }
 #endif
-#endif /* GAU2GRID_GUARD_H */
+#endif /* GAU2GRID_GUARD_H */
\ No newline at end of file
diff --git a/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h b/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h
index d85679263..f6033886a 100644
--- a/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h
+++ b/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h
@@ -96,4 +96,4 @@
     #define PRAGMA_RESTRICT                                  __restrict__
 
 
-#endif
+#endif
\ No newline at end of file
diff --git a/external/gau2grid/src/gau2grid/RSH.py b/external/gau2grid/src/gau2grid/RSH.py
index 346d0ab66..037646810 100644
--- a/external/gau2grid/src/gau2grid/RSH.py
+++ b/external/gau2grid/src/gau2grid/RSH.py
@@ -214,8 +214,8 @@ def transformation_c_generator(cg, L, cartesian_order, spherical_order, function
     # Start function
     cg.start_c_block(signature)
     if L == 0:
-        cg.write("(void)ncart;")
-        cg.write("(void)nspherical;")
+        cg.write("(void)ncart")
+        cg.write("(void)nspherical")
     cg.write("ASSUME_ALIGNED(cart, %d)" % align)
 
     cg.write("// R_%d0 Transform" % L)
@@ -291,8 +291,8 @@ def transformation_c_generator_sum(cg, L, cartesian_order, spherical_order, func
     # Start function
     cg.start_c_block(signature)
     if L == 0:
-        cg.write("(void)ncart;")
-    cg.write("(void)nspherical;")
+        cg.write("(void)ncart")
+    cg.write("(void)nspherical")
     cg.write("ASSUME_ALIGNED(cart, %d)" % align)
 
     cg.write("// temps")
diff --git a/external/gau2grid/src/gau2grid/c_generator.py b/external/gau2grid/src/gau2grid/c_generator.py
index 3f7a9e8c9..d1730d5d0 100644
--- a/external/gau2grid/src/gau2grid/c_generator.py
+++ b/external/gau2grid/src/gau2grid/c_generator.py
@@ -91,11 +91,14 @@ def generate_c_gau2grid(max_L,
         # cgs.write("#include <stdio.h>")
         cgs.write("#if defined(__clang__) && defined(_MSC_VER)")
         cgs.write("#include <malloc.h>")
+        if cgs is gg_helper:
+            cgs.write("#include <stdlib.h>")
         cgs.write("#elif defined __clang__")
         cgs.write("#include <mm_malloc.h>")
         cgs.write("#elif defined _MSC_VER")
         cgs.write("#include <malloc.h>")
-        cgs.write("#include <stdlib.h>")
+        if cgs is gg_helper:
+            cgs.write("#include <stdlib.h>")
         cgs.write("#else")
         cgs.write("#include <stdlib.h>")
         cgs.write("#endif")
@@ -809,7 +812,8 @@ def _remove_unused_decls(cg, start):
             for vname in var_names:
                 pattern = re.compile(r'\b' + re.escape(vname) + r'\b')
                 for j in range(pos + 1, end):
-                    if pattern.search(cg.data[j]):
+                    code_line = cg.data[j].split('//')[0]
+                    if pattern.search(code_line):
                         all_unused = False
                         break
                 if not all_unused:
diff --git a/external/gau2grid/src/gau2grid/c_util_generator.py b/external/gau2grid/src/gau2grid/c_util_generator.py
index 5ef4afe4c..9dddb47dd 100644
--- a/external/gau2grid/src/gau2grid/c_util_generator.py
+++ b/external/gau2grid/src/gau2grid/c_util_generator.py
@@ -347,11 +347,11 @@ def cartesian_copy_c_generator(cg, L, cartesian_order_inner, cartesian_order_out
     except KeyError:
 
         cg.start_c_block(signature)
-        cg.write("(void)size;")
-        cg.write("(void)cart_input;")
-        cg.write("(void)ncart_input;")
-        cg.write("(void)cart_out;")
-        cg.write("(void)ncart_out;")
+        cg.write("(void)size")
+        cg.write("(void)cart_input")
+        cg.write("(void)ncart_input")
+        cg.write("(void)cart_out")
+        cg.write("(void)ncart_out")
         cg.close_c_block()
 
         return signature
@@ -402,19 +402,19 @@ def cartesian_sum_c_generator(cg, L, cartesian_order_inner, cartesian_order_oute
     except KeyError:
 
         cg.start_c_block(signature)
-        cg.write("(void)size;")
-        cg.write("(void)vector;")
-        cg.write("(void)cart_input;")
-        cg.write("(void)ncart_input;")
-        cg.write("(void)cart_out;")
-        cg.write("(void)ncart_out;")
+        cg.write("(void)size")
+        cg.write("(void)vector")
+        cg.write("(void)cart_input")
+        cg.write("(void)ncart_input")
+        cg.write("(void)cart_out")
+        cg.write("(void)ncart_out")
         cg.close_c_block()
 
         return signature
 
 
     cg.start_c_block(signature)
-    cg.write("(void)ncart_out;")
+    cg.write("(void)ncart_out")
     cg.blankline()
     cg.write("ASSUME_ALIGNED(%s, %d)" % ("cart_input", align));
 

From f56617f7b0c5d77a08b3cba70b21ce760a9231d8 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Thu, 11 Jun 2026 15:10:19 +0200
Subject: [PATCH 46/52] Fix remaining MSVC warnings in gau2grid transform code

- C4101: remove unused out_shift variable from cart_sum functions
- C4100: silence unused trans parameter in block_copy
---
 .../gau2grid/generated_source/gau2grid_transform.c  | 13 +------------
 external/gau2grid/src/gau2grid/c_util_generator.py  |  2 +-
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/external/gau2grid/generated_source/gau2grid_transform.c b/external/gau2grid/generated_source/gau2grid_transform.c
index 219574b10..8240bad50 100644
--- a/external/gau2grid/generated_source/gau2grid_transform.c
+++ b/external/gau2grid/generated_source/gau2grid_transform.c
@@ -1852,7 +1852,6 @@ void gg_cca_cart_sum_L0(const unsigned long size, const double* PRAGMA_RESTRICT
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
-    unsigned long out_shift;
     double coef;
 
     // Copy (0, 0, 0)
@@ -1894,7 +1893,6 @@ void gg_cca_cart_sum_L1(const unsigned long size, const double* PRAGMA_RESTRICT
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
-    unsigned long out_shift;
     double coef;
 
     // Copy (1, 0, 0)
@@ -1971,7 +1969,6 @@ void gg_cca_cart_sum_L2(const unsigned long size, const double* PRAGMA_RESTRICT
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
-    unsigned long out_shift;
     double coef;
 
     // Copy (2, 0, 0)
@@ -2097,7 +2094,6 @@ void gg_cca_cart_sum_L3(const unsigned long size, const double* PRAGMA_RESTRICT
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
-    unsigned long out_shift;
     double coef;
 
     // Copy (3, 0, 0)
@@ -2286,7 +2282,6 @@ void gg_cca_cart_sum_L4(const unsigned long size, const double* PRAGMA_RESTRICT
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
-    unsigned long out_shift;
     double coef;
 
     // Copy (4, 0, 0)
@@ -2552,7 +2547,6 @@ void gg_cca_cart_sum_L5(const unsigned long size, const double* PRAGMA_RESTRICT
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
-    unsigned long out_shift;
     double coef;
 
     // Copy (5, 0, 0)
@@ -2909,7 +2903,6 @@ void gg_cca_cart_sum_L6(const unsigned long size, const double* PRAGMA_RESTRICT
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
-    unsigned long out_shift;
     double coef;
 
     // Copy (6, 0, 0)
@@ -3126,7 +3119,6 @@ void gg_molden_cart_sum_L0(const unsigned long size, const double* PRAGMA_RESTRI
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
-    unsigned long out_shift;
     double coef;
 
     // Copy (0, 0, 0)
@@ -3168,7 +3160,6 @@ void gg_molden_cart_sum_L1(const unsigned long size, const double* PRAGMA_RESTRI
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
-    unsigned long out_shift;
     double coef;
 
     // Copy (1, 0, 0)
@@ -3245,7 +3236,6 @@ void gg_molden_cart_sum_L2(const unsigned long size, const double* PRAGMA_RESTRI
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
-    unsigned long out_shift;
     double coef;
 
     // Copy (2, 0, 0)
@@ -3371,7 +3361,6 @@ void gg_molden_cart_sum_L3(const unsigned long size, const double* PRAGMA_RESTRI
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
-    unsigned long out_shift;
     double coef;
 
     // Copy (3, 0, 0)
@@ -3560,7 +3549,6 @@ void gg_molden_cart_sum_L4(const unsigned long size, const double* PRAGMA_RESTRI
 
     ASSUME_ALIGNED(cart_input, 64);
     unsigned long in_shift;
-    unsigned long out_shift;
     double coef;
 
     // Copy (4, 0, 0)
@@ -3745,6 +3733,7 @@ void gg_fast_transpose(unsigned long n, unsigned long m, const double* PRAGMA_RE
     }
 }
 void block_copy(unsigned long n, unsigned long m, const double* PRAGMA_RESTRICT input, unsigned long is, double* PRAGMA_RESTRICT output, unsigned long os, const int trans) {
+    (void)trans;
 
     ASSUME_ALIGNED(input, 64);
     for (unsigned long i = 0; i < n; i++) {
diff --git a/external/gau2grid/src/gau2grid/c_util_generator.py b/external/gau2grid/src/gau2grid/c_util_generator.py
index 9dddb47dd..3e81dff25 100644
--- a/external/gau2grid/src/gau2grid/c_util_generator.py
+++ b/external/gau2grid/src/gau2grid/c_util_generator.py
@@ -419,7 +419,6 @@ def cartesian_sum_c_generator(cg, L, cartesian_order_inner, cartesian_order_oute
     cg.write("ASSUME_ALIGNED(%s, %d)" % ("cart_input", align));
 
     cg.write("unsigned long in_shift")
-    cg.write("unsigned long out_shift")
     cg.write("double coef")
 
     for label, order in cartesian_input.items():
@@ -449,6 +448,7 @@ def block_copy(cg, align=32):
     # nout, nremain
 
     cg.start_c_block(sig)
+    cg.write("(void)trans")
     cg.blankline()
     cg.write("ASSUME_ALIGNED(%s, %d)" % ("input", align));
     cg.start_c_block("for (unsigned long i = 0; i < n; i++)")

From be725f7b2353a467e214dbec2532634c75c3a23b Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Thu, 11 Jun 2026 15:16:42 +0200
Subject: [PATCH 47/52] Suppress clang-cl unknown pragma warnings for gau2grid

The clang-cl branch uses MSVC's __pragma(loop(ivdep)), which clang-cl
does not recognize. Rather than changing the pragma, add
-Wno-unknown-pragmas to the gauxc target whenever the C compiler is
Clang, so the vectorization hint stays identical to the one used by
MSVC.
---
 external/gau2grid/CMakeLists.txt                           | 7 ++++++-
 .../gau2grid/generated_source/gau2grid/gau2grid_pragma.h   | 2 +-
 external/gau2grid/src/gau2grid/c_pragma.py                 | 2 +-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/external/gau2grid/CMakeLists.txt b/external/gau2grid/CMakeLists.txt
index ca0638ea8..207f8501f 100644
--- a/external/gau2grid/CMakeLists.txt
+++ b/external/gau2grid/CMakeLists.txt
@@ -12,7 +12,12 @@ target_sources( gauxc PRIVATE
 
 
 target_compile_definitions( gauxc PRIVATE $<BUILD_INTERFACE:__GG_NO_PRAGMA> )
-target_include_directories( gauxc 
+if( CMAKE_C_COMPILER_ID MATCHES "Clang" )
+  # gau2grid uses __pragma(loop(ivdep)) for MSVC vectorization hints.
+  # clang-cl does not recognize this pragma and emits -Wunknown-pragmas.
+  target_compile_options( gauxc PRIVATE -Wno-unknown-pragmas )
+endif()
+target_include_directories( gauxc
   PRIVATE
     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/generated_source>
 )
diff --git a/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h b/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h
index f6033886a..c010ca843 100644
--- a/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h
+++ b/external/gau2grid/generated_source/gau2grid/gau2grid_pragma.h
@@ -34,7 +34,7 @@
     #define PRAGMA_RESTRICT                                  __restrict__
 
 #elif defined(__clang__) && defined(_MSC_VER)
-    // pragmas for MSVC
+    // pragmas for clang-cl
 
     #define ALIGNED_MALLOC(alignment, size)                  _aligned_malloc(size, alignment)
     #define ALIGNED_FREE(ptr)                                _aligned_free(ptr)
diff --git a/external/gau2grid/src/gau2grid/c_pragma.py b/external/gau2grid/src/gau2grid/c_pragma.py
index 4347a1ec8..53b2c5bd6 100644
--- a/external/gau2grid/src/gau2grid/c_pragma.py
+++ b/external/gau2grid/src/gau2grid/c_pragma.py
@@ -31,7 +31,7 @@
     #define PRAGMA_RESTRICT                                  __restrict__
 
 #elif defined(__clang__) && defined(_MSC_VER)
-    // pragmas for MSVC
+    // pragmas for clang-cl
 
     #define ALIGNED_MALLOC(alignment, size)                  _aligned_malloc(size, alignment)
     #define ALIGNED_FREE(ptr)                                _aligned_free(ptr)

From 0be0971261986af2ac2617c865a8dc9bea8f44ac Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Thu, 11 Jun 2026 15:56:01 +0200
Subject: [PATCH 48/52] Update ExchCXX to include xc-threshold warning
 suppression

---
 cmake/gauxc-dep-versions.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/gauxc-dep-versions.cmake b/cmake/gauxc-dep-versions.cmake
index 4e1186e8d..7105fe166 100644
--- a/cmake/gauxc-dep-versions.cmake
+++ b/cmake/gauxc-dep-versions.cmake
@@ -8,7 +8,7 @@ set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git )
 set( GAUXC_CUTLASS_REVISION v2.10.0 )
 
 set( GAUXC_EXCHCXX_REPOSITORY https://github.com/lorisercole/ExchCXX.git )
-set( GAUXC_EXCHCXX_REVISION   0bed66fbd86108804a52b421ab511f5e8828c072 )
+set( GAUXC_EXCHCXX_REVISION   8e5247d0fb362e9d6d5455b2207df84d08220a38 )
 
 set( GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git )
 set( GAUXC_GAU2GRID_REVISION   v2.0.6 )

From cf6f34853774fb909b49aa7389cb3856550b66c1 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Thu, 11 Jun 2026 18:22:41 +0200
Subject: [PATCH 49/52] Update ExchCXX dependency hash

Pick up fix for -Wunused-parameter on backend in kernel_factory.
---
 cmake/gauxc-dep-versions.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/gauxc-dep-versions.cmake b/cmake/gauxc-dep-versions.cmake
index 7105fe166..5de0b6e7d 100644
--- a/cmake/gauxc-dep-versions.cmake
+++ b/cmake/gauxc-dep-versions.cmake
@@ -8,7 +8,7 @@ set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git )
 set( GAUXC_CUTLASS_REVISION v2.10.0 )
 
 set( GAUXC_EXCHCXX_REPOSITORY https://github.com/lorisercole/ExchCXX.git )
-set( GAUXC_EXCHCXX_REVISION   8e5247d0fb362e9d6d5455b2207df84d08220a38 )
+set( GAUXC_EXCHCXX_REVISION   97338065120ca32d04ba77f45d615a9e92cc0d20 )
 
 set( GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git )
 set( GAUXC_GAU2GRID_REVISION   v2.0.6 )

From e8494a52c457483ad11670e6b234873a1a06dc05 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Fri, 12 Jun 2026 14:03:06 +0200
Subject: [PATCH 50/52] Update ExchCXX and IntegratorXX repository revisions

---
 cmake/gauxc-dep-versions.cmake | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmake/gauxc-dep-versions.cmake b/cmake/gauxc-dep-versions.cmake
index 5de0b6e7d..439bf2ca8 100644
--- a/cmake/gauxc-dep-versions.cmake
+++ b/cmake/gauxc-dep-versions.cmake
@@ -8,13 +8,13 @@ set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git )
 set( GAUXC_CUTLASS_REVISION v2.10.0 )
 
 set( GAUXC_EXCHCXX_REPOSITORY https://github.com/lorisercole/ExchCXX.git )
-set( GAUXC_EXCHCXX_REVISION   97338065120ca32d04ba77f45d615a9e92cc0d20 )
+set( GAUXC_EXCHCXX_REVISION   c46965a1d365d24f1b3968791a12adc949ed767c )
 
 set( GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git )
 set( GAUXC_GAU2GRID_REVISION   v2.0.6 )
 
-set( GAUXC_INTEGRATORXX_REPOSITORY https://github.com/lorisercole/IntegratorXX.git )
-set( GAUXC_INTEGRATORXX_REVISION   923125236ea5971ee9accdea39da552b8e322ff6 )
+set( GAUXC_INTEGRATORXX_REPOSITORY https://github.com/wavefunction91/IntegratorXX.git )
+set( GAUXC_INTEGRATORXX_REVISION   619d26ae4ea421ed9b7a6f80eb0783f9da1ecf7a )
 
 set( GAUXC_HIGHFIVE_REPOSITORY https://github.com/highfive-devs/HighFive.git )
 set( GAUXC_HIGHFIVE_REVISION 805f0e13d09b47c4b01d40682621904aa3b31bb8 )

From 111dd9379e7fb8d9e8bcba23ae2d35fc46319f44 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Fri, 12 Jun 2026 17:48:05 +0200
Subject: [PATCH 51/52] Restore ExchCXX repository URL and revision to latest
 version

---
 cmake/gauxc-dep-versions.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/gauxc-dep-versions.cmake b/cmake/gauxc-dep-versions.cmake
index 439bf2ca8..9cf64e4b4 100644
--- a/cmake/gauxc-dep-versions.cmake
+++ b/cmake/gauxc-dep-versions.cmake
@@ -7,8 +7,8 @@ set( GAUXC_CUB_REVISION   1.10.0 )
 set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git )
 set( GAUXC_CUTLASS_REVISION v2.10.0 )
 
-set( GAUXC_EXCHCXX_REPOSITORY https://github.com/lorisercole/ExchCXX.git )
-set( GAUXC_EXCHCXX_REVISION   c46965a1d365d24f1b3968791a12adc949ed767c )
+set( GAUXC_EXCHCXX_REPOSITORY https://github.com/wavefunction91/ExchCXX.git )
+set( GAUXC_EXCHCXX_REVISION   67be5c6ebe1e5b1a32f2c3fd1c5bf4cbfe48f769 )
 
 set( GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git )
 set( GAUXC_GAU2GRID_REVISION   v2.0.6 )

From f99026fd6ac1ccb20486e76863149b019b70caa1 Mon Sep 17 00:00:00 2001
From: Loris Ercole <v-lercole@microsoft.com>
Date: Fri, 12 Jun 2026 18:28:07 +0200
Subject: [PATCH 52/52] Fix popcount implementation for MSVC & GCC compatibility
 in exx_screening

---
 src/xc_integrator/integrator_util/exx_screening.cxx | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/xc_integrator/integrator_util/exx_screening.cxx b/src/xc_integrator/integrator_util/exx_screening.cxx
index 80dcaaab0..41d89eb6c 100644
--- a/src/xc_integrator/integrator_util/exx_screening.cxx
+++ b/src/xc_integrator/integrator_util/exx_screening.cxx
@@ -13,7 +13,12 @@
 #include "host/blas.hpp"
 #include <gauxc/util/div_ceil.hpp>
 #include <chrono>
-#include <bit>
+#ifdef _MSC_VER
+#include <intrin.h>  // __popcnt
+#define GAUXC_POPCOUNT(x) __popcnt(x)
+#else
+#define GAUXC_POPCOUNT(x) __builtin_popcount(x)
+#endif
 //#include <mpi.h>
 //#include <fstream>
 #ifdef GAUXC_HAS_CUDA
@@ -196,7 +201,7 @@ void exx_ek_screening(
     }
 
     uint32_t total_shells = 0;
-    for( auto x : task_ek_shells ) total_shells += std::popcount(x);
+    for( auto x : task_ek_shells ) total_shells += static_cast<uint32_t>(GAUXC_POPCOUNT(x));
 
     std::vector<uint32_t> ek_shells; ek_shells.reserve(total_shells);
     for( auto i_block = 0u; i_block < util::div_ceil(nshells,32); ++i_block ) {