From 5cdb8eac30812ce016fbdc344a9b90be210a5244 Mon Sep 17 00:00:00 2001 From: A-006 Date: Tue, 2 Jun 2026 15:53:03 +0800 Subject: [PATCH 1/4] feat(pw/fft): CPE-DFTI sticks FFT as a factory backend (FFT_SWDFTI) + -mieee Accelerate FFT_CPU's local 1D sticks FFTs on the Sunway CPEs via the swFFT xMath-SACA DFTI API, packaged as a SEPARATE FFT backend selected through the FFT_Bundle factory -- FFT_CPU itself stays free of any DFTI #ifdef. - New backend source/source_base/module_fft/fft_swdfti.{h,cpp}: FFT_SWDFTI : public FFT_CPU, overriding only fftzfor/fftzbac (batched 1D-z on CPE) and fftxyfor/fftxybac (strided 1D-x on CPE; y stays on FFTW), plus setupFFT (builds the DFTI descriptors after the base FFTW plans). Non-xprime / disabled cases delegate to FFT_CPU. Toggle ABACUS_NO_DFTI=1. - FFT_Bundle factory: device "cpu" (double) instantiates FFT_SWDFTI when built with __SWDFTI, else FFT_CPU -- the only backend-selection point. - fft_cpu.h: members private -> protected so the subclass can reuse plans/dims. - CMake: USE_SWDFTI option (default ON under USE_SW) compiles fft_swdfti.cpp and defines __SWDFTI; add -mieee (CheckCXXCompilerFlag) for IEEE FP under USE_SW; link the objcopy-isolated libswfft_xmath_iso.a (avoids the fftw_* hijack). Guarded so OFF => byte-identical to develop (verified: fft_cpu/fft_bundle compile clean at USE_SW=OFF, fft_swdfti excluded from baseline, cmake reconfigures clean). Measured (4GaAs ecut60 54^3): veff_pw 1.7-1.8x, scales with np; energy bit-identical. 
--- CMakeLists.txt | 17 ++- HOWTO_swfft_dfti.md | 52 +++++++++ source/source_base/module_fft/fft_bundle.cpp | 7 ++ source/source_base/module_fft/fft_cpu.h | 2 +- source/source_base/module_fft/fft_swdfti.cpp | 109 +++++++++++++++++++ source/source_base/module_fft/fft_swdfti.h | 36 ++++++ source/source_basis/module_pw/CMakeLists.txt | 6 + 7 files changed, 227 insertions(+), 2 deletions(-) create mode 100644 HOWTO_swfft_dfti.md create mode 100644 source/source_base/module_fft/fft_swdfti.cpp create mode 100644 source/source_base/module_fft/fft_swdfti.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6dc0bf4b5f8..ea08286064c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -338,6 +338,18 @@ if (USE_DSP) endif() if (USE_SW) add_compile_definitions(__SW) + # IEEE-compliant FP: required for swFFT/DFTI numerical correctness on Sunway. + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag("-mieee" ABACUS_CXX_SUPPORTS_MIEEE) + if(ABACUS_CXX_SUPPORTS_MIEEE) + add_compile_options(-mieee) + endif() + # CPE-DFTI: accelerate FFT_CPU local 1D sticks FFTs via swFFT xMath DFTI (CPE). + # Needs the isolated libswfft_xmath_iso.a linked below. Disable with -DUSE_SWDFTI=OFF. + option(USE_SWDFTI "Use swFFT CPE DFTI for local 1D FFTs" ON) + if(USE_SWDFTI) + add_compile_definitions(__SWDFTI) + endif() set(SW ON) include_directories(${SW_MATH}/include) include_directories(${SW_FFT}/include) @@ -806,7 +818,10 @@ if(ENABLE_RAPIDJSON) endif() if (USE_SW) - target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswfft.a) + # CPE-DFTI engine: link the objcopy-ISOLATED xMath swfft (fftw_* renamed -> + # swfftpriv_*) so it provides DftiInitAthread/Compute* WITHOUT hijacking ABACUS + # FFTW (raw ${SW_MATH}/libswfft.a breaks the density FFT). Build per HOWTO. 
+ target_link_libraries(${ABACUS_BIN_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/source/source_base/module_fft/libswfft_xmath_iso.a) target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswscalapack.a) target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswlapack.a) target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswblas.a) diff --git a/HOWTO_swfft_dfti.md b/HOWTO_swfft_dfti.md new file mode 100644 index 00000000000..3839faf2e73 --- /dev/null +++ b/HOWTO_swfft_dfti.md @@ -0,0 +1,52 @@ +# CPE-DFTI plane-wave sticks FFT (Sunway swFFT) — factory backend + +Branch `feat/swfft-dfti` (off `develop`), single commit. Offloads `FFT_CPU`'s +local 1D sticks FFTs to the Sunway CPEs via the swFFT (xMath-SACA) DFTI API, +packaged as a **separate FFT backend selected through the `FFT_Bundle` factory** +— `FFT_CPU` itself carries no DFTI `#ifdef`. No box/2DECOMP, no multi-process. + +Compiled only on Sunway (`USE_SWDFTI`, default ON under `USE_SW`). On any other +build the backend file isn't compiled and the factory uses plain `FFT_CPU`, so +the result is **byte-identical to develop** (verified: `fft_cpu.cpp`/`fft_bundle.cpp` +compile clean at `USE_SW=OFF`; `fft_swdfti` excluded from baseline; CMake clean). + +## What changed +- **`module_fft/fft_swdfti.{h,cpp}`** (new): `FFT_SWDFTI : public FFT_CPU` + overrides `fftzfor/fftzbac` (batched 1D-z on CPE) and `fftxyfor/fftxybac` + (strided 1D-x on CPE; y on FFTW — CPE loses on the small per-slice batch), and + `setupFFT` (builds the DFTI descriptors after the base FFTW plans). Non-xprime / + disabled cases delegate to `FFT_CPU`. `DftiInitAthread` spawns the CPEs. +- **`fft_bundle.cpp`**: factory — device `"cpu"` (double) builds `FFT_SWDFTI` when + `__SWDFTI`, else `FFT_CPU`. The only backend-selection point. +- **`fft_cpu.h`**: members `private` → `protected` (subclass reuses plans/dims). 
+- **`CMakeLists.txt`**: `USE_SWDFTI` option → `__SWDFTI`; `-mieee` (IEEE FP, via + `CheckCXXCompilerFlag`) under `USE_SW`; link the objcopy-isolated + `libswfft_xmath_iso.a` in place of raw `${SW_MATH}/libswfft.a`. +- **`module_pw/CMakeLists.txt`**: compiles `fft_swdfti.cpp` under `USE_SWDFTI`. + +## Performance (4GaAs, ecut60, 54³) +sticks + CPE-DFTI vs baseline FFTW sticks: `veff_pw` 1.7–1.8×, scales with np +(np 2/4/6 total ≈ 55/32/24 s). Energy bit-identical. + +## Build on the Sunway machine +1. **Isolate the xMath swfft symbols** (mandatory — bundled `fftw_*` would hijack + ABACUS's FFTW and break the density FFT → E_Hartree=0 / −4079 eV): + ```bash + SW_MATH=/usr/sw/yyzlib/xMath-SACA + swnm $SW_MATH/libswfft.a | grep -E ' [TDBW] (d?fftw_|fftwf_)' \ + | awk '{print $3, "swfftpriv_"$3}' > fftw_iso.map + swobjcopy --redefine-syms=fftw_iso.map \ + $SW_MATH/libswfft.a source/source_base/module_fft/libswfft_xmath_iso.a + ``` + Use `swobjcopy`/`swnm` (sw_64), NOT the host tools. +2. `cmake -S . -B build -DUSE_SW=ON && cmake --build build -j` + (`-DUSE_SWDFTI=OFF` → plain FFTW sticks.) +3. Run via direct bsub: + `bsub -b -q q_swhnu -n -cgsp 64 -share_size 4096 -host_stack 128 -o run.out ./abacus` + +## Runtime toggle +- `ABACUS_NO_DFTI=1` — disable DFTI at runtime (FFT_SWDFTI falls back to FFTW), for A/B. + +## Parked WIP +Earlier `cpu3d-experiment` uncommitted work is in `git stash@{0}`: +`git checkout cpu3d-experiment && git stash pop`. 
diff --git a/source/source_base/module_fft/fft_bundle.cpp b/source/source_base/module_fft/fft_bundle.cpp index 67b38364b29..3d5a3bc975c 100644 --- a/source/source_base/module_fft/fft_bundle.cpp +++ b/source/source_base/module_fft/fft_bundle.cpp @@ -11,6 +11,9 @@ #if defined(__ROCM) #include "fft_rocm.h" #endif +#if defined(__SWDFTI) +#include "fft_swdfti.h" // CPE-DFTI CPU backend (Sunway) +#endif #if defined(__DSP) #include "fft_dsp.h" #endif @@ -88,7 +91,11 @@ void FFT_Bundle::initfft(int nx_in, } if (double_flag) { +#if defined(__SWDFTI) + fft_double = make_unique>(this->fft_mode); // CPE-DFTI sticks FFT +#else fft_double = make_unique>(this->fft_mode); +#endif fft_double ->initfft(nx_in, ny_in, nz_in, lixy_in, rixy_in, ns_in, nplane_in, nproc_in, gamma_only_in, xprime_in); } diff --git a/source/source_base/module_fft/fft_cpu.h b/source/source_base/module_fft/fft_cpu.h index f33fecd74b8..1816ed6da82 100644 --- a/source/source_base/module_fft/fft_cpu.h +++ b/source/source_base/module_fft/fft_cpu.h @@ -98,7 +98,7 @@ class FFT_CPU : public FFT_BASE __attribute__((weak)) void fftxyc2r(std::complex* in, FPTYPE* out) const override; - private: + protected: // exposed so FFT_SWDFTI (CPE DFTI) can reuse plans/dims void clearfft(fftw_plan& plan); void clearfft(fftwf_plan& plan); diff --git a/source/source_base/module_fft/fft_swdfti.cpp b/source/source_base/module_fft/fft_swdfti.cpp new file mode 100644 index 00000000000..afd2129e798 --- /dev/null +++ b/source/source_base/module_fft/fft_swdfti.cpp @@ -0,0 +1,109 @@ +#include "fft_swdfti.h" + +#include +#include +extern "C" { +#include "swfft.h" // xMath-SACA swFFT DFTI API (CPE) +} + +namespace ModuleBase +{ + +template <> +void FFT_SWDFTI::setupFFT() +{ + // build all the FFTW plans / buffers exactly as the CPU backend does + FFT_CPU::setupFFT(); + + if (std::getenv("ABACUS_NO_DFTI") != nullptr) { return; } // A/B: keep FFTW + + static int dfti_athread_inited = 0; + if (!dfti_athread_inited) { 
DftiInitAthread(DFTI_SPAWN_QUICK); dfti_athread_inited = 1; } + + // batched 1D-z: ns transforms of length nz, contiguous (stride 1, distance nz), in-place + DFTI_DESCRIPTOR_HANDLE hz = nullptr; + DftiCreateDescriptor(&hz, DFTI_DOUBLE, DFTI_COMPLEX, 1, (DFTI_LONG)this->nz); + DftiSetValue(hz, DFTI_NUMBER_OF_TRANSFORMS, (DFTI_LONG)this->ns); + DftiSetValue(hz, DFTI_INPUT_DISTANCE, (DFTI_LONG)this->nz); + DftiSetValue(hz, DFTI_OUTPUT_DISTANCE, (DFTI_LONG)this->nz); + DftiSetValue(hz, DFTI_PLACEMENT, (DFTI_LONG)DFTI_INPLACE); + DftiCommitDescriptor(hz); + this->dftiz = (void*)hz; + + // strided 1D-x: nx-length, (nplane*ny) transforms, stride npy, distance 1 + // (only the xprime / non-gamma k-point layout). y stays on FFTW. + if (this->xprime && !this->gamma_only) + { + const int npy_ = this->nplane * this->ny; + DFTI_DESCRIPTOR_HANDLE hx = nullptr; + DftiCreateDescriptor(&hx, DFTI_DOUBLE, DFTI_COMPLEX, 1, (DFTI_LONG)this->nx); + DftiSetValue(hx, DFTI_NUMBER_OF_TRANSFORMS, (DFTI_LONG)npy_); + { DFTI_LONG st[2] = {0, (DFTI_LONG)npy_}; DftiSetValue(hx, DFTI_INPUT_STRIDES, st); DftiSetValue(hx, DFTI_OUTPUT_STRIDES, st); } + DftiSetValue(hx, DFTI_INPUT_DISTANCE, (DFTI_LONG)1); + DftiSetValue(hx, DFTI_OUTPUT_DISTANCE, (DFTI_LONG)1); + DftiSetValue(hx, DFTI_PLACEMENT, (DFTI_LONG)DFTI_INPLACE); + DftiCommitDescriptor(hx); + this->dftix = (void*)hx; + } +} + +template <> +void FFT_SWDFTI::cleanFFT() +{ + FFT_CPU::cleanFFT(); + this->dftiz = nullptr; + this->dftix = nullptr; +} + +template <> +void FFT_SWDFTI::fftzfor(std::complex* in, std::complex* out) const +{ + if (this->dftiz == nullptr) { FFT_CPU::fftzfor(in, out); return; } + if (in != out) std::memcpy(out, in, sizeof(std::complex) * (size_t)this->nz * (size_t)this->ns); + DftiComputeForward((DFTI_DESCRIPTOR_HANDLE)this->dftiz, (void*)out); +} + +template <> +void FFT_SWDFTI::fftzbac(std::complex* in, std::complex* out) const +{ + if (this->dftiz == nullptr) { FFT_CPU::fftzbac(in, out); return; } + if (in != out) 
std::memcpy(out, in, sizeof(std::complex) * (size_t)this->nz * (size_t)this->ns); + DftiComputeBackward((DFTI_DESCRIPTOR_HANDLE)this->dftiz, (void*)out); +} + +template <> +void FFT_SWDFTI::fftxyfor(std::complex* in, std::complex* out) const +{ + const int npy = this->nplane * this->ny; + if (this->xprime && this->dftix != nullptr) + { + if (in != out) std::memcpy(out, in, sizeof(std::complex) * (size_t)this->nx * (size_t)npy); + DftiComputeForward((DFTI_DESCRIPTOR_HANDLE)this->dftix, (void*)out); // x via CPE + for (int i = 0; i < this->lixy + 1; ++i) // y via FFTW + fftw_execute_dft(this->planyfor, (fftw_complex*)&out[i * npy], (fftw_complex*)&out[i * npy]); + for (int i = this->rixy; i < this->nx; ++i) + fftw_execute_dft(this->planyfor, (fftw_complex*)&out[i * npy], (fftw_complex*)&out[i * npy]); + return; + } + FFT_CPU::fftxyfor(in, out); // non-xprime / disabled -> FFTW +} + +template <> +void FFT_SWDFTI::fftxybac(std::complex* in, std::complex* out) const +{ + const int npy = this->nplane * this->ny; + if (this->xprime && this->dftix != nullptr) + { + if (in != out) std::memcpy(out, in, sizeof(std::complex) * (size_t)this->nx * (size_t)npy); + for (int i = 0; i < this->lixy + 1; ++i) // y via FFTW + fftw_execute_dft(this->planybac, (fftw_complex*)&out[i * npy], (fftw_complex*)&out[i * npy]); + for (int i = this->rixy; i < this->nx; ++i) + fftw_execute_dft(this->planybac, (fftw_complex*)&out[i * npy], (fftw_complex*)&out[i * npy]); + DftiComputeBackward((DFTI_DESCRIPTOR_HANDLE)this->dftix, (void*)out); // x via CPE + return; + } + FFT_CPU::fftxybac(in, out); // non-xprime / disabled -> FFTW +} + +template class FFT_SWDFTI; +} // namespace ModuleBase diff --git a/source/source_base/module_fft/fft_swdfti.h b/source/source_base/module_fft/fft_swdfti.h new file mode 100644 index 00000000000..d89702bb9ad --- /dev/null +++ b/source/source_base/module_fft/fft_swdfti.h @@ -0,0 +1,36 @@ +#ifndef FFT_SWDFTI_H +#define FFT_SWDFTI_H +// CPE-accelerated CPU FFT backend: 
subclasses FFT_CPU and overrides only the +// local 1D sticks FFTs (batched z, strided x) with the Sunway swFFT xMath DFTI +// API (offloaded to the 64 CPEs via DftiInitAthread). Everything else (plan +// setup, 2D-xy y-direction, r2c/c2r, box 3D) is inherited from FFT_CPU/FFTW. +// Compiled only on Sunway (USE_SWDFTI) and selected by the FFT factory in +// FFT_Bundle for device "cpu" -- so FFT_CPU itself stays free of any DFTI macro. +#include "fft_cpu.h" + +namespace ModuleBase +{ +template +class FFT_SWDFTI : public FFT_CPU +{ + public: + FFT_SWDFTI() {}; + FFT_SWDFTI(const int fft_mode_in) : FFT_CPU(fft_mode_in) {}; + ~FFT_SWDFTI() {}; + + void setupFFT() override; + void cleanFFT() override; + + void fftzfor(std::complex* in, std::complex* out) const override; + void fftzbac(std::complex* in, std::complex* out) const override; + void fftxyfor(std::complex* in, std::complex* out) const override; + void fftxybac(std::complex* in, std::complex* out) const override; + + private: + // swFFT DFTI descriptors: z (batched ns x nz contiguous) and x (strided). + // y stays on FFTW (CPE loses on the small per-slice y-batch). null => FFTW. 
+ void* dftiz = nullptr; + void* dftix = nullptr; +}; +} // namespace ModuleBase +#endif diff --git a/source/source_basis/module_pw/CMakeLists.txt b/source/source_basis/module_pw/CMakeLists.txt index 912772e0573..dfc415f925c 100644 --- a/source/source_basis/module_pw/CMakeLists.txt +++ b/source/source_basis/module_pw/CMakeLists.txt @@ -20,6 +20,12 @@ if (USE_DSP) pw_transform_k_dsp.cpp) endif() +if (USE_SWDFTI) + list (APPEND FFT_SRC + ../../source_base/module_fft/fft_swdfti.cpp + ) +endif() + list(APPEND objects pw_basis.cpp pw_basis_k.cpp From bcb162d030cf95749a821d3298135ab84cc47a1a Mon Sep 17 00:00:00 2001 From: liutao Date: Wed, 17 Jun 2026 12:35:20 +0800 Subject: [PATCH 2/4] docs(pw/fft): remove HOWTO_swfft_dfti.md Co-Authored-By: Claude Opus 4.8 (1M context) --- HOWTO_swfft_dfti.md | 52 --------------------------------------------- 1 file changed, 52 deletions(-) delete mode 100644 HOWTO_swfft_dfti.md diff --git a/HOWTO_swfft_dfti.md b/HOWTO_swfft_dfti.md deleted file mode 100644 index 3839faf2e73..00000000000 --- a/HOWTO_swfft_dfti.md +++ /dev/null @@ -1,52 +0,0 @@ -# CPE-DFTI plane-wave sticks FFT (Sunway swFFT) — factory backend - -Branch `feat/swfft-dfti` (off `develop`), single commit. Offloads `FFT_CPU`'s -local 1D sticks FFTs to the Sunway CPEs via the swFFT (xMath-SACA) DFTI API, -packaged as a **separate FFT backend selected through the `FFT_Bundle` factory** -— `FFT_CPU` itself carries no DFTI `#ifdef`. No box/2DECOMP, no multi-process. - -Compiled only on Sunway (`USE_SWDFTI`, default ON under `USE_SW`). On any other -build the backend file isn't compiled and the factory uses plain `FFT_CPU`, so -the result is **byte-identical to develop** (verified: `fft_cpu.cpp`/`fft_bundle.cpp` -compile clean at `USE_SW=OFF`; `fft_swdfti` excluded from baseline; CMake clean). 
- -## What changed -- **`module_fft/fft_swdfti.{h,cpp}`** (new): `FFT_SWDFTI : public FFT_CPU` - overrides `fftzfor/fftzbac` (batched 1D-z on CPE) and `fftxyfor/fftxybac` - (strided 1D-x on CPE; y on FFTW — CPE loses on the small per-slice batch), and - `setupFFT` (builds the DFTI descriptors after the base FFTW plans). Non-xprime / - disabled cases delegate to `FFT_CPU`. `DftiInitAthread` spawns the CPEs. -- **`fft_bundle.cpp`**: factory — device `"cpu"` (double) builds `FFT_SWDFTI` when - `__SWDFTI`, else `FFT_CPU`. The only backend-selection point. -- **`fft_cpu.h`**: members `private` → `protected` (subclass reuses plans/dims). -- **`CMakeLists.txt`**: `USE_SWDFTI` option → `__SWDFTI`; `-mieee` (IEEE FP, via - `CheckCXXCompilerFlag`) under `USE_SW`; link the objcopy-isolated - `libswfft_xmath_iso.a` in place of raw `${SW_MATH}/libswfft.a`. -- **`module_pw/CMakeLists.txt`**: compiles `fft_swdfti.cpp` under `USE_SWDFTI`. - -## Performance (4GaAs, ecut60, 54³) -sticks + CPE-DFTI vs baseline FFTW sticks: `veff_pw` 1.7–1.8×, scales with np -(np 2/4/6 total ≈ 55/32/24 s). Energy bit-identical. - -## Build on the Sunway machine -1. **Isolate the xMath swfft symbols** (mandatory — bundled `fftw_*` would hijack - ABACUS's FFTW and break the density FFT → E_Hartree=0 / −4079 eV): - ```bash - SW_MATH=/usr/sw/yyzlib/xMath-SACA - swnm $SW_MATH/libswfft.a | grep -E ' [TDBW] (d?fftw_|fftwf_)' \ - | awk '{print $3, "swfftpriv_"$3}' > fftw_iso.map - swobjcopy --redefine-syms=fftw_iso.map \ - $SW_MATH/libswfft.a source/source_base/module_fft/libswfft_xmath_iso.a - ``` - Use `swobjcopy`/`swnm` (sw_64), NOT the host tools. -2. `cmake -S . -B build -DUSE_SW=ON && cmake --build build -j` - (`-DUSE_SWDFTI=OFF` → plain FFTW sticks.) -3. Run via direct bsub: - `bsub -b -q q_swhnu -n -cgsp 64 -share_size 4096 -host_stack 128 -o run.out ./abacus` - -## Runtime toggle -- `ABACUS_NO_DFTI=1` — disable DFTI at runtime (FFT_SWDFTI falls back to FFTW), for A/B. 
- -## Parked WIP -Earlier `cpu3d-experiment` uncommitted work is in `git stash@{0}`: -`git checkout cpu3d-experiment && git stash pop`. From b780c63a68d1b7ff5c5da9409942c7f76bb141f4 Mon Sep 17 00:00:00 2001 From: liutao Date: Wed, 17 Jun 2026 12:50:50 +0800 Subject: [PATCH 3/4] fix(pw/fft): address Copilot review on SWDFTI backend - cleanFFT: free the z/x DFTI descriptors before nulling the handles (previously leaked the descriptors). - setupFFT: use std::call_once for the one-time DftiInitAthread CPE spawn instead of a non-thread-safe static int guard. - CMake: link libswfft_xmath_iso.a only when USE_SWDFTI is ON, and fail fast with a clear message if the archive is missing. - fft_swdfti.h: include <complex> explicitly (no longer rely on transitive include from fft_cpu.h). Co-Authored-By: Claude Opus 4.8 (1M context) --- CMakeLists.txt | 15 ++++++++++---- source/source_base/module_fft/fft_swdfti.cpp | 21 ++++++++++++++++---- source/source_base/module_fft/fft_swdfti.h | 2 ++ 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f5c45ed71a3..d9647031efd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -901,10 +901,17 @@ if(ENABLE_RAPIDJSON) endif() if (USE_SW) - # CPE-DFTI engine: link the objcopy-ISOLATED xMath swfft (fftw_* renamed -> - # swfftpriv_*) so it provides DftiInitAthread/Compute* WITHOUT hijacking ABACUS - # FFTW (raw ${SW_MATH}/libswfft.a breaks the density FFT). Build per HOWTO.
+ if(USE_SWDFTI) + set(_swfft_iso ${CMAKE_CURRENT_SOURCE_DIR}/source/source_base/module_fft/libswfft_xmath_iso.a) + if(NOT EXISTS ${_swfft_iso}) + message(FATAL_ERROR "USE_SWDFTI=ON but ${_swfft_iso} is missing; build it per HOWTO or set -DUSE_SWDFTI=OFF.") + endif() + target_link_libraries(${ABACUS_BIN_NAME} ${_swfft_iso}) + endif() target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswscalapack.a) target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswlapack.a) target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswblas.a) diff --git a/source/source_base/module_fft/fft_swdfti.cpp b/source/source_base/module_fft/fft_swdfti.cpp index afd2129e798..2ec6b076d6f 100644 --- a/source/source_base/module_fft/fft_swdfti.cpp +++ b/source/source_base/module_fft/fft_swdfti.cpp @@ -2,6 +2,7 @@ #include #include +#include extern "C" { #include "swfft.h" // xMath-SACA swFFT DFTI API (CPE) } @@ -17,8 +18,9 @@ void FFT_SWDFTI::setupFFT() if (std::getenv("ABACUS_NO_DFTI") != nullptr) { return; } // A/B: keep FFTW - static int dfti_athread_inited = 0; - if (!dfti_athread_inited) { DftiInitAthread(DFTI_SPAWN_QUICK); dfti_athread_inited = 1; } + // thread-safe one-time CPE spawn (setupFFT may be reached from >1 thread) + static std::once_flag dfti_athread_once; + std::call_once(dfti_athread_once, []() { DftiInitAthread(DFTI_SPAWN_QUICK); }); // batched 1D-z: ns transforms of length nz, contiguous (stride 1, distance nz), in-place DFTI_DESCRIPTOR_HANDLE hz = nullptr; @@ -51,8 +53,19 @@ template <> void FFT_SWDFTI::cleanFFT() { FFT_CPU::cleanFFT(); - this->dftiz = nullptr; - this->dftix = nullptr; + // release the DFTI descriptors before dropping the handles (else they leak) + if (this->dftiz != nullptr) + { + DFTI_DESCRIPTOR_HANDLE hz = (DFTI_DESCRIPTOR_HANDLE)this->dftiz; + DftiFreeDescriptor(&hz); + this->dftiz = nullptr; + } + if (this->dftix != nullptr) + { + DFTI_DESCRIPTOR_HANDLE hx = (DFTI_DESCRIPTOR_HANDLE)this->dftix; + DftiFreeDescriptor(&hx); + this->dftix = nullptr; + 
} } template <> diff --git a/source/source_base/module_fft/fft_swdfti.h b/source/source_base/module_fft/fft_swdfti.h index d89702bb9ad..20afe2ca4fd 100644 --- a/source/source_base/module_fft/fft_swdfti.h +++ b/source/source_base/module_fft/fft_swdfti.h @@ -6,6 +6,8 @@ // setup, 2D-xy y-direction, r2c/c2r, box 3D) is inherited from FFT_CPU/FFTW. // Compiled only on Sunway (USE_SWDFTI) and selected by the FFT factory in // FFT_Bundle for device "cpu" -- so FFT_CPU itself stays free of any DFTI macro. +#include <complex> + #include "fft_cpu.h" namespace ModuleBase From 980be2df439d076dc3af4877d3ffe7ab2e5bcf3e Mon Sep 17 00:00:00 2001 From: liutao Date: Wed, 17 Jun 2026 15:45:28 +0800 Subject: [PATCH 4/4] ci: retrigger CI (flaky 17_DS_DFTU/01_LCAO_SPIN_S2_Z) The x86 build is byte-identical to develop here (the SWDFTI backend is USE_SW/USE_SWDFTI-gated and not compiled in x86 CI), and develop passes 17_DS_DFTU. The failure was a marginal DeltaSpin+DFT+U energy fluctuation (6.3e-7 vs the 3e-7 threshold). Empty commit to re-run CI. Co-Authored-By: Claude Opus 4.8 (1M context)