diff --git a/conda/recipes/libcuvs/recipe.yaml b/conda/recipes/libcuvs/recipe.yaml index aa7a37db44..93f31f8cf2 100644 --- a/conda/recipes/libcuvs/recipe.yaml +++ b/conda/recipes/libcuvs/recipe.yaml @@ -80,6 +80,7 @@ cache: - cuda-cudart-dev - cuda-nvrtc-dev - cuda-profiler-api + - cutile-python - libcublas-dev - libcurand-dev - libcusolver-dev @@ -117,6 +118,7 @@ outputs: - cuda-cudart-dev - cuda-nvrtc-dev - cuda-profiler-api + - cutile-python - libcublas-dev - libcurand-dev - libcusolver-dev @@ -179,6 +181,7 @@ outputs: - cuda-cudart-dev - cuda-nvrtc-dev - cuda-profiler-api + - cutile-python - libcublas-dev - libcurand-dev - libcusolver-dev @@ -240,6 +243,7 @@ outputs: - cuda-cudart-dev - cuda-nvrtc-dev - cuda-profiler-api + - cutile-python - libcublas-dev - libcurand-dev - libcusolver-dev @@ -299,6 +303,7 @@ outputs: - openblas # required by some CPU algos in benchmarks - cuda-cudart-dev - cuda-profiler-api + - cutile-python - libcublas-dev - libcurand-dev - libcusolver-dev diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 227c2906cc..70e2509a88 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -957,6 +957,41 @@ if(NOT BUILD_CPU_ONLY) OUTPUT_FILE_FORMAT "${CMAKE_CURRENT_BINARY_DIR}/src/distance/detail/pairwise_matrix/dispatch_rbf_inst_data_@data_abbrev@_acc_@acc_abbrev@_out_@out_abbrev@_index_@index_abbrev@_op_@op_abbrev@.cu" ) + + include(cmake/modules/generate_cutile_kernels.cmake) + set(fused_1nn_cutile_dir + "${CMAKE_CURRENT_SOURCE_DIR}/src/distance/detail/fused_distance_nn/cutile" + ) + set(cutile_fused_1nn_generated_dir + "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/distance/fused_1nn/cutile" + ) + generate_cutile_kernels( + cutile_fused_1nn_files + KERNEL_DIR + "${fused_1nn_cutile_dir}" + KERNEL_BASENAME + "fused_1nn" + KERNEL_PYTHON + "fused_1nn_kernel.py" + EXPORT_SCRIPT + "export_fused_1nn.py" + OUTPUT_DIRECTORY + "${cutile_fused_1nn_generated_dir}" + MATRIX_JSON_FILE + "${fused_1nn_cutile_dir}/fused_1nn_cutile_matrix.json" + FRAGMENT_TAG_FORMAT_CUBIN + "cuvs::distance::detail::fragment_tag_fused_1nn_cubin" + FRAGMENT_TAG_FORMAT_TILEIR + "cuvs::distance::detail::fragment_tag_fused_1nn_tileir" + FRAGMENT_TAG_HEADER_FILES + "" + "" + "" + ) + if(NOT DEFINED CUVS_CUTILE_ENABLED) + set(CUVS_CUTILE_ENABLED 0) + endif() + target_compile_definitions(cuvs_cpp_headers INTERFACE CUVS_CUTILE_ENABLED=${CUVS_CUTILE_ENABLED}) generate_inst_matrix( cagra_build_inst_files MATRIX_JSON_FILE "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/cagra_build_matrix.json" @@ -1147,6 +1182,8 @@ if(NOT BUILD_CPU_ONLY) src/util/host_memory.cpp src/detail/jit_lto/AlgorithmLauncher.cpp src/detail/jit_lto/AlgorithmPlanner.cpp + src/detail/jit_lto/LTOAlgorithmPlanner.cpp + src/detail/jit_lto/TileAlgorithmPlanner.cpp src/detail/jit_lto/FragmentEntry.cpp src/detail/jit_lto/nvjitlink_checker.cpp src/detail/jit_lto/NVRTCLTOFragmentCompiler.cpp @@ -1234,6 +1271,8 @@ if(NOT BUILD_CPU_ONLY) src/stats/trustworthiness_score.cu ${CUVS_MG_ALGOS} ${jit_lto_files} + ${cutile_fused_1nn_files} + $<$:src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.cu> ) set_target_properties( @@ -1255,8 +1294,9 @@ if(NOT BUILD_CPU_ONLY) ) target_compile_definitions( - cuvs_objs PRIVATE $<$:CUVS_BUILD_CAGRA_HNSWLIB> - $<$:NVTX_ENABLED> + cuvs_objs + PRIVATE $<$:CUVS_BUILD_CAGRA_HNSWLIB> + $<$:NVTX_ENABLED> CUVS_CUTILE_ENABLED=${CUVS_CUTILE_ENABLED} ) target_link_libraries( @@ -1275,6 +1315,7 @@ if(NOT BUILD_CPU_ONLY) "$" INTERFACE "$" PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src" "${CMAKE_CURRENT_BINARY_DIR}/src" + "${cutile_fused_1nn_generated_dir}" ) # Endian detection diff --git a/cpp/cmake/modules/generate_cutile_kernels.cmake b/cpp/cmake/modules/generate_cutile_kernels.cmake new file mode 100644 index 0000000000..ac8d369cdc --- /dev/null +++ b/cpp/cmake/modules/generate_cutile_kernels.cmake @@ -0,0 +1,228 @@ +# ============================================================================= +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on +# ============================================================================= + +include_guard(GLOBAL) + +include(${CMAKE_CURRENT_LIST_DIR}/compute_matrix_product.cmake) + +function(generate_cutile_kernels_stub) + set(CUVS_CUTILE_ENABLED + 0 + PARENT_SCOPE + ) +endfunction() + +function(_cutile_fragment_tag_header_files output_var) + set(${output_var} "") + foreach(_header IN LISTS ARGN) + if(NOT _header MATCHES "^(\".*\"|<.*>)$") + set(_header "\"${_header}\"") + endif() + string(APPEND ${output_var} "#include ${_header}\n") + endforeach() + set(${output_var} + "${${output_var}}" + PARENT_SCOPE + ) +endfunction() + +function(_cutile_kernels_setup) + set(options) + set(one_value MATRIX_JSON_FILE OUTPUT_DIRECTORY) + set(multi_value) + cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN}) + + find_package(Python3 REQUIRED COMPONENTS Interpreter) + find_package(CUDAToolkit REQUIRED) + + if(CUDAToolkit_VERSION VERSION_LESS 13.0) + message( + STATUS + "cuTile embedded kernels require CUDA 13.0+; skipping cuTile generation (found ${CUDAToolkit_VERSION})." + ) + set(_CUTILE_SETUP_OK + FALSE + PARENT_SCOPE + ) + return() + endif() + + find_program( + CUTILE_BIN2C + NAMES bin2c + PATHS ${CUDAToolkit_BIN_DIR} REQUIRED + ) + + execute_process( + COMMAND "${Python3_EXECUTABLE}" -c "import cuda.tile" + RESULT_VARIABLE _cutile_import_result + OUTPUT_QUIET ERROR_QUIET + ) + if(NOT _cutile_import_result EQUAL 0) + message( + FATAL_ERROR + "cuda.tile (cuTile Python) is required to build cuTile embedded kernels. " + "Install it in the active Python environment, e.g. pip install cuda-tile[tileiras]." + ) + endif() + + set_property( + DIRECTORY + PROPERTY CMAKE_CONFIGURE_DEPENDS "${_CUTILE_MATRIX_JSON_FILE}" + APPEND + ) + + file(MAKE_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}") + + set(Python3_EXECUTABLE + "${Python3_EXECUTABLE}" + PARENT_SCOPE + ) + set(CUTILE_BIN2C + "${CUTILE_BIN2C}" + PARENT_SCOPE + ) + set(_CUTILE_SETUP_OK + TRUE + PARENT_SCOPE + ) +endfunction() + +function(process_cutile_matrix_entry source_list_var) + set(options) + set(one_value KERNEL_DIR KERNEL_BASENAME KERNEL_PYTHON EXPORT_SCRIPT OUTPUT_DIRECTORY + FRAGMENT_TAG_FORMAT_CUBIN FRAGMENT_TAG_FORMAT_TILEIR MATRIX_JSON_ENTRY + ) + set(multi_value FRAGMENT_TAG_HEADER_FILES) + cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN}) + + find_package(Python3 REQUIRED COMPONENTS Interpreter) + + populate_matrix_variables("${_CUTILE_MATRIX_JSON_ENTRY}") + + if(register STREQUAL "cubin") + string(CONFIGURE "${_CUTILE_FRAGMENT_TAG_FORMAT_CUBIN}" fragment_tag @ONLY) + set(bin2c_symbol embedded_cubin) + set(fragment_entry_type "StaticCubinFragmentEntry") + elseif(register STREQUAL "tileir") + string(CONFIGURE "${_CUTILE_FRAGMENT_TAG_FORMAT_TILEIR}" fragment_tag @ONLY) + set(bin2c_symbol embedded_tileir) + set(fragment_entry_type "StaticTileIrBytecodeFragmentEntry") + else() + message(FATAL_ERROR "Unknown cuTile register kind '${register}'") + endif() + + _cutile_fragment_tag_header_files(fragment_tag_header_files ${_CUTILE_FRAGMENT_TAG_HEADER_FILES}) + + string(CONFIGURE "${artifact_basename}" _artifact_basename @ONLY) + set(_artifact_stem "${_CUTILE_KERNEL_BASENAME}_${_artifact_basename}") + set(_artifact_file "${_CUTILE_OUTPUT_DIRECTORY}/${_artifact_stem}.${artifact_ext}") + set(_embedded_header "${_CUTILE_OUTPUT_DIRECTORY}/${_artifact_stem}_${register}.h") + set(_fragment_cpp "${_CUTILE_OUTPUT_DIRECTORY}/${_artifact_stem}_${register}.cpp") + set(embedded_header_file "${_artifact_stem}_${register}.h") + + set(_python_args --format "${output_format}" --data-type "${data_type}" --gpu-code "${gpu_code}") + if(DEFINED bytecode_version AND NOT "${bytecode_version}" STREQUAL "") + list(APPEND _python_args --bytecode-version "${bytecode_version}") + endif() + + add_custom_command( + OUTPUT "${_artifact_file}" + COMMAND "${Python3_EXECUTABLE}" "${_CUTILE_KERNEL_DIR}/${_CUTILE_EXPORT_SCRIPT}" + "${_artifact_file}" ${_python_args} + WORKING_DIRECTORY "${_CUTILE_KERNEL_DIR}" + DEPENDS "${_CUTILE_KERNEL_DIR}/${_CUTILE_EXPORT_SCRIPT}" + "${_CUTILE_KERNEL_DIR}/${_CUTILE_KERNEL_PYTHON}" + COMMENT "Exporting cuTile ${_CUTILE_KERNEL_BASENAME} ${output_format} ${data_type}" + VERBATIM + ) + + add_custom_command( + OUTPUT "${_embedded_header}" + COMMAND "${CUTILE_BIN2C}" --const --name ${bin2c_symbol} --static "${_artifact_file}" > + "${_embedded_header}" + DEPENDS "${_artifact_file}" + VERBATIM + ) + + configure_file( + "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/register_cutile_fragment.cpp.in" "${_fragment_cpp}" @ONLY + ) + list(APPEND ${source_list_var} "${_embedded_header}" "${_fragment_cpp}") + set(${source_list_var} + "${${source_list_var}}" + PARENT_SCOPE + ) +endfunction() + +function(generate_cutile_kernels source_list_var) + set(options) + set(one_value KERNEL_DIR KERNEL_BASENAME KERNEL_PYTHON EXPORT_SCRIPT OUTPUT_DIRECTORY + MATRIX_JSON_FILE FRAGMENT_TAG_FORMAT_CUBIN FRAGMENT_TAG_FORMAT_TILEIR + ) + set(multi_value FRAGMENT_TAG_HEADER_FILES) + cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN}) + + if(NOT _CUTILE_KERNEL_BASENAME) + message(FATAL_ERROR "generate_cutile_kernels: KERNEL_BASENAME is required") + endif() + if(NOT _CUTILE_KERNEL_PYTHON) + set(_CUTILE_KERNEL_PYTHON "fused_1nn_kernel.py") + endif() + + _cutile_kernels_setup( + MATRIX_JSON_FILE "${_CUTILE_MATRIX_JSON_FILE}" OUTPUT_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}" + ) + if(NOT _CUTILE_SETUP_OK) + generate_cutile_kernels_stub() + set(${source_list_var} + "" + PARENT_SCOPE + ) + return() + endif() + + compute_matrix_product(matrix_product MATRIX_JSON_FILE "${_CUTILE_MATRIX_JSON_FILE}") + + string(JSON len LENGTH "${matrix_product}") + math(EXPR last "${len} - 1") + + # cmake-lint: disable=C0103,E1120 + foreach(i RANGE "${last}") + string(JSON matrix_json_entry GET "${matrix_product}" "${i}") + process_cutile_matrix_entry( + "${source_list_var}" + KERNEL_DIR + "${_CUTILE_KERNEL_DIR}" + KERNEL_BASENAME + "${_CUTILE_KERNEL_BASENAME}" + KERNEL_PYTHON + "${_CUTILE_KERNEL_PYTHON}" + EXPORT_SCRIPT + "${_CUTILE_EXPORT_SCRIPT}" + OUTPUT_DIRECTORY + "${_CUTILE_OUTPUT_DIRECTORY}" + FRAGMENT_TAG_FORMAT_CUBIN + "${_CUTILE_FRAGMENT_TAG_FORMAT_CUBIN}" + FRAGMENT_TAG_FORMAT_TILEIR + "${_CUTILE_FRAGMENT_TAG_FORMAT_TILEIR}" + FRAGMENT_TAG_HEADER_FILES + ${_CUTILE_FRAGMENT_TAG_HEADER_FILES} + MATRIX_JSON_ENTRY + "${matrix_json_entry}" + ) + endforeach() + + set(CUVS_CUTILE_ENABLED + 1 + PARENT_SCOPE + ) + set(${source_list_var} + "${${source_list_var}}" + PARENT_SCOPE + ) +endfunction() diff --git a/cpp/cmake/modules/register_cutile_fragment.cpp.in b/cpp/cmake/modules/register_cutile_fragment.cpp.in new file mode 100644 index 0000000000..3ffd5c0d0c --- /dev/null +++ b/cpp/cmake/modules/register_cutile_fragment.cpp.in @@ -0,0 +1,22 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "@embedded_header_file@" +#include + +@fragment_tag_header_files@ + + namespace +{ + using fragment_tag = @fragment_tag@; + using fragment_entry = @fragment_entry_type@; + +} // namespace + +template <> +const uint8_t* const fragment_entry::data = @bin2c_symbol@; + +template <> +const size_t fragment_entry::length = sizeof(@bin2c_symbol@); diff --git a/cpp/include/cuvs/detail/jit_lto/AlgorithmPlanner.hpp b/cpp/include/cuvs/detail/jit_lto/AlgorithmPlanner.hpp index 7f275b1285..d727c73b9d 100644 --- a/cpp/include/cuvs/detail/jit_lto/AlgorithmPlanner.hpp +++ b/cpp/include/cuvs/detail/jit_lto/AlgorithmPlanner.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,7 @@ struct LauncherJitCache { std::shared_mutex mutex; std::unordered_map> launchers; + std::unordered_set build_failed; }; struct AlgorithmPlanner { @@ -27,9 +29,32 @@ struct AlgorithmPlanner { { } + virtual ~AlgorithmPlanner() = default; + std::shared_ptr get_launcher(); + /** Returns nullptr when no module can be loaded for the current device (does not RAFT_FAIL). */ + std::shared_ptr try_get_launcher(); + std::string entrypoint; + + protected: + virtual std::shared_ptr build() = 0; + + virtual std::string get_planner_key() const = 0; + + std::shared_ptr read_cache(std::string const& launch_key) const; + + LauncherJitCache& jit_cache_; +}; + +/** Links embedded LTO fatbin fragments at runtime via nvJitLink. */ +struct LTOAlgorithmPlanner : AlgorithmPlanner { + LTOAlgorithmPlanner(std::string entrypoint, LauncherJitCache& jit_cache) + : AlgorithmPlanner(std::move(entrypoint), jit_cache) + { + } + std::vector> fragments; template >> @@ -45,16 +70,38 @@ struct AlgorithmPlanner { } protected: - /** Extra link-time option strings passed to nvJitLink. Base build() - * always passes "-lto" and "-arch=sm_XX" first; derived planners may append here in their - * constructor body. */ + /** Extra link-time option strings passed to nvJitLink. */ std::vector linktime_extra_options; - private: - std::string get_fragments_key() const; - std::shared_ptr build(); + std::string get_planner_key() const override; - std::shared_ptr read_cache(std::string const& launch_key) const; + std::shared_ptr build() override; +}; - LauncherJitCache& jit_cache_; +/** Loads prebuilt cubins or TileIR bytecode via cudaLibraryLoadData. */ +struct TileAlgorithmPlanner : AlgorithmPlanner { + TileAlgorithmPlanner(std::string entrypoint, LauncherJitCache& jit_cache) + : AlgorithmPlanner(std::move(entrypoint), jit_cache) + { + } + + template + void add_static_fragment() + { + cubin_fragments_.push_back(std::make_unique>()); + } + + template + void add_static_tileir_fragment() + { + tileir_fragment_ = std::make_unique>(); + } + + protected: + std::vector> cubin_fragments_; + std::unique_ptr tileir_fragment_; + + std::string get_planner_key() const override; + + std::shared_ptr build() override; }; diff --git a/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp b/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp index 35aa46633c..6c399d860a 100644 --- a/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp +++ b/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp @@ -62,3 +62,69 @@ struct UDFFatbinFragment final : FatbinFragmentEntry { std::string key_; std::vector bytes_; }; + +/** Embedded CUDA binary module (cubin), loaded directly via cudaLibraryLoadData. */ +struct CubinFragmentEntry { + virtual ~CubinFragmentEntry() = default; + + virtual const uint8_t* get_data() const = 0; + + virtual size_t get_length() const = 0; + + virtual const char* get_key() const = 0; + + virtual int get_cc_major() const = 0; + + virtual int get_cc_minor() const = 0; +}; + +template +struct StaticCubinFragmentEntry final : CubinFragmentEntry { + const uint8_t* get_data() const override { return StaticCubinFragmentEntry::data; } + + size_t get_length() const override { return StaticCubinFragmentEntry::length; } + + const char* get_key() const override + { + return typeid(StaticCubinFragmentEntry).name(); + } + + int get_cc_major() const override { return FragmentTag::cc_major; } + + int get_cc_minor() const override { return FragmentTag::cc_minor; } + + static const uint8_t* const data; + static const size_t length; +}; + +/** Embedded TileIR bytecode, JIT-compiled by the driver when no matching cubin exists. */ +struct TileIrBytecodeFragmentEntry { + virtual ~TileIrBytecodeFragmentEntry() = default; + + virtual const uint8_t* get_data() const = 0; + + virtual size_t get_length() const = 0; + + virtual const char* get_key() const = 0; +}; + +template +struct StaticTileIrBytecodeFragmentEntry final : TileIrBytecodeFragmentEntry { + const uint8_t* get_data() const override + { + return StaticTileIrBytecodeFragmentEntry::data; + } + + size_t get_length() const override + { + return StaticTileIrBytecodeFragmentEntry::length; + } + + const char* get_key() const override + { + return typeid(StaticTileIrBytecodeFragmentEntry).name(); + } + + static const uint8_t* const data; + static const size_t length; +}; diff --git a/cpp/include/cuvs/detail/jit_lto/cutile_arch_tags.hpp b/cpp/include/cuvs/detail/jit_lto/cutile_arch_tags.hpp new file mode 100644 index 0000000000..2c915a278b --- /dev/null +++ b/cpp/include/cuvs/detail/jit_lto/cutile_arch_tags.hpp @@ -0,0 +1,52 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#ifndef CUVS_CUTILE_ENABLED +#define CUVS_CUTILE_ENABLED 0 +#endif + +namespace cuvs::detail::jit_lto { + +#if CUVS_CUTILE_ENABLED + +/** Must stay in sync with cuTile matrix _arch entries and planner add_static_fragment calls. */ +struct cutile_arch_8_0 { + static constexpr int cc_major = 8; + static constexpr int cc_minor = 0; +}; + +struct cutile_arch_8_6 { + static constexpr int cc_major = 8; + static constexpr int cc_minor = 6; +}; + +struct cutile_arch_9_0 { + static constexpr int cc_major = 9; + static constexpr int cc_minor = 0; +}; + +struct cutile_arch_12_0 { + static constexpr int cc_major = 12; + static constexpr int cc_minor = 0; +}; + +inline bool is_embedded_cubin_arch(int cc_major, int cc_minor) +{ + if (cc_major == 8 && cc_minor == 0) { return true; } + if (cc_major == 8 && cc_minor == 6) { return true; } + if (cc_major == 9 && cc_minor == 0) { return true; } + if (cc_major == 12 && cc_minor == 0) { return true; } + return false; +} + +#else + +inline bool is_embedded_cubin_arch(int, int) { return false; } + +#endif + +} // namespace cuvs::detail::jit_lto diff --git a/cpp/include/cuvs/detail/jit_lto/cutile_module.hpp b/cpp/include/cuvs/detail/jit_lto/cutile_module.hpp new file mode 100644 index 0000000000..dff0f472a7 --- /dev/null +++ b/cpp/include/cuvs/detail/jit_lto/cutile_module.hpp @@ -0,0 +1,75 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include + +namespace cuvs::detail::jit_lto { + +struct CutileModuleImage { + const uint8_t* data; + size_t size; +}; + +inline bool get_device_compute_capability(int& cc_major, int& cc_minor) +{ + int device = 0; + if (cudaGetDevice(&device) != cudaSuccess) { return false; } + if (cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device) != cudaSuccess) { + return false; + } + if (cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device) != cudaSuccess) { + return false; + } + return true; +} + +/** Selects a prebuilt cubin for the device CC, or embedded TileIR when the driver can JIT it. */ +inline std::optional resolve_cutile_module_image( + int cc_major, + int cc_minor, + int driver_version, + const std::vector>& cubin_fragments, + const TileIrBytecodeFragmentEntry* tileir_fragment) +{ + for (const auto& fragment : cubin_fragments) { + if (fragment->get_cc_major() == cc_major && fragment->get_cc_minor() == cc_minor) { + return CutileModuleImage{fragment->get_data(), fragment->get_length()}; + } + } + if (tileir_fragment != nullptr && tileir_fallback_available(driver_version)) { + return CutileModuleImage{tileir_fragment->get_data(), tileir_fragment->get_length()}; + } + return std::nullopt; +} + +inline std::shared_ptr load_cutile_launcher(const CutileModuleImage& image, + const std::string& kernel_symbol) +{ + cudaLibrary_t library{}; + RAFT_CUDA_TRY( + cudaLibraryLoadData(&library, image.data, nullptr, nullptr, 0, nullptr, nullptr, 0)); + + cudaKernel_t kernel{}; + RAFT_CUDA_TRY(cudaLibraryGetKernel(&kernel, library, kernel_symbol.c_str())); + + return std::make_shared(kernel, library); +} + +} // namespace cuvs::detail::jit_lto diff --git a/cpp/include/cuvs/detail/jit_lto/fused_distance_nn/fused_1nn_fragments.hpp b/cpp/include/cuvs/detail/jit_lto/fused_distance_nn/fused_1nn_fragments.hpp new file mode 100644 index 0000000000..517118bbe2 --- /dev/null +++ b/cpp/include/cuvs/detail/jit_lto/fused_distance_nn/fused_1nn_fragments.hpp @@ -0,0 +1,21 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +namespace cuvs::distance::detail { + +template +struct fragment_tag_fused_1nn_cubin { + static constexpr int cc_major = ArchTag::cc_major; + static constexpr int cc_minor = ArchTag::cc_minor; +}; + +template +struct fragment_tag_fused_1nn_tileir {}; + +} // namespace cuvs::distance::detail diff --git a/cpp/include/cuvs/detail/jit_lto/tileir_compat.hpp b/cpp/include/cuvs/detail/jit_lto/tileir_compat.hpp new file mode 100644 index 0000000000..f15407fd4c --- /dev/null +++ b/cpp/include/cuvs/detail/jit_lto/tileir_compat.hpp @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#ifndef CUVS_CUTILE_ENABLED +#define CUVS_CUTILE_ENABLED 0 +#endif + +#include +#include + +#include + +namespace cuvs::detail::jit_lto { + +/** Minimum CUDA driver version (from cudaDriverGetVersion) for TileIR JIT of embedded bytecode. */ +inline constexpr int kMinTileIrJitDriverVersion = 13010; // CUDA 13.1 / driver >= 590.44 + +/** Minimum CUDA runtime version (from cudaRuntimeGetVersion) for cuTile integration. */ +inline constexpr int kMinCutileRuntimeVersion = 13000; + +inline constexpr bool library_built_with_cutile() +{ +#if CUVS_CUTILE_ENABLED + return true; +#else + return false; +#endif +} + +inline bool runtime_cuda13_or_newer() +{ + int runtime_version = 0; + if (cudaRuntimeGetVersion(&runtime_version) != cudaSuccess) { return false; } + return runtime_version >= kMinCutileRuntimeVersion; +} + +/** True when this build embeds cuTile artifacts and the runtime is CUDA 13+. */ +inline bool cutile_integration_enabled() +{ + return library_built_with_cutile() && runtime_cuda13_or_newer(); +} + +/** True when this build embeds a prebuilt cubin for the given compute capability. */ +inline bool has_embedded_cubin_for_arch(int cc_major, int cc_minor) +{ + return is_embedded_cubin_arch(cc_major, cc_minor); +} + +/** True when the driver can JIT-compile embedded TileIR bytecode at load time. */ +inline bool tileir_fallback_available(int driver_version) +{ + return driver_version >= kMinTileIrJitDriverVersion; +} + +/** + * True when a cuTile launch may be attempted for the given device: cuTile is enabled, the runtime + * is CUDA 13+, and either a matching embedded cubin exists (no driver JIT required) or the driver + * can JIT the embedded TileIR bytecode fallback. + */ +inline bool cutile_launch_available_for_arch(int cc_major, int cc_minor, int driver_version) +{ + if (!cutile_integration_enabled()) { return false; } + if (has_embedded_cubin_for_arch(cc_major, cc_minor)) { return true; } + return tileir_fallback_available(driver_version); +} + +inline bool query_driver_version(int& driver_version) +{ + return cudaDriverGetVersion(&driver_version) == cudaSuccess; +} + +inline bool query_current_device_arch(int& cc_major, int& cc_minor) +{ + int device = 0; + if (cudaGetDevice(&device) != cudaSuccess) { return false; } + if (cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device) != cudaSuccess) { + return false; + } + if (cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device) != cudaSuccess) { + return false; + } + return true; +} + +inline bool cutile_launch_available_on_current_device() +{ + int cc_major = 0; + int cc_minor = 0; + int driver_version = 0; + if (!query_current_device_arch(cc_major, cc_minor)) { return false; } + if (!query_driver_version(driver_version)) { return false; } + return cutile_launch_available_for_arch(cc_major, cc_minor, driver_version); +} + +} // namespace cuvs::detail::jit_lto diff --git a/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp b/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp index 7416ea396d..486d6f1aa5 100644 --- a/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp +++ b/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp @@ -3,33 +3,16 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include -#include #include #include -#include #include #include -#include #include -#include - -#include "cuda_runtime.h" -#include "nvJitLink.h" #include #include -std::string AlgorithmPlanner::get_fragments_key() const -{ - std::string key = ""; - for (const auto& fragment : this->fragments) { - key += fragment->get_key(); - } - return key; -} - std::shared_ptr AlgorithmPlanner::read_cache(std::string const& launch_key) const { auto& launchers = jit_cache_.launchers; @@ -38,79 +21,37 @@ std::shared_ptr AlgorithmPlanner::read_cache(std::string cons return nullptr; } -std::shared_ptr AlgorithmPlanner::get_launcher() +std::shared_ptr AlgorithmPlanner::try_get_launcher() { - auto& launchers = jit_cache_.launchers; - auto launch_key = this->get_fragments_key(); + auto launch_key = this->get_planner_key(); - if (auto hit = read_cache(launch_key)) { return hit; } + { + std::shared_lock read_lock(jit_cache_.mutex); + if (jit_cache_.build_failed.count(launch_key)) { return nullptr; } + if (auto hit = read_cache(launch_key)) { return hit; } + } std::unique_lock write_lock(jit_cache_.mutex); - if (auto it = launchers.find(launch_key); it != launchers.end()) { return it->second; } + if (jit_cache_.build_failed.count(launch_key)) { return nullptr; } + if (auto it = jit_cache_.launchers.find(launch_key); it != jit_cache_.launchers.end()) { + return it->second; + } - std::string log_message = - "JIT compiling launcher for kernel: " + this->entrypoint + " and device functions: "; - for (const auto& fragment : this->fragments) { - log_message += std::string{fragment->get_key()} + ","; + RAFT_LOG_DEBUG("Building launcher for kernel entrypoint: %s", this->entrypoint.c_str()); + auto launcher = this->build(); + if (!launcher) { + jit_cache_.build_failed.insert(launch_key); + return nullptr; } - log_message.pop_back(); - RAFT_LOG_DEBUG("%s", log_message.c_str()); - auto launcher = this->build(); - launchers[launch_key] = launcher; + jit_cache_.launchers[launch_key] = launcher; return launcher; } -std::shared_ptr AlgorithmPlanner::build() +std::shared_ptr AlgorithmPlanner::get_launcher() { - int device = 0; - int major = 0; - int minor = 0; - RAFT_CUDA_TRY(cudaGetDevice(&device)); - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device)); - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device)); - - std::string archs = "-arch=sm_" + std::to_string((major * 10 + minor)); - - // Load the generated LTO IR and link them together - nvJitLinkHandle handle; - std::vector lopts; - lopts.reserve(2 + linktime_extra_options.size()); - lopts.push_back("-lto"); - lopts.push_back(archs.c_str()); - for (auto const& opt : linktime_extra_options) { - lopts.push_back(opt.c_str()); - } - auto result = nvJitLinkCreate(&handle, static_cast(lopts.size()), lopts.data()); - check_nvjitlink_result(handle, result); - - for (const auto& frag : this->fragments) { - frag->add_to(handle); + auto launcher = try_get_launcher(); + if (!launcher) { + RAFT_FAIL("Failed to build launcher for kernel entrypoint: %s", this->entrypoint.c_str()); } - - // Call to nvJitLinkComplete causes linker to link together all the LTO-IR - // modules perform any optimizations and generate cubin from it. - result = nvJitLinkComplete(handle); - check_nvjitlink_result(handle, result); - - // get cubin from nvJitLink - size_t cubin_size; - result = nvJitLinkGetLinkedCubinSize(handle, &cubin_size); - check_nvjitlink_result(handle, result); - - std::unique_ptr cubin{new char[cubin_size]}; - result = nvJitLinkGetLinkedCubin(handle, cubin.get()); - check_nvjitlink_result(handle, result); - - result = nvJitLinkDestroy(&handle); - RAFT_EXPECTS(result == NVJITLINK_SUCCESS, "nvJitLinkDestroy failed"); - - // cubin is linked, so now load it - cudaLibrary_t library; - RAFT_CUDA_TRY( - cudaLibraryLoadData(&library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0)); - - cudaKernel_t kernel; - RAFT_CUDA_TRY(cudaLibraryGetKernel(&kernel, library, this->entrypoint.c_str())); - - return std::make_shared(kernel, library); + return launcher; } diff --git a/cpp/src/detail/jit_lto/LTOAlgorithmPlanner.cpp b/cpp/src/detail/jit_lto/LTOAlgorithmPlanner.cpp new file mode 100644 index 0000000000..da7c0408b4 --- /dev/null +++ b/cpp/src/detail/jit_lto/LTOAlgorithmPlanner.cpp @@ -0,0 +1,76 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include + +#include +#include + +#include "cuda_runtime.h" +#include "nvJitLink.h" + +#include + +std::string LTOAlgorithmPlanner::get_planner_key() const +{ + std::string key; + for (const auto& fragment : this->fragments) { + key += fragment->get_key(); + } + return key; +} + +std::shared_ptr LTOAlgorithmPlanner::build() +{ + int device = 0; + int major = 0; + int minor = 0; + RAFT_CUDA_TRY(cudaGetDevice(&device)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device)); + + std::string archs = "-arch=sm_" + std::to_string((major * 10 + minor)); + + nvJitLinkHandle handle; + std::vector lopts; + lopts.reserve(2 + linktime_extra_options.size()); + lopts.push_back("-lto"); + lopts.push_back(archs.c_str()); + for (auto const& opt : linktime_extra_options) { + lopts.push_back(opt.c_str()); + } + auto result = nvJitLinkCreate(&handle, static_cast(lopts.size()), lopts.data()); + check_nvjitlink_result(handle, result); + + for (const auto& frag : this->fragments) { + frag->add_to(handle); + } + + result = nvJitLinkComplete(handle); + check_nvjitlink_result(handle, result); + + size_t cubin_size; + result = nvJitLinkGetLinkedCubinSize(handle, &cubin_size); + check_nvjitlink_result(handle, result); + + std::unique_ptr cubin{new char[cubin_size]}; + result = nvJitLinkGetLinkedCubin(handle, cubin.get()); + check_nvjitlink_result(handle, result); + + result = nvJitLinkDestroy(&handle); + RAFT_EXPECTS(result == NVJITLINK_SUCCESS, "nvJitLinkDestroy failed"); + + cudaLibrary_t library; + RAFT_CUDA_TRY( + cudaLibraryLoadData(&library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0)); + + cudaKernel_t kernel; + RAFT_CUDA_TRY(cudaLibraryGetKernel(&kernel, library, this->entrypoint.c_str())); + + return std::make_shared(kernel, library); +} diff --git a/cpp/src/detail/jit_lto/TileAlgorithmPlanner.cpp b/cpp/src/detail/jit_lto/TileAlgorithmPlanner.cpp new file mode 100644 index 0000000000..e0ce77e789 --- /dev/null +++ b/cpp/src/detail/jit_lto/TileAlgorithmPlanner.cpp @@ -0,0 +1,36 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include +#include + +std::string TileAlgorithmPlanner::get_planner_key() const +{ + std::string key = this->entrypoint; + for (const auto& fragment : cubin_fragments_) { + key += fragment->get_key(); + } + if (tileir_fragment_) { key += tileir_fragment_->get_key(); } + return key; +} + +std::shared_ptr TileAlgorithmPlanner::build() +{ + int cc_major = 0; + int cc_minor = 0; + if (!cuvs::detail::jit_lto::get_device_compute_capability(cc_major, cc_minor)) { return nullptr; } + + int driver_version = 0; + if (cudaDriverGetVersion(&driver_version) != cudaSuccess) { return nullptr; } + + auto image = cuvs::detail::jit_lto::resolve_cutile_module_image( + cc_major, cc_minor, driver_version, cubin_fragments_, tileir_fragment_.get()); + if (!image) { return nullptr; } + + return cuvs::detail::jit_lto::load_cutile_launcher(*image, this->entrypoint); +} diff --git a/cpp/src/distance/detail/fused_distance_nn.cuh b/cpp/src/distance/detail/fused_distance_nn.cuh index f9dbd968ec..b1b18e58f6 100644 --- a/cpp/src/distance/detail/fused_distance_nn.cuh +++ b/cpp/src/distance/detail/fused_distance_nn.cuh @@ -1,17 +1,25 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once +#ifndef CUVS_CUTILE_ENABLED +#define CUVS_CUTILE_ENABLED 0 +#endif + #include "distance_ops/l2_exp.cuh" // ops::l2_exp_distance_op #include "fused_distance_nn/cutlass_base.cuh" +#if CUVS_CUTILE_ENABLED +#include "fused_distance_nn/cutile/fused_1nn_tile.hpp" +#endif #include "fused_distance_nn/fused_cosine_nn.cuh" #include "fused_distance_nn/fused_l2_nn.cuh" #include "fused_distance_nn/helper_structs.cuh" #include "fused_distance_nn/simt_kernel.cuh" #include "pairwise_distance_base.cuh" // PairwiseDistances +#include #include #include // raft::KeyValuePair #include // raft::identity_op @@ -54,6 +62,13 @@ void fusedDistanceNNImpl(OutT* min, // The kernel policy is determined by fusedDistanceNN. typedef Policy P; +#if CUVS_CUTILE_ENABLED + if (cuvs::detail::jit_lto::cutile_launch_available_on_current_device() && + try_fused_1nn_tile(min, x, y, m, n, k, metric, stream)) { + return; + } +#endif + dim3 blk(P::Nthreads); auto nblks = raft::ceildiv(m, P::Nthreads); constexpr auto maxVal = std::numeric_limits::max(); diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/export_fused_1nn.py b/cpp/src/distance/detail/fused_distance_nn/cutile/export_fused_1nn.py new file mode 100644 index 0000000000..10a4fa9ec1 --- /dev/null +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/export_fused_1nn.py @@ -0,0 +1,144 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +"""Export fused 1-NN cuTile kernels to cubin or TileIR bytecode.""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path +from typing import Literal + +import cuda.tile as ct +from cuda.tile.compilation import ( + ArrayConstraint, + CallingConvention, + ConstantConstraint, + KernelSignature, + ScalarConstraint, + export_kernel, +) + +from fused_1nn_kernel import KERNELS, KERNEL_SYMBOLS, TILE_CONSTANTS + +DEFAULT_TILEIR_BYTECODE_VERSION = "13.1" +# cuTile requires a gpu_code even for TileIR bytecode export: it selects the compilation +# target / feature set for lowering, not the runtime architecture (the driver JITs at load). +DEFAULT_TILEIR_EXPORT_GPU_CODE = "sm_80" + + +def _dtype_for(data_type: str): + if data_type == "half": + return ct.float16 + if data_type == "float": + return ct.float32 + raise ValueError(f"Unsupported data_type {data_type!r}") + + +def _kernel_signature(data_type: str) -> KernelSignature: + elem = _dtype_for(data_type) + array = ArrayConstraint( + elem, + 2, + index_dtype=ct.int64, + stride_lower_bound_incl=0, + alias_groups=(), + may_alias_internally=False, + ) + idx_array = ArrayConstraint( + ct.int64, + 1, + index_dtype=ct.int64, + stride_lower_bound_incl=0, + alias_groups=(), + may_alias_internally=False, + stride_constant=(1,), + ) + dist_array = ArrayConstraint( + ct.float32, + 1, + index_dtype=ct.int64, + stride_lower_bound_incl=0, + alias_groups=(), + may_alias_internally=False, + stride_constant=(1,), + ) + tm, tn, tk = TILE_CONSTANTS + return KernelSignature( + parameters=[ + array, + array, + idx_array, + dist_array, + ScalarConstraint(ct.int64), + ScalarConstraint(ct.int64), + ScalarConstraint(ct.int64), + ConstantConstraint(tm), + ConstantConstraint(tn), + ConstantConstraint(tk), + ], + calling_convention=CallingConvention.cutile_python_v1(), + ).with_symbol(KERNEL_SYMBOLS[data_type]) + + +def export_binary( + output_file: Path, + *, + output_format: Literal["cubin", "tileir_bytecode"], + data_type: str, + gpu_code: str, + bytecode_version: str | None = None, +) -> str: + kernel = KERNELS[data_type] + signature = _kernel_signature(data_type) + + export_kwargs = { + "kernel": kernel, + "signatures": [signature], + "output_file": str(output_file), + "gpu_code": gpu_code, + "output_format": output_format, + } + if output_format == "tileir_bytecode": + export_kwargs["bytecode_version"] = ( + bytecode_version or DEFAULT_TILEIR_BYTECODE_VERSION + ) + + export_kernel(**export_kwargs) + + return signature.symbol + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("output_file", type=Path) + parser.add_argument( + "--format", choices=("cubin", "tileir_bytecode"), default="cubin" + ) + parser.add_argument( + "--data-type", choices=tuple(KERNELS.keys()), required=True + ) + parser.add_argument( + "--gpu-code", + default=DEFAULT_TILEIR_EXPORT_GPU_CODE, + help="Target SM for cubin export, or compile hint for TileIR bytecode export", + ) + parser.add_argument( + "--bytecode-version", default=DEFAULT_TILEIR_BYTECODE_VERSION + ) + args = parser.parse_args() + + print( + export_binary( + args.output_file, + output_format=args.format, + data_type=args.data_type, + gpu_code=args.gpu_code, + bytecode_version=args.bytecode_version, + ) + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_matrix.json b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_matrix.json new file mode 100644 index 0000000000..52955863c5 --- /dev/null +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_matrix.json @@ -0,0 +1,64 @@ +[ + { + "_data": [ + { + "data_type": "half", + "data_abbrev": "h" + }, + { + "data_type": "float", + "data_abbrev": "f" + } + ], + "_export": [ + { + "output_format": "cubin", + "artifact_ext": "cubin", + "artifact_basename": "@data_type@_@gpu_code@", + "register": "cubin", + "gpu_code": "sm_80", + "cc_major": 8, + "cc_minor": 0, + "arch_tag": "cutile_arch_8_0" + }, + { + "output_format": "cubin", + "artifact_ext": "cubin", + "artifact_basename": "@data_type@_@gpu_code@", + "register": "cubin", + "gpu_code": "sm_86", + "cc_major": 8, + "cc_minor": 6, + "arch_tag": "cutile_arch_8_6" + }, + { + "output_format": "cubin", + "artifact_ext": "cubin", + "artifact_basename": "@data_type@_@gpu_code@", + "register": "cubin", + "gpu_code": "sm_90", + "cc_major": 9, + "cc_minor": 0, + "arch_tag": "cutile_arch_9_0" + }, + { + "output_format": "cubin", + "artifact_ext": "cubin", + "artifact_basename": "@data_type@_@gpu_code@", + "register": "cubin", + "gpu_code": "sm_120", + "cc_major": 12, + "cc_minor": 0, + "arch_tag": "cutile_arch_12_0" + }, + { + "output_format": "tileir_bytecode", + "artifact_ext": "tilebc", + "artifact_basename": "@data_type@", + "register": "tileir", + "gpu_code": "sm_80", + "bytecode_version": "13.1" + } + ] + } +] diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_kernel.py b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_kernel.py new file mode 100644 index 0000000000..65fe165b70 --- /dev/null +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_kernel.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +"""cuTile fused GEMM + inner-product 1-NN (argmax dot product) for cuVS.""" + +from __future__ import annotations + +import cuda.tile as ct + +ConstInt = ct.Constant[int] + +TILE_M = 128 +TILE_N = 256 +TILE_K = 64 + + +def _make_kernel(data_type: str): + if data_type not in ("half", "float"): + raise ValueError(f"Unsupported data_type {data_type!r}") + acc_dtype = ct.float32 + + @ct.kernel + def fused_1nn_kernel( + A, + B, + OutIdx, + OutDist, + M, + N, + K, + tm: ConstInt, + tn: ConstInt, + tk: ConstInt, + ): + bidm = ct.bid(0) + + best_dist = ct.full((tm,), -3.4e38, acc_dtype) + best_idx = ct.zeros((tm,), ct.int64) + + num_tiles_k = ct.num_tiles(A, axis=1, shape=(tm, tk)) + num_tiles_n = ct.num_tiles(B, axis=0, shape=(tn, tk)) + zero_pad = ct.PaddingMode.ZERO + + for n in range(num_tiles_n): + accumulator = ct.full((tm, tn), 0, dtype=acc_dtype) + + for k in range(num_tiles_k): + a = ct.load( + A, index=(bidm, k), shape=(tm, tk), padding_mode=zero_pad + ) + b_T = ct.load( + B, index=(n, k), shape=(tn, tk), padding_mode=zero_pad + ) + accumulator = ct.mma(a, ct.transpose(b_T), accumulator) + + curr_max = ct.max(accumulator, axis=1) + curr_idx = ct.argmax(accumulator, axis=1) + + update = curr_max > best_dist + best_dist = ct.where(update, curr_max, best_dist) + best_idx = ct.where(update, n * tn + curr_idx, best_idx) + + ct.store(OutIdx, index=(bidm,), tile=best_idx) + ct.store(OutDist, index=(bidm,), tile=best_dist) + + return fused_1nn_kernel + + +KERNELS = { + "half": _make_kernel("half"), + "float": _make_kernel("float"), +} + +KERNEL_SYMBOLS = { + "half": "fused_1nn_half", + "float": "fused_1nn_float", +} + +TILE_CONSTANTS = (TILE_M, TILE_N, TILE_K) diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_planner.hpp b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_planner.hpp new file mode 100644 index 0000000000..dd2a539528 --- /dev/null +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_planner.hpp @@ -0,0 +1,60 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace cuvs::distance::detail { + +/** Must match KERNEL_SYMBOLS in fused_1nn_kernel.py (export uses with_symbol). */ +template +inline const char* fused_1nn_kernel_entrypoint() +{ + if constexpr (std::is_same_v) { + return "fused_1nn_half"; + } else if constexpr (std::is_same_v) { + return "fused_1nn_float"; + } else { + static_assert(sizeof(DataTag) == 0, "unsupported fused 1-NN cuTile data type"); + return ""; + } +} + +template +struct Fused1nnTilePlanner : TileAlgorithmPlanner { + inline static LauncherJitCache launcher_jit_cache{}; + + Fused1nnTilePlanner() + : TileAlgorithmPlanner(fused_1nn_kernel_entrypoint(), launcher_jit_cache) + { + } + + /** Registers embedded cubin modules (one per SM); see register_cubin.cpp object files. */ + void add_entrypoint() + { + using cuvs::detail::jit_lto::cutile_arch_12_0; + using cuvs::detail::jit_lto::cutile_arch_8_0; + using cuvs::detail::jit_lto::cutile_arch_8_6; + using cuvs::detail::jit_lto::cutile_arch_9_0; + + this->add_static_fragment>(); + this->add_static_fragment>(); + this->add_static_fragment>(); + this->add_static_fragment>(); + } + + void add_tileir_fallback() + { + this->add_static_tileir_fragment>(); + } +}; + +} // namespace cuvs::distance::detail diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.cu b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.cu new file mode 100644 index 0000000000..0ad4ee62a5 --- /dev/null +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.cu @@ -0,0 +1,173 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fused_1nn_tile.hpp" + +#include "fused_1nn_planner.hpp" + +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { + +namespace { + +constexpr int64_t TILE_M = 128; + +template +__global__ void pack_fused_1nn_kvp(OutT* out, const int64_t* idx, const float* dist, IdxT len) +{ + IdxT i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + out[i].key = static_cast(idx[i]); + out[i].value = static_cast(dist[i]); + } +} + +template +bool launch_fused_1nn_tile( + const DataT* x, const DataT* y, OutT* out, IdxT m, IdxT n, IdxT k, cudaStream_t stream) +{ + Fused1nnTilePlanner planner; + planner.add_entrypoint(); + planner.add_tileir_fallback(); + auto launcher = planner.try_get_launcher(); + if (!launcher) { return false; } + + int64_t* d_idx = nullptr; + float* d_dist = nullptr; + RAFT_CUDA_TRY(cudaMallocAsync(&d_idx, m * sizeof(int64_t), stream)); + RAFT_CUDA_TRY(cudaMallocAsync(&d_dist, m * sizeof(float), stream)); + + int64_t shape_x[2] = {m, k}; + int64_t stride_x[2] = {k, 1}; + int64_t shape_y[2] = {n, k}; + int64_t stride_y[2] = {k, 1}; + int64_t shape_idx = m; + int64_t stride_idx = 1; + int64_t shape_dist = m; + int64_t stride_dist = 1; + + int64_t M = m, N = n, K = k; + + void* x_ptr = const_cast(x); + void* y_ptr = const_cast(y); + void* idx_ptr = d_idx; + void* dist_ptr = d_dist; + + dim3 grid((m + TILE_M - 1) / TILE_M, 1, 1); + dim3 block(1, 1, 1); + + // cutile_python_v1 (see fused_1nn_float PTX): each 2D array is (ptr, shape0, shape1, + // stride0, stride1); each 1D array is (ptr, shape, stride); ConstantConstraint tile sizes + // are embedded in the module. + using fused_1nn_cutile_kernel_t = void(void*, + int64_t, + int64_t, + int64_t, + int64_t, + void*, + int64_t, + int64_t, + int64_t, + int64_t, + void*, + int64_t, + int64_t, + void*, + int64_t, + int64_t, + int64_t, + int64_t, + int64_t); + launcher->template dispatch(stream, + grid, + block, + 0, + x_ptr, + shape_x[0], + shape_x[1], + stride_x[0], + stride_x[1], + y_ptr, + shape_y[0], + shape_y[1], + stride_y[0], + stride_y[1], + idx_ptr, + shape_idx, + stride_idx, + dist_ptr, + shape_dist, + stride_dist, + M, + N, + K); + + pack_fused_1nn_kvp<<<(m + 255) / 256, 256, 0, stream>>>(out, d_idx, d_dist, m); + RAFT_CUDA_TRY(cudaGetLastError()); + RAFT_CUDA_TRY(cudaFreeAsync(d_idx, stream)); + RAFT_CUDA_TRY(cudaFreeAsync(d_dist, stream)); + return true; +} + +} // namespace + +template , int>> +bool try_fused_1nn_tile(OutT* min, + const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + cuvs::distance::DistanceType metric, + cudaStream_t stream) +{ + if (metric != cuvs::distance::DistanceType::InnerProduct) { return false; } + + if constexpr (std::is_same_v) { + return launch_fused_1nn_tile( + x, y, min, m, n, k, stream); + } else if constexpr (std::is_same_v) { + return launch_fused_1nn_tile( + x, y, min, m, n, k, stream); + } else { + return false; + } +} + +using kvp_i_f = raft::KeyValuePair; +using kvp_i64_f = raft::KeyValuePair; +using kvp_i_h = raft::KeyValuePair; +using kvp_i64_h = raft::KeyValuePair; + +#define CUVS_INST_TRY_FUSED_1NN_TILE(DataT, OutT, IdxT) \ + template CUVS_EXPORT bool try_fused_1nn_tile(OutT*, \ + const DataT*, \ + const DataT*, \ + IdxT, \ + IdxT, \ + IdxT, \ + cuvs::distance::DistanceType, \ + cudaStream_t) + +// int and int32_t are the same on LP64; one instantiation covers both. +CUVS_INST_TRY_FUSED_1NN_TILE(float, kvp_i_f, int); +CUVS_INST_TRY_FUSED_1NN_TILE(float, kvp_i64_f, int64_t); +CUVS_INST_TRY_FUSED_1NN_TILE(half, kvp_i_f, int); +CUVS_INST_TRY_FUSED_1NN_TILE(half, kvp_i64_f, int64_t); +CUVS_INST_TRY_FUSED_1NN_TILE(half, kvp_i_h, int); +CUVS_INST_TRY_FUSED_1NN_TILE(half, kvp_i64_h, int64_t); + +#undef CUVS_INST_TRY_FUSED_1NN_TILE + +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.hpp b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.hpp new file mode 100644 index 0000000000..d72a020ba7 --- /dev/null +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.hpp @@ -0,0 +1,50 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +#include +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { + +template +inline constexpr bool is_fused_1nn_kvp_output_v = + (std::is_same_v || std::is_same_v) && + (std::is_same_v> || + std::is_same_v>); + +template , int> = 0> +bool try_fused_1nn_tile(OutT* min, + const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + cuvs::distance::DistanceType metric, + cudaStream_t stream); + +template , int> = 0> +bool try_fused_1nn_tile( + OutT*, const DataT*, const DataT*, IdxT, IdxT, IdxT, cuvs::distance::DistanceType, cudaStream_t) +{ + return false; +} + +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/pairwise_matrix/jit_lto_kernels/pairwise_matrix_planner.hpp b/cpp/src/distance/detail/pairwise_matrix/jit_lto_kernels/pairwise_matrix_planner.hpp index 0d00b3eca6..f89a383596 100644 --- a/cpp/src/distance/detail/pairwise_matrix/jit_lto_kernels/pairwise_matrix_planner.hpp +++ b/cpp/src/distance/detail/pairwise_matrix/jit_lto_kernels/pairwise_matrix_planner.hpp @@ -20,7 +20,7 @@ template -struct PairwiseMatrixPlanner : AlgorithmPlanner { +struct PairwiseMatrixPlanner : LTOAlgorithmPlanner { using DistanceTag = DistanceTag_; using DataTag = DataTag_; using AccTag = AccTag_; @@ -33,7 +33,7 @@ struct PairwiseMatrixPlanner : AlgorithmPlanner { inline static LauncherJitCache launcher_jit_cache{}; - PairwiseMatrixPlanner() : AlgorithmPlanner(kPairwiseMatrixJitEntrypoint, launcher_jit_cache) {} + PairwiseMatrixPlanner() : LTOAlgorithmPlanner(kPairwiseMatrixJitEntrypoint, launcher_jit_cache) {} void add_entrypoint() { diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp index 0c3ed64d13..b44a7f044e 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp @@ -25,7 +25,7 @@ template -struct CagraPlannerBase : AlgorithmPlanner { +struct CagraPlannerBase : LTOAlgorithmPlanner { using DataTag = DataTag_; using IndexTag = IndexTag_; using DistanceTag = DistanceTag_; @@ -34,7 +34,7 @@ struct CagraPlannerBase : AlgorithmPlanner { using SampleFilterJitTag = SampleFilterJitTag_; explicit CagraPlannerBase(std::string entrypoint, LauncherJitCache& jit_cache) - : AlgorithmPlanner(std::move(entrypoint), jit_cache) + : LTOAlgorithmPlanner(std::move(entrypoint), jit_cache) { } diff --git a/cpp/src/neighbors/ivf_flat/detail/jit_lto_kernels/interleaved_scan_planner.hpp b/cpp/src/neighbors/ivf_flat/detail/jit_lto_kernels/interleaved_scan_planner.hpp index ed8191016b..7899d970ab 100644 --- a/cpp/src/neighbors/ivf_flat/detail/jit_lto_kernels/interleaved_scan_planner.hpp +++ b/cpp/src/neighbors/ivf_flat/detail/jit_lto_kernels/interleaved_scan_planner.hpp @@ -14,10 +14,10 @@ namespace cuvs::neighbors::ivf_flat::detail { -struct InterleavedScanPlanner : AlgorithmPlanner { +struct InterleavedScanPlanner : LTOAlgorithmPlanner { inline static LauncherJitCache launcher_jit_cache{}; - InterleavedScanPlanner() : AlgorithmPlanner("interleaved_scan", launcher_jit_cache) {} + InterleavedScanPlanner() : LTOAlgorithmPlanner("interleaved_scan", launcher_jit_cache) {} template void add_entrypoint() diff --git a/cpp/src/neighbors/ivf_pq/detail/jit_lto_kernels/compute_similarity_planner.hpp b/cpp/src/neighbors/ivf_pq/detail/jit_lto_kernels/compute_similarity_planner.hpp index 0621966cad..7152aaeebd 100644 --- a/cpp/src/neighbors/ivf_pq/detail/jit_lto_kernels/compute_similarity_planner.hpp +++ b/cpp/src/neighbors/ivf_pq/detail/jit_lto_kernels/compute_similarity_planner.hpp @@ -12,10 +12,10 @@ namespace cuvs::neighbors::ivf_pq::detail { -struct ComputeSimilarityPlanner : AlgorithmPlanner { +struct ComputeSimilarityPlanner : LTOAlgorithmPlanner { inline static LauncherJitCache launcher_jit_cache{}; - ComputeSimilarityPlanner() : AlgorithmPlanner("compute_similarity", launcher_jit_cache) {} + ComputeSimilarityPlanner() : LTOAlgorithmPlanner("compute_similarity", launcher_jit_cache) {} template void add_entrypoint() diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp index 05ea34532e..5dc47dc612 100644 --- a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp @@ -13,10 +13,10 @@ namespace cuvs::neighbors::ivf_sq::detail { -struct IvfSqScanPlanner : AlgorithmPlanner { +struct IvfSqScanPlanner : LTOAlgorithmPlanner { inline static LauncherJitCache launcher_jit_cache{}; - IvfSqScanPlanner() : AlgorithmPlanner("ivf_sq_scan", launcher_jit_cache) {} + IvfSqScanPlanner() : LTOAlgorithmPlanner("ivf_sq_scan", launcher_jit_cache) {} template void add_entrypoint() diff --git a/cpp/tests/neighbors/distance_nn.cu b/cpp/tests/neighbors/distance_nn.cu index f31f3ebacf..f5efaa5bec 100644 --- a/cpp/tests/neighbors/distance_nn.cu +++ b/cpp/tests/neighbors/distance_nn.cu @@ -187,6 +187,7 @@ const std::vector> input_fp32 = { {4096, 16384, 128, DistanceType::L2Expanded, true, uint64_t(31415926), 0.1}, {4096, 4096, 64, DistanceType::L2SqrtExpanded, false, uint64_t(31415926), 0.1}, {4096, 16384, 128, DistanceType::L2SqrtExpanded, false, uint64_t(31415926), 0.1}, + {512, 1024, 64, DistanceType::InnerProduct, false, uint64_t(31415926), 0.1}, {4096, 4096, 64, DistanceType::CosineExpanded, false, uint64_t(31415926), 0.1}, {8192, 4096, 64, DistanceType::CosineExpanded, false, uint64_t(31415926), 0.1}, // Fused implementation for cosine distance ignores the sqrt parameter, therefore diff --git a/cpp/tests/neighbors/distance_nn_helper.cuh b/cpp/tests/neighbors/distance_nn_helper.cuh index fda7b76573..ea440387b4 100644 --- a/cpp/tests/neighbors/distance_nn_helper.cuh +++ b/cpp/tests/neighbors/distance_nn_helper.cuh @@ -66,6 +66,16 @@ __device__ AccT cosine_distance(const DataT* v1, const DataT* v2, IdxT K) } // This is a naive implementation of 1-NN computation +template +__device__ AccT inner_product_score(const DataT* v1, const DataT* v2, IdxT K) +{ + AccT score = AccT(0.0); + for (IdxT i = 0; i < K; i++) { + score += AccT(v1[i]) * AccT(v2[i]); + } + return score; +} + template RAFT_KERNEL ref_nn_kernel( OutT* out, const DataT* A, const DataT* B, IdxT M, IdxT N, IdxT K, bool sqrt, DistanceType metric) @@ -73,22 +83,47 @@ RAFT_KERNEL ref_nn_kernel( IdxT tid = threadIdx.x + blockIdx.x * IdxT(blockDim.x); for (IdxT m = tid; m < M; m += (blockDim.x * gridDim.x)) { - IdxT min_index = N + 1; - AccT min_dist = max_val(); + IdxT best_index = N + 1; + AccT best_score = min_val(); + AccT best_dist = max_val(); for (IdxT n = 0; n < N; n++) { + if (metric == DistanceType::InnerProduct) { + AccT score = inner_product_score(&A[m * K], &B[n * K], K); + if (score > best_score) { + best_score = score; + best_index = n; + } + continue; + } + AccT dist; if (metric == DistanceType::L2SqrtExpanded || metric == DistanceType::L2Expanded) { dist = l2_distance(&A[m * K], &B[n * K], K); } else if (metric == DistanceType::CosineExpanded) { dist = cosine_distance(&A[m * K], &B[n * K], K); + } else { + continue; + } + if (dist < best_dist) { + best_dist = dist; + best_index = n; } - if (dist < min_dist) { - min_dist = dist; - min_index = n; + } + + if (metric == DistanceType::InnerProduct) { + if constexpr (std::is_fundamental::value) { + out[m] = AccT(best_score); + } else { + out[m].key = IdxT(best_index); + out[m].value = AccT(best_score); } + continue; } + IdxT min_index = best_index; + AccT min_dist = best_dist; + if constexpr (std::is_fundamental::value) { static_assert(std::is_same::value, "OutT and AccT are not same type"); out[m] = AccT(min_dist); diff --git a/dependencies.yaml b/dependencies.yaml index 744e4d9227..756041e60c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -395,6 +395,7 @@ dependencies: - cuda-nvrtc-dev - cuda-nvtx-dev - cuda-profiler-api + - cutile-python - libcublas-dev - libcurand-dev - libcusolver-dev @@ -430,12 +431,14 @@ dependencies: packages: - &ctk_cu13 cuda-toolkit[cublas,curand,cusolver,cusparse,nvrtc]==13.* - &nvjitlink_cu13 nvidia-nvjitlink>=13.0,<14 + - &cutile_cu13 cuda-tile[tileiras] # if no matching matrix selectors passed, list the CUDA 13 requirement # (just as a source of documentation, as this populates pyproject.toml in source control) - matrix: packages: - *ctk_cu13 - *nvjitlink_cu13 + - *cutile_cu13 depends_on_cudart: common: - output_types: conda diff --git a/python/libcuvs/pyproject.toml b/python/libcuvs/pyproject.toml index 5025daa66d..b4e848304f 100644 --- a/python/libcuvs/pyproject.toml +++ b/python/libcuvs/pyproject.toml @@ -19,6 +19,7 @@ authors = [ license = "Apache-2.0" requires-python = ">=3.11" dependencies = [ + "cuda-tile[tileiras]", "cuda-toolkit[cublas,curand,cusolver,cusparse,nvrtc]==13.*", "libraft==26.8.*,>=0.0.0a0", "librmm==26.8.*,>=0.0.0a0",