From 41b0713736eb5a8062cc9ab7979ecf9aa933f83e Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 17 Jun 2026 20:58:30 +0000 Subject: [PATCH 1/5] add example --- conda/recipes/libcuvs/recipe.yaml | 5 + cpp/tests/CMakeLists.txt | 2 + cpp/tests/cutile/CMakeLists.txt | 23 ++++ cpp/tests/cutile/cutile_vector_add.cu | 128 ++++++++++++++++++ cpp/tests/cutile/export_vector_add_cubin.py | 101 ++++++++++++++ cpp/tests/cutile/generate_cutile_cubins.cmake | 90 ++++++++++++ cpp/tests/cutile/vector_add_kernel.py | 17 +++ dependencies.yaml | 3 + 8 files changed, 369 insertions(+) create mode 100644 cpp/tests/cutile/CMakeLists.txt create mode 100644 cpp/tests/cutile/cutile_vector_add.cu create mode 100644 cpp/tests/cutile/export_vector_add_cubin.py create mode 100644 cpp/tests/cutile/generate_cutile_cubins.cmake create mode 100644 cpp/tests/cutile/vector_add_kernel.py diff --git a/conda/recipes/libcuvs/recipe.yaml b/conda/recipes/libcuvs/recipe.yaml index aa7a37db44..93f31f8cf2 100644 --- a/conda/recipes/libcuvs/recipe.yaml +++ b/conda/recipes/libcuvs/recipe.yaml @@ -80,6 +80,7 @@ cache: - cuda-cudart-dev - cuda-nvrtc-dev - cuda-profiler-api + - cutile-python - libcublas-dev - libcurand-dev - libcusolver-dev @@ -117,6 +118,7 @@ outputs: - cuda-cudart-dev - cuda-nvrtc-dev - cuda-profiler-api + - cutile-python - libcublas-dev - libcurand-dev - libcusolver-dev @@ -179,6 +181,7 @@ outputs: - cuda-cudart-dev - cuda-nvrtc-dev - cuda-profiler-api + - cutile-python - libcublas-dev - libcurand-dev - libcusolver-dev @@ -240,6 +243,7 @@ outputs: - cuda-cudart-dev - cuda-nvrtc-dev - cuda-profiler-api + - cutile-python - libcublas-dev - libcurand-dev - libcusolver-dev @@ -299,6 +303,7 @@ outputs: - openblas # required by some CPU algos in benchmarks - cuda-cudart-dev - cuda-profiler-api + - cutile-python - libcublas-dev - libcurand-dev - libcusolver-dev diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 9b96f94bf0..ba6ed6e0e7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -386,6 +386,8 @@ ConfigureTest( PERCENT 100 ) +add_subdirectory(cutile) + # ################################################################################################## # Install tests #################################################################################### # ################################################################################################## diff --git a/cpp/tests/cutile/CMakeLists.txt b/cpp/tests/cutile/CMakeLists.txt new file mode 100644 index 0000000000..989c8137d0 --- /dev/null +++ b/cpp/tests/cutile/CMakeLists.txt @@ -0,0 +1,23 @@ +# ============================================================================= +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on +# ============================================================================= + +include("${CMAKE_CURRENT_LIST_DIR}/generate_cutile_cubins.cmake") + +generate_cutile_vector_add_cubins(CUTILE_GENERATED_INCLUDE_DIR) + +ConfigureTest( + NAME CUTILE_VECTOR_ADD_TEST + PATH "${CMAKE_CURRENT_LIST_DIR}/cutile_vector_add.cu" + GPUS 1 + PERCENT 100 +) + +add_dependencies(CUTILE_VECTOR_ADD_TEST cutile_vector_add_cubins) + +target_include_directories( + CUTILE_VECTOR_ADD_TEST PRIVATE "${CUTILE_GENERATED_INCLUDE_DIR}" +) diff --git a/cpp/tests/cutile/cutile_vector_add.cu b/cpp/tests/cutile/cutile_vector_add.cu new file mode 100644 index 0000000000..77a5e51311 --- /dev/null +++ b/cpp/tests/cutile/cutile_vector_add.cu @@ -0,0 +1,128 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../test_utils.cuh" + +#include "vector_add_kernel_symbol.h" +#include "vector_add_sm_100_cubin.h" +#include "vector_add_sm_120_cubin.h" +#include "vector_add_sm_80_cubin.h" +#include "vector_add_sm_86_cubin.h" +#include "vector_add_sm_90_cubin.h" + +#include + +#include + +namespace cuvs { +namespace { + +struct EmbeddedCubin { + int cc_major; + int cc_minor; + const unsigned char* data; + size_t size; +}; + +// Lookup table for cubins built at configure time (see export_vector_add_cubin.py). +constexpr EmbeddedCubin kEmbeddedCubins[] = { + {8, 0, vector_add_sm_80_cubin, sizeof(vector_add_sm_80_cubin)}, + {8, 6, vector_add_sm_86_cubin, sizeof(vector_add_sm_86_cubin)}, + {9, 0, vector_add_sm_90_cubin, sizeof(vector_add_sm_90_cubin)}, + {10, 0, vector_add_sm_100_cubin, sizeof(vector_add_sm_100_cubin)}, + {12, 0, vector_add_sm_120_cubin, sizeof(vector_add_sm_120_cubin)}, +}; + +const EmbeddedCubin* find_embedded_cubin(int cc_major, int cc_minor) +{ + for (const auto& entry : kEmbeddedCubins) { + if (entry.cc_major == cc_major && entry.cc_minor == cc_minor) { return &entry; } + } + // Fall back to a cubin for the same major version (e.g. minor SKUs within a generation). + for (const auto& entry : kEmbeddedCubins) { + if (entry.cc_major == cc_major) { return &entry; } + } + return nullptr; +} + +class CutileVectorAddTest : public ::testing::Test { + protected: + void SetUp() override + { + int device = 0; + RAFT_CUDA_TRY(cudaGetDevice(&device)); + RAFT_CUDA_TRY( + cudaDeviceGetAttribute(&cc_major_, cudaDevAttrComputeCapabilityMajor, device)); + RAFT_CUDA_TRY( + cudaDeviceGetAttribute(&cc_minor_, cudaDevAttrComputeCapabilityMinor, device)); + } + + int cc_major_{}; + int cc_minor_{}; +}; + +} // namespace + +TEST_F(CutileVectorAddTest, EmbeddedCubinVectorAdd) +{ + const EmbeddedCubin* cubin = find_embedded_cubin(cc_major_, cc_minor_); + ASSERT_NE(cubin, nullptr) + << "No embedded cuTile cubin for compute capability " << cc_major_ << "." << cc_minor_; + + cudaLibrary_t library{}; + ASSERT_EQ(cudaSuccess, + cudaLibraryLoadData( + &library, cubin->data, nullptr, nullptr, 0, nullptr, nullptr, 0)) + << "cudaLibraryLoadData failed: " << cudaGetErrorString(cudaGetLastError()); + + cudaKernel_t kernel{}; + ASSERT_EQ(cudaSuccess, + cudaLibraryGetKernel(&kernel, library, CUTILE_VECTOR_ADD_KERNEL_SYMBOL)) + << "cudaLibraryGetKernel failed: " << cudaGetErrorString(cudaGetLastError()); + + constexpr int kN = 1024; + constexpr int kTile = 256; + constexpr int kGridDim = (kN + kTile - 1) / kTile; + + float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr; + RAFT_CUDA_TRY(cudaMalloc(&d_a, kN * sizeof(float))); + RAFT_CUDA_TRY(cudaMalloc(&d_b, kN * sizeof(float))); + RAFT_CUDA_TRY(cudaMalloc(&d_c, kN * sizeof(float))); + + std::vector h_a(kN), h_b(kN); + for (int i = 0; i < kN; ++i) { + h_a[i] = static_cast(i); + h_b[i] = static_cast(i * 2); + } + RAFT_CUDA_TRY(cudaMemcpy(d_a, h_a.data(), kN * sizeof(float), cudaMemcpyHostToDevice)); + RAFT_CUDA_TRY(cudaMemcpy(d_b, h_b.data(), kN * sizeof(float), cudaMemcpyHostToDevice)); + RAFT_CUDA_TRY(cudaMemset(d_c, 0, kN * sizeof(float))); + + int64_t shape = kN; + int64_t stride = 1; + void* kernel_args[] = { + &d_a, &shape, &stride, &d_b, &shape, &stride, &d_c, &shape, &stride, + }; + + dim3 grid(kGridDim); + dim3 block(1); + ASSERT_EQ(cudaSuccess, cudaLaunchKernel(kernel, grid, block, kernel_args, 0, 0)) + << "cudaLaunchKernel failed: " << cudaGetErrorString(cudaGetLastError()); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + + std::vector h_c(kN); + RAFT_CUDA_TRY(cudaMemcpy(h_c.data(), d_c, kN * sizeof(float), cudaMemcpyDeviceToHost)); + + for (int i = 0; i < kN; ++i) { + ASSERT_FLOAT_EQ(h_a[i] + h_b[i], h_c[i]) << "@" << i; + } + + RAFT_CUDA_TRY(cudaFree(d_a)); + RAFT_CUDA_TRY(cudaFree(d_b)); + RAFT_CUDA_TRY(cudaFree(d_c)); + RAFT_CUDA_TRY(cudaLibraryUnload(library)); +} + +} // namespace cuvs diff --git a/cpp/tests/cutile/export_vector_add_cubin.py b/cpp/tests/cutile/export_vector_add_cubin.py new file mode 100644 index 0000000000..bf40a4ad80 --- /dev/null +++ b/cpp/tests/cutile/export_vector_add_cubin.py @@ -0,0 +1,101 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +"""Export the cuTile vector-add kernel to a cubin for a single GPU target.""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +import cuda.tile as ct +from cuda.tile.compilation import ( + ArrayConstraint, + CallingConvention, + ConstantConstraint, + KernelSignature, + export_kernel, +) + +from vector_add_kernel import TILE_SIZE, vector_add + +# cuTile / tileiras gpu_code values used at build time. These correspond to the +# cuvs library CUDA 13 real targets as follows (tileiras has no sm_*a/sm_*f names): +# sm_80 -> 80-real +# sm_86 -> 86-real +# sm_90 -> 90a-real +# sm_100 -> 100f-real +# sm_120 -> 120a-real +SUPPORTED_GPU_CODES = ("sm_80", "sm_86", "sm_90", "sm_100", "sm_120") + + +def _kernel_signature() -> KernelSignature: + array = ArrayConstraint( + ct.float32, + 1, + index_dtype=ct.int64, + stride_lower_bound_incl=0, + alias_groups=(), + may_alias_internally=False, + stride_constant=(1,), + ) + return KernelSignature( + parameters=[array, array, array, ConstantConstraint(TILE_SIZE)], + calling_convention=CallingConvention.cutile_python_v1(), + ).with_mangled_symbol("vector_add") + + +def export_cubin(output_file: Path, gpu_code: str, symbol_header: Path | None) -> str: + if gpu_code not in SUPPORTED_GPU_CODES: + raise ValueError( + f"Unsupported gpu_code {gpu_code!r}; expected one of {SUPPORTED_GPU_CODES}" + ) + + signature = _kernel_signature() + export_kernel( + vector_add, + signatures=[signature], + output_file=str(output_file), + gpu_code=gpu_code, + output_format="cubin", + ) + + if symbol_header is not None: + symbol_header.write_text( + "\n".join( + [ + "// Generated by export_vector_add_cubin.py; do not edit.", + "#pragma once", + f'#define CUTILE_VECTOR_ADD_KERNEL_SYMBOL "{signature.symbol}"', + "", + ] + ) + ) + + return signature.symbol + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("output_file", type=Path, help="Output cubin path") + parser.add_argument( + "--gpu-code", + required=True, + choices=SUPPORTED_GPU_CODES, + help="tileiras / export_kernel target (e.g. sm_120)", + ) + parser.add_argument( + "--symbol-header", + type=Path, + default=None, + help="Optional header that defines CUTILE_VECTOR_ADD_KERNEL_SYMBOL", + ) + args = parser.parse_args() + + symbol = export_cubin(args.output_file, args.gpu_code, args.symbol_header) + print(symbol) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/cpp/tests/cutile/generate_cutile_cubins.cmake b/cpp/tests/cutile/generate_cutile_cubins.cmake new file mode 100644 index 0000000000..3425b03028 --- /dev/null +++ b/cpp/tests/cutile/generate_cutile_cubins.cmake @@ -0,0 +1,90 @@ +# ============================================================================= +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on +# ============================================================================= + +include_guard(GLOBAL) + +# Build-time cuTile cubin targets. Maps to cuvs CUDA 13 -real library arches (75-real omitted). +set(CUTILE_VECTOR_ADD_GPU_CODES sm_80 sm_86 sm_90 sm_100 sm_120) + +function(generate_cutile_vector_add_cubins output_include_dir_var) + find_package(Python3 REQUIRED COMPONENTS Interpreter) + find_package(CUDAToolkit REQUIRED) + + find_program( + CUTILE_BIN2C + NAMES bin2c + PATHS ${CUDAToolkit_BIN_DIR} + REQUIRED + ) + + execute_process( + COMMAND "${Python3_EXECUTABLE}" -c "import cuda.tile" + RESULT_VARIABLE _cutile_import_result + OUTPUT_QUIET + ERROR_QUIET + ) + if(NOT _cutile_import_result EQUAL 0) + message( + FATAL_ERROR + "cuda.tile (cuTile Python) is required to build CUTILE_VECTOR_ADD_TEST. " + "Install it in the active Python environment, e.g. pip install cuda-tile[tileiras]." + ) + endif() + + set(_cutile_source_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}") + set(_cutile_binary_dir "${CMAKE_CURRENT_BINARY_DIR}/cutile_generated") + file(MAKE_DIRECTORY "${_cutile_binary_dir}") + + set(_symbol_header "${_cutile_binary_dir}/vector_add_kernel_symbol.h") + set(_first_gpu_code TRUE) + + foreach(_gpu_code IN LISTS CUTILE_VECTOR_ADD_GPU_CODES) + set(_cubin_file "${_cutile_binary_dir}/vector_add_${_gpu_code}.cubin") + set(_cubin_header "${_cutile_binary_dir}/vector_add_${_gpu_code}_cubin.h") + + if(_first_gpu_code) + set(_symbol_arg --symbol-header "${_symbol_header}") + set(_cubin_outputs "${_cubin_file}" "${_symbol_header}") + set(_first_gpu_code FALSE) + else() + set(_symbol_arg) + set(_cubin_outputs "${_cubin_file}") + endif() + + add_custom_command( + OUTPUT ${_cubin_outputs} + COMMAND + "${Python3_EXECUTABLE}" "${_cutile_source_dir}/export_vector_add_cubin.py" + "${_cubin_file}" --gpu-code "${_gpu_code}" ${_symbol_arg} + DEPENDS "${_cutile_source_dir}/export_vector_add_cubin.py" + "${_cutile_source_dir}/vector_add_kernel.py" + COMMENT "Exporting cuTile vector_add cubin for ${_gpu_code}" + VERBATIM + ) + + add_custom_command( + OUTPUT "${_cubin_header}" + COMMAND "${CUTILE_BIN2C}" --const --name "vector_add_${_gpu_code}_cubin" --static + "${_cubin_file}" > "${_cubin_header}" + DEPENDS "${_cubin_file}" + COMMENT "Embedding vector_add ${_gpu_code} cubin via bin2c" + VERBATIM + ) + + list(APPEND _generated_headers "${_cubin_header}") + endforeach() + + add_custom_target( + cutile_vector_add_cubins + DEPENDS "${_symbol_header}" ${_generated_headers} + ) + + set(${output_include_dir_var} + "${_cutile_binary_dir}" + PARENT_SCOPE + ) +endfunction() diff --git a/cpp/tests/cutile/vector_add_kernel.py b/cpp/tests/cutile/vector_add_kernel.py new file mode 100644 index 0000000000..46b7a607c6 --- /dev/null +++ b/cpp/tests/cutile/vector_add_kernel.py @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +"""cuTile Python vector-add kernel used by the embedded-cubin example test.""" + +from __future__ import annotations + +import cuda.tile as ct + +TILE_SIZE = 256 + + +@ct.kernel +def vector_add(a, b, c, TILE_SIZE: ct.Constant): + bid = ct.bid(0) + ta = ct.load(a, bid, TILE_SIZE) + tb = ct.load(b, bid, TILE_SIZE) + ct.store(c, bid, ta + tb) diff --git a/dependencies.yaml b/dependencies.yaml index 744e4d9227..756041e60c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -395,6 +395,7 @@ dependencies: - cuda-nvrtc-dev - cuda-nvtx-dev - cuda-profiler-api + - cutile-python - libcublas-dev - libcurand-dev - libcusolver-dev @@ -430,12 +431,14 @@ dependencies: packages: - &ctk_cu13 cuda-toolkit[cublas,curand,cusolver,cusparse,nvrtc]==13.* - &nvjitlink_cu13 nvidia-nvjitlink>=13.0,<14 + - &cutile_cu13 cuda-tile[tileiras] # if no matching matrix selectors passed, list the CUDA 13 requirement # (just as a source of documentation, as this populates pyproject.toml in source control) - matrix: packages: - *ctk_cu13 - *nvjitlink_cu13 + - *cutile_cu13 depends_on_cudart: common: - output_types: conda From b10c02ca5a5ef094ca892c6d32f6f14d6d63447f Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 24 Jun 2026 16:34:11 +0000 Subject: [PATCH 2/5] initial integration --- cpp/CMakeLists.txt | 50 ++- .../modules/generate_cutile_kernels.cmake | 315 ++++++++++++++++++ cpp/cmake/modules/register_cubin.cpp.in | 22 ++ cpp/cmake/modules/register_tileir.cpp.in | 22 ++ .../cuvs/detail/jit_lto/AlgorithmPlanner.hpp | 63 +++- .../cuvs/detail/jit_lto/FragmentEntry.hpp | 63 ++++ .../cuvs/detail/jit_lto/cutile_arch_tags.hpp | 52 +++ .../cuvs/detail/jit_lto/cutile_module.hpp | 75 +++++ .../fused_distance_nn/fused_1nn_fragments.hpp | 21 ++ .../cuvs/detail/jit_lto/tileir_compat.hpp | 99 ++++++ cpp/src/detail/jit_lto/AlgorithmPlanner.cpp | 103 ++---- .../detail/jit_lto/LTOAlgorithmPlanner.cpp | 76 +++++ .../detail/jit_lto/TileAlgorithmPlanner.cpp | 38 +++ cpp/src/distance/detail/fused_distance_nn.cuh | 15 + .../cutile/export_fused_1nn.py | 136 ++++++++ .../cutile/fused_1nn_cutile_cubin_matrix.json | 40 +++ .../fused_1nn_cutile_tileir_matrix.json | 20 ++ .../cutile/fused_1nn_kernel.py | 68 ++++ .../cutile/fused_1nn_planner.hpp | 60 ++++ .../cutile/fused_1nn_tile.cu | 173 ++++++++++ .../cutile/fused_1nn_tile.hpp | 55 +++ .../pairwise_matrix_planner.hpp | 4 +- .../jit_lto_kernels/cagra_planner_base.hpp | 4 +- .../interleaved_scan_planner.hpp | 4 +- .../compute_similarity_planner.hpp | 4 +- .../detail/jit_lto_kernels/scan_planner.hpp | 4 +- cpp/tests/CMakeLists.txt | 3 +- cpp/tests/cutile/cutile_vector_add.cu | 176 ++++++++-- cpp/tests/cutile/export_vector_add_cubin.py | 58 +++- cpp/tests/cutile/generate_cutile_cubins.cmake | 27 ++ cpp/tests/neighbors/distance_nn.cu | 1 + cpp/tests/neighbors/distance_nn_helper.cuh | 45 ++- 32 files changed, 1743 insertions(+), 153 deletions(-) create mode 100644 cpp/cmake/modules/generate_cutile_kernels.cmake create mode 100644 cpp/cmake/modules/register_cubin.cpp.in create mode 100644 cpp/cmake/modules/register_tileir.cpp.in create mode 100644 cpp/include/cuvs/detail/jit_lto/cutile_arch_tags.hpp create mode 100644 cpp/include/cuvs/detail/jit_lto/cutile_module.hpp create mode 100644 cpp/include/cuvs/detail/jit_lto/fused_distance_nn/fused_1nn_fragments.hpp create mode 100644 cpp/include/cuvs/detail/jit_lto/tileir_compat.hpp create mode 100644 cpp/src/detail/jit_lto/LTOAlgorithmPlanner.cpp create mode 100644 cpp/src/detail/jit_lto/TileAlgorithmPlanner.cpp create mode 100644 cpp/src/distance/detail/fused_distance_nn/cutile/export_fused_1nn.py create mode 100644 cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_cubin_matrix.json create mode 100644 cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_tileir_matrix.json create mode 100644 cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_kernel.py create mode 100644 cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_planner.hpp create mode 100644 cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.cu create mode 100644 cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 227c2906cc..cc6e1975b3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -957,6 +957,47 @@ if(NOT BUILD_CPU_ONLY) OUTPUT_FILE_FORMAT "${CMAKE_CURRENT_BINARY_DIR}/src/distance/detail/pairwise_matrix/dispatch_rbf_inst_data_@data_abbrev@_acc_@acc_abbrev@_out_@out_abbrev@_index_@index_abbrev@_op_@op_abbrev@.cu" ) + + include(cmake/modules/generate_cutile_kernels.cmake) + set(fused_1nn_cutile_dir + "${CMAKE_CURRENT_SOURCE_DIR}/src/distance/detail/fused_distance_nn/cutile") + set(cutile_fused_1nn_generated_dir + "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/distance/fused_1nn/cutile") + generate_cutile_cubin_kernels( + cutile_fused_1nn_files + KERNEL_DIR "${fused_1nn_cutile_dir}" + KERNEL_BASENAME "fused_1nn" + KERNEL_PYTHON "fused_1nn_kernel.py" + EXPORT_SCRIPT "export_fused_1nn.py" + OUTPUT_DIRECTORY "${cutile_fused_1nn_generated_dir}" + MATRIX_JSON_FILE "${fused_1nn_cutile_dir}/fused_1nn_cutile_cubin_matrix.json" + FRAGMENT_TAG_FORMAT + "cuvs::distance::detail::fragment_tag_fused_1nn_cubin" + FRAGMENT_TAG_HEADER_FILES + "" + "" + "" + ) + generate_cutile_tileir_kernels( + cutile_fused_1nn_files + KERNEL_DIR "${fused_1nn_cutile_dir}" + KERNEL_BASENAME "fused_1nn" + KERNEL_PYTHON "fused_1nn_kernel.py" + EXPORT_SCRIPT "export_fused_1nn.py" + OUTPUT_DIRECTORY "${cutile_fused_1nn_generated_dir}" + MATRIX_JSON_FILE "${fused_1nn_cutile_dir}/fused_1nn_cutile_tileir_matrix.json" + FRAGMENT_TAG_FORMAT + "cuvs::distance::detail::fragment_tag_fused_1nn_tileir" + FRAGMENT_TAG_HEADER_FILES + "" + "" + ) + if(NOT DEFINED CUVS_CUTILE_ENABLED) + set(CUVS_CUTILE_ENABLED 0) + endif() + target_compile_definitions( + cuvs_cpp_headers INTERFACE CUVS_CUTILE_ENABLED=${CUVS_CUTILE_ENABLED} + ) generate_inst_matrix( cagra_build_inst_files MATRIX_JSON_FILE "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/cagra_build_matrix.json" @@ -1147,6 +1188,8 @@ if(NOT BUILD_CPU_ONLY) src/util/host_memory.cpp src/detail/jit_lto/AlgorithmLauncher.cpp src/detail/jit_lto/AlgorithmPlanner.cpp + src/detail/jit_lto/LTOAlgorithmPlanner.cpp + src/detail/jit_lto/TileAlgorithmPlanner.cpp src/detail/jit_lto/FragmentEntry.cpp src/detail/jit_lto/nvjitlink_checker.cpp src/detail/jit_lto/NVRTCLTOFragmentCompiler.cpp @@ -1234,6 +1277,8 @@ if(NOT BUILD_CPU_ONLY) src/stats/trustworthiness_score.cu ${CUVS_MG_ALGOS} ${jit_lto_files} + ${cutile_fused_1nn_files} + $<$:src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.cu> ) set_target_properties( @@ -1257,6 +1302,7 @@ if(NOT BUILD_CPU_ONLY) target_compile_definitions( cuvs_objs PRIVATE $<$:CUVS_BUILD_CAGRA_HNSWLIB> $<$:NVTX_ENABLED> + CUVS_CUTILE_ENABLED=${CUVS_CUTILE_ENABLED} ) target_link_libraries( @@ -1274,7 +1320,9 @@ if(NOT BUILD_CPU_ONLY) PUBLIC "$" "$" INTERFACE "$" - PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src" "${CMAKE_CURRENT_BINARY_DIR}/src" + PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src" + "${CMAKE_CURRENT_BINARY_DIR}/src" + "${cutile_fused_1nn_generated_dir}" ) # Endian detection diff --git a/cpp/cmake/modules/generate_cutile_kernels.cmake b/cpp/cmake/modules/generate_cutile_kernels.cmake new file mode 100644 index 0000000000..7b9c2521c4 --- /dev/null +++ b/cpp/cmake/modules/generate_cutile_kernels.cmake @@ -0,0 +1,315 @@ +# ============================================================================= +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on +# ============================================================================= + +include_guard(GLOBAL) + +include(${CMAKE_CURRENT_LIST_DIR}/compute_matrix_product.cmake) + +function(generate_cutile_kernels_stub) + set(CUVS_CUTILE_ENABLED 0 PARENT_SCOPE) +endfunction() + +function(_cutile_fragment_tag_header_files output_var) + set(${output_var} "") + foreach(_header IN LISTS ARGN) + if(NOT _header MATCHES "^(\".*\"|<.*>)$") + set(_header "\"${_header}\"") + endif() + string(APPEND ${output_var} "#include ${_header}\n") + endforeach() + set(${output_var} + "${${output_var}}" + PARENT_SCOPE + ) +endfunction() + +function(_cutile_kernels_setup) + set(options) + set(one_value MATRIX_JSON_FILE OUTPUT_DIRECTORY) + set(multi_value) + cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN}) + + find_package(Python3 REQUIRED COMPONENTS Interpreter) + find_package(CUDAToolkit REQUIRED) + + if(CUDAToolkit_VERSION VERSION_LESS 13.0) + message( + STATUS + "cuTile embedded kernels require CUDA 13.0+; skipping cuTile generation (found ${CUDAToolkit_VERSION})." + ) + set(_CUTILE_SETUP_OK + FALSE + PARENT_SCOPE + ) + return() + endif() + + find_program( + CUTILE_BIN2C + NAMES bin2c + PATHS ${CUDAToolkit_BIN_DIR} + REQUIRED + ) + + execute_process( + COMMAND "${Python3_EXECUTABLE}" -c "import cuda.tile" + RESULT_VARIABLE _cutile_import_result + OUTPUT_QUIET + ERROR_QUIET + ) + if(NOT _cutile_import_result EQUAL 0) + message( + FATAL_ERROR + "cuda.tile (cuTile Python) is required to build cuTile embedded kernels. " + "Install it in the active Python environment, e.g. pip install cuda-tile[tileiras]." + ) + endif() + + set_property( + DIRECTORY + PROPERTY CMAKE_CONFIGURE_DEPENDS "${_CUTILE_MATRIX_JSON_FILE}" + APPEND + ) + + file(MAKE_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}") + + set(_CUTILE_SETUP_OK + TRUE + PARENT_SCOPE + ) +endfunction() + +function(process_cutile_cubin_matrix_entry source_list_var) + set(options) + set(one_value + KERNEL_DIR + KERNEL_BASENAME + KERNEL_PYTHON + EXPORT_SCRIPT + OUTPUT_DIRECTORY + FRAGMENT_TAG_FORMAT + MATRIX_JSON_ENTRY + ) + set(multi_value FRAGMENT_TAG_HEADER_FILES) + cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN}) + + populate_matrix_variables("${_CUTILE_MATRIX_JSON_ENTRY}") + _cutile_fragment_tag_header_files( + fragment_tag_header_files ${_CUTILE_FRAGMENT_TAG_HEADER_FILES} + ) + + string(CONFIGURE "${_CUTILE_FRAGMENT_TAG_FORMAT}" fragment_tag @ONLY) + + set(_artifact_basename "${_CUTILE_KERNEL_BASENAME}_${data_type}_${gpu_code}") + set(_cubin_file "${_CUTILE_OUTPUT_DIRECTORY}/${_artifact_basename}.cubin") + set(_cubin_header "${_CUTILE_OUTPUT_DIRECTORY}/${_artifact_basename}_cubin.h") + set(_cubin_cpp "${_CUTILE_OUTPUT_DIRECTORY}/${_artifact_basename}_cubin.cpp") + set(cubin_header_file "${_artifact_basename}_cubin.h") + + add_custom_command( + OUTPUT "${_cubin_file}" + COMMAND + "${Python3_EXECUTABLE}" "${_CUTILE_KERNEL_DIR}/${_CUTILE_EXPORT_SCRIPT}" "${_cubin_file}" + --format cubin --data-type "${data_type}" --gpu-code "${gpu_code}" + DEPENDS "${_CUTILE_KERNEL_DIR}/${_CUTILE_EXPORT_SCRIPT}" + "${_CUTILE_KERNEL_DIR}/${_CUTILE_KERNEL_PYTHON}" + COMMENT "Exporting cuTile ${_CUTILE_KERNEL_BASENAME} cubin ${data_type} ${gpu_code}" + VERBATIM + ) + + add_custom_command( + OUTPUT "${_cubin_header}" + COMMAND "${CUTILE_BIN2C}" --const --name embedded_cubin --static "${_cubin_file}" + > "${_cubin_header}" + DEPENDS "${_cubin_file}" + VERBATIM + ) + + configure_file( + "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/register_cubin.cpp.in" "${_cubin_cpp}" @ONLY + ) + list(APPEND ${source_list_var} "${_cubin_header}" "${_cubin_cpp}") + set(${source_list_var} + "${${source_list_var}}" + PARENT_SCOPE + ) +endfunction() + +function(process_cutile_tileir_matrix_entry source_list_var) + set(options) + set(one_value + KERNEL_DIR + KERNEL_BASENAME + KERNEL_PYTHON + EXPORT_SCRIPT + OUTPUT_DIRECTORY + FRAGMENT_TAG_FORMAT + MATRIX_JSON_ENTRY + ) + set(multi_value FRAGMENT_TAG_HEADER_FILES) + cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN}) + + populate_matrix_variables("${_CUTILE_MATRIX_JSON_ENTRY}") + _cutile_fragment_tag_header_files( + fragment_tag_header_files ${_CUTILE_FRAGMENT_TAG_HEADER_FILES} + ) + + string(CONFIGURE "${_CUTILE_FRAGMENT_TAG_FORMAT}" fragment_tag @ONLY) + set(_tileir_file "${_CUTILE_OUTPUT_DIRECTORY}/${_CUTILE_KERNEL_BASENAME}_${data_type}.tilebc") + set(_tileir_header "${_CUTILE_OUTPUT_DIRECTORY}/${_CUTILE_KERNEL_BASENAME}_${data_type}_tileir.h") + set(_tileir_cpp "${_CUTILE_OUTPUT_DIRECTORY}/${_CUTILE_KERNEL_BASENAME}_${data_type}_tileir.cpp") + set(tileir_header_file "${_CUTILE_KERNEL_BASENAME}_${data_type}_tileir.h") + + add_custom_command( + OUTPUT "${_tileir_file}" + COMMAND + "${Python3_EXECUTABLE}" "${_CUTILE_KERNEL_DIR}/${_CUTILE_EXPORT_SCRIPT}" "${_tileir_file}" + --format tileir_bytecode --data-type "${data_type}" --gpu-code "${export_gpu_code}" + --bytecode-version "${bytecode_version}" + DEPENDS "${_CUTILE_KERNEL_DIR}/${_CUTILE_EXPORT_SCRIPT}" + "${_CUTILE_KERNEL_DIR}/${_CUTILE_KERNEL_PYTHON}" + COMMENT "Exporting cuTile ${_CUTILE_KERNEL_BASENAME} TileIR bytecode ${data_type}" + VERBATIM + ) + + add_custom_command( + OUTPUT "${_tileir_header}" + COMMAND "${CUTILE_BIN2C}" --const --name embedded_tileir --static "${_tileir_file}" + > "${_tileir_header}" + DEPENDS "${_tileir_file}" + VERBATIM + ) + + configure_file( + "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/register_tileir.cpp.in" "${_tileir_cpp}" @ONLY + ) + list(APPEND ${source_list_var} "${_tileir_header}" "${_tileir_cpp}") + set(${source_list_var} + "${${source_list_var}}" + PARENT_SCOPE + ) +endfunction() + +function(generate_cutile_cubin_kernels source_list_var) + set(options) + set(one_value + KERNEL_DIR + KERNEL_BASENAME + KERNEL_PYTHON + EXPORT_SCRIPT + OUTPUT_DIRECTORY + MATRIX_JSON_FILE + FRAGMENT_TAG_FORMAT + ) + set(multi_value FRAGMENT_TAG_HEADER_FILES) + cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN}) + + if(NOT _CUTILE_KERNEL_BASENAME) + message(FATAL_ERROR "generate_cutile_cubin_kernels: KERNEL_BASENAME is required") + endif() + if(NOT _CUTILE_KERNEL_PYTHON) + set(_CUTILE_KERNEL_PYTHON "fused_1nn_kernel.py") + endif() + + _cutile_kernels_setup( + MATRIX_JSON_FILE "${_CUTILE_MATRIX_JSON_FILE}" + OUTPUT_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}" + ) + if(NOT _CUTILE_SETUP_OK) + generate_cutile_kernels_stub() + set(${source_list_var} + "" + PARENT_SCOPE + ) + return() + endif() + + compute_matrix_product(matrix_product MATRIX_JSON_FILE "${_CUTILE_MATRIX_JSON_FILE}") + + string(JSON len LENGTH "${matrix_product}") + math(EXPR last "${len} - 1") + + # cmake-lint: disable=C0103,E1120 + foreach(i RANGE "${last}") + string(JSON matrix_json_entry GET "${matrix_product}" "${i}") + process_cutile_cubin_matrix_entry( + "${source_list_var}" + KERNEL_DIR "${_CUTILE_KERNEL_DIR}" + KERNEL_BASENAME "${_CUTILE_KERNEL_BASENAME}" + KERNEL_PYTHON "${_CUTILE_KERNEL_PYTHON}" + EXPORT_SCRIPT "${_CUTILE_EXPORT_SCRIPT}" + OUTPUT_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}" + FRAGMENT_TAG_FORMAT "${_CUTILE_FRAGMENT_TAG_FORMAT}" + FRAGMENT_TAG_HEADER_FILES ${_CUTILE_FRAGMENT_TAG_HEADER_FILES} + MATRIX_JSON_ENTRY "${matrix_json_entry}" + ) + endforeach() + + set(CUVS_CUTILE_ENABLED 1 PARENT_SCOPE) + set(${source_list_var} + "${${source_list_var}}" + PARENT_SCOPE + ) +endfunction() + +function(generate_cutile_tileir_kernels source_list_var) + set(options) + set(one_value + KERNEL_DIR + KERNEL_BASENAME + KERNEL_PYTHON + EXPORT_SCRIPT + OUTPUT_DIRECTORY + MATRIX_JSON_FILE + FRAGMENT_TAG_FORMAT + ) + set(multi_value FRAGMENT_TAG_HEADER_FILES) + cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN}) + + if(NOT _CUTILE_KERNEL_BASENAME) + message(FATAL_ERROR "generate_cutile_tileir_kernels: KERNEL_BASENAME is required") + endif() + if(NOT _CUTILE_KERNEL_PYTHON) + set(_CUTILE_KERNEL_PYTHON "fused_1nn_kernel.py") + endif() + + _cutile_kernels_setup( + MATRIX_JSON_FILE "${_CUTILE_MATRIX_JSON_FILE}" + OUTPUT_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}" + ) + if(NOT _CUTILE_SETUP_OK) + generate_cutile_kernels_stub() + return() + endif() + + compute_matrix_product(matrix_product MATRIX_JSON_FILE "${_CUTILE_MATRIX_JSON_FILE}") + + string(JSON len LENGTH "${matrix_product}") + math(EXPR last "${len} - 1") + + # cmake-lint: disable=C0103,E1120 + foreach(i RANGE "${last}") + string(JSON matrix_json_entry GET "${matrix_product}" "${i}") + process_cutile_tileir_matrix_entry( + "${source_list_var}" + KERNEL_DIR "${_CUTILE_KERNEL_DIR}" + KERNEL_BASENAME "${_CUTILE_KERNEL_BASENAME}" + KERNEL_PYTHON "${_CUTILE_KERNEL_PYTHON}" + EXPORT_SCRIPT "${_CUTILE_EXPORT_SCRIPT}" + OUTPUT_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}" + FRAGMENT_TAG_FORMAT "${_CUTILE_FRAGMENT_TAG_FORMAT}" + FRAGMENT_TAG_HEADER_FILES ${_CUTILE_FRAGMENT_TAG_HEADER_FILES} + MATRIX_JSON_ENTRY "${matrix_json_entry}" + ) + endforeach() + + set(CUVS_CUTILE_ENABLED 1 PARENT_SCOPE) + set(${source_list_var} + "${${source_list_var}}" + PARENT_SCOPE + ) +endfunction() diff --git a/cpp/cmake/modules/register_cubin.cpp.in b/cpp/cmake/modules/register_cubin.cpp.in new file mode 100644 index 0000000000..c27d6829ee --- /dev/null +++ b/cpp/cmake/modules/register_cubin.cpp.in @@ -0,0 +1,22 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "@cubin_header_file@" +#include + +@fragment_tag_header_files@ + +namespace { + +using fragment_tag = @fragment_tag@; +using fragment_entry = StaticCubinFragmentEntry; + +} // namespace + +template <> +const uint8_t* const fragment_entry::data = embedded_cubin; + +template <> +const size_t fragment_entry::length = sizeof(embedded_cubin); diff --git a/cpp/cmake/modules/register_tileir.cpp.in b/cpp/cmake/modules/register_tileir.cpp.in new file mode 100644 index 0000000000..fb81acedbc --- /dev/null +++ b/cpp/cmake/modules/register_tileir.cpp.in @@ -0,0 +1,22 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "@tileir_header_file@" +#include + +@fragment_tag_header_files@ + +namespace { + +using fragment_tag = @fragment_tag@; +using fragment_entry = StaticTileIrBytecodeFragmentEntry; + +} // namespace + +template <> +const uint8_t* const fragment_entry::data = embedded_tileir; + +template <> +const size_t fragment_entry::length = sizeof(embedded_tileir); diff --git a/cpp/include/cuvs/detail/jit_lto/AlgorithmPlanner.hpp b/cpp/include/cuvs/detail/jit_lto/AlgorithmPlanner.hpp index 7f275b1285..d727c73b9d 100644 --- a/cpp/include/cuvs/detail/jit_lto/AlgorithmPlanner.hpp +++ b/cpp/include/cuvs/detail/jit_lto/AlgorithmPlanner.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,7 @@ struct LauncherJitCache { std::shared_mutex mutex; std::unordered_map> launchers; + std::unordered_set build_failed; }; struct AlgorithmPlanner { @@ -27,9 +29,32 @@ struct AlgorithmPlanner { { } + virtual ~AlgorithmPlanner() = default; + std::shared_ptr get_launcher(); + /** Returns nullptr when no module can be loaded for the current device (does not RAFT_FAIL). */ + std::shared_ptr try_get_launcher(); + std::string entrypoint; + + protected: + virtual std::shared_ptr build() = 0; + + virtual std::string get_planner_key() const = 0; + + std::shared_ptr read_cache(std::string const& launch_key) const; + + LauncherJitCache& jit_cache_; +}; + +/** Links embedded LTO fatbin fragments at runtime via nvJitLink. */ +struct LTOAlgorithmPlanner : AlgorithmPlanner { + LTOAlgorithmPlanner(std::string entrypoint, LauncherJitCache& jit_cache) + : AlgorithmPlanner(std::move(entrypoint), jit_cache) + { + } + std::vector> fragments; template >> @@ -45,16 +70,38 @@ struct AlgorithmPlanner { } protected: - /** Extra link-time option strings passed to nvJitLink. Base build() - * always passes "-lto" and "-arch=sm_XX" first; derived planners may append here in their - * constructor body. */ + /** Extra link-time option strings passed to nvJitLink. */ std::vector linktime_extra_options; - private: - std::string get_fragments_key() const; - std::shared_ptr build(); + std::string get_planner_key() const override; - std::shared_ptr read_cache(std::string const& launch_key) const; + std::shared_ptr build() override; +}; - LauncherJitCache& jit_cache_; +/** Loads prebuilt cubins or TileIR bytecode via cudaLibraryLoadData. */ +struct TileAlgorithmPlanner : AlgorithmPlanner { + TileAlgorithmPlanner(std::string entrypoint, LauncherJitCache& jit_cache) + : AlgorithmPlanner(std::move(entrypoint), jit_cache) + { + } + + template + void add_static_fragment() + { + cubin_fragments_.push_back(std::make_unique>()); + } + + template + void add_static_tileir_fragment() + { + tileir_fragment_ = std::make_unique>(); + } + + protected: + std::vector> cubin_fragments_; + std::unique_ptr tileir_fragment_; + + std::string get_planner_key() const override; + + std::shared_ptr build() override; }; diff --git a/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp b/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp index 35aa46633c..df69ec1d7b 100644 --- a/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp +++ b/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp @@ -62,3 +62,66 @@ struct UDFFatbinFragment final : FatbinFragmentEntry { std::string key_; std::vector bytes_; }; + +/** Embedded CUDA binary module (cubin), loaded directly via cudaLibraryLoadData. */ +struct CubinFragmentEntry { + virtual ~CubinFragmentEntry() = default; + + virtual const uint8_t* get_data() const = 0; + + virtual size_t get_length() const = 0; + + virtual const char* get_key() const = 0; + + virtual int get_cc_major() const = 0; + + virtual int get_cc_minor() const = 0; +}; + +template +struct StaticCubinFragmentEntry final : CubinFragmentEntry { + const uint8_t* get_data() const override { return StaticCubinFragmentEntry::data; } + + size_t get_length() const override { return StaticCubinFragmentEntry::length; } + + const char* get_key() const override + { + return typeid(StaticCubinFragmentEntry).name(); + } + + int get_cc_major() const override { return FragmentTag::cc_major; } + + int get_cc_minor() const override { return FragmentTag::cc_minor; } + + static const uint8_t* const data; + static const size_t length; +}; + +/** Embedded TileIR bytecode, JIT-compiled by the driver when no matching cubin exists. */ +struct TileIrBytecodeFragmentEntry { + virtual ~TileIrBytecodeFragmentEntry() = default; + + virtual const uint8_t* get_data() const = 0; + + virtual size_t get_length() const = 0; + + virtual const char* get_key() const = 0; +}; + +template +struct StaticTileIrBytecodeFragmentEntry final : TileIrBytecodeFragmentEntry { + const uint8_t* get_data() const override + { + return StaticTileIrBytecodeFragmentEntry::data; + } + + size_t get_length() const override { return StaticTileIrBytecodeFragmentEntry::length; } + + const char* get_key() const override + { + return typeid(StaticTileIrBytecodeFragmentEntry).name(); + } + + static const uint8_t* const data; + static const size_t length; +}; diff --git a/cpp/include/cuvs/detail/jit_lto/cutile_arch_tags.hpp b/cpp/include/cuvs/detail/jit_lto/cutile_arch_tags.hpp new file mode 100644 index 0000000000..2c915a278b --- /dev/null +++ b/cpp/include/cuvs/detail/jit_lto/cutile_arch_tags.hpp @@ -0,0 +1,52 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#ifndef CUVS_CUTILE_ENABLED +#define CUVS_CUTILE_ENABLED 0 +#endif + +namespace cuvs::detail::jit_lto { + +#if CUVS_CUTILE_ENABLED + +/** Must stay in sync with cuTile matrix _arch entries and planner add_static_fragment calls. */ +struct cutile_arch_8_0 { + static constexpr int cc_major = 8; + static constexpr int cc_minor = 0; +}; + +struct cutile_arch_8_6 { + static constexpr int cc_major = 8; + static constexpr int cc_minor = 6; +}; + +struct cutile_arch_9_0 { + static constexpr int cc_major = 9; + static constexpr int cc_minor = 0; +}; + +struct cutile_arch_12_0 { + static constexpr int cc_major = 12; + static constexpr int cc_minor = 0; +}; + +inline bool is_embedded_cubin_arch(int cc_major, int cc_minor) +{ + if (cc_major == 8 && cc_minor == 0) { return true; } + if (cc_major == 8 && cc_minor == 6) { return true; } + if (cc_major == 9 && cc_minor == 0) { return true; } + if (cc_major == 12 && cc_minor == 0) { return true; } + return false; +} + +#else + +inline bool is_embedded_cubin_arch(int, int) { return false; } + +#endif + +} // namespace cuvs::detail::jit_lto diff --git a/cpp/include/cuvs/detail/jit_lto/cutile_module.hpp b/cpp/include/cuvs/detail/jit_lto/cutile_module.hpp new file mode 100644 index 0000000000..dff0f472a7 --- /dev/null +++ b/cpp/include/cuvs/detail/jit_lto/cutile_module.hpp @@ -0,0 +1,75 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include + +namespace cuvs::detail::jit_lto { + +struct CutileModuleImage { + const uint8_t* data; + size_t size; +}; + +inline bool get_device_compute_capability(int& cc_major, int& cc_minor) +{ + int device = 0; + if (cudaGetDevice(&device) != cudaSuccess) { return false; } + if (cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device) != cudaSuccess) { + return false; + } + if (cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device) != cudaSuccess) { + return false; + } + return true; +} + +/** Selects a prebuilt cubin for the device CC, or embedded TileIR when the driver can JIT it. */ +inline std::optional resolve_cutile_module_image( + int cc_major, + int cc_minor, + int driver_version, + const std::vector>& cubin_fragments, + const TileIrBytecodeFragmentEntry* tileir_fragment) +{ + for (const auto& fragment : cubin_fragments) { + if (fragment->get_cc_major() == cc_major && fragment->get_cc_minor() == cc_minor) { + return CutileModuleImage{fragment->get_data(), fragment->get_length()}; + } + } + if (tileir_fragment != nullptr && tileir_fallback_available(driver_version)) { + return CutileModuleImage{tileir_fragment->get_data(), tileir_fragment->get_length()}; + } + return std::nullopt; +} + +inline std::shared_ptr load_cutile_launcher(const CutileModuleImage& image, + const std::string& kernel_symbol) +{ + cudaLibrary_t library{}; + RAFT_CUDA_TRY( + cudaLibraryLoadData(&library, image.data, nullptr, nullptr, 0, nullptr, nullptr, 0)); + + cudaKernel_t kernel{}; + RAFT_CUDA_TRY(cudaLibraryGetKernel(&kernel, library, kernel_symbol.c_str())); + + return std::make_shared(kernel, library); +} + +} // namespace cuvs::detail::jit_lto diff --git a/cpp/include/cuvs/detail/jit_lto/fused_distance_nn/fused_1nn_fragments.hpp b/cpp/include/cuvs/detail/jit_lto/fused_distance_nn/fused_1nn_fragments.hpp new file mode 100644 index 0000000000..517118bbe2 --- /dev/null +++ b/cpp/include/cuvs/detail/jit_lto/fused_distance_nn/fused_1nn_fragments.hpp @@ -0,0 +1,21 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +namespace cuvs::distance::detail { + +template +struct fragment_tag_fused_1nn_cubin { + static constexpr int cc_major = ArchTag::cc_major; + static constexpr int cc_minor = ArchTag::cc_minor; +}; + +template +struct fragment_tag_fused_1nn_tileir {}; + +} // namespace cuvs::distance::detail diff --git a/cpp/include/cuvs/detail/jit_lto/tileir_compat.hpp b/cpp/include/cuvs/detail/jit_lto/tileir_compat.hpp new file mode 100644 index 0000000000..d63759fb36 --- /dev/null +++ b/cpp/include/cuvs/detail/jit_lto/tileir_compat.hpp @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#ifndef CUVS_CUTILE_ENABLED +#define CUVS_CUTILE_ENABLED 0 +#endif + +#include +#include + +#include + +namespace cuvs::detail::jit_lto { + +/** Minimum CUDA driver version (from cudaDriverGetVersion) for TileIR JIT of embedded bytecode. */ +inline constexpr int kMinTileIrJitDriverVersion = 13010; // CUDA 13.1 / driver >= 590.44 + +/** Minimum CUDA runtime version (from cudaRuntimeGetVersion) for cuTile integration. */ +inline constexpr int kMinCutileRuntimeVersion = 13000; + +inline constexpr bool library_built_with_cutile() +{ +#if CUVS_CUTILE_ENABLED + return true; +#else + return false; +#endif +} + +inline bool runtime_cuda13_or_newer() +{ + int runtime_version = 0; + if (cudaRuntimeGetVersion(&runtime_version) != cudaSuccess) { return false; } + return runtime_version >= kMinCutileRuntimeVersion; +} + +/** True when this build embeds cuTile artifacts and the runtime is CUDA 13+. */ +inline bool cutile_integration_enabled() +{ + return library_built_with_cutile() && runtime_cuda13_or_newer(); +} + +/** True when this build embeds a prebuilt cubin for the given compute capability. */ +inline bool has_embedded_cubin_for_arch(int cc_major, int cc_minor) +{ + return is_embedded_cubin_arch(cc_major, cc_minor); +} + +/** True when the driver can JIT-compile embedded TileIR bytecode at load time. */ +inline bool tileir_fallback_available(int driver_version) +{ + return driver_version >= kMinTileIrJitDriverVersion; +} + +/** + * True when a cuTile launch may be attempted for the given device: cuTile is enabled, the runtime + * is CUDA 13+, and either a matching embedded cubin exists (no driver JIT required) or the driver + * can JIT the embedded TileIR bytecode fallback. + */ +inline bool cutile_launch_available_for_arch(int cc_major, int cc_minor, int driver_version) +{ + if (!cutile_integration_enabled()) { return false; } + if (has_embedded_cubin_for_arch(cc_major, cc_minor)) { return true; } + return tileir_fallback_available(driver_version); +} + +inline bool query_driver_version(int& driver_version) +{ + return cudaDriverGetVersion(&driver_version) == cudaSuccess; +} + +inline bool query_current_device_arch(int& cc_major, int& cc_minor) +{ + int device = 0; + if (cudaGetDevice(&device) != cudaSuccess) { return false; } + if (cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device) != cudaSuccess) { + return false; + } + if (cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device) != cudaSuccess) { + return false; + } + return true; +} + +inline bool cutile_launch_available_on_current_device() +{ + int cc_major = 0; + int cc_minor = 0; + int driver_version = 0; + if (!query_current_device_arch(cc_major, cc_minor)) { return false; } + if (!query_driver_version(driver_version)) { return false; } + return cutile_launch_available_for_arch(cc_major, cc_minor, driver_version); +} + +} // namespace cuvs::detail::jit_lto diff --git a/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp b/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp index 7416ea396d..486d6f1aa5 100644 --- a/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp +++ b/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp @@ -3,33 +3,16 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include -#include #include #include -#include #include #include -#include #include -#include - -#include "cuda_runtime.h" -#include "nvJitLink.h" #include #include -std::string AlgorithmPlanner::get_fragments_key() const -{ - std::string key = ""; - for (const auto& fragment : this->fragments) { - key += fragment->get_key(); - } - return key; -} - std::shared_ptr AlgorithmPlanner::read_cache(std::string const& launch_key) const { auto& launchers = jit_cache_.launchers; @@ -38,79 +21,37 @@ std::shared_ptr AlgorithmPlanner::read_cache(std::string cons return nullptr; } -std::shared_ptr AlgorithmPlanner::get_launcher() +std::shared_ptr AlgorithmPlanner::try_get_launcher() { - auto& launchers = jit_cache_.launchers; - auto launch_key = this->get_fragments_key(); + auto launch_key = this->get_planner_key(); - if (auto hit = read_cache(launch_key)) { return hit; } + { + std::shared_lock read_lock(jit_cache_.mutex); + if (jit_cache_.build_failed.count(launch_key)) { return nullptr; } + if (auto hit = read_cache(launch_key)) { return hit; } + } std::unique_lock write_lock(jit_cache_.mutex); - if (auto it = launchers.find(launch_key); it != launchers.end()) { return it->second; } + if (jit_cache_.build_failed.count(launch_key)) { return nullptr; } + if (auto it = jit_cache_.launchers.find(launch_key); it != jit_cache_.launchers.end()) { + return it->second; + } - std::string log_message = - "JIT compiling launcher for kernel: " + this->entrypoint + " and device functions: "; - for (const auto& fragment : this->fragments) { - log_message += std::string{fragment->get_key()} + ","; + RAFT_LOG_DEBUG("Building launcher for kernel entrypoint: %s", this->entrypoint.c_str()); + auto launcher = this->build(); + if (!launcher) { + jit_cache_.build_failed.insert(launch_key); + return nullptr; } - log_message.pop_back(); - RAFT_LOG_DEBUG("%s", log_message.c_str()); - auto launcher = this->build(); - launchers[launch_key] = launcher; + jit_cache_.launchers[launch_key] = launcher; return launcher; } -std::shared_ptr AlgorithmPlanner::build() +std::shared_ptr AlgorithmPlanner::get_launcher() { - int device = 0; - int major = 0; - int minor = 0; - RAFT_CUDA_TRY(cudaGetDevice(&device)); - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device)); - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device)); - - std::string archs = "-arch=sm_" + std::to_string((major * 10 + minor)); - - // Load the generated LTO IR and link them together - nvJitLinkHandle handle; - std::vector lopts; - lopts.reserve(2 + linktime_extra_options.size()); - lopts.push_back("-lto"); - lopts.push_back(archs.c_str()); - for (auto const& opt : linktime_extra_options) { - lopts.push_back(opt.c_str()); - } - auto result = nvJitLinkCreate(&handle, static_cast(lopts.size()), lopts.data()); - check_nvjitlink_result(handle, result); - - for (const auto& frag : this->fragments) { - frag->add_to(handle); + auto launcher = try_get_launcher(); + if (!launcher) { + RAFT_FAIL("Failed to build launcher for kernel entrypoint: %s", this->entrypoint.c_str()); } - - // Call to nvJitLinkComplete causes linker to link together all the LTO-IR - // modules perform any optimizations and generate cubin from it. - result = nvJitLinkComplete(handle); - check_nvjitlink_result(handle, result); - - // get cubin from nvJitLink - size_t cubin_size; - result = nvJitLinkGetLinkedCubinSize(handle, &cubin_size); - check_nvjitlink_result(handle, result); - - std::unique_ptr cubin{new char[cubin_size]}; - result = nvJitLinkGetLinkedCubin(handle, cubin.get()); - check_nvjitlink_result(handle, result); - - result = nvJitLinkDestroy(&handle); - RAFT_EXPECTS(result == NVJITLINK_SUCCESS, "nvJitLinkDestroy failed"); - - // cubin is linked, so now load it - cudaLibrary_t library; - RAFT_CUDA_TRY( - cudaLibraryLoadData(&library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0)); - - cudaKernel_t kernel; - RAFT_CUDA_TRY(cudaLibraryGetKernel(&kernel, library, this->entrypoint.c_str())); - - return std::make_shared(kernel, library); + return launcher; } diff --git a/cpp/src/detail/jit_lto/LTOAlgorithmPlanner.cpp b/cpp/src/detail/jit_lto/LTOAlgorithmPlanner.cpp new file mode 100644 index 0000000000..da7c0408b4 --- /dev/null +++ b/cpp/src/detail/jit_lto/LTOAlgorithmPlanner.cpp @@ -0,0 +1,76 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include + +#include +#include + +#include "cuda_runtime.h" +#include "nvJitLink.h" + +#include + +std::string LTOAlgorithmPlanner::get_planner_key() const +{ + std::string key; + for (const auto& fragment : this->fragments) { + key += fragment->get_key(); + } + return key; +} + +std::shared_ptr LTOAlgorithmPlanner::build() +{ + int device = 0; + int major = 0; + int minor = 0; + RAFT_CUDA_TRY(cudaGetDevice(&device)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device)); + + std::string archs = "-arch=sm_" + std::to_string((major * 10 + minor)); + + nvJitLinkHandle handle; + std::vector lopts; + lopts.reserve(2 + linktime_extra_options.size()); + lopts.push_back("-lto"); + lopts.push_back(archs.c_str()); + for (auto const& opt : linktime_extra_options) { + lopts.push_back(opt.c_str()); + } + auto result = nvJitLinkCreate(&handle, static_cast(lopts.size()), lopts.data()); + check_nvjitlink_result(handle, result); + + for (const auto& frag : this->fragments) { + frag->add_to(handle); + } + + result = nvJitLinkComplete(handle); + check_nvjitlink_result(handle, result); + + size_t cubin_size; + result = nvJitLinkGetLinkedCubinSize(handle, &cubin_size); + check_nvjitlink_result(handle, result); + + std::unique_ptr cubin{new char[cubin_size]}; + result = nvJitLinkGetLinkedCubin(handle, cubin.get()); + check_nvjitlink_result(handle, result); + + result = nvJitLinkDestroy(&handle); + RAFT_EXPECTS(result == NVJITLINK_SUCCESS, "nvJitLinkDestroy failed"); + + cudaLibrary_t library; + RAFT_CUDA_TRY( + cudaLibraryLoadData(&library, cubin.get(), nullptr, nullptr, 0, nullptr, nullptr, 0)); + + cudaKernel_t kernel; + RAFT_CUDA_TRY(cudaLibraryGetKernel(&kernel, library, this->entrypoint.c_str())); + + return std::make_shared(kernel, library); +} diff --git a/cpp/src/detail/jit_lto/TileAlgorithmPlanner.cpp b/cpp/src/detail/jit_lto/TileAlgorithmPlanner.cpp new file mode 100644 index 0000000000..edb6269213 --- /dev/null +++ b/cpp/src/detail/jit_lto/TileAlgorithmPlanner.cpp @@ -0,0 +1,38 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include +#include + +std::string TileAlgorithmPlanner::get_planner_key() const +{ + std::string key = this->entrypoint; + for (const auto& fragment : cubin_fragments_) { + key += fragment->get_key(); + } + if (tileir_fragment_) { key += tileir_fragment_->get_key(); } + return key; +} + +std::shared_ptr TileAlgorithmPlanner::build() +{ + int cc_major = 0; + int cc_minor = 0; + if (!cuvs::detail::jit_lto::get_device_compute_capability(cc_major, cc_minor)) { + return nullptr; + } + + int driver_version = 0; + if (cudaDriverGetVersion(&driver_version) != cudaSuccess) { return nullptr; } + + auto image = cuvs::detail::jit_lto::resolve_cutile_module_image( + cc_major, cc_minor, driver_version, cubin_fragments_, tileir_fragment_.get()); + if (!image) { return nullptr; } + + return cuvs::detail::jit_lto::load_cutile_launcher(*image, this->entrypoint); +} diff --git a/cpp/src/distance/detail/fused_distance_nn.cuh b/cpp/src/distance/detail/fused_distance_nn.cuh index f9dbd968ec..8b47092b58 100644 --- a/cpp/src/distance/detail/fused_distance_nn.cuh +++ b/cpp/src/distance/detail/fused_distance_nn.cuh @@ -5,14 +5,22 @@ #pragma once +#ifndef CUVS_CUTILE_ENABLED +#define CUVS_CUTILE_ENABLED 0 +#endif + #include "distance_ops/l2_exp.cuh" // ops::l2_exp_distance_op #include "fused_distance_nn/cutlass_base.cuh" +#if CUVS_CUTILE_ENABLED +#include "fused_distance_nn/cutile/fused_1nn_tile.hpp" +#endif #include "fused_distance_nn/fused_cosine_nn.cuh" #include "fused_distance_nn/fused_l2_nn.cuh" #include "fused_distance_nn/helper_structs.cuh" #include "fused_distance_nn/simt_kernel.cuh" #include "pairwise_distance_base.cuh" // PairwiseDistances #include +#include #include // raft::KeyValuePair #include // raft::identity_op #include // Policy @@ -54,6 +62,13 @@ void fusedDistanceNNImpl(OutT* min, // The kernel policy is determined by fusedDistanceNN. typedef Policy P; +#if CUVS_CUTILE_ENABLED + if (cuvs::detail::jit_lto::cutile_launch_available_on_current_device() && + try_fused_1nn_tile(min, x, y, m, n, k, metric, stream)) { + return; + } +#endif + dim3 blk(P::Nthreads); auto nblks = raft::ceildiv(m, P::Nthreads); constexpr auto maxVal = std::numeric_limits::max(); diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/export_fused_1nn.py b/cpp/src/distance/detail/fused_distance_nn/cutile/export_fused_1nn.py new file mode 100644 index 0000000000..6a20be24ef --- /dev/null +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/export_fused_1nn.py @@ -0,0 +1,136 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +"""Export fused 1-NN cuTile kernels to cubin or TileIR bytecode.""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path +from typing import Literal + +import cuda.tile as ct +from cuda.tile.compilation import ( + ArrayConstraint, + CallingConvention, + ConstantConstraint, + KernelSignature, + ScalarConstraint, + export_kernel, +) + +from fused_1nn_kernel import KERNELS, KERNEL_SYMBOLS, TILE_CONSTANTS + +DEFAULT_TILEIR_BYTECODE_VERSION = "13.1" +# cuTile requires a gpu_code even for TileIR bytecode export: it selects the compilation +# target / feature set for lowering, not the runtime architecture (the driver JITs at load). +DEFAULT_TILEIR_EXPORT_GPU_CODE = "sm_80" + + +def _dtype_for(data_type: str): + if data_type == "half": + return ct.float16 + if data_type == "float": + return ct.float32 + raise ValueError(f"Unsupported data_type {data_type!r}") + + +def _kernel_signature(data_type: str) -> KernelSignature: + elem = _dtype_for(data_type) + array = ArrayConstraint( + elem, + 2, + index_dtype=ct.int64, + stride_lower_bound_incl=0, + alias_groups=(), + may_alias_internally=False, + ) + idx_array = ArrayConstraint( + ct.int64, + 1, + index_dtype=ct.int64, + stride_lower_bound_incl=0, + alias_groups=(), + may_alias_internally=False, + stride_constant=(1,), + ) + dist_array = ArrayConstraint( + ct.float32, + 1, + index_dtype=ct.int64, + stride_lower_bound_incl=0, + alias_groups=(), + may_alias_internally=False, + stride_constant=(1,), + ) + tm, tn, tk = TILE_CONSTANTS + return KernelSignature( + parameters=[ + array, + array, + idx_array, + dist_array, + ScalarConstraint(ct.int64), + ScalarConstraint(ct.int64), + ScalarConstraint(ct.int64), + ConstantConstraint(tm), + ConstantConstraint(tn), + ConstantConstraint(tk), + ], + calling_convention=CallingConvention.cutile_python_v1(), + ).with_symbol(KERNEL_SYMBOLS[data_type]) + + +def export_binary( + output_file: Path, + *, + output_format: Literal["cubin", "tileir_bytecode"], + data_type: str, + gpu_code: str, + bytecode_version: str | None = None, +) -> str: + kernel = KERNELS[data_type] + signature = _kernel_signature(data_type) + + export_kwargs = { + "kernel": kernel, + "signatures": [signature], + "output_file": str(output_file), + "gpu_code": gpu_code, + "output_format": output_format, + } + if output_format == "tileir_bytecode": + export_kwargs["bytecode_version"] = bytecode_version or DEFAULT_TILEIR_BYTECODE_VERSION + + export_kernel(**export_kwargs) + + return signature.symbol + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("output_file", type=Path) + parser.add_argument("--format", choices=("cubin", "tileir_bytecode"), default="cubin") + parser.add_argument("--data-type", choices=tuple(KERNELS.keys()), required=True) + parser.add_argument( + "--gpu-code", + default=DEFAULT_TILEIR_EXPORT_GPU_CODE, + help="Target SM for cubin export, or compile hint for TileIR bytecode export", + ) + parser.add_argument("--bytecode-version", default=DEFAULT_TILEIR_BYTECODE_VERSION) + args = parser.parse_args() + + print( + export_binary( + args.output_file, + output_format=args.format, + data_type=args.data_type, + gpu_code=args.gpu_code, + bytecode_version=args.bytecode_version, + ) + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_cubin_matrix.json b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_cubin_matrix.json new file mode 100644 index 0000000000..fbd4bfdd64 --- /dev/null +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_cubin_matrix.json @@ -0,0 +1,40 @@ +[ + { + "_data": [ + { + "data_type": "half", + "data_abbrev": "h" + }, + { + "data_type": "float", + "data_abbrev": "f" + } + ], + "_arch": [ + { + "gpu_code": "sm_80", + "cc_major": 8, + "cc_minor": 0, + "arch_tag": "cutile_arch_8_0" + }, + { + "gpu_code": "sm_86", + "cc_major": 8, + "cc_minor": 6, + "arch_tag": "cutile_arch_8_6" + }, + { + "gpu_code": "sm_90", + "cc_major": 9, + "cc_minor": 0, + "arch_tag": "cutile_arch_9_0" + }, + { + "gpu_code": "sm_120", + "cc_major": 12, + "cc_minor": 0, + "arch_tag": "cutile_arch_12_0" + } + ] + } +] diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_tileir_matrix.json b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_tileir_matrix.json new file mode 100644 index 0000000000..364c94594c --- /dev/null +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_tileir_matrix.json @@ -0,0 +1,20 @@ +[ + { + "_data": [ + { + "data_type": "half", + "data_abbrev": "h" + }, + { + "data_type": "float", + "data_abbrev": "f" + } + ], + "_tileir": [ + { + "export_gpu_code": "sm_80", + "bytecode_version": "13.1" + } + ] + } +] diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_kernel.py b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_kernel.py new file mode 100644 index 0000000000..232b9506af --- /dev/null +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_kernel.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +"""cuTile fused GEMM + inner-product 1-NN (argmax dot product) for cuVS.""" + +from __future__ import annotations + +import cuda.tile as ct + +ConstInt = ct.Constant[int] + +TILE_M = 128 +TILE_N = 256 +TILE_K = 64 + + +def _make_kernel(data_type: str): + if data_type == "half": + dtype = ct.float16 + acc_dtype = ct.float32 + elif data_type == "float": + dtype = ct.float32 + acc_dtype = ct.float32 + else: + raise ValueError(f"Unsupported data_type {data_type!r}") + + @ct.kernel + def fused_1nn_kernel(A, B, OutIdx, OutDist, M, N, K, tm: ConstInt, tn: ConstInt, tk: ConstInt): + bidm = ct.bid(0) + + best_dist = ct.full((tm,), -3.4e38, acc_dtype) + best_idx = ct.zeros((tm,), ct.int64) + + num_tiles_k = ct.num_tiles(A, axis=1, shape=(tm, tk)) + num_tiles_n = ct.num_tiles(B, axis=0, shape=(tn, tk)) + zero_pad = ct.PaddingMode.ZERO + + for n in range(num_tiles_n): + accumulator = ct.full((tm, tn), 0, dtype=acc_dtype) + + for k in range(num_tiles_k): + a = ct.load(A, index=(bidm, k), shape=(tm, tk), padding_mode=zero_pad) + b_T = ct.load(B, index=(n, k), shape=(tn, tk), padding_mode=zero_pad) + accumulator = ct.mma(a, ct.transpose(b_T), accumulator) + + curr_max = ct.max(accumulator, axis=1) + curr_idx = ct.argmax(accumulator, axis=1) + + update = curr_max > best_dist + best_dist = ct.where(update, curr_max, best_dist) + best_idx = ct.where(update, n * tn + curr_idx, best_idx) + + ct.store(OutIdx, index=(bidm,), tile=best_idx) + ct.store(OutDist, index=(bidm,), tile=best_dist) + + return fused_1nn_kernel + + +KERNELS = { + "half": _make_kernel("half"), + "float": _make_kernel("float"), +} + +KERNEL_SYMBOLS = { + "half": "fused_1nn_half", + "float": "fused_1nn_float", +} + +TILE_CONSTANTS = (TILE_M, TILE_N, TILE_K) diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_planner.hpp b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_planner.hpp new file mode 100644 index 0000000000..dd2a539528 --- /dev/null +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_planner.hpp @@ -0,0 +1,60 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace cuvs::distance::detail { + +/** Must match KERNEL_SYMBOLS in fused_1nn_kernel.py (export uses with_symbol). */ +template +inline const char* fused_1nn_kernel_entrypoint() +{ + if constexpr (std::is_same_v) { + return "fused_1nn_half"; + } else if constexpr (std::is_same_v) { + return "fused_1nn_float"; + } else { + static_assert(sizeof(DataTag) == 0, "unsupported fused 1-NN cuTile data type"); + return ""; + } +} + +template +struct Fused1nnTilePlanner : TileAlgorithmPlanner { + inline static LauncherJitCache launcher_jit_cache{}; + + Fused1nnTilePlanner() + : TileAlgorithmPlanner(fused_1nn_kernel_entrypoint(), launcher_jit_cache) + { + } + + /** Registers embedded cubin modules (one per SM); see register_cubin.cpp object files. */ + void add_entrypoint() + { + using cuvs::detail::jit_lto::cutile_arch_12_0; + using cuvs::detail::jit_lto::cutile_arch_8_0; + using cuvs::detail::jit_lto::cutile_arch_8_6; + using cuvs::detail::jit_lto::cutile_arch_9_0; + + this->add_static_fragment>(); + this->add_static_fragment>(); + this->add_static_fragment>(); + this->add_static_fragment>(); + } + + void add_tileir_fallback() + { + this->add_static_tileir_fragment>(); + } +}; + +} // namespace cuvs::distance::detail diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.cu b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.cu new file mode 100644 index 0000000000..af8b0b181f --- /dev/null +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.cu @@ -0,0 +1,173 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fused_1nn_tile.hpp" + +#include "fused_1nn_planner.hpp" + +#include +#include + +namespace cuvs { +namespace distance { +namespace detail { + +namespace { + +template +__global__ void pack_fused_1nn_kvp(OutT* out, const int64_t* idx, const float* dist, IdxT len) +{ + IdxT i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + out[i].key = static_cast(idx[i]); + out[i].value = static_cast(dist[i]); + } +} + +template +bool launch_fused_1nn_tile(const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + cudaStream_t stream) +{ + Fused1nnTilePlanner planner; + planner.add_entrypoint(); + planner.add_tileir_fallback(); + auto launcher = planner.try_get_launcher(); + if (!launcher) { return false; } + + int64_t* d_idx = nullptr; + float* d_dist = nullptr; + RAFT_CUDA_TRY(cudaMallocAsync(&d_idx, m * sizeof(int64_t), stream)); + RAFT_CUDA_TRY(cudaMallocAsync(&d_dist, m * sizeof(float), stream)); + + int64_t shape_x[2] = {m, k}; + int64_t stride_x[2] = {k, 1}; + int64_t shape_y[2] = {n, k}; + int64_t stride_y[2] = {k, 1}; + int64_t shape_idx[1] = {m}; + int64_t stride_idx[1] = {1}; + int64_t shape_dist[1] = {m}; + int64_t stride_dist[1] = {1}; + + int64_t M = m, N = n, K = k; + constexpr int64_t tm = 128, tn = 256, tk = 64; + + void* x_ptr = const_cast(x); + void* y_ptr = const_cast(y); + void* idx_ptr = d_idx; + void* dist_ptr = d_dist; + + dim3 grid((m + tm - 1) / tm, 1, 1); + dim3 block(1, 1, 1); + + using fused_1nn_cutile_kernel_t = void(void*, + int64_t*, + int64_t*, + void*, + int64_t*, + int64_t*, + void*, + int64_t*, + int64_t*, + void*, + int64_t*, + int64_t*, + int64_t, + int64_t, + int64_t, + int64_t, + int64_t, + int64_t); + launcher->template dispatch( + stream, + grid, + block, + 0, + x_ptr, + shape_x, + stride_x, + y_ptr, + shape_y, + stride_y, + idx_ptr, + shape_idx, + stride_idx, + dist_ptr, + shape_dist, + stride_dist, + M, + N, + K, + tm, + tn, + tk); + + pack_fused_1nn_kvp<<<(m + 255) / 256, 256, 0, stream>>>(out, d_idx, d_dist, m); + RAFT_CUDA_TRY(cudaGetLastError()); + RAFT_CUDA_TRY(cudaFreeAsync(d_idx, stream)); + RAFT_CUDA_TRY(cudaFreeAsync(d_dist, stream)); + return true; +} + +} // namespace + +template , int>> +bool try_fused_1nn_tile(OutT* min, + const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + cuvs::distance::DistanceType metric, + cudaStream_t stream) +{ + if (metric != cuvs::distance::DistanceType::InnerProduct) { return false; } + + if constexpr (std::is_same_v) { + return launch_fused_1nn_tile( + x, y, min, m, n, k, stream); + } else if constexpr (std::is_same_v) { + return launch_fused_1nn_tile( + x, y, min, m, n, k, stream); + } else { + return false; + } +} + +using kvp_i_f = raft::KeyValuePair; +using kvp_i64_f = raft::KeyValuePair; +using kvp_i_h = raft::KeyValuePair; +using kvp_i64_h = raft::KeyValuePair; + +#define CUVS_INST_TRY_FUSED_1NN_TILE(DataT, OutT, IdxT) \ + template CUVS_EXPORT bool try_fused_1nn_tile(OutT*, \ + const DataT*, \ + const DataT*, \ + IdxT, \ + IdxT, \ + IdxT, \ + cuvs::distance::DistanceType, \ + cudaStream_t) + +// int and int32_t are the same on LP64; one instantiation covers both. +CUVS_INST_TRY_FUSED_1NN_TILE(float, kvp_i_f, int); +CUVS_INST_TRY_FUSED_1NN_TILE(float, kvp_i64_f, int64_t); +CUVS_INST_TRY_FUSED_1NN_TILE(half, kvp_i_f, int); +CUVS_INST_TRY_FUSED_1NN_TILE(half, kvp_i64_f, int64_t); +CUVS_INST_TRY_FUSED_1NN_TILE(half, kvp_i_h, int); +CUVS_INST_TRY_FUSED_1NN_TILE(half, kvp_i64_h, int64_t); + +#undef CUVS_INST_TRY_FUSED_1NN_TILE + +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.hpp b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.hpp new file mode 100644 index 0000000000..30f804d399 --- /dev/null +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.hpp @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +#include +#include + +#include + +namespace cuvs { +namespace distance { +namespace detail { + +template +inline constexpr bool is_fused_1nn_kvp_output_v = + std::is_same_v> || + std::is_same_v>; + +template , int> = 0> +bool try_fused_1nn_tile(OutT* min, + const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + cuvs::distance::DistanceType metric, + cudaStream_t stream); + +template , int> = 0> +bool try_fused_1nn_tile(OutT*, + const DataT*, + const DataT*, + IdxT, + IdxT, + IdxT, + cuvs::distance::DistanceType, + cudaStream_t) +{ + return false; +} + +} // namespace detail +} // namespace distance +} // namespace cuvs diff --git a/cpp/src/distance/detail/pairwise_matrix/jit_lto_kernels/pairwise_matrix_planner.hpp b/cpp/src/distance/detail/pairwise_matrix/jit_lto_kernels/pairwise_matrix_planner.hpp index 0d00b3eca6..f89a383596 100644 --- a/cpp/src/distance/detail/pairwise_matrix/jit_lto_kernels/pairwise_matrix_planner.hpp +++ b/cpp/src/distance/detail/pairwise_matrix/jit_lto_kernels/pairwise_matrix_planner.hpp @@ -20,7 +20,7 @@ template -struct PairwiseMatrixPlanner : AlgorithmPlanner { +struct PairwiseMatrixPlanner : LTOAlgorithmPlanner { using DistanceTag = DistanceTag_; using DataTag = DataTag_; using AccTag = AccTag_; @@ -33,7 +33,7 @@ struct PairwiseMatrixPlanner : AlgorithmPlanner { inline static LauncherJitCache launcher_jit_cache{}; - PairwiseMatrixPlanner() : AlgorithmPlanner(kPairwiseMatrixJitEntrypoint, launcher_jit_cache) {} + PairwiseMatrixPlanner() : LTOAlgorithmPlanner(kPairwiseMatrixJitEntrypoint, launcher_jit_cache) {} void add_entrypoint() { diff --git a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp index 0c3ed64d13..b44a7f044e 100644 --- a/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp +++ b/cpp/src/neighbors/detail/cagra/jit_lto_kernels/cagra_planner_base.hpp @@ -25,7 +25,7 @@ template -struct CagraPlannerBase : AlgorithmPlanner { +struct CagraPlannerBase : LTOAlgorithmPlanner { using DataTag = DataTag_; using IndexTag = IndexTag_; using DistanceTag = DistanceTag_; @@ -34,7 +34,7 @@ struct CagraPlannerBase : AlgorithmPlanner { using SampleFilterJitTag = SampleFilterJitTag_; explicit CagraPlannerBase(std::string entrypoint, LauncherJitCache& jit_cache) - : AlgorithmPlanner(std::move(entrypoint), jit_cache) + : LTOAlgorithmPlanner(std::move(entrypoint), jit_cache) { } diff --git a/cpp/src/neighbors/ivf_flat/detail/jit_lto_kernels/interleaved_scan_planner.hpp b/cpp/src/neighbors/ivf_flat/detail/jit_lto_kernels/interleaved_scan_planner.hpp index ed8191016b..7899d970ab 100644 --- a/cpp/src/neighbors/ivf_flat/detail/jit_lto_kernels/interleaved_scan_planner.hpp +++ b/cpp/src/neighbors/ivf_flat/detail/jit_lto_kernels/interleaved_scan_planner.hpp @@ -14,10 +14,10 @@ namespace cuvs::neighbors::ivf_flat::detail { -struct InterleavedScanPlanner : AlgorithmPlanner { +struct InterleavedScanPlanner : LTOAlgorithmPlanner { inline static LauncherJitCache launcher_jit_cache{}; - InterleavedScanPlanner() : AlgorithmPlanner("interleaved_scan", launcher_jit_cache) {} + InterleavedScanPlanner() : LTOAlgorithmPlanner("interleaved_scan", launcher_jit_cache) {} template void add_entrypoint() diff --git a/cpp/src/neighbors/ivf_pq/detail/jit_lto_kernels/compute_similarity_planner.hpp b/cpp/src/neighbors/ivf_pq/detail/jit_lto_kernels/compute_similarity_planner.hpp index 0621966cad..7152aaeebd 100644 --- a/cpp/src/neighbors/ivf_pq/detail/jit_lto_kernels/compute_similarity_planner.hpp +++ b/cpp/src/neighbors/ivf_pq/detail/jit_lto_kernels/compute_similarity_planner.hpp @@ -12,10 +12,10 @@ namespace cuvs::neighbors::ivf_pq::detail { -struct ComputeSimilarityPlanner : AlgorithmPlanner { +struct ComputeSimilarityPlanner : LTOAlgorithmPlanner { inline static LauncherJitCache launcher_jit_cache{}; - ComputeSimilarityPlanner() : AlgorithmPlanner("compute_similarity", launcher_jit_cache) {} + ComputeSimilarityPlanner() : LTOAlgorithmPlanner("compute_similarity", launcher_jit_cache) {} template void add_entrypoint() diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp index 05ea34532e..5dc47dc612 100644 --- a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp @@ -13,10 +13,10 @@ namespace cuvs::neighbors::ivf_sq::detail { -struct IvfSqScanPlanner : AlgorithmPlanner { +struct IvfSqScanPlanner : LTOAlgorithmPlanner { inline static LauncherJitCache launcher_jit_cache{}; - IvfSqScanPlanner() : AlgorithmPlanner("ivf_sq_scan", launcher_jit_cache) {} + IvfSqScanPlanner() : LTOAlgorithmPlanner("ivf_sq_scan", launcher_jit_cache) {} template void add_entrypoint() diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ba6ed6e0e7..006b35b5c4 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -386,7 +386,8 @@ ConfigureTest( PERCENT 100 ) -add_subdirectory(cutile) +# cuTile vector-add example test disabled; fused 1-NN cuTile is covered via libcuvs integration. +# add_subdirectory(cutile) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/cutile/cutile_vector_add.cu b/cpp/tests/cutile/cutile_vector_add.cu index 77a5e51311..07d694bef1 100644 --- a/cpp/tests/cutile/cutile_vector_add.cu +++ b/cpp/tests/cutile/cutile_vector_add.cu @@ -11,10 +11,15 @@ #include "vector_add_sm_80_cubin.h" #include "vector_add_sm_86_cubin.h" #include "vector_add_sm_90_cubin.h" +#include "vector_add_tileir_bytecode.h" + +#include #include #include +#include +#include namespace cuvs { namespace { @@ -26,7 +31,7 @@ struct EmbeddedCubin { size_t size; }; -// Lookup table for cubins built at configure time (see export_vector_add_cubin.py). +// Prebuilt cubins for known library targets (see export_vector_add_cubin.py). constexpr EmbeddedCubin kEmbeddedCubins[] = { {8, 0, vector_add_sm_80_cubin, sizeof(vector_add_sm_80_cubin)}, {8, 6, vector_add_sm_86_cubin, sizeof(vector_add_sm_86_cubin)}, @@ -35,53 +40,128 @@ constexpr EmbeddedCubin kEmbeddedCubins[] = { {12, 0, vector_add_sm_120_cubin, sizeof(vector_add_sm_120_cubin)}, }; -const EmbeddedCubin* find_embedded_cubin(int cc_major, int cc_minor) +constexpr EmbeddedCubin kTileIrBytecode = { + -1, + -1, + vector_add_tileir_bytecode, + sizeof(vector_add_tileir_bytecode), +}; + +struct CutileModuleImage { + const uint8_t* data; + size_t size; +}; + +std::optional resolve_vector_add_module(int cc_major, int cc_minor) { for (const auto& entry : kEmbeddedCubins) { - if (entry.cc_major == cc_major && entry.cc_minor == cc_minor) { return &entry; } + if (entry.cc_major == cc_major && entry.cc_minor == cc_minor) { + return CutileModuleImage{reinterpret_cast(entry.data), entry.size}; + } } - // Fall back to a cubin for the same major version (e.g. minor SKUs within a generation). - for (const auto& entry : kEmbeddedCubins) { - if (entry.cc_major == cc_major) { return &entry; } + + int driver_version = 0; + if (cudaDriverGetVersion(&driver_version) != cudaSuccess) { return std::nullopt; } + if (!cuvs::detail::jit_lto::tileir_fallback_available(driver_version)) { + return std::nullopt; } - return nullptr; + return CutileModuleImage{ + reinterpret_cast(kTileIrBytecode.data), kTileIrBytecode.size}; } -class CutileVectorAddTest : public ::testing::Test { - protected: - void SetUp() override +struct LoadedKernel { + cudaLibrary_t library = nullptr; + cudaKernel_t kernel = nullptr; + bool used_tileir_jit{false}; + const char* skip_reason{nullptr}; + + LoadedKernel() = default; + + LoadedKernel(LoadedKernel&& other) noexcept { *this = std::move(other); } + + LoadedKernel& operator=(LoadedKernel&& other) noexcept { - int device = 0; - RAFT_CUDA_TRY(cudaGetDevice(&device)); - RAFT_CUDA_TRY( - cudaDeviceGetAttribute(&cc_major_, cudaDevAttrComputeCapabilityMajor, device)); - RAFT_CUDA_TRY( - cudaDeviceGetAttribute(&cc_minor_, cudaDevAttrComputeCapabilityMinor, device)); + if (this != &other) { + unload(); + library = other.library; + kernel = other.kernel; + used_tileir_jit = other.used_tileir_jit; + skip_reason = other.skip_reason; + other.library = nullptr; + other.kernel = nullptr; + } + return *this; } - int cc_major_{}; - int cc_minor_{}; -}; + LoadedKernel(const LoadedKernel&) = delete; + LoadedKernel& operator=(const LoadedKernel&) = delete; -} // namespace + ~LoadedKernel() { unload(); } -TEST_F(CutileVectorAddTest, EmbeddedCubinVectorAdd) + explicit operator bool() const { return kernel != nullptr; } + + private: + void unload() + { + if (library != nullptr) { + RAFT_CUDA_TRY(cudaLibraryUnload(library)); + library = nullptr; + kernel = nullptr; + } + } +}; + +LoadedKernel load_vector_add_kernel(int cc_major, int cc_minor) { - const EmbeddedCubin* cubin = find_embedded_cubin(cc_major_, cc_minor_); - ASSERT_NE(cubin, nullptr) - << "No embedded cuTile cubin for compute capability " << cc_major_ << "." << cc_minor_; + LoadedKernel result{}; + result.used_tileir_jit = !cuvs::detail::jit_lto::is_embedded_cubin_arch(cc_major, cc_minor); + + auto image = resolve_vector_add_module(cc_major, cc_minor); + if (!image) { + if (result.used_tileir_jit) { + result.skip_reason = + "TileIR driver JIT unavailable for this GPU. Requires CUDA 13.1+ driver (>= 590.44)."; + } else { + ADD_FAILURE() << "No embedded cuTile module for compute capability " << cc_major << "." + << cc_minor; + } + return result; + } - cudaLibrary_t library{}; - ASSERT_EQ(cudaSuccess, - cudaLibraryLoadData( - &library, cubin->data, nullptr, nullptr, 0, nullptr, nullptr, 0)) - << "cudaLibraryLoadData failed: " << cudaGetErrorString(cudaGetLastError()); + const cudaError_t load_status = + cudaLibraryLoadData(&result.library, image->data, nullptr, nullptr, 0, nullptr, nullptr, 0); + if (load_status != cudaSuccess) { + if (result.used_tileir_jit) { + result.skip_reason = + "TileIR driver JIT unavailable for this GPU (requires CUDA 13.1+ driver >= 590.44)."; + SCOPED_TRACE(cudaGetErrorString(load_status)); + } else { + ADD_FAILURE() << "cudaLibraryLoadData failed: " << cudaGetErrorString(load_status); + } + return result; + } - cudaKernel_t kernel{}; - ASSERT_EQ(cudaSuccess, - cudaLibraryGetKernel(&kernel, library, CUTILE_VECTOR_ADD_KERNEL_SYMBOL)) - << "cudaLibraryGetKernel failed: " << cudaGetErrorString(cudaGetLastError()); + const cudaError_t kernel_status = + cudaLibraryGetKernel(&result.kernel, result.library, CUTILE_VECTOR_ADD_KERNEL_SYMBOL); + if (kernel_status != cudaSuccess) { + if (result.library != nullptr) { + RAFT_CUDA_TRY(cudaLibraryUnload(result.library)); + result.library = nullptr; + } + result.kernel = nullptr; + if (result.used_tileir_jit) { + result.skip_reason = + "TileIR driver JIT unavailable for this GPU (requires CUDA 13.1+ driver >= 590.44)."; + SCOPED_TRACE(cudaGetErrorString(kernel_status)); + } else { + ADD_FAILURE() << "cudaLibraryGetKernel failed: " << cudaGetErrorString(kernel_status); + } + } + return result; +} +void run_vector_add(cudaKernel_t kernel) +{ constexpr int kN = 1024; constexpr int kTile = 256; constexpr int kGridDim = (kN + kTile - 1) / kTile; @@ -122,7 +202,35 @@ TEST_F(CutileVectorAddTest, EmbeddedCubinVectorAdd) RAFT_CUDA_TRY(cudaFree(d_a)); RAFT_CUDA_TRY(cudaFree(d_b)); RAFT_CUDA_TRY(cudaFree(d_c)); - RAFT_CUDA_TRY(cudaLibraryUnload(library)); +} + +class CutileVectorAddTest : public ::testing::Test { + protected: + void SetUp() override + { + int device = 0; + RAFT_CUDA_TRY(cudaGetDevice(&device)); + RAFT_CUDA_TRY( + cudaDeviceGetAttribute(&cc_major_, cudaDevAttrComputeCapabilityMajor, device)); + RAFT_CUDA_TRY( + cudaDeviceGetAttribute(&cc_minor_, cudaDevAttrComputeCapabilityMinor, device)); + } + + int cc_major_{}; + int cc_minor_{}; +}; + +} // namespace + +TEST_F(CutileVectorAddTest, EmbeddedCubinVectorAdd) +{ + LoadedKernel loaded = load_vector_add_kernel(cc_major_, cc_minor_); + if (loaded.skip_reason) { GTEST_SKIP() << loaded.skip_reason; } + if (!loaded) { return; } + + SCOPED_TRACE(loaded.used_tileir_jit ? "loaded via TileIR driver JIT" + : "loaded via prebuilt cubin"); + run_vector_add(loaded.kernel); } } // namespace cuvs diff --git a/cpp/tests/cutile/export_vector_add_cubin.py b/cpp/tests/cutile/export_vector_add_cubin.py index bf40a4ad80..fa099189cd 100644 --- a/cpp/tests/cutile/export_vector_add_cubin.py +++ b/cpp/tests/cutile/export_vector_add_cubin.py @@ -1,12 +1,13 @@ # SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 -"""Export the cuTile vector-add kernel to a cubin for a single GPU target.""" +"""Export the cuTile vector-add kernel to cubin or TileIR bytecode.""" from __future__ import annotations import argparse import sys from pathlib import Path +from typing import Literal import cuda.tile as ct from cuda.tile.compilation import ( @@ -28,6 +29,9 @@ # sm_120 -> 120a-real SUPPORTED_GPU_CODES = ("sm_80", "sm_86", "sm_90", "sm_100", "sm_120") +# Minimum TileIR bytecode version supported by cuTile; also the most portable choice. +DEFAULT_TILEIR_BYTECODE_VERSION = "13.1" + def _kernel_signature() -> KernelSignature: array = ArrayConstraint( @@ -45,20 +49,31 @@ def _kernel_signature() -> KernelSignature: ).with_mangled_symbol("vector_add") -def export_cubin(output_file: Path, gpu_code: str, symbol_header: Path | None) -> str: - if gpu_code not in SUPPORTED_GPU_CODES: +def export_kernel_binary( + output_file: Path, + *, + output_format: Literal["cubin", "tileir_bytecode"], + gpu_code: str, + bytecode_version: str | None = None, + symbol_header: Path | None = None, +) -> str: + if output_format == "cubin" and gpu_code not in SUPPORTED_GPU_CODES: raise ValueError( f"Unsupported gpu_code {gpu_code!r}; expected one of {SUPPORTED_GPU_CODES}" ) signature = _kernel_signature() - export_kernel( - vector_add, - signatures=[signature], - output_file=str(output_file), - gpu_code=gpu_code, - output_format="cubin", - ) + export_kwargs: dict = { + "kernel": vector_add, + "signatures": [signature], + "output_file": str(output_file), + "gpu_code": gpu_code, + "output_format": output_format, + } + if output_format == "tileir_bytecode": + export_kwargs["bytecode_version"] = bytecode_version or DEFAULT_TILEIR_BYTECODE_VERSION + + export_kernel(**export_kwargs) if symbol_header is not None: symbol_header.write_text( @@ -77,12 +92,23 @@ def export_cubin(output_file: Path, gpu_code: str, symbol_header: Path | None) - def main() -> int: parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("output_file", type=Path, help="Output cubin path") + parser.add_argument("output_file", type=Path, help="Output cubin or .tilebc path") + parser.add_argument( + "--format", + choices=("cubin", "tileir_bytecode"), + default="cubin", + help="Export format (default: cubin)", + ) parser.add_argument( "--gpu-code", required=True, choices=SUPPORTED_GPU_CODES, - help="tileiras / export_kernel target (e.g. sm_120)", + help="tileiras / export_kernel compile target (e.g. sm_120)", + ) + parser.add_argument( + "--bytecode-version", + default=DEFAULT_TILEIR_BYTECODE_VERSION, + help="TileIR bytecode version when --format=tileir_bytecode (default: 13.1)", ) parser.add_argument( "--symbol-header", @@ -92,7 +118,13 @@ def main() -> int: ) args = parser.parse_args() - symbol = export_cubin(args.output_file, args.gpu_code, args.symbol_header) + symbol = export_kernel_binary( + args.output_file, + output_format=args.format, + gpu_code=args.gpu_code, + bytecode_version=args.bytecode_version, + symbol_header=args.symbol_header, + ) print(symbol) return 0 diff --git a/cpp/tests/cutile/generate_cutile_cubins.cmake b/cpp/tests/cutile/generate_cutile_cubins.cmake index 3425b03028..766d3167c6 100644 --- a/cpp/tests/cutile/generate_cutile_cubins.cmake +++ b/cpp/tests/cutile/generate_cutile_cubins.cmake @@ -78,6 +78,33 @@ function(generate_cutile_vector_add_cubins output_include_dir_var) list(APPEND _generated_headers "${_cubin_header}") endforeach() + # Portable TileIR bytecode for driver JIT on architectures without a prebuilt cubin. + # Requires a CUDA 13.1+ driver (>= 590.44); see Tile IR bytecode docs. + set(_tileir_file "${_cutile_binary_dir}/vector_add.tilebc") + set(_tileir_header "${_cutile_binary_dir}/vector_add_tileir_bytecode.h") + + add_custom_command( + OUTPUT "${_tileir_file}" + COMMAND + "${Python3_EXECUTABLE}" "${_cutile_source_dir}/export_vector_add_cubin.py" + "${_tileir_file}" --format tileir_bytecode --gpu-code sm_80 --bytecode-version 13.1 + DEPENDS "${_cutile_source_dir}/export_vector_add_cubin.py" + "${_cutile_source_dir}/vector_add_kernel.py" + COMMENT "Exporting cuTile vector_add TileIR bytecode (v13.1)" + VERBATIM + ) + + add_custom_command( + OUTPUT "${_tileir_header}" + COMMAND "${CUTILE_BIN2C}" --const --name vector_add_tileir_bytecode --static "${_tileir_file}" + > "${_tileir_header}" + DEPENDS "${_tileir_file}" + COMMENT "Embedding vector_add TileIR bytecode via bin2c" + VERBATIM + ) + + list(APPEND _generated_headers "${_tileir_header}") + add_custom_target( cutile_vector_add_cubins DEPENDS "${_symbol_header}" ${_generated_headers} diff --git a/cpp/tests/neighbors/distance_nn.cu b/cpp/tests/neighbors/distance_nn.cu index f31f3ebacf..f5efaa5bec 100644 --- a/cpp/tests/neighbors/distance_nn.cu +++ b/cpp/tests/neighbors/distance_nn.cu @@ -187,6 +187,7 @@ const std::vector> input_fp32 = { {4096, 16384, 128, DistanceType::L2Expanded, true, uint64_t(31415926), 0.1}, {4096, 4096, 64, DistanceType::L2SqrtExpanded, false, uint64_t(31415926), 0.1}, {4096, 16384, 128, DistanceType::L2SqrtExpanded, false, uint64_t(31415926), 0.1}, + {512, 1024, 64, DistanceType::InnerProduct, false, uint64_t(31415926), 0.1}, {4096, 4096, 64, DistanceType::CosineExpanded, false, uint64_t(31415926), 0.1}, {8192, 4096, 64, DistanceType::CosineExpanded, false, uint64_t(31415926), 0.1}, // Fused implementation for cosine distance ignores the sqrt parameter, therefore diff --git a/cpp/tests/neighbors/distance_nn_helper.cuh b/cpp/tests/neighbors/distance_nn_helper.cuh index fda7b76573..422879918f 100644 --- a/cpp/tests/neighbors/distance_nn_helper.cuh +++ b/cpp/tests/neighbors/distance_nn_helper.cuh @@ -66,6 +66,16 @@ __device__ AccT cosine_distance(const DataT* v1, const DataT* v2, IdxT K) } // This is a naive implementation of 1-NN computation +template +__device__ AccT inner_product_score(const DataT* v1, const DataT* v2, IdxT K) +{ + AccT score = AccT(0.0); + for (IdxT i = 0; i < K; i++) { + score += AccT(v1[i]) * AccT(v2[i]); + } + return score; +} + template RAFT_KERNEL ref_nn_kernel( OutT* out, const DataT* A, const DataT* B, IdxT M, IdxT N, IdxT K, bool sqrt, DistanceType metric) @@ -73,22 +83,47 @@ RAFT_KERNEL ref_nn_kernel( IdxT tid = threadIdx.x + blockIdx.x * IdxT(blockDim.x); for (IdxT m = tid; m < M; m += (blockDim.x * gridDim.x)) { - IdxT min_index = N + 1; - AccT min_dist = max_val(); + IdxT best_index = N + 1; + AccT best_score = min_val(); + AccT best_dist = max_val(); for (IdxT n = 0; n < N; n++) { + if (metric == DistanceType::InnerProduct) { + AccT score = inner_product_score(&A[m * K], &B[n * K], K); + if (score > best_score) { + best_score = score; + best_index = n; + } + continue; + } + AccT dist; if (metric == DistanceType::L2SqrtExpanded || metric == DistanceType::L2Expanded) { dist = l2_distance(&A[m * K], &B[n * K], K); } else if (metric == DistanceType::CosineExpanded) { dist = cosine_distance(&A[m * K], &B[n * K], K); + } else { + continue; + } + if (dist < best_dist) { + best_dist = dist; + best_index = n; } - if (dist < min_dist) { - min_dist = dist; - min_index = n; + } + + if (metric == DistanceType::InnerProduct) { + if constexpr (std::is_fundamental::value) { + out[m] = AccT(best_score); + } else { + out[m].key = IdxT(best_index); + out[m].value = AccT(best_score); } + continue; } + IdxT min_index = best_index; + AccT min_dist = best_dist; + if constexpr (std::is_fundamental::value) { static_assert(std::is_same::value, "OutT and AccT are not same type"); out[m] = AccT(min_dist); From 1b934ddaa684bb09d2df9e611e6c7482b7ee975b Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 24 Jun 2026 19:33:18 +0000 Subject: [PATCH 3/5] attempt to fix tile linkage --- cpp/CMakeLists.txt | 22 +-- .../modules/generate_cutile_kernels.cmake | 183 +++++------------- ...cpp.in => register_cutile_fragment.cpp.in} | 8 +- cpp/cmake/modules/register_tileir.cpp.in | 22 --- .../cutile/fused_1nn_cutile_cubin_matrix.json | 40 ---- .../cutile/fused_1nn_cutile_matrix.json | 64 ++++++ .../fused_1nn_cutile_tileir_matrix.json | 20 -- 7 files changed, 120 insertions(+), 239 deletions(-) rename cpp/cmake/modules/{register_cubin.cpp.in => register_cutile_fragment.cpp.in} (57%) delete mode 100644 cpp/cmake/modules/register_tileir.cpp.in delete mode 100644 cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_cubin_matrix.json create mode 100644 cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_matrix.json delete mode 100644 cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_tileir_matrix.json diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cc6e1975b3..a1f3f3973c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -963,33 +963,21 @@ if(NOT BUILD_CPU_ONLY) "${CMAKE_CURRENT_SOURCE_DIR}/src/distance/detail/fused_distance_nn/cutile") set(cutile_fused_1nn_generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/distance/fused_1nn/cutile") - generate_cutile_cubin_kernels( + generate_cutile_kernels( cutile_fused_1nn_files KERNEL_DIR "${fused_1nn_cutile_dir}" KERNEL_BASENAME "fused_1nn" KERNEL_PYTHON "fused_1nn_kernel.py" EXPORT_SCRIPT "export_fused_1nn.py" OUTPUT_DIRECTORY "${cutile_fused_1nn_generated_dir}" - MATRIX_JSON_FILE "${fused_1nn_cutile_dir}/fused_1nn_cutile_cubin_matrix.json" - FRAGMENT_TAG_FORMAT + MATRIX_JSON_FILE "${fused_1nn_cutile_dir}/fused_1nn_cutile_matrix.json" + FRAGMENT_TAG_FORMAT_CUBIN "cuvs::distance::detail::fragment_tag_fused_1nn_cubin" - FRAGMENT_TAG_HEADER_FILES - "" - "" - "" - ) - generate_cutile_tileir_kernels( - cutile_fused_1nn_files - KERNEL_DIR "${fused_1nn_cutile_dir}" - KERNEL_BASENAME "fused_1nn" - KERNEL_PYTHON "fused_1nn_kernel.py" - EXPORT_SCRIPT "export_fused_1nn.py" - OUTPUT_DIRECTORY "${cutile_fused_1nn_generated_dir}" - MATRIX_JSON_FILE "${fused_1nn_cutile_dir}/fused_1nn_cutile_tileir_matrix.json" - FRAGMENT_TAG_FORMAT + FRAGMENT_TAG_FORMAT_TILEIR "cuvs::distance::detail::fragment_tag_fused_1nn_tileir" FRAGMENT_TAG_HEADER_FILES "" + "" "" ) if(NOT DEFINED CUVS_CUTILE_ENABLED) diff --git a/cpp/cmake/modules/generate_cutile_kernels.cmake b/cpp/cmake/modules/generate_cutile_kernels.cmake index 7b9c2521c4..f0219dc842 100644 --- a/cpp/cmake/modules/generate_cutile_kernels.cmake +++ b/cpp/cmake/modules/generate_cutile_kernels.cmake @@ -77,13 +77,15 @@ function(_cutile_kernels_setup) file(MAKE_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}") + set(Python3_EXECUTABLE "${Python3_EXECUTABLE}" PARENT_SCOPE) + set(CUTILE_BIN2C "${CUTILE_BIN2C}" PARENT_SCOPE) set(_CUTILE_SETUP_OK TRUE PARENT_SCOPE ) endfunction() -function(process_cutile_cubin_matrix_entry source_list_var) +function(process_cutile_matrix_entry source_list_var) set(options) set(one_value KERNEL_DIR @@ -91,110 +93,75 @@ function(process_cutile_cubin_matrix_entry source_list_var) KERNEL_PYTHON EXPORT_SCRIPT OUTPUT_DIRECTORY - FRAGMENT_TAG_FORMAT + FRAGMENT_TAG_FORMAT_CUBIN + FRAGMENT_TAG_FORMAT_TILEIR MATRIX_JSON_ENTRY ) set(multi_value FRAGMENT_TAG_HEADER_FILES) cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN}) - populate_matrix_variables("${_CUTILE_MATRIX_JSON_ENTRY}") - _cutile_fragment_tag_header_files( - fragment_tag_header_files ${_CUTILE_FRAGMENT_TAG_HEADER_FILES} - ) - - string(CONFIGURE "${_CUTILE_FRAGMENT_TAG_FORMAT}" fragment_tag @ONLY) - - set(_artifact_basename "${_CUTILE_KERNEL_BASENAME}_${data_type}_${gpu_code}") - set(_cubin_file "${_CUTILE_OUTPUT_DIRECTORY}/${_artifact_basename}.cubin") - set(_cubin_header "${_CUTILE_OUTPUT_DIRECTORY}/${_artifact_basename}_cubin.h") - set(_cubin_cpp "${_CUTILE_OUTPUT_DIRECTORY}/${_artifact_basename}_cubin.cpp") - set(cubin_header_file "${_artifact_basename}_cubin.h") - - add_custom_command( - OUTPUT "${_cubin_file}" - COMMAND - "${Python3_EXECUTABLE}" "${_CUTILE_KERNEL_DIR}/${_CUTILE_EXPORT_SCRIPT}" "${_cubin_file}" - --format cubin --data-type "${data_type}" --gpu-code "${gpu_code}" - DEPENDS "${_CUTILE_KERNEL_DIR}/${_CUTILE_EXPORT_SCRIPT}" - "${_CUTILE_KERNEL_DIR}/${_CUTILE_KERNEL_PYTHON}" - COMMENT "Exporting cuTile ${_CUTILE_KERNEL_BASENAME} cubin ${data_type} ${gpu_code}" - VERBATIM - ) - - add_custom_command( - OUTPUT "${_cubin_header}" - COMMAND "${CUTILE_BIN2C}" --const --name embedded_cubin --static "${_cubin_file}" - > "${_cubin_header}" - DEPENDS "${_cubin_file}" - VERBATIM - ) + find_package(Python3 REQUIRED COMPONENTS Interpreter) - configure_file( - "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/register_cubin.cpp.in" "${_cubin_cpp}" @ONLY - ) - list(APPEND ${source_list_var} "${_cubin_header}" "${_cubin_cpp}") - set(${source_list_var} - "${${source_list_var}}" - PARENT_SCOPE - ) -endfunction() + populate_matrix_variables("${_CUTILE_MATRIX_JSON_ENTRY}") -function(process_cutile_tileir_matrix_entry source_list_var) - set(options) - set(one_value - KERNEL_DIR - KERNEL_BASENAME - KERNEL_PYTHON - EXPORT_SCRIPT - OUTPUT_DIRECTORY - FRAGMENT_TAG_FORMAT - MATRIX_JSON_ENTRY - ) - set(multi_value FRAGMENT_TAG_HEADER_FILES) - cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN}) + if(register STREQUAL "cubin") + string(CONFIGURE "${_CUTILE_FRAGMENT_TAG_FORMAT_CUBIN}" fragment_tag @ONLY) + set(bin2c_symbol embedded_cubin) + set(fragment_entry_type "StaticCubinFragmentEntry") + elseif(register STREQUAL "tileir") + string(CONFIGURE "${_CUTILE_FRAGMENT_TAG_FORMAT_TILEIR}" fragment_tag @ONLY) + set(bin2c_symbol embedded_tileir) + set(fragment_entry_type "StaticTileIrBytecodeFragmentEntry") + else() + message(FATAL_ERROR "Unknown cuTile register kind '${register}'") + endif() - populate_matrix_variables("${_CUTILE_MATRIX_JSON_ENTRY}") _cutile_fragment_tag_header_files( fragment_tag_header_files ${_CUTILE_FRAGMENT_TAG_HEADER_FILES} ) - string(CONFIGURE "${_CUTILE_FRAGMENT_TAG_FORMAT}" fragment_tag @ONLY) - set(_tileir_file "${_CUTILE_OUTPUT_DIRECTORY}/${_CUTILE_KERNEL_BASENAME}_${data_type}.tilebc") - set(_tileir_header "${_CUTILE_OUTPUT_DIRECTORY}/${_CUTILE_KERNEL_BASENAME}_${data_type}_tileir.h") - set(_tileir_cpp "${_CUTILE_OUTPUT_DIRECTORY}/${_CUTILE_KERNEL_BASENAME}_${data_type}_tileir.cpp") - set(tileir_header_file "${_CUTILE_KERNEL_BASENAME}_${data_type}_tileir.h") + string(CONFIGURE "${artifact_basename}" _artifact_basename @ONLY) + set(_artifact_stem "${_CUTILE_KERNEL_BASENAME}_${_artifact_basename}") + set(_artifact_file "${_CUTILE_OUTPUT_DIRECTORY}/${_artifact_stem}.${artifact_ext}") + set(_embedded_header "${_CUTILE_OUTPUT_DIRECTORY}/${_artifact_stem}_${register}.h") + set(_fragment_cpp "${_CUTILE_OUTPUT_DIRECTORY}/${_artifact_stem}_${register}.cpp") + set(embedded_header_file "${_artifact_stem}_${register}.h") + + set(_python_args --format "${output_format}" --data-type "${data_type}" --gpu-code "${gpu_code}") + if(DEFINED bytecode_version AND NOT "${bytecode_version}" STREQUAL "") + list(APPEND _python_args --bytecode-version "${bytecode_version}") + endif() add_custom_command( - OUTPUT "${_tileir_file}" - COMMAND - "${Python3_EXECUTABLE}" "${_CUTILE_KERNEL_DIR}/${_CUTILE_EXPORT_SCRIPT}" "${_tileir_file}" - --format tileir_bytecode --data-type "${data_type}" --gpu-code "${export_gpu_code}" - --bytecode-version "${bytecode_version}" + OUTPUT "${_artifact_file}" + COMMAND "${Python3_EXECUTABLE}" "${_CUTILE_KERNEL_DIR}/${_CUTILE_EXPORT_SCRIPT}" + "${_artifact_file}" ${_python_args} + WORKING_DIRECTORY "${_CUTILE_KERNEL_DIR}" DEPENDS "${_CUTILE_KERNEL_DIR}/${_CUTILE_EXPORT_SCRIPT}" "${_CUTILE_KERNEL_DIR}/${_CUTILE_KERNEL_PYTHON}" - COMMENT "Exporting cuTile ${_CUTILE_KERNEL_BASENAME} TileIR bytecode ${data_type}" + COMMENT "Exporting cuTile ${_CUTILE_KERNEL_BASENAME} ${output_format} ${data_type}" VERBATIM ) add_custom_command( - OUTPUT "${_tileir_header}" - COMMAND "${CUTILE_BIN2C}" --const --name embedded_tileir --static "${_tileir_file}" - > "${_tileir_header}" - DEPENDS "${_tileir_file}" + OUTPUT "${_embedded_header}" + COMMAND "${CUTILE_BIN2C}" --const --name ${bin2c_symbol} --static "${_artifact_file}" + > "${_embedded_header}" + DEPENDS "${_artifact_file}" VERBATIM ) configure_file( - "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/register_tileir.cpp.in" "${_tileir_cpp}" @ONLY + "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/register_cutile_fragment.cpp.in" "${_fragment_cpp}" @ONLY ) - list(APPEND ${source_list_var} "${_tileir_header}" "${_tileir_cpp}") + list(APPEND ${source_list_var} "${_embedded_header}" "${_fragment_cpp}") set(${source_list_var} "${${source_list_var}}" PARENT_SCOPE ) endfunction() -function(generate_cutile_cubin_kernels source_list_var) +function(generate_cutile_kernels source_list_var) set(options) set(one_value KERNEL_DIR @@ -203,13 +170,14 @@ function(generate_cutile_cubin_kernels source_list_var) EXPORT_SCRIPT OUTPUT_DIRECTORY MATRIX_JSON_FILE - FRAGMENT_TAG_FORMAT + FRAGMENT_TAG_FORMAT_CUBIN + FRAGMENT_TAG_FORMAT_TILEIR ) set(multi_value FRAGMENT_TAG_HEADER_FILES) cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN}) if(NOT _CUTILE_KERNEL_BASENAME) - message(FATAL_ERROR "generate_cutile_cubin_kernels: KERNEL_BASENAME is required") + message(FATAL_ERROR "generate_cutile_kernels: KERNEL_BASENAME is required") endif() if(NOT _CUTILE_KERNEL_PYTHON) set(_CUTILE_KERNEL_PYTHON "fused_1nn_kernel.py") @@ -236,72 +204,15 @@ function(generate_cutile_cubin_kernels source_list_var) # cmake-lint: disable=C0103,E1120 foreach(i RANGE "${last}") string(JSON matrix_json_entry GET "${matrix_product}" "${i}") - process_cutile_cubin_matrix_entry( - "${source_list_var}" - KERNEL_DIR "${_CUTILE_KERNEL_DIR}" - KERNEL_BASENAME "${_CUTILE_KERNEL_BASENAME}" - KERNEL_PYTHON "${_CUTILE_KERNEL_PYTHON}" - EXPORT_SCRIPT "${_CUTILE_EXPORT_SCRIPT}" - OUTPUT_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}" - FRAGMENT_TAG_FORMAT "${_CUTILE_FRAGMENT_TAG_FORMAT}" - FRAGMENT_TAG_HEADER_FILES ${_CUTILE_FRAGMENT_TAG_HEADER_FILES} - MATRIX_JSON_ENTRY "${matrix_json_entry}" - ) - endforeach() - - set(CUVS_CUTILE_ENABLED 1 PARENT_SCOPE) - set(${source_list_var} - "${${source_list_var}}" - PARENT_SCOPE - ) -endfunction() - -function(generate_cutile_tileir_kernels source_list_var) - set(options) - set(one_value - KERNEL_DIR - KERNEL_BASENAME - KERNEL_PYTHON - EXPORT_SCRIPT - OUTPUT_DIRECTORY - MATRIX_JSON_FILE - FRAGMENT_TAG_FORMAT - ) - set(multi_value FRAGMENT_TAG_HEADER_FILES) - cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN}) - - if(NOT _CUTILE_KERNEL_BASENAME) - message(FATAL_ERROR "generate_cutile_tileir_kernels: KERNEL_BASENAME is required") - endif() - if(NOT _CUTILE_KERNEL_PYTHON) - set(_CUTILE_KERNEL_PYTHON "fused_1nn_kernel.py") - endif() - - _cutile_kernels_setup( - MATRIX_JSON_FILE "${_CUTILE_MATRIX_JSON_FILE}" - OUTPUT_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}" - ) - if(NOT _CUTILE_SETUP_OK) - generate_cutile_kernels_stub() - return() - endif() - - compute_matrix_product(matrix_product MATRIX_JSON_FILE "${_CUTILE_MATRIX_JSON_FILE}") - - string(JSON len LENGTH "${matrix_product}") - math(EXPR last "${len} - 1") - - # cmake-lint: disable=C0103,E1120 - foreach(i RANGE "${last}") - string(JSON matrix_json_entry GET "${matrix_product}" "${i}") - process_cutile_tileir_matrix_entry( + process_cutile_matrix_entry( "${source_list_var}" KERNEL_DIR "${_CUTILE_KERNEL_DIR}" KERNEL_BASENAME "${_CUTILE_KERNEL_BASENAME}" KERNEL_PYTHON "${_CUTILE_KERNEL_PYTHON}" EXPORT_SCRIPT "${_CUTILE_EXPORT_SCRIPT}" OUTPUT_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}" - FRAGMENT_TAG_FORMAT "${_CUTILE_FRAGMENT_TAG_FORMAT}" + FRAGMENT_TAG_FORMAT_CUBIN "${_CUTILE_FRAGMENT_TAG_FORMAT_CUBIN}" + FRAGMENT_TAG_FORMAT_TILEIR "${_CUTILE_FRAGMENT_TAG_FORMAT_TILEIR}" FRAGMENT_TAG_HEADER_FILES ${_CUTILE_FRAGMENT_TAG_HEADER_FILES} MATRIX_JSON_ENTRY "${matrix_json_entry}" ) diff --git a/cpp/cmake/modules/register_cubin.cpp.in b/cpp/cmake/modules/register_cutile_fragment.cpp.in similarity index 57% rename from cpp/cmake/modules/register_cubin.cpp.in rename to cpp/cmake/modules/register_cutile_fragment.cpp.in index c27d6829ee..0fc074bdbb 100644 --- a/cpp/cmake/modules/register_cubin.cpp.in +++ b/cpp/cmake/modules/register_cutile_fragment.cpp.in @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "@cubin_header_file@" +#include "@embedded_header_file@" #include @fragment_tag_header_files@ @@ -11,12 +11,12 @@ namespace { using fragment_tag = @fragment_tag@; -using fragment_entry = StaticCubinFragmentEntry; +using fragment_entry = @fragment_entry_type@; } // namespace template <> -const uint8_t* const fragment_entry::data = embedded_cubin; +const uint8_t* const fragment_entry::data = @bin2c_symbol@; template <> -const size_t fragment_entry::length = sizeof(embedded_cubin); +const size_t fragment_entry::length = sizeof(@bin2c_symbol@); diff --git a/cpp/cmake/modules/register_tileir.cpp.in b/cpp/cmake/modules/register_tileir.cpp.in deleted file mode 100644 index fb81acedbc..0000000000 --- a/cpp/cmake/modules/register_tileir.cpp.in +++ /dev/null @@ -1,22 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "@tileir_header_file@" -#include - -@fragment_tag_header_files@ - -namespace { - -using fragment_tag = @fragment_tag@; -using fragment_entry = StaticTileIrBytecodeFragmentEntry; - -} // namespace - -template <> -const uint8_t* const fragment_entry::data = embedded_tileir; - -template <> -const size_t fragment_entry::length = sizeof(embedded_tileir); diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_cubin_matrix.json b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_cubin_matrix.json deleted file mode 100644 index fbd4bfdd64..0000000000 --- a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_cubin_matrix.json +++ /dev/null @@ -1,40 +0,0 @@ -[ - { - "_data": [ - { - "data_type": "half", - "data_abbrev": "h" - }, - { - "data_type": "float", - "data_abbrev": "f" - } - ], - "_arch": [ - { - "gpu_code": "sm_80", - "cc_major": 8, - "cc_minor": 0, - "arch_tag": "cutile_arch_8_0" - }, - { - "gpu_code": "sm_86", - "cc_major": 8, - "cc_minor": 6, - "arch_tag": "cutile_arch_8_6" - }, - { - "gpu_code": "sm_90", - "cc_major": 9, - "cc_minor": 0, - "arch_tag": "cutile_arch_9_0" - }, - { - "gpu_code": "sm_120", - "cc_major": 12, - "cc_minor": 0, - "arch_tag": "cutile_arch_12_0" - } - ] - } -] diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_matrix.json b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_matrix.json new file mode 100644 index 0000000000..52955863c5 --- /dev/null +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_matrix.json @@ -0,0 +1,64 @@ +[ + { + "_data": [ + { + "data_type": "half", + "data_abbrev": "h" + }, + { + "data_type": "float", + "data_abbrev": "f" + } + ], + "_export": [ + { + "output_format": "cubin", + "artifact_ext": "cubin", + "artifact_basename": "@data_type@_@gpu_code@", + "register": "cubin", + "gpu_code": "sm_80", + "cc_major": 8, + "cc_minor": 0, + "arch_tag": "cutile_arch_8_0" + }, + { + "output_format": "cubin", + "artifact_ext": "cubin", + "artifact_basename": "@data_type@_@gpu_code@", + "register": "cubin", + "gpu_code": "sm_86", + "cc_major": 8, + "cc_minor": 6, + "arch_tag": "cutile_arch_8_6" + }, + { + "output_format": "cubin", + "artifact_ext": "cubin", + "artifact_basename": "@data_type@_@gpu_code@", + "register": "cubin", + "gpu_code": "sm_90", + "cc_major": 9, + "cc_minor": 0, + "arch_tag": "cutile_arch_9_0" + }, + { + "output_format": "cubin", + "artifact_ext": "cubin", + "artifact_basename": "@data_type@_@gpu_code@", + "register": "cubin", + "gpu_code": "sm_120", + "cc_major": 12, + "cc_minor": 0, + "arch_tag": "cutile_arch_12_0" + }, + { + "output_format": "tileir_bytecode", + "artifact_ext": "tilebc", + "artifact_basename": "@data_type@", + "register": "tileir", + "gpu_code": "sm_80", + "bytecode_version": "13.1" + } + ] + } +] diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_tileir_matrix.json b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_tileir_matrix.json deleted file mode 100644 index 364c94594c..0000000000 --- a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_cutile_tileir_matrix.json +++ /dev/null @@ -1,20 +0,0 @@ -[ - { - "_data": [ - { - "data_type": "half", - "data_abbrev": "h" - }, - { - "data_type": "float", - "data_abbrev": "f" - } - ], - "_tileir": [ - { - "export_gpu_code": "sm_80", - "bytecode_version": "13.1" - } - ] - } -] From c7f7cbd30bcf408c4823606d7f93f9e0df1526cc Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 24 Jun 2026 21:10:59 +0000 Subject: [PATCH 4/5] working test, remove example --- .../cutile/fused_1nn_tile.cu | 110 ++++---- .../cutile/fused_1nn_tile.hpp | 15 +- cpp/tests/CMakeLists.txt | 3 - cpp/tests/cutile/CMakeLists.txt | 23 -- cpp/tests/cutile/cutile_vector_add.cu | 236 ------------------ cpp/tests/cutile/export_vector_add_cubin.py | 133 ---------- cpp/tests/cutile/generate_cutile_cubins.cmake | 117 --------- cpp/tests/cutile/vector_add_kernel.py | 17 -- 8 files changed, 60 insertions(+), 594 deletions(-) delete mode 100644 cpp/tests/cutile/CMakeLists.txt delete mode 100644 cpp/tests/cutile/cutile_vector_add.cu delete mode 100644 cpp/tests/cutile/export_vector_add_cubin.py delete mode 100644 cpp/tests/cutile/generate_cutile_cubins.cmake delete mode 100644 cpp/tests/cutile/vector_add_kernel.py diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.cu b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.cu index af8b0b181f..0ad4ee62a5 100644 --- a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.cu +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.cu @@ -16,6 +16,8 @@ namespace detail { namespace { +constexpr int64_t TILE_M = 128; + template __global__ void pack_fused_1nn_kvp(OutT* out, const int64_t* idx, const float* dist, IdxT len) { @@ -27,13 +29,8 @@ __global__ void pack_fused_1nn_kvp(OutT* out, const int64_t* idx, const float* d } template -bool launch_fused_1nn_tile(const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - cudaStream_t stream) +bool launch_fused_1nn_tile( + const DataT* x, const DataT* y, OutT* out, IdxT m, IdxT n, IdxT k, cudaStream_t stream) { Fused1nnTilePlanner planner; planner.add_entrypoint(); @@ -46,67 +43,70 @@ bool launch_fused_1nn_tile(const DataT* x, RAFT_CUDA_TRY(cudaMallocAsync(&d_idx, m * sizeof(int64_t), stream)); RAFT_CUDA_TRY(cudaMallocAsync(&d_dist, m * sizeof(float), stream)); - int64_t shape_x[2] = {m, k}; - int64_t stride_x[2] = {k, 1}; - int64_t shape_y[2] = {n, k}; - int64_t stride_y[2] = {k, 1}; - int64_t shape_idx[1] = {m}; - int64_t stride_idx[1] = {1}; - int64_t shape_dist[1] = {m}; - int64_t stride_dist[1] = {1}; + int64_t shape_x[2] = {m, k}; + int64_t stride_x[2] = {k, 1}; + int64_t shape_y[2] = {n, k}; + int64_t stride_y[2] = {k, 1}; + int64_t shape_idx = m; + int64_t stride_idx = 1; + int64_t shape_dist = m; + int64_t stride_dist = 1; int64_t M = m, N = n, K = k; - constexpr int64_t tm = 128, tn = 256, tk = 64; void* x_ptr = const_cast(x); void* y_ptr = const_cast(y); void* idx_ptr = d_idx; void* dist_ptr = d_dist; - dim3 grid((m + tm - 1) / tm, 1, 1); + dim3 grid((m + TILE_M - 1) / TILE_M, 1, 1); dim3 block(1, 1, 1); + // cutile_python_v1 (see fused_1nn_float PTX): each 2D array is (ptr, shape0, shape1, + // stride0, stride1); each 1D array is (ptr, shape, stride); ConstantConstraint tile sizes + // are embedded in the module. using fused_1nn_cutile_kernel_t = void(void*, - int64_t*, - int64_t*, - void*, - int64_t*, - int64_t*, + int64_t, + int64_t, + int64_t, + int64_t, void*, - int64_t*, - int64_t*, + int64_t, + int64_t, + int64_t, + int64_t, void*, - int64_t*, - int64_t*, int64_t, int64_t, + void*, + int64_t, int64_t, int64_t, int64_t, int64_t); - launcher->template dispatch( - stream, - grid, - block, - 0, - x_ptr, - shape_x, - stride_x, - y_ptr, - shape_y, - stride_y, - idx_ptr, - shape_idx, - stride_idx, - dist_ptr, - shape_dist, - stride_dist, - M, - N, - K, - tm, - tn, - tk); + launcher->template dispatch(stream, + grid, + block, + 0, + x_ptr, + shape_x[0], + shape_x[1], + stride_x[0], + stride_x[1], + y_ptr, + shape_y[0], + shape_y[1], + stride_y[0], + stride_y[1], + idx_ptr, + shape_idx, + stride_idx, + dist_ptr, + shape_dist, + stride_dist, + M, + N, + K); pack_fused_1nn_kvp<<<(m + 255) / 256, 256, 0, stream>>>(out, d_idx, d_dist, m); RAFT_CUDA_TRY(cudaGetLastError()); @@ -148,13 +148,13 @@ using kvp_i64_f = raft::KeyValuePair; using kvp_i_h = raft::KeyValuePair; using kvp_i64_h = raft::KeyValuePair; -#define CUVS_INST_TRY_FUSED_1NN_TILE(DataT, OutT, IdxT) \ +#define CUVS_INST_TRY_FUSED_1NN_TILE(DataT, OutT, IdxT) \ template CUVS_EXPORT bool try_fused_1nn_tile(OutT*, \ - const DataT*, \ - const DataT*, \ - IdxT, \ - IdxT, \ - IdxT, \ + const DataT*, \ + const DataT*, \ + IdxT, \ + IdxT, \ + IdxT, \ cuvs::distance::DistanceType, \ cudaStream_t) diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.hpp b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.hpp index 30f804d399..d72a020ba7 100644 --- a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.hpp +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.hpp @@ -18,8 +18,9 @@ namespace detail { template inline constexpr bool is_fused_1nn_kvp_output_v = - std::is_same_v> || - std::is_same_v>; + (std::is_same_v || std::is_same_v) && + (std::is_same_v> || + std::is_same_v>); template , int> = 0> -bool try_fused_1nn_tile(OutT*, - const DataT*, - const DataT*, - IdxT, - IdxT, - IdxT, - cuvs::distance::DistanceType, - cudaStream_t) +bool try_fused_1nn_tile( + OutT*, const DataT*, const DataT*, IdxT, IdxT, IdxT, cuvs::distance::DistanceType, cudaStream_t) { return false; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 006b35b5c4..9b96f94bf0 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -386,9 +386,6 @@ ConfigureTest( PERCENT 100 ) -# cuTile vector-add example test disabled; fused 1-NN cuTile is covered via libcuvs integration. -# add_subdirectory(cutile) - # ################################################################################################## # Install tests #################################################################################### # ################################################################################################## diff --git a/cpp/tests/cutile/CMakeLists.txt b/cpp/tests/cutile/CMakeLists.txt deleted file mode 100644 index 989c8137d0..0000000000 --- a/cpp/tests/cutile/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -# ============================================================================= -# cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -# cmake-format: on -# ============================================================================= - -include("${CMAKE_CURRENT_LIST_DIR}/generate_cutile_cubins.cmake") - -generate_cutile_vector_add_cubins(CUTILE_GENERATED_INCLUDE_DIR) - -ConfigureTest( - NAME CUTILE_VECTOR_ADD_TEST - PATH "${CMAKE_CURRENT_LIST_DIR}/cutile_vector_add.cu" - GPUS 1 - PERCENT 100 -) - -add_dependencies(CUTILE_VECTOR_ADD_TEST cutile_vector_add_cubins) - -target_include_directories( - CUTILE_VECTOR_ADD_TEST PRIVATE "${CUTILE_GENERATED_INCLUDE_DIR}" -) diff --git a/cpp/tests/cutile/cutile_vector_add.cu b/cpp/tests/cutile/cutile_vector_add.cu deleted file mode 100644 index 07d694bef1..0000000000 --- a/cpp/tests/cutile/cutile_vector_add.cu +++ /dev/null @@ -1,236 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "../test_utils.cuh" - -#include "vector_add_kernel_symbol.h" -#include "vector_add_sm_100_cubin.h" -#include "vector_add_sm_120_cubin.h" -#include "vector_add_sm_80_cubin.h" -#include "vector_add_sm_86_cubin.h" -#include "vector_add_sm_90_cubin.h" -#include "vector_add_tileir_bytecode.h" - -#include - -#include - -#include -#include -#include - -namespace cuvs { -namespace { - -struct EmbeddedCubin { - int cc_major; - int cc_minor; - const unsigned char* data; - size_t size; -}; - -// Prebuilt cubins for known library targets (see export_vector_add_cubin.py). -constexpr EmbeddedCubin kEmbeddedCubins[] = { - {8, 0, vector_add_sm_80_cubin, sizeof(vector_add_sm_80_cubin)}, - {8, 6, vector_add_sm_86_cubin, sizeof(vector_add_sm_86_cubin)}, - {9, 0, vector_add_sm_90_cubin, sizeof(vector_add_sm_90_cubin)}, - {10, 0, vector_add_sm_100_cubin, sizeof(vector_add_sm_100_cubin)}, - {12, 0, vector_add_sm_120_cubin, sizeof(vector_add_sm_120_cubin)}, -}; - -constexpr EmbeddedCubin kTileIrBytecode = { - -1, - -1, - vector_add_tileir_bytecode, - sizeof(vector_add_tileir_bytecode), -}; - -struct CutileModuleImage { - const uint8_t* data; - size_t size; -}; - -std::optional resolve_vector_add_module(int cc_major, int cc_minor) -{ - for (const auto& entry : kEmbeddedCubins) { - if (entry.cc_major == cc_major && entry.cc_minor == cc_minor) { - return CutileModuleImage{reinterpret_cast(entry.data), entry.size}; - } - } - - int driver_version = 0; - if (cudaDriverGetVersion(&driver_version) != cudaSuccess) { return std::nullopt; } - if (!cuvs::detail::jit_lto::tileir_fallback_available(driver_version)) { - return std::nullopt; - } - return CutileModuleImage{ - reinterpret_cast(kTileIrBytecode.data), kTileIrBytecode.size}; -} - -struct LoadedKernel { - cudaLibrary_t library = nullptr; - cudaKernel_t kernel = nullptr; - bool used_tileir_jit{false}; - const char* skip_reason{nullptr}; - - LoadedKernel() = default; - - LoadedKernel(LoadedKernel&& other) noexcept { *this = std::move(other); } - - LoadedKernel& operator=(LoadedKernel&& other) noexcept - { - if (this != &other) { - unload(); - library = other.library; - kernel = other.kernel; - used_tileir_jit = other.used_tileir_jit; - skip_reason = other.skip_reason; - other.library = nullptr; - other.kernel = nullptr; - } - return *this; - } - - LoadedKernel(const LoadedKernel&) = delete; - LoadedKernel& operator=(const LoadedKernel&) = delete; - - ~LoadedKernel() { unload(); } - - explicit operator bool() const { return kernel != nullptr; } - - private: - void unload() - { - if (library != nullptr) { - RAFT_CUDA_TRY(cudaLibraryUnload(library)); - library = nullptr; - kernel = nullptr; - } - } -}; - -LoadedKernel load_vector_add_kernel(int cc_major, int cc_minor) -{ - LoadedKernel result{}; - result.used_tileir_jit = !cuvs::detail::jit_lto::is_embedded_cubin_arch(cc_major, cc_minor); - - auto image = resolve_vector_add_module(cc_major, cc_minor); - if (!image) { - if (result.used_tileir_jit) { - result.skip_reason = - "TileIR driver JIT unavailable for this GPU. Requires CUDA 13.1+ driver (>= 590.44)."; - } else { - ADD_FAILURE() << "No embedded cuTile module for compute capability " << cc_major << "." - << cc_minor; - } - return result; - } - - const cudaError_t load_status = - cudaLibraryLoadData(&result.library, image->data, nullptr, nullptr, 0, nullptr, nullptr, 0); - if (load_status != cudaSuccess) { - if (result.used_tileir_jit) { - result.skip_reason = - "TileIR driver JIT unavailable for this GPU (requires CUDA 13.1+ driver >= 590.44)."; - SCOPED_TRACE(cudaGetErrorString(load_status)); - } else { - ADD_FAILURE() << "cudaLibraryLoadData failed: " << cudaGetErrorString(load_status); - } - return result; - } - - const cudaError_t kernel_status = - cudaLibraryGetKernel(&result.kernel, result.library, CUTILE_VECTOR_ADD_KERNEL_SYMBOL); - if (kernel_status != cudaSuccess) { - if (result.library != nullptr) { - RAFT_CUDA_TRY(cudaLibraryUnload(result.library)); - result.library = nullptr; - } - result.kernel = nullptr; - if (result.used_tileir_jit) { - result.skip_reason = - "TileIR driver JIT unavailable for this GPU (requires CUDA 13.1+ driver >= 590.44)."; - SCOPED_TRACE(cudaGetErrorString(kernel_status)); - } else { - ADD_FAILURE() << "cudaLibraryGetKernel failed: " << cudaGetErrorString(kernel_status); - } - } - return result; -} - -void run_vector_add(cudaKernel_t kernel) -{ - constexpr int kN = 1024; - constexpr int kTile = 256; - constexpr int kGridDim = (kN + kTile - 1) / kTile; - - float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr; - RAFT_CUDA_TRY(cudaMalloc(&d_a, kN * sizeof(float))); - RAFT_CUDA_TRY(cudaMalloc(&d_b, kN * sizeof(float))); - RAFT_CUDA_TRY(cudaMalloc(&d_c, kN * sizeof(float))); - - std::vector h_a(kN), h_b(kN); - for (int i = 0; i < kN; ++i) { - h_a[i] = static_cast(i); - h_b[i] = static_cast(i * 2); - } - RAFT_CUDA_TRY(cudaMemcpy(d_a, h_a.data(), kN * sizeof(float), cudaMemcpyHostToDevice)); - RAFT_CUDA_TRY(cudaMemcpy(d_b, h_b.data(), kN * sizeof(float), cudaMemcpyHostToDevice)); - RAFT_CUDA_TRY(cudaMemset(d_c, 0, kN * sizeof(float))); - - int64_t shape = kN; - int64_t stride = 1; - void* kernel_args[] = { - &d_a, &shape, &stride, &d_b, &shape, &stride, &d_c, &shape, &stride, - }; - - dim3 grid(kGridDim); - dim3 block(1); - ASSERT_EQ(cudaSuccess, cudaLaunchKernel(kernel, grid, block, kernel_args, 0, 0)) - << "cudaLaunchKernel failed: " << cudaGetErrorString(cudaGetLastError()); - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - - std::vector h_c(kN); - RAFT_CUDA_TRY(cudaMemcpy(h_c.data(), d_c, kN * sizeof(float), cudaMemcpyDeviceToHost)); - - for (int i = 0; i < kN; ++i) { - ASSERT_FLOAT_EQ(h_a[i] + h_b[i], h_c[i]) << "@" << i; - } - - RAFT_CUDA_TRY(cudaFree(d_a)); - RAFT_CUDA_TRY(cudaFree(d_b)); - RAFT_CUDA_TRY(cudaFree(d_c)); -} - -class CutileVectorAddTest : public ::testing::Test { - protected: - void SetUp() override - { - int device = 0; - RAFT_CUDA_TRY(cudaGetDevice(&device)); - RAFT_CUDA_TRY( - cudaDeviceGetAttribute(&cc_major_, cudaDevAttrComputeCapabilityMajor, device)); - RAFT_CUDA_TRY( - cudaDeviceGetAttribute(&cc_minor_, cudaDevAttrComputeCapabilityMinor, device)); - } - - int cc_major_{}; - int cc_minor_{}; -}; - -} // namespace - -TEST_F(CutileVectorAddTest, EmbeddedCubinVectorAdd) -{ - LoadedKernel loaded = load_vector_add_kernel(cc_major_, cc_minor_); - if (loaded.skip_reason) { GTEST_SKIP() << loaded.skip_reason; } - if (!loaded) { return; } - - SCOPED_TRACE(loaded.used_tileir_jit ? "loaded via TileIR driver JIT" - : "loaded via prebuilt cubin"); - run_vector_add(loaded.kernel); -} - -} // namespace cuvs diff --git a/cpp/tests/cutile/export_vector_add_cubin.py b/cpp/tests/cutile/export_vector_add_cubin.py deleted file mode 100644 index fa099189cd..0000000000 --- a/cpp/tests/cutile/export_vector_add_cubin.py +++ /dev/null @@ -1,133 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -"""Export the cuTile vector-add kernel to cubin or TileIR bytecode.""" - -from __future__ import annotations - -import argparse -import sys -from pathlib import Path -from typing import Literal - -import cuda.tile as ct -from cuda.tile.compilation import ( - ArrayConstraint, - CallingConvention, - ConstantConstraint, - KernelSignature, - export_kernel, -) - -from vector_add_kernel import TILE_SIZE, vector_add - -# cuTile / tileiras gpu_code values used at build time. These correspond to the -# cuvs library CUDA 13 real targets as follows (tileiras has no sm_*a/sm_*f names): -# sm_80 -> 80-real -# sm_86 -> 86-real -# sm_90 -> 90a-real -# sm_100 -> 100f-real -# sm_120 -> 120a-real -SUPPORTED_GPU_CODES = ("sm_80", "sm_86", "sm_90", "sm_100", "sm_120") - -# Minimum TileIR bytecode version supported by cuTile; also the most portable choice. -DEFAULT_TILEIR_BYTECODE_VERSION = "13.1" - - -def _kernel_signature() -> KernelSignature: - array = ArrayConstraint( - ct.float32, - 1, - index_dtype=ct.int64, - stride_lower_bound_incl=0, - alias_groups=(), - may_alias_internally=False, - stride_constant=(1,), - ) - return KernelSignature( - parameters=[array, array, array, ConstantConstraint(TILE_SIZE)], - calling_convention=CallingConvention.cutile_python_v1(), - ).with_mangled_symbol("vector_add") - - -def export_kernel_binary( - output_file: Path, - *, - output_format: Literal["cubin", "tileir_bytecode"], - gpu_code: str, - bytecode_version: str | None = None, - symbol_header: Path | None = None, -) -> str: - if output_format == "cubin" and gpu_code not in SUPPORTED_GPU_CODES: - raise ValueError( - f"Unsupported gpu_code {gpu_code!r}; expected one of {SUPPORTED_GPU_CODES}" - ) - - signature = _kernel_signature() - export_kwargs: dict = { - "kernel": vector_add, - "signatures": [signature], - "output_file": str(output_file), - "gpu_code": gpu_code, - "output_format": output_format, - } - if output_format == "tileir_bytecode": - export_kwargs["bytecode_version"] = bytecode_version or DEFAULT_TILEIR_BYTECODE_VERSION - - export_kernel(**export_kwargs) - - if symbol_header is not None: - symbol_header.write_text( - "\n".join( - [ - "// Generated by export_vector_add_cubin.py; do not edit.", - "#pragma once", - f'#define CUTILE_VECTOR_ADD_KERNEL_SYMBOL "{signature.symbol}"', - "", - ] - ) - ) - - return signature.symbol - - -def main() -> int: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("output_file", type=Path, help="Output cubin or .tilebc path") - parser.add_argument( - "--format", - choices=("cubin", "tileir_bytecode"), - default="cubin", - help="Export format (default: cubin)", - ) - parser.add_argument( - "--gpu-code", - required=True, - choices=SUPPORTED_GPU_CODES, - help="tileiras / export_kernel compile target (e.g. sm_120)", - ) - parser.add_argument( - "--bytecode-version", - default=DEFAULT_TILEIR_BYTECODE_VERSION, - help="TileIR bytecode version when --format=tileir_bytecode (default: 13.1)", - ) - parser.add_argument( - "--symbol-header", - type=Path, - default=None, - help="Optional header that defines CUTILE_VECTOR_ADD_KERNEL_SYMBOL", - ) - args = parser.parse_args() - - symbol = export_kernel_binary( - args.output_file, - output_format=args.format, - gpu_code=args.gpu_code, - bytecode_version=args.bytecode_version, - symbol_header=args.symbol_header, - ) - print(symbol) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/cpp/tests/cutile/generate_cutile_cubins.cmake b/cpp/tests/cutile/generate_cutile_cubins.cmake deleted file mode 100644 index 766d3167c6..0000000000 --- a/cpp/tests/cutile/generate_cutile_cubins.cmake +++ /dev/null @@ -1,117 +0,0 @@ -# ============================================================================= -# cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -# cmake-format: on -# ============================================================================= - -include_guard(GLOBAL) - -# Build-time cuTile cubin targets. Maps to cuvs CUDA 13 -real library arches (75-real omitted). -set(CUTILE_VECTOR_ADD_GPU_CODES sm_80 sm_86 sm_90 sm_100 sm_120) - -function(generate_cutile_vector_add_cubins output_include_dir_var) - find_package(Python3 REQUIRED COMPONENTS Interpreter) - find_package(CUDAToolkit REQUIRED) - - find_program( - CUTILE_BIN2C - NAMES bin2c - PATHS ${CUDAToolkit_BIN_DIR} - REQUIRED - ) - - execute_process( - COMMAND "${Python3_EXECUTABLE}" -c "import cuda.tile" - RESULT_VARIABLE _cutile_import_result - OUTPUT_QUIET - ERROR_QUIET - ) - if(NOT _cutile_import_result EQUAL 0) - message( - FATAL_ERROR - "cuda.tile (cuTile Python) is required to build CUTILE_VECTOR_ADD_TEST. " - "Install it in the active Python environment, e.g. pip install cuda-tile[tileiras]." - ) - endif() - - set(_cutile_source_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}") - set(_cutile_binary_dir "${CMAKE_CURRENT_BINARY_DIR}/cutile_generated") - file(MAKE_DIRECTORY "${_cutile_binary_dir}") - - set(_symbol_header "${_cutile_binary_dir}/vector_add_kernel_symbol.h") - set(_first_gpu_code TRUE) - - foreach(_gpu_code IN LISTS CUTILE_VECTOR_ADD_GPU_CODES) - set(_cubin_file "${_cutile_binary_dir}/vector_add_${_gpu_code}.cubin") - set(_cubin_header "${_cutile_binary_dir}/vector_add_${_gpu_code}_cubin.h") - - if(_first_gpu_code) - set(_symbol_arg --symbol-header "${_symbol_header}") - set(_cubin_outputs "${_cubin_file}" "${_symbol_header}") - set(_first_gpu_code FALSE) - else() - set(_symbol_arg) - set(_cubin_outputs "${_cubin_file}") - endif() - - add_custom_command( - OUTPUT ${_cubin_outputs} - COMMAND - "${Python3_EXECUTABLE}" "${_cutile_source_dir}/export_vector_add_cubin.py" - "${_cubin_file}" --gpu-code "${_gpu_code}" ${_symbol_arg} - DEPENDS "${_cutile_source_dir}/export_vector_add_cubin.py" - "${_cutile_source_dir}/vector_add_kernel.py" - COMMENT "Exporting cuTile vector_add cubin for ${_gpu_code}" - VERBATIM - ) - - add_custom_command( - OUTPUT "${_cubin_header}" - COMMAND "${CUTILE_BIN2C}" --const --name "vector_add_${_gpu_code}_cubin" --static - "${_cubin_file}" > "${_cubin_header}" - DEPENDS "${_cubin_file}" - COMMENT "Embedding vector_add ${_gpu_code} cubin via bin2c" - VERBATIM - ) - - list(APPEND _generated_headers "${_cubin_header}") - endforeach() - - # Portable TileIR bytecode for driver JIT on architectures without a prebuilt cubin. - # Requires a CUDA 13.1+ driver (>= 590.44); see Tile IR bytecode docs. - set(_tileir_file "${_cutile_binary_dir}/vector_add.tilebc") - set(_tileir_header "${_cutile_binary_dir}/vector_add_tileir_bytecode.h") - - add_custom_command( - OUTPUT "${_tileir_file}" - COMMAND - "${Python3_EXECUTABLE}" "${_cutile_source_dir}/export_vector_add_cubin.py" - "${_tileir_file}" --format tileir_bytecode --gpu-code sm_80 --bytecode-version 13.1 - DEPENDS "${_cutile_source_dir}/export_vector_add_cubin.py" - "${_cutile_source_dir}/vector_add_kernel.py" - COMMENT "Exporting cuTile vector_add TileIR bytecode (v13.1)" - VERBATIM - ) - - add_custom_command( - OUTPUT "${_tileir_header}" - COMMAND "${CUTILE_BIN2C}" --const --name vector_add_tileir_bytecode --static "${_tileir_file}" - > "${_tileir_header}" - DEPENDS "${_tileir_file}" - COMMENT "Embedding vector_add TileIR bytecode via bin2c" - VERBATIM - ) - - list(APPEND _generated_headers "${_tileir_header}") - - add_custom_target( - cutile_vector_add_cubins - DEPENDS "${_symbol_header}" ${_generated_headers} - ) - - set(${output_include_dir_var} - "${_cutile_binary_dir}" - PARENT_SCOPE - ) -endfunction() diff --git a/cpp/tests/cutile/vector_add_kernel.py b/cpp/tests/cutile/vector_add_kernel.py deleted file mode 100644 index 46b7a607c6..0000000000 --- a/cpp/tests/cutile/vector_add_kernel.py +++ /dev/null @@ -1,17 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -"""cuTile Python vector-add kernel used by the embedded-cubin example test.""" - -from __future__ import annotations - -import cuda.tile as ct - -TILE_SIZE = 256 - - -@ct.kernel -def vector_add(a, b, c, TILE_SIZE: ct.Constant): - bid = ct.bid(0) - ta = ct.load(a, bid, TILE_SIZE) - tb = ct.load(b, bid, TILE_SIZE) - ct.store(c, bid, ta + tb) From 86c9311b87fae027be808b1a4d04f174e0bfbbb2 Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 24 Jun 2026 21:16:48 +0000 Subject: [PATCH 5/5] style check --- cpp/CMakeLists.txt | 47 +++++----- .../modules/generate_cutile_kernels.cmake | 86 ++++++++++--------- .../modules/register_cutile_fragment.cpp.in | 8 +- .../cuvs/detail/jit_lto/FragmentEntry.hpp | 5 +- .../cuvs/detail/jit_lto/tileir_compat.hpp | 4 +- .../detail/jit_lto/TileAlgorithmPlanner.cpp | 4 +- cpp/src/distance/detail/fused_distance_nn.cuh | 4 +- .../cutile/export_fused_1nn.py | 16 +++- .../cutile/fused_1nn_kernel.py | 30 ++++--- cpp/tests/neighbors/distance_nn_helper.cuh | 4 +- python/libcuvs/pyproject.toml | 1 + 11 files changed, 118 insertions(+), 91 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a1f3f3973c..70e2509a88 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -960,32 +960,38 @@ if(NOT BUILD_CPU_ONLY) include(cmake/modules/generate_cutile_kernels.cmake) set(fused_1nn_cutile_dir - "${CMAKE_CURRENT_SOURCE_DIR}/src/distance/detail/fused_distance_nn/cutile") + "${CMAKE_CURRENT_SOURCE_DIR}/src/distance/detail/fused_distance_nn/cutile" + ) set(cutile_fused_1nn_generated_dir - "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/distance/fused_1nn/cutile") + "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/distance/fused_1nn/cutile" + ) generate_cutile_kernels( cutile_fused_1nn_files - KERNEL_DIR "${fused_1nn_cutile_dir}" - KERNEL_BASENAME "fused_1nn" - KERNEL_PYTHON "fused_1nn_kernel.py" - EXPORT_SCRIPT "export_fused_1nn.py" - OUTPUT_DIRECTORY "${cutile_fused_1nn_generated_dir}" - MATRIX_JSON_FILE "${fused_1nn_cutile_dir}/fused_1nn_cutile_matrix.json" + KERNEL_DIR + "${fused_1nn_cutile_dir}" + KERNEL_BASENAME + "fused_1nn" + KERNEL_PYTHON + "fused_1nn_kernel.py" + EXPORT_SCRIPT + "export_fused_1nn.py" + OUTPUT_DIRECTORY + "${cutile_fused_1nn_generated_dir}" + MATRIX_JSON_FILE + "${fused_1nn_cutile_dir}/fused_1nn_cutile_matrix.json" FRAGMENT_TAG_FORMAT_CUBIN - "cuvs::distance::detail::fragment_tag_fused_1nn_cubin" + "cuvs::distance::detail::fragment_tag_fused_1nn_cubin" FRAGMENT_TAG_FORMAT_TILEIR - "cuvs::distance::detail::fragment_tag_fused_1nn_tileir" + "cuvs::distance::detail::fragment_tag_fused_1nn_tileir" FRAGMENT_TAG_HEADER_FILES - "" - "" - "" + "" + "" + "" ) if(NOT DEFINED CUVS_CUTILE_ENABLED) set(CUVS_CUTILE_ENABLED 0) endif() - target_compile_definitions( - cuvs_cpp_headers INTERFACE CUVS_CUTILE_ENABLED=${CUVS_CUTILE_ENABLED} - ) + target_compile_definitions(cuvs_cpp_headers INTERFACE CUVS_CUTILE_ENABLED=${CUVS_CUTILE_ENABLED}) generate_inst_matrix( cagra_build_inst_files MATRIX_JSON_FILE "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/cagra_build_matrix.json" @@ -1288,9 +1294,9 @@ if(NOT BUILD_CPU_ONLY) ) target_compile_definitions( - cuvs_objs PRIVATE $<$:CUVS_BUILD_CAGRA_HNSWLIB> - $<$:NVTX_ENABLED> - CUVS_CUTILE_ENABLED=${CUVS_CUTILE_ENABLED} + cuvs_objs + PRIVATE $<$:CUVS_BUILD_CAGRA_HNSWLIB> + $<$:NVTX_ENABLED> CUVS_CUTILE_ENABLED=${CUVS_CUTILE_ENABLED} ) target_link_libraries( @@ -1308,8 +1314,7 @@ if(NOT BUILD_CPU_ONLY) PUBLIC "$" "$" INTERFACE "$" - PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src" - "${CMAKE_CURRENT_BINARY_DIR}/src" + PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src" "${CMAKE_CURRENT_BINARY_DIR}/src" "${cutile_fused_1nn_generated_dir}" ) diff --git a/cpp/cmake/modules/generate_cutile_kernels.cmake b/cpp/cmake/modules/generate_cutile_kernels.cmake index f0219dc842..ac8d369cdc 100644 --- a/cpp/cmake/modules/generate_cutile_kernels.cmake +++ b/cpp/cmake/modules/generate_cutile_kernels.cmake @@ -10,7 +10,10 @@ include_guard(GLOBAL) include(${CMAKE_CURRENT_LIST_DIR}/compute_matrix_product.cmake) function(generate_cutile_kernels_stub) - set(CUVS_CUTILE_ENABLED 0 PARENT_SCOPE) + set(CUVS_CUTILE_ENABLED + 0 + PARENT_SCOPE + ) endfunction() function(_cutile_fragment_tag_header_files output_var) @@ -51,15 +54,13 @@ function(_cutile_kernels_setup) find_program( CUTILE_BIN2C NAMES bin2c - PATHS ${CUDAToolkit_BIN_DIR} - REQUIRED + PATHS ${CUDAToolkit_BIN_DIR} REQUIRED ) execute_process( COMMAND "${Python3_EXECUTABLE}" -c "import cuda.tile" RESULT_VARIABLE _cutile_import_result - OUTPUT_QUIET - ERROR_QUIET + OUTPUT_QUIET ERROR_QUIET ) if(NOT _cutile_import_result EQUAL 0) message( @@ -77,8 +78,14 @@ function(_cutile_kernels_setup) file(MAKE_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}") - set(Python3_EXECUTABLE "${Python3_EXECUTABLE}" PARENT_SCOPE) - set(CUTILE_BIN2C "${CUTILE_BIN2C}" PARENT_SCOPE) + set(Python3_EXECUTABLE + "${Python3_EXECUTABLE}" + PARENT_SCOPE + ) + set(CUTILE_BIN2C + "${CUTILE_BIN2C}" + PARENT_SCOPE + ) set(_CUTILE_SETUP_OK TRUE PARENT_SCOPE @@ -87,15 +94,8 @@ endfunction() function(process_cutile_matrix_entry source_list_var) set(options) - set(one_value - KERNEL_DIR - KERNEL_BASENAME - KERNEL_PYTHON - EXPORT_SCRIPT - OUTPUT_DIRECTORY - FRAGMENT_TAG_FORMAT_CUBIN - FRAGMENT_TAG_FORMAT_TILEIR - MATRIX_JSON_ENTRY + set(one_value KERNEL_DIR KERNEL_BASENAME KERNEL_PYTHON EXPORT_SCRIPT OUTPUT_DIRECTORY + FRAGMENT_TAG_FORMAT_CUBIN FRAGMENT_TAG_FORMAT_TILEIR MATRIX_JSON_ENTRY ) set(multi_value FRAGMENT_TAG_HEADER_FILES) cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN}) @@ -116,9 +116,7 @@ function(process_cutile_matrix_entry source_list_var) message(FATAL_ERROR "Unknown cuTile register kind '${register}'") endif() - _cutile_fragment_tag_header_files( - fragment_tag_header_files ${_CUTILE_FRAGMENT_TAG_HEADER_FILES} - ) + _cutile_fragment_tag_header_files(fragment_tag_header_files ${_CUTILE_FRAGMENT_TAG_HEADER_FILES}) string(CONFIGURE "${artifact_basename}" _artifact_basename @ONLY) set(_artifact_stem "${_CUTILE_KERNEL_BASENAME}_${_artifact_basename}") @@ -145,8 +143,8 @@ function(process_cutile_matrix_entry source_list_var) add_custom_command( OUTPUT "${_embedded_header}" - COMMAND "${CUTILE_BIN2C}" --const --name ${bin2c_symbol} --static "${_artifact_file}" - > "${_embedded_header}" + COMMAND "${CUTILE_BIN2C}" --const --name ${bin2c_symbol} --static "${_artifact_file}" > + "${_embedded_header}" DEPENDS "${_artifact_file}" VERBATIM ) @@ -163,15 +161,8 @@ endfunction() function(generate_cutile_kernels source_list_var) set(options) - set(one_value - KERNEL_DIR - KERNEL_BASENAME - KERNEL_PYTHON - EXPORT_SCRIPT - OUTPUT_DIRECTORY - MATRIX_JSON_FILE - FRAGMENT_TAG_FORMAT_CUBIN - FRAGMENT_TAG_FORMAT_TILEIR + set(one_value KERNEL_DIR KERNEL_BASENAME KERNEL_PYTHON EXPORT_SCRIPT OUTPUT_DIRECTORY + MATRIX_JSON_FILE FRAGMENT_TAG_FORMAT_CUBIN FRAGMENT_TAG_FORMAT_TILEIR ) set(multi_value FRAGMENT_TAG_HEADER_FILES) cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN}) @@ -184,8 +175,7 @@ function(generate_cutile_kernels source_list_var) endif() _cutile_kernels_setup( - MATRIX_JSON_FILE "${_CUTILE_MATRIX_JSON_FILE}" - OUTPUT_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}" + MATRIX_JSON_FILE "${_CUTILE_MATRIX_JSON_FILE}" OUTPUT_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}" ) if(NOT _CUTILE_SETUP_OK) generate_cutile_kernels_stub() @@ -206,19 +196,31 @@ function(generate_cutile_kernels source_list_var) string(JSON matrix_json_entry GET "${matrix_product}" "${i}") process_cutile_matrix_entry( "${source_list_var}" - KERNEL_DIR "${_CUTILE_KERNEL_DIR}" - KERNEL_BASENAME "${_CUTILE_KERNEL_BASENAME}" - KERNEL_PYTHON "${_CUTILE_KERNEL_PYTHON}" - EXPORT_SCRIPT "${_CUTILE_EXPORT_SCRIPT}" - OUTPUT_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}" - FRAGMENT_TAG_FORMAT_CUBIN "${_CUTILE_FRAGMENT_TAG_FORMAT_CUBIN}" - FRAGMENT_TAG_FORMAT_TILEIR "${_CUTILE_FRAGMENT_TAG_FORMAT_TILEIR}" - FRAGMENT_TAG_HEADER_FILES ${_CUTILE_FRAGMENT_TAG_HEADER_FILES} - MATRIX_JSON_ENTRY "${matrix_json_entry}" + KERNEL_DIR + "${_CUTILE_KERNEL_DIR}" + KERNEL_BASENAME + "${_CUTILE_KERNEL_BASENAME}" + KERNEL_PYTHON + "${_CUTILE_KERNEL_PYTHON}" + EXPORT_SCRIPT + "${_CUTILE_EXPORT_SCRIPT}" + OUTPUT_DIRECTORY + "${_CUTILE_OUTPUT_DIRECTORY}" + FRAGMENT_TAG_FORMAT_CUBIN + "${_CUTILE_FRAGMENT_TAG_FORMAT_CUBIN}" + FRAGMENT_TAG_FORMAT_TILEIR + "${_CUTILE_FRAGMENT_TAG_FORMAT_TILEIR}" + FRAGMENT_TAG_HEADER_FILES + ${_CUTILE_FRAGMENT_TAG_HEADER_FILES} + MATRIX_JSON_ENTRY + "${matrix_json_entry}" ) endforeach() - set(CUVS_CUTILE_ENABLED 1 PARENT_SCOPE) + set(CUVS_CUTILE_ENABLED + 1 + PARENT_SCOPE + ) set(${source_list_var} "${${source_list_var}}" PARENT_SCOPE diff --git a/cpp/cmake/modules/register_cutile_fragment.cpp.in b/cpp/cmake/modules/register_cutile_fragment.cpp.in index 0fc074bdbb..3ffd5c0d0c 100644 --- a/cpp/cmake/modules/register_cutile_fragment.cpp.in +++ b/cpp/cmake/modules/register_cutile_fragment.cpp.in @@ -8,10 +8,10 @@ @fragment_tag_header_files@ -namespace { - -using fragment_tag = @fragment_tag@; -using fragment_entry = @fragment_entry_type@; + namespace +{ + using fragment_tag = @fragment_tag@; + using fragment_entry = @fragment_entry_type@; } // namespace diff --git a/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp b/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp index df69ec1d7b..6c399d860a 100644 --- a/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp +++ b/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp @@ -115,7 +115,10 @@ struct StaticTileIrBytecodeFragmentEntry final : TileIrBytecodeFragmentEntry { return StaticTileIrBytecodeFragmentEntry::data; } - size_t get_length() const override { return StaticTileIrBytecodeFragmentEntry::length; } + size_t get_length() const override + { + return StaticTileIrBytecodeFragmentEntry::length; + } const char* get_key() const override { diff --git a/cpp/include/cuvs/detail/jit_lto/tileir_compat.hpp b/cpp/include/cuvs/detail/jit_lto/tileir_compat.hpp index d63759fb36..f15407fd4c 100644 --- a/cpp/include/cuvs/detail/jit_lto/tileir_compat.hpp +++ b/cpp/include/cuvs/detail/jit_lto/tileir_compat.hpp @@ -88,8 +88,8 @@ inline bool query_current_device_arch(int& cc_major, int& cc_minor) inline bool cutile_launch_available_on_current_device() { - int cc_major = 0; - int cc_minor = 0; + int cc_major = 0; + int cc_minor = 0; int driver_version = 0; if (!query_current_device_arch(cc_major, cc_minor)) { return false; } if (!query_driver_version(driver_version)) { return false; } diff --git a/cpp/src/detail/jit_lto/TileAlgorithmPlanner.cpp b/cpp/src/detail/jit_lto/TileAlgorithmPlanner.cpp index edb6269213..e0ce77e789 100644 --- a/cpp/src/detail/jit_lto/TileAlgorithmPlanner.cpp +++ b/cpp/src/detail/jit_lto/TileAlgorithmPlanner.cpp @@ -23,9 +23,7 @@ std::shared_ptr TileAlgorithmPlanner::build() { int cc_major = 0; int cc_minor = 0; - if (!cuvs::detail::jit_lto::get_device_compute_capability(cc_major, cc_minor)) { - return nullptr; - } + if (!cuvs::detail::jit_lto::get_device_compute_capability(cc_major, cc_minor)) { return nullptr; } int driver_version = 0; if (cudaDriverGetVersion(&driver_version) != cudaSuccess) { return nullptr; } diff --git a/cpp/src/distance/detail/fused_distance_nn.cuh b/cpp/src/distance/detail/fused_distance_nn.cuh index 8b47092b58..b1b18e58f6 100644 --- a/cpp/src/distance/detail/fused_distance_nn.cuh +++ b/cpp/src/distance/detail/fused_distance_nn.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -19,8 +19,8 @@ #include "fused_distance_nn/helper_structs.cuh" #include "fused_distance_nn/simt_kernel.cuh" #include "pairwise_distance_base.cuh" // PairwiseDistances -#include #include +#include #include // raft::KeyValuePair #include // raft::identity_op #include // Policy diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/export_fused_1nn.py b/cpp/src/distance/detail/fused_distance_nn/cutile/export_fused_1nn.py index 6a20be24ef..10a4fa9ec1 100644 --- a/cpp/src/distance/detail/fused_distance_nn/cutile/export_fused_1nn.py +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/export_fused_1nn.py @@ -100,7 +100,9 @@ def export_binary( "output_format": output_format, } if output_format == "tileir_bytecode": - export_kwargs["bytecode_version"] = bytecode_version or DEFAULT_TILEIR_BYTECODE_VERSION + export_kwargs["bytecode_version"] = ( + bytecode_version or DEFAULT_TILEIR_BYTECODE_VERSION + ) export_kernel(**export_kwargs) @@ -110,14 +112,20 @@ def export_binary( def main() -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("output_file", type=Path) - parser.add_argument("--format", choices=("cubin", "tileir_bytecode"), default="cubin") - parser.add_argument("--data-type", choices=tuple(KERNELS.keys()), required=True) + parser.add_argument( + "--format", choices=("cubin", "tileir_bytecode"), default="cubin" + ) + parser.add_argument( + "--data-type", choices=tuple(KERNELS.keys()), required=True + ) parser.add_argument( "--gpu-code", default=DEFAULT_TILEIR_EXPORT_GPU_CODE, help="Target SM for cubin export, or compile hint for TileIR bytecode export", ) - parser.add_argument("--bytecode-version", default=DEFAULT_TILEIR_BYTECODE_VERSION) + parser.add_argument( + "--bytecode-version", default=DEFAULT_TILEIR_BYTECODE_VERSION + ) args = parser.parse_args() print( diff --git a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_kernel.py b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_kernel.py index 232b9506af..65fe165b70 100644 --- a/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_kernel.py +++ b/cpp/src/distance/detail/fused_distance_nn/cutile/fused_1nn_kernel.py @@ -14,17 +14,23 @@ def _make_kernel(data_type: str): - if data_type == "half": - dtype = ct.float16 - acc_dtype = ct.float32 - elif data_type == "float": - dtype = ct.float32 - acc_dtype = ct.float32 - else: + if data_type not in ("half", "float"): raise ValueError(f"Unsupported data_type {data_type!r}") + acc_dtype = ct.float32 @ct.kernel - def fused_1nn_kernel(A, B, OutIdx, OutDist, M, N, K, tm: ConstInt, tn: ConstInt, tk: ConstInt): + def fused_1nn_kernel( + A, + B, + OutIdx, + OutDist, + M, + N, + K, + tm: ConstInt, + tn: ConstInt, + tk: ConstInt, + ): bidm = ct.bid(0) best_dist = ct.full((tm,), -3.4e38, acc_dtype) @@ -38,8 +44,12 @@ def fused_1nn_kernel(A, B, OutIdx, OutDist, M, N, K, tm: ConstInt, tn: ConstInt, accumulator = ct.full((tm, tn), 0, dtype=acc_dtype) for k in range(num_tiles_k): - a = ct.load(A, index=(bidm, k), shape=(tm, tk), padding_mode=zero_pad) - b_T = ct.load(B, index=(n, k), shape=(tn, tk), padding_mode=zero_pad) + a = ct.load( + A, index=(bidm, k), shape=(tm, tk), padding_mode=zero_pad + ) + b_T = ct.load( + B, index=(n, k), shape=(tn, tk), padding_mode=zero_pad + ) accumulator = ct.mma(a, ct.transpose(b_T), accumulator) curr_max = ct.max(accumulator, axis=1) diff --git a/cpp/tests/neighbors/distance_nn_helper.cuh b/cpp/tests/neighbors/distance_nn_helper.cuh index 422879918f..ea440387b4 100644 --- a/cpp/tests/neighbors/distance_nn_helper.cuh +++ b/cpp/tests/neighbors/distance_nn_helper.cuh @@ -91,8 +91,8 @@ RAFT_KERNEL ref_nn_kernel( if (metric == DistanceType::InnerProduct) { AccT score = inner_product_score(&A[m * K], &B[n * K], K); if (score > best_score) { - best_score = score; - best_index = n; + best_score = score; + best_index = n; } continue; } diff --git a/python/libcuvs/pyproject.toml b/python/libcuvs/pyproject.toml index 5025daa66d..b4e848304f 100644 --- a/python/libcuvs/pyproject.toml +++ b/python/libcuvs/pyproject.toml @@ -19,6 +19,7 @@ authors = [ license = "Apache-2.0" requires-python = ">=3.11" dependencies = [ + "cuda-tile[tileiras]", "cuda-toolkit[cublas,curand,cusolver,cusparse,nvrtc]==13.*", "libraft==26.8.*,>=0.0.0a0", "librmm==26.8.*,>=0.0.0a0",