Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions conda/recipes/libcuvs/recipe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ cache:
- cuda-cudart-dev
- cuda-nvrtc-dev
- cuda-profiler-api
- cutile-python
- libcublas-dev
- libcurand-dev
- libcusolver-dev
Expand Down Expand Up @@ -117,6 +118,7 @@ outputs:
- cuda-cudart-dev
- cuda-nvrtc-dev
- cuda-profiler-api
- cutile-python
- libcublas-dev
- libcurand-dev
- libcusolver-dev
Expand Down Expand Up @@ -179,6 +181,7 @@ outputs:
- cuda-cudart-dev
- cuda-nvrtc-dev
- cuda-profiler-api
- cutile-python
- libcublas-dev
- libcurand-dev
- libcusolver-dev
Expand Down Expand Up @@ -240,6 +243,7 @@ outputs:
- cuda-cudart-dev
- cuda-nvrtc-dev
- cuda-profiler-api
- cutile-python
- libcublas-dev
- libcurand-dev
- libcusolver-dev
Expand Down Expand Up @@ -299,6 +303,7 @@ outputs:
- openblas # required by some CPU algos in benchmarks
- cuda-cudart-dev
- cuda-profiler-api
- cutile-python
- libcublas-dev
- libcurand-dev
- libcusolver-dev
Expand Down
45 changes: 43 additions & 2 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -957,6 +957,41 @@ if(NOT BUILD_CPU_ONLY)
OUTPUT_FILE_FORMAT
"${CMAKE_CURRENT_BINARY_DIR}/src/distance/detail/pairwise_matrix/dispatch_rbf_inst_data_@data_abbrev@_acc_@acc_abbrev@_out_@out_abbrev@_index_@index_abbrev@_op_@op_abbrev@.cu"
)

# Bring in generate_cutile_kernels(), which turns the cuTile Python kernel into embedded sources.
include(cmake/modules/generate_cutile_kernels.cmake)
# Source-tree directory holding the fused 1-NN cuTile kernel, its export script and matrix JSON.
set(fused_1nn_cutile_dir
"${CMAKE_CURRENT_SOURCE_DIR}/src/distance/detail/fused_distance_nn/cutile"
)
# Build-tree directory that receives the generated artifacts, headers and registration sources.
set(cutile_fused_1nn_generated_dir
"${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/distance/fused_1nn/cutile"
)
# Collect the generated files in cutile_fused_1nn_files. The @...@ placeholders inside the fragment
# tag formats are expanded per matrix entry by the module, not here.
generate_cutile_kernels(
cutile_fused_1nn_files
KERNEL_DIR
"${fused_1nn_cutile_dir}"
KERNEL_BASENAME
"fused_1nn"
KERNEL_PYTHON
"fused_1nn_kernel.py"
EXPORT_SCRIPT
"export_fused_1nn.py"
OUTPUT_DIRECTORY
"${cutile_fused_1nn_generated_dir}"
MATRIX_JSON_FILE
"${fused_1nn_cutile_dir}/fused_1nn_cutile_matrix.json"
FRAGMENT_TAG_FORMAT_CUBIN
"cuvs::distance::detail::fragment_tag_fused_1nn_cubin<cuvs::neighbors::detail::tag_@data_abbrev@, cuvs::detail::jit_lto::@arch_tag@>"
FRAGMENT_TAG_FORMAT_TILEIR
"cuvs::distance::detail::fragment_tag_fused_1nn_tileir<cuvs::neighbors::detail::tag_@data_abbrev@>"
FRAGMENT_TAG_HEADER_FILES
"<cuvs/detail/jit_lto/fused_distance_nn/fused_1nn_fragments.hpp>"
"<cuvs/detail/jit_lto/cutile_arch_tags.hpp>"
"<cuvs/detail/jit_lto/common_fragments.hpp>"
)
# Fall back to 0 when the call above left CUVS_CUTILE_ENABLED undefined in this scope, so the
# compile definition below always expands to a value.
if(NOT DEFINED CUVS_CUTILE_ENABLED)
set(CUVS_CUTILE_ENABLED 0)
endif()
# INTERFACE: the definition is passed on to targets that consume cuvs_cpp_headers.
target_compile_definitions(cuvs_cpp_headers INTERFACE CUVS_CUTILE_ENABLED=${CUVS_CUTILE_ENABLED})
generate_inst_matrix(
cagra_build_inst_files
MATRIX_JSON_FILE "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/cagra_build_matrix.json"
Expand Down Expand Up @@ -1147,6 +1182,8 @@ if(NOT BUILD_CPU_ONLY)
src/util/host_memory.cpp
src/detail/jit_lto/AlgorithmLauncher.cpp
src/detail/jit_lto/AlgorithmPlanner.cpp
src/detail/jit_lto/LTOAlgorithmPlanner.cpp
src/detail/jit_lto/TileAlgorithmPlanner.cpp
src/detail/jit_lto/FragmentEntry.cpp
src/detail/jit_lto/nvjitlink_checker.cpp
src/detail/jit_lto/NVRTCLTOFragmentCompiler.cpp
Expand Down Expand Up @@ -1234,6 +1271,8 @@ if(NOT BUILD_CPU_ONLY)
src/stats/trustworthiness_score.cu
${CUVS_MG_ALGOS}
${jit_lto_files}
${cutile_fused_1nn_files}
$<$<BOOL:${CUVS_CUTILE_ENABLED}>:src/distance/detail/fused_distance_nn/cutile/fused_1nn_tile.cu>
)

set_target_properties(
Expand All @@ -1255,8 +1294,9 @@ if(NOT BUILD_CPU_ONLY)
)

target_compile_definitions(
cuvs_objs PRIVATE $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
$<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED>
cuvs_objs
PRIVATE $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:CUVS_BUILD_CAGRA_HNSWLIB>
$<$<BOOL:${CUVS_NVTX}>:NVTX_ENABLED> CUVS_CUTILE_ENABLED=${CUVS_CUTILE_ENABLED}
)

target_link_libraries(
Expand All @@ -1275,6 +1315,7 @@ if(NOT BUILD_CPU_ONLY)
"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../c/include>"
INTERFACE "$<INSTALL_INTERFACE:include>"
PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src" "${CMAKE_CURRENT_BINARY_DIR}/src"
"${cutile_fused_1nn_generated_dir}"
)

# Endian detection
Expand Down
228 changes: 228 additions & 0 deletions cpp/cmake/modules/generate_cutile_kernels.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
# =============================================================================
# cmake-format: off
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
# cmake-format: on
# =============================================================================

include_guard(GLOBAL)

include(${CMAKE_CURRENT_LIST_DIR}/compute_matrix_product.cmake)

# Mark cuTile kernel support as disabled.
#
# Writes CUVS_CUTILE_ENABLED=0 into the scope of the immediate caller. PARENT_SCOPE reaches exactly
# one level up, so a function that calls this stub must forward the value itself if its own caller
# needs to see it.
function(generate_cutile_kernels_stub)
  set(CUVS_CUTILE_ENABLED 0 PARENT_SCOPE)
endfunction()

# Render a list of header names as a block of C/C++ #include lines.
#
# output_var : name of the caller-scope variable that receives the text.
# ARGN       : header names. A name already wrapped in "..." or <...> is emitted unchanged; any
#              other name is wrapped in double quotes.
#
# Each header produces one "#include <name>" line terminated by a newline. With no headers the
# result is the empty string.
function(_cutile_fragment_tag_header_files output_var)
  set(_include_lines "")
  foreach(_spec IN LISTS ARGN)
    if(_spec MATCHES "^(\".*\"|<.*>)$")
      # Delimiters supplied by the caller are kept as written.
      string(APPEND _include_lines "#include ${_spec}\n")
    else()
      string(APPEND _include_lines "#include \"${_spec}\"\n")
    endif()
  endforeach()
  set(${output_var} "${_include_lines}" PARENT_SCOPE)
endfunction()

# Check the configure-time prerequisites for cuTile kernel generation.
#
# Keyword arguments:
#   MATRIX_JSON_FILE  - matrix description; registered so that editing it re-runs CMake.
#   OUTPUT_DIRECTORY  - directory created to hold the generated files.
#
# Results written to the caller's scope:
#   _CUTILE_SETUP_OK   - FALSE when the CUDA toolkit is older than 13.0 (generation is skipped),
#                        TRUE when every prerequisite was found.
#   Python3_EXECUTABLE - interpreter located by find_package(Python3); only on success.
#   CUTILE_BIN2C       - path to bin2c; only on success.
#
# A missing Python interpreter, CUDA toolkit, bin2c or cuda.tile module is a fatal configure error.
function(_cutile_kernels_setup)
  cmake_parse_arguments(_CUTILE "" "MATRIX_JSON_FILE;OUTPUT_DIRECTORY" "" ${ARGN})

  find_package(Python3 REQUIRED COMPONENTS Interpreter)
  find_package(CUDAToolkit REQUIRED)

  # Older toolkits are not an error: report and let the caller fall back to the disabled path.
  if(NOT CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13.0)
    message(
      STATUS
        "cuTile embedded kernels require CUDA 13.0+; skipping cuTile generation (found ${CUDAToolkit_VERSION})."
    )
    set(_CUTILE_SETUP_OK FALSE PARENT_SCOPE)
    return()
  endif()

  find_program(
    CUTILE_BIN2C
    NAMES bin2c
    PATHS ${CUDAToolkit_BIN_DIR} REQUIRED
  )

  # Probe for the cuTile Python package by importing it; all interpreter output is discarded.
  execute_process(
    COMMAND "${Python3_EXECUTABLE}" -c "import cuda.tile"
    RESULT_VARIABLE _cuda_tile_probe_status
    OUTPUT_QUIET ERROR_QUIET
  )
  if(NOT _cuda_tile_probe_status EQUAL 0)
    message(
      FATAL_ERROR
        "cuda.tile (cuTile Python) is required to build cuTile embedded kernels. "
        "Install it in the active Python environment, e.g. pip install cuda-tile[tileiras]."
    )
  endif()

  # Reconfigure whenever the matrix description changes.
  set_property(
    DIRECTORY
    APPEND
    PROPERTY CMAKE_CONFIGURE_DEPENDS "${_CUTILE_MATRIX_JSON_FILE}"
  )

  file(MAKE_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}")

  # Hand the discovered tools and the success flag back to the caller.
  set(Python3_EXECUTABLE "${Python3_EXECUTABLE}" PARENT_SCOPE)
  set(CUTILE_BIN2C "${CUTILE_BIN2C}" PARENT_SCOPE)
  set(_CUTILE_SETUP_OK TRUE PARENT_SCOPE)
endfunction()

# Create the build rules and the registration source for one entry of the cuTile kernel matrix.
#
# source_list_var : name of the caller-scope list that receives the generated embedded header and
#                   registration .cpp of this entry (appended to its current contents).
#
# Keyword arguments:
#   KERNEL_DIR                  - directory containing the export script and kernel Python file;
#                                 also the working directory of the export command.
#   KERNEL_BASENAME             - prefix of every generated file name.
#   KERNEL_PYTHON               - kernel Python file, tracked as a dependency of the export step.
#   EXPORT_SCRIPT               - Python script run to produce the kernel artifact.
#   OUTPUT_DIRECTORY            - directory receiving the artifact, header and .cpp.
#   FRAGMENT_TAG_FORMAT_CUBIN   - @var@ template of the fragment tag for "cubin" entries.
#   FRAGMENT_TAG_FORMAT_TILEIR  - @var@ template of the fragment tag for "tileir" entries.
#   FRAGMENT_TAG_HEADER_FILES   - headers to #include in the registration source.
#   MATRIX_JSON_ENTRY           - JSON of this matrix entry, handed to populate_matrix_variables().
#
# The variables register, artifact_basename, artifact_ext, output_format, data_type, gpu_code and
# the optional bytecode_version are read below without being set in this function; presumably
# populate_matrix_variables() (defined elsewhere) provides them from the entry - verify there.
function(process_cutile_matrix_entry source_list_var)
  set(options)
  set(one_value
      KERNEL_DIR
      KERNEL_BASENAME
      KERNEL_PYTHON
      EXPORT_SCRIPT
      OUTPUT_DIRECTORY
      FRAGMENT_TAG_FORMAT_CUBIN
      FRAGMENT_TAG_FORMAT_TILEIR
      MATRIX_JSON_ENTRY
  )
  set(multi_value FRAGMENT_TAG_HEADER_FILES)
  cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN})

  find_package(Python3 REQUIRED COMPONENTS Interpreter)

  populate_matrix_variables("${_CUTILE_MATRIX_JSON_ENTRY}")

  # Reject unknown kinds first, then pick the tag template, bin2c symbol and entry type.
  if(NOT register STREQUAL "cubin" AND NOT register STREQUAL "tileir")
    message(FATAL_ERROR "Unknown cuTile register kind '${register}'")
  endif()
  if(register STREQUAL "tileir")
    string(CONFIGURE "${_CUTILE_FRAGMENT_TAG_FORMAT_TILEIR}" fragment_tag @ONLY)
    set(bin2c_symbol embedded_tileir)
    set(fragment_entry_type "StaticTileIrBytecodeFragmentEntry<fragment_tag>")
  else()
    string(CONFIGURE "${_CUTILE_FRAGMENT_TAG_FORMAT_CUBIN}" fragment_tag @ONLY)
    set(bin2c_symbol embedded_cubin)
    set(fragment_entry_type "StaticCubinFragmentEntry<fragment_tag>")
  endif()

  _cutile_fragment_tag_header_files(fragment_tag_header_files ${_CUTILE_FRAGMENT_TAG_HEADER_FILES})

  # Derive the per-entry file names. embedded_header_file is the bare name consumed by the
  # registration template; the other paths are absolute inside OUTPUT_DIRECTORY.
  string(CONFIGURE "${artifact_basename}" _artifact_basename @ONLY)
  set(_stem "${_CUTILE_KERNEL_BASENAME}_${_artifact_basename}")
  set(embedded_header_file "${_stem}_${register}.h")
  set(_exported_artifact "${_CUTILE_OUTPUT_DIRECTORY}/${_stem}.${artifact_ext}")
  set(_header_path "${_CUTILE_OUTPUT_DIRECTORY}/${embedded_header_file}")
  set(_registration_source "${_CUTILE_OUTPUT_DIRECTORY}/${_stem}_${register}.cpp")

  set(_export_args
      --format "${output_format}"
      --data-type "${data_type}"
      --gpu-code "${gpu_code}"
  )
  if(DEFINED bytecode_version AND NOT "${bytecode_version}" STREQUAL "")
    list(APPEND _export_args --bytecode-version "${bytecode_version}")
  endif()

  # Step 1: run the export script to produce the kernel artifact.
  add_custom_command(
    OUTPUT "${_exported_artifact}"
    COMMAND "${Python3_EXECUTABLE}" "${_CUTILE_KERNEL_DIR}/${_CUTILE_EXPORT_SCRIPT}"
            "${_exported_artifact}" ${_export_args}
    WORKING_DIRECTORY "${_CUTILE_KERNEL_DIR}"
    DEPENDS "${_CUTILE_KERNEL_DIR}/${_CUTILE_EXPORT_SCRIPT}"
            "${_CUTILE_KERNEL_DIR}/${_CUTILE_KERNEL_PYTHON}"
    COMMENT "Exporting cuTile ${_CUTILE_KERNEL_BASENAME} ${output_format} ${data_type}"
    VERBATIM
  )

  # Step 2: convert the artifact into a C array header; bin2c writes to stdout, hence the redirect.
  add_custom_command(
    OUTPUT "${_header_path}"
    COMMAND "${CUTILE_BIN2C}" --const --name ${bin2c_symbol} --static "${_exported_artifact}" >
            "${_header_path}"
    DEPENDS "${_exported_artifact}"
    VERBATIM
  )

  # Step 3: instantiate the registration source from the template next to this module.
  configure_file(
    "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/register_cutile_fragment.cpp.in" "${_registration_source}"
    @ONLY
  )

  # Return the caller's list extended with this entry's header and registration source.
  set(_cutile_sources_accum "${${source_list_var}}")
  list(APPEND _cutile_sources_accum "${_header_path}" "${_registration_source}")
  set(${source_list_var} "${_cutile_sources_accum}" PARENT_SCOPE)
endfunction()

# Generate embedded cuTile kernel sources for every entry of a kernel matrix.
#
# source_list_var : name of the caller-scope variable that receives the list of generated files
#                   (embedded headers and registration .cpp files). Set to "" when cuTile
#                   generation is skipped.
#
# Keyword arguments:
#   KERNEL_DIR                  - (required) directory with the export script and kernel Python.
#   KERNEL_BASENAME             - (required) prefix of every generated file name.
#   EXPORT_SCRIPT               - (required) Python script, relative to KERNEL_DIR, that exports
#                                 one kernel artifact.
#   OUTPUT_DIRECTORY            - (required) directory for all generated files.
#   MATRIX_JSON_FILE            - (required) matrix description expanded by
#                                 compute_matrix_product().
#   KERNEL_PYTHON               - kernel Python file relative to KERNEL_DIR; defaults to
#                                 "fused_1nn_kernel.py".
#   FRAGMENT_TAG_FORMAT_CUBIN   - @var@ template of the fragment tag for "cubin" entries.
#   FRAGMENT_TAG_FORMAT_TILEIR  - @var@ template of the fragment tag for "tileir" entries.
#   FRAGMENT_TAG_HEADER_FILES   - headers to #include in each registration source.
#
# Also sets CUVS_CUTILE_ENABLED in the caller's scope: 1 when kernels were generated, 0 when
# generation was skipped because _cutile_kernels_setup() reported the prerequisites as unmet.
function(generate_cutile_kernels source_list_var)
  set(options)
  set(one_value KERNEL_DIR KERNEL_BASENAME KERNEL_PYTHON EXPORT_SCRIPT OUTPUT_DIRECTORY
                MATRIX_JSON_FILE FRAGMENT_TAG_FORMAT_CUBIN FRAGMENT_TAG_FORMAT_TILEIR
  )
  set(multi_value FRAGMENT_TAG_HEADER_FILES)
  cmake_parse_arguments(_CUTILE "${options}" "${one_value}" "${multi_value}" ${ARGN})

  # Each of these ends up in a path or file name below; an empty value would silently produce
  # paths such as "/<script>" instead of a clear error.
  foreach(_required IN ITEMS KERNEL_DIR KERNEL_BASENAME EXPORT_SCRIPT OUTPUT_DIRECTORY
                             MATRIX_JSON_FILE
  )
    if("${_CUTILE_${_required}}" STREQUAL "")
      message(FATAL_ERROR "generate_cutile_kernels: ${_required} is required")
    endif()
  endforeach()
  # NOTE(review): this default is specific to the fused 1-NN kernel; other kernels should pass
  # KERNEL_PYTHON explicitly.
  if(NOT _CUTILE_KERNEL_PYTHON)
    set(_CUTILE_KERNEL_PYTHON "fused_1nn_kernel.py")
  endif()

  _cutile_kernels_setup(
    MATRIX_JSON_FILE "${_CUTILE_MATRIX_JSON_FILE}" OUTPUT_DIRECTORY "${_CUTILE_OUTPUT_DIRECTORY}"
  )
  if(NOT _CUTILE_SETUP_OK)
    # The stub writes CUVS_CUTILE_ENABLED with PARENT_SCOPE, which lands in *this* function's
    # scope. Forward it one more level so the caller sees the disabled state even when the
    # variable was already defined there.
    generate_cutile_kernels_stub()
    set(CUVS_CUTILE_ENABLED "${CUVS_CUTILE_ENABLED}" PARENT_SCOPE)
    set(${source_list_var} "" PARENT_SCOPE)
    return()
  endif()

  compute_matrix_product(matrix_product MATRIX_JSON_FILE "${_CUTILE_MATRIX_JSON_FILE}")

  string(JSON len LENGTH "${matrix_product}")
  # An empty product would give foreach(RANGE) a stop value of -1, which it reports as an
  # incorrect range specification; fail with a message that names the actual cause instead.
  if(len EQUAL 0)
    message(
      FATAL_ERROR
        "generate_cutile_kernels: '${_CUTILE_MATRIX_JSON_FILE}' expands to no kernel variants"
    )
  endif()
  math(EXPR last "${len} - 1")

  # cmake-lint: disable=C0103,E1120
  foreach(i RANGE "${last}")
    string(JSON matrix_json_entry GET "${matrix_product}" "${i}")
    # Each call appends its generated files to ${source_list_var} in this scope.
    process_cutile_matrix_entry(
      "${source_list_var}"
      KERNEL_DIR
      "${_CUTILE_KERNEL_DIR}"
      KERNEL_BASENAME
      "${_CUTILE_KERNEL_BASENAME}"
      KERNEL_PYTHON
      "${_CUTILE_KERNEL_PYTHON}"
      EXPORT_SCRIPT
      "${_CUTILE_EXPORT_SCRIPT}"
      OUTPUT_DIRECTORY
      "${_CUTILE_OUTPUT_DIRECTORY}"
      FRAGMENT_TAG_FORMAT_CUBIN
      "${_CUTILE_FRAGMENT_TAG_FORMAT_CUBIN}"
      FRAGMENT_TAG_FORMAT_TILEIR
      "${_CUTILE_FRAGMENT_TAG_FORMAT_TILEIR}"
      FRAGMENT_TAG_HEADER_FILES
      ${_CUTILE_FRAGMENT_TAG_HEADER_FILES}
      MATRIX_JSON_ENTRY
      "${matrix_json_entry}"
    )
  endforeach()

  set(CUVS_CUTILE_ENABLED
      1
      PARENT_SCOPE
  )
  set(${source_list_var}
      "${${source_list_var}}"
      PARENT_SCOPE
  )
endfunction()
22 changes: 22 additions & 0 deletions cpp/cmake/modules/register_cutile_fragment.cpp.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/*
 * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
 * SPDX-License-Identifier: Apache-2.0
 */

// Template for one generated cuTile fragment registration source. CMake's configure_file fills in
// the at-sign placeholders once per kernel matrix entry.

// Generated header that provides the embedded kernel bytes as a C array.
#include "@embedded_header_file@"
#include <cuvs/detail/jit_lto/FragmentEntry.hpp>

// uint8_t and size_t are used below; include their headers directly instead of relying on
// whatever the headers above happen to pull in.
#include <cstddef>
#include <cstdint>

// Headers declaring the fragment tag type named below.
@fragment_tag_header_files@

namespace
{
using fragment_tag = @fragment_tag@;
using fragment_entry = @fragment_entry_type@;

}  // namespace

// Point the entry's static data pointer at the embedded byte array.
template <>
const uint8_t* const fragment_entry::data = @bin2c_symbol@;

// Size of the embedded byte array in bytes.
template <>
const size_t fragment_entry::length = sizeof(@bin2c_symbol@);
Loading
Loading