cdef class CUDAArray:
    # C-level attribute layout of CUDAArray. This is the declaration only; the
    # methods that read and write these fields live in the matching _array.pyx.

    cdef:
        # Driver handle of the wrapped array. close() and __dealloc__ reset it
        # to NULL once the array has been released (or disowned).
        cydriver.CUarray _handle
        tuple _shape  # (w,), (w, h), or (w, h, d)
        # Element format as the raw driver enum; the Python-facing ``format``
        # property converts it to ArrayFormat.
        cydriver.CUarray_format _format
        unsigned int _num_channels  # 1, 2, or 4
        # Device ordinal recorded when the wrapper was built (the current
        # device, or the explicit ``device_id`` given to _from_handle).
        int _device_id
        # True when this object is responsible for cuArrayDestroy; non-owning
        # wrappers never destroy the handle.
        bint _owning
        # True when the array carries the CUDA_ARRAY3D_SURFACE_LDST flag.
        bint _surface_load_store
        # Optional strong reference to a parent owner (e.g. a MipmappedArray
        # whose level this CUDAArray views). When set, the parent must outlive
        # this CUDAArray because the underlying CUarray belongs to the parent.
        object _parent_ref

    # cpdef so that both Cython callers and Python callers can release the
    # array explicitly.
    cpdef close(self)
+ + Parameters + ---------- + shape : tuple of int + ``(width,)``, ``(width, height)``, or ``(width, height, depth)`` + in elements. + format : ArrayFormat + Element format. + num_channels : int + Channels per element. Must be 1, 2, or 4. + is_surface_load_store : bool + If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so the array + can be bound as a :class:`SurfaceObject` for kernel-side writes. + Default False. + + Returns + ------- + CUDAArray + """ + + @classmethod + def _from_handle(cls, handle: int, owning: bool, *, device_id=None): + """Wrap an externally-allocated ``CUarray``. + + Intended for graphics interop (``cuGraphicsSubResourceGetMappedArray``) + where the array is owned by the graphics API. With ``owning=False``, + :meth:`close` and ``__dealloc__`` will not free the handle. Shape, + format, and channel count are queried from the driver. + """ + + @property + def handle(self): + """The underlying ``CUarray`` as an integer.""" + + @property + def shape(self): + """Allocation shape, in elements.""" + + @property + def format(self): + """The element :class:`ArrayFormat`.""" + + @property + def num_channels(self): + """Channels per element (1, 2, or 4).""" + + @property + def element_size(self): + """Bytes per element (format size * channels).""" + + @property + def device(self): + """The :class:`Device` this array was allocated on.""" + + @property + def is_surface_load_store(self): + """True if this array was created with ``CUDA_ARRAY3D_SURFACE_LDST`` + and can be bound as a :class:`SurfaceObject`.""" + + def _extent_bytes(self): + """Return (width_bytes, height, depth) for cuMemcpy3D, with height/depth + normalized to >=1 for lower-rank arrays.""" + + def copy_from(self, src, *, stream): + """Copy a full-array's worth of data into this array. + + Parameters + ---------- + src : Buffer or buffer-protocol object + Source data. Must contain at least ``self.size_bytes`` bytes + of contiguous data. + stream : Stream + Stream to issue the copy on. 
+ """ + + def copy_to(self, dst, *, stream): + """Copy a full-array's worth of data out of this array. + + Parameters + ---------- + dst : Buffer or writable buffer-protocol object + Destination. Must have at least ``self.size_bytes`` bytes of + writable, contiguous space. + stream : Stream + Stream to issue the copy on. + """ + + @property + def size_bytes(self): + """Total bytes of array storage (``prod(shape) * element_size``).""" + + def __dealloc__(self): + ... + + def __enter__(self): + ... + + def __exit__(self, exc_type, exc, tb): + ... + + def __repr__(self): + ... +_FORMAT_ELEM_SIZE = {int(ArrayFormat.UINT8): 1, int(ArrayFormat.INT8): 1, int(ArrayFormat.UINT16): 2, int(ArrayFormat.INT16): 2, int(ArrayFormat.FLOAT16): 2, int(ArrayFormat.UINT32): 4, int(ArrayFormat.INT32): 4, int(ArrayFormat.FLOAT32): 4} + +def _validate_format_channels(format, num_channels): + """Validate the ``(format, num_channels)`` pair shared by the array, + mipmap, and texture factories. Raises on an invalid combination.""" + +def _validate_array_shape(shape): + """Coerce ``shape`` to a tuple of ints and validate rank (1-3) and that + every extent is >= 1. Returns the normalized tuple.""" \ No newline at end of file diff --git a/cuda_core/cuda/core/_array.pyx b/cuda_core/cuda/core/_array.pyx new file mode 100644 index 00000000000..66420ffc471 --- /dev/null +++ b/cuda_core/cuda/core/_array.pyx @@ -0,0 +1,448 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +cimport cpython +from libc.stdint cimport intptr_t +from libc.string cimport memset + +from cuda.bindings cimport cydriver +from cuda.core._memory._buffer cimport Buffer +from cuda.core._stream cimport Stream +from cuda.core._utils.cuda_utils cimport ( + HANDLE_RETURN, + _get_current_device_id, +) + +from enum import IntEnum + + +class ArrayFormat(IntEnum): + """Element format for a :class:`CUDAArray` allocation. + + Mirrors ``CUarray_format`` from the CUDA driver API. + """ + UINT8 = cydriver.CU_AD_FORMAT_UNSIGNED_INT8 + UINT16 = cydriver.CU_AD_FORMAT_UNSIGNED_INT16 + UINT32 = cydriver.CU_AD_FORMAT_UNSIGNED_INT32 + INT8 = cydriver.CU_AD_FORMAT_SIGNED_INT8 + INT16 = cydriver.CU_AD_FORMAT_SIGNED_INT16 + INT32 = cydriver.CU_AD_FORMAT_SIGNED_INT32 + FLOAT16 = cydriver.CU_AD_FORMAT_HALF + FLOAT32 = cydriver.CU_AD_FORMAT_FLOAT + + +# Bytes per element (single channel) for each format. +_FORMAT_ELEM_SIZE = { + int(ArrayFormat.UINT8): 1, + int(ArrayFormat.INT8): 1, + int(ArrayFormat.UINT16): 2, + int(ArrayFormat.INT16): 2, + int(ArrayFormat.FLOAT16): 2, + int(ArrayFormat.UINT32): 4, + int(ArrayFormat.INT32): 4, + int(ArrayFormat.FLOAT32): 4, +} + + +def _validate_format_channels(format, num_channels): + """Validate the ``(format, num_channels)`` pair shared by the array, + mipmap, and texture factories. Raises on an invalid combination.""" + if not isinstance(format, ArrayFormat): + raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}") + if isinstance(num_channels, bool) or num_channels not in (1, 2, 4): + raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}") + + +def _validate_array_shape(shape): + """Coerce ``shape`` to a tuple of ints and validate rank (1-3) and that + every extent is >= 1. 
Returns the normalized tuple.""" + try: + shape_t = tuple(int(s) for s in shape) + except TypeError as e: + raise TypeError(f"shape must be a tuple of ints, got {type(shape).__name__}") from e + if not 1 <= len(shape_t) <= 3: + raise ValueError(f"shape rank must be 1, 2, or 3, got {len(shape_t)}") + for i, dim in enumerate(shape_t): + if dim < 1: + raise ValueError(f"shape[{i}] must be >= 1, got {dim}") + return shape_t + + +cdef void _fill_array_endpoint( + cydriver.CUDA_MEMCPY3D* p, CUDAArray arr, bint is_src +) noexcept: + """Populate the src or dst array fields of a CUDA_MEMCPY3D struct.""" + if is_src: + p.srcMemoryType = cydriver.CU_MEMORYTYPE_ARRAY + p.srcArray = arr._handle + p.srcXInBytes = 0 + p.srcY = 0 + p.srcZ = 0 + else: + p.dstMemoryType = cydriver.CU_MEMORYTYPE_ARRAY + p.dstArray = arr._handle + p.dstXInBytes = 0 + p.dstY = 0 + p.dstZ = 0 + + +cdef int _fill_host_endpoint( + cydriver.CUDA_MEMCPY3D* p, + object obj, + bint is_src, + size_t width_bytes, + size_t height, + size_t required, + cpython.Py_buffer* pybuf_out, +) except -1: + """Populate src/dst host fields from a buffer-protocol ``obj``. + + Acquires a Py_buffer view; the caller is responsible for releasing it + (this function always returns with the view held when it returns 1). 
+ """ + cdef int flags = cpython.PyBUF_SIMPLE + if not is_src: + flags |= cpython.PyBUF_WRITABLE + if cpython.PyObject_GetBuffer(obj, pybuf_out, flags) != 0: + raise TypeError( + f"Source/destination must be a Buffer or a contiguous " + f"buffer-protocol object, got {type(obj).__name__}" + ) + if pybuf_out.len < required: + cpython.PyBuffer_Release(pybuf_out) + raise ValueError( + f"Host buffer has {pybuf_out.len} bytes, smaller than the array " + f"extent ({required} bytes)" + ) + if is_src: + p.srcMemoryType = cydriver.CU_MEMORYTYPE_HOST + p.srcHost = pybuf_out.buf + p.srcPitch = width_bytes + p.srcHeight = height + p.srcXInBytes = 0 + p.srcY = 0 + p.srcZ = 0 + else: + p.dstMemoryType = cydriver.CU_MEMORYTYPE_HOST + p.dstHost = pybuf_out.buf + p.dstPitch = width_bytes + p.dstHeight = height + p.dstXInBytes = 0 + p.dstY = 0 + p.dstZ = 0 + return 1 + + +cdef int _fill_linear_endpoint( + cydriver.CUDA_MEMCPY3D* p, + object obj, + bint is_src, + size_t width_bytes, + size_t height, + size_t depth, + cpython.Py_buffer* pybuf_out, +) except -1: + """Populate the src or dst linear fields. Returns 1 if pybuf_out was + filled (caller must release it), 0 otherwise. 
cdef _copy3d(CUDAArray arr, object other, object stream, bint to_array):
    """Issue a full-array async 3D memcpy between ``arr`` and ``other``.

    Parameters
    ----------
    arr : CUDAArray
        The array endpoint of the copy.
    other : Buffer or buffer-protocol object
        The linear endpoint. A ``Buffer`` is used as device memory; anything
        else is accessed through the buffer protocol as host memory.
    stream : Stream
        Stream the copy is enqueued on.
    to_array : bint
        True copies *into* ``arr`` (``other`` is the source); False copies
        *out of* ``arr`` (``other`` is the destination).

    Raises
    ------
    TypeError
        If ``stream`` is not a ``Stream``.
    """
    cdef cydriver.CUDA_MEMCPY3D params
    cdef cpython.Py_buffer pybuf
    cdef int got_buffer = 0
    cdef intptr_t stream_handle
    cdef cydriver.CUstream c_stream

    if not isinstance(stream, Stream):
        raise TypeError(f"stream must be a Stream, got {type(stream).__name__}")

    # Zero the whole struct so every field not set below (offsets, LOD,
    # reserved members) has a defined value.
    memset(&params, 0, sizeof(params))
    width_bytes, height, depth = arr._extent_bytes()
    params.WidthInBytes = width_bytes
    params.Height = height
    params.Depth = depth

    try:
        if to_array:
            got_buffer = _fill_linear_endpoint(
                &params, other, True, width_bytes, height, depth, &pybuf
            )
            _fill_array_endpoint(&params, arr, False)
        else:
            _fill_array_endpoint(&params, arr, True)
            got_buffer = _fill_linear_endpoint(
                &params, other, False, width_bytes, height, depth, &pybuf
            )

        # The stream handle is exposed as a Python integer; it must be cast
        # back to the driver's pointer type before the nogil call.
        stream_handle = int(stream.handle)
        c_stream = <cydriver.CUstream>stream_handle
        with nogil:
            HANDLE_RETURN(cydriver.cuMemcpy3DAsync(&params, c_stream))
    finally:
        # got_buffer is only non-zero when _fill_linear_endpoint acquired a
        # Py_buffer view, so the view is released exactly once.
        # NOTE(review): the view is released as soon as the copy is enqueued.
        # Presumably fine for pageable host memory (the driver stages or
        # completes such copies before returning) -- confirm whether pinned
        # host buffers can reach this path and need to stay alive until the
        # stream has finished.
        if got_buffer:
            cpython.PyBuffer_Release(&pybuf)
    @classmethod
    def from_descriptor(cls, *, shape, format, num_channels, is_surface_load_store=False):
        """Allocate a new CUDA array.

        Parameters
        ----------
        shape : tuple of int
            ``(width,)``, ``(width, height)``, or ``(width, height, depth)``
            in elements.
        format : ArrayFormat
            Element format.
        num_channels : int
            Channels per element. Must be 1, 2, or 4.
        is_surface_load_store : bool
            If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so the array
            can be bound as a :class:`SurfaceObject` for kernel-side writes.
            Default False.

        Returns
        -------
        CUDAArray
            An owning wrapper; the device id of the current device is
            recorded on it.

        Raises
        ------
        TypeError
            If ``format`` is not an :class:`ArrayFormat` or ``shape`` cannot
            be converted to a tuple of ints.
        ValueError
            If ``num_channels`` is not 1, 2, or 4, the rank is not 1-3, or an
            extent is smaller than 1.
        """
        # Validate before touching the driver so bad arguments never allocate.
        _validate_format_channels(format, num_channels)
        shape_t = _validate_array_shape(shape)

        # Bypass __init__ (which always raises) and fill the fields directly.
        cdef CUDAArray self = cls.__new__(cls)
        self._owning = True
        self._shape = shape_t
        self._format = format
        self._num_channels = num_channels
        self._surface_load_store = bool(is_surface_load_store)
        self._device_id = _get_current_device_id()
        self._parent_ref = None

        cdef cydriver.CUarray_format c_format = format
        cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc3d
        cdef cydriver.CUDA_ARRAY_DESCRIPTOR desc2d
        cdef int rank = len(shape_t)
        cdef unsigned int flags = (
            cydriver.CUDA_ARRAY3D_SURFACE_LDST if is_surface_load_store else 0
        )

        # cuArrayCreate (2D path) does not accept flags; use the 3D descriptor
        # whenever any flag is set or shape is 3D.
        if rank == 3 or flags != 0:
            memset(&desc3d, 0, sizeof(desc3d))
            desc3d.Width = shape_t[0]
            # Missing dimensions are passed as 0, not 1.
            desc3d.Height = (shape_t[1] if rank >= 2 else 0)
            desc3d.Depth = (shape_t[2] if rank >= 3 else 0)
            desc3d.Format = c_format
            desc3d.NumChannels = num_channels
            desc3d.Flags = flags
            with nogil:
                HANDLE_RETURN(cydriver.cuArray3DCreate(&self._handle, &desc3d))
        else:
            # Flag-free 1D/2D allocation.
            memset(&desc2d, 0, sizeof(desc2d))
            desc2d.Width = shape_t[0]
            desc2d.Height = (shape_t[1] if rank == 2 else 0)
            desc2d.Format = c_format
            desc2d.NumChannels = num_channels
            with nogil:
                HANDLE_RETURN(cydriver.cuArrayCreate(&self._handle, &desc2d))

        return self
+ """ + cdef CUDAArray self = cls.__new__(cls) + self._handle = handle + self._owning = owning + self._device_id = _get_current_device_id() if device_id is None else int(device_id) + self._parent_ref = None + + cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc + with nogil: + HANDLE_RETURN(cydriver.cuArray3DGetDescriptor(&desc, self._handle)) + + if desc.Depth > 0: + self._shape = (int(desc.Width), int(desc.Height), int(desc.Depth)) + elif desc.Height > 0: + self._shape = (int(desc.Width), int(desc.Height)) + else: + self._shape = (int(desc.Width),) + self._format = desc.Format + self._num_channels = desc.NumChannels + self._surface_load_store = bool(desc.Flags & cydriver.CUDA_ARRAY3D_SURFACE_LDST) + return self + + @property + def handle(self): + """The underlying ``CUarray`` as an integer.""" + return self._handle + + @property + def shape(self): + """Allocation shape, in elements.""" + return self._shape + + @property + def format(self): + """The element :class:`ArrayFormat`.""" + return ArrayFormat(self._format) + + @property + def num_channels(self): + """Channels per element (1, 2, or 4).""" + return self._num_channels + + @property + def element_size(self): + """Bytes per element (format size * channels).""" + return _FORMAT_ELEM_SIZE[self._format] * self._num_channels + + @property + def device(self): + """The :class:`Device` this array was allocated on.""" + from cuda.core._device import Device + return Device(self._device_id) + + @property + def is_surface_load_store(self): + """True if this array was created with ``CUDA_ARRAY3D_SURFACE_LDST`` + and can be bound as a :class:`SurfaceObject`.""" + return self._surface_load_store + + def _extent_bytes(self): + """Return (width_bytes, height, depth) for cuMemcpy3D, with height/depth + normalized to >=1 for lower-rank arrays.""" + cdef int rank = len(self._shape) + cdef size_t w = self._shape[0] * ( + _FORMAT_ELEM_SIZE[self._format] * self._num_channels + ) + cdef size_t h = (self._shape[1] if rank >= 2 else 1) + 
cdef size_t d = (self._shape[2] if rank >= 3 else 1) + return w, h, d + + def copy_from(self, src, *, stream): + """Copy a full-array's worth of data into this array. + + Parameters + ---------- + src : Buffer or buffer-protocol object + Source data. Must contain at least ``self.size_bytes`` bytes + of contiguous data. + stream : Stream + Stream to issue the copy on. + """ + _copy3d(self, src, stream, to_array=True) + + def copy_to(self, dst, *, stream): + """Copy a full-array's worth of data out of this array. + + Parameters + ---------- + dst : Buffer or writable buffer-protocol object + Destination. Must have at least ``self.size_bytes`` bytes of + writable, contiguous space. + stream : Stream + Stream to issue the copy on. + """ + _copy3d(self, dst, stream, to_array=False) + + @property + def size_bytes(self): + """Total bytes of array storage (``prod(shape) * element_size``).""" + cdef size_t n = 1 + for s in self._shape: + n *= s + return n * (_FORMAT_ELEM_SIZE[self._format] * self._num_channels) + + cpdef close(self): + """Destroy the underlying ``CUarray`` if owned by this object.""" + cdef cydriver.CUarray h = self._handle + cdef bint owning = self._owning + self._handle = NULL + # Drop the parent reference (if any) so a non-owning level CUDAArray + # stops pinning its MipmappedArray after close(). + self._parent_ref = None + if h != NULL and owning: + HANDLE_RETURN(cydriver.cuArrayDestroy(h)) + + def __dealloc__(self): + # Cython destructors cannot raise; any cuArrayDestroy error here is + # silently dropped. Callers needing visibility should use close(). 
+ if self._handle != NULL and self._owning: + cydriver.cuArrayDestroy(self._handle) + self._handle = NULL + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + self.close() + + def __repr__(self): + return ( + f"CUDAArray(shape={self._shape}, " + f"format={ArrayFormat(self._format).name}, " + f"num_channels={self._num_channels})" + ) diff --git a/cuda_core/cuda/core/_mipmapped_array.pxd b/cuda_core/cuda/core/_mipmapped_array.pxd new file mode 100644 index 00000000000..4feebd10c79 --- /dev/null +++ b/cuda_core/cuda/core/_mipmapped_array.pxd @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver + + +cdef class MipmappedArray: + + cdef: + cydriver.CUmipmappedArray _handle + tuple _shape # (w,), (w, h), or (w, h, d) + cydriver.CUarray_format _format + unsigned int _num_channels # 1, 2, or 4 + unsigned int _num_levels + int _device_id + bint _owning + bint _surface_load_store + + cpdef close(self) diff --git a/cuda_core/cuda/core/_mipmapped_array.pyi b/cuda_core/cuda/core/_mipmapped_array.pyi new file mode 100644 index 00000000000..20460037aa6 --- /dev/null +++ b/cuda_core/cuda/core/_mipmapped_array.pyi @@ -0,0 +1,112 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_mipmapped_array.pyx + +from __future__ import annotations + + +class MipmappedArray: + """A mipmapped CUDA array for texture/surface access across levels. + + Wraps ``CUmipmappedArray``. Each mip level is a distinct, hardware-laid-out + allocation accessible only via a :class:`TextureObject` (or by retrieving + the level's :class:`CUDAArray` and binding it as a :class:`SurfaceObject`). + Destroying the :class:`MipmappedArray` destroys all level arrays + implicitly, so the :class:`CUDAArray` instances returned by :meth:`get_level` + are non-owning and hold a strong reference back to their parent. 
+ + Construct via :meth:`from_descriptor`. + """ + + def close(self): + """Destroy the underlying ``CUmipmappedArray`` if owned. + + After ``close()`` any level :class:`CUDAArray` returned by :meth:`get_level` + becomes invalid; callers must not access them. + """ + + def __init__(self, *args, **kwargs): + ... + + @classmethod + def from_descriptor(cls, *, shape, format, num_channels, num_levels, is_surface_load_store=False): + """Allocate a new mipmapped CUDA array. + + Parameters + ---------- + shape : tuple of int + ``(width,)``, ``(width, height)``, or ``(width, height, depth)`` + in elements, for the base (level 0) mip. + format : ArrayFormat + Element format. + num_channels : int + Channels per element. Must be 1, 2, or 4. + num_levels : int + Number of mip levels to allocate; must be >= 1. The driver caps + this at the log2 of the largest dimension; passing a larger value + yields a driver error. + is_surface_load_store : bool + If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so individual + levels (obtained via :meth:`get_level`) can be bound as + :class:`SurfaceObject` for kernel-side writes. Default False. + + Returns + ------- + MipmappedArray + """ + + def get_level(self, level): + """Return a non-owning :class:`CUDAArray` view of the given mip level. + + Parameters + ---------- + level : int + Mip level index in ``[0, num_levels)``. + + Returns + ------- + CUDAArray + A non-owning :class:`CUDAArray` wrapping the level's ``CUarray``. + The :class:`MipmappedArray` is kept alive for the lifetime of the + returned :class:`CUDAArray`; the underlying storage is released only + when this :class:`MipmappedArray` is destroyed. 
+ """ + + @property + def handle(self): + """The underlying ``CUmipmappedArray`` as an integer.""" + + @property + def shape(self): + """Base-level (level 0) allocation shape, in elements.""" + + @property + def format(self): + """The element :class:`ArrayFormat`.""" + + @property + def num_channels(self): + """Channels per element (1, 2, or 4).""" + + @property + def num_levels(self): + """Number of mip levels.""" + + @property + def is_surface_load_store(self): + """True if this mipmap (and each of its levels) was created with + ``CUDA_ARRAY3D_SURFACE_LDST`` and can back a :class:`SurfaceObject`.""" + + @property + def device(self): + """The :class:`Device` this mipmap was allocated on.""" + + def __dealloc__(self): + ... + + def __enter__(self): + ... + + def __exit__(self, exc_type, exc, tb): + ... + + def __repr__(self): + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/_mipmapped_array.pyx b/cuda_core/cuda/core/_mipmapped_array.pyx new file mode 100644 index 00000000000..a7ecd29b9d1 --- /dev/null +++ b/cuda_core/cuda/core/_mipmapped_array.pyx @@ -0,0 +1,215 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from libc.stdint cimport intptr_t +from libc.string cimport memset + +from cuda.bindings cimport cydriver +from cuda.core._array cimport CUDAArray +from cuda.core._array import ArrayFormat, _validate_array_shape, _validate_format_channels +from cuda.core._utils.cuda_utils cimport ( + HANDLE_RETURN, + _get_current_device_id, +) + + +cdef class MipmappedArray: + """A mipmapped CUDA array for texture/surface access across levels. + + Wraps ``CUmipmappedArray``. Each mip level is a distinct, hardware-laid-out + allocation accessible only via a :class:`TextureObject` (or by retrieving + the level's :class:`CUDAArray` and binding it as a :class:`SurfaceObject`). 
+ Destroying the :class:`MipmappedArray` destroys all level arrays + implicitly, so the :class:`CUDAArray` instances returned by :meth:`get_level` + are non-owning and hold a strong reference back to their parent. + + Construct via :meth:`from_descriptor`. + """ + + def __init__(self, *args, **kwargs): + raise RuntimeError( + "MipmappedArray cannot be instantiated directly. " + "Use MipmappedArray.from_descriptor()." + ) + + @classmethod + def from_descriptor( + cls, *, shape, format, num_channels, num_levels, is_surface_load_store=False + ): + """Allocate a new mipmapped CUDA array. + + Parameters + ---------- + shape : tuple of int + ``(width,)``, ``(width, height)``, or ``(width, height, depth)`` + in elements, for the base (level 0) mip. + format : ArrayFormat + Element format. + num_channels : int + Channels per element. Must be 1, 2, or 4. + num_levels : int + Number of mip levels to allocate; must be >= 1. The driver caps + this at the log2 of the largest dimension; passing a larger value + yields a driver error. + is_surface_load_store : bool + If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so individual + levels (obtained via :meth:`get_level`) can be bound as + :class:`SurfaceObject` for kernel-side writes. Default False. 
+ + Returns + ------- + MipmappedArray + """ + _validate_format_channels(format, num_channels) + shape_t = _validate_array_shape(shape) + + levels = int(num_levels) + if levels < 1: + raise ValueError(f"num_levels must be >= 1, got {levels}") + + cdef MipmappedArray self = cls.__new__(cls) + self._owning = True + self._shape = shape_t + self._format = format + self._num_channels = num_channels + self._num_levels = levels + self._surface_load_store = bool(is_surface_load_store) + self._device_id = _get_current_device_id() + + cdef cydriver.CUarray_format c_format = format + cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc3d + cdef int rank = len(shape_t) + cdef unsigned int flags = ( + cydriver.CUDA_ARRAY3D_SURFACE_LDST if is_surface_load_store else 0 + ) + cdef unsigned int c_levels = levels + + # Mipmap creation uses the 3D descriptor regardless of rank; lower-rank + # shapes use Height=0/Depth=0 sentinels, matching cuArray3DCreate. + memset(&desc3d, 0, sizeof(desc3d)) + desc3d.Width = shape_t[0] + desc3d.Height = (shape_t[1] if rank >= 2 else 0) + desc3d.Depth = (shape_t[2] if rank >= 3 else 0) + desc3d.Format = c_format + desc3d.NumChannels = num_channels + desc3d.Flags = flags + with nogil: + HANDLE_RETURN( + cydriver.cuMipmappedArrayCreate(&self._handle, &desc3d, c_levels) + ) + + return self + + def get_level(self, level): + """Return a non-owning :class:`CUDAArray` view of the given mip level. + + Parameters + ---------- + level : int + Mip level index in ``[0, num_levels)``. + + Returns + ------- + CUDAArray + A non-owning :class:`CUDAArray` wrapping the level's ``CUarray``. + The :class:`MipmappedArray` is kept alive for the lifetime of the + returned :class:`CUDAArray`; the underlying storage is released only + when this :class:`MipmappedArray` is destroyed. 
    cpdef close(self):
        """Destroy the underlying ``CUmipmappedArray`` if owned.

        After ``close()`` any level :class:`CUDAArray` returned by :meth:`get_level`
        becomes invalid; callers must not access them.

        Calling ``close()`` more than once is harmless: the stored handle is
        cleared on the first call, so later calls find nothing to destroy.
        """
        # Snapshot the state first, then clear the stored handle, so that the
        # handle is never destroyed twice (by a second close() or by
        # __dealloc__) even if the driver call below raises.
        cdef cydriver.CUmipmappedArray h = self._handle
        cdef bint owning = self._owning
        self._handle = NULL
        if h != NULL and owning:
            HANDLE_RETURN(cydriver.cuMipmappedArrayDestroy(h))
+ + The backing :class:`CUDAArray` must have been created with + ``is_surface_load_store=True`` and is kept alive for the lifetime of this + object to prevent dangling handles. + + Construct via :meth:`from_array` or :meth:`from_descriptor`. Passes to + kernels as a 64-bit handle (via the ``handle`` property). + """ + + def close(self): + """Destroy the underlying ``CUsurfObject``.""" + + def __init__(self, *args, **kwargs): + ... + + @classmethod + def from_array(cls, array): + """Create a surface object directly from an :class:`CUDAArray`. + + The array must have been created with ``is_surface_load_store=True``. + """ + + @classmethod + def from_descriptor(cls, *, resource): + """Create a surface object from a :class:`ResourceDescriptor`. + + Parameters + ---------- + resource : ResourceDescriptor + Must wrap an :class:`CUDAArray` allocated with + ``is_surface_load_store=True``. Linear/pitch2d resources are not + valid surface backings. + """ + + @property + def handle(self): + """The underlying ``CUsurfObject`` as an integer (64-bit kernel arg).""" + + @property + def resource(self): + """The :class:`ResourceDescriptor` this surface was built from.""" + + @property + def device(self): + ... + + def __dealloc__(self): + ... + + def __enter__(self): + ... + + def __exit__(self, exc_type, exc, tb): + ... + + def __repr__(self): + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/_surface.pyx b/cuda_core/cuda/core/_surface.pyx new file mode 100644 index 00000000000..87e80e99ef0 --- /dev/null +++ b/cuda_core/cuda/core/_surface.pyx @@ -0,0 +1,131 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from libc.stdint cimport intptr_t +from libc.string cimport memset + +from cuda.bindings cimport cydriver +from cuda.core._array cimport CUDAArray +from cuda.core._texture import ResourceDescriptor +from cuda.core._utils.cuda_utils cimport ( + HANDLE_RETURN, + _get_current_device_id, +) + + +cdef class SurfaceObject: + """A bindless surface handle for kernel-side typed load/store. + + Wraps ``cuSurfObjectCreate``. Unlike a :class:`TextureObject`, a surface + has no sampling state (no filtering, no addressing modes, no normalization); + kernels read and write through it using integer pixel coordinates. + + The backing :class:`CUDAArray` must have been created with + ``is_surface_load_store=True`` and is kept alive for the lifetime of this + object to prevent dangling handles. + + Construct via :meth:`from_array` or :meth:`from_descriptor`. Passes to + kernels as a 64-bit handle (via the ``handle`` property). + """ + + def __init__(self, *args, **kwargs): + raise RuntimeError( + "SurfaceObject cannot be instantiated directly. " + "Use SurfaceObject.from_array() or SurfaceObject.from_descriptor()." + ) + + @classmethod + def from_array(cls, array): + """Create a surface object directly from an :class:`CUDAArray`. + + The array must have been created with ``is_surface_load_store=True``. + """ + if not isinstance(array, CUDAArray): + raise TypeError(f"array must be an CUDAArray, got {type(array).__name__}") + return cls.from_descriptor(resource=ResourceDescriptor.from_array(array)) + + @classmethod + def from_descriptor(cls, *, resource): + """Create a surface object from a :class:`ResourceDescriptor`. + + Parameters + ---------- + resource : ResourceDescriptor + Must wrap an :class:`CUDAArray` allocated with + ``is_surface_load_store=True``. Linear/pitch2d resources are not + valid surface backings. 
+ """ + if not isinstance(resource, ResourceDescriptor): + raise TypeError( + f"resource must be a ResourceDescriptor, got " + f"{type(resource).__name__}" + ) + if resource.kind != "array": + raise ValueError( + f"SurfaceObject requires an array-backed ResourceDescriptor, " + f"got kind={resource.kind!r}" + ) + + cdef CUDAArray arr = resource.source + if not arr.is_surface_load_store: + raise ValueError( + "CUDAArray must be created with is_surface_load_store=True to be " + "bound as a SurfaceObject" + ) + + cdef cydriver.CUDA_RESOURCE_DESC res_desc + memset(&res_desc, 0, sizeof(res_desc)) + res_desc.resType = cydriver.CU_RESOURCE_TYPE_ARRAY + res_desc.res.array.hArray = arr._handle + + cdef SurfaceObject self = cls.__new__(cls) + self._source_ref = resource + self._device_id = _get_current_device_id() + + with nogil: + HANDLE_RETURN( + cydriver.cuSurfObjectCreate(&self._handle, &res_desc) + ) + return self + + @property + def handle(self): + """The underlying ``CUsurfObject`` as an integer (64-bit kernel arg).""" + return self._handle + + @property + def resource(self): + """The :class:`ResourceDescriptor` this surface was built from.""" + return self._source_ref + + @property + def device(self): + from cuda.core._device import Device + return Device(self._device_id) + + cpdef close(self): + """Destroy the underlying ``CUsurfObject``.""" + cdef cydriver.CUsurfObject h = self._handle + self._handle = 0 + self._source_ref = None + if h != 0: + HANDLE_RETURN(cydriver.cuSurfObjectDestroy(h)) + + def __dealloc__(self): + # Cython destructors cannot raise; any cuSurfObjectDestroy error is + # silently dropped. Callers needing visibility should use close(). 
+ if self._handle != 0: + cydriver.cuSurfObjectDestroy(self._handle) + self._handle = 0 + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + self.close() + + def __repr__(self): + return f"SurfaceObject(handle=0x{self._handle:x})" diff --git a/cuda_core/cuda/core/_texture.pxd b/cuda_core/cuda/core/_texture.pxd new file mode 100644 index 00000000000..5a1fd84b9ad --- /dev/null +++ b/cuda_core/cuda/core/_texture.pxd @@ -0,0 +1,16 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings cimport cydriver + + +cdef class TextureObject: + + cdef: + cydriver.CUtexObject _handle + object _source_ref # keep backing CUDAArray (or other resource) alive + object _texture_desc # original TextureDescriptor for introspection + int _device_id + + cpdef close(self) diff --git a/cuda_core/cuda/core/_texture.pyi b/cuda_core/cuda/core/_texture.pyi new file mode 100644 index 00000000000..132a40273c3 --- /dev/null +++ b/cuda_core/cuda/core/_texture.pyi @@ -0,0 +1,261 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_texture.pyx + +from __future__ import annotations + +from dataclasses import dataclass +from enum import IntEnum + +from cuda.bindings import cydriver + + +class AddressMode(IntEnum): + """Boundary behavior for out-of-range texture coordinates.""" + WRAP = cydriver.CU_TR_ADDRESS_MODE_WRAP + CLAMP = cydriver.CU_TR_ADDRESS_MODE_CLAMP + MIRROR = cydriver.CU_TR_ADDRESS_MODE_MIRROR + BORDER = cydriver.CU_TR_ADDRESS_MODE_BORDER + +class FilterMode(IntEnum): + """Texel sampling mode.""" + POINT = cydriver.CU_TR_FILTER_MODE_POINT + LINEAR = cydriver.CU_TR_FILTER_MODE_LINEAR + +class ReadMode(IntEnum): + """How sampled values are returned to the kernel. + + - ``ELEMENT_TYPE``: return the raw element value (integer formats stay + integer, float stays float). 
+ - ``NORMALIZED_FLOAT``: integer formats are promoted to a normalized + ``float`` in ``[0, 1]`` (unsigned) or ``[-1, 1]`` (signed). + Float formats are unaffected. + """ + ELEMENT_TYPE = 0 + NORMALIZED_FLOAT = 1 + +class ResourceDescriptor: + """Describes the memory backing a :class:`TextureObject`. + + Construct via the ``from_*`` classmethods: + + - :meth:`from_array` wraps a :class:`CUDAArray` (works for both + :class:`TextureObject` and :class:`SurfaceObject`). + - :meth:`from_mipmapped_array` wraps a :class:`MipmappedArray` for mipmapped + sampling (texture only, not surface). + - :meth:`from_linear` wraps a :class:`Buffer` as a typed 1D fetch. Texture + objects built from a linear resource do not support filtering, + normalized coordinates, or addressing modes. + - :meth:`from_pitch2d` wraps a :class:`Buffer` as a row-pitched 2D image. + Supports filtering and 2D addressing, but only 2D access. + + Linear and pitch2D resources cannot back a :class:`SurfaceObject` — those + require an :class:`CUDAArray` allocated with ``is_surface_load_store=True``. + """ + __slots__ = ('_kind', '_source', '_format', '_num_channels', '_size_bytes', '_width', '_height', '_pitch_bytes') + + def __init__(self): + ... + + @classmethod + def from_array(cls, array): + """Build a resource descriptor backed by a :class:`CUDAArray`.""" + + @classmethod + def from_mipmapped_array(cls, mipmapped_array): + """Build a resource descriptor backed by a :class:`MipmappedArray`. + + Suitable for binding to a :class:`TextureObject` for mipmapped + sampling. Not valid as a :class:`SurfaceObject` backing: surfaces + require a single :class:`CUDAArray` level (obtain via + :meth:`MipmappedArray.get_level`). + """ + + @classmethod + def from_linear(cls, buffer, *, format, num_channels, size_bytes=None): + """Build a resource descriptor for a linear (typed 1D) texture fetch. + + Parameters + ---------- + buffer : Buffer + Device-memory backing. 
Must remain alive for the lifetime of any + :class:`TextureObject` built from this descriptor. + format : ArrayFormat + Element format. + num_channels : int + Channels per element. Must be 1, 2, or 4. + size_bytes : int, optional + Bytes of ``buffer`` to bind. Defaults to ``buffer.size``. Must not + exceed it. + + Notes + ----- + Texture objects built from a linear resource ignore the + :class:`TextureDescriptor` addressing/filtering fields — kernels read + through a typed 1D fetch with bounds checking only. + """ + + @classmethod + def from_pitch2d(cls, buffer, *, format, num_channels, width, height, pitch_bytes): + """Build a resource descriptor for a row-pitched 2D image. + + Parameters + ---------- + buffer : Buffer + Device-memory backing. Must remain alive for the lifetime of any + :class:`TextureObject` built from this descriptor. + format : ArrayFormat + Element format. + num_channels : int + Channels per element. Must be 1, 2, or 4. + width : int + Image width, in elements. + height : int + Image height, in rows. + pitch_bytes : int + Distance between consecutive rows, in bytes. Must be at least + ``width * format_size * num_channels`` and meet the driver's + ``CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT``. + """ + + @property + def kind(self): + ... + + @property + def source(self): + ... + + @property + def format(self): + """The element :class:`ArrayFormat` (``None`` for array-backed).""" + + @property + def num_channels(self): + """Channels per element (``None`` for array-backed).""" + + @property + def size_bytes(self): + """Bytes bound for a linear resource (``None`` for other kinds).""" + + @property + def width(self): + """Pitch2D image width, in elements (``None`` for other kinds).""" + + @property + def height(self): + """Pitch2D image height, in rows (``None`` for other kinds).""" + + @property + def pitch_bytes(self): + """Pitch2D row pitch, in bytes (``None`` for other kinds).""" + + def __repr__(self): + ... 
+ +@dataclass +class TextureDescriptor: + """Sampling state for a :class:`TextureObject` (mirrors ``CUDA_TEXTURE_DESC``). + + Attributes + ---------- + address_mode : tuple of AddressMode + Boundary behavior per axis. May be a single :class:`AddressMode` (applied + to all axes) or a tuple of 1-3 entries (one per dimension). + filter_mode : FilterMode + Texel sampling mode. Default ``POINT``. + read_mode : ReadMode + How sampled integer values are returned. Default ``ELEMENT_TYPE``. + normalized_coords : bool + If True, coordinates are in ``[0, 1]`` instead of pixel indices. + srgb : bool + If True, perform sRGB → linear conversion on read (8-bit formats only). + disable_trilinear_optimization : bool + If True, request exact trilinear filtering. + seamless_cubemap : bool + If True, enable seamless cubemap edge filtering. + max_anisotropy : int + Maximum anisotropy; 0 disables anisotropic filtering. + mipmap_filter_mode : FilterMode + Filtering between mipmap levels. Default ``POINT``. + mipmap_level_bias : float + min_mipmap_level_clamp : float + max_mipmap_level_clamp : float + border_color : tuple of float or None + 4-tuple used when ``address_mode`` includes ``BORDER``; ``None`` means + zero. + """ + address_mode: AddressMode | tuple[AddressMode, ...] = AddressMode.CLAMP + filter_mode: FilterMode = FilterMode.POINT + read_mode: ReadMode = ReadMode.ELEMENT_TYPE + normalized_coords: bool = False + srgb: bool = False + disable_trilinear_optimization: bool = False + seamless_cubemap: bool = False + max_anisotropy: int = 0 + mipmap_filter_mode: FilterMode = FilterMode.POINT + mipmap_level_bias: float = 0.0 + min_mipmap_level_clamp: float = 0.0 + max_mipmap_level_clamp: float = 0.0 + border_color: tuple[float, ...] | None = None + +class TextureObject: + """A bindless texture handle for kernel-side sampled reads. + + Wraps ``cuTexObjectCreate``. The underlying memory resource (e.g. 
the + :class:`CUDAArray` referenced by the descriptor) is kept alive for the + lifetime of this object to prevent dangling handles. + + Construct via :meth:`from_descriptor`. Passes to kernels as a 64-bit + handle (via the ``handle`` property). + """ + + def close(self): + """Destroy the underlying ``CUtexObject``.""" + + def __init__(self, *args, **kwargs): + ... + + @classmethod + def from_descriptor(cls, *, resource, texture_descriptor): + """Create a texture object from a resource + sampling descriptor. + + Parameters + ---------- + resource : ResourceDescriptor + texture_descriptor : TextureDescriptor + """ + + @property + def handle(self): + """The underlying ``CUtexObject`` as an integer (64-bit kernel arg).""" + + @property + def resource(self): + """The :class:`ResourceDescriptor` this texture was built from.""" + + @property + def texture_descriptor(self): + """The :class:`TextureDescriptor` this texture was built from.""" + + @property + def device(self): + ... + + def __dealloc__(self): + ... + + def __enter__(self): + ... + + def __exit__(self, exc_type, exc, tb): + ... + + def __repr__(self): + ... +_TRSF_READ_AS_INTEGER = 1 +_TRSF_NORMALIZED_COORDINATES = 2 +_TRSF_SRGB = 16 +_TRSF_DISABLE_TRILINEAR_OPTIMIZATION = 32 +_TRSF_SEAMLESS_CUBEMAP = 64 + +def _normalize_address_modes(address_mode): + """Return a 3-tuple of AddressMode values from a scalar or 1-3 tuple.""" \ No newline at end of file diff --git a/cuda_core/cuda/core/_texture.pyx b/cuda_core/cuda/core/_texture.pyx new file mode 100644 index 00000000000..6ea8ad805ad --- /dev/null +++ b/cuda_core/cuda/core/_texture.pyx @@ -0,0 +1,566 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from libc.stdint cimport intptr_t +from libc.string cimport memset + +from cuda.bindings cimport cydriver +from cuda.core._array cimport CUDAArray +from cuda.core._array import ArrayFormat, _FORMAT_ELEM_SIZE, _validate_format_channels +from cuda.core._memory._buffer cimport Buffer +from cuda.core._mipmapped_array cimport MipmappedArray +from cuda.core._mipmapped_array import MipmappedArray as _PyMipmappedArray +from cuda.core._utils.cuda_utils cimport ( + HANDLE_RETURN, + _get_current_device_id, +) + +from dataclasses import dataclass +from enum import IntEnum + + +# Driver texture-descriptor flag bits (CU_TRSF_*). +_TRSF_READ_AS_INTEGER = 0x01 +_TRSF_NORMALIZED_COORDINATES = 0x02 +_TRSF_SRGB = 0x10 +_TRSF_DISABLE_TRILINEAR_OPTIMIZATION = 0x20 +_TRSF_SEAMLESS_CUBEMAP = 0x40 + + +class AddressMode(IntEnum): + """Boundary behavior for out-of-range texture coordinates.""" + WRAP = cydriver.CU_TR_ADDRESS_MODE_WRAP + CLAMP = cydriver.CU_TR_ADDRESS_MODE_CLAMP + MIRROR = cydriver.CU_TR_ADDRESS_MODE_MIRROR + BORDER = cydriver.CU_TR_ADDRESS_MODE_BORDER + + +class FilterMode(IntEnum): + """Texel sampling mode.""" + POINT = cydriver.CU_TR_FILTER_MODE_POINT + LINEAR = cydriver.CU_TR_FILTER_MODE_LINEAR + + +class ReadMode(IntEnum): + """How sampled values are returned to the kernel. + + - ``ELEMENT_TYPE``: return the raw element value (integer formats stay + integer, float stays float). + - ``NORMALIZED_FLOAT``: integer formats are promoted to a normalized + ``float`` in ``[0, 1]`` (unsigned) or ``[-1, 1]`` (signed). + Float formats are unaffected. + """ + ELEMENT_TYPE = 0 + NORMALIZED_FLOAT = 1 + + +class ResourceDescriptor: + """Describes the memory backing a :class:`TextureObject`. + + Construct via the ``from_*`` classmethods: + + - :meth:`from_array` wraps a :class:`CUDAArray` (works for both + :class:`TextureObject` and :class:`SurfaceObject`). 
+ - :meth:`from_mipmapped_array` wraps a :class:`MipmappedArray` for mipmapped + sampling (texture only, not surface). + - :meth:`from_linear` wraps a :class:`Buffer` as a typed 1D fetch. Texture + objects built from a linear resource do not support filtering, + normalized coordinates, or addressing modes. + - :meth:`from_pitch2d` wraps a :class:`Buffer` as a row-pitched 2D image. + Supports filtering and 2D addressing, but only 2D access. + + Linear and pitch2D resources cannot back a :class:`SurfaceObject` — those + require an :class:`CUDAArray` allocated with ``is_surface_load_store=True``. + """ + + __slots__ = ( + "_kind", "_source", + "_format", "_num_channels", + "_size_bytes", + "_width", "_height", "_pitch_bytes", + ) + + def __init__(self): + raise RuntimeError( + "ResourceDescriptor cannot be instantiated directly. " + "Use ResourceDescriptor.from_* factories." + ) + + @classmethod + def from_array(cls, array): + """Build a resource descriptor backed by a :class:`CUDAArray`.""" + if not isinstance(array, CUDAArray): + raise TypeError(f"array must be an CUDAArray, got {type(array).__name__}") + self = cls.__new__(cls) + self._kind = "array" + self._source = array + self._format = None + self._num_channels = None + self._size_bytes = None + self._width = None + self._height = None + self._pitch_bytes = None + return self + + @classmethod + def from_mipmapped_array(cls, mipmapped_array): + """Build a resource descriptor backed by a :class:`MipmappedArray`. + + Suitable for binding to a :class:`TextureObject` for mipmapped + sampling. Not valid as a :class:`SurfaceObject` backing: surfaces + require a single :class:`CUDAArray` level (obtain via + :meth:`MipmappedArray.get_level`). 
+ """ + if not isinstance(mipmapped_array, _PyMipmappedArray): + raise TypeError( + f"mipmapped_array must be a MipmappedArray, got " + f"{type(mipmapped_array).__name__}" + ) + self = cls.__new__(cls) + self._kind = "mipmapped_array" + self._source = mipmapped_array + self._format = None + self._num_channels = None + self._size_bytes = None + self._width = None + self._height = None + self._pitch_bytes = None + return self + + @classmethod + def from_linear(cls, buffer, *, format, num_channels, size_bytes=None): + """Build a resource descriptor for a linear (typed 1D) texture fetch. + + Parameters + ---------- + buffer : Buffer + Device-memory backing. Must remain alive for the lifetime of any + :class:`TextureObject` built from this descriptor. + format : ArrayFormat + Element format. + num_channels : int + Channels per element. Must be 1, 2, or 4. + size_bytes : int, optional + Bytes of ``buffer`` to bind. Defaults to ``buffer.size``. Must not + exceed it. + + Notes + ----- + Texture objects built from a linear resource ignore the + :class:`TextureDescriptor` addressing/filtering fields — kernels read + through a typed 1D fetch with bounds checking only. 
+ """ + if not isinstance(buffer, Buffer): + raise TypeError(f"buffer must be a Buffer, got {type(buffer).__name__}") + _validate_format_channels(format, num_channels) + + buf_size = int(buffer.size) + elem = _FORMAT_ELEM_SIZE[int(format)] * int(num_channels) + if size_bytes is None: + size = buf_size + else: + size = int(size_bytes) + if size > buf_size: + raise ValueError( + f"size_bytes ({size}) exceeds buffer.size ({buf_size})" + ) + if size < elem: + raise ValueError( + f"size_bytes ({size}) must be at least one element ({elem} bytes)" + ) + if size % elem != 0: + raise ValueError( + f"size_bytes ({size}) must be a multiple of element size " + f"({elem} bytes for {format.name} x {num_channels})" + ) + + self = cls.__new__(cls) + self._kind = "linear" + self._source = buffer + self._format = int(format) + self._num_channels = int(num_channels) + self._size_bytes = size + self._width = None + self._height = None + self._pitch_bytes = None + return self + + @classmethod + def from_pitch2d( + cls, buffer, *, format, num_channels, width, height, pitch_bytes + ): + """Build a resource descriptor for a row-pitched 2D image. + + Parameters + ---------- + buffer : Buffer + Device-memory backing. Must remain alive for the lifetime of any + :class:`TextureObject` built from this descriptor. + format : ArrayFormat + Element format. + num_channels : int + Channels per element. Must be 1, 2, or 4. + width : int + Image width, in elements. + height : int + Image height, in rows. + pitch_bytes : int + Distance between consecutive rows, in bytes. Must be at least + ``width * format_size * num_channels`` and meet the driver's + ``CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT``. 
+ """ + if not isinstance(buffer, Buffer): + raise TypeError(f"buffer must be a Buffer, got {type(buffer).__name__}") + _validate_format_channels(format, num_channels) + + w = int(width) + h = int(height) + p = int(pitch_bytes) + if w < 1: + raise ValueError(f"width must be >= 1, got {w}") + if h < 1: + raise ValueError(f"height must be >= 1, got {h}") + elem = _FORMAT_ELEM_SIZE[int(format)] * int(num_channels) + min_pitch = w * elem + if p < min_pitch: + raise ValueError( + f"pitch_bytes ({p}) must be >= width * element_size ({min_pitch})" + ) + if p * h > int(buffer.size): + raise ValueError( + f"pitch_bytes * height ({p * h}) exceeds buffer.size ({int(buffer.size)})" + ) + + self = cls.__new__(cls) + self._kind = "pitch2d" + self._source = buffer + self._format = int(format) + self._num_channels = int(num_channels) + self._size_bytes = None + self._width = w + self._height = h + self._pitch_bytes = p + return self + + @property + def kind(self): + return self._kind + + @property + def source(self): + return self._source + + @property + def format(self): + """The element :class:`ArrayFormat` (``None`` for array-backed).""" + return None if self._format is None else ArrayFormat(self._format) + + @property + def num_channels(self): + """Channels per element (``None`` for array-backed).""" + return self._num_channels + + @property + def size_bytes(self): + """Bytes bound for a linear resource (``None`` for other kinds).""" + return self._size_bytes + + @property + def width(self): + """Pitch2D image width, in elements (``None`` for other kinds).""" + return self._width + + @property + def height(self): + """Pitch2D image height, in rows (``None`` for other kinds).""" + return self._height + + @property + def pitch_bytes(self): + """Pitch2D row pitch, in bytes (``None`` for other kinds).""" + return self._pitch_bytes + + def __repr__(self): + if self._kind == "linear": + return ( + f"ResourceDescriptor(kind='linear', format={self.format.name}, " + 
f"num_channels={self._num_channels}, size_bytes={self._size_bytes})" + ) + if self._kind == "pitch2d": + return ( + f"ResourceDescriptor(kind='pitch2d', format={self.format.name}, " + f"num_channels={self._num_channels}, " + f"width={self._width}, height={self._height}, " + f"pitch_bytes={self._pitch_bytes})" + ) + return f"ResourceDescriptor(kind={self._kind!r})" + + +@dataclass +class TextureDescriptor: + """Sampling state for a :class:`TextureObject` (mirrors ``CUDA_TEXTURE_DESC``). + + Attributes + ---------- + address_mode : tuple of AddressMode + Boundary behavior per axis. May be a single :class:`AddressMode` (applied + to all axes) or a tuple of 1-3 entries (one per dimension). + filter_mode : FilterMode + Texel sampling mode. Default ``POINT``. + read_mode : ReadMode + How sampled integer values are returned. Default ``ELEMENT_TYPE``. + normalized_coords : bool + If True, coordinates are in ``[0, 1]`` instead of pixel indices. + srgb : bool + If True, perform sRGB → linear conversion on read (8-bit formats only). + disable_trilinear_optimization : bool + If True, request exact trilinear filtering. + seamless_cubemap : bool + If True, enable seamless cubemap edge filtering. + max_anisotropy : int + Maximum anisotropy; 0 disables anisotropic filtering. + mipmap_filter_mode : FilterMode + Filtering between mipmap levels. Default ``POINT``. + mipmap_level_bias : float + min_mipmap_level_clamp : float + max_mipmap_level_clamp : float + border_color : tuple of float or None + 4-tuple used when ``address_mode`` includes ``BORDER``; ``None`` means + zero. + """ + + address_mode: AddressMode | tuple[AddressMode, ...] 
= AddressMode.CLAMP + filter_mode: FilterMode = FilterMode.POINT + read_mode: ReadMode = ReadMode.ELEMENT_TYPE + normalized_coords: bool = False + srgb: bool = False + disable_trilinear_optimization: bool = False + seamless_cubemap: bool = False + max_anisotropy: int = 0 + mipmap_filter_mode: FilterMode = FilterMode.POINT + mipmap_level_bias: float = 0.0 + min_mipmap_level_clamp: float = 0.0 + max_mipmap_level_clamp: float = 0.0 + border_color: tuple[float, ...] | None = None + + +def _normalize_address_modes(address_mode): + """Return a 3-tuple of AddressMode values from a scalar or 1-3 tuple.""" + if isinstance(address_mode, AddressMode): + return (address_mode, address_mode, address_mode) + try: + modes = tuple(address_mode) + except TypeError as e: + raise TypeError( + "address_mode must be an AddressMode or a tuple of AddressMode" + ) from e + if not 1 <= len(modes) <= 3: + raise ValueError( + f"address_mode tuple must have 1-3 entries, got {len(modes)}" + ) + for i, m in enumerate(modes): + if not isinstance(m, AddressMode): + raise TypeError( + f"address_mode[{i}] must be an AddressMode, got {type(m).__name__}" + ) + # Pad to 3 entries by repeating the last one. + padded = list(modes) + [modes[-1]] * (3 - len(modes)) + return tuple(padded) + + +cdef class TextureObject: + """A bindless texture handle for kernel-side sampled reads. + + Wraps ``cuTexObjectCreate``. The underlying memory resource (e.g. the + :class:`CUDAArray` referenced by the descriptor) is kept alive for the + lifetime of this object to prevent dangling handles. + + Construct via :meth:`from_descriptor`. Passes to kernels as a 64-bit + handle (via the ``handle`` property). + """ + + def __init__(self, *args, **kwargs): + raise RuntimeError( + "TextureObject cannot be instantiated directly. " + "Use TextureObject.from_descriptor()." + ) + + @classmethod + def from_descriptor(cls, *, resource, texture_descriptor): + """Create a texture object from a resource + sampling descriptor. 
+ + Parameters + ---------- + resource : ResourceDescriptor + texture_descriptor : TextureDescriptor + """ + if not isinstance(resource, ResourceDescriptor): + raise TypeError( + f"resource must be a ResourceDescriptor, got " + f"{type(resource).__name__}" + ) + if not isinstance(texture_descriptor, TextureDescriptor): + raise TypeError( + f"texture_descriptor must be a TextureDescriptor, got " + f"{type(texture_descriptor).__name__}" + ) + + cdef cydriver.CUDA_RESOURCE_DESC res_desc + cdef cydriver.CUDA_TEXTURE_DESC tex_desc + memset(&res_desc, 0, sizeof(res_desc)) + memset(&tex_desc, 0, sizeof(tex_desc)) + + # --- Resource descriptor --- + cdef CUDAArray arr + cdef MipmappedArray mip + cdef Buffer buf + cdef intptr_t devptr + if resource.kind == "array": + arr = resource.source + res_desc.resType = cydriver.CU_RESOURCE_TYPE_ARRAY + res_desc.res.array.hArray = arr._handle + elif resource.kind == "mipmapped_array": + mip = resource.source + res_desc.resType = cydriver.CU_RESOURCE_TYPE_MIPMAPPED_ARRAY + res_desc.res.mipmap.hMipmappedArray = mip._handle + elif resource.kind == "linear": + buf = resource.source + devptr = int(buf.handle) + res_desc.resType = cydriver.CU_RESOURCE_TYPE_LINEAR + res_desc.res.linear.devPtr = devptr + res_desc.res.linear.format = resource._format + res_desc.res.linear.numChannels = resource._num_channels + res_desc.res.linear.sizeInBytes = resource._size_bytes + elif resource.kind == "pitch2d": + buf = resource.source + devptr = int(buf.handle) + res_desc.resType = cydriver.CU_RESOURCE_TYPE_PITCH2D + res_desc.res.pitch2D.devPtr = devptr + res_desc.res.pitch2D.format = resource._format + res_desc.res.pitch2D.numChannels = resource._num_channels + res_desc.res.pitch2D.width = resource._width + res_desc.res.pitch2D.height = resource._height + res_desc.res.pitch2D.pitchInBytes = resource._pitch_bytes + else: + raise NotImplementedError( + f"ResourceDescriptor kind {resource.kind!r} is not yet supported" + ) + + # --- Texture descriptor --- + 
modes = _normalize_address_modes(texture_descriptor.address_mode) + tex_desc.addressMode[0] = modes[0] + tex_desc.addressMode[1] = modes[1] + tex_desc.addressMode[2] = modes[2] + + if not isinstance(texture_descriptor.filter_mode, FilterMode): + raise TypeError( + f"filter_mode must be a FilterMode, got " + f"{type(texture_descriptor.filter_mode).__name__}" + ) + tex_desc.filterMode = texture_descriptor.filter_mode + + if not isinstance(texture_descriptor.read_mode, ReadMode): + raise TypeError( + f"read_mode must be a ReadMode, got " + f"{type(texture_descriptor.read_mode).__name__}" + ) + + cdef unsigned int flags = 0 + # CU_TRSF_READ_AS_INTEGER suppresses normalization, so it maps to + # ReadMode.ELEMENT_TYPE. + if texture_descriptor.read_mode == ReadMode.ELEMENT_TYPE: + flags |= _TRSF_READ_AS_INTEGER + if texture_descriptor.normalized_coords: + flags |= _TRSF_NORMALIZED_COORDINATES + if texture_descriptor.srgb: + flags |= _TRSF_SRGB + if texture_descriptor.disable_trilinear_optimization: + flags |= _TRSF_DISABLE_TRILINEAR_OPTIMIZATION + if texture_descriptor.seamless_cubemap: + flags |= _TRSF_SEAMLESS_CUBEMAP + tex_desc.flags = flags + + if texture_descriptor.max_anisotropy < 0: + raise ValueError("max_anisotropy must be >= 0") + tex_desc.maxAnisotropy = texture_descriptor.max_anisotropy + + if not isinstance(texture_descriptor.mipmap_filter_mode, FilterMode): + raise TypeError( + f"mipmap_filter_mode must be a FilterMode, got " + f"{type(texture_descriptor.mipmap_filter_mode).__name__}" + ) + tex_desc.mipmapFilterMode = texture_descriptor.mipmap_filter_mode + tex_desc.mipmapLevelBias = texture_descriptor.mipmap_level_bias + tex_desc.minMipmapLevelClamp = texture_descriptor.min_mipmap_level_clamp + tex_desc.maxMipmapLevelClamp = texture_descriptor.max_mipmap_level_clamp + + cdef int i + if texture_descriptor.border_color is None: + for i in range(4): + tex_desc.borderColor[i] = 0.0 + else: + bc = tuple(texture_descriptor.border_color) + if len(bc) != 4: + raise 
ValueError( + f"border_color must have 4 elements, got {len(bc)}" + ) + for i in range(4): + tex_desc.borderColor[i] = bc[i] + + cdef TextureObject self = cls.__new__(cls) + self._source_ref = resource + self._texture_desc = texture_descriptor + self._device_id = _get_current_device_id() + + with nogil: + HANDLE_RETURN( + cydriver.cuTexObjectCreate(&self._handle, &res_desc, &tex_desc, NULL) + ) + return self + + @property + def handle(self): + """The underlying ``CUtexObject`` as an integer (64-bit kernel arg).""" + return self._handle + + @property + def resource(self): + """The :class:`ResourceDescriptor` this texture was built from.""" + return self._source_ref + + @property + def texture_descriptor(self): + """The :class:`TextureDescriptor` this texture was built from.""" + return self._texture_desc + + @property + def device(self): + from cuda.core._device import Device + return Device(self._device_id) + + cpdef close(self): + """Destroy the underlying ``CUtexObject``.""" + cdef cydriver.CUtexObject h = self._handle + self._handle = 0 + self._source_ref = None + if h != 0: + HANDLE_RETURN(cydriver.cuTexObjectDestroy(h)) + + def __dealloc__(self): + # Cython destructors cannot raise; any cuTexObjectDestroy error is + # silently dropped. Callers needing visibility should use close(). + if self._handle != 0: + cydriver.cuTexObjectDestroy(self._handle) + self._handle = 0 + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + self.close() + + def __repr__(self): + return f"TextureObject(handle=0x{self._handle:x})" diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pxd b/cuda_core/cuda/core/_utils/cuda_utils.pxd index 4562cd71355..11e464e6381 100644 --- a/cuda_core/cuda/core/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/_utils/cuda_utils.pxd @@ -25,6 +25,11 @@ cdef int HANDLE_RETURN_NVJITLINK( cynvjitlink.nvJitLinkHandle handle, cynvjitlink.nvJitLinkResult err) except?-1 nogil +# Helper for retrieving the current CUDA device. 
Raises if no active context +# is bound to the calling thread. +cdef int _get_current_device_id() except? -1 + + # TODO: stop exposing these within the codebase? cpdef int _check_driver_error(cydriver.CUresult error) except?-1 nogil cpdef int _check_runtime_error(error) except?-1 diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx index 4e20f689b5a..318d4466bee 100644 --- a/cuda_core/cuda/core/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/_utils/cuda_utils.pyx @@ -69,6 +69,14 @@ cdef int HANDLE_RETURN(cydriver.CUresult err) except?-1 nogil: return 0 +cdef int _get_current_device_id() except? -1: + """Return the current thread's bound CUdevice ordinal.""" + cdef cydriver.CUdevice dev + with nogil: + HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) + return dev + + cdef int HANDLE_RETURN_NVRTC(cynvrtc.nvrtcProgram prog, cynvrtc.nvrtcResult err) except?-1 nogil: """Handle NVRTC result codes, raising NVRTCError with program log on failure.""" if err == cynvrtc.nvrtcResult.NVRTC_SUCCESS: diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index b1498b57da3..8d46e34e556 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -161,6 +161,40 @@ Tensor Memory Accelerator (TMA) TensorMapDescriptorOptions +Textures and surfaces +--------------------- + +CUDA arrays back bindless texture and surface objects for kernel-side sampled +reads and typed load/store. :class:`CUDAArray` is allocated through +:meth:`CUDAArray.from_descriptor` and bound through a :class:`ResourceDescriptor` +factory; linear (1D) and row-pitched 2D :class:`Buffer` views as well as +mipmapped allocations (:class:`MipmappedArray`) are also supported as texture +backings. + +.. autosummary:: + :toctree: generated/ + + :template: autosummary/cyclass.rst + + CUDAArray + MipmappedArray + ResourceDescriptor + TextureObject + SurfaceObject + + :template: dataclass.rst + + TextureDescriptor + +.. 
autosummary:: + :toctree: generated/ + + ArrayFormat + AddressMode + FilterMode + ReadMode + + CUDA compilation toolchain -------------------------- diff --git a/cuda_core/examples/gl_interop_bloom.py b/cuda_core/examples/gl_interop_bloom.py new file mode 100644 index 00000000000..66fa95f1f61 --- /dev/null +++ b/cuda_core/examples/gl_interop_bloom.py @@ -0,0 +1,793 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates the cuda.core texture/surface stack used to build a +# bloom / glow post-effect entirely on the GPU. An animated HDR-ish scene is +# rendered into the base level of a MipmappedArray; the mip pyramid is then +# built level by level via SurfaceObject writes (each level reads the one above +# through its own LINEAR TextureObject); finally a single mipmapped +# TextureObject samples several LODs with tex2DLod to composite a soft bloom on +# top of the sharp scene. Requires pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# The least-demonstrated corner of the texture/surface API: the two halves of a +# mip pyramid round-trip. +# +# - BUILD side: MipmappedArray.get_level(i) returns a NON-OWNING CUDAArray view +# of level i. Bind each level as its own SurfaceObject and have a kernel write +# into it. We downsample by reading level i-1 through a per-level LINEAR +# TextureObject (one bilinear tap == a 2x2 box average) and storing into +# level i through that level's SurfaceObject. This is a mip chain built +# *on the GPU*, not by the driver. 
+# - SAMPLE side: ONE mipmapped TextureObject (FilterMode.LINEAR + +# mipmap_filter_mode=LINEAR, normalized coords) bound to the whole pyramid via +# ResourceDescriptor.from_mipmapped_array lets a single tex2DLod read +# any level -- the blurred coarse levels are exactly the glow. +# +# How it works +# ============ +# Bloom is "blur the bright parts, add them back." A mip pyramid is a ready-made +# multi-scale blur: each coarser level is a halved, box-filtered copy of the +# finer level before it, so reading a high LOD is reading a heavily blurred image. +# +# level 0: 512 x 512 <- sharp animated scene (the emitters) +# level 1: 256 x 256 (downsampled via SurfaceObject write) +# level 2: 128 x 128 +# ... +# level L-1: small <- the softest, widest glow +# +# PER FRAME (render loop) +# ~~~~~~~~~~~~~~~~~~~~~~~ +# 1. render_scene -- writes an animated scene of moving bright emitters into +# level 0 through its SurfaceObject (float4 RGBA, values +# can exceed 1.0 in the hot spots). +# 2. downsample -- for i in 1..L-1, read level i-1 through its LINEAR +# TextureObject and write level i through its +# SurfaceObject. A single LINEAR tap at the midpoint of +# the parent's 2x2 footprint *is* the box average. +# 3. composite -- one mipmapped TextureObject; tex2DLod at lod 0 gives the +# sharp scene, and a weighted sum of lods 1..L-1 gives the +# bloom. Tonemap with 1 - exp(-c*x) and write RGBA8 to the +# OpenGL PBO. +# +# surf2Dwrite indexes x in BYTES, so a float4 write uses x * sizeof(float4) +# (= x * 16). Getting this wrong silently corrupts every fourth column. +# +# What you should see +# =================== +# Several colored emitters orbiting on a dark background, each wrapped in a soft +# glow. Bright cores bleed light into their surroundings. +# +# + / = bloom strength += 0.15 +# - bloom strength -= 0.15 +# [ bloom threshold -= 0.05 (more of the scene glows) +# ] bloom threshold += 0.05 (only the brightest glow) +# , / . 
mipmap_level_bias -= / += 0.25 (sharper / softer glow) +# ; / ' LODs summed -= / += 1 (the live max-LOD clamp) +# B toggle bloom on / off (makes the effect obvious) +# R reset all controls +# Escape / close quit +# +# The window title shows FPS plus the live mipmap LOD-selection config +# (MipmappedArray level count, trilinear tex2DLod bias / clamp / LODs) and the +# bloom strength, threshold, and on/off state. +# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import math +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + ArrayFormat, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + MipmappedArray, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Configuration (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 800 +HEIGHT = 600 +BASE_SIZE = 512 # Mip base-level edge length (power of two so levels halve cleanly). +MAX_LEVELS = 7 # Modest cap on pyramid depth; bounded by log2(BASE_SIZE)+1. +NUM_EMITTERS = 7 + +BLOOM_STRENGTH_STEP = 0.15 +BLOOM_THRESHOLD_STEP = 0.05 + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA, OpenGL, and the mip pyramid. If you're here +# to learn about MipmappedArray / per-level SurfaceObject writes / mipmapped +# TextureObject sampling, skip straight to main() -- the interesting part is +# there. These helpers keep main() reading like a short story. 
+# ============================================================================ + + +def _check_compute_capability(dev): + """Surface load/store + mipmapped arrays require sm_30+.""" + cc = dev.compute_capability + if cc.major < 3: + print( + f"This example requires compute capability >= 3.0, got sm_{cc.major}{cc.minor}.", + file=sys.stderr, + ) + sys.exit(1) + + +def setup_cuda(): + """Compile the three kernels and return (device, stream, kernels). + + kernels is a dict with keys "render_scene", "downsample", "composite". + """ + dev = Device(0) + dev.set_current() + _check_compute_capability(dev) + stream = dev.create_stream() + + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options) + mod = prog.compile( + "cubin", + name_expressions=("render_scene", "downsample", "composite"), + ) + kernels = { + "render_scene": mod.get_kernel("render_scene"), + "downsample": mod.get_kernel("downsample"), + "composite": mod.get_kernel("composite"), + } + return dev, stream, kernels + + +def make_level_grid(level_size, block): + """2D launch grid covering a (level_size x level_size) image.""" + return ( + (level_size + block[0] - 1) // block[0], + (level_size + block[1] - 1) // block[1], + 1, + ) + + +def create_window(): + """Open a pyglet window and return (window, gl_module, pyglet).""" + try: + import pyglet + from pyglet.gl import gl as _gl + except ImportError: + print( + "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet", + file=sys.stderr, + ) + sys.exit(1) + + window = pyglet.window.Window( + WIDTH, + HEIGHT, + caption="cuda.core MipmappedArray - GPU mip-pyramid bloom", + vsync=False, + ) + return window, _gl, pyglet + + +def create_display_resources(gl, width, height): + """Standard GL boilerplate: a shader program, a fullscreen quad, and an + empty texture that we'll repeatedly fill from a PBO. Not CUDA-specific. 
+ + Returns (shader_program, vertex_array_id, texture_id). + """ + from pyglet.graphics.shader import Shader, ShaderProgram + + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + quad_verts = np.array( + [ + # x, y, s, t (position + texture coordinate) + -1, + -1, + 0, + 0, + 1, + -1, + 1, + 0, + 1, + 1, + 1, + 1, + -1, + -1, + 0, + 0, + 1, + 1, + 1, + 1, + -1, + 1, + 0, + 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 # 4 floats * 4 bytes each + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + + gl.glBindVertexArray(0) + + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR) + gl.glTexImage2D( + gl.GL_TEXTURE_2D, + 0, + gl.GL_RGBA8, + width, + height, + 0, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + + return shader_prog, vao.value, tex.value + + +def create_pixel_buffer(gl, width, height): + """Create a Pixel Buffer Object (PBO) -- the CUDA/GL bridge. + + Returns (pbo_gl_name, size_in_bytes). 
+ """ + pbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(pbo)) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value) + nbytes = width * height * 4 # RGBA8 + gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + return pbo.value, nbytes + + +def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height): + """Copy pixel data from the PBO into the GL texture (GPU-to-GPU).""" + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glTexSubImage2D( + gl.GL_TEXTURE_2D, + 0, + 0, + 0, + width, + height, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + + +def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id): + """Draw the texture to the screen using the fullscreen quad.""" + gl.glUseProgram(shader_prog.id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glBindVertexArray(vao_id) + gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6) + gl.glBindVertexArray(0) + gl.glUseProgram(0) + + +# ================================== main() ================================== + + +def main(): + # --- Step 1: Set up CUDA (compile kernels, create stream) --- + dev, stream, kernels = setup_cuda() + + # --- Step 2: Allocate the mip pyramid (single allocation, all levels) --- + # is_surface_load_store=True is required so each level can back a + # SurfaceObject for kernel-side writes. We cap the depth at MAX_LEVELS; + # each level halves until 1x1 at most. 
+ num_levels = min(int(math.log2(BASE_SIZE)) + 1, MAX_LEVELS) + mm = MipmappedArray.from_descriptor( + shape=(BASE_SIZE, BASE_SIZE), + format=ArrayFormat.FLOAT32, + num_channels=4, + num_levels=num_levels, + is_surface_load_store=True, + ) + + # --- Step 3: Pre-create per-level handles ONCE and keep them alive --- + # For every level we build a SurfaceObject (to write into it) and a + # non-mipmapped LINEAR TextureObject (so the downsample kernel can read + # the level above with hardware bilinear). get_level(i) returns a + # NON-OWNING view -- the storage belongs to `mm`, which we keep alive. + # Building these per-frame would be wasteful and, worse, a handle closed + # before its async launch runs would dangle. + level_sizes = [BASE_SIZE >> i for i in range(num_levels)] + level_arrays = [mm.get_level(i) for i in range(num_levels)] # keep views alive + + src_tex_desc = TextureDescriptor( + address_mode=AddressMode.CLAMP, + filter_mode=FilterMode.LINEAR, # one bilinear tap == 2x2 box average + read_mode=ReadMode.ELEMENT_TYPE, + normalized_coords=False, # integer/pixel coordinates for the box tap + ) + level_surfaces = [SurfaceObject.from_array(arr) for arr in level_arrays] + level_textures = [ + TextureObject.from_descriptor( + resource=ResourceDescriptor.from_array(arr), + texture_descriptor=src_tex_desc, + ) + for arr in level_arrays + ] + + # --- Step 4: One mipmapped TextureObject over the WHOLE pyramid --- + # This is the sample side: tex2DLod can fetch any LOD from it, so the + # composite kernel reads the sharp scene (lod 0) and the blurred glow + # (lods 1..L-1) through this single handle. WRAP/MIRROR need normalized + # coords; we use CLAMP + normalized so a level's edge does not bleed in. + # + # API MAP -- the mip pyramid round-trip + # ===================================== + # BUILD on the GPU: MipmappedArray.from_descriptor(...) 
allocates the + # whole chain; mm.get_level(i) hands back a NON-OWNING + # CUDAArray view of each level that we bind to a + # per-level SurfaceObject and write into (the loop in + # on_draw). The driver never builds the mips -- we do. + # READ it back: ResourceDescriptor.from_mipmapped_array(mm) wraps the + # SAME chain in ONE mipmapped TextureObject. tex2DLod + # then samples any LOD with trilinear filtering. + # LOD selection knobs (TextureDescriptor): + # mipmap_filter_mode=LINEAR -> trilinear: blend BETWEEN the two nearest + # integer LODs (vs NEAREST = snap to one). + # mipmap_level_bias -> constant added to the requested LOD. + # min/max_mipmap_level_clamp -> clamp the effective LOD to a range. + # These descriptor fields are baked at construction (the texture is created + # ONCE, per the invariants). To demonstrate them INTERACTIVELY, the + # composite kernel folds the SAME bias/clamp math into its explicit + # tex2DLod `lod` argument -- live keys move bias / max-LOD without ever + # rebuilding the texture, while the descriptor encodes the static defaults. 
+ mip_tex_desc = TextureDescriptor( + address_mode=AddressMode.CLAMP, + filter_mode=FilterMode.LINEAR, + read_mode=ReadMode.ELEMENT_TYPE, + normalized_coords=True, + mipmap_filter_mode=FilterMode.LINEAR, # trilinear between levels + mipmap_level_bias=0.0, + min_mipmap_level_clamp=0.0, + max_mipmap_level_clamp=float(num_levels - 1), + ) + mip_tex = TextureObject.from_descriptor( + resource=ResourceDescriptor.from_mipmapped_array(mm), + texture_descriptor=mip_tex_desc, + ) + + # --- Step 5: Open a window and set up the GL/CUDA bridge --- + window, gl, pyglet = create_window() + shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT) + pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT) + resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard") + + # --- Step 6: Render loop state + launch configs --- + state = { + "strength": 1.8, # bloom intensity multiplier + "threshold": 0.6, # only luminance above this contributes to bloom + "bloom_on": True, + # --- Live LOD-selection controls (folded into the tex2DLod loop) --- + "bias": 0.5, # mipmap_level_bias added to each bloom tap's LOD + "num_lods": max(1, num_levels - 1), # how many LODs the bloom sums + "min_clamp": 0.0, # min_mipmap_level_clamp (shown; static default) + } + max_clamp = float(num_levels - 1) # max_mipmap_level_clamp ceiling + start_time = time.monotonic() + frame_count = [0] + fps_time = [start_time] + + block = (16, 16, 1) + # The composite kernel covers the WIDTHxHEIGHT screen. + composite_config = LaunchConfig(grid=make_level_grid_screen(block), block=block) + + @window.event + def on_draw(): + window.clear() + t = time.monotonic() - start_time + + # (a) Render the animated HDR-ish scene into level 0's surface. 
+ launch( + stream, + LaunchConfig(grid=make_level_grid(BASE_SIZE, block), block=block), + kernels["render_scene"], + np.uint64(level_surfaces[0].handle), + np.int32(BASE_SIZE), + np.int32(BASE_SIZE), + np.float32(t), + np.int32(NUM_EMITTERS), + ) + + # (b) Build the pyramid on the GPU: each level i reads level i-1 via its + # LINEAR TextureObject and writes level i via its SurfaceObject. + for i in range(1, num_levels): + dst_size = level_sizes[i] + launch( + stream, + LaunchConfig(grid=make_level_grid(dst_size, block), block=block), + kernels["downsample"], + np.uint64(level_textures[i - 1].handle), # read parent level + np.uint64(level_surfaces[i].handle), # write this level + np.int32(dst_size), + ) + + # (c) Composite: one mipmapped texture, sample several LODs, tonemap, + # and write RGBA8 straight into the PBO. + with resource.map(stream=stream) as buf: + launch( + stream, + composite_config, + kernels["composite"], + buf.handle, + np.int32(WIDTH), + np.int32(HEIGHT), + np.uint64(mip_tex.handle), + np.float32(state["strength"]), + np.float32(state["threshold"]), + np.int32(state["num_lods"]), # # of bloom LODs summed (max-clamp) + np.float32(state["bias"]), # mipmap_level_bias folded into tex2DLod + np.float32(max_clamp), # max_mipmap_level_clamp ceiling + np.int32(1 if state["bloom_on"] else 0), + ) + # Unmap happens automatically when the `with` block exits. 
+ + copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT) + draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id) + + frame_count[0] += 1 + now = time.monotonic() + if now - fps_time[0] >= 1.0: + fps = frame_count[0] / (now - fps_time[0]) + window.set_caption( + f"GPU mip-pyramid bloom ({WIDTH}x{HEIGHT}, {fps:.0f} FPS) | " + f"MipmappedArray[{num_levels} lvls] + tex2DLod[trilinear, " + f"bias={state['bias']:+.2f}, " + f"clamp={state['min_clamp']:.0f}..{max_clamp:.0f}, " + f"lods={state['num_lods']}] | " + f"bloom={state['strength']:.2f} " + f"thr={state['threshold']:.2f} " + f"{'ON' if state['bloom_on'] else 'OFF'}" + ) + frame_count[0] = 0 + fps_time[0] = now + + @window.event + def on_key_press(symbol, _modifiers): + key = pyglet.window.key + if symbol == key.ESCAPE: + window.close() + elif symbol in (key.PLUS, key.EQUAL, key.NUM_ADD): + state["strength"] = min(8.0, state["strength"] + BLOOM_STRENGTH_STEP) + elif symbol in (key.MINUS, key.NUM_SUBTRACT): + state["strength"] = max(0.0, state["strength"] - BLOOM_STRENGTH_STEP) + elif symbol == key.BRACKETLEFT: + state["threshold"] = max(0.0, state["threshold"] - BLOOM_THRESHOLD_STEP) + elif symbol == key.BRACKETRIGHT: + state["threshold"] = min(4.0, state["threshold"] + BLOOM_THRESHOLD_STEP) + elif symbol == key.COMMA: + state["bias"] = max(-float(num_levels - 1), state["bias"] - 0.25) + elif symbol == key.PERIOD: + state["bias"] = min(float(num_levels - 1), state["bias"] + 0.25) + elif symbol == key.SEMICOLON: + state["num_lods"] = max(1, state["num_lods"] - 1) + elif symbol == key.APOSTROPHE: + state["num_lods"] = min(num_levels - 1, state["num_lods"] + 1) + elif symbol == key.B: + state["bloom_on"] = not state["bloom_on"] + elif symbol == key.R: + state["strength"] = 1.8 + state["threshold"] = 0.6 + state["bloom_on"] = True + state["bias"] = 0.5 + state["num_lods"] = max(1, num_levels - 1) + + @window.event + def on_close(): + # Release CUDA-side resources in reverse construction order. 
GL objects +# clean up via pyglet on window close. `mm` is closed AFTER the +# per-level surfaces/textures, which reference its (non-owning) level views. + resource.close() + mip_tex.close() + for tex in level_textures: + tex.close() + for surf in level_surfaces: + surf.close() + mm.close() + stream.close() + + pyglet.app.run(interval=0) + + +def make_level_grid_screen(block): + """2D launch grid covering the WIDTH x HEIGHT screen.""" + return ( + (WIDTH + block[0] - 1) // block[0], + (HEIGHT + block[1] - 1) // block[1], + 1, + ) + + +# ======================== GPU code (CUDA + GLSL) ============================ +# +# Three CUDA kernels are concatenated into one program string so they share a +# single NVRTC compile. All three operate on float4 RGBA pixels. +# +# render_scene -- writes an animated scene of moving bright emitters into mip +# level 0 via a SurfaceObject. Hot cores exceed 1.0 so the +# bloom has something to bleed. NOTE: surf2Dwrite's x is in +# BYTES, so we multiply by sizeof(float4) (= 16). +# +# downsample -- reads level L-1 through a LINEAR TextureObject and writes +# level L through a SurfaceObject. With LINEAR filtering and +# non-normalized coords, ONE tap at the midpoint of the +# parent's 2x2 footprint -- (2x + 1.0, 2y + 1.0) -- equals the +# 4-texel box average. (A POINT-sampled +0.5 offset would be +# a single texel, NOT the average; the +1.0 midpoint is the +# crux of this example.) +# +# composite -- samples the WHOLE pyramid through one mipmapped texture. +# tex2DLod(...,0) is the sharp scene; a weighted sum of +# tex2DLod(...,lod) for lod 1..maxLod is the blurred glow. +# We threshold the glow's luminance, scale by `strength`, +# add the sharp scene, tonemap with 1-exp(-x), write RGBA8. +# +# GLSL shaders at the very bottom just draw a textured quad. Nothing CUDA- 
+# +# ============================================================================ + +KERNEL_SOURCE = r""" +__device__ __forceinline__ float clampf(float v, float a, float b) { + return fminf(fmaxf(v, a), b); +} + +__device__ __forceinline__ float luminance(float4 c) { + return 0.2126f * c.x + 0.7152f * c.y + 0.0722f * c.z; +} + +// -------------------------------------------------------------------------- +// render_scene: animated bright emitters on a dark background -> level 0. +// +// `surf` is a SurfaceObject bound to mip level 0 (float4 RGBA). Each emitter +// orbits the center and contributes a sharp colored core whose intensity can +// exceed 1.0, giving the bloom pass something to bleed. +// -------------------------------------------------------------------------- +extern "C" __global__ +void render_scene(cudaSurfaceObject_t surf, int width, int height, + float t, int num_emitters) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + float u = ((float)x + 0.5f) / (float)width; + float v = ((float)y + 0.5f) / (float)height; + + // Faint moving background wash so the frame is never fully black. + float bg = 0.04f + 0.02f * sinf(6.2831853f * (u + v) + t * 0.5f); + float3 color = make_float3(bg * 0.4f, bg * 0.5f, bg * 0.9f); + + // Accumulate emitters: each orbits the center on its own radius/phase. + for (int i = 0; i < num_emitters; ++i) { + float fi = (float)i; + float phase = t * (0.4f + 0.12f * fi) + fi * 2.3998f; // golden-ish spread + float radius = 0.18f + 0.06f * fi / fmaxf(1.0f, (float)(num_emitters - 1)); + float ex = 0.5f + radius * cosf(phase); + float ey = 0.5f + radius * sinf(phase * 1.13f); + + float dx = u - ex; + float dy = v - ey; + float d2 = dx * dx + dy * dy; + + // Tight bright core (Gaussian) plus a gentle per-emitter pulse so the + // HDR peak breathes and the bloom halo visibly swells. 
1/sigma^2 sets + // the core size; the smaller multiplier here widens the hot spot a bit + // so coarse LODs pick up plenty of energy to bleed. + float pulse = 0.75f + 0.25f * sinf(t * (1.3f + 0.17f * fi) + fi); + float core = expf(-d2 * 3200.0f); + float hot = 3.0f * pulse * core; // peak well above 1.0 -> blooms strongly + + // Per-emitter hue cycling through R/G/B-ish triplets. + float hue = fi * 1.0471975f + t * 0.2f; // 60 deg steps + slow drift + float3 tint = make_float3( + 0.5f + 0.5f * sinf(hue), + 0.5f + 0.5f * sinf(hue + 2.0943951f), + 0.5f + 0.5f * sinf(hue + 4.1887902f)); + + color.x += hot * tint.x; + color.y += hot * tint.y; + color.z += hot * tint.z; + } + + float4 px = make_float4(color.x, color.y, color.z, 1.0f); + + // surf2Dwrite indexes x in BYTES: float4 is 16 bytes. + surf2Dwrite(px, surf, x * (int)sizeof(float4), y); +} + +// -------------------------------------------------------------------------- +// downsample: halve the parent level into this level via a single LINEAR tap. +// +// `src` is a LINEAR-filtered TextureObject bound to the parent level (L-1). +// `dst` is a SurfaceObject bound to this level (L). dst_size is L's edge. +// +// With non-normalized coords, tex2D returns texel (i,j) when sampled at +// (i+0.5, j+0.5). For output texel (x,y) the parent 2x2 footprint covers +// parent texels (2x,2y), (2x+1,2y), (2x,2y+1), (2x+1,2y+1). The midpoint of +// those four centers is (2x+1.0, 2y+1.0); LINEAR filtering there blends all +// four at weight 0.25 each -- exactly the box average. (NOT +0.5, which would +// land on one texel center and return a single texel.) 
+// -------------------------------------------------------------------------- +extern "C" __global__ +void downsample(cudaTextureObject_t src, + cudaSurfaceObject_t dst, + int dst_size) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= dst_size || y >= dst_size) return; + + float fx = 2.0f * (float)x + 1.0f; + float fy = 2.0f * (float)y + 1.0f; + + float4 px = tex2D(src, fx, fy); + + surf2Dwrite(px, dst, x * (int)sizeof(float4), y); +} + +// -------------------------------------------------------------------------- +// composite: sharp scene + multi-LOD bloom, tonemapped, into the PBO. +// +// `mip_tex` is ONE mipmapped TextureObject over the whole pyramid. tex2DLod at +// lod 0 is the sharp scene; lods 1..max_lod are progressively blurrier copies +// that form the glow. We threshold each blurred sample's luminance so only the +// bright parts bloom, weight coarser (wider) levels a bit less, scale by +// `strength`, add the sharp scene, and tonemap. +// -------------------------------------------------------------------------- +extern "C" __global__ +void composite(unsigned char *output, + int width, + int height, + cudaTextureObject_t mip_tex, + float strength, + float threshold, + int num_lods, + float bias, + float max_lod, + int bloom_on) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + float u = ((float)x + 0.5f) / (float)width; + float v = ((float)y + 0.5f) / (float)height; + + // Sharp scene from the base level. The base sample stays at lod 0 -- bias is + // applied only to the bloom taps below, so the scene never blurs. + float4 scene = tex2DLod(mip_tex, u, v, 0.0f); + float3 hdr = make_float3(scene.x, scene.y, scene.z); + + if (bloom_on) { + // Sum the blurred levels. Each coarser level covers a wider area, so we + // taper its weight to keep the glow soft rather than flat. 
+ // + // This loop is where the live LOD-selection knobs live: `num_lods` is the + // max-clamp (how high up the pyramid we read), and `bias` is the + // mipmap_level_bias folded into the explicit tex2DLod `lod` argument. + // We clamp the effective LOD to [0, max_lod] so a positive bias can never + // index past the top of the pyramid. + float3 bloom = make_float3(0.0f, 0.0f, 0.0f); + float weight_sum = 0.0f; + for (int lod = 1; lod <= num_lods; ++lod) { + float eff_lod = clampf((float)lod + bias, 0.0f, max_lod); + float4 s = tex2DLod(mip_tex, u, v, eff_lod); + // Soft-knee threshold: keep only the energy above `threshold`. + float lum = luminance(s); + float excess = fmaxf(lum - threshold, 0.0f); + float keep = (lum > 1e-4f) ? (excess / lum) : 0.0f; + + float w = 1.0f / (float)lod; // finer blurred levels weigh more + bloom.x += w * keep * s.x; + bloom.y += w * keep * s.y; + bloom.z += w * keep * s.z; + weight_sum += w; + } + if (weight_sum > 0.0f) { + float inv = strength / weight_sum; + hdr.x += bloom.x * inv; + hdr.y += bloom.y * inv; + hdr.z += bloom.z * inv; + } + } + + // Tonemap HDR -> [0,1] with a simple exposure curve, then to 8-bit. + float r = 1.0f - expf(-hdr.x); + float g = 1.0f - expf(-hdr.y); + float b = 1.0f - expf(-hdr.z); + + int idx = (y * width + x) * 4; + output[idx + 0] = (unsigned char)(clampf(r, 0.0f, 1.0f) * 255.0f); + output[idx + 1] = (unsigned char)(clampf(g, 0.0f, 1.0f) * 255.0f); + output[idx + 2] = (unsigned char)(clampf(b, 0.0f, 1.0f) * 255.0f); + output[idx + 3] = 255; +} +""" + +# GLSL shaders -- these just display a texture on a fullscreen rectangle. +# Nothing CUDA-specific here. 
+ +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_caustics.py b/cuda_core/examples/gl_interop_caustics.py new file mode 100644 index 00000000000..5fe57e256f0 --- /dev/null +++ b/cuda_core/examples/gl_interop_caustics.py @@ -0,0 +1,730 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.CUDAArray, TextureObject, and +# GraphicsResource for CUDA/OpenGL interop. A tiled pool-floor image is uploaded +# once into a 2D CUDAArray and bound as a TextureObject sampled with +# FilterMode.LINEAR + AddressMode.MIRROR + normalized coordinates. Each frame a +# `render_water` kernel evaluates an animated water surface analytically, refracts +# the background lookup UVs through it, and overlays a bright caustic network +# computed from where the refraction focuses, writing RGBA8 straight into an +# OpenGL PBO. The effect is "looking down at a sunlit pool". Requires pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to upload a host numpy image into a CUDAArray with `CUDAArray.copy_from` +# (host layout (H, W, 4) uint8 row-major for an array allocated as +# shape=(WIDTH, HEIGHT)) and bind it as a long-lived TextureObject. 
+# - Why FilterMode.LINEAR + AddressMode.MIRROR + normalized_coords=True is the +# right pairing for a refraction effect: refracted UV lookups routinely fall +# slightly outside [0, 1], and MIRROR returns a sensible mirrored pixel rather +# than a clamped smear or a hard edge, while LINEAR keeps the warp smooth. +# - Why srgb=True is the correct setting for an 8-bit color image: the texels +# are decoded sRGB->linear on read, the kernel does its lighting and tonemap +# in linear light, then re-encodes to sRGB on output (the gamma-correct +# "sample in linear, tonemap, output" pipeline). +# - Why max_anisotropy is justified here: refraction samples the texture at +# grazing, stretched angles, the case anisotropic filtering exists to clean +# up. +# - That the animated water normal field is computed ANALYTICALLY in the kernel +# (a sum of moving directional sine waves plus a few expanding circular +# ripples), so there is no second CUDAArray and no SurfaceObject pass -- the +# normal and its curvature are evaluated per pixel from a `time` uniform. +# - How to feed a small fixed ring of interactive click-ripples to the kernel +# purely as scalar launch arguments (the demonstrated launch convention), +# avoiding any custom device-buffer machinery. +# +# How it works +# ============ +# Startup (once): +# +-------------------+ copy_from +-----------+ +# | host numpy image | ------------> | CUDAArray | (UINT8 RGBA, vivid grid) +# +-------------------+ +-----+-----+ +# | +# v +# +-------------+ +# | TextureObj | LINEAR + MIRROR + norm +# +-------------+ +# +# Each frame (render_water kernel, 2D over the screen): +# 1. Evaluate the water height/normal at this pixel from the analytic wave +# sum (directional waves + circular ripples) using the `time` uniform. +# 2. Refract: offset the background sample UV by `refract` * (the water +# surface gradient) -- a cheap 2D approximation of bending the view ray. +# 3. 
Sample the background TextureObject at the perturbed UV (LINEAR + +# MIRROR keeps it smooth and well-defined outside [0, 1]). +# 4. Caustics: the refraction map (u,v)->(su,sv) focuses light where its +# Jacobian determinant approaches zero. We light a thin band around that +# det->0 curve to draw the bright, interconnected caustic network, then +# add a depth tint (deeper = bluer) and faint specular glints. +# 5. Tonemap and write RGBA8 into the OpenGL PBO. No PCIe traffic per frame. +# +# Why MIRROR (not WRAP or CLAMP)? +# ------------------------------- +# WRAP and MIRROR both require normalized coordinates. WRAP tiles the image, so +# a refraction pushing past the right edge suddenly shows the far-left content +# (a visible seam). CLAMP smears the edge texel into a streak. MIRROR reflects +# the image at the boundary, which for a small refraction offset looks like the +# pool simply continuing -- the most natural choice here. +# +# What you should see +# =================== +# A tiled aqua pool floor seen through gently moving water, overlaid with a +# bright, shifting network of caustic light filaments. Press +/- to change the +# water/refraction strength, click anywhere to spawn an expanding circular +# ripple at the cursor, and Escape to exit. The title shows FPS and the current +# strength. 
def make_background_image(size):
    """Render the pool-floor backdrop as a (size, size, 4) uint8 RGBA array.

    The result is indexed (y, x, channel): ``size`` rows of ``size`` RGBA
    texels, row-major. The image is a 6 x 6 grid of aqua tiles separated by a
    darker teal grout band, with a small pseudo-random brightness offset per
    tile. Alpha is fully opaque everywhere.
    """
    rows, cols = np.indices((size, size), dtype=np.float32)
    u = cols / size
    v = rows / size

    tiles_per_side = 6.0

    def center_offset(coord):
        # 0 at the middle of a tile, rising to 1 at the tile boundary.
        return np.abs(((coord * tiles_per_side) % 1.0) - 0.5) * 2.0

    # A texel belongs to the grout once either axis is close to a boundary;
    # the last 18% of the way out is ramped linearly from 0 to 1.
    edge = np.maximum(center_offset(u), center_offset(v))
    grout = np.clip((edge - 0.82) / 0.18, 0.0, 1.0)

    # Cheap sine hash of the integer tile index -> brightness in [0.92, 1.0).
    tile_id = np.floor(u * tiles_per_side) + np.floor(v * tiles_per_side) * 31.0
    jitter = (np.sin(tile_id * 12.9898) * 43758.5453) % 1.0
    shade = 0.92 + 0.08 * jitter

    tile_rgb = np.array([0.30, 0.66, 0.74], dtype=np.float32)
    grout_rgb = np.array([0.12, 0.34, 0.42], dtype=np.float32)

    # Blend all three color channels at once by broadcasting over a trailing
    # channel axis (same arithmetic, in the same order, as a per-channel loop).
    blended = (tile_rgb * shade[..., None]) * (1.0 - grout)[..., None] + grout_rgb * grout[..., None]

    img = np.zeros((size, size, 4), dtype=np.uint8)
    img[..., :3] = (np.clip(blended, 0.0, 1.0) * 255.0).astype(np.uint8)
    img[..., 3] = 255
    return img
def create_window():
    """Create the pyglet window used for display.

    pyglet is imported lazily so that a missing installation produces a short
    install hint on stderr and exit status 1 instead of a traceback.

    Returns:
        A ``(window, gl_module, pyglet_module)`` tuple.
    """
    try:
        import pyglet
        from pyglet.gl import gl as gl_module
    except ImportError:
        hint = "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet"
        print(hint, file=sys.stderr)
        sys.exit(1)

    title = "cuda.core CUDAArray + TextureObject - Water Caustics"
    # vsync is disabled so the frame rate is not capped by the display.
    win = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
    return win, gl_module, pyglet
def create_pixel_buffer(gl, width, height):
    """Allocate the GL pixel-unpack buffer that CUDA fills every frame.

    The buffer holds ``width * height`` RGBA8 pixels (4 bytes each) and is
    created without initial data. The unpack binding is reset to 0 before
    returning.

    Returns:
        The GL buffer name of the new PBO.
    """
    target = gl.GL_PIXEL_UNPACK_BUFFER
    size_in_bytes = width * height * 4

    handle = ctypes.c_uint(0)
    gl.glGenBuffers(1, ctypes.byref(handle))
    gl.glBindBuffer(target, handle.value)
    gl.glBufferData(target, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
    gl.glBindBuffer(target, 0)
    return handle.value


def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
    """Upload the PBO contents into the 2D texture ``tex_id``.

    With a pixel-unpack buffer bound, the ``None`` data argument of
    glTexSubImage2D makes the upload read from the start of that buffer, so
    the pixels are not passed through a host array here.
    """
    unpack = gl.GL_PIXEL_UNPACK_BUFFER
    tex_target = gl.GL_TEXTURE_2D

    gl.glBindBuffer(unpack, pbo_id)
    gl.glBindTexture(tex_target, tex_id)
    # level 0, offset (0, 0), full width x height, RGBA8 source pixels.
    gl.glTexSubImage2D(tex_target, 0, 0, 0, width, height, gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None)
    gl.glBindBuffer(unpack, 0)


def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
    """Draw ``tex_id`` over the whole window with the fullscreen quad.

    Binds the shader program, texture and vertex array, issues the
    six-vertex (two-triangle) draw, then unbinds the vertex array and program.
    """
    program_id = shader_prog.id
    gl.glUseProgram(program_id)
    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
    gl.glBindVertexArray(vao_id)
    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)
    gl.glBindVertexArray(0)
    gl.glUseProgram(0)
with the API MAP comment + live caption + + +def make_background_texture(arr): + """Bind `arr` as a TextureObject for LINEAR + MIRROR + normalized sampling. + + MIRROR (like WRAP) requires normalized coordinates. UINT8 source + + NORMALIZED_FLOAT means tex2D returns each channel in [0, 1]. + + API MAP: UINT8 RGBA CUDAArray sampled as TextureObject[LINEAR | MIRROR | + NORMALIZED_FLOAT | srgb | max_anisotropy=8]; MIRROR handles refracted UVs + that leave [0,1]; srgb does the gamma-correct decode; anisotropy cleans up + grazing-angle sampling. + + Two TextureDescriptor features are showcased here on an 8-bit color image: + + - srgb=True: the background is UINT8 RGBA authored in perceptual space, so + enabling sRGB->linear conversion on read is the correct thing to do -- + the kernel then does all of its lighting/tonemap math in linear light and + re-encodes to sRGB on output (the final pow(c, 1/2.2) below). This is the + gamma-correct "sample in linear, tonemap, output" pipeline. + - max_anisotropy=8: refraction samples the texture at grazing, stretched + angles, which is exactly the case anisotropic filtering is meant to clean + up, so we request it on the background texture. + """ + res_desc = ResourceDescriptor.from_array(arr) + tex_desc = TextureDescriptor( + address_mode=AddressMode.MIRROR, + filter_mode=FilterMode.LINEAR, + read_mode=ReadMode.NORMALIZED_FLOAT, + # MIRROR/WRAP addressing modes require normalized coordinates. + normalized_coords=True, + # 8-bit color image -> decode sRGB to linear on read so the lighting and + # tonemap math runs in linear light (re-encoded to sRGB on output). + srgb=True, + # Refraction samples at grazing/stretched angles; anisotropic filtering + # cleans those up. 
def main():
    """Run the interactive water-caustics demo until the window is closed.

    Sets up CUDA and the pyglet window, uploads the background image into a
    CUDAArray once, binds it as a TextureObject, and registers the pyglet
    event handlers. Every draw maps the OpenGL PBO, launches ``render_water``
    into it, copies the PBO into the screen texture and draws it.
    """
    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
    dev, stream, kernel, config = setup_cuda()

    # --- Step 2: Open a window ---
    window, gl, pyglet = create_window()

    # --- Step 3: Create GL resources (shader, fullscreen quad, screen tex) ---
    shader_prog, quad_vao, screen_tex = create_display_resources(gl, WIDTH, HEIGHT)

    # --- Step 4: Create the PBO that CUDA will write into ---
    pbo_id = create_pixel_buffer(gl, WIDTH, HEIGHT)
    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")

    # --- Step 5: Allocate the background CUDAArray and upload the image once ---
    bg_arr = CUDAArray.from_descriptor(
        shape=(BG_SIZE, BG_SIZE),
        format=ArrayFormat.UINT8,
        num_channels=4,
    )
    host_image = make_background_image(BG_SIZE)
    bg_arr.copy_from(np.ascontiguousarray(host_image), stream=stream)
    stream.sync()

    # --- Step 6: Bind the CUDAArray as a long-lived TextureObject ---
    # Created once and kept alive: `launch` is async, so a per-frame texture
    # inside a closing `with` would destroy the handle before the kernel ran.
    bg_tex = make_background_texture(bg_arr)

    # Interactive state. Each ripple slot is (origin_u, origin_v, start_time):
    # the origin is in the kernel's normalized (u, v) coordinates, the start
    # time is in seconds since `start_time`; start_time < 0 means inactive.
    state = {
        "strength": DEFAULT_STRENGTH,
        "ripples": [[0.0, 0.0, -1.0] for _ in range(MAX_RIPPLES)],
        "next_slot": 0,
    }
    start_time = time.monotonic()

    @window.event
    def on_key_press(symbol, _modifiers):
        key = pyglet.window.key
        if symbol == key.ESCAPE:
            window.close()
        elif symbol in (key.PLUS, key.EQUAL, key.NUM_ADD):
            state["strength"] = min(MAX_STRENGTH, state["strength"] + STRENGTH_STEP)
        elif symbol in (key.MINUS, key.UNDERSCORE, key.NUM_SUBTRACT):
            state["strength"] = max(MIN_STRENGTH, state["strength"] - STRENGTH_STEP)

    @window.event
    def on_mouse_press(x, y, _button, _modifiers):
        # pyglet reports the cursor with a bottom-left origin (y grows upward).
        # The kernel's v runs the opposite way on screen: the fullscreen quad
        # maps texcoord (0, 0) to the bottom-left corner, so kernel row 0 is the
        # bottom of the window, and the kernel computes
        # v = 1 - (row + 0.5) / height. That makes v ~1 at the bottom and ~0 at
        # the top, so y must be flipped here or the ripple appears vertically
        # mirrored from the click position.
        now = time.monotonic() - start_time
        slot = state["next_slot"]
        state["ripples"][slot] = [x / WIDTH, 1.0 - y / HEIGHT, now]
        # Ring buffer: the oldest slot is overwritten once all are in use.
        state["next_slot"] = (slot + 1) % MAX_RIPPLES

    # --- Step 7: Render loop ---
    frame_count = 0
    fps_time = start_time

    @window.event
    def on_draw():
        nonlocal frame_count, fps_time

        now = time.monotonic()
        t = now - start_time

        window.clear()

        # Flatten the ripple ring into the scalar args the kernel expects:
        # for each slot, (origin_x, origin_y, age) where age < 0 == inactive.
        ripple_args = []
        for ox, oy, st in state["ripples"]:
            age = (t - st) if st >= 0.0 else -1.0
            if age >= RIPPLE_LIFETIME:
                age = -1.0
            ripple_args.extend((np.float32(ox), np.float32(oy), np.float32(age)))

        with resource.map(stream=stream) as buf:
            launch(
                stream,
                config,
                kernel,
                np.uint64(bg_tex.handle),
                buf.handle,
                np.int32(WIDTH),
                np.int32(HEIGHT),
                np.float32(t),
                np.float32(state["strength"]),
                np.float32(RIPPLE_LIFETIME),
                *ripple_args,
            )
        copy_pbo_to_texture(gl, pbo_id, screen_tex, WIDTH, HEIGHT)
        draw_fullscreen_quad(gl, shader_prog, quad_vao, screen_tex)

        # Refresh the caption (FPS + current settings) about once per second.
        frame_count += 1
        if now - fps_time >= 1.0:
            fps = frame_count / (now - fps_time)
            window.set_caption(
                "cuda.core CUDAArray + TextureObject - Water Caustics "
                f"(strength={state['strength']:.2f}, {fps:.0f} FPS) "
                f"| TextureObject[LINEAR|MIRROR|sRGB|aniso={MAX_ANISOTROPY}] UINT8 "
                "[+/- strength, click = ripple, Esc = quit]"
            )
            frame_count = 0
            fps_time = now

    @window.event
    def on_close():
        # Release the texture before the array it samples, then the GL
        # interop resource and the stream.
        bg_tex.close()
        bg_arr.close()
        resource.close()
        stream.close()

    pyglet.app.run(interval=0)
A sum of a +// few moving directional waves gives the base chop; the expanding circular +// ripples from clicks ride on top. Returns height; gradient/curvature are taken +// numerically by sampling this a few times (cheap and robust). +__device__ __forceinline__ +float water_height(float px, float py, float t, + const float* rip_x, const float* rip_y, + const float* rip_age, float ripple_lifetime) { + float h = 0.0f; + + // Directional waves: (dir_x, dir_y, freq, speed, amp). + // Hand-picked so they never perfectly align (avoids an obvious repeat). + const float waves[5][5] = { + { 1.00f, 0.00f, 9.0f, 1.3f, 0.45f}, + { 0.20f, 0.98f, 12.0f, 1.0f, 0.35f}, + {-0.70f, 0.71f, 16.0f, 1.7f, 0.25f}, + { 0.80f, -0.60f, 22.0f, 2.1f, 0.18f}, + {-0.30f, -0.95f, 31.0f, 2.6f, 0.12f}, + }; + #pragma unroll + for (int i = 0; i < 5; ++i) { + float phase = (waves[i][0] * px + waves[i][1] * py) * waves[i][2] + + t * waves[i][3]; + h += waves[i][4] * sinf(phase); + } + + // Expanding circular ripples from mouse clicks. Each is a decaying radial + // wave packet whose ring radius grows with age. 
+ for (int r = 0; r < MAX_RIPPLES; ++r) { + float age = rip_age[r]; + if (age < 0.0f) continue; + float dx = px - rip_x[r]; + float dy = py - rip_y[r]; + float dist = sqrtf(dx * dx + dy * dy); + float ring = dist * 40.0f - age * 8.0f; // outward-moving ring + float envelope = expf(-dist * 6.0f); // localized in space + float fade = 1.0f - (age / ripple_lifetime); // fade over lifetime + if (fade < 0.0f) fade = 0.0f; + h += 0.9f * fade * envelope * sinf(ring); + } + return h; +} + +extern "C" +__global__ +void render_water(cudaTextureObject_t bg, + unsigned char* output, + int width, int height, + float t, + float strength, + float ripple_lifetime, +""" + + "".join( + f" float rip_x{i}, float rip_y{i}, float rip_age{i}" + + (",\n" if i < MAX_RIPPLES - 1 else ") {\n") + for i in range(MAX_RIPPLES) + ) + + r""" + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + // Pack the per-ripple scalars back into arrays so the helper can loop. + float rip_x[MAX_RIPPLES]; + float rip_y[MAX_RIPPLES]; + float rip_age[MAX_RIPPLES]; +""" + + "".join( + f" rip_x[{i}] = rip_x{i}; rip_y[{i}] = rip_y{i}; rip_age[{i}] = rip_age{i};\n" for i in range(MAX_RIPPLES) + ) + + r""" + // Normalized screen position. v increases upward to match pyglet's + // bottom-left mouse origin used when recording ripple coordinates. + float u = (x + 0.5f) / (float)width; + float v = 1.0f - (y + 0.5f) / (float)height; + + // Sample the water height field on a 3x3 stencil to get the surface + // gradient (slope -> refraction) and the full Hessian (the second + // derivatives that drive the caustic network). 
+ const float eps = 1.5f / (float)width; + float hc = water_height(u, v, t, rip_x, rip_y, rip_age, ripple_lifetime); + float hl = water_height(u - eps, v, t, rip_x, rip_y, rip_age, ripple_lifetime); + float hr = water_height(u + eps, v, t, rip_x, rip_y, rip_age, ripple_lifetime); + float hd = water_height(u, v - eps, t, rip_x, rip_y, rip_age, ripple_lifetime); + float hu = water_height(u, v + eps, t, rip_x, rip_y, rip_age, ripple_lifetime); + float hlu = water_height(u - eps, v + eps, t, rip_x, rip_y, rip_age, ripple_lifetime); + float hru = water_height(u + eps, v + eps, t, rip_x, rip_y, rip_age, ripple_lifetime); + float hld = water_height(u - eps, v - eps, t, rip_x, rip_y, rip_age, ripple_lifetime); + float hrd = water_height(u + eps, v - eps, t, rip_x, rip_y, rip_age, ripple_lifetime); + + float inv2e = 1.0f / (2.0f * eps); + float inve2 = 1.0f / (eps * eps); + float gx = (hr - hl) * inv2e; // d(height)/du + float gy = (hu - hd) * inv2e; // d(height)/dv + float hxx = (hr - 2.0f * hc + hl) * inve2; + float hyy = (hu - 2.0f * hc + hd) * inve2; + float hxy = (hru - hrd - hlu + hld) * (0.25f * inve2); + + // 2D refraction: bend the background lookup by the surface slope, kept + // small so the pool floor warps gently instead of tearing apart. Because + // the texture was bound with srgb=True the sample is already in LINEAR + // light, so the lighting/tonemap below is physically sensible and we only + // re-encode to sRGB at the very end. MIRROR keeps (su, sv) outside [0,1] + // smooth instead of a clamped streak or a wrap seam. + float refract = 0.010f * strength; + float su = u - refract * gx; + float sv = v - refract * gy; + float4 base = tex2D(bg, su, sv); + + // Caustics from the refraction map's area compression. The displacement + // (u,v) -> (su,sv) has Jacobian J = [[1 - r*hxx, -r*hxy], [-r*hxy, + // 1 - r*hyy]]. Where det(J) -> 0 neighbouring rays converge onto the same + // spot and light piles up; 1/|det| is the brightness of that focus. 
This + // is what produces the real, interconnected, animated caustic web -- not a + // generic glow. `rs` is a small lens strength tuned to the wave curvature. + float rs = 0.012f * (0.5f + 0.5f * strength); + float a = 1.0f - rs * hxx; + float dd = 1.0f - rs * hyy; + float bxy = rs * hxy; + float det = a * dd - bxy * bxy; + // The caustic is the thin CURVE where det -> 0 (rays focus to a line). We + // light up only a narrow band around it and square the ramp so the result + // is crisp bright filaments over the visible tiles, not broad foggy blobs. + // Two bands -- a tight bright core plus a fainter halo -- give the lines a + // little glow without fattening them. + float ad = fabsf(det); + float core = 1.0f - fminf(ad / 0.06f, 1.0f); + float halo = 1.0f - fminf(ad / 0.30f, 1.0f); + float caustic = core * core * 1.7f + halo * halo * 0.25f; + if (caustic > 2.0f) caustic = 2.0f; + + // Surface normal from the gradient (z points out of the water). + float nx = -gx, ny = -gy, nz = 1.0f; + float ninv = rsqrtf(nx * nx + ny * ny + nz * nz); + nx *= ninv; ny *= ninv; nz *= ninv; + + // Faint specular glints off the wavelets. + float lx = 0.3f, ly = 0.4f, lz = 0.866f; + float spec = nx * lx + ny * ly + nz * lz; + if (spec < 0.0f) spec = 0.0f; + spec = powf(spec, 60.0f) * 0.5f; + + // Water tint: a gentle blue-green cast, slightly deeper in the troughs. + float depth = 0.5f + 0.5f * hc; + float tint_r = 0.80f + 0.08f * depth; + float tint_g = 0.98f + 0.04f * depth; + float tint_b = 1.10f - 0.06f * depth; + + // Composite in LINEAR light: tinted pool floor + the white caustic web + // (a touch cooler in blue so it reads as sunlight through water) + glints. + float cr = base.x * tint_r + caustic * 0.90f + spec; + float cg = base.y * tint_g + caustic * 0.97f + spec; + float cb = base.z * tint_b + caustic * 1.00f + spec; + + // Simple Reinhard tonemap so highlights roll off instead of clipping hard. 
+ cr = cr / (1.0f + cr); + cg = cg / (1.0f + cg); + cb = cb / (1.0f + cb); + + // Encode LINEAR -> sRGB on output. This is the matching half of the + // srgb=True decode on the texture read: we sampled and lit in linear, and + // now re-encode for the 8-bit RGBA8 PBO. The ~1/2.2 exponent is the + // gamma-correct encode (and also lifts the midtones the linear decode + // darkened, so the pool reads luminous rather than murky). + cr = powf(cr, 1.0f / 2.2f); + cg = powf(cg, 1.0f / 2.2f); + cb = powf(cb, 1.0f / 2.2f); + + int idx = (y * width + x) * 4; + output[idx + 0] = (unsigned char)(fminf(cr, 1.0f) * 255.0f); + output[idx + 1] = (unsigned char)(fminf(cg, 1.0f) * 255.0f); + output[idx + 2] = (unsigned char)(fminf(cb, 1.0f) * 255.0f); + output[idx + 3] = 255; +} +""" +) + +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_clouds.py b/cuda_core/examples/gl_interop_clouds.py new file mode 100644 index 00000000000..bc8829674ef --- /dev/null +++ b/cuda_core/examples/gl_interop_clouds.py @@ -0,0 +1,991 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core's 3D CUDAArray + trilinear TextureObject by +# baking a procedural fractal-noise density volume once at startup and then +# ray-marching it every frame as participating media to render fluffy, sunlit, +# semi-transparent clouds. 
The SurfaceObject is used during the one-shot bake; +# the TextureObject (with LINEAR + WRAP + normalized coords) drives the per-frame +# volumetric ray march with Beer-Lambert absorption and self-shadowing. The +# whole pipeline stays on the GPU through GraphicsResource. Requires pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to allocate a 3D cuda.core.CUDAArray (cuArray3DCreate under the hood) and +# bind it as both a SurfaceObject (for one-shot kernel writes via surf3Dwrite) +# and a TextureObject (for hardware-accelerated trilinear tex3D sampling). +# - How to ray-march a baked scalar density volume as PARTICIPATING MEDIA: this +# goes beyond gl_interop_sdf_volume.py (which renders a hard SDF surface). Here +# the volume is fog: we accumulate color and transmittance front-to-back and +# apply Beer-Lambert absorption, with a short secondary march toward the sun +# for self-shadowing. +# - How to wire mouse + keyboard input into a pyglet/cuda.core interop loop. +# +# How it works +# ============ +# A single-channel float (FLOAT32) 3D volume (96^3) is filled once at +# startup with fractal Brownian motion (fbm) built from a cheap integer-hash +# value noise: +# +# fbm(p) = sum over octaves of amplitude * value_noise(p * frequency) +# density = remap(fbm) with a coverage threshold +# +# The volume stores only the raw noise; the cloud SHAPING (coverage threshold + +# a vertical height falloff that fades density near the top and bottom of the +# box) is applied in the RENDER kernel, not baked. That lets us ANIMATE the +# clouds for free by scrolling the sample coordinate with a `time` uniform +# (cheaper than re-baking 96^3 every frame, which would stack a second 3D launch +# on top of the already heavy raymarch). 
WRAP addressing avoids clamping the +# scrolled coordinate at the box edge (the baked field is not perfectly +# tileable, so a faint density seam sweeps through slowly); the ray-vs-box bail +# is what keeps density zero outside the volume, so WRAP is safe here. +# +# STARTUP (one-shot bake) +# ~~~~~~~~~~~~~~~~~~~~~~~ +# 1. Allocate 3D CUDAArray (96^3, FLOAT32 x1, is_surface_load_store=True). +# 2. Bind it as a SurfaceObject. +# 3. Launch `bake_density`: one thread per voxel writes fbm via surf3Dwrite. +# 4. Close the SurfaceObject; the CUDAArray stays alive. +# +# EACH FRAME +# ~~~~~~~~~~ +# 1. resource.map() -> CUDA device pointer into the OpenGL PBO. +# 2. Launch `render_clouds` (one thread per pixel). It builds an orbit-camera +# ray, intersects the [-1,1]^3 box, marches front-to-back sampling density +# via tex3D (LINEAR + WRAP + normalized coords), shades each sample +# with a short sun-ward shadow march (Beer-Lambert), accumulates over an +# analytic sky, and writes RGBA8 straight into the PBO. +# 3. Unmap, GPU-side copy PBO -> texture, draw fullscreen quad. +# +# Performance note +# ================ +# This is the most compute-heavy example here: a primary march (up to ~96 steps) +# with a nested secondary shadow march (~6 steps) per sample is O(steps^2) work +# per pixel. To keep it interactive we use a modest 96^3 volume, cap the step +# counts, and EARLY-OUT once transmittance drops below ~0.01. Lower +# PRIMARY_STEPS / VOLUME_SIZE if your GPU struggles. +# +# Controls +# ======== +# Left mouse drag orbit camera (dx -> yaw, dy -> pitch) +# Arrow keys orbit camera (keyboard alternative) +# Mouse wheel zoom (camera distance) +# + / - raise / lower the sun (changes light angle + sky glow) +# [ / ] decrease / increase cloud coverage (less / more cloud) +# R reset camera + sun + coverage +# Escape / close quit +# +# The window title shows yaw, pitch, distance, sun height, coverage, and FPS.
+# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + ArrayFormat, + CUDAArray, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Configuration (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 800 +HEIGHT = 600 +VOLUME_SIZE = 96 # 96^3 voxels; bake cost is one-shot. Lower if memory is tight. + +# Camera defaults / clamps. +RESET_YAW = 0.6 +RESET_PITCH = 0.25 +RESET_DIST = 3.2 +PITCH_MIN = -1.45 # stay inside (-pi/2, pi/2) so the up-vector stays sane. +PITCH_MAX = 1.45 +DIST_MIN = 1.5 +DIST_MAX = 9.0 + +# Lighting / shaping defaults and clamps. +RESET_SUN_HEIGHT = 0.55 # 0 = sun at horizon, 1 = sun overhead. +SUN_HEIGHT_MIN = 0.05 +SUN_HEIGHT_MAX = 0.98 +RESET_COVERAGE = 0.50 # higher = more cloud (lower density threshold). +COVERAGE_MIN = 0.20 +COVERAGE_MAX = 0.85 + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA and OpenGL. If you're here to learn about +# 3D CUDAArray / TextureObject / SurfaceObject, skip ahead to main() -- the +# interesting part is there. These helpers exist so that main() reads like a +# short story instead of a wall of boilerplate. 
def _check_compute_capability(dev):
    """Exit with status 1 unless ``dev`` reports compute capability >= 3.0.

    The example relies on 3D arrays and bindless surface/texture objects,
    which need sm_30 or newer. On an older device a one-line explanation is
    written to stderr before exiting; otherwise the function returns None.
    """
    capability = dev.compute_capability
    if capability.major >= 3:
        return
    print(
        f"This example requires compute capability >= 3.0, got sm_{capability.major}{capability.minor}.",
        file=sys.stderr,
    )
    sys.exit(1)


def setup_cuda():
    """Select device 0, compile both kernels, and create the work stream.

    Returns:
        A ``(device, stream, kernels)`` tuple where ``kernels`` maps
        ``"bake"`` and ``"render"`` to the compiled kernel objects.
    """
    device = Device(0)
    device.set_current()
    _check_compute_capability(device)
    stream = device.create_stream()

    # Short key used by the rest of the script -> kernel symbol in the source.
    kernel_symbols = {"bake": "bake_density", "render": "render_clouds"}

    # C++ is required so the templated tex3D / surf3Dwrite overloads resolve;
    # extern "C" on the kernel symbols keeps their names unmangled even though
    # the rest of the translation unit is compiled as C++.
    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
    program = Program(KERNEL_SOURCE, code_type="c++", options=options)
    module = program.compile("cubin", name_expressions=tuple(kernel_symbols.values()))

    kernels = {key: module.get_kernel(symbol) for key, symbol in kernel_symbols.items()}
    return device, stream, kernels
+ """ + return CUDAArray.from_descriptor( + shape=(VOLUME_SIZE, VOLUME_SIZE, VOLUME_SIZE), + format=ArrayFormat.FLOAT32, + num_channels=1, + is_surface_load_store=True, + ) + + +def make_volume_texture(arr): + """Bind `arr` as a TextureObject configured for LINEAR + WRAP + normalized. + + WRAP (not CLAMP) is the right choice here: the render kernel scrolls the + sample coordinate by a time uniform to animate the clouds, and WRAP avoids + clamping (smearing) the edge texels as the coordinate drifts past [0, 1]. + The baked field is not perfectly tileable, so a faint density seam sweeps + through slowly as the scroll wraps -- a minor demo-grade artifact, not a + crash. WRAP/MIRROR addressing modes require normalized coordinates. The + ray-vs-box bail in the raymarch is what keeps density zero outside the + [-1, 1]^3 volume, so wrapping the noise field never leaks cloud outside it. + """ + res_desc = ResourceDescriptor.from_array(arr) + tex_desc = TextureDescriptor( + address_mode=AddressMode.WRAP, + filter_mode=FilterMode.LINEAR, + read_mode=ReadMode.ELEMENT_TYPE, + normalized_coords=True, + ) + return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc) + + +def bake_volume(stream, kernels, arr): + """Run the one-shot bake kernel that fills the volume with fractal noise. + + The SurfaceObject lives only for the duration of this call; once the bake + is enqueued and the kernel has captured the bindless handle into its + arguments, we sync the stream before letting the SurfaceObject close. + The CUDAArray itself outlives this scope -- it's the long-lived backing + store for the render-loop TextureObject. 
+ """ + with SurfaceObject.from_array(arr) as bake_surf: + block = (8, 8, 8) + grid = ( + (VOLUME_SIZE + block[0] - 1) // block[0], + (VOLUME_SIZE + block[1] - 1) // block[1], + (VOLUME_SIZE + block[2] - 1) // block[2], + ) + launch( + stream, + LaunchConfig(grid=grid, block=block), + kernels["bake"], + np.uint64(bake_surf.handle), + np.int32(VOLUME_SIZE), + ) + # Synchronize before the SurfaceObject context exits so the bindless + # handle is still valid while the kernel runs. + stream.sync() + + +def create_window(): + """Open a pyglet window and return (window, gl_module, pyglet).""" + try: + import pyglet + from pyglet.gl import gl as _gl + except ImportError: + print( + "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet", + file=sys.stderr, + ) + sys.exit(1) + + window = pyglet.window.Window( + WIDTH, + HEIGHT, + caption="cuda.core 3D CUDAArray - Volumetric Cloud Ray-Marcher", + vsync=False, + ) + return window, _gl, pyglet + + +def create_display_resources(gl, width, height): + """Standard GL boilerplate: shader, fullscreen quad, empty texture. + + Not CUDA-specific; identical to the other gl_interop_* examples. + Returns (shader_program, vertex_array_id, texture_id). 
+ """ + from pyglet.graphics.shader import Shader, ShaderProgram + + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + quad_verts = np.array( + [ + # x, y, s, t (position + texture coordinate) + -1, + -1, + 0, + 0, + 1, + -1, + 1, + 0, + 1, + 1, + 1, + 1, + -1, + -1, + 0, + 0, + 1, + 1, + 1, + 1, + -1, + 1, + 0, + 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 # 4 floats * 4 bytes each + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + + gl.glBindVertexArray(0) + + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR) + gl.glTexImage2D( + gl.GL_TEXTURE_2D, + 0, + gl.GL_RGBA8, + width, + height, + 0, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + + return shader_prog, vao.value, tex.value + + +def create_pixel_buffer(gl, width, height): + """Create a Pixel Buffer Object (PBO) -- the CUDA/GL bridge. + + Returns (pbo_gl_name, size_in_bytes). 
+ """ + pbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(pbo)) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value) + nbytes = width * height * 4 # RGBA8 + gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + return pbo.value, nbytes + + +def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height): + """Copy pixel data from the PBO into the GL texture (GPU-to-GPU).""" + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glTexSubImage2D( + gl.GL_TEXTURE_2D, + 0, + 0, + 0, + width, + height, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + + +def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id): + """Draw the texture to the screen using the fullscreen quad.""" + gl.glUseProgram(shader_prog.id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glBindVertexArray(vao_id) + gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6) + gl.glBindVertexArray(0) + gl.glUseProgram(0) + + +# ================================== main() ================================== + + +def main(): + # --- Step 1: Set up CUDA (compile kernels, create stream) --- + dev, stream, kernels = setup_cuda() + + # --- Step 2: Allocate the 3D density volume and bake it once --- + # The CUDAArray is the long-lived backing store; it must outlive the + # render loop. The SurfaceObject is only needed for the one-shot bake + # and is closed before we ever bind a TextureObject to the same CUDAArray. + arr = make_volume_array() + bake_volume(stream, kernels, arr) + + # --- Step 3: Bind the volume as a trilinear TextureObject --- + # LINEAR + WRAP + normalized_coords gives free hardware trilinear + # filtering plus seamless wrapping for the animated coordinate scroll. 
+ volume_tex = make_volume_texture(arr) + + # --- Step 4: Open a window and set up the CUDA/GL bridge --- + window, gl, pyglet = create_window() + shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT) + pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT) + resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard") + + # --- Step 5: Render loop state --- + # Camera is orbit-style: yaw and pitch are angles, dist is the orbit + # radius. sun_height drives the light direction + sky glow; coverage shapes + # how much of the noise field reads as cloud. The render kernel turns these + # into rays + shading itself. + state = { + "yaw": RESET_YAW, + "pitch": RESET_PITCH, + "dist": RESET_DIST, + "sun_height": RESET_SUN_HEIGHT, + "coverage": RESET_COVERAGE, + } + start_time = time.monotonic() + frame_count = [0] + fps_time = [start_time] + last_fps = [0.0] + + block = (16, 16, 1) + grid = ( + (WIDTH + block[0] - 1) // block[0], + (HEIGHT + block[1] - 1) // block[1], + 1, + ) + config = LaunchConfig(grid=grid, block=block) + + @window.event + def on_draw(): + window.clear() + elapsed = time.monotonic() - start_time + + # (a) Map the PBO so CUDA can write into it. + with resource.map(stream=stream) as buf: + # (b) Launch the volumetric raymarch kernel. Camera + lighting + + # shaping params are passed as scalars; the kernel builds the + # orbit eye, per-pixel ray, and clouds itself. `time` scrolls + # the noise sample coordinate to animate the clouds. + launch( + stream, + config, + kernels["render"], + buf.handle, + np.int32(WIDTH), + np.int32(HEIGHT), + np.uint64(volume_tex.handle), + np.float32(state["yaw"]), + np.float32(state["pitch"]), + np.float32(state["dist"]), + np.float32(state["sun_height"]), + np.float32(state["coverage"]), + np.float32(elapsed), + ) + # (c) Unmap happens automatically; cuGraphicsUnmapResources serializes + # the CUDA work against subsequent OpenGL use. 
+ + copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT) + draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id) + + frame_count[0] += 1 + now = time.monotonic() + if now - fps_time[0] >= 0.5: + last_fps[0] = frame_count[0] / (now - fps_time[0]) + frame_count[0] = 0 + fps_time[0] = now + window.set_caption( + "cuda.core 3D CUDAArray - Volumetric Cloud Ray-Marcher " + f"yaw={state['yaw']:+.2f} pitch={state['pitch']:+.2f} " + f"dist={state['dist']:.2f} sun={state['sun_height']:.2f} " + f"cov={state['coverage']:.2f} " + f"{last_fps[0]:.0f} FPS | " + "3D CUDAArray[FLOAT32,1ch] + tex3D[LINEAR|WRAP|norm] + surf3D bake" + ) + + @window.event + def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers): + # Left-click drag orbits the camera. dx -> yaw, dy -> pitch. + if not (buttons & pyglet.window.mouse.LEFT): + return + orbit_scale = 0.005 + state["yaw"] += dx * orbit_scale + state["pitch"] += dy * orbit_scale + if state["pitch"] < PITCH_MIN: + state["pitch"] = PITCH_MIN + elif state["pitch"] > PITCH_MAX: + state["pitch"] = PITCH_MAX + + @window.event + def on_mouse_scroll(_x, _y, _scroll_x, scroll_y): + # Scroll wheel zoom: geometric so each tick feels uniform. Positive + # scroll_y (wheel up) zooms in. 
+ if scroll_y == 0: + return + state["dist"] *= 0.9**scroll_y + if state["dist"] < DIST_MIN: + state["dist"] = DIST_MIN + elif state["dist"] > DIST_MAX: + state["dist"] = DIST_MAX + + @window.event + def on_key_press(symbol, _modifiers): + key = pyglet.window.key + keyboard_orbit = 0.08 + if symbol == key.ESCAPE: + window.close() + elif symbol == key.R: + state["yaw"] = RESET_YAW + state["pitch"] = RESET_PITCH + state["dist"] = RESET_DIST + state["sun_height"] = RESET_SUN_HEIGHT + state["coverage"] = RESET_COVERAGE + elif symbol == key.LEFT: + state["yaw"] -= keyboard_orbit + elif symbol == key.RIGHT: + state["yaw"] += keyboard_orbit + elif symbol == key.UP: + state["pitch"] = min(PITCH_MAX, state["pitch"] + keyboard_orbit) + elif symbol == key.DOWN: + state["pitch"] = max(PITCH_MIN, state["pitch"] - keyboard_orbit) + elif symbol in (key.PLUS, key.EQUAL, key.NUM_ADD): + state["sun_height"] = min(SUN_HEIGHT_MAX, state["sun_height"] + 0.05) + elif symbol in (key.MINUS, key.UNDERSCORE, key.NUM_SUBTRACT): + state["sun_height"] = max(SUN_HEIGHT_MIN, state["sun_height"] - 0.05) + elif symbol == key.BRACKETLEFT: + state["coverage"] = max(COVERAGE_MIN, state["coverage"] - 0.03) + elif symbol == key.BRACKETRIGHT: + state["coverage"] = min(COVERAGE_MAX, state["coverage"] + 0.03) + + @window.event + def on_close(): + # Release CUDA resources in reverse construction order. The GL objects + # clean up via pyglet on window close. + resource.close() + volume_tex.close() + arr.close() + stream.close() + + pyglet.app.run(interval=0) + + +# ======================== GPU code (CUDA + GLSL) ============================ +# +# Two CUDA C++ kernels are concatenated into one program string so they share +# a single NVRTC compile. NOTE: with no GPU available at authoring time, the +# noise/raymarch math below is unverified at runtime -- it is kept deliberately +# conservative (integer-hash value noise, plain fbm, no STL / host-only calls) +# so it compiles cleanly under NVRTC c++17. 
+# +# bake_density -- one thread per voxel. Evaluates fractal Brownian motion +# (fbm) of a cheap integer-hash value noise and writes the +# raw scalar via surf3Dwrite. NOTE: surf3Dwrite's +# x coordinate is in BYTES; a FLOAT32 element is 4 bytes, so +# multiply by sizeof(float). y and z are in elements +# -- a classic CUDA gotcha. +# +# render_clouds -- one thread per screen pixel. Builds the orbit-camera ray, +# intersects the [-1, 1]^3 box, marches front-to-back +# sampling density via tex3D (LINEAR + WRAP + +# normalized coords, coordinate scrolled by `time`), applies +# a coverage threshold + vertical height falloff, does a +# short sun-ward shadow march per sample (Beer-Lambert), +# accumulates color + transmittance, composites over an +# analytic sky, and writes RGBA8 into the PBO. +# +# GLSL shaders at the very bottom just draw a textured quad. Nothing CUDA- +# specific there. +# +# ============================================================================ + +KERNEL_SOURCE = r""" +// -------------------------------------------------------------------------- +// Small inline helpers. +// -------------------------------------------------------------------------- +__device__ __forceinline__ float clampf(float v, float a, float b) { + return fminf(fmaxf(v, a), b); +} + +__device__ __forceinline__ float dot3(float ax, float ay, float az, + float bx, float by, float bz) { + return ax * bx + ay * by + az * bz; +} + +__device__ __forceinline__ float length3(float x, float y, float z) { + return sqrtf(x * x + y * y + z * z); +} + +__device__ __forceinline__ float lerpf(float a, float b, float t) { + return a + (b - a) * t; +} + +__device__ __forceinline__ float smoothstepf(float t) { + // Hermite fade curve used both for noise interpolation and shaping. + return t * t * (3.0f - 2.0f * t); +} + +// -------------------------------------------------------------------------- +// Cheap integer-hash value noise + fractal Brownian motion (fbm). 
+// +// hash3() turns an integer lattice point into a pseudo-random float in [0,1]. +// value_noise() trilinearly interpolates the 8 lattice corners around a +// floating-point position with a smoothstep fade. fbm() sums several octaves +// of value_noise at doubling frequency / halving amplitude. All integer math, +// no tables, no host-only calls -- NVRTC-friendly. +// -------------------------------------------------------------------------- +__device__ __forceinline__ float hash3(int ix, int iy, int iz) { + unsigned int h = (unsigned int)ix * 374761393u + + (unsigned int)iy * 668265263u + + (unsigned int)iz * 2147483647u; + h = (h ^ (h >> 13)) * 1274126177u; + h = h ^ (h >> 16); + return (float)(h & 0x00ffffffu) / (float)0x01000000u; // [0, 1) +} + +__device__ __forceinline__ float value_noise(float x, float y, float z) { + float fx = floorf(x), fy = floorf(y), fz = floorf(z); + int ix = (int)fx, iy = (int)fy, iz = (int)fz; + float tx = smoothstepf(x - fx); + float ty = smoothstepf(y - fy); + float tz = smoothstepf(z - fz); + + float c000 = hash3(ix, iy, iz); + float c100 = hash3(ix + 1, iy, iz); + float c010 = hash3(ix, iy + 1, iz); + float c110 = hash3(ix + 1, iy + 1, iz); + float c001 = hash3(ix, iy, iz + 1); + float c101 = hash3(ix + 1, iy, iz + 1); + float c011 = hash3(ix, iy + 1, iz + 1); + float c111 = hash3(ix + 1, iy + 1, iz + 1); + + float x00 = lerpf(c000, c100, tx); + float x10 = lerpf(c010, c110, tx); + float x01 = lerpf(c001, c101, tx); + float x11 = lerpf(c011, c111, tx); + float y0 = lerpf(x00, x10, ty); + float y1 = lerpf(x01, x11, ty); + return lerpf(y0, y1, tz); +} + +__device__ __forceinline__ float fbm(float x, float y, float z) { + float sum = 0.0f; + float amp = 0.5f; + float freq = 1.0f; + #pragma unroll + for (int o = 0; o < 5; ++o) { + sum += amp * value_noise(x * freq, y * freq, z * freq); + freq *= 2.0f; + amp *= 0.5f; + } + return sum; // roughly in [0, 1) +} + +// 
-------------------------------------------------------------------------- +// bake_density: one thread per voxel writes raw fbm into the volume via a +// SurfaceObject. The cloud SHAPING (coverage threshold + height +// falloff) is applied later in render_clouds so the threshold and +// fade stay fixed while the render kernel scrolls the coordinate +// for animation. +// +// surf is bound to a (size^3, FLOAT32 x 1) CUDAArray allocated with +// is_surface_load_store=True. +// surf3Dwrite's x coordinate is in BYTES; a FLOAT32 element is 4 bytes, so +// multiply x by sizeof(float). y and z are in elements -- a classic CUDA +// gotcha. +// -------------------------------------------------------------------------- +extern "C" __global__ +void bake_density(cudaSurfaceObject_t surf, int size) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int z = blockIdx.z * blockDim.z + threadIdx.z; + if (x >= size || y >= size || z >= size) return; + + // Voxel-center position mapped into a few noise cells so fbm has structure + // across the volume. ~4 base cells across the volume gives puffy blobs. + const float NOISE_SCALE = 4.0f; + float fx = ((float)x + 0.5f) / (float)size; + float fy = ((float)y + 0.5f) / (float)size; + float fz = ((float)z + 0.5f) / (float)size; + + float n = fbm(fx * NOISE_SCALE, fy * NOISE_SCALE, fz * NOISE_SCALE); + + // FLOAT32 store: surf3Dwrite's x offset is in BYTES (x * sizeof(float)). + surf3Dwrite(n, surf, x * (int)sizeof(float), y, z); +} + +// -------------------------------------------------------------------------- +// Density sampler: tex3D wants normalized coords in [0, 1]; the volume covers +// [-1, 1] in world space, so we remap with (p + 1) * 0.5 and add a time-based +// scroll (WRAP addressing wraps it without edge clamping). 
The raw fbm is then shaped into +// a cloud density with: +// - a coverage threshold (higher `coverage` -> lower threshold -> more cloud) +// - a vertical height falloff that fades density near the top and bottom of +// the box so clouds float in a slab rather than filling the whole cube. +// Returns density >= 0 (0 = clear air). +// -------------------------------------------------------------------------- +__device__ __forceinline__ float sample_density(cudaTextureObject_t tex, + float px, float py, float pz, + float coverage, float t) { + // Slow horizontal drift + gentle vertical bob for evolving clouds. + float u = (px + 1.0f) * 0.5f + t * 0.015f; + float v = (py + 1.0f) * 0.5f + t * 0.004f; + float w = (pz + 1.0f) * 0.5f + t * 0.010f; + float n = tex3D(tex, u, v, w); + + // Coverage threshold: subtract a threshold and rescale so values below it + // become clear air. coverage in [0,1] maps to threshold in [~0.8, ~0.15]. + float threshold = lerpf(0.80f, 0.15f, coverage); + float d = (n - threshold) / fmaxf(1.0f - threshold, 1e-3f); + d = clampf(d, 0.0f, 1.0f); + + // Vertical height falloff: py in [-1, 1]. Fade to zero near the top/bottom + // so clouds form a horizontal band. Peak density around py ~ -0.1. + float h = clampf((py + 1.0f) * 0.5f, 0.0f, 1.0f); // [0,1] bottom->top + float falloff = smoothstepf(clampf(h * 4.0f, 0.0f, 1.0f)) * + smoothstepf(clampf((1.0f - h) * 2.5f, 0.0f, 1.0f)); + + return d * falloff; +} + +// -------------------------------------------------------------------------- +// render_clouds: one thread per screen pixel. Volumetric ray march of the +// density volume as participating media. +// +// Camera math (orbit, look-at origin, world-up (0, 1, 0)) matches the SDF +// example. Per pixel: +// 1. Build the ray, intersect the [-1, 1]^3 AABB (slab method). +// 2. March front-to-back from the entry point. 
At each step sample density; +// if positive, do a SHORT secondary march toward the sun to estimate how +// much light reaches this sample (Beer-Lambert: exp(-sum*absorption)). +// 3. Accumulate color and transmittance front-to-back. Early-out when +// transmittance < 0.01 (rest of the ray is occluded -> big speedup). +// 4. Composite the accumulated cloud color over an analytic sky gradient +// (horizon-to-zenith blue + a sun glow), tonemap, write RGBA8. +// -------------------------------------------------------------------------- +extern "C" __global__ +void render_clouds(unsigned char* output, + int width, + int height, + cudaTextureObject_t tex, + float yaw, + float pitch, + float dist, + float sun_height, + float coverage, + float t) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + // ---- Build the orbit camera basis ---------------------------------- + float cp = cosf(pitch), sp = sinf(pitch); + float cyw = cosf(yaw), syw = sinf(yaw); + + float ex = dist * cp * cyw; + float ey = dist * sp; + float ez = dist * cp * syw; + + float fl = length3(ex, ey, ez); + if (fl < 1e-6f) fl = 1e-6f; + float fx = -ex / fl, fy = -ey / fl, fz = -ez / fl; + + // right = normalize(cross(fwd, world_up)), world_up = (0, 1, 0). + float rx = -fz; + float ry = 0.0f; + float rz = fx; + float rl = length3(rx, ry, rz); + if (rl < 1e-6f) rl = 1e-6f; + rx /= rl; ry /= rl; rz /= rl; + + // up' = cross(right, fwd). 
+ float ux = ry * fz - rz * fy; + float uy = rz * fx - rx * fz; + float uz = rx * fy - ry * fx; + + // ---- Per-pixel ray direction --------------------------------------- + float u_ndc = 2.0f * ((float)x + 0.5f) / (float)width - 1.0f; + float v_ndc = 2.0f * ((float)y + 0.5f) / (float)height - 1.0f; + + const float TAN_HALF = 0.41421356237309515f; // tanf(45deg / 2) + float aspect = (float)width / (float)height; + + float dx = fx + u_ndc * aspect * TAN_HALF * rx + v_ndc * TAN_HALF * ux; + float dy = fy + u_ndc * aspect * TAN_HALF * ry + v_ndc * TAN_HALF * uy; + float dz = fz + u_ndc * aspect * TAN_HALF * rz + v_ndc * TAN_HALF * uz; + float dl = length3(dx, dy, dz); + if (dl < 1e-6f) dl = 1e-6f; + dx /= dl; dy /= dl; dz /= dl; + + // ---- Sun direction from sun_height --------------------------------- + // sun_height in [0,1]: 0 -> near horizon, 1 -> overhead. Keep a fixed + // azimuth so the light feels stable while orbiting. + float sun_el = sun_height * 1.4707963f; // up to ~84 degrees + float se = sinf(sun_el), ce = cosf(sun_el); + const float SUN_AZ = 0.7853981633974483f; // 45 deg azimuth + float lx = ce * cosf(SUN_AZ); + float ly = se; + float lz = ce * sinf(SUN_AZ); + float ll = length3(lx, ly, lz); + if (ll < 1e-6f) ll = 1e-6f; + lx /= ll; ly /= ll; lz /= ll; + + // ---- Ray vs. the [-1, 1]^3 box (slab method) ----------------------- + float inv_dx = 1.0f / (fabsf(dx) > 1e-8f ? dx : (dx >= 0 ? 1e-8f : -1e-8f)); + float inv_dy = 1.0f / (fabsf(dy) > 1e-8f ? dy : (dy >= 0 ? 1e-8f : -1e-8f)); + float inv_dz = 1.0f / (fabsf(dz) > 1e-8f ? dz : (dz >= 0 ? 
1e-8f : -1e-8f)); + float t1x = (-1.0f - ex) * inv_dx, t2x = ( 1.0f - ex) * inv_dx; + float t1y = (-1.0f - ey) * inv_dy, t2y = ( 1.0f - ey) * inv_dy; + float t1z = (-1.0f - ez) * inv_dz, t2z = ( 1.0f - ez) * inv_dz; + float tNear = fmaxf(fmaxf(fminf(t1x, t2x), fminf(t1y, t2y)), fminf(t1z, t2z)); + float tFar = fminf(fminf(fmaxf(t1x, t2x), fmaxf(t1y, t2y)), fmaxf(t1z, t2z)); + + // Accumulators: front-to-back compositing. transmittance starts at 1 + // (fully clear); accumulated radiance starts at 0. + float trans = 1.0f; + float acc_r = 0.0f, acc_g = 0.0f, acc_b = 0.0f; + + // Cloud material + lighting constants. + const float ABSORPTION = 6.0f; // primary extinction per unit density + const float SUN_ABSORP = 8.0f; // shadow-ray extinction per unit density + const float STEP_LEN = 2.0f / 96.0f; // ~one voxel at 96^3 + const int PRIMARY_STEPS = 96; + const int SHADOW_STEPS = 6; + const float SHADOW_STEP_LEN = 0.06f; + + // Henyey-Greenstein forward-scattering phase function. g>0 biases scatter + // toward the light direction, producing the bright "silver lining" rim when + // the view ray points toward the sun. cos(theta) = dot(view_dir, sun_dir); + // both are unit length here. phase = (1-g^2) / (4pi * (1+g^2-2g*cos)^1.5). + // The constant 1/(4pi) factor is folded into the lighting scale below, so + // we only keep the angular shape that drives the glow. 
+ const float HG_G = 0.6f; + float cos_vl = dot3(dx, dy, dz, lx, ly, lz); + float hg_denom = 1.0f + HG_G * HG_G - 2.0f * HG_G * cos_vl; + float hg_phase = (1.0f - HG_G * HG_G) / (hg_denom * sqrtf(fmaxf(hg_denom, 1e-4f))); + + if (tFar > fmaxf(tNear, 0.0f)) { + float tcur = fmaxf(tNear, 0.0f) + 1e-4f; + + #pragma unroll 1 + for (int i = 0; i < PRIMARY_STEPS; ++i) { + if (tcur > tFar) break; + + float pxw = ex + tcur * dx; + float pyw = ey + tcur * dy; + float pzw = ez + tcur * dz; + + float density = sample_density(tex, pxw, pyw, pzw, coverage, t); + + if (density > 1e-3f) { + // ---- Secondary march toward the sun for self-shadowing ---- + float shadow_sum = 0.0f; + #pragma unroll + for (int s = 1; s <= SHADOW_STEPS; ++s) { + float st = (float)s * SHADOW_STEP_LEN; + float sxw = pxw + lx * st; + float syw = pyw + ly * st; + float szw = pzw + lz * st; + // Stop sampling outside the box (no density there anyway). + if (fabsf(sxw) > 1.0f || fabsf(syw) > 1.0f || fabsf(szw) > 1.0f) { + break; + } + shadow_sum += sample_density(tex, sxw, syw, szw, coverage, t); + } + float sun_trans = expf(-shadow_sum * SUN_ABSORP * SHADOW_STEP_LEN); + + // Powder ("dark edge") term: thin cloud edges scatter less light + // back than a naive 1-exp model predicts, so darken low-density + // samples for fluffier, more rounded volumes. Saturates toward 1 + // in dense cloud (cores stay bright); only thin edges are dimmed. + // Apply as a gentle modulation so cores keep full sunlight. + float powder = 0.4f + 0.6f * (1.0f - expf(-density * 3.0f)); + + // Beer-Lambert extinction for this slab of the primary ray. + float slab_trans = expf(-density * ABSORPTION * STEP_LEN); + float absorbed = trans * (1.0f - slab_trans); + + // Direct sunlight reaching this sample, shaped by the HG phase so + // it spikes when looking toward the sun (silver lining). Add a + // small ambient floor so shadowed cores stay bluish, not black. 
+ float sun_light = sun_trans * (0.4f + 1.6f * hg_phase) * powder; + float lit = clampf(0.15f + sun_light, 0.0f, 1.6f); + float cr = lerpf(0.42f, 1.05f, clampf(lit, 0.0f, 1.0f)) + 0.05f * fmaxf(lit - 1.0f, 0.0f); + float cg = lerpf(0.48f, 0.99f, clampf(lit, 0.0f, 1.0f)) + 0.04f * fmaxf(lit - 1.0f, 0.0f); + float cb = lerpf(0.62f, 0.92f, clampf(lit, 0.0f, 1.0f)); + + acc_r += absorbed * cr; + acc_g += absorbed * cg; + acc_b += absorbed * cb; + trans *= slab_trans; + + if (trans < 0.01f) break; // remaining ray fully occluded + } + + tcur += STEP_LEN; + } + } + + // ---- Analytic sky behind / through the clouds ---------------------- + // Vertical gradient from a pale horizon to a deeper zenith blue, plus a + // soft sun glow where the ray direction aligns with the sun. + float up_amt = clampf(0.5f * (dy + 1.0f), 0.0f, 1.0f); + float sky_r = lerpf(0.70f, 0.18f, up_amt); + float sky_g = lerpf(0.80f, 0.34f, up_amt); + float sky_b = lerpf(0.92f, 0.62f, up_amt); + + // Sun glow + a crisp sun disk. The broad glow uses a moderate power; the + // disk is a high-power lobe that reads as a bright, slightly warm sun. + float sun_dot = clampf(dot3(dx, dy, dz, lx, ly, lz), 0.0f, 1.0f); + float glow = powf(sun_dot, 64.0f); + float disk = powf(sun_dot, 2048.0f); + sky_r += glow * 0.8f + disk * 6.0f; + sky_g += glow * 0.7f + disk * 5.4f; + sky_b += glow * 0.5f + disk * 3.6f; + + // Composite: accumulated cloud radiance over the sky weighted by the + // remaining transmittance. + float r = acc_r + trans * sky_r; + float g = acc_g + trans * sky_g; + float b = acc_b + trans * sky_b; + + // Simple Reinhard tonemap to keep the sun glow from blowing out. + r = r / (1.0f + r); + g = g / (1.0f + g); + b = b / (1.0f + b); + // Mild gamma for a punchier image. 
+ r = powf(clampf(r, 0.0f, 1.0f), 0.85f); + g = powf(clampf(g, 0.0f, 1.0f), 0.85f); + b = powf(clampf(b, 0.0f, 1.0f), 0.85f); + + int idx = (y * width + x) * 4; + output[idx + 0] = (unsigned char)(r * 255.0f); + output[idx + 1] = (unsigned char)(g * 255.0f); + output[idx + 2] = (unsigned char)(b * 255.0f); + output[idx + 3] = 255; +} +""" + +# GLSL shaders -- these just display a texture on a fullscreen rectangle. +# Nothing CUDA-specific here. + +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_fire.py b/cuda_core/examples/gl_interop_fire.py new file mode 100644 index 00000000000..ad9008757eb --- /dev/null +++ b/cuda_core/examples/gl_interop_fire.py @@ -0,0 +1,819 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject +# in combination with GraphicsResource for CUDA/OpenGL interop: a classic +# "Doom-style" procedural fire effect. A scalar heat field lives on a +# ping-ponged float CUDA CUDAArray; each frame the field is advected upward with a +# horizontal jitter and a small decay, then colorized through a 1D fire-palette +# TextureObject straight into an OpenGL PBO. Requires pyglet. 
+# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to combine a 2D float CUDAArray (the heat field) and a 1D RGBA8 CUDAArray (the +# color palette) under the same texture/surface API. +# - How to ping-pong a scalar field via CUDAArray + SurfaceObject writes and +# TextureObject reads, similar to the reaction-diffusion example but with a +# single channel. +# - How to use TextureObject(NORMALIZED_FLOAT) on a UINT8 palette so a +# tex1D lookup returns RGBA in [0, 1] -- no manual unpacking needed. +# - How to wire mouse / keyboard events into a CUDA simulation without +# blocking the event loop. +# +# How it works +# ============ +# The heat field is a WIDTH x HEIGHT scalar in [0, 1]. Each frame we: +# +# 1. step kernel: for every pixel, +# - if y is near the bottom AND ambient injection is on, write random +# high heat ("the embers"); +# - if the mouse button is held, paint a hot disc near the cursor; +# - otherwise read a horizontally-jittered sample from the row "below" +# (i.e. one texel toward the bottom of the screen) and subtract a +# small decay. This is what creates the upward-flickering motion. +# 2. colorize kernel: per pixel, sample the heat, look it up in a 1D RGBA8 +# fire palette via tex1D, and write RGBA bytes into the PBO. +# +# PING-PONG (two single-channel float Arrays) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +-------------+ tex2D +-------------+ +# | heat_a | ----------------> | | +# | (FLOAT32 x1)| | step_fire | +# +-------------+ | kernel | +# | | +# +-------------+ surf2Dwrite | | +# | heat_b | <---------------- | | +# | (FLOAT32 x1)| +-------------+ +# +-------------+ +# (swap) +# +# Orientation +# ----------- +# OpenGL displays texel row 0 at the bottom of the window. 
The fullscreen quad +# in create_display_resources() flips t so that kernel y=0 lands at the TOP of +# the screen -- this lets the kernel keep the intuitive "inject at y = h-1, +# advect from y+1 -> y" convention while the visible flames rise upward. +# Mouse coordinates from pyglet (y=0 at window bottom) are flipped to the +# kernel's y-down convention on entry. +# +# surf2Dwrite x-in-bytes +# ---------------------- +# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. For a +# float surface that means `x * sizeof(float)` = `x * 4`. Getting this wrong +# silently corrupts every other column. +# +# What you should see +# =================== +# A flickering wall of doom-style fire rising from the bottom of the window. +# Hold the mouse button and drag to paint a torch of heat at the cursor. +# Press SPACE to toggle the ambient embers along the bottom row (the fire +# will die out when ambient is OFF). Press R to clear the heat field. +# Press Escape or close the window to exit. The window title shows FPS and +# whether ambient injection is currently on. +# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + ArrayFormat, + CUDAArray, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Simulation parameters (feel free to change these) +# --------------------------------------------------------------------------- +# Window dimensions (what the user sees). +WINDOW_WIDTH = 640 +WINDOW_HEIGHT = 480 + +# Simulation dimensions (the heat-field grid). 
Doom's actual screen was +# 320x200; we use 320x100 so the canonical decay rate of ~1 intensity unit +# per row (random {0, 1, 2}, average 1) produces flames that reach ~36% of +# the screen height -- the recognizable "tall licking flames" look. +# NEAREST-filtered upscale to the 640x480 window stretches vertically 4.8x, +# giving the chunky retro pixel-doubled appearance. +WIDTH = 320 +HEIGHT = 100 + +# Canonical Doom fire palette: 37 hand-tuned colors (intensity 0..36 -> RGB). +# Source: https://github.com/tiagomenegaz/doom-fire (and Fabien Sanglard's +# analysis of the original PSX Doom fire effect). +PALETTE_SIZE = 37 +MAX_INTENSITY = 36 +TORCH_RADIUS = 12 # pixel radius of the mouse-painted hot disc (sim space) + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA and OpenGL. If you're here to learn about +# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting +# part is there. These helpers exist so that main() reads like a short story +# instead of a wall of boilerplate. +# ============================================================================ + + +def setup_cuda(): + """Compile the CUDA kernels and return (device, stream, kernels, configs).""" + dev = Device(0) + dev.set_current() + + # SurfaceObject requires surface load/store, which has existed since SM 2.0, + # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+. + cc = dev.compute_capability + if cc.major < 3: + print( + "This example requires a GPU with compute capability >= 3.0 for " + f"bindless surface objects. Found sm_{cc.major}{cc.minor}.", + file=sys.stderr, + ) + sys.exit(1) + + stream = dev.create_stream() + + # Compile as C++ so the templated tex1D / tex2D overloads + # resolve. 
+ program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options) + mod = prog.compile( + "cubin", + name_expressions=("step_fire", "colorize_fire"), + ) + + kernels = { + "step": mod.get_kernel("step_fire"), + "colorize": mod.get_kernel("colorize_fire"), + } + + block = (16, 16, 1) + grid = ( + (WIDTH + block[0] - 1) // block[0], + (HEIGHT + block[1] - 1) // block[1], + 1, + ) + config = LaunchConfig(grid=grid, block=block) + # Both kernels are pixel-parallel over a WIDTH x HEIGHT grid. + configs = {"step": config, "colorize": config} + + return dev, stream, kernels, configs + + +def create_window(): + """Open a pyglet window and return (window, gl_module, pyglet).""" + try: + import pyglet + from pyglet.gl import gl as _gl + except ImportError: + print( + "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet", + file=sys.stderr, + ) + sys.exit(1) + + window = pyglet.window.Window( + WINDOW_WIDTH, + WINDOW_HEIGHT, + caption="cuda.core CUDAArray/Texture/Surface - Doom Fire", + vsync=False, + ) + return window, _gl, pyglet + + +def create_display_resources(gl, width, height): + """Create the GL objects needed to show a texture on screen. + + Standard OpenGL boilerplate for a textured fullscreen quad. The texcoord + `t` is flipped versus the plasma example so that kernel y=0 lands at the + TOP of the screen. That lets the fire kernel keep the intuitive + "inject at the largest y, advect upward" convention while the visible + flames rise toward the top. + + Returns (shader_program, vertex_array_id, texture_id). + """ + from pyglet.graphics.shader import Shader, ShaderProgram + + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + # Fullscreen quad (two triangles covering the entire window). 
Note the + # flipped t coordinates compared to gl_interop_plasma: (-1, -1) gets t=1 + # so screen-bottom samples the kernel's largest-y row. + quad_verts = np.array( + [ + # x, y, s, t (position + texture coordinate) + -1, + -1, + 0, + 1, + 1, + -1, + 1, + 1, + 1, + 1, + 1, + 0, + -1, + -1, + 0, + 1, + 1, + 1, + 1, + 0, + -1, + 1, + 0, + 0, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 # 4 floats * 4 bytes each = 16 bytes per vertex + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + + gl.glBindVertexArray(0) + + # Empty texture (filled each frame from the PBO). + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + # NEAREST upscale: makes the low-res simulation render with crisp, + # blocky pixels instead of bilinear-blended mush. Critical to the + # Doom-fire look. 
+ gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_NEAREST) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_NEAREST) + gl.glTexImage2D( + gl.GL_TEXTURE_2D, + 0, + gl.GL_RGBA8, + width, + height, + 0, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + + return shader_prog, vao.value, tex.value + + +def create_pixel_buffer(gl, width, height): + """Create a Pixel Buffer Object (PBO) -- the bridge between CUDA and OpenGL. + + Returns (pbo_gl_name, size_in_bytes). + """ + pbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(pbo)) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value) + nbytes = width * height * 4 # RGBA, 1 byte per channel + gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + return pbo.value, nbytes + + +def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height): + """Copy pixel data from the PBO into the GL texture (GPU-to-GPU).""" + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glTexSubImage2D( + gl.GL_TEXTURE_2D, + 0, + 0, + 0, + width, + height, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, # None = read from the currently bound PBO, not from CPU + ) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + + +def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id): + """Draw the texture to the screen using the fullscreen quad.""" + gl.glUseProgram(shader_prog.id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glBindVertexArray(vao_id) + gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6) + gl.glBindVertexArray(0) + gl.glUseProgram(0) + + +def make_heat_arrays(): + """Allocate two single-channel UINT8 ping-pong Arrays for the heat field. + + Intensity is an integer in [0, 36] indexing the canonical Doom palette. + UINT8 is exactly one byte per texel -- surf2Dwrite x-coord = x * 1. 
+ """ + arr_a = CUDAArray.from_descriptor( + shape=(WIDTH, HEIGHT), + format=ArrayFormat.UINT8, + num_channels=1, + is_surface_load_store=True, + ) + arr_b = CUDAArray.from_descriptor( + shape=(WIDTH, HEIGHT), + format=ArrayFormat.UINT8, + num_channels=1, + is_surface_load_store=True, + ) + return arr_a, arr_b + + +def make_heat_texture(arr): + """Bind `arr` as a TextureObject configured for POINT + CLAMP reads. + + POINT filtering is what gives Doom fire its chunky retro look. LINEAR + smooths the per-frame horizontal jitter into a uniform glow that + doesn't read as fire. + """ + res_desc = ResourceDescriptor.from_array(arr) + tex_desc = TextureDescriptor( + address_mode=AddressMode.CLAMP, + filter_mode=FilterMode.POINT, + read_mode=ReadMode.ELEMENT_TYPE, + # Non-normalized: the step kernel addresses texels in pixel space. + normalized_coords=False, + ) + return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc) + + +def build_fire_palette(): + """Return the canonical Doom fire palette as a (37, 4) uint8 array. + + The 37 entries map intensity 0 (black) -> 36 (white). Each entry is + indexed by the integer intensity in the heat field. + + Source: Fabien Sanglard's PSX Doom analysis, reproduced in + https://github.com/tiagomenegaz/doom-fire. 
+ """ + rgb = [ + (7, 7, 7), + (31, 7, 7), + (47, 15, 7), + (71, 15, 7), + (87, 23, 7), + (103, 31, 7), + (119, 31, 7), + (143, 39, 7), + (159, 47, 7), + (175, 63, 7), + (191, 71, 7), + (199, 71, 7), + (223, 79, 7), + (223, 87, 7), + (223, 87, 7), + (215, 95, 7), + (215, 95, 7), + (215, 103, 15), + (207, 111, 15), + (207, 119, 15), + (207, 127, 15), + (207, 135, 23), + (199, 135, 23), + (199, 143, 23), + (199, 151, 31), + (191, 159, 31), + (191, 159, 31), + (191, 167, 39), + (191, 167, 39), + (191, 175, 47), + (183, 175, 47), + (183, 183, 47), + (183, 183, 55), + (207, 207, 111), + (223, 223, 159), + (239, 239, 199), + (255, 255, 255), + ] + # Index 0 (the "no fire" color) is rendered as pure black so dead pixels + # don't glow. The canonical (7, 7, 7) reads as a dim background which is + # less dramatic against the dark window. + rgb[0] = (0, 0, 0) + assert len(rgb) == PALETTE_SIZE + rgba = np.empty((PALETTE_SIZE, 4), dtype=np.uint8) + rgba[:, :3] = np.array(rgb, dtype=np.uint8) + rgba[:, 3] = 255 + return rgba + + +def make_palette_array_and_texture(stream): + """Allocate the 1D RGBA8 palette CUDAArray, upload, and bind as a texture. + + Returns (palette_array, palette_texture). Both must be closed by the + caller (or used inside `with` blocks). + """ + palette = build_fire_palette() # shape (PALETTE_SIZE, 4), uint8 + arr = CUDAArray.from_descriptor( + shape=(PALETTE_SIZE,), + format=ArrayFormat.UINT8, + num_channels=4, + ) + # 1D CUDAArray bytes match a flat (PALETTE_SIZE * 4) uint8 buffer. + arr.copy_from(np.ascontiguousarray(palette), stream=stream) + + res_desc = ResourceDescriptor.from_array(arr) + tex_desc = TextureDescriptor( + address_mode=AddressMode.CLAMP, + # POINT keeps the palette stops as discrete color bands -- the + # classic Doom fire palette is indexed, not gradient-blended. 
+ filter_mode=FilterMode.POINT, + # NORMALIZED_FLOAT: tex1D returns each UINT8 channel as a + # float in [0, 1], so the colorize kernel can multiply by 255 and + # store directly without manual unpacking. + read_mode=ReadMode.NORMALIZED_FLOAT, + # Normalized: the colorize kernel samples at the texel center + # (i + 0.5) / PALETTE_SIZE for intensity i, so POINT filtering + # returns exactly palette entry i (no blending between entries). + normalized_coords=True, + ) + tex = TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc) + return arr, tex + + +# ================================== main() ================================== + + +def main(): + # --- Step 1: Set up CUDA (compile kernels, create stream) --- + dev, stream, kernels, configs = setup_cuda() + + # --- Step 2: Open a window --- + window, gl, pyglet = create_window() + + # --- Step 3: Create GL resources for drawing a texture to screen --- + # (Standard OpenGL boilerplate -- not CUDA-specific.) + shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT) + + # --- Step 4: Create the Pixel Buffer Object (PBO) --- + pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT) + + # --- Step 5: Register the PBO with CUDA --- + resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard") + + # --- Step 6: Allocate heat-field Arrays, palette CUDAArray, and the four + # bindless handles (textures + surfaces). We hold them open + # for the lifetime of the window and release in on_close(), + # matching the reaction-diffusion example. (Using `with` + # blocks here would close everything before the pyglet event + # loop has a chance to use them.) + arr_a, arr_b = make_heat_arrays() + palette_arr, palette_tex = make_palette_array_and_texture(stream) + tex_a = make_heat_texture(arr_a) + tex_b = make_heat_texture(arr_b) + surf_a = SurfaceObject.from_array(arr_a) + surf_b = SurfaceObject.from_array(arr_b) + + # No reliance on the arrays' initial contents: clear_field() below zeroes + # both heat arrays and seeds the bottom row before the first frame. 
+ state = { + "current": "a", # which array holds the latest heat field + "frame_index": 0, # passed into the step kernel as `t` + "ambient": True, # SPACE toggles bottom-row injection + "mouse_down": False, + "mouse_x": 0, + "mouse_y": 0, + } + + def current_read_write(): + if state["current"] == "a": + return tex_a, surf_b, "b" # read a, write b, next current = b + return tex_b, surf_a, "a" + + def clear_field(): + """Zero both heat arrays and seed the bottom row at full intensity. + + CUDAArray.copy_from is the simplest reset path -- a dedicated clear + kernel would be faster but is unnecessary for an interactive demo. + The bottom row is set to MAX_INTENSITY so the very first frame + already has a fire source to advect from. + """ + seed = np.zeros((HEIGHT, WIDTH), dtype=np.uint8) + seed[HEIGHT - 1, :] = MAX_INTENSITY # canonical Doom fire source + arr_a.copy_from(np.ascontiguousarray(seed), stream=stream) + arr_b.copy_from(np.ascontiguousarray(seed), stream=stream) + state["current"] = "a" + + # Seed at startup so frame 1 already has a source row. + clear_field() + stream.sync() + + # --- Step 7: Render loop --- + start_time = time.monotonic() + frame_count = 0 + fps_time = start_time + + @window.event + def on_key_press(symbol, _modifiers): + key = pyglet.window.key + if symbol == key.ESCAPE: + window.close() + return + if symbol == key.SPACE: + state["ambient"] = not state["ambient"] + return + if symbol == key.R: + clear_field() + return + + # Map window coords (WINDOW_WIDTH x WINDOW_HEIGHT, y=0 at bottom) to + # simulation coords (WIDTH x HEIGHT, y=0 at top). 
+ def _window_to_sim(x, y): + sx = int(x * WIDTH / WINDOW_WIDTH) + sy = int((WINDOW_HEIGHT - 1 - y) * HEIGHT / WINDOW_HEIGHT) + return sx, sy + + @window.event + def on_mouse_press(x, y, _button, _modifiers): + state["mouse_down"] = True + state["mouse_x"], state["mouse_y"] = _window_to_sim(x, y) + + @window.event + def on_mouse_release(_x, _y, _button, _modifiers): + state["mouse_down"] = False + + @window.event + def on_mouse_drag(x, y, _dx, _dy, _buttons, _modifiers): + state["mouse_down"] = True + state["mouse_x"], state["mouse_y"] = _window_to_sim(x, y) + + @window.event + def on_draw(): + nonlocal frame_count, fps_time + + window.clear() + + # (a) Advance the heat field by one step. + tex_read, surf_write, next_current = current_read_write() + launch( + stream, + configs["step"], + kernels["step"], + np.uint64(tex_read.handle), + np.uint64(surf_write.handle), + np.int32(WIDTH), + np.int32(HEIGHT), + np.uint32(state["frame_index"]), + np.int32(state["mouse_x"]), + np.int32(state["mouse_y"]), + np.int32(1 if state["mouse_down"] else 0), + np.int32(1 if state["ambient"] else 0), + ) + state["current"] = next_current + state["frame_index"] += 1 + + # (b) Colorize the latest state into the OpenGL PBO. + tex_heat = tex_a if state["current"] == "a" else tex_b + with resource.map(stream=stream) as buf: + launch( + stream, + configs["colorize"], + kernels["colorize"], + np.uint64(tex_heat.handle), + np.uint64(palette_tex.handle), + buf.handle, + np.int32(WIDTH), + np.int32(HEIGHT), + ) + # Unmap happens automatically when the `with` block exits. + + # (c) Tell OpenGL to copy the PBO contents into our texture. + copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT) + + # (d) Draw the texture to the screen. 
+ draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id) + + # FPS counter (shown in window title) + frame_count += 1 + now = time.monotonic() + if now - fps_time >= 1.0: + fps = frame_count / (now - fps_time) + ambient_label = "on" if state["ambient"] else "off" + window.set_caption( + "cuda.core CUDAArray/Texture/Surface - Doom Fire" + f" ({WIDTH}x{HEIGHT}, {fps:.0f} FPS," + f" ambient {ambient_label})" + ) + frame_count = 0 + fps_time = now + + @window.event + def on_close(): + # Release everything we opened, in reverse order. Each of these is a + # context manager too, but pyglet owns the event loop here so we + # release explicitly to be deterministic about ordering. + resource.close() + tex_a.close() + tex_b.close() + surf_a.close() + surf_b.close() + palette_tex.close() + palette_arr.close() + arr_a.close() + arr_b.close() + stream.close() + + pyglet.app.run(interval=0) + + +# ======================== GPU code (CUDA + GLSL) ============================ +# +# These source strings are kept at the bottom of the file so they don't +# distract from the Python logic above. The important things to know: +# +# - KERNEL_SOURCE contains two CUDA C++ kernels: +# * step_fire -- advances the heat field. Reads previous state via a +# TextureObject (POINT + CLAMP, non-normalized) and +# writes the next state via a SurfaceObject. Bakes +# the bottom-row injection, mouse torch, and upward +# jittered advection into a single pass. +# * colorize_fire -- per pixel: read heat from the heat TextureObject, +# look up the fire palette via tex1D, write +# RGBA bytes to the OpenGL PBO. +# +# - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a +# texture onto a rectangle covering the entire window. The quad's t +# coordinate is flipped versus the plasma example so that y=0 maps to the +# top of the screen (see create_display_resources for why). 
#
# ============================================================================

# CUDA C++ source for the two Doom-fire kernels (step_fire, colorize_fire).
# Both are declared extern "C" so they can be looked up by their plain names.
#
# NOTE: the texture-object fetch functions take their RETURN type as an
# explicit template argument (tex2D<unsigned char>, tex1D<float4>); it cannot
# be deduced from the call arguments, so omitting it is a compile error.
KERNEL_SOURCE = r"""
// Small, deterministic, GPU-friendly hash.  Returns a value in [0, 1).
// step_fire uses it to pick the per-pixel decay / horizontal shift that gives
// the fire its characteristic flicker.
__device__ __forceinline__ float hash3(unsigned int x, unsigned int y,
                                       unsigned int t) {
    unsigned int h = x * 374761393u + y * 668265263u + t * 2246822519u;
    h = (h ^ (h >> 13)) * 1274126177u;
    h ^= (h >> 16);
    return (float)(h & 0x00ffffffu) / (float)0x01000000u;
}

// Canonical Doom-fire step (gather form of the original scatter algorithm).
//
// Reference scatter (one cell per JS source row):
//     decay = random in {0, 1, 2}
//     below = state[x, y+1]
//     new   = max(0, below - decay)
//     state[x - decay, y] = new   // writes LEFT of source -> leftward lean
//
// Equivalent gather (one CUDA thread per destination cell):
//     decay = hash(x, y, t) in {0, 1, 2}
//     below = state[x + decay, y+1]   // reads from the right-shifted source
//     new   = max(0, below - decay)
//     state[x, y] = new
//
// The right-shifted gather reads the same data the leftward-shifted scatter
// would have produced.

extern "C"
__global__
void step_fire(cudaTextureObject_t tex_read,
               cudaSurfaceObject_t surf_write,
               int width, int height,
               unsigned int t,
               int mouse_x, int mouse_y, int mouse_active,
               int ambient_on) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    const int MAX_I = 36;

    // 1) Mouse torch: a hot disc painted at the cursor (overrides everything).
    if (mouse_active) {
        int dx = x - mouse_x;
        int dy = y - mouse_y;
        if (dx * dx + dy * dy <= 12 * 12) {  // matches host TORCH_RADIUS
            surf2Dwrite((unsigned char)MAX_I, surf_write, x, y);
            return;
        }
    }

    // 2) Bottom row is the steady fire source.  Hardcoded to MAX_I when the
    //    ambient ember bed is on; zero otherwise (lets the fire die down).
    if (y == height - 1) {
        surf2Dwrite((unsigned char)(ambient_on ? MAX_I : 0),
                    surf_write, x, y);
        return;
    }

    // 3) Gather from the row below with random {0, 1, 2} horizontal shift
    //    and matching intensity decay -- the canonical Doom-fire update.
    float jitter_h = hash3((unsigned int)x, (unsigned int)y, t);
    int decay = (int)(jitter_h * 3.0f);  // 0, 1, or 2
    int src_x = x + decay;
    if (src_x >= width) src_x = width - 1;
    // Explicit <unsigned char>: the fetch's return type is not deducible.
    unsigned char below = tex2D<unsigned char>(tex_read,
                                               (float)src_x + 0.5f,
                                               (float)y + 1.5f);
    int new_i = (int)below - decay;
    if (new_i < 0) new_i = 0;

    // UINT8 is 1 byte, so surf2Dwrite's x argument is already the byte offset.
    surf2Dwrite((unsigned char)new_i, surf_write, x, y);
}

extern "C"
__global__
void colorize_fire(cudaTextureObject_t tex_heat,
                   cudaTextureObject_t palette_tex,
                   unsigned char* output,
                   int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    // Heat texture is UINT8 + ELEMENT_TYPE: tex2D<unsigned char> returns the
    // raw intensity byte (0..36).
    unsigned char h = tex2D<unsigned char>(tex_heat,
                                           (float)x + 0.5f,
                                           (float)y + 0.5f);

    // Palette texture is 1D normalized RGBA8 with POINT filtering and 37
    // entries.  Index i lands at coord (i + 0.5) / 37 -- the texel center,
    // which POINT samples exactly.
    const float palette_size = 37.0f;
    float u = ((float)h + 0.5f) / palette_size;
    float4 c = tex1D<float4>(palette_tex, u);

    // Round to nearest instead of truncating: (v / 255) * 255 can land just
    // below v in float, and truncation would then darken the channel by one.
    int idx = (y * width + x) * 4;
    output[idx + 0] = (unsigned char)(c.x * 255.0f + 0.5f);
    output[idx + 1] = (unsigned char)(c.y * 255.0f + 0.5f);
    output[idx + 2] = (unsigned char)(c.z * 255.0f + 0.5f);
    output[idx + 3] = 255;
}
"""

# GLSL shaders -- these just display a texture on a fullscreen rectangle.
# Nothing CUDA-specific here.
+ +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_fluid.py b/cuda_core/examples/gl_interop_fluid.py new file mode 100644 index 00000000000..1423580fcdb --- /dev/null +++ b/cuda_core/examples/gl_interop_fluid.py @@ -0,0 +1,1251 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject +# in combination with GraphicsResource for CUDA/OpenGL interop. It runs a +# real-time Stable Fluids (Jos Stam) smoke/ink solver entirely on the GPU: +# velocity, pressure, and dye fields live in ping-ponged CUDA arrays, are read +# through TextureObjects with free hardware bilinear filtering (the heart of +# semi-Lagrangian advection), and written back through SurfaceObjects. The dye +# is colorized straight into an OpenGL PBO. Drag the mouse to inject swirling +# ink. Requires pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# - How semi-Lagrangian advection uses tex2D LINEAR sampling: trace each cell +# backward along the velocity field and read the old quantity with free +# hardware bilinear interpolation (no manual lerp, no neighbor gather). 
+# - How to drive several distinct kernels (advect, divergence, Jacobi pressure +# solve, gradient subtraction, dye advect, colorize) over a shared set of +# pre-created TextureObject/SurfaceObject handles, ping-ponging multiple +# fields without recreating handles per frame. +# - How to fold live mouse input into a GPU simulation: capture the mouse delta +# and splat velocity + dye into the field via a SurfaceObject (in-place +# read-modify-write, one thread per cell -> no race). +# +# How it works +# ============ +# Stam's "Stable Fluids" solves the incompressible Navier-Stokes equations on a +# regular grid by splitting each step into stages that are each individually +# stable: +# +# 1. ADVECT VELOCITY - move the velocity field along itself. For each cell we +# back-trace its center one timestep against the local velocity and read +# the old velocity there with tex2D LINEAR (bilinear). This is the +# unconditionally-stable semi-Lagrangian scheme. +# 2. SPLAT (input) - add the mouse-drag velocity and a dab of dye in a soft +# radial brush around the cursor (in-place on the velocity/dye surfaces). +# 3. DIVERGENCE - compute div(velocity), the amount each cell is a +# source/sink. An incompressible fluid must have zero divergence. +# 4. PRESSURE SOLVE - Jacobi-iterate the Poisson equation lap(p) = div, +# ping-ponging two pressure buffers for ~30 iterations. +# 5. SUBTRACT GRADIENT- v <- v - grad(p). This projects the velocity onto its +# divergence-free part, enforcing incompressibility. +# 6. ADVECT DYE - move the ink along the (now divergence-free) velocity, +# again with tex2D LINEAR back-tracing. +# 7. COLORIZE - map dye density through a vivid gradient into the PBO. 
+# +# PING-PONG (read one array, write the other, then swap) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +-----------+ tex2D LINEAR +-------------+ surf2Dwrite +-----------+ +# | vel_a | -----------------------> | advect / | --------------> | vel_b | +# | (vx, vy) | | jacobi / | | (vx, vy) | +# +-----------+ | advect_dye | +-----------+ +# ^ +-------------+ | +# +-------------------------------- (swap) ------------------------------+ +# +# Why LINEAR + CLAMP + normalized coords? +# --------------------------------------- +# Semi-Lagrangian advection traces a cell's center back to an arbitrary +# fractional position and needs the interpolated field value there. LINEAR +# filtering gives that bilinear interpolation for free in hardware. We use a +# bounded box (CLAMP) rather than a torus so ink piles up against the walls +# instead of wrapping. CLAMP, like all addressing modes, behaves cleanly with +# normalized coordinates, and we sample at texel centers `(i + 0.5) / N` so a +# zero-velocity cell reads back exactly its own value. +# +# Channel byte width in surf2Dwrite +# --------------------------------- +# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. Velocity is a +# `float2` (8 bytes) so its x offset is `x * sizeof(float2)`; pressure and +# divergence are `float` (4 bytes, `x * sizeof(float)`); the dye is a `float4` +# RGBA color (16 bytes, `x * sizeof(float4)`). Getting this wrong silently +# corrupts every other column. +# +# What you should see +# =================== +# Big blobs of saturated color are dropped into the fluid every fraction of a +# second and immediately billow, swirl, and mix into turbulent ribbons that +# fill the window -- "ink dropped in water." Drag the mouse to paint your own +# rainbow ink. Press R to clear, Escape to exit. The window title shows the +# current FPS, pressure-iteration count, and live texture/surface config. 
+# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import colorsys +import ctypes +import math +import random +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + ArrayFormat, + CUDAArray, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Simulation parameters (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 512 +HEIGHT = 512 +DT = 1.0 # simulation timestep +PRESSURE_ITERS = 30 # Jacobi iterations for the pressure solve per frame +VELOCITY_DISSIPATION = 0.999 # per-step velocity decay (1.0 = no decay) +DYE_DISSIPATION = 0.994 # per-step dye decay; ink lingers and builds, then fades +SPLAT_RADIUS = 24.0 # brush radius in cells for mouse injection +SPLAT_FORCE = 6.0 # how strongly a mouse delta becomes velocity +SPLAT_DYE = 1.0 # mouse ink intensity (color * this is deposited) +CURL_SEED = 2.5 # strength of the ambient curl seeded on reset +# Vorticity confinement pushes velocity back toward regions of high |curl|, +# sharpening the swirls that numerical diffusion would otherwise smear out. +# This is the single extra kernel that turns soft blobs into crisp curling +# plumes. Tunable: ~0.1-0.3 reads well at DT=1.0; higher gets turbulent. +VORTICITY = 0.28 # confinement strength (0.0 disables it) + +# Auto-bursts keep the simulation alive and colorful without any input: when +# the mouse is idle we periodically drop a big blob of a random bright color +# with a random velocity impulse at a random spot -- the classic "ink dropped +# in water" look that quickly fills the frame with billowing, swirling color. +# Grab the cursor and drag to paint your own ink. 
+AUTO_EMIT = True +BURST_INTERVAL = 0.45 # seconds between automatic colored bursts +BURSTS_PER_EVENT = 2 # blobs dropped each burst event +BURST_RADIUS = 42.0 # blob radius in cells (big, soft) +BURST_FORCE = 18.0 # velocity impulse magnitude per blob +BURST_DYE = 1.2 # ink intensity per blob (random color * this) + +# This solver advances one step per displayed frame, so its per-step rates +# (dissipation, advection distance) would otherwise depend on the frame rate -- +# on a fast GPU the dye would dissipate away almost instantly between bursts. +# We make it frame-rate INDEPENDENT instead: every frame, the real elapsed time +# is expressed in units of a REF_FPS reference step and the dissipation and +# advection distance are scaled by it, so the ink evolves at the same wall-clock +# rate (and looks the same) whether the loop runs at 60 or 2000 FPS. Running +# faster just means more, smaller, smoother substeps. +REF_FPS = 60.0 + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA and OpenGL. If you're here to learn about +# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting +# part is there. These helpers exist so that main() reads like a short story +# instead of a wall of boilerplate. +# ============================================================================ + + +def setup_cuda(): + """Compile the CUDA kernels and return (device, stream, kernels, configs). + + Returns a dict of kernels keyed by name and a shared LaunchConfig (every + kernel is pixel-parallel over the same WIDTH x HEIGHT grid). + """ + dev = Device(0) + dev.set_current() + + # SurfaceObject requires surface load/store, which has existed since SM 2.0, + # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+. + cc = dev.compute_capability + if cc.major < 3: + print( + "This example requires a GPU with compute capability >= 3.0 for " + f"bindless surface objects. 
Found sm_{cc.major}{cc.minor}.", + file=sys.stderr, + ) + sys.exit(1) + + stream = dev.create_stream() + + # Compile as C++ so the templated tex2D overload resolves. + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options) + mod = prog.compile( + "cubin", + name_expressions=( + "seed_field", + "splat", + "advect_velocity", + "vorticity_confinement", + "divergence", + "pressure_jacobi", + "subtract_gradient", + "advect_dye", + "colorize", + ), + ) + + kernels = { + "seed": mod.get_kernel("seed_field"), + "splat": mod.get_kernel("splat"), + "advect_vel": mod.get_kernel("advect_velocity"), + "vorticity": mod.get_kernel("vorticity_confinement"), + "divergence": mod.get_kernel("divergence"), + "jacobi": mod.get_kernel("pressure_jacobi"), + "subtract": mod.get_kernel("subtract_gradient"), + "advect_dye": mod.get_kernel("advect_dye"), + "colorize": mod.get_kernel("colorize"), + } + + block = (16, 16, 1) + grid = ( + (WIDTH + block[0] - 1) // block[0], + (HEIGHT + block[1] - 1) // block[1], + 1, + ) + config = LaunchConfig(grid=grid, block=block) + + return dev, stream, kernels, config + + +def create_window(): + """Open a pyglet window and return (window, gl_module, pyglet).""" + try: + import pyglet + from pyglet.gl import gl as _gl + except ImportError: + print( + "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet", + file=sys.stderr, + ) + sys.exit(1) + + window = pyglet.window.Window( + WIDTH, + HEIGHT, + caption="cuda.core CUDAArray/Texture/Surface - Stable Fluids", + vsync=False, + ) + return window, _gl, pyglet + + +def create_display_resources(gl, width, height): + """Create the GL objects needed to show a texture on screen. + + This sets up a shader program, a fullscreen quad, and an empty texture. + None of this is CUDA-specific -- it's standard OpenGL boilerplate for + rendering a textured quad. 
+ + Returns (shader_program, vertex_array_id, texture_id). The shader_program + is a pyglet ShaderProgram object (must be kept alive). + """ + from pyglet.graphics.shader import Shader, ShaderProgram + + # Shader program -- just passes texture coordinates through + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + # Fullscreen quad (two triangles covering the entire window) + quad_verts = np.array( + [ + # x, y, s, t (position + texture coordinate) + -1, + -1, + 0, + 0, + 1, + -1, + 1, + 0, + 1, + 1, + 1, + 1, + -1, + -1, + 0, + 0, + 1, + 1, + 1, + 1, + -1, + 1, + 0, + 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 # 4 floats * 4 bytes each = 16 bytes per vertex + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + + gl.glBindVertexArray(0) + + # Empty texture (will be filled each frame from the PBO) + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR) + gl.glTexImage2D( + gl.GL_TEXTURE_2D, + 0, + gl.GL_RGBA8, + width, + height, + 0, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + 
def create_pixel_buffer(gl, width, height):
    """Allocate the Pixel Buffer Object (PBO) shared between CUDA and OpenGL.

    OpenGL sources texture uploads from this buffer; once the same buffer is
    registered with CUDA, a kernel can write pixels straight into it.

    Returns (pbo_gl_name, size_in_bytes).
    """
    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
    buffer_name = ctypes.c_uint(0)
    gl.glGenBuffers(1, ctypes.byref(buffer_name))
    gl.glBindBuffer(unpack_target, buffer_name.value)
    # RGBA at one byte per channel -> 4 bytes per pixel.
    size_in_bytes = width * height * 4
    gl.glBufferData(unpack_target, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
    gl.glBindBuffer(unpack_target, 0)
    return buffer_name.value, size_in_bytes


def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
    """Upload the PBO contents into the GL texture `tex_id`."""
    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
    texture_target = gl.GL_TEXTURE_2D
    gl.glBindBuffer(unpack_target, pbo_id)
    gl.glBindTexture(texture_target, tex_id)
    # With a PBO bound to the unpack target, a None data pointer makes GL
    # source the pixels from that buffer rather than from host memory.
    gl.glTexSubImage2D(
        texture_target,
        0,
        0,
        0,
        width,
        height,
        gl.GL_RGBA,
        gl.GL_UNSIGNED_BYTE,
        None,
    )
    gl.glBindBuffer(unpack_target, 0)


def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
    """Render texture `tex_id` over the whole window with the quad in `vao_id`."""
    gl.glUseProgram(shader_prog.id)
    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
    gl.glBindVertexArray(vao_id)
    # The quad is two triangles, i.e. six vertices.
    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)
    # Leave no VAO / program bound for whoever draws next.
    gl.glBindVertexArray(0)
    gl.glUseProgram(0)
-> allocates a CUDA *array* (opaque, tiled +# layout optimized for 2D texture fetches), +# not linear device memory. +# ArrayFormat.FLOAT32 -> each channel is a 32-bit float texel. +# num_channels=2 / 1 / 4 -> float2 (vx, vy), scalar (pressure / +# divergence), or float4 RGBA (dye); also +# fixes the surf2Dwrite byte offset per element. +# is_surface_load_store=True -> the SAME array can be bound both as a +# TextureObject (cached, filtered READS) +# and as a SurfaceObject (raw WRITES). This +# is what lets each field be sampled and +# then written back in the ping-pong. +# +# ResourceDescriptor.from_array(arr) -> wraps the CUDAArray as the resource a +# TextureObject reads from. +# FilterMode.LINEAR -> free HARDWARE bilinear interpolation; +# this is what makes semi-Lagrangian +# advection a single tex2D fetch at a +# fractional back-traced position (no +# manual lerp, no neighbor gather). +# AddressMode.CLAMP -> bounded box boundary: out-of-range traces +# read the edge texel (ink piles up at the +# walls instead of wrapping like a torus). +# ReadMode.ELEMENT_TYPE -> return the stored float value as-is (no +# integer->[0,1] normalization of texels). +# normalized_coords=True -> sample in [0, 1) so CLAMP is well-defined +# and texel centers are (i + 0.5) / N. +# +# SurfaceObject.from_array(arr) -> binds the array for surf2Dread/surf2Dwrite. +# The x coordinate is in BYTES, so it is +# x * sizeof(elem): sizeof(float2)=8 for +# velocity, sizeof(float)=4 for the scalars, +# sizeof(float4)=16 for the RGBA dye.
def _make_field_array(num_channels):
    """Allocate a WIDTH x HEIGHT FLOAT32 CUDA array with `num_channels` channels.

    Every simulation field is created with surface load/store enabled so the
    same array can back both a TextureObject and a SurfaceObject.
    """
    return CUDAArray.from_descriptor(
        shape=(WIDTH, HEIGHT),
        format=ArrayFormat.FLOAT32,
        num_channels=num_channels,
        is_surface_load_store=True,
    )


def make_velocity_array():
    """Allocate the two-channel (`float2`) velocity array: channel 0 = vx, channel 1 = vy."""
    return _make_field_array(2)


def make_scalar_array():
    """Allocate a one-channel (`float`) array, used for pressure and divergence."""
    return _make_field_array(1)


def make_color_array():
    """Allocate the four-channel (`float4`) RGBA dye array.

    Dye stores a full color per cell rather than a density, so bursts of
    different hues can advect and blend. It is sampled and written exactly
    like the other fields; only the channel count differs, and with it the
    surf2Dwrite byte stride (sizeof(float4) = 16).
    """
    return _make_field_array(4)


def make_texture(arr):
    """Create a TextureObject over `arr`: LINEAR filter, CLAMP addressing, normalized coords.

    The same configuration is used for every texture read in this example.
    Advection relies on the bilinear interpolation, while the stencil kernels
    (divergence, Jacobi, gradient) sample at texel centers, where LINEAR
    returns the stored value.
    """
    return TextureObject.from_descriptor(
        resource=ResourceDescriptor.from_array(arr),
        texture_descriptor=TextureDescriptor(
            address_mode=AddressMode.CLAMP,
            filter_mode=FilterMode.LINEAR,
            read_mode=ReadMode.ELEMENT_TYPE,
            # With normalized coordinates the center of texel i is at
            # (i + 0.5) / N, and CLAMP addressing stays well-defined.
            normalized_coords=True,
        ),
    )


def seed_field(stream, kernels, config, vel_surf, dye_surf, prs_surf, seed_value):
    """Launch the `seed` kernel: ambient curl into velocity, zeros into pressure and dye.

    The surfaces passed in must be long-lived. `launch` is asynchronous, so a
    SurfaceObject that is closed right after `launch` returns (for example by
    leaving a `with` block) would be destroyed before the kernel has run.
    """
    seed_kernel = kernels["seed"]
    surface_handles = [np.uint64(surf.handle) for surf in (vel_surf, dye_surf, prs_surf)]
    launch(
        stream,
        config,
        seed_kernel,
        *surface_handles,
        np.int32(WIDTH),
        np.int32(HEIGHT),
        np.float32(CURL_SEED),
        np.uint32(seed_value),
    )
def main():
    """Run the interactive Stable Fluids demo until the window is closed.

    Sets up CUDA and an OpenGL window, allocates the ping-pong simulation
    fields together with their texture/surface handles, seeds the initial
    state, and installs pyglet event handlers. Each `on_draw` runs one
    simulation step (advect, splat, optional vorticity confinement,
    divergence, Jacobi pressure solve, gradient subtraction, dye advection),
    colorizes the dye into the shared PBO and draws it. `on_close` releases
    every resource after draining the stream.
    """
    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
    _dev, stream, kernels, config = setup_cuda()

    # --- Step 2: Open a window ---
    window, gl, pyglet = create_window()

    # --- Step 3: Create GL resources for drawing a texture to screen ---
    # (Standard OpenGL boilerplate -- not CUDA-specific.)
    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)

    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
    # The PBO is GPU memory owned by OpenGL.  It's the bridge between the
    # two worlds: CUDA writes into it, OpenGL reads from it.
    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)

    # --- Step 5: Register the PBO with CUDA ---
    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")

    # --- Step 6: Allocate the simulation fields ---
    # velocity (float2) and dye (float4 RGBA) ping-pong; pressure (float)
    # ping-pongs across Jacobi iterations; divergence (float) is a single
    # scratch target written once per frame.
    vel_a = make_velocity_array()
    vel_b = make_velocity_array()
    prs_a = make_scalar_array()
    prs_b = make_scalar_array()
    div = make_scalar_array()
    dye_a = make_color_array()
    dye_b = make_color_array()

    # --- Step 7: Pre-create every bindless handle ONCE ---
    # Creating texture/surface objects is comparatively expensive, and they
    # must outlive the async launches that reference them, so we build them
    # up front and keep them alive for the whole run.
    # make_texture binds an array as a read-only TextureObject (LINEAR +
    # CLAMP + normalized), while SurfaceObject.from_array binds the SAME
    # array for raw surf2Dwrite writes -- the read/write halves of one
    # ping-pong buffer.
    vel_tex_a = make_texture(vel_a)
    vel_tex_b = make_texture(vel_b)
    vel_surf_a = SurfaceObject.from_array(vel_a)
    vel_surf_b = SurfaceObject.from_array(vel_b)

    prs_tex_a = make_texture(prs_a)
    prs_tex_b = make_texture(prs_b)
    prs_surf_a = SurfaceObject.from_array(prs_a)
    prs_surf_b = SurfaceObject.from_array(prs_b)

    div_tex = make_texture(div)
    div_surf = SurfaceObject.from_array(div)

    dye_tex_a = make_texture(dye_a)
    dye_tex_b = make_texture(dye_b)
    dye_surf_a = SurfaceObject.from_array(dye_a)
    dye_surf_b = SurfaceObject.from_array(dye_b)

    # --- Step 8: Seed the initial field (curl into vel_a, zero pressure/dye) ---
    seed_field(stream, kernels, config, vel_surf_a, dye_surf_a, prs_surf_a, seed_value=0)
    stream.sync()

    # `vel` / `dye` track which ping-pong array currently holds the live state.
    state = {"vel": "a", "dye": "a", "seed": 0, "next_burst": 0.0}

    # Mouse state shared with the event handlers. Coordinates are in SIMULATION
    # space (y = 0 at top); the framebuffer has y = 0 at the bottom, so we flip.
    mouse = {"down": False, "x": 0.0, "y": 0.0, "dx": 0.0, "dy": 0.0}

    def vel_pair():
        # Read live velocity, write the other buffer; returns (read_tex, write_surf, next).
        if state["vel"] == "a":
            return vel_tex_a, vel_surf_b, "b"
        return vel_tex_b, vel_surf_a, "a"

    def vel_live_tex():
        return vel_tex_a if state["vel"] == "a" else vel_tex_b

    def vel_live_surf():
        return vel_surf_a if state["vel"] == "a" else vel_surf_b

    def dye_pair():
        # Same contract as vel_pair(), for the dye ping-pong buffers.
        if state["dye"] == "a":
            return dye_tex_a, dye_surf_b, "b"
        return dye_tex_b, dye_surf_a, "a"

    def dye_live_tex():
        return dye_tex_a if state["dye"] == "a" else dye_tex_b

    def dye_live_surf():
        return dye_surf_a if state["dye"] == "a" else dye_surf_b

    # --- Step 9: Render loop ---
    start_time = time.monotonic()
    frame_count = 0
    fps_time = start_time
    clock = {"last": start_time}  # wall-clock time of the previous frame

    def _window_to_sim(x, y):
        # Window: y = 0 at bottom.  Simulation: y = 0 at top.  Flip vertically.
        sx = float(x)
        sy = float(HEIGHT - 1 - y)
        return sx, sy

    @window.event
    def on_key_press(symbol, _modifiers):
        key = pyglet.window.key
        if symbol == key.ESCAPE:
            window.close()
            return
        if symbol == key.R:
            # Re-seed into the "a" buffers and make them the live ones again.
            state["seed"] += 1
            seed_field(
                stream,
                kernels,
                config,
                vel_surf_a,
                dye_surf_a,
                prs_surf_a,
                seed_value=state["seed"],
            )
            state["vel"] = "a"
            state["dye"] = "a"
            return

    @window.event
    def on_mouse_press(x, y, _button, _modifiers):
        mouse["down"] = True
        mouse["x"], mouse["y"] = _window_to_sim(x, y)
        mouse["dx"] = 0.0
        mouse["dy"] = 0.0

    @window.event
    def on_mouse_release(_x, _y, _button, _modifiers):
        mouse["down"] = False
        mouse["dx"] = 0.0
        mouse["dy"] = 0.0

    @window.event
    def on_mouse_drag(x, y, dx, dy, _buttons, _modifiers):
        # The mouse delta IS the injected velocity. Framebuffer dy is up-positive
        # while simulation y is down-positive, so the sim-space delta is -dy.
        mouse["down"] = True
        mouse["x"], mouse["y"] = _window_to_sim(x, y)
        mouse["dx"] = float(dx)
        mouse["dy"] = float(-dy)

    @window.event
    def on_draw():
        nonlocal frame_count, fps_time

        window.clear()
        now_t = time.monotonic()
        elapsed = now_t - start_time

        # Frame-rate independence: express this frame's real duration in units of
        # a REF_FPS reference step. `step` scales the advection distance, and the
        # per-step dissipations are raised to `step` so their per-SECOND rate is
        # constant no matter how fast the loop runs. Clamp to absorb the first
        # frame and any hitch without launching a giant (unstable-looking) step.
        dt_real = now_t - clock["last"]
        clock["last"] = now_t
        step = min(max(dt_real * REF_FPS, 0.0), 3.0)
        dt_adv = DT * step
        vel_diss = VELOCITY_DISSIPATION**step
        dye_diss = DYE_DISSIPATION**step

        # (a) Advect velocity along itself (semi-Lagrangian, tex2D LINEAR).
        vel_read, vel_write, vel_next = vel_pair()
        launch(
            stream,
            config,
            kernels["advect_vel"],
            np.uint64(vel_read.handle),
            np.uint64(vel_write.handle),
            np.int32(WIDTH),
            np.int32(HEIGHT),
            np.float32(dt_adv),
            np.float32(vel_diss),
        )
        state["vel"] = vel_next

        # (b) Splat mouse-drag velocity and colored dye into the live fields.
        #     The injected color cycles through hues over time so dragging
        #     paints a rainbow ribbon of ink.
        inject = 1 if mouse["down"] else 0
        mr, mg, mb = colorsys.hsv_to_rgb((elapsed * 0.15) % 1.0, 0.85, 1.0)
        launch(
            stream,
            config,
            kernels["splat"],
            np.uint64(vel_live_surf().handle),
            np.uint64(dye_live_surf().handle),
            np.int32(WIDTH),
            np.int32(HEIGHT),
            np.float32(mouse["x"]),
            np.float32(mouse["y"]),
            np.float32(mouse["dx"] * SPLAT_FORCE),
            np.float32(mouse["dy"] * SPLAT_FORCE),
            np.float32(SPLAT_RADIUS),
            np.float32(mr * SPLAT_DYE),
            np.float32(mg * SPLAT_DYE),
            np.float32(mb * SPLAT_DYE),
            np.int32(inject),
        )

        # (b2) When the user is not dragging, periodically drop big blobs of a
        #      random bright color with a random velocity impulse at random
        #      spots -- the classic "ink in water" look. Reuses the same `splat`
        #      kernel as the mouse, just with a color argument.
        if AUTO_EMIT and not mouse["down"] and elapsed >= state["next_burst"]:
            state["next_burst"] = elapsed + BURST_INTERVAL
            for _ in range(BURSTS_PER_EVENT):
                bx = random.uniform(0.12, 0.88) * WIDTH
                by = random.uniform(0.12, 0.88) * HEIGHT
                ang = random.uniform(0.0, 2.0 * math.pi)
                bfx = math.cos(ang) * BURST_FORCE
                bfy = math.sin(ang) * BURST_FORCE
                br, bg, bb = colorsys.hsv_to_rgb(random.random(), 0.9, 1.0)
                launch(
                    stream,
                    config,
                    kernels["splat"],
                    np.uint64(vel_live_surf().handle),
                    np.uint64(dye_live_surf().handle),
                    np.int32(WIDTH),
                    np.int32(HEIGHT),
                    np.float32(bx),
                    np.float32(by),
                    np.float32(bfx),
                    np.float32(bfy),
                    np.float32(BURST_RADIUS),
                    np.float32(br * BURST_DYE),
                    np.float32(bg * BURST_DYE),
                    np.float32(bb * BURST_DYE),
                    np.int32(1),
                )

        # (b3) Vorticity confinement: read the live velocity through its
        #      TextureObject, compute curl + grad|curl|, and add a force that
        #      pushes velocity back toward high-vorticity regions -- this is the
        #      one extra kernel that sharpens the curling plumes. Like
        #      advect_velocity, it reads neighbor velocities, so it MUST
        #      ping-pong (read old buffer, write the other) -- aliasing a
        #      texture read with a surface write of the same array in one launch
        #      is undefined.
        if VORTICITY > 0.0:
            vort_read, vort_write, vort_next = vel_pair()
            launch(
                stream,
                config,
                kernels["vorticity"],
                np.uint64(vort_read.handle),
                np.uint64(vort_write.handle),
                np.int32(WIDTH),
                np.int32(HEIGHT),
                np.float32(dt_adv),
                np.float32(VORTICITY),
            )
            state["vel"] = vort_next

        # (c) Compute divergence of the live velocity field.
        launch(
            stream,
            config,
            kernels["divergence"],
            np.uint64(vel_live_tex().handle),
            np.uint64(div_surf.handle),
            np.int32(WIDTH),
            np.int32(HEIGHT),
        )

        # (d) Pressure solve: Jacobi-iterate lap(p) = div, ping-ponging pressure.
        #     Start from a cleared pressure field (prs_a) each frame.
        launch(
            stream,
            config,
            kernels["jacobi"],
            np.uint64(prs_tex_a.handle),  # ignored on the first pass via clear flag
            np.uint64(div_tex.handle),
            np.uint64(prs_surf_b.handle),
            np.int32(WIDTH),
            np.int32(HEIGHT),
            np.int32(1),  # clear: treat the previous pressure as zero
        )
        # After the clearing pass the result lives in prs_b. Continue iterating.
        prs_cur = "b"
        for _ in range(PRESSURE_ITERS - 1):
            if prs_cur == "b":
                read_tex, write_surf, prs_cur = prs_tex_b, prs_surf_a, "a"
            else:
                read_tex, write_surf, prs_cur = prs_tex_a, prs_surf_b, "b"
            launch(
                stream,
                config,
                kernels["jacobi"],
                np.uint64(read_tex.handle),
                np.uint64(div_tex.handle),
                np.uint64(write_surf.handle),
                np.int32(WIDTH),
                np.int32(HEIGHT),
                np.int32(0),  # do not clear: read the previous pressure
            )
        # `prs_cur` now names the buffer holding the converged pressure.
        prs_final_tex = prs_tex_a if prs_cur == "a" else prs_tex_b

        # (e) Subtract pressure gradient from the live velocity (in-place).
        launch(
            stream,
            config,
            kernels["subtract"],
            np.uint64(prs_final_tex.handle),
            np.uint64(vel_live_surf().handle),
            np.int32(WIDTH),
            np.int32(HEIGHT),
        )

        # (f) Advect the dye along the (now divergence-free) velocity field.
        dye_read, dye_write, dye_next = dye_pair()
        launch(
            stream,
            config,
            kernels["advect_dye"],
            np.uint64(dye_read.handle),
            np.uint64(vel_live_tex().handle),
            np.uint64(dye_write.handle),
            np.int32(WIDTH),
            np.int32(HEIGHT),
            np.float32(dt_adv),
            np.float32(dye_diss),
        )
        state["dye"] = dye_next

        # (g) Colorize the latest dye into the OpenGL PBO.
        with resource.map(stream=stream) as buf:
            launch(
                stream,
                config,
                kernels["colorize"],
                np.uint64(dye_live_tex().handle),
                buf.handle,
                np.int32(WIDTH),
                np.int32(HEIGHT),
            )
        # Unmap happens automatically when the `with` block exits.

        # (h) Tell OpenGL to copy the PBO contents into our texture.
        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)

        # (i) Draw the texture to the screen.
        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)

        # Reset the per-frame mouse delta so a held-still cursor stops pushing.
        mouse["dx"] = 0.0
        mouse["dy"] = 0.0

        # FPS counter (shown in window title)
        frame_count += 1
        now = time.monotonic()
        if now - fps_time >= 1.0:
            fps = frame_count / (now - fps_time)
            window.set_caption(
                "cuda.core CUDAArray/Texture/Surface - Stable Fluids"
                f" ({WIDTH}x{HEIGHT}, {fps:.0f} FPS,"
                f" {PRESSURE_ITERS} pressure iters)"
                " | TextureObject[LINEAR|CLAMP|norm|float2]"
                " + SurfaceObject writes + GraphicsResource(PBO)"
            )
            frame_count = 0
            fps_time = now

    @window.event
    def on_close():
        # Launches are asynchronous: kernels queued by the last frame (or by a
        # reset) may still reference the handles below, so wait for the
        # stream to drain before destroying anything.
        stream.sync()

        # Release everything we opened, in reverse order. Each of these is a
        # context manager too, but pyglet owns the event loop here so we
        # release explicitly to be deterministic about ordering.
        resource.close()
        dye_tex_a.close()
        dye_tex_b.close()
        dye_surf_a.close()
        dye_surf_b.close()
        div_tex.close()
        div_surf.close()
        prs_tex_a.close()
        prs_tex_b.close()
        prs_surf_a.close()
        prs_surf_b.close()
        vel_tex_a.close()
        vel_tex_b.close()
        vel_surf_a.close()
        vel_surf_b.close()
        dye_a.close()
        dye_b.close()
        div.close()
        prs_a.close()
        prs_b.close()
        vel_a.close()
        vel_b.close()
        stream.close()

    # Render as fast as the GPU allows; the per-step rates are scaled by real
    # elapsed time (see REF_FPS) so the look is frame-rate independent.
    pyglet.app.run(interval=0)
# ======================== GPU code (CUDA + GLSL) ============================
#
# These source strings are kept at the bottom of the file so they don't
# distract from the Python logic above.  The important things to know:
#
# - KERNEL_SOURCE contains the nine CUDA C++ kernels of the Stable Fluids
#   pipeline.  Reads go through cudaTextureObject_t (LINEAR + CLAMP +
#   normalized coords); writes go through cudaSurfaceObject_t with the x
#   offset in BYTES.  Small helpers convert pixel coords to normalized
#   texel-center coords.  The texture-object tex2D overload cannot deduce
#   its return type, so every fetch names it explicitly (tex2D<float2>, ...).
#
# - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL.  They draw a
#   texture onto a rectangle covering the entire window.  Nothing interesting.
#
# ============================================================================

KERNEL_SOURCE = r"""
// Sample a float2 (velocity) field at pixel center (px, py) with bilinear
// filtering. CLAMP addressing keeps out-of-range traces at the border.
__device__ __forceinline__
float2 sample_vec(cudaTextureObject_t tex, float px, float py,
                  int width, int height) {
    float u = (px + 0.5f) / (float)width;
    float v = (py + 0.5f) / (float)height;
    return tex2D<float2>(tex, u, v);
}

// Sample a scalar (float) field at pixel center (px, py) with bilinear filtering.
__device__ __forceinline__
float sample_scalar(cudaTextureObject_t tex, float px, float py,
                    int width, int height) {
    float u = (px + 0.5f) / (float)width;
    float v = (py + 0.5f) / (float)height;
    return tex2D<float>(tex, u, v);
}

// Sample a float4 (RGBA dye) field at pixel center with bilinear filtering.
__device__ __forceinline__
float4 sample_color(cudaTextureObject_t tex, float px, float py,
                    int width, int height) {
    float u = (px + 0.5f) / (float)width;
    float v = (py + 0.5f) / (float)height;
    return tex2D<float4>(tex, u, v);
}

extern "C"
__global__
void seed_field(cudaSurfaceObject_t vel_surf,
                cudaSurfaceObject_t dye_surf,
                cudaSurfaceObject_t prs_surf,
                int width, int height,
                float curl, unsigned int seed) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    // Seed a gentle global rotation: velocity perpendicular to the radius from
    // the center gives a curl, so even with no mouse input there is motion.
    float cx = width * 0.5f;
    float cy = height * 0.5f;
    float rx = (x - cx) / cx;  // ~[-1, 1]
    float ry = (y - cy) / cy;
    float2 vel = make_float2(-ry * curl, rx * curl);

    // A touch of deterministic noise so successive resets look a little
    // different and to break perfect symmetry.
    unsigned int h = (unsigned int)x * 374761393u
                   + (unsigned int)y * 668265263u + seed * 2246822519u;
    h = (h ^ (h >> 13)) * 1274126177u;
    h = h ^ (h >> 16);
    float noise = ((h & 0xffffu) / 65535.0f) - 0.5f;  // [-0.5, 0.5]
    vel.x += noise * 0.2f;
    vel.y += noise * 0.2f;

    // Dye starts black; the colored bursts (or the mouse) paint the ink, so
    // there is nothing to seed here beyond clearing to zero.
    surf2Dwrite(vel, vel_surf, x * (int)sizeof(float2), y);
    surf2Dwrite(make_float4(0.0f, 0.0f, 0.0f, 0.0f), dye_surf,
                x * (int)sizeof(float4), y);
    surf2Dwrite(0.0f, prs_surf, x * (int)sizeof(float), y);
}

// Inject mouse-drag velocity and dye into a soft radial brush around the
// cursor. In-place read-modify-write: each thread owns its own cell, no race.
extern "C"
__global__
void splat(cudaSurfaceObject_t vel_surf,
           cudaSurfaceObject_t dye_surf,
           int width, int height,
           float mx, float my,
           float fx, float fy,
           float radius, float dr, float dg, float db,
           int inject) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;
    if (!inject) return;

    float dx = (float)x - mx;
    float dy = (float)y - my;
    float d2 = dx * dx + dy * dy;
    float falloff = expf(-d2 / (radius * radius));
    if (falloff < 1e-3f) return;

    float2 vel;
    surf2Dread(&vel, vel_surf, x * (int)sizeof(float2), y);
    vel.x += fx * falloff;
    vel.y += fy * falloff;
    surf2Dwrite(vel, vel_surf, x * (int)sizeof(float2), y);

    // Additive colored ink. float4 surface element is 16 bytes.
    float4 dye;
    surf2Dread(&dye, dye_surf, x * (int)sizeof(float4), y);
    dye.x += dr * falloff;
    dye.y += dg * falloff;
    dye.z += db * falloff;
    dye.w = 1.0f;
    surf2Dwrite(dye, dye_surf, x * (int)sizeof(float4), y);
}

// Semi-Lagrangian advection of the velocity field along itself.
extern "C"
__global__
void advect_velocity(cudaTextureObject_t vel_tex,
                     cudaSurfaceObject_t vel_out,
                     int width, int height,
                     float dt, float dissipation) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float2 v = sample_vec(vel_tex, (float)x, (float)y, width, height);
    // Trace this cell's center backward along the velocity field.
    float px = (float)x - dt * v.x;
    float py = (float)y - dt * v.y;
    float2 advected = sample_vec(vel_tex, px, py, width, height);
    advected.x *= dissipation;
    advected.y *= dissipation;
    surf2Dwrite(advected, vel_out, x * (int)sizeof(float2), y);
}

// Vorticity confinement. Curl of a 2D velocity field is the scalar
// w = dVy/dx - dVx/dy. Where |w| has a gradient we add a force that pushes
// velocity along the swirl, reinjecting the small-scale rotation that
// numerical diffusion smears away -- the result is crisper, longer-lived
// curls. Reads neighbor velocities through the TextureObject and writes the
// updated velocity to a SEPARATE ping-pong buffer (no read/write aliasing).
__device__ __forceinline__
float curl_at(cudaTextureObject_t vel_tex, float px, float py,
              int width, int height) {
    float2 l = sample_vec(vel_tex, px - 1.0f, py, width, height);
    float2 r = sample_vec(vel_tex, px + 1.0f, py, width, height);
    float2 d = sample_vec(vel_tex, px, py - 1.0f, width, height);
    float2 u = sample_vec(vel_tex, px, py + 1.0f, width, height);
    return 0.5f * ((r.y - l.y) - (u.x - d.x));
}

extern "C"
__global__
void vorticity_confinement(cudaTextureObject_t vel_tex,
                           cudaSurfaceObject_t vel_out,
                           int width, int height,
                           float dt, float eps) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float fx = (float)x;
    float fy = (float)y;

    // Curl at this cell and at the 4 neighbors (for grad|curl|).
    float w  = curl_at(vel_tex, fx, fy, width, height);
    float wl = curl_at(vel_tex, fx - 1.0f, fy, width, height);
    float wr = curl_at(vel_tex, fx + 1.0f, fy, width, height);
    float wd = curl_at(vel_tex, fx, fy - 1.0f, width, height);
    float wu = curl_at(vel_tex, fx, fy + 1.0f, width, height);

    // Gradient of |curl|, normalized to a unit direction N.
    float gx = 0.5f * (fabsf(wr) - fabsf(wl));
    float gy = 0.5f * (fabsf(wu) - fabsf(wd));
    float len = sqrtf(gx * gx + gy * gy) + 1e-5f;
    float nx = gx / len;
    float ny = gy / len;

    // Confinement force = eps * (N x w_hat). In 2D: (N_y * w, -N_x * w).
    float2 v = sample_vec(vel_tex, fx, fy, width, height);
    v.x += eps * dt * (ny * w);
    v.y += eps * dt * (-nx * w);
    surf2Dwrite(v, vel_out, x * (int)sizeof(float2), y);
}

// Divergence of the velocity field (central differences), written as a scalar.
extern "C"
__global__
void divergence(cudaTextureObject_t vel_tex,
                cudaSurfaceObject_t div_out,
                int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float2 l = sample_vec(vel_tex, (float)x - 1.0f, (float)y, width, height);
    float2 r = sample_vec(vel_tex, (float)x + 1.0f, (float)y, width, height);
    float2 d = sample_vec(vel_tex, (float)x, (float)y - 1.0f, width, height);
    float2 u = sample_vec(vel_tex, (float)x, (float)y + 1.0f, width, height);

    float div = 0.5f * ((r.x - l.x) + (u.y - d.y));
    surf2Dwrite(div, div_out, x * (int)sizeof(float), y);
}

// One Jacobi iteration of lap(p) = div. With unit grid spacing the update is
// p = (p_left + p_right + p_down + p_up - div) / 4. When `clear` is set the
// previous pressure is treated as zero so the first pass starts clean.
extern "C"
__global__
void pressure_jacobi(cudaTextureObject_t prs_tex,
                     cudaTextureObject_t div_tex,
                     cudaSurfaceObject_t prs_out,
                     int width, int height,
                     int clear) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float pl = 0.0f, pr = 0.0f, pd = 0.0f, pu = 0.0f;
    if (!clear) {
        pl = sample_scalar(prs_tex, (float)x - 1.0f, (float)y, width, height);
        pr = sample_scalar(prs_tex, (float)x + 1.0f, (float)y, width, height);
        pd = sample_scalar(prs_tex, (float)x, (float)y - 1.0f, width, height);
        pu = sample_scalar(prs_tex, (float)x, (float)y + 1.0f, width, height);
    }
    float div = sample_scalar(div_tex, (float)x, (float)y, width, height);
    float p = (pl + pr + pd + pu - div) * 0.25f;
    surf2Dwrite(p, prs_out, x * (int)sizeof(float), y);
}

// v <- v - grad(p): project the velocity onto its divergence-free part.
extern "C"
__global__
void subtract_gradient(cudaTextureObject_t prs_tex,
                       cudaSurfaceObject_t vel_surf,
                       int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float pl = sample_scalar(prs_tex, (float)x - 1.0f, (float)y, width, height);
    float pr = sample_scalar(prs_tex, (float)x + 1.0f, (float)y, width, height);
    float pd = sample_scalar(prs_tex, (float)x, (float)y - 1.0f, width, height);
    float pu = sample_scalar(prs_tex, (float)x, (float)y + 1.0f, width, height);

    float2 v;
    surf2Dread(&v, vel_surf, x * (int)sizeof(float2), y);
    v.x -= 0.5f * (pr - pl);
    v.y -= 0.5f * (pu - pd);
    surf2Dwrite(v, vel_surf, x * (int)sizeof(float2), y);
}

// Semi-Lagrangian advection of the dye along the velocity field.
extern "C"
__global__
void advect_dye(cudaTextureObject_t dye_tex,
                cudaTextureObject_t vel_tex,
                cudaSurfaceObject_t dye_out,
                int width, int height,
                float dt, float dissipation) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float2 v = sample_vec(vel_tex, (float)x, (float)y, width, height);
    float px = (float)x - dt * v.x;
    float py = (float)y - dt * v.y;
    float4 d = sample_color(dye_tex, px, py, width, height);
    d.x *= dissipation;
    d.y *= dissipation;
    d.z *= dissipation;
    d.w *= dissipation;
    surf2Dwrite(d, dye_out, x * (int)sizeof(float4), y);
}

// Tonemap the accumulated float4 dye color into the PBO. The ink color is
// whatever the bursts/mouse injected and advection mixed; we apply a filmic
// 1 - exp(-c) curve so dense ink stays vivid without harshly clipping.
extern "C"
__global__
void colorize(cudaTextureObject_t dye_tex,
              unsigned char* output,
              int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float4 c = sample_color(dye_tex, (float)x, (float)y, width, height);
    const float gain = 1.3f;
    float r = 1.0f - expf(-fmaxf(c.x, 0.0f) * gain);
    float g = 1.0f - expf(-fmaxf(c.y, 0.0f) * gain);
    float b = 1.0f - expf(-fmaxf(c.z, 0.0f) * gain);

    int idx = (y * width + x) * 4;
    output[idx + 0] = (unsigned char)(r * 255.0f);
    output[idx + 1] = (unsigned char)(g * 255.0f);
    output[idx + 2] = (unsigned char)(b * 255.0f);
    output[idx + 3] = 255;
}
"""

# GLSL shaders -- these just display a texture on a fullscreen rectangle.
# Nothing CUDA-specific here.
+ +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_image_show.py b/cuda_core/examples/gl_interop_image_show.py new file mode 100644 index 00000000000..7678d457b10 --- /dev/null +++ b/cuda_core/examples/gl_interop_image_show.py @@ -0,0 +1,456 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# Minimal "Hello World" for the cuda.core texture/surface stack. +# +# Allocates a small `CUDAArray`, fills it with a procedural image once, binds it +# as a `TextureObject`, and uses a single CUDA kernel to sample that texture +# at every screen pixel (with a scale + rotation transform) and write the +# result into an OpenGL PBO for display. +# +# Nothing else: no `SurfaceObject`, no ping-pong, no simulation, no mipmaps. +# If you have never touched the new APIs before, open this file first. +# +# ################################################################################ +# +# What this example teaches +# ========================= +# - Allocate a `CUDAArray` and upload data into it with `CUDAArray.copy_from`. +# - Build a `TextureObject` from a `ResourceDescriptor` + `TextureDescriptor`. +# - The visual difference between `FilterMode.POINT` and `FilterMode.LINEAR` +# (press F to toggle live). +# - That filter mode is baked into the `TextureDescriptor` at creation time, +# so changing it requires destroying and rebuilding the `TextureObject`.
def make_test_image(size):
    """Build a (size, size, 4) uint8 RGBA test pattern.

    Designed so the filter-mode difference is obvious: hard-edged checkerboard
    (POINT preserves the edges; LINEAR smooths them) plus a vertical color
    gradient stripe (LINEAR blends smoothly between palette stops) plus two
    diagonal hairlines (POINT preserves them; LINEAR softens them).

    Layers are painted in this order, later ones overwriting earlier ones:
    checkerboard, gradient strip over the left third, the two red hairlines,
    and finally a fully opaque alpha channel.

    Args:
        size: Edge length of the square image in texels. Sizes below 8 are
            accepted; the checkerboard cell is then one texel wide.

    Returns:
        A numpy array of shape (size, size, 4) and dtype uint8, indexed
        [row, column, channel] with channels in R, G, B, A order.
    """
    img = np.zeros((size, size, 4), dtype=np.uint8)

    # 8x8 black/white checkerboard. Clamp the cell edge to one texel: for
    # size < 8 the integer division gives 0 and the parity computation below
    # would divide by zero.
    cells = max(1, size // 8)
    cell_index = np.arange(size) // cells
    odd_cell = ((cell_index[None, :] + cell_index[:, None]) & 1).astype(bool)
    img[odd_cell, :3] = 255

    # vertical RGB gradient strip down the left third (one ramp value per row,
    # broadcast across the strip's columns)
    strip = size // 3
    img[:, :strip, 0] = np.linspace(255, 0, size, dtype=np.uint8)[:, None]
    img[:, :strip, 1] = np.linspace(0, 255, size, dtype=np.uint8)[:, None]
    img[:, :strip, 2] = 128

    # two diagonal red hairlines: the main diagonal and a copy shifted four
    # texels to the right (only for rows where the shifted column exists)
    red = np.array([255, 0, 0, 255], dtype=np.uint8)
    diag = np.arange(size)
    img[diag, diag] = red
    shifted = diag[: max(size - 4, 0)]
    img[shifted, shifted + 4] = red

    img[:, :, 3] = 255  # opaque
    return img
Returns (window, gl_module, pyglet_module).""" + try: + import pyglet + from pyglet.gl import gl as _gl + except ImportError: + print( + "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet", + file=sys.stderr, + ) + sys.exit(1) + + window = pyglet.window.Window( + WIDTH, + HEIGHT, + caption="cuda.core CUDAArray + TextureObject - Image Show", + vsync=False, + ) + return window, _gl, pyglet + + +def create_display_resources(gl, width, height): + """Standard pyglet boilerplate: shader, fullscreen quad, screen texture.""" + from pyglet.graphics.shader import Shader, ShaderProgram + + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + quad_verts = np.array( + [ + -1, + -1, + 0, + 0, + 1, + -1, + 1, + 0, + 1, + 1, + 1, + 1, + -1, + -1, + 0, + 0, + 1, + 1, + 1, + 1, + -1, + 1, + 0, + 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + gl.glBindVertexArray(0) + + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_NEAREST) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, 
gl.GL_NEAREST) + gl.glTexImage2D( + gl.GL_TEXTURE_2D, + 0, + gl.GL_RGBA8, + width, + height, + 0, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + return shader_prog, vao.value, tex.value + + +def create_pixel_buffer(gl, width, height): + """Create the GL PBO that CUDA writes RGBA pixels into each frame.""" + pbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(pbo)) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value) + nbytes = width * height * 4 + gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + return pbo.value + + +def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height): + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glTexSubImage2D( + gl.GL_TEXTURE_2D, + 0, + 0, + 0, + width, + height, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + + +def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id): + gl.glUseProgram(shader_prog.id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glBindVertexArray(vao_id) + gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6) + gl.glBindVertexArray(0) + gl.glUseProgram(0) + + +def make_texture(arr, filter_mode): + """Build a `TextureObject` for `arr` with the given FilterMode. + + Filter mode is baked into the descriptor at creation; to switch modes + we close this object and call this helper again. + """ + res_desc = ResourceDescriptor.from_array(arr) + tex_desc = TextureDescriptor( + address_mode=AddressMode.CLAMP, + filter_mode=filter_mode, + # UINT8 source + NORMALIZED_FLOAT means tex2D returns each + # channel as a float in [0, 1] -- handy for the colorize math below. 
def main():
    """Run the interactive image viewer.

    Sets up CUDA and an OpenGL window, uploads the procedural test image into
    a `CUDAArray`, wraps it in a `TextureObject`, and then renders every frame
    by launching the sampling kernel into a mapped GL pixel buffer.

    Keys: F toggles POINT/LINEAR filtering (rebuilding the texture object),
    R toggles rotation, Esc closes the window.
    """
    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
    _dev, stream, kernel, config = setup_cuda()

    # --- Step 2: Open a window ---
    window, gl, pyglet = create_window()

    # --- Step 3: Create GL resources (shader, fullscreen quad, screen tex) ---
    shader_prog, quad_vao, screen_tex = create_display_resources(gl, WIDTH, HEIGHT)

    # --- Step 4: Create the PBO that CUDA will write into ---
    pbo_id = create_pixel_buffer(gl, WIDTH, HEIGHT)
    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")

    # --- Step 5: Allocate the source `CUDAArray` and upload the test pattern ---
    arr = CUDAArray.from_descriptor(
        shape=(IMAGE_SIZE, IMAGE_SIZE),
        format=ArrayFormat.UINT8,
        num_channels=4,
    )
    host_image = make_test_image(IMAGE_SIZE)
    arr.copy_from(np.ascontiguousarray(host_image), stream=stream)
    stream.sync()

    # --- Step 6: Bind the CUDAArray as a TextureObject (initially POINT) ---
    state = {"filter": FilterMode.POINT, "rotate": False, "angle": 0.0}
    tex = make_texture(arr, state["filter"])

    @window.event
    def on_key_press(symbol, _modifiers):
        nonlocal tex
        key = pyglet.window.key
        if symbol == key.ESCAPE:
            window.close()
        elif symbol == key.F:
            # Filter mode is baked at TextureObject creation time. Swapping
            # it means building a new object and closing the old one.
            new_filter = FilterMode.LINEAR if state["filter"] == FilterMode.POINT else FilterMode.POINT
            # Build the replacement first: if creation raises, `tex` and
            # state["filter"] still describe a live texture object and the
            # next on_draw keeps working.
            new_tex = make_texture(arr, new_filter)
            # The kernel queued by the previous on_draw may still be sampling
            # the old texture object; wait for the stream before destroying it.
            stream.sync()
            old_tex, tex = tex, new_tex
            state["filter"] = new_filter
            old_tex.close()
        elif symbol == key.R:
            state["rotate"] = not state["rotate"]

    # --- Step 7: Render loop ---
    start = time.monotonic()
    last_t = start
    frame_count = 0
    fps_time = start

    @window.event
    def on_draw():
        nonlocal frame_count, fps_time, last_t
        now = time.monotonic()
        if state["rotate"]:
            state["angle"] += (now - last_t) * 0.5  # rad/sec
        # Always advance last_t so re-enabling rotation does not jump by the
        # time spent paused.
        last_t = now

        window.clear()
        with resource.map(stream=stream) as buf:
            launch(
                stream,
                config,
                kernel,
                np.uint64(tex.handle),
                buf.handle,
                np.int32(WIDTH),
                np.int32(HEIGHT),
                np.float32(state["angle"]),
            )
        copy_pbo_to_texture(gl, pbo_id, screen_tex, WIDTH, HEIGHT)
        draw_fullscreen_quad(gl, shader_prog, quad_vao, screen_tex)

        # FPS counter, refreshed in the window caption about once per second.
        frame_count += 1
        if now - fps_time >= 1.0:
            fps = frame_count / (now - fps_time)
            window.set_caption(
                f"cuda.core CUDAArray + TextureObject - Image Show "
                f"(filter={state['filter'].name}, "
                f"rotate={'on' if state['rotate'] else 'off'}, "
                f"{fps:.0f} FPS)"
            )
            frame_count = 0
            fps_time = now

    @window.event
    def on_close():
        # Drain work still queued on the stream before destroying the handles
        # that work refers to.
        stream.sync()
        tex.close()
        arr.close()
        resource.close()
        stream.close()

    pyglet.app.run(interval=0)
+ float aspect = (float)width / (float)height; + float sx = ((float)x / (float)width - 0.5f) * 2.0f * aspect; + float sy = ((float)y / (float)height - 0.5f) * 2.0f; + + // Inverse-rotate the screen point: rotating the image by +angle means + // each output pixel reads from the source rotated by -angle. + float c = cosf(-angle), s = sinf(-angle); + float rx = c * sx - s * sy; + float ry = s * sx + c * sy; + + // Map rotated screen point to the [0, 1] x [0, 1] texture domain so the + // image (drawn centered, fitting ~75% of the window height) lands on it. + const float scale = 0.75f; + float u = (rx / (2.0f * scale)) + 0.5f; + float v = (ry / (2.0f * scale)) + 0.5f; + + // AddressMode.CLAMP means out-of-range u/v sample the edge texel. + float4 col = tex2D(tex, u, v); + + int idx = (y * width + x) * 4; + output[idx + 0] = (unsigned char)(col.x * 255.0f); + output[idx + 1] = (unsigned char)(col.y * 255.0f); + output[idx + 2] = (unsigned char)(col.z * 255.0f); + output[idx + 3] = 255; +} +""" + +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_jfa_voronoi.py b/cuda_core/examples/gl_interop_jfa_voronoi.py new file mode 100644 index 00000000000..bd9bead75f4 --- /dev/null +++ b/cuda_core/examples/gl_interop_jfa_voronoi.py @@ -0,0 +1,940 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject +# in combination with GraphicsResource for CUDA/OpenGL interop. A Voronoi diagram +# is computed every frame with the Jump Flood Algorithm (JFA): a float2 "nearest +# seed" map is ping-ponged between two CUDA arrays across log2(N) passes. Each +# pass reads the previous map through a POINT-filtered TextureObject (exact texel +# reads -- no interpolation) and writes the refined map through a SurfaceObject. +# The final nearest-seed map is colorized straight into an OpenGL PBO as neon +# Voronoi cells or glowing metaballs. Seeds drift continuously so it animates. +# Requires pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to allocate a CUDAArray with `is_surface_load_store=True` so the same +# memory can be bound as both a TextureObject (for sampled reads) and a +# SurfaceObject (for typed writes). +# - How to use FilterMode.POINT + AddressMode.BORDER + border_color + +# non-normalized coordinates to get EXACT texel reads with a clean +# "off-grid = no seed" sentinel. JFA fundamentally requires reading the +# precise value stored at an integer neighbor offset -- bilinear interpolation +# between two different seed coordinates would be meaningless. This is the +# deliberate inverse of the reaction-diffusion example's LINEAR/WRAP/normalized +# choice. +# API MAP: FilterMode.POINT -> exact texel reads (JFA needs no interpolation); +# AddressMode.BORDER + border_color -> off-grid neighbor fetches return a +# "no seed" sentinel instead of CLAMP-replicating an edge seed. 
+# - How varying the read offset (the JFA "step") each pass, combined with +# ping-pong surface writes, propagates seed information across the whole image +# in O(log N) passes instead of O(N). +# - How to compose CUDAArray/TextureObject/SurfaceObject with GraphicsResource so +# the entire pipeline never leaves the GPU. +# +# How it works +# ============ +# The Jump Flood Algorithm computes, for every pixel, the coordinate of its +# nearest seed. We store that coordinate in a `float2` map (channel 0 = seed x, +# channel 1 = seed y), using the sentinel (-1, -1) for "no seed known yet". +# +# 1. seed_clear -- fill the whole map with the sentinel. +# 2. seed_splat -- for each seed, write its own (x, y) into the cell it +# occupies. One tiny 1-thread launch per seed (seeds live +# in a host numpy array and are passed as scalar params; +# see "Why splat seeds as scalars" below). +# 3. jfa_step -- the heart of the algorithm. With the current step size s +# (s = K, K/2, ..., 1), every pixel examines itself and its +# 8 neighbors at offset +/- s. Among all non-sentinel seed +# coordinates found, it keeps the one closest to this pixel +# and writes it out. Run once per step size, ping-ponging +# the two arrays each pass. +# 4. colorize -- read the final nearest-seed map and write RGBA bytes +# into the OpenGL PBO. +# +# PING-PONG over JFA passes (two arrays, swap each pass) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +--------------+ tex2D +--------------+ +# | arr_read | ---------------> | | +# | nearest-seed | (POINT, exact | jfa_step | +# | map | texel reads at | (step s) | +# +--------------+ +/- step) | | +# | | +# +--------------+ surf2Dwrite | | +# | arr_write | <--------------- | | +# | nearest-seed | +--------------+ +# | map | +# +--------------+ +# (swap, halve step) +# +# The step schedule starts at K = next power of two >= max(W, H) / 2 and halves +# down to 1, giving floor(log2(K)) + 1 passes. 
Because we ping-pong every pass, +# the final result lands in whichever array was written last; we track that +# explicitly (see the loop in on_draw) rather than assuming it is a fixed array. +# The full JFA is re-run from scratch every frame because the seeds move. +# +# Why POINT + BORDER + border_color + non-normalized coords? +# ----------------------------------------------------------- +# JFA reads the exact seed coordinate stored at a specific integer neighbor. +# LINEAR filtering would blend two stored coordinates into a meaningless +# average, so we use FilterMode.POINT. For the addressing mode we use BORDER +# with an explicit border_color equal to the map's "no seed" sentinel +# (-1, -1). The earlier version used CLAMP, but CLAMP makes an off-edge +# neighbor lookup silently return the *edge* texel's real seed coordinate; that +# can make a border pixel pick a seed that is not actually its nearest one. +# BORDER instead returns the sentinel for any out-of-range fetch, which the +# kernel ignores -- the correct "there is no neighbor here" answer. (WRAP and +# MIRROR are the only address modes that require normalized coordinates; BORDER +# and CLAMP work with non-normalized coords, so we keep the integer-style +# sampling.) With non-normalized coordinates a texel at integer (nx, ny) is read +# at `tex2D(tex, nx + 0.5f, ny + 0.5f)` -- the +0.5 lands on the texel +# center. This is intentionally the opposite of the LINEAR/WRAP/normalized +# choice used by the reaction-diffusion example. +# +# Why splat seeds as scalars (no device buffer)? +# ---------------------------------------------- +# Seeds live in a host numpy array and drift via sin/cos on the CPU each frame. +# Rather than allocating a device buffer, we pass each seed's position to a tiny +# 1-thread `seed_splat` kernel as float scalars. With only tens of seeds this is +# a handful of trivial launches per frame. 
def jfa_steps(width, height):
    """Return the JFA step schedule: K, K/2, ..., 1.

    K is the smallest power of two >= max(width, height) / 2. The number of
    passes is log2(K) + 1.

    The steps sum to 2K - 1, so K >= longest / 2 is what lets seed
    information travel the full `longest - 1` texel extent of the image.

    Args:
        width: Image width in texels.
        height: Image height in texels.

    Returns:
        A list of step sizes in descending order, always ending in 1 (and
        never empty, even for a 1x1 image).
    """
    longest = max(width, height)
    # Compare 2 * step against the full extent instead of step against
    # `longest // 2`: the integer division rounds odd sizes down and would
    # pick a K that is too small (e.g. longest=5 -> K=2, total reach 3 < 4).
    step = 1
    while step * 2 < longest:
        step *= 2
    steps = []
    while step >= 1:
        steps.append(step)
        step //= 2
    return steps
seed_splat is a single 1-thread launch. + point_config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1)) + configs = { + "seed_clear": grid_config, + "jfa_step": grid_config, + "colorize": grid_config, + "seed_splat": point_config, + } + + return dev, stream, kernels, configs + + +def create_window(): + """Open a pyglet window and return (window, gl_module, pyglet).""" + try: + import pyglet + from pyglet.gl import gl as _gl + except ImportError: + print( + "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet", + file=sys.stderr, + ) + sys.exit(1) + + window = pyglet.window.Window( + WIDTH, + HEIGHT, + caption="cuda.core CUDAArray/Texture/Surface - JFA Voronoi", + vsync=False, + ) + return window, _gl, pyglet + + +def create_display_resources(gl, width, height): + """Create the GL objects needed to show a texture on screen. + + This sets up a shader program, a fullscreen quad, and an empty texture. + None of this is CUDA-specific -- it's standard OpenGL boilerplate for + rendering a textured quad. + + Returns (shader_program, vertex_array_id, texture_id). The shader_program + is a pyglet ShaderProgram object (must be kept alive). 
+ """ + from pyglet.graphics.shader import Shader, ShaderProgram + + # Shader program -- just passes texture coordinates through + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + # Fullscreen quad (two triangles covering the entire window) + quad_verts = np.array( + [ + # x, y, s, t (position + texture coordinate) + -1, + -1, + 0, + 0, + 1, + -1, + 1, + 0, + 1, + 1, + 1, + 1, + -1, + -1, + 0, + 0, + 1, + 1, + 1, + 1, + -1, + 1, + 0, + 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 # 4 floats * 4 bytes each = 16 bytes per vertex + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + + gl.glBindVertexArray(0) + + # Empty texture (will be filled each frame from the PBO) + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR) + gl.glTexImage2D( + gl.GL_TEXTURE_2D, + 0, + gl.GL_RGBA8, + width, + height, + 0, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + + return shader_prog, vao.value, tex.value + + +def create_pixel_buffer(gl, width, height): + """Create a Pixel Buffer 
def make_state_arrays():
    """Allocate the two `float2` ping-pong arrays that hold the nearest-seed map.

    Both arrays are created from the same descriptor: WIDTH x HEIGHT, two
    FLOAT32 channels, with `is_surface_load_store=True`.

    Returns:
        (arr_a, arr_b): two separately allocated `CUDAArray` objects.
    """
    layout = {
        "shape": (WIDTH, HEIGHT),
        "format": ArrayFormat.FLOAT32,
        "num_channels": 2,
        "is_surface_load_store": True,
    }
    ping = CUDAArray.from_descriptor(**layout)
    pong = CUDAArray.from_descriptor(**layout)
    return ping, pong
+ + API MAP: + FilterMode.POINT -> exact texel reads (JFA needs no interpolation) + AddressMode.BORDER -> off-grid neighbor fetches return border_color + border_color (sentinel) -> a "no seed" value the kernel ignores, instead + of CLAMP-replicating a real edge seed + + JFA needs exact texel reads at integer neighbor offsets, so we use POINT + filtering (no interpolation). We address with BORDER + an explicit + border_color set to the same "no seed" sentinel as the map's empty cells + (x = -1). When a JFA neighbor lookup lands off the grid, the texture unit + returns that sentinel and the kernel ignores it. This is strictly more + correct than CLAMP: with CLAMP an off-edge fetch silently replicates the + edge texel's seed, which can pull a border pixel toward a seed that is not + actually its nearest one. BORDER turns those out-of-range fetches into a + clean "no candidate". + + Note on coordinates: BORDER addressing is valid with non-normalized + coordinates (only WRAP/MIRROR require normalized coords), so we keep the + integer-style `(nx + 0.5)` sampling used throughout the JFA. The border + sentinel is a 4-tuple because the descriptor always carries four channels; + a float2 read consumes channels 0-1, so (-1, -1) lands in (.x, .y) and the + trailing (0, 0) is unused. + """ + res_desc = ResourceDescriptor.from_array(arr) + tex_desc = TextureDescriptor( + address_mode=AddressMode.BORDER, + filter_mode=FilterMode.POINT, + read_mode=ReadMode.ELEMENT_TYPE, + normalized_coords=False, + border_color=(-1.0, -1.0, 0.0, 0.0), + ) + return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc) + + +def make_seeds(count): + """Create `count` drifting seeds. + + Each seed has a base position, an angular speed, and a radius. The instant + position is recomputed every frame from these via sin/cos. Returns a dict of + numpy arrays sized for MAX_SEEDS (only the first `count` are used). 
+ """ + rng = np.random.default_rng() + return { + "base_x": rng.uniform(0.2, 0.8, MAX_SEEDS).astype(np.float32) * WIDTH, + "base_y": rng.uniform(0.2, 0.8, MAX_SEEDS).astype(np.float32) * HEIGHT, + "radius": rng.uniform(0.05, 0.25, MAX_SEEDS).astype(np.float32) * min(WIDTH, HEIGHT), + "phase": rng.uniform(0.0, 2.0 * math.pi, MAX_SEEDS).astype(np.float32), + "speed": rng.uniform(0.3, 1.2, MAX_SEEDS).astype(np.float32), + "count": count, + } + + +def seed_positions(seeds, t): + """Return (xs, ys) instant positions for the active seeds at time `t`. + + Seeds drift along small circles via sin/cos so the Voronoi diagram animates + smoothly. Positions are clamped to the interior of the image. + """ + n = seeds["count"] + ang = seeds["phase"][:n] + seeds["speed"][:n] * t + xs = seeds["base_x"][:n] + seeds["radius"][:n] * np.cos(ang) + ys = seeds["base_y"][:n] + seeds["radius"][:n] * np.sin(ang) + xs = np.clip(xs, 0.0, WIDTH - 1.0).astype(np.float32) + ys = np.clip(ys, 0.0, HEIGHT - 1.0).astype(np.float32) + return xs, ys + + +def run_jfa(stream, kernels, configs, seeds, t, tex_a, tex_b, surf_a, surf_b): + """Run a full JFA pass for the current seed positions. + + Clears arr_a (via surf_a) to the sentinel, splats each seed into arr_a, then + ping-pongs the step loop between (tex_a/surf_a) and (tex_b/surf_b). + + Returns the TextureObject bound to the array that was written last, which + holds the final nearest-seed map for colorize. + """ + # 1. Clear arr_a to the sentinel (-1, -1). + launch( + stream, + configs["seed_clear"], + kernels["seed_clear"], + np.uint64(surf_a.handle), + np.int32(WIDTH), + np.int32(HEIGHT), + ) + + # 2. Splat each seed's own coordinate into arr_a (one 1-thread launch each). + xs, ys = seed_positions(seeds, t) + for i in range(seeds["count"]): + launch( + stream, + configs["seed_splat"], + kernels["seed_splat"], + np.uint64(surf_a.handle), + np.int32(WIDTH), + np.int32(HEIGHT), + np.float32(xs[i]), + np.float32(ys[i]), + ) + + # 3. 
def main():
    """Run the interactive JFA Voronoi / metaball demo until the window closes.

    Sets up CUDA and OpenGL, registers the GL pixel buffer with CUDA,
    allocates the two ping-pong arrays with their texture/surface handles,
    and then hands control to pyglet's event loop. Every drawn frame reruns
    the full Jump Flood pass for the animated seeds and colorizes the result
    straight into the pixel buffer.

    Keys: Escape closes the window, M toggles the display mode, R rerolls
    the seeds, +/- change the seed count within [MIN_SEEDS, MAX_SEEDS].
    """
    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
    dev, stream, kernels, configs = setup_cuda()

    # --- Step 2: Open a window ---
    window, gl, pyglet = create_window()

    # --- Step 3: Create GL resources for drawing a texture to screen ---
    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)

    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)

    # --- Step 5: Register the PBO with CUDA ---
    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")

    # --- Step 6: Allocate the two ping-pong nearest-seed map Arrays ---
    # Both are `float2` (channel 0 = seed x, channel 1 = seed y) with
    # is_surface_load_store=True so they can be bound as SurfaceObjects.
    arr_a, arr_b = make_state_arrays()

    # --- Step 7: Pre-create the four bindless handles (once, kept alive) ---
    tex_a = make_texture(arr_a)
    tex_b = make_texture(arr_b)
    surf_a = SurfaceObject.from_array(arr_a)
    surf_b = SurfaceObject.from_array(arr_b)

    # --- Step 8: Initialize seeds and view state ---
    state = {"mode": MODE_VORONOI, "seeds": make_seeds(DEFAULT_SEEDS)}

    # --- Step 9: Render loop ---
    start_time = time.monotonic()
    frame_count = 0
    fps_time = start_time

    @window.event
    def on_key_press(symbol, _modifiers):
        key = pyglet.window.key
        if symbol == key.ESCAPE:
            window.close()
            return
        if symbol == key.M:
            state["mode"] = MODE_METABALL if state["mode"] == MODE_VORONOI else MODE_VORONOI
            return
        if symbol == key.R:
            # Same seed count, fresh random layout.
            state["seeds"] = make_seeds(state["seeds"]["count"])
            return
        if symbol in (key.PLUS, key.EQUAL, key.NUM_ADD):
            new_count = min(MAX_SEEDS, state["seeds"]["count"] + 1)
            # Only reroll when the count actually changed (not already at the cap).
            if new_count != state["seeds"]["count"]:
                state["seeds"] = make_seeds(new_count)
            return
        if symbol in (key.MINUS, key.NUM_SUBTRACT):
            new_count = max(MIN_SEEDS, state["seeds"]["count"] - 1)
            if new_count != state["seeds"]["count"]:
                state["seeds"] = make_seeds(new_count)
            return

    @window.event
    def on_draw():
        nonlocal frame_count, fps_time

        window.clear()
        t = time.monotonic() - start_time

        # (a) Run the full Jump Flood Algorithm for the current seed positions.
        #     final_tex is the TextureObject over the array written last.
        final_tex = run_jfa(stream, kernels, configs, state["seeds"], t, tex_a, tex_b, surf_a, surf_b)

        # (b) Colorize the nearest-seed map into the OpenGL PBO.
        with resource.map(stream=stream) as buf:
            launch(
                stream,
                configs["colorize"],
                kernels["colorize"],
                np.uint64(final_tex.handle),
                buf.handle,
                np.int32(WIDTH),
                np.int32(HEIGHT),
                np.int32(state["mode"]),
                np.float32(t),
            )
        # Unmap happens automatically when the `with` block exits.

        # (c) Tell OpenGL to copy the PBO contents into our texture.
        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)

        # (d) Draw the texture to the screen.
        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)

        # FPS counter (shown in window title)
        frame_count += 1
        now = time.monotonic()
        if now - fps_time >= 1.0:
            fps = frame_count / (now - fps_time)
            label = MODE_LABELS[state["mode"]]
            window.set_caption(
                "cuda.core JFA Voronoi"
                " | TextureObject[POINT|BORDER|border_color] float2 + SurfaceObject"
                f" | mode={label} | {state['seeds']['count']} seeds"
                f" | {WIDTH}x{HEIGHT} | {fps:.0f} FPS"
            )
            frame_count = 0
            fps_time = now

    @window.event
    def on_close():
        # Kernel launches are asynchronous: work from the last frame may still
        # be reading the textures or writing the surfaces. Drain the stream
        # before any handle it references is destroyed.
        stream.sync()
        # Release everything we opened, in reverse order.
        resource.close()
        tex_a.close()
        tex_b.close()
        surf_a.close()
        surf_b.close()
        arr_a.close()
        arr_b.close()
        stream.close()

    pyglet.app.run(interval=0)
# CUDA C++ source for the four JFA kernels. Texture-object fetches spell out
# their return type (`tex2D<float2>`): the template parameter of the
# texture-object overload only appears in the return type, so the compiler
# cannot deduce it from the arguments.
KERNEL_SOURCE = r"""
// The nearest-seed map is a float2 per texel: (.x, .y) = coordinate of the
// nearest known seed, or the sentinel (-1, -1) for "none yet". With POINT
// filtering + non-normalized coords, texel (ix, iy) is read at
// tex2D<float2>(tex, ix + 0.5f, iy + 0.5f). The texture is BORDER-addressed
// with border_color == the sentinel, so a fetch with an out-of-range coord
// also returns (-1, -1) and is rejected by is_seed() -- the same path as an
// empty interior cell.

#define SENTINEL_X (-1.0f)

__device__ __forceinline__ bool is_seed(float2 s) {
    // Any non-negative x marks a valid stored seed coordinate.
    return s.x >= 0.0f;
}

// Fully-saturated HSV->RGB, hue/value driven by hash, returns vivid neon RGB.
__device__ __forceinline__ void hsv_to_rgb(float hue, float sat, float val,
                                           float* r, float* g, float* b) {
    hue -= floorf(hue);  // wrap hue into [0, 1)
    float h6 = hue * 6.0f;
    float c = val * sat;
    float x = c * (1.0f - fabsf(fmodf(h6, 2.0f) - 1.0f));
    float m = val - c;
    float rr, gg, bb;
    if (h6 < 1.0f) { rr = c; gg = x; bb = 0.0f; }
    else if (h6 < 2.0f) { rr = x; gg = c; bb = 0.0f; }
    else if (h6 < 3.0f) { rr = 0.0f; gg = c; bb = x; }
    else if (h6 < 4.0f) { rr = 0.0f; gg = x; bb = c; }
    else if (h6 < 5.0f) { rr = x; gg = 0.0f; bb = c; }
    else { rr = c; gg = 0.0f; bb = x; }
    *r = rr + m; *g = gg + m; *b = bb + m;
}

// Hash a seed coordinate into a smooth, vivid per-cell neon color. The hash
// drives a hue around the full color wheel; saturation/value stay high so
// neighboring cells read as distinct saturated hues rather than muddy bytes.
__device__ __forceinline__ void seed_color(float sx, float sy,
                                           float* r, float* g, float* b) {
    unsigned int h = (unsigned int)(sx + 0.5f) * 374761393u
                   + (unsigned int)(sy + 0.5f) * 668265263u;
    h = (h ^ (h >> 13)) * 1274126177u;
    h = h ^ (h >> 16);
    float hue = (h & 0xffffu) / 65535.0f;
    // A little value jitter from the high bits keeps equal-hue cells separable.
    float val = 0.85f + 0.15f * (((h >> 16) & 0xffu) / 255.0f);
    hsv_to_rgb(hue, 0.92f, val, r, g, b);
}

extern "C"
__global__
void seed_clear(cudaSurfaceObject_t surf, int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;
    // float2 is 8 bytes; surf2Dwrite takes the x offset in BYTES.
    surf2Dwrite(make_float2(SENTINEL_X, SENTINEL_X), surf,
                x * (int)sizeof(float2), y);
}

extern "C"
__global__
void seed_splat(cudaSurfaceObject_t surf, int width, int height,
                float sx, float sy) {
    // Single-thread launch: write this seed's own coordinate into its cell.
    int ix = (int)(sx + 0.5f);
    int iy = (int)(sy + 0.5f);
    if (ix < 0) ix = 0;
    if (ix >= width) ix = width - 1;
    if (iy < 0) iy = 0;
    if (iy >= height) iy = height - 1;
    surf2Dwrite(make_float2(sx, sy), surf, ix * (int)sizeof(float2), iy);
}

extern "C"
__global__
void jfa_step(cudaTextureObject_t tex, cudaSurfaceObject_t surf,
              int width, int height, int step) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float px = (float)x;
    float py = (float)y;

    float best_x = SENTINEL_X;
    float best_y = SENTINEL_X;
    float best_d2 = 3.0e38f;  // ~FLT_MAX

    // Examine self (dx=dy=0) and the 8 neighbors at +/- step. We deliberately
    // do NOT clamp the neighbor coordinate: off-grid lookups are left out of
    // range so the BORDER-addressed texture returns the sentinel border_color
    // (-1, -1). is_seed() then rejects it, exactly as it would reject an empty
    // interior cell. Under the old CLAMP scheme an off-edge fetch returned the
    // edge texel's real seed, which could win the nearest-seed test for a
    // border pixel even though that seed is not actually its nearest.
    #pragma unroll
    for (int dy = -1; dy <= 1; ++dy) {
        #pragma unroll
        for (int dx = -1; dx <= 1; ++dx) {
            int nx = x + dx * step;
            int ny = y + dy * step;

            float2 s = tex2D<float2>(tex, (float)nx + 0.5f, (float)ny + 0.5f);
            if (is_seed(s)) {
                float ddx = s.x - px;
                float ddy = s.y - py;
                float d2 = ddx * ddx + ddy * ddy;
                if (d2 < best_d2) {
                    best_d2 = d2;
                    best_x = s.x;
                    best_y = s.y;
                }
            }
        }
    }

    surf2Dwrite(make_float2(best_x, best_y), surf, x * (int)sizeof(float2), y);
}

extern "C"
__global__
void colorize(cudaTextureObject_t tex, unsigned char* output,
              int width, int height, int mode, float t) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float2 c = tex2D<float2>(tex, (float)x + 0.5f, (float)y + 0.5f);

    float r = 0.0f, g = 0.0f, b = 0.0f;

    if (is_seed(c)) {
        float dx = c.x - (float)x;
        float dy = c.y - (float)y;
        float dist = sqrtf(dx * dx + dy * dy);

        if (mode == 0) {
            // --- Voronoi cells: smooth neon color + glowing cell borders. ---
            seed_color(c.x, c.y, &r, &g, &b);

            // Border proximity: count how many 8-neighbors belong to a different
            // cell. A pixel deep inside a cell sees 0; a pixel right on the edge
            // sees several. We use this as a smooth edge factor rather than a
            // hard on/off so borders read as a luminous seam, not a jagged line.
            int diff = 0;
            const int ox[8] = {-1, 1, 0, 0, -1, -1, 1, 1};
            const int oy[8] = {0, 0, -1, 1, -1, 1, -1, 1};
            #pragma unroll
            for (int i = 0; i < 8; ++i) {
                int nx = x + ox[i];
                int ny = y + oy[i];
                if (nx < 0) nx = 0;
                if (nx >= width) nx = width - 1;
                if (ny < 0) ny = 0;
                if (ny >= height) ny = height - 1;
                float2 n = tex2D<float2>(tex, (float)nx + 0.5f, (float)ny + 0.5f);
                if (is_seed(n) && (n.x != c.x || n.y != c.y)) {
                    ++diff;
                }
            }

            // Smooth interior shading: gentle radial falloff from the cell seed
            // for a soft volumetric look, slowly breathing in time.
            float shade = 1.0f / (1.0f + 0.0006f * dist * dist);
            float pulse = 0.92f + 0.08f * sinf(1.5f * t + 0.02f * dist);
            shade = (0.55f + 0.45f * shade) * pulse;
            r *= shade; g *= shade; b *= shade;

            if (diff > 0) {
                // edge in [0,1]: stronger the more neighbors disagree.
                float edge = (float)diff / 8.0f;
                edge = edge * edge;  // bias toward the true seam
                // Darken the base color toward the seam, then add a bright neon
                // rim on top so cell boundaries glow instead of just going dark.
                float dark = 1.0f - 0.85f * edge;
                r *= dark; g *= dark; b *= dark;
                float rim = edge * (0.65f + 0.35f * sinf(2.5f * t));
                r += rim; g += rim * 0.9f; b += rim;
            }
        } else {
            // --- Metaballs: glowing neon falloff from the nearest seed. ---
            // Brightness peaks at the seed and decays smoothly with distance.
            float glow = 1.0f / (1.0f + 0.0018f * dist * dist);
            // A couple of animated isoline ripples add a layered plasma pulse.
            float ripple = 0.5f + 0.5f * sinf(0.13f * dist - 3.0f * t);
            float ripple2 = 0.5f + 0.5f * sinf(0.05f * dist + 1.7f * t);
            float intensity = glow * (0.55f + 0.30f * ripple + 0.15f * ripple2);
            // A soft core bloom keeps seed centers reading as hot points.
            float core = 1.0f / (1.0f + 0.02f * dist * dist);
            intensity += 0.5f * core;

            // Hue sweeps with distance + time so blobs shimmer through the neon
            // spectrum; value tracks intensity so falloff still fades to black.
            float hue = 0.6f + 0.0015f * dist + 0.05f * t;
            float val = intensity;
            if (val > 1.0f) val = 1.0f;
            hsv_to_rgb(hue, 0.85f, val, &r, &g, &b);
            // Lift toward white at the very brightest cores for a hot-tip look.
            float hot = intensity - 1.0f;
            if (hot > 0.0f) {
                if (hot > 1.0f) hot = 1.0f;
                r += hot * (1.0f - r);
                g += hot * (1.0f - g);
                b += hot * (1.0f - b);
            }
        }
    }

    // Clamp to [0, 1] before writing bytes.
    if (r < 0.0f) r = 0.0f; if (r > 1.0f) r = 1.0f;
    if (g < 0.0f) g = 0.0f; if (g > 1.0f) g = 1.0f;
    if (b < 0.0f) b = 0.0f; if (b > 1.0f) b = 1.0f;

    int idx = (y * width + x) * 4;
    output[idx + 0] = (unsigned char)(r * 255.0f);
    output[idx + 1] = (unsigned char)(g * 255.0f);
    output[idx + 2] = (unsigned char)(b * 255.0f);
    output[idx + 3] = 255;
}
"""
+# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject +# in combination with GraphicsResource for CUDA/OpenGL interop. A Lenia +# continuous cellular automaton is ping-ponged between two CUDA arrays each +# frame: a TextureObject provides smooth (LINEAR + WRAP) sampled reads through +# a large bell-shaped neighborhood kernel, and a SurfaceObject provides typed +# writes. The final state is colorized straight into an OpenGL PBO. Requires +# pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to drive a wide-radius convolution from a TextureObject configured for +# LINEAR + WRAP + normalized coordinates. The same CUDAArray is then bound as a +# SurfaceObject for the typed write back, requiring `is_surface_load_store=True` +# at allocation time. +# - How a single-channel `float` CUDAArray differs from the multi-channel layout +# used in the Gray-Scott example: `num_channels=1`, `tex2D<float>` reads, and +# a 4-byte x-stride in `surf2Dwrite`. +# - How to host-precompute a normalization constant for a stencil with a +# variable-shape support (the bell-curve neighborhood), then pass it as a +# plain float kernel argument. +# +# How it works +# ============ +# Lenia (Bert Wang-Chak Chan, 2018) generalizes Conway's Game of Life to +# continuous space, time, and state. Each cell holds a real value in [0, 1]. +# Per step, every cell: +# +# 1. Integrates a smooth bell-shaped neighborhood kernel K against the +# current state to produce a "potential" U: +# +# U(x) = sum over offsets (dx, dy) inside a disk of radius R of +# K(|(dx, dy)|) * state(x + (dx, dy)) +# divided by sum of K (host-precomputed). +# +# K(r) = exp(-((r / R) - mu_K)^2 / (2 * sigma_K^2)) for r <= R. +# +# 2. 
Applies the growth function G and updates the state: +# +# state_new = clamp(state_old + dt * (2 * exp(-(U - mu)^2 / +# (2 * sigma^2)) - 1), 0, 1). +# +# Two single-channel `float` arrays are ping-ponged each frame: a +# TextureObject reads one (sampled with LINEAR + WRAP so the disk wraps +# toroidally) and a SurfaceObject writes the other. +# +# PING-PONG (two arrays, swap each step) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +--------------+ tex2D +------------------+ +# | arr_a | ----------------> | | +# | state | | convolve_lenia | +# +--------------+ | kernel | +# | (+ growth fn) | +# +--------------+ surf2Dwrite | | +# | arr_b | <---------------- | | +# | state | +------------------+ +# +--------------+ +# (swap) +# +# After the step we run a separate `colorize_lenia` kernel that samples the +# new state and writes RGBA bytes straight into the OpenGL PBO via +# GraphicsResource. No data ever travels across the PCIe bus during the frame. +# +# Why LINEAR + WRAP + normalized coords? +# -------------------------------------- +# Lenia's neighborhood radius (R = 13) is wide enough that boundary handling +# really matters. AddressMode.WRAP gives a toroidal world for free, and it is +# only supported in normalized coordinate mode (see the CUDA Programming +# Guide). LINEAR filtering is essentially free on the hardware -- here it +# softens the integer-offset reads a hair, which keeps the dynamics smooth. +# Sample coordinates are `(x + dx + 0.5) / W`; values < 0 or > 1 are fine, +# WRAP handles them. +# +# Channel byte width in surf2Dwrite +# --------------------------------- +# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. For a +# single-channel `float` surface that means `x * sizeof(float)` = `x * 4`. +# (The Gray-Scott example uses 8 because it stores `float2`.) +# +# One step per frame +# ------------------ +# Each step convolves a (2R+1)^2 = 729-tap neighborhood for every pixel, which +# is much heavier than a Gray-Scott 5-point Laplacian. 
With dt = 0.1 the +# dynamics are slow enough that one step per displayed frame is plenty. There +# is no `N_STEPS` loop. +# +# What you should see +# =================== +# A window showing soft, glider-like blobs drifting across the field on a +# teal-on-black palette. Press R to reseed with a new Gaussian blob, 1 to +# clear the field, and Escape to exit. The window title shows the current +# FPS. +# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import math +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + ArrayFormat, + CUDAArray, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Simulation parameters (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 256 +HEIGHT = 256 + +# Neighborhood / kernel shape +R = 13 # convolution radius in pixels (texture-space) +MU_K = 0.5 # bell center for the neighborhood weight K(r/R) +SIGMA_K = 0.15 # bell width for K + +# Growth function shape +MU = 0.15 # bell center for the growth function G(U) +SIGMA = 0.015 # bell width for G + +DT = 0.1 # time step + +# Initial blob radius and peak for the Gaussian seed. +# The radius must be large relative to the neighborhood radius R=13 so the +# kernel-integrated potential U lands near the growth bell's center mu=0.15. +# With SEED_RADIUS=36, U at the blob's centre starts near mu and the field +# survives the first step; smaller seeds collapse to zero within one frame +# because U is far outside the narrow (sigma=0.015) growth bell. 
def compute_kernel_norm(radius, mu_k, sigma_k):
    """Return the reciprocal of the summed neighborhood weights as a float32.

    Visits every integer offset in the (2R+1) x (2R+1) box around the origin,
    skips offsets farther than `radius` from the center, and adds up the bell
    weight `exp(-(r/R - mu_k)^2 / (2 * sigma_k^2))` of the rest. The walk is
    intended to match the accumulation done by the device-side convolution so
    that multiplying by the result turns the convolution into a weighted mean.

    Raises RuntimeError when the accumulated weight is not positive.
    """
    gauss_scale = 1.0 / (2.0 * sigma_k * sigma_k)
    radius_recip = 1.0 / float(radius)
    offsets = range(-radius, radius + 1)

    weight_sum = 0.0
    for oy in offsets:
        for ox in offsets:
            dist = math.sqrt(ox * ox + oy * oy)
            if dist <= radius:
                # Distance rescaled to [0, 1], measured from the bell center.
                shifted = dist * radius_recip - mu_k
                weight_sum += math.exp(-(shifted * shifted) * gauss_scale)

    if weight_sum <= 0.0:
        raise RuntimeError("kernel normalization sum collapsed to zero")
    return np.float32(1.0 / weight_sum)
def create_window():
    """Create the pyglet window used for display.

    pyglet is imported lazily so that a missing installation produces a
    readable message on stderr and exit status 1 instead of a traceback.

    Returns a tuple `(window, gl_module, pyglet)`.
    """
    try:
        import pyglet
        from pyglet.gl import gl as gl_module
    except ImportError:
        print(
            "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet",
            file=sys.stderr,
        )
        raise SystemExit(1)

    window_options = {
        "caption": "cuda.core CUDAArray/Texture/Surface - Lenia",
        "vsync": False,
    }
    return pyglet.window.Window(WIDTH, HEIGHT, **window_options), gl_module, pyglet
def create_pixel_buffer(gl, width, height):
    """Allocate the pixel-unpack buffer that CUDA writes and OpenGL reads.

    Generates one GL buffer, binds it to GL_PIXEL_UNPACK_BUFFER, reserves
    `width * height * 4` bytes (RGBA, one byte per channel) with
    GL_DYNAMIC_DRAW usage and no initial data, then unbinds the target.

    Returns a tuple `(pbo_gl_name, size_in_bytes)`.
    """
    bytes_per_pixel = 4  # RGBA, 1 byte per channel
    size_in_bytes = width * height * bytes_per_pixel
    target = gl.GL_PIXEL_UNPACK_BUFFER

    handle = ctypes.c_uint(0)
    gl.glGenBuffers(1, ctypes.byref(handle))
    gl.glBindBuffer(target, handle.value)
    gl.glBufferData(target, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
    gl.glBindBuffer(target, 0)  # leave no PBO bound for later GL calls
    return handle.value, size_in_bytes
def make_texture(arr):
    """Create a TextureObject over `arr` with LINEAR filtering, WRAP addressing, and normalized coordinates."""
    return TextureObject.from_descriptor(
        resource=ResourceDescriptor.from_array(arr),
        texture_descriptor=TextureDescriptor(
            address_mode=AddressMode.WRAP,
            filter_mode=FilterMode.LINEAR,
            read_mode=ReadMode.ELEMENT_TYPE,
            # Normalized coordinates are mandatory for WRAP/MIRROR addressing.
            normalized_coords=True,
        ),
    )
def main():
    """Run the interactive Lenia demo until the window closes.

    Sets up CUDA and OpenGL, registers the GL pixel buffer with CUDA,
    allocates the two ping-pong state arrays with their texture/surface
    handles, seeds an initial blob, and then hands control to pyglet's event
    loop. Every drawn frame advances the simulation by one step and colorizes
    the new state straight into the pixel buffer.

    Keys: Escape closes the window, R reseeds with a new blob, 1 clears the
    field.
    """
    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
    dev, stream, kernels, configs = setup_cuda()

    # --- Step 2: Open a window ---
    window, gl, pyglet = create_window()

    # --- Step 3: Create GL resources for drawing a texture to screen ---
    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)

    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
    #     The PBO is GPU memory owned by OpenGL. It's the bridge between the
    #     two worlds: CUDA writes into it, OpenGL reads from it.
    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)

    # --- Step 5: Register the PBO with CUDA ---
    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")

    # --- Step 6: Allocate the two ping-pong state Arrays ---
    #     Both are single-channel `float` with `is_surface_load_store=True` so
    #     they can be bound as SurfaceObjects.
    arr_a, arr_b = make_state_arrays()

    # --- Step 7: Pre-create the four bindless handles ---
    #     Creating these once is much cheaper than rebuilding them every
    #     step. The simulation loop just picks which read/write pair to use.
    tex_a = make_texture(arr_a)
    tex_b = make_texture(arr_b)
    surf_a = SurfaceObject.from_array(arr_a)
    surf_b = SurfaceObject.from_array(arr_b)

    # --- Step 8: Precompute the bell-curve normalization constant ---
    #     The neighborhood weight K(r) is unnormalized in the kernel; we
    #     divide by sum(K) so the convolution is a weighted mean rather than
    #     an unbounded integral. Doing this on the host once at startup is
    #     much cheaper than redoing it on the device every step.
    inv_weight_sum = compute_kernel_norm(R, MU_K, SIGMA_K)

    # --- Step 9: Seed an initial Gaussian blob into arr_a (writes via surf_a) ---
    seed_state(stream, kernels, configs, surf_a, SEED_MODE_BLOB, seed_value=0)
    # After seeding, `arr_a` is the "current" state.
    state = {"current": "a", "seed": 0}

    # --- Step 10: Render loop ---
    start_time = time.monotonic()
    frame_count = 0
    fps_time = start_time

    def current_read_write():
        if state["current"] == "a":
            return tex_a, surf_b, "b"  # read a, write b, next current = b
        return tex_b, surf_a, "a"

    @window.event
    def on_key_press(symbol, _modifiers):
        key = pyglet.window.key
        if symbol == key.ESCAPE:
            window.close()
            return
        if symbol == key.R:
            # Reseed with a new Gaussian blob; bump the seed so the jitter
            # pattern changes each time.
            state["seed"] += 1
            seed_state(stream, kernels, configs, surf_a, SEED_MODE_BLOB, state["seed"])
            state["current"] = "a"
            return
        if symbol == key._1:
            # Clear the field. Useful to confirm the simulation is quiet when
            # the state is zero.
            seed_state(stream, kernels, configs, surf_a, SEED_MODE_CLEAR, 0)
            state["current"] = "a"
            return

    @window.event
    def on_draw():
        nonlocal frame_count, fps_time

        window.clear()

        # (a) Run one Lenia step. The convolution kernel reads the current
        #     state via a TextureObject (LINEAR + WRAP gives toroidal
        #     wrapping at the border), evaluates the growth function, and
        #     writes the new state via a SurfaceObject. One step per frame
        #     is intentional: dt = 0.1 is small, and the (2R+1)^2 = 729-tap
        #     stencil is heavy enough that going faster would not help.
        tex_read, surf_write, next_current = current_read_write()
        launch(
            stream,
            configs["step"],
            kernels["step"],
            np.uint64(tex_read.handle),
            np.uint64(surf_write.handle),
            np.int32(WIDTH),
            np.int32(HEIGHT),
            np.int32(R),
            np.float32(MU_K),
            np.float32(SIGMA_K),
            np.float32(MU),
            np.float32(SIGMA),
            np.float32(DT),
            inv_weight_sum,
        )
        state["current"] = next_current

        # (b) Colorize the latest state into the OpenGL PBO.
        tex_read = tex_a if state["current"] == "a" else tex_b
        with resource.map(stream=stream) as buf:
            launch(
                stream,
                configs["colorize"],
                kernels["colorize"],
                np.uint64(tex_read.handle),
                buf.handle,
                np.int32(WIDTH),
                np.int32(HEIGHT),
            )
        # Unmap happens automatically when the `with` block exits.

        # (c) Tell OpenGL to copy the PBO contents into our texture.
        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)

        # (d) Draw the texture to the screen.
        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)

        # FPS counter (shown in window title)
        frame_count += 1
        now = time.monotonic()
        if now - fps_time >= 1.0:
            fps = frame_count / (now - fps_time)
            window.set_caption(f"cuda.core CUDAArray/Texture/Surface - Lenia ({WIDTH}x{HEIGHT}, R={R}, {fps:.0f} FPS)")
            frame_count = 0
            fps_time = now

    @window.event
    def on_close():
        # Kernel launches are asynchronous: the last step/colorize/seed work
        # may still be using the textures and surfaces. Drain the stream
        # before any handle it references is destroyed.
        stream.sync()
        # Release everything we opened, in reverse order. Each of these is a
        # context manager too, but pyglet owns the event loop here so we
        # release explicitly.
        resource.close()
        tex_a.close()
        tex_b.close()
        surf_a.close()
        surf_b.close()
        arr_a.close()
        arr_b.close()
        stream.close()

    pyglet.app.run(interval=0)
+# Either clears the field (mode = 0) or paints a +# Gaussian blob centered in the field (mode = 1). +# * convolve_lenia -- reads previous state via TextureObject (with +# LINEAR + WRAP bilinear filtering), integrates a +# bell-shaped neighborhood K(r/R) to produce the +# potential U, applies the growth function G(U), +# and writes the next state via SurfaceObject. +# * colorize_lenia -- reads the new state via TextureObject and writes +# RGBA bytes into the OpenGL PBO using a simple +# teal-on-black gradient. +# +# - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a +# texture onto a rectangle covering the entire window. Nothing interesting. +# +# ============================================================================ + +KERNEL_SOURCE = r""" +// All kernels run one thread per output pixel and bounds-check at the top. +// `surf2Dwrite` takes the x offset in BYTES; for a single-channel float +// surface that means `x * sizeof(float)` = `x * 4`. + +extern "C" +__global__ +void seed_blob(cudaSurfaceObject_t surf, + int width, int height, + int mode, + unsigned int seed, + float radius, + float peak) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + float value = 0.0f; + if (mode == 1) { + // Gaussian blob centered in the field with a small deterministic + // jitter that breaks symmetry differently on each reseed. 
+ float cx = (float)(width / 2); + float cy = (float)(height / 2); + float dx = (float)x - cx; + float dy = (float)y - cy; + float r2 = dx * dx + dy * dy; + float inv = 1.0f / (radius * radius); + value = peak * expf(-r2 * inv); + + unsigned int h = (unsigned int)x * 374761393u + + (unsigned int)y * 668265263u + seed * 2246822519u; + h = (h ^ (h >> 13)) * 1274126177u; + h = h ^ (h >> 16); + float noise = (h & 0xffffu) / 65535.0f; // in [0, 1] + value += 0.02f * (noise - 0.5f); + if (value < 0.0f) value = 0.0f; + if (value > 1.0f) value = 1.0f; + } + + // float is 4 bytes; surf2Dwrite takes the x offset in BYTES. + surf2Dwrite(value, surf, x * (int)sizeof(float), y); +} + +extern "C" +__global__ +void convolve_lenia(cudaTextureObject_t tex, + cudaSurfaceObject_t surf, + int width, int height, + int R, + float mu_k, float sigma_k, + float mu, float sigma, + float dt, + float inv_weight_sum) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + // Normalized texture coordinates: WRAP addressing requires them. The + // (x + dx + 0.5) / W idiom places the sample at the texel center; values + // outside [0, 1] are fine because WRAP wraps them toroidally. + float inv_w = 1.0f / (float)width; + float inv_h = 1.0f / (float)height; + float inv_R = 1.0f / (float)R; + float inv_two_sigma_k2 = 1.0f / (2.0f * sigma_k * sigma_k); + float inv_two_sigma2 = 1.0f / (2.0f * sigma * sigma); + + // Integrate the bell-shaped weight K(r/R) against the current state. 
+ float U = 0.0f; + for (int dy = -R; dy <= R; ++dy) { + for (int dx = -R; dx <= R; ++dx) { + float fdx = (float)dx; + float fdy = (float)dy; + float r2 = fdx * fdx + fdy * fdy; + float r = sqrtf(r2); + if (r > (float)R) continue; // restrict to the disk + float rn = r * inv_R - mu_k; + float w = expf(-(rn * rn) * inv_two_sigma_k2); + + float sx = ((float)x + fdx + 0.5f) * inv_w; + float sy = ((float)y + fdy + 0.5f) * inv_h; + float s = tex2D(tex, sx, sy); + U += w * s; + } + } + U *= inv_weight_sum; // host-precomputed 1 / sum(K) + + // Read the current cell value (point sample at the texel center). + float sx0 = ((float)x + 0.5f) * inv_w; + float sy0 = ((float)y + 0.5f) * inv_h; + float state = tex2D(tex, sx0, sy0); + + // Growth function G(U) = 2 * exp(-(U - mu)^2 / (2 * sigma^2)) - 1, + // mapping U near mu to +1 (grow) and U far from mu to -1 (shrink). + float du = U - mu; + float G = 2.0f * expf(-(du * du) * inv_two_sigma2) - 1.0f; + + float new_state = state + dt * G; + if (new_state < 0.0f) new_state = 0.0f; + if (new_state > 1.0f) new_state = 1.0f; + + surf2Dwrite(new_state, surf, x * (int)sizeof(float), y); +} + +extern "C" +__global__ +void colorize_lenia(cudaTextureObject_t tex, + unsigned char* output, + int width, int height) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + float inv_w = 1.0f / (float)width; + float inv_h = 1.0f / (float)height; + float cx = ((float)x + 0.5f) * inv_w; + float cy = ((float)y + 0.5f) * inv_h; + + float v = tex2D(tex, cx, cy); + if (v < 0.0f) v = 0.0f; + if (v > 1.0f) v = 1.0f; + + // Linear interpolation from a deep teal at v = 0 to a bright teal at + // v = 1. Two stops -- simple, easy to read, no LUT required. 
+ // (0, 15, 30, 255) -> (50, 200, 180, 255) + float r = ( 0.0f + v * ( 50.0f - 0.0f)); + float g = ( 15.0f + v * (200.0f - 15.0f)); + float b = ( 30.0f + v * (180.0f - 30.0f)); + + int idx = (y * width + x) * 4; + output[idx + 0] = (unsigned char)r; + output[idx + 1] = (unsigned char)g; + output[idx + 2] = (unsigned char)b; + output[idx + 3] = 255; +} +""" + +# GLSL shaders -- these just display a texture on a fullscreen rectangle. +# Nothing CUDA-specific here. + +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_mandelbrot.py b/cuda_core/examples/gl_interop_mandelbrot.py new file mode 100644 index 00000000000..73671d77e95 --- /dev/null +++ b/cuda_core/examples/gl_interop_mandelbrot.py @@ -0,0 +1,692 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.CUDAArray and TextureObject used as a *color +# lookup table* (palette LUT) for a real-time Mandelbrot deep-zoom explorer. +# A CUDA kernel computes smooth iteration counts and uses tex1D with +# LINEAR + CLAMP + NORMALIZED_FLOAT sampling to read a 256-entry RGBA palette, +# writing the final RGBA bytes straight into an OpenGL PBO via GraphicsResource. +# Requires pyglet. 
+# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to use a 1D cuda.core.CUDAArray as a palette and bind it via a +# TextureObject for hardware-filtered color lookups inside a kernel. +# - How LINEAR + AddressMode.CLAMP + ReadMode.NORMALIZED_FLOAT + normalized +# coordinates give you a free `texture(palette, t)` style sampler that +# returns a float4 in [0, 1] regardless of the underlying storage format. +# - How to drive a real-time interactive viewer: mouse pan, scroll-wheel zoom +# anchored at the cursor, and key-driven iteration cap. +# +# How it works +# ============ +# The Mandelbrot set is defined by iterating z -> z^2 + c starting from +# z = 0; pixels are colored by how quickly z escapes the disk of radius 2. +# +# +---------+ ResourceDescriptor.from_array +# | CUDAArray | --------------------------------+ +# | float4 | v +# | size 256| +-------------------+ +# +---------+ | TextureObject | +# ^ copy_from(host) | (palette LUT) | +# | +---------+---------+ +# host palette | +# (numpy float32x4, 256 stops) | +# v +# tex1D(palette, t) +# | +# v +# +-----------------------+ +# | mandelbrot kernel | +# | (one thread / pixel) | +# +-----------+-----------+ +# | +# v GraphicsResource.map +# +-----------------------+ +# | OpenGL PBO (RGBA8) | +# +-----------------------+ +# +# Smooth iteration count +# ---------------------- +# A plain integer escape count produces ugly banded colors. With a bailout +# radius R = 2 (escape when |z|^2 > 4), we use the standard smooth formula: +# +# mu = iter + 1 - log(log(|z|)) / log(2) +# +# At the escape step |z| > 2, so log(|z|) > log(2) > 0 and log(log(|z|)) is +# finite. We compute this in double and cast to float for the palette lookup. +# +# Cursor-anchored zoom +# -------------------- +# On scroll, we want the world point under the mouse cursor to remain under +# the cursor after the zoom. 
We capture (wx, wy) under the cursor with the +# old scale, multiply the scale by 0.9 (zoom in) or 1.1 (zoom out), then +# back-solve cx, cy so the same screen pixel still maps to (wx, wy): +# +# cx_new = wx - (mouse_x - W/2) * scale_new +# cy_new = wy - (mouse_y - H/2) * scale_new +# +# Why double precision for cx, cy, scale? +# --------------------------------------- +# Float32 runs out of mantissa bits around 1e6x zoom; double gets you to +# roughly 1e13x before the pixel grid coarsens visibly. The kernel takes +# cx, cy, scale as doubles and only narrows to float for the color lookup. +# +# Address mode note +# ----------------- +# We use AddressMode.CLAMP (per the example brief). Combined with the +# `fmodf(mu * 0.02f, 1.0f)` cycling formula, the palette index is already +# guaranteed to be in [0, 1), so CLAMP and WRAP both produce identical +# results in practice -- there is no visible seam. +# +# What you should see +# =================== +# A window showing the Mandelbrot set. Drag with the left mouse button to +# pan, scroll the wheel to zoom in/out at the cursor, press R to reset the +# view, and `[`/`]` to lower/raise the iteration cap. The window title shows +# the current zoom level, center, max_iter, and FPS. Close the window or +# press Escape to exit. 
+# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + ArrayFormat, + CUDAArray, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Window and viewer parameters (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 1024 +HEIGHT = 768 +PALETTE_SIZE = 256 + +# Default view: classic Mandelbrot framing centered slightly left of origin. +DEFAULT_CX = -0.5 +DEFAULT_CY = 0.0 +DEFAULT_SCALE = 4.0 / HEIGHT # world-units per pixel (4-unit-tall view) +DEFAULT_MAX_ITER = 512 + +# Bounds for [/] iteration adjust. +MIN_MAX_ITER = 64 +MAX_MAX_ITER = 8192 +ITER_STEP = 64 + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA and OpenGL. If you're here to learn about +# CUDAArray/TextureObject as a palette LUT, skip ahead to main() -- the interesting +# part is there. These helpers exist so that main() reads like a short story +# instead of a wall of boilerplate. +# ============================================================================ + + +def setup_cuda(): + """Compile the CUDA kernel and return (device, stream, kernel, config).""" + dev = Device(0) + dev.set_current() + + # Bindless texture objects (cuTexObjectCreate) require SM 3.0+. + cc = dev.compute_capability + if cc.major < 3: + print( + "This example requires a GPU with compute capability >= 3.0 for " + f"bindless texture objects. Found sm_{cc.major}{cc.minor}.", + file=sys.stderr, + ) + sys.exit(1) + + stream = dev.create_stream() + + # Compile as C++ so the templated tex1D overload resolves. 
def create_window():
    """Create the pyglet window used to display the rendered image.

    pyglet is imported lazily so that a missing installation produces a
    short, actionable message on stderr (followed by exit status 1) instead
    of an import traceback.

    Returns
    -------
    tuple
        ``(window, gl_module, pyglet)``: the opened window, pyglet's GL
        function module, and the pyglet package itself.
    """
    try:
        import pyglet
        from pyglet.gl import gl as gl_module
    except ImportError:
        message = "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet"
        print(message, file=sys.stderr)
        sys.exit(1)

    title = "cuda.core CUDAArray/Texture - Mandelbrot Deep Zoom"
    # vsync is disabled so the frame rate is not capped by the display.
    win = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
    return win, gl_module, pyglet
+ """ + from pyglet.graphics.shader import Shader, ShaderProgram + + # Shader program -- just passes texture coordinates through + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + # Fullscreen quad (two triangles covering the entire window) + quad_verts = np.array( + [ + # x, y, s, t (position + texture coordinate) + -1, + -1, + 0, + 0, + 1, + -1, + 1, + 0, + 1, + 1, + 1, + 1, + -1, + -1, + 0, + 0, + 1, + 1, + 1, + 1, + -1, + 1, + 0, + 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 # 4 floats * 4 bytes each = 16 bytes per vertex + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + + gl.glBindVertexArray(0) + + # Empty texture (will be filled each frame from the PBO) + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR) + gl.glTexImage2D( + gl.GL_TEXTURE_2D, + 0, + gl.GL_RGBA8, + width, + height, + 0, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + + return shader_prog, vao.value, tex.value + + +def create_pixel_buffer(gl, width, height): + """Create a Pixel Buffer 
def build_palette(size=None):
    """Build an RGBA float32 palette by lerping through fixed color stops.

    Parameters
    ----------
    size : int, optional
        Number of palette entries. Defaults to the module-level
        ``PALETTE_SIZE`` when omitted.

    Returns
    -------
    numpy.ndarray
        Flat, C-contiguous float32 array of shape ``(size * 4,)`` laid out as
        R, G, B, A per entry, suitable for CUDAArray.copy_from(). Each color
        channel is in [0, 1].

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1.
    """
    if size is None:
        size = PALETTE_SIZE
    if size < 1:
        raise ValueError(f"palette size must be >= 1, got {size}")

    # Hand-picked stops: deep blue -> cyan -> yellow -> orange -> red ->
    # magenta -> black. Columns are: position, R, G, B, A.
    stops = np.array(
        [
            [0.00, 0.02, 0.05, 0.30, 1.0],  # deep blue
            [0.16, 0.10, 0.50, 0.90, 1.0],  # cyan
            [0.42, 1.00, 0.95, 0.20, 1.0],  # yellow
            [0.58, 1.00, 0.55, 0.10, 1.0],  # orange
            [0.74, 0.95, 0.10, 0.10, 1.0],  # red
            [0.90, 0.65, 0.10, 0.85, 1.0],  # magenta
            [1.00, 0.00, 0.00, 0.00, 1.0],  # black
        ],
        dtype=np.float64,
    )
    positions = stops[:, 0]
    colors = stops[:, 1:]

    # Entry i sits at t = i / (size - 1). linspace also covers size == 1,
    # which yields the single sample t = 0 instead of dividing by zero.
    t = np.linspace(0.0, 1.0, size)

    # np.interp does the bracketing-segment search and the lerp in one
    # vectorized call per channel; interpolate in double, store as float32.
    pal = np.empty((size, 4), dtype=np.float32)
    for channel in range(colors.shape[1]):
        pal[:, channel] = np.interp(t, positions, colors[:, channel])

    # Flatten to (size * 4,) so the byte layout matches a float4 x size
    # 1D CUDAArray.
    return np.ascontiguousarray(pal.reshape(-1), dtype=np.float32)
def main():
    """Run the interactive Mandelbrot viewer until the window is closed.

    Sets up CUDA and an OpenGL window, uploads the color palette into a 1D
    CUDAArray bound as a TextureObject, registers the pyglet event handlers
    (keyboard, pan, zoom, draw, close) and then enters the pyglet event loop.
    """
    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
    dev, stream, kernel, config = setup_cuda()

    # --- Step 2: Open a window ---
    window, gl, pyglet = create_window()

    # --- Step 3: Create GL resources for drawing a texture to screen ---
    # (Standard OpenGL boilerplate -- not CUDA-specific.)
    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)

    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
    # The PBO is GPU memory owned by OpenGL. It's the bridge between the
    # two worlds: CUDA writes into it, OpenGL reads from it.
    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)

    # --- Step 5: Register the PBO with CUDA ---
    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")

    # --- Step 6: Build and upload the palette LUT ---
    # One 1D CUDAArray, PALETTE_SIZE entries of float4 RGBA. The stream is
    # synchronized once after the upload so the palette has landed before
    # the render loop starts sampling from it.
    host_palette = build_palette()
    palette_arr = CUDAArray.from_descriptor(
        shape=(PALETTE_SIZE,),
        format=ArrayFormat.FLOAT32,
        num_channels=4,
    )
    palette_arr.copy_from(host_palette, stream=stream)
    stream.sync()

    # --- Step 7: Bind the palette CUDAArray as a TextureObject (LUT) ---
    palette_tex = make_palette_texture(palette_arr)

    # --- Step 8: Render loop ---
    start_time = time.monotonic()
    frame_count = 0
    fps_time = start_time

    # View state. cx, cy, scale are kept in Python floats (double precision)
    # and converted to np.float64 on each kernel launch.
    view = {
        "cx": float(DEFAULT_CX),
        "cy": float(DEFAULT_CY),
        "scale": float(DEFAULT_SCALE),
        "max_iter": int(DEFAULT_MAX_ITER),
    }

    def screen_to_world(mouse_x, mouse_y):
        """Map a pyglet mouse coordinate to the world point currently under it.

        Pyglet's window origin is bottom-left and the rendered texture's
        origin is also bottom-left, so no y-flip is needed.
        """
        wx = view["cx"] + (mouse_x - WIDTH / 2.0) * view["scale"]
        wy = view["cy"] + (mouse_y - HEIGHT / 2.0) * view["scale"]
        return wx, wy

    @window.event
    def on_key_press(symbol, _modifiers):
        key = pyglet.window.key
        if symbol == key.ESCAPE:
            window.close()
            return
        if symbol == key.R:
            # Reset the view and the iteration cap to their defaults.
            view["cx"] = float(DEFAULT_CX)
            view["cy"] = float(DEFAULT_CY)
            view["scale"] = float(DEFAULT_SCALE)
            view["max_iter"] = int(DEFAULT_MAX_ITER)
            return
        if symbol == key.BRACKETLEFT:
            view["max_iter"] = max(MIN_MAX_ITER, view["max_iter"] - ITER_STEP)
            return
        if symbol == key.BRACKETRIGHT:
            view["max_iter"] = min(MAX_MAX_ITER, view["max_iter"] + ITER_STEP)
            return

    @window.event
    def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
        if buttons & pyglet.window.mouse.LEFT:
            # Pan: move the center opposite to the cursor drag on BOTH axes
            # so the scene follows the cursor. screen_to_world() maps
            # +mouse_y to +world_y (no y-flip), so keeping the grabbed world
            # point under the cursor requires cy to decrease by dy * scale,
            # exactly like cx decreases by dx * scale.
            view["cx"] -= dx * view["scale"]
            view["cy"] -= dy * view["scale"]

    @window.event
    def on_mouse_scroll(x, y, _scroll_x, scroll_y):
        # A horizontal-only scroll reports scroll_y == 0; it must not be
        # interpreted as a zoom-out.
        if scroll_y == 0:
            return
        # Cursor-anchored zoom: keep the world point under the cursor pinned.
        wx, wy = screen_to_world(x, y)
        factor = 0.9 if scroll_y > 0 else 1.1
        view["scale"] *= factor
        # Back-solve cx, cy so screen pixel (x, y) still maps to (wx, wy).
        view["cx"] = wx - (x - WIDTH / 2.0) * view["scale"]
        view["cy"] = wy - (y - HEIGHT / 2.0) * view["scale"]

    @window.event
    def on_draw():
        nonlocal frame_count, fps_time

        window.clear()

        # (a) Map the PBO so CUDA can write to it. This gives us a Buffer
        #     whose .handle is a device pointer pointing into the GL PBO.
        with resource.map(stream=stream) as buf:
            launch(
                stream,
                config,
                kernel,
                np.uint64(palette_tex.handle),  # bindless texture handle
                buf.handle,  # output PBO (RGBA8)
                np.int32(WIDTH),
                np.int32(HEIGHT),
                np.float64(view["cx"]),
                np.float64(view["cy"]),
                np.float64(view["scale"]),
                np.int32(view["max_iter"]),
            )
        # Unmap happens automatically when the `with` block exits.

        # (b) Tell OpenGL to copy the PBO contents into our texture.
        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)

        # (c) Draw the texture to the screen.
        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)

        # FPS counter (shown in window title), refreshed about once a second.
        frame_count += 1
        now = time.monotonic()
        if now - fps_time >= 1.0:
            fps = frame_count / (now - fps_time)
            zoom = 1.0 / view["scale"] if view["scale"] > 0 else 0.0
            window.set_caption(
                "cuda.core CUDAArray/Texture - Mandelbrot"
                f" | zoom {zoom:.3e}x"
                f" | center ({view['cx']:.6f}, {view['cy']:.6f})"
                f" | iter {view['max_iter']}"
                f" | {fps:.0f} FPS"
            )
            frame_count = 0
            fps_time = now

    @window.event
    def on_close():
        # Release everything we opened, in reverse order. Each of these is a
        # context manager too, but pyglet owns the event loop here so we
        # release explicitly.
        resource.close()
        palette_tex.close()
        palette_arr.close()
        stream.close()

    pyglet.app.run(interval=0)
+ double zr = 0.0; + double zi = 0.0; + double zr2 = 0.0; + double zi2 = 0.0; + int iter = 0; + while (iter < max_iter && (zr2 + zi2) <= 4.0) { + zi = 2.0 * zr * zi + c_im; + zr = zr2 - zi2 + c_re; + zr2 = zr * zr; + zi2 = zi * zi; + ++iter; + } + + unsigned char r, g, b; + if (iter >= max_iter) { + // Inside the set (or close enough): solid black. + r = 0; + g = 0; + b = 0; + } else { + // Smooth iteration count: + // mu = iter + 1 - log(log(|z|)) / log(2) + // = iter + 1 - log(0.5 * log(|z|^2)) / log(2) + // At escape, |z|^2 > 4, so 0.5 * log(|z|^2) > log(2) > 0 -- the + // outer log is well-defined. Compute in double, narrow to float + // for the palette lookup. + double log_zn = 0.5 * log(zr2 + zi2); + double nu = log(log_zn) / log(2.0); + float mu = (float)((double)(iter + 1) - nu); + + // Cycle through the palette: 0.02 controls how quickly we wrap + // through the gradient as the iteration count climbs. + float t = fmodf(mu * 0.02f, 1.0f); + if (t < 0.0f) t += 1.0f; // fmodf can return negative for negative mu + + float4 rgba = tex1D(palette, t); + + // Clamp before narrowing to bytes. + float fr = rgba.x; if (fr < 0.0f) fr = 0.0f; if (fr > 1.0f) fr = 1.0f; + float fg = rgba.y; if (fg < 0.0f) fg = 0.0f; if (fg > 1.0f) fg = 1.0f; + float fb = rgba.z; if (fb < 0.0f) fb = 0.0f; if (fb > 1.0f) fb = 1.0f; + r = (unsigned char)(fr * 255.0f); + g = (unsigned char)(fg * 255.0f); + b = (unsigned char)(fb * 255.0f); + } + + int idx = (y * width + x) * 4; + output[idx + 0] = r; + output[idx + 1] = g; + output[idx + 2] = b; + output[idx + 3] = 255; +} +""" + +# GLSL shaders -- these just display a texture on a fullscreen rectangle. +# Nothing CUDA-specific here. 
+ +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_mipmap_lod.py b/cuda_core/examples/gl_interop_mipmap_lod.py new file mode 100644 index 00000000000..9f71bad7a5c --- /dev/null +++ b/cuda_core/examples/gl_interop_mipmap_lod.py @@ -0,0 +1,730 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates the new cuda.core texture/surface stack: +# MipmappedArray, SurfaceObject, and a TextureObject that does trilinear +# (LINEAR mipmap + LINEAR filter) sampling with user-controlled LOD bias. +# Requires pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# How to allocate a mipmap pyramid as a single MipmappedArray, populate each +# level from a CUDA kernel by binding it as a SurfaceObject, and then sample +# the whole pyramid from a TextureObject with manual LOD bias. +# +# How it works +# ============ +# A mipmap pyramid is a stack of progressively-halved images of the same +# texture. The base level (level 0) holds the highest-resolution version; each +# subsequent level is a 2x2 box-filtered downsample of the level below it: +# +# level 0: 512 x 512 <- highest detail +# level 1: 256 x 256 +# level 2: 128 x 128 +# ... 
+# level 9: 1 x 1 <- a single average color +# +# At sample time, the GPU picks the mip level that best matches the on-screen +# size of the texel, optionally blending between adjacent levels (trilinear). +# Selecting a coarser level than the "right" one is called a positive LOD bias +# and produces a softer/blurrier image; a negative bias selects finer levels +# (sharper but more aliased when undersampled). +# +# +----------------------+ +-----------------------+ +# | MipmappedArray | | TextureObject | +# | (single allocation, | <--- | (samples the whole | +# | 10 mip levels) | | pyramid w/ trilinear | +# +----------------------+ | filtering) | +# ^ ^ +-----------------------+ +# | | +# | +---- one SurfaceObject per level, used at BUILD time only +# | to let a kernel write pixels into that level. +# | +# +----------- get_level(L) returns a NON-OWNING CUDAArray view of level L; +# the storage belongs to the parent MipmappedArray. +# +# STARTUP -- one-time mipmap build +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. Allocate MipmappedArray (10 levels, float4 RGBA, is_surface_load_store=True). +# 2. Level 0: launch `seed_base` kernel -> SurfaceObject -> high-frequency +# procedural pattern. +# 3. For L = 1..num_levels-1: launch `downsample` kernel: +# - reads level L-1 through a TextureObject (POINT-filtered) +# - writes level L through a SurfaceObject +# - 4-sample box average of the parent's 2x2 footprint. +# +# PER FRAME (render loop) +# ~~~~~~~~~~~~~~~~~~~~~~~ +# The display TextureObject samples the whole pyramid with `tex2DLod`, +# where the LOD is computed per-pixel as `log2(zoom) + lod_bias`. The result +# is written to a GL PBO via GraphicsResource, then drawn as a textured quad. +# +# What you should see +# =================== +# A 512x512 procedural pattern (concentric rings + diagonal grid) shown +# stretched across the window. 
Use the mouse wheel to zoom in/out (this +# implicitly changes the LOD), and use the bracket keys `[` / `]` to add a +# manual LOD bias on top of that. Press `R` to reset. +# +# Mouse wheel zoom in / out +# [ LOD bias -= 0.25 (sharper, more aliased) +# ] LOD bias += 0.25 (blurrier, samples a coarser level) +# R reset zoom + bias +# Escape / close quit +# +# The window title shows the current zoom, manual bias, and effective LOD. +# Close the window or press Escape to exit. +# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import math +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + ArrayFormat, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + MipmappedArray, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Configuration (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 800 +HEIGHT = 600 +BASE_SIZE = 512 # Texture base-level edge length (must be a power of two). +LOD_BIAS_STEP = 0.25 + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA, OpenGL, and the mipmap pyramid. If you're +# here to learn about MipmappedArray / SurfaceObject / mipmapped TextureObject, +# you can skip straight to main() -- the interesting part is there. These +# helpers exist so that main() reads like a short story. 
# ============================================================================


def _check_compute_capability(dev):
    """Exit with an error unless `dev` reports compute capability >= 3.0.

    Surface load/store + mipmapped arrays require sm_30+.
    """
    cc = dev.compute_capability
    if cc.major >= 3:
        return
    print(
        f"This example requires compute capability >= 3.0, got sm_{cc.major}{cc.minor}.",
        file=sys.stderr,
    )
    sys.exit(1)


def ceil_div_grid(width, height, block):
    """Return the (x, y, 1) launch grid covering `width` x `height` threads.

    Each dimension is rounded up, so a partially filled block is still
    launched; the kernels bounds-check and return early for the excess threads.
    """
    return (
        (width + block[0] - 1) // block[0],
        (height + block[1] - 1) // block[1],
        1,
    )


def setup_cuda():
    """Compile the three kernels and return everything we need to drive them.

    Returns
    -------
    (dev, stream, kernels, arch_str)
        kernels is a dict with keys "seed_base", "downsample", "display".
    """
    dev = Device(0)
    dev.set_current()
    _check_compute_capability(dev)
    stream = dev.create_stream()

    arch = f"sm_{dev.arch}"
    kernel_names = ("seed_base", "downsample", "display")

    program_options = ProgramOptions(std="c++17", arch=arch)
    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
    mod = prog.compile("cubin", name_expressions=kernel_names)

    kernels = {name: mod.get_kernel(name) for name in kernel_names}
    return dev, stream, kernels, arch


def build_mipmap_pyramid(mip, num_levels, stream, kernels):
    """Populate every level of `mip` using SurfaceObject writes.

    Strategy
    --------
    * Level 0 is filled directly by `seed_base`, which writes a procedural
      pattern through a SurfaceObject bound to level 0.
    * Each subsequent level L is filled by `downsample`, which reads level L-1
      through a POINT-filtered TextureObject and box-averages a 2x2 footprint
      into level L through a SurfaceObject.
    * All kernels are issued on a single stream, so the data dependencies
      between levels serialize implicitly -- no per-level sync is needed.
    * Kernel launches are asynchronous.  The temporary texture/surface
      objects are therefore kept open until one `stream.sync()` has confirmed
      that every kernel finished, and only then closed.  (Closing them right
      after each launch would presumably destroy a handle that an in-flight
      kernel is still using.)

    Parameters
    ----------
    mip : MipmappedArray
        Pyramid to fill; must have been created with surface load/store enabled.
    num_levels : int
        Number of levels to populate (level 0 plus `num_levels - 1` downsamples).
    stream : Stream
        Stream on which all kernels are launched and which is synced on return.
    kernels : dict
        Must contain the "seed_base" and "downsample" kernels.
    """
    # Function-scope import keeps this helper self-contained.
    from contextlib import ExitStack

    block = (16, 16, 1)
    src_tex_desc = TextureDescriptor(
        address_mode=AddressMode.CLAMP,
        filter_mode=FilterMode.POINT,  # explicit per-texel reads
        read_mode=ReadMode.ELEMENT_TYPE,
        normalized_coords=False,  # integer pixel coordinates
    )

    with ExitStack() as live_objects:
        try:
            # ---- Level 0: seed the base image -----------------------------
            # get_level() returns a non-owning view; do NOT use a `with` block
            # on it.  The parent MipmappedArray keeps the storage alive.
            base_arr = mip.get_level(0)
            base_surf = live_objects.enter_context(SurfaceObject.from_array(base_arr))
            launch(
                stream,
                LaunchConfig(grid=ceil_div_grid(BASE_SIZE, BASE_SIZE, block), block=block),
                kernels["seed_base"],
                np.uint64(base_surf.handle),
                np.int32(BASE_SIZE),
                np.int32(BASE_SIZE),
            )

            # ---- Levels 1..N-1: box-filter downsample ---------------------
            # Each iteration reads level (L-1) through a TextureObject and
            # writes level L through a SurfaceObject.
            for level in range(1, num_levels):
                parent_size = BASE_SIZE >> (level - 1)
                level_size = BASE_SIZE >> level
                if level_size < 1:
                    break

                src_arr = mip.get_level(level - 1)
                dst_arr = mip.get_level(level)
                src_res = ResourceDescriptor.from_array(src_arr)
                src_tex = live_objects.enter_context(
                    TextureObject.from_descriptor(resource=src_res, texture_descriptor=src_tex_desc)
                )
                dst_surf = live_objects.enter_context(SurfaceObject.from_array(dst_arr))
                launch(
                    stream,
                    LaunchConfig(grid=ceil_div_grid(level_size, level_size, block), block=block),
                    kernels["downsample"],
                    np.uint64(src_tex.handle),
                    np.uint64(dst_surf.handle),
                    np.int32(parent_size),
                    np.int32(level_size),
                )
        finally:
            # One sync is enough -- the whole build chain ran on this stream.
            # It runs BEFORE the ExitStack unwinds (also on error), so no
            # texture/surface object is closed while a kernel may still use it.
            stream.sync()


def create_window():
    """Open a pyglet window and return (window, gl_module, pyglet).

    pyglet is imported lazily so a missing installation produces a readable
    message and exit code 1 instead of an ImportError at module import time.
    """
    try:
        import pyglet
        from pyglet.gl import gl as _gl
    except ImportError:
        print(
            "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet",
            file=sys.stderr,
        )
        sys.exit(1)

    window = pyglet.window.Window(
        WIDTH,
        HEIGHT,
        caption="MipmappedArray Example - Mipmap LOD viewer",
        vsync=False,
    )
    return window, _gl, pyglet


def create_display_resources(gl, width, height):
    """Standard GL boilerplate: a shader program, a fullscreen quad, and an
    empty texture that we'll repeatedly fill from a PBO.  Not CUDA-specific.

    Returns (shader_program, vertex_array_id, texture_id).
    """
    from pyglet.graphics.shader import Shader, ShaderProgram

    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
    shader_prog = ShaderProgram(vert, frag)

    # Two triangles covering the whole window; one vertex per row:
    #   x, y, s, t   (position + texture coordinate)
    quad_verts = np.array(
        [
            -1, -1, 0, 0,
            1, -1, 1, 0,
            1, 1, 1, 1,
            -1, -1, 0, 0,
            1, 1, 1, 1,
            -1, 1, 0, 1,
        ],
        dtype=np.float32,
    )  # fmt: skip

    vao = ctypes.c_uint(0)
    gl.glGenVertexArrays(1, ctypes.byref(vao))
    gl.glBindVertexArray(vao.value)

    vbo = ctypes.c_uint(0)
    gl.glGenBuffers(1, ctypes.byref(vbo))
    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
    gl.glBufferData(
        gl.GL_ARRAY_BUFFER,
        quad_verts.nbytes,
        quad_verts.ctypes.data_as(ctypes.c_void_p),
        gl.GL_STATIC_DRAW,
    )

    stride = 4 * 4  # 4 floats * 4 bytes each
    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
    gl.glEnableVertexAttribArray(pos_loc)
    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))

    # Texture coordinates start 8 bytes (two floats) into each vertex.
    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
    gl.glEnableVertexAttribArray(tc_loc)
    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))

    gl.glBindVertexArray(0)

    tex = ctypes.c_uint(0)
    gl.glGenTextures(1, ctypes.byref(tex))
    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
    gl.glTexImage2D(
        gl.GL_TEXTURE_2D,
        0,
        gl.GL_RGBA8,
        width,
        height,
        0,
        gl.GL_RGBA,
        gl.GL_UNSIGNED_BYTE,
        None,  # allocate storage only; pixels arrive later from the PBO
    )

    return shader_prog, vao.value, tex.value


def create_pixel_buffer(gl, width, height):
    """Create a Pixel Buffer Object (PBO) -- the CUDA/GL bridge.

    Returns (pbo_gl_name, size_in_bytes).
    """
    pbo = ctypes.c_uint(0)
    gl.glGenBuffers(1, ctypes.byref(pbo))
    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
    nbytes = width * height * 4  # RGBA8
    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
    return pbo.value, nbytes


def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU)."""
    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
    gl.glTexSubImage2D(
        gl.GL_TEXTURE_2D,
        0,
        0,
        0,
        width,
        height,
        gl.GL_RGBA,
        gl.GL_UNSIGNED_BYTE,
        None,  # with a PBO bound, the data "pointer" is an offset into it
    )
    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)


def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
    """Draw the texture to the screen using the fullscreen quad."""
    gl.glUseProgram(shader_prog.id)
    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
    gl.glBindVertexArray(vao_id)
    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)
    gl.glBindVertexArray(0)
    gl.glUseProgram(0)


# ================================== main() ==================================


def main():
    """Build the mipmap pyramid once, then run the interactive LOD viewer."""
    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
    dev, stream, kernels, _arch = setup_cuda()

    # --- Step 2: Allocate the mipmap pyramid and build every level ---
    #     is_surface_load_store=True is required for kernel-side writes.
    num_levels = int(math.log2(BASE_SIZE)) + 1
    mip = MipmappedArray.from_descriptor(
        shape=(BASE_SIZE, BASE_SIZE),
        format=ArrayFormat.FLOAT32,
        num_channels=4,
        num_levels=num_levels,
        is_surface_load_store=True,
    )
    build_mipmap_pyramid(mip, num_levels, stream, kernels)

    # --- Step 3: Bind the WHOLE pyramid as a trilinear-filtered texture ---
    #     Normalized coordinates (0..1) make zoom-by-uv simple.  The texture
    #     descriptor's mipmap_level_bias stays 0.0; the display kernel
    #     receives the user-controlled bias as a kernel argument and folds
    #     it into the tex2DLod call (avoids rebuilding the TextureObject
    #     whenever the user changes the bias).
    display_tex_desc = TextureDescriptor(
        address_mode=AddressMode.WRAP,
        filter_mode=FilterMode.LINEAR,
        read_mode=ReadMode.ELEMENT_TYPE,
        normalized_coords=True,
        mipmap_filter_mode=FilterMode.LINEAR,  # trilinear
        mipmap_level_bias=0.0,
        min_mipmap_level_clamp=0.0,
        max_mipmap_level_clamp=float(num_levels - 1),
    )
    display_tex = TextureObject.from_descriptor(
        resource=ResourceDescriptor.from_mipmapped_array(mip),
        texture_descriptor=display_tex_desc,
    )

    # --- Step 4: Open a window and set up the GL/CUDA bridge ---
    window, gl, pyglet = create_window()
    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")

    # --- Step 5: Render loop state ---
    #     `zoom` scales the UV range the window covers (the display kernel
    #     computes uv = (uv - 0.5) * zoom + 0.5).  zoom > 1 therefore shows
    #     MORE of the wrapped texture: each screen pixel spans more texels,
    #     the image is minified, and a coarser mip level (positive LOD) is
    #     selected.  zoom < 1 magnifies the texture; the LOD clamps at the
    #     finest level 0.  `lod_bias` is a manual offset added on top.
    state = {"zoom": 1.0, "lod_bias": 0.0}
    frame_count = 0
    fps_time = time.monotonic()

    block = (16, 16, 1)
    config = LaunchConfig(grid=ceil_div_grid(WIDTH, HEIGHT, block), block=block)

    def effective_lod():
        # Same formula the display kernel uses, clamped to the legal range so
        # the window title matches what the GPU actually sees.
        raw = math.log2(max(state["zoom"], 1e-6)) + state["lod_bias"]
        return max(0.0, min(float(num_levels - 1), raw))

    @window.event
    def on_draw():
        nonlocal frame_count, fps_time

        window.clear()

        # (a) Map the PBO so CUDA can write into it.
        with resource.map(stream=stream) as buf:
            # (b) Launch the display kernel -- samples the mipmap and writes RGBA.
            launch(
                stream,
                config,
                kernels["display"],
                buf.handle,
                np.int32(WIDTH),
                np.int32(HEIGHT),
                np.uint64(display_tex.handle),
                np.float32(state["zoom"]),
                np.float32(state["lod_bias"]),
                np.float32(float(num_levels - 1)),
            )
        # (c) Unmap happens automatically; cuGraphicsUnmapResources serializes
        #     the CUDA work against subsequent OpenGL use.

        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)

        # Refresh the title (FPS + current LOD state) about once per second.
        frame_count += 1
        now = time.monotonic()
        if now - fps_time >= 1.0:
            fps = frame_count / (now - fps_time)
            window.set_caption(
                f"MipmappedArray LOD viewer "
                f"({WIDTH}x{HEIGHT}, {fps:.0f} FPS) -- "
                f"zoom={state['zoom']:.2f}, "
                f"bias={state['lod_bias']:+.2f}, "
                f"LOD={effective_lod():.2f}"
            )
            frame_count = 0
            fps_time = now

    @window.event
    def on_mouse_scroll(_x, _y, _scroll_x, scroll_y):
        # One wheel step changes zoom by ~12.5%.  Clamped to keep LOD in range.
        if scroll_y == 0:
            return
        factor = 1.125**scroll_y
        state["zoom"] = max(1.0 / 64.0, min(64.0, state["zoom"] * factor))

    @window.event
    def on_key_press(symbol, _modifiers):
        key = pyglet.window.key
        if symbol == key.BRACKETLEFT:
            state["lod_bias"] = max(-float(num_levels), state["lod_bias"] - LOD_BIAS_STEP)
        elif symbol == key.BRACKETRIGHT:
            state["lod_bias"] = min(float(num_levels), state["lod_bias"] + LOD_BIAS_STEP)
        elif symbol == key.R:
            state["zoom"] = 1.0
            state["lod_bias"] = 0.0

    @window.event
    def on_close():
        # Drain the stream first so the last display kernel is finished
        # before the texture and array it samples are closed.
        stream.sync()
        # Release CUDA-side resources in reverse construction order.  GL
        # objects clean up via pyglet on window close.
        resource.close()
        display_tex.close()
        mip.close()
        stream.close()

    pyglet.app.run(interval=0)


# ======================== GPU code (CUDA + GLSL) ============================
#
# Three CUDA kernels are concatenated into one program string so they share a
# single NVRTC compile.  All three operate on float4 RGBA pixels.
#
#   seed_base   -- writes a high-frequency procedural pattern to level 0 via a
#                  SurfaceObject.  NOTE: surf2Dwrite's x-coordinate is in BYTES,
#                  not in elements, so we multiply by sizeof(float4) every time.
#
#   downsample  -- reads level L-1 through a POINT-filtered TextureObject and
#                  writes the 2x2 box average to level L through a SurfaceObject.
#                  tex2D with non-normalized coords needs the +0.5 half-texel
#                  offset to hit exact texel centers.
#
#   display     -- samples the WHOLE mipmap pyramid with tex2DLod, where the
#                  per-thread LOD is `clamp(log2(zoom) + lod_bias, 0, maxLod)`.
#                  Writes 8-bit RGBA into the PBO.
#
# GLSL shaders at the very bottom just draw a textured quad.  Nothing CUDA-
# specific there.
#
# ============================================================================

KERNEL_SOURCE = r"""
// --------------------------------------------------------------------------
// Helper: clamp a float to [a, b].
// --------------------------------------------------------------------------
__device__ __forceinline__ float clampf(float v, float a, float b) {
    return fminf(fmaxf(v, a), b);
}

// CUDA does not ship a builtin "fract" so we provide one (used by seed_base).
__device__ __forceinline__ float fracf(float v) {
    return v - floorf(v);
}

// --------------------------------------------------------------------------
// seed_base: write a procedural high-frequency pattern to level 0.
//
// surf is a SurfaceObject bound to the level-0 CUDAArray (float4 RGBA).  The
// pattern is a colorful blend of concentric rings, a diagonal grid, and a
// radial sweep, designed to have plenty of fine detail so the difference
// between mip levels is visually obvious.
// --------------------------------------------------------------------------
extern "C" __global__
void seed_base(cudaSurfaceObject_t surf, int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    float u = ((float)x + 0.5f) / (float)width;
    float v = ((float)y + 0.5f) / (float)height;

    // Concentric rings centered on the image.
    float cx = u - 0.5f;
    float cy = v - 0.5f;
    float r = sqrtf(cx * cx + cy * cy);
    float rings = 0.5f + 0.5f * sinf(r * 80.0f);

    // Diagonal grid -- thin lines about every 1/16 of the image.
    float gx = fabsf(fracf(u * 16.0f) - 0.5f);
    float gy = fabsf(fracf(v * 16.0f) - 0.5f);
    float grid = (gx < 0.05f || gy < 0.05f) ? 1.0f : 0.0f;

    // Angular sweep gives the rings some color variation.
    float theta = atan2f(cy, cx);
    float sweep = 0.5f + 0.5f * sinf(theta * 6.0f);

    // Combine into an RGBA color.  Keep values in [0, 1].
    float red   = clampf(rings * (0.4f + 0.6f * sweep) + 0.3f * grid, 0.0f, 1.0f);
    float green = clampf(rings * (0.6f - 0.4f * sweep) + 0.3f * grid, 0.0f, 1.0f);
    float blue  = clampf(0.4f + 0.4f * sweep + 0.5f * grid, 0.0f, 1.0f);
    float alpha = 1.0f;

    float4 px = make_float4(red, green, blue, alpha);

    // Surface writes index x in BYTES (this is the classic gotcha).
    surf2Dwrite(px, surf, x * (int)sizeof(float4), y);
}

// --------------------------------------------------------------------------
// downsample: box-filter a 2x2 footprint of the parent level into one texel.
//
// src is a POINT-filtered TextureObject bound to level (L-1).
// dst is a SurfaceObject bound to level L.
// (dst_w, dst_h) is the size of level L.
// (src_w = 2 * dst_w, src_h = 2 * dst_h is implicit and unused; we pass it
// only for the bounds check.)
//
// Texture coordinates: tex2D with non-normalized coords returns texel (i, j)
// when sampled at (i + 0.5, j + 0.5).  So for output texel (x, y) the four
// parent texels live at parent-coords (2x + 0.5, 2y + 0.5), (2x + 1.5, ...).
// --------------------------------------------------------------------------
extern "C" __global__
void downsample(cudaTextureObject_t src,
                cudaSurfaceObject_t dst,
                int src_size,
                int dst_size) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= dst_size || y >= dst_size) return;

    float fx = 2.0f * (float)x;
    float fy = 2.0f * (float)y;

    float4 a = tex2D<float4>(src, fx + 0.5f, fy + 0.5f);
    float4 b = tex2D<float4>(src, fx + 1.5f, fy + 0.5f);
    float4 c = tex2D<float4>(src, fx + 0.5f, fy + 1.5f);
    float4 d = tex2D<float4>(src, fx + 1.5f, fy + 1.5f);

    float4 px;
    px.x = 0.25f * (a.x + b.x + c.x + d.x);
    px.y = 0.25f * (a.y + b.y + c.y + d.y);
    px.z = 0.25f * (a.z + b.z + c.z + d.z);
    px.w = 0.25f * (a.w + b.w + c.w + d.w);

    // Silence unused-variable warning for the convenience parameter.
    (void)src_size;

    surf2Dwrite(px, dst, x * (int)sizeof(float4), y);
}

// --------------------------------------------------------------------------
// display: per-pixel mipmap sample with manual LOD bias.
//
// tex is a TextureObject built from the whole MipmappedArray (LINEAR +
// LINEAR mipmap filter, normalized coords).  For each output pixel we compute
// a single per-thread LOD from `zoom` and `lod_bias`, then sample with
// tex2DLod.  Output is written as RGBA8 into a linear byte buffer.
// --------------------------------------------------------------------------
extern "C" __global__
void display(unsigned char *output,
             int width,
             int height,
             cudaTextureObject_t tex,
             float zoom,
             float lod_bias,
             float max_lod) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    // Normalized window coords in [0, 1].
    float u = ((float)x + 0.5f) / (float)width;
    float v = ((float)y + 0.5f) / (float)height;

    // Zoom around the window center so the user sees the effect symmetrically.
    u = (u - 0.5f) * zoom + 0.5f;
    v = (v - 0.5f) * zoom + 0.5f;

    // LOD: zoom > 1 means the texture is being stretched (each texel covers
    // more screen area), which intuitively corresponds to selecting a coarser
    // (higher) mip level.  log2(zoom) yields exactly that.  lod_bias is added
    // on top, and the final value is clamped to the legal range.
    float lod = log2f(fmaxf(zoom, 1e-6f)) + lod_bias;
    lod = clampf(lod, 0.0f, max_lod);

    float4 c = tex2DLod<float4>(tex, u, v, lod);

    int idx = (y * width + x) * 4;
    output[idx + 0] = (unsigned char)(clampf(c.x, 0.0f, 1.0f) * 255.0f);
    output[idx + 1] = (unsigned char)(clampf(c.y, 0.0f, 1.0f) * 255.0f);
    output[idx + 2] = (unsigned char)(clampf(c.z, 0.0f, 1.0f) * 255.0f);
    output[idx + 3] = 255;
}
"""

# GLSL shaders -- these just display a texture on a fullscreen rectangle.
# Nothing CUDA-specific here.
+ +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_ocean.py b/cuda_core/examples/gl_interop_ocean.py new file mode 100644 index 00000000000..2e01dd9cccf --- /dev/null +++ b/cuda_core/examples/gl_interop_ocean.py @@ -0,0 +1,866 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject +# in combination with GraphicsResource for CUDA/OpenGL interop. A real-time +# Gerstner-wave ocean is rebuilt every frame: a heightmap CUDAArray is rewritten +# through a SurfaceObject, sampled through a TextureObject with LINEAR + WRAP +# filtering for normal estimation, and shaded with Phong + Fresnel sky +# reflection straight into an OpenGL PBO. Requires pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to use a CUDA CUDAArray as a typed heightmap that is simultaneously +# written by one kernel (via SurfaceObject) and sampled by another (via +# TextureObject) within the same frame. +# - How LINEAR filtering + WRAP addressing + normalized coordinates gives +# essentially-free bilinear neighbor lookups for finite-difference normal +# estimation on a tiling heightmap. +# - How to compose CUDAArray/TextureObject/SurfaceObject with GraphicsResource so +# the entire render path never leaves the GPU. 
+# +# How it works +# ============ +# Gerstner waves are a sum of N moving sinusoids with directional vectors -- +# a classic ocean approximation that looks shockingly close to FFT ocean at a +# glance without any external library dependencies. For each heightmap texel: +# +# h(x, z, t) = sum_i A_i * sin( D_i . (x, z) * k_i - w_i * t + phi_i ) +# +# where k_i = 2*pi / wavelength_i and w_i = sqrt(g * k_i) is the dispersion +# relation for deep-water gravity waves. We bake 12 waves with hand-picked +# directions / wavelengths / amplitudes / phases into the kernel as constant +# arrays. Weather presets just scale amplitude and speed at the host level. +# +# PER FRAME (all on GPU) +# ~~~~~~~~~~~~~~~~~~~~~~ +# +-----------------+ surf2Dwrite +--------------+ +# | update_height | --------------> | heightmap | +# | kernel | | CUDAArray | +# +-----------------+ | (FLOAT32) | +# +--------------+ +# | +# | tex2D (LINEAR + WRAP) +# v +# +-----------------+ write RGBA8 +# | render_ocean | ----------------> PBO +# | kernel | +# +-----------------+ +# +# Why LINEAR + WRAP + normalized coords? +# -------------------------------------- +# WRAP / MIRROR addressing modes require normalized coordinates (see the CUDA +# Programming Guide). The ocean naturally tiles, so WRAP gives free seamless +# horizon repetition. LINEAR filtering means our four-tap finite-difference +# normal estimate gets bilinear interpolation between texels for free, which +# smooths the lighting noticeably without a single extra ALU instruction. +# +# Channel byte width in surf2Dwrite +# --------------------------------- +# surf2Dwrite takes the x coordinate in BYTES, not in elements. For a +# single-channel float surface that means `x * sizeof(float)` = `x * 4`. +# Getting this wrong silently corrupts every other column. +# +# What you should see +# =================== +# A window showing a real-time animated ocean rendered with Phong shading and +# a Fresnel-modulated sky reflection. 
Drag with the left mouse button to +# orbit, scroll to zoom, press 1/2/3 to switch weather presets (calm / +# breezy / stormy), press P to pause animation, Escape to exit. Window title +# shows preset name and FPS. +# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import math +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + ArrayFormat, + CUDAArray, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Window and heightmap dimensions (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 1024 +HEIGHT = 768 +GRID = 512 # heightmap resolution (GRID x GRID texels) + +# Weather presets: (amplitude_scale, speed_scale, label). +# These are applied as multiplicative scalars on top of the per-wave amplitude +# and angular-frequency arrays baked into the kernel, so a single compiled +# binary can render every preset. +PRESETS = { + "1": (0.35, 0.7, "calm"), + "2": (1.00, 1.0, "breezy"), + "3": (1.85, 1.4, "stormy"), +} +DEFAULT_PRESET = "2" + +# Initial camera (orbit-around-origin) parameters. +INITIAL_YAW = 0.6 # radians around world-y +INITIAL_PITCH = 0.35 # radians above the horizon (small positive = looking down) +INITIAL_DISTANCE = 5.0 # camera distance from origin +PITCH_LIMIT = 1.4 # clamp |pitch| to keep basis non-degenerate (< pi/2) +ZOOM_MIN = 1.5 +ZOOM_MAX = 30.0 + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA and OpenGL. If you're here to learn about +# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting +# part is there. 
These helpers exist so that main() reads like a short story +# instead of a wall of boilerplate. +# ============================================================================ + + +def setup_cuda(): + """Compile the CUDA kernels and return (device, stream, kernels, configs). + + The two kernels live on different grids: + - update_height runs over the heightmap (GRID x GRID texels). + - render_ocean runs over output pixels (WIDTH x HEIGHT). + """ + dev = Device(0) + dev.set_current() + + # SurfaceObject requires surface load/store, which has existed since SM 2.0, + # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+. + cc = dev.compute_capability + if cc.major < 3: + print( + "This example requires a GPU with compute capability >= 3.0 for " + f"bindless surface objects. Found sm_{cc.major}{cc.minor}.", + file=sys.stderr, + ) + sys.exit(1) + + stream = dev.create_stream() + + # C++ compile so the templated tex2D overload resolves. + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options) + mod = prog.compile( + "cubin", + name_expressions=("update_height", "render_ocean"), + ) + + kernels = { + "update": mod.get_kernel("update_height"), + "render": mod.get_kernel("render_ocean"), + } + + block = (16, 16, 1) + update_grid = ( + (GRID + block[0] - 1) // block[0], + (GRID + block[1] - 1) // block[1], + 1, + ) + render_grid = ( + (WIDTH + block[0] - 1) // block[0], + (HEIGHT + block[1] - 1) // block[1], + 1, + ) + configs = { + "update": LaunchConfig(grid=update_grid, block=block), + "render": LaunchConfig(grid=render_grid, block=block), + } + return dev, stream, kernels, configs + + +def create_window(): + """Open a pyglet window and return (window, gl_module, pyglet).""" + try: + import pyglet + from pyglet.gl import gl as _gl + except ImportError: + print( + "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet", + file=sys.stderr, + ) + 
sys.exit(1) + + window = pyglet.window.Window( + WIDTH, + HEIGHT, + caption="cuda.core CUDAArray/Texture/Surface - Gerstner Ocean", + vsync=False, + ) + return window, _gl, pyglet + + +def create_display_resources(gl, width, height): + """Create the GL objects needed to show a texture on screen. + + Standard OpenGL boilerplate -- not CUDA-specific. Returns + (shader_program, vao_id, tex_id). The shader_program is a pyglet + ShaderProgram object (must be kept alive). + """ + from pyglet.graphics.shader import Shader, ShaderProgram + + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + # Fullscreen quad (two triangles covering the entire window). + quad_verts = np.array( + [ + -1, + -1, + 0, + 0, + 1, + -1, + 1, + 0, + 1, + 1, + 1, + 1, + -1, + -1, + 0, + 0, + 1, + 1, + 1, + 1, + -1, + 1, + 0, + 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + gl.glBindVertexArray(0) + + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR) + 
gl.glTexImage2D( + gl.GL_TEXTURE_2D, + 0, + gl.GL_RGBA8, + width, + height, + 0, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + return shader_prog, vao.value, tex.value + + +def create_pixel_buffer(gl, width, height): + """Create a Pixel Buffer Object (PBO) sized for one RGBA8 frame.""" + pbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(pbo)) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value) + nbytes = width * height * 4 + gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + return pbo.value, nbytes + + +def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height): + """Copy pixel data from the PBO into the GL texture (GPU-to-GPU).""" + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glTexSubImage2D( + gl.GL_TEXTURE_2D, + 0, + 0, + 0, + width, + height, + gl.GL_RGBA, + gl.GL_UNSIGNED_BYTE, + None, + ) + gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0) + + +def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id): + """Draw the texture to the screen using the fullscreen quad.""" + gl.glUseProgram(shader_prog.id) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id) + gl.glBindVertexArray(vao_id) + gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6) + gl.glBindVertexArray(0) + gl.glUseProgram(0) + + +def make_heightmap_array(): + """Allocate the single-channel float heightmap CUDAArray.""" + return CUDAArray.from_descriptor( + shape=(GRID, GRID), + format=ArrayFormat.FLOAT32, + num_channels=1, + is_surface_load_store=True, + ) + + +def make_height_texture(arr): + """Bind `arr` as a TextureObject configured for LINEAR + WRAP + normalized.""" + res_desc = ResourceDescriptor.from_array(arr) + tex_desc = TextureDescriptor( + address_mode=AddressMode.WRAP, + filter_mode=FilterMode.LINEAR, + read_mode=ReadMode.ELEMENT_TYPE, + # WRAP/MIRROR addressing modes require normalized coordinates. 
+ normalized_coords=True, + ) + return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc) + + +def orbit_camera_position(yaw, pitch, distance): + """Convert (yaw, pitch, distance) to a world-space camera position. + + The camera orbits the origin looking at it. World up is +y. Pitch is the + angle above the xz-plane: pitch=0 puts the camera on the horizon, + pitch=+1.4 nearly directly overhead. + """ + cp = math.cos(pitch) + sp = math.sin(pitch) + cy = math.cos(yaw) + sy = math.sin(yaw) + cam_x = distance * cp * sy + cam_y = distance * sp + cam_z = distance * cp * cy + return cam_x, cam_y, cam_z + + +# ================================== main() ================================== + + +def main(): + # --- Step 1: Set up CUDA (compile kernels, create stream) --- + dev, stream, kernels, configs = setup_cuda() + + # --- Step 2: Open a window --- + window, gl, pyglet = create_window() + + # --- Step 3: Create GL resources for drawing a texture to screen --- + shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT) + + # --- Step 4: Create the Pixel Buffer Object (PBO) --- + pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT) + + # --- Step 5: Register the PBO with CUDA --- + resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard") + + # --- Step 6: Allocate the heightmap CUDAArray and build its texture/surface --- + # We pre-create both the TextureObject (read path) and the + # SurfaceObject (write path) once and reuse them every frame. Creating + # them inside the per-frame loop would work but adds per-frame overhead + # and risks lifetime issues with async kernel launches. 
+ height_arr = make_heightmap_array() + height_tex = make_height_texture(height_arr) + height_surf = SurfaceObject.from_array(height_arr) + + # --- Step 7: Camera + animation state --- + state = { + "preset": DEFAULT_PRESET, + "yaw": INITIAL_YAW, + "pitch": INITIAL_PITCH, + "distance": INITIAL_DISTANCE, + "drag": False, + "paused": False, + "t_anim": 0.0, + "t_prev": time.monotonic(), + } + + # --- Step 8: Render loop --- + frame_count = 0 + fps_time = state["t_prev"] + + @window.event + def on_draw(): + nonlocal frame_count, fps_time + + window.clear() + + # Advance animation time only when not paused, so pausing freezes the + # ocean exactly where it was rather than letting it lurch when resumed. + now = time.monotonic() + dt = now - state["t_prev"] + state["t_prev"] = now + if not state["paused"]: + state["t_anim"] += dt + t = state["t_anim"] + + amp_scale, speed_scale, _label = PRESETS[state["preset"]] + + # (a) Rebuild the heightmap for time t. + launch( + stream, + configs["update"], + kernels["update"], + np.uint64(height_surf.handle), + np.int32(GRID), + np.int32(GRID), + np.float32(t), + np.float32(amp_scale), + np.float32(speed_scale), + ) + + # (b) Render the scene: sample the heightmap through the texture, + # estimate normals via finite differences, shade with Phong + + # Fresnel sky reflection, write RGBA8 into the OpenGL PBO. + cam_x, cam_y, cam_z = orbit_camera_position(state["yaw"], state["pitch"], state["distance"]) + with resource.map(stream=stream) as buf: + launch( + stream, + configs["render"], + kernels["render"], + np.uint64(height_tex.handle), + buf.handle, + np.int32(WIDTH), + np.int32(HEIGHT), + np.float32(cam_x), + np.float32(cam_y), + np.float32(cam_z), + np.float32(t), + ) + # Unmap happens automatically when the `with` block exits. + + # (c) PBO -> GL texture (GPU-to-GPU). + copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT) + + # (d) Draw the texture to the screen. 
+ draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id) + + # FPS counter (shown in window title) + frame_count += 1 + if now - fps_time >= 1.0: + fps = frame_count / (now - fps_time) + label = PRESETS[state["preset"]][2] + paused = " [paused]" if state["paused"] else "" + window.set_caption( + "cuda.core CUDAArray/Texture/Surface - Gerstner Ocean" + f" [{label}]{paused} ({WIDTH}x{HEIGHT}, {fps:.0f} FPS)" + ) + frame_count = 0 + fps_time = now + + # --- Mouse: drag to orbit, scroll to zoom ------------------------------ + @window.event + def on_mouse_press(_x, _y, button, _modifiers): + if button == pyglet.window.mouse.LEFT: + state["drag"] = True + + @window.event + def on_mouse_release(_x, _y, button, _modifiers): + if button == pyglet.window.mouse.LEFT: + state["drag"] = False + + @window.event + def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers): + if not (buttons & pyglet.window.mouse.LEFT): + return + # Rotate yaw on horizontal drag, pitch on vertical drag. The yaw + # direction matches the camera moving with the cursor. + state["yaw"] -= dx * 0.005 + state["pitch"] -= dy * 0.005 + # Clamp pitch to keep the camera basis non-degenerate (never look + # straight down/up the world-y axis). + if state["pitch"] > PITCH_LIMIT: + state["pitch"] = PITCH_LIMIT + if state["pitch"] < -PITCH_LIMIT: + state["pitch"] = -PITCH_LIMIT + + @window.event + def on_mouse_scroll(_x, _y, _scroll_x, scroll_y): + # Geometric zoom in camera distance; clamp to a sensible range. 
+ factor = 1.1 ** (-scroll_y) + new_d = state["distance"] * factor + state["distance"] = max(ZOOM_MIN, min(ZOOM_MAX, new_d)) + + # --- Keyboard: 1/2/3 weather presets, P pauses, Escape exits ---------- + @window.event + def on_key_press(symbol, _modifiers): + key = pyglet.window.key + if symbol == key.ESCAPE: + window.close() + return + if symbol == key.P: + state["paused"] = not state["paused"] + return + for digit_key, name in ( + (key._1, "1"), + (key._2, "2"), + (key._3, "3"), + ): + if symbol == digit_key: + state["preset"] = name + return + + @window.event + def on_close(): + # Release CUDA resources in reverse order of creation. + resource.close() + height_tex.close() + height_surf.close() + height_arr.close() + stream.close() + + pyglet.app.run(interval=0) + + +# ======================== GPU code (CUDA + GLSL) ============================ +# +# KERNEL_SOURCE contains two CUDA C++ kernels: +# - update_height: per-heightmap-texel. Sums 12 Gerstner waves and writes +# one float per texel via SurfaceObject. +# - render_ocean: per-screen-pixel. Builds a camera ray, intersects the +# ocean plane (y=0), samples the heightmap via +# TextureObject (LINEAR + WRAP), estimates the normal via +# finite differences, and shades with Phong + Fresnel sky +# reflection. Misses go to a vertical sky gradient. +# +# VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are plain GLSL that draws a +# texture on a fullscreen quad -- nothing CUDA-specific. +# ============================================================================ + +KERNEL_SOURCE = r""" +// --------------------------------------------------------------------------- +// Wave bank: 12 Gerstner-ish waves with hand-picked parameters. +// +// Wavelengths span 0.05 .. 1.0 world units. Amplitudes decrease with +// frequency so that long swells dominate and short ripples ride on top +// (a rough Phillips/JONSWAP-style envelope, but coarsely hand-tuned for +// visual punch rather than physical accuracy). 
+// +// Directions are spread non-uniformly around the unit circle to avoid the +// streaky-grid look you get from evenly-spaced directions. +// --------------------------------------------------------------------------- +__constant__ float c_dirx[12] = { + 1.000f, 0.866f, 0.500f, 0.000f, -0.500f, -0.866f, + -1.000f, -0.940f, -0.500f, 0.174f, 0.643f, 0.940f +}; +__constant__ float c_dirz[12] = { + 0.000f, 0.500f, 0.866f, 1.000f, 0.866f, 0.500f, + 0.000f, 0.342f, 0.866f, 0.985f, 0.766f, 0.342f +}; +__constant__ float c_wavelen[12] = { + 1.000f, 0.730f, 0.520f, 0.380f, 0.260f, 0.190f, + 0.140f, 0.105f, 0.085f, 0.070f, 0.058f, 0.050f +}; +__constant__ float c_amp[12] = { + 0.080f, 0.060f, 0.045f, 0.034f, 0.025f, 0.018f, + 0.013f, 0.010f, 0.0075f, 0.0055f, 0.0040f, 0.0030f +}; +__constant__ float c_phase[12] = { + 0.00f, 1.20f, 2.10f, 0.40f, 3.70f, 5.10f, + 2.65f, 4.85f, 1.55f, 6.05f, 3.20f, 0.95f +}; + +// Deep-water dispersion: w = sqrt(g * k), with k = 2*pi / wavelength. +__device__ __forceinline__ float angular_freq(float wavelength) { + const float G = 9.81f; + float k = 6.2831853f / wavelength; + return sqrtf(G * k); +} + +// World extent (in world units) covered by one tile of the heightmap. +// The heightmap WRAPs, so the ocean tiles seamlessly every TILE world units. +__device__ __forceinline__ float tile_extent() { return 4.0f; } + +// --------------------------------------------------------------------------- +// Tiny vec3 helpers. Kept inline + __forceinline__ so they stay free. 
+// --------------------------------------------------------------------------- +struct V3 { float x, y, z; }; + +__device__ __forceinline__ V3 v3(float x, float y, float z) { + V3 r; r.x = x; r.y = y; r.z = z; return r; +} +__device__ __forceinline__ V3 v_add(V3 a, V3 b) { + return v3(a.x + b.x, a.y + b.y, a.z + b.z); +} +__device__ __forceinline__ V3 v_sub(V3 a, V3 b) { + return v3(a.x - b.x, a.y - b.y, a.z - b.z); +} +__device__ __forceinline__ V3 v_scale(V3 a, float s) { + return v3(a.x * s, a.y * s, a.z * s); +} +__device__ __forceinline__ float v_dot(V3 a, V3 b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} +__device__ __forceinline__ V3 v_cross(V3 a, V3 b) { + return v3(a.y * b.z - a.z * b.y, + a.z * b.x - a.x * b.z, + a.x * b.y - a.y * b.x); +} +__device__ __forceinline__ V3 v_normalize(V3 a) { + float inv = rsqrtf(fmaxf(v_dot(a, a), 1e-20f)); + return v_scale(a, inv); +} + +// --------------------------------------------------------------------------- +// update_height: each thread computes one heightmap texel. +// +// Sums the 12 Gerstner waves at world position (x, z), using the +// amplitude_scale and speed_scale knobs to switch between weather presets +// without recompiling the kernel. Writes one float via surf2Dwrite. +// --------------------------------------------------------------------------- +extern "C" __global__ +void update_height(cudaSurfaceObject_t surf, + int width, int height, + float t, + float amp_scale, float speed_scale) { + int ix = blockIdx.x * blockDim.x + threadIdx.x; + int iy = blockIdx.y * blockDim.y + threadIdx.y; + if (ix >= width || iy >= height) return; + + // Map texel (ix, iy) to world position (x, z) inside one tile. 
+ float inv_w = 1.0f / (float)width; + float inv_h = 1.0f / (float)height; + float te = tile_extent(); + float wx = ((float)ix + 0.5f) * inv_w * te; + float wz = ((float)iy + 0.5f) * inv_h * te; + + float h = 0.0f; + #pragma unroll + for (int i = 0; i < 12; ++i) { + float k = 6.2831853f / c_wavelen[i]; + float w = angular_freq(c_wavelen[i]) * speed_scale; + float arg = (c_dirx[i] * wx + c_dirz[i] * wz) * k - w * t + c_phase[i]; + h += c_amp[i] * sinf(arg); + } + h *= amp_scale; + + // Single-channel float surface: byte offset is x * sizeof(float). + surf2Dwrite(h, surf, ix * (int)sizeof(float), iy); +} + +// --------------------------------------------------------------------------- +// Sample the heightmap at a world position. Texture is normalized + WRAP, +// so we just divide world coords by tile_extent. WRAP gives us the tiling +// for free at the horizon. +// --------------------------------------------------------------------------- +__device__ __forceinline__ float sample_height(cudaTextureObject_t tex, + float wx, float wz) { + float inv_te = 1.0f / tile_extent(); + return tex2D(tex, wx * inv_te, wz * inv_te); +} + +// --------------------------------------------------------------------------- +// Sky gradient: a vertical interpolation from a soft horizon to a deeper +// overhead blue. `up_angle` is in [-1, 1] (the y component of the ray dir). +// --------------------------------------------------------------------------- +__device__ __forceinline__ V3 sky_color(float up_angle) { + // Clamp to [0, 1] so straight-down rays still get a horizon color. + float a = fmaxf(0.0f, fminf(1.0f, up_angle)); + // Soft pale-blue horizon + V3 horizon = v3(0.70f, 0.82f, 0.92f); + // Deeper blue overhead + V3 zenith = v3(0.18f, 0.34f, 0.62f); + // Curve so the gradient isn't linear -- horizon stays brighter longer. 
+ float t = powf(a, 0.6f); + return v_add(v_scale(horizon, 1.0f - t), v_scale(zenith, t)); +} + +// --------------------------------------------------------------------------- +// render_ocean: each thread shades one screen pixel. +// +// 1. Reconstruct the camera basis from cam_pos (orbiting origin, world-up). +// 2. Build a perspective ray through the pixel. +// 3. Intersect ray with y = 0 plane; if it misses, return sky gradient. +// 4. Sample heightmap at hit point; finite-difference for the normal. +// 5. Phong diffuse + specular, blended with Fresnel sky reflection. +// 6. Write RGBA8 into the OpenGL PBO. +// --------------------------------------------------------------------------- +extern "C" __global__ +void render_ocean(cudaTextureObject_t tex, + unsigned char* out, + int w, int h, + float cam_x, float cam_y, float cam_z, + float /*t*/) { + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + if (px >= w || py >= h) return; + + // ---- Camera basis ---- + // Forward looks from cam_pos toward origin. World up is +y. + // cam_y > 0 guarantees forward.y < 0 and the cross product with world-up + // is well-defined (the pitch is clamped on the host side). 
+ V3 cam_pos = v3(cam_x, cam_y, cam_z); + V3 forward = v_normalize(v_sub(v3(0.0f, 0.0f, 0.0f), cam_pos)); + V3 world_up = v3(0.0f, 1.0f, 0.0f); + V3 right = v_normalize(v_cross(forward, world_up)); + V3 cam_up = v_cross(right, forward); + + // ---- Pixel ray (perspective) ---- + float aspect = (float)w / (float)h; + float fov = 1.0472f; // 60 degrees vertical FoV + float scale = tanf(fov * 0.5f); + float ndc_x = (2.0f * ((float)px + 0.5f) / (float)w - 1.0f) * aspect * scale; + float ndc_y = (1.0f - 2.0f * ((float)py + 0.5f) / (float)h) * scale; + V3 dir = v_normalize(v_add(v_add(forward, + v_scale(right, ndc_x)), + v_scale(cam_up, ndc_y))); + + // ---- Background sky if the ray misses the ocean plane ---- + // The ocean is the y=0 plane; we only count hits with rays going downward + // (dir.y < 0). Anything else is sky. A small eps avoids near-horizontal + // rays producing absurd hit distances. + V3 col; + const float HIT_EPS = 1e-3f; + if (dir.y > -HIT_EPS) { + col = sky_color(dir.y); + } else { + // ---- Hit the ocean plane ---- + float t_hit = -cam_y / dir.y; + if (t_hit <= 0.0f) { + // Camera under the surface -- treat as sky to avoid garbage. + col = sky_color(dir.y); + } else { + V3 p = v_add(cam_pos, v_scale(dir, t_hit)); + + // ---- Sample heightmap; estimate normal via finite differences ---- + // The heightmap tiles every tile_extent() world units (WRAP), so + // we use a small world-space epsilon. Four taps -> central + // differences in x and z. + const float FD = 0.01f; + float h_c = sample_height(tex, p.x, p.z); + float h_xp = sample_height(tex, p.x + FD, p.z); + float h_xm = sample_height(tex, p.x - FD, p.z); + float h_zp = sample_height(tex, p.x, p.z + FD); + float h_zm = sample_height(tex, p.x, p.z - FD); + float dh_dx = (h_xp - h_xm) / (2.0f * FD); + float dh_dz = (h_zp - h_zm) / (2.0f * FD); + // Normal of the surface y = h(x, z) is (-dh/dx, 1, -dh/dz). 
+ V3 N = v_normalize(v3(-dh_dx, 1.0f, -dh_dz)); + + // ---- Lighting ---- + V3 L = v_normalize(v3(0.55f, 0.65f, 0.35f)); // sun: high+side + V3 V = v_normalize(v_sub(cam_pos, p)); // view direction + // Reflect L about N: R = 2*(N.L)*N - L + float ndotl = fmaxf(0.0f, v_dot(N, L)); + V3 R = v_normalize(v_sub(v_scale(N, 2.0f * v_dot(N, L)), L)); + + // Phong specular highlight on wave crests. + float spec = powf(fmaxf(0.0f, v_dot(R, V)), 32.0f); + + // Diffuse: deep-sea blue-green. + V3 deep = v3(0.04f, 0.18f, 0.28f); + V3 shallow = v3(0.10f, 0.32f, 0.42f); + // Tiny height-based shading bias so crests look slightly brighter. + float tint = 0.5f + 0.5f * fmaxf(-1.0f, fminf(1.0f, h_c * 6.0f)); + V3 base = v_add(v_scale(deep, 1.0f - tint), + v_scale(shallow, tint)); + + // Diffuse term + ambient. + V3 diffuse = v_add(v_scale(base, 0.18f), + v_scale(base, 0.82f * ndotl)); + + // Fresnel-modulated sky reflection. Sample the sky in the + // reflected-view direction so reflections of overhead show + // overhead colors, etc. View reflection: Rv = 2*(N.V)*N - V. + float ndotv = fmaxf(0.0f, v_dot(N, V)); + V3 Rv = v_normalize(v_sub(v_scale(N, 2.0f * v_dot(N, V)), V)); + V3 reflected_sky = sky_color(fmaxf(0.0f, Rv.y)); + float F = powf(1.0f - ndotv, 5.0f); + // Clamp Fresnel just in case of NaN-prone edge cases. + if (F < 0.0f) F = 0.0f; + if (F > 1.0f) F = 1.0f; + + // Blend: more reflection at grazing angles. + V3 lit = v_add(v_scale(diffuse, 1.0f - F), + v_scale(reflected_sky, F)); + + // Add specular highlight (sun color). + V3 sun_col = v3(1.0f, 0.96f, 0.85f); + col = v_add(lit, v_scale(sun_col, spec)); + } + } + + // ---- Tonemap + write ---- + // Simple Reinhard-ish curve keeps highlights in [0, 1]. 
+ col.x = col.x / (1.0f + col.x); + col.y = col.y / (1.0f + col.y); + col.z = col.z / (1.0f + col.z); + + int idx = (py * w + px) * 4; + out[idx + 0] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.x)) * 255.0f); + out[idx + 1] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.y)) * 255.0f); + out[idx + 2] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.z)) * 255.0f); + out[idx + 3] = 255; +} +""" + +# GLSL shaders -- these just display a texture on a fullscreen rectangle. +# Nothing CUDA-specific here. + +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_particles.py b/cuda_core/examples/gl_interop_particles.py new file mode 100644 index 00000000000..c5dd06e3697 --- /dev/null +++ b/cuda_core/examples/gl_interop_particles.py @@ -0,0 +1,688 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.GraphicsResource VBO interop together with +# CUDAArray, SurfaceObject, and TextureObject. Hundreds of thousands of points +# flow through an animated curl-noise velocity field. CUDA writes particle +# positions directly into an OpenGL Vertex Buffer Object (VBO), and OpenGL draws +# that same buffer as a glowing additive point cloud -- no PBO, no fullscreen +# quad, no pixel copy. Requires pyglet. 
+# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to register an OpenGL VBO (GL_ARRAY_BUFFER) with CUDA using +# `GraphicsResource.from_gl_buffer(vbo_id, flags="none")` and treat the mapped +# `buf.handle` as a device pointer to a particle array that CUDA both reads and +# writes in place. This is the standout difference from every other interop +# example here: those copy CUDA output into a PBO, upload it to a texture, and +# draw a fullscreen quad. This one renders geometry straight out of the buffer +# CUDA just wrote. +# - How to bake a smooth, periodic scalar potential into a 2D CUDAArray once (via +# a SurfaceObject write kernel), then bind that array as a LINEAR + WRAP +# normalized TextureObject and derive a divergence-free curl-noise velocity +# field from finite differences of texture samples. +# - How to draw GL_POINTS directly from a CUDA-written VBO with additive blending +# and shader-controlled point size for a luminous, flowing look. +# +# How it works +# ============ +# We allocate one VBO holding N particles. Each particle is 4 floats: +# +# [x, y, age, speed] (stride = 16 bytes) +# +# - x, y : position in the [0, 1] x [0, 1] domain. The vertex shader maps +# this to clip space with `pos * 2 - 1`. Keeping a single [0, 1] +# domain means the kernel can sample the velocity texture with +# normalized coordinates directly -- no scaling bugs. +# - age : seconds since this particle last (re)spawned. Drives color and +# alpha; resets to 0 on respawn. +# - speed : normalized flow magnitude in [0, 1] at the particle's location +# (the kernel maps gradient steepness through tanh). Drives the +# color ramp so fast jets glow hotter than calm eddies. 
+# +# The GL vertex attributes read from the same buffer: +# - "position" : 2 floats at offset 0 +# - "attribs" : 2 floats (age, speed) at offset 8 +# +# The CUDA kernel `advance_particles` indexes the buffer as `float4*` so its +# layout agrees with the host init array and the GL attribute pointers above. +# +# VBO INTEROP (one buffer, CUDA writes -> OpenGL draws) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +-------------------+ map(stream) +---------------------+ +# | OpenGL VBO | ---------------> | advance_particles | +# | float4 per point | | (curl-noise flow) | +# | [x, y, age, speed]| <--------------- | reads+writes pts | +# +-------------------+ unmap +---------------------+ +# | +# | glDrawArrays(GL_POINTS) (after unmap; GL cannot read a +# v buffer while it is mapped to CUDA) +# glowing point cloud on screen +# +# The velocity field is a curl of a baked scalar potential P(u, v): +# +# velocity = ( dP/dv, -dP/du ) +# +# Taking the curl of a scalar potential yields a divergence-free field, so +# particles swirl without piling up or thinning out. The potential is baked once +# into a single-channel float CUDAArray as a sum of periodic sinusoids, then +# sampled with LINEAR + WRAP + normalized coordinates. A time uniform scrolls the +# sample coordinates so the whole field slowly drifts and animates. +# +# Why flags="none" (not "write_discard")? +# --------------------------------------- +# The PBO examples register with "write_discard" because they overwrite every +# pixel each frame and never read the old contents. Here the kernel READS each +# particle's current position before writing the advanced one, so we must NOT +# tell CUDA the prior contents are garbage. We use "none". +# +# Single-channel surf2Dwrite byte offset +# -------------------------------------- +# The potential array is single-channel `float` (4 bytes). `surf2Dwrite` takes +# the x coordinate in BYTES, so the offset is `x * sizeof(float)` = `x * 4`. 
+# (Contrast the float2 reaction-diffusion example, which uses `x * 8`.) +# +# What you should see +# =================== +# Luminous filaments of points swirling through an animated flow field, colored +# blue -> cyan -> white by speed and faded by age. Press R to respawn all +# particles, +/- to slow down / speed up the flow, and Escape to exit. The window +# title shows the particle count and FPS. +# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + ArrayFormat, + CUDAArray, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Simulation parameters (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 900 +HEIGHT = 900 +N_PARTICLES = 1_000_000 # number of points in the cloud +FLOATS_PER_PARTICLE = 4 # [x, y, age, speed] +POTENTIAL_DIM = 256 # resolution of the baked potential texture (square) +DT = 1.0 / 60.0 # simulation time step per frame (seconds) +BASE_SPEED = 0.15 # base flow speed (domain units per second) +SPEED_STEP = 1.25 # multiplier applied by +/- +MAX_AGE = 4.0 # seconds before a particle respawns +POINT_SIZE = 2.4 # rendered point size in pixels + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA and OpenGL. If you're here to learn about VBO +# interop, skip ahead to main() -- the interesting part is there. These helpers +# exist so that main() reads like a short story instead of a wall of +# boilerplate. 
def setup_cuda():
    """Prepare the CUDA side of the demo.

    Makes device 0 current, checks its compute capability, creates a stream,
    compiles KERNEL_SOURCE to a cubin, and builds one launch configuration
    per kernel.

    Returns:
        `(device, stream, kernels, configs)`, where `kernels` and `configs`
        are dicts keyed by "bake", "init", and "advance".

    Exits the process with status 1, after printing a message to stderr, when
    the device's compute capability major version is below 3.
    """

    def blocks_for(extent, threads):
        # Ceiling division: enough blocks to cover `extent` work items.
        return (extent + threads - 1) // threads

    # Short launch key -> kernel entry point name in KERNEL_SOURCE.
    entry_points = {
        "bake": "bake_potential",
        "init": "init_particles",
        "advance": "advance_particles",
    }

    device = Device(0)
    device.set_current()

    # SurfaceObject requires bindless surface objects (cuSurfObjectCreate),
    # which need compute capability >= 3.0.
    capability = device.compute_capability
    if capability.major < 3:
        print(
            "This example requires a GPU with compute capability >= 3.0 for "
            f"bindless surface objects. Found sm_{capability.major}{capability.minor}.",
            file=sys.stderr,
        )
        sys.exit(1)

    stream = device.create_stream()

    # Compile as C++ so the templated tex2D overload resolves.
    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
    module = Program(KERNEL_SOURCE, code_type="c++", options=options).compile(
        "cubin",
        name_expressions=tuple(entry_points.values()),
    )
    kernels = {key: module.get_kernel(symbol) for key, symbol in entry_points.items()}

    # The potential bake covers a POTENTIAL_DIM x POTENTIAL_DIM grid of texels
    # with 16x16 thread tiles.
    tile = (16, 16, 1)
    tile_grid = (
        blocks_for(POTENTIAL_DIM, tile[0]),
        blocks_for(POTENTIAL_DIM, tile[1]),
        1,
    )
    # init/advance use one thread per particle in 1D blocks of 256 threads.
    strip = (256, 1, 1)
    strip_grid = (blocks_for(N_PARTICLES, strip[0]), 1, 1)

    configs = {
        "bake": LaunchConfig(grid=tile_grid, block=tile),
        "init": LaunchConfig(grid=strip_grid, block=strip),
        "advance": LaunchConfig(grid=strip_grid, block=strip),
    }

    return device, stream, kernels, configs
def create_shader(gl):
    """Compile and link the point-cloud shader, then set the GL draw state.

    Args:
        gl: The OpenGL bindings object used to issue GL calls.

    Returns:
        The linked pyglet ShaderProgram. The caller is expected to keep a
        reference to it for as long as it is used for drawing.
    """
    from pyglet.graphics import shader as pyglet_shader

    vertex_stage = pyglet_shader.Shader(VERTEX_SHADER_SOURCE, "vertex")
    fragment_stage = pyglet_shader.Shader(FRAGMENT_SHADER_SOURCE, "fragment")
    program = pyglet_shader.ShaderProgram(vertex_stage, fragment_stage)

    # Additive blending (src * alpha + dst) so overlapping points accumulate
    # into glow.
    gl.glEnable(gl.GL_BLEND)
    gl.glBlendFunc(gl.GL_SRC_ALPHA, gl.GL_ONE)
    # Let the vertex shader choose the point size (off by default in the core
    # profile).
    gl.glEnable(gl.GL_PROGRAM_POINT_SIZE)
    # Depth testing is switched off for the point cloud.
    gl.glDisable(gl.GL_DEPTH_TEST)

    return program
def main():
    """Run the curl-noise particle flow demo until the window is closed.

    Sets up CUDA and a pyglet window, registers the particle VBO with CUDA,
    bakes the scalar potential into a CUDAArray once, seeds the particles,
    and then on every frame advances the particles with a CUDA kernel and
    draws them straight from the same VBO as GL_POINTS.
    """
    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
    _dev, stream, kernels, configs = setup_cuda()

    # --- Step 2: Open a window ---
    window, gl, pyglet = create_window()

    # --- Step 3: Build the point-cloud shader and enable additive blending ---
    shader_prog = create_shader(gl)

    # Uniform locations do not change once the program is linked, so look
    # them up a single time here instead of on every frame.
    max_age_loc = gl.glGetUniformLocation(shader_prog.id, b"max_age")
    psize_loc = gl.glGetUniformLocation(shader_prog.id, b"point_size")

    # --- Step 4: Create the particle VBO + VAO (the buffer CUDA writes into) ---
    vbo_id, vao_id = create_particle_vbo(gl, shader_prog)

    # =======================================================================
    # API MAP -- the four cuda.core interop objects this example hinges on
    # =======================================================================
    # GraphicsResource.from_gl_buffer(VBO)
    #     Registers a GL VBO (NOT a PBO) so CUDA writes vertex positions,
    #     OpenGL then draws directly -- zero copy. The mapped buf.handle is a
    #     raw device pointer into the same float4 array OpenGL renders from.
    # CUDAArray (single-channel float, is_surface_load_store=True)
    #     The backing storage for the baked scalar potential.
    # SurfaceObject.from_array(pot_arr)
    #     Write view used ONCE at startup to bake the potential into the array.
    # TextureObject (LINEAR + WRAP + normalized, 1ch)
    #     Read view: LINEAR+WRAP+normalized lets the kernel read the baked
    #     potential's gradient with smooth, tileable sampling -- the curl of
    #     that gradient is the divergence-free velocity field.
    # The texture handle is created once, kept alive, and wrapped in np.uint64
    # at launch; buf.handle is passed raw.
    # =======================================================================

    # --- Step 5: Register the VBO with CUDA ---
    # flags="none": the kernel reads each particle before writing it back,
    # so we must NOT discard the prior contents (that's why this is not
    # "write_discard" like the PBO examples).
    resource = GraphicsResource.from_gl_buffer(vbo_id, flags="none")

    # --- Step 6: Allocate + bake the curl-noise potential, bind it as a texture ---
    pot_arr = make_potential_array()
    pot_surf = SurfaceObject.from_array(pot_arr)  # created once, kept alive
    pot_tex = make_potential_texture(pot_arr)  # created once, kept alive

    # Bake the scalar potential once via the SurfaceObject.
    launch(
        stream,
        configs["bake"],
        kernels["bake"],
        np.uint64(pot_surf.handle),
        np.int32(POTENTIAL_DIM),
        np.int32(POTENTIAL_DIM),
    )

    # --- Step 7: Seed the particles into the VBO ---
    state = {"seed": 1, "speed": BASE_SPEED, "t": 0.0}
    reset_particles(stream, kernels, configs, resource, state["seed"])

    # --- Step 8: Render loop ---
    frame_count = 0
    fps_time = time.monotonic()

    @window.event
    def on_key_press(symbol, _modifiers):
        key = pyglet.window.key
        if symbol == key.ESCAPE:
            window.close()
            return
        if symbol == key.R:
            # A new seed gives a different respawn pattern each time.
            state["seed"] += 1
            reset_particles(stream, kernels, configs, resource, state["seed"])
            return
        if symbol in (key.PLUS, key.NUM_ADD, key.EQUAL):
            state["speed"] *= SPEED_STEP
            return
        if symbol in (key.MINUS, key.NUM_SUBTRACT):
            state["speed"] /= SPEED_STEP
            return

    @window.event
    def on_draw():
        nonlocal frame_count, fps_time

        # Black background so additive accumulation reads as glow.
        window.clear()

        # The simulation advances by a fixed step per drawn frame.
        state["t"] += DT

        # (a) Advance particles. The map brackets ONLY the CUDA launch -- OpenGL
        #     cannot read the buffer while it is mapped to CUDA.
        with resource.map(stream=stream) as buf:
            launch(
                stream,
                configs["advance"],
                kernels["advance"],
                buf.handle,  # raw device pointer to the float4 particle array
                np.uint64(pot_tex.handle),
                np.int32(N_PARTICLES),
                np.float32(DT),
                np.float32(state["speed"]),
                np.float32(state["t"]),
                np.float32(MAX_AGE),
                np.uint32(state["seed"]),
            )
        # Unmap happens automatically when the `with` block exits; only after
        # that may OpenGL draw from the buffer.

        # (b) Draw the particles straight from the VBO as GL_POINTS.
        gl.glUseProgram(shader_prog.id)
        gl.glUniform1f(max_age_loc, MAX_AGE)
        gl.glUniform1f(psize_loc, POINT_SIZE)
        gl.glBindVertexArray(vao_id)
        gl.glDrawArrays(gl.GL_POINTS, 0, N_PARTICLES)
        gl.glBindVertexArray(0)
        gl.glUseProgram(0)

        # FPS counter (shown in window title), refreshed about once a second.
        frame_count += 1
        now = time.monotonic()
        if now - fps_time >= 1.0:
            fps = frame_count / (now - fps_time)
            window.set_caption(
                "cuda.core VBO interop - Curl-Noise Particle Flow"
                f" ({N_PARTICLES:,} points, {fps:.0f} FPS,"
                f" speed x{state['speed'] / BASE_SPEED:.2f})"
                " | GraphicsResource(VBO) + TextureObject[LINEAR|WRAP|norm|1ch]"
            )
            frame_count = 0
            fps_time = now

    @window.event
    def on_close():
        # Kernel launches are asynchronous: wait for the stream to drain so
        # no in-flight kernel is still using the objects released below.
        stream.sync()
        # Release the GL registration first, then the texture and surface
        # views before the array they were created from, and the stream last.
        resource.close()
        pot_tex.close()
        pot_surf.close()
        pot_arr.close()
        stream.close()

    pyglet.app.run(interval=0)
+# * init_particles -- (re)spawns every particle to a pseudo-random +# position with a staggered age. Operates on the +# mapped VBO as a float4 array. +# * advance_particles -- reads each particle from the mapped VBO, samples +# the potential texture, computes a divergence-free +# curl velocity, integrates the position, handles +# wrap/respawn, and writes the particle back. +# +# - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE draw GL_POINTS from the VBO +# with a soft round sprite colored by speed and faded by age. +# +# ============================================================================ + +KERNEL_SOURCE = r""" +// ---- shared helpers -------------------------------------------------------- + +// Cheap deterministic xorshift hash -> float in [0, 1). +__device__ __forceinline__ float hash01(unsigned int h) { + h ^= h >> 16; h *= 0x7feb352du; + h ^= h >> 15; h *= 0x846ca68bu; + h ^= h >> 16; + return (h & 0x00ffffffu) / (float)0x01000000; +} + +__device__ __forceinline__ unsigned int seed_of(unsigned int idx, unsigned int salt) { + return idx * 747796405u + salt * 2891336453u + 1u; +} + +// ---- bake the scalar potential --------------------------------------------- +// +// A sum of periodic sinusoids over the unit square. Using full 2*pi*k periods +// makes the field seamless under WRAP addressing -- no visible edge. 
+extern "C" +__global__ +void bake_potential(cudaSurfaceObject_t surf, int width, int height) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + float u = (x + 0.5f) / (float)width; // [0, 1) + float v = (y + 0.5f) / (float)height; // [0, 1) + const float TWO_PI = 6.2831853f; + + float p = 0.0f; + p += 1.00f * sinf(TWO_PI * (1.0f * u + 0.0f * v) + 0.3f); + p += 0.70f * sinf(TWO_PI * (0.0f * u + 1.0f * v) + 1.7f); + p += 0.55f * sinf(TWO_PI * (1.0f * u + 1.0f * v) + 2.1f); + p += 0.45f * sinf(TWO_PI * (2.0f * u - 1.0f * v) + 0.9f); + p += 0.30f * sinf(TWO_PI * (-1.0f * u + 2.0f * v) + 4.2f); + p += 0.25f * sinf(TWO_PI * (3.0f * u + 2.0f * v) + 5.5f); + + // Single-channel float surface: x offset is in BYTES = x * sizeof(float). + surf2Dwrite(p, surf, x * (int)sizeof(float), y); +} + +// ---- (re)spawn particles --------------------------------------------------- +// +// The VBO is a flat array of float4 [x, y, age, speed] per particle. +extern "C" +__global__ +void init_particles(float4* particles, int n, + unsigned int seed, float max_age) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + unsigned int s = seed_of((unsigned int)i, seed); + float px = hash01(s + 11u); + float py = hash01(s + 53u); + // Stagger ages so respawns don't pulse in lockstep. + float age = hash01(s + 97u) * max_age; + particles[i] = make_float4(px, py, age, 0.0f); +} + +// ---- advance particles through the curl-noise field ------------------------ +extern "C" +__global__ +void advance_particles(float4* particles, + cudaTextureObject_t pot, + int n, float dt, float speed, + float t, float max_age, + unsigned int seed) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + float4 p = particles[i]; + float x = p.x; + float y = p.y; + float age = p.z; + + // Scroll the sample coordinates slowly with time so the field animates. 
+ float scroll = 0.03f * t; + float su = x + scroll; + float sv = y - 0.5f * scroll; + + // Curl of a scalar potential P is (dP/dv, -dP/du): divergence-free flow. + // Estimate the gradient by central differences of texture samples. The + // texture is LINEAR + WRAP + normalized, so wrapped reads are seamless. + const float eps = 1.0f / 256.0f; + float p_up = tex2D(pot, su, sv + eps); + float p_dn = tex2D(pot, su, sv - eps); + float p_rt = tex2D(pot, su + eps, sv); + float p_lt = tex2D(pot, su - eps, sv); + + float dP_dv = (p_up - p_dn) / (2.0f * eps); + float dP_du = (p_rt - p_lt) / (2.0f * eps); + + // Curl direction, then bound the magnitude. The raw analytic gradient of + // the summed sinusoids runs ~0..20, which (times speed) would whip every + // particle across the domain in well under a second and saturate the color + // ramp. We split it: `dir` is the flow direction, and `flow` maps the + // gradient steepness through tanh into [0, 1] so the field has slow eddies + // and fast jets. The displacement is `speed * flow` domain-units/sec, so + // `speed` is a true unit-per-second knob and `flow` drives the color ramp. + float gx = dP_dv; + float gy = -dP_du; + float grad = sqrtf(gx * gx + gy * gy) + 1e-6f; + float flow = tanhf(grad * 0.12f); // 0 in calm regions, ->1 in steep jets + float vx = speed * flow * (gx / grad); + float vy = speed * flow * (gy / grad); + + // Store `flow` (the normalized speed in [0, 1]) as the color driver. + float vmag = flow; + + // Integrate position. + x += vx * dt; + y += vy * dt; + age += dt; + + // Respawn on age expiry or if a particle drifts out of the unit domain. + bool respawn = (age >= max_age) || x < 0.0f || x > 1.0f || y < 0.0f || y > 1.0f; + if (respawn) { + // Jitter the seed by frame-ish state so respawns spread out over time. 
+ unsigned int s = seed_of((unsigned int)i, seed + (unsigned int)(t * 60.0f)); + x = hash01(s + 11u); + y = hash01(s + 53u); + age = 0.0f; + vmag = 0.0f; + } + + particles[i] = make_float4(x, y, age, vmag); +} +""" + +# GLSL shaders -- draw GL_POINTS from the VBO. Position maps [0,1] -> clip space; +# color ramps blue -> cyan -> white by speed and fades with age. The fragment +# shader makes each point a soft round sprite for the glow. + +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; // x, y in [0, 1] +in vec2 attribs; // age, speed +out float v_age; +out float v_speed; +uniform float max_age; +uniform float point_size; +void main() { + gl_Position = vec4(position * 2.0 - 1.0, 0.0, 1.0); + v_age = clamp(attribs.x / max_age, 0.0, 1.0); + v_speed = attribs.y; + // Subtle size-by-speed: fast jets render a touch larger so filaments read + // as brighter, structured streaks. Reuses the existing speed attribute -- + // no struct change. Calm points keep the base size; never shrinks below it. + gl_PointSize = point_size * (1.0 + 0.3 * clamp(v_speed, 0.0, 1.0)); +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in float v_age; +in float v_speed; +out vec4 fragColor; +void main() { + // Soft round sprite: fade toward the edge of the point. + vec2 d = gl_PointCoord - vec2(0.5); + float r = length(d) * 2.0; + float falloff = clamp(1.0 - r, 0.0, 1.0); + falloff *= falloff; + + // Speed ramp: blue -> cyan -> white. v_speed is the normalized flow + // magnitude in [0, 1] (see advance_particles), so it spans the ramp. + float s = clamp(v_speed, 0.0, 1.0); + vec3 cool = vec3(0.12, 0.40, 1.00); // lifted enough that slow points still glow + vec3 mid = vec3(0.22, 0.85, 1.15); + vec3 hot = vec3(1.15, 1.15, 1.20); // slightly >1 so only the densest cores clip + vec3 color = (s < 0.5) + ? mix(cool, mid, s * 2.0) + : mix(mid, hot, (s - 0.5) * 2.0); + + // Fade in just after spawn and out near end of life. 
+ float life = (1.0 - v_age) * smoothstep(0.0, 0.08, v_age); + float alpha = falloff * life * 0.7; // density carries the glow; trim so cores don't fully clip + + fragColor = vec4(color, alpha); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_physarum.py b/cuda_core/examples/gl_interop_physarum.py new file mode 100644 index 00000000000..99972635b14 --- /dev/null +++ b/cuda_core/examples/gl_interop_physarum.py @@ -0,0 +1,889 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject +# together with a plain device Buffer and GraphicsResource for CUDA/OpenGL +# interop. A large population of "slime mold" (Physarum) agents crawls over a +# single-channel float trail map: each agent senses the trail ahead via a +# TextureObject (LINEAR + WRAP sampling), steers toward the strongest scent, +# steps forward, and deposits pheromone through a SurfaceObject. A separate +# diffuse/decay pass blurs and fades the trail (ping-ponged between two CUDA +# arrays), and a colorize pass writes a neon palette straight into an OpenGL +# PBO. The result is emergent, self-organizing vein/network patterns. Requires +# pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to combine a plain device Buffer (per-agent state) with CUDAArray-backed +# TextureObject/SurfaceObject pairs in a single simulation, all on the GPU. +# - How to allocate a single-channel float CUDAArray with +# `is_surface_load_store=True` so the same memory can be read as a +# TextureObject (LINEAR + WRAP + normalized) and written as a SurfaceObject. 
# - How to initialize a device Buffer from host data without a third-party GPU
#   array library: stage through a host-accessible pinned Buffer, fill it on the
#   host via NumPy, then `copy_from` into the device Buffer.
We sample at texel centers +# `(x + 0.5) / W` so neighbor offsets land on integer texel positions. +# +# Channel byte width in surf2Dwrite +# --------------------------------- +# `surf2Dwrite` takes the x coordinate in BYTES, not elements. The trail is a +# single-channel `float` surface, so the x offset is `x * sizeof(float)` = `x*4`. +# (Contrast a `float2` surface, which would need `x*8`.) Getting this wrong +# silently corrupts every Nth column. +# +# Per-agent state lives in a plain device Buffer +# ---------------------------------------------- +# Agents are stored as a flat float32 array of length 3*N laid out as +# [x0, y0, h0, x1, y1, h1, ...]. We allocate it once with `dev.allocate` and +# pass the Buffer object straight to `launch` (matching saxpy.py / memory_ops.py, +# which pass Buffer objects directly rather than a raw pointer int). +# +# What you should see +# =================== +# A window of glowing neon filaments that grow, branch, and reorganize into a +# living transport network. Press 1/2/3 to switch behavior presets (different +# sensor geometry and turn speed give different morphologies), R to reseed the +# agents and clear the trail, and Escape to exit. The title shows the preset, +# agent count, and FPS. 
def setup_cuda():
    """Initialize device 0, build the kernels, and describe their launches.

    Exits the process with status 1 when the GPU's compute capability major
    version is below 3.

    Returns ``(device, stream, kernels, configs)``.  ``kernels`` and
    ``configs`` are dicts sharing the keys ``"move"``, ``"diffuse"`` and
    ``"colorize"``.  The move pass is a 1D launch over agents; diffuse and
    colorize share one 2D launch configuration over pixels.
    """

    def ceil_div(total, chunk):
        # Number of `chunk`-sized blocks needed to cover `total` items.
        return (total + chunk - 1) // chunk

    device = Device(0)
    device.set_current()

    # SurfaceObject requires surface load/store, which has existed since SM 2.0,
    # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+.
    capability = device.compute_capability
    if capability.major < 3:
        print(
            "This example requires a GPU with compute capability >= 3.0 for "
            f"bindless surface objects. Found sm_{capability.major}{capability.minor}.",
            file=sys.stderr,
        )
        sys.exit(1)

    stream = device.create_stream()

    # Compile as C++ so the templated tex2D overload resolves.
    kernel_names = {
        "move": "move_agents",
        "diffuse": "diffuse_decay",
        "colorize": "colorize",
    }
    program = Program(
        KERNEL_SOURCE,
        code_type="c++",
        options=ProgramOptions(std="c++17", arch=f"sm_{device.arch}"),
    )
    module = program.compile("cubin", name_expressions=tuple(kernel_names.values()))
    kernels = {key: module.get_kernel(symbol) for key, symbol in kernel_names.items()}

    # 1D launch over agents.
    agent_block = (256, 1, 1)
    agent_config = LaunchConfig(
        grid=(ceil_div(N_AGENTS, agent_block[0]), 1, 1),
        block=agent_block,
    )

    # 2D launch over pixels; one config object is shared by diffuse and colorize.
    pixel_block = (16, 16, 1)
    pixel_config = LaunchConfig(
        grid=(
            ceil_div(WIDTH, pixel_block[0]),
            ceil_div(HEIGHT, pixel_block[1]),
            1,
        ),
        block=pixel_block,
    )

    configs = {"move": agent_config, "diffuse": pixel_config, "colorize": pixel_config}
    return device, stream, kernels, configs
def create_pixel_buffer(gl, width, height):
    """Allocate an uninitialized RGBA8 Pixel Buffer Object sized for the image.

    The PBO is the buffer shared between CUDA and OpenGL: once registered with
    CUDA, a kernel can write pixels into it, and OpenGL can then use it as the
    source for a texture upload.

    The buffer is bound to GL_PIXEL_UNPACK_BUFFER only while its storage is
    allocated; the binding is reset to 0 before returning.

    Returns (pbo_gl_name, size_in_bytes).
    """
    bytes_per_pixel = 4  # RGBA, 1 byte per channel
    size_in_bytes = width * height * bytes_per_pixel
    target = gl.GL_PIXEL_UNPACK_BUFFER

    # glGenBuffers writes the new buffer name through the pointer.
    name = ctypes.c_uint(0)
    gl.glGenBuffers(1, ctypes.byref(name))
    pbo_id = name.value

    gl.glBindBuffer(target, pbo_id)
    # data=None: reserve storage without uploading anything from the host.
    gl.glBufferData(target, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
    gl.glBindBuffer(target, 0)

    return pbo_id, size_in_bytes
def main():
    """Run the Physarum demo.

    Sets up CUDA and the pyglet window, creates the GL display objects and the
    PBO, allocates the two ping-pong trail arrays with their texture/surface
    handles and the agent buffers, seeds the simulation, and then hands
    control to the pyglet event loop.  Three window event handlers do the work:

    * ``on_key_press`` -- Escape closes, R reseeds and clears, 1/2/3 pick a preset.
    * ``on_draw``      -- move/deposit, diffuse/decay (with swap), colorize, draw.
    * ``on_close``     -- release every CUDA object that was opened here.
    """
    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
    dev, stream, kernels, configs = setup_cuda()

    # --- Step 2: Open a window ---
    window, gl, pyglet = create_window()

    # --- Step 3: Create GL resources for drawing a texture to screen ---
    # (Standard OpenGL boilerplate -- not CUDA-specific.)
    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)

    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
    # The PBO is GPU memory owned by OpenGL. CUDA writes into it, OpenGL
    # reads from it.
    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)

    # --- Step 5: Register the PBO with CUDA ---
    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")

    # --- Step 6: Allocate the two ping-pong trail Arrays ---
    # Single-channel float with is_surface_load_store=True so they can be
    # bound as SurfaceObjects.
    #
    # API MAP -- the four cuda.core objects that drive this simulation:
    #   * CUDAArray (single-channel float, is_surface_load_store=True) is the
    #     backing storage of each trail map.
    #   * device Buffer (dev.allocate) holds raw agent state alongside the
    #     array/texture/surface stack.
    #   * TextureObject LINEAR+WRAP+normalized -> smooth, toroidal SENSE of the
    #     pheromone field.
    #   * SurfaceObject -> typed DEPOSIT writes into the same CUDAArray sensed
    #     as a texture (is_surface_load_store=True).
    arr_a, arr_b = make_trail_arrays()

    # --- Step 7: Pre-create the four bindless handles (once, kept alive) ---
    tex_a = make_texture(arr_a)
    tex_b = make_texture(arr_b)
    surf_a = SurfaceObject.from_array(arr_a)
    surf_b = SurfaceObject.from_array(arr_b)

    # --- Step 8: Allocate per-agent state in a plain device Buffer ---
    # Flat float32 [x, y, heading] * N. We stage host data through a
    # host-accessible pinned Buffer, then copy it into the device Buffer.
    # Both buffers are allocated once and reused on reseed.
    agent_floats = 3 * N_AGENTS
    agent_bytes = agent_floats * 4
    device_agents = dev.allocate(agent_bytes, stream=stream)
    pinned_mr = LegacyPinnedMemoryResource()
    pinned_agents = pinned_mr.allocate(agent_bytes)
    host_view = np.from_dlpack(pinned_agents).view(np.float32)

    # Host-side zero image reused to clear the trail arrays.
    zeros = np.zeros((WIDTH, HEIGHT), dtype=np.float32)

    # --- Step 9: Seed initial agents + clear the trail ---
    state = {"current": "a", "preset": DEFAULT_PRESET, "seed": 0, "frame": 0}
    reseed_agents(stream, device_agents, pinned_agents, host_view, seed=state["seed"])
    clear_trail(stream, arr_a, arr_b, zeros)
    stream.sync()  # ensure the seed copy finishes before the first launch reads it

    # --- Step 10: Render loop ---
    start_time = time.monotonic()
    frame_count = 0
    fps_time = start_time

    def current_tex_surf():
        """Return (tex, surf) for the CURRENT trail array (read + deposit)."""
        if state["current"] == "a":
            return tex_a, surf_a
        return tex_b, surf_b

    def diffuse_read_write():
        """Return (tex_read_current, surf_write_other, next_current)."""
        if state["current"] == "a":
            return tex_a, surf_b, "b"
        return tex_b, surf_a, "a"

    @window.event
    def on_key_press(symbol, _modifiers):
        key = pyglet.window.key
        if symbol == key.ESCAPE:
            window.close()
            return
        if symbol == key.R:
            state["seed"] += 1
            state["frame"] = 0
            # The pinned staging buffer is about to be overwritten on the
            # host. Drain the stream first so a still-pending copy from an
            # earlier reseed cannot read half-rewritten data.
            stream.sync()
            reseed_agents(stream, device_agents, pinned_agents, host_view, seed=state["seed"])
            clear_trail(stream, arr_a, arr_b, zeros)
            state["current"] = "a"
            return
        for digit_key, name in (
            (key._1, "1"),
            (key._2, "2"),
            (key._3, "3"),
        ):
            if symbol == digit_key:
                state["preset"] = name
                return

    @window.event
    def on_draw():
        nonlocal frame_count, fps_time

        window.clear()
        sensor_angle, sensor_dist, turn_speed, move_speed, decay, _label = PRESETS[state["preset"]]

        # (a) Move + deposit: 1D over agents. Reads and deposits into the
        #     CURRENT array (tex + surf of the same array).
        tex_cur, surf_cur = current_tex_surf()
        launch(
            stream,
            configs["move"],
            kernels["move"],
            device_agents,
            np.int32(N_AGENTS),
            np.uint64(tex_cur.handle),
            np.uint64(surf_cur.handle),
            np.int32(WIDTH),
            np.int32(HEIGHT),
            np.float32(sensor_angle),
            np.float32(sensor_dist),
            np.float32(turn_speed),
            np.float32(move_speed),
            np.float32(DEPOSIT),
            np.uint32(state["frame"]),
        )

        # (b) Diffuse + decay: 2D over pixels. Reads CURRENT, writes OTHER, swap.
        tex_read, surf_write, next_current = diffuse_read_write()
        launch(
            stream,
            configs["diffuse"],
            kernels["diffuse"],
            np.uint64(tex_read.handle),
            np.uint64(surf_write.handle),
            np.int32(WIDTH),
            np.int32(HEIGHT),
            np.float32(decay),
        )
        state["current"] = next_current

        # (c) Colorize the latest trail into the OpenGL PBO.
        tex_show = tex_a if state["current"] == "a" else tex_b
        with resource.map(stream=stream) as buf:
            launch(
                stream,
                configs["colorize"],
                kernels["colorize"],
                np.uint64(tex_show.handle),
                buf.handle,
                np.int32(WIDTH),
                np.int32(HEIGHT),
            )
        # Unmap happens automatically when the `with` block exits.

        # (d) Tell OpenGL to copy the PBO contents into our texture.
        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)

        # (e) Draw the texture to the screen.
        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)

        state["frame"] += 1

        # FPS counter (shown in window title)
        frame_count += 1
        now = time.monotonic()
        if now - fps_time >= 1.0:
            fps = frame_count / (now - fps_time)
            label = PRESETS[state["preset"]][5]
            window.set_caption(
                "cuda.core CUDAArray/Texture/Surface/Buffer - Physarum"
                f" [{label}] ({WIDTH}x{HEIGHT}, {N_AGENTS:,} agents, {fps:.0f} FPS)"
                " | Buffer(agents) + TextureObject[LINEAR|WRAP|norm] sense"
                " + SurfaceObject deposit"
            )
            frame_count = 0
            fps_time = now

    @window.event
    def on_close():
        # Launches and copies are asynchronous: wait for queued work to finish
        # so nothing on the stream still uses the objects released below.
        stream.sync()
        # Release everything we opened, in reverse order.
        resource.close()
        tex_a.close()
        tex_b.close()
        surf_a.close()
        surf_b.close()
        arr_a.close()
        arr_b.close()
        pinned_agents.close()
        device_agents.close(stream)
        stream.close()

    pyglet.app.run(interval=0)
+__device__ __forceinline__ unsigned int xorshift32(unsigned int s) { + s ^= s << 13; + s ^= s >> 17; + s ^= s << 5; + return s; +} + +extern "C" +__global__ +void move_agents(float* agents, + int n_agents, + cudaTextureObject_t tex, + cudaSurfaceObject_t surf, + int width, int height, + float sensor_angle, + float sensor_dist, + float turn_speed, + float move_speed, + float deposit, + unsigned int frame) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n_agents) return; + + int base = i * 3; + float x = agents[base + 0]; + float y = agents[base + 1]; + float heading = agents[base + 2]; + + float inv_w = 1.0f / (float)width; + float inv_h = 1.0f / (float)height; + + // Sample the trail at center / left / right of the heading. Normalized + // coords (+0.5 texel center) are required for WRAP addressing. + float ca = heading; + float la = heading - sensor_angle; + float ra = heading + sensor_angle; + + float cx = x + cosf(ca) * sensor_dist; + float cy = y + sinf(ca) * sensor_dist; + float lx = x + cosf(la) * sensor_dist; + float ly = y + sinf(la) * sensor_dist; + float rx = x + cosf(ra) * sensor_dist; + float ry = y + sinf(ra) * sensor_dist; + + float sc = tex2D(tex, (cx + 0.5f) * inv_w, (cy + 0.5f) * inv_h); + float sl = tex2D(tex, (lx + 0.5f) * inv_w, (ly + 0.5f) * inv_h); + float sr = tex2D(tex, (rx + 0.5f) * inv_w, (ry + 0.5f) * inv_h); + + // Per-agent jitter in [0, 1). + unsigned int rng = xorshift32(((unsigned int)i + 1u) * 2654435761u + frame * 40503u); + float jitter = (rng & 0xffffffu) / (float)0x1000000; + + // Steer toward the strongest sensor; random turn when ahead is ambiguous. + if (sc > sl && sc > sr) { + // keep going straight + } else if (sc < sl && sc < sr) { + // both sides better than center: turn randomly left or right + heading += (jitter < 0.5f ? 
-turn_speed : turn_speed); + } else if (sl > sr) { + heading -= turn_speed; + } else if (sr > sl) { + heading += turn_speed; + } else { + // tie: small random wiggle + heading += (jitter - 0.5f) * turn_speed; + } + + // Step forward and wrap around the toroidal world. + x += cosf(heading) * move_speed; + y += sinf(heading) * move_speed; + + float fw = (float)width; + float fh = (float)height; + if (x < 0.0f) x += fw; + if (x >= fw) x -= fw; + if (y < 0.0f) y += fh; + if (y >= fh) y -= fh; + + agents[base + 0] = x; + agents[base + 1] = y; + agents[base + 2] = heading; + + // Deposit pheromone at the new integer cell. surf2Dwrite x offset is in + // BYTES: single-channel float => x * sizeof(float). Concurrent agents may + // race on the same texel; that is acceptable for Physarum. + int ix = (int)x; + int iy = (int)y; + if (ix < 0) ix = 0; else if (ix >= width) ix = width - 1; + if (iy < 0) iy = 0; else if (iy >= height) iy = height - 1; + + float prev = surf2Dread(surf, ix * (int)sizeof(float), iy); + float val = prev + deposit; + if (val > 1.0f) val = 1.0f; + surf2Dwrite(val, surf, ix * (int)sizeof(float), iy); +} + +extern "C" +__global__ +void diffuse_decay(cudaTextureObject_t tex, + cudaSurfaceObject_t surf, + int width, int height, + float decay) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + float inv_w = 1.0f / (float)width; + float inv_h = 1.0f / (float)height; + float cx = (x + 0.5f) * inv_w; + float cy = (y + 0.5f) * inv_h; + + // 3x3 box blur via LINEAR neighbor taps; WRAP gives toroidal edges. 
+ float sum = 0.0f; + for (int dy = -1; dy <= 1; ++dy) { + for (int dx = -1; dx <= 1; ++dx) { + sum += tex2D(tex, cx + dx * inv_w, cy + dy * inv_h); + } + } + float blurred = sum * (1.0f / 9.0f); + + float out = blurred * decay; + if (out < 0.0f) out = 0.0f; + if (out > 1.0f) out = 1.0f; + + surf2Dwrite(out, surf, x * (int)sizeof(float), y); +} + +// HSV -> RGB (all components in [0, 1]). Standard six-sector conversion; used +// by colorize to turn the local trail-gradient direction into a hue. +__device__ __forceinline__ void hsv2rgb(float h, float s, float v, + float* r, float* g, float* b) { + h -= floorf(h); // wrap hue into [0, 1) + float hp = h * 6.0f; + int sector = (int)hp; + float f = hp - (float)sector; + float p = v * (1.0f - s); + float q = v * (1.0f - s * f); + float t = v * (1.0f - s * (1.0f - f)); + switch (sector % 6) { + case 0: *r = v; *g = t; *b = p; break; + case 1: *r = q; *g = v; *b = p; break; + case 2: *r = p; *g = v; *b = t; break; + case 3: *r = p; *g = q; *b = v; break; + case 4: *r = t; *g = p; *b = v; break; + default: *r = v; *g = p; *b = q; break; + } +} + +extern "C" +__global__ +void colorize(cudaTextureObject_t tex, + unsigned char* output, + int width, int height) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + float inv_w = 1.0f / (float)width; + float inv_h = 1.0f / (float)height; + float cx = (x + 0.5f) * inv_w; + float cy = (y + 0.5f) * inv_h; + + float v = tex2D(tex, cx, cy); + if (v < 0.0f) v = 0.0f; + if (v > 1.0f) v = 1.0f; + + // Local trail gradient from LINEAR+WRAP neighbor taps (toroidal, no edge + // special-casing). Its direction sets the HUE so the network is colored by + // the orientation of the veins instead of a single intensity ramp. 
+ float l = tex2D(tex, cx - inv_w, cy); + float rgt = tex2D(tex, cx + inv_w, cy); + float dn = tex2D(tex, cx, cy - inv_h); + float up = tex2D(tex, cx, cy + inv_h); + float gx = rgt - l; + float gy = up - dn; + float hue = atan2f(gy, gx) * (0.1591549f) + 0.5f; // atan2/(2*pi) + 0.5 -> [0,1) + + // Soft glow/bloom: a wider ring of taps lifts a luminous halo around the + // veins so they read as glowing rather than flat. Still WRAP-sampled. + float bloom = 0.0f; + bloom += tex2D(tex, cx - 2.0f * inv_w, cy); + bloom += tex2D(tex, cx + 2.0f * inv_w, cy); + bloom += tex2D(tex, cx, cy - 2.0f * inv_h); + bloom += tex2D(tex, cx, cy + 2.0f * inv_h); + bloom += l + rgt + dn + up; + bloom *= 0.125f; // average of the 8 surrounding taps + + // Intensity stays the dominant brightness driver so the reticular structure + // survives; gradient magnitude sharpens ridges into bright luminous veins. + float grad_mag = sqrtf(gx * gx + gy * gy); + float ridge = grad_mag * 6.0f; + if (ridge > 1.0f) ridge = 1.0f; + + // Saturation eases toward white on the brightest ridges (neon -> white-hot). + float sat = 1.0f - 0.45f * v; + + // Brightness: core intensity (gamma-lifted) + ridge boost + bloom halo. + float val = sqrtf(v) + 0.55f * ridge + 0.45f * bloom; + if (val > 1.0f) val = 1.0f; + + float r, g, b; + hsv2rgb(hue, sat, val, &r, &g, &b); + + // Lift the floor toward a deep blue-violet so empty space is not pure black, + // giving the glow something to bleed into. + r += 0.02f; + g += 0.0f; + b += 0.06f; + if (r > 1.0f) r = 1.0f; + if (g > 1.0f) g = 1.0f; + if (b > 1.0f) b = 1.0f; + + int idx = (y * width + x) * 4; + output[idx + 0] = (unsigned char)(r * 255.0f); + output[idx + 1] = (unsigned char)(g * 255.0f); + output[idx + 2] = (unsigned char)(b * 255.0f); + output[idx + 3] = 255; +} +""" + +# GLSL shaders -- these just display a texture on a fullscreen rectangle. +# Nothing CUDA-specific here. 
+ +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_reaction_diffusion.py b/cuda_core/examples/gl_interop_reaction_diffusion.py new file mode 100644 index 00000000000..2c53f39f641 --- /dev/null +++ b/cuda_core/examples/gl_interop_reaction_diffusion.py @@ -0,0 +1,727 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject +# in combination with GraphicsResource for CUDA/OpenGL interop. A Gray-Scott +# reaction-diffusion simulation is ping-ponged between two CUDA arrays each +# frame: a TextureObject provides smooth (LINEAR + WRAP) sampled reads, and a +# SurfaceObject provides typed writes. The final state is colorized straight +# into an OpenGL PBO. Requires pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to allocate a CUDA CUDAArray with `is_surface_load_store=True` so the same +# memory can be bound as both a TextureObject (for sampled reads) and a +# SurfaceObject (for typed writes). +# - How to use FilterMode.LINEAR + AddressMode.WRAP + normalized coordinates +# to get free hardware bilinear interpolation on a toroidal world. +# - How to compose CUDAArray/TextureObject/SurfaceObject with GraphicsResource so +# the entire simulation never leaves the GPU. 
#
# How it works
# ============
# Gray-Scott is a two-species (U, V) reaction-diffusion system. At each cell
# the rule is roughly:
#
#   du/dt = Du * laplacian(u) - u*v*v + F*(1 - u)
#   dv/dt = Dv * laplacian(v) + u*v*v - (F + k)*v
#
# Different choices of F and k yield strikingly different patterns: coral,
# mitosis, spots, and many more. We pack (U, V) into the two channels of a
# `float2` CUDAArray.
#
# PING-PONG (two arrays, swap each step)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#   +--------------+      tex2D        +--------------+
#   | arr_a        | ----------------> |              |
#   | (U, V) state |                   |  gray_scott  |
#   +--------------+                   |    kernel    |
#                                      |              |
#   +--------------+   surf2Dwrite     |              |
#   | arr_b        | <---------------- |              |
#   | (U, V) state |                   +--------------+
#   +--------------+
#        (swap)
#
# Each frame we do N_STEPS iterations of the kernel above, then run a separate
# `colorize` kernel that samples V from the final state and writes RGBA bytes
# straight into the OpenGL PBO via GraphicsResource. No data ever travels
# across the PCIe bus during the frame.
#
# Why LINEAR + WRAP + normalized coords?
# --------------------------------------
# Addressing modes WRAP and MIRROR are only supported with normalized
# coordinates (see the CUDA Programming Guide and the SDK's
# simplePitchLinearTexture sample). We use WRAP so that neighbor lookups at
# the image edge automatically wrap around -- i.e. a torus. LINEAR filtering
# is essentially free on the hardware. Because we sample at the texel center
# `(x + 0.5) / W`, every neighbor offset lands exactly on another texel
# center, so each fetch returns the stored value unchanged -- the same result
# POINT sampling would give. The filter only interpolates when a lookup falls
# between texel centers.
#
# Channel byte width in surf2Dwrite
# ---------------------------------
# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. For a
# `float2` surface that means `x * sizeof(float2)` = `x * 8`. Getting this
# wrong silently corrupts every other column.
def setup_cuda():
    """Build the Gray-Scott kernels and return (device, stream, kernels, configs).

    ``kernels`` and ``configs`` are dicts sharing the keys ``"step"``,
    ``"colorize"`` and ``"seed"``; every entry of ``configs`` is the same
    LaunchConfig object. Exits the process with status 1 when the device's
    compute-capability major version is below 3.
    """
    device = Device(0)
    device.set_current()

    # Surface load/store has existed since SM 2.0, but the bindless surface
    # objects (cuSurfObjectCreate) this example relies on need SM 3.0+.
    cc = device.compute_capability
    if cc.major < 3:
        print(
            "This example requires a GPU with compute capability >= 3.0 for "
            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
            file=sys.stderr,
        )
        sys.exit(1)

    stream = device.create_stream()

    # Dict key used by the rest of the example -> kernel symbol in KERNEL_SOURCE.
    entry_points = {
        "step": "gray_scott_step",
        "colorize": "colorize",
        "seed": "seed_initial",
    }

    # Compile as C++ so the templated tex2D overload resolves.
    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
    module = Program(KERNEL_SOURCE, code_type="c++", options=options).compile(
        "cubin",
        name_expressions=tuple(entry_points.values()),
    )
    kernels = {key: module.get_kernel(symbol) for key, symbol in entry_points.items()}

    # All three kernels are pixel-parallel over a WIDTH x HEIGHT grid, so one
    # launch configuration (ceil-divided into 16x16 thread blocks) serves all.
    threads = (16, 16, 1)
    blocks = (-(-WIDTH // threads[0]), -(-HEIGHT // threads[1]), 1)
    shared_config = LaunchConfig(grid=blocks, block=threads)
    configs = dict.fromkeys(kernels, shared_config)

    return device, stream, kernels, configs
+ + This sets up a shader program, a fullscreen quad, and an empty texture. + None of this is CUDA-specific -- it's standard OpenGL boilerplate for + rendering a textured quad. + + Returns (shader_program, vertex_array_id, texture_id). The shader_program + is a pyglet ShaderProgram object (must be kept alive). + """ + from pyglet.graphics.shader import Shader, ShaderProgram + + # Shader program -- just passes texture coordinates through + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + # Fullscreen quad (two triangles covering the entire window) + quad_verts = np.array( + [ + # x, y, s, t (position + texture coordinate) + -1, + -1, + 0, + 0, + 1, + -1, + 1, + 0, + 1, + 1, + 1, + 1, + -1, + -1, + 0, + 0, + 1, + 1, + 1, + 1, + -1, + 1, + 0, + 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 # 4 floats * 4 bytes each = 16 bytes per vertex + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + + gl.glBindVertexArray(0) + + # Empty texture (will be filled each frame from the PBO) + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR) + 
def create_pixel_buffer(gl, width, height):
    """Allocate an uninitialized RGBA8 Pixel Buffer Object for CUDA/GL sharing.

    The buffer is bound to GL_PIXEL_UNPACK_BUFFER while its storage is
    allocated (GL_DYNAMIC_DRAW, no initial data) and unbound again before
    returning, so the caller's GL binding state is left at 0.

    Returns (pbo_gl_name, size_in_bytes).
    """
    target = gl.GL_PIXEL_UNPACK_BUFFER

    buffer_name = ctypes.c_uint(0)
    gl.glGenBuffers(1, ctypes.byref(buffer_name))
    gl.glBindBuffer(target, buffer_name.value)

    bytes_per_pixel = 4  # RGBA, one byte per channel
    size_in_bytes = width * height * bytes_per_pixel
    gl.glBufferData(target, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)

    gl.glBindBuffer(target, 0)
    return buffer_name.value, size_in_bytes
def main():
    """Run the interactive Gray-Scott demo until the window is closed.

    Sets up CUDA and OpenGL, allocates the two ping-pong state arrays with
    their texture/surface handles, seeds the initial state, and enters the
    pyglet event loop. Every CUDA object created here is released exactly
    once -- after synchronizing the stream -- whether the user presses Escape
    or closes the window.
    """
    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
    _dev, stream, kernels, configs = setup_cuda()

    # --- Step 2: Open a window ---
    window, gl, pyglet = create_window()

    # --- Step 3: Create GL resources for drawing a texture to screen ---
    # (Standard OpenGL boilerplate -- not CUDA-specific.)
    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)

    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
    # The PBO is GPU memory owned by OpenGL. It's the bridge between the
    # two worlds: CUDA writes into it, OpenGL reads from it.
    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)

    # --- Step 5: Register the PBO with CUDA ---
    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")

    # --- Step 6: Allocate the two ping-pong state arrays ---
    # Both are `float2` (channel 0 = U, channel 1 = V) with
    # is_surface_load_store=True so they can be bound as SurfaceObjects.
    arr_a, arr_b = make_state_arrays()

    # --- Step 7: Pre-create the four bindless handles ---
    # Creating them once is much cheaper than recreating them every step.
    # We keep both a texture and a surface handle for each array; the
    # simulation loop just picks which pair to use.
    tex_a = make_texture(arr_a)
    tex_b = make_texture(arr_b)
    surf_a = SurfaceObject.from_array(arr_a)
    surf_b = SurfaceObject.from_array(arr_b)

    # --- Step 8: Seed the initial state into arr_a (writes via surf_a) ---
    seed_state(stream, kernels, configs, surf_a, seed_value=0)
    # After seeding, `arr_a` is the "current" state. "released" flips to True
    # once the CUDA objects have been closed so cleanup runs at most once.
    state = {"current": "a", "preset": DEFAULT_PRESET, "seed": 0, "released": False}

    # --- Step 9: Render loop ---
    frame_count = 0
    fps_time = time.monotonic()

    def current_read_write():
        """Return (texture to read, surface to write, name of the next current array)."""
        if state["current"] == "a":
            return tex_a, surf_b, "b"  # read a, write b, next current = b
        return tex_b, surf_a, "a"

    def release_resources():
        """Close every CUDA object opened above, in reverse order, exactly once."""
        if state["released"]:
            return
        state["released"] = True
        # `launch` is asynchronous: wait for any in-flight kernels before
        # destroying the texture/surface handles and arrays they reference.
        stream.sync()
        resource.close()
        tex_a.close()
        tex_b.close()
        surf_a.close()
        surf_b.close()
        arr_a.close()
        arr_b.close()
        stream.close()

    @window.event
    def on_key_press(symbol, _modifiers):
        key = pyglet.window.key
        if symbol == key.ESCAPE:
            # Release while the GL context still exists, then close the
            # window. NOTE(review): a programmatic window.close() is not
            # assumed to dispatch on_close, so cleanup is done here explicitly.
            release_resources()
            window.close()
            return
        if symbol == key.R:
            state["seed"] += 1
            seed_state(stream, kernels, configs, surf_a, seed_value=state["seed"])
            state["current"] = "a"
            return
        preset = {key._1: "1", key._2: "2", key._3: "3"}.get(symbol)
        if preset is not None:
            state["preset"] = preset

    @window.event
    def on_draw():
        nonlocal frame_count, fps_time

        if state["released"]:
            # The handles are gone; never launch kernels against them.
            return

        window.clear()
        f, k, _label = PRESETS[state["preset"]]

        # (a) Run N_STEPS Gray-Scott iterations. Each step reads from one
        #     array via a TextureObject (WRAP addressing makes the world
        #     toroidal) and writes to the other via a SurfaceObject.
        for _ in range(N_STEPS):
            tex_read, surf_write, next_current = current_read_write()
            launch(
                stream,
                configs["step"],
                kernels["step"],
                np.uint64(tex_read.handle),
                np.uint64(surf_write.handle),
                np.int32(WIDTH),
                np.int32(HEIGHT),
                np.float32(DU),
                np.float32(DV),
                np.float32(f),
                np.float32(k),
                np.float32(DT),
            )
            state["current"] = next_current

        # (b) Colorize the latest state into the OpenGL PBO.
        tex_read = tex_a if state["current"] == "a" else tex_b
        with resource.map(stream=stream) as buf:
            launch(
                stream,
                configs["colorize"],
                kernels["colorize"],
                np.uint64(tex_read.handle),
                buf.handle,
                np.int32(WIDTH),
                np.int32(HEIGHT),
            )
        # Unmap happens automatically when the `with` block exits.

        # (c) Tell OpenGL to copy the PBO contents into our texture.
        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)

        # (d) Draw the texture to the screen.
        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)

        # FPS counter (shown in window title), refreshed about once a second.
        frame_count += 1
        now = time.monotonic()
        if now - fps_time >= 1.0:
            fps = frame_count / (now - fps_time)
            label = PRESETS[state["preset"]][2]
            window.set_caption(
                "cuda.core CUDAArray/Texture/Surface - Gray-Scott"
                f" [{label}] ({WIDTH}x{HEIGHT}, {fps:.0f} FPS,"
                f" {N_STEPS} steps/frame)"
            )
            frame_count = 0
            fps_time = now

    @window.event
    def on_close():
        # Each of these objects is a context manager too, but pyglet owns the
        # event loop here so we release explicitly.
        release_resources()

    pyglet.app.run(interval=0)
+ float u = 1.0f; + float v = 0.0f; + + int half_w = width / 2; + int half_h = height / 2; + if (x >= half_w - 20 && x < half_w + 20 && + y >= half_h - 20 && y < half_h + 20) { + v = 1.0f; + // Knock U down a bit inside the seed square so V can grow. + u = 0.5f; + } + + // Cheap deterministic pseudo-random noise (xorshift on packed coords). + unsigned int h = (unsigned int)x * 374761393u + + (unsigned int)y * 668265263u + seed * 2246822519u; + h = (h ^ (h >> 13)) * 1274126177u; + h = h ^ (h >> 16); + float noise = (h & 0xffffu) / 65535.0f; // in [0, 1] + v += 0.02f * (noise - 0.5f); // small +/- jitter + if (v < 0.0f) v = 0.0f; + if (v > 1.0f) v = 1.0f; + + // float2 is 8 bytes; surf2Dwrite takes the x offset in BYTES. + surf2Dwrite(make_float2(u, v), surf, x * (int)sizeof(float2), y); +} + +extern "C" +__global__ +void gray_scott_step(cudaTextureObject_t tex, + cudaSurfaceObject_t surf, + int width, int height, + float Du, float Dv, + float F, float k_kill, + float dt) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + // Normalized coordinates: WRAP addressing only works in normalized mode. + // Each texel center sits at ((i + 0.5) / W, (j + 0.5) / H). + float inv_w = 1.0f / (float)width; + float inv_h = 1.0f / (float)height; + float cx = (x + 0.5f) * inv_w; + float cy = (y + 0.5f) * inv_h; + + // 5-point Laplacian stencil. LINEAR filtering does nothing extra here + // because the offsets land exactly on neighboring texel centers, but the + // toroidal WRAP at the boundary is essential for a periodic world. 
+ float2 c = tex2D(tex, cx, cy); + float2 l = tex2D(tex, cx - inv_w, cy); + float2 r = tex2D(tex, cx + inv_w, cy); + float2 u_n = tex2D(tex, cx, cy - inv_h); + float2 d_n = tex2D(tex, cx, cy + inv_h); + + float lap_u = (l.x + r.x + u_n.x + d_n.x) - 4.0f * c.x; + float lap_v = (l.y + r.y + u_n.y + d_n.y) - 4.0f * c.y; + + float u = c.x; + float v = c.y; + float uvv = u * v * v; + + float du = Du * lap_u - uvv + F * (1.0f - u); + float dv = Dv * lap_v + uvv - (F + k_kill) * v; + + float new_u = u + dt * du; + float new_v = v + dt * dv; + + // Clamp to keep things numerically sane after long runs. + if (new_u < 0.0f) new_u = 0.0f; + if (new_u > 1.0f) new_u = 1.0f; + if (new_v < 0.0f) new_v = 0.0f; + if (new_v > 1.0f) new_v = 1.0f; + + surf2Dwrite(make_float2(new_u, new_v), surf, + x * (int)sizeof(float2), y); +} + +extern "C" +__global__ +void colorize(cudaTextureObject_t tex, + unsigned char* output, + int width, int height) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x >= width || y >= height) return; + + float inv_w = 1.0f / (float)width; + float inv_h = 1.0f / (float)height; + float cx = (x + 0.5f) * inv_w; + float cy = (y + 0.5f) * inv_h; + + float2 c = tex2D(tex, cx, cy); + float v = c.y; + if (v < 0.0f) v = 0.0f; + if (v > 1.0f) v = 1.0f; + + // Three-stop "magma-ish" gradient: dark purple -> orange -> pale yellow. + // Implemented as two linear interpolations stitched together at v = 0.5 + // so the result is reasonably perceptually smooth without a lookup table. 
+ float r, g, b; + if (v < 0.5f) { + float t = v * 2.0f; // [0, 1] over the low half + r = 0.05f + t * (0.85f - 0.05f); + g = 0.02f + t * (0.30f - 0.02f); + b = 0.20f + t * (0.10f - 0.20f); + } else { + float t = (v - 0.5f) * 2.0f; // [0, 1] over the high half + r = 0.85f + t * (1.00f - 0.85f); + g = 0.30f + t * (0.95f - 0.30f); + b = 0.10f + t * (0.70f - 0.10f); + } + + int idx = (y * width + x) * 4; + output[idx + 0] = (unsigned char)(r * 255.0f); + output[idx + 1] = (unsigned char)(g * 255.0f); + output[idx + 2] = (unsigned char)(b * 255.0f); + output[idx + 3] = 255; +} +""" + +# GLSL shaders -- these just display a texture on a fullscreen rectangle. +# Nothing CUDA-specific here. + +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/gl_interop_sdf_volume.py b/cuda_core/examples/gl_interop_sdf_volume.py new file mode 100644 index 00000000000..20ecadb2244 --- /dev/null +++ b/cuda_core/examples/gl_interop_sdf_volume.py @@ -0,0 +1,843 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates cuda.core's 3D CUDAArray + trilinear TextureObject by +# baking a procedural Signed Distance Field (SDF) volume once at startup and +# then ray-marching it every frame to render an orbitable 3D scene. The +# SurfaceObject is used during the one-shot bake; the TextureObject (with +# LINEAR + CLAMP + normalized coords) drives the per-frame ray march. 
The +# whole pipeline stays on the GPU through GraphicsResource. Requires pyglet. +# +# ################################################################################ + +# What this example teaches +# ========================= +# - How to allocate a 3D cuda.core.CUDAArray (cuArray3DCreate under the hood) and +# bind it as both a SurfaceObject (for one-shot kernel writes) and a +# TextureObject (for hardware-accelerated trilinear sampling). +# - How to ray-march a baked SDF volume from a CUDA kernel, sampling via +# tex3D and writing pixels straight into an OpenGL PBO. +# - How to wire mouse + keyboard input into a pyglet/cuda.core interop loop. +# +# How it works +# ============ +# The signed distance field of a "gyroid intersected with a sphere" is baked +# once into a 128 x 128 x 128 single-channel float volume: +# +# gyroid(p) = sin(p.x*tau)cos(p.y*tau) +# + sin(p.y*tau)cos(p.z*tau) +# + sin(p.z*tau)cos(p.x*tau) +# sdf_gyroid = |gyroid(p)| - 0.20 # slab around the gyroid surface +# sdf_sphere = length(p) - 0.9 # bounding sphere +# sdf(p) = max(sdf_gyroid, sdf_sphere) # CSG intersection +# +# where p in [-1, 1]^3 is the voxel's world-space position. +# +# Each frame, the render kernel emits one ray per pixel from an orbiting +# camera, marches the volume in fixed voxel-sized steps (up to ~256), and on intersection +# computes a normal by central differences of tex3D, then applies a simple +# diffuse + ambient + specular shade. Misses fall back to a vertical sky +# gradient. +# +# STARTUP (one-shot bake) +# ~~~~~~~~~~~~~~~~~~~~~~~ +# 1. Allocate 3D CUDAArray (128^3, FLOAT32 x1, is_surface_load_store=True). +# 2. Bind it as a SurfaceObject. +# 3. Launch `bake_sdf`: one thread per voxel writes the SDF via surf3Dwrite. +# 4. Close the SurfaceObject; the CUDAArray stays alive. +# +# EACH FRAME +# ~~~~~~~~~~ +# 1. resource.map() -> CUDA device pointer into the OpenGL PBO. +# 2. Launch `render_sdf` (one thread per pixel). 
It samples the SDF via the +# long-lived TextureObject (LINEAR + CLAMP + normalized coords) using +# tex3D. RGBA8 lands directly in the PBO. +# 3. Unmap, GPU-side copy PBO -> texture, draw fullscreen quad. +# +# Controls +# ======== +# Left mouse drag orbit camera (dx -> yaw, dy -> pitch) +# Mouse wheel zoom (camera distance) +# R reset camera (yaw=0, pitch=0.3, dist=2.5) +# Escape / close quit +# +# The window title shows yaw, pitch, distance, FPS, and ms/frame. +# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + ArrayFormat, + CUDAArray, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + SurfaceObject, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Configuration (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 800 +HEIGHT = 600 +VOLUME_SIZE = 128 # 128^3 voxels; bake cost is one-shot. + +# Camera defaults / clamps. +RESET_YAW = 0.0 +RESET_PITCH = 0.3 +RESET_DIST = 2.5 +PITCH_MIN = -1.45 # stay inside (-pi/2, pi/2) so the up-vector stays sane. +PITCH_MAX = 1.45 +DIST_MIN = 1.2 +DIST_MAX = 8.0 + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA and OpenGL. If you're here to learn about +# 3D CUDAArray / TextureObject / SurfaceObject, skip ahead to main() -- the +# interesting part is there. These helpers exist so that main() reads like a +# short story instead of a wall of boilerplate. 
# ============================================================================


def _check_compute_capability(dev):
    """Exit with a friendly message unless `dev` is at least sm_30.

    3D arrays + bindless surface/texture objects require sm_30+.  On an older
    device a message is printed to stderr and the process exits with status 1.
    """
    cc = dev.compute_capability
    if cc.major < 3:
        print(
            f"This example requires compute capability >= 3.0, got sm_{cc.major}{cc.minor}.",
            file=sys.stderr,
        )
        sys.exit(1)


def setup_cuda():
    """Compile the two kernels and return (device, stream, kernels).

    `kernels` is a dict with the keys "bake" (bake_sdf) and "render"
    (render_sdf).  Device 0 is made current and checked for sm_30+ first.
    """
    dev = Device(0)
    dev.set_current()
    _check_compute_capability(dev)
    stream = dev.create_stream()

    # C++ is required so the templated tex3D<float> / surf3Dwrite
    # overloads resolve. extern "C" on the kernel symbols keeps the function
    # names unmangled even when the rest of the TU is compiled as C++.
    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
    mod = prog.compile(
        "cubin",
        name_expressions=("bake_sdf", "render_sdf"),
    )
    kernels = {
        "bake": mod.get_kernel("bake_sdf"),
        "render": mod.get_kernel("render_sdf"),
    }
    return dev, stream, kernels


def make_volume_array():
    """Allocate the 3D SDF volume. Single-channel float, surface-capable.

    The shape is VOLUME_SIZE^3; is_surface_load_store=True is what allows the
    bake kernel to write into it through a SurfaceObject.
    """
    return CUDAArray.from_descriptor(
        shape=(VOLUME_SIZE, VOLUME_SIZE, VOLUME_SIZE),
        format=ArrayFormat.FLOAT32,
        num_channels=1,
        is_surface_load_store=True,
    )


def make_volume_texture(arr):
    """Bind `arr` as a TextureObject configured for LINEAR + CLAMP + normalized.

    Normalized coords let the kernel sample as (u, v, w) in [0, 1]; CLAMP at
    the boundaries matches the rendering logic that bails out as soon as the
    march leaves the volume's [-1, 1]^3 box, so out-of-range sampling never
    pollutes a real hit.
    """
    res_desc = ResourceDescriptor.from_array(arr)
    tex_desc = TextureDescriptor(
        address_mode=AddressMode.CLAMP,
        filter_mode=FilterMode.LINEAR,
        read_mode=ReadMode.ELEMENT_TYPE,
        normalized_coords=True,
    )
    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)


def bake_volume(stream, kernels, arr):
    """Run the one-shot bake kernel that fills the volume with the SDF.

    The SurfaceObject lives only for the duration of this call; once the bake
    is enqueued and the kernel has captured the bindless handle into its
    arguments, we sync the stream before letting the SurfaceObject close.
    The CUDAArray itself outlives this scope -- it's the long-lived backing store
    for the render-loop TextureObject.
    """
    with SurfaceObject.from_array(arr) as bake_surf:
        block = (8, 8, 8)
        # Ceil-divide so the grid covers the volume even when VOLUME_SIZE is
        # not a multiple of the block edge; the kernel bounds-checks.
        grid = (
            (VOLUME_SIZE + block[0] - 1) // block[0],
            (VOLUME_SIZE + block[1] - 1) // block[1],
            (VOLUME_SIZE + block[2] - 1) // block[2],
        )
        launch(
            stream,
            LaunchConfig(grid=grid, block=block),
            kernels["bake"],
            np.uint64(bake_surf.handle),
            np.int32(VOLUME_SIZE),
        )
        # Synchronize before the SurfaceObject context exits so the bindless
        # handle is still valid while the kernel runs.
        stream.sync()


def create_window():
    """Open a pyglet window and return (window, gl_module, pyglet).

    pyglet is imported lazily so a missing install produces a readable
    message (and exit status 1) instead of an ImportError traceback.
    """
    try:
        import pyglet
        from pyglet.gl import gl as _gl
    except ImportError:
        print(
            "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet",
            file=sys.stderr,
        )
        sys.exit(1)

    window = pyglet.window.Window(
        WIDTH,
        HEIGHT,
        caption="cuda.core 3D CUDAArray - SDF Volume Ray-Marcher",
        vsync=False,
    )
    return window, _gl, pyglet


def create_display_resources(gl, width, height):
    """Standard GL boilerplate: shader, fullscreen quad, empty texture.

    Not CUDA-specific; identical to the other gl_interop_* examples.
    Returns (shader_program, vertex_array_id, texture_id).
    """
    from pyglet.graphics.shader import Shader, ShaderProgram

    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
    shader_prog = ShaderProgram(vert, frag)

    # Two triangles covering the whole viewport.
    # fmt: off
    quad_verts = np.array(
        [
            # x, y, s, t (position + texture coordinate)
            -1, -1, 0, 0,
            1, -1, 1, 0,
            1, 1, 1, 1,
            -1, -1, 0, 0,
            1, 1, 1, 1,
            -1, 1, 0, 1,
        ],
        dtype=np.float32,
    )
    # fmt: on

    vao = ctypes.c_uint(0)
    gl.glGenVertexArrays(1, ctypes.byref(vao))
    gl.glBindVertexArray(vao.value)

    vbo = ctypes.c_uint(0)
    gl.glGenBuffers(1, ctypes.byref(vbo))
    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
    gl.glBufferData(
        gl.GL_ARRAY_BUFFER,
        quad_verts.nbytes,
        quad_verts.ctypes.data_as(ctypes.c_void_p),
        gl.GL_STATIC_DRAW,
    )

    stride = 4 * 4  # 4 floats * 4 bytes each
    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
    gl.glEnableVertexAttribArray(pos_loc)
    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))

    # The texcoord pair starts after the two position floats (8 bytes in).
    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
    gl.glEnableVertexAttribArray(tc_loc)
    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))

    gl.glBindVertexArray(0)

    # Storage only (data pointer is None); filled each frame from the PBO.
    tex = ctypes.c_uint(0)
    gl.glGenTextures(1, ctypes.byref(tex))
    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
    gl.glTexImage2D(
        gl.GL_TEXTURE_2D,
        0,
        gl.GL_RGBA8,
        width,
        height,
        0,
        gl.GL_RGBA,
        gl.GL_UNSIGNED_BYTE,
        None,
    )

    return shader_prog, vao.value, tex.value


def create_pixel_buffer(gl, width, height):
    """Create a Pixel Buffer Object (PBO) -- the CUDA/GL bridge.

    Returns (pbo_gl_name, size_in_bytes).
    """
    pbo = ctypes.c_uint(0)
    gl.glGenBuffers(1, ctypes.byref(pbo))
    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
    nbytes = width * height * 4  # RGBA8
    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
    return pbo.value, nbytes


def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU)."""
    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
    # With a PBO bound to GL_PIXEL_UNPACK_BUFFER the final argument is an
    # offset into that buffer, so None means "start of the PBO".
    gl.glTexSubImage2D(
        gl.GL_TEXTURE_2D,
        0,
        0,
        0,
        width,
        height,
        gl.GL_RGBA,
        gl.GL_UNSIGNED_BYTE,
        None,
    )
    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)


def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
    """Draw the texture to the screen using the fullscreen quad."""
    gl.glUseProgram(shader_prog.id)
    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
    gl.glBindVertexArray(vao_id)
    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)
    gl.glBindVertexArray(0)
    gl.glUseProgram(0)


# ================================== main() ==================================


def main():
    """Bake the SDF volume once, then run the interactive ray-march loop."""
    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
    dev, stream, kernels = setup_cuda()

    # --- Step 2: Allocate the 3D SDF volume and bake it once ---
    # The CUDAArray is the long-lived backing store; it must outlive the
    # render loop. The SurfaceObject is only needed for the one-shot bake
    # and is closed before we ever bind a TextureObject to the same CUDAArray.
    arr = make_volume_array()
    bake_volume(stream, kernels, arr)

    # --- Step 3: Bind the volume as a trilinear TextureObject ---
    # LINEAR + CLAMP + normalized_coords gives us free hardware trilinear
    # filtering, which is exactly what we want for both the SDF samples
    # in the ray march and the normal-finite-difference samples.
    volume_tex = make_volume_texture(arr)

    # --- Step 4: Open a window and set up the CUDA/GL bridge ---
    window, gl, pyglet = create_window()
    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")

    # --- Step 5: Render loop state ---
    # Camera is orbit-style: yaw and pitch are angles, dist is the orbit
    # radius. The render kernel turns these into a (origin, basis) and
    # constructs per-pixel rays itself.
    cam = {
        "yaw": RESET_YAW,
        "pitch": RESET_PITCH,
        "dist": RESET_DIST,
    }
    # One-element lists act as mutable cells the nested handlers can update.
    frame_count = [0]
    fps_time = [time.monotonic()]
    last_fps = [0.0]
    last_frame_ms = [0.0]

    block = (16, 16, 1)
    grid = (
        (WIDTH + block[0] - 1) // block[0],
        (HEIGHT + block[1] - 1) // block[1],
        1,
    )
    config = LaunchConfig(grid=grid, block=block)

    @window.event
    def on_draw():
        window.clear()

        # (a) Map the PBO so CUDA can write into it.
        with resource.map(stream=stream) as buf:
            # (b) Launch the ray-march kernel. The camera params are passed
            #     as scalars; the kernel computes the orbit eye position and
            #     per-pixel ray direction itself.
            launch(
                stream,
                config,
                kernels["render"],
                buf.handle,
                np.int32(WIDTH),
                np.int32(HEIGHT),
                np.uint64(volume_tex.handle),
                np.float32(cam["yaw"]),
                np.float32(cam["pitch"]),
                np.float32(cam["dist"]),
            )
        # (c) Unmap happens automatically; cuGraphicsUnmapResources serializes
        #     the CUDA work against subsequent OpenGL use.

        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)

        frame_count[0] += 1
        now = time.monotonic()
        if now - fps_time[0] >= 0.5:
            last_fps[0] = frame_count[0] / (now - fps_time[0])
            last_frame_ms[0] = 1000.0 / last_fps[0] if last_fps[0] > 0 else 0.0
            frame_count[0] = 0
            fps_time[0] = now
            # Refresh the title together with the FPS sample (twice a second)
            # rather than on every frame.
            window.set_caption(
                "cuda.core 3D CUDAArray - SDF Volume Ray-Marcher "
                f"yaw={cam['yaw']:+.2f} pitch={cam['pitch']:+.2f} "
                f"dist={cam['dist']:.2f} "
                f"{last_fps[0]:.0f} FPS {last_frame_ms[0]:.2f} ms/frame"
            )

    @window.event
    def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
        # Left-click drag orbits the camera. dx -> yaw (sign convention chosen
        # so that dragging right rotates the scene right); dy -> pitch (drag
        # up tilts the camera up).
        if not (buttons & pyglet.window.mouse.LEFT):
            return
        orbit_scale = 0.005
        cam["yaw"] += dx * orbit_scale
        cam["pitch"] += dy * orbit_scale
        # Clamp pitch so the up-vector never flips (we use world-up (0,1,0)).
        if cam["pitch"] < PITCH_MIN:
            cam["pitch"] = PITCH_MIN
        elif cam["pitch"] > PITCH_MAX:
            cam["pitch"] = PITCH_MAX

    @window.event
    def on_mouse_scroll(_x, _y, _scroll_x, scroll_y):
        # Scroll wheel zoom: geometric so each tick feels uniform regardless
        # of current distance. Positive scroll_y (wheel up) zooms in.
        if scroll_y == 0:
            return
        cam["dist"] *= 0.9**scroll_y
        if cam["dist"] < DIST_MIN:
            cam["dist"] = DIST_MIN
        elif cam["dist"] > DIST_MAX:
            cam["dist"] = DIST_MAX

    @window.event
    def on_key_press(symbol, _modifiers):
        key = pyglet.window.key
        if symbol == key.ESCAPE:
            window.close()
        elif symbol == key.R:
            cam["yaw"] = RESET_YAW
            cam["pitch"] = RESET_PITCH
            cam["dist"] = RESET_DIST

    @window.event
    def on_close():
        # Release CUDA resources in reverse construction order. The GL
        # objects clean up via pyglet on window close.
        resource.close()
        volume_tex.close()
        arr.close()
        stream.close()

    pyglet.app.run(interval=0)


# ======================== GPU code (CUDA + GLSL) ============================
#
# Two CUDA C++ kernels are concatenated into one program string so they share
# a single NVRTC compile.
#
#   bake_sdf    -- one thread per voxel. Computes the SDF of an
#                  "abs(gyroid) - 0.20" surface intersected with a bounding
#                  sphere, then writes the scalar via surf3Dwrite. NOTE:
#                  surf3Dwrite's x coordinate is in BYTES, y and z in
#                  elements -- a classic CUDA gotcha.
#
#   render_sdf  -- one thread per screen pixel. Builds the orbit-camera ray,
#                  fixed-step-marches the volume via tex3D<float> on a
#                  trilinear-filtered, normalized-coord TextureObject, and
#                  shades the hit with diffuse + ambient + specular. Misses
#                  return a sky gradient. Writes RGBA8 directly into the
#                  OpenGL PBO.
#
# GLSL shaders at the very bottom just draw a textured quad. Nothing CUDA-
# specific there.
#
# ============================================================================

KERNEL_SOURCE = r"""
// --------------------------------------------------------------------------
// Small inline helpers. Keeping them __device__ __forceinline__ encourages
// the compiler to drop them inline and avoids any cross-TU linkage worries.
// --------------------------------------------------------------------------
__device__ __forceinline__ float clampf(float v, float a, float b) {
    return fminf(fmaxf(v, a), b);
}

__device__ __forceinline__ float dot3(float ax, float ay, float az,
                                      float bx, float by, float bz) {
    return ax * bx + ay * by + az * bz;
}

__device__ __forceinline__ float length3(float x, float y, float z) {
    return sqrtf(x * x + y * y + z * z);
}

// --------------------------------------------------------------------------
// bake_sdf: one thread per voxel writes the SDF of a gyroid-intersect-sphere
// into a single-channel float 3D CUDAArray via a SurfaceObject.
//
// surf is bound to a (size^3, FLOAT32 x 1) CUDAArray allocated with
// is_surface_load_store=True.
// surf3Dwrite's x coordinate is in BYTES (multiply by sizeof(float));
// y and z are in elements. Off-by-one on the byte conversion silently
// corrupts every other column, so it's worth flagging explicitly.
// --------------------------------------------------------------------------
extern "C" __global__
void bake_sdf(cudaSurfaceObject_t surf, int size) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int z = blockIdx.z * blockDim.z + threadIdx.z;
    if (x >= size || y >= size || z >= size) return;

    // Map the voxel index to world-space p in [-1, 1]^3 (texel centers).
    float fx = ((float)x + 0.5f) / (float)size;
    float fy = ((float)y + 0.5f) / (float)size;
    float fz = ((float)z + 0.5f) / (float)size;
    float px = fx * 2.0f - 1.0f;
    float py = fy * 2.0f - 1.0f;
    float pz = fz * 2.0f - 1.0f;

    // Gyroid frequency: 3 cycles across [-1, 1] gives a busy but not noisy
    // surface at 128^3 resolution. tau = 2 * pi * frequency.
    const float TAU = 6.2831853071795864f * 3.0f;

    float sx = sinf(px * TAU), cx = cosf(px * TAU);
    float sy = sinf(py * TAU), cy = cosf(py * TAU);
    float sz = sinf(pz * TAU), cz = cosf(pz * TAU);
    float gyroid = sx * cy + sy * cz + sz * cx;
    // Slab thickness: the gyroid SDF is non-Lipschitz (its gradient scales
    // with TAU ~= 19), so the stored values along the surface are dense but
    // unreliable as a true distance metric. A wider slab (0.20 vs the
    // canonical 0.05) gives the fixed-step ray marcher in render_sdf enough
    // hit candidates per ray to render real geometry instead of mostly sky.
    float sdf_gyroid = fabsf(gyroid) - 0.20f;        // slab around iso-zero
    float sdf_sphere = length3(px, py, pz) - 0.9f;   // bounding sphere
    float sdf = fmaxf(sdf_gyroid, sdf_sphere);       // CSG intersection

    // surf3Dwrite: x in BYTES (cast sizeof to int so 32-bit arithmetic works
    // even when x is large), y/z in elements.
    surf3Dwrite(sdf, surf, x * (int)sizeof(float), y, z);
}

// --------------------------------------------------------------------------
// SDF sampler: tex3D wants normalized coords in [0, 1]; the volume covers
// [-1, 1] in world space, so we remap with `(p + 1) * 0.5`. Returns the
// raw stored SDF (a signed distance in world units).
//
// The texture-object fetch is a template on its return type, which cannot
// be deduced from the arguments, so <float> must be spelled out.
// --------------------------------------------------------------------------
__device__ __forceinline__ float sample_sdf(cudaTextureObject_t tex,
                                            float px, float py, float pz) {
    return tex3D<float>(tex,
                        (px + 1.0f) * 0.5f,
                        (py + 1.0f) * 0.5f,
                        (pz + 1.0f) * 0.5f);
}

// --------------------------------------------------------------------------
// render_sdf: one thread per screen pixel. Builds the orbit camera, marches
// a ray through the SDF volume, and writes a shaded RGBA8 pixel to the PBO.
//
// Camera math (orbit, look-at origin, world-up (0, 1, 0)):
//   eye    = dist * (cos(pitch)*cos(yaw), sin(pitch), cos(pitch)*sin(yaw))
//   fwd    = normalize(target - eye)          (target = origin)
//   right  = normalize(cross(fwd, up))
//   up'    = cross(right, fwd)
//   For a pixel at (u, v) in NDC ([-1, 1] x [-1, 1] with v=1 at the top),
//   dir    = normalize(fwd + tan(fov/2) * (aspect * u * right + v * up'))
//
// Ray-march:
//   Fixed-step march: t += STEP, where STEP is set to roughly one voxel. The
//   gyroid SDF is non-Lipschitz, which makes classical sphere tracing
//   (t += sdf(p)) overshoot through thin slabs and miss almost every ray. A
//   uniform voxel-sized step is robust and cheap because the SDF is just a
//   tex3D lookup. We declare a HIT when sdf < HIT_EPS.
//
// Bounds bail: outside the [-1, 1]^3 box, return the sky.
// Normal: 6-sample central differences with eps ~ 1.5/VOLUME_SIZE so the
//         offsets are just over one voxel apart -- short enough to capture
//         local surface direction, long enough that trilinear filtering
//         actually moves the result.
// --------------------------------------------------------------------------
extern "C" __global__
void render_sdf(unsigned char* output,
                int width,
                int height,
                cudaTextureObject_t tex,
                float yaw,
                float pitch,
                float dist) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;

    // ---- Build the orbit camera basis ----------------------------------
    float cp = cosf(pitch), sp = sinf(pitch);
    float cy = cosf(yaw), sy = sinf(yaw);

    // Eye on a sphere of radius `dist` around the origin.
    float ex = dist * cp * cy;
    float ey = dist * sp;
    float ez = dist * cp * sy;

    // fwd = normalize(target - eye), target = origin -> fwd = -eye / |eye|.
    float fl = length3(ex, ey, ez);
    // Guard against the (clamped) dist being zero (not reachable, but cheap).
    if (fl < 1e-6f) fl = 1e-6f;
    float fx = -ex / fl, fy = -ey / fl, fz = -ez / fl;

    // right = normalize(cross(fwd, world_up)), world_up = (0, 1, 0).
    // cross((fx,fy,fz), (0,1,0)) = (fy*0 - fz*1, fz*0 - fx*0, fx*1 - fy*0)
    //                            = (-fz, 0, fx)
    float rx = -fz;
    float ry = 0.0f;
    float rz = fx;
    float rl = length3(rx, ry, rz);
    if (rl < 1e-6f) rl = 1e-6f;
    rx /= rl; ry /= rl; rz /= rl;

    // up' = cross(right, fwd). With right purely in the xz-plane, this is a
    // proper orthonormal up; recompute to keep the basis consistent.
    float ux = ry * fz - rz * fy;
    float uy = rz * fx - rx * fz;
    float uz = rx * fy - ry * fx;

    // ---- Per-pixel ray direction ---------------------------------------
    // NDC with v=1 at the TOP. With our PBO layout (y=0 written first ->
    // ends up at the bottom of the on-screen texture courtesy of the GL
    // shader's [0, 1] texcoord), v = 2*v_norm - 1 already maps row 0 of the
    // PBO to v = -1 (bottom of the image), which matches the camera's
    // up'-axis convention. No flip needed.
    float u_ndc = 2.0f * ((float)x + 0.5f) / (float)width - 1.0f;
    float v_ndc = 2.0f * ((float)y + 0.5f) / (float)height - 1.0f;

    const float FOV_Y = 0.7853981633974483f;      // 45 degrees
    const float TAN_HALF = 0.41421356237309515f;  // tanf(FOV_Y / 2)
    float aspect = (float)width / (float)height;

    float dx = fx + u_ndc * aspect * TAN_HALF * rx + v_ndc * TAN_HALF * ux;
    float dy = fy + u_ndc * aspect * TAN_HALF * ry + v_ndc * TAN_HALF * uy;
    float dz = fz + u_ndc * aspect * TAN_HALF * rz + v_ndc * TAN_HALF * uz;
    float dl = length3(dx, dy, dz);
    if (dl < 1e-6f) dl = 1e-6f;
    dx /= dl; dy /= dl; dz /= dl;

    // ---- Ray vs. the [-1, 1]^3 box (slab method) -----------------------
    // The camera always sits outside the volume (DIST_MIN >= 1.2 and the
    // orbit puts at least one component of the eye outside [-1, 1] for
    // typical framings), so we must first advance `t` to the AABB entry
    // before any in-volume sampling is meaningful. tNear is the entry
    // distance (clamped to >= 0 so we don't march backwards if the eye is
    // inside the box for some configuration); tFar is the exit distance.
    // If the slab interval is empty (tNear > tFar), the ray misses outright.
    float inv_dx = 1.0f / (fabsf(dx) > 1e-8f ? dx : (dx >= 0 ? 1e-8f : -1e-8f));
    float inv_dy = 1.0f / (fabsf(dy) > 1e-8f ? dy : (dy >= 0 ? 1e-8f : -1e-8f));
    float inv_dz = 1.0f / (fabsf(dz) > 1e-8f ? dz : (dz >= 0 ? 1e-8f : -1e-8f));
    float t1x = (-1.0f - ex) * inv_dx, t2x = ( 1.0f - ex) * inv_dx;
    float t1y = (-1.0f - ey) * inv_dy, t2y = ( 1.0f - ey) * inv_dy;
    float t1z = (-1.0f - ez) * inv_dz, t2z = ( 1.0f - ez) * inv_dz;
    float tNear = fmaxf(fmaxf(fminf(t1x, t2x), fminf(t1y, t2y)), fminf(t1z, t2z));
    float tFar  = fminf(fminf(fmaxf(t1x, t2x), fmaxf(t1y, t2y)), fmaxf(t1z, t2z));

    bool hit = false;
    float hx = 0.0f, hy = 0.0f, hz = 0.0f;

    if (tFar > fmaxf(tNear, 0.0f)) {
        // ---- Fixed-step march through the SDF volume from the AABB entry
        // Sphere tracing relies on a Lipschitz-1 SDF: the magnitude of the
        // sample tells you a safe distance you can step without crossing
        // the surface. But the gyroid SDF here, |sx*cy + sy*cz + sz*cx|
        // - 0.20, has a gradient scaling with TAU ~= 19, so the stored
        // magnitude vastly over-reports the true distance. Sphere tracing
        // would routinely overshoot thin slab regions, leaving most rays
        // missing geometry that's actually there. A fixed-step march is
        // cheap (the SDF is just a tex3D lookup) and robust: each step
        // advances by one voxel, so any positive crossing of the iso-zero
        // surface lands inside a thin window where HIT_EPS catches it.
        //
        // 2 worldspace units / 256 steps = ~0.008 / step, slightly under
        // one voxel at 128^3 resolution.
        const int MAX_STEPS = 256;
        const float STEP = 1.0f / 128.0f;
        const float HIT_EPS = 1.0e-3f;
        // Bias slightly inside the box so the very first sample isn't on
        // the boundary (CLAMP addressing makes the boundary sample valid,
        // but starting just inside avoids one wasted iteration).
        float t = fmaxf(tNear, 0.0f) + 1e-4f;
        float t_exit = tFar;

        #pragma unroll 1
        for (int i = 0; i < MAX_STEPS; ++i) {
            float pxw = ex + t * dx;
            float pyw = ey + t * dy;
            float pzw = ez + t * dz;

            float s = sample_sdf(tex, pxw, pyw, pzw);
            if (s < HIT_EPS) {
                hit = true;
                hx = pxw; hy = pyw; hz = pzw;
                break;
            }
            t += STEP;
            if (t > t_exit) break;
        }
    }

    // ---- Shade -----------------------------------------------------------
    float r, g, b;
    if (hit) {
        // Central-difference normal in world space. Each sample step is
        // ~1.17 voxels: short enough to capture local geometry, long enough
        // that trilinear filtering meaningfully moves the result.
        const float NEPS = 1.5f / 128.0f;
        float nx = sample_sdf(tex, hx + NEPS, hy, hz) -
                   sample_sdf(tex, hx - NEPS, hy, hz);
        float ny = sample_sdf(tex, hx, hy + NEPS, hz) -
                   sample_sdf(tex, hx, hy - NEPS, hz);
        float nz = sample_sdf(tex, hx, hy, hz + NEPS) -
                   sample_sdf(tex, hx, hy, hz - NEPS);
        float nl = length3(nx, ny, nz);
        if (nl < 1e-6f) nl = 1e-6f;
        nx /= nl; ny /= nl; nz /= nl;

        // Fixed key light (normalized world direction).
        const float LX =  0.5773502691896258f;  // (1,1,-1)/sqrt(3)
        const float LY =  0.5773502691896258f;
        const float LZ = -0.5773502691896258f;
        float diff = fmaxf(0.0f, dot3(nx, ny, nz, LX, LY, LZ));

        // Specular: Blinn-Phong half-vector exponent. View dir = -ray dir.
        float vx = -dx, vy = -dy, vz = -dz;
        float hx2 = LX + vx, hy2 = LY + vy, hz2 = LZ + vz;
        float hl = length3(hx2, hy2, hz2);
        if (hl < 1e-6f) hl = 1e-6f;
        hx2 /= hl; hy2 /= hl; hz2 /= hl;
        float ndoth = fmaxf(0.0f, dot3(nx, ny, nz, hx2, hy2, hz2));
        float spec = powf(ndoth, 32.0f);

        // Base albedo varies with the hit position so the gyroid lattice
        // reads as a single material with smooth variation, not flat plastic.
        float base_r = 0.55f + 0.30f * nx;
        float base_g = 0.50f + 0.30f * ny;
        float base_b = 0.70f + 0.30f * nz;

        const float AMBIENT = 0.18f;
        r = base_r * (AMBIENT + 0.82f * diff) + 0.6f * spec;
        g = base_g * (AMBIENT + 0.82f * diff) + 0.6f * spec;
        b = base_b * (AMBIENT + 0.82f * diff) + 0.7f * spec;
    } else {
        // Sky: dark blue at the top, near-black at the bottom. The PBO's row
        // 0 is the bottom of the on-screen image (see the v_ndc comment),
        // so we use the y coordinate of the ray direction (close to v_ndc
        // in screen space) for the gradient.
        float sky = 0.5f * (dy + 1.0f);  // [0, 1] roughly
        sky = clampf(sky, 0.0f, 1.0f);
        r = 0.02f + 0.06f * sky;
        g = 0.03f + 0.10f * sky;
        b = 0.05f + 0.20f * sky;
    }

    r = clampf(r, 0.0f, 1.0f);
    g = clampf(g, 0.0f, 1.0f);
    b = clampf(b, 0.0f, 1.0f);

    int idx = (y * width + x) * 4;
    output[idx + 0] = (unsigned char)(r * 255.0f);
    output[idx + 1] = (unsigned char)(g * 255.0f);
    output[idx + 2] = (unsigned char)(b * 255.0f);
    output[idx + 3] = 255;
}
"""

# GLSL shaders -- these just display a texture on a fullscreen rectangle.
# Nothing CUDA-specific here.

VERTEX_SHADER_SOURCE = """#version 330 core
in vec2 position;
in vec2 texcoord;
out vec2 v_texcoord;
void main() {
    gl_Position = vec4(position, 0.0, 1.0);
    v_texcoord = texcoord;
}
"""

FRAGMENT_SHADER_SOURCE = """#version 330 core
in vec2 v_texcoord;
out vec4 fragColor;
uniform sampler2D tex;
void main() {
    fragColor = texture(tex, v_texcoord);
}
"""


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# Patch metadata for the next file in this diff (kept verbatim as comments):
#
#   diff --git a/cuda_core/examples/gl_interop_texture_filter.py b/cuda_core/examples/gl_interop_texture_filter.py
#   new file mode 100644
#   index 00000000000..27c8bcb99fa
#   --- /dev/null
#   +++ b/cuda_core/examples/gl_interop_texture_filter.py
#   @@ -0,0 +1,625 @@
# ---------------------------------------------------------------------------
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

# ################################################################################
#
# This example demonstrates cuda.core.TextureObject hardware filtering by
# comparing FilterMode.POINT and FilterMode.LINEAR side by side on the same
# source CUDAArray. Requires pyglet.
#
# ################################################################################

# What this example teaches
# =========================
# How to back two TextureObjects with the SAME CUDAArray and observe the
# difference between POINT (nearest-texel) and LINEAR (bilinear) filtering
# under user-controlled zoom and pan. Also shows how the address mode
# (WRAP / CLAMP / MIRROR / BORDER) is baked into the texture descriptor at
# creation time, so changing it at runtime means rebuilding the textures.
#
# How it works
# ============
# A single 256x256 RGBA8 CUDAArray holds a procedurally-generated test pattern
# (high-contrast checkerboard, diagonals, gradient stripe). Two
# TextureObjects are built on top of that CUDAArray:
#
#                 CUDAArray (256x256 RGBA UINT8)
#                  /                         \
#           tex_point                     tex_linear
#     FilterMode.POINT                FilterMode.LINEAR
#     AddressMode.WRAP                AddressMode.WRAP
#     ReadMode.NORMALIZED_FLOAT       ReadMode.NORMALIZED_FLOAT
#
# Each frame, a single CUDA kernel runs over a 1024x512 OpenGL PBO:
#
#   - Left half of the screen samples tex_point.
#   - Right half samples tex_linear.
#   - Both halves use the same (zoom, pan) -> texture-space mapping, so the
#     two views show the same content with different filtering.
#   - A 2-pixel vertical white line marks the divider.
#
# Because ReadMode.NORMALIZED_FLOAT is used, tex2D() returns each
# channel as a float in [0, 1]; the kernel multiplies by 255 and writes
# unsigned bytes back into the PBO.
#
# The PBO is then copied to a GL texture and drawn on a fullscreen quad,
# identical to the plasma example.
+# +# What you should see +# =================== +# A 1024x512 window split down the middle. The left half (POINT) shows +# blocky / pixelated magnification; the right half (LINEAR) shows smooth +# bilinear interpolation. Drag with the left mouse button to pan, +# scroll to zoom, press M to cycle the texture address mode, press R to +# reset, Escape or close the window to exit. The current address mode +# and FPS are shown in the window title. +# + +# /// script +# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"] +# /// + +import ctypes +import sys +import time + +import numpy as np + +from cuda.core import ( + AddressMode, + ArrayFormat, + CUDAArray, + Device, + FilterMode, + GraphicsResource, + LaunchConfig, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + TextureDescriptor, + TextureObject, + launch, +) + +# --------------------------------------------------------------------------- +# Window and source-image dimensions (feel free to change these) +# --------------------------------------------------------------------------- +WIDTH = 1024 +HEIGHT = 512 +SRC_W = 256 +SRC_H = 256 + +# Address modes cycled by pressing the M key. +ADDRESS_MODES = ( + AddressMode.WRAP, + AddressMode.CLAMP, + AddressMode.MIRROR, + AddressMode.BORDER, +) + + +# ============================= Helper functions ============================= +# +# The functions below set up CUDA and OpenGL. If you're here to learn about +# TextureObject filtering, the most interesting parts are in main() and in +# make_pattern() / make_textures(); everything else is the same kind of +# CUDA-GL interop boilerplate used by gl_interop_plasma.py. +# ============================================================================ + + +def make_pattern(width, height): + """Build an RGBA8 test pattern that makes POINT vs LINEAR obvious. + + Layout (height, width, 4) of dtype uint8. Channels are R, G, B, A. 
+ The pattern contains: + - 8x8 black/white checkerboard (high-frequency) + - Two diagonal red lines (1px wide) + - Horizontal blue->green gradient strip near y = height/4 + - A pair of thin horizontal rectangles ("text-like" blocks) + """ + img = np.zeros((height, width, 4), dtype=np.uint8) + + # Checkerboard (black / white) at 8x8 cells. + ys = np.arange(height)[:, None] + xs = np.arange(width)[None, :] + cell = ((xs // 8) + (ys // 8)) & 1 + white = np.broadcast_to(cell[..., None].astype(np.uint8) * 255, (height, width, 3)) + img[..., :3] = white + img[..., 3] = 255 + + # Two diagonal red lines. + diag1 = xs == ys + diag2 = xs == (width - 1 - ys) + red_mask = diag1 | diag2 + img[red_mask] = (255, 0, 0, 255) + + # Horizontal gradient strip (blue -> green) ~ 8 rows tall at y ~ height/4. + g_y = height // 4 + g_h = max(4, height // 32) + grad = np.linspace(0, 255, width, dtype=np.uint8) + for row in range(g_y, min(g_y + g_h, height)): + img[row, :, 0] = 0 + img[row, :, 1] = grad # G ramps up + img[row, :, 2] = 255 - grad # B ramps down + img[row, :, 3] = 255 + + # Two "text-like" thin rectangles, alternating bright/dim. + def fill_rect(y0, y1, x0, x1, rgba): + img[y0:y1, x0:x1] = rgba + + bar_y = (3 * height) // 4 + fill_rect(bar_y, bar_y + 4, width // 8, (width * 3) // 8, (255, 255, 0, 255)) + fill_rect(bar_y + 8, bar_y + 12, (width * 5) // 8, (width * 7) // 8, (0, 255, 255, 255)) + + return np.ascontiguousarray(img) + + +def make_textures(array, address_mode): + """Build (tex_point, tex_linear) on the given CUDAArray with the given mode. + + The address mode is baked into the descriptor at cuTexObjectCreate time, so + we recreate both textures whenever the user cycles the mode. Caller owns + the returned objects and must close() them. 
+ """ + res_desc = ResourceDescriptor.from_array(array) + + point_desc = TextureDescriptor( + address_mode=address_mode, + filter_mode=FilterMode.POINT, + read_mode=ReadMode.NORMALIZED_FLOAT, + normalized_coords=False, + ) + linear_desc = TextureDescriptor( + address_mode=address_mode, + filter_mode=FilterMode.LINEAR, + read_mode=ReadMode.NORMALIZED_FLOAT, + normalized_coords=False, + ) + tex_point = TextureObject.from_descriptor(resource=res_desc, texture_descriptor=point_desc) + tex_linear = TextureObject.from_descriptor(resource=res_desc, texture_descriptor=linear_desc) + return tex_point, tex_linear + + +def setup_cuda(kernel_source): + """Compile the CUDA kernel and return (device, stream, kernel, launch_config).""" + dev = Device(0) + dev.set_current() + stream = dev.create_stream() + + # C++ compile so the templated tex2D overload resolves. + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(kernel_source, code_type="c++", options=program_options) + mod = prog.compile("cubin", name_expressions=("split_screen_sample",)) + kernel = mod.get_kernel("split_screen_sample") + + block = (16, 16, 1) + grid = ( + (WIDTH + block[0] - 1) // block[0], + (HEIGHT + block[1] - 1) // block[1], + 1, + ) + config = LaunchConfig(grid=grid, block=block) + return dev, stream, kernel, config + + +def create_window(): + """Open a pyglet window and return (window, gl_module, pyglet).""" + try: + import pyglet + from pyglet.gl import gl as _gl + except ImportError: + print( + "This example requires pyglet >= 2.0.\nInstall it with: pip install pyglet", + file=sys.stderr, + ) + sys.exit(1) + + window = pyglet.window.Window( + WIDTH, + HEIGHT, + caption="TextureObject Filter Comparison - POINT vs LINEAR", + vsync=False, + ) + return window, _gl, pyglet + + +def create_display_resources(gl, width, height): + """Create the GL objects needed to show a texture on screen. 
+ + Standard OpenGL boilerplate for a textured fullscreen quad, identical in + structure to the plasma example. Returns (shader_program, vao_id, tex_id). + """ + from pyglet.graphics.shader import Shader, ShaderProgram + + vert = Shader(VERTEX_SHADER_SOURCE, "vertex") + frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment") + shader_prog = ShaderProgram(vert, frag) + + # Fullscreen quad (two triangles). Each vertex: x, y, s, t. + quad_verts = np.array( + [ + -1, + -1, + 0, + 0, + 1, + -1, + 1, + 0, + 1, + 1, + 1, + 1, + -1, + -1, + 0, + 0, + 1, + 1, + 1, + 1, + -1, + 1, + 0, + 1, + ], + dtype=np.float32, + ) + + vao = ctypes.c_uint(0) + gl.glGenVertexArrays(1, ctypes.byref(vao)) + gl.glBindVertexArray(vao.value) + + vbo = ctypes.c_uint(0) + gl.glGenBuffers(1, ctypes.byref(vbo)) + gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value) + gl.glBufferData( + gl.GL_ARRAY_BUFFER, + quad_verts.nbytes, + quad_verts.ctypes.data_as(ctypes.c_void_p), + gl.GL_STATIC_DRAW, + ) + + stride = 4 * 4 + pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position") + gl.glEnableVertexAttribArray(pos_loc) + gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0)) + tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord") + gl.glEnableVertexAttribArray(tc_loc) + gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8)) + gl.glBindVertexArray(0) + + # Empty GL texture; filled each frame from the PBO. + tex = ctypes.c_uint(0) + gl.glGenTextures(1, ctypes.byref(tex)) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value) + # Use nearest filtering on the display texture so the example's own + # POINT/LINEAR comparison is not muddied by GL's sampler. 
def main():
    """Run the interactive POINT-vs-LINEAR texture filter comparison.

    Compiles the sampling kernel, opens a pyglet window, registers a GL pixel
    buffer with CUDA, uploads the test pattern into a CUDAArray, and then, on
    every frame, launches the kernel that samples that array through a POINT
    and a LINEAR TextureObject into the pixel buffer, which is then drawn.

    Controls: drag with the left mouse button to pan, scroll to zoom, M to
    cycle the address mode, R to reset the view, ESC to quit.

    Exits with status 1 when the device's compute capability is below 3.0.
    """
    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
    dev, stream, kernel, config = setup_cuda(KERNEL_SOURCE)

    # The hardware-texture path needs at least compute capability 3.x
    # (it's available essentially everywhere modern, but check anyway so the
    # failure is friendly).
    if dev.compute_capability.major < 3:
        print(
            f"This example requires compute capability >= 3.0, "
            f"got {dev.compute_capability.major}.{dev.compute_capability.minor}.",
            file=sys.stderr,
        )
        sys.exit(1)

    # --- Step 2: Open a window ---
    window, gl, pyglet = create_window()

    # --- Step 3: Create GL resources (shader, quad, display texture) ---
    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)

    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
    pbo_id, _nbytes = create_pixel_buffer(gl, WIDTH, HEIGHT)

    # --- Step 5: Register the PBO with CUDA ---
    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")

    # --- Step 6: Allocate the source CUDAArray and upload the test pattern ---
    # The CUDAArray lives for the entire program, so we use a `with` block.
    # Inside it we create / re-create two TextureObjects whenever the
    # user cycles the address mode.
    with CUDAArray.from_descriptor(
        shape=(SRC_W, SRC_H),
        format=ArrayFormat.UINT8,
        num_channels=4,
    ) as arr:
        pattern = make_pattern(SRC_W, SRC_H)
        # Sanity: 256 * 256 * 4 bytes = 262144.
        assert pattern.nbytes == arr.size_bytes, f"pattern bytes ({pattern.nbytes}) != array bytes ({arr.size_bytes})"
        arr.copy_from(pattern, stream=stream)
        stream.sync()  # upload must finish before kernel reads

        # --- Step 7: Build initial POINT + LINEAR textures (WRAP mode). ---
        # We can't use a `with` block here because the address mode is baked
        # into the descriptor at creation time: cycling modes means closing
        # and recreating these objects. We instead hold them in mutable
        # closure state and release them in on_close().
        tex_state = {
            "mode_idx": 0,
            "tex_point": None,
            "tex_linear": None,
        }

        def rebuild_textures():
            # Close previous textures (if any) before creating new ones so we
            # don't leak handles when cycling the address mode.
            if tex_state["tex_point"] is not None:
                tex_state["tex_point"].close()
            if tex_state["tex_linear"] is not None:
                tex_state["tex_linear"].close()
            mode = ADDRESS_MODES[tex_state["mode_idx"]]
            tp, tl = make_textures(arr, mode)
            tex_state["tex_point"] = tp
            tex_state["tex_linear"] = tl

        rebuild_textures()

        # --- Step 8: View state (zoom + pan), tight initial framing. ---
        # zoom = pixels_per_texel. zoom=3 -> roughly 3x magnification, which
        # makes POINT vs LINEAR obvious without any user input.
        view = {
            "zoom": 3.0,
            "pan_x": SRC_W * 0.5,
            "pan_y": SRC_H * 0.5,
            "drag": False,
        }

        def reset_view():
            view["zoom"] = 3.0
            view["pan_x"] = SRC_W * 0.5
            view["pan_y"] = SRC_H * 0.5

        # --- Step 9: Render loop ---
        start_time = time.monotonic()
        frame_count = 0
        fps_time = start_time

        def current_mode_name():
            return ADDRESS_MODES[tex_state["mode_idx"]].name

        @window.event
        def on_draw():
            nonlocal frame_count, fps_time
            window.clear()

            # (a) Map the PBO so CUDA can write to it.
            with resource.map(stream=stream) as buf:
                # (b) Launch the split-screen sampling kernel.
                launch(
                    stream,
                    config,
                    kernel,
                    np.uint64(tex_state["tex_point"].handle),
                    np.uint64(tex_state["tex_linear"].handle),
                    buf.handle,
                    np.int32(WIDTH),
                    np.int32(HEIGHT),
                    np.float32(view["zoom"]),
                    np.float32(view["pan_x"]),
                    np.float32(view["pan_y"]),
                    np.int32(SRC_W),
                    np.int32(SRC_H),
                )
            # (c) Unmap happens automatically when the `with` block exits.

            # (d) PBO -> GL texture (GPU-to-GPU).
            copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)

            # (e) Draw the texture to the screen.
            draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)

            # Refresh the caption (mode, zoom, FPS) roughly once per second.
            frame_count += 1
            now = time.monotonic()
            if now - fps_time >= 1.0:
                fps = frame_count / (now - fps_time)
                window.set_caption(
                    f"TextureObject Filter - POINT | LINEAR "
                    f"[address={current_mode_name()}, zoom={view['zoom']:.2f}x, "
                    f"{fps:.0f} FPS]"
                )
                frame_count = 0
                fps_time = now

        # --- Mouse: drag to pan, scroll to zoom ------------------------------
        @window.event
        def on_mouse_press(_x, _y, button, _modifiers):
            if button == pyglet.window.mouse.LEFT:
                view["drag"] = True

        @window.event
        def on_mouse_release(_x, _y, button, _modifiers):
            if button == pyglet.window.mouse.LEFT:
                view["drag"] = False

        @window.event
        def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
            if not (buttons & pyglet.window.mouse.LEFT):
                return
            # Pyglet dy is screen-up-positive; texture y is texel-down-positive.
            # One screen pixel = 1/zoom texels in source space.
            view["pan_x"] -= dx / view["zoom"]
            view["pan_y"] += dy / view["zoom"]

        @window.event
        def on_mouse_scroll(_x, _y, _scroll_x, scroll_y):
            # Geometric zoom; clamp to a sensible range.
            factor = 1.1**scroll_y
            new_zoom = view["zoom"] * factor
            view["zoom"] = max(0.1, min(32.0, new_zoom))

        # --- Keyboard: M cycles address mode, R resets view, ESC quits -------
        @window.event
        def on_key_press(symbol, _modifiers):
            key = pyglet.window.key
            if symbol == key.ESCAPE:
                # Quit through the on_close event instead of calling
                # window.close() directly, so the CUDA cleanup in on_close()
                # below runs before the window is torn down.
                # NOTE(review): relies on pyglet's documented behaviour that
                # the default on_close handler closes the window after our
                # handler returns -- confirm against the pinned pyglet version.
                window.dispatch_event("on_close")
                # Stop propagation so later handlers do not process ESC again.
                return pyglet.event.EVENT_HANDLED
            if symbol == key.M:
                tex_state["mode_idx"] = (tex_state["mode_idx"] + 1) % len(ADDRESS_MODES)
                rebuild_textures()
            elif symbol == key.R:
                reset_view()
            return None

        @window.event
        def on_close():
            # Release CUDA resources in reverse order of creation.
            if tex_state["tex_linear"] is not None:
                tex_state["tex_linear"].close()
                tex_state["tex_linear"] = None
            if tex_state["tex_point"] is not None:
                tex_state["tex_point"].close()
                tex_state["tex_point"] = None
            resource.close()

        pyglet.app.run(interval=0)
+ float src_x = pan_x + (local_x - (float)half_w * 0.5f) / zoom; + float src_y = pan_y + ((float)y - (float)h * 0.5f) / zoom; + + float4 sample; + if (x < half_w) { + sample = tex2D(point_tex, src_x, src_y); + } else { + sample = tex2D(linear_tex, src_x, src_y); + } + + int idx = (y * w + x) * 4; + out[idx + 0] = (unsigned char)(sample.x * 255.0f); + out[idx + 1] = (unsigned char)(sample.y * 255.0f); + out[idx + 2] = (unsigned char)(sample.z * 255.0f); + out[idx + 3] = (unsigned char)(sample.w * 255.0f); +} +""" + +VERTEX_SHADER_SOURCE = """#version 330 core +in vec2 position; +in vec2 texcoord; +out vec2 v_texcoord; +void main() { + gl_Position = vec4(position, 0.0, 1.0); + v_texcoord = texcoord; +} +""" + +FRAGMENT_SHADER_SOURCE = """#version 330 core +in vec2 v_texcoord; +out vec4 fragColor; +uniform sampler2D tex; +void main() { + fragColor = texture(tex, v_texcoord); +} +""" + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/texture_sample.py b/cuda_core/examples/texture_sample.py new file mode 100644 index 00000000000..78e9a463b89 --- /dev/null +++ b/cuda_core/examples/texture_sample.py @@ -0,0 +1,214 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This example demonstrates building a 2D CUDA CUDAArray, binding it as a +# bindless TextureObject, and sampling it from a kernel with both POINT-exact +# and LINEAR-interpolated coordinates. +# +# Texture coordinate convention (non-normalized): each texel (i, j) is centered +# at (i + 0.5, j + 0.5). So tex2D(tex, 0.5, 0.5) returns texel (0, 0) exactly, +# while tex2D(tex, 1.0, 0.5) returns the linear blend of texels (0, 0) and (1, 0). +# All test coordinates below are chosen with that half-pixel offset in mind. 
+# +# ################################################################################ + +# /// script +# dependencies = ["cuda_bindings", "cuda_core", "nvidia-cuda-nvrtc"] +# /// + +import numpy as np + +from cuda.core import ( + AddressMode, + ArrayFormat, + CUDAArray, + Device, + FilterMode, + LaunchConfig, + LegacyPinnedMemoryResource, + Program, + ProgramOptions, + ReadMode, + ResourceDescriptor, + TextureDescriptor, + TextureObject, + launch, +) + +# Kernel reads N (x, y) coordinates from `coords` (interleaved float pairs) and +# writes tex2D(tex, x, y) to out[i]. Compiled as C++ so the templated +# tex2D overload resolves. +code = r""" +extern "C" __global__ +void sample_texture(cudaTextureObject_t tex, + float *out, + const float *coords, + int n) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + float x = coords[2 * i + 0]; + float y = coords[2 * i + 1]; + out[i] = tex2D(tex, x, y); +} +""" + + +def main(): + dev = Device() + dev.set_current() + stream = dev.create_stream() + + pinned_mr = LegacyPinnedMemoryResource() + try: + # Allocate a 2D CUDAArray: shape=(W, H), single-channel float32. + # Note: CUDAArray.from_descriptor takes shape=(width, height), so the host + # buffer fed into copy_from must be laid out as H rows of W elements + # (row-major), i.e. host_pattern.shape == (H, W). + width, height = 16, 16 + with CUDAArray.from_descriptor( + shape=(width, height), + format=ArrayFormat.FLOAT32, + num_channels=1, + ) as arr: + # Plant a known pattern: pattern[y, x] = x + 100*y. + # Cast to float32 so the byte count matches the array's storage. + ys, xs = np.meshgrid( + np.arange(height, dtype=np.float32), + np.arange(width, dtype=np.float32), + indexing="ij", + ) + pattern = (xs + 100.0 * ys).astype(np.float32) + assert pattern.shape == (height, width) + arr.copy_from(pattern, stream=stream) + + # Build a linear-filtering, clamped, non-normalized texture. 
def _run_kernel_and_verify(dev, stream, tex, pattern, width, height, pinned_mr):
    """Sample ``tex`` at a fixed list of coordinates and check every result.

    Compiles the ``sample_texture`` kernel, hands it the coordinates through a
    pinned buffer, and compares each value it writes back with the value
    derived from ``pattern``. The pinned buffers allocated here are closed
    before returning, whether or not a check fails.
    """
    pinned = []  # buffers to release, in allocation order
    try:
        # Texel-center coordinates: expected to return the planted value
        # pattern[y - 0.5, x - 0.5] exactly.
        center_samples = [
            (0.5, 0.5),  # -> pattern[0, 0] = 0
            (3.5, 0.5),  # -> pattern[0, 3] = 3
            (0.5, 4.5),  # -> pattern[4, 0] = 400
            (7.5, 9.5),  # -> pattern[9, 7] = 907
            (15.5, 15.5),  # -> pattern[15, 15] = 1515
        ]
        # Coordinates that fall between texel centers: expected to equal the
        # weighted blend of the surrounding texels.
        half_samples = [
            (1.0, 0.5),  # texels (0, 0) and (1, 0) -> 0.5
            (0.5, 1.0),  # texels (0, 0) and (0, 1) -> 50.0
            (1.0, 1.0),  # 2x2 block at (0..1, 0..1) -> 50.5
            (4.0, 5.0),  # 2x2 block at (3..4, 4..5) -> 453.5
        ]
        n_center = len(center_samples)
        n = n_center + len(half_samples)
        # Interleaved x0, y0, x1, y1, ... as the kernel expects.
        xy = np.array(center_samples + half_samples, dtype=np.float32).reshape(-1)

        # The kernel reads the coordinates from, and writes its results to,
        # these pinned buffers directly; no explicit copies are issued here.
        coords_buf = pinned_mr.allocate(int(xy.nbytes))
        pinned.append(coords_buf)
        out_buf = pinned_mr.allocate(n * np.dtype(np.float32).itemsize)
        pinned.append(out_buf)
        np.from_dlpack(coords_buf).view(dtype=np.float32)[:] = xy
        out_view = np.from_dlpack(out_buf).view(dtype=np.float32)
        out_view[:] = 0.0

        # Compile the kernel as C++ (templated tex2D requires this).
        options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
        module = Program(code, code_type="c++", options=options).compile(
            "cubin", name_expressions=("sample_texture",)
        )
        kernel = module.get_kernel("sample_texture")

        threads = 64
        config = LaunchConfig(grid=(n + threads - 1) // threads, block=threads)
        # The texture handle is passed explicitly as a 64-bit unsigned value.
        launch(
            stream,
            config,
            kernel,
            np.uint64(tex.handle),
            out_buf,
            coords_buf,
            np.int32(n),
        )
        stream.sync()
        results = np.asarray(out_view)

        # Texel centers: compare with the planted value x + 100 * y.
        for i, (x, y) in enumerate(center_samples):
            expected = (x - 0.5) + 100.0 * (y - 0.5)
            got = float(results[i])
            assert np.isclose(got, expected, atol=1e-4), (
                f"center sample {i} at ({x}, {y}): expected {expected}, got {got}"
            )

        # Between-center samples: compare with the bilinear blend of the four
        # neighboring texels, with a looser tolerance than the exact case.
        for j, (x, y) in enumerate(half_samples):
            # Lower-left neighbor and the fractional offset from it.
            xi = int(np.floor(x - 0.5))
            yi = int(np.floor(y - 0.5))
            tx = (x - 0.5) - xi
            ty = (y - 0.5) - yi
            # Neighbors in (dy, dx) row-major order, indices clamped to the
            # array bounds.
            v00, v10, v01, v11 = [
                pattern[min(max(yi + dy, 0), height - 1), min(max(xi + dx, 0), width - 1)]
                for dy in (0, 1)
                for dx in (0, 1)
            ]
            expected = (1 - tx) * (1 - ty) * v00 + tx * (1 - ty) * v10 + (1 - tx) * ty * v01 + tx * ty * v11
            got = float(results[n_center + j])
            assert np.isclose(got, expected, atol=1e-2), (
                f"half sample {j} at ({x}, {y}): expected {expected}, got {got}"
            )

        print("Texture sampling example completed successfully.")
        print(f"  texel-center samples verified: {n_center}")
        print(f"  half-integer samples verified: {len(half_samples)}")
    finally:
        for buf in pinned:
            buf.close()
def test_array_2d_create_and_properties(init_cuda):
    """A 2D single-channel float32 array reports the metadata it was created with."""
    width, height = 32, 16
    arr = CUDAArray.from_descriptor(
        shape=(width, height),
        format=ArrayFormat.FLOAT32,
        num_channels=1,
    )
    try:
        # Geometry and element layout echo the creation arguments.
        assert arr.shape == (width, height)
        assert arr.format == ArrayFormat.FLOAT32
        assert arr.num_channels == 1
        # One float32 channel -> 4 bytes per element.
        assert arr.element_size == 4
        assert arr.size_bytes == width * height * 4
        # Surface load/store was not requested at creation.
        assert arr.is_surface_load_store is False
        assert arr.handle != 0
        assert isinstance(arr.device, Device)
    finally:
        arr.close()
8, 4), + format=ArrayFormat.UINT8, + num_channels=4, + is_surface_load_store=True, + ) + try: + assert arr.shape == (8, 8, 4) + assert arr.is_surface_load_store is True + assert arr.element_size == 4 + finally: + arr.close() + + +def test_array_rejects_bad_channels(init_cuda): + with pytest.raises(ValueError, match="num_channels"): + CUDAArray.from_descriptor(shape=(8,), format=ArrayFormat.UINT8, num_channels=3) + + +def test_array_rejects_bad_rank(init_cuda): + with pytest.raises(ValueError, match="shape rank"): + CUDAArray.from_descriptor(shape=(2, 2, 2, 2), format=ArrayFormat.UINT8, num_channels=1) + + +def test_array_roundtrip_copy(init_cuda): + import array as _array + + device = Device() + stream = device.create_stream() + arr = CUDAArray.from_descriptor(shape=(16,), format=ArrayFormat.UINT32, num_channels=1) + try: + src = _array.array("I", list(range(16))) + dst = _array.array("I", [0] * 16) + arr.copy_from(src, stream=stream) + arr.copy_to(dst, stream=stream) + stream.sync() + # Round-trip recovers data; src must not be mutated by copy_from. + assert list(dst) == list(range(16)) + assert list(src) == list(range(16)) + finally: + arr.close() + stream.close() + + +def test_array_copy_rejects_undersized_host_buffer(init_cuda): + import array as _array + + device = Device() + stream = device.create_stream() + arr = CUDAArray.from_descriptor(shape=(16,), format=ArrayFormat.UINT32, num_channels=1) + try: + # arr is 16 * 4 = 64 bytes; pass an 8-element (32-byte) host buffer. 
def test_array_copy_rejects_undersized_device_buffer(init_cuda):
    """copy_from / copy_to reject a device buffer smaller than the array.

    Each resource is acquired inside the cleanup scope of the one created
    before it, so a failure while creating the array or allocating the buffer
    still releases everything acquired up to that point.
    """
    device = Device()
    stream = device.create_stream()
    try:
        arr = CUDAArray.from_descriptor(shape=(16,), format=ArrayFormat.UINT32, num_channels=1)
        try:
            # arr is 64 bytes; allocate a 32-byte device buffer.
            small_buf = device.memory_resource.allocate(32, stream=device.default_stream)
            try:
                with pytest.raises(ValueError, match="smaller than the array extent"):
                    arr.copy_from(small_buf, stream=stream)
                with pytest.raises(ValueError, match="smaller than the array extent"):
                    arr.copy_to(small_buf, stream=stream)
            finally:
                small_buf.close()
        finally:
            arr.close()
    finally:
        stream.close()
def test_address_mode_normalization(init_cuda):
    """Address modes given as a scalar or a short tuple normalize to a 3-tuple."""
    from cuda.core._texture import _normalize_address_modes

    wrap, clamp, mirror = AddressMode.WRAP, AddressMode.CLAMP, AddressMode.MIRROR

    # (input, expected): a scalar is repeated for all three entries, a shorter
    # tuple is padded with its last entry, and a full tuple passes through.
    cases = [
        (wrap, (wrap, wrap, wrap)),
        ((wrap, clamp), (wrap, clamp, clamp)),
        ((wrap, clamp, mirror), (wrap, clamp, mirror)),
    ]
    for given, expected in cases:
        assert _normalize_address_modes(given) == expected

    # Smoke test: a 2-entry tuple is also accepted end-to-end.
    arr = CUDAArray.from_descriptor(shape=(8, 8, 4), format=ArrayFormat.FLOAT32, num_channels=1)
    try:
        res = ResourceDescriptor.from_array(arr)
        tex = TextureObject.from_descriptor(
            resource=res,
            texture_descriptor=TextureDescriptor(address_mode=(wrap, clamp)),
        )
        try:
            assert tex.handle != 0
        finally:
            tex.close()
    finally:
        arr.close()
def test_resource_descriptor_from_linear_rejects_oversize(init_cuda):
    """from_linear raises ValueError when size_bytes is larger than the buffer."""
    buffer_bytes = 1024
    requested_bytes = 2048  # twice the allocation

    device = Device()
    buf = _alloc_device_buffer(device, buffer_bytes)
    try:
        with pytest.raises(ValueError, match="exceeds buffer.size"):
            ResourceDescriptor.from_linear(
                buf,
                format=ArrayFormat.UINT8,
                num_channels=1,
                size_bytes=requested_bytes,
            )
    finally:
        buf.close()
def test_texture_object_from_linear(init_cuda):
    """A texture bound to a linear device buffer with a default descriptor
    gets a non-zero handle and keeps the resource descriptor it was given."""
    num_elements = 1024
    bytes_per_float = 4

    device = Device()
    buf = _alloc_device_buffer(device, num_elements * bytes_per_float)
    try:
        res = ResourceDescriptor.from_linear(buf, format=ArrayFormat.FLOAT32, num_channels=1)
        default_desc = TextureDescriptor()
        tex = TextureObject.from_descriptor(resource=res, texture_descriptor=default_desc)
        try:
            assert tex.handle != 0
            assert tex.resource is res
        finally:
            tex.close()
    finally:
        buf.close()
def test_surface_rejects_linear_and_pitch2d(init_cuda):
    """SurfaceObject.from_descriptor refuses linear and pitch2D resources."""
    device = Device()
    buf = _alloc_device_buffer(device, 4096)
    try:
        # Built lazily so each descriptor is created right before its check,
        # linear first and pitch2D second.
        make_descriptors = (
            lambda: ResourceDescriptor.from_linear(buf, format=ArrayFormat.UINT32, num_channels=1),
            lambda: ResourceDescriptor.from_pitch2d(
                buf,
                format=ArrayFormat.UINT8,
                num_channels=4,
                width=8,
                height=8,
                pitch_bytes=64,
            ),
        )
        for make_descriptor in make_descriptors:
            non_array_resource = make_descriptor()
            with pytest.raises(ValueError, match="array-backed"):
                SurfaceObject.from_descriptor(resource=non_array_resource)
    finally:
        buf.close()
test_mipmapped_array_get_level_zero_matches_shape(init_cuda): + shape = (64, 32) + mip = MipmappedArray.from_descriptor( + shape=shape, + format=ArrayFormat.UINT8, + num_channels=4, + num_levels=4, + ) + try: + lvl0 = mip.get_level(0) + try: + assert isinstance(lvl0, CUDAArray) + # Level 0 must match the base shape and rank. + assert lvl0.shape == shape + assert lvl0.format == ArrayFormat.UINT8 + assert lvl0.num_channels == 4 + assert lvl0.handle != 0 + finally: + lvl0.close() + finally: + mip.close() + + +def test_mipmapped_array_get_level_halves_dims(init_cuda): + shape = (64, 32) + num_levels = 4 + mip = MipmappedArray.from_descriptor( + shape=shape, + format=ArrayFormat.UINT8, + num_channels=1, + num_levels=num_levels, + ) + try: + for level in range(num_levels): + lvl = mip.get_level(level) + try: + # Each dim halves per level, with a floor of 1; rank is preserved. + expected = tuple(max(1, dim >> level) for dim in shape) + assert lvl.shape == expected, f"level={level}: expected {expected}, got {lvl.shape}" + finally: + lvl.close() + finally: + mip.close() + + +def test_mipmapped_array_get_level_out_of_range(init_cuda): + mip = MipmappedArray.from_descriptor( + shape=(16, 16), + format=ArrayFormat.UINT8, + num_channels=1, + num_levels=2, + ) + try: + with pytest.raises(ValueError, match="num_levels"): + mip.get_level(mip.num_levels) + with pytest.raises(ValueError, match=">= 0"): + mip.get_level(-1) + finally: + mip.close() + + +def test_mipmapped_array_rejects_zero_levels(init_cuda): + with pytest.raises(ValueError, match="num_levels"): + MipmappedArray.from_descriptor( + shape=(8, 8), + format=ArrayFormat.UINT8, + num_channels=1, + num_levels=0, + ) + + +def test_resource_descriptor_from_mipmapped_array(init_cuda): + mip = MipmappedArray.from_descriptor( + shape=(32, 16), + format=ArrayFormat.FLOAT32, + num_channels=1, + num_levels=3, + ) + try: + res = ResourceDescriptor.from_mipmapped_array(mip) + assert res.kind == "mipmapped_array" + assert res.source is 
mip + finally: + mip.close() + + +def test_resource_descriptor_from_mipmapped_array_rejects_non_mipmap(): + with pytest.raises(TypeError, match="MipmappedArray"): + ResourceDescriptor.from_mipmapped_array(object()) + + +def test_texture_object_from_mipmapped_array(init_cuda): + mip = MipmappedArray.from_descriptor( + shape=(32, 32), + format=ArrayFormat.FLOAT32, + num_channels=1, + num_levels=3, + ) + try: + res = ResourceDescriptor.from_mipmapped_array(mip) + # Use non-default mipmap params so the driver exercises that path. + tex_desc = TextureDescriptor( + address_mode=AddressMode.CLAMP, + filter_mode=FilterMode.LINEAR, + normalized_coords=True, + mipmap_filter_mode=FilterMode.LINEAR, + mipmap_level_bias=0.0, + min_mipmap_level_clamp=0.0, + max_mipmap_level_clamp=float(mip.num_levels - 1), + ) + tex = TextureObject.from_descriptor(resource=res, texture_descriptor=tex_desc) + try: + assert tex.handle != 0 + assert tex.resource is res + finally: + tex.close() + finally: + mip.close() + + +def test_surface_rejects_mipmapped_array(init_cuda): + mip = MipmappedArray.from_descriptor( + shape=(16, 16), + format=ArrayFormat.UINT8, + num_channels=4, + num_levels=2, + is_surface_load_store=True, + ) + try: + res = ResourceDescriptor.from_mipmapped_array(mip) + with pytest.raises(ValueError, match="array-backed"): + SurfaceObject.from_descriptor(resource=res) + finally: + mip.close() + + +def test_mipmapped_array_level_keeps_parent_alive(init_cuda): + """Dropping the local parent reference must not invalidate the level CUDAArray; + the level holds an internal strong ref back to the MipmappedArray. + + cdef classes don't natively support weakref, so we verify the parent + reference by inspecting the level CUDAArray's gc referents. 
+ """ + mip = MipmappedArray.from_descriptor( + shape=(16, 16), + format=ArrayFormat.UINT8, + num_channels=1, + num_levels=3, + ) + parent_id = id(mip) + lvl = mip.get_level(1) + # Drop our local reference and force GC; the parent must survive because + # the level CUDAArray holds a strong ref via the internal _parent_ref slot. + del mip + gc.collect() + + # The handle is still valid storage; the level still tracks the parent. + assert lvl.handle != 0 + referents = gc.get_referents(lvl) + parents = [r for r in referents if isinstance(r, MipmappedArray)] + assert len(parents) == 1, f"level CUDAArray should reference exactly one MipmappedArray parent, got {parents!r}" + assert id(parents[0]) == parent_id, "level CUDAArray's parent ref is not the original MipmappedArray" + # Closing the level drops its parent ref. Don't access the parent past + # this point; cuMipmappedArrayDestroy may then run. + lvl.close() + + +# --- Negative-path validation tests ------------------------------------------ + + +def test_array_from_descriptor_rejects_bad_format(init_cuda): + with pytest.raises(TypeError, match="format must be an ArrayFormat"): + CUDAArray.from_descriptor(shape=(8,), format=0, num_channels=1) + + +def test_array_from_descriptor_rejects_non_iterable_shape(init_cuda): + with pytest.raises(TypeError, match="shape must be a tuple"): + CUDAArray.from_descriptor(shape=8, format=ArrayFormat.UINT8, num_channels=1) + + +def test_array_from_descriptor_rejects_zero_dim(init_cuda): + with pytest.raises(ValueError, match=r"shape\[1\] must be >= 1"): + CUDAArray.from_descriptor(shape=(8, 0), format=ArrayFormat.UINT8, num_channels=1) + + +def test_array_copy_rejects_non_stream(init_cuda): + arr = CUDAArray.from_descriptor(shape=(8,), format=ArrayFormat.UINT8, num_channels=1) + try: + import array as _array + + buf = _array.array("B", [0] * 8) + with pytest.raises(TypeError, match="stream must be a Stream"): + arr.copy_from(buf, stream="not-a-stream") + with pytest.raises(TypeError, 
match="stream must be a Stream"): + arr.copy_to(buf, stream="not-a-stream") + finally: + arr.close() + + +def test_resource_descriptor_from_pitch2d_rejects_non_buffer(): + with pytest.raises(TypeError, match="buffer must be a Buffer"): + ResourceDescriptor.from_pitch2d( + object(), + format=ArrayFormat.UINT8, + num_channels=1, + width=8, + height=8, + pitch_bytes=64, + ) + + +def test_resource_descriptor_from_pitch2d_rejects_bad_format(init_cuda): + device = Device() + buf = _alloc_device_buffer(device, 4096) + try: + with pytest.raises(TypeError, match="format must be an ArrayFormat"): + ResourceDescriptor.from_pitch2d( + buf, + format=0, + num_channels=1, + width=8, + height=8, + pitch_bytes=64, + ) + finally: + buf.close() + + +def test_resource_descriptor_from_pitch2d_rejects_bad_channels(init_cuda): + device = Device() + buf = _alloc_device_buffer(device, 4096) + try: + with pytest.raises(ValueError, match="num_channels"): + ResourceDescriptor.from_pitch2d( + buf, + format=ArrayFormat.UINT8, + num_channels=3, + width=8, + height=8, + pitch_bytes=64, + ) + finally: + buf.close() + + +def test_resource_descriptor_from_pitch2d_rejects_zero_dims(init_cuda): + device = Device() + buf = _alloc_device_buffer(device, 4096) + try: + with pytest.raises(ValueError, match="width"): + ResourceDescriptor.from_pitch2d( + buf, + format=ArrayFormat.UINT8, + num_channels=1, + width=0, + height=8, + pitch_bytes=64, + ) + with pytest.raises(ValueError, match="height"): + ResourceDescriptor.from_pitch2d( + buf, + format=ArrayFormat.UINT8, + num_channels=1, + width=8, + height=0, + pitch_bytes=64, + ) + finally: + buf.close() + + +def test_mipmapped_array_rejects_bad_format(init_cuda): + with pytest.raises(TypeError, match="format must be an ArrayFormat"): + MipmappedArray.from_descriptor(shape=(8, 8), format=0, num_channels=1, num_levels=2) + + +def test_mipmapped_array_rejects_bad_channels(init_cuda): + with pytest.raises(ValueError, match="num_channels"): + 
MipmappedArray.from_descriptor(shape=(8, 8), format=ArrayFormat.UINT8, num_channels=3, num_levels=2) + + +def test_mipmapped_array_rejects_zero_dim(init_cuda): + with pytest.raises(ValueError, match=r"shape\[0\] must be >= 1"): + MipmappedArray.from_descriptor(shape=(0, 8), format=ArrayFormat.UINT8, num_channels=1, num_levels=1) + + +def test_texture_object_rejects_non_resource_descriptor(init_cuda): + with pytest.raises(TypeError, match="resource must be a ResourceDescriptor"): + TextureObject.from_descriptor(resource=object(), texture_descriptor=TextureDescriptor()) + + +def test_texture_object_rejects_non_texture_descriptor(init_cuda): + arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1) + try: + res = ResourceDescriptor.from_array(arr) + with pytest.raises(TypeError, match="texture_descriptor must be a TextureDescriptor"): + TextureObject.from_descriptor(resource=res, texture_descriptor="nope") + finally: + arr.close() + + +def test_texture_object_rejects_bad_filter_mode(init_cuda): + arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1) + try: + res = ResourceDescriptor.from_array(arr) + td = TextureDescriptor(filter_mode=0) # int, not FilterMode + with pytest.raises(TypeError, match="filter_mode must be a FilterMode"): + TextureObject.from_descriptor(resource=res, texture_descriptor=td) + finally: + arr.close() + + +def test_texture_object_rejects_bad_read_mode(init_cuda): + arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1) + try: + res = ResourceDescriptor.from_array(arr) + td = TextureDescriptor(read_mode=0) # int, not ReadMode + with pytest.raises(TypeError, match="read_mode must be a ReadMode"): + TextureObject.from_descriptor(resource=res, texture_descriptor=td) + finally: + arr.close() + + +def test_texture_object_rejects_bad_mipmap_filter_mode(init_cuda): + arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, 
num_channels=1) + try: + res = ResourceDescriptor.from_array(arr) + td = TextureDescriptor(mipmap_filter_mode=0) # int, not FilterMode + with pytest.raises(TypeError, match="mipmap_filter_mode must be a FilterMode"): + TextureObject.from_descriptor(resource=res, texture_descriptor=td) + finally: + arr.close() + + +def test_texture_object_rejects_negative_anisotropy(init_cuda): + arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1) + try: + res = ResourceDescriptor.from_array(arr) + td = TextureDescriptor(max_anisotropy=-1) + with pytest.raises(ValueError, match="max_anisotropy"): + TextureObject.from_descriptor(resource=res, texture_descriptor=td) + finally: + arr.close() + + +def test_texture_object_rejects_bad_border_color_length(init_cuda): + arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1) + try: + res = ResourceDescriptor.from_array(arr) + td = TextureDescriptor(border_color=(0.0, 0.0)) # length 2, not 4 + with pytest.raises(ValueError, match="border_color must have 4"): + TextureObject.from_descriptor(resource=res, texture_descriptor=td) + finally: + arr.close() + + +def test_address_mode_rejects_non_addressmode_scalar(init_cuda): + arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1) + try: + res = ResourceDescriptor.from_array(arr) + td = TextureDescriptor(address_mode=42) # int, not AddressMode / iterable + with pytest.raises(TypeError, match="address_mode"): + TextureObject.from_descriptor(resource=res, texture_descriptor=td) + finally: + arr.close() + + +def test_address_mode_rejects_empty_tuple(init_cuda): + arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1) + try: + res = ResourceDescriptor.from_array(arr) + td = TextureDescriptor(address_mode=()) + with pytest.raises(ValueError, match="address_mode tuple must have 1-3"): + TextureObject.from_descriptor(resource=res, texture_descriptor=td) + finally: 
+ arr.close() + + +def test_address_mode_rejects_too_long_tuple(init_cuda): + arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1) + try: + res = ResourceDescriptor.from_array(arr) + td = TextureDescriptor(address_mode=(AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP)) + with pytest.raises(ValueError, match="address_mode tuple must have 1-3"): + TextureObject.from_descriptor(resource=res, texture_descriptor=td) + finally: + arr.close() + + +def test_address_mode_rejects_non_addressmode_entry(init_cuda): + arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1) + try: + res = ResourceDescriptor.from_array(arr) + td = TextureDescriptor(address_mode=(AddressMode.WRAP, "bad", AddressMode.CLAMP)) + with pytest.raises(TypeError, match=r"address_mode\[1\]"): + TextureObject.from_descriptor(resource=res, texture_descriptor=td) + finally: + arr.close() + + +def test_texture_object_keeps_backing_array_alive(init_cuda): + """Dropping the local references to the backing CUDAArray and the + ResourceDescriptor must NOT invalidate an existing TextureObject. The + TextureObject holds a strong ref through its _source_ref slot.""" + arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1) + res = ResourceDescriptor.from_array(arr) + tex = TextureObject.from_descriptor(resource=res, texture_descriptor=TextureDescriptor()) + # Verify the keepalive chain via gc referents: TextureObject -> _source_ref + # -> ResourceDescriptor -> _source -> CUDAArray. We can only walk one level + # at a time, so check tex's referents include the ResourceDescriptor. 
+ arr_id = id(arr) + res_id = id(res) + del arr, res + gc.collect() + + referents = gc.get_referents(tex) + res_refs = [r for r in referents if id(r) == res_id] + assert len(res_refs) == 1, ( + f"TextureObject should still reference the ResourceDescriptor; got referents {referents!r}" + ) + res_back = res_refs[0] + arr_refs = [r for r in gc.get_referents(res_back) if id(r) == arr_id] + assert len(arr_refs) == 1, "ResourceDescriptor should still reference its CUDAArray" + + # tex.handle should still be valid (non-zero). + assert tex.handle != 0 + tex.close() + + +def test_surface_object_keeps_backing_array_alive(init_cuda): + arr = CUDAArray.from_descriptor( + shape=(8, 8), + format=ArrayFormat.UINT8, + num_channels=4, + is_surface_load_store=True, + ) + surf = SurfaceObject.from_array(arr) + arr_id = id(arr) + del arr + gc.collect() + + # The surface keeps the ResourceDescriptor alive, which keeps the CUDAArray + # alive. We verify the chain end-to-end the same way as the texture case. + referents = gc.get_referents(surf) + res_objs = [r for r in referents if isinstance(r, ResourceDescriptor)] + assert len(res_objs) == 1 + arr_refs = [r for r in gc.get_referents(res_objs[0]) if id(r) == arr_id] + assert len(arr_refs) == 1, "SurfaceObject should still reference its backing CUDAArray via the ResourceDescriptor" + assert surf.handle != 0 + surf.close()