diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
index 33ba03c9c2d..0db06b05e14 100644
--- a/cuda_core/cuda/core/__init__.py
+++ b/cuda_core/cuda/core/__init__.py
@@ -69,6 +69,7 @@ class _PatchedProperty(metaclass=_PatchedPropMeta):
 
 
 from cuda.core import checkpoint, system, utils
+from cuda.core._array import ArrayFormat, CUDAArray
 from cuda.core._context import Context, ContextOptions
 from cuda.core._device import Device
 from cuda.core._device_resources import (
@@ -99,6 +100,7 @@ class _PatchedProperty(metaclass=_PatchedPropMeta):
     VirtualMemoryResource,
     VirtualMemoryResourceOptions,
 )
+from cuda.core._mipmapped_array import MipmappedArray
 from cuda.core._module import Kernel, ObjectCode
 from cuda.core._program import Program, ProgramOptions
 from cuda.core._stream import (
@@ -107,7 +109,16 @@ class _PatchedProperty(metaclass=_PatchedPropMeta):
     Stream,
     StreamOptions,
 )
+from cuda.core._surface import SurfaceObject
 from cuda.core._tensor_map import TensorMapDescriptor, TensorMapDescriptorOptions
+from cuda.core._texture import (
+    AddressMode,
+    FilterMode,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+)
 
 # isort: split
 # Must come after the cuda.core._* extension imports above: loading graph
diff --git a/cuda_core/cuda/core/_array.pxd b/cuda_core/cuda/core/_array.pxd
new file mode 100644
index 00000000000..461204e7f56
--- /dev/null
+++ b/cuda_core/cuda/core/_array.pxd
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from cuda.bindings cimport cydriver
+
+
+cdef class CUDAArray:
+
+    cdef:
+        cydriver.CUarray _handle     # driver handle; reset to NULL by close()
+        tuple _shape                 # (w,), (w, h), or (w, h, d), in elements
+        cydriver.CUarray_format _format  # per-channel element format
+        unsigned int _num_channels   # 1, 2, or 4
+        int _device_id               # ordinal used to build the Device returned by .device
+        bint _owning                 # when set, close()/__dealloc__ call cuArrayDestroy
+        bint _surface_load_store     # mirrors the CUDA_ARRAY3D_SURFACE_LDST flag
+        # Optional strong reference to a parent owner (e.g. a MipmappedArray
+        # whose level this CUDAArray views). When set, the parent must outlive
+        # this CUDAArray because the underlying CUarray belongs to the parent.
+        object _parent_ref
+
+    cpdef close(self)
diff --git a/cuda_core/cuda/core/_array.pyi b/cuda_core/cuda/core/_array.pyi
new file mode 100644
index 00000000000..61ec023a4b0
--- /dev/null
+++ b/cuda_core/cuda/core/_array.pyi
@@ -0,0 +1,156 @@
+# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_array.pyx
+
+from __future__ import annotations
+
+from enum import IntEnum
+
+from cuda.bindings import cydriver
+
+
+class ArrayFormat(IntEnum):
+    """Element format for a :class:`CUDAArray` allocation.
+
+    Mirrors ``CUarray_format`` from the CUDA driver API.
+    """
+    UINT8 = cydriver.CU_AD_FORMAT_UNSIGNED_INT8  # 1 byte per channel
+    UINT16 = cydriver.CU_AD_FORMAT_UNSIGNED_INT16  # 2 bytes per channel
+    UINT32 = cydriver.CU_AD_FORMAT_UNSIGNED_INT32  # 4 bytes per channel
+    INT8 = cydriver.CU_AD_FORMAT_SIGNED_INT8  # 1 byte per channel
+    INT16 = cydriver.CU_AD_FORMAT_SIGNED_INT16  # 2 bytes per channel
+    INT32 = cydriver.CU_AD_FORMAT_SIGNED_INT32  # 4 bytes per channel
+    FLOAT16 = cydriver.CU_AD_FORMAT_HALF  # 2 bytes per channel
+    FLOAT32 = cydriver.CU_AD_FORMAT_FLOAT  # 4 bytes per channel
+
+class CUDAArray:
+    """An opaque, hardware-laid-out GPU allocation for texture/surface access.
+
+    Distinct from :class:`Buffer`: a ``CUarray`` has no exposed device pointer
+    and can only be accessed from kernels through a :class:`TextureObject` or
+    :class:`SurfaceObject`. Its memory layout is chosen by the driver for 2D/3D
+    spatial locality.
+
+    Construct via :meth:`from_descriptor`. Only plain 1D/2D/3D allocations are
+    supported in this initial version; layered/cubemap/sparse variants will
+    follow once their shape semantics are settled.
+    """
+
+    def close(self):
+        """Destroy the underlying ``CUarray`` if owned by this object; a repeated call is a no-op."""
+
+    def __init__(self, *args, **kwargs):  # always raises RuntimeError; use from_descriptor()
+        ...
+
+    @classmethod
+    def from_descriptor(cls, *, shape, format, num_channels, is_surface_load_store=False):
+        """Allocate a new CUDA array.
+
+        Parameters
+        ----------
+        shape : tuple of int
+            ``(width,)``, ``(width, height)``, or ``(width, height, depth)``
+            in elements.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        is_surface_load_store : bool
+            If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so the array
+            can be bound as a :class:`SurfaceObject` for kernel-side writes.
+            Default False.
+
+        Returns
+        -------
+        CUDAArray
+        """
+
+    @classmethod
+    def _from_handle(cls, handle: int, owning: bool, *, device_id=None):
+        """Wrap an externally-allocated ``CUarray``.
+
+        Intended for graphics interop (``cuGraphicsSubResourceGetMappedArray``)
+        where the array is owned by the graphics API. With ``owning=False``,
+        :meth:`close` and ``__dealloc__`` will not free the handle. Shape,
+        format, and channel count are queried from the driver.
+        """
+
+    @property
+    def handle(self):
+        """The underlying ``CUarray`` as an integer."""
+
+    @property
+    def shape(self):
+        """Allocation shape in elements: ``(w,)``, ``(w, h)``, or ``(w, h, d)``."""
+
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat`."""
+
+    @property
+    def num_channels(self):
+        """Channels per element (1, 2, or 4)."""
+
+    @property
+    def element_size(self):
+        """Bytes per element (format size * channels)."""
+
+    @property
+    def device(self):
+        """The :class:`Device` this array was allocated on."""
+
+    @property
+    def is_surface_load_store(self):
+        """True if this array was created with ``CUDA_ARRAY3D_SURFACE_LDST``
+        and can be bound as a :class:`SurfaceObject`."""
+
+    def _extent_bytes(self):
+        """Return (width_bytes, height, depth) for cuMemcpy3D, with height/depth
+        normalized to >=1 for lower-rank arrays."""
+
+    def copy_from(self, src, *, stream):
+        """Copy a full-array's worth of data into this array.
+
+        Parameters
+        ----------
+        src : Buffer or buffer-protocol object
+            Source data. Must contain at least ``self.size_bytes`` bytes
+            of contiguous data.
+        stream : Stream
+            Stream to issue the copy on.
+        """
+
+    def copy_to(self, dst, *, stream):
+        """Copy a full-array's worth of data out of this array.
+
+        Parameters
+        ----------
+        dst : Buffer or writable buffer-protocol object
+            Destination. Must have at least ``self.size_bytes`` bytes of
+            writable, contiguous space.
+        stream : Stream
+            Stream to issue the copy on.
+        """
+
+    @property
+    def size_bytes(self):
+        """Total bytes of array storage (``prod(shape) * element_size``)."""
+
+    def __dealloc__(self):  # frees an owned handle; a driver error here is not reported
+        ...
+
+    def __enter__(self):  # returns self
+        ...
+
+    def __exit__(self, exc_type, exc, tb):  # calls close()
+        ...
+
+    def __repr__(self):
+        ...
+_FORMAT_ELEM_SIZE = {int(ArrayFormat.UINT8): 1, int(ArrayFormat.INT8): 1, int(ArrayFormat.UINT16): 2, int(ArrayFormat.INT16): 2, int(ArrayFormat.FLOAT16): 2, int(ArrayFormat.UINT32): 4, int(ArrayFormat.INT32): 4, int(ArrayFormat.FLOAT32): 4}  # bytes per single channel, keyed by int(ArrayFormat)
+
+def _validate_format_channels(format, num_channels):
+    """Validate the ``(format, num_channels)`` pair shared by the array, mipmap, and texture factories.
+    Raises TypeError unless ``format`` is an ArrayFormat; ValueError unless ``num_channels`` is 1, 2, or 4."""
+
+def _validate_array_shape(shape):
+    """Coerce ``shape`` to a tuple of ints and return the normalized tuple.
+    Raises ValueError when the rank is not 1-3 or any extent is < 1."""
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_array.pyx b/cuda_core/cuda/core/_array.pyx
new file mode 100644
index 00000000000..66420ffc471
--- /dev/null
+++ b/cuda_core/cuda/core/_array.pyx
@@ -0,0 +1,448 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+cimport cpython
+from libc.stdint cimport intptr_t
+from libc.string cimport memset
+
+from cuda.bindings cimport cydriver
+from cuda.core._memory._buffer cimport Buffer
+from cuda.core._stream cimport Stream
+from cuda.core._utils.cuda_utils cimport (
+    HANDLE_RETURN,
+    _get_current_device_id,
+)
+
+from enum import IntEnum
+
+
+class ArrayFormat(IntEnum):
+    """Element format for a :class:`CUDAArray` allocation.
+
+    Mirrors ``CUarray_format`` from the CUDA driver API.
+    """
+    UINT8   = cydriver.CU_AD_FORMAT_UNSIGNED_INT8   # 1 byte per channel
+    UINT16  = cydriver.CU_AD_FORMAT_UNSIGNED_INT16  # 2 bytes per channel
+    UINT32  = cydriver.CU_AD_FORMAT_UNSIGNED_INT32  # 4 bytes per channel
+    INT8    = cydriver.CU_AD_FORMAT_SIGNED_INT8     # 1 byte per channel
+    INT16   = cydriver.CU_AD_FORMAT_SIGNED_INT16    # 2 bytes per channel
+    INT32   = cydriver.CU_AD_FORMAT_SIGNED_INT32    # 4 bytes per channel
+    FLOAT16 = cydriver.CU_AD_FORMAT_HALF            # 2 bytes per channel
+    FLOAT32 = cydriver.CU_AD_FORMAT_FLOAT           # 4 bytes per channel
+
+
+# Storage size in bytes of one channel of one element, keyed by ``int(format)``.
+# Formats are grouped by width; the resulting mapping has one entry per ArrayFormat.
+_FORMAT_ELEM_SIZE = {
+    int(fmt): nbytes
+    for nbytes, fmts in (
+        (1, (ArrayFormat.UINT8, ArrayFormat.INT8)),
+        (2, (ArrayFormat.UINT16, ArrayFormat.INT16, ArrayFormat.FLOAT16)),
+        (4, (ArrayFormat.UINT32, ArrayFormat.INT32, ArrayFormat.FLOAT32)),
+    )
+    for fmt in fmts
+}
+
+
+def _validate_format_channels(format, num_channels):
+    """Reject ``(format, num_channels)`` pairs the array, mipmap, and texture factories cannot use."""
+    if not isinstance(format, ArrayFormat):
+        raise TypeError(f"format must be an ArrayFormat, got {type(format).__name__}")
+    channels_ok = not isinstance(num_channels, bool) and num_channels in (1, 2, 4)
+    if not channels_ok:
+        raise ValueError(f"num_channels must be 1, 2, or 4, got {num_channels!r}")
+
+
+def _validate_array_shape(shape):
+    """Return ``shape`` as a tuple of 1-3 ints >= 1; non-integer extents (float, str) raise TypeError."""
+    from operator import index  # lossless int conversion: rejects 2.9 and "12" instead of coercing them
+    try:
+        shape_t = tuple(index(s) for s in shape)
+    except TypeError as e:
+        raise TypeError(f"shape must be a tuple of ints, got {type(shape).__name__}") from e
+    if not 1 <= len(shape_t) <= 3:
+        raise ValueError(f"shape rank must be 1, 2, or 3, got {len(shape_t)}")
+    for i, dim in enumerate(shape_t):
+        if dim < 1:
+            raise ValueError(f"shape[{i}] must be >= 1, got {dim}")
+    return shape_t
+
+
+cdef void _fill_array_endpoint(
+    cydriver.CUDA_MEMCPY3D* p, CUDAArray arr, bint is_src
+) noexcept:
+    """Point the source (``is_src``) or destination side of ``p`` at ``arr`` with a zero offset."""
+    if not is_src:
+        p.dstMemoryType = cydriver.CU_MEMORYTYPE_ARRAY
+        p.dstArray = arr._handle
+        p.dstXInBytes = 0
+        p.dstY = 0
+        p.dstZ = 0
+        return
+    p.srcMemoryType = cydriver.CU_MEMORYTYPE_ARRAY
+    p.srcArray = arr._handle
+    p.srcXInBytes = 0
+    p.srcY = 0
+    p.srcZ = 0
+
+
+cdef int _fill_host_endpoint(
+    cydriver.CUDA_MEMCPY3D* p,
+    object obj,
+    bint is_src,
+    size_t width_bytes,
+    size_t height,
+    size_t required,
+    cpython.Py_buffer* pybuf_out,
+) except -1:
+    """Fill the src (``is_src``) or dst host fields of ``p`` from buffer-protocol ``obj``.
+
+    Returns 1 holding a view in ``pybuf_out`` (writable for dst) that the caller releases;
+    a view shorter than ``required`` bytes is released here before ValueError is raised."""
+    cdef int flags = cpython.PyBUF_SIMPLE
+    if not is_src:
+        flags |= cpython.PyBUF_WRITABLE
+    if cpython.PyObject_GetBuffer(obj, pybuf_out, flags) != 0:
+        raise TypeError(
+            f"Source/destination must be a Buffer or a contiguous "
+            f"buffer-protocol object, got {type(obj).__name__}"
+        )
+    if <size_t>pybuf_out.len < required:
+        have = pybuf_out.len  # capture before release; the view must not be read afterwards
+        cpython.PyBuffer_Release(pybuf_out)
+        raise ValueError(
+            f"Host buffer has {have} bytes, smaller than the array "
+            f"extent ({required} bytes)"
+        )
+    if is_src:
+        p.srcMemoryType = cydriver.CU_MEMORYTYPE_HOST
+        p.srcHost = pybuf_out.buf
+        p.srcPitch = width_bytes
+        p.srcHeight = height
+        p.srcXInBytes = 0
+        p.srcY = 0
+        p.srcZ = 0
+    else:
+        p.dstMemoryType = cydriver.CU_MEMORYTYPE_HOST
+        p.dstHost = pybuf_out.buf
+        p.dstPitch = width_bytes
+        p.dstHeight = height
+        p.dstXInBytes = 0
+        p.dstY = 0
+        p.dstZ = 0
+    return 1
+
+
+cdef int _fill_linear_endpoint(
+    cydriver.CUDA_MEMCPY3D* p,
+    object obj,
+    bint is_src,
+    size_t width_bytes,
+    size_t height,
+    size_t depth,
+    cpython.Py_buffer* pybuf_out,
+) except -1:
+    """Describe the linear side of the copy in ``p``: a ``Buffer`` is device memory
+    (returns 0); anything else is a host buffer whose view lands in ``pybuf_out``
+    (returns 1, and the caller must release it)."""
+    cdef intptr_t dev_ptr
+    cdef size_t needed = width_bytes * height * depth
+    if not isinstance(obj, Buffer):
+        return _fill_host_endpoint(
+            p, obj, is_src, width_bytes, height, needed, pybuf_out
+        )
+    if <size_t>(<Buffer>obj).size < needed:
+        raise ValueError(
+            f"Buffer size ({(<Buffer>obj).size} bytes) is smaller than "
+            f"the array extent ({needed} bytes)"
+        )
+    dev_ptr = int((<Buffer>obj).handle)
+    if not is_src:
+        p.dstMemoryType = cydriver.CU_MEMORYTYPE_DEVICE
+        p.dstDevice = <cydriver.CUdeviceptr>dev_ptr
+        p.dstPitch = width_bytes
+        p.dstHeight = height
+        p.dstXInBytes = 0
+        p.dstY = 0
+        p.dstZ = 0
+        return 0
+    p.srcMemoryType = cydriver.CU_MEMORYTYPE_DEVICE
+    p.srcDevice = <cydriver.CUdeviceptr>dev_ptr
+    p.srcPitch = width_bytes
+    p.srcHeight = height
+    p.srcXInBytes = 0
+    p.srcY = 0
+    p.srcZ = 0
+    return 0
+
+
+cdef _copy3d(CUDAArray arr, object other, object stream, bint to_array):
+    """Copy the full extent of ``arr`` to or from ``other`` on ``stream``.
+
+    ``to_array=True`` copies *into* ``arr``; False copies *out of* it. ``Buffer``
+    endpoints stay asynchronous. Host buffer-protocol endpoints are synchronized on
+    ``stream`` before their view is released. Raises TypeError for a non-``Stream``.
+    """
+    cdef cydriver.CUDA_MEMCPY3D params
+    cdef cpython.Py_buffer pybuf
+    cdef int got_buffer = 0
+    cdef intptr_t stream_handle
+    cdef cydriver.CUstream c_stream
+    if not isinstance(stream, Stream):
+        raise TypeError(f"stream must be a Stream, got {type(stream).__name__}")
+
+    memset(&params, 0, sizeof(params))
+    width_bytes, height, depth = arr._extent_bytes()
+    params.WidthInBytes = <size_t>width_bytes
+    params.Height = <size_t>height
+    params.Depth = <size_t>depth
+    try:
+        if to_array:
+            got_buffer = _fill_linear_endpoint(
+                &params, other, True, width_bytes, height, depth, &pybuf
+            )
+            _fill_array_endpoint(&params, arr, False)
+        else:
+            _fill_array_endpoint(&params, arr, True)
+            got_buffer = _fill_linear_endpoint(
+                &params, other, False, width_bytes, height, depth, &pybuf
+            )
+        stream_handle = int((<Stream>stream).handle)
+        c_stream = <cydriver.CUstream><void*>stream_handle
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemcpy3DAsync(&params, c_stream))
+            if got_buffer:  # the view is released below; the copy must be finished by then
+                HANDLE_RETURN(cydriver.cuStreamSynchronize(c_stream))
+    finally:
+        if got_buffer:
+            cpython.PyBuffer_Release(&pybuf)
+
+
+cdef class CUDAArray:
+    """An opaque, hardware-laid-out GPU allocation for texture/surface access.
+
+    Distinct from :class:`Buffer`: a ``CUarray`` has no exposed device pointer
+    and can only be accessed from kernels through a :class:`TextureObject` or
+    :class:`SurfaceObject`. Its memory layout is chosen by the driver for 2D/3D
+    spatial locality.
+
+    Construct via :meth:`from_descriptor`. Only plain 1D/2D/3D allocations are
+    supported in this initial version; layered/cubemap/sparse variants will
+    follow once their shape semantics are settled.
+    """
+
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError(
+            "CUDAArray cannot be instantiated directly. Use CUDAArray.from_descriptor()."
+        )
+
+    @classmethod
+    def from_descriptor(cls, *, shape, format, num_channels, is_surface_load_store=False):
+        """Allocate a new CUDA array.
+
+        Parameters
+        ----------
+        shape : tuple of int
+            ``(width,)``, ``(width, height)``, or ``(width, height, depth)``
+            in elements.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        is_surface_load_store : bool
+            If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so the array
+            can be bound as a :class:`SurfaceObject` for kernel-side writes.
+            Default False.
+
+        Returns
+        -------
+        CUDAArray
+        """
+        _validate_format_channels(format, num_channels)
+        shape_t = _validate_array_shape(shape)
+
+        cdef CUDAArray self = cls.__new__(cls)  # skips __init__, which always raises
+        self._owning = True
+        self._shape = shape_t
+        self._format = <cydriver.CUarray_format><int>format
+        self._num_channels = num_channels
+        self._surface_load_store = bool(is_surface_load_store)
+        self._device_id = _get_current_device_id()
+        self._parent_ref = None
+
+        cdef cydriver.CUarray_format c_format = <cydriver.CUarray_format><int>format
+        cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc3d
+        cdef cydriver.CUDA_ARRAY_DESCRIPTOR desc2d
+        cdef int rank = len(shape_t)
+        cdef unsigned int flags = (
+            cydriver.CUDA_ARRAY3D_SURFACE_LDST if is_surface_load_store else 0
+        )
+
+        # cuArrayCreate (2D path) does not accept flags; use the 3D descriptor
+        # whenever any flag is set or shape is 3D.
+        if rank == 3 or flags != 0:
+            memset(&desc3d, 0, sizeof(desc3d))
+            desc3d.Width = <size_t>shape_t[0]
+            desc3d.Height = <size_t>(shape_t[1] if rank >= 2 else 0)  # 0 marks an unused dimension
+            desc3d.Depth = <size_t>(shape_t[2] if rank >= 3 else 0)
+            desc3d.Format = c_format
+            desc3d.NumChannels = <unsigned int>num_channels
+            desc3d.Flags = flags
+            with nogil:
+                HANDLE_RETURN(cydriver.cuArray3DCreate(&self._handle, &desc3d))
+        else:
+            memset(&desc2d, 0, sizeof(desc2d))
+            desc2d.Width = <size_t>shape_t[0]
+            desc2d.Height = <size_t>(shape_t[1] if rank == 2 else 0)  # 0 marks a 1D array
+            desc2d.Format = c_format
+            desc2d.NumChannels = <unsigned int>num_channels
+            with nogil:
+                HANDLE_RETURN(cydriver.cuArrayCreate(&self._handle, &desc2d))
+
+        return self
+
+    @classmethod
+    def _from_handle(cls, intptr_t handle, bint owning, *, device_id=None):
+        """Wrap an externally-allocated ``CUarray``.
+
+        Intended for graphics interop (``cuGraphicsSubResourceGetMappedArray``)
+        where the array is owned by the graphics API. With ``owning=False``,
+        :meth:`close` and ``__dealloc__`` will not free the handle. Shape,
+        format, and channel count are queried from the driver.
+        """
+        cdef CUDAArray self = cls.__new__(cls)  # skips __init__, which always raises
+        self._handle = <cydriver.CUarray><void*>handle
+        self._owning = owning
+        self._device_id = _get_current_device_id() if device_id is None else int(device_id)
+        self._parent_ref = None
+
+        cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc
+        with nogil:
+            HANDLE_RETURN(cydriver.cuArray3DGetDescriptor(&desc, self._handle))
+
+        if desc.Depth > 0:  # rank is inferred from which extents are non-zero
+            self._shape = (int(desc.Width), int(desc.Height), int(desc.Depth))
+        elif desc.Height > 0:
+            self._shape = (int(desc.Width), int(desc.Height))
+        else:
+            self._shape = (int(desc.Width),)
+        self._format = desc.Format
+        self._num_channels = desc.NumChannels
+        self._surface_load_store = bool(desc.Flags & cydriver.CUDA_ARRAY3D_SURFACE_LDST)
+        return self
+
+    @property
+    def handle(self):
+        """The underlying ``CUarray`` as an integer (0 once :meth:`close` has run)."""
+        return <intptr_t>self._handle
+
+    @property
+    def shape(self):
+        """Allocation shape in elements: ``(w,)``, ``(w, h)``, or ``(w, h, d)``."""
+        return self._shape
+
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat`."""
+        return ArrayFormat(self._format)
+
+    @property
+    def num_channels(self):
+        """Channels per element (1, 2, or 4)."""
+        return self._num_channels
+
+    @property
+    def element_size(self):
+        """Bytes per element (format size * channels)."""
+        return _FORMAT_ELEM_SIZE[self._format] * self._num_channels
+
+    @property
+    def device(self):
+        """The :class:`Device` this array was allocated on."""
+        from cuda.core._device import Device  # local import; presumably avoids a circular import - confirm
+        return Device(self._device_id)
+
+    @property
+    def is_surface_load_store(self):
+        """True if this array was created with ``CUDA_ARRAY3D_SURFACE_LDST``
+        and can be bound as a :class:`SurfaceObject`."""
+        return self._surface_load_store
+
+    def _extent_bytes(self):
+        """Return (width_bytes, height, depth) for cuMemcpy3D, with height/depth
+        normalized to >=1 for lower-rank arrays."""
+        cdef int rank = len(self._shape)
+        cdef size_t w = <size_t>self._shape[0] * <size_t>(
+            _FORMAT_ELEM_SIZE[self._format] * self._num_channels
+        )
+        cdef size_t h = <size_t>(self._shape[1] if rank >= 2 else 1)
+        cdef size_t d = <size_t>(self._shape[2] if rank >= 3 else 1)
+        return w, h, d
+
+    def copy_from(self, src, *, stream):
+        """Copy a full-array's worth of data into this array.
+
+        Parameters
+        ----------
+        src : Buffer or buffer-protocol object
+            Source data: a ``Buffer`` is read as device memory, any other object as
+            host memory. Must contain at least ``self.size_bytes`` contiguous bytes.
+        stream : Stream
+            Stream to issue the copy on.
+        """
+        _copy3d(self, src, stream, to_array=True)
+
+    def copy_to(self, dst, *, stream):
+        """Copy a full-array's worth of data out of this array.
+
+        Parameters
+        ----------
+        dst : Buffer or writable buffer-protocol object
+            Destination: a ``Buffer`` is written as device memory, any other object as
+            host memory. Must have at least ``self.size_bytes`` writable, contiguous bytes.
+        stream : Stream
+            Stream to issue the copy on.
+        """
+        _copy3d(self, dst, stream, to_array=False)
+
+    @property
+    def size_bytes(self):
+        """Total bytes of array storage (``prod(shape) * element_size``)."""
+        cdef size_t n = 1
+        for s in self._shape:
+            n *= <size_t>s
+        return n * <size_t>(_FORMAT_ELEM_SIZE[self._format] * self._num_channels)
+
+    cpdef close(self):
+        """Destroy the underlying ``CUarray`` if owned by this object; a repeated call is a no-op."""
+        cdef cydriver.CUarray h = self._handle
+        cdef bint owning = self._owning
+        self._handle = NULL  # cleared before destroying so the handle is never freed twice
+        # Drop the parent reference (if any) so a non-owning level CUDAArray
+        # stops pinning its MipmappedArray after close().
+        self._parent_ref = None
+        if h != NULL and owning:
+            HANDLE_RETURN(cydriver.cuArrayDestroy(h))
+
+    def __dealloc__(self):
+        # Cython destructors cannot raise; any cuArrayDestroy error here is
+        # silently dropped. Callers needing visibility should use close().
+        if self._handle != NULL and self._owning:
+            cydriver.cuArrayDestroy(self._handle)
+            self._handle = NULL
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        self.close()  # result is discarded, so exceptions from the with-body propagate
+
+    def __repr__(self):
+        return (
+            f"CUDAArray(shape={self._shape}, "
+            f"format={ArrayFormat(self._format).name}, "
+            f"num_channels={self._num_channels})"
+        )
diff --git a/cuda_core/cuda/core/_mipmapped_array.pxd b/cuda_core/cuda/core/_mipmapped_array.pxd
new file mode 100644
index 00000000000..4feebd10c79
--- /dev/null
+++ b/cuda_core/cuda/core/_mipmapped_array.pxd
@@ -0,0 +1,20 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from cuda.bindings cimport cydriver
+
+
+cdef class MipmappedArray:
+
+    cdef:
+        cydriver.CUmipmappedArray _handle  # underlying driver handle
+        tuple _shape                 # base-level (level 0) shape: (w,), (w, h), or (w, h, d)
+        cydriver.CUarray_format _format  # element format shared by the mipmap
+        unsigned int _num_channels   # 1, 2, or 4
+        unsigned int _num_levels     # number of mip levels
+        int _device_id               # presumably the ordinal behind the .device property - confirm
+        bint _owning                 # presumably gates destruction in close(), as in CUDAArray - confirm
+        bint _surface_load_store     # presumably mirrors the CUDA_ARRAY3D_SURFACE_LDST flag - confirm
+
+    cpdef close(self)
diff --git a/cuda_core/cuda/core/_mipmapped_array.pyi b/cuda_core/cuda/core/_mipmapped_array.pyi
new file mode 100644
index 00000000000..20460037aa6
--- /dev/null
+++ b/cuda_core/cuda/core/_mipmapped_array.pyi
@@ -0,0 +1,112 @@
+# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_mipmapped_array.pyx
+
+from __future__ import annotations
+
+
+class MipmappedArray:
+    """A mipmapped CUDA array for texture/surface access across levels.
+
+    Wraps ``CUmipmappedArray``. Each mip level is a distinct, hardware-laid-out
+    allocation accessible only via a :class:`TextureObject` (or by retrieving
+    the level's :class:`CUDAArray` and binding it as a :class:`SurfaceObject`).
+    Destroying the :class:`MipmappedArray` destroys all level arrays
+    implicitly, so the :class:`CUDAArray` instances returned by :meth:`get_level`
+    are non-owning and hold a strong reference back to their parent.
+
+    Construct via :meth:`from_descriptor`.
+    """
+
+    def close(self):
+        """Destroy the underlying ``CUmipmappedArray`` if owned.
+
+        After ``close()`` any level :class:`CUDAArray` returned by :meth:`get_level`
+        becomes invalid; callers must not access them.
+        """
+
+    def __init__(self, *args, **kwargs):  # always raises RuntimeError; use from_descriptor()
+        ...
+
+    @classmethod
+    def from_descriptor(cls, *, shape, format, num_channels, num_levels, is_surface_load_store=False):
+        """Allocate a new mipmapped CUDA array.
+
+        Parameters
+        ----------
+        shape : tuple of int
+            ``(width,)``, ``(width, height)``, or ``(width, height, depth)``
+            in elements, for the base (level 0) mip.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        num_levels : int
+            Number of mip levels to allocate; must be >= 1. The driver caps
+            this at the log2 of the largest dimension; passing a larger value
+            yields a driver error.
+        is_surface_load_store : bool
+            If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so individual
+            levels (obtained via :meth:`get_level`) can be bound as
+            :class:`SurfaceObject` for kernel-side writes. Default False.
+
+        Returns
+        -------
+        MipmappedArray
+        """
+
+    def get_level(self, level):
+        """Return a non-owning :class:`CUDAArray` view of the given mip level.
+
+        Parameters
+        ----------
+        level : int
+            Mip level index in ``[0, num_levels)``.
+
+        Returns
+        -------
+        CUDAArray
+            A non-owning :class:`CUDAArray` wrapping the level's ``CUarray``.
+            The :class:`MipmappedArray` is kept alive for the lifetime of the
+            returned :class:`CUDAArray`; the underlying storage is released only
+            when this :class:`MipmappedArray` is closed or destroyed.
+        """
+
+    @property
+    def handle(self):
+        """The underlying ``CUmipmappedArray`` as an integer."""
+
+    @property
+    def shape(self):
+        """Base-level (level 0) allocation shape, in elements."""
+
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat`."""
+
+    @property
+    def num_channels(self):
+        """Channels per element (1, 2, or 4)."""
+
+    @property
+    def num_levels(self):
+        """Number of mip levels."""
+
+    @property
+    def is_surface_load_store(self):
+        """True if this mipmap (and each of its levels) was created with
+        ``CUDA_ARRAY3D_SURFACE_LDST`` and can back a :class:`SurfaceObject`."""
+
+    @property
+    def device(self):
+        """The :class:`Device` this mipmap was allocated on."""
+
+    def __dealloc__(self):
+        ...
+
+    def __enter__(self):
+        ...
+
+    def __exit__(self, exc_type, exc, tb):
+        ...
+
+    def __repr__(self):
+        ...
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_mipmapped_array.pyx b/cuda_core/cuda/core/_mipmapped_array.pyx
new file mode 100644
index 00000000000..a7ecd29b9d1
--- /dev/null
+++ b/cuda_core/cuda/core/_mipmapped_array.pyx
@@ -0,0 +1,215 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from libc.stdint cimport intptr_t
+from libc.string cimport memset
+
+from cuda.bindings cimport cydriver
+from cuda.core._array cimport CUDAArray
+from cuda.core._array import ArrayFormat, _validate_array_shape, _validate_format_channels
+from cuda.core._utils.cuda_utils cimport (
+    HANDLE_RETURN,
+    _get_current_device_id,
+)
+
+
+cdef class MipmappedArray:
+    """A mipmapped CUDA array for texture/surface access across levels.
+
+    Wraps ``CUmipmappedArray``. Each mip level is a distinct, hardware-laid-out
+    allocation accessible only via a :class:`TextureObject` (or by retrieving
+    the level's :class:`CUDAArray` and binding it as a :class:`SurfaceObject`).
+    Destroying the :class:`MipmappedArray` destroys all level arrays
+    implicitly, so the :class:`CUDAArray` instances returned by :meth:`get_level`
+    are non-owning and hold a strong reference back to their parent.
+
+    Construct via :meth:`from_descriptor`.
+    """
+
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError(
+            "MipmappedArray cannot be instantiated directly. "
+            "Use MipmappedArray.from_descriptor()."
+        )
+
+    @classmethod
+    def from_descriptor(
+        cls, *, shape, format, num_channels, num_levels, is_surface_load_store=False
+    ):
+        """Allocate a new mipmapped CUDA array on the current device.
+
+        Parameters
+        ----------
+        shape : tuple of int
+            ``(width,)``, ``(width, height)``, or ``(width, height, depth)``
+            in elements, for the base (level 0) mip.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        num_levels : int
+            Number of mip levels requested; must be >= 1. Values above the
+            driver maximum ``1 + floor(log2(max(shape)))`` are clamped to it,
+            and :attr:`num_levels` reports the clamped count.
+        is_surface_load_store : bool
+            If True, allocate with ``CUDA_ARRAY3D_SURFACE_LDST`` so individual
+            levels (obtained via :meth:`get_level`) can be bound as
+            :class:`SurfaceObject` for kernel-side writes. Default False.
+
+        Returns
+        -------
+        MipmappedArray
+        """
+        _validate_format_channels(format, num_channels)
+        shape_t = _validate_array_shape(shape)
+
+        levels = int(num_levels)
+        if levels < 1:
+            raise ValueError(f"num_levels must be >= 1, got {levels}")
+        # Per the driver docs cuMipmappedArrayCreate clamps the level count to
+        # 1 + floor(log2(max extent)), i.e. max_extent.bit_length(). Clamp here
+        # as well so num_levels and get_level() match the real allocation.
+        levels = min(levels, max(1, int(max(shape_t)).bit_length()))
+
+        cdef MipmappedArray self = cls.__new__(cls)
+        self._owning = True
+        self._shape = shape_t
+        self._format = <cydriver.CUarray_format><int>format
+        self._num_channels = num_channels
+        self._num_levels = <unsigned int>levels
+        self._surface_load_store = bool(is_surface_load_store)
+        self._device_id = _get_current_device_id()
+
+        cdef cydriver.CUarray_format c_format = <cydriver.CUarray_format><int>format
+        cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR desc3d
+        cdef int rank = len(shape_t)
+        cdef unsigned int flags = cydriver.CUDA_ARRAY3D_SURFACE_LDST if is_surface_load_store else 0
+        cdef unsigned int c_levels = <unsigned int>levels
+
+        # Mipmap creation uses the 3D descriptor regardless of rank; lower-rank
+        # shapes use Height=0/Depth=0 sentinels, matching cuArray3DCreate.
+        memset(&desc3d, 0, sizeof(desc3d))
+        desc3d.Width = <size_t>shape_t[0]
+        desc3d.Height = <size_t>(shape_t[1] if rank >= 2 else 0)
+        desc3d.Depth = <size_t>(shape_t[2] if rank >= 3 else 0)
+        desc3d.Format = c_format
+        desc3d.NumChannels = <unsigned int>num_channels
+        desc3d.Flags = flags
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMipmappedArrayCreate(&self._handle, &desc3d, c_levels))
+
+        return self
+
+    def get_level(self, level):
+        """Return a non-owning :class:`CUDAArray` view of the given mip level.
+
+        Parameters
+        ----------
+        level : int
+            Mip level index in ``[0, num_levels)``.
+
+        Returns
+        -------
+        CUDAArray
+            A non-owning view wrapping the level's ``CUarray``. The view holds
+            a strong reference to this :class:`MipmappedArray`, so the mipmap
+            is not garbage-collected first; an explicit :meth:`close` on the
+            mipmap still invalidates every outstanding level view.
+        """
+        lvl = int(level)
+        if lvl < 0:
+            raise ValueError(f"level must be >= 0, got {lvl}")
+        if lvl >= <int>self._num_levels:
+            raise ValueError(
+                f"level ({lvl}) must be < num_levels ({self._num_levels})"
+            )
+
+        cdef cydriver.CUarray level_handle
+        cdef unsigned int c_level = <unsigned int>lvl
+        with nogil:
+            HANDLE_RETURN(
+                cydriver.cuMipmappedArrayGetLevel(&level_handle, self._handle, c_level)
+            )
+
+        # Wrap as a non-owning CUDAArray; the level's underlying CUarray belongs
+        # to this MipmappedArray and must not be destroyed independently.
+        arr = CUDAArray._from_handle(
+            <intptr_t>level_handle, False, device_id=self._device_id
+        )
+        # Strong ref back to the parent so the mipmap outlives the level view.
+        (<CUDAArray>arr)._parent_ref = self
+        return arr
+
+    @property
+    def handle(self):
+        """The underlying ``CUmipmappedArray`` as an integer (``0`` after ``close()``)."""
+        return <intptr_t>self._handle
+
+    @property
+    def shape(self):
+        """Base-level (level 0) allocation shape, in elements."""
+        return self._shape
+
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat`."""
+        return ArrayFormat(self._format)
+
+    @property
+    def num_channels(self):
+        """Channels per element (1, 2, or 4)."""
+        return self._num_channels
+
+    @property
+    def num_levels(self):
+        """Number of mip levels (the requested count after clamping)."""
+        return int(self._num_levels)
+
+    @property
+    def is_surface_load_store(self):
+        """True if created with ``CUDA_ARRAY3D_SURFACE_LDST``; individual levels
+        (from :meth:`get_level`) can then back a :class:`SurfaceObject`."""
+        return self._surface_load_store
+
+    @property
+    def device(self):
+        """The :class:`Device` that was current when this mipmap was created."""
+        from cuda.core._device import Device
+        return Device(self._device_id)
+
+    cpdef close(self):
+        """Destroy the underlying ``CUmipmappedArray`` if owned.
+
+        After ``close()`` any level :class:`CUDAArray` returned by :meth:`get_level`
+        becomes invalid; callers must not access them.
+        """
+        cdef cydriver.CUmipmappedArray h = self._handle
+        cdef bint owning = self._owning
+        self._handle = NULL
+        if h != NULL and owning:
+            HANDLE_RETURN(cydriver.cuMipmappedArrayDestroy(h))
+
+    def __dealloc__(self):
+        # Cython destructors cannot raise; any cuMipmappedArrayDestroy error
+        # here is silently dropped. Callers needing visibility should use
+        # close().
+        if self._handle != NULL and self._owning:
+            cydriver.cuMipmappedArrayDestroy(self._handle)
+            self._handle = NULL
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        self.close()
+
+    def __repr__(self):
+        return (
+            f"MipmappedArray(shape={self._shape}, "
+            f"format={ArrayFormat(self._format).name}, "
+            f"num_channels={self._num_channels}, "
+            f"num_levels={self._num_levels})"
+        )
diff --git a/cuda_core/cuda/core/_surface.pxd b/cuda_core/cuda/core/_surface.pxd
new file mode 100644
index 00000000000..13a075eb4a3
--- /dev/null
+++ b/cuda_core/cuda/core/_surface.pxd
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from cuda.bindings cimport cydriver
+
+
+cdef class SurfaceObject:
+
+    cdef:
+        cydriver.CUsurfObject _handle   # 0 once closed
+        object _source_ref      # ResourceDescriptor; keeps the backing CUDAArray alive
+        int _device_id          # device that was current at creation time
+
+    cpdef close(self)
diff --git a/cuda_core/cuda/core/_surface.pyi b/cuda_core/cuda/core/_surface.pyi
new file mode 100644
index 00000000000..9f86054a49c
--- /dev/null
+++ b/cuda_core/cuda/core/_surface.pyi
@@ -0,0 +1,68 @@
+# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_surface.pyx
+
+from __future__ import annotations
+
+
+class SurfaceObject:
+    """A bindless surface handle for kernel-side typed load/store.
+
+    Wraps ``cuSurfObjectCreate``. Unlike a :class:`TextureObject`, a surface
+    has no sampling state (no filtering, no addressing modes, no normalization);
+    kernels read and write through it using integer pixel coordinates.
+
+    The backing :class:`CUDAArray` must have been created with
+    ``is_surface_load_store=True`` and is kept alive for the lifetime of this
+    object to prevent dangling handles.
+
+    Construct via :meth:`from_array` or :meth:`from_descriptor`. Passes to
+    kernels as a 64-bit handle (via the ``handle`` property).
+    """
+
+    def close(self):
+        """Destroy the underlying ``CUsurfObject`` and drop the backing reference."""
+
+    def __init__(self, *args, **kwargs):
+        ...  # always raises RuntimeError; use from_array() or from_descriptor()
+
+    @classmethod
+    def from_array(cls, array):
+        """Create a surface object directly from a :class:`CUDAArray`.
+
+        The array must have been created with ``is_surface_load_store=True``.
+        """
+
+    @classmethod
+    def from_descriptor(cls, *, resource):
+        """Create a surface object from a :class:`ResourceDescriptor`.
+
+        Parameters
+        ----------
+        resource : ResourceDescriptor
+            Must wrap a :class:`CUDAArray` allocated with
+            ``is_surface_load_store=True``. Linear/pitch2d resources are not
+            valid surface backings.
+        """
+
+    @property
+    def handle(self):
+        """The underlying ``CUsurfObject`` as an integer (64-bit kernel arg)."""
+
+    @property
+    def resource(self):
+        """The :class:`ResourceDescriptor` this surface was built from (``None`` once closed)."""
+
+    @property
+    def device(self):
+        ...  # the Device that was current when the surface was created
+
+    def __dealloc__(self):
+        ...  # destroys a still-open handle; driver errors are dropped
+
+    def __enter__(self):
+        ...  # returns self
+
+    def __exit__(self, exc_type, exc, tb):
+        ...  # calls close(); returns None, so exceptions are not suppressed
+
+    def __repr__(self):
+        ...  # "SurfaceObject(handle=0x...)"
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_surface.pyx b/cuda_core/cuda/core/_surface.pyx
new file mode 100644
index 00000000000..87e80e99ef0
--- /dev/null
+++ b/cuda_core/cuda/core/_surface.pyx
@@ -0,0 +1,131 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from libc.stdint cimport intptr_t
+from libc.string cimport memset
+
+from cuda.bindings cimport cydriver
+from cuda.core._array cimport CUDAArray
+from cuda.core._texture import ResourceDescriptor
+from cuda.core._utils.cuda_utils cimport (
+    HANDLE_RETURN,
+    _get_current_device_id,
+)
+
+
+cdef class SurfaceObject:
+    """A bindless surface handle for kernel-side typed load/store.
+
+    Wraps ``cuSurfObjectCreate``. Unlike a :class:`TextureObject`, a surface
+    has no sampling state (no filtering, no addressing modes, no normalization);
+    kernels read and write through it using integer pixel coordinates.
+
+    The backing :class:`CUDAArray` must have been created with
+    ``is_surface_load_store=True`` and is kept alive for the lifetime of this
+    object to prevent dangling handles.
+
+    Construct via :meth:`from_array` or :meth:`from_descriptor`. Passes to
+    kernels as a 64-bit handle (via the ``handle`` property).
+    """
+
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError(
+            "SurfaceObject cannot be instantiated directly. "
+            "Use SurfaceObject.from_array() or SurfaceObject.from_descriptor()."
+        )
+
+    @classmethod
+    def from_array(cls, array):
+        """Create a surface object directly from a :class:`CUDAArray`.
+
+        The array must have been created with ``is_surface_load_store=True``.
+        """
+        if not isinstance(array, CUDAArray):
+            raise TypeError(f"array must be an CUDAArray, got {type(array).__name__}")
+        return cls.from_descriptor(resource=ResourceDescriptor.from_array(array))
+
+    @classmethod
+    def from_descriptor(cls, *, resource):
+        """Create a surface object from a :class:`ResourceDescriptor`.
+
+        Parameters
+        ----------
+        resource : ResourceDescriptor
+            Must be array-backed (``kind == "array"``) and wrap a
+            :class:`CUDAArray` allocated with ``is_surface_load_store=True``;
+            any other descriptor raises :class:`ValueError`.
+        """
+        if not isinstance(resource, ResourceDescriptor):
+            raise TypeError(f"resource must be a ResourceDescriptor, got {type(resource).__name__}")
+        if resource.kind != "array":
+            raise ValueError(
+                f"SurfaceObject requires an array-backed ResourceDescriptor, "
+                f"got kind={resource.kind!r}"
+            )
+
+        cdef CUDAArray arr = <CUDAArray>resource.source
+        if not arr.is_surface_load_store:
+            raise ValueError(
+                "CUDAArray must be created with is_surface_load_store=True to be "
+                "bound as a SurfaceObject"
+            )
+
+        cdef cydriver.CUDA_RESOURCE_DESC res_desc
+        memset(&res_desc, 0, sizeof(res_desc))
+        res_desc.resType = cydriver.CU_RESOURCE_TYPE_ARRAY
+        res_desc.res.array.hArray = arr._handle
+
+        cdef SurfaceObject self = cls.__new__(cls)
+        self._source_ref = resource
+        self._device_id = _get_current_device_id()
+
+        with nogil:
+            HANDLE_RETURN(cydriver.cuSurfObjectCreate(&self._handle, &res_desc))
+        return self
+
+    @property
+    def handle(self):
+        """The underlying ``CUsurfObject`` as an integer (64-bit kernel arg)."""
+        return <intptr_t>self._handle
+
+    @property
+    def resource(self):
+        """The :class:`ResourceDescriptor` this surface was built from (``None`` once closed)."""
+        return self._source_ref
+
+    @property
+    def device(self):
+        """The :class:`Device` that was current when this surface was created."""
+        from cuda.core._device import Device
+        return Device(self._device_id)
+
+    cpdef close(self):
+        """Destroy the underlying ``CUsurfObject``, then release the backing resource."""
+        cdef cydriver.CUsurfObject h = self._handle
+        self._handle = 0
+        try:
+            if h != 0:
+                HANDLE_RETURN(cydriver.cuSurfObjectDestroy(h))
+        finally:
+            # Drop the backing reference only after the destroy call, so the
+            # array cannot be freed while the surface handle still exists.
+            self._source_ref = None
+
+    def __dealloc__(self):
+        # Cython destructors cannot raise; any cuSurfObjectDestroy error is
+        # silently dropped. Callers needing visibility should use close().
+        if self._handle != 0:
+            cydriver.cuSurfObjectDestroy(self._handle)
+            self._handle = 0
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        self.close()
+
+    def __repr__(self):
+        return f"SurfaceObject(handle=0x{<intptr_t>self._handle:x})"
diff --git a/cuda_core/cuda/core/_texture.pxd b/cuda_core/cuda/core/_texture.pxd
new file mode 100644
index 00000000000..5a1fd84b9ad
--- /dev/null
+++ b/cuda_core/cuda/core/_texture.pxd
@@ -0,0 +1,16 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from cuda.bindings cimport cydriver
+
+
+cdef class TextureObject:
+
+    cdef:
+        cydriver.CUtexObject _handle    # driver texture-object handle
+        object _source_ref      # keep backing CUDAArray (or other resource) alive
+        object _texture_desc    # original TextureDescriptor for introspection
+        int _device_id          # NOTE(review): presumably the device current at creation, as in SurfaceObject - confirm
+
+    cpdef close(self)
diff --git a/cuda_core/cuda/core/_texture.pyi b/cuda_core/cuda/core/_texture.pyi
new file mode 100644
index 00000000000..132a40273c3
--- /dev/null
+++ b/cuda_core/cuda/core/_texture.pyi
@@ -0,0 +1,261 @@
+# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_texture.pyx
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import IntEnum
+
+from cuda.bindings import cydriver
+
+
+class AddressMode(IntEnum):
+    """Boundary behavior for out-of-range texture coordinates (set per axis)."""
+    WRAP = cydriver.CU_TR_ADDRESS_MODE_WRAP
+    CLAMP = cydriver.CU_TR_ADDRESS_MODE_CLAMP
+    MIRROR = cydriver.CU_TR_ADDRESS_MODE_MIRROR
+    BORDER = cydriver.CU_TR_ADDRESS_MODE_BORDER  # pairs with TextureDescriptor.border_color
+
+class FilterMode(IntEnum):
+    """Texel sampling mode (also the type of ``mipmap_filter_mode``)."""
+    POINT = cydriver.CU_TR_FILTER_MODE_POINT
+    LINEAR = cydriver.CU_TR_FILTER_MODE_LINEAR
+
+class ReadMode(IntEnum):
+    """How sampled values are returned to the kernel.
+
+    - ``ELEMENT_TYPE``: return the raw element value (integer formats stay
+      integer, float stays float).
+    - ``NORMALIZED_FLOAT``: integer formats are promoted to a normalized
+      ``float`` in ``[0, 1]`` (unsigned) or ``[-1, 1]`` (signed).
+      Float formats are unaffected.
+    """
+    ELEMENT_TYPE = 0  # plain literals here, unlike the cydriver-backed enums above
+    NORMALIZED_FLOAT = 1
+
+class ResourceDescriptor:
+    """Describes the memory backing a :class:`TextureObject` or :class:`SurfaceObject`.
+
+    Construct via the ``from_*`` classmethods:
+
+    - :meth:`from_array` wraps a :class:`CUDAArray` (works for both
+      :class:`TextureObject` and :class:`SurfaceObject`).
+    - :meth:`from_mipmapped_array` wraps a :class:`MipmappedArray` for mipmapped
+      sampling (texture only, not surface).
+    - :meth:`from_linear` wraps a :class:`Buffer` as a typed 1D fetch. Texture
+      objects built from a linear resource do not support filtering,
+      normalized coordinates, or addressing modes.
+    - :meth:`from_pitch2d` wraps a :class:`Buffer` as a row-pitched 2D image.
+      Supports filtering and 2D addressing, but only 2D access.
+
+    Linear and pitch2D resources cannot back a :class:`SurfaceObject` — those
+    require a :class:`CUDAArray` allocated with ``is_surface_load_store=True``.
+    """
+    __slots__ = ('_kind', '_source', '_format', '_num_channels', '_size_bytes', '_width', '_height', '_pitch_bytes')
+
+    def __init__(self):
+        ...  # not the documented entry point: construct via the from_* classmethods
+
+    @classmethod
+    def from_array(cls, array):
+        """Build a resource descriptor backed by a :class:`CUDAArray`."""
+
+    @classmethod
+    def from_mipmapped_array(cls, mipmapped_array):
+        """Build a resource descriptor backed by a :class:`MipmappedArray`.
+
+        Suitable for binding to a :class:`TextureObject` for mipmapped
+        sampling. Not valid as a :class:`SurfaceObject` backing: surfaces
+        require a single :class:`CUDAArray` level (obtain via
+        :meth:`MipmappedArray.get_level`).
+        """
+
+    @classmethod
+    def from_linear(cls, buffer, *, format, num_channels, size_bytes=None):
+        """Build a resource descriptor for a linear (typed 1D) texture fetch.
+
+        Parameters
+        ----------
+        buffer : Buffer
+            Device-memory backing. Must remain alive for the lifetime of any
+            :class:`TextureObject` built from this descriptor.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        size_bytes : int, optional
+            Bytes of ``buffer`` to bind. Defaults to ``buffer.size``. Must not
+            exceed it.
+
+        Notes
+        -----
+        Texture objects built from a linear resource ignore the
+        :class:`TextureDescriptor` addressing/filtering fields — kernels read
+        through a typed 1D fetch with bounds checking only.
+        """
+
+    @classmethod
+    def from_pitch2d(cls, buffer, *, format, num_channels, width, height, pitch_bytes):
+        """Build a resource descriptor for a row-pitched 2D image.
+
+        Parameters
+        ----------
+        buffer : Buffer
+            Device-memory backing. Must remain alive for the lifetime of any
+            :class:`TextureObject` built from this descriptor.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        width : int
+            Image width, in elements.
+        height : int
+            Image height, in rows.
+        pitch_bytes : int
+            Distance between consecutive rows, in bytes. Must be at least
+            ``width * format_size * num_channels`` and meet the driver's
+            ``CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT``.
+        """
+
+    @property
+    def kind(self):
+        ...  # string tag for the backing kind, e.g. "array" for a CUDAArray
+
+    @property
+    def source(self):
+        ...  # the wrapped backing object (a CUDAArray when kind == "array")
+
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat` (``None`` for array-backed)."""
+
+    @property
+    def num_channels(self):
+        """Channels per element (``None`` for array-backed)."""
+
+    @property
+    def size_bytes(self):
+        """Bytes bound for a linear resource (``None`` for other kinds)."""
+
+    @property
+    def width(self):
+        """Pitch2D image width, in elements (``None`` for other kinds)."""
+
+    @property
+    def height(self):
+        """Pitch2D image height, in rows (``None`` for other kinds)."""
+
+    @property
+    def pitch_bytes(self):
+        """Pitch2D row pitch, in bytes (``None`` for other kinds)."""
+
+    def __repr__(self):
+        ...
+
+@dataclass
+class TextureDescriptor:
+    """Sampling state for a :class:`TextureObject` (mirrors ``CUDA_TEXTURE_DESC``).
+
+    Attributes
+    ----------
+    address_mode : AddressMode or tuple of AddressMode
+        Boundary behavior per axis: a single :class:`AddressMode` (applied to
+        all axes) or a tuple of 1-3 entries (one per axis). Default ``CLAMP``.
+    filter_mode : FilterMode
+        Texel sampling mode. Default ``POINT``.
+    read_mode : ReadMode
+        How sampled integer values are returned. Default ``ELEMENT_TYPE``.
+    normalized_coords : bool
+        If True, coordinates are in ``[0, 1]`` instead of pixel indices.
+    srgb : bool
+        If True, perform sRGB → linear conversion on read (8-bit formats only).
+    disable_trilinear_optimization : bool
+        If True, request exact trilinear filtering.
+    seamless_cubemap : bool
+        If True, enable seamless cubemap edge filtering.
+    max_anisotropy : int
+        Maximum anisotropy; 0 disables anisotropic filtering.
+    mipmap_filter_mode : FilterMode
+        Filtering between mipmap levels. Default ``POINT``.
+    mipmap_level_bias, min_mipmap_level_clamp, max_mipmap_level_clamp : float
+        Mip level bias and min/max level clamp, mirroring the same-named
+        ``CUDA_TEXTURE_DESC`` fields. Each defaults to ``0.0``.
+    border_color : tuple of float or None
+        4-tuple used when ``address_mode`` includes ``BORDER``; ``None`` means
+        zero.
+    """
+    address_mode: AddressMode | tuple[AddressMode, ...] = AddressMode.CLAMP
+    filter_mode: FilterMode = FilterMode.POINT
+    read_mode: ReadMode = ReadMode.ELEMENT_TYPE
+    normalized_coords: bool = False
+    srgb: bool = False
+    disable_trilinear_optimization: bool = False
+    seamless_cubemap: bool = False
+    max_anisotropy: int = 0
+    mipmap_filter_mode: FilterMode = FilterMode.POINT
+    mipmap_level_bias: float = 0.0
+    min_mipmap_level_clamp: float = 0.0
+    max_mipmap_level_clamp: float = 0.0
+    border_color: tuple[float, ...] | None = None
+
+class TextureObject:
+    """A bindless texture handle for kernel-side sampled reads.
+
+    Wraps ``cuTexObjectCreate``. The backing resource (e.g. the
+    :class:`CUDAArray` behind the descriptor) is kept alive for this object's
+    lifetime to prevent dangling handles. Construct via
+    :meth:`from_descriptor`; pass to kernels as the 64-bit ``handle``.
+    """
+
+    def close(self):
+        """Destroy the underlying ``CUtexObject``."""
+
+    def __init__(self, *args, **kwargs):
+        ...
+
+    @classmethod
+    def from_descriptor(cls, *, resource, texture_descriptor):
+        """Create a texture object from a resource + sampling descriptor.
+
+        Parameters
+        ----------
+        resource : ResourceDescriptor
+            Memory to sample; exposed afterwards via :attr:`resource`.
+        texture_descriptor : TextureDescriptor
+            Sampling state; exposed via :attr:`texture_descriptor`.
+        """
+
+    @property
+    def handle(self):
+        """The underlying ``CUtexObject`` as an integer (64-bit kernel arg)."""
+
+    @property
+    def resource(self):
+        """The :class:`ResourceDescriptor` this texture was built from."""
+
+    @property
+    def texture_descriptor(self):
+        """The :class:`TextureDescriptor` this texture was built from."""
+
+    @property
+    def device(self):
+        ...
+
+    def __dealloc__(self):
+        ...
+
+    def __enter__(self):
+        ...
+
+    def __exit__(self, exc_type, exc, tb):
+        ...  # NOTE(review): presumably calls close(), as SurfaceObject.__exit__ does - confirm
+
+    def __repr__(self):
+        ...
+_TRSF_READ_AS_INTEGER = 1  # CU_TRSF_* flag bits; written in hex (0x01 ... 0x40) in _texture.pyx
+_TRSF_NORMALIZED_COORDINATES = 2
+_TRSF_SRGB = 16
+_TRSF_DISABLE_TRILINEAR_OPTIMIZATION = 32
+_TRSF_SEAMLESS_CUBEMAP = 64
+
+def _normalize_address_modes(address_mode):
+    """Return a 3-tuple of AddressMode values from a scalar or a tuple of 1-3 entries."""
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_texture.pyx b/cuda_core/cuda/core/_texture.pyx
new file mode 100644
index 00000000000..6ea8ad805ad
--- /dev/null
+++ b/cuda_core/cuda/core/_texture.pyx
@@ -0,0 +1,566 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from libc.stdint cimport intptr_t
+from libc.string cimport memset
+
+from cuda.bindings cimport cydriver
+from cuda.core._array cimport CUDAArray
+from cuda.core._array import ArrayFormat, _FORMAT_ELEM_SIZE, _validate_format_channels
+from cuda.core._memory._buffer cimport Buffer
+from cuda.core._mipmapped_array cimport MipmappedArray
+from cuda.core._mipmapped_array import MipmappedArray as _PyMipmappedArray
+from cuda.core._utils.cuda_utils cimport (
+    HANDLE_RETURN,
+    _get_current_device_id,
+)
+
+from dataclasses import dataclass
+from enum import IntEnum
+
+
+# Driver texture-descriptor flag bits (CU_TRSF_*).
+_TRSF_READ_AS_INTEGER = 0x01  # set by from_descriptor for ReadMode.ELEMENT_TYPE
+_TRSF_NORMALIZED_COORDINATES = 0x02  # set when TextureDescriptor.normalized_coords is true
+_TRSF_SRGB = 0x10  # set when TextureDescriptor.srgb is true
+_TRSF_DISABLE_TRILINEAR_OPTIMIZATION = 0x20  # set when disable_trilinear_optimization is true
+_TRSF_SEAMLESS_CUBEMAP = 0x40  # set when TextureDescriptor.seamless_cubemap is true
+
+
+class AddressMode(IntEnum):
+    """Boundary behavior for out-of-range texture coordinates (mirrors ``CU_TR_ADDRESS_MODE_*``)."""
+    WRAP   = cydriver.CU_TR_ADDRESS_MODE_WRAP
+    CLAMP  = cydriver.CU_TR_ADDRESS_MODE_CLAMP  # default for TextureDescriptor.address_mode
+    MIRROR = cydriver.CU_TR_ADDRESS_MODE_MIRROR
+    BORDER = cydriver.CU_TR_ADDRESS_MODE_BORDER  # pairs with TextureDescriptor.border_color
+
+
+class FilterMode(IntEnum):
+    """Texel sampling mode (mirrors ``CU_TR_FILTER_MODE_*``)."""
+    POINT  = cydriver.CU_TR_FILTER_MODE_POINT  # default for filter_mode and mipmap_filter_mode
+    LINEAR = cydriver.CU_TR_FILTER_MODE_LINEAR
+
+
+class ReadMode(IntEnum):
+    """How sampled values are returned to the kernel.
+
+    - ``ELEMENT_TYPE``: return the raw element value (integer formats stay
+      integer, float stays float).
+    - ``NORMALIZED_FLOAT``: integer formats are promoted to a normalized
+      ``float`` in ``[0, 1]`` (unsigned) or ``[-1, 1]`` (signed).
+      Float formats are unaffected.
+    """
+    ELEMENT_TYPE     = 0  # from_descriptor sets CU_TRSF_READ_AS_INTEGER for this mode
+    NORMALIZED_FLOAT = 1  # from_descriptor leaves CU_TRSF_READ_AS_INTEGER clear
+
+
+class ResourceDescriptor:
+    """Describes the memory backing a :class:`TextureObject`.
+
+    Construct via the ``from_*`` classmethods:
+
+    - :meth:`from_array` wraps a :class:`CUDAArray` (works for both
+      :class:`TextureObject` and :class:`SurfaceObject`).
+    - :meth:`from_mipmapped_array` wraps a :class:`MipmappedArray` for mipmapped
+      sampling (texture only, not surface).
+    - :meth:`from_linear` wraps a :class:`Buffer` as a typed 1D fetch. Texture
+      objects built from a linear resource do not support filtering,
+      normalized coordinates, or addressing modes.
+    - :meth:`from_pitch2d` wraps a :class:`Buffer` as a row-pitched 2D image.
+      Supports filtering and 2D addressing, but only 2D access.
+
+    Linear and pitch2D resources cannot back a :class:`SurfaceObject` — those
+    require a :class:`CUDAArray` allocated with ``is_surface_load_store=True``.
+    """
+
+    __slots__ = (
+        "_kind", "_source",
+        "_format", "_num_channels",
+        "_size_bytes",
+        "_width", "_height", "_pitch_bytes",
+    )
+
+    def __init__(self, *args, **kwargs):  # accept anything so misuse always gets the RuntimeError
+        raise RuntimeError(
+            "ResourceDescriptor cannot be instantiated directly. "
+            "Use ResourceDescriptor.from_* factories."
+        )
+
+    @classmethod
+    def from_array(cls, array):
+        """Build a resource descriptor backed by a :class:`CUDAArray`."""
+        if not isinstance(array, CUDAArray):
+            raise TypeError(f"array must be a CUDAArray, got {type(array).__name__}")
+        self = cls.__new__(cls)  # bypass __init__, which always raises
+        self._kind = "array"
+        self._source = array
+        self._format = None
+        self._num_channels = None
+        self._size_bytes = None
+        self._width = None
+        self._height = None
+        self._pitch_bytes = None
+        return self
+
+    @classmethod
+    def from_mipmapped_array(cls, mipmapped_array):
+        """Build a resource descriptor backed by a :class:`MipmappedArray`.
+
+        Suitable for binding to a :class:`TextureObject` for mipmapped
+        sampling. Not valid as a :class:`SurfaceObject` backing: surfaces
+        require a single :class:`CUDAArray` level (obtain via
+        :meth:`MipmappedArray.get_level`).
+        """
+        if not isinstance(mipmapped_array, _PyMipmappedArray):
+            raise TypeError(
+                f"mipmapped_array must be a MipmappedArray, got "
+                f"{type(mipmapped_array).__name__}"
+            )
+        self = cls.__new__(cls)  # bypass __init__, which always raises
+        self._kind = "mipmapped_array"
+        self._source = mipmapped_array
+        self._format = None
+        self._num_channels = None
+        self._size_bytes = None
+        self._width = None
+        self._height = None
+        self._pitch_bytes = None
+        return self
+
+    @classmethod
+    def from_linear(cls, buffer, *, format, num_channels, size_bytes=None):
+        """Build a resource descriptor for a linear (typed 1D) texture fetch.
+
+        Parameters
+        ----------
+        buffer : Buffer
+            Device-memory backing. Referenced by this descriptor; must not be
+            freed while any :class:`TextureObject` built from it is in use.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        size_bytes : int, optional
+            Bytes of ``buffer`` to bind. Defaults to ``buffer.size``. Must not
+            exceed it.
+
+        Notes
+        -----
+        Texture objects built from a linear resource ignore the
+        :class:`TextureDescriptor` addressing/filtering fields — kernels read
+        through a typed 1D fetch with bounds checking only.
+        """
+        if not isinstance(buffer, Buffer):
+            raise TypeError(f"buffer must be a Buffer, got {type(buffer).__name__}")
+        _validate_format_channels(format, num_channels)
+
+        buf_size = int(buffer.size)
+        elem = _FORMAT_ELEM_SIZE[int(format)] * int(num_channels)  # bytes per element
+        if size_bytes is None:
+            size = buf_size
+        else:
+            size = int(size_bytes)
+            if size > buf_size:
+                raise ValueError(
+                    f"size_bytes ({size}) exceeds buffer.size ({buf_size})"
+                )
+        if size < elem:
+            raise ValueError(
+                f"size_bytes ({size}) must be at least one element ({elem} bytes)"
+            )
+        if size % elem != 0:
+            raise ValueError(
+                f"size_bytes ({size}) must be a multiple of element size "
+                f"({elem} bytes for {ArrayFormat(int(format)).name} x {num_channels})"
+            )
+
+        self = cls.__new__(cls)  # bypass __init__, which always raises
+        self._kind = "linear"
+        self._source = buffer
+        self._format = int(format)
+        self._num_channels = int(num_channels)
+        self._size_bytes = size
+        self._width = None
+        self._height = None
+        self._pitch_bytes = None
+        return self
+
+    @classmethod
+    def from_pitch2d(
+        cls, buffer, *, format, num_channels, width, height, pitch_bytes
+    ):
+        """Build a resource descriptor for a row-pitched 2D image.
+
+        Parameters
+        ----------
+        buffer : Buffer
+            Device-memory backing. Referenced by this descriptor; must not be
+            freed while any :class:`TextureObject` built from it is in use.
+        format : ArrayFormat
+            Element format.
+        num_channels : int
+            Channels per element. Must be 1, 2, or 4.
+        width : int
+            Image width, in elements.
+        height : int
+            Image height, in rows.
+        pitch_bytes : int
+            Distance between consecutive rows, in bytes. Must be at least
+            ``width * format_size * num_channels`` and meet the driver's
+            ``CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT`` (alignment is not checked here).
+        """
+        if not isinstance(buffer, Buffer):
+            raise TypeError(f"buffer must be a Buffer, got {type(buffer).__name__}")
+        _validate_format_channels(format, num_channels)
+
+        w = int(width)
+        h = int(height)
+        p = int(pitch_bytes)
+        if w < 1:
+            raise ValueError(f"width must be >= 1, got {w}")
+        if h < 1:
+            raise ValueError(f"height must be >= 1, got {h}")
+        elem = _FORMAT_ELEM_SIZE[int(format)] * int(num_channels)  # bytes per element
+        min_pitch = w * elem
+        if p < min_pitch:
+            raise ValueError(
+                f"pitch_bytes ({p}) must be >= width * element_size ({min_pitch})"
+            )
+        if p * h > int(buffer.size):  # conservative: counts a full pitch for the last row too
+            raise ValueError(
+                f"pitch_bytes * height ({p * h}) exceeds buffer.size ({int(buffer.size)})"
+            )
+
+        self = cls.__new__(cls)  # bypass __init__, which always raises
+        self._kind = "pitch2d"
+        self._source = buffer
+        self._format = int(format)
+        self._num_channels = int(num_channels)
+        self._size_bytes = None
+        self._width = w
+        self._height = h
+        self._pitch_bytes = p
+        return self
+
+    @property
+    def kind(self):
+        return self._kind  # "array", "mipmapped_array", "linear", or "pitch2d"
+
+    @property
+    def source(self):
+        return self._source  # the CUDAArray, MipmappedArray, or Buffer given to the factory
+
+    @property
+    def format(self):
+        """The element :class:`ArrayFormat` (``None`` for array-backed)."""
+        return None if self._format is None else ArrayFormat(self._format)
+
+    @property
+    def num_channels(self):
+        """Channels per element (``None`` for array-backed)."""
+        return self._num_channels
+
+    @property
+    def size_bytes(self):
+        """Bytes bound for a linear resource (``None`` for other kinds)."""
+        return self._size_bytes
+
+    @property
+    def width(self):
+        """Pitch2D image width, in elements (``None`` for other kinds)."""
+        return self._width
+
+    @property
+    def height(self):
+        """Pitch2D image height, in rows (``None`` for other kinds)."""
+        return self._height
+
+    @property
+    def pitch_bytes(self):
+        """Pitch2D row pitch, in bytes (``None`` for other kinds)."""
+        return self._pitch_bytes
+
+    def __repr__(self):
+        if self._kind == "linear":
+            return (
+                f"ResourceDescriptor(kind='linear', format={self.format.name}, "
+                f"num_channels={self._num_channels}, size_bytes={self._size_bytes})"
+            )
+        if self._kind == "pitch2d":
+            return (
+                f"ResourceDescriptor(kind='pitch2d', format={self.format.name}, "
+                f"num_channels={self._num_channels}, "
+                f"width={self._width}, height={self._height}, "
+                f"pitch_bytes={self._pitch_bytes})"
+            )
+        return f"ResourceDescriptor(kind={self._kind!r})"  # array-backed kinds carry no extra fields
+
+
+@dataclass
+class TextureDescriptor:
+    """Sampling state for a :class:`TextureObject` (mirrors ``CUDA_TEXTURE_DESC``).
+
+    Attributes
+    ----------
+    address_mode : AddressMode or tuple of AddressMode
+        Boundary behavior per axis. May be a single :class:`AddressMode` (applied
+        to all axes) or a tuple of 1-3 entries (one per dimension).
+    filter_mode : FilterMode
+        Texel sampling mode. Default ``POINT``.
+    read_mode : ReadMode
+        How sampled integer values are returned. Default ``ELEMENT_TYPE``.
+    normalized_coords : bool
+        If True, coordinates are in ``[0, 1]`` instead of pixel indices.
+    srgb : bool
+        If True, perform sRGB → linear conversion on read (8-bit formats only).
+    disable_trilinear_optimization : bool
+        If True, request exact trilinear filtering.
+    seamless_cubemap : bool
+        If True, enable seamless cubemap edge filtering.
+    max_anisotropy : int
+        Maximum anisotropy; 0 disables anisotropic filtering.
+    mipmap_filter_mode : FilterMode
+        Filtering between mipmap levels. Default ``POINT``.
+    mipmap_level_bias : float
+    min_mipmap_level_clamp : float
+    max_mipmap_level_clamp : float
+    border_color : tuple of float or None
+        4-tuple used when ``address_mode`` includes ``BORDER``; ``None`` means
+        zero.
+    """
+
+    address_mode: AddressMode | tuple[AddressMode, ...] = AddressMode.CLAMP
+    filter_mode: FilterMode = FilterMode.POINT
+    read_mode: ReadMode = ReadMode.ELEMENT_TYPE
+    normalized_coords: bool = False
+    srgb: bool = False
+    disable_trilinear_optimization: bool = False
+    seamless_cubemap: bool = False
+    max_anisotropy: int = 0  # negative values are rejected by TextureObject.from_descriptor
+    mipmap_filter_mode: FilterMode = FilterMode.POINT
+    mipmap_level_bias: float = 0.0  # copied to CUDA_TEXTURE_DESC.mipmapLevelBias
+    min_mipmap_level_clamp: float = 0.0  # copied to CUDA_TEXTURE_DESC.minMipmapLevelClamp
+    max_mipmap_level_clamp: float = 0.0  # copied to maxMipmapLevelClamp; NOTE(review): 0.0 presumably limits sampling to level 0 — confirm
+    border_color: tuple[float, ...] | None = None  # must have exactly 4 entries when given
+
+
+def _normalize_address_modes(address_mode):
+    """Expand a scalar AddressMode or a 1-3 entry sequence into a per-axis 3-tuple."""
+    if isinstance(address_mode, AddressMode):
+        return (address_mode,) * 3
+    try:
+        entries = tuple(address_mode)
+    except TypeError as exc:
+        raise TypeError(
+            "address_mode must be an AddressMode or a tuple of AddressMode"
+        ) from exc
+    count = len(entries)
+    if count < 1 or count > 3:
+        raise ValueError(
+            f"address_mode tuple must have 1-3 entries, got {count}"
+        )
+    for axis, entry in enumerate(entries):
+        if not isinstance(entry, AddressMode):
+            raise TypeError(
+                f"address_mode[{axis}] must be an AddressMode, got {type(entry).__name__}"
+            )
+    # Axes that were not supplied inherit the last mode that was.
+    return entries + (entries[-1],) * (3 - count)
+
+
+cdef class TextureObject:
+    """A bindless texture handle for kernel-side sampled reads.
+
+    Wraps ``cuTexObjectCreate``. The underlying memory resource (e.g. the
+    :class:`CUDAArray` referenced by the descriptor) is kept alive until
+    :meth:`close` has destroyed the handle or this object is collected.
+
+    Construct via :meth:`from_descriptor`. Passes to kernels as a 64-bit
+    handle (via the ``handle`` property).
+    """
+
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError(
+            "TextureObject cannot be instantiated directly. "
+            "Use TextureObject.from_descriptor()."
+        )
+
+    @classmethod
+    def from_descriptor(cls, *, resource, texture_descriptor):
+        """Create a texture object from a resource + sampling descriptor.
+
+        Parameters
+        ----------
+        resource : ResourceDescriptor
+        texture_descriptor : TextureDescriptor
+        """
+        if not isinstance(resource, ResourceDescriptor):
+            raise TypeError(
+                f"resource must be a ResourceDescriptor, got "
+                f"{type(resource).__name__}"
+            )
+        if not isinstance(texture_descriptor, TextureDescriptor):
+            raise TypeError(
+                f"texture_descriptor must be a TextureDescriptor, got "
+                f"{type(texture_descriptor).__name__}"
+            )
+
+        cdef cydriver.CUDA_RESOURCE_DESC res_desc
+        cdef cydriver.CUDA_TEXTURE_DESC tex_desc
+        memset(&res_desc, 0, sizeof(res_desc))
+        memset(&tex_desc, 0, sizeof(tex_desc))
+
+        # --- Resource descriptor ---
+        cdef CUDAArray arr
+        cdef MipmappedArray mip
+        cdef Buffer buf
+        cdef intptr_t devptr
+        if resource.kind == "array":
+            arr = <CUDAArray>resource.source
+            res_desc.resType = cydriver.CU_RESOURCE_TYPE_ARRAY
+            res_desc.res.array.hArray = arr._handle
+        elif resource.kind == "mipmapped_array":
+            mip = <MipmappedArray>resource.source
+            res_desc.resType = cydriver.CU_RESOURCE_TYPE_MIPMAPPED_ARRAY
+            res_desc.res.mipmap.hMipmappedArray = mip._handle
+        elif resource.kind == "linear":
+            buf = <Buffer>resource.source
+            devptr = int(buf.handle)
+            res_desc.resType = cydriver.CU_RESOURCE_TYPE_LINEAR
+            res_desc.res.linear.devPtr = <cydriver.CUdeviceptr>devptr
+            res_desc.res.linear.format = <cydriver.CUarray_format><int>resource._format
+            res_desc.res.linear.numChannels = <unsigned int>resource._num_channels
+            res_desc.res.linear.sizeInBytes = <size_t>resource._size_bytes
+        elif resource.kind == "pitch2d":
+            buf = <Buffer>resource.source
+            devptr = int(buf.handle)
+            res_desc.resType = cydriver.CU_RESOURCE_TYPE_PITCH2D
+            res_desc.res.pitch2D.devPtr = <cydriver.CUdeviceptr>devptr
+            res_desc.res.pitch2D.format = <cydriver.CUarray_format><int>resource._format
+            res_desc.res.pitch2D.numChannels = <unsigned int>resource._num_channels
+            res_desc.res.pitch2D.width = <size_t>resource._width
+            res_desc.res.pitch2D.height = <size_t>resource._height
+            res_desc.res.pitch2D.pitchInBytes = <size_t>resource._pitch_bytes
+        else:
+            raise NotImplementedError(
+                f"ResourceDescriptor kind {resource.kind!r} is not yet supported"
+            )
+
+        # --- Texture descriptor ---
+        modes = _normalize_address_modes(texture_descriptor.address_mode)
+        tex_desc.addressMode[0] = <cydriver.CUaddress_mode><int>modes[0]
+        tex_desc.addressMode[1] = <cydriver.CUaddress_mode><int>modes[1]
+        tex_desc.addressMode[2] = <cydriver.CUaddress_mode><int>modes[2]
+
+        if not isinstance(texture_descriptor.filter_mode, FilterMode):
+            raise TypeError(
+                f"filter_mode must be a FilterMode, got "
+                f"{type(texture_descriptor.filter_mode).__name__}"
+            )
+        tex_desc.filterMode = <cydriver.CUfilter_mode><int>texture_descriptor.filter_mode
+
+        if not isinstance(texture_descriptor.read_mode, ReadMode):
+            raise TypeError(
+                f"read_mode must be a ReadMode, got "
+                f"{type(texture_descriptor.read_mode).__name__}"
+            )
+
+        cdef unsigned int flags = 0
+        # CU_TRSF_READ_AS_INTEGER suppresses normalization, so it maps to
+        # ReadMode.ELEMENT_TYPE.
+        if texture_descriptor.read_mode == ReadMode.ELEMENT_TYPE:
+            flags |= _TRSF_READ_AS_INTEGER
+        if texture_descriptor.normalized_coords:
+            flags |= _TRSF_NORMALIZED_COORDINATES
+        if texture_descriptor.srgb:
+            flags |= _TRSF_SRGB
+        if texture_descriptor.disable_trilinear_optimization:
+            flags |= _TRSF_DISABLE_TRILINEAR_OPTIMIZATION
+        if texture_descriptor.seamless_cubemap:
+            flags |= _TRSF_SEAMLESS_CUBEMAP
+        tex_desc.flags = flags
+
+        if texture_descriptor.max_anisotropy < 0:
+            raise ValueError("max_anisotropy must be >= 0")
+        tex_desc.maxAnisotropy = <unsigned int>texture_descriptor.max_anisotropy
+
+        if not isinstance(texture_descriptor.mipmap_filter_mode, FilterMode):
+            raise TypeError(
+                f"mipmap_filter_mode must be a FilterMode, got "
+                f"{type(texture_descriptor.mipmap_filter_mode).__name__}"
+            )
+        tex_desc.mipmapFilterMode = <cydriver.CUfilter_mode><int>texture_descriptor.mipmap_filter_mode
+        tex_desc.mipmapLevelBias = <float>texture_descriptor.mipmap_level_bias
+        tex_desc.minMipmapLevelClamp = <float>texture_descriptor.min_mipmap_level_clamp
+        tex_desc.maxMipmapLevelClamp = <float>texture_descriptor.max_mipmap_level_clamp
+
+        cdef int i
+        if texture_descriptor.border_color is None:
+            for i in range(4):
+                tex_desc.borderColor[i] = 0.0
+        else:
+            bc = tuple(texture_descriptor.border_color)
+            if len(bc) != 4:
+                raise ValueError(
+                    f"border_color must have 4 elements, got {len(bc)}"
+                )
+            for i in range(4):
+                tex_desc.borderColor[i] = <float>bc[i]
+
+        cdef TextureObject self = cls.__new__(cls)  # bypasses __init__, which always raises
+        self._source_ref = resource  # keeps the descriptor and its backing object referenced
+        self._texture_desc = texture_descriptor
+        self._device_id = _get_current_device_id()
+
+        with nogil:
+            HANDLE_RETURN(
+                cydriver.cuTexObjectCreate(&self._handle, &res_desc, &tex_desc, NULL)
+            )
+        return self
+
+    @property
+    def handle(self):
+        """The underlying ``CUtexObject`` as an integer (64-bit kernel arg)."""
+        return <unsigned long long>self._handle
+
+    @property
+    def resource(self):
+        """The :class:`ResourceDescriptor` this texture was built from."""
+        return self._source_ref
+
+    @property
+    def texture_descriptor(self):
+        """The :class:`TextureDescriptor` this texture was built from."""
+        return self._texture_desc
+
+    @property
+    def device(self):  # Device for the device id recorded in from_descriptor
+        from cuda.core._device import Device
+        return Device(self._device_id)
+
+    cpdef close(self):
+        """Destroy the underlying ``CUtexObject``, then release the backing resource."""
+        cdef cydriver.CUtexObject h = self._handle
+        self._handle = 0  # cleared first so a failed destroy is never retried
+        if h != 0:
+            HANDLE_RETURN(cydriver.cuTexObjectDestroy(h))
+        self._source_ref = None  # only after the handle is gone, so the backing outlives it
+
+    def __dealloc__(self):
+        # Cython destructors cannot raise; any cuTexObjectDestroy error is
+        # silently dropped. Callers needing visibility should use close().
+        if self._handle != 0:
+            cydriver.cuTexObjectDestroy(self._handle)
+            self._handle = 0
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        self.close()
+
+    def __repr__(self):
+        return f"TextureObject(handle=0x{<unsigned long long>self._handle:x})"
diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pxd b/cuda_core/cuda/core/_utils/cuda_utils.pxd
index 4562cd71355..11e464e6381 100644
--- a/cuda_core/cuda/core/_utils/cuda_utils.pxd
+++ b/cuda_core/cuda/core/_utils/cuda_utils.pxd
@@ -25,6 +25,11 @@ cdef int HANDLE_RETURN_NVJITLINK(
     cynvjitlink.nvJitLinkHandle handle, cynvjitlink.nvJitLinkResult err) except?-1 nogil
 
 
+# Helper for retrieving the current CUDA device (cuCtxGetDevice). Raises via
+# HANDLE_RETURN on driver errors, e.g. no context bound to the calling thread.
+cdef int _get_current_device_id() except? -1
+
+
 # TODO: stop exposing these within the codebase?
 cpdef int _check_driver_error(cydriver.CUresult error) except?-1 nogil
 cpdef int _check_runtime_error(error) except?-1
diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx
index 4e20f689b5a..318d4466bee 100644
--- a/cuda_core/cuda/core/_utils/cuda_utils.pyx
+++ b/cuda_core/cuda/core/_utils/cuda_utils.pyx
@@ -69,6 +69,14 @@ cdef int HANDLE_RETURN(cydriver.CUresult err) except?-1 nogil:
     return 0
 
 
+cdef int _get_current_device_id() except? -1:
+    """Return the CUdevice reported by cuCtxGetDevice for the calling thread, as an int."""
+    cdef cydriver.CUdevice dev
+    with nogil:
+        HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev))  # driver call made without the GIL
+    return <int>dev
+
+
 cdef int HANDLE_RETURN_NVRTC(cynvrtc.nvrtcProgram prog, cynvrtc.nvrtcResult err) except?-1 nogil:
     """Handle NVRTC result codes, raising NVRTCError with program log on failure."""
     if err == cynvrtc.nvrtcResult.NVRTC_SUCCESS:
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index b1498b57da3..8d46e34e556 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -161,6 +161,40 @@ Tensor Memory Accelerator (TMA)
    TensorMapDescriptorOptions
 
 
+Textures and surfaces
+---------------------
+
+CUDA arrays back bindless texture and surface objects for kernel-side sampled
+reads and typed load/store. :class:`CUDAArray` is allocated through
+:meth:`CUDAArray.from_descriptor` and bound through a :class:`ResourceDescriptor`
+factory; linear (1D) and row-pitched 2D :class:`Buffer` views as well as
+mipmapped allocations (:class:`MipmappedArray`) are also supported as texture
+backings.
+
+.. autosummary::
+   :toctree: generated/
+
+   :template: autosummary/cyclass.rst
+
+   CUDAArray
+   MipmappedArray
+   ResourceDescriptor
+   TextureObject
+   SurfaceObject
+
+   :template: dataclass.rst
+
+   TextureDescriptor
+
+.. autosummary::
+   :toctree: generated/
+
+   ArrayFormat
+   AddressMode
+   FilterMode
+   ReadMode
+
+
 CUDA compilation toolchain
 --------------------------
 
diff --git a/cuda_core/examples/gl_interop_bloom.py b/cuda_core/examples/gl_interop_bloom.py
new file mode 100644
index 00000000000..66fa95f1f61
--- /dev/null
+++ b/cuda_core/examples/gl_interop_bloom.py
@@ -0,0 +1,793 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates the cuda.core texture/surface stack used to build a
+# bloom / glow post-effect entirely on the GPU. An animated HDR-ish scene is
+# rendered into the base level of a MipmappedArray; the mip pyramid is then
+# built level by level via SurfaceObject writes (level i reads level i-1
+# through a per-level LINEAR TextureObject); finally a single mipmapped
+# TextureObject samples several LODs with tex2DLod to composite a soft bloom on
+# top of the sharp scene. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# The least-demonstrated corner of the texture/surface API: the two halves of a
+# mip pyramid round-trip.
+#
+# - BUILD side: MipmappedArray.get_level(i) returns a NON-OWNING CUDAArray view
+#   of level i. Bind each level as its own SurfaceObject and have a kernel write
+#   into it. We downsample by reading level i-1 through a per-level LINEAR
+#   TextureObject (one bilinear tap == a 2x2 box average) and storing into
+#   level i through that level's SurfaceObject. This is a mip chain built
+#   *on the GPU*, not by the driver.
+# - SAMPLE side: ONE mipmapped TextureObject (FilterMode.LINEAR +
+#   mipmap_filter_mode=LINEAR, normalized coords) bound to the whole pyramid via
+#   ResourceDescriptor.from_mipmapped_array lets a single tex2DLod<float4> read
+#   any level -- the blurred coarse levels are exactly the glow.
+#
+# How it works
+# ============
+# Bloom is "blur the bright parts, add them back." A mip pyramid is a ready-made
+# multi-scale blur: each coarser level is a halved, box-filtered copy of the
+# next finer level, so reading a high LOD is reading a heavily blurred image.
+#
+#     level 0: 512 x 512   <- sharp animated scene (the emitters)
+#     level 1: 256 x 256       (downsampled via SurfaceObject write)
+#     level 2: 128 x 128
+#     ...
+#     level L-1: small        <- the softest, widest glow
+#
+#   PER FRAME (render loop)
+#   ~~~~~~~~~~~~~~~~~~~~~~~
+#   1. render_scene  -- writes an animated scene of moving bright emitters into
+#                       level 0 through its SurfaceObject (float4 RGBA, values
+#                       can exceed 1.0 in the hot spots).
+#   2. downsample    -- for i in 1..L-1, read level i-1 through its LINEAR
+#                       TextureObject and write level i through its
+#                       SurfaceObject. A single LINEAR tap at the midpoint of
+#                       the parent's 2x2 footprint *is* the box average.
+#   3. composite     -- one mipmapped TextureObject; tex2DLod at lod 0 gives the
+#                       sharp scene, and a weighted sum of lods 1..L-1 gives the
+#                       bloom. Tonemap with 1 - exp(-c*x) and write RGBA8 to the
+#                       OpenGL PBO.
+#
+#   surf2Dwrite indexes x in BYTES, so a float4 write uses x * sizeof(float4)
+#   (= x * 16). Getting this wrong silently corrupts every fourth column.
+#
+# What you should see
+# ===================
+# Several colored emitters orbiting on a dark background, each wrapped in a soft
+# glow. Bright cores bleed light into their surroundings.
+#
+#   +  /  =           bloom strength += 0.15
+#   -                 bloom strength -= 0.15
+#   [                 bloom threshold -= 0.05 (more of the scene glows)
+#   ]                 bloom threshold += 0.05 (only the brightest glow)
+#   ,  /  .           mipmap_level_bias -= / += 0.25 (sharper / softer glow)
+#   ;  /  '           LODs summed -= / += 1 (the live max-LOD clamp)
+#   B                 toggle bloom on / off (makes the effect obvious)
+#   R                 reset all controls
+#   Escape / close    quit
+#
+# The window title shows FPS plus the live mipmap LOD-selection config
+# (MipmappedArray level count, trilinear tex2DLod bias / clamp / LODs) and the
+# bloom strength, threshold, and on/off state.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    MipmappedArray,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Configuration (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 800  # Window width handed to pyglet.
+HEIGHT = 600  # Window height handed to pyglet.
+BASE_SIZE = 512  # Mip base-level edge length (power of two so levels halve cleanly).
+MAX_LEVELS = 7  # Modest cap on pyramid depth; bounded by log2(BASE_SIZE)+1.
+NUM_EMITTERS = 7  # Presumably the number of emitters in the rendered scene; confirm against the kernel.
+
+BLOOM_STRENGTH_STEP = 0.15  # Bloom-strength change per "+" / "-" key press.
+BLOOM_THRESHOLD_STEP = 0.05  # Bloom-threshold change per "[" / "]" key press.
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA, OpenGL, and the mip pyramid. If you're here
+# to learn about MipmappedArray / per-level SurfaceObject writes / mipmapped
+# TextureObject sampling, skip straight to main() -- the interesting part is
+# there. These helpers keep main() reading like a short story.
+# ============================================================================
+
+
+def _check_compute_capability(dev):
+    """Exit with status 1 unless ``dev`` reports compute capability major >= 3."""
+    capability = dev.compute_capability
+    if capability.major >= 3:
+        return
+    found = f"sm_{capability.major}{capability.minor}"
+    message = f"This example requires compute capability >= 3.0, got {found}."
+    print(message, file=sys.stderr)
+    sys.exit(1)
+
+
+def setup_cuda():
+    """Select device 0, compile the example's kernels, and return the pieces.
+
+    Returns (device, stream, kernels); kernels is a dict keyed by
+    "render_scene", "downsample", and "composite". Exits through
+    _check_compute_capability() when the device is too old.
+    """
+    kernel_names = ("render_scene", "downsample", "composite")
+
+    device = Device(0)
+    device.set_current()
+    _check_compute_capability(device)
+    stream = device.create_stream()
+
+    # Target the selected device's own architecture.
+    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+    program = Program(KERNEL_SOURCE, code_type="c++", options=options)
+    module = program.compile("cubin", name_expressions=kernel_names)
+
+    # One lookup per requested name, in declaration order.
+    kernels = {name: module.get_kernel(name) for name in kernel_names}
+    return device, stream, kernels
+
+
+def make_level_grid(level_size, block):
+    """Blocks per axis, as (x, y, 1), needed to tile a level_size-square image."""
+    block_x, block_y = block[0], block[1]
+    # Round up so a partial trailing block still covers the image edge.
+    grid_x = (level_size + block_x - 1) // block_x
+    grid_y = (level_size + block_y - 1) // block_y
+    return (grid_x, grid_y, 1)
+
+
+def create_window():
+    """Open the example's pyglet window.
+
+    Returns (window, gl_module, pyglet). pyglet is imported here, not at
+    module scope, so a missing install prints the hint below and exits.
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    title = "cuda.core MipmappedArray - GPU mip-pyramid bloom"
+    window = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Standard GL boilerplate (not CUDA-specific): a shader program, a
+    fullscreen quad, and an empty width x height RGBA8 texture to fill later.
+
+    Returns (shader_program, vertex_array_id, texture_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    quad_verts = np.array(
+        [
+            # Six vertices (two triangles), each x, y, s, t: position + texcoord.
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            -1,
+            1,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # bytes per vertex: 4 floats (x, y, s, t) * 4 bytes each
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate an RGBA8 pixel-unpack buffer; return (pbo_gl_name, size_in_bytes)."""
+    # This Pixel Buffer Object (PBO) is the CUDA/GL bridge.
+    name = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(name))
+    target = gl.GL_PIXEL_UNPACK_BUFFER
+    gl.glBindBuffer(target, name.value)
+    size_in_bytes = width * height * 4  # 4 bytes per RGBA8 pixel
+    # No initial data (None): only the storage size is given here.
+    gl.glBufferData(target, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(target, 0)  # unbind again before returning
+    return name.value, size_in_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy width x height RGBA8 pixels from the PBO into the GL texture (GPU-to-GPU)."""
+    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
+    texture_target = gl.GL_TEXTURE_2D
+    mip_level, x_offset, y_offset = 0, 0, 0
+
+    gl.glBindBuffer(unpack_target, pbo_id)
+    gl.glBindTexture(texture_target, tex_id)
+    # None as the data argument: the pixels come from the bound unpack buffer
+    # (the PBO), not from host memory.
+    gl.glTexSubImage2D(
+        texture_target, mip_level, x_offset, y_offset,
+        width, height,
+        gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None,
+    )
+    gl.glBindBuffer(unpack_target, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture ``tex_id`` to the screen using the quad in ``vao_id``."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices, drawn as triangles
+    gl.glBindVertexArray(0)  # unbind the vertex array and the program again
+    gl.glUseProgram(0)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Run the bloom demo: set up CUDA and GL state, then enter the pyglet loop."""
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels = setup_cuda()
+
+    # --- Step 2: Allocate the mip pyramid (single allocation, all levels) ---
+    #     is_surface_load_store=True lets each level back a SurfaceObject for
+    #     kernel-side writes. Depth is capped at MAX_LEVELS; levels halve to 1x1.
+    num_levels = min(int(math.log2(BASE_SIZE)) + 1, MAX_LEVELS)
+    mm = MipmappedArray.from_descriptor(
+        shape=(BASE_SIZE, BASE_SIZE),
+        format=ArrayFormat.FLOAT32,
+        num_channels=4,
+        num_levels=num_levels,
+        is_surface_load_store=True,
+    )
+
+    # --- Step 3: Pre-create per-level handles ONCE and keep them alive ---
+    #     For every level we build a SurfaceObject (to write into it) and a
+    #     non-mipmapped LINEAR TextureObject (so the downsample kernel can read
+    #     the level above with hardware bilinear). get_level(i) returns a
+    #     NON-OWNING view -- the storage belongs to `mm`, which we keep alive.
+    #     Building these per-frame would be wasteful and, worse, a handle closed
+    #     before its async launch runs would dangle.
+    level_sizes = [BASE_SIZE >> i for i in range(num_levels)]
+    level_arrays = [mm.get_level(i) for i in range(num_levels)]  # keep views alive
+
+    src_tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.LINEAR,  # one bilinear tap == 2x2 box average
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=False,  # integer/pixel coordinates for the box tap
+    )
+    level_surfaces = [SurfaceObject.from_array(arr) for arr in level_arrays]
+    level_textures = [
+        TextureObject.from_descriptor(
+            resource=ResourceDescriptor.from_array(arr),
+            texture_descriptor=src_tex_desc,
+        )
+        for arr in level_arrays
+    ]
+
+    # --- Step 4: One mipmapped TextureObject over the WHOLE pyramid ---
+    #     This is the sample side: tex2DLod can fetch any LOD from it, so the
+    #     composite kernel reads the sharp scene (lod 0) and the blurred glow
+    #     (lods 1..L-1) through this single handle. WRAP/MIRROR need normalized
+    #     coords; we use CLAMP + normalized so a level's edge does not bleed in.
+    #
+    #   API MAP -- the mip pyramid round-trip
+    #   =====================================
+    #   BUILD on the GPU:   MipmappedArray.from_descriptor(...) allocates the
+    #                       whole chain; mm.get_level(i) hands back a NON-OWNING
+    #                       CUDAArray view of each level that we bind to a
+    #                       per-level SurfaceObject and write into (the loop in
+    #                       on_draw). The driver never builds the mips -- we do.
+    #   READ it back:       ResourceDescriptor.from_mipmapped_array(mm) wraps the
+    #                       SAME chain in ONE mipmapped TextureObject. tex2DLod
+    #                       then samples any LOD with trilinear filtering.
+    #   LOD selection knobs (TextureDescriptor):
+    #     mipmap_filter_mode=LINEAR  -> trilinear: blend BETWEEN the two nearest
+    #                                   integer LODs (vs NEAREST = snap to one).
+    #     mipmap_level_bias          -> constant added to the requested LOD.
+    #     min/max_mipmap_level_clamp -> clamp the effective LOD to a range.
+    #   These descriptor fields are baked at construction (the texture is created
+    #   ONCE, per the invariants). To demonstrate them INTERACTIVELY, the
+    #   composite kernel folds the SAME bias/clamp math into its explicit
+    #   tex2DLod `lod` argument -- live keys move bias / max-LOD without ever
+    #   rebuilding the texture, while the descriptor encodes the static defaults.
+    mip_tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=True,
+        mipmap_filter_mode=FilterMode.LINEAR,  # trilinear between levels
+        mipmap_level_bias=0.0,
+        min_mipmap_level_clamp=0.0,
+        max_mipmap_level_clamp=float(num_levels - 1),
+    )
+    mip_tex = TextureObject.from_descriptor(
+        resource=ResourceDescriptor.from_mipmapped_array(mm),
+        texture_descriptor=mip_tex_desc,
+    )
+
+    # --- Step 5: Open a window and set up the GL/CUDA bridge ---
+    window, gl, pyglet = create_window()
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Render loop state + launch configs ---
+    state = {
+        "strength": 1.8,  # bloom intensity multiplier
+        "threshold": 0.6,  # only luminance above this contributes to bloom
+        "bloom_on": True,
+        # --- Live LOD-selection controls (folded into the tex2DLod loop) ---
+        "bias": 0.5,  # mipmap_level_bias added to each bloom tap's LOD
+        "num_lods": max(1, num_levels - 1),  # how many LODs the bloom sums
+        "min_clamp": 0.0,  # min_mipmap_level_clamp (shown; static default)
+    }
+    max_clamp = float(num_levels - 1)  # max_mipmap_level_clamp ceiling
+    start_time = time.monotonic()
+    frame_count = [0]
+    fps_time = [start_time]
+
+    block = (16, 16, 1)
+    # The composite kernel covers the WIDTHxHEIGHT screen.
+    composite_config = LaunchConfig(grid=make_level_grid_screen(block), block=block)
+
+    @window.event
+    def on_draw():
+        window.clear()
+        t = time.monotonic() - start_time
+
+        # (a) Render the animated HDR-ish scene into level 0's surface.
+        launch(
+            stream,
+            LaunchConfig(grid=make_level_grid(BASE_SIZE, block), block=block),
+            kernels["render_scene"],
+            np.uint64(level_surfaces[0].handle),
+            np.int32(BASE_SIZE),
+            np.int32(BASE_SIZE),
+            np.float32(t),
+            np.int32(NUM_EMITTERS),
+        )
+
+        # (b) Build the pyramid on the GPU: each level i reads level i-1 via its
+        #     LINEAR TextureObject and writes level i via its SurfaceObject.
+        for i in range(1, num_levels):
+            dst_size = level_sizes[i]
+            launch(
+                stream,
+                LaunchConfig(grid=make_level_grid(dst_size, block), block=block),
+                kernels["downsample"],
+                np.uint64(level_textures[i - 1].handle),  # read parent level
+                np.uint64(level_surfaces[i].handle),  # write this level
+                np.int32(dst_size),
+            )
+
+        # (c) Composite: one mipmapped texture, sample several LODs, tonemap,
+        #     and write RGBA8 straight into the PBO.
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                composite_config,
+                kernels["composite"],
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.uint64(mip_tex.handle),
+                np.float32(state["strength"]),
+                np.float32(state["threshold"]),
+                np.int32(state["num_lods"]),  # number of bloom LODs summed
+                np.float32(state["bias"]),  # mipmap_level_bias folded into tex2DLod
+                np.float32(max_clamp),  # max_mipmap_level_clamp ceiling
+                np.int32(1 if state["bloom_on"] else 0),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        frame_count[0] += 1
+        now = time.monotonic()
+        if now - fps_time[0] >= 1.0:
+            fps = frame_count[0] / (now - fps_time[0])
+            window.set_caption(
+                f"GPU mip-pyramid bloom ({WIDTH}x{HEIGHT}, {fps:.0f} FPS) | "
+                f"MipmappedArray[{num_levels} lvls] + tex2DLod[trilinear, "
+                f"bias={state['bias']:+.2f}, "
+                f"clamp={state['min_clamp']:.0f}..{max_clamp:.0f}, "
+                f"lods={state['num_lods']}] | "
+                f"bloom={state['strength']:.2f} "
+                f"thr={state['threshold']:.2f} "
+                f"{'ON' if state['bloom_on'] else 'OFF'}"
+            )
+            frame_count[0] = 0
+            fps_time[0] = now
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+        elif symbol in (key.PLUS, key.EQUAL, key.NUM_ADD):
+            state["strength"] = min(8.0, state["strength"] + BLOOM_STRENGTH_STEP)
+        elif symbol in (key.MINUS, key.NUM_SUBTRACT):
+            state["strength"] = max(0.0, state["strength"] - BLOOM_STRENGTH_STEP)
+        elif symbol == key.BRACKETLEFT:
+            state["threshold"] = max(0.0, state["threshold"] - BLOOM_THRESHOLD_STEP)
+        elif symbol == key.BRACKETRIGHT:
+            state["threshold"] = min(4.0, state["threshold"] + BLOOM_THRESHOLD_STEP)
+        elif symbol == key.COMMA:
+            state["bias"] = max(-float(num_levels - 1), state["bias"] - 0.25)
+        elif symbol == key.PERIOD:
+            state["bias"] = min(float(num_levels - 1), state["bias"] + 0.25)
+        elif symbol == key.SEMICOLON:
+            state["num_lods"] = max(1, state["num_lods"] - 1)
+        elif symbol == key.APOSTROPHE:
+            state["num_lods"] = min(max(1, num_levels - 1), state["num_lods"] + 1)
+        elif symbol == key.B:
+            state["bloom_on"] = not state["bloom_on"]
+        elif symbol == key.R:
+            state["strength"] = 1.8
+            state["threshold"] = 0.6
+            state["bloom_on"] = True
+            state["bias"] = 0.5
+            state["num_lods"] = max(1, num_levels - 1)
+
+    @window.event
+    def on_close():
+        # Drain in-flight launches, then release CUDA resources in reverse build
+        # order (`mm` LAST: level views borrow it). pyglet cleans up GL objects.
+        stream.sync()
+        resource.close()
+        mip_tex.close()
+        for tex in level_textures:
+            tex.close()
+        for surf in level_surfaces:
+            surf.close()
+        mm.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+def make_level_grid_screen(block):
+    """Return the (x, y, 1) grid whose blocks cover the WIDTH x HEIGHT screen."""
+    block_x, block_y = block[0], block[1]
+    # Ceiling division: a partially filled edge block still has to launch.
+    grid_x = (WIDTH + block_x - 1) // block_x
+    grid_y = (HEIGHT + block_y - 1) // block_y
+    return (grid_x, grid_y, 1)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# Three CUDA kernels are concatenated into one program string so they share a
+# single NVRTC compile. All three operate on float4 RGBA pixels.
+#
+#   render_scene -- writes an animated scene of moving bright emitters into mip
+#                   level 0 via a SurfaceObject. Hot cores exceed 1.0 so the
+#                   bloom has something to bleed. NOTE: surf2Dwrite's x is in
+#                   BYTES, so we multiply by sizeof(float4) (= 16).
+#
+#   downsample   -- reads level L-1 through a LINEAR TextureObject and writes
+#                   level L through a SurfaceObject. With LINEAR filtering and
+#                   non-normalized coords, ONE tap at the midpoint of the
+#                   parent's 2x2 footprint -- (2x + 1.0, 2y + 1.0) -- equals the
+#                   4-texel box average. (A POINT-sampled +0.5 offset would be
+#                   a single texel, NOT the average; the +1.0 midpoint is the
+#                   crux of this example.)
+#
+#   composite    -- samples the WHOLE pyramid through one mipmapped texture.
+#                   tex2DLod(...,0) is the sharp scene; a weighted sum of taps
+#                   at lod+bias (lod 1..num_lods, clamped to max_lod) is the
+#                   glow. We threshold each tap's luminance, scale by `strength`,
+#                   add the sharp scene, tonemap with 1-exp(-x), write RGBA8.
+#
+# GLSL shaders at the very bottom just draw a textured quad. Nothing CUDA-
+# specific there.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+__device__ __forceinline__ float clampf(float v, float a, float b) {
+    return fminf(fmaxf(v, a), b);
+}
+
+__device__ __forceinline__ float luminance(float4 c) {
+    return 0.2126f * c.x + 0.7152f * c.y + 0.0722f * c.z;
+}
+
+// --------------------------------------------------------------------------
+// render_scene: animated bright emitters on a dark background -> level 0.
+//
+// `surf` is a SurfaceObject bound to mip level 0 (float4 RGBA). Each emitter
+// orbits the center and contributes a sharp colored core whose intensity can
+// exceed 1.0, giving the bloom pass something to bleed.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void render_scene(cudaSurfaceObject_t surf, int width, int height,
+                  float t, int num_emitters) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float u = ((float)x + 0.5f) / (float)width;
+    float v = ((float)y + 0.5f) / (float)height;
+
+    // Faint moving background wash so the frame is never fully black.
+    float bg = 0.04f + 0.02f * sinf(6.2831853f * (u + v) + t * 0.5f);
+    float3 color = make_float3(bg * 0.4f, bg * 0.5f, bg * 0.9f);
+
+    // Accumulate emitters: each orbits the center on its own radius/phase.
+    for (int i = 0; i < num_emitters; ++i) {
+        float fi = (float)i;
+        float phase = t * (0.4f + 0.12f * fi) + fi * 2.3998f;  // golden-ish spread
+        float radius = 0.18f + 0.06f * fi / fmaxf(1.0f, (float)(num_emitters - 1));
+        float ex = 0.5f + radius * cosf(phase);
+        float ey = 0.5f + radius * sinf(phase * 1.13f);
+
+        float dx = u - ex;
+        float dy = v - ey;
+        float d2 = dx * dx + dy * dy;
+
+        // Tight bright core (Gaussian) plus a gentle per-emitter pulse so the
+        // HDR peak breathes and the bloom halo visibly swells. 1/sigma^2 sets
+        // the core size; the smaller multiplier here widens the hot spot a bit
+        // so coarse LODs pick up plenty of energy to bleed.
+        float pulse = 0.75f + 0.25f * sinf(t * (1.3f + 0.17f * fi) + fi);
+        float core = expf(-d2 * 3200.0f);
+        float hot = 3.0f * pulse * core;  // peak well above 1.0 -> blooms strongly
+
+        // Per-emitter hue cycling through R/G/B-ish triplets.
+        float hue = fi * 1.0471975f + t * 0.2f;  // 60 deg steps + slow drift
+        float3 tint = make_float3(
+            0.5f + 0.5f * sinf(hue),
+            0.5f + 0.5f * sinf(hue + 2.0943951f),
+            0.5f + 0.5f * sinf(hue + 4.1887902f));
+
+        color.x += hot * tint.x;
+        color.y += hot * tint.y;
+        color.z += hot * tint.z;
+    }
+
+    float4 px = make_float4(color.x, color.y, color.z, 1.0f);
+
+    // surf2Dwrite indexes x in BYTES: float4 is 16 bytes.
+    surf2Dwrite<float4>(px, surf, x * (int)sizeof(float4), y);
+}
+
+// --------------------------------------------------------------------------
+// downsample: halve the parent level into this level via a single LINEAR tap.
+//
+// `src` is a LINEAR-filtered TextureObject bound to the parent level (L-1).
+// `dst` is a SurfaceObject bound to this level (L). dst_size is L's edge.
+//
+// With non-normalized coords, tex2D returns texel (i,j) when sampled at
+// (i+0.5, j+0.5). For output texel (x,y) the parent 2x2 footprint covers
+// parent texels (2x,2y), (2x+1,2y), (2x,2y+1), (2x+1,2y+1). The midpoint of
+// those four centers is (2x+1.0, 2y+1.0); LINEAR filtering there blends all
+// four at weight 0.25 each -- exactly the box average. (NOT +0.5, which would
+// land on one texel center and return a single texel.)
+// --------------------------------------------------------------------------
+extern "C" __global__
+void downsample(cudaTextureObject_t src,
+                cudaSurfaceObject_t dst,
+                int dst_size) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= dst_size || y >= dst_size) return;
+
+    float fx = 2.0f * (float)x + 1.0f;
+    float fy = 2.0f * (float)y + 1.0f;
+
+    float4 px = tex2D<float4>(src, fx, fy);
+
+    surf2Dwrite<float4>(px, dst, x * (int)sizeof(float4), y);
+}
+
+// --------------------------------------------------------------------------
+// composite: sharp scene + multi-LOD bloom, tonemapped, into the PBO.
+//
+// `mip_tex` is ONE mipmapped TextureObject over the whole pyramid. tex2DLod at
+// lod 0 is the sharp scene; lods 1..max_lod are progressively blurrier copies
+// that form the glow. We threshold each blurred sample's luminance so only the
+// bright parts bloom, weight coarser (wider) levels a bit less, scale by
+// `strength`, add the sharp scene, and tonemap.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void composite(unsigned char *output,
+               int width,
+               int height,
+               cudaTextureObject_t mip_tex,
+               float strength,
+               float threshold,
+               int num_lods,
+               float bias,
+               float max_lod,
+               int bloom_on) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float u = ((float)x + 0.5f) / (float)width;
+    float v = ((float)y + 0.5f) / (float)height;
+
+    // Sharp scene from the base level. The base sample stays at lod 0 -- bias is
+    // applied only to the bloom taps below, so the scene never blurs.
+    float4 scene = tex2DLod<float4>(mip_tex, u, v, 0.0f);
+    float3 hdr = make_float3(scene.x, scene.y, scene.z);
+
+    if (bloom_on) {
+        // Sum the blurred levels. Each coarser level covers a wider area, so we
+        // taper its weight to keep the glow soft rather than flat.
+        //
+        // This loop is where the live LOD-selection knobs live: `num_lods` is the
+        // max-clamp (how high up the pyramid we read), and `bias` is the
+        // mipmap_level_bias folded into the explicit tex2DLod `lod` argument.
+        // We clamp the effective LOD to [0, max_lod] so a positive bias can never
+        // index past the top of the pyramid.
+        float3 bloom = make_float3(0.0f, 0.0f, 0.0f);
+        float weight_sum = 0.0f;
+        for (int lod = 1; lod <= num_lods; ++lod) {
+            float eff_lod = clampf((float)lod + bias, 0.0f, max_lod);
+            float4 s = tex2DLod<float4>(mip_tex, u, v, eff_lod);
+            // Soft-knee threshold: keep only the energy above `threshold`.
+            float lum = luminance(s);
+            float excess = fmaxf(lum - threshold, 0.0f);
+            float keep = (lum > 1e-4f) ? (excess / lum) : 0.0f;
+
+            float w = 1.0f / (float)lod;  // finer blurred levels weigh more
+            bloom.x += w * keep * s.x;
+            bloom.y += w * keep * s.y;
+            bloom.z += w * keep * s.z;
+            weight_sum += w;
+        }
+        if (weight_sum > 0.0f) {
+            float inv = strength / weight_sum;
+            hdr.x += bloom.x * inv;
+            hdr.y += bloom.y * inv;
+            hdr.z += bloom.z * inv;
+        }
+    }
+
+    // Tonemap HDR -> [0,1] with a simple exposure curve, then to 8-bit.
+    float r = 1.0f - expf(-hdr.x);
+    float g = 1.0f - expf(-hdr.y);
+    float b = 1.0f - expf(-hdr.z);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(clampf(r, 0.0f, 1.0f) * 255.0f);
+    output[idx + 1] = (unsigned char)(clampf(g, 0.0f, 1.0f) * 255.0f);
+    output[idx + 2] = (unsigned char)(clampf(b, 0.0f, 1.0f) * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_caustics.py b/cuda_core/examples/gl_interop_caustics.py
new file mode 100644
index 00000000000..5fe57e256f0
--- /dev/null
+++ b/cuda_core/examples/gl_interop_caustics.py
@@ -0,0 +1,730 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and
+# GraphicsResource for CUDA/OpenGL interop. A tiled pool-floor image is uploaded
+# once into a 2D CUDAArray and bound as a TextureObject sampled with
+# FilterMode.LINEAR + AddressMode.MIRROR + normalized coordinates. Each frame a
+# `render_water` kernel evaluates an animated water surface analytically, refracts
+# the background lookup UVs through it, and overlays a bright caustic network
+# computed from where the refraction focuses, writing RGBA8 straight into an
+# OpenGL PBO. The effect is "looking down at a sunlit pool". Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to upload a host numpy image into a CUDAArray with `CUDAArray.copy_from`
+#   (host layout (H, W, 4) uint8 row-major for an array allocated as
+#   shape=(WIDTH, HEIGHT)) and bind it as a long-lived TextureObject.
+# - Why FilterMode.LINEAR + AddressMode.MIRROR + normalized_coords=True is the
+#   right pairing for a refraction effect: refracted UV lookups routinely fall
+#   slightly outside [0, 1], and MIRROR returns a sensible mirrored pixel rather
+#   than a clamped smear or a hard edge, while LINEAR keeps the warp smooth.
+# - Why srgb=True is the correct read mode for an 8-bit color image: the texels
+#   are decoded sRGB->linear on read, the kernel does its lighting and tonemap
+#   in linear light, then re-encodes to sRGB on output (the gamma-correct
+#   "sample in linear, tonemap, output" pipeline).
+# - Why max_anisotropy is justified here: refraction samples the texture at
+#   grazing, stretched angles, the case anisotropic filtering exists to clean
+#   up.
+# - That the animated water height field is evaluated ANALYTICALLY in the kernel
+#   (a sum of moving directional sine waves plus a few expanding circular
+#   ripples), so there is no second CUDAArray and no SurfaceObject pass -- the
+#   slope and curvature are finite differences of it, driven by the `t` argument.
+# - How to feed a small fixed ring of interactive click-ripples to the kernel
+#   purely as scalar launch arguments (the demonstrated launch convention),
+#   avoiding any custom device-buffer machinery.
+#
+# How it works
+# ============
+#   Startup (once):
+#     +-------------------+   copy_from   +-----------+
+#     | host numpy image  | ------------> | CUDAArray |  (UINT8 RGBA, pool tiles)
+#     +-------------------+               +-----+-----+
+#                                               |
+#                                               v
+#                                        +-------------+
+#                                        | TextureObj  |  LINEAR + MIRROR + norm
+#                                        +-------------+
+#
+#   Each frame (render_water kernel, 2D over the screen):
+#     1. Evaluate the water height/normal at this pixel from the analytic wave
+#        sum (directional waves + circular ripples) using the `time` uniform.
+#     2. Refract: offset the background sample UV by `refract` * (the water
+#        surface gradient) -- a cheap 2D approximation of bending the view ray.
+#     3. Sample the background TextureObject at the perturbed UV (LINEAR +
+#        MIRROR keeps it smooth and well-defined outside [0, 1]).
+#     4. Caustics: the refraction map (u,v)->(su,sv) focuses light where its
+#        Jacobian determinant approaches zero. We light a thin band around that
+#        det->0 curve to draw the bright, interconnected caustic network, then
+#        add a depth tint (deeper = bluer) and faint specular glints.
+#     5. Tonemap and write RGBA8 into the OpenGL PBO. No PCIe traffic per frame.
+#
+# Why MIRROR (not WRAP or CLAMP)?
+# -------------------------------
+# WRAP and MIRROR both require normalized coordinates. WRAP tiles the image, so
+# a refraction pushing past the right edge suddenly shows the far-left content
+# (a visible seam). CLAMP smears the edge texel into a streak. MIRROR reflects
+# the image at the boundary, which for a small refraction offset looks like the
+# pool simply continuing -- the most natural choice here.
+#
+# What you should see
+# ===================
+# A tiled aqua pool floor seen through gently moving water, overlaid with a
+# bright, shifting network of caustic light filaments. Press +/- to change the
+# water/refraction strength, click anywhere to spawn an expanding circular
+# ripple at the cursor, and Escape to exit. The title shows FPS and the current
+# strength.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 800  # window, screen texture, and PBO width in pixels
+HEIGHT = 600  # window, screen texture, and PBO height in pixels
+BG_SIZE = 256  # the background CUDAArray is BG_SIZE x BG_SIZE RGBA8
+
+# Interactive click-ripples. We keep a small fixed ring and pass each slot to
+# the kernel as plain float scalars (matching the demonstrated launch
+# convention -- no custom device buffers). A ripple with start time < 0 is
+# inactive.
+MAX_RIPPLES = 3  # ring size; also compiled into the kernel source as a #define
+RIPPLE_LIFETIME = 4.0  # seconds before a click-ripple fully fades out
+
+DEFAULT_STRENGTH = 1.0  # initial water/refraction strength
+STRENGTH_STEP = 0.15  # change applied per +/- key press
+MIN_STRENGTH = 0.0  # lower clamp applied when decreasing the strength
+MAX_STRENGTH = 3.0  # upper clamp applied when increasing the strength
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# CUDAArray/TextureObject, skip ahead to main() -- the interesting part is
+# there. These helpers exist so main() reads like a short story instead of a
+# wall of boilerplate.
+# ============================================================================
+
+
+def make_background_image(size):
+    """Return a (size, size, 4) uint8 RGBA image of a tiled swimming-pool floor.
+
+    The array is indexed [y, x, channel] and is row-major: `size` rows of `size`
+    RGBA texels. That is the host layout copy_from needs for a CUDAArray whose
+    shape is given as (WIDTH, HEIGHT); the image is square, so only the (y, x)
+    index order matters here.
+
+    The pattern is a 6 x 6 grid of low-saturation aqua tiles separated by darker
+    teal grout, with a small per-tile brightness variation. It is deliberately
+    calm so the refraction has something legible to warp while the caustics
+    drawn on top still read as light on water. Alpha is fully opaque.
+    """
+    tiles_per_side = 6.0
+    rows, cols = np.mgrid[0:size, 0:size].astype(np.float32)
+    tile_u = (cols / size) * tiles_per_side  # horizontal position measured in tiles
+    tile_v = (rows / size) * tiles_per_side  # vertical position measured in tiles
+
+    # Normalized distance from the tile CENTER along each axis: 0 at the center,
+    # 1 on the tile boundary, which is where the grout line runs.
+    off_x = np.abs((tile_u % 1.0) - 0.5) * 2.0
+    off_y = np.abs((tile_v % 1.0) - 0.5) * 2.0
+    edge = np.maximum(off_x, off_y)
+    grout = np.clip((edge - 0.82) / 0.18, 0.0, 1.0)  # 0 inside the tile, 1 on the line
+
+    # Cheap sine hash of the tile index -> per-tile shade between 0.92 and 1.0.
+    tile_id = np.floor(tile_u) + np.floor(tile_v) * 31.0
+    shade = 0.92 + 0.08 * ((np.sin(tile_id * 12.9898) * 43758.5453) % 1.0)
+
+    # Blend the aqua tile body toward the darker teal grout, broadcasting RGB.
+    tile_rgb = np.array([0.30, 0.66, 0.74], dtype=np.float32)
+    grout_rgb = np.array([0.12, 0.34, 0.42], dtype=np.float32)
+    body = (tile_rgb * shade[..., None]) * (1.0 - grout)[..., None]
+    rgb = body + grout_rgb * grout[..., None]
+
+    img = np.empty((size, size, 4), dtype=np.uint8)
+    img[..., :3] = (np.clip(rgb, 0.0, 1.0) * 255.0).astype(np.uint8)
+    img[..., 3] = 255
+    return img
+
+
+def setup_cuda():
+    """Select GPU 0, compile `render_water`, and size a launch over the window.
+
+    Returns (device, stream, kernel, launch_config). Exits with status 1 when
+    the GPU's compute capability major version is below 3.
+    """
+    device = Device(0)
+    device.set_current()
+
+    capability = device.compute_capability
+    if capability.major < 3:
+        message = (
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless texture objects. Found sm_{capability.major}{capability.minor}."
+        )
+        print(message, file=sys.stderr)
+        sys.exit(1)
+
+    stream = device.create_stream()
+    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+    module = Program(KERNEL_SOURCE, code_type="c++", options=options).compile(
+        "cubin", name_expressions=("render_water",)
+    )
+    kernel = module.get_kernel("render_water")
+
+    # 16x16 threads per block; round the grid up so every pixel gets a thread.
+    threads = (16, 16, 1)
+    blocks = (-(-WIDTH // threads[0]), -(-HEIGHT // threads[1]), 1)
+    return device, stream, kernel, LaunchConfig(grid=blocks, block=threads)
+
+
+def create_window():
+    """Import pyglet lazily and open the WIDTH x HEIGHT example window.
+
+    Returns (window, gl_module, pyglet_module), where gl_module is
+    `pyglet.gl.gl`. If pyglet cannot be imported, prints an install hint to
+    stderr and exits with status 1.
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as gl_module
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    title = "cuda.core CUDAArray + TextureObject - Water Caustics"
+    # vsync=False: presumably so the FPS readout is not capped by the display.
+    window = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
+    return window, gl_module, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Build the GL objects used to show the CUDA output on screen.
+
+    Creates, in order:
+
+    - a shader program from VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE,
+    - a VAO + VBO holding two triangles that cover clip space [-1, 1]^2, each
+      vertex stored as interleaved float32 (x, y, u, v),
+    - an RGBA8 `width` x `height` 2D texture with LINEAR min/mag filtering and
+      no initial pixel data.
+
+    Parameters:
+        gl: the pyglet GL module returned by create_window().
+        width, height: texture size in pixels.
+
+    Returns (shader_program, vao_id, texture_id). The VBO id is not returned;
+    the texture is left bound to GL_TEXTURE_2D on exit.
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    program = ShaderProgram(
+        Shader(VERTEX_SHADER_SOURCE, "vertex"),
+        Shader(FRAGMENT_SHADER_SOURCE, "fragment"),
+    )
+
+    # One row per vertex: clip-space x, y followed by texture u, v. Texcoord
+    # (0, 0) sits at the bottom-left corner (-1, -1) of the window.
+    corners = np.array(
+        [
+            [-1, -1, 0, 0],
+            [1, -1, 1, 0],
+            [1, 1, 1, 1],
+            [-1, -1, 0, 0],
+            [1, 1, 1, 1],
+            [-1, 1, 0, 1],
+        ],
+        dtype=np.float32,
+    )
+    float_size = corners.itemsize  # 4 bytes per float32 component
+    vertex_stride = corners.shape[1] * float_size  # 16 bytes per vertex
+    uv_offset = 2 * float_size  # (u, v) follows the two position floats
+
+    # The VAO records the attribute bindings made below.
+    vao_handle = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao_handle))
+    gl.glBindVertexArray(vao_handle.value)
+
+    # Upload the quad once with GL_STATIC_DRAW; it is never rewritten here.
+    vbo_handle = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo_handle))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo_handle.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        corners.nbytes,
+        corners.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    # Wire both two-component attributes to the interleaved buffer.
+    for attrib_name, byte_offset in ((b"position", 0), (b"texcoord", uv_offset)):
+        location = gl.glGetAttribLocation(program.id, attrib_name)
+        gl.glEnableVertexAttribArray(location)
+        gl.glVertexAttribPointer(
+            location, 2, gl.GL_FLOAT, gl.GL_FALSE, vertex_stride, ctypes.c_void_p(byte_offset)
+        )
+    gl.glBindVertexArray(0)
+
+    texture_handle = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(texture_handle))
+    target = gl.GL_TEXTURE_2D
+    gl.glBindTexture(target, texture_handle.value)
+    for filter_param in (gl.GL_TEXTURE_MIN_FILTER, gl.GL_TEXTURE_MAG_FILTER):
+        gl.glTexParameteri(target, filter_param, gl.GL_LINEAR)
+    # Allocate storage only (data pointer is None); pixels are uploaded later.
+    gl.glTexImage2D(
+        target, 0, gl.GL_RGBA8, width, height, 0, gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None
+    )
+    return program, vao_handle.value, texture_handle.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate an uninitialized width*height*4-byte GL pixel-unpack buffer; return its id."""
+    target = gl.GL_PIXEL_UNPACK_BUFFER
+    buffer_id = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(buffer_id))
+    gl.glBindBuffer(target, buffer_id.value)
+    gl.glBufferData(target, width * height * 4, None, gl.GL_DYNAMIC_DRAW)  # 4 bytes per RGBA8 texel
+    gl.glBindBuffer(target, 0)  # leave no unpack buffer bound
+    return buffer_id.value
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload the PBO contents into the full extent of texture `tex_id`.
+
+    While a buffer is bound to GL_PIXEL_UNPACK_BUFFER, the final `None` data
+    argument of glTexSubImage2D is interpreted as byte offset 0 into that
+    buffer rather than a host pointer, so the copy is GPU-to-GPU. The unpack
+    binding is cleared again before returning; the texture stays bound.
+    """
+    unpack_target = gl.GL_PIXEL_UNPACK_BUFFER
+    texture_target = gl.GL_TEXTURE_2D
+
+    gl.glBindBuffer(unpack_target, pbo_id)
+    gl.glBindTexture(texture_target, tex_id)
+    # Mip level 0, origin (0, 0), full width x height, RGBA / unsigned byte.
+    gl.glTexSubImage2D(texture_target, 0, 0, 0, width, height, gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None)
+    gl.glBindBuffer(unpack_target, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw the VAO's 6 vertices with `shader_prog` while `tex_id` is bound, then unbind VAO and program."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)  # presumably unit 0, which the unset `tex` sampler reads
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+MAX_ANISOTROPY = 8  # kept in lockstep with the API MAP comment + live caption
+
+
+def make_background_texture(arr):
+    """Create the TextureObject through which the kernel samples `arr`.
+
+    `arr` is the CUDAArray holding the background image (allocated in main()
+    as 2D UINT8 with 4 channels).
+
+    API MAP: UINT8 RGBA CUDAArray sampled as TextureObject[LINEAR | MIRROR |
+    NORMALIZED_FLOAT | srgb | max_anisotropy=8].
+
+    Sampler settings and why each one is chosen:
+
+    - AddressMode.MIRROR: refracted UVs regularly leave [0, 1]; mirroring at the
+      border avoids both a clamped streak and a wrap seam. MIRROR (like WRAP)
+      requires normalized coordinates, hence normalized_coords=True.
+    - FilterMode.LINEAR: keeps the warped image smooth between texels.
+    - ReadMode.NORMALIZED_FLOAT: with a UINT8 source, tex2D<float4> returns
+      each channel as a float in [0, 1].
+    - srgb=True: the 8-bit image is authored in perceptual space, so texels are
+      decoded sRGB -> linear on read. The kernel lights and tonemaps in linear
+      light and re-encodes with pow(c, 1/2.2) just before writing its output.
+    - max_anisotropy=MAX_ANISOTROPY: requested because refraction samples the
+      texture at grazing, stretched angles.
+
+    Returns the new TextureObject; main() closes it when the window closes.
+    """
+    sampler_settings = {
+        "address_mode": AddressMode.MIRROR,
+        "filter_mode": FilterMode.LINEAR,
+        "read_mode": ReadMode.NORMALIZED_FLOAT,
+        "normalized_coords": True,
+        "srgb": True,
+        "max_anisotropy": MAX_ANISOTROPY,
+    }
+    return TextureObject.from_descriptor(
+        resource=ResourceDescriptor.from_array(arr),
+        texture_descriptor=TextureDescriptor(**sampler_settings),
+    )
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Run the interactive water-caustics demo until the window is closed."""
+    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
+    dev, stream, kernel, config = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources (shader, fullscreen quad, screen tex) ---
+    shader_prog, quad_vao, screen_tex = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the PBO that CUDA will write into ---
+    pbo_id = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 5: Allocate the background CUDAArray and upload the image once ---
+    bg_arr = CUDAArray.from_descriptor(
+        shape=(BG_SIZE, BG_SIZE),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+    )
+    host_image = make_background_image(BG_SIZE)
+    bg_arr.copy_from(np.ascontiguousarray(host_image), stream=stream)
+    stream.sync()
+
+    # --- Step 6: Bind the CUDAArray as a long-lived TextureObject ---
+    #     Created once and kept alive: `launch` is async, so a per-frame texture
+    #     inside a closing `with` would destroy the handle before the kernel ran.
+    bg_tex = make_background_texture(bg_arr)
+
+    # Interactive state. Each ripple slot is (origin_x, origin_y, start_time) in
+    # normalized screen coords / seconds; start_time < 0 means inactive.
+    state = {
+        "strength": DEFAULT_STRENGTH,
+        "ripples": [[0.0, 0.0, -1.0] for _ in range(MAX_RIPPLES)],
+        "next_slot": 0,
+    }
+    start_time = time.monotonic()
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        # Escape is left to pyglet's default handler, which dispatches on_close: that
+        # runs the cleanup in on_close() while the window's GL context is still alive.
+        if symbol in (key.PLUS, key.EQUAL, key.NUM_ADD):
+            state["strength"] = min(MAX_STRENGTH, state["strength"] + STRENGTH_STEP)
+        elif symbol in (key.MINUS, key.UNDERSCORE, key.NUM_SUBTRACT):
+            state["strength"] = max(MIN_STRENGTH, state["strength"] - STRENGTH_STEP)
+
+    @window.event
+    def on_mouse_press(x, y, _button, _modifiers):
+        # pyglet reports the click in pixels with a bottom-left origin; normalize
+        # to [0, 1] window coordinates and record it in the next ring slot.
+        now = time.monotonic() - start_time
+        slot = state["next_slot"]
+        state["ripples"][slot] = [x / WIDTH, y / HEIGHT, now]
+        state["next_slot"] = (slot + 1) % MAX_RIPPLES
+
+    # --- Step 7: Render loop ---
+    frame_count = 0
+    fps_time = start_time
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        now = time.monotonic()
+        t = now - start_time
+
+        window.clear()
+
+        # Flatten the ring into per-slot (origin_x, origin_y, age) scalars; age < 0 == inactive.
+        ripple_args = []
+        for ox, oy, st in state["ripples"]:
+            age = (t - st) if st >= 0.0 else -1.0
+            if age >= RIPPLE_LIFETIME:
+                age = -1.0
+            ripple_args.extend((np.float32(ox), np.float32(oy), np.float32(age)))
+
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                config,
+                kernel,
+                np.uint64(bg_tex.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.float32(t),
+                np.float32(state["strength"]),
+                np.float32(RIPPLE_LIFETIME),
+                *ripple_args,
+            )
+        copy_pbo_to_texture(gl, pbo_id, screen_tex, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, screen_tex)
+
+        frame_count += 1
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            window.set_caption(
+                "cuda.core CUDAArray + TextureObject - Water Caustics "
+                f"(strength={state['strength']:.2f}, {fps:.0f} FPS) "
+                f"| TextureObject[LINEAR|MIRROR|sRGB|aniso={MAX_ANISOTROPY}] UINT8 "
+                "[+/- strength, click = ripple, Esc = quit]"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        bg_tex.close()
+        bg_arr.close()
+        resource.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ============================== GPU code (kernel) ============================
+#
+# render_water samples a static background TextureObject (LINEAR + MIRROR +
+# normalized coords) at refraction-perturbed UVs. The water height is analytic in
+# the time argument `t`; slope/curvature are finite differences of it -- there is
+# no second array and no SurfaceObject. MAX_RIPPLES click-ripples arrive as
+# (origin_x, origin_y, age) float triples; age < 0 marks an empty slot.
+#
+# The ripple count is compiled in via the MAX_RIPPLES define so the kernel's
+# parameter list (host side) and the loop bound (device side) stay in lockstep.
+# ============================================================================
+
+KERNEL_SOURCE = (
+    "#define MAX_RIPPLES "
+    + str(MAX_RIPPLES)
+    + "\n"
+    + r"""
+// Analytic water height field at normalized position p and time t. A sum of a
+// few moving directional waves gives the base chop; the expanding circular
+// ripples from clicks ride on top. Returns height; gradient/curvature are taken
+// numerically by sampling this a few times (cheap and robust).
+__device__ __forceinline__
+float water_height(float px, float py, float t,
+                   const float* rip_x, const float* rip_y,
+                   const float* rip_age, float ripple_lifetime) {
+    float h = 0.0f;
+
+    // Directional waves: (dir_x, dir_y, freq, speed, amp).
+    // Hand-picked so they never perfectly align (avoids an obvious repeat).
+    const float waves[5][5] = {
+        { 1.00f,  0.00f,  9.0f,  1.3f, 0.45f},
+        { 0.20f,  0.98f, 12.0f,  1.0f, 0.35f},
+        {-0.70f,  0.71f, 16.0f,  1.7f, 0.25f},
+        { 0.80f, -0.60f, 22.0f,  2.1f, 0.18f},
+        {-0.30f, -0.95f, 31.0f,  2.6f, 0.12f},
+    };
+    #pragma unroll
+    for (int i = 0; i < 5; ++i) {
+        float phase = (waves[i][0] * px + waves[i][1] * py) * waves[i][2]
+                      + t * waves[i][3];
+        h += waves[i][4] * sinf(phase);
+    }
+
+    // Expanding circular ripples from mouse clicks. Each is a decaying radial
+    // wave packet whose ring radius grows with age.
+    for (int r = 0; r < MAX_RIPPLES; ++r) {
+        float age = rip_age[r];
+        if (age < 0.0f) continue;
+        float dx = px - rip_x[r];
+        float dy = py - rip_y[r];
+        float dist = sqrtf(dx * dx + dy * dy);
+        float ring = dist * 40.0f - age * 8.0f;       // outward-moving ring
+        float envelope = expf(-dist * 6.0f);           // localized in space
+        float fade = 1.0f - (age / ripple_lifetime);   // fade over lifetime
+        if (fade < 0.0f) fade = 0.0f;
+        h += 0.9f * fade * envelope * sinf(ring);
+    }
+    return h;
+}
+
+extern "C"
+__global__
+void render_water(cudaTextureObject_t bg,
+                  unsigned char* output,
+                  int width, int height,
+                  float t,
+                  float strength,
+                  float ripple_lifetime,
+"""
+    + "".join(
+        f"                  float rip_x{i}, float rip_y{i}, float rip_age{i}"
+        + (",\n" if i < MAX_RIPPLES - 1 else ") {\n")
+        for i in range(MAX_RIPPLES)
+    )
+    + r"""
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Pack the per-ripple scalars back into arrays so the helper can loop.
+    float rip_x[MAX_RIPPLES];
+    float rip_y[MAX_RIPPLES];
+    float rip_age[MAX_RIPPLES];
+"""
+    + "".join(
+        f"    rip_x[{i}] = rip_x{i}; rip_y[{i}] = rip_y{i}; rip_age[{i}] = rip_age{i};\n" for i in range(MAX_RIPPLES)
+    )
+    + r"""
+    // Normalized screen position. Output row 0 is drawn at the BOTTOM of the
+    // window (GL texture origin), so v already grows upward like pyglet's mouse y.
+    float u = (x + 0.5f) / (float)width;
+    float v = (y + 0.5f) / (float)height;
+
+    // Sample the water height field on a 3x3 stencil to get the surface
+    // gradient (slope -> refraction) and the full Hessian (the second
+    // derivatives that drive the caustic network).
+    const float eps = 1.5f / (float)width;
+    float hc = water_height(u, v, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hl = water_height(u - eps, v, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hr = water_height(u + eps, v, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hd = water_height(u, v - eps, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hu = water_height(u, v + eps, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hlu = water_height(u - eps, v + eps, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hru = water_height(u + eps, v + eps, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hld = water_height(u - eps, v - eps, t, rip_x, rip_y, rip_age, ripple_lifetime);
+    float hrd = water_height(u + eps, v - eps, t, rip_x, rip_y, rip_age, ripple_lifetime);
+
+    float inv2e = 1.0f / (2.0f * eps);
+    float inve2 = 1.0f / (eps * eps);
+    float gx = (hr - hl) * inv2e;            // d(height)/du
+    float gy = (hu - hd) * inv2e;            // d(height)/dv
+    float hxx = (hr - 2.0f * hc + hl) * inve2;
+    float hyy = (hu - 2.0f * hc + hd) * inve2;
+    float hxy = (hru - hrd - hlu + hld) * (0.25f * inve2);
+
+    // 2D refraction: bend the background lookup by the surface slope, kept
+    // small so the pool floor warps gently instead of tearing apart. Because
+    // the texture was bound with srgb=True the sample is already in LINEAR
+    // light, so the lighting/tonemap below is physically sensible and we only
+    // re-encode to sRGB at the very end. MIRROR keeps (su, sv) outside [0,1]
+    // smooth instead of a clamped streak or a wrap seam.
+    float refract = 0.010f * strength;
+    float su = u - refract * gx;
+    float sv = v - refract * gy;
+    float4 base = tex2D<float4>(bg, su, sv);
+
+    // Caustics from the refraction map's area compression. The displacement
+    // (u,v) -> (su,sv) has Jacobian J = [[1 - r*hxx, -r*hxy], [-r*hxy,
+    // 1 - r*hyy]]. Where det(J) -> 0 neighbouring rays converge onto the same
+    // spot and light piles up; 1/|det| is the brightness of that focus. This
+    // is what produces the real, interconnected, animated caustic web -- not a
+    // generic glow. `rs` is a small lens strength tuned to the wave curvature.
+    float rs = 0.012f * (0.5f + 0.5f * strength);
+    float a = 1.0f - rs * hxx;
+    float dd = 1.0f - rs * hyy;
+    float bxy = rs * hxy;
+    float det = a * dd - bxy * bxy;
+    // The caustic is the thin CURVE where det -> 0 (rays focus to a line). We
+    // light up only a narrow band around it and square the ramp so the result
+    // is crisp bright filaments over the visible tiles, not broad foggy blobs.
+    // Two bands -- a tight bright core plus a fainter halo -- give the lines a
+    // little glow without fattening them.
+    float ad = fabsf(det);
+    float core = 1.0f - fminf(ad / 0.06f, 1.0f);
+    float halo = 1.0f - fminf(ad / 0.30f, 1.0f);
+    float caustic = core * core * 1.7f + halo * halo * 0.25f;
+    if (caustic > 2.0f) caustic = 2.0f;
+
+    // Surface normal from the gradient (z points out of the water).
+    float nx = -gx, ny = -gy, nz = 1.0f;
+    float ninv = rsqrtf(nx * nx + ny * ny + nz * nz);
+    nx *= ninv; ny *= ninv; nz *= ninv;
+
+    // Faint specular glints off the wavelets.
+    float lx = 0.3f, ly = 0.4f, lz = 0.866f;
+    float spec = nx * lx + ny * ly + nz * lz;
+    if (spec < 0.0f) spec = 0.0f;
+    spec = powf(spec, 60.0f) * 0.5f;
+
+    // Water tint: a gentle blue-green cast, slightly deeper in the troughs.
+    float depth = 0.5f + 0.5f * hc;
+    float tint_r = 0.80f + 0.08f * depth;
+    float tint_g = 0.98f + 0.04f * depth;
+    float tint_b = 1.10f - 0.06f * depth;
+
+    // Composite in LINEAR light: tinted pool floor + the white caustic web
+    // (a touch cooler in blue so it reads as sunlight through water) + glints.
+    float cr = base.x * tint_r + caustic * 0.90f + spec;
+    float cg = base.y * tint_g + caustic * 0.97f + spec;
+    float cb = base.z * tint_b + caustic * 1.00f + spec;
+
+    // Simple Reinhard tonemap so highlights roll off instead of clipping hard.
+    cr = cr / (1.0f + cr);
+    cg = cg / (1.0f + cg);
+    cb = cb / (1.0f + cb);
+
+    // Encode LINEAR -> sRGB on output. This is the matching half of the
+    // srgb=True decode on the texture read: we sampled and lit in linear, and
+    // now re-encode for the 8-bit RGBA8 PBO. The ~1/2.2 exponent is the
+    // gamma-correct encode (and also lifts the midtones the linear decode
+    // darkened, so the pool reads luminous rather than murky).
+    cr = powf(cr, 1.0f / 2.2f);
+    cg = powf(cg, 1.0f / 2.2f);
+    cb = powf(cb, 1.0f / 2.2f);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(fminf(cr, 1.0f) * 255.0f);
+    output[idx + 1] = (unsigned char)(fminf(cg, 1.0f) * 255.0f);
+    output[idx + 2] = (unsigned char)(fminf(cb, 1.0f) * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+)
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""  # pass-through vertex stage: forwards `position` and `texcoord` unchanged
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""  # fragment stage: a single lookup into `tex` at the interpolated texcoord
+
+
+if __name__ == "__main__":
+    main()  # run the example only when this file is executed as a script
diff --git a/cuda_core/examples/gl_interop_clouds.py b/cuda_core/examples/gl_interop_clouds.py
new file mode 100644
index 00000000000..bc8829674ef
--- /dev/null
+++ b/cuda_core/examples/gl_interop_clouds.py
@@ -0,0 +1,991 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core's 3D CUDAArray + trilinear TextureObject by
+# baking a procedural fractal-noise density volume once at startup and then
+# ray-marching it every frame as participating media to render fluffy, sunlit,
+# semi-transparent clouds. The SurfaceObject is used during the one-shot bake;
+# the TextureObject (with LINEAR + WRAP + normalized coords) drives the per-frame
+# volumetric ray march with Beer-Lambert absorption and self-shadowing. The
+# whole pipeline stays on the GPU through GraphicsResource. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to allocate a 3D cuda.core.CUDAArray (cuArray3DCreate under the hood) and
+#   bind it as both a SurfaceObject (for one-shot kernel writes via surf3Dwrite)
+#   and a TextureObject (for hardware-accelerated trilinear tex3D sampling).
+# - How to ray-march a baked scalar density volume as PARTICIPATING MEDIA: this
+#   goes beyond gl_interop_sdf_volume.py (which renders a hard SDF surface). Here
+#   the volume is fog: we accumulate color and transmittance front-to-back and
+#   apply Beer-Lambert absorption, with a short secondary march toward the sun
+#   for self-shadowing.
+# - How to wire mouse + keyboard input into a pyglet/cuda.core interop loop.
+#
+# How it works
+# ============
+# A single-channel float (FLOAT32) 3D volume (96^3) is filled once at
+# startup with fractal Brownian motion (fbm) built from a cheap integer-hash
+# value noise:
+#
+#     fbm(p) = sum over octaves of amplitude * value_noise(p * frequency)
+#     density = remap(fbm) with a coverage threshold
+#
+# The volume stores only the raw noise; the cloud SHAPING (coverage threshold +
+# a vertical height falloff that fades density near the top and bottom of the
+# box) is applied in the RENDER kernel, not baked. That lets us ANIMATE the
+# clouds for free by scrolling the sample coordinate with a `time` uniform
+# (cheaper than re-baking 96^3 every frame, which would stack a second 3D launch
+# on top of the already heavy raymarch). WRAP addressing avoids clamping the
+# scrolled coordinate at the box edge (the baked field is not perfectly
+# tileable, so a faint density seam sweeps through slowly); the ray-vs-box bail
+# is what keeps density zero outside the volume, so WRAP is safe here.
+#
+#   STARTUP (one-shot bake)
+#   ~~~~~~~~~~~~~~~~~~~~~~~
+#   1. Allocate 3D CUDAArray (96^3, FLOAT32 x1, is_surface_load_store=True).
+#   2. Bind it as a SurfaceObject.
+#   3. Launch `bake_density`: one thread per voxel writes fbm via surf3Dwrite.
+#   4. Close the SurfaceObject; the CUDAArray stays alive.
+#
+#   EACH FRAME
+#   ~~~~~~~~~~
+#   1. resource.map() -> CUDA device pointer into the OpenGL PBO.
+#   2. Launch `render_clouds` (one thread per pixel). It builds an orbit-camera
+#      ray, intersects the [-1,1]^3 box, marches front-to-back sampling density
+#      via tex3D<float> (LINEAR + WRAP + normalized coords), shades each sample
+#      with a short sun-ward shadow march (Beer-Lambert), accumulates over an
+#      analytic sky, and writes RGBA8 straight into the PBO.
+#   3. Unmap, GPU-side copy PBO -> texture, draw fullscreen quad.
+#
+# Performance note
+# ================
+# This is the most compute-heavy example here: a primary march (up to ~96 steps)
+# with a nested shadow march (~6 steps) per sample costs O(primary * shadow)
+# texture fetches per pixel. To keep it interactive we use a modest 96^3 volume,
+# cap the step counts, and EARLY-OUT once transmittance drops below ~0.01. Lower
+# PRIMARY_STEPS / VOLUME_SIZE if your GPU struggles.
+#
+# Controls
+# ========
+#   Left mouse drag    orbit camera (dx -> yaw, dy -> pitch)
+#   Arrow keys         orbit camera (keyboard alternative)
+#   Mouse wheel        zoom (camera distance)
+#   + / -              raise / lower the sun (changes light angle + sky glow)
+#   [ / ]              decrease / increase cloud coverage (less / more cloud)
+#   R                  reset camera + sun + coverage
+#   Escape / close     quit
+#
+# The window title shows yaw, pitch, distance, sun height, coverage, and FPS.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Configuration (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 800  # window, display texture and PBO width in pixels
+HEIGHT = 600  # window, display texture and PBO height in pixels
+VOLUME_SIZE = 96  # 96^3 voxels; bake cost is one-shot. Lower if memory is tight.
+
+# Camera defaults / clamps. Angles are in radians (fed to cosf/sinf in the kernel).
+RESET_YAW = 0.6
+RESET_PITCH = 0.25
+RESET_DIST = 3.2  # orbit radius; the rendered volume is the [-1, 1]^3 box.
+PITCH_MIN = -1.45  # stay inside (-pi/2, pi/2) so the up-vector stays sane.
+PITCH_MAX = 1.45
+DIST_MIN = 1.5
+DIST_MAX = 9.0
+
+# Lighting / shaping defaults and clamps.
+RESET_SUN_HEIGHT = 0.55  # 0 = sun at horizon, 1 = sun overhead.
+SUN_HEIGHT_MIN = 0.05
+SUN_HEIGHT_MAX = 0.98
+RESET_COVERAGE = 0.50  # higher = more cloud (lower density threshold).
+COVERAGE_MIN = 0.20
+COVERAGE_MAX = 0.85
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# 3D CUDAArray / TextureObject / SurfaceObject, skip ahead to main() -- the
+# interesting part is there. These helpers exist so that main() reads like a
+# short story instead of a wall of boilerplate.
+# ============================================================================
+
+
+def _check_compute_capability(dev):
+    """Exit with status 1 (message on stderr) unless `dev` is sm_30 or newer."""
+    cap = dev.compute_capability
+    if cap.major >= 3:
+        return
+    # 3D arrays and bindless surface/texture objects need sm_30+ hardware.
+    reason = f"This example requires compute capability >= 3.0, got sm_{cap.major}{cap.minor}."
+    print(reason, file=sys.stderr)
+    sys.exit(1)
+
+
+def setup_cuda():
+    """Select GPU 0, build both kernels, and hand back (device, stream, kernels).
+
+    `kernels` maps "bake" -> bake_density and "render" -> render_clouds.
+    """
+    device = Device(0)
+    device.set_current()
+    _check_compute_capability(device)
+    work_stream = device.create_stream()
+
+    # Compile as C++ so the templated tex3D<float> / surf3Dwrite<float>
+    # overloads resolve. The kernels are declared extern "C", which keeps
+    # their symbol names unmangled for the lookups below.
+    entry_points = {"bake": "bake_density", "render": "render_clouds"}
+    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+    program = Program(KERNEL_SOURCE, code_type="c++", options=options)
+    module = program.compile("cubin", name_expressions=tuple(entry_points.values()))
+    kernels = {}
+    for label, symbol in entry_points.items():
+        kernels[label] = module.get_kernel(symbol)
+    return device, work_stream, kernels
+
+
+def make_volume_array():
+    """Create the density volume: a cubic, one-channel FLOAT32 3D CUDAArray.
+
+    API MAP
+    =======
+    - CUDAArray.from_descriptor with a 3-element shape allocates the
+      VOLUME_SIZE^3 array (cuArray3DCreate under the hood). This is the
+      headline of the example: a true 3D, hardware-laid-out array sampled
+      trilinearly from a kernel.
+    - Trilinear tex3D sampling (FilterMode.LINEAR, normalized coords) is set
+      up separately by make_volume_texture; hardware filtering is what keeps
+      the smooth volumetric raymarch cheap.
+    - is_surface_load_store=True lets bake_volume bind this same array as a
+      SurfaceObject and store one density per voxel with surf3Dwrite, whose
+      x coordinate is in BYTES (x * sizeof(float)); y and z are in elements.
+    """
+    edge = VOLUME_SIZE
+    return CUDAArray.from_descriptor(
+        shape=(edge, edge, edge),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        is_surface_load_store=True,
+    )
+
+
+def make_volume_texture(arr):
+    """Wrap `arr` in a TextureObject: LINEAR filter, WRAP addressing, normalized.
+
+    Why WRAP rather than CLAMP: the render kernel animates the clouds by adding
+    a time-based offset to the sample coordinate, so coordinates drift outside
+    [0, 1]. CLAMP would smear the edge texels; WRAP repeats the field instead.
+    The baked noise is not perfectly tileable, so a faint density seam drifts
+    through as the scroll wraps -- a demo-grade artifact, not a failure.
+    WRAP/MIRROR addressing requires normalized coordinates. The ray-vs-box bail
+    in the raymarch keeps density zero outside the [-1, 1]^3 volume.
+    """
+    sampling = TextureDescriptor(
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=True,
+    )
+    # Describe the CUDAArray as the texture's backing resource.
+    backing = ResourceDescriptor.from_array(arr)
+    return TextureObject.from_descriptor(resource=backing, texture_descriptor=sampling)
+
+
+def bake_volume(stream, kernels, arr):
+    """Fill `arr` with fractal noise by running the bake kernel exactly once.
+
+    A SurfaceObject is bound to `arr` only for the duration of this call. Its
+    bindless handle is passed to the kernel as a 64-bit integer argument, and
+    the stream is synchronized before the SurfaceObject is closed. The
+    CUDAArray itself is left open: it is the long-lived backing store that
+    the render-loop TextureObject samples.
+    """
+    threads = (8, 8, 8)
+    # Ceil-divide per axis so the grid covers every voxel even when
+    # VOLUME_SIZE is not a multiple of the block edge.
+    blocks = tuple((VOLUME_SIZE + per_axis - 1) // per_axis for per_axis in threads)
+    bake_config = LaunchConfig(grid=blocks, block=threads)
+
+    with SurfaceObject.from_array(arr) as bake_surf:
+        launch(
+            stream,
+            bake_config,
+            kernels["bake"],
+            np.uint64(bake_surf.handle),
+            np.int32(VOLUME_SIZE),
+        )
+        # Block until the kernel finishes while still inside the `with`, so
+        # the surface handle stays valid for the kernel's whole run.
+        stream.sync()
+
+
+def create_window():
+    """Open the pyglet window and return (window, gl_module, pyglet).
+
+    pyglet is imported lazily so a missing install exits with a clear message.
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as gl_module
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    window_options = {
+        "caption": "cuda.core 3D CUDAArray - Volumetric Cloud Ray-Marcher",
+        "vsync": False,
+    }
+    window = pyglet.window.Window(WIDTH, HEIGHT, **window_options)
+    return window, gl_module, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Standard GL boilerplate (not CUDA-specific): shader, quad VAO, empty texture.
+
+    Compiles the display shader, uploads a 6-vertex fullscreen quad, and allocates
+    a width x height RGBA8 texture. Returns (shader_program, vao_id, texture_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    quad_verts = np.array(
+        [
+            # Interleaved x, y, s, t (position + texture coordinate), one value per line.
+            -1,  # vertex 0: bottom-left (first triangle)
+            -1,
+            0,
+            0,
+            1,  # vertex 1: bottom-right
+            -1,
+            1,
+            0,
+            1,  # vertex 2: top-right
+            1,
+            1,
+            1,
+            -1,  # vertex 3: bottom-left (second triangle)
+            -1,
+            0,
+            0,
+            1,  # vertex 4: top-right
+            1,
+            1,
+            1,
+            -1,  # vertex 5: top-left
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)  # position (x, y) sits at byte offset 0
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)  # texcoord (s, t) sits at byte offset 8, after x, y
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no initial pixels; copy_pbo_to_texture fills the texture later
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create the PBO that bridges CUDA and GL; return (pbo_gl_name, size_in_bytes)."""
+    target = gl.GL_PIXEL_UNPACK_BUFFER
+    size_in_bytes = width * height * 4  # 4 bytes per RGBA8 pixel
+    name = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(name))
+    gl.glBindBuffer(target, name.value)
+    # Reserve storage only: the data pointer is None, so nothing is uploaded.
+    gl.glBufferData(target, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
+    # Unbind the PBO again before returning.
+    gl.glBindBuffer(target, 0)
+    return name.value, size_in_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload the PBO contents into the GL texture; the copy stays on the GPU.
+
+    While a buffer is bound to GL_PIXEL_UNPACK_BUFFER, the final (pixels)
+    argument of glTexSubImage2D is a byte offset into that buffer, so None
+    means "start of the PBO" rather than "no data".
+    """
+    unpack = gl.GL_PIXEL_UNPACK_BUFFER
+    target = gl.GL_TEXTURE_2D
+    mip_level, x_offset, y_offset = 0, 0, 0
+    gl.glBindBuffer(unpack, pbo_id)
+    gl.glBindTexture(target, tex_id)
+    gl.glTexSubImage2D(
+        target, mip_level, x_offset, y_offset, width, height, gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None
+    )
+    gl.glBindBuffer(unpack, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture `tex_id` with the display shader and the fullscreen-quad VAO."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's 2 triangles
+    gl.glBindVertexArray(0)  # unbind the VAO, then the program
+    gl.glUseProgram(0)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Bake the noise volume once, then run the interactive render loop.
+
+    Blocks in pyglet.app.run() until the window closes; both exit paths
+    (close button, Escape) release the CUDA objects through on_close.
+    """
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels = setup_cuda()
+
+    # --- Step 2: Allocate the 3D density volume and bake it once ---
+    #     The CUDAArray is the long-lived backing store; it must outlive the
+    #     render loop. The SurfaceObject is only needed for the one-shot bake
+    #     and is closed before we ever bind a TextureObject to the same CUDAArray.
+    arr = make_volume_array()
+    bake_volume(stream, kernels, arr)
+
+    # --- Step 3: Bind the volume as a trilinear TextureObject ---
+    #     LINEAR + WRAP + normalized_coords gives free hardware trilinear
+    #     filtering, and wraps (rather than clamps) the animated coordinate scroll.
+    volume_tex = make_volume_texture(arr)
+
+    # --- Step 4: Open a window and set up the CUDA/GL bridge ---
+    window, gl, pyglet = create_window()
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 5: Render loop state ---
+    # Camera is orbit-style: yaw and pitch are angles, dist is the orbit
+    # radius. sun_height drives the light direction + sky glow; coverage shapes
+    # how much of the noise field reads as cloud. The render kernel turns these
+    # into rays + shading itself.
+    state = {
+        "yaw": RESET_YAW,
+        "pitch": RESET_PITCH,
+        "dist": RESET_DIST,
+        "sun_height": RESET_SUN_HEIGHT,
+        "coverage": RESET_COVERAGE,
+    }
+    start_time = time.monotonic()
+    frame_count = [0]
+    fps_time = [start_time]
+    last_fps = [0.0]
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+
+    @window.event
+    def on_draw():
+        window.clear()
+        elapsed = time.monotonic() - start_time
+
+        # (a) Map the PBO so CUDA can write into it.
+        with resource.map(stream=stream) as buf:
+            # (b) Launch the volumetric raymarch kernel. Camera + lighting +
+            #     shaping params are passed as scalars; the kernel builds the
+            #     orbit eye, per-pixel ray, and clouds itself. `time` scrolls
+            #     the noise sample coordinate to animate the clouds.
+            launch(
+                stream,
+                config,
+                kernels["render"],
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.uint64(volume_tex.handle),
+                np.float32(state["yaw"]),
+                np.float32(state["pitch"]),
+                np.float32(state["dist"]),
+                np.float32(state["sun_height"]),
+                np.float32(state["coverage"]),
+                np.float32(elapsed),
+            )
+        # (c) Unmap happens automatically; cuGraphicsUnmapResources serializes
+        #     the CUDA work against subsequent OpenGL use.
+
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        frame_count[0] += 1
+        now = time.monotonic()
+        if now - fps_time[0] >= 0.5:
+            last_fps[0] = frame_count[0] / (now - fps_time[0])
+            frame_count[0] = 0
+            fps_time[0] = now
+            window.set_caption(
+                "cuda.core 3D CUDAArray - Volumetric Cloud Ray-Marcher  "
+                f"yaw={state['yaw']:+.2f} pitch={state['pitch']:+.2f} "
+                f"dist={state['dist']:.2f} sun={state['sun_height']:.2f} "
+                f"cov={state['coverage']:.2f}  "
+                f"{last_fps[0]:.0f} FPS  |  "
+                "3D CUDAArray[FLOAT32,1ch] + tex3D[LINEAR|WRAP|norm] + surf3D bake"
+            )
+
+    @window.event
+    def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
+        # Left-click drag orbits the camera. dx -> yaw, dy -> pitch.
+        if not (buttons & pyglet.window.mouse.LEFT):
+            return
+        orbit_scale = 0.005
+        state["yaw"] += dx * orbit_scale
+        state["pitch"] = min(PITCH_MAX, max(PITCH_MIN, state["pitch"] + dy * orbit_scale))
+
+    @window.event
+    def on_mouse_scroll(_x, _y, _scroll_x, scroll_y):
+        # Scroll wheel zoom: geometric so each tick feels uniform. Positive
+        # scroll_y (wheel up) zooms in.
+        if scroll_y == 0:
+            return
+        state["dist"] = min(DIST_MAX, max(DIST_MIN, state["dist"] * 0.9**scroll_y))
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        keyboard_orbit = 0.08
+        if symbol == key.ESCAPE:
+            # Route through on_close so CUDA teardown precedes GL context destruction;
+            # EVENT_HANDLED stops pyglet's default handler dispatching on_close again.
+            window.dispatch_event("on_close")
+            return pyglet.event.EVENT_HANDLED
+        elif symbol == key.R:
+            state["yaw"] = RESET_YAW
+            state["pitch"] = RESET_PITCH
+            state["dist"] = RESET_DIST
+            state["sun_height"] = RESET_SUN_HEIGHT
+            state["coverage"] = RESET_COVERAGE
+        elif symbol == key.LEFT:
+            state["yaw"] -= keyboard_orbit
+        elif symbol == key.RIGHT:
+            state["yaw"] += keyboard_orbit
+        elif symbol == key.UP:
+            state["pitch"] = min(PITCH_MAX, state["pitch"] + keyboard_orbit)
+        elif symbol == key.DOWN:
+            state["pitch"] = max(PITCH_MIN, state["pitch"] - keyboard_orbit)
+        elif symbol in (key.PLUS, key.EQUAL, key.NUM_ADD):
+            state["sun_height"] = min(SUN_HEIGHT_MAX, state["sun_height"] + 0.05)
+        elif symbol in (key.MINUS, key.UNDERSCORE, key.NUM_SUBTRACT):
+            state["sun_height"] = max(SUN_HEIGHT_MIN, state["sun_height"] - 0.05)
+        elif symbol == key.BRACKETLEFT:
+            state["coverage"] = max(COVERAGE_MIN, state["coverage"] - 0.03)
+        elif symbol == key.BRACKETRIGHT:
+            state["coverage"] = min(COVERAGE_MAX, state["coverage"] + 0.03)
+
+    @window.event
+    def on_close():
+        # Release CUDA resources in reverse construction order. The GL objects
+        # clean up via pyglet on window close.
+        resource.close()
+        volume_tex.close()
+        arr.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# Two CUDA C++ kernels are concatenated into one program string so they share
+# a single NVRTC compile. NOTE: with no GPU available at authoring time, the
+# noise/raymarch math below is unverified at runtime -- it is kept deliberately
+# conservative (integer-hash value noise, plain fbm, no STL / host-only calls)
+# so it compiles cleanly under NVRTC c++17.
+#
+#   bake_density   -- one thread per voxel. Evaluates fractal Brownian motion
+#                     (fbm) of a cheap integer-hash value noise and writes the
+#                     raw scalar via surf3Dwrite. NOTE: surf3Dwrite's
+#                     x coordinate is in BYTES; a FLOAT32 element is 4 bytes, so
+#                     multiply by sizeof(float). y and z are in elements
+#                     -- a classic CUDA gotcha.
+#
+#   render_clouds  -- one thread per screen pixel. Builds the orbit-camera ray,
+#                     intersects the [-1, 1]^3 box, marches front-to-back
+#                     sampling density via tex3D<float> (LINEAR + WRAP +
+#                     normalized coords, coordinate scrolled by `time`), applies
+#                     a coverage threshold + vertical height falloff, does a
+#                     short sun-ward shadow march per sample (Beer-Lambert),
+#                     accumulates color + transmittance, composites over an
+#                     analytic sky, and writes RGBA8 into the PBO.
+#
+# GLSL shaders at the very bottom just draw a textured quad. Nothing CUDA-
+# specific there.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// --------------------------------------------------------------------------
+// Small inline helpers.
+// --------------------------------------------------------------------------
+__device__ __forceinline__ float clampf(float v, float a, float b) {
+    return fminf(fmaxf(v, a), b);
+}
+
+__device__ __forceinline__ float dot3(float ax, float ay, float az,
+                                      float bx, float by, float bz) {
+    return ax * bx + ay * by + az * bz;
+}
+
+__device__ __forceinline__ float length3(float x, float y, float z) {
+    return sqrtf(x * x + y * y + z * z);
+}
+
+__device__ __forceinline__ float lerpf(float a, float b, float t) {
+    return a + (b - a) * t;
+}
+
+__device__ __forceinline__ float smoothstepf(float t) {
+    // Hermite fade curve used both for noise interpolation and shaping.
+    return t * t * (3.0f - 2.0f * t);
+}
+
+// --------------------------------------------------------------------------
+// Cheap integer-hash value noise + fractal Brownian motion (fbm).
+//
+// hash3() turns an integer lattice point into a pseudo-random float in [0,1].
+// value_noise() trilinearly interpolates the 8 lattice corners around a
+// floating-point position with a smoothstep fade. fbm() sums several octaves
+// of value_noise at doubling frequency / halving amplitude. All integer math,
+// no tables, no host-only calls -- NVRTC-friendly.
+// --------------------------------------------------------------------------
+__device__ __forceinline__ float hash3(int ix, int iy, int iz) {
+    unsigned int h = (unsigned int)ix * 374761393u +
+                     (unsigned int)iy * 668265263u +
+                     (unsigned int)iz * 2147483647u;
+    h = (h ^ (h >> 13)) * 1274126177u;
+    h = h ^ (h >> 16);
+    return (float)(h & 0x00ffffffu) / (float)0x01000000u;  // [0, 1)
+}
+
+__device__ __forceinline__ float value_noise(float x, float y, float z) {
+    float fx = floorf(x), fy = floorf(y), fz = floorf(z);
+    int ix = (int)fx, iy = (int)fy, iz = (int)fz;
+    float tx = smoothstepf(x - fx);
+    float ty = smoothstepf(y - fy);
+    float tz = smoothstepf(z - fz);
+
+    float c000 = hash3(ix,     iy,     iz);
+    float c100 = hash3(ix + 1, iy,     iz);
+    float c010 = hash3(ix,     iy + 1, iz);
+    float c110 = hash3(ix + 1, iy + 1, iz);
+    float c001 = hash3(ix,     iy,     iz + 1);
+    float c101 = hash3(ix + 1, iy,     iz + 1);
+    float c011 = hash3(ix,     iy + 1, iz + 1);
+    float c111 = hash3(ix + 1, iy + 1, iz + 1);
+
+    float x00 = lerpf(c000, c100, tx);
+    float x10 = lerpf(c010, c110, tx);
+    float x01 = lerpf(c001, c101, tx);
+    float x11 = lerpf(c011, c111, tx);
+    float y0  = lerpf(x00, x10, ty);
+    float y1  = lerpf(x01, x11, ty);
+    return lerpf(y0, y1, tz);
+}
+
+__device__ __forceinline__ float fbm(float x, float y, float z) {
+    float sum = 0.0f;
+    float amp = 0.5f;
+    float freq = 1.0f;
+    #pragma unroll
+    for (int o = 0; o < 5; ++o) {
+        sum += amp * value_noise(x * freq, y * freq, z * freq);
+        freq *= 2.0f;
+        amp  *= 0.5f;
+    }
+    return sum;  // roughly in [0, 1)
+}
+
+// --------------------------------------------------------------------------
+// bake_density: one thread per voxel writes raw fbm into the volume via a
+//               SurfaceObject. The cloud SHAPING (coverage threshold + height
+//               falloff) is applied later in render_clouds so the threshold and
+//               fade stay fixed while the render kernel scrolls the coordinate
+//               for animation.
+//
+//   surf is bound to a (size^3, FLOAT32 x 1) CUDAArray allocated with
+//   is_surface_load_store=True.
+//   surf3Dwrite's x coordinate is in BYTES; a FLOAT32 element is 4 bytes, so
+//   multiply x by sizeof(float). y and z are in elements -- a classic CUDA
+//   gotcha.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void bake_density(cudaSurfaceObject_t surf, int size) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    int z = blockIdx.z * blockDim.z + threadIdx.z;
+    if (x >= size || y >= size || z >= size) return;
+
+    // Voxel-center position mapped into a few noise cells so fbm has structure
+    // across the volume. ~4 base cells across the volume gives puffy blobs.
+    const float NOISE_SCALE = 4.0f;
+    float fx = ((float)x + 0.5f) / (float)size;
+    float fy = ((float)y + 0.5f) / (float)size;
+    float fz = ((float)z + 0.5f) / (float)size;
+
+    float n = fbm(fx * NOISE_SCALE, fy * NOISE_SCALE, fz * NOISE_SCALE);
+
+    // FLOAT32 store: surf3Dwrite's x offset is in BYTES (x * sizeof(float)).
+    surf3Dwrite(n, surf, x * (int)sizeof(float), y, z);
+}
+
+// --------------------------------------------------------------------------
+// Density sampler: tex3D wants normalized coords in [0, 1]; the volume covers
+// [-1, 1] in world space, so we remap with (p + 1) * 0.5 and add a time-based
+// scroll (WRAP addressing wraps it without edge clamping). The raw fbm is then shaped into
+// a cloud density with:
+//   - a coverage threshold (higher `coverage` -> lower threshold -> more cloud)
+//   - a vertical height falloff that fades density near the top and bottom of
+//     the box so clouds float in a slab rather than filling the whole cube.
+// Returns density >= 0 (0 = clear air).
+// --------------------------------------------------------------------------
+__device__ __forceinline__ float sample_density(cudaTextureObject_t tex,
+                                                 float px, float py, float pz,
+                                                 float coverage, float t) {
+    // Slow horizontal drift + gentle vertical bob for evolving clouds.
+    float u = (px + 1.0f) * 0.5f + t * 0.015f;
+    float v = (py + 1.0f) * 0.5f + t * 0.004f;
+    float w = (pz + 1.0f) * 0.5f + t * 0.010f;
+    float n = tex3D<float>(tex, u, v, w);
+
+    // Coverage threshold: subtract a threshold and rescale so values below it
+    // become clear air. coverage in [0,1] maps to threshold in [~0.8, ~0.15].
+    float threshold = lerpf(0.80f, 0.15f, coverage);
+    float d = (n - threshold) / fmaxf(1.0f - threshold, 1e-3f);
+    d = clampf(d, 0.0f, 1.0f);
+
+    // Vertical height falloff: py in [-1, 1]. Fade to zero near the top/bottom
+    // so clouds form a horizontal band. Peak density around py ~ -0.1.
+    float h = clampf((py + 1.0f) * 0.5f, 0.0f, 1.0f);   // [0,1] bottom->top
+    float falloff = smoothstepf(clampf(h * 4.0f, 0.0f, 1.0f)) *
+                    smoothstepf(clampf((1.0f - h) * 2.5f, 0.0f, 1.0f));
+
+    return d * falloff;
+}
+
+// --------------------------------------------------------------------------
+// render_clouds: one thread per screen pixel. Volumetric ray march of the
+// density volume as participating media.
+//
+// Camera math (orbit, look-at origin, world-up (0, 1, 0)) matches the SDF
+// example. Per pixel:
+//   1. Build the ray, intersect the [-1, 1]^3 AABB (slab method).
+//   2. March front-to-back from the entry point. At each step sample density;
+//      if positive, do a SHORT secondary march toward the sun to estimate how
+//      much light reaches this sample (Beer-Lambert: exp(-sum*absorption)).
+//   3. Accumulate color and transmittance front-to-back. Early-out when
+//      transmittance < 0.01 (rest of the ray is occluded -> big speedup).
+//   4. Composite the accumulated cloud color over an analytic sky gradient
+//      (horizon-to-zenith blue + a sun glow), tonemap, write RGBA8.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void render_clouds(unsigned char* output,
+                   int width,
+                   int height,
+                   cudaTextureObject_t tex,
+                   float yaw,
+                   float pitch,
+                   float dist,
+                   float sun_height,
+                   float coverage,
+                   float t) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // ---- Build the orbit camera basis ----------------------------------
+    float cp = cosf(pitch), sp = sinf(pitch);
+    float cyw = cosf(yaw),  syw = sinf(yaw);
+
+    float ex = dist * cp * cyw;
+    float ey = dist * sp;
+    float ez = dist * cp * syw;
+
+    float fl = length3(ex, ey, ez);
+    if (fl < 1e-6f) fl = 1e-6f;
+    float fx = -ex / fl, fy = -ey / fl, fz = -ez / fl;
+
+    // right = normalize(cross(fwd, world_up)), world_up = (0, 1, 0).
+    float rx = -fz;
+    float ry = 0.0f;
+    float rz = fx;
+    float rl = length3(rx, ry, rz);
+    if (rl < 1e-6f) rl = 1e-6f;
+    rx /= rl; ry /= rl; rz /= rl;
+
+    // up' = cross(right, fwd).
+    float ux = ry * fz - rz * fy;
+    float uy = rz * fx - rx * fz;
+    float uz = rx * fy - ry * fx;
+
+    // ---- Per-pixel ray direction ---------------------------------------
+    float u_ndc = 2.0f * ((float)x + 0.5f) / (float)width  - 1.0f;
+    float v_ndc = 2.0f * ((float)y + 0.5f) / (float)height - 1.0f;
+
+    const float TAN_HALF = 0.41421356237309515f;       // tanf(45deg / 2)
+    float aspect = (float)width / (float)height;
+
+    float dx = fx + u_ndc * aspect * TAN_HALF * rx + v_ndc * TAN_HALF * ux;
+    float dy = fy + u_ndc * aspect * TAN_HALF * ry + v_ndc * TAN_HALF * uy;
+    float dz = fz + u_ndc * aspect * TAN_HALF * rz + v_ndc * TAN_HALF * uz;
+    float dl = length3(dx, dy, dz);
+    if (dl < 1e-6f) dl = 1e-6f;
+    dx /= dl; dy /= dl; dz /= dl;
+
+    // ---- Sun direction from sun_height ---------------------------------
+    // sun_height in [0,1]: 0 -> near horizon, 1 -> overhead. Keep a fixed
+    // azimuth so the light feels stable while orbiting.
+    float sun_el = sun_height * 1.4707963f;            // up to ~84 degrees
+    float se = sinf(sun_el), ce = cosf(sun_el);
+    const float SUN_AZ = 0.7853981633974483f;          // 45 deg azimuth
+    float lx = ce * cosf(SUN_AZ);
+    float ly = se;
+    float lz = ce * sinf(SUN_AZ);
+    float ll = length3(lx, ly, lz);
+    if (ll < 1e-6f) ll = 1e-6f;
+    lx /= ll; ly /= ll; lz /= ll;
+
+    // ---- Ray vs. the [-1, 1]^3 box (slab method) -----------------------
+    float inv_dx = 1.0f / (fabsf(dx) > 1e-8f ? dx : (dx >= 0 ? 1e-8f : -1e-8f));
+    float inv_dy = 1.0f / (fabsf(dy) > 1e-8f ? dy : (dy >= 0 ? 1e-8f : -1e-8f));
+    float inv_dz = 1.0f / (fabsf(dz) > 1e-8f ? dz : (dz >= 0 ? 1e-8f : -1e-8f));
+    float t1x = (-1.0f - ex) * inv_dx, t2x = ( 1.0f - ex) * inv_dx;
+    float t1y = (-1.0f - ey) * inv_dy, t2y = ( 1.0f - ey) * inv_dy;
+    float t1z = (-1.0f - ez) * inv_dz, t2z = ( 1.0f - ez) * inv_dz;
+    float tNear = fmaxf(fmaxf(fminf(t1x, t2x), fminf(t1y, t2y)), fminf(t1z, t2z));
+    float tFar  = fminf(fminf(fmaxf(t1x, t2x), fmaxf(t1y, t2y)), fmaxf(t1z, t2z));
+
+    // Accumulators: front-to-back compositing. transmittance starts at 1
+    // (fully clear); accumulated radiance starts at 0.
+    float trans = 1.0f;
+    float acc_r = 0.0f, acc_g = 0.0f, acc_b = 0.0f;
+
+    // Cloud material + lighting constants.
+    const float ABSORPTION   = 6.0f;    // primary extinction per unit density
+    const float SUN_ABSORP   = 8.0f;    // shadow-ray extinction per unit density
+    const float STEP_LEN     = 2.0f / 96.0f;   // ~one voxel at 96^3
+    const int   PRIMARY_STEPS = 96;
+    const int   SHADOW_STEPS   = 6;
+    const float SHADOW_STEP_LEN = 0.06f;
+
+    // Henyey-Greenstein forward-scattering phase function. g>0 biases scatter
+    // toward the light direction, producing the bright "silver lining" rim when
+    // the view ray points toward the sun. cos(theta) = dot(view_dir, sun_dir);
+    // both are unit length here. phase = (1-g^2) / (4pi * (1+g^2-2g*cos)^1.5).
+    // The constant 1/(4pi) factor is folded into the lighting scale below, so
+    // we only keep the angular shape that drives the glow.
+    const float HG_G = 0.6f;
+    float cos_vl = dot3(dx, dy, dz, lx, ly, lz);
+    float hg_denom = 1.0f + HG_G * HG_G - 2.0f * HG_G * cos_vl;
+    float hg_phase = (1.0f - HG_G * HG_G) / (hg_denom * sqrtf(fmaxf(hg_denom, 1e-4f)));
+
+    if (tFar > fmaxf(tNear, 0.0f)) {
+        float tcur = fmaxf(tNear, 0.0f) + 1e-4f;
+
+        #pragma unroll 1
+        for (int i = 0; i < PRIMARY_STEPS; ++i) {
+            if (tcur > tFar) break;
+
+            float pxw = ex + tcur * dx;
+            float pyw = ey + tcur * dy;
+            float pzw = ez + tcur * dz;
+
+            float density = sample_density(tex, pxw, pyw, pzw, coverage, t);
+
+            if (density > 1e-3f) {
+                // ---- Secondary march toward the sun for self-shadowing ----
+                float shadow_sum = 0.0f;
+                #pragma unroll
+                for (int s = 1; s <= SHADOW_STEPS; ++s) {
+                    float st = (float)s * SHADOW_STEP_LEN;
+                    float sxw = pxw + lx * st;
+                    float syw = pyw + ly * st;
+                    float szw = pzw + lz * st;
+                    // Stop sampling outside the box (no density there anyway).
+                    if (fabsf(sxw) > 1.0f || fabsf(syw) > 1.0f || fabsf(szw) > 1.0f) {
+                        break;
+                    }
+                    shadow_sum += sample_density(tex, sxw, syw, szw, coverage, t);
+                }
+                float sun_trans = expf(-shadow_sum * SUN_ABSORP * SHADOW_STEP_LEN);
+
+                // Powder ("dark edge") term: thin cloud edges scatter less light
+                // back than a naive 1-exp model predicts, so darken low-density
+                // samples for fluffier, more rounded volumes. Saturates toward 1
+                // in dense cloud (cores stay bright); only thin edges are dimmed.
+                // Apply as a gentle modulation so cores keep full sunlight.
+                float powder = 0.4f + 0.6f * (1.0f - expf(-density * 3.0f));
+
+                // Beer-Lambert extinction for this slab of the primary ray.
+                float slab_trans = expf(-density * ABSORPTION * STEP_LEN);
+                float absorbed = trans * (1.0f - slab_trans);
+
+                // Direct sunlight reaching this sample, shaped by the HG phase so
+                // it spikes when looking toward the sun (silver lining). Add a
+                // small ambient floor so shadowed cores stay bluish, not black.
+                float sun_light = sun_trans * (0.4f + 1.6f * hg_phase) * powder;
+                float lit = clampf(0.15f + sun_light, 0.0f, 1.6f);
+                float cr = lerpf(0.42f, 1.05f, clampf(lit, 0.0f, 1.0f)) + 0.05f * fmaxf(lit - 1.0f, 0.0f);
+                float cg = lerpf(0.48f, 0.99f, clampf(lit, 0.0f, 1.0f)) + 0.04f * fmaxf(lit - 1.0f, 0.0f);
+                float cb = lerpf(0.62f, 0.92f, clampf(lit, 0.0f, 1.0f));
+
+                acc_r += absorbed * cr;
+                acc_g += absorbed * cg;
+                acc_b += absorbed * cb;
+                trans *= slab_trans;
+
+                if (trans < 0.01f) break;   // remaining ray fully occluded
+            }
+
+            tcur += STEP_LEN;
+        }
+    }
+
+    // ---- Analytic sky behind / through the clouds ----------------------
+    // Vertical gradient from a pale horizon to a deeper zenith blue, plus a
+    // soft sun glow where the ray direction aligns with the sun.
+    float up_amt = clampf(0.5f * (dy + 1.0f), 0.0f, 1.0f);
+    float sky_r = lerpf(0.70f, 0.18f, up_amt);
+    float sky_g = lerpf(0.80f, 0.34f, up_amt);
+    float sky_b = lerpf(0.92f, 0.62f, up_amt);
+
+    // Sun glow + a crisp sun disk. The broad glow uses a moderate power; the
+    // disk is a high-power lobe that reads as a bright, slightly warm sun.
+    float sun_dot = clampf(dot3(dx, dy, dz, lx, ly, lz), 0.0f, 1.0f);
+    float glow = powf(sun_dot, 64.0f);
+    float disk = powf(sun_dot, 2048.0f);
+    sky_r += glow * 0.8f + disk * 6.0f;
+    sky_g += glow * 0.7f + disk * 5.4f;
+    sky_b += glow * 0.5f + disk * 3.6f;
+
+    // Composite: accumulated cloud radiance over the sky weighted by the
+    // remaining transmittance.
+    float r = acc_r + trans * sky_r;
+    float g = acc_g + trans * sky_g;
+    float b = acc_b + trans * sky_b;
+
+    // Simple Reinhard tonemap to keep the sun glow from blowing out.
+    r = r / (1.0f + r);
+    g = g / (1.0f + g);
+    b = b / (1.0f + b);
+    // Mild gamma for a punchier image.
+    r = powf(clampf(r, 0.0f, 1.0f), 0.85f);
+    g = powf(clampf(g, 0.0f, 1.0f), 0.85f);
+    b = powf(clampf(b, 0.0f, 1.0f), 0.85f);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(r * 255.0f);
+    output[idx + 1] = (unsigned char)(g * 255.0f);
+    output[idx + 2] = (unsigned char)(b * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+# Vertex stage: forwards the 2D `position` attribute unchanged as the
+# clip-space position (z = 0, w = 1) and hands `texcoord` to the fragment
+# stage as `v_texcoord`.
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+# Fragment stage: samples the `tex` sampler at the interpolated coordinate and
+# writes the texel straight to the output color (no lighting, no blending).
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+# Run the example only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_fire.py b/cuda_core/examples/gl_interop_fire.py
new file mode 100644
index 00000000000..ad9008757eb
--- /dev/null
+++ b/cuda_core/examples/gl_interop_fire.py
@@ -0,0 +1,819 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop: a classic
+# "Doom-style" procedural fire effect. A scalar heat field lives on a
+# ping-ponged pair of single-channel UINT8 CUDAArrays; each frame the field is
+# advected upward with a horizontal jitter and a small decay, then colorized
+# through a 1D fire-palette TextureObject straight into an OpenGL PBO.
+# Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to combine a 2D single-channel UINT8 CUDAArray (the heat field) and a
+#   1D RGBA8 CUDAArray (the color palette) under the same texture/surface API.
+# - How to ping-pong a scalar field via CUDAArray + SurfaceObject writes and
+#   TextureObject reads, similar to the reaction-diffusion example but with a
+#   single channel.
+# - How to use TextureObject(NORMALIZED_FLOAT) on a UINT8 palette so a
+#   tex1D<float4> lookup returns RGBA in [0, 1] -- no manual unpacking needed.
+# - How to wire mouse / keyboard events into a CUDA simulation without
+#   blocking the event loop.
+#
+# How it works
+# ============
+# The heat field is a WIDTH x HEIGHT grid of integer intensities in
+# [0, MAX_INTENSITY], one palette index per texel. Each frame we:
+#
+#   1. step kernel: for every pixel,
+#        - if y is near the bottom AND ambient injection is on, write random
+#          high heat ("the embers");
+#        - if the mouse button is held, paint a hot disc near the cursor;
+#        - otherwise read a horizontally-jittered sample from the row "below"
+#          (i.e. one texel toward the bottom of the screen) and subtract a
+#          small decay. This is what creates the upward-flickering motion.
+#   2. colorize kernel: per pixel, sample the heat, look it up in a 1D RGBA8
+#      fire palette via tex1D<float4>, and write RGBA bytes into the PBO.
+#
+#   PING-PONG (two single-channel UINT8 CUDAArrays)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +-------------+    tex2D read     +-------------+
+#   |   heat_a    | ----------------> |             |
+#   |  (UINT8 x1) |                   |  step_fire  |
+#   +-------------+                   |   kernel    |
+#                                     |             |
+#   +-------------+   surf2Dwrite     |             |
+#   |   heat_b    | <---------------- |             |
+#   |  (UINT8 x1) |                   +-------------+
+#   +-------------+
+#       (swap)
+#
+# Orientation
+# -----------
+# OpenGL displays texel row 0 at the bottom of the window. The fullscreen quad
+# in create_display_resources() flips t so that kernel y=0 lands at the TOP of
+# the screen -- this lets the kernel keep the intuitive "inject at y = h-1,
+# advect from y+1 -> y" convention while the visible flames rise upward.
+# Mouse coordinates from pyglet (y=0 at window bottom) are flipped to the
+# kernel's y-down convention on entry.
+#
+# surf2Dwrite x-in-bytes
+# ----------------------
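+# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. The heat
+# surfaces here are single-channel UINT8, so the byte offset is simply `x`
+# (`x * sizeof(unsigned char)`). For a float surface it would be
+# `x * sizeof(float)` = `x * 4`; forgetting that scale on a wider texel type
+# silently writes to the wrong columns.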
+#
+# What you should see
+# ===================
+# A flickering wall of doom-style fire rising from the bottom of the window.
+# Hold the mouse button and drag to paint a torch of heat at the cursor.
+# Press SPACE to toggle the ambient embers along the bottom row (the fire
+# will die out when ambient is OFF). Press R to clear the heat field.
+# Press Escape or close the window to exit. The window title shows FPS and
+# whether ambient injection is currently on.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+# Window dimensions (what the user sees).
+WINDOW_WIDTH = 640
+WINDOW_HEIGHT = 480
+
+# Simulation dimensions (the heat-field grid). Doom's actual screen was
+# 320x200; we use 320x100 so the canonical decay rate of ~1 intensity unit
+# per row (random {0, 1, 2}, average 1) produces flames that reach ~36% of
+# the screen height -- the recognizable "tall licking flames" look.
+# NEAREST-filtered upscale to the 640x480 window stretches vertically 4.8x
+# (and 2x horizontally), giving the chunky retro pixel-doubled appearance.
+WIDTH = 320
+HEIGHT = 100
+
+# Canonical Doom fire palette: 37 hand-tuned colors (intensity 0..36 -> RGB).
+# Source: https://github.com/tiagomenegaz/doom-fire (and Fabien Sanglard's
+# analysis of the original PSX Doom fire effect).
+# MAX_INTENSITY is the hottest value a texel can hold and therefore the last
+# valid palette index (PALETTE_SIZE - 1); keep the two in sync when editing.
+PALETTE_SIZE = 37
+MAX_INTENSITY = 36
+TORCH_RADIUS = 12  # pixel radius of the mouse-painted hot disc (sim space)
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Compile the CUDA kernels and return (device, stream, kernels, configs)."""
+    dev = Device(0)
+    dev.set_current()
+
+    # SurfaceObject requires surface load/store, which has existed since SM 2.0,
+    # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Compile as C++ so the templated tex1D<float4> / tex2D<float> overloads
+    # resolve.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("step_fire", "colorize_fire"),
+    )
+
+    kernels = {
+        "step": mod.get_kernel("step_fire"),
+        "colorize": mod.get_kernel("colorize_fire"),
+    }
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+    # Both kernels are pixel-parallel over a WIDTH x HEIGHT grid.
+    configs = {"step": config, "colorize": config}
+
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet)."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WINDOW_WIDTH,
+        WINDOW_HEIGHT,
+        caption="cuda.core CUDAArray/Texture/Surface - Doom Fire",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    Standard OpenGL boilerplate for a textured fullscreen quad. The texcoord
+    `t` is flipped versus the plasma example so that kernel y=0 lands at the
+    TOP of the screen. That lets the fire kernel keep the intuitive
+    "inject at the largest y, advect upward" convention while the visible
+    flames rise toward the top.
+
+    Returns (shader_program, vertex_array_id, texture_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window). Note the
+    # flipped t coordinates compared to gl_interop_plasma: (-1, -1) gets t=1
+    # so screen-bottom samples the kernel's largest-y row.
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1,
+            -1,
+            0,
+            1,
+            1,
+            -1,
+            1,
+            1,
+            1,
+            1,
+            1,
+            0,
+            -1,
+            -1,
+            0,
+            1,
+            1,
+            1,
+            1,
+            0,
+            -1,
+            1,
+            0,
+            0,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (filled each frame from the PBO).
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    # NEAREST upscale: makes the low-res simulation render with crisp,
+    # blocky pixels instead of bilinear-blended mush. Critical to the
+    # Doom-fire look.
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_NEAREST)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_NEAREST)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a Pixel Buffer Object (PBO) -- the bridge between CUDA and OpenGL.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # RGBA, 1 byte per channel
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # None = read from the currently bound PBO, not from CPU
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw the texture to the screen using the fullscreen quad.
+
+    Activates `shader_prog`, binds `tex_id` and the quad's vertex array
+    `vao_id`, issues one draw of 6 vertices (two triangles), then unbinds the
+    vertex array and the program. The texture binding is left in place.
+    """
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    # 6 vertices = the two triangles built in create_display_resources().
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+def make_heat_arrays():
+    """Allocate two single-channel UINT8 ping-pong Arrays for the heat field.
+
+    Intensity is an integer in [0, 36] indexing the canonical Doom palette.
+    UINT8 is exactly one byte per texel -- surf2Dwrite x-coord = x * 1.
+    """
+    arr_a = CUDAArray.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.UINT8,
+        num_channels=1,
+        is_surface_load_store=True,
+    )
+    arr_b = CUDAArray.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.UINT8,
+        num_channels=1,
+        is_surface_load_store=True,
+    )
+    return arr_a, arr_b
+
+
+def make_heat_texture(arr):
+    """Bind `arr` as a TextureObject configured for POINT + CLAMP reads.
+
+    POINT filtering is what gives Doom fire its chunky retro look. LINEAR
+    smooths the per-frame horizontal jitter into a uniform glow that
+    doesn't read as fire.
+    """
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.POINT,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        # Non-normalized: the step kernel addresses texels in pixel space.
+        normalized_coords=False,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def build_fire_palette():
+    """Return the canonical Doom fire palette as a (37, 4) uint8 array.
+
+    The 37 entries map intensity 0 (black) -> 36 (white). Each entry is
+    indexed by the integer intensity in the heat field.
+
+    Source: Fabien Sanglard's PSX Doom analysis, reproduced in
+    https://github.com/tiagomenegaz/doom-fire.
+    """
+    rgb = [
+        (7, 7, 7),
+        (31, 7, 7),
+        (47, 15, 7),
+        (71, 15, 7),
+        (87, 23, 7),
+        (103, 31, 7),
+        (119, 31, 7),
+        (143, 39, 7),
+        (159, 47, 7),
+        (175, 63, 7),
+        (191, 71, 7),
+        (199, 71, 7),
+        (223, 79, 7),
+        (223, 87, 7),
+        (223, 87, 7),
+        (215, 95, 7),
+        (215, 95, 7),
+        (215, 103, 15),
+        (207, 111, 15),
+        (207, 119, 15),
+        (207, 127, 15),
+        (207, 135, 23),
+        (199, 135, 23),
+        (199, 143, 23),
+        (199, 151, 31),
+        (191, 159, 31),
+        (191, 159, 31),
+        (191, 167, 39),
+        (191, 167, 39),
+        (191, 175, 47),
+        (183, 175, 47),
+        (183, 183, 47),
+        (183, 183, 55),
+        (207, 207, 111),
+        (223, 223, 159),
+        (239, 239, 199),
+        (255, 255, 255),
+    ]
+    # Index 0 (the "no fire" color) is rendered as pure black so dead pixels
+    # don't glow. The canonical (7, 7, 7) reads as a dim background which is
+    # less dramatic against the dark window.
+    rgb[0] = (0, 0, 0)
+    assert len(rgb) == PALETTE_SIZE
+    rgba = np.empty((PALETTE_SIZE, 4), dtype=np.uint8)
+    rgba[:, :3] = np.array(rgb, dtype=np.uint8)
+    rgba[:, 3] = 255
+    return rgba
+
+
+def make_palette_array_and_texture(stream):
+    """Allocate the 1D RGBA8 palette CUDAArray, upload, and bind as a texture.
+
+    Returns (palette_array, palette_texture). Both must be closed by the
+    caller (or used inside `with` blocks).
+    """
+    palette = build_fire_palette()  # shape (PALETTE_SIZE, 4), uint8
+    arr = CUDAArray.from_descriptor(
+        shape=(PALETTE_SIZE,),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+    )
+    # 1D CUDAArray bytes match a flat (PALETTE_SIZE * 4) uint8 buffer.
+    arr.copy_from(np.ascontiguousarray(palette), stream=stream)
+
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        # POINT keeps the palette stops as discrete color bands -- the
+        # classic Doom fire palette is indexed, not gradient-blended.
+        filter_mode=FilterMode.POINT,
+        # NORMALIZED_FLOAT: tex1D<float4> returns each UINT8 channel as a
+        # float in [0, 1], so the colorize kernel can multiply by 255 and
+        # store directly without manual unpacking.
+        read_mode=ReadMode.NORMALIZED_FLOAT,
+        # Normalized: the kernel feeds a heat value in [0, 1] as the LUT
+        # coordinate. With normalized_coords=True the LINEAR filter blends
+        # adjacent palette entries smoothly.
+        normalized_coords=True,
+    )
+    tex = TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+    return arr, tex
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Run the interactive fire demo until the window is closed.
+
+    Sets up CUDA and OpenGL, allocates the ping-pong heat arrays plus the
+    palette, registers pyglet event handlers that step and draw the
+    simulation, and then hands control to the pyglet event loop.
+    """
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate heat-field Arrays, palette CUDAArray, and the
+    #             bindless handles (two heat textures, two heat surfaces, and
+    #             the palette texture). We hold them open for the lifetime of
+    #             the window and release in on_close(), matching the
+    #             reaction-diffusion example. (Using `with` blocks here would
+    #             close everything before the pyglet event loop has a chance
+    #             to use them.)
+    arr_a, arr_b = make_heat_arrays()
+    palette_arr, palette_tex = make_palette_array_and_texture(stream)
+    tex_a = make_heat_texture(arr_a)
+    tex_b = make_heat_texture(arr_b)
+    surf_a = SurfaceObject.from_array(arr_a)
+    surf_b = SurfaceObject.from_array(arr_b)
+
+    # Mutable simulation state shared by the event handlers below. The heat
+    # arrays themselves are given their initial contents by clear_field(),
+    # which is called once before the event loop starts.
+    state = {
+        "current": "a",  # which array holds the latest heat field
+        "frame_index": 0,  # passed into the step kernel as `t`
+        "ambient": True,  # SPACE toggles bottom-row injection
+        "mouse_down": False,
+        "mouse_x": 0,
+        "mouse_y": 0,
+    }
+
+    def current_read_write():
+        # Returns (texture to read, surface to write, name of the array that
+        # becomes "current" once the step kernel has run).
+        if state["current"] == "a":
+            return tex_a, surf_b, "b"  # read a, write b, next current = b
+        return tex_b, surf_a, "a"
+
+    def clear_field():
+        """Zero both heat arrays and seed the bottom row at full intensity.
+
+        CUDAArray.copy_from is the simplest reset path -- a dedicated clear
+        kernel would be faster but is unnecessary for an interactive demo.
+        The bottom row is set to MAX_INTENSITY so the very first frame
+        already has a fire source to advect from.
+        """
+        # Host layout is (rows, columns) = (HEIGHT, WIDTH); the last row is
+        # the largest y, i.e. the bottom of the screen.
+        seed = np.zeros((HEIGHT, WIDTH), dtype=np.uint8)
+        seed[HEIGHT - 1, :] = MAX_INTENSITY  # canonical Doom fire source
+        arr_a.copy_from(np.ascontiguousarray(seed), stream=stream)
+        arr_b.copy_from(np.ascontiguousarray(seed), stream=stream)
+        state["current"] = "a"
+
+    # Seed at startup so frame 1 already has a source row.
+    clear_field()
+    stream.sync()
+
+    # --- Step 7: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        # ESC closes the window, SPACE toggles ambient embers, R resets.
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            # NOTE(review): window.close() is called directly here; confirm
+            # that on_close() (which releases the CUDA resources) still runs
+            # on this path and does so while the GL context is valid.
+            window.close()
+            return
+        if symbol == key.SPACE:
+            state["ambient"] = not state["ambient"]
+            return
+        if symbol == key.R:
+            clear_field()
+            return
+
+    # Map window coords (WINDOW_WIDTH x WINDOW_HEIGHT, y=0 at bottom) to
+    # simulation coords (WIDTH x HEIGHT, y=0 at top).
+    def _window_to_sim(x, y):
+        sx = int(x * WIDTH / WINDOW_WIDTH)
+        sy = int((WINDOW_HEIGHT - 1 - y) * HEIGHT / WINDOW_HEIGHT)
+        return sx, sy
+
+    @window.event
+    def on_mouse_press(x, y, _button, _modifiers):
+        # Start painting heat at the cursor.
+        state["mouse_down"] = True
+        state["mouse_x"], state["mouse_y"] = _window_to_sim(x, y)
+
+    @window.event
+    def on_mouse_release(_x, _y, _button, _modifiers):
+        state["mouse_down"] = False
+
+    @window.event
+    def on_mouse_drag(x, y, _dx, _dy, _buttons, _modifiers):
+        # Keep the torch following the cursor while a button is held.
+        state["mouse_down"] = True
+        state["mouse_x"], state["mouse_y"] = _window_to_sim(x, y)
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+
+        # (a) Advance the heat field by one step.
+        tex_read, surf_write, next_current = current_read_write()
+        launch(
+            stream,
+            configs["step"],
+            kernels["step"],
+            np.uint64(tex_read.handle),
+            np.uint64(surf_write.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.uint32(state["frame_index"]),
+            np.int32(state["mouse_x"]),
+            np.int32(state["mouse_y"]),
+            np.int32(1 if state["mouse_down"] else 0),
+            np.int32(1 if state["ambient"] else 0),
+        )
+        state["current"] = next_current
+        state["frame_index"] += 1
+
+        # (b) Colorize the latest state into the OpenGL PBO. Both launches go
+        #     to the same stream, so the colorize kernel is enqueued after the
+        #     step kernel that produced the field it reads.
+        tex_heat = tex_a if state["current"] == "a" else tex_b
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["colorize"],
+                kernels["colorize"],
+                np.uint64(tex_heat.handle),
+                np.uint64(palette_tex.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (c) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (d) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title), refreshed about once a second.
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            ambient_label = "on" if state["ambient"] else "off"
+            window.set_caption(
+                "cuda.core CUDAArray/Texture/Surface - Doom Fire"
+                f" ({WIDTH}x{HEIGHT}, {fps:.0f} FPS,"
+                f" ambient {ambient_label})"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Release everything we opened: the GL interop resource first, then
+        # the texture/surface handles before the arrays that back them, and
+        # the stream last. Each of these is a context manager too, but pyglet
+        # owns the event loop here so we release explicitly to be
+        # deterministic about ordering.
+        resource.close()
+        tex_a.close()
+        tex_b.close()
+        surf_a.close()
+        surf_b.close()
+        palette_tex.close()
+        palette_arr.close()
+        arr_a.close()
+        arr_b.close()
+        stream.close()
+
+    # interval=0 asks pyglet to redraw as fast as possible.
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. The important things to know:
+#
+#   - KERNEL_SOURCE contains two CUDA C++ kernels:
+#       * step_fire     -- advances the heat field. Reads previous state via a
+#                          TextureObject (LINEAR + CLAMP, non-normalized) and
+#                          writes the next state via a SurfaceObject. Bakes
+#                          the bottom-row injection, mouse torch, and upward
+#                          jittered advection into a single pass.
+#       * colorize_fire -- per pixel: read heat from the heat TextureObject,
+#                          look up the fire palette via tex1D<float4>, write
+#                          RGBA bytes to the OpenGL PBO.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a
+#     texture onto a rectangle covering the entire window. The quad's t
+#     coordinate is flipped versus the plasma example so that y=0 maps to the
+#     top of the screen (see create_display_resources for why).
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// Small, deterministic, GPU-friendly hash. Returns a value in [0, 1).
+// Used both for bottom-row ember intensity and for the per-pixel jitter that
+// gives the fire its characteristic horizontal flicker.
+__device__ __forceinline__ float hash3(unsigned int x, unsigned int y,
+                                       unsigned int t) {
+    unsigned int h = x * 374761393u + y * 668265263u + t * 2246822519u;
+    h = (h ^ (h >> 13)) * 1274126177u;
+    h ^= (h >> 16);
+    return (float)(h & 0x00ffffffu) / (float)0x01000000u;
+}
+
+// Canonical Doom-fire step (gather form of the original scatter algorithm).
+//
+// Reference scatter (one cell per JS source row):
+//     decay = random in {0, 1, 2}
+//     below = state[x, y+1]
+//     new = max(0, below - decay)
+//     state[x - decay, y] = new        // writes LEFT of source -> leftward lean
+//
+// Equivalent gather (one CUDA thread per destination cell):
+//     decay = hash(x, y, t) in {0, 1, 2}
+//     below = state[x + decay, y+1]    // reads from the right-shifted source
+//     new = max(0, below - decay)
+//     state[x, y] = new
+//
+// The right-shifted gather reads the same data the leftward-shifted scatter
+// would have produced.
+
+extern "C"
+__global__
+void step_fire(cudaTextureObject_t tex_read,
+               cudaSurfaceObject_t surf_write,
+               int width, int height,
+               unsigned int t,
+               int mouse_x, int mouse_y, int mouse_active,
+               int ambient_on) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    const int MAX_I = 36;
+
+    // 1) Mouse torch: a hot disc painted at the cursor (overrides everything).
+    if (mouse_active) {
+        int dx = x - mouse_x;
+        int dy = y - mouse_y;
+        if (dx * dx + dy * dy <= 12 * 12) {  // matches host TORCH_RADIUS
+            surf2Dwrite((unsigned char)MAX_I, surf_write, x, y);
+            return;
+        }
+    }
+
+    // 2) Bottom row is the steady fire source. Hardcoded to MAX_I when the
+    //    ambient ember bed is on; zero otherwise (lets the fire die down).
+    if (y == height - 1) {
+        surf2Dwrite((unsigned char)(ambient_on ? MAX_I : 0),
+                    surf_write, x, y);
+        return;
+    }
+
+    // 3) Gather from the row below with random {0, 1, 2} horizontal shift
+    //    and matching intensity decay -- the canonical Doom-fire update.
+    float jitter_h = hash3((unsigned int)x, (unsigned int)y, t);
+    int decay = (int)(jitter_h * 3.0f);             // 0, 1, or 2
+    int src_x = x + decay;
+    if (src_x >= width) src_x = width - 1;
+    unsigned char below = tex2D<unsigned char>(tex_read,
+                                               (float)src_x + 0.5f,
+                                               (float)y + 1.5f);
+    int new_i = (int)below - decay;
+    if (new_i < 0) new_i = 0;
+
+    // UINT8 is 1 byte, so surf2Dwrite's x argument is already the byte offset.
+    surf2Dwrite((unsigned char)new_i, surf_write, x, y);
+}
+
+extern "C"  // one thread per pixel: heat byte -> palette color -> RGBA8 in the PBO
+__global__
+void colorize_fire(cudaTextureObject_t tex_heat,
+                   cudaTextureObject_t palette_tex,
+                   unsigned char* output,
+                   int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Heat texture is UINT8 + ELEMENT_TYPE: tex2D<unsigned char> returns the
+    // raw intensity byte (0..36).
+    unsigned char h = tex2D<unsigned char>(tex_heat,
+                                           (float)x + 0.5f,
+                                           (float)y + 0.5f);
+
+    // Palette texture is 1D normalized RGBA8 with POINT filtering and 37
+    // entries. Index i lands at coord (i + 0.5) / 37 -- the texel center,
+    // which POINT samples exactly.
+    const float palette_size = 37.0f;
+    float u = ((float)h + 0.5f) / palette_size;
+    float4 c = tex1D<float4>(palette_tex, u);
+    // +0.5f rounds to nearest: truncating (n / 255) * 255 can land one below n.
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(c.x * 255.0f + 0.5f);
+    output[idx + 1] = (unsigned char)(c.y * 255.0f + 0.5f);
+    output[idx + 2] = (unsigned char)(c.z * 255.0f + 0.5f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_fluid.py b/cuda_core/examples/gl_interop_fluid.py
new file mode 100644
index 00000000000..1423580fcdb
--- /dev/null
+++ b/cuda_core/examples/gl_interop_fluid.py
@@ -0,0 +1,1251 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop. It runs a
+# real-time Stable Fluids (Jos Stam) smoke/ink solver entirely on the GPU:
+# velocity, pressure, and dye fields live in ping-ponged CUDA arrays, are read
+# through TextureObjects with free hardware bilinear filtering (the heart of
+# semi-Lagrangian advection), and written back through SurfaceObjects. The dye
+# is colorized straight into an OpenGL PBO. Drag the mouse to inject swirling
+# ink. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How semi-Lagrangian advection uses tex2D LINEAR sampling: trace each cell
+#   backward along the velocity field and read the old quantity with free
+#   hardware bilinear interpolation (no manual lerp, no neighbor gather).
+# - How to drive several distinct kernels (advect, divergence, Jacobi pressure
+#   solve, gradient subtraction, dye advect, colorize) over a shared set of
+#   pre-created TextureObject/SurfaceObject handles, ping-ponging multiple
+#   fields without recreating handles per frame.
+# - How to fold live mouse input into a GPU simulation: capture the mouse delta
+#   and splat velocity + dye into the field via a SurfaceObject (in-place
+#   read-modify-write, one thread per cell -> no race).
+#
+# How it works
+# ============
+# Stam's "Stable Fluids" solves the incompressible Navier-Stokes equations on a
+# regular grid by splitting each step into stages that are each individually
+# stable:
+#
+#   1. ADVECT VELOCITY  - move the velocity field along itself. For each cell we
+#      back-trace its center one timestep against the local velocity and read
+#      the old velocity there with tex2D<float2> LINEAR (bilinear). This is the
+#      unconditionally-stable semi-Lagrangian scheme.
+#   2. SPLAT (input)    - add the mouse-drag velocity and a dab of dye in a soft
+#      radial brush around the cursor (in-place on the velocity/dye surfaces).
+#   3. DIVERGENCE       - compute div(velocity), the amount each cell is a
+#      source/sink. An incompressible fluid must have zero divergence.
+#   4. PRESSURE SOLVE   - Jacobi-iterate the Poisson equation lap(p) = div,
+#      ping-ponging two pressure buffers for ~30 iterations.
+#   5. SUBTRACT GRADIENT- v <- v - grad(p). This projects the velocity onto its
+#      divergence-free part, enforcing incompressibility.
+#   6. ADVECT DYE       - move the ink along the (now divergence-free) velocity,
+#      again with tex2D LINEAR back-tracing.
+#   7. COLORIZE         - map dye density through a vivid gradient into the PBO.
+#
+#   PING-PONG (read one array, write the other, then swap)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +-----------+   tex2D<float2> LINEAR   +-------------+   surf2Dwrite   +-----------+
+#   |  vel_a    | -----------------------> |  advect /   | --------------> |  vel_b    |
+#   | (vx, vy)  |                          |  jacobi /   |                 | (vx, vy)  |
+#   +-----------+                          |  advect_dye |                 +-----------+
+#        ^                                 +-------------+                       |
+#        +-------------------------------- (swap) ------------------------------+
+#
+# Why LINEAR + CLAMP + normalized coords?
+# ---------------------------------------
+# Semi-Lagrangian advection traces a cell's center back to an arbitrary
+# fractional position and needs the interpolated field value there. LINEAR
+# filtering gives that bilinear interpolation for free in hardware. We use a
+# bounded box (CLAMP) rather than a torus so ink piles up against the walls
+# instead of wrapping. CLAMP, like all addressing modes, behaves cleanly with
+# normalized coordinates, and we sample at texel centers `(i + 0.5) / N` so a
+# zero-velocity cell reads back exactly its own value.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. Velocity is a
+# `float2` (8 bytes) so its x offset is `x * sizeof(float2)`; pressure and
+# divergence are `float` (4 bytes, `x * sizeof(float)`); the dye is a `float4`
+# RGBA color (16 bytes, `x * sizeof(float4)`). Getting this wrong silently
+# corrupts every other column.
+#
+# What you should see
+# ===================
+# Big blobs of saturated color are dropped into the fluid every fraction of a
+# second and immediately billow, swirl, and mix into turbulent ribbons that
+# fill the window -- "ink dropped in water." Drag the mouse to paint your own
+# rainbow ink. Press R to clear, Escape to exit. The window title shows the
+# current FPS, pressure-iteration count, and live texture/surface config.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import colorsys
+import ctypes
+import math
+import random
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 512
+HEIGHT = 512
+DT = 1.0  # simulation timestep
+PRESSURE_ITERS = 30  # Jacobi iterations for the pressure solve per frame
+VELOCITY_DISSIPATION = 0.999  # per-step velocity decay (1.0 = no decay)
+DYE_DISSIPATION = 0.994  # per-step dye decay; ink lingers and builds, then fades
+SPLAT_RADIUS = 24.0  # brush radius in cells for mouse injection
+SPLAT_FORCE = 6.0  # how strongly a mouse delta becomes velocity
+SPLAT_DYE = 1.0  # mouse ink intensity (color * this is deposited)
+CURL_SEED = 2.5  # strength of the ambient curl seeded on reset
+# Vorticity confinement pushes velocity back toward regions of high |curl|,
+# sharpening the swirls that numerical diffusion would otherwise smear out.
+# This is the single extra kernel that turns soft blobs into crisp curling
+# plumes. Tunable: ~0.1-0.3 reads well at DT=1.0; higher gets turbulent.
+VORTICITY = 0.28  # confinement strength (0.0 disables it)
+
+# Auto-bursts keep the simulation alive and colorful without any input: when
+# the mouse is idle we periodically drop a big blob of a random bright color
+# with a random velocity impulse at a random spot -- the classic "ink dropped
+# in water" look that quickly fills the frame with billowing, swirling color.
+# Grab the cursor and drag to paint your own ink.
+AUTO_EMIT = True
+BURST_INTERVAL = 0.45  # seconds between automatic colored bursts
+BURSTS_PER_EVENT = 2  # blobs dropped each burst event
+BURST_RADIUS = 42.0  # blob radius in cells (big, soft)
+BURST_FORCE = 18.0  # velocity impulse magnitude per blob
+BURST_DYE = 1.2  # ink intensity per blob (random color * this)
+
+# This solver advances one step per displayed frame, so its per-step rates
+# (dissipation, advection distance) would otherwise depend on the frame rate --
+# on a fast GPU the dye would dissipate away almost instantly between bursts.
+# We make it frame-rate INDEPENDENT instead: every frame, the real elapsed time
+# is expressed in units of a REF_FPS reference step and the dissipation and
+# advection distance are scaled by it, so the ink evolves at the same wall-clock
+# rate (and looks the same) whether the loop runs at 60 or 2000 FPS. Running
+# faster just means more, smaller, smoother substeps.
+REF_FPS = 60.0
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Compile the CUDA kernels and return (device, stream, kernels, config).
+
+    `kernels` maps short names to compiled kernels; `config` is the one shared
+    LaunchConfig (every kernel is pixel-parallel over the same WIDTH x HEIGHT grid).
+    """
+    dev = Device(0)
+    dev.set_current()
+
+    # SurfaceObject requires surface load/store, which has existed since SM 2.0,
+    # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Compile as C++ so the templated tex2D<float2> overload resolves.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=(
+            "seed_field",
+            "splat",
+            "advect_velocity",
+            "vorticity_confinement",
+            "divergence",
+            "pressure_jacobi",
+            "subtract_gradient",
+            "advect_dye",
+            "colorize",
+        ),
+    )
+
+    kernels = {
+        "seed": mod.get_kernel("seed_field"),
+        "splat": mod.get_kernel("splat"),
+        "advect_vel": mod.get_kernel("advect_velocity"),
+        "vorticity": mod.get_kernel("vorticity_confinement"),
+        "divergence": mod.get_kernel("divergence"),
+        "jacobi": mod.get_kernel("pressure_jacobi"),
+        "subtract": mod.get_kernel("subtract_gradient"),
+        "advect_dye": mod.get_kernel("advect_dye"),
+        "colorize": mod.get_kernel("colorize"),
+    }
+
+    block = (16, 16, 1)  # 256 threads per block
+    grid = (  # ceil-division so partial tiles at the right/bottom edges are covered
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+
+    return dev, stream, kernels, config
+
+
+def create_window():
+    """Open a WIDTH x HEIGHT pyglet window; return (window, gl_module, pyglet) or exit(1) without pyglet."""
+    try:
+        import pyglet  # imported here so a missing pyglet yields the message below
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core CUDAArray/Texture/Surface - Stable Fluids",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    Builds a pass-through shader program, a fullscreen quad (VAO + VBO), and an
+    empty `width` x `height` RGBA8 texture with LINEAR min/mag filtering. None
+    of this is CUDA-specific -- it is standard OpenGL boilerplate.
+
+    Returns (shader_program, vertex_array_id, texture_id). The shader_program
+    is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Shader program -- just passes texture coordinates through
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window)
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1,  # triangle 1, vertex 0: bottom-left
+            -1,
+            0,
+            0,
+            1,  # triangle 1, vertex 1: bottom-right
+            -1,
+            1,
+            0,
+            1,  # triangle 1, vertex 2: top-right
+            1,
+            1,
+            1,
+            -1,  # triangle 2, vertex 0: bottom-left
+            -1,
+            0,
+            0,
+            1,  # triangle 2, vertex 1: top-right
+            1,
+            1,
+            1,
+            -1,  # triangle 2, vertex 2: top-left
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (will be filled each frame from the PBO)
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a Pixel Buffer Object (PBO) -- the bridge between CUDA and OpenGL.
+
+    A PBO is a GPU-side buffer that OpenGL can read from when uploading pixels
+    to a texture. By registering this same buffer with CUDA, the CUDA kernel
+    can write directly into it. It is sized for `width * height` RGBA8 pixels.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # RGBA, 1 byte per channel
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)  # allocate only, no data
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)  # leave no PBO bound
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload `width` x `height` RGBA8 pixels from the PBO into the GL texture (GPU-to-GPU), then unbind the PBO."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,  # mip level
+        0,  # x offset
+        0,  # y offset
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # None = read from the currently bound PBO, not from CPU
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture `tex_id` over the whole window using `shader_prog` and the quad stored in `vao_id`."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)  # unbind so later GL code starts from a clean state
+    gl.glUseProgram(0)
+
+
+# ============================ API MAP (cuda.core) ===========================
+#
+# The make_* helpers below are where every CUDAArray / ResourceDescriptor /
+# TextureDescriptor / TextureObject / SurfaceObject knob in this example is set.
+# Each visible setting maps to a concrete piece of cuda.core / CUDA behavior:
+#
+#   CUDAArray.from_descriptor(...)   -> allocates a CUDA *array* (opaque, tiled
+#                                       layout optimized for 2D texture fetches),
+#                                       not linear device memory.
+#   ArrayFormat.FLOAT32              -> each channel is a 32-bit float texel.
+#   num_channels=2 / 1 / 4           -> float2 velocity (vx, vy) / scalar float
+#                                       (pressure, divergence) / float4 RGBA dye;
+#                                       also fixes the surf2Dwrite byte offset.
+#   is_surface_load_store=True       -> the SAME array can be bound both as a
+#                                       TextureObject (cached, filtered READS)
+#                                       and as a SurfaceObject (raw WRITES). This
+#                                       is what lets each field be sampled and
+#                                       then written back in the ping-pong.
+#
+#   ResourceDescriptor.from_array(arr) -> wraps the CUDAArray as the resource a
+#                                         TextureObject reads from.
+#   FilterMode.LINEAR                -> free HARDWARE bilinear interpolation;
+#                                       this is what makes semi-Lagrangian
+#                                       advection a single tex2D fetch at a
+#                                       fractional back-traced position (no
+#                                       manual lerp, no neighbor gather).
+#   AddressMode.CLAMP                -> bounded box boundary: out-of-range traces
+#                                       read the edge texel (ink piles up at the
+#                                       walls instead of wrapping like a torus).
+#   ReadMode.ELEMENT_TYPE            -> return the stored float value as-is (no
+#                                       integer->[0,1] normalization of texels).
+#   normalized_coords=True           -> sample in [0, 1) so CLAMP is well-defined
+#                                       and texel centers are (i + 0.5) / N.
+#
+#   SurfaceObject.from_array(arr)    -> binds the array for surf2Dread/surf2Dwrite.
+#                                       The x coordinate is in BYTES, so it is
+#                                       x * sizeof(elem): 8 for float2 velocity,
+#                                       4 for the float scalars, 16 for float4 dye.
+# ============================================================================
+
+
+def make_velocity_array():
+    """Allocate a WIDTH x HEIGHT `float2` velocity CUDA array (channel 0 = vx, channel 1 = vy)."""
+    return CUDAArray.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.FLOAT32,
+        num_channels=2,
+        is_surface_load_store=True,  # lets the array also be bound as a SurfaceObject
+    )
+
+
+def make_scalar_array():
+    """Allocate a WIDTH x HEIGHT single-channel `float` CUDA array (pressure / divergence)."""
+    return CUDAArray.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        is_surface_load_store=True,  # lets the array also be bound as a SurfaceObject
+    )
+
+
+def make_color_array():
+    """Allocate a WIDTH x HEIGHT `float4` RGBA dye CUDA array.
+
+    The dye carries a full color per cell (not just a density), so different
+    bursts inject different hues that advect and mix. Same LINEAR sampling and
+    surface-write machinery as the scalar fields -- only the channel count
+    (and the surf2Dwrite byte stride, sizeof(float4) = 16) differ.
+    """
+    return CUDAArray.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.FLOAT32,
+        num_channels=4,  # float4: one 32-bit float each for R, G, B, A
+        is_surface_load_store=True,  # lets the array also be bound as a SurfaceObject
+    )
+
+
+def make_texture(arr):
+    """Bind `arr` as a TextureObject configured for LINEAR + CLAMP + normalized.
+
+    One descriptor serves every read in this example: semi-Lagrangian advection
+    needs the bilinear interpolation, and the stencil reads (divergence, Jacobi,
+    gradient) sample exactly at texel centers so LINEAR returns the exact value.
+    """
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,  # out-of-range coordinates read the edge texel
+        filter_mode=FilterMode.LINEAR,  # hardware bilinear interpolation
+        read_mode=ReadMode.ELEMENT_TYPE,  # return the stored float unchanged
+        # Normalized coordinates put texel centers at (i + 0.5) / N regardless of
+        # resolution. (CLAMP itself would also work with unnormalized coordinates.)
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def seed_field(stream, kernels, config, vel_surf, dye_surf, prs_surf, seed_value):
+    """Reset the field: gentle ambient curl in velocity, zero pressure/dye.
+
+    Enqueues the `seed` kernel on `stream` without synchronizing. Pass
+    long-lived SurfaceObjects: `launch` is async, so one created inside a `with`
+    block that closes right after `launch` returns would be destroyed too early.
+    """
+    launch(
+        stream,
+        config,
+        kernels["seed"],
+        np.uint64(vel_surf.handle),  # bindless handles are passed as 64-bit integers
+        np.uint64(dye_surf.handle),
+        np.uint64(prs_surf.handle),
+        np.int32(WIDTH),
+        np.int32(HEIGHT),
+        np.float32(CURL_SEED),
+        np.uint32(seed_value),
+    )
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, config = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    #     The PBO is GPU memory owned by OpenGL. It's the bridge between the
+    #     two worlds: CUDA writes into it, OpenGL reads from it.
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the simulation fields ---
+    #     velocity (float2) and dye (float4 RGBA) ping-pong; pressure (float)
+    #     ping-pongs across Jacobi iterations; divergence (float) is a single
+    #     scratch target written once per frame.
+    vel_a = make_velocity_array()
+    vel_b = make_velocity_array()
+    prs_a = make_scalar_array()
+    prs_b = make_scalar_array()
+    div = make_scalar_array()
+    dye_a = make_color_array()
+    dye_b = make_color_array()
+
+    # --- Step 7: Pre-create every bindless handle ONCE ---
+    #     Creating texture/surface objects is comparatively expensive, and they
+    #     must outlive the async launches that reference them, so we build them
+    #     up front and keep them alive for the whole run.
+    #     API MAP: make_texture binds an array as a read-only TextureObject
+    #     (LINEAR + CLAMP + normalized; see the API MAP block above), while
+    #     SurfaceObject.from_array binds the SAME array for raw surf2Dwrite
+    #     writes -- the read/write halves of one ping-pong buffer.
+    vel_tex_a = make_texture(vel_a)
+    vel_tex_b = make_texture(vel_b)
+    vel_surf_a = SurfaceObject.from_array(vel_a)
+    vel_surf_b = SurfaceObject.from_array(vel_b)
+
+    prs_tex_a = make_texture(prs_a)
+    prs_tex_b = make_texture(prs_b)
+    prs_surf_a = SurfaceObject.from_array(prs_a)
+    prs_surf_b = SurfaceObject.from_array(prs_b)
+
+    div_tex = make_texture(div)
+    div_surf = SurfaceObject.from_array(div)
+
+    dye_tex_a = make_texture(dye_a)
+    dye_tex_b = make_texture(dye_b)
+    dye_surf_a = SurfaceObject.from_array(dye_a)
+    dye_surf_b = SurfaceObject.from_array(dye_b)
+
+    # --- Step 8: Seed the initial field (curl into vel_a, zero pressure/dye) ---
+    seed_field(stream, kernels, config, vel_surf_a, dye_surf_a, prs_surf_a, seed_value=0)
+    stream.sync()
+
+    # `vel` / `dye` track which ping-pong array currently holds the live state.
+    state = {"vel": "a", "dye": "a", "seed": 0, "next_burst": 0.0}
+
+    # Mouse state shared with the event handlers. Coordinates are in SIMULATION
+    # space (y = 0 at top); the framebuffer has y = 0 at the bottom, so we flip.
+    mouse = {"down": False, "x": 0.0, "y": 0.0, "dx": 0.0, "dy": 0.0}
+
+    def vel_pair():
+        # Read live velocity, write the other buffer; returns (read_tex, write_surf, next).
+        if state["vel"] == "a":
+            return vel_tex_a, vel_surf_b, "b"
+        return vel_tex_b, vel_surf_a, "a"
+
+    def vel_live_tex():
+        return vel_tex_a if state["vel"] == "a" else vel_tex_b
+
+    def vel_live_surf():
+        return vel_surf_a if state["vel"] == "a" else vel_surf_b
+
+    def dye_pair():
+        if state["dye"] == "a":
+            return dye_tex_a, dye_surf_b, "b"
+        return dye_tex_b, dye_surf_a, "a"
+
+    def dye_live_tex():
+        return dye_tex_a if state["dye"] == "a" else dye_tex_b
+
+    def dye_live_surf():
+        return dye_surf_a if state["dye"] == "a" else dye_surf_b
+
+    # --- Step 9: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+    clock = {"last": start_time}  # wall-clock time of the previous frame
+
+    def _window_to_sim(x, y):
+        # Window: y = 0 at bottom. Simulation: y = 0 at top. Flip vertically.
+        sx = float(x)
+        sy = float(HEIGHT - 1 - y)
+        return sx, sy
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+            return
+        if symbol == key.R:
+            state["seed"] += 1
+            seed_field(
+                stream,
+                kernels,
+                config,
+                vel_surf_a,
+                dye_surf_a,
+                prs_surf_a,
+                seed_value=state["seed"],
+            )
+            state["vel"] = "a"
+            state["dye"] = "a"
+            return
+
+    @window.event
+    def on_mouse_press(x, y, _button, _modifiers):
+        mouse["down"] = True
+        mouse["x"], mouse["y"] = _window_to_sim(x, y)
+        mouse["dx"] = 0.0
+        mouse["dy"] = 0.0
+
+    @window.event
+    def on_mouse_release(_x, _y, _button, _modifiers):
+        mouse["down"] = False
+        mouse["dx"] = 0.0
+        mouse["dy"] = 0.0
+
+    @window.event
+    def on_mouse_drag(x, y, dx, dy, _buttons, _modifiers):
+        # The mouse delta IS the injected velocity. Framebuffer dy is up-positive
+        # while simulation y is down-positive, so the sim-space delta is -dy.
+        mouse["down"] = True
+        mouse["x"], mouse["y"] = _window_to_sim(x, y)
+        mouse["dx"] = float(dx)
+        mouse["dy"] = float(-dy)
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+        now_t = time.monotonic()
+        elapsed = now_t - start_time
+
+        # Frame-rate independence: express this frame's real duration in units of
+        # a REF_FPS reference step. `step` scales the advection distance, and the
+        # per-step dissipations are raised to `step` so their per-SECOND rate is
+        # constant no matter how fast the loop runs. Clamp to absorb the first
+        # frame and any hitch without launching a giant (unstable-looking) step.
+        dt_real = now_t - clock["last"]
+        clock["last"] = now_t
+        step = min(max(dt_real * REF_FPS, 0.0), 3.0)
+        dt_adv = DT * step
+        vel_diss = VELOCITY_DISSIPATION**step
+        dye_diss = DYE_DISSIPATION**step
+
+        # (a) Advect velocity along itself (semi-Lagrangian, tex2D LINEAR).
+        vel_read, vel_write, vel_next = vel_pair()
+        launch(
+            stream,
+            config,
+            kernels["advect_vel"],
+            np.uint64(vel_read.handle),
+            np.uint64(vel_write.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.float32(dt_adv),
+            np.float32(vel_diss),
+        )
+        state["vel"] = vel_next
+
+        # (b) Splat mouse-drag velocity and colored dye into the live fields.
+        #     The injected color cycles through hues over time so dragging
+        #     paints a rainbow ribbon of ink.
+        inject = 1 if mouse["down"] else 0
+        mr, mg, mb = colorsys.hsv_to_rgb((elapsed * 0.15) % 1.0, 0.85, 1.0)
+        launch(
+            stream,
+            config,
+            kernels["splat"],
+            np.uint64(vel_live_surf().handle),
+            np.uint64(dye_live_surf().handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.float32(mouse["x"]),
+            np.float32(mouse["y"]),
+            np.float32(mouse["dx"] * SPLAT_FORCE),
+            np.float32(mouse["dy"] * SPLAT_FORCE),
+            np.float32(SPLAT_RADIUS),
+            np.float32(mr * SPLAT_DYE),
+            np.float32(mg * SPLAT_DYE),
+            np.float32(mb * SPLAT_DYE),
+            np.int32(inject),
+        )
+
+        # (b2) When the user is not dragging, periodically drop big blobs of a
+        #      random bright color with a random velocity impulse at random
+        #      spots -- the classic "ink in water" look. Reuses the same `splat`
+        #      kernel as the mouse, just with a color argument.
+        if AUTO_EMIT and not mouse["down"] and elapsed >= state["next_burst"]:
+            state["next_burst"] = elapsed + BURST_INTERVAL
+            for _ in range(BURSTS_PER_EVENT):
+                bx = random.uniform(0.12, 0.88) * WIDTH
+                by = random.uniform(0.12, 0.88) * HEIGHT
+                ang = random.uniform(0.0, 2.0 * math.pi)
+                bfx = math.cos(ang) * BURST_FORCE
+                bfy = math.sin(ang) * BURST_FORCE
+                br, bg, bb = colorsys.hsv_to_rgb(random.random(), 0.9, 1.0)
+                launch(
+                    stream,
+                    config,
+                    kernels["splat"],
+                    np.uint64(vel_live_surf().handle),
+                    np.uint64(dye_live_surf().handle),
+                    np.int32(WIDTH),
+                    np.int32(HEIGHT),
+                    np.float32(bx),
+                    np.float32(by),
+                    np.float32(bfx),
+                    np.float32(bfy),
+                    np.float32(BURST_RADIUS),
+                    np.float32(br * BURST_DYE),
+                    np.float32(bg * BURST_DYE),
+                    np.float32(bb * BURST_DYE),
+                    np.int32(1),
+                )
+
+        # (b3) Vorticity confinement: read the live velocity through its
+        #      TextureObject, compute curl + grad|curl|, and add a force that
+        #      pushes velocity back toward high-vorticity regions -- this is the
+        #      one extra kernel that sharpens the curling plumes. Like
+        #      advect_velocity, it reads neighbor velocities, so it MUST
+        #      ping-pong (read old buffer, write the other) -- aliasing a
+        #      texture read with a surface write of the same array in one launch
+        #      is undefined.
+        if VORTICITY > 0.0:
+            vort_read, vort_write, vort_next = vel_pair()
+            launch(
+                stream,
+                config,
+                kernels["vorticity"],
+                np.uint64(vort_read.handle),
+                np.uint64(vort_write.handle),
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.float32(dt_adv),
+                np.float32(VORTICITY),
+            )
+            state["vel"] = vort_next
+
+        # (c) Compute divergence of the live velocity field.
+        launch(
+            stream,
+            config,
+            kernels["divergence"],
+            np.uint64(vel_live_tex().handle),
+            np.uint64(div_surf.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+        )
+
+        # (d) Pressure solve: Jacobi-iterate lap(p) = div, ping-ponging pressure.
+        #     Start from a cleared pressure field (prs_a) each frame.
+        launch(
+            stream,
+            config,
+            kernels["jacobi"],
+            np.uint64(prs_tex_a.handle),  # ignored on the first pass via clear flag
+            np.uint64(div_tex.handle),
+            np.uint64(prs_surf_b.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.int32(1),  # clear: treat the previous pressure as zero
+        )
+        # After the clearing pass the result lives in prs_b. Continue iterating.
+        prs_cur = "b"
+        for _ in range(PRESSURE_ITERS - 1):
+            if prs_cur == "b":
+                read_tex, write_surf, prs_cur = prs_tex_b, prs_surf_a, "a"
+            else:
+                read_tex, write_surf, prs_cur = prs_tex_a, prs_surf_b, "b"
+            launch(
+                stream,
+                config,
+                kernels["jacobi"],
+                np.uint64(read_tex.handle),
+                np.uint64(div_tex.handle),
+                np.uint64(write_surf.handle),
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.int32(0),  # do not clear: read the previous pressure
+            )
+        # `prs_cur` now names the buffer holding the converged pressure.
+        prs_final_tex = prs_tex_a if prs_cur == "a" else prs_tex_b
+
+        # (e) Subtract pressure gradient from the live velocity (in-place).
+        launch(
+            stream,
+            config,
+            kernels["subtract"],
+            np.uint64(prs_final_tex.handle),
+            np.uint64(vel_live_surf().handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+        )
+
+        # (f) Advect the dye along the (now divergence-free) velocity field.
+        dye_read, dye_write, dye_next = dye_pair()
+        launch(
+            stream,
+            config,
+            kernels["advect_dye"],
+            np.uint64(dye_read.handle),
+            np.uint64(vel_live_tex().handle),
+            np.uint64(dye_write.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.float32(dt_adv),
+            np.float32(dye_diss),
+        )
+        state["dye"] = dye_next
+
+        # (g) Colorize the latest dye into the OpenGL PBO.
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                config,
+                kernels["colorize"],
+                np.uint64(dye_live_tex().handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (h) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (i) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # Reset the per-frame mouse delta so a held-still cursor stops pushing.
+        mouse["dx"] = 0.0
+        mouse["dy"] = 0.0
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            window.set_caption(
+                "cuda.core CUDAArray/Texture/Surface - Stable Fluids"
+                f" ({WIDTH}x{HEIGHT}, {fps:.0f} FPS,"
+                f" {PRESSURE_ITERS} pressure iters)"
+                " | TextureObject[LINEAR|CLAMP|norm|float2]"
+                " + SurfaceObject writes + GraphicsResource(PBO)"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Release everything we opened, in reverse order. Each of these is a
+        # context manager too, but pyglet owns the event loop here so we
+        # release explicitly to be deterministic about ordering.
+        resource.close()
+        dye_tex_a.close()
+        dye_tex_b.close()
+        dye_surf_a.close()
+        dye_surf_b.close()
+        div_tex.close()
+        div_surf.close()
+        prs_tex_a.close()
+        prs_tex_b.close()
+        prs_surf_a.close()
+        prs_surf_b.close()
+        vel_tex_a.close()
+        vel_tex_b.close()
+        vel_surf_a.close()
+        vel_surf_b.close()
+        dye_a.close()
+        dye_b.close()
+        div.close()
+        prs_a.close()
+        prs_b.close()
+        vel_a.close()
+        vel_b.close()
+        stream.close()
+
+    # Render as fast as the GPU allows; the per-step rates are scaled by real
+    # elapsed time (see REF_FPS) so the look is frame-rate independent.
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. The important things to know:
+#
+#   - KERNEL_SOURCE contains the nine CUDA C++ kernels of the Stable Fluids
+#     pipeline. Reads go through cudaTextureObject_t (LINEAR + CLAMP +
+#     normalized coords); writes go through cudaSurfaceObject_t with the x
+#     offset in BYTES. A small helper converts pixel coords to normalized
+#     texel-center coords.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a
+#     texture onto a rectangle covering the entire window. Nothing interesting.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// Sample a float2 (velocity) field at pixel center (px, py) with bilinear
+// filtering. CLAMP addressing keeps out-of-range traces at the border.
+__device__ __forceinline__
+float2 sample_vec(cudaTextureObject_t tex, float px, float py,
+                  int width, int height) {
+    float u = (px + 0.5f) / (float)width;
+    float v = (py + 0.5f) / (float)height;
+    return tex2D<float2>(tex, u, v);
+}
+
+// Sample a scalar (float) field at pixel center (px, py) with bilinear filtering.
+__device__ __forceinline__
+float sample_scalar(cudaTextureObject_t tex, float px, float py,
+                    int width, int height) {
+    float u = (px + 0.5f) / (float)width;
+    float v = (py + 0.5f) / (float)height;
+    return tex2D<float>(tex, u, v);
+}
+
+// Sample a float4 (RGBA dye) field at pixel center with bilinear filtering.
+__device__ __forceinline__
+float4 sample_color(cudaTextureObject_t tex, float px, float py,
+                    int width, int height) {
+    float u = (px + 0.5f) / (float)width;
+    float v = (py + 0.5f) / (float)height;
+    return tex2D<float4>(tex, u, v);
+}
+
+extern "C"
+__global__
+void seed_field(cudaSurfaceObject_t vel_surf,
+                cudaSurfaceObject_t dye_surf,
+                cudaSurfaceObject_t prs_surf,
+                int width, int height,
+                float curl, unsigned int seed) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Seed a gentle global rotation: velocity perpendicular to the radius from
+    // the center gives a curl, so even with no mouse input there is motion.
+    float cx = width * 0.5f;
+    float cy = height * 0.5f;
+    float rx = (x - cx) / cx;   // ~[-1, 1]
+    float ry = (y - cy) / cy;
+    float2 vel = make_float2(-ry * curl, rx * curl);
+
+    // A touch of deterministic noise so successive resets look a little
+    // different and to break perfect symmetry.
+    unsigned int h = (unsigned int)x * 374761393u +
+                     (unsigned int)y * 668265263u + seed * 2246822519u;
+    h = (h ^ (h >> 13)) * 1274126177u;
+    h = h ^ (h >> 16);
+    float noise = ((h & 0xffffu) / 65535.0f) - 0.5f;   // [-0.5, 0.5]
+    vel.x += noise * 0.2f;
+    vel.y += noise * 0.2f;
+
+    // Dye starts black; the colored bursts (or the mouse) paint the ink, so
+    // there is nothing to seed here beyond clearing to zero.
+    surf2Dwrite(vel, vel_surf, x * (int)sizeof(float2), y);
+    surf2Dwrite(make_float4(0.0f, 0.0f, 0.0f, 0.0f), dye_surf,
+                x * (int)sizeof(float4), y);
+    surf2Dwrite(0.0f, prs_surf, x * (int)sizeof(float), y);
+}
+
+// Inject mouse-drag velocity and dye into a soft radial brush around the
+// cursor. In-place read-modify-write: each thread owns its own cell, no race.
+extern "C"
+__global__
+void splat(cudaSurfaceObject_t vel_surf,
+           cudaSurfaceObject_t dye_surf,
+           int width, int height,
+           float mx, float my,
+           float fx, float fy,
+           float radius, float dr, float dg, float db,
+           int inject) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+    if (!inject) return;
+
+    float dx = (float)x - mx;
+    float dy = (float)y - my;
+    float d2 = dx * dx + dy * dy;
+    float falloff = expf(-d2 / (radius * radius));
+    if (falloff < 1e-3f) return;
+
+    float2 vel;
+    surf2Dread(&vel, vel_surf, x * (int)sizeof(float2), y);
+    vel.x += fx * falloff;
+    vel.y += fy * falloff;
+    surf2Dwrite(vel, vel_surf, x * (int)sizeof(float2), y);
+
+    // Additive colored ink. float4 surface element is 16 bytes.
+    float4 dye;
+    surf2Dread(&dye, dye_surf, x * (int)sizeof(float4), y);
+    dye.x += dr * falloff;
+    dye.y += dg * falloff;
+    dye.z += db * falloff;
+    dye.w = 1.0f;
+    surf2Dwrite(dye, dye_surf, x * (int)sizeof(float4), y);
+}
+
+// Semi-Lagrangian advection of the velocity field along itself.
+extern "C"
+__global__
+void advect_velocity(cudaTextureObject_t vel_tex,
+                     cudaSurfaceObject_t vel_out,
+                     int width, int height,
+                     float dt, float dissipation) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float2 v = sample_vec(vel_tex, (float)x, (float)y, width, height);
+    // Trace this cell's center backward along the velocity field.
+    float px = (float)x - dt * v.x;
+    float py = (float)y - dt * v.y;
+    float2 advected = sample_vec(vel_tex, px, py, width, height);
+    advected.x *= dissipation;
+    advected.y *= dissipation;
+    surf2Dwrite(advected, vel_out, x * (int)sizeof(float2), y);
+}
+
+// Vorticity confinement. Curl of a 2D velocity field is the scalar
+// w = dVy/dx - dVx/dy. Where |w| has a gradient we add a force that pushes
+// velocity along the swirl, reinjecting the small-scale rotation that
+// numerical diffusion smears away -- the result is crisper, longer-lived
+// curls. Reads neighbor velocities through the TextureObject and writes the
+// updated velocity to a SEPARATE ping-pong buffer (no read/write aliasing).
+__device__ __forceinline__
+float curl_at(cudaTextureObject_t vel_tex, float px, float py,
+              int width, int height) {
+    float2 l = sample_vec(vel_tex, px - 1.0f, py, width, height);
+    float2 r = sample_vec(vel_tex, px + 1.0f, py, width, height);
+    float2 d = sample_vec(vel_tex, px, py - 1.0f, width, height);
+    float2 u = sample_vec(vel_tex, px, py + 1.0f, width, height);
+    return 0.5f * ((r.y - l.y) - (u.x - d.x));
+}
+
+extern "C"
+__global__
+void vorticity_confinement(cudaTextureObject_t vel_tex,
+                           cudaSurfaceObject_t vel_out,
+                           int width, int height,
+                           float dt, float eps) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float fx = (float)x;
+    float fy = (float)y;
+
+    // Curl at this cell and at the 4 neighbors (for grad|curl|).
+    float w = curl_at(vel_tex, fx, fy, width, height);
+    float wl = curl_at(vel_tex, fx - 1.0f, fy, width, height);
+    float wr = curl_at(vel_tex, fx + 1.0f, fy, width, height);
+    float wd = curl_at(vel_tex, fx, fy - 1.0f, width, height);
+    float wu = curl_at(vel_tex, fx, fy + 1.0f, width, height);
+
+    // Gradient of |curl|, normalized to a unit direction N.
+    float gx = 0.5f * (fabsf(wr) - fabsf(wl));
+    float gy = 0.5f * (fabsf(wu) - fabsf(wd));
+    float len = sqrtf(gx * gx + gy * gy) + 1e-5f;
+    float nx = gx / len;
+    float ny = gy / len;
+
+    // Confinement force = eps * (N x w_hat). In 2D: (N_y * w, -N_x * w).
+    float2 v = sample_vec(vel_tex, fx, fy, width, height);
+    v.x += eps * dt * (ny * w);
+    v.y += eps * dt * (-nx * w);
+    surf2Dwrite(v, vel_out, x * (int)sizeof(float2), y);
+}
+
+// Divergence of the velocity field (central differences), written as a scalar.
+extern "C"
+__global__
+void divergence(cudaTextureObject_t vel_tex,
+                cudaSurfaceObject_t div_out,
+                int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float2 l = sample_vec(vel_tex, (float)x - 1.0f, (float)y, width, height);
+    float2 r = sample_vec(vel_tex, (float)x + 1.0f, (float)y, width, height);
+    float2 d = sample_vec(vel_tex, (float)x, (float)y - 1.0f, width, height);
+    float2 u = sample_vec(vel_tex, (float)x, (float)y + 1.0f, width, height);
+
+    float div = 0.5f * ((r.x - l.x) + (u.y - d.y));
+    surf2Dwrite(div, div_out, x * (int)sizeof(float), y);
+}
+
+// One Jacobi iteration of lap(p) = div. With unit grid spacing the update is
+// p = (p_left + p_right + p_down + p_up - div) / 4. When `clear` is set the
+// previous pressure is treated as zero so the first pass starts clean.
+extern "C"
+__global__
+void pressure_jacobi(cudaTextureObject_t prs_tex,
+                     cudaTextureObject_t div_tex,
+                     cudaSurfaceObject_t prs_out,
+                     int width, int height,
+                     int clear) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float pl = 0.0f, pr = 0.0f, pd = 0.0f, pu = 0.0f;
+    if (!clear) {
+        pl = sample_scalar(prs_tex, (float)x - 1.0f, (float)y, width, height);
+        pr = sample_scalar(prs_tex, (float)x + 1.0f, (float)y, width, height);
+        pd = sample_scalar(prs_tex, (float)x, (float)y - 1.0f, width, height);
+        pu = sample_scalar(prs_tex, (float)x, (float)y + 1.0f, width, height);
+    }
+    float div = sample_scalar(div_tex, (float)x, (float)y, width, height);
+    float p = (pl + pr + pd + pu - div) * 0.25f;
+    surf2Dwrite(p, prs_out, x * (int)sizeof(float), y);
+}
+
+// v <- v - grad(p): project the velocity onto its divergence-free part.
+extern "C"
+__global__
+void subtract_gradient(cudaTextureObject_t prs_tex,
+                       cudaSurfaceObject_t vel_surf,
+                       int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float pl = sample_scalar(prs_tex, (float)x - 1.0f, (float)y, width, height);
+    float pr = sample_scalar(prs_tex, (float)x + 1.0f, (float)y, width, height);
+    float pd = sample_scalar(prs_tex, (float)x, (float)y - 1.0f, width, height);
+    float pu = sample_scalar(prs_tex, (float)x, (float)y + 1.0f, width, height);
+
+    float2 v;
+    surf2Dread(&v, vel_surf, x * (int)sizeof(float2), y);
+    v.x -= 0.5f * (pr - pl);
+    v.y -= 0.5f * (pu - pd);
+    surf2Dwrite(v, vel_surf, x * (int)sizeof(float2), y);
+}
+
+// Semi-Lagrangian advection of the dye along the velocity field.
+extern "C"
+__global__
+void advect_dye(cudaTextureObject_t dye_tex,
+                cudaTextureObject_t vel_tex,
+                cudaSurfaceObject_t dye_out,
+                int width, int height,
+                float dt, float dissipation) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float2 v = sample_vec(vel_tex, (float)x, (float)y, width, height);
+    float px = (float)x - dt * v.x;
+    float py = (float)y - dt * v.y;
+    float4 d = sample_color(dye_tex, px, py, width, height);
+    d.x *= dissipation;
+    d.y *= dissipation;
+    d.z *= dissipation;
+    d.w *= dissipation;
+    surf2Dwrite(d, dye_out, x * (int)sizeof(float4), y);
+}
+
+// Tonemap the accumulated float4 dye color into the PBO. The ink color is
+// whatever the bursts/mouse injected and advection mixed; we apply a filmic
+// 1 - exp(-c) curve so dense ink stays vivid without harshly clipping.
+extern "C"
+__global__
+void colorize(cudaTextureObject_t dye_tex,
+              unsigned char* output,
+              int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float4 c = sample_color(dye_tex, (float)x, (float)y, width, height);
+    const float gain = 1.3f;
+    float r = 1.0f - expf(-fmaxf(c.x, 0.0f) * gain);
+    float g = 1.0f - expf(-fmaxf(c.y, 0.0f) * gain);
+    float b = 1.0f - expf(-fmaxf(c.z, 0.0f) * gain);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(r * 255.0f);
+    output[idx + 1] = (unsigned char)(g * 255.0f);
+    output[idx + 2] = (unsigned char)(b * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders: the vertex stage passes `position`/`texcoord` straight through and
+# the fragment stage samples `tex` at that coordinate. Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_image_show.py b/cuda_core/examples/gl_interop_image_show.py
new file mode 100644
index 00000000000..7678d457b10
--- /dev/null
+++ b/cuda_core/examples/gl_interop_image_show.py
@@ -0,0 +1,456 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# Minimal "Hello World" for the cuda.core texture/surface stack.
+#
+# Allocates a small `CUDAArray`, fills it with a procedural image once, binds it
+# as a `TextureObject`, and uses a single CUDA kernel to sample that texture
+# at every screen pixel (with a scale + rotation transform) and write the
+# result into an OpenGL PBO for display.
+#
+# Nothing else: no `SurfaceObject`, no ping-pong, no simulation, no mipmaps.
+# If you have never touched the new APIs before, open this file first.
+#
+# ################################################################################
+#
+# What this example teaches
+# =========================
+# - Allocate a `CUDAArray` and upload data into it with `CUDAArray.copy_from`.
+# - Build a `TextureObject` from a `ResourceDescriptor` + `TextureDescriptor`.
+# - The visual difference between `FilterMode.POINT` and `FilterMode.LINEAR`
+#   (press F to toggle live).
+# - That filter mode is baked into the `TextureDescriptor` at creation time,
+#   so changing it requires destroying and rebuilding the `TextureObject`.
+#
+# How it works
+# ============
+#   Startup (once):
+#     +-------------------+   copy_from   +-----------+
+#     | host numpy image  | ------------> | CUDAArray |  (UINT8 RGBA, 64x64)
+#     +-------------------+               +----+------+
+#                                              |
+#                                              v
+#                                       +-------------+
+#                                       | TextureObj  |  (filter mode = POINT)
+#                                       +-------------+
+#
+#   Each frame:
+#     - kernel `sample_image` reads from the TextureObject at a transformed
+#       (u, v) per screen pixel and writes RGBA bytes to the GL PBO.
+#     - OpenGL copies the PBO into a screen texture and draws it.
+#
+# What you should see
+# ===================
+# A 64x64 procedural test pattern (checkerboard + colored gradient stripes +
+# diagonal lines) magnified to fill the window. Press F to switch between
+# POINT (blocky) and LINEAR (smooth) sampling; the difference is immediately
+# visible. Press R to start/stop a slow rotation. Esc to quit.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+WIDTH = 640  # window width in pixels; also the x extent covered by the launch grid
+HEIGHT = 480  # window height in pixels; also the y extent covered by the launch grid
+IMAGE_SIZE = 64  # the source CUDAArray is IMAGE_SIZE x IMAGE_SIZE RGBA8
+
+
+# ============================= Helper functions =============================
+
+
+def make_test_image(size):
+    """Build a (size, size, 4) uint8 RGBA test pattern.
+
+    Designed so the filter-mode difference is obvious: a hard-edged
+    checkerboard and two diagonal red hairlines (POINT preserves them; LINEAR
+    softens them) plus a color-gradient strip over the left third of the
+    image (LINEAR blends smoothly between neighboring rows).
+
+    ``size`` is the edge length in texels. Checker cells are ``size // 8``
+    texels wide, clamped to at least 1 so sizes below 8 no longer divide by
+    zero. Built with whole-array NumPy operations instead of per-pixel loops.
+    """
+    img = np.zeros((size, size, 4), dtype=np.uint8)
+    # Checkerboard: a texel is white when its x and y cell indices sum to odd.
+    cell = np.arange(size) // max(size // 8, 1)
+    img[((cell[:, None] + cell[None, :]) & 1).astype(bool), :3] = 255
+    # Gradient strip: red falls 255 -> 0 and green rises 0 -> 255 with row index.
+    strip = size // 3
+    img[:, :strip, 0] = np.linspace(255, 0, size, dtype=np.uint8)[:, None]
+    img[:, :strip, 1] = np.linspace(0, 255, size, dtype=np.uint8)[:, None]
+    img[:, :strip, 2] = 128
+    # Hairlines: the main diagonal and a copy shifted 4 texels along x.
+    diag = np.arange(size)
+    img[diag, diag] = (255, 0, 0, 255)
+    img[diag[: max(size - 4, 0)], diag[4:]] = (255, 0, 0, 255)
+    img[:, :, 3] = 255  # opaque
+    return img
+
+
+def setup_cuda():
+    """Compile the kernel and return (device, stream, kernel, launch_config)."""
+    device = Device(0)
+    device.set_current()
+    stream = device.create_stream()
+
+    # Build KERNEL_SOURCE for this device's architecture and fetch its
+    # `sample_image` entry point.
+    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+    module = Program(KERNEL_SOURCE, code_type="c++", options=options).compile(
+        "cubin", name_expressions=("sample_image",)
+    )
+    kernel = module.get_kernel("sample_image")
+
+    # 16x16 thread blocks; ceil-divide so the grid covers WIDTH x HEIGHT.
+    threads_x, threads_y = 16, 16
+    blocks = (-(-WIDTH // threads_x), -(-HEIGHT // threads_y), 1)
+    config = LaunchConfig(grid=blocks, block=(threads_x, threads_y, 1))
+    return device, stream, kernel, config
+
+
+def create_window():
+    """Open a pyglet window. Returns (window, gl_module, pyglet_module).
+
+    pyglet is imported here rather than at module level, so a missing install
+    is reported with an install hint on stderr and exit status 1 instead of an
+    ImportError traceback.
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as gl_module
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    # WIDTH x HEIGHT window with vsync turned off.
+    title = "cuda.core CUDAArray + TextureObject - Image Show"
+    window = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
+    return window, gl_module, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Standard pyglet boilerplate: shader, fullscreen quad, screen texture.
+
+    Args:
+        gl: Module exposing the raw OpenGL functions and constants used below
+            (presumably ``create_window``'s ``gl_module`` -- confirm at caller).
+        width: Width of the screen texture, in pixels.
+        height: Height of the screen texture, in pixels.
+
+    Returns:
+        ``(shader_program, vao_id, texture_id)``. The vertex buffer id is not
+        returned.
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    shader_prog = ShaderProgram(
+        Shader(VERTEX_SHADER_SOURCE, "vertex"),
+        Shader(FRAGMENT_SHADER_SOURCE, "fragment"),
+    )
+
+    # Two triangles spanning [-1, 1] in x and y. Every vertex is four floats:
+    # x, y (fed to `position`) followed by u, v (fed to `texcoord`).
+    # fmt: off
+    quad = np.array(
+        [
+            -1, -1, 0, 0,
+             1, -1, 1, 0,
+             1,  1, 1, 1,
+            -1, -1, 0, 0,
+             1,  1, 1, 1,
+            -1,  1, 0, 1,
+        ],
+        dtype=np.float32,
+    )
+    # fmt: on
+
+    # The VAO is bound first so that the attribute setup below is recorded in it.
+    vao_id = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao_id))
+    gl.glBindVertexArray(vao_id.value)
+
+    # Upload the quad once into a vertex buffer (GL_STATIC_DRAW usage hint).
+    vbo_id = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo_id))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo_id.value)
+    quad_ptr = quad.ctypes.data_as(ctypes.c_void_p)
+    gl.glBufferData(gl.GL_ARRAY_BUFFER, quad.nbytes, quad_ptr, gl.GL_STATIC_DRAW)
+
+    # Interleaved layout: 4 float32 values = 16 bytes per vertex, with the
+    # 2-float position at byte offset 0 and the 2-float texcoord at offset 8.
+    stride = 4 * 4
+    for attr_name, byte_offset in ((b"position", 0), (b"texcoord", 8)):
+        loc = gl.glGetAttribLocation(shader_prog.id, attr_name)
+        gl.glEnableVertexAttribArray(loc)
+        gl.glVertexAttribPointer(
+            loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(byte_offset)
+        )
+    gl.glBindVertexArray(0)
+
+    # Screen texture: RGBA8 storage of width x height with no initial data
+    # (the final `None`), sampled with NEAREST filtering in both directions.
+    tex_id = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex_id))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id.value)
+    for filter_param in (gl.GL_TEXTURE_MIN_FILTER, gl.GL_TEXTURE_MAG_FILTER):
+        gl.glTexParameteri(gl.GL_TEXTURE_2D, filter_param, gl.GL_NEAREST)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,  # mip level
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,  # border
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+    return shader_prog, vao_id.value, tex_id.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create the GL PBO that CUDA writes RGBA pixels into each frame."""
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):  # draw tex_id using the quad stored in vao_id
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)  # leave no VAO or program bound afterwards
+    gl.glUseProgram(0)
+
+
+def make_texture(arr, filter_mode):
+    """Build a `TextureObject` for `arr` with the given FilterMode.
+
+    Filter mode is baked into the descriptor at creation; to switch modes
+    we close this object and call this helper again.
+    """
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=filter_mode,
+        # UINT8 source + NORMALIZED_FLOAT means tex2D<float4> returns each
+        # channel as a float in [0, 1] -- handy for the colorize math below.
+        read_mode=ReadMode.NORMALIZED_FLOAT,
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
+    dev, stream, kernel, config = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources (shader, fullscreen quad, screen tex) ---
+    shader_prog, quad_vao, screen_tex = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the PBO that CUDA will write into ---
+    pbo_id = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 5: Allocate the source `CUDAArray` and upload the test pattern ---
+    arr = CUDAArray.from_descriptor(
+        shape=(IMAGE_SIZE, IMAGE_SIZE),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+    )
+    host_image = make_test_image(IMAGE_SIZE)
+    arr.copy_from(np.ascontiguousarray(host_image), stream=stream)
+    stream.sync()
+
+    # --- Step 6: Bind the CUDAArray as a TextureObject (initially POINT) ---
+    state = {"filter": FilterMode.POINT, "rotate": False, "angle": 0.0}
+    tex = make_texture(arr, state["filter"])
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        nonlocal tex
+        if symbol == key.ESCAPE:
+            window.close()
+        elif symbol == key.F:
+            # Filter mode is baked at TextureObject creation time. Swapping
+            # it means closing the old one and building a new one.
+            state["filter"] = FilterMode.LINEAR if state["filter"] == FilterMode.POINT else FilterMode.POINT
+            tex.close()
+            tex = make_texture(arr, state["filter"])
+        elif symbol == key.R:
+            state["rotate"] = not state["rotate"]
+
+    # --- Step 7: Render loop ---
+    start = time.monotonic()
+    last_t = start
+    frame_count = 0
+    fps_time = start
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time, last_t
+        now = time.monotonic()
+        if state["rotate"]:
+            state["angle"] += (now - last_t) * 0.5  # rad/sec
+        last_t = now
+
+        window.clear()
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                config,
+                kernel,
+                np.uint64(tex.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.float32(state["angle"]),
+            )
+        copy_pbo_to_texture(gl, pbo_id, screen_tex, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, screen_tex)
+
+        frame_count += 1
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            window.set_caption(
+                f"cuda.core CUDAArray + TextureObject - Image Show "
+                f"(filter={state['filter'].name}, "
+                f"rotate={'on' if state['rotate'] else 'off'}, "
+                f"{fps:.0f} FPS)"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        tex.close()
+        arr.close()
+        resource.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ============================== GPU code (kernel) ============================
+
+KERNEL_SOURCE = r"""
+extern "C"
+__global__
+void sample_image(cudaTextureObject_t tex,
+                  unsigned char* output,
+                  int width, int height,
+                  float angle) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Center the screen pixel around (0, 0) in [-aspect, aspect] x [-1, 1].
+    float aspect = (float)width / (float)height;
+    float sx = ((float)x / (float)width  - 0.5f) * 2.0f * aspect;
+    float sy = ((float)y / (float)height - 0.5f) * 2.0f;
+
+    // Inverse-rotate the screen point: rotating the image by +angle means
+    // each output pixel reads from the source rotated by -angle.
+    float c = cosf(-angle), s = sinf(-angle);
+    float rx = c * sx - s * sy;
+    float ry = s * sx + c * sy;
+
+    // Map rotated screen point to the [0, 1] x [0, 1] texture domain so the
+    // image (drawn centered, fitting ~75% of the window height) lands on it.
+    const float scale = 0.75f;
+    float u = (rx / (2.0f * scale)) + 0.5f;
+    float v = (ry / (2.0f * scale)) + 0.5f;
+
+    // AddressMode.CLAMP means out-of-range u/v sample the edge texel.
+    float4 col = tex2D<float4>(tex, u, v);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(col.x * 255.0f);
+    output[idx + 1] = (unsigned char)(col.y * 255.0f);
+    output[idx + 2] = (unsigned char)(col.z * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""  # pass-through: forwards position and texcoord to the fragment stage
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""  # outputs the texel sampled from `tex` at v_texcoord, unmodified
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_jfa_voronoi.py b/cuda_core/examples/gl_interop_jfa_voronoi.py
new file mode 100644
index 00000000000..bd9bead75f4
--- /dev/null
+++ b/cuda_core/examples/gl_interop_jfa_voronoi.py
@@ -0,0 +1,940 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop. A Voronoi diagram
+# is computed every frame with the Jump Flood Algorithm (JFA): a float2 "nearest
+# seed" map is ping-ponged between two CUDA arrays across log2(N) passes. Each
+# pass reads the previous map through a POINT-filtered TextureObject (exact texel
+# reads -- no interpolation) and writes the refined map through a SurfaceObject.
+# The final nearest-seed map is colorized straight into an OpenGL PBO as neon
+# Voronoi cells or glowing metaballs. Seeds drift continuously so it animates.
+# Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to allocate a CUDAArray with `is_surface_load_store=True` so the same
+#   memory can be bound as both a TextureObject (for sampled reads) and a
+#   SurfaceObject (for typed writes).
+# - How to use FilterMode.POINT + AddressMode.BORDER + border_color +
+#   non-normalized coordinates to get EXACT texel reads with a clean
+#   "off-grid = no seed" sentinel. JFA fundamentally requires reading the
+#   precise value stored at an integer neighbor offset -- bilinear interpolation
+#   between two different seed coordinates would be meaningless. This is the
+#   deliberate inverse of the reaction-diffusion example's LINEAR/WRAP/normalized
+#   choice.
+#   API MAP: FilterMode.POINT -> exact texel reads (JFA needs no interpolation);
+#   AddressMode.BORDER + border_color -> off-grid neighbor fetches return a
+#   "no seed" sentinel instead of CLAMP-replicating an edge seed.
+# - How varying the read offset (the JFA "step") each pass, combined with
+#   ping-pong surface writes, propagates seed information across the whole image
+#   in O(log N) passes instead of O(N).
+# - How to compose CUDAArray/TextureObject/SurfaceObject with GraphicsResource so
+#   the entire pipeline never leaves the GPU.
+#
+# How it works
+# ============
+# The Jump Flood Algorithm computes, for every pixel, the coordinate of its
+# nearest seed. We store that coordinate in a `float2` map (channel 0 = seed x,
+# channel 1 = seed y), using the sentinel (-1, -1) for "no seed known yet".
+#
+#   1. seed_clear   -- fill the whole map with the sentinel.
+#   2. seed_splat   -- for each seed, write its own (x, y) into the cell it
+#                      occupies. One tiny 1-thread launch per seed (seeds live
+#                      in a host numpy array and are passed as scalar params;
+#                      see "Why splat seeds as scalars" below).
+#   3. jfa_step     -- the heart of the algorithm. With the current step size s
+#                      (s = K, K/2, ..., 1), every pixel examines itself and its
+#                      8 neighbors at offset +/- s. Among all non-sentinel seed
+#                      coordinates found, it keeps the one closest to this pixel
+#                      and writes it out. Run once per step size, ping-ponging
+#                      the two arrays each pass.
+#   4. colorize     -- read the final nearest-seed map and write RGBA bytes
+#                      into the OpenGL PBO.
+#
+#   PING-PONG over JFA passes (two arrays, swap each pass)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +--------------+  tex2D<float2>   +--------------+
+#   |   arr_read   | ---------------> |              |
+#   | nearest-seed |  (POINT, exact   |  jfa_step    |
+#   |     map      |   texel reads at |   (step s)   |
+#   +--------------+   +/- step)      |              |
+#                                     |              |
+#   +--------------+  surf2Dwrite     |              |
+#   |   arr_write  | <--------------- |              |
+#   | nearest-seed |                  +--------------+
+#   |     map      |
+#   +--------------+
+#       (swap, halve step)
+#
+# The step schedule starts at K = next power of two >= max(W, H) / 2 and halves
+# down to 1, giving floor(log2(K)) + 1 passes. Because we ping-pong every pass,
+# the final result lands in whichever array was written last; we track that
+# explicitly (see the loop in on_draw) rather than assuming it is a fixed array.
+# The full JFA is re-run from scratch every frame because the seeds move.
+#
+# Why POINT + BORDER + border_color + non-normalized coords?
+# -----------------------------------------------------------
+# JFA reads the exact seed coordinate stored at a specific integer neighbor.
+# LINEAR filtering would blend two stored coordinates into a meaningless
+# average, so we use FilterMode.POINT. For the addressing mode we use BORDER
+# with an explicit border_color equal to the map's "no seed" sentinel
+# (-1, -1). The earlier version used CLAMP, but CLAMP makes an off-edge
+# neighbor lookup silently return the *edge* texel's real seed coordinate; that
+# can make a border pixel pick a seed that is not actually its nearest one.
+# BORDER instead returns the sentinel for any out-of-range fetch, which the
+# kernel ignores -- the correct "there is no neighbor here" answer. (WRAP and
+# MIRROR are the only address modes that require normalized coordinates; BORDER
+# and CLAMP work with non-normalized coords, so we keep the integer-style
+# sampling.) With non-normalized coordinates a texel at integer (nx, ny) is read
+# at `tex2D<float2>(tex, nx + 0.5f, ny + 0.5f)` -- the +0.5 lands on the texel
+# center. This is intentionally the opposite of the LINEAR/WRAP/normalized
+# choice used by the reaction-diffusion example.
+#
+# Why splat seeds as scalars (no device buffer)?
+# ----------------------------------------------
+# Seeds live in a host numpy array and drift via sin/cos on the CPU each frame.
+# Rather than allocating a device buffer, we pass each seed's position to a tiny
+# 1-thread `seed_splat` kernel as float scalars. With only tens of seeds this is
+# a handful of trivial launches per frame. Note the seed *list* is only needed
+# for splatting: colorize and the cell-border test read seed coordinates back
+# out of the JFA map, never from the host list.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. For a
+# `float2` surface that means `x * sizeof(float2)` = `x * 8`. Getting this
+# wrong silently corrupts every other column.
+#
+# What you should see
+# ===================
+# A window of animated, drifting Voronoi cells (smooth vivid per-cell neon
+# colors with glowing seams) or shimmering metaball-style blobs.
+# Controls: M toggles between the two modes, +/- changes the seed count, R
+# reseeds, and Escape exits. The window title shows the mode, seed count, and
+# FPS.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 512  # window width and nearest-seed map width, in pixels
+HEIGHT = 512  # window height and nearest-seed map height, in pixels
+MAX_SEEDS = 64  # upper bound on the seed count (host array is sized for this)
+DEFAULT_SEEDS = 16  # seed count used at startup
+MIN_SEEDS = 2  # lower bound applied when the seed count is decreased
+
+# Visual modes for the colorize kernel. The integer value is passed to the
+# kernel; the label is shown in the window caption.
+MODE_VORONOI = 0
+MODE_METABALL = 1
+MODE_LABELS = {MODE_VORONOI: "voronoi", MODE_METABALL: "metaball"}
+
+
+def jfa_steps(width, height):
+    """Return the JFA step schedule: K, K/2, ..., 1.
+
+    K is the next power of two >= max(width, height) / 2. The number of passes
+    is floor(log2(K)) + 1.
+    """
+    longest = max(width, height)
+    step = 1
+    while step < longest // 2:
+        step *= 2
+    steps = []
+    while step >= 1:
+        steps.append(step)
+        step //= 2
+    return steps
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Compile the CUDA kernels and return (device, stream, kernels, configs)."""
+    dev = Device(0)
+    dev.set_current()
+
+    # SurfaceObject requires surface load/store, which has existed since SM 2.0,
+    # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Compile as C++ so the templated tex2D<float2> overload resolves.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("seed_clear", "seed_splat", "jfa_step", "colorize"),
+    )
+
+    kernels = {
+        "seed_clear": mod.get_kernel("seed_clear"),
+        "seed_splat": mod.get_kernel("seed_splat"),
+        "jfa_step": mod.get_kernel("jfa_step"),
+        "colorize": mod.get_kernel("colorize"),
+    }
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    grid_config = LaunchConfig(grid=grid, block=block)
+    # seed_clear, jfa_step, and colorize are pixel-parallel over a WIDTH x HEIGHT
+    # grid and can share this config. seed_splat is a single 1-thread launch.
+    point_config = LaunchConfig(grid=(1, 1, 1), block=(1, 1, 1))
+    configs = {
+        "seed_clear": grid_config,
+        "jfa_step": grid_config,
+        "colorize": grid_config,
+        "seed_splat": point_config,
+    }
+
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet)."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core CUDAArray/Texture/Surface - JFA Voronoi",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    This sets up a shader program, a fullscreen quad, and an empty texture.
+    None of this is CUDA-specific -- it's standard OpenGL boilerplate for
+    rendering a textured quad.
+
+    Returns (shader_program, vertex_array_id, texture_id). The shader_program
+    is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Shader program -- just passes texture coordinates through
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window)
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1,  # bottom-left
+            -1,
+            0,
+            0,
+            1,  # bottom-right
+            -1,
+            1,
+            0,
+            1,  # top-right
+            1,
+            1,
+            1,
+            -1,  # bottom-left (second triangle)
+            -1,
+            0,
+            0,
+            1,  # top-right
+            1,
+            1,
+            1,
+            -1,  # top-left
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)  # position: 2 floats at byte offset 0
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)  # texcoord: 2 floats at byte offset 8
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (will be filled each frame from the PBO)
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no pixel data supplied at creation time
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a Pixel Buffer Object (PBO) -- the bridge between CUDA and OpenGL.
+
+    A PBO is a GPU-side buffer that OpenGL can read from when uploading pixels
+    to a texture. By registering this same buffer with CUDA, the CUDA kernel
+    can write directly into it.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # RGBA, 1 byte per channel
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # None = read from the currently bound PBO, not from CPU
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw the texture to the screen using the fullscreen quad."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)  # leave no VAO or program bound afterwards
+    gl.glUseProgram(0)
+
+
+def make_state_arrays():
+    """Allocate the two `float2` ping-pong arrays that hold the nearest-seed map."""
+    arr_a = CUDAArray.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.FLOAT32,
+        num_channels=2,
+        is_surface_load_store=True,
+    )
+    arr_b = CUDAArray.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.FLOAT32,
+        num_channels=2,
+        is_surface_load_store=True,
+    )
+    return arr_a, arr_b
+
+
+def make_texture(arr):
+    """Bind `arr` as a TextureObject configured for POINT + BORDER + non-normalized.
+
+    API MAP:
+      FilterMode.POINT            -> exact texel reads (JFA needs no interpolation)
+      AddressMode.BORDER          -> off-grid neighbor fetches return border_color
+      border_color (sentinel)     -> a "no seed" value the kernel ignores, instead
+                                     of CLAMP-replicating a real edge seed
+
+    JFA needs exact texel reads at integer neighbor offsets, so we use POINT
+    filtering (no interpolation). We address with BORDER + an explicit
+    border_color set to the same "no seed" sentinel as the map's empty cells
+    (x = -1). When a JFA neighbor lookup lands off the grid, the texture unit
+    returns that sentinel and the kernel ignores it. This is strictly more
+    correct than CLAMP: with CLAMP an off-edge fetch silently replicates the
+    edge texel's seed, which can pull a border pixel toward a seed that is not
+    actually its nearest one. BORDER turns those out-of-range fetches into a
+    clean "no candidate".
+
+    Note on coordinates: BORDER addressing is valid with non-normalized
+    coordinates (only WRAP/MIRROR require normalized coords), so we keep the
+    integer-style `(nx + 0.5)` sampling used throughout the JFA. The border
+    sentinel is a 4-tuple because the descriptor always carries four channels;
+    a float2 read consumes channels 0-1, so (-1, -1) lands in (.x, .y) and the
+    trailing (0, 0) is unused.
+    """
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.BORDER,
+        filter_mode=FilterMode.POINT,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=False,
+        border_color=(-1.0, -1.0, 0.0, 0.0),
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def make_seeds(count):
+    """Create `count` drifting seeds.
+
+    Each seed has a base position, an angular speed, and a radius. The instant
+    position is recomputed every frame from these via sin/cos. Returns a dict of
+    numpy arrays sized for MAX_SEEDS (only the first `count` are used).
+    """
+    rng = np.random.default_rng()
+    return {
+        "base_x": rng.uniform(0.2, 0.8, MAX_SEEDS).astype(np.float32) * WIDTH,
+        "base_y": rng.uniform(0.2, 0.8, MAX_SEEDS).astype(np.float32) * HEIGHT,
+        "radius": rng.uniform(0.05, 0.25, MAX_SEEDS).astype(np.float32) * min(WIDTH, HEIGHT),
+        "phase": rng.uniform(0.0, 2.0 * math.pi, MAX_SEEDS).astype(np.float32),
+        "speed": rng.uniform(0.3, 1.2, MAX_SEEDS).astype(np.float32),
+        "count": count,
+    }
+
+
+def seed_positions(seeds, t):
+    """Return (xs, ys) instant positions for the active seeds at time `t`.
+
+    Seeds drift along small circles via sin/cos so the Voronoi diagram animates
+    smoothly. Positions are clamped to the interior of the image.
+    """
+    n = seeds["count"]
+    ang = seeds["phase"][:n] + seeds["speed"][:n] * t
+    xs = seeds["base_x"][:n] + seeds["radius"][:n] * np.cos(ang)
+    ys = seeds["base_y"][:n] + seeds["radius"][:n] * np.sin(ang)
+    xs = np.clip(xs, 0.0, WIDTH - 1.0).astype(np.float32)
+    ys = np.clip(ys, 0.0, HEIGHT - 1.0).astype(np.float32)
+    return xs, ys
+
+
+def run_jfa(stream, kernels, configs, seeds, t, tex_a, tex_b, surf_a, surf_b):
+    """Run a full JFA pass for the current seed positions.
+
+    Clears arr_a (via surf_a) to the sentinel, splats each seed into arr_a, then
+    ping-pongs the step loop between (tex_a/surf_a) and (tex_b/surf_b).
+
+    Returns the TextureObject bound to the array that was written last, which
+    holds the final nearest-seed map for colorize.
+    """
+    # 1. Clear arr_a to the sentinel (-1, -1).
+    launch(
+        stream,
+        configs["seed_clear"],
+        kernels["seed_clear"],
+        np.uint64(surf_a.handle),
+        np.int32(WIDTH),
+        np.int32(HEIGHT),
+    )
+
+    # 2. Splat each seed's own coordinate into arr_a (one 1-thread launch each).
+    xs, ys = seed_positions(seeds, t)
+    for i in range(seeds["count"]):
+        launch(
+            stream,
+            configs["seed_splat"],
+            kernels["seed_splat"],
+            np.uint64(surf_a.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.float32(xs[i]),
+            np.float32(ys[i]),
+        )
+
+    # 3. Ping-pong the JFA step loop. Start reading arr_a / writing arr_b.
+    read_tex, write_surf = tex_a, surf_b
+    other_tex, other_surf = tex_b, surf_a
+    final_tex = tex_a  # if the loop body never runs, arr_a holds the result
+    for step in jfa_steps(WIDTH, HEIGHT):
+        launch(
+            stream,
+            configs["jfa_step"],
+            kernels["jfa_step"],
+            np.uint64(read_tex.handle),
+            np.uint64(write_surf.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.int32(step),
+        )
+        # The array we just wrote is now the current map; swap for next pass.
+        final_tex = tex_b if write_surf is surf_b else tex_a
+        read_tex, other_tex = other_tex, read_tex
+        write_surf, other_surf = other_surf, write_surf
+    return final_tex
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the two ping-pong nearest-seed map Arrays ---
+    #     Both are `float2` (channel 0 = seed x, channel 1 = seed y) with
+    #     is_surface_load_store=True so they can be bound as SurfaceObjects.
+    arr_a, arr_b = make_state_arrays()
+
+    # --- Step 7: Pre-create the four bindless handles (once, kept alive) ---
+    tex_a = make_texture(arr_a)
+    tex_b = make_texture(arr_b)
+    surf_a = SurfaceObject.from_array(arr_a)
+    surf_b = SurfaceObject.from_array(arr_b)
+
+    # --- Step 8: Initialize seeds and view state ---
+    state = {"mode": MODE_VORONOI, "seeds": make_seeds(DEFAULT_SEEDS)}
+
+    # --- Step 9: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+            return
+        if symbol == key.M:
+            state["mode"] = MODE_METABALL if state["mode"] == MODE_VORONOI else MODE_VORONOI
+            return
+        if symbol == key.R:
+            state["seeds"] = make_seeds(state["seeds"]["count"])
+            return
+        if symbol in (key.PLUS, key.EQUAL, key.NUM_ADD):
+            new_count = min(MAX_SEEDS, state["seeds"]["count"] + 1)
+            if new_count != state["seeds"]["count"]:
+                state["seeds"] = make_seeds(new_count)
+            return
+        if symbol in (key.MINUS, key.NUM_SUBTRACT):
+            new_count = max(MIN_SEEDS, state["seeds"]["count"] - 1)
+            if new_count != state["seeds"]["count"]:
+                state["seeds"] = make_seeds(new_count)
+            return
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+        t = time.monotonic() - start_time
+
+        # (a) Run the full Jump Flood Algorithm for the current seed positions.
+        #     final_tex is the TextureObject over the array written last.
+        final_tex = run_jfa(stream, kernels, configs, state["seeds"], t, tex_a, tex_b, surf_a, surf_b)
+
+        # (b) Colorize the nearest-seed map into the OpenGL PBO.
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["colorize"],
+                kernels["colorize"],
+                np.uint64(final_tex.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.int32(state["mode"]),
+                np.float32(t),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (c) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (d) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            label = MODE_LABELS[state["mode"]]
+            window.set_caption(
+                "cuda.core JFA Voronoi"
+                " | TextureObject[POINT|BORDER|border_color] float2 + SurfaceObject"
+                f" | mode={label} | {state['seeds']['count']} seeds"
+                f" | {WIDTH}x{HEIGHT} | {fps:.0f} FPS"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Release everything we opened, in reverse order.
+        resource.close()
+        tex_a.close()
+        tex_b.close()
+        surf_a.close()
+        surf_b.close()
+        arr_a.close()
+        arr_b.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. KERNEL_SOURCE contains four CUDA C++
+# kernels, each declared `extern "C"`:
+#
+#   * seed_clear -- fills the map with the sentinel (-1, -1) via surface writes.
+#   * seed_splat -- single-thread kernel (it never reads a thread index): writes
+#                   one seed's own coordinate into the cell it occupies.
+#   * jfa_step   -- reads the previous map via a POINT-filtered, BORDER-addressed
+#                   TextureObject at +/- step offsets and writes the refined
+#                   nearest-seed map via a SurfaceObject. Off-grid fetches return
+#                   the sentinel border_color. Coordinates are non-normalized.
+#   * colorize   -- reads the final nearest-seed map and writes RGBA bytes into
+#                   the OpenGL PBO, either as smooth neon Voronoi cells with
+#                   glowing borders (mode 0) or glowing metaballs (otherwise).
+#
+# VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are plain GLSL for a textured quad.
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// The nearest-seed map is a float2 per texel: (.x, .y) = coordinate of the
+// nearest known seed, or the sentinel (-1, -1) for "none yet". With POINT
+// filtering + non-normalized coords, texel (ix, iy) is read at
+// tex2D<float2>(tex, ix + 0.5f, iy + 0.5f). The texture is BORDER-addressed
+// with border_color == the sentinel, so a fetch with an out-of-range coord
+// also returns (-1, -1) and is rejected by is_seed() -- the same path as an
+// empty interior cell.
+
+#define SENTINEL_X (-1.0f)
+
+__device__ __forceinline__ bool is_seed(float2 s) {
+    // Any non-negative x marks a valid stored seed coordinate.
+    return s.x >= 0.0f;
+}
+
+// Fully-saturated HSV->RGB, hue/value driven by hash, returns vivid neon RGB.
+__device__ __forceinline__ void hsv_to_rgb(float hue, float sat, float val,
+                                           float* r, float* g, float* b) {
+    hue -= floorf(hue);            // wrap hue into [0, 1)
+    float h6 = hue * 6.0f;
+    float c = val * sat;
+    float x = c * (1.0f - fabsf(fmodf(h6, 2.0f) - 1.0f));
+    float m = val - c;
+    float rr, gg, bb;
+    if (h6 < 1.0f)      { rr = c; gg = x; bb = 0.0f; }
+    else if (h6 < 2.0f) { rr = x; gg = c; bb = 0.0f; }
+    else if (h6 < 3.0f) { rr = 0.0f; gg = c; bb = x; }
+    else if (h6 < 4.0f) { rr = 0.0f; gg = x; bb = c; }
+    else if (h6 < 5.0f) { rr = x; gg = 0.0f; bb = c; }
+    else                { rr = c; gg = 0.0f; bb = x; }
+    *r = rr + m; *g = gg + m; *b = bb + m;
+}
+
+// Hash a seed coordinate into a smooth, vivid per-cell neon color. The hash
+// drives a hue around the full color wheel; saturation/value stay high so
+// neighboring cells read as distinct saturated hues rather than muddy bytes.
+__device__ __forceinline__ void seed_color(float sx, float sy,
+                                           float* r, float* g, float* b) {
+    unsigned int h = (unsigned int)(sx + 0.5f) * 374761393u +
+                     (unsigned int)(sy + 0.5f) * 668265263u;
+    h = (h ^ (h >> 13)) * 1274126177u;
+    h = h ^ (h >> 16);
+    float hue = (h & 0xffffu) / 65535.0f;
+    // A little value jitter from the high bits keeps equal-hue cells separable.
+    float val = 0.85f + 0.15f * (((h >> 16) & 0xffu) / 255.0f);
+    hsv_to_rgb(hue, 0.92f, val, r, g, b);
+}
+
+extern "C"
+__global__
+void seed_clear(cudaSurfaceObject_t surf, int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+    // float2 is 8 bytes; surf2Dwrite takes the x offset in BYTES.
+    surf2Dwrite(make_float2(SENTINEL_X, SENTINEL_X), surf,
+                x * (int)sizeof(float2), y);
+}
+
+extern "C"
+__global__
+void seed_splat(cudaSurfaceObject_t surf, int width, int height,
+                float sx, float sy) {
+    // Single-thread launch: write this seed's own coordinate into its cell.
+    int ix = (int)(sx + 0.5f);
+    int iy = (int)(sy + 0.5f);
+    if (ix < 0) ix = 0;
+    if (ix >= width) ix = width - 1;
+    if (iy < 0) iy = 0;
+    if (iy >= height) iy = height - 1;
+    surf2Dwrite(make_float2(sx, sy), surf, ix * (int)sizeof(float2), iy);
+}
+
+extern "C"
+__global__
+void jfa_step(cudaTextureObject_t tex, cudaSurfaceObject_t surf,
+              int width, int height, int step) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float px = (float)x;
+    float py = (float)y;
+
+    float best_x = SENTINEL_X;
+    float best_y = SENTINEL_X;
+    float best_d2 = 3.0e38f;  // ~FLT_MAX
+
+    // Examine self (dx=dy=0) and the 8 neighbors at +/- step. We deliberately
+    // do NOT clamp the neighbor coordinate: off-grid lookups are left out of
+    // range so the BORDER-addressed texture returns the sentinel border_color
+    // (-1, -1). is_seed() then rejects it, exactly as it would reject an empty
+    // interior cell. Under the old CLAMP scheme an off-edge fetch returned the
+    // edge texel's real seed, which could win the nearest-seed test for a
+    // border pixel even though that seed is not actually its nearest.
+    #pragma unroll
+    for (int dy = -1; dy <= 1; ++dy) {
+        #pragma unroll
+        for (int dx = -1; dx <= 1; ++dx) {
+            int nx = x + dx * step;
+            int ny = y + dy * step;
+
+            float2 s = tex2D<float2>(tex, (float)nx + 0.5f, (float)ny + 0.5f);
+            if (is_seed(s)) {
+                float ddx = s.x - px;
+                float ddy = s.y - py;
+                float d2 = ddx * ddx + ddy * ddy;
+                if (d2 < best_d2) {
+                    best_d2 = d2;
+                    best_x = s.x;
+                    best_y = s.y;
+                }
+            }
+        }
+    }
+
+    surf2Dwrite(make_float2(best_x, best_y), surf, x * (int)sizeof(float2), y);
+}
+
+extern "C"
+__global__
+void colorize(cudaTextureObject_t tex, unsigned char* output,
+              int width, int height, int mode, float t) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float2 c = tex2D<float2>(tex, (float)x + 0.5f, (float)y + 0.5f);
+
+    float r = 0.0f, g = 0.0f, b = 0.0f;
+
+    if (is_seed(c)) {
+        float dx = c.x - (float)x;
+        float dy = c.y - (float)y;
+        float dist = sqrtf(dx * dx + dy * dy);
+
+        if (mode == 0) {
+            // --- Voronoi cells: smooth neon color + glowing cell borders. ---
+            seed_color(c.x, c.y, &r, &g, &b);
+
+            // Border proximity: count how many 8-neighbors belong to a different
+            // cell. A pixel deep inside a cell sees 0; a pixel right on the edge
+            // sees several. We use this as a smooth edge factor rather than a
+            // hard on/off so borders read as a luminous seam, not a jagged line.
+            int diff = 0;
+            const int ox[8] = {-1, 1, 0, 0, -1, -1, 1, 1};
+            const int oy[8] = {0, 0, -1, 1, -1, 1, -1, 1};
+            #pragma unroll
+            for (int i = 0; i < 8; ++i) {
+                int nx = x + ox[i];
+                int ny = y + oy[i];
+                if (nx < 0) nx = 0;
+                if (nx >= width) nx = width - 1;
+                if (ny < 0) ny = 0;
+                if (ny >= height) ny = height - 1;
+                float2 n = tex2D<float2>(tex, (float)nx + 0.5f, (float)ny + 0.5f);
+                if (is_seed(n) && (n.x != c.x || n.y != c.y)) {
+                    ++diff;
+                }
+            }
+
+            // Smooth interior shading: gentle radial falloff from the cell seed
+            // for a soft volumetric look, slowly breathing in time.
+            float shade = 1.0f / (1.0f + 0.0006f * dist * dist);
+            float pulse = 0.92f + 0.08f * sinf(1.5f * t + 0.02f * dist);
+            shade = (0.55f + 0.45f * shade) * pulse;
+            r *= shade; g *= shade; b *= shade;
+
+            if (diff > 0) {
+                // edge in [0,1]: stronger the more neighbors disagree.
+                float edge = (float)diff / 8.0f;
+                edge = edge * edge;  // bias toward the true seam
+                // Darken the base color toward the seam, then add a bright neon
+                // rim on top so cell boundaries glow instead of just going dark.
+                float dark = 1.0f - 0.85f * edge;
+                r *= dark; g *= dark; b *= dark;
+                float rim = edge * (0.65f + 0.35f * sinf(2.5f * t));
+                r += rim; g += rim * 0.9f; b += rim;
+            }
+        } else {
+            // --- Metaballs: glowing neon falloff from the nearest seed. ---
+            // Brightness peaks at the seed and decays smoothly with distance.
+            float glow = 1.0f / (1.0f + 0.0018f * dist * dist);
+            // A couple of animated isoline ripples add a layered plasma pulse.
+            float ripple = 0.5f + 0.5f * sinf(0.13f * dist - 3.0f * t);
+            float ripple2 = 0.5f + 0.5f * sinf(0.05f * dist + 1.7f * t);
+            float intensity = glow * (0.55f + 0.30f * ripple + 0.15f * ripple2);
+            // A soft core bloom keeps seed centers reading as hot points.
+            float core = 1.0f / (1.0f + 0.02f * dist * dist);
+            intensity += 0.5f * core;
+
+            // Hue sweeps with distance + time so blobs shimmer through the neon
+            // spectrum; value tracks intensity so falloff still fades to black.
+            float hue = 0.6f + 0.0015f * dist + 0.05f * t;
+            float val = intensity;
+            if (val > 1.0f) val = 1.0f;
+            hsv_to_rgb(hue, 0.85f, val, &r, &g, &b);
+            // Lift toward white at the very brightest cores for a hot-tip look.
+            float hot = intensity - 1.0f;
+            if (hot > 0.0f) {
+                if (hot > 1.0f) hot = 1.0f;
+                r += hot * (1.0f - r);
+                g += hot * (1.0f - g);
+                b += hot * (1.0f - b);
+            }
+        }
+    }
+
+    // Clamp to [0, 1] before writing bytes.
+    if (r < 0.0f) r = 0.0f; if (r > 1.0f) r = 1.0f;
+    if (g < 0.0f) g = 0.0f; if (g > 1.0f) g = 1.0f;
+    if (b < 0.0f) b = 0.0f; if (b > 1.0f) b = 1.0f;
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(r * 255.0f);
+    output[idx + 1] = (unsigned char)(g * 255.0f);
+    output[idx + 2] = (unsigned char)(b * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders: the vertex stage forwards `position` / `texcoord` unchanged and the
+# fragment stage samples the `tex` sampler at that coordinate. Nothing CUDA-specific.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_lenia.py b/cuda_core/examples/gl_interop_lenia.py
new file mode 100644
index 00000000000..ea2d8dc36ae
--- /dev/null
+++ b/cuda_core/examples/gl_interop_lenia.py
@@ -0,0 +1,802 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop. A Lenia
+# continuous cellular automaton is ping-ponged between two CUDA arrays each
+# frame: a TextureObject provides smooth (LINEAR + WRAP) sampled reads through
+# a large bell-shaped neighborhood kernel, and a SurfaceObject provides typed
+# writes. The final state is colorized straight into an OpenGL PBO. Requires
+# pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to drive a wide-radius convolution from a TextureObject configured for
+#   LINEAR + WRAP + normalized coordinates. The same CUDAArray is then bound as a
+#   SurfaceObject for the typed write back, requiring `is_surface_load_store=True`
+#   at allocation time.
+# - How a single-channel `float` CUDAArray differs from the multi-channel layout
+#   used in the Gray-Scott example: `num_channels=1`, `tex2D<float>` reads, and
+#   a 4-byte x-stride in `surf2Dwrite`.
+# - How to host-precompute a normalization constant for a stencil with a
+#   variable-shape support (the bell-curve neighborhood), then pass it as a
+#   plain float kernel argument.
+#
+# How it works
+# ============
+# Lenia (Bert Wang-Chak Chan, 2018) generalizes Conway's Game of Life to
+# continuous space, time, and state. Each cell holds a real value in [0, 1].
+# Per step, every cell:
+#
+#   1. Integrates a smooth bell-shaped neighborhood kernel K against the
+#      current state to produce a "potential" U:
+#
+#          U(x) = sum over offsets (dx, dy) inside a disk of radius R of
+#                  K(|(dx, dy)|) * state(x + (dx, dy))
+#                 divided by  sum of K  (host-precomputed).
+#
+#      K(r) = exp(-((r / R) - mu_K)^2 / (2 * sigma_K^2)) for r <= R.
+#
+#   2. Applies the growth function G and updates the state:
+#
+#          state_new = clamp(state_old + dt * (2 * exp(-(U - mu)^2 /
+#                            (2 * sigma^2)) - 1),  0,  1).
+#
+# Two single-channel `float` arrays are ping-ponged each frame: a
+# TextureObject reads one (sampled with LINEAR + WRAP so the disk wraps
+# toroidally) and a SurfaceObject writes the other.
+#
+#   PING-PONG (two arrays, swap each step)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +--------------+   tex2D<float>    +------------------+
+#   |   arr_a      | ----------------> |                  |
+#   |    state     |                   |  convolve_lenia  |
+#   +--------------+                   |     kernel       |
+#                                      |  (+ growth fn)   |
+#   +--------------+   surf2Dwrite     |                  |
+#   |   arr_b      | <---------------- |                  |
+#   |    state     |                   +------------------+
+#   +--------------+
+#       (swap)
+#
+# After the step we run a separate `colorize_lenia` kernel that samples the
+# new state and writes RGBA bytes straight into the OpenGL PBO via
+# GraphicsResource. No data ever travels across the PCIe bus during the frame.
+#
+# Why LINEAR + WRAP + normalized coords?
+# --------------------------------------
+# Lenia's neighborhood radius (R = 13) is wide enough that boundary handling
+# really matters. AddressMode.WRAP gives a toroidal world for free, and it is
+# only supported in normalized coordinate mode (see the CUDA Programming
+# Guide). LINEAR filtering is essentially free on the hardware -- here it
+# softens the integer-offset reads a hair, which keeps the dynamics smooth.
+# Sample coordinates are `(x + dx + 0.5) / W`; values < 0 or > 1 are fine,
+# WRAP handles them.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. For a
+# single-channel `float` surface that means `x * sizeof(float)` = `x * 4`.
+# (The Gray-Scott example uses 8 because it stores `float2`.)
+#
+# One step per frame
+# ------------------
+# Each step convolves a (2R+1)^2 = 729-tap neighborhood for every pixel, which
+# is much heavier than a Gray-Scott 5-point Laplacian. With dt = 0.1 the
+# dynamics are slow enough that one step per displayed frame is plenty. There
+# is no `N_STEPS` loop.
+#
+# What you should see
+# ===================
+# A window showing soft, glider-like blobs drifting across the field on a
+# teal-on-black palette. Press R to reseed with a new Gaussian blob, 1 to
+# clear the field, and Escape to exit. The window title shows the current
+# FPS.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 256  # state-array width; also used as the window width
+HEIGHT = 256  # state-array height; also used as the window height
+
+# Neighborhood / kernel shape
+R = 13  # convolution radius in pixels (texture-space)
+MU_K = 0.5  # bell center for the neighborhood weight K(r/R)
+SIGMA_K = 0.15  # bell width for K
+
+# Growth function shape
+MU = 0.15  # bell center for the growth function G(U)
+SIGMA = 0.015  # bell width for G
+
+DT = 0.1  # time step
+
+# Initial blob radius and peak for the Gaussian seed.
+# The radius must be large relative to the neighborhood radius R=13 so the
+# kernel-integrated potential U lands near the growth bell's center mu=0.15.
+# With SEED_RADIUS=36, U at the blob's center starts near mu and the field
+# survives the first step; smaller seeds collapse to zero within one frame
+# because U is far outside the narrow (sigma=0.015) growth bell.
+SEED_RADIUS = 36.0
+SEED_PEAK = 0.5
+
+# Seed modes (kept in sync with the seed_blob kernel)
+SEED_MODE_CLEAR = 0  # zero the field
+SEED_MODE_BLOB = 1  # place a Gaussian blob
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def compute_kernel_norm(radius, mu_k, sigma_k):
+    """Precompute 1 / (sum of K(r)) for the bell-shaped neighborhood weight.
+
+    Mirrors exactly what the device kernel does so the convolution is energy-
+    preserving: walks the (2R+1)x(2R+1) box, accumulates
+    `exp(-(r/R - mu_k)^2 / (2*sigma_k^2))` for `r <= R`, and returns the
+    reciprocal sum as a float32.
+    """
+    inv_two_sigma2 = 1.0 / (2.0 * sigma_k * sigma_k)
+    inv_r = 1.0 / float(radius)
+    total = 0.0
+    for dy in range(-radius, radius + 1):
+        for dx in range(-radius, radius + 1):
+            r = math.sqrt(dx * dx + dy * dy)
+            if r > radius:
+                continue
+            rn = r * inv_r - mu_k
+            total += math.exp(-(rn * rn) * inv_two_sigma2)
+    if total <= 0.0:
+        raise RuntimeError("kernel normalization sum collapsed to zero")
+    return np.float32(1.0 / total)
+
+
+def setup_cuda():
+    """Compile the CUDA kernels and return (device, stream, kernels, configs).
+
+    Returns a dict of kernels keyed by name and matching LaunchConfigs.
+    """
+    dev = Device(0)
+    dev.set_current()
+
+    # SurfaceObject requires surface load/store, which has existed since SM 2.0,
+    # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Compile as C++ so the templated tex2D<float> overload resolves.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("convolve_lenia", "colorize_lenia", "seed_blob"),
+    )
+
+    kernels = {
+        "step": mod.get_kernel("convolve_lenia"),
+        "colorize": mod.get_kernel("colorize_lenia"),
+        "seed": mod.get_kernel("seed_blob"),
+    }
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+    # All three kernels are pixel-parallel over a WIDTH x HEIGHT grid, so they
+    # can share a launch config.
+    configs = {"step": config, "colorize": config, "seed": config}
+
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet)."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core CUDAArray/Texture/Surface - Lenia",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    This sets up a shader program, a fullscreen quad, and an empty texture.
+    None of this is CUDA-specific -- it's standard OpenGL boilerplate for
+    rendering a textured quad.
+
+    Returns (shader_program, vertex_array_id, texture_id). The shader_program
+    is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Shader program -- just passes texture coordinates through
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window)
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            -1,
+            1,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (will be filled each frame from the PBO)
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a Pixel Buffer Object (PBO) -- the bridge between CUDA and OpenGL.
+
+    A PBO is a GPU-side buffer that OpenGL can read from when uploading pixels
+    to a texture. By registering this same buffer with CUDA, the CUDA kernel
+    can write directly into it.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # RGBA, 1 byte per channel
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # None = read from the currently bound PBO, not from CPU
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw `tex_id` with `shader_prog` on the quad in `vao_id`, then unbind the VAO and program."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = two triangles
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+def make_state_arrays():
+    """Allocate the two single-channel `float` ping-pong arrays.
+
+    `is_surface_load_store=True` is what lets the same CUDAArray be bound as both a
+    TextureObject (sampled reads) and a SurfaceObject (typed writes).
+    """
+    arr_a = CUDAArray.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        is_surface_load_store=True,
+    )
+    arr_b = CUDAArray.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        is_surface_load_store=True,
+    )
+    return arr_a, arr_b
+
+
+def make_texture(arr):
+    """Bind `arr` as a TextureObject configured for LINEAR + WRAP + normalized."""
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        # WRAP/MIRROR addressing modes require normalized coordinates.
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def seed_state(stream, kernels, configs, write_surf, mode, seed_value):
+    """Re-initialize the array behind `write_surf` with a Gaussian blob or zeros.
+
+    `mode = SEED_MODE_CLEAR` zeroes the field; `mode = SEED_MODE_BLOB` places a
+    Gaussian blob with peak ~SEED_PEAK at the center, jittered by `seed_value`
+    so successive reseeds give different patterns.
+
+    Takes a long-lived SurfaceObject (not a fresh one): `launch` is async, so
+    creating a SurfaceObject inside a `with` block that closes immediately
+    after `launch` returns would destroy the surface handle before the kernel
+    actually runs against it.
+    """
+    launch(
+        stream,
+        configs["seed"],
+        kernels["seed"],
+        np.uint64(write_surf.handle),
+        np.int32(WIDTH),
+        np.int32(HEIGHT),
+        np.int32(mode),
+        np.uint32(seed_value),
+        np.float32(SEED_RADIUS),
+        np.float32(SEED_PEAK),
+    )
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Set up CUDA and OpenGL, register the pyglet handlers, and run the event loop."""
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    #     The PBO is GPU memory owned by OpenGL. It's the bridge between the
+    #     two worlds: CUDA writes into it, OpenGL reads from it.
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the two ping-pong state Arrays ---
+    #     Both are single-channel `float` with `is_surface_load_store=True` so
+    #     they can be bound as SurfaceObjects.
+    arr_a, arr_b = make_state_arrays()
+
+    # --- Step 7: Pre-create the four bindless handles ---
+    #     Creating these once is much cheaper than rebuilding them every
+    #     step. The simulation loop just picks which read/write pair to use.
+    tex_a = make_texture(arr_a)
+    tex_b = make_texture(arr_b)
+    surf_a = SurfaceObject.from_array(arr_a)
+    surf_b = SurfaceObject.from_array(arr_b)
+
+    # --- Step 8: Precompute the bell-curve normalization constant ---
+    #     The neighborhood weight K(r) is unnormalized in the kernel; we
+    #     divide by sum(K) so the convolution is a weighted mean rather than
+    #     an unbounded integral. Doing this on the host once at startup is
+    #     much cheaper than redoing it on the device every step.
+    inv_weight_sum = compute_kernel_norm(R, MU_K, SIGMA_K)
+
+    # --- Step 9: Seed an initial Gaussian blob into arr_a (writes via surf_a) ---
+    seed_state(stream, kernels, configs, surf_a, SEED_MODE_BLOB, seed_value=0)
+    # After seeding, `arr_a` is the "current" state.
+    state = {"current": "a", "seed": 0}
+
+    # --- Step 10: Render loop ---
+    frame_count = 0
+    fps_time = time.monotonic()
+
+    def current_read_write():
+        if state["current"] == "a":
+            return tex_a, surf_b, "b"  # read a, write b, next current = b
+        return tex_b, surf_a, "a"
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+            return
+        if symbol == key.R:
+            # Reseed with a new Gaussian blob; bump the seed so the jitter
+            # pattern changes each time.
+            state["seed"] += 1
+            seed_state(stream, kernels, configs, surf_a, SEED_MODE_BLOB, state["seed"])
+            state["current"] = "a"
+            return
+        if symbol == key._1:
+            # Clear the field. Useful to confirm the simulation is quiet when
+            # the state is zero.
+            seed_state(stream, kernels, configs, surf_a, SEED_MODE_CLEAR, 0)
+            state["current"] = "a"
+            return
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+
+        # (a) Run one Lenia step. The convolution kernel reads the current
+        #     state via a TextureObject (LINEAR + WRAP gives toroidal
+        #     wrapping at the border), evaluates the growth function, and
+        #     writes the new state via a SurfaceObject. One step per frame
+        #     is intentional: dt = 0.1 is small, and the (2R+1)^2 = 729-tap
+        #     stencil is heavy enough that going faster would not help.
+        tex_read, surf_write, next_current = current_read_write()
+        launch(
+            stream,
+            configs["step"],
+            kernels["step"],
+            np.uint64(tex_read.handle),
+            np.uint64(surf_write.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.int32(R),
+            np.float32(MU_K),
+            np.float32(SIGMA_K),
+            np.float32(MU),
+            np.float32(SIGMA),
+            np.float32(DT),
+            inv_weight_sum,
+        )
+        state["current"] = next_current
+
+        # (b) Colorize the latest state into the OpenGL PBO.
+        tex_read = tex_a if state["current"] == "a" else tex_b
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["colorize"],
+                kernels["colorize"],
+                np.uint64(tex_read.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (c) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (d) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            window.set_caption(f"cuda.core CUDAArray/Texture/Surface - Lenia ({WIDTH}x{HEIGHT}, R={R}, {fps:.0f} FPS)")
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # `launch` is async, so kernels may still be using the handles below:
+        # drain the stream first, then release explicitly (pyglet owns the loop).
+        stream.sync()
+        resource.close()
+        tex_a.close()
+        tex_b.close()
+        surf_a.close()
+        surf_b.close()
+        arr_a.close()
+        arr_b.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. The important things to know:
+#
+#   - KERNEL_SOURCE contains three CUDA C++ kernels:
+#       * seed_blob       -- sets the initial state via SurfaceObject writes.
+#                            Either clears the field (mode = 0) or paints a
+#                            Gaussian blob centered in the field (mode = 1).
+#       * convolve_lenia  -- reads previous state via TextureObject (with
+#                            LINEAR + WRAP bilinear filtering), integrates a
+#                            bell-shaped neighborhood K(r/R) to produce the
+#                            potential U, applies the growth function G(U),
+#                            and writes the next state via SurfaceObject.
+#       * colorize_lenia  -- reads the new state via TextureObject and writes
+#                            RGBA bytes into the OpenGL PBO using a simple
+#                            teal-on-black gradient.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a
+#     texture onto a rectangle covering the entire window. Nothing interesting.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// All kernels run one thread per output pixel and bounds-check at the top.
+// `surf2Dwrite` takes the x offset in BYTES; for a single-channel float
+// surface that means `x * sizeof(float)` = `x * 4`.
+
+extern "C"
+__global__
+void seed_blob(cudaSurfaceObject_t surf,
+               int width, int height,
+               int mode,
+               unsigned int seed,
+               float radius,
+               float peak) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float value = 0.0f;
+    if (mode == 1) {
+        // Gaussian blob centered in the field with a small deterministic
+        // jitter that breaks symmetry differently on each reseed.
+        float cx = (float)(width  / 2);
+        float cy = (float)(height / 2);
+        float dx = (float)x - cx;
+        float dy = (float)y - cy;
+        float r2 = dx * dx + dy * dy;
+        float inv = 1.0f / (radius * radius);
+        value = peak * expf(-r2 * inv);
+
+        unsigned int h = (unsigned int)x * 374761393u +
+                         (unsigned int)y * 668265263u + seed * 2246822519u;
+        h = (h ^ (h >> 13)) * 1274126177u;
+        h = h ^ (h >> 16);
+        float noise = (h & 0xffffu) / 65535.0f;  // in [0, 1]
+        value += 0.02f * (noise - 0.5f);
+        if (value < 0.0f) value = 0.0f;
+        if (value > 1.0f) value = 1.0f;
+    }
+
+    // float is 4 bytes; surf2Dwrite takes the x offset in BYTES.
+    surf2Dwrite(value, surf, x * (int)sizeof(float), y);
+}
+
+extern "C"
+__global__
+void convolve_lenia(cudaTextureObject_t tex,
+                    cudaSurfaceObject_t surf,
+                    int width, int height,
+                    int R,
+                    float mu_k, float sigma_k,
+                    float mu, float sigma,
+                    float dt,
+                    float inv_weight_sum) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Normalized texture coordinates: WRAP addressing requires them. The
+    // (x + dx + 0.5) / W idiom places the sample at the texel center; values
+    // outside [0, 1] are fine because WRAP wraps them toroidally.
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float inv_R = 1.0f / (float)R;
+    float inv_two_sigma_k2 = 1.0f / (2.0f * sigma_k * sigma_k);
+    float inv_two_sigma2   = 1.0f / (2.0f * sigma     * sigma);
+
+    // Integrate the bell-shaped weight K(r/R) against the current state.
+    float U = 0.0f;
+    for (int dy = -R; dy <= R; ++dy) {
+        for (int dx = -R; dx <= R; ++dx) {
+            float fdx = (float)dx;
+            float fdy = (float)dy;
+            float r2 = fdx * fdx + fdy * fdy;
+            float r  = sqrtf(r2);
+            if (r > (float)R) continue;   // restrict to the disk
+            float rn = r * inv_R - mu_k;
+            float w  = expf(-(rn * rn) * inv_two_sigma_k2);
+
+            float sx = ((float)x + fdx + 0.5f) * inv_w;
+            float sy = ((float)y + fdy + 0.5f) * inv_h;
+            float s  = tex2D<float>(tex, sx, sy);
+            U += w * s;
+        }
+    }
+    U *= inv_weight_sum;   // host-precomputed 1 / sum(K)
+
+    // Read the current cell value (point sample at the texel center).
+    float sx0 = ((float)x + 0.5f) * inv_w;
+    float sy0 = ((float)y + 0.5f) * inv_h;
+    float state = tex2D<float>(tex, sx0, sy0);
+
+    // Growth function G(U) = 2 * exp(-(U - mu)^2 / (2 * sigma^2)) - 1,
+    // mapping U near mu to +1 (grow) and U far from mu to -1 (shrink).
+    float du = U - mu;
+    float G  = 2.0f * expf(-(du * du) * inv_two_sigma2) - 1.0f;
+
+    float new_state = state + dt * G;
+    if (new_state < 0.0f) new_state = 0.0f;
+    if (new_state > 1.0f) new_state = 1.0f;
+
+    surf2Dwrite(new_state, surf, x * (int)sizeof(float), y);
+}
+
+extern "C"
+__global__
+void colorize_lenia(cudaTextureObject_t tex,
+                    unsigned char* output,
+                    int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float cx = ((float)x + 0.5f) * inv_w;
+    float cy = ((float)y + 0.5f) * inv_h;
+
+    float v = tex2D<float>(tex, cx, cy);
+    if (v < 0.0f) v = 0.0f;
+    if (v > 1.0f) v = 1.0f;
+
+    // Linear interpolation from a deep teal at v = 0 to a bright teal at
+    // v = 1. Two stops -- simple, easy to read, no LUT required.
+    //   (0, 15, 30, 255)  ->  (50, 200, 180, 255)
+    float r = (  0.0f + v * ( 50.0f -   0.0f));
+    float g = ( 15.0f + v * (200.0f -  15.0f));
+    float b = ( 30.0f + v * (180.0f -  30.0f));
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)r;
+    output[idx + 1] = (unsigned char)g;
+    output[idx + 2] = (unsigned char)b;
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_mandelbrot.py b/cuda_core/examples/gl_interop_mandelbrot.py
new file mode 100644
index 00000000000..73671d77e95
--- /dev/null
+++ b/cuda_core/examples/gl_interop_mandelbrot.py
@@ -0,0 +1,692 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.CUDAArray and TextureObject used as a *color
+# lookup table* (palette LUT) for a real-time Mandelbrot deep-zoom explorer.
+# A CUDA kernel computes smooth iteration counts and uses tex1D<float4> with
+# LINEAR + CLAMP + NORMALIZED_FLOAT sampling to read a 256-entry RGBA palette,
+# writing the final RGBA bytes straight into an OpenGL PBO via GraphicsResource.
+# Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to use a 1D cuda.core.CUDAArray as a palette and bind it via a
+#   TextureObject for hardware-filtered color lookups inside a kernel.
+# - How LINEAR + AddressMode.CLAMP + ReadMode.NORMALIZED_FLOAT + normalized
+#   coordinates give you a free `texture(palette, t)` style sampler that
+#   returns a float4 in [0, 1] regardless of the underlying storage format.
+# - How to drive a real-time interactive viewer: mouse pan, scroll-wheel zoom
+#   anchored at the cursor, and key-driven iteration cap.
+#
+# How it works
+# ============
+# The Mandelbrot set is defined by iterating z -> z^2 + c starting from
+# z = 0; pixels are colored by how quickly z escapes the disk of radius 2.
+#
+#     +-----------+   ResourceDescriptor.from_array
+#     | CUDAArray | ------------------------------+
+#     |  float4   |                               v
+#     | size 256  |                     +-------------------+
+#     +-----------+                     |   TextureObject   |
+#       ^  copy_from(host)              |  (palette LUT)    |
+#       |                               +---------+---------+
+#     host palette                                |
+#     (numpy float32x4, 256 stops)                |
+#                                                 v
+#                                  tex1D<float4>(palette, t)
+#                                                 |
+#                                                 v
+#                                     +-----------------------+
+#                                     |  mandelbrot kernel    |
+#                                     |  (one thread / pixel) |
+#                                     +-----------+-----------+
+#                                                 |
+#                                                 v   GraphicsResource.map
+#                                     +-----------------------+
+#                                     |   OpenGL PBO (RGBA8)  |
+#                                     +-----------------------+
+#
+# Smooth iteration count
+# ----------------------
+# A plain integer escape count produces ugly banded colors. With a bailout
+# radius R = 2 (escape when |z|^2 > 4), we use the standard smooth formula:
+#
+#     mu = iter + 1 - log(log(|z|)) / log(2)
+#
+# At the escape step |z| > 2, so log(|z|) > log(2) > 0 and log(log(|z|)) is
+# finite. We compute this in double and cast to float for the palette lookup.
+#
+# Cursor-anchored zoom
+# --------------------
+# On scroll, we want the world point under the mouse cursor to remain under
+# the cursor after the zoom. We capture (wx, wy) under the cursor with the
+# old scale, multiply the scale by 0.9 (zoom in) or 1.1 (zoom out), then
+# back-solve cx, cy so the same screen pixel still maps to (wx, wy):
+#
+#     cx_new = wx - (mouse_x - W/2) * scale_new
+#     cy_new = wy - (mouse_y - H/2) * scale_new
+#
+# Why double precision for cx, cy, scale?
+# ---------------------------------------
+# Float32 runs out of mantissa bits around 1e6x zoom; double gets you to
+# roughly 1e13x before the pixel grid coarsens visibly. The kernel takes
+# cx, cy, scale as doubles and only narrows to float for the color lookup.
+#
+# Address mode note
+# -----------------
+# We use AddressMode.CLAMP. Combined with the `fmodf(mu * 0.02f, 1.0f)`
+# cycling formula, the palette index is already guaranteed to be in
+# [0, 1), so CLAMP and WRAP both produce identical results in
+# practice -- there is no visible seam.
+#
+# What you should see
+# ===================
+# A window showing the Mandelbrot set. Drag with the left mouse button to
+# pan, scroll the wheel to zoom in/out at the cursor, press R to reset the
+# view, and `[`/`]` to lower/raise the iteration cap. The window title shows
+# the current zoom level, center, max_iter, and FPS. Close the window or
+# press Escape to exit.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Window and viewer parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 1024  # window width in pixels; also the width of the rendered image
+HEIGHT = 768  # window height in pixels; also the height of the rendered image
+PALETTE_SIZE = 256  # number of RGBA entries in the 1D palette lookup table
+
+# Default view: classic Mandelbrot framing centered slightly left of origin.
+DEFAULT_CX = -0.5
+DEFAULT_CY = 0.0
+DEFAULT_SCALE = 4.0 / HEIGHT  # world-units per pixel (4-unit-tall view)
+DEFAULT_MAX_ITER = 512  # iteration cap at startup and after pressing R
+
+# Bounds for [/] iteration adjust.
+MIN_MAX_ITER = 64
+MAX_MAX_ITER = 8192
+ITER_STEP = 64  # amount added to / subtracted from max_iter per key press
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# CUDAArray/TextureObject as a palette LUT, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Initialize device 0 and build everything needed to launch the kernel.
+
+    Returns the tuple (device, stream, kernel, config). Exits the process
+    with status 1 if the GPU's compute capability major version is below 3.
+    """
+    device = Device(0)
+    device.set_current()
+
+    # Bindless texture objects (cuTexObjectCreate) require SM 3.0+.
+    capability = device.compute_capability
+    if capability.major < 3:
+        message = (
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless texture objects. Found sm_{capability.major}{capability.minor}."
+        )
+        print(message, file=sys.stderr)
+        sys.exit(1)
+
+    stream = device.create_stream()
+
+    # Compile as C++ so the templated tex1D<float4> overload resolves.
+    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+    program = Program(KERNEL_SOURCE, code_type="c++", options=options)
+    module = program.compile("cubin", name_expressions=("mandelbrot",))
+    kernel = module.get_kernel("mandelbrot")
+
+    # 16x16 thread blocks; the grid is rounded up so it covers WIDTH x HEIGHT.
+    threads = (16, 16, 1)
+    blocks = (-(-WIDTH // threads[0]), -(-HEIGHT // threads[1]), 1)
+    config = LaunchConfig(grid=blocks, block=threads)
+
+    return device, stream, kernel, config
+
+
+def create_window():
+    """Open the viewer window.
+
+    pyglet is imported lazily so that a missing installation produces a
+    short hint on stderr and exit status 1 rather than an import traceback.
+
+    Returns (window, gl_module, pyglet). The window is WIDTH x HEIGHT and is
+    created with vsync disabled.
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as gl_module
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+    title = "cuda.core CUDAArray/Texture - Mandelbrot Deep Zoom"
+    window = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
+    return window, gl_module, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    Builds a shader program from VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE,
+    a VAO + VBO holding a fullscreen quad (two triangles, interleaved
+    x, y, s, t floats), and an empty RGBA8 texture of `width` x `height`.
+
+    Returns (shader_program, vertex_array_id, texture_id). The shader_program
+    is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Shader program -- just passes texture coordinates through
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window)
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1,  # triangle 1, vertex 1: bottom-left
+            -1,
+            0,
+            0,
+            1,  # triangle 1, vertex 2: bottom-right
+            -1,
+            1,
+            0,
+            1,  # triangle 1, vertex 3: top-right
+            1,
+            1,
+            1,
+            -1,  # triangle 2, vertex 1: bottom-left
+            -1,
+            0,
+            0,
+            1,  # triangle 2, vertex 2: top-right
+            1,
+            1,
+            1,
+            -1,  # triangle 2, vertex 3: top-left
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)  # the buffer name is not returned to the caller
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+    # texcoord (s, t) follows the two position floats, hence the 8-byte offset.
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (will be filled each frame from the PBO)
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no initial pixel data is supplied here
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate the Pixel Buffer Object (PBO) shared between CUDA and OpenGL.
+
+    The buffer is sized for `width` x `height` RGBA8 pixels and created with
+    no initial data. OpenGL can read from it when uploading pixels to a
+    texture; once it is registered with CUDA, a kernel can write into it.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    size_in_bytes = width * height * 4  # RGBA, 1 byte per channel
+    buffer_name = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(buffer_name))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, buffer_name.value)
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return buffer_name.value, size_in_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload the PBO contents into the GL texture (a GPU-to-GPU copy).
+
+    With the PBO bound to GL_PIXEL_UNPACK_BUFFER, the `None` data argument
+    of glTexSubImage2D makes OpenGL read the pixels from that buffer rather
+    than from CPU memory. The PBO binding is cleared again before returning.
+    """
+    target = gl.GL_TEXTURE_2D
+    level, x_offset, y_offset = 0, 0, 0
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(target, tex_id)
+    # Replace the full width x height region with RGBA / unsigned-byte pixels.
+    gl.glTexSubImage2D(
+        target, level, x_offset, y_offset, width, height, gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture `tex_id` with `shader_prog` and the quad stored in `vao_id`."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)  # reset the VAO and program bindings before returning
+    gl.glUseProgram(0)
+
+
+def build_palette():
+    """Return the palette LUT as a flat float32 array of PALETTE_SIZE * 4 values.
+
+    Entry i is the RGBA color at t = i / (PALETTE_SIZE - 1), obtained by
+    linear interpolation between the two hand-picked color stops that
+    bracket t. All channels are in [0, 1] and every stop has alpha 1.
+    The flat layout is what main() passes to CUDAArray.copy_from().
+    """
+    # Stop positions along the gradient and the RGBA color at each stop:
+    # deep blue -> cyan -> yellow -> orange -> red -> magenta -> black.
+    stop_pos = np.array([0.00, 0.16, 0.42, 0.58, 0.74, 0.90, 1.00], dtype=np.float32)
+    stop_rgba = np.array(
+        [
+            [0.02, 0.05, 0.30, 1.0],  # deep blue
+            [0.10, 0.50, 0.90, 1.0],  # cyan
+            [1.00, 0.95, 0.20, 1.0],  # yellow
+            [1.00, 0.55, 0.10, 1.0],  # orange
+            [0.95, 0.10, 0.10, 1.0],  # red
+            [0.65, 0.10, 0.85, 1.0],  # magenta
+            [0.00, 0.00, 0.00, 1.0],  # black
+        ],
+        dtype=np.float32,
+    )
+    last_segment = len(stop_pos) - 2
+
+    palette = np.empty((PALETTE_SIZE, 4), dtype=np.float32)
+    for i in range(PALETTE_SIZE):
+        t = i / (PALETTE_SIZE - 1)
+        # Index of the stop at or below t, clamped so that j + 1 stays valid.
+        j = int(np.searchsorted(stop_pos, t, side="right")) - 1
+        j = min(max(j, 0), last_segment)
+        lo, hi = stop_pos[j], stop_pos[j + 1]
+        frac = (t - lo) / (hi - lo) if hi > lo else 0.0
+        palette[i] = stop_rgba[j] + frac * (stop_rgba[j + 1] - stop_rgba[j])
+
+    # Flatten to (PALETTE_SIZE * 4,) so the byte layout matches a
+    # float4 x PALETTE_SIZE 1D CUDAArray.
+    return np.ascontiguousarray(palette.reshape(-1), dtype=np.float32)
+
+
+def make_palette_texture(arr):
+    """Create a TextureObject over `arr` with LINEAR + CLAMP + normalized sampling."""
+    palette_resource = ResourceDescriptor.from_array(arr)
+    sampling = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.LINEAR,
+        # NORMALIZED_FLOAT is a no-op for FLOAT32 storage (the data is already
+        # in [0, 1]); it is set anyway to document the intent for readers who
+        # build their palettes from UINT8 storage instead.
+        read_mode=ReadMode.NORMALIZED_FLOAT,
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=palette_resource, texture_descriptor=sampling)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Set up CUDA and OpenGL, register the pyglet handlers, and run the event loop."""
+    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
+    dev, stream, kernel, config = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    #     The PBO is GPU memory owned by OpenGL. It's the bridge between the
+    #     two worlds: CUDA writes into it, OpenGL reads from it.
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Build and upload the palette LUT ---
+    #     One 1D CUDAArray, 256 entries of float4 RGBA. The host-side palette is
+    #     a flat numpy float32 array; copy_from() does an async H2D copy, so
+    #     we sync the stream once afterwards to make sure the data has landed
+    #     before we start sampling from it in the render loop.
+    host_palette = build_palette()
+    palette_arr = CUDAArray.from_descriptor(
+        shape=(PALETTE_SIZE,),
+        format=ArrayFormat.FLOAT32,
+        num_channels=4,
+    )
+    palette_arr.copy_from(host_palette, stream=stream)
+    stream.sync()
+
+    # --- Step 7: Bind the palette CUDAArray as a TextureObject (LUT) ---
+    palette_tex = make_palette_texture(palette_arr)
+
+    # --- Step 8: Render loop ---
+    frame_count = 0
+    fps_time = time.monotonic()
+
+    # View state. cx, cy, scale are kept in Python floats (double precision)
+    # and converted to np.float64 on each kernel launch.
+    view = {
+        "cx": float(DEFAULT_CX),
+        "cy": float(DEFAULT_CY),
+        "scale": float(DEFAULT_SCALE),
+        "max_iter": int(DEFAULT_MAX_ITER),
+        # Pan-drag state (left mouse button).
+        "dragging": False,
+    }
+
+    def screen_to_world(mouse_x, mouse_y):
+        """Map a pyglet mouse coordinate to the world point currently under it.
+
+        Pyglet's window origin is bottom-left and the rendered texture's
+        origin is also bottom-left, so no y-flip is needed.
+        """
+        wx = view["cx"] + (mouse_x - WIDTH / 2.0) * view["scale"]
+        wy = view["cy"] + (mouse_y - HEIGHT / 2.0) * view["scale"]
+        return wx, wy
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+            return
+        if symbol == key.R:
+            view["cx"] = float(DEFAULT_CX)
+            view["cy"] = float(DEFAULT_CY)
+            view["scale"] = float(DEFAULT_SCALE)
+            view["max_iter"] = int(DEFAULT_MAX_ITER)
+            return
+        if symbol == key.BRACKETLEFT:
+            view["max_iter"] = max(MIN_MAX_ITER, view["max_iter"] - ITER_STEP)
+            return
+        if symbol == key.BRACKETRIGHT:
+            view["max_iter"] = min(MAX_MAX_ITER, view["max_iter"] + ITER_STEP)
+            return
+
+    @window.event
+    def on_mouse_press(_x, _y, button, _modifiers):
+        if button == pyglet.window.mouse.LEFT:
+            view["dragging"] = True
+
+    @window.event
+    def on_mouse_release(_x, _y, button, _modifiers):
+        if button == pyglet.window.mouse.LEFT:
+            view["dragging"] = False
+
+    @window.event
+    def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
+        if buttons & pyglet.window.mouse.LEFT:
+            # Pan: move the center opposite to the cursor drag (so the scene
+            # follows the cursor). screen_to_world() applies no y-flip, so y
+            # needs the same sign as x to keep the grabbed point under the cursor.
+            view["cx"] -= dx * view["scale"]
+            view["cy"] -= dy * view["scale"]
+
+    @window.event
+    def on_mouse_scroll(x, y, _scroll_x, scroll_y):
+        if scroll_y == 0:
+            return  # purely horizontal scroll: nothing to zoom
+        wx, wy = screen_to_world(x, y)  # world point to keep pinned under the cursor
+        view["scale"] *= 0.9 if scroll_y > 0 else 1.1
+        # Back-solve cx, cy so screen pixel (x, y) still maps to (wx, wy).
+        view["cx"] = wx - (x - WIDTH / 2.0) * view["scale"]
+        view["cy"] = wy - (y - HEIGHT / 2.0) * view["scale"]
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+
+        # (a) Map the PBO so CUDA can write to it. This gives us a Buffer
+        #     whose .handle is a device pointer pointing into the GL PBO.
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                config,
+                kernel,
+                np.uint64(palette_tex.handle),  # bindless texture handle
+                buf.handle,  # output PBO (RGBA8)
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.float64(view["cx"]),
+                np.float64(view["cy"]),
+                np.float64(view["scale"]),
+                np.int32(view["max_iter"]),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (b) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (c) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            zoom = 1.0 / view["scale"] if view["scale"] > 0 else 0.0
+            window.set_caption(
+                "cuda.core CUDAArray/Texture - Mandelbrot"
+                f" | zoom {zoom:.3e}x"
+                f" | center ({view['cx']:.6f}, {view['cy']:.6f})"
+                f" | iter {view['max_iter']}"
+                f" | {fps:.0f} FPS"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # `launch` is async, so a kernel may still be sampling the palette:
+        # drain the stream first, then release explicitly (pyglet owns the loop).
+        stream.sync()
+        resource.close()
+        palette_tex.close()
+        palette_arr.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. The important things to know:
+#
+#   - KERNEL_SOURCE is a single CUDA C++ kernel `mandelbrot` that computes a
+#     smooth iteration count per pixel and looks up the color via
+#     tex1D<float4>(palette, t). Coordinates and the scale factor are doubles
+#     to support deep zooms; only the color lookup runs in single precision.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a
+#     texture onto a rectangle covering the entire window. Nothing interesting.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// Mandelbrot deep-zoom kernel with a TextureObject palette LUT.
+//
+// Each thread computes one pixel. Coordinates and scale are doubles so the
+// zoom doesn't quantize at modest depth. Once we have the smooth iteration
+// count we narrow to float and use tex1D<float4> to read the palette.
+
+extern "C"
+__global__
+void mandelbrot(cudaTextureObject_t palette,
+                unsigned char* output,
+                int width, int height,
+                double cx, double cy, double scale,
+                int max_iter) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Map pixel -> complex plane (doubles).
+    double c_re = cx + ((double)x - 0.5 * (double)width)  * scale;
+    double c_im = cy + ((double)y - 0.5 * (double)height) * scale;
+
+    // Standard escape iteration with bailout radius 2 (compare squared norm
+    // against 4 to skip the sqrt in the inner loop).
+    double zr = 0.0;
+    double zi = 0.0;
+    double zr2 = 0.0;
+    double zi2 = 0.0;
+    int iter = 0;
+    while (iter < max_iter && (zr2 + zi2) <= 4.0) {
+        zi = 2.0 * zr * zi + c_im;
+        zr = zr2 - zi2 + c_re;
+        zr2 = zr * zr;
+        zi2 = zi * zi;
+        ++iter;
+    }
+
+    unsigned char r, g, b;
+    if (iter >= max_iter) {
+        // Inside the set (or close enough): solid black.
+        r = 0;
+        g = 0;
+        b = 0;
+    } else {
+        // Smooth iteration count:
+        //   mu = iter + 1 - log(log(|z|)) / log(2)
+        //      = iter + 1 - log(0.5 * log(|z|^2)) / log(2)
+        // At escape, |z|^2 > 4, so 0.5 * log(|z|^2) > log(2) > 0 -- the
+        // outer log is well-defined. Compute in double, narrow to float
+        // for the palette lookup.
+        double log_zn = 0.5 * log(zr2 + zi2);
+        double nu = log(log_zn) / log(2.0);
+        float mu = (float)((double)(iter + 1) - nu);
+
+        // Cycle through the palette: 0.02 controls how quickly we wrap
+        // through the gradient as the iteration count climbs.
+        float t = fmodf(mu * 0.02f, 1.0f);
+        if (t < 0.0f) t += 1.0f;  // fmodf can return negative for negative mu
+
+        float4 rgba = tex1D<float4>(palette, t);
+
+        // Clamp before narrowing to bytes.
+        float fr = rgba.x; if (fr < 0.0f) fr = 0.0f; if (fr > 1.0f) fr = 1.0f;
+        float fg = rgba.y; if (fg < 0.0f) fg = 0.0f; if (fg > 1.0f) fg = 1.0f;
+        float fb = rgba.z; if (fb < 0.0f) fb = 0.0f; if (fb > 1.0f) fb = 1.0f;
+        r = (unsigned char)(fr * 255.0f);
+        g = (unsigned char)(fg * 255.0f);
+        b = (unsigned char)(fb * 255.0f);
+    }
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = r;
+    output[idx + 1] = g;
+    output[idx + 2] = b;
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_mipmap_lod.py b/cuda_core/examples/gl_interop_mipmap_lod.py
new file mode 100644
index 00000000000..9f71bad7a5c
--- /dev/null
+++ b/cuda_core/examples/gl_interop_mipmap_lod.py
@@ -0,0 +1,730 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates the new cuda.core texture/surface stack:
+# MipmappedArray, SurfaceObject, and a TextureObject that does trilinear
+# (LINEAR mipmap + LINEAR filter) sampling with user-controlled LOD bias.
+# Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# How to allocate a mipmap pyramid as a single MipmappedArray, populate each
+# level from a CUDA kernel by binding it as a SurfaceObject, and then sample
+# the whole pyramid from a TextureObject with manual LOD bias.
+#
+# How it works
+# ============
+# A mipmap pyramid is a stack of progressively-halved images of the same
+# texture. The base level (level 0) holds the highest-resolution version; each
+# subsequent level is a 2x2 box-filtered downsample of the level below it:
+#
+#     level 0: 512 x 512   <- highest detail
+#     level 1: 256 x 256
+#     level 2: 128 x 128
+#     ...
+#     level 9:   1 x 1     <- a single average color
+#
+# At sample time, the GPU picks the mip level that best matches the on-screen
+# size of the texel, optionally blending between adjacent levels (trilinear).
+# Selecting a coarser level than the "right" one is called a positive LOD bias
+# and produces a softer/blurrier image; a negative bias selects finer levels
+# (sharper but more aliased when undersampled).
+#
+#   +----------------------+       +-----------------------+
+#   |   MipmappedArray     |       |   TextureObject       |
+#   | (single allocation,  | <---  | (samples the whole    |
+#   |  10 mip levels)      |       |  pyramid w/ trilinear |
+#   +----------------------+       |  filtering)           |
+#         ^      ^                 +-----------------------+
+#         |      |
+#         |      +---- one SurfaceObject per level, used at BUILD time only
+#         |            to let a kernel write pixels into that level.
+#         |
+#         +----------- get_level(L) returns a NON-OWNING CUDAArray view of level L;
+#                      the storage belongs to the parent MipmappedArray.
+#
+#   STARTUP -- one-time mipmap build
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   1. Allocate MipmappedArray (10 levels, float4 RGBA, is_surface_load_store=True).
+#   2. Level 0: launch `seed_base` kernel -> SurfaceObject -> high-frequency
+#      procedural pattern.
+#   3. For L = 1..num_levels-1: launch `downsample` kernel:
+#        - reads level L-1 through a TextureObject (POINT-filtered)
+#        - writes level L   through a SurfaceObject
+#        - 4-sample box average of the parent's 2x2 footprint.
+#
+#   PER FRAME (render loop)
+#   ~~~~~~~~~~~~~~~~~~~~~~~
+#   The display TextureObject samples the whole pyramid with `tex2DLod`,
+#   where the LOD is computed per-pixel as `log2(zoom) + lod_bias`. The result
+#   is written to a GL PBO via GraphicsResource, then drawn as a textured quad.
+#
+# What you should see
+# ===================
+# A 512x512 procedural pattern (concentric rings + diagonal grid) shown
+# stretched across the window. Use the mouse wheel to zoom in/out (this
+# implicitly changes the LOD), and use the bracket keys `[` / `]` to add a
+# manual LOD bias on top of that. Press `R` to reset.
+#
+#   Mouse wheel       zoom in / out
+#   [                 LOD bias -= 0.25  (sharper, more aliased)
+#   ]                 LOD bias += 0.25  (blurrier, samples a coarser level)
+#   R                 reset zoom + bias
+#   Escape / close    quit
+#
+# The window title shows the current zoom, manual bias, and effective LOD.
+# Close the window or press Escape to exit.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    MipmappedArray,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Configuration (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 800
+HEIGHT = 600
+BASE_SIZE = 512  # Texture base-level edge length (must be a power of two).
+LOD_BIAS_STEP = 0.25
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA, OpenGL, and the mipmap pyramid. If you're
+# here to learn about MipmappedArray / SurfaceObject / mipmapped TextureObject,
+# you can skip straight to main() -- the interesting part is there. These
+# helpers exist so that main() reads like a short story.
+# ============================================================================
+
+
+def _check_compute_capability(dev):
+    """Exit with status 1 unless `dev` reports compute capability >= 3.0."""
+    cc = dev.compute_capability
+    if cc.major >= 3:
+        return
+    # Surface load/store and mipmapped arrays require sm_30 or newer.
+    message = f"This example requires compute capability >= 3.0, got sm_{cc.major}{cc.minor}."
+    print(message, file=sys.stderr)
+    sys.exit(1)
+
+
+def setup_cuda():
+    """Bring up device 0 and compile KERNEL_SOURCE into three kernels.
+
+    Returns
+    -------
+    (dev, stream, kernels, arch_str)
+        kernels is a dict with keys "seed_base", "downsample", "display";
+        arch_str is the "sm_XY" target the program was compiled for.
+    """
+    kernel_names = ("seed_base", "downsample", "display")
+
+    dev = Device(0)
+    dev.set_current()
+    _check_compute_capability(dev)
+    stream = dev.create_stream()
+
+    arch_str = f"sm_{dev.arch}"
+    prog = Program(
+        KERNEL_SOURCE,
+        code_type="c++",
+        options=ProgramOptions(std="c++17", arch=arch_str),
+    )
+    mod = prog.compile("cubin", name_expressions=kernel_names)
+    kernels = {name: mod.get_kernel(name) for name in kernel_names}
+    return dev, stream, kernels, arch_str
+
+
+def build_mipmap_pyramid(mip, num_levels, stream, kernels):
+    """Populate every level of `mip` using SurfaceObject writes.
+
+    Strategy
+    --------
+    * Level 0 is filled directly by `seed_base`, which writes a procedural
+      pattern through a SurfaceObject bound to level 0.
+    * Each subsequent level L is filled by `downsample`, which reads level L-1
+      through a POINT-filtered TextureObject and box-averages a 2x2 footprint
+      into level L through a SurfaceObject.
+    * All launches go to one stream, so the kernels serialize implicitly. The
+      stream is still synced before each `with` block exits, because launches
+      are asynchronous and a texture/surface handle must not be destroyed
+      while a kernel that uses it may still be running.
+    """
+    block = (16, 16, 1)  # same thread-block shape for every level
+
+    # ---- Level 0: seed the base image -------------------------------------
+    base_arr = mip.get_level(0)  # non-owning view; do NOT use a `with` block
+    with SurfaceObject.from_array(base_arr) as base_surf:
+        grid = (
+            (BASE_SIZE + block[0] - 1) // block[0],
+            (BASE_SIZE + block[1] - 1) // block[1],
+            1,
+        )
+        launch(
+            stream,
+            LaunchConfig(grid=grid, block=block),
+            kernels["seed_base"],
+            np.uint64(base_surf.handle),
+            np.int32(BASE_SIZE),
+            np.int32(BASE_SIZE),
+        )
+        stream.sync()  # seed_base must finish before base_surf is destroyed
+    # base_arr (non-owning) is allowed to fall out of scope here; the parent
+    # MipmappedArray keeps the underlying storage alive.
+
+    # ---- Levels 1..N-1: box-filter downsample ------------------------------
+    # Each iteration reads level (L-1) through a temporary TextureObject and
+    # writes level L through a temporary SurfaceObject. Both are closed at the
+    # end of their `with` block, after the kernel using them has completed.
+    src_tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.POINT,  # explicit per-texel reads
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=False,  # integer pixel coordinates
+    )
+    for level in range(1, num_levels):
+        parent_size = BASE_SIZE >> (level - 1)
+        level_size = BASE_SIZE >> level
+        if level_size < 1:
+            break
+
+        src_arr = mip.get_level(level - 1)
+        dst_arr = mip.get_level(level)
+        src_res = ResourceDescriptor.from_array(src_arr)
+        with (
+            TextureObject.from_descriptor(resource=src_res, texture_descriptor=src_tex_desc) as src_tex,
+            SurfaceObject.from_array(dst_arr) as dst_surf,
+        ):
+            grid = (
+                (level_size + block[0] - 1) // block[0],
+                (level_size + block[1] - 1) // block[1],
+                1,
+            )
+            launch(
+                stream,
+                LaunchConfig(grid=grid, block=block),
+                kernels["downsample"],
+                np.uint64(src_tex.handle),
+                np.uint64(dst_surf.handle),
+                np.int32(parent_size),
+                np.int32(level_size),
+            )
+            stream.sync()  # downsample must finish before the handles close
+        # src_arr, dst_arr (non-owning) fall out of scope; storage stays
+        # alive via the parent MipmappedArray.
+
+
+def create_window():
+    """Create the viewer window.
+
+    pyglet is imported here rather than at module scope so that a missing
+    install produces a readable hint and exit status 1 instead of a traceback.
+    Returns (window, gl_module, pyglet).
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as gl_module
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    title = "MipmappedArray Example - Mipmap LOD viewer"
+    # vsync=False: presentation is not throttled to the display refresh rate.
+    window = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
+    return window, gl_module, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Build the GL objects used to show one image per frame. Not CUDA-specific.
+
+    Creates a shader program, a VAO/VBO holding a fullscreen quad (two
+    triangles), and an empty RGBA8 texture that is refilled from a PBO.
+
+    Parameters
+    ----------
+    gl
+        The pyglet GL module (as returned by create_window()).
+    width, height
+        Size in texels of the display texture.
+
+    Returns
+    -------
+    (shader_program, vertex_array_id, texture_id)
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # ---- Shader program ----------------------------------------------------
+    shader_prog = ShaderProgram(
+        Shader(VERTEX_SHADER_SOURCE, "vertex"),
+        Shader(FRAGMENT_SHADER_SOURCE, "fragment"),
+    )
+
+    # ---- Quad geometry -----------------------------------------------------
+    # One (x, y, s, t) row per vertex: clip-space position + texture coordinate.
+    corners = [
+        (-1, -1, 0, 0),  # triangle 1: bottom-left
+        (1, -1, 1, 0),  #             bottom-right
+        (1, 1, 1, 1),  #             top-right
+        (-1, -1, 0, 0),  # triangle 2: bottom-left
+        (1, 1, 1, 1),  #             top-right
+        (-1, 1, 0, 1),  #             top-left
+    ]
+    quad_verts = np.array(corners, dtype=np.float32).ravel()
+    float_size = quad_verts.itemsize  # 4 bytes per float32
+    stride = 4 * float_size  # 4 floats per vertex
+    attribute_layout = (
+        (b"position", 0),  # first 2 floats of each vertex
+        (b"texcoord", 2 * float_size),  # last 2 floats of each vertex
+    )
+
+    # ---- Vertex array + vertex buffer --------------------------------------
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    for attrib_name, byte_offset in attribute_layout:
+        location = gl.glGetAttribLocation(shader_prog.id, attrib_name)
+        gl.glEnableVertexAttribArray(location)
+        gl.glVertexAttribPointer(location, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(byte_offset))
+
+    gl.glBindVertexArray(0)
+
+    # ---- Display texture ---------------------------------------------------
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    # Bilinear filtering for both minification and magnification.
+    for filter_param in (gl.GL_TEXTURE_MIN_FILTER, gl.GL_TEXTURE_MAG_FILTER):
+        gl.glTexParameteri(gl.GL_TEXTURE_2D, filter_param, gl.GL_LINEAR)
+    # Allocate storage only (data=None); pixels arrive later from the PBO.
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,  # mip level
+        gl.GL_RGBA8,  # internal format
+        width,
+        height,
+        0,  # border
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate an uninitialized RGBA8 PBO that CUDA writes and GL reads.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    size_in_bytes = width * height * 4  # 4 bytes per RGBA8 pixel
+    pbo_name = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo_name))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_name.value)
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo_name.value, size_in_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload the PBO's pixels into level 0 of the GL texture (GPU-to-GPU).
+
+    With a buffer bound to GL_PIXEL_UNPACK_BUFFER, the trailing `None` passed
+    to glTexSubImage2D is an offset of 0 into that buffer, not "no data".
+    """
+    unpack = gl.GL_PIXEL_UNPACK_BUFFER
+    target = gl.GL_TEXTURE_2D
+    level, x_offset, y_offset = 0, 0, 0  # whole image, base level
+
+    gl.glBindBuffer(unpack, pbo_id)
+    gl.glBindTexture(target, tex_id)
+    gl.glTexSubImage2D(
+        target, level, x_offset, y_offset, width, height, gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None
+    )
+    gl.glBindBuffer(unpack, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Render `tex_id` over the whole viewport as two triangles (6 vertices)."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindVertexArray(vao_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Build the mipmap pyramid, then run the interactive pyglet LOD viewer."""
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, _arch = setup_cuda()
+
+    # --- Step 2: Allocate the mipmap pyramid and build every level ---
+    #     is_surface_load_store=True is required for kernel-side writes.
+    num_levels = int(math.log2(BASE_SIZE)) + 1
+    mip = MipmappedArray.from_descriptor(
+        shape=(BASE_SIZE, BASE_SIZE),
+        format=ArrayFormat.FLOAT32,
+        num_channels=4,
+        num_levels=num_levels,
+        is_surface_load_store=True,
+    )
+    build_mipmap_pyramid(mip, num_levels, stream, kernels)
+
+    # --- Step 3: Bind the WHOLE pyramid as a trilinear-filtered texture ---
+    #     Normalized coordinates (0..1) make zoom-by-uv simple. The descriptor's
+    #     mipmap_level_bias stays 0.0; the display kernel instead receives the
+    #     user-controlled bias as an argument and folds it into the tex2DLod
+    #     call, so the TextureObject is not rebuilt whenever the bias changes.
+    display_tex_desc = TextureDescriptor(
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=True,
+        mipmap_filter_mode=FilterMode.LINEAR,  # trilinear
+        mipmap_level_bias=0.0,
+        min_mipmap_level_clamp=0.0,
+        max_mipmap_level_clamp=float(num_levels - 1),
+    )
+    display_tex = TextureObject.from_descriptor(
+        resource=ResourceDescriptor.from_mipmapped_array(mip),
+        texture_descriptor=display_tex_desc,
+    )
+
+    # --- Step 4: Open a window and set up the GL/CUDA bridge ---
+    window, gl, pyglet = create_window()
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 5: Render loop state ---
+    # `zoom` scales the UV span shown in the window: zoom > 1 fits more of the
+    # (wrapped) texture on screen, so it looks smaller and coarser mip levels
+    # are selected (positive LOD); zoom < 1 magnifies it and selects finer
+    # levels. `lod_bias` is a manual offset added on top.
+    state = {"zoom": 1.0, "lod_bias": 0.0}
+    start_time = time.monotonic()
+    frame_count = [0]
+    fps_time = [start_time]
+
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+
+    def effective_lod():
+        # Same formula the display kernel uses, clamped to the legal range so
+        # the window title matches what the GPU actually sees.
+        raw = math.log2(max(state["zoom"], 1e-6)) + state["lod_bias"]
+        return max(0.0, min(float(num_levels - 1), raw))
+
+    @window.event
+    def on_draw():
+        window.clear()
+
+        # (a) Map the PBO so CUDA can write into it.
+        with resource.map(stream=stream) as buf:
+            # (b) Launch the display kernel -- samples the mipmap and writes RGBA.
+            launch(
+                stream,
+                config,
+                kernels["display"],
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.uint64(display_tex.handle),
+                np.float32(state["zoom"]),
+                np.float32(state["lod_bias"]),
+                np.float32(float(num_levels - 1)),
+            )
+        # (c) Unmap happens automatically; cuGraphicsUnmapResources serializes
+        #     the CUDA work against subsequent OpenGL use.
+
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        frame_count[0] += 1
+        now = time.monotonic()
+        if now - fps_time[0] >= 1.0:
+            fps = frame_count[0] / (now - fps_time[0])
+            window.set_caption(
+                f"MipmappedArray LOD viewer "
+                f"({WIDTH}x{HEIGHT}, {fps:.0f} FPS) -- "
+                f"zoom={state['zoom']:.2f}, "
+                f"bias={state['lod_bias']:+.2f}, "
+                f"LOD={effective_lod():.2f}"
+            )
+            frame_count[0] = 0
+            fps_time[0] = now
+
+    @window.event
+    def on_mouse_scroll(_x, _y, _scroll_x, scroll_y):
+        # Each wheel step scales zoom by 1.125x; the result is clamped to [1/64, 64].
+        if scroll_y == 0:
+            return
+        factor = 1.125**scroll_y
+        state["zoom"] = max(1.0 / 64.0, min(64.0, state["zoom"] * factor))
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.BRACKETLEFT:
+            state["lod_bias"] = max(-float(num_levels), state["lod_bias"] - LOD_BIAS_STEP)
+        elif symbol == key.BRACKETRIGHT:
+            state["lod_bias"] = min(float(num_levels), state["lod_bias"] + LOD_BIAS_STEP)
+        elif symbol == key.R:
+            state["zoom"] = 1.0
+            state["lod_bias"] = 0.0
+
+    @window.event
+    def on_close():
+        # Release CUDA-side resources in reverse construction order. GL
+        # objects clean up via pyglet on window close.
+        resource.close()
+        display_tex.close()
+        mip.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# Three CUDA kernels are concatenated into one program string so they share a
+# single NVRTC compile. All three operate on float4 RGBA pixels.
+#
+#   seed_base   -- writes a high-frequency procedural pattern to level 0 via a
+#                  SurfaceObject. NOTE: surf2Dwrite's x-coordinate is in BYTES,
+#                  not in elements, so we multiply by sizeof(float4) every time.
+#
+#   downsample  -- reads level L-1 through a POINT-filtered TextureObject and
+#                  writes the 2x2 box average to level L through a SurfaceObject.
+#                  tex2D with non-normalized coords needs the +0.5 half-texel
+#                  offset to hit exact texel centers.
+#
+#   display     -- samples the WHOLE mipmap pyramid with tex2DLod, where the
+#                  per-thread LOD is `clamp(log2(zoom) + lod_bias, 0, maxLod)`.
+#                  Writes 8-bit RGBA into the PBO.
+#
+# GLSL shaders at the very bottom just draw a textured quad. Nothing CUDA-
+# specific there.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// --------------------------------------------------------------------------
+// Helper: clamp a float to [a, b].
+// --------------------------------------------------------------------------
+__device__ __forceinline__ float clampf(float v, float a, float b) {
+    return fminf(fmaxf(v, a), b);
+}
+
+// CUDA does not ship a builtin "fract" so we provide one (used by seed_base).
+__device__ __forceinline__ float fracf(float v) {
+    return v - floorf(v);
+}
+
+// --------------------------------------------------------------------------
+// seed_base: write a procedural high-frequency pattern to level 0.
+//
+// surf is a SurfaceObject bound to the level-0 CUDAArray (float4 RGBA). The
+// pattern is a colorful blend of concentric rings, a diagonal grid, and a
+// radial sweep, designed to have plenty of fine detail so the difference
+// between mip levels is visually obvious.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void seed_base(cudaSurfaceObject_t surf, int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float u = ((float)x + 0.5f) / (float)width;
+    float v = ((float)y + 0.5f) / (float)height;
+
+    // Concentric rings centered on the image.
+    float cx = u - 0.5f;
+    float cy = v - 0.5f;
+    float r = sqrtf(cx * cx + cy * cy);
+    float rings = 0.5f + 0.5f * sinf(r * 80.0f);
+
+    // Diagonal grid -- thin lines about every 1/16 of the image.
+    float gx = fabsf(fracf(u * 16.0f) - 0.5f);
+    float gy = fabsf(fracf(v * 16.0f) - 0.5f);
+    float grid = (gx < 0.05f || gy < 0.05f) ? 1.0f : 0.0f;
+
+    // Angular sweep gives the rings some color variation.
+    float theta = atan2f(cy, cx);
+    float sweep = 0.5f + 0.5f * sinf(theta * 6.0f);
+
+    // Combine into an RGBA color. Keep values in [0, 1].
+    float red   = clampf(rings * (0.4f + 0.6f * sweep) + 0.3f * grid, 0.0f, 1.0f);
+    float green = clampf(rings * (0.6f - 0.4f * sweep) + 0.3f * grid, 0.0f, 1.0f);
+    float blue  = clampf(0.4f + 0.4f * sweep + 0.5f * grid,            0.0f, 1.0f);
+    float alpha = 1.0f;
+
+    float4 px = make_float4(red, green, blue, alpha);
+
+    // Surface writes index x in BYTES (this is the classic gotcha).
+    surf2Dwrite<float4>(px, surf, x * (int)sizeof(float4), y);
+}
+
+// --------------------------------------------------------------------------
+// downsample: box-filter a 2x2 footprint of the parent level into one texel.
+//
+// src is a POINT-filtered TextureObject bound to level (L-1).
+// dst is a SurfaceObject bound to level L.
+// dst_size is the edge length of the (square) level L and bounds the grid.
+// src_size (the edge length of level L-1, i.e. 2 * dst_size) is accepted for
+// symmetry at the call site only; the kernel never reads it.
+//
+// Texture coordinates: tex2D with non-normalized coords returns texel (i, j)
+// when sampled at (i + 0.5, j + 0.5). So for output texel (x, y) the four
+// parent texels live at parent-coords (2x + 0.5, 2y + 0.5), (2x + 1.5, ...).
+// --------------------------------------------------------------------------
+extern "C" __global__
+void downsample(cudaTextureObject_t src,
+                cudaSurfaceObject_t dst,
+                int src_size,
+                int dst_size) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= dst_size || y >= dst_size) return;
+
+    float fx = 2.0f * (float)x;
+    float fy = 2.0f * (float)y;
+
+    float4 a = tex2D<float4>(src, fx + 0.5f, fy + 0.5f);
+    float4 b = tex2D<float4>(src, fx + 1.5f, fy + 0.5f);
+    float4 c = tex2D<float4>(src, fx + 0.5f, fy + 1.5f);
+    float4 d = tex2D<float4>(src, fx + 1.5f, fy + 1.5f);
+
+    float4 px;
+    px.x = 0.25f * (a.x + b.x + c.x + d.x);
+    px.y = 0.25f * (a.y + b.y + c.y + d.y);
+    px.z = 0.25f * (a.z + b.z + c.z + d.z);
+    px.w = 0.25f * (a.w + b.w + c.w + d.w);
+
+    // Silence unused-variable warning for the convenience parameter.
+    (void)src_size;
+
+    surf2Dwrite<float4>(px, dst, x * (int)sizeof(float4), y);
+}
+
+// --------------------------------------------------------------------------
+// display: per-pixel mipmap sample with manual LOD bias.
+//
+// tex is a TextureObject built from the whole MipmappedArray (LINEAR +
+// LINEAR mipmap filter, normalized coords). For each output pixel we compute
+// a single per-thread LOD from `zoom` and `lod_bias`, then sample with
+// tex2DLod. Output is written as RGBA8 into a linear byte buffer.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void display(unsigned char *output,
+             int width,
+             int height,
+             cudaTextureObject_t tex,
+             float zoom,
+             float lod_bias,
+             float max_lod) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Normalized window coords in [0, 1].
+    float u = ((float)x + 0.5f) / (float)width;
+    float v = ((float)y + 0.5f) / (float)height;
+
+    // Zoom around the window center so the user sees the effect symmetrically.
+    u = (u - 0.5f) * zoom + 0.5f;
+    v = (v - 0.5f) * zoom + 0.5f;
+
+    // LOD: zoom multiplies the UV span, so zoom > 1 squeezes more texels under
+    // each screen pixel (the texture looks smaller / minified), which calls
+    // for a coarser (higher) mip level. log2(zoom) yields exactly that.
+    // lod_bias is added on top, and the result is clamped to the legal range.
+    float lod = log2f(fmaxf(zoom, 1e-6f)) + lod_bias;
+    lod = clampf(lod, 0.0f, max_lod);
+
+    float4 c = tex2DLod<float4>(tex, u, v, lod);
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(clampf(c.x, 0.0f, 1.0f) * 255.0f);
+    output[idx + 1] = (unsigned char)(clampf(c.y, 0.0f, 1.0f) * 255.0f);
+    output[idx + 2] = (unsigned char)(clampf(c.z, 0.0f, 1.0f) * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_ocean.py b/cuda_core/examples/gl_interop_ocean.py
new file mode 100644
index 00000000000..2e01dd9cccf
--- /dev/null
+++ b/cuda_core/examples/gl_interop_ocean.py
@@ -0,0 +1,866 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop. A real-time
+# Gerstner-wave ocean is rebuilt every frame: a heightmap CUDAArray is rewritten
+# through a SurfaceObject, sampled through a TextureObject with LINEAR + WRAP
+# filtering for normal estimation, and shaded with Phong + Fresnel sky
+# reflection straight into an OpenGL PBO. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to use a CUDAArray as a typed heightmap that is written by one
+#   kernel (via SurfaceObject) and then sampled by another (via
+#   TextureObject) within the same frame.
+# - How LINEAR filtering + WRAP addressing + normalized coordinates gives
+#   essentially-free bilinear neighbor lookups for finite-difference normal
+#   estimation on a tiling heightmap.
+# - How to compose CUDAArray/TextureObject/SurfaceObject with GraphicsResource so
+#   the entire render path never leaves the GPU.
+#
+# How it works
+# ============
+# Gerstner waves are a sum of N moving sinusoids with directional vectors --
+# a classic ocean approximation that looks shockingly close to FFT ocean at a
+# glance without any external library dependencies. For each heightmap texel:
+#
+#     h(x, z, t) = sum_i  A_i * sin( D_i . (x, z) * k_i  -  w_i * t  +  phi_i )
+#
+# where k_i = 2*pi / wavelength_i and w_i = sqrt(g * k_i) is the dispersion
+# relation for deep-water gravity waves. We bake 12 waves with hand-picked
+# directions / wavelengths / amplitudes / phases into the kernel as constant
+# arrays. Weather presets just scale amplitude and speed at the host level.
+#
+#   PER FRAME (all on GPU)
+#   ~~~~~~~~~~~~~~~~~~~~~~
+#   +-----------------+   surf2Dwrite   +--------------+
+#   |   update_height | --------------> |  heightmap   |
+#   |     kernel      |                 |  CUDAArray   |
+#   +-----------------+                 |  (FLOAT32)   |
+#                                       +--------------+
+#                                              |
+#                                              | tex2D<float> (LINEAR + WRAP)
+#                                              v
+#                                       +-----------------+    write RGBA8
+#                                       |  render_ocean   | ----------------> PBO
+#                                       |     kernel      |
+#                                       +-----------------+
+#
+# Why LINEAR + WRAP + normalized coords?
+# --------------------------------------
+# WRAP / MIRROR addressing modes require normalized coordinates (see the CUDA
+# Programming Guide). The ocean naturally tiles, so WRAP gives free seamless
+# horizon repetition. LINEAR filtering means our four-tap finite-difference
+# normal estimate gets bilinear interpolation between texels for free, which
+# smooths the lighting noticeably without a single extra ALU instruction.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# surf2Dwrite takes the x coordinate in BYTES, not in elements. For a
+# single-channel float surface that means `x * sizeof(float)` = `x * 4`.
+# Getting this wrong silently corrupts every other column.
+#
+# What you should see
+# ===================
+# A window showing a real-time animated ocean rendered with Phong shading and
+# a Fresnel-modulated sky reflection. Drag with the left mouse button to
+# orbit, scroll to zoom, press 1/2/3 to switch weather presets (calm /
+# breezy / stormy), press P to pause animation, Escape to exit. Window title
+# shows preset name and FPS.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import math
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Window and heightmap dimensions (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 1024  # window / output image width in pixels
+HEIGHT = 768  # window / output image height in pixels
+GRID = 512  # heightmap resolution (GRID x GRID texels)
+
+# Weather presets: (amplitude_scale, speed_scale, label).
+# These are applied as multiplicative scalars on top of the per-wave amplitudes
+# and wavelength-derived angular frequencies baked into the kernel, so a single
+# compiled binary can render every preset.
+PRESETS = {
+    "1": (0.35, 0.7, "calm"),
+    "2": (1.00, 1.0, "breezy"),
+    "3": (1.85, 1.4, "stormy"),
+}
+DEFAULT_PRESET = "2"  # key into PRESETS selected at startup
+
+# Initial camera (orbit-around-origin) parameters.
+INITIAL_YAW = 0.6  # radians around world-y
+INITIAL_PITCH = 0.35  # radians above the horizon (small positive = looking down)
+INITIAL_DISTANCE = 5.0  # camera distance from origin
+PITCH_LIMIT = 1.4  # clamp |pitch| to keep basis non-degenerate (< pi/2)
+ZOOM_MIN = 1.5  # closest camera distance reachable by scrolling
+ZOOM_MAX = 30.0  # farthest camera distance reachable by scrolling
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Compile both kernels and return (device, stream, kernels, configs).
+
+    `kernels` and `configs` are dicts that share the same two keys:
+      - "update": update_height, one thread per heightmap texel (GRID x GRID).
+      - "render": render_ocean, one thread per output pixel (WIDTH x HEIGHT).
+
+    Exits with status 1 if device 0 has compute capability below 3.0.
+    """
+    dev = Device(0)
+    dev.set_current()
+
+    # Surface load/store itself dates back to SM 2.0, but the bindless surface
+    # objects used here (cuSurfObjectCreate) need SM 3.0 or newer.
+    capability = dev.compute_capability
+    if capability.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{capability.major}{capability.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Compile as C++ so that the templated tex2D<float> overload resolves.
+    options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    program = Program(KERNEL_SOURCE, code_type="c++", options=options)
+    entry_points = ("update_height", "render_ocean")
+    compiled = program.compile("cubin", name_expressions=entry_points)
+
+    kernels = {
+        "update": compiled.get_kernel("update_height"),
+        "render": compiled.get_kernel("render_ocean"),
+    }
+
+    # One 16x16 thread block shape for both kernels; grids are rounded up so
+    # that partially covered edge blocks still launch (the kernels bounds-check).
+    block = (16, 16, 1)
+
+    def blocks_needed(extent, threads):
+        """Ceiling division: blocks required to cover `extent` threads."""
+        return (extent + threads - 1) // threads
+
+    update_grid = (blocks_needed(GRID, block[0]), blocks_needed(GRID, block[1]), 1)
+    render_grid = (blocks_needed(WIDTH, block[0]), blocks_needed(HEIGHT, block[1]), 1)
+
+    configs = {
+        "update": LaunchConfig(grid=update_grid, block=block),
+        "render": LaunchConfig(grid=render_grid, block=block),
+    }
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Import pyglet on demand and open the demo window.
+
+    Returns (window, gl_module, pyglet). If pyglet cannot be imported, an
+    install hint is printed to stderr and the process exits with status 1.
+    """
+    try:
+        import pyglet
+        from pyglet.gl import gl as gl_module
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    # The window matches the render resolution; vsync is off so the frame
+    # rate shown in the title is not capped by the display refresh.
+    title = "cuda.core CUDAArray/Texture/Surface - Gerstner Ocean"
+    window = pyglet.window.Window(WIDTH, HEIGHT, caption=title, vsync=False)
+    return window, gl_module, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Build the GL objects used to present one RGBA8 image on screen.
+
+    Plain OpenGL boilerplate with nothing CUDA-specific in it. Returns
+    (shader_program, vao_id, tex_id); shader_program is a pyglet
+    ShaderProgram object and must be kept alive by the caller.
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Compile the two GLSL stages and link them into one program.
+    vertex_stage = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    fragment_stage = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vertex_stage, fragment_stage)
+
+    # Fullscreen quad as two triangles. Each vertex is four floats laid out as
+    # (x, y, u, v): clip-space position followed by texture coordinate.
+    # fmt: off
+    quad_verts = np.array(
+        [
+            -1, -1, 0, 0,
+             1, -1, 1, 0,
+             1,  1, 1, 1,
+            -1, -1, 0, 0,
+             1,  1, 1, 1,
+            -1,  1, 0, 1,
+        ],
+        dtype=np.float32,
+    )
+    # fmt: on
+
+    # Vertex array object: records the attribute bindings configured below.
+    vao_id = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao_id))
+    gl.glBindVertexArray(vao_id.value)
+
+    # Vertex buffer: the quad is uploaded once with a STATIC_DRAW usage hint.
+    vbo_id = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo_id))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo_id.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    # Interleaved layout: 4 floats (16 bytes) per vertex, with the position at
+    # byte offset 0 and the texcoord at byte offset 8.
+    float_size = 4
+    stride = 4 * float_size
+    for attrib_name, byte_offset in ((b"position", 0), (b"texcoord", 2 * float_size)):
+        location = gl.glGetAttribLocation(shader_prog.id, attrib_name)
+        gl.glEnableVertexAttribArray(location)
+        gl.glVertexAttribPointer(
+            location,
+            2,
+            gl.GL_FLOAT,
+            gl.GL_FALSE,
+            stride,
+            ctypes.c_void_p(byte_offset),
+        )
+    # Unbind the VAO now that its attribute state has been set up.
+    gl.glBindVertexArray(0)
+
+    # Display texture: LINEAR min/mag filtering and RGBA8 storage of the
+    # requested size. Passing None allocates it without initial pixel data.
+    tex_id = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex_id))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id.value)
+    for filter_param in (gl.GL_TEXTURE_MIN_FILTER, gl.GL_TEXTURE_MAG_FILTER):
+        gl.glTexParameteri(gl.GL_TEXTURE_2D, filter_param, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,  # mip level
+        gl.GL_RGBA8,  # internal storage format
+        width,
+        height,
+        0,  # border
+        gl.GL_RGBA,  # layout the source pixels would have
+        gl.GL_UNSIGNED_BYTE,  # type the source pixels would have
+        None,
+    )
+
+    return shader_prog, vao_id.value, tex_id.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate an uninitialized PBO for one RGBA8 frame; return (pbo_id, nbytes)."""
+    frame_bytes = width * height * 4  # 4 bytes per RGBA8 pixel
+    handle = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(handle))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, handle.value)
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, frame_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return handle.value, frame_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload the PBO contents into the GL texture.
+
+    Source and destination are both GL objects, so this is a GPU-to-GPU copy.
+    """
+    unpack = gl.GL_PIXEL_UNPACK_BUFFER
+    gl.glBindBuffer(unpack, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    # While a buffer is bound to GL_PIXEL_UNPACK_BUFFER, the trailing data
+    # argument is an offset into that buffer, so None reads from its start.
+    mip_level, x_offset, y_offset = 0, 0, 0
+    pixel_layout = (gl.GL_RGBA, gl.GL_UNSIGNED_BYTE)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D, mip_level, x_offset, y_offset, width, height, *pixel_layout, None
+    )
+    gl.glBindBuffer(unpack, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw `tex_id` with `shader_prog` using the quad stored in `vao_id`."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = 2 triangles
+    gl.glBindVertexArray(0)  # leave no VAO bound
+    gl.glUseProgram(0)  # leave no program bound
+
+
+def make_heightmap_array():
+    """Allocate the GRID x GRID, one-channel FLOAT32 array that stores wave heights."""
+    side = GRID
+    return CUDAArray.from_descriptor(
+        shape=(side, side), format=ArrayFormat.FLOAT32, num_channels=1,
+        # Surface load/store must be enabled for the SurfaceObject write path.
+        is_surface_load_store=True,
+    )
+
+
+def make_height_texture(arr):
+    """Create a TextureObject over `arr`: LINEAR filter, WRAP addressing, normalized coords."""
+    resource = ResourceDescriptor.from_array(arr)
+    # WRAP/MIRROR addressing requires normalized coordinates, hence both here.
+    sampling = TextureDescriptor(
+        normalized_coords=True,
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+    )
+    return TextureObject.from_descriptor(resource=resource, texture_descriptor=sampling)
+
+
+def orbit_camera_position(yaw, pitch, distance):
+    """Return the world-space (x, y, z) of a camera orbiting the origin.
+
+    World up is +y. `yaw` rotates about the y axis (yaw=0 puts the camera on
+    +z); `pitch` is the elevation above the xz-plane, so pitch=0 is at the
+    horizon and pitch=+1.4 is almost straight overhead. `distance` is the
+    orbit radius.
+    """
+    # Radius of the orbit circle after projecting onto the xz-plane.
+    ground_radius = distance * math.cos(pitch)
+    return (
+        ground_radius * math.sin(yaw),
+        distance * math.sin(pitch),
+        ground_radius * math.cos(yaw),
+    )
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Run the interactive ocean demo until the window is closed.
+
+    Sets up CUDA, a pyglet window, the GL display objects and the
+    CUDA-registered PBO, allocates the heightmap CUDAArray with its
+    texture/surface views, then installs the pyglet event handlers and enters
+    the pyglet event loop. Every frame launches update_height and then
+    render_ocean on one stream and presents the PBO through a GL texture.
+    """
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the heightmap CUDAArray and build its texture/surface ---
+    #     We pre-create both the TextureObject (read path) and the
+    #     SurfaceObject (write path) once and reuse them every frame. Creating
+    #     them inside the per-frame loop would work but adds per-frame overhead
+    #     and risks lifetime issues with async kernel launches.
+    height_arr = make_heightmap_array()
+    height_tex = make_height_texture(height_arr)
+    height_surf = SurfaceObject.from_array(height_arr)
+
+    # --- Step 7: Camera + animation state ---
+    state = {
+        "preset": DEFAULT_PRESET,
+        "yaw": INITIAL_YAW,
+        "pitch": INITIAL_PITCH,
+        "distance": INITIAL_DISTANCE,
+        "paused": False,
+        "t_anim": 0.0,
+        "t_prev": time.monotonic(),
+    }
+
+    # --- Step 8: Render loop ---
+    frame_count = 0
+    fps_time = state["t_prev"]
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+
+        # Advance animation time only when not paused, so pausing freezes the
+        # ocean exactly where it was rather than letting it lurch when resumed.
+        now = time.monotonic()
+        dt = now - state["t_prev"]
+        state["t_prev"] = now
+        if not state["paused"]:
+            state["t_anim"] += dt
+        t = state["t_anim"]
+
+        amp_scale, speed_scale, label = PRESETS[state["preset"]]
+
+        # (a) Rebuild the heightmap for time t.
+        launch(
+            stream,
+            configs["update"],
+            kernels["update"],
+            np.uint64(height_surf.handle),
+            np.int32(GRID),
+            np.int32(GRID),
+            np.float32(t),
+            np.float32(amp_scale),
+            np.float32(speed_scale),
+        )
+
+        # (b) Render the scene: sample the heightmap through the texture,
+        #     estimate normals via finite differences, shade with Phong +
+        #     Fresnel sky reflection, write RGBA8 into the OpenGL PBO. Both
+        #     launches share `stream`, so render_ocean runs after update_height.
+        cam_x, cam_y, cam_z = orbit_camera_position(state["yaw"], state["pitch"], state["distance"])
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["render"],
+                kernels["render"],
+                np.uint64(height_tex.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.float32(cam_x),
+                np.float32(cam_y),
+                np.float32(cam_z),
+                np.float32(t),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (c) PBO -> GL texture (GPU-to-GPU).
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (d) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title), refreshed about once a second.
+        frame_count += 1
+        elapsed = now - fps_time
+        if elapsed >= 1.0:
+            fps = frame_count / elapsed
+            paused = " [paused]" if state["paused"] else ""
+            window.set_caption(
+                "cuda.core CUDAArray/Texture/Surface - Gerstner Ocean"
+                f" [{label}]{paused} ({WIDTH}x{HEIGHT}, {fps:.0f} FPS)"
+            )
+            frame_count = 0
+            fps_time = now
+
+    # --- Mouse: drag to orbit, scroll to zoom ------------------------------
+    # pyglet passes the mask of held buttons to on_mouse_drag, so no separate
+    # press/release bookkeeping is needed to know whether LEFT is down.
+    @window.event
+    def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
+        if not (buttons & pyglet.window.mouse.LEFT):
+            return
+        # Rotate yaw on horizontal drag, pitch on vertical drag. The yaw
+        # direction matches the camera moving with the cursor.
+        state["yaw"] -= dx * 0.005
+        # Clamp pitch to keep the camera basis non-degenerate (never look
+        # straight down/up the world-y axis).
+        state["pitch"] = max(-PITCH_LIMIT, min(PITCH_LIMIT, state["pitch"] - dy * 0.005))
+
+    @window.event
+    def on_mouse_scroll(_x, _y, _scroll_x, scroll_y):
+        # Geometric zoom in camera distance; clamp to a sensible range.
+        factor = 1.1 ** (-scroll_y)
+        new_d = state["distance"] * factor
+        state["distance"] = max(ZOOM_MIN, min(ZOOM_MAX, new_d))
+
+    # --- Keyboard: 1/2/3 weather presets, P pauses, Escape exits ----------
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            # Route Escape through on_close so the CUDA resources are released
+            # before the window (and its GL context) is destroyed. Returning
+            # True marks the event handled, so pyglet's default Escape handler
+            # does not dispatch on_close a second time.
+            window.dispatch_event("on_close")
+            return True
+        if symbol == key.P:
+            state["paused"] = not state["paused"]
+            return None
+        preset = {key._1: "1", key._2: "2", key._3: "3"}.get(symbol)
+        if preset is not None:
+            state["preset"] = preset
+        return None
+
+    @window.event
+    def on_close():
+        # on_draw launches kernels asynchronously, so wait for the stream to
+        # drain before destroying anything those kernels may still be using.
+        stream.sync()
+        # Release CUDA resources in reverse order of creation. Returning
+        # nothing lets pyglet's default on_close handler close the window.
+        height_surf.close()
+        height_tex.close()
+        height_arr.close()
+        resource.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# KERNEL_SOURCE contains two CUDA C++ kernels:
+#   - update_height: per-heightmap-texel. Sums 12 Gerstner waves and writes
+#                    one float per texel via SurfaceObject.
+#   - render_ocean:  per-screen-pixel. Builds a camera ray, intersects the
+#                    ocean plane (y=0), samples the heightmap via
+#                    TextureObject (LINEAR + WRAP), estimates the normal via
+#                    finite differences, and shades with Phong + Fresnel sky
+#                    reflection. Misses go to a vertical sky gradient.
+#
+# VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are plain GLSL that draws a
+# texture on a fullscreen quad -- nothing CUDA-specific.
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// ---------------------------------------------------------------------------
+// Wave bank: 12 Gerstner-ish waves with hand-picked parameters.
+//
+// Wavelengths span 0.05 .. 1.0 world units. Amplitudes decrease with
+// frequency so that long swells dominate and short ripples ride on top
+// (a rough Phillips/JONSWAP-style envelope, but coarsely hand-tuned for
+// visual punch rather than physical accuracy).
+//
+// Directions are spread non-uniformly around the unit circle to avoid the
+// streaky-grid look you get from evenly-spaced directions.
+// ---------------------------------------------------------------------------
+__constant__ float c_dirx[12] = {
+    1.000f,  0.866f,  0.500f,  0.000f, -0.500f, -0.866f,
+   -1.000f, -0.940f, -0.500f,  0.174f,  0.643f,  0.940f
+};
+__constant__ float c_dirz[12] = {
+    0.000f,  0.500f,  0.866f,  1.000f,  0.866f,  0.500f,
+    0.000f,  0.342f,  0.866f,  0.985f,  0.766f,  0.342f
+};
+__constant__ float c_wavelen[12] = {
+    1.000f, 0.730f, 0.520f, 0.380f, 0.260f, 0.190f,
+    0.140f, 0.105f, 0.085f, 0.070f, 0.058f, 0.050f
+};
+__constant__ float c_amp[12] = {
+    0.080f, 0.060f, 0.045f, 0.034f, 0.025f, 0.018f,
+    0.013f, 0.010f, 0.0075f, 0.0055f, 0.0040f, 0.0030f
+};
+__constant__ float c_phase[12] = {
+    0.00f, 1.20f, 2.10f, 0.40f, 3.70f, 5.10f,
+    2.65f, 4.85f, 1.55f, 6.05f, 3.20f, 0.95f
+};
+
+// Deep-water dispersion: w = sqrt(g * k), with k = 2*pi / wavelength.
+__device__ __forceinline__ float angular_freq(float wavelength) {
+    const float G = 9.81f;
+    float k = 6.2831853f / wavelength;
+    return sqrtf(G * k);
+}
+
+// World extent (in world units) covered by one tile of the heightmap.
+// The heightmap WRAPs, so the ocean tiles seamlessly every TILE world units.
+__device__ __forceinline__ float tile_extent() { return 4.0f; }
+
+// ---------------------------------------------------------------------------
+// Tiny vec3 helpers. Kept inline + __forceinline__ so they stay free.
+// ---------------------------------------------------------------------------
+struct V3 { float x, y, z; };
+
+__device__ __forceinline__ V3 v3(float x, float y, float z) {
+    V3 r; r.x = x; r.y = y; r.z = z; return r;
+}
+__device__ __forceinline__ V3 v_add(V3 a, V3 b) {
+    return v3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+__device__ __forceinline__ V3 v_sub(V3 a, V3 b) {
+    return v3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+__device__ __forceinline__ V3 v_scale(V3 a, float s) {
+    return v3(a.x * s, a.y * s, a.z * s);
+}
+__device__ __forceinline__ float v_dot(V3 a, V3 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+__device__ __forceinline__ V3 v_cross(V3 a, V3 b) {
+    return v3(a.y * b.z - a.z * b.y,
+              a.z * b.x - a.x * b.z,
+              a.x * b.y - a.y * b.x);
+}
+__device__ __forceinline__ V3 v_normalize(V3 a) {
+    float inv = rsqrtf(fmaxf(v_dot(a, a), 1e-20f));
+    return v_scale(a, inv);
+}
+
+// ---------------------------------------------------------------------------
+// update_height: each thread computes one heightmap texel.
+//
+// Sums the 12 Gerstner waves at world position (x, z), using the
+// amplitude_scale and speed_scale knobs to switch between weather presets
+// without recompiling the kernel. Writes one float via surf2Dwrite.
+// ---------------------------------------------------------------------------
+extern "C" __global__
+void update_height(cudaSurfaceObject_t surf,
+                   int width, int height,
+                   float t,
+                   float amp_scale, float speed_scale) {
+    int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    if (ix >= width || iy >= height) return;
+
+    // Map texel (ix, iy) to world position (x, z) inside one tile.
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float te = tile_extent();
+    float wx = ((float)ix + 0.5f) * inv_w * te;
+    float wz = ((float)iy + 0.5f) * inv_h * te;
+
+    float h = 0.0f;
+    #pragma unroll
+    for (int i = 0; i < 12; ++i) {
+        float k = 6.2831853f / c_wavelen[i];
+        float w = angular_freq(c_wavelen[i]) * speed_scale;
+        float arg = (c_dirx[i] * wx + c_dirz[i] * wz) * k - w * t + c_phase[i];
+        h += c_amp[i] * sinf(arg);
+    }
+    h *= amp_scale;
+
+    // Single-channel float surface: byte offset is x * sizeof(float).
+    surf2Dwrite(h, surf, ix * (int)sizeof(float), iy);
+}
+
+// ---------------------------------------------------------------------------
+// Sample the heightmap at a world position. Texture is normalized + WRAP,
+// so we just divide world coords by tile_extent. WRAP gives us the tiling
+// for free at the horizon.
+// ---------------------------------------------------------------------------
+__device__ __forceinline__ float sample_height(cudaTextureObject_t tex,
+                                               float wx, float wz) {
+    float inv_te = 1.0f / tile_extent();
+    return tex2D<float>(tex, wx * inv_te, wz * inv_te);
+}
+
+// ---------------------------------------------------------------------------
+// Sky gradient: a vertical interpolation from a soft horizon to a deeper
+// overhead blue. `up_angle` is in [-1, 1] (the y component of the ray dir).
+// ---------------------------------------------------------------------------
+__device__ __forceinline__ V3 sky_color(float up_angle) {
+    // Clamp to [0, 1] so straight-down rays still get a horizon color.
+    float a = fmaxf(0.0f, fminf(1.0f, up_angle));
+    // Soft pale-blue horizon
+    V3 horizon = v3(0.70f, 0.82f, 0.92f);
+    // Deeper blue overhead
+    V3 zenith  = v3(0.18f, 0.34f, 0.62f);
+    // Curve so the gradient isn't linear -- horizon stays brighter longer.
+    float t = powf(a, 0.6f);
+    return v_add(v_scale(horizon, 1.0f - t), v_scale(zenith, t));
+}
+
+// ---------------------------------------------------------------------------
+// render_ocean: each thread shades one screen pixel.
+//
+// 1. Reconstruct the camera basis from cam_pos (orbiting origin, world-up).
+// 2. Build a perspective ray through the pixel.
+// 3. Intersect ray with y = 0 plane; if it misses, return sky gradient.
+// 4. Sample heightmap at hit point; finite-difference for the normal.
+// 5. Phong diffuse + specular, blended with Fresnel sky reflection.
+// 6. Write RGBA8 into the OpenGL PBO.
+// ---------------------------------------------------------------------------
+extern "C" __global__
+void render_ocean(cudaTextureObject_t tex,
+                  unsigned char* out,
+                  int w, int h,
+                  float cam_x, float cam_y, float cam_z,
+                  float /*t*/) {
+    int px = blockIdx.x * blockDim.x + threadIdx.x;
+    int py = blockIdx.y * blockDim.y + threadIdx.y;
+    if (px >= w || py >= h) return;
+
+    // ---- Camera basis ----
+    // Forward looks from cam_pos toward origin. World up is +y.
+    // cam_y > 0 guarantees forward.y < 0 and the cross product with world-up
+    // is well-defined (the pitch is clamped on the host side).
+    V3 cam_pos = v3(cam_x, cam_y, cam_z);
+    V3 forward = v_normalize(v_sub(v3(0.0f, 0.0f, 0.0f), cam_pos));
+    V3 world_up = v3(0.0f, 1.0f, 0.0f);
+    V3 right = v_normalize(v_cross(forward, world_up));
+    V3 cam_up = v_cross(right, forward);
+
+    // ---- Pixel ray (perspective) ----
+    float aspect = (float)w / (float)h;
+    float fov = 1.0472f;                 // 60 degrees vertical FoV
+    float scale = tanf(fov * 0.5f);
+    float ndc_x = (2.0f * ((float)px + 0.5f) / (float)w - 1.0f) * aspect * scale;
+    float ndc_y = (1.0f - 2.0f * ((float)py + 0.5f) / (float)h) * scale;
+    V3 dir = v_normalize(v_add(v_add(forward,
+                                     v_scale(right, ndc_x)),
+                               v_scale(cam_up, ndc_y)));
+
+    // ---- Background sky if the ray misses the ocean plane ----
+    // The ocean is the y=0 plane; we only count hits with rays going downward
+    // (dir.y < 0). Anything else is sky. A small eps avoids near-horizontal
+    // rays producing absurd hit distances.
+    V3 col;
+    const float HIT_EPS = 1e-3f;
+    if (dir.y > -HIT_EPS) {
+        col = sky_color(dir.y);
+    } else {
+        // ---- Hit the ocean plane ----
+        float t_hit = -cam_y / dir.y;
+        if (t_hit <= 0.0f) {
+            // Camera under the surface -- treat as sky to avoid garbage.
+            col = sky_color(dir.y);
+        } else {
+            V3 p = v_add(cam_pos, v_scale(dir, t_hit));
+
+            // ---- Sample heightmap; estimate normal via finite differences ----
+            // The heightmap tiles every tile_extent() world units (WRAP), so
+            // we use a small world-space epsilon. Four taps -> central
+            // differences in x and z.
+            const float FD = 0.01f;
+            float h_c = sample_height(tex, p.x,       p.z);
+            float h_xp = sample_height(tex, p.x + FD, p.z);
+            float h_xm = sample_height(tex, p.x - FD, p.z);
+            float h_zp = sample_height(tex, p.x,      p.z + FD);
+            float h_zm = sample_height(tex, p.x,      p.z - FD);
+            float dh_dx = (h_xp - h_xm) / (2.0f * FD);
+            float dh_dz = (h_zp - h_zm) / (2.0f * FD);
+            // Normal of the surface y = h(x, z) is (-dh/dx, 1, -dh/dz).
+            V3 N = v_normalize(v3(-dh_dx, 1.0f, -dh_dz));
+
+            // ---- Lighting ----
+            V3 L = v_normalize(v3(0.55f, 0.65f, 0.35f));   // sun: high+side
+            V3 V = v_normalize(v_sub(cam_pos, p));         // view direction
+            // Reflect L about N: R = 2*(N.L)*N - L
+            float ndotl = fmaxf(0.0f, v_dot(N, L));
+            V3 R = v_normalize(v_sub(v_scale(N, 2.0f * v_dot(N, L)), L));
+
+            // Phong specular highlight on wave crests.
+            float spec = powf(fmaxf(0.0f, v_dot(R, V)), 32.0f);
+
+            // Diffuse: deep-sea blue-green.
+            V3 deep = v3(0.04f, 0.18f, 0.28f);
+            V3 shallow = v3(0.10f, 0.32f, 0.42f);
+            // Tiny height-based shading bias so crests look slightly brighter.
+            float tint = 0.5f + 0.5f * fmaxf(-1.0f, fminf(1.0f, h_c * 6.0f));
+            V3 base = v_add(v_scale(deep, 1.0f - tint),
+                            v_scale(shallow, tint));
+
+            // Diffuse term + ambient.
+            V3 diffuse = v_add(v_scale(base, 0.18f),
+                               v_scale(base, 0.82f * ndotl));
+
+            // Fresnel-modulated sky reflection. Sample the sky in the
+            // reflected-view direction so reflections of overhead show
+            // overhead colors, etc. View reflection: Rv = 2*(N.V)*N - V.
+            float ndotv = fmaxf(0.0f, v_dot(N, V));
+            V3 Rv = v_normalize(v_sub(v_scale(N, 2.0f * v_dot(N, V)), V));
+            V3 reflected_sky = sky_color(fmaxf(0.0f, Rv.y));
+            float F = powf(1.0f - ndotv, 5.0f);
+            // Clamp Fresnel just in case of NaN-prone edge cases.
+            if (F < 0.0f) F = 0.0f;
+            if (F > 1.0f) F = 1.0f;
+
+            // Blend: more reflection at grazing angles.
+            V3 lit = v_add(v_scale(diffuse, 1.0f - F),
+                           v_scale(reflected_sky, F));
+
+            // Add specular highlight (sun color).
+            V3 sun_col = v3(1.0f, 0.96f, 0.85f);
+            col = v_add(lit, v_scale(sun_col, spec));
+        }
+    }
+
+    // ---- Tonemap + write ----
+    // Simple Reinhard-ish curve keeps highlights in [0, 1].
+    col.x = col.x / (1.0f + col.x);
+    col.y = col.y / (1.0f + col.y);
+    col.z = col.z / (1.0f + col.z);
+
+    int idx = (py * w + px) * 4;
+    out[idx + 0] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.x)) * 255.0f);
+    out[idx + 1] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.y)) * 255.0f);
+    out[idx + 2] = (unsigned char)(fmaxf(0.0f, fminf(1.0f, col.z)) * 255.0f);
+    out[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders: a pass-through vertex stage plus a fragment stage that samples
+# a single 2D texture at the interpolated texcoord. Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_particles.py b/cuda_core/examples/gl_interop_particles.py
new file mode 100644
index 00000000000..c5dd06e3697
--- /dev/null
+++ b/cuda_core/examples/gl_interop_particles.py
@@ -0,0 +1,688 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.GraphicsResource VBO interop together with
+# CUDAArray, SurfaceObject, and TextureObject. A million points (N_PARTICLES)
+# flow through an animated curl-noise velocity field. CUDA writes particle
+# positions directly into an OpenGL Vertex Buffer Object (VBO), and OpenGL draws
+# that same buffer as a glowing additive point cloud -- no PBO, no fullscreen
+# quad, no pixel copy. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to register an OpenGL VBO (GL_ARRAY_BUFFER) with CUDA using
+#   `GraphicsResource.from_gl_buffer(vbo_id, flags="none")` and treat the mapped
+#   `buf.handle` as a device pointer to a particle array that CUDA both reads and
+#   writes in place. This is the standout difference from every other interop
+#   example here: those copy CUDA output into a PBO, upload it to a texture, and
+#   draw a fullscreen quad. This one renders geometry straight out of the buffer
+#   CUDA just wrote.
+# - How to bake a smooth, periodic scalar potential into a 2D CUDAArray once (via
+#   a SurfaceObject write kernel), then bind that array as a LINEAR + WRAP
+#   normalized TextureObject and derive a divergence-free curl-noise velocity
+#   field from finite differences of texture samples.
+# - How to draw GL_POINTS directly from a CUDA-written VBO with additive blending
+#   and shader-controlled point size for a luminous, flowing look.
+#
+# How it works
+# ============
+# We allocate one VBO holding N particles. Each particle is 4 floats:
+#
+#     [x, y, age, speed]   (stride = 16 bytes)
+#
+#   - x, y   : position in the [0, 1] x [0, 1] domain. The vertex shader maps
+#              this to clip space with `pos * 2 - 1`. Keeping a single [0, 1]
+#              domain means the kernel can sample the velocity texture with
+#              normalized coordinates directly -- no scaling bugs.
+#   - age    : seconds since this particle last (re)spawned. Drives color and
+#              alpha; resets to 0 on respawn.
+#   - speed  : normalized flow magnitude in [0, 1] at the particle's location
+#              (the kernel maps gradient steepness through tanh). Drives the
+#              color ramp so fast jets glow hotter than calm eddies.
+#
+# The GL vertex attributes read from the same buffer:
+#   - "position" : 2 floats at offset 0
+#   - "attribs"  : 2 floats (age, speed) at offset 8
+#
+# The CUDA kernel `advance_particles` indexes the buffer as `float4*` so its
+# layout agrees with the host init array and the GL attribute pointers above.
+#
+#   VBO INTEROP (one buffer, CUDA writes -> OpenGL draws)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +-------------------+   map(stream)    +---------------------+
+#   |   OpenGL VBO      | ---------------> |  advance_particles  |
+#   | float4 per point  |                  |  (curl-noise flow)  |
+#   | [x, y, age, speed]| <--------------- |  reads+writes pts   |
+#   +-------------------+   unmap          +---------------------+
+#           |
+#           |  glDrawArrays(GL_POINTS)   (after unmap; GL cannot read a
+#           v                             buffer while it is mapped to CUDA)
+#       glowing point cloud on screen
+#
+# The velocity field is a curl of a baked scalar potential P(u, v):
+#
+#     velocity = ( dP/dv, -dP/du )
+#
+# Taking the curl of a scalar potential yields a divergence-free field, so
+# particles swirl without piling up or thinning out. The potential is baked once
+# into a single-channel float CUDAArray as a sum of periodic sinusoids, then
+# sampled with LINEAR + WRAP + normalized coordinates. A time uniform scrolls the
+# sample coordinates so the whole field slowly drifts and animates.
+#
+# Why flags="none" (not "write_discard")?
+# ---------------------------------------
+# The PBO examples register with "write_discard" because they overwrite every
+# pixel each frame and never read the old contents. Here the kernel READS each
+# particle's current position before writing the advanced one, so we must NOT
+# tell CUDA the prior contents are garbage. We use "none".
+#
+# Single-channel surf2Dwrite byte offset
+# --------------------------------------
+# The potential array is single-channel `float` (4 bytes). `surf2Dwrite` takes
+# the x coordinate in BYTES, so the offset is `x * sizeof(float)` = `x * 4`.
+# (Contrast the float2 reaction-diffusion example, which uses `x * 8`.)
+#
+# What you should see
+# ===================
+# Luminous filaments of points swirling through an animated flow field, colored
+# blue -> cyan -> white by speed and faded by age. Press R to respawn all
+# particles, +/- to speed up / slow down the flow, and Escape to exit. The window
+# title shows the particle count, FPS, and speed multiplier.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 900  # window width in pixels
+HEIGHT = 900  # window height in pixels
+N_PARTICLES = 1_000_000  # number of points in the cloud
+FLOATS_PER_PARTICLE = 4  # [x, y, age, speed]
+POTENTIAL_DIM = 256  # potential texture side; advance_particles hard-codes eps = 1/256
+DT = 1.0 / 60.0  # simulation time step per frame (seconds)
+BASE_SPEED = 0.15  # base flow speed (domain units per second)
+SPEED_STEP = 1.25  # "+" multiplies the flow speed by this, "-" divides by it
+MAX_AGE = 4.0  # seconds before a particle respawns
+POINT_SIZE = 2.4  # base point size in pixels; the vertex shader scales it up to 1.3x
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about VBO
+# interop, skip ahead to main() -- the interesting part is there. These helpers
+# exist so that main() reads like a short story instead of a wall of
+# boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Compile the CUDA kernels and return (device, stream, kernels, configs)."""
+    dev = Device(0)
+    dev.set_current()
+
+    # SurfaceObject requires bindless surface objects (cuSurfObjectCreate),
+    # which need compute capability >= 3.0.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Compile as C++ so the templated tex2D<float> overload resolves.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("bake_potential", "init_particles", "advance_particles"),
+    )
+
+    kernels = {
+        "bake": mod.get_kernel("bake_potential"),
+        "init": mod.get_kernel("init_particles"),
+        "advance": mod.get_kernel("advance_particles"),
+    }
+
+    # The potential bake is 2D over POTENTIAL_DIM x POTENTIAL_DIM texels.
+    block2d = (16, 16, 1)
+    grid2d = (
+        (POTENTIAL_DIM + block2d[0] - 1) // block2d[0],
+        (POTENTIAL_DIM + block2d[1] - 1) // block2d[1],
+        1,
+    )
+    # init/advance are 1D over N_PARTICLES.
+    block1d = (256, 1, 1)
+    grid1d = ((N_PARTICLES + block1d[0] - 1) // block1d[0], 1, 1)
+
+    configs = {
+        "bake": LaunchConfig(grid=grid2d, block=block2d),
+        "init": LaunchConfig(grid=grid1d, block=block1d),
+        "advance": LaunchConfig(grid=grid1d, block=block1d),
+    }
+
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet)."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core VBO interop - Curl-Noise Particle Flow",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_particle_vbo(gl, shader_prog):
+    """Create the particle VBO and its VAO, and wire up the vertex attributes.
+
+    The VBO holds N_PARTICLES * 4 floats laid out as [x, y, age, speed] per
+    particle. We initialize positions to a deterministic pseudo-random spread
+    across the [0, 1] domain so there is something to see even before the first
+    kernel launch; CUDA overwrites this every frame.
+
+    Returns (vbo_gl_name, vao_gl_name).
+    """
+    # Host-side initial layout MUST match the kernel's float4 view and the GL
+    # attribute pointers below: [x, y, age, speed] per particle.
+    init = np.empty((N_PARTICLES, FLOATS_PER_PARTICLE), dtype=np.float32)
+    rng = np.random.default_rng(12345)
+    init[:, 0] = rng.random(N_PARTICLES, dtype=np.float32)  # x in [0, 1]
+    init[:, 1] = rng.random(N_PARTICLES, dtype=np.float32)  # y in [0, 1]
+    init[:, 2] = rng.random(N_PARTICLES, dtype=np.float32) * MAX_AGE  # staggered age
+    init[:, 3] = 0.0  # speed
+    init = np.ascontiguousarray(init)
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        init.nbytes,
+        init.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_DYNAMIC_DRAW,  # CUDA rewrites this buffer every frame
+    )
+
+    stride = FLOATS_PER_PARTICLE * 4  # 4 floats * 4 bytes = 16 bytes per particle
+
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    attr_loc = gl.glGetAttribLocation(shader_prog.id, b"attribs")
+    gl.glEnableVertexAttribArray(attr_loc)
+    gl.glVertexAttribPointer(attr_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0)
+
+    return vbo.value, vao.value
+
+
+def create_shader(gl):
+    """Build the point-cloud shader program (kept alive by the caller)."""
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Additive blending so overlapping points accumulate into glow, and
+    # shader-controlled point size (off by default in the core profile).
+    gl.glEnable(gl.GL_BLEND)
+    gl.glBlendFunc(gl.GL_SRC_ALPHA, gl.GL_ONE)
+    gl.glEnable(gl.GL_PROGRAM_POINT_SIZE)
+    gl.glDisable(gl.GL_DEPTH_TEST)
+
+    return shader_prog
+
+
+def make_potential_array():
+    """Allocate the single-channel float CUDAArray that holds the baked potential.
+
+    `is_surface_load_store=True` lets us write it once via a SurfaceObject and
+    then read it as a TextureObject for smooth, wrapping, bilinear sampling.
+    """
+    return CUDAArray.from_descriptor(
+        shape=(POTENTIAL_DIM, POTENTIAL_DIM),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        is_surface_load_store=True,
+    )
+
+
+def make_potential_texture(arr):
+    """Bind `arr` as a TextureObject configured for LINEAR + WRAP + normalized."""
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        # WRAP addressing only works with normalized coordinates.
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def reset_particles(stream, kernels, configs, resource, seed):
+    """Respawn every particle by launching init_particles on the mapped VBO.
+
+    Reuses the same map() path the per-frame advance uses, so there is no host
+    re-upload. The map brackets only the launch; GL must not touch the buffer
+    while it is mapped.
+    """
+    with resource.map(stream=stream) as buf:
+        launch(
+            stream,
+            configs["init"],
+            kernels["init"],
+            buf.handle,
+            np.int32(N_PARTICLES),
+            np.uint32(seed),
+            np.float32(MAX_AGE),
+        )
+
+
+# ================================== main() ==================================
+
+
+def main():
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Build the point-cloud shader and enable additive blending ---
+    shader_prog = create_shader(gl)
+
+    # --- Step 4: Create the particle VBO + VAO (the buffer CUDA writes into) ---
+    vbo_id, vao_id = create_particle_vbo(gl, shader_prog)
+
+    # =======================================================================
+    # API MAP -- the four cuda.core interop objects this example hinges on
+    # =======================================================================
+    #   GraphicsResource.from_gl_buffer(VBO)
+    #       Registers a GL VBO (NOT a PBO) so CUDA writes vertex positions,
+    #       OpenGL then draws directly -- zero copy. The mapped buf.handle is a
+    #       raw device pointer into the same float4 array OpenGL renders from.
+    #   CUDAArray (single-channel float, is_surface_load_store=True)
+    #       The backing storage for the baked scalar potential.
+    #   SurfaceObject.from_array(pot_arr)
+    #       Write view used ONCE at startup to bake the potential into the array.
+    #   TextureObject (LINEAR + WRAP + normalized, 1ch)
+    #       Read view: LINEAR+WRAP+normalized lets the kernel read the baked
+    #       potential's gradient with smooth, tileable sampling -- the curl of
+    #       that gradient is the divergence-free velocity field.
+    # The texture handle is created once, kept alive, and wrapped in np.uint64
+    # at launch; buf.handle is passed raw.
+    # =======================================================================
+
+    # --- Step 5: Register the VBO with CUDA ---
+    #     flags="none": the kernel reads each particle before writing it back,
+    #     so we must NOT discard the prior contents (that's why this is not
+    #     "write_discard" like the PBO examples).
+    resource = GraphicsResource.from_gl_buffer(vbo_id, flags="none")
+
+    # --- Step 6: Allocate + bake the curl-noise potential, bind it as a texture ---
+    pot_arr = make_potential_array()
+    pot_surf = SurfaceObject.from_array(pot_arr)  # created once, kept alive
+    pot_tex = make_potential_texture(pot_arr)  # created once, kept alive
+
+    # Bake the scalar potential once via the SurfaceObject.
+    launch(
+        stream,
+        configs["bake"],
+        kernels["bake"],
+        np.uint64(pot_surf.handle),
+        np.int32(POTENTIAL_DIM),
+        np.int32(POTENTIAL_DIM),
+    )
+
+    # --- Step 7: Seed the particles into the VBO ---
+    state = {"seed": 1, "speed": BASE_SPEED, "t": 0.0}
+    reset_particles(stream, kernels, configs, resource, state["seed"])
+
+    # --- Step 8: Render loop ---
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+            return
+        if symbol == key.R:
+            state["seed"] += 1
+            reset_particles(stream, kernels, configs, resource, state["seed"])
+            return
+        if symbol in (key.PLUS, key.NUM_ADD, key.EQUAL):
+            state["speed"] *= SPEED_STEP
+            return
+        if symbol in (key.MINUS, key.NUM_SUBTRACT):
+            state["speed"] /= SPEED_STEP
+            return
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        # Black background so additive accumulation reads as glow.
+        window.clear()
+
+        state["t"] += DT
+
+        # (a) Advance particles. The map brackets ONLY the CUDA launch -- OpenGL
+        #     cannot read the buffer while it is mapped to CUDA.
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["advance"],
+                kernels["advance"],
+                buf.handle,  # raw device pointer to the float4 particle array
+                np.uint64(pot_tex.handle),
+                np.int32(N_PARTICLES),
+                np.float32(DT),
+                np.float32(state["speed"]),
+                np.float32(state["t"]),
+                np.float32(MAX_AGE),
+                np.uint32(state["seed"]),
+            )
+        # Unmap happens automatically when the `with` block exits; only after
+        # that may OpenGL draw from the buffer.
+
+        # (b) Draw the particles straight from the VBO as GL_POINTS.
+        gl.glUseProgram(shader_prog.id)
+        max_age_loc = gl.glGetUniformLocation(shader_prog.id, b"max_age")
+        gl.glUniform1f(max_age_loc, MAX_AGE)
+        psize_loc = gl.glGetUniformLocation(shader_prog.id, b"point_size")
+        gl.glUniform1f(psize_loc, POINT_SIZE)
+        gl.glBindVertexArray(vao_id)
+        gl.glDrawArrays(gl.GL_POINTS, 0, N_PARTICLES)
+        gl.glBindVertexArray(0)
+        gl.glUseProgram(0)
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            window.set_caption(
+                "cuda.core VBO interop - Curl-Noise Particle Flow"
+                f" ({N_PARTICLES:,} points, {fps:.0f} FPS,"
+                f" speed x{state['speed'] / BASE_SPEED:.2f})"
+                " | GraphicsResource(VBO) + TextureObject[LINEAR|WRAP|norm|1ch]"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Release everything we opened, in reverse order.
+        resource.close()
+        pot_tex.close()
+        pot_surf.close()
+        pot_arr.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't distract
+# from the Python logic above.
+#
+#   - KERNEL_SOURCE contains three CUDA C++ kernels:
+#       * bake_potential    -- writes a smooth, periodic scalar potential into a
+#                              single-channel float surface (once at startup).
+#       * init_particles    -- (re)spawns every particle to a pseudo-random
+#                              position with a staggered age. Operates on the
+#                              mapped VBO as a float4 array.
+#       * advance_particles -- reads each particle from the mapped VBO, samples
+#                              the potential texture, computes a divergence-free
+#                              curl velocity, integrates the position, respawns
+#                              aged-out or out-of-domain particles, writes back.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE draw GL_POINTS from the VBO
+#     with a soft round sprite colored by speed and faded by age.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// ---- shared helpers --------------------------------------------------------
+
+// Cheap deterministic xorshift hash -> float in [0, 1).
+__device__ __forceinline__ float hash01(unsigned int h) {
+    h ^= h >> 16; h *= 0x7feb352du;
+    h ^= h >> 15; h *= 0x846ca68bu;
+    h ^= h >> 16;
+    return (h & 0x00ffffffu) / (float)0x01000000;
+}
+
+__device__ __forceinline__ unsigned int seed_of(unsigned int idx, unsigned int salt) {
+    return idx * 747796405u + salt * 2891336453u + 1u;
+}
+
+// ---- bake the scalar potential ---------------------------------------------
+//
+// A sum of periodic sinusoids over the unit square. Using full 2*pi*k periods
+// makes the field seamless under WRAP addressing -- no visible edge.
+extern "C"
+__global__
+void bake_potential(cudaSurfaceObject_t surf, int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float u = (x + 0.5f) / (float)width;   // [0, 1)
+    float v = (y + 0.5f) / (float)height;  // [0, 1)
+    const float TWO_PI = 6.2831853f;
+
+    float p = 0.0f;
+    p += 1.00f * sinf(TWO_PI * (1.0f * u + 0.0f * v) + 0.3f);
+    p += 0.70f * sinf(TWO_PI * (0.0f * u + 1.0f * v) + 1.7f);
+    p += 0.55f * sinf(TWO_PI * (1.0f * u + 1.0f * v) + 2.1f);
+    p += 0.45f * sinf(TWO_PI * (2.0f * u - 1.0f * v) + 0.9f);
+    p += 0.30f * sinf(TWO_PI * (-1.0f * u + 2.0f * v) + 4.2f);
+    p += 0.25f * sinf(TWO_PI * (3.0f * u + 2.0f * v) + 5.5f);
+
+    // Single-channel float surface: x offset is in BYTES = x * sizeof(float).
+    surf2Dwrite(p, surf, x * (int)sizeof(float), y);
+}
+
+// ---- (re)spawn particles ---------------------------------------------------
+//
+// The VBO is a flat array of float4 [x, y, age, speed] per particle.
+extern "C"
+__global__
+void init_particles(float4* particles, int n,
+                    unsigned int seed, float max_age) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    unsigned int s = seed_of((unsigned int)i, seed);
+    float px = hash01(s + 11u);
+    float py = hash01(s + 53u);
+    // Stagger ages so respawns don't pulse in lockstep.
+    float age = hash01(s + 97u) * max_age;
+    particles[i] = make_float4(px, py, age, 0.0f);
+}
+
+// ---- advance particles through the curl-noise field ------------------------
+extern "C"
+__global__
+void advance_particles(float4* particles,
+                       cudaTextureObject_t pot,
+                       int n, float dt, float speed,
+                       float t, float max_age,
+                       unsigned int seed) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    float4 p = particles[i];
+    float x = p.x;
+    float y = p.y;
+    float age = p.z;
+
+    // Scroll the sample coordinates slowly with time so the field animates.
+    float scroll = 0.03f * t;
+    float su = x + scroll;
+    float sv = y - 0.5f * scroll;
+
+    // Curl of a scalar potential P is (dP/dv, -dP/du): divergence-free flow.
+    // Estimate the gradient by central differences of texture samples. The
+    // texture is LINEAR + WRAP + normalized, so wrapped reads are seamless.
+    const float eps = 1.0f / 256.0f;
+    float p_up = tex2D<float>(pot, su, sv + eps);
+    float p_dn = tex2D<float>(pot, su, sv - eps);
+    float p_rt = tex2D<float>(pot, su + eps, sv);
+    float p_lt = tex2D<float>(pot, su - eps, sv);
+
+    float dP_dv = (p_up - p_dn) / (2.0f * eps);
+    float dP_du = (p_rt - p_lt) / (2.0f * eps);
+
+    // Curl direction, then bound the magnitude. The raw analytic gradient of
+    // the summed sinusoids runs ~0..20, which (times speed) would whip every
+    // particle across the domain in well under a second and saturate the color
+    // ramp. We split it: `dir` is the flow direction, and `flow` maps the
+    // gradient steepness through tanh into [0, 1] so the field has slow eddies
+    // and fast jets. The displacement is `speed * flow` domain-units/sec, so
+    // `speed` is a true unit-per-second knob and `flow` drives the color ramp.
+    float gx = dP_dv;
+    float gy = -dP_du;
+    float grad = sqrtf(gx * gx + gy * gy) + 1e-6f;
+    float flow = tanhf(grad * 0.12f);  // 0 in calm regions, ->1 in steep jets
+    float vx = speed * flow * (gx / grad);
+    float vy = speed * flow * (gy / grad);
+
+    // Store `flow` (the normalized speed in [0, 1]) as the color driver.
+    float vmag = flow;
+
+    // Integrate position.
+    x += vx * dt;
+    y += vy * dt;
+    age += dt;
+
+    // Respawn on age expiry or if a particle drifts out of the unit domain.
+    bool respawn = (age >= max_age) || x < 0.0f || x > 1.0f || y < 0.0f || y > 1.0f;
+    if (respawn) {
+        // Jitter the seed by frame-ish state so respawns spread out over time.
+        unsigned int s = seed_of((unsigned int)i, seed + (unsigned int)(t * 60.0f));
+        x = hash01(s + 11u);
+        y = hash01(s + 53u);
+        age = 0.0f;
+        vmag = 0.0f;
+    }
+
+    particles[i] = make_float4(x, y, age, vmag);
+}
+"""
+
+# GLSL shaders -- draw GL_POINTS from the VBO. The vertex stage maps [0, 1] positions
+# to clip space and sizes points from the `point_size` uniform and speed; the fragment
+# stage draws a soft round sprite, ramps blue -> cyan -> white by speed, fades by age.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;   // x, y in [0, 1]
+in vec2 attribs;    // age, speed
+out float v_age;
+out float v_speed;
+uniform float max_age;
+uniform float point_size;
+void main() {
+    gl_Position = vec4(position * 2.0 - 1.0, 0.0, 1.0);
+    v_age = clamp(attribs.x / max_age, 0.0, 1.0);
+    v_speed = attribs.y;
+    // Subtle size-by-speed: fast jets render a touch larger so filaments read
+    // as brighter, structured streaks. Reuses the existing speed attribute --
+    // no struct change. Calm points keep the base size; never shrinks below it.
+    gl_PointSize = point_size * (1.0 + 0.3 * clamp(v_speed, 0.0, 1.0));
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in float v_age;
+in float v_speed;
+out vec4 fragColor;
+void main() {
+    // Soft round sprite: fade toward the edge of the point.
+    vec2 d = gl_PointCoord - vec2(0.5);
+    float r = length(d) * 2.0;
+    float falloff = clamp(1.0 - r, 0.0, 1.0);
+    falloff *= falloff;
+
+    // Speed ramp: blue -> cyan -> white. v_speed is the normalized flow
+    // magnitude in [0, 1] (see advance_particles), so it spans the ramp.
+    float s = clamp(v_speed, 0.0, 1.0);
+    vec3 cool = vec3(0.12, 0.40, 1.00);   // lifted enough that slow points still glow
+    vec3 mid  = vec3(0.22, 0.85, 1.15);
+    vec3 hot  = vec3(1.15, 1.15, 1.20);   // slightly >1 so only the densest cores clip
+    vec3 color = (s < 0.5)
+        ? mix(cool, mid, s * 2.0)
+        : mix(mid, hot, (s - 0.5) * 2.0);
+
+    // Fade in just after spawn and out near end of life.
+    float life = (1.0 - v_age) * smoothstep(0.0, 0.08, v_age);
+    float alpha = falloff * life * 0.7;   // density carries the glow; trim so cores don't fully clip
+
+    fragColor = vec4(color, alpha);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_physarum.py b/cuda_core/examples/gl_interop_physarum.py
new file mode 100644
index 00000000000..99972635b14
--- /dev/null
+++ b/cuda_core/examples/gl_interop_physarum.py
@@ -0,0 +1,889 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject
+# together with a plain device Buffer and GraphicsResource for CUDA/OpenGL
+# interop. A large population of "slime mold" (Physarum) agents crawls over a
+# single-channel float trail map: each agent senses the trail ahead via a
+# TextureObject (LINEAR + WRAP sampling), steers toward the strongest scent,
+# steps forward, and deposits pheromone through a SurfaceObject. A separate
+# diffuse/decay pass blurs and fades the trail (ping-ponged between two CUDA
+# arrays), and a colorize pass writes a neon palette straight into an OpenGL
+# PBO. The result is emergent, self-organizing vein/network patterns. Requires
+# pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to combine a plain device Buffer (per-agent state) with CUDAArray-backed
+#   TextureObject/SurfaceObject pairs in a single simulation, all on the GPU.
+# - How to allocate a single-channel float CUDAArray with
+#   `is_surface_load_store=True` so the same memory can be read as a
+#   TextureObject (LINEAR + WRAP + normalized) and written as a SurfaceObject.
+# - How to initialize a device Buffer from host data without a third-party array
+#   library: stage through a host-accessible pinned Buffer, fill it via NumPy,
+#   then `copy_from` into the device Buffer.
+#
+# How it works
+# ============
+# Physarum is an agent-based transport-network model. Every agent stores
+# (x, y, heading) and, once per frame:
+#
+#   1. Samples the trail at three sensors (left / center / right of its heading,
+#      a fixed sensor distance ahead) using tex2D<float> LINEAR sampling.
+#   2. Rotates toward whichever sensor reads strongest (with a little random
+#      jitter from a per-agent xorshift RNG seeded by index + frame).
+#   3. Steps forward by a fixed speed and wraps around the toroidal edges.
+#   4. Adds a constant amount of pheromone to the texel under it (surf2Dread,
+#      add, clamp to 1.0, surf2Dwrite). The read-modify-write is not atomic, so
+#      concurrent agents may race on the same texel -- that is acceptable and
+#      even characteristic of the model.
+#
+# Then two grid-parallel passes finish the frame:
+#
+#   diffuse_decay : box-blur the trail (tex2D LINEAR neighbor taps) and multiply
+#                   by a decay factor < 1. Reads the current array, writes the
+#                   other, then we swap (ping-pong).
+#   colorize      : color the trail by local gradient direction (hue) modulated
+#                   by intensity, with a ridge boost + bloom halo, into the PBO.
+#
+#   PING-PONG (two single-channel float arrays)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   move_agents reads + deposits into the CURRENT array (tex + surf of same arr).
+#   diffuse_decay reads CURRENT (tex) -> writes OTHER (surf) -> swap.
+#   colorize reads the new CURRENT (tex) -> OpenGL PBO.
+#
+# Why LINEAR + WRAP + normalized coords?
+# --------------------------------------
+# Addressing modes WRAP and MIRROR are only supported with normalized
+# coordinates. WRAP makes the world a torus so agents and diffusion seamlessly
+# cross the edges. LINEAR filtering is essentially free on the hardware and
+# gives the agents smooth sub-texel gradient sensing. We sample at texel centers
+# `(x + 0.5) / W` so neighbor offsets land on integer texel positions.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# `surf2Dwrite` takes the x coordinate in BYTES, not elements. The trail is a
+# single-channel `float` surface, so the x offset is `x * sizeof(float)` = `x*4`.
+# (Contrast a `float2` surface, which would need `x*8`.) Getting this wrong
+# silently corrupts every Nth column.
+#
+# Per-agent state lives in a plain device Buffer
+# ----------------------------------------------
+# Agents are stored as a flat float32 array of length 3*N laid out as
+# [x0, y0, h0, x1, y1, h1, ...]. We allocate it once with `dev.allocate` and
+# pass the Buffer object straight to `launch` (matching saxpy.py / memory_ops.py,
+# which pass Buffer objects directly rather than a raw pointer int).
+#
+# What you should see
+# ===================
+# A window of glowing neon filaments that grow, branch, and reorganize into a
+# living transport network. Press 1/2/3 to switch behavior presets (different
+# sensor geometry and turn speed give different morphologies), R to reseed the
+# agents and clear the trail, and Escape to exit. The title shows the preset,
+# agent count, and FPS.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    LegacyPinnedMemoryResource,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 1024  # width of the window and of the trail map, in pixels
+HEIGHT = 1024  # height of the window and of the trail map, in pixels
+N_AGENTS = 1 << 21  # 2,097,152 agents; the move kernel runs one thread per agent
+# Pheromone added to the trail per agent per frame. Kept small so the additive
+# deposit accumulates meaningfully instead of instantly saturating the field
+# to 1.0 (the move kernel clamps the deposited value at 1.0).
+DEPOSIT = 0.2
+
+# Named presets, keyed by the digit key that selects them. Each value is
+# (sensor_angle_rad, sensor_distance_px, turn_speed_rad, move_speed_px, decay, label).
+# Different sensor geometry / turn speeds yield strikingly different networks.
+PRESETS = {
+    "1": (0.40, 9.0, 0.40, 1.0, 0.92, "veins"),
+    "2": (0.80, 16.0, 0.25, 1.0, 0.90, "webs"),
+    "3": (1.20, 5.0, 0.65, 1.5, 0.95, "swarm"),
+}
+DEFAULT_PRESET = "1"  # key into PRESETS that is active at startup
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# CUDAArray/TextureObject/SurfaceObject/Buffer, skip ahead to main() -- the
+# interesting part is there. These helpers exist so that main() reads like a
+# short story instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Compile the CUDA kernels and return (device, stream, kernels, configs).
+
+    Returns a dict of kernels keyed by name and matching LaunchConfigs. The
+    move pass is 1D over agents; the diffuse/colorize passes are 2D over pixels.
+    """
+    dev = Device(0)
+    dev.set_current()
+
+    # SurfaceObject requires surface load/store, which has existed since SM 2.0,
+    # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Compile as C++ so the templated tex2D<float> overload resolves.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("move_agents", "diffuse_decay", "colorize"),
+    )
+
+    kernels = {
+        "move": mod.get_kernel("move_agents"),
+        "diffuse": mod.get_kernel("diffuse_decay"),
+        "colorize": mod.get_kernel("colorize"),
+    }
+
+    # 1D launch over agents.
+    move_block = (256, 1, 1)
+    move_grid = ((N_AGENTS + move_block[0] - 1) // move_block[0], 1, 1)
+    move_config = LaunchConfig(grid=move_grid, block=move_block)
+
+    # 2D launch over pixels (shared by diffuse and colorize).
+    px_block = (16, 16, 1)
+    px_grid = (
+        (WIDTH + px_block[0] - 1) // px_block[0],
+        (HEIGHT + px_block[1] - 1) // px_block[1],
+        1,
+    )
+    px_config = LaunchConfig(grid=px_grid, block=px_block)
+
+    configs = {"move": move_config, "diffuse": px_config, "colorize": px_config}
+
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet)."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core CUDAArray/Texture/Surface/Buffer - Physarum",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    This sets up a shader program, a fullscreen quad, and an empty texture.
+    None of this is CUDA-specific -- it's standard OpenGL boilerplate for
+    rendering a textured quad.
+
+    Returns (shader_program, vertex_array_id, texture_id). The shader_program
+    is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Shader program -- just passes texture coordinates through
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window)
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            -1,
+            1,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (will be filled each frame from the PBO)
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a Pixel Buffer Object (PBO) -- the bridge between CUDA and OpenGL.
+
+    A PBO is a GPU-side buffer that OpenGL can read from when uploading pixels
+    to a texture. By registering this same buffer with CUDA, the CUDA kernel
+    can write directly into it.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # RGBA, 1 byte per channel
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # None = read from the currently bound PBO, not from CPU
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw the texture to the screen using the fullscreen quad."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+def make_trail_arrays():
+    """Allocate the two single-channel float ping-pong arrays for the trail map."""
+    arr_a = CUDAArray.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        is_surface_load_store=True,
+    )
+    arr_b = CUDAArray.from_descriptor(
+        shape=(WIDTH, HEIGHT),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        is_surface_load_store=True,
+    )
+    return arr_a, arr_b
+
+
+def make_texture(arr):
+    """Bind `arr` as a TextureObject configured for LINEAR + WRAP + normalized."""
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        # WRAP/MIRROR addressing modes require normalized coordinates.
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def fill_agent_host(host_view, seed):
+    """Fill a host-side float32 view (length 3*N) with random agents.
+
+    Layout is [x0, y0, h0, x1, y1, h1, ...]: position in [0, W)x[0, H) and
+    heading in [0, 2*pi).
+    """
+    rng = np.random.default_rng(seed)
+    agents = host_view.reshape(N_AGENTS, 3)
+    agents[:, 0] = rng.uniform(0.0, WIDTH, size=N_AGENTS)
+    agents[:, 1] = rng.uniform(0.0, HEIGHT, size=N_AGENTS)
+    agents[:, 2] = rng.uniform(0.0, 2.0 * np.pi, size=N_AGENTS)
+
+
+def reseed_agents(stream, device_agents, pinned_agents, host_view, seed):
+    """Refill the host staging view and copy it into the device agent Buffer.
+
+    Reuses the already-allocated device and pinned buffers -- no reallocation.
+    """
+    fill_agent_host(host_view, seed)
+    device_agents.copy_from(pinned_agents, stream=stream)
+
+
+def clear_trail(stream, arr_a, arr_b, zeros):
+    """Zero both trail arrays. CUDAArray.copy_from accepts a buffer-protocol host
+    object directly (unlike Buffer.copy_from), so a NumPy zero array works."""
+    arr_a.copy_from(zeros, stream=stream)
+    arr_b.copy_from(zeros, stream=stream)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Run the interactive Physarum demo.
+
+    Sets up CUDA, a pyglet window and the GL display objects, allocates the
+    trail arrays, their texture/surface handles and the agent buffers, seeds
+    the simulation, registers the window event handlers, and then hands
+    control to the pyglet event loop.
+    """
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    #     The PBO is GPU memory owned by OpenGL. CUDA writes into it, OpenGL
+    #     reads from it.
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the two ping-pong trail arrays ---
+    #     Single-channel float with is_surface_load_store=True so they can be
+    #     bound as SurfaceObjects.
+    #
+    #   API MAP -- the four cuda.core objects that drive this simulation:
+    #     * CUDAArray -> storage for the trail map (two of them, ping-ponged).
+    #     * device Buffer (dev.allocate) holds raw agent state alongside the
+    #       array/texture/surface stack.
+    #     * TextureObject LINEAR+WRAP+normalized -> smooth, toroidal SENSE of the
+    #       pheromone field.
+    #     * SurfaceObject -> typed DEPOSIT writes into the same CUDAArray sensed
+    #       as a texture (is_surface_load_store=True).
+    arr_a, arr_b = make_trail_arrays()
+
+    # --- Step 7: Pre-create the four bindless handles (once, kept alive) ---
+    #     One texture and one surface per trail array.
+    tex_a = make_texture(arr_a)
+    tex_b = make_texture(arr_b)
+    surf_a = SurfaceObject.from_array(arr_a)
+    surf_b = SurfaceObject.from_array(arr_b)
+
+    # --- Step 8: Allocate per-agent state in a plain device Buffer ---
+    #     Flat float32 [x, y, heading] * N. We stage host data through a
+    #     host-accessible pinned Buffer, then copy it into the device Buffer.
+    #     Both buffers are allocated once and reused on reseed.
+    agent_floats = 3 * N_AGENTS
+    agent_bytes = agent_floats * 4  # 4 bytes per float32
+    device_agents = dev.allocate(agent_bytes, stream=stream)
+    pinned_mr = LegacyPinnedMemoryResource()
+    pinned_agents = pinned_mr.allocate(agent_bytes)
+    # NumPy view of the pinned Buffer obtained through DLPack, reinterpreted as
+    # float32; fill_agent_host() writes the agents through this view.
+    host_view = np.from_dlpack(pinned_agents).view(np.float32)
+
+    # Host-side zero image reused to clear the trail arrays.
+    # NOTE(review): the shape is (WIDTH, HEIGHT); that is interchangeable with
+    # (HEIGHT, WIDTH) for all-zero data of the same size -- confirm the host
+    # layout CUDAArray.copy_from expects before using non-square sizes.
+    zeros = np.zeros((WIDTH, HEIGHT), dtype=np.float32)
+
+    # --- Step 9: Seed initial agents + clear the trail ---
+    # Mutable state shared by the event handlers below:
+    #   "current" -- which trail array ("a" or "b") holds the latest trail
+    #   "preset"  -- key into PRESETS
+    #   "seed"    -- RNG seed for the agents, incremented on every reseed
+    #   "frame"   -- frame counter passed to the move kernel
+    state = {"current": "a", "preset": DEFAULT_PRESET, "seed": 0, "frame": 0}
+    reseed_agents(stream, device_agents, pinned_agents, host_view, seed=state["seed"])
+    clear_trail(stream, arr_a, arr_b, zeros)
+    stream.sync()  # ensure the seed copy finishes before the first launch reads it
+
+    # --- Step 10: Render loop ---
+    # start_time only provides the initial timestamp for the FPS window.
+    start_time = time.monotonic()
+    frame_count = 0
+    fps_time = start_time
+
+    def current_tex_surf():
+        """Return (tex, surf) for the CURRENT trail array (read + deposit)."""
+        if state["current"] == "a":
+            return tex_a, surf_a
+        return tex_b, surf_b
+
+    def diffuse_read_write():
+        """Return (tex_read_current, surf_write_other, next_current)."""
+        if state["current"] == "a":
+            return tex_a, surf_b, "b"
+        return tex_b, surf_a, "a"
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        """Handle Escape (close), R (reseed + clear) and 1/2/3 (select preset)."""
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            # NOTE(review): the window is closed directly here; confirm that the
+            # on_close cleanup below still runs on this path, and that it runs
+            # while the GL context backing `resource` is still alive.
+            window.close()
+            return
+        if symbol == key.R:
+            # New seed, frame counter back to 0, fresh agents, empty trail.
+            state["seed"] += 1
+            state["frame"] = 0
+            reseed_agents(stream, device_agents, pinned_agents, host_view, seed=state["seed"])
+            clear_trail(stream, arr_a, arr_b, zeros)
+            state["current"] = "a"
+            return
+        # Digit keys map onto the PRESETS keys of the same name.
+        for digit_key, name in (
+            (key._1, "1"),
+            (key._2, "2"),
+            (key._3, "3"),
+        ):
+            if symbol == digit_key:
+                state["preset"] = name
+                return
+
+    @window.event
+    def on_draw():
+        """Advance the simulation by one step and draw the result."""
+        nonlocal frame_count, fps_time
+
+        window.clear()
+        sensor_angle, sensor_dist, turn_speed, move_speed, decay, _label = PRESETS[state["preset"]]
+
+        # (a) Move + deposit: 1D over agents. Reads and deposits into the
+        #     CURRENT array (tex + surf of the same array).
+        #     The arguments follow the parameter order of move_agents in
+        #     KERNEL_SOURCE; the texture/surface handles are passed as uint64.
+        tex_cur, surf_cur = current_tex_surf()
+        launch(
+            stream,
+            configs["move"],
+            kernels["move"],
+            device_agents,
+            np.int32(N_AGENTS),
+            np.uint64(tex_cur.handle),
+            np.uint64(surf_cur.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.float32(sensor_angle),
+            np.float32(sensor_dist),
+            np.float32(turn_speed),
+            np.float32(move_speed),
+            np.float32(DEPOSIT),
+            np.uint32(state["frame"]),
+        )
+
+        # (b) Diffuse + decay: 2D over pixels. Reads CURRENT, writes OTHER, swap.
+        tex_read, surf_write, next_current = diffuse_read_write()
+        launch(
+            stream,
+            configs["diffuse"],
+            kernels["diffuse"],
+            np.uint64(tex_read.handle),
+            np.uint64(surf_write.handle),
+            np.int32(WIDTH),
+            np.int32(HEIGHT),
+            np.float32(decay),
+        )
+        state["current"] = next_current
+
+        # (c) Colorize the latest trail into the OpenGL PBO.
+        tex_show = tex_a if state["current"] == "a" else tex_b
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["colorize"],
+                kernels["colorize"],
+                np.uint64(tex_show.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (d) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (e) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        state["frame"] += 1
+
+        # FPS counter (shown in window title), refreshed about once per second.
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            label = PRESETS[state["preset"]][5]  # index 5 is the preset's label
+            window.set_caption(
+                "cuda.core CUDAArray/Texture/Surface/Buffer - Physarum"
+                f" [{label}] ({WIDTH}x{HEIGHT}, {N_AGENTS:,} agents, {fps:.0f} FPS)"
+                " | Buffer(agents) + TextureObject[LINEAR|WRAP|norm] sense"
+                " + SurfaceObject deposit"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        """Release the CUDA-side objects created in main()."""
+        # Order: the GL registration first, then the texture/surface handles
+        # before the arrays they were created from, then the agent buffers,
+        # and the stream last.
+        resource.close()
+        tex_a.close()
+        tex_b.close()
+        surf_a.close()
+        surf_b.close()
+        arr_a.close()
+        arr_b.close()
+        pinned_agents.close()
+        device_agents.close(stream)
+        stream.close()
+
+    # Hand control to the pyglet event loop.
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above.
+#
+#   - KERNEL_SOURCE contains three CUDA C++ kernels:
+#       * move_agents   -- 1 thread per agent: senses the trail at three points
+#                          via tex2D<float> (LINEAR + WRAP), rotates toward the
+#                          strongest, steps forward with toroidal wrap, and
+#                          deposits pheromone via surf2Dwrite (x offset in BYTES).
+#       * diffuse_decay -- box-blur the trail via tex2D LINEAR neighbor taps and
+#                          multiply by a decay factor < 1; ping-pong write.
+#       * colorize      -- color the trail by the local gradient DIRECTION (hue
+#                          via HSV) modulated by intensity, with a ridge boost
+#                          and a wider-tap bloom halo for glowing veins, into
+#                          RGBA bytes in the PBO.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw the
+#     texture onto a rectangle covering the entire window. Nothing interesting.
+#
+# ============================================================================
+
+# CUDA C++ source for the three kernels (move_agents, diffuse_decay, colorize)
+# and their two device helpers (xorshift32, hsv2rgb). Kept as a raw string and
+# compiled at startup by setup_cuda().
+# NOTE(review): move_agents samples its sensors at (p + 0.5) / W for a
+# continuous position p, but deposits into cell (int)p. That looks like a
+# half-texel offset between sensing and depositing -- confirm it is intended.
+KERNEL_SOURCE = r"""
+// Per-agent xorshift32 RNG: cheap, good enough for turn jitter. Seeded per
+// agent and per frame so the sequence differs every step.
+__device__ __forceinline__ unsigned int xorshift32(unsigned int s) {
+    s ^= s << 13;
+    s ^= s >> 17;
+    s ^= s << 5;
+    return s;
+}
+
+extern "C"
+__global__
+void move_agents(float* agents,
+                 int n_agents,
+                 cudaTextureObject_t tex,
+                 cudaSurfaceObject_t surf,
+                 int width, int height,
+                 float sensor_angle,
+                 float sensor_dist,
+                 float turn_speed,
+                 float move_speed,
+                 float deposit,
+                 unsigned int frame) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n_agents) return;
+
+    int base = i * 3;
+    float x = agents[base + 0];
+    float y = agents[base + 1];
+    float heading = agents[base + 2];
+
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+
+    // Sample the trail at center / left / right of the heading. Normalized
+    // coords (+0.5 texel center) are required for WRAP addressing.
+    float ca = heading;
+    float la = heading - sensor_angle;
+    float ra = heading + sensor_angle;
+
+    float cx = x + cosf(ca) * sensor_dist;
+    float cy = y + sinf(ca) * sensor_dist;
+    float lx = x + cosf(la) * sensor_dist;
+    float ly = y + sinf(la) * sensor_dist;
+    float rx = x + cosf(ra) * sensor_dist;
+    float ry = y + sinf(ra) * sensor_dist;
+
+    float sc = tex2D<float>(tex, (cx + 0.5f) * inv_w, (cy + 0.5f) * inv_h);
+    float sl = tex2D<float>(tex, (lx + 0.5f) * inv_w, (ly + 0.5f) * inv_h);
+    float sr = tex2D<float>(tex, (rx + 0.5f) * inv_w, (ry + 0.5f) * inv_h);
+
+    // Per-agent jitter in [0, 1).
+    unsigned int rng = xorshift32(((unsigned int)i + 1u) * 2654435761u + frame * 40503u);
+    float jitter = (rng & 0xffffffu) / (float)0x1000000;
+
+    // Steer toward the strongest sensor; random turn when ahead is ambiguous.
+    if (sc > sl && sc > sr) {
+        // keep going straight
+    } else if (sc < sl && sc < sr) {
+        // both sides better than center: turn randomly left or right
+        heading += (jitter < 0.5f ? -turn_speed : turn_speed);
+    } else if (sl > sr) {
+        heading -= turn_speed;
+    } else if (sr > sl) {
+        heading += turn_speed;
+    } else {
+        // tie: small random wiggle
+        heading += (jitter - 0.5f) * turn_speed;
+    }
+
+    // Step forward and wrap around the toroidal world.
+    x += cosf(heading) * move_speed;
+    y += sinf(heading) * move_speed;
+
+    float fw = (float)width;
+    float fh = (float)height;
+    if (x < 0.0f) x += fw;
+    if (x >= fw) x -= fw;
+    if (y < 0.0f) y += fh;
+    if (y >= fh) y -= fh;
+
+    agents[base + 0] = x;
+    agents[base + 1] = y;
+    agents[base + 2] = heading;
+
+    // Deposit pheromone at the new integer cell. surf2Dwrite x offset is in
+    // BYTES: single-channel float => x * sizeof(float). Concurrent agents may
+    // race on the same texel; that is acceptable for Physarum.
+    int ix = (int)x;
+    int iy = (int)y;
+    if (ix < 0) ix = 0; else if (ix >= width) ix = width - 1;
+    if (iy < 0) iy = 0; else if (iy >= height) iy = height - 1;
+
+    float prev = surf2Dread<float>(surf, ix * (int)sizeof(float), iy);
+    float val = prev + deposit;
+    if (val > 1.0f) val = 1.0f;
+    surf2Dwrite(val, surf, ix * (int)sizeof(float), iy);
+}
+
+extern "C"
+__global__
+void diffuse_decay(cudaTextureObject_t tex,
+                   cudaSurfaceObject_t surf,
+                   int width, int height,
+                   float decay) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float cx = (x + 0.5f) * inv_w;
+    float cy = (y + 0.5f) * inv_h;
+
+    // 3x3 box blur via LINEAR neighbor taps; WRAP gives toroidal edges.
+    float sum = 0.0f;
+    for (int dy = -1; dy <= 1; ++dy) {
+        for (int dx = -1; dx <= 1; ++dx) {
+            sum += tex2D<float>(tex, cx + dx * inv_w, cy + dy * inv_h);
+        }
+    }
+    float blurred = sum * (1.0f / 9.0f);
+
+    float out = blurred * decay;
+    if (out < 0.0f) out = 0.0f;
+    if (out > 1.0f) out = 1.0f;
+
+    surf2Dwrite(out, surf, x * (int)sizeof(float), y);
+}
+
+// HSV -> RGB (all components in [0, 1]). Standard six-sector conversion; used
+// by colorize to turn the local trail-gradient direction into a hue.
+__device__ __forceinline__ void hsv2rgb(float h, float s, float v,
+                                        float* r, float* g, float* b) {
+    h -= floorf(h);          // wrap hue into [0, 1)
+    float hp = h * 6.0f;
+    int sector = (int)hp;
+    float f = hp - (float)sector;
+    float p = v * (1.0f - s);
+    float q = v * (1.0f - s * f);
+    float t = v * (1.0f - s * (1.0f - f));
+    switch (sector % 6) {
+        case 0:  *r = v; *g = t; *b = p; break;
+        case 1:  *r = q; *g = v; *b = p; break;
+        case 2:  *r = p; *g = v; *b = t; break;
+        case 3:  *r = p; *g = q; *b = v; break;
+        case 4:  *r = t; *g = p; *b = v; break;
+        default: *r = v; *g = p; *b = q; break;
+    }
+}
+
+extern "C"
+__global__
+void colorize(cudaTextureObject_t tex,
+              unsigned char* output,
+              int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float cx = (x + 0.5f) * inv_w;
+    float cy = (y + 0.5f) * inv_h;
+
+    float v = tex2D<float>(tex, cx, cy);
+    if (v < 0.0f) v = 0.0f;
+    if (v > 1.0f) v = 1.0f;
+
+    // Local trail gradient from LINEAR+WRAP neighbor taps (toroidal, no edge
+    // special-casing). Its direction sets the HUE so the network is colored by
+    // the orientation of the veins instead of a single intensity ramp.
+    float l = tex2D<float>(tex, cx - inv_w, cy);
+    float rgt = tex2D<float>(tex, cx + inv_w, cy);
+    float dn = tex2D<float>(tex, cx, cy - inv_h);
+    float up = tex2D<float>(tex, cx, cy + inv_h);
+    float gx = rgt - l;
+    float gy = up - dn;
+    float hue = atan2f(gy, gx) * (0.1591549f) + 0.5f;  // atan2/(2*pi) + 0.5 -> [0,1)
+
+    // Soft glow/bloom: a wider ring of taps lifts a luminous halo around the
+    // veins so they read as glowing rather than flat. Still WRAP-sampled.
+    float bloom = 0.0f;
+    bloom += tex2D<float>(tex, cx - 2.0f * inv_w, cy);
+    bloom += tex2D<float>(tex, cx + 2.0f * inv_w, cy);
+    bloom += tex2D<float>(tex, cx, cy - 2.0f * inv_h);
+    bloom += tex2D<float>(tex, cx, cy + 2.0f * inv_h);
+    bloom += l + rgt + dn + up;
+    bloom *= 0.125f;  // average of the 8 surrounding taps
+
+    // Intensity stays the dominant brightness driver so the reticular structure
+    // survives; gradient magnitude sharpens ridges into bright luminous veins.
+    float grad_mag = sqrtf(gx * gx + gy * gy);
+    float ridge = grad_mag * 6.0f;
+    if (ridge > 1.0f) ridge = 1.0f;
+
+    // Saturation eases toward white on the brightest ridges (neon -> white-hot).
+    float sat = 1.0f - 0.45f * v;
+
+    // Brightness: core intensity (gamma-lifted) + ridge boost + bloom halo.
+    float val = sqrtf(v) + 0.55f * ridge + 0.45f * bloom;
+    if (val > 1.0f) val = 1.0f;
+
+    float r, g, b;
+    hsv2rgb(hue, sat, val, &r, &g, &b);
+
+    // Lift the floor toward a deep blue-violet so empty space is not pure black,
+    // giving the glow something to bleed into.
+    r += 0.02f;
+    g += 0.0f;
+    b += 0.06f;
+    if (r > 1.0f) r = 1.0f;
+    if (g > 1.0f) g = 1.0f;
+    if (b > 1.0f) b = 1.0f;
+
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(r * 255.0f);
+    output[idx + 1] = (unsigned char)(g * 255.0f);
+    output[idx + 2] = (unsigned char)(b * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+# Vertex stage: emits `position` unchanged as the clip-space position (z = 0,
+# w = 1) and forwards `texcoord` to the fragment stage.
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+# Fragment stage: outputs the color sampled from `tex` at the interpolated
+# texture coordinate.
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+# Script entry point: run the demo only when this file is executed directly,
+# not when it is imported.
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_reaction_diffusion.py b/cuda_core/examples/gl_interop_reaction_diffusion.py
new file mode 100644
index 00000000000..2c53f39f641
--- /dev/null
+++ b/cuda_core/examples/gl_interop_reaction_diffusion.py
@@ -0,0 +1,727 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.CUDAArray, TextureObject, and SurfaceObject
+# in combination with GraphicsResource for CUDA/OpenGL interop. A Gray-Scott
+# reaction-diffusion simulation is ping-ponged between two CUDA arrays each
+# frame: a TextureObject provides smooth (LINEAR + WRAP) sampled reads, and a
+# SurfaceObject provides typed writes. The final state is colorized straight
+# into an OpenGL PBO. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to allocate a CUDA CUDAArray with `is_surface_load_store=True` so the same
+#   memory can be bound as both a TextureObject (for sampled reads) and a
+#   SurfaceObject (for typed writes).
+# - How to use FilterMode.LINEAR + AddressMode.WRAP + normalized coordinates
+#   to get free hardware bilinear interpolation on a toroidal world.
+# - How to compose CUDAArray/TextureObject/SurfaceObject with GraphicsResource so
+#   the entire simulation never leaves the GPU.
+#
+# How it works
+# ============
+# Gray-Scott is a two-species (U, V) reaction-diffusion system. At each cell
+# the rule is roughly:
+#
+#     du/dt = Du * laplacian(u) - u*v*v + F*(1 - u)
+#     dv/dt = Dv * laplacian(v) + u*v*v - (F + k)*v
+#
+# Different choices of F and k yield strikingly different patterns: coral,
+# mitosis, spots, and many more. We pack (U, V) into the two channels of a
+# `float2` CUDAArray.
+#
+#   PING-PONG (two arrays, swap each step)
+#   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   +--------------+   tex2D<float2>   +--------------+
+#   |   arr_a      | ----------------> |              |
+#   | (U, V) state |                   |  gray_scott  |
+#   +--------------+                   |    kernel    |
+#                                      |              |
+#   +--------------+   surf2Dwrite     |              |
+#   |   arr_b      | <---------------- |              |
+#   | (U, V) state |                   +--------------+
+#   +--------------+
+#       (swap)
+#
+# Each frame we do N_STEPS iterations of the kernel above, then run a separate
+# `colorize` kernel that samples V from the final state and writes RGBA bytes
+# straight into the OpenGL PBO via GraphicsResource. No data ever travels
+# across the PCIe bus during the frame.
+#
+# Why LINEAR + WRAP + normalized coords?
+# --------------------------------------
+# Addressing modes WRAP and MIRROR are only supported with normalized
+# coordinates (see the CUDA Programming Guide and the SDK's
+# simplePitchLinearTexture sample). We use WRAP so that neighbor lookups at
+# the image edge automatically wrap around -- i.e. a torus. LINEAR filtering
+# is essentially free on the hardware. We sample at the texel center
+# `(x + 0.5) / W` and step by exactly one texel (1 / W), so every tap lands
+# on a texel center and the bilinear blend returns that texel unchanged.
+#
+# Channel byte width in surf2Dwrite
+# ---------------------------------
+# `surf2Dwrite` takes the x coordinate in BYTES, not in elements. For a
+# `float2` surface that means `x * sizeof(float2)` = `x * 8`. Getting this
+# wrong silently corrupts every other column.
+#
+# What you should see
+# ===================
+# A window showing animated, organic-looking patterns growing and dividing
+# (think coral, spots, or mitosing cells). Press 1/2/3 to switch presets,
+# R to reseed, and Escape to exit. The window title shows the current FPS
+# and active preset.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Simulation parameters (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 512  # simulation grid and window width, in pixels
+HEIGHT = 512  # simulation grid and window height, in pixels
+N_STEPS = 8  # Gray-Scott iterations per displayed frame
+DU = 0.16  # diffusion rate for U
+DV = 0.08  # diffusion rate for V
+DT = 1.0  # time step (Gray-Scott is stable at 1.0 with these D's)
+
+# Named presets, keyed by the number key that selects them: (F, k, label)
+# tuples. F is the feed rate, k is the kill rate, label names the pattern.
+PRESETS = {
+    "1": (0.0545, 0.062, "coral"),
+    "2": (0.0367, 0.0649, "mitosis"),
+    "3": (0.030, 0.062, "spots"),
+}
+DEFAULT_PRESET = "1"  # preset active at startup
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# CUDAArray/TextureObject/SurfaceObject, skip ahead to main() -- the interesting
+# part is there. These helpers exist so that main() reads like a short story
+# instead of a wall of boilerplate.
+# ============================================================================
+
+
+def setup_cuda():
+    """Compile the CUDA kernels and return (device, stream, kernels, configs).
+
+    `kernels` and `configs` are dicts keyed by "step", "colorize", and "seed".
+    """
+    dev = Device(0)
+    dev.set_current()
+
+    # SurfaceObject requires surface load/store, which has existed since SM 2.0,
+    # but bindless surface objects (cuSurfObjectCreate) require SM 3.0+.
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            "This example requires a GPU with compute capability >= 3.0 for "
+            f"bindless surface objects. Found sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    stream = dev.create_stream()
+
+    # Compile as C++ so the templated tex2D<float2> overload resolves.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("gray_scott_step", "colorize", "seed_initial"),
+    )
+
+    kernels = {
+        "step": mod.get_kernel("gray_scott_step"),
+        "colorize": mod.get_kernel("colorize"),
+        "seed": mod.get_kernel("seed_initial"),
+    }
+
+    block = (16, 16, 1)
+    grid = (  # ceil-divide so the grid covers every pixel
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+    # All three kernels are pixel-parallel over a WIDTH x HEIGHT grid, so they
+    # can share a launch config.
+    configs = {"step": config, "colorize": config, "seed": config}
+
+    return dev, stream, kernels, configs
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet); exit with status 1 if pyglet is missing."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core CUDAArray/Texture/Surface - Gray-Scott Reaction Diffusion",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    This sets up a shader program, a fullscreen quad, and an empty
+    `width` x `height` RGBA8 texture. None of this is CUDA-specific -- it's
+    standard OpenGL boilerplate for rendering a textured quad.
+
+    Returns (shader_program, vertex_array_id, texture_id). The shader_program
+    is a pyglet ShaderProgram object (must be kept alive).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    # Shader program -- just passes texture coordinates through
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles covering the entire window)
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            -1,
+            1,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            -1,
+            0,
+            0,
+            1,
+            1,
+            1,
+            1,
+            -1,
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each = 16 bytes per vertex
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Empty texture (will be filled each frame from the PBO)
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate the Pixel Buffer Object (PBO) shared between CUDA and OpenGL.
+
+    OpenGL reads from a PBO when uploading pixels to a texture; once the buffer
+    is also registered with CUDA, a kernel can write its output straight into it.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    target = gl.GL_PIXEL_UNPACK_BUFFER
+    name = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(name))
+    gl.glBindBuffer(target, name.value)
+    size_in_bytes = width * height * 4  # four channels (RGBA), one byte each
+    gl.glBufferData(target, size_in_bytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(target, 0)
+    return name.value, size_in_bytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Upload the PBO contents into the GL texture (GPU-to-GPU), then unbind the PBO."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # None = read from the currently bound PBO, not from CPU
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw `tex_id` on the fullscreen quad with `shader_prog`, then unbind the VAO and program."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = two triangles
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+def make_state_arrays():
+    """Create the pair of ping-pong arrays storing the (U, V) simulation state.
+
+    Both arrays are WIDTH x HEIGHT with two FLOAT32 channels (`float2` in the
+    kernels) and are created with is_surface_load_store=True.
+    """
+    descriptor = {
+        "shape": (WIDTH, HEIGHT),
+        "format": ArrayFormat.FLOAT32,
+        "num_channels": 2,
+        "is_surface_load_store": True,
+    }
+    first = CUDAArray.from_descriptor(**descriptor)
+    second = CUDAArray.from_descriptor(**descriptor)
+    return first, second
+
+
+def make_texture(arr):
+    """Return a TextureObject over `arr`: WRAP addressing, LINEAR filtering, normalized coords."""
+    source = ResourceDescriptor.from_array(arr)
+    sampling = TextureDescriptor(
+        address_mode=AddressMode.WRAP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        # Normalized coordinates are mandatory for WRAP/MIRROR addressing.
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=source, texture_descriptor=sampling)
+
+
+def seed_state(stream, kernels, configs, write_surf, seed_value):
+    """Launch the seed kernel to reset the array behind `write_surf`.
+
+    `write_surf` must be a long-lived SurfaceObject: `launch` is asynchronous,
+    so a surface created in a `with` block that exits right after `launch`
+    returns would be destroyed before the kernel has actually run against it.
+    """
+    seed_config = configs["seed"]
+    seed_kernel = kernels["seed"]
+    # Argument order matches the kernel: (surface handle, width, height, seed).
+    kernel_args = (
+        np.uint64(write_surf.handle),
+        np.int32(WIDTH),
+        np.int32(HEIGHT),
+        np.uint32(seed_value),
+    )
+    launch(stream, seed_config, seed_kernel, *kernel_args)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Run the interactive Gray-Scott demo: set up CUDA and OpenGL, then enter pyglet's event loop."""
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels, configs = setup_cuda()
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources for drawing a texture to screen ---
+    #     (Standard OpenGL boilerplate -- not CUDA-specific.)
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    #     The PBO is GPU memory owned by OpenGL. It's the bridge between the
+    #     two worlds: CUDA writes into it, OpenGL reads from it.
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the two ping-pong state Arrays ---
+    #     Both are `float2` (channel 0 = U, channel 1 = V) with
+    #     is_surface_load_store=True so they can be bound as SurfaceObjects.
+    arr_a, arr_b = make_state_arrays()
+
+    # --- Step 7: Pre-create the four bindless handles ---
+    #     Doing this once is much cheaper than recreating them every step.
+    #     We keep both texture and surface handles for each array; the
+    #     simulation loop just picks which pair to use.
+    tex_a = make_texture(arr_a)
+    tex_b = make_texture(arr_b)
+    surf_a = SurfaceObject.from_array(arr_a)
+    surf_b = SurfaceObject.from_array(arr_b)
+
+    # --- Step 8: Seed the initial state into arr_a (writes via surf_a) ---
+    seed_state(stream, kernels, configs, surf_a, seed_value=0)
+    # After seeding, `arr_a` is the "current" state.
+    state = {"current": "a", "preset": DEFAULT_PRESET, "seed": 0}
+
+    # --- Step 9: Render loop ---
+    frame_count = 0
+    fps_time = time.monotonic()
+
+    def current_read_write():
+        if state["current"] == "a":
+            return tex_a, surf_b, "b"  # read a, write b, next current = b
+        return tex_b, surf_a, "a"
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+            return
+        if symbol == key.R:
+            state["seed"] += 1
+            seed_state(stream, kernels, configs, surf_a, seed_value=state["seed"])
+            state["current"] = "a"
+            return
+        for digit_key, name in (
+            (key._1, "1"),
+            (key._2, "2"),
+            (key._3, "3"),
+        ):
+            if symbol == digit_key:
+                state["preset"] = name
+                return
+
+    @window.event
+    def on_draw():
+        nonlocal frame_count, fps_time
+
+        window.clear()
+        f, k, _label = PRESETS[state["preset"]]
+
+        # (a) Run N_STEPS Gray-Scott iterations. Each step reads from one
+        #     array via a TextureObject (LINEAR + WRAP gives wrapping +
+        #     bilinear sampling) and writes to the other via a SurfaceObject.
+        for _ in range(N_STEPS):
+            tex_read, surf_write, next_current = current_read_write()
+            launch(
+                stream,
+                configs["step"],
+                kernels["step"],
+                np.uint64(tex_read.handle),
+                np.uint64(surf_write.handle),
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.float32(DU),
+                np.float32(DV),
+                np.float32(f),
+                np.float32(k),
+                np.float32(DT),
+            )
+            state["current"] = next_current
+
+        # (b) Colorize the latest state into the OpenGL PBO.
+        tex_read = tex_a if state["current"] == "a" else tex_b
+        with resource.map(stream=stream) as buf:
+            launch(
+                stream,
+                configs["colorize"],
+                kernels["colorize"],
+                np.uint64(tex_read.handle),
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+            )
+        # Unmap happens automatically when the `with` block exits.
+
+        # (c) Tell OpenGL to copy the PBO contents into our texture.
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+        # (d) Draw the texture to the screen.
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # FPS counter (shown in window title)
+        frame_count += 1
+        now = time.monotonic()
+        if now - fps_time >= 1.0:
+            fps = frame_count / (now - fps_time)
+            label = PRESETS[state["preset"]][2]
+            window.set_caption(
+                "cuda.core CUDAArray/Texture/Surface - Gray-Scott"
+                f" [{label}] ({WIDTH}x{HEIGHT}, {fps:.0f} FPS,"
+                f" {N_STEPS} steps/frame)"
+            )
+            frame_count = 0
+            fps_time = now
+
+    @window.event
+    def on_close():
+        # Launches are asynchronous: drain the stream before destroying handles
+        # that queued kernels may still use. Arrays are closed after their views.
+        stream.sync()
+        resource.close()
+        tex_a.close()
+        tex_b.close()
+        surf_a.close()
+        surf_b.close()
+        arr_a.close()
+        arr_b.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# These source strings are kept at the bottom of the file so they don't
+# distract from the Python logic above. The important things to know:
+#
+#   - KERNEL_SOURCE contains three CUDA C++ kernels:
+#       * seed_initial   -- sets initial (U, V) state via SurfaceObject writes
+#       * gray_scott_step -- reads previous state via TextureObject (with
+#                            LINEAR + WRAP bilinear filtering) and writes the
+#                            next state via SurfaceObject. Coordinates are
+#                            normalized to [0, 1] because WRAP requires it.
+#       * colorize       -- reads the V channel via TextureObject and writes
+#                            RGBA bytes into the OpenGL PBO using a simple
+#                            three-stop "magma-ish" gradient.
+#
+#   - VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are GLSL. They draw a
+#     texture onto a rectangle covering the entire window. Nothing interesting.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// The host passes the texture dimensions as integers; each kernel derives
+// the inverse width/height itself so integer pixel coordinates can be
+// converted to normalized texture coordinates with a single multiply.
+
+extern "C"
+__global__
+void seed_initial(cudaSurfaceObject_t surf,
+                  int width, int height,
+                  unsigned int seed) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Background is U = 1, V = 0; inside a 40x40 centered square U = 0.5 and
+    // V = 1. A small seed-dependent jitter on V breaks symmetry per reseed.
+    float u = 1.0f;
+    float v = 0.0f;
+
+    int half_w = width / 2;
+    int half_h = height / 2;
+    if (x >= half_w - 20 && x < half_w + 20 &&
+        y >= half_h - 20 && y < half_h + 20) {
+        v = 1.0f;
+        // Knock U down a bit inside the seed square so V can grow.
+        u = 0.5f;
+    }
+
+    // Cheap deterministic noise: multiply/xor-shift hash of (x, y, seed).
+    unsigned int h = (unsigned int)x * 374761393u +
+                     (unsigned int)y * 668265263u + seed * 2246822519u;
+    h = (h ^ (h >> 13)) * 1274126177u;
+    h = h ^ (h >> 16);
+    float noise = (h & 0xffffu) / 65535.0f;   // in [0, 1]
+    v += 0.02f * (noise - 0.5f);              // jitter in [-0.01, +0.01]
+    if (v < 0.0f) v = 0.0f;
+    if (v > 1.0f) v = 1.0f;
+
+    // float2 is 8 bytes; surf2Dwrite takes the x offset in BYTES.
+    surf2Dwrite(make_float2(u, v), surf, x * (int)sizeof(float2), y);
+}
+
+extern "C"
+__global__
+void gray_scott_step(cudaTextureObject_t tex,
+                     cudaSurfaceObject_t surf,
+                     int width, int height,
+                     float Du, float Dv,
+                     float F, float k_kill,
+                     float dt) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // Normalized coordinates: WRAP addressing only works in normalized mode.
+    // Each texel center sits at ((i + 0.5) / W, (j + 0.5) / H).
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float cx = (x + 0.5f) * inv_w;
+    float cy = (y + 0.5f) * inv_h;
+
+    // 5-point Laplacian stencil: center c plus neighbors l, r, u_n, d_n.
+    // The offsets land exactly on neighboring texel centers, so LINEAR
+    // filtering adds nothing here; WRAP makes the edges periodic (a torus).
+    float2 c = tex2D<float2>(tex, cx, cy);
+    float2 l = tex2D<float2>(tex, cx - inv_w, cy);
+    float2 r = tex2D<float2>(tex, cx + inv_w, cy);
+    float2 u_n = tex2D<float2>(tex, cx, cy - inv_h);
+    float2 d_n = tex2D<float2>(tex, cx, cy + inv_h);
+
+    float lap_u = (l.x + r.x + u_n.x + d_n.x) - 4.0f * c.x;
+    float lap_v = (l.y + r.y + u_n.y + d_n.y) - 4.0f * c.y;
+
+    float u = c.x;   // channel 0 holds U
+    float v = c.y;   // channel 1 holds V
+    float uvv = u * v * v;
+
+    float du = Du * lap_u - uvv + F * (1.0f - u);
+    float dv = Dv * lap_v + uvv - (F + k_kill) * v;
+
+    float new_u = u + dt * du;   // explicit (forward) Euler update
+    float new_v = v + dt * dv;
+
+    // Clamp to keep things numerically sane after long runs.
+    if (new_u < 0.0f) new_u = 0.0f;
+    if (new_u > 1.0f) new_u = 1.0f;
+    if (new_v < 0.0f) new_v = 0.0f;
+    if (new_v > 1.0f) new_v = 1.0f;
+
+    surf2Dwrite(make_float2(new_u, new_v), surf,
+                x * (int)sizeof(float2), y);   // x offset is in bytes
+}
+
+extern "C"
+__global__
+void colorize(cudaTextureObject_t tex,
+              unsigned char* output,
+              int width, int height) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    float inv_w = 1.0f / (float)width;
+    float inv_h = 1.0f / (float)height;
+    float cx = (x + 0.5f) * inv_w;   // normalized texel-center coordinates
+    float cy = (y + 0.5f) * inv_h;
+
+    float2 c = tex2D<float2>(tex, cx, cy);
+    float v = c.y;   // only the V channel drives the color
+    if (v < 0.0f) v = 0.0f;
+    if (v > 1.0f) v = 1.0f;
+
+    // Three-stop "magma-ish" gradient: dark purple -> orange -> pale yellow.
+    // Implemented as two linear interpolations stitched together at v = 0.5
+    // so the result is reasonably perceptually smooth without a lookup table.
+    float r, g, b;
+    if (v < 0.5f) {
+        float t = v * 2.0f;                  // [0, 1] over the low half
+        r = 0.05f + t * (0.85f - 0.05f);
+        g = 0.02f + t * (0.30f - 0.02f);
+        b = 0.20f + t * (0.10f - 0.20f);
+    } else {
+        float t = (v - 0.5f) * 2.0f;         // [0, 1] over the high half
+        r = 0.85f + t * (1.00f - 0.85f);
+        g = 0.30f + t * (0.95f - 0.30f);
+        b = 0.10f + t * (0.70f - 0.10f);
+    }
+
+    int idx = (y * width + x) * 4;   // 4 bytes (RGBA) per pixel, row-major
+    output[idx + 0] = (unsigned char)(r * 255.0f);
+    output[idx + 1] = (unsigned char)(g * 255.0f);
+    output[idx + 2] = (unsigned char)(b * 255.0f);
+    output[idx + 3] = 255;   // fully opaque alpha
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_sdf_volume.py b/cuda_core/examples/gl_interop_sdf_volume.py
new file mode 100644
index 00000000000..20ecadb2244
--- /dev/null
+++ b/cuda_core/examples/gl_interop_sdf_volume.py
@@ -0,0 +1,843 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core's 3D CUDAArray + trilinear TextureObject by
+# baking a procedural Signed Distance Field (SDF) volume once at startup and
+# then ray-marching it every frame to render an orbitable 3D scene. The
+# SurfaceObject is used during the one-shot bake; the TextureObject (with
+# LINEAR + CLAMP + normalized coords) drives the per-frame ray march. The
+# whole pipeline stays on the GPU through GraphicsResource. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# - How to allocate a 3D cuda.core.CUDAArray (cuArray3DCreate under the hood) and
+#   bind it as both a SurfaceObject (for one-shot kernel writes) and a
+#   TextureObject (for hardware-accelerated trilinear sampling).
+# - How to ray-march a baked SDF volume from a CUDA kernel, sampling via
+#   tex3D<float> and writing pixels straight into an OpenGL PBO.
+# - How to wire mouse + keyboard input into a pyglet/cuda.core interop loop.
+#
+# How it works
+# ============
+# The signed distance field of a "gyroid intersected with a sphere" is baked
+# once into a 128 x 128 x 128 single-channel float volume:
+#
+#     gyroid(p)   = sin(p.x*tau)cos(p.y*tau)
+#                 + sin(p.y*tau)cos(p.z*tau)
+#                 + sin(p.z*tau)cos(p.x*tau)
+#     sdf_gyroid  = |gyroid(p)| - 0.20         # slab around the gyroid surface
+#     sdf_sphere  = length(p) - 0.9            # bounding sphere
+#     sdf(p)      = max(sdf_gyroid, sdf_sphere) # CSG intersection
+#
+# where p in [-1, 1]^3 is the voxel's world-space position and
+# tau = 2*pi*3 (three gyroid periods per world unit).
+#
+# Each frame, the render kernel emits one ray per pixel from an orbiting
+# camera, marches the volume in fixed half-voxel steps (at most 256 of them),
+# and on intersection computes a normal by central differences of tex3D, then
+# applies a simple diffuse + ambient + specular shade. Misses fall back to a
+# vertical sky gradient.
+#
+#   STARTUP (one-shot bake)
+#   ~~~~~~~~~~~~~~~~~~~~~~~
+#   1. Allocate 3D CUDAArray (128^3, FLOAT32 x1, is_surface_load_store=True).
+#   2. Bind it as a SurfaceObject.
+#   3. Launch `bake_sdf`: one thread per voxel writes the SDF via surf3Dwrite.
+#   4. Close the SurfaceObject; the CUDAArray stays alive.
+#
+#   EACH FRAME
+#   ~~~~~~~~~~
+#   1. resource.map() -> CUDA device pointer into the OpenGL PBO.
+#   2. Launch `render_sdf` (one thread per pixel). It samples the SDF via the
+#      long-lived TextureObject (LINEAR + CLAMP + normalized coords) using
+#      tex3D<float>. RGBA8 lands directly in the PBO.
+#   3. Unmap, GPU-side copy PBO -> texture, draw fullscreen quad.
+#
+# Controls
+# ========
+#   Left mouse drag    orbit camera (dx -> yaw, dy -> pitch)
+#   Mouse wheel        zoom (camera distance)
+#   R                  reset camera (yaw=0, pitch=0.3, dist=2.5)
+#   Escape / close     quit
+#
+# The window title shows yaw, pitch, distance, FPS, and ms/frame.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Configuration (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 800  # window and PBO width, in pixels
+HEIGHT = 600  # window and PBO height, in pixels
+# 128^3 voxels; bake cost is one-shot. The render kernel's STEP and NEPS
+# constants are written in terms of 128, so revisit them if you change this.
+VOLUME_SIZE = 128
+
+# Camera defaults / clamps. Angles are in radians (the render kernel feeds
+# them to cosf/sinf); distances are in the same world units as the
+# [-1, 1]^3 volume.
+RESET_YAW = 0.0
+RESET_PITCH = 0.3
+RESET_DIST = 2.5
+PITCH_MIN = -1.45  # stay inside (-pi/2, pi/2) so the up-vector stays sane.
+PITCH_MAX = 1.45
+DIST_MIN = 1.2
+DIST_MAX = 8.0
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL. If you're here to learn about
+# 3D CUDAArray / TextureObject / SurfaceObject, skip ahead to main() -- the
+# interesting part is there. These helpers exist so that main() reads like a
+# short story instead of a wall of boilerplate.
+# ============================================================================
+
+
+def _check_compute_capability(dev):
+    """3D arrays + bindless surface/texture objects require sm_30+."""
+    cc = dev.compute_capability
+    if cc.major < 3:
+        print(
+            f"This example requires compute capability >= 3.0, got sm_{cc.major}{cc.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+
+def setup_cuda():
+    """Compile the two kernels and return (device, stream, kernels)."""
+    dev = Device(0)
+    dev.set_current()
+    _check_compute_capability(dev)
+    stream = dev.create_stream()
+
+    # C++ is required so the templated tex3D<float> / surf3Dwrite<float>
+    # overloads resolve. extern "C" on the kernel symbols keeps the function
+    # names unmangled even when the rest of the TU is compiled as C++.
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(KERNEL_SOURCE, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        name_expressions=("bake_sdf", "render_sdf"),
+    )
+    kernels = {
+        "bake": mod.get_kernel("bake_sdf"),
+        "render": mod.get_kernel("render_sdf"),
+    }
+    return dev, stream, kernels
+
+
+def make_volume_array():
+    """Allocate the 3D SDF volume. Single-channel float, surface-capable."""
+    return CUDAArray.from_descriptor(
+        shape=(VOLUME_SIZE, VOLUME_SIZE, VOLUME_SIZE),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        is_surface_load_store=True,
+    )
+
+
+def make_volume_texture(arr):
+    """Bind `arr` as a TextureObject configured for LINEAR + CLAMP + normalized.
+
+    Normalized coords let the kernel sample as (u, v, w) in [0, 1]; CLAMP at
+    the boundaries matches the rendering logic that bails out as soon as the
+    march leaves the volume's [-1, 1]^3 box, so out-of-range sampling never
+    pollutes a real hit.
+    """
+    res_desc = ResourceDescriptor.from_array(arr)
+    tex_desc = TextureDescriptor(
+        address_mode=AddressMode.CLAMP,
+        filter_mode=FilterMode.LINEAR,
+        read_mode=ReadMode.ELEMENT_TYPE,
+        normalized_coords=True,
+    )
+    return TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc)
+
+
+def bake_volume(stream, kernels, arr):
+    """Run the one-shot bake kernel that fills the volume with the SDF.
+
+    The SurfaceObject lives only for the duration of this call; once the bake
+    is enqueued and the kernel has captured the bindless handle into its
+    arguments, we sync the stream before letting the SurfaceObject close.
+    The CUDAArray itself outlives this scope -- it's the long-lived backing store
+    for the render-loop TextureObject.
+    """
+    with SurfaceObject.from_array(arr) as bake_surf:
+        block = (8, 8, 8)
+        grid = (
+            (VOLUME_SIZE + block[0] - 1) // block[0],
+            (VOLUME_SIZE + block[1] - 1) // block[1],
+            (VOLUME_SIZE + block[2] - 1) // block[2],
+        )
+        launch(
+            stream,
+            LaunchConfig(grid=grid, block=block),
+            kernels["bake"],
+            np.uint64(bake_surf.handle),
+            np.int32(VOLUME_SIZE),
+        )
+        # Synchronize before the SurfaceObject context exits so the bindless
+        # handle is still valid while the kernel runs.
+        stream.sync()
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet)."""
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        print(
+            "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption="cuda.core 3D CUDAArray - SDF Volume Ray-Marcher",
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Standard GL boilerplate: shader, fullscreen quad, empty texture.
+
+    Not CUDA-specific; identical to the other gl_interop_* examples.
+
+    `gl` is the pyglet GL module; `width` and `height` size the RGBA8 display
+    texture in pixels. The texture is left bound to GL_TEXTURE_2D on return.
+    Returns (shader_program, vertex_array_id, texture_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Six vertices = two triangles covering the whole [-1, 1]^2 clip-space
+    # square, each vertex stored as four interleaved floats.
+    quad_verts = np.array(
+        [
+            # x,  y,    s, t      (position + texture coordinate)
+            # triangle 1: bottom-left
+            -1,
+            -1,
+            0,
+            0,
+            # triangle 1: bottom-right
+            1,
+            -1,
+            1,
+            0,
+            # triangle 1: top-right
+            1,
+            1,
+            1,
+            1,
+            # triangle 2: bottom-left
+            -1,
+            -1,
+            0,
+            0,
+            # triangle 2: top-right
+            1,
+            1,
+            1,
+            1,
+            # triangle 2: top-left
+            -1,
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    # Vertex array object: bound while the buffer and attribute layout below
+    # are set up, and re-bound later by draw_fullscreen_quad().
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    # Vertex buffer: upload the quad once; it never changes.
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # 4 floats * 4 bytes each
+    # `position` is the first two floats of each vertex (byte offset 0).
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+
+    # `texcoord` is the next two floats (byte offset 8 = 2 floats * 4 bytes).
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))
+
+    gl.glBindVertexArray(0)
+
+    # Display texture: RGBA8 storage of width x height with no initial data
+    # (the `None` pixel pointer); it is filled each frame from the PBO.
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Create a Pixel Buffer Object (PBO) -- the CUDA/GL bridge.
+
+    Returns (pbo_gl_name, size_in_bytes).
+    """
+    pbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(pbo))
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo.value)
+    nbytes = width * height * 4  # RGBA8
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, nbytes, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return pbo.value, nbytes
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy pixel data from the PBO into the GL texture (GPU-to-GPU)."""
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, pbo_id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glTexSubImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        0,
+        0,
+        width,
+        height,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,
+    )
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw the texture to the screen using the fullscreen quad."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)
+    gl.glBindVertexArray(0)
+    gl.glUseProgram(0)
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Bake the SDF volume, open the window, and run the interactive render loop.
+
+    The stream, volume array, texture object and graphics resource are
+    created here and released by the window's on_close handler. The function
+    returns when pyglet.app.run() does.
+    """
+    # --- Step 1: Set up CUDA (compile kernels, create stream) ---
+    dev, stream, kernels = setup_cuda()
+
+    # --- Step 2: Allocate the 3D SDF volume and bake it once ---
+    #     The CUDAArray is the long-lived backing store; it must outlive the
+    #     render loop. The SurfaceObject is only needed for the one-shot bake
+    #     and is closed before we ever bind a TextureObject to the same CUDAArray.
+    arr = make_volume_array()
+    bake_volume(stream, kernels, arr)
+
+    # --- Step 3: Bind the volume as a trilinear TextureObject ---
+    #     LINEAR + CLAMP + normalized_coords gives us free hardware trilinear
+    #     filtering, which is exactly what we want for both the SDF samples
+    #     in the ray march and the normal-finite-difference samples.
+    volume_tex = make_volume_texture(arr)
+
+    # --- Step 4: Open a window and set up the CUDA/GL bridge ---
+    window, gl, pyglet = create_window()
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+    pbo_id, _ = create_pixel_buffer(gl, WIDTH, HEIGHT)
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 5: Render loop state ---
+    # Camera is orbit-style: yaw and pitch are angles, dist is the orbit
+    # radius. The render kernel turns these into a (origin, basis) and
+    # constructs per-pixel rays itself.
+    cam = {
+        "yaw": RESET_YAW,
+        "pitch": RESET_PITCH,
+        "dist": RESET_DIST,
+    }
+    # One-element lists act as mutable cells the nested handlers can update
+    # without `nonlocal`.
+    frame_count = [0]
+    fps_time = [time.monotonic()]
+    last_fps = [0.0]
+    last_frame_ms = [0.0]
+
+    # One thread per pixel; ceil-divide so the grid covers the whole image.
+    block = (16, 16, 1)
+    grid = (
+        (WIDTH + block[0] - 1) // block[0],
+        (HEIGHT + block[1] - 1) // block[1],
+        1,
+    )
+    config = LaunchConfig(grid=grid, block=block)
+
+    @window.event
+    def on_draw():
+        """Render one frame into the PBO, show it, and refresh the FPS caption."""
+        window.clear()
+
+        # (a) Map the PBO so CUDA can write into it.
+        with resource.map(stream=stream) as buf:
+            # (b) Launch the ray-march kernel. The camera params are passed
+            #     as scalars; the kernel computes the orbit eye position and
+            #     per-pixel ray direction itself.
+            launch(
+                stream,
+                config,
+                kernels["render"],
+                buf.handle,
+                np.int32(WIDTH),
+                np.int32(HEIGHT),
+                np.uint64(volume_tex.handle),
+                np.float32(cam["yaw"]),
+                np.float32(cam["pitch"]),
+                np.float32(cam["dist"]),
+            )
+        # (c) Unmap happens automatically; cuGraphicsUnmapResources serializes
+        #     the CUDA work against subsequent OpenGL use.
+
+        copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+        draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+        # Recompute FPS and update the caption at most every half second.
+        frame_count[0] += 1
+        now = time.monotonic()
+        if now - fps_time[0] >= 0.5:
+            last_fps[0] = frame_count[0] / (now - fps_time[0])
+            last_frame_ms[0] = 1000.0 / last_fps[0] if last_fps[0] > 0 else 0.0
+            frame_count[0] = 0
+            fps_time[0] = now
+            window.set_caption(
+                "cuda.core 3D CUDAArray - SDF Volume Ray-Marcher  "
+                f"yaw={cam['yaw']:+.2f} pitch={cam['pitch']:+.2f} "
+                f"dist={cam['dist']:.2f}  "
+                f"{last_fps[0]:.0f} FPS  {last_frame_ms[0]:.2f} ms/frame"
+            )
+
+    @window.event
+    def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
+        """Orbit the camera while the left mouse button is held."""
+        # Left-click drag orbits the camera. dx -> yaw (sign convention chosen
+        # so that dragging right rotates the scene right); dy -> pitch (drag
+        # up tilts the camera up).
+        if not (buttons & pyglet.window.mouse.LEFT):
+            return
+        orbit_scale = 0.005
+        cam["yaw"] += dx * orbit_scale
+        cam["pitch"] += dy * orbit_scale
+        # Clamp pitch so the up-vector never flips (we use world-up (0,1,0)).
+        if cam["pitch"] < PITCH_MIN:
+            cam["pitch"] = PITCH_MIN
+        elif cam["pitch"] > PITCH_MAX:
+            cam["pitch"] = PITCH_MAX
+
+    @window.event
+    def on_mouse_scroll(_x, _y, _scroll_x, scroll_y):
+        """Zoom by scaling the orbit distance, clamped to [DIST_MIN, DIST_MAX]."""
+        # Scroll wheel zoom: geometric so each tick feels uniform regardless
+        # of current distance. Positive scroll_y (wheel up) zooms in.
+        if scroll_y == 0:
+            return
+        cam["dist"] *= 0.9**scroll_y
+        if cam["dist"] < DIST_MIN:
+            cam["dist"] = DIST_MIN
+        elif cam["dist"] > DIST_MAX:
+            cam["dist"] = DIST_MAX
+
+    @window.event
+    def on_key_press(symbol, _modifiers):
+        """Escape closes the window; R restores the default camera."""
+        key = pyglet.window.key
+        if symbol == key.ESCAPE:
+            window.close()
+        elif symbol == key.R:
+            cam["yaw"] = RESET_YAW
+            cam["pitch"] = RESET_PITCH
+            cam["dist"] = RESET_DIST
+
+    @window.event
+    def on_close():
+        """Close the CUDA-side objects created in main()."""
+        # Release CUDA resources in reverse construction order. The GL
+        # objects clean up via pyglet on window close.
+        resource.close()
+        volume_tex.close()
+        arr.close()
+        stream.close()
+
+    pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# Two CUDA C++ kernels are concatenated into one program string so they share
+# a single NVRTC compile.
+#
+#   bake_sdf    -- one thread per voxel. Computes the SDF of an
+#                  "abs(gyroid) - 0.20" surface intersected with a bounding
+#                  sphere, then writes the scalar via surf3Dwrite. NOTE:
+#                  surf3Dwrite's x coordinate is in BYTES, y and z in
+#                  elements -- a classic CUDA gotcha.
+#
+#   render_sdf  -- one thread per screen pixel. Builds the orbit-camera ray,
+#                  fixed-step-marches the volume via tex3D<float> on a trilinear-
+#                  filtered, normalized-coord TextureObject, and shades the
+#                  hit with diffuse + ambient + specular. Misses return a
+#                  sky gradient. Writes RGBA8 directly into the OpenGL PBO.
+#
+# GLSL shaders at the very bottom just draw a textured quad. Nothing CUDA-
+# specific there.
+#
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+// --------------------------------------------------------------------------
+// Small inline helpers. Keeping them __device__ __forceinline__ encourages
+// the compiler to drop them inline and avoids any cross-TU linkage worries.
+// --------------------------------------------------------------------------
+// Clamp v to [a, b]; every caller in this file passes a = 0, b = 1.
+__device__ __forceinline__ float clampf(float v, float a, float b) {
+    return fminf(fmaxf(v, a), b);
+}
+
+// Dot product of the 3-vectors (ax, ay, az) and (bx, by, bz), passed as
+// separate scalar components.
+__device__ __forceinline__ float dot3(float ax, float ay, float az,
+                                      float bx, float by, float bz) {
+    return ax * bx + ay * by + az * bz;
+}
+
+// Euclidean length of the 3-vector (x, y, z).
+__device__ __forceinline__ float length3(float x, float y, float z) {
+    return sqrtf(x * x + y * y + z * z);
+}
+
+// --------------------------------------------------------------------------
+// bake_sdf: one thread per voxel writes the SDF of a gyroid-intersect-sphere
+//           into a single-channel float 3D CUDAArray via a SurfaceObject.
+//
+//   surf is bound to a (size^3, FLOAT32 x 1) CUDAArray allocated with
+//   is_surface_load_store=True.
+//   size is the edge length of the volume in voxels; threads whose index
+//   falls outside [0, size) on any axis return without writing.
+//   surf3Dwrite's x coordinate is in BYTES (multiply by sizeof(float));
+//   y and z are in elements. Getting the byte conversion wrong misplaces
+//   the writes, so it's worth flagging explicitly.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void bake_sdf(cudaSurfaceObject_t surf, int size) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    int z = blockIdx.z * blockDim.z + threadIdx.z;
+    if (x >= size || y >= size || z >= size) return;
+
+    // Map the voxel index to world-space p in [-1, 1]^3 (texel centers).
+    float fx = ((float)x + 0.5f) / (float)size;
+    float fy = ((float)y + 0.5f) / (float)size;
+    float fz = ((float)z + 0.5f) / (float)size;
+    float px = fx * 2.0f - 1.0f;
+    float py = fy * 2.0f - 1.0f;
+    float pz = fz * 2.0f - 1.0f;
+
+    // Gyroid frequency: TAU = 2 * pi * 3, i.e. 3 periods per world unit and
+    // therefore 6 across [-1, 1]. That gives a busy but not noisy surface
+    // at 128^3 resolution.
+    const float TAU = 6.2831853071795864f * 3.0f;
+
+    float sx = sinf(px * TAU), cx = cosf(px * TAU);
+    float sy = sinf(py * TAU), cy = cosf(py * TAU);
+    float sz = sinf(pz * TAU), cz = cosf(pz * TAU);
+    float gyroid     = sx * cy + sy * cz + sz * cx;
+    // Slab thickness: the gyroid field is not 1-Lipschitz (its gradient
+    // scales with TAU ~= 19), so the stored values along the surface are
+    // dense but unreliable as a true distance metric. A wider slab (0.20 vs
+    // the canonical 0.05) gives the fixed-step ray marcher in render_sdf
+    // enough hit candidates per ray to render real geometry instead of
+    // mostly sky.
+    float sdf_gyroid = fabsf(gyroid) - 0.20f;          // slab around iso-zero
+    float sdf_sphere = length3(px, py, pz) - 0.9f;     // bounding sphere
+    float sdf        = fmaxf(sdf_gyroid, sdf_sphere);  // CSG intersection
+
+    // surf3Dwrite: x in BYTES, y/z in elements. sizeof is cast to int so
+    // the product stays a signed int instead of being promoted to size_t.
+    surf3Dwrite<float>(sdf, surf, x * (int)sizeof(float), y, z);
+}
+
+// --------------------------------------------------------------------------
+// SDF sampler: tex3D wants normalized coords in [0, 1]; the volume covers
+// [-1, 1] in world space, so we remap with `(p + 1) * 0.5`. Returns the
+// value tex3D produces for the stored field at that point (the host binds
+// the texture with LINEAR filtering and CLAMP addressing). The stored
+// values are nominally distances in world units, but the gyroid term is
+// not a true distance.
+// --------------------------------------------------------------------------
+__device__ __forceinline__ float sample_sdf(cudaTextureObject_t tex,
+                                            float px, float py, float pz) {
+    return tex3D<float>(tex,
+                        (px + 1.0f) * 0.5f,
+                        (py + 1.0f) * 0.5f,
+                        (pz + 1.0f) * 0.5f);
+}
+
+// --------------------------------------------------------------------------
+// render_sdf: one thread per screen pixel. Builds the orbit camera, marches
+// a ray through the SDF volume, and writes a shaded RGBA8 pixel to the PBO.
+//
+// Parameters:
+//   output        RGBA8 pixel buffer, 4 bytes per pixel, row-major
+//   width/height  image size in pixels
+//   tex           texture object sampling the baked SDF volume
+//   yaw/pitch     orbit angles in radians
+//   dist          orbit radius in world units
+//
+// Camera math (orbit, look-at origin, world-up (0, 1, 0)):
+//   eye = dist * (cos(pitch)*cos(yaw), sin(pitch), cos(pitch)*sin(yaw))
+//   fwd = normalize(target - eye)         (target = origin)
+//   right = normalize(cross(fwd, up))
+//   up'   = cross(right, fwd)
+//   For a pixel at (u, v) in NDC ([-1, 1] x [-1, 1] with v=1 at the top),
+//   dir = normalize(fwd + tan(fov/2) * (aspect * u * right + v * up'))
+//
+// Ray-march:
+//   Fixed-step march: t += STEP, where STEP = 1/128 world units. The volume
+//   spans 2 world units per axis, so at 128^3 one voxel is 1/64 and STEP is
+//   half a voxel. The stored gyroid field is not 1-Lipschitz, which makes
+//   classical sphere tracing (t += sdf(p)) overshoot through thin slabs and
+//   miss almost every ray. A uniform sub-voxel step is robust and cheap
+//   because the SDF is just a tex3D lookup. We declare a HIT when
+//   sdf < HIT_EPS.
+//
+// Bounds bail: outside the [-1, 1]^3 box, return the sky.
+// Normal: 6-sample central differences with eps = 1.5/128 world units
+//         (0.75 voxel), so each +/- pair of samples is 1.5 voxels apart --
+//         short enough to capture local surface direction, long enough that
+//         trilinear filtering actually moves the result.
+// --------------------------------------------------------------------------
+extern "C" __global__
+void render_sdf(unsigned char* output,
+                int width,
+                int height,
+                cudaTextureObject_t tex,
+                float yaw,
+                float pitch,
+                float dist) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= width || y >= height) return;
+
+    // ---- Build the orbit camera basis ----------------------------------
+    float cp = cosf(pitch), sp = sinf(pitch);
+    float cy = cosf(yaw),   sy = sinf(yaw);
+
+    // Eye on a sphere of radius `dist` around the origin.
+    float ex = dist * cp * cy;
+    float ey = dist * sp;
+    float ez = dist * cp * sy;
+
+    // fwd = normalize(target - eye), target = origin -> fwd = -eye / |eye|.
+    float fl = length3(ex, ey, ez);
+    // Guard against the (clamped) dist being zero (not reachable, but cheap).
+    if (fl < 1e-6f) fl = 1e-6f;
+    float fx = -ex / fl, fy = -ey / fl, fz = -ez / fl;
+
+    // right = normalize(cross(fwd, world_up)), world_up = (0, 1, 0).
+    // cross((fx,fy,fz), (0,1,0)) = (fy*0 - fz*1, fz*0 - fx*0, fx*1 - fy*0)
+    //                            = (-fz, 0, fx)
+    float rx = -fz;
+    float ry = 0.0f;
+    float rz = fx;
+    float rl = length3(rx, ry, rz);
+    if (rl < 1e-6f) rl = 1e-6f;
+    rx /= rl; ry /= rl; rz /= rl;
+
+    // up' = cross(right, fwd). With right purely in the xz-plane, this is a
+    // proper orthonormal up; recompute to keep the basis consistent.
+    float ux = ry * fz - rz * fy;
+    float uy = rz * fx - rx * fz;
+    float uz = rx * fy - ry * fx;
+
+    // ---- Per-pixel ray direction ---------------------------------------
+    // NDC with v=1 at the TOP. With our PBO layout (y=0 written first ->
+    // ends up at the bottom of the on-screen texture courtesy of the GL
+    // shader's [0, 1] texcoord), v = 2*v_norm - 1 already maps row 0 of the
+    // PBO to v = -1 (bottom of the image), which matches the camera's
+    // up'-axis convention. No flip needed.
+    float u_ndc = 2.0f * ((float)x + 0.5f) / (float)width  - 1.0f;
+    float v_ndc = 2.0f * ((float)y + 0.5f) / (float)height - 1.0f;
+
+    const float FOV_Y    = 0.7853981633974483f;        // 45 degrees (reference only; unused below)
+    const float TAN_HALF = 0.41421356237309515f;       // tanf(FOV_Y / 2)
+    float aspect = (float)width / (float)height;
+
+    float dx = fx + u_ndc * aspect * TAN_HALF * rx + v_ndc * TAN_HALF * ux;
+    float dy = fy + u_ndc * aspect * TAN_HALF * ry + v_ndc * TAN_HALF * uy;
+    float dz = fz + u_ndc * aspect * TAN_HALF * rz + v_ndc * TAN_HALF * uz;
+    float dl = length3(dx, dy, dz);
+    if (dl < 1e-6f) dl = 1e-6f;
+    dx /= dl; dy /= dl; dz /= dl;
+
+    // ---- Ray vs. the [-1, 1]^3 box (slab method) -----------------------
+    // The camera usually sits outside the volume, so we must first advance
+    // `t` to the AABB entry before any in-volume sampling is meaningful.
+    // (Not always: the host clamps dist to >= 1.2, and near a box diagonal
+    // an eye at that radius has every component inside [-1, 1].) tNear is
+    // the entry distance; it is clamped to >= 0 where it is used so we
+    // don't march backwards when the eye is inside the box. tFar is the
+    // exit distance. If the slab interval is empty (tNear > tFar), the ray
+    // misses outright.
+    // Near-zero direction components are replaced by +/-1e-8 so the
+    // reciprocals below stay finite.
+    float inv_dx = 1.0f / (fabsf(dx) > 1e-8f ? dx : (dx >= 0 ? 1e-8f : -1e-8f));
+    float inv_dy = 1.0f / (fabsf(dy) > 1e-8f ? dy : (dy >= 0 ? 1e-8f : -1e-8f));
+    float inv_dz = 1.0f / (fabsf(dz) > 1e-8f ? dz : (dz >= 0 ? 1e-8f : -1e-8f));
+    float t1x = (-1.0f - ex) * inv_dx, t2x = ( 1.0f - ex) * inv_dx;
+    float t1y = (-1.0f - ey) * inv_dy, t2y = ( 1.0f - ey) * inv_dy;
+    float t1z = (-1.0f - ez) * inv_dz, t2z = ( 1.0f - ez) * inv_dz;
+    float tNear = fmaxf(fmaxf(fminf(t1x, t2x), fminf(t1y, t2y)), fminf(t1z, t2z));
+    float tFar  = fminf(fminf(fmaxf(t1x, t2x), fmaxf(t1y, t2y)), fmaxf(t1z, t2z));
+
+    bool  hit = false;
+    float hx = 0.0f, hy = 0.0f, hz = 0.0f;
+
+    if (tFar > fmaxf(tNear, 0.0f)) {
+        // ---- Fixed-step march through the SDF volume from the AABB entry
+        // Sphere tracing relies on a Lipschitz-1 SDF: the magnitude of the
+        // sample tells you a safe distance you can step without crossing
+        // the surface. But the gyroid SDF here, |sx*cy + sy*cz + sz*cx|
+        // - 0.20, has a gradient scaling with TAU ~= 19, so the stored
+        // magnitude vastly over-reports the true distance. Sphere tracing
+        // would routinely overshoot thin slab regions, leaving most rays
+        // missing geometry that's actually there. A fixed-step march is
+        // cheap (the SDF is just a tex3D lookup) and robust: each step
+        // advances by half a voxel, so the `s < HIT_EPS` test gets two
+        // samples per voxel along the ray.
+        //
+        // STEP = 1/128 ~= 0.0078 world units. The volume spans 2 world
+        // units over 128 voxels, so one voxel is 1/64 and STEP is half of
+        // that. MAX_STEPS * STEP = 2.0 world units of travel.
+        // NOTE(review): the longest chord through the box is its diagonal,
+        // 2*sqrt(3) ~= 3.46 world units, so a ray that stays inside the box
+        // for more than 2.0 units stops at MAX_STEPS before reaching
+        // t_exit and is shaded as a miss -- confirm this cap is intended.
+        const int   MAX_STEPS = 256;
+        const float STEP      = 1.0f / 128.0f;
+        const float HIT_EPS   = 1.0e-3f;
+        // Bias slightly inside the box so the very first sample isn't on
+        // the boundary (CLAMP addressing makes the boundary sample valid,
+        // but starting just inside avoids one wasted iteration).
+        float t = fmaxf(tNear, 0.0f) + 1e-4f;
+        float t_exit = tFar;
+
+        #pragma unroll 1
+        for (int i = 0; i < MAX_STEPS; ++i) {
+            float pxw = ex + t * dx;
+            float pyw = ey + t * dy;
+            float pzw = ez + t * dz;
+
+            float s = sample_sdf(tex, pxw, pyw, pzw);
+            if (s < HIT_EPS) {
+                hit = true;
+                hx = pxw; hy = pyw; hz = pzw;
+                break;
+            }
+            t += STEP;
+            if (t > t_exit) break;
+        }
+    }
+
+    // ---- Shade -----------------------------------------------------------
+    float r, g, b;
+    if (hit) {
+        // Central-difference normal in world space. NEPS = 1.5/128 world
+        // units is 0.75 voxel, so each +/- pair spans 1.5 voxels: short
+        // enough to capture local geometry, long enough that trilinear
+        // filtering meaningfully moves the result.
+        const float NEPS = 1.5f / 128.0f;
+        float nx = sample_sdf(tex, hx + NEPS, hy, hz) -
+                   sample_sdf(tex, hx - NEPS, hy, hz);
+        float ny = sample_sdf(tex, hx, hy + NEPS, hz) -
+                   sample_sdf(tex, hx, hy - NEPS, hz);
+        float nz = sample_sdf(tex, hx, hy, hz + NEPS) -
+                   sample_sdf(tex, hx, hy, hz - NEPS);
+        float nl = length3(nx, ny, nz);
+        if (nl < 1e-6f) nl = 1e-6f;
+        nx /= nl; ny /= nl; nz /= nl;
+
+        // Fixed key light (normalized world direction).
+        const float LX = 0.5773502691896258f;          // (1,1,-1)/sqrt(3)
+        const float LY = 0.5773502691896258f;
+        const float LZ = -0.5773502691896258f;
+        float diff = fmaxf(0.0f, dot3(nx, ny, nz, LX, LY, LZ));
+
+        // Specular: Blinn-Phong half-vector exponent. View dir = -ray dir.
+        float vx = -dx, vy = -dy, vz = -dz;
+        float hx2 = LX + vx, hy2 = LY + vy, hz2 = LZ + vz;
+        float hl  = length3(hx2, hy2, hz2);
+        if (hl < 1e-6f) hl = 1e-6f;
+        hx2 /= hl; hy2 /= hl; hz2 /= hl;
+        float ndoth = fmaxf(0.0f, dot3(nx, ny, nz, hx2, hy2, hz2));
+        float spec = powf(ndoth, 32.0f);
+
+        // Base albedo varies with the surface normal so the gyroid lattice
+        // reads as a single material with smooth variation, not flat plastic.
+        float base_r = 0.55f + 0.30f * nx;
+        float base_g = 0.50f + 0.30f * ny;
+        float base_b = 0.70f + 0.30f * nz;
+
+        const float AMBIENT = 0.18f;
+        r = base_r * (AMBIENT + 0.82f * diff) + 0.6f * spec;
+        g = base_g * (AMBIENT + 0.82f * diff) + 0.6f * spec;
+        b = base_b * (AMBIENT + 0.82f * diff) + 0.7f * spec;
+    } else {
+        // Sky: dark blue at the top, near-black at the bottom. The PBO's row
+        // 0 is the bottom of the on-screen image (see the v_ndc comment),
+        // so we use the y coordinate of the ray direction (close to v_ndc
+        // in screen space) for the gradient.
+        float sky = 0.5f * (dy + 1.0f);                // [0, 1] roughly
+        sky = clampf(sky, 0.0f, 1.0f);
+        r = 0.02f + 0.06f * sky;
+        g = 0.03f + 0.10f * sky;
+        b = 0.05f + 0.20f * sky;
+    }
+
+    // Clamp to [0, 1] before the 8-bit conversion below.
+    r = clampf(r, 0.0f, 1.0f);
+    g = clampf(g, 0.0f, 1.0f);
+    b = clampf(b, 0.0f, 1.0f);
+
+    // Row-major RGBA8: 4 bytes per pixel, alpha fixed at fully opaque.
+    int idx = (y * width + x) * 4;
+    output[idx + 0] = (unsigned char)(r * 255.0f);
+    output[idx + 1] = (unsigned char)(g * 255.0f);
+    output[idx + 2] = (unsigned char)(b * 255.0f);
+    output[idx + 3] = 255;
+}
+"""
+
+# GLSL shaders -- these just display a texture on a fullscreen rectangle.
+# Nothing CUDA-specific here.
+
+# Pass-through vertex shader: emits the quad's position unchanged and hands
+# the texture coordinate on to the fragment stage.
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+# Fragment shader: colors each fragment with the texel sampled from `tex` at
+# the texture coordinate received from the vertex stage.
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+# Run the demo only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/gl_interop_texture_filter.py b/cuda_core/examples/gl_interop_texture_filter.py
new file mode 100644
index 00000000000..27c8bcb99fa
--- /dev/null
+++ b/cuda_core/examples/gl_interop_texture_filter.py
@@ -0,0 +1,625 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates cuda.core.TextureObject hardware filtering by
+# comparing FilterMode.POINT and FilterMode.LINEAR side by side on the same
+# source CUDAArray. Requires pyglet.
+#
+# ################################################################################
+
+# What this example teaches
+# =========================
+# How to back two TextureObjects with the SAME CUDAArray and observe the
+# difference between POINT (nearest-texel) and LINEAR (bilinear) filtering
+# under user-controlled zoom and pan.  Also shows how the address mode
+# (WRAP / CLAMP / MIRROR / BORDER) is baked into the texture descriptor at
+# creation time, so changing it at runtime means rebuilding the textures.
+#
+# How it works
+# ============
+# A single 256x256 RGBA8 CUDAArray holds a procedurally-generated test pattern
+# (high-contrast checkerboard, diagonals, gradient stripe).  Two
+# TextureObjects are built on top of that CUDAArray:
+#
+#       CUDAArray (256x256 RGBA UINT8)
+#       /                       \
+#   tex_point                  tex_linear
+#   FilterMode.POINT           FilterMode.LINEAR
+#   AddressMode.WRAP           AddressMode.WRAP
+#   ReadMode.NORMALIZED_FLOAT  ReadMode.NORMALIZED_FLOAT
+#
+# Each frame, a single CUDA kernel runs over a 1024x512 OpenGL PBO:
+#
+#   - Left half of the screen samples tex_point.
+#   - Right half samples tex_linear.
+#   - Both halves use the same (zoom, pan) -> texture-space mapping, so the
+#     two views show the same content with different filtering.
+#   - A 2-pixel vertical white line marks the divider.
+#
+# Because ReadMode.NORMALIZED_FLOAT is used, tex2D<float4>() returns each
+# channel as a float in [0, 1]; the kernel multiplies by 255 and writes
+# unsigned bytes back into the PBO.
+#
+# The PBO is then copied to a GL texture and drawn on a fullscreen quad,
+# identical to the plasma example.
+#
+# What you should see
+# ===================
+# A 1024x512 window split down the middle.  The left half (POINT) shows
+# blocky / pixelated magnification; the right half (LINEAR) shows smooth
+# bilinear interpolation.  Drag with the left mouse button to pan,
+# scroll to zoom, press M to cycle the texture address mode, press R to
+# reset, Escape or close the window to exit.  The current address mode
+# and FPS are shown in the window title.
+#
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core>0.6.0", "pyglet"]
+# ///
+
+import ctypes
+import sys
+import time
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    GraphicsResource,
+    LaunchConfig,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# ---------------------------------------------------------------------------
+# Window and source-image dimensions (feel free to change these)
+# ---------------------------------------------------------------------------
+WIDTH = 1024  # window and PBO width in pixels; each filter gets half of it
+HEIGHT = 512  # window and PBO height in pixels
+SRC_W = 256  # source CUDAArray width in texels
+SRC_H = 256  # source CUDAArray height in texels
+
+# Address modes cycled by pressing the M key, in this order (wraps around).
+ADDRESS_MODES = (
+    AddressMode.WRAP,
+    AddressMode.CLAMP,
+    AddressMode.MIRROR,
+    AddressMode.BORDER,
+)
+
+
+# ============================= Helper functions =============================
+#
+# The functions below set up CUDA and OpenGL.  If you're here to learn about
+# TextureObject filtering, the most interesting parts are in main() and in
+# make_pattern() / make_textures(); everything else is the same kind of
+# CUDA-GL interop boilerplate used by gl_interop_plasma.py.
+# ============================================================================
+
+
+def make_pattern(width, height):
+    """Return a (height, width, 4) uint8 RGBA image that exposes filtering.
+
+    ``width`` and ``height`` are in pixels; the result is C-contiguous.
+
+    Features are painted in this order, so later ones overwrite earlier ones:
+      - Opaque black/white checkerboard with 8x8-pixel cells (high-frequency)
+      - Two 1px red diagonals (x == y and x == width - 1 - y)
+      - Horizontal strip starting at y = height // 4, blue -> green left to right
+      - Two thin solid bars (yellow, cyan) near y = 3 * height // 4
+    """
+    canvas = np.zeros((height, width, 4), dtype=np.uint8)
+
+    # Checkerboard: the parity of the summed 8-pixel cell indices picks the
+    # color (0 -> black, 1 -> white); alpha is fully opaque everywhere.
+    rows = np.arange(height)[:, None]
+    cols = np.arange(width)[None, :]
+    parity = ((cols // 8) + (rows // 8)) & 1
+    canvas[..., :3] = (parity.astype(np.uint8) * 255)[..., None]
+    canvas[..., 3] = 255
+
+    # Red diagonals: main diagonal and anti-diagonal.
+    on_diagonal = (cols == rows) | (cols == (width - 1 - rows))
+    canvas[on_diagonal] = (255, 0, 0, 255)
+
+    # Gradient strip, max(4, height // 32) rows tall and clipped to the image:
+    # green ramps up while blue ramps down across the width.
+    top = height // 4
+    bottom = min(top + max(4, height // 32), height)
+    ramp = np.linspace(0, 255, width, dtype=np.uint8)
+    canvas[top:bottom, :, 0] = 0
+    canvas[top:bottom, :, 1] = ramp
+    canvas[top:bottom, :, 2] = 255 - ramp
+    canvas[top:bottom, :, 3] = 255
+
+    # Two "text-like" bars, each 4 rows tall and a quarter of the width long.
+    bar_top = (3 * height) // 4
+    bars = (
+        (bar_top, width // 8, (width * 3) // 8, (255, 255, 0, 255)),
+        (bar_top + 8, (width * 5) // 8, (width * 7) // 8, (0, 255, 255, 255)),
+    )
+    for y0, x0, x1, rgba in bars:
+        canvas[y0 : y0 + 4, x0:x1] = rgba
+
+    return np.ascontiguousarray(canvas)
+
+
+def make_textures(array, address_mode):
+    """Create the POINT and LINEAR TextureObjects that share ``array``.
+
+    Both textures use ``address_mode``, ReadMode.NORMALIZED_FLOAT and
+    non-normalized coordinates; only the filter mode differs.  The address
+    mode is fixed when a texture object is created, so main() calls this again
+    each time the user cycles the mode.  Returns (tex_point, tex_linear); the
+    caller owns both objects and must close() them.
+    """
+    source = ResourceDescriptor.from_array(array)
+
+    def build(filter_mode):
+        # One descriptor per texture; everything but the filter is shared.
+        descriptor = TextureDescriptor(
+            address_mode=address_mode,
+            filter_mode=filter_mode,
+            read_mode=ReadMode.NORMALIZED_FLOAT,
+            normalized_coords=False,
+        )
+        return TextureObject.from_descriptor(resource=source, texture_descriptor=descriptor)
+
+    tex_point = build(FilterMode.POINT)
+    tex_linear = build(FilterMode.LINEAR)
+    return tex_point, tex_linear
+
+
+def setup_cuda(kernel_source):
+    """Compile the CUDA kernel and return (device, stream, kernel, launch_config)."""
+    device = Device(0)
+    device.set_current()
+    stream = device.create_stream()
+
+    # Built as C++ so the templated tex2D<float4> overload resolves.
+    options = ProgramOptions(std="c++17", arch=f"sm_{device.arch}")
+    program = Program(kernel_source, code_type="c++", options=options)
+    module = program.compile("cubin", name_expressions=("split_screen_sample",))
+    kernel = module.get_kernel("split_screen_sample")
+
+    # 16x16 threads per block; the grid is rounded up so that every pixel of
+    # the WIDTH x HEIGHT frame is covered by a thread.
+    block_x, block_y = 16, 16
+    grid_x = (WIDTH + block_x - 1) // block_x
+    grid_y = (HEIGHT + block_y - 1) // block_y
+    launch_grid = (grid_x, grid_y, 1)
+    config = LaunchConfig(grid=launch_grid, block=(block_x, block_y, 1))
+    return device, stream, kernel, config
+
+
+def create_window():
+    """Open a pyglet window and return (window, gl_module, pyglet)."""
+    # pyglet is imported lazily so a missing install gives a hint, not a traceback.
+    try:
+        import pyglet
+        from pyglet.gl import gl as _gl
+    except ImportError:
+        hint = "This example requires pyglet >= 2.0.\nInstall it with:  pip install pyglet"
+        print(hint, file=sys.stderr)
+        sys.exit(1)
+
+    title = "TextureObject Filter Comparison - POINT vs LINEAR"
+    window = pyglet.window.Window(
+        WIDTH,
+        HEIGHT,
+        caption=title,
+        vsync=False,
+    )
+    return window, _gl, pyglet
+
+
+def create_display_resources(gl, width, height):
+    """Create the GL objects needed to show a texture on screen.
+
+    Compiles the display shaders and builds a fullscreen-quad VAO plus an empty
+    width x height RGBA8 texture, with the same structure as the plasma
+    example.  Returns (shader_program, vao_id, tex_id).
+    """
+    from pyglet.graphics.shader import Shader, ShaderProgram
+
+    vert = Shader(VERTEX_SHADER_SOURCE, "vertex")
+    frag = Shader(FRAGMENT_SHADER_SOURCE, "fragment")
+    shader_prog = ShaderProgram(vert, frag)
+
+    # Fullscreen quad (two triangles).  Each vertex: x, y, s, t.
+    quad_verts = np.array(
+        [
+            -1,  # triangle 1, bottom-left: x
+            -1,  # y
+            0,  # s
+            0,  # t
+            1,  # triangle 1, bottom-right
+            -1,
+            1,
+            0,
+            1,  # triangle 1, top-right
+            1,
+            1,
+            1,
+            -1,  # triangle 2, bottom-left
+            -1,
+            0,
+            0,
+            1,  # triangle 2, top-right
+            1,
+            1,
+            1,
+            -1,  # triangle 2, top-left
+            1,
+            0,
+            1,
+        ],
+        dtype=np.float32,
+    )
+
+    vao = ctypes.c_uint(0)
+    gl.glGenVertexArrays(1, ctypes.byref(vao))
+    gl.glBindVertexArray(vao.value)
+
+    vbo = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(vbo))
+    gl.glBindBuffer(gl.GL_ARRAY_BUFFER, vbo.value)
+    gl.glBufferData(
+        gl.GL_ARRAY_BUFFER,
+        quad_verts.nbytes,
+        quad_verts.ctypes.data_as(ctypes.c_void_p),
+        gl.GL_STATIC_DRAW,
+    )
+
+    stride = 4 * 4  # bytes per vertex: 4 float32 values (x, y, s, t)
+    pos_loc = gl.glGetAttribLocation(shader_prog.id, b"position")
+    gl.glEnableVertexAttribArray(pos_loc)
+    gl.glVertexAttribPointer(pos_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(0))
+    tc_loc = gl.glGetAttribLocation(shader_prog.id, b"texcoord")
+    gl.glEnableVertexAttribArray(tc_loc)
+    gl.glVertexAttribPointer(tc_loc, 2, gl.GL_FLOAT, gl.GL_FALSE, stride, ctypes.c_void_p(8))  # s, t follow x, y
+    gl.glBindVertexArray(0)
+
+    # Empty GL texture; filled each frame from the PBO.
+    tex = ctypes.c_uint(0)
+    gl.glGenTextures(1, ctypes.byref(tex))
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex.value)
+    # Nearest filtering here keeps GL's sampler from muddying the POINT/LINEAR comparison.
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_NEAREST)
+    gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_NEAREST)
+    gl.glTexImage2D(
+        gl.GL_TEXTURE_2D,
+        0,
+        gl.GL_RGBA8,
+        width,
+        height,
+        0,
+        gl.GL_RGBA,
+        gl.GL_UNSIGNED_BYTE,
+        None,  # no initial pixel data
+    )
+    return shader_prog, vao.value, tex.value
+
+
+def create_pixel_buffer(gl, width, height):
+    """Allocate an uninitialized PBO for one RGBA8 frame; return (pbo_id, nbytes)."""
+    handle = ctypes.c_uint(0)
+    gl.glGenBuffers(1, ctypes.byref(handle))
+    size = 4 * width * height  # 4 bytes per RGBA8 pixel
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, handle.value)
+    gl.glBufferData(gl.GL_PIXEL_UNPACK_BUFFER, size, None, gl.GL_DYNAMIC_DRAW)
+    gl.glBindBuffer(gl.GL_PIXEL_UNPACK_BUFFER, 0)
+    return handle.value, size
+
+
+def copy_pbo_to_texture(gl, pbo_id, tex_id, width, height):
+    """Copy the PBO's pixels into the GL texture (GPU-to-GPU).
+
+    With a buffer bound to GL_PIXEL_UNPACK_BUFFER, the last glTexSubImage2D
+    argument is a byte offset into that buffer, so None means "from offset 0".
+    """
+    unpack = gl.GL_PIXEL_UNPACK_BUFFER
+    target = gl.GL_TEXTURE_2D
+
+    gl.glBindBuffer(unpack, pbo_id)
+    gl.glBindTexture(target, tex_id)
+    # Mip level 0, destination rectangle (0, 0) .. (width, height), source
+    # pixels laid out as RGBA with one unsigned byte per channel.
+    gl.glTexSubImage2D(target, 0, 0, 0, width, height, gl.GL_RGBA, gl.GL_UNSIGNED_BYTE, None)
+    # Unbind the PBO again.
+    gl.glBindBuffer(unpack, 0)
+
+
+def draw_fullscreen_quad(gl, shader_prog, vao_id, tex_id):
+    """Draw texture ``tex_id`` with ``shader_prog`` using the quad stored in ``vao_id``."""
+    gl.glUseProgram(shader_prog.id)
+    gl.glBindTexture(gl.GL_TEXTURE_2D, tex_id)
+    gl.glBindVertexArray(vao_id)
+    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 6)  # 6 vertices = the quad's two triangles
+    gl.glBindVertexArray(0)  # leave no VAO bound
+    gl.glUseProgram(0)  # leave no shader program active
+
+
+# ================================== main() ==================================
+
+
+def main():
+    """Run the interactive POINT vs LINEAR viewer until the window is closed."""
+    # --- Step 1: Set up CUDA (compile kernel, create stream) ---
+    dev, stream, kernel, config = setup_cuda(KERNEL_SOURCE)
+
+    # Hardware texturing exists on essentially every modern GPU (compute
+    # capability 3.x and up); check anyway so the failure is friendly.
+    if dev.compute_capability.major < 3:
+        print(
+            f"This example requires compute capability >= 3.0, "
+            f"got {dev.compute_capability.major}.{dev.compute_capability.minor}.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    # --- Step 2: Open a window ---
+    window, gl, pyglet = create_window()
+
+    # --- Step 3: Create GL resources (shader, quad, display texture) ---
+    shader_prog, quad_vao, tex_id = create_display_resources(gl, WIDTH, HEIGHT)
+
+    # --- Step 4: Create the Pixel Buffer Object (PBO) ---
+    pbo_id, _nbytes = create_pixel_buffer(gl, WIDTH, HEIGHT)
+
+    # --- Step 5: Register the PBO with CUDA ---
+    resource = GraphicsResource.from_gl_buffer(pbo_id, flags="write_discard")
+
+    # --- Step 6: Allocate the source CUDAArray and upload the test pattern ---
+    #     The CUDAArray lives for the entire program, so we use a `with` block.
+    #     Inside it we create / re-create two TextureObjects whenever the
+    #     user cycles the address mode.
+    with CUDAArray.from_descriptor(
+        shape=(SRC_W, SRC_H),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+    ) as arr:
+        pattern = make_pattern(SRC_W, SRC_H)
+        # Sanity: 256 * 256 * 4 bytes = 262144.
+        assert pattern.nbytes == arr.size_bytes, f"pattern bytes ({pattern.nbytes}) != array bytes ({arr.size_bytes})"
+        arr.copy_from(pattern, stream=stream)
+        stream.sync()  # upload must finish before kernel reads
+
+        # --- Step 7: Build initial POINT + LINEAR textures (WRAP mode). ---
+        # We can't use a `with` block here because the address mode is baked
+        # into the descriptor at creation time: cycling modes means closing
+        # and recreating these objects.  We instead hold them in mutable
+        # closure state and release them in on_close().
+        tex_state = {
+            "mode_idx": 0,
+            "tex_point": None,
+            "tex_linear": None,
+        }
+
+        def rebuild_textures():
+            # Close previous textures (if any) before creating new ones so we
+            # don't leak handles when cycling the address mode.
+            if tex_state["tex_point"] is not None:
+                tex_state["tex_point"].close()
+            if tex_state["tex_linear"] is not None:
+                tex_state["tex_linear"].close()
+            mode = ADDRESS_MODES[tex_state["mode_idx"]]
+            tp, tl = make_textures(arr, mode)
+            tex_state["tex_point"] = tp
+            tex_state["tex_linear"] = tl
+
+        rebuild_textures()
+
+        # --- Step 8: View state (zoom + pan), tight initial framing. ---
+        # zoom = pixels_per_texel.  zoom=3 -> roughly 3x magnification, which
+        # makes POINT vs LINEAR obvious without any user input.
+        view = {
+            "zoom": 3.0,
+            "pan_x": SRC_W * 0.5,
+            "pan_y": SRC_H * 0.5,
+            "drag": False,
+        }
+
+        def reset_view():
+            view["zoom"] = 3.0
+            view["pan_x"] = SRC_W * 0.5
+            view["pan_y"] = SRC_H * 0.5
+
+        # --- Step 9: Render loop ---
+        start_time = time.monotonic()
+        frame_count = 0
+        fps_time = start_time
+
+        def current_mode_name():
+            return ADDRESS_MODES[tex_state["mode_idx"]].name
+
+        @window.event
+        def on_draw():
+            nonlocal frame_count, fps_time
+            window.clear()
+
+            # (a) Map the PBO so CUDA can write to it.
+            with resource.map(stream=stream) as buf:
+                # (b) Launch the split-screen sampling kernel.
+                launch(
+                    stream,
+                    config,
+                    kernel,
+                    np.uint64(tex_state["tex_point"].handle),
+                    np.uint64(tex_state["tex_linear"].handle),
+                    buf.handle,
+                    np.int32(WIDTH),
+                    np.int32(HEIGHT),
+                    np.float32(view["zoom"]),
+                    np.float32(view["pan_x"]),
+                    np.float32(view["pan_y"]),
+                    np.int32(SRC_W),
+                    np.int32(SRC_H),
+                )
+            # (c) Unmap happens automatically when the `with` block exits.
+
+            # (d) PBO -> GL texture (GPU-to-GPU).
+            copy_pbo_to_texture(gl, pbo_id, tex_id, WIDTH, HEIGHT)
+
+            # (e) Draw the texture to the screen.
+            draw_fullscreen_quad(gl, shader_prog, quad_vao, tex_id)
+
+            frame_count += 1
+            now = time.monotonic()
+            if now - fps_time >= 1.0:
+                fps = frame_count / (now - fps_time)
+                window.set_caption(
+                    f"TextureObject Filter - POINT | LINEAR  "
+                    f"[address={current_mode_name()}, zoom={view['zoom']:.2f}x, "
+                    f"{fps:.0f} FPS]"
+                )
+                frame_count = 0
+                fps_time = now
+
+        # --- Mouse: drag to pan, scroll to zoom ------------------------------
+        @window.event
+        def on_mouse_press(_x, _y, button, _modifiers):
+            if button == pyglet.window.mouse.LEFT:
+                view["drag"] = True
+
+        @window.event
+        def on_mouse_release(_x, _y, button, _modifiers):
+            if button == pyglet.window.mouse.LEFT:
+                view["drag"] = False
+
+        @window.event
+        def on_mouse_drag(_x, _y, dx, dy, buttons, _modifiers):
+            if not (buttons & pyglet.window.mouse.LEFT):
+                return
+            # One screen pixel = 1/zoom texels.  PBO row 0 is drawn at the bottom
+            # of the window, so src_y grows upward just like pyglet's dy.
+            view["pan_x"] -= dx / view["zoom"]
+            view["pan_y"] -= dy / view["zoom"]
+
+        @window.event
+        def on_mouse_scroll(_x, _y, _scroll_x, scroll_y):
+            # Geometric zoom; clamp to a sensible range.
+            factor = 1.1**scroll_y
+            new_zoom = view["zoom"] * factor
+            view["zoom"] = max(0.1, min(32.0, new_zoom))
+
+        # --- Keyboard: M cycles address mode, R resets view ------------------
+        @window.event
+        def on_key_press(symbol, _modifiers):
+            key = pyglet.window.key
+            if symbol == key.M:
+                tex_state["mode_idx"] = (tex_state["mode_idx"] + 1) % len(ADDRESS_MODES)
+                rebuild_textures()
+            elif symbol == key.R:
+                reset_view()
+            elif symbol == key.ESCAPE:
+                window.close()
+
+        @window.event
+        def on_close():
+            # Release CUDA resources in reverse order of creation.
+            if tex_state["tex_linear"] is not None:
+                tex_state["tex_linear"].close()
+                tex_state["tex_linear"] = None
+            if tex_state["tex_point"] is not None:
+                tex_state["tex_point"].close()
+                tex_state["tex_point"] = None
+            resource.close()
+
+        pyglet.app.run(interval=0)
+
+
+# ======================== GPU code (CUDA + GLSL) ============================
+#
+# KERNEL_SOURCE samples the same source CUDAArray through two TextureObjects
+# (POINT vs LINEAR) and writes RGBA8 pixels into the PBO.  ReadMode.
+# NORMALIZED_FLOAT means tex2D<float4>() returns each channel in [0, 1];
+# the kernel scales by 255 and writes unsigned bytes back out.
+#
+# VERTEX_SHADER_SOURCE / FRAGMENT_SHADER_SOURCE are plain GLSL that draws
+# a texture on a fullscreen quad -- nothing CUDA-specific.
+# ============================================================================
+
+KERNEL_SOURCE = r"""
+extern "C" __global__
+void split_screen_sample(cudaTextureObject_t point_tex,
+                         cudaTextureObject_t linear_tex,
+                         unsigned char* out,
+                         int w, int h,
+                         float zoom,
+                         float pan_x, float pan_y,
+                         int src_w, int src_h) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= w || y >= h) return;
+
+    int half_w = w / 2;
+
+    // 2-pixel-wide white separator down the middle.
+    if (x == half_w || x == half_w - 1) {
+        int idx = (y * w + x) * 4;
+        out[idx + 0] = 255;
+        out[idx + 1] = 255;
+        out[idx + 2] = 255;
+        out[idx + 3] = 255;
+        return;
+    }
+
+    // Each half of the screen samples the same (src_x, src_y) so the two
+    // sides line up visually for an apples-to-apples filter comparison.
+    float local_x = (x < half_w) ? (float)x : (float)(x - half_w);
+
+    // (src_x, src_y) in source-texture pixel coordinates.  Non-normalized
+    // coords are used, so coordinate (i + 0.5, j + 0.5) selects texel (i, j).
+    float src_x = pan_x + (local_x - (float)half_w * 0.5f) / zoom;
+    float src_y = pan_y + ((float)y     - (float)h      * 0.5f) / zoom;
+
+    float4 sample;
+    if (x < half_w) {
+        sample = tex2D<float4>(point_tex,  src_x, src_y);
+    } else {
+        sample = tex2D<float4>(linear_tex, src_x, src_y);
+    }
+
+    int idx = (y * w + x) * 4;
+    out[idx + 0] = (unsigned char)(sample.x * 255.0f);
+    out[idx + 1] = (unsigned char)(sample.y * 255.0f);
+    out[idx + 2] = (unsigned char)(sample.z * 255.0f);
+    out[idx + 3] = (unsigned char)(sample.w * 255.0f);
+}
+"""
+
+VERTEX_SHADER_SOURCE = """#version 330 core
+in vec2 position;
+in vec2 texcoord;
+out vec2 v_texcoord;
+void main() {
+    gl_Position = vec4(position, 0.0, 1.0);
+    v_texcoord = texcoord;
+}
+"""
+
+FRAGMENT_SHADER_SOURCE = """#version 330 core
+in vec2 v_texcoord;
+out vec4 fragColor;
+uniform sampler2D tex;
+void main() {
+    fragColor = texture(tex, v_texcoord);
+}
+"""
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/texture_sample.py b/cuda_core/examples/texture_sample.py
new file mode 100644
index 00000000000..78e9a463b89
--- /dev/null
+++ b/cuda_core/examples/texture_sample.py
@@ -0,0 +1,214 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This example demonstrates building a 2D CUDAArray, binding it as a
+# bindless TextureObject, and sampling it from a kernel with both POINT-exact
+# and LINEAR-interpolated coordinates.
+#
+# Texture coordinate convention (non-normalized): each texel (i, j) is centered
+# at (i + 0.5, j + 0.5). So tex2D(tex, 0.5, 0.5) returns texel (0, 0) exactly,
+# while tex2D(tex, 1.0, 0.5) returns the linear blend of texels (0, 0) and (1, 0).
+# All test coordinates below are chosen with that half-pixel offset in mind.
+#
+# ################################################################################
+
+# /// script
+# dependencies = ["cuda_bindings", "cuda_core", "nvidia-cuda-nvrtc"]
+# ///
+
+import numpy as np
+
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    LaunchConfig,
+    LegacyPinnedMemoryResource,
+    Program,
+    ProgramOptions,
+    ReadMode,
+    ResourceDescriptor,
+    TextureDescriptor,
+    TextureObject,
+    launch,
+)
+
+# Kernel reads N (x, y) coordinates from `coords` (interleaved float pairs) and
+# writes tex2D<float>(tex, x, y) to out[i]. Compiled as C++ so the templated
+# tex2D<float> overload resolves.
+code = r"""
+extern "C" __global__
+void sample_texture(cudaTextureObject_t tex,
+                    float *out,
+                    const float *coords,
+                    int n) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+    float x = coords[2 * i + 0];
+    float y = coords[2 * i + 1];
+    out[i] = tex2D<float>(tex, x, y);
+}
+"""
+
+
+def main():
+    """Sample a 16x16 float32 CUDAArray through a LINEAR texture and verify it."""
+    device = Device()
+    device.set_current()
+    stream = device.create_stream()
+
+    # Pinned host memory backs the kernel's coordinate and result buffers.
+    pinned_mr = LegacyPinnedMemoryResource()
+    try:
+        # CUDAArray.from_descriptor takes shape=(width, height), whereas the
+        # host buffer handed to copy_from is row-major: `height` rows of
+        # `width` elements, i.e. a NumPy shape of (height, width).
+        width, height = 16, 16
+        with CUDAArray.from_descriptor(
+            shape=(width, height),
+            format=ArrayFormat.FLOAT32,
+            num_channels=1,
+        ) as arr:
+            # Known pattern: pattern[y, x] = x + 100*y, computed in float32
+            # so the byte count matches the array's storage.
+            col = np.arange(width, dtype=np.float32)[None, :]
+            row = np.arange(height, dtype=np.float32)[:, None]
+            pattern = (col + 100.0 * row).astype(np.float32)
+            assert pattern.shape == (height, width)
+            arr.copy_from(pattern, stream=stream)
+
+            # Texture setup: LINEAR filtering, CLAMP addressing,
+            # ReadMode.ELEMENT_TYPE reads and non-normalized coordinates.
+            tex_desc = TextureDescriptor(
+                address_mode=AddressMode.CLAMP,
+                filter_mode=FilterMode.LINEAR,
+                read_mode=ReadMode.ELEMENT_TYPE,
+                normalized_coords=False,
+            )
+            res_desc = ResourceDescriptor.from_array(arr)
+            # The inner `with` exits first, so the texture is closed before the array.
+            with TextureObject.from_descriptor(resource=res_desc, texture_descriptor=tex_desc) as tex:
+                _run_kernel_and_verify(device, stream, tex, pattern, width, height, pinned_mr)
+    finally:
+        stream.close()
+
+
+def _run_kernel_and_verify(dev, stream, tex, pattern, width, height, pinned_mr):
+    """Launch the sampling kernel and check every result against ``pattern``.
+
+    Kept separate from main() so the with-blocks there stay readable.  The two
+    pinned buffers allocated here are closed before returning, including when
+    a check fails.
+    """
+    coords_buf = out_buf = None
+    try:
+        # Two groups of test coordinates:
+        # - Texel centers (i + 0.5, j + 0.5) must return the planted value
+        #   exactly.
+        # - Points on a texel boundary (integer x and/or y) fall between texel
+        #   centers, so they exercise LINEAR filtering and must match the
+        #   bilinear blend of the surrounding texels.
+        center_samples = [
+            (0.5, 0.5),  # -> pattern[0, 0] = 0
+            (3.5, 0.5),  # -> pattern[0, 3] = 3
+            (0.5, 4.5),  # -> pattern[4, 0] = 400
+            (7.5, 9.5),  # -> pattern[9, 7] = 907
+            (15.5, 15.5),  # -> pattern[15, 15] = 1515
+        ]
+        half_samples = [
+            # Midway between texels (0, 0) and (1, 0) -> 0.5
+            (1.0, 0.5),
+            # Midway between texels (0, 0) and (0, 1) -> 50.0
+            (0.5, 1.0),
+            # Center of the 2x2 block at (0..1, 0..1) -> 50.5
+            (1.0, 1.0),
+            # Center of the 2x2 block at (3..4, 4..5) -> 453.5
+            (4.0, 5.0),
+        ]
+        n_center = len(center_samples)
+        points = np.array(center_samples + half_samples, dtype=np.float32)
+        n = points.shape[0]
+        flat_points = points.reshape(-1)
+        coords_nbytes = int(flat_points.nbytes)
+        out_nbytes = n * np.dtype(np.float32).itemsize
+
+        # Inputs and outputs live in pinned host memory, which the GPU can
+        # access directly (zero-copy): the kernel reads the coordinates in
+        # place and the results need no separate device->host copy.
+        coords_buf = pinned_mr.allocate(coords_nbytes)
+        out_buf = pinned_mr.allocate(out_nbytes)
+        coords_view = np.from_dlpack(coords_buf).view(dtype=np.float32)
+        out_view = np.from_dlpack(out_buf).view(dtype=np.float32)
+        coords_view[:] = flat_points
+        out_view[:] = 0.0
+
+        # The templated tex2D<float> call requires compiling as C++.
+        options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+        prog = Program(code, code_type="c++", options=options)
+        mod = prog.compile("cubin", name_expressions=("sample_texture",))
+        kernel = mod.get_kernel("sample_texture")
+
+        # One thread per sample, rounded up to whole blocks of 64.
+        threads = 64
+        config = LaunchConfig(grid=(n + threads - 1) // threads, block=threads)
+        # cudaTextureObject_t is a 64-bit handle; passing it as uint64 makes
+        # the argument width explicit.
+        launch(
+            stream,
+            config,
+            kernel,
+            np.uint64(tex.handle),
+            out_buf,
+            coords_buf,
+            np.int32(n),
+        )
+        stream.sync()
+        results = np.asarray(out_view)
+
+        # Texel centers: exact regardless of filter mode.
+        for i, (x, y) in enumerate(center_samples):
+            expected = (x - 0.5) + 100.0 * (y - 0.5)
+            got = float(results[i])
+            assert np.isclose(got, expected, atol=1e-4), (
+                f"center sample {i} at ({x}, {y}): expected {expected}, got {got}"
+            )
+
+        # Boundary samples: compare with a host-side bilinear blend.  The
+        # looser tolerance allows for the 1/256 fixed-point weight
+        # quantization that hardware filtering performs.
+        for j, (x, y) in enumerate(half_samples):
+            # Shift by half a texel so integer values fall on texel centers,
+            # then split into the lower texel index and the fractional blend
+            # weight along each axis.
+            u, v = x - 0.5, y - 0.5
+            xi, yi = int(np.floor(u)), int(np.floor(v))
+            tx, ty = u - xi, v - yi
+            # Neighbor indices, clamped to the array bounds.
+            x0, x1 = (min(max(c, 0), width - 1) for c in (xi, xi + 1))
+            y0, y1 = (min(max(c, 0), height - 1) for c in (yi, yi + 1))
+            v00, v10 = pattern[y0, x0], pattern[y0, x1]
+            v01, v11 = pattern[y1, x0], pattern[y1, x1]
+            expected = (1 - tx) * (1 - ty) * v00 + tx * (1 - ty) * v10 + (1 - tx) * ty * v01 + tx * ty * v11
+            got = float(results[n_center + j])
+            assert np.isclose(got, expected, atol=1e-2), (
+                f"half sample {j} at ({x}, {y}): expected {expected}, got {got}"
+            )
+
+        print("Texture sampling example completed successfully.")
+        print(f"  texel-center samples verified: {n_center}")
+        print(f"  half-integer samples verified: {len(half_samples)}")
+    finally:
+        # Release whichever pinned buffers were actually allocated.
+        if coords_buf is not None:
+            coords_buf.close()
+        if out_buf is not None:
+            out_buf.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py
index 43fab4241db..75910b87894 100644
--- a/cuda_core/tests/example_tests/test_basic_examples.py
+++ b/cuda_core/tests/example_tests/test_basic_examples.py
@@ -83,6 +83,22 @@ def has_recent_memory_pool_support() -> bool:
 SYSTEM_REQUIREMENTS = {
     "memory_pool_resources.py": has_recent_memory_pool_support,
     "gl_interop_plasma.py": has_display,
+    "gl_interop_bloom.py": has_display,
+    "gl_interop_caustics.py": has_display,
+    "gl_interop_clouds.py": has_display,
+    "gl_interop_fire.py": has_display,
+    "gl_interop_fluid.py": has_display,
+    "gl_interop_image_show.py": has_display,
+    "gl_interop_jfa_voronoi.py": has_display,
+    "gl_interop_lenia.py": has_display,
+    "gl_interop_mandelbrot.py": has_display,
+    "gl_interop_mipmap_lod.py": has_display,
+    "gl_interop_ocean.py": has_display,
+    "gl_interop_particles.py": has_display,
+    "gl_interop_physarum.py": has_display,
+    "gl_interop_reaction_diffusion.py": has_display,
+    "gl_interop_sdf_volume.py": has_display,
+    "gl_interop_texture_filter.py": has_display,
     "jit_lto_fractal.py": _can_load_generated_ptx,
     "pytorch_example.py": lambda: (
         has_compute_capability_9_or_higher() and is_x86_64()
diff --git a/cuda_core/tests/test_texture_surface.py b/cuda_core/tests/test_texture_surface.py
new file mode 100644
index 00000000000..d111a477232
--- /dev/null
+++ b/cuda_core/tests/test_texture_surface.py
@@ -0,0 +1,886 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import gc
+
+import pytest
+
+import cuda.core
+from cuda.core import (
+    AddressMode,
+    ArrayFormat,
+    CUDAArray,
+    Device,
+    FilterMode,
+    MipmappedArray,
+    ReadMode,
+    ResourceDescriptor,
+    SurfaceObject,
+    TextureDescriptor,
+    TextureObject,
+)
+
+
+def test_array_init_disabled():
+    with pytest.raises(RuntimeError, match=r"^CUDAArray cannot be instantiated directly"):
+        cuda.core._array.CUDAArray()
+
+
+def test_texture_object_init_disabled():
+    with pytest.raises(RuntimeError, match=r"^TextureObject cannot be instantiated directly"):
+        cuda.core._texture.TextureObject()
+
+
+def test_surface_object_init_disabled():
+    with pytest.raises(RuntimeError, match=r"^SurfaceObject cannot be instantiated directly"):
+        cuda.core._surface.SurfaceObject()
+
+
+def test_resource_descriptor_init_disabled():
+    with pytest.raises(RuntimeError, match=r"^ResourceDescriptor cannot be instantiated"):
+        ResourceDescriptor()
+
+
+def test_array_2d_create_and_properties(init_cuda):
+    arr = CUDAArray.from_descriptor(shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        assert arr.shape == (32, 16)
+        assert arr.format == ArrayFormat.FLOAT32
+        assert arr.num_channels == 1
+        assert arr.element_size == 4
+        assert arr.size_bytes == 32 * 16 * 4
+        assert arr.is_surface_load_store is False
+        assert arr.handle != 0
+        assert isinstance(arr.device, Device)
+    finally:
+        arr.close()
+
+
+def test_array_3d_with_surface_flag(init_cuda):
+    arr = CUDAArray.from_descriptor(
+        shape=(8, 8, 4),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+        is_surface_load_store=True,
+    )
+    try:
+        assert arr.shape == (8, 8, 4)
+        assert arr.is_surface_load_store is True
+        assert arr.element_size == 4
+    finally:
+        arr.close()
+
+
+def test_array_rejects_bad_channels(init_cuda):
+    with pytest.raises(ValueError, match="num_channels"):
+        CUDAArray.from_descriptor(shape=(8,), format=ArrayFormat.UINT8, num_channels=3)
+
+
+def test_array_rejects_bad_rank(init_cuda):
+    with pytest.raises(ValueError, match="shape rank"):
+        CUDAArray.from_descriptor(shape=(2, 2, 2, 2), format=ArrayFormat.UINT8, num_channels=1)
+
+
+def test_array_roundtrip_copy(init_cuda):
+    import array as _array
+
+    expected = list(range(16))
+    src, dst = _array.array("I", expected), _array.array("I", [0] * 16)
+    device = Device()
+    stream = device.create_stream()
+    arr = CUDAArray.from_descriptor(shape=(16,), format=ArrayFormat.UINT32, num_channels=1)
+    try:
+        arr.copy_from(src, stream=stream)
+        arr.copy_to(dst, stream=stream)
+        stream.sync()
+        # After the sync the destination mirrors the payload; the source must be untouched.
+        assert dst.tolist() == expected
+        assert src.tolist() == expected
+    finally:
+        arr.close()
+        stream.close()
+
+
+def test_array_copy_rejects_undersized_host_buffer(init_cuda):
+    import array as _array
+
+    device = Device()
+    stream = device.create_stream()
+    arr = CUDAArray.from_descriptor(shape=(16,), format=ArrayFormat.UINT32, num_channels=1)
+    try:
+        # arr is 16 * 4 = 64 bytes; pass an 8-element (32-byte) host buffer.
+        too_small = _array.array("I", [0] * 8)
+        with pytest.raises(ValueError, match="smaller than the array extent"):
+            arr.copy_from(too_small, stream=stream)
+        with pytest.raises(ValueError, match="smaller than the array extent"):
+            arr.copy_to(too_small, stream=stream)
+    finally:
+        arr.close()
+        stream.close()
+
+
+def test_array_copy_rejects_undersized_device_buffer(init_cuda):
+    device = Device()
+    stream = device.create_stream()
+    arr = CUDAArray.from_descriptor(shape=(16,), format=ArrayFormat.UINT32, num_channels=1)
+    # arr is 16 * 4 = 64 bytes; allocate a 32-byte device buffer with the module's shared helper.
+    small_buf = _alloc_device_buffer(device, 32)
+    try:
+        with pytest.raises(ValueError, match="smaller than the array extent"):
+            arr.copy_from(small_buf, stream=stream)
+        with pytest.raises(ValueError, match="smaller than the array extent"):
+            arr.copy_to(small_buf, stream=stream)
+    finally:
+        small_buf.close()
+        arr.close()
+        stream.close()
+
+
+def test_texture_object_create(init_cuda):
+    arr = CUDAArray.from_descriptor(shape=(32, 16), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        tex_desc = TextureDescriptor(
+            address_mode=AddressMode.CLAMP,
+            filter_mode=FilterMode.LINEAR,
+            read_mode=ReadMode.ELEMENT_TYPE,
+            normalized_coords=True,
+        )
+        tex = TextureObject.from_descriptor(resource=res, texture_descriptor=tex_desc)
+        try:
+            assert tex.handle != 0
+            assert tex.resource is res
+            assert tex.texture_descriptor is tex_desc
+        finally:
+            tex.close()
+    finally:
+        arr.close()
+
+
+def test_surface_object_create(init_cuda):
+    arr = CUDAArray.from_descriptor(
+        shape=(8, 8),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+        is_surface_load_store=True,
+    )
+    try:
+        surf = SurfaceObject.from_array(arr)
+        try:
+            assert surf.handle != 0
+            assert isinstance(surf.resource, ResourceDescriptor)
+        finally:
+            surf.close()
+    finally:
+        arr.close()
+
+
+def test_surface_requires_ldst_flag(init_cuda):
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.UINT8, num_channels=4)
+    try:
+        with pytest.raises(ValueError, match="is_surface_load_store=True"):
+            SurfaceObject.from_array(arr)
+    finally:
+        arr.close()
+
+
+def test_address_mode_normalization(init_cuda):
+    # Direct unit test of the private normalizer: a scalar should expand to a
+    # 3-tuple; a shorter tuple should be padded by repeating the last entry.
+    from cuda.core._texture import _normalize_address_modes
+
+    assert _normalize_address_modes(AddressMode.WRAP) == (
+        AddressMode.WRAP,
+        AddressMode.WRAP,
+        AddressMode.WRAP,
+    )
+    assert _normalize_address_modes((AddressMode.WRAP, AddressMode.CLAMP)) == (
+        AddressMode.WRAP,
+        AddressMode.CLAMP,
+        AddressMode.CLAMP,
+    )
+    assert _normalize_address_modes((AddressMode.WRAP, AddressMode.CLAMP, AddressMode.MIRROR)) == (
+        AddressMode.WRAP,
+        AddressMode.CLAMP,
+        AddressMode.MIRROR,
+    )
+
+    # Smoke test: a 2-entry tuple is also accepted end-to-end.
+    arr = CUDAArray.from_descriptor(shape=(8, 8, 4), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        tex_desc = TextureDescriptor(address_mode=(AddressMode.WRAP, AddressMode.CLAMP))
+        tex = TextureObject.from_descriptor(resource=res, texture_descriptor=tex_desc)
+        try:
+            assert tex.handle != 0
+        finally:
+            tex.close()
+    finally:
+        arr.close()
+
+
+# --- Linear / pitch2D resource descriptors -----------------------------------
+
+
+def _alloc_device_buffer(device, nbytes):
+    """Allocate nbytes from ``device.memory_resource`` using ``device.default_stream``; callers close the result."""
+    return device.memory_resource.allocate(nbytes, stream=device.default_stream)
+
+
+def test_resource_descriptor_from_linear_defaults_size(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 4096)
+    try:
+        res = ResourceDescriptor.from_linear(buf, format=ArrayFormat.FLOAT32, num_channels=1)
+        assert res.kind == "linear"
+        assert res.format == ArrayFormat.FLOAT32
+        assert res.num_channels == 1
+        assert res.source is buf
+        # repr should include the kind/format hint
+        assert "linear" in repr(res)
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_linear_size_override(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 4096)
+    try:
+        res = ResourceDescriptor.from_linear(buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=2048)
+        assert res._size_bytes == 2048
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_linear_rejects_oversize(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 1024)
+    try:
+        with pytest.raises(ValueError, match="exceeds buffer.size"):
+            ResourceDescriptor.from_linear(buf, format=ArrayFormat.UINT8, num_channels=1, size_bytes=2048)
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_linear_rejects_bad_channels(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 1024)
+    try:
+        with pytest.raises(ValueError, match="num_channels"):
+            ResourceDescriptor.from_linear(buf, format=ArrayFormat.UINT8, num_channels=3)
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_linear_rejects_non_buffer():
+    with pytest.raises(TypeError, match="Buffer"):
+        ResourceDescriptor.from_linear(object(), format=ArrayFormat.UINT8, num_channels=1)
+
+
+def test_resource_descriptor_from_linear_rejects_zero_size(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 1024)
+    try:
+        with pytest.raises(ValueError, match="at least one element"):
+            ResourceDescriptor.from_linear(buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=0)
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_linear_rejects_non_multiple(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 1024)
+    try:
+        # UINT32 x 1 channel = 4 bytes/element; 10 bytes is not a multiple.
+        with pytest.raises(ValueError, match="multiple of element size"):
+            ResourceDescriptor.from_linear(buf, format=ArrayFormat.UINT32, num_channels=1, size_bytes=10)
+    finally:
+        buf.close()
+
+
+def test_texture_object_from_linear(init_cuda):
+    """A linear-backed texture should bind even though sampling fields are
+    effectively ignored by the driver."""
+    device = Device()
+    # 1024 float elements
+    buf = _alloc_device_buffer(device, 1024 * 4)
+    try:
+        res = ResourceDescriptor.from_linear(buf, format=ArrayFormat.FLOAT32, num_channels=1)
+        tex = TextureObject.from_descriptor(resource=res, texture_descriptor=TextureDescriptor())
+        try:
+            assert tex.handle != 0
+            assert tex.resource is res
+        finally:
+            tex.close()
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_pitch2d_validates_pitch(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 64 * 1024)
+    try:
+        # element_size = 4 (UINT32 * 1 channel); width=16 -> min_pitch=64
+        with pytest.raises(ValueError, match="pitch_bytes"):
+            ResourceDescriptor.from_pitch2d(
+                buf,
+                format=ArrayFormat.UINT32,
+                num_channels=1,
+                width=16,
+                height=8,
+                pitch_bytes=32,  # < 64 = width*element_size
+            )
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_pitch2d_validates_buffer_size(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 4096)
+    try:
+        with pytest.raises(ValueError, match="exceeds buffer.size"):
+            ResourceDescriptor.from_pitch2d(
+                buf,
+                format=ArrayFormat.UINT8,
+                num_channels=4,
+                width=64,
+                height=128,
+                pitch_bytes=512,  # 512 * 128 = 65536 > 4096
+            )
+    finally:
+        buf.close()
+
+
+def test_texture_object_from_pitch2d(init_cuda):
+    """A pitch2D-backed texture should bind given driver-aligned pitch."""
+    from cuda.bindings import driver
+
+    device = Device()
+    # Query the device's required texture pitch alignment (typically 32-512).
+    err, align = driver.cuDeviceGetAttribute(
+        driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT,
+        device.device_id,
+    )
+    assert int(err) == 0
+    pitch = max(int(align), 256)
+    height = 16
+    buf = _alloc_device_buffer(device, pitch * height)
+    try:
+        res = ResourceDescriptor.from_pitch2d(
+            buf,
+            format=ArrayFormat.UINT8,
+            num_channels=4,
+            width=32,
+            height=height,
+            pitch_bytes=pitch,
+        )
+        assert res.kind == "pitch2d"
+        assert "pitch2d" in repr(res)
+        tex = TextureObject.from_descriptor(resource=res, texture_descriptor=TextureDescriptor())
+        try:
+            assert tex.handle != 0
+        finally:
+            tex.close()
+    finally:
+        buf.close()
+
+
+def test_surface_rejects_linear_and_pitch2d(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 4096)
+    try:
+        res_lin = ResourceDescriptor.from_linear(buf, format=ArrayFormat.UINT32, num_channels=1)
+        with pytest.raises(ValueError, match="array-backed"):
+            SurfaceObject.from_descriptor(resource=res_lin)
+
+        res_p2 = ResourceDescriptor.from_pitch2d(
+            buf,
+            format=ArrayFormat.UINT8,
+            num_channels=4,
+            width=8,
+            height=8,
+            pitch_bytes=64,
+        )
+        with pytest.raises(ValueError, match="array-backed"):
+            SurfaceObject.from_descriptor(resource=res_p2)
+    finally:
+        buf.close()
+
+
+# --- MipmappedArray ----------------------------------------------------------
+
+
+def test_mipmapped_array_init_disabled():
+    with pytest.raises(RuntimeError, match=r"^MipmappedArray cannot be instantiated directly"):
+        cuda.core._mipmapped_array.MipmappedArray()
+
+
+def test_mipmapped_array_from_descriptor_2d(init_cuda):
+    mip = MipmappedArray.from_descriptor(
+        shape=(64, 32),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        num_levels=4,
+    )
+    try:
+        assert mip.shape == (64, 32)
+        assert mip.format == ArrayFormat.FLOAT32
+        assert mip.num_channels == 1
+        assert mip.num_levels == 4
+        assert mip.is_surface_load_store is False
+        assert mip.handle != 0
+        assert isinstance(mip.device, Device)
+    finally:
+        mip.close()
+
+
+def test_mipmapped_array_get_level_zero_matches_shape(init_cuda):
+    shape = (64, 32)
+    mip = MipmappedArray.from_descriptor(
+        shape=shape,
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+        num_levels=4,
+    )
+    try:
+        lvl0 = mip.get_level(0)
+        try:
+            assert isinstance(lvl0, CUDAArray)
+            # Level 0 must match the base shape and rank.
+            assert lvl0.shape == shape
+            assert lvl0.format == ArrayFormat.UINT8
+            assert lvl0.num_channels == 4
+            assert lvl0.handle != 0
+        finally:
+            lvl0.close()
+    finally:
+        mip.close()
+
+
+def test_mipmapped_array_get_level_halves_dims(init_cuda):
+    base_shape, level_count = (64, 32), 4
+    # Per level every extent is halved (never below 1) and the rank is preserved.
+    wanted = [tuple(max(1, extent >> k) for extent in base_shape) for k in range(level_count)]
+    mip = MipmappedArray.from_descriptor(
+        shape=base_shape,
+        format=ArrayFormat.UINT8,
+        num_channels=1,
+        num_levels=level_count,
+    )
+    try:
+        for level, expected in enumerate(wanted):
+            lvl = mip.get_level(level)
+            try:
+                got = lvl.shape
+                assert got == expected, f"level={level}: expected {expected}, got {got}"
+            finally:
+                lvl.close()
+    finally:
+        mip.close()
+
+
+def test_mipmapped_array_get_level_out_of_range(init_cuda):
+    mip = MipmappedArray.from_descriptor(
+        shape=(16, 16),
+        format=ArrayFormat.UINT8,
+        num_channels=1,
+        num_levels=2,
+    )
+    try:
+        with pytest.raises(ValueError, match="num_levels"):
+            mip.get_level(mip.num_levels)
+        with pytest.raises(ValueError, match=">= 0"):
+            mip.get_level(-1)
+    finally:
+        mip.close()
+
+
+def test_mipmapped_array_rejects_zero_levels(init_cuda):
+    with pytest.raises(ValueError, match="num_levels"):
+        MipmappedArray.from_descriptor(
+            shape=(8, 8),
+            format=ArrayFormat.UINT8,
+            num_channels=1,
+            num_levels=0,
+        )
+
+
+def test_resource_descriptor_from_mipmapped_array(init_cuda):
+    mip = MipmappedArray.from_descriptor(
+        shape=(32, 16),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        num_levels=3,
+    )
+    try:
+        res = ResourceDescriptor.from_mipmapped_array(mip)
+        assert res.kind == "mipmapped_array"
+        assert res.source is mip
+    finally:
+        mip.close()
+
+
+def test_resource_descriptor_from_mipmapped_array_rejects_non_mipmap():
+    with pytest.raises(TypeError, match="MipmappedArray"):
+        ResourceDescriptor.from_mipmapped_array(object())
+
+
+def test_texture_object_from_mipmapped_array(init_cuda):
+    mip = MipmappedArray.from_descriptor(
+        shape=(32, 32),
+        format=ArrayFormat.FLOAT32,
+        num_channels=1,
+        num_levels=3,
+    )
+    try:
+        res = ResourceDescriptor.from_mipmapped_array(mip)
+        # Use non-default mipmap params so the driver exercises that path.
+        tex_desc = TextureDescriptor(
+            address_mode=AddressMode.CLAMP,
+            filter_mode=FilterMode.LINEAR,
+            normalized_coords=True,
+            mipmap_filter_mode=FilterMode.LINEAR,
+            mipmap_level_bias=0.0,
+            min_mipmap_level_clamp=0.0,
+            max_mipmap_level_clamp=float(mip.num_levels - 1),
+        )
+        tex = TextureObject.from_descriptor(resource=res, texture_descriptor=tex_desc)
+        try:
+            assert tex.handle != 0
+            assert tex.resource is res
+        finally:
+            tex.close()
+    finally:
+        mip.close()
+
+
+def test_surface_rejects_mipmapped_array(init_cuda):
+    mip = MipmappedArray.from_descriptor(
+        shape=(16, 16),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+        num_levels=2,
+        is_surface_load_store=True,
+    )
+    try:
+        res = ResourceDescriptor.from_mipmapped_array(mip)
+        with pytest.raises(ValueError, match="array-backed"):
+            SurfaceObject.from_descriptor(resource=res)
+    finally:
+        mip.close()
+
+
+def test_mipmapped_array_level_keeps_parent_alive(init_cuda):
+    """Dropping the local parent reference must not invalidate the level CUDAArray;
+    the level holds an internal strong ref back to the MipmappedArray.
+
+    cdef classes don't natively support weakref, so we verify the parent
+    reference by inspecting the level CUDAArray's gc referents.
+    """
+    mip = MipmappedArray.from_descriptor(
+        shape=(16, 16),
+        format=ArrayFormat.UINT8,
+        num_channels=1,
+        num_levels=3,
+    )
+    parent_id = id(mip)
+    lvl = mip.get_level(1)
+    try:
+        # Drop our local reference and force GC; the parent must survive because
+        # the level CUDAArray holds a strong ref via the internal _parent_ref slot.
+        del mip
+        gc.collect()
+        assert lvl.handle != 0  # the level's handle must stay non-zero
+        parents = [r for r in gc.get_referents(lvl) if isinstance(r, MipmappedArray)]
+        assert len(parents) == 1, (
+            f"level CUDAArray should reference exactly one MipmappedArray parent, got {parents!r}"
+        )
+        assert id(parents[0]) == parent_id, "level CUDAArray's parent ref is not the original MipmappedArray"
+    finally:
+        # Always close the level; that drops its parent ref, after which cuMipmappedArrayDestroy may run.
+        lvl.close()
+
+
+# --- Negative-path validation tests ------------------------------------------
+
+
+def test_array_from_descriptor_rejects_bad_format(init_cuda):
+    with pytest.raises(TypeError, match="format must be an ArrayFormat"):
+        CUDAArray.from_descriptor(shape=(8,), format=0, num_channels=1)
+
+
+def test_array_from_descriptor_rejects_non_iterable_shape(init_cuda):
+    with pytest.raises(TypeError, match="shape must be a tuple"):
+        CUDAArray.from_descriptor(shape=8, format=ArrayFormat.UINT8, num_channels=1)
+
+
+def test_array_from_descriptor_rejects_zero_dim(init_cuda):
+    with pytest.raises(ValueError, match=r"shape\[1\] must be >= 1"):
+        CUDAArray.from_descriptor(shape=(8, 0), format=ArrayFormat.UINT8, num_channels=1)
+
+
+def test_array_copy_rejects_non_stream(init_cuda):
+    import array as _array
+
+    host = _array.array("B", [0] * 8)
+    arr = CUDAArray.from_descriptor(shape=(8,), format=ArrayFormat.UINT8, num_channels=1)
+    try:
+        with pytest.raises(TypeError, match="stream must be a Stream"):
+            arr.copy_from(host, stream="not-a-stream")
+        with pytest.raises(TypeError, match="stream must be a Stream"):
+            arr.copy_to(host, stream="not-a-stream")
+    finally:
+        arr.close()
+
+
+def test_resource_descriptor_from_pitch2d_rejects_non_buffer():
+    with pytest.raises(TypeError, match="buffer must be a Buffer"):
+        ResourceDescriptor.from_pitch2d(
+            object(),
+            format=ArrayFormat.UINT8,
+            num_channels=1,
+            width=8,
+            height=8,
+            pitch_bytes=64,
+        )
+
+
+def test_resource_descriptor_from_pitch2d_rejects_bad_format(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 4096)
+    try:
+        with pytest.raises(TypeError, match="format must be an ArrayFormat"):
+            ResourceDescriptor.from_pitch2d(
+                buf,
+                format=0,
+                num_channels=1,
+                width=8,
+                height=8,
+                pitch_bytes=64,
+            )
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_pitch2d_rejects_bad_channels(init_cuda):
+    device = Device()
+    buf = _alloc_device_buffer(device, 4096)
+    try:
+        with pytest.raises(ValueError, match="num_channels"):
+            ResourceDescriptor.from_pitch2d(
+                buf,
+                format=ArrayFormat.UINT8,
+                num_channels=3,
+                width=8,
+                height=8,
+                pitch_bytes=64,
+            )
+    finally:
+        buf.close()
+
+
+def test_resource_descriptor_from_pitch2d_rejects_zero_dims(init_cuda):
+    """A zero ``width`` or ``height`` must raise ValueError with a message
+    that names the offending parameter.
+    """
+    # An otherwise valid 8x8, single-channel UINT8 request.
+    valid = {
+        "format": ArrayFormat.UINT8,
+        "num_channels": 1,
+        "width": 8,
+        "height": 8,
+        "pitch_bytes": 64,
+    }
+
+    device = Device()
+    buf = _alloc_device_buffer(device, 4096)
+    try:
+        for zeroed in ("width", "height"):
+            # Start from the valid request and zero exactly one extent.
+            kwargs = dict(valid)
+            kwargs[zeroed] = 0
+            with pytest.raises(ValueError, match=zeroed):
+                ResourceDescriptor.from_pitch2d(buf, **kwargs)
+    finally:
+        buf.close()
+
+
+def test_mipmapped_array_rejects_bad_format(init_cuda):
+    with pytest.raises(TypeError, match="format must be an ArrayFormat"):
+        MipmappedArray.from_descriptor(shape=(8, 8), format=0, num_channels=1, num_levels=2)
+
+
+def test_mipmapped_array_rejects_bad_channels(init_cuda):
+    with pytest.raises(ValueError, match="num_channels"):
+        MipmappedArray.from_descriptor(shape=(8, 8), format=ArrayFormat.UINT8, num_channels=3, num_levels=2)
+
+
+def test_mipmapped_array_rejects_zero_dim(init_cuda):
+    with pytest.raises(ValueError, match=r"shape\[0\] must be >= 1"):
+        MipmappedArray.from_descriptor(shape=(0, 8), format=ArrayFormat.UINT8, num_channels=1, num_levels=1)
+
+
+def test_texture_object_rejects_non_resource_descriptor(init_cuda):
+    with pytest.raises(TypeError, match="resource must be a ResourceDescriptor"):
+        TextureObject.from_descriptor(resource=object(), texture_descriptor=TextureDescriptor())
+
+
+def test_texture_object_rejects_non_texture_descriptor(init_cuda):
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        with pytest.raises(TypeError, match="texture_descriptor must be a TextureDescriptor"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor="nope")
+    finally:
+        arr.close()
+
+
+def test_texture_object_rejects_bad_filter_mode(init_cuda):
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(filter_mode=0)  # int, not FilterMode
+        with pytest.raises(TypeError, match="filter_mode must be a FilterMode"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_texture_object_rejects_bad_read_mode(init_cuda):
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(read_mode=0)  # int, not ReadMode
+        with pytest.raises(TypeError, match="read_mode must be a ReadMode"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_texture_object_rejects_bad_mipmap_filter_mode(init_cuda):
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(mipmap_filter_mode=0)  # int, not FilterMode
+        with pytest.raises(TypeError, match="mipmap_filter_mode must be a FilterMode"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_texture_object_rejects_negative_anisotropy(init_cuda):
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(max_anisotropy=-1)
+        with pytest.raises(ValueError, match="max_anisotropy"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_texture_object_rejects_bad_border_color_length(init_cuda):
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(border_color=(0.0, 0.0))  # length 2, not 4
+        with pytest.raises(ValueError, match="border_color must have 4"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_address_mode_rejects_non_addressmode_scalar(init_cuda):
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(address_mode=42)  # int, not AddressMode / iterable
+        with pytest.raises(TypeError, match="address_mode"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_address_mode_rejects_empty_tuple(init_cuda):
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(address_mode=())
+        with pytest.raises(ValueError, match="address_mode tuple must have 1-3"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_address_mode_rejects_too_long_tuple(init_cuda):
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(address_mode=(AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP, AddressMode.WRAP))
+        with pytest.raises(ValueError, match="address_mode tuple must have 1-3"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_address_mode_rejects_non_addressmode_entry(init_cuda):
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    try:
+        res = ResourceDescriptor.from_array(arr)
+        td = TextureDescriptor(address_mode=(AddressMode.WRAP, "bad", AddressMode.CLAMP))
+        with pytest.raises(TypeError, match=r"address_mode\[1\]"):
+            TextureObject.from_descriptor(resource=res, texture_descriptor=td)
+    finally:
+        arr.close()
+
+
+def test_texture_object_keeps_backing_array_alive(init_cuda):
+    """Dropping the local references to the backing CUDAArray and the
+    ResourceDescriptor must NOT invalidate an existing TextureObject. The
+    TextureObject holds a strong ref through its _source_ref slot."""
+    arr = CUDAArray.from_descriptor(shape=(8, 8), format=ArrayFormat.FLOAT32, num_channels=1)
+    res = ResourceDescriptor.from_array(arr)
+    tex = TextureObject.from_descriptor(resource=res, texture_descriptor=TextureDescriptor())
+    try:  # close the texture even when one of the assertions below fails
+        # Verify the keepalive chain via gc referents: TextureObject -> _source_ref
+        # -> ResourceDescriptor -> _source -> CUDAArray. We can only walk one level
+        # at a time, so check tex's referents include the ResourceDescriptor.
+        arr_id = id(arr)
+        res_id = id(res)
+        del arr, res
+        gc.collect()
+        referents = gc.get_referents(tex)
+        res_refs = [r for r in referents if id(r) == res_id]
+        assert len(res_refs) == 1, (
+            f"TextureObject should still reference the ResourceDescriptor; got referents {referents!r}"
+        )
+        res_back = res_refs[0]
+        arr_refs = [r for r in gc.get_referents(res_back) if id(r) == arr_id]
+        assert len(arr_refs) == 1, "ResourceDescriptor should still reference its CUDAArray"
+        # tex.handle should still be valid (non-zero).
+        assert tex.handle != 0
+    finally:
+        tex.close()
+
+
+def test_surface_object_keeps_backing_array_alive(init_cuda):
+    arr = CUDAArray.from_descriptor(
+        shape=(8, 8),
+        format=ArrayFormat.UINT8,
+        num_channels=4,
+        is_surface_load_store=True,
+    )
+    surf = SurfaceObject.from_array(arr)
+    try:  # close the surface even when one of the assertions below fails
+        arr_id = id(arr)
+        del arr
+        gc.collect()  # chain under test: SurfaceObject -> ResourceDescriptor -> CUDAArray
+        res_objs = [r for r in gc.get_referents(surf) if isinstance(r, ResourceDescriptor)]
+        assert len(res_objs) == 1
+        arr_refs = [r for r in gc.get_referents(res_objs[0]) if id(r) == arr_id]
+        assert len(arr_refs) == 1, (
+            "SurfaceObject should still reference its backing CUDAArray via the ResourceDescriptor"
+        )
+        assert surf.handle != 0
+    finally:
+        surf.close()